1#!./perl 2 3# The tests are in a separate file 't/re/re_tests'. 4# Each line in that file is a separate test. 5# There are five columns, separated by tabs. 6# 7# Column 1 contains the pattern, optionally enclosed in C<''>. 8# Modifiers can be put after the closing C<'>. 9# 10# Column 2 contains the string to be matched. 11# 12# Column 3 contains the expected result: 13# y expect a match 14# n expect no match 15# c expect an error 16# T the test is a TODO (can be combined with y/n/c) 17# M skip test on miniperl (combine with y/n/c/T) 18# B test exposes a known bug in Perl, should be skipped 19# b test exposes a known bug in Perl, should be skipped if noamp 20# t test exposes a bug with threading, TODO if qr_embed_thr 21# s test should only be run for regex_sets_compat.t 22# S test should not be run for regex_sets_compat.t 23# 24# Columns 4 and 5 are used only if column 3 contains C<y> or C<c>. 25# 26# Column 4 contains a string, usually C<$&>. 27# 28# Column 5 contains the expected result of double-quote 29# interpolating that string after the match, or start of error message. 30# 31# Column 6, if present, contains a reason why the test is skipped. 32# This is printed with "skipped", for harness to pick up. 33# 34# \n in the tests are interpolated, as are variables of the form ${\w+}. 35# 36# Blanks lines are treated as PASSING tests to keep the line numbers 37# linked to the test number. 38# 39# If you want to add a regular expression test that can't be expressed 40# in this format, don't add it here: put it in re/pat.t instead. 41# 42# Note that the inputs get passed on as "m're'", so the re bypasses the lexer. 43# This means this file cannot be used for testing anything that the lexer 44# handles; in 5.12 this means just \N{NAME} and \N{U+...}. 45# 46# Note that columns 2,3 and 5 are all enclosed in double quotes and then 47# evalled; so something like a\"\x{100}$1 has length 3+length($1). 48 49my ($file, $iters); 50BEGIN { 51 $iters = shift || 1; # Poor man performance suite, 10000 is OK. 52 53 # Do this open before any chdir 54 $file = shift; 55 if (defined $file) { 56 open TESTS, $file or die "Can't open $file"; 57 } 58 59 chdir 't' if -d 't'; 60 @INC = '../lib'; 61 62} 63 64sub _comment { 65 return map { /^#/ ? "$_\n" : "# $_\n" } 66 map { split /\n/ } @_; 67} 68 69use strict; 70use warnings FATAL=>"all"; 71use vars qw($bang $ffff $nulnul); # used by the tests 72use vars qw($qr $skip_amp $qr_embed $qr_embed_thr $regex_sets); # set by our callers 73 74 75 76if (!defined $file) { 77 open TESTS, 're/re_tests' or die "Can't open re/re_tests: $!"; 78} 79 80my @tests = <TESTS>; 81 82close TESTS; 83 84$bang = sprintf "\\%03o", ord "!"; # \41 would not be portable. 85$ffff = chr(0xff) x 2; 86$nulnul = "\0" x 2; 87my $OP = $qr ? 'qr' : 'm'; 88 89$| = 1; 90printf "1..%d\n# $iters iterations\n", scalar @tests; 91 92my $test; 93TEST: 94foreach (@tests) { 95 $test++; 96 if (!/\S/ || /^\s*#/ || /^__END__$/) { 97 print "ok $test # (Blank line or comment)\n"; 98 if (/#/) { print $_ }; 99 next; 100 } 101 chomp; 102 s/\\n/\n/g unless $regex_sets; 103 my ($pat, $subject, $result, $repl, $expect, $reason) = split(/\t/,$_,6); 104 $reason = '' unless defined $reason; 105 my $input = join(':',$pat,$subject,$result,$repl,$expect); 106 # the double '' below keeps simple syntax highlighters from going crazy 107 $pat = "'$pat'" unless $pat =~ /^[:''\/]/; 108 $pat =~ s/(\$\{\w+\})/$1/eeg; 109 $pat =~ s/\\n/\n/g unless $regex_sets; 110 $subject = eval qq("$subject"); die $@ if $@; 111 $expect = eval qq("$expect"); die $@ if $@; 112 $expect = $repl = '-' if $skip_amp and $input =~ /\$[&\`\']/; 113 my $todo_qr = $qr_embed_thr && ($result =~ s/t//); 114 my $skip = ($skip_amp ? ($result =~ s/B//i) : ($result =~ s/B//)); 115 ++$skip if $result =~ s/M// && !defined &DynaLoader::boot_DynaLoader; 116 if ($result =~ s/ ( [Ss] ) //x) { 117 if (($1 eq 'S' && $regex_sets) || ($1 eq 's' && ! $regex_sets)) { 118 $skip++; 119 $reason = "Test not valid for $0"; 120 } 121 } 122 $reason = 'skipping $&' if $reason eq '' && $skip_amp; 123 $result =~ s/B//i unless $skip; 124 my $todo= $result =~ s/T// ? " # TODO" : ""; 125 if (! $skip && $regex_sets) { 126 127 # If testing regex sets, change the [bracketed] classes into 128 # (?[bracketed]). 129 130 if ($pat !~ / \[ /x) { 131 132 $skip++; 133 $reason = "Pattern doesn't contain [brackets]"; 134 } 135 else { # Use non-regex features of Perl to accomplish this. 136 my $modified = ""; 137 my $in_brackets = 0; 138 139 # Go through the pattern character-by-character. We also add 140 # blanks around each token to test the /x parts of (?[ ]) 141 my $pat_len = length($pat); 142 CHAR: for (my $i = 0; $i < $pat_len; $i++) { 143 my $curchar = substr($pat, $i, 1); 144 if ($curchar eq '\\') { 145 $modified .= " " if $in_brackets; 146 $modified .= $curchar; 147 $i++; 148 149 # Get the character the backslash is escaping 150 $curchar = substr($pat, $i, 1); 151 $modified .= $curchar; 152 153 # If the character following that is a '{}', treat the 154 # entire amount as a single token 155 if ($i < $pat_len -1 && substr($pat, $i+1, 1) eq '{') { 156 my $j = index($pat, '}', $i+2); 157 if ($j < 0) { 158 last unless $in_brackets; 159 if ($result eq 'c') { 160 $skip++; 161 $reason = "Can't handle compilation errors with unmatched '{'"; 162 } 163 else { 164 print "not ok $test # Problem in $0; original = '$pat'; mod = '$modified'\n"; 165 next TEST; 166 } 167 } 168 $modified .= substr($pat, $i+1, $j - $i); 169 $i = $j; 170 } 171 elsif ($curchar eq 'x') { 172 173 # \x without brackets is supposed to be followed by 2 174 # hex digits. Take up to 2, and then add a blank 175 # after the last one. This avoids getting errors from 176 # (?[ ]) for run-ons, like \xabc 177 my $j = $i + 1; 178 for (; $j < $i + 3 && $j < $pat_len; $j++) { 179 my $curord = ord(substr($pat, $j, 1)); 180 if (!(($curord >= ord("A") && $curord <= ord("F")) 181 || ($curord >= ord("a") && $curord <= ord("f")) 182 || ($curord >= ord("0") && $curord <= ord("9")))) 183 { 184 $j++; 185 last; 186 } 187 } 188 $j--; 189 $modified .= substr($pat, $i + 1, $j - $i) . " "; 190 $i = $j; 191 } 192 elsif (ord($curchar) >= ord('0') 193 && (ord($curchar) <= ord('7'))) 194 { 195 # Similarly, octal constants have up to 3 digits. 196 my $j = $i + 1; 197 for (; $j < $i + 3 && $j < $pat_len; $j++) { 198 my $curord = ord(substr($pat, $j, 1)); 199 if (! ($curord >= ord("0") && $curord <= ord("7"))) { 200 $j++; 201 last; 202 } 203 } 204 $j--; 205 $modified .= substr($pat, $i + 1, $j - $i); 206 $i = $j; 207 } 208 209 next; 210 } # End of processing a backslash sequence 211 212 if (! $in_brackets # Skip (?{ }) 213 && $curchar eq '(' 214 && $i < $pat_len - 2 215 && substr($pat, $i+1, 1) eq '?' 216 && substr($pat, $i+2, 1) eq '{') 217 { 218 $skip++; 219 $reason = "Pattern contains '(?{'"; 220 last; 221 } 222 223 # Closing ']' 224 if ($curchar eq ']' && $in_brackets) { 225 $modified .= " ] ])"; 226 $in_brackets = 0; 227 next; 228 } 229 230 # A regular character. 231 if ($curchar ne '[') { 232 if (! $in_brackets) { 233 $modified .= $curchar; 234 } 235 else { 236 $modified .= " $curchar "; 237 } 238 next; 239 } 240 241 # Here is a '['; If not in a bracketed class, treat as the 242 # beginning of one. 243 if (! $in_brackets) { 244 $in_brackets = 1; 245 $modified .= "(?[ [ "; 246 247 # An immediately following ']' or '^]' is not the ending 248 # of the class, but is to be treated literally. 249 if ($i < $pat_len - 1 250 && substr($pat, $i+1, 1) eq ']') 251 { 252 $i ++; 253 $modified .= " ] "; 254 } 255 elsif ($i < $pat_len - 2 256 && substr($pat, $i+1, 1) eq '^' 257 && substr($pat, $i+2, 1) eq ']') 258 { 259 $i += 2; 260 $modified .= " ^ ] "; 261 } 262 next; 263 } 264 265 # Here is a plain '[' within [ ]. Could mean wants to 266 # match a '[', or it could be a posix class that has a 267 # corresponding ']'. Absorb either 268 269 $modified .= ' ['; 270 last if $i >= $pat_len - 1; 271 272 $i++; 273 $curchar = substr($pat, $i, 1); 274 if ($curchar =~ /[:=.]/) { 275 for (my $j = $i + 1; $j < $pat_len; $j++) { 276 next unless substr($pat, $j, 1) eq ']'; 277 last if $j - $i < 2; 278 if (substr($pat, $j - 1, 1) eq $curchar) { 279 # Here, is a posix class 280 $modified .= substr($pat, $i, $j - $i + 1) . " "; 281 $i = $j; 282 next CHAR; 283 } 284 } 285 } 286 287 # Here wasn't a posix class, just process normally 288 $modified .= " $curchar "; 289 } 290 291 if ($in_brackets && ! $skip) { 292 if ($result eq 'c') { 293 $skip++; 294 $reason = "Can't figure out where to put the (?[ and ]) since is a compilation error"; 295 } 296 else { 297 print "not ok $test # Problem in $0; original = '$pat'; mod = '$modified'\n"; 298 next TEST; 299 } 300 } 301 302 # Use our modified pattern instead of the original 303 $pat = $modified; 304 } 305 } 306 307 for my $study ('', 'study $subject', 'utf8::upgrade($subject)', 308 'utf8::upgrade($subject); study $subject') { 309 # Need to make a copy, else the utf8::upgrade of an already studied 310 # scalar confuses things. 311 my $subject = $subject; 312 my $c = $iters; 313 my ($code, $match, $got); 314 if ($repl eq 'pos') { 315 $code= <<EOFCODE; 316 $study; 317 pos(\$subject)=0; 318 \$match = ( \$subject =~ m${pat}g ); 319 \$got = pos(\$subject); 320EOFCODE 321 } 322 elsif ($qr_embed) { 323 $code= <<EOFCODE; 324 my \$RE = qr$pat; 325 $study; 326 \$match = (\$subject =~ /(?:)\$RE(?:)/) while \$c--; 327 \$got = "$repl"; 328EOFCODE 329 } 330 elsif ($qr_embed_thr) { 331 $code= <<EOFCODE; 332 # Can't run the match in a subthread, but can do this and 333 # clone the pattern the other way. 334 my \$RE = threads->new(sub {qr$pat})->join(); 335 $study; 336 \$match = (\$subject =~ /(?:)\$RE(?:)/) while \$c--; 337 \$got = "$repl"; 338EOFCODE 339 } 340 else { 341 $code= <<EOFCODE; 342 $study; 343 \$match = (\$subject =~ $OP$pat) while \$c--; 344 \$got = "$repl"; 345EOFCODE 346 } 347 $code = "no warnings 'experimental::regex_sets';$code" if $regex_sets; 348 #$code.=qq[\n\$expect="$expect";\n]; 349 #use Devel::Peek; 350 #die Dump($code) if $pat=~/\\h/ and $subject=~/\x{A0}/; 351 { 352 # Probably we should annotate specific tests with which warnings 353 # categories they're known to trigger, and hence should be 354 # disabled just for that test 355 no warnings qw(uninitialized regexp); 356 eval $code; 357 } 358 chomp( my $err = $@ ); 359 if ( $skip ) { 360 print "ok $test # skipped", length($reason) ? ". $reason" : '', "\n"; 361 next TEST; 362 } 363 elsif ($result eq 'c') { 364 if ($err !~ m!^\Q$expect!) { print "not ok $test$todo (compile) $input => '$err'\n"; next TEST } 365 last; # no need to study a syntax error 366 } 367 elsif ( $todo_qr ) { 368 print "not ok $test # TODO", length($reason) ? " - $reason" : '', "\n"; 369 next TEST; 370 } 371 elsif ($@) { 372 print "not ok $test$todo $input => error '$err'\n", _comment("$code\n$@\n"); next TEST; 373 } 374 elsif ($result =~ /^n/) { 375 if ($match) { print "not ok $test$todo ($study) $input => false positive\n"; next TEST } 376 } 377 else { 378 if (!$match || $got ne $expect) { 379 eval { require Data::Dumper }; 380 no warnings "utf8"; # But handle should be utf8 381 if ($@ || !defined &DynaLoader::boot_DynaLoader) { 382 # Data::Dumper will load on miniperl, but fail when used in 383 # anger as it tries to load B. I'd prefer to keep the 384 # regular calls below outside of an eval so that real 385 # (unknown) failures get spotted, not ignored. 386 print "not ok $test$todo ($study) $input => '$got', match=$match\n", _comment("$code\n"); 387 } 388 else { # better diagnostics 389 my $s = Data::Dumper->new([$subject],['subject'])->Useqq(1)->Dump; 390 my $g = Data::Dumper->new([$got],['got'])->Useqq(1)->Dump; 391 print "not ok $test$todo ($study) $input => '$got', match=$match\n", _comment("$s\n$g\n$code\n"); 392 } 393 next TEST; 394 } 395 } 396 } 397 print "ok $test$todo\n"; 398} 399 4001; 401