1#!./perl 2 3# The tests are in a separate file 't/re/re_tests'. 4# Each line in that file is a separate test. 5# There are five columns, separated by tabs. 6# An optional sixth column is used to give a reason, only when skipping tests 7# 8# Column 1 contains the pattern, optionally enclosed in C<''> C<::> or 9# C<//>. Modifiers can be put after the closing delimiter. C<''> will 10# automatically be added to any other patterns. 11# 12# Column 2 contains the string to be matched. 13# 14# Column 3 contains the expected result: 15# y expect a match 16# n expect no match 17# c expect an error 18# T the test is a TODO (can be combined with y/n/c) 19# M skip test on miniperl (combine with y/n/c/T) 20# B test exposes a known bug in Perl, should be skipped 21# b test exposes a known bug in Perl, should be skipped if noamp 22# t test exposes a bug with threading, TODO if qr_embed_thr 23# s test should only be run for regex_sets_compat.t 24# S test should not be run for regex_sets_compat.t 25# a test should only be run on ASCII platforms 26# e test should only be run on EBCDIC platforms 27# 28# Columns 4 and 5 are used only if column 3 contains C<y> or C<c>. 29# 30# Column 4 contains a string, usually C<$&>. 31# 32# Column 5 contains the expected result of double-quote 33# interpolating that string after the match, or start of error message. 34# 35# Column 6, if present, contains a reason why the test is skipped. 36# This is printed with "skipped", for harness to pick up. 37# 38# Column 7 can be used for comments 39# 40# \n in the tests are interpolated, as are variables of the form ${\w+}. 41# 42# Blanks lines are treated as PASSING tests to keep the line numbers 43# linked to the test number. 44# 45# If you want to add a regular expression test that can't be expressed 46# in this format, don't add it here: put it in re/pat.t instead. 47# 48# Note that the inputs get passed on as "m're'", so the re bypasses the lexer. 49# This means this file cannot be used for testing anything that the lexer 50# handles; in 5.12 this means just \N{NAME} and \N{U+...}. 51# 52# Note that columns 2,3 and 5 are all enclosed in double quotes and then 53# evalled; so something like a\"\x{100}$1 has length 3+length($1). 54# 55# \x... and \o{...} constants are automatically converted to the native 56# character set if necessary. \[0-7] constants aren't 57 58my ($file, $iters); 59BEGIN { 60 $iters = shift || 1; # Poor man performance suite, 10000 is OK. 61 62 # Do this open before any chdir 63 $file = shift; 64 if (defined $file) { 65 open TESTS, $file or die "Can't open $file"; 66 } 67 68 chdir 't' if -d 't'; 69 @INC = qw '../lib ../ext/re'; 70 if (!defined &DynaLoader::boot_DynaLoader) { # miniperl 71 print("1..0 # Skip Unicode tables not built yet\n"), exit 72 unless eval 'require "unicore/UCD.pl"'; 73 } 74 75 # Some of the tests need a locale; which one doesn't much matter, except 76 # that it be valid. Make sure of that 77 eval { require POSIX; 78 POSIX->import(qw(LC_ALL setlocale)); 79 POSIX::setlocale(&LC_ALL, "C"); 80 }; 81} 82 83sub _comment { 84 return map { /^#/ ? "$_\n" : "# $_\n" } 85 map { split /\n/ } @_; 86} 87 88use strict; 89use warnings FATAL=>"all"; 90no warnings 'experimental::vlb'; 91our ($bang, $ffff, $nulnul); # used by the tests 92our ($qr, $skip_amp, $qr_embed, $qr_embed_thr, $regex_sets, $alpha_assertions, $no_null); # set by our callers 93 94if ($no_null && ! eval { require XS::APItest }) { 95 print("1..0 # Skip XS::APItest not available\n"), exit 96} 97 98my $expanded_text = "expanded name from original test number"; 99my $expanded_text_re = qr/$expanded_text/; 100 101if (!defined $file) { 102 open TESTS, 're/re_tests' or die "Can't open re/re_tests: $!"; 103} 104 105my @tests = <TESTS>; 106 107close TESTS; 108 109my $test_num = 0; 110 111# Some scenarios add extra tests to those just read in. For those where there 112# is a character set translation, the added test will already have been 113# translated, so any test number beginning with this one shouldn't be 114# translated again. 115my $first_already_converted_test_num = @tests + 1; 116 117sub convert_from_ascii_guts { 118 my $string_ref = shift; 119 120 return if $test_num >= $first_already_converted_test_num; 121 122 #my $save = $string_ref; 123 # Convert \x{...}, \o{...} 124 $$string_ref =~ s/ (?<! \\ ) \\x\{ ( .*? ) } / "\\x{" . sprintf("%X", utf8::unicode_to_native(hex $1)) . "}" /gex; 125 $$string_ref =~ s/ (?<! \\ ) \\o\{ ( .*? ) } / "\\o{" . sprintf("%o", utf8::unicode_to_native(oct $1)) . "}" /gex; 126 127 # Convert \xAB 128 $$string_ref =~ s/ (?<! \\ ) \\x ( [A-Fa-f0-9]{2} ) / "\\x" . sprintf("%02X", utf8::unicode_to_native(hex $1)) /gex; 129 130 # Convert \xA 131 $$string_ref =~ s/ (?<! \\ ) \\x ( [A-Fa-f0-9] ) (?! [A-Fa-f0-9] ) / "\\x" . sprintf("%X", utf8::unicode_to_native(hex $1)) /gex; 132 133 #print STDERR __LINE__, ": $save\n$string_ref\n" if $save ne $string_ref; 134 return; 135} 136 137*convert_from_ascii = (ord("A") == 65) 138 ? sub { 1; } 139 : \&convert_from_ascii_guts; 140 141$bang = sprintf "\\%03o", ord "!"; # \41 would not be portable. 142$ffff = chr(0xff) x 2; 143$nulnul = "\0" x 2; 144my $OP = $qr ? 'qr' : 'm'; 145 146$| = 1; 147 148TEST: 149foreach (@tests) { 150 $test_num++; 151 if (!/\S/ || /^\s*#/ || /^__END__$/) { 152 chomp; 153 my ($not,$comment)= split /\s*#\s*/, $_, 2; 154 $comment ||= "(blank line)"; 155 print "ok $test_num # $comment\n"; 156 next; 157 } 158 chomp; 159 s/\\n/\n/g unless $regex_sets; 160 my ($pat, $subject, $result, $repl, $expect, $reason, $comment) = split(/\t/,$_,7); 161 $comment = "" unless defined $comment; 162 if (!defined $subject) { 163 die "Bad test definition on line $test_num: $_\n"; 164 } 165 $reason = '' unless defined $reason; 166 my $input = join(':',$pat,$subject,$result,$repl,$expect); 167 168 # the double '' below keeps simple syntax highlighters from going crazy 169 $pat = "'$pat'" unless $pat =~ /^[:''\/]/; 170 $pat =~ s/(\$\{\w+\})/$1/eeg; 171 $pat =~ s/\\n/\n/g unless $regex_sets; 172 convert_from_ascii(\$pat); 173 174 my $no_null_pat; 175 if ($no_null && $pat =~ /^'(.*)'\z/) { 176 $no_null_pat = XS::APItest::string_without_null($1); 177 } 178 179 convert_from_ascii(\$subject); 180 $subject = eval qq("$subject"); die $@ if $@; 181 182 convert_from_ascii(\$expect); 183 $expect = eval qq("$expect"); die $@ if $@; 184 $expect = $repl = '-' if $skip_amp and $input =~ /\$[&\`\']/; 185 186 my $todo_qr = $qr_embed_thr && ($result =~ s/t//); 187 my $skip = ($skip_amp ? ($result =~ s/B//i) : ($result =~ s/B//)); 188 ++$skip if $result =~ s/M// && !defined &DynaLoader::boot_DynaLoader; 189 if ($result =~ s/ ( [Ss] ) //x) { 190 if (($1 eq 'S' && $regex_sets) || ($1 eq 's' && ! $regex_sets)) { 191 $skip++; 192 $reason = "Test not valid for $0"; 193 } 194 } 195 if ($result =~ s/a// && ord("A") != 65) { 196 $skip++; 197 $reason = "Test is only valid for ASCII platforms. $reason"; 198 } 199 if ($result =~ s/e// && ord("A") != 193) { 200 $skip++; 201 $reason = "Test is only valid for EBCDIC platforms. $reason"; 202 } 203 $reason = 'skipping $&' if $reason eq '' && $skip_amp; 204 $result =~ s/B//i unless $skip; 205 my $todo= $result =~ s/T// ? " # TODO" : ""; 206 my $testname= $test_num; 207 if ($comment) { 208 $comment=~s/^\s*(?:#\s*)?//; 209 $testname .= " - $comment" if $comment; 210 } 211 if (! $skip && $alpha_assertions) { 212 my $assertions_re = qr/ (?: \Q(?\E (?: > | <? [=>] ) ) /x; 213 if ($pat !~ $assertions_re && $comment !~ $expanded_text_re) { 214 $skip++; 215 $reason = "Pattern doesn't contain assertions"; 216 } 217 elsif ($comment !~ $expanded_text_re) { 218 my $expanded_pat = $pat; 219 220 $pat =~ s/\( \? > /(*atomic:/xg; 221 222 if ($pat =~ s/\( \? = /(*pla:/xg) { 223 $expanded_pat =~ s//(*positive_lookahead:/g; 224 } 225 if ($pat =~ s/\( \? ! /(*nla:/xg) { 226 $expanded_pat =~ s//(*negative_lookahead:/g; 227 } 228 if ($pat =~ s/\( \? <= /(*plb:/xg) { 229 $expanded_pat =~ s//(*positive_lookbehind:/g; 230 } 231 if ($pat =~ s/\( \? <! /(*nlb:/xg) { 232 $expanded_pat =~ s//(*negative_lookbehind:/g; 233 } 234 if ($expanded_pat ne $pat) { 235 $comment .= " $expanded_text $test_num"; 236 push @tests, join "\t", $expanded_pat, 237 $subject // "", 238 $result // "", 239 $repl // "", 240 $expect // "", 241 $reason // "", 242 $comment; 243 } 244 } 245 } 246 elsif (! $skip && $regex_sets) { 247 248 # If testing regex sets, change the [bracketed] classes into 249 # (?[bracketed]). But note that '\[' and '\c[' don't introduce such a 250 # class. (We don't bother looking for an odd number of backslashes, 251 # as this hasn't been needed so far.) 252 if ($pat !~ / (?<!\\c) (?<!\\) \[ /x) { 253 $skip++; 254 $reason = "Pattern doesn't contain [brackets]"; 255 } 256 else { # Use non-regex features of Perl to accomplish this. 257 my $modified = ""; 258 my $in_brackets = 0; 259 260 # Go through the pattern character-by-character. We also add 261 # blanks around each token to test the /x parts of (?[ ]) 262 my $pat_len = length($pat); 263 CHAR: for (my $i = 0; $i < $pat_len; $i++) { 264 my $curchar = substr($pat, $i, 1); 265 if ($curchar eq '\\') { 266 $modified .= " " if $in_brackets; 267 $modified .= $curchar; 268 $i++; 269 270 # Get the character the backslash is escaping 271 $curchar = substr($pat, $i, 1); 272 $modified .= $curchar; 273 274 # If the character following that is a '{}', treat the 275 # entire amount as a single token 276 if ($i < $pat_len -1 && substr($pat, $i+1, 1) eq '{') { 277 my $j = index($pat, '}', $i+2); 278 if ($j < 0) { 279 last unless $in_brackets; 280 if ($result eq 'c') { 281 $skip++; 282 $reason = "Can't handle compilation errors with unmatched '{'"; 283 } 284 else { 285 print "not ok $testname # Problem in $0; original = '$pat'; mod = '$modified'\n"; 286 next TEST; 287 } 288 } 289 $modified .= substr($pat, $i+1, $j - $i); 290 $i = $j; 291 } 292 elsif ($curchar eq 'x') { 293 294 # \x without brackets is supposed to be followed by 2 295 # hex digits. Take up to 2, and then add a blank 296 # after the last one. This avoids getting errors from 297 # (?[ ]) for run-ons, like \xabc 298 my $j = $i + 1; 299 for (; $j < $i + 3 && $j < $pat_len; $j++) { 300 my $curord = ord(substr($pat, $j, 1)); 301 if (!(($curord >= ord("A") && $curord <= ord("F")) 302 || ($curord >= ord("a") && $curord <= ord("f")) 303 || ($curord >= ord("0") && $curord <= ord("9")))) 304 { 305 $j++; 306 last; 307 } 308 } 309 $j--; 310 $modified .= substr($pat, $i + 1, $j - $i); 311 $modified .= " " if $in_brackets; 312 $i = $j; 313 } 314 elsif (ord($curchar) >= ord('0') 315 && (ord($curchar) <= ord('7'))) 316 { 317 # Similarly, octal constants have up to 3 digits. 318 my $j = $i + 1; 319 for (; $j < $i + 3 && $j < $pat_len; $j++) { 320 my $curord = ord(substr($pat, $j, 1)); 321 if (! ($curord >= ord("0") && $curord <= ord("7"))) { 322 $j++; 323 last; 324 } 325 } 326 $j--; 327 $modified .= substr($pat, $i + 1, $j - $i); 328 $i = $j; 329 } 330 331 next; 332 } # End of processing a backslash sequence 333 334 if (! $in_brackets # Skip (?{ }) 335 && $curchar eq '(' 336 && $i < $pat_len - 2 337 && substr($pat, $i+1, 1) eq '?' 338 && substr($pat, $i+2, 1) eq '{') 339 { 340 $skip++; 341 $reason = "Pattern contains '(?{'"; 342 last; 343 } 344 345 # Closing ']' 346 if ($curchar eq ']' && $in_brackets) { 347 $modified .= " ] ])"; 348 $in_brackets = 0; 349 next; 350 } 351 352 # A regular character. 353 if ($curchar ne '[') { 354 $modified .= " " if $in_brackets; 355 $modified .= $curchar; 356 next; 357 } 358 359 # Here is a '['; If not in a bracketed class, treat as the 360 # beginning of one. 361 if (! $in_brackets) { 362 $in_brackets = 1; 363 $modified .= "(?[ [ "; 364 365 # An immediately following ']' or '^]' is not the ending 366 # of the class, but is to be treated literally. 367 if ($i < $pat_len - 1 368 && substr($pat, $i+1, 1) eq ']') 369 { 370 $i ++; 371 $modified .= " ] "; 372 } 373 elsif ($i < $pat_len - 2 374 && substr($pat, $i+1, 1) eq '^' 375 && substr($pat, $i+2, 1) eq ']') 376 { 377 $i += 2; 378 $modified .= " ^ ] "; 379 } 380 next; 381 } 382 383 # Here is a plain '[' within [ ]. Could mean wants to 384 # match a '[', or it could be a posix class that has a 385 # corresponding ']'. Absorb either 386 387 $modified .= ' ['; 388 last if $i >= $pat_len - 1; 389 390 $i++; 391 $curchar = substr($pat, $i, 1); 392 if ($curchar =~ /[:=.]/) { 393 for (my $j = $i + 1; $j < $pat_len; $j++) { 394 next unless substr($pat, $j, 1) eq ']'; 395 last if $j - $i < 2; 396 if (substr($pat, $j - 1, 1) eq $curchar) { 397 # Here, is a posix class 398 $modified .= substr($pat, $i, $j - $i + 1) . " "; 399 $i = $j; 400 next CHAR; 401 } 402 } 403 } 404 405 # Here wasn't a posix class, just process normally 406 $modified .= " $curchar "; 407 } 408 409 if ($in_brackets && ! $skip) { 410 if ($result eq 'c') { 411 $skip++; 412 $reason = "Can't figure out where to put the (?[ and ]) since is a compilation error"; 413 } 414 else { 415 print "not ok $testname # Problem in $0; original = '$pat'; mod = '$modified'\n"; 416 next TEST; 417 } 418 } 419 420 # Use our modified pattern instead of the original 421 $pat = $modified; 422 } 423 } 424 425 for my $study ('', 'study $subject;', 'utf8::upgrade($subject);', 426 'utf8::upgrade($subject); study $subject;') { 427 # Need to make a copy, else the utf8::upgrade of an already studied 428 # scalar confuses things. 429 my $subject = $subject; 430 $subject = XS::APItest::string_without_null($subject) if $no_null; 431 my $c = $iters; 432 my ($code, $match, $got); 433 if ($repl eq 'pos') { 434 my $patcode = defined $no_null_pat ? '/$no_null_pat/g' 435 : "m${pat}g"; 436 $code= <<EOFCODE; 437 $study 438 pos(\$subject)=0; 439 \$match = ( \$subject =~ $patcode ); 440 \$got = pos(\$subject); 441EOFCODE 442 } 443 elsif ($qr_embed) { 444 $code= <<EOFCODE; 445 my \$RE = qr$pat; 446 $study 447 \$match = (\$subject =~ /(?:)\$RE(?:)/) while \$c--; 448 \$got = "$repl"; 449EOFCODE 450 } 451 elsif ($qr_embed_thr) { 452 $code= <<EOFCODE; 453 # Can't run the match in a subthread, but can do this and 454 # clone the pattern the other way. 455 my \$RE = threads->new(sub {qr$pat})->join(); 456 $study 457 \$match = (\$subject =~ /(?:)\$RE(?:)/) while \$c--; 458 \$got = "$repl"; 459EOFCODE 460 } 461 elsif ($no_null) { 462 my $patcode = defined $no_null_pat ? '/$no_null_pat/' 463 : $pat; 464 $code= <<EOFCODE; 465 $study 466 \$match = (\$subject =~ $OP$pat) while \$c--; 467 \$got = "$repl"; 468EOFCODE 469 } 470 else { 471 $code= <<EOFCODE; 472 $study 473 \$match = (\$subject =~ $OP$pat) while \$c--; 474 \$got = "$repl"; 475EOFCODE 476 } 477 $code = "no warnings 'experimental::regex_sets';$code" if $regex_sets; 478 #$code.=qq[\n\$expect="$expect";\n]; 479 #use Devel::Peek; 480 #die Dump($code) if $pat=~/\\h/ and $subject=~/\x{A0}/; 481 { 482 # Probably we should annotate specific tests with which warnings 483 # categories they're known to trigger, and hence should be 484 # disabled just for that test 485 no warnings qw(uninitialized regexp deprecated); 486 eval $code; 487 } 488 chomp( my $err = $@ ); 489 if ( $skip ) { 490 print "ok $testname # skipped", length($reason) ? ". $reason" : '', "\n"; 491 next TEST; 492 } 493 elsif ($result eq 'c') { 494 if ($err !~ m!^\Q$expect!) { print "not ok $testname$todo (compile) $input => '$err'\n"; next TEST } 495 last; # no need to study a syntax error 496 } 497 elsif ( $todo_qr ) { 498 print "not ok $testname # TODO", length($reason) ? " - $reason" : '', "\n"; 499 next TEST; 500 } 501 elsif ($@) { 502 print "not ok $testname$todo $input => error '$err'\n", _comment("$code\n$@\n"); next TEST; 503 } 504 elsif ($result =~ /^n/) { 505 if ($match) { print "not ok $testname$todo ($study) $input => false positive\n"; next TEST } 506 } 507 else { 508 if (!$match || $got ne $expect) { 509 eval { require Data::Dumper }; 510 no warnings "utf8"; # But handle should be utf8 511 if ($@ || !defined &DynaLoader::boot_DynaLoader) { 512 # Data::Dumper will load on miniperl, but fail when used in 513 # anger as it tries to load B. I'd prefer to keep the 514 # regular calls below outside of an eval so that real 515 # (unknown) failures get spotted, not ignored. 516 print "not ok $testname$todo ($study) $input => '$got', match=$match\n", _comment("$code\n"); 517 } 518 else { # better diagnostics 519 my $s = Data::Dumper->new([$subject],['subject'])->Useqq(1)->Dump; 520 my $g = Data::Dumper->new([$got],['got'])->Useqq(1)->Dump; 521 my $e = Data::Dumper->new([$expect],['expected'])->Useqq(1)->Dump; 522 print "not ok $testname$todo ($study) $input => '$got', match=$match\n", _comment("$s\n$code\n$g\n$e\n"); 523 } 524 next TEST; 525 } 526 } 527 } 528 print "ok $testname$todo\n"; 529} 530 531printf "1..%d\n# $iters iterations\n", scalar @tests; 532 5331; 534