1#!./perl 2 3# The tests are in a separate file 't/re/re_tests'. 4# Each line in that file is a separate test. 5# There are five columns, separated by tabs. 6# An optional sixth column is used to give a reason, only when skipping tests 7# 8# Column 1 contains the pattern, optionally enclosed in C<''> C<::> or 9# C<//>. Modifiers can be put after the closing delimiter. C<''> will 10# automatically be added to any other patterns. 11# 12# Column 2 contains the string to be matched. 13# 14# Column 3 contains the expected result: 15# y expect a match 16# n expect no match 17# c expect an error 18# T the test is a TODO (can be combined with y/n/c) 19# M skip test on miniperl (combine with y/n/c/T) 20# B test exposes a known bug in Perl, should be skipped 21# b test exposes a known bug in Perl, should be skipped if noamp 22# t test exposes a bug with threading, TODO if qr_embed_thr 23# s test should only be run for regex_sets_compat.t 24# S test should not be run for regex_sets_compat.t 25# a test should only be run on ASCII platforms 26# e test should only be run on EBCDIC platforms 27# 28# Columns 4 and 5 are used only if column 3 contains C<y> or C<c>. 29# 30# Column 4 contains a string, usually C<$&>. 31# 32# Column 5 contains the expected result of double-quote 33# interpolating that string after the match, or start of error message. 34# 35# Column 6, if present, contains a reason why the test is skipped. 36# This is printed with "skipped", for harness to pick up. 37# 38# Column 7 can be used for comments 39# 40# \n in the tests are interpolated, as are variables of the form ${\w+}. 41# 42# Blanks lines are treated as PASSING tests to keep the line numbers 43# linked to the test number. 44# 45# If you want to add a regular expression test that can't be expressed 46# in this format, don't add it here: put it in re/pat.t instead. 47# 48# Note that the inputs get passed on as "m're'", so the re bypasses the lexer. 49# This means this file cannot be used for testing anything that the lexer 50# handles; in 5.12 this means just \N{NAME} and \N{U+...}. 51# 52# Note that columns 2,3 and 5 are all enclosed in double quotes and then 53# evalled; so something like a\"\x{100}$1 has length 3+length($1). 54# 55# \x... and \o{...} constants are automatically converted to the native 56# character set if necessary. \[0-7] constants aren't 57 58my ($file, $iters); 59BEGIN { 60 $iters = shift || 1; # Poor man performance suite, 10000 is OK. 61 62 # Do this open before any chdir 63 $file = shift; 64 if (defined $file) { 65 open TESTS, $file or die "Can't open $file"; 66 } 67 68 chdir 't' if -d 't'; 69 @INC = qw '../lib ../ext/re'; 70 if (!defined &DynaLoader::boot_DynaLoader) { # miniperl 71 print("1..0 # Skip Unicode tables not built yet\n"), exit 72 unless eval 'require "unicore/Heavy.pl"'; 73 } 74 75 # Some of the tests need a locale; which one doesn't much matter, except 76 # that it be valid. Make sure of that 77 eval { require POSIX; 78 POSIX->import(qw(LC_ALL setlocale)); 79 POSIX::setlocale(&LC_ALL, "C"); 80 }; 81} 82 83sub _comment { 84 return map { /^#/ ? "$_\n" : "# $_\n" } 85 map { split /\n/ } @_; 86} 87 88sub convert_from_ascii { 89 my $string = shift; 90 91 #my $save = $string; 92 # Convert \x{...}, \o{...} 93 $string =~ s/ (?<! \\ ) \\x\{ ( .*? ) } / "\\x{" . sprintf("%X", utf8::unicode_to_native(hex $1)) . "}" /gex; 94 $string =~ s/ (?<! \\ ) \\o\{ ( .*? ) } / "\\o{" . sprintf("%o", utf8::unicode_to_native(oct $1)) . "}" /gex; 95 96 # Convert \xAB 97 $string =~ s/ (?<! \\ ) \\x ( [A-Fa-f0-9]{2} ) / "\\x" . sprintf("%02X", utf8::unicode_to_native(hex $1)) /gex; 98 99 # Convert \xA 100 $string =~ s/ (?<! \\ ) \\x ( [A-Fa-f0-9] ) (?! [A-Fa-f0-9] ) / "\\x" . sprintf("%X", utf8::unicode_to_native(hex $1)) /gex; 101 102 #print STDERR __LINE__, ": $save\n$string\n" if $save ne $string; 103 return $string; 104} 105 106use strict; 107use warnings FATAL=>"all"; 108our ($bang, $ffff, $nulnul); # used by the tests 109our ($qr, $skip_amp, $qr_embed, $qr_embed_thr, $regex_sets, $alpha_assertions, $no_null); # set by our callers 110 111my $expanded_text = "expanded name from original test number"; 112my $expanded_text_re = qr/$expanded_text/; 113 114if (!defined $file) { 115 open TESTS, 're/re_tests' or die "Can't open re/re_tests: $!"; 116} 117 118my @tests = <TESTS>; 119 120close TESTS; 121 122$bang = sprintf "\\%03o", ord "!"; # \41 would not be portable. 123$ffff = chr(0xff) x 2; 124$nulnul = "\0" x 2; 125my $OP = $qr ? 'qr' : 'm'; 126 127$| = 1; 128 129my $test; 130TEST: 131foreach (@tests) { 132 $test++; 133 if (!/\S/ || /^\s*#/ || /^__END__$/) { 134 chomp; 135 my ($not,$comment)= split /\s*#\s*/, $_, 2; 136 $comment ||= "(blank line)"; 137 print "ok $test # $comment\n"; 138 next; 139 } 140 chomp; 141 s/\\n/\n/g unless $regex_sets; 142 my ($pat, $subject, $result, $repl, $expect, $reason, $comment) = split(/\t/,$_,7); 143 $comment = "" unless defined $comment; 144 if (!defined $subject) { 145 die "Bad test definition on line $test: $_\n"; 146 } 147 $reason = '' unless defined $reason; 148 my $input = join(':',$pat,$subject,$result,$repl,$expect); 149 150 # the double '' below keeps simple syntax highlighters from going crazy 151 $pat = "'$pat'" unless $pat =~ /^[:''\/]/; 152 $pat =~ s/(\$\{\w+\})/$1/eeg; 153 $pat =~ s/\\n/\n/g unless $regex_sets; 154 $pat = convert_from_ascii($pat) if ord("A") != 65; 155 156 my $no_null_pat; 157 if ($no_null && $pat =~ /^'(.*)'\z/) { 158 $no_null_pat = XS::APItest::string_without_null($1); 159 } 160 161 $subject = convert_from_ascii($subject) if ord("A") != 65; 162 $subject = eval qq("$subject"); die $@ if $@; 163 164 $expect = convert_from_ascii($expect) if ord("A") != 65; 165 $expect = eval qq("$expect"); die $@ if $@; 166 $expect = $repl = '-' if $skip_amp and $input =~ /\$[&\`\']/; 167 168 my $todo_qr = $qr_embed_thr && ($result =~ s/t//); 169 my $skip = ($skip_amp ? ($result =~ s/B//i) : ($result =~ s/B//)); 170 ++$skip if $result =~ s/M// && !defined &DynaLoader::boot_DynaLoader; 171 if ($result =~ s/ ( [Ss] ) //x) { 172 if (($1 eq 'S' && $regex_sets) || ($1 eq 's' && ! $regex_sets)) { 173 $skip++; 174 $reason = "Test not valid for $0"; 175 } 176 } 177 if ($result =~ s/a// && ord("A") != 65) { 178 $skip++; 179 $reason = "Test is only valid for ASCII platforms. $reason"; 180 } 181 if ($result =~ s/e// && ord("A") != 193) { 182 $skip++; 183 $reason = "Test is only valid for EBCDIC platforms. $reason"; 184 } 185 $reason = 'skipping $&' if $reason eq '' && $skip_amp; 186 $result =~ s/B//i unless $skip; 187 my $todo= $result =~ s/T// ? " # TODO" : ""; 188 my $testname= $test; 189 if ($comment) { 190 $comment=~s/^\s*(?:#\s*)?//; 191 $testname .= " - $comment" if $comment; 192 } 193 if (! $skip && $alpha_assertions) { 194 my $assertions_re = qr/ (?: \Q(?\E (?: > | <? [=>] ) ) /x; 195 if ($pat !~ $assertions_re && $comment !~ $expanded_text_re) { 196 $skip++; 197 $reason = "Pattern doesn't contain assertions"; 198 } 199 elsif ($comment !~ $expanded_text_re) { 200 my $expanded_pat = $pat; 201 202 $pat =~ s/\( \? > /(*atomic:/xg; 203 204 if ($pat =~ s/\( \? = /(*pla:/xg) { 205 $expanded_pat =~ s//(*positive_lookahead:/g; 206 } 207 if ($pat =~ s/\( \? ! /(*nla:/xg) { 208 $expanded_pat =~ s//(*negative_lookahead:/g; 209 } 210 if ($pat =~ s/\( \? <= /(*plb:/xg) { 211 $expanded_pat =~ s//(*positive_lookbehind:/g; 212 } 213 if ($pat =~ s/\( \? <! /(*nlb:/xg) { 214 $expanded_pat =~ s//(*negative_lookbehind:/g; 215 } 216 if ($expanded_pat ne $pat) { 217 $comment .= " $expanded_text $test"; 218 push @tests, join "\t", $expanded_pat, 219 $subject // "", 220 $result // "", 221 $repl // "", 222 $expect // "", 223 $reason // "", 224 $comment; 225 } 226 } 227 } 228 elsif (! $skip && $regex_sets) { 229 230 # If testing regex sets, change the [bracketed] classes into 231 # (?[bracketed]). But note that '\[' and '\c[' don't introduce such a 232 # class. (We don't bother looking for an odd number of backslashes, 233 # as this hasn't been needed so far.) 234 if ($pat !~ / (?<!\\c) (?<!\\) \[ /x) { 235 $skip++; 236 $reason = "Pattern doesn't contain [brackets]"; 237 } 238 else { # Use non-regex features of Perl to accomplish this. 239 my $modified = ""; 240 my $in_brackets = 0; 241 242 # Go through the pattern character-by-character. We also add 243 # blanks around each token to test the /x parts of (?[ ]) 244 my $pat_len = length($pat); 245 CHAR: for (my $i = 0; $i < $pat_len; $i++) { 246 my $curchar = substr($pat, $i, 1); 247 if ($curchar eq '\\') { 248 $modified .= " " if $in_brackets; 249 $modified .= $curchar; 250 $i++; 251 252 # Get the character the backslash is escaping 253 $curchar = substr($pat, $i, 1); 254 $modified .= $curchar; 255 256 # If the character following that is a '{}', treat the 257 # entire amount as a single token 258 if ($i < $pat_len -1 && substr($pat, $i+1, 1) eq '{') { 259 my $j = index($pat, '}', $i+2); 260 if ($j < 0) { 261 last unless $in_brackets; 262 if ($result eq 'c') { 263 $skip++; 264 $reason = "Can't handle compilation errors with unmatched '{'"; 265 } 266 else { 267 print "not ok $testname # Problem in $0; original = '$pat'; mod = '$modified'\n"; 268 next TEST; 269 } 270 } 271 $modified .= substr($pat, $i+1, $j - $i); 272 $i = $j; 273 } 274 elsif ($curchar eq 'x') { 275 276 # \x without brackets is supposed to be followed by 2 277 # hex digits. Take up to 2, and then add a blank 278 # after the last one. This avoids getting errors from 279 # (?[ ]) for run-ons, like \xabc 280 my $j = $i + 1; 281 for (; $j < $i + 3 && $j < $pat_len; $j++) { 282 my $curord = ord(substr($pat, $j, 1)); 283 if (!(($curord >= ord("A") && $curord <= ord("F")) 284 || ($curord >= ord("a") && $curord <= ord("f")) 285 || ($curord >= ord("0") && $curord <= ord("9")))) 286 { 287 $j++; 288 last; 289 } 290 } 291 $j--; 292 $modified .= substr($pat, $i + 1, $j - $i); 293 $modified .= " " if $in_brackets; 294 $i = $j; 295 } 296 elsif (ord($curchar) >= ord('0') 297 && (ord($curchar) <= ord('7'))) 298 { 299 # Similarly, octal constants have up to 3 digits. 300 my $j = $i + 1; 301 for (; $j < $i + 3 && $j < $pat_len; $j++) { 302 my $curord = ord(substr($pat, $j, 1)); 303 if (! ($curord >= ord("0") && $curord <= ord("7"))) { 304 $j++; 305 last; 306 } 307 } 308 $j--; 309 $modified .= substr($pat, $i + 1, $j - $i); 310 $i = $j; 311 } 312 313 next; 314 } # End of processing a backslash sequence 315 316 if (! $in_brackets # Skip (?{ }) 317 && $curchar eq '(' 318 && $i < $pat_len - 2 319 && substr($pat, $i+1, 1) eq '?' 320 && substr($pat, $i+2, 1) eq '{') 321 { 322 $skip++; 323 $reason = "Pattern contains '(?{'"; 324 last; 325 } 326 327 # Closing ']' 328 if ($curchar eq ']' && $in_brackets) { 329 $modified .= " ] ])"; 330 $in_brackets = 0; 331 next; 332 } 333 334 # A regular character. 335 if ($curchar ne '[') { 336 $modified .= " " if $in_brackets; 337 $modified .= $curchar; 338 next; 339 } 340 341 # Here is a '['; If not in a bracketed class, treat as the 342 # beginning of one. 343 if (! $in_brackets) { 344 $in_brackets = 1; 345 $modified .= "(?[ [ "; 346 347 # An immediately following ']' or '^]' is not the ending 348 # of the class, but is to be treated literally. 349 if ($i < $pat_len - 1 350 && substr($pat, $i+1, 1) eq ']') 351 { 352 $i ++; 353 $modified .= " ] "; 354 } 355 elsif ($i < $pat_len - 2 356 && substr($pat, $i+1, 1) eq '^' 357 && substr($pat, $i+2, 1) eq ']') 358 { 359 $i += 2; 360 $modified .= " ^ ] "; 361 } 362 next; 363 } 364 365 # Here is a plain '[' within [ ]. Could mean wants to 366 # match a '[', or it could be a posix class that has a 367 # corresponding ']'. Absorb either 368 369 $modified .= ' ['; 370 last if $i >= $pat_len - 1; 371 372 $i++; 373 $curchar = substr($pat, $i, 1); 374 if ($curchar =~ /[:=.]/) { 375 for (my $j = $i + 1; $j < $pat_len; $j++) { 376 next unless substr($pat, $j, 1) eq ']'; 377 last if $j - $i < 2; 378 if (substr($pat, $j - 1, 1) eq $curchar) { 379 # Here, is a posix class 380 $modified .= substr($pat, $i, $j - $i + 1) . " "; 381 $i = $j; 382 next CHAR; 383 } 384 } 385 } 386 387 # Here wasn't a posix class, just process normally 388 $modified .= " $curchar "; 389 } 390 391 if ($in_brackets && ! $skip) { 392 if ($result eq 'c') { 393 $skip++; 394 $reason = "Can't figure out where to put the (?[ and ]) since is a compilation error"; 395 } 396 else { 397 print "not ok $testname # Problem in $0; original = '$pat'; mod = '$modified'\n"; 398 next TEST; 399 } 400 } 401 402 # Use our modified pattern instead of the original 403 $pat = $modified; 404 } 405 } 406 407 for my $study ('', 'study $subject;', 'utf8::upgrade($subject);', 408 'utf8::upgrade($subject); study $subject;') { 409 # Need to make a copy, else the utf8::upgrade of an already studied 410 # scalar confuses things. 411 my $subject = $subject; 412 $subject = XS::APItest::string_without_null($subject) if $no_null; 413 my $c = $iters; 414 my ($code, $match, $got); 415 if ($repl eq 'pos') { 416 my $patcode = defined $no_null_pat ? '/$no_null_pat/g' 417 : "m${pat}g"; 418 $code= <<EOFCODE; 419 $study 420 pos(\$subject)=0; 421 \$match = ( \$subject =~ $patcode ); 422 \$got = pos(\$subject); 423EOFCODE 424 } 425 elsif ($qr_embed) { 426 $code= <<EOFCODE; 427 my \$RE = qr$pat; 428 $study 429 \$match = (\$subject =~ /(?:)\$RE(?:)/) while \$c--; 430 \$got = "$repl"; 431EOFCODE 432 } 433 elsif ($qr_embed_thr) { 434 $code= <<EOFCODE; 435 # Can't run the match in a subthread, but can do this and 436 # clone the pattern the other way. 437 my \$RE = threads->new(sub {qr$pat})->join(); 438 $study 439 \$match = (\$subject =~ /(?:)\$RE(?:)/) while \$c--; 440 \$got = "$repl"; 441EOFCODE 442 } 443 elsif ($no_null) { 444 my $patcode = defined $no_null_pat ? '/$no_null_pat/' 445 : $pat; 446 $code= <<EOFCODE; 447 $study 448 \$match = (\$subject =~ $OP$pat) while \$c--; 449 \$got = "$repl"; 450EOFCODE 451 } 452 else { 453 $code= <<EOFCODE; 454 $study 455 \$match = (\$subject =~ $OP$pat) while \$c--; 456 \$got = "$repl"; 457EOFCODE 458 } 459 $code = "no warnings 'experimental::regex_sets';$code" if $regex_sets; 460 $code = "no warnings 'experimental::alpha_assertions';$code" if $alpha_assertions; 461 #$code.=qq[\n\$expect="$expect";\n]; 462 #use Devel::Peek; 463 #die Dump($code) if $pat=~/\\h/ and $subject=~/\x{A0}/; 464 { 465 # Probably we should annotate specific tests with which warnings 466 # categories they're known to trigger, and hence should be 467 # disabled just for that test 468 no warnings qw(uninitialized regexp deprecated); 469 eval $code; 470 } 471 chomp( my $err = $@ ); 472 if ( $skip ) { 473 print "ok $testname # skipped", length($reason) ? ". $reason" : '', "\n"; 474 next TEST; 475 } 476 elsif ($result eq 'c') { 477 if ($err !~ m!^\Q$expect!) { print "not ok $testname$todo (compile) $input => '$err'\n"; next TEST } 478 last; # no need to study a syntax error 479 } 480 elsif ( $todo_qr ) { 481 print "not ok $testname # TODO", length($reason) ? " - $reason" : '', "\n"; 482 next TEST; 483 } 484 elsif ($@) { 485 print "not ok $testname$todo $input => error '$err'\n", _comment("$code\n$@\n"); next TEST; 486 } 487 elsif ($result =~ /^n/) { 488 if ($match) { print "not ok $testname$todo ($study) $input => false positive\n"; next TEST } 489 } 490 else { 491 if (!$match || $got ne $expect) { 492 eval { require Data::Dumper }; 493 no warnings "utf8"; # But handle should be utf8 494 if ($@ || !defined &DynaLoader::boot_DynaLoader) { 495 # Data::Dumper will load on miniperl, but fail when used in 496 # anger as it tries to load B. I'd prefer to keep the 497 # regular calls below outside of an eval so that real 498 # (unknown) failures get spotted, not ignored. 499 print "not ok $testname$todo ($study) $input => '$got', match=$match\n", _comment("$code\n"); 500 } 501 else { # better diagnostics 502 my $s = Data::Dumper->new([$subject],['subject'])->Useqq(1)->Dump; 503 my $g = Data::Dumper->new([$got],['got'])->Useqq(1)->Dump; 504 my $e = Data::Dumper->new([$expect],['expected'])->Useqq(1)->Dump; 505 print "not ok $testname$todo ($study) $input => '$got', match=$match\n", _comment("$s\n$code\n$g\n$e\n"); 506 } 507 next TEST; 508 } 509 } 510 } 511 print "ok $testname$todo\n"; 512} 513 514printf "1..%d\n# $iters iterations\n", scalar @tests; 515 5161; 517