1#!./perl 2 3# The tests are in a separate file 't/re/re_tests'. 4# Each line in that file is a separate test. 5# There are five columns, separated by tabs. 6# An optional sixth column is used to give a reason, only when skipping tests 7# 8# Column 1 contains the pattern, optionally enclosed in C<''>. 9# Modifiers can be put after the closing C<'>. 10# 11# Column 2 contains the string to be matched. 12# 13# Column 3 contains the expected result: 14# y expect a match 15# n expect no match 16# c expect an error 17# T the test is a TODO (can be combined with y/n/c) 18# M skip test on miniperl (combine with y/n/c/T) 19# B test exposes a known bug in Perl, should be skipped 20# b test exposes a known bug in Perl, should be skipped if noamp 21# t test exposes a bug with threading, TODO if qr_embed_thr 22# s test should only be run for regex_sets_compat.t 23# S test should not be run for regex_sets_compat.t 24# a test should only be run on ASCII platforms 25# e test should only be run on EBCDIC platforms 26# 27# Columns 4 and 5 are used only if column 3 contains C<y> or C<c>. 28# 29# Column 4 contains a string, usually C<$&>. 30# 31# Column 5 contains the expected result of double-quote 32# interpolating that string after the match, or start of error message. 33# 34# Column 6, if present, contains a reason why the test is skipped. 35# This is printed with "skipped", for harness to pick up. 36# 37# Column 7 can be used for comments 38# 39# \n in the tests are interpolated, as are variables of the form ${\w+}. 40# 41# Blanks lines are treated as PASSING tests to keep the line numbers 42# linked to the test number. 43# 44# If you want to add a regular expression test that can't be expressed 45# in this format, don't add it here: put it in re/pat.t instead. 46# 47# Note that the inputs get passed on as "m're'", so the re bypasses the lexer. 48# This means this file cannot be used for testing anything that the lexer 49# handles; in 5.12 this means just \N{NAME} and \N{U+...}. 50# 51# Note that columns 2,3 and 5 are all enclosed in double quotes and then 52# evalled; so something like a\"\x{100}$1 has length 3+length($1). 53# 54# \x... and \o{...} constants are automatically converted to the native 55# character set if necessary. \[0-7] constants aren't 56 57my ($file, $iters); 58BEGIN { 59 $iters = shift || 1; # Poor man performance suite, 10000 is OK. 60 61 # Do this open before any chdir 62 $file = shift; 63 if (defined $file) { 64 open TESTS, $file or die "Can't open $file"; 65 } 66 67 chdir 't' if -d 't'; 68 @INC = qw '../lib ../ext/re'; 69 if (!defined &DynaLoader::boot_DynaLoader) { # miniperl 70 print("1..0 # Skip Unicode tables not built yet\n"), exit 71 unless eval 'require "unicore/Heavy.pl"'; 72 } 73} 74 75sub _comment { 76 return map { /^#/ ? "$_\n" : "# $_\n" } 77 map { split /\n/ } @_; 78} 79 80sub convert_from_ascii { 81 my $string = shift; 82 83 #my $save = $string; 84 # Convert \x{...}, \o{...} 85 $string =~ s/ (?<! \\ ) \\x\{ ( .*? ) } / "\\x{" . sprintf("%X", utf8::unicode_to_native(hex $1)) . "}" /gex; 86 $string =~ s/ (?<! \\ ) \\o\{ ( .*? ) } / "\\o{" . sprintf("%o", utf8::unicode_to_native(oct $1)) . "}" /gex; 87 88 # Convert \xAB 89 $string =~ s/ (?<! \\ ) \\x ( [A-Fa-f0-9]{2} ) / "\\x" . sprintf("%02X", utf8::unicode_to_native(hex $1)) /gex; 90 91 # Convert \xA 92 $string =~ s/ (?<! \\ ) \\x ( [A-Fa-f0-9] ) (?! [A-Fa-f0-9] ) / "\\x" . sprintf("%X", utf8::unicode_to_native(hex $1)) /gex; 93 94 #print STDERR __LINE__, ": $save\n$string\n" if $save ne $string; 95 return $string; 96} 97 98use strict; 99use warnings FATAL=>"all"; 100use vars qw($bang $ffff $nulnul); # used by the tests 101use vars qw($qr $skip_amp $qr_embed $qr_embed_thr $regex_sets); # set by our callers 102 103 104 105if (!defined $file) { 106 open TESTS, 're/re_tests' or die "Can't open re/re_tests: $!"; 107} 108 109my @tests = <TESTS>; 110 111close TESTS; 112 113$bang = sprintf "\\%03o", ord "!"; # \41 would not be portable. 114$ffff = chr(0xff) x 2; 115$nulnul = "\0" x 2; 116my $OP = $qr ? 'qr' : 'm'; 117 118$| = 1; 119printf "1..%d\n# $iters iterations\n", scalar @tests; 120 121my $test; 122TEST: 123foreach (@tests) { 124 $test++; 125 if (!/\S/ || /^\s*#/ || /^__END__$/) { 126 chomp; 127 my ($not,$comment)= split /\s*#\s*/, $_, 2; 128 $comment ||= "(blank line)"; 129 print "ok $test # $comment\n"; 130 next; 131 } 132 chomp; 133 s/\\n/\n/g unless $regex_sets; 134 my ($pat, $subject, $result, $repl, $expect, $reason, $comment) = split(/\t/,$_,7); 135 if (!defined $subject) { 136 die "Bad test definition on line $test: $_\n"; 137 } 138 $reason = '' unless defined $reason; 139 my $input = join(':',$pat,$subject,$result,$repl,$expect); 140 141 # the double '' below keeps simple syntax highlighters from going crazy 142 $pat = "'$pat'" unless $pat =~ /^[:''\/]/; 143 $pat =~ s/(\$\{\w+\})/$1/eeg; 144 $pat =~ s/\\n/\n/g unless $regex_sets; 145 $pat = convert_from_ascii($pat) if ord("A") != 65; 146 147 $subject = convert_from_ascii($subject) if ord("A") != 65; 148 $subject = eval qq("$subject"); die $@ if $@; 149 150 $expect = convert_from_ascii($expect) if ord("A") != 65; 151 $expect = eval qq("$expect"); die $@ if $@; 152 $expect = $repl = '-' if $skip_amp and $input =~ /\$[&\`\']/; 153 154 my $todo_qr = $qr_embed_thr && ($result =~ s/t//); 155 my $skip = ($skip_amp ? ($result =~ s/B//i) : ($result =~ s/B//)); 156 ++$skip if $result =~ s/M// && !defined &DynaLoader::boot_DynaLoader; 157 if ($result =~ s/ ( [Ss] ) //x) { 158 if (($1 eq 'S' && $regex_sets) || ($1 eq 's' && ! $regex_sets)) { 159 $skip++; 160 $reason = "Test not valid for $0"; 161 } 162 } 163 if ($result =~ s/a// && ord("A") != 65) { 164 $skip++; 165 $reason = "Test is only valid for ASCII platforms. $reason"; 166 } 167 if ($result =~ s/e// && ord("A") != 193) { 168 $skip++; 169 $reason = "Test is only valid for EBCDIC platforms. $reason"; 170 } 171 $reason = 'skipping $&' if $reason eq '' && $skip_amp; 172 $result =~ s/B//i unless $skip; 173 my $todo= $result =~ s/T// ? " # TODO" : ""; 174 my $testname= $test; 175 if ($comment) { 176 $comment=~s/^\s*(?:#\s*)?//; 177 $testname .= " - $comment" if $comment; 178 } 179 if (! $skip && $regex_sets) { 180 181 # If testing regex sets, change the [bracketed] classes into 182 # (?[bracketed]). But note that '\[' and '\c[' don't introduce such a 183 # class. (We don't bother looking for an odd number of backslashes, 184 # as this hasn't been needed so far.) 185 if ($pat !~ / (?<!\\c) (?<!\\) \[ /x) { 186 $skip++; 187 $reason = "Pattern doesn't contain [brackets]"; 188 } 189 else { # Use non-regex features of Perl to accomplish this. 190 my $modified = ""; 191 my $in_brackets = 0; 192 193 # Go through the pattern character-by-character. We also add 194 # blanks around each token to test the /x parts of (?[ ]) 195 my $pat_len = length($pat); 196 CHAR: for (my $i = 0; $i < $pat_len; $i++) { 197 my $curchar = substr($pat, $i, 1); 198 if ($curchar eq '\\') { 199 $modified .= " " if $in_brackets; 200 $modified .= $curchar; 201 $i++; 202 203 # Get the character the backslash is escaping 204 $curchar = substr($pat, $i, 1); 205 $modified .= $curchar; 206 207 # If the character following that is a '{}', treat the 208 # entire amount as a single token 209 if ($i < $pat_len -1 && substr($pat, $i+1, 1) eq '{') { 210 my $j = index($pat, '}', $i+2); 211 if ($j < 0) { 212 last unless $in_brackets; 213 if ($result eq 'c') { 214 $skip++; 215 $reason = "Can't handle compilation errors with unmatched '{'"; 216 } 217 else { 218 print "not ok $testname # Problem in $0; original = '$pat'; mod = '$modified'\n"; 219 next TEST; 220 } 221 } 222 $modified .= substr($pat, $i+1, $j - $i); 223 $i = $j; 224 } 225 elsif ($curchar eq 'x') { 226 227 # \x without brackets is supposed to be followed by 2 228 # hex digits. Take up to 2, and then add a blank 229 # after the last one. This avoids getting errors from 230 # (?[ ]) for run-ons, like \xabc 231 my $j = $i + 1; 232 for (; $j < $i + 3 && $j < $pat_len; $j++) { 233 my $curord = ord(substr($pat, $j, 1)); 234 if (!(($curord >= ord("A") && $curord <= ord("F")) 235 || ($curord >= ord("a") && $curord <= ord("f")) 236 || ($curord >= ord("0") && $curord <= ord("9")))) 237 { 238 $j++; 239 last; 240 } 241 } 242 $j--; 243 $modified .= substr($pat, $i + 1, $j - $i) . " "; 244 $i = $j; 245 } 246 elsif (ord($curchar) >= ord('0') 247 && (ord($curchar) <= ord('7'))) 248 { 249 # Similarly, octal constants have up to 3 digits. 250 my $j = $i + 1; 251 for (; $j < $i + 3 && $j < $pat_len; $j++) { 252 my $curord = ord(substr($pat, $j, 1)); 253 if (! ($curord >= ord("0") && $curord <= ord("7"))) { 254 $j++; 255 last; 256 } 257 } 258 $j--; 259 $modified .= substr($pat, $i + 1, $j - $i); 260 $i = $j; 261 } 262 263 next; 264 } # End of processing a backslash sequence 265 266 if (! $in_brackets # Skip (?{ }) 267 && $curchar eq '(' 268 && $i < $pat_len - 2 269 && substr($pat, $i+1, 1) eq '?' 270 && substr($pat, $i+2, 1) eq '{') 271 { 272 $skip++; 273 $reason = "Pattern contains '(?{'"; 274 last; 275 } 276 277 # Closing ']' 278 if ($curchar eq ']' && $in_brackets) { 279 $modified .= " ] ])"; 280 $in_brackets = 0; 281 next; 282 } 283 284 # A regular character. 285 if ($curchar ne '[') { 286 if (! $in_brackets) { 287 $modified .= $curchar; 288 } 289 else { 290 $modified .= " $curchar "; 291 } 292 next; 293 } 294 295 # Here is a '['; If not in a bracketed class, treat as the 296 # beginning of one. 297 if (! $in_brackets) { 298 $in_brackets = 1; 299 $modified .= "(?[ [ "; 300 301 # An immediately following ']' or '^]' is not the ending 302 # of the class, but is to be treated literally. 303 if ($i < $pat_len - 1 304 && substr($pat, $i+1, 1) eq ']') 305 { 306 $i ++; 307 $modified .= " ] "; 308 } 309 elsif ($i < $pat_len - 2 310 && substr($pat, $i+1, 1) eq '^' 311 && substr($pat, $i+2, 1) eq ']') 312 { 313 $i += 2; 314 $modified .= " ^ ] "; 315 } 316 next; 317 } 318 319 # Here is a plain '[' within [ ]. Could mean wants to 320 # match a '[', or it could be a posix class that has a 321 # corresponding ']'. Absorb either 322 323 $modified .= ' ['; 324 last if $i >= $pat_len - 1; 325 326 $i++; 327 $curchar = substr($pat, $i, 1); 328 if ($curchar =~ /[:=.]/) { 329 for (my $j = $i + 1; $j < $pat_len; $j++) { 330 next unless substr($pat, $j, 1) eq ']'; 331 last if $j - $i < 2; 332 if (substr($pat, $j - 1, 1) eq $curchar) { 333 # Here, is a posix class 334 $modified .= substr($pat, $i, $j - $i + 1) . " "; 335 $i = $j; 336 next CHAR; 337 } 338 } 339 } 340 341 # Here wasn't a posix class, just process normally 342 $modified .= " $curchar "; 343 } 344 345 if ($in_brackets && ! $skip) { 346 if ($result eq 'c') { 347 $skip++; 348 $reason = "Can't figure out where to put the (?[ and ]) since is a compilation error"; 349 } 350 else { 351 print "not ok $testname # Problem in $0; original = '$pat'; mod = '$modified'\n"; 352 next TEST; 353 } 354 } 355 356 # Use our modified pattern instead of the original 357 $pat = $modified; 358 } 359 } 360 361 for my $study ('', 'study $subject', 'utf8::upgrade($subject)', 362 'utf8::upgrade($subject); study $subject') { 363 # Need to make a copy, else the utf8::upgrade of an already studied 364 # scalar confuses things. 365 my $subject = $subject; 366 my $c = $iters; 367 my ($code, $match, $got); 368 if ($repl eq 'pos') { 369 $code= <<EOFCODE; 370 $study; 371 pos(\$subject)=0; 372 \$match = ( \$subject =~ m${pat}g ); 373 \$got = pos(\$subject); 374EOFCODE 375 } 376 elsif ($qr_embed) { 377 $code= <<EOFCODE; 378 my \$RE = qr$pat; 379 $study; 380 \$match = (\$subject =~ /(?:)\$RE(?:)/) while \$c--; 381 \$got = "$repl"; 382EOFCODE 383 } 384 elsif ($qr_embed_thr) { 385 $code= <<EOFCODE; 386 # Can't run the match in a subthread, but can do this and 387 # clone the pattern the other way. 388 my \$RE = threads->new(sub {qr$pat})->join(); 389 $study; 390 \$match = (\$subject =~ /(?:)\$RE(?:)/) while \$c--; 391 \$got = "$repl"; 392EOFCODE 393 } 394 else { 395 $code= <<EOFCODE; 396 $study; 397 \$match = (\$subject =~ $OP$pat) while \$c--; 398 \$got = "$repl"; 399EOFCODE 400 } 401 $code = "no warnings 'experimental::regex_sets';$code" if $regex_sets; 402 #$code.=qq[\n\$expect="$expect";\n]; 403 #use Devel::Peek; 404 #die Dump($code) if $pat=~/\\h/ and $subject=~/\x{A0}/; 405 { 406 # Probably we should annotate specific tests with which warnings 407 # categories they're known to trigger, and hence should be 408 # disabled just for that test 409 no warnings qw(uninitialized regexp deprecated); 410 eval $code; 411 } 412 chomp( my $err = $@ ); 413 if ( $skip ) { 414 print "ok $testname # skipped", length($reason) ? ". $reason" : '', "\n"; 415 next TEST; 416 } 417 elsif ($result eq 'c') { 418 if ($err !~ m!^\Q$expect!) { print "not ok $testname$todo (compile) $input => '$err'\n"; next TEST } 419 last; # no need to study a syntax error 420 } 421 elsif ( $todo_qr ) { 422 print "not ok $testname # TODO", length($reason) ? " - $reason" : '', "\n"; 423 next TEST; 424 } 425 elsif ($@) { 426 print "not ok $testname$todo $input => error '$err'\n", _comment("$code\n$@\n"); next TEST; 427 } 428 elsif ($result =~ /^n/) { 429 if ($match) { print "not ok $testname$todo ($study) $input => false positive\n"; next TEST } 430 } 431 else { 432 if (!$match || $got ne $expect) { 433 eval { require Data::Dumper }; 434 no warnings "utf8"; # But handle should be utf8 435 if ($@ || !defined &DynaLoader::boot_DynaLoader) { 436 # Data::Dumper will load on miniperl, but fail when used in 437 # anger as it tries to load B. I'd prefer to keep the 438 # regular calls below outside of an eval so that real 439 # (unknown) failures get spotted, not ignored. 440 print "not ok $testname$todo ($study) $input => '$got', match=$match\n", _comment("$code\n"); 441 } 442 else { # better diagnostics 443 my $s = Data::Dumper->new([$subject],['subject'])->Useqq(1)->Dump; 444 my $g = Data::Dumper->new([$got],['got'])->Useqq(1)->Dump; 445 print "not ok $testname$todo ($study) $input => '$got', match=$match\n", _comment("$s\n$g\n$code\n"); 446 } 447 next TEST; 448 } 449 } 450 } 451 print "ok $testname$todo\n"; 452} 453 4541; 455