1#!./perl 2 3# The tests are in a separate file 't/re/re_tests'. 4# Each line in that file is a separate test. 5# There are five columns, separated by tabs. 6# 7# Column 1 contains the pattern, optionally enclosed in C<''>. 8# Modifiers can be put after the closing C<'>. 9# 10# Column 2 contains the string to be matched. 11# 12# Column 3 contains the expected result: 13# y expect a match 14# n expect no match 15# c expect an error 16# T the test is a TODO (can be combined with y/n/c) 17# M skip test on miniperl (combine with y/n/c/T) 18# B test exposes a known bug in Perl, should be skipped 19# b test exposes a known bug in Perl, should be skipped if noamp 20# t test exposes a bug with threading, TODO if qr_embed_thr 21# s test should only be run for regex_sets_compat.t 22# S test should not be run for regex_sets_compat.t 23# 24# Columns 4 and 5 are used only if column 3 contains C<y> or C<c>. 25# 26# Column 4 contains a string, usually C<$&>. 27# 28# Column 5 contains the expected result of double-quote 29# interpolating that string after the match, or start of error message. 30# 31# Column 6, if present, contains a reason why the test is skipped. 32# This is printed with "skipped", for harness to pick up. 33# 34# Column 7 can be used for comments 35# 36# \n in the tests are interpolated, as are variables of the form ${\w+}. 37# 38# Blanks lines are treated as PASSING tests to keep the line numbers 39# linked to the test number. 40# 41# If you want to add a regular expression test that can't be expressed 42# in this format, don't add it here: put it in re/pat.t instead. 43# 44# Note that the inputs get passed on as "m're'", so the re bypasses the lexer. 45# This means this file cannot be used for testing anything that the lexer 46# handles; in 5.12 this means just \N{NAME} and \N{U+...}. 47# 48# Note that columns 2,3 and 5 are all enclosed in double quotes and then 49# evalled; so something like a\"\x{100}$1 has length 3+length($1). 50 51my ($file, $iters); 52BEGIN { 53 $iters = shift || 1; # Poor man performance suite, 10000 is OK. 54 55 # Do this open before any chdir 56 $file = shift; 57 if (defined $file) { 58 open TESTS, $file or die "Can't open $file"; 59 } 60 61 chdir 't' if -d 't'; 62 @INC = '../lib'; 63 64} 65 66sub _comment { 67 return map { /^#/ ? "$_\n" : "# $_\n" } 68 map { split /\n/ } @_; 69} 70 71use strict; 72use warnings FATAL=>"all"; 73use vars qw($bang $ffff $nulnul); # used by the tests 74use vars qw($qr $skip_amp $qr_embed $qr_embed_thr $regex_sets); # set by our callers 75 76 77 78if (!defined $file) { 79 open TESTS, 're/re_tests' or die "Can't open re/re_tests: $!"; 80} 81 82my @tests = <TESTS>; 83 84close TESTS; 85 86$bang = sprintf "\\%03o", ord "!"; # \41 would not be portable. 87$ffff = chr(0xff) x 2; 88$nulnul = "\0" x 2; 89my $OP = $qr ? 'qr' : 'm'; 90 91$| = 1; 92printf "1..%d\n# $iters iterations\n", scalar @tests; 93 94my $test; 95TEST: 96foreach (@tests) { 97 $test++; 98 if (!/\S/ || /^\s*#/ || /^__END__$/) { 99 print "ok $test # (Blank line or comment)\n"; 100 if (/#/) { print $_ }; 101 next; 102 } 103 chomp; 104 s/\\n/\n/g unless $regex_sets; 105 my ($pat, $subject, $result, $repl, $expect, $reason) = split(/\t/,$_,6); 106 if (!defined $subject) { 107 die "Bad test definition on line $test: $_\n"; 108 } 109 $reason = '' unless defined $reason; 110 my $input = join(':',$pat,$subject,$result,$repl,$expect); 111 # the double '' below keeps simple syntax highlighters from going crazy 112 $pat = "'$pat'" unless $pat =~ /^[:''\/]/; 113 $pat =~ s/(\$\{\w+\})/$1/eeg; 114 $pat =~ s/\\n/\n/g unless $regex_sets; 115 $subject = eval qq("$subject"); die $@ if $@; 116 $expect = eval qq("$expect"); die $@ if $@; 117 $expect = $repl = '-' if $skip_amp and $input =~ /\$[&\`\']/; 118 my $todo_qr = $qr_embed_thr && ($result =~ s/t//); 119 my $skip = ($skip_amp ? ($result =~ s/B//i) : ($result =~ s/B//)); 120 ++$skip if $result =~ s/M// && !defined &DynaLoader::boot_DynaLoader; 121 if ($result =~ s/ ( [Ss] ) //x) { 122 if (($1 eq 'S' && $regex_sets) || ($1 eq 's' && ! $regex_sets)) { 123 $skip++; 124 $reason = "Test not valid for $0"; 125 } 126 } 127 $reason = 'skipping $&' if $reason eq '' && $skip_amp; 128 $result =~ s/B//i unless $skip; 129 my $todo= $result =~ s/T// ? " # TODO" : ""; 130 if (! $skip && $regex_sets) { 131 132 # If testing regex sets, change the [bracketed] classes into 133 # (?[bracketed]). 134 135 if ($pat !~ / \[ /x) { 136 137 $skip++; 138 $reason = "Pattern doesn't contain [brackets]"; 139 } 140 else { # Use non-regex features of Perl to accomplish this. 141 my $modified = ""; 142 my $in_brackets = 0; 143 144 # Go through the pattern character-by-character. We also add 145 # blanks around each token to test the /x parts of (?[ ]) 146 my $pat_len = length($pat); 147 CHAR: for (my $i = 0; $i < $pat_len; $i++) { 148 my $curchar = substr($pat, $i, 1); 149 if ($curchar eq '\\') { 150 $modified .= " " if $in_brackets; 151 $modified .= $curchar; 152 $i++; 153 154 # Get the character the backslash is escaping 155 $curchar = substr($pat, $i, 1); 156 $modified .= $curchar; 157 158 # If the character following that is a '{}', treat the 159 # entire amount as a single token 160 if ($i < $pat_len -1 && substr($pat, $i+1, 1) eq '{') { 161 my $j = index($pat, '}', $i+2); 162 if ($j < 0) { 163 last unless $in_brackets; 164 if ($result eq 'c') { 165 $skip++; 166 $reason = "Can't handle compilation errors with unmatched '{'"; 167 } 168 else { 169 print "not ok $test # Problem in $0; original = '$pat'; mod = '$modified'\n"; 170 next TEST; 171 } 172 } 173 $modified .= substr($pat, $i+1, $j - $i); 174 $i = $j; 175 } 176 elsif ($curchar eq 'x') { 177 178 # \x without brackets is supposed to be followed by 2 179 # hex digits. Take up to 2, and then add a blank 180 # after the last one. This avoids getting errors from 181 # (?[ ]) for run-ons, like \xabc 182 my $j = $i + 1; 183 for (; $j < $i + 3 && $j < $pat_len; $j++) { 184 my $curord = ord(substr($pat, $j, 1)); 185 if (!(($curord >= ord("A") && $curord <= ord("F")) 186 || ($curord >= ord("a") && $curord <= ord("f")) 187 || ($curord >= ord("0") && $curord <= ord("9")))) 188 { 189 $j++; 190 last; 191 } 192 } 193 $j--; 194 $modified .= substr($pat, $i + 1, $j - $i) . " "; 195 $i = $j; 196 } 197 elsif (ord($curchar) >= ord('0') 198 && (ord($curchar) <= ord('7'))) 199 { 200 # Similarly, octal constants have up to 3 digits. 201 my $j = $i + 1; 202 for (; $j < $i + 3 && $j < $pat_len; $j++) { 203 my $curord = ord(substr($pat, $j, 1)); 204 if (! ($curord >= ord("0") && $curord <= ord("7"))) { 205 $j++; 206 last; 207 } 208 } 209 $j--; 210 $modified .= substr($pat, $i + 1, $j - $i); 211 $i = $j; 212 } 213 214 next; 215 } # End of processing a backslash sequence 216 217 if (! $in_brackets # Skip (?{ }) 218 && $curchar eq '(' 219 && $i < $pat_len - 2 220 && substr($pat, $i+1, 1) eq '?' 221 && substr($pat, $i+2, 1) eq '{') 222 { 223 $skip++; 224 $reason = "Pattern contains '(?{'"; 225 last; 226 } 227 228 # Closing ']' 229 if ($curchar eq ']' && $in_brackets) { 230 $modified .= " ] ])"; 231 $in_brackets = 0; 232 next; 233 } 234 235 # A regular character. 236 if ($curchar ne '[') { 237 if (! $in_brackets) { 238 $modified .= $curchar; 239 } 240 else { 241 $modified .= " $curchar "; 242 } 243 next; 244 } 245 246 # Here is a '['; If not in a bracketed class, treat as the 247 # beginning of one. 248 if (! $in_brackets) { 249 $in_brackets = 1; 250 $modified .= "(?[ [ "; 251 252 # An immediately following ']' or '^]' is not the ending 253 # of the class, but is to be treated literally. 254 if ($i < $pat_len - 1 255 && substr($pat, $i+1, 1) eq ']') 256 { 257 $i ++; 258 $modified .= " ] "; 259 } 260 elsif ($i < $pat_len - 2 261 && substr($pat, $i+1, 1) eq '^' 262 && substr($pat, $i+2, 1) eq ']') 263 { 264 $i += 2; 265 $modified .= " ^ ] "; 266 } 267 next; 268 } 269 270 # Here is a plain '[' within [ ]. Could mean wants to 271 # match a '[', or it could be a posix class that has a 272 # corresponding ']'. Absorb either 273 274 $modified .= ' ['; 275 last if $i >= $pat_len - 1; 276 277 $i++; 278 $curchar = substr($pat, $i, 1); 279 if ($curchar =~ /[:=.]/) { 280 for (my $j = $i + 1; $j < $pat_len; $j++) { 281 next unless substr($pat, $j, 1) eq ']'; 282 last if $j - $i < 2; 283 if (substr($pat, $j - 1, 1) eq $curchar) { 284 # Here, is a posix class 285 $modified .= substr($pat, $i, $j - $i + 1) . " "; 286 $i = $j; 287 next CHAR; 288 } 289 } 290 } 291 292 # Here wasn't a posix class, just process normally 293 $modified .= " $curchar "; 294 } 295 296 if ($in_brackets && ! $skip) { 297 if ($result eq 'c') { 298 $skip++; 299 $reason = "Can't figure out where to put the (?[ and ]) since is a compilation error"; 300 } 301 else { 302 print "not ok $test # Problem in $0; original = '$pat'; mod = '$modified'\n"; 303 next TEST; 304 } 305 } 306 307 # Use our modified pattern instead of the original 308 $pat = $modified; 309 } 310 } 311 312 for my $study ('', 'study $subject', 'utf8::upgrade($subject)', 313 'utf8::upgrade($subject); study $subject') { 314 # Need to make a copy, else the utf8::upgrade of an already studied 315 # scalar confuses things. 316 my $subject = $subject; 317 my $c = $iters; 318 my ($code, $match, $got); 319 if ($repl eq 'pos') { 320 $code= <<EOFCODE; 321 $study; 322 pos(\$subject)=0; 323 \$match = ( \$subject =~ m${pat}g ); 324 \$got = pos(\$subject); 325EOFCODE 326 } 327 elsif ($qr_embed) { 328 $code= <<EOFCODE; 329 my \$RE = qr$pat; 330 $study; 331 \$match = (\$subject =~ /(?:)\$RE(?:)/) while \$c--; 332 \$got = "$repl"; 333EOFCODE 334 } 335 elsif ($qr_embed_thr) { 336 $code= <<EOFCODE; 337 # Can't run the match in a subthread, but can do this and 338 # clone the pattern the other way. 339 my \$RE = threads->new(sub {qr$pat})->join(); 340 $study; 341 \$match = (\$subject =~ /(?:)\$RE(?:)/) while \$c--; 342 \$got = "$repl"; 343EOFCODE 344 } 345 else { 346 $code= <<EOFCODE; 347 $study; 348 \$match = (\$subject =~ $OP$pat) while \$c--; 349 \$got = "$repl"; 350EOFCODE 351 } 352 $code = "no warnings 'experimental::regex_sets';$code" if $regex_sets; 353 #$code.=qq[\n\$expect="$expect";\n]; 354 #use Devel::Peek; 355 #die Dump($code) if $pat=~/\\h/ and $subject=~/\x{A0}/; 356 { 357 # Probably we should annotate specific tests with which warnings 358 # categories they're known to trigger, and hence should be 359 # disabled just for that test 360 no warnings qw(uninitialized regexp); 361 eval $code; 362 } 363 chomp( my $err = $@ ); 364 if ( $skip ) { 365 print "ok $test # skipped", length($reason) ? ". $reason" : '', "\n"; 366 next TEST; 367 } 368 elsif ($result eq 'c') { 369 if ($err !~ m!^\Q$expect!) { print "not ok $test$todo (compile) $input => '$err'\n"; next TEST } 370 last; # no need to study a syntax error 371 } 372 elsif ( $todo_qr ) { 373 print "not ok $test # TODO", length($reason) ? " - $reason" : '', "\n"; 374 next TEST; 375 } 376 elsif ($@) { 377 print "not ok $test$todo $input => error '$err'\n", _comment("$code\n$@\n"); next TEST; 378 } 379 elsif ($result =~ /^n/) { 380 if ($match) { print "not ok $test$todo ($study) $input => false positive\n"; next TEST } 381 } 382 else { 383 if (!$match || $got ne $expect) { 384 eval { require Data::Dumper }; 385 no warnings "utf8"; # But handle should be utf8 386 if ($@ || !defined &DynaLoader::boot_DynaLoader) { 387 # Data::Dumper will load on miniperl, but fail when used in 388 # anger as it tries to load B. I'd prefer to keep the 389 # regular calls below outside of an eval so that real 390 # (unknown) failures get spotted, not ignored. 391 print "not ok $test$todo ($study) $input => '$got', match=$match\n", _comment("$code\n"); 392 } 393 else { # better diagnostics 394 my $s = Data::Dumper->new([$subject],['subject'])->Useqq(1)->Dump; 395 my $g = Data::Dumper->new([$got],['got'])->Useqq(1)->Dump; 396 print "not ok $test$todo ($study) $input => '$got', match=$match\n", _comment("$s\n$g\n$code\n"); 397 } 398 next TEST; 399 } 400 } 401 } 402 print "ok $test$todo\n"; 403} 404 4051; 406