1#!./perl 2# 3# This is a home for regular expression tests that don't fit into 4# the format supported by re/regexp.t. If you want to add a test 5# that does fit that format, add it to re/re_tests, not here. 6 7use strict; 8use warnings; 9use 5.010; 10use Config; 11 12sub run_tests; 13 14$| = 1; 15 16 17BEGIN { 18 chdir 't' if -d 't'; 19 @INC = ('../lib','.'); 20 require './test.pl'; 21 skip_all_if_miniperl("miniperl can't load Tie::Hash::NamedCapture, need for %+ and %-"); 22} 23 24 25plan tests => 2532; # Update this when adding/deleting tests. 26 27run_tests() unless caller; 28 29# 30# Tests start here. 31# 32sub run_tests { 33 34 like("A \x{263a} B z C", qr/A . B (??{ "z" }) C/, 35 "Match UTF-8 char in presence of (??{ }); Bug 20000731.001"); 36 37 { 38 no warnings 'uninitialized'; 39 ok(undef =~ /^([^\/]*)(.*)$/, "Used to cause a SEGV; Bug 20001021.005"); 40 } 41 42 { 43 my $message = 'bug id 20001008.001'; 44 45 my @x = ("stra\337e 138", "stra\337e 138"); 46 for (@x) { 47 ok(s/(\d+)\s*([\w\-]+)/$1 . uc $2/e, $message); 48 ok(my ($latin) = /^(.+)(?:\s+\d)/, $message); 49 is($latin, "stra\337e", $message); 50 ok($latin =~ s/stra\337e/straße/, $message); 51 # 52 # Previous code follows, but outcommented - there were no tests. 53 # 54 # $latin =~ s/stra\337e/straße/; # \303\237 after the 2nd a 55 # use utf8; # needed for the raw UTF-8 56 # $latin =~ s!(s)tr(?:aß|s+e)!$1tr.!; # \303\237 after the a 57 } 58 } 59 60 { 61 # Fist half of the bug. 62 my $message = 'HEBREW ACCENT QADMA matched by .*; Bug 20001028.003'; 63 my $X = chr (1448); 64 ok(my ($Y) = $X =~ /(.*)/, $message); 65 is($Y, v1448, $message); 66 is(length $Y, 1, $message); 67 68 # Second half of the bug. 69 $message = 'HEBREW ACCENT QADMA in replacement; Bug 20001028.003'; 70 $X = ''; 71 $X =~ s/^/chr(1488)/e; 72 is(length $X, 1, $message); 73 is(ord $X, 1488, $message); 74 } 75 76 { 77 my $message = 'Repeated s///; Bug 20001108.001'; 78 my $X = "Szab\x{f3},Bal\x{e1}zs"; 79 my $Y = $X; 80 $Y =~ s/(B)/$1/ for 0 .. 3; 81 is($Y, $X, $message); 82 is($X, "Szab\x{f3},Bal\x{e1}zs", $message); 83 } 84 85 { 86 my $message = 's/// on UTF-8 string; Bug 20000517.001'; 87 my $x = "\x{100}A"; 88 $x =~ s/A/B/; 89 is($x, "\x{100}B", $message); 90 is(length $x, 2, $message); 91 } 92 93 { 94 my $message = '\C and É; Bug 20001230.002'; 95 ok("École" =~ /^\C\C(.)/ && $1 eq 'c', $message); 96 like("École", qr/^\C\C(c)/, $message); 97 } 98 99 { 100 # The original bug report had 'no utf8' here but that was irrelevant. 101 102 my $message = "Don't dump core; Bug 20010306.008"; 103 my $a = "a\x{1234}"; 104 like($a, qr/\w/, $message); # used to core dump. 105 } 106 107 { 108 my $message = '/g in scalar context; Bug 20010410.006'; 109 for my $rx ('/(.*?)\{(.*?)\}/csg', 110 '/(.*?)\{(.*?)\}/cg', 111 '/(.*?)\{(.*?)\}/sg', 112 '/(.*?)\{(.*?)\}/g', 113 '/(.+?)\{(.+?)\}/csg',) { 114 my $i = 0; 115 my $input = "a{b}c{d}"; 116 eval <<" --"; 117 while (eval \$input =~ $rx) { 118 \$i ++; 119 } 120 -- 121 is($i, 2, $message); 122 } 123 } 124 125 { 126 # Amazingly vertical tabulator is the same in ASCII and EBCDIC. 127 for ("\n", "\t", "\014", "\r") { 128 unlike($_, qr/[[:print:]]/, sprintf "\\%03o not in [[:print:]]; Bug 20010619.003", ord $_); 129 } 130 for (" ") { 131 like($_, qr/[[:print:]]/, "'$_' in [[:print:]]; Bug 20010619.003"); 132 } 133 } 134 135 { 136 # [ID 20010814.004] pos() doesn't work when using =~m// in list context 137 138 $_ = "ababacadaea"; 139 my $a = join ":", /b./gc; 140 my $b = join ":", /a./gc; 141 my $c = pos; 142 is("$a $b $c", 'ba:ba ad:ae 10', "pos() works with () = m//; Bug 20010814.004"); 143 } 144 145 { 146 # [ID 20010407.006] matching utf8 return values from 147 # functions does not work 148 149 my $message = 'UTF-8 return values from functions; Bug 20010407.006'; 150 package ID_20010407_006; 151 sub x {"a\x{1234}"} 152 my $x = x; 153 my $y; 154 ::ok($x =~ /(..)/, $message); 155 $y = $1; 156 ::ok(length ($y) == 2 && $y eq $x, $message); 157 ::ok(x =~ /(..)/, $message); 158 $y = $1; 159 ::ok(length ($y) == 2 && $y eq $x, $message); 160 } 161 162 { 163 # High bit bug -- japhy 164 my $x = "ab\200d"; 165 ok $x =~ /.*?\200/, "High bit fine"; 166 } 167 168 { 169 my $message = 'UTF-8 hash keys and /$/'; 170 # http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters 171 # /2002-01/msg01327.html 172 173 my $u = "a\x{100}"; 174 my $v = substr ($u, 0, 1); 175 my $w = substr ($u, 1, 1); 176 my %u = ($u => $u, $v => $v, $w => $w); 177 for (keys %u) { 178 my $m1 = /^\w*$/ ? 1 : 0; 179 my $m2 = $u {$_} =~ /^\w*$/ ? 1 : 0; 180 is($m1, $m2, $message); 181 } 182 } 183 184 { 185 my $message = "s///eg [change 13f46d054db22cf4]; Bug 20020124.005"; 186 187 for my $char ("a", "\x{df}", "\x{100}") { 188 my $x = "$char b $char"; 189 $x =~ s{($char)}{ 190 "c" =~ /c/; 191 "x"; 192 }ge; 193 is(substr ($x, 0, 1), substr ($x, -1, 1), $message); 194 } 195 } 196 197 { 198 my $message = "Correct pmop flags checked when empty pattern; Bug 20020412.005"; 199 200 # Requires reuse of last successful pattern. 201 my $num = 123; 202 $num =~ /\d/; 203 for (0 .. 1) { 204 my $match = m?? + 0; 205 ok($match != $_, $message) 206 or diag(sprintf "'match one' %s on %s iteration" => 207 $match ? 'succeeded' : 'failed', 208 $_ ? 'second' : 'first'); 209 } 210 $num =~ /(\d)/; 211 my $result = join "" => $num =~ //g; 212 is($result, $num, $message); 213 } 214 215 { 216 my $message = 'UTF-8 regex matches above 32k; Bug 20020630.002'; 217 for (['byte', "\x{ff}"], ['utf8', "\x{1ff}"]) { 218 my ($type, $char) = @$_; 219 for my $len (32000, 32768, 33000) { 220 my $s = $char . "f" x $len; 221 my $r = $s =~ /$char([f]*)/gc; 222 ok($r, $message) or diag("<$type x $len>"); 223 ok(!$r || pos ($s) == $len + 1, $message) 224 or diag("<$type x $len>; pos = @{[pos $s]}"); 225 } 226 } 227 } 228 229 { 230 my $s = "\x{100}" x 5; 231 my $ok = $s =~ /(\x{100}{4})/; 232 my ($ord, $len) = (ord $1, length $1); 233 ok $ok && $ord == 0x100 && $len == 4, "No panic: end_shift [change 0e933229fa758625]"; 234 } 235 236 { 237 our $a = "x\x{100}"; 238 chop $a; # Leaves the UTF-8 flag 239 $a .= "y"; # 1 byte before 'y'. 240 241 like($a, qr/^\C/, 'match one \C on 1-byte UTF-8; Bug 15763'); 242 like($a, qr/^\C{1}/, 'match \C{1}; Bug 15763'); 243 244 like($a, qr/^\Cy/, 'match \Cy; Bug 15763'); 245 like($a, qr/^\C{1}y/, 'match \C{1}y; Bug 15763'); 246 247 unlike($a, qr/^\C\Cy/, q {don't match two \Cy; Bug 15763}); 248 unlike($a, qr/^\C{2}y/, q {don't match \C{2}y; Bug 15763}); 249 250 $a = "\x{100}y"; # 2 bytes before "y" 251 252 like($a, qr/^\C/, 'match one \C on 2-byte UTF-8; Bug 15763'); 253 like($a, qr/^\C{1}/, 'match \C{1}; Bug 15763'); 254 like($a, qr/^\C\C/, 'match two \C; Bug 15763'); 255 like($a, qr/^\C{2}/, 'match \C{2}; Bug 15763'); 256 257 like($a, qr/^\C\C\C/, 'match three \C on 2-byte UTF-8 and a byte; Bug 15763'); 258 like($a, qr/^\C{3}/, 'match \C{3}; Bug 15763'); 259 260 like($a, qr/^\C\Cy/, 'match two \C; Bug 15763'); 261 like($a, qr/^\C{2}y/, 'match \C{2}; Bug 15763'); 262 263 unlike($a, qr/^\C\C\Cy/, q {don't match three \Cy; Bug 15763}); 264 unlike($a, qr/^\C{2}\Cy/, q {don't match \C{2}\Cy; Bug 15763}); 265 unlike($a, qr/^\C{3}y/, q {don't match \C{3}y; Bug 15763}); 266 267 $a = "\x{1000}y"; # 3 bytes before "y" 268 269 like($a, qr/^\C/, 'match one \C on three-byte UTF-8; Bug 15763'); 270 like($a, qr/^\C{1}/, 'match \C{1}; Bug 15763'); 271 like($a, qr/^\C\C/, 'match two \C; Bug 15763'); 272 like($a, qr/^\C{2}/, 'match \C{2}; Bug 15763'); 273 like($a, qr/^\C\C\C/, 'match three \C; Bug 15763'); 274 like($a, qr/^\C{3}/, 'match \C{3}; Bug 15763'); 275 276 like($a, qr/^\C\C\C\C/, 'match four \C on three-byte UTF-8 and a byte; Bug 15763'); 277 like($a, qr/^\C{4}/, 'match \C{4}; Bug 15763'); 278 279 like($a, qr/^\C\C\Cy/, 'match three \Cy; Bug 15763'); 280 like($a, qr/^\C{3}y/, 'match \C{3}y; Bug 15763'); 281 282 unlike($a, qr/^\C\C\C\Cy/, q {don't match four \Cy; Bug 15763}); 283 unlike($a, qr/^\C{4}y/, q {don't match \C{4}y; Bug 15763}); 284 } 285 286 287 { 288 my $message = 'UTF-8 matching; Bug 15397'; 289 like("\x{100}", qr/\x{100}/, $message); 290 like("\x{100}", qr/(\x{100})/, $message); 291 like("\x{100}", qr/(\x{100}){1}/, $message); 292 like("\x{100}\x{100}", qr/(\x{100}){2}/, $message); 293 like("\x{100}\x{100}", qr/(\x{100})(\x{100})/, $message); 294 } 295 296 { 297 my $message = 'Neither ()* nor ()*? sets $1 when matched 0 times; Bug 7471'; 298 local $_ = 'CD'; 299 ok(/(AB)*?CD/ && !defined $1, $message); 300 ok(/(AB)*CD/ && !defined $1, $message); 301 } 302 303 { 304 my $message = "Caching shouldn't prevent match; Bug 3547"; 305 my $pattern = "^(b+?|a){1,2}c"; 306 ok("bac" =~ /$pattern/ && $1 eq 'a', $message); 307 ok("bbac" =~ /$pattern/ && $1 eq 'a', $message); 308 ok("bbbac" =~ /$pattern/ && $1 eq 'a', $message); 309 ok("bbbbac" =~ /$pattern/ && $1 eq 'a', $message); 310 } 311 312 { 313 ok("\x{100}" =~ /(.)/, '$1 should keep UTF-8 ness; Bug 18232'); 314 is($1, "\x{100}", '$1 is UTF-8; Bug 18232'); 315 { 'a' =~ /./; } 316 is($1, "\x{100}", '$1 is still UTF-8; Bug 18232'); 317 isnt($1, "\xC4\x80", '$1 is not non-UTF-8; Bug 18232'); 318 } 319 320 { 321 my $message = "Optimizer doesn't prematurely reject match; Bug 19767"; 322 use utf8; 323 324 my $attr = 'Name-1'; 325 my $NormalChar = qr /[\p{IsDigit}\p{IsLower}\p{IsUpper}]/; 326 my $NormalWord = qr /${NormalChar}+?/; 327 my $PredNameHyphen = qr /^${NormalWord}(\-${NormalWord})*?$/; 328 329 $attr =~ /^$/; 330 like($attr, $PredNameHyphen, $message); # Original test. 331 332 "a" =~ m/[b]/; 333 like("0", qr/\p{N}+\z/, $message); # Variant. 334 } 335 336 { 337 my $message = "(??{ }) doesn't return stale values; Bug 20683"; 338 our $p = 1; 339 foreach (1, 2, 3, 4) { 340 $p ++ if /(??{ $p })/ 341 } 342 is($p, 5, $message); 343 344 { 345 package P; 346 $a = 1; 347 sub TIESCALAR {bless []} 348 sub FETCH {$a ++} 349 } 350 tie $p, "P"; 351 foreach (1, 2, 3, 4) { 352 /(??{ $p })/ 353 } 354 is($p, 5, $message); 355 } 356 357 { 358 # Subject: Odd regexp behavior 359 # From: Markus Kuhn <Markus.Kuhn@cl.cam.ac.uk> 360 # Date: Wed, 26 Feb 2003 16:53:12 +0000 361 # Message-Id: <E18o4nw-0008Ly-00@wisbech.cl.cam.ac.uk> 362 # To: perl-unicode@perl.org 363 364 my $message = 'Markus Kuhn 2003-02-26'; 365 366 my $x = "\x{2019}\nk"; 367 ok($x =~ s/(\S)\n(\S)/$1 $2/sg, $message); 368 is($x, "\x{2019} k", $message); 369 370 $x = "b\nk"; 371 ok($x =~ s/(\S)\n(\S)/$1 $2/sg, $message); 372 is($x, "b k", $message); 373 374 like("\x{2019}", qr/\S/, $message); 375 } 376 377 { 378 my $message = "(??{ .. }) in split doesn't corrupt its stack; Bug 21411"; 379 our $i; 380 is('-1-3-5-', join('', split /((??{$i++}))/, '-1-3-5-'), $message); 381 no warnings 'syntax'; 382 @_ = split /(?{'WOW'})/, 'abc'; 383 local $" = "|"; 384 is("@_", "a|b|c", $message); 385 } 386 387 is(join('-', split /(?{ split "" })/, "abc"), 'a-b-c', 'nested split'); 388 389 { 390 $_ = "code: 'x' { '...' }\n"; study; 391 my @x; push @x, $& while m/'[^\']*'/gx; 392 local $" = ":"; 393 is("@x", "'x':'...'", "Parse::RecDescent triggered infinite loop; Bug 17757"); 394 } 395 396 { 397 sub func ($) { 398 ok("a\nb" !~ /^b/, "Propagated modifier; $_[0]; Bug 22354"); 399 ok("a\nb" =~ /^b/m, "Propagated modifier; $_[0] - with /m; Bug 22354"); 400 } 401 func "standalone"; 402 $_ = "x"; s/x/func "in subst"/e; 403 $_ = "x"; s/x/func "in multiline subst"/em; 404 $_ = "x"; /x(?{func "in regexp"})/; 405 $_ = "x"; /x(?{func "in multiline regexp"})/m; 406 } 407 408 { 409 $_ = "abcdef\n"; 410 my @x = m/./g; 411 is("abcde", $`, 'Global match sets $`; Bug 19049'); 412 } 413 414 { 415 # [perl #23769] Unicode regex broken on simple example 416 # regrepeat() didn't handle UTF-8 EXACT case right. 417 418 my $Mess = 'regrepeat() handles UTF-8 EXACT case right'; 419 my $message = "$Mess; Bug 23769"; 420 421 my $s = "\x{a0}\x{a0}\x{a0}\x{100}"; chop $s; 422 423 like($s, qr/\x{a0}/, $message); 424 like($s, qr/\x{a0}+/, $message); 425 like($s, qr/\x{a0}\x{a0}/, $message); 426 427 $message = "$Mess (easy variant); Bug 23769"; 428 ok("aaa\x{100}" =~ /(a+)/, $message); 429 is($1, "aaa", $message); 430 431 $message = "$Mess (easy invariant); Bug 23769"; 432 ok("aaa\x{100} " =~ /(a+?)/, $message); 433 is($1, "a", $message); 434 435 $message = "$Mess (regrepeat variant); Bug 23769"; 436 ok("\xa0\xa0\xa0\x{100} " =~ /(\xa0+?)/, $message); 437 is($1, "\xa0", $message); 438 439 $message = "$Mess (regrepeat invariant); Bug 23769"; 440 ok("\xa0\xa0\xa0\x{100}" =~ /(\xa0+)/, $message); 441 is($1, "\xa0\xa0\xa0", $message); 442 443 $message = "$Mess (hard variant); Bug 23769"; 444 ok("\xa0\xa1\xa0\xa1\xa0\xa1\x{100}" =~ /((?:\xa0\xa1)+?)/, $message); 445 is($1, "\xa0\xa1", $message); 446 447 $message = "$Mess (hard invariant); Bug 23769"; 448 ok("ababab\x{100} " =~ /((?:ab)+)/, $message); 449 is($1, 'ababab', $message); 450 451 ok("\xa0\xa1\xa0\xa1\xa0\xa1\x{100}" =~ /((?:\xa0\xa1)+)/, $message); 452 is($1, "\xa0\xa1\xa0\xa1\xa0\xa1", $message); 453 454 ok("ababab\x{100} " =~ /((?:ab)+?)/, $message); 455 is($1, "ab", $message); 456 457 $message = "Don't match first byte of UTF-8 representation; Bug 23769"; 458 unlike("\xc4\xc4\xc4", qr/(\x{100}+)/, $message); 459 unlike("\xc4\xc4\xc4", qr/(\x{100}+?)/, $message); 460 unlike("\xc4\xc4\xc4", qr/(\x{100}++)/, $message); 461 } 462 463 { 464 # perl panic: pp_match start/end pointers 465 466 is(eval {my ($x, $y) = "bca" =~ /^(?=.*(a)).*(bc)/; "$x-$y"}, "a-bc", 467 'Captures can move backwards in string; Bug 25269'); 468 } 469 470 { 471 # \cA not recognized in character classes 472 like("a\cAb", qr/\cA/, '\cA in pattern; Bug 27940'); 473 like("a\cAb", qr/[\cA]/, '\cA in character class; Bug 27940'); 474 like("a\cAb", qr/[\cA-\cB]/, '\cA in character class range; Bug 27940'); 475 like("abc", qr/[^\cA-\cB]/, '\cA in negated character class range; Bug 27940'); 476 like("a\cBb", qr/[\cA-\cC]/, '\cB in character class range; Bug 27940'); 477 like("a\cCbc", qr/[^\cA-\cB]/, '\cC in negated character class range; Bug 27940'); 478 like("a\cAb", qr/(??{"\cA"})/, '\cA in ??{} pattern; Bug 27940'); 479 unlike("ab", qr/a\cIb/x, '\cI in pattern; Bug 27940'); 480 } 481 482 { 483 # perl #28532: optional zero-width match at end of string is ignored 484 485 ok("abc" =~ /^abc(\z)?/ && defined($1), 486 'Optional zero-width match at end of string; Bug 28532'); 487 ok("abc" =~ /^abc(\z)??/ && !defined($1), 488 'Optional zero-width match at end of string; Bug 28532'); 489 } 490 491 { 492 my $utf8 = "\xe9\x{100}"; chop $utf8; 493 my $latin1 = "\xe9"; 494 495 like($utf8, qr/\xe9/i, "utf8/latin; Bug 36207"); 496 like($utf8, qr/$latin1/i, "utf8/latin runtime; Bug 36207"); 497 like($utf8, qr/(abc|\xe9)/i, "utf8/latin trie; Bug 36207"); 498 like($utf8, qr/(abc|$latin1)/i, "utf8/latin trie runtime; Bug 36207"); 499 500 like("\xe9", qr/$utf8/i, "latin/utf8; Bug 36207"); 501 like("\xe9", qr/(abc|$utf8)/i, "latin/utf8 trie; Bug 36207"); 502 like($latin1, qr/$utf8/i, "latin/utf8 runtime; Bug 36207"); 503 like($latin1, qr/(abc|$utf8)/i, "latin/utf8 trie runtime; Bug 36207"); 504 } 505 506 { 507 my $s = "abcd"; 508 $s =~ /(..)(..)/g; 509 $s = $1; 510 $s = $2; 511 is($2, 'cd', 512 "Assigning to original string does not corrupt match vars; Bug 37038"); 513 } 514 515 { 516 { 517 package wooosh; 518 sub gloople {"!"} 519 } 520 my $aeek = bless {} => 'wooosh'; 521 is(do {$aeek -> gloople () =~ /(.)/g}, 1, 522 "//g match against return value of sub [change e26a497577f3ce7b]"); 523 524 sub gloople {"!"} 525 is(do{gloople () =~ /(.)/g}, 1, 526 "change e26a497577f3ce7b didn't affect sub calls for some reason"); 527 } 528 529 { 530 # [perl #78680] 531 # See changes 26925-26928, which reverted change 26410 532 { 533 package lv; 534 our $var = "abc"; 535 sub variable : lvalue {$var} 536 } 537 my $o = bless [] => 'lv'; 538 my $f = ""; 539 my $r = eval { 540 for (1 .. 2) { 541 $f .= $1 if $o -> variable =~ /(.)/g; 542 } 543 1; 544 }; 545 if ($r) { 546 is($f, "ab", "pos() retained between calls"); 547 } 548 else { 549 ok 0, "Code failed: $@"; 550 } 551 552 our $var = "abc"; 553 sub variable : lvalue {$var} 554 my $g = ""; 555 my $s = eval { 556 for (1 .. 2) { 557 $g .= $1 if variable =~ /(.)/g; 558 } 559 1; 560 }; 561 if ($s) { 562 is($g, "ab", "pos() retained between calls"); 563 } 564 else { 565 ok 0, "Code failed: $@"; 566 } 567 } 568 569 SKIP: 570 { 571 skip "In EBCDIC" if $::IS_EBCDIC; 572 no warnings 'utf8'; 573 $_ = pack 'U0C2', 0xa2, 0xf8; # Ill-formed UTF-8 574 my $ret = 0; 575 is(do {!($ret = s/[\0]+//g)}, 1, 576 "Ill-formed UTF-8 doesn't match NUL in class; Bug 37836"); 577 } 578 579 { 580 # chr(65535) should be allowed in regexes 581 582 no warnings 'utf8'; # To allow non-characters 583 my ($c, $r, $s); 584 585 $c = chr 0xffff; 586 $c =~ s/$c//g; 587 is($c, "", "U+FFFF, parsed as atom; Bug 38293"); 588 589 $c = chr 0xffff; 590 $r = "\\$c"; 591 $c =~ s/$r//g; 592 is($c, "", "U+FFFF backslashed, parsed as atom; Bug 38293"); 593 594 $c = chr 0xffff; 595 $c =~ s/[$c]//g; 596 is($c, "", "U+FFFF, parsed in class; Bug 38293"); 597 598 $c = chr 0xffff; 599 $r = "[\\$c]"; 600 $c =~ s/$r//g; 601 is($c, "", "U+FFFF backslashed, parsed in class; Bug 38293"); 602 603 $s = "A\x{ffff}B"; 604 $s =~ s/\x{ffff}//i; 605 is($s, "AB", "U+FFFF, EXACTF; Bug 38293"); 606 607 $s = "\x{ffff}A"; 608 $s =~ s/\bA//; 609 is($s, "\x{ffff}", "U+FFFF, BOUND; Bug 38293"); 610 611 $s = "\x{ffff}!"; 612 $s =~ s/\B!//; 613 is($s, "\x{ffff}", "U+FFFF, NBOUND; Bug 38293"); 614 } 615 616 { 617 618 # The printing characters 619 my @chars = ("A" .. "Z"); 620 my $delim = ","; 621 my $size = 32771 - 4; 622 my $str = ''; 623 624 # Create some random junk. Inefficient, but it works. 625 for (my $i = 0; $i < $size; $ i++) { 626 $str .= $chars [rand @chars]; 627 } 628 629 $str .= ($delim x 4); 630 my $res; 631 my $matched; 632 ok($str =~ s/^(.*?)${delim}{4}//s, "Pattern matches; Bug 39583"); 633 is($str, "", "Empty string; Bug 39583"); 634 ok(defined $1 && length ($1) == $size, '$1 is correct size; Bug 39583'); 635 } 636 637 { 638 like("\0-A", qr/\c@-A/, '@- should not be interpolated in a pattern; Bug 27940'); 639 like("\0\0A", qr/\c@+A/, '@+ should not be interpolated in a pattern; Bug 27940'); 640 like("X\@-A", qr/X@-A/, '@- should not be interpolated in a pattern; Bug 27940'); 641 like("X\@\@A", qr/X@+A/, '@+ should not be interpolated in a pattern; Bug 27940'); 642 643 like("X\0A", qr/X\c@?A/, '\c@?; Bug 27940'); 644 like("X\0A", qr/X\c@*A/, '\c@*; Bug 27940'); 645 like("X\0A", qr/X\c@(A)/, '\c@(; Bug 27940'); 646 like("X\0A", qr/X(\c@)A/, '\c@); Bug 27940'); 647 like("X\0A", qr/X\c@|ZA/, '\c@|; Bug 27940'); 648 649 like("X\@A", qr/X@?A/, '@?; Bug 27940'); 650 like("X\@A", qr/X@*A/, '@*; Bug 27940'); 651 like("X\@A", qr/X@(A)/, '@(; Bug 27940'); 652 like("X\@A", qr/X(@)A/, '@); Bug 27940'); 653 like("X\@A", qr/X@|ZA/, '@|; Bug 27940'); 654 655 local $" = ','; # non-whitespace and non-RE-specific 656 like('abc', qr/(.)(.)(.)/, 'The last successful match is bogus; Bug 27940'); 657 like("A@+B", qr/A@{+}B/, 'Interpolation of @+ in /@{+}/; Bug 27940'); 658 like("A@-B", qr/A@{-}B/, 'Interpolation of @- in /@{-}/; Bug 27940'); 659 like("A@+B", qr/A@{+}B/x, 'Interpolation of @+ in /@{+}/x; Bug 27940'); 660 like("A@-B", qr/A@{-}B/x, 'Interpolation of @- in /@{-}/x; Bug 27940'); 661 } 662 663 { 664 my $s = 'foo bar baz'; 665 my (@k, @v, @fetch, $res); 666 my $count = 0; 667 my @names = qw ($+{A} $+{B} $+{C}); 668 if ($s =~ /(?<A>foo)\s+(?<B>bar)?\s+(?<C>baz)/) { 669 while (my ($k, $v) = each (%+)) { 670 $count++; 671 } 672 @k = sort keys (%+); 673 @v = sort values (%+); 674 $res = 1; 675 push @fetch, 676 ["$+{A}", "$1"], 677 ["$+{B}", "$2"], 678 ["$+{C}", "$3"], 679 ; 680 } 681 foreach (0 .. 2) { 682 if ($fetch [$_]) { 683 is($fetch[$_][0], $fetch[$_][1], "$names[$_]; Bug 50496"); 684 } else { 685 ok 0, $names[$_]; 686 } 687 } 688 is($res, 1, "'$s' =~ /(?<A>foo)\\s+(?<B>bar)?\\s+(?<C>baz)/; Bug 50496"); 689 is($count, 3, "Got 3 keys in %+ via each; Bug 50496"); 690 is(0 + @k, 3, "Got 3 keys in %+ via keys; Bug 50496"); 691 is("@k", "A B C", "Got expected keys; Bug 50496"); 692 is("@v", "bar baz foo", "Got expected values; Bug 50496"); 693 eval ' 694 no warnings "uninitialized"; 695 print for $+ {this_key_doesnt_exist}; 696 '; 697 is($@, '', 'lvalue $+ {...} should not throw an exception; Bug 50496'); 698 } 699 700 { 701 # 702 # Almost the same as the block above, except that the capture is nested. 703 # 704 705 my $s = 'foo bar baz'; 706 my (@k, @v, @fetch, $res); 707 my $count = 0; 708 my @names = qw ($+{A} $+{B} $+{C} $+{D}); 709 if ($s =~ /(?<D>(?<A>foo)\s+(?<B>bar)?\s+(?<C>baz))/) { 710 while (my ($k,$v) = each(%+)) { 711 $count++; 712 } 713 @k = sort keys (%+); 714 @v = sort values (%+); 715 $res = 1; 716 push @fetch, 717 ["$+{A}", "$2"], 718 ["$+{B}", "$3"], 719 ["$+{C}", "$4"], 720 ["$+{D}", "$1"], 721 ; 722 } 723 foreach (0 .. 3) { 724 if ($fetch [$_]) { 725 is($fetch[$_][0], $fetch[$_][1], "$names[$_]; Bug 50496"); 726 } else { 727 ok 0, $names [$_]; 728 } 729 } 730 is($res, 1, "'$s' =~ /(?<D>(?<A>foo)\\s+(?<B>bar)?\\s+(?<C>baz))/; Bug 50496"); 731 is($count, 4, "Got 4 keys in %+ via each; Bug 50496"); 732 is(@k, 4, "Got 4 keys in %+ via keys; Bug 50496"); 733 is("@k", "A B C D", "Got expected keys; Bug 50496"); 734 is("@v", "bar baz foo foo bar baz", "Got expected values; Bug 50496"); 735 eval ' 736 no warnings "uninitialized"; 737 print for $+ {this_key_doesnt_exist}; 738 '; 739 is($@, '', 'lvalue $+ {...} should not throw an exception; Bug 50496'); 740 } 741 742 { 743 my $str = 'abc'; 744 my $count = 0; 745 my $mval = 0; 746 my $pval = 0; 747 while ($str =~ /b/g) {$mval = $#-; $pval = $#+; $count ++} 748 is($mval, 0, '@- should be empty; Bug 36046'); 749 is($pval, 0, '@+ should be empty; Bug 36046'); 750 is($count, 1, 'Should have matched once only; Bug 36046'); 751 } 752 753 { 754 my $message = '/m in precompiled regexp; Bug 40684'; 755 my $s = "abc\ndef"; 756 my $rex = qr'^abc$'m; 757 ok($s =~ m/$rex/, $message); 758 ok($s =~ m/^abc$/m, $message); 759 } 760 761 { 762 my $message = '(?: ... )? should not lose $^R; Bug 36909'; 763 $^R = 'Nothing'; 764 { 765 local $^R = "Bad"; 766 ok('x foofoo y' =~ m { 767 (foo) # $^R correctly set 768 (?{ "last regexp code result" }) 769 }x, $message); 770 is($^R, 'last regexp code result', $message); 771 } 772 is($^R, 'Nothing', $message); 773 774 { 775 local $^R = "Bad"; 776 777 ok('x foofoo y' =~ m { 778 (?:foo|bar)+ # $^R correctly set 779 (?{ "last regexp code result" }) 780 }x, $message); 781 is($^R, 'last regexp code result', $message); 782 } 783 is($^R, 'Nothing', $message); 784 785 { 786 local $^R = "Bad"; 787 ok('x foofoo y' =~ m { 788 (foo|bar)\1+ # $^R undefined 789 (?{ "last regexp code result" }) 790 }x, $message); 791 is($^R, 'last regexp code result', $message); 792 } 793 is($^R, 'Nothing', $message); 794 795 { 796 local $^R = "Bad"; 797 ok('x foofoo y' =~ m { 798 (foo|bar)\1 # This time without the + 799 (?{"last regexp code result"}) 800 }x, $message); 801 is($^R, 'last regexp code result', $message); 802 } 803 is($^R, 'Nothing', $message); 804 } 805 806 { 807 my $message = 'Match is linear, not quadratic; Bug 22395'; 808 our $count; 809 for my $l (10, 100, 1000) { 810 $count = 0; 811 ('a' x $l) =~ /(.*)(?{$count++})[bc]/; 812 local $::TODO = "Should be L+1 not L*(L+3)/2 (L=$l)"; 813 is($count, $l + 1, $message); 814 } 815 } 816 817 { 818 my $message = '@-/@+ should not have undefined values; Bug 22614'; 819 local $_ = 'ab'; 820 our @len = (); 821 /(.){1,}(?{push @len,0+@-})(.){1,}(?{})^/; 822 is("@len", "2 2 2", $message); 823 } 824 825 { 826 my $message = '$& set on s///; Bug 18209'; 827 my $text = ' word1 word2 word3 word4 word5 word6 '; 828 829 my @words = ('word1', 'word3', 'word5'); 830 my $count; 831 foreach my $word (@words) { 832 $text =~ s/$word\s//gi; # Leave a space to separate words 833 # in the resultant str. 834 # The following block is not working. 835 if ($&) { 836 $count ++; 837 } 838 # End bad block 839 } 840 is($count, 3, $message); 841 is($text, ' word2 word4 word6 ', $message); 842 } 843 844 { 845 # RT#6893 846 847 local $_ = qq (A\nB\nC\n); 848 my @res; 849 while (m#(\G|\n)([^\n]*)\n#gsx) { 850 push @res, "$2"; 851 last if @res > 3; 852 } 853 is("@res", "A B C", "/g pattern shouldn't infinite loop; Bug 6893"); 854 } 855 856 { 857 # No optimizer bug 858 my @tails = ('', '(?(1))', '(|)', '()?'); 859 my @quants = ('*','+'); 860 my $doit = sub { 861 my $pats = shift; 862 for (@_) { 863 for my $pat (@$pats) { 864 for my $quant (@quants) { 865 for my $tail (@tails) { 866 my $re = "($pat$quant\$)$tail"; 867 ok(/$re/ && $1 eq $_, "'$_' =~ /$re/; Bug 41010"); 868 ok(/$re/m && $1 eq $_, "'$_' =~ /$re/m; Bug 41010"); 869 } 870 } 871 } 872 } 873 }; 874 875 my @dpats = ('\d', 876 '[1234567890]', 877 '(1|[23]|4|[56]|[78]|[90])', 878 '(?:1|[23]|4|[56]|[78]|[90])', 879 '(1|2|3|4|5|6|7|8|9|0)', 880 '(?:1|2|3|4|5|6|7|8|9|0)'); 881 my @spats = ('[ ]', ' ', '( |\t)', '(?: |\t)', '[ \t]', '\s'); 882 my @sstrs = (' '); 883 my @dstrs = ('12345'); 884 $doit -> (\@spats, @sstrs); 885 $doit -> (\@dpats, @dstrs); 886 } 887 888 { 889 # [perl #45605] Regexp failure with utf8-flagged and byte-flagged string 890 891 my $utf_8 = "\xd6schel"; 892 utf8::upgrade ($utf_8); 893 $utf_8 =~ m {(\xd6|Ö)schel}; 894 is($1, "\xd6", "Upgrade error; Bug 45605"); 895 } 896 897 { 898 # Regardless of utf8ness any character matches itself when 899 # doing a case insensitive match. See also [perl #36207] 900 901 for my $o (0 .. 255) { 902 my @ch = (chr ($o), chr ($o)); 903 utf8::upgrade ($ch [1]); 904 for my $u_str (0, 1) { 905 for my $u_pat (0, 1) { 906 like($ch[$u_str], qr/\Q$ch[$u_pat]\E/i, 907 "\$c =~ /\$c/i : chr ($o) : u_str = $u_str u_pat = $u_pat; Bug 36207"); 908 like($ch[$u_str], qr/\Q$ch[$u_pat]\E|xyz/i, 909 "\$c=~/\$c|xyz/i : chr($o) : u_str = $u_str u_pat = $u_pat; Bug 36207"); 910 } 911 } 912 } 913 } 914 915 { 916 my $message = '$REGMARK in replacement; Bug 49190'; 917 our $REGMARK; 918 no warnings 'experimental::lexical_topic'; 919 my $_ = "A"; 920 ok(s/(*:B)A/$REGMARK/, $message); 921 is($_, "B", $message); 922 $_ = "CCCCBAA"; 923 ok(s/(*:X)A+|(*:Y)B+|(*:Z)C+/$REGMARK/g, $message); 924 is($_, "ZYX", $message); 925 # Use a longer name to force reallocation of $REGMARK. 926 $_ = "CCCCBAA"; 927 ok(s/(*:X)A+|(*:YYYYYYYYYYYYYYYY)B+|(*:Z)C+/$REGMARK/g, $message); 928 is($_, "ZYYYYYYYYYYYYYYYYX", $message); 929 } 930 931 { 932 my $message = 'Substitution evaluation in list context; Bug 52658'; 933 my $reg = '../xxx/'; 934 my @te = ($reg =~ m{^(/?(?:\.\./)*)}, 935 $reg =~ s/(x)/'b'/eg > 1 ? '##' : '++'); 936 is($reg, '../bbb/', $message); 937 is($te [0], '../', $message); 938 } 939 940 { 941 my $a = "xyzt" x 8192; 942 like($a, qr/\A(?>[a-z])*\z/, 943 '(?>) does not cause wrongness on long string; Bug 60034'); 944 my $b = $a . chr 256; 945 chop $b; 946 is($a, $b, 'Bug 60034'); 947 like($b, qr/\A(?>[a-z])*\z/, 948 '(?>) does not cause wrongness on long string with UTF-8; Bug 60034'); 949 } 950 951 # 952 # Keep the following tests last -- they may crash perl 953 # 954 print "# Tests that follow may crash perl\n"; 955 { 956 957 my $message = 'Pattern in a loop, failure should not ' . 958 'affect previous success; Bug 19049/38869'; 959 my @list = ( 960 'ab cdef', # Matches regex 961 ('e' x 40000 ) .'ab c' # Matches not, but 'ab c' matches part of it 962 ); 963 my $y; 964 my $x; 965 foreach (@list) { 966 m/ab(.+)cd/i; # The ignore-case seems to be important 967 $y = $1; # Use $1, which might not be from the last match! 968 $x = substr ($list [0], $- [0], $+ [0] - $- [0]); 969 } 970 is($y, ' ', $message); 971 is($x, 'ab cd', $message); 972 } 973 974 { 975 ok (("a" x (2 ** 15 - 10)) =~ /^()(a|bb)*$/, "Recursive stack cracker; Bug 24274"); 976 ok ((q(a)x 100) =~ /^(??{'(.)'x 100})/, 977 "Regexp /^(??{'(.)'x 100})/ crashes older perls; Bug 24274"); 978 } 979 980 { 981 # [perl #45337] utf8 + "[a]a{2}" + /$.../ = panic: sv_len_utf8 cache 982 983 local ${^UTF8CACHE} = -1; 984 my $message = "Shouldn't panic; Bug 45337"; 985 my $s = "[a]a{2}"; 986 utf8::upgrade $s; 987 like("aaa", qr/$s/, $message); 988 } 989 { 990 my $message = "Check if tree logic breaks \$^R; Bug 57042"; 991 my $cond_re = qr/\s* 992 \s* (?: 993 \( \s* A (?{1}) 994 | \( \s* B (?{2}) 995 ) 996 /x; 997 my @res; 998 for my $line ("(A)","(B)") { 999 if ($line =~ m/$cond_re/) { 1000 push @res, $^R ? "#$^R" : "UNDEF"; 1001 } 1002 } 1003 is("@res","#1 #2", $message); 1004 } 1005 { 1006 no warnings 'closure'; 1007 my $re = qr/A(??{"1"})/; 1008 ok "A1B" =~ m/^((??{ $re }))((??{"B"}))$/; 1009 ok $1 eq "A1"; 1010 ok $2 eq "B"; 1011 } 1012 1013 # This only works under -DEBUGGING because it relies on an assert(). 1014 { 1015 # Check capture offset re-entrancy of utf8 code. 1016 1017 sub fswash { $_[0] =~ s/([>X])//g; } 1018 1019 my $k1 = "." x 4 . ">>"; 1020 fswash($k1); 1021 1022 my $k2 = "\x{f1}\x{2022}"; 1023 $k2 =~ s/([\360-\362])/>/g; 1024 fswash($k2); 1025 1026 is($k2, "\x{2022}", "utf8::SWASHNEW doesn't cause capture leaks; Bug 60508"); 1027 } 1028 1029 { 1030 # minimal CURLYM limited to 32767 matches 1031 my @pat = ( 1032 qr{a(x|y)*b}, # CURLYM 1033 qr{a(x|y)*?b}, # .. with minmod 1034 qr{a([wx]|[yz])*b}, # .. and without tries 1035 qr{a([wx]|[yz])*?b}, 1036 ); 1037 my $len = 32768; 1038 my $s = join '', 'a', 'x' x $len, 'b'; 1039 for my $pat (@pat) { 1040 like($s, $pat, "$pat; Bug 65372"); 1041 } 1042 } 1043 1044 { 1045 local $::TODO = "[perl #38133]"; 1046 1047 "A" =~ /(((?:A))?)+/; 1048 my $first = $2; 1049 1050 "A" =~ /(((A))?)+/; 1051 my $second = $2; 1052 1053 is($first, $second); 1054 } 1055 1056 { 1057 my $message 1058 = 'utf8 =~ /trie/ where trie matches a continuation octet; Bug 70998'; 1059 1060 # Catch warnings: 1061 my $w; 1062 local $SIG{__WARN__} = sub { $w .= shift }; 1063 1064 # This bug can be reduced to 1065 qq{\x{30ab}} =~ /\xab|\xa9/; 1066 # but it's nice to have a more 'real-world' test. The original test 1067 # case from the RT ticket follows: 1068 1069 my %conv = ( 1070 "\xab" => "<", 1071 "\xa9" => "(c)", 1072 ); 1073 my $conv_rx = '(' . join('|', map { quotemeta } keys %conv) . ')'; 1074 $conv_rx = qr{$conv_rx}; 1075 1076 my $x 1077 = qq{\x{3042}\x{304b}\x{3055}\x{305f}\x{306a}\x{306f}\x{307e}} 1078 . qq{\x{3084}\x{3089}\x{308f}\x{3093}\x{3042}\x{304b}\x{3055}} 1079 . qq{\x{305f}\x{306a}\x{306f}\x{307e}\x{3084}\x{3089}\x{308f}} 1080 . qq{\x{3093}\x{30a2}\x{30ab}\x{30b5}\x{30bf}\x{30ca}\x{30cf}} 1081 . qq{\x{30de}\x{30e4}\x{30e9}\x{30ef}\x{30f3}\x{30a2}\x{30ab}} 1082 . qq{\x{30b5}\x{30bf}\x{30ca}\x{30cf}\x{30de}\x{30e4}\x{30e9}} 1083 . qq{\x{30ef}\x{30f3}\x{30a2}\x{30ab}\x{30b5}\x{30bf}\x{30ca}} 1084 . qq{\x{30cf}\x{30de}\x{30e4}\x{30e9}\x{30ef}\x{30f3}}; 1085 1086 $x =~ s{$conv_rx}{$conv{$1}}eg; 1087 1088 is($w, undef, $message); 1089 } 1090 1091 { 1092 # minimal CURLYM limited to 32767 matches 1093 1094 is(join("-", " abc def " =~ /(?=(\S+))/g), "abc-bc-c-def-ef-f", 1095 'stclass optimisation does not break + inside (?=); Bug 68564'); 1096 } 1097 1098 { 1099 use charnames ":full"; 1100 # Delayed interpolation of \N' 1101 my $r1 = qr/\N{THAI CHARACTER SARA I}/; 1102 my $s1 = "\x{E34}\x{E34}\x{E34}\x{E34}"; 1103 1104 # Bug #56444 1105 ok $s1 =~ /$r1+/, 'my $r1 = qr/\N{THAI CHARACTER SARA I}/; my $s1 = "\x{E34}\x{E34}\x{E34}\x{E34}; $s1 =~ /$r1+/'; 1106 1107 # Bug #62056 1108 ok "${s1}A" =~ m/$s1\N{LATIN CAPITAL LETTER A}/, '"${s1}A" =~ m/$s1\N{LATIN CAPITAL LETTER A}/'; 1109 1110 ok "abbbbc" =~ m/\N{1}/ && $& eq "a", '"abbbbc" =~ m/\N{1}/ && $& eq "a"'; 1111 ok "abbbbc" =~ m/\N{3,4}/ && $& eq "abbb", '"abbbbc" =~ m/\N{3,4}/ && $& eq "abbb"'; 1112 } 1113 1114 { 1115 use charnames ":full"; 1116 my $message = '[perl #74982] Period coming after \N{}'; 1117 ok("\x{ff08}." =~ m/\N{FULLWIDTH LEFT PARENTHESIS}./ && $& eq "\x{ff08}.", $message); 1118 ok("\x{ff08}." =~ m/[\N{FULLWIDTH LEFT PARENTHESIS}]./ && $& eq "\x{ff08}.", $message); 1119 } 1120 1121SKIP: { 1122 ######## "Segfault using HTML::Entities", Richard Jolly <richardjolly@mac.com>, <A3C7D27E-C9F4-11D8-B294-003065AE00B6@mac.com> in perl-unicode@perl.org 1123 1124 skip('Perl configured without Encode module', 1) 1125 unless $Config{extensions} =~ / Encode /; 1126 1127 # Test case cut down by jhi 1128 fresh_perl_like(<<'EOP', qr!Malformed UTF-8 character \(unexpected end of string\) in substitution \(s///\) at!, {}, 'Segfault using HTML::Entities'); 1129use Encode; 1130my $t = ord('A') == 193 ? "\xEA" : "\xE9"; 1131Encode::_utf8_on($t); 1132$t =~ s/([^a])//ge; 1133EOP 1134 } 1135 1136 { 1137 # pattern must be compiled late or we can break the test file 1138 my $message = '[perl #115050] repeated nothings in a trie can cause panic'; 1139 my $pattern; 1140 $pattern = '[xyz]|||'; 1141 ok("blah blah" =~ /$pattern/, $message); 1142 ok("blah blah" =~ /(?:$pattern)h/, $message); 1143 $pattern = '|||[xyz]'; 1144 ok("blah blah" =~ /$pattern/, $message); 1145 ok("blah blah" =~ /(?:$pattern)h/, $message); 1146 } 1147 1148 { 1149 # [perl #4289] First mention $& after a match 1150 local $::TODO = "these tests fail without Copy-on-Write enabled" 1151 if $Config{ccflags} =~ /PERL_NO_COW/; 1152 fresh_perl_is( 1153 '$_ = "abc"; /b/g; $_ = "hello"; print eval q|$&|, "\n"', 1154 "b\n", {}, '$& first mentioned after match'); 1155 fresh_perl_is( 1156 '$_ = "abc"; /b/g; $_ = "hello"; print eval q|$`|, "\n"', 1157 "a\n", {}, '$` first mentioned after match'); 1158 fresh_perl_is( 1159 '$_ = "abc"; /b/g; $_ = "hello"; print eval q|$\'|,"\n"', 1160 "c\n", {}, '$\' first mentioned after match'); 1161 } 1162 1163 { 1164 # [perl #118175] threaded perl-5.18.0 fails pat_rt_report_thr.t 1165 # this tests some related failures 1166 # 1167 # The tests in the block *only* fail when run on 32-bit systems 1168 # with a malloc that allocates above the 2GB line. On the system 1169 # in the report above that only happened in a thread. 1170 my $s = "\x{1ff}" . "f" x 32; 1171 ok($s =~ /\x{1ff}[[:alpha:]]+/gca, "POSIXA pointer wrap"); 1172 1173 # this one segfaulted under the conditions above 1174 # of course, CANY is evil, maybe it should crash 1175 ok($s =~ /.\C+/, "CANY pointer wrap"); 1176 } 1177} # End of sub run_tests 1178 11791; 1180