1#!./perl 2# 3# This is a home for regular expression tests that do not fit into 4# the format supported by re/regexp.t. If you want to add a test 5# that does fit that format, add it to re/re_tests, not here. 6 7use strict; 8use warnings; 9use 5.010; 10 11 12sub run_tests; 13 14$| = 1; 15 16 17BEGIN { 18 chdir 't' if -d 't'; 19 @INC = ('../lib','.'); 20 require './test.pl'; 21 skip_all_if_miniperl("miniperl can't load Tie::Hash::NamedCapture, need for %+ and %-"); 22} 23 24run_tests() unless caller; 25 26# 27# Tests start here. 28# 29sub run_tests { 30 31 { 32 my $message = '\C matches octet'; 33 $_ = "a\x{100}b"; 34 ok(/(.)(\C)(\C)(.)/, $message); 35 is($1, "a", $message); 36 if ($::IS_ASCII) { # ASCII (or equivalent), should be UTF-8 37 is($2, "\xC4", $message); 38 is($3, "\x80", $message); 39 } 40 elsif ($::IS_EBCDIC) { # EBCDIC (or equivalent), should be UTF-EBCDIC 41 is($2, "\x8C", $message); 42 is($3, "\x41", $message); 43 } 44 else { 45 SKIP: { 46 ok 0, "Unexpected platform", "ord ('A') =" . ord 'A'; 47 skip "Unexpected platform"; 48 } 49 } 50 is($4, "b", $message); 51 } 52 53 { 54 my $message = '\C matches octet'; 55 $_ = "\x{100}"; 56 ok(/(\C)/g, $message); 57 if ($::IS_ASCII) { 58 is($1, "\xC4", $message); 59 } 60 elsif ($::IS_EBCDIC) { 61 is($1, "\x8C", $message); 62 } 63 else { 64 ok 0, "Unexpected platform", "ord ('A') = " . ord 'A'; 65 } 66 ok(/(\C)/g, $message); 67 if ($::IS_ASCII) { 68 is($1, "\x80", $message); 69 } 70 elsif ($::IS_EBCDIC) { 71 is($1, "\x41", $message); 72 } 73 else { 74 ok 0, "Unexpected platform", "ord ('A') = " . ord 'A'; 75 } 76 } 77 78 { 79 # Japhy -- added 03/03/2001 80 () = (my $str = "abc") =~ /(...)/; 81 $str = "def"; 82 is($1, "abc", 'Changing subject does not modify $1'); 83 } 84 85 SKIP: 86 { 87 # The trick is that in EBCDIC the explicit numeric range should 88 # match (as also in non-EBCDIC) but the explicit alphabetic range 89 # should not match. 90 ok "\x8e" =~ /[\x89-\x91]/, '"\x8e" =~ /[\x89-\x91]/'; 91 ok "\xce" =~ /[\xc9-\xd1]/, '"\xce" =~ /[\xc9-\xd1]/'; 92 ok "\xd0" =~ /[\xc9-\xd1]/, '"\xd0" =~ /[\xc9-\xd1]/'; 93 94 skip "Not an EBCDIC platform", 2 unless ord ('i') == 0x89 && 95 ord ('J') == 0xd1; 96 97 # In most places these tests would succeed since \x8e does not 98 # in most character sets match 'i' or 'j' nor would \xce match 99 # 'I' or 'J', but strictly speaking these tests are here for 100 # the good of EBCDIC, so let's test these only there. 101 unlike("\x8e", qr/[i-j]/, '"\x8e" !~ /[i-j]/'); 102 unlike("\xce", qr/[I-J]/, '"\xce" !~ /[I-J]/'); 103 unlike("\xd0", qr/[I-J]/, '"\xd0" !~ /[I-J]/'); 104 } 105 106 { 107 ok "\x{ab}" =~ /\x{ab}/, '"\x{ab}" =~ /\x{ab}/ '; 108 ok "\x{abcd}" =~ /\x{abcd}/, '"\x{abcd}" =~ /\x{abcd}/'; 109 } 110 111 { 112 my $message = 'bug id 20001008.001'; 113 114 my @x = ("stra\337e 138", "stra\337e 138"); 115 for (@x) { 116 ok(s/(\d+)\s*([\w\-]+)/$1 . uc $2/e, $message); 117 ok(my ($latin) = /^(.+)(?:\s+\d)/, $message); 118 is($latin, "stra\337e", $message); 119 ok($latin =~ s/stra\337e/straße/, $message); 120 # 121 # Previous code follows, but outcommented - there were no tests. 122 # 123 # $latin =~ s/stra\337e/straße/; # \303\237 after the 2nd a 124 # use utf8; # needed for the raw UTF-8 125 # $latin =~ s!(s)tr(?:aß|s+e)!$1tr.!; # \303\237 after the a 126 } 127 } 128 129 { 130 my $message = 'Test \x escapes'; 131 ok("ba\xd4c" =~ /([a\xd4]+)/ && $1 eq "a\xd4", $message); 132 ok("ba\xd4c" =~ /([a\xd4]+)/ && $1 eq "a\x{d4}", $message); 133 ok("ba\x{d4}c" =~ /([a\xd4]+)/ && $1 eq "a\x{d4}", $message); 134 ok("ba\x{d4}c" =~ /([a\xd4]+)/ && $1 eq "a\xd4", $message); 135 ok("ba\xd4c" =~ /([a\x{d4}]+)/ && $1 eq "a\xd4", $message); 136 ok("ba\xd4c" =~ /([a\x{d4}]+)/ && $1 eq "a\x{d4}", $message); 137 ok("ba\x{d4}c" =~ /([a\x{d4}]+)/ && $1 eq "a\x{d4}", $message); 138 ok("ba\x{d4}c" =~ /([a\x{d4}]+)/ && $1 eq "a\xd4", $message); 139 } 140 141 { 142 my $message = 'Match code points > 255'; 143 $_ = "abc\x{100}\x{200}\x{300}\x{380}\x{400}defg"; 144 ok(/(.\x{300})./, $message); 145 ok($` eq "abc\x{100}" && length ($`) == 4, $message); 146 ok($& eq "\x{200}\x{300}\x{380}" && length ($&) == 3, $message); 147 ok($' eq "\x{400}defg" && length ($') == 5, $message); 148 ok($1 eq "\x{200}\x{300}" && length ($1) == 2, $message); 149 } 150 151 { 152 my $x = "\x{10FFFD}"; 153 $x =~ s/(.)/$1/g; 154 ok ord($x) == 0x10FFFD && length($x) == 1, "From Robin Houston"; 155 } 156 157 { 158 my %d = ( 159 "7f" => [0, 0, 0], 160 "80" => [1, 1, 0], 161 "ff" => [1, 1, 0], 162 "100" => [0, 1, 1], 163 ); 164 165 while (my ($code, $match) = each %d) { 166 my $message = "Properties of \\x$code"; 167 my $char = eval qq ["\\x{$code}"]; 168 169 is(0 + ($char =~ /[\x80-\xff]/), $$match[0], $message); 170 is(0 + ($char =~ /[\x80-\x{100}]/), $$match[1], $message); 171 is(0 + ($char =~ /[\x{100}]/), $$match[2], $message); 172 } 173 } 174 175 { 176 # From Japhy 177 foreach (qw(c g o)) { 178 warning_like(sub {'' =~ "(?$_)"}, qr/^Useless \(\?$_\)/); 179 warning_like(sub {'' =~ "(?-$_)"}, qr/^Useless \(\?-$_\)/); 180 } 181 182 # Now test multi-error regexes 183 foreach (['(?g-o)', qr/^Useless \(\?g\)/, qr/^Useless \(\?-o\)/], 184 ['(?g-c)', qr/^Useless \(\?g\)/, qr/^Useless \(\?-c\)/], 185 # (?c) means (?g) error won't be thrown 186 ['(?o-cg)', qr/^Useless \(\?o\)/, qr/^Useless \(\?-c\)/], 187 ['(?ogc)', qr/^Useless \(\?o\)/, qr/^Useless \(\?g\)/, 188 qr/^Useless \(\?c\)/], 189 ) { 190 my ($re, @warnings) = @$_; 191 warnings_like(sub {eval "qr/$re/"}, \@warnings, "qr/$re/ warns"); 192 } 193 } 194 195 { 196 my $message = "/x tests"; 197 $_ = "foo"; 198 foreach my $pat (<<" --", <<" --") { 199 /f 200 o\r 201 o 202 \$ 203 /x 204 -- 205 /f 206 o 207 o 208 \$\r 209 /x 210 -- 211 is(eval $pat, 1, $message); 212 is($@, '', $message); 213 } 214 } 215 216 { 217 my $message = "/o feature"; 218 sub test_o {$_ [0] =~ /$_[1]/o; return $1} 219 is(test_o ('abc', '(.)..'), 'a', $message); 220 is(test_o ('abc', '..(.)'), 'a', $message); 221 } 222 223 { 224 # Test basic $^N usage outside of a regex 225 my $message = '$^N usage outside of a regex'; 226 my $x = "abcdef"; 227 ok(($x =~ /cde/ and !defined $^N), $message); 228 ok(($x =~ /(cde)/ and $^N eq "cde"), $message); 229 ok(($x =~ /(c)(d)(e)/ and $^N eq "e"), $message); 230 ok(($x =~ /(c(d)e)/ and $^N eq "cde"), $message); 231 ok(($x =~ /(foo)|(c(d)e)/ and $^N eq "cde"), $message); 232 ok(($x =~ /(c(d)e)|(foo)/ and $^N eq "cde"), $message); 233 ok(($x =~ /(c(d)e)|(abc)/ and $^N eq "abc"), $message); 234 ok(($x =~ /(c(d)e)|(abc)x/ and $^N eq "cde"), $message); 235 ok(($x =~ /(c(d)e)(abc)?/ and $^N eq "cde"), $message); 236 ok(($x =~ /(?:c(d)e)/ and $^N eq "d"), $message); 237 ok(($x =~ /(?:c(d)e)(?:f)/ and $^N eq "d"), $message); 238 ok(($x =~ /(?:([abc])|([def]))*/ and $^N eq "f"), $message); 239 ok(($x =~ /(?:([ace])|([bdf]))*/ and $^N eq "f"), $message); 240 ok(($x =~ /(([ace])|([bd]))*/ and $^N eq "e"), $message); 241 {ok(($x =~ /(([ace])|([bdf]))*/ and $^N eq "f"), $message);} 242 ## Test to see if $^N is automatically localized -- it should now 243 ## have the value set in the previous test. 244 is($^N, "e", '$^N is automatically localized'); 245 246 # Now test inside (?{ ... }) 247 $message = '$^N usage inside (?{ ... })'; 248 our ($y, $z); 249 ok(($x =~ /a([abc])(?{$y=$^N})c/ and $y eq "b"), $message); 250 ok(($x =~ /a([abc]+)(?{$y=$^N})d/ and $y eq "bc"), $message); 251 ok(($x =~ /a([abcdefg]+)(?{$y=$^N})d/ and $y eq "bc"), $message); 252 ok(($x =~ /(a([abcdefg]+)(?{$y=$^N})d)(?{$z=$^N})e/ and $y eq "bc" 253 and $z eq "abcd"), $message); 254 ok(($x =~ /(a([abcdefg]+)(?{$y=$^N})de)(?{$z=$^N})/ and $y eq "bc" 255 and $z eq "abcde"), $message); 256 257 } 258 259 SKIP: 260 { 261 ## Should probably put in tests for all the POSIX stuff, 262 ## but not sure how to guarantee a specific locale...... 263 264 skip "Not an ASCII platform", 2 unless $::IS_ASCII; 265 my $message = 'Test [[:cntrl:]]'; 266 my $AllBytes = join "" => map {chr} 0 .. 255; 267 (my $x = $AllBytes) =~ s/[[:cntrl:]]//g; 268 is($x, join("", map {chr} 0x20 .. 0x7E, 0x80 .. 0xFF), $message); 269 270 ($x = $AllBytes) =~ s/[^[:cntrl:]]//g; 271 is($x, (join "", map {chr} 0x00 .. 0x1F, 0x7F), $message); 272 } 273 274 { 275 # With /s modifier UTF8 chars were interpreted as bytes 276 my $message = "UTF-8 chars aren't bytes"; 277 my $a = "Hello \x{263A} World"; 278 my @a = ($a =~ /./gs); 279 is($#a, 12, $message); 280 } 281 282 { 283 my $message = '. matches \n with /s'; 284 my $str1 = "foo\nbar"; 285 my $str2 = "foo\n\x{100}bar"; 286 my ($a, $b) = map {chr} $::IS_ASCII ? (0xc4, 0x80) : (0x8c, 0x41); 287 my @a; 288 @a = $str1 =~ /./g; is(@a, 6, $message); is("@a", "f o o b a r", $message); 289 @a = $str1 =~ /./gs; is(@a, 7, $message); is("@a", "f o o \n b a r", $message); 290 @a = $str1 =~ /\C/g; is(@a, 7, $message); is("@a", "f o o \n b a r", $message); 291 @a = $str1 =~ /\C/gs; is(@a, 7, $message); is("@a", "f o o \n b a r", $message); 292 @a = $str2 =~ /./g; is(@a, 7, $message); is("@a", "f o o \x{100} b a r", $message); 293 @a = $str2 =~ /./gs; is(@a, 8, $message); is("@a", "f o o \n \x{100} b a r", $message); 294 @a = $str2 =~ /\C/g; is(@a, 9, $message); is("@a", "f o o \n $a $b b a r", $message); 295 @a = $str2 =~ /\C/gs; is(@a, 9, $message); is("@a", "f o o \n $a $b b a r", $message); 296 } 297 298 { 299 no warnings 'digit'; 300 # Check that \x## works. 5.6.1 and 5.005_03 fail some of these. 301 my $x; 302 $x = "\x4e" . "E"; 303 ok ($x =~ /^\x4EE$/, "Check only 2 bytes of hex are matched."); 304 305 $x = "\x4e" . "i"; 306 ok ($x =~ /^\x4Ei$/, "Check that invalid hex digit stops it (2)"); 307 308 $x = "\x4" . "j"; 309 ok ($x =~ /^\x4j$/, "Check that invalid hex digit stops it (1)"); 310 311 $x = "\x0" . "k"; 312 ok ($x =~ /^\xk$/, "Check that invalid hex digit stops it (0)"); 313 314 $x = "\x0" . "x"; 315 ok ($x =~ /^\xx$/, "\\xx isn't to be treated as \\0"); 316 317 $x = "\x0" . "xa"; 318 ok ($x =~ /^\xxa$/, "\\xxa isn't to be treated as \\xa"); 319 320 $x = "\x9" . "_b"; 321 ok ($x =~ /^\x9_b$/, "\\x9_b isn't to be treated as \\x9b"); 322 323 # and now again in [] ranges 324 325 $x = "\x4e" . "E"; 326 ok ($x =~ /^[\x4EE]{2}$/, "Check only 2 bytes of hex are matched."); 327 328 $x = "\x4e" . "i"; 329 ok ($x =~ /^[\x4Ei]{2}$/, "Check that invalid hex digit stops it (2)"); 330 331 $x = "\x4" . "j"; 332 ok ($x =~ /^[\x4j]{2}$/, "Check that invalid hex digit stops it (1)"); 333 334 $x = "\x0" . "k"; 335 ok ($x =~ /^[\xk]{2}$/, "Check that invalid hex digit stops it (0)"); 336 337 $x = "\x0" . "x"; 338 ok ($x =~ /^[\xx]{2}$/, "\\xx isn't to be treated as \\0"); 339 340 $x = "\x0" . "xa"; 341 ok ($x =~ /^[\xxa]{3}$/, "\\xxa isn't to be treated as \\xa"); 342 343 $x = "\x9" . "_b"; 344 ok ($x =~ /^[\x9_b]{3}$/, "\\x9_b isn't to be treated as \\x9b"); 345 346 # Check that \x{##} works. 5.6.1 fails quite a few of these. 347 348 $x = "\x9b"; 349 ok ($x =~ /^\x{9_b}$/, "\\x{9_b} is to be treated as \\x9b"); 350 351 $x = "\x9b" . "y"; 352 ok ($x =~ /^\x{9_b}y$/, "\\x{9_b} is to be treated as \\x9b (again)"); 353 354 $x = "\x9b" . "y"; 355 ok ($x =~ /^\x{9b_}y$/, "\\x{9b_} is to be treated as \\x9b"); 356 357 $x = "\x9b" . "y"; 358 ok ($x =~ /^\x{9_bq}y$/, "\\x{9_bc} is to be treated as \\x9b"); 359 360 $x = "\x0" . "y"; 361 ok ($x =~ /^\x{x9b}y$/, "\\x{x9b} is to be treated as \\x0"); 362 363 $x = "\x0" . "y"; 364 ok ($x =~ /^\x{0x9b}y$/, "\\x{0x9b} is to be treated as \\x0"); 365 366 $x = "\x9b" . "y"; 367 ok ($x =~ /^\x{09b}y$/, "\\x{09b} is to be treated as \\x9b"); 368 369 $x = "\x9b"; 370 ok ($x =~ /^[\x{9_b}]$/, "\\x{9_b} is to be treated as \\x9b"); 371 372 $x = "\x9b" . "y"; 373 ok ($x =~ /^[\x{9_b}y]{2}$/, 374 "\\x{9_b} is to be treated as \\x9b (again)"); 375 376 $x = "\x9b" . "y"; 377 ok ($x =~ /^[\x{9b_}y]{2}$/, "\\x{9b_} is to be treated as \\x9b"); 378 379 $x = "\x9b" . "y"; 380 ok ($x =~ /^[\x{9_bq}y]{2}$/, "\\x{9_bc} is to be treated as \\x9b"); 381 382 $x = "\x0" . "y"; 383 ok ($x =~ /^[\x{x9b}y]{2}$/, "\\x{x9b} is to be treated as \\x0"); 384 385 $x = "\x0" . "y"; 386 ok ($x =~ /^[\x{0x9b}y]{2}$/, "\\x{0x9b} is to be treated as \\x0"); 387 388 $x = "\x9b" . "y"; 389 ok ($x =~ /^[\x{09b}y]{2}$/, "\\x{09b} is to be treated as \\x9b"); 390 391 } 392 393 { 394 # High bit bug -- japhy 395 my $x = "ab\200d"; 396 ok $x =~ /.*?\200/, "High bit fine"; 397 } 398 399 { 400 # The basic character classes and Unicode 401 ok "\x{0100}" =~ /\w/, 'LATIN CAPITAL LETTER A WITH MACRON in /\w/'; 402 ok "\x{0660}" =~ /\d/, 'ARABIC-INDIC DIGIT ZERO in /\d/'; 403 ok "\x{1680}" =~ /\s/, 'OGHAM SPACE MARK in /\s/'; 404 } 405 406 { 407 my $message = "Folding matches and Unicode"; 408 like("a\x{100}", qr/A/i, $message); 409 like("A\x{100}", qr/a/i, $message); 410 like("a\x{100}", qr/a/i, $message); 411 like("A\x{100}", qr/A/i, $message); 412 like("\x{101}a", qr/\x{100}/i, $message); 413 like("\x{100}a", qr/\x{100}/i, $message); 414 like("\x{101}a", qr/\x{101}/i, $message); 415 like("\x{100}a", qr/\x{101}/i, $message); 416 like("a\x{100}", qr/A\x{100}/i, $message); 417 like("A\x{100}", qr/a\x{100}/i, $message); 418 like("a\x{100}", qr/a\x{100}/i, $message); 419 like("A\x{100}", qr/A\x{100}/i, $message); 420 like("a\x{100}", qr/[A]/i, $message); 421 like("A\x{100}", qr/[a]/i, $message); 422 like("a\x{100}", qr/[a]/i, $message); 423 like("A\x{100}", qr/[A]/i, $message); 424 like("\x{101}a", qr/[\x{100}]/i, $message); 425 like("\x{100}a", qr/[\x{100}]/i, $message); 426 like("\x{101}a", qr/[\x{101}]/i, $message); 427 like("\x{100}a", qr/[\x{101}]/i, $message); 428 } 429 430 { 431 use charnames ':full'; 432 my $message = "Folding 'LATIN LETTER A WITH GRAVE'"; 433 434 my $lower = "\N{LATIN SMALL LETTER A WITH GRAVE}"; 435 my $UPPER = "\N{LATIN CAPITAL LETTER A WITH GRAVE}"; 436 437 like($lower, qr/$UPPER/i, $message); 438 like($UPPER, qr/$lower/i, $message); 439 like($lower, qr/[$UPPER]/i, $message); 440 like($UPPER, qr/[$lower]/i, $message); 441 442 $message = "Folding 'GREEK LETTER ALPHA WITH VRACHY'"; 443 444 $lower = "\N{GREEK CAPITAL LETTER ALPHA WITH VRACHY}"; 445 $UPPER = "\N{GREEK SMALL LETTER ALPHA WITH VRACHY}"; 446 447 like($lower, qr/$UPPER/i, $message); 448 like($UPPER, qr/$lower/i, $message); 449 like($lower, qr/[$UPPER]/i, $message); 450 like($UPPER, qr/[$lower]/i, $message); 451 452 $message = "Folding 'LATIN LETTER Y WITH DIAERESIS'"; 453 454 $lower = "\N{LATIN SMALL LETTER Y WITH DIAERESIS}"; 455 $UPPER = "\N{LATIN CAPITAL LETTER Y WITH DIAERESIS}"; 456 457 like($lower, qr/$UPPER/i, $message); 458 like($UPPER, qr/$lower/i, $message); 459 like($lower, qr/[$UPPER]/i, $message); 460 like($UPPER, qr/[$lower]/i, $message); 461 } 462 463 { 464 use charnames ':full'; 465 my $message = "GREEK CAPITAL LETTER SIGMA vs " . 466 "COMBINING GREEK PERISPOMENI"; 467 468 my $SIGMA = "\N{GREEK CAPITAL LETTER SIGMA}"; 469 my $char = "\N{COMBINING GREEK PERISPOMENI}"; 470 471 warning_is(sub {unlike("_:$char:_", qr/_:$SIGMA:_/i, $message)}, undef, 472 'Did not warn [change a5961de5f4215b5c]'); 473 } 474 475 { 476 my $message = '\X'; 477 use charnames ':full'; 478 479 ok("a!" =~ /^(\X)!/ && $1 eq "a", $message); 480 ok("\xDF!" =~ /^(\X)!/ && $1 eq "\xDF", $message); 481 ok("\x{100}!" =~ /^(\X)!/ && $1 eq "\x{100}", $message); 482 ok("\x{100}\x{300}!" =~ /^(\X)!/ && $1 eq "\x{100}\x{300}", $message); 483 ok("\N{LATIN CAPITAL LETTER E}!" =~ /^(\X)!/ && 484 $1 eq "\N{LATIN CAPITAL LETTER E}", $message); 485 ok("\N{LATIN CAPITAL LETTER E}\N{COMBINING GRAVE ACCENT}!" 486 =~ /^(\X)!/ && 487 $1 eq "\N{LATIN CAPITAL LETTER E}\N{COMBINING GRAVE ACCENT}", $message); 488 489 $message = '\C and \X'; 490 like("!abc!", qr/a\Cc/, $message); 491 like("!abc!", qr/a\Xc/, $message); 492 } 493 494 { 495 my $message = "Final Sigma"; 496 497 my $SIGMA = "\x{03A3}"; # CAPITAL 498 my $Sigma = "\x{03C2}"; # SMALL FINAL 499 my $sigma = "\x{03C3}"; # SMALL 500 501 like($SIGMA, qr/$SIGMA/i, $message); 502 like($SIGMA, qr/$Sigma/i, $message); 503 like($SIGMA, qr/$sigma/i, $message); 504 505 like($Sigma, qr/$SIGMA/i, $message); 506 like($Sigma, qr/$Sigma/i, $message); 507 like($Sigma, qr/$sigma/i, $message); 508 509 like($sigma, qr/$SIGMA/i, $message); 510 like($sigma, qr/$Sigma/i, $message); 511 like($sigma, qr/$sigma/i, $message); 512 513 like($SIGMA, qr/[$SIGMA]/i, $message); 514 like($SIGMA, qr/[$Sigma]/i, $message); 515 like($SIGMA, qr/[$sigma]/i, $message); 516 517 like($Sigma, qr/[$SIGMA]/i, $message); 518 like($Sigma, qr/[$Sigma]/i, $message); 519 like($Sigma, qr/[$sigma]/i, $message); 520 521 like($sigma, qr/[$SIGMA]/i, $message); 522 like($sigma, qr/[$Sigma]/i, $message); 523 like($sigma, qr/[$sigma]/i, $message); 524 525 $message = "More final Sigma"; 526 527 my $S3 = "$SIGMA$Sigma$sigma"; 528 529 ok(":$S3:" =~ /:(($SIGMA)+):/i && $1 eq $S3 && $2 eq $sigma, $message); 530 ok(":$S3:" =~ /:(($Sigma)+):/i && $1 eq $S3 && $2 eq $sigma, $message); 531 ok(":$S3:" =~ /:(($sigma)+):/i && $1 eq $S3 && $2 eq $sigma, $message); 532 533 ok(":$S3:" =~ /:(([$SIGMA])+):/i && $1 eq $S3 && $2 eq $sigma, $message); 534 ok(":$S3:" =~ /:(([$Sigma])+):/i && $1 eq $S3 && $2 eq $sigma, $message); 535 ok(":$S3:" =~ /:(([$sigma])+):/i && $1 eq $S3 && $2 eq $sigma, $message); 536 } 537 538 { 539 use charnames ':full'; 540 my $message = "Parlez-Vous " . 541 "Fran\N{LATIN SMALL LETTER C WITH CEDILLA}ais?"; 542 543 ok("Fran\N{LATIN SMALL LETTER C}ais" =~ /Fran.ais/ && 544 $& eq "Francais", $message); 545 ok("Fran\N{LATIN SMALL LETTER C WITH CEDILLA}ais" =~ /Fran.ais/ && 546 $& eq "Fran\N{LATIN SMALL LETTER C WITH CEDILLA}ais", $message); 547 ok("Fran\N{LATIN SMALL LETTER C}ais" =~ /Fran\Cais/ && 548 $& eq "Francais", $message); 549 # COMBINING CEDILLA is two bytes when encoded 550 like("Franc\N{COMBINING CEDILLA}ais", qr/Franc\C\Cais/, $message); 551 ok("Fran\N{LATIN SMALL LETTER C}ais" =~ /Fran\Xais/ && 552 $& eq "Francais", $message); 553 ok("Fran\N{LATIN SMALL LETTER C WITH CEDILLA}ais" =~ /Fran\Xais/ && 554 $& eq "Fran\N{LATIN SMALL LETTER C WITH CEDILLA}ais", $message); 555 ok("Franc\N{COMBINING CEDILLA}ais" =~ /Fran\Xais/ && 556 $& eq "Franc\N{COMBINING CEDILLA}ais", $message); 557 ok("Fran\N{LATIN SMALL LETTER C WITH CEDILLA}ais" =~ 558 /Fran\N{LATIN SMALL LETTER C WITH CEDILLA}ais/ && 559 $& eq "Fran\N{LATIN SMALL LETTER C WITH CEDILLA}ais", $message); 560 ok("Franc\N{COMBINING CEDILLA}ais" =~ /Franc\N{COMBINING CEDILLA}ais/ && 561 $& eq "Franc\N{COMBINING CEDILLA}ais", $message); 562 563 my @f = ( 564 ["Fran\N{LATIN SMALL LETTER C}ais", "Francais"], 565 ["Fran\N{LATIN SMALL LETTER C WITH CEDILLA}ais", 566 "Fran\N{LATIN SMALL LETTER C WITH CEDILLA}ais"], 567 ["Franc\N{COMBINING CEDILLA}ais", "Franc\N{COMBINING CEDILLA}ais"], 568 ); 569 foreach my $entry (@f) { 570 my ($subject, $match) = @$entry; 571 ok($subject =~ /Fran(?:c\N{COMBINING CEDILLA}?| 572 \N{LATIN SMALL LETTER C WITH CEDILLA})ais/x && 573 $& eq $match, $message); 574 } 575 } 576 577 { 578 my $message = "Lingering (and useless) UTF8 flag doesn't mess up /i"; 579 my $pat = "ABcde"; 580 my $str = "abcDE\x{100}"; 581 chop $str; 582 like($str, qr/$pat/i, $message); 583 584 $pat = "ABcde\x{100}"; 585 $str = "abcDE"; 586 chop $pat; 587 like($str, qr/$pat/i, $message); 588 589 $pat = "ABcde\x{100}"; 590 $str = "abcDE\x{100}"; 591 chop $pat; 592 chop $str; 593 like($str, qr/$pat/i, $message); 594 } 595 596 { 597 use charnames ':full'; 598 my $message = "LATIN SMALL LETTER SHARP S " . 599 "(\N{LATIN SMALL LETTER SHARP S})"; 600 601 like("\N{LATIN SMALL LETTER SHARP S}", 602 qr/\N{LATIN SMALL LETTER SHARP S}/, $message); 603 like("\N{LATIN SMALL LETTER SHARP S}", 604 qr/\N{LATIN SMALL LETTER SHARP S}/i, $message); 605 like("\N{LATIN SMALL LETTER SHARP S}", 606 qr/[\N{LATIN SMALL LETTER SHARP S}]/, $message); 607 like("\N{LATIN SMALL LETTER SHARP S}", 608 qr/[\N{LATIN SMALL LETTER SHARP S}]/i, $message); 609 610 like("ss", qr /\N{LATIN SMALL LETTER SHARP S}/i, $message); 611 like("SS", qr /\N{LATIN SMALL LETTER SHARP S}/i, $message); 612 like("ss", qr/[\N{LATIN SMALL LETTER SHARP S}]/i, $message); 613 like("SS", qr/[\N{LATIN SMALL LETTER SHARP S}]/i, $message); 614 615 like("\N{LATIN SMALL LETTER SHARP S}", qr/ss/i, $message); 616 like("\N{LATIN SMALL LETTER SHARP S}", qr/SS/i, $message); 617 618 $message = "Unoptimized named sequence in class"; 619 like("ss", qr/[\N{LATIN SMALL LETTER SHARP S}x]/i, $message); 620 like("SS", qr/[\N{LATIN SMALL LETTER SHARP S}x]/i, $message); 621 like("\N{LATIN SMALL LETTER SHARP S}", 622 qr/[\N{LATIN SMALL LETTER SHARP S}x]/, $message); 623 like("\N{LATIN SMALL LETTER SHARP S}", 624 qr/[\N{LATIN SMALL LETTER SHARP S}x]/i, $message); 625 } 626 627 { 628 # More whitespace: U+0085, U+2028, U+2029\n"; 629 630 # U+0085, U+00A0 need to be forced to be Unicode, the \x{100} does that. 631 SKIP: { 632 skip "EBCDIC platform", 4 if $::IS_EBCDIC; 633 # Do \x{0015} and \x{0041} match \s in EBCDIC? 634 ok "<\x{100}\x{0085}>" =~ /<\x{100}\s>/, '\x{0085} in \s'; 635 ok "<\x{0085}>" =~ /<\v>/, '\x{0085} in \v'; 636 ok "<\x{100}\x{00A0}>" =~ /<\x{100}\s>/, '\x{00A0} in \s'; 637 ok "<\x{00A0}>" =~ /<\h>/, '\x{00A0} in \h'; 638 } 639 my @h = map {sprintf "%05x" => $_} 0x01680, 0x02000 .. 0x0200A, 640 0x0202F, 0x0205F, 0x03000; 641 my @v = map {sprintf "%05x" => $_} 0x02028, 0x02029; 642 643 my @H = map {sprintf "%05x" => $_} 0x01361, 0x0200B, 0x02408, 0x02420, 644 0x0303F, 0xE0020, 0x180E; 645 my @V = map {sprintf "%05x" => $_} 0x0008A .. 0x0008D, 0x00348, 0x10100, 646 0xE005F, 0xE007C, 0x180E; 647 648 for my $hex (@h) { 649 my $str = eval qq ["<\\x{$hex}>"]; 650 ok $str =~ /<\s>/, "\\x{$hex} in \\s"; 651 ok $str =~ /<\h>/, "\\x{$hex} in \\h"; 652 ok $str !~ /<\v>/, "\\x{$hex} not in \\v"; 653 } 654 655 for my $hex (@v) { 656 my $str = eval qq ["<\\x{$hex}>"]; 657 ok $str =~ /<\s>/, "\\x{$hex} in \\s"; 658 ok $str =~ /<\v>/, "\\x{$hex} in \\v"; 659 ok $str !~ /<\h>/, "\\x{$hex} not in \\h"; 660 } 661 662 for my $hex (@H) { 663 my $str = eval qq ["<\\x{$hex}>"]; 664 ok $str =~ /<\S>/, "\\x{$hex} in \\S"; 665 ok $str =~ /<\H>/, "\\x{$hex} in \\H"; 666 } 667 668 for my $hex (@V) { 669 my $str = eval qq ["<\\x{$hex}>"]; 670 ok $str =~ /<\S>/, "\\x{$hex} in \\S"; 671 ok $str =~ /<\V>/, "\\x{$hex} in \\V"; 672 } 673 } 674 675 { 676 # . with /s should work on characters, as opposed to bytes 677 my $message = ". with /s works on characters, not bytes"; 678 679 my $s = "\x{e4}\x{100}"; 680 # This is not expected to match: the point is that 681 # neither should we get "Malformed UTF-8" warnings. 682 warning_is(sub {$s =~ /\G(.+?)\n/gcs}, undef, 683 "No 'Malformed UTF-8' warning"); 684 685 my @c; 686 push @c => $1 while $s =~ /\G(.)/gs; 687 688 local $" = ""; 689 is("@c", $s, $message); 690 691 # Test only chars < 256 692 my $t1 = "Q003\n\n\x{e4}\x{f6}\n\nQ004\n\n\x{e7}"; 693 my $r1 = ""; 694 while ($t1 =~ / \G ( .+? ) \n\s+ ( .+? ) ( $ | \n\s+ ) /xgcs) { 695 $r1 .= $1 . $2; 696 } 697 698 my $t2 = $t1 . "\x{100}"; # Repeat with a larger char 699 my $r2 = ""; 700 while ($t2 =~ / \G ( .+? ) \n\s+ ( .+? ) ( $ | \n\s+ ) /xgcs) { 701 $r2 .= $1 . $2; 702 } 703 $r2 =~ s/\x{100}//; 704 705 is($r1, $r2, $message); 706 } 707 708 { 709 my $message = "Unicode lookbehind"; 710 like("A\x{100}B" , qr/(?<=A.)B/, $message); 711 like("A\x{200}\x{300}B", qr/(?<=A..)B/, $message); 712 like("\x{400}AB" , qr/(?<=\x{400}.)B/, $message); 713 like("\x{500}\x{600}B" , qr/(?<=\x{500}.)B/, $message); 714 715 # Original code also contained: 716 # ok "\x{500\x{600}}B" =~ /(?<=\x{500}.)B/; 717 # but that looks like a typo. 718 } 719 720 { 721 my $message = 'UTF-8 hash keys and /$/'; 722 # http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters 723 # /2002-01/msg01327.html 724 725 my $u = "a\x{100}"; 726 my $v = substr ($u, 0, 1); 727 my $w = substr ($u, 1, 1); 728 my %u = ($u => $u, $v => $v, $w => $w); 729 for (keys %u) { 730 my $m1 = /^\w*$/ ? 1 : 0; 731 my $m2 = $u {$_} =~ /^\w*$/ ? 1 : 0; 732 is($m1, $m2, $message); 733 } 734 } 735 736 { 737 my $message = "No SEGV in s/// and UTF-8"; 738 my $s = "s#\x{100}" x 4; 739 ok($s =~ s/[^\w]/ /g, $message); 740 if ( 1 or $ENV{PERL_TEST_LEGACY_POSIX_CC} ) { 741 is($s, "s \x{100}" x 4, $message); 742 } 743 else { 744 is($s, "s " x 4, $message); 745 } 746 } 747 748 { 749 my $message = "UTF-8 bug (maybe already known?)"; 750 my $u = "foo"; 751 $u =~ s/./\x{100}/g; 752 is($u, "\x{100}\x{100}\x{100}", $message); 753 754 $u = "foobar"; 755 $u =~ s/[ao]/\x{100}/g; 756 is($u, "f\x{100}\x{100}b\x{100}r", $message); 757 758 $u =~ s/\x{100}/e/g; 759 is($u, "feeber", $message); 760 } 761 762 { 763 my $message = "UTF-8 bug with s///"; 764 # check utf8/non-utf8 mixtures 765 # try to force all float/anchored check combinations 766 767 my $c = "\x{100}"; 768 my $subst; 769 for my $re ("xx.*$c", "x.*$c$c", "$c.*xx", "$c$c.*x", 770 "xx.*(?=$c)", "(?=$c).*xx",) { 771 unlike("xxx", qr/$re/, $message); 772 ok(+($subst = "xxx") !~ s/$re//, $message); 773 } 774 for my $re ("xx.*$c*", "$c*.*xx") { 775 like("xxx", qr/$re/, $message); 776 ok(+($subst = "xxx") =~ s/$re//, $message); 777 is($subst, "", $message); 778 } 779 for my $re ("xxy*", "y*xx") { 780 like("xx$c", qr/$re/, $message); 781 ok(+($subst = "xx$c") =~ s/$re//, $message); 782 is($subst, $c, $message); 783 unlike("xy$c", qr/$re/, $message); 784 ok(+($subst = "xy$c") !~ s/$re//, $message); 785 } 786 for my $re ("xy$c*z", "x$c*yz") { 787 like("xyz", qr/$re/, $message); 788 ok(+($subst = "xyz") =~ s/$re//, $message); 789 is($subst, "", $message); 790 } 791 } 792 793 { 794 # The second half of RT #114808 795 warning_is(sub {'aa' =~ /.+\x{100}/}, undef, 796 'utf8-only floating substr, non-utf8 target, no warning'); 797 } 798 799 { 800 my $message = "qr /.../x"; 801 my $R = qr / A B C # D E/x; 802 ok("ABCDE" =~ $R && $& eq "ABC", $message); 803 ok("ABCDE" =~ /$R/ && $& eq "ABC", $message); 804 ok("ABCDE" =~ m/$R/ && $& eq "ABC", $message); 805 ok("ABCDE" =~ /($R)/ && $1 eq "ABC", $message); 806 ok("ABCDE" =~ m/($R)/ && $1 eq "ABC", $message); 807 } 808 809 { 810 local $\; 811 $_ = 'aaaaaaaaaa'; 812 utf8::upgrade($_); chop $_; $\="\n"; 813 ok /[^\s]+/, 'm/[^\s]/ utf8'; 814 ok /[^\d]+/, 'm/[^\d]/ utf8'; 815 ok +($a = $_, $_ =~ s/[^\s]+/./g), 's/[^\s]/ utf8'; 816 ok +($a = $_, $a =~ s/[^\d]+/./g), 's/[^\s]/ utf8'; 817 } 818 819 { 820 # Subject: Odd regexp behavior 821 # From: Markus Kuhn <Markus.Kuhn@cl.cam.ac.uk> 822 # Date: Wed, 26 Feb 2003 16:53:12 +0000 823 # Message-Id: <E18o4nw-0008Ly-00@wisbech.cl.cam.ac.uk> 824 # To: perl-unicode@perl.org 825 826 my $message = 'Markus Kuhn 2003-02-26'; 827 828 my $x = "\x{2019}\nk"; 829 ok($x =~ s/(\S)\n(\S)/$1 $2/sg, $message); 830 is($x, "\x{2019} k", $message); 831 832 $x = "b\nk"; 833 ok($x =~ s/(\S)\n(\S)/$1 $2/sg, $message); 834 is($x, "b k", $message); 835 836 like("\x{2019}", qr/\S/, $message); 837 } 838 839 { 840 ok "\x{100}\n" =~ /\x{100}\n$/, "UTF-8 length cache and fbm_compile"; 841 } 842 843 { 844 package Str; 845 use overload q /""/ => sub {${$_ [0]};}; 846 sub new {my ($c, $v) = @_; bless \$v, $c;} 847 848 package main; 849 $_ = Str -> new ("a\x{100}/\x{100}b"); 850 ok join (":", /\b(.)\x{100}/g) eq "a:/", "re_intuit_start and PL_bostr"; 851 } 852 853 { 854 my $re = qq /^([^X]*)X/; 855 utf8::upgrade ($re); 856 ok "\x{100}X" =~ /$re/, "S_cl_and ANYOF_UNICODE & ANYOF_INVERTED"; 857 my $loc_re = qq /(?l:^([^X]*)X)/; 858 utf8::upgrade ($loc_re); 859 ok "\x{100}X" =~ /$loc_re/, "locale, S_cl_and ANYOF_UNICODE & ANYOF_INVERTED"; 860 } 861 862 { 863 ok "123\x{100}" =~ /^.*1.*23\x{100}$/, 864 'UTF-8 + multiple floating substr'; 865 } 866 867 { 868 my $message = '<20030808193656.5109.1@llama.ni-s.u-net.com>'; 869 870 # LATIN SMALL/CAPITAL LETTER A WITH MACRON 871 like(" \x{101}", qr/\x{100}/i, $message); 872 873 # LATIN SMALL/CAPITAL LETTER A WITH RING BELOW 874 like(" \x{1E01}", qr/\x{1E00}/i, $message); 875 876 # DESERET SMALL/CAPITAL LETTER LONG I 877 like(" \x{10428}", qr/\x{10400}/i, $message); 878 879 # LATIN SMALL/CAPITAL LETTER A WITH RING BELOW + 'X' 880 like(" \x{1E01}x", qr/\x{1E00}X/i, $message); 881 } 882 883 { 884 for (120 .. 130, 240 .. 260) { 885 my $head = 'x' x $_; 886 my $message = q [Don't misparse \x{...} in regexp ] . 887 q [near EXACT char count limit]; 888 for my $tail ('\x{0061}', '\x{1234}', '\x61') { 889 eval qq{like("$head$tail", qr/$head$tail/, \$message)}; 890 is($@, '', $message); 891 } 892 $message = q [Don't misparse \N{...} in regexp ] . 893 q [near EXACT char count limit]; 894 for my $tail ('\N{SNOWFLAKE}') { 895 eval qq {use charnames ':full'; 896 like("$head$tail", qr/$head$tail/, \$message)}; 897 is($@, '', $message); 898 } 899 } 900 } 901 902 { # TRIE related 903 our @got = (); 904 "words" =~ /(word|word|word)(?{push @got, $1})s$/; 905 is(@got, 1, "TRIE optimisation"); 906 907 @got = (); 908 "words" =~ /(word|word|word)(?{push @got,$1})s$/i; 909 is(@got, 1,"TRIEF optimisation"); 910 911 my @nums = map {int rand 1000} 1 .. 100; 912 my $re = "(" . (join "|", @nums) . ")"; 913 $re = qr/\b$re\b/; 914 915 foreach (@nums) { 916 ok $_ =~ /$re/, "Trie nums"; 917 } 918 919 $_ = join " ", @nums; 920 @got = (); 921 push @got, $1 while /$re/g; 922 923 my %count; 924 $count {$_} ++ for @got; 925 my $ok = 1; 926 for (@nums) { 927 $ok = 0 if --$count {$_} < 0; 928 } 929 ok $ok, "Trie min count matches"; 930 } 931 932 { 933 # TRIE related 934 # LATIN SMALL/CAPITAL LETTER A WITH MACRON 935 ok "foba \x{101}foo" =~ qr/(foo|\x{100}foo|bar)/i && 936 $1 eq "\x{101}foo", 937 "TRIEF + LATIN SMALL/CAPITAL LETTER A WITH MACRON"; 938 939 # LATIN SMALL/CAPITAL LETTER A WITH RING BELOW 940 ok "foba \x{1E01}foo" =~ qr/(foo|\x{1E00}foo|bar)/i && 941 $1 eq "\x{1E01}foo", 942 "TRIEF + LATIN SMALL/CAPITAL LETTER A WITH RING BELOW"; 943 944 # DESERET SMALL/CAPITAL LETTER LONG I 945 ok "foba \x{10428}foo" =~ qr/(foo|\x{10400}foo|bar)/i && 946 $1 eq "\x{10428}foo", 947 "TRIEF + DESERET SMALL/CAPITAL LETTER LONG I"; 948 949 # LATIN SMALL/CAPITAL LETTER A WITH RING BELOW + 'X' 950 ok "foba \x{1E01}xfoo" =~ qr/(foo|\x{1E00}Xfoo|bar)/i && 951 $1 eq "\x{1E01}xfoo", 952 "TRIEF + LATIN SMALL/CAPITAL LETTER A WITH RING BELOW + 'X'"; 953 954 use charnames ':full'; 955 956 my $s = "\N{LATIN SMALL LETTER SHARP S}"; 957 ok "foba ba$s" =~ qr/(foo|Ba$s|bar)/i && $1 eq "ba$s", 958 "TRIEF + LATIN SMALL LETTER SHARP S =~ ss"; 959 ok "foba ba$s" =~ qr/(Ba$s|foo|bar)/i && $1 eq "ba$s", 960 "TRIEF + LATIN SMALL LETTER SHARP S =~ ss"; 961 ok "foba ba$s" =~ qr/(foo|bar|Ba$s)/i && $1 eq "ba$s", 962 "TRIEF + LATIN SMALL LETTER SHARP S =~ ss"; 963 964 ok "foba ba$s" =~ qr/(foo|Bass|bar)/i && $1 eq "ba$s", 965 "TRIEF + LATIN SMALL LETTER SHARP S =~ ss"; 966 967 ok "foba ba$s" =~ qr/(foo|BaSS|bar)/i && $1 eq "ba$s", 968 "TRIEF + LATIN SMALL LETTER SHARP S =~ SS"; 969 970 ok "foba ba${s}pxySS$s$s" =~ qr/(b(?:a${s}t|a${s}f|a${s}p)[xy]+$s*)/i 971 && $1 eq "ba${s}pxySS$s$s", 972 "COMMON PREFIX TRIEF + LATIN SMALL LETTER SHARP S"; 973 } 974 975 { 976 BEGIN { 977 unshift @INC, 'lib'; 978 } 979 use Cname; 980 981 ok 'fooB' =~ /\N{foo}[\N{B}\N{b}]/, "Passthrough charname"; 982 my $name = "foo\xDF"; 983 my $result = eval "'A${name}B' =~ /^A\\N{$name}B\$/"; 984 ok !$@ && $result, "Passthrough charname of non-ASCII, Latin1"; 985 # 986 # Why doesn't must_warn work here? 987 # 988 my $w; 989 local $SIG {__WARN__} = sub {$w .= "@_"}; 990 eval 'q(xxWxx) =~ /[\N{WARN}]/'; 991 ok $w && $w =~ /Using just the first character returned by \\N\{} in character class/, 992 "single character in [\\N{}] warning"; 993 994 undef $w; 995 eval q [ok "\0" !~ /[\N{EMPTY-STR}XY]/, 996 "Zerolength charname in charclass doesn't match \\\\0"]; 997 ok $w && $w =~ /Ignoring zero length/, 998 'Ignoring zero length \N{} in character class warning'; 999 undef $w; 1000 eval q [ok 'xy' =~ /x[\N{EMPTY-STR} y]/x, 1001 'Empty string charname in [] is ignored; finds a following character']; 1002 ok $w && $w =~ /Ignoring zero length/, 1003 'Ignoring zero length \N{} in character class warning'; 1004 undef $w; 1005 eval q [ok 'x ' =~ /x[\N{EMPTY-STR} y]/, 1006 'Empty string charname in [] is ignored; finds a following blank under /x']; 1007 ok $w && $w =~ /Ignoring zero length/, 1008 'Ignoring zero length \N{} in character class warning'; 1009 1010 ok 'AB' =~ /(\N{EVIL})/ && $1 eq 'A', 'Charname caching $1'; 1011 ok 'ABC' =~ /(\N{EVIL})/, 'Charname caching $1'; 1012 ok 'xy' =~ /x\N{EMPTY-STR}y/, 1013 'Empty string charname produces NOTHING node'; 1014 ok '' =~ /\N{EMPTY-STR}/, 1015 'Empty string charname produces NOTHING node'; 1016 ok "\N{LONG-STR}" =~ /^\N{LONG-STR}$/, 'Verify that long string works'; 1017 ok "\N{LONG-STR}" =~ /^\N{LONG-STR}$/i, 'Verify under folding that long string works'; 1018 1019 eval '/(?[[\N{EMPTY-STR}]])/'; 1020 ok $@ && $@ =~ /Zero length \\N\{}/; 1021 1022 undef $w; 1023 eval q [is("\N{TOO MANY SPACES}", "TOO MANY SPACES", "Multiple spaces in character name works")]; 1024 like ($w, qr/A sequence of multiple spaces in a charnames alias definition is deprecated/, "... but returns a deprecation warning"); 1025 undef $w; 1026 eval q [use utf8; is("\N{TOO MANY SPACES}", "TOO MANY SPACES", "Same under 'use utf8': they work")]; 1027 like ($w, qr/A sequence of multiple spaces in a charnames alias definition is deprecated/, "... but return a deprecation warning"); 1028 { 1029 # disable lexical warnings 1030 BEGIN { ${^WARNING_BITS} = undef; $^W = 0 } 1031 undef $w; 1032 () = eval q ["\N{TOO MANY SPACES}"]; 1033 like ($w, qr/A sequence of multiple spaces in a charnames alias definition is deprecated/, "... and returns a deprecation warning outside of lexical warnings"); 1034 undef $w; 1035 eval q [use utf8; () = "\N{TOO MANY SPACES}"]; 1036 like ($w, qr/A sequence of multiple spaces in a charnames alias definition is deprecated/, "... same under utf8"); 1037 } 1038 { 1039 no warnings 'deprecated'; 1040 undef $w; 1041 eval q ["\N{TOO MANY SPACES}"]; 1042 ok (! defined $w, "... and no warning if warnings are off"); 1043 eval q [use utf8; "\N{TOO MANY SPACES}"]; 1044 ok (! defined $w, "... same under 'use utf8'"); 1045 } 1046 { 1047 use warnings FATAL=> 'deprecated'; 1048 () = eval q ["\N{TOO MANY SPACES}"]; 1049 like ($@, qr/A sequence of multiple spaces in a charnames alias definition is deprecated/, "... the deprecation warning can be fatal"); 1050 eval q [use utf8; () = "\N{TOO MANY SPACES}"]; 1051 like ($@, qr/A sequence of multiple spaces in a charnames alias definition is deprecated/, "... same under utf8"); 1052 } 1053 1054 undef $w; 1055 eval q [is("\N{TRAILING SPACE }", "TRAILING SPACE ", "Trailing space in character name works")]; 1056 like ($w, qr/Trailing white-space in a charnames alias definition is deprecated/, "... but returns a deprecation warning"); 1057 undef $w; 1058 eval q [use utf8; is("\N{TRAILING SPACE }", "TRAILING SPACE ", "Same under 'use utf8': they work")]; 1059 like ($w, qr/Trailing white-space in a charnames alias definition is deprecated/, "... but returns a deprecation warning"); 1060 { 1061 # disable lexical warnings 1062 BEGIN { ${^WARNING_BITS} = undef; $^W = 0 } 1063 undef $w; 1064 () = eval q ["\N{TRAILING SPACE }"]; 1065 like ($w, qr/Trailing white-space in a charnames alias definition is deprecated/, "... and returns a deprecation warning outside of lexical warnings"); 1066 undef $w; 1067 eval q [use utf8; () = "\N{TRAILING SPACE }"]; 1068 like ($w, qr/Trailing white-space in a charnames alias definition is deprecated/, "... same under utf8"); 1069 } 1070 { 1071 no warnings 'deprecated'; 1072 undef $w; 1073 eval q ["\N{TRAILING SPACE }"]; 1074 ok (! defined $w, "... and no warning if warnings are off"); 1075 eval q [use utf8; "\N{TRAILING SPACE }"]; 1076 ok (! defined $w, "... same under 'use utf8'"); 1077 } 1078 { 1079 use warnings FATAL=>'deprecated'; 1080 () = eval q ["\N{TRAILING SPACE }"]; 1081 like ($@, qr/Trailing white-space in a charnames alias definition is deprecated/, "... the warning can be fatal"); 1082 eval q [use utf8; () = "\N{TRAILING SPACE }"]; 1083 like ($@, qr/Trailing white-space in a charnames alias definition is deprecated/, "... same under utf8"); 1084 } 1085 1086 { 1087 BEGIN { no strict; *CnameTest:: = *{"_charnames\0A::" } } 1088 package CnameTest { sub translator { pop } } 1089 BEGIN { $^H{charnames} = \&CnameTest::translator } 1090 undef $w; 1091 () = eval q ["\N{TOO MANY SPACES}"]; 1092 like ($w, qr/A sequence of multiple spaces/, 1093 'translators in _charnames\0* packages get validated'); 1094 } 1095 1096 # If remove the limitation in regcomp code these should work 1097 # differently 1098 undef $w; 1099 eval q [ok "\N{TOO-LONG-STR}" =~ /^\N{TOO-LONG-STR}$/, 'Verify that what once was too long a string works']; 1100 eval 'q(syntax error) =~ /\N{MALFORMED}/'; 1101 ok $@ && $@ =~ /Malformed/, 'Verify that malformed utf8 gives an error'; 1102 eval 'q() =~ /\N{4F}/'; 1103 ok $@ && $@ =~ /Invalid character/, 'Verify that leading digit in name gives error'; 1104 eval 'q() =~ /\N{COM,MA}/'; 1105 ok $@ && $@ =~ /Invalid character/, 'Verify that comma in name gives error'; 1106 $name = "A\x{D7}O"; 1107 eval "q(W) =~ /\\N{$name}/"; 1108 ok $@ && $@ =~ /Invalid character/, 'Verify that latin1 symbol in name gives error'; 1109 my $utf8_name = "7 CITIES OF GOLD"; 1110 utf8::upgrade($utf8_name); 1111 eval "use utf8; q(W) =~ /\\N{$utf8_name}/"; 1112 ok $@ && $@ =~ /Invalid character/, 'Verify that leading digit in utf8 name gives error'; 1113 $utf8_name = "SHARP #"; 1114 utf8::upgrade($utf8_name); 1115 eval "use utf8; q(W) =~ /\\N{$utf8_name}/"; 1116 ok $@ && $@ =~ /Invalid character/, 'Verify that ASCII symbol in utf8 name gives error'; 1117 $utf8_name = "A HOUSE \xF7 AGAINST ITSELF"; 1118 utf8::upgrade($utf8_name); 1119 eval "use utf8; q(W) =~ /\\N{$utf8_name}/"; 1120 ok $@ && $@ =~ /Invalid character/, 'Verify that latin1 symbol in utf8 name gives error'; 1121 $utf8_name = "\x{664} HORSEMEN}"; 1122 eval "use utf8; q(W) =~ /\\N{$utf8_name}/"; 1123 ok $@ && $@ =~ /Invalid character/, 'Verify that leading above Latin1 digit in utf8 name gives error'; 1124 $utf8_name = "A \x{1F4A9} WOULD SMELL AS SWEET}"; 1125 eval "use utf8; q(W) =~ /\\N{$utf8_name}/"; 1126 ok $@ && $@ =~ /Invalid character/, 'Verify that above Latin1 symbol in utf8 name gives error'; 1127 1128 undef $w; 1129 $name = "A\x{D1}O"; 1130 eval "q(W) =~ /\\N{$name}/"; 1131 ok ! $w, 'Verify that latin1 letter in name doesnt give warning'; 1132 1133 # This tests the code path that restarts the parse when the recursive 1134 # call to S_reg() from within S_grok_bslash_N() discovers that the 1135 # pattern needs to be recalculated as UTF-8. use eval to avoid 1136 # needing literal Unicode in this source file: 1137 my $r = eval "qr/\\N{\x{100}\x{100}}/"; 1138 isnt $r, undef, "Generated regex for multi-char UTF-8 charname" 1139 or diag($@); 1140 ok "\x{100}\x{100}" =~ $r, "which matches"; 1141 } 1142 1143 { 1144 use charnames ':full'; 1145 1146 ok 'aabc' !~ /a\N{PLUS SIGN}b/, '/a\N{PLUS SIGN}b/ against aabc'; 1147 ok 'a+bc' =~ /a\N{PLUS SIGN}b/, '/a\N{PLUS SIGN}b/ against a+bc'; 1148 1149 ok ' A B' =~ /\N{SPACE}\N{U+0041}\N{SPACE}\N{U+0042}/, 1150 'Intermixed named and unicode escapes'; 1151 ok "\N{SPACE}\N{U+0041}\N{SPACE}\N{U+0042}" =~ 1152 /\N{SPACE}\N{U+0041}\N{SPACE}\N{U+0042}/, 1153 'Intermixed named and unicode escapes'; 1154 ok "\N{SPACE}\N{U+0041}\N{SPACE}\N{U+0042}" =~ 1155 /[\N{SPACE}\N{U+0041}][\N{SPACE}\N{U+0042}]/, 1156 'Intermixed named and unicode escapes'; 1157 ok "\0" =~ /^\N{NULL}$/, 'Verify that \N{NULL} works; is not confused with an error'; 1158 } 1159 1160 { 1161 our $brackets; 1162 $brackets = qr{ 1163 { (?> [^{}]+ | (??{ $brackets }) )* } 1164 }x; 1165 1166 ok "{b{c}d" !~ m/^((??{ $brackets }))/, "Bracket mismatch"; 1167 1168 SKIP: { 1169 our @stack = (); 1170 my @expect = qw( 1171 stuff1 1172 stuff2 1173 <stuff1>and<stuff2> 1174 right 1175 <right> 1176 <<right>> 1177 <<<right>>> 1178 <<stuff1>and<stuff2>><<<<right>>>> 1179 ); 1180 1181 local $_ = '<<<stuff1>and<stuff2>><<<<right>>>>>'; 1182 ok /^(<((?:(?>[^<>]+)|(?1))*)>(?{push @stack, $2 }))$/, 1183 "Recursion matches"; 1184 is(@stack, @expect, "Right amount of matches") 1185 or skip "Won't test individual results as count isn't equal", 1186 0 + @expect; 1187 my $idx = 0; 1188 foreach my $expect (@expect) { 1189 is($stack [$idx], $expect, 1190 "Expecting '$expect' at stack pos #$idx"); 1191 $idx ++; 1192 } 1193 } 1194 } 1195 1196 { 1197 my $s = '123453456'; 1198 $s =~ s/(?<digits>\d+)\k<digits>/$+{digits}/; 1199 ok $s eq '123456', 'Named capture (angle brackets) s///'; 1200 $s = '123453456'; 1201 $s =~ s/(?'digits'\d+)\k'digits'/$+{digits}/; 1202 ok $s eq '123456', 'Named capture (single quotes) s///'; 1203 } 1204 1205 { 1206 my @ary = ( 1207 pack('U', 0x00F1), # n-tilde 1208 '_'.pack('U', 0x00F1), # _ + n-tilde 1209 'c'.pack('U', 0x0327), # c + cedilla 1210 pack('U*', 0x00F1, 0x0327), # n-tilde + cedilla 1211 pack('U', 0x0391), # ALPHA 1212 pack('U', 0x0391).'2', # ALPHA + 2 1213 pack('U', 0x0391).'_', # ALPHA + _ 1214 ); 1215 1216 for my $uni (@ary) { 1217 my ($r1, $c1, $r2, $c2) = eval qq { 1218 use utf8; 1219 scalar ("..foo foo.." =~ /(?'${uni}'foo) \\k'${uni}'/), 1220 \$+{${uni}}, 1221 scalar ("..bar bar.." =~ /(?<${uni}>bar) \\k<${uni}>/), 1222 \$+{${uni}}; 1223 }; 1224 ok $r1, "Named capture UTF (?'')"; 1225 ok defined $c1 && $c1 eq 'foo', "Named capture UTF \%+"; 1226 ok $r2, "Named capture UTF (?<>)"; 1227 ok defined $c2 && $c2 eq 'bar', "Named capture UTF \%+"; 1228 } 1229 } 1230 1231 { 1232 my $s = 'foo bar baz'; 1233 my @res; 1234 if ('1234' =~ /(?<A>1)(?<B>2)(?<A>3)(?<B>4)/) { 1235 foreach my $name (sort keys(%-)) { 1236 my $ary = $- {$name}; 1237 foreach my $idx (0 .. $#$ary) { 1238 push @res, "$name:$idx:$ary->[$idx]"; 1239 } 1240 } 1241 } 1242 my @expect = qw (A:0:1 A:1:3 B:0:2 B:1:4); 1243 is("@res", "@expect", "Check %-"); 1244 eval' 1245 no warnings "uninitialized"; 1246 print for $- {this_key_doesnt_exist}; 1247 '; 1248 ok !$@,'lvalue $- {...} should not throw an exception'; 1249 } 1250 1251 { 1252 # \, breaks {3,4} 1253 ok "xaaay" !~ /xa{3\,4}y/, '\, in a pattern'; 1254 ok "xa{3,4}y" =~ /xa{3\,4}y/, '\, in a pattern'; 1255 1256 # \c\ followed by _ 1257 ok "x\c_y" !~ /x\c\_y/, '\_ in a pattern'; 1258 ok "x\c\_y" =~ /x\c\_y/, '\_ in a pattern'; 1259 1260 # \c\ followed by other characters 1261 for my $c ("z", "\0", "!", chr(254), chr(256)) { 1262 my $targ = "a\034$c"; 1263 my $reg = "a\\c\\$c"; 1264 ok eval ("qq/$targ/ =~ /$reg/"), "\\c\\ in pattern"; 1265 } 1266 } 1267 1268 { # Test the (*PRUNE) pattern 1269 our $count = 0; 1270 'aaab' =~ /a+b?(?{$count++})(*FAIL)/; 1271 is($count, 9, "Expect 9 for no (*PRUNE)"); 1272 $count = 0; 1273 'aaab' =~ /a+b?(*PRUNE)(?{$count++})(*FAIL)/; 1274 is($count, 3, "Expect 3 with (*PRUNE)"); 1275 local $_ = 'aaab'; 1276 $count = 0; 1277 1 while /.(*PRUNE)(?{$count++})(*FAIL)/g; 1278 is($count, 4, "/.(*PRUNE)/"); 1279 $count = 0; 1280 'aaab' =~ /a+b?(??{'(*PRUNE)'})(?{$count++})(*FAIL)/; 1281 is($count, 3, "Expect 3 with (*PRUNE)"); 1282 local $_ = 'aaab'; 1283 $count = 0; 1284 1 while /.(??{'(*PRUNE)'})(?{$count++})(*FAIL)/g; 1285 is($count, 4, "/.(*PRUNE)/"); 1286 } 1287 1288 { # Test the (*SKIP) pattern 1289 our $count = 0; 1290 'aaab' =~ /a+b?(*SKIP)(?{$count++})(*FAIL)/; 1291 is($count, 1, "Expect 1 with (*SKIP)"); 1292 local $_ = 'aaab'; 1293 $count = 0; 1294 1 while /.(*SKIP)(?{$count++})(*FAIL)/g; 1295 is($count, 4, "/.(*SKIP)/"); 1296 $_ = 'aaabaaab'; 1297 $count = 0; 1298 our @res = (); 1299 1 while /(a+b?)(*SKIP)(?{$count++; push @res,$1})(*FAIL)/g; 1300 is($count, 2, "Expect 2 with (*SKIP)"); 1301 is("@res", "aaab aaab", "Adjacent (*SKIP) works as expected"); 1302 } 1303 1304 { # Test the (*SKIP) pattern 1305 our $count = 0; 1306 'aaab' =~ /a+b?(*MARK:foo)(*SKIP)(?{$count++})(*FAIL)/; 1307 is($count, 1, "Expect 1 with (*SKIP)"); 1308 local $_ = 'aaab'; 1309 $count = 0; 1310 1 while /.(*MARK:foo)(*SKIP)(?{$count++})(*FAIL)/g; 1311 is($count, 4, "/.(*SKIP)/"); 1312 $_ = 'aaabaaab'; 1313 $count = 0; 1314 our @res = (); 1315 1 while /(a+b?)(*MARK:foo)(*SKIP)(?{$count++; push @res,$1})(*FAIL)/g; 1316 is($count, 2, "Expect 2 with (*SKIP)"); 1317 is("@res", "aaab aaab", "Adjacent (*SKIP) works as expected"); 1318 } 1319 1320 { # Test the (*SKIP) pattern 1321 our $count = 0; 1322 'aaab' =~ /a*(*MARK:a)b?(*MARK:b)(*SKIP:a)(?{$count++})(*FAIL)/; 1323 is($count, 3, "Expect 3 with *MARK:a)b?(*MARK:b)(*SKIP:a)"); 1324 local $_ = 'aaabaaab'; 1325 $count = 0; 1326 our @res = (); 1327 1 while 1328 /(a*(*MARK:a)b?)(*MARK:x)(*SKIP:a)(?{$count++; push @res,$1})(*FAIL)/g; 1329 is($count, 5, "Expect 5 with (*MARK:a)b?)(*MARK:x)(*SKIP:a)"); 1330 is("@res", "aaab b aaab b ", 1331 "Adjacent (*MARK:a)b?)(*MARK:x)(*SKIP:a) works as expected"); 1332 } 1333 1334 { # Test the (*COMMIT) pattern 1335 our $count = 0; 1336 'aaabaaab' =~ /a+b?(*COMMIT)(?{$count++})(*FAIL)/; 1337 is($count, 1, "Expect 1 with (*COMMIT)"); 1338 local $_ = 'aaab'; 1339 $count = 0; 1340 1 while /.(*COMMIT)(?{$count++})(*FAIL)/g; 1341 is($count, 1, "/.(*COMMIT)/"); 1342 $_ = 'aaabaaab'; 1343 $count = 0; 1344 our @res = (); 1345 1 while /(a+b?)(*COMMIT)(?{$count++; push @res,$1})(*FAIL)/g; 1346 is($count, 1, "Expect 1 with (*COMMIT)"); 1347 is("@res", "aaab", "Adjacent (*COMMIT) works as expected"); 1348 1349 ok("1\n2a\n" !~ /^\d+(*COMMIT)\w+/m, "COMMIT and anchors"); 1350 } 1351 1352 { 1353 # Test named commits and the $REGERROR var 1354 our $REGERROR; 1355 for my $name ('', ':foo') { 1356 for my $pat ("(*PRUNE$name)", 1357 ($name ? "(*MARK$name)" : "") . "(*SKIP$name)", 1358 "(*COMMIT$name)") { 1359 for my $suffix ('(*FAIL)', '') { 1360 'aaaab' =~ /a+b$pat$suffix/; 1361 is($REGERROR, 1362 ($suffix ? ($name ? 'foo' : "1") : ""), 1363 "Test $pat and \$REGERROR $suffix"); 1364 } 1365 } 1366 } 1367 } 1368 1369 { 1370 # Test named commits and the $REGERROR var 1371 package Fnorble; 1372 our $REGERROR; 1373 for my $name ('', ':foo') { 1374 for my $pat ("(*PRUNE$name)", 1375 ($name ? "(*MARK$name)" : "") . "(*SKIP$name)", 1376 "(*COMMIT$name)") { 1377 for my $suffix ('(*FAIL)','') { 1378 'aaaab' =~ /a+b$pat$suffix/; 1379 ::is($REGERROR, 1380 ($suffix ? ($name ? 'foo' : "1") : ""), 1381 "Test $pat and \$REGERROR $suffix"); 1382 } 1383 } 1384 } 1385 } 1386 1387 { 1388 # Test named commits and the $REGERROR var 1389 my $message = '$REGERROR'; 1390 our $REGERROR; 1391 for my $word (qw (bar baz bop)) { 1392 $REGERROR = ""; 1393 "aaaaa$word" =~ 1394 /a+(?:bar(*COMMIT:bar)|baz(*COMMIT:baz)|bop(*COMMIT:bop))(*FAIL)/; 1395 is($REGERROR, $word, $message); 1396 } 1397 } 1398 1399 { 1400 #Mindnumbingly simple test of (*THEN) 1401 for ("ABC","BAX") { 1402 ok /A (*THEN) X | B (*THEN) C/x, "Simple (*THEN) test"; 1403 } 1404 } 1405 1406 { 1407 my $message = "Relative Recursion"; 1408 my $parens = qr/(\((?:[^()]++|(?-1))*+\))/; 1409 local $_ = 'foo((2*3)+4-3) + bar(2*(3+4)-1*(2-3))'; 1410 my ($all, $one, $two) = ('', '', ''); 1411 ok(m/foo $parens \s* \+ \s* bar $parens/x, $message); 1412 is($1, '((2*3)+4-3)', $message); 1413 is($2, '(2*(3+4)-1*(2-3))', $message); 1414 is($&, 'foo((2*3)+4-3) + bar(2*(3+4)-1*(2-3))', $message); 1415 is($&, $_, $message); 1416 } 1417 1418 { 1419 my $spaces=" "; 1420 local $_ = join 'bar', $spaces, $spaces; 1421 our $count = 0; 1422 s/(?>\s+bar)(?{$count++})//g; 1423 is($_, $spaces, "SUSPEND final string"); 1424 is($count, 1, "Optimiser should have prevented more than one match"); 1425 } 1426 1427 { 1428 # From Message-ID: <877ixs6oa6.fsf@k75.linux.bogus> 1429 my $dow_name = "nada"; 1430 my $parser = "(\$dow_name) = \$time_string =~ /(D\x{e9}\\ " . 1431 "C\x{e9}adaoin|D\x{e9}\\ Sathairn|\\w+|\x{100})/"; 1432 my $time_string = "D\x{e9} C\x{e9}adaoin"; 1433 eval $parser; 1434 ok !$@, "Test Eval worked"; 1435 is($dow_name, $time_string, "UTF-8 trie common prefix extraction"); 1436 } 1437 1438 { 1439 my $v; 1440 ($v = 'bar') =~ /(\w+)/g; 1441 $v = 'foo'; 1442 is("$1", 'bar', 1443 '$1 is safe after /g - may fail due to specialized config in pp_hot.c'); 1444 } 1445 1446 { 1447 my $message = "http://nntp.perl.org/group/perl.perl5.porters/118663"; 1448 my $qr_barR1 = qr/(bar)\g-1/; 1449 like("foobarbarxyz", $qr_barR1, $message); 1450 like("foobarbarxyz", qr/foo${qr_barR1}xyz/, $message); 1451 like("foobarbarxyz", qr/(foo)${qr_barR1}xyz/, $message); 1452 like("foobarbarxyz", qr/(foo)(bar)\g{-1}xyz/, $message); 1453 like("foobarbarxyz", qr/(foo${qr_barR1})xyz/, $message); 1454 like("foobarbarxyz", qr/(foo(bar)\g{-1})xyz/, $message); 1455 } 1456 1457 { 1458 my $message = '$REGMARK'; 1459 our @r = (); 1460 our ($REGMARK, $REGERROR); 1461 like('foofoo', qr/foo (*MARK:foo) (?{push @r,$REGMARK}) /x, $message); 1462 is("@r","foo", $message); 1463 is($REGMARK, "foo", $message); 1464 unlike('foofoo', qr/foo (*MARK:foo) (*FAIL) /x, $message); 1465 is($REGMARK, '', $message); 1466 is($REGERROR, 'foo', $message); 1467 } 1468 1469 { 1470 my $message = '\K test'; 1471 my $x; 1472 $x = "abc.def.ghi.jkl"; 1473 $x =~ s/.*\K\..*//; 1474 is($x, "abc.def.ghi", $message); 1475 1476 $x = "one two three four"; 1477 $x =~ s/o+ \Kthree//g; 1478 is($x, "one two four", $message); 1479 1480 $x = "abcde"; 1481 $x =~ s/(.)\K/$1/g; 1482 is($x, "aabbccddee", $message); 1483 } 1484 1485 { 1486 sub kt { 1487 return '4' if $_[0] eq '09028623'; 1488 } 1489 # Nested EVAL using PL_curpm (via $1 or friends) 1490 my $re; 1491 our $grabit = qr/ ([0-6][0-9]{7}) (??{ kt $1 }) [890] /x; 1492 $re = qr/^ ( (??{ $grabit }) ) $ /x; 1493 my @res = '0902862349' =~ $re; 1494 is(join ("-", @res), "0902862349", 1495 'PL_curpm is set properly on nested eval'); 1496 1497 our $qr = qr/ (o) (??{ $1 }) /x; 1498 ok 'boob'=~/( b (??{ $qr }) b )/x && 1, "PL_curpm, nested eval"; 1499 } 1500 1501 { 1502 use charnames ":full"; 1503 ok "\N{ROMAN NUMERAL ONE}" =~ /\p{Alphabetic}/, "I =~ Alphabetic"; 1504 ok "\N{ROMAN NUMERAL ONE}" =~ /\p{Uppercase}/, "I =~ Uppercase"; 1505 ok "\N{ROMAN NUMERAL ONE}" !~ /\p{Lowercase}/, "I !~ Lowercase"; 1506 ok "\N{ROMAN NUMERAL ONE}" =~ /\p{IDStart}/, "I =~ ID_Start"; 1507 ok "\N{ROMAN NUMERAL ONE}" =~ /\p{IDContinue}/, "I =~ ID_Continue"; 1508 ok "\N{SMALL ROMAN NUMERAL ONE}" =~ /\p{Alphabetic}/, "i =~ Alphabetic"; 1509 ok "\N{SMALL ROMAN NUMERAL ONE}" !~ /\p{Uppercase}/, "i !~ Uppercase"; 1510 ok "\N{SMALL ROMAN NUMERAL ONE}" =~ /\p{Uppercase}/i, "i =~ Uppercase under /i"; 1511 ok "\N{SMALL ROMAN NUMERAL ONE}" !~ /\p{Titlecase}/, "i !~ Titlecase"; 1512 ok "\N{SMALL ROMAN NUMERAL ONE}" =~ /\p{Titlecase}/i, "i =~ Titlecase under /i"; 1513 ok "\N{ROMAN NUMERAL ONE}" =~ /\p{Lowercase}/i, "I =~ Lowercase under /i"; 1514 1515 ok "\N{SMALL ROMAN NUMERAL ONE}" =~ /\p{Lowercase}/, "i =~ Lowercase"; 1516 ok "\N{SMALL ROMAN NUMERAL ONE}" =~ /\p{IDStart}/, "i =~ ID_Start"; 1517 ok "\N{SMALL ROMAN NUMERAL ONE}" =~ /\p{IDContinue}/, "i =~ ID_Continue" 1518 } 1519 1520 { # More checking that /i works on the few properties that it makes a 1521 # difference. Uppercase, Lowercase, and Titlecase were done in the 1522 # block above 1523 ok "A" =~ /\p{PosixUpper}/, "A =~ PosixUpper"; 1524 ok "A" =~ /\p{PosixUpper}/i, "A =~ PosixUpper under /i"; 1525 ok "A" !~ /\p{PosixLower}/, "A !~ PosixLower"; 1526 ok "A" =~ /\p{PosixLower}/i, "A =~ PosixLower under /i"; 1527 ok "a" !~ /\p{PosixUpper}/, "a !~ PosixUpper"; 1528 ok "a" =~ /\p{PosixUpper}/i, "a =~ PosixUpper under /i"; 1529 ok "a" =~ /\p{PosixLower}/, "a =~ PosixLower"; 1530 ok "a" =~ /\p{PosixLower}/i, "a =~ PosixLower under /i"; 1531 1532 ok "\xC0" =~ /\p{XPosixUpper}/, "\\xC0 =~ XPosixUpper"; 1533 ok "\xC0" =~ /\p{XPosixUpper}/i, "\\xC0 =~ XPosixUpper under /i"; 1534 ok "\xC0" !~ /\p{XPosixLower}/, "\\xC0 !~ XPosixLower"; 1535 ok "\xC0" =~ /\p{XPosixLower}/i, "\\xC0 =~ XPosixLower under /i"; 1536 ok "\xE0" !~ /\p{XPosixUpper}/, "\\xE0 !~ XPosixUpper"; 1537 ok "\xE0" =~ /\p{XPosixUpper}/i, "\\xE0 =~ XPosixUpper under /i"; 1538 ok "\xE0" =~ /\p{XPosixLower}/, "\\xE0 =~ XPosixLower"; 1539 ok "\xE0" =~ /\p{XPosixLower}/i, "\\xE0 =~ XPosixLower under /i"; 1540 1541 ok "\xC0" =~ /\p{UppercaseLetter}/, "\\xC0 =~ UppercaseLetter"; 1542 ok "\xC0" =~ /\p{UppercaseLetter}/i, "\\xC0 =~ UppercaseLetter under /i"; 1543 ok "\xC0" !~ /\p{LowercaseLetter}/, "\\xC0 !~ LowercaseLetter"; 1544 ok "\xC0" =~ /\p{LowercaseLetter}/i, "\\xC0 =~ LowercaseLetter under /i"; 1545 ok "\xC0" !~ /\p{TitlecaseLetter}/, "\\xC0 !~ TitlecaseLetter"; 1546 ok "\xC0" =~ /\p{TitlecaseLetter}/i, "\\xC0 =~ TitlecaseLetter under /i"; 1547 ok "\xE0" !~ /\p{UppercaseLetter}/, "\\xE0 !~ UppercaseLetter"; 1548 ok "\xE0" =~ /\p{UppercaseLetter}/i, "\\xE0 =~ UppercaseLetter under /i"; 1549 ok "\xE0" =~ /\p{LowercaseLetter}/, "\\xE0 =~ LowercaseLetter"; 1550 ok "\xE0" =~ /\p{LowercaseLetter}/i, "\\xE0 =~ LowercaseLetter under /i"; 1551 ok "\xE0" !~ /\p{TitlecaseLetter}/, "\\xE0 !~ TitlecaseLetter"; 1552 ok "\xE0" =~ /\p{TitlecaseLetter}/i, "\\xE0 =~ TitlecaseLetter under /i"; 1553 ok "\x{1C5}" !~ /\p{UppercaseLetter}/, "\\x{1C5} !~ UppercaseLetter"; 1554 ok "\x{1C5}" =~ /\p{UppercaseLetter}/i, "\\x{1C5} =~ UppercaseLetter under /i"; 1555 ok "\x{1C5}" !~ /\p{LowercaseLetter}/, "\\x{1C5} !~ LowercaseLetter"; 1556 ok "\x{1C5}" =~ /\p{LowercaseLetter}/i, "\\x{1C5} =~ LowercaseLetter under /i"; 1557 ok "\x{1C5}" =~ /\p{TitlecaseLetter}/, "\\x{1C5} =~ TitlecaseLetter"; 1558 ok "\x{1C5}" =~ /\p{TitlecaseLetter}/i, "\\x{1C5} =~ TitlecaseLetter under /i"; 1559 } 1560 1561 { 1562 # requirement of Unicode Technical Standard #18, 1.7 Code Points 1563 # cf. http://www.unicode.org/reports/tr18/#Supplementary_Characters 1564 for my $u (0x7FF, 0x800, 0xFFFF, 0x10000) { 1565 no warnings 'utf8'; # oops 1566 my $c = chr $u; 1567 my $x = sprintf '%04X', $u; 1568 ok "A${c}B" =~ /A[\0-\x{10000}]B/, "Unicode range - $x"; 1569 } 1570 } 1571 1572 { 1573 my $res=""; 1574 1575 if ('1' =~ /(?|(?<digit>1)|(?<digit>2))/) { 1576 $res = "@{$- {digit}}"; 1577 } 1578 is($res, "1", 1579 "Check that (?|...) doesnt cause dupe entries in the names array"); 1580 1581 $res = ""; 1582 if ('11' =~ /(?|(?<digit>1)|(?<digit>2))(?&digit)/) { 1583 $res = "@{$- {digit}}"; 1584 } 1585 is($res, "1", 1586 "Check that (?&..) to a buffer inside a (?|...) goes to the leftmost"); 1587 } 1588 1589 { 1590 use warnings; 1591 my $message = "ASCII pattern that really is UTF-8"; 1592 my @w; 1593 local $SIG {__WARN__} = sub {push @w, "@_"}; 1594 my $c = qq (\x{DF}); 1595 like($c, qr/${c}|\x{100}/, $message); 1596 is("@w", '', $message); 1597 } 1598 1599 { 1600 my $message = "Corruption of match results of qr// across scopes"; 1601 my $qr = qr/(fo+)(ba+r)/; 1602 'foobar' =~ /$qr/; 1603 is("$1$2", "foobar", $message); 1604 { 1605 'foooooobaaaaar' =~ /$qr/; 1606 is("$1$2", 'foooooobaaaaar', $message); 1607 } 1608 is("$1$2", "foobar", $message); 1609 } 1610 1611 { 1612 my $message = "HORIZWS"; 1613 local $_ = "\t \r\n \n \t".chr(11)."\n"; 1614 s/\H/H/g; 1615 s/\h/h/g; 1616 is($_, "hhHHhHhhHH", $message); 1617 $_ = "\t \r\n \n \t" . chr (11) . "\n"; 1618 utf8::upgrade ($_); 1619 s/\H/H/g; 1620 s/\h/h/g; 1621 is($_, "hhHHhHhhHH", $message); 1622 } 1623 1624 { 1625 # Various whitespace special patterns 1626 my @h = map {chr $_} 0x09, 0x20, 0xa0, 0x1680, 0x2000, 1627 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 1628 0x2007, 0x2008, 0x2009, 0x200a, 0x202f, 0x205f, 1629 0x3000; 1630 my @v = map {chr $_} 0x0a, 0x0b, 0x0c, 0x0d, 0x85, 0x2028, 1631 0x2029; 1632 my @lb = ("\x0D\x0A", map {chr $_} 0x0A .. 0x0D, 0x85, 0x2028, 0x2029); 1633 foreach my $t ([\@h, qr/\h/, qr/\h+/], 1634 [\@v, qr/\v/, qr/\v+/], 1635 [\@lb, qr/\R/, qr/\R+/],) { 1636 my $ary = shift @$t; 1637 foreach my $pat (@$t) { 1638 foreach my $str (@$ary) { 1639 my $temp_str = $str; 1640 $temp_str = display($temp_str); 1641 ok $str =~ /($pat)/, $temp_str . " =~ /($pat)"; 1642 my $temp_1 = $1; 1643 is($1, $str, "\$1='" . display($temp_1) . "' eq '" . $temp_str . "' after ($pat)"); 1644 utf8::upgrade ($str); 1645 ok $str =~ /($pat)/, "Upgraded " . $temp_str . " =~ /($pat)/"; 1646 is($1, $str, "\$1='" . display($temp_1) . "' eq '" . $temp_str . "'(upgraded) after ($pat)"); 1647 } 1648 } 1649 } 1650 } 1651 1652 { 1653 # Check that \\xDF match properly in its various forms 1654 # Test that \xDF matches properly. this is pretty hacky stuff, 1655 # but its actually needed. The malarky with '-' is to prevent 1656 # compilation caching from playing any role in the test. 1657 my @df = (chr (0xDF), '-', chr (0xDF)); 1658 utf8::upgrade ($df [2]); 1659 my @strs = ('ss', 'sS', 'Ss', 'SS', chr (0xDF)); 1660 my @ss = map {("$_", "$_")} @strs; 1661 utf8::upgrade ($ss [$_ * 2 + 1]) for 0 .. $#strs; 1662 1663 for my $ssi (0 .. $#ss) { 1664 for my $dfi (0 .. $#df) { 1665 my $pat = $df [$dfi]; 1666 my $str = $ss [$ssi]; 1667 my $utf_df = ($dfi > 1) ? 'utf8' : ''; 1668 my $utf_ss = ($ssi % 2) ? 'utf8' : ''; 1669 (my $sstr = $str) =~ s/\xDF/\\xDF/; 1670 1671 if ($utf_df || $utf_ss || length ($ss [$ssi]) == 1) { 1672 my $ret = $str =~ /$pat/i; 1673 next if $pat eq '-'; 1674 ok $ret, "\"$sstr\" =~ /\\xDF/i " . 1675 "(str is @{[$utf_ss||'latin']}, pat is " . 1676 "@{[$utf_df||'latin']})"; 1677 } 1678 else { 1679 my $ret = $str !~ /$pat/i; 1680 next if $pat eq '-'; 1681 ok $ret, "\"$sstr\" !~ /\\xDF/i " . 1682 "(str is @{[$utf_ss||'latin']}, pat is " . 1683 "@{[$utf_df||'latin']})"; 1684 } 1685 } 1686 } 1687 } 1688 1689 { 1690 my $message = "BBC(Bleadperl Breaks CPAN) Today: String::Multibyte"; 1691 my $re = qr/(?:[\x00-\xFF]{4})/; 1692 my $hyp = "\0\0\0-"; 1693 my $esc = "\0\0\0\\"; 1694 1695 my $str = "$esc$hyp$hyp$esc$esc"; 1696 my @a = ($str =~ /\G(?:\Q$esc$esc\E|\Q$esc$hyp\E|$re)/g); 1697 1698 is(@a,3, $message); 1699 local $" = "="; 1700 is("@a","$esc$hyp=$hyp=$esc$esc", $message); 1701 } 1702 1703 { 1704 # Test for keys in %+ and %- 1705 my $message = 'Test keys in %+ and %-'; 1706 no warnings 'uninitialized', 'deprecated', 'experimental::lexical_topic'; 1707 my $_ = "abcdef"; 1708 /(?<foo>a)|(?<foo>b)/; 1709 is((join ",", sort keys %+), "foo", $message); 1710 is((join ",", sort keys %-), "foo", $message); 1711 is((join ",", sort values %+), "a", $message); 1712 is((join ",", sort map "@$_", values %-), "a ", $message); 1713 /(?<bar>a)(?<bar>b)(?<quux>.)/; 1714 is((join ",", sort keys %+), "bar,quux", $message); 1715 is((join ",", sort keys %-), "bar,quux", $message); 1716 is((join ",", sort values %+), "a,c", $message); # leftmost 1717 is((join ",", sort map "@$_", values %-), "a b,c", $message); 1718 /(?<un>a)(?<deux>c)?/; # second buffer won't capture 1719 is((join ",", sort keys %+), "un", $message); 1720 is((join ",", sort keys %-), "deux,un", $message); 1721 is((join ",", sort values %+), "a", $message); 1722 is((join ",", sort map "@$_", values %-), ",a", $message); 1723 } 1724 1725 { 1726 # length() on captures, the numbered ones end up in Perl_magic_len 1727 no warnings 'deprecated', 'experimental::lexical_topic'; 1728 my $_ = "aoeu \xe6var ook"; 1729 /^ \w+ \s (?<eek>\S+)/x; 1730 1731 is(length $`, 0, q[length $`]); 1732 is(length $', 4, q[length $']); 1733 is(length $&, 9, q[length $&]); 1734 is(length $1, 4, q[length $1]); 1735 is(length $+{eek}, 4, q[length $+{eek} == length $1]); 1736 } 1737 1738 { 1739 my $ok = -1; 1740 1741 $ok = exists ($-{x}) ? 1 : 0 if 'bar' =~ /(?<x>foo)|bar/; 1742 is($ok, 1, '$-{x} exists after "bar"=~/(?<x>foo)|bar/'); 1743 is(scalar (%+), 0, 'scalar %+ == 0 after "bar"=~/(?<x>foo)|bar/'); 1744 is(scalar (%-), 1, 'scalar %- == 1 after "bar"=~/(?<x>foo)|bar/'); 1745 1746 $ok = -1; 1747 $ok = exists ($+{x}) ? 1 : 0 if 'bar' =~ /(?<x>foo)|bar/; 1748 is($ok, 0, '$+{x} not exists after "bar"=~/(?<x>foo)|bar/'); 1749 is(scalar (%+), 0, 'scalar %+ == 0 after "bar"=~/(?<x>foo)|bar/'); 1750 is(scalar (%-), 1, 'scalar %- == 1 after "bar"=~/(?<x>foo)|bar/'); 1751 1752 $ok = -1; 1753 $ok = exists ($-{x}) ? 1 : 0 if 'foo' =~ /(?<x>foo)|bar/; 1754 is($ok, 1, '$-{x} exists after "foo"=~/(?<x>foo)|bar/'); 1755 is(scalar (%+), 1, 'scalar %+ == 1 after "foo"=~/(?<x>foo)|bar/'); 1756 is(scalar (%-), 1, 'scalar %- == 1 after "foo"=~/(?<x>foo)|bar/'); 1757 1758 $ok = -1; 1759 $ok = exists ($+{x}) ? 1 : 0 if 'foo'=~/(?<x>foo)|bar/; 1760 is($ok, 1, '$+{x} exists after "foo"=~/(?<x>foo)|bar/'); 1761 } 1762 1763 { 1764 local $_; 1765 ($_ = 'abc') =~ /(abc)/g; 1766 $_ = '123'; 1767 is("$1", 'abc', "/g leads to unsafe match vars: $1"); 1768 1769 fresh_perl_is(<<'EOP', ">abc<\n", {}, 'mention $&'); 1770$&; 1771my $x; 1772($x='abc')=~/(abc)/g; 1773$x='123'; 1774print ">$1<\n"; 1775EOP 1776 1777 fresh_perl_is(<<'EOP', ">abc<\n", {}, 'no mention of $&'); 1778my $x; 1779($x='abc')=~/(abc)/g; 1780$x='123'; 1781print ">$1<\n"; 1782EOP 1783 } 1784 1785 { 1786 # Message-ID: <20070818091501.7eff4831@r2d2> 1787 my $str = ""; 1788 for (0 .. 5) { 1789 my @x; 1790 $str .= "@x"; # this should ALWAYS be the empty string 1791 'a' =~ /(a|)/; 1792 push @x, 1; 1793 } 1794 is(length $str, 0, "Trie scope error, string should be empty"); 1795 $str = ""; 1796 my @foo = ('a') x 5; 1797 for (@foo) { 1798 my @bar; 1799 $str .= "@bar"; 1800 s/a|/push @bar, 1/e; 1801 } 1802 is(length $str, 0, "Trie scope error, string should be empty"); 1803 } 1804 1805 { 1806# more TRIE/AHOCORASICK problems with mixed utf8 / latin-1 and case folding 1807 for my $chr (160 .. 255) { 1808 my $chr_byte = chr($chr); 1809 my $chr_utf8 = chr($chr); utf8::upgrade($chr_utf8); 1810 my $rx = qr{$chr_byte|X}i; 1811 ok($chr_utf8 =~ $rx, "utf8/latin, codepoint $chr"); 1812 } 1813 } 1814 1815 { 1816 our $a = 3; "" =~ /(??{ $a })/; 1817 our $b = $a; 1818 is($b, $a, "Copy of scalar used for postponed subexpression"); 1819 } 1820 1821 { 1822 our @ctl_n = (); 1823 our @plus = (); 1824 our $nested_tags; 1825 $nested_tags = qr{ 1826 < 1827 (\w+) 1828 (?{ 1829 push @ctl_n,$^N; 1830 push @plus,$+; 1831 }) 1832 > 1833 (??{$nested_tags})* 1834 </\s* \w+ \s*> 1835 }x; 1836 1837 my $match = '<bla><blubb></blubb></bla>' =~ m/^$nested_tags$/; 1838 ok $match, 'nested construct matches'; 1839 is("@ctl_n", "bla blubb", '$^N inside of (?{}) works as expected'); 1840 is("@plus", "bla blubb", '$+ inside of (?{}) works as expected'); 1841 } 1842 1843 SKIP: { 1844 # XXX: This set of tests is essentially broken, POSIX character classes 1845 # should not have differing definitions under Unicode. 1846 # There are property names for that. 1847 skip "Tests assume ASCII", 4 unless $::IS_ASCII; 1848 1849 my @notIsPunct = grep {/[[:punct:]]/ and not /\p{IsPunct}/} 1850 map {chr} 0x20 .. 0x7f; 1851 is(join ('', @notIsPunct), '$+<=>^`|~', 1852 '[:punct:] disagrees with IsPunct on Symbols'); 1853 1854 my @isPrint = grep {not /[[:print:]]/ and /\p{IsPrint}/} 1855 map {chr} 0 .. 0x1f, 0x7f .. 0x9f; 1856 is(join ('', @isPrint), "", 1857 'IsPrint agrees with [:print:] on control characters'); 1858 1859 my @isPunct = grep {/[[:punct:]]/ != /\p{IsPunct}/} 1860 map {chr} 0x80 .. 0xff; 1861 is(join ('', @isPunct), "\xa1\xa7\xab\xb6\xb7\xbb\xbf", # ¡ « · » ¿ 1862 'IsPunct disagrees with [:punct:] outside ASCII'); 1863 1864 my @isPunctLatin1 = eval q { 1865 no warnings 'deprecated'; 1866 use encoding 'latin1'; 1867 grep {/[[:punct:]]/ != /\p{IsPunct}/} map {chr} 0x80 .. 0xff; 1868 }; 1869 skip "Eval failed ($@)", 1 if $@; 1870 skip "PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS set to 0", 1 1871 if !$ENV{PERL_TEST_LEGACY_POSIX_CC}; 1872 is(join ('', @isPunctLatin1), '', 1873 'IsPunct agrees with [:punct:] with explicit Latin1'); 1874 } 1875 1876 { 1877 # Tests for [#perl 71942] 1878 our $count_a; 1879 our $count_b; 1880 1881 my $c = 0; 1882 for my $re ( 1883# [ 1884# should match?, 1885# input string, 1886# re 1, 1887# re 2, 1888# expected values of count_a and count_b, 1889# ] 1890 [ 1891 0, 1892 "xababz", 1893 qr/a+(?{$count_a++})b?(*COMMIT)(*FAIL)/, 1894 qr/a+(?{$count_b++})b?(*COMMIT)z/, 1895 1, 1896 ], 1897 [ 1898 0, 1899 "xababz", 1900 qr/a+(?{$count_a++})b?(*COMMIT)\s*(*FAIL)/, 1901 qr/a+(?{$count_b++})b?(*COMMIT)\s*z/, 1902 1, 1903 ], 1904 [ 1905 0, 1906 "xababz", 1907 qr/a+(?{$count_a++})(?:b|)?(*COMMIT)(*FAIL)/, 1908 qr/a+(?{$count_b++})(?:b|)?(*COMMIT)z/, 1909 1, 1910 ], 1911 [ 1912 0, 1913 "xababz", 1914 qr/a+(?{$count_a++})b{0,6}(*COMMIT)(*FAIL)/, 1915 qr/a+(?{$count_b++})b{0,6}(*COMMIT)z/, 1916 1, 1917 ], 1918 [ 1919 0, 1920 "xabcabcz", 1921 qr/a+(?{$count_a++})(bc){0,6}(*COMMIT)(*FAIL)/, 1922 qr/a+(?{$count_b++})(bc){0,6}(*COMMIT)z/, 1923 1, 1924 ], 1925 [ 1926 0, 1927 "xabcabcz", 1928 qr/a+(?{$count_a++})(bc*){0,6}(*COMMIT)(*FAIL)/, 1929 qr/a+(?{$count_b++})(bc*){0,6}(*COMMIT)z/, 1930 1, 1931 ], 1932 1933 1934 [ 1935 0, 1936 "aaaabtz", 1937 qr/a+(?{$count_a++})b?(*PRUNE)(*FAIL)/, 1938 qr/a+(?{$count_b++})b?(*PRUNE)z/, 1939 4, 1940 ], 1941 [ 1942 0, 1943 "aaaabtz", 1944 qr/a+(?{$count_a++})b?(*PRUNE)\s*(*FAIL)/, 1945 qr/a+(?{$count_b++})b?(*PRUNE)\s*z/, 1946 4, 1947 ], 1948 [ 1949 0, 1950 "aaaabtz", 1951 qr/a+(?{$count_a++})(?:b|)(*PRUNE)(*FAIL)/, 1952 qr/a+(?{$count_b++})(?:b|)(*PRUNE)z/, 1953 4, 1954 ], 1955 [ 1956 0, 1957 "aaaabtz", 1958 qr/a+(?{$count_a++})b{0,6}(*PRUNE)(*FAIL)/, 1959 qr/a+(?{$count_b++})b{0,6}(*PRUNE)z/, 1960 4, 1961 ], 1962 [ 1963 0, 1964 "aaaabctz", 1965 qr/a+(?{$count_a++})(bc){0,6}(*PRUNE)(*FAIL)/, 1966 qr/a+(?{$count_b++})(bc){0,6}(*PRUNE)z/, 1967 4, 1968 ], 1969 [ 1970 0, 1971 "aaaabctz", 1972 qr/a+(?{$count_a++})(bc*){0,6}(*PRUNE)(*FAIL)/, 1973 qr/a+(?{$count_b++})(bc*){0,6}(*PRUNE)z/, 1974 4, 1975 ], 1976 1977 [ 1978 0, 1979 "aaabaaab", 1980 qr/a+(?{$count_a++;})b?(*SKIP)(*FAIL)/, 1981 qr/a+(?{$count_b++;})b?(*SKIP)z/, 1982 2, 1983 ], 1984 [ 1985 0, 1986 "aaabaaab", 1987 qr/a+(?{$count_a++;})b?(*SKIP)\s*(*FAIL)/, 1988 qr/a+(?{$count_b++;})b?(*SKIP)\s*z/, 1989 2, 1990 ], 1991 [ 1992 0, 1993 "aaabaaab", 1994 qr/a+(?{$count_a++;})(?:b|)(*SKIP)(*FAIL)/, 1995 qr/a+(?{$count_b++;})(?:b|)(*SKIP)z/, 1996 2, 1997 ], 1998 [ 1999 0, 2000 "aaabaaab", 2001 qr/a+(?{$count_a++;})b{0,6}(*SKIP)(*FAIL)/, 2002 qr/a+(?{$count_b++;})b{0,6}(*SKIP)z/, 2003 2, 2004 ], 2005 [ 2006 0, 2007 "aaabcaaabc", 2008 qr/a+(?{$count_a++;})(bc){0,6}(*SKIP)(*FAIL)/, 2009 qr/a+(?{$count_b++;})(bc){0,6}(*SKIP)z/, 2010 2, 2011 ], 2012 [ 2013 0, 2014 "aaabcaaabc", 2015 qr/a+(?{$count_a++;})(bc*){0,6}(*SKIP)(*FAIL)/, 2016 qr/a+(?{$count_b++;})(bc*){0,6}(*SKIP)z/, 2017 2, 2018 ], 2019 2020 2021 [ 2022 0, 2023 "aaddbdaabyzc", 2024 qr/a (?{$count_a++;}) (*MARK:T1) (a*) .*? b? (*SKIP:T1) (*FAIL) \s* c \1 /x, 2025 qr/a (?{$count_b++;}) (*MARK:T1) (a*) .*? b? (*SKIP:T1) z \s* c \1 /x, 2026 4, 2027 ], 2028 [ 2029 0, 2030 "aaddbdaabyzc", 2031 qr/a (?{$count_a++;}) (*MARK:T1) (a*) .*? b? (*SKIP:T1) \s* (*FAIL) \s* c \1 /x, 2032 qr/a (?{$count_b++;}) (*MARK:T1) (a*) .*? b? (*SKIP:T1) \s* z \s* c \1 /x, 2033 4, 2034 ], 2035 [ 2036 0, 2037 "aaddbdaabyzc", 2038 qr/a (?{$count_a++;}) (*MARK:T1) (a*) .*? (?:b|) (*SKIP:T1) (*FAIL) \s* c \1 /x, 2039 qr/a (?{$count_b++;}) (*MARK:T1) (a*) .*? (?:b|) (*SKIP:T1) z \s* c \1 /x, 2040 4, 2041 ], 2042 [ 2043 0, 2044 "aaddbdaabyzc", 2045 qr/a (?{$count_a++;}) (*MARK:T1) (a*) .*? b{0,6} (*SKIP:T1) (*FAIL) \s* c \1 /x, 2046 qr/a (?{$count_b++;}) (*MARK:T1) (a*) .*? b{0,6} (*SKIP:T1) z \s* c \1 /x, 2047 4, 2048 ], 2049 [ 2050 0, 2051 "aaddbcdaabcyzc", 2052 qr/a (?{$count_a++;}) (*MARK:T1) (a*) .*? (bc){0,6} (*SKIP:T1) (*FAIL) \s* c \1 /x, 2053 qr/a (?{$count_b++;}) (*MARK:T1) (a*) .*? (bc){0,6} (*SKIP:T1) z \s* c \1 /x, 2054 4, 2055 ], 2056 [ 2057 0, 2058 "aaddbcdaabcyzc", 2059 qr/a (?{$count_a++;}) (*MARK:T1) (a*) .*? (bc*){0,6} (*SKIP:T1) (*FAIL) \s* c \1 /x, 2060 qr/a (?{$count_b++;}) (*MARK:T1) (a*) .*? (bc*){0,6} (*SKIP:T1) z \s* c \1 /x, 2061 4, 2062 ], 2063 2064 2065 [ 2066 0, 2067 "aaaaddbdaabyzc", 2068 qr/a (?{$count_a++;}) (a?) (*MARK:T1) (a*) .*? b? (*MARK:T1) (*SKIP:T1) (*FAIL) \s* c \1 /x, 2069 qr/a (?{$count_b++;}) (a?) (*MARK:T1) (a*) .*? b? (*MARK:T1) (*SKIP:T1) z \s* c \1 /x, 2070 2, 2071 ], 2072 [ 2073 0, 2074 "aaaaddbdaabyzc", 2075 qr/a (?{$count_a++;}) (a?) (*MARK:T1) (a*) .*? b? (*MARK:T1) (*SKIP:T1) \s* (*FAIL) \s* c \1 /x, 2076 qr/a (?{$count_b++;}) (a?) (*MARK:T1) (a*) .*? b? (*MARK:T1) (*SKIP:T1) \s* z \s* c \1 /x, 2077 2, 2078 ], 2079 [ 2080 0, 2081 "aaaaddbdaabyzc", 2082 qr/a (?{$count_a++;}) (a?) (*MARK:T1) (a*) .*? (?:b|) (*MARK:T1) (*SKIP:T1) (*FAIL) \s* c \1 /x, 2083 qr/a (?{$count_b++;}) (a?) (*MARK:T1) (a*) .*? (?:b|) (*MARK:T1) (*SKIP:T1) z \s* c \1 /x, 2084 2, 2085 ], 2086 [ 2087 0, 2088 "aaaaddbdaabyzc", 2089 qr/a (?{$count_a++;}) (a?) (*MARK:T1) (a*) .*? b{0,6} (*MARK:T1) (*SKIP:T1) (*FAIL) \s* c \1 /x, 2090 qr/a (?{$count_b++;}) (a?) (*MARK:T1) (a*) .*? b{0,6} (*MARK:T1) (*SKIP:T1) z \s* c \1 /x, 2091 2, 2092 ], 2093 [ 2094 0, 2095 "aaaaddbcdaabcyzc", 2096 qr/a (?{$count_a++;}) (a?) (*MARK:T1) (a*) .*? (bc){0,6} (*MARK:T1) (*SKIP:T1) (*FAIL) \s* c \1 /x, 2097 qr/a (?{$count_b++;}) (a?) (*MARK:T1) (a*) .*? (bc){0,6} (*MARK:T1) (*SKIP:T1) z \s* c \1 /x, 2098 2, 2099 ], 2100 [ 2101 0, 2102 "aaaaddbcdaabcyzc", 2103 qr/a (?{$count_a++;}) (a?) (*MARK:T1) (a*) .*? (bc*){0,6} (*MARK:T1) (*SKIP:T1) (*FAIL) \s* c \1 /x, 2104 qr/a (?{$count_b++;}) (a?) (*MARK:T1) (a*) .*? (bc*){0,6} (*MARK:T1) (*SKIP:T1) z \s* c \1 /x, 2105 2, 2106 ], 2107 2108 2109 [ 2110 0, 2111 "AbcdCBefgBhiBqz", 2112 qr/(A (.*) (?{ $count_a++ }) C? (*THEN) | A D) (*FAIL)/x, 2113 qr/(A (.*) (?{ $count_b++ }) C? (*THEN) | A D) z/x, 2114 1, 2115 ], 2116 [ 2117 0, 2118 "AbcdCBefgBhiBqz", 2119 qr/(A (.*) (?{ $count_a++ }) C? (*THEN) | A D) \s* (*FAIL)/x, 2120 qr/(A (.*) (?{ $count_b++ }) C? (*THEN) | A D) \s* z/x, 2121 1, 2122 ], 2123 [ 2124 0, 2125 "AbcdCBefgBhiBqz", 2126 qr/(A (.*) (?{ $count_a++ }) (?:C|) (*THEN) | A D) (*FAIL)/x, 2127 qr/(A (.*) (?{ $count_b++ }) (?:C|) (*THEN) | A D) z/x, 2128 1, 2129 ], 2130 [ 2131 0, 2132 "AbcdCBefgBhiBqz", 2133 qr/(A (.*) (?{ $count_a++ }) C{0,6} (*THEN) | A D) (*FAIL)/x, 2134 qr/(A (.*) (?{ $count_b++ }) C{0,6} (*THEN) | A D) z/x, 2135 1, 2136 ], 2137 [ 2138 0, 2139 "AbcdCEBefgBhiBqz", 2140 qr/(A (.*) (?{ $count_a++ }) (CE){0,6} (*THEN) | A D) (*FAIL)/x, 2141 qr/(A (.*) (?{ $count_b++ }) (CE){0,6} (*THEN) | A D) z/x, 2142 1, 2143 ], 2144 [ 2145 0, 2146 "AbcdCBefgBhiBqz", 2147 qr/(A (.*) (?{ $count_a++ }) (CE*){0,6} (*THEN) | A D) (*FAIL)/x, 2148 qr/(A (.*) (?{ $count_b++ }) (CE*){0,6} (*THEN) | A D) z/x, 2149 1, 2150 ], 2151 ) { 2152 $c++; 2153 $count_a = 0; 2154 $count_b = 0; 2155 2156 my $match_a = ($re->[1] =~ $re->[2]) || 0; 2157 my $match_b = ($re->[1] =~ $re->[3]) || 0; 2158 2159 is($match_a, $re->[0], "match a " . ($re->[0] ? "succeeded" : "failed") . " ($c)"); 2160 is($match_b, $re->[0], "match b " . ($re->[0] ? "succeeded" : "failed") . " ($c)"); 2161 is($count_a, $re->[4], "count a ($c)"); 2162 is($count_b, $re->[4], "count b ($c)"); 2163 } 2164 } 2165 2166 { # Bleadperl v5.13.8-292-gf56b639 breaks NEZUMI/Unicode-LineBreak-1.011 2167 # \xdf in lookbehind failed to compile as is multi-char fold 2168 my $message = "Lookbehind with \\xdf matchable compiles"; 2169 my $r = eval 'qr{ 2170 (?u: (?<=^url:) | 2171 (?<=[/]) (?=[^/]) | 2172 (?<=[^-.]) (?=[-~.,_?\#%=&]) | 2173 (?<=[=&]) (?=.) 2174 )}iox'; 2175 is($@, '', $message); 2176 object_ok($r, 'Regexp', $message); 2177 } 2178 2179 # RT #82610 2180 ok 'foo/file.fob' =~ m,^(?=[^\.])[^/]*/(?=[^\.])[^/]*\.fo[^/]$,; 2181 2182 { # This was failing unless an explicit /d was added 2183 my $p = qr/[\xE0_]/i; 2184 utf8::upgrade($p); 2185 like("\xC0", $p, "Verify \"\\xC0\" =~ /[\\xE0_]/i; pattern in utf8"); 2186 } 2187 2188 ok "x" =~ /\A(?>(?:(?:)A|B|C?x))\z/, 2189 "Check TRIE does not overwrite EXACT following NOTHING at start - RT #111842"; 2190 2191 { 2192 my $single = ":"; 2193 my $upper = "\x{390}"; # Fold is 3 chars. 2194 my $multi = CORE::fc($upper); 2195 2196 my $failed = 0; 2197 2198 # Try forcing a node to be split, with a multi-char fold at the 2199 # boundary 2200 for my $repeat (1 .. 300) { 2201 my $string = $single x $repeat; 2202 my $lhs = $string . $upper; 2203 if ($lhs !~ m/$string$multi/i) { 2204 $failed = $repeat; 2205 last; 2206 } 2207 } 2208 ok(! $failed, "Matched multi-char fold across EXACTFish node boundaries; if failed, was at count $failed"); 2209 2210 $failed = 0; 2211 for my $repeat (1 .. 300) { 2212 my $string = $single x $repeat; 2213 my $lhs = $string . "\N{LATIN SMALL LIGATURE FFI}"; 2214 if ($lhs !~ m/${string}ff\N{LATIN SMALL LETTER I}/i) { 2215 $failed = $repeat; 2216 last; 2217 } 2218 } 2219 ok(! $failed, "Matched multi-char fold across EXACTFish node boundaries; if failed, was at count $failed"); 2220 2221 $failed = 0; 2222 for my $repeat (1 .. 300) { 2223 my $string = $single x $repeat; 2224 my $lhs = $string . "\N{LATIN SMALL LIGATURE FFL}"; 2225 if ($lhs !~ m/${string}ff\N{U+6c}/i) { 2226 $failed = $repeat; 2227 last; 2228 } 2229 } 2230 ok(! $failed, "Matched multi-char fold across EXACTFish node boundaries; if failed, was at count $failed"); 2231 2232 # This tests that under /d matching that an 'ss' split across two 2233 # parts of a node doesn't end up turning into something that matches 2234 # \xDF unless it is in utf8. 2235 $failed = 0; 2236 $single = 'a'; # Is non-terminal multi-char fold char 2237 for my $repeat (1 .. 300) { 2238 my $string = $single x $repeat; 2239 my $lhs = "$string\N{LATIN SMALL LETTER SHARP S}"; 2240 utf8::downgrade($lhs); 2241 $string .= "s"; 2242 if ($lhs =~ m/${string}s/di) { 2243 $failed = $repeat; 2244 last; 2245 } 2246 } 2247 ok(! $failed, "Matched multi-char fold 'ss' across EXACTF node boundaries; if failed, was at count $failed"); 2248 } 2249 2250 { 2251 fresh_perl_is('print eval "\"\x{101}\" =~ /[[:lower:]]/", "\n"; print eval "\"\x{100}\" =~ /[[:lower:]]/i", "\n";', 2252 "1\n1", # Both re's should match 2253 {}, 2254 "get [:lower:] swash in first eval; test under /i in second"); 2255 } 2256 2257 { 2258 fresh_perl_is(<<'EOF', 2259 my $s = "\x{41c}"; 2260 $s =~ /(.*)/ or die; 2261 $ls = lc $1; 2262 print $ls eq lc $s ? "good\n" : "bad: [$ls]\n"; 2263EOF 2264 "good\n", 2265 {}, 2266 "swash triggered by lc() doesn't corrupt \$1" 2267 ); 2268 } 2269 2270 { 2271 #' RT #119075 2272 no warnings 'regexp'; # Silence "has useless greediness modifier" 2273 local $@; 2274 eval { /a{0}?/; }; 2275 ok(! $@, 2276 "PCRE regression test: No 'Quantifier follows nothing in regex' warning"); 2277 2278 } 2279 2280 { 2281 unlike("\xB5", qr/^_?\p{IsMyRuntimeProperty}\z/, "yadayada"); 2282 like("\xB6", qr/^_?\p{IsMyRuntimeProperty}\z/, "yadayada"); 2283 unlike("\xB7", qr/^_?\p{IsMyRuntimeProperty}\z/, "yadayada"); 2284 like("\xB5", qr/^_?\P{IsMyRuntimeProperty}\z/, "yadayada"); 2285 unlike("\xB6", qr/^_?\P{IsMyRuntimeProperty}\z/, "yadayada"); 2286 like("\xB7", qr/^_?\P{IsMyRuntimeProperty}\z/, "yadayada"); 2287 2288 unlike("_\xB5", qr/^_?\p{IsMyRuntimeProperty}\z/, "yadayada"); 2289 like("_\xB6", qr/^_?\p{IsMyRuntimeProperty}\z/, "yadayada"); 2290 unlike("_\xB7", qr/^_?\p{IsMyRuntimeProperty}\z/, "yadayada"); 2291 like("_\xB5", qr/^_?\P{IsMyRuntimeProperty}\z/, "yadayada"); 2292 unlike("_\xB6", qr/^_?\P{IsMyRuntimeProperty}\z/, "yadayada"); 2293 like("_\xB7", qr/^_?\P{IsMyRuntimeProperty}\z/, "yadayada"); 2294 } 2295 2296 # These are defined later, so won't be known at regex compile time above 2297 sub IsMyRuntimeProperty { 2298 return "B6\n"; 2299 } 2300 2301 sub IsntMyRuntimeProperty { 2302 return "!B6\n"; 2303 } 2304 2305 { # From Lingua::Stem::UniNE; no ticket filed but related to #121778 2306 use utf8; 2307 my $word = 'рабта'; 2308 $word =~ s{ (?: 2309 ия # definite articles for nouns: 2310 | ът # ∙ masculine 2311 | та # ∙ feminine 2312 | то # ∙ neutral 2313 | те # ∙ plural 2314 ) $ }{}x; 2315 is($word, 'раб', "Handles UTF8 trie correctly"); 2316 } 2317 2318 { # [perl #122460] 2319 my $a = "rdvark"; 2320 $a =~ /(?{})(?=[A-Za-z0-9_])a*?/g; 2321 is (pos $a, 0, "optimizer correctly thinks (?=...) is 0-length"); 2322 } 2323 2324 # 2325 # Keep the following tests last -- they may crash perl 2326 # 2327 print "# Tests that follow may crash perl\n"; 2328 { 2329 eval '/\k/'; 2330 ok $@ =~ /\QSequence \k... not terminated in regex;\E/, 2331 'Lone \k not allowed'; 2332 } 2333 2334 { 2335 my $message = "Substitution with lookahead (possible segv)"; 2336 $_ = "ns1ns1ns1"; 2337 s/ns(?=\d)/ns_/g; 2338 is($_, "ns_1ns_1ns_1", $message); 2339 $_ = "ns1"; 2340 s/ns(?=\d)/ns_/; 2341 is($_, "ns_1", $message); 2342 $_ = "123"; 2343 s/(?=\d+)|(?<=\d)/!Bang!/g; 2344 is($_, "!Bang!1!Bang!2!Bang!3!Bang!", $message); 2345 } 2346 2347 { 2348 # Earlier versions of Perl said this was fatal. 2349 my $message = "U+0FFFF shouldn't crash the regex engine"; 2350 no warnings 'utf8'; 2351 my $a = eval "chr(65535)"; 2352 use warnings; 2353 my $warning_message; 2354 local $SIG{__WARN__} = sub { $warning_message = $_[0] }; 2355 eval $a =~ /[a-z]/; 2356 ok(1, $message); # If it didn't crash, it worked. 2357 } 2358 2359 TODO: { # Was looping 2360 todo_skip('Triggers thread clone SEGV. See #86550') 2361 if $::running_as_thread && $::running_as_thread; 2362 watchdog(10); # Use a bigger value for busy systems 2363 like("\x{00DF}", qr/[\x{1E9E}_]*/i, "\"\\x{00DF}\" =~ /[\\x{1E9E}_]*/i was looping"); 2364 } 2365 2366 { # Bug #90536, caused failed assertion 2367 unlike("s\N{U+DF}", qr/^\x{00DF}/i, "\"s\\N{U+DF}\", qr/^\\x{00DF}/i"); 2368 } 2369 2370 # User-defined Unicode properties to match above-Unicode code points 2371 sub Is_32_Bit_Super { return "110000\tFFFFFFFF\n" } 2372 sub Is_Portable_Super { return '!utf8::Any' } # Matches beyond 32 bits 2373 2374 { # Assertion was failing on on 64-bit platforms; just didn't work on 32. 2375 no warnings qw(non_unicode portable); 2376 use Config; 2377 2378 # We use 'ok' instead of 'like' because the warnings are lexically 2379 # scoped, and want to turn them off, so have to do the match in this 2380 # scope 2381 if ($Config{uvsize} < 8) { 2382 ok(chr(0xFFFF_FFFE) =~ /\p{Is_32_Bit_Super}/, 2383 "chr(0xFFFF_FFFE) can match a Unicode property"); 2384 ok(chr(0xFFFF_FFFF) =~ /\p{Is_32_Bit_Super}/, 2385 "chr(0xFFFF_FFFF) can match a Unicode property"); 2386 my $p = qr/^[\x{FFFF_FFFF}]$/; 2387 ok(chr(0xFFFF_FFFF) =~ $p, 2388 "chr(0xFFFF_FFFF) can match itself in a [class]"); 2389 ok(chr(0xFFFF_FFFF) =~ $p, # Tests any caching 2390 "chr(0xFFFF_FFFF) can match itself in a [class] subsequently"); 2391 } 2392 else { 2393 no warnings 'overflow'; 2394 ok(chr(0xFFFF_FFFF_FFFF_FFFE) =~ qr/\p{Is_Portable_Super}/, 2395 "chr(0xFFFF_FFFF_FFFF_FFFE) can match a Unicode property"); 2396 ok(chr(0xFFFF_FFFF_FFFF_FFFF) =~ qr/^\p{Is_Portable_Super}$/, 2397 "chr(0xFFFF_FFFF_FFFF_FFFF) can match a Unicode property"); 2398 2399 my $p = qr/^[\x{FFFF_FFFF_FFFF_FFFF}]$/; 2400 ok(chr(0xFFFF_FFFF_FFFF_FFFF) =~ $p, 2401 "chr(0xFFFF_FFFF_FFFF_FFFF) can match itself in a [class]"); 2402 ok(chr(0xFFFF_FFFF_FFFF_FFFF) =~ $p, # Tests any caching 2403 "chr(0xFFFF_FFFF_FFFF_FFFF) can match itself in a [class] subsequently"); 2404 2405 # This test is because something was declared as 32 bits, but 2406 # should have been cast to 64; only a problem where 2407 # sizeof(STRLEN) != sizeof(UV) 2408 ok(chr(0xFFFF_FFFF_FFFF_FFFE) !~ qr/\p{Is_32_Bit_Super}/, "chr(0xFFFF_FFFF_FFFF_FFFE) shouldn't match a range ending in 0xFFFF_FFFF"); 2409 } 2410 } 2411 2412 { # [perl #112530], the code below caused a panic 2413 sub InFoo { "a\tb\n9\ta\n" } 2414 like("\n", qr/\p{InFoo}/, 2415 "Overlapping ranges in user-defined properties"); 2416 } 2417 2418 { # Regexp:Grammars was broken: 2419 # http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2013-06/msg01290.html 2420 fresh_perl_like('use warnings; "abc" =~ qr{(?&foo){0}abc(?<foo>)}', 2421 'Quantifier unexpected on zero-length expression', 2422 {}, 2423 'No segfault on qr{(?&foo){0}abc(?<foo>)}'); 2424 } 2425 2426 # !!! NOTE that tests that aren't at all likely to crash perl should go 2427 # a ways above, above these last ones. 2428 2429 done_testing(); 2430} # End of sub run_tests 2431 24321; 2433