xref: /openbsd-src/gnu/usr.bin/perl/t/re/pat_rt_report.t (revision f2da64fbbbf1b03f09f390ab01267c93dfd77c4c)
1#!./perl
2#
3# This is a home for regular expression tests that don't fit into
4# the format supported by re/regexp.t.  If you want to add a test
5# that does fit that format, add it to re/re_tests, not here.
6
7use strict;
8use warnings;
9use 5.010;
10use Config;
11
12sub run_tests;
13
14$| = 1;
15
16
17BEGIN {
18    chdir 't' if -d 't';
19    @INC = ('../lib','.');
20    require './test.pl';
21    skip_all_if_miniperl("miniperl can't load Tie::Hash::NamedCapture, need for %+ and %-");
22}
23
24
25plan tests => 2532;  # Update this when adding/deleting tests.
26
27run_tests() unless caller;
28
29#
30# Tests start here.
31#
32sub run_tests {
33
34    like("A \x{263a} B z C", qr/A . B (??{ "z" }) C/,
35	 "Match UTF-8 char in presence of (??{ }); Bug 20000731.001");
36
37    {
38        no warnings 'uninitialized';
39        ok(undef =~ /^([^\/]*)(.*)$/, "Used to cause a SEGV; Bug 20001021.005");
40    }
41
42    {
43        my $message = 'bug id 20001008.001';
44
45        my @x = ("stra\337e 138", "stra\337e 138");
46        for (@x) {
47            ok(s/(\d+)\s*([\w\-]+)/$1 . uc $2/e, $message);
48            ok(my ($latin) = /^(.+)(?:\s+\d)/, $message);
49            is($latin, "stra\337e", $message);
50	    ok($latin =~ s/stra\337e/straße/, $message);
51            #
52            # Previous code follows, but outcommented - there were no tests.
53            #
54            # $latin =~ s/stra\337e/straße/; # \303\237 after the 2nd a
55            # use utf8; # needed for the raw UTF-8
56            # $latin =~ s!(s)tr(?:aß|s+e)!$1tr.!; # \303\237 after the a
57        }
58    }
59
60    {
61        # Fist half of the bug.
62        my $message = 'HEBREW ACCENT QADMA matched by .*; Bug 20001028.003';
63        my $X = chr (1448);
64        ok(my ($Y) = $X =~ /(.*)/, $message);
65        is($Y, v1448, $message);
66        is(length $Y, 1, $message);
67
68        # Second half of the bug.
69        $message = 'HEBREW ACCENT QADMA in replacement; Bug 20001028.003';
70        $X = '';
71        $X =~ s/^/chr(1488)/e;
72        is(length $X, 1, $message);
73        is(ord $X, 1488, $message);
74    }
75
76    {
77        my $message = 'Repeated s///; Bug 20001108.001';
78        my $X = "Szab\x{f3},Bal\x{e1}zs";
79        my $Y = $X;
80        $Y =~ s/(B)/$1/ for 0 .. 3;
81        is($Y, $X, $message);
82        is($X, "Szab\x{f3},Bal\x{e1}zs", $message);
83    }
84
85    {
86        my $message = 's/// on UTF-8 string; Bug 20000517.001';
87        my $x = "\x{100}A";
88        $x =~ s/A/B/;
89        is($x, "\x{100}B", $message);
90        is(length $x, 2, $message);
91    }
92
93    {
94        my $message = '\C and É; Bug 20001230.002';
95        ok("École" =~ /^\C\C(.)/ && $1 eq 'c', $message);
96        like("École", qr/^\C\C(c)/, $message);
97    }
98
99    {
100        # The original bug report had 'no utf8' here but that was irrelevant.
101
102        my $message = "Don't dump core; Bug 20010306.008";
103        my $a = "a\x{1234}";
104        like($a, qr/\w/, $message);  # used to core dump.
105    }
106
107    {
108        my $message = '/g in scalar context; Bug 20010410.006';
109        for my $rx ('/(.*?)\{(.*?)\}/csg',
110		    '/(.*?)\{(.*?)\}/cg',
111		    '/(.*?)\{(.*?)\}/sg',
112		    '/(.*?)\{(.*?)\}/g',
113		    '/(.+?)\{(.+?)\}/csg',) {
114            my $i = 0;
115            my $input = "a{b}c{d}";
116            eval <<"            --";
117                while (eval \$input =~ $rx) {
118                    \$i ++;
119                }
120            --
121            is($i, 2, $message);
122        }
123    }
124
125    {
126        # Amazingly vertical tabulator is the same in ASCII and EBCDIC.
127        for ("\n", "\t", "\014", "\r") {
128            unlike($_, qr/[[:print:]]/, sprintf "\\%03o not in [[:print:]]; Bug 20010619.003", ord $_);
129        }
130        for (" ") {
131            like($_, qr/[[:print:]]/, "'$_' in [[:print:]]; Bug 20010619.003");
132        }
133    }
134
135    {
136        # [ID 20010814.004] pos() doesn't work when using =~m// in list context
137
138        $_ = "ababacadaea";
139        my $a = join ":", /b./gc;
140        my $b = join ":", /a./gc;
141        my $c = pos;
142        is("$a $b $c", 'ba:ba ad:ae 10', "pos() works with () = m//; Bug 20010814.004");
143    }
144
145    {
146        # [ID 20010407.006] matching utf8 return values from
147        # functions does not work
148
149        my $message = 'UTF-8 return values from functions; Bug 20010407.006';
150        package ID_20010407_006;
151        sub x {"a\x{1234}"}
152        my $x = x;
153        my $y;
154      ::ok($x =~ /(..)/, $message);
155        $y = $1;
156      ::ok(length ($y) == 2 && $y eq $x, $message);
157      ::ok(x =~ /(..)/, $message);
158        $y = $1;
159      ::ok(length ($y) == 2 && $y eq $x, $message);
160    }
161
162    {
163        # High bit bug -- japhy
164        my $x = "ab\200d";
165        ok $x =~ /.*?\200/, "High bit fine";
166    }
167
168    {
169        my $message = 'UTF-8 hash keys and /$/';
170        # http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters
171        #                                         /2002-01/msg01327.html
172
173        my $u = "a\x{100}";
174        my $v = substr ($u, 0, 1);
175        my $w = substr ($u, 1, 1);
176        my %u = ($u => $u, $v => $v, $w => $w);
177        for (keys %u) {
178            my $m1 =            /^\w*$/ ? 1 : 0;
179            my $m2 = $u {$_} =~ /^\w*$/ ? 1 : 0;
180            is($m1, $m2, $message);
181        }
182    }
183
184    {
185        my $message = "s///eg [change 13f46d054db22cf4]; Bug 20020124.005";
186
187        for my $char ("a", "\x{df}", "\x{100}") {
188            my $x = "$char b $char";
189            $x =~ s{($char)}{
190                  "c" =~ /c/;
191                  "x";
192            }ge;
193            is(substr ($x, 0, 1), substr ($x, -1, 1), $message);
194        }
195    }
196
197    {
198        my $message = "Correct pmop flags checked when empty pattern; Bug 20020412.005";
199
200        # Requires reuse of last successful pattern.
201        my $num = 123;
202        $num =~ /\d/;
203        for (0 .. 1) {
204            my $match = m?? + 0;
205            ok($match != $_, $message)
206                or diag(sprintf "'match one' %s on %s iteration" =>
207			$match ? 'succeeded' : 'failed',
208			$_     ? 'second'    : 'first');
209        }
210        $num =~ /(\d)/;
211        my $result = join "" => $num =~ //g;
212        is($result, $num, $message);
213    }
214
215    {
216        my $message = 'UTF-8 regex matches above 32k; Bug 20020630.002';
217        for (['byte', "\x{ff}"], ['utf8', "\x{1ff}"]) {
218            my ($type, $char) = @$_;
219            for my $len (32000, 32768, 33000) {
220                my  $s = $char . "f" x $len;
221                my  $r = $s =~ /$char([f]*)/gc;
222                ok($r, $message) or diag("<$type x $len>");
223                ok(!$r || pos ($s) == $len + 1, $message)
224		    or diag("<$type x $len>; pos = @{[pos $s]}");
225            }
226        }
227    }
228
229    {
230        my $s = "\x{100}" x 5;
231        my $ok = $s =~ /(\x{100}{4})/;
232        my ($ord, $len) = (ord $1, length $1);
233        ok $ok && $ord == 0x100 && $len == 4, "No panic: end_shift [change 0e933229fa758625]";
234    }
235
236    {
237        our $a = "x\x{100}";
238        chop $a;    # Leaves the UTF-8 flag
239        $a .= "y";  # 1 byte before 'y'.
240
241        like($a, qr/^\C/,        'match one \C on 1-byte UTF-8; Bug 15763');
242        like($a, qr/^\C{1}/,     'match \C{1}; Bug 15763');
243
244        like($a, qr/^\Cy/,       'match \Cy; Bug 15763');
245        like($a, qr/^\C{1}y/,    'match \C{1}y; Bug 15763');
246
247        unlike($a, qr/^\C\Cy/,     q {don't match two \Cy; Bug 15763});
248        unlike($a, qr/^\C{2}y/,    q {don't match \C{2}y; Bug 15763});
249
250        $a = "\x{100}y"; # 2 bytes before "y"
251
252        like($a, qr/^\C/,        'match one \C on 2-byte UTF-8; Bug 15763');
253        like($a, qr/^\C{1}/,     'match \C{1}; Bug 15763');
254        like($a, qr/^\C\C/,      'match two \C; Bug 15763');
255        like($a, qr/^\C{2}/,     'match \C{2}; Bug 15763');
256
257        like($a, qr/^\C\C\C/,    'match three \C on 2-byte UTF-8 and a byte; Bug 15763');
258        like($a, qr/^\C{3}/,     'match \C{3}; Bug 15763');
259
260        like($a, qr/^\C\Cy/,     'match two \C; Bug 15763');
261        like($a, qr/^\C{2}y/,    'match \C{2}; Bug 15763');
262
263        unlike($a, qr/^\C\C\Cy/,   q {don't match three \Cy; Bug 15763});
264        unlike($a, qr/^\C{2}\Cy/,  q {don't match \C{2}\Cy; Bug 15763});
265        unlike($a, qr/^\C{3}y/,    q {don't match \C{3}y; Bug 15763});
266
267        $a = "\x{1000}y"; # 3 bytes before "y"
268
269        like($a, qr/^\C/,        'match one \C on three-byte UTF-8; Bug 15763');
270        like($a, qr/^\C{1}/,     'match \C{1}; Bug 15763');
271        like($a, qr/^\C\C/,      'match two \C; Bug 15763');
272        like($a, qr/^\C{2}/,     'match \C{2}; Bug 15763');
273        like($a, qr/^\C\C\C/,    'match three \C; Bug 15763');
274        like($a, qr/^\C{3}/,     'match \C{3}; Bug 15763');
275
276        like($a, qr/^\C\C\C\C/,  'match four \C on three-byte UTF-8 and a byte; Bug 15763');
277        like($a, qr/^\C{4}/,     'match \C{4}; Bug 15763');
278
279        like($a, qr/^\C\C\Cy/,   'match three \Cy; Bug 15763');
280        like($a, qr/^\C{3}y/,    'match \C{3}y; Bug 15763');
281
282        unlike($a, qr/^\C\C\C\Cy/, q {don't match four \Cy; Bug 15763});
283        unlike($a, qr/^\C{4}y/,    q {don't match \C{4}y; Bug 15763});
284    }
285
286
287    {
288        my $message = 'UTF-8 matching; Bug 15397';
289        like("\x{100}", qr/\x{100}/, $message);
290        like("\x{100}", qr/(\x{100})/, $message);
291        like("\x{100}", qr/(\x{100}){1}/, $message);
292        like("\x{100}\x{100}", qr/(\x{100}){2}/, $message);
293        like("\x{100}\x{100}", qr/(\x{100})(\x{100})/, $message);
294    }
295
296    {
297        my $message = 'Neither ()* nor ()*? sets $1 when matched 0 times; Bug 7471';
298        local $_       = 'CD';
299        ok(/(AB)*?CD/ && !defined $1, $message);
300        ok(/(AB)*CD/  && !defined $1, $message);
301    }
302
303    {
304        my $message = "Caching shouldn't prevent match; Bug 3547";
305        my $pattern = "^(b+?|a){1,2}c";
306        ok("bac"    =~ /$pattern/ && $1 eq 'a', $message);
307        ok("bbac"   =~ /$pattern/ && $1 eq 'a', $message);
308        ok("bbbac"  =~ /$pattern/ && $1 eq 'a', $message);
309        ok("bbbbac" =~ /$pattern/ && $1 eq 'a', $message);
310    }
311
312    {
313        ok("\x{100}" =~ /(.)/, '$1 should keep UTF-8 ness; Bug 18232');
314        is($1, "\x{100}",  '$1 is UTF-8; Bug 18232');
315        { 'a' =~ /./; }
316        is($1, "\x{100}",  '$1 is still UTF-8; Bug 18232');
317        isnt($1, "\xC4\x80", '$1 is not non-UTF-8; Bug 18232');
318    }
319
320    {
321        my $message = "Optimizer doesn't prematurely reject match; Bug 19767";
322        use utf8;
323
324        my $attr = 'Name-1';
325        my $NormalChar      = qr /[\p{IsDigit}\p{IsLower}\p{IsUpper}]/;
326        my $NormalWord      = qr /${NormalChar}+?/;
327        my $PredNameHyphen  = qr /^${NormalWord}(\-${NormalWord})*?$/;
328
329        $attr =~ /^$/;
330        like($attr, $PredNameHyphen, $message);  # Original test.
331
332        "a" =~ m/[b]/;
333        like("0", qr/\p{N}+\z/, $message);         # Variant.
334    }
335
336    {
337        my $message = "(??{ }) doesn't return stale values; Bug 20683";
338        our $p = 1;
339        foreach (1, 2, 3, 4) {
340            $p ++ if /(??{ $p })/
341        }
342        is($p, 5, $message);
343
344        {
345            package P;
346            $a = 1;
347            sub TIESCALAR {bless []}
348            sub FETCH     {$a ++}
349        }
350        tie $p, "P";
351        foreach (1, 2, 3, 4) {
352            /(??{ $p })/
353        }
354        is($p, 5, $message);
355    }
356
357    {
358        # Subject: Odd regexp behavior
359        # From: Markus Kuhn <Markus.Kuhn@cl.cam.ac.uk>
360        # Date: Wed, 26 Feb 2003 16:53:12 +0000
361        # Message-Id: <E18o4nw-0008Ly-00@wisbech.cl.cam.ac.uk>
362        # To: perl-unicode@perl.org
363
364        my $message = 'Markus Kuhn 2003-02-26';
365
366        my $x = "\x{2019}\nk";
367        ok($x =~ s/(\S)\n(\S)/$1 $2/sg, $message);
368        is($x, "\x{2019} k", $message);
369
370        $x = "b\nk";
371        ok($x =~ s/(\S)\n(\S)/$1 $2/sg, $message);
372        is($x, "b k", $message);
373
374        like("\x{2019}", qr/\S/, $message);
375    }
376
377    {
378        my $message = "(??{ .. }) in split doesn't corrupt its stack; Bug 21411";
379        our $i;
380        is('-1-3-5-', join('', split /((??{$i++}))/, '-1-3-5-'), $message);
381        no warnings 'syntax';
382        @_ = split /(?{'WOW'})/, 'abc';
383        local $" = "|";
384        is("@_", "a|b|c", $message);
385    }
386
387    is(join('-', split /(?{ split "" })/, "abc"), 'a-b-c', 'nested split');
388
389    {
390        $_ = "code:   'x' { '...' }\n"; study;
391        my @x; push @x, $& while m/'[^\']*'/gx;
392        local $" = ":";
393        is("@x", "'x':'...'", "Parse::RecDescent triggered infinite loop; Bug 17757");
394    }
395
396    {
397        sub func ($) {
398            ok("a\nb" !~ /^b/,  "Propagated modifier; $_[0]; Bug 22354");
399            ok("a\nb" =~ /^b/m, "Propagated modifier; $_[0] - with /m; Bug 22354");
400        }
401        func "standalone";
402        $_ = "x"; s/x/func "in subst"/e;
403        $_ = "x"; s/x/func "in multiline subst"/em;
404        $_ = "x"; /x(?{func "in regexp"})/;
405        $_ = "x"; /x(?{func "in multiline regexp"})/m;
406    }
407
408    {
409        $_    = "abcdef\n";
410        my @x = m/./g;
411        is("abcde", $`, 'Global match sets $`; Bug 19049');
412    }
413
414    {
415        # [perl #23769] Unicode regex broken on simple example
416        # regrepeat() didn't handle UTF-8 EXACT case right.
417
418        my $Mess       = 'regrepeat() handles UTF-8 EXACT case right';
419        my $message = "$Mess; Bug 23769";
420
421        my $s = "\x{a0}\x{a0}\x{a0}\x{100}"; chop $s;
422
423        like($s, qr/\x{a0}/, $message);
424        like($s, qr/\x{a0}+/, $message);
425        like($s, qr/\x{a0}\x{a0}/, $message);
426
427        $message = "$Mess (easy variant); Bug 23769";
428        ok("aaa\x{100}" =~ /(a+)/, $message);
429        is($1, "aaa", $message);
430
431        $message = "$Mess (easy invariant); Bug 23769";
432        ok("aaa\x{100}     " =~ /(a+?)/, $message);
433        is($1, "a", $message);
434
435        $message = "$Mess (regrepeat variant); Bug 23769";
436        ok("\xa0\xa0\xa0\x{100}    " =~ /(\xa0+?)/, $message);
437        is($1, "\xa0", $message);
438
439        $message = "$Mess (regrepeat invariant); Bug 23769";
440        ok("\xa0\xa0\xa0\x{100}" =~ /(\xa0+)/, $message);
441        is($1, "\xa0\xa0\xa0", $message);
442
443        $message = "$Mess (hard variant); Bug 23769";
444        ok("\xa0\xa1\xa0\xa1\xa0\xa1\x{100}" =~ /((?:\xa0\xa1)+?)/, $message);
445        is($1, "\xa0\xa1", $message);
446
447        $message = "$Mess (hard invariant); Bug 23769";
448        ok("ababab\x{100}  " =~ /((?:ab)+)/, $message);
449        is($1, 'ababab', $message);
450
451        ok("\xa0\xa1\xa0\xa1\xa0\xa1\x{100}" =~ /((?:\xa0\xa1)+)/, $message);
452        is($1, "\xa0\xa1\xa0\xa1\xa0\xa1", $message);
453
454        ok("ababab\x{100}  " =~ /((?:ab)+?)/, $message);
455        is($1, "ab", $message);
456
457        $message = "Don't match first byte of UTF-8 representation; Bug 23769";
458        unlike("\xc4\xc4\xc4", qr/(\x{100}+)/, $message);
459        unlike("\xc4\xc4\xc4", qr/(\x{100}+?)/, $message);
460        unlike("\xc4\xc4\xc4", qr/(\x{100}++)/, $message);
461    }
462
463    {
464        # perl panic: pp_match start/end pointers
465
466        is(eval {my ($x, $y) = "bca" =~ /^(?=.*(a)).*(bc)/; "$x-$y"}, "a-bc",
467	   'Captures can move backwards in string; Bug 25269');
468    }
469
470    {
471        # \cA not recognized in character classes
472        like("a\cAb", qr/\cA/, '\cA in pattern; Bug 27940');
473        like("a\cAb", qr/[\cA]/, '\cA in character class; Bug 27940');
474        like("a\cAb", qr/[\cA-\cB]/, '\cA in character class range; Bug 27940');
475        like("abc", qr/[^\cA-\cB]/, '\cA in negated character class range; Bug 27940');
476        like("a\cBb", qr/[\cA-\cC]/, '\cB in character class range; Bug 27940');
477        like("a\cCbc", qr/[^\cA-\cB]/, '\cC in negated character class range; Bug 27940');
478        like("a\cAb", qr/(??{"\cA"})/, '\cA in ??{} pattern; Bug 27940');
479        unlike("ab", qr/a\cIb/x, '\cI in pattern; Bug 27940');
480    }
481
482    {
483        # perl #28532: optional zero-width match at end of string is ignored
484
485        ok("abc" =~ /^abc(\z)?/ && defined($1),
486           'Optional zero-width match at end of string; Bug 28532');
487        ok("abc" =~ /^abc(\z)??/ && !defined($1),
488           'Optional zero-width match at end of string; Bug 28532');
489    }
490
491    {
492        my $utf8 = "\xe9\x{100}"; chop $utf8;
493        my $latin1 = "\xe9";
494
495        like($utf8, qr/\xe9/i, "utf8/latin; Bug 36207");
496        like($utf8, qr/$latin1/i, "utf8/latin runtime; Bug 36207");
497        like($utf8, qr/(abc|\xe9)/i, "utf8/latin trie; Bug 36207");
498        like($utf8, qr/(abc|$latin1)/i, "utf8/latin trie runtime; Bug 36207");
499
500        like("\xe9", qr/$utf8/i, "latin/utf8; Bug 36207");
501        like("\xe9", qr/(abc|$utf8)/i, "latin/utf8 trie; Bug 36207");
502        like($latin1, qr/$utf8/i, "latin/utf8 runtime; Bug 36207");
503        like($latin1, qr/(abc|$utf8)/i, "latin/utf8 trie runtime; Bug 36207");
504    }
505
506    {
507        my $s = "abcd";
508        $s =~ /(..)(..)/g;
509        $s = $1;
510        $s = $2;
511        is($2, 'cd',
512	   "Assigning to original string does not corrupt match vars; Bug 37038");
513    }
514
515    {
516        {
517            package wooosh;
518            sub gloople {"!"}
519        }
520        my $aeek = bless {} => 'wooosh';
521        is(do {$aeek -> gloople () =~ /(.)/g}, 1,
522	   "//g match against return value of sub [change e26a497577f3ce7b]");
523
524        sub gloople {"!"}
525        is(do{gloople () =~ /(.)/g}, 1,
526	   "change e26a497577f3ce7b didn't affect sub calls for some reason");
527    }
528
529    {
530        # [perl #78680]
531        # See changes 26925-26928, which reverted change 26410
532        {
533            package lv;
534            our $var = "abc";
535            sub variable : lvalue {$var}
536        }
537        my $o = bless [] => 'lv';
538        my $f = "";
539        my $r = eval {
540            for (1 .. 2) {
541                $f .= $1 if $o -> variable =~ /(.)/g;
542            }
543            1;
544        };
545        if ($r) {
546            is($f, "ab", "pos() retained between calls");
547        }
548        else {
549            ok 0, "Code failed: $@";
550        }
551
552        our $var = "abc";
553        sub variable : lvalue {$var}
554        my $g = "";
555        my $s = eval {
556            for (1 .. 2) {
557                $g .= $1 if variable =~ /(.)/g;
558            }
559            1;
560        };
561        if ($s) {
562            is($g, "ab", "pos() retained between calls");
563        }
564        else {
565            ok 0, "Code failed: $@";
566        }
567    }
568
569  SKIP:
570    {
571        skip "In EBCDIC" if $::IS_EBCDIC;
572        no warnings 'utf8';
573        $_ = pack 'U0C2', 0xa2, 0xf8;  # Ill-formed UTF-8
574        my $ret = 0;
575        is(do {!($ret = s/[\0]+//g)}, 1,
576	   "Ill-formed UTF-8 doesn't match NUL in class; Bug 37836");
577    }
578
579    {
580        # chr(65535) should be allowed in regexes
581
582        no warnings 'utf8'; # To allow non-characters
583        my ($c, $r, $s);
584
585        $c = chr 0xffff;
586        $c =~ s/$c//g;
587        is($c, "", "U+FFFF, parsed as atom; Bug 38293");
588
589        $c = chr 0xffff;
590        $r = "\\$c";
591        $c =~ s/$r//g;
592        is($c, "", "U+FFFF backslashed, parsed as atom; Bug 38293");
593
594        $c = chr 0xffff;
595        $c =~ s/[$c]//g;
596        is($c, "", "U+FFFF, parsed in class; Bug 38293");
597
598        $c = chr 0xffff;
599        $r = "[\\$c]";
600        $c =~ s/$r//g;
601        is($c, "", "U+FFFF backslashed, parsed in class; Bug 38293");
602
603        $s = "A\x{ffff}B";
604        $s =~ s/\x{ffff}//i;
605        is($s, "AB", "U+FFFF, EXACTF; Bug 38293");
606
607        $s = "\x{ffff}A";
608        $s =~ s/\bA//;
609        is($s, "\x{ffff}", "U+FFFF, BOUND; Bug 38293");
610
611        $s = "\x{ffff}!";
612        $s =~ s/\B!//;
613        is($s, "\x{ffff}", "U+FFFF, NBOUND; Bug 38293");
614    }
615
616    {
617
618        # The printing characters
619        my @chars = ("A" .. "Z");
620        my $delim = ",";
621        my $size = 32771 - 4;
622        my $str = '';
623
624        # Create some random junk. Inefficient, but it works.
625        for (my $i = 0; $i < $size; $ i++) {
626            $str .= $chars [rand @chars];
627        }
628
629        $str .= ($delim x 4);
630        my $res;
631        my $matched;
632        ok($str =~ s/^(.*?)${delim}{4}//s, "Pattern matches; Bug 39583");
633        is($str, "", "Empty string; Bug 39583");
634        ok(defined $1 && length ($1) == $size, '$1 is correct size; Bug 39583');
635    }
636
637    {
638        like("\0-A", qr/\c@-A/, '@- should not be interpolated in a pattern; Bug 27940');
639        like("\0\0A", qr/\c@+A/, '@+ should not be interpolated in a pattern; Bug 27940');
640        like("X\@-A", qr/X@-A/, '@- should not be interpolated in a pattern; Bug 27940');
641        like("X\@\@A", qr/X@+A/, '@+ should not be interpolated in a pattern; Bug 27940');
642
643        like("X\0A", qr/X\c@?A/,  '\c@?; Bug 27940');
644        like("X\0A", qr/X\c@*A/,  '\c@*; Bug 27940');
645        like("X\0A", qr/X\c@(A)/, '\c@(; Bug 27940');
646        like("X\0A", qr/X(\c@)A/, '\c@); Bug 27940');
647        like("X\0A", qr/X\c@|ZA/, '\c@|; Bug 27940');
648
649        like("X\@A", qr/X@?A/,  '@?; Bug 27940');
650        like("X\@A", qr/X@*A/,  '@*; Bug 27940');
651        like("X\@A", qr/X@(A)/, '@(; Bug 27940');
652        like("X\@A", qr/X(@)A/, '@); Bug 27940');
653        like("X\@A", qr/X@|ZA/, '@|; Bug 27940');
654
655        local $" = ','; # non-whitespace and non-RE-specific
656        like('abc', qr/(.)(.)(.)/, 'The last successful match is bogus; Bug 27940');
657        like("A@+B", qr/A@{+}B/,  'Interpolation of @+ in /@{+}/; Bug 27940');
658        like("A@-B", qr/A@{-}B/,  'Interpolation of @- in /@{-}/; Bug 27940');
659        like("A@+B", qr/A@{+}B/x, 'Interpolation of @+ in /@{+}/x; Bug 27940');
660        like("A@-B", qr/A@{-}B/x, 'Interpolation of @- in /@{-}/x; Bug 27940');
661    }
662
663    {
664        my $s = 'foo bar baz';
665        my (@k, @v, @fetch, $res);
666        my $count = 0;
667        my @names = qw ($+{A} $+{B} $+{C});
668        if ($s =~ /(?<A>foo)\s+(?<B>bar)?\s+(?<C>baz)/) {
669            while (my ($k, $v) = each (%+)) {
670                $count++;
671            }
672            @k = sort keys   (%+);
673            @v = sort values (%+);
674            $res = 1;
675            push @fetch,
676                ["$+{A}", "$1"],
677                ["$+{B}", "$2"],
678                ["$+{C}", "$3"],
679            ;
680        }
681        foreach (0 .. 2) {
682            if ($fetch [$_]) {
683                is($fetch[$_][0], $fetch[$_][1], "$names[$_]; Bug 50496");
684            } else {
685                ok 0, $names[$_];
686            }
687        }
688        is($res, 1, "'$s' =~ /(?<A>foo)\\s+(?<B>bar)?\\s+(?<C>baz)/; Bug 50496");
689        is($count, 3, "Got 3 keys in %+ via each; Bug 50496");
690        is(0 + @k, 3, "Got 3 keys in %+ via keys; Bug 50496");
691        is("@k", "A B C", "Got expected keys; Bug 50496");
692        is("@v", "bar baz foo", "Got expected values; Bug 50496");
693        eval '
694            no warnings "uninitialized";
695            print for $+ {this_key_doesnt_exist};
696        ';
697        is($@, '', 'lvalue $+ {...} should not throw an exception; Bug 50496');
698    }
699
700    {
701        #
702        # Almost the same as the block above, except that the capture is nested.
703        #
704
705        my $s = 'foo bar baz';
706        my (@k, @v, @fetch, $res);
707        my $count = 0;
708        my @names = qw ($+{A} $+{B} $+{C} $+{D});
709        if ($s =~ /(?<D>(?<A>foo)\s+(?<B>bar)?\s+(?<C>baz))/) {
710            while (my ($k,$v) = each(%+)) {
711                $count++;
712            }
713            @k = sort keys   (%+);
714            @v = sort values (%+);
715            $res = 1;
716            push @fetch,
717                ["$+{A}", "$2"],
718                ["$+{B}", "$3"],
719                ["$+{C}", "$4"],
720                ["$+{D}", "$1"],
721            ;
722        }
723        foreach (0 .. 3) {
724            if ($fetch [$_]) {
725                is($fetch[$_][0], $fetch[$_][1], "$names[$_]; Bug 50496");
726            } else {
727                ok 0, $names [$_];
728            }
729        }
730        is($res, 1, "'$s' =~ /(?<D>(?<A>foo)\\s+(?<B>bar)?\\s+(?<C>baz))/; Bug 50496");
731        is($count, 4, "Got 4 keys in %+ via each; Bug 50496");
732        is(@k, 4, "Got 4 keys in %+ via keys; Bug 50496");
733        is("@k", "A B C D", "Got expected keys; Bug 50496");
734        is("@v", "bar baz foo foo bar baz", "Got expected values; Bug 50496");
735        eval '
736            no warnings "uninitialized";
737            print for $+ {this_key_doesnt_exist};
738        ';
739        is($@, '', 'lvalue $+ {...} should not throw an exception; Bug 50496');
740    }
741
742    {
743        my $str = 'abc';
744        my $count = 0;
745        my $mval = 0;
746        my $pval = 0;
747        while ($str =~ /b/g) {$mval = $#-; $pval = $#+; $count ++}
748        is($mval,  0, '@- should be empty; Bug 36046');
749        is($pval,  0, '@+ should be empty; Bug 36046');
750        is($count, 1, 'Should have matched once only; Bug 36046');
751    }
752
753    {
754        my $message = '/m in precompiled regexp; Bug 40684';
755        my $s = "abc\ndef";
756        my $rex = qr'^abc$'m;
757        ok($s =~ m/$rex/, $message);
758        ok($s =~ m/^abc$/m, $message);
759    }
760
761    {
762        my $message = '(?: ... )? should not lose $^R; Bug 36909';
763        $^R = 'Nothing';
764        {
765            local $^R = "Bad";
766            ok('x foofoo y' =~ m {
767                      (foo) # $^R correctly set
768                      (?{ "last regexp code result" })
769            }x, $message);
770            is($^R, 'last regexp code result', $message);
771        }
772        is($^R, 'Nothing', $message);
773
774        {
775            local $^R = "Bad";
776
777            ok('x foofoo y' =~ m {
778                      (?:foo|bar)+ # $^R correctly set
779                      (?{ "last regexp code result" })
780            }x, $message);
781            is($^R, 'last regexp code result', $message);
782        }
783        is($^R, 'Nothing', $message);
784
785        {
786            local $^R = "Bad";
787            ok('x foofoo y' =~ m {
788                      (foo|bar)\1+ # $^R undefined
789                      (?{ "last regexp code result" })
790            }x, $message);
791            is($^R, 'last regexp code result', $message);
792        }
793        is($^R, 'Nothing', $message);
794
795        {
796            local $^R = "Bad";
797            ok('x foofoo y' =~ m {
798                      (foo|bar)\1 # This time without the +
799                      (?{"last regexp code result"})
800            }x, $message);
801            is($^R, 'last regexp code result', $message);
802        }
803        is($^R, 'Nothing', $message);
804    }
805
806    {
807        my $message = 'Match is linear, not quadratic; Bug 22395';
808        our $count;
809        for my $l (10, 100, 1000) {
810            $count = 0;
811            ('a' x $l) =~ /(.*)(?{$count++})[bc]/;
812            local $::TODO = "Should be L+1 not L*(L+3)/2 (L=$l)";
813            is($count, $l + 1, $message);
814        }
815    }
816
817    {
818        my $message = '@-/@+ should not have undefined values; Bug 22614';
819        local $_ = 'ab';
820        our @len = ();
821        /(.){1,}(?{push @len,0+@-})(.){1,}(?{})^/;
822        is("@len", "2 2 2", $message);
823    }
824
825    {
826        my $message = '$& set on s///; Bug 18209';
827        my $text = ' word1 word2 word3 word4 word5 word6 ';
828
829        my @words = ('word1', 'word3', 'word5');
830        my $count;
831        foreach my $word (@words) {
832            $text =~ s/$word\s//gi; # Leave a space to separate words
833                                    # in the resultant str.
834            # The following block is not working.
835            if ($&) {
836                $count ++;
837            }
838            # End bad block
839        }
840        is($count, 3, $message);
841        is($text, ' word2 word4 word6 ', $message);
842    }
843
844    {
845        # RT#6893
846
847        local $_ = qq (A\nB\nC\n);
848        my @res;
849        while (m#(\G|\n)([^\n]*)\n#gsx) {
850            push @res, "$2";
851            last if @res > 3;
852        }
853        is("@res", "A B C", "/g pattern shouldn't infinite loop; Bug 6893");
854    }
855
856    {
857        # No optimizer bug
858        my @tails  = ('', '(?(1))', '(|)', '()?');
859        my @quants = ('*','+');
860        my $doit = sub {
861            my $pats = shift;
862            for (@_) {
863                for my $pat (@$pats) {
864                    for my $quant (@quants) {
865                        for my $tail (@tails) {
866                            my $re = "($pat$quant\$)$tail";
867                            ok(/$re/  && $1 eq $_, "'$_' =~ /$re/; Bug 41010");
868                            ok(/$re/m && $1 eq $_, "'$_' =~ /$re/m; Bug 41010");
869                        }
870                    }
871                }
872            }
873        };
874
875        my @dpats = ('\d',
876                     '[1234567890]',
877                     '(1|[23]|4|[56]|[78]|[90])',
878                     '(?:1|[23]|4|[56]|[78]|[90])',
879                     '(1|2|3|4|5|6|7|8|9|0)',
880                     '(?:1|2|3|4|5|6|7|8|9|0)');
881        my @spats = ('[ ]', ' ', '( |\t)', '(?: |\t)', '[ \t]', '\s');
882        my @sstrs = ('  ');
883        my @dstrs = ('12345');
884        $doit -> (\@spats, @sstrs);
885        $doit -> (\@dpats, @dstrs);
886    }
887
888    {
889        # [perl #45605] Regexp failure with utf8-flagged and byte-flagged string
890
891        my $utf_8 = "\xd6schel";
892        utf8::upgrade ($utf_8);
893        $utf_8 =~ m {(\xd6|&Ouml;)schel};
894        is($1, "\xd6", "Upgrade error; Bug 45605");
895    }
896
897    {
898        # Regardless of utf8ness any character matches itself when
899        # doing a case insensitive match. See also [perl #36207]
900
901        for my $o (0 .. 255) {
902            my @ch = (chr ($o), chr ($o));
903            utf8::upgrade ($ch [1]);
904            for my $u_str (0, 1) {
905                for my $u_pat (0, 1) {
906                    like($ch[$u_str], qr/\Q$ch[$u_pat]\E/i,
907			 "\$c =~ /\$c/i : chr ($o) : u_str = $u_str u_pat = $u_pat; Bug 36207");
908                    like($ch[$u_str], qr/\Q$ch[$u_pat]\E|xyz/i,
909			 "\$c=~/\$c|xyz/i : chr($o) : u_str = $u_str u_pat = $u_pat; Bug 36207");
910                }
911            }
912        }
913    }
914
915    {
916         my $message = '$REGMARK in replacement; Bug 49190';
917         our $REGMARK;
918         no warnings 'experimental::lexical_topic';
919         my $_ = "A";
920         ok(s/(*:B)A/$REGMARK/, $message);
921         is($_, "B", $message);
922         $_ = "CCCCBAA";
923         ok(s/(*:X)A+|(*:Y)B+|(*:Z)C+/$REGMARK/g, $message);
924         is($_, "ZYX", $message);
925         # Use a longer name to force reallocation of $REGMARK.
926         $_ = "CCCCBAA";
927         ok(s/(*:X)A+|(*:YYYYYYYYYYYYYYYY)B+|(*:Z)C+/$REGMARK/g, $message);
928         is($_, "ZYYYYYYYYYYYYYYYYX", $message);
929    }
930
931    {
932        my $message = 'Substitution evaluation in list context; Bug 52658';
933        my $reg = '../xxx/';
934        my @te  = ($reg =~ m{^(/?(?:\.\./)*)},
935                   $reg =~ s/(x)/'b'/eg > 1 ? '##' : '++');
936        is($reg, '../bbb/', $message);
937        is($te [0], '../', $message);
938    }
939
940    {
941        my $a = "xyzt" x 8192;
942        like($a, qr/\A(?>[a-z])*\z/,
943	     '(?>) does not cause wrongness on long string; Bug 60034');
944        my $b = $a . chr 256;
945        chop $b;
946	is($a, $b, 'Bug 60034');
947        like($b, qr/\A(?>[a-z])*\z/,
948	     '(?>) does not cause wrongness on long string with UTF-8; Bug 60034');
949    }
950
951    #
952    # Keep the following tests last -- they may crash perl
953    #
954    print "# Tests that follow may crash perl\n";
955    {
956
957        my $message = 'Pattern in a loop, failure should not ' .
958                         'affect previous success; Bug 19049/38869';
959        my @list = (
960            'ab cdef',             # Matches regex
961            ('e' x 40000 ) .'ab c' # Matches not, but 'ab c' matches part of it
962        );
963        my $y;
964        my $x;
965        foreach (@list) {
966            m/ab(.+)cd/i; # The ignore-case seems to be important
967            $y = $1;      # Use $1, which might not be from the last match!
968            $x = substr ($list [0], $- [0], $+ [0] - $- [0]);
969        }
970        is($y, ' ', $message);
971        is($x, 'ab cd', $message);
972    }
973
974    {
975        ok (("a" x (2 ** 15 - 10)) =~ /^()(a|bb)*$/, "Recursive stack cracker; Bug 24274");
976        ok ((q(a)x 100) =~ /^(??{'(.)'x 100})/,
977            "Regexp /^(??{'(.)'x 100})/ crashes older perls; Bug 24274");
978    }
979
980    {
981        # [perl #45337] utf8 + "[a]a{2}" + /$.../ = panic: sv_len_utf8 cache
982
983        local ${^UTF8CACHE} = -1;
984        my $message = "Shouldn't panic; Bug 45337";
985        my $s = "[a]a{2}";
986        utf8::upgrade $s;
987        like("aaa", qr/$s/, $message);
988    }
989    {
990	my $message = "Check if tree logic breaks \$^R; Bug 57042";
991	my $cond_re = qr/\s*
992	    \s* (?:
993		   \( \s* A  (?{1})
994		 | \( \s* B  (?{2})
995	       )
996	   /x;
997	my @res;
998	for my $line ("(A)","(B)") {
999	   if ($line =~ m/$cond_re/) {
1000	       push @res, $^R ? "#$^R" : "UNDEF";
1001	   }
1002	}
1003	is("@res","#1 #2", $message);
1004    }
1005    {
1006	no warnings 'closure';
1007	my $re = qr/A(??{"1"})/;
1008	ok "A1B" =~ m/^((??{ $re }))((??{"B"}))$/;
1009	ok $1 eq "A1";
1010	ok $2 eq "B";
1011    }
1012
1013    # This only works under -DEBUGGING because it relies on an assert().
1014    {
1015	# Check capture offset re-entrancy of utf8 code.
1016
1017        sub fswash { $_[0] =~ s/([>X])//g; }
1018
1019        my $k1 = "." x 4 . ">>";
1020        fswash($k1);
1021
1022        my $k2 = "\x{f1}\x{2022}";
1023        $k2 =~ s/([\360-\362])/>/g;
1024        fswash($k2);
1025
1026        is($k2, "\x{2022}", "utf8::SWASHNEW doesn't cause capture leaks; Bug 60508");
1027    }
1028
1029    {
1030	# minimal CURLYM limited to 32767 matches
1031	my @pat = (
1032	    qr{a(x|y)*b},	# CURLYM
1033	    qr{a(x|y)*?b},	# .. with minmod
1034	    qr{a([wx]|[yz])*b},	# .. and without tries
1035	    qr{a([wx]|[yz])*?b},
1036	);
1037	my $len = 32768;
1038	my $s = join '', 'a', 'x' x $len, 'b';
1039	for my $pat (@pat) {
1040	    like($s, $pat, "$pat; Bug 65372");
1041	}
1042    }
1043
1044    {
1045        local $::TODO = "[perl #38133]";
1046
1047        "A" =~ /(((?:A))?)+/;
1048        my $first = $2;
1049
1050        "A" =~ /(((A))?)+/;
1051        my $second = $2;
1052
1053        is($first, $second);
1054    }
1055
1056    {
1057       my $message
1058        = 'utf8 =~ /trie/ where trie matches a continuation octet; Bug 70998';
1059
1060       # Catch warnings:
1061       my $w;
1062       local $SIG{__WARN__} = sub { $w .= shift };
1063
1064       # This bug can be reduced to
1065       qq{\x{30ab}} =~ /\xab|\xa9/;
1066       # but it's nice to have a more 'real-world' test. The original test
1067       # case from the RT ticket follows:
1068
1069       my %conv = (
1070                   "\xab"     => "&lt;",
1071                   "\xa9"     => "(c)",
1072                  );
1073       my $conv_rx = '(' . join('|', map { quotemeta } keys %conv) . ')';
1074       $conv_rx = qr{$conv_rx};
1075
1076       my $x
1077        = qq{\x{3042}\x{304b}\x{3055}\x{305f}\x{306a}\x{306f}\x{307e}}
1078        . qq{\x{3084}\x{3089}\x{308f}\x{3093}\x{3042}\x{304b}\x{3055}}
1079        . qq{\x{305f}\x{306a}\x{306f}\x{307e}\x{3084}\x{3089}\x{308f}}
1080        . qq{\x{3093}\x{30a2}\x{30ab}\x{30b5}\x{30bf}\x{30ca}\x{30cf}}
1081        . qq{\x{30de}\x{30e4}\x{30e9}\x{30ef}\x{30f3}\x{30a2}\x{30ab}}
1082        . qq{\x{30b5}\x{30bf}\x{30ca}\x{30cf}\x{30de}\x{30e4}\x{30e9}}
1083        . qq{\x{30ef}\x{30f3}\x{30a2}\x{30ab}\x{30b5}\x{30bf}\x{30ca}}
1084        . qq{\x{30cf}\x{30de}\x{30e4}\x{30e9}\x{30ef}\x{30f3}};
1085
1086       $x =~ s{$conv_rx}{$conv{$1}}eg;
1087
1088       is($w, undef, $message);
1089    }
1090
1091    {
1092        # minimal CURLYM limited to 32767 matches
1093
1094        is(join("-", "   abc   def  " =~ /(?=(\S+))/g), "abc-bc-c-def-ef-f",
1095	   'stclass optimisation does not break + inside (?=); Bug 68564');
1096    }
1097
1098    {
1099        use charnames ":full";
1100        # Delayed interpolation of \N'
1101        my $r1 = qr/\N{THAI CHARACTER SARA I}/;
1102        my $s1 = "\x{E34}\x{E34}\x{E34}\x{E34}";
1103
1104        # Bug #56444
1105        ok $s1 =~ /$r1+/, 'my $r1 = qr/\N{THAI CHARACTER SARA I}/; my $s1 = "\x{E34}\x{E34}\x{E34}\x{E34}; $s1 =~ /$r1+/';
1106
1107        # Bug #62056
1108        ok "${s1}A" =~ m/$s1\N{LATIN CAPITAL LETTER A}/, '"${s1}A" =~ m/$s1\N{LATIN CAPITAL LETTER A}/';
1109
1110        ok "abbbbc" =~ m/\N{1}/ && $& eq "a", '"abbbbc" =~ m/\N{1}/ && $& eq "a"';
1111        ok "abbbbc" =~ m/\N{3,4}/ && $& eq "abbb", '"abbbbc" =~ m/\N{3,4}/ && $& eq "abbb"';
1112    }
1113
1114    {
1115        use charnames ":full";
1116        my $message = '[perl #74982] Period coming after \N{}';
1117        ok("\x{ff08}." =~ m/\N{FULLWIDTH LEFT PARENTHESIS}./ && $& eq "\x{ff08}.", $message);
1118        ok("\x{ff08}." =~ m/[\N{FULLWIDTH LEFT PARENTHESIS}]./ && $& eq "\x{ff08}.", $message);
1119    }
1120
1121SKIP: {
1122    ######## "Segfault using HTML::Entities", Richard Jolly <richardjolly@mac.com>, <A3C7D27E-C9F4-11D8-B294-003065AE00B6@mac.com> in perl-unicode@perl.org
1123
1124    skip('Perl configured without Encode module', 1)
1125	unless $Config{extensions} =~ / Encode /;
1126
1127    # Test case cut down by jhi
1128    fresh_perl_like(<<'EOP', qr!Malformed UTF-8 character \(unexpected end of string\) in substitution \(s///\) at!, {}, 'Segfault using HTML::Entities');
1129use Encode;
1130my $t = ord('A') == 193 ? "\xEA" : "\xE9";
1131Encode::_utf8_on($t);
1132$t =~ s/([^a])//ge;
1133EOP
1134    }
1135
1136    {
1137        # pattern must be compiled late or we can break the test file
1138        my $message = '[perl #115050] repeated nothings in a trie can cause panic';
1139        my $pattern;
1140        $pattern = '[xyz]|||';
1141        ok("blah blah" =~ /$pattern/, $message);
1142        ok("blah blah" =~ /(?:$pattern)h/, $message);
1143        $pattern = '|||[xyz]';
1144        ok("blah blah" =~ /$pattern/, $message);
1145        ok("blah blah" =~ /(?:$pattern)h/, $message);
1146    }
1147
1148    {
1149        # [perl #4289] First mention $& after a match
1150	local $::TODO = "these tests fail without Copy-on-Write enabled"
1151	    if $Config{ccflags} =~ /PERL_NO_COW/;
1152        fresh_perl_is(
1153            '$_ = "abc"; /b/g; $_ = "hello"; print eval q|$&|, "\n"',
1154            "b\n", {}, '$& first mentioned after match');
1155        fresh_perl_is(
1156            '$_ = "abc"; /b/g; $_ = "hello"; print eval q|$`|, "\n"',
1157            "a\n", {}, '$` first mentioned after match');
1158        fresh_perl_is(
1159            '$_ = "abc"; /b/g; $_ = "hello"; print eval q|$\'|,"\n"',
1160            "c\n", {}, '$\' first mentioned after match');
1161    }
1162
1163    {
1164	# [perl #118175] threaded perl-5.18.0 fails pat_rt_report_thr.t
1165	# this tests some related failures
1166	#
1167	# The tests in the block *only* fail when run on 32-bit systems
1168	# with a malloc that allocates above the 2GB line.  On the system
1169	# in the report above that only happened in a thread.
1170	my $s = "\x{1ff}" . "f" x 32;
1171	ok($s =~ /\x{1ff}[[:alpha:]]+/gca, "POSIXA pointer wrap");
1172
1173	# this one segfaulted under the conditions above
1174	# of course, CANY is evil, maybe it should crash
1175	ok($s =~ /.\C+/, "CANY pointer wrap");
1176    }
1177} # End of sub run_tests
1178
11791;
1180