xref: /openbsd-src/gnu/usr.bin/perl/t/op/split_unicode.t (revision eac174f2741a08d8deb8aae59a7f778ef9b5d770)
1898184e3Ssthen#!./perl
2898184e3Ssthen
3898184e3SsthenBEGIN {
4b8851fccSafresh1    chdir 't' if -d 't';
5898184e3Ssthen    require './test.pl';
69f11ffb7Safresh1}
79f11ffb7Safresh1
8898184e3Ssthenskip_all_if_miniperl("no dynamic loading on miniperl, no File::Spec (used by charnames)");
9*eac174f2Safresh1plan(tests => 147);
10898184e3Ssthen
11898184e3Ssthen{
12898184e3Ssthen    # check the special casing of split /\s/ and unicode
13898184e3Ssthen    use charnames qw(:full);
14898184e3Ssthen    # below test data is extracted from
15898184e3Ssthen    # PropList-5.0.0.txt
16898184e3Ssthen    # Date: 2006-06-07, 23:22:52 GMT [MD]
17898184e3Ssthen    #
18898184e3Ssthen    # Unicode Character Database
19898184e3Ssthen    # Copyright (c) 1991-2006 Unicode, Inc.
20898184e3Ssthen    # For terms of use, see http://www.unicode.org/terms_of_use.html
21898184e3Ssthen    # For documentation, see UCD.html
22898184e3Ssthen    my @spaces=(
23898184e3Ssthen	ord("\t"),      # Cc       <control-0009>
24898184e3Ssthen	ord("\n"),      # Cc       <control-000A>
25898184e3Ssthen	# not PerlSpace # Cc       <control-000B>
26898184e3Ssthen	ord("\f"),      # Cc       <control-000C>
27898184e3Ssthen	ord("\r"),      # Cc       <control-000D>
28898184e3Ssthen	ord(" "),       # Zs       SPACE
29898184e3Ssthen	ord("\N{NEL}"), # Cc       <control-0085>
30898184e3Ssthen	ord("\N{NO-BREAK SPACE}"),
31898184e3Ssthen			# Zs       NO-BREAK SPACE
32898184e3Ssthen        0x1680,         # Zs       OGHAM SPACE MARK
33898184e3Ssthen        0x2000..0x200A, # Zs  [11] EN QUAD..HAIR SPACE
34898184e3Ssthen        0x2028,         # Zl       LINE SEPARATOR
35898184e3Ssthen        0x2029,         # Zp       PARAGRAPH SEPARATOR
36898184e3Ssthen        0x202F,         # Zs       NARROW NO-BREAK SPACE
37898184e3Ssthen        0x205F,         # Zs       MEDIUM MATHEMATICAL SPACE
38898184e3Ssthen        0x3000          # Zs       IDEOGRAPHIC SPACE
39898184e3Ssthen    );
40898184e3Ssthen    #diag "Have @{[0+@spaces]} to test\n";
41898184e3Ssthen    foreach my $cp (@spaces) {
42898184e3Ssthen	my $msg = sprintf "Space: U+%04x", $cp;
43898184e3Ssthen        my $space = chr($cp);
44898184e3Ssthen        my $str="A:$space:B\x{FFFD}";
45898184e3Ssthen        chop $str;
46898184e3Ssthen
47898184e3Ssthen        my @res=split(/\s+/,$str);
48898184e3Ssthen        my $cnt=split(/\s+/,$str);
49898184e3Ssthen        ok(@res == 2 && join('-',@res) eq "A:-:B", "$msg - /\\s+/");
50898184e3Ssthen	is($cnt, scalar(@res), "$msg - /\\s+/ (count)");
51898184e3Ssthen
52898184e3Ssthen        my $s2 = "$space$space:A:$space$space:B\x{FFFD}";
53898184e3Ssthen        chop $s2;
54898184e3Ssthen
55898184e3Ssthen        my @r2 = split(' ',$s2);
56898184e3Ssthen	my $c2 = split(' ',$s2);
57898184e3Ssthen        ok(@r2 == 2 && join('-', @r2) eq ":A:-:B",  "$msg - ' '");
58898184e3Ssthen	is($c2, scalar(@r2), "$msg - ' ' (count)");
59898184e3Ssthen
60898184e3Ssthen        my @r3 = split(/\s+/, $s2);
61898184e3Ssthen        my $c3 = split(/\s+/, $s2);
62898184e3Ssthen        ok(@r3 == 3 && join('-', @r3) eq "-:A:-:B", "$msg - /\\s+/ No.2");
63898184e3Ssthen	is($c3, scalar(@r3), "$msg - /\\s+/ No.2 (count)");
64898184e3Ssthen    }
6591f110e0Safresh1
6691f110e0Safresh1    { # RT #114808
6791f110e0Safresh1        warning_is(
6891f110e0Safresh1            sub {
6991f110e0Safresh1                $p=chr(0x100);
7091f110e0Safresh1                for (".","ab\x{101}def") {
7191f110e0Safresh1                    @q = split /$p/
7291f110e0Safresh1                }
7391f110e0Safresh1            },
7491f110e0Safresh1            undef,
7591f110e0Safresh1            'no warnings when part of split cant match non-utf8'
7691f110e0Safresh1        );
7791f110e0Safresh1    }
7891f110e0Safresh1
79898184e3Ssthen}
80*eac174f2Safresh1
81*eac174f2Safresh1{
82*eac174f2Safresh1    # Check empty pattern with specified field count on Unicode string
83*eac174f2Safresh1    my $string = "\x{100}\x{101}\x{102}";
84*eac174f2Safresh1    $_ = join(':', split(//, $string, 2));
85*eac174f2Safresh1    is($_, "\x{100}:\x{101}\x{102}",
86*eac174f2Safresh1            "Split into specified number of fields with empty pattern");
87*eac174f2Safresh1    @ary = split(//, $string, 2);
88*eac174f2Safresh1    $cnt = split(//, $string, 2);
89*eac174f2Safresh1    is($cnt, scalar(@ary), "Check element count from previous test");
90*eac174f2Safresh1}
91