xref: /openbsd-src/gnu/usr.bin/perl/t/op/split_unicode.t (revision 50b7afb2c2c0993b0894d4e34bf857cb13ed9c80)
1#!./perl
2
3BEGIN {
4    require './test.pl';
5    skip_all_if_miniperl("no dynamic loading on miniperl, no File::Spec (used by charnames)");
6    plan(tests => 151);
7}
8
9{
10    # check the special casing of split /\s/ and unicode
11    use charnames qw(:full);
12    # below test data is extracted from
13    # PropList-5.0.0.txt
14    # Date: 2006-06-07, 23:22:52 GMT [MD]
15    #
16    # Unicode Character Database
17    # Copyright (c) 1991-2006 Unicode, Inc.
18    # For terms of use, see http://www.unicode.org/terms_of_use.html
19    # For documentation, see UCD.html
20    my @spaces=(
21	ord("\t"),      # Cc       <control-0009>
22	ord("\n"),      # Cc       <control-000A>
23	# not PerlSpace # Cc       <control-000B>
24	ord("\f"),      # Cc       <control-000C>
25	ord("\r"),      # Cc       <control-000D>
26	ord(" "),       # Zs       SPACE
27	ord("\N{NEL}"), # Cc       <control-0085>
28	ord("\N{NO-BREAK SPACE}"),
29			# Zs       NO-BREAK SPACE
30        0x1680,         # Zs       OGHAM SPACE MARK
31        0x180E,         # Zs       MONGOLIAN VOWEL SEPARATOR
32        0x2000..0x200A, # Zs  [11] EN QUAD..HAIR SPACE
33        0x2028,         # Zl       LINE SEPARATOR
34        0x2029,         # Zp       PARAGRAPH SEPARATOR
35        0x202F,         # Zs       NARROW NO-BREAK SPACE
36        0x205F,         # Zs       MEDIUM MATHEMATICAL SPACE
37        0x3000          # Zs       IDEOGRAPHIC SPACE
38    );
39    #diag "Have @{[0+@spaces]} to test\n";
40    foreach my $cp (@spaces) {
41	my $msg = sprintf "Space: U+%04x", $cp;
42        my $space = chr($cp);
43        my $str="A:$space:B\x{FFFD}";
44        chop $str;
45
46        my @res=split(/\s+/,$str);
47        my $cnt=split(/\s+/,$str);
48        ok(@res == 2 && join('-',@res) eq "A:-:B", "$msg - /\\s+/");
49	is($cnt, scalar(@res), "$msg - /\\s+/ (count)");
50
51        my $s2 = "$space$space:A:$space$space:B\x{FFFD}";
52        chop $s2;
53
54        my @r2 = split(' ',$s2);
55	my $c2 = split(' ',$s2);
56        ok(@r2 == 2 && join('-', @r2) eq ":A:-:B",  "$msg - ' '");
57	is($c2, scalar(@r2), "$msg - ' ' (count)");
58
59        my @r3 = split(/\s+/, $s2);
60        my $c3 = split(/\s+/, $s2);
61        ok(@r3 == 3 && join('-', @r3) eq "-:A:-:B", "$msg - /\\s+/ No.2");
62	is($c3, scalar(@r3), "$msg - /\\s+/ No.2 (count)");
63    }
64
65    { # RT #114808
66        warning_is(
67            sub {
68                $p=chr(0x100);
69                for (".","ab\x{101}def") {
70                    @q = split /$p/
71                }
72            },
73            undef,
74            'no warnings when part of split cant match non-utf8'
75        );
76    }
77
78}
79