xref: /openbsd-src/gnu/usr.bin/perl/cpan/Encode/t/Unicode.t (revision 3d61058aa5c692477b6d18acfbbdb653a9930ff9)
1b39c5158Smillert#
2*3d61058aSafresh1# $Id: Unicode.t,v 2.5 2023/11/10 01:10:50 dankogai Exp $
3b39c5158Smillert#
4b39c5158Smillert# This script is written entirely in ASCII, even though quoted literals
5b39c5158Smillert# do include non-BMP unicode characters -- Are you happy, jhi?
6b39c5158Smillert#
7b39c5158Smillert
8b39c5158SmillertBEGIN {
9*3d61058aSafresh1    require Config; Config->import();
10b39c5158Smillert    if ($Config{'extensions'} !~ /\bEncode\b/) {
11b39c5158Smillert      print "1..0 # Skip: Encode was not built\n";
12b39c5158Smillert      exit 0;
13b39c5158Smillert    }
14b39c5158Smillert    if (ord("A") == 193) {
15b39c5158Smillert        print "1..0 # Skip: EBCDIC\n";
16b39c5158Smillert    exit 0;
17b39c5158Smillert    }
18b39c5158Smillert    $| = 1;
19b39c5158Smillert}
20b39c5158Smillert
21b39c5158Smillertuse strict;
22b39c5158Smillert#use Test::More 'no_plan';
23e9ce3842Safresh1use Test::More tests => 56;
24b39c5158Smillertuse Encode qw(encode decode find_encoding);
25b39c5158Smillert
26b39c5158Smillert#
27b39c5158Smillert# see
28eac174f2Safresh1# http://www.unicode.org/reports/tr19/
29b39c5158Smillert#
30b39c5158Smillert
31b39c5158Smillertmy $dankogai   = "\x{5c0f}\x{98fc}\x{3000}\x{5f3e}";
32b39c5158Smillertmy $nasty      = "$dankogai\x{1abcd}";
33e9ce3842Safresh1my $fallback   = "$dankogai\x{fffd}\x{fffd}";
34b39c5158Smillert
35b39c5158Smillert#hi: (0x1abcd - 0x10000) / 0x400 + 0xD800 = 0xd82a
36b39c5158Smillert#lo: (0x1abcd - 0x10000) % 0x400 + 0xDC00 = 0xdfcd
37b39c5158Smillert
38b39c5158Smillertmy $n_16be =
39b39c5158Smillert    pack("C*", map {hex($_)} qw<5c 0f 98 fc 30 00 5f 3e  d8 2a df cd>);
40b39c5158Smillertmy $n_16le =
41b39c5158Smillert    pack("C*", map {hex($_)} qw<0f 5c fc 98 00 30 3e 5f  2a d8 cd df>);
42b39c5158Smillertmy $f_16be =
43b39c5158Smillert    pack("C*", map {hex($_)} qw<5c 0f 98 fc 30 00 5f 3e  ff fd>);
44b39c5158Smillertmy $f_16le =
45b39c5158Smillert    pack("C*", map {hex($_)} qw<0f 5c fc 98 00 30 3e 5f  fd ff>);
46b39c5158Smillertmy $n_32be =
47b39c5158Smillert    pack("C*", map {hex($_)}
48b39c5158Smillert     qw<00 00 5c 0f 00 00 98 fc 00 00 30 00 00 00 5f 3e  00 01 ab cd>);
49b39c5158Smillertmy $n_32le =
50b39c5158Smillert    pack("C*", map {hex($_)}
51b39c5158Smillert     qw<0f 5c 00 00 fc 98 00 00 00 30 00 00 3e 5f 00 00  cd ab 01 00>);
52b39c5158Smillert
53b39c5158Smillertmy $n_16bb = pack('n', 0xFeFF) . $n_16be;
54b39c5158Smillertmy $n_16lb = pack('v', 0xFeFF) . $n_16le;
55b39c5158Smillertmy $n_32bb = pack('N', 0xFeFF) . $n_32be;
56b39c5158Smillertmy $n_32lb = pack('V', 0xFeFF) . $n_32le;
57b39c5158Smillert
58b39c5158Smillertis($n_16be, encode('UTF-16BE', $nasty),  qq{encode UTF-16BE});
59b39c5158Smillertis($n_16le, encode('UTF-16LE', $nasty),  qq{encode UTF-16LE});
60b39c5158Smillertis($n_32be, encode('UTF-32BE', $nasty),  qq{encode UTF-32BE});
61b39c5158Smillertis($n_32le, encode('UTF-32LE', $nasty),  qq{encode UTF-16LE});
62b39c5158Smillert
63b39c5158Smillertis($nasty,  decode('UTF-16BE', $n_16be), qq{decode UTF-16BE});
64b39c5158Smillertis($nasty,  decode('UTF-16LE', $n_16le), qq{decode UTF-16LE});
65b39c5158Smillertis($nasty,  decode('UTF-32BE', $n_32be), qq{decode UTF-32BE});
66b39c5158Smillertis($nasty,  decode('UTF-32LE', $n_32le), qq{decode UTF-32LE});
67b39c5158Smillert
68b39c5158Smillertis($n_16bb, encode('UTF-16',   $nasty),  qq{encode UTF-16});
69b39c5158Smillertis($n_32bb, encode('UTF-32',   $nasty),  qq{encode UTF-32});
70b39c5158Smillertis($nasty,  decode('UTF-16',   $n_16bb), qq{decode UTF-16, bom=be});
71b39c5158Smillertis($nasty,  decode('UTF-16',   $n_16lb), qq{decode UTF-16, bom=le});
72b39c5158Smillertis($nasty,  decode('UTF-32',   $n_32bb), qq{decode UTF-32, bom=be});
73b39c5158Smillertis($nasty,  decode('UTF-32',   $n_32lb), qq{decode UTF-32, bom=le});
74b39c5158Smillert
75b39c5158Smillertis(decode('UCS-2BE', $n_16be), $fallback, "decode UCS-2BE: fallback");
76b39c5158Smillertis(decode('UCS-2LE', $n_16le), $fallback, "decode UCS-2LE: fallback");
77b39c5158Smillerteval { decode('UCS-2BE', $n_16be, 1) };
78b39c5158Smillertis (index($@,'UCS-2BE:'), 0, "decode UCS-2BE: exception");
79b39c5158Smillerteval { decode('UCS-2LE', $n_16le, 1) };
80b39c5158Smillertis (index($@,'UCS-2LE:'), 0, "decode UCS-2LE: exception");
81b39c5158Smillertis(encode('UCS-2BE', $nasty), $f_16be, "encode UCS-2BE: fallback");
82b39c5158Smillertis(encode('UCS-2LE', $nasty), $f_16le, "encode UCS-2LE: fallback");
83b39c5158Smillerteval { encode('UCS-2BE', $nasty, 1) };
84b39c5158Smillertis(index($@, 'UCS-2BE'), 0, "encode UCS-2BE: exception");
85b39c5158Smillerteval { encode('UCS-2LE', $nasty, 1) };
86b39c5158Smillertis(index($@, 'UCS-2LE'), 0, "encode UCS-2LE: exception");
87b39c5158Smillert
88e9ce3842Safresh1{
89e9ce3842Safresh1    my %tests = (
90e9ce3842Safresh1        'UCS-2BE'  => 'n*',
91e9ce3842Safresh1        'UCS-2LE'  => 'v*',
92e9ce3842Safresh1        'UTF-16BE' => 'n*',
93e9ce3842Safresh1        'UTF-16LE' => 'v*',
94e9ce3842Safresh1        'UTF-32BE' => 'N*',
95e9ce3842Safresh1        'UTF-32LE' => 'V*',
96e9ce3842Safresh1    );
97e9ce3842Safresh1
98e9ce3842Safresh1    while (my ($enc, $pack) = each(%tests)) {
99e9ce3842Safresh1        is(decode($enc, pack($pack, 0xD800, 0x263A)), "\x{FFFD}\x{263A}",
100e9ce3842Safresh1          "decode $enc (HI surrogate followed by WHITE SMILING FACE)");
101e9ce3842Safresh1        is(decode($enc, pack($pack, 0xDC00, 0x263A)), "\x{FFFD}\x{263A}",
102e9ce3842Safresh1          "decode $enc (LO surrogate followed by WHITE SMILING FACE)");
103e9ce3842Safresh1    }
104e9ce3842Safresh1}
105e9ce3842Safresh1
106e9ce3842Safresh1{
107e9ce3842Safresh1    my %tests = (
108e9ce3842Safresh1        'UTF-16BE' => 'n*',
109e9ce3842Safresh1        'UTF-16LE' => 'v*',
110e9ce3842Safresh1    );
111e9ce3842Safresh1
112e9ce3842Safresh1    while (my ($enc, $pack) = each(%tests)) {
113e9ce3842Safresh1        is(decode($enc, pack($pack, 0xD800)), "\x{FFFD}",
114e9ce3842Safresh1          "decode $enc (HI surrogate)");
115e9ce3842Safresh1        is(decode($enc, pack($pack, 0x263A, 0xD800)), "\x{263A}\x{FFFD}",
116e9ce3842Safresh1          "decode $enc (WHITE SMILING FACE followed by HI surrogate)");
117e9ce3842Safresh1    }
118e9ce3842Safresh1}
119e9ce3842Safresh1
120e9ce3842Safresh1{
121e9ce3842Safresh1    my %tests = (
122e9ce3842Safresh1        'UTF-16BE' => 'n*',
123e9ce3842Safresh1        'UTF-16LE' => 'v*',
124e9ce3842Safresh1    );
125e9ce3842Safresh1
126e9ce3842Safresh1    while (my ($enc, $pack) = each(%tests)) {
127e9ce3842Safresh1        is(encode($enc, "\x{110000}"), pack($pack, 0xFFFD),
128e9ce3842Safresh1          "ordinals greater than U+10FFFF is replaced with U+FFFD");
129e9ce3842Safresh1    }
130e9ce3842Safresh1}
131e9ce3842Safresh1
132b39c5158Smillert#
133b39c5158Smillert# SvGROW test for (en|de)code_xs
134b39c5158Smillert#
135b39c5158SmillertSKIP: {
136b39c5158Smillert    my $utf8 = '';
137b39c5158Smillert    for my $j (0,0x10){
138b39c5158Smillert    for my $i (0..0xffff){
139b39c5158Smillert        $j == 0 and (0xD800 <= $i && $i <= 0xDFFF) and next;
140b39c5158Smillert        $utf8 .= ord($j+$i);
141b39c5158Smillert    }
142b39c5158Smillert    for my $major ('UTF-16', 'UTF-32'){
143b39c5158Smillert        for my $minor ('BE', 'LE'){
144b39c5158Smillert        my $enc = $major.$minor;
145b39c5158Smillert        is(decode($enc, encode($enc, $utf8)), $utf8, "$enc RT");
146b39c5158Smillert        }
147b39c5158Smillert    }
148b39c5158Smillert    }
149b39c5158Smillert};
150b39c5158Smillert
151b39c5158Smillert#
152b39c5158Smillert# CJKT vs. UTF-7
153b39c5158Smillert#
154b39c5158Smillert
155b39c5158Smillertuse File::Spec;
156b39c5158Smillertuse File::Basename;
157b39c5158Smillert
158b39c5158Smillertmy $dir =  dirname(__FILE__);
159b39c5158Smillertopendir my $dh, $dir or die "$dir:$!";
160b39c5158Smillertmy @file = sort grep {/\.utf$/o} readdir $dh;
161b39c5158Smillertclosedir $dh;
162b39c5158Smillertfor my $file (@file){
163b39c5158Smillert    my $path = File::Spec->catfile($dir, $file);
164b39c5158Smillert    open my $fh, '<', $path or die "$path:$!";
165b39c5158Smillert    my $content;
166b39c5158Smillert    if (PerlIO::Layer->find('perlio')){
167b39c5158Smillert    binmode $fh => ':utf8';
168b39c5158Smillert    $content = join('' => <$fh>);
169b39c5158Smillert    }else{ # ugh!
170b39c5158Smillert    binmode $fh;
171b39c5158Smillert    $content = join('' => <$fh>);
172b39c5158Smillert    Encode::_utf8_on($content)
173b39c5158Smillert    }
174b39c5158Smillert    close $fh;
175b39c5158Smillert    is(decode("UTF-7", encode("UTF-7", $content)), $content,
176b39c5158Smillert       "UTF-7 RT:$file");
177b39c5158Smillert}
178b39c5158Smillert
179b39c5158Smillert# Magic
180b39c5158Smillert{
181b39c5158Smillert    # see http://rt.perl.org/rt3//Ticket/Display.html?id=60472
182b39c5158Smillert    my $work = chr(0x100);
183b39c5158Smillert    my $encoding = find_encoding("UTF16-BE");
184b39c5158Smillert    my $tied;
185b39c5158Smillert    tie $tied, SomeScalar => \$work;
186b39c5158Smillert    my $result = $encoding->encode($tied, 1);
187b39c5158Smillert    is($work, "", "check set magic was applied");
188b39c5158Smillert}
189b39c5158Smillert
190b39c5158Smillertpackage SomeScalar;
191b39c5158Smillertuse Tie::Scalar;
192b39c5158Smillertuse vars qw(@ISA);
193b39c5158SmillertBEGIN { @ISA = 'Tie::Scalar' }
194b39c5158Smillert
195b39c5158Smillertsub TIESCALAR {
196b39c5158Smillert    my ($class, $ref) = @_;
197b39c5158Smillert    return bless $ref, $class;
198b39c5158Smillert}
199b39c5158Smillert
200b39c5158Smillertsub FETCH {
201b39c5158Smillert    ${$_[0]}
202b39c5158Smillert}
203b39c5158Smillert
204b39c5158Smillertsub STORE {
205b39c5158Smillert    ${$_[0]} = $_[1];
206b39c5158Smillert}
207b39c5158Smillert
208b39c5158Smillert1;
209b39c5158Smillert__END__
210