xref: /openbsd-src/gnu/usr.bin/perl/cpan/Encode/t/encoding.t (revision 3d61058aa5c692477b6d18acfbbdb653a9930ff9)
1b39c5158SmillertBEGIN {
2*3d61058aSafresh1    require Config; Config->import();
3b39c5158Smillert    if ($Config{'extensions'} !~ /\bEncode\b/) {
4b39c5158Smillert      print "1..0 # Skip: Encode was not built\n";
5b39c5158Smillert      exit 0;
6b39c5158Smillert    }
7b39c5158Smillert    unless (find PerlIO::Layer 'perlio') {
8b39c5158Smillert    print "1..0 # Skip: PerlIO was not built\n";
9b39c5158Smillert    exit 0;
10b39c5158Smillert    }
11b39c5158Smillert    if (ord("A") == 193) {
129f11ffb7Safresh1    print "1..0 # Skip: encoding pragma does not support EBCDIC platforms\n";
139f11ffb7Safresh1    exit(0);
149f11ffb7Safresh1    }
159f11ffb7Safresh1    if ($] >= 5.025 and !$Config{usecperl}) {
16eac174f2Safresh1    print "1..0 # Skip: encoding pragma not supported in Perl 5.25 or later\n";
17b39c5158Smillert    exit(0);
18b39c5158Smillert    }
19b39c5158Smillert}
20b39c5158Smillert
21b8851fccSafresh1print "1..33\n";
22b8851fccSafresh1
23b39c5158Smillert
2491f110e0Safresh1no warnings "deprecated";
25b39c5158Smillertuse encoding "latin1"; # ignored (overwritten by the next line)
26b39c5158Smillertuse encoding "greek";  # iso 8859-7 (no "latin" alias, surprise...)
27b39c5158Smillert
28b39c5158Smillert# "greek" is "ISO 8859-7", and \xDF in ISO 8859-7 is
29b39c5158Smillert# \x{3AF} in Unicode (GREEK SMALL LETTER IOTA WITH TONOS),
30b39c5158Smillert# instead of \xDF in Unicode (LATIN SMALL LETTER SHARP S)
31b39c5158Smillert
32b39c5158Smillert$a = "\xDF";
33b39c5158Smillert$b = "\x{100}";
34b39c5158Smillert
35b39c5158Smillertprint "not " unless ord($a) == 0x3af;
36b39c5158Smillertprint "ok 1\n";
37b39c5158Smillert
38b39c5158Smillertprint "not " unless ord($b) == 0x100;
39b39c5158Smillertprint "ok 2\n";
40b39c5158Smillert
41b39c5158Smillertmy $c;
42b39c5158Smillert
43b39c5158Smillert$c = $a . $b;
44b39c5158Smillert
45b39c5158Smillertprint "not " unless ord($c) == 0x3af;
46b39c5158Smillertprint "ok 3\n";
47b39c5158Smillert
48b39c5158Smillertprint "not " unless length($c) == 2;
49b39c5158Smillertprint "ok 4\n";
50b39c5158Smillert
51b39c5158Smillertprint "not " unless ord(substr($c, 1, 1)) == 0x100;
52b39c5158Smillertprint "ok 5\n";
53b39c5158Smillert
54b39c5158Smillertprint "not " unless ord(chr(0xdf)) == 0x3af; # spooky
55b39c5158Smillertprint "ok 6\n";
56b39c5158Smillert
57b39c5158Smillertprint "not " unless ord(pack("C", 0xdf)) == 0x3af;
58b39c5158Smillertprint "ok 7\n";
59b39c5158Smillert
60b39c5158Smillert# we didn't break pack/unpack, I hope
61b39c5158Smillert
62b39c5158Smillertprint "not " unless unpack("C", pack("C", 0xdf)) == 0xdf;
63b39c5158Smillertprint "ok 8\n";
64b39c5158Smillert
65b39c5158Smillert# the first octet of UTF-8 encoded 0x3af
66b39c5158Smillertprint "not " unless unpack("U0 C", chr(0xdf)) == 0xce;
67b39c5158Smillertprint "ok 9\n";
68b39c5158Smillert
69b39c5158Smillertprint "not " unless unpack("U", pack("U", 0xdf)) == 0xdf;
70b39c5158Smillertprint "ok 10\n";
71b39c5158Smillert
72b39c5158Smillertprint "not " unless unpack("U", chr(0xdf)) == 0x3af;
73b39c5158Smillertprint "ok 11\n";
74b39c5158Smillert
75b39c5158Smillert# charnames must still work
76b39c5158Smillertuse charnames ':full';
77b39c5158Smillertprint "not " unless ord("\N{LATIN SMALL LETTER SHARP S}") == 0xdf;
78b39c5158Smillertprint "ok 12\n";
79b39c5158Smillert
80b39c5158Smillert# combine
81b39c5158Smillert
82b39c5158Smillert$c = "\xDF\N{LATIN SMALL LETTER SHARP S}" . chr(0xdf);
83b39c5158Smillert
84b39c5158Smillertprint "not " unless ord($c) == 0x3af;
85b39c5158Smillertprint "ok 13\n";
86b39c5158Smillert
87b39c5158Smillertprint "not " unless ord(substr($c, 1, 1)) == 0xdf;
88b39c5158Smillertprint "ok 14\n";
89b39c5158Smillert
90b39c5158Smillertprint "not " unless ord(substr($c, 2, 1)) == 0x3af;
91b39c5158Smillertprint "ok 15\n";
92b39c5158Smillert
93b39c5158Smillert# regex literals
94b39c5158Smillert
95b39c5158Smillertprint "not " unless "\xDF"    =~ /\x{3AF}/;
96b39c5158Smillertprint "ok 16\n";
97b39c5158Smillert
98b39c5158Smillertprint "not " unless "\x{3AF}" =~ /\xDF/;
99b39c5158Smillertprint "ok 17\n";
100b39c5158Smillert
101b39c5158Smillertprint "not " unless "\xDF"    =~ /\xDF/;
102b39c5158Smillertprint "ok 18\n";
103b39c5158Smillert
104b39c5158Smillertprint "not " unless "\x{3AF}" =~ /\x{3AF}/;
105b39c5158Smillertprint "ok 19\n";
106b39c5158Smillert
107b39c5158Smillert# eq, cmp
108b39c5158Smillert
109b39c5158Smillertmy ($byte,$bytes,$U,$Ub,$g1,$g2,$l) = (
110b39c5158Smillert    pack("C*", 0xDF ),       # byte
111b39c5158Smillert    pack("C*", 0xDF, 0x20),  # ($bytes2 cmp $U) > 0
112b39c5158Smillert    pack("U*", 0x3AF),       # $U eq $byte
113b39c5158Smillert    pack("U*", 0xDF ),       # $Ub would eq $bytev w/o use encoding
114b39c5158Smillert    pack("U*", 0x3B1),       # ($g1 cmp $byte) > 0; === chr(0xe1)
115b39c5158Smillert    pack("U*", 0x3AF, 0x20), # ($g2 cmp $byte) > 0;
116b39c5158Smillert    pack("U*", 0x3AB),       # ($l  cmp $byte) < 0; === chr(0xdb)
117b39c5158Smillert);
118b39c5158Smillert
119b39c5158Smillert# all the tests in this section that compare a byte encoded string
120b39c5158Smillert# ato UTF-8 encoded are run in all possible vairants
121b39c5158Smillert# all of the eq, ne, cmp operations tested,
122b39c5158Smillert# $v z $u tested as well as $u z $v
123b39c5158Smillert
124b39c5158Smillertsub alleq($$){
125b39c5158Smillert    my ($a,$b)    =    (shift, shift);
126b39c5158Smillert     $a  eq  $b        &&     $b  eq  $a         &&
127b39c5158Smillert  !( $a  ne  $b )      &&  !( $b  ne  $a )       &&
128b39c5158Smillert   ( $a  cmp $b ) == 0 &&   ( $b  cmp $a ) == 0;
129b39c5158Smillert}
130b39c5158Smillert
131b39c5158Smillertsub anyeq($$){
132b39c5158Smillert    my ($a,$b)    =    (shift, shift);
133b39c5158Smillert     $a  eq  $b        ||     $b  eq  $a         ||
134b39c5158Smillert  !( $a  ne  $b )      ||  !( $b  ne  $a )       ||
135b39c5158Smillert   ( $a  cmp $b ) == 0 ||   ( $b  cmp $a ) == 0;
136b39c5158Smillert}
137b39c5158Smillert
138b39c5158Smillertsub allgt($$){
139b39c5158Smillert    my ($a,$b)    =    (shift, shift);
140b39c5158Smillert    ( $a cmp $b ) == 1 && ( $b cmp $a ) == -1;
141b39c5158Smillert}
142b39c5158Smillert#match the correct UTF-8 string
143b39c5158Smillertprint "not " unless  alleq($byte, $U);
144b39c5158Smillertprint "ok 20\n";
145b39c5158Smillert
146b39c5158Smillert#do not match a wrong UTF-8 string
147b39c5158Smillertprint "not " if anyeq($byte, $Ub);
148b39c5158Smillertprint "ok 21\n";
149b39c5158Smillert
150b39c5158Smillert#string ordering
151b39c5158Smillertprint "not " unless allgt ( $g1,    $byte  )  &&
152b39c5158Smillert                    allgt ( $g2,    $byte  )  &&
153b39c5158Smillert                    allgt ( $byte,  $l     )  &&
154b39c5158Smillert                    allgt ( $bytes, $U     );
155b39c5158Smillertprint "ok 22\n";
156b39c5158Smillert
157b39c5158Smillert# upgrade, downgrade
158b39c5158Smillert
159b39c5158Smillertmy ($u,$v,$v2);
160b39c5158Smillert$u = $v = $v2 = pack("C*", 0xDF);
161b39c5158Smillertutf8::upgrade($v);                   #explicit upgrade
162b39c5158Smillert$v2 = substr( $v2."\x{410}", 0, -1); #implicit upgrade
163b39c5158Smillert
164b39c5158Smillert# implicit upgrade === explicit upgrade
165b39c5158Smillertprint "not "  if do{{use bytes; $v ne $v2}} || $v ne $v2;
166b39c5158Smillertprint "ok 23\n";
167b39c5158Smillert
168b39c5158Smillert# utf8::upgrade is transparent and does not break equality
169b39c5158Smillertprint "not " unless alleq( $u, $v );
170b39c5158Smillertprint "ok 24\n";
171b39c5158Smillert
172b39c5158Smillert$u = $v = pack("C*", 0xDF);
173b39c5158Smillertutf8::upgrade($v);
174b39c5158Smillert#test for a roundtrip, we should get back from where we left
175b39c5158Smillerteval {utf8::downgrade( $v )};
176b39c5158Smillertprint "not " if $@ !~ /^Wide / || do{{use bytes; $u eq $v}} || $u ne $v;
177b39c5158Smillertprint "ok 25\n";
178b39c5158Smillert
179b39c5158Smillert# some more eq, cmp
180b39c5158Smillert
181b39c5158Smillert$byte=pack("C*", 0xDF);
182b39c5158Smillert
183b39c5158Smillertprint "not " unless pack("U*", 0x3AF) eq $byte;
184b39c5158Smillertprint "ok 26\n";
185b39c5158Smillert
186b39c5158Smillertprint "not " if chr(0xDF) cmp $byte;
187b39c5158Smillertprint "ok 27\n";
188b39c5158Smillert
189b39c5158Smillertprint "not " unless ((pack("U*", 0x3B0)       cmp $byte) ==  1) &&
190b39c5158Smillert                    ((pack("U*", 0x3AE)       cmp $byte) == -1) &&
191b39c5158Smillert                    ((pack("U*", 0x3AF, 0x20) cmp $byte) ==  1) &&
192b39c5158Smillert                ((pack("U*", 0x3AF) cmp pack("C*",0xDF,0x20))==-1);
193b39c5158Smillertprint "ok 28\n";
194b39c5158Smillert
195b39c5158Smillert
196b39c5158Smillert{
197b39c5158Smillert    # Used to core dump in 5.7.3
198b39c5158Smillert    no warnings; # so test goes noiselessly
199b39c5158Smillert    print ord(undef) == 0 ? "ok 29\n" : "not ok 29\n";
200b39c5158Smillert}
201b39c5158Smillert
202b39c5158Smillert{
203b39c5158Smillert    my %h1;
204b39c5158Smillert    my %h2;
205b39c5158Smillert    $h1{"\xdf"}    = 41;
206b39c5158Smillert    $h2{"\x{3af}"} = 42;
207b39c5158Smillert    print $h1{"\x{3af}"} == 41 ? "ok 30\n" : "not ok 30\n";
208b39c5158Smillert    print $h2{"\xdf"}    == 42 ? "ok 31\n" : "not ok 31\n";
209b39c5158Smillert}
210b8851fccSafresh1
211b8851fccSafresh1# Order of finding the above-Latin1 code point should not matter: both should
212b8851fccSafresh1# assume Latin1/Unicode encoding
213b8851fccSafresh1{
214b8851fccSafresh1    use bytes;
215b8851fccSafresh1    print "not " if "\xDF\x{100}" =~ /\x{3af}\x{100}/;
216b8851fccSafresh1    print "ok 32\n";
217b8851fccSafresh1    print "not " if "\x{100}\xDF" =~ /\x{100}\x{3af}/;
218b8851fccSafresh1    print "ok 33\n";
219b8851fccSafresh1}
220