xref: /openbsd-src/gnu/usr.bin/perl/t/io/utf8.t (revision d13be5d47e4149db2549a9828e244d59dbc43f15)
1#!./perl
2
3BEGIN {
4    chdir 't' if -d 't';
5    @INC = '../lib';
6    unless (find PerlIO::Layer 'perlio') {
7	print "1..0 # Skip: not perlio\n";
8	exit 0;
9    }
10}
11
12no utf8; # needed for use utf8 not griping about the raw octets
13
14BEGIN { require "./test.pl"; }
15
16plan(tests => 55);
17
18$| = 1;
19
20my $a_file = tempfile();
21
22open(F,"+>:utf8",$a_file);
23print F chr(0x100).'�';
24cmp_ok( tell(F), '==', 4, tell(F) );
25print F "\n";
26cmp_ok( tell(F), '>=', 5, tell(F) );
27seek(F,0,0);
28is( getc(F), chr(0x100) );
29is( getc(F), "�" );
30is( getc(F), "\n" );
31seek(F,0,0);
32binmode(F,":bytes");
33my $chr = chr(0xc4);
34if (ord($a_file) == 193) { $chr = chr(0x8c); } # EBCDIC
35is( getc(F), $chr );
36$chr = chr(0x80);
37if (ord($a_file) == 193) { $chr = chr(0x41); } # EBCDIC
38is( getc(F), $chr );
39$chr = chr(0xc2);
40if (ord($a_file) == 193) { $chr = chr(0x80); } # EBCDIC
41is( getc(F), $chr );
42$chr = chr(0xa3);
43if (ord($a_file) == 193) { $chr = chr(0x44); } # EBCDIC
44is( getc(F), $chr );
45is( getc(F), "\n" );
46seek(F,0,0);
47binmode(F,":utf8");
48is( scalar(<F>), "\x{100}�\n" );
49seek(F,0,0);
50$buf = chr(0x200);
51$count = read(F,$buf,2,1);
52cmp_ok( $count, '==', 2 );
53is( $buf, "\x{200}\x{100}�" );
54close(F);
55
56{
57    $a = chr(300); # This *is* UTF-encoded
58    $b = chr(130); # This is not.
59
60    open F, ">:utf8", $a_file or die $!;
61    print F $a,"\n";
62    close F;
63
64    open F, "<:utf8", $a_file or die $!;
65    $x = <F>;
66    chomp($x);
67    is( $x, chr(300) );
68
69    open F, $a_file or die $!; # Not UTF
70    binmode(F, ":bytes");
71    $x = <F>;
72    chomp($x);
73    $chr = chr(196).chr(172);
74    if (ord($a_file) == 193) { $chr = chr(141).chr(83); } # EBCDIC
75    is( $x, $chr );
76    close F;
77
78    open F, ">:utf8", $a_file or die $!;
79    binmode(F);  # we write a "\n" and then tell() - avoid CRLF issues.
80    binmode(F,":utf8"); # turn UTF-8-ness back on
81    print F $a;
82    my $y;
83    { my $x = tell(F);
84      { use bytes; $y = length($a);}
85      cmp_ok( $x, '==', $y );
86  }
87
88    { # Check byte length of $b
89	use bytes; my $y = length($b);
90	cmp_ok( $y, '==', 1 );
91    }
92
93    print F $b,"\n"; # Don't upgrades $b
94
95    { # Check byte length of $b
96	use bytes; my $y = length($b);
97	cmp_ok( $y, '==', 1 );
98    }
99
100    {
101	my $x = tell(F);
102	{ use bytes; if (ord('A')==193){$y += 2;}else{$y += 3;}} # EBCDIC ASCII
103	cmp_ok( $x, '==', $y );
104    }
105
106    close F;
107
108    open F, $a_file or die $!; # Not UTF
109    binmode(F, ":bytes");
110    $x = <F>;
111    chomp($x);
112    $chr = v196.172.194.130;
113    if (ord('A') == 193) { $chr = v141.83.130; } # EBCDIC
114    is( $x, $chr, sprintf('(%vd)', $x) );
115
116    open F, "<:utf8", $a_file or die $!;
117    $x = <F>;
118    chomp($x);
119    close F;
120    is( $x, chr(300).chr(130), sprintf('(%vd)', $x) );
121
122    open F, ">", $a_file or die $!;
123    binmode(F, ":bytes:");
124
125    # Now let's make it suffer.
126    my $w;
127    {
128	use warnings 'utf8';
129	local $SIG{__WARN__} = sub { $w = $_[0] };
130	print F $a;
131        ok( (!$@));
132	like($w, qr/Wide character in print/i );
133    }
134}
135
136# Hm. Time to get more evil.
137open F, ">:utf8", $a_file or die $!;
138print F $a;
139binmode(F, ":bytes");
140print F chr(130)."\n";
141close F;
142
143open F, "<", $a_file or die $!;
144binmode(F, ":bytes");
145$x = <F>; chomp $x;
146$chr = v196.172.130;
147if (ord('A') == 193) { $chr = v141.83.130; } # EBCDIC
148is( $x, $chr );
149
150# Right.
151open F, ">:utf8", $a_file or die $!;
152print F $a;
153close F;
154open F, ">>", $a_file or die $!;
155binmode(F, ":bytes");
156print F chr(130)."\n";
157close F;
158
159open F, "<", $a_file or die $!;
160binmode(F, ":bytes");
161$x = <F>; chomp $x;
162SKIP: {
163    skip("Defaulting to UTF-8 output means that we can't generate a mangled file")
164	if $UTF8_OUTPUT;
165    is( $x, $chr );
166}
167
168# Now we have a deformed file.
169
170SKIP: {
171    if (ord('A') == 193) {
172	skip("EBCDIC doesn't complain", 2);
173    } else {
174	my @warnings;
175	open F, "<:utf8", $a_file or die $!;
176	$x = <F>; chomp $x;
177	local $SIG{__WARN__} = sub { push @warnings, $_[0]; };
178	eval { sprintf "%vd\n", $x };
179	is (scalar @warnings, 1);
180	like ($warnings[0], qr/Malformed UTF-8 character \(unexpected continuation byte 0x82, with no preceding start byte/);
181    }
182}
183
184close F;
185unlink($a_file);
186
187open F, ">:utf8", $a_file;
188@a = map { chr(1 << ($_ << 2)) } 0..5; # 0x1, 0x10, .., 0x100000
189unshift @a, chr(0); # ... and a null byte in front just for fun
190print F @a;
191close F;
192
193my $c;
194
195# read() should work on characters, not bytes
196open F, "<:utf8", $a_file;
197$a = 0;
198my $failed;
199for (@a) {
200    unless (($c = read(F, $b, 1) == 1)  &&
201            length($b)           == 1  &&
202            ord($b)              == ord($_) &&
203            tell(F)              == ($a += bytes::length($b))) {
204        print '# ord($_)           == ', ord($_), "\n";
205        print '# ord($b)           == ', ord($b), "\n";
206        print '# length($b)        == ', length($b), "\n";
207        print '# bytes::length($b) == ', bytes::length($b), "\n";
208        print '# tell(F)           == ', tell(F), "\n";
209        print '# $a                == ', $a, "\n";
210        print '# $c                == ', $c, "\n";
211	$failed++;
212        last;
213    }
214}
215close F;
216is($failed, undef);
217
218{
219    # Check that warnings are on on I/O, and that they can be muffled.
220
221    local $SIG{__WARN__} = sub { $@ = shift };
222
223    undef $@;
224    open F, ">$a_file";
225    binmode(F, ":bytes");
226    print F chr(0x100);
227    close(F);
228
229    like( $@, 'Wide character in print' );
230
231    undef $@;
232    open F, ">:utf8", $a_file;
233    print F chr(0x100);
234    close(F);
235
236    isnt( defined $@, !0 );
237
238    undef $@;
239    open F, ">$a_file";
240    binmode(F, ":utf8");
241    print F chr(0x100);
242    close(F);
243
244    isnt( defined $@, !0 );
245
246    no warnings 'utf8';
247
248    undef $@;
249    open F, ">$a_file";
250    print F chr(0x100);
251    close(F);
252
253    isnt( defined $@, !0 );
254
255    use warnings 'utf8';
256
257    undef $@;
258    open F, ">$a_file";
259    binmode(F, ":bytes");
260    print F chr(0x100);
261    close(F);
262
263    like( $@, 'Wide character in print' );
264}
265
266{
267    open F, ">:bytes",$a_file; print F "\xde"; close F;
268
269    open F, "<:bytes", $a_file;
270    my $b = chr 0x100;
271    $b .= <F>;
272    is( $b, chr(0x100).chr(0xde), "21395 '.= <>' utf8 vs. bytes" );
273    close F;
274}
275
276{
277    open F, ">:utf8",$a_file; print F chr 0x100; close F;
278
279    open F, "<:utf8", $a_file;
280    my $b = "\xde";
281    $b .= <F>;
282    is( $b, chr(0xde).chr(0x100), "21395 '.= <>' bytes vs. utf8" );
283    close F;
284}
285
286{
287    my @a = ( [ 0x007F, "bytes" ],
288	      [ 0x0080, "bytes" ],
289	      [ 0x0080, "utf8"  ],
290	      [ 0x0100, "utf8"  ] );
291    my $t = 34;
292    for my $u (@a) {
293	for my $v (@a) {
294	    # print "# @$u - @$v\n";
295	    open F, ">$a_file";
296	    binmode(F, ":" . $u->[1]);
297	    print F chr($u->[0]);
298	    close F;
299
300	    open F, "<$a_file";
301	    binmode(F, ":" . $u->[1]);
302
303	    my $s = chr($v->[0]);
304	    utf8::upgrade($s) if $v->[1] eq "utf8";
305
306	    $s .= <F>;
307	    is( $s, chr($v->[0]) . chr($u->[0]), 'rcatline utf8' );
308	    close F;
309	    $t++;
310	}
311    }
312    # last test here 49
313}
314
315{
316    # [perl #23428] Somethings rotten in unicode semantics
317    open F, ">$a_file";
318    binmode F, ":utf8";
319    syswrite(F, $a = chr(0x100));
320    close F;
321    is( ord($a), 0x100, '23428 syswrite should not downgrade scalar' );
322    like( $a, qr/^\w+/, '23428 syswrite should not downgrade scalar' );
323}
324
325# sysread() and syswrite() tested in lib/open.t since Fcntl is used
326
327{
328    # <FH> on a :utf8 stream should complain immediately with -w
329    # if it finds bad UTF-8 (:encoding(utf8) works this way)
330    use warnings 'utf8';
331    undef $@;
332    local $SIG{__WARN__} = sub { $@ = shift };
333    open F, ">$a_file";
334    binmode F;
335    my ($chrE4, $chrF6) = (chr(0xE4), chr(0xF6));
336    if (ord('A') == 193)	# EBCDIC
337    { ($chrE4, $chrF6) = (chr(0x43), chr(0xEC)); }
338    print F "foo", $chrE4, "\n";
339    print F "foo", $chrF6, "\n";
340    close F;
341    open F, "<:utf8", $a_file;
342    undef $@;
343    my $line = <F>;
344    my ($chrE4, $chrF6) = ("E4", "F6");
345    if (ord('A') == 193) { ($chrE4, $chrF6) = ("43", "EC"); } # EBCDIC
346    like( $@, qr/utf8 "\\x$chrE4" does not map to Unicode .+ <F> line 1/,
347	  "<:utf8 readline must warn about bad utf8");
348    undef $@;
349    $line .= <F>;
350    like( $@, qr/utf8 "\\x$chrF6" does not map to Unicode .+ <F> line 2/,
351	  "<:utf8 rcatline must warn about bad utf8");
352    close F;
353}
354