xref: /onnv-gate/usr/src/cmd/perl/5.8.4/distrib/t/io/utf8.t (revision 0:68f95e015346)
1*0Sstevel@tonic-gate#!./perl
2*0Sstevel@tonic-gate
3*0Sstevel@tonic-gateBEGIN {
4*0Sstevel@tonic-gate    chdir 't' if -d 't';
5*0Sstevel@tonic-gate    @INC = '../lib';
6*0Sstevel@tonic-gate    unless (find PerlIO::Layer 'perlio') {
7*0Sstevel@tonic-gate	print "1..0 # Skip: not perlio\n";
8*0Sstevel@tonic-gate	exit 0;
9*0Sstevel@tonic-gate    }
10*0Sstevel@tonic-gate}
11*0Sstevel@tonic-gate
12*0Sstevel@tonic-gateno utf8; # needed for use utf8 not griping about the raw octets
13*0Sstevel@tonic-gate
14*0Sstevel@tonic-gaterequire "./test.pl";
15*0Sstevel@tonic-gate
16*0Sstevel@tonic-gateplan(tests => 53);
17*0Sstevel@tonic-gate
18*0Sstevel@tonic-gate$| = 1;
19*0Sstevel@tonic-gate
20*0Sstevel@tonic-gateopen(F,"+>:utf8",'a');
21*0Sstevel@tonic-gateprint F chr(0x100).'�';
22*0Sstevel@tonic-gateok( tell(F) == 4, tell(F) );
23*0Sstevel@tonic-gateprint F "\n";
24*0Sstevel@tonic-gateok( tell(F) >= 5, tell(F) );
25*0Sstevel@tonic-gateseek(F,0,0);
26*0Sstevel@tonic-gateok( getc(F) eq chr(0x100) );
27*0Sstevel@tonic-gateok( getc(F) eq "�" );
28*0Sstevel@tonic-gateok( getc(F) eq "\n" );
29*0Sstevel@tonic-gateseek(F,0,0);
30*0Sstevel@tonic-gatebinmode(F,":bytes");
31*0Sstevel@tonic-gatemy $chr = chr(0xc4);
32*0Sstevel@tonic-gateif (ord('A') == 193) { $chr = chr(0x8c); } # EBCDIC
33*0Sstevel@tonic-gateok( getc(F) eq $chr );
34*0Sstevel@tonic-gate$chr = chr(0x80);
35*0Sstevel@tonic-gateif (ord('A') == 193) { $chr = chr(0x41); } # EBCDIC
36*0Sstevel@tonic-gateok( getc(F) eq $chr );
37*0Sstevel@tonic-gate$chr = chr(0xc2);
38*0Sstevel@tonic-gateif (ord('A') == 193) { $chr = chr(0x80); } # EBCDIC
39*0Sstevel@tonic-gateok( getc(F) eq $chr );
40*0Sstevel@tonic-gate$chr = chr(0xa3);
41*0Sstevel@tonic-gateif (ord('A') == 193) { $chr = chr(0x44); } # EBCDIC
42*0Sstevel@tonic-gateok( getc(F) eq $chr );
43*0Sstevel@tonic-gateok( getc(F) eq "\n" );
44*0Sstevel@tonic-gateseek(F,0,0);
45*0Sstevel@tonic-gatebinmode(F,":utf8");
46*0Sstevel@tonic-gateok( scalar(<F>) eq "\x{100}�\n" );
47*0Sstevel@tonic-gateseek(F,0,0);
48*0Sstevel@tonic-gate$buf = chr(0x200);
49*0Sstevel@tonic-gate$count = read(F,$buf,2,1);
50*0Sstevel@tonic-gateok( $count == 2 );
51*0Sstevel@tonic-gateok( $buf eq "\x{200}\x{100}�" );
52*0Sstevel@tonic-gateclose(F);
53*0Sstevel@tonic-gate
54*0Sstevel@tonic-gate{
55*0Sstevel@tonic-gate    $a = chr(300); # This *is* UTF-encoded
56*0Sstevel@tonic-gate    $b = chr(130); # This is not.
57*0Sstevel@tonic-gate
58*0Sstevel@tonic-gate    open F, ">:utf8", 'a' or die $!;
59*0Sstevel@tonic-gate    print F $a,"\n";
60*0Sstevel@tonic-gate    close F;
61*0Sstevel@tonic-gate
62*0Sstevel@tonic-gate    open F, "<:utf8", 'a' or die $!;
63*0Sstevel@tonic-gate    $x = <F>;
64*0Sstevel@tonic-gate    chomp($x);
65*0Sstevel@tonic-gate    ok( $x eq chr(300) );
66*0Sstevel@tonic-gate
67*0Sstevel@tonic-gate    open F, "a" or die $!; # Not UTF
68*0Sstevel@tonic-gate    binmode(F, ":bytes");
69*0Sstevel@tonic-gate    $x = <F>;
70*0Sstevel@tonic-gate    chomp($x);
71*0Sstevel@tonic-gate    $chr = chr(196).chr(172);
72*0Sstevel@tonic-gate    if (ord('A') == 193) { $chr = chr(141).chr(83); } # EBCDIC
73*0Sstevel@tonic-gate    ok( $x eq $chr );
74*0Sstevel@tonic-gate    close F;
75*0Sstevel@tonic-gate
76*0Sstevel@tonic-gate    open F, ">:utf8", 'a' or die $!;
77*0Sstevel@tonic-gate    binmode(F);  # we write a "\n" and then tell() - avoid CRLF issues.
78*0Sstevel@tonic-gate    binmode(F,":utf8"); # turn UTF-8-ness back on
79*0Sstevel@tonic-gate    print F $a;
80*0Sstevel@tonic-gate    my $y;
81*0Sstevel@tonic-gate    { my $x = tell(F);
82*0Sstevel@tonic-gate      { use bytes; $y = length($a);}
83*0Sstevel@tonic-gate      ok( $x == $y );
84*0Sstevel@tonic-gate  }
85*0Sstevel@tonic-gate
86*0Sstevel@tonic-gate    { # Check byte length of $b
87*0Sstevel@tonic-gate	use bytes; my $y = length($b);
88*0Sstevel@tonic-gate	ok( $y == 1 );
89*0Sstevel@tonic-gate    }
90*0Sstevel@tonic-gate
91*0Sstevel@tonic-gate    print F $b,"\n"; # Don't upgrades $b
92*0Sstevel@tonic-gate
93*0Sstevel@tonic-gate    { # Check byte length of $b
94*0Sstevel@tonic-gate	use bytes; my $y = length($b);
95*0Sstevel@tonic-gate	ok( $y == 1 );
96*0Sstevel@tonic-gate    }
97*0Sstevel@tonic-gate
98*0Sstevel@tonic-gate    {
99*0Sstevel@tonic-gate	my $x = tell(F);
100*0Sstevel@tonic-gate	{ use bytes; if (ord('A')==193){$y += 2;}else{$y += 3;}} # EBCDIC ASCII
101*0Sstevel@tonic-gate	ok( $x == $y );
102*0Sstevel@tonic-gate    }
103*0Sstevel@tonic-gate
104*0Sstevel@tonic-gate    close F;
105*0Sstevel@tonic-gate
106*0Sstevel@tonic-gate    open F, "a" or die $!; # Not UTF
107*0Sstevel@tonic-gate    binmode(F, ":bytes");
108*0Sstevel@tonic-gate    $x = <F>;
109*0Sstevel@tonic-gate    chomp($x);
110*0Sstevel@tonic-gate    $chr = v196.172.194.130;
111*0Sstevel@tonic-gate    if (ord('A') == 193) { $chr = v141.83.130; } # EBCDIC
112*0Sstevel@tonic-gate    ok( $x eq $chr, sprintf('(%vd)', $x) );
113*0Sstevel@tonic-gate
114*0Sstevel@tonic-gate    open F, "<:utf8", "a" or die $!;
115*0Sstevel@tonic-gate    $x = <F>;
116*0Sstevel@tonic-gate    chomp($x);
117*0Sstevel@tonic-gate    close F;
118*0Sstevel@tonic-gate    ok( $x eq chr(300).chr(130), sprintf('(%vd)', $x) );
119*0Sstevel@tonic-gate
120*0Sstevel@tonic-gate    open F, ">", "a" or die $!;
121*0Sstevel@tonic-gate    if (${^OPEN} =~ /:utf8/) {
122*0Sstevel@tonic-gate        binmode(F, ":bytes:");
123*0Sstevel@tonic-gate    }
124*0Sstevel@tonic-gate
125*0Sstevel@tonic-gate    # Now let's make it suffer.
126*0Sstevel@tonic-gate    my $w;
127*0Sstevel@tonic-gate    {
128*0Sstevel@tonic-gate	use warnings 'utf8';
129*0Sstevel@tonic-gate	local $SIG{__WARN__} = sub { $w = $_[0] };
130*0Sstevel@tonic-gate	print F $a;
131*0Sstevel@tonic-gate        ok( !($@ || $w !~ /Wide character in print/i) );
132*0Sstevel@tonic-gate    }
133*0Sstevel@tonic-gate}
134*0Sstevel@tonic-gate
135*0Sstevel@tonic-gate# Hm. Time to get more evil.
136*0Sstevel@tonic-gateopen F, ">:utf8", "a" or die $!;
137*0Sstevel@tonic-gateprint F $a;
138*0Sstevel@tonic-gatebinmode(F, ":bytes");
139*0Sstevel@tonic-gateprint F chr(130)."\n";
140*0Sstevel@tonic-gateclose F;
141*0Sstevel@tonic-gate
142*0Sstevel@tonic-gateopen F, "<", "a" or die $!;
143*0Sstevel@tonic-gatebinmode(F, ":bytes");
144*0Sstevel@tonic-gate$x = <F>; chomp $x;
145*0Sstevel@tonic-gate$chr = v196.172.130;
146*0Sstevel@tonic-gateif (ord('A') == 193) { $chr = v141.83.130; } # EBCDIC
147*0Sstevel@tonic-gateok( $x eq $chr );
148*0Sstevel@tonic-gate
149*0Sstevel@tonic-gate# Right.
150*0Sstevel@tonic-gateopen F, ">:utf8", "a" or die $!;
151*0Sstevel@tonic-gateprint F $a;
152*0Sstevel@tonic-gateclose F;
153*0Sstevel@tonic-gateopen F, ">>", "a" or die $!;
154*0Sstevel@tonic-gateprint F chr(130)."\n";
155*0Sstevel@tonic-gateclose F;
156*0Sstevel@tonic-gate
157*0Sstevel@tonic-gateopen F, "<", "a" or die $!;
158*0Sstevel@tonic-gate$x = <F>; chomp $x;
159*0Sstevel@tonic-gateok( $x eq $chr );
160*0Sstevel@tonic-gate
161*0Sstevel@tonic-gate# Now we have a deformed file.
162*0Sstevel@tonic-gate
163*0Sstevel@tonic-gateSKIP: {
164*0Sstevel@tonic-gate    if (ord('A') == 193) {
165*0Sstevel@tonic-gate	skip( "EBCDIC doesn't complain" );
166*0Sstevel@tonic-gate    } else {
167*0Sstevel@tonic-gate	open F, "<:utf8", "a" or die $!;
168*0Sstevel@tonic-gate	$x = <F>; chomp $x;
169*0Sstevel@tonic-gate	local $SIG{__WARN__} = sub { ok( 1 ) };
170*0Sstevel@tonic-gate	eval { sprintf "%vd\n", $x };
171*0Sstevel@tonic-gate    }
172*0Sstevel@tonic-gate}
173*0Sstevel@tonic-gate
174*0Sstevel@tonic-gateclose F;
175*0Sstevel@tonic-gateunlink('a');
176*0Sstevel@tonic-gate
177*0Sstevel@tonic-gateopen F, ">:utf8", "a";
178*0Sstevel@tonic-gate@a = map { chr(1 << ($_ << 2)) } 0..5; # 0x1, 0x10, .., 0x100000
179*0Sstevel@tonic-gateunshift @a, chr(0); # ... and a null byte in front just for fun
180*0Sstevel@tonic-gateprint F @a;
181*0Sstevel@tonic-gateclose F;
182*0Sstevel@tonic-gate
183*0Sstevel@tonic-gatemy $c;
184*0Sstevel@tonic-gate
185*0Sstevel@tonic-gate# read() should work on characters, not bytes
186*0Sstevel@tonic-gateopen F, "<:utf8", "a";
187*0Sstevel@tonic-gate$a = 0;
188*0Sstevel@tonic-gatefor (@a) {
189*0Sstevel@tonic-gate    unless (($c = read(F, $b, 1) == 1)  &&
190*0Sstevel@tonic-gate            length($b)           == 1  &&
191*0Sstevel@tonic-gate            ord($b)              == ord($_) &&
192*0Sstevel@tonic-gate            tell(F)              == ($a += bytes::length($b))) {
193*0Sstevel@tonic-gate        print '# ord($_)           == ', ord($_), "\n";
194*0Sstevel@tonic-gate        print '# ord($b)           == ', ord($b), "\n";
195*0Sstevel@tonic-gate        print '# length($b)        == ', length($b), "\n";
196*0Sstevel@tonic-gate        print '# bytes::length($b) == ', bytes::length($b), "\n";
197*0Sstevel@tonic-gate        print '# tell(F)           == ', tell(F), "\n";
198*0Sstevel@tonic-gate        print '# $a                == ', $a, "\n";
199*0Sstevel@tonic-gate        print '# $c                == ', $c, "\n";
200*0Sstevel@tonic-gate        print "not ";
201*0Sstevel@tonic-gate        last;
202*0Sstevel@tonic-gate    }
203*0Sstevel@tonic-gate}
204*0Sstevel@tonic-gateclose F;
205*0Sstevel@tonic-gateok( 1 );
206*0Sstevel@tonic-gate
207*0Sstevel@tonic-gate{
208*0Sstevel@tonic-gate    # Check that warnings are on on I/O, and that they can be muffled.
209*0Sstevel@tonic-gate
210*0Sstevel@tonic-gate    local $SIG{__WARN__} = sub { $@ = shift };
211*0Sstevel@tonic-gate
212*0Sstevel@tonic-gate    undef $@;
213*0Sstevel@tonic-gate    open F, ">a";
214*0Sstevel@tonic-gate    binmode(F, ":bytes");
215*0Sstevel@tonic-gate    print F chr(0x100);
216*0Sstevel@tonic-gate    close(F);
217*0Sstevel@tonic-gate
218*0Sstevel@tonic-gate    like( $@, 'Wide character in print' );
219*0Sstevel@tonic-gate
220*0Sstevel@tonic-gate    undef $@;
221*0Sstevel@tonic-gate    open F, ">:utf8", "a";
222*0Sstevel@tonic-gate    print F chr(0x100);
223*0Sstevel@tonic-gate    close(F);
224*0Sstevel@tonic-gate
225*0Sstevel@tonic-gate    isnt( defined $@ );
226*0Sstevel@tonic-gate
227*0Sstevel@tonic-gate    undef $@;
228*0Sstevel@tonic-gate    open F, ">a";
229*0Sstevel@tonic-gate    binmode(F, ":utf8");
230*0Sstevel@tonic-gate    print F chr(0x100);
231*0Sstevel@tonic-gate    close(F);
232*0Sstevel@tonic-gate
233*0Sstevel@tonic-gate    isnt( defined $@ );
234*0Sstevel@tonic-gate
235*0Sstevel@tonic-gate    no warnings 'utf8';
236*0Sstevel@tonic-gate
237*0Sstevel@tonic-gate    undef $@;
238*0Sstevel@tonic-gate    open F, ">a";
239*0Sstevel@tonic-gate    print F chr(0x100);
240*0Sstevel@tonic-gate    close(F);
241*0Sstevel@tonic-gate
242*0Sstevel@tonic-gate    isnt( defined $@ );
243*0Sstevel@tonic-gate
244*0Sstevel@tonic-gate    use warnings 'utf8';
245*0Sstevel@tonic-gate
246*0Sstevel@tonic-gate    undef $@;
247*0Sstevel@tonic-gate    open F, ">a";
248*0Sstevel@tonic-gate    binmode(F, ":bytes");
249*0Sstevel@tonic-gate    print F chr(0x100);
250*0Sstevel@tonic-gate    close(F);
251*0Sstevel@tonic-gate
252*0Sstevel@tonic-gate    like( $@, 'Wide character in print' );
253*0Sstevel@tonic-gate}
254*0Sstevel@tonic-gate
255*0Sstevel@tonic-gate{
256*0Sstevel@tonic-gate    open F, ">:bytes","a"; print F "\xde"; close F;
257*0Sstevel@tonic-gate
258*0Sstevel@tonic-gate    open F, "<:bytes", "a";
259*0Sstevel@tonic-gate    my $b = chr 0x100;
260*0Sstevel@tonic-gate    $b .= <F>;
261*0Sstevel@tonic-gate    ok( $b eq chr(0x100).chr(0xde), "21395 '.= <>' utf8 vs. bytes" );
262*0Sstevel@tonic-gate    close F;
263*0Sstevel@tonic-gate}
264*0Sstevel@tonic-gate
265*0Sstevel@tonic-gate{
266*0Sstevel@tonic-gate    open F, ">:utf8","a"; print F chr 0x100; close F;
267*0Sstevel@tonic-gate
268*0Sstevel@tonic-gate    open F, "<:utf8", "a";
269*0Sstevel@tonic-gate    my $b = "\xde";
270*0Sstevel@tonic-gate    $b .= <F>;
271*0Sstevel@tonic-gate    ok( $b eq chr(0xde).chr(0x100), "21395 '.= <>' bytes vs. utf8" );
272*0Sstevel@tonic-gate    close F;
273*0Sstevel@tonic-gate}
274*0Sstevel@tonic-gate
275*0Sstevel@tonic-gate{
276*0Sstevel@tonic-gate    my @a = ( [ 0x007F, "bytes" ],
277*0Sstevel@tonic-gate	      [ 0x0080, "bytes" ],
278*0Sstevel@tonic-gate	      [ 0x0080, "utf8"  ],
279*0Sstevel@tonic-gate	      [ 0x0100, "utf8"  ] );
280*0Sstevel@tonic-gate    my $t = 34;
281*0Sstevel@tonic-gate    for my $u (@a) {
282*0Sstevel@tonic-gate	for my $v (@a) {
283*0Sstevel@tonic-gate	    # print "# @$u - @$v\n";
284*0Sstevel@tonic-gate	    open F, ">a";
285*0Sstevel@tonic-gate	    binmode(F, ":" . $u->[1]);
286*0Sstevel@tonic-gate	    print F chr($u->[0]);
287*0Sstevel@tonic-gate	    close F;
288*0Sstevel@tonic-gate
289*0Sstevel@tonic-gate	    open F, "<a";
290*0Sstevel@tonic-gate	    binmode(F, ":" . $u->[1]);
291*0Sstevel@tonic-gate
292*0Sstevel@tonic-gate	    my $s = chr($v->[0]);
293*0Sstevel@tonic-gate	    utf8::upgrade($s) if $v->[1] eq "utf8";
294*0Sstevel@tonic-gate
295*0Sstevel@tonic-gate	    $s .= <F>;
296*0Sstevel@tonic-gate	    ok( $s eq chr($v->[0]) . chr($u->[0]), 'rcatline utf8' );
297*0Sstevel@tonic-gate	    close F;
298*0Sstevel@tonic-gate	    $t++;
299*0Sstevel@tonic-gate	}
300*0Sstevel@tonic-gate    }
301*0Sstevel@tonic-gate    # last test here 49
302*0Sstevel@tonic-gate}
303*0Sstevel@tonic-gate
304*0Sstevel@tonic-gate{
305*0Sstevel@tonic-gate    # [perl #23428] Somethings rotten in unicode semantics
306*0Sstevel@tonic-gate    open F, ">a";
307*0Sstevel@tonic-gate    binmode F, ":utf8";
308*0Sstevel@tonic-gate    syswrite(F, $a = chr(0x100));
309*0Sstevel@tonic-gate    close F;
310*0Sstevel@tonic-gate    is( ord($a), 0x100, '23428 syswrite should not downgrade scalar' );
311*0Sstevel@tonic-gate    like( $a, qr/^\w+/, '23428 syswrite should not downgrade scalar' );
312*0Sstevel@tonic-gate}
313*0Sstevel@tonic-gate
314*0Sstevel@tonic-gate# sysread() and syswrite() tested in lib/open.t since Fcntl is used
315*0Sstevel@tonic-gate
316*0Sstevel@tonic-gate{
317*0Sstevel@tonic-gate    # <FH> on a :utf8 stream should complain immediately with -w
318*0Sstevel@tonic-gate    # if it finds bad UTF-8 (:encoding(utf8) works this way)
319*0Sstevel@tonic-gate    use warnings 'utf8';
320*0Sstevel@tonic-gate    undef $@;
321*0Sstevel@tonic-gate    local $SIG{__WARN__} = sub { $@ = shift };
322*0Sstevel@tonic-gate    open F, ">a";
323*0Sstevel@tonic-gate    binmode F;
324*0Sstevel@tonic-gate    my ($chrE4, $chrF6) = (chr(0xE4), chr(0xF6));
325*0Sstevel@tonic-gate    if (ord('A') == 193)	# EBCDIC
326*0Sstevel@tonic-gate    { ($chrE4, $chrF6) = (chr(0x43), chr(0xEC)); }
327*0Sstevel@tonic-gate    print F "foo", $chrE4, "\n";
328*0Sstevel@tonic-gate    print F "foo", $chrF6, "\n";
329*0Sstevel@tonic-gate    close F;
330*0Sstevel@tonic-gate    open F, "<:utf8", "a";
331*0Sstevel@tonic-gate    undef $@;
332*0Sstevel@tonic-gate    my $line = <F>;
333*0Sstevel@tonic-gate    my ($chrE4, $chrF6) = ("E4", "F6");
334*0Sstevel@tonic-gate    if (ord('A') == 193) { ($chrE4, $chrF6) = ("43", "EC"); } # EBCDIC
335*0Sstevel@tonic-gate    like( $@, qr/utf8 "\\x$chrE4" does not map to Unicode .+ <F> line 1/,
336*0Sstevel@tonic-gate	  "<:utf8 readline must warn about bad utf8");
337*0Sstevel@tonic-gate    undef $@;
338*0Sstevel@tonic-gate    $line .= <F>;
339*0Sstevel@tonic-gate    like( $@, qr/utf8 "\\x$chrF6" does not map to Unicode .+ <F> line 2/,
340*0Sstevel@tonic-gate	  "<:utf8 rcatline must warn about bad utf8");
341*0Sstevel@tonic-gate    close F;
342*0Sstevel@tonic-gate}
343*0Sstevel@tonic-gate
344*0Sstevel@tonic-gateEND {
345*0Sstevel@tonic-gate    1 while unlink "a";
346*0Sstevel@tonic-gate    1 while unlink "b";
347*0Sstevel@tonic-gate}
348