xref: /onnv-gate/usr/src/cmd/perl/5.8.4/distrib/t/op/utf8decode.t (revision 0:68f95e015346)
1*0Sstevel@tonic-gate#!./perl
2*0Sstevel@tonic-gate
# Move into the t/ directory when it exists, and make ../lib the only
# place modules are loaded from.
BEGIN {
    if (-d 't') {
        chdir 't';
    }
    @INC = ('../lib');
}
7*0Sstevel@tonic-gate
# Look at the first byte of the internal encoding of character 256.
# 0x8c (140) means UTF-EBCDIC, in which case the whole file is skipped;
# anything other than 0xc4 (196) is reported as a diagnostic.
{
    my $wide_char = v256;
    use bytes;
    my $lead_byte = ord($wide_char);
    printf "# under use bytes ord(v256) = 0x%02x\n", $lead_byte;
    if ($lead_byte == 0x8c) {
        print "1..0 # Skip: UTF-EBCDIC (not UTF-8) used here\n";
        exit 0;
    }
    printf "# v256 starts with 0x%02x\n", $lead_byte
        unless $lead_byte == 0xc4;
}
21*0Sstevel@tonic-gate
no utf8;

# Number of the next test to be reported.
my $test = 1;

# TAP plan: 78 tests in total.
print "1..78\n";
27*0Sstevel@tonic-gate
# Test table derived from Markus Kuhn's UTF-8 Decode Stress Tester,
# http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt,
# version dated 2000-09-02.
#
# Bytes in the ranges \x00-\x1f and \x7f-\xff are spelled with \x escapes
# rather than as raw binary, because e.g. many patch programs have issues
# with binary data.  The here-document interpolates, so the escapes become
# the real bytes before the rows are parsed.
#
# A line starting with one or two dotted numbers is a section heading and
# a line starting with '#' is a comment.  A data row consists of:
#   id, y|n, "string", code point in hex or '-', length in bytes,
#   the bytes as colon-separated hex, and optionally the length in
#   characters (or '-') followed by the expected warning text.

my @MK = split /\n/, <<"__EOMK__";
1	Correct UTF-8
1.1.1 y "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5"	-		11	ce:ba:e1:bd:b9:cf:83:ce:bc:ce:b5	5
2	Boundary conditions
2.1	First possible sequence of certain length
2.1.1 y "\x00"			0		1	00	1
2.1.2 y "\xc2\x80"			80		2	c2:80	1
2.1.3 y "\xe0\xa0\x80"		800		3	e0:a0:80	1
2.1.4 y "\xf0\x90\x80\x80"		10000		4	f0:90:80:80	1
2.1.5 y "\xf8\x88\x80\x80\x80"	200000		5	f8:88:80:80:80	1
2.1.6 y "\xfc\x84\x80\x80\x80\x80"	4000000		6	fc:84:80:80:80:80	1
2.2	Last possible sequence of certain length
2.2.1 y "\x7f"			7f		1	7f	1
2.2.2 y "\xdf\xbf"			7ff		2	df:bf	1
# The ffff is illegal unless UTF8_ALLOW_FFFF
2.2.3 n "\xef\xbf\xbf"			ffff		3	ef:bf:bf	1	character 0xffff
2.2.4 y "\xf7\xbf\xbf\xbf"			1fffff		4	f7:bf:bf:bf	1
2.2.5 y "\xfb\xbf\xbf\xbf\xbf"			3ffffff		5	fb:bf:bf:bf:bf	1
2.2.6 y "\xfd\xbf\xbf\xbf\xbf\xbf"		7fffffff	6	fd:bf:bf:bf:bf:bf	1
2.3	Other boundary conditions
2.3.1 y "\xed\x9f\xbf"		d7ff		3	ed:9f:bf	1
2.3.2 y "\xee\x80\x80"		e000		3	ee:80:80	1
2.3.3 y "\xef\xbf\xbd"			fffd		3	ef:bf:bd	1
2.3.4 y "\xf4\x8f\xbf\xbf"		10ffff		4	f4:8f:bf:bf	1
2.3.5 y "\xf4\x90\x80\x80"		110000		4	f4:90:80:80	1
3	Malformed sequences
3.1	Unexpected continuation bytes
3.1.1 n "\x80"			-		1	80	-	unexpected continuation byte 0x80
3.1.2 n "\xbf"			-		1	bf	-	unexpected continuation byte 0xbf
3.1.3 n "\x80\xbf"			-		2	80:bf	-	unexpected continuation byte 0x80
3.1.4 n "\x80\xbf\x80"		-		3	80:bf:80	-	unexpected continuation byte 0x80
3.1.5 n "\x80\xbf\x80\xbf"		-		4	80:bf:80:bf	-	unexpected continuation byte 0x80
3.1.6 n "\x80\xbf\x80\xbf\x80"	-		5	80:bf:80:bf:80	-	unexpected continuation byte 0x80
3.1.7 n "\x80\xbf\x80\xbf\x80\xbf"	-		6	80:bf:80:bf:80:bf	-	unexpected continuation byte 0x80
3.1.8 n "\x80\xbf\x80\xbf\x80\xbf\x80"	-		7	80:bf:80:bf:80:bf:80	-	unexpected continuation byte 0x80
3.1.9 n "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf"				-	64	80:81:82:83:84:85:86:87:88:89:8a:8b:8c:8d:8e:8f:90:91:92:93:94:95:96:97:98:99:9a:9b:9c:9d:9e:9f:a0:a1:a2:a3:a4:a5:a6:a7:a8:a9:aa:ab:ac:ad:ae:af:b0:b1:b2:b3:b4:b5:b6:b7:b8:b9:ba:bb:bc:bd:be:bf	-	unexpected continuation byte 0x80
3.2	Lonely start characters
3.2.1 n "\xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf "	-	64 	c0:20:c1:20:c2:20:c3:20:c4:20:c5:20:c6:20:c7:20:c8:20:c9:20:ca:20:cb:20:cc:20:cd:20:ce:20:cf:20:d0:20:d1:20:d2:20:d3:20:d4:20:d5:20:d6:20:d7:20:d8:20:d9:20:da:20:db:20:dc:20:dd:20:de:20:df:20	-	unexpected non-continuation byte 0x20 after start byte 0xc0
3.2.2 n "\xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef "	-	32	e0:20:e1:20:e2:20:e3:20:e4:20:e5:20:e6:20:e7:20:e8:20:e9:20:ea:20:eb:20:ec:20:ed:20:ee:20:ef:20	-	unexpected non-continuation byte 0x20 after start byte 0xe0
3.2.3 n "\xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 "	-	16	f0:20:f1:20:f2:20:f3:20:f4:20:f5:20:f6:20:f7:20	-	unexpected non-continuation byte 0x20 after start byte 0xf0
3.2.4 n "\xf8 \xf9 \xfa \xfb "		-	8	f8:20:f9:20:fa:20:fb:20	-	unexpected non-continuation byte 0x20 after start byte 0xf8
3.2.5 n "\xfc \xfd "			-	4	fc:20:fd:20	-	unexpected non-continuation byte 0x20 after start byte 0xfc
3.3	Sequences with last continuation byte missing
3.3.1 n "\xc0"			-	1	c0	-	1 byte, need 2
3.3.2 n "\xe0\x80"			-	2	e0:80	-	2 bytes, need 3
3.3.3 n "\xf0\x80\x80"		-	3	f0:80:80	-	3 bytes, need 4
3.3.4 n "\xf8\x80\x80\x80"		-	4	f8:80:80:80	-	4 bytes, need 5
3.3.5 n "\xfc\x80\x80\x80\x80"	-	5	fc:80:80:80:80	-	5 bytes, need 6
3.3.6 n "\xdf"			-	1	df	-	1 byte, need 2
3.3.7 n "\xef\xbf"			-	2	ef:bf	-	2 bytes, need 3
3.3.8 n "\xf7\xbf\xbf"			-	3	f7:bf:bf	-	3 bytes, need 4
3.3.9 n "\xfb\xbf\xbf\xbf"			-	4	fb:bf:bf:bf	-	4 bytes, need 5
3.3.10 n "\xfd\xbf\xbf\xbf\xbf"		-	5	fd:bf:bf:bf:bf	-	5 bytes, need 6
3.4	Concatenation of incomplete sequences
3.4.1 n "\xc0\xe0\x80\xf0\x80\x80\xf8\x80\x80\x80\xfc\x80\x80\x80\x80\xdf\xef\xbf\xf7\xbf\xbf\xfb\xbf\xbf\xbf\xfd\xbf\xbf\xbf\xbf"	-	30	c0:e0:80:f0:80:80:f8:80:80:80:fc:80:80:80:80:df:ef:bf:f7:bf:bf:fb:bf:bf:bf:fd:bf:bf:bf:bf	-	unexpected non-continuation byte 0xe0 after start byte 0xc0
3.5	Impossible bytes
3.5.1 n "\xfe"			-	1	fe	-	byte 0xfe
3.5.2 n "\xff"			-	1	ff	-	byte 0xff
3.5.3 n "\xfe\xfe\xff\xff"			-	4	fe:fe:ff:ff	-	byte 0xfe
4	Overlong sequences
4.1	Examples of an overlong ASCII character
4.1.1 n "\xc0\xaf"			-	2	c0:af	-	2 bytes, need 1
4.1.2 n "\xe0\x80\xaf"		-	3	e0:80:af	-	3 bytes, need 1
4.1.3 n "\xf0\x80\x80\xaf"		-	4	f0:80:80:af	-	4 bytes, need 1
4.1.4 n "\xf8\x80\x80\x80\xaf"	-	5	f8:80:80:80:af	-	5 bytes, need 1
4.1.5 n "\xfc\x80\x80\x80\x80\xaf"	-	6	fc:80:80:80:80:af	-	6 bytes, need 1
4.2	Maximum overlong sequences
4.2.1 n "\xc1\xbf"			-	2	c1:bf	-	2 bytes, need 1
4.2.2 n "\xe0\x9f\xbf"		-	3	e0:9f:bf	-	3 bytes, need 2
4.2.3 n "\xf0\x8f\xbf\xbf"		-	4	f0:8f:bf:bf	-	4 bytes, need 3
4.2.4 n "\xf8\x87\xbf\xbf\xbf"		-	5	f8:87:bf:bf:bf	-	5 bytes, need 4
4.2.5 n "\xfc\x83\xbf\xbf\xbf\xbf"		-	6	fc:83:bf:bf:bf:bf	-	6 bytes, need 5
4.3	Overlong representation of the NUL character
4.3.1 n "\xc0\x80"			-	2	c0:80	-	2 bytes, need 1
4.3.2 n "\xe0\x80\x80"		-	3	e0:80:80	-	3 bytes, need 1
4.3.3 n "\xf0\x80\x80\x80"		-	4	f0:80:80:80	-	4 bytes, need 1
4.3.4 n "\xf8\x80\x80\x80\x80"	-	5	f8:80:80:80:80	-	5 bytes, need 1
4.3.5 n "\xfc\x80\x80\x80\x80\x80"	-	6	fc:80:80:80:80:80	-	6 bytes, need 1
5	Illegal code positions
5.1	Single UTF-16 surrogates
5.1.1 n "\xed\xa0\x80"		-	3	ed:a0:80	-	UTF-16 surrogate 0xd800
5.1.2 n "\xed\xad\xbf"			-	3	ed:ad:bf	-	UTF-16 surrogate 0xdb7f
5.1.3 n "\xed\xae\x80"		-	3	ed:ae:80	-	UTF-16 surrogate 0xdb80
5.1.4 n "\xed\xaf\xbf"			-	3	ed:af:bf	-	UTF-16 surrogate 0xdbff
5.1.5 n "\xed\xb0\x80"		-	3	ed:b0:80	-	UTF-16 surrogate 0xdc00
5.1.6 n "\xed\xbe\x80"		-	3	ed:be:80	-	UTF-16 surrogate 0xdf80
5.1.7 n "\xed\xbf\xbf"			-	3	ed:bf:bf	-	UTF-16 surrogate 0xdfff
5.2	Paired UTF-16 surrogates
5.2.1 n "\xed\xa0\x80\xed\xb0\x80"		-	6	ed:a0:80:ed:b0:80	-	UTF-16 surrogate 0xd800
5.2.2 n "\xed\xa0\x80\xed\xbf\xbf"		-	6	ed:a0:80:ed:bf:bf	-	UTF-16 surrogate 0xd800
5.2.3 n "\xed\xad\xbf\xed\xb0\x80"		-	6	ed:ad:bf:ed:b0:80	-	UTF-16 surrogate 0xdb7f
5.2.4 n "\xed\xad\xbf\xed\xbf\xbf"		-	6	ed:ad:bf:ed:bf:bf	-	UTF-16 surrogate 0xdb7f
5.2.5 n "\xed\xae\x80\xed\xb0\x80"		-	6	ed:ae:80:ed:b0:80	-	UTF-16 surrogate 0xdb80
5.2.6 n "\xed\xae\x80\xed\xbf\xbf"		-	6	ed:ae:80:ed:bf:bf	-	UTF-16 surrogate 0xdb80
5.2.7 n "\xed\xaf\xbf\xed\xb0\x80"		-	6	ed:af:bf:ed:b0:80	-	UTF-16 surrogate 0xdbff
5.2.8 n "\xed\xaf\xbf\xed\xbf\xbf"		-	6	ed:af:bf:ed:bf:bf	-	UTF-16 surrogate 0xdbff
5.3	Other illegal code positions
5.3.1 n "\xef\xbf\xbe"			-	3	ef:bf:be	-	byte order mark 0xfffe
# The ffff is illegal unless UTF8_ALLOW_FFFF
5.3.2 n "\xef\xbf\xbf"			-	3	ef:bf:bf	-	character 0xffff
__EOMK__
136*0Sstevel@tonic-gate
# Tests 1..78: decode the string of every data row in @MK with
# unpack('U0U*') and check the warnings against the row's expectation.
# A 'y' row must decode without any warning.  An 'n' row must raise a
# warning which, when the row names an expected message, contains that
# message.
{
    my $id;                 # id of the current row; labels every diagnostic
    my $warnings = '';      # text of all warnings raised by the current unpack()

    # Echo every warning as a TAP comment and collect its text.  The text
    # is appended rather than overwritten so that the expected message is
    # still found if a single row raises more than one warning.
    local $SIG{__WARN__} = sub {
        print "# $id: @_";
        $warnings .= "@_";
    };

    # Print a diagnostic about the current row.  It is prefixed with "# "
    # so that it is a TAP comment, like the warning echo above.
    sub moan {
        print "# $id: @_";
    }

    # Decode $_[0] with unpack('U0U*') and return the text of the warnings
    # collected while doing so, or '' if there were none.
    sub warn_unpack_U {
        $warnings = '';
        my @null = unpack('U0U*', $_[0]);
        return $warnings;
    }

    # Return true if the outcome of a row is wrong.
    #   $okay   - 'y' if the string must decode silently, 'n' if it must warn
    #   $warn   - warning text produced by the decode ('' if none)
    #   $experr - text the warning must contain (undef or '' for "anything")
    #
    # The original condition was "not $warn || (...)".  Because "not" binds
    # more loosely than "||", that meant "not ($warn || ...)", so an 'n' row
    # with an expected message could never fail.  $experr is matched as a
    # literal substring, not as a pattern.
    sub _row_fails {
        my ($okay, $warn, $experr) = @_;
        return $warn ? 1 : 0 if $okay eq 'y';
        return 1 unless $warn;
        return 0 unless defined $experr && $experr ne '';
        return $warn =~ /\Q$experr\E/ ? 0 : 1;
    }

    # Layout of a data row.  Captures 7 and 8 are separate so that the
    # character count and the expected message do not run together.
    my $row_re = qr/
        ^ (\d+\.\d+\.\d+[bu]?)                  # 1: row id
        \s+ ([yn])                              # 2: y = must decode silently
        \s+ "(.+)"                              # 3: string under test
        \s+ ([0-9a-f]{1,8}|-)                   # 4: code point in hex, or a dash
        \s+ (\d+)                               # 5: length in bytes
        \s+ ([0-9a-f]{2}(?::[0-9a-f]{2})*)      # 6: bytes as colon-separated hex
        (?: \s+ (\d+|-)                         # 7: length in characters, or a dash
            (?: \s+ (.+) )? )?                  # 8: expected warning text
        $
    /x;

    # NOTE(review): the expected-message column was never actually compared
    # before the precedence fix above.  Confirm that its wording matches the
    # messages this perl emits before trusting new failures.
    for my $row (@MK) {
        # Section headings and comment lines carry no test.
        next if $row =~ /^(?:\d+(?:\.\d+)?)\s/ || $row =~ /^#/;

        my @field = $row =~ $row_re;
        unless (@field) {
            moan "unknown format\n";
            next;
        }

        $id = $field[0];
        my ($okay, $bytes, $byteslen, $hex, $experr) = @field[1, 2, 4, 5, 7];

        # Sanity-check the row itself: the hex dump and the string must
        # both have the stated number of bytes.
        my @hex  = split(/:/, $hex);
        my $nhex = @hex;
        unless ($nhex == $byteslen) {
            moan "amount of hex ($nhex) not equal to byteslen ($byteslen)\n";
        }
        {
            use bytes;
            my $bytesbyteslen = length($bytes);
            unless ($bytesbyteslen == $byteslen) {
                moan "bytes length() ($bytesbyteslen) not equal to $byteslen\n";
            }
        }

        my $warn = warn_unpack_U($bytes);
        if (_row_fails($okay, $warn, $experr)) {
            moan($okay eq 'y' ? "unpack('U0U*') false negative\n"
                              : "unpack('U0U*') false positive\n");
            print "not ";
        }
        print "ok $test # $id $okay\n";
        $test++;
    }
}
194