xref: /onnv-gate/usr/src/cmd/perl/5.8.4/distrib/t/op/utf8decode.t (revision 0:68f95e015346)
1#!./perl
2
BEGIN {
    # If the current directory has a 't' subdirectory, move into it.
    if (-d 't') {
        chdir 't';
    }

    # Replace (not extend) the module search path with ../lib.
    @INC = ('../lib');
}
7
{
    # Look at the first byte of the internal encoding of character 256 to
    # decide whether this test applies on the current platform.
    my $probe = v256;
    use bytes;    # from here on ord() sees bytes, not characters
    my $lead = ord($probe);
    printf "# under use bytes ord(v256) = 0x%02x\n", $lead;

    # A lead byte of 0x8c (140) is reported as UTF-EBCDIC: skip everything.
    if ($lead == 0x8c) {
        print "1..0 # Skip: UTF-EBCDIC (not UTF-8) used here\n";
        exit 0;
    }

    # 0xc4 (196) is the expected lead byte; anything else is only noted as a
    # diagnostic and the tests still run.
    printf "# v256 starts with 0x%02x\n", $lead unless $lead == 0xc4;
}
21
no utf8;

# Number of the next test to report; bumped once per table row tested.
my $test = 1;

# TAP plan: one test for every numbered x.y.z row of the table below.
print "1..78\n";
27
28# This table is based on Markus Kuhn's UTF-8 Decode Stress Tester,
29# http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt,
30# version dated 2000-09-02.
31
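# We use the \x notation instead of raw binary bytes for \x00-\x1f\x7f-\xff
# because e.g. many patch programs have issues with binary data.
#
# Each numbered x.y.z row holds whitespace-separated columns: the test id,
# y or n (whether the bytes are expected to decode without a warning), the
# bytes as a double-quoted string, the code point in hex (or -), the length
# in bytes, the same bytes as colon-separated hex, the length in characters
# (or -) and, for "n" rows, the expected warning text.  Rows holding only a
# section number and title, and rows starting with "#", are skipped.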
34
35my @MK = split(/\n/, <<__EOMK__);
361	Correct UTF-8
371.1.1 y "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5"	-		11	ce:ba:e1:bd:b9:cf:83:ce:bc:ce:b5	5
382	Boundary conditions
392.1	First possible sequence of certain length
402.1.1 y "\x00"			0		1	00	1
412.1.2 y "\xc2\x80"			80		2	c2:80	1
422.1.3 y "\xe0\xa0\x80"		800		3	e0:a0:80	1
432.1.4 y "\xf0\x90\x80\x80"		10000		4	f0:90:80:80	1
442.1.5 y "\xf8\x88\x80\x80\x80"	200000		5	f8:88:80:80:80	1
452.1.6 y "\xfc\x84\x80\x80\x80\x80"	4000000		6	fc:84:80:80:80:80	1
462.2	Last possible sequence of certain length
472.2.1 y "\x7f"			7f		1	7f	1
482.2.2 y "\xdf\xbf"			7ff		2	df:bf	1
49# The ffff is illegal unless UTF8_ALLOW_FFFF
502.2.3 n "\xef\xbf\xbf"			ffff		3	ef:bf:bf	1	character 0xffff
512.2.4 y "\xf7\xbf\xbf\xbf"			1fffff		4	f7:bf:bf:bf	1
522.2.5 y "\xfb\xbf\xbf\xbf\xbf"			3ffffff		5	fb:bf:bf:bf:bf	1
532.2.6 y "\xfd\xbf\xbf\xbf\xbf\xbf"		7fffffff	6	fd:bf:bf:bf:bf:bf	1
542.3	Other boundary conditions
552.3.1 y "\xed\x9f\xbf"		d7ff		3	ed:9f:bf	1
562.3.2 y "\xee\x80\x80"		e000		3	ee:80:80	1
572.3.3 y "\xef\xbf\xbd"			fffd		3	ef:bf:bd	1
582.3.4 y "\xf4\x8f\xbf\xbf"		10ffff		4	f4:8f:bf:bf	1
592.3.5 y "\xf4\x90\x80\x80"		110000		4	f4:90:80:80	1
603	Malformed sequences
613.1	Unexpected continuation bytes
623.1.1 n "\x80"			-		1	80	-	unexpected continuation byte 0x80
633.1.2 n "\xbf"			-		1	bf	-	unexpected continuation byte 0xbf
643.1.3 n "\x80\xbf"			-		2	80:bf	-	unexpected continuation byte 0x80
653.1.4 n "\x80\xbf\x80"		-		3	80:bf:80	-	unexpected continuation byte 0x80
663.1.5 n "\x80\xbf\x80\xbf"		-		4	80:bf:80:bf	-	unexpected continuation byte 0x80
673.1.6 n "\x80\xbf\x80\xbf\x80"	-		5	80:bf:80:bf:80	-	unexpected continuation byte 0x80
683.1.7 n "\x80\xbf\x80\xbf\x80\xbf"	-		6	80:bf:80:bf:80:bf	-	unexpected continuation byte 0x80
693.1.8 n "\x80\xbf\x80\xbf\x80\xbf\x80"	-		7	80:bf:80:bf:80:bf:80	-	unexpected continuation byte 0x80
703.1.9 n "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf"				-	64	80:81:82:83:84:85:86:87:88:89:8a:8b:8c:8d:8e:8f:90:91:92:93:94:95:96:97:98:99:9a:9b:9c:9d:9e:9f:a0:a1:a2:a3:a4:a5:a6:a7:a8:a9:aa:ab:ac:ad:ae:af:b0:b1:b2:b3:b4:b5:b6:b7:b8:b9:ba:bb:bc:bd:be:bf	-	unexpected continuation byte 0x80
713.2	Lonely start characters
723.2.1 n "\xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf "	-	64 	c0:20:c1:20:c2:20:c3:20:c4:20:c5:20:c6:20:c7:20:c8:20:c9:20:ca:20:cb:20:cc:20:cd:20:ce:20:cf:20:d0:20:d1:20:d2:20:d3:20:d4:20:d5:20:d6:20:d7:20:d8:20:d9:20:da:20:db:20:dc:20:dd:20:de:20:df:20	-	unexpected non-continuation byte 0x20 after start byte 0xc0
733.2.2 n "\xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef "	-	32	e0:20:e1:20:e2:20:e3:20:e4:20:e5:20:e6:20:e7:20:e8:20:e9:20:ea:20:eb:20:ec:20:ed:20:ee:20:ef:20	-	unexpected non-continuation byte 0x20 after start byte 0xe0
743.2.3 n "\xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 "	-	16	f0:20:f1:20:f2:20:f3:20:f4:20:f5:20:f6:20:f7:20	-	unexpected non-continuation byte 0x20 after start byte 0xf0
753.2.4 n "\xf8 \xf9 \xfa \xfb "		-	8	f8:20:f9:20:fa:20:fb:20	-	unexpected non-continuation byte 0x20 after start byte 0xf8
763.2.5 n "\xfc \xfd "			-	4	fc:20:fd:20	-	unexpected non-continuation byte 0x20 after start byte 0xfc
773.3	Sequences with last continuation byte missing
783.3.1 n "\xc0"			-	1	c0	-	1 byte, need 2
793.3.2 n "\xe0\x80"			-	2	e0:80	-	2 bytes, need 3
803.3.3 n "\xf0\x80\x80"		-	3	f0:80:80	-	3 bytes, need 4
813.3.4 n "\xf8\x80\x80\x80"		-	4	f8:80:80:80	-	4 bytes, need 5
823.3.5 n "\xfc\x80\x80\x80\x80"	-	5	fc:80:80:80:80	-	5 bytes, need 6
833.3.6 n "\xdf"			-	1	df	-	1 byte, need 2
843.3.7 n "\xef\xbf"			-	2	ef:bf	-	2 bytes, need 3
853.3.8 n "\xf7\xbf\xbf"			-	3	f7:bf:bf	-	3 bytes, need 4
863.3.9 n "\xfb\xbf\xbf\xbf"			-	4	fb:bf:bf:bf	-	4 bytes, need 5
873.3.10 n "\xfd\xbf\xbf\xbf\xbf"		-	5	fd:bf:bf:bf:bf	-	5 bytes, need 6
883.4	Concatenation of incomplete sequences
893.4.1 n "\xc0\xe0\x80\xf0\x80\x80\xf8\x80\x80\x80\xfc\x80\x80\x80\x80\xdf\xef\xbf\xf7\xbf\xbf\xfb\xbf\xbf\xbf\xfd\xbf\xbf\xbf\xbf"	-	30	c0:e0:80:f0:80:80:f8:80:80:80:fc:80:80:80:80:df:ef:bf:f7:bf:bf:fb:bf:bf:bf:fd:bf:bf:bf:bf	-	unexpected non-continuation byte 0xe0 after start byte 0xc0
903.5	Impossible bytes
913.5.1 n "\xfe"			-	1	fe	-	byte 0xfe
923.5.2 n "\xff"			-	1	ff	-	byte 0xff
933.5.3 n "\xfe\xfe\xff\xff"			-	4	fe:fe:ff:ff	-	byte 0xfe
944	Overlong sequences
954.1	Examples of an overlong ASCII character
964.1.1 n "\xc0\xaf"			-	2	c0:af	-	2 bytes, need 1
974.1.2 n "\xe0\x80\xaf"		-	3	e0:80:af	-	3 bytes, need 1
984.1.3 n "\xf0\x80\x80\xaf"		-	4	f0:80:80:af	-	4 bytes, need 1
994.1.4 n "\xf8\x80\x80\x80\xaf"	-	5	f8:80:80:80:af	-	5 bytes, need 1
1004.1.5 n "\xfc\x80\x80\x80\x80\xaf"	-	6	fc:80:80:80:80:af	-	6 bytes, need 1
1014.2	Maximum overlong sequences
1024.2.1 n "\xc1\xbf"			-	2	c1:bf	-	2 bytes, need 1
1034.2.2 n "\xe0\x9f\xbf"		-	3	e0:9f:bf	-	3 bytes, need 2
1044.2.3 n "\xf0\x8f\xbf\xbf"		-	4	f0:8f:bf:bf	-	4 bytes, need 3
1054.2.4 n "\xf8\x87\xbf\xbf\xbf"		-	5	f8:87:bf:bf:bf	-	5 bytes, need 4
1064.2.5 n "\xfc\x83\xbf\xbf\xbf\xbf"		-	6	fc:83:bf:bf:bf:bf	-	6 bytes, need 5
1074.3	Overlong representation of the NUL character
1084.3.1 n "\xc0\x80"			-	2	c0:80	-	2 bytes, need 1
1094.3.2 n "\xe0\x80\x80"		-	3	e0:80:80	-	3 bytes, need 1
1104.3.3 n "\xf0\x80\x80\x80"		-	4	f0:80:80:80	-	4 bytes, need 1
1114.3.4 n "\xf8\x80\x80\x80\x80"	-	5	f8:80:80:80:80	-	5 bytes, need 1
1124.3.5 n "\xfc\x80\x80\x80\x80\x80"	-	6	fc:80:80:80:80:80	-	6 bytes, need 1
1135	Illegal code positions
1145.1	Single UTF-16 surrogates
1155.1.1 n "\xed\xa0\x80"		-	3	ed:a0:80	-	UTF-16 surrogate 0xd800
1165.1.2 n "\xed\xad\xbf"			-	3	ed:ad:bf	-	UTF-16 surrogate 0xdb7f
1175.1.3 n "\xed\xae\x80"		-	3	ed:ae:80	-	UTF-16 surrogate 0xdb80
1185.1.4 n "\xed\xaf\xbf"			-	3	ed:af:bf	-	UTF-16 surrogate 0xdbff
1195.1.5 n "\xed\xb0\x80"		-	3	ed:b0:80	-	UTF-16 surrogate 0xdc00
1205.1.6 n "\xed\xbe\x80"		-	3	ed:be:80	-	UTF-16 surrogate 0xdf80
1215.1.7 n "\xed\xbf\xbf"			-	3	ed:bf:bf	-	UTF-16 surrogate 0xdfff
1225.2	Paired UTF-16 surrogates
1235.2.1 n "\xed\xa0\x80\xed\xb0\x80"		-	6	ed:a0:80:ed:b0:80	-	UTF-16 surrogate 0xd800
1245.2.2 n "\xed\xa0\x80\xed\xbf\xbf"		-	6	ed:a0:80:ed:bf:bf	-	UTF-16 surrogate 0xd800
1255.2.3 n "\xed\xad\xbf\xed\xb0\x80"		-	6	ed:ad:bf:ed:b0:80	-	UTF-16 surrogate 0xdb7f
1265.2.4 n "\xed\xad\xbf\xed\xbf\xbf"		-	6	ed:ad:bf:ed:bf:bf	-	UTF-16 surrogate 0xdb7f
1275.2.5 n "\xed\xae\x80\xed\xb0\x80"		-	6	ed:ae:80:ed:b0:80	-	UTF-16 surrogate 0xdb80
1285.2.6 n "\xed\xae\x80\xed\xbf\xbf"		-	6	ed:ae:80:ed:bf:bf	-	UTF-16 surrogate 0xdb80
1295.2.7 n "\xed\xaf\xbf\xed\xb0\x80"		-	6	ed:af:bf:ed:b0:80	-	UTF-16 surrogate 0xdbff
1305.2.8 n "\xed\xaf\xbf\xed\xbf\xbf"		-	6	ed:af:bf:ed:bf:bf	-	UTF-16 surrogate 0xdbff
1315.3	Other illegal code positions
1325.3.1 n "\xef\xbf\xbe"			-	3	ef:bf:be	-	byte order mark 0xfffe
133# The ffff is illegal unless UTF8_ALLOW_FFFF
1345.3.2 n "\xef\xbf\xbf"			-	3	ef:bf:bf	-	character 0xffff
135__EOMK__
136
# Tests 1..78: one test per numbered x.y.z row of @MK.
#
# Each row's bytes are decoded with unpack('U0U*') while a __WARN__ hook is
# installed.  A "y" row passes when decoding raises no warning; an "n" row
# passes when decoding raises a warning containing the row's expected text.
{
    my $id;    # id of the row being tested; read by moan() and by the hook

    # Echo every warning as a TAP comment and append it to $@, which
    # warn_unpack_U() clears before each decode and returns afterwards.
    # Warnings are accumulated rather than overwritten because a row can
    # hold several malformed sequences while its expected text names the
    # first offending byte (3.1.3 is 80:bf and expects "... byte 0x80").
    local $SIG{__WARN__} = sub {
        print "# $id: @_";
        $@ .= "@_";
    };

    # Print a diagnostic prefixed with the id of the current row.
    sub moan {
        print "$id: @_";
    }

    # Decode $_[0] with unpack('U0U*') and return the text of all warnings
    # raised meanwhile ('' when it decoded silently).
    sub warn_unpack_U {
        $@ = '';
        my @null = unpack('U0U*', $_[0]);
        return $@;
    }

    # Layout of a test row.  Group 7 used to wrap both the character length
    # and the warning text; it now holds the character length only.
    my $row_re = qr{^
        (\d+\.\d+\.\d+[bu]?)            \s+   # 1: row id
        ([yn])                          \s+   # 2: y or n
        "(.+)"                          \s+   # 3: bytes to decode
        ([0-9a-f]{1,8}|-)               \s+   # 4: code point in hex, or -
        (\d+)                           \s+   # 5: length in bytes
        ([0-9a-f]{2}(?::[0-9a-f]{2})*)        # 6: bytes as colon-separated hex
        (?: \s+ (\d+|-)                       # 7: length in characters, or -
            (?: \s+ (.+) )?                   # 8: expected warning text
        )?
        $
    }x;

    for my $row (@MK) {
        # Section headings ("3", "3.1") and "#" lines carry no test.
        next if $row =~ /^(?:\d+(?:\.\d+)?)\s/ || $row =~ /^#/;

        my @field = $row =~ $row_re;
        unless (@field) {
            # $id still holds the id of the last row that did parse.
            moan "unknown format\n";
            next;
        }

        $id = $field[0];
        my ($okay, $bytes, $byteslen, $hex, $experr) = @field[1, 2, 4, 5, 7];

        # Sanity-check the row itself: the hex column and the quoted bytes
        # must both agree with the declared byte length.  A mismatch is
        # reported but does not fail the test.
        my @hex = split(/:/, $hex);
        unless (@hex == $byteslen) {
            my $nhex = @hex;
            moan "amount of hex ($nhex) not equal to byteslen ($byteslen)\n";
        }
        {
            use bytes;
            my $bytesbyteslen = length($bytes);
            unless ($bytesbyteslen == $byteslen) {
                moan "bytes length() ($bytesbyteslen) not equal to $byteslen\n";
            }
        }

        my $warn = warn_unpack_U($bytes);
        if ($okay eq 'y') {
            if ($warn) {
                moan "unpack('U0U*') false negative\n";
                print "not ";
            }
        }
        elsif ($okay eq 'n') {
            # This used to read "not $warn || (...)".  Because "not" binds
            # more loosely than "||", that meant "not ($warn || ...)", which
            # is false for every row that has expected text, so an "n" row
            # could never fail.  The expected text is matched literally.
            # NOTE(review): rows whose expected text does not match the
            # interpreter's wording were passing unnoticed and will now
            # fail; correct the table entry when that happens.
            if (!$warn
                || (defined $experr && $experr ne ''
                    && $warn !~ /\Q$experr\E/)) {
                moan "unpack('U0U*') false positive\n";
                print "not ";
            }
        }
        print "ok $test # $id $okay\n";
        $test++;
    }
}
194