xref: /openbsd-src/gnu/usr.bin/perl/t/op/utf8decode.t (revision b2ea75c1b17e1a9a339660e7ed45cd24946b230e)
1#!./perl
2
# Bootstrap: switch into t/ when it exists below the current directory and
# restrict the module search path to ../lib.
BEGIN {
    if (-d 't') {
        chdir 't';
    }
    @INC = ('../lib');
}

# Make sure the utf8 pragma is off for this file.
no utf8;

# Test plan: one test per entry of the table below.
print "1..78\n";

# Number of the next test to be reported.
my $test = 1;
13
14# This table is based on Markus Kuhn's UTF-8 Decode Stress Tester,
15# http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt,
16# version dated 2000-09-02.
17
18# We use the \x notation instead of raw binary bytes for \x00-\x1f\x7f-\xff
19# because e.g. many patch programs have issues with binary data.
20
21my @MK = split(/\n/, <<__EOMK__);
221	Correct UTF-8
231.1.1 y "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5"	-		11	ce:ba:e1:bd:b9:cf:83:ce:bc:ce:b5	5
242	Boundary conditions 
252.1	First possible sequence of certain length
262.1.1 y "\x00"			0		1	00	1
272.1.2 y "\xc2\x80"			80		2	c2:80	1
282.1.3 y "\xe0\xa0\x80"		800		3	e0:a0:80	1
292.1.4 y "\xf0\x90\x80\x80"		10000		4	f0:90:80:80	1
302.1.5 y "\xf8\x88\x80\x80\x80"	200000		5	f8:88:80:80:80	1
312.1.6 y "\xfc\x84\x80\x80\x80\x80"	4000000		6	fc:84:80:80:80:80	1
322.2	Last possible sequence of certain length
332.2.1 y "\x7f"			7f		1	7f	1
342.2.2 y "\xdf\xbf"			7ff		2	df:bf	1
35# The ffff is illegal unless UTF8_ALLOW_FFFF
362.2.3 n "\xef\xbf\xbf"			ffff		3	ef:bf:bf	1	character 0xffff
372.2.4 y "\xf7\xbf\xbf\xbf"			1fffff		4	f7:bf:bf:bf	1
382.2.5 y "\xfb\xbf\xbf\xbf\xbf"			3ffffff		5	fb:bf:bf:bf:bf	1
392.2.6 y "\xfd\xbf\xbf\xbf\xbf\xbf"		7fffffff	6	fd:bf:bf:bf:bf:bf	1
402.3	Other boundary conditions
412.3.1 y "\xed\x9f\xbf"		d7ff		3	ed:9f:bf	1
422.3.2 y "\xee\x80\x80"		e000		3	ee:80:80	1
432.3.3 y "\xef\xbf\xbd"			fffd		3	ef:bf:bd	1
442.3.4 y "\xf4\x8f\xbf\xbf"		10ffff		4	f4:8f:bf:bf	1
452.3.5 y "\xf4\x90\x80\x80"		110000		4	f4:90:80:80	1
463	Malformed sequences
473.1	Unexpected continuation bytes
483.1.1 n "\x80"			-		1	80	-	unexpected continuation byte 0x80
493.1.2 n "\xbf"			-		1	bf	-	unexpected continuation byte 0xbf
503.1.3 n "\x80\xbf"			-		2	80:bf	-	unexpected continuation byte 0x80
513.1.4 n "\x80\xbf\x80"		-		3	80:bf:80	-	unexpected continuation byte 0x80
523.1.5 n "\x80\xbf\x80\xbf"		-		4	80:bf:80:bf	-	unexpected continuation byte 0x80
533.1.6 n "\x80\xbf\x80\xbf\x80"	-		5	80:bf:80:bf:80	-	unexpected continuation byte 0x80
543.1.7 n "\x80\xbf\x80\xbf\x80\xbf"	-		6	80:bf:80:bf:80:bf	-	unexpected continuation byte 0x80
553.1.8 n "\x80\xbf\x80\xbf\x80\xbf\x80"	-		7	80:bf:80:bf:80:bf:80	-	unexpected continuation byte 0x80
563.1.9 n "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf"				-	64	80:81:82:83:84:85:86:87:88:89:8a:8b:8c:8d:8e:8f:90:91:92:93:94:95:96:97:98:99:9a:9b:9c:9d:9e:9f:a0:a1:a2:a3:a4:a5:a6:a7:a8:a9:aa:ab:ac:ad:ae:af:b0:b1:b2:b3:b4:b5:b6:b7:b8:b9:ba:bb:bc:bd:be:bf	-	unexpected continuation byte 0x80
573.2	Lonely start characters
583.2.1 n "\xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf "	-	64 	c0:20:c1:20:c2:20:c3:20:c4:20:c5:20:c6:20:c7:20:c8:20:c9:20:ca:20:cb:20:cc:20:cd:20:ce:20:cf:20:d0:20:d1:20:d2:20:d3:20:d4:20:d5:20:d6:20:d7:20:d8:20:d9:20:da:20:db:20:dc:20:dd:20:de:20:df:20	-	unexpected non-continuation byte 0x20 after start byte 0xc0
593.2.2 n "\xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef "	-	32	e0:20:e1:20:e2:20:e3:20:e4:20:e5:20:e6:20:e7:20:e8:20:e9:20:ea:20:eb:20:ec:20:ed:20:ee:20:ef:20	-	unexpected non-continuation byte 0x20 after start byte 0xe0
603.2.3 n "\xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 "	-	16	f0:20:f1:20:f2:20:f3:20:f4:20:f5:20:f6:20:f7:20	-	unexpected non-continuation byte 0x20 after start byte 0xf0
613.2.4 n "\xf8 \xf9 \xfa \xfb "		-	8	f8:20:f9:20:fa:20:fb:20	-	unexpected non-continuation byte 0x20 after start byte 0xf8
623.2.5 n "\xfc \xfd "			-	4	fc:20:fd:20	-	unexpected non-continuation byte 0x20 after start byte 0xfc
633.3	Sequences with last continuation byte missing
643.3.1 n "\xc0"			-	1	c0	-	1 byte, need 2
653.3.2 n "\xe0\x80"			-	2	e0:80	-	2 bytes, need 3
663.3.3 n "\xf0\x80\x80"		-	3	f0:80:80	-	3 bytes, need 4
673.3.4 n "\xf8\x80\x80\x80"		-	4	f8:80:80:80	-	4 bytes, need 5
683.3.5 n "\xfc\x80\x80\x80\x80"	-	5	fc:80:80:80:80	-	5 bytes, need 6
693.3.6 n "\xdf"			-	1	df	-	1 byte, need 2
703.3.7 n "\xef\xbf"			-	2	ef:bf	-	2 bytes, need 3
713.3.8 n "\xf7\xbf\xbf"			-	3	f7:bf:bf	-	3 bytes, need 4
723.3.9 n "\xfb\xbf\xbf\xbf"			-	4	fb:bf:bf:bf	-	4 bytes, need 5
733.3.10 n "\xfd\xbf\xbf\xbf\xbf"		-	5	fd:bf:bf:bf:bf	-	5 bytes, need 6
743.4	Concatenation of incomplete sequences
753.4.1 n "\xc0\xe0\x80\xf0\x80\x80\xf8\x80\x80\x80\xfc\x80\x80\x80\x80\xdf\xef\xbf\xf7\xbf\xbf\xfb\xbf\xbf\xbf\xfd\xbf\xbf\xbf\xbf"	-	30	c0:e0:80:f0:80:80:f8:80:80:80:fc:80:80:80:80:df:ef:bf:f7:bf:bf:fb:bf:bf:bf:fd:bf:bf:bf:bf	-	unexpected non-continuation byte 0xe0 after start byte 0xc0
763.5	Impossible bytes
773.5.1 n "\xfe"			-	1	fe	-	byte 0xfe
783.5.2 n "\xff"			-	1	ff	-	byte 0xff
793.5.3 n "\xfe\xfe\xff\xff"			-	4	fe:fe:ff:ff	-	byte 0xfe
804	Overlong sequences
814.1	Examples of an overlong ASCII character
824.1.1 n "\xc0\xaf"			-	2	c0:af	-	2 bytes, need 1
834.1.2 n "\xe0\x80\xaf"		-	3	e0:80:af	-	3 bytes, need 1
844.1.3 n "\xf0\x80\x80\xaf"		-	4	f0:80:80:af	-	4 bytes, need 1
854.1.4 n "\xf8\x80\x80\x80\xaf"	-	5	f8:80:80:80:af	-	5 bytes, need 1
864.1.5 n "\xfc\x80\x80\x80\x80\xaf"	-	6	fc:80:80:80:80:af	-	6 bytes, need 1
874.2	Maximum overlong sequences
884.2.1 n "\xc1\xbf"			-	2	c1:bf	-	2 bytes, need 1
894.2.2 n "\xe0\x9f\xbf"		-	3	e0:9f:bf	-	3 bytes, need 2
904.2.3 n "\xf0\x8f\xbf\xbf"		-	4	f0:8f:bf:bf	-	4 bytes, need 3
914.2.4 n "\xf8\x87\xbf\xbf\xbf"		-	5	f8:87:bf:bf:bf	-	5 bytes, need 4
924.2.5 n "\xfc\x83\xbf\xbf\xbf\xbf"		-	6	fc:83:bf:bf:bf:bf	-	6 bytes, need 5
934.3	Overlong representation of the NUL character
944.3.1 n "\xc0\x80"			-	2	c0:80	-	2 bytes, need 1
954.3.2 n "\xe0\x80\x80"		-	3	e0:80:80	-	3 bytes, need 1
964.3.3 n "\xf0\x80\x80\x80"		-	4	f0:80:80:80	-	4 bytes, need 1
974.3.4 n "\xf8\x80\x80\x80\x80"	-	5	f8:80:80:80:80	-	5 bytes, need 1
984.3.5 n "\xfc\x80\x80\x80\x80\x80"	-	6	fc:80:80:80:80:80	-	6 bytes, need 1
995	Illegal code positions
1005.1	Single UTF-16 surrogates
1015.1.1 n "\xed\xa0\x80"		-	3	ed:a0:80	-	UTF-16 surrogate 0xd800
1025.1.2 n "\xed\xad\xbf"			-	3	ed:ad:bf	-	UTF-16 surrogate 0xdb7f
1035.1.3 n "\xed\xae\x80"		-	3	ed:ae:80	-	UTF-16 surrogate 0xdb80
1045.1.4 n "\xed\xaf\xbf"			-	3	ed:af:bf	-	UTF-16 surrogate 0xdbff
1055.1.5 n "\xed\xb0\x80"		-	3	ed:b0:80	-	UTF-16 surrogate 0xdc00
1065.1.6 n "\xed\xbe\x80"		-	3	ed:be:80	-	UTF-16 surrogate 0xdf80
1075.1.7 n "\xed\xbf\xbf"			-	3	ed:bf:bf	-	UTF-16 surrogate 0xdfff
1085.2	Paired UTF-16 surrogates
1095.2.1 n "\xed\xa0\x80\xed\xb0\x80"		-	6	ed:a0:80:ed:b0:80	-	UTF-16 surrogate 0xd800
1105.2.2 n "\xed\xa0\x80\xed\xbf\xbf"		-	6	ed:a0:80:ed:bf:bf	-	UTF-16 surrogate 0xd800
1115.2.3 n "\xed\xad\xbf\xed\xb0\x80"		-	6	ed:ad:bf:ed:b0:80	-	UTF-16 surrogate 0xdb7f
1125.2.4 n "\xed\xad\xbf\xed\xbf\xbf"		-	6	ed:ad:bf:ed:bf:bf	-	UTF-16 surrogate 0xdb7f
1135.2.5 n "\xed\xae\x80\xed\xb0\x80"		-	6	ed:ae:80:ed:b0:80	-	UTF-16 surrogate 0xdb80
1145.2.6 n "\xed\xae\x80\xed\xbf\xbf"		-	6	ed:ae:80:ed:bf:bf	-	UTF-16 surrogate 0xdb80
1155.2.7 n "\xed\xaf\xbf\xed\xb0\x80"		-	6	ed:af:bf:ed:b0:80	-	UTF-16 surrogate 0xdbff
1165.2.8 n "\xed\xaf\xbf\xed\xbf\xbf"		-	6	ed:af:bf:ed:bf:bf	-	UTF-16 surrogate 0xdbff
1175.3	Other illegal code positions
1185.3.1 n "\xef\xbf\xbe"			-	3	ef:bf:be	-	byte order mark 0xfffe
119# The ffff is illegal unless UTF8_ALLOW_FFFF
1205.3.2 n "\xef\xbf\xbf"			-	3	ef:bf:bf	-	character 0xffff
121__EOMK__
122
# Tests 1..78: one per entry of the table above.
# Driver: feed every entry of @MK to unpack('U*') and report one test per
# entry.  An entry flagged "y" must decode without any warning; an entry
# flagged "n" must raise a warning which, when the entry names an expected
# text, contains that text.
{
    my $WARNCNT;    # warnings seen since the last test_unpack_U() call
    my $WARNMSG;    # text of the most recent warning
    my $id;         # id of the table entry currently being processed

    # Echo each warning as a diagnostic line and record it for the checks
    # made after unpack() returns.
    local $SIG{__WARN__} = sub {
        print "# $id: @_";
        $WARNCNT++;
        $WARNMSG = "@_";
    };

    # Print a diagnostic tagged with the current entry id.  The leading
    # "# " keeps the line a comment in the test output stream instead of
    # an unrecognised line.
    sub moan {
        print "# $id: @_";
    }

    # Reset the warning bookkeeping, then decode the byte string given as
    # the first argument.  Only the warnings raised matter; the decoded
    # values are discarded.
    sub test_unpack_U {
        $WARNCNT = 0;
        $WARNMSG = "";
        unpack('U*', $_[0]);
    }

    # Layout of one table entry.  Columns 7 and 8 are optional.
    my $entry_re = qr{
        ^
        (\d+\.\d+\.\d+[bu]?)            \s+   # 1: test id, such as 3.1.2
        ([yn])                          \s+   # 2: y = no warning allowed, n = must warn
        "(.+)"                          \s+   # 3: the byte string under test
        ([0-9a-f]{1,8}|-)               \s+   # 4: code point in hex, or -
        (\d+)                           \s+   # 5: length in bytes
        ([0-9a-f]{2}(?::[0-9a-f]{2})*)        # 6: the bytes as colon-separated hex
        (?:
            \s+ (\d+|-)                       # 7: length in characters, or -
            (?: \s+ (.+) )?                   # 8: text the warning has to contain
        )?
        $
    }x;

    for my $line (@MK) {
        # Section headings ("3", "3.1") and comment lines carry no test.
        next if $line =~ /^(?:\d+(?:\.\d+)?)\s/ || $line =~ /^#/;

        my @fields = $line =~ $entry_re;
        unless (@fields) {
            # $id deliberately still names the previous entry here.
            moan "unknown format\n";
            next;
        }

        my ($okay, $bytes, $Unicode, $byteslen, $hex, $charslen, $error);
        ($id, $okay, $bytes, $Unicode, $byteslen, $hex, $charslen, $error)
            = @fields;

        # Sanity-check the table itself: the hex dump and the byte string
        # must both have the advertised length.
        my @hex = split(/:/, $hex);
        unless (@hex == $byteslen) {
            my $nhex = @hex;
            moan "amount of hex ($nhex) not equal to byteslen ($byteslen)\n";
        }
        {
            use bytes;
            my $bytesbyteslen = length($bytes);
            unless ($bytesbyteslen == $byteslen) {
                moan "bytes length() ($bytesbyteslen) not equal to $byteslen\n";
            }
        }

        test_unpack_U($bytes);

        if ($okay eq 'y') {
            if ($WARNCNT) {
                moan "unpack('U*') false negative\n";
                print "not ";
            }
        }
        else {
            # \Q...\E: the expected text is a literal, not a pattern.
            my $wrong_text = defined $error
                && $error ne ''
                && $WARNMSG !~ /\Q$error\E/;
            if ($WARNCNT == 0 || $wrong_text) {
                moan "unpack('U*') false positive\n";
                print "not ";
            }
        }

        print "ok $test\n";
        $test++;
    }
}
184