#!./perl

BEGIN {
    chdir 't' if -d 't';
    @INC = '../lib';
}

use strict;
no utf8;

# The plan is deliberately hard-coded: it equals the number of N.N.N rows
# in the table below, so an accidentally dropped row shows up as a plan
# mismatch.
print "1..78\n";

my $test = 1;

# This table is based on Markus Kuhn's UTF-8 Decode Stress Tester,
# http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt,
# version dated 2000-09-02.

# We use the \x notation instead of raw binary bytes for \x00-\x1f\x7f-\xff
# because e.g. many patch programs have issues with binary data.

# Table layout (whitespace-separated columns).  The here-document is
# interpolating, so every \xHH escape becomes the actual byte.
#
#   N or N.N   section heading                           (skipped)
#   # ...      comment                                   (skipped)
#   N.N.N      test row:
#                id  y|n  "bytes"  codepoint|-  byte-count  hex-dump
#                [char-count|-  [expected-warning-text]]
#
# 'y' rows must decode without any warning; 'n' rows must warn, and the
# last warning must contain the expected text when one is given.
my @MK = split(/\n/, <<__EOMK__);
1 Correct UTF-8
1.1.1 y "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5" - 11 ce:ba:e1:bd:b9:cf:83:ce:bc:ce:b5 5
2 Boundary conditions
2.1 First possible sequence of certain length
2.1.1 y "\x00" 0 1 00 1
2.1.2 y "\xc2\x80" 80 2 c2:80 1
2.1.3 y "\xe0\xa0\x80" 800 3 e0:a0:80 1
2.1.4 y "\xf0\x90\x80\x80" 10000 4 f0:90:80:80 1
2.1.5 y "\xf8\x88\x80\x80\x80" 200000 5 f8:88:80:80:80 1
2.1.6 y "\xfc\x84\x80\x80\x80\x80" 4000000 6 fc:84:80:80:80:80 1
2.2 Last possible sequence of certain length
2.2.1 y "\x7f" 7f 1 7f 1
2.2.2 y "\xdf\xbf" 7ff 2 df:bf 1
# The ffff is illegal unless UTF8_ALLOW_FFFF
2.2.3 n "\xef\xbf\xbf" ffff 3 ef:bf:bf 1 character 0xffff
2.2.4 y "\xf7\xbf\xbf\xbf" 1fffff 4 f7:bf:bf:bf 1
2.2.5 y "\xfb\xbf\xbf\xbf\xbf" 3ffffff 5 fb:bf:bf:bf:bf 1
2.2.6 y "\xfd\xbf\xbf\xbf\xbf\xbf" 7fffffff 6 fd:bf:bf:bf:bf:bf 1
2.3 Other boundary conditions
2.3.1 y "\xed\x9f\xbf" d7ff 3 ed:9f:bf 1
2.3.2 y "\xee\x80\x80" e000 3 ee:80:80 1
2.3.3 y "\xef\xbf\xbd" fffd 3 ef:bf:bd 1
2.3.4 y "\xf4\x8f\xbf\xbf" 10ffff 4 f4:8f:bf:bf 1
2.3.5 y "\xf4\x90\x80\x80" 110000 4 f4:90:80:80 1
3 Malformed sequences
3.1 Unexpected continuation bytes
3.1.1 n "\x80" - 1 80 - unexpected continuation byte 0x80
3.1.2 n "\xbf" - 1 bf - unexpected continuation byte 0xbf
3.1.3 n "\x80\xbf" - 2 80:bf - unexpected continuation byte 0x80
3.1.4 n "\x80\xbf\x80" - 3 80:bf:80 - unexpected continuation byte 0x80
3.1.5 n "\x80\xbf\x80\xbf" - 4 80:bf:80:bf - unexpected continuation byte 0x80
3.1.6 n "\x80\xbf\x80\xbf\x80" - 5 80:bf:80:bf:80 - unexpected continuation byte 0x80
3.1.7 n "\x80\xbf\x80\xbf\x80\xbf" - 6 80:bf:80:bf:80:bf - unexpected continuation byte 0x80
3.1.8 n "\x80\xbf\x80\xbf\x80\xbf\x80" - 7 80:bf:80:bf:80:bf:80 - unexpected continuation byte 0x80
3.1.9 n "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf" - 64 80:81:82:83:84:85:86:87:88:89:8a:8b:8c:8d:8e:8f:90:91:92:93:94:95:96:97:98:99:9a:9b:9c:9d:9e:9f:a0:a1:a2:a3:a4:a5:a6:a7:a8:a9:aa:ab:ac:ad:ae:af:b0:b1:b2:b3:b4:b5:b6:b7:b8:b9:ba:bb:bc:bd:be:bf - unexpected continuation byte 0x80
3.2 Lonely start characters
3.2.1 n "\xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf " - 64 c0:20:c1:20:c2:20:c3:20:c4:20:c5:20:c6:20:c7:20:c8:20:c9:20:ca:20:cb:20:cc:20:cd:20:ce:20:cf:20:d0:20:d1:20:d2:20:d3:20:d4:20:d5:20:d6:20:d7:20:d8:20:d9:20:da:20:db:20:dc:20:dd:20:de:20:df:20 - unexpected non-continuation byte 0x20 after start byte 0xc0
3.2.2 n "\xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef " - 32 e0:20:e1:20:e2:20:e3:20:e4:20:e5:20:e6:20:e7:20:e8:20:e9:20:ea:20:eb:20:ec:20:ed:20:ee:20:ef:20 - unexpected non-continuation byte 0x20 after start byte 0xe0
3.2.3 n "\xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 " - 16 f0:20:f1:20:f2:20:f3:20:f4:20:f5:20:f6:20:f7:20 - unexpected non-continuation byte 0x20 after start byte 0xf0
3.2.4 n "\xf8 \xf9 \xfa \xfb " - 8 f8:20:f9:20:fa:20:fb:20 - unexpected non-continuation byte 0x20 after start byte 0xf8
3.2.5 n "\xfc \xfd " - 4 fc:20:fd:20 - unexpected non-continuation byte 0x20 after start byte 0xfc
3.3 Sequences with last continuation byte missing
3.3.1 n "\xc0" - 1 c0 - 1 byte, need 2
3.3.2 n "\xe0\x80" - 2 e0:80 - 2 bytes, need 3
3.3.3 n "\xf0\x80\x80" - 3 f0:80:80 - 3 bytes, need 4
3.3.4 n "\xf8\x80\x80\x80" - 4 f8:80:80:80 - 4 bytes, need 5
3.3.5 n "\xfc\x80\x80\x80\x80" - 5 fc:80:80:80:80 - 5 bytes, need 6
3.3.6 n "\xdf" - 1 df - 1 byte, need 2
3.3.7 n "\xef\xbf" - 2 ef:bf - 2 bytes, need 3
3.3.8 n "\xf7\xbf\xbf" - 3 f7:bf:bf - 3 bytes, need 4
3.3.9 n "\xfb\xbf\xbf\xbf" - 4 fb:bf:bf:bf - 4 bytes, need 5
3.3.10 n "\xfd\xbf\xbf\xbf\xbf" - 5 fd:bf:bf:bf:bf - 5 bytes, need 6
3.4 Concatenation of incomplete sequences
3.4.1 n "\xc0\xe0\x80\xf0\x80\x80\xf8\x80\x80\x80\xfc\x80\x80\x80\x80\xdf\xef\xbf\xf7\xbf\xbf\xfb\xbf\xbf\xbf\xfd\xbf\xbf\xbf\xbf" - 30 c0:e0:80:f0:80:80:f8:80:80:80:fc:80:80:80:80:df:ef:bf:f7:bf:bf:fb:bf:bf:bf:fd:bf:bf:bf:bf - unexpected non-continuation byte 0xe0 after start byte 0xc0
3.5 Impossible bytes
3.5.1 n "\xfe" - 1 fe - byte 0xfe
3.5.2 n "\xff" - 1 ff - byte 0xff
3.5.3 n "\xfe\xfe\xff\xff" - 4 fe:fe:ff:ff - byte 0xfe
4 Overlong sequences
4.1 Examples of an overlong ASCII character
4.1.1 n "\xc0\xaf" - 2 c0:af - 2 bytes, need 1
4.1.2 n "\xe0\x80\xaf" - 3 e0:80:af - 3 bytes, need 1
4.1.3 n "\xf0\x80\x80\xaf" - 4 f0:80:80:af - 4 bytes, need 1
4.1.4 n "\xf8\x80\x80\x80\xaf" - 5 f8:80:80:80:af - 5 bytes, need 1
4.1.5 n "\xfc\x80\x80\x80\x80\xaf" - 6 fc:80:80:80:80:af - 6 bytes, need 1
4.2 Maximum overlong sequences
4.2.1 n "\xc1\xbf" - 2 c1:bf - 2 bytes, need 1
4.2.2 n "\xe0\x9f\xbf" - 3 e0:9f:bf - 3 bytes, need 2
4.2.3 n "\xf0\x8f\xbf\xbf" - 4 f0:8f:bf:bf - 4 bytes, need 3
4.2.4 n "\xf8\x87\xbf\xbf\xbf" - 5 f8:87:bf:bf:bf - 5 bytes, need 4
4.2.5 n "\xfc\x83\xbf\xbf\xbf\xbf" - 6 fc:83:bf:bf:bf:bf - 6 bytes, need 5
4.3 Overlong representation of the NUL character
4.3.1 n "\xc0\x80" - 2 c0:80 - 2 bytes, need 1
4.3.2 n "\xe0\x80\x80" - 3 e0:80:80 - 3 bytes, need 1
4.3.3 n "\xf0\x80\x80\x80" - 4 f0:80:80:80 - 4 bytes, need 1
4.3.4 n "\xf8\x80\x80\x80\x80" - 5 f8:80:80:80:80 - 5 bytes, need 1
4.3.5 n "\xfc\x80\x80\x80\x80\x80" - 6 fc:80:80:80:80:80 - 6 bytes, need 1
5 Illegal code positions
5.1 Single UTF-16 surrogates
5.1.1 n "\xed\xa0\x80" - 3 ed:a0:80 - UTF-16 surrogate 0xd800
5.1.2 n "\xed\xad\xbf" - 3 ed:ad:bf - UTF-16 surrogate 0xdb7f
5.1.3 n "\xed\xae\x80" - 3 ed:ae:80 - UTF-16 surrogate 0xdb80
5.1.4 n "\xed\xaf\xbf" - 3 ed:af:bf - UTF-16 surrogate 0xdbff
5.1.5 n "\xed\xb0\x80" - 3 ed:b0:80 - UTF-16 surrogate 0xdc00
5.1.6 n "\xed\xbe\x80" - 3 ed:be:80 - UTF-16 surrogate 0xdf80
5.1.7 n "\xed\xbf\xbf" - 3 ed:bf:bf - UTF-16 surrogate 0xdfff
5.2 Paired UTF-16 surrogates
5.2.1 n "\xed\xa0\x80\xed\xb0\x80" - 6 ed:a0:80:ed:b0:80 - UTF-16 surrogate 0xd800
5.2.2 n "\xed\xa0\x80\xed\xbf\xbf" - 6 ed:a0:80:ed:bf:bf - UTF-16 surrogate 0xd800
5.2.3 n "\xed\xad\xbf\xed\xb0\x80" - 6 ed:ad:bf:ed:b0:80 - UTF-16 surrogate 0xdb7f
5.2.4 n "\xed\xad\xbf\xed\xbf\xbf" - 6 ed:ad:bf:ed:bf:bf - UTF-16 surrogate 0xdb7f
5.2.5 n "\xed\xae\x80\xed\xb0\x80" - 6 ed:ae:80:ed:b0:80 - UTF-16 surrogate 0xdb80
5.2.6 n "\xed\xae\x80\xed\xbf\xbf" - 6 ed:ae:80:ed:bf:bf - UTF-16 surrogate 0xdb80
5.2.7 n "\xed\xaf\xbf\xed\xb0\x80" - 6 ed:af:bf:ed:b0:80 - UTF-16 surrogate 0xdbff
5.2.8 n "\xed\xaf\xbf\xed\xbf\xbf" - 6 ed:af:bf:ed:bf:bf - UTF-16 surrogate 0xdbff
5.3 Other illegal code positions
5.3.1 n "\xef\xbf\xbe" - 3 ef:bf:be - byte order mark 0xfffe
# The ffff is illegal unless UTF8_ALLOW_FFFF
5.3.2 n "\xef\xbf\xbf" - 3 ef:bf:bf - character 0xffff
__EOMK__

# Tests 1..78: one test per N.N.N row of the table above.
{
    my $WARNCNT;    # warnings raised by the most recent test_unpack_U() call
    my $WARNMSG;    # text of the LAST warning raised by that call
    my $id;         # id (N.N.N) of the row currently under test

    # Record every warning raised while decoding, and echo it as a TAP
    # comment line so that it shows up in the test log.
    local $SIG{__WARN__} =
        sub {
            print "# $id: @_";
            $WARNCNT++;
            $WARNMSG = "@_";
        };

    # Print a diagnostic for the current row.  The caller supplies the
    # trailing newline.  The "# " prefix keeps the line a TAP comment,
    # matching what the warning handler above prints.
    sub moan {
        print "# $id: @_";
    }

    # Reset the warning bookkeeping, then decode $_[0] with unpack('U*').
    # Only the warnings matter; the decoded values are discarded.
    #
    # NOTE: unpack() must stay the final, unassigned statement, and callers
    # must invoke this sub as a plain statement.  That way unpack() keeps
    # the same calling context as before.  The handler keeps only the last
    # warning, so anything that changes how much of the string is decoded
    # could change which message $WARNMSG ends up holding.
    sub test_unpack_U {
        $WARNCNT = 0;
        $WARNMSG = "";
        unpack('U*', $_[0]);
    }

    for my $row (@MK) {
        # Section headings ("3", "3.1") and comment lines carry no test.
        next if $row =~ /^(?:\d+(?:\.\d+)?)\s/ || $row =~ /^#/;

        # Captures: 1 id, 2 y|n, 3 raw bytes, 4 code point, 5 byte count,
        # 6 hex dump, 7 char count plus optional message, 8 message only.
        unless ($row =~ /^(\d+\.\d+\.\d+[bu]?)\s+([yn])\s+"(.+)"\s+([0-9a-f]{1,8}|-)\s+(\d+)\s+([0-9a-f]{2}(?::[0-9a-f]{2})*)(?:\s+((?:\d+|-)(?:\s+(.+))?))?$/) {
            moan "unknown format\n";
            next;
        }

        $id = $1;
        # Columns 4 (code point) and 7 (character count) are descriptive
        # only; nothing below checks them, so they are not extracted.
        my ($okay, $bytes, $byteslen, $hex, $error) = ($2, $3, $5, $6, $8);

        # Sanity-check the table itself: the hex dump and the raw bytes
        # must both agree with the declared byte count.
        my @hex = split(/:/, $hex);
        unless (@hex == $byteslen) {
            my $nhex = @hex;
            moan "amount of hex ($nhex) not equal to byteslen ($byteslen)\n";
        }
        {
            use bytes;    # make length() count bytes, not characters
            my $bytesbyteslen = length($bytes);
            unless ($bytesbyteslen == $byteslen) {
                moan "bytes length() ($bytesbyteslen) not equal to $byteslen\n";
            }
        }

        test_unpack_U($bytes);

        my $failed = 0;
        if ($okay eq 'y') {
            # Well-formed input must decode silently.
            if ($WARNCNT) {
                moan "unpack('U*') false negative\n";
                $failed = 1;
            }
        }
        else {
            # Malformed input must warn, and the last warning must contain
            # the expected text.  \Q...\E matches that text literally
            # rather than treating table data as a regex.
            my $wrong_message = defined $error
                && $error ne ''
                && $WARNMSG !~ /\Q$error\E/;
            if ($WARNCNT == 0 || $wrong_message) {
                moan "unpack('U*') false positive\n";
                $failed = 1;
            }
        }

        print "not " if $failed;
        print "ok $test\n";
        $test++;
    }
}