#!./perl

# UTF-8 decoder stress test.
#
# Every table row below that carries a three-part id (e.g. 3.1.4) is fed
# to unpack('U0U*', ...).  Rows flagged 'y' must decode without any
# warning; rows flagged 'n' must produce a warning, and that warning must
# contain the row's expected error text.  One TAP result is printed per
# such row.

BEGIN {
    chdir 't' if -d 't';
    @INC = '../lib';
}

{
    # Look at the first byte of the internal encoding of U+0100 to find
    # out which encoding this perl uses.  196 (0xc4) is the lead byte of
    # the UTF-8 encoding of U+0100; 140 is treated as the UTF-EBCDIC
    # signature, in which case the byte sequences in the table do not
    # apply and the whole file is skipped.
    my $wide = v256;
    use bytes;
    my $ordwide = ord($wide);
    printf "# under use bytes ord(v256) = 0x%02x\n", $ordwide;
    if ($ordwide == 140) {
        print "1..0 # Skip: UTF-EBCDIC (not UTF-8) used here\n";
        exit 0;
    }
    elsif ($ordwide != 196) {
        # Neither value we know about: note it and carry on.
        printf "# v256 starts with 0x%02x\n", $ordwide;
    }
}

no utf8;

print "1..78\n";

my $test = 1;

# This table is based on Markus Kuhn's UTF-8 Decode Stress Tester,
# http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt,
# version dated 2000-09-02.

# We use the \x notation instead of raw binary bytes for \x00-\x1f\x7f-\xff
# because e.g. many patch programs have issues with binary data.

# Row layout (whitespace separated), as parsed by the loop further down:
#   id  y|n  "bytes"  codepoint-hex|-  byte-count  hex:bytes  char-count|-  [expected error text]
# Rows whose id has fewer than three parts are section headings, and rows
# starting with '#' are remarks; both are skipped.  The heredoc
# interpolates, so the \x escapes are real bytes by the time the rows are
# parsed.

my @MK = split(/\n/, <<__EOMK__);
1 Correct UTF-8
1.1.1 y "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5" - 11 ce:ba:e1:bd:b9:cf:83:ce:bc:ce:b5 5
2 Boundary conditions
2.1 First possible sequence of certain length
2.1.1 y "\x00" 0 1 00 1
2.1.2 y "\xc2\x80" 80 2 c2:80 1
2.1.3 y "\xe0\xa0\x80" 800 3 e0:a0:80 1
2.1.4 y "\xf0\x90\x80\x80" 10000 4 f0:90:80:80 1
2.1.5 y "\xf8\x88\x80\x80\x80" 200000 5 f8:88:80:80:80 1
2.1.6 y "\xfc\x84\x80\x80\x80\x80" 4000000 6 fc:84:80:80:80:80 1
2.2 Last possible sequence of certain length
2.2.1 y "\x7f" 7f 1 7f 1
2.2.2 y "\xdf\xbf" 7ff 2 df:bf 1
# The ffff is illegal unless UTF8_ALLOW_FFFF
2.2.3 n "\xef\xbf\xbf" ffff 3 ef:bf:bf 1 character 0xffff
2.2.4 y "\xf7\xbf\xbf\xbf" 1fffff 4 f7:bf:bf:bf 1
2.2.5 y "\xfb\xbf\xbf\xbf\xbf" 3ffffff 5 fb:bf:bf:bf:bf 1
2.2.6 y "\xfd\xbf\xbf\xbf\xbf\xbf" 7fffffff 6 fd:bf:bf:bf:bf:bf 1
2.3 Other boundary conditions
2.3.1 y "\xed\x9f\xbf" d7ff 3 ed:9f:bf 1
2.3.2 y "\xee\x80\x80" e000 3 ee:80:80 1
2.3.3 y "\xef\xbf\xbd" fffd 3 ef:bf:bd 1
2.3.4 y "\xf4\x8f\xbf\xbf" 10ffff 4 f4:8f:bf:bf 1
2.3.5 y "\xf4\x90\x80\x80" 110000 4 f4:90:80:80 1
3 Malformed sequences
3.1 Unexpected continuation bytes
3.1.1 n "\x80" - 1 80 - unexpected continuation byte 0x80
3.1.2 n "\xbf" - 1 bf - unexpected continuation byte 0xbf
3.1.3 n "\x80\xbf" - 2 80:bf - unexpected continuation byte 0x80
3.1.4 n "\x80\xbf\x80" - 3 80:bf:80 - unexpected continuation byte 0x80
3.1.5 n "\x80\xbf\x80\xbf" - 4 80:bf:80:bf - unexpected continuation byte 0x80
3.1.6 n "\x80\xbf\x80\xbf\x80" - 5 80:bf:80:bf:80 - unexpected continuation byte 0x80
3.1.7 n "\x80\xbf\x80\xbf\x80\xbf" - 6 80:bf:80:bf:80:bf - unexpected continuation byte 0x80
3.1.8 n "\x80\xbf\x80\xbf\x80\xbf\x80" - 7 80:bf:80:bf:80:bf:80 - unexpected continuation byte 0x80
3.1.9 n "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf" - 64 80:81:82:83:84:85:86:87:88:89:8a:8b:8c:8d:8e:8f:90:91:92:93:94:95:96:97:98:99:9a:9b:9c:9d:9e:9f:a0:a1:a2:a3:a4:a5:a6:a7:a8:a9:aa:ab:ac:ad:ae:af:b0:b1:b2:b3:b4:b5:b6:b7:b8:b9:ba:bb:bc:bd:be:bf - unexpected continuation byte 0x80
3.2 Lonely start characters
3.2.1 n "\xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf " - 64 c0:20:c1:20:c2:20:c3:20:c4:20:c5:20:c6:20:c7:20:c8:20:c9:20:ca:20:cb:20:cc:20:cd:20:ce:20:cf:20:d0:20:d1:20:d2:20:d3:20:d4:20:d5:20:d6:20:d7:20:d8:20:d9:20:da:20:db:20:dc:20:dd:20:de:20:df:20 - unexpected non-continuation byte 0x20 after start byte 0xc0
3.2.2 n "\xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef " - 32 e0:20:e1:20:e2:20:e3:20:e4:20:e5:20:e6:20:e7:20:e8:20:e9:20:ea:20:eb:20:ec:20:ed:20:ee:20:ef:20 - unexpected non-continuation byte 0x20 after start byte 0xe0
3.2.3 n "\xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 " - 16 f0:20:f1:20:f2:20:f3:20:f4:20:f5:20:f6:20:f7:20 - unexpected non-continuation byte 0x20 after start byte 0xf0
3.2.4 n "\xf8 \xf9 \xfa \xfb " - 8 f8:20:f9:20:fa:20:fb:20 - unexpected non-continuation byte 0x20 after start byte 0xf8
3.2.5 n "\xfc \xfd " - 4 fc:20:fd:20 - unexpected non-continuation byte 0x20 after start byte 0xfc
3.3 Sequences with last continuation byte missing
3.3.1 n "\xc0" - 1 c0 - 1 byte, need 2
3.3.2 n "\xe0\x80" - 2 e0:80 - 2 bytes, need 3
3.3.3 n "\xf0\x80\x80" - 3 f0:80:80 - 3 bytes, need 4
3.3.4 n "\xf8\x80\x80\x80" - 4 f8:80:80:80 - 4 bytes, need 5
3.3.5 n "\xfc\x80\x80\x80\x80" - 5 fc:80:80:80:80 - 5 bytes, need 6
3.3.6 n "\xdf" - 1 df - 1 byte, need 2
3.3.7 n "\xef\xbf" - 2 ef:bf - 2 bytes, need 3
3.3.8 n "\xf7\xbf\xbf" - 3 f7:bf:bf - 3 bytes, need 4
3.3.9 n "\xfb\xbf\xbf\xbf" - 4 fb:bf:bf:bf - 4 bytes, need 5
3.3.10 n "\xfd\xbf\xbf\xbf\xbf" - 5 fd:bf:bf:bf:bf - 5 bytes, need 6
3.4 Concatenation of incomplete sequences
3.4.1 n "\xc0\xe0\x80\xf0\x80\x80\xf8\x80\x80\x80\xfc\x80\x80\x80\x80\xdf\xef\xbf\xf7\xbf\xbf\xfb\xbf\xbf\xbf\xfd\xbf\xbf\xbf\xbf" - 30 c0:e0:80:f0:80:80:f8:80:80:80:fc:80:80:80:80:df:ef:bf:f7:bf:bf:fb:bf:bf:bf:fd:bf:bf:bf:bf - unexpected non-continuation byte 0xe0 after start byte 0xc0
3.5 Impossible bytes
3.5.1 n "\xfe" - 1 fe - byte 0xfe
3.5.2 n "\xff" - 1 ff - byte 0xff
3.5.3 n "\xfe\xfe\xff\xff" - 4 fe:fe:ff:ff - byte 0xfe
4 Overlong sequences
4.1 Examples of an overlong ASCII character
4.1.1 n "\xc0\xaf" - 2 c0:af - 2 bytes, need 1
4.1.2 n "\xe0\x80\xaf" - 3 e0:80:af - 3 bytes, need 1
4.1.3 n "\xf0\x80\x80\xaf" - 4 f0:80:80:af - 4 bytes, need 1
4.1.4 n "\xf8\x80\x80\x80\xaf" - 5 f8:80:80:80:af - 5 bytes, need 1
4.1.5 n "\xfc\x80\x80\x80\x80\xaf" - 6 fc:80:80:80:80:af - 6 bytes, need 1
4.2 Maximum overlong sequences
4.2.1 n "\xc1\xbf" - 2 c1:bf - 2 bytes, need 1
4.2.2 n "\xe0\x9f\xbf" - 3 e0:9f:bf - 3 bytes, need 2
4.2.3 n "\xf0\x8f\xbf\xbf" - 4 f0:8f:bf:bf - 4 bytes, need 3
4.2.4 n "\xf8\x87\xbf\xbf\xbf" - 5 f8:87:bf:bf:bf - 5 bytes, need 4
4.2.5 n "\xfc\x83\xbf\xbf\xbf\xbf" - 6 fc:83:bf:bf:bf:bf - 6 bytes, need 5
4.3 Overlong representation of the NUL character
4.3.1 n "\xc0\x80" - 2 c0:80 - 2 bytes, need 1
4.3.2 n "\xe0\x80\x80" - 3 e0:80:80 - 3 bytes, need 1
4.3.3 n "\xf0\x80\x80\x80" - 4 f0:80:80:80 - 4 bytes, need 1
4.3.4 n "\xf8\x80\x80\x80\x80" - 5 f8:80:80:80:80 - 5 bytes, need 1
4.3.5 n "\xfc\x80\x80\x80\x80\x80" - 6 fc:80:80:80:80:80 - 6 bytes, need 1
5 Illegal code positions
5.1 Single UTF-16 surrogates
5.1.1 n "\xed\xa0\x80" - 3 ed:a0:80 - UTF-16 surrogate 0xd800
5.1.2 n "\xed\xad\xbf" - 3 ed:ad:bf - UTF-16 surrogate 0xdb7f
5.1.3 n "\xed\xae\x80" - 3 ed:ae:80 - UTF-16 surrogate 0xdb80
5.1.4 n "\xed\xaf\xbf" - 3 ed:af:bf - UTF-16 surrogate 0xdbff
5.1.5 n "\xed\xb0\x80" - 3 ed:b0:80 - UTF-16 surrogate 0xdc00
5.1.6 n "\xed\xbe\x80" - 3 ed:be:80 - UTF-16 surrogate 0xdf80
5.1.7 n "\xed\xbf\xbf" - 3 ed:bf:bf - UTF-16 surrogate 0xdfff
5.2 Paired UTF-16 surrogates
5.2.1 n "\xed\xa0\x80\xed\xb0\x80" - 6 ed:a0:80:ed:b0:80 - UTF-16 surrogate 0xd800
5.2.2 n "\xed\xa0\x80\xed\xbf\xbf" - 6 ed:a0:80:ed:bf:bf - UTF-16 surrogate 0xd800
5.2.3 n "\xed\xad\xbf\xed\xb0\x80" - 6 ed:ad:bf:ed:b0:80 - UTF-16 surrogate 0xdb7f
5.2.4 n "\xed\xad\xbf\xed\xbf\xbf" - 6 ed:ad:bf:ed:bf:bf - UTF-16 surrogate 0xdb7f
5.2.5 n "\xed\xae\x80\xed\xb0\x80" - 6 ed:ae:80:ed:b0:80 - UTF-16 surrogate 0xdb80
5.2.6 n "\xed\xae\x80\xed\xbf\xbf" - 6 ed:ae:80:ed:bf:bf - UTF-16 surrogate 0xdb80
5.2.7 n "\xed\xaf\xbf\xed\xb0\x80" - 6 ed:af:bf:ed:b0:80 - UTF-16 surrogate 0xdbff
5.2.8 n "\xed\xaf\xbf\xed\xbf\xbf" - 6 ed:af:bf:ed:bf:bf - UTF-16 surrogate 0xdbff
5.3 Other illegal code positions
5.3.1 n "\xef\xbf\xbe" - 3 ef:bf:be - byte order mark 0xfffe
# The ffff is illegal unless UTF8_ALLOW_FFFF
5.3.2 n "\xef\xbf\xbf" - 3 ef:bf:bf - character 0xffff
__EOMK__

# Tests 1..78: one per table row that has a three-part id.
{
    my $id;             # id of the row currently being checked
    my $warnings = '';  # every warning raised while decoding that row

    # Echo each warning as a TAP comment and remember it.  All warnings
    # are kept (not just the most recent one) because an input with
    # several malformations warns more than once, while the table's
    # expected text describes the first malformation.  A lexical is used
    # rather than $@ so that an intervening eval cannot wipe the record.
    local $SIG{__WARN__} = sub {
        print "# $id: @_";
        $warnings .= "@_";
    };

    # Report a problem with the current row, prefixed with its id.
    sub moan {
        print "$id: @_";
    }

    # Decode $_[0] with unpack('U0U*') and return the text of all the
    # warnings that raised ('' when the decode was silent).  Only
    # meaningful while the __WARN__ handler above is installed.
    sub warn_unpack_U {
        $warnings = '';
        my @null = unpack('U0U*', $_[0]);
        return $warnings;
    }

    for (@MK) {
        if (/^(?:\d+(?:\.\d+)?)\s/ || /^#/) {
            # Section heading or remark: nothing to test.
            # print "# $_\n";
        }
        elsif (/^(\d+\.\d+\.\d+[bu]?)\s+([yn])\s+"(.+)"\s+([0-9a-f]{1,8}|-)\s+(\d+)\s+([0-9a-f]{2}(?::[0-9a-f]{2})*)(?:\s+((?:\d+|-)(?:\s+(.+))?))?$/) {
            $id = $1;
            # Note: capture 7 spans the char-count field together with
            # any error text that follows it; capture 8 is the error text
            # alone and is undef for rows that have none.  $Unicode and
            # $charslen are not checked by this test.
            my ($okay, $bytes, $Unicode, $byteslen, $hex, $charslen, $experr) =
                ($2, $3, $4, $5, $6, $7, $8);

            # Sanity-check the table itself: the hex dump and the literal
            # byte string must both agree with the declared byte count.
            my @hex = split(/:/, $hex);
            unless (@hex == $byteslen) {
                my $nhex = @hex;
                moan "amount of hex ($nhex) not equal to byteslen ($byteslen)\n";
            }
            {
                use bytes;
                my $bytesbyteslen = length($bytes);
                unless ($bytesbyteslen == $byteslen) {
                    moan "bytes length() ($bytesbyteslen) not equal to $byteslen\n";
                }
            }

            my $warn = warn_unpack_U($bytes);
            if ($okay eq 'y') {
                # Well-formed input must decode without any warning.
                if ($warn) {
                    moan "unpack('U0U*') false negative\n";
                    print "not ";
                }
            }
            elsif ($okay eq 'n') {
                # Malformed input must warn, and the warning must mention
                # the expected error.  '!' is required here: 'not' binds
                # looser than '||' and would negate the whole disjunction,
                # hiding both a missing warning and a wrong one.  \Q...\E
                # makes the expected text match literally.
                if (!$warn
                    || (defined $experr && $experr ne ''
                        && $warn !~ /\Q$experr\E/)) {
                    moan "unpack('U0U*') false positive\n";
                    print "not ";
                }
            }
            print "ok $test # $id $okay\n";
            $test++;
        }
        else {
            moan "unknown format\n";
        }
    }
}