#!./perl

# Test preamble: locate the library directory, make sure the platform stores
# wide characters as UTF-8, announce the plan and start the test counter.

BEGIN {
    # Run from the t/ directory when it exists and load modules from ../lib only.
    chdir 't' if -d 't';
    @INC = '../lib';
}

# strict only, on purpose: the __WARN__ handler further down treats every
# warning as a decoder diagnostic, so enabling extra warning categories here
# would pollute what it records.
use strict;

{
    # Look at the first byte of the internal encoding of character 256.
    # 196 (0xc4) is the UTF-8 lead byte for U+0100; 140 is what this script
    # takes as the sign of UTF-EBCDIC, where the byte table below does not apply.
    my $wide = v256;
    use bytes;
    my $ordwide = ord($wide);
    printf "# under use bytes ord(v256) = 0x%02x\n", $ordwide;
    if ($ordwide == 140) {
        print "1..0 # Skip: UTF-EBCDIC (not UTF-8) used here\n";
        exit 0;
    }
    elsif ($ordwide != 196) {
        # Neither encoding was recognised: report it and carry on.
        printf "# v256 starts with 0x%02x\n", $ordwide;
    }
}

no utf8;

print "1..78\n";

# Number of the next test to be reported.
my $test = 1;

# This table is based on Markus Kuhn's UTF-8 Decode Stress Tester,
# http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt,
# version dated 2000-09-02.

# We use the \x notation instead of raw binary bytes for \x00-\x1f\x7f-\xff
# because e.g. many patch programs have issues with binary data.
# Decode table, one entry per line.  Data rows hold whitespace-separated fields:
#
#   id  y|n  "bytes"  code-point|-  byte-count  hex-dump  [char-count|-  [warning text]]
#
# 'y' rows must decode without any warning, 'n' rows must raise a warning that
# contains the given text.  Lines starting with a one- or two-level number are
# section headings and lines starting with '#' are comments; the driver skips
# both.  The heredoc interpolates, so the \x escapes become the actual bytes.
my @MK = split(/\n/, <<"__EOMK__");
1 Correct UTF-8
1.1.1 y "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5" - 11 ce:ba:e1:bd:b9:cf:83:ce:bc:ce:b5 5
2 Boundary conditions
2.1 First possible sequence of certain length
2.1.1 y "\x00" 0 1 00 1
2.1.2 y "\xc2\x80" 80 2 c2:80 1
2.1.3 y "\xe0\xa0\x80" 800 3 e0:a0:80 1
2.1.4 y "\xf0\x90\x80\x80" 10000 4 f0:90:80:80 1
2.1.5 y "\xf8\x88\x80\x80\x80" 200000 5 f8:88:80:80:80 1
2.1.6 y "\xfc\x84\x80\x80\x80\x80" 4000000 6 fc:84:80:80:80:80 1
2.2 Last possible sequence of certain length
2.2.1 y "\x7f" 7f 1 7f 1
2.2.2 y "\xdf\xbf" 7ff 2 df:bf 1
# The ffff is illegal unless UTF8_ALLOW_FFFF
2.2.3 n "\xef\xbf\xbf" ffff 3 ef:bf:bf 1 character 0xffff
2.2.4 y "\xf7\xbf\xbf\xbf" 1fffff 4 f7:bf:bf:bf 1
2.2.5 y "\xfb\xbf\xbf\xbf\xbf" 3ffffff 5 fb:bf:bf:bf:bf 1
2.2.6 y "\xfd\xbf\xbf\xbf\xbf\xbf" 7fffffff 6 fd:bf:bf:bf:bf:bf 1
2.3 Other boundary conditions
2.3.1 y "\xed\x9f\xbf" d7ff 3 ed:9f:bf 1
2.3.2 y "\xee\x80\x80" e000 3 ee:80:80 1
2.3.3 y "\xef\xbf\xbd" fffd 3 ef:bf:bd 1
2.3.4 y "\xf4\x8f\xbf\xbf" 10ffff 4 f4:8f:bf:bf 1
2.3.5 y "\xf4\x90\x80\x80" 110000 4 f4:90:80:80 1
3 Malformed sequences
3.1 Unexpected continuation bytes
3.1.1 n "\x80" - 1 80 - unexpected continuation byte 0x80
3.1.2 n "\xbf" - 1 bf - unexpected continuation byte 0xbf
3.1.3 n "\x80\xbf" - 2 80:bf - unexpected continuation byte 0x80
3.1.4 n "\x80\xbf\x80" - 3 80:bf:80 - unexpected continuation byte 0x80
3.1.5 n "\x80\xbf\x80\xbf" - 4 80:bf:80:bf - unexpected continuation byte 0x80
3.1.6 n "\x80\xbf\x80\xbf\x80" - 5 80:bf:80:bf:80 - unexpected continuation byte 0x80
3.1.7 n "\x80\xbf\x80\xbf\x80\xbf" - 6 80:bf:80:bf:80:bf - unexpected continuation byte 0x80
3.1.8 n "\x80\xbf\x80\xbf\x80\xbf\x80" - 7 80:bf:80:bf:80:bf:80 - unexpected continuation byte 0x80
3.1.9 n "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf" - 64 80:81:82:83:84:85:86:87:88:89:8a:8b:8c:8d:8e:8f:90:91:92:93:94:95:96:97:98:99:9a:9b:9c:9d:9e:9f:a0:a1:a2:a3:a4:a5:a6:a7:a8:a9:aa:ab:ac:ad:ae:af:b0:b1:b2:b3:b4:b5:b6:b7:b8:b9:ba:bb:bc:bd:be:bf - unexpected continuation byte 0x80
3.2 Lonely start characters
3.2.1 n "\xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf " - 64 c0:20:c1:20:c2:20:c3:20:c4:20:c5:20:c6:20:c7:20:c8:20:c9:20:ca:20:cb:20:cc:20:cd:20:ce:20:cf:20:d0:20:d1:20:d2:20:d3:20:d4:20:d5:20:d6:20:d7:20:d8:20:d9:20:da:20:db:20:dc:20:dd:20:de:20:df:20 - unexpected non-continuation byte 0x20 after start byte 0xc0
3.2.2 n "\xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef " - 32 e0:20:e1:20:e2:20:e3:20:e4:20:e5:20:e6:20:e7:20:e8:20:e9:20:ea:20:eb:20:ec:20:ed:20:ee:20:ef:20 - unexpected non-continuation byte 0x20 after start byte 0xe0
3.2.3 n "\xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 " - 16 f0:20:f1:20:f2:20:f3:20:f4:20:f5:20:f6:20:f7:20 - unexpected non-continuation byte 0x20 after start byte 0xf0
3.2.4 n "\xf8 \xf9 \xfa \xfb " - 8 f8:20:f9:20:fa:20:fb:20 - unexpected non-continuation byte 0x20 after start byte 0xf8
3.2.5 n "\xfc \xfd " - 4 fc:20:fd:20 - unexpected non-continuation byte 0x20 after start byte 0xfc
3.3 Sequences with last continuation byte missing
3.3.1 n "\xc0" - 1 c0 - 1 byte, need 2
3.3.2 n "\xe0\x80" - 2 e0:80 - 2 bytes, need 3
3.3.3 n "\xf0\x80\x80" - 3 f0:80:80 - 3 bytes, need 4
3.3.4 n "\xf8\x80\x80\x80" - 4 f8:80:80:80 - 4 bytes, need 5
3.3.5 n "\xfc\x80\x80\x80\x80" - 5 fc:80:80:80:80 - 5 bytes, need 6
3.3.6 n "\xdf" - 1 df - 1 byte, need 2
3.3.7 n "\xef\xbf" - 2 ef:bf - 2 bytes, need 3
3.3.8 n "\xf7\xbf\xbf" - 3 f7:bf:bf - 3 bytes, need 4
3.3.9 n "\xfb\xbf\xbf\xbf" - 4 fb:bf:bf:bf - 4 bytes, need 5
3.3.10 n "\xfd\xbf\xbf\xbf\xbf" - 5 fd:bf:bf:bf:bf - 5 bytes, need 6
3.4 Concatenation of incomplete sequences
3.4.1 n "\xc0\xe0\x80\xf0\x80\x80\xf8\x80\x80\x80\xfc\x80\x80\x80\x80\xdf\xef\xbf\xf7\xbf\xbf\xfb\xbf\xbf\xbf\xfd\xbf\xbf\xbf\xbf" - 30 c0:e0:80:f0:80:80:f8:80:80:80:fc:80:80:80:80:df:ef:bf:f7:bf:bf:fb:bf:bf:bf:fd:bf:bf:bf:bf - unexpected non-continuation byte 0xe0 after start byte 0xc0
3.5 Impossible bytes
3.5.1 n "\xfe" - 1 fe - byte 0xfe
3.5.2 n "\xff" - 1 ff - byte 0xff
3.5.3 n "\xfe\xfe\xff\xff" - 4 fe:fe:ff:ff - byte 0xfe
4 Overlong sequences
4.1 Examples of an overlong ASCII character
4.1.1 n "\xc0\xaf" - 2 c0:af - 2 bytes, need 1
4.1.2 n "\xe0\x80\xaf" - 3 e0:80:af - 3 bytes, need 1
4.1.3 n "\xf0\x80\x80\xaf" - 4 f0:80:80:af - 4 bytes, need 1
4.1.4 n "\xf8\x80\x80\x80\xaf" - 5 f8:80:80:80:af - 5 bytes, need 1
4.1.5 n "\xfc\x80\x80\x80\x80\xaf" - 6 fc:80:80:80:80:af - 6 bytes, need 1
4.2 Maximum overlong sequences
4.2.1 n "\xc1\xbf" - 2 c1:bf - 2 bytes, need 1
4.2.2 n "\xe0\x9f\xbf" - 3 e0:9f:bf - 3 bytes, need 2
4.2.3 n "\xf0\x8f\xbf\xbf" - 4 f0:8f:bf:bf - 4 bytes, need 3
4.2.4 n "\xf8\x87\xbf\xbf\xbf" - 5 f8:87:bf:bf:bf - 5 bytes, need 4
4.2.5 n "\xfc\x83\xbf\xbf\xbf\xbf" - 6 fc:83:bf:bf:bf:bf - 6 bytes, need 5
4.3 Overlong representation of the NUL character
4.3.1 n "\xc0\x80" - 2 c0:80 - 2 bytes, need 1
4.3.2 n "\xe0\x80\x80" - 3 e0:80:80 - 3 bytes, need 1
4.3.3 n "\xf0\x80\x80\x80" - 4 f0:80:80:80 - 4 bytes, need 1
4.3.4 n "\xf8\x80\x80\x80\x80" - 5 f8:80:80:80:80 - 5 bytes, need 1
4.3.5 n "\xfc\x80\x80\x80\x80\x80" - 6 fc:80:80:80:80:80 - 6 bytes, need 1
5 Illegal code positions
5.1 Single UTF-16 surrogates
5.1.1 n "\xed\xa0\x80" - 3 ed:a0:80 - UTF-16 surrogate 0xd800
5.1.2 n "\xed\xad\xbf" - 3 ed:ad:bf - UTF-16 surrogate 0xdb7f
5.1.3 n "\xed\xae\x80" - 3 ed:ae:80 - UTF-16 surrogate 0xdb80
5.1.4 n "\xed\xaf\xbf" - 3 ed:af:bf - UTF-16 surrogate 0xdbff
5.1.5 n "\xed\xb0\x80" - 3 ed:b0:80 - UTF-16 surrogate 0xdc00
5.1.6 n "\xed\xbe\x80" - 3 ed:be:80 - UTF-16 surrogate 0xdf80
5.1.7 n "\xed\xbf\xbf" - 3 ed:bf:bf - UTF-16 surrogate 0xdfff
5.2 Paired UTF-16 surrogates
5.2.1 n "\xed\xa0\x80\xed\xb0\x80" - 6 ed:a0:80:ed:b0:80 - UTF-16 surrogate 0xd800
5.2.2 n "\xed\xa0\x80\xed\xbf\xbf" - 6 ed:a0:80:ed:bf:bf - UTF-16 surrogate 0xd800
5.2.3 n "\xed\xad\xbf\xed\xb0\x80" - 6 ed:ad:bf:ed:b0:80 - UTF-16 surrogate 0xdb7f
5.2.4 n "\xed\xad\xbf\xed\xbf\xbf" - 6 ed:ad:bf:ed:bf:bf - UTF-16 surrogate 0xdb7f
5.2.5 n "\xed\xae\x80\xed\xb0\x80" - 6 ed:ae:80:ed:b0:80 - UTF-16 surrogate 0xdb80
5.2.6 n "\xed\xae\x80\xed\xbf\xbf" - 6 ed:ae:80:ed:bf:bf - UTF-16 surrogate 0xdb80
5.2.7 n "\xed\xaf\xbf\xed\xb0\x80" - 6 ed:af:bf:ed:b0:80 - UTF-16 surrogate 0xdbff
5.2.8 n "\xed\xaf\xbf\xed\xbf\xbf" - 6 ed:af:bf:ed:bf:bf - UTF-16 surrogate 0xdbff
5.3 Other illegal code positions
5.3.1 n "\xef\xbf\xbe" - 3 ef:bf:be - byte order mark 0xfffe
# The ffff is illegal unless UTF8_ALLOW_FFFF
5.3.2 n "\xef\xbf\xbf" - 3 ef:bf:bf - character 0xffff
__EOMK__

# Driver: one reported test per data row of the table above.
{
    my $id;               # id of the row being checked; read by the helpers below
    my $warnings = '';    # text of every warning raised since the last reset

    # Echo each warning as a diagnostic line and keep its text.  Warnings are
    # accumulated rather than overwritten because one row can raise several
    # and the table names only one of them.
    local $SIG{__WARN__} = sub {
        print "# $id: @_";
        $warnings .= "@_";
    };

    # moan(@text): print a diagnostic tagged with the current row id.  The
    # leading "# " keeps the line a comment in the test output.
    sub moan {
        print "# $id: @_";
    }

    # warn_unpack_U($bytes): decode $bytes with unpack('U0U*') and return the
    # text of all warnings that raised ('' when it decoded silently).
    sub warn_unpack_U {
        $warnings = '';
        my @null = unpack('U0U*', $_[0]);    # list context; the values are not needed
        return $warnings;
    }

    # Section headings: a one- or two-level number followed by whitespace.
    my $heading_re = qr/^(?:\d+(?:\.\d+)?)\s/;

    # Data rows.  Eight captures, in this order:
    my $row_re = qr{
        ^ (\d+\.\d+\.\d+[bu]?)                  # row id, three-level number
        \s+ ([yn])                              # y: must be silent, n: must warn
        \s+ "(.+)"                              # the bytes under test
        \s+ ([0-9a-f]{1,8}|-)                   # code point in hex, or a dash
        \s+ (\d+)                               # declared number of bytes
        \s+ ([0-9a-f]{2}(?::[0-9a-f]{2})*)      # colon-separated hex dump
        (?: \s+ ((?:\d+|-) (?:\s+(.+))?) )?     # char count or dash, then warning text
        $
    }x;

    for my $row (@MK) {
        next if $row =~ $heading_re || $row =~ /^#/;

        my @field = $row =~ $row_re;
        unless (@field) {
            moan "unknown format\n";
            next;
        }

        # The code point (4th) and the combined tail (7th) are not used.
        my ($row_id, $okay, $bytes, undef, $byteslen, $hex, undef, $experr)
            = @field;
        $id = $row_id;
        $experr = '' unless defined $experr;    # 'y' rows carry no warning text

        # Sanity-check the row itself: hex dump and byte string must both
        # have the declared length.
        my @hex = split(/:/, $hex);
        unless (@hex == $byteslen) {
            my $nhex = @hex;
            moan "amount of hex ($nhex) not equal to byteslen ($byteslen)\n";
        }
        {
            use bytes;
            my $bytesbyteslen = length($bytes);
            unless ($bytesbyteslen == $byteslen) {
                moan "bytes length() ($bytesbyteslen) not equal to $byteslen\n";
            }
        }

        my $warn   = warn_unpack_U($bytes);
        my $failed = 0;

        if ($okay eq 'y') {
            # Well-formed input must decode without a single warning.
            if ($warn ne '') {
                moan "unpack('U0U*') false negative\n";
                $failed = 1;
            }
        }
        else {
            # Malformed input ('n' is the only other value the row pattern
            # admits) must warn, and the warning must contain the expected
            # text.  Spelled with 'eq' and '||' on purpose: a leading 'not'
            # binds more loosely than '||' and would negate the whole
            # condition, making this branch unreachable.
            # NOTE(review): rows whose expected text does not match what the
            # interpreter under test prints will now be reported as failures;
            # confirm the table texts against that interpreter.
            if ($warn eq '') {
                moan "unpack('U0U*') false positive: no warning\n";
                $failed = 1;
            }
            elsif ($experr ne '' && $warn !~ /\Q$experr\E/) {
                moan "unpack('U0U*') false positive: warning lacks '$experr'\n";
                $failed = 1;
            }
        }

        print "not " if $failed;
        print "ok $test # $id $okay\n";
        $test++;
    }
}