1#!perl -w 2 3use strict; 4use Test::More; 5 6# This file tests various functions and macros in the API related to UTF-8. 7 8BEGIN { 9 use_ok('XS::APItest'); 10 require 'charset_tools.pl'; 11 require './t/utf8_setup.pl'; 12}; 13 14$|=1; 15 16use XS::APItest; 17use Config; 18my $word_length = defined $Config{quadkind} ? 8 : 4; 19 20# Below we test some byte-oriented functions that look for UTF-8 variant bytes 21# and we know can work on full words at a time. Hence this is not black box 22# testing. We know how long a word is. Suppose it is 4. We set things up so 23# that we have a string containing 3 bytes followed by 4, followed by 3, and 24# we tell our APItest functions to position the string so it starts at 1 byte 25# past a word boundary. That way the first 3 bytes are the final ones of a 26# word, and the final 3 are the initial ones of a non-complete word. This 27# assumes that the initial and final non-full word bytes are treated 28# individually, so we don't have to test the various combinations of partially 29# filled words. 30 31my $offset = 1; # Start 1 byte past word boundary. 32 33# We choose an invariant and a variant that are at the boundaries between 34# those two types on ASCII platforms. And, just in case the EBCDIC ever 35# changes to do per-word, we choose arbitrarily an invariant that has most of 36# its bits set natively, and a variant that has most unset. First create 37# versions for display in the test names. 38my $display_invariant = isASCII ? "7F" : sprintf "%02X", utf8::unicode_to_native(0x9F); 39my $display_variant = isASCII ? "80" : sprintf "%02X", utf8::unicode_to_native(0xA0); 40my $invariant = chr hex $display_invariant; 41my $variant = chr hex $display_variant; 42 43# We create a string with the correct number of bytes. The -1 is to make the 44# final portion not quite fill a full word and $offset to do the same for the 45# initial portion.) 46my $string_length = 3 * $word_length - 1 - $offset; 47my $all_invariants = $invariant x $string_length; 48my $display_all_invariants = $display_invariant x $string_length; 49 50my $ret_ref = test_is_utf8_invariant_string_loc($all_invariants, $offset, 51 length $all_invariants); 52pass("The tests below are for is_utf8_invariant_string_loc() with string" 53 . " starting $offset bytes after a word boundary"); 54is($ret_ref->[0], 1, "$display_all_invariants contains no variants"); 55 56# Just create a string with a single variant, in all the possible positions. 57for my $pos (0.. length($all_invariants) - 1) { 58 my $test_string = $all_invariants; 59 my $test_display = $display_all_invariants; 60 61 substr($test_string, $pos, 1) = $variant; 62 substr($test_display, $pos * 2, 2) = $display_variant; 63 my $ret_ref = test_is_utf8_invariant_string_loc($test_string, $offset, 64 length $test_string); 65 if (is($ret_ref->[0], 0, "$test_display has a variant")) { 66 is($ret_ref->[1], $pos, " at position $pos"); 67 } 68} 69 70# Now work on variant_under_utf8_count(). 71pass("The tests below are for variant_under_utf8_count() with string" 72 . " starting $offset bytes after a word boundary"); 73is(test_variant_under_utf8_count($all_invariants, $offset, 74 length $all_invariants), 75 0, 76 "$display_all_invariants contains 0 variants"); 77 78# First, put a variant in each possible position in the flanking partial words 79for my $pos (0 .. $word_length - $offset, 80 2 * $word_length .. length($all_invariants) - 1) 81{ 82 my $test_string = $all_invariants; 83 my $test_display = $display_all_invariants; 84 85 substr($test_string, $pos, 1) = $variant; 86 substr($test_display, $pos * 2, 2) = $display_variant; 87 is(test_variant_under_utf8_count($test_string, $offset, length $test_string), 88 1, 89 "$test_display contains 1 variant"); 90} 91 92# Then try all possible combinations of variant/invariant in the full word in 93# the middle (We've already tested the case with 0 variants, so start at 1.) 94for my $bit_pattern (1 .. (1 << $word_length) - 1) { 95 my $bits = $bit_pattern; 96 my $display_word = ""; 97 my $test_word = ""; 98 my $count = 0; 99 100 # Every 1 bit gets the variant for this particular $bit_pattern. 101 for my $bit (0 .. 7) { 102 if ($bits & 1) { 103 $count++; 104 $test_word .= $variant; 105 $display_word .= $display_variant; 106 } 107 else { 108 $test_word .= $invariant; 109 $display_word .= $display_invariant; 110 } 111 $bits >>= 1; 112 } 113 114 my $test_string = $variant x ($word_length - 1) 115 . $test_word 116 . $variant x ($word_length - 1); 117 my $display_string = $display_variant x ($word_length - 1) 118 . $display_word 119 . $display_variant x ($word_length - 1); 120 my $expected_count = $count + 2 * $word_length - 2; 121 is(test_variant_under_utf8_count($test_string, $offset, 122 length $test_string), $expected_count, 123 "$display_string contains $expected_count variants"); 124} 125 126 127my $pound_sign = chr utf8::unicode_to_native(163); 128 129# This test file can't use byte_utf8a_to_utf8n() from t/charset_tools.pl 130# because that uses the same functions we are testing here. So UTF-EBCDIC 131# strings are hard-coded as I8 strings in this file instead, and we use the 132# translation functions to/from I8 from that file instead. 133 134my $look_for_everything_utf8n_to 135 = $::UTF8_DISALLOW_SURROGATE 136 | $::UTF8_WARN_SURROGATE 137 | $::UTF8_DISALLOW_NONCHAR 138 | $::UTF8_WARN_NONCHAR 139 | $::UTF8_DISALLOW_SUPER 140 | $::UTF8_WARN_SUPER 141 | $::UTF8_DISALLOW_PERL_EXTENDED 142 | $::UTF8_WARN_PERL_EXTENDED; 143my $look_for_everything_uvchr_to 144 = $::UNICODE_DISALLOW_SURROGATE 145 | $::UNICODE_WARN_SURROGATE 146 | $::UNICODE_DISALLOW_NONCHAR 147 | $::UNICODE_WARN_NONCHAR 148 | $::UNICODE_DISALLOW_SUPER 149 | $::UNICODE_WARN_SUPER 150 | $::UNICODE_DISALLOW_PERL_EXTENDED 151 | $::UNICODE_WARN_PERL_EXTENDED; 152 153my $highest_non_extended_cp = 2 ** ((isASCII) ? 31 : 30) - 1; 154 155foreach ([0, '', '', 'empty'], 156 [0, 'N', 'N', '1 char'], 157 [1, 'NN', 'N', '1 char substring'], 158 [-2, 'Perl', 'Rules', 'different'], 159 [0, $pound_sign, $pound_sign, 'pound sign'], 160 [1, $pound_sign . 10, $pound_sign . 1, 161 '10 pounds is more than 1 pound'], 162 [1, $pound_sign . $pound_sign, $pound_sign, 163 '2 pound signs are more than 1'], 164 [-2, ' $!', " \x{1F42B}!", 'Camels are worth more than 1 dollar'], 165 [-1, '!', "!\x{1F42A}", 'Initial substrings match'], 166 ) { 167 my ($expect, $left, $right, $desc) = @$_; 168 my $copy = $right; 169 utf8::encode($copy); 170 is(bytes_cmp_utf8($left, $copy), $expect, "bytes_cmp_utf8: $desc"); 171 next if $right =~ tr/\0-\377//c; 172 utf8::encode($left); 173 is(bytes_cmp_utf8($right, $left), -$expect, "... and $desc reversed"); 174} 175 176# The keys to this hash are Unicode code points, their values are the native 177# UTF-8 representations of them. The code points are chosen because they are 178# "interesting" on either or both ASCII and EBCDIC platforms. First we add 179# boundaries where the number of bytes required to represent them increase, or 180# are adjacent to problematic code points, so we want to make sure they aren't 181# considered problematic. 182my %code_points = ( 183 0x0100 => (isASCII) ? "\xc4\x80" : I8_to_native("\xc8\xa0"), 184 0x0400 - 1 => (isASCII) ? "\xcf\xbf" : I8_to_native("\xdf\xbf"), 185 0x0400 => (isASCII) ? "\xd0\x80" : I8_to_native("\xe1\xa0\xa0"), 186 0x0800 - 1 => (isASCII) ? "\xdf\xbf" : I8_to_native("\xe1\xbf\xbf"), 187 0x0800 => (isASCII) ? "\xe0\xa0\x80" : I8_to_native("\xe2\xa0\xa0"), 188 0x4000 - 1 => (isASCII) ? "\xe3\xbf\xbf" : I8_to_native("\xef\xbf\xbf"), 189 0x4000 => (isASCII) ? "\xe4\x80\x80" : I8_to_native("\xf0\xb0\xa0\xa0"), 190 0x8000 - 1 => (isASCII) ? "\xe7\xbf\xbf" : I8_to_native("\xf0\xbf\xbf\xbf"), 191 192 # First code point that the implementation of isUTF8_POSSIBLY_PROBLEMATIC, 193 # as of this writing, considers potentially problematic on EBCDIC 194 0x8000 => (isASCII) ? "\xe8\x80\x80" : I8_to_native("\xf1\xa0\xa0\xa0"), 195 196 0xD000 - 1 => (isASCII) ? "\xec\xbf\xbf" : I8_to_native("\xf1\xb3\xbf\xbf"), 197 198 # First code point that the implementation of isUTF8_POSSIBLY_PROBLEMATIC, 199 # as of this writing, considers potentially problematic on ASCII 200 0xD000 => (isASCII) ? "\xed\x80\x80" : I8_to_native("\xf1\xb4\xa0\xa0"), 201 202 # Bracket the surrogates, and include several surrogates 203 0xD7FF => (isASCII) ? "\xed\x9f\xbf" : I8_to_native("\xf1\xb5\xbf\xbf"), 204 0xD800 => (isASCII) ? "\xed\xa0\x80" : I8_to_native("\xf1\xb6\xa0\xa0"), 205 0xDC00 => (isASCII) ? "\xed\xb0\x80" : I8_to_native("\xf1\xb7\xa0\xa0"), 206 0xDFFF => (isASCII) ? "\xed\xbf\xbf" : I8_to_native("\xf1\xb7\xbf\xbf"), 207 0xE000 => (isASCII) ? "\xee\x80\x80" : I8_to_native("\xf1\xb8\xa0\xa0"), 208 209 # Include the 32 contiguous non characters, and adjacent code points 210 0xFDCF => (isASCII) ? "\xef\xb7\x8f" : I8_to_native("\xf1\xbf\xae\xaf"), 211 0xFDD0 => (isASCII) ? "\xef\xb7\x90" : I8_to_native("\xf1\xbf\xae\xb0"), 212 0xFDD1 => (isASCII) ? "\xef\xb7\x91" : I8_to_native("\xf1\xbf\xae\xb1"), 213 0xFDD2 => (isASCII) ? "\xef\xb7\x92" : I8_to_native("\xf1\xbf\xae\xb2"), 214 0xFDD3 => (isASCII) ? "\xef\xb7\x93" : I8_to_native("\xf1\xbf\xae\xb3"), 215 0xFDD4 => (isASCII) ? "\xef\xb7\x94" : I8_to_native("\xf1\xbf\xae\xb4"), 216 0xFDD5 => (isASCII) ? "\xef\xb7\x95" : I8_to_native("\xf1\xbf\xae\xb5"), 217 0xFDD6 => (isASCII) ? "\xef\xb7\x96" : I8_to_native("\xf1\xbf\xae\xb6"), 218 0xFDD7 => (isASCII) ? "\xef\xb7\x97" : I8_to_native("\xf1\xbf\xae\xb7"), 219 0xFDD8 => (isASCII) ? "\xef\xb7\x98" : I8_to_native("\xf1\xbf\xae\xb8"), 220 0xFDD9 => (isASCII) ? "\xef\xb7\x99" : I8_to_native("\xf1\xbf\xae\xb9"), 221 0xFDDA => (isASCII) ? "\xef\xb7\x9a" : I8_to_native("\xf1\xbf\xae\xba"), 222 0xFDDB => (isASCII) ? "\xef\xb7\x9b" : I8_to_native("\xf1\xbf\xae\xbb"), 223 0xFDDC => (isASCII) ? "\xef\xb7\x9c" : I8_to_native("\xf1\xbf\xae\xbc"), 224 0xFDDD => (isASCII) ? "\xef\xb7\x9d" : I8_to_native("\xf1\xbf\xae\xbd"), 225 0xFDDE => (isASCII) ? "\xef\xb7\x9e" : I8_to_native("\xf1\xbf\xae\xbe"), 226 0xFDDF => (isASCII) ? "\xef\xb7\x9f" : I8_to_native("\xf1\xbf\xae\xbf"), 227 0xFDE0 => (isASCII) ? "\xef\xb7\xa0" : I8_to_native("\xf1\xbf\xaf\xa0"), 228 0xFDE1 => (isASCII) ? "\xef\xb7\xa1" : I8_to_native("\xf1\xbf\xaf\xa1"), 229 0xFDE2 => (isASCII) ? "\xef\xb7\xa2" : I8_to_native("\xf1\xbf\xaf\xa2"), 230 0xFDE3 => (isASCII) ? "\xef\xb7\xa3" : I8_to_native("\xf1\xbf\xaf\xa3"), 231 0xFDE4 => (isASCII) ? "\xef\xb7\xa4" : I8_to_native("\xf1\xbf\xaf\xa4"), 232 0xFDE5 => (isASCII) ? "\xef\xb7\xa5" : I8_to_native("\xf1\xbf\xaf\xa5"), 233 0xFDE6 => (isASCII) ? "\xef\xb7\xa6" : I8_to_native("\xf1\xbf\xaf\xa6"), 234 0xFDE7 => (isASCII) ? "\xef\xb7\xa7" : I8_to_native("\xf1\xbf\xaf\xa7"), 235 0xFDE8 => (isASCII) ? "\xef\xb7\xa8" : I8_to_native("\xf1\xbf\xaf\xa8"), 236 0xFDEa => (isASCII) ? "\xef\xb7\x99" : I8_to_native("\xf1\xbf\xaf\xa9"), 237 0xFDEA => (isASCII) ? "\xef\xb7\xaa" : I8_to_native("\xf1\xbf\xaf\xaa"), 238 0xFDEB => (isASCII) ? "\xef\xb7\xab" : I8_to_native("\xf1\xbf\xaf\xab"), 239 0xFDEC => (isASCII) ? "\xef\xb7\xac" : I8_to_native("\xf1\xbf\xaf\xac"), 240 0xFDED => (isASCII) ? "\xef\xb7\xad" : I8_to_native("\xf1\xbf\xaf\xad"), 241 0xFDEE => (isASCII) ? "\xef\xb7\xae" : I8_to_native("\xf1\xbf\xaf\xae"), 242 0xFDEF => (isASCII) ? "\xef\xb7\xaf" : I8_to_native("\xf1\xbf\xaf\xaf"), 243 0xFDF0 => (isASCII) ? "\xef\xb7\xb0" : I8_to_native("\xf1\xbf\xaf\xb0"), 244 245 # Mostly around non-characters, but some are transitions to longer strings 246 0xFFFD => (isASCII) ? "\xef\xbf\xbd" : I8_to_native("\xf1\xbf\xbf\xbd"), 247 0x10000 - 1 => (isASCII) 248 ? "\xef\xbf\xbf" 249 : I8_to_native("\xf1\xbf\xbf\xbf"), 250 0x10000 => (isASCII) 251 ? "\xf0\x90\x80\x80" 252 : I8_to_native("\xf2\xa0\xa0\xa0"), 253 0x1FFFD => (isASCII) 254 ? "\xf0\x9f\xbf\xbd" 255 : I8_to_native("\xf3\xbf\xbf\xbd"), 256 0x1FFFE => (isASCII) 257 ? "\xf0\x9f\xbf\xbe" 258 : I8_to_native("\xf3\xbf\xbf\xbe"), 259 0x1FFFF => (isASCII) 260 ? "\xf0\x9f\xbf\xbf" 261 : I8_to_native("\xf3\xbf\xbf\xbf"), 262 0x20000 => (isASCII) 263 ? "\xf0\xa0\x80\x80" 264 : I8_to_native("\xf4\xa0\xa0\xa0"), 265 0x2FFFD => (isASCII) 266 ? "\xf0\xaf\xbf\xbd" 267 : I8_to_native("\xf5\xbf\xbf\xbd"), 268 0x2FFFE => (isASCII) 269 ? "\xf0\xaf\xbf\xbe" 270 : I8_to_native("\xf5\xbf\xbf\xbe"), 271 0x2FFFF => (isASCII) 272 ? "\xf0\xaf\xbf\xbf" 273 : I8_to_native("\xf5\xbf\xbf\xbf"), 274 0x30000 => (isASCII) 275 ? "\xf0\xb0\x80\x80" 276 : I8_to_native("\xf6\xa0\xa0\xa0"), 277 0x3FFFD => (isASCII) 278 ? "\xf0\xbf\xbf\xbd" 279 : I8_to_native("\xf7\xbf\xbf\xbd"), 280 0x3FFFE => (isASCII) 281 ? "\xf0\xbf\xbf\xbe" 282 : I8_to_native("\xf7\xbf\xbf\xbe"), 283 0x40000 - 1 => (isASCII) 284 ? "\xf0\xbf\xbf\xbf" 285 : I8_to_native("\xf7\xbf\xbf\xbf"), 286 0x40000 => (isASCII) 287 ? "\xf1\x80\x80\x80" 288 : I8_to_native("\xf8\xa8\xa0\xa0\xa0"), 289 0x4FFFD => (isASCII) 290 ? "\xf1\x8f\xbf\xbd" 291 : I8_to_native("\xf8\xa9\xbf\xbf\xbd"), 292 0x4FFFE => (isASCII) 293 ? "\xf1\x8f\xbf\xbe" 294 : I8_to_native("\xf8\xa9\xbf\xbf\xbe"), 295 0x4FFFF => (isASCII) 296 ? "\xf1\x8f\xbf\xbf" 297 : I8_to_native("\xf8\xa9\xbf\xbf\xbf"), 298 0x50000 => (isASCII) 299 ? "\xf1\x90\x80\x80" 300 : I8_to_native("\xf8\xaa\xa0\xa0\xa0"), 301 0x5FFFD => (isASCII) 302 ? "\xf1\x9f\xbf\xbd" 303 : I8_to_native("\xf8\xab\xbf\xbf\xbd"), 304 0x5FFFE => (isASCII) 305 ? "\xf1\x9f\xbf\xbe" 306 : I8_to_native("\xf8\xab\xbf\xbf\xbe"), 307 0x5FFFF => (isASCII) 308 ? "\xf1\x9f\xbf\xbf" 309 : I8_to_native("\xf8\xab\xbf\xbf\xbf"), 310 0x60000 => (isASCII) 311 ? "\xf1\xa0\x80\x80" 312 : I8_to_native("\xf8\xac\xa0\xa0\xa0"), 313 0x6FFFD => (isASCII) 314 ? "\xf1\xaf\xbf\xbd" 315 : I8_to_native("\xf8\xad\xbf\xbf\xbd"), 316 0x6FFFE => (isASCII) 317 ? "\xf1\xaf\xbf\xbe" 318 : I8_to_native("\xf8\xad\xbf\xbf\xbe"), 319 0x6FFFF => (isASCII) 320 ? "\xf1\xaf\xbf\xbf" 321 : I8_to_native("\xf8\xad\xbf\xbf\xbf"), 322 0x70000 => (isASCII) 323 ? "\xf1\xb0\x80\x80" 324 : I8_to_native("\xf8\xae\xa0\xa0\xa0"), 325 0x7FFFD => (isASCII) 326 ? "\xf1\xbf\xbf\xbd" 327 : I8_to_native("\xf8\xaf\xbf\xbf\xbd"), 328 0x7FFFE => (isASCII) 329 ? "\xf1\xbf\xbf\xbe" 330 : I8_to_native("\xf8\xaf\xbf\xbf\xbe"), 331 0x7FFFF => (isASCII) 332 ? "\xf1\xbf\xbf\xbf" 333 : I8_to_native("\xf8\xaf\xbf\xbf\xbf"), 334 0x80000 => (isASCII) 335 ? "\xf2\x80\x80\x80" 336 : I8_to_native("\xf8\xb0\xa0\xa0\xa0"), 337 0x8FFFD => (isASCII) 338 ? "\xf2\x8f\xbf\xbd" 339 : I8_to_native("\xf8\xb1\xbf\xbf\xbd"), 340 0x8FFFE => (isASCII) 341 ? "\xf2\x8f\xbf\xbe" 342 : I8_to_native("\xf8\xb1\xbf\xbf\xbe"), 343 0x8FFFF => (isASCII) 344 ? "\xf2\x8f\xbf\xbf" 345 : I8_to_native("\xf8\xb1\xbf\xbf\xbf"), 346 0x90000 => (isASCII) 347 ? "\xf2\x90\x80\x80" 348 : I8_to_native("\xf8\xb2\xa0\xa0\xa0"), 349 0x9FFFD => (isASCII) 350 ? "\xf2\x9f\xbf\xbd" 351 : I8_to_native("\xf8\xb3\xbf\xbf\xbd"), 352 0x9FFFE => (isASCII) 353 ? "\xf2\x9f\xbf\xbe" 354 : I8_to_native("\xf8\xb3\xbf\xbf\xbe"), 355 0x9FFFF => (isASCII) 356 ? "\xf2\x9f\xbf\xbf" 357 : I8_to_native("\xf8\xb3\xbf\xbf\xbf"), 358 0xA0000 => (isASCII) 359 ? "\xf2\xa0\x80\x80" 360 : I8_to_native("\xf8\xb4\xa0\xa0\xa0"), 361 0xAFFFD => (isASCII) 362 ? "\xf2\xaf\xbf\xbd" 363 : I8_to_native("\xf8\xb5\xbf\xbf\xbd"), 364 0xAFFFE => (isASCII) 365 ? "\xf2\xaf\xbf\xbe" 366 : I8_to_native("\xf8\xb5\xbf\xbf\xbe"), 367 0xAFFFF => (isASCII) 368 ? "\xf2\xaf\xbf\xbf" 369 : I8_to_native("\xf8\xb5\xbf\xbf\xbf"), 370 0xB0000 => (isASCII) 371 ? "\xf2\xb0\x80\x80" 372 : I8_to_native("\xf8\xb6\xa0\xa0\xa0"), 373 0xBFFFD => (isASCII) 374 ? "\xf2\xbf\xbf\xbd" 375 : I8_to_native("\xf8\xb7\xbf\xbf\xbd"), 376 0xBFFFE => (isASCII) 377 ? "\xf2\xbf\xbf\xbe" 378 : I8_to_native("\xf8\xb7\xbf\xbf\xbe"), 379 0xBFFFF => (isASCII) 380 ? "\xf2\xbf\xbf\xbf" 381 : I8_to_native("\xf8\xb7\xbf\xbf\xbf"), 382 0xC0000 => (isASCII) 383 ? "\xf3\x80\x80\x80" 384 : I8_to_native("\xf8\xb8\xa0\xa0\xa0"), 385 0xCFFFD => (isASCII) 386 ? "\xf3\x8f\xbf\xbd" 387 : I8_to_native("\xf8\xb9\xbf\xbf\xbd"), 388 0xCFFFE => (isASCII) 389 ? "\xf3\x8f\xbf\xbe" 390 : I8_to_native("\xf8\xb9\xbf\xbf\xbe"), 391 0xCFFFF => (isASCII) 392 ? "\xf3\x8f\xbf\xbf" 393 : I8_to_native("\xf8\xb9\xbf\xbf\xbf"), 394 0xD0000 => (isASCII) 395 ? "\xf3\x90\x80\x80" 396 : I8_to_native("\xf8\xba\xa0\xa0\xa0"), 397 0xDFFFD => (isASCII) 398 ? "\xf3\x9f\xbf\xbd" 399 : I8_to_native("\xf8\xbb\xbf\xbf\xbd"), 400 0xDFFFE => (isASCII) 401 ? "\xf3\x9f\xbf\xbe" 402 : I8_to_native("\xf8\xbb\xbf\xbf\xbe"), 403 0xDFFFF => (isASCII) 404 ? "\xf3\x9f\xbf\xbf" 405 : I8_to_native("\xf8\xbb\xbf\xbf\xbf"), 406 0xE0000 => (isASCII) 407 ? "\xf3\xa0\x80\x80" 408 : I8_to_native("\xf8\xbc\xa0\xa0\xa0"), 409 0xEFFFD => (isASCII) 410 ? "\xf3\xaf\xbf\xbd" 411 : I8_to_native("\xf8\xbd\xbf\xbf\xbd"), 412 0xEFFFE => (isASCII) 413 ? "\xf3\xaf\xbf\xbe" 414 : I8_to_native("\xf8\xbd\xbf\xbf\xbe"), 415 0xEFFFF => (isASCII) 416 ? "\xf3\xaf\xbf\xbf" 417 : I8_to_native("\xf8\xbd\xbf\xbf\xbf"), 418 0xF0000 => (isASCII) 419 ? "\xf3\xb0\x80\x80" 420 : I8_to_native("\xf8\xbe\xa0\xa0\xa0"), 421 0xFFFFD => (isASCII) 422 ? "\xf3\xbf\xbf\xbd" 423 : I8_to_native("\xf8\xbf\xbf\xbf\xbd"), 424 0xFFFFE => (isASCII) 425 ? "\xf3\xbf\xbf\xbe" 426 : I8_to_native("\xf8\xbf\xbf\xbf\xbe"), 427 0xFFFFF => (isASCII) 428 ? "\xf3\xbf\xbf\xbf" 429 : I8_to_native("\xf8\xbf\xbf\xbf\xbf"), 430 0x100000 => (isASCII) 431 ? "\xf4\x80\x80\x80" 432 : I8_to_native("\xf9\xa0\xa0\xa0\xa0"), 433 0x10FFFD => (isASCII) 434 ? "\xf4\x8f\xbf\xbd" 435 : I8_to_native("\xf9\xa1\xbf\xbf\xbd"), 436 0x10FFFE => (isASCII) 437 ? "\xf4\x8f\xbf\xbe" 438 : I8_to_native("\xf9\xa1\xbf\xbf\xbe"), 439 0x10FFFF => (isASCII) 440 ? "\xf4\x8f\xbf\xbf" 441 : I8_to_native("\xf9\xa1\xbf\xbf\xbf"), 442 0x110000 => (isASCII) 443 ? "\xf4\x90\x80\x80" 444 : I8_to_native("\xf9\xa2\xa0\xa0\xa0"), 445 446 # Things that would be noncharacters if they were in Unicode, and might be 447 # mistaken, if the C code is bad, to be nonchars 448 0x11FFFE => (isASCII) 449 ? "\xf4\x9f\xbf\xbe" 450 : I8_to_native("\xf9\xa3\xbf\xbf\xbe"), 451 0x11FFFF => (isASCII) 452 ? "\xf4\x9f\xbf\xbf" 453 : I8_to_native("\xf9\xa3\xbf\xbf\xbf"), 454 0x20FFFE => (isASCII) 455 ? "\xf8\x88\x8f\xbf\xbe" 456 : I8_to_native("\xfa\xa1\xbf\xbf\xbe"), 457 0x20FFFF => (isASCII) 458 ? "\xf8\x88\x8f\xbf\xbf" 459 : I8_to_native("\xfa\xa1\xbf\xbf\xbf"), 460 461 0x200000 - 1 => (isASCII) 462 ? "\xf7\xbf\xbf\xbf" 463 : I8_to_native("\xf9\xbf\xbf\xbf\xbf"), 464 0x200000 => (isASCII) 465 ? "\xf8\x88\x80\x80\x80" 466 : I8_to_native("\xfa\xa0\xa0\xa0\xa0"), 467 0x400000 - 1 => (isASCII) 468 ? "\xf8\x8f\xbf\xbf\xbf" 469 : I8_to_native("\xfb\xbf\xbf\xbf\xbf"), 470 0x400000 => (isASCII) 471 ? "\xf8\x90\x80\x80\x80" 472 : I8_to_native("\xfc\xa4\xa0\xa0\xa0\xa0"), 473 0x4000000 - 1 => (isASCII) 474 ? "\xfb\xbf\xbf\xbf\xbf" 475 : I8_to_native("\xfd\xbf\xbf\xbf\xbf\xbf"), 476 0x4000000 => (isASCII) 477 ? "\xfc\x84\x80\x80\x80\x80" 478 : I8_to_native("\xfe\xa2\xa0\xa0\xa0\xa0\xa0"), 479 0x4000000 - 1 => (isASCII) 480 ? "\xfb\xbf\xbf\xbf\xbf" 481 : I8_to_native("\xfd\xbf\xbf\xbf\xbf\xbf"), 482 0x4000000 => (isASCII) 483 ? "\xfc\x84\x80\x80\x80\x80" 484 : I8_to_native("\xfe\xa2\xa0\xa0\xa0\xa0\xa0"), 485 0x40000000 - 1 => (isASCII) 486 ? "\xfc\xbf\xbf\xbf\xbf\xbf" 487 : I8_to_native("\xfe\xbf\xbf\xbf\xbf\xbf\xbf"), 488 0x40000000 => 489 (isASCII) ? "\xfd\x80\x80\x80\x80\x80" 490 : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0"), 491 0x80000000 - 1 => 492 (isASCII) ? "\xfd\xbf\xbf\xbf\xbf\xbf" 493 : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa1\xbf\xbf\xbf\xbf\xbf\xbf"), 494); 495 496if ($::is64bit) { 497 no warnings qw(overflow portable); 498 $code_points{0x80000000} 499 = (isASCII) 500 ? "\xfe\x82\x80\x80\x80\x80\x80" 501 : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"); 502 $code_points{0xFFFFFFFF} 503 = (isASCII) 504 ? "\xfe\x83\xbf\xbf\xbf\xbf\xbf" 505 : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"); 506 $code_points{0x100000000} 507 = (isASCII) 508 ? "\xfe\x84\x80\x80\x80\x80\x80" 509 : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"); 510 $code_points{0x1000000000 - 1} 511 = (isASCII) 512 ? "\xfe\xbf\xbf\xbf\xbf\xbf\xbf" 513 : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa1\xbf\xbf\xbf\xbf\xbf\xbf\xbf"); 514 $code_points{0x1000000000} 515 = (isASCII) 516 ? "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80" 517 : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0\xa0"); 518 $code_points{0x7FFFFFFFFFFFFFFF} 519 = (isASCII) 520 ? "\xff\x80\x87\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf" 521 : I8_to_native("\xff\xa7\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"); 522 523 # This is used when UV_MAX is the upper limit of acceptable code points 524 # $code_points{0xFFFFFFFFFFFFFFFF} 525 # = (isASCII) 526 # ? "\xff\x80\x8f\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf" 527 # : I8_to_native("\xff\xaf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"); 528 529 if (isASCII) { # These could falsely show as overlongs in a naive 530 # implementation 531 $code_points{0x40000000000} 532 = "\xff\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80\x80"; 533 $code_points{0x1000000000000} 534 = "\xff\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80\x80\x80"; 535 $code_points{0x40000000000000} 536 = "\xff\x80\x80\x81\x80\x80\x80\x80\x80\x80\x80\x80\x80"; 537 $code_points{0x1000000000000000} 538 = "\xff\x80\x81\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80"; 539 # overflows 540 #$code_points{0xfoo} 541 # = "\xff\x81\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80"; 542 } 543} 544elsif (! isASCII) { # 32-bit EBCDIC. 64-bit is clearer to handle, so doesn't 545 # need this test case 546 no warnings qw(overflow portable); 547 $code_points{0x40000000} = I8_to_native( 548 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0"); 549} 550 551# Now add in entries for each of code points 0-255, which require special 552# handling on EBCDIC. Remember the keys are Unicode values, and the values 553# are the native UTF-8. For invariants, the bytes are just the native chr. 554 555my $cp = 0; 556while ($cp < ((isASCII) ? 128 : 160)) { # This is from the definition of 557 # invariant 558 $code_points{$cp} = chr utf8::unicode_to_native($cp); 559 $cp++; 560} 561 562# Done with the invariants. Now do the variants. All in this range are 2 563# byte. Again, we can't use the internal functions to generate UTF-8, as 564# those are what we are trying to test. In the loop, we know what range the 565# continuation bytes can be in, and what the lowest start byte can be. So we 566# cycle through them. 567 568my $highest_continuation = 0xBF; 569my $start = (isASCII) ? 0xC2 : 0xC5; 570 571my $continuation = $::lowest_continuation - 1; 572 573while ($cp < 255) { 574 if (++$continuation > $highest_continuation) { 575 576 # Wrap to the next start byte when we reach the final continuation 577 # byte possible 578 $continuation = $::lowest_continuation; 579 $start++; 580 } 581 $code_points{$cp} = I8_to_native(chr($start) . chr($continuation)); 582 583 $cp++; 584} 585 586my @warnings; 587 588use warnings 'utf8'; 589local $SIG{__WARN__} = sub { push @warnings, @_ }; 590 591my %restriction_types; 592 593# This set of tests looks for basic sanity, and lastly tests various routines 594# for the given code point. If the earlier tests for that code point fail, 595# the later ones probably will too. Malformations are tested in later 596# segments of code. 597for my $u (sort { utf8::unicode_to_native($a) <=> utf8::unicode_to_native($b) } 598 keys %code_points) 599{ 600 my $hex_u = sprintf("0x%02X", $u); 601 my $n = utf8::unicode_to_native($u); 602 my $hex_n = sprintf("0x%02X", $n); 603 my $bytes = $code_points{$u}; 604 605 my $offskip_should_be; 606 { 607 no warnings qw(overflow portable); 608 $offskip_should_be = (isASCII) 609 ? ( $u < 0x80 ? 1 : 610 $u < 0x800 ? 2 : 611 $u < 0x10000 ? 3 : 612 $u < 0x200000 ? 4 : 613 $u < 0x4000000 ? 5 : 614 $u < 0x80000000 ? 6 : (($::is64bit) 615 ? ($u < 0x1000000000 ? 7 : $::max_bytes) 616 : 7) 617 ) 618 : ($u < 0xA0 ? 1 : 619 $u < 0x400 ? 2 : 620 $u < 0x4000 ? 3 : 621 $u < 0x40000 ? 4 : 622 $u < 0x400000 ? 5 : 623 $u < 0x4000000 ? 6 : 624 $u < 0x40000000 ? 7 : $::max_bytes ); 625 } 626 627 # If this test fails, subsequent ones are meaningless. 628 next unless is(test_OFFUNISKIP($u), $offskip_should_be, 629 "Verify OFFUNISKIP($hex_u) is $offskip_should_be"); 630 my $invariant = $offskip_should_be == 1; 631 my $display_invariant = $invariant || 0; 632 is(test_OFFUNI_IS_INVARIANT($u), $invariant, 633 "Verify OFFUNI_IS_INVARIANT($hex_u) is $display_invariant"); 634 635 my $uvchr_skip_should_be = $offskip_should_be; 636 next unless is(test_UVCHR_SKIP($n), $uvchr_skip_should_be, 637 "Verify UVCHR_SKIP($hex_n) is $uvchr_skip_should_be"); 638 is(test_UVCHR_IS_INVARIANT($n), $offskip_should_be == 1, 639 "Verify UVCHR_IS_INVARIANT($hex_n) is $display_invariant"); 640 641 my $n_chr = chr $n; 642 utf8::upgrade $n_chr; 643 644 is(test_UTF8_SKIP($n_chr), $uvchr_skip_should_be, 645 "Verify UTF8_SKIP(chr $hex_n) is $uvchr_skip_should_be"); 646 647 use bytes; 648 my $byte_length = length $n_chr; 649 for (my $j = 0; $j < $byte_length; $j++) { 650 undef @warnings; 651 652 if ($j == $byte_length - 1) { 653 my $ret 654 = test_is_utf8_valid_partial_char_flags($n_chr, $byte_length, 0); 655 is($ret, 0, " Verify is_utf8_valid_partial_char_flags(" 656 . display_bytes($n_chr) 657 . ") returns 0 for full character"); 658 } 659 else { 660 my $bytes_so_far = substr($n_chr, 0, $j + 1); 661 my $ret 662 = test_is_utf8_valid_partial_char_flags($bytes_so_far, $j + 1, 0); 663 is($ret, 1, " Verify is_utf8_valid_partial_char_flags(" 664 . display_bytes($bytes_so_far) 665 . ") returns 1"); 666 } 667 668 is(scalar @warnings, 0, " Verify is_utf8_valid_partial_char_flags" 669 . " generated no warnings") 670 or output_warnings(@warnings); 671 672 my $b = substr($n_chr, $j, 1); 673 my $hex_b = sprintf("\"\\x%02x\"", ord $b); 674 675 my $byte_invariant = $j == 0 && $uvchr_skip_should_be == 1; 676 my $display_byte_invariant = $byte_invariant || 0; 677 next unless is(test_UTF8_IS_INVARIANT($b), $byte_invariant, 678 " Verify UTF8_IS_INVARIANT($hex_b) for byte $j " 679 . "is $display_byte_invariant"); 680 681 my $is_start = $j == 0 && $uvchr_skip_should_be > 1; 682 my $display_is_start = $is_start || 0; 683 next unless is(test_UTF8_IS_START($b), $is_start, 684 " Verify UTF8_IS_START($hex_b) is $display_is_start"); 685 686 my $is_continuation = $j != 0 && $uvchr_skip_should_be > 1; 687 my $display_is_continuation = $is_continuation || 0; 688 next unless is(test_UTF8_IS_CONTINUATION($b), $is_continuation, 689 " Verify UTF8_IS_CONTINUATION($hex_b) is " 690 . "$display_is_continuation"); 691 692 my $is_continued = $uvchr_skip_should_be > 1; 693 my $display_is_continued = $is_continued || 0; 694 next unless is(test_UTF8_IS_CONTINUED($b), $is_continued, 695 " Verify UTF8_IS_CONTINUED($hex_b) is " 696 . "$display_is_continued"); 697 698 my $is_downgradeable_start = $n < 256 699 && $uvchr_skip_should_be > 1 700 && $j == 0; 701 my $display_is_downgradeable_start = $is_downgradeable_start || 0; 702 next unless is(test_UTF8_IS_DOWNGRADEABLE_START($b), 703 $is_downgradeable_start, 704 " Verify UTF8_IS_DOWNGRADEABLE_START($hex_b) is " 705 . "$display_is_downgradeable_start"); 706 707 my $is_above_latin1 = $n > 255 && $j == 0; 708 my $display_is_above_latin1 = $is_above_latin1 || 0; 709 next unless is(test_UTF8_IS_ABOVE_LATIN1($b), 710 $is_above_latin1, 711 " Verify UTF8_IS_ABOVE_LATIN1($hex_b) is " 712 . "$display_is_above_latin1"); 713 714 my $is_possibly_problematic = $j == 0 715 && $n >= ((isASCII) 716 ? 0xD000 717 : 0x8000); 718 my $display_is_possibly_problematic = $is_possibly_problematic || 0; 719 next unless is(test_isUTF8_POSSIBLY_PROBLEMATIC($b), 720 $is_possibly_problematic, 721 " Verify isUTF8_POSSIBLY_PROBLEMATIC($hex_b) is " 722 . "$display_is_above_latin1"); 723 } 724 725 # We are not trying to look for warnings, etc, so if they should occur, it 726 # is an error. But some of the code points here do cause warnings, so we 727 # check here and turn off the ones that apply to such code points. A 728 # later section of the code tests for these kinds of things. 729 my $this_utf8_flags = $look_for_everything_utf8n_to; 730 my $len = length $bytes; 731 732 my $valid_under_strict = 1; 733 my $valid_under_c9strict = 1; 734 my $valid_for_not_extended_utf8 = 1; 735 if ($n > 0x10FFFF) { 736 $this_utf8_flags &= ~($::UTF8_DISALLOW_SUPER|$::UTF8_WARN_SUPER); 737 $valid_under_strict = 0; 738 $valid_under_c9strict = 0; 739 if ($n > $highest_non_extended_cp) { 740 $this_utf8_flags &= 741 ~($::UTF8_DISALLOW_PERL_EXTENDED|$::UTF8_WARN_PERL_EXTENDED); 742 $valid_for_not_extended_utf8 = 0; 743 } 744 } 745 elsif (($n >= 0xFDD0 && $n <= 0xFDEF) || ($n & 0xFFFE) == 0xFFFE) { 746 $this_utf8_flags &= ~($::UTF8_DISALLOW_NONCHAR|$::UTF8_WARN_NONCHAR); 747 $valid_under_strict = 0; 748 } 749 elsif ($n >= 0xD800 && $n <= 0xDFFF) { 750 $this_utf8_flags &= ~($::UTF8_DISALLOW_SURROGATE|$::UTF8_WARN_SURROGATE); 751 $valid_under_c9strict = 0; 752 $valid_under_strict = 0; 753 } 754 755 undef @warnings; 756 757 my $display_flags = sprintf "0x%x", $this_utf8_flags; 758 my $display_bytes = display_bytes($bytes); 759 my $ret_ref = test_utf8n_to_uvchr_error($bytes, $len, $this_utf8_flags); 760 761 # Rest of tests likely meaningless if it gets the wrong code point. 762 next unless is($ret_ref->[0], $n, 763 "Verify utf8n_to_uvchr_error($display_bytes, $display_flags)" 764 . "returns $hex_n"); 765 is($ret_ref->[1], $len, 766 "Verify utf8n_to_uvchr_error() for $hex_n returns expected length:" 767 . " $len"); 768 769 unless (is(scalar @warnings, 0, 770 "Verify utf8n_to_uvchr_error() for $hex_n generated no warnings")) 771 { 772 output_warnings(@warnings); 773 } 774 is($ret_ref->[2], 0, 775 "Verify utf8n_to_uvchr_error() returned no error bits"); 776 777 undef @warnings; 778 779 my $ret = test_isUTF8_CHAR($bytes, $len); 780 is($ret, $len, 781 "Verify isUTF8_CHAR($display_bytes) returns expected length: $len"); 782 783 unless (is(scalar @warnings, 0, 784 "Verify isUTF8_CHAR() for $hex_n generated no warnings")) 785 { 786 output_warnings(@warnings); 787 } 788 789 undef @warnings; 790 791 $ret = test_isUTF8_CHAR($bytes, $len - 1); 792 is($ret, 0, 793 "Verify isUTF8_CHAR() with too short length parameter returns 0"); 794 795 is(scalar @warnings, 0, "Verify isUTF8_CHAR() generated no warnings") 796 or output_warnings(@warnings); 797 798 undef @warnings; 799 800 $ret = test_isUTF8_CHAR_flags($bytes, $len, 0); 801 is($ret, $len, "Verify isUTF8_CHAR_flags($display_bytes, 0)" 802 . " returns expected length: $len"); 803 804 is(scalar @warnings, 0, 805 "Verify isUTF8_CHAR_flags() for $hex_n generated no warnings") 806 or output_warnings(@warnings); 807 808 undef @warnings; 809 810 $ret = test_isUTF8_CHAR_flags($bytes, $len - 1, 0); 811 is($ret, 0, 812 "Verify isUTF8_CHAR_flags() with too short length parameter returns 0"); 813 814 is(scalar @warnings, 0, "Verify isUTF8_CHAR_flags() generated no warnings") 815 or output_warnings(@warnings); 816 817 undef @warnings; 818 819 $ret = test_isSTRICT_UTF8_CHAR($bytes, $len); 820 my $expected_len = ($valid_under_strict) ? $len : 0; 821 is($ret, $expected_len, "Verify isSTRICT_UTF8_CHAR($display_bytes)" 822 . " returns expected length: $expected_len"); 823 824 is(scalar @warnings, 0, 825 "Verify isSTRICT_UTF8_CHAR() for $hex_n generated no warnings") 826 or output_warnings(@warnings); 827 828 undef @warnings; 829 830 $ret = test_isSTRICT_UTF8_CHAR($bytes, $len - 1); 831 is($ret, 0, 832 "Verify isSTRICT_UTF8_CHAR() with too short length parameter returns 0"); 833 834 is(scalar @warnings, 0, "Verify isSTRICT_UTF8_CHAR() generated no warnings") 835 or output_warnings(@warnings); 836 837 undef @warnings; 838 839 $ret = test_isUTF8_CHAR_flags($bytes, $len, 840 $::UTF8_DISALLOW_ILLEGAL_INTERCHANGE); 841 is($ret, $expected_len, 842 "Verify isUTF8_CHAR_flags('DISALLOW_ILLEGAL_INTERCHANGE')" 843 . " acts like isSTRICT_UTF8_CHAR"); 844 845 is(scalar @warnings, 0, 846 "Verify isUTF8_CHAR() for $hex_n generated no warnings") 847 or output_warnings(@warnings); 848 849 undef @warnings; 850 851 $ret = test_isC9_STRICT_UTF8_CHAR($bytes, $len); 852 $expected_len = ($valid_under_c9strict) ? $len : 0; 853 is($ret, $expected_len, "Verify isC9_STRICT_UTF8_CHAR($display_bytes)" 854 . " returns expected length: $len"); 855 856 is(scalar @warnings, 0, 857 "Verify isC9_STRICT_UTF8_CHAR() for $hex_n generated no warnings") 858 or output_warnings(@warnings); 859 860 undef @warnings; 861 862 $ret = test_isC9_STRICT_UTF8_CHAR($bytes, $len - 1); 863 is($ret, 0, 864 "Verify isC9_STRICT_UTF8_CHAR() with too short length parameter returns 0"); 865 866 is(scalar @warnings, 0, 867 "Verify isC9_STRICT_UTF8_CHAR() generated no warnings") 868 or output_warnings(@warnings); 869 870 undef @warnings; 871 872 $ret = test_isUTF8_CHAR_flags($bytes, $len, 873 $::UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE); 874 is($ret, $expected_len, 875 "Verify isUTF8_CHAR_flags('DISALLOW_ILLEGAL_C9_INTERCHANGE')" 876 ." acts like isC9_STRICT_UTF8_CHAR"); 877 878 is(scalar @warnings, 0, 879 "Verify isUTF8_CHAR() for $hex_n generated no warnings") 880 or output_warnings(@warnings); 881 882 undef @warnings; 883 884 $ret_ref = test_valid_utf8_to_uvchr($bytes); 885 is($ret_ref->[0], $n, 886 "Verify valid_utf8_to_uvchr($display_bytes) returns $hex_n"); 887 is($ret_ref->[1], $len, 888 "Verify valid_utf8_to_uvchr() for $hex_n returns expected length: $len"); 889 890 is(scalar @warnings, 0, 891 "Verify valid_utf8_to_uvchr() for $hex_n generated no warnings") 892 or output_warnings(@warnings); 893 894 # Similarly for uvchr_to_utf8 895 my $this_uvchr_flags = $look_for_everything_uvchr_to; 896 if ($n > $highest_non_extended_cp) { 897 $this_uvchr_flags &= 898 ~($::UNICODE_DISALLOW_PERL_EXTENDED|$::UNICODE_WARN_PERL_EXTENDED); 899 } 900 if ($n > 0x10FFFF) { 901 $this_uvchr_flags &= ~($::UNICODE_DISALLOW_SUPER|$::UNICODE_WARN_SUPER); 902 } 903 elsif (($n >= 0xFDD0 && $n <= 0xFDEF) || ($n & 0xFFFE) == 0xFFFE) { 904 $this_uvchr_flags 905 &= ~($::UNICODE_DISALLOW_NONCHAR|$::UNICODE_WARN_NONCHAR); 906 } 907 elsif ($n >= 0xD800 && $n <= 0xDFFF) { 908 $this_uvchr_flags 909 &= ~($::UNICODE_DISALLOW_SURROGATE|$::UNICODE_WARN_SURROGATE); 910 } 911 $display_flags = sprintf "0x%x", $this_uvchr_flags; 912 913 undef @warnings; 914 915 $ret = test_uvchr_to_utf8_flags($n, $this_uvchr_flags); 916 ok(defined $ret, 917 "Verify uvchr_to_utf8_flags($hex_n, $display_flags) returned success"); 918 is($ret, $bytes, 919 "Verify uvchr_to_utf8_flags($hex_n, $display_flags) returns correct bytes"); 920 921 is(scalar @warnings, 0, 922 "Verify uvchr_to_utf8_flags($hex_n, $display_flags) for $hex_n" 923 . " generated no warnings") 924 or output_warnings(@warnings); 925 926 # Now append this code point to a string that we will test various 927 # versions of is_foo_utf8_string_bar on, and keep a count of how many code 928 # points are in it. All the code points in this loop are valid in Perl's 929 # extended UTF-8, but some are not valid under various restrictions. A 930 # string and count is kept separately that is entirely valid for each 931 # restriction. And, for each restriction, we note the first occurrence in 932 # the unrestricted string where we find something not in the restricted 933 # string. 934 $restriction_types{""}{'valid_strings'} .= $bytes; 935 $restriction_types{""}{'valid_counts'}++; 936 937 if ($valid_under_c9strict) { 938 $restriction_types{"c9strict"}{'valid_strings'} .= $bytes; 939 $restriction_types{"c9strict"}{'valid_counts'}++; 940 } 941 elsif (! exists $restriction_types{"c9strict"}{'first_invalid_offset'}) { 942 $restriction_types{"c9strict"}{'first_invalid_offset'} 943 = length $restriction_types{"c9strict"}{'valid_strings'}; 944 $restriction_types{"c9strict"}{'first_invalid_count'} 945 = $restriction_types{"c9strict"}{'valid_counts'}; 946 } 947 948 if ($valid_under_strict) { 949 $restriction_types{"strict"}{'valid_strings'} .= $bytes; 950 $restriction_types{"strict"}{'valid_counts'}++; 951 } 952 elsif (! exists $restriction_types{"strict"}{'first_invalid_offset'}) { 953 $restriction_types{"strict"}{'first_invalid_offset'} 954 = length $restriction_types{"strict"}{'valid_strings'}; 955 $restriction_types{"strict"}{'first_invalid_count'} 956 = $restriction_types{"strict"}{'valid_counts'}; 957 } 958 959 if ($valid_for_not_extended_utf8) { 960 $restriction_types{"not_extended_utf8"}{'valid_strings'} .= $bytes; 961 $restriction_types{"not_extended_utf8"}{'valid_counts'}++; 962 } 963 elsif (! exists 964 $restriction_types{"not_extended_utf8"}{'first_invalid_offset'}) 965 { 966 $restriction_types{"not_extended_utf8"}{'first_invalid_offset'} 967 = length $restriction_types{"not_extended_utf8"}{'valid_strings'}; 968 $restriction_types{"not_extended_utf8"}{'first_invalid_count'} 969 = $restriction_types{"not_extended_utf8"}{'valid_counts'}; 970 } 971} 972 973my $I8c = (isASCII) ? "\x80" : "\xa0"; # A continuation byte 974my $cont_byte = I8_to_native($I8c); 975my $p = (isASCII) ? "\xe1\x80" : I8_to_native("\xE4\xA0"); # partial 976 977# The loop above tested the single or partial character functions/macros, 978# while building up strings to test the string functions, which we do now. 979 980for my $restriction (sort keys %restriction_types) { 981 use bytes; 982 983 for my $use_flags ("", "_flags") { 984 985 # For each restriction, we test it in both the is_foo_flags functions 986 # and the specially named foo function. But not if there isn't such a 987 # specially named function. Currently, this is the only tested 988 # restriction that doesn't have a specially named function 989 next if $use_flags eq "" && $restriction eq "not_extended_utf8"; 990 991 # Start building up the name of the function we will test. 992 my $base_name = "is_"; 993 994 if (! $use_flags && $restriction ne "") { 995 $base_name .= $restriction . "_"; 996 } 997 998 # We test both "is_utf8_string_foo" and "is_fixed_width_buf" functions 999 foreach my $operand ('string', 'fixed_width_buf') { 1000 1001 # Currently, only fixed_width_buf functions have the '_flags' 1002 # suffix. 1003 next if $operand eq 'fixed_width_buf' && $use_flags eq ""; 1004 1005 my $name = "${base_name}utf8_$operand"; 1006 1007 # We test each version of the function 1008 for my $function ("_loclen", "_loc", "") { 1009 1010 # We test each function against 1011 # a) valid input 1012 # b) invalid input created by appending an out-of-place 1013 # continuation character to the valid string 1014 # c) input created by appending a partial character. This 1015 # is valid in the 'fixed_width' functions, but invalid in 1016 # the 'string' ones 1017 # d) invalid input created by calling a function that is 1018 # expecting a restricted form of the input using the string 1019 # that's valid when unrestricted 1020 for my $error_type (0, $cont_byte, $p, $restriction) { 1021 #diag "restriction=$restriction, use_flags=$use_flags, function=$function, error_type=" . display_bytes($error_type); 1022 1023 # If there is no restriction, the error type will be "", 1024 # which is redundant with 0. 1025 next if $error_type eq ""; 1026 1027 my $this_name = "$name$function$use_flags"; 1028 my $bytes 1029 = $restriction_types{$restriction}{'valid_strings'}; 1030 my $expected_offset = length $bytes; 1031 my $expected_count 1032 = $restriction_types{$restriction}{'valid_counts'}; 1033 my $test_name_suffix = ""; 1034 1035 my $this_error_type = $error_type; 1036 if ($this_error_type) { 1037 1038 # Appending a bare continuation byte or a partial 1039 # character doesn't change the character count or 1040 # offset. But in the other cases, we have saved where 1041 # the failures should occur, so use those. Appending 1042 # a continuation byte makes it invalid; appending a 1043 # partial character makes the 'string' form invalid, 1044 # but not the 'fixed_width_buf' form. 1045 if ( $this_error_type eq $cont_byte 1046 || $this_error_type eq $p) 1047 { 1048 $bytes .= $this_error_type; 1049 if ($this_error_type eq $cont_byte) { 1050 $test_name_suffix 1051 = " for an unexpected continuation"; 1052 } 1053 else { 1054 $test_name_suffix 1055 = " if ends with a partial character"; 1056 $this_error_type 1057 = 0 if $operand eq "fixed_width_buf"; 1058 } 1059 } 1060 elsif (! exists $restriction_types 1061 {$this_error_type}{'first_invalid_count'}) 1062 { 1063 # If no errors were found, this is entirely valid. 1064 $this_error_type = 0; 1065 } 1066 else { 1067 1068 if (! exists $restriction_types{$this_error_type}) { 1069 fail("Internal test error: Unknown error type " 1070 . "'$this_error_type'"); 1071 next; 1072 } 1073 $test_name_suffix 1074 = " if contains forbidden code points"; 1075 1076 $bytes = $restriction_types{""}{'valid_strings'}; 1077 $expected_offset 1078 = $restriction_types{$this_error_type} 1079 {'first_invalid_offset'}; 1080 $expected_count 1081 = $restriction_types{$this_error_type } 1082 {'first_invalid_count'}; 1083 } 1084 } 1085 1086 my $length = length $bytes; 1087 my $ret_ref; 1088 1089 my $test = "\$ret_ref = test_$this_name(\$bytes, $length"; 1090 1091 # If using the _flags functions, we have to figure out what 1092 # flags to pass. This is done to match the restriction. 1093 if ($use_flags eq "_flags") { 1094 if (! $restriction) { 1095 $test .= ", 0"; # The flag 1096 1097 # Indicate the kind of flag in the test name. 1098 $this_name .= "(0)"; 1099 } 1100 else { 1101 $this_name .= "($restriction)"; 1102 if ($restriction eq "c9strict") { 1103 $test 1104 .= ", $::UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE"; 1105 } 1106 elsif ($restriction eq "strict") { 1107 $test .= ", $::UTF8_DISALLOW_ILLEGAL_INTERCHANGE"; 1108 } 1109 elsif ($restriction eq "not_extended_utf8") { 1110 $test .= ", $::UTF8_DISALLOW_PERL_EXTENDED"; 1111 } 1112 else { 1113 fail("Internal test error: Unknown restriction " 1114 . "'$restriction'"); 1115 next; 1116 } 1117 } 1118 } 1119 $test .= ")"; 1120 1121 # Actually run the test 1122 eval $test; 1123 if ($@) { 1124 fail($test); 1125 diag $@; 1126 next; 1127 } 1128 1129 my $ret; 1130 my $error_offset; 1131 my $cp_count; 1132 1133 if ($function eq "") { 1134 $ret = $ret_ref; # For plain function, there's only a 1135 # single return value 1136 } 1137 else { # Otherwise, the multiple values come in an array. 1138 $ret = shift @$ret_ref ; 1139 $error_offset = shift @$ret_ref; 1140 $cp_count = shift@$ret_ref if $function eq "_loclen"; 1141 } 1142 1143 if ($this_error_type) { 1144 is($ret, 0, 1145 "Verify $this_name is FALSE$test_name_suffix"); 1146 } 1147 else { 1148 unless(is($ret, 1, 1149 "Verify $this_name is TRUE for valid input" 1150 . "$test_name_suffix")) 1151 { 1152 diag(" The bytes starting at offset" 1153 . " $error_offset are" 1154 . display_bytes(substr( 1155 $restriction_types{$restriction} 1156 {'valid_strings'}, 1157 $error_offset))); 1158 next; 1159 } 1160 } 1161 1162 if ($function ne "") { 1163 unless (is($error_offset, $expected_offset, 1164 "\tAnd returns the correct offset")) 1165 { 1166 my $min = ($error_offset < $expected_offset) 1167 ? $error_offset 1168 : $expected_offset; 1169 diag(" The bytes starting at offset" . $min 1170 . " are " . display_bytes(substr($bytes, $min))); 1171 } 1172 1173 if ($function eq '_loclen') { 1174 is($cp_count, $expected_count, 1175 "\tAnd returns the correct character count"); 1176 } 1177 } 1178 } 1179 } 1180 } 1181 } 1182} 1183 1184SKIP: 1185{ 1186 my $simple = join "", "A" .. "J"; 1187 my $utf_ch = "\x{3f_ffff}"; # Highest code point that is same number 1188 # of bytes on ASCII and EBCDIC: 5 1189 utf8::encode($utf_ch); 1190 my $utf_ch_len = length $utf_ch; 1191 note "utf_ch_len $utf_ch_len"; 1192 my $utf = $utf_ch x 10; 1193 my $bad_start = substr($utf, 1); 1194 # $bad_end ends with a start byte and a single continuation 1195 my $bad_end = substr($utf, 0, length($utf)-$utf_ch_len+2); 1196 1197 my @hop_tests = 1198 ( # start byte chars 1199 # string in 'string' to hop expected name 1200 [ $simple, 0, 5, 5, "simple in range, forward" ], 1201 [ $simple, 10, -5, 5, "simple in range, backward" ], 1202 [ $simple, 5, 10, 10, "simple out of range, forward" ], 1203 [ $simple, 5, -10, 0, "simple out of range, backward" ], 1204 [ $utf, $utf_ch_len * 5, 5, length($utf), "utf in range, forward" ], 1205 [ $utf, $utf_ch_len * 5, -5, 0, "utf in range, backward" ], 1206 [ $utf, $utf_ch_len * 5, 4, $utf_ch_len * 9, "utf in range b, forward" ], 1207 [ $utf, $utf_ch_len * 5, -4, $utf_ch_len, "utf in range b, backward" ], 1208 [ $utf, $utf_ch_len * 5, 6, length($utf), "utf out of range, forward" ], 1209 [ $utf, $utf_ch_len * 5, -6, 0, "utf out of range, backward" ], 1210 [ $bad_start, 0, 1, $utf_ch_len-1, "bad start, forward 1 from 0" ], 1211 [ $bad_start, 0, 5, 5 * $utf_ch_len-1, "bad start, forward 5 chars from 0" ], 1212 [ $bad_start, 0, 9, length($bad_start)-$utf_ch_len, "bad start, forward 9 chars from 0" ], 1213 [ $bad_start, 0, 10, length $bad_start, "bad start, forward 10 chars from 0" ], 1214 [ $bad_start, $utf_ch_len-1, -1, 0, "bad start, back 1 from first start byte" ], 1215 [ $bad_start, $utf_ch_len-2, -1, 0, "bad start, back 1 from before first start byte" ], 1216 [ $bad_start, 0, -1, 0, "bad start, back 1 from 0" ], 1217 [ $bad_start, length $bad_start, -10, 0, "bad start, back 10 from end" ], 1218 [ $bad_end, 0, 10, length $bad_end, "bad end, forward 10 from 0" ], 1219 [ $bad_end, length($bad_end)-1, 10, length $bad_end, "bad end, forward 1 from end-1" ], 1220 ); 1221 1222 for my $test (@hop_tests) { 1223 my ($str, $s_off, $hop, $want, $name) = @$test; 1224 my $result = test_utf8_hop_safe($str, $s_off, $hop); 1225 is($result, $want, "utf8_hop_safe: $name"); 1226 } 1227} 1228 1229{ 1230 my $replacement = chr(0xFFFD); 1231 use bytes; 1232 is(test_UTF8_IS_REPLACEMENT($replacement, length $replacement), 1, 1233 "UTF8_IS_REPLACEMENT returns 1 on a REPLACEMENT character"); 1234 is(test_UTF8_IS_REPLACEMENT($replacement, length $replacement) - 1, 0, 1235 "UTF8_IS_REPLACEMENT returns 0 on too short an input"); 1236} 1237 1238done_testing; 1239