1*5759b3d2Safresh1# Common subroutines and constants, called by .t files in this directory that 2*5759b3d2Safresh1# deal with UTF-8 3*5759b3d2Safresh1 4*5759b3d2Safresh1# The test files can't use byte_utf8a_to_utf8n() from t/charset_tools.pl 5*5759b3d2Safresh1# because that uses the same functions we are testing here. So UTF-EBCDIC 6*5759b3d2Safresh1# strings are hard-coded as I8 strings in this file instead, and we use the 7*5759b3d2Safresh1# translation functions to/from I8 from that file instead. 8*5759b3d2Safresh1 9*5759b3d2Safresh1sub isASCII { ord "A" == 65 } 10*5759b3d2Safresh1 11*5759b3d2Safresh1sub display_bytes_no_quotes { 12*5759b3d2Safresh1 use bytes; 13*5759b3d2Safresh1 my $string = shift; 14*5759b3d2Safresh1 return join("", map { 15*5759b3d2Safresh1 ($_ =~ /[[:print:]]/) 16*5759b3d2Safresh1 ? $_ 17*5759b3d2Safresh1 : sprintf("\\x%02x", ord $_) 18*5759b3d2Safresh1 } split "", $string) 19*5759b3d2Safresh1} 20*5759b3d2Safresh1 21*5759b3d2Safresh1sub display_bytes { 22*5759b3d2Safresh1 return '"' . display_bytes_no_quotes(shift) . '"'; 23*5759b3d2Safresh1} 24*5759b3d2Safresh1 25*5759b3d2Safresh1sub output_warnings(@) { 26*5759b3d2Safresh1 my @list = @_; 27*5759b3d2Safresh1 if (@list) { 28*5759b3d2Safresh1 diag "The warnings were:\n" . join "\n", map { chomp; $_ } @list; 29*5759b3d2Safresh1 } 30*5759b3d2Safresh1 else { 31*5759b3d2Safresh1 diag "No warnings were raised"; 32*5759b3d2Safresh1 } 33*5759b3d2Safresh1} 34*5759b3d2Safresh1 35*5759b3d2Safresh1sub start_byte_to_cont($) { 36*5759b3d2Safresh1 37*5759b3d2Safresh1 # Extract the code point information from the input UTF-8 start byte, and 38*5759b3d2Safresh1 # return a continuation byte containing the same information. This is 39*5759b3d2Safresh1 # used in constructing an overlong malformation from valid input. 40*5759b3d2Safresh1 41*5759b3d2Safresh1 my $byte = shift; 42*5759b3d2Safresh1 my $len = test_UTF8_SKIP($byte); 43*5759b3d2Safresh1 if ($len < 2) { 44*5759b3d2Safresh1 die "start_byte_to_cont() is expecting a UTF-8 variant"; 45*5759b3d2Safresh1 } 46*5759b3d2Safresh1 47*5759b3d2Safresh1 $byte = ord native_to_I8($byte); 48*5759b3d2Safresh1 49*5759b3d2Safresh1 # Copied from utf8.h. This gets rid of the leading 1 bits. 50*5759b3d2Safresh1 $byte &= ((($len) >= 7) ? 0x00 : (0x1F >> (($len)-2))); 51*5759b3d2Safresh1 52*5759b3d2Safresh1 $byte |= (isASCII) ? 0x80 : 0xA0; 53*5759b3d2Safresh1 return I8_to_native(chr $byte); 54*5759b3d2Safresh1} 55*5759b3d2Safresh1 56*5759b3d2Safresh1$::is64bit = length sprintf("%x", ~0) > 8; 57*5759b3d2Safresh1 58*5759b3d2Safresh1$::lowest_continuation = (isASCII) ? 0x80 : 0xA0; 59*5759b3d2Safresh1 60*5759b3d2Safresh1$::I8c = (isASCII) ? "\x80" : "\xa0"; # A continuation byte 61*5759b3d2Safresh1 62*5759b3d2Safresh1 63*5759b3d2Safresh1$::max_bytes = (isASCII) ? 13 : 14; # Max number of bytes in a UTF-8 sequence 64*5759b3d2Safresh1 # representing a single code point 65*5759b3d2Safresh1 66*5759b3d2Safresh1# Copied from utf8.h 67*5759b3d2Safresh1$::UTF8_ALLOW_EMPTY = 0x0001; 68*5759b3d2Safresh1$::UTF8_GOT_EMPTY = $UTF8_ALLOW_EMPTY; 69*5759b3d2Safresh1$::UTF8_ALLOW_CONTINUATION = 0x0002; 70*5759b3d2Safresh1$::UTF8_GOT_CONTINUATION = $UTF8_ALLOW_CONTINUATION; 71*5759b3d2Safresh1$::UTF8_ALLOW_NON_CONTINUATION = 0x0004; 72*5759b3d2Safresh1$::UTF8_GOT_NON_CONTINUATION = $UTF8_ALLOW_NON_CONTINUATION; 73*5759b3d2Safresh1$::UTF8_ALLOW_SHORT = 0x0008; 74*5759b3d2Safresh1$::UTF8_GOT_SHORT = $UTF8_ALLOW_SHORT; 75*5759b3d2Safresh1$::UTF8_ALLOW_LONG = 0x0010; 76*5759b3d2Safresh1$::UTF8_ALLOW_LONG_AND_ITS_VALUE = $UTF8_ALLOW_LONG|0x0020; 77*5759b3d2Safresh1$::UTF8_GOT_LONG = $UTF8_ALLOW_LONG; 78*5759b3d2Safresh1$::UTF8_ALLOW_OVERFLOW = 0x0080; 79*5759b3d2Safresh1$::UTF8_GOT_OVERFLOW = $UTF8_ALLOW_OVERFLOW; 80*5759b3d2Safresh1$::UTF8_DISALLOW_SURROGATE = 0x0100; 81*5759b3d2Safresh1$::UTF8_GOT_SURROGATE = $UTF8_DISALLOW_SURROGATE; 82*5759b3d2Safresh1$::UTF8_WARN_SURROGATE = 0x0200; 83*5759b3d2Safresh1$::UTF8_DISALLOW_NONCHAR = 0x0400; 84*5759b3d2Safresh1$::UTF8_GOT_NONCHAR = $UTF8_DISALLOW_NONCHAR; 85*5759b3d2Safresh1$::UTF8_WARN_NONCHAR = 0x0800; 86*5759b3d2Safresh1$::UTF8_DISALLOW_SUPER = 0x1000; 87*5759b3d2Safresh1$::UTF8_GOT_SUPER = $UTF8_DISALLOW_SUPER; 88*5759b3d2Safresh1$::UTF8_WARN_SUPER = 0x2000; 89*5759b3d2Safresh1$::UTF8_DISALLOW_PERL_EXTENDED = 0x4000; 90*5759b3d2Safresh1$::UTF8_GOT_PERL_EXTENDED = $UTF8_DISALLOW_PERL_EXTENDED; 91*5759b3d2Safresh1$::UTF8_WARN_PERL_EXTENDED = 0x8000; 92*5759b3d2Safresh1$::UTF8_CHECK_ONLY = 0x10000; 93*5759b3d2Safresh1$::UTF8_NO_CONFIDENCE_IN_CURLEN_ = 0x20000; 94*5759b3d2Safresh1 95*5759b3d2Safresh1$::UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE 96*5759b3d2Safresh1 = $UTF8_DISALLOW_SUPER|$UTF8_DISALLOW_SURROGATE; 97*5759b3d2Safresh1$::UTF8_DISALLOW_ILLEGAL_INTERCHANGE 98*5759b3d2Safresh1 = $UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE|$UTF8_DISALLOW_NONCHAR; 99*5759b3d2Safresh1$::UTF8_WARN_ILLEGAL_C9_INTERCHANGE 100*5759b3d2Safresh1 = $UTF8_WARN_SUPER|$UTF8_WARN_SURROGATE; 101*5759b3d2Safresh1$::UTF8_WARN_ILLEGAL_INTERCHANGE 102*5759b3d2Safresh1 = $UTF8_WARN_ILLEGAL_C9_INTERCHANGE|$UTF8_WARN_NONCHAR; 103*5759b3d2Safresh1 104*5759b3d2Safresh1# Test uvchr_to_utf8(). 105*5759b3d2Safresh1$::UNICODE_WARN_SURROGATE = 0x0001; 106*5759b3d2Safresh1$::UNICODE_WARN_NONCHAR = 0x0002; 107*5759b3d2Safresh1$::UNICODE_WARN_SUPER = 0x0004; 108*5759b3d2Safresh1$::UNICODE_WARN_PERL_EXTENDED = 0x0008; 109*5759b3d2Safresh1$::UNICODE_DISALLOW_SURROGATE = 0x0010; 110*5759b3d2Safresh1$::UNICODE_DISALLOW_NONCHAR = 0x0020; 111*5759b3d2Safresh1$::UNICODE_DISALLOW_SUPER = 0x0040; 112*5759b3d2Safresh1$::UNICODE_DISALLOW_PERL_EXTENDED = 0x0080; 113