xref: /openbsd-src/gnu/usr.bin/perl/ext/XS-APItest/t/utf8_setup.pl (revision 5759b3d249badf144a6240f7eec4dcf9df003e6b)
1*5759b3d2Safresh1# Common subroutines and constants, called by .t files in this directory that
2*5759b3d2Safresh1# deal with UTF-8
3*5759b3d2Safresh1
4*5759b3d2Safresh1# The  test files can't use byte_utf8a_to_utf8n() from t/charset_tools.pl
5*5759b3d2Safresh1# because that uses the same functions we are testing here.  So UTF-EBCDIC
6*5759b3d2Safresh1# strings are hard-coded as I8 strings in this file instead, and we use the
7*5759b3d2Safresh1# translation functions to/from I8 from that file instead.
8*5759b3d2Safresh1
9*5759b3d2Safresh1sub isASCII { ord "A" == 65 }
10*5759b3d2Safresh1
11*5759b3d2Safresh1sub display_bytes_no_quotes {
12*5759b3d2Safresh1    use bytes;
13*5759b3d2Safresh1    my $string = shift;
14*5759b3d2Safresh1    return join("", map {
15*5759b3d2Safresh1                          ($_ =~ /[[:print:]]/)
16*5759b3d2Safresh1                          ? $_
17*5759b3d2Safresh1                          : sprintf("\\x%02x", ord $_)
18*5759b3d2Safresh1                        } split "", $string)
19*5759b3d2Safresh1}
20*5759b3d2Safresh1
21*5759b3d2Safresh1sub display_bytes {
22*5759b3d2Safresh1    return   '"' . display_bytes_no_quotes(shift) . '"';
23*5759b3d2Safresh1}
24*5759b3d2Safresh1
25*5759b3d2Safresh1sub output_warnings(@) {
26*5759b3d2Safresh1    my @list = @_;
27*5759b3d2Safresh1    if (@list) {
28*5759b3d2Safresh1        diag "The warnings were:\n" . join "\n", map { chomp; $_ } @list;
29*5759b3d2Safresh1    }
30*5759b3d2Safresh1    else {
31*5759b3d2Safresh1        diag "No warnings were raised";
32*5759b3d2Safresh1    }
33*5759b3d2Safresh1}
34*5759b3d2Safresh1
35*5759b3d2Safresh1sub start_byte_to_cont($) {
36*5759b3d2Safresh1
37*5759b3d2Safresh1    # Extract the code point information from the input UTF-8 start byte, and
38*5759b3d2Safresh1    # return a continuation byte containing the same information.  This is
39*5759b3d2Safresh1    # used in constructing an overlong malformation from valid input.
40*5759b3d2Safresh1
41*5759b3d2Safresh1    my $byte = shift;
42*5759b3d2Safresh1    my $len = test_UTF8_SKIP($byte);
43*5759b3d2Safresh1    if ($len < 2) {
44*5759b3d2Safresh1        die "start_byte_to_cont() is expecting a UTF-8 variant";
45*5759b3d2Safresh1    }
46*5759b3d2Safresh1
47*5759b3d2Safresh1    $byte = ord native_to_I8($byte);
48*5759b3d2Safresh1
49*5759b3d2Safresh1    # Copied from utf8.h.  This gets rid of the leading 1 bits.
50*5759b3d2Safresh1    $byte &= ((($len) >= 7) ? 0x00 : (0x1F >> (($len)-2)));
51*5759b3d2Safresh1
52*5759b3d2Safresh1    $byte |= (isASCII) ? 0x80 : 0xA0;
53*5759b3d2Safresh1    return I8_to_native(chr $byte);
54*5759b3d2Safresh1}
55*5759b3d2Safresh1
56*5759b3d2Safresh1$::is64bit = length sprintf("%x", ~0) > 8;
57*5759b3d2Safresh1
58*5759b3d2Safresh1$::lowest_continuation = (isASCII) ? 0x80 : 0xA0;
59*5759b3d2Safresh1
60*5759b3d2Safresh1$::I8c = (isASCII) ? "\x80" : "\xa0";    # A continuation byte
61*5759b3d2Safresh1
62*5759b3d2Safresh1
63*5759b3d2Safresh1$::max_bytes = (isASCII) ? 13 : 14; # Max number of bytes in a UTF-8 sequence
64*5759b3d2Safresh1                                    # representing a single code point
65*5759b3d2Safresh1
66*5759b3d2Safresh1# Copied from utf8.h
67*5759b3d2Safresh1$::UTF8_ALLOW_EMPTY            = 0x0001;
68*5759b3d2Safresh1$::UTF8_GOT_EMPTY              = $UTF8_ALLOW_EMPTY;
69*5759b3d2Safresh1$::UTF8_ALLOW_CONTINUATION     = 0x0002;
70*5759b3d2Safresh1$::UTF8_GOT_CONTINUATION       = $UTF8_ALLOW_CONTINUATION;
71*5759b3d2Safresh1$::UTF8_ALLOW_NON_CONTINUATION = 0x0004;
72*5759b3d2Safresh1$::UTF8_GOT_NON_CONTINUATION   = $UTF8_ALLOW_NON_CONTINUATION;
73*5759b3d2Safresh1$::UTF8_ALLOW_SHORT            = 0x0008;
74*5759b3d2Safresh1$::UTF8_GOT_SHORT              = $UTF8_ALLOW_SHORT;
75*5759b3d2Safresh1$::UTF8_ALLOW_LONG             = 0x0010;
76*5759b3d2Safresh1$::UTF8_ALLOW_LONG_AND_ITS_VALUE = $UTF8_ALLOW_LONG|0x0020;
77*5759b3d2Safresh1$::UTF8_GOT_LONG               = $UTF8_ALLOW_LONG;
78*5759b3d2Safresh1$::UTF8_ALLOW_OVERFLOW         = 0x0080;
79*5759b3d2Safresh1$::UTF8_GOT_OVERFLOW           = $UTF8_ALLOW_OVERFLOW;
80*5759b3d2Safresh1$::UTF8_DISALLOW_SURROGATE     = 0x0100;
81*5759b3d2Safresh1$::UTF8_GOT_SURROGATE          = $UTF8_DISALLOW_SURROGATE;
82*5759b3d2Safresh1$::UTF8_WARN_SURROGATE         = 0x0200;
83*5759b3d2Safresh1$::UTF8_DISALLOW_NONCHAR       = 0x0400;
84*5759b3d2Safresh1$::UTF8_GOT_NONCHAR            = $UTF8_DISALLOW_NONCHAR;
85*5759b3d2Safresh1$::UTF8_WARN_NONCHAR           = 0x0800;
86*5759b3d2Safresh1$::UTF8_DISALLOW_SUPER         = 0x1000;
87*5759b3d2Safresh1$::UTF8_GOT_SUPER              = $UTF8_DISALLOW_SUPER;
88*5759b3d2Safresh1$::UTF8_WARN_SUPER             = 0x2000;
89*5759b3d2Safresh1$::UTF8_DISALLOW_PERL_EXTENDED  = 0x4000;
90*5759b3d2Safresh1$::UTF8_GOT_PERL_EXTENDED       = $UTF8_DISALLOW_PERL_EXTENDED;
91*5759b3d2Safresh1$::UTF8_WARN_PERL_EXTENDED      = 0x8000;
92*5759b3d2Safresh1$::UTF8_CHECK_ONLY             = 0x10000;
93*5759b3d2Safresh1$::UTF8_NO_CONFIDENCE_IN_CURLEN_ = 0x20000;
94*5759b3d2Safresh1
95*5759b3d2Safresh1$::UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE
96*5759b3d2Safresh1                             = $UTF8_DISALLOW_SUPER|$UTF8_DISALLOW_SURROGATE;
97*5759b3d2Safresh1$::UTF8_DISALLOW_ILLEGAL_INTERCHANGE
98*5759b3d2Safresh1              = $UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE|$UTF8_DISALLOW_NONCHAR;
99*5759b3d2Safresh1$::UTF8_WARN_ILLEGAL_C9_INTERCHANGE
100*5759b3d2Safresh1                             = $UTF8_WARN_SUPER|$UTF8_WARN_SURROGATE;
101*5759b3d2Safresh1$::UTF8_WARN_ILLEGAL_INTERCHANGE
102*5759b3d2Safresh1              = $UTF8_WARN_ILLEGAL_C9_INTERCHANGE|$UTF8_WARN_NONCHAR;
103*5759b3d2Safresh1
104*5759b3d2Safresh1# Test uvchr_to_utf8().
105*5759b3d2Safresh1$::UNICODE_WARN_SURROGATE        = 0x0001;
106*5759b3d2Safresh1$::UNICODE_WARN_NONCHAR          = 0x0002;
107*5759b3d2Safresh1$::UNICODE_WARN_SUPER            = 0x0004;
108*5759b3d2Safresh1$::UNICODE_WARN_PERL_EXTENDED     = 0x0008;
109*5759b3d2Safresh1$::UNICODE_DISALLOW_SURROGATE    = 0x0010;
110*5759b3d2Safresh1$::UNICODE_DISALLOW_NONCHAR      = 0x0020;
111*5759b3d2Safresh1$::UNICODE_DISALLOW_SUPER        = 0x0040;
112*5759b3d2Safresh1$::UNICODE_DISALLOW_PERL_EXTENDED = 0x0080;
113