1b39c5158Smillert# 2*3d61058aSafresh1# $Id: Unicode.t,v 2.5 2023/11/10 01:10:50 dankogai Exp $ 3b39c5158Smillert# 4b39c5158Smillert# This script is written entirely in ASCII, even though quoted literals 5b39c5158Smillert# do include non-BMP unicode characters -- Are you happy, jhi? 6b39c5158Smillert# 7b39c5158Smillert 8b39c5158SmillertBEGIN { 9*3d61058aSafresh1 require Config; Config->import(); 10b39c5158Smillert if ($Config{'extensions'} !~ /\bEncode\b/) { 11b39c5158Smillert print "1..0 # Skip: Encode was not built\n"; 12b39c5158Smillert exit 0; 13b39c5158Smillert } 14b39c5158Smillert if (ord("A") == 193) { 15b39c5158Smillert print "1..0 # Skip: EBCDIC\n"; 16b39c5158Smillert exit 0; 17b39c5158Smillert } 18b39c5158Smillert $| = 1; 19b39c5158Smillert} 20b39c5158Smillert 21b39c5158Smillertuse strict; 22b39c5158Smillert#use Test::More 'no_plan'; 23e9ce3842Safresh1use Test::More tests => 56; 24b39c5158Smillertuse Encode qw(encode decode find_encoding); 25b39c5158Smillert 26b39c5158Smillert# 27b39c5158Smillert# see 28eac174f2Safresh1# http://www.unicode.org/reports/tr19/ 29b39c5158Smillert# 30b39c5158Smillert 31b39c5158Smillertmy $dankogai = "\x{5c0f}\x{98fc}\x{3000}\x{5f3e}"; 32b39c5158Smillertmy $nasty = "$dankogai\x{1abcd}"; 33e9ce3842Safresh1my $fallback = "$dankogai\x{fffd}\x{fffd}"; 34b39c5158Smillert 35b39c5158Smillert#hi: (0x1abcd - 0x10000) / 0x400 + 0xD800 = 0xd82a 36b39c5158Smillert#lo: (0x1abcd - 0x10000) % 0x400 + 0xDC00 = 0xdfcd 37b39c5158Smillert 38b39c5158Smillertmy $n_16be = 39b39c5158Smillert pack("C*", map {hex($_)} qw<5c 0f 98 fc 30 00 5f 3e d8 2a df cd>); 40b39c5158Smillertmy $n_16le = 41b39c5158Smillert pack("C*", map {hex($_)} qw<0f 5c fc 98 00 30 3e 5f 2a d8 cd df>); 42b39c5158Smillertmy $f_16be = 43b39c5158Smillert pack("C*", map {hex($_)} qw<5c 0f 98 fc 30 00 5f 3e ff fd>); 44b39c5158Smillertmy $f_16le = 45b39c5158Smillert pack("C*", map {hex($_)} qw<0f 5c fc 98 00 30 3e 5f fd ff>); 46b39c5158Smillertmy $n_32be = 47b39c5158Smillert pack("C*", map {hex($_)} 48b39c5158Smillert qw<00 00 5c 0f 00 00 98 fc 00 00 30 00 00 00 5f 3e 00 01 ab cd>); 49b39c5158Smillertmy $n_32le = 50b39c5158Smillert pack("C*", map {hex($_)} 51b39c5158Smillert qw<0f 5c 00 00 fc 98 00 00 00 30 00 00 3e 5f 00 00 cd ab 01 00>); 52b39c5158Smillert 53b39c5158Smillertmy $n_16bb = pack('n', 0xFeFF) . $n_16be; 54b39c5158Smillertmy $n_16lb = pack('v', 0xFeFF) . $n_16le; 55b39c5158Smillertmy $n_32bb = pack('N', 0xFeFF) . $n_32be; 56b39c5158Smillertmy $n_32lb = pack('V', 0xFeFF) . $n_32le; 57b39c5158Smillert 58b39c5158Smillertis($n_16be, encode('UTF-16BE', $nasty), qq{encode UTF-16BE}); 59b39c5158Smillertis($n_16le, encode('UTF-16LE', $nasty), qq{encode UTF-16LE}); 60b39c5158Smillertis($n_32be, encode('UTF-32BE', $nasty), qq{encode UTF-32BE}); 61b39c5158Smillertis($n_32le, encode('UTF-32LE', $nasty), qq{encode UTF-16LE}); 62b39c5158Smillert 63b39c5158Smillertis($nasty, decode('UTF-16BE', $n_16be), qq{decode UTF-16BE}); 64b39c5158Smillertis($nasty, decode('UTF-16LE', $n_16le), qq{decode UTF-16LE}); 65b39c5158Smillertis($nasty, decode('UTF-32BE', $n_32be), qq{decode UTF-32BE}); 66b39c5158Smillertis($nasty, decode('UTF-32LE', $n_32le), qq{decode UTF-32LE}); 67b39c5158Smillert 68b39c5158Smillertis($n_16bb, encode('UTF-16', $nasty), qq{encode UTF-16}); 69b39c5158Smillertis($n_32bb, encode('UTF-32', $nasty), qq{encode UTF-32}); 70b39c5158Smillertis($nasty, decode('UTF-16', $n_16bb), qq{decode UTF-16, bom=be}); 71b39c5158Smillertis($nasty, decode('UTF-16', $n_16lb), qq{decode UTF-16, bom=le}); 72b39c5158Smillertis($nasty, decode('UTF-32', $n_32bb), qq{decode UTF-32, bom=be}); 73b39c5158Smillertis($nasty, decode('UTF-32', $n_32lb), qq{decode UTF-32, bom=le}); 74b39c5158Smillert 75b39c5158Smillertis(decode('UCS-2BE', $n_16be), $fallback, "decode UCS-2BE: fallback"); 76b39c5158Smillertis(decode('UCS-2LE', $n_16le), $fallback, "decode UCS-2LE: fallback"); 77b39c5158Smillerteval { decode('UCS-2BE', $n_16be, 1) }; 78b39c5158Smillertis (index($@,'UCS-2BE:'), 0, "decode UCS-2BE: exception"); 79b39c5158Smillerteval { decode('UCS-2LE', $n_16le, 1) }; 80b39c5158Smillertis (index($@,'UCS-2LE:'), 0, "decode UCS-2LE: exception"); 81b39c5158Smillertis(encode('UCS-2BE', $nasty), $f_16be, "encode UCS-2BE: fallback"); 82b39c5158Smillertis(encode('UCS-2LE', $nasty), $f_16le, "encode UCS-2LE: fallback"); 83b39c5158Smillerteval { encode('UCS-2BE', $nasty, 1) }; 84b39c5158Smillertis(index($@, 'UCS-2BE'), 0, "encode UCS-2BE: exception"); 85b39c5158Smillerteval { encode('UCS-2LE', $nasty, 1) }; 86b39c5158Smillertis(index($@, 'UCS-2LE'), 0, "encode UCS-2LE: exception"); 87b39c5158Smillert 88e9ce3842Safresh1{ 89e9ce3842Safresh1 my %tests = ( 90e9ce3842Safresh1 'UCS-2BE' => 'n*', 91e9ce3842Safresh1 'UCS-2LE' => 'v*', 92e9ce3842Safresh1 'UTF-16BE' => 'n*', 93e9ce3842Safresh1 'UTF-16LE' => 'v*', 94e9ce3842Safresh1 'UTF-32BE' => 'N*', 95e9ce3842Safresh1 'UTF-32LE' => 'V*', 96e9ce3842Safresh1 ); 97e9ce3842Safresh1 98e9ce3842Safresh1 while (my ($enc, $pack) = each(%tests)) { 99e9ce3842Safresh1 is(decode($enc, pack($pack, 0xD800, 0x263A)), "\x{FFFD}\x{263A}", 100e9ce3842Safresh1 "decode $enc (HI surrogate followed by WHITE SMILING FACE)"); 101e9ce3842Safresh1 is(decode($enc, pack($pack, 0xDC00, 0x263A)), "\x{FFFD}\x{263A}", 102e9ce3842Safresh1 "decode $enc (LO surrogate followed by WHITE SMILING FACE)"); 103e9ce3842Safresh1 } 104e9ce3842Safresh1} 105e9ce3842Safresh1 106e9ce3842Safresh1{ 107e9ce3842Safresh1 my %tests = ( 108e9ce3842Safresh1 'UTF-16BE' => 'n*', 109e9ce3842Safresh1 'UTF-16LE' => 'v*', 110e9ce3842Safresh1 ); 111e9ce3842Safresh1 112e9ce3842Safresh1 while (my ($enc, $pack) = each(%tests)) { 113e9ce3842Safresh1 is(decode($enc, pack($pack, 0xD800)), "\x{FFFD}", 114e9ce3842Safresh1 "decode $enc (HI surrogate)"); 115e9ce3842Safresh1 is(decode($enc, pack($pack, 0x263A, 0xD800)), "\x{263A}\x{FFFD}", 116e9ce3842Safresh1 "decode $enc (WHITE SMILING FACE followed by HI surrogate)"); 117e9ce3842Safresh1 } 118e9ce3842Safresh1} 119e9ce3842Safresh1 120e9ce3842Safresh1{ 121e9ce3842Safresh1 my %tests = ( 122e9ce3842Safresh1 'UTF-16BE' => 'n*', 123e9ce3842Safresh1 'UTF-16LE' => 'v*', 124e9ce3842Safresh1 ); 125e9ce3842Safresh1 126e9ce3842Safresh1 while (my ($enc, $pack) = each(%tests)) { 127e9ce3842Safresh1 is(encode($enc, "\x{110000}"), pack($pack, 0xFFFD), 128e9ce3842Safresh1 "ordinals greater than U+10FFFF is replaced with U+FFFD"); 129e9ce3842Safresh1 } 130e9ce3842Safresh1} 131e9ce3842Safresh1 132b39c5158Smillert# 133b39c5158Smillert# SvGROW test for (en|de)code_xs 134b39c5158Smillert# 135b39c5158SmillertSKIP: { 136b39c5158Smillert my $utf8 = ''; 137b39c5158Smillert for my $j (0,0x10){ 138b39c5158Smillert for my $i (0..0xffff){ 139b39c5158Smillert $j == 0 and (0xD800 <= $i && $i <= 0xDFFF) and next; 140b39c5158Smillert $utf8 .= ord($j+$i); 141b39c5158Smillert } 142b39c5158Smillert for my $major ('UTF-16', 'UTF-32'){ 143b39c5158Smillert for my $minor ('BE', 'LE'){ 144b39c5158Smillert my $enc = $major.$minor; 145b39c5158Smillert is(decode($enc, encode($enc, $utf8)), $utf8, "$enc RT"); 146b39c5158Smillert } 147b39c5158Smillert } 148b39c5158Smillert } 149b39c5158Smillert}; 150b39c5158Smillert 151b39c5158Smillert# 152b39c5158Smillert# CJKT vs. UTF-7 153b39c5158Smillert# 154b39c5158Smillert 155b39c5158Smillertuse File::Spec; 156b39c5158Smillertuse File::Basename; 157b39c5158Smillert 158b39c5158Smillertmy $dir = dirname(__FILE__); 159b39c5158Smillertopendir my $dh, $dir or die "$dir:$!"; 160b39c5158Smillertmy @file = sort grep {/\.utf$/o} readdir $dh; 161b39c5158Smillertclosedir $dh; 162b39c5158Smillertfor my $file (@file){ 163b39c5158Smillert my $path = File::Spec->catfile($dir, $file); 164b39c5158Smillert open my $fh, '<', $path or die "$path:$!"; 165b39c5158Smillert my $content; 166b39c5158Smillert if (PerlIO::Layer->find('perlio')){ 167b39c5158Smillert binmode $fh => ':utf8'; 168b39c5158Smillert $content = join('' => <$fh>); 169b39c5158Smillert }else{ # ugh! 170b39c5158Smillert binmode $fh; 171b39c5158Smillert $content = join('' => <$fh>); 172b39c5158Smillert Encode::_utf8_on($content) 173b39c5158Smillert } 174b39c5158Smillert close $fh; 175b39c5158Smillert is(decode("UTF-7", encode("UTF-7", $content)), $content, 176b39c5158Smillert "UTF-7 RT:$file"); 177b39c5158Smillert} 178b39c5158Smillert 179b39c5158Smillert# Magic 180b39c5158Smillert{ 181b39c5158Smillert # see http://rt.perl.org/rt3//Ticket/Display.html?id=60472 182b39c5158Smillert my $work = chr(0x100); 183b39c5158Smillert my $encoding = find_encoding("UTF16-BE"); 184b39c5158Smillert my $tied; 185b39c5158Smillert tie $tied, SomeScalar => \$work; 186b39c5158Smillert my $result = $encoding->encode($tied, 1); 187b39c5158Smillert is($work, "", "check set magic was applied"); 188b39c5158Smillert} 189b39c5158Smillert 190b39c5158Smillertpackage SomeScalar; 191b39c5158Smillertuse Tie::Scalar; 192b39c5158Smillertuse vars qw(@ISA); 193b39c5158SmillertBEGIN { @ISA = 'Tie::Scalar' } 194b39c5158Smillert 195b39c5158Smillertsub TIESCALAR { 196b39c5158Smillert my ($class, $ref) = @_; 197b39c5158Smillert return bless $ref, $class; 198b39c5158Smillert} 199b39c5158Smillert 200b39c5158Smillertsub FETCH { 201b39c5158Smillert ${$_[0]} 202b39c5158Smillert} 203b39c5158Smillert 204b39c5158Smillertsub STORE { 205b39c5158Smillert ${$_[0]} = $_[1]; 206b39c5158Smillert} 207b39c5158Smillert 208b39c5158Smillert1; 209b39c5158Smillert__END__ 210