# $Id: encoding.pm,v 1.48 2003/12/29 02:47:16 dankogai Exp dankogai $
package encoding;
our $VERSION = do { my @r = (q$Revision: 1.48 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };

use Encode;
use strict;

# Compile-time debug switch.  Being a constant sub, every statement of
# the form "DEBUG and ..." is folded away while this returns 0.
sub DEBUG () { 0 }

BEGIN {
    # ord("A") is 193 on EBCDIC platforms, which this pragma rejects.
    if (ord("A") == 193) {
        require Carp;
        Carp::croak("encoding pragma does not support EBCDIC platforms");
    }
}

# True when PerlIO::encoding 0.02 or later could be loaded; import() and
# unimport() consult it before touching the layers of STDIN/STDOUT.
our $HAS_PERLIO = 0;
eval { require PerlIO::encoding };
unless ($@) {
    $HAS_PERLIO = (PerlIO::encoding->VERSION >= 0.02);
}

# _exception($name)
#
# Returns 1 when ${^ENCODING} must NOT be set for the canonical encoding
# name $name, and 0 otherwise.  The answer is 1 only when all of these
# hold: $] is not greater than 5.008, $name is one of the UTF/UCS names
# listed below, and $Config{perl_patchlevel} is false.
sub _exception {
    my $name = shift;
    $] > 5.008 and return 0;    # 5.8.1 or higher then no
    my %utfs = map { $_ => 1 }
        qw(utf8 UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE
           UTF-32 UTF-32BE UTF-32LE);
    $utfs{$name} or return 0;    # UTFs or no
    require Config;
    Config->import();
    our %Config;
    return $Config{perl_patchlevel} ? 0 : 1;    # maintperl then no
}

# import($class, $name, %arg)
#
# Implements "use encoding NAME, OPTIONS".
#
#   $name  encoding name; when false, $ENV{PERL_ENCODING} is used instead.
#   %arg   Filter => true   install a source filter that decodes the source
#                           instead of setting ${^ENCODING};
#          STDIN / STDOUT   encoding for that handle; a key that exists
#                           with a false value leaves the handle untouched.
#
# Croaks with "Unknown encoding '...'" when the script encoding cannot be
# resolved, with "Unknown encoding for HANDLE, '...'" for a bad per-handle
# encoding, and with the eval error when binmode() dies.  Returns 1.
sub import {
    my $class = shift;
    my $name  = shift;
    my %arg   = @_;

    $name ||= $ENV{PERL_ENCODING};
    # No argument and no PERL_ENCODING: keep the name defined so that the
    # lookup is skipped and the error message is built without warnings.
    $name = '' unless defined $name;

    my $enc = length($name) ? find_encoding($name) : undef;
    unless (defined $enc) {
        require Carp;
        Carp::croak("Unknown encoding '$name'");
    }
    $name = $enc->name;    # canonize

    unless ($arg{Filter}) {
        DEBUG and warn "_exception($name) = ", _exception($name);
        _exception($name) or ${^ENCODING} = $enc;
        $HAS_PERLIO or return 1;
    }
    else {
        defined(${^ENCODING}) and undef ${^ENCODING};
        # implicitly 'use utf8'
        require utf8;    # to fetch $utf8::hint_bits;
        $^H |= $utf8::hint_bits;
        # Best effort: a failure to load or install the filter is
        # deliberately not fatal here.
        eval {
            require Filter::Util::Call;
            Filter::Util::Call->import;
            filter_add(sub {
                my $status = filter_read();
                if ($status > 0) {
                    $_ = $enc->decode($_, 1);
                    DEBUG and warn $_;
                }
                $status;
            });
        };
        DEBUG and warn "Filter installed";
    }

    # When ${^UNICODE} is defined and non-zero the handle layers are left
    # alone.
    defined ${^UNICODE} and ${^UNICODE} != 0 and return 1;

    for my $h (qw(STDIN STDOUT)) {
        my $layer_encoding;
        if ($arg{$h}) {
            unless (defined find_encoding($arg{$h})) {
                require Carp;
                Carp::croak("Unknown encoding for $h, '$arg{$h}'");
            }
            $layer_encoding = $arg{$h};
        }
        elsif (!exists $arg{$h}) {
            $layer_encoding = $name;
        }
        else {
            # HANDLE => undef (or another false value): transcoding was
            # switched off for this handle.  Skip it entirely, so that a
            # $@ left over from an earlier eval is never mistaken for a
            # binmode failure.
            next;
        }

        eval {
            no warnings 'uninitialized';
            binmode($h, ":raw :encoding($layer_encoding)");
        };
        if ($@) {
            require Carp;
            Carp::croak($@);
        }
    }
    return 1;    # I doubt if we need it, though
}

# unimport()
#
# Implements "no encoding": undefines ${^ENCODING}, resets STDIN and STDOUT
# (to ":raw" when PerlIO::encoding is usable, plain binmode otherwise) and,
# when Filter::Util::Call has been loaded, tries to remove the source
# filter.  All warnings are disabled and a failing filter_del() is ignored.
sub unimport {
    no warnings;
    undef ${^ENCODING};
    if ($HAS_PERLIO) {
        binmode(STDIN,  ":raw");
        binmode(STDOUT, ":raw");
    }
    else {
        binmode(STDIN);
        binmode(STDOUT);
    }
    if ($INC{"Filter/Util/Call.pm"}) {
        eval { filter_del() };
    }
}

1;
__END__

=pod

=head1 NAME

encoding - allows you to write your script in non-ascii or non-utf8

=head1 SYNOPSIS

  use encoding "greek";  # Perl like Greek to you?
  use encoding "euc-jp"; # Jperl!

  # or you can even do this if your shell supports your native encoding

  perl -Mencoding=latin2 -e '...' # Feeling centrally European?
  perl -Mencoding=euc-kr -e '...' # Or Korean?

  # more control

  # A simple euc-cn => utf-8 converter
  use encoding "euc-cn", STDOUT => "utf8";  while(<>){print};

  # "no encoding;" supported (but not scoped!)
  no encoding;

  # an alternate way, Filter
  use encoding "euc-jp", Filter=>1;
  # now you can use kanji identifiers -- in euc-jp!

=head1 ABSTRACT

Let's start with a bit of history: Perl 5.6.0 introduced Unicode
support.  You could apply C<substr()> and regexes even to complex CJK
characters -- so long as the script was written in UTF-8.  But back
then, text editors that supported UTF-8 were still rare and many users
instead chose to write scripts in legacy encodings, giving up a whole
new feature of Perl 5.6.

Rewind to the future: starting from perl 5.8.0 with the B<encoding>
pragma, you can write your script in any encoding you like (so long
as the C<Encode> module supports it) and still enjoy Unicode support.
This pragma achieves that by doing the following:

=over

=item *

Internally converts all literals (C<q//, qq//, qr//, qw//, qx//>) from
the encoding specified to utf8.  In Perl 5.8.1 and later, literals in
C<tr///> and the C<DATA> pseudo-filehandle are also converted.

=item *

Changes the PerlIO layers of C<STDIN> and C<STDOUT> to the encoding
specified.

=back

=head2 Literal Conversions

You can write code in EUC-JP as follows:

  my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
               #<-char-><-char->   # 4 octets
  s/\bCamel\b/$Rakuda/;

And with C<use encoding "euc-jp"> in effect, it is the same thing as
the code in UTF-8:

  my $Rakuda = "\x{99F1}\x{99DD}"; # two Unicode Characters
  s/\bCamel\b/$Rakuda/;

=head2 PerlIO layers for C<STD(IN|OUT)>

The B<encoding> pragma also modifies the filehandle layers of
STDIN and STDOUT to the specified encoding.  Therefore,

  use encoding "euc-jp";
  my $message = "Camel is the symbol of perl.\n";
  my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
  $message =~ s/\bCamel\b/$Rakuda/;
  print $message;

will print "\xF1\xD1\xF1\xCC is the symbol of perl.\n",
not "\x{99F1}\x{99DD} is the symbol of perl.\n".

You can override this by giving extra arguments; see below.

=head2 Implicit upgrading for byte strings

By default, if strings operating under byte semantics and strings
with Unicode character data are concatenated, the new string will
be created by decoding the byte strings as I<ISO 8859-1 (Latin-1)>.

The B<encoding> pragma changes this to use the specified encoding
instead.  For example:

    use encoding 'utf8';
    my $string = chr(20000); # a Unicode string
    utf8::encode($string);   # now it's a UTF-8 encoded byte string
    # concatenate with another Unicode string
    print length($string . chr(20000));

will print C<2>, because C<$string> is upgraded as UTF-8.  Without
C<use encoding 'utf8';>, it will print C<4> instead, since C<$string>
is three octets when interpreted as Latin-1.

=head1 FEATURES THAT REQUIRE 5.8.1

Some of the features offered by this pragma require perl 5.8.1.  Most
of these were done by Inaba Hiroto.  Any other features and changes
are good for 5.8.0.

=over

=item "NON-EUC" doublebyte encodings

Because perl needs to parse the script before applying this pragma, such
encodings as Shift_JIS and Big-5 that may contain '\' (BACKSLASH;
\x5c) in the second byte fail because the second byte may
accidentally escape the quoting character that follows.  Perl 5.8.1
or later fixes this problem.

=item tr//

C<tr//> was overlooked by Perl 5 porters when they released perl 5.8.0.
See the section below for details.

=item DATA pseudo-filehandle

Another feature that was overlooked was C<DATA>.

=back

=head1 USAGE

=over 4

=item use encoding [I<ENCNAME>] ;

Sets the script encoding to I<ENCNAME>.  And unless ${^UNICODE}
exists and is non-zero, PerlIO layers of STDIN and STDOUT are set to
":encoding(I<ENCNAME>)".

Note that STDERR WILL NOT be changed.

Also note that non-STD file handles remain unaffected.  Use C<use
open> or C<binmode> to change layers of those.

If no encoding is specified, the environment variable C<PERL_ENCODING>
is consulted.  If no encoding can be found, the error C<Unknown encoding
'I<ENCNAME>'> will be thrown.

=item use encoding I<ENCNAME> [ STDIN =E<gt> I<ENCNAME_IN> ...] ;

You can also individually set encodings of STDIN and STDOUT via the
C<< STDIN => I<ENCNAME> >> form.  In this case, you cannot omit the
first I<ENCNAME>.  C<< STDIN => undef >> turns the IO transcoding
completely off.

When ${^UNICODE} exists and is non-zero, these options will be completely
ignored.  ${^UNICODE} is a variable introduced in perl 5.8.1.  See
L<perlvar/"${^UNICODE}"> and L<perlrun/"-C"> for
details (perl 5.8.1 and later).

=item use encoding I<ENCNAME> Filter=E<gt>1;

This turns the encoding pragma into a source filter.  While the
default approach just decodes interpolated literals (in qq() and
qr()), this will apply a source filter to the entire source code.  See
L</"The Filter Option"> below for details.

=item no encoding;

Unsets the script encoding.  The layers of STDIN and STDOUT are
reset to ":raw" (the default unprocessed raw stream of bytes).

=back

=head1 The Filter Option

The magic of C<use encoding> is not applied to the names of
identifiers.  In order to make C<${"\x{4eba}"}++> ($human++, where human
is a single Han ideograph) work, you still need to write your script
in UTF-8 -- or use a source filter.  That's what 'Filter=>1' does.

What does this mean?  Your source code behaves as if it is written in
UTF-8 with 'use utf8' in effect.  So even if your editor only supports
Shift_JIS, for example, you can still try examples in Chapter 15 of
C<Programming Perl, 3rd Ed.>.  For instance, you can use UTF-8
identifiers.

This option is significantly slower and (as of this writing) non-ASCII
identifiers are not very stable WITHOUT this option and with the
source code written in UTF-8.

=head2 Filter-related changes at Encode version 1.87

=over

=item *

The Filter option now sets STDIN and STDOUT like non-filter options.
And C<< STDIN=>I<ENCODING> >> and C<< STDOUT=>I<ENCODING> >> work like
the non-filter version.

=item *

C<use utf8> is implicitly declared so you no longer have to C<use
utf8> to C<${"\x{4eba}"}++>.

=back

=head1 CAVEATS

=head2 NOT SCOPED

The pragma is per-script, not per-block lexical.  Only the last
C<use encoding> or C<no encoding> matters, and it affects
B<the whole script>.  However, the C<no encoding> pragma is supported and
B<use encoding> can appear as many times as you want in a given script.
The multiple use of this pragma is discouraged.

For the same reason, the use of this pragma inside modules is also
discouraged (though not as strongly discouraged as in the case above;
see below).

If you still have to write a module with this pragma, be very careful
of the load order.  See the code below:

  # called module
  package Module_IN_BAR;
  use encoding "bar";
  # stuff in "bar" encoding here
  1;

  # caller script
  use encoding "foo";
  use Module_IN_BAR;
  # surprise! use encoding "bar" is in effect.

The best way to avoid this oddity is to use this pragma RIGHT AFTER
other modules are loaded.  That is:

  use Module_IN_BAR;
  use encoding "foo";

=head2 DO NOT MIX MULTIPLE ENCODINGS

Notice that only literals (string or regular expression) having only
legacy code points are affected: if you mix data like this

  \xDF\x{100}

the data is assumed to be in (Latin 1 and) Unicode, not in your native
encoding.  In other words, this will match in "greek":

  "\xDF" =~ /\x{3af}/

but this will not

  "\xDF\x{100}" =~ /\x{3af}\x{100}/

since the C<\xDF> (ISO 8859-7 GREEK SMALL LETTER IOTA WITH TONOS) on
the left will B<not> be upgraded to C<\x{3af}> (Unicode GREEK SMALL
LETTER IOTA WITH TONOS) because of the C<\x{100}> on the left.  You
should not be mixing your legacy data and Unicode in the same string.

This pragma also affects encoding of the 0x80..0xFF code point range:
normally characters in that range are left as eight-bit bytes (unless
they are combined with characters with code points 0x100 or larger,
in which case all characters need to become UTF-8 encoded), but if
the C<encoding> pragma is present, even the 0x80..0xFF range always
gets UTF-8 encoded.

After all, the best thing about this pragma is that you don't have to
resort to \x{....} just to spell your name in a native encoding.
So feel free to put your strings in your encoding in quotes and
regexes.

=head2 tr/// with ranges

The B<encoding> pragma works by decoding string literals in
C<q//, qq//, qr//, qw//, qx//> and so forth.  In perl 5.8.0, this
does not apply to C<tr///>.  Therefore,

  use encoding 'euc-jp';
  #....
  $kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/;
  #           -------- -------- -------- --------

does not work as

  $kana =~ tr/\x{3041}-\x{3093}/\x{30a1}-\x{30f3}/;

=over

=item Legend of characters above

  utf8     euc-jp   charnames::viacode()
  -----------------------------------------
  \x{3041} \xA4\xA1 HIRAGANA LETTER SMALL A
  \x{3093} \xA4\xF3 HIRAGANA LETTER N
  \x{30a1} \xA5\xA1 KATAKANA LETTER SMALL A
  \x{30f3} \xA5\xF3 KATAKANA LETTER N

=back

This counterintuitive behavior has been fixed in perl 5.8.1.

=head3 workaround to tr///;

In perl 5.8.0, you can work around this as follows:

  use encoding 'euc-jp';
  #  ....
  eval qq{ \$kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/ };

Note the C<tr//> expression is surrounded by C<qq{}>.  The idea behind
it is the same as the classic idiom that makes C<tr///> 'interpolate'.

   tr/$from/$to/;            # wrong!
   eval qq{ tr/$from/$to/ }; # workaround.

Nevertheless, in the case of the B<encoding> pragma even C<q//> is
affected, so C<tr///> not being decoded was obviously against the will
of Perl5 Porters; it has been fixed in Perl 5.8.1 or later.

=head1 EXAMPLE - Greekperl

    use encoding "iso 8859-7";

    # \xDF in ISO 8859-7 (Greek) is \x{3af} in Unicode.

    $a = "\xDF";
    $b = "\x{100}";

    printf "%#x\n", ord($a); # will print 0x3af, not 0xdf

    $c = $a . $b;

    # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}".

    # chr() is affected, and ...

    print "mega\n"  if ord(chr(0xdf)) == 0x3af;

    # ... ord() is affected by the encoding pragma ...

    print "tera\n" if ord(pack("C", 0xdf)) == 0x3af;

    # ... as are eq and cmp ...

    print "peta\n" if "\x{3af}" eq  pack("C", 0xdf);
    print "exa\n"  if ("\x{3af}" cmp pack("C", 0xdf)) == 0;

    # ... but pack/unpack C are not affected, in case you still
    # want to go back to your native encoding

    print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;

=head1 KNOWN PROBLEMS

=over

=item literals in regex that are longer than 127 bytes

For native multibyte encodings (either fixed or variable length),
the current implementation of the regular expressions may introduce
recoding errors for regular expression literals longer than 127 bytes.

=item EBCDIC

The encoding pragma is not supported on EBCDIC platforms.
(Porters who are willing and able to remove this limitation are
welcome.)

=item format

This pragma doesn't work well with format because PerlIO does not
get along very well with it.  When format contains non-ascii
characters it prints funny or gets "wide character warnings".
To understand it, try the code below.

  # Save this one in utf8
  # replace *non-ascii* with a non-ascii string
  my $camel;
  format STDOUT =
  *non-ascii*@>>>>>>>
  $camel
  .
  $camel = "*non-ascii*";
  binmode(STDOUT=>':encoding(utf8)'); # bang!
  write;              # funny
  print $camel, "\n"; # fine

Without binmode, write() happens to work, but then print()
fails instead of write().

At any rate, the very use of format is questionable when it comes to
Unicode characters since you have to consider such things as character
width (e.g. double-width for ideographs) and directions (e.g. BIDI for
Arabic and Hebrew).

=back

=head1 HISTORY

This pragma first appeared in Perl 5.8.0.  For features that require
5.8.1 or later, see above.

=head1 SEE ALSO

L<perlunicode>, L<Encode>, L<open>, L<Filter::Util::Call>

Ch. 15 of C<Programming Perl (3rd Edition)>
by Larry Wall, Tom Christiansen, Jon Orwant;
O'Reilly & Associates; ISBN 0-596-00027-8

=cut