1# $Id: encoding.pm,v 2.11 2013/02/18 02:23:56 dankogai Exp $ 2package encoding; 3our $VERSION = '2.6_01'; 4 5use Encode; 6use strict; 7use warnings; 8 9use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG}; 10 11BEGIN { 12 if ( ord("A") == 193 ) { 13 require Carp; 14 Carp::croak("encoding: pragma does not support EBCDIC platforms"); 15 } 16} 17 18our $HAS_PERLIO = 0; 19eval { require PerlIO::encoding }; 20unless ($@) { 21 $HAS_PERLIO = ( PerlIO::encoding->VERSION >= 0.02 ); 22} 23 24sub _exception { 25 my $name = shift; 26 $] > 5.008 and return 0; # 5.8.1 or higher then no 27 my %utfs = map { $_ => 1 } 28 qw(utf8 UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE 29 UTF-32 UTF-32BE UTF-32LE); 30 $utfs{$name} or return 0; # UTFs or no 31 require Config; 32 Config->import(); 33 our %Config; 34 return $Config{perl_patchlevel} ? 0 : 1 # maintperl then no 35} 36 37sub in_locale { $^H & ( $locale::hint_bits || 0 ) } 38 39sub _get_locale_encoding { 40 my $locale_encoding; 41 42 # I18N::Langinfo isn't available everywhere 43 eval { 44 require I18N::Langinfo; 45 I18N::Langinfo->import(qw(langinfo CODESET)); 46 $locale_encoding = langinfo( CODESET() ); 47 }; 48 49 my $country_language; 50 51 no warnings 'uninitialized'; 52 53 if ( (not $locale_encoding) && in_locale() ) { 54 if ( $ENV{LC_ALL} =~ /^([^.]+)\.([^.@]+)(@.*)?$/ ) { 55 ( $country_language, $locale_encoding ) = ( $1, $2 ); 56 } 57 elsif ( $ENV{LANG} =~ /^([^.]+)\.([^.@]+)(@.*)?$/ ) { 58 ( $country_language, $locale_encoding ) = ( $1, $2 ); 59 } 60 61 # LANGUAGE affects only LC_MESSAGES only on glibc 62 } 63 elsif ( not $locale_encoding ) { 64 if ( $ENV{LC_ALL} =~ /\butf-?8\b/i 65 || $ENV{LANG} =~ /\butf-?8\b/i ) 66 { 67 $locale_encoding = 'utf8'; 68 } 69 70 # Could do more heuristics based on the country and language 71 # parts of LC_ALL and LANG (the parts before the dot (if any)), 72 # since we have Locale::Country and Locale::Language available. 73 # TODO: get a database of Language -> Encoding mappings 74 # (the Estonian database at http://www.eki.ee/letter/ 75 # would be excellent!) --jhi 76 } 77 if ( defined $locale_encoding 78 && lc($locale_encoding) eq 'euc' 79 && defined $country_language ) 80 { 81 if ( $country_language =~ /^ja_JP|japan(?:ese)?$/i ) { 82 $locale_encoding = 'euc-jp'; 83 } 84 elsif ( $country_language =~ /^ko_KR|korean?$/i ) { 85 $locale_encoding = 'euc-kr'; 86 } 87 elsif ( $country_language =~ /^zh_CN|chin(?:a|ese)$/i ) { 88 $locale_encoding = 'euc-cn'; 89 } 90 elsif ( $country_language =~ /^zh_TW|taiwan(?:ese)?$/i ) { 91 $locale_encoding = 'euc-tw'; 92 } 93 else { 94 require Carp; 95 Carp::croak( 96 "encoding: Locale encoding '$locale_encoding' too ambiguous" 97 ); 98 } 99 } 100 101 return $locale_encoding; 102} 103 104sub import { 105 if ($] >= 5.017) { 106 warnings::warnif("deprecated", 107 "Use of the encoding pragma is deprecated") 108 } 109 my $class = shift; 110 my $name = shift; 111 if (!$name){ 112 require Carp; 113 Carp::croak("encoding: no encoding specified."); 114 } 115 if ( $name eq ':_get_locale_encoding' ) { # used by lib/open.pm 116 my $caller = caller(); 117 { 118 no strict 'refs'; 119 *{"${caller}::_get_locale_encoding"} = \&_get_locale_encoding; 120 } 121 return; 122 } 123 $name = _get_locale_encoding() if $name eq ':locale'; 124 my %arg = @_; 125 $name = $ENV{PERL_ENCODING} unless defined $name; 126 my $enc = find_encoding($name); 127 unless ( defined $enc ) { 128 require Carp; 129 Carp::croak("encoding: Unknown encoding '$name'"); 130 } 131 $name = $enc->name; # canonize 132 unless ( $arg{Filter} ) { 133 DEBUG and warn "_exception($name) = ", _exception($name); 134 _exception($name) or ${^ENCODING} = $enc; 135 $HAS_PERLIO or return 1; 136 } 137 else { 138 defined( ${^ENCODING} ) and undef ${^ENCODING}; 139 140 # implicitly 'use utf8' 141 require utf8; # to fetch $utf8::hint_bits; 142 $^H |= $utf8::hint_bits; 143 eval { 144 require Filter::Util::Call; 145 Filter::Util::Call->import; 146 filter_add( 147 sub { 148 my $status = filter_read(); 149 if ( $status > 0 ) { 150 $_ = $enc->decode( $_, 1 ); 151 DEBUG and warn $_; 152 } 153 $status; 154 } 155 ); 156 }; 157 $@ eq '' and DEBUG and warn "Filter installed"; 158 } 159 defined ${^UNICODE} and ${^UNICODE} != 0 and return 1; 160 for my $h (qw(STDIN STDOUT)) { 161 if ( $arg{$h} ) { 162 unless ( defined find_encoding( $arg{$h} ) ) { 163 require Carp; 164 Carp::croak( 165 "encoding: Unknown encoding for $h, '$arg{$h}'"); 166 } 167 eval { binmode( $h, ":raw :encoding($arg{$h})" ) }; 168 } 169 else { 170 unless ( exists $arg{$h} ) { 171 eval { 172 no warnings 'uninitialized'; 173 binmode( $h, ":raw :encoding($name)" ); 174 }; 175 } 176 } 177 if ($@) { 178 require Carp; 179 Carp::croak($@); 180 } 181 } 182 return 1; # I doubt if we need it, though 183} 184 185sub unimport { 186 no warnings; 187 undef ${^ENCODING}; 188 if ($HAS_PERLIO) { 189 binmode( STDIN, ":raw" ); 190 binmode( STDOUT, ":raw" ); 191 } 192 else { 193 binmode(STDIN); 194 binmode(STDOUT); 195 } 196 if ( $INC{"Filter/Util/Call.pm"} ) { 197 eval { filter_del() }; 198 } 199} 200 2011; 202__END__ 203 204=pod 205 206=head1 NAME 207 208encoding - allows you to write your script in non-ascii or non-utf8 209 210=head1 WARNING 211 212This module is deprecated under perl 5.18. It uses a mechanism provided by 213perl that is deprecated under 5.18 and higher, and may be removed in a 214future version. 215 216=head1 SYNOPSIS 217 218 use encoding "greek"; # Perl like Greek to you? 219 use encoding "euc-jp"; # Jperl! 220 221 # or you can even do this if your shell supports your native encoding 222 223 perl -Mencoding=latin2 -e'...' # Feeling centrally European? 224 perl -Mencoding=euc-kr -e'...' # Or Korean? 225 226 # more control 227 228 # A simple euc-cn => utf-8 converter 229 use encoding "euc-cn", STDOUT => "utf8"; while(<>){print}; 230 231 # "no encoding;" supported (but not scoped!) 232 no encoding; 233 234 # an alternate way, Filter 235 use encoding "euc-jp", Filter=>1; 236 # now you can use kanji identifiers -- in euc-jp! 237 238 # switch on locale - 239 # note that this probably means that unless you have a complete control 240 # over the environments the application is ever going to be run, you should 241 # NOT use the feature of encoding pragma allowing you to write your script 242 # in any recognized encoding because changing locale settings will wreck 243 # the script; you can of course still use the other features of the pragma. 244 use encoding ':locale'; 245 246=head1 ABSTRACT 247 248Let's start with a bit of history: Perl 5.6.0 introduced Unicode 249support. You could apply C<substr()> and regexes even to complex CJK 250characters -- so long as the script was written in UTF-8. But back 251then, text editors that supported UTF-8 were still rare and many users 252instead chose to write scripts in legacy encodings, giving up a whole 253new feature of Perl 5.6. 254 255Rewind to the future: starting from perl 5.8.0 with the B<encoding> 256pragma, you can write your script in any encoding you like (so long 257as the C<Encode> module supports it) and still enjoy Unicode support. 258This pragma achieves that by doing the following: 259 260=over 261 262=item * 263 264Internally converts all literals (C<q//,qq//,qr//,qw///, qx//>) from 265the encoding specified to utf8. In Perl 5.8.1 and later, literals in 266C<tr///> and C<DATA> pseudo-filehandle are also converted. 267 268=item * 269 270Changing PerlIO layers of C<STDIN> and C<STDOUT> to the encoding 271 specified. 272 273=back 274 275=head2 Literal Conversions 276 277You can write code in EUC-JP as follows: 278 279 my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji 280 #<-char-><-char-> # 4 octets 281 s/\bCamel\b/$Rakuda/; 282 283And with C<use encoding "euc-jp"> in effect, it is the same thing as 284the code in UTF-8: 285 286 my $Rakuda = "\x{99F1}\x{99DD}"; # two Unicode Characters 287 s/\bCamel\b/$Rakuda/; 288 289=head2 PerlIO layers for C<STD(IN|OUT)> 290 291The B<encoding> pragma also modifies the filehandle layers of 292STDIN and STDOUT to the specified encoding. Therefore, 293 294 use encoding "euc-jp"; 295 my $message = "Camel is the symbol of perl.\n"; 296 my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji 297 $message =~ s/\bCamel\b/$Rakuda/; 298 print $message; 299 300Will print "\xF1\xD1\xF1\xCC is the symbol of perl.\n", 301not "\x{99F1}\x{99DD} is the symbol of perl.\n". 302 303You can override this by giving extra arguments; see below. 304 305=head2 Implicit upgrading for byte strings 306 307By default, if strings operating under byte semantics and strings 308with Unicode character data are concatenated, the new string will 309be created by decoding the byte strings as I<ISO 8859-1 (Latin-1)>. 310 311The B<encoding> pragma changes this to use the specified encoding 312instead. For example: 313 314 use encoding 'utf8'; 315 my $string = chr(20000); # a Unicode string 316 utf8::encode($string); # now it's a UTF-8 encoded byte string 317 # concatenate with another Unicode string 318 print length($string . chr(20000)); 319 320Will print C<2>, because C<$string> is upgraded as UTF-8. Without 321C<use encoding 'utf8';>, it will print C<4> instead, since C<$string> 322is three octets when interpreted as Latin-1. 323 324=head2 Side effects 325 326If the C<encoding> pragma is in scope then the lengths returned are 327calculated from the length of C<$/> in Unicode characters, which is not 328always the same as the length of C<$/> in the native encoding. 329 330This pragma affects utf8::upgrade, but not utf8::downgrade. 331 332=head1 FEATURES THAT REQUIRE 5.8.1 333 334Some of the features offered by this pragma requires perl 5.8.1. Most 335of these are done by Inaba Hiroto. Any other features and changes 336are good for 5.8.0. 337 338=over 339 340=item "NON-EUC" doublebyte encodings 341 342Because perl needs to parse script before applying this pragma, such 343encodings as Shift_JIS and Big-5 that may contain '\' (BACKSLASH; 344\x5c) in the second byte fails because the second byte may 345accidentally escape the quoting character that follows. Perl 5.8.1 346or later fixes this problem. 347 348=item tr// 349 350C<tr//> was overlooked by Perl 5 porters when they released perl 5.8.0 351See the section below for details. 352 353=item DATA pseudo-filehandle 354 355Another feature that was overlooked was C<DATA>. 356 357=back 358 359=head1 USAGE 360 361=over 4 362 363=item use encoding [I<ENCNAME>] ; 364 365Sets the script encoding to I<ENCNAME>. And unless ${^UNICODE} 366exists and non-zero, PerlIO layers of STDIN and STDOUT are set to 367":encoding(I<ENCNAME>)". 368 369Note that STDERR WILL NOT be changed. 370 371Also note that non-STD file handles remain unaffected. Use C<use 372open> or C<binmode> to change layers of those. 373 374If no encoding is specified, the environment variable L<PERL_ENCODING> 375is consulted. If no encoding can be found, the error C<Unknown encoding 376'I<ENCNAME>'> will be thrown. 377 378=item use encoding I<ENCNAME> [ STDIN =E<gt> I<ENCNAME_IN> ...] ; 379 380You can also individually set encodings of STDIN and STDOUT via the 381C<< STDIN => I<ENCNAME> >> form. In this case, you cannot omit the 382first I<ENCNAME>. C<< STDIN => undef >> turns the IO transcoding 383completely off. 384 385When ${^UNICODE} exists and non-zero, these options will completely 386ignored. ${^UNICODE} is a variable introduced in perl 5.8.1. See 387L<perlrun> see L<perlvar/"${^UNICODE}"> and L<perlrun/"-C"> for 388details (perl 5.8.1 and later). 389 390=item use encoding I<ENCNAME> Filter=E<gt>1; 391 392This turns the encoding pragma into a source filter. While the 393default approach just decodes interpolated literals (in qq() and 394qr()), this will apply a source filter to the entire source code. See 395L</"The Filter Option"> below for details. 396 397=item no encoding; 398 399Unsets the script encoding. The layers of STDIN, STDOUT are 400reset to ":raw" (the default unprocessed raw stream of bytes). 401 402=back 403 404=head1 The Filter Option 405 406The magic of C<use encoding> is not applied to the names of 407identifiers. In order to make C<${"\x{4eba}"}++> ($human++, where human 408is a single Han ideograph) work, you still need to write your script 409in UTF-8 -- or use a source filter. That's what 'Filter=>1' does. 410 411What does this mean? Your source code behaves as if it is written in 412UTF-8 with 'use utf8' in effect. So even if your editor only supports 413Shift_JIS, for example, you can still try examples in Chapter 15 of 414C<Programming Perl, 3rd Ed.>. For instance, you can use UTF-8 415identifiers. 416 417This option is significantly slower and (as of this writing) non-ASCII 418identifiers are not very stable WITHOUT this option and with the 419source code written in UTF-8. 420 421=head2 Filter-related changes at Encode version 1.87 422 423=over 424 425=item * 426 427The Filter option now sets STDIN and STDOUT like non-filter options. 428And C<< STDIN=>I<ENCODING> >> and C<< STDOUT=>I<ENCODING> >> work like 429non-filter version. 430 431=item * 432 433C<use utf8> is implicitly declared so you no longer have to C<use 434utf8> to C<${"\x{4eba}"}++>. 435 436=back 437 438=head1 CAVEATS 439 440=head2 NOT SCOPED 441 442The pragma is a per script, not a per block lexical. Only the last 443C<use encoding> or C<no encoding> matters, and it affects 444B<the whole script>. However, the <no encoding> pragma is supported and 445B<use encoding> can appear as many times as you want in a given script. 446The multiple use of this pragma is discouraged. 447 448By the same reason, the use this pragma inside modules is also 449discouraged (though not as strongly discouraged as the case above. 450See below). 451 452If you still have to write a module with this pragma, be very careful 453of the load order. See the codes below; 454 455 # called module 456 package Module_IN_BAR; 457 use encoding "bar"; 458 # stuff in "bar" encoding here 459 1; 460 461 # caller script 462 use encoding "foo" 463 use Module_IN_BAR; 464 # surprise! use encoding "bar" is in effect. 465 466The best way to avoid this oddity is to use this pragma RIGHT AFTER 467other modules are loaded. i.e. 468 469 use Module_IN_BAR; 470 use encoding "foo"; 471 472=head2 DO NOT MIX MULTIPLE ENCODINGS 473 474Notice that only literals (string or regular expression) having only 475legacy code points are affected: if you mix data like this 476 477 \xDF\x{100} 478 479the data is assumed to be in (Latin 1 and) Unicode, not in your native 480encoding. In other words, this will match in "greek": 481 482 "\xDF" =~ /\x{3af}/ 483 484but this will not 485 486 "\xDF\x{100}" =~ /\x{3af}\x{100}/ 487 488since the C<\xDF> (ISO 8859-7 GREEK SMALL LETTER IOTA WITH TONOS) on 489the left will B<not> be upgraded to C<\x{3af}> (Unicode GREEK SMALL 490LETTER IOTA WITH TONOS) because of the C<\x{100}> on the left. You 491should not be mixing your legacy data and Unicode in the same string. 492 493This pragma also affects encoding of the 0x80..0xFF code point range: 494normally characters in that range are left as eight-bit bytes (unless 495they are combined with characters with code points 0x100 or larger, 496in which case all characters need to become UTF-8 encoded), but if 497the C<encoding> pragma is present, even the 0x80..0xFF range always 498gets UTF-8 encoded. 499 500After all, the best thing about this pragma is that you don't have to 501resort to \x{....} just to spell your name in a native encoding. 502So feel free to put your strings in your encoding in quotes and 503regexes. 504 505=head2 tr/// with ranges 506 507The B<encoding> pragma works by decoding string literals in 508C<q//,qq//,qr//,qw///, qx//> and so forth. In perl 5.8.0, this 509does not apply to C<tr///>. Therefore, 510 511 use encoding 'euc-jp'; 512 #.... 513 $kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/; 514 # -------- -------- -------- -------- 515 516Does not work as 517 518 $kana =~ tr/\x{3041}-\x{3093}/\x{30a1}-\x{30f3}/; 519 520=over 521 522=item Legend of characters above 523 524 utf8 euc-jp charnames::viacode() 525 ----------------------------------------- 526 \x{3041} \xA4\xA1 HIRAGANA LETTER SMALL A 527 \x{3093} \xA4\xF3 HIRAGANA LETTER N 528 \x{30a1} \xA5\xA1 KATAKANA LETTER SMALL A 529 \x{30f3} \xA5\xF3 KATAKANA LETTER N 530 531=back 532 533This counterintuitive behavior has been fixed in perl 5.8.1. 534 535=head3 workaround to tr///; 536 537In perl 5.8.0, you can work around as follows; 538 539 use encoding 'euc-jp'; 540 # .... 541 eval qq{ \$kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/ }; 542 543Note the C<tr//> expression is surrounded by C<qq{}>. The idea behind 544is the same as classic idiom that makes C<tr///> 'interpolate'. 545 546 tr/$from/$to/; # wrong! 547 eval qq{ tr/$from/$to/ }; # workaround. 548 549Nevertheless, in case of B<encoding> pragma even C<q//> is affected so 550C<tr///> not being decoded was obviously against the will of Perl5 551Porters so it has been fixed in Perl 5.8.1 or later. 552 553=head1 EXAMPLE - Greekperl 554 555 use encoding "iso 8859-7"; 556 557 # \xDF in ISO 8859-7 (Greek) is \x{3af} in Unicode. 558 559 $a = "\xDF"; 560 $b = "\x{100}"; 561 562 printf "%#x\n", ord($a); # will print 0x3af, not 0xdf 563 564 $c = $a . $b; 565 566 # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}". 567 568 # chr() is affected, and ... 569 570 print "mega\n" if ord(chr(0xdf)) == 0x3af; 571 572 # ... ord() is affected by the encoding pragma ... 573 574 print "tera\n" if ord(pack("C", 0xdf)) == 0x3af; 575 576 # ... as are eq and cmp ... 577 578 print "peta\n" if "\x{3af}" eq pack("C", 0xdf); 579 print "exa\n" if "\x{3af}" cmp pack("C", 0xdf) == 0; 580 581 # ... but pack/unpack C are not affected, in case you still 582 # want to go back to your native encoding 583 584 print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf; 585 586=head1 KNOWN PROBLEMS 587 588=over 589 590=item literals in regex that are longer than 127 bytes 591 592For native multibyte encodings (either fixed or variable length), 593the current implementation of the regular expressions may introduce 594recoding errors for regular expression literals longer than 127 bytes. 595 596=item EBCDIC 597 598The encoding pragma is not supported on EBCDIC platforms. 599(Porters who are willing and able to remove this limitation are 600welcome.) 601 602=item format 603 604This pragma doesn't work well with format because PerlIO does not 605get along very well with it. When format contains non-ascii 606characters it prints funny or gets "wide character warnings". 607To understand it, try the code below. 608 609 # Save this one in utf8 610 # replace *non-ascii* with a non-ascii string 611 my $camel; 612 format STDOUT = 613 *non-ascii*@>>>>>>> 614 $camel 615 . 616 $camel = "*non-ascii*"; 617 binmode(STDOUT=>':encoding(utf8)'); # bang! 618 write; # funny 619 print $camel, "\n"; # fine 620 621Without binmode this happens to work but without binmode, print() 622fails instead of write(). 623 624At any rate, the very use of format is questionable when it comes to 625unicode characters since you have to consider such things as character 626width (i.e. double-width for ideographs) and directions (i.e. BIDI for 627Arabic and Hebrew). 628 629=item Thread safety 630 631C<use encoding ...> is not thread-safe (i.e., do not use in threaded 632applications). 633 634=back 635 636=head2 The Logic of :locale 637 638The logic of C<:locale> is as follows: 639 640=over 4 641 642=item 1. 643 644If the platform supports the langinfo(CODESET) interface, the codeset 645returned is used as the default encoding for the open pragma. 646 647=item 2. 648 649If 1. didn't work but we are under the locale pragma, the environment 650variables LC_ALL and LANG (in that order) are matched for encodings 651(the part after C<.>, if any), and if any found, that is used 652as the default encoding for the open pragma. 653 654=item 3. 655 656If 1. and 2. didn't work, the environment variables LC_ALL and LANG 657(in that order) are matched for anything looking like UTF-8, and if 658any found, C<:utf8> is used as the default encoding for the open 659pragma. 660 661=back 662 663If your locale environment variables (LC_ALL, LC_CTYPE, LANG) 664contain the strings 'UTF-8' or 'UTF8' (case-insensitive matching), 665the default encoding of your STDIN, STDOUT, and STDERR, and of 666B<any subsequent file open>, is UTF-8. 667 668=head1 HISTORY 669 670This pragma first appeared in Perl 5.8.0. For features that require 6715.8.1 and better, see above. 672 673The C<:locale> subpragma was implemented in 2.01, or Perl 5.8.6. 674 675=head1 SEE ALSO 676 677L<perlunicode>, L<Encode>, L<open>, L<Filter::Util::Call>, 678 679Ch. 15 of C<Programming Perl (3rd Edition)> 680by Larry Wall, Tom Christiansen, Jon Orwant; 681O'Reilly & Associates; ISBN 0-596-00027-8 682 683=cut 684