1# $Id: encoding.pm,v 2.12 2013/04/26 18:30:46 dankogai Exp $ 2package encoding; 3our $VERSION = sprintf "%d.%02d", q$Revision: 2.12 $ =~ /(\d+)/g; 4 5use Encode; 6use strict; 7use warnings; 8 9use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG}; 10 11BEGIN { 12 if ( ord("A") == 193 ) { 13 require Carp; 14 Carp::croak("encoding: pragma does not support EBCDIC platforms"); 15 } 16} 17 18our $HAS_PERLIO = 0; 19eval { require PerlIO::encoding }; 20unless ($@) { 21 $HAS_PERLIO = ( PerlIO::encoding->VERSION >= 0.02 ); 22} 23 24sub _exception { 25 my $name = shift; 26 $] > 5.008 and return 0; # 5.8.1 or higher then no 27 my %utfs = map { $_ => 1 } 28 qw(utf8 UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE 29 UTF-32 UTF-32BE UTF-32LE); 30 $utfs{$name} or return 0; # UTFs or no 31 require Config; 32 Config->import(); 33 our %Config; 34 return $Config{perl_patchlevel} ? 0 : 1 # maintperl then no 35} 36 37sub in_locale { $^H & ( $locale::hint_bits || 0 ) } 38 39sub _get_locale_encoding { 40 my $locale_encoding; 41 42 # I18N::Langinfo isn't available everywhere 43 eval { 44 require I18N::Langinfo; 45 I18N::Langinfo->import(qw(langinfo CODESET)); 46 $locale_encoding = langinfo( CODESET() ); 47 }; 48 49 my $country_language; 50 51 no warnings 'uninitialized'; 52 53 if ( (not $locale_encoding) && in_locale() ) { 54 if ( $ENV{LC_ALL} =~ /^([^.]+)\.([^.@]+)(@.*)?$/ ) { 55 ( $country_language, $locale_encoding ) = ( $1, $2 ); 56 } 57 elsif ( $ENV{LANG} =~ /^([^.]+)\.([^.@]+)(@.*)?$/ ) { 58 ( $country_language, $locale_encoding ) = ( $1, $2 ); 59 } 60 61 # LANGUAGE affects only LC_MESSAGES only on glibc 62 } 63 elsif ( not $locale_encoding ) { 64 if ( $ENV{LC_ALL} =~ /\butf-?8\b/i 65 || $ENV{LANG} =~ /\butf-?8\b/i ) 66 { 67 $locale_encoding = 'utf8'; 68 } 69 70 # Could do more heuristics based on the country and language 71 # parts of LC_ALL and LANG (the parts before the dot (if any)), 72 # since we have Locale::Country and Locale::Language available. 73 # TODO: get a database of Language -> Encoding mappings 74 # (the Estonian database at http://www.eki.ee/letter/ 75 # would be excellent!) --jhi 76 } 77 if ( defined $locale_encoding 78 && lc($locale_encoding) eq 'euc' 79 && defined $country_language ) 80 { 81 if ( $country_language =~ /^ja_JP|japan(?:ese)?$/i ) { 82 $locale_encoding = 'euc-jp'; 83 } 84 elsif ( $country_language =~ /^ko_KR|korean?$/i ) { 85 $locale_encoding = 'euc-kr'; 86 } 87 elsif ( $country_language =~ /^zh_CN|chin(?:a|ese)$/i ) { 88 $locale_encoding = 'euc-cn'; 89 } 90 elsif ( $country_language =~ /^zh_TW|taiwan(?:ese)?$/i ) { 91 $locale_encoding = 'euc-tw'; 92 } 93 else { 94 require Carp; 95 Carp::croak( 96 "encoding: Locale encoding '$locale_encoding' too ambiguous" 97 ); 98 } 99 } 100 101 return $locale_encoding; 102} 103 104sub import { 105 if ($] >= 5.017) { 106 warnings::warnif("deprecated", 107 "Use of the encoding pragma is deprecated") 108 } 109 my $class = shift; 110 my $name = shift; 111 if (!$name){ 112 require Carp; 113 Carp::croak("encoding: no encoding specified."); 114 } 115 if ( $name eq ':_get_locale_encoding' ) { # used by lib/open.pm 116 my $caller = caller(); 117 { 118 no strict 'refs'; 119 *{"${caller}::_get_locale_encoding"} = \&_get_locale_encoding; 120 } 121 return; 122 } 123 $name = _get_locale_encoding() if $name eq ':locale'; 124 my %arg = @_; 125 $name = $ENV{PERL_ENCODING} unless defined $name; 126 my $enc = find_encoding($name); 127 unless ( defined $enc ) { 128 require Carp; 129 Carp::croak("encoding: Unknown encoding '$name'"); 130 } 131 $name = $enc->name; # canonize 132 unless ( $arg{Filter} ) { 133 DEBUG and warn "_exception($name) = ", _exception($name); 134 _exception($name) or ${^ENCODING} = $enc; 135 $HAS_PERLIO or return 1; 136 } 137 else { 138 defined( ${^ENCODING} ) and undef ${^ENCODING}; 139 140 # implicitly 'use utf8' 141 require utf8; # to fetch $utf8::hint_bits; 142 $^H |= $utf8::hint_bits; 143 eval { 144 require Filter::Util::Call; 145 Filter::Util::Call->import; 146 filter_add( 147 sub { 148 my $status = filter_read(); 149 if ( $status > 0 ) { 150 $_ = $enc->decode( $_, 1 ); 151 DEBUG and warn $_; 152 } 153 $status; 154 } 155 ); 156 }; 157 $@ eq '' and DEBUG and warn "Filter installed"; 158 } 159 defined ${^UNICODE} and ${^UNICODE} != 0 and return 1; 160 for my $h (qw(STDIN STDOUT)) { 161 if ( $arg{$h} ) { 162 unless ( defined find_encoding( $arg{$h} ) ) { 163 require Carp; 164 Carp::croak( 165 "encoding: Unknown encoding for $h, '$arg{$h}'"); 166 } 167 eval { binmode( $h, ":raw :encoding($arg{$h})" ) }; 168 } 169 else { 170 unless ( exists $arg{$h} ) { 171 eval { 172 no warnings 'uninitialized'; 173 binmode( $h, ":raw :encoding($name)" ); 174 }; 175 } 176 } 177 if ($@) { 178 require Carp; 179 Carp::croak($@); 180 } 181 } 182 return 1; # I doubt if we need it, though 183} 184 185sub unimport { 186 no warnings; 187 undef ${^ENCODING}; 188 if ($HAS_PERLIO) { 189 binmode( STDIN, ":raw" ); 190 binmode( STDOUT, ":raw" ); 191 } 192 else { 193 binmode(STDIN); 194 binmode(STDOUT); 195 } 196 if ( $INC{"Filter/Util/Call.pm"} ) { 197 eval { filter_del() }; 198 } 199} 200 2011; 202__END__ 203 204=pod 205 206=head1 NAME 207 208encoding - allows you to write your script in non-ascii or non-utf8 209 210=head1 WARNING 211 212This module is deprecated under perl 5.18. It uses a mechanism provided by 213perl that is deprecated under 5.18 and higher, and may be removed in a 214future version. 215 216The easiest and the best alternative is to write your script in UTF-8 217and declear: 218 219 use utf8; # not use encoding ':utf8'; 220 221See L<perluniintro> and L<utf8> for details. 222 223=head1 SYNOPSIS 224 225 use encoding "greek"; # Perl like Greek to you? 226 use encoding "euc-jp"; # Jperl! 227 228 # or you can even do this if your shell supports your native encoding 229 230 perl -Mencoding=latin2 -e'...' # Feeling centrally European? 231 perl -Mencoding=euc-kr -e'...' # Or Korean? 232 233 # more control 234 235 # A simple euc-cn => utf-8 converter 236 use encoding "euc-cn", STDOUT => "utf8"; while(<>){print}; 237 238 # "no encoding;" supported (but not scoped!) 239 no encoding; 240 241 # an alternate way, Filter 242 use encoding "euc-jp", Filter=>1; 243 # now you can use kanji identifiers -- in euc-jp! 244 245 # switch on locale - 246 # note that this probably means that unless you have a complete control 247 # over the environments the application is ever going to be run, you should 248 # NOT use the feature of encoding pragma allowing you to write your script 249 # in any recognized encoding because changing locale settings will wreck 250 # the script; you can of course still use the other features of the pragma. 251 use encoding ':locale'; 252 253=head1 ABSTRACT 254 255Let's start with a bit of history: Perl 5.6.0 introduced Unicode 256support. You could apply C<substr()> and regexes even to complex CJK 257characters -- so long as the script was written in UTF-8. But back 258then, text editors that supported UTF-8 were still rare and many users 259instead chose to write scripts in legacy encodings, giving up a whole 260new feature of Perl 5.6. 261 262Rewind to the future: starting from perl 5.8.0 with the B<encoding> 263pragma, you can write your script in any encoding you like (so long 264as the C<Encode> module supports it) and still enjoy Unicode support. 265This pragma achieves that by doing the following: 266 267=over 268 269=item * 270 271Internally converts all literals (C<q//,qq//,qr//,qw///, qx//>) from 272the encoding specified to utf8. In Perl 5.8.1 and later, literals in 273C<tr///> and C<DATA> pseudo-filehandle are also converted. 274 275=item * 276 277Changing PerlIO layers of C<STDIN> and C<STDOUT> to the encoding 278 specified. 279 280=back 281 282=head2 Literal Conversions 283 284You can write code in EUC-JP as follows: 285 286 my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji 287 #<-char-><-char-> # 4 octets 288 s/\bCamel\b/$Rakuda/; 289 290And with C<use encoding "euc-jp"> in effect, it is the same thing as 291the code in UTF-8: 292 293 my $Rakuda = "\x{99F1}\x{99DD}"; # two Unicode Characters 294 s/\bCamel\b/$Rakuda/; 295 296=head2 PerlIO layers for C<STD(IN|OUT)> 297 298The B<encoding> pragma also modifies the filehandle layers of 299STDIN and STDOUT to the specified encoding. Therefore, 300 301 use encoding "euc-jp"; 302 my $message = "Camel is the symbol of perl.\n"; 303 my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji 304 $message =~ s/\bCamel\b/$Rakuda/; 305 print $message; 306 307Will print "\xF1\xD1\xF1\xCC is the symbol of perl.\n", 308not "\x{99F1}\x{99DD} is the symbol of perl.\n". 309 310You can override this by giving extra arguments; see below. 311 312=head2 Implicit upgrading for byte strings 313 314By default, if strings operating under byte semantics and strings 315with Unicode character data are concatenated, the new string will 316be created by decoding the byte strings as I<ISO 8859-1 (Latin-1)>. 317 318The B<encoding> pragma changes this to use the specified encoding 319instead. For example: 320 321 use encoding 'utf8'; 322 my $string = chr(20000); # a Unicode string 323 utf8::encode($string); # now it's a UTF-8 encoded byte string 324 # concatenate with another Unicode string 325 print length($string . chr(20000)); 326 327Will print C<2>, because C<$string> is upgraded as UTF-8. Without 328C<use encoding 'utf8';>, it will print C<4> instead, since C<$string> 329is three octets when interpreted as Latin-1. 330 331=head2 Side effects 332 333If the C<encoding> pragma is in scope then the lengths returned are 334calculated from the length of C<$/> in Unicode characters, which is not 335always the same as the length of C<$/> in the native encoding. 336 337This pragma affects utf8::upgrade, but not utf8::downgrade. 338 339=head1 FEATURES THAT REQUIRE 5.8.1 340 341Some of the features offered by this pragma requires perl 5.8.1. Most 342of these are done by Inaba Hiroto. Any other features and changes 343are good for 5.8.0. 344 345=over 346 347=item "NON-EUC" doublebyte encodings 348 349Because perl needs to parse script before applying this pragma, such 350encodings as Shift_JIS and Big-5 that may contain '\' (BACKSLASH; 351\x5c) in the second byte fails because the second byte may 352accidentally escape the quoting character that follows. Perl 5.8.1 353or later fixes this problem. 354 355=item tr// 356 357C<tr//> was overlooked by Perl 5 porters when they released perl 5.8.0 358See the section below for details. 359 360=item DATA pseudo-filehandle 361 362Another feature that was overlooked was C<DATA>. 363 364=back 365 366=head1 USAGE 367 368=over 4 369 370=item use encoding [I<ENCNAME>] ; 371 372Sets the script encoding to I<ENCNAME>. And unless ${^UNICODE} 373exists and non-zero, PerlIO layers of STDIN and STDOUT are set to 374":encoding(I<ENCNAME>)". 375 376Note that STDERR WILL NOT be changed. 377 378Also note that non-STD file handles remain unaffected. Use C<use 379open> or C<binmode> to change layers of those. 380 381If no encoding is specified, the environment variable L<PERL_ENCODING> 382is consulted. If no encoding can be found, the error C<Unknown encoding 383'I<ENCNAME>'> will be thrown. 384 385=item use encoding I<ENCNAME> [ STDIN =E<gt> I<ENCNAME_IN> ...] ; 386 387You can also individually set encodings of STDIN and STDOUT via the 388C<< STDIN => I<ENCNAME> >> form. In this case, you cannot omit the 389first I<ENCNAME>. C<< STDIN => undef >> turns the IO transcoding 390completely off. 391 392When ${^UNICODE} exists and non-zero, these options will completely 393ignored. ${^UNICODE} is a variable introduced in perl 5.8.1. See 394L<perlrun> see L<perlvar/"${^UNICODE}"> and L<perlrun/"-C"> for 395details (perl 5.8.1 and later). 396 397=item use encoding I<ENCNAME> Filter=E<gt>1; 398 399This turns the encoding pragma into a source filter. While the 400default approach just decodes interpolated literals (in qq() and 401qr()), this will apply a source filter to the entire source code. See 402L</"The Filter Option"> below for details. 403 404=item no encoding; 405 406Unsets the script encoding. The layers of STDIN, STDOUT are 407reset to ":raw" (the default unprocessed raw stream of bytes). 408 409=back 410 411=head1 The Filter Option 412 413The magic of C<use encoding> is not applied to the names of 414identifiers. In order to make C<${"\x{4eba}"}++> ($human++, where human 415is a single Han ideograph) work, you still need to write your script 416in UTF-8 -- or use a source filter. That's what 'Filter=>1' does. 417 418What does this mean? Your source code behaves as if it is written in 419UTF-8 with 'use utf8' in effect. So even if your editor only supports 420Shift_JIS, for example, you can still try examples in Chapter 15 of 421C<Programming Perl, 3rd Ed.>. For instance, you can use UTF-8 422identifiers. 423 424This option is significantly slower and (as of this writing) non-ASCII 425identifiers are not very stable WITHOUT this option and with the 426source code written in UTF-8. 427 428=head2 Filter-related changes at Encode version 1.87 429 430=over 431 432=item * 433 434The Filter option now sets STDIN and STDOUT like non-filter options. 435And C<< STDIN=>I<ENCODING> >> and C<< STDOUT=>I<ENCODING> >> work like 436non-filter version. 437 438=item * 439 440C<use utf8> is implicitly declared so you no longer have to C<use 441utf8> to C<${"\x{4eba}"}++>. 442 443=back 444 445=head1 CAVEATS 446 447=head2 NOT SCOPED 448 449The pragma is a per script, not a per block lexical. Only the last 450C<use encoding> or C<no encoding> matters, and it affects 451B<the whole script>. However, the <no encoding> pragma is supported and 452B<use encoding> can appear as many times as you want in a given script. 453The multiple use of this pragma is discouraged. 454 455By the same reason, the use this pragma inside modules is also 456discouraged (though not as strongly discouraged as the case above. 457See below). 458 459If you still have to write a module with this pragma, be very careful 460of the load order. See the codes below; 461 462 # called module 463 package Module_IN_BAR; 464 use encoding "bar"; 465 # stuff in "bar" encoding here 466 1; 467 468 # caller script 469 use encoding "foo" 470 use Module_IN_BAR; 471 # surprise! use encoding "bar" is in effect. 472 473The best way to avoid this oddity is to use this pragma RIGHT AFTER 474other modules are loaded. i.e. 475 476 use Module_IN_BAR; 477 use encoding "foo"; 478 479=head2 DO NOT MIX MULTIPLE ENCODINGS 480 481Notice that only literals (string or regular expression) having only 482legacy code points are affected: if you mix data like this 483 484 \xDF\x{100} 485 486the data is assumed to be in (Latin 1 and) Unicode, not in your native 487encoding. In other words, this will match in "greek": 488 489 "\xDF" =~ /\x{3af}/ 490 491but this will not 492 493 "\xDF\x{100}" =~ /\x{3af}\x{100}/ 494 495since the C<\xDF> (ISO 8859-7 GREEK SMALL LETTER IOTA WITH TONOS) on 496the left will B<not> be upgraded to C<\x{3af}> (Unicode GREEK SMALL 497LETTER IOTA WITH TONOS) because of the C<\x{100}> on the left. You 498should not be mixing your legacy data and Unicode in the same string. 499 500This pragma also affects encoding of the 0x80..0xFF code point range: 501normally characters in that range are left as eight-bit bytes (unless 502they are combined with characters with code points 0x100 or larger, 503in which case all characters need to become UTF-8 encoded), but if 504the C<encoding> pragma is present, even the 0x80..0xFF range always 505gets UTF-8 encoded. 506 507After all, the best thing about this pragma is that you don't have to 508resort to \x{....} just to spell your name in a native encoding. 509So feel free to put your strings in your encoding in quotes and 510regexes. 511 512=head2 tr/// with ranges 513 514The B<encoding> pragma works by decoding string literals in 515C<q//,qq//,qr//,qw///, qx//> and so forth. In perl 5.8.0, this 516does not apply to C<tr///>. Therefore, 517 518 use encoding 'euc-jp'; 519 #.... 520 $kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/; 521 # -------- -------- -------- -------- 522 523Does not work as 524 525 $kana =~ tr/\x{3041}-\x{3093}/\x{30a1}-\x{30f3}/; 526 527=over 528 529=item Legend of characters above 530 531 utf8 euc-jp charnames::viacode() 532 ----------------------------------------- 533 \x{3041} \xA4\xA1 HIRAGANA LETTER SMALL A 534 \x{3093} \xA4\xF3 HIRAGANA LETTER N 535 \x{30a1} \xA5\xA1 KATAKANA LETTER SMALL A 536 \x{30f3} \xA5\xF3 KATAKANA LETTER N 537 538=back 539 540This counterintuitive behavior has been fixed in perl 5.8.1. 541 542=head3 workaround to tr///; 543 544In perl 5.8.0, you can work around as follows; 545 546 use encoding 'euc-jp'; 547 # .... 548 eval qq{ \$kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/ }; 549 550Note the C<tr//> expression is surrounded by C<qq{}>. The idea behind 551is the same as classic idiom that makes C<tr///> 'interpolate'. 552 553 tr/$from/$to/; # wrong! 554 eval qq{ tr/$from/$to/ }; # workaround. 555 556Nevertheless, in case of B<encoding> pragma even C<q//> is affected so 557C<tr///> not being decoded was obviously against the will of Perl5 558Porters so it has been fixed in Perl 5.8.1 or later. 559 560=head1 EXAMPLE - Greekperl 561 562 use encoding "iso 8859-7"; 563 564 # \xDF in ISO 8859-7 (Greek) is \x{3af} in Unicode. 565 566 $a = "\xDF"; 567 $b = "\x{100}"; 568 569 printf "%#x\n", ord($a); # will print 0x3af, not 0xdf 570 571 $c = $a . $b; 572 573 # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}". 574 575 # chr() is affected, and ... 576 577 print "mega\n" if ord(chr(0xdf)) == 0x3af; 578 579 # ... ord() is affected by the encoding pragma ... 580 581 print "tera\n" if ord(pack("C", 0xdf)) == 0x3af; 582 583 # ... as are eq and cmp ... 584 585 print "peta\n" if "\x{3af}" eq pack("C", 0xdf); 586 print "exa\n" if "\x{3af}" cmp pack("C", 0xdf) == 0; 587 588 # ... but pack/unpack C are not affected, in case you still 589 # want to go back to your native encoding 590 591 print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf; 592 593=head1 KNOWN PROBLEMS 594 595=over 596 597=item literals in regex that are longer than 127 bytes 598 599For native multibyte encodings (either fixed or variable length), 600the current implementation of the regular expressions may introduce 601recoding errors for regular expression literals longer than 127 bytes. 602 603=item EBCDIC 604 605The encoding pragma is not supported on EBCDIC platforms. 606(Porters who are willing and able to remove this limitation are 607welcome.) 608 609=item format 610 611This pragma doesn't work well with format because PerlIO does not 612get along very well with it. When format contains non-ascii 613characters it prints funny or gets "wide character warnings". 614To understand it, try the code below. 615 616 # Save this one in utf8 617 # replace *non-ascii* with a non-ascii string 618 my $camel; 619 format STDOUT = 620 *non-ascii*@>>>>>>> 621 $camel 622 . 623 $camel = "*non-ascii*"; 624 binmode(STDOUT=>':encoding(utf8)'); # bang! 625 write; # funny 626 print $camel, "\n"; # fine 627 628Without binmode this happens to work but without binmode, print() 629fails instead of write(). 630 631At any rate, the very use of format is questionable when it comes to 632unicode characters since you have to consider such things as character 633width (i.e. double-width for ideographs) and directions (i.e. BIDI for 634Arabic and Hebrew). 635 636=item Thread safety 637 638C<use encoding ...> is not thread-safe (i.e., do not use in threaded 639applications). 640 641=back 642 643=head2 The Logic of :locale 644 645The logic of C<:locale> is as follows: 646 647=over 4 648 649=item 1. 650 651If the platform supports the langinfo(CODESET) interface, the codeset 652returned is used as the default encoding for the open pragma. 653 654=item 2. 655 656If 1. didn't work but we are under the locale pragma, the environment 657variables LC_ALL and LANG (in that order) are matched for encodings 658(the part after C<.>, if any), and if any found, that is used 659as the default encoding for the open pragma. 660 661=item 3. 662 663If 1. and 2. didn't work, the environment variables LC_ALL and LANG 664(in that order) are matched for anything looking like UTF-8, and if 665any found, C<:utf8> is used as the default encoding for the open 666pragma. 667 668=back 669 670If your locale environment variables (LC_ALL, LC_CTYPE, LANG) 671contain the strings 'UTF-8' or 'UTF8' (case-insensitive matching), 672the default encoding of your STDIN, STDOUT, and STDERR, and of 673B<any subsequent file open>, is UTF-8. 674 675=head1 HISTORY 676 677This pragma first appeared in Perl 5.8.0. For features that require 6785.8.1 and better, see above. 679 680The C<:locale> subpragma was implemented in 2.01, or Perl 5.8.6. 681 682=head1 SEE ALSO 683 684L<perlunicode>, L<Encode>, L<open>, L<Filter::Util::Call>, 685 686Ch. 15 of C<Programming Perl (3rd Edition)> 687by Larry Wall, Tom Christiansen, Jon Orwant; 688O'Reilly & Associates; ISBN 0-596-00027-8 689 690=cut 691