ext/Encode/encoding.pm

*0Sstevel@tonic-gate# $Id: encoding.pm,v 1.48 2003/12/29 02:47:16 dankogai Exp dankogai $
package encoding;

# Build the version string from the RCS revision keyword: the first number
# is printed as-is, every following number as a two-digit field.
our $VERSION = do {
    my @rev = ( q$Revision: 1.48 $ =~ /\d+/g );
    sprintf "%d." . "%02d" x $#rev, @rev;
};

use Encode;
use strict;

# Constant debugging switch; the "DEBUG and warn ..." statements in this
# file are only reached when this returns a true value.
sub DEBUG () { 0 }
*0Sstevel@tonic-gate
# Refuse to load at compile time on platforms where "A" has code point 193,
# which this module treats as the mark of an EBCDIC platform.
BEGIN {
    my $is_ebcdic = ( ord("A") == 193 );
    if ($is_ebcdic) {
        require Carp;
        Carp::croak("encoding pragma does not support EBCDIC platforms");
    }
}
*0Sstevel@tonic-gate
# True only when PerlIO::encoding can be loaded and reports a version of at
# least 0.02; import() and unimport() consult this flag before touching the
# layers of the standard handles.
our $HAS_PERLIO = 0;
if ( eval { require PerlIO::encoding; 1 } ) {
    $HAS_PERLIO = ( PerlIO::encoding->VERSION >= 0.02 );
}
*0Sstevel@tonic-gate
# Decide whether ${^ENCODING} must NOT be set for the given encoding name.
#
# Takes the canonical encoding name.  Returns 1 only when all of these hold:
# the running perl is not newer than 5.008, the name is one of the UTF
# encodings listed below, and $Config{perl_patchlevel} is false.  Returns 0
# in every other case.
sub _exception {
    my ($name) = @_;

    # 5.8.1 or higher then no
    return 0 if $] > 5.008;

    my %is_utf;
    @is_utf{
        qw(utf8 UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE
           UTF-32 UTF-32BE UTF-32LE)
    } = (1) x 9;

    # UTFs or no
    return 0 unless $is_utf{$name};

    require Config;
    Config->import();
    our %Config;

    # maintperl then no
    return $Config{perl_patchlevel} ? 0 : 1;
}
*0Sstevel@tonic-gate
# Implements "use encoding NAME, OPTION => VALUE, ...".
#
# Arguments after the class name:
#   NAME     Encoding name.  When false, $ENV{PERL_ENCODING} is used.
#   Filter   When true, a source filter decoding the source with the
#            encoding is installed (and the utf8 hint bits are turned on)
#            instead of setting ${^ENCODING}.
#   STDIN,   Encoding for that handle.  When the key is absent the script
#   STDOUT   encoding is used; when the key is present with a false value
#            the handle is left untouched.
#
# Croaks with "Unknown encoding ..." when an encoding name cannot be
# resolved, and with the underlying error when setting a handle layer dies.
# Returns 1.
sub import {
    my $class = shift;
    my $name  = shift;
    my %arg   = @_;

    $name ||= $ENV{PERL_ENCODING};

    # Do not hand an undefined name to find_encoding() or interpolate it
    # into the message; the error text is the same as for an empty name.
    my $enc = defined $name ? find_encoding($name) : undef;
    unless ( defined $enc ) {
        require Carp;
        my $shown = defined $name ? $name : '';
        Carp::croak("Unknown encoding '$shown'");
    }
    $name = $enc->name;    # canonize

    if ( $arg{Filter} ) {
        defined( ${^ENCODING} ) and undef ${^ENCODING};

        # implicitly 'use utf8'
        require utf8;      # to fetch $utf8::hint_bits;
        $^H |= $utf8::hint_bits;

        # Installing the filter is best-effort: a failure here is not fatal.
        eval {
            require Filter::Util::Call;
            Filter::Util::Call->import;
            filter_add(
                sub {
                    my $status = filter_read();
                    if ( $status > 0 ) {
                        $_ = $enc->decode( $_, 1 );
                        DEBUG and warn $_;
                    }
                    $status;
                }
            );
        };

        # Report what actually happened, and only on the filter path.
        DEBUG and warn $@ ? "Filter not installed: $@" : "Filter installed";
    }
    else {
        DEBUG and warn "_exception($name) = ", _exception($name);
        _exception($name) or ${^ENCODING} = $enc;
        $HAS_PERLIO or return 1;
    }

    # A non-zero ${^UNICODE} means the handle options are ignored entirely.
    defined ${^UNICODE} and ${^UNICODE} != 0 and return 1;

    for my $h (qw(STDIN STDOUT)) {
        my $handle_enc;
        if ( $arg{$h} ) {
            unless ( defined find_encoding( $arg{$h} ) ) {
                require Carp;
                Carp::croak("Unknown encoding for $h, '$arg{$h}'");
            }
            $handle_enc = $arg{$h};
        }
        elsif ( exists $arg{$h} ) {

            # Explicitly disabled (e.g. STDIN => undef): nothing is
            # attempted for this handle, so there is no error to inspect.
            # Skipping here keeps a leftover $@ from an earlier eval from
            # being reported as a failure of this handle.
            next;
        }
        else {
            $handle_enc = $name;
        }

        eval {
            no warnings 'uninitialized';
            binmode( $h, ":raw :encoding($handle_enc)" );
        };
        if ($@) {
            require Carp;
            Carp::croak($@);
        }
    }
    return 1;    # I doubt if we need it, though
}
*0Sstevel@tonic-gate
# Implements "no encoding": clears ${^ENCODING}, resets STDIN and STDOUT
# with binmode (using the ":raw" layer when $HAS_PERLIO is true, plain
# binmode otherwise), and tries to remove a source filter.
sub unimport {
    no warnings;
    undef ${^ENCODING};

    for my $fh ( \*STDIN, \*STDOUT ) {
        if ($HAS_PERLIO) {
            binmode( $fh, ":raw" );
        }
        else {
            binmode($fh);
        }
    }

    # Only attempt filter removal when Filter::Util::Call has been loaded;
    # any failure of filter_del() is deliberately ignored.
    eval { filter_del() } if $INC{"Filter/Util/Call.pm"};
}
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate1;
*0Sstevel@tonic-gate__END__
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=pod
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=head1 NAME
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateencoding - allows you to write your script in non-ascii or non-utf8
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=head1 SYNOPSIS
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate  use encoding "greek";  # Perl like Greek to you?
*0Sstevel@tonic-gate  use encoding "euc-jp"; # Jperl!
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate  # or you can even do this if your shell supports your native encoding
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate  perl -Mencoding=latin2 -e '...' # Feeling centrally European?
*0Sstevel@tonic-gate  perl -Mencoding=euc-kr -e '...' # Or Korean?
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate  # more control
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate  # A simple euc-cn => utf-8 converter
*0Sstevel@tonic-gate  use encoding "euc-cn", STDOUT => "utf8";  while(<>){print};
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate  # "no encoding;" supported (but not scoped!)
*0Sstevel@tonic-gate  no encoding;
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate  # an alternate way, Filter
*0Sstevel@tonic-gate  use encoding "euc-jp", Filter=>1;
*0Sstevel@tonic-gate  # now you can use kanji identifiers -- in euc-jp!
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=head1 ABSTRACT
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateLet's start with a bit of history: Perl 5.6.0 introduced Unicode
*0Sstevel@tonic-gatesupport.  You could apply C<substr()> and regexes even to complex CJK
*0Sstevel@tonic-gatecharacters -- so long as the script was written in UTF-8.  But back
*0Sstevel@tonic-gatethen, text editors that supported UTF-8 were still rare and many users
*0Sstevel@tonic-gateinstead chose to write scripts in legacy encodings, giving up a whole
*0Sstevel@tonic-gatenew feature of Perl 5.6.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateRewind to the future: starting from perl 5.8.0 with the B<encoding>
*0Sstevel@tonic-gatepragma, you can write your script in any encoding you like (so long
*0Sstevel@tonic-gateas the C<Encode> module supports it) and still enjoy Unicode support.
*0Sstevel@tonic-gateThis pragma achieves that by doing the following:
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=over
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=item *
*0Sstevel@tonic-gate
Internally converts all literals (C<q//>, C<qq//>, C<qr//>, C<qw//>,
C<qx//>) from the encoding specified to utf8.  In Perl 5.8.1 and later,
literals in C<tr///> and the C<DATA> pseudo-filehandle are also converted.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=item *
*0Sstevel@tonic-gate
Changes the PerlIO layers of C<STDIN> and C<STDOUT> to the encoding
specified.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=back
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=head2 Literal Conversions
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateYou can write code in EUC-JP as follows:
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate  my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
*0Sstevel@tonic-gate               #<-char-><-char->   # 4 octets
*0Sstevel@tonic-gate  s/\bCamel\b/$Rakuda/;
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateAnd with C<use encoding "euc-jp"> in effect, it is the same thing as
*0Sstevel@tonic-gatethe code in UTF-8:
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate  my $Rakuda = "\x{99F1}\x{99DD}"; # two Unicode Characters
*0Sstevel@tonic-gate  s/\bCamel\b/$Rakuda/;
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=head2 PerlIO layers for C<STD(IN|OUT)>
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateThe B<encoding> pragma also modifies the filehandle layers of
*0Sstevel@tonic-gateSTDIN and STDOUT to the specified encoding.  Therefore,
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate  use encoding "euc-jp";
*0Sstevel@tonic-gate  my $message = "Camel is the symbol of perl.\n";
*0Sstevel@tonic-gate  my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
*0Sstevel@tonic-gate  $message =~ s/\bCamel\b/$Rakuda/;
*0Sstevel@tonic-gate  print $message;
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateWill print "\xF1\xD1\xF1\xCC is the symbol of perl.\n",
*0Sstevel@tonic-gatenot "\x{99F1}\x{99DD} is the symbol of perl.\n".
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateYou can override this by giving extra arguments; see below.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=head2 Implicit upgrading for byte strings
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateBy default, if strings operating under byte semantics and strings
*0Sstevel@tonic-gatewith Unicode character data are concatenated, the new string will
*0Sstevel@tonic-gatebe created by decoding the byte strings as I<ISO 8859-1 (Latin-1)>.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateThe B<encoding> pragma changes this to use the specified encoding
*0Sstevel@tonic-gateinstead.  For example:
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate    use encoding 'utf8';
*0Sstevel@tonic-gate    my $string = chr(20000); # a Unicode string
*0Sstevel@tonic-gate    utf8::encode($string);   # now it's a UTF-8 encoded byte string
*0Sstevel@tonic-gate    # concatenate with another Unicode string
*0Sstevel@tonic-gate    print length($string . chr(20000));
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateWill print C<2>, because C<$string> is upgraded as UTF-8.  Without
*0Sstevel@tonic-gateC<use encoding 'utf8';>, it will print C<4> instead, since C<$string>
*0Sstevel@tonic-gateis three octets when interpreted as Latin-1.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=head1 FEATURES THAT REQUIRE 5.8.1
*0Sstevel@tonic-gate
Some of the features offered by this pragma require perl 5.8.1.  Most
of these were done by Inaba Hiroto.  Any other features and changes
are good for 5.8.0.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=over
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=item "NON-EUC" doublebyte encodings
*0Sstevel@tonic-gate
Because perl needs to parse the script before applying this pragma, such
encodings as Shift_JIS and Big-5 that may contain '\' (BACKSLASH;
\x5c) in the second byte fail because the second byte may
accidentally escape the quoting character that follows.  Perl 5.8.1
or later fixes this problem.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=item tr//
*0Sstevel@tonic-gate
C<tr//> was overlooked by Perl 5 porters when they released perl 5.8.0.
See the section below for details.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=item DATA pseudo-filehandle
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateAnother feature that was overlooked was C<DATA>.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=back
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=head1 USAGE
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=over 4
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=item use encoding [I<ENCNAME>] ;
*0Sstevel@tonic-gate
Sets the script encoding to I<ENCNAME>.  And unless ${^UNICODE}
exists and is non-zero, PerlIO layers of STDIN and STDOUT are set to
":encoding(I<ENCNAME>)".
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateNote that STDERR WILL NOT be changed.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateAlso note that non-STD file handles remain unaffected.  Use C<use
*0Sstevel@tonic-gateopen> or C<binmode> to change layers of those.
*0Sstevel@tonic-gate
If no encoding is specified, the environment variable C<PERL_ENCODING>
is consulted.  If no encoding can be found, the error C<Unknown encoding
'I<ENCNAME>'> will be thrown.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=item use encoding I<ENCNAME> [ STDIN =E<gt> I<ENCNAME_IN> ...] ;
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateYou can also individually set encodings of STDIN and STDOUT via the
*0Sstevel@tonic-gateC<< STDIN => I<ENCNAME> >> form.  In this case, you cannot omit the
*0Sstevel@tonic-gatefirst I<ENCNAME>.  C<< STDIN => undef >> turns the IO transcoding
*0Sstevel@tonic-gatecompletely off.
*0Sstevel@tonic-gate
When ${^UNICODE} exists and is non-zero, these options will be completely
ignored.  ${^UNICODE} is a variable introduced in perl 5.8.1.  See
L<perlvar/"${^UNICODE}"> and L<perlrun/"-C"> for details (perl 5.8.1
and later).
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=item use encoding I<ENCNAME> Filter=E<gt>1;
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateThis turns the encoding pragma into a source filter.  While the
*0Sstevel@tonic-gatedefault approach just decodes interpolated literals (in qq() and
*0Sstevel@tonic-gateqr()), this will apply a source filter to the entire source code.  See
*0Sstevel@tonic-gateL</"The Filter Option"> below for details.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=item no encoding;
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateUnsets the script encoding. The layers of STDIN, STDOUT are
*0Sstevel@tonic-gatereset to ":raw" (the default unprocessed raw stream of bytes).
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=back
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=head1 The Filter Option
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateThe magic of C<use encoding> is not applied to the names of
*0Sstevel@tonic-gateidentifiers.  In order to make C<${"\x{4eba}"}++> ($human++, where human
*0Sstevel@tonic-gateis a single Han ideograph) work, you still need to write your script
*0Sstevel@tonic-gatein UTF-8 -- or use a source filter.  That's what 'Filter=>1' does.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateWhat does this mean?  Your source code behaves as if it is written in
*0Sstevel@tonic-gateUTF-8 with 'use utf8' in effect.  So even if your editor only supports
*0Sstevel@tonic-gateShift_JIS, for example, you can still try examples in Chapter 15 of
*0Sstevel@tonic-gateC<Programming Perl, 3rd Ed.>.  For instance, you can use UTF-8
*0Sstevel@tonic-gateidentifiers.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateThis option is significantly slower and (as of this writing) non-ASCII
*0Sstevel@tonic-gateidentifiers are not very stable WITHOUT this option and with the
*0Sstevel@tonic-gatesource code written in UTF-8.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=head2 Filter-related changes at Encode version 1.87
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=over
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=item *
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateThe Filter option now sets STDIN and STDOUT like non-filter options.
*0Sstevel@tonic-gateAnd C<< STDIN=>I<ENCODING> >> and C<< STDOUT=>I<ENCODING> >> work like
*0Sstevel@tonic-gatenon-filter version.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=item *
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateC<use utf8> is implicitly declared so you no longer have to C<use
*0Sstevel@tonic-gateutf8> to C<${"\x{4eba}"}++>.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=back
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=head1 CAVEATS
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=head2 NOT SCOPED
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateThe pragma is a per script, not a per block lexical.  Only the last
*0Sstevel@tonic-gateC<use encoding> or C<no encoding> matters, and it affects
B<the whole script>.  However, the C<no encoding> pragma is supported and
*0Sstevel@tonic-gateB<use encoding> can appear as many times as you want in a given script.
*0Sstevel@tonic-gateThe multiple use of this pragma is discouraged.
*0Sstevel@tonic-gate
For the same reason, the use of this pragma inside modules is also
discouraged (though not as strongly discouraged as the case above).
See below.
*0Sstevel@tonic-gate
If you still have to write a module with this pragma, be very careful
of the load order.  See the code below:
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate  # called module
*0Sstevel@tonic-gate  package Module_IN_BAR;
*0Sstevel@tonic-gate  use encoding "bar";
*0Sstevel@tonic-gate  # stuff in "bar" encoding here
*0Sstevel@tonic-gate  1;
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate  # caller script
  use encoding "foo";
*0Sstevel@tonic-gate  use Module_IN_BAR;
*0Sstevel@tonic-gate  # surprise! use encoding "bar" is in effect.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateThe best way to avoid this oddity is to use this pragma RIGHT AFTER
*0Sstevel@tonic-gateother modules are loaded.  i.e.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate  use Module_IN_BAR;
*0Sstevel@tonic-gate  use encoding "foo";
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=head2 DO NOT MIX MULTIPLE ENCODINGS
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateNotice that only literals (string or regular expression) having only
*0Sstevel@tonic-gatelegacy code points are affected: if you mix data like this
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate	\xDF\x{100}
*0Sstevel@tonic-gate
*0Sstevel@tonic-gatethe data is assumed to be in (Latin 1 and) Unicode, not in your native
*0Sstevel@tonic-gateencoding.  In other words, this will match in "greek":
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate	"\xDF" =~ /\x{3af}/
*0Sstevel@tonic-gate
*0Sstevel@tonic-gatebut this will not
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate	"\xDF\x{100}" =~ /\x{3af}\x{100}/
*0Sstevel@tonic-gate
*0Sstevel@tonic-gatesince the C<\xDF> (ISO 8859-7 GREEK SMALL LETTER IOTA WITH TONOS) on
*0Sstevel@tonic-gatethe left will B<not> be upgraded to C<\x{3af}> (Unicode GREEK SMALL
*0Sstevel@tonic-gateLETTER IOTA WITH TONOS) because of the C<\x{100}> on the left.  You
*0Sstevel@tonic-gateshould not be mixing your legacy data and Unicode in the same string.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateThis pragma also affects encoding of the 0x80..0xFF code point range:
*0Sstevel@tonic-gatenormally characters in that range are left as eight-bit bytes (unless
*0Sstevel@tonic-gatethey are combined with characters with code points 0x100 or larger,
*0Sstevel@tonic-gatein which case all characters need to become UTF-8 encoded), but if
*0Sstevel@tonic-gatethe C<encoding> pragma is present, even the 0x80..0xFF range always
*0Sstevel@tonic-gategets UTF-8 encoded.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateAfter all, the best thing about this pragma is that you don't have to
*0Sstevel@tonic-gateresort to \x{....} just to spell your name in a native encoding.
*0Sstevel@tonic-gateSo feel free to put your strings in your encoding in quotes and
*0Sstevel@tonic-gateregexes.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=head2 tr/// with ranges
*0Sstevel@tonic-gate
The B<encoding> pragma works by decoding string literals in
C<q//>, C<qq//>, C<qr//>, C<qw//>, C<qx//> and so forth.  In perl 5.8.0, this
does not apply to C<tr///>.  Therefore,
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate  use encoding 'euc-jp';
*0Sstevel@tonic-gate  #....
*0Sstevel@tonic-gate  $kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/;
*0Sstevel@tonic-gate  #           -------- -------- -------- --------
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateDoes not work as
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate  $kana =~ tr/\x{3041}-\x{3093}/\x{30a1}-\x{30f3}/;
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=over
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=item Legend of characters above
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate  utf8     euc-jp   charnames::viacode()
*0Sstevel@tonic-gate  -----------------------------------------
*0Sstevel@tonic-gate  \x{3041} \xA4\xA1 HIRAGANA LETTER SMALL A
*0Sstevel@tonic-gate  \x{3093} \xA4\xF3 HIRAGANA LETTER N
*0Sstevel@tonic-gate  \x{30a1} \xA5\xA1 KATAKANA LETTER SMALL A
*0Sstevel@tonic-gate  \x{30f3} \xA5\xF3 KATAKANA LETTER N
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=back
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateThis counterintuitive behavior has been fixed in perl 5.8.1.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=head3 workaround to tr///;
*0Sstevel@tonic-gate
In perl 5.8.0, you can work around this as follows:
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate  use encoding 'euc-jp';
*0Sstevel@tonic-gate  #  ....
*0Sstevel@tonic-gate  eval qq{ \$kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/ };
*0Sstevel@tonic-gate
Note the C<tr//> expression is surrounded by C<qq{}>.  The idea behind
it is the same as the classic idiom that makes C<tr///> 'interpolate'.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate   tr/$from/$to/;            # wrong!
*0Sstevel@tonic-gate   eval qq{ tr/$from/$to/ }; # workaround.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateNevertheless, in case of B<encoding> pragma even C<q//> is affected so
*0Sstevel@tonic-gateC<tr///> not being decoded was obviously against the will of Perl5
*0Sstevel@tonic-gatePorters so it has been fixed in Perl 5.8.1 or later.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=head1 EXAMPLE - Greekperl
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate    use encoding "iso 8859-7";
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate    # \xDF in ISO 8859-7 (Greek) is \x{3af} in Unicode.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate    $a = "\xDF";
*0Sstevel@tonic-gate    $b = "\x{100}";
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate    printf "%#x\n", ord($a); # will print 0x3af, not 0xdf
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate    $c = $a . $b;
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate    # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}".
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate    # chr() is affected, and ...
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate    print "mega\n"  if ord(chr(0xdf)) == 0x3af;
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate    # ... ord() is affected by the encoding pragma ...
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate    print "tera\n" if ord(pack("C", 0xdf)) == 0x3af;
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate    # ... as are eq and cmp ...
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate    print "peta\n" if "\x{3af}" eq  pack("C", 0xdf);
    print "exa\n"  if ("\x{3af}" cmp pack("C", 0xdf)) == 0;
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate    # ... but pack/unpack C are not affected, in case you still
*0Sstevel@tonic-gate    # want to go back to your native encoding
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate    print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=head1 KNOWN PROBLEMS
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=over
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=item literals in regex that are longer than 127 bytes
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateFor native multibyte encodings (either fixed or variable length),
*0Sstevel@tonic-gatethe current implementation of the regular expressions may introduce
*0Sstevel@tonic-gaterecoding errors for regular expression literals longer than 127 bytes.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=item EBCDIC
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateThe encoding pragma is not supported on EBCDIC platforms.
*0Sstevel@tonic-gate(Porters who are willing and able to remove this limitation are
*0Sstevel@tonic-gatewelcome.)
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=item format
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateThis pragma doesn't work well with format because PerlIO does not
*0Sstevel@tonic-gateget along very well with it.  When format contains non-ascii
*0Sstevel@tonic-gatecharacters it prints funny or gets "wide character warnings".
*0Sstevel@tonic-gateTo understand it, try the code below.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate  # Save this one in utf8
*0Sstevel@tonic-gate  # replace *non-ascii* with a non-ascii string
*0Sstevel@tonic-gate  my $camel;
*0Sstevel@tonic-gate  format STDOUT =
*0Sstevel@tonic-gate  *non-ascii*@>>>>>>>
*0Sstevel@tonic-gate  $camel
*0Sstevel@tonic-gate  .
*0Sstevel@tonic-gate  $camel = "*non-ascii*";
*0Sstevel@tonic-gate  binmode(STDOUT=>':encoding(utf8)'); # bang!
*0Sstevel@tonic-gate  write;              # funny
*0Sstevel@tonic-gate  print $camel, "\n"; # fine
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateWithout binmode this happens to work but without binmode, print()
*0Sstevel@tonic-gatefails instead of write().
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateAt any rate, the very use of format is questionable when it comes to
*0Sstevel@tonic-gateunicode characters since you have to consider such things as character
*0Sstevel@tonic-gatewidth (i.e. double-width for ideographs) and directions (i.e. BIDI for
*0Sstevel@tonic-gateArabic and Hebrew).
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=back
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=head1 HISTORY
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateThis pragma first appeared in Perl 5.8.0.  For features that require
*0Sstevel@tonic-gate5.8.1 and better, see above.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=head1 SEE ALSO
*0Sstevel@tonic-gate
L<perlunicode>, L<Encode>, L<open>, L<Filter::Util::Call>
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateCh. 15 of C<Programming Perl (3rd Edition)>
*0Sstevel@tonic-gateby Larry Wall, Tom Christiansen, Jon Orwant;
*0Sstevel@tonic-gateO'Reilly & Associates; ISBN 0-596-00027-8
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=cut