xref: /onnv-gate/usr/src/cmd/perl/5.8.4/distrib/ext/Encode/encoding.pm (revision 0:68f95e015346)
1*0Sstevel@tonic-gate# $Id: encoding.pm,v 1.48 2003/12/29 02:47:16 dankogai Exp dankogai $
2*0Sstevel@tonic-gatepackage encoding;
3*0Sstevel@tonic-gateour $VERSION = do { my @r = (q$Revision: 1.48 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
4*0Sstevel@tonic-gate
5*0Sstevel@tonic-gateuse Encode;
6*0Sstevel@tonic-gateuse strict;
# Compile-time debugging switch.  The empty prototype and constant body
# keep this a constant sub, so "DEBUG and warn ..." statements cost nothing
# while it is 0.  Change the 0 to a true value to enable the warn() traces.
sub DEBUG () {
    0
}
8*0Sstevel@tonic-gate
# Refuse to load at compile time on EBCDIC platforms, detected by "A"
# having the code point 193.
BEGIN {
    my $is_ebcdic = ( ord("A") == 193 );
    if ($is_ebcdic) {
        require Carp;
        Carp::croak("encoding pragma does not support EBCDIC platforms");
    }
}
15*0Sstevel@tonic-gate
# $HAS_PERLIO is true only when PerlIO::encoding can be loaded and reports
# version 0.02 or later; it stays 0 when the require fails.
our $HAS_PERLIO = 0;
eval { require PerlIO::encoding };
$HAS_PERLIO = ( PerlIO::encoding->VERSION >= 0.02 ) unless $@;
21*0Sstevel@tonic-gate
# _exception($name) - tell import() whether ${^ENCODING} must be left unset
# for the encoding called $name.
#
# Returns 1 only when all three conditions hold:
#   * the running perl is not newer than 5.008 (5.8.1 and later never
#     qualify),
#   * $name is one of the UTF/UCS encodings listed below, and
#   * $Config{perl_patchlevel} is false (a maintperl build does not qualify).
# Returns 0 in every other case.
sub _exception {
    my ($enc_name) = @_;

    # 5.8.1 or higher: never an exception.
    return 0 if $] > 5.008;

    my %is_utf;
    @is_utf{ qw(utf8 UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE
                UTF-32 UTF-32BE UTF-32LE) } = (1) x 9;
    return 0 unless $is_utf{$enc_name};

    # Config is loaded lazily; it is only needed on this rare path.
    require Config;
    Config->import();
    our %Config;
    return $Config{perl_patchlevel} ? 0 : 1;
}
32*0Sstevel@tonic-gate
# import - implements "use encoding NAME [, KEY => VALUE ...]".
#
# Arguments:
#   $class - the package name (unused).
#   $name  - encoding name; when false, $ENV{PERL_ENCODING} is used instead.
#   %arg   - options.  The keys read here are:
#              Filter - true: install a source filter that decodes the
#                       script, instead of setting ${^ENCODING};
#              STDIN / STDOUT - encoding for that handle.  A key that is
#                       present with a false value leaves the handle alone;
#                       an absent key means "use NAME".
#
# Croaks with "Unknown encoding '...'" when Encode cannot resolve NAME, with
# "Unknown encoding for STDIN|STDOUT, '...'" for an unresolvable per-handle
# encoding, and with the eval error if setting a layer dies.
# Returns 1 otherwise.
sub import {
    my $class = shift;
    my $name  = shift;
    my %arg = @_;
    $name ||= $ENV{PERL_ENCODING};
    my $enc = find_encoding($name);
    unless (defined $enc) {
        require Carp;
        Carp::croak("Unknown encoding '$name'");
    }
    $name = $enc->name; # canonize
    unless ($arg{Filter}) {
        DEBUG and warn "_exception($name) = ", _exception($name);
        _exception($name) or ${^ENCODING} = $enc;
        # Without a usable PerlIO::encoding there are no layers to set.
        $HAS_PERLIO or return 1;
    }
    else {
        defined(${^ENCODING}) and undef ${^ENCODING};
        # implicitly 'use utf8'
        require utf8; # to fetch $utf8::hint_bits;
        $^H |= $utf8::hint_bits;
        # NOTE(review): a failure to load or install the filter is swallowed
        # here, which looks like deliberate best-effort -- confirm.
        eval {
            require Filter::Util::Call ;
            Filter::Util::Call->import ;
            filter_add(sub{
                my $status = filter_read();
                if ($status > 0){
                    $_ = $enc->decode($_, 1);
                    DEBUG and warn $_;
                }
                $status ;
            });
        };
        # Only claim success when the eval above did not die.
        DEBUG and $@ eq '' and warn "Filter installed";
    }
    # A non-zero ${^UNICODE} takes precedence over the handle options.
    defined ${^UNICODE} and ${^UNICODE} != 0 and return 1;
    for my $h (qw(STDIN STDOUT)){
        my $layers;
        if ($arg{$h}){
            unless (defined find_encoding($arg{$h})) {
                require Carp;
                Carp::croak("Unknown encoding for $h, '$arg{$h}'");
            }
            $layers = ":raw :encoding($arg{$h})";
        }
        elsif (!exists $arg{$h}){
            $layers = ":raw :encoding($name)";
        }
        # "$h => undef": nothing to set.  Skipping here also keeps a stale
        # $@ from an earlier eval from being reported as a failure for $h.
        defined $layers or next;

        eval {
            no warnings 'uninitialized';
            binmode($h, $layers);
        };
        if ($@){
            require Carp;
            Carp::croak($@);
        }
    }
    return 1; # I doubt if we need it, though
}
89*0Sstevel@tonic-gate
# unimport - implements "no encoding;".
#
# Clears ${^ENCODING}, resets STDIN and STDOUT with binmode (passing the
# ":raw" layer when $HAS_PERLIO is true, no layer otherwise) and, when
# Filter::Util::Call has been loaded, attempts to remove the source filter.
sub unimport {
    no warnings;
    undef ${^ENCODING};
    for my $fh (\*STDIN, \*STDOUT) {
        if ($HAS_PERLIO) {
            binmode($fh, ":raw");
        }
        else {
            binmode($fh);
        }
    }
    # Any failure of filter_del() is absorbed by the eval.
    eval { filter_del() } if $INC{"Filter/Util/Call.pm"};
}
104*0Sstevel@tonic-gate
105*0Sstevel@tonic-gate1;
106*0Sstevel@tonic-gate__END__
107*0Sstevel@tonic-gate
108*0Sstevel@tonic-gate=pod
109*0Sstevel@tonic-gate
110*0Sstevel@tonic-gate=head1 NAME
111*0Sstevel@tonic-gate
112*0Sstevel@tonic-gateencoding - allows you to write your script in non-ascii or non-utf8
113*0Sstevel@tonic-gate
114*0Sstevel@tonic-gate=head1 SYNOPSIS
115*0Sstevel@tonic-gate
116*0Sstevel@tonic-gate  use encoding "greek";  # Perl like Greek to you?
117*0Sstevel@tonic-gate  use encoding "euc-jp"; # Jperl!
118*0Sstevel@tonic-gate
119*0Sstevel@tonic-gate  # or you can even do this if your shell supports your native encoding
120*0Sstevel@tonic-gate
121*0Sstevel@tonic-gate  perl -Mencoding=latin2 -e '...' # Feeling centrally European?
122*0Sstevel@tonic-gate  perl -Mencoding=euc-kr -e '...' # Or Korean?
123*0Sstevel@tonic-gate
124*0Sstevel@tonic-gate  # more control
125*0Sstevel@tonic-gate
126*0Sstevel@tonic-gate  # A simple euc-cn => utf-8 converter
127*0Sstevel@tonic-gate  use encoding "euc-cn", STDOUT => "utf8";  while(<>){print};
128*0Sstevel@tonic-gate
129*0Sstevel@tonic-gate  # "no encoding;" supported (but not scoped!)
130*0Sstevel@tonic-gate  no encoding;
131*0Sstevel@tonic-gate
132*0Sstevel@tonic-gate  # an alternate way, Filter
133*0Sstevel@tonic-gate  use encoding "euc-jp", Filter=>1;
134*0Sstevel@tonic-gate  # now you can use kanji identifiers -- in euc-jp!
135*0Sstevel@tonic-gate
136*0Sstevel@tonic-gate=head1 ABSTRACT
137*0Sstevel@tonic-gate
138*0Sstevel@tonic-gateLet's start with a bit of history: Perl 5.6.0 introduced Unicode
139*0Sstevel@tonic-gatesupport.  You could apply C<substr()> and regexes even to complex CJK
140*0Sstevel@tonic-gatecharacters -- so long as the script was written in UTF-8.  But back
141*0Sstevel@tonic-gatethen, text editors that supported UTF-8 were still rare and many users
142*0Sstevel@tonic-gateinstead chose to write scripts in legacy encodings, giving up a whole
143*0Sstevel@tonic-gatenew feature of Perl 5.6.
144*0Sstevel@tonic-gate
145*0Sstevel@tonic-gateRewind to the future: starting from perl 5.8.0 with the B<encoding>
146*0Sstevel@tonic-gatepragma, you can write your script in any encoding you like (so long
147*0Sstevel@tonic-gateas the C<Encode> module supports it) and still enjoy Unicode support.
148*0Sstevel@tonic-gateThis pragma achieves that by doing the following:
149*0Sstevel@tonic-gate
150*0Sstevel@tonic-gate=over
151*0Sstevel@tonic-gate
152*0Sstevel@tonic-gate=item *
153*0Sstevel@tonic-gate
Internally converts all literals (C<q//,qq//,qr//,qw//, qx//>) from
the encoding specified to utf8.  In Perl 5.8.1 and later, literals in
C<tr///> and C<DATA> pseudo-filehandle are also converted.
157*0Sstevel@tonic-gate
158*0Sstevel@tonic-gate=item *
159*0Sstevel@tonic-gate
160*0Sstevel@tonic-gateChanging PerlIO layers of C<STDIN> and C<STDOUT> to the encoding
161*0Sstevel@tonic-gate specified.
162*0Sstevel@tonic-gate
163*0Sstevel@tonic-gate=back
164*0Sstevel@tonic-gate
165*0Sstevel@tonic-gate=head2 Literal Conversions
166*0Sstevel@tonic-gate
167*0Sstevel@tonic-gateYou can write code in EUC-JP as follows:
168*0Sstevel@tonic-gate
169*0Sstevel@tonic-gate  my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
170*0Sstevel@tonic-gate               #<-char-><-char->   # 4 octets
171*0Sstevel@tonic-gate  s/\bCamel\b/$Rakuda/;
172*0Sstevel@tonic-gate
173*0Sstevel@tonic-gateAnd with C<use encoding "euc-jp"> in effect, it is the same thing as
174*0Sstevel@tonic-gatethe code in UTF-8:
175*0Sstevel@tonic-gate
176*0Sstevel@tonic-gate  my $Rakuda = "\x{99F1}\x{99DD}"; # two Unicode Characters
177*0Sstevel@tonic-gate  s/\bCamel\b/$Rakuda/;
178*0Sstevel@tonic-gate
179*0Sstevel@tonic-gate=head2 PerlIO layers for C<STD(IN|OUT)>
180*0Sstevel@tonic-gate
181*0Sstevel@tonic-gateThe B<encoding> pragma also modifies the filehandle layers of
182*0Sstevel@tonic-gateSTDIN and STDOUT to the specified encoding.  Therefore,
183*0Sstevel@tonic-gate
184*0Sstevel@tonic-gate  use encoding "euc-jp";
185*0Sstevel@tonic-gate  my $message = "Camel is the symbol of perl.\n";
186*0Sstevel@tonic-gate  my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
187*0Sstevel@tonic-gate  $message =~ s/\bCamel\b/$Rakuda/;
188*0Sstevel@tonic-gate  print $message;
189*0Sstevel@tonic-gate
190*0Sstevel@tonic-gateWill print "\xF1\xD1\xF1\xCC is the symbol of perl.\n",
191*0Sstevel@tonic-gatenot "\x{99F1}\x{99DD} is the symbol of perl.\n".
192*0Sstevel@tonic-gate
193*0Sstevel@tonic-gateYou can override this by giving extra arguments; see below.
194*0Sstevel@tonic-gate
195*0Sstevel@tonic-gate=head2 Implicit upgrading for byte strings
196*0Sstevel@tonic-gate
197*0Sstevel@tonic-gateBy default, if strings operating under byte semantics and strings
198*0Sstevel@tonic-gatewith Unicode character data are concatenated, the new string will
199*0Sstevel@tonic-gatebe created by decoding the byte strings as I<ISO 8859-1 (Latin-1)>.
200*0Sstevel@tonic-gate
201*0Sstevel@tonic-gateThe B<encoding> pragma changes this to use the specified encoding
202*0Sstevel@tonic-gateinstead.  For example:
203*0Sstevel@tonic-gate
204*0Sstevel@tonic-gate    use encoding 'utf8';
205*0Sstevel@tonic-gate    my $string = chr(20000); # a Unicode string
206*0Sstevel@tonic-gate    utf8::encode($string);   # now it's a UTF-8 encoded byte string
207*0Sstevel@tonic-gate    # concatenate with another Unicode string
208*0Sstevel@tonic-gate    print length($string . chr(20000));
209*0Sstevel@tonic-gate
210*0Sstevel@tonic-gateWill print C<2>, because C<$string> is upgraded as UTF-8.  Without
211*0Sstevel@tonic-gateC<use encoding 'utf8';>, it will print C<4> instead, since C<$string>
212*0Sstevel@tonic-gateis three octets when interpreted as Latin-1.
213*0Sstevel@tonic-gate
214*0Sstevel@tonic-gate=head1 FEATURES THAT REQUIRE 5.8.1
215*0Sstevel@tonic-gate
Some of the features offered by this pragma require perl 5.8.1.  Most
of these are done by Inaba Hiroto.  Any other features and changes
are good for 5.8.0.
219*0Sstevel@tonic-gate
220*0Sstevel@tonic-gate=over
221*0Sstevel@tonic-gate
222*0Sstevel@tonic-gate=item "NON-EUC" doublebyte encodings
223*0Sstevel@tonic-gate
Because perl needs to parse the script before applying this pragma, such
encodings as Shift_JIS and Big-5 that may contain '\' (BACKSLASH;
\x5c) in the second byte fail because the second byte may
accidentally escape the quoting character that follows.  Perl 5.8.1
or later fixes this problem.
229*0Sstevel@tonic-gate
230*0Sstevel@tonic-gate=item tr//
231*0Sstevel@tonic-gate
C<tr//> was overlooked by Perl 5 porters when they released perl 5.8.0.
See the section below for details.
234*0Sstevel@tonic-gate
235*0Sstevel@tonic-gate=item DATA pseudo-filehandle
236*0Sstevel@tonic-gate
237*0Sstevel@tonic-gateAnother feature that was overlooked was C<DATA>.
238*0Sstevel@tonic-gate
239*0Sstevel@tonic-gate=back
240*0Sstevel@tonic-gate
241*0Sstevel@tonic-gate=head1 USAGE
242*0Sstevel@tonic-gate
243*0Sstevel@tonic-gate=over 4
244*0Sstevel@tonic-gate
245*0Sstevel@tonic-gate=item use encoding [I<ENCNAME>] ;
246*0Sstevel@tonic-gate
Sets the script encoding to I<ENCNAME>.  And unless ${^UNICODE}
exists and is non-zero, PerlIO layers of STDIN and STDOUT are set to
":encoding(I<ENCNAME>)".
250*0Sstevel@tonic-gate
251*0Sstevel@tonic-gateNote that STDERR WILL NOT be changed.
252*0Sstevel@tonic-gate
253*0Sstevel@tonic-gateAlso note that non-STD file handles remain unaffected.  Use C<use
254*0Sstevel@tonic-gateopen> or C<binmode> to change layers of those.
255*0Sstevel@tonic-gate
If no encoding is specified, the environment variable C<PERL_ENCODING>
is consulted.  If no encoding can be found, the error C<Unknown encoding
'I<ENCNAME>'> will be thrown.
259*0Sstevel@tonic-gate
260*0Sstevel@tonic-gate=item use encoding I<ENCNAME> [ STDIN =E<gt> I<ENCNAME_IN> ...] ;
261*0Sstevel@tonic-gate
262*0Sstevel@tonic-gateYou can also individually set encodings of STDIN and STDOUT via the
263*0Sstevel@tonic-gateC<< STDIN => I<ENCNAME> >> form.  In this case, you cannot omit the
264*0Sstevel@tonic-gatefirst I<ENCNAME>.  C<< STDIN => undef >> turns the IO transcoding
265*0Sstevel@tonic-gatecompletely off.
266*0Sstevel@tonic-gate
When ${^UNICODE} exists and is non-zero, these options will be completely
ignored.  ${^UNICODE} is a variable introduced in perl 5.8.1.  See
L<perlvar/"${^UNICODE}"> and L<perlrun/"-C"> for
details (perl 5.8.1 and later).
271*0Sstevel@tonic-gate
272*0Sstevel@tonic-gate=item use encoding I<ENCNAME> Filter=E<gt>1;
273*0Sstevel@tonic-gate
274*0Sstevel@tonic-gateThis turns the encoding pragma into a source filter.  While the
275*0Sstevel@tonic-gatedefault approach just decodes interpolated literals (in qq() and
276*0Sstevel@tonic-gateqr()), this will apply a source filter to the entire source code.  See
277*0Sstevel@tonic-gateL</"The Filter Option"> below for details.
278*0Sstevel@tonic-gate
279*0Sstevel@tonic-gate=item no encoding;
280*0Sstevel@tonic-gate
281*0Sstevel@tonic-gateUnsets the script encoding. The layers of STDIN, STDOUT are
282*0Sstevel@tonic-gatereset to ":raw" (the default unprocessed raw stream of bytes).
283*0Sstevel@tonic-gate
284*0Sstevel@tonic-gate=back
285*0Sstevel@tonic-gate
286*0Sstevel@tonic-gate=head1 The Filter Option
287*0Sstevel@tonic-gate
288*0Sstevel@tonic-gateThe magic of C<use encoding> is not applied to the names of
289*0Sstevel@tonic-gateidentifiers.  In order to make C<${"\x{4eba}"}++> ($human++, where human
290*0Sstevel@tonic-gateis a single Han ideograph) work, you still need to write your script
291*0Sstevel@tonic-gatein UTF-8 -- or use a source filter.  That's what 'Filter=>1' does.
292*0Sstevel@tonic-gate
293*0Sstevel@tonic-gateWhat does this mean?  Your source code behaves as if it is written in
294*0Sstevel@tonic-gateUTF-8 with 'use utf8' in effect.  So even if your editor only supports
295*0Sstevel@tonic-gateShift_JIS, for example, you can still try examples in Chapter 15 of
296*0Sstevel@tonic-gateC<Programming Perl, 3rd Ed.>.  For instance, you can use UTF-8
297*0Sstevel@tonic-gateidentifiers.
298*0Sstevel@tonic-gate
299*0Sstevel@tonic-gateThis option is significantly slower and (as of this writing) non-ASCII
300*0Sstevel@tonic-gateidentifiers are not very stable WITHOUT this option and with the
301*0Sstevel@tonic-gatesource code written in UTF-8.
302*0Sstevel@tonic-gate
303*0Sstevel@tonic-gate=head2 Filter-related changes at Encode version 1.87
304*0Sstevel@tonic-gate
305*0Sstevel@tonic-gate=over
306*0Sstevel@tonic-gate
307*0Sstevel@tonic-gate=item *
308*0Sstevel@tonic-gate
309*0Sstevel@tonic-gateThe Filter option now sets STDIN and STDOUT like non-filter options.
310*0Sstevel@tonic-gateAnd C<< STDIN=>I<ENCODING> >> and C<< STDOUT=>I<ENCODING> >> work like
311*0Sstevel@tonic-gatenon-filter version.
312*0Sstevel@tonic-gate
313*0Sstevel@tonic-gate=item *
314*0Sstevel@tonic-gate
315*0Sstevel@tonic-gateC<use utf8> is implicitly declared so you no longer have to C<use
316*0Sstevel@tonic-gateutf8> to C<${"\x{4eba}"}++>.
317*0Sstevel@tonic-gate
318*0Sstevel@tonic-gate=back
319*0Sstevel@tonic-gate
320*0Sstevel@tonic-gate=head1 CAVEATS
321*0Sstevel@tonic-gate
322*0Sstevel@tonic-gate=head2 NOT SCOPED
323*0Sstevel@tonic-gate
The pragma is a per script, not a per block lexical.  Only the last
C<use encoding> or C<no encoding> matters, and it affects
B<the whole script>.  However, the C<no encoding> pragma is supported and
B<use encoding> can appear as many times as you want in a given script.
The multiple use of this pragma is discouraged.
329*0Sstevel@tonic-gate
For the same reason, the use of this pragma inside modules is also
discouraged (though not as strongly discouraged as the case above.
See below).
333*0Sstevel@tonic-gate
If you still have to write a module with this pragma, be very careful
of the load order.  See the code below:
336*0Sstevel@tonic-gate
337*0Sstevel@tonic-gate  # called module
338*0Sstevel@tonic-gate  package Module_IN_BAR;
339*0Sstevel@tonic-gate  use encoding "bar";
340*0Sstevel@tonic-gate  # stuff in "bar" encoding here
341*0Sstevel@tonic-gate  1;
342*0Sstevel@tonic-gate
343*0Sstevel@tonic-gate  # caller script
  use encoding "foo";
345*0Sstevel@tonic-gate  use Module_IN_BAR;
346*0Sstevel@tonic-gate  # surprise! use encoding "bar" is in effect.
347*0Sstevel@tonic-gate
348*0Sstevel@tonic-gateThe best way to avoid this oddity is to use this pragma RIGHT AFTER
349*0Sstevel@tonic-gateother modules are loaded.  i.e.
350*0Sstevel@tonic-gate
351*0Sstevel@tonic-gate  use Module_IN_BAR;
352*0Sstevel@tonic-gate  use encoding "foo";
353*0Sstevel@tonic-gate
354*0Sstevel@tonic-gate=head2 DO NOT MIX MULTIPLE ENCODINGS
355*0Sstevel@tonic-gate
356*0Sstevel@tonic-gateNotice that only literals (string or regular expression) having only
357*0Sstevel@tonic-gatelegacy code points are affected: if you mix data like this
358*0Sstevel@tonic-gate
359*0Sstevel@tonic-gate	\xDF\x{100}
360*0Sstevel@tonic-gate
361*0Sstevel@tonic-gatethe data is assumed to be in (Latin 1 and) Unicode, not in your native
362*0Sstevel@tonic-gateencoding.  In other words, this will match in "greek":
363*0Sstevel@tonic-gate
364*0Sstevel@tonic-gate	"\xDF" =~ /\x{3af}/
365*0Sstevel@tonic-gate
366*0Sstevel@tonic-gatebut this will not
367*0Sstevel@tonic-gate
368*0Sstevel@tonic-gate	"\xDF\x{100}" =~ /\x{3af}\x{100}/
369*0Sstevel@tonic-gate
370*0Sstevel@tonic-gatesince the C<\xDF> (ISO 8859-7 GREEK SMALL LETTER IOTA WITH TONOS) on
371*0Sstevel@tonic-gatethe left will B<not> be upgraded to C<\x{3af}> (Unicode GREEK SMALL
372*0Sstevel@tonic-gateLETTER IOTA WITH TONOS) because of the C<\x{100}> on the left.  You
373*0Sstevel@tonic-gateshould not be mixing your legacy data and Unicode in the same string.
374*0Sstevel@tonic-gate
375*0Sstevel@tonic-gateThis pragma also affects encoding of the 0x80..0xFF code point range:
376*0Sstevel@tonic-gatenormally characters in that range are left as eight-bit bytes (unless
377*0Sstevel@tonic-gatethey are combined with characters with code points 0x100 or larger,
378*0Sstevel@tonic-gatein which case all characters need to become UTF-8 encoded), but if
379*0Sstevel@tonic-gatethe C<encoding> pragma is present, even the 0x80..0xFF range always
380*0Sstevel@tonic-gategets UTF-8 encoded.
381*0Sstevel@tonic-gate
382*0Sstevel@tonic-gateAfter all, the best thing about this pragma is that you don't have to
383*0Sstevel@tonic-gateresort to \x{....} just to spell your name in a native encoding.
384*0Sstevel@tonic-gateSo feel free to put your strings in your encoding in quotes and
385*0Sstevel@tonic-gateregexes.
386*0Sstevel@tonic-gate
387*0Sstevel@tonic-gate=head2 tr/// with ranges
388*0Sstevel@tonic-gate
The B<encoding> pragma works by decoding string literals in
C<q//,qq//,qr//,qw//, qx//> and so forth.  In perl 5.8.0, this
does not apply to C<tr///>.  Therefore,
392*0Sstevel@tonic-gate
393*0Sstevel@tonic-gate  use encoding 'euc-jp';
394*0Sstevel@tonic-gate  #....
395*0Sstevel@tonic-gate  $kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/;
396*0Sstevel@tonic-gate  #           -------- -------- -------- --------
397*0Sstevel@tonic-gate
398*0Sstevel@tonic-gateDoes not work as
399*0Sstevel@tonic-gate
400*0Sstevel@tonic-gate  $kana =~ tr/\x{3041}-\x{3093}/\x{30a1}-\x{30f3}/;
401*0Sstevel@tonic-gate
402*0Sstevel@tonic-gate=over
403*0Sstevel@tonic-gate
404*0Sstevel@tonic-gate=item Legend of characters above
405*0Sstevel@tonic-gate
406*0Sstevel@tonic-gate  utf8     euc-jp   charnames::viacode()
407*0Sstevel@tonic-gate  -----------------------------------------
408*0Sstevel@tonic-gate  \x{3041} \xA4\xA1 HIRAGANA LETTER SMALL A
409*0Sstevel@tonic-gate  \x{3093} \xA4\xF3 HIRAGANA LETTER N
410*0Sstevel@tonic-gate  \x{30a1} \xA5\xA1 KATAKANA LETTER SMALL A
411*0Sstevel@tonic-gate  \x{30f3} \xA5\xF3 KATAKANA LETTER N
412*0Sstevel@tonic-gate
413*0Sstevel@tonic-gate=back
414*0Sstevel@tonic-gate
415*0Sstevel@tonic-gateThis counterintuitive behavior has been fixed in perl 5.8.1.
416*0Sstevel@tonic-gate
417*0Sstevel@tonic-gate=head3 workaround to tr///;
418*0Sstevel@tonic-gate
In perl 5.8.0, you can work around this as follows:
420*0Sstevel@tonic-gate
421*0Sstevel@tonic-gate  use encoding 'euc-jp';
422*0Sstevel@tonic-gate  #  ....
423*0Sstevel@tonic-gate  eval qq{ \$kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/ };
424*0Sstevel@tonic-gate
Note the C<tr//> expression is surrounded by C<qq{}>.  The idea behind
it is the same as the classic idiom that makes C<tr///> 'interpolate'.
427*0Sstevel@tonic-gate
428*0Sstevel@tonic-gate   tr/$from/$to/;            # wrong!
429*0Sstevel@tonic-gate   eval qq{ tr/$from/$to/ }; # workaround.
430*0Sstevel@tonic-gate
431*0Sstevel@tonic-gateNevertheless, in case of B<encoding> pragma even C<q//> is affected so
432*0Sstevel@tonic-gateC<tr///> not being decoded was obviously against the will of Perl5
433*0Sstevel@tonic-gatePorters so it has been fixed in Perl 5.8.1 or later.
434*0Sstevel@tonic-gate
435*0Sstevel@tonic-gate=head1 EXAMPLE - Greekperl
436*0Sstevel@tonic-gate
437*0Sstevel@tonic-gate    use encoding "iso 8859-7";
438*0Sstevel@tonic-gate
439*0Sstevel@tonic-gate    # \xDF in ISO 8859-7 (Greek) is \x{3af} in Unicode.
440*0Sstevel@tonic-gate
441*0Sstevel@tonic-gate    $a = "\xDF";
442*0Sstevel@tonic-gate    $b = "\x{100}";
443*0Sstevel@tonic-gate
444*0Sstevel@tonic-gate    printf "%#x\n", ord($a); # will print 0x3af, not 0xdf
445*0Sstevel@tonic-gate
446*0Sstevel@tonic-gate    $c = $a . $b;
447*0Sstevel@tonic-gate
448*0Sstevel@tonic-gate    # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}".
449*0Sstevel@tonic-gate
450*0Sstevel@tonic-gate    # chr() is affected, and ...
451*0Sstevel@tonic-gate
452*0Sstevel@tonic-gate    print "mega\n"  if ord(chr(0xdf)) == 0x3af;
453*0Sstevel@tonic-gate
454*0Sstevel@tonic-gate    # ... ord() is affected by the encoding pragma ...
455*0Sstevel@tonic-gate
456*0Sstevel@tonic-gate    print "tera\n" if ord(pack("C", 0xdf)) == 0x3af;
457*0Sstevel@tonic-gate
458*0Sstevel@tonic-gate    # ... as are eq and cmp ...
459*0Sstevel@tonic-gate
460*0Sstevel@tonic-gate    print "peta\n" if "\x{3af}" eq  pack("C", 0xdf);
    print "exa\n"  if ("\x{3af}" cmp pack("C", 0xdf)) == 0;
462*0Sstevel@tonic-gate
463*0Sstevel@tonic-gate    # ... but pack/unpack C are not affected, in case you still
464*0Sstevel@tonic-gate    # want to go back to your native encoding
465*0Sstevel@tonic-gate
466*0Sstevel@tonic-gate    print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;
467*0Sstevel@tonic-gate
468*0Sstevel@tonic-gate=head1 KNOWN PROBLEMS
469*0Sstevel@tonic-gate
470*0Sstevel@tonic-gate=over
471*0Sstevel@tonic-gate
472*0Sstevel@tonic-gate=item literals in regex that are longer than 127 bytes
473*0Sstevel@tonic-gate
474*0Sstevel@tonic-gateFor native multibyte encodings (either fixed or variable length),
475*0Sstevel@tonic-gatethe current implementation of the regular expressions may introduce
476*0Sstevel@tonic-gaterecoding errors for regular expression literals longer than 127 bytes.
477*0Sstevel@tonic-gate
478*0Sstevel@tonic-gate=item EBCDIC
479*0Sstevel@tonic-gate
480*0Sstevel@tonic-gateThe encoding pragma is not supported on EBCDIC platforms.
481*0Sstevel@tonic-gate(Porters who are willing and able to remove this limitation are
482*0Sstevel@tonic-gatewelcome.)
483*0Sstevel@tonic-gate
484*0Sstevel@tonic-gate=item format
485*0Sstevel@tonic-gate
486*0Sstevel@tonic-gateThis pragma doesn't work well with format because PerlIO does not
487*0Sstevel@tonic-gateget along very well with it.  When format contains non-ascii
488*0Sstevel@tonic-gatecharacters it prints funny or gets "wide character warnings".
489*0Sstevel@tonic-gateTo understand it, try the code below.
490*0Sstevel@tonic-gate
491*0Sstevel@tonic-gate  # Save this one in utf8
492*0Sstevel@tonic-gate  # replace *non-ascii* with a non-ascii string
493*0Sstevel@tonic-gate  my $camel;
494*0Sstevel@tonic-gate  format STDOUT =
495*0Sstevel@tonic-gate  *non-ascii*@>>>>>>>
496*0Sstevel@tonic-gate  $camel
497*0Sstevel@tonic-gate  .
498*0Sstevel@tonic-gate  $camel = "*non-ascii*";
499*0Sstevel@tonic-gate  binmode(STDOUT=>':encoding(utf8)'); # bang!
500*0Sstevel@tonic-gate  write;              # funny
501*0Sstevel@tonic-gate  print $camel, "\n"; # fine
502*0Sstevel@tonic-gate
503*0Sstevel@tonic-gateWithout binmode this happens to work but without binmode, print()
504*0Sstevel@tonic-gatefails instead of write().
505*0Sstevel@tonic-gate
506*0Sstevel@tonic-gateAt any rate, the very use of format is questionable when it comes to
507*0Sstevel@tonic-gateunicode characters since you have to consider such things as character
508*0Sstevel@tonic-gatewidth (i.e. double-width for ideographs) and directions (i.e. BIDI for
509*0Sstevel@tonic-gateArabic and Hebrew).
510*0Sstevel@tonic-gate
511*0Sstevel@tonic-gate=back
512*0Sstevel@tonic-gate
513*0Sstevel@tonic-gate=head1 HISTORY
514*0Sstevel@tonic-gate
515*0Sstevel@tonic-gateThis pragma first appeared in Perl 5.8.0.  For features that require
516*0Sstevel@tonic-gate5.8.1 and better, see above.
517*0Sstevel@tonic-gate
518*0Sstevel@tonic-gate=head1 SEE ALSO
519*0Sstevel@tonic-gate
520*0Sstevel@tonic-gateL<perlunicode>, L<Encode>, L<open>, L<Filter::Util::Call>,
521*0Sstevel@tonic-gate
522*0Sstevel@tonic-gateCh. 15 of C<Programming Perl (3rd Edition)>
523*0Sstevel@tonic-gateby Larry Wall, Tom Christiansen, Jon Orwant;
524*0Sstevel@tonic-gateO'Reilly & Associates; ISBN 0-596-00027-8
525*0Sstevel@tonic-gate
526*0Sstevel@tonic-gate=cut
527