xref: /openbsd-src/gnu/usr.bin/perl/cpan/Encode/encoding.pm (revision 50b7afb2c2c0993b0894d4e34bf857cb13ed9c80)
1# $Id: encoding.pm,v 2.11 2013/02/18 02:23:56 dankogai Exp $
2package encoding;
3our $VERSION = '2.6_01';
4
5use Encode;
6use strict;
7use warnings;
8
9use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG};
10
11BEGIN {
12    if ( ord("A") == 193 ) {
13        require Carp;
14        Carp::croak("encoding: pragma does not support EBCDIC platforms");
15    }
16}
17
18our $HAS_PERLIO = 0;
19eval { require PerlIO::encoding };
20unless ($@) {
21    $HAS_PERLIO = ( PerlIO::encoding->VERSION >= 0.02 );
22}
23
24sub _exception {
25    my $name = shift;
26    $] > 5.008 and return 0;    # 5.8.1 or higher then no
27    my %utfs = map { $_ => 1 }
28      qw(utf8 UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE
29      UTF-32 UTF-32BE UTF-32LE);
30    $utfs{$name} or return 0;    # UTFs or no
31    require Config;
32    Config->import();
33    our %Config;
34    return $Config{perl_patchlevel} ? 0 : 1    # maintperl then no
35}
36
37sub in_locale { $^H & ( $locale::hint_bits || 0 ) }
38
39sub _get_locale_encoding {
40    my $locale_encoding;
41
42    # I18N::Langinfo isn't available everywhere
43    eval {
44        require I18N::Langinfo;
45        I18N::Langinfo->import(qw(langinfo CODESET));
46        $locale_encoding = langinfo( CODESET() );
47    };
48
49    my $country_language;
50
51    no warnings 'uninitialized';
52
53    if ( (not $locale_encoding) && in_locale() ) {
54        if ( $ENV{LC_ALL} =~ /^([^.]+)\.([^.@]+)(@.*)?$/ ) {
55            ( $country_language, $locale_encoding ) = ( $1, $2 );
56        }
57        elsif ( $ENV{LANG} =~ /^([^.]+)\.([^.@]+)(@.*)?$/ ) {
58            ( $country_language, $locale_encoding ) = ( $1, $2 );
59        }
60
61        # LANGUAGE affects only LC_MESSAGES only on glibc
62    }
63    elsif ( not $locale_encoding ) {
64        if (   $ENV{LC_ALL} =~ /\butf-?8\b/i
65            || $ENV{LANG} =~ /\butf-?8\b/i )
66        {
67            $locale_encoding = 'utf8';
68        }
69
70        # Could do more heuristics based on the country and language
71        # parts of LC_ALL and LANG (the parts before the dot (if any)),
72        # since we have Locale::Country and Locale::Language available.
73        # TODO: get a database of Language -> Encoding mappings
74        # (the Estonian database at http://www.eki.ee/letter/
75        # would be excellent!) --jhi
76    }
77    if (   defined $locale_encoding
78        && lc($locale_encoding) eq 'euc'
79        && defined $country_language )
80    {
81        if ( $country_language =~ /^ja_JP|japan(?:ese)?$/i ) {
82            $locale_encoding = 'euc-jp';
83        }
84        elsif ( $country_language =~ /^ko_KR|korean?$/i ) {
85            $locale_encoding = 'euc-kr';
86        }
87        elsif ( $country_language =~ /^zh_CN|chin(?:a|ese)$/i ) {
88            $locale_encoding = 'euc-cn';
89        }
90        elsif ( $country_language =~ /^zh_TW|taiwan(?:ese)?$/i ) {
91            $locale_encoding = 'euc-tw';
92        }
93        else {
94            require Carp;
95            Carp::croak(
96                "encoding: Locale encoding '$locale_encoding' too ambiguous"
97            );
98        }
99    }
100
101    return $locale_encoding;
102}
103
104sub import {
105    if ($] >= 5.017) {
106	warnings::warnif("deprecated",
107			 "Use of the encoding pragma is deprecated")
108    }
109    my $class = shift;
110    my $name  = shift;
111    if (!$name){
112	require Carp;
113        Carp::croak("encoding: no encoding specified.");
114    }
115    if ( $name eq ':_get_locale_encoding' ) {    # used by lib/open.pm
116        my $caller = caller();
117        {
118            no strict 'refs';
119            *{"${caller}::_get_locale_encoding"} = \&_get_locale_encoding;
120        }
121        return;
122    }
123    $name = _get_locale_encoding() if $name eq ':locale';
124    my %arg = @_;
125    $name = $ENV{PERL_ENCODING} unless defined $name;
126    my $enc = find_encoding($name);
127    unless ( defined $enc ) {
128        require Carp;
129        Carp::croak("encoding: Unknown encoding '$name'");
130    }
131    $name = $enc->name;    # canonize
132    unless ( $arg{Filter} ) {
133        DEBUG and warn "_exception($name) = ", _exception($name);
134        _exception($name) or ${^ENCODING} = $enc;
135        $HAS_PERLIO or return 1;
136    }
137    else {
138        defined( ${^ENCODING} ) and undef ${^ENCODING};
139
140        # implicitly 'use utf8'
141        require utf8;      # to fetch $utf8::hint_bits;
142        $^H |= $utf8::hint_bits;
143        eval {
144            require Filter::Util::Call;
145            Filter::Util::Call->import;
146            filter_add(
147                sub {
148                    my $status = filter_read();
149                    if ( $status > 0 ) {
150                        $_ = $enc->decode( $_, 1 );
151                        DEBUG and warn $_;
152                    }
153                    $status;
154                }
155            );
156        };
157        $@ eq '' and DEBUG and warn "Filter installed";
158    }
159    defined ${^UNICODE} and ${^UNICODE} != 0 and return 1;
160    for my $h (qw(STDIN STDOUT)) {
161        if ( $arg{$h} ) {
162            unless ( defined find_encoding( $arg{$h} ) ) {
163                require Carp;
164                Carp::croak(
165                    "encoding: Unknown encoding for $h, '$arg{$h}'");
166            }
167            eval { binmode( $h, ":raw :encoding($arg{$h})" ) };
168        }
169        else {
170            unless ( exists $arg{$h} ) {
171                eval {
172                    no warnings 'uninitialized';
173                    binmode( $h, ":raw :encoding($name)" );
174                };
175            }
176        }
177        if ($@) {
178            require Carp;
179            Carp::croak($@);
180        }
181    }
182    return 1;    # I doubt if we need it, though
183}
184
185sub unimport {
186    no warnings;
187    undef ${^ENCODING};
188    if ($HAS_PERLIO) {
189        binmode( STDIN,  ":raw" );
190        binmode( STDOUT, ":raw" );
191    }
192    else {
193        binmode(STDIN);
194        binmode(STDOUT);
195    }
196    if ( $INC{"Filter/Util/Call.pm"} ) {
197        eval { filter_del() };
198    }
199}
200
2011;
202__END__
203
204=pod
205
206=head1 NAME
207
208encoding - allows you to write your script in non-ascii or non-utf8
209
210=head1 WARNING
211
212This module is deprecated under perl 5.18.  It uses a mechanism provided by
213perl that is deprecated under 5.18 and higher, and may be removed in a
214future version.
215
216=head1 SYNOPSIS
217
218  use encoding "greek";  # Perl like Greek to you?
219  use encoding "euc-jp"; # Jperl!
220
221  # or you can even do this if your shell supports your native encoding
222
223  perl -Mencoding=latin2 -e'...' # Feeling centrally European?
224  perl -Mencoding=euc-kr -e'...' # Or Korean?
225
226  # more control
227
228  # A simple euc-cn => utf-8 converter
229  use encoding "euc-cn", STDOUT => "utf8";  while(<>){print};
230
231  # "no encoding;" supported (but not scoped!)
232  no encoding;
233
234  # an alternate way, Filter
235  use encoding "euc-jp", Filter=>1;
236  # now you can use kanji identifiers -- in euc-jp!
237
238  # switch on locale -
239  # note that this probably means that unless you have a complete control
240  # over the environments the application is ever going to be run, you should
241  # NOT use the feature of encoding pragma allowing you to write your script
242  # in any recognized encoding because changing locale settings will wreck
243  # the script; you can of course still use the other features of the pragma.
244  use encoding ':locale';
245
246=head1 ABSTRACT
247
248Let's start with a bit of history: Perl 5.6.0 introduced Unicode
249support.  You could apply C<substr()> and regexes even to complex CJK
250characters -- so long as the script was written in UTF-8.  But back
251then, text editors that supported UTF-8 were still rare and many users
252instead chose to write scripts in legacy encodings, giving up a whole
253new feature of Perl 5.6.
254
255Rewind to the future: starting from perl 5.8.0 with the B<encoding>
256pragma, you can write your script in any encoding you like (so long
257as the C<Encode> module supports it) and still enjoy Unicode support.
258This pragma achieves that by doing the following:
259
260=over
261
262=item *
263
264Internally converts all literals (C<q//,qq//,qr//,qw///, qx//>) from
265the encoding specified to utf8.  In Perl 5.8.1 and later, literals in
266C<tr///> and C<DATA> pseudo-filehandle are also converted.
267
268=item *
269
270Changing PerlIO layers of C<STDIN> and C<STDOUT> to the encoding
271 specified.
272
273=back
274
275=head2 Literal Conversions
276
277You can write code in EUC-JP as follows:
278
279  my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
280               #<-char-><-char->   # 4 octets
281  s/\bCamel\b/$Rakuda/;
282
283And with C<use encoding "euc-jp"> in effect, it is the same thing as
284the code in UTF-8:
285
286  my $Rakuda = "\x{99F1}\x{99DD}"; # two Unicode Characters
287  s/\bCamel\b/$Rakuda/;
288
289=head2 PerlIO layers for C<STD(IN|OUT)>
290
291The B<encoding> pragma also modifies the filehandle layers of
292STDIN and STDOUT to the specified encoding.  Therefore,
293
294  use encoding "euc-jp";
295  my $message = "Camel is the symbol of perl.\n";
296  my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
297  $message =~ s/\bCamel\b/$Rakuda/;
298  print $message;
299
300Will print "\xF1\xD1\xF1\xCC is the symbol of perl.\n",
301not "\x{99F1}\x{99DD} is the symbol of perl.\n".
302
303You can override this by giving extra arguments; see below.
304
305=head2 Implicit upgrading for byte strings
306
307By default, if strings operating under byte semantics and strings
308with Unicode character data are concatenated, the new string will
309be created by decoding the byte strings as I<ISO 8859-1 (Latin-1)>.
310
311The B<encoding> pragma changes this to use the specified encoding
312instead.  For example:
313
314    use encoding 'utf8';
315    my $string = chr(20000); # a Unicode string
316    utf8::encode($string);   # now it's a UTF-8 encoded byte string
317    # concatenate with another Unicode string
318    print length($string . chr(20000));
319
320Will print C<2>, because C<$string> is upgraded as UTF-8.  Without
321C<use encoding 'utf8';>, it will print C<4> instead, since C<$string>
322is three octets when interpreted as Latin-1.
323
324=head2 Side effects
325
326If the C<encoding> pragma is in scope then the lengths returned are
327calculated from the length of C<$/> in Unicode characters, which is not
328always the same as the length of C<$/> in the native encoding.
329
330This pragma affects utf8::upgrade, but not utf8::downgrade.
331
332=head1 FEATURES THAT REQUIRE 5.8.1
333
334Some of the features offered by this pragma requires perl 5.8.1.  Most
335of these are done by Inaba Hiroto.  Any other features and changes
336are good for 5.8.0.
337
338=over
339
340=item "NON-EUC" doublebyte encodings
341
342Because perl needs to parse script before applying this pragma, such
343encodings as Shift_JIS and Big-5 that may contain '\' (BACKSLASH;
344\x5c) in the second byte fails because the second byte may
345accidentally escape the quoting character that follows.  Perl 5.8.1
346or later fixes this problem.
347
348=item tr//
349
350C<tr//> was overlooked by Perl 5 porters when they released perl 5.8.0
351See the section below for details.
352
353=item DATA pseudo-filehandle
354
355Another feature that was overlooked was C<DATA>.
356
357=back
358
359=head1 USAGE
360
361=over 4
362
363=item use encoding [I<ENCNAME>] ;
364
365Sets the script encoding to I<ENCNAME>.  And unless ${^UNICODE}
366exists and non-zero, PerlIO layers of STDIN and STDOUT are set to
367":encoding(I<ENCNAME>)".
368
369Note that STDERR WILL NOT be changed.
370
371Also note that non-STD file handles remain unaffected.  Use C<use
372open> or C<binmode> to change layers of those.
373
374If no encoding is specified, the environment variable L<PERL_ENCODING>
375is consulted.  If no encoding can be found, the error C<Unknown encoding
376'I<ENCNAME>'> will be thrown.
377
378=item use encoding I<ENCNAME> [ STDIN =E<gt> I<ENCNAME_IN> ...] ;
379
380You can also individually set encodings of STDIN and STDOUT via the
381C<< STDIN => I<ENCNAME> >> form.  In this case, you cannot omit the
382first I<ENCNAME>.  C<< STDIN => undef >> turns the IO transcoding
383completely off.
384
385When ${^UNICODE} exists and non-zero, these options will completely
386ignored.  ${^UNICODE} is a variable introduced in perl 5.8.1.  See
387L<perlrun> see L<perlvar/"${^UNICODE}"> and L<perlrun/"-C"> for
388details (perl 5.8.1 and later).
389
390=item use encoding I<ENCNAME> Filter=E<gt>1;
391
392This turns the encoding pragma into a source filter.  While the
393default approach just decodes interpolated literals (in qq() and
394qr()), this will apply a source filter to the entire source code.  See
395L</"The Filter Option"> below for details.
396
397=item no encoding;
398
399Unsets the script encoding. The layers of STDIN, STDOUT are
400reset to ":raw" (the default unprocessed raw stream of bytes).
401
402=back
403
404=head1 The Filter Option
405
406The magic of C<use encoding> is not applied to the names of
407identifiers.  In order to make C<${"\x{4eba}"}++> ($human++, where human
408is a single Han ideograph) work, you still need to write your script
409in UTF-8 -- or use a source filter.  That's what 'Filter=>1' does.
410
411What does this mean?  Your source code behaves as if it is written in
412UTF-8 with 'use utf8' in effect.  So even if your editor only supports
413Shift_JIS, for example, you can still try examples in Chapter 15 of
414C<Programming Perl, 3rd Ed.>.  For instance, you can use UTF-8
415identifiers.
416
417This option is significantly slower and (as of this writing) non-ASCII
418identifiers are not very stable WITHOUT this option and with the
419source code written in UTF-8.
420
421=head2 Filter-related changes at Encode version 1.87
422
423=over
424
425=item *
426
427The Filter option now sets STDIN and STDOUT like non-filter options.
428And C<< STDIN=>I<ENCODING> >> and C<< STDOUT=>I<ENCODING> >> work like
429non-filter version.
430
431=item *
432
433C<use utf8> is implicitly declared so you no longer have to C<use
434utf8> to C<${"\x{4eba}"}++>.
435
436=back
437
438=head1 CAVEATS
439
440=head2 NOT SCOPED
441
442The pragma is a per script, not a per block lexical.  Only the last
443C<use encoding> or C<no encoding> matters, and it affects
444B<the whole script>.  However, the <no encoding> pragma is supported and
445B<use encoding> can appear as many times as you want in a given script.
446The multiple use of this pragma is discouraged.
447
448By the same reason, the use this pragma inside modules is also
449discouraged (though not as strongly discouraged as the case above.
450See below).
451
452If you still have to write a module with this pragma, be very careful
453of the load order.  See the codes below;
454
455  # called module
456  package Module_IN_BAR;
457  use encoding "bar";
458  # stuff in "bar" encoding here
459  1;
460
461  # caller script
462  use encoding "foo"
463  use Module_IN_BAR;
464  # surprise! use encoding "bar" is in effect.
465
466The best way to avoid this oddity is to use this pragma RIGHT AFTER
467other modules are loaded.  i.e.
468
469  use Module_IN_BAR;
470  use encoding "foo";
471
472=head2 DO NOT MIX MULTIPLE ENCODINGS
473
474Notice that only literals (string or regular expression) having only
475legacy code points are affected: if you mix data like this
476
477    \xDF\x{100}
478
479the data is assumed to be in (Latin 1 and) Unicode, not in your native
480encoding.  In other words, this will match in "greek":
481
482    "\xDF" =~ /\x{3af}/
483
484but this will not
485
486    "\xDF\x{100}" =~ /\x{3af}\x{100}/
487
488since the C<\xDF> (ISO 8859-7 GREEK SMALL LETTER IOTA WITH TONOS) on
489the left will B<not> be upgraded to C<\x{3af}> (Unicode GREEK SMALL
490LETTER IOTA WITH TONOS) because of the C<\x{100}> on the left.  You
491should not be mixing your legacy data and Unicode in the same string.
492
493This pragma also affects encoding of the 0x80..0xFF code point range:
494normally characters in that range are left as eight-bit bytes (unless
495they are combined with characters with code points 0x100 or larger,
496in which case all characters need to become UTF-8 encoded), but if
497the C<encoding> pragma is present, even the 0x80..0xFF range always
498gets UTF-8 encoded.
499
500After all, the best thing about this pragma is that you don't have to
501resort to \x{....} just to spell your name in a native encoding.
502So feel free to put your strings in your encoding in quotes and
503regexes.
504
505=head2 tr/// with ranges
506
507The B<encoding> pragma works by decoding string literals in
508C<q//,qq//,qr//,qw///, qx//> and so forth.  In perl 5.8.0, this
509does not apply to C<tr///>.  Therefore,
510
511  use encoding 'euc-jp';
512  #....
513  $kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/;
514  #           -------- -------- -------- --------
515
516Does not work as
517
518  $kana =~ tr/\x{3041}-\x{3093}/\x{30a1}-\x{30f3}/;
519
520=over
521
522=item Legend of characters above
523
524  utf8     euc-jp   charnames::viacode()
525  -----------------------------------------
526  \x{3041} \xA4\xA1 HIRAGANA LETTER SMALL A
527  \x{3093} \xA4\xF3 HIRAGANA LETTER N
528  \x{30a1} \xA5\xA1 KATAKANA LETTER SMALL A
529  \x{30f3} \xA5\xF3 KATAKANA LETTER N
530
531=back
532
533This counterintuitive behavior has been fixed in perl 5.8.1.
534
535=head3 workaround to tr///;
536
537In perl 5.8.0, you can work around as follows;
538
539  use encoding 'euc-jp';
540  #  ....
541  eval qq{ \$kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/ };
542
543Note the C<tr//> expression is surrounded by C<qq{}>.  The idea behind
544is the same as classic idiom that makes C<tr///> 'interpolate'.
545
546   tr/$from/$to/;            # wrong!
547   eval qq{ tr/$from/$to/ }; # workaround.
548
549Nevertheless, in case of B<encoding> pragma even C<q//> is affected so
550C<tr///> not being decoded was obviously against the will of Perl5
551Porters so it has been fixed in Perl 5.8.1 or later.
552
553=head1 EXAMPLE - Greekperl
554
555    use encoding "iso 8859-7";
556
557    # \xDF in ISO 8859-7 (Greek) is \x{3af} in Unicode.
558
559    $a = "\xDF";
560    $b = "\x{100}";
561
562    printf "%#x\n", ord($a); # will print 0x3af, not 0xdf
563
564    $c = $a . $b;
565
566    # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}".
567
568    # chr() is affected, and ...
569
570    print "mega\n"  if ord(chr(0xdf)) == 0x3af;
571
572    # ... ord() is affected by the encoding pragma ...
573
574    print "tera\n" if ord(pack("C", 0xdf)) == 0x3af;
575
576    # ... as are eq and cmp ...
577
578    print "peta\n" if "\x{3af}" eq  pack("C", 0xdf);
579    print "exa\n"  if "\x{3af}" cmp pack("C", 0xdf) == 0;
580
581    # ... but pack/unpack C are not affected, in case you still
582    # want to go back to your native encoding
583
584    print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;
585
586=head1 KNOWN PROBLEMS
587
588=over
589
590=item literals in regex that are longer than 127 bytes
591
592For native multibyte encodings (either fixed or variable length),
593the current implementation of the regular expressions may introduce
594recoding errors for regular expression literals longer than 127 bytes.
595
596=item EBCDIC
597
598The encoding pragma is not supported on EBCDIC platforms.
599(Porters who are willing and able to remove this limitation are
600welcome.)
601
602=item format
603
604This pragma doesn't work well with format because PerlIO does not
605get along very well with it.  When format contains non-ascii
606characters it prints funny or gets "wide character warnings".
607To understand it, try the code below.
608
609  # Save this one in utf8
610  # replace *non-ascii* with a non-ascii string
611  my $camel;
612  format STDOUT =
613  *non-ascii*@>>>>>>>
614  $camel
615  .
616  $camel = "*non-ascii*";
617  binmode(STDOUT=>':encoding(utf8)'); # bang!
618  write;              # funny
619  print $camel, "\n"; # fine
620
621Without binmode this happens to work but without binmode, print()
622fails instead of write().
623
624At any rate, the very use of format is questionable when it comes to
625unicode characters since you have to consider such things as character
626width (i.e. double-width for ideographs) and directions (i.e. BIDI for
627Arabic and Hebrew).
628
629=item Thread safety
630
631C<use encoding ...> is not thread-safe (i.e., do not use in threaded
632applications).
633
634=back
635
636=head2 The Logic of :locale
637
638The logic of C<:locale> is as follows:
639
640=over 4
641
642=item 1.
643
644If the platform supports the langinfo(CODESET) interface, the codeset
645returned is used as the default encoding for the open pragma.
646
647=item 2.
648
649If 1. didn't work but we are under the locale pragma, the environment
650variables LC_ALL and LANG (in that order) are matched for encodings
651(the part after C<.>, if any), and if any found, that is used
652as the default encoding for the open pragma.
653
654=item 3.
655
656If 1. and 2. didn't work, the environment variables LC_ALL and LANG
657(in that order) are matched for anything looking like UTF-8, and if
658any found, C<:utf8> is used as the default encoding for the open
659pragma.
660
661=back
662
663If your locale environment variables (LC_ALL, LC_CTYPE, LANG)
664contain the strings 'UTF-8' or 'UTF8' (case-insensitive matching),
665the default encoding of your STDIN, STDOUT, and STDERR, and of
666B<any subsequent file open>, is UTF-8.
667
668=head1 HISTORY
669
670This pragma first appeared in Perl 5.8.0.  For features that require
6715.8.1 and better, see above.
672
673The C<:locale> subpragma was implemented in 2.01, or Perl 5.8.6.
674
675=head1 SEE ALSO
676
677L<perlunicode>, L<Encode>, L<open>, L<Filter::Util::Call>,
678
679Ch. 15 of C<Programming Perl (3rd Edition)>
680by Larry Wall, Tom Christiansen, Jon Orwant;
681O'Reilly & Associates; ISBN 0-596-00027-8
682
683=cut
684