xref: /openbsd-src/gnu/usr.bin/perl/cpan/Encode/encoding.pm (revision f2da64fbbbf1b03f09f390ab01267c93dfd77c4c)
1# $Id: encoding.pm,v 2.12 2013/04/26 18:30:46 dankogai Exp $
2package encoding;
3our $VERSION = sprintf "%d.%02d", q$Revision: 2.12 $ =~ /(\d+)/g;
4
5use Encode;
6use strict;
7use warnings;
8
9use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG};
10
11BEGIN {
12    if ( ord("A") == 193 ) {
13        require Carp;
14        Carp::croak("encoding: pragma does not support EBCDIC platforms");
15    }
16}
17
18our $HAS_PERLIO = 0;
19eval { require PerlIO::encoding };
20unless ($@) {
21    $HAS_PERLIO = ( PerlIO::encoding->VERSION >= 0.02 );
22}
23
24sub _exception {
25    my $name = shift;
26    $] > 5.008 and return 0;    # 5.8.1 or higher then no
27    my %utfs = map { $_ => 1 }
28      qw(utf8 UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE
29      UTF-32 UTF-32BE UTF-32LE);
30    $utfs{$name} or return 0;    # UTFs or no
31    require Config;
32    Config->import();
33    our %Config;
34    return $Config{perl_patchlevel} ? 0 : 1    # maintperl then no
35}
36
37sub in_locale { $^H & ( $locale::hint_bits || 0 ) }
38
39sub _get_locale_encoding {
40    my $locale_encoding;
41
42    # I18N::Langinfo isn't available everywhere
43    eval {
44        require I18N::Langinfo;
45        I18N::Langinfo->import(qw(langinfo CODESET));
46        $locale_encoding = langinfo( CODESET() );
47    };
48
49    my $country_language;
50
51    no warnings 'uninitialized';
52
53    if ( (not $locale_encoding) && in_locale() ) {
54        if ( $ENV{LC_ALL} =~ /^([^.]+)\.([^.@]+)(@.*)?$/ ) {
55            ( $country_language, $locale_encoding ) = ( $1, $2 );
56        }
57        elsif ( $ENV{LANG} =~ /^([^.]+)\.([^.@]+)(@.*)?$/ ) {
58            ( $country_language, $locale_encoding ) = ( $1, $2 );
59        }
60
61        # LANGUAGE affects only LC_MESSAGES only on glibc
62    }
63    elsif ( not $locale_encoding ) {
64        if (   $ENV{LC_ALL} =~ /\butf-?8\b/i
65            || $ENV{LANG} =~ /\butf-?8\b/i )
66        {
67            $locale_encoding = 'utf8';
68        }
69
70        # Could do more heuristics based on the country and language
71        # parts of LC_ALL and LANG (the parts before the dot (if any)),
72        # since we have Locale::Country and Locale::Language available.
73        # TODO: get a database of Language -> Encoding mappings
74        # (the Estonian database at http://www.eki.ee/letter/
75        # would be excellent!) --jhi
76    }
77    if (   defined $locale_encoding
78        && lc($locale_encoding) eq 'euc'
79        && defined $country_language )
80    {
81        if ( $country_language =~ /^ja_JP|japan(?:ese)?$/i ) {
82            $locale_encoding = 'euc-jp';
83        }
84        elsif ( $country_language =~ /^ko_KR|korean?$/i ) {
85            $locale_encoding = 'euc-kr';
86        }
87        elsif ( $country_language =~ /^zh_CN|chin(?:a|ese)$/i ) {
88            $locale_encoding = 'euc-cn';
89        }
90        elsif ( $country_language =~ /^zh_TW|taiwan(?:ese)?$/i ) {
91            $locale_encoding = 'euc-tw';
92        }
93        else {
94            require Carp;
95            Carp::croak(
96                "encoding: Locale encoding '$locale_encoding' too ambiguous"
97            );
98        }
99    }
100
101    return $locale_encoding;
102}
103
104sub import {
105    if ($] >= 5.017) {
106	warnings::warnif("deprecated",
107			 "Use of the encoding pragma is deprecated")
108    }
109    my $class = shift;
110    my $name  = shift;
111    if (!$name){
112	require Carp;
113        Carp::croak("encoding: no encoding specified.");
114    }
115    if ( $name eq ':_get_locale_encoding' ) {    # used by lib/open.pm
116        my $caller = caller();
117        {
118            no strict 'refs';
119            *{"${caller}::_get_locale_encoding"} = \&_get_locale_encoding;
120        }
121        return;
122    }
123    $name = _get_locale_encoding() if $name eq ':locale';
124    my %arg = @_;
125    $name = $ENV{PERL_ENCODING} unless defined $name;
126    my $enc = find_encoding($name);
127    unless ( defined $enc ) {
128        require Carp;
129        Carp::croak("encoding: Unknown encoding '$name'");
130    }
131    $name = $enc->name;    # canonize
132    unless ( $arg{Filter} ) {
133        DEBUG and warn "_exception($name) = ", _exception($name);
134        _exception($name) or ${^ENCODING} = $enc;
135        $HAS_PERLIO or return 1;
136    }
137    else {
138        defined( ${^ENCODING} ) and undef ${^ENCODING};
139
140        # implicitly 'use utf8'
141        require utf8;      # to fetch $utf8::hint_bits;
142        $^H |= $utf8::hint_bits;
143        eval {
144            require Filter::Util::Call;
145            Filter::Util::Call->import;
146            filter_add(
147                sub {
148                    my $status = filter_read();
149                    if ( $status > 0 ) {
150                        $_ = $enc->decode( $_, 1 );
151                        DEBUG and warn $_;
152                    }
153                    $status;
154                }
155            );
156        };
157        $@ eq '' and DEBUG and warn "Filter installed";
158    }
159    defined ${^UNICODE} and ${^UNICODE} != 0 and return 1;
160    for my $h (qw(STDIN STDOUT)) {
161        if ( $arg{$h} ) {
162            unless ( defined find_encoding( $arg{$h} ) ) {
163                require Carp;
164                Carp::croak(
165                    "encoding: Unknown encoding for $h, '$arg{$h}'");
166            }
167            eval { binmode( $h, ":raw :encoding($arg{$h})" ) };
168        }
169        else {
170            unless ( exists $arg{$h} ) {
171                eval {
172                    no warnings 'uninitialized';
173                    binmode( $h, ":raw :encoding($name)" );
174                };
175            }
176        }
177        if ($@) {
178            require Carp;
179            Carp::croak($@);
180        }
181    }
182    return 1;    # I doubt if we need it, though
183}
184
185sub unimport {
186    no warnings;
187    undef ${^ENCODING};
188    if ($HAS_PERLIO) {
189        binmode( STDIN,  ":raw" );
190        binmode( STDOUT, ":raw" );
191    }
192    else {
193        binmode(STDIN);
194        binmode(STDOUT);
195    }
196    if ( $INC{"Filter/Util/Call.pm"} ) {
197        eval { filter_del() };
198    }
199}
200
2011;
202__END__
203
204=pod
205
206=head1 NAME
207
208encoding - allows you to write your script in non-ascii or non-utf8
209
210=head1 WARNING
211
212This module is deprecated under perl 5.18.  It uses a mechanism provided by
213perl that is deprecated under 5.18 and higher, and may be removed in a
214future version.
215
216The easiest and the best alternative is to write your script in UTF-8
217and declear:
218
219  use utf8; # not use encoding ':utf8';
220
221See L<perluniintro> and L<utf8> for details.
222
223=head1 SYNOPSIS
224
225  use encoding "greek";  # Perl like Greek to you?
226  use encoding "euc-jp"; # Jperl!
227
228  # or you can even do this if your shell supports your native encoding
229
230  perl -Mencoding=latin2 -e'...' # Feeling centrally European?
231  perl -Mencoding=euc-kr -e'...' # Or Korean?
232
233  # more control
234
235  # A simple euc-cn => utf-8 converter
236  use encoding "euc-cn", STDOUT => "utf8";  while(<>){print};
237
238  # "no encoding;" supported (but not scoped!)
239  no encoding;
240
241  # an alternate way, Filter
242  use encoding "euc-jp", Filter=>1;
243  # now you can use kanji identifiers -- in euc-jp!
244
245  # switch on locale -
246  # note that this probably means that unless you have a complete control
247  # over the environments the application is ever going to be run, you should
248  # NOT use the feature of encoding pragma allowing you to write your script
249  # in any recognized encoding because changing locale settings will wreck
250  # the script; you can of course still use the other features of the pragma.
251  use encoding ':locale';
252
253=head1 ABSTRACT
254
255Let's start with a bit of history: Perl 5.6.0 introduced Unicode
256support.  You could apply C<substr()> and regexes even to complex CJK
257characters -- so long as the script was written in UTF-8.  But back
258then, text editors that supported UTF-8 were still rare and many users
259instead chose to write scripts in legacy encodings, giving up a whole
260new feature of Perl 5.6.
261
262Rewind to the future: starting from perl 5.8.0 with the B<encoding>
263pragma, you can write your script in any encoding you like (so long
264as the C<Encode> module supports it) and still enjoy Unicode support.
265This pragma achieves that by doing the following:
266
267=over
268
269=item *
270
271Internally converts all literals (C<q//,qq//,qr//,qw///, qx//>) from
272the encoding specified to utf8.  In Perl 5.8.1 and later, literals in
273C<tr///> and C<DATA> pseudo-filehandle are also converted.
274
275=item *
276
277Changing PerlIO layers of C<STDIN> and C<STDOUT> to the encoding
278 specified.
279
280=back
281
282=head2 Literal Conversions
283
284You can write code in EUC-JP as follows:
285
286  my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
287               #<-char-><-char->   # 4 octets
288  s/\bCamel\b/$Rakuda/;
289
290And with C<use encoding "euc-jp"> in effect, it is the same thing as
291the code in UTF-8:
292
293  my $Rakuda = "\x{99F1}\x{99DD}"; # two Unicode Characters
294  s/\bCamel\b/$Rakuda/;
295
296=head2 PerlIO layers for C<STD(IN|OUT)>
297
298The B<encoding> pragma also modifies the filehandle layers of
299STDIN and STDOUT to the specified encoding.  Therefore,
300
301  use encoding "euc-jp";
302  my $message = "Camel is the symbol of perl.\n";
303  my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
304  $message =~ s/\bCamel\b/$Rakuda/;
305  print $message;
306
307Will print "\xF1\xD1\xF1\xCC is the symbol of perl.\n",
308not "\x{99F1}\x{99DD} is the symbol of perl.\n".
309
310You can override this by giving extra arguments; see below.
311
312=head2 Implicit upgrading for byte strings
313
314By default, if strings operating under byte semantics and strings
315with Unicode character data are concatenated, the new string will
316be created by decoding the byte strings as I<ISO 8859-1 (Latin-1)>.
317
318The B<encoding> pragma changes this to use the specified encoding
319instead.  For example:
320
321    use encoding 'utf8';
322    my $string = chr(20000); # a Unicode string
323    utf8::encode($string);   # now it's a UTF-8 encoded byte string
324    # concatenate with another Unicode string
325    print length($string . chr(20000));
326
327Will print C<2>, because C<$string> is upgraded as UTF-8.  Without
328C<use encoding 'utf8';>, it will print C<4> instead, since C<$string>
329is three octets when interpreted as Latin-1.
330
331=head2 Side effects
332
333If the C<encoding> pragma is in scope then the lengths returned are
334calculated from the length of C<$/> in Unicode characters, which is not
335always the same as the length of C<$/> in the native encoding.
336
337This pragma affects utf8::upgrade, but not utf8::downgrade.
338
339=head1 FEATURES THAT REQUIRE 5.8.1
340
341Some of the features offered by this pragma requires perl 5.8.1.  Most
342of these are done by Inaba Hiroto.  Any other features and changes
343are good for 5.8.0.
344
345=over
346
347=item "NON-EUC" doublebyte encodings
348
349Because perl needs to parse script before applying this pragma, such
350encodings as Shift_JIS and Big-5 that may contain '\' (BACKSLASH;
351\x5c) in the second byte fails because the second byte may
352accidentally escape the quoting character that follows.  Perl 5.8.1
353or later fixes this problem.
354
355=item tr//
356
357C<tr//> was overlooked by Perl 5 porters when they released perl 5.8.0
358See the section below for details.
359
360=item DATA pseudo-filehandle
361
362Another feature that was overlooked was C<DATA>.
363
364=back
365
366=head1 USAGE
367
368=over 4
369
370=item use encoding [I<ENCNAME>] ;
371
372Sets the script encoding to I<ENCNAME>.  And unless ${^UNICODE}
373exists and non-zero, PerlIO layers of STDIN and STDOUT are set to
374":encoding(I<ENCNAME>)".
375
376Note that STDERR WILL NOT be changed.
377
378Also note that non-STD file handles remain unaffected.  Use C<use
379open> or C<binmode> to change layers of those.
380
381If no encoding is specified, the environment variable L<PERL_ENCODING>
382is consulted.  If no encoding can be found, the error C<Unknown encoding
383'I<ENCNAME>'> will be thrown.
384
385=item use encoding I<ENCNAME> [ STDIN =E<gt> I<ENCNAME_IN> ...] ;
386
387You can also individually set encodings of STDIN and STDOUT via the
388C<< STDIN => I<ENCNAME> >> form.  In this case, you cannot omit the
389first I<ENCNAME>.  C<< STDIN => undef >> turns the IO transcoding
390completely off.
391
392When ${^UNICODE} exists and non-zero, these options will completely
393ignored.  ${^UNICODE} is a variable introduced in perl 5.8.1.  See
394L<perlrun> see L<perlvar/"${^UNICODE}"> and L<perlrun/"-C"> for
395details (perl 5.8.1 and later).
396
397=item use encoding I<ENCNAME> Filter=E<gt>1;
398
399This turns the encoding pragma into a source filter.  While the
400default approach just decodes interpolated literals (in qq() and
401qr()), this will apply a source filter to the entire source code.  See
402L</"The Filter Option"> below for details.
403
404=item no encoding;
405
406Unsets the script encoding. The layers of STDIN, STDOUT are
407reset to ":raw" (the default unprocessed raw stream of bytes).
408
409=back
410
411=head1 The Filter Option
412
413The magic of C<use encoding> is not applied to the names of
414identifiers.  In order to make C<${"\x{4eba}"}++> ($human++, where human
415is a single Han ideograph) work, you still need to write your script
416in UTF-8 -- or use a source filter.  That's what 'Filter=>1' does.
417
418What does this mean?  Your source code behaves as if it is written in
419UTF-8 with 'use utf8' in effect.  So even if your editor only supports
420Shift_JIS, for example, you can still try examples in Chapter 15 of
421C<Programming Perl, 3rd Ed.>.  For instance, you can use UTF-8
422identifiers.
423
424This option is significantly slower and (as of this writing) non-ASCII
425identifiers are not very stable WITHOUT this option and with the
426source code written in UTF-8.
427
428=head2 Filter-related changes at Encode version 1.87
429
430=over
431
432=item *
433
434The Filter option now sets STDIN and STDOUT like non-filter options.
435And C<< STDIN=>I<ENCODING> >> and C<< STDOUT=>I<ENCODING> >> work like
436non-filter version.
437
438=item *
439
440C<use utf8> is implicitly declared so you no longer have to C<use
441utf8> to C<${"\x{4eba}"}++>.
442
443=back
444
445=head1 CAVEATS
446
447=head2 NOT SCOPED
448
449The pragma is a per script, not a per block lexical.  Only the last
450C<use encoding> or C<no encoding> matters, and it affects
451B<the whole script>.  However, the <no encoding> pragma is supported and
452B<use encoding> can appear as many times as you want in a given script.
453The multiple use of this pragma is discouraged.
454
455By the same reason, the use this pragma inside modules is also
456discouraged (though not as strongly discouraged as the case above.
457See below).
458
459If you still have to write a module with this pragma, be very careful
460of the load order.  See the codes below;
461
462  # called module
463  package Module_IN_BAR;
464  use encoding "bar";
465  # stuff in "bar" encoding here
466  1;
467
468  # caller script
469  use encoding "foo"
470  use Module_IN_BAR;
471  # surprise! use encoding "bar" is in effect.
472
473The best way to avoid this oddity is to use this pragma RIGHT AFTER
474other modules are loaded.  i.e.
475
476  use Module_IN_BAR;
477  use encoding "foo";
478
479=head2 DO NOT MIX MULTIPLE ENCODINGS
480
481Notice that only literals (string or regular expression) having only
482legacy code points are affected: if you mix data like this
483
484    \xDF\x{100}
485
486the data is assumed to be in (Latin 1 and) Unicode, not in your native
487encoding.  In other words, this will match in "greek":
488
489    "\xDF" =~ /\x{3af}/
490
491but this will not
492
493    "\xDF\x{100}" =~ /\x{3af}\x{100}/
494
495since the C<\xDF> (ISO 8859-7 GREEK SMALL LETTER IOTA WITH TONOS) on
496the left will B<not> be upgraded to C<\x{3af}> (Unicode GREEK SMALL
497LETTER IOTA WITH TONOS) because of the C<\x{100}> on the left.  You
498should not be mixing your legacy data and Unicode in the same string.
499
500This pragma also affects encoding of the 0x80..0xFF code point range:
501normally characters in that range are left as eight-bit bytes (unless
502they are combined with characters with code points 0x100 or larger,
503in which case all characters need to become UTF-8 encoded), but if
504the C<encoding> pragma is present, even the 0x80..0xFF range always
505gets UTF-8 encoded.
506
507After all, the best thing about this pragma is that you don't have to
508resort to \x{....} just to spell your name in a native encoding.
509So feel free to put your strings in your encoding in quotes and
510regexes.
511
512=head2 tr/// with ranges
513
514The B<encoding> pragma works by decoding string literals in
515C<q//,qq//,qr//,qw///, qx//> and so forth.  In perl 5.8.0, this
516does not apply to C<tr///>.  Therefore,
517
518  use encoding 'euc-jp';
519  #....
520  $kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/;
521  #           -------- -------- -------- --------
522
523Does not work as
524
525  $kana =~ tr/\x{3041}-\x{3093}/\x{30a1}-\x{30f3}/;
526
527=over
528
529=item Legend of characters above
530
531  utf8     euc-jp   charnames::viacode()
532  -----------------------------------------
533  \x{3041} \xA4\xA1 HIRAGANA LETTER SMALL A
534  \x{3093} \xA4\xF3 HIRAGANA LETTER N
535  \x{30a1} \xA5\xA1 KATAKANA LETTER SMALL A
536  \x{30f3} \xA5\xF3 KATAKANA LETTER N
537
538=back
539
540This counterintuitive behavior has been fixed in perl 5.8.1.
541
542=head3 workaround to tr///;
543
544In perl 5.8.0, you can work around as follows;
545
546  use encoding 'euc-jp';
547  #  ....
548  eval qq{ \$kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/ };
549
550Note the C<tr//> expression is surrounded by C<qq{}>.  The idea behind
551is the same as classic idiom that makes C<tr///> 'interpolate'.
552
553   tr/$from/$to/;            # wrong!
554   eval qq{ tr/$from/$to/ }; # workaround.
555
556Nevertheless, in case of B<encoding> pragma even C<q//> is affected so
557C<tr///> not being decoded was obviously against the will of Perl5
558Porters so it has been fixed in Perl 5.8.1 or later.
559
560=head1 EXAMPLE - Greekperl
561
562    use encoding "iso 8859-7";
563
564    # \xDF in ISO 8859-7 (Greek) is \x{3af} in Unicode.
565
566    $a = "\xDF";
567    $b = "\x{100}";
568
569    printf "%#x\n", ord($a); # will print 0x3af, not 0xdf
570
571    $c = $a . $b;
572
573    # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}".
574
575    # chr() is affected, and ...
576
577    print "mega\n"  if ord(chr(0xdf)) == 0x3af;
578
579    # ... ord() is affected by the encoding pragma ...
580
581    print "tera\n" if ord(pack("C", 0xdf)) == 0x3af;
582
583    # ... as are eq and cmp ...
584
585    print "peta\n" if "\x{3af}" eq  pack("C", 0xdf);
586    print "exa\n"  if "\x{3af}" cmp pack("C", 0xdf) == 0;
587
588    # ... but pack/unpack C are not affected, in case you still
589    # want to go back to your native encoding
590
591    print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;
592
593=head1 KNOWN PROBLEMS
594
595=over
596
597=item literals in regex that are longer than 127 bytes
598
599For native multibyte encodings (either fixed or variable length),
600the current implementation of the regular expressions may introduce
601recoding errors for regular expression literals longer than 127 bytes.
602
603=item EBCDIC
604
605The encoding pragma is not supported on EBCDIC platforms.
606(Porters who are willing and able to remove this limitation are
607welcome.)
608
609=item format
610
611This pragma doesn't work well with format because PerlIO does not
612get along very well with it.  When format contains non-ascii
613characters it prints funny or gets "wide character warnings".
614To understand it, try the code below.
615
616  # Save this one in utf8
617  # replace *non-ascii* with a non-ascii string
618  my $camel;
619  format STDOUT =
620  *non-ascii*@>>>>>>>
621  $camel
622  .
623  $camel = "*non-ascii*";
624  binmode(STDOUT=>':encoding(utf8)'); # bang!
625  write;              # funny
626  print $camel, "\n"; # fine
627
628Without binmode this happens to work but without binmode, print()
629fails instead of write().
630
631At any rate, the very use of format is questionable when it comes to
632unicode characters since you have to consider such things as character
633width (i.e. double-width for ideographs) and directions (i.e. BIDI for
634Arabic and Hebrew).
635
636=item Thread safety
637
638C<use encoding ...> is not thread-safe (i.e., do not use in threaded
639applications).
640
641=back
642
643=head2 The Logic of :locale
644
645The logic of C<:locale> is as follows:
646
647=over 4
648
649=item 1.
650
651If the platform supports the langinfo(CODESET) interface, the codeset
652returned is used as the default encoding for the open pragma.
653
654=item 2.
655
656If 1. didn't work but we are under the locale pragma, the environment
657variables LC_ALL and LANG (in that order) are matched for encodings
658(the part after C<.>, if any), and if any found, that is used
659as the default encoding for the open pragma.
660
661=item 3.
662
663If 1. and 2. didn't work, the environment variables LC_ALL and LANG
664(in that order) are matched for anything looking like UTF-8, and if
665any found, C<:utf8> is used as the default encoding for the open
666pragma.
667
668=back
669
670If your locale environment variables (LC_ALL, LC_CTYPE, LANG)
671contain the strings 'UTF-8' or 'UTF8' (case-insensitive matching),
672the default encoding of your STDIN, STDOUT, and STDERR, and of
673B<any subsequent file open>, is UTF-8.
674
675=head1 HISTORY
676
677This pragma first appeared in Perl 5.8.0.  For features that require
6785.8.1 and better, see above.
679
680The C<:locale> subpragma was implemented in 2.01, or Perl 5.8.6.
681
682=head1 SEE ALSO
683
684L<perlunicode>, L<Encode>, L<open>, L<Filter::Util::Call>,
685
686Ch. 15 of C<Programming Perl (3rd Edition)>
687by Larry Wall, Tom Christiansen, Jon Orwant;
688O'Reilly & Associates; ISBN 0-596-00027-8
689
690=cut
691