xref: /openbsd-src/gnu/usr.bin/perl/cpan/Encode/Encode.pm (revision c90a81c56dcebd6a1b73fe4aff9b03385b8e63b3)
1#
2# $Id: Encode.pm,v 2.80 2016/01/25 14:54:01 dankogai Exp $
3#
4package Encode;
5use strict;
6use warnings;
7our $VERSION = sprintf "%d.%02d_01", q$Revision: 2.80 $ =~ /(\d+)/g;
8use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG};
9use XSLoader ();
10XSLoader::load( __PACKAGE__, $VERSION );
11
12use Exporter 5.57 'import';
13
14# Public, encouraged API is exported by default
15
16our @EXPORT = qw(
17  decode  decode_utf8  encode  encode_utf8 str2bytes bytes2str
18  encodings  find_encoding clone_encoding
19);
20our @FB_FLAGS = qw(
21  DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC
22  PERLQQ HTMLCREF XMLCREF STOP_AT_PARTIAL
23);
24our @FB_CONSTS = qw(
25  FB_DEFAULT FB_CROAK FB_QUIET FB_WARN
26  FB_PERLQQ FB_HTMLCREF FB_XMLCREF
27);
28our @EXPORT_OK = (
29    qw(
30      _utf8_off _utf8_on define_encoding from_to is_16bit is_8bit
31      is_utf8 perlio_ok resolve_alias utf8_downgrade utf8_upgrade
32      ),
33    @FB_FLAGS, @FB_CONSTS,
34);
35
36our %EXPORT_TAGS = (
37    all          => [ @EXPORT,    @EXPORT_OK ],
38    default      => [ @EXPORT ],
39    fallbacks    => [ @FB_CONSTS ],
40    fallback_all => [ @FB_CONSTS, @FB_FLAGS ],
41);
42
43# Documentation moved after __END__ for speed - NI-S
44
45our $ON_EBCDIC = ( ord("A") == 193 );
46
47use Encode::Alias;
48
49# Make a %Encoding package variable to allow a certain amount of cheating
50our %Encoding;
51our %ExtModule;
52require Encode::Config;
# Try to load the optional Encode::ConfigLocal module.  The require is
# wrapped in eval {} and its outcome is deliberately ignored, so a missing
# or broken ConfigLocal never prevents Encode itself from loading.
#
#  See
#  https://bugzilla.redhat.com/show_bug.cgi?id=435505#c2
#  to find why sig handlers inside eval{} are disabled.
eval {
    local $SIG{__DIE__};
    local $SIG{__WARN__};
    local @INC = @INC;

    # Drop a trailing '.' so ConfigLocal is not picked up from the current
    # directory.  Checking @INC first keeps $INC[-1] from being undef (and
    # warning) when the search path is empty.
    pop @INC if @INC && $INC[-1] eq '.';
    require Encode::ConfigLocal;
};
63
# Class method: Encode->encodings([":all" | MODULE, ...]).
#
# Returns the names of the known encodings, sorted case-insensitively.
# With ":all" as the first argument the list covers both %Encoding and
# %ExtModule.  Otherwise it covers %Encoding plus every %ExtModule entry
# whose module matches one of the arguments; an argument without "::" gets
# an "Encode::" prefix.  The names Internal, Unicode and Guess are never
# listed.
sub encodings {
    my $selector = $_[1] || '';

    my %listed =
        $selector eq ":all"
      ? ( %Encoding, %ExtModule )
      : %Encoding;

    if ( $selector ne ":all" ) {
        my @wanted_modules = map { m/::/ ? $_ : "Encode::$_" } @_;
        for my $mod (@wanted_modules) {
            DEBUG and warn $mod;
            for my $enc_name ( keys %ExtModule ) {
                next unless $ExtModule{$enc_name} eq $mod;
                $listed{$enc_name} = $mod;
            }
        }
    }

    my @visible = grep { !/^(?:Internal|Unicode|Guess)$/o } keys %listed;
    return sort { lc $a cmp lc $b } @visible;
}
82
# perlio_ok(ENCODING)
#
# ENCODING is either an encoding object (any reference is used as-is) or a
# name that is resolved through find_encoding().  Returns whatever the
# object's own perlio_ok method returns.  Returns 0 when the name cannot be
# resolved or the object has no perlio_ok method.
sub perlio_ok {
    my $obj = ref( $_[0] ) ? $_[0] : find_encoding( $_[0] );

    # find_encoding() yields undef for an unknown name; calling ->can on
    # that would die, so report "not PerlIO-capable" instead.
    defined $obj or return 0;

    $obj->can("perlio_ok") and return $obj->perlio_ok();
    return 0;    # safety net
}
88
# define_encoding(OBJECT, CANONICAL_NAME [, ALIAS, ...])
#
# Stores OBJECT in %Encoding under CANONICAL_NAME.  If the lower-cased name
# differs from the name itself, the lower-cased form is registered as an
# alias, as is every additional ALIAS argument.  Returns OBJECT.
sub define_encoding {
    my ( $obj, $name, @aliases ) = @_;

    $Encoding{$name} = $obj;

    my $lc = lc($name);
    if ( $lc ne $name ) {
        define_alias( $lc => $obj );
    }

    for my $alias (@aliases) {
        define_alias( $alias, $obj );
    }
    return $obj;
}
101
# Class method: CLASS->getEncoding(NAME [, SKIP_EXTERNAL])
#
# Resolves NAME to an encoding object and returns it, or returns nothing
# (undef in scalar context) when no encoding can be found.  Lookup order:
#   1. NAME is already an object that can('renew')  -> returned as-is
#   2. %Encoding, by NAME and then by lc(NAME)
#   3. CLASS->find_alias, by NAME and then by lc(NAME)
#   4. unless SKIP_EXTERNAL is true: the module listed in %ExtModule is
#      loaded and %Encoding is consulted again
sub getEncoding {
    my ( $class, $name, $skip_external ) = @_;

    # An undefined name cannot match anything; bail out before it causes
    # "uninitialized value" warnings in every lookup below.
    defined($name) or return;

    $name =~ s/\s+//g; # https://rt.cpan.org/Ticket/Display.html?id=65796

    ref($name) && $name->can('renew') and return $name;
    exists $Encoding{$name} and return $Encoding{$name};
    my $lc = lc $name;
    exists $Encoding{$lc} and return $Encoding{$lc};

    my $oc = $class->find_alias($name);
    defined($oc) and return $oc;
    $lc ne $name and $oc = $class->find_alias($lc);
    defined($oc) and return $oc;

    unless ($skip_external) {
        if ( my $mod = $ExtModule{$name} || $ExtModule{$lc} ) {
            $mod =~ s,::,/,g;
            $mod .= '.pm';

            # A load failure is not fatal here: it just means the lookups
            # below still find nothing.
            eval { require $mod; };
            exists $Encoding{$name} and return $Encoding{$name};

            # The module may have been selected through the lower-cased
            # key, in which case the encoding is registered under it.
            exists $Encoding{$lc} and return $Encoding{$lc};
        }
    }
    return;
}
127
# find_encoding(NAME [, SKIP_EXTERNAL])
#
# Function-style front end to __PACKAGE__->getEncoding; both arguments are
# handed over unchanged and its result is returned.
sub find_encoding($;$) {
    my $name          = shift;
    my $skip_external = shift;
    return __PACKAGE__->getEncoding( $name, $skip_external );
}
132
# resolve_alias(NAME)
#
# Returns the ->name of the encoding object that find_encoding() yields for
# NAME, or nothing when NAME does not resolve.
sub resolve_alias($) {
    my $enc = find_encoding( $_[0] );
    return unless defined $enc;
    return $enc->name;
}
138
# clone_encoding(NAME)
#
# Returns a deep copy (Storable::dclone) of the encoding object found for
# NAME.  Returns nothing when NAME does not resolve to a reference or when
# Storable cannot be loaded.
sub clone_encoding($) {
    my $original = find_encoding( $_[0] );
    return unless ref $original;

    # Storable is loaded on demand only.
    eval { require Storable };
    return if $@;

    return Storable::dclone($original);
}
146
# encode(ENCODING, STRING [, CHECK])
#
# Encodes STRING with the encoding object found for ENCODING and returns
# the resulting octets.  Returns undef when STRING is undef.  Croaks when
# ENCODING is undef or unknown.  When CHECK is a true non-reference value
# without the LEAVE_SRC bit, the caller's STRING variable is overwritten
# with whatever the encoder left in the working copy.
sub encode($$;$) {
    my ( $name, $string, $check ) = @_;
    defined $string or return undef;
    $string .= '';    # force a plain string copy
    $check ||= 0;

    if ( !defined $name ) {
        require Carp;
        Carp::croak("Encoding name should not be undef");
    }
    my $enc = find_encoding($name);
    if ( !defined $enc ) {
        require Carp;
        Carp::croak("Unknown encoding '$name'");
    }

    my $octets;
    if ( ref($enc) ne 'Encode::Unicode' ) {
        $octets = $enc->encode( $string, $check );
    }
    else {
        # Encode::Unicode warnings are intercepted and raised again from
        # here so that the caller's lexical "no warnings 'utf8'" applies.
        my $warn = '';
        {
            local $SIG{__WARN__} = sub { $warn = shift };
            $octets = $enc->encode( $string, $check );
        }
        warnings::warnif( 'utf8', $warn ) if length $warn;
    }

    my $overwrite_src = $check && !ref($check) && !( $check & LEAVE_SRC() );
    $_[1] = $string if $overwrite_src;
    return $octets;
}
178*str2bytes = \&encode;
179
# decode(ENCODING, OCTETS [, CHECK])
#
# Decodes OCTETS with the encoding object found for ENCODING and returns
# the resulting string.  Returns undef when OCTETS is undef.  Croaks when
# ENCODING is undef or unknown.  When CHECK is a true non-reference value
# without the LEAVE_SRC bit, the caller's OCTETS variable is overwritten
# with whatever the decoder left in the working copy.
sub decode($$;$) {
    my ( $name, $octets, $check ) = @_;
    return undef unless defined $octets;
    $octets .= '';
    $check ||= 0;

    # Same guard as encode(): report an undef name explicitly instead of
    # letting the lookup warn and end in "Unknown encoding ''".
    unless ( defined $name ) {
        require Carp;
        Carp::croak("Encoding name should not be undef");
    }
    my $enc = find_encoding($name);
    unless ( defined $enc ) {
        require Carp;
        Carp::croak("Unknown encoding '$name'");
    }
    # For Unicode, warnings need to be caught and re-issued at this level
    # so that callers can disable utf8 warnings lexically.
    my $string;
    if ( ref($enc) eq 'Encode::Unicode' ) {
        my $warn = '';
        {
            local $SIG{__WARN__} = sub { $warn = shift };
            $string = $enc->decode( $octets, $check );
        }
        warnings::warnif('utf8', $warn) if length $warn;
    }
    else {
        $string = $enc->decode( $octets, $check );
    }
    $_[1] = $octets if $check and !ref $check and !( $check & LEAVE_SRC() );
    return $string;
}
207*bytes2str = \&decode;
208
# from_to(OCTETS, FROM_ENC, TO_ENC [, CHECK])
#
# Converts the caller's OCTETS variable in place: it is decoded with
# FROM_ENC (without CHECK) and the result is encoded with TO_ENC (with
# CHECK).  Returns the length of the converted data.  Returns undef when
# OCTETS is undef, when CHECK is true and the encoder left characters
# unconsumed, or when the converted value is undef.  Croaks on an unknown
# encoding name.
sub from_to($$$;$) {
    my ( $string, $from, $to, $check ) = @_;
    defined $string or return undef;
    $check ||= 0;

    my $src_enc = find_encoding($from);
    if ( !defined $src_enc ) {
        require Carp;
        Carp::croak("Unknown encoding '$from'");
    }
    my $dst_enc = find_encoding($to);
    if ( !defined $dst_enc ) {
        require Carp;
        Carp::croak("Unknown encoding '$to'");
    }

    my $chars = $src_enc->decode($string);
    $string = $dst_enc->encode( $chars, $check );
    $_[0]   = $string;    # write the result back into the caller's variable

    # Anything still sitting in $chars under CHECK was not converted.
    return undef if $check && length($chars);
    return defined( $_[0] ) ? length($string) : undef;
}
228
# encode_utf8(STRING)
#
# Returns a copy of STRING converted to UTF-8 octets with utf8::encode.
# The caller's variable is not modified.
sub encode_utf8($) {
    my $octets = shift;
    utf8::encode($octets);
    return $octets;
}
234
235my $utf8enc;
236
# decode_utf8(OCTETS [, CHECK])
#
# Decodes OCTETS with the 'utf8' encoding object, which is looked up once
# and cached in $utf8enc.  Returns undef when OCTETS is undef.  When CHECK
# is a true non-reference value without the LEAVE_SRC bit, the caller's
# OCTETS variable is overwritten with whatever the decoder left in the
# working copy.
sub decode_utf8($;$) {
    my ( $octets, $check ) = @_;
    defined $octets or return undef;
    $octets .= '';
    $check ||= 0;

    $utf8enc = find_encoding('utf8') unless $utf8enc;
    my $string = $utf8enc->decode( $octets, $check );

    my $overwrite_src = $check && !ref($check) && !( $check & LEAVE_SRC() );
    $_[0] = $octets if $overwrite_src;
    return $string;
}
247
248# sub decode_utf8($;$) {
249#     my ( $str, $check ) = @_;
250#     return $str if is_utf8($str);
251#     if ($check) {
252#         return decode( "utf8", $str, $check );
253#     }
254#     else {
255#         return decode( "utf8", $str );
256#         return $str;
257#     }
258# }
259
260predefine_encodings(1);
261
262#
263# This is to restore %Encoding if really needed;
264#
265
# predefine_encodings(USE_XS)
#
# Registers the built-in encoding objects in %Encode::Encoding:
#   Unicode       - Encode::UTF_EBCDIC when $ON_EBCDIC is true, otherwise
#                   Encode::Internal
#   utf8          - Encode::utf8
#   utf-8-strict  - Encode::utf8 with strict_utf8 => 1
# It also pushes Encode::Encoding onto the @ISA of each of those classes
# and of Encode::XS.
#
# USE_XS selects the Encode::utf8 implementation: true binds decode/encode
# to decode_xs/encode_xs, false installs the pure-Perl fallbacks below.
sub predefine_encodings {
    require Encode::Encoding;
    no warnings 'redefine';
    my $use_xs = shift;
    if ($ON_EBCDIC) {

        # was in Encode::UTF_EBCDIC
        package Encode::UTF_EBCDIC;
        push @Encode::UTF_EBCDIC::ISA, 'Encode::Encoding';

        # Maps each character through utf8::unicode_to_native.
        # A true $chk empties the caller's source argument.
        *decode = sub {
            my ( undef, $str, $chk ) = @_;
            my $res = '';
            for ( my $i = 0 ; $i < length($str) ; $i++ ) {
                $res .=
                  chr(
                    utf8::unicode_to_native( ord( substr( $str, $i, 1 ) ) )
                  );
            }
            $_[1] = '' if $chk;
            return $res;
        };

        # Maps each character through utf8::native_to_unicode.
        # A true $chk empties the caller's source argument.
        *encode = sub {
            my ( undef, $str, $chk ) = @_;
            my $res = '';
            for ( my $i = 0 ; $i < length($str) ; $i++ ) {
                $res .=
                  chr(
                    utf8::native_to_unicode( ord( substr( $str, $i, 1 ) ) )
                  );
            }
            $_[1] = '' if $chk;
            return $res;
        };
        $Encode::Encoding{Unicode} =
          bless { Name => "UTF_EBCDIC" } => "Encode::UTF_EBCDIC";
    }
    else {

        package Encode::Internal;
        push @Encode::Internal::ISA, 'Encode::Encoding';

        # Returns an upgraded copy of the argument; encode is the very
        # same routine.  A true $chk empties the caller's source argument.
        *decode = sub {
            my ( undef, $str, $chk ) = @_;
            utf8::upgrade($str);
            $_[1] = '' if $chk;
            return $str;
        };
        *encode = \&decode;
        $Encode::Encoding{Unicode} =
          bless { Name => "Internal" } => "Encode::Internal";
    }
    {
        # https://rt.cpan.org/Public/Bug/Display.html?id=103253
        package Encode::XS;
        push @Encode::XS::ISA, 'Encode::Encoding';
    }
    {

        # was in Encode::utf8
        package Encode::utf8;
        push @Encode::utf8::ISA, 'Encode::Encoding';

        #
        if ($use_xs) {
            Encode::DEBUG and warn __PACKAGE__, " XS on";
            *decode = \&decode_xs;
            *encode = \&encode_xs;
        }
        else {
            Encode::DEBUG and warn __PACKAGE__, " XS off";
            *decode = sub {
                my ( undef, $octets, $chk ) = @_;

                # Decode a private copy in place with the core
                # utf8::decode.  Going through Encode::decode_utf8() here
                # would call the 'utf8' encoding object's decode method,
                # which is this very routine, and recurse without end.
                my $str = $octets;
                if ( defined $str && utf8::decode($str) ) {
                    $_[1] = '' if $chk;
                    return $str;
                }
                return undef;
            };
            *encode = sub {
                my ( undef, $string, $chk ) = @_;
                my $octets = Encode::encode_utf8($string);
                $_[1] = '' if $chk;
                return $octets;
            };
        }

        # Appends to $dst the part of $src that runs from $pos through the
        # first occurrence of $trm (terminator included) and moves $pos
        # just past it, returning 1.  If $trm does not occur, the rest of
        # $src is appended, $pos is set to the length of $src and '' is
        # returned.  Positions are computed under "use bytes".
        *cat_decode = sub {    # ($obj, $dst, $src, $pos, $trm, $chk)
                               # currently ignores $chk
            my ( undef, undef, undef, $pos, $trm ) = @_;
            my ( $rdst, $rsrc, $rpos ) = \@_[ 1, 2, 3 ];
            use bytes;
            if ( ( my $npos = index( $$rsrc, $trm, $pos ) ) >= 0 ) {
                $$rdst .=
                  substr( $$rsrc, $pos, $npos - $pos + length($trm) );
                $$rpos = $npos + length($trm);
                return 1;
            }
            $$rdst .= substr( $$rsrc, $pos );
            $$rpos = length($$rsrc);
            return '';
        };
        $Encode::Encoding{utf8} =
          bless { Name => "utf8" } => "Encode::utf8";
        $Encode::Encoding{"utf-8-strict"} =
          bless { Name => "utf-8-strict", strict_utf8 => 1 }
            => "Encode::utf8";
    }
}
373
3741;
375
376__END__
377
378=head1 NAME
379
380Encode - character encodings in Perl
381
382=head1 SYNOPSIS
383
384    use Encode qw(decode encode);
385    $characters = decode('UTF-8', $octets,     Encode::FB_CROAK);
386    $octets     = encode('UTF-8', $characters, Encode::FB_CROAK);
387
388=head2 Table of Contents
389
390Encode consists of a collection of modules whose details are too extensive
391to fit in one document.  This one itself explains the top-level APIs
392and general topics at a glance.  For other topics and more details,
393see the documentation for these modules:
394
395=over 2
396
397=item L<Encode::Alias> - Alias definitions to encodings
398
399=item L<Encode::Encoding> - Encode Implementation Base Class
400
401=item L<Encode::Supported> - List of Supported Encodings
402
403=item L<Encode::CN> - Simplified Chinese Encodings
404
405=item L<Encode::JP> - Japanese Encodings
406
407=item L<Encode::KR> - Korean Encodings
408
409=item L<Encode::TW> - Traditional Chinese Encodings
410
411=back
412
413=head1 DESCRIPTION
414
415The C<Encode> module provides the interface between Perl strings
416and the rest of the system.  Perl strings are sequences of
417I<characters>.
418
419The repertoire of characters that Perl can represent is a superset of those
420defined by the Unicode Consortium. On most platforms the ordinal
421values of a character as returned by C<ord(I<S>)> is the I<Unicode
422codepoint> for that character. The exceptions are platforms where
423the legacy encoding is some variant of EBCDIC rather than a superset
424of ASCII; see L<perlebcdic>.
425
426During recent history, data is moved around a computer in 8-bit chunks,
427often called "bytes" but also known as "octets" in standards documents.
428Perl is widely used to manipulate data of many types: not only strings of
429characters representing human or computer languages, but also "binary"
430data, being the machine's representation of numbers, pixels in an image, or
431just about anything.
432
433When Perl is processing "binary data", the programmer wants Perl to
434process "sequences of bytes". This is not a problem for Perl: because a
435byte has 256 possible values, it easily fits in Perl's much larger
436"logical character".
437
438This document mostly explains the I<how>. L<perlunitut> and L<perlunifaq>
439explain the I<why>.
440
441=head2 TERMINOLOGY
442
443=head3 character
444
445A character in the range 0 .. 2**32-1 (or more);
446what Perl's strings are made of.
447
448=head3 byte
449
450A character in the range 0..255;
451a special case of a Perl character.
452
453=head3 octet
454
4558 bits of data, with ordinal values 0..255;
456term for bytes passed to or from a non-Perl context, such as a disk file,
457standard I/O stream, database, command-line argument, environment variable,
458socket etc.
459
460=head1 THE PERL ENCODING API
461
462=head2 Basic methods
463
464=head3 encode
465
466  $octets  = encode(ENCODING, STRING[, CHECK])
467
468Encodes the scalar value I<STRING> from Perl's internal form into
469I<ENCODING> and returns a sequence of octets.  I<ENCODING> can be either a
470canonical name or an alias.  For encoding names and aliases, see
471L</"Defining Aliases">.  For CHECK, see L</"Handling Malformed Data">.
472
473For example, to convert a string from Perl's internal format into
474ISO-8859-1, also known as Latin1:
475
476  $octets = encode("iso-8859-1", $string);
477
478B<CAVEAT>: When you run C<$octets = encode("utf8", $string)>, then
479$octets I<might not be equal to> $string.  Though both contain the
480same data, the UTF8 flag for $octets is I<always> off.  When you
481encode anything, the UTF8 flag on the result is always off, even when it
482contains a completely valid utf8 string. See L</"The UTF8 flag"> below.
483
484If the $string is C<undef>, then C<undef> is returned.
485
486=head3 decode
487
488  $string = decode(ENCODING, OCTETS[, CHECK])
489
490This function returns the string that results from decoding the scalar
491value I<OCTETS>, assumed to be a sequence of octets in I<ENCODING>, into
492Perl's internal form.  As with encode(),
493I<ENCODING> can be either a canonical name or an alias. For encoding names
494and aliases, see L</"Defining Aliases">; for I<CHECK>, see L</"Handling
495Malformed Data">.
496
497For example, to convert ISO-8859-1 data into a string in Perl's
498internal format:
499
500  $string = decode("iso-8859-1", $octets);
501
502B<CAVEAT>: When you run C<$string = decode("utf8", $octets)>, then $string
503I<might not be equal to> $octets.  Though both contain the same data, the
504UTF8 flag for $string is on.  See L</"The UTF8 flag">
505below.
506
If $octets is C<undef>, then C<undef> is returned.
508
509=head3 find_encoding
510
511  [$obj =] find_encoding(ENCODING)
512
513Returns the I<encoding object> corresponding to I<ENCODING>.  Returns
C<undef> if no matching I<ENCODING> is found.  The returned object is
515what does the actual encoding or decoding.
516
517  $utf8 = decode($name, $bytes);
518
519is in fact
520
521    $utf8 = do {
522        $obj = find_encoding($name);
523        croak qq(encoding "$name" not found) unless ref $obj;
524        $obj->decode($bytes);
525    };
526
527with more error checking.
528
You can therefore save time by reusing this object as follows:
530
531    my $enc = find_encoding("iso-8859-1");
532    while(<>) {
533        my $utf8 = $enc->decode($_);
534        ... # now do something with $utf8;
535    }
536
537Besides L</decode> and L</encode>, other methods are
538available as well.  For instance, C<name()> returns the canonical
539name of the encoding object.
540
541  find_encoding("latin1")->name; # iso-8859-1
542
543See L<Encode::Encoding> for details.
544
545=head3 from_to
546
547  [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])
548
549Converts I<in-place> data between two encodings. The data in $octets
550must be encoded as octets and I<not> as characters in Perl's internal
551format. For example, to convert ISO-8859-1 data into Microsoft's CP1250
552encoding:
553
554  from_to($octets, "iso-8859-1", "cp1250");
555
556and to convert it back:
557
558  from_to($octets, "cp1250", "iso-8859-1");
559
560Because the conversion happens in place, the data to be
561converted cannot be a string constant: it must be a scalar variable.
562
563C<from_to()> returns the length of the converted string in octets on success,
564and C<undef> on error.
565
566B<CAVEAT>: The following operations may look the same, but are not:
567
568  from_to($data, "iso-8859-1", "utf8"); #1
569  $data = decode("iso-8859-1", $data);  #2
570
571Both #1 and #2 make $data consist of a completely valid UTF-8 string,
572but only #2 turns the UTF8 flag on.  #1 is equivalent to:
573
574  $data = encode("utf8", decode("iso-8859-1", $data));
575
576See L</"The UTF8 flag"> below.
577
578Also note that:
579
580  from_to($octets, $from, $to, $check);
581
582is equivalent to:
583
584  $octets = encode($to, decode($from, $octets), $check);
585
586Yes, it does I<not> respect the $check during decoding.  It is
587deliberately done that way.  If you need minute control, use C<decode>
588followed by C<encode> as follows:
589
590  $octets = encode($to, decode($from, $octets, $check_from), $check_to);
591
592=head3 encode_utf8
593
594  $octets = encode_utf8($string);
595
596Equivalent to C<$octets = encode("utf8", $string)>.  The characters in
597$string are encoded in Perl's internal format, and the result is returned
598as a sequence of octets.  Because all possible characters in Perl have a
599(loose, not strict) UTF-8 representation, this function cannot fail.
600
601=head3 decode_utf8
602
603  $string = decode_utf8($octets [, CHECK]);
604
605Equivalent to C<$string = decode("utf8", $octets [, CHECK])>.
606The sequence of octets represented by $octets is decoded
607from UTF-8 into a sequence of logical characters.
608Because not all sequences of octets are valid UTF-8,
609it is quite possible for this function to fail.
610For CHECK, see L</"Handling Malformed Data">.
611
612=head2 Listing available encodings
613
614  use Encode;
615  @list = Encode->encodings();
616
617Returns a list of canonical names of available encodings that have already
618been loaded.  To get a list of all available encodings including those that
619have not yet been loaded, say:
620
621  @all_encodings = Encode->encodings(":all");
622
623Or you can give the name of a specific module:
624
625  @with_jp = Encode->encodings("Encode::JP");
626
627When "C<::>" is not in the name, "C<Encode::>" is assumed.
628
629  @ebcdic = Encode->encodings("EBCDIC");
630
631To find out in detail which encodings are supported by this package,
632see L<Encode::Supported>.
633
634=head2 Defining Aliases
635
636To add a new alias to a given encoding, use:
637
638  use Encode;
639  use Encode::Alias;
640  define_alias(NEWNAME => ENCODING);
641
642After that, I<NEWNAME> can be used as an alias for I<ENCODING>.
643I<ENCODING> may be either the name of an encoding or an
644I<encoding object>.
645
646Before you do that, first make sure the alias is nonexistent using
647C<resolve_alias()>, which returns the canonical name thereof.
648For example:
649
650  Encode::resolve_alias("latin1") eq "iso-8859-1" # true
651  Encode::resolve_alias("iso-8859-12")   # false; nonexistent
652  Encode::resolve_alias($name) eq $name  # true if $name is canonical
653
654C<resolve_alias()> does not need C<use Encode::Alias>; it can be
655imported via C<use Encode qw(resolve_alias)>.
656
657See L<Encode::Alias> for details.
658
659=head2 Finding IANA Character Set Registry names
660
661The canonical name of a given encoding does not necessarily agree with
662IANA Character Set Registry, commonly seen as C<< Content-Type:
663text/plain; charset=I<WHATEVER> >>.  For most cases, the canonical name
664works, but sometimes it does not, most notably with "utf-8-strict".
665
666As of C<Encode> version 2.21, a new method C<mime_name()> is therefore added.
667
668  use Encode;
669  my $enc = find_encoding("UTF-8");
670  warn $enc->name;      # utf-8-strict
671  warn $enc->mime_name; # UTF-8
672
673See also:  L<Encode::Encoding>
674
675=head1 Encoding via PerlIO
676
677If your perl supports C<PerlIO> (which is the default), you can use a
678C<PerlIO> layer to decode and encode directly via a filehandle.  The
679following two examples are fully identical in functionality:
680
681  ### Version 1 via PerlIO
682    open(INPUT,  "< :encoding(shiftjis)", $infile)
683        || die "Can't open < $infile for reading: $!";
684    open(OUTPUT, "> :encoding(euc-jp)",  $outfile)
        || die "Can't open > $outfile for writing: $!";
686    while (<INPUT>) {   # auto decodes $_
687        print OUTPUT;   # auto encodes $_
688    }
689    close(INPUT)   || die "can't close $infile: $!";
690    close(OUTPUT)  || die "can't close $outfile: $!";
691
692  ### Version 2 via from_to()
693    open(INPUT,  "< :raw", $infile)
694        || die "Can't open < $infile for reading: $!";
695    open(OUTPUT, "> :raw",  $outfile)
        || die "Can't open > $outfile for writing: $!";
697
698    while (<INPUT>) {
699        from_to($_, "shiftjis", "euc-jp", 1);  # switch encoding
700        print OUTPUT;   # emit raw (but properly encoded) data
701    }
702    close(INPUT)   || die "can't close $infile: $!";
703    close(OUTPUT)  || die "can't close $outfile: $!";
704
705In the first version above, you let the appropriate encoding layer
706handle the conversion.  In the second, you explicitly translate
707from one encoding to the other.
708
709Unfortunately, it may be that encodings are not C<PerlIO>-savvy.  You can check
710to see whether your encoding is supported by C<PerlIO> by invoking the
711C<perlio_ok> method on it:
712
713  Encode::perlio_ok("hz");             # false
714  find_encoding("euc-cn")->perlio_ok;  # true wherever PerlIO is available
715
716  use Encode qw(perlio_ok);            # imported upon request
717  perlio_ok("euc-jp")
718
719Fortunately, all encodings that come with C<Encode> core are C<PerlIO>-savvy
720except for C<hz> and C<ISO-2022-kr>.  For the gory details, see
721L<Encode::Encoding> and L<Encode::PerlIO>.
722
723=head1 Handling Malformed Data
724
725The optional I<CHECK> argument tells C<Encode> what to do when
726encountering malformed data.  Without I<CHECK>, C<Encode::FB_DEFAULT>
727(== 0) is assumed.
728
729As of version 2.12, C<Encode> supports coderef values for C<CHECK>;
730see below.
731
732B<NOTE:> Not all encodings support this feature.
733Some encodings ignore the I<CHECK> argument.  For example,
734L<Encode::Unicode> ignores I<CHECK> and it always croaks on error.
735
736=head2 List of I<CHECK> values
737
738=head3 FB_DEFAULT
739
740  I<CHECK> = Encode::FB_DEFAULT ( == 0)
741
742If I<CHECK> is 0, encoding and decoding replace any malformed character
743with a I<substitution character>.  When you encode, I<SUBCHAR> is used.
744When you decode, the Unicode REPLACEMENT CHARACTER, code point U+FFFD, is
745used.  If the data is supposed to be UTF-8, an optional lexical warning of
746warning category C<"utf8"> is given.
747
748=head3 FB_CROAK
749
750  I<CHECK> = Encode::FB_CROAK ( == 1)
751
752If I<CHECK> is 1, methods immediately die with an error
753message.  Therefore, when I<CHECK> is 1, you should trap
754exceptions with C<eval{}>, unless you really want to let it C<die>.
755
756=head3 FB_QUIET
757
758  I<CHECK> = Encode::FB_QUIET
759
760If I<CHECK> is set to C<Encode::FB_QUIET>, encoding and decoding immediately
761return the portion of the data that has been processed so far when an
762error occurs. The data argument is overwritten with everything
763after that point; that is, the unprocessed portion of the data.  This is
764handy when you have to call C<decode> repeatedly in the case where your
765source data may contain partial multi-byte character sequences,
766(that is, you are reading with a fixed-width buffer). Here's some sample
767code to do exactly that:
768
769    my($buffer, $string) = ("", "");
770    while (read($fh, $buffer, 256, length($buffer))) {
771        $string .= decode($encoding, $buffer, Encode::FB_QUIET);
772        # $buffer now contains the unprocessed partial character
773    }
774
775=head3 FB_WARN
776
777  I<CHECK> = Encode::FB_WARN
778
779This is the same as C<FB_QUIET> above, except that instead of being silent
780on errors, it issues a warning.  This is handy for when you are debugging.
781
782=head3 FB_PERLQQ FB_HTMLCREF FB_XMLCREF
783
784=over 2
785
786=item perlqq mode (I<CHECK> = Encode::FB_PERLQQ)
787
788=item HTML charref mode (I<CHECK> = Encode::FB_HTMLCREF)
789
790=item XML charref mode (I<CHECK> = Encode::FB_XMLCREF)
791
792=back
793
794For encodings that are implemented by the C<Encode::XS> module, C<CHECK> C<==>
795C<Encode::FB_PERLQQ> puts C<encode> and C<decode> into C<perlqq> fallback mode.
796
797When you decode, C<\xI<HH>> is inserted for a malformed character, where
798I<HH> is the hex representation of the octet that could not be decoded to
799utf8.  When you encode, C<\x{I<HHHH>}> will be inserted, where I<HHHH> is
800the Unicode code point (in any number of hex digits) of the character that
801cannot be found in the character repertoire of the encoding.
802
803The HTML/XML character reference modes are about the same. In place of
804C<\x{I<HHHH>}>, HTML uses C<&#I<NNN>;> where I<NNN> is a decimal number, and
805XML uses C<&#xI<HHHH>;> where I<HHHH> is the hexadecimal number.
806
807In C<Encode> 2.10 or later, C<LEAVE_SRC> is also implied.
808
809=head3 The bitmask
810
811These modes are all actually set via a bitmask.  Here is how the C<FB_I<XXX>>
812constants are laid out.  You can import the C<FB_I<XXX>> constants via
813C<use Encode qw(:fallbacks)>, and you can import the generic bitmask
814constants via C<use Encode qw(:fallback_all)>.
815
816                     FB_DEFAULT FB_CROAK FB_QUIET FB_WARN  FB_PERLQQ
817 DIE_ON_ERR    0x0001             X
818 WARN_ON_ERR   0x0002                               X
819 RETURN_ON_ERR 0x0004                      X        X
820 LEAVE_SRC     0x0008                                        X
821 PERLQQ        0x0100                                        X
822 HTMLCREF      0x0200
823 XMLCREF       0x0400
824
825=head3 LEAVE_SRC
826
827  Encode::LEAVE_SRC
828
829If the C<Encode::LEAVE_SRC> bit is I<not> set but I<CHECK> is set, then the
830source string to encode() or decode() will be overwritten in place.
If you do not want the source overwritten, bitwise-OR
C<Encode::LEAVE_SRC> into the I<CHECK> bitmask.
832
833=head2 coderef for CHECK
834
835As of C<Encode> 2.12, C<CHECK> can also be a code reference which takes the
836ordinal value of the unmapped character as an argument and returns
837octets that represent the fallback character.  For instance:
838
839  $ascii = encode("ascii", $utf8, sub{ sprintf "<U+%04X>", shift });
840
841Acts like C<FB_PERLQQ> but U+I<XXXX> is used instead of C<\x{I<XXXX>}>.
842
843Even the fallback for C<decode> must return octets, which are
844then decoded with the character encoding that C<decode> accepts. So for
845example if you wish to decode octets as UTF-8, and use ISO-8859-15 as
846a fallback for bytes that are not valid UTF-8, you could write
847
848    $str = decode 'UTF-8', $octets, sub {
849        my $tmp = chr shift;
850        from_to $tmp, 'ISO-8859-15', 'UTF-8';
851        return $tmp;
852    };
853
854=head1 Defining Encodings
855
856To define a new encoding, use:
857
858    use Encode qw(define_encoding);
859    define_encoding($object, CANONICAL_NAME [, alias...]);
860
861I<CANONICAL_NAME> will be associated with I<$object>.  The object
862should provide the interface described in L<Encode::Encoding>.
863If more than two arguments are provided, additional
864arguments are considered aliases for I<$object>.
865
866See L<Encode::Encoding> for details.
867
868=head1 The UTF8 flag
869
Before the introduction of Unicode support in Perl, the C<eq> operator
871just compared the strings represented by two scalars. Beginning with
872Perl 5.8, C<eq> compares two strings with simultaneous consideration of
873I<the UTF8 flag>. To explain why we made it so, I quote from page 402 of
874I<Programming Perl, 3rd ed.>
875
876=over 2
877
878=item Goal #1:
879
880Old byte-oriented programs should not spontaneously break on the old
881byte-oriented data they used to work on.
882
883=item Goal #2:
884
885Old byte-oriented programs should magically start working on the new
886character-oriented data when appropriate.
887
888=item Goal #3:
889
890Programs should run just as fast in the new character-oriented mode
891as in the old byte-oriented mode.
892
893=item Goal #4:
894
895Perl should remain one language, rather than forking into a
896byte-oriented Perl and a character-oriented Perl.
897
898=back
899
When I<Programming Perl, 3rd ed.> was written, not even Perl 5.6.0 had been
born yet, and many features documented in the book remained unimplemented for a
902long time.  Perl 5.8 corrected much of this, and the introduction of the
903UTF8 flag is one of them.  You can think of there being two fundamentally
904different kinds of strings and string-operations in Perl: one a
905byte-oriented mode  for when the internal UTF8 flag is off, and the other a
906character-oriented mode for when the internal UTF8 flag is on.
907
908Here is how C<Encode> handles the UTF8 flag.
909
910=over 2
911
912=item *
913
914When you I<encode>, the resulting UTF8 flag is always B<off>.
915
916=item *
917
918When you I<decode>, the resulting UTF8 flag is B<on>--I<unless> you can
919unambiguously represent data.  Here is what we mean by "unambiguously".
920After C<$utf8 = decode("foo", $octet)>,
921
922  When $octet is...   The UTF8 flag in $utf8 is
923  ---------------------------------------------
924  In ASCII only (or EBCDIC only)            OFF
925  In ISO-8859-1                              ON
926  In any other Encoding                      ON
927  ---------------------------------------------
928
929As you see, there is one exception: in ASCII.  That way you can assume
930Goal #1.  And with C<Encode>, Goal #2 is assumed but you still have to be
931careful in the cases mentioned in the B<CAVEAT> paragraphs above.
932
933This UTF8 flag is not visible in Perl scripts, exactly for the same reason
934you cannot (or rather, you I<don't have to>) see whether a scalar contains
935a string, an integer, or a floating-point number.   But you can still peek
936and poke these if you will.  See the next section.
937
938=back
939
940=head2 Messing with Perl's Internals
941
942The following API uses parts of Perl's internals in the current
943implementation.  As such, they are efficient but may change in a future
944release.
945
946=head3 is_utf8
947
948  is_utf8(STRING [, CHECK])
949
950[INTERNAL] Tests whether the UTF8 flag is turned on in the I<STRING>.
951If I<CHECK> is true, also checks whether I<STRING> contains well-formed
952UTF-8.  Returns true if successful, false otherwise.
953
954As of Perl 5.8.1, L<utf8> also has the C<utf8::is_utf8> function.
955
956=head3 _utf8_on
957
958  _utf8_on(STRING)
959
960[INTERNAL] Turns the I<STRING>'s internal UTF8 flag B<on>.  The I<STRING>
961is I<not> checked for containing only well-formed UTF-8.  Do not use this
962unless you I<know with absolute certainty> that the STRING holds only
963well-formed UTF-8.  Returns the previous state of the UTF8 flag (so please
964don't treat the return value as indicating success or failure), or C<undef>
965if I<STRING> is not a string.
966
967B<NOTE>: For security reasons, this function does not work on tainted values.
968
969=head3 _utf8_off
970
971  _utf8_off(STRING)
972
973[INTERNAL] Turns the I<STRING>'s internal UTF8 flag B<off>.  Do not use
974frivolously.  Returns the previous state of the UTF8 flag, or C<undef> if
975I<STRING> is not a string.  Do not treat the return value as indicative of
976success or failure, because that isn't what it means: it is only the
977previous setting.
978
979B<NOTE>: For security reasons, this function does not work on tainted values.
980
981=head1 UTF-8 vs. utf8 vs. UTF8
982
983  ....We now view strings not as sequences of bytes, but as sequences
984  of numbers in the range 0 .. 2**32-1 (or in the case of 64-bit
985  computers, 0 .. 2**64-1) -- Programming Perl, 3rd ed.
986
987That has historically been Perl's notion of UTF-8, as that is how UTF-8 was
988first conceived by Ken Thompson when he invented it. However, thanks to
989later revisions to the applicable standards, official UTF-8 is now rather
990stricter than that. For example, its range is much narrower (0 .. 0x10_FFFF
991to cover only 21 bits instead of 32 or 64 bits) and some sequences
992are not allowed, like those used in surrogate pairs, the 32 non-character
993code points 0xFDD0 .. 0xFDEF, the last two code points in I<any> plane
994(0xI<XX>_FFFE and 0xI<XX>_FFFF), all non-shortest encodings, etc.
995
996The former default in which Perl would always use a loose interpretation of
997UTF-8 has now been overruled:
998
999  From: Larry Wall <larry@wall.org>
1000  Date: December 04, 2004 11:51:58 JST
1001  To: perl-unicode@perl.org
1002  Subject: Re: Make Encode.pm support the real UTF-8
1003  Message-Id: <20041204025158.GA28754@wall.org>
1004
1005  On Fri, Dec 03, 2004 at 10:12:12PM +0000, Tim Bunce wrote:
1006  : I've no problem with 'utf8' being perl's unrestricted uft8 encoding,
1007  : but "UTF-8" is the name of the standard and should give the
1008  : corresponding behaviour.
1009
1010  For what it's worth, that's how I've always kept them straight in my
1011  head.
1012
1013  Also for what it's worth, Perl 6 will mostly default to strict but
1014  make it easy to switch back to lax.
1015
1016  Larry
1017
1018Got that?  As of Perl 5.8.7, B<"UTF-8"> means UTF-8 in its current
1019sense, which is conservative and strict and security-conscious, whereas
1020B<"utf8"> means UTF-8 in its former sense, which was liberal and loose and
1021lax.  C<Encode> version 2.10 or later thus groks this subtle but critically
1022important distinction between C<"UTF-8"> and C<"utf8">.
1023
1024  encode("utf8",  "\x{FFFF_FFFF}", 1); # okay
1025  encode("UTF-8", "\x{FFFF_FFFF}", 1); # croaks
1026
1027In the C<Encode> module, C<"UTF-8"> is actually a canonical name for
1028C<"utf-8-strict">.  That hyphen between the C<"UTF"> and the C<"8"> is
1029critical; without it, C<Encode> goes "liberal" and (perhaps overly-)permissive:
1030
1031  find_encoding("UTF-8")->name # is 'utf-8-strict'
1032  find_encoding("utf-8")->name # ditto. names are case insensitive
1033  find_encoding("utf_8")->name # ditto. "_" are treated as "-"
1034  find_encoding("UTF8")->name  # is 'utf8'.
1035
1036Perl's internal UTF8 flag is called "UTF8", without a hyphen. It indicates
1037whether a string is internally encoded as "utf8", also without a hyphen.
1038
1039=head1 SEE ALSO
1040
1041L<Encode::Encoding>,
1042L<Encode::Supported>,
1043L<Encode::PerlIO>,
1044L<encoding>,
1045L<perlebcdic>,
1046L<perlfunc/open>,
1047L<perlunicode>, L<perluniintro>, L<perlunifaq>, L<perlunitut>,
1048L<utf8>,
1049the Perl Unicode Mailing List L<http://lists.perl.org/list/perl-unicode.html>
1050
1051=head1 MAINTAINER
1052
1053This project was originated by the late Nick Ing-Simmons and later
1054maintained by Dan Kogai I<< <dankogai@cpan.org> >>.  See AUTHORS
1055for a full list of people involved.  For any questions, send mail to
1056I<< <perl-unicode@perl.org> >> so that we can all share.
1057
1058While Dan Kogai retains the copyright as a maintainer, credit
1059should go to all those involved.  See AUTHORS for a list of those
1060who submitted code to the project.
1061
1062=head1 COPYRIGHT
1063
1064Copyright 2002-2014 Dan Kogai I<< <dankogai@cpan.org> >>.
1065
1066This library is free software; you can redistribute it and/or modify
1067it under the same terms as Perl itself.
1068
1069=cut
1070