xref: /openbsd-src/gnu/usr.bin/perl/cpan/Encode/Encode.pm (revision f2da64fbbbf1b03f09f390ab01267c93dfd77c4c)
1#
2# $Id: Encode.pm,v 2.60 2014/04/29 16:26:49 dankogai Exp dankogai $
3#
4package Encode;
5use strict;
6use warnings;
7our $VERSION = sprintf "%d.%02d", q$Revision: 2.60_01 $ =~ /(\d+)/g;
8use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG};
9use XSLoader ();
10XSLoader::load( __PACKAGE__, $VERSION );
11
12use Exporter 5.57 'import';
13
14# Public, encouraged API is exported by default
15
16our @EXPORT = qw(
17  decode  decode_utf8  encode  encode_utf8 str2bytes bytes2str
18  encodings  find_encoding clone_encoding
19);
20our @FB_FLAGS = qw(
21  DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC
22  PERLQQ HTMLCREF XMLCREF STOP_AT_PARTIAL
23);
24our @FB_CONSTS = qw(
25  FB_DEFAULT FB_CROAK FB_QUIET FB_WARN
26  FB_PERLQQ FB_HTMLCREF FB_XMLCREF
27);
28our @EXPORT_OK = (
29    qw(
30      _utf8_off _utf8_on define_encoding from_to is_16bit is_8bit
31      is_utf8 perlio_ok resolve_alias utf8_downgrade utf8_upgrade
32      ),
33    @FB_FLAGS, @FB_CONSTS,
34);
35
36our %EXPORT_TAGS = (
37    all          => [ @EXPORT,    @EXPORT_OK ],
38    default      => [ @EXPORT ],
39    fallbacks    => [ @FB_CONSTS ],
40    fallback_all => [ @FB_CONSTS, @FB_FLAGS ],
41);
42
43# Documentation moved after __END__ for speed - NI-S
44
45our $ON_EBCDIC = ( ord("A") == 193 );
46
47use Encode::Alias;
48
49# Make a %Encoding package variable to allow a certain amount of cheating
50our %Encoding;
51our %ExtModule;
52require Encode::Config;
53#  See
54#  https://bugzilla.redhat.com/show_bug.cgi?id=435505#c2
55#  to find why sig handlers inside eval{} are disabled.
56eval {
57    local $SIG{__DIE__};
58    local $SIG{__WARN__};
59    local @INC = @INC;
60    pop @INC if $INC[-1] eq '.';
61    require Encode::ConfigLocal;
62};
63
64sub encodings {
65    my %enc;
66    my $arg  = $_[1] || '';
67    if ( $arg eq ":all" ) {
68        %enc = ( %Encoding, %ExtModule );
69    }
70    else {
71        %enc = %Encoding;
72        for my $mod ( map { m/::/ ? $_ : "Encode::$_" } @_ ) {
73            DEBUG and warn $mod;
74            for my $enc ( keys %ExtModule ) {
75                $ExtModule{$enc} eq $mod and $enc{$enc} = $mod;
76            }
77        }
78    }
79    return sort { lc $a cmp lc $b }
80      grep      { !/^(?:Internal|Unicode|Guess)$/o } keys %enc;
81}
82
83sub perlio_ok {
84    my $obj = ref( $_[0] ) ? $_[0] : find_encoding( $_[0] );
85    $obj->can("perlio_ok") and return $obj->perlio_ok();
86    return 0;    # safety net
87}
88
89sub define_encoding {
90    my $obj  = shift;
91    my $name = shift;
92    $Encoding{$name} = $obj;
93    my $lc = lc($name);
94    define_alias( $lc => $obj ) unless $lc eq $name;
95    while (@_) {
96        my $alias = shift;
97        define_alias( $alias, $obj );
98    }
99    return $obj;
100}
101
102sub getEncoding {
103    my ( $class, $name, $skip_external ) = @_;
104
105    $name =~ s/\s+//g; # https://rt.cpan.org/Ticket/Display.html?id=65796
106
107    ref($name) && $name->can('renew') and return $name;
108    exists $Encoding{$name} and return $Encoding{$name};
109    my $lc = lc $name;
110    exists $Encoding{$lc} and return $Encoding{$lc};
111
112    my $oc = $class->find_alias($name);
113    defined($oc) and return $oc;
114    $lc ne $name and $oc = $class->find_alias($lc);
115    defined($oc) and return $oc;
116
117    unless ($skip_external) {
118        if ( my $mod = $ExtModule{$name} || $ExtModule{$lc} ) {
119            $mod =~ s,::,/,g;
120            $mod .= '.pm';
121            eval { require $mod; };
122            exists $Encoding{$name} and return $Encoding{$name};
123        }
124    }
125    return;
126}
127
128sub find_encoding($;$) {
129    my ( $name, $skip_external ) = @_;
130    return __PACKAGE__->getEncoding( $name, $skip_external );
131}
132
133sub resolve_alias($) {
134    my $obj = find_encoding(shift);
135    defined $obj and return $obj->name;
136    return;
137}
138
139sub clone_encoding($) {
140    my $obj = find_encoding(shift);
141    ref $obj or return;
142    eval { require Storable };
143    $@ and return;
144    return Storable::dclone($obj);
145}
146
147sub encode($$;$) {
148    my ( $name, $string, $check ) = @_;
149    return undef unless defined $string;
150    $string .= '';    # stringify;
151    $check ||= 0;
152    unless ( defined $name ) {
153        require Carp;
154        Carp::croak("Encoding name should not be undef");
155    }
156    my $enc = find_encoding($name);
157    unless ( defined $enc ) {
158        require Carp;
159        Carp::croak("Unknown encoding '$name'");
160    }
161    my $octets = $enc->encode( $string, $check );
162    $_[1] = $string if $check and !ref $check and !( $check & LEAVE_SRC() );
163    return $octets;
164}
165*str2bytes = \&encode;
166
167sub decode($$;$) {
168    my ( $name, $octets, $check ) = @_;
169    return undef unless defined $octets;
170    $octets .= '';
171    $check ||= 0;
172    my $enc = find_encoding($name);
173    unless ( defined $enc ) {
174        require Carp;
175        Carp::croak("Unknown encoding '$name'");
176    }
177    my $string = $enc->decode( $octets, $check );
178    $_[1] = $octets if $check and !ref $check and !( $check & LEAVE_SRC() );
179    return $string;
180}
181*bytes2str = \&decode;
182
183sub from_to($$$;$) {
184    my ( $string, $from, $to, $check ) = @_;
185    return undef unless defined $string;
186    $check ||= 0;
187    my $f = find_encoding($from);
188    unless ( defined $f ) {
189        require Carp;
190        Carp::croak("Unknown encoding '$from'");
191    }
192    my $t = find_encoding($to);
193    unless ( defined $t ) {
194        require Carp;
195        Carp::croak("Unknown encoding '$to'");
196    }
197    my $uni = $f->decode($string);
198    $_[0] = $string = $t->encode( $uni, $check );
199    return undef if ( $check && length($uni) );
200    return defined( $_[0] ) ? length($string) : undef;
201}
202
203sub encode_utf8($) {
204    my ($str) = @_;
205    utf8::encode($str);
206    return $str;
207}
208
209my $utf8enc;
210
211sub decode_utf8($;$) {
212    my ( $octets, $check ) = @_;
213    return undef unless defined $octets;
214    $octets .= '';
215    $check   ||= 0;
216    $utf8enc ||= find_encoding('utf8');
217    my $string = $utf8enc->decode( $octets, $check );
218    $_[0] = $octets if $check and !ref $check and !( $check & LEAVE_SRC() );
219    return $string;
220}
221
222# sub decode_utf8($;$) {
223#     my ( $str, $check ) = @_;
224#     return $str if is_utf8($str);
225#     if ($check) {
226#         return decode( "utf8", $str, $check );
227#     }
228#     else {
229#         return decode( "utf8", $str );
230#         return $str;
231#     }
232# }
233
234predefine_encodings(1);
235
236#
237# This is to restore %Encoding if really needed;
238#
239
240sub predefine_encodings {
241    require Encode::Encoding;
242    no warnings 'redefine';
243    my $use_xs = shift;
244    if ($ON_EBCDIC) {
245
246        # was in Encode::UTF_EBCDIC
247        package Encode::UTF_EBCDIC;
248        push @Encode::UTF_EBCDIC::ISA, 'Encode::Encoding';
249        *decode = sub {
250            my ( undef, $str, $chk ) = @_;
251            my $res = '';
252            for ( my $i = 0 ; $i < length($str) ; $i++ ) {
253                $res .=
254                  chr(
255                    utf8::unicode_to_native( ord( substr( $str, $i, 1 ) ) )
256                  );
257            }
258            $_[1] = '' if $chk;
259            return $res;
260        };
261        *encode = sub {
262            my ( undef, $str, $chk ) = @_;
263            my $res = '';
264            for ( my $i = 0 ; $i < length($str) ; $i++ ) {
265                $res .=
266                  chr(
267                    utf8::native_to_unicode( ord( substr( $str, $i, 1 ) ) )
268                  );
269            }
270            $_[1] = '' if $chk;
271            return $res;
272        };
273        $Encode::Encoding{Unicode} =
274          bless { Name => "UTF_EBCDIC" } => "Encode::UTF_EBCDIC";
275    }
276    else {
277
278        package Encode::Internal;
279        push @Encode::Internal::ISA, 'Encode::Encoding';
280        *decode = sub {
281            my ( undef, $str, $chk ) = @_;
282            utf8::upgrade($str);
283            $_[1] = '' if $chk;
284            return $str;
285        };
286        *encode = \&decode;
287        $Encode::Encoding{Unicode} =
288          bless { Name => "Internal" } => "Encode::Internal";
289    }
290
291    {
292
293        # was in Encode::utf8
294        package Encode::utf8;
295        push @Encode::utf8::ISA, 'Encode::Encoding';
296
297        #
298        if ($use_xs) {
299            Encode::DEBUG and warn __PACKAGE__, " XS on";
300            *decode = \&decode_xs;
301            *encode = \&encode_xs;
302        }
303        else {
304            Encode::DEBUG and warn __PACKAGE__, " XS off";
305            *decode = sub {
306                my ( undef, $octets, $chk ) = @_;
307                my $str = Encode::decode_utf8($octets);
308                if ( defined $str ) {
309                    $_[1] = '' if $chk;
310                    return $str;
311                }
312                return undef;
313            };
314            *encode = sub {
315                my ( undef, $string, $chk ) = @_;
316                my $octets = Encode::encode_utf8($string);
317                $_[1] = '' if $chk;
318                return $octets;
319            };
320        }
321        *cat_decode = sub {    # ($obj, $dst, $src, $pos, $trm, $chk)
322                               # currently ignores $chk
323            my ( undef, undef, undef, $pos, $trm ) = @_;
324            my ( $rdst, $rsrc, $rpos ) = \@_[ 1, 2, 3 ];
325            use bytes;
326            if ( ( my $npos = index( $$rsrc, $trm, $pos ) ) >= 0 ) {
327                $$rdst .=
328                  substr( $$rsrc, $pos, $npos - $pos + length($trm) );
329                $$rpos = $npos + length($trm);
330                return 1;
331            }
332            $$rdst .= substr( $$rsrc, $pos );
333            $$rpos = length($$rsrc);
334            return '';
335        };
336        $Encode::Encoding{utf8} =
337          bless { Name => "utf8" } => "Encode::utf8";
338        $Encode::Encoding{"utf-8-strict"} =
339          bless { Name => "utf-8-strict", strict_utf8 => 1 }
340            => "Encode::utf8";
341    }
342}
343
3441;
345
346__END__
347
348=head1 NAME
349
350Encode - character encodings in Perl
351
352=head1 SYNOPSIS
353
354    use Encode qw(decode encode);
355    $characters = decode('UTF-8', $octets,     Encode::FB_CROAK);
356    $octets     = encode('UTF-8', $characters, Encode::FB_CROAK);
357
358=head2 Table of Contents
359
360Encode consists of a collection of modules whose details are too extensive
361to fit in one document.  This one itself explains the top-level APIs
362and general topics at a glance.  For other topics and more details,
363see the documentation for these modules:
364
365=over 2
366
367=item L<Encode::Alias> - Alias definitions to encodings
368
369=item L<Encode::Encoding> - Encode Implementation Base Class
370
371=item L<Encode::Supported> - List of Supported Encodings
372
373=item L<Encode::CN> - Simplified Chinese Encodings
374
375=item L<Encode::JP> - Japanese Encodings
376
377=item L<Encode::KR> - Korean Encodings
378
379=item L<Encode::TW> - Traditional Chinese Encodings
380
381=back
382
383=head1 DESCRIPTION
384
385The C<Encode> module provides the interface between Perl strings
386and the rest of the system.  Perl strings are sequences of
387I<characters>.
388
389The repertoire of characters that Perl can represent is a superset of those
390defined by the Unicode Consortium. On most platforms the ordinal
391values of a character as returned by C<ord(I<S>)> is the I<Unicode
392codepoint> for that character. The exceptions are platforms where
393the legacy encoding is some variant of EBCDIC rather than a superset
394of ASCII; see L<perlebcdic>.
395
396During recent history, data is moved around a computer in 8-bit chunks,
397often called "bytes" but also known as "octets" in standards documents.
398Perl is widely used to manipulate data of many types: not only strings of
399characters representing human or computer languages, but also "binary"
400data, being the machine's representation of numbers, pixels in an image, or
401just about anything.
402
403When Perl is processing "binary data", the programmer wants Perl to
404process "sequences of bytes". This is not a problem for Perl: because a
405byte has 256 possible values, it easily fits in Perl's much larger
406"logical character".
407
408This document mostly explains the I<how>. L<perlunitut> and L<perlunifaq>
409explain the I<why>.
410
411=head2 TERMINOLOGY
412
413=head3 character
414
415A character in the range 0 .. 2**32-1 (or more);
416what Perl's strings are made of.
417
418=head3 byte
419
420A character in the range 0..255;
421a special case of a Perl character.
422
423=head3 octet
424
4258 bits of data, with ordinal values 0..255;
426term for bytes passed to or from a non-Perl context, such as a disk file,
427standard I/O stream, database, command-line argument, environment variable,
428socket etc.
429
430=head1 THE PERL ENCODING API
431
432=head2 Basic methods
433
434=head3 encode
435
436  $octets  = encode(ENCODING, STRING[, CHECK])
437
438Encodes the scalar value I<STRING> from Perl's internal form into
439I<ENCODING> and returns a sequence of octets.  I<ENCODING> can be either a
440canonical name or an alias.  For encoding names and aliases, see
441L</"Defining Aliases">.  For CHECK, see L</"Handling Malformed Data">.
442
443For example, to convert a string from Perl's internal format into
444ISO-8859-1, also known as Latin1:
445
446  $octets = encode("iso-8859-1", $string);
447
448B<CAVEAT>: When you run C<$octets = encode("utf8", $string)>, then
449$octets I<might not be equal to> $string.  Though both contain the
450same data, the UTF8 flag for $octets is I<always> off.  When you
451encode anything, the UTF8 flag on the result is always off, even when it
452contains a completely valid utf8 string. See L</"The UTF8 flag"> below.
453
454If the $string is C<undef>, then C<undef> is returned.
455
456=head3 decode
457
458  $string = decode(ENCODING, OCTETS[, CHECK])
459
460This function returns the string that results from decoding the scalar
461value I<OCTETS>, assumed to be a sequence of octets in I<ENCODING>, into
462Perl's internal form.  The returns the resulting string.  As with encode(),
463I<ENCODING> can be either a canonical name or an alias. For encoding names
464and aliases, see L</"Defining Aliases">; for I<CHECK>, see L</"Handling
465Malformed Data">.
466
467For example, to convert ISO-8859-1 data into a string in Perl's
468internal format:
469
470  $string = decode("iso-8859-1", $octets);
471
472B<CAVEAT>: When you run C<$string = decode("utf8", $octets)>, then $string
473I<might not be equal to> $octets.  Though both contain the same data, the
474UTF8 flag for $string is on.  See L</"The UTF8 flag">
475below.
476
477If the $string is C<undef>, then C<undef> is returned.
478
479=head3 find_encoding
480
481  [$obj =] find_encoding(ENCODING)
482
483Returns the I<encoding object> corresponding to I<ENCODING>.  Returns
484C<undef> if no matching I<ENCODING> is find.  The returned object is
485what does the actual encoding or decoding.
486
487  $utf8 = decode($name, $bytes);
488
489is in fact
490
491    $utf8 = do {
492        $obj = find_encoding($name);
493        croak qq(encoding "$name" not found) unless ref $obj;
494        $obj->decode($bytes);
495    };
496
497with more error checking.
498
499You can therefore save time by reusing this object as follows;
500
501    my $enc = find_encoding("iso-8859-1");
502    while(<>) {
503        my $utf8 = $enc->decode($_);
504        ... # now do something with $utf8;
505    }
506
507Besides L</decode> and L</encode>, other methods are
508available as well.  For instance, C<name()> returns the canonical
509name of the encoding object.
510
511  find_encoding("latin1")->name; # iso-8859-1
512
513See L<Encode::Encoding> for details.
514
515=head3 from_to
516
517  [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])
518
519Converts I<in-place> data between two encodings. The data in $octets
520must be encoded as octets and I<not> as characters in Perl's internal
521format. For example, to convert ISO-8859-1 data into Microsoft's CP1250
522encoding:
523
524  from_to($octets, "iso-8859-1", "cp1250");
525
526and to convert it back:
527
528  from_to($octets, "cp1250", "iso-8859-1");
529
530Because the conversion happens in place, the data to be
531converted cannot be a string constant: it must be a scalar variable.
532
533C<from_to()> returns the length of the converted string in octets on success,
534and C<undef> on error.
535
536B<CAVEAT>: The following operations may look the same, but are not:
537
538  from_to($data, "iso-8859-1", "utf8"); #1
539  $data = decode("iso-8859-1", $data);  #2
540
541Both #1 and #2 make $data consist of a completely valid UTF-8 string,
542but only #2 turns the UTF8 flag on.  #1 is equivalent to:
543
544  $data = encode("utf8", decode("iso-8859-1", $data));
545
546See L</"The UTF8 flag"> below.
547
548Also note that:
549
550  from_to($octets, $from, $to, $check);
551
552is equivalent t:o
553
554  $octets = encode($to, decode($from, $octets), $check);
555
556Yes, it does I<not> respect the $check during decoding.  It is
557deliberately done that way.  If you need minute control, use C<decode>
558followed by C<encode> as follows:
559
560  $octets = encode($to, decode($from, $octets, $check_from), $check_to);
561
562=head3 encode_utf8
563
564  $octets = encode_utf8($string);
565
566Equivalent to C<$octets = encode("utf8", $string)>.  The characters in
567$string are encoded in Perl's internal format, and the result is returned
568as a sequence of octets.  Because all possible characters in Perl have a
569(loose, not strict) UTF-8 representation, this function cannot fail.
570
571=head3 decode_utf8
572
573  $string = decode_utf8($octets [, CHECK]);
574
575Equivalent to C<$string = decode("utf8", $octets [, CHECK])>.
576The sequence of octets represented by $octets is decoded
577from UTF-8 into a sequence of logical characters.
578Because not all sequences of octets are valid UTF-8,
579it is quite possible for this function to fail.
580For CHECK, see L</"Handling Malformed Data">.
581
582=head2 Listing available encodings
583
584  use Encode;
585  @list = Encode->encodings();
586
587Returns a list of canonical names of available encodings that have already
588been loaded.  To get a list of all available encodings including those that
589have not yet been loaded, say:
590
591  @all_encodings = Encode->encodings(":all");
592
593Or you can give the name of a specific module:
594
595  @with_jp = Encode->encodings("Encode::JP");
596
597When "C<::>" is not in the name, "C<Encode::>" is assumed.
598
599  @ebcdic = Encode->encodings("EBCDIC");
600
601To find out in detail which encodings are supported by this package,
602see L<Encode::Supported>.
603
604=head2 Defining Aliases
605
606To add a new alias to a given encoding, use:
607
608  use Encode;
609  use Encode::Alias;
610  define_alias(NEWNAME => ENCODING);
611
612After that, I<NEWNAME> can be used as an alias for I<ENCODING>.
613I<ENCODING> may be either the name of an encoding or an
614I<encoding object>.
615
616Before you do that, first make sure the alias is nonexistent using
617C<resolve_alias()>, which returns the canonical name thereof.
618For example:
619
620  Encode::resolve_alias("latin1") eq "iso-8859-1" # true
621  Encode::resolve_alias("iso-8859-12")   # false; nonexistent
622  Encode::resolve_alias($name) eq $name  # true if $name is canonical
623
624C<resolve_alias()> does not need C<use Encode::Alias>; it can be
625imported via C<use Encode qw(resolve_alias)>.
626
627See L<Encode::Alias> for details.
628
629=head2 Finding IANA Character Set Registry names
630
631The canonical name of a given encoding does not necessarily agree with
632IANA Character Set Registry, commonly seen as C<< Content-Type:
633text/plain; charset=I<WHATEVER> >>.  For most cases, the canonical name
634works, but sometimes it does not, most notably with "utf-8-strict".
635
636As of C<Encode> version 2.21, a new method C<mime_name()> is therefore added.
637
638  use Encode;
639  my $enc = find_encoding("UTF-8");
640  warn $enc->name;      # utf-8-strict
641  warn $enc->mime_name; # UTF-8
642
643See also:  L<Encode::Encoding>
644
645=head1 Encoding via PerlIO
646
647If your perl supports C<PerlIO> (which is the default), you can use a
648C<PerlIO> layer to decode and encode directly via a filehandle.  The
649following two examples are fully identical in functionality:
650
651  ### Version 1 via PerlIO
652    open(INPUT,  "< :encoding(shiftjis)", $infile)
653        || die "Can't open < $infile for reading: $!";
654    open(OUTPUT, "> :encoding(euc-jp)",  $outfile)
655        || die "Can't open > $output for writing: $!";
656    while (<INPUT>) {   # auto decodes $_
657        print OUTPUT;   # auto encodes $_
658    }
659    close(INPUT)   || die "can't close $infile: $!";
660    close(OUTPUT)  || die "can't close $outfile: $!";
661
662  ### Version 2 via from_to()
663    open(INPUT,  "< :raw", $infile)
664        || die "Can't open < $infile for reading: $!";
665    open(OUTPUT, "> :raw",  $outfile)
666        || die "Can't open > $output for writing: $!";
667
668    while (<INPUT>) {
669        from_to($_, "shiftjis", "euc-jp", 1);  # switch encoding
670        print OUTPUT;   # emit raw (but properly encoded) data
671    }
672    close(INPUT)   || die "can't close $infile: $!";
673    close(OUTPUT)  || die "can't close $outfile: $!";
674
675In the first version above, you let the appropriate encoding layer
676handle the conversion.  In the second, you explicitly translate
677from one encoding to the other.
678
679Unfortunately, it may be that encodings are C<PerlIO>-savvy.  You can check
680to see whether your encoding is supported by C<PerlIO> by invoking the
681C<perlio_ok> method on it:
682
683  Encode::perlio_ok("hz");             # false
684  find_encoding("euc-cn")->perlio_ok;  # true wherever PerlIO is available
685
686  use Encode qw(perlio_ok);            # imported upon request
687  perlio_ok("euc-jp")
688
689Fortunately, all encodings that come with C<Encode> core are C<PerlIO>-savvy
690except for C<hz> and C<ISO-2022-kr>.  For the gory details, see
691L<Encode::Encoding> and L<Encode::PerlIO>.
692
693=head1 Handling Malformed Data
694
695The optional I<CHECK> argument tells C<Encode> what to do when
696encountering malformed data.  Without I<CHECK>, C<Encode::FB_DEFAULT>
697(== 0) is assumed.
698
699As of version 2.12, C<Encode> supports coderef values for C<CHECK>;
700see below.
701
702B<NOTE:> Not all encodings support this feature.
703Some encodings ignore the I<CHECK> argument.  For example,
704L<Encode::Unicode> ignores I<CHECK> and it always croaks on error.
705
706=head2 List of I<CHECK> values
707
708=head3 FB_DEFAULT
709
710  I<CHECK> = Encode::FB_DEFAULT ( == 0)
711
712If I<CHECK> is 0, encoding and decoding replace any malformed character
713with a I<substitution character>.  When you encode, I<SUBCHAR> is used.
714When you decode, the Unicode REPLACEMENT CHARACTER, code point U+FFFD, is
715used.  If the data is supposed to be UTF-8, an optional lexical warning of
716warning category C<"utf8"> is given.
717
718=head3 FB_CROAK
719
720  I<CHECK> = Encode::FB_CROAK ( == 1)
721
722If I<CHECK> is 1, methods immediately die with an error
723message.  Therefore, when I<CHECK> is 1, you should trap
724exceptions with C<eval{}>, unless you really want to let it C<die>.
725
726=head3 FB_QUIET
727
728  I<CHECK> = Encode::FB_QUIET
729
730If I<CHECK> is set to C<Encode::FB_QUIET>, encoding and decoding immediately
731return the portion of the data that has been processed so far when an
732error occurs. The data argument is overwritten with everything
733after that point; that is, the unprocessed portion of the data.  This is
734handy when you have to call C<decode> repeatedly in the case where your
735source data may contain partial multi-byte character sequences,
736(that is, you are reading with a fixed-width buffer). Here's some sample
737code to do exactly that:
738
739    my($buffer, $string) = ("", "");
740    while (read($fh, $buffer, 256, length($buffer))) {
741        $string .= decode($encoding, $buffer, Encode::FB_QUIET);
742        # $buffer now contains the unprocessed partial character
743    }
744
745=head3 FB_WARN
746
747  I<CHECK> = Encode::FB_WARN
748
749This is the same as C<FB_QUIET> above, except that instead of being silent
750on errors, it issues a warning.  This is handy for when you are debugging.
751
752=head3 FB_PERLQQ FB_HTMLCREF FB_XMLCREF
753
754=over 2
755
756=item perlqq mode (I<CHECK> = Encode::FB_PERLQQ)
757
758=item HTML charref mode (I<CHECK> = Encode::FB_HTMLCREF)
759
760=item XML charref mode (I<CHECK> = Encode::FB_XMLCREF)
761
762=back
763
764For encodings that are implemented by the C<Encode::XS> module, C<CHECK> C<==>
765C<Encode::FB_PERLQQ> puts C<encode> and C<decode> into C<perlqq> fallback mode.
766
767When you decode, C<\xI<HH>> is inserted for a malformed character, where
768I<HH> is the hex representation of the octet that could not be decoded to
769utf8.  When you encode, C<\x{I<HHHH>}> will be inserted, where I<HHHH> is
770the Unicode code point (in any number of hex digits) of the character that
771cannot be found in the character repertoire of the encoding.
772
773The HTML/XML character reference modes are about the same. In place of
774C<\x{I<HHHH>}>, HTML uses C<&#I<NNN>;> where I<NNN> is a decimal number, and
775XML uses C<&#xI<HHHH>;> where I<HHHH> is the hexadecimal number.
776
777In C<Encode> 2.10 or later, C<LEAVE_SRC> is also implied.
778
779=head3 The bitmask
780
781These modes are all actually set via a bitmask.  Here is how the C<FB_I<XXX>>
782constants are laid out.  You can import the C<FB_I<XXX>> constants via
783C<use Encode qw(:fallbacks)>, and you can import the generic bitmask
784constants via C<use Encode qw(:fallback_all)>.
785
786                     FB_DEFAULT FB_CROAK FB_QUIET FB_WARN  FB_PERLQQ
787 DIE_ON_ERR    0x0001             X
788 WARN_ON_ERR   0x0002                               X
789 RETURN_ON_ERR 0x0004                      X        X
790 LEAVE_SRC     0x0008                                        X
791 PERLQQ        0x0100                                        X
792 HTMLCREF      0x0200
793 XMLCREF       0x0400
794
795=head3 LEAVE_SRC
796
797  Encode::LEAVE_SRC
798
799If the C<Encode::LEAVE_SRC> bit is I<not> set but I<CHECK> is set, then the
800source string to encode() or decode() will be overwritten in place.
801If you're not interested in this, then bitwise-OR it with the bitmask.
802
803=head2 coderef for CHECK
804
805As of C<Encode> 2.12, C<CHECK> can also be a code reference which takes the
806ordinal value of the unmapped character as an argument and returns
807octets that represent the fallback character.  For instance:
808
809  $ascii = encode("ascii", $utf8, sub{ sprintf "<U+%04X>", shift });
810
811Acts like C<FB_PERLQQ> but U+I<XXXX> is used instead of C<\x{I<XXXX>}>.
812
813Even the fallback for C<decode> must return octets, which are
814then decoded with the character encoding that C<decode> accepts. So for
815example if you wish to decode octests as UTF-8, and use ISO-8859-15 as
816a fallback for bytes that are not valid UTF-8, you could write
817
818    $str = decode 'UTF-8', $octets, sub {
819        my $tmp = chr shift;
820        from_to $tmp, 'ISO-8859-15', 'UTF-8';
821        return $tmp;
822    };
823
824=head1 Defining Encodings
825
826To define a new encoding, use:
827
828    use Encode qw(define_encoding);
829    define_encoding($object, CANONICAL_NAME [, alias...]);
830
831I<CANONICAL_NAME> will be associated with I<$object>.  The object
832should provide the interface described in L<Encode::Encoding>.
833If more than two arguments are provided, additional
834arguments are considered aliases for I<$object>.
835
836See L<Encode::Encoding> for details.
837
838=head1 The UTF8 flag
839
840Before the introduction of Unicode support in Perl, The C<eq> operator
841just compared the strings represented by two scalars. Beginning with
842Perl 5.8, C<eq> compares two strings with simultaneous consideration of
843I<the UTF8 flag>. To explain why we made it so, I quote from page 402 of
844I<Programming Perl, 3rd ed.>
845
846=over 2
847
848=item Goal #1:
849
850Old byte-oriented programs should not spontaneously break on the old
851byte-oriented data they used to work on.
852
853=item Goal #2:
854
855Old byte-oriented programs should magically start working on the new
856character-oriented data when appropriate.
857
858=item Goal #3:
859
860Programs should run just as fast in the new character-oriented mode
861as in the old byte-oriented mode.
862
863=item Goal #4:
864
865Perl should remain one language, rather than forking into a
866byte-oriented Perl and a character-oriented Perl.
867
868=back
869
870When I<Programming Perl, 3rd ed.> was written, not even Perl 5.6.0 had been
871born yet, many features documented in the book remained unimplemented for a
872long time.  Perl 5.8 corrected much of this, and the introduction of the
873UTF8 flag is one of them.  You can think of there being two fundamentally
874different kinds of strings and string-operations in Perl: one a
875byte-oriented mode  for when the internal UTF8 flag is off, and the other a
876character-oriented mode for when the internal UTF8 flag is on.
877
878Here is how C<Encode> handles the UTF8 flag.
879
880=over 2
881
882=item *
883
884When you I<encode>, the resulting UTF8 flag is always B<off>.
885
886=item *
887
888When you I<decode>, the resulting UTF8 flag is B<on>--I<unless> you can
889unambiguously represent data.  Here is what we mean by "unambiguously".
890After C<$utf8 = decode("foo", $octet)>,
891
892  When $octet is...   The UTF8 flag in $utf8 is
893  ---------------------------------------------
894  In ASCII only (or EBCDIC only)            OFF
895  In ISO-8859-1                              ON
896  In any other Encoding                      ON
897  ---------------------------------------------
898
899As you see, there is one exception: in ASCII.  That way you can assume
900Goal #1.  And with C<Encode>, Goal #2 is assumed but you still have to be
901careful in the cases mentioned in the B<CAVEAT> paragraphs above.
902
903This UTF8 flag is not visible in Perl scripts, exactly for the same reason
904you cannot (or rather, you I<don't have to>) see whether a scalar contains
905a string, an integer, or a floating-point number.   But you can still peek
906and poke these if you will.  See the next section.
907
908=back
909
910=head2 Messing with Perl's Internals
911
912The following API uses parts of Perl's internals in the current
913implementation.  As such, they are efficient but may change in a future
914release.
915
916=head3 is_utf8
917
918  is_utf8(STRING [, CHECK])
919
920[INTERNAL] Tests whether the UTF8 flag is turned on in the I<STRING>.
921If I<CHECK> is true, also checks whether I<STRING> contains well-formed
922UTF-8.  Returns true if successful, false otherwise.
923
924As of Perl 5.8.1, L<utf8> also has the C<utf8::is_utf8> function.
925
926=head3 _utf8_on
927
928  _utf8_on(STRING)
929
930[INTERNAL] Turns the I<STRING>'s internal UTF8 flag B<on>.  The I<STRING>
931is I<not> checked for containing only well-formed UTF-8.  Do not use this
932unless you I<know with absolute certainty> that the STRING holds only
933well-formed UTF-8.  Returns the previous state of the UTF8 flag (so please
934don't treat the return value as indicating success or failure), or C<undef>
935if I<STRING> is not a string.
936
937B<NOTE>: For security reasons, this function does not work on tainted values.
938
939=head3 _utf8_off
940
941  _utf8_off(STRING)
942
943[INTERNAL] Turns the I<STRING>'s internal UTF8 flag B<off>.  Do not use
944frivolously.  Returns the previous state of the UTF8 flag, or C<undef> if
945I<STRING> is not a string.  Do not treat the return value as indicative of
946success or failure, because that isn't what it means: it is only the
947previous setting.
948
949B<NOTE>: For security reasons, this function does not work on tainted values.
950
951=head1 UTF-8 vs. utf8 vs. UTF8
952
953  ....We now view strings not as sequences of bytes, but as sequences
954  of numbers in the range 0 .. 2**32-1 (or in the case of 64-bit
955  computers, 0 .. 2**64-1) -- Programming Perl, 3rd ed.
956
957That has historically been Perl's notion of UTF-8, as that is how UTF-8 was
958first conceived by Ken Thompson when he invented it. However, thanks to
959later revisions to the applicable standards, official UTF-8 is now rather
960stricter than that. For example, its range is much narrower (0 .. 0x10_FFFF
961to cover only 21 bits instead of 32 or 64 bits) and some sequences
962are not allowed, like those used in surrogate pairs, the 31 non-character
963code points 0xFDD0 .. 0xFDEF, the last two code points in I<any> plane
964(0xI<XX>_FFFE and 0xI<XX>_FFFF), all non-shortest encodings, etc.
965
966The former default in which Perl would always use a loose interpretation of
967UTF-8 has now been overruled:
968
969  From: Larry Wall <larry@wall.org>
970  Date: December 04, 2004 11:51:58 JST
971  To: perl-unicode@perl.org
972  Subject: Re: Make Encode.pm support the real UTF-8
973  Message-Id: <20041204025158.GA28754@wall.org>
974
975  On Fri, Dec 03, 2004 at 10:12:12PM +0000, Tim Bunce wrote:
976  : I've no problem with 'utf8' being perl's unrestricted uft8 encoding,
977  : but "UTF-8" is the name of the standard and should give the
978  : corresponding behaviour.
979
980  For what it's worth, that's how I've always kept them straight in my
981  head.
982
983  Also for what it's worth, Perl 6 will mostly default to strict but
984  make it easy to switch back to lax.
985
986  Larry
987
988Got that?  As of Perl 5.8.7, B<"UTF-8"> means UTF-8 in its current
989sense, which is conservative and strict and security-conscious, whereas
990B<"utf8"> means UTF-8 in its former sense, which was liberal and loose and
991lax.  C<Encode> version 2.10 or later thus groks this subtle but critically
992important distinction between C<"UTF-8"> and C<"utf8">.
993
994  encode("utf8",  "\x{FFFF_FFFF}", 1); # okay
995  encode("UTF-8", "\x{FFFF_FFFF}", 1); # croaks
996
997In the C<Encode> module, C<"UTF-8"> is actually a canonical name for
998C<"utf-8-strict">.  That hyphen between the C<"UTF"> and the C<"8"> is
999critical; without it, C<Encode> goes "liberal" and (perhaps overly-)permissive:
1000
1001  find_encoding("UTF-8")->name # is 'utf-8-strict'
1002  find_encoding("utf-8")->name # ditto. names are case insensitive
1003  find_encoding("utf_8")->name # ditto. "_" are treated as "-"
1004  find_encoding("UTF8")->name  # is 'utf8'.
1005
1006Perl's internal UTF8 flag is called "UTF8", without a hyphen. It indicates
1007whether a string is internally encoded as "utf8", also without a hyphen.
1008
1009=head1 SEE ALSO
1010
1011L<Encode::Encoding>,
1012L<Encode::Supported>,
1013L<Encode::PerlIO>,
1014L<encoding>,
1015L<perlebcdic>,
1016L<perlfunc/open>,
1017L<perlunicode>, L<perluniintro>, L<perlunifaq>, L<perlunitut>
1018L<utf8>,
1019the Perl Unicode Mailing List L<http://lists.perl.org/list/perl-unicode.html>
1020
1021=head1 MAINTAINER
1022
1023This project was originated by the late Nick Ing-Simmons and later
1024maintained by Dan Kogai I<< <dankogai@cpan.org> >>.  See AUTHORS
1025for a full list of people involved.  For any questions, send mail to
1026I<< <perl-unicode@perl.org> >> so that we can all share.
1027
1028While Dan Kogai retains the copyright as a maintainer, credit
1029should go to all those involved.  See AUTHORS for a list of those
1030who submitted code to the project.
1031
1032=head1 COPYRIGHT
1033
1034Copyright 2002-2013 Dan Kogai I<< <dankogai@cpan.org> >>.
1035
1036This library is free software; you can redistribute it and/or modify
1037it under the same terms as Perl itself.
1038
1039=cut
1040