xref: /openbsd-src/gnu/usr.bin/perl/cpan/Encode/Encode.pm (revision 50b7afb2c2c0993b0894d4e34bf857cb13ed9c80)
1#
2# $Id: Encode.pm,v 2.49 2013/03/05 03:13:47 dankogai Exp dankogai $
3#
4package Encode;
5use strict;
6use warnings;
7our $VERSION = sprintf "%d.%02d", q$Revision: 2.49 $ =~ /(\d+)/g;
8use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG};
9use XSLoader ();
10XSLoader::load( __PACKAGE__, $VERSION );
11
12require Exporter;
13use base qw/Exporter/;
14
# Public, encouraged API is exported by default

# Names imported by a plain "use Encode;".
our @EXPORT = qw(
  decode  decode_utf8  encode  encode_utf8 str2bytes bytes2str
  encodings  find_encoding clone_encoding
);

# Individual bit flags that make up a CHECK value.
our @FB_FLAGS = qw(
  DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC
  PERLQQ HTMLCREF XMLCREF STOP_AT_PARTIAL
);

# Ready-made CHECK values (the FB_* fallback modes).
our @FB_CONSTS = qw(
  FB_DEFAULT FB_CROAK FB_QUIET FB_WARN
  FB_PERLQQ FB_HTMLCREF FB_XMLCREF
);

# Names that are only exported on request: the lower-level helpers plus
# every flag and fallback constant listed above.
our @EXPORT_OK = (
    qw(
      _utf8_off _utf8_on define_encoding from_to is_16bit is_8bit
      is_utf8 perlio_ok resolve_alias utf8_downgrade utf8_upgrade
      ),
    @FB_FLAGS, @FB_CONSTS,
);

# Import tags, e.g. "use Encode qw(:fallbacks);".
our %EXPORT_TAGS = (
    all          => [ @EXPORT,    @EXPORT_OK ],
    default      => [ @EXPORT ],
    fallbacks    => [ @FB_CONSTS ],
    fallback_all => [ @FB_CONSTS, @FB_FLAGS ],
);

# Documentation moved after __END__ for speed - NI-S

# True when "A" has ordinal 193, i.e. the platform's native character set
# is an EBCDIC variant rather than an ASCII superset.
our $ON_EBCDIC = ( ord("A") == 193 );

use Encode::Alias;

# Make a %Encoding package variable to allow a certain amount of cheating
# %Encoding maps an encoding name to its encoding object (filled in by
# define_encoding() and predefine_encodings() below).
our %Encoding;

# %ExtModule maps an encoding name to the module that has to be loaded to
# obtain it (consulted by getEncoding() and encodings() below).
# NOTE(review): presumably populated by Encode::Config -- confirm there.
our %ExtModule;
require Encode::Config;
#  See
#  https://bugzilla.redhat.com/show_bug.cgi?id=435505#c2
#  to find why sig handlers inside eval{} are disabled.
# Encode::ConfigLocal is optional: a failure to load it is swallowed by the
# eval, with the caller's __DIE__/__WARN__ handlers masked while trying.
eval {
    local $SIG{__DIE__};
    local $SIG{__WARN__};
    require Encode::ConfigLocal;
};
62
# Return the names of the available encodings, sorted case-insensitively.
#
# Intended to be called as a class method (Encode->encodings(...)), so the
# first "real" argument is $_[1]:
#   * ":all"  -- list every name in %Encoding and in %ExtModule;
#   * otherwise list the names in %Encoding, plus every %ExtModule name
#     whose module equals one of the arguments.  An argument that contains
#     no "::" gets "Encode::" prepended first.
# The names Internal, Unicode and Guess are never reported.
sub encodings {
    my $selector = $_[1] || '';
    my %listed   = %Encoding;

    if ( $selector eq ":all" ) {
        %listed = ( %Encoding, %ExtModule );
    }
    else {

        # Every element of @_ (the invocant included) is read as a module.
        my @modules = map { m/::/ ? $_ : "Encode::$_" } @_;
        for my $module (@modules) {
            DEBUG and warn $module;
            my @provided =
              grep { $ExtModule{$_} eq $module } keys %ExtModule;
            @listed{@provided} = ($module) x @provided;
        }
    }

    return sort { lc $a cmp lc $b }
      grep      { !/^(?:Internal|Unicode|Guess)$/o } keys %listed;
}
81
# Report whether an encoding can be used as a PerlIO layer.
#
# Argument: an encoding object, or an encoding name that is looked up with
# find_encoding().
# Returns whatever the object's own perlio_ok() method returns; returns 0
# when the object has no such method or when the name is unknown.
sub perlio_ok {
    my $obj = ref( $_[0] ) ? $_[0] : find_encoding( $_[0] );

    # find_encoding() yields undef for an unknown name; answer "no" rather
    # than dying with "Can't call method "can" on an undefined value".
    defined $obj or return 0;

    $obj->can("perlio_ok") and return $obj->perlio_ok();
    return 0;    # safety net
}
87
# Register an encoding object under a canonical name.
#
#   define_encoding($object, CANONICAL_NAME [, alias...])
#
# The object is stored in %Encoding under the canonical name.  If the name
# is not already all lower case, its lower-cased form is defined as an
# alias, and so is every additional argument.  Returns the object.
sub define_encoding {
    my ( $obj, $name, @aliases ) = @_;

    $Encoding{$name} = $obj;

    my $lower = lc($name);
    if ( $lower ne $name ) {
        define_alias( $lower => $obj );
    }

    for my $alias (@aliases) {
        define_alias( $alias, $obj );
    }

    return $obj;
}
100
# Class method behind find_encoding(): map a name to an encoding object.
#
#   $class->getEncoding($name [, $skip_external])
#
# Lookup order:
#   1. an object that can('renew') is returned as is;
#   2. %Encoding, by the name and then by its lower-cased form;
#   3. $class->find_alias(), by the name and then by its lower-cased form;
#   4. unless $skip_external is true, the module listed for the name in
#      %ExtModule is loaded and %Encoding is consulted again.
# Returns the encoding object, or nothing (undef in scalar context) when
# the name is undefined or cannot be resolved.
sub getEncoding {
    my ( $class, $name, $skip_external ) = @_;

    # An undefined name cannot match anything; leave before it causes
    # "uninitialized value" warnings in the lookups below.
    defined $name or return;

    # Recognise encoding objects before touching $name as a string, so the
    # substitution below can never stringify (and thereby lose) an object.
    ref($name) && $name->can('renew') and return $name;

    $name =~ s/\s+//g; # https://rt.cpan.org/Ticket/Display.html?id=65796

    exists $Encoding{$name} and return $Encoding{$name};
    my $lc = lc $name;
    exists $Encoding{$lc} and return $Encoding{$lc};

    my $oc = $class->find_alias($name);
    defined($oc) and return $oc;
    $lc ne $name and $oc = $class->find_alias($lc);
    defined($oc) and return $oc;

    unless ($skip_external) {
        if ( my $mod = $ExtModule{$name} || $ExtModule{$lc} ) {
            $mod =~ s,::,/,g;
            $mod .= '.pm';

            # Best effort: a module that fails to load simply leaves
            # %Encoding without the entry.
            eval { require $mod; };
            exists $Encoding{$name} and return $Encoding{$name};

            # The module may have been selected through the lower-cased
            # name, so look for that spelling as well; otherwise the first
            # call fails while an identical second call succeeds.
            exists $Encoding{$lc} and return $Encoding{$lc};
        }
    }
    return;
}
126
# Function-style front end to getEncoding().
#
#   find_encoding($name [, $skip_external])
#
# Both arguments are passed on to __PACKAGE__->getEncoding() and its result
# is returned unchanged.
sub find_encoding($;$) {
    my $name          = shift;
    my $skip_external = shift;
    return __PACKAGE__->getEncoding( $name, $skip_external );
}
131
# Return the name reported by the encoding object that the given name or
# alias resolves to, or nothing when find_encoding() cannot resolve it.
sub resolve_alias($) {
    my ($alias) = @_;
    my $enc = find_encoding($alias);
    return unless defined $enc;
    return $enc->name;
}
137
# Return a deep copy (Storable::dclone) of the encoding object found for
# the given name.  Returns nothing when the name does not resolve to a
# reference or when Storable cannot be loaded.
sub clone_encoding($) {
    my ($name) = @_;
    my $enc = find_encoding($name);
    return unless ref $enc;

    # Storable is loaded on demand only.
    eval { require Storable };
    return if $@;

    return Storable::dclone($enc);
}
145
# Encode a character string into octets.
#
#   $octets = encode(ENCODING, STRING [, CHECK])
#
# Returns undef when STRING is undefined.  Croaks when ENCODING is
# undefined or cannot be resolved by find_encoding().  When CHECK is a true
# non-reference value without the LEAVE_SRC bit, whatever the encoding
# object left in the source buffer is written back to the caller's STRING.
sub encode($$;$) {
    my ( $name, $string, $check ) = @_;

    # The documented contract is a plain undef here, even in list context.
    if ( !defined $string ) {
        return undef;
    }
    $string .= '';    # stringify;
    $check = 0 unless $check;

    if ( !defined $name ) {
        require Carp;
        Carp::croak("Encoding name should not be undef");
    }

    my $enc = find_encoding($name);
    if ( !defined $enc ) {
        require Carp;
        Carp::croak("Unknown encoding '$name'");
    }

    my $octets = $enc->encode( $string, $check );

    # $_[1] aliases the caller's variable.
    if ( $check && !ref($check) && !( $check & LEAVE_SRC() ) ) {
        $_[1] = $string;
    }
    return $octets;
}
*str2bytes = \&encode;
165
# Decode octets into a character string.
#
#   $string = decode(ENCODING, OCTETS [, CHECK])
#
# Returns undef when OCTETS is undefined.  Croaks when ENCODING is
# undefined or cannot be resolved by find_encoding().  When CHECK is a true
# non-reference value without the LEAVE_SRC bit, whatever the encoding
# object left in the source buffer is written back to the caller's OCTETS.
sub decode($$;$) {
    my ( $name, $octets, $check ) = @_;

    # The documented contract is a plain undef here, even in list context.
    return undef unless defined $octets;
    $octets .= '';    # stringify
    $check ||= 0;

    # Same guard as encode(): report an undefined name for what it is
    # instead of warning about uninitialized values and then complaining
    # about an unknown encoding ''.
    unless ( defined $name ) {
        require Carp;
        Carp::croak("Encoding name should not be undef");
    }
    my $enc = find_encoding($name);
    unless ( defined $enc ) {
        require Carp;
        Carp::croak("Unknown encoding '$name'");
    }
    my $string = $enc->decode( $octets, $check );

    # $_[1] aliases the caller's variable.
    $_[1] = $octets if $check and !ref $check and !( $check & LEAVE_SRC() );
    return $string;
}
*bytes2str = \&decode;
181
# Convert octets from one encoding to another, in place.
#
#   [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])
#
# The data is decoded with FROM_ENC (no CHECK is passed to that step) and
# re-encoded with TO_ENC using CHECK; the result replaces the caller's
# first argument.  Returns the length of the result, or undef when the
# input is undefined, when CHECK is true and the encoder left part of the
# decoded text unprocessed, or when the result is undefined.  Croaks on an
# encoding name that find_encoding() cannot resolve.
sub from_to($$$;$) {
    my ( $string, $from, $to, $check ) = @_;
    return undef unless defined $string;
    $check ||= 0;

    # Resolve the source encoding first, then the target encoding.
    my @codec;
    for my $enc_name ( $from, $to ) {
        my $obj = find_encoding($enc_name);
        if ( !defined $obj ) {
            require Carp;
            Carp::croak("Unknown encoding '$enc_name'");
        }
        push @codec, $obj;
    }
    my ( $decoder, $encoder ) = @codec;

    my $chars = $decoder->decode($string);
    $string = $encoder->encode( $chars, $check );
    $_[0]   = $string;    # $_[0] aliases the caller's variable

    if ( $check && length($chars) ) {
        return undef;
    }
    return undef unless defined( $_[0] );
    return length($string);
}
201
# Encode a character string as UTF-8 octets.
#
#   $octets = encode_utf8($string)
#
# Works on a copy, so the caller's string is left alone.  Returns undef
# for an undefined argument, matching encode().
sub encode_utf8($) {
    my ($str) = @_;

    # Keep undef as undef rather than handing it to utf8::encode().
    return undef unless defined $str;
    utf8::encode($str);
    return $str;
}
207
# Cached 'utf8' encoding object, looked up on the first decode_utf8() call.
my $utf8enc;

# Decode UTF-8 octets into a character string.
#
#   $string = decode_utf8($octets [, CHECK])
#
# A string that already has the UTF8 flag on is returned unchanged; an
# undefined argument yields undef.  When CHECK is a true non-reference
# value without the LEAVE_SRC bit, whatever the encoding object left in the
# source buffer is written back to the caller's $octets.
sub decode_utf8($;$) {
    my ( $octets, $check ) = @_;

    if ( is_utf8($octets) ) {
        return $octets;
    }
    if ( !defined $octets ) {
        return undef;
    }

    ref $octets and $octets .= '';    # stringify references only
    $check = 0 unless $check;
    $utf8enc = find_encoding('utf8') unless $utf8enc;

    my $string = $utf8enc->decode( $octets, $check );

    # $_[0] aliases the caller's variable.
    if ( $check && !ref($check) && !( $check & LEAVE_SRC() ) ) {
        $_[0] = $octets;
    }
    return $string;
}
221
222# sub decode_utf8($;$) {
223#     my ( $str, $check ) = @_;
224#     return $str if is_utf8($str);
225#     if ($check) {
226#         return decode( "utf8", $str, $check );
227#     }
228#     else {
229#         return decode( "utf8", $str );
230#         return $str;
231#     }
232# }
233
predefine_encodings(1);

#
# This is to restore %Encoding if really needed;
#
# predefine_encodings($use_xs) installs the encodings that are implemented
# right here instead of in a module of their own:
#   * "Unicode" -- an Encode::UTF_EBCDIC object on EBCDIC platforms,
#                  an Encode::Internal object everywhere else;
#   * "utf8" and "utf-8-strict" -- Encode::utf8 objects.
# With a true $use_xs, Encode::utf8's decode/encode are aliased to
# decode_xs/encode_xs; otherwise pure-Perl closures are installed.
# The sub may be called more than once: glob assignments and %Encoding
# entries are simply overwritten, and the @ISA pushes are guarded so that
# repeated calls do not accumulate duplicate parent classes.
#

sub predefine_encodings {
    require Encode::Encoding;
    no warnings 'redefine';
    my $use_xs = shift;
    if ($ON_EBCDIC) {

        # was in Encode::UTF_EBCDIC
        package Encode::UTF_EBCDIC;
        push @Encode::UTF_EBCDIC::ISA, 'Encode::Encoding'
          unless grep { $_ eq 'Encode::Encoding' } @Encode::UTF_EBCDIC::ISA;

        # Map every character through utf8::unicode_to_native().
        *decode = sub {
            my ( undef, $str, $chk ) = @_;
            my $res = '';
            for ( my $i = 0 ; $i < length($str) ; $i++ ) {
                $res .=
                  chr(
                    utf8::unicode_to_native( ord( substr( $str, $i, 1 ) ) )
                  );
            }
            $_[1] = '' if $chk;    # caller's buffer counts as consumed
            return $res;
        };

        # Map every character through utf8::native_to_unicode().
        *encode = sub {
            my ( undef, $str, $chk ) = @_;
            my $res = '';
            for ( my $i = 0 ; $i < length($str) ; $i++ ) {
                $res .=
                  chr(
                    utf8::native_to_unicode( ord( substr( $str, $i, 1 ) ) )
                  );
            }
            $_[1] = '' if $chk;    # caller's buffer counts as consumed
            return $res;
        };
        $Encode::Encoding{Unicode} =
          bless { Name => "UTF_EBCDIC" } => "Encode::UTF_EBCDIC";
    }
    else {

        package Encode::Internal;
        push @Encode::Internal::ISA, 'Encode::Encoding'
          unless grep { $_ eq 'Encode::Encoding' } @Encode::Internal::ISA;

        # Returns an upgraded copy of the string; encode is the same sub.
        *decode = sub {
            my ( undef, $str, $chk ) = @_;
            utf8::upgrade($str);
            $_[1] = '' if $chk;    # caller's buffer counts as consumed
            return $str;
        };
        *encode = \&decode;
        $Encode::Encoding{Unicode} =
          bless { Name => "Internal" } => "Encode::Internal";
    }

    {

        # was in Encode::utf8
        package Encode::utf8;
        push @Encode::utf8::ISA, 'Encode::Encoding'
          unless grep { $_ eq 'Encode::Encoding' } @Encode::utf8::ISA;

        #
        if ($use_xs) {
            Encode::DEBUG and warn __PACKAGE__, " XS on";
            *decode = \&decode_xs;
            *encode = \&encode_xs;
        }
        else {
            Encode::DEBUG and warn __PACKAGE__, " XS off";

            # NOTE(review): Encode::decode_utf8() itself dispatches to the
            # 'utf8' encoding object's decode method, i.e. back to this
            # closure, for input without the UTF8 flag.  That looks like
            # unbounded recursion on the XS-off path -- confirm before
            # relying on it.
            *decode = sub {
                my ( undef, $octets, $chk ) = @_;
                my $str = Encode::decode_utf8($octets);
                if ( defined $str ) {
                    $_[1] = '' if $chk;
                    return $str;
                }
                return undef;
            };
            *encode = sub {
                my ( undef, $string, $chk ) = @_;
                my $octets = Encode::encode_utf8($string);
                $_[1] = '' if $chk;
                return $octets;
            };
        }

        # Append to $dst the bytes of $src from $pos up to and including
        # the next occurrence of $trm, and advance $pos past it; returns 1.
        # Without a terminator the rest of $src is appended, $pos is set to
        # the end of $src and '' is returned.
        *cat_decode = sub {    # ($obj, $dst, $src, $pos, $trm, $chk)
                               # currently ignores $chk
            my ( undef, undef, undef, $pos, $trm ) = @_;

            # References to the caller's own variables, so they can be
            # updated in place.
            my ( $rdst, $rsrc, $rpos ) = \@_[ 1, 2, 3 ];
            use bytes;
            if ( ( my $npos = index( $$rsrc, $trm, $pos ) ) >= 0 ) {
                $$rdst .=
                  substr( $$rsrc, $pos, $npos - $pos + length($trm) );
                $$rpos = $npos + length($trm);
                return 1;
            }
            $$rdst .= substr( $$rsrc, $pos );
            $$rpos = length($$rsrc);
            return '';
        };
        $Encode::Encoding{utf8} =
          bless { Name => "utf8" } => "Encode::utf8";
        $Encode::Encoding{"utf-8-strict"} =
          bless { Name => "utf-8-strict", strict_utf8 => 1 }
            => "Encode::utf8";
    }
}
343
3441;
345
346__END__
347
348=head1 NAME
349
350Encode - character encodings in Perl
351
352=head1 SYNOPSIS
353
354    use Encode qw(decode encode);
355    $characters = decode('UTF-8', $octets,     Encode::FB_CROAK);
356    $octets     = encode('UTF-8', $characters, Encode::FB_CROAK);
357
358=head2 Table of Contents
359
360Encode consists of a collection of modules whose details are too extensive
361to fit in one document.  This one itself explains the top-level APIs
362and general topics at a glance.  For other topics and more details,
363see the documentation for these modules:
364
365=over 2
366
367=item L<Encode::Alias> - Alias definitions to encodings
368
369=item L<Encode::Encoding> - Encode Implementation Base Class
370
371=item L<Encode::Supported> - List of Supported Encodings
372
373=item L<Encode::CN> - Simplified Chinese Encodings
374
375=item L<Encode::JP> - Japanese Encodings
376
377=item L<Encode::KR> - Korean Encodings
378
379=item L<Encode::TW> - Traditional Chinese Encodings
380
381=back
382
383=head1 DESCRIPTION
384
385The C<Encode> module provides the interface between Perl strings
386and the rest of the system.  Perl strings are sequences of
387I<characters>.
388
389The repertoire of characters that Perl can represent is a superset of those
390defined by the Unicode Consortium. On most platforms the ordinal
391values of a character as returned by C<ord(I<S>)> is the I<Unicode
392codepoint> for that character. The exceptions are platforms where
393the legacy encoding is some variant of EBCDIC rather than a superset
394of ASCII; see L<perlebcdic>.
395
396During recent history, data is moved around a computer in 8-bit chunks,
397often called "bytes" but also known as "octets" in standards documents.
398Perl is widely used to manipulate data of many types: not only strings of
399characters representing human or computer languages, but also "binary"
400data, being the machine's representation of numbers, pixels in an image, or
401just about anything.
402
403When Perl is processing "binary data", the programmer wants Perl to
404process "sequences of bytes". This is not a problem for Perl: because a
405byte has 256 possible values, it easily fits in Perl's much larger
406"logical character".
407
408This document mostly explains the I<how>. L<perlunitut> and L<perlunifaq>
409explain the I<why>.
410
411=head2 TERMINOLOGY
412
413=head3 character
414
415A character in the range 0 .. 2**32-1 (or more);
416what Perl's strings are made of.
417
418=head3 byte
419
420A character in the range 0..255;
421a special case of a Perl character.
422
423=head3 octet
424
4258 bits of data, with ordinal values 0..255;
426term for bytes passed to or from a non-Perl context, such as a disk file,
427standard I/O stream, database, command-line argument, environment variable,
428socket etc.
429
430=head1 THE PERL ENCODING API
431
432=head2 Basic methods
433
434=head3 encode
435
436  $octets  = encode(ENCODING, STRING[, CHECK])
437
438Encodes the scalar value I<STRING> from Perl's internal form into
439I<ENCODING> and returns a sequence of octets.  I<ENCODING> can be either a
440canonical name or an alias.  For encoding names and aliases, see
441L</"Defining Aliases">.  For CHECK, see L</"Handling Malformed Data">.
442
443For example, to convert a string from Perl's internal format into
444ISO-8859-1, also known as Latin1:
445
446  $octets = encode("iso-8859-1", $string);
447
448B<CAVEAT>: When you run C<$octets = encode("utf8", $string)>, then
449$octets I<might not be equal to> $string.  Though both contain the
450same data, the UTF8 flag for $octets is I<always> off.  When you
451encode anything, the UTF8 flag on the result is always off, even when it
452contains a completely valid utf8 string. See L</"The UTF8 flag"> below.
453
454If the $string is C<undef>, then C<undef> is returned.
455
456=head3 decode
457
458  $string = decode(ENCODING, OCTETS[, CHECK])
459
460This function returns the string that results from decoding the scalar
461value I<OCTETS>, assumed to be a sequence of octets in I<ENCODING>, into
Perl's internal form.  As with encode(),
463I<ENCODING> can be either a canonical name or an alias. For encoding names
464and aliases, see L</"Defining Aliases">; for I<CHECK>, see L</"Handling
465Malformed Data">.
466
467For example, to convert ISO-8859-1 data into a string in Perl's
468internal format:
469
470  $string = decode("iso-8859-1", $octets);
471
472B<CAVEAT>: When you run C<$string = decode("utf8", $octets)>, then $string
473I<might not be equal to> $octets.  Though both contain the same data, the
474UTF8 flag for $string is on unless $octets consists entirely of ASCII data
475on ASCII machines or EBCDIC on EBCDIC machines.  See L</"The UTF8 flag">
476below.
477
If the $octets is C<undef>, then C<undef> is returned.
479
480=head3 find_encoding
481
482  [$obj =] find_encoding(ENCODING)
483
484Returns the I<encoding object> corresponding to I<ENCODING>.  Returns
C<undef> if no matching I<ENCODING> is found.  The returned object is
486what does the actual encoding or decoding.
487
488  $utf8 = decode($name, $bytes);
489
490is in fact
491
492    $utf8 = do {
493        $obj = find_encoding($name);
494        croak qq(encoding "$name" not found) unless ref $obj;
495        $obj->decode($bytes);
496    };
497
498with more error checking.
499
You can therefore save time by reusing this object as follows:
501
502    my $enc = find_encoding("iso-8859-1");
503    while(<>) {
504        my $utf8 = $enc->decode($_);
505        ... # now do something with $utf8;
506    }
507
508Besides L</decode> and L</encode>, other methods are
509available as well.  For instance, C<name()> returns the canonical
510name of the encoding object.
511
512  find_encoding("latin1")->name; # iso-8859-1
513
514See L<Encode::Encoding> for details.
515
516=head3 from_to
517
518  [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])
519
520Converts I<in-place> data between two encodings. The data in $octets
521must be encoded as octets and I<not> as characters in Perl's internal
522format. For example, to convert ISO-8859-1 data into Microsoft's CP1250
523encoding:
524
525  from_to($octets, "iso-8859-1", "cp1250");
526
527and to convert it back:
528
529  from_to($octets, "cp1250", "iso-8859-1");
530
531Because the conversion happens in place, the data to be
532converted cannot be a string constant: it must be a scalar variable.
533
534C<from_to()> returns the length of the converted string in octets on success,
535and C<undef> on error.
536
537B<CAVEAT>: The following operations may look the same, but are not:
538
539  from_to($data, "iso-8859-1", "utf8"); #1
540  $data = decode("iso-8859-1", $data);  #2
541
542Both #1 and #2 make $data consist of a completely valid UTF-8 string,
543but only #2 turns the UTF8 flag on.  #1 is equivalent to:
544
545  $data = encode("utf8", decode("iso-8859-1", $data));
546
547See L</"The UTF8 flag"> below.
548
549Also note that:
550
551  from_to($octets, $from, $to, $check);
552
is equivalent to:
554
555  $octets = encode($to, decode($from, $octets), $check);
556
557Yes, it does I<not> respect the $check during decoding.  It is
558deliberately done that way.  If you need minute control, use C<decode>
559followed by C<encode> as follows:
560
561  $octets = encode($to, decode($from, $octets, $check_from), $check_to);
562
563=head3 encode_utf8
564
565  $octets = encode_utf8($string);
566
567Equivalent to C<$octets = encode("utf8", $string)>.  The characters in
568$string are encoded in Perl's internal format, and the result is returned
569as a sequence of octets.  Because all possible characters in Perl have a
570(loose, not strict) UTF-8 representation, this function cannot fail.
571
572=head3 decode_utf8
573
574  $string = decode_utf8($octets [, CHECK]);
575
576Equivalent to C<$string = decode("utf8", $octets [, CHECK])>.
577The sequence of octets represented by $octets is decoded
578from UTF-8 into a sequence of logical characters.
579Because not all sequences of octets are valid UTF-8,
580it is quite possible for this function to fail.
581For CHECK, see L</"Handling Malformed Data">.
582
583=head2 Listing available encodings
584
585  use Encode;
586  @list = Encode->encodings();
587
588Returns a list of canonical names of available encodings that have already
589been loaded.  To get a list of all available encodings including those that
590have not yet been loaded, say:
591
592  @all_encodings = Encode->encodings(":all");
593
594Or you can give the name of a specific module:
595
596  @with_jp = Encode->encodings("Encode::JP");
597
598When "C<::>" is not in the name, "C<Encode::>" is assumed.
599
600  @ebcdic = Encode->encodings("EBCDIC");
601
602To find out in detail which encodings are supported by this package,
603see L<Encode::Supported>.
604
605=head2 Defining Aliases
606
607To add a new alias to a given encoding, use:
608
609  use Encode;
610  use Encode::Alias;
611  define_alias(NEWNAME => ENCODING);
612
613After that, I<NEWNAME> can be used as an alias for I<ENCODING>.
614I<ENCODING> may be either the name of an encoding or an
615I<encoding object>.
616
617Before you do that, first make sure the alias is nonexistent using
618C<resolve_alias()>, which returns the canonical name thereof.
619For example:
620
621  Encode::resolve_alias("latin1") eq "iso-8859-1" # true
622  Encode::resolve_alias("iso-8859-12")   # false; nonexistent
623  Encode::resolve_alias($name) eq $name  # true if $name is canonical
624
625C<resolve_alias()> does not need C<use Encode::Alias>; it can be
626imported via C<use Encode qw(resolve_alias)>.
627
628See L<Encode::Alias> for details.
629
630=head2 Finding IANA Character Set Registry names
631
632The canonical name of a given encoding does not necessarily agree with
633IANA Character Set Registry, commonly seen as C<< Content-Type:
634text/plain; charset=I<WHATEVER> >>.  For most cases, the canonical name
635works, but sometimes it does not, most notably with "utf-8-strict".
636
637As of C<Encode> version 2.21, a new method C<mime_name()> is therefore added.
638
639  use Encode;
640  my $enc = find_encoding("UTF-8");
641  warn $enc->name;      # utf-8-strict
642  warn $enc->mime_name; # UTF-8
643
644See also:  L<Encode::Encoding>
645
646=head1 Encoding via PerlIO
647
648If your perl supports C<PerlIO> (which is the default), you can use a
649C<PerlIO> layer to decode and encode directly via a filehandle.  The
650following two examples are fully identical in functionality:
651
652  ### Version 1 via PerlIO
653    open(INPUT,  "< :encoding(shiftjis)", $infile)
654        || die "Can't open < $infile for reading: $!";
655    open(OUTPUT, "> :encoding(euc-jp)",  $outfile)
        || die "Can't open > $outfile for writing: $!";
657    while (<INPUT>) {   # auto decodes $_
658        print OUTPUT;   # auto encodes $_
659    }
660    close(INPUT)   || die "can't close $infile: $!";
661    close(OUTPUT)  || die "can't close $outfile: $!";
662
663  ### Version 2 via from_to()
664    open(INPUT,  "< :raw", $infile)
665        || die "Can't open < $infile for reading: $!";
666    open(OUTPUT, "> :raw",  $outfile)
        || die "Can't open > $outfile for writing: $!";
668
669    while (<INPUT>) {
670        from_to($_, "shiftjis", "euc-jp", 1);  # switch encoding
671        print OUTPUT;   # emit raw (but properly encoded) data
672    }
673    close(INPUT)   || die "can't close $infile: $!";
674    close(OUTPUT)  || die "can't close $outfile: $!";
675
676In the first version above, you let the appropriate encoding layer
677handle the conversion.  In the second, you explicitly translate
678from one encoding to the other.
679
Unfortunately, it may be that encodings are not C<PerlIO>-savvy.  You can check
681to see whether your encoding is supported by C<PerlIO> by invoking the
682C<perlio_ok> method on it:
683
684  Encode::perlio_ok("hz");             # false
685  find_encoding("euc-cn")->perlio_ok;  # true wherever PerlIO is available
686
687  use Encode qw(perlio_ok);            # imported upon request
688  perlio_ok("euc-jp")
689
690Fortunately, all encodings that come with C<Encode> core are C<PerlIO>-savvy
691except for C<hz> and C<ISO-2022-kr>.  For the gory details, see
692L<Encode::Encoding> and L<Encode::PerlIO>.
693
694=head1 Handling Malformed Data
695
696The optional I<CHECK> argument tells C<Encode> what to do when
697encountering malformed data.  Without I<CHECK>, C<Encode::FB_DEFAULT>
698(== 0) is assumed.
699
700As of version 2.12, C<Encode> supports coderef values for C<CHECK>;
701see below.
702
703B<NOTE:> Not all encodings support this feature.
704Some encodings ignore the I<CHECK> argument.  For example,
705L<Encode::Unicode> ignores I<CHECK> and it always croaks on error.
706
707=head2 List of I<CHECK> values
708
709=head3 FB_DEFAULT
710
711  I<CHECK> = Encode::FB_DEFAULT ( == 0)
712
713If I<CHECK> is 0, encoding and decoding replace any malformed character
714with a I<substitution character>.  When you encode, I<SUBCHAR> is used.
715When you decode, the Unicode REPLACEMENT CHARACTER, code point U+FFFD, is
716used.  If the data is supposed to be UTF-8, an optional lexical warning of
717warning category C<"utf8"> is given.
718
719=head3 FB_CROAK
720
721  I<CHECK> = Encode::FB_CROAK ( == 1)
722
723If I<CHECK> is 1, methods immediately die with an error
724message.  Therefore, when I<CHECK> is 1, you should trap
725exceptions with C<eval{}>, unless you really want to let it C<die>.
726
727=head3 FB_QUIET
728
729  I<CHECK> = Encode::FB_QUIET
730
731If I<CHECK> is set to C<Encode::FB_QUIET>, encoding and decoding immediately
732return the portion of the data that has been processed so far when an
733error occurs. The data argument is overwritten with everything
734after that point; that is, the unprocessed portion of the data.  This is
735handy when you have to call C<decode> repeatedly in the case where your
736source data may contain partial multi-byte character sequences,
737(that is, you are reading with a fixed-width buffer). Here's some sample
738code to do exactly that:
739
740    my($buffer, $string) = ("", "");
741    while (read($fh, $buffer, 256, length($buffer))) {
742        $string .= decode($encoding, $buffer, Encode::FB_QUIET);
743        # $buffer now contains the unprocessed partial character
744    }
745
746=head3 FB_WARN
747
748  I<CHECK> = Encode::FB_WARN
749
750This is the same as C<FB_QUIET> above, except that instead of being silent
751on errors, it issues a warning.  This is handy for when you are debugging.
752
753=head3 FB_PERLQQ FB_HTMLCREF FB_XMLCREF
754
755=over 2
756
757=item perlqq mode (I<CHECK> = Encode::FB_PERLQQ)
758
759=item HTML charref mode (I<CHECK> = Encode::FB_HTMLCREF)
760
761=item XML charref mode (I<CHECK> = Encode::FB_XMLCREF)
762
763=back
764
765For encodings that are implemented by the C<Encode::XS> module, C<CHECK> C<==>
766C<Encode::FB_PERLQQ> puts C<encode> and C<decode> into C<perlqq> fallback mode.
767
768When you decode, C<\xI<HH>> is inserted for a malformed character, where
769I<HH> is the hex representation of the octet that could not be decoded to
770utf8.  When you encode, C<\x{I<HHHH>}> will be inserted, where I<HHHH> is
771the Unicode code point (in any number of hex digits) of the character that
772cannot be found in the character repertoire of the encoding.
773
774The HTML/XML character reference modes are about the same. In place of
775C<\x{I<HHHH>}>, HTML uses C<&#I<NNN>;> where I<NNN> is a decimal number, and
776XML uses C<&#xI<HHHH>;> where I<HHHH> is the hexadecimal number.
777
778In C<Encode> 2.10 or later, C<LEAVE_SRC> is also implied.
779
780=head3 The bitmask
781
782These modes are all actually set via a bitmask.  Here is how the C<FB_I<XXX>>
783constants are laid out.  You can import the C<FB_I<XXX>> constants via
784C<use Encode qw(:fallbacks)>, and you can import the generic bitmask
785constants via C<use Encode qw(:fallback_all)>.
786
787                     FB_DEFAULT FB_CROAK FB_QUIET FB_WARN  FB_PERLQQ
788 DIE_ON_ERR    0x0001             X
789 WARN_ON_ERR   0x0002                               X
790 RETURN_ON_ERR 0x0004                      X        X
791 LEAVE_SRC     0x0008                                        X
792 PERLQQ        0x0100                                        X
793 HTMLCREF      0x0200
794 XMLCREF       0x0400
795
796=head3 LEAVE_SRC
797
798  Encode::LEAVE_SRC
799
800If the C<Encode::LEAVE_SRC> bit is I<not> set but I<CHECK> is set, then the
801source string to encode() or decode() will be overwritten in place.
802If you're not interested in this, then bitwise-OR it with the bitmask.
803
804=head2 coderef for CHECK
805
806As of C<Encode> 2.12, C<CHECK> can also be a code reference which takes the
807ordinal value of the unmapped character as an argument and returns a string
808that represents the fallback character.  For instance:
809
810  $ascii = encode("ascii", $utf8, sub{ sprintf "<U+%04X>", shift });
811
812Acts like C<FB_PERLQQ> but U+I<XXXX> is used instead of C<\x{I<XXXX>}>.
813
814=head1 Defining Encodings
815
816To define a new encoding, use:
817
818    use Encode qw(define_encoding);
819    define_encoding($object, CANONICAL_NAME [, alias...]);
820
821I<CANONICAL_NAME> will be associated with I<$object>.  The object
822should provide the interface described in L<Encode::Encoding>.
823If more than two arguments are provided, additional
824arguments are considered aliases for I<$object>.
825
826See L<Encode::Encoding> for details.
827
828=head1 The UTF8 flag
829
Before the introduction of Unicode support in Perl, the C<eq> operator
831just compared the strings represented by two scalars. Beginning with
832Perl 5.8, C<eq> compares two strings with simultaneous consideration of
833I<the UTF8 flag>. To explain why we made it so, I quote from page 402 of
834I<Programming Perl, 3rd ed.>
835
836=over 2
837
838=item Goal #1:
839
840Old byte-oriented programs should not spontaneously break on the old
841byte-oriented data they used to work on.
842
843=item Goal #2:
844
845Old byte-oriented programs should magically start working on the new
846character-oriented data when appropriate.
847
848=item Goal #3:
849
850Programs should run just as fast in the new character-oriented mode
851as in the old byte-oriented mode.
852
853=item Goal #4:
854
855Perl should remain one language, rather than forking into a
856byte-oriented Perl and a character-oriented Perl.
857
858=back
859
860When I<Programming Perl, 3rd ed.> was written, not even Perl 5.6.0 had been
born yet; many features documented in the book remained unimplemented for a
862long time.  Perl 5.8 corrected much of this, and the introduction of the
863UTF8 flag is one of them.  You can think of there being two fundamentally
864different kinds of strings and string-operations in Perl: one a
865byte-oriented mode  for when the internal UTF8 flag is off, and the other a
866character-oriented mode for when the internal UTF8 flag is on.
867
868Here is how C<Encode> handles the UTF8 flag.
869
870=over 2
871
872=item *
873
874When you I<encode>, the resulting UTF8 flag is always B<off>.
875
876=item *
877
878When you I<decode>, the resulting UTF8 flag is B<on>--I<unless> you can
879unambiguously represent data.  Here is what we mean by "unambiguously".
880After C<$utf8 = decode("foo", $octet)>,
881
882  When $octet is...   The UTF8 flag in $utf8 is
883  ---------------------------------------------
884  In ASCII only (or EBCDIC only)            OFF
885  In ISO-8859-1                              ON
886  In any other Encoding                      ON
887  ---------------------------------------------
888
889As you see, there is one exception: in ASCII.  That way you can assume
890Goal #1.  And with C<Encode>, Goal #2 is assumed but you still have to be
891careful in the cases mentioned in the B<CAVEAT> paragraphs above.
892
893This UTF8 flag is not visible in Perl scripts, exactly for the same reason
894you cannot (or rather, you I<don't have to>) see whether a scalar contains
895a string, an integer, or a floating-point number.   But you can still peek
896and poke these if you will.  See the next section.
897
898=back
899
900=head2 Messing with Perl's Internals
901
902The following API uses parts of Perl's internals in the current
903implementation.  As such, they are efficient but may change in a future
904release.
905
906=head3 is_utf8
907
908  is_utf8(STRING [, CHECK])
909
910[INTERNAL] Tests whether the UTF8 flag is turned on in the I<STRING>.
911If I<CHECK> is true, also checks whether I<STRING> contains well-formed
912UTF-8.  Returns true if successful, false otherwise.
913
914As of Perl 5.8.1, L<utf8> also has the C<utf8::is_utf8> function.
915
916=head3 _utf8_on
917
918  _utf8_on(STRING)
919
920[INTERNAL] Turns the I<STRING>'s internal UTF8 flag B<on>.  The I<STRING>
921is I<not> checked for containing only well-formed UTF-8.  Do not use this
922unless you I<know with absolute certainty> that the STRING holds only
923well-formed UTF-8.  Returns the previous state of the UTF8 flag (so please
924don't treat the return value as indicating success or failure), or C<undef>
925if I<STRING> is not a string.
926
927B<NOTE>: For security reasons, this function does not work on tainted values.
928
929=head3 _utf8_off
930
931  _utf8_off(STRING)
932
933[INTERNAL] Turns the I<STRING>'s internal UTF8 flag B<off>.  Do not use
934frivolously.  Returns the previous state of the UTF8 flag, or C<undef> if
935I<STRING> is not a string.  Do not treat the return value as indicative of
936success or failure, because that isn't what it means: it is only the
937previous setting.
938
939B<NOTE>: For security reasons, this function does not work on tainted values.
940
941=head1 UTF-8 vs. utf8 vs. UTF8
942
943  ....We now view strings not as sequences of bytes, but as sequences
944  of numbers in the range 0 .. 2**32-1 (or in the case of 64-bit
945  computers, 0 .. 2**64-1) -- Programming Perl, 3rd ed.
946
947That has historically been Perl's notion of UTF-8, as that is how UTF-8 was
948first conceived by Ken Thompson when he invented it. However, thanks to
949later revisions to the applicable standards, official UTF-8 is now rather
950stricter than that. For example, its range is much narrower (0 .. 0x10_FFFF
951to cover only 21 bits instead of 32 or 64 bits) and some sequences
952are not allowed, like those used in surrogate pairs, the 32 non-character
953code points 0xFDD0 .. 0xFDEF, the last two code points in I<any> plane
954(0xI<XX>_FFFE and 0xI<XX>_FFFF), all non-shortest encodings, etc.
955
956The former default in which Perl would always use a loose interpretation of
957UTF-8 has now been overruled:
958
959  From: Larry Wall <larry@wall.org>
960  Date: December 04, 2004 11:51:58 JST
961  To: perl-unicode@perl.org
962  Subject: Re: Make Encode.pm support the real UTF-8
963  Message-Id: <20041204025158.GA28754@wall.org>
964
965  On Fri, Dec 03, 2004 at 10:12:12PM +0000, Tim Bunce wrote:
966  : I've no problem with 'utf8' being perl's unrestricted uft8 encoding,
967  : but "UTF-8" is the name of the standard and should give the
968  : corresponding behaviour.
969
970  For what it's worth, that's how I've always kept them straight in my
971  head.
972
973  Also for what it's worth, Perl 6 will mostly default to strict but
974  make it easy to switch back to lax.
975
976  Larry
977
978Got that?  As of Perl 5.8.7, B<"UTF-8"> means UTF-8 in its current
979sense, which is conservative and strict and security-conscious, whereas
980B<"utf8"> means UTF-8 in its former sense, which was liberal and loose and
981lax.  C<Encode> version 2.10 or later thus groks this subtle but critically
982important distinction between C<"UTF-8"> and C<"utf8">.
983
984  encode("utf8",  "\x{FFFF_FFFF}", 1); # okay
985  encode("UTF-8", "\x{FFFF_FFFF}", 1); # croaks
986
987In the C<Encode> module, C<"UTF-8"> is actually a canonical name for
988C<"utf-8-strict">.  That hyphen between the C<"UTF"> and the C<"8"> is
989critical; without it, C<Encode> goes "liberal" and (perhaps overly-)permissive:
990
991  find_encoding("UTF-8")->name # is 'utf-8-strict'
992  find_encoding("utf-8")->name # ditto. names are case insensitive
993  find_encoding("utf_8")->name # ditto. "_" are treated as "-"
994  find_encoding("UTF8")->name  # is 'utf8'.
995
996Perl's internal UTF8 flag is called "UTF8", without a hyphen. It indicates
997whether a string is internally encoded as "utf8", also without a hyphen.
998
999=head1 SEE ALSO
1000
1001L<Encode::Encoding>,
1002L<Encode::Supported>,
1003L<Encode::PerlIO>,
1004L<encoding>,
1005L<perlebcdic>,
1006L<perlfunc/open>,
1007L<perlunicode>, L<perluniintro>, L<perlunifaq>, L<perlunitut>,
1008L<utf8>,
1009the Perl Unicode Mailing List L<http://lists.perl.org/list/perl-unicode.html>
1010
1011=head1 MAINTAINER
1012
1013This project was originated by the late Nick Ing-Simmons and later
1014maintained by Dan Kogai I<< <dankogai@cpan.org> >>.  See AUTHORS
1015for a full list of people involved.  For any questions, send mail to
1016I<< <perl-unicode@perl.org> >> so that we can all share.
1017
1018While Dan Kogai retains the copyright as a maintainer, credit
1019should go to all those involved.  See AUTHORS for a list of those
1020who submitted code to the project.
1021
1022=head1 COPYRIGHT
1023
1024Copyright 2002-2012 Dan Kogai I<< <dankogai@cpan.org> >>.
1025
1026This library is free software; you can redistribute it and/or modify
1027it under the same terms as Perl itself.
1028
1029=cut
1030