xref: /openbsd-src/gnu/usr.bin/perl/cpan/Encode/Encode.pm (revision 4e1ee0786f11cc571bd0be17d38e46f635c719fc)
1#
2# $Id: Encode.pm,v 3.06 2020/05/02 02:31:14 dankogai Exp $
3#
4package Encode;
5use strict;
6use warnings;
7use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG};
8our $VERSION;
BEGIN {
    # Assigned at compile time so the value is ready for XSLoader below.
    $VERSION = "3.06_01";
    # Numify the version string: evaluated as a numeric literal, the
    # underscore is only a digit separator.
    $VERSION = eval $VERSION;
    require XSLoader;
    # Load the compiled (XS) part of this package, passing the version.
    XSLoader::load( __PACKAGE__, $VERSION );
}
15
16use Exporter 5.57 'import';
17
18use Carp ();
19our @CARP_NOT = qw(Encode::Encoder);
20
21# Public, encouraged API is exported by default
22
# Names exported by default.
our @EXPORT = qw(
  decode  decode_utf8  encode  encode_utf8 str2bytes bytes2str
  encodings  find_encoding find_mime_encoding clone_encoding
);
# Names of the individual fallback flag bits.
our @FB_FLAGS = qw(
  DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC
  PERLQQ HTMLCREF XMLCREF STOP_AT_PARTIAL
);
# Names of the predefined FB_* fallback constants.
our @FB_CONSTS = qw(
  FB_DEFAULT FB_CROAK FB_QUIET FB_WARN
  FB_PERLQQ FB_HTMLCREF FB_XMLCREF
);
# Names that are exported only on request, including every fallback
# flag and constant listed above.
our @EXPORT_OK = (
    qw(
      _utf8_off _utf8_on define_encoding from_to is_16bit is_8bit
      is_utf8 perlio_ok resolve_alias utf8_downgrade utf8_upgrade
      ),
    @FB_FLAGS, @FB_CONSTS,
);

# Export tags built from the lists above.
our %EXPORT_TAGS = (
    all          => [ @EXPORT,    @EXPORT_OK ],
    default      => [ @EXPORT ],
    fallbacks    => [ @FB_CONSTS ],
    fallback_all => [ @FB_CONSTS, @FB_FLAGS ],
);
49
50# Documentation moved after __END__ for speed - NI-S
51
52our $ON_EBCDIC = ( ord("A") == 193 );
53
54use Encode::Alias ();
55use Encode::MIME::Name;
56
57use Storable;
58
# Make a %Encoding package variable to allow a certain amount of cheating
# (name => encoding object; entries are stored by define_encoding()).
our %Encoding;
# Encoding name => name of the module that getEncoding() loads on demand.
our %ExtModule;
require Encode::Config;
#  See
#  https://bugzilla.redhat.com/show_bug.cgi?id=435505#c2
#  to find why sig handlers inside eval{} are disabled.
#  The local configuration module is optional: the eval discards any
#  failure to load it.
eval {
    local $SIG{__DIE__};
    local $SIG{__WARN__};
    # Search a private copy of @INC with a trailing '.' removed, so the
    # optional module is not looked up in the current directory.
    local @INC = @INC;
    pop @INC if @INC && $INC[-1] eq '.';
    require Encode::ConfigLocal;
};
73
# encodings(CLASS [, ":all" | MODULE ...])
#
# Returns encoding names sorted case-insensitively.
#
# Only the SECOND argument is compared with ":all", so that form needs
# the method-call style (Encode->encodings(":all")); it selects every key
# of %Encoding and %ExtModule.
#
# Otherwise the result starts from the keys of %Encoding, and every
# %ExtModule entry whose module matches one of the arguments is added.
# An argument without "::" gets "Encode::" prepended before comparison.
#
# The names Internal, Unicode and Guess are never listed.
sub encodings {
    my $selector = $_[1] || '';
    my %found;
    if ( $selector ne ":all" ) {
        %found = %Encoding;
        my @wanted = map { m/::/ ? $_ : "Encode::$_" } @_;
        for my $module (@wanted) {
            DEBUG and warn $module;
            my @provided = grep { $ExtModule{$_} eq $module } keys %ExtModule;
            $found{$_} = $module for @provided;
        }
    }
    else {
        %found = ( %Encoding, %ExtModule );
    }
    my @visible = grep { !/^(?:Internal|Unicode|Guess)$/o } keys %found;
    return sort { lc $a cmp lc $b } @visible;
}
92
# perlio_ok(ENCODING)
#
# ENCODING is either a reference (used as the encoding object as is) or
# a name that is resolved with find_encoding().
#
# Returns the result of the object's own perlio_ok() method.  Returns 0
# when the name cannot be resolved or the object has no such method.
sub perlio_ok {
    my $obj = ref( $_[0] ) ? $_[0] : find_encoding( $_[0] );

    # find_encoding() yields undef for an unknown name; answer "no"
    # instead of dying on a method call against undef.
    defined $obj or return 0;
    $obj->can("perlio_ok") and return $obj->perlio_ok();
    return 0;    # safety net
}
98
# define_encoding(OBJECT, NAME [, ALIAS ...])
#
# Stores OBJECT in %Encoding under NAME.  If NAME is not already all
# lowercase, its lowercased form is registered as an alias, followed by
# every extra ALIAS given.  The object's class is appended to the
# CARP_NOT lists of Encode and Encode::Encoding unless already present.
# Returns OBJECT.
sub define_encoding {
    my ( $obj, $name, @aliases ) = @_;

    $Encoding{$name} = $obj;

    my $lower = lc($name);
    if ( $lower ne $name ) {
        define_alias( $lower => $obj );
    }
    for my $alias (@aliases) {
        define_alias( $alias, $obj );
    }

    my $class = ref($obj);
    for my $carp_not ( \@Encode::CARP_NOT, \@Encode::Encoding::CARP_NOT ) {
        next if grep { $_ eq $class } @$carp_not;
        push @$carp_not, $class;
    }
    return $obj;
}
114
# CLASS->getEncoding(NAME [, SKIP_EXTERNAL])
#
# Resolves NAME to an encoding object.  Lookup order:
#   1. NAME is itself a reference whose class can 'renew': returned as is.
#   2. %Encoding, by NAME (whitespace removed) and then by its lowercase.
#   3. CLASS->find_alias(), by NAME and then by its lowercase.
#   4. Unless SKIP_EXTERNAL is true: the module listed in %ExtModule is
#      loaded and %Encoding is consulted again.
# Returns nothing (undef / empty list) when NAME is undefined or cannot
# be resolved.
sub getEncoding {
    my ( $class, $name, $skip_external ) = @_;

    defined($name) or return;

    # Test for an encoding object BEFORE touching $name as a string: a
    # successful s/// would replace the reference with its stringified
    # form and the object would be lost.
    ref($name) && $name->can('renew') and return $name;

    $name =~ s/\s+//g; # https://rt.cpan.org/Ticket/Display.html?id=65796

    exists $Encoding{$name} and return $Encoding{$name};
    my $lc = lc $name;
    exists $Encoding{$lc} and return $Encoding{$lc};

    my $oc = $class->find_alias($name);
    defined($oc) and return $oc;
    $lc ne $name and $oc = $class->find_alias($lc);
    defined($oc) and return $oc;

    unless ($skip_external) {
        if ( my $mod = $ExtModule{$name} || $ExtModule{$lc} ) {
            $mod =~ s,::,/,g;
            $mod .= '.pm';

            # Best effort: a module that fails to load just means the
            # encoding stays unknown.
            eval { require $mod; };

            # The module may have been found through either spelling, so
            # look the result up under both as well.
            for my $key ( $name, $lc ) {
                exists $Encoding{$key} and return $Encoding{$key};
            }
        }
    }
    return;
}
142
# HACK: These two functions must be defined in Encode and because of
# cyclic dependency between Encode and Encode::Alias, Exporter does not work

# Forwards to Encode::Alias::find_alias.  "goto &sub" hands over the
# current @_ unchanged and replaces this call frame.
sub find_alias {
    goto &Encode::Alias::find_alias;
}
# Forwards to Encode::Alias::define_alias.  "goto &sub" hands over the
# current @_ unchanged and replaces this call frame.
sub define_alias {
    goto &Encode::Alias::define_alias;
}
151
# find_encoding(NAME [, SKIP_EXTERNAL])
#
# Function-style front end for the getEncoding() class method, invoked
# on this package.  Both arguments are passed through as given;
# the result of getEncoding() is returned.
sub find_encoding($;$) {
    my $name          = shift;
    my $skip_external = shift;
    return __PACKAGE__->getEncoding( $name, $skip_external );
}
156
# find_mime_encoding(MIME_NAME [, SKIP_EXTERNAL])
#
# Translates MIME_NAME with Encode::MIME::Name::get_encode_name() and
# resolves the translated name with find_encoding().
sub find_mime_encoding($;$) {
    my $mime_name     = shift;
    my $skip_external = shift;
    my $encode_name   = Encode::MIME::Name::get_encode_name($mime_name);
    return find_encoding( $encode_name, $skip_external );
}
162
# resolve_alias(NAME)
#
# Returns the name() of the encoding object that find_encoding() yields
# for NAME, or nothing (undef / empty list) when there is none.
sub resolve_alias($) {
    my ($alias) = @_;
    my $encoding = find_encoding($alias);
    return unless defined $encoding;
    return $encoding->name;
}
168
# clone_encoding(NAME)
#
# Looks NAME up with find_encoding() and returns a deep copy of the
# resulting object made with Storable::dclone().  Returns nothing
# (undef / empty list) when the lookup does not yield a reference.
sub clone_encoding($) {
    my ($name) = @_;
    my $original = find_encoding($name);
    return unless ref $original;
    return Storable::dclone($original);
}
174
# NOTE(review): onBOOT is not defined anywhere in this file; presumably
# it is supplied by the XS code loaded in the BEGIN block -- confirm.
onBOOT;

# Register the encoding named 'Unicode'.  The package statements,
# "use parent" lines and named subs in BOTH branches are processed at
# compile time; only the bless/define_encoding statements depend on
# $ON_EBCDIC at run time.
if ($ON_EBCDIC) {
    package Encode::UTF_EBCDIC;
    use parent 'Encode::Encoding';
    my $obj = bless { Name => "UTF_EBCDIC" } => "Encode::UTF_EBCDIC";
    Encode::define_encoding($obj, 'Unicode');

    # decode(OBJ, STR [, CHK]): returns a new string in which every
    # character's ordinal has been passed through
    # utf8::unicode_to_native().
    sub decode {
        my ( undef, $str, $chk ) = @_;
        my $res = '';
        for ( my $i = 0 ; $i < length($str) ; $i++ ) {
            $res .=
              chr(
                utf8::unicode_to_native( ord( substr( $str, $i, 1 ) ) )
              );
        }
        # $_[1] aliases the caller's variable: a true CHK empties it.
        $_[1] = '' if $chk;
        return $res;
    }

    # encode(OBJ, STR [, CHK]): the reverse mapping, using
    # utf8::native_to_unicode() on every character's ordinal.
    sub encode {
        my ( undef, $str, $chk ) = @_;
        my $res = '';
        for ( my $i = 0 ; $i < length($str) ; $i++ ) {
            $res .=
              chr(
                utf8::native_to_unicode( ord( substr( $str, $i, 1 ) ) )
              );
        }
        # $_[1] aliases the caller's variable: a true CHK empties it.
        $_[1] = '' if $chk;
        return $res;
    }
} else {
    package Encode::Internal;
    use parent 'Encode::Encoding';
    my $obj = bless { Name => "Internal" } => "Encode::Internal";
    Encode::define_encoding($obj, 'Unicode');

    # decode(OBJ, STR [, CHK]): returns a copy of STR after
    # utf8::upgrade(); the caller's string is only touched when CHK is
    # true, in which case it is emptied.
    sub decode {
        my ( undef, $str, $chk ) = @_;
        utf8::upgrade($str);
        $_[1] = '' if $chk;
        return $str;
    }
    # Encoding is the very same operation as decoding here.
    *encode = \&decode;
}
219
{
    # https://rt.cpan.org/Public/Bug/Display.html?id=103253
    # Declares Encode::Encoding as the parent class of Encode::XS.
    package Encode::XS;
    use parent 'Encode::Encoding';
}
225
{
    package Encode::utf8;
    use parent 'Encode::Encoding';

    # The two encoding objects of this class; 'utf-8-strict' differs
    # only by carrying the strict_utf8 attribute.
    my %obj = (
        'utf8'         => { Name => 'utf8' },
        'utf-8-strict' => { Name => 'utf-8-strict', strict_utf8 => 1 }
    );
    # Bless each hash into this package and register it under its key.
    for ( keys %obj ) {
        bless $obj{$_} => __PACKAGE__;
        Encode::define_encoding( $obj{$_} => $_ );
    }

    # cat_decode(OBJ, DST, SRC, POS, TRM [, CHK])
    #
    # Appends to DST the part of SRC that starts at offset POS and runs
    # up to and including the first occurrence of the terminator TRM,
    # sets POS to the offset just behind the terminator and returns 1.
    # Without a terminator the rest of SRC is appended, POS is set to
    # the length of SRC and '' is returned.  DST and POS are the
    # caller's own variables and are modified in place.
    sub cat_decode {
        # ($obj, $dst, $src, $pos, $trm, $chk)
        # currently ignores $chk
        my ( undef, undef, undef, $pos, $trm ) = @_;
        # References to the elements of @_, i.e. to the caller's variables.
        my ( $rdst, $rsrc, $rpos ) = \@_[ 1, 2, 3 ];
        # The bytes pragma is in effect for the rest of this sub, so
        # index(), substr() and length() below are subject to it.
        use bytes;
        if ( ( my $npos = index( $$rsrc, $trm, $pos ) ) >= 0 ) {
            $$rdst .=
              substr( $$rsrc, $pos, $npos - $pos + length($trm) );
            $$rpos = $npos + length($trm);
            return 1;
        }
        $$rdst .= substr( $$rsrc, $pos );
        $$rpos = length($$rsrc);
        return '';
    }
}
254
2551;
256
257__END__
258
259=head1 NAME
260
261Encode - character encodings in Perl
262
263=head1 SYNOPSIS
264
265    use Encode qw(decode encode);
266    $characters = decode('UTF-8', $octets,     Encode::FB_CROAK);
267    $octets     = encode('UTF-8', $characters, Encode::FB_CROAK);
268
269=head2 Table of Contents
270
271Encode consists of a collection of modules whose details are too extensive
272to fit in one document.  This one itself explains the top-level APIs
273and general topics at a glance.  For other topics and more details,
274see the documentation for these modules:
275
276=over 2
277
278=item L<Encode::Alias> - Alias definitions to encodings
279
280=item L<Encode::Encoding> - Encode Implementation Base Class
281
282=item L<Encode::Supported> - List of Supported Encodings
283
284=item L<Encode::CN> - Simplified Chinese Encodings
285
286=item L<Encode::JP> - Japanese Encodings
287
288=item L<Encode::KR> - Korean Encodings
289
290=item L<Encode::TW> - Traditional Chinese Encodings
291
292=back
293
294=head1 DESCRIPTION
295
296The C<Encode> module provides the interface between Perl strings
297and the rest of the system.  Perl strings are sequences of
298I<characters>.
299
300The repertoire of characters that Perl can represent is a superset of those
301defined by the Unicode Consortium. On most platforms the ordinal
302values of a character as returned by C<ord(I<S>)> is the I<Unicode
303codepoint> for that character. The exceptions are platforms where
304the legacy encoding is some variant of EBCDIC rather than a superset
305of ASCII; see L<perlebcdic>.
306
307During recent history, data is moved around a computer in 8-bit chunks,
308often called "bytes" but also known as "octets" in standards documents.
309Perl is widely used to manipulate data of many types: not only strings of
310characters representing human or computer languages, but also "binary"
311data, being the machine's representation of numbers, pixels in an image, or
312just about anything.
313
314When Perl is processing "binary data", the programmer wants Perl to
315process "sequences of bytes". This is not a problem for Perl: because a
316byte has 256 possible values, it easily fits in Perl's much larger
317"logical character".
318
319This document mostly explains the I<how>. L<perlunitut> and L<perlunifaq>
320explain the I<why>.
321
322=head2 TERMINOLOGY
323
324=head3 character
325
326A character in the range 0 .. 2**32-1 (or more);
327what Perl's strings are made of.
328
329=head3 byte
330
331A character in the range 0..255;
332a special case of a Perl character.
333
334=head3 octet
335
3368 bits of data, with ordinal values 0..255;
337term for bytes passed to or from a non-Perl context, such as a disk file,
338standard I/O stream, database, command-line argument, environment variable,
339socket etc.
340
341=head1 THE PERL ENCODING API
342
343=head2 Basic methods
344
345=head3 encode
346
347  $octets  = encode(ENCODING, STRING[, CHECK])
348
349Encodes the scalar value I<STRING> from Perl's internal form into
350I<ENCODING> and returns a sequence of octets.  I<ENCODING> can be either a
351canonical name or an alias.  For encoding names and aliases, see
352L</"Defining Aliases">.  For CHECK, see L</"Handling Malformed Data">.
353
354B<CAVEAT>: the input scalar I<STRING> might be modified in-place depending
355on what is set in CHECK. See L</LEAVE_SRC> if you want your inputs to be
356left unchanged.
357
358For example, to convert a string from Perl's internal format into
359ISO-8859-1, also known as Latin1:
360
361  $octets = encode("iso-8859-1", $string);
362
363B<CAVEAT>: When you run C<$octets = encode("UTF-8", $string)>, then
364$octets I<might not be equal to> $string.  Though both contain the
365same data, the UTF8 flag for $octets is I<always> off.  When you
366encode anything, the UTF8 flag on the result is always off, even when it
367contains a completely valid UTF-8 string. See L</"The UTF8 flag"> below.
368
369If the $string is C<undef>, then C<undef> is returned.
370
371C<str2bytes> may be used as an alias for C<encode>.
372
373=head3 decode
374
375  $string = decode(ENCODING, OCTETS[, CHECK])
376
377This function returns the string that results from decoding the scalar
378value I<OCTETS>, assumed to be a sequence of octets in I<ENCODING>, into
379Perl's internal form.  As with encode(),
380I<ENCODING> can be either a canonical name or an alias. For encoding names
381and aliases, see L</"Defining Aliases">; for I<CHECK>, see L</"Handling
382Malformed Data">.
383
384B<CAVEAT>: the input scalar I<OCTETS> might be modified in-place depending
385on what is set in CHECK. See L</LEAVE_SRC> if you want your inputs to be
386left unchanged.
387
388For example, to convert ISO-8859-1 data into a string in Perl's
389internal format:
390
391  $string = decode("iso-8859-1", $octets);
392
393B<CAVEAT>: When you run C<$string = decode("UTF-8", $octets)>, then $string
394I<might not be equal to> $octets.  Though both contain the same data, the
395UTF8 flag for $string is on.  See L</"The UTF8 flag">
396below.
397
398If $octets is C<undef>, then C<undef> is returned.
399
400C<bytes2str> may be used as an alias for C<decode>.
401
402=head3 find_encoding
403
404  [$obj =] find_encoding(ENCODING)
405
406Returns the I<encoding object> corresponding to I<ENCODING>.  Returns
407C<undef> if no matching I<ENCODING> is found.  The returned object is
408what does the actual encoding or decoding.
409
410  $string = decode($name, $bytes);
411
412is in fact
413
414    $string = do {
415        $obj = find_encoding($name);
416        croak qq(encoding "$name" not found) unless ref $obj;
417        $obj->decode($bytes);
418    };
419
420with more error checking.
421
422You can therefore save time by reusing this object as follows;
423
424    my $enc = find_encoding("iso-8859-1");
425    while(<>) {
426        my $string = $enc->decode($_);
427        ... # now do something with $string;
428    }
429
430Besides L</decode> and L</encode>, other methods are
431available as well.  For instance, C<name()> returns the canonical
432name of the encoding object.
433
434  find_encoding("latin1")->name; # iso-8859-1
435
436See L<Encode::Encoding> for details.
437
438=head3 find_mime_encoding
439
440  [$obj =] find_mime_encoding(MIME_ENCODING)
441
442Returns the I<encoding object> corresponding to I<MIME_ENCODING>.  Acts the
443same as C<find_encoding()>, except that the C<mime_name()> of the returned
444object must match I<MIME_ENCODING>.  Unlike C<find_encoding()>, canonical
445names and aliases are not used when searching for the object.
446
447    find_mime_encoding("utf8");         # returns undef because "utf8" is not a valid MIME_ENCODING
448    find_mime_encoding("utf-8");        # returns encoding object "utf-8-strict"
449    find_mime_encoding("UTF-8");        # same as "utf-8" because MIME_ENCODING is case insensitive
450    find_mime_encoding("utf-8-strict"); # returns undef because "utf-8-strict" is not a valid MIME_ENCODING
451
452=head3 from_to
453
454  [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])
455
456Converts I<in-place> data between two encodings. The data in $octets
457must be encoded as octets and I<not> as characters in Perl's internal
458format. For example, to convert ISO-8859-1 data into Microsoft's CP1250
459encoding:
460
461  from_to($octets, "iso-8859-1", "cp1250");
462
463and to convert it back:
464
465  from_to($octets, "cp1250", "iso-8859-1");
466
467Because the conversion happens in place, the data to be
468converted cannot be a string constant: it must be a scalar variable.
469
470C<from_to()> returns the length of the converted string in octets on success,
471and C<undef> on error.
472
473B<CAVEAT>: The following operations may look the same, but are not:
474
475  from_to($data, "iso-8859-1", "UTF-8"); #1
476  $data = decode("iso-8859-1", $data);  #2
477
478Both #1 and #2 make $data consist of a completely valid UTF-8 string,
479but only #2 turns the UTF8 flag on.  #1 is equivalent to:
480
481  $data = encode("UTF-8", decode("iso-8859-1", $data));
482
483See L</"The UTF8 flag"> below.
484
485Also note that:
486
487  from_to($octets, $from, $to, $check);
488
489is equivalent to:
490
491  $octets = encode($to, decode($from, $octets), $check);
492
493Yes, it does I<not> respect the $check during decoding.  It is
494deliberately done that way.  If you need minute control, use C<decode>
495followed by C<encode> as follows:
496
497  $octets = encode($to, decode($from, $octets, $check_from), $check_to);
498
499=head3 encode_utf8
500
501  $octets = encode_utf8($string);
502
503Equivalent to C<$octets = encode("utf8", $string)>.  The characters in
504$string are encoded in Perl's internal format, and the result is returned
505as a sequence of octets.  Because all possible characters in Perl have a
506(loose, not strict) utf8 representation, this function cannot fail.
507
508B<WARNING>: do not use this function for data exchange, as it can produce
509$octets that are not strictly valid UTF-8! For strictly valid UTF-8 output use
510C<$octets = encode("UTF-8", $string)>.
511
512=head3 decode_utf8
513
514  $string = decode_utf8($octets [, CHECK]);
515
516Equivalent to C<$string = decode("utf8", $octets [, CHECK])>.
517The sequence of octets represented by $octets is decoded
518from (loose, not strict) utf8 into a sequence of logical characters.
519Because not all sequences of octets are valid utf8, even in this loose form,
520it is quite possible for this function to fail.
521For CHECK, see L</"Handling Malformed Data">.
522
523B<WARNING>: do not use this function for data exchange as it can produce
524$string with not strict utf8 representation! For strictly valid UTF-8
525$string representation use C<$string = decode("UTF-8", $octets [, CHECK])>.
526
527B<CAVEAT>: the input I<$octets> might be modified in-place depending on
528what is set in CHECK. See L</LEAVE_SRC> if you want your inputs to be
529left unchanged.
530
531=head2 Listing available encodings
532
533  use Encode;
534  @list = Encode->encodings();
535
536Returns a list of canonical names of available encodings that have already
537been loaded.  To get a list of all available encodings including those that
538have not yet been loaded, say:
539
540  @all_encodings = Encode->encodings(":all");
541
542Or you can give the name of a specific module:
543
544  @with_jp = Encode->encodings("Encode::JP");
545
546When "C<::>" is not in the name, "C<Encode::>" is assumed.
547
548  @ebcdic = Encode->encodings("EBCDIC");
549
550To find out in detail which encodings are supported by this package,
551see L<Encode::Supported>.
552
553=head2 Defining Aliases
554
555To add a new alias to a given encoding, use:
556
557  use Encode;
558  use Encode::Alias;
559  define_alias(NEWNAME => ENCODING);
560
561After that, I<NEWNAME> can be used as an alias for I<ENCODING>.
562I<ENCODING> may be either the name of an encoding or an
563I<encoding object>.
564
565Before you do that, first make sure the alias is nonexistent using
566C<resolve_alias()>, which returns the canonical name thereof.
567For example:
568
569  Encode::resolve_alias("latin1") eq "iso-8859-1" # true
570  Encode::resolve_alias("iso-8859-12")   # false; nonexistent
571  Encode::resolve_alias($name) eq $name  # true if $name is canonical
572
573C<resolve_alias()> does not need C<use Encode::Alias>; it can be
574imported via C<use Encode qw(resolve_alias)>.
575
576See L<Encode::Alias> for details.
577
578=head2 Finding IANA Character Set Registry names
579
580The canonical name of a given encoding does not necessarily agree with
581IANA Character Set Registry, commonly seen as C<< Content-Type:
582text/plain; charset=I<WHATEVER> >>.  For most cases, the canonical name
583works, but sometimes it does not, most notably with "utf-8-strict".
584
585As of C<Encode> version 2.21, a new method C<mime_name()> is therefore added.
586
587  use Encode;
588  my $enc = find_encoding("UTF-8");
589  warn $enc->name;      # utf-8-strict
590  warn $enc->mime_name; # UTF-8
591
592See also:  L<Encode::Encoding>
593
594=head1 Encoding via PerlIO
595
596If your perl supports C<PerlIO> (which is the default), you can use a
597C<PerlIO> layer to decode and encode directly via a filehandle.  The
598following two examples are fully identical in functionality:
599
600  ### Version 1 via PerlIO
601    open(INPUT,  "< :encoding(shiftjis)", $infile)
602        || die "Can't open < $infile for reading: $!";
603    open(OUTPUT, "> :encoding(euc-jp)",  $outfile)
604        || die "Can't open > $output for writing: $!";
605    while (<INPUT>) {   # auto decodes $_
606        print OUTPUT;   # auto encodes $_
607    }
608    close(INPUT)   || die "can't close $infile: $!";
609    close(OUTPUT)  || die "can't close $outfile: $!";
610
611  ### Version 2 via from_to()
612    open(INPUT,  "< :raw", $infile)
613        || die "Can't open < $infile for reading: $!";
614    open(OUTPUT, "> :raw",  $outfile)
615        || die "Can't open > $output for writing: $!";
616
617    while (<INPUT>) {
618        from_to($_, "shiftjis", "euc-jp", 1);  # switch encoding
619        print OUTPUT;   # emit raw (but properly encoded) data
620    }
621    close(INPUT)   || die "can't close $infile: $!";
622    close(OUTPUT)  || die "can't close $outfile: $!";
623
624In the first version above, you let the appropriate encoding layer
625handle the conversion.  In the second, you explicitly translate
626from one encoding to the other.
627
628Unfortunately, it may be that encodings are not C<PerlIO>-savvy.  You can check
629to see whether your encoding is supported by C<PerlIO> by invoking the
630C<perlio_ok> method on it:
631
632  Encode::perlio_ok("hz");             # false
633  find_encoding("euc-cn")->perlio_ok;  # true wherever PerlIO is available
634
635  use Encode qw(perlio_ok);            # imported upon request
636  perlio_ok("euc-jp")
637
638Fortunately, all encodings that come with C<Encode> core are C<PerlIO>-savvy
639except for C<hz> and C<ISO-2022-kr>.  For the gory details, see
640L<Encode::Encoding> and L<Encode::PerlIO>.
641
642=head1 Handling Malformed Data
643
644The optional I<CHECK> argument tells C<Encode> what to do when
645encountering malformed data.  Without I<CHECK>, C<Encode::FB_DEFAULT>
646(== 0) is assumed.
647
648As of version 2.12, C<Encode> supports coderef values for C<CHECK>;
649see below.
650
651B<NOTE:> Not all encodings support this feature.
652Some encodings ignore the I<CHECK> argument.  For example,
653L<Encode::Unicode> ignores I<CHECK> and it always croaks on error.
654
655=head2 List of I<CHECK> values
656
657=head3 FB_DEFAULT
658
659  I<CHECK> = Encode::FB_DEFAULT ( == 0)
660
661If I<CHECK> is 0, encoding and decoding replace any malformed character
662with a I<substitution character>.  When you encode, I<SUBCHAR> is used.
663When you decode, the Unicode REPLACEMENT CHARACTER, code point U+FFFD, is
664used.  If the data is supposed to be UTF-8, an optional lexical warning of
665warning category C<"utf8"> is given.
666
667=head3 FB_CROAK
668
669  I<CHECK> = Encode::FB_CROAK ( == 1)
670
671If I<CHECK> is 1, methods immediately die with an error
672message.  Therefore, when I<CHECK> is 1, you should trap
673exceptions with C<eval{}>, unless you really want to let it C<die>.
674
675=head3 FB_QUIET
676
677  I<CHECK> = Encode::FB_QUIET
678
679If I<CHECK> is set to C<Encode::FB_QUIET>, encoding and decoding immediately
680return the portion of the data that has been processed so far when an
681error occurs. The data argument is overwritten with everything
682after that point; that is, the unprocessed portion of the data.  This is
683handy when you have to call C<decode> repeatedly in the case where your
684source data may contain partial multi-byte character sequences,
685(that is, you are reading with a fixed-width buffer). Here's some sample
686code to do exactly that:
687
688    my($buffer, $string) = ("", "");
689    while (read($fh, $buffer, 256, length($buffer))) {
690        $string .= decode($encoding, $buffer, Encode::FB_QUIET);
691        # $buffer now contains the unprocessed partial character
692    }
693
694=head3 FB_WARN
695
696  I<CHECK> = Encode::FB_WARN
697
698This is the same as C<FB_QUIET> above, except that instead of being silent
699on errors, it issues a warning.  This is handy for when you are debugging.
700
701B<CAVEAT>: All warnings from the Encode module are reported, independently of
702L<pragma warnings|warnings> settings. If you want Encode to honor the
703lexical warnings configured by L<pragma warnings|warnings>, also add the
704check value C<Encode::ONLY_PRAGMA_WARNINGS>. This value is available
705since Encode version 2.99.
706
707=head3 FB_PERLQQ FB_HTMLCREF FB_XMLCREF
708
709=over 2
710
711=item perlqq mode (I<CHECK> = Encode::FB_PERLQQ)
712
713=item HTML charref mode (I<CHECK> = Encode::FB_HTMLCREF)
714
715=item XML charref mode (I<CHECK> = Encode::FB_XMLCREF)
716
717=back
718
719For encodings that are implemented by the C<Encode::XS> module, C<CHECK> C<==>
720C<Encode::FB_PERLQQ> puts C<encode> and C<decode> into C<perlqq> fallback mode.
721
722When you decode, C<\xI<HH>> is inserted for a malformed character, where
723I<HH> is the hex representation of the octet that could not be decoded to
724utf8.  When you encode, C<\x{I<HHHH>}> will be inserted, where I<HHHH> is
725the Unicode code point (in any number of hex digits) of the character that
726cannot be found in the character repertoire of the encoding.
727
728The HTML/XML character reference modes are about the same. In place of
729C<\x{I<HHHH>}>, HTML uses C<&#I<NNN>;> where I<NNN> is a decimal number, and
730XML uses C<&#xI<HHHH>;> where I<HHHH> is the hexadecimal number.
731
732In C<Encode> 2.10 or later, C<LEAVE_SRC> is also implied.
733
734=head3 The bitmask
735
736These modes are all actually set via a bitmask.  Here is how the C<FB_I<XXX>>
737constants are laid out.  You can import the C<FB_I<XXX>> constants via
738C<use Encode qw(:fallbacks)>, and you can import the generic bitmask
739constants via C<use Encode qw(:fallback_all)>.
740
741                     FB_DEFAULT FB_CROAK FB_QUIET FB_WARN  FB_PERLQQ
742 DIE_ON_ERR    0x0001             X
743 WARN_ON_ERR   0x0002                               X
744 RETURN_ON_ERR 0x0004                      X        X
745 LEAVE_SRC     0x0008                                        X
746 PERLQQ        0x0100                                        X
747 HTMLCREF      0x0200
748 XMLCREF       0x0400
749
750=head3 LEAVE_SRC
751
752  Encode::LEAVE_SRC
753
754If the C<Encode::LEAVE_SRC> bit is I<not> set but I<CHECK> is set, then the
755source string to encode() or decode() will be overwritten in place.
756If you do not want the source overwritten, bitwise-OR C<Encode::LEAVE_SRC> into I<CHECK>.
757
758=head2 coderef for CHECK
759
760As of C<Encode> 2.12, C<CHECK> can also be a code reference which takes the
761ordinal value of the unmapped character as an argument and returns
762octets that represent the fallback character.  For instance:
763
764  $ascii = encode("ascii", $utf8, sub{ sprintf "<U+%04X>", shift });
765
766Acts like C<FB_PERLQQ> but U+I<XXXX> is used instead of C<\x{I<XXXX>}>.
767
768Fallback for C<decode> must return decoded string (sequence of characters)
769and takes a list of ordinal values as its arguments. So for
770example if you wish to decode octets as UTF-8, and use ISO-8859-15 as
771a fallback for bytes that are not valid UTF-8, you could write
772
773    $str = decode 'UTF-8', $octets, sub {
774        my $tmp = join '', map chr, @_;
775        return decode 'ISO-8859-15', $tmp;
776    };
777
778=head1 Defining Encodings
779
780To define a new encoding, use:
781
782    use Encode qw(define_encoding);
783    define_encoding($object, CANONICAL_NAME [, alias...]);
784
785I<CANONICAL_NAME> will be associated with I<$object>.  The object
786should provide the interface described in L<Encode::Encoding>.
787If more than two arguments are provided, additional
788arguments are considered aliases for I<$object>.
789
790See L<Encode::Encoding> for details.
791
792=head1 The UTF8 flag
793
794Before the introduction of Unicode support in Perl, the C<eq> operator
795just compared the strings represented by two scalars. Beginning with
796Perl 5.8, C<eq> compares two strings with simultaneous consideration of
797I<the UTF8 flag>. To explain why we made it so, I quote from page 402 of
798I<Programming Perl, 3rd ed.>
799
800=over 2
801
802=item Goal #1:
803
804Old byte-oriented programs should not spontaneously break on the old
805byte-oriented data they used to work on.
806
807=item Goal #2:
808
809Old byte-oriented programs should magically start working on the new
810character-oriented data when appropriate.
811
812=item Goal #3:
813
814Programs should run just as fast in the new character-oriented mode
815as in the old byte-oriented mode.
816
817=item Goal #4:
818
819Perl should remain one language, rather than forking into a
820byte-oriented Perl and a character-oriented Perl.
821
822=back
823
824When I<Programming Perl, 3rd ed.> was written, not even Perl 5.6.0 had been
825born yet, and many features documented in the book remained unimplemented for a
826long time.  Perl 5.8 corrected much of this, and the introduction of the
827UTF8 flag is one of them.  You can think of there being two fundamentally
828different kinds of strings and string-operations in Perl: one a
829byte-oriented mode  for when the internal UTF8 flag is off, and the other a
830character-oriented mode for when the internal UTF8 flag is on.
831
832This UTF8 flag is not visible in Perl scripts, exactly for the same reason
833you cannot (or rather, you I<don't have to>) see whether a scalar contains
834a string, an integer, or a floating-point number.   But you can still peek
835and poke these if you will.  See the next section.
836
837=head2 Messing with Perl's Internals
838
839The following API uses parts of Perl's internals in the current
840implementation.  As such, they are efficient but may change in a future
841release.
842
843=head3 is_utf8
844
845  is_utf8(STRING [, CHECK])
846
847[INTERNAL] Tests whether the UTF8 flag is turned on in the I<STRING>.
848If I<CHECK> is true, also checks whether I<STRING> contains well-formed
849UTF-8.  Returns true if successful, false otherwise.
850
851Typically only necessary for debugging and testing.  Don't use this flag as
852a marker to distinguish character and binary data, that should be decided
853for each variable when you write your code.
854
855B<CAVEAT>: If I<STRING> has UTF8 flag set, it does B<NOT> mean that
856I<STRING> is UTF-8 encoded and vice-versa.
857
858As of Perl 5.8.1, L<utf8> also has the C<utf8::is_utf8> function.
859
860=head3 _utf8_on
861
862  _utf8_on(STRING)
863
864[INTERNAL] Turns the I<STRING>'s internal UTF8 flag B<on>.  The I<STRING>
865is I<not> checked for containing only well-formed UTF-8.  Do not use this
866unless you I<know with absolute certainty> that the STRING holds only
867well-formed UTF-8.  Returns the previous state of the UTF8 flag (so please
868don't treat the return value as indicating success or failure), or C<undef>
869if I<STRING> is not a string.
870
871B<NOTE>: For security reasons, this function does not work on tainted values.
872
873=head3 _utf8_off
874
875  _utf8_off(STRING)
876
877[INTERNAL] Turns the I<STRING>'s internal UTF8 flag B<off>.  Do not use
878frivolously.  Returns the previous state of the UTF8 flag, or C<undef> if
879I<STRING> is not a string.  Do not treat the return value as indicative of
880success or failure, because that isn't what it means: it is only the
881previous setting.
882
883B<NOTE>: For security reasons, this function does not work on tainted values.
884
885=head1 UTF-8 vs. utf8 vs. UTF8
886
887  ....We now view strings not as sequences of bytes, but as sequences
888  of numbers in the range 0 .. 2**32-1 (or in the case of 64-bit
889  computers, 0 .. 2**64-1) -- Programming Perl, 3rd ed.
890
891That has historically been Perl's notion of UTF-8, as that is how UTF-8 was
892first conceived by Ken Thompson when he invented it. However, thanks to
893later revisions to the applicable standards, official UTF-8 is now rather
894stricter than that. For example, its range is much narrower (0 .. 0x10_FFFF
895to cover only 21 bits instead of 32 or 64 bits) and some sequences
896are not allowed, like those used in surrogate pairs, the 32 non-character
897code points 0xFDD0 .. 0xFDEF, the last two code points in I<any> plane
898(0xI<XX>_FFFE and 0xI<XX>_FFFF), all non-shortest encodings, etc.
899
900The former default in which Perl would always use a loose interpretation of
901UTF-8 has now been overruled:
902
903  From: Larry Wall <larry@wall.org>
904  Date: December 04, 2004 11:51:58 JST
905  To: perl-unicode@perl.org
906  Subject: Re: Make Encode.pm support the real UTF-8
907  Message-Id: <20041204025158.GA28754@wall.org>
908
909  On Fri, Dec 03, 2004 at 10:12:12PM +0000, Tim Bunce wrote:
910  : I've no problem with 'utf8' being perl's unrestricted uft8 encoding,
911  : but "UTF-8" is the name of the standard and should give the
912  : corresponding behaviour.
913
914  For what it's worth, that's how I've always kept them straight in my
915  head.
916
917  Also for what it's worth, Perl 6 will mostly default to strict but
918  make it easy to switch back to lax.
919
920  Larry
921
922Got that?  As of Perl 5.8.7, B<"UTF-8"> means UTF-8 in its current
923sense, which is conservative and strict and security-conscious, whereas
924B<"utf8"> means UTF-8 in its former sense, which was liberal and loose and
925lax.  C<Encode> version 2.10 or later thus groks this subtle but critically
926important distinction between C<"UTF-8"> and C<"utf8">.
927
928  encode("utf8",  "\x{FFFF_FFFF}", 1); # okay
929  encode("UTF-8", "\x{FFFF_FFFF}", 1); # croaks
930
931In the C<Encode> module, C<"UTF-8"> is actually an alias for the canonical
932name C<"utf-8-strict">.  That hyphen between the C<"UTF"> and the C<"8"> is
933critical; without it, C<Encode> goes "liberal" and (perhaps overly-)permissive:
934
935  find_encoding("UTF-8")->name # is 'utf-8-strict'
936  find_encoding("utf-8")->name # ditto. names are case insensitive
937  find_encoding("utf_8")->name # ditto. "_" are treated as "-"
938  find_encoding("UTF8")->name  # is 'utf8'.
939
940Perl's internal UTF8 flag is called "UTF8", without a hyphen. It indicates
941whether a string is internally encoded as "utf8", also without a hyphen.
942
943=head1 SEE ALSO
944
945L<Encode::Encoding>,
946L<Encode::Supported>,
947L<Encode::PerlIO>,
948L<encoding>,
949L<perlebcdic>,
950L<perlfunc/open>,
951L<perlunicode>, L<perluniintro>, L<perlunifaq>, L<perlunitut>,
952L<utf8>,
953the Perl Unicode Mailing List L<http://lists.perl.org/list/perl-unicode.html>
954
955=head1 MAINTAINER
956
957This project was originated by the late Nick Ing-Simmons and later
958maintained by Dan Kogai I<< <dankogai@cpan.org> >>.  See AUTHORS
959for a full list of people involved.  For any questions, send mail to
960I<< <perl-unicode@perl.org> >> so that we can all share.
961
962While Dan Kogai retains the copyright as a maintainer, credit
963should go to all those involved.  See AUTHORS for a list of those
964who submitted code to the project.
965
966=head1 COPYRIGHT
967
968Copyright 2002-2014 Dan Kogai I<< <dankogai@cpan.org> >>.
969
970This library is free software; you can redistribute it and/or modify
971it under the same terms as Perl itself.
972
973=cut
974