xref: /openbsd-src/gnu/usr.bin/perl/cpan/Encode/Encode.pm (revision 3d61058aa5c692477b6d18acfbbdb653a9930ff9)
1b39c5158Smillert#
2*3d61058aSafresh1# $Id: Encode.pm,v 3.21 2024/02/25 22:17:32 dankogai Exp $
3b39c5158Smillert#
4b39c5158Smillertpackage Encode;
5b39c5158Smillertuse strict;
6b39c5158Smillertuse warnings;
748950c12Ssthenuse constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG};
89f11ffb7Safresh1our $VERSION;
99f11ffb7Safresh1BEGIN {
10*3d61058aSafresh1    $VERSION = sprintf "%d.%02d", q$Revision: 3.21 $ =~ /(\d+)/g;
119f11ffb7Safresh1    require XSLoader;
12b39c5158Smillert    XSLoader::load( __PACKAGE__, $VERSION );
139f11ffb7Safresh1}
14b39c5158Smillert
15e5157e49Safresh1use Exporter 5.57 'import';
16b39c5158Smillert
17b46d8ef2Safresh1use Carp ();
189f11ffb7Safresh1our @CARP_NOT = qw(Encode::Encoder);
199f11ffb7Safresh1
20b39c5158Smillert# Public, encouraged API is exported by default
21b39c5158Smillert
22b39c5158Smillertour @EXPORT = qw(
23b39c5158Smillert  decode  decode_utf8  encode  encode_utf8 str2bytes bytes2str
249f11ffb7Safresh1  encodings  find_encoding find_mime_encoding clone_encoding
25b39c5158Smillert);
26b39c5158Smillertour @FB_FLAGS = qw(
27b39c5158Smillert  DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC
28b39c5158Smillert  PERLQQ HTMLCREF XMLCREF STOP_AT_PARTIAL
29b39c5158Smillert);
30b39c5158Smillertour @FB_CONSTS = qw(
31b39c5158Smillert  FB_DEFAULT FB_CROAK FB_QUIET FB_WARN
32b39c5158Smillert  FB_PERLQQ FB_HTMLCREF FB_XMLCREF
33b39c5158Smillert);
34b39c5158Smillertour @EXPORT_OK = (
35b39c5158Smillert    qw(
36b39c5158Smillert      _utf8_off _utf8_on define_encoding from_to is_16bit is_8bit
37b39c5158Smillert      is_utf8 perlio_ok resolve_alias utf8_downgrade utf8_upgrade
38b39c5158Smillert      ),
39b39c5158Smillert    @FB_FLAGS, @FB_CONSTS,
40b39c5158Smillert);
41b39c5158Smillert
42b39c5158Smillertour %EXPORT_TAGS = (
43b39c5158Smillert    all          => [ @EXPORT,    @EXPORT_OK ],
44b39c5158Smillert    default      => [ @EXPORT ],
45b39c5158Smillert    fallbacks    => [ @FB_CONSTS ],
46b39c5158Smillert    fallback_all => [ @FB_CONSTS, @FB_FLAGS ],
47b39c5158Smillert);
48b39c5158Smillert
49b39c5158Smillert# Documentation moved after __END__ for speed - NI-S
50b39c5158Smillert
51b39c5158Smillertour $ON_EBCDIC = ( ord("A") == 193 );
52b39c5158Smillert
539f11ffb7Safresh1use Encode::Alias ();
549f11ffb7Safresh1use Encode::MIME::Name;
559f11ffb7Safresh1
569f11ffb7Safresh1use Storable;
57b39c5158Smillert
58b39c5158Smillert# Make a %Encoding package variable to allow a certain amount of cheating
59b39c5158Smillertour %Encoding;
60b39c5158Smillertour %ExtModule;
61b39c5158Smillertrequire Encode::Config;
# Try to load Encode::ConfigLocal.  The require sits inside eval{} and the
# outcome is never inspected, so a failed load is silently ignored.
# The __DIE__/__WARN__ handlers are disabled for the duration; for the
# reason, see
#  https://bugzilla.redhat.com/show_bug.cgi?id=435505#c2
eval {
    local ( $SIG{__DIE__}, $SIG{__WARN__} );

    # Search a private copy of @INC with a trailing '.' entry removed.
    local @INC = @INC;
    if ( @INC and $INC[-1] eq '.' ) {
        pop @INC;
    }
    require Encode::ConfigLocal;
};
72b39c5158Smillert
# Return the names of known encodings, sorted case-insensitively.
#
# The selector is read from $_[1], i.e. the sub expects to be invoked
# as a class method (Encode->encodings(...)):
#   ":all"    - every name in %Encoding and %ExtModule
#   otherwise - every name in %Encoding, plus each %ExtModule entry whose
#               module equals one of the arguments (an argument without
#               "::" gets "Encode::" prepended first)
# The names Internal, Unicode and Guess are always filtered out.
sub encodings {
    my $selector = $_[1] || '';
    my %known;
    if ( $selector ne ":all" ) {
        %known = %Encoding;
        my @wanted = map { m/::/ ? $_ : "Encode::$_" } @_;
        for my $module (@wanted) {
            DEBUG and warn $module;
            for my $name ( keys %ExtModule ) {
                next unless $ExtModule{$name} eq $module;
                $known{$name} = $module;
            }
        }
    }
    else {
        %known = ( %Encoding, %ExtModule );
    }
    my @visible = grep { !/^(?:Internal|Unicode|Guess)$/o } keys %known;
    return sort { lc $a cmp lc $b } @visible;
}
91b39c5158Smillert
# Report whether an encoding can be used as a PerlIO layer.
#
#   perlio_ok($encoding_object_or_name)
#
# A reference is used as the encoding object directly; anything else is
# resolved with find_encoding().  Returns whatever the object's own
# perlio_ok() method returns, or 0 when the encoding is unknown or the
# object has no such method.
sub perlio_ok {
    my $obj = ref( $_[0] ) ? $_[0] : find_encoding( $_[0] );

    # find_encoding() yields nothing for a name it cannot resolve; answer
    # "not OK" instead of dying on a method call against undef.
    defined $obj or return 0;

    $obj->can("perlio_ok") and return $obj->perlio_ok();
    return 0;    # safety net
}
97b39c5158Smillert
# Register an encoding object.
#
#   define_encoding($obj, $name [, @aliases])
#
# Stores $obj in %Encoding under $name, aliases the lower-cased name to it
# when that differs from $name, and defines each further argument as an
# alias too.  The class of $obj is appended to @Encode::CARP_NOT and
# @Encode::Encoding::CARP_NOT unless it is already listed there.
# Returns $obj.
sub define_encoding {
    my ( $obj, $name, @aliases ) = @_;
    $Encoding{$name} = $obj;

    my $lower = lc $name;
    if ( $lower ne $name ) {
        define_alias( $lower => $obj );
    }
    for my $alias (@aliases) {
        define_alias( $alias, $obj );
    }

    my $class = ref $obj;
    for my $carp_not ( \@Encode::CARP_NOT, \@Encode::Encoding::CARP_NOT ) {
        next if grep { $_ eq $class } @$carp_not;
        push @$carp_not, $class;
    }
    return $obj;
}
113b39c5158Smillert
# Resolve a name to an encoding object; find_encoding() is a thin wrapper
# around this method.
#
#   $class->getEncoding($name [, $skip_external])
#
# Lookup order:
#   1. $name is itself an object that can('renew')   -> returned as is
#   2. %Encoding under $name, then under lc($name)
#   3. $class->find_alias() for $name, then for lc($name)
#   4. unless $skip_external is true: require the module %ExtModule lists
#      for $name or lc($name), then look up $name in %Encoding again
# Returns the encoding object, or nothing when $name is undefined or
# cannot be resolved.
sub getEncoding {
    my ( $class, $name, $skip_external ) = @_;

    return unless defined $name;

    # All whitespace is stripped from the name before any lookup; see
    # https://rt.cpan.org/Ticket/Display.html?id=65796
    $name =~ s/\s+//g;

    return $name if ref($name) && $name->can('renew');

    my $lower = lc $name;
    for my $key ( $name, $lower ) {
        return $Encoding{$key} if exists $Encoding{$key};
    }

    my $found = $class->find_alias($name);
    return $found if defined $found;
    if ( $lower ne $name ) {
        $found = $class->find_alias($lower);
        return $found if defined $found;
    }

    return if $skip_external;

    my $module = $ExtModule{$name} || $ExtModule{$lower};
    return unless $module;

    # Turn Some::Module into Some/Module.pm and try to load it; a failed
    # load is ignored and simply leaves nothing to find below.
    ( my $file = $module ) =~ s,::,/,g;
    $file .= '.pm';
    eval { require $file; };
    return $Encoding{$name} if exists $Encoding{$name};
    return;
}
141b39c5158Smillert
# HACK: find_alias() and define_alias() must be defined in package Encode.
# Encode and Encode::Alias depend on each other cyclically, so Exporter
# does not work for them; each sub is therefore a stub that jumps, via
# goto &sub, to the Encode::Alias implementation with @_ left as it is.
sub find_alias {
    my $impl = \&Encode::Alias::find_alias;
    goto &$impl;
}
sub define_alias {
    my $impl = \&Encode::Alias::define_alias;
    goto &$impl;
}
1509f11ffb7Safresh1
# Functional front end to getEncoding().
#
#   find_encoding($name [, $skip_external])
#
# Returns whatever __PACKAGE__->getEncoding($name, $skip_external) returns.
sub find_encoding($;$) {
    my $name          = shift;
    my $skip_external = shift;
    return __PACKAGE__->getEncoding( $name, $skip_external );
}
155b39c5158Smillert
# Look up an encoding object by its MIME name.
#
#   find_mime_encoding($mime_name [, $skip_external])
#
# The MIME name is first translated with
# Encode::MIME::Name::get_encode_name(); the result is then passed to
# find_encoding() together with $skip_external.
sub find_mime_encoding($;$) {
    my $mime_name     = shift;
    my $skip_external = shift;
    my $encode_name   = Encode::MIME::Name::get_encode_name($mime_name);
    return find_encoding( $encode_name, $skip_external );
}
1619f11ffb7Safresh1
# Return the name() of the encoding object that find_encoding() resolves
# the argument to, or nothing when no such encoding is found.
sub resolve_alias($) {
    my $enc = find_encoding(shift);
    return unless defined $enc;
    return $enc->name;
}
167b39c5158Smillert
# Return a deep copy (Storable::dclone) of the encoding object that
# find_encoding() resolves the argument to, or nothing when the lookup
# does not produce a reference.
sub clone_encoding($) {
    my $enc = find_encoding(shift);
    return Storable::dclone($enc) if ref $enc;
    return;
}
173b39c5158Smillert
# One-time boot call, made with no arguments.  No Perl definition of
# onBOOT is visible in this file -- presumably it comes from the XS code
# loaded by XSLoader::load() in the BEGIN block; TODO confirm.
onBOOT();
175b39c5158Smillert
# On EBCDIC platforms, register a UTF_EBCDIC encoding object under the
# name 'Unicode'.  The package statement, `use parent` and the two named
# subs below are compiled whatever $ON_EBCDIC is; only the object creation
# and the define_encoding() call are conditional.
if ($ON_EBCDIC) {
    package Encode::UTF_EBCDIC;
    use parent 'Encode::Encoding';
    Encode::define_encoding(
        bless( { Name => "UTF_EBCDIC" }, "Encode::UTF_EBCDIC" ),
        'Unicode'
    );

    # decode($obj, $str, $chk): pass every character's ordinal through
    # utf8::unicode_to_native() and return the rebuilt string.  A true
    # $chk empties the caller's source string.
    sub decode {
        my ( undef, $str, $chk ) = @_;
        my $res = join '',
          map { chr( utf8::unicode_to_native( ord($_) ) ) } split //, $str;
        $_[1] = '' if $chk;
        return $res;
    }

    # encode($obj, $str, $chk): the inverse mapping, through
    # utf8::native_to_unicode().  A true $chk empties the caller's
    # source string.
    sub encode {
        my ( undef, $str, $chk ) = @_;
        my $res = join '',
          map { chr( utf8::native_to_unicode( ord($_) ) ) } split //, $str;
        $_[1] = '' if $chk;
        return $res;
    }
}
2069f11ffb7Safresh1
{
    # Make Encode::Encoding the base class of Encode::XS.  Background:
    # https://rt.cpan.org/Public/Bug/Display.html?id=103253
    package Encode::XS;
    use parent qw(Encode::Encoding);
}
2129f11ffb7Safresh1
{
    package Encode::utf8;
    use parent 'Encode::Encoding';

    # Register the 'utf8' and 'utf-8-strict' encodings.  Both are plain
    # hashes blessed into this package; the strict one carries a
    # strict_utf8 flag.
    my %spec_for = (
        'utf8'         => { Name => 'utf8' },
        'utf-8-strict' => { Name => 'utf-8-strict', strict_utf8 => 1 },
    );
    for my $name ( keys %spec_for ) {
        my $enc = bless $spec_for{$name}, __PACKAGE__;
        Encode::define_encoding( $enc => $name );
    }

    # cat_decode($obj, $dst, $src, $pos, $trm, $chk)
    #
    # Appends to $dst the bytes of $src from offset $pos up to and
    # including the first occurrence of $trm, sets $pos just past it and
    # returns 1.  When $trm does not occur, the rest of $src is appended,
    # $pos is set to the byte length of $src and '' is returned.
    # $dst and $pos are updated in the caller through references into @_.
    # $chk is currently ignored.
    sub cat_decode {
        my ( $dst_ref, $src_ref, $pos_ref ) = \@_[ 1 .. 3 ];
        my ( $start, $terminator ) = @_[ 3, 4 ];
        use bytes;    # index/substr/length below count bytes
        my $hit = index( $$src_ref, $terminator, $start );
        if ( $hit < 0 ) {
            $$dst_ref .= substr( $$src_ref, $start );
            $$pos_ref = length($$src_ref);
            return '';
        }
        my $end = $hit + length($terminator);
        $$dst_ref .= substr( $$src_ref, $start, $end - $start );
        $$pos_ref = $end;
        return 1;
    }
}
241b39c5158Smillert
242b39c5158Smillert1;
243b39c5158Smillert
244b39c5158Smillert__END__
245b39c5158Smillert
246b39c5158Smillert=head1 NAME
247b39c5158Smillert
24848950c12SsthenEncode - character encodings in Perl
249b39c5158Smillert
250b39c5158Smillert=head1 SYNOPSIS
251b39c5158Smillert
252e9ce3842Safresh1    use Encode qw(decode encode);
253e9ce3842Safresh1    $characters = decode('UTF-8', $octets,     Encode::FB_CROAK);
254e9ce3842Safresh1    $octets     = encode('UTF-8', $characters, Encode::FB_CROAK);
255b39c5158Smillert
256b39c5158Smillert=head2 Table of Contents
257b39c5158Smillert
25848950c12SsthenEncode consists of a collection of modules whose details are too extensive
25948950c12Ssthento fit in one document.  This one itself explains the top-level APIs
260b39c5158Smillertand general topics at a glance.  For other topics and more details,
26148950c12Ssthensee the documentation for these modules:
262b39c5158Smillert
263e9ce3842Safresh1=over 2
264e9ce3842Safresh1
265e9ce3842Safresh1=item L<Encode::Alias> - Alias definitions to encodings
266e9ce3842Safresh1
267e9ce3842Safresh1=item L<Encode::Encoding> - Encode Implementation Base Class
268e9ce3842Safresh1
269e9ce3842Safresh1=item L<Encode::Supported> - List of Supported Encodings
270e9ce3842Safresh1
271e9ce3842Safresh1=item L<Encode::CN> - Simplified Chinese Encodings
272e9ce3842Safresh1
273e9ce3842Safresh1=item L<Encode::JP> - Japanese Encodings
274e9ce3842Safresh1
275e9ce3842Safresh1=item L<Encode::KR> - Korean Encodings
276e9ce3842Safresh1
277e9ce3842Safresh1=item L<Encode::TW> - Traditional Chinese Encodings
278e9ce3842Safresh1
279e9ce3842Safresh1=back
280b39c5158Smillert
281b39c5158Smillert=head1 DESCRIPTION
282b39c5158Smillert
28348950c12SsthenThe C<Encode> module provides the interface between Perl strings
284b39c5158Smillertand the rest of the system.  Perl strings are sequences of
28548950c12SsthenI<characters>.
286b39c5158Smillert
28748950c12SsthenThe repertoire of characters that Perl can represent is a superset of those
288b39c5158Smillertdefined by the Unicode Consortium. On most platforms the ordinal
value of a character as returned by C<ord(I<S>)> is the I<Unicode
29048950c12Ssthencodepoint> for that character. The exceptions are platforms where
29148950c12Ssthenthe legacy encoding is some variant of EBCDIC rather than a superset
29248950c12Ssthenof ASCII; see L<perlebcdic>.
293b39c5158Smillert
29448950c12SsthenDuring recent history, data is moved around a computer in 8-bit chunks,
29548950c12Ssthenoften called "bytes" but also known as "octets" in standards documents.
29648950c12SsthenPerl is widely used to manipulate data of many types: not only strings of
29748950c12Ssthencharacters representing human or computer languages, but also "binary"
29848950c12Ssthendata, being the machine's representation of numbers, pixels in an image, or
29948950c12Ssthenjust about anything.
300b39c5158Smillert
301b39c5158SmillertWhen Perl is processing "binary data", the programmer wants Perl to
30248950c12Ssthenprocess "sequences of bytes". This is not a problem for Perl: because a
303b39c5158Smillertbyte has 256 possible values, it easily fits in Perl's much larger
304b39c5158Smillert"logical character".
305b39c5158Smillert
306e9ce3842Safresh1This document mostly explains the I<how>. L<perlunitut> and L<perlunifaq>
307e9ce3842Safresh1explain the I<why>.
308e9ce3842Safresh1
309b39c5158Smillert=head2 TERMINOLOGY
310b39c5158Smillert
311e9ce3842Safresh1=head3 character
312b39c5158Smillert
313e9ce3842Safresh1A character in the range 0 .. 2**32-1 (or more);
31448950c12Ssthenwhat Perl's strings are made of.
315b39c5158Smillert
316e9ce3842Safresh1=head3 byte
317b39c5158Smillert
318e9ce3842Safresh1A character in the range 0..255;
319e9ce3842Safresh1a special case of a Perl character.
320b39c5158Smillert
321e9ce3842Safresh1=head3 octet
322b39c5158Smillert
323e9ce3842Safresh18 bits of data, with ordinal values 0..255;
324e9ce3842Safresh1term for bytes passed to or from a non-Perl context, such as a disk file,
325e9ce3842Safresh1standard I/O stream, database, command-line argument, environment variable,
326e9ce3842Safresh1socket etc.
327b39c5158Smillert
32848950c12Ssthen=head1 THE PERL ENCODING API
329b39c5158Smillert
330e9ce3842Safresh1=head2 Basic methods
331b39c5158Smillert
332e9ce3842Safresh1=head3 encode
333e9ce3842Safresh1
334e9ce3842Safresh1  $octets  = encode(ENCODING, STRING[, CHECK])
335b39c5158Smillert
33648950c12SsthenEncodes the scalar value I<STRING> from Perl's internal form into
33748950c12SsthenI<ENCODING> and returns a sequence of octets.  I<ENCODING> can be either a
33848950c12Ssthencanonical name or an alias.  For encoding names and aliases, see
33948950c12SsthenL</"Defining Aliases">.  For CHECK, see L</"Handling Malformed Data">.
340b39c5158Smillert
3419f11ffb7Safresh1B<CAVEAT>: the input scalar I<STRING> might be modified in-place depending
3429f11ffb7Safresh1on what is set in CHECK. See L</LEAVE_SRC> if you want your inputs to be
3439f11ffb7Safresh1left unchanged.
3449f11ffb7Safresh1
34548950c12SsthenFor example, to convert a string from Perl's internal format into
34648950c12SsthenISO-8859-1, also known as Latin1:
347b39c5158Smillert
348b39c5158Smillert  $octets = encode("iso-8859-1", $string);
349b39c5158Smillert
3509f11ffb7Safresh1B<CAVEAT>: When you run C<$octets = encode("UTF-8", $string)>, then
35148950c12Ssthen$octets I<might not be equal to> $string.  Though both contain the
35248950c12Ssthensame data, the UTF8 flag for $octets is I<always> off.  When you
35348950c12Ssthenencode anything, the UTF8 flag on the result is always off, even when it
3549f11ffb7Safresh1contains a completely valid UTF-8 string. See L</"The UTF8 flag"> below.
355b39c5158Smillert
35648950c12SsthenIf the $string is C<undef>, then C<undef> is returned.
357b39c5158Smillert
3589f11ffb7Safresh1C<str2bytes> may be used as an alias for C<encode>.
3599f11ffb7Safresh1
360e9ce3842Safresh1=head3 decode
361e9ce3842Safresh1
362e9ce3842Safresh1  $string = decode(ENCODING, OCTETS[, CHECK])
363b39c5158Smillert
36448950c12SsthenThis function returns the string that results from decoding the scalar
36548950c12Ssthenvalue I<OCTETS>, assumed to be a sequence of octets in I<ENCODING>, into
366b8851fccSafresh1Perl's internal form.  As with encode(),
36748950c12SsthenI<ENCODING> can be either a canonical name or an alias. For encoding names
36848950c12Ssthenand aliases, see L</"Defining Aliases">; for I<CHECK>, see L</"Handling
36948950c12SsthenMalformed Data">.
370b39c5158Smillert
3719f11ffb7Safresh1B<CAVEAT>: the input scalar I<OCTETS> might be modified in-place depending
3729f11ffb7Safresh1on what is set in CHECK. See L</LEAVE_SRC> if you want your inputs to be
3739f11ffb7Safresh1left unchanged.
3749f11ffb7Safresh1
37548950c12SsthenFor example, to convert ISO-8859-1 data into a string in Perl's
37648950c12Sstheninternal format:
377b39c5158Smillert
378b39c5158Smillert  $string = decode("iso-8859-1", $octets);
379b39c5158Smillert
3809f11ffb7Safresh1B<CAVEAT>: When you run C<$string = decode("UTF-8", $octets)>, then $string
38148950c12SsthenI<might not be equal to> $octets.  Though both contain the same data, the
382e5157e49Safresh1UTF8 flag for $string is on.  See L</"The UTF8 flag">
383b39c5158Smillertbelow.
384b39c5158Smillert
If $octets is C<undef>, then C<undef> is returned.
386b39c5158Smillert
3879f11ffb7Safresh1C<bytes2str> may be used as an alias for C<decode>.
3889f11ffb7Safresh1
389e9ce3842Safresh1=head3 find_encoding
390e9ce3842Safresh1
391e9ce3842Safresh1  [$obj =] find_encoding(ENCODING)
392b39c5158Smillert
39348950c12SsthenReturns the I<encoding object> corresponding to I<ENCODING>.  Returns
C<undef> if no matching I<ENCODING> is found.  The returned object is
39548950c12Ssthenwhat does the actual encoding or decoding.
396b39c5158Smillert
3979f11ffb7Safresh1  $string = decode($name, $bytes);
398b39c5158Smillert
399b39c5158Smillertis in fact
400b39c5158Smillert
4019f11ffb7Safresh1    $string = do {
402b39c5158Smillert        $obj = find_encoding($name);
403b39c5158Smillert        croak qq(encoding "$name" not found) unless ref $obj;
40448950c12Ssthen        $obj->decode($bytes);
405b39c5158Smillert    };
406b39c5158Smillert
407b39c5158Smillertwith more error checking.
408b39c5158Smillert
You can therefore save time by reusing this object as follows:
410b39c5158Smillert
411b39c5158Smillert    my $enc = find_encoding("iso-8859-1");
412b39c5158Smillert    while(<>) {
4139f11ffb7Safresh1        my $string = $enc->decode($_);
4149f11ffb7Safresh1        ... # now do something with $string;
415b39c5158Smillert    }
416b39c5158Smillert
417e9ce3842Safresh1Besides L</decode> and L</encode>, other methods are
418e9ce3842Safresh1available as well.  For instance, C<name()> returns the canonical
419b39c5158Smillertname of the encoding object.
420b39c5158Smillert
421b39c5158Smillert  find_encoding("latin1")->name; # iso-8859-1
422b39c5158Smillert
423b39c5158SmillertSee L<Encode::Encoding> for details.
424b39c5158Smillert
4259f11ffb7Safresh1=head3 find_mime_encoding
4269f11ffb7Safresh1
4279f11ffb7Safresh1  [$obj =] find_mime_encoding(MIME_ENCODING)
4289f11ffb7Safresh1
Returns the I<encoding object> corresponding to I<MIME_ENCODING>.  Acts
the same as C<find_encoding()>, except that the C<mime_name()> of the
returned object must match I<MIME_ENCODING>.  Unlike C<find_encoding()>,
canonical names and aliases are not used when searching for the object.
4339f11ffb7Safresh1
434*3d61058aSafresh1    find_mime_encoding("utf8"); # returns undef because "utf8" is not a valid MIME_ENCODING
4359f11ffb7Safresh1    find_mime_encoding("utf-8"); # returns encode object "utf-8-strict"
436*3d61058aSafresh1    find_mime_encoding("UTF-8"); # same as "utf-8" because MIME_ENCODING is case insensitive
    find_mime_encoding("utf-8-strict"); # returns undef because "utf-8-strict" is not a valid MIME_ENCODING
4389f11ffb7Safresh1
439e9ce3842Safresh1=head3 from_to
440e9ce3842Safresh1
441e9ce3842Safresh1  [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])
442b39c5158Smillert
44348950c12SsthenConverts I<in-place> data between two encodings. The data in $octets
44448950c12Ssthenmust be encoded as octets and I<not> as characters in Perl's internal
44548950c12Ssthenformat. For example, to convert ISO-8859-1 data into Microsoft's CP1250
446b39c5158Smillertencoding:
447b39c5158Smillert
448b39c5158Smillert  from_to($octets, "iso-8859-1", "cp1250");
449b39c5158Smillert
450b39c5158Smillertand to convert it back:
451b39c5158Smillert
452b39c5158Smillert  from_to($octets, "cp1250", "iso-8859-1");
453b39c5158Smillert
45448950c12SsthenBecause the conversion happens in place, the data to be
45548950c12Ssthenconverted cannot be a string constant: it must be a scalar variable.
456b39c5158Smillert
457e9ce3842Safresh1C<from_to()> returns the length of the converted string in octets on success,
45848950c12Ssthenand C<undef> on error.
459b39c5158Smillert
46048950c12SsthenB<CAVEAT>: The following operations may look the same, but are not:
461b39c5158Smillert
4629f11ffb7Safresh1  from_to($data, "iso-8859-1", "UTF-8"); #1
463b39c5158Smillert  $data = decode("iso-8859-1", $data);  #2
464b39c5158Smillert
46548950c12SsthenBoth #1 and #2 make $data consist of a completely valid UTF-8 string,
46648950c12Ssthenbut only #2 turns the UTF8 flag on.  #1 is equivalent to:
467b39c5158Smillert
4689f11ffb7Safresh1  $data = encode("UTF-8", decode("iso-8859-1", $data));
469b39c5158Smillert
470b39c5158SmillertSee L</"The UTF8 flag"> below.
471b39c5158Smillert
47248950c12SsthenAlso note that:
473b39c5158Smillert
474b39c5158Smillert  from_to($octets, $from, $to, $check);
475b39c5158Smillert
476b8851fccSafresh1is equivalent to:
477b39c5158Smillert
478b39c5158Smillert  $octets = encode($to, decode($from, $octets), $check);
479b39c5158Smillert
48048950c12SsthenYes, it does I<not> respect the $check during decoding.  It is
48148950c12Ssthendeliberately done that way.  If you need minute control, use C<decode>
48248950c12Ssthenfollowed by C<encode> as follows:
483b39c5158Smillert
484b39c5158Smillert  $octets = encode($to, decode($from, $octets, $check_from), $check_to);
485b39c5158Smillert
486e9ce3842Safresh1=head3 encode_utf8
487e9ce3842Safresh1
488e9ce3842Safresh1  $octets = encode_utf8($string);
489b39c5158Smillert
490eac174f2Safresh1B<WARNING>: L<This function can produce invalid UTF-8!|/UTF-8 vs. utf8 vs. UTF8>
491eac174f2Safresh1Do not use it for data exchange.
492eac174f2Safresh1Unless you want Perl's older "lax" mode, prefer
493eac174f2Safresh1C<$octets = encode("UTF-8", $string)>.
494eac174f2Safresh1
49548950c12SsthenEquivalent to C<$octets = encode("utf8", $string)>.  The characters in
49648950c12Ssthen$string are encoded in Perl's internal format, and the result is returned
49748950c12Ssthenas a sequence of octets.  Because all possible characters in Perl have a
4989f11ffb7Safresh1(loose, not strict) utf8 representation, this function cannot fail.
4999f11ffb7Safresh1
500e9ce3842Safresh1=head3 decode_utf8
501e9ce3842Safresh1
502e9ce3842Safresh1  $string = decode_utf8($octets [, CHECK]);
503b39c5158Smillert
504eac174f2Safresh1B<WARNING>: L<This function accepts invalid UTF-8!|/UTF-8 vs. utf8 vs. UTF8>
505eac174f2Safresh1Do not use it for data exchange.
506eac174f2Safresh1Unless you want Perl's older "lax" mode, prefer
507eac174f2Safresh1C<$string = decode("UTF-8", $octets [, CHECK])>.
508eac174f2Safresh1
50948950c12SsthenEquivalent to C<$string = decode("utf8", $octets [, CHECK])>.
51048950c12SsthenThe sequence of octets represented by $octets is decoded
5119f11ffb7Safresh1from (loose, not strict) utf8 into a sequence of logical characters.
Because not all sequences of octets are valid (even loose) utf8,
51348950c12Ssthenit is quite possible for this function to fail.
51448950c12SsthenFor CHECK, see L</"Handling Malformed Data">.
515b39c5158Smillert
5169f11ffb7Safresh1B<CAVEAT>: the input I<$octets> might be modified in-place depending on
5179f11ffb7Safresh1what is set in CHECK. See L</LEAVE_SRC> if you want your inputs to be
5189f11ffb7Safresh1left unchanged.
5199f11ffb7Safresh1
520b39c5158Smillert=head2 Listing available encodings
521b39c5158Smillert
522b39c5158Smillert  use Encode;
523b39c5158Smillert  @list = Encode->encodings();
524b39c5158Smillert
52548950c12SsthenReturns a list of canonical names of available encodings that have already
52648950c12Ssthenbeen loaded.  To get a list of all available encodings including those that
52748950c12Ssthenhave not yet been loaded, say:
528b39c5158Smillert
529b39c5158Smillert  @all_encodings = Encode->encodings(":all");
530b39c5158Smillert
53148950c12SsthenOr you can give the name of a specific module:
532b39c5158Smillert
533b39c5158Smillert  @with_jp = Encode->encodings("Encode::JP");
534b39c5158Smillert
53548950c12SsthenWhen "C<::>" is not in the name, "C<Encode::>" is assumed.
536b39c5158Smillert
537b39c5158Smillert  @ebcdic = Encode->encodings("EBCDIC");
538b39c5158Smillert
539b39c5158SmillertTo find out in detail which encodings are supported by this package,
540b39c5158Smillertsee L<Encode::Supported>.
541b39c5158Smillert
542b39c5158Smillert=head2 Defining Aliases
543b39c5158Smillert
544b39c5158SmillertTo add a new alias to a given encoding, use:
545b39c5158Smillert
546b39c5158Smillert  use Encode;
547b39c5158Smillert  use Encode::Alias;
54848950c12Ssthen  define_alias(NEWNAME => ENCODING);
549b39c5158Smillert
55048950c12SsthenAfter that, I<NEWNAME> can be used as an alias for I<ENCODING>.
551e9ce3842Safresh1I<ENCODING> may be either the name of an encoding or an
55248950c12SsthenI<encoding object>.
553b39c5158Smillert
55448950c12SsthenBefore you do that, first make sure the alias is nonexistent using
555b39c5158SmillertC<resolve_alias()>, which returns the canonical name thereof.
55648950c12SsthenFor example:
557b39c5158Smillert
558b39c5158Smillert  Encode::resolve_alias("latin1") eq "iso-8859-1" # true
559b39c5158Smillert  Encode::resolve_alias("iso-8859-12")   # false; nonexistent
560b39c5158Smillert  Encode::resolve_alias($name) eq $name  # true if $name is canonical
561b39c5158Smillert
562e9ce3842Safresh1C<resolve_alias()> does not need C<use Encode::Alias>; it can be
56348950c12Ssthenimported via C<use Encode qw(resolve_alias)>.
564b39c5158Smillert
565b39c5158SmillertSee L<Encode::Alias> for details.
566b39c5158Smillert
567b39c5158Smillert=head2 Finding IANA Character Set Registry names
568b39c5158Smillert
569b39c5158SmillertThe canonical name of a given encoding does not necessarily agree with
57048950c12SsthenIANA Character Set Registry, commonly seen as C<< Content-Type:
57148950c12Ssthentext/plain; charset=I<WHATEVER> >>.  For most cases, the canonical name
57248950c12Ssthenworks, but sometimes it does not, most notably with "utf-8-strict".
573b39c5158Smillert
57448950c12SsthenAs of C<Encode> version 2.21, a new method C<mime_name()> is therefore added.
575b39c5158Smillert
576b39c5158Smillert  use Encode;
57748950c12Ssthen  my $enc = find_encoding("UTF-8");
578b39c5158Smillert  warn $enc->name;      # utf-8-strict
579b39c5158Smillert  warn $enc->mime_name; # UTF-8
580b39c5158Smillert
581b39c5158SmillertSee also:  L<Encode::Encoding>
582b39c5158Smillert
583b39c5158Smillert=head1 Encoding via PerlIO
584b39c5158Smillert
58548950c12SsthenIf your perl supports C<PerlIO> (which is the default), you can use a
58648950c12SsthenC<PerlIO> layer to decode and encode directly via a filehandle.  The
58748950c12Ssthenfollowing two examples are fully identical in functionality:
588b39c5158Smillert
58948950c12Ssthen  ### Version 1 via PerlIO
59048950c12Ssthen    open(INPUT,  "< :encoding(shiftjis)", $infile)
59148950c12Ssthen        || die "Can't open < $infile for reading: $!";
59248950c12Ssthen    open(OUTPUT, "> :encoding(euc-jp)",  $outfile)
        || die "Can't open > $outfile for writing: $!";
59448950c12Ssthen    while (<INPUT>) {   # auto decodes $_
59548950c12Ssthen        print OUTPUT;   # auto encodes $_
596b39c5158Smillert    }
59748950c12Ssthen    close(INPUT)   || die "can't close $infile: $!";
59848950c12Ssthen    close(OUTPUT)  || die "can't close $outfile: $!";
599b39c5158Smillert
60048950c12Ssthen  ### Version 2 via from_to()
60148950c12Ssthen    open(INPUT,  "< :raw", $infile)
60248950c12Ssthen        || die "Can't open < $infile for reading: $!";
60348950c12Ssthen    open(OUTPUT, "> :raw",  $outfile)
        || die "Can't open > $outfile for writing: $!";
605b39c5158Smillert
60648950c12Ssthen    while (<INPUT>) {
60748950c12Ssthen        from_to($_, "shiftjis", "euc-jp", 1);  # switch encoding
60848950c12Ssthen        print OUTPUT;   # emit raw (but properly encoded) data
60948950c12Ssthen    }
61048950c12Ssthen    close(INPUT)   || die "can't close $infile: $!";
61148950c12Ssthen    close(OUTPUT)  || die "can't close $outfile: $!";
612b39c5158Smillert
61348950c12SsthenIn the first version above, you let the appropriate encoding layer
61448950c12Ssthenhandle the conversion.  In the second, you explicitly translate
61548950c12Ssthenfrom one encoding to the other.
61648950c12Ssthen
617b8851fccSafresh1Unfortunately, it may be that encodings are not C<PerlIO>-savvy.  You can check
61848950c12Ssthento see whether your encoding is supported by C<PerlIO> by invoking the
61948950c12SsthenC<perlio_ok> method on it:
62048950c12Ssthen
62148950c12Ssthen  Encode::perlio_ok("hz");             # false
62248950c12Ssthen  find_encoding("euc-cn")->perlio_ok;  # true wherever PerlIO is available
62348950c12Ssthen
62448950c12Ssthen  use Encode qw(perlio_ok);            # imported upon request
625b39c5158Smillert  perlio_ok("euc-jp")
626b39c5158Smillert
62748950c12SsthenFortunately, all encodings that come with C<Encode> core are C<PerlIO>-savvy
628e9ce3842Safresh1except for C<hz> and C<ISO-2022-kr>.  For the gory details, see
629b39c5158SmillertL<Encode::Encoding> and L<Encode::PerlIO>.
630b39c5158Smillert
631b39c5158Smillert=head1 Handling Malformed Data
632b39c5158Smillert
63348950c12SsthenThe optional I<CHECK> argument tells C<Encode> what to do when
63448950c12Ssthenencountering malformed data.  Without I<CHECK>, C<Encode::FB_DEFAULT>
63548950c12Ssthen(== 0) is assumed.
636b39c5158Smillert
63748950c12SsthenAs of version 2.12, C<Encode> supports coderef values for C<CHECK>;
63848950c12Ssthensee below.
639b39c5158Smillert
640e9ce3842Safresh1B<NOTE:> Not all encodings support this feature.
641e9ce3842Safresh1Some encodings ignore the I<CHECK> argument.  For example,
642b39c5158SmillertL<Encode::Unicode> ignores I<CHECK> and it always croaks on error.
643b39c5158Smillert
644e9ce3842Safresh1=head2 List of I<CHECK> values
645b39c5158Smillert
646e9ce3842Safresh1=head3 FB_DEFAULT
647b39c5158Smillert
648*3d61058aSafresh1  CHECK = Encode::FB_DEFAULT ( == 0)
649b39c5158Smillert
65048950c12SsthenIf I<CHECK> is 0, encoding and decoding replace any malformed character
65148950c12Ssthenwith a I<substitution character>.  When you encode, I<SUBCHAR> is used.
65248950c12SsthenWhen you decode, the Unicode REPLACEMENT CHARACTER, code point U+FFFD, is
65348950c12Ssthenused.  If the data is supposed to be UTF-8, an optional lexical warning of
65448950c12Ssthenwarning category C<"utf8"> is given.
655b39c5158Smillert
656e9ce3842Safresh1=head3 FB_CROAK
657e9ce3842Safresh1
658*3d61058aSafresh1  CHECK = Encode::FB_CROAK ( == 1)
659b39c5158Smillert
66048950c12SsthenIf I<CHECK> is 1, methods immediately die with an error
66148950c12Ssthenmessage.  Therefore, when I<CHECK> is 1, you should trap
66248950c12Ssthenexceptions with C<eval{}>, unless you really want to let it C<die>.
663b39c5158Smillert
664e9ce3842Safresh1=head3 FB_QUIET
665e9ce3842Safresh1
666*3d61058aSafresh1  CHECK = Encode::FB_QUIET
667b39c5158Smillert
66848950c12SsthenIf I<CHECK> is set to C<Encode::FB_QUIET>, encoding and decoding immediately
669b39c5158Smillertreturn the portion of the data that has been processed so far when an
67048950c12Ssthenerror occurs. The data argument is overwritten with everything
67148950c12Ssthenafter that point; that is, the unprocessed portion of the data.  This is
67248950c12Ssthenhandy when you have to call C<decode> repeatedly in the case where your
673b39c5158Smillertsource data may contain partial multi-byte character sequences
67448950c12Ssthen(that is, you are reading with a fixed-width buffer). Here's some sample
67548950c12Ssthencode to do exactly that:
676b39c5158Smillert
67748950c12Ssthen    my($buffer, $string) = ("", "");
67848950c12Ssthen    while (read($fh, $buffer, 256, length($buffer))) {
679b39c5158Smillert        $string .= decode($encoding, $buffer, Encode::FB_QUIET);
680b39c5158Smillert        # $buffer now contains the unprocessed partial character
681b39c5158Smillert    }
682b39c5158Smillert
683e9ce3842Safresh1=head3 FB_WARN
684e9ce3842Safresh1
685*3d61058aSafresh1  CHECK = Encode::FB_WARN
686b39c5158Smillert
68748950c12SsthenThis is the same as C<FB_QUIET> above, except that instead of being silent
68848950c12Ssthenon errors, it issues a warning.  This is handy for when you are debugging.
689b39c5158Smillert
690b46d8ef2Safresh1B<CAVEAT>: All warnings from the Encode module are reported, independently of
691b46d8ef2Safresh1L<pragma warnings|warnings> settings. If you want to follow the settings of
692b46d8ef2Safresh1lexical warnings configured by L<pragma warnings|warnings>, then also append
693b46d8ef2Safresh1the check value C<Encode::ONLY_PRAGMA_WARNINGS>. This value is available
694b46d8ef2Safresh1since Encode version 2.99.
695b46d8ef2Safresh1
696e9ce3842Safresh1=head3 FB_PERLQQ FB_HTMLCREF FB_XMLCREF
697e9ce3842Safresh1
698e9ce3842Safresh1=over 2
699e9ce3842Safresh1
700b39c5158Smillert=item perlqq mode (I<CHECK> = Encode::FB_PERLQQ)
701b39c5158Smillert
702b39c5158Smillert=item HTML charref mode (I<CHECK> = Encode::FB_HTMLCREF)
703b39c5158Smillert
704b39c5158Smillert=item XML charref mode (I<CHECK> = Encode::FB_XMLCREF)
705b39c5158Smillert
706e9ce3842Safresh1=back
707e9ce3842Safresh1
70848950c12SsthenFor encodings that are implemented by the C<Encode::XS> module, C<CHECK> C<==>
70948950c12SsthenC<Encode::FB_PERLQQ> puts C<encode> and C<decode> into C<perlqq> fallback mode.
710b39c5158Smillert
71148950c12SsthenWhen you decode, C<\xI<HH>> is inserted for a malformed character, where
71248950c12SsthenI<HH> is the hex representation of the octet that could not be decoded to
71348950c12Ssthenutf8.  When you encode, C<\x{I<HHHH>}> will be inserted, where I<HHHH> is
71448950c12Ssthenthe Unicode code point (in any number of hex digits) of the character that
71548950c12Ssthencannot be found in the character repertoire of the encoding.
716b39c5158Smillert
71748950c12SsthenThe HTML/XML character reference modes are about the same. In place of
71848950c12SsthenC<\x{I<HHHH>}>, HTML uses C<&#I<NNN>;> where I<NNN> is a decimal number, and
719b39c5158SmillertXML uses C<&#xI<HHHH>;> where I<HHHH> is the hexadecimal number.
720b39c5158Smillert
72148950c12SsthenIn C<Encode> 2.10 or later, C<LEAVE_SRC> is also implied.
722b39c5158Smillert
723e9ce3842Safresh1=head3 The bitmask
724b39c5158Smillert
72548950c12SsthenThese modes are all actually set via a bitmask.  Here is how the C<FB_I<XXX>>
72648950c12Ssthenconstants are laid out.  You can import the C<FB_I<XXX>> constants via
72748950c12SsthenC<use Encode qw(:fallbacks)>, and you can import the generic bitmask
728b39c5158Smillertconstants via C<use Encode qw(:fallback_all)>.
729b39c5158Smillert
730b39c5158Smillert                     FB_DEFAULT FB_CROAK FB_QUIET FB_WARN  FB_PERLQQ
731b39c5158Smillert DIE_ON_ERR    0x0001             X
732b39c5158Smillert WARN_ON_ERR   0x0002                               X
733b39c5158Smillert RETURN_ON_ERR 0x0004                      X        X
734b39c5158Smillert LEAVE_SRC     0x0008                                        X
735b39c5158Smillert PERLQQ        0x0100                                        X
736b39c5158Smillert HTMLCREF      0x0200
737b39c5158Smillert XMLCREF       0x0400
738b39c5158Smillert
739e9ce3842Safresh1=head3 LEAVE_SRC
740b39c5158Smillert
741e9ce3842Safresh1  Encode::LEAVE_SRC
742b39c5158Smillert
74348950c12SsthenIf the C<Encode::LEAVE_SRC> bit is I<not> set but I<CHECK> is set, then the
744e9ce3842Safresh1source string to encode() or decode() will be overwritten in place.
74548950c12SsthenIf you do not want the source modified, bitwise-OR C<Encode::LEAVE_SRC> into I<CHECK>.
746b39c5158Smillert
747b39c5158Smillert=head2 coderef for CHECK
748b39c5158Smillert
74948950c12SsthenAs of C<Encode> 2.12, C<CHECK> can also be a code reference which takes the
750e5157e49Safresh1ordinal value of the unmapped character as an argument and returns
751e5157e49Safresh1octets that represent the fallback character.  For instance:
752b39c5158Smillert
753b39c5158Smillert  $ascii = encode("ascii", $utf8, sub{ sprintf "<U+%04X>", shift });
754b39c5158Smillert
75548950c12SsthenActs like C<FB_PERLQQ> but U+I<XXXX> is used instead of C<\x{I<XXXX>}>.
756b39c5158Smillert
7579f11ffb7Safresh1A fallback for C<decode> must return a decoded string (a sequence of characters)
7589f11ffb7Safresh1and takes a list of ordinal values as its arguments. So for
759b8851fccSafresh1example if you wish to decode octets as UTF-8, and use ISO-8859-15 as
760e5157e49Safresh1a fallback for bytes that are not valid UTF-8, you could write
761e5157e49Safresh1
762e5157e49Safresh1    $str = decode 'UTF-8', $octets, sub {
7639f11ffb7Safresh1        my $tmp = join '', map chr, @_;
7649f11ffb7Safresh1        return decode 'ISO-8859-15', $tmp;
765e5157e49Safresh1    };
766e5157e49Safresh1
767b39c5158Smillert=head1 Defining Encodings
768b39c5158Smillert
769b39c5158SmillertTo define a new encoding, use:
770b39c5158Smillert
771b39c5158Smillert    use Encode qw(define_encoding);
77248950c12Ssthen    define_encoding($object, CANONICAL_NAME [, alias...]);
773b39c5158Smillert
77448950c12SsthenI<CANONICAL_NAME> will be associated with I<$object>.  The object
775b39c5158Smillertshould provide the interface described in L<Encode::Encoding>.
77648950c12SsthenIf more than two arguments are provided, additional
77748950c12Ssthenarguments are considered aliases for I<$object>.
778b39c5158Smillert
77948950c12SsthenSee L<Encode::Encoding> for details.
780b39c5158Smillert
781b39c5158Smillert=head1 The UTF8 flag
782b39c5158Smillert
783*3d61058aSafresh1Before the introduction of Unicode support in Perl, the C<eq> operator
784b39c5158Smillertjust compared the strings represented by two scalars. Beginning with
78548950c12SsthenPerl 5.8, C<eq> compares two strings with simultaneous consideration of
78648950c12SsthenI<the UTF8 flag>. To explain why we made it so, I quote from page 402 of
78748950c12SsthenI<Programming Perl, 3rd ed.>
788b39c5158Smillert
789b39c5158Smillert=over 2
790b39c5158Smillert
791b39c5158Smillert=item Goal #1:
792b39c5158Smillert
793b39c5158SmillertOld byte-oriented programs should not spontaneously break on the old
794b39c5158Smillertbyte-oriented data they used to work on.
795b39c5158Smillert
796b39c5158Smillert=item Goal #2:
797b39c5158Smillert
798b39c5158SmillertOld byte-oriented programs should magically start working on the new
799b39c5158Smillertcharacter-oriented data when appropriate.
800b39c5158Smillert
801b39c5158Smillert=item Goal #3:
802b39c5158Smillert
803b39c5158SmillertPrograms should run just as fast in the new character-oriented mode
804b39c5158Smillertas in the old byte-oriented mode.
805b39c5158Smillert
806b39c5158Smillert=item Goal #4:
807b39c5158Smillert
808b39c5158SmillertPerl should remain one language, rather than forking into a
809b39c5158Smillertbyte-oriented Perl and a character-oriented Perl.
810b39c5158Smillert
811b39c5158Smillert=back
812b39c5158Smillert
81348950c12SsthenWhen I<Programming Perl, 3rd ed.> was written, not even Perl 5.6.0 had been
81448950c12Ssthenborn yet, and many features documented in the book remained unimplemented for a
81548950c12Ssthenlong time.  Perl 5.8 corrected much of this, and the introduction of the
81648950c12SsthenUTF8 flag is one of them.  You can think of there being two fundamentally
81748950c12Ssthendifferent kinds of strings and string-operations in Perl: one a
81848950c12Ssthenbyte-oriented mode  for when the internal UTF8 flag is off, and the other a
81948950c12Ssthencharacter-oriented mode for when the internal UTF8 flag is on.
820b39c5158Smillert
82148950c12SsthenThis UTF8 flag is not visible in Perl scripts, exactly for the same reason
82248950c12Ssthenyou cannot (or rather, you I<don't have to>) see whether a scalar contains
82348950c12Ssthena string, an integer, or a floating-point number.   But you can still peek
82448950c12Ssthenand poke these if you will.  See the next section.
825b39c5158Smillert
826b39c5158Smillert=head2 Messing with Perl's Internals
827b39c5158Smillert
828b39c5158SmillertThe following API uses parts of Perl's internals in the current
82948950c12Ssthenimplementation.  As such, they are efficient but may change in a future
83048950c12Ssthenrelease.
831b39c5158Smillert
832e9ce3842Safresh1=head3 is_utf8
833b39c5158Smillert
834e9ce3842Safresh1  is_utf8(STRING [, CHECK])
835b39c5158Smillert
83648950c12Ssthen[INTERNAL] Tests whether the UTF8 flag is turned on in the I<STRING>.
83748950c12SsthenIf I<CHECK> is true, also checks whether I<STRING> contains well-formed
838b39c5158SmillertUTF-8.  Returns true if successful, false otherwise.
839b39c5158Smillert
8409f11ffb7Safresh1Typically only necessary for debugging and testing.  Don't use this flag as
8419f11ffb7Safresh1a marker to distinguish character and binary data; that should be decided
8429f11ffb7Safresh1for each variable when you write your code.
8439f11ffb7Safresh1
8449f11ffb7Safresh1B<CAVEAT>: If I<STRING> has UTF8 flag set, it does B<NOT> mean that
8459f11ffb7Safresh1I<STRING> is UTF-8 encoded and vice-versa.
8469f11ffb7Safresh1
84748950c12SsthenAs of Perl 5.8.1, L<utf8> also has the C<utf8::is_utf8> function.
848b39c5158Smillert
849e9ce3842Safresh1=head3 _utf8_on
850e9ce3842Safresh1
851e9ce3842Safresh1  _utf8_on(STRING)
852b39c5158Smillert
85348950c12Ssthen[INTERNAL] Turns the I<STRING>'s internal UTF8 flag B<on>.  The I<STRING>
85448950c12Ssthenis I<not> checked for containing only well-formed UTF-8.  Do not use this
85548950c12Ssthenunless you I<know with absolute certainty> that the STRING holds only
85648950c12Ssthenwell-formed UTF-8.  Returns the previous state of the UTF8 flag (so please
85748950c12Ssthendon't treat the return value as indicating success or failure), or C<undef>
85848950c12Ssthenif I<STRING> is not a string.
859b39c5158Smillert
86048950c12SsthenB<NOTE>: For security reasons, this function does not work on tainted values.
861b39c5158Smillert
862e9ce3842Safresh1=head3 _utf8_off
863e9ce3842Safresh1
864e9ce3842Safresh1  _utf8_off(STRING)
865b39c5158Smillert
86648950c12Ssthen[INTERNAL] Turns the I<STRING>'s internal UTF8 flag B<off>.  Do not use
86748950c12Ssthenfrivolously.  Returns the previous state of the UTF8 flag, or C<undef> if
86848950c12SsthenI<STRING> is not a string.  Do not treat the return value as indicative of
86948950c12Ssthensuccess or failure, because that isn't what it means: it is only the
87048950c12Ssthenprevious setting.
871b39c5158Smillert
87248950c12SsthenB<NOTE>: For security reasons, this function does not work on tainted values.
873b39c5158Smillert
874b39c5158Smillert=head1 UTF-8 vs. utf8 vs. UTF8
875b39c5158Smillert
876b39c5158Smillert  ....We now view strings not as sequences of bytes, but as sequences
877b39c5158Smillert  of numbers in the range 0 .. 2**32-1 (or in the case of 64-bit
878b39c5158Smillert  computers, 0 .. 2**64-1) -- Programming Perl, 3rd ed.
879b39c5158Smillert
88048950c12SsthenThat has historically been Perl's notion of UTF-8, as that is how UTF-8 was
88148950c12Ssthenfirst conceived by Ken Thompson when he invented it. However, thanks to
88248950c12Ssthenlater revisions to the applicable standards, official UTF-8 is now rather
88348950c12Ssthenstricter than that. For example, its range is much narrower (0 .. 0x10_FFFF
88448950c12Ssthento cover only 21 bits instead of 32 or 64 bits) and some sequences
88548950c12Ssthenare not allowed, like those used in surrogate pairs, the 32 non-character
88648950c12Ssthencode points 0xFDD0 .. 0xFDEF, the last two code points in I<any> plane
88748950c12Ssthen(0xI<XX>_FFFE and 0xI<XX>_FFFF), all non-shortest encodings, etc.
888b39c5158Smillert
88948950c12SsthenThe former default in which Perl would always use a loose interpretation of
89048950c12SsthenUTF-8 has now been overruled:
891b39c5158Smillert
892b39c5158Smillert  From: Larry Wall <larry@wall.org>
893b39c5158Smillert  Date: December 04, 2004 11:51:58 JST
894b39c5158Smillert  To: perl-unicode@perl.org
895b39c5158Smillert  Subject: Re: Make Encode.pm support the real UTF-8
896b39c5158Smillert  Message-Id: <20041204025158.GA28754@wall.org>
897b39c5158Smillert
898b39c5158Smillert  On Fri, Dec 03, 2004 at 10:12:12PM +0000, Tim Bunce wrote:
899b39c5158Smillert  : I've no problem with 'utf8' being perl's unrestricted uft8 encoding,
900b39c5158Smillert  : but "UTF-8" is the name of the standard and should give the
901b39c5158Smillert  : corresponding behaviour.
902b39c5158Smillert
903b39c5158Smillert  For what it's worth, that's how I've always kept them straight in my
904b39c5158Smillert  head.
905b39c5158Smillert
906b39c5158Smillert  Also for what it's worth, Perl 6 will mostly default to strict but
907b39c5158Smillert  make it easy to switch back to lax.
908b39c5158Smillert
909b39c5158Smillert  Larry
910b39c5158Smillert
91148950c12SsthenGot that?  As of Perl 5.8.7, B<"UTF-8"> means UTF-8 in its current
91248950c12Ssthensense, which is conservative and strict and security-conscious, whereas
91348950c12SsthenB<"utf8"> means UTF-8 in its former sense, which was liberal and loose and
91448950c12Ssthenlax.  C<Encode> version 2.10 or later thus groks this subtle but critically
91548950c12Ssthenimportant distinction between C<"UTF-8"> and C<"utf8">.
916b39c5158Smillert
917b39c5158Smillert  encode("utf8",  "\x{FFFF_FFFF}", 1); # okay
918b39c5158Smillert  encode("UTF-8", "\x{FFFF_FFFF}", 1); # croaks
919b39c5158Smillert
920eac174f2Safresh1This distinction is also important for decoding. In the following,
921eac174f2Safresh1C<$s> stores character U+200000, which exceeds UTF-8's allowed range.
922eac174f2Safresh1C<$s> thus stores an invalid Unicode code point:
923eac174f2Safresh1
924eac174f2Safresh1  $s = decode("utf8", "\xf8\x88\x80\x80\x80");
925eac174f2Safresh1
926eac174f2Safresh1C<"UTF-8">, by contrast, will either coerce the input to something valid:
927eac174f2Safresh1
928eac174f2Safresh1    $s = decode("UTF-8", "\xf8\x88\x80\x80\x80"); # U+FFFD
929eac174f2Safresh1
930eac174f2Safresh1.. or croak:
931eac174f2Safresh1
932eac174f2Safresh1    decode("UTF-8", "\xf8\x88\x80\x80\x80", FB_CROAK|LEAVE_SRC);
933eac174f2Safresh1
93448950c12SsthenIn the C<Encode> module, C<"UTF-8"> is actually a canonical name for
93548950c12SsthenC<"utf-8-strict">.  That hyphen between the C<"UTF"> and the C<"8"> is
93648950c12Ssthencritical; without it, C<Encode> goes "liberal" and (perhaps overly-)permissive:
937b39c5158Smillert
938b39c5158Smillert  find_encoding("UTF-8")->name # is 'utf-8-strict'
939b39c5158Smillert  find_encoding("utf-8")->name # ditto. names are case insensitive
940b39c5158Smillert  find_encoding("utf_8")->name # ditto. "_" are treated as "-"
941b39c5158Smillert  find_encoding("UTF8")->name  # is 'utf8'.
942b39c5158Smillert
94348950c12SsthenPerl's internal UTF8 flag is called "UTF8", without a hyphen. It indicates
94448950c12Ssthenwhether a string is internally encoded as "utf8", also without a hyphen.
945b39c5158Smillert
946b39c5158Smillert=head1 SEE ALSO
947b39c5158Smillert
948b39c5158SmillertL<Encode::Encoding>,
949b39c5158SmillertL<Encode::Supported>,
950b39c5158SmillertL<Encode::PerlIO>,
951b39c5158SmillertL<encoding>,
952b39c5158SmillertL<perlebcdic>,
953b39c5158SmillertL<perlfunc/open>,
954b39c5158SmillertL<perlunicode>, L<perluniintro>, L<perlunifaq>, L<perlunitut>,
955b39c5158SmillertL<utf8>,
956e9ce3842Safresh1the Perl Unicode Mailing List L<http://lists.perl.org/list/perl-unicode.html>
957b39c5158Smillert
958b39c5158Smillert=head1 MAINTAINER
959b39c5158Smillert
96048950c12SsthenThis project was originated by the late Nick Ing-Simmons and later
961e9ce3842Safresh1maintained by Dan Kogai I<< <dankogai@cpan.org> >>.  See AUTHORS
96248950c12Ssthenfor a full list of people involved.  For any questions, send mail to
96348950c12SsthenI<< <perl-unicode@perl.org> >> so that we can all share.
964b39c5158Smillert
96548950c12SsthenWhile Dan Kogai retains the copyright as a maintainer, credit
96648950c12Ssthenshould go to all those involved.  See AUTHORS for a list of those
96748950c12Ssthenwho submitted code to the project.
968b39c5158Smillert
969b39c5158Smillert=head1 COPYRIGHT
970b39c5158Smillert
971b8851fccSafresh1Copyright 2002-2014 Dan Kogai I<< <dankogai@cpan.org> >>.
972b39c5158Smillert
973b39c5158SmillertThis library is free software; you can redistribute it and/or modify
974b39c5158Smillertit under the same terms as Perl itself.
975b39c5158Smillert
976b39c5158Smillert=cut
977