cpan/Encode/Encode.pm

b39c5158Smillert#
*3d61058aSafresh1# $Id: Encode.pm,v 3.21 2024/02/25 22:17:32 dankogai Exp $
b39c5158Smillert#
b39c5158Smillertpackage Encode;
b39c5158Smillertuse strict;
b39c5158Smillertuse warnings;
# DEBUG is a boolean constant derived from the PERL_ENCODE_DEBUG environment
# variable; it gates the diagnostic warn() in encodings().
48950c12Ssthenuse constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG};
9f11ffb7Safresh1our $VERSION;
9f11ffb7Safresh1BEGIN {
    # Build $VERSION as "major.minor" from the digit groups of the revision
    # keyword string, then load the compiled part of the module through
    # XSLoader under that version.
*3d61058aSafresh1    $VERSION = sprintf "%d.%02d", q$Revision: 3.21 $ =~ /(\d+)/g;
9f11ffb7Safresh1    require XSLoader;
b39c5158Smillert    XSLoader::load( __PACKAGE__, $VERSION );
9f11ffb7Safresh1}
b39c5158Smillert
e5157e49Safresh1use Exporter 5.57 'import';
b39c5158Smillert
b46d8ef2Safresh1use Carp ();
9f11ffb7Safresh1our @CARP_NOT = qw(Encode::Encoder);
9f11ffb7Safresh1
b39c5158Smillert# Public, encouraged API is exported by default
b39c5158Smillert
# Names exported into the caller's namespace by default.
b39c5158Smillertour @EXPORT = qw(
b39c5158Smillert  decode  decode_utf8  encode  encode_utf8 str2bytes bytes2str
9f11ffb7Safresh1  encodings  find_encoding find_mime_encoding clone_encoding
b39c5158Smillert);
# Names of the individual fallback flags (exportable on request, see
# @EXPORT_OK below).
b39c5158Smillertour @FB_FLAGS = qw(
b39c5158Smillert  DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC
b39c5158Smillert  PERLQQ HTMLCREF XMLCREF STOP_AT_PARTIAL
b39c5158Smillert);
# Names of the FB_* fallback constants (exportable on request).
b39c5158Smillertour @FB_CONSTS = qw(
b39c5158Smillert  FB_DEFAULT FB_CROAK FB_QUIET FB_WARN
b39c5158Smillert  FB_PERLQQ FB_HTMLCREF FB_XMLCREF
b39c5158Smillert);
# Names exported only when explicitly requested: utility functions plus
# every fallback flag and constant listed above.
b39c5158Smillertour @EXPORT_OK = (
b39c5158Smillert    qw(
b39c5158Smillert      _utf8_off _utf8_on define_encoding from_to is_16bit is_8bit
b39c5158Smillert      is_utf8 perlio_ok resolve_alias utf8_downgrade utf8_upgrade
b39c5158Smillert      ),
b39c5158Smillert    @FB_FLAGS, @FB_CONSTS,
b39c5158Smillert);
b39c5158Smillert
# Export tags:
#   :all          - default exports plus everything in @EXPORT_OK
#   :default      - same list as @EXPORT
#   :fallbacks    - the FB_* constants
#   :fallback_all - the FB_* constants plus the individual flags
b39c5158Smillertour %EXPORT_TAGS = (
b39c5158Smillert    all          => [ @EXPORT,    @EXPORT_OK ],
b39c5158Smillert    default      => [ @EXPORT ],
b39c5158Smillert    fallbacks    => [ @FB_CONSTS ],
b39c5158Smillert    fallback_all => [ @FB_CONSTS, @FB_FLAGS ],
b39c5158Smillert);
b39c5158Smillert
b39c5158Smillert# Documentation moved after __END__ for speed - NI-S
b39c5158Smillert
b39c5158Smillertour $ON_EBCDIC = ( ord("A") == 193 );
b39c5158Smillert
9f11ffb7Safresh1use Encode::Alias ();
9f11ffb7Safresh1use Encode::MIME::Name;
9f11ffb7Safresh1
9f11ffb7Safresh1use Storable;
b39c5158Smillert
b39c5158Smillert# Make a %Encoding package variable to allow a certain amount of cheating
# %Encoding maps an encoding name to its encoding object (filled in by
# define_encoding).  %ExtModule maps an encoding name to the module that
# getEncoding loads on demand; presumably populated by Encode::Config -- confirm.
b39c5158Smillertour %Encoding;
b39c5158Smillertour %ExtModule;
b39c5158Smillertrequire Encode::Config;
b39c5158Smillert#  See
b39c5158Smillert#  https://bugzilla.redhat.com/show_bug.cgi?id=435505#c2
e5157e49Safresh1#  to find why sig handlers inside eval{} are disabled.
# Optionally load site-local configuration.  The eval result and $@ are not
# inspected, so a missing or failing Encode::ConfigLocal is silently ignored.
b39c5158Smillerteval {
b39c5158Smillert    local $SIG{__DIE__};
b39c5158Smillert    local $SIG{__WARN__};
    # Work on a private copy of @INC and drop a trailing '.' so the module
    # is not picked up from the current directory.
c50a90c5Safresh1    local @INC = @INC;
c50a90c5Safresh1    pop @INC if @INC && $INC[-1] eq '.';
b39c5158Smillert    require Encode::ConfigLocal;
b39c5158Smillert};
b39c5158Smillert
# encodings([CLASS,] [":all" | MODULE ...])
#
# Returns the list of known encoding names, sorted case-insensitively, with
# the names "Internal", "Unicode" and "Guess" filtered out.
#   - If the second argument is ":all", names from both %Encoding and
#     %ExtModule are returned.
#   - Otherwise the names in %Encoding are returned, plus every %ExtModule
#     entry whose module matches one of the arguments.  An argument without
#     "::" is prefixed with "Encode::".
b39c5158Smillertsub encodings {
b39c5158Smillert    my %enc;
    # $_[1] is the first real argument when called as Encode->encodings(...).
e9ce3842Safresh1    my $arg  = $_[1] || '';
e9ce3842Safresh1    if ( $arg eq ":all" ) {
b39c5158Smillert        %enc = ( %Encoding, %ExtModule );
b39c5158Smillert    }
b39c5158Smillert    else {
b39c5158Smillert        %enc = %Encoding;
        # NOTE(review): the loop walks all of @_, so the invocant in $_[0]
        # is mapped to a module name too (e.g. "Encode" -> "Encode::Encode");
        # that only matters if %ExtModule holds such a value.
48950c12Ssthen        for my $mod ( map { m/::/ ? $_ : "Encode::$_" } @_ ) {
b39c5158Smillert            DEBUG and warn $mod;
b39c5158Smillert            for my $enc ( keys %ExtModule ) {
b39c5158Smillert                $ExtModule{$enc} eq $mod and $enc{$enc} = $mod;
b39c5158Smillert            }
b39c5158Smillert        }
b39c5158Smillert    }
b39c5158Smillert    return sort { lc $a cmp lc $b }
b39c5158Smillert      grep      { !/^(?:Internal|Unicode|Guess)$/o } keys %enc;
b39c5158Smillert}
b39c5158Smillert
# perlio_ok(ENCODING)
#
# ENCODING is either an encoding object (any reference) or a name that is
# resolved through find_encoding().  Returns the result of the object's own
# perlio_ok() method when it has one, otherwise 0.
b39c5158Smillertsub perlio_ok {
b39c5158Smillert    my $obj = ref( $_[0] ) ? $_[0] : find_encoding( $_[0] );
    # NOTE(review): find_encoding() returns nothing for an unknown name, and
    # calling ->can on an undefined value dies -- confirm callers only pass
    # resolvable names.
b39c5158Smillert    $obj->can("perlio_ok") and return $obj->perlio_ok();
b39c5158Smillert    return 0;    # safety net
b39c5158Smillert}
b39c5158Smillert
# define_encoding(OBJ, NAME [, ALIAS ...])
#
# Registers OBJ in %Encoding under NAME, defines the lower-cased NAME as an
# alias when it differs from NAME, and defines every further argument as an
# alias for OBJ.  Returns OBJ.
b39c5158Smillertsub define_encoding {
b39c5158Smillert    my $obj  = shift;
b39c5158Smillert    my $name = shift;
b39c5158Smillert    $Encoding{$name} = $obj;
b39c5158Smillert    my $lc = lc($name);
b39c5158Smillert    define_alias( $lc => $obj ) unless $lc eq $name;
b39c5158Smillert    while (@_) {
b39c5158Smillert        my $alias = shift;
b39c5158Smillert        define_alias( $alias, $obj );
b39c5158Smillert    }
    # Add the object's class to both CARP_NOT lists (once each) so that Carp
    # does not report errors as originating from inside that class.
9f11ffb7Safresh1    my $class = ref($obj);
9f11ffb7Safresh1    push @Encode::CARP_NOT, $class unless grep { $_ eq $class } @Encode::CARP_NOT;
9f11ffb7Safresh1    push @Encode::Encoding::CARP_NOT, $class unless grep { $_ eq $class } @Encode::Encoding::CARP_NOT;
b39c5158Smillert    return $obj;
b39c5158Smillert}
b39c5158Smillert
# CLASS->getEncoding(NAME [, SKIP_EXTERNAL])
#
# Resolves NAME to an encoding object and returns it, or returns nothing
# (bare return) when NAME is undefined or cannot be resolved.  Lookup order:
#   1. NAME itself, if it is a reference whose class can('renew');
#   2. %Encoding under NAME, then under lc(NAME);
#   3. CLASS->find_alias for NAME, then for lc(NAME);
#   4. unless SKIP_EXTERNAL is true, the module listed in %ExtModule for
#      NAME or lc(NAME) is loaded and %Encoding is consulted again.
b39c5158Smillertsub getEncoding {
b39c5158Smillert    my ( $class, $name, $skip_external ) = @_;
b39c5158Smillert
9f11ffb7Safresh1    defined($name) or return;
9f11ffb7Safresh1
    # Strip all whitespace from the requested name (see the ticket below).
48950c12Ssthen    $name =~ s/\s+//g; # https://rt.cpan.org/Ticket/Display.html?id=65796
48950c12Ssthen
b39c5158Smillert    ref($name) && $name->can('renew') and return $name;
b39c5158Smillert    exists $Encoding{$name} and return $Encoding{$name};
b39c5158Smillert    my $lc = lc $name;
b39c5158Smillert    exists $Encoding{$lc} and return $Encoding{$lc};
b39c5158Smillert
b39c5158Smillert    my $oc = $class->find_alias($name);
b39c5158Smillert    defined($oc) and return $oc;
b39c5158Smillert    $lc ne $name and $oc = $class->find_alias($lc);
b39c5158Smillert    defined($oc) and return $oc;
b39c5158Smillert
b39c5158Smillert    unless ($skip_external) {
b39c5158Smillert        if ( my $mod = $ExtModule{$name} || $ExtModule{$lc} ) {
            # Turn "Foo::Bar" into "Foo/Bar.pm" so require treats it as a
            # file path; a load failure is swallowed by the eval.
b39c5158Smillert            $mod =~ s,::,/,g;
b39c5158Smillert            $mod .= '.pm';
b39c5158Smillert            eval { require $mod; };
            # NOTE(review): only $Encoding{$name} is rechecked here, not
            # $Encoding{$lc}, even when the module was found through
            # $ExtModule{$lc} -- confirm this is intended.
b39c5158Smillert            exists $Encoding{$name} and return $Encoding{$name};
b39c5158Smillert        }
b39c5158Smillert    }
b39c5158Smillert    return;
b39c5158Smillert}
b39c5158Smillert
9f11ffb7Safresh1# HACK: These two functions must be defined in Encode and because of
9f11ffb7Safresh1# cyclic dependency between Encode and Encode::Alias, Exporter does not work
# Both are thin trampolines: "goto &sub" hands the current @_ unchanged to
# the Encode::Alias implementation and replaces this call frame.
9f11ffb7Safresh1sub find_alias {
9f11ffb7Safresh1    goto &Encode::Alias::find_alias;
9f11ffb7Safresh1}
9f11ffb7Safresh1sub define_alias {
9f11ffb7Safresh1    goto &Encode::Alias::define_alias;
9f11ffb7Safresh1}
9f11ffb7Safresh1
# find_encoding(NAME [, SKIP_EXTERNAL])
#
# Function-style wrapper around Encode->getEncoding(); returns whatever
# getEncoding returns (an encoding object, or nothing when unresolved).
b39c5158Smillertsub find_encoding($;$) {
b39c5158Smillert    my ( $name, $skip_external ) = @_;
b39c5158Smillert    return __PACKAGE__->getEncoding( $name, $skip_external );
b39c5158Smillert}
b39c5158Smillert
# find_mime_encoding(MIME_NAME [, SKIP_EXTERNAL])
#
# Translates MIME_NAME to an Encode name with
# Encode::MIME::Name::get_encode_name() and looks the result up with
# find_encoding().
# NOTE(review): presumably get_encode_name() returns undef for an unknown
# MIME name, in which case find_encoding() returns nothing -- confirm.
9f11ffb7Safresh1sub find_mime_encoding($;$) {
9f11ffb7Safresh1    my ( $mime_name, $skip_external ) = @_;
9f11ffb7Safresh1    my $name = Encode::MIME::Name::get_encode_name( $mime_name );
9f11ffb7Safresh1    return find_encoding( $name, $skip_external );
9f11ffb7Safresh1}
9f11ffb7Safresh1
# resolve_alias(NAME)
#
# Returns the name() of the encoding object that NAME resolves to, or
# nothing (bare return) when find_encoding() does not find one.
b39c5158Smillertsub resolve_alias($) {
b39c5158Smillert    my $obj = find_encoding(shift);
b39c5158Smillert    defined $obj and return $obj->name;
b39c5158Smillert    return;
b39c5158Smillert}
b39c5158Smillert
# clone_encoding(NAME)
#
# Returns a deep copy (Storable::dclone) of the encoding object that NAME
# resolves to, or nothing (bare return) when the lookup does not yield a
# reference.
b39c5158Smillertsub clone_encoding($) {
b39c5158Smillert    my $obj = find_encoding(shift);
b39c5158Smillert    ref $obj or return;
b39c5158Smillert    return Storable::dclone($obj);
b39c5158Smillert}
b39c5158Smillert
# NOTE(review): onBOOT is not defined in the Perl code visible here;
# presumably it is supplied by the XS code loaded in the BEGIN block -- confirm.
9f11ffb7Safresh1onBOOT;
b39c5158Smillert
# On EBCDIC platforms only, register a pure-Perl "Unicode" encoding object of
# class Encode::UTF_EBCDIC.  The package, "use parent" and sub declarations
# below take effect at compile time regardless of the condition; only the
# bless/define_encoding statements are conditional at run time.
b39c5158Smillertif ($ON_EBCDIC) {
b39c5158Smillert    package Encode::UTF_EBCDIC;
9f11ffb7Safresh1    use parent 'Encode::Encoding';
9f11ffb7Safresh1    my $obj = bless { Name => "UTF_EBCDIC" } => "Encode::UTF_EBCDIC";
9f11ffb7Safresh1    Encode::define_encoding($obj, 'Unicode');
    # decode(OBJ, STR, CHK): returns a new string in which every character
    # ordinal of STR has been mapped through utf8::unicode_to_native().
9f11ffb7Safresh1    sub decode {
e9ce3842Safresh1        my ( undef, $str, $chk ) = @_;
b39c5158Smillert        my $res = '';
b39c5158Smillert        for ( my $i = 0 ; $i < length($str) ; $i++ ) {
b39c5158Smillert            $res .=
b39c5158Smillert              chr(
b39c5158Smillert                utf8::unicode_to_native( ord( substr( $str, $i, 1 ) ) )
b39c5158Smillert              );
b39c5158Smillert        }
        # $_[1] aliases the caller's variable: a true CHK empties the source.
b39c5158Smillert        $_[1] = '' if $chk;
b39c5158Smillert        return $res;
9f11ffb7Safresh1    }
    # encode(OBJ, STR, CHK): the inverse mapping, using
    # utf8::native_to_unicode() on every character ordinal of STR.
9f11ffb7Safresh1    sub encode {
e9ce3842Safresh1        my ( undef, $str, $chk ) = @_;
b39c5158Smillert        my $res = '';
b39c5158Smillert        for ( my $i = 0 ; $i < length($str) ; $i++ ) {
b39c5158Smillert            $res .=
b39c5158Smillert              chr(
b39c5158Smillert                utf8::native_to_unicode( ord( substr( $str, $i, 1 ) ) )
b39c5158Smillert              );
b39c5158Smillert        }
        # As in decode: a true CHK empties the caller's source string.
b39c5158Smillert        $_[1] = '' if $chk;
b39c5158Smillert        return $res;
b39c5158Smillert    }
9f11ffb7Safresh1}
9f11ffb7Safresh1
# Declare Encode::XS as a subclass of Encode::Encoding; no methods are
# defined here in Perl (see the ticket below for the background).
b8851fccSafresh1{
b8851fccSafresh1    # https://rt.cpan.org/Public/Bug/Display.html?id=103253
b8851fccSafresh1    package Encode::XS;
9f11ffb7Safresh1    use parent 'Encode::Encoding';
b8851fccSafresh1}
9f11ffb7Safresh1
# Encode::utf8: registers two encoding objects of this class, "utf8" and
# "utf-8-strict" (the latter carries strict_utf8 => 1), and provides
# cat_decode() in Perl.
b39c5158Smillert{
b39c5158Smillert    package Encode::utf8;
9f11ffb7Safresh1    use parent 'Encode::Encoding';
9f11ffb7Safresh1    my %obj = (
9f11ffb7Safresh1        'utf8'         => { Name => 'utf8' },
9f11ffb7Safresh1        'utf-8-strict' => { Name => 'utf-8-strict', strict_utf8 => 1 }
9f11ffb7Safresh1    );
    # Bless each hash into this package and register it under its key.
9f11ffb7Safresh1    for ( keys %obj ) {
9f11ffb7Safresh1        bless $obj{$_} => __PACKAGE__;
9f11ffb7Safresh1        Encode::define_encoding( $obj{$_} => $_ );
b39c5158Smillert    }
    # cat_decode(OBJ, DST, SRC, POS, TRM [, CHK])
    #
    # Searches SRC for the terminator TRM starting at offset POS, using byte
    # semantics.  When found, appends SRC from POS through the end of the
    # terminator to DST, sets POS just past the terminator and returns 1.
    # Otherwise appends the rest of SRC to DST, sets POS to the length of
    # SRC and returns ''.  DST and POS are modified in the caller.
9f11ffb7Safresh1    sub cat_decode {
9f11ffb7Safresh1        # ($obj, $dst, $src, $pos, $trm, $chk)
b39c5158Smillert        # currently ignores $chk
e9ce3842Safresh1        my ( undef, undef, undef, $pos, $trm ) = @_;
        # References to the caller's DST, SRC and POS arguments, so that
        # writes through them are visible to the caller.
b39c5158Smillert        my ( $rdst, $rsrc, $rpos ) = \@_[ 1, 2, 3 ];
        # Byte semantics for index/substr/length in the rest of this sub.
b39c5158Smillert        use bytes;
b39c5158Smillert        if ( ( my $npos = index( $$rsrc, $trm, $pos ) ) >= 0 ) {
b39c5158Smillert            $$rdst .=
b39c5158Smillert              substr( $$rsrc, $pos, $npos - $pos + length($trm) );
b39c5158Smillert            $$rpos = $npos + length($trm);
b39c5158Smillert            return 1;
b39c5158Smillert        }
b39c5158Smillert        $$rdst .= substr( $$rsrc, $pos );
b39c5158Smillert        $$rpos = length($$rsrc);
b39c5158Smillert        return '';
b39c5158Smillert    }
b39c5158Smillert}
b39c5158Smillert
b39c5158Smillert1;
b39c5158Smillert
b39c5158Smillert__END__
b39c5158Smillert
b39c5158Smillert=head1 NAME
b39c5158Smillert
48950c12SsthenEncode - character encodings in Perl
b39c5158Smillert
b39c5158Smillert=head1 SYNOPSIS
b39c5158Smillert
e9ce3842Safresh1    use Encode qw(decode encode);
e9ce3842Safresh1    $characters = decode('UTF-8', $octets,     Encode::FB_CROAK);
e9ce3842Safresh1    $octets     = encode('UTF-8', $characters, Encode::FB_CROAK);
b39c5158Smillert
b39c5158Smillert=head2 Table of Contents
b39c5158Smillert
48950c12SsthenEncode consists of a collection of modules whose details are too extensive
48950c12Ssthento fit in one document.  This one itself explains the top-level APIs
b39c5158Smillertand general topics at a glance.  For other topics and more details,
48950c12Ssthensee the documentation for these modules:
b39c5158Smillert
e9ce3842Safresh1=over 2
e9ce3842Safresh1
e9ce3842Safresh1=item L<Encode::Alias> - Alias definitions to encodings
e9ce3842Safresh1
e9ce3842Safresh1=item L<Encode::Encoding> - Encode Implementation Base Class
e9ce3842Safresh1
e9ce3842Safresh1=item L<Encode::Supported> - List of Supported Encodings
e9ce3842Safresh1
e9ce3842Safresh1=item L<Encode::CN> - Simplified Chinese Encodings
e9ce3842Safresh1
e9ce3842Safresh1=item L<Encode::JP> - Japanese Encodings
e9ce3842Safresh1
e9ce3842Safresh1=item L<Encode::KR> - Korean Encodings
e9ce3842Safresh1
e9ce3842Safresh1=item L<Encode::TW> - Traditional Chinese Encodings
e9ce3842Safresh1
e9ce3842Safresh1=back
b39c5158Smillert
b39c5158Smillert=head1 DESCRIPTION
b39c5158Smillert
48950c12SsthenThe C<Encode> module provides the interface between Perl strings
b39c5158Smillertand the rest of the system.  Perl strings are sequences of
48950c12SsthenI<characters>.
b39c5158Smillert
48950c12SsthenThe repertoire of characters that Perl can represent is a superset of those
b39c5158Smillertdefined by the Unicode Consortium. On most platforms the ordinal
48950c12Ssthenvalue of a character as returned by C<ord(I<S>)> is the I<Unicode
48950c12Ssthencodepoint> for that character. The exceptions are platforms where
48950c12Ssthenthe legacy encoding is some variant of EBCDIC rather than a superset
48950c12Ssthenof ASCII; see L<perlebcdic>.
b39c5158Smillert
48950c12SsthenDuring recent history, data is moved around a computer in 8-bit chunks,
48950c12Ssthenoften called "bytes" but also known as "octets" in standards documents.
48950c12SsthenPerl is widely used to manipulate data of many types: not only strings of
48950c12Ssthencharacters representing human or computer languages, but also "binary"
48950c12Ssthendata, being the machine's representation of numbers, pixels in an image, or
48950c12Ssthenjust about anything.
b39c5158Smillert
b39c5158SmillertWhen Perl is processing "binary data", the programmer wants Perl to
48950c12Ssthenprocess "sequences of bytes". This is not a problem for Perl: because a
b39c5158Smillertbyte has 256 possible values, it easily fits in Perl's much larger
b39c5158Smillert"logical character".
b39c5158Smillert
e9ce3842Safresh1This document mostly explains the I<how>. L<perlunitut> and L<perlunifaq>
e9ce3842Safresh1explain the I<why>.
e9ce3842Safresh1
b39c5158Smillert=head2 TERMINOLOGY
b39c5158Smillert
e9ce3842Safresh1=head3 character
b39c5158Smillert
e9ce3842Safresh1A character in the range 0 .. 2**32-1 (or more);
48950c12Ssthenwhat Perl's strings are made of.
b39c5158Smillert
e9ce3842Safresh1=head3 byte
b39c5158Smillert
e9ce3842Safresh1A character in the range 0..255;
e9ce3842Safresh1a special case of a Perl character.
b39c5158Smillert
e9ce3842Safresh1=head3 octet
b39c5158Smillert
e9ce3842Safresh18 bits of data, with ordinal values 0..255;
e9ce3842Safresh1term for bytes passed to or from a non-Perl context, such as a disk file,
e9ce3842Safresh1standard I/O stream, database, command-line argument, environment variable,
e9ce3842Safresh1socket etc.
b39c5158Smillert
48950c12Ssthen=head1 THE PERL ENCODING API
b39c5158Smillert
e9ce3842Safresh1=head2 Basic methods
b39c5158Smillert
e9ce3842Safresh1=head3 encode
e9ce3842Safresh1
e9ce3842Safresh1  $octets  = encode(ENCODING, STRING[, CHECK])
b39c5158Smillert
48950c12SsthenEncodes the scalar value I<STRING> from Perl's internal form into
48950c12SsthenI<ENCODING> and returns a sequence of octets.  I<ENCODING> can be either a
48950c12Ssthencanonical name or an alias.  For encoding names and aliases, see
48950c12SsthenL</"Defining Aliases">.  For CHECK, see L</"Handling Malformed Data">.
b39c5158Smillert
9f11ffb7Safresh1B<CAVEAT>: the input scalar I<STRING> might be modified in-place depending
9f11ffb7Safresh1on what is set in CHECK. See L</LEAVE_SRC> if you want your inputs to be
9f11ffb7Safresh1left unchanged.
9f11ffb7Safresh1
48950c12SsthenFor example, to convert a string from Perl's internal format into
48950c12SsthenISO-8859-1, also known as Latin1:
b39c5158Smillert
b39c5158Smillert  $octets = encode("iso-8859-1", $string);
b39c5158Smillert
9f11ffb7Safresh1B<CAVEAT>: When you run C<$octets = encode("UTF-8", $string)>, then
48950c12Ssthen$octets I<might not be equal to> $string.  Though both contain the
48950c12Ssthensame data, the UTF8 flag for $octets is I<always> off.  When you
48950c12Ssthenencode anything, the UTF8 flag on the result is always off, even when it
9f11ffb7Safresh1contains a completely valid UTF-8 string. See L</"The UTF8 flag"> below.
b39c5158Smillert
48950c12SsthenIf the $string is C<undef>, then C<undef> is returned.
b39c5158Smillert
9f11ffb7Safresh1C<str2bytes> may be used as an alias for C<encode>.
9f11ffb7Safresh1
e9ce3842Safresh1=head3 decode
e9ce3842Safresh1
e9ce3842Safresh1  $string = decode(ENCODING, OCTETS[, CHECK])
b39c5158Smillert
48950c12SsthenThis function returns the string that results from decoding the scalar
48950c12Ssthenvalue I<OCTETS>, assumed to be a sequence of octets in I<ENCODING>, into
b8851fccSafresh1Perl's internal form.  As with encode(),
48950c12SsthenI<ENCODING> can be either a canonical name or an alias. For encoding names
48950c12Ssthenand aliases, see L</"Defining Aliases">; for I<CHECK>, see L</"Handling
48950c12SsthenMalformed Data">.
b39c5158Smillert
9f11ffb7Safresh1B<CAVEAT>: the input scalar I<OCTETS> might be modified in-place depending
9f11ffb7Safresh1on what is set in CHECK. See L</LEAVE_SRC> if you want your inputs to be
9f11ffb7Safresh1left unchanged.
9f11ffb7Safresh1
48950c12SsthenFor example, to convert ISO-8859-1 data into a string in Perl's
48950c12Sstheninternal format:
b39c5158Smillert
b39c5158Smillert  $string = decode("iso-8859-1", $octets);
b39c5158Smillert
9f11ffb7Safresh1B<CAVEAT>: When you run C<$string = decode("UTF-8", $octets)>, then $string
48950c12SsthenI<might not be equal to> $octets.  Though both contain the same data, the
e5157e49Safresh1UTF8 flag for $string is on.  See L</"The UTF8 flag">
b39c5158Smillertbelow.
b39c5158Smillert
48950c12SsthenIf the $string is C<undef>, then C<undef> is returned.
b39c5158Smillert
9f11ffb7Safresh1C<bytes2str> may be used as an alias for C<decode>.
9f11ffb7Safresh1
e9ce3842Safresh1=head3 find_encoding
e9ce3842Safresh1
e9ce3842Safresh1  [$obj =] find_encoding(ENCODING)
b39c5158Smillert
48950c12SsthenReturns the I<encoding object> corresponding to I<ENCODING>.  Returns
48950c12SsthenC<undef> if no matching I<ENCODING> is found.  The returned object is
48950c12Ssthenwhat does the actual encoding or decoding.
b39c5158Smillert
9f11ffb7Safresh1  $string = decode($name, $bytes);
b39c5158Smillert
b39c5158Smillertis in fact
b39c5158Smillert
9f11ffb7Safresh1    $string = do {
b39c5158Smillert        $obj = find_encoding($name);
b39c5158Smillert        croak qq(encoding "$name" not found) unless ref $obj;
48950c12Ssthen        $obj->decode($bytes);
b39c5158Smillert    };
b39c5158Smillert
b39c5158Smillertwith more error checking.
b39c5158Smillert
48950c12SsthenYou can therefore save time by reusing this object as follows:
b39c5158Smillert
b39c5158Smillert    my $enc = find_encoding("iso-8859-1");
b39c5158Smillert    while(<>) {
9f11ffb7Safresh1        my $string = $enc->decode($_);
9f11ffb7Safresh1        ... # now do something with $string;
b39c5158Smillert    }
b39c5158Smillert
e9ce3842Safresh1Besides L</decode> and L</encode>, other methods are
e9ce3842Safresh1available as well.  For instance, C<name()> returns the canonical
b39c5158Smillertname of the encoding object.
b39c5158Smillert
b39c5158Smillert  find_encoding("latin1")->name; # iso-8859-1
b39c5158Smillert
b39c5158SmillertSee L<Encode::Encoding> for details.
b39c5158Smillert
9f11ffb7Safresh1=head3 find_mime_encoding
9f11ffb7Safresh1
9f11ffb7Safresh1  [$obj =] find_mime_encoding(MIME_ENCODING)
9f11ffb7Safresh1
9f11ffb7Safresh1Returns the I<encoding object> corresponding to I<MIME_ENCODING>.  Acts
9f11ffb7Safresh1the same as C<find_encoding()>, except that the C<mime_name()> of the
9f11ffb7Safresh1returned object must match I<MIME_ENCODING>.  So, unlike C<find_encoding()>,
9f11ffb7Safresh1canonical names and aliases are not used when searching for the object.
9f11ffb7Safresh1
*3d61058aSafresh1    find_mime_encoding("utf8"); # returns undef because "utf8" is not a valid MIME_ENCODING
9f11ffb7Safresh1    find_mime_encoding("utf-8"); # returns encode object "utf-8-strict"
*3d61058aSafresh1    find_mime_encoding("UTF-8"); # same as "utf-8" because MIME_ENCODING is case insensitive
*3d61058aSafresh1    find_mime_encoding("utf-8-strict"); # returns undef because "utf-8-strict" is not a valid MIME_ENCODING
9f11ffb7Safresh1
e9ce3842Safresh1=head3 from_to
e9ce3842Safresh1
e9ce3842Safresh1  [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])
b39c5158Smillert
48950c12SsthenConverts I<in-place> data between two encodings. The data in $octets
48950c12Ssthenmust be encoded as octets and I<not> as characters in Perl's internal
48950c12Ssthenformat. For example, to convert ISO-8859-1 data into Microsoft's CP1250
b39c5158Smillertencoding:
b39c5158Smillert
b39c5158Smillert  from_to($octets, "iso-8859-1", "cp1250");
b39c5158Smillert
b39c5158Smillertand to convert it back:
b39c5158Smillert
b39c5158Smillert  from_to($octets, "cp1250", "iso-8859-1");
b39c5158Smillert
48950c12SsthenBecause the conversion happens in place, the data to be
48950c12Ssthenconverted cannot be a string constant: it must be a scalar variable.
b39c5158Smillert
e9ce3842Safresh1C<from_to()> returns the length of the converted string in octets on success,
48950c12Ssthenand C<undef> on error.
b39c5158Smillert
48950c12SsthenB<CAVEAT>: The following operations may look the same, but are not:
b39c5158Smillert
9f11ffb7Safresh1  from_to($data, "iso-8859-1", "UTF-8"); #1
b39c5158Smillert  $data = decode("iso-8859-1", $data);  #2
b39c5158Smillert
48950c12SsthenBoth #1 and #2 make $data consist of a completely valid UTF-8 string,
48950c12Ssthenbut only #2 turns the UTF8 flag on.  #1 is equivalent to:
b39c5158Smillert
9f11ffb7Safresh1  $data = encode("UTF-8", decode("iso-8859-1", $data));
b39c5158Smillert
b39c5158SmillertSee L</"The UTF8 flag"> below.
b39c5158Smillert
48950c12SsthenAlso note that:
b39c5158Smillert
b39c5158Smillert  from_to($octets, $from, $to, $check);
b39c5158Smillert
b8851fccSafresh1is equivalent to:
b39c5158Smillert
b39c5158Smillert  $octets = encode($to, decode($from, $octets), $check);
b39c5158Smillert
48950c12SsthenYes, it does I<not> respect the $check during decoding.  It is
48950c12Ssthendeliberately done that way.  If you need minute control, use C<decode>
48950c12Ssthenfollowed by C<encode> as follows:
b39c5158Smillert
b39c5158Smillert  $octets = encode($to, decode($from, $octets, $check_from), $check_to);
b39c5158Smillert
e9ce3842Safresh1=head3 encode_utf8
e9ce3842Safresh1
e9ce3842Safresh1  $octets = encode_utf8($string);
b39c5158Smillert
eac174f2Safresh1B<WARNING>: L<This function can produce invalid UTF-8!|/UTF-8 vs. utf8 vs. UTF8>
eac174f2Safresh1Do not use it for data exchange.
eac174f2Safresh1Unless you want Perl's older "lax" mode, prefer
eac174f2Safresh1C<$octets = encode("UTF-8", $string)>.
eac174f2Safresh1
48950c12SsthenEquivalent to C<$octets = encode("utf8", $string)>.  The characters in
48950c12Ssthen$string are encoded in Perl's internal format, and the result is returned
48950c12Ssthenas a sequence of octets.  Because all possible characters in Perl have a
9f11ffb7Safresh1(loose, not strict) utf8 representation, this function cannot fail.
9f11ffb7Safresh1
e9ce3842Safresh1=head3 decode_utf8
e9ce3842Safresh1
e9ce3842Safresh1  $string = decode_utf8($octets [, CHECK]);
b39c5158Smillert
eac174f2Safresh1B<WARNING>: L<This function accepts invalid UTF-8!|/UTF-8 vs. utf8 vs. UTF8>
eac174f2Safresh1Do not use it for data exchange.
eac174f2Safresh1Unless you want Perl's older "lax" mode, prefer
eac174f2Safresh1C<$string = decode("UTF-8", $octets [, CHECK])>.
eac174f2Safresh1
48950c12SsthenEquivalent to C<$string = decode("utf8", $octets [, CHECK])>.
48950c12SsthenThe sequence of octets represented by $octets is decoded
9f11ffb7Safresh1from (loose, not strict) utf8 into a sequence of logical characters.
9f11ffb7Safresh1Because not all sequences of octets are valid (even loose, not strict) utf8,
48950c12Ssthenit is quite possible for this function to fail.
48950c12SsthenFor CHECK, see L</"Handling Malformed Data">.
b39c5158Smillert
9f11ffb7Safresh1B<CAVEAT>: the input I<$octets> might be modified in-place depending on
9f11ffb7Safresh1what is set in CHECK. See L</LEAVE_SRC> if you want your inputs to be
9f11ffb7Safresh1left unchanged.
9f11ffb7Safresh1
b39c5158Smillert=head2 Listing available encodings
b39c5158Smillert
b39c5158Smillert  use Encode;
b39c5158Smillert  @list = Encode->encodings();
b39c5158Smillert
48950c12SsthenReturns a list of canonical names of available encodings that have already
48950c12Ssthenbeen loaded.  To get a list of all available encodings including those that
48950c12Ssthenhave not yet been loaded, say:
b39c5158Smillert
b39c5158Smillert  @all_encodings = Encode->encodings(":all");
b39c5158Smillert
48950c12SsthenOr you can give the name of a specific module:
b39c5158Smillert
b39c5158Smillert  @with_jp = Encode->encodings("Encode::JP");
b39c5158Smillert
48950c12SsthenWhen "C<::>" is not in the name, "C<Encode::>" is assumed.
b39c5158Smillert
b39c5158Smillert  @ebcdic = Encode->encodings("EBCDIC");
b39c5158Smillert
b39c5158SmillertTo find out in detail which encodings are supported by this package,
b39c5158Smillertsee L<Encode::Supported>.
b39c5158Smillert
b39c5158Smillert=head2 Defining Aliases
b39c5158Smillert
b39c5158SmillertTo add a new alias to a given encoding, use:
b39c5158Smillert
b39c5158Smillert  use Encode;
b39c5158Smillert  use Encode::Alias;
48950c12Ssthen  define_alias(NEWNAME => ENCODING);
b39c5158Smillert
48950c12SsthenAfter that, I<NEWNAME> can be used as an alias for I<ENCODING>.
e9ce3842Safresh1I<ENCODING> may be either the name of an encoding or an
48950c12SsthenI<encoding object>.
b39c5158Smillert
48950c12SsthenBefore you do that, first make sure the alias is nonexistent using
b39c5158SmillertC<resolve_alias()>, which returns the canonical name thereof.
48950c12SsthenFor example:
b39c5158Smillert
b39c5158Smillert  Encode::resolve_alias("latin1") eq "iso-8859-1" # true
b39c5158Smillert  Encode::resolve_alias("iso-8859-12")   # false; nonexistent
b39c5158Smillert  Encode::resolve_alias($name) eq $name  # true if $name is canonical
b39c5158Smillert
e9ce3842Safresh1C<resolve_alias()> does not need C<use Encode::Alias>; it can be
48950c12Ssthenimported via C<use Encode qw(resolve_alias)>.
b39c5158Smillert
b39c5158SmillertSee L<Encode::Alias> for details.
b39c5158Smillert
b39c5158Smillert=head2 Finding IANA Character Set Registry names
b39c5158Smillert
b39c5158SmillertThe canonical name of a given encoding does not necessarily agree with
48950c12SsthenIANA Character Set Registry, commonly seen as C<< Content-Type:
48950c12Ssthentext/plain; charset=I<WHATEVER> >>.  For most cases, the canonical name
48950c12Ssthenworks, but sometimes it does not, most notably with "utf-8-strict".
b39c5158Smillert
48950c12SsthenAs of C<Encode> version 2.21, a new method C<mime_name()> is therefore added.
b39c5158Smillert
b39c5158Smillert  use Encode;
48950c12Ssthen  my $enc = find_encoding("UTF-8");
b39c5158Smillert  warn $enc->name;      # utf-8-strict
b39c5158Smillert  warn $enc->mime_name; # UTF-8
b39c5158Smillert
b39c5158SmillertSee also:  L<Encode::Encoding>
b39c5158Smillert
b39c5158Smillert=head1 Encoding via PerlIO
b39c5158Smillert
48950c12SsthenIf your perl supports C<PerlIO> (which is the default), you can use a
48950c12SsthenC<PerlIO> layer to decode and encode directly via a filehandle.  The
48950c12Ssthenfollowing two examples are fully identical in functionality:
b39c5158Smillert
48950c12Ssthen  ### Version 1 via PerlIO
48950c12Ssthen    open(INPUT,  "< :encoding(shiftjis)", $infile)
48950c12Ssthen        || die "Can't open < $infile for reading: $!";
48950c12Ssthen    open(OUTPUT, "> :encoding(euc-jp)",  $outfile)
48950c12Ssthen        || die "Can't open > $outfile for writing: $!";
48950c12Ssthen    while (<INPUT>) {   # auto decodes $_
48950c12Ssthen        print OUTPUT;   # auto encodes $_
b39c5158Smillert    }
48950c12Ssthen    close(INPUT)   || die "can't close $infile: $!";
48950c12Ssthen    close(OUTPUT)  || die "can't close $outfile: $!";
b39c5158Smillert
48950c12Ssthen  ### Version 2 via from_to()
48950c12Ssthen    open(INPUT,  "< :raw", $infile)
48950c12Ssthen        || die "Can't open < $infile for reading: $!";
48950c12Ssthen    open(OUTPUT, "> :raw",  $outfile)
48950c12Ssthen        || die "Can't open > $outfile for writing: $!";
b39c5158Smillert
48950c12Ssthen    while (<INPUT>) {
48950c12Ssthen        from_to($_, "shiftjis", "euc-jp", 1);  # switch encoding
48950c12Ssthen        print OUTPUT;   # emit raw (but properly encoded) data
48950c12Ssthen    }
48950c12Ssthen    close(INPUT)   || die "can't close $infile: $!";
48950c12Ssthen    close(OUTPUT)  || die "can't close $outfile: $!";
b39c5158Smillert
48950c12SsthenIn the first version above, you let the appropriate encoding layer
48950c12Ssthenhandle the conversion.  In the second, you explicitly translate
48950c12Ssthenfrom one encoding to the other.
48950c12Ssthen
b8851fccSafresh1Unfortunately, it may be that encodings are not C<PerlIO>-savvy.  You can check
48950c12Ssthento see whether your encoding is supported by C<PerlIO> by invoking the
48950c12SsthenC<perlio_ok> method on it:
48950c12Ssthen
48950c12Ssthen  Encode::perlio_ok("hz");             # false
48950c12Ssthen  find_encoding("euc-cn")->perlio_ok;  # true wherever PerlIO is available
48950c12Ssthen
48950c12Ssthen  use Encode qw(perlio_ok);            # imported upon request
b39c5158Smillert  perlio_ok("euc-jp")
b39c5158Smillert
48950c12SsthenFortunately, all encodings that come with C<Encode> core are C<PerlIO>-savvy
e9ce3842Safresh1except for C<hz> and C<ISO-2022-kr>.  For the gory details, see
b39c5158SmillertL<Encode::Encoding> and L<Encode::PerlIO>.
b39c5158Smillert
b39c5158Smillert=head1 Handling Malformed Data
b39c5158Smillert
48950c12SsthenThe optional I<CHECK> argument tells C<Encode> what to do when
48950c12Ssthenencountering malformed data.  Without I<CHECK>, C<Encode::FB_DEFAULT>
48950c12Ssthen(== 0) is assumed.
b39c5158Smillert
48950c12SsthenAs of version 2.12, C<Encode> supports coderef values for C<CHECK>;
48950c12Ssthensee below.
b39c5158Smillert
e9ce3842Safresh1B<NOTE:> Not all encodings support this feature.
e9ce3842Safresh1Some encodings ignore the I<CHECK> argument.  For example,
b39c5158SmillertL<Encode::Unicode> ignores I<CHECK> and it always croaks on error.
b39c5158Smillert
e9ce3842Safresh1=head2 List of I<CHECK> values
b39c5158Smillert
e9ce3842Safresh1=head3 FB_DEFAULT
b39c5158Smillert
*3d61058aSafresh1  CHECK = Encode::FB_DEFAULT ( == 0)
b39c5158Smillert
48950c12SsthenIf I<CHECK> is 0, encoding and decoding replace any malformed character
48950c12Ssthenwith a I<substitution character>.  When you encode, I<SUBCHAR> is used.
48950c12SsthenWhen you decode, the Unicode REPLACEMENT CHARACTER, code point U+FFFD, is
48950c12Ssthenused.  If the data is supposed to be UTF-8, an optional lexical warning of
48950c12Ssthenwarning category C<"utf8"> is given.
b39c5158Smillert
e9ce3842Safresh1=head3 FB_CROAK
e9ce3842Safresh1
*3d61058aSafresh1  CHECK = Encode::FB_CROAK ( == 1)
b39c5158Smillert
48950c12SsthenIf I<CHECK> is 1, methods immediately die with an error
48950c12Ssthenmessage.  Therefore, when I<CHECK> is 1, you should trap
48950c12Ssthenexceptions with C<eval{}>, unless you really want to let it C<die>.
b39c5158Smillert
e9ce3842Safresh1=head3 FB_QUIET
e9ce3842Safresh1
*3d61058aSafresh1  CHECK = Encode::FB_QUIET
b39c5158Smillert
48950c12SsthenIf I<CHECK> is set to C<Encode::FB_QUIET>, encoding and decoding immediately
b39c5158Smillertreturn the portion of the data that has been processed so far when an
48950c12Ssthenerror occurs. The data argument is overwritten with everything
48950c12Ssthenafter that point; that is, the unprocessed portion of the data.  This is
48950c12Ssthenhandy when you have to call C<decode> repeatedly in the case where your
b39c5158Smillertsource data may contain partial multi-byte character sequences,
48950c12Ssthen(that is, you are reading with a fixed-width buffer). Here's some sample
48950c12Ssthencode to do exactly that:
b39c5158Smillert
48950c12Ssthen    my($buffer, $string) = ("", "");
48950c12Ssthen    while (read($fh, $buffer, 256, length($buffer))) {
b39c5158Smillert        $string .= decode($encoding, $buffer, Encode::FB_QUIET);
b39c5158Smillert        # $buffer now contains the unprocessed partial character
b39c5158Smillert    }
b39c5158Smillert
e9ce3842Safresh1=head3 FB_WARN
e9ce3842Safresh1
*3d61058aSafresh1  CHECK = Encode::FB_WARN
b39c5158Smillert
48950c12SsthenThis is the same as C<FB_QUIET> above, except that instead of being silent
48950c12Ssthenon errors, it issues a warning.  This is handy for when you are debugging.
b39c5158Smillert
b46d8ef2Safresh1B<CAVEAT>: All warnings from the Encode module are reported, independently of
b46d8ef2Safresh1L<pragma warnings|warnings> settings. If you want Encode to honor the
b46d8ef2Safresh1lexical warnings configured by L<pragma warnings|warnings>, also bitwise-OR
b46d8ef2Safresh1C<Encode::ONLY_PRAGMA_WARNINGS> into I<CHECK>. This value is available
b46d8ef2Safresh1since Encode version 2.99.
b46d8ef2Safresh1
e9ce3842Safresh1=head3 FB_PERLQQ FB_HTMLCREF FB_XMLCREF
e9ce3842Safresh1
e9ce3842Safresh1=over 2
e9ce3842Safresh1
b39c5158Smillert=item perlqq mode (I<CHECK> = Encode::FB_PERLQQ)
b39c5158Smillert
b39c5158Smillert=item HTML charref mode (I<CHECK> = Encode::FB_HTMLCREF)
b39c5158Smillert
b39c5158Smillert=item XML charref mode (I<CHECK> = Encode::FB_XMLCREF)
b39c5158Smillert
e9ce3842Safresh1=back
e9ce3842Safresh1
48950c12SsthenFor encodings that are implemented by the C<Encode::XS> module, C<CHECK> C<==>
48950c12SsthenC<Encode::FB_PERLQQ> puts C<encode> and C<decode> into C<perlqq> fallback mode.
b39c5158Smillert
48950c12SsthenWhen you decode, C<\xI<HH>> is inserted for a malformed character, where
48950c12SsthenI<HH> is the hex representation of the octet that could not be decoded to
48950c12Ssthenutf8.  When you encode, C<\x{I<HHHH>}> will be inserted, where I<HHHH> is
48950c12Ssthenthe Unicode code point (in any number of hex digits) of the character that
48950c12Ssthencannot be found in the character repertoire of the encoding.
b39c5158Smillert
48950c12SsthenThe HTML/XML character reference modes are about the same. In place of
48950c12SsthenC<\x{I<HHHH>}>, HTML uses C<&#I<NNN>;> where I<NNN> is a decimal number, and
b39c5158SmillertXML uses C<&#xI<HHHH>;> where I<HHHH> is the hexadecimal number.
b39c5158Smillert
48950c12SsthenIn C<Encode> 2.10 or later, C<LEAVE_SRC> is also implied.
b39c5158Smillert
e9ce3842Safresh1=head3 The bitmask
b39c5158Smillert
48950c12SsthenThese modes are all actually set via a bitmask.  Here is how the C<FB_I<XXX>>
48950c12Ssthenconstants are laid out.  You can import the C<FB_I<XXX>> constants via
48950c12SsthenC<use Encode qw(:fallbacks)>, and you can import the generic bitmask
b39c5158Smillertconstants via C<use Encode qw(:fallback_all)>.
b39c5158Smillert
b39c5158Smillert                     FB_DEFAULT FB_CROAK FB_QUIET FB_WARN  FB_PERLQQ
b39c5158Smillert DIE_ON_ERR    0x0001             X
b39c5158Smillert WARN_ON_ERR   0x0002                               X
b39c5158Smillert RETURN_ON_ERR 0x0004                      X        X
b39c5158Smillert LEAVE_SRC     0x0008                                        X
b39c5158Smillert PERLQQ        0x0100                                        X
b39c5158Smillert HTMLCREF      0x0200
b39c5158Smillert XMLCREF       0x0400
b39c5158Smillert
e9ce3842Safresh1=head3 LEAVE_SRC
b39c5158Smillert
e9ce3842Safresh1  Encode::LEAVE_SRC
b39c5158Smillert
48950c12SsthenIf the C<Encode::LEAVE_SRC> bit is I<not> set but I<CHECK> is set, then the
e9ce3842Safresh1source string to encode() or decode() will be overwritten in place.
48950c12SsthenIf you do not want that, bitwise-OR C<Encode::LEAVE_SRC> into I<CHECK>.
b39c5158Smillert
b39c5158Smillert=head2 coderef for CHECK
b39c5158Smillert
48950c12SsthenAs of C<Encode> 2.12, C<CHECK> can also be a code reference which takes the
e5157e49Safresh1ordinal value of the unmapped character as an argument and returns
e5157e49Safresh1octets that represent the fallback character.  For instance:
b39c5158Smillert
b39c5158Smillert  $ascii = encode("ascii", $utf8, sub{ sprintf "<U+%04X>", shift });
b39c5158Smillert
48950c12SsthenActs like C<FB_PERLQQ> but U+I<XXXX> is used instead of C<\x{I<XXXX>}>.
b39c5158Smillert
9f11ffb7Safresh1The fallback for C<decode> must return a decoded string (a sequence of
9f11ffb7Safresh1characters) and takes a list of ordinal values as its arguments. So for
b8851fccSafresh1example if you wish to decode octets as UTF-8, and use ISO-8859-15 as
e5157e49Safresh1a fallback for bytes that are not valid UTF-8, you could write
e5157e49Safresh1
e5157e49Safresh1    $str = decode 'UTF-8', $octets, sub {
9f11ffb7Safresh1        my $tmp = join '', map chr, @_;
9f11ffb7Safresh1        return decode 'ISO-8859-15', $tmp;
e5157e49Safresh1    };
e5157e49Safresh1
b39c5158Smillert=head1 Defining Encodings
b39c5158Smillert
b39c5158SmillertTo define a new encoding, use:
b39c5158Smillert
b39c5158Smillert    use Encode qw(define_encoding);
48950c12Ssthen    define_encoding($object, CANONICAL_NAME [, alias...]);
b39c5158Smillert
48950c12SsthenI<CANONICAL_NAME> will be associated with I<$object>.  The object
b39c5158Smillertshould provide the interface described in L<Encode::Encoding>.
48950c12SsthenIf more than two arguments are provided, additional
48950c12Ssthenarguments are considered aliases for I<$object>.
b39c5158Smillert
48950c12SsthenSee L<Encode::Encoding> for details.
b39c5158Smillert
b39c5158Smillert=head1 The UTF8 flag
b39c5158Smillert
*3d61058aSafresh1Before the introduction of Unicode support in Perl, the C<eq> operator
b39c5158Smillertjust compared the strings represented by two scalars. Beginning with
48950c12SsthenPerl 5.8, C<eq> compares two strings with simultaneous consideration of
48950c12SsthenI<the UTF8 flag>. To explain why we made it so, I quote from page 402 of
48950c12SsthenI<Programming Perl, 3rd ed.>
b39c5158Smillert
b39c5158Smillert=over 2
b39c5158Smillert
b39c5158Smillert=item Goal #1:
b39c5158Smillert
b39c5158SmillertOld byte-oriented programs should not spontaneously break on the old
b39c5158Smillertbyte-oriented data they used to work on.
b39c5158Smillert
b39c5158Smillert=item Goal #2:
b39c5158Smillert
b39c5158SmillertOld byte-oriented programs should magically start working on the new
b39c5158Smillertcharacter-oriented data when appropriate.
b39c5158Smillert
b39c5158Smillert=item Goal #3:
b39c5158Smillert
b39c5158SmillertPrograms should run just as fast in the new character-oriented mode
b39c5158Smillertas in the old byte-oriented mode.
b39c5158Smillert
b39c5158Smillert=item Goal #4:
b39c5158Smillert
b39c5158SmillertPerl should remain one language, rather than forking into a
b39c5158Smillertbyte-oriented Perl and a character-oriented Perl.
b39c5158Smillert
b39c5158Smillert=back
b39c5158Smillert
48950c12SsthenWhen I<Programming Perl, 3rd ed.> was written, not even Perl 5.6.0 had been
48950c12Ssthenborn yet; many features documented in the book remained unimplemented for a
48950c12Ssthenlong time.  Perl 5.8 corrected much of this, and the introduction of the
48950c12SsthenUTF8 flag is one of those corrections.  You can think of there being two
48950c12Ssthenfundamentally different kinds of strings and string-operations in Perl: one
48950c12Ssthena byte-oriented mode for when the internal UTF8 flag is off, and the other a
48950c12Ssthencharacter-oriented mode for when the internal UTF8 flag is on.
b39c5158Smillert
48950c12SsthenThis UTF8 flag is not visible in Perl scripts, for exactly the same reason
48950c12Ssthenyou cannot (or rather, you I<don't have to>) see whether a scalar contains
48950c12Ssthena string, an integer, or a floating-point number.  But you can still peek
48950c12Ssthenand poke these if you will.  See the next section.
b39c5158Smillert
b39c5158Smillert=head2 Messing with Perl's Internals
b39c5158Smillert
b39c5158SmillertThe following functions use parts of Perl's internals in the current
48950c12Ssthenimplementation.  As such, they are efficient but may change in a future
48950c12Ssthenrelease.
b39c5158Smillert
e9ce3842Safresh1=head3 is_utf8
b39c5158Smillert
e9ce3842Safresh1  is_utf8(STRING [, CHECK])
b39c5158Smillert
48950c12Ssthen[INTERNAL] Tests whether the UTF8 flag is turned on in the I<STRING>.
48950c12SsthenIf I<CHECK> is true, also checks whether I<STRING> contains well-formed
b39c5158SmillertUTF-8.  Returns true if successful, false otherwise.
b39c5158Smillert
9f11ffb7Safresh1Typically only necessary for debugging and testing.  Don't use this flag as
9f11ffb7Safresh1a marker to distinguish character and binary data; that should be decided
9f11ffb7Safresh1for each variable when you write your code.
9f11ffb7Safresh1
9f11ffb7Safresh1B<CAVEAT>: If I<STRING> has the UTF8 flag set, it does B<NOT> mean that
9f11ffb7Safresh1I<STRING> is UTF-8 encoded and vice-versa.
9f11ffb7Safresh1
48950c12SsthenAs of Perl 5.8.1, L<utf8> also has the C<utf8::is_utf8> function.
b39c5158Smillert
e9ce3842Safresh1=head3 _utf8_on
e9ce3842Safresh1
e9ce3842Safresh1  _utf8_on(STRING)
b39c5158Smillert
48950c12Ssthen[INTERNAL] Turns the I<STRING>'s internal UTF8 flag B<on>.  The I<STRING>
48950c12Ssthenis I<not> checked for containing only well-formed UTF-8.  Do not use this
48950c12Ssthenunless you I<know with absolute certainty> that the STRING holds only
48950c12Ssthenwell-formed UTF-8.  Returns the previous state of the UTF8 flag (so please
48950c12Ssthendon't treat the return value as indicating success or failure), or C<undef>
48950c12Ssthenif I<STRING> is not a string.
b39c5158Smillert
48950c12SsthenB<NOTE>: For security reasons, this function does not work on tainted values.
b39c5158Smillert
e9ce3842Safresh1=head3 _utf8_off
e9ce3842Safresh1
e9ce3842Safresh1  _utf8_off(STRING)
b39c5158Smillert
48950c12Ssthen[INTERNAL] Turns the I<STRING>'s internal UTF8 flag B<off>.  Do not use
48950c12Ssthenfrivolously.  Returns the previous state of the UTF8 flag, or C<undef> if
48950c12SsthenI<STRING> is not a string.  Do not treat the return value as indicative of
48950c12Ssthensuccess or failure, because that isn't what it means: it is only the
48950c12Ssthenprevious setting.
b39c5158Smillert
48950c12SsthenB<NOTE>: For security reasons, this function does not work on tainted values.
b39c5158Smillert
b39c5158Smillert=head1 UTF-8 vs. utf8 vs. UTF8
b39c5158Smillert
b39c5158Smillert  ....We now view strings not as sequences of bytes, but as sequences
b39c5158Smillert  of numbers in the range 0 .. 2**32-1 (or in the case of 64-bit
b39c5158Smillert  computers, 0 .. 2**64-1) -- Programming Perl, 3rd ed.
b39c5158Smillert
48950c12SsthenThat has historically been Perl's notion of UTF-8, as that is how UTF-8 was
48950c12Ssthenfirst conceived by Ken Thompson when he invented it. However, thanks to
48950c12Ssthenlater revisions to the applicable standards, official UTF-8 is now rather
48950c12Ssthenstricter than that. For example, its range is much narrower (0 .. 0x10_FFFF
48950c12Ssthento cover only 21 bits instead of 32 or 64 bits) and some sequences
48950c12Ssthenare not allowed, like those used in surrogate pairs, the 32 non-character
48950c12Ssthencode points 0xFDD0 .. 0xFDEF, the last two code points in I<any> plane
48950c12Ssthen(0xI<XX>_FFFE and 0xI<XX>_FFFF), all non-shortest encodings, etc.
b39c5158Smillert
48950c12SsthenThe former default in which Perl would always use a loose interpretation of
48950c12SsthenUTF-8 has now been overruled:
b39c5158Smillert
b39c5158Smillert  From: Larry Wall <larry@wall.org>
b39c5158Smillert  Date: December 04, 2004 11:51:58 JST
b39c5158Smillert  To: perl-unicode@perl.org
b39c5158Smillert  Subject: Re: Make Encode.pm support the real UTF-8
b39c5158Smillert  Message-Id: <20041204025158.GA28754@wall.org>
b39c5158Smillert
b39c5158Smillert  On Fri, Dec 03, 2004 at 10:12:12PM +0000, Tim Bunce wrote:
b39c5158Smillert  : I've no problem with 'utf8' being perl's unrestricted uft8 encoding,
b39c5158Smillert  : but "UTF-8" is the name of the standard and should give the
b39c5158Smillert  : corresponding behaviour.
b39c5158Smillert
b39c5158Smillert  For what it's worth, that's how I've always kept them straight in my
b39c5158Smillert  head.
b39c5158Smillert
b39c5158Smillert  Also for what it's worth, Perl 6 will mostly default to strict but
b39c5158Smillert  make it easy to switch back to lax.
b39c5158Smillert
b39c5158Smillert  Larry
b39c5158Smillert
48950c12SsthenGot that?  As of Perl 5.8.7, B<"UTF-8"> means UTF-8 in its current
48950c12Ssthensense, which is conservative and strict and security-conscious, whereas
48950c12SsthenB<"utf8"> means UTF-8 in its former sense, which was liberal and loose and
48950c12Ssthenlax.  C<Encode> version 2.10 or later thus groks this subtle but critically
48950c12Ssthenimportant distinction between C<"UTF-8"> and C<"utf8">.
b39c5158Smillert
b39c5158Smillert  encode("utf8",  "\x{FFFF_FFFF}", 1); # okay
b39c5158Smillert  encode("UTF-8", "\x{FFFF_FFFF}", 1); # croaks
b39c5158Smillert
eac174f2Safresh1This distinction is also important for decoding. In the following,
eac174f2Safresh1C<$s> stores character U+200000, which exceeds UTF-8's allowed range.
eac174f2Safresh1C<$s> thus stores an invalid Unicode code point:
eac174f2Safresh1
eac174f2Safresh1  $s = decode("utf8", "\xf8\x88\x80\x80\x80");
eac174f2Safresh1
eac174f2Safresh1C<"UTF-8">, by contrast, will either coerce the input to something valid:
eac174f2Safresh1
eac174f2Safresh1    $s = decode("UTF-8", "\xf8\x88\x80\x80\x80"); # U+FFFD
eac174f2Safresh1
eac174f2Safresh1.. or croak:
eac174f2Safresh1
eac174f2Safresh1    decode("UTF-8", "\xf8\x88\x80\x80\x80", FB_CROAK|LEAVE_SRC);
eac174f2Safresh1
48950c12SsthenIn the C<Encode> module, C<"UTF-8"> is actually a canonical name for
48950c12SsthenC<"utf-8-strict">.  That hyphen between the C<"UTF"> and the C<"8"> is
48950c12Ssthencritical; without it, C<Encode> goes "liberal" and (perhaps overly-)permissive:
b39c5158Smillert
b39c5158Smillert  find_encoding("UTF-8")->name # is 'utf-8-strict'
b39c5158Smillert  find_encoding("utf-8")->name # ditto. names are case insensitive
b39c5158Smillert  find_encoding("utf_8")->name # ditto. "_" are treated as "-"
b39c5158Smillert  find_encoding("UTF8")->name  # is 'utf8'.
b39c5158Smillert
48950c12SsthenPerl's internal UTF8 flag is called "UTF8", without a hyphen. It indicates
48950c12Ssthenwhether a string is internally encoded as "utf8", also without a hyphen.
b39c5158Smillert
b39c5158Smillert=head1 SEE ALSO
b39c5158Smillert
b39c5158SmillertL<Encode::Encoding>,
b39c5158SmillertL<Encode::Supported>,
b39c5158SmillertL<Encode::PerlIO>,
b39c5158SmillertL<encoding>,
b39c5158SmillertL<perlebcdic>,
b39c5158SmillertL<perlfunc/open>,
b39c5158SmillertL<perlunicode>, L<perluniintro>, L<perlunifaq>, L<perlunitut>,
b39c5158SmillertL<utf8>,
e9ce3842Safresh1the Perl Unicode Mailing List L<http://lists.perl.org/list/perl-unicode.html>
b39c5158Smillert
b39c5158Smillert=head1 MAINTAINER
b39c5158Smillert
48950c12SsthenThis project was originated by the late Nick Ing-Simmons and later
e9ce3842Safresh1maintained by Dan Kogai I<< <dankogai@cpan.org> >>.  See AUTHORS
48950c12Ssthenfor a full list of people involved.  For any questions, send mail to
48950c12SsthenI<< <perl-unicode@perl.org> >> so that we can all share.
b39c5158Smillert
48950c12SsthenWhile Dan Kogai retains the copyright as a maintainer, credit
48950c12Ssthenshould go to all those involved.  See AUTHORS for a list of those
48950c12Ssthenwho submitted code to the project.
b39c5158Smillert
b39c5158Smillert=head1 COPYRIGHT
b39c5158Smillert
b8851fccSafresh1Copyright 2002-2014 Dan Kogai I<< <dankogai@cpan.org> >>.
b39c5158Smillert
b39c5158SmillertThis library is free software; you can redistribute it and/or modify
b39c5158Smillertit under the same terms as Perl itself.
b39c5158Smillert
b39c5158Smillert=cut