#
# $Id: Encode.pm,v 3.21 2024/02/25 22:17:32 dankogai Exp $
#
package Encode;
use strict;
use warnings;
use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG};
our $VERSION;
BEGIN {
    $VERSION = sprintf "%d.%02d", q$Revision: 3.21 $ =~ /(\d+)/g;
    require XSLoader;
    XSLoader::load( __PACKAGE__, $VERSION );
}

use Exporter 5.57 'import';

use Carp ();
our @CARP_NOT = qw(Encode::Encoder);

# Public, encouraged API is exported by default

our @EXPORT = qw(
  decode  decode_utf8  encode  encode_utf8 str2bytes bytes2str
  encodings  find_encoding find_mime_encoding clone_encoding
);
our @FB_FLAGS = qw(
  DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC
  PERLQQ HTMLCREF XMLCREF STOP_AT_PARTIAL
);
our @FB_CONSTS = qw(
  FB_DEFAULT FB_CROAK FB_QUIET FB_WARN
  FB_PERLQQ FB_HTMLCREF FB_XMLCREF
);
our @EXPORT_OK = (
    qw(
      _utf8_off _utf8_on define_encoding from_to is_16bit is_8bit
      is_utf8 perlio_ok resolve_alias utf8_downgrade utf8_upgrade
      ),
    @FB_FLAGS, @FB_CONSTS,
);

our %EXPORT_TAGS = (
    all          => [ @EXPORT,    @EXPORT_OK ],
    default      => [ @EXPORT ],
    fallbacks    => [ @FB_CONSTS ],
    fallback_all => [ @FB_CONSTS, @FB_FLAGS ],
);

# Documentation moved after __END__ for speed - NI-S

our $ON_EBCDIC = ( ord("A") == 193 );

use Encode::Alias ();
use Encode::MIME::Name;

use Storable;

# Make a %Encoding package variable to allow a certain amount of cheating
our %Encoding;
our %ExtModule;
require Encode::Config;
#  See
#  https://bugzilla.redhat.com/show_bug.cgi?id=435505#c2
#  to find why sig handlers inside eval{} are disabled.
eval {
    local $SIG{__DIE__};
    local $SIG{__WARN__};
    # Never pick up a ConfigLocal from the current directory.
    local @INC = @INC;
    pop @INC if @INC && $INC[-1] eq '.';
    require Encode::ConfigLocal;
};

# encodings( [CLASS,] [":all" | MODULE ...] )
#
# Returns the sorted (case-insensitive) list of known encoding names.
# With ":all" as the second argument every name in %Encoding and
# %ExtModule is listed; otherwise the names in %Encoding plus those
# %ExtModule entries whose module matches one of the arguments
# ("Encode::" is prepended to arguments that contain no "::").
# The pseudo-encodings Internal, Unicode and Guess are never listed.
sub encodings {
    my %enc;
    my $arg = $_[1] || '';
    if ( $arg eq ":all" ) {
        %enc = ( %Encoding, %ExtModule );
    }
    else {
        %enc = %Encoding;
        for my $mod ( map { m/::/ ? $_ : "Encode::$_" } @_ ) {
            DEBUG and warn $mod;
            for my $enc ( keys %ExtModule ) {
                $ExtModule{$enc} eq $mod and $enc{$enc} = $mod;
            }
        }
    }
    return sort { lc $a cmp lc $b }
      grep      { !/^(?:Internal|Unicode|Guess)$/o } keys %enc;
}

# perlio_ok( ENCODING_OR_OBJECT )
#
# Returns whatever the encoding object's own perlio_ok() method returns,
# or 0 when the object has no such method.  An encoding name that cannot
# be resolved also yields 0 instead of dying on an undefined invocant.
sub perlio_ok {
    my $obj = ref( $_[0] ) ? $_[0] : find_encoding( $_[0] );
    defined $obj && $obj->can("perlio_ok") and return $obj->perlio_ok();
    return 0;    # safety net
}

# define_encoding( OBJECT, NAME [, ALIAS ...] )
#
# Registers OBJECT in %Encoding under NAME, aliases the lower-cased NAME
# to it when that differs, registers every extra ALIAS, and adds the
# object's class to the @CARP_NOT lists of Encode and Encode::Encoding.
# Returns OBJECT.
sub define_encoding {
    my $obj  = shift;
    my $name = shift;
    $Encoding{$name} = $obj;
    my $lc = lc($name);
    define_alias( $lc => $obj ) unless $lc eq $name;
    while (@_) {
        my $alias = shift;
        define_alias( $alias, $obj );
    }
    my $class = ref($obj);
    push @Encode::CARP_NOT, $class unless grep { $_ eq $class } @Encode::CARP_NOT;
    push @Encode::Encoding::CARP_NOT, $class unless grep { $_ eq $class } @Encode::Encoding::CARP_NOT;
    return $obj;
}

# CLASS->getEncoding( NAME [, SKIP_EXTERNAL] )
#
# Resolves NAME to an encoding object.  Lookup order: NAME itself when it
# is already an object that can('renew'); %Encoding by exact and then
# lower-cased name; the alias table by exact and then lower-cased name;
# finally, unless SKIP_EXTERNAL is true, the module listed for the name
# in %ExtModule is loaded and %Encoding is consulted again.
# Returns nothing (undef / empty list) when NAME is undefined or unknown.
sub getEncoding {
    my ( $class, $name, $skip_external ) = @_;

    defined($name) or return;

    # Hand encoding objects straight back; this is tested before the
    # whitespace strip so that s/// is never applied to a reference.
    ref($name) && $name->can('renew') and return $name;

    $name =~ s/\s+//g;    # https://rt.cpan.org/Ticket/Display.html?id=65796

    exists $Encoding{$name} and return $Encoding{$name};
    my $lc = lc $name;
    exists $Encoding{$lc} and return $Encoding{$lc};

    my $oc = $class->find_alias($name);
    defined($oc) and return $oc;
    $lc ne $name and $oc = $class->find_alias($lc);
    defined($oc) and return $oc;

    unless ($skip_external) {
        if ( my $mod = $ExtModule{$name} || $ExtModule{$lc} ) {
            $mod =~ s,::,/,g;
            $mod .= '.pm';

            # Best effort: a module that fails to load simply leaves
            # the encoding undefined.
            eval { require $mod; };
            exists $Encoding{$name} and return $Encoding{$name};

            # Mirror the lower-cased lookup done before the load.
            exists $Encoding{$lc} and return $Encoding{$lc};
        }
    }
    return;
}

# HACK: These two functions must be defined in Encode and because of
# cyclic dependency between Encode and Encode::Alias, Exporter does not work
sub find_alias {
    goto &Encode::Alias::find_alias;
}
sub define_alias {
    goto &Encode::Alias::define_alias;
}

# find_encoding( NAME [, SKIP_EXTERNAL] )
#
# Functional front end to Encode->getEncoding().
sub find_encoding($;$) {
    my ( $name, $skip_external ) = @_;
    return __PACKAGE__->getEncoding( $name, $skip_external );
}

# find_mime_encoding( MIME_NAME [, SKIP_EXTERNAL] )
#
# Maps MIME_NAME to an Encode name through Encode::MIME::Name and then
# resolves that name with find_encoding().
sub find_mime_encoding($;$) {
    my ( $mime_name, $skip_external ) = @_;
    my $name = Encode::MIME::Name::get_encode_name( $mime_name );
    return find_encoding( $name, $skip_external );
}

# resolve_alias( NAME )
#
# Returns the name() of the encoding NAME resolves to, or nothing when
# NAME is unknown.
sub resolve_alias($) {
    my $obj = find_encoding(shift);
    defined $obj and return $obj->name;
    return;
}

# clone_encoding( NAME )
#
# Returns a deep copy (Storable::dclone) of the encoding object NAME
# resolves to, or nothing when NAME is unknown.
sub clone_encoding($) {
    my $obj = find_encoding(shift);
    ref $obj or return;
    return Storable::dclone($obj);
}

# Not defined in this file; presumably supplied by the XS object loaded
# in the BEGIN block above.
onBOOT;

if ($ON_EBCDIC) {
    package Encode::UTF_EBCDIC;
    use parent 'Encode::Encoding';
    my $obj = bless { Name => "UTF_EBCDIC" } => "Encode::UTF_EBCDIC";
    Encode::define_encoding($obj, 'Unicode');

    # Maps every character of the input through utf8::unicode_to_native.
    # When CHECK is true the caller's source string is emptied.
    sub decode {
        my ( undef, $str, $chk ) = @_;
        my $res = '';
        for ( my $i = 0 ; $i < length($str) ; $i++ ) {
            $res .=
              chr(
                utf8::unicode_to_native( ord( substr( $str, $i, 1 ) ) )
              );
        }
        $_[1] = '' if $chk;
        return $res;
    }

    # Maps every character of the input through utf8::native_to_unicode.
    # When CHECK is true the caller's source string is emptied.
    sub encode {
        my ( undef, $str, $chk ) = @_;
        my $res = '';
        for ( my $i = 0 ; $i < length($str) ; $i++ ) {
            $res .=
              chr(
                utf8::native_to_unicode( ord( substr( $str, $i, 1 ) ) )
              );
        }
        $_[1] = '' if $chk;
        return $res;
    }
}

{
    # https://rt.cpan.org/Public/Bug/Display.html?id=103253
    package Encode::XS;
    use parent 'Encode::Encoding';
}

{
    package Encode::utf8;
    use parent 'Encode::Encoding';
    my %obj = (
        'utf8'         => { Name => 'utf8' },
        'utf-8-strict' => { Name => 'utf-8-strict', strict_utf8 => 1 }
    );
    for ( keys %obj ) {
        bless $obj{$_} => __PACKAGE__;
        Encode::define_encoding( $obj{$_} => $_ );
    }

    # cat_decode( $obj, $dst, $src, $pos, $trm, $chk )
    #
    # Appends to $dst the bytes of $src from offset $pos up to and
    # including the first occurrence of $trm, and advances $pos past it;
    # returns 1.  When $trm is not found, appends the rest of $src, sets
    # $pos to length($src) and returns ''.  $dst and $pos are modified in
    # the caller through the @_ aliases.
    sub cat_decode {
        # ($obj, $dst, $src, $pos, $trm, $chk)
        # currently ignores $chk
        my ( undef, undef, undef, $pos, $trm ) = @_;
        my ( $rdst, $rsrc, $rpos ) = \@_[ 1, 2, 3 ];
        use bytes;
        if ( ( my $npos = index( $$rsrc, $trm, $pos ) ) >= 0 ) {
            $$rdst .=
              substr( $$rsrc, $pos, $npos - $pos + length($trm) );
            $$rpos = $npos + length($trm);
            return 1;
        }
        $$rdst .= substr( $$rsrc, $pos );
        $$rpos = length($$rsrc);
        return '';
    }
}

1;

__END__

=head1 NAME

Encode - character encodings in Perl

=head1 SYNOPSIS

    use Encode qw(decode encode);
    $characters = decode('UTF-8', $octets,     Encode::FB_CROAK);
    $octets     = encode('UTF-8', $characters, Encode::FB_CROAK);

=head2 Table of Contents

Encode consists of a collection of modules whose details are too extensive
to fit in one document.  This one itself explains the top-level APIs
and general topics at a glance.
For other topics and more details,
see the documentation for these modules:

=over 2

=item L<Encode::Alias> - Alias definitions to encodings

=item L<Encode::Encoding> - Encode Implementation Base Class

=item L<Encode::Supported> - List of Supported Encodings

=item L<Encode::CN> - Simplified Chinese Encodings

=item L<Encode::JP> - Japanese Encodings

=item L<Encode::KR> - Korean Encodings

=item L<Encode::TW> - Traditional Chinese Encodings

=back

=head1 DESCRIPTION

The C<Encode> module provides the interface between Perl strings
and the rest of the system.  Perl strings are sequences of
I<characters>.

The repertoire of characters that Perl can represent is a superset of those
defined by the Unicode Consortium. On most platforms the ordinal
value of a character as returned by C<ord(I<S>)> is the I<Unicode
codepoint> for that character. The exceptions are platforms where
the legacy encoding is some variant of EBCDIC rather than a superset
of ASCII; see L<perlebcdic>.

During recent history, data is moved around a computer in 8-bit chunks,
often called "bytes" but also known as "octets" in standards documents.
Perl is widely used to manipulate data of many types: not only strings of
characters representing human or computer languages, but also "binary"
data, being the machine's representation of numbers, pixels in an image, or
just about anything.

When Perl is processing "binary data", the programmer wants Perl to
process "sequences of bytes". This is not a problem for Perl: because a
byte has 256 possible values, it easily fits in Perl's much larger
"logical character".

This document mostly explains the I<how>. L<perlunitut> and L<perlunifaq>
explain the I<why>.

=head2 TERMINOLOGY

=head3 character

A character in the range 0 .. 2**32-1 (or more);
what Perl's strings are made of.

=head3 byte

A character in the range 0..255;
a special case of a Perl character.

=head3 octet

8 bits of data, with ordinal values 0..255;
term for bytes passed to or from a non-Perl context, such as a disk file,
standard I/O stream, database, command-line argument, environment variable,
socket etc.

=head1 THE PERL ENCODING API

=head2 Basic methods

=head3 encode

  $octets  = encode(ENCODING, STRING[, CHECK])

Encodes the scalar value I<STRING> from Perl's internal form into
I<ENCODING> and returns a sequence of octets.
I<ENCODING> can be either a
canonical name or an alias.  For encoding names and aliases, see
L</"Defining Aliases">.  For CHECK, see L</"Handling Malformed Data">.

B<CAVEAT>: the input scalar I<STRING> might be modified in-place depending
on what is set in CHECK. See L</LEAVE_SRC> if you want your inputs to be
left unchanged.

For example, to convert a string from Perl's internal format into
ISO-8859-1, also known as Latin1:

  $octets = encode("iso-8859-1", $string);

B<CAVEAT>: When you run C<$octets = encode("UTF-8", $string)>, then
$octets I<might not be equal to> $string.  Though both contain the
same data, the UTF8 flag for $octets is I<always> off.  When you
encode anything, the UTF8 flag on the result is always off, even when it
contains a completely valid UTF-8 string. See L</"The UTF8 flag"> below.

If the $string is C<undef>, then C<undef> is returned.

C<str2bytes> may be used as an alias for C<encode>.

=head3 decode

  $string = decode(ENCODING, OCTETS[, CHECK])

This function returns the string that results from decoding the scalar
value I<OCTETS>, assumed to be a sequence of octets in I<ENCODING>, into
Perl's internal form.  As with encode(),
I<ENCODING> can be either a canonical name or an alias. For encoding names
and aliases, see L</"Defining Aliases">; for I<CHECK>, see L</"Handling
Malformed Data">.

B<CAVEAT>: the input scalar I<OCTETS> might be modified in-place depending
on what is set in CHECK. See L</LEAVE_SRC> if you want your inputs to be
left unchanged.

For example, to convert ISO-8859-1 data into a string in Perl's
internal format:

  $string = decode("iso-8859-1", $octets);

B<CAVEAT>: When you run C<$string = decode("UTF-8", $octets)>, then $string
I<might not be equal to> $octets.  Though both contain the same data, the
UTF8 flag for $string is on.  See L</"The UTF8 flag">
below.

If the $octets is C<undef>, then C<undef> is returned.

C<bytes2str> may be used as an alias for C<decode>.

=head3 find_encoding

  [$obj =] find_encoding(ENCODING)

Returns the I<encoding object> corresponding to I<ENCODING>.  Returns
C<undef> if no matching I<ENCODING> is found.  The returned object is
what does the actual encoding or decoding.

  $string = decode($name, $bytes);

is in fact

    $string = do {
        $obj = find_encoding($name);
        croak qq(encoding "$name" not found) unless ref $obj;
        $obj->decode($bytes);
    };

with more error checking.

You can therefore save time by reusing this object as follows:

    my $enc = find_encoding("iso-8859-1");
    while(<>) {
        my $string = $enc->decode($_);
        ... # now do something with $string;
    }

Besides L</decode> and L</encode>, other methods are
available as well.  For instance, C<name()> returns the canonical
name of the encoding object.

  find_encoding("latin1")->name; # iso-8859-1

See L<Encode::Encoding> for details.

=head3 find_mime_encoding

  [$obj =] find_mime_encoding(MIME_ENCODING)

Returns the I<encoding object> corresponding to I<MIME_ENCODING>.  Acts
the same as C<find_encoding()>, but the C<mime_name()> of the returned
object must match I<MIME_ENCODING>.  So, as opposed to C<find_encoding()>,
canonical names and aliases are not used when searching for the object.

    find_mime_encoding("utf8"); # returns undef because "utf8" is not a valid MIME_ENCODING
    find_mime_encoding("utf-8"); # returns encode object "utf-8-strict"
    find_mime_encoding("UTF-8"); # same as "utf-8" because MIME_ENCODING is case insensitive
    find_mime_encoding("utf-8-strict"); # returns undef because "utf-8-strict" is not a valid MIME_ENCODING

=head3 from_to

  [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])

Converts I<in-place> data between two encodings.
The data in $octets
must be encoded as octets and I<not> as characters in Perl's internal
format. For example, to convert ISO-8859-1 data into Microsoft's CP1250
encoding:

  from_to($octets, "iso-8859-1", "cp1250");

and to convert it back:

  from_to($octets, "cp1250", "iso-8859-1");

Because the conversion happens in place, the data to be
converted cannot be a string constant: it must be a scalar variable.

C<from_to()> returns the length of the converted string in octets on success,
and C<undef> on error.

B<CAVEAT>: The following operations may look the same, but are not:

  from_to($data, "iso-8859-1", "UTF-8"); #1
  $data = decode("iso-8859-1", $data);  #2

Both #1 and #2 make $data consist of a completely valid UTF-8 string,
but only #2 turns the UTF8 flag on.  #1 is equivalent to:

  $data = encode("UTF-8", decode("iso-8859-1", $data));

See L</"The UTF8 flag"> below.

Also note that:

  from_to($octets, $from, $to, $check);

is equivalent to:

  $octets = encode($to, decode($from, $octets), $check);

Yes, it does I<not> respect the $check during decoding.  It is
deliberately done that way.
If you need minute control, use C<decode>
followed by C<encode> as follows:

  $octets = encode($to, decode($from, $octets, $check_from), $check_to);

=head3 encode_utf8

  $octets = encode_utf8($string);

B<WARNING>: L<This function can produce invalid UTF-8!|/UTF-8 vs. utf8 vs. UTF8>
Do not use it for data exchange.
Unless you want Perl's older "lax" mode, prefer
C<$octets = encode("UTF-8", $string)>.

Equivalent to C<$octets = encode("utf8", $string)>.  The characters in
$string are encoded in Perl's internal format, and the result is returned
as a sequence of octets.  Because all possible characters in Perl have a
(loose, not strict) utf8 representation, this function cannot fail.

=head3 decode_utf8

  $string = decode_utf8($octets [, CHECK]);

B<WARNING>: L<This function accepts invalid UTF-8!|/UTF-8 vs. utf8 vs. UTF8>
Do not use it for data exchange.
Unless you want Perl's older "lax" mode, prefer
C<$string = decode("UTF-8", $octets [, CHECK])>.

Equivalent to C<$string = decode("utf8", $octets [, CHECK])>.
The sequence of octets represented by $octets is decoded
from (loose, not strict) utf8 into a sequence of logical characters.
Because not all sequences of octets are valid (loose, not strict) utf8,
it is quite possible for this function to fail.
For CHECK, see L</"Handling Malformed Data">.

B<CAVEAT>: the input I<$octets> might be modified in-place depending on
what is set in CHECK. See L</LEAVE_SRC> if you want your inputs to be
left unchanged.

=head2 Listing available encodings

  use Encode;
  @list = Encode->encodings();

Returns a list of canonical names of available encodings that have already
been loaded.  To get a list of all available encodings including those that
have not yet been loaded, say:

  @all_encodings = Encode->encodings(":all");

Or you can give the name of a specific module:

  @with_jp = Encode->encodings("Encode::JP");

When "C<::>" is not in the name, "C<Encode::>" is assumed.

  @ebcdic = Encode->encodings("EBCDIC");

To find out in detail which encodings are supported by this package,
see L<Encode::Supported>.

=head2 Defining Aliases

To add a new alias to a given encoding, use:

  use Encode;
  use Encode::Alias;
  define_alias(NEWNAME => ENCODING);

After that, I<NEWNAME> can be used as an alias for I<ENCODING>.
I<ENCODING> may be either the name of an encoding or an
I<encoding object>.

Before you do that, first make sure the alias is nonexistent using
C<resolve_alias()>, which returns the canonical name thereof.
For example:

  Encode::resolve_alias("latin1") eq "iso-8859-1" # true
  Encode::resolve_alias("iso-8859-12")   # false; nonexistent
  Encode::resolve_alias($name) eq $name  # true if $name is canonical

C<resolve_alias()> does not need C<use Encode::Alias>; it can be
imported via C<use Encode qw(resolve_alias)>.

See L<Encode::Alias> for details.

=head2 Finding IANA Character Set Registry names

The canonical name of a given encoding does not necessarily agree with
the IANA Character Set Registry name, commonly seen as C<< Content-Type:
text/plain; charset=I<WHATEVER> >>.  For most cases, the canonical name
works, but sometimes it does not, most notably with "utf-8-strict".

As of C<Encode> version 2.21, a new method C<mime_name()> was therefore added.

  use Encode;
  my $enc = find_encoding("UTF-8");
  warn $enc->name;      # utf-8-strict
  warn $enc->mime_name; # UTF-8

See also:  L<Encode::Encoding>

=head1 Encoding via PerlIO

If your perl supports C<PerlIO> (which is the default), you can use a
C<PerlIO> layer to decode and encode directly via a filehandle.
The
following two examples are fully identical in functionality:

  ### Version 1 via PerlIO
    open(INPUT,  "< :encoding(shiftjis)", $infile)
        || die "Can't open < $infile for reading: $!";
    open(OUTPUT, "> :encoding(euc-jp)",  $outfile)
        || die "Can't open > $outfile for writing: $!";
    while (<INPUT>) {   # auto decodes $_
        print OUTPUT;   # auto encodes $_
    }
    close(INPUT)   || die "can't close $infile: $!";
    close(OUTPUT)  || die "can't close $outfile: $!";

  ### Version 2 via from_to()
    open(INPUT,  "< :raw", $infile)
        || die "Can't open < $infile for reading: $!";
    open(OUTPUT, "> :raw",  $outfile)
        || die "Can't open > $outfile for writing: $!";

    while (<INPUT>) {
        from_to($_, "shiftjis", "euc-jp", 1);  # switch encoding
        print OUTPUT;   # emit raw (but properly encoded) data
    }
    close(INPUT)   || die "can't close $infile: $!";
    close(OUTPUT)  || die "can't close $outfile: $!";

In the first version above, you let the appropriate encoding layer
handle the conversion.  In the second, you explicitly translate
from one encoding to the other.

Unfortunately, it may be that encodings are not C<PerlIO>-savvy.
You can check
to see whether your encoding is supported by C<PerlIO> by invoking the
C<perlio_ok> method on it:

  Encode::perlio_ok("hz");             # false
  find_encoding("euc-cn")->perlio_ok;  # true wherever PerlIO is available

  use Encode qw(perlio_ok);            # imported upon request
  perlio_ok("euc-jp")

Fortunately, all encodings that come with C<Encode> core are C<PerlIO>-savvy
except for C<hz> and C<ISO-2022-kr>.  For the gory details, see
L<Encode::Encoding> and L<Encode::PerlIO>.

=head1 Handling Malformed Data

The optional I<CHECK> argument tells C<Encode> what to do when
encountering malformed data.  Without I<CHECK>, C<Encode::FB_DEFAULT>
(== 0) is assumed.

As of version 2.12, C<Encode> supports coderef values for C<CHECK>;
see below.

B<NOTE:> Not all encodings support this feature.
Some encodings ignore the I<CHECK> argument.  For example,
L<Encode::Unicode> ignores I<CHECK> and it always croaks on error.

=head2 List of I<CHECK> values

=head3 FB_DEFAULT

  CHECK = Encode::FB_DEFAULT ( == 0)

If I<CHECK> is 0, encoding and decoding replace any malformed character
with a I<substitution character>.  When you encode, I<SUBCHAR> is used.
When you decode, the Unicode REPLACEMENT CHARACTER, code point U+FFFD, is
used.
If the data is supposed to be UTF-8, an optional lexical warning of
warning category C<"utf8"> is given.

=head3 FB_CROAK

  CHECK = Encode::FB_CROAK ( == 1)

If I<CHECK> is 1, methods immediately die with an error
message.  Therefore, when I<CHECK> is 1, you should trap
exceptions with C<eval{}>, unless you really want to let it C<die>.

=head3 FB_QUIET

  CHECK = Encode::FB_QUIET

If I<CHECK> is set to C<Encode::FB_QUIET>, encoding and decoding immediately
return the portion of the data that has been processed so far when an
error occurs. The data argument is overwritten with everything
after that point; that is, the unprocessed portion of the data.  This is
handy when you have to call C<decode> repeatedly in the case where your
source data may contain partial multi-byte character sequences
(that is, you are reading with a fixed-width buffer). Here's some sample
code to do exactly that:

    my($buffer, $string) = ("", "");
    while (read($fh, $buffer, 256, length($buffer))) {
        $string .= decode($encoding, $buffer, Encode::FB_QUIET);
        # $buffer now contains the unprocessed partial character
    }

=head3 FB_WARN

  CHECK = Encode::FB_WARN

This is the same as C<FB_QUIET> above, except that instead of being silent
on errors, it issues a warning.  This is handy for when you are debugging.
689b39c5158Smillert 690b46d8ef2Safresh1B<CAVEAT>: All warnings from the Encode module are reported, independently of 691b46d8ef2Safresh1L<pragma warnings|warnings> settings. If you want to follow the settings of 692b46d8ef2Safresh1lexical warnings configured by L<pragma warnings|warnings>, then also append 693b46d8ef2Safresh1the check value C<Encode::ONLY_PRAGMA_WARNINGS>. This value is available 694b46d8ef2Safresh1since Encode version 2.99. 695b46d8ef2Safresh1 696e9ce3842Safresh1=head3 FB_PERLQQ FB_HTMLCREF FB_XMLCREF 697e9ce3842Safresh1 698e9ce3842Safresh1=over 2 699e9ce3842Safresh1 700b39c5158Smillert=item perlqq mode (I<CHECK> = Encode::FB_PERLQQ) 701b39c5158Smillert 702b39c5158Smillert=item HTML charref mode (I<CHECK> = Encode::FB_HTMLCREF) 703b39c5158Smillert 704b39c5158Smillert=item XML charref mode (I<CHECK> = Encode::FB_XMLCREF) 705b39c5158Smillert 706e9ce3842Safresh1=back 707e9ce3842Safresh1 70848950c12SsthenFor encodings that are implemented by the C<Encode::XS> module, C<CHECK> C<==> 70948950c12SsthenC<Encode::FB_PERLQQ> puts C<encode> and C<decode> into C<perlqq> fallback mode. 710b39c5158Smillert 71148950c12SsthenWhen you decode, C<\xI<HH>> is inserted for a malformed character, where 71248950c12SsthenI<HH> is the hex representation of the octet that could not be decoded to 71348950c12Ssthenutf8. When you encode, C<\x{I<HHHH>}> will be inserted, where I<HHHH> is 71448950c12Ssthenthe Unicode code point (in any number of hex digits) of the character that 71548950c12Ssthencannot be found in the character repertoire of the encoding. 716b39c5158Smillert 71748950c12SsthenThe HTML/XML character reference modes are about the same. In place of 71848950c12SsthenC<\x{I<HHHH>}>, HTML uses C<&#I<NNN>;> where I<NNN> is a decimal number, and 719b39c5158SmillertXML uses C<&#xI<HHHH>;> where I<HHHH> is the hexadecimal number. 720b39c5158Smillert 72148950c12SsthenIn C<Encode> 2.10 or later, C<LEAVE_SRC> is also implied.
722b39c5158Smillert 723e9ce3842Safresh1=head3 The bitmask 724b39c5158Smillert 72548950c12SsthenThese modes are all actually set via a bitmask. Here is how the C<FB_I<XXX>> 72648950c12Ssthenconstants are laid out. You can import the C<FB_I<XXX>> constants via 72748950c12SsthenC<use Encode qw(:fallbacks)>, and you can import the generic bitmask 728b39c5158Smillertconstants via C<use Encode qw(:fallback_all)>. 729b39c5158Smillert 730b39c5158Smillert FB_DEFAULT FB_CROAK FB_QUIET FB_WARN FB_PERLQQ 731b39c5158Smillert DIE_ON_ERR 0x0001 X 732b39c5158Smillert WARN_ON_ERR 0x0002 X 733b39c5158Smillert RETURN_ON_ERR 0x0004 X X 734b39c5158Smillert LEAVE_SRC 0x0008 X 735b39c5158Smillert PERLQQ 0x0100 X 736b39c5158Smillert HTMLCREF 0x0200 737b39c5158Smillert XMLCREF 0x0400 738b39c5158Smillert 739e9ce3842Safresh1=head3 LEAVE_SRC 740b39c5158Smillert 741e9ce3842Safresh1 Encode::LEAVE_SRC 742b39c5158Smillert 74348950c12SsthenIf the C<Encode::LEAVE_SRC> bit is I<not> set but I<CHECK> is set, then the 744e9ce3842Safresh1source string to encode() or decode() will be overwritten in place. 74548950c12SsthenIf you do not want the source string overwritten, bitwise-OR C<Encode::LEAVE_SRC> into I<CHECK>. 746b39c5158Smillert 747b39c5158Smillert=head2 coderef for CHECK 748b39c5158Smillert 74948950c12SsthenAs of C<Encode> 2.12, C<CHECK> can also be a code reference which takes the 750e5157e49Safresh1ordinal value of the unmapped character as an argument and returns 751e5157e49Safresh1octets that represent the fallback character. For instance: 752b39c5158Smillert 753b39c5158Smillert $ascii = encode("ascii", $utf8, sub{ sprintf "<U+%04X>", shift }); 754b39c5158Smillert 75548950c12SsthenActs like C<FB_PERLQQ> but U+I<XXXX> is used instead of C<\x{I<XXXX>}>. 756b39c5158Smillert 7579f11ffb7Safresh1Fallback for C<decode> must return decoded string (sequence of characters) 7589f11ffb7Safresh1and takes a list of ordinal values as its arguments.
So for 759b8851fccSafresh1example if you wish to decode octets as UTF-8, and use ISO-8859-15 as 760e5157e49Safresh1a fallback for bytes that are not valid UTF-8, you could write 761e5157e49Safresh1 762e5157e49Safresh1 $str = decode 'UTF-8', $octets, sub { 7639f11ffb7Safresh1 my $tmp = join '', map chr, @_; 7649f11ffb7Safresh1 return decode 'ISO-8859-15', $tmp; 765e5157e49Safresh1 }; 766e5157e49Safresh1 767b39c5158Smillert=head1 Defining Encodings 768b39c5158Smillert 769b39c5158SmillertTo define a new encoding, use: 770b39c5158Smillert 771b39c5158Smillert use Encode qw(define_encoding); 77248950c12Ssthen define_encoding($object, CANONICAL_NAME [, alias...]); 773b39c5158Smillert 77448950c12SsthenI<CANONICAL_NAME> will be associated with I<$object>. The object 775b39c5158Smillertshould provide the interface described in L<Encode::Encoding>. 77648950c12SsthenIf more than two arguments are provided, additional 77748950c12Ssthenarguments are considered aliases for I<$object>. 778b39c5158Smillert 77948950c12SsthenSee L<Encode::Encoding> for details. 780b39c5158Smillert 781b39c5158Smillert=head1 The UTF8 flag 782b39c5158Smillert 783*3d61058aSafresh1Before the introduction of Unicode support in Perl, the C<eq> operator 784b39c5158Smillertjust compared the strings represented by two scalars. Beginning with 78548950c12SsthenPerl 5.8, C<eq> compares two strings with simultaneous consideration of 78648950c12SsthenI<the UTF8 flag>. To explain why we made it so, I quote from page 402 of 78748950c12SsthenI<Programming Perl, 3rd ed.> 788b39c5158Smillert 789b39c5158Smillert=over 2 790b39c5158Smillert 791b39c5158Smillert=item Goal #1: 792b39c5158Smillert 793b39c5158SmillertOld byte-oriented programs should not spontaneously break on the old 794b39c5158Smillertbyte-oriented data they used to work on. 
795b39c5158Smillert 796b39c5158Smillert=item Goal #2: 797b39c5158Smillert 798b39c5158SmillertOld byte-oriented programs should magically start working on the new 799b39c5158Smillertcharacter-oriented data when appropriate. 800b39c5158Smillert 801b39c5158Smillert=item Goal #3: 802b39c5158Smillert 803b39c5158SmillertPrograms should run just as fast in the new character-oriented mode 804b39c5158Smillertas in the old byte-oriented mode. 805b39c5158Smillert 806b39c5158Smillert=item Goal #4: 807b39c5158Smillert 808b39c5158SmillertPerl should remain one language, rather than forking into a 809b39c5158Smillertbyte-oriented Perl and a character-oriented Perl. 810b39c5158Smillert 811b39c5158Smillert=back 812b39c5158Smillert 81348950c12SsthenWhen I<Programming Perl, 3rd ed.> was written, not even Perl 5.6.0 had been 81448950c12Ssthenborn yet, so many features documented in the book remained unimplemented for a 81548950c12Ssthenlong time. Perl 5.8 corrected much of this, and the introduction of the 81648950c12SsthenUTF8 flag is one of those corrections. You can think of there being two fundamentally 81748950c12Ssthendifferent kinds of strings and string-operations in Perl: one a 81848950c12Ssthenbyte-oriented mode for when the internal UTF8 flag is off, and the other a 81948950c12Ssthencharacter-oriented mode for when the internal UTF8 flag is on. 820b39c5158Smillert 82148950c12SsthenThis UTF8 flag is not visible in Perl scripts, exactly for the same reason 82248950c12Ssthenyou cannot (or rather, you I<don't have to>) see whether a scalar contains 82348950c12Ssthena string, an integer, or a floating-point number. But you can still peek 82448950c12Ssthenand poke these if you will. See the next section. 825b39c5158Smillert 826b39c5158Smillert=head2 Messing with Perl's Internals 827b39c5158Smillert 828b39c5158SmillertThe following API uses parts of Perl's internals in the current 82948950c12Ssthenimplementation. As such, they are efficient but may change in a future 83048950c12Ssthenrelease.
831b39c5158Smillert 832e9ce3842Safresh1=head3 is_utf8 833b39c5158Smillert 834e9ce3842Safresh1 is_utf8(STRING [, CHECK]) 835b39c5158Smillert 83648950c12Ssthen[INTERNAL] Tests whether the UTF8 flag is turned on in the I<STRING>. 83748950c12SsthenIf I<CHECK> is true, also checks whether I<STRING> contains well-formed 838b39c5158SmillertUTF-8. Returns true if successful, false otherwise. 839b39c5158Smillert 8409f11ffb7Safresh1Typically only necessary for debugging and testing. Don't use this flag as 8419f11ffb7Safresh1a marker to distinguish character and binary data; that should be decided 8429f11ffb7Safresh1for each variable when you write your code. 8439f11ffb7Safresh1 8449f11ffb7Safresh1B<CAVEAT>: If I<STRING> has UTF8 flag set, it does B<NOT> mean that 8459f11ffb7Safresh1I<STRING> is UTF-8 encoded and vice-versa. 8469f11ffb7Safresh1 84748950c12SsthenAs of Perl 5.8.1, L<utf8> also has the C<utf8::is_utf8> function. 848b39c5158Smillert 849e9ce3842Safresh1=head3 _utf8_on 850e9ce3842Safresh1 851e9ce3842Safresh1 _utf8_on(STRING) 852b39c5158Smillert 85348950c12Ssthen[INTERNAL] Turns the I<STRING>'s internal UTF8 flag B<on>. The I<STRING> 85448950c12Ssthenis I<not> checked for containing only well-formed UTF-8. Do not use this 85548950c12Ssthenunless you I<know with absolute certainty> that the STRING holds only 85648950c12Ssthenwell-formed UTF-8. Returns the previous state of the UTF8 flag (so please 85748950c12Ssthendon't treat the return value as indicating success or failure), or C<undef> 85848950c12Ssthenif I<STRING> is not a string. 859b39c5158Smillert 86048950c12SsthenB<NOTE>: For security reasons, this function does not work on tainted values. 861b39c5158Smillert 862e9ce3842Safresh1=head3 _utf8_off 863e9ce3842Safresh1 864e9ce3842Safresh1 _utf8_off(STRING) 865b39c5158Smillert 86648950c12Ssthen[INTERNAL] Turns the I<STRING>'s internal UTF8 flag B<off>. Do not use 86748950c12Ssthenfrivolously.
Returns the previous state of the UTF8 flag, or C<undef> if 86848950c12SsthenI<STRING> is not a string. Do not treat the return value as indicative of 86948950c12Ssthensuccess or failure, because that isn't what it means: it is only the 87048950c12Ssthenprevious setting. 871b39c5158Smillert 87248950c12SsthenB<NOTE>: For security reasons, this function does not work on tainted values. 873b39c5158Smillert 874b39c5158Smillert=head1 UTF-8 vs. utf8 vs. UTF8 875b39c5158Smillert 876b39c5158Smillert ....We now view strings not as sequences of bytes, but as sequences 877b39c5158Smillert of numbers in the range 0 .. 2**32-1 (or in the case of 64-bit 878b39c5158Smillert computers, 0 .. 2**64-1) -- Programming Perl, 3rd ed. 879b39c5158Smillert 88048950c12SsthenThat has historically been Perl's notion of UTF-8, as that is how UTF-8 was 88148950c12Ssthenfirst conceived by Ken Thompson when he invented it. However, thanks to 88248950c12Ssthenlater revisions to the applicable standards, official UTF-8 is now rather 88348950c12Ssthenstricter than that. For example, its range is much narrower (0 .. 0x10_FFFF 88448950c12Ssthento cover only 21 bits instead of 32 or 64 bits) and some sequences 88548950c12Ssthenare not allowed, like those used in surrogate pairs, the 32 non-character 88648950c12Ssthencode points 0xFDD0 .. 0xFDEF, the last two code points in I<any> plane 88748950c12Ssthen(0xI<XX>_FFFE and 0xI<XX>_FFFF), all non-shortest encodings, etc.
888b39c5158Smillert 88948950c12SsthenThe former default in which Perl would always use a loose interpretation of 89048950c12SsthenUTF-8 has now been overruled: 891b39c5158Smillert 892b39c5158Smillert From: Larry Wall <larry@wall.org> 893b39c5158Smillert Date: December 04, 2004 11:51:58 JST 894b39c5158Smillert To: perl-unicode@perl.org 895b39c5158Smillert Subject: Re: Make Encode.pm support the real UTF-8 896b39c5158Smillert Message-Id: <20041204025158.GA28754@wall.org> 897b39c5158Smillert 898b39c5158Smillert On Fri, Dec 03, 2004 at 10:12:12PM +0000, Tim Bunce wrote: 899b39c5158Smillert : I've no problem with 'utf8' being perl's unrestricted uft8 encoding, 900b39c5158Smillert : but "UTF-8" is the name of the standard and should give the 901b39c5158Smillert : corresponding behaviour. 902b39c5158Smillert 903b39c5158Smillert For what it's worth, that's how I've always kept them straight in my 904b39c5158Smillert head. 905b39c5158Smillert 906b39c5158Smillert Also for what it's worth, Perl 6 will mostly default to strict but 907b39c5158Smillert make it easy to switch back to lax. 908b39c5158Smillert 909b39c5158Smillert Larry 910b39c5158Smillert 91148950c12SsthenGot that? As of Perl 5.8.7, B<"UTF-8"> means UTF-8 in its current 91248950c12Ssthensense, which is conservative and strict and security-conscious, whereas 91348950c12SsthenB<"utf8"> means UTF-8 in its former sense, which was liberal and loose and 91448950c12Ssthenlax. C<Encode> version 2.10 or later thus groks this subtle but critically 91548950c12Ssthenimportant distinction between C<"UTF-8"> and C<"utf8">. 916b39c5158Smillert 917b39c5158Smillert encode("utf8", "\x{FFFF_FFFF}", 1); # okay 918b39c5158Smillert encode("UTF-8", "\x{FFFF_FFFF}", 1); # croaks 919b39c5158Smillert 920eac174f2Safresh1This distinction is also important for decoding. In the following, 921eac174f2Safresh1C<$s> stores character U+200000, which exceeds UTF-8's allowed range. 
922eac174f2Safresh1C<$s> thus stores an invalid Unicode code point: 923eac174f2Safresh1 924eac174f2Safresh1 $s = decode("utf8", "\xf8\x88\x80\x80\x80"); 925eac174f2Safresh1 926eac174f2Safresh1C<"UTF-8">, by contrast, will either coerce the input to something valid: 927eac174f2Safresh1 928eac174f2Safresh1 $s = decode("UTF-8", "\xf8\x88\x80\x80\x80"); # U+FFFD 929eac174f2Safresh1 930eac174f2Safresh1.. or croak: 931eac174f2Safresh1 932eac174f2Safresh1 decode("UTF-8", "\xf8\x88\x80\x80\x80", FB_CROAK|LEAVE_SRC); 933eac174f2Safresh1 93448950c12SsthenIn the C<Encode> module, C<"UTF-8"> is actually a canonical name for 93548950c12SsthenC<"utf-8-strict">. That hyphen between the C<"UTF"> and the C<"8"> is 93648950c12Ssthencritical; without it, C<Encode> goes "liberal" and (perhaps overly-)permissive: 937b39c5158Smillert 938b39c5158Smillert find_encoding("UTF-8")->name # is 'utf-8-strict' 939b39c5158Smillert find_encoding("utf-8")->name # ditto. names are case insensitive 940b39c5158Smillert find_encoding("utf_8")->name # ditto. "_" are treated as "-" 941b39c5158Smillert find_encoding("UTF8")->name # is 'utf8'. 942b39c5158Smillert 94348950c12SsthenPerl's internal UTF8 flag is called "UTF8", without a hyphen. It indicates 94448950c12Ssthenwhether a string is internally encoded as "utf8", also without a hyphen. 
945b39c5158Smillert 946b39c5158Smillert=head1 SEE ALSO 947b39c5158Smillert 948b39c5158SmillertL<Encode::Encoding>, 949b39c5158SmillertL<Encode::Supported>, 950b39c5158SmillertL<Encode::PerlIO>, 951b39c5158SmillertL<encoding>, 952b39c5158SmillertL<perlebcdic>, 953b39c5158SmillertL<perlfunc/open>, 954b39c5158SmillertL<perlunicode>, L<perluniintro>, L<perlunifaq>, L<perlunitut>, 955b39c5158SmillertL<utf8>, 956e9ce3842Safresh1the Perl Unicode Mailing List L<http://lists.perl.org/list/perl-unicode.html> 957b39c5158Smillert 958b39c5158Smillert=head1 MAINTAINER 959b39c5158Smillert 96048950c12SsthenThis project was originated by the late Nick Ing-Simmons and later 961e9ce3842Safresh1maintained by Dan Kogai I<< <dankogai@cpan.org> >>. See AUTHORS 96248950c12Ssthenfor a full list of people involved. For any questions, send mail to 96348950c12SsthenI<< <perl-unicode@perl.org> >> so that we can all share. 964b39c5158Smillert 96548950c12SsthenWhile Dan Kogai retains the copyright as a maintainer, credit 96648950c12Ssthenshould go to all those involved. See AUTHORS for a list of those 96748950c12Ssthenwho submitted code to the project. 968b39c5158Smillert 969b39c5158Smillert=head1 COPYRIGHT 970b39c5158Smillert 971b8851fccSafresh1Copyright 2002-2014 Dan Kogai I<< <dankogai@cpan.org> >>. 972b39c5158Smillert 973b39c5158SmillertThis library is free software; you can redistribute it and/or modify 974b39c5158Smillertit under the same terms as Perl itself. 975b39c5158Smillert 976b39c5158Smillert=cut 977