#
# $Id: Encode.pm,v 2.49 2013/03/05 03:13:47 dankogai Exp dankogai $
#
package Encode;
use strict;
use warnings;
our $VERSION = sprintf "%d.%02d", q$Revision: 2.49 $ =~ /(\d+)/g;
use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG};
use XSLoader ();
XSLoader::load( __PACKAGE__, $VERSION );

require Exporter;
use base qw/Exporter/;

# Public, encouraged API is exported by default

our @EXPORT = qw(
  decode decode_utf8 encode encode_utf8 str2bytes bytes2str
  encodings find_encoding clone_encoding
);
our @FB_FLAGS = qw(
  DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC
  PERLQQ HTMLCREF XMLCREF STOP_AT_PARTIAL
);
our @FB_CONSTS = qw(
  FB_DEFAULT FB_CROAK FB_QUIET FB_WARN
  FB_PERLQQ FB_HTMLCREF FB_XMLCREF
);
our @EXPORT_OK = (
    qw(
      _utf8_off _utf8_on define_encoding from_to is_16bit is_8bit
      is_utf8 perlio_ok resolve_alias utf8_downgrade utf8_upgrade
      ),
    @FB_FLAGS, @FB_CONSTS,
);

our %EXPORT_TAGS = (
    all          => [ @EXPORT,    @EXPORT_OK ],
    default      => [ @EXPORT ],
    fallbacks    => [ @FB_CONSTS ],
    fallback_all => [ @FB_CONSTS, @FB_FLAGS ],
);

# Documentation moved after __END__ for speed - NI-S

our $ON_EBCDIC = ( ord("A") == 193 );

use Encode::Alias;

# Make a %Encoding package variable to allow a certain amount of cheating
our %Encoding;
our %ExtModule;
require Encode::Config;

# See
# https://bugzilla.redhat.com/show_bug.cgi?id=435505#c2
# to find why sig handlers inside eval{} are disabled.
eval {
    local $SIG{__DIE__};
    local $SIG{__WARN__};
    require Encode::ConfigLocal;
};

# Encode->encodings([":all" | MODULE ...])
#
# Returns the names of the known encodings, sorted case-insensitively.
# The selector is read from $_[1], i.e. this is meant to be called as a
# class method.  With ":all" the names registered in %ExtModule (not yet
# loaded) are included; otherwise every argument is taken as a module
# name ("Encode::" is prepended when it contains no "::") and the
# encodings that %ExtModule maps to that module are added to the loaded
# ones.  The pseudo-encodings Internal, Unicode and Guess are never listed.
sub encodings {
    my %enc;
    my $arg = $_[1] || '';
    if ( $arg eq ":all" ) {
        %enc = ( %Encoding, %ExtModule );
    }
    else {
        %enc = %Encoding;
        for my $mod ( map { m/::/ ? $_ : "Encode::$_" } @_ ) {
            DEBUG and warn $mod;
            for my $enc ( keys %ExtModule ) {
                $ExtModule{$enc} eq $mod and $enc{$enc} = $mod;
            }
        }
    }
    return sort { lc $a cmp lc $b }
      grep { !/^(?:Internal|Unicode|Guess)$/o } keys %enc;
}

# perlio_ok(ENCODING)
#
# ENCODING is an encoding object or a name to be looked up with
# find_encoding().  Returns whatever the object's own perlio_ok() method
# reports, or 0 when the encoding is unknown or has no such method.
sub perlio_ok {
    my $obj = ref( $_[0] ) ? $_[0] : find_encoding( $_[0] );

    # An unknown name used to die here with "Can't call method "can" on
    # an undefined value"; an encoding we cannot find is simply not usable.
    defined $obj or return 0;
    $obj->can("perlio_ok") and return $obj->perlio_ok();
    return 0;    # safety net
}

# define_encoding(OBJECT, CANONICAL_NAME [, ALIAS ...])
#
# Registers OBJECT in %Encoding under CANONICAL_NAME, aliases the
# lower-cased name to it when that differs, registers every further
# argument as an alias, and returns OBJECT.
sub define_encoding {
    my $obj  = shift;
    my $name = shift;
    $Encoding{$name} = $obj;
    my $lc = lc($name);
    define_alias( $lc => $obj ) unless $lc eq $name;
    while (@_) {
        my $alias = shift;
        define_alias( $alias, $obj );
    }
    return $obj;
}

# CLASS->getEncoding(NAME [, SKIP_EXTERNAL])
#
# Resolves NAME to an encoding object.  Lookup order: NAME itself when it
# is already an object that can('renew'); %Encoding by exact and then
# lower-cased name; CLASS->find_alias() for both spellings; and finally,
# unless SKIP_EXTERNAL is true, the module registered in %ExtModule is
# loaded and %Encoding is consulted again.  Returns an empty list / undef
# when nothing matches or NAME is undef.
sub getEncoding {
    my ( $class, $name, $skip_external ) = @_;

    # Nothing can match an undefined name; bail out before the
    # substitution and hash lookups below warn about it.
    defined $name or return;

    $name =~ s/\s+//g;    # https://rt.cpan.org/Ticket/Display.html?id=65796

    ref($name) && $name->can('renew') and return $name;
    exists $Encoding{$name} and return $Encoding{$name};
    my $lc = lc $name;
    exists $Encoding{$lc} and return $Encoding{$lc};

    my $oc = $class->find_alias($name);
    defined($oc) and return $oc;
    $lc ne $name and $oc = $class->find_alias($lc);
    defined($oc) and return $oc;

    unless ($skip_external) {
        if ( my $mod = $ExtModule{$name} || $ExtModule{$lc} ) {
            $mod =~ s,::,/,g;
            $mod .= '.pm';
            eval { require $mod; };    # best effort: a failed load means "not found"
            exists $Encoding{$name} and return $Encoding{$name};

            # The module may have been located through the lower-cased
            # key, so look the freshly loaded encoding up that way too.
            exists $Encoding{$lc} and return $Encoding{$lc};
        }
    }
    return;
}

# find_encoding(NAME [, SKIP_EXTERNAL])
#
# Functional front end to Encode->getEncoding().
sub find_encoding($;$) {
    my ( $name, $skip_external ) = @_;
    return __PACKAGE__->getEncoding( $name, $skip_external );
}

# resolve_alias(NAME)
#
# Returns the name() of the encoding object NAME resolves to, or an
# empty list / undef when NAME is unknown.
sub resolve_alias($) {
    my $obj = find_encoding(shift);
    defined $obj and return $obj->name;
    return;
}

# clone_encoding(NAME)
#
# Returns a Storable::dclone() copy of the encoding object for NAME.
# Returns an empty list / undef when NAME does not resolve to an object
# or Storable cannot be loaded.
sub clone_encoding($) {
    my $obj = find_encoding(shift);
    ref $obj or return;
    eval { require Storable };
    $@ and return;
    return Storable::dclone($obj);
}

# encode(ENCODING, STRING [, CHECK])
#
# Encodes STRING with the encoding object found for ENCODING and returns
# the result.  Returns undef when STRING is undef; croaks when ENCODING is
# undef or unknown.  When CHECK is a true non-reference value without the
# LEAVE_SRC bit, the caller's STRING variable is overwritten with what the
# encoder left in its working copy.
sub encode($$;$) {
    my ( $name, $string, $check ) = @_;
    return undef unless defined $string;
    $string .= '';    # stringify;
    $check ||= 0;
    unless ( defined $name ) {
        require Carp;
        Carp::croak("Encoding name should not be undef");
    }
    my $enc = find_encoding($name);
    unless ( defined $enc ) {
        require Carp;
        Carp::croak("Unknown encoding '$name'");
    }
    my $octets = $enc->encode( $string, $check );
    $_[1] = $string if $check and !ref $check and !( $check & LEAVE_SRC() );
    return $octets;
}
*str2bytes = \&encode;

# decode(ENCODING, OCTETS [, CHECK])
#
# Decodes OCTETS with the encoding object found for ENCODING and returns
# the result.  Returns undef when OCTETS is undef; croaks when ENCODING is
# undef or unknown.  When CHECK is a true non-reference value without the
# LEAVE_SRC bit, the caller's OCTETS variable is overwritten with what the
# decoder left in its working copy.
sub decode($$;$) {
    my ( $name, $octets, $check ) = @_;
    return undef unless defined $octets;
    $octets .= '';    # stringify
    $check ||= 0;

    # Same guard as encode(): report an undef name as such instead of
    # falling through to "Unknown encoding ''" plus uninitialized warnings.
    unless ( defined $name ) {
        require Carp;
        Carp::croak("Encoding name should not be undef");
    }
    my $enc = find_encoding($name);
    unless ( defined $enc ) {
        require Carp;
        Carp::croak("Unknown encoding '$name'");
    }
    my $string = $enc->decode( $octets, $check );
    $_[1] = $octets if $check and !ref $check and !( $check & LEAVE_SRC() );
    return $string;
}
*bytes2str = \&decode;

# from_to(OCTETS, FROM_ENC, TO_ENC [, CHECK])
#
# Converts the caller's OCTETS variable in place: it is decoded with
# FROM_ENC (no CHECK is passed to that step) and the result is encoded
# with TO_ENC using CHECK.  Returns the length of the converted string;
# returns undef when OCTETS is undef, when the encoder returned undef, or
# when CHECK is true and the intermediate string still has content after
# encoding.  Croaks on an unknown encoding.
sub from_to($$$;$) {
    my ( $string, $from, $to, $check ) = @_;
    return undef unless defined $string;
    $check ||= 0;
    my $f = find_encoding($from);
    unless ( defined $f ) {
        require Carp;
        Carp::croak("Unknown encoding '$from'");
    }
    my $t = find_encoding($to);
    unless ( defined $t ) {
        require Carp;
        Carp::croak("Unknown encoding '$to'");
    }
    my $uni = $f->decode($string);
    $_[0] = $string = $t->encode( $uni, $check );
    return undef if ( $check && length($uni) );
    return defined( $_[0] ) ? length($string) : undef;
}

# encode_utf8(STRING)
#
# Returns a copy of STRING run through utf8::encode(); the caller's
# variable is left alone.
sub encode_utf8($) {
    my ($str) = @_;
    utf8::encode($str);
    return $str;
}

# Cached 'utf8' encoding object, looked up on first use by decode_utf8().
my $utf8enc;

# decode_utf8(OCTETS [, CHECK])
#
# Returns OCTETS unchanged when is_utf8() is already true for it, undef
# when it is undef; otherwise decodes it with the cached 'utf8' encoding
# object.  When CHECK is a true non-reference value without the LEAVE_SRC
# bit, the caller's OCTETS variable is overwritten with what the decoder
# left in its working copy.
sub decode_utf8($;$) {
    my ( $octets, $check ) = @_;
    return $octets if is_utf8($octets);
    return undef unless defined $octets;
    $octets .= '' if ref $octets;
    $check   ||= 0;
    $utf8enc ||= find_encoding('utf8');
    my $string = $utf8enc->decode( $octets, $check );
    $_[0] = $octets if $check and !ref $check and !( $check & LEAVE_SRC() );
    return $string;
}

# sub decode_utf8($;$) {
#     my ( $str, $check ) = @_;
#     return $str if is_utf8($str);
#     if ($check) {
#         return decode( "utf8", $str, $check );
#     }
#     else {
#         return decode( "utf8", $str );
#         return $str;
#     }
# }

predefine_encodings(1);

#
# This is to restore %Encoding if really needed;
#

# predefine_encodings(USE_XS)
#
# (Re)installs the built-in encodings into %Encoding: "Unicode"
# (Encode::UTF_EBCDIC on EBCDIC platforms, Encode::Internal elsewhere),
# "utf8" and "utf-8-strict".  A true USE_XS wires Encode::utf8 to the
# decode_xs/encode_xs routines; a false one installs pure-Perl
# replacements.
sub predefine_encodings {
    require Encode::Encoding;
    no warnings 'redefine';
    my $use_xs = shift;
    if ($ON_EBCDIC) {

        # was in Encode::UTF_EBCDIC
        package Encode::UTF_EBCDIC;
        push @Encode::UTF_EBCDIC::ISA, 'Encode::Encoding';
        *decode = sub {
            my ( undef, $str, $chk ) = @_;
            my $res = '';
            for ( my $i = 0 ; $i < length($str) ; $i++ ) {
                $res .=
                  chr(
                    utf8::unicode_to_native( ord( substr( $str, $i, 1 ) ) )
                  );
            }
            $_[1] = '' if $chk;
            return $res;
        };
        *encode = sub {
            my ( undef, $str, $chk ) = @_;
            my $res = '';
            for ( my $i = 0 ; $i < length($str) ; $i++ ) {
                $res .=
                  chr(
                    utf8::native_to_unicode( ord( substr( $str, $i, 1 ) ) )
                  );
            }
            $_[1] = '' if $chk;
            return $res;
        };
        $Encode::Encoding{Unicode} =
          bless { Name => "UTF_EBCDIC" } => "Encode::UTF_EBCDIC";
    }
    else {

        package Encode::Internal;
        push @Encode::Internal::ISA, 'Encode::Encoding';
        *decode = sub {
            my ( undef, $str, $chk ) = @_;
            utf8::upgrade($str);
            $_[1] = '' if $chk;
            return $str;
        };
        *encode = \&decode;
        $Encode::Encoding{Unicode} =
          bless { Name => "Internal" } => "Encode::Internal";
    }

    {

        # was in Encode::utf8
        package Encode::utf8;
        push @Encode::utf8::ISA, 'Encode::Encoding';

        #
        if ($use_xs) {
            Encode::DEBUG and warn __PACKAGE__, " XS on";
            *decode = \&decode_xs;
            *encode = \&encode_xs;
        }
        else {
            Encode::DEBUG and warn __PACKAGE__, " XS off";
            *decode = sub {
                my ( undef, $octets, $chk ) = @_;
                return undef unless defined $octets;

                # Decode with the utf8:: builtins.  This used to call
                # Encode::decode_utf8(), but that function dispatches to
                # the 'utf8' encoding object's decode method -- i.e. to
                # this very closure -- and so recursed without end.
                my $str = $octets;
                unless ( utf8::is_utf8($str) ) {
                    utf8::decode($str) or return undef;
                }
                $_[1] = '' if $chk;
                return $str;
            };
            *encode = sub {
                my ( undef, $string, $chk ) = @_;
                my $octets = Encode::encode_utf8($string);
                $_[1] = '' if $chk;
                return $octets;
            };
        }
        *cat_decode = sub {    # ($obj, $dst, $src, $pos, $trm, $chk)
                               # currently ignores $chk
            my ( undef, undef, undef, $pos, $trm ) = @_;

            # References into @_ so the caller's $dst, $src and $pos are
            # updated in place.
            my ( $rdst, $rsrc, $rpos ) = \@_[ 1, 2, 3 ];
            use bytes;
            if ( ( my $npos = index( $$rsrc, $trm, $pos ) ) >= 0 ) {
                $$rdst .=
                  substr( $$rsrc, $pos, $npos - $pos + length($trm) );
                $$rpos = $npos + length($trm);
                return 1;
            }
            $$rdst .= substr( $$rsrc, $pos );
            $$rpos = length($$rsrc);
            return '';
        };
        $Encode::Encoding{utf8} =
          bless { Name => "utf8" } => "Encode::utf8";
        $Encode::Encoding{"utf-8-strict"} =
          bless { Name => "utf-8-strict", strict_utf8 => 1 }
          => "Encode::utf8";
    }
}

1;

__END__

=head1 NAME

Encode - character encodings in Perl

=head1 SYNOPSIS

    use Encode qw(decode encode);
    $characters = decode('UTF-8', $octets, Encode::FB_CROAK);
    $octets = encode('UTF-8', $characters, Encode::FB_CROAK);

=head2 Table of Contents

Encode consists of a collection of modules whose details are too extensive
to fit in one document. This one itself explains the top-level APIs
and general topics at a glance.
For other topics and more details, 363see the documentation for these modules: 364 365=over 2 366 367=item L<Encode::Alias> - Alias definitions to encodings 368 369=item L<Encode::Encoding> - Encode Implementation Base Class 370 371=item L<Encode::Supported> - List of Supported Encodings 372 373=item L<Encode::CN> - Simplified Chinese Encodings 374 375=item L<Encode::JP> - Japanese Encodings 376 377=item L<Encode::KR> - Korean Encodings 378 379=item L<Encode::TW> - Traditional Chinese Encodings 380 381=back 382 383=head1 DESCRIPTION 384 385The C<Encode> module provides the interface between Perl strings 386and the rest of the system. Perl strings are sequences of 387I<characters>. 388 389The repertoire of characters that Perl can represent is a superset of those 390defined by the Unicode Consortium. On most platforms the ordinal 391values of a character as returned by C<ord(I<S>)> is the I<Unicode 392codepoint> for that character. The exceptions are platforms where 393the legacy encoding is some variant of EBCDIC rather than a superset 394of ASCII; see L<perlebcdic>. 395 396During recent history, data is moved around a computer in 8-bit chunks, 397often called "bytes" but also known as "octets" in standards documents. 398Perl is widely used to manipulate data of many types: not only strings of 399characters representing human or computer languages, but also "binary" 400data, being the machine's representation of numbers, pixels in an image, or 401just about anything. 402 403When Perl is processing "binary data", the programmer wants Perl to 404process "sequences of bytes". This is not a problem for Perl: because a 405byte has 256 possible values, it easily fits in Perl's much larger 406"logical character". 407 408This document mostly explains the I<how>. L<perlunitut> and L<perlunifaq> 409explain the I<why>. 410 411=head2 TERMINOLOGY 412 413=head3 character 414 415A character in the range 0 .. 2**32-1 (or more); 416what Perl's strings are made of. 
417 418=head3 byte 419 420A character in the range 0..255; 421a special case of a Perl character. 422 423=head3 octet 424 4258 bits of data, with ordinal values 0..255; 426term for bytes passed to or from a non-Perl context, such as a disk file, 427standard I/O stream, database, command-line argument, environment variable, 428socket etc. 429 430=head1 THE PERL ENCODING API 431 432=head2 Basic methods 433 434=head3 encode 435 436 $octets = encode(ENCODING, STRING[, CHECK]) 437 438Encodes the scalar value I<STRING> from Perl's internal form into 439I<ENCODING> and returns a sequence of octets. I<ENCODING> can be either a 440canonical name or an alias. For encoding names and aliases, see 441L</"Defining Aliases">. For CHECK, see L</"Handling Malformed Data">. 442 443For example, to convert a string from Perl's internal format into 444ISO-8859-1, also known as Latin1: 445 446 $octets = encode("iso-8859-1", $string); 447 448B<CAVEAT>: When you run C<$octets = encode("utf8", $string)>, then 449$octets I<might not be equal to> $string. Though both contain the 450same data, the UTF8 flag for $octets is I<always> off. When you 451encode anything, the UTF8 flag on the result is always off, even when it 452contains a completely valid utf8 string. See L</"The UTF8 flag"> below. 453 454If the $string is C<undef>, then C<undef> is returned. 455 456=head3 decode 457 458 $string = decode(ENCODING, OCTETS[, CHECK]) 459 460This function returns the string that results from decoding the scalar 461value I<OCTETS>, assumed to be a sequence of octets in I<ENCODING>, into 462Perl's internal form. It returns the resulting string. As with encode(), 463I<ENCODING> can be either a canonical name or an alias. For encoding names 464and aliases, see L</"Defining Aliases">; for I<CHECK>, see L</"Handling 465Malformed Data">. 
466 467For example, to convert ISO-8859-1 data into a string in Perl's 468internal format: 469 470 $string = decode("iso-8859-1", $octets); 471 472B<CAVEAT>: When you run C<$string = decode("utf8", $octets)>, then $string 473I<might not be equal to> $octets. Though both contain the same data, the 474UTF8 flag for $string is on unless $octets consists entirely of ASCII data 475on ASCII machines or EBCDIC on EBCDIC machines. See L</"The UTF8 flag"> 476below. 477 478If $octets is C<undef>, then C<undef> is returned. 479 480=head3 find_encoding 481 482 [$obj =] find_encoding(ENCODING) 483 484Returns the I<encoding object> corresponding to I<ENCODING>. Returns 485C<undef> if no matching I<ENCODING> is found. The returned object is 486what does the actual encoding or decoding. 487 488 $utf8 = decode($name, $bytes); 489 490is in fact 491 492 $utf8 = do { 493 $obj = find_encoding($name); 494 croak qq(encoding "$name" not found) unless ref $obj; 495 $obj->decode($bytes); 496 }; 497 498with more error checking. 499 500You can therefore save time by reusing this object as follows: 501 502 my $enc = find_encoding("iso-8859-1"); 503 while(<>) { 504 my $utf8 = $enc->decode($_); 505 ... # now do something with $utf8; 506 } 507 508Besides L</decode> and L</encode>, other methods are 509available as well. For instance, C<name()> returns the canonical 510name of the encoding object. 511 512 find_encoding("latin1")->name; # iso-8859-1 513 514See L<Encode::Encoding> for details. 515 516=head3 from_to 517 518 [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK]) 519 520Converts I<in-place> data between two encodings. The data in $octets 521must be encoded as octets and I<not> as characters in Perl's internal 522format. 
For example, to convert ISO-8859-1 data into Microsoft's CP1250 523encoding: 524 525 from_to($octets, "iso-8859-1", "cp1250"); 526 527and to convert it back: 528 529 from_to($octets, "cp1250", "iso-8859-1"); 530 531Because the conversion happens in place, the data to be 532converted cannot be a string constant: it must be a scalar variable. 533 534C<from_to()> returns the length of the converted string in octets on success, 535and C<undef> on error. 536 537B<CAVEAT>: The following operations may look the same, but are not: 538 539 from_to($data, "iso-8859-1", "utf8"); #1 540 $data = decode("iso-8859-1", $data); #2 541 542Both #1 and #2 make $data consist of a completely valid UTF-8 string, 543but only #2 turns the UTF8 flag on. #1 is equivalent to: 544 545 $data = encode("utf8", decode("iso-8859-1", $data)); 546 547See L</"The UTF8 flag"> below. 548 549Also note that: 550 551 from_to($octets, $from, $to, $check); 552 553is equivalent to: 554 555 $octets = encode($to, decode($from, $octets), $check); 556 557Yes, it does I<not> respect the $check during decoding. It is 558deliberately done that way. If you need minute control, use C<decode> 559followed by C<encode> as follows: 560 561 $octets = encode($to, decode($from, $octets, $check_from), $check_to); 562 563=head3 encode_utf8 564 565 $octets = encode_utf8($string); 566 567Equivalent to C<$octets = encode("utf8", $string)>. The characters in 568$string are encoded in Perl's internal format, and the result is returned 569as a sequence of octets. Because all possible characters in Perl have a 570(loose, not strict) UTF-8 representation, this function cannot fail. 571 572=head3 decode_utf8 573 574 $string = decode_utf8($octets [, CHECK]); 575 576Equivalent to C<$string = decode("utf8", $octets [, CHECK])>. 577The sequence of octets represented by $octets is decoded 578from UTF-8 into a sequence of logical characters. 
579Because not all sequences of octets are valid UTF-8, 580it is quite possible for this function to fail. 581For CHECK, see L</"Handling Malformed Data">. 582 583=head2 Listing available encodings 584 585 use Encode; 586 @list = Encode->encodings(); 587 588Returns a list of canonical names of available encodings that have already 589been loaded. To get a list of all available encodings including those that 590have not yet been loaded, say: 591 592 @all_encodings = Encode->encodings(":all"); 593 594Or you can give the name of a specific module: 595 596 @with_jp = Encode->encodings("Encode::JP"); 597 598When "C<::>" is not in the name, "C<Encode::>" is assumed. 599 600 @ebcdic = Encode->encodings("EBCDIC"); 601 602To find out in detail which encodings are supported by this package, 603see L<Encode::Supported>. 604 605=head2 Defining Aliases 606 607To add a new alias to a given encoding, use: 608 609 use Encode; 610 use Encode::Alias; 611 define_alias(NEWNAME => ENCODING); 612 613After that, I<NEWNAME> can be used as an alias for I<ENCODING>. 614I<ENCODING> may be either the name of an encoding or an 615I<encoding object>. 616 617Before you do that, first make sure the alias is nonexistent using 618C<resolve_alias()>, which returns the canonical name thereof. 619For example: 620 621 Encode::resolve_alias("latin1") eq "iso-8859-1" # true 622 Encode::resolve_alias("iso-8859-12") # false; nonexistent 623 Encode::resolve_alias($name) eq $name # true if $name is canonical 624 625C<resolve_alias()> does not need C<use Encode::Alias>; it can be 626imported via C<use Encode qw(resolve_alias)>. 627 628See L<Encode::Alias> for details. 629 630=head2 Finding IANA Character Set Registry names 631 632The canonical name of a given encoding does not necessarily agree with 633IANA Character Set Registry, commonly seen as C<< Content-Type: 634text/plain; charset=I<WHATEVER> >>. For most cases, the canonical name 635works, but sometimes it does not, most notably with "utf-8-strict". 
636 637As of C<Encode> version 2.21, a new method C<mime_name()> is therefore added. 638 639 use Encode; 640 my $enc = find_encoding("UTF-8"); 641 warn $enc->name; # utf-8-strict 642 warn $enc->mime_name; # UTF-8 643 644See also: L<Encode::Encoding> 645 646=head1 Encoding via PerlIO 647 648If your perl supports C<PerlIO> (which is the default), you can use a 649C<PerlIO> layer to decode and encode directly via a filehandle. The 650following two examples are fully identical in functionality: 651 652 ### Version 1 via PerlIO 653 open(INPUT, "< :encoding(shiftjis)", $infile) 654 || die "Can't open < $infile for reading: $!"; 655 open(OUTPUT, "> :encoding(euc-jp)", $outfile) 656 || die "Can't open > $outfile for writing: $!"; 657 while (<INPUT>) { # auto decodes $_ 658 print OUTPUT; # auto encodes $_ 659 } 660 close(INPUT) || die "can't close $infile: $!"; 661 close(OUTPUT) || die "can't close $outfile: $!"; 662 663 ### Version 2 via from_to() 664 open(INPUT, "< :raw", $infile) 665 || die "Can't open < $infile for reading: $!"; 666 open(OUTPUT, "> :raw", $outfile) 667 || die "Can't open > $outfile for writing: $!"; 668 669 while (<INPUT>) { 670 from_to($_, "shiftjis", "euc-jp", 1); # switch encoding 671 print OUTPUT; # emit raw (but properly encoded) data 672 } 673 close(INPUT) || die "can't close $infile: $!"; 674 close(OUTPUT) || die "can't close $outfile: $!"; 675 676In the first version above, you let the appropriate encoding layer 677handle the conversion. In the second, you explicitly translate 678from one encoding to the other. 679 680Unfortunately, it may be that encodings are not C<PerlIO>-savvy. 
You can check 681to see whether your encoding is supported by C<PerlIO> by invoking the 682C<perlio_ok> method on it: 683 684 Encode::perlio_ok("hz"); # false 685 find_encoding("euc-cn")->perlio_ok; # true wherever PerlIO is available 686 687 use Encode qw(perlio_ok); # imported upon request 688 perlio_ok("euc-jp") 689 690Fortunately, all encodings that come with C<Encode> core are C<PerlIO>-savvy 691except for C<hz> and C<ISO-2022-kr>. For the gory details, see 692L<Encode::Encoding> and L<Encode::PerlIO>. 693 694=head1 Handling Malformed Data 695 696The optional I<CHECK> argument tells C<Encode> what to do when 697encountering malformed data. Without I<CHECK>, C<Encode::FB_DEFAULT> 698(== 0) is assumed. 699 700As of version 2.12, C<Encode> supports coderef values for C<CHECK>; 701see below. 702 703B<NOTE:> Not all encodings support this feature. 704Some encodings ignore the I<CHECK> argument. For example, 705L<Encode::Unicode> ignores I<CHECK> and it always croaks on error. 706 707=head2 List of I<CHECK> values 708 709=head3 FB_DEFAULT 710 711 I<CHECK> = Encode::FB_DEFAULT ( == 0) 712 713If I<CHECK> is 0, encoding and decoding replace any malformed character 714with a I<substitution character>. When you encode, I<SUBCHAR> is used. 715When you decode, the Unicode REPLACEMENT CHARACTER, code point U+FFFD, is 716used. If the data is supposed to be UTF-8, an optional lexical warning of 717warning category C<"utf8"> is given. 718 719=head3 FB_CROAK 720 721 I<CHECK> = Encode::FB_CROAK ( == 1) 722 723If I<CHECK> is 1, methods immediately die with an error 724message. Therefore, when I<CHECK> is 1, you should trap 725exceptions with C<eval{}>, unless you really want to let it C<die>. 726 727=head3 FB_QUIET 728 729 I<CHECK> = Encode::FB_QUIET 730 731If I<CHECK> is set to C<Encode::FB_QUIET>, encoding and decoding immediately 732return the portion of the data that has been processed so far when an 733error occurs. 
The data argument is overwritten with everything 734after that point; that is, the unprocessed portion of the data. This is 735handy when you have to call C<decode> repeatedly in the case where your 736source data may contain partial multi-byte character sequences, 737(that is, you are reading with a fixed-width buffer). Here's some sample 738code to do exactly that: 739 740 my($buffer, $string) = ("", ""); 741 while (read($fh, $buffer, 256, length($buffer))) { 742 $string .= decode($encoding, $buffer, Encode::FB_QUIET); 743 # $buffer now contains the unprocessed partial character 744 } 745 746=head3 FB_WARN 747 748 I<CHECK> = Encode::FB_WARN 749 750This is the same as C<FB_QUIET> above, except that instead of being silent 751on errors, it issues a warning. This is handy for when you are debugging. 752 753=head3 FB_PERLQQ FB_HTMLCREF FB_XMLCREF 754 755=over 2 756 757=item perlqq mode (I<CHECK> = Encode::FB_PERLQQ) 758 759=item HTML charref mode (I<CHECK> = Encode::FB_HTMLCREF) 760 761=item XML charref mode (I<CHECK> = Encode::FB_XMLCREF) 762 763=back 764 765For encodings that are implemented by the C<Encode::XS> module, C<CHECK> C<==> 766C<Encode::FB_PERLQQ> puts C<encode> and C<decode> into C<perlqq> fallback mode. 767 768When you decode, C<\xI<HH>> is inserted for a malformed character, where 769I<HH> is the hex representation of the octet that could not be decoded to 770utf8. When you encode, C<\x{I<HHHH>}> will be inserted, where I<HHHH> is 771the Unicode code point (in any number of hex digits) of the character that 772cannot be found in the character repertoire of the encoding. 773 774The HTML/XML character reference modes are about the same. In place of 775C<\x{I<HHHH>}>, HTML uses C<&#I<NNN>;> where I<NNN> is a decimal number, and 776XML uses C<&#xI<HHHH>;> where I<HHHH> is the hexadecimal number. 777 778In C<Encode> 2.10 or later, C<LEAVE_SRC> is also implied. 779 780=head3 The bitmask 781 782These modes are all actually set via a bitmask. 
Here is how the C<FB_I<XXX>> 783constants are laid out. You can import the C<FB_I<XXX>> constants via 784C<use Encode qw(:fallbacks)>, and you can import the generic bitmask 785constants via C<use Encode qw(:fallback_all)>. 786 787 FB_DEFAULT FB_CROAK FB_QUIET FB_WARN FB_PERLQQ 788 DIE_ON_ERR 0x0001 X 789 WARN_ON_ERR 0x0002 X 790 RETURN_ON_ERR 0x0004 X X 791 LEAVE_SRC 0x0008 X 792 PERLQQ 0x0100 X 793 HTMLCREF 0x0200 794 XMLCREF 0x0400 795 796=head3 LEAVE_SRC 797 798 Encode::LEAVE_SRC 799 800If the C<Encode::LEAVE_SRC> bit is I<not> set but I<CHECK> is set, then the 801source string to encode() or decode() will be overwritten in place. 802If you're not interested in this, then bitwise-OR it with the bitmask. 803 804=head2 coderef for CHECK 805 806As of C<Encode> 2.12, C<CHECK> can also be a code reference which takes the 807ordinal value of the unmapped character as an argument and returns a string 808that represents the fallback character. For instance: 809 810 $ascii = encode("ascii", $utf8, sub{ sprintf "<U+%04X>", shift }); 811 812Acts like C<FB_PERLQQ> but U+I<XXXX> is used instead of C<\x{I<XXXX>}>. 813 814=head1 Defining Encodings 815 816To define a new encoding, use: 817 818 use Encode qw(define_encoding); 819 define_encoding($object, CANONICAL_NAME [, alias...]); 820 821I<CANONICAL_NAME> will be associated with I<$object>. The object 822should provide the interface described in L<Encode::Encoding>. 823If more than two arguments are provided, additional 824arguments are considered aliases for I<$object>. 825 826See L<Encode::Encoding> for details. 827 828=head1 The UTF8 flag 829 830Before the introduction of Unicode support in Perl, the C<eq> operator 831just compared the strings represented by two scalars. Beginning with 832Perl 5.8, C<eq> compares two strings with simultaneous consideration of 833I<the UTF8 flag>. 
To explain why we made it so, I quote from page 402 of 834I<Programming Perl, 3rd ed.> 835 836=over 2 837 838=item Goal #1: 839 840Old byte-oriented programs should not spontaneously break on the old 841byte-oriented data they used to work on. 842 843=item Goal #2: 844 845Old byte-oriented programs should magically start working on the new 846character-oriented data when appropriate. 847 848=item Goal #3: 849 850Programs should run just as fast in the new character-oriented mode 851as in the old byte-oriented mode. 852 853=item Goal #4: 854 855Perl should remain one language, rather than forking into a 856byte-oriented Perl and a character-oriented Perl. 857 858=back 859 860When I<Programming Perl, 3rd ed.> was written, not even Perl 5.6.0 had been 861born yet, many features documented in the book remained unimplemented for a 862long time. Perl 5.8 corrected much of this, and the introduction of the 863UTF8 flag is one of them. You can think of there being two fundamentally 864different kinds of strings and string-operations in Perl: one a 865byte-oriented mode for when the internal UTF8 flag is off, and the other a 866character-oriented mode for when the internal UTF8 flag is on. 867 868Here is how C<Encode> handles the UTF8 flag. 869 870=over 2 871 872=item * 873 874When you I<encode>, the resulting UTF8 flag is always B<off>. 875 876=item * 877 878When you I<decode>, the resulting UTF8 flag is B<on>--I<unless> you can 879unambiguously represent data. Here is what we mean by "unambiguously". 880After C<$utf8 = decode("foo", $octet)>, 881 882 When $octet is... The UTF8 flag in $utf8 is 883 --------------------------------------------- 884 In ASCII only (or EBCDIC only) OFF 885 In ISO-8859-1 ON 886 In any other Encoding ON 887 --------------------------------------------- 888 889As you see, there is one exception: in ASCII. That way you can assume 890Goal #1. 
And with C<Encode>, Goal #2 is assumed but you still have to be 891careful in the cases mentioned in the B<CAVEAT> paragraphs above. 892 893This UTF8 flag is not visible in Perl scripts, exactly for the same reason 894you cannot (or rather, you I<don't have to>) see whether a scalar contains 895a string, an integer, or a floating-point number. But you can still peek 896and poke these if you will. See the next section. 897 898=back 899 900=head2 Messing with Perl's Internals 901 902The following API uses parts of Perl's internals in the current 903implementation. As such, they are efficient but may change in a future 904release. 905 906=head3 is_utf8 907 908 is_utf8(STRING [, CHECK]) 909 910[INTERNAL] Tests whether the UTF8 flag is turned on in the I<STRING>. 911If I<CHECK> is true, also checks whether I<STRING> contains well-formed 912UTF-8. Returns true if successful, false otherwise. 913 914As of Perl 5.8.1, L<utf8> also has the C<utf8::is_utf8> function. 915 916=head3 _utf8_on 917 918 _utf8_on(STRING) 919 920[INTERNAL] Turns the I<STRING>'s internal UTF8 flag B<on>. The I<STRING> 921is I<not> checked for containing only well-formed UTF-8. Do not use this 922unless you I<know with absolute certainty> that the STRING holds only 923well-formed UTF-8. Returns the previous state of the UTF8 flag (so please 924don't treat the return value as indicating success or failure), or C<undef> 925if I<STRING> is not a string. 926 927B<NOTE>: For security reasons, this function does not work on tainted values. 928 929=head3 _utf8_off 930 931 _utf8_off(STRING) 932 933[INTERNAL] Turns the I<STRING>'s internal UTF8 flag B<off>. Do not use 934frivolously. Returns the previous state of the UTF8 flag, or C<undef> if 935I<STRING> is not a string. Do not treat the return value as indicative of 936success or failure, because that isn't what it means: it is only the 937previous setting. 938 939B<NOTE>: For security reasons, this function does not work on tainted values. 
940 941=head1 UTF-8 vs. utf8 vs. UTF8 942 943 ....We now view strings not as sequences of bytes, but as sequences 944 of numbers in the range 0 .. 2**32-1 (or in the case of 64-bit 945 computers, 0 .. 2**64-1) -- Programming Perl, 3rd ed. 946 947That has historically been Perl's notion of UTF-8, as that is how UTF-8 was 948first conceived by Ken Thompson when he invented it. However, thanks to 949later revisions to the applicable standards, official UTF-8 is now rather 950stricter than that. For example, its range is much narrower (0 .. 0x10_FFFF 951to cover only 21 bits instead of 32 or 64 bits) and some sequences 952are not allowed, like those used in surrogate pairs, the 32 non-character 953code points 0xFDD0 .. 0xFDEF, the last two code points in I<any> plane 954(0xI<XX>_FFFE and 0xI<XX>_FFFF), all non-shortest encodings, etc. 955 956The former default in which Perl would always use a loose interpretation of 957UTF-8 has now been overruled: 958 959 From: Larry Wall <larry@wall.org> 960 Date: December 04, 2004 11:51:58 JST 961 To: perl-unicode@perl.org 962 Subject: Re: Make Encode.pm support the real UTF-8 963 Message-Id: <20041204025158.GA28754@wall.org> 964 965 On Fri, Dec 03, 2004 at 10:12:12PM +0000, Tim Bunce wrote: 966 : I've no problem with 'utf8' being perl's unrestricted uft8 encoding, 967 : but "UTF-8" is the name of the standard and should give the 968 : corresponding behaviour. 969 970 For what it's worth, that's how I've always kept them straight in my 971 head. 972 973 Also for what it's worth, Perl 6 will mostly default to strict but 974 make it easy to switch back to lax. 975 976 Larry 977 978Got that? As of Perl 5.8.7, B<"UTF-8"> means UTF-8 in its current 979sense, which is conservative and strict and security-conscious, whereas 980B<"utf8"> means UTF-8 in its former sense, which was liberal and loose and 981lax. C<Encode> version 2.10 or later thus groks this subtle but critically 982important distinction between C<"UTF-8"> and C<"utf8">. 
983 984 encode("utf8", "\x{FFFF_FFFF}", 1); # okay 985 encode("UTF-8", "\x{FFFF_FFFF}", 1); # croaks 986 987In the C<Encode> module, C<"UTF-8"> is actually a canonical name for 988C<"utf-8-strict">. That hyphen between the C<"UTF"> and the C<"8"> is 989critical; without it, C<Encode> goes "liberal" and (perhaps overly-)permissive: 990 991 find_encoding("UTF-8")->name # is 'utf-8-strict' 992 find_encoding("utf-8")->name # ditto. names are case insensitive 993 find_encoding("utf_8")->name # ditto. "_" are treated as "-" 994 find_encoding("UTF8")->name # is 'utf8'. 995 996Perl's internal UTF8 flag is called "UTF8", without a hyphen. It indicates 997whether a string is internally encoded as "utf8", also without a hyphen. 998 999=head1 SEE ALSO 1000 1001L<Encode::Encoding>, 1002L<Encode::Supported>, 1003L<Encode::PerlIO>, 1004L<encoding>, 1005L<perlebcdic>, 1006L<perlfunc/open>, 1007L<perlunicode>, L<perluniintro>, L<perlunifaq>, L<perlunitut> 1008L<utf8>, 1009the Perl Unicode Mailing List L<http://lists.perl.org/list/perl-unicode.html> 1010 1011=head1 MAINTAINER 1012 1013This project was originated by the late Nick Ing-Simmons and later 1014maintained by Dan Kogai I<< <dankogai@cpan.org> >>. See AUTHORS 1015for a full list of people involved. For any questions, send mail to 1016I<< <perl-unicode@perl.org> >> so that we can all share. 1017 1018While Dan Kogai retains the copyright as a maintainer, credit 1019should go to all those involved. See AUTHORS for a list of those 1020who submitted code to the project. 1021 1022=head1 COPYRIGHT 1023 1024Copyright 2002-2012 Dan Kogai I<< <dankogai@cpan.org> >>. 1025 1026This library is free software; you can redistribute it and/or modify 1027it under the same terms as Perl itself. 1028 1029=cut 1030