1# 2# $Id: Encode.pm,v 2.60 2014/04/29 16:26:49 dankogai Exp dankogai $ 3# 4package Encode; 5use strict; 6use warnings; 7our $VERSION = sprintf "%d.%02d", q$Revision: 2.60_01 $ =~ /(\d+)/g; 8use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG}; 9use XSLoader (); 10XSLoader::load( __PACKAGE__, $VERSION ); 11 12use Exporter 5.57 'import'; 13 14# Public, encouraged API is exported by default 15 16our @EXPORT = qw( 17 decode decode_utf8 encode encode_utf8 str2bytes bytes2str 18 encodings find_encoding clone_encoding 19); 20our @FB_FLAGS = qw( 21 DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC 22 PERLQQ HTMLCREF XMLCREF STOP_AT_PARTIAL 23); 24our @FB_CONSTS = qw( 25 FB_DEFAULT FB_CROAK FB_QUIET FB_WARN 26 FB_PERLQQ FB_HTMLCREF FB_XMLCREF 27); 28our @EXPORT_OK = ( 29 qw( 30 _utf8_off _utf8_on define_encoding from_to is_16bit is_8bit 31 is_utf8 perlio_ok resolve_alias utf8_downgrade utf8_upgrade 32 ), 33 @FB_FLAGS, @FB_CONSTS, 34); 35 36our %EXPORT_TAGS = ( 37 all => [ @EXPORT, @EXPORT_OK ], 38 default => [ @EXPORT ], 39 fallbacks => [ @FB_CONSTS ], 40 fallback_all => [ @FB_CONSTS, @FB_FLAGS ], 41); 42 43# Documentation moved after __END__ for speed - NI-S 44 45our $ON_EBCDIC = ( ord("A") == 193 ); 46 47use Encode::Alias; 48 49# Make a %Encoding package variable to allow a certain amount of cheating 50our %Encoding; 51our %ExtModule; 52require Encode::Config; 53# See 54# https://bugzilla.redhat.com/show_bug.cgi?id=435505#c2 55# to find why sig handlers inside eval{} are disabled. 56eval { 57 local $SIG{__DIE__}; 58 local $SIG{__WARN__}; 59 local @INC = @INC; 60 pop @INC if $INC[-1] eq '.'; 61 require Encode::ConfigLocal; 62}; 63 64sub encodings { 65 my %enc; 66 my $arg = $_[1] || ''; 67 if ( $arg eq ":all" ) { 68 %enc = ( %Encoding, %ExtModule ); 69 } 70 else { 71 %enc = %Encoding; 72 for my $mod ( map { m/::/ ? $_ : "Encode::$_" } @_ ) { 73 DEBUG and warn $mod; 74 for my $enc ( keys %ExtModule ) { 75 $ExtModule{$enc} eq $mod and $enc{$enc} = $mod; 76 } 77 } 78 } 79 return sort { lc $a cmp lc $b } 80 grep { !/^(?:Internal|Unicode|Guess)$/o } keys %enc; 81} 82 83sub perlio_ok { 84 my $obj = ref( $_[0] ) ? $_[0] : find_encoding( $_[0] ); 85 $obj->can("perlio_ok") and return $obj->perlio_ok(); 86 return 0; # safety net 87} 88 89sub define_encoding { 90 my $obj = shift; 91 my $name = shift; 92 $Encoding{$name} = $obj; 93 my $lc = lc($name); 94 define_alias( $lc => $obj ) unless $lc eq $name; 95 while (@_) { 96 my $alias = shift; 97 define_alias( $alias, $obj ); 98 } 99 return $obj; 100} 101 102sub getEncoding { 103 my ( $class, $name, $skip_external ) = @_; 104 105 $name =~ s/\s+//g; # https://rt.cpan.org/Ticket/Display.html?id=65796 106 107 ref($name) && $name->can('renew') and return $name; 108 exists $Encoding{$name} and return $Encoding{$name}; 109 my $lc = lc $name; 110 exists $Encoding{$lc} and return $Encoding{$lc}; 111 112 my $oc = $class->find_alias($name); 113 defined($oc) and return $oc; 114 $lc ne $name and $oc = $class->find_alias($lc); 115 defined($oc) and return $oc; 116 117 unless ($skip_external) { 118 if ( my $mod = $ExtModule{$name} || $ExtModule{$lc} ) { 119 $mod =~ s,::,/,g; 120 $mod .= '.pm'; 121 eval { require $mod; }; 122 exists $Encoding{$name} and return $Encoding{$name}; 123 } 124 } 125 return; 126} 127 128sub find_encoding($;$) { 129 my ( $name, $skip_external ) = @_; 130 return __PACKAGE__->getEncoding( $name, $skip_external ); 131} 132 133sub resolve_alias($) { 134 my $obj = find_encoding(shift); 135 defined $obj and return $obj->name; 136 return; 137} 138 139sub clone_encoding($) { 140 my $obj = find_encoding(shift); 141 ref $obj or return; 142 eval { require Storable }; 143 $@ and return; 144 return Storable::dclone($obj); 145} 146 147sub encode($$;$) { 148 my ( $name, $string, $check ) = @_; 149 return undef unless defined $string; 150 $string .= ''; # stringify; 151 $check ||= 0; 152 unless ( defined $name ) { 153 require Carp; 154 Carp::croak("Encoding name should not be undef"); 155 } 156 my $enc = find_encoding($name); 157 unless ( defined $enc ) { 158 require Carp; 159 Carp::croak("Unknown encoding '$name'"); 160 } 161 my $octets = $enc->encode( $string, $check ); 162 $_[1] = $string if $check and !ref $check and !( $check & LEAVE_SRC() ); 163 return $octets; 164} 165*str2bytes = \&encode; 166 167sub decode($$;$) { 168 my ( $name, $octets, $check ) = @_; 169 return undef unless defined $octets; 170 $octets .= ''; 171 $check ||= 0; 172 my $enc = find_encoding($name); 173 unless ( defined $enc ) { 174 require Carp; 175 Carp::croak("Unknown encoding '$name'"); 176 } 177 my $string = $enc->decode( $octets, $check ); 178 $_[1] = $octets if $check and !ref $check and !( $check & LEAVE_SRC() ); 179 return $string; 180} 181*bytes2str = \&decode; 182 183sub from_to($$$;$) { 184 my ( $string, $from, $to, $check ) = @_; 185 return undef unless defined $string; 186 $check ||= 0; 187 my $f = find_encoding($from); 188 unless ( defined $f ) { 189 require Carp; 190 Carp::croak("Unknown encoding '$from'"); 191 } 192 my $t = find_encoding($to); 193 unless ( defined $t ) { 194 require Carp; 195 Carp::croak("Unknown encoding '$to'"); 196 } 197 my $uni = $f->decode($string); 198 $_[0] = $string = $t->encode( $uni, $check ); 199 return undef if ( $check && length($uni) ); 200 return defined( $_[0] ) ? length($string) : undef; 201} 202 203sub encode_utf8($) { 204 my ($str) = @_; 205 utf8::encode($str); 206 return $str; 207} 208 209my $utf8enc; 210 211sub decode_utf8($;$) { 212 my ( $octets, $check ) = @_; 213 return undef unless defined $octets; 214 $octets .= ''; 215 $check ||= 0; 216 $utf8enc ||= find_encoding('utf8'); 217 my $string = $utf8enc->decode( $octets, $check ); 218 $_[0] = $octets if $check and !ref $check and !( $check & LEAVE_SRC() ); 219 return $string; 220} 221 222# sub decode_utf8($;$) { 223# my ( $str, $check ) = @_; 224# return $str if is_utf8($str); 225# if ($check) { 226# return decode( "utf8", $str, $check ); 227# } 228# else { 229# return decode( "utf8", $str ); 230# return $str; 231# } 232# } 233 234predefine_encodings(1); 235 236# 237# This is to restore %Encoding if really needed; 238# 239 240sub predefine_encodings { 241 require Encode::Encoding; 242 no warnings 'redefine'; 243 my $use_xs = shift; 244 if ($ON_EBCDIC) { 245 246 # was in Encode::UTF_EBCDIC 247 package Encode::UTF_EBCDIC; 248 push @Encode::UTF_EBCDIC::ISA, 'Encode::Encoding'; 249 *decode = sub { 250 my ( undef, $str, $chk ) = @_; 251 my $res = ''; 252 for ( my $i = 0 ; $i < length($str) ; $i++ ) { 253 $res .= 254 chr( 255 utf8::unicode_to_native( ord( substr( $str, $i, 1 ) ) ) 256 ); 257 } 258 $_[1] = '' if $chk; 259 return $res; 260 }; 261 *encode = sub { 262 my ( undef, $str, $chk ) = @_; 263 my $res = ''; 264 for ( my $i = 0 ; $i < length($str) ; $i++ ) { 265 $res .= 266 chr( 267 utf8::native_to_unicode( ord( substr( $str, $i, 1 ) ) ) 268 ); 269 } 270 $_[1] = '' if $chk; 271 return $res; 272 }; 273 $Encode::Encoding{Unicode} = 274 bless { Name => "UTF_EBCDIC" } => "Encode::UTF_EBCDIC"; 275 } 276 else { 277 278 package Encode::Internal; 279 push @Encode::Internal::ISA, 'Encode::Encoding'; 280 *decode = sub { 281 my ( undef, $str, $chk ) = @_; 282 utf8::upgrade($str); 283 $_[1] = '' if $chk; 284 return $str; 285 }; 286 *encode = \&decode; 287 $Encode::Encoding{Unicode} = 288 bless { Name => "Internal" } => "Encode::Internal"; 289 } 290 291 { 292 293 # was in Encode::utf8 294 package Encode::utf8; 295 push @Encode::utf8::ISA, 'Encode::Encoding'; 296 297 # 298 if ($use_xs) { 299 Encode::DEBUG and warn __PACKAGE__, " XS on"; 300 *decode = \&decode_xs; 301 *encode = \&encode_xs; 302 } 303 else { 304 Encode::DEBUG and warn __PACKAGE__, " XS off"; 305 *decode = sub { 306 my ( undef, $octets, $chk ) = @_; 307 my $str = Encode::decode_utf8($octets); 308 if ( defined $str ) { 309 $_[1] = '' if $chk; 310 return $str; 311 } 312 return undef; 313 }; 314 *encode = sub { 315 my ( undef, $string, $chk ) = @_; 316 my $octets = Encode::encode_utf8($string); 317 $_[1] = '' if $chk; 318 return $octets; 319 }; 320 } 321 *cat_decode = sub { # ($obj, $dst, $src, $pos, $trm, $chk) 322 # currently ignores $chk 323 my ( undef, undef, undef, $pos, $trm ) = @_; 324 my ( $rdst, $rsrc, $rpos ) = \@_[ 1, 2, 3 ]; 325 use bytes; 326 if ( ( my $npos = index( $$rsrc, $trm, $pos ) ) >= 0 ) { 327 $$rdst .= 328 substr( $$rsrc, $pos, $npos - $pos + length($trm) ); 329 $$rpos = $npos + length($trm); 330 return 1; 331 } 332 $$rdst .= substr( $$rsrc, $pos ); 333 $$rpos = length($$rsrc); 334 return ''; 335 }; 336 $Encode::Encoding{utf8} = 337 bless { Name => "utf8" } => "Encode::utf8"; 338 $Encode::Encoding{"utf-8-strict"} = 339 bless { Name => "utf-8-strict", strict_utf8 => 1 } 340 => "Encode::utf8"; 341 } 342} 343 3441; 345 346__END__ 347 348=head1 NAME 349 350Encode - character encodings in Perl 351 352=head1 SYNOPSIS 353 354 use Encode qw(decode encode); 355 $characters = decode('UTF-8', $octets, Encode::FB_CROAK); 356 $octets = encode('UTF-8', $characters, Encode::FB_CROAK); 357 358=head2 Table of Contents 359 360Encode consists of a collection of modules whose details are too extensive 361to fit in one document. This one itself explains the top-level APIs 362and general topics at a glance. For other topics and more details, 363see the documentation for these modules: 364 365=over 2 366 367=item L<Encode::Alias> - Alias definitions to encodings 368 369=item L<Encode::Encoding> - Encode Implementation Base Class 370 371=item L<Encode::Supported> - List of Supported Encodings 372 373=item L<Encode::CN> - Simplified Chinese Encodings 374 375=item L<Encode::JP> - Japanese Encodings 376 377=item L<Encode::KR> - Korean Encodings 378 379=item L<Encode::TW> - Traditional Chinese Encodings 380 381=back 382 383=head1 DESCRIPTION 384 385The C<Encode> module provides the interface between Perl strings 386and the rest of the system. Perl strings are sequences of 387I<characters>. 388 389The repertoire of characters that Perl can represent is a superset of those 390defined by the Unicode Consortium. On most platforms the ordinal 391values of a character as returned by C<ord(I<S>)> is the I<Unicode 392codepoint> for that character. The exceptions are platforms where 393the legacy encoding is some variant of EBCDIC rather than a superset 394of ASCII; see L<perlebcdic>. 395 396During recent history, data is moved around a computer in 8-bit chunks, 397often called "bytes" but also known as "octets" in standards documents. 398Perl is widely used to manipulate data of many types: not only strings of 399characters representing human or computer languages, but also "binary" 400data, being the machine's representation of numbers, pixels in an image, or 401just about anything. 402 403When Perl is processing "binary data", the programmer wants Perl to 404process "sequences of bytes". This is not a problem for Perl: because a 405byte has 256 possible values, it easily fits in Perl's much larger 406"logical character". 407 408This document mostly explains the I<how>. L<perlunitut> and L<perlunifaq> 409explain the I<why>. 410 411=head2 TERMINOLOGY 412 413=head3 character 414 415A character in the range 0 .. 2**32-1 (or more); 416what Perl's strings are made of. 417 418=head3 byte 419 420A character in the range 0..255; 421a special case of a Perl character. 422 423=head3 octet 424 4258 bits of data, with ordinal values 0..255; 426term for bytes passed to or from a non-Perl context, such as a disk file, 427standard I/O stream, database, command-line argument, environment variable, 428socket etc. 429 430=head1 THE PERL ENCODING API 431 432=head2 Basic methods 433 434=head3 encode 435 436 $octets = encode(ENCODING, STRING[, CHECK]) 437 438Encodes the scalar value I<STRING> from Perl's internal form into 439I<ENCODING> and returns a sequence of octets. I<ENCODING> can be either a 440canonical name or an alias. For encoding names and aliases, see 441L</"Defining Aliases">. For CHECK, see L</"Handling Malformed Data">. 442 443For example, to convert a string from Perl's internal format into 444ISO-8859-1, also known as Latin1: 445 446 $octets = encode("iso-8859-1", $string); 447 448B<CAVEAT>: When you run C<$octets = encode("utf8", $string)>, then 449$octets I<might not be equal to> $string. Though both contain the 450same data, the UTF8 flag for $octets is I<always> off. When you 451encode anything, the UTF8 flag on the result is always off, even when it 452contains a completely valid utf8 string. See L</"The UTF8 flag"> below. 453 454If the $string is C<undef>, then C<undef> is returned. 455 456=head3 decode 457 458 $string = decode(ENCODING, OCTETS[, CHECK]) 459 460This function returns the string that results from decoding the scalar 461value I<OCTETS>, assumed to be a sequence of octets in I<ENCODING>, into 462Perl's internal form. The returns the resulting string. As with encode(), 463I<ENCODING> can be either a canonical name or an alias. For encoding names 464and aliases, see L</"Defining Aliases">; for I<CHECK>, see L</"Handling 465Malformed Data">. 466 467For example, to convert ISO-8859-1 data into a string in Perl's 468internal format: 469 470 $string = decode("iso-8859-1", $octets); 471 472B<CAVEAT>: When you run C<$string = decode("utf8", $octets)>, then $string 473I<might not be equal to> $octets. Though both contain the same data, the 474UTF8 flag for $string is on. See L</"The UTF8 flag"> 475below. 476 477If the $string is C<undef>, then C<undef> is returned. 478 479=head3 find_encoding 480 481 [$obj =] find_encoding(ENCODING) 482 483Returns the I<encoding object> corresponding to I<ENCODING>. Returns 484C<undef> if no matching I<ENCODING> is find. The returned object is 485what does the actual encoding or decoding. 486 487 $utf8 = decode($name, $bytes); 488 489is in fact 490 491 $utf8 = do { 492 $obj = find_encoding($name); 493 croak qq(encoding "$name" not found) unless ref $obj; 494 $obj->decode($bytes); 495 }; 496 497with more error checking. 498 499You can therefore save time by reusing this object as follows; 500 501 my $enc = find_encoding("iso-8859-1"); 502 while(<>) { 503 my $utf8 = $enc->decode($_); 504 ... # now do something with $utf8; 505 } 506 507Besides L</decode> and L</encode>, other methods are 508available as well. For instance, C<name()> returns the canonical 509name of the encoding object. 510 511 find_encoding("latin1")->name; # iso-8859-1 512 513See L<Encode::Encoding> for details. 514 515=head3 from_to 516 517 [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK]) 518 519Converts I<in-place> data between two encodings. The data in $octets 520must be encoded as octets and I<not> as characters in Perl's internal 521format. For example, to convert ISO-8859-1 data into Microsoft's CP1250 522encoding: 523 524 from_to($octets, "iso-8859-1", "cp1250"); 525 526and to convert it back: 527 528 from_to($octets, "cp1250", "iso-8859-1"); 529 530Because the conversion happens in place, the data to be 531converted cannot be a string constant: it must be a scalar variable. 532 533C<from_to()> returns the length of the converted string in octets on success, 534and C<undef> on error. 535 536B<CAVEAT>: The following operations may look the same, but are not: 537 538 from_to($data, "iso-8859-1", "utf8"); #1 539 $data = decode("iso-8859-1", $data); #2 540 541Both #1 and #2 make $data consist of a completely valid UTF-8 string, 542but only #2 turns the UTF8 flag on. #1 is equivalent to: 543 544 $data = encode("utf8", decode("iso-8859-1", $data)); 545 546See L</"The UTF8 flag"> below. 547 548Also note that: 549 550 from_to($octets, $from, $to, $check); 551 552is equivalent t:o 553 554 $octets = encode($to, decode($from, $octets), $check); 555 556Yes, it does I<not> respect the $check during decoding. It is 557deliberately done that way. If you need minute control, use C<decode> 558followed by C<encode> as follows: 559 560 $octets = encode($to, decode($from, $octets, $check_from), $check_to); 561 562=head3 encode_utf8 563 564 $octets = encode_utf8($string); 565 566Equivalent to C<$octets = encode("utf8", $string)>. The characters in 567$string are encoded in Perl's internal format, and the result is returned 568as a sequence of octets. Because all possible characters in Perl have a 569(loose, not strict) UTF-8 representation, this function cannot fail. 570 571=head3 decode_utf8 572 573 $string = decode_utf8($octets [, CHECK]); 574 575Equivalent to C<$string = decode("utf8", $octets [, CHECK])>. 576The sequence of octets represented by $octets is decoded 577from UTF-8 into a sequence of logical characters. 578Because not all sequences of octets are valid UTF-8, 579it is quite possible for this function to fail. 580For CHECK, see L</"Handling Malformed Data">. 581 582=head2 Listing available encodings 583 584 use Encode; 585 @list = Encode->encodings(); 586 587Returns a list of canonical names of available encodings that have already 588been loaded. To get a list of all available encodings including those that 589have not yet been loaded, say: 590 591 @all_encodings = Encode->encodings(":all"); 592 593Or you can give the name of a specific module: 594 595 @with_jp = Encode->encodings("Encode::JP"); 596 597When "C<::>" is not in the name, "C<Encode::>" is assumed. 598 599 @ebcdic = Encode->encodings("EBCDIC"); 600 601To find out in detail which encodings are supported by this package, 602see L<Encode::Supported>. 603 604=head2 Defining Aliases 605 606To add a new alias to a given encoding, use: 607 608 use Encode; 609 use Encode::Alias; 610 define_alias(NEWNAME => ENCODING); 611 612After that, I<NEWNAME> can be used as an alias for I<ENCODING>. 613I<ENCODING> may be either the name of an encoding or an 614I<encoding object>. 615 616Before you do that, first make sure the alias is nonexistent using 617C<resolve_alias()>, which returns the canonical name thereof. 618For example: 619 620 Encode::resolve_alias("latin1") eq "iso-8859-1" # true 621 Encode::resolve_alias("iso-8859-12") # false; nonexistent 622 Encode::resolve_alias($name) eq $name # true if $name is canonical 623 624C<resolve_alias()> does not need C<use Encode::Alias>; it can be 625imported via C<use Encode qw(resolve_alias)>. 626 627See L<Encode::Alias> for details. 628 629=head2 Finding IANA Character Set Registry names 630 631The canonical name of a given encoding does not necessarily agree with 632IANA Character Set Registry, commonly seen as C<< Content-Type: 633text/plain; charset=I<WHATEVER> >>. For most cases, the canonical name 634works, but sometimes it does not, most notably with "utf-8-strict". 635 636As of C<Encode> version 2.21, a new method C<mime_name()> is therefore added. 637 638 use Encode; 639 my $enc = find_encoding("UTF-8"); 640 warn $enc->name; # utf-8-strict 641 warn $enc->mime_name; # UTF-8 642 643See also: L<Encode::Encoding> 644 645=head1 Encoding via PerlIO 646 647If your perl supports C<PerlIO> (which is the default), you can use a 648C<PerlIO> layer to decode and encode directly via a filehandle. The 649following two examples are fully identical in functionality: 650 651 ### Version 1 via PerlIO 652 open(INPUT, "< :encoding(shiftjis)", $infile) 653 || die "Can't open < $infile for reading: $!"; 654 open(OUTPUT, "> :encoding(euc-jp)", $outfile) 655 || die "Can't open > $output for writing: $!"; 656 while (<INPUT>) { # auto decodes $_ 657 print OUTPUT; # auto encodes $_ 658 } 659 close(INPUT) || die "can't close $infile: $!"; 660 close(OUTPUT) || die "can't close $outfile: $!"; 661 662 ### Version 2 via from_to() 663 open(INPUT, "< :raw", $infile) 664 || die "Can't open < $infile for reading: $!"; 665 open(OUTPUT, "> :raw", $outfile) 666 || die "Can't open > $output for writing: $!"; 667 668 while (<INPUT>) { 669 from_to($_, "shiftjis", "euc-jp", 1); # switch encoding 670 print OUTPUT; # emit raw (but properly encoded) data 671 } 672 close(INPUT) || die "can't close $infile: $!"; 673 close(OUTPUT) || die "can't close $outfile: $!"; 674 675In the first version above, you let the appropriate encoding layer 676handle the conversion. In the second, you explicitly translate 677from one encoding to the other. 678 679Unfortunately, it may be that encodings are C<PerlIO>-savvy. You can check 680to see whether your encoding is supported by C<PerlIO> by invoking the 681C<perlio_ok> method on it: 682 683 Encode::perlio_ok("hz"); # false 684 find_encoding("euc-cn")->perlio_ok; # true wherever PerlIO is available 685 686 use Encode qw(perlio_ok); # imported upon request 687 perlio_ok("euc-jp") 688 689Fortunately, all encodings that come with C<Encode> core are C<PerlIO>-savvy 690except for C<hz> and C<ISO-2022-kr>. For the gory details, see 691L<Encode::Encoding> and L<Encode::PerlIO>. 692 693=head1 Handling Malformed Data 694 695The optional I<CHECK> argument tells C<Encode> what to do when 696encountering malformed data. Without I<CHECK>, C<Encode::FB_DEFAULT> 697(== 0) is assumed. 698 699As of version 2.12, C<Encode> supports coderef values for C<CHECK>; 700see below. 701 702B<NOTE:> Not all encodings support this feature. 703Some encodings ignore the I<CHECK> argument. For example, 704L<Encode::Unicode> ignores I<CHECK> and it always croaks on error. 705 706=head2 List of I<CHECK> values 707 708=head3 FB_DEFAULT 709 710 I<CHECK> = Encode::FB_DEFAULT ( == 0) 711 712If I<CHECK> is 0, encoding and decoding replace any malformed character 713with a I<substitution character>. When you encode, I<SUBCHAR> is used. 714When you decode, the Unicode REPLACEMENT CHARACTER, code point U+FFFD, is 715used. If the data is supposed to be UTF-8, an optional lexical warning of 716warning category C<"utf8"> is given. 717 718=head3 FB_CROAK 719 720 I<CHECK> = Encode::FB_CROAK ( == 1) 721 722If I<CHECK> is 1, methods immediately die with an error 723message. Therefore, when I<CHECK> is 1, you should trap 724exceptions with C<eval{}>, unless you really want to let it C<die>. 725 726=head3 FB_QUIET 727 728 I<CHECK> = Encode::FB_QUIET 729 730If I<CHECK> is set to C<Encode::FB_QUIET>, encoding and decoding immediately 731return the portion of the data that has been processed so far when an 732error occurs. The data argument is overwritten with everything 733after that point; that is, the unprocessed portion of the data. This is 734handy when you have to call C<decode> repeatedly in the case where your 735source data may contain partial multi-byte character sequences, 736(that is, you are reading with a fixed-width buffer). Here's some sample 737code to do exactly that: 738 739 my($buffer, $string) = ("", ""); 740 while (read($fh, $buffer, 256, length($buffer))) { 741 $string .= decode($encoding, $buffer, Encode::FB_QUIET); 742 # $buffer now contains the unprocessed partial character 743 } 744 745=head3 FB_WARN 746 747 I<CHECK> = Encode::FB_WARN 748 749This is the same as C<FB_QUIET> above, except that instead of being silent 750on errors, it issues a warning. This is handy for when you are debugging. 751 752=head3 FB_PERLQQ FB_HTMLCREF FB_XMLCREF 753 754=over 2 755 756=item perlqq mode (I<CHECK> = Encode::FB_PERLQQ) 757 758=item HTML charref mode (I<CHECK> = Encode::FB_HTMLCREF) 759 760=item XML charref mode (I<CHECK> = Encode::FB_XMLCREF) 761 762=back 763 764For encodings that are implemented by the C<Encode::XS> module, C<CHECK> C<==> 765C<Encode::FB_PERLQQ> puts C<encode> and C<decode> into C<perlqq> fallback mode. 766 767When you decode, C<\xI<HH>> is inserted for a malformed character, where 768I<HH> is the hex representation of the octet that could not be decoded to 769utf8. When you encode, C<\x{I<HHHH>}> will be inserted, where I<HHHH> is 770the Unicode code point (in any number of hex digits) of the character that 771cannot be found in the character repertoire of the encoding. 772 773The HTML/XML character reference modes are about the same. In place of 774C<\x{I<HHHH>}>, HTML uses C<&#I<NNN>;> where I<NNN> is a decimal number, and 775XML uses C<&#xI<HHHH>;> where I<HHHH> is the hexadecimal number. 776 777In C<Encode> 2.10 or later, C<LEAVE_SRC> is also implied. 778 779=head3 The bitmask 780 781These modes are all actually set via a bitmask. Here is how the C<FB_I<XXX>> 782constants are laid out. You can import the C<FB_I<XXX>> constants via 783C<use Encode qw(:fallbacks)>, and you can import the generic bitmask 784constants via C<use Encode qw(:fallback_all)>. 785 786 FB_DEFAULT FB_CROAK FB_QUIET FB_WARN FB_PERLQQ 787 DIE_ON_ERR 0x0001 X 788 WARN_ON_ERR 0x0002 X 789 RETURN_ON_ERR 0x0004 X X 790 LEAVE_SRC 0x0008 X 791 PERLQQ 0x0100 X 792 HTMLCREF 0x0200 793 XMLCREF 0x0400 794 795=head3 LEAVE_SRC 796 797 Encode::LEAVE_SRC 798 799If the C<Encode::LEAVE_SRC> bit is I<not> set but I<CHECK> is set, then the 800source string to encode() or decode() will be overwritten in place. 801If you're not interested in this, then bitwise-OR it with the bitmask. 802 803=head2 coderef for CHECK 804 805As of C<Encode> 2.12, C<CHECK> can also be a code reference which takes the 806ordinal value of the unmapped character as an argument and returns 807octets that represent the fallback character. For instance: 808 809 $ascii = encode("ascii", $utf8, sub{ sprintf "<U+%04X>", shift }); 810 811Acts like C<FB_PERLQQ> but U+I<XXXX> is used instead of C<\x{I<XXXX>}>. 812 813Even the fallback for C<decode> must return octets, which are 814then decoded with the character encoding that C<decode> accepts. So for 815example if you wish to decode octests as UTF-8, and use ISO-8859-15 as 816a fallback for bytes that are not valid UTF-8, you could write 817 818 $str = decode 'UTF-8', $octets, sub { 819 my $tmp = chr shift; 820 from_to $tmp, 'ISO-8859-15', 'UTF-8'; 821 return $tmp; 822 }; 823 824=head1 Defining Encodings 825 826To define a new encoding, use: 827 828 use Encode qw(define_encoding); 829 define_encoding($object, CANONICAL_NAME [, alias...]); 830 831I<CANONICAL_NAME> will be associated with I<$object>. The object 832should provide the interface described in L<Encode::Encoding>. 833If more than two arguments are provided, additional 834arguments are considered aliases for I<$object>. 835 836See L<Encode::Encoding> for details. 837 838=head1 The UTF8 flag 839 840Before the introduction of Unicode support in Perl, The C<eq> operator 841just compared the strings represented by two scalars. Beginning with 842Perl 5.8, C<eq> compares two strings with simultaneous consideration of 843I<the UTF8 flag>. To explain why we made it so, I quote from page 402 of 844I<Programming Perl, 3rd ed.> 845 846=over 2 847 848=item Goal #1: 849 850Old byte-oriented programs should not spontaneously break on the old 851byte-oriented data they used to work on. 852 853=item Goal #2: 854 855Old byte-oriented programs should magically start working on the new 856character-oriented data when appropriate. 857 858=item Goal #3: 859 860Programs should run just as fast in the new character-oriented mode 861as in the old byte-oriented mode. 862 863=item Goal #4: 864 865Perl should remain one language, rather than forking into a 866byte-oriented Perl and a character-oriented Perl. 867 868=back 869 870When I<Programming Perl, 3rd ed.> was written, not even Perl 5.6.0 had been 871born yet, many features documented in the book remained unimplemented for a 872long time. Perl 5.8 corrected much of this, and the introduction of the 873UTF8 flag is one of them. You can think of there being two fundamentally 874different kinds of strings and string-operations in Perl: one a 875byte-oriented mode for when the internal UTF8 flag is off, and the other a 876character-oriented mode for when the internal UTF8 flag is on. 877 878Here is how C<Encode> handles the UTF8 flag. 879 880=over 2 881 882=item * 883 884When you I<encode>, the resulting UTF8 flag is always B<off>. 885 886=item * 887 888When you I<decode>, the resulting UTF8 flag is B<on>--I<unless> you can 889unambiguously represent data. Here is what we mean by "unambiguously". 890After C<$utf8 = decode("foo", $octet)>, 891 892 When $octet is... The UTF8 flag in $utf8 is 893 --------------------------------------------- 894 In ASCII only (or EBCDIC only) OFF 895 In ISO-8859-1 ON 896 In any other Encoding ON 897 --------------------------------------------- 898 899As you see, there is one exception: in ASCII. That way you can assume 900Goal #1. And with C<Encode>, Goal #2 is assumed but you still have to be 901careful in the cases mentioned in the B<CAVEAT> paragraphs above. 902 903This UTF8 flag is not visible in Perl scripts, exactly for the same reason 904you cannot (or rather, you I<don't have to>) see whether a scalar contains 905a string, an integer, or a floating-point number. But you can still peek 906and poke these if you will. See the next section. 907 908=back 909 910=head2 Messing with Perl's Internals 911 912The following API uses parts of Perl's internals in the current 913implementation. As such, they are efficient but may change in a future 914release. 915 916=head3 is_utf8 917 918 is_utf8(STRING [, CHECK]) 919 920[INTERNAL] Tests whether the UTF8 flag is turned on in the I<STRING>. 921If I<CHECK> is true, also checks whether I<STRING> contains well-formed 922UTF-8. Returns true if successful, false otherwise. 923 924As of Perl 5.8.1, L<utf8> also has the C<utf8::is_utf8> function. 925 926=head3 _utf8_on 927 928 _utf8_on(STRING) 929 930[INTERNAL] Turns the I<STRING>'s internal UTF8 flag B<on>. The I<STRING> 931is I<not> checked for containing only well-formed UTF-8. Do not use this 932unless you I<know with absolute certainty> that the STRING holds only 933well-formed UTF-8. Returns the previous state of the UTF8 flag (so please 934don't treat the return value as indicating success or failure), or C<undef> 935if I<STRING> is not a string. 936 937B<NOTE>: For security reasons, this function does not work on tainted values. 938 939=head3 _utf8_off 940 941 _utf8_off(STRING) 942 943[INTERNAL] Turns the I<STRING>'s internal UTF8 flag B<off>. Do not use 944frivolously. Returns the previous state of the UTF8 flag, or C<undef> if 945I<STRING> is not a string. Do not treat the return value as indicative of 946success or failure, because that isn't what it means: it is only the 947previous setting. 948 949B<NOTE>: For security reasons, this function does not work on tainted values. 950 951=head1 UTF-8 vs. utf8 vs. UTF8 952 953 ....We now view strings not as sequences of bytes, but as sequences 954 of numbers in the range 0 .. 2**32-1 (or in the case of 64-bit 955 computers, 0 .. 2**64-1) -- Programming Perl, 3rd ed. 956 957That has historically been Perl's notion of UTF-8, as that is how UTF-8 was 958first conceived by Ken Thompson when he invented it. However, thanks to 959later revisions to the applicable standards, official UTF-8 is now rather 960stricter than that. For example, its range is much narrower (0 .. 0x10_FFFF 961to cover only 21 bits instead of 32 or 64 bits) and some sequences 962are not allowed, like those used in surrogate pairs, the 31 non-character 963code points 0xFDD0 .. 0xFDEF, the last two code points in I<any> plane 964(0xI<XX>_FFFE and 0xI<XX>_FFFF), all non-shortest encodings, etc. 965 966The former default in which Perl would always use a loose interpretation of 967UTF-8 has now been overruled: 968 969 From: Larry Wall <larry@wall.org> 970 Date: December 04, 2004 11:51:58 JST 971 To: perl-unicode@perl.org 972 Subject: Re: Make Encode.pm support the real UTF-8 973 Message-Id: <20041204025158.GA28754@wall.org> 974 975 On Fri, Dec 03, 2004 at 10:12:12PM +0000, Tim Bunce wrote: 976 : I've no problem with 'utf8' being perl's unrestricted uft8 encoding, 977 : but "UTF-8" is the name of the standard and should give the 978 : corresponding behaviour. 979 980 For what it's worth, that's how I've always kept them straight in my 981 head. 982 983 Also for what it's worth, Perl 6 will mostly default to strict but 984 make it easy to switch back to lax. 985 986 Larry 987 988Got that? As of Perl 5.8.7, B<"UTF-8"> means UTF-8 in its current 989sense, which is conservative and strict and security-conscious, whereas 990B<"utf8"> means UTF-8 in its former sense, which was liberal and loose and 991lax. C<Encode> version 2.10 or later thus groks this subtle but critically 992important distinction between C<"UTF-8"> and C<"utf8">. 993 994 encode("utf8", "\x{FFFF_FFFF}", 1); # okay 995 encode("UTF-8", "\x{FFFF_FFFF}", 1); # croaks 996 997In the C<Encode> module, C<"UTF-8"> is actually a canonical name for 998C<"utf-8-strict">. That hyphen between the C<"UTF"> and the C<"8"> is 999critical; without it, C<Encode> goes "liberal" and (perhaps overly-)permissive: 1000 1001 find_encoding("UTF-8")->name # is 'utf-8-strict' 1002 find_encoding("utf-8")->name # ditto. names are case insensitive 1003 find_encoding("utf_8")->name # ditto. "_" are treated as "-" 1004 find_encoding("UTF8")->name # is 'utf8'. 1005 1006Perl's internal UTF8 flag is called "UTF8", without a hyphen. It indicates 1007whether a string is internally encoded as "utf8", also without a hyphen. 1008 1009=head1 SEE ALSO 1010 1011L<Encode::Encoding>, 1012L<Encode::Supported>, 1013L<Encode::PerlIO>, 1014L<encoding>, 1015L<perlebcdic>, 1016L<perlfunc/open>, 1017L<perlunicode>, L<perluniintro>, L<perlunifaq>, L<perlunitut> 1018L<utf8>, 1019the Perl Unicode Mailing List L<http://lists.perl.org/list/perl-unicode.html> 1020 1021=head1 MAINTAINER 1022 1023This project was originated by the late Nick Ing-Simmons and later 1024maintained by Dan Kogai I<< <dankogai@cpan.org> >>. See AUTHORS 1025for a full list of people involved. For any questions, send mail to 1026I<< <perl-unicode@perl.org> >> so that we can all share. 1027 1028While Dan Kogai retains the copyright as a maintainer, credit 1029should go to all those involved. See AUTHORS for a list of those 1030who submitted code to the project. 1031 1032=head1 COPYRIGHT 1033 1034Copyright 2002-2013 Dan Kogai I<< <dankogai@cpan.org> >>. 1035 1036This library is free software; you can redistribute it and/or modify 1037it under the same terms as Perl itself. 1038 1039=cut 1040