#
# $Id: Encode.pm,v 3.06 2020/05/02 02:31:14 dankogai Exp $
#
package Encode;
use strict;
use warnings;

# Diagnostics are compiled in only when PERL_ENCODE_DEBUG is set to a true
# value in the environment.
use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG};
our $VERSION;
BEGIN {
    $VERSION = "3.06_01";
    # Numify the developer-release string ("3.06_01" evaluates to 3.0601).
    $VERSION = eval $VERSION;
    require XSLoader;
    XSLoader::load( __PACKAGE__, $VERSION );
}

use Exporter 5.57 'import';

use Carp ();
our @CARP_NOT = qw(Encode::Encoder);

# Public, encouraged API is exported by default

our @EXPORT = qw(
  decode  decode_utf8  encode  encode_utf8 str2bytes bytes2str
  encodings  find_encoding find_mime_encoding clone_encoding
);
our @FB_FLAGS = qw(
  DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC
  PERLQQ HTMLCREF XMLCREF STOP_AT_PARTIAL
);
our @FB_CONSTS = qw(
  FB_DEFAULT FB_CROAK FB_QUIET FB_WARN
  FB_PERLQQ FB_HTMLCREF FB_XMLCREF
);
our @EXPORT_OK = (
    qw(
      _utf8_off _utf8_on define_encoding from_to is_16bit is_8bit
      is_utf8 perlio_ok resolve_alias utf8_downgrade utf8_upgrade
      ),
    @FB_FLAGS, @FB_CONSTS,
);

our %EXPORT_TAGS = (
    all          => [ @EXPORT,    @EXPORT_OK ],
    default      => [ @EXPORT ],
    fallbacks    => [ @FB_CONSTS ],
    fallback_all => [ @FB_CONSTS, @FB_FLAGS ],
);

# Documentation moved after __END__ for speed - NI-S

our $ON_EBCDIC = ( ord("A") == 193 );

use Encode::Alias ();
use Encode::MIME::Name;

use Storable;

# Make a %Encoding package variable to allow a certain amount of cheating
our %Encoding;     # canonical name => encoding object (already defined)
our %ExtModule;    # canonical name => module that defines it on demand
require Encode::Config;
#  See
#  https://bugzilla.redhat.com/show_bug.cgi?id=435505#c2
#  to find why sig handlers inside eval{} are disabled.
#
# Encode::ConfigLocal is optional: a failure to load it is deliberately
# ignored.  A trailing "." is dropped from the localized @INC so that the
# file is never picked up from the current directory.
eval {
    local $SIG{__DIE__};
    local $SIG{__WARN__};
    local @INC = @INC;
    pop @INC if @INC && $INC[-1] eq '.';
    require Encode::ConfigLocal;
};

# encodings( CLASS [, ":all" | MODULE ...] )
#
# Returns the canonical encoding names, sorted case-insensitively, with
# the names "Internal", "Unicode" and "Guess" filtered out.
#   - With ":all" as the first argument after the class: every name in
#     %Encoding plus every name in %ExtModule.
#   - Otherwise: every name in %Encoding, plus the %ExtModule names whose
#     module matches one of the arguments ("Encode::" is prepended to an
#     argument that contains no "::").
sub encodings {
    my %enc;
    my $arg = $_[1] || '';
    if ( $arg eq ":all" ) {
        %enc = ( %Encoding, %ExtModule );
    }
    else {
        %enc = %Encoding;
        for my $mod ( map { m/::/ ? $_ : "Encode::$_" } @_ ) {
            DEBUG and warn $mod;
            for my $enc ( keys %ExtModule ) {
                $ExtModule{$enc} eq $mod and $enc{$enc} = $mod;
            }
        }
    }
    return sort { lc $a cmp lc $b }
      grep { !/^(?:Internal|Unicode|Guess)$/o } keys %enc;
}

# perlio_ok( ENCODING )
#
# ENCODING is either an encoding object or a name that is resolved with
# find_encoding().  Returns whatever the object's own perlio_ok() method
# returns, or 0 when the encoding is unknown or the object has no such
# method.
sub perlio_ok {
    my $obj = ref( $_[0] ) ? $_[0] : find_encoding( $_[0] );

    # find_encoding() returns undef for an unknown name; answer "not ok"
    # instead of dying on a method call on an undefined value.
    defined $obj or return 0;
    $obj->can("perlio_ok") and return $obj->perlio_ok();
    return 0;    # safety net
}

# define_encoding( OBJ, NAME [, ALIAS ...] )
#
# Registers OBJ in %Encoding under NAME, aliases the lowercased NAME to it
# when that differs, and defines every further argument as an alias.
# The class of OBJ is added to the @CARP_NOT lists of Encode and
# Encode::Encoding (once).  Returns OBJ.
sub define_encoding {
    my $obj  = shift;
    my $name = shift;
    $Encoding{$name} = $obj;
    my $lc = lc($name);
    define_alias( $lc => $obj ) unless $lc eq $name;
    while (@_) {
        my $alias = shift;
        define_alias( $alias, $obj );
    }
    my $class = ref($obj);
    push @Encode::CARP_NOT, $class
      unless grep { $_ eq $class } @Encode::CARP_NOT;
    push @Encode::Encoding::CARP_NOT, $class
      unless grep { $_ eq $class } @Encode::Encoding::CARP_NOT;
    return $obj;
}

# CLASS->getEncoding( NAME [, SKIP_EXTERNAL] )
#
# Resolves NAME to an encoding object.  Lookup order:
#   1. NAME itself, if it is a reference whose object can('renew');
#   2. %Encoding under NAME, then under lc(NAME);
#   3. CLASS->find_alias() for NAME, then for lc(NAME);
#   4. unless SKIP_EXTERNAL is true, the module listed in %ExtModule for
#      NAME or lc(NAME) is loaded and %Encoding is consulted again.
# Returns nothing (undef / empty list) when NAME is undefined or cannot be
# resolved.
sub getEncoding {
    my ( $class, $name, $skip_external ) = @_;

    defined($name) or return;

    $name =~ s/\s+//g;    # https://rt.cpan.org/Ticket/Display.html?id=65796

    ref($name) && $name->can('renew') and return $name;
    exists $Encoding{$name} and return $Encoding{$name};
    my $lc = lc $name;
    exists $Encoding{$lc} and return $Encoding{$lc};

    my $oc = $class->find_alias($name);
    defined($oc) and return $oc;
    $lc ne $name and $oc = $class->find_alias($lc);
    defined($oc) and return $oc;

    unless ($skip_external) {
        if ( my $mod = $ExtModule{$name} || $ExtModule{$lc} ) {
            $mod =~ s,::,/,g;
            $mod .= '.pm';

            # Best effort: a module that fails to load simply leaves the
            # encoding undefined, which is reported by returning nothing.
            eval { require $mod; };

            # The module may have been selected through either spelling,
            # so look the result up under both of them.
            for my $key ( $name, $lc ) {
                exists $Encoding{$key} and return $Encoding{$key};
            }
        }
    }
    return;
}

# HACK: These two functions must be defined in Encode and because of
# cyclic dependency between Encode and Encode::Alias, Exporter does not work
sub find_alias {
    goto &Encode::Alias::find_alias;
}
sub define_alias {
    goto &Encode::Alias::define_alias;
}

# find_encoding( NAME [, SKIP_EXTERNAL] )
#
# Functional front end to Encode->getEncoding(); same return value.
sub find_encoding($;$) {
    my ( $name, $skip_external ) = @_;
    return __PACKAGE__->getEncoding( $name, $skip_external );
}

# find_mime_encoding( MIME_NAME [, SKIP_EXTERNAL] )
#
# Maps MIME_NAME to an encoding name with
# Encode::MIME::Name::get_encode_name() and resolves the result with
# find_encoding().
sub find_mime_encoding($;$) {
    my ( $mime_name, $skip_external ) = @_;
    my $name = Encode::MIME::Name::get_encode_name( $mime_name );
    return find_encoding( $name, $skip_external );
}

# resolve_alias( NAME )
#
# Returns the name() of the encoding NAME resolves to, or nothing when it
# does not resolve.
sub resolve_alias($) {
    my $obj = find_encoding(shift);
    defined $obj and return $obj->name;
    return;
}

# clone_encoding( NAME )
#
# Returns a deep copy (Storable::dclone) of the encoding object NAME
# resolves to, or nothing when it does not resolve to a reference.
sub clone_encoding($) {
    my $obj = find_encoding(shift);
    ref $obj or return;
    return Storable::dclone($obj);
}

onBOOT;

# Register the "Unicode" pseudo-encoding.  Only the bless and
# define_encoding() statements are conditional at run time; the packages
# and subs of both branches are compiled regardless.
if ($ON_EBCDIC) {
    package Encode::UTF_EBCDIC;
    use parent 'Encode::Encoding';
    my $obj = bless { Name => "UTF_EBCDIC" } => "Encode::UTF_EBCDIC";
    Encode::define_encoding($obj, 'Unicode');

    # Maps every character of the input through utf8::unicode_to_native().
    # The caller's argument is emptied when CHECK is true.
    sub decode {
        my ( undef, $str, $chk ) = @_;
        my $res = '';
        for ( my $i = 0 ; $i < length($str) ; $i++ ) {
            $res .=
              chr(
                utf8::unicode_to_native( ord( substr( $str, $i, 1 ) ) )
              );
        }
        $_[1] = '' if $chk;
        return $res;
    }

    # Maps every character of the input through utf8::native_to_unicode().
    # The caller's argument is emptied when CHECK is true.
    sub encode {
        my ( undef, $str, $chk ) = @_;
        my $res = '';
        for ( my $i = 0 ; $i < length($str) ; $i++ ) {
            $res .=
              chr(
                utf8::native_to_unicode( ord( substr( $str, $i, 1 ) ) )
              );
        }
        $_[1] = '' if $chk;
        return $res;
    }
} else {
    package Encode::Internal;
    use parent 'Encode::Encoding';
    my $obj = bless { Name => "Internal" } => "Encode::Internal";
    Encode::define_encoding($obj, 'Unicode');

    # Returns an upgraded copy of the input; the caller's argument is
    # emptied when CHECK is true.  encode() is the very same sub.
    sub decode {
        my ( undef, $str, $chk ) = @_;
        utf8::upgrade($str);
        $_[1] = '' if $chk;
        return $str;
    }
    *encode = \&decode;
}

{
    # https://rt.cpan.org/Public/Bug/Display.html?id=103253
    package Encode::XS;
    use parent 'Encode::Encoding';
}

{
    package Encode::utf8;
    use parent 'Encode::Encoding';
    my %obj = (
        'utf8'         => { Name => 'utf8' },
        'utf-8-strict' => { Name => 'utf-8-strict', strict_utf8 => 1 }
    );
    for ( keys %obj ) {
        bless $obj{$_} => __PACKAGE__;
        Encode::define_encoding( $obj{$_} => $_ );
    }

    # Appends to $dst the bytes of $src from $pos up to and including the
    # first occurrence of $trm and moves $pos just past it; returns 1.
    # When $trm is not found, appends the remainder of $src, sets $pos to
    # the byte length of $src and returns ''.  $dst and $pos are modified
    # through the caller's own variables.
    sub cat_decode {
        # ($obj, $dst, $src, $pos, $trm, $chk)
        # currently ignores $chk
        my ( undef, undef, undef, $pos, $trm ) = @_;
        my ( $rdst, $rsrc, $rpos ) = \@_[ 1, 2, 3 ];
        use bytes;
        if ( ( my $npos = index( $$rsrc, $trm, $pos ) ) >= 0 ) {
            $$rdst .=
              substr( $$rsrc, $pos, $npos - $pos + length($trm) );
            $$rpos = $npos + length($trm);
            return 1;
        }
        $$rdst .= substr( $$rsrc, $pos );
        $$rpos = length($$rsrc);
        return '';
    }
}

1;

__END__

=head1 NAME

Encode - character encodings in Perl

=head1 SYNOPSIS

    use Encode qw(decode encode);
    $characters = decode('UTF-8', $octets,     Encode::FB_CROAK);
    $octets     = encode('UTF-8', $characters, Encode::FB_CROAK);

=head2 Table of Contents

Encode consists of a collection of modules whose details are too extensive
to fit in one document.  This one itself explains the top-level APIs
and general topics at a glance.  For other topics and more details,
see the documentation for these modules:

=over 2

=item L<Encode::Alias> - Alias definitions to encodings

=item L<Encode::Encoding> - Encode Implementation Base Class

=item L<Encode::Supported> - List of Supported Encodings

=item L<Encode::CN> - Simplified Chinese Encodings

=item L<Encode::JP> - Japanese Encodings

=item L<Encode::KR> - Korean Encodings

=item L<Encode::TW> - Traditional Chinese Encodings

=back

=head1 DESCRIPTION

The C<Encode> module provides the interface between Perl strings
and the rest of the system.  Perl strings are sequences of
I<characters>.

The repertoire of characters that Perl can represent is a superset of those
defined by the Unicode Consortium.
On most platforms the ordinal 302values of a character as returned by C<ord(I<S>)> is the I<Unicode 303codepoint> for that character. The exceptions are platforms where 304the legacy encoding is some variant of EBCDIC rather than a superset 305of ASCII; see L<perlebcdic>. 306 307During recent history, data is moved around a computer in 8-bit chunks, 308often called "bytes" but also known as "octets" in standards documents. 309Perl is widely used to manipulate data of many types: not only strings of 310characters representing human or computer languages, but also "binary" 311data, being the machine's representation of numbers, pixels in an image, or 312just about anything. 313 314When Perl is processing "binary data", the programmer wants Perl to 315process "sequences of bytes". This is not a problem for Perl: because a 316byte has 256 possible values, it easily fits in Perl's much larger 317"logical character". 318 319This document mostly explains the I<how>. L<perlunitut> and L<perlunifaq> 320explain the I<why>. 321 322=head2 TERMINOLOGY 323 324=head3 character 325 326A character in the range 0 .. 2**32-1 (or more); 327what Perl's strings are made of. 328 329=head3 byte 330 331A character in the range 0..255; 332a special case of a Perl character. 333 334=head3 octet 335 3368 bits of data, with ordinal values 0..255; 337term for bytes passed to or from a non-Perl context, such as a disk file, 338standard I/O stream, database, command-line argument, environment variable, 339socket etc. 340 341=head1 THE PERL ENCODING API 342 343=head2 Basic methods 344 345=head3 encode 346 347 $octets = encode(ENCODING, STRING[, CHECK]) 348 349Encodes the scalar value I<STRING> from Perl's internal form into 350I<ENCODING> and returns a sequence of octets. I<ENCODING> can be either a 351canonical name or an alias. For encoding names and aliases, see 352L</"Defining Aliases">. For CHECK, see L</"Handling Malformed Data">. 
353 354B<CAVEAT>: the input scalar I<STRING> might be modified in-place depending 355on what is set in CHECK. See L</LEAVE_SRC> if you want your inputs to be 356left unchanged. 357 358For example, to convert a string from Perl's internal format into 359ISO-8859-1, also known as Latin1: 360 361 $octets = encode("iso-8859-1", $string); 362 363B<CAVEAT>: When you run C<$octets = encode("UTF-8", $string)>, then 364$octets I<might not be equal to> $string. Though both contain the 365same data, the UTF8 flag for $octets is I<always> off. When you 366encode anything, the UTF8 flag on the result is always off, even when it 367contains a completely valid UTF-8 string. See L</"The UTF8 flag"> below. 368 369If the $string is C<undef>, then C<undef> is returned. 370 371C<str2bytes> may be used as an alias for C<encode>. 372 373=head3 decode 374 375 $string = decode(ENCODING, OCTETS[, CHECK]) 376 377This function returns the string that results from decoding the scalar 378value I<OCTETS>, assumed to be a sequence of octets in I<ENCODING>, into 379Perl's internal form. As with encode(), 380I<ENCODING> can be either a canonical name or an alias. For encoding names 381and aliases, see L</"Defining Aliases">; for I<CHECK>, see L</"Handling 382Malformed Data">. 383 384B<CAVEAT>: the input scalar I<OCTETS> might be modified in-place depending 385on what is set in CHECK. See L</LEAVE_SRC> if you want your inputs to be 386left unchanged. 387 388For example, to convert ISO-8859-1 data into a string in Perl's 389internal format: 390 391 $string = decode("iso-8859-1", $octets); 392 393B<CAVEAT>: When you run C<$string = decode("UTF-8", $octets)>, then $string 394I<might not be equal to> $octets. Though both contain the same data, the 395UTF8 flag for $string is on. See L</"The UTF8 flag"> 396below. 397 398If the $string is C<undef>, then C<undef> is returned. 399 400C<bytes2str> may be used as an alias for C<decode>. 

=head3 find_encoding

  [$obj =] find_encoding(ENCODING)

Returns the I<encoding object> corresponding to I<ENCODING>.  Returns
C<undef> if no matching I<ENCODING> is found.  The returned object is
what does the actual encoding or decoding.

  $string = decode($name, $bytes);

is in fact

    $string = do {
        $obj = find_encoding($name);
        croak qq(encoding "$name" not found) unless ref $obj;
        $obj->decode($bytes);
    };

with more error checking.

You can therefore save time by reusing this object as follows:

    my $enc = find_encoding("iso-8859-1");
    while(<>) {
        my $string = $enc->decode($_);
        ... # now do something with $string;
    }

Besides L</decode> and L</encode>, other methods are
available as well.  For instance, C<name()> returns the canonical
name of the encoding object.

  find_encoding("latin1")->name; # iso-8859-1

See L<Encode::Encoding> for details.

=head3 find_mime_encoding

  [$obj =] find_mime_encoding(MIME_ENCODING)

Returns the I<encoding object> corresponding to I<MIME_ENCODING>.  Acts
the same as C<find_encoding()>, except that the C<mime_name()> of the
returned object must match I<MIME_ENCODING>.  So, in contrast to
C<find_encoding()>, canonical names and aliases are not used when
searching for the object.

    find_mime_encoding("utf8");         # returns undef because "utf8" is not a valid MIME_ENCODING
    find_mime_encoding("utf-8");        # returns encode object "utf-8-strict"
    find_mime_encoding("UTF-8");        # same as "utf-8" because MIME_ENCODING is case insensitive
    find_mime_encoding("utf-8-strict"); # returns undef because "utf-8-strict" is not a valid MIME_ENCODING

=head3 from_to

  [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])

Converts I<in-place> data between two encodings. The data in $octets
must be encoded as octets and I<not> as characters in Perl's internal
format.
For example, to convert ISO-8859-1 data into Microsoft's CP1250 459encoding: 460 461 from_to($octets, "iso-8859-1", "cp1250"); 462 463and to convert it back: 464 465 from_to($octets, "cp1250", "iso-8859-1"); 466 467Because the conversion happens in place, the data to be 468converted cannot be a string constant: it must be a scalar variable. 469 470C<from_to()> returns the length of the converted string in octets on success, 471and C<undef> on error. 472 473B<CAVEAT>: The following operations may look the same, but are not: 474 475 from_to($data, "iso-8859-1", "UTF-8"); #1 476 $data = decode("iso-8859-1", $data); #2 477 478Both #1 and #2 make $data consist of a completely valid UTF-8 string, 479but only #2 turns the UTF8 flag on. #1 is equivalent to: 480 481 $data = encode("UTF-8", decode("iso-8859-1", $data)); 482 483See L</"The UTF8 flag"> below. 484 485Also note that: 486 487 from_to($octets, $from, $to, $check); 488 489is equivalent to: 490 491 $octets = encode($to, decode($from, $octets), $check); 492 493Yes, it does I<not> respect the $check during decoding. It is 494deliberately done that way. If you need minute control, use C<decode> 495followed by C<encode> as follows: 496 497 $octets = encode($to, decode($from, $octets, $check_from), $check_to); 498 499=head3 encode_utf8 500 501 $octets = encode_utf8($string); 502 503Equivalent to C<$octets = encode("utf8", $string)>. The characters in 504$string are encoded in Perl's internal format, and the result is returned 505as a sequence of octets. Because all possible characters in Perl have a 506(loose, not strict) utf8 representation, this function cannot fail. 507 508B<WARNING>: do not use this function for data exchange as it can produce 509not strict utf8 $octets! For strictly valid UTF-8 output use 510C<$octets = encode("UTF-8", $string)>. 511 512=head3 decode_utf8 513 514 $string = decode_utf8($octets [, CHECK]); 515 516Equivalent to C<$string = decode("utf8", $octets [, CHECK])>. 
517The sequence of octets represented by $octets is decoded 518from (loose, not strict) utf8 into a sequence of logical characters. 519Because not all sequences of octets are valid not strict utf8, 520it is quite possible for this function to fail. 521For CHECK, see L</"Handling Malformed Data">. 522 523B<WARNING>: do not use this function for data exchange as it can produce 524$string with not strict utf8 representation! For strictly valid UTF-8 525$string representation use C<$string = decode("UTF-8", $octets [, CHECK])>. 526 527B<CAVEAT>: the input I<$octets> might be modified in-place depending on 528what is set in CHECK. See L</LEAVE_SRC> if you want your inputs to be 529left unchanged. 530 531=head2 Listing available encodings 532 533 use Encode; 534 @list = Encode->encodings(); 535 536Returns a list of canonical names of available encodings that have already 537been loaded. To get a list of all available encodings including those that 538have not yet been loaded, say: 539 540 @all_encodings = Encode->encodings(":all"); 541 542Or you can give the name of a specific module: 543 544 @with_jp = Encode->encodings("Encode::JP"); 545 546When "C<::>" is not in the name, "C<Encode::>" is assumed. 547 548 @ebcdic = Encode->encodings("EBCDIC"); 549 550To find out in detail which encodings are supported by this package, 551see L<Encode::Supported>. 552 553=head2 Defining Aliases 554 555To add a new alias to a given encoding, use: 556 557 use Encode; 558 use Encode::Alias; 559 define_alias(NEWNAME => ENCODING); 560 561After that, I<NEWNAME> can be used as an alias for I<ENCODING>. 562I<ENCODING> may be either the name of an encoding or an 563I<encoding object>. 564 565Before you do that, first make sure the alias is nonexistent using 566C<resolve_alias()>, which returns the canonical name thereof. 
567For example: 568 569 Encode::resolve_alias("latin1") eq "iso-8859-1" # true 570 Encode::resolve_alias("iso-8859-12") # false; nonexistent 571 Encode::resolve_alias($name) eq $name # true if $name is canonical 572 573C<resolve_alias()> does not need C<use Encode::Alias>; it can be 574imported via C<use Encode qw(resolve_alias)>. 575 576See L<Encode::Alias> for details. 577 578=head2 Finding IANA Character Set Registry names 579 580The canonical name of a given encoding does not necessarily agree with 581IANA Character Set Registry, commonly seen as C<< Content-Type: 582text/plain; charset=I<WHATEVER> >>. For most cases, the canonical name 583works, but sometimes it does not, most notably with "utf-8-strict". 584 585As of C<Encode> version 2.21, a new method C<mime_name()> is therefore added. 586 587 use Encode; 588 my $enc = find_encoding("UTF-8"); 589 warn $enc->name; # utf-8-strict 590 warn $enc->mime_name; # UTF-8 591 592See also: L<Encode::Encoding> 593 594=head1 Encoding via PerlIO 595 596If your perl supports C<PerlIO> (which is the default), you can use a 597C<PerlIO> layer to decode and encode directly via a filehandle. 
The following two examples are fully identical in functionality:

  ### Version 1 via PerlIO
    open(INPUT,  "< :encoding(shiftjis)", $infile)
        || die "Can't open < $infile for reading: $!";
    open(OUTPUT, "> :encoding(euc-jp)",  $outfile)
        || die "Can't open > $outfile for writing: $!";
    while (<INPUT>) {   # auto decodes $_
        print OUTPUT;   # auto encodes $_
    }
    close(INPUT)   || die "can't close $infile: $!";
    close(OUTPUT)  || die "can't close $outfile: $!";

  ### Version 2 via from_to()
    open(INPUT,  "< :raw", $infile)
        || die "Can't open < $infile for reading: $!";
    open(OUTPUT, "> :raw",  $outfile)
        || die "Can't open > $outfile for writing: $!";

    while (<INPUT>) {
        from_to($_, "shiftjis", "euc-jp", 1);  # switch encoding
        print OUTPUT;   # emit raw (but properly encoded) data
    }
    close(INPUT)   || die "can't close $infile: $!";
    close(OUTPUT)  || die "can't close $outfile: $!";

In the first version above, you let the appropriate encoding layer
handle the conversion.  In the second, you explicitly translate
from one encoding to the other.

Unfortunately, it may be that encodings are not C<PerlIO>-savvy.  You can check
to see whether your encoding is supported by C<PerlIO> by invoking the
C<perlio_ok> method on it:

  Encode::perlio_ok("hz");             # false
  find_encoding("euc-cn")->perlio_ok;  # true wherever PerlIO is available

  use Encode qw(perlio_ok);            # imported upon request
  perlio_ok("euc-jp")

Fortunately, all encodings that come with C<Encode> core are C<PerlIO>-savvy
except for C<hz> and C<ISO-2022-kr>.  For the gory details, see
L<Encode::Encoding> and L<Encode::PerlIO>.

=head1 Handling Malformed Data

The optional I<CHECK> argument tells C<Encode> what to do when
encountering malformed data.  Without I<CHECK>, C<Encode::FB_DEFAULT>
(== 0) is assumed.
647 648As of version 2.12, C<Encode> supports coderef values for C<CHECK>; 649see below. 650 651B<NOTE:> Not all encodings support this feature. 652Some encodings ignore the I<CHECK> argument. For example, 653L<Encode::Unicode> ignores I<CHECK> and it always croaks on error. 654 655=head2 List of I<CHECK> values 656 657=head3 FB_DEFAULT 658 659 I<CHECK> = Encode::FB_DEFAULT ( == 0) 660 661If I<CHECK> is 0, encoding and decoding replace any malformed character 662with a I<substitution character>. When you encode, I<SUBCHAR> is used. 663When you decode, the Unicode REPLACEMENT CHARACTER, code point U+FFFD, is 664used. If the data is supposed to be UTF-8, an optional lexical warning of 665warning category C<"utf8"> is given. 666 667=head3 FB_CROAK 668 669 I<CHECK> = Encode::FB_CROAK ( == 1) 670 671If I<CHECK> is 1, methods immediately die with an error 672message. Therefore, when I<CHECK> is 1, you should trap 673exceptions with C<eval{}>, unless you really want to let it C<die>. 674 675=head3 FB_QUIET 676 677 I<CHECK> = Encode::FB_QUIET 678 679If I<CHECK> is set to C<Encode::FB_QUIET>, encoding and decoding immediately 680return the portion of the data that has been processed so far when an 681error occurs. The data argument is overwritten with everything 682after that point; that is, the unprocessed portion of the data. This is 683handy when you have to call C<decode> repeatedly in the case where your 684source data may contain partial multi-byte character sequences, 685(that is, you are reading with a fixed-width buffer). Here's some sample 686code to do exactly that: 687 688 my($buffer, $string) = ("", ""); 689 while (read($fh, $buffer, 256, length($buffer))) { 690 $string .= decode($encoding, $buffer, Encode::FB_QUIET); 691 # $buffer now contains the unprocessed partial character 692 } 693 694=head3 FB_WARN 695 696 I<CHECK> = Encode::FB_WARN 697 698This is the same as C<FB_QUIET> above, except that instead of being silent 699on errors, it issues a warning. 
This is handy for when you are debugging.

B<CAVEAT>: All warnings from the Encode module are reported, independently
of L<pragma warnings|warnings> settings.  If you want Encode to follow the
lexical warnings settings configured by L<pragma warnings|warnings>, also
add the check value C<Encode::ONLY_PRAGMA_WARNINGS>.  This value is
available since Encode version 2.99.

=head3 FB_PERLQQ FB_HTMLCREF FB_XMLCREF

=over 2

=item perlqq mode (I<CHECK> = Encode::FB_PERLQQ)

=item HTML charref mode (I<CHECK> = Encode::FB_HTMLCREF)

=item XML charref mode (I<CHECK> = Encode::FB_XMLCREF)

=back

For encodings that are implemented by the C<Encode::XS> module, C<CHECK> C<==>
C<Encode::FB_PERLQQ> puts C<encode> and C<decode> into C<perlqq> fallback mode.

When you decode, C<\xI<HH>> is inserted for a malformed character, where
I<HH> is the hex representation of the octet that could not be decoded to
utf8.  When you encode, C<\x{I<HHHH>}> will be inserted, where I<HHHH> is
the Unicode code point (in any number of hex digits) of the character that
cannot be found in the character repertoire of the encoding.

The HTML/XML character reference modes are about the same.  In place of
C<\x{I<HHHH>}>, HTML uses C<&#I<NNN>;> where I<NNN> is a decimal number, and
XML uses C<&#xI<HHHH>;> where I<HHHH> is the hexadecimal number.

In C<Encode> 2.10 or later, C<LEAVE_SRC> is also implied.

=head3 The bitmask

These modes are all actually set via a bitmask.  Here is how the C<FB_I<XXX>>
constants are laid out.  You can import the C<FB_I<XXX>> constants via
C<use Encode qw(:fallbacks)>, and you can import the generic bitmask
constants via C<use Encode qw(:fallback_all)>.

                     FB_DEFAULT FB_CROAK FB_QUIET FB_WARN  FB_PERLQQ
 DIE_ON_ERR    0x0001             X
 WARN_ON_ERR   0x0002                               X
 RETURN_ON_ERR 0x0004                      X        X
 LEAVE_SRC     0x0008                                        X
 PERLQQ        0x0100                                        X
 HTMLCREF      0x0200
 XMLCREF       0x0400

=head3 LEAVE_SRC

  Encode::LEAVE_SRC

If the C<Encode::LEAVE_SRC> bit is I<not> set but I<CHECK> is set, then the
source string to encode() or decode() will be overwritten in place.
If you do not want the source to be overwritten, bitwise-OR
C<Encode::LEAVE_SRC> into I<CHECK>.

=head2 coderef for CHECK

As of C<Encode> 2.12, C<CHECK> can also be a code reference which takes the
ordinal value of the unmapped character as an argument and returns
octets that represent the fallback character.  For instance:

  $ascii = encode("ascii", $utf8, sub{ sprintf "<U+%04X>", shift });

Acts like C<FB_PERLQQ> but U+I<XXXX> is used instead of C<\x{I<XXXX>}>.

Fallback for C<decode> must return decoded string (sequence of characters)
and takes a list of ordinal values as its arguments. So for
example if you wish to decode octets as UTF-8, and use ISO-8859-15 as
a fallback for bytes that are not valid UTF-8, you could write

    $str = decode 'UTF-8', $octets, sub {
        my $tmp = join '', map chr, @_;
        return decode 'ISO-8859-15', $tmp;
    };

=head1 Defining Encodings

To define a new encoding, use:

    use Encode qw(define_encoding);
    define_encoding($object, CANONICAL_NAME [, alias...]);

I<CANONICAL_NAME> will be associated with I<$object>.  The object
should provide the interface described in L<Encode::Encoding>.
If more than two arguments are provided, additional
arguments are considered aliases for I<$object>.

See L<Encode::Encoding> for details.

=head1 The UTF8 flag

Before the introduction of Unicode support in Perl, the C<eq> operator
just compared the strings represented by two scalars.
Beginning with 796Perl 5.8, C<eq> compares two strings with simultaneous consideration of 797I<the UTF8 flag>. To explain why we made it so, I quote from page 402 of 798I<Programming Perl, 3rd ed.> 799 800=over 2 801 802=item Goal #1: 803 804Old byte-oriented programs should not spontaneously break on the old 805byte-oriented data they used to work on. 806 807=item Goal #2: 808 809Old byte-oriented programs should magically start working on the new 810character-oriented data when appropriate. 811 812=item Goal #3: 813 814Programs should run just as fast in the new character-oriented mode 815as in the old byte-oriented mode. 816 817=item Goal #4: 818 819Perl should remain one language, rather than forking into a 820byte-oriented Perl and a character-oriented Perl. 821 822=back 823 824When I<Programming Perl, 3rd ed.> was written, not even Perl 5.6.0 had been 825born yet, many features documented in the book remained unimplemented for a 826long time. Perl 5.8 corrected much of this, and the introduction of the 827UTF8 flag is one of them. You can think of there being two fundamentally 828different kinds of strings and string-operations in Perl: one a 829byte-oriented mode for when the internal UTF8 flag is off, and the other a 830character-oriented mode for when the internal UTF8 flag is on. 831 832This UTF8 flag is not visible in Perl scripts, exactly for the same reason 833you cannot (or rather, you I<don't have to>) see whether a scalar contains 834a string, an integer, or a floating-point number. But you can still peek 835and poke these if you will. See the next section. 836 837=head2 Messing with Perl's Internals 838 839The following API uses parts of Perl's internals in the current 840implementation. As such, they are efficient but may change in a future 841release. 842 843=head3 is_utf8 844 845 is_utf8(STRING [, CHECK]) 846 847[INTERNAL] Tests whether the UTF8 flag is turned on in the I<STRING>. 
848If I<CHECK> is true, also checks whether I<STRING> contains well-formed 849UTF-8. Returns true if successful, false otherwise. 850 851Typically only necessary for debugging and testing. Don't use this flag as 852a marker to distinguish character and binary data, that should be decided 853for each variable when you write your code. 854 855B<CAVEAT>: If I<STRING> has UTF8 flag set, it does B<NOT> mean that 856I<STRING> is UTF-8 encoded and vice-versa. 857 858As of Perl 5.8.1, L<utf8> also has the C<utf8::is_utf8> function. 859 860=head3 _utf8_on 861 862 _utf8_on(STRING) 863 864[INTERNAL] Turns the I<STRING>'s internal UTF8 flag B<on>. The I<STRING> 865is I<not> checked for containing only well-formed UTF-8. Do not use this 866unless you I<know with absolute certainty> that the STRING holds only 867well-formed UTF-8. Returns the previous state of the UTF8 flag (so please 868don't treat the return value as indicating success or failure), or C<undef> 869if I<STRING> is not a string. 870 871B<NOTE>: For security reasons, this function does not work on tainted values. 872 873=head3 _utf8_off 874 875 _utf8_off(STRING) 876 877[INTERNAL] Turns the I<STRING>'s internal UTF8 flag B<off>. Do not use 878frivolously. Returns the previous state of the UTF8 flag, or C<undef> if 879I<STRING> is not a string. Do not treat the return value as indicative of 880success or failure, because that isn't what it means: it is only the 881previous setting. 882 883B<NOTE>: For security reasons, this function does not work on tainted values. 884 885=head1 UTF-8 vs. utf8 vs. UTF8 886 887 ....We now view strings not as sequences of bytes, but as sequences 888 of numbers in the range 0 .. 2**32-1 (or in the case of 64-bit 889 computers, 0 .. 2**64-1) -- Programming Perl, 3rd ed. 890 891That has historically been Perl's notion of UTF-8, as that is how UTF-8 was 892first conceived by Ken Thompson when he invented it. 
However, thanks to 893later revisions to the applicable standards, official UTF-8 is now rather 894stricter than that. For example, its range is much narrower (0 .. 0x10_FFFF 895to cover only 21 bits instead of 32 or 64 bits) and some sequences 896are not allowed, like those used in surrogate pairs, the 31 non-character 897code points 0xFDD0 .. 0xFDEF, the last two code points in I<any> plane 898(0xI<XX>_FFFE and 0xI<XX>_FFFF), all non-shortest encodings, etc. 899 900The former default in which Perl would always use a loose interpretation of 901UTF-8 has now been overruled: 902 903 From: Larry Wall <larry@wall.org> 904 Date: December 04, 2004 11:51:58 JST 905 To: perl-unicode@perl.org 906 Subject: Re: Make Encode.pm support the real UTF-8 907 Message-Id: <20041204025158.GA28754@wall.org> 908 909 On Fri, Dec 03, 2004 at 10:12:12PM +0000, Tim Bunce wrote: 910 : I've no problem with 'utf8' being perl's unrestricted uft8 encoding, 911 : but "UTF-8" is the name of the standard and should give the 912 : corresponding behaviour. 913 914 For what it's worth, that's how I've always kept them straight in my 915 head. 916 917 Also for what it's worth, Perl 6 will mostly default to strict but 918 make it easy to switch back to lax. 919 920 Larry 921 922Got that? As of Perl 5.8.7, B<"UTF-8"> means UTF-8 in its current 923sense, which is conservative and strict and security-conscious, whereas 924B<"utf8"> means UTF-8 in its former sense, which was liberal and loose and 925lax. C<Encode> version 2.10 or later thus groks this subtle but critically 926important distinction between C<"UTF-8"> and C<"utf8">. 927 928 encode("utf8", "\x{FFFF_FFFF}", 1); # okay 929 encode("UTF-8", "\x{FFFF_FFFF}", 1); # croaks 930 931In the C<Encode> module, C<"UTF-8"> is actually a canonical name for 932C<"utf-8-strict">. 
That hyphen between the C<"UTF"> and the C<"8"> is 933critical; without it, C<Encode> goes "liberal" and (perhaps overly-)permissive: 934 935 find_encoding("UTF-8")->name # is 'utf-8-strict' 936 find_encoding("utf-8")->name # ditto. names are case insensitive 937 find_encoding("utf_8")->name # ditto. "_" are treated as "-" 938 find_encoding("UTF8")->name # is 'utf8'. 939 940Perl's internal UTF8 flag is called "UTF8", without a hyphen. It indicates 941whether a string is internally encoded as "utf8", also without a hyphen. 942 943=head1 SEE ALSO 944 945L<Encode::Encoding>, 946L<Encode::Supported>, 947L<Encode::PerlIO>, 948L<encoding>, 949L<perlebcdic>, 950L<perlfunc/open>, 951L<perlunicode>, L<perluniintro>, L<perlunifaq>, L<perlunitut> 952L<utf8>, 953the Perl Unicode Mailing List L<http://lists.perl.org/list/perl-unicode.html> 954 955=head1 MAINTAINER 956 957This project was originated by the late Nick Ing-Simmons and later 958maintained by Dan Kogai I<< <dankogai@cpan.org> >>. See AUTHORS 959for a full list of people involved. For any questions, send mail to 960I<< <perl-unicode@perl.org> >> so that we can all share. 961 962While Dan Kogai retains the copyright as a maintainer, credit 963should go to all those involved. See AUTHORS for a list of those 964who submitted code to the project. 965 966=head1 COPYRIGHT 967 968Copyright 2002-2014 Dan Kogai I<< <dankogai@cpan.org> >>. 969 970This library is free software; you can redistribute it and/or modify 971it under the same terms as Perl itself. 972 973=cut 974