#
# $Id: Encode.pm,v 3.01 2019/03/13 00:25:25 dankogai Exp $
#
package Encode;
use strict;
use warnings;

# Debug tracing is switched on through the PERL_ENCODE_DEBUG environment
# variable; the constant lets "DEBUG and ..." statements fold away.
use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG};
our $VERSION;

BEGIN {
    # $VERSION is derived from the RCS revision keyword and must be known
    # at compile time because it is handed to XSLoader::load.
    $VERSION = sprintf "%d.%02d", q$Revision: 3.01 $ =~ /(\d+)/g;
    require XSLoader;
    XSLoader::load( __PACKAGE__, $VERSION );
}

use Exporter 5.57 'import';

use Carp ();
our @CARP_NOT = qw(Encode::Encoder);

# Public, encouraged API is exported by default

our @EXPORT = qw(
  decode decode_utf8 encode encode_utf8 str2bytes bytes2str
  encodings find_encoding find_mime_encoding clone_encoding
);
our @FB_FLAGS = qw(
  DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC
  PERLQQ HTMLCREF XMLCREF STOP_AT_PARTIAL
);
our @FB_CONSTS = qw(
  FB_DEFAULT FB_CROAK FB_QUIET FB_WARN
  FB_PERLQQ FB_HTMLCREF FB_XMLCREF
);
our @EXPORT_OK = (
    qw(
      _utf8_off _utf8_on define_encoding from_to is_16bit is_8bit
      is_utf8 perlio_ok resolve_alias utf8_downgrade utf8_upgrade
      ),
    @FB_FLAGS, @FB_CONSTS,
);

our %EXPORT_TAGS = (
    all          => [ @EXPORT, @EXPORT_OK ],
    default      => [ @EXPORT ],
    fallbacks    => [ @FB_CONSTS ],
    fallback_all => [ @FB_CONSTS, @FB_FLAGS ],
);

# Documentation moved after __END__ for speed - NI-S

# True when "A" has the ordinal 193, i.e. on an EBCDIC platform.
our $ON_EBCDIC = ( ord("A") == 193 );

use Encode::Alias ();
use Encode::MIME::Name;

use Storable;

# Make a %Encoding package variable to allow a certain amount of cheating
our %Encoding;     # name => encoding object, for encodings already defined
our %ExtModule;    # name => module that defines it when loaded
require Encode::Config;

# See
# https://bugzilla.redhat.com/show_bug.cgi?id=435505#c2
# to find why sig handlers inside eval{} are disabled.
eval {
    local $SIG{__DIE__};
    local $SIG{__WARN__};
    local @INC = @INC;

    # Do not pick up Encode::ConfigLocal from the current directory.
    # The "@INC &&" guard avoids reading $INC[-1] from an empty @INC.
    pop @INC if @INC && $INC[-1] eq '.';
    require Encode::ConfigLocal;
};

# encodings( CLASS [, ":all" | MODULE ...] )
#
# Returns the sorted (case-insensitively) list of encoding names.
# With ":all" as the first argument after the class, names from both
# %Encoding and %ExtModule are returned.  Otherwise the names in %Encoding
# are returned, plus those %ExtModule names whose module matches one of the
# arguments ("Encode::" is prepended to arguments that contain no "::").
# The names "Internal", "Unicode" and "Guess" are always filtered out.
sub encodings {
    my %enc;
    my $arg = $_[1] || '';
    if ( $arg eq ":all" ) {
        %enc = ( %Encoding, %ExtModule );
    }
    else {
        %enc = %Encoding;
        for my $mod ( map { m/::/ ? $_ : "Encode::$_" } @_ ) {
            DEBUG and warn $mod;
            for my $enc ( keys %ExtModule ) {
                $ExtModule{$enc} eq $mod and $enc{$enc} = $mod;
            }
        }
    }
    return sort { lc $a cmp lc $b }
      grep { !/^(?:Internal|Unicode|Guess)$/o } keys %enc;
}

# perlio_ok( ENCODING_OR_OBJECT )
#
# Accepts an encoding object or a name that is resolved via find_encoding().
# Returns whatever the object's own perlio_ok() method returns, or 0 when
# the object has no such method or the name cannot be resolved.
sub perlio_ok {
    my $obj = ref( $_[0] ) ? $_[0] : find_encoding( $_[0] );

    # find_encoding() returns nothing for an unknown name; calling
    # ->can on undef would die, so report "not supported" instead.
    defined $obj or return 0;
    $obj->can("perlio_ok") and return $obj->perlio_ok();
    return 0;    # safety net
}

# define_encoding( OBJECT, CANONICAL_NAME [, ALIAS ...] )
#
# Registers OBJECT in %Encoding under CANONICAL_NAME, defines the
# lower-cased name as an alias when it differs, and defines every further
# argument as an alias.  The object's class is appended to the @CARP_NOT
# lists of Encode and Encode::Encoding (once).  Returns OBJECT.
sub define_encoding {
    my $obj  = shift;
    my $name = shift;
    $Encoding{$name} = $obj;
    my $lc = lc($name);
    define_alias( $lc => $obj ) unless $lc eq $name;
    while (@_) {
        my $alias = shift;
        define_alias( $alias, $obj );
    }
    my $class = ref($obj);
    push @Encode::CARP_NOT, $class
      unless grep { $_ eq $class } @Encode::CARP_NOT;
    push @Encode::Encoding::CARP_NOT, $class
      unless grep { $_ eq $class } @Encode::Encoding::CARP_NOT;
    return $obj;
}

# getEncoding( CLASS, NAME [, SKIP_EXTERNAL] )
#
# Resolves NAME to an encoding object.  Lookup order:
#   1. NAME itself, if it is an object that can('renew');
#   2. %Encoding under NAME, then under lc(NAME);
#   3. CLASS->find_alias for NAME, then for lc(NAME);
#   4. unless SKIP_EXTERNAL is true, the module listed in %ExtModule for
#      NAME or lc(NAME) is loaded and %Encoding is consulted again.
# Returns nothing (empty list / undef) when NAME is undefined or unknown.
sub getEncoding {
    my ( $class, $name, $skip_external ) = @_;

    defined($name) or return;

    $name =~ s/\s+//g;    # https://rt.cpan.org/Ticket/Display.html?id=65796

    ref($name) && $name->can('renew') and return $name;
    exists $Encoding{$name} and return $Encoding{$name};
    my $lc = lc $name;
    exists $Encoding{$lc} and return $Encoding{$lc};

    my $oc = $class->find_alias($name);
    defined($oc) and return $oc;
    $lc ne $name and $oc = $class->find_alias($lc);
    defined($oc) and return $oc;

    unless ($skip_external) {
        if ( my $mod = $ExtModule{$name} || $ExtModule{$lc} ) {
            $mod =~ s,::,/,g;
            $mod .= '.pm';

            # Best effort: a module that fails to load simply leaves the
            # encoding undefined.
            eval { require $mod; };
            exists $Encoding{$name} and return $Encoding{$name};

            # The module may have been selected through the lower-cased
            # key, so look the encoding up under that key as well.
            exists $Encoding{$lc} and return $Encoding{$lc};
        }
    }
    return;
}

# HACK: These two functions must be defined in Encode and because of
# cyclic dependency between Encode and Encode::Alias, Exporter does not work
sub find_alias {
    goto &Encode::Alias::find_alias;
}

sub define_alias {
    goto &Encode::Alias::define_alias;
}

# find_encoding( NAME [, SKIP_EXTERNAL] )
#
# Functional front end to Encode->getEncoding().
sub find_encoding($;$) {
    my ( $name, $skip_external ) = @_;
    return __PACKAGE__->getEncoding( $name, $skip_external );
}

# find_mime_encoding( MIME_NAME [, SKIP_EXTERNAL] )
#
# Maps MIME_NAME to an encoding name with
# Encode::MIME::Name::get_encode_name() and resolves the result with
# find_encoding().
sub find_mime_encoding($;$) {
    my ( $mime_name, $skip_external ) = @_;
    my $name = Encode::MIME::Name::get_encode_name($mime_name);
    return find_encoding( $name, $skip_external );
}

# resolve_alias( NAME )
#
# Returns the name() of the encoding object NAME resolves to, or nothing
# when NAME cannot be resolved.
sub resolve_alias($) {
    my $obj = find_encoding(shift);
    defined $obj and return $obj->name;
    return;
}

# clone_encoding( NAME )
#
# Returns a deep copy (Storable::dclone) of the encoding object NAME
# resolves to, or nothing when NAME cannot be resolved.
sub clone_encoding($) {
    my $obj = find_encoding(shift);
    ref $obj or return;
    return Storable::dclone($obj);
}

# NOTE(review): onBOOT is not defined in this file; presumably it is
# provided by the XS code loaded in the BEGIN block above -- confirm.
onBOOT;

# The 'Unicode' encoding is registered under a platform-dependent class:
# Encode::UTF_EBCDIC on EBCDIC platforms, Encode::Internal elsewhere.
if ($ON_EBCDIC) {
    package Encode::UTF_EBCDIC;
    use parent 'Encode::Encoding';
    my $obj = bless { Name => "UTF_EBCDIC" } => "Encode::UTF_EBCDIC";
    Encode::define_encoding( $obj, 'Unicode' );

    # Maps every character through utf8::unicode_to_native.  When $chk is
    # true the caller's source variable is emptied.
    sub decode {
        my ( undef, $str, $chk ) = @_;
        my $res = '';
        for ( my $i = 0 ; $i < length($str) ; $i++ ) {
            $res .=
              chr(
                utf8::unicode_to_native( ord( substr( $str, $i, 1 ) ) )
              );
        }
        $_[1] = '' if $chk;
        return $res;
    }

    # Maps every character through utf8::native_to_unicode.  When $chk is
    # true the caller's source variable is emptied.
    sub encode {
        my ( undef, $str, $chk ) = @_;
        my $res = '';
        for ( my $i = 0 ; $i < length($str) ; $i++ ) {
            $res .=
              chr(
                utf8::native_to_unicode( ord( substr( $str, $i, 1 ) ) )
              );
        }
        $_[1] = '' if $chk;
        return $res;
    }
}
else {
    package Encode::Internal;
    use parent 'Encode::Encoding';
    my $obj = bless { Name => "Internal" } => "Encode::Internal";
    Encode::define_encoding( $obj, 'Unicode' );

    # Returns a utf8::upgrade()d copy of the input.  When $chk is true the
    # caller's source variable is emptied.
    sub decode {
        my ( undef, $str, $chk ) = @_;
        utf8::upgrade($str);
        $_[1] = '' if $chk;
        return $str;
    }

    # Encoding and decoding are the same operation for this class.
    *encode = \&decode;
}

{
    # https://rt.cpan.org/Public/Bug/Display.html?id=103253
    package Encode::XS;
    use parent 'Encode::Encoding';
}

{
    package Encode::utf8;
    use parent 'Encode::Encoding';

    # Two objects of this class are registered: the lax 'utf8' and
    # 'utf-8-strict', which carries the strict_utf8 attribute.
    my %obj = (
        'utf8'         => { Name => 'utf8' },
        'utf-8-strict' => { Name => 'utf-8-strict', strict_utf8 => 1 }
    );
    for ( keys %obj ) {
        bless $obj{$_} => __PACKAGE__;
        Encode::define_encoding( $obj{$_} => $_ );
    }

    # cat_decode( $obj, $dst, $src, $pos, $trm, $chk )
    #
    # Appends to $dst the part of $src that starts at $pos and runs up to
    # and including the first occurrence of the terminator $trm, advances
    # $pos past the terminator and returns 1.  When $trm is not found, the
    # rest of $src is appended, $pos is set to length($src) and '' is
    # returned.  $dst and $pos are modified in the caller through @_
    # aliases.  Offsets are byte offsets ("use bytes").
    sub cat_decode {

        # ($obj, $dst, $src, $pos, $trm, $chk)
        # currently ignores $chk
        my ( undef, undef, undef, $pos, $trm ) = @_;
        my ( $rdst, $rsrc, $rpos ) = \@_[ 1, 2, 3 ];
        use bytes;
        if ( ( my $npos = index( $$rsrc, $trm, $pos ) ) >= 0 ) {
            $$rdst .=
              substr( $$rsrc, $pos, $npos - $pos + length($trm) );
            $$rpos = $npos + length($trm);
            return 1;
        }
        $$rdst .= substr( $$rsrc, $pos );
        $$rpos = length($$rsrc);
        return '';
    }
}

1;

__END__

=head1 NAME

Encode - character encodings in Perl

=head1 SYNOPSIS

    use Encode qw(decode encode);
    $characters = decode('UTF-8', $octets, Encode::FB_CROAK);
    $octets = encode('UTF-8', $characters, Encode::FB_CROAK);

=head2 Table of Contents

Encode consists of a collection of modules whose details are too extensive
to fit in one document. This one itself explains the top-level APIs
and general topics at a glance. For other topics and more details,
see the documentation for these modules:

=over 2

=item L<Encode::Alias> - Alias definitions to encodings

=item L<Encode::Encoding> - Encode Implementation Base Class

=item L<Encode::Supported> - List of Supported Encodings

=item L<Encode::CN> - Simplified Chinese Encodings

=item L<Encode::JP> - Japanese Encodings

=item L<Encode::KR> - Korean Encodings

=item L<Encode::TW> - Traditional Chinese Encodings

=back

=head1 DESCRIPTION

The C<Encode> module provides the interface between Perl strings
and the rest of the system. Perl strings are sequences of
I<characters>.

The repertoire of characters that Perl can represent is a superset of those
defined by the Unicode Consortium.
On most platforms the ordinal 301values of a character as returned by C<ord(I<S>)> is the I<Unicode 302codepoint> for that character. The exceptions are platforms where 303the legacy encoding is some variant of EBCDIC rather than a superset 304of ASCII; see L<perlebcdic>. 305 306During recent history, data is moved around a computer in 8-bit chunks, 307often called "bytes" but also known as "octets" in standards documents. 308Perl is widely used to manipulate data of many types: not only strings of 309characters representing human or computer languages, but also "binary" 310data, being the machine's representation of numbers, pixels in an image, or 311just about anything. 312 313When Perl is processing "binary data", the programmer wants Perl to 314process "sequences of bytes". This is not a problem for Perl: because a 315byte has 256 possible values, it easily fits in Perl's much larger 316"logical character". 317 318This document mostly explains the I<how>. L<perlunitut> and L<perlunifaq> 319explain the I<why>. 320 321=head2 TERMINOLOGY 322 323=head3 character 324 325A character in the range 0 .. 2**32-1 (or more); 326what Perl's strings are made of. 327 328=head3 byte 329 330A character in the range 0..255; 331a special case of a Perl character. 332 333=head3 octet 334 3358 bits of data, with ordinal values 0..255; 336term for bytes passed to or from a non-Perl context, such as a disk file, 337standard I/O stream, database, command-line argument, environment variable, 338socket etc. 339 340=head1 THE PERL ENCODING API 341 342=head2 Basic methods 343 344=head3 encode 345 346 $octets = encode(ENCODING, STRING[, CHECK]) 347 348Encodes the scalar value I<STRING> from Perl's internal form into 349I<ENCODING> and returns a sequence of octets. I<ENCODING> can be either a 350canonical name or an alias. For encoding names and aliases, see 351L</"Defining Aliases">. For CHECK, see L</"Handling Malformed Data">. 

B<CAVEAT>: the input scalar I<STRING> might be modified in-place depending
on what is set in CHECK. See L</LEAVE_SRC> if you want your inputs to be
left unchanged.

For example, to convert a string from Perl's internal format into
ISO-8859-1, also known as Latin1:

  $octets = encode("iso-8859-1", $string);

B<CAVEAT>: When you run C<$octets = encode("UTF-8", $string)>, then
$octets I<might not be equal to> $string. Though both contain the
same data, the UTF8 flag for $octets is I<always> off. When you
encode anything, the UTF8 flag on the result is always off, even when it
contains a completely valid UTF-8 string. See L</"The UTF8 flag"> below.

If the $string is C<undef>, then C<undef> is returned.

C<str2bytes> may be used as an alias for C<encode>.

=head3 decode

  $string = decode(ENCODING, OCTETS[, CHECK])

This function returns the string that results from decoding the scalar
value I<OCTETS>, assumed to be a sequence of octets in I<ENCODING>, into
Perl's internal form. As with encode(),
I<ENCODING> can be either a canonical name or an alias. For encoding names
and aliases, see L</"Defining Aliases">; for I<CHECK>, see L</"Handling
Malformed Data">.

B<CAVEAT>: the input scalar I<OCTETS> might be modified in-place depending
on what is set in CHECK. See L</LEAVE_SRC> if you want your inputs to be
left unchanged.

For example, to convert ISO-8859-1 data into a string in Perl's
internal format:

  $string = decode("iso-8859-1", $octets);

B<CAVEAT>: When you run C<$string = decode("UTF-8", $octets)>, then $string
I<might not be equal to> $octets. Though both contain the same data, the
UTF8 flag for $string is on. See L</"The UTF8 flag">
below.

If $octets is C<undef>, then C<undef> is returned.

C<bytes2str> may be used as an alias for C<decode>.

=head3 find_encoding

  [$obj =] find_encoding(ENCODING)

Returns the I<encoding object> corresponding to I<ENCODING>. Returns
C<undef> if no matching I<ENCODING> is found. The returned object is
what does the actual encoding or decoding.

  $string = decode($name, $bytes);

is in fact

    $string = do {
        $obj = find_encoding($name);
        croak qq(encoding "$name" not found) unless ref $obj;
        $obj->decode($bytes);
    };

with more error checking.

You can therefore save time by reusing this object as follows:

    my $enc = find_encoding("iso-8859-1");
    while(<>) {
        my $string = $enc->decode($_);
        ... # now do something with $string;
    }

Besides L</decode> and L</encode>, other methods are
available as well. For instance, C<name()> returns the canonical
name of the encoding object.

  find_encoding("latin1")->name; # iso-8859-1

See L<Encode::Encoding> for details.

=head3 find_mime_encoding

  [$obj =] find_mime_encoding(MIME_ENCODING)

Returns the I<encoding object> corresponding to I<MIME_ENCODING>. Acts
the same as C<find_encoding()>, but the C<mime_name()> of the returned
object must match I<MIME_ENCODING>. So, unlike with C<find_encoding()>,
canonical names and aliases are not used when searching for the object.

    find_mime_encoding("utf8"); # returns undef because "utf8" is not valid I<MIME_ENCODING>
    find_mime_encoding("utf-8"); # returns encode object "utf-8-strict"
    find_mime_encoding("UTF-8"); # same as "utf-8" because I<MIME_ENCODING> is case insensitive
    find_mime_encoding("utf-8-strict"); # returns undef because "utf-8-strict" is not valid I<MIME_ENCODING>

=head3 from_to

  [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])

Converts I<in-place> data between two encodings. The data in $octets
must be encoded as octets and I<not> as characters in Perl's internal
format.
For example, to convert ISO-8859-1 data into Microsoft's CP1250 458encoding: 459 460 from_to($octets, "iso-8859-1", "cp1250"); 461 462and to convert it back: 463 464 from_to($octets, "cp1250", "iso-8859-1"); 465 466Because the conversion happens in place, the data to be 467converted cannot be a string constant: it must be a scalar variable. 468 469C<from_to()> returns the length of the converted string in octets on success, 470and C<undef> on error. 471 472B<CAVEAT>: The following operations may look the same, but are not: 473 474 from_to($data, "iso-8859-1", "UTF-8"); #1 475 $data = decode("iso-8859-1", $data); #2 476 477Both #1 and #2 make $data consist of a completely valid UTF-8 string, 478but only #2 turns the UTF8 flag on. #1 is equivalent to: 479 480 $data = encode("UTF-8", decode("iso-8859-1", $data)); 481 482See L</"The UTF8 flag"> below. 483 484Also note that: 485 486 from_to($octets, $from, $to, $check); 487 488is equivalent to: 489 490 $octets = encode($to, decode($from, $octets), $check); 491 492Yes, it does I<not> respect the $check during decoding. It is 493deliberately done that way. If you need minute control, use C<decode> 494followed by C<encode> as follows: 495 496 $octets = encode($to, decode($from, $octets, $check_from), $check_to); 497 498=head3 encode_utf8 499 500 $octets = encode_utf8($string); 501 502Equivalent to C<$octets = encode("utf8", $string)>. The characters in 503$string are encoded in Perl's internal format, and the result is returned 504as a sequence of octets. Because all possible characters in Perl have a 505(loose, not strict) utf8 representation, this function cannot fail. 506 507B<WARNING>: do not use this function for data exchange as it can produce 508not strict utf8 $octets! For strictly valid UTF-8 output use 509C<$octets = encode("UTF-8", $string)>. 510 511=head3 decode_utf8 512 513 $string = decode_utf8($octets [, CHECK]); 514 515Equivalent to C<$string = decode("utf8", $octets [, CHECK])>. 
516The sequence of octets represented by $octets is decoded 517from (loose, not strict) utf8 into a sequence of logical characters. 518Because not all sequences of octets are valid not strict utf8, 519it is quite possible for this function to fail. 520For CHECK, see L</"Handling Malformed Data">. 521 522B<WARNING>: do not use this function for data exchange as it can produce 523$string with not strict utf8 representation! For strictly valid UTF-8 524$string representation use C<$string = decode("UTF-8", $octets [, CHECK])>. 525 526B<CAVEAT>: the input I<$octets> might be modified in-place depending on 527what is set in CHECK. See L</LEAVE_SRC> if you want your inputs to be 528left unchanged. 529 530=head2 Listing available encodings 531 532 use Encode; 533 @list = Encode->encodings(); 534 535Returns a list of canonical names of available encodings that have already 536been loaded. To get a list of all available encodings including those that 537have not yet been loaded, say: 538 539 @all_encodings = Encode->encodings(":all"); 540 541Or you can give the name of a specific module: 542 543 @with_jp = Encode->encodings("Encode::JP"); 544 545When "C<::>" is not in the name, "C<Encode::>" is assumed. 546 547 @ebcdic = Encode->encodings("EBCDIC"); 548 549To find out in detail which encodings are supported by this package, 550see L<Encode::Supported>. 551 552=head2 Defining Aliases 553 554To add a new alias to a given encoding, use: 555 556 use Encode; 557 use Encode::Alias; 558 define_alias(NEWNAME => ENCODING); 559 560After that, I<NEWNAME> can be used as an alias for I<ENCODING>. 561I<ENCODING> may be either the name of an encoding or an 562I<encoding object>. 563 564Before you do that, first make sure the alias is nonexistent using 565C<resolve_alias()>, which returns the canonical name thereof. 
566For example: 567 568 Encode::resolve_alias("latin1") eq "iso-8859-1" # true 569 Encode::resolve_alias("iso-8859-12") # false; nonexistent 570 Encode::resolve_alias($name) eq $name # true if $name is canonical 571 572C<resolve_alias()> does not need C<use Encode::Alias>; it can be 573imported via C<use Encode qw(resolve_alias)>. 574 575See L<Encode::Alias> for details. 576 577=head2 Finding IANA Character Set Registry names 578 579The canonical name of a given encoding does not necessarily agree with 580IANA Character Set Registry, commonly seen as C<< Content-Type: 581text/plain; charset=I<WHATEVER> >>. For most cases, the canonical name 582works, but sometimes it does not, most notably with "utf-8-strict". 583 584As of C<Encode> version 2.21, a new method C<mime_name()> is therefore added. 585 586 use Encode; 587 my $enc = find_encoding("UTF-8"); 588 warn $enc->name; # utf-8-strict 589 warn $enc->mime_name; # UTF-8 590 591See also: L<Encode::Encoding> 592 593=head1 Encoding via PerlIO 594 595If your perl supports C<PerlIO> (which is the default), you can use a 596C<PerlIO> layer to decode and encode directly via a filehandle. 
The following two examples are fully identical in functionality:

  ### Version 1 via PerlIO
    open(INPUT, "< :encoding(shiftjis)", $infile)
        || die "Can't open < $infile for reading: $!";
    open(OUTPUT, "> :encoding(euc-jp)", $outfile)
        || die "Can't open > $outfile for writing: $!";
    while (<INPUT>) { # auto decodes $_
        print OUTPUT; # auto encodes $_
    }
    close(INPUT) || die "can't close $infile: $!";
    close(OUTPUT) || die "can't close $outfile: $!";

  ### Version 2 via from_to()
    open(INPUT, "< :raw", $infile)
        || die "Can't open < $infile for reading: $!";
    open(OUTPUT, "> :raw", $outfile)
        || die "Can't open > $outfile for writing: $!";

    while (<INPUT>) {
        from_to($_, "shiftjis", "euc-jp", 1); # switch encoding
        print OUTPUT; # emit raw (but properly encoded) data
    }
    close(INPUT) || die "can't close $infile: $!";
    close(OUTPUT) || die "can't close $outfile: $!";

In the first version above, you let the appropriate encoding layer
handle the conversion. In the second, you explicitly translate
from one encoding to the other.

Unfortunately, it may be that encodings are not C<PerlIO>-savvy. You can check
to see whether your encoding is supported by C<PerlIO> by invoking the
C<perlio_ok> method on it:

  Encode::perlio_ok("hz"); # false
  find_encoding("euc-cn")->perlio_ok; # true wherever PerlIO is available

  use Encode qw(perlio_ok); # imported upon request
  perlio_ok("euc-jp")

Fortunately, all encodings that come with C<Encode> core are C<PerlIO>-savvy
except for C<hz> and C<ISO-2022-kr>. For the gory details, see
L<Encode::Encoding> and L<Encode::PerlIO>.

=head1 Handling Malformed Data

The optional I<CHECK> argument tells C<Encode> what to do when
encountering malformed data. Without I<CHECK>, C<Encode::FB_DEFAULT>
(== 0) is assumed.
646 647As of version 2.12, C<Encode> supports coderef values for C<CHECK>; 648see below. 649 650B<NOTE:> Not all encodings support this feature. 651Some encodings ignore the I<CHECK> argument. For example, 652L<Encode::Unicode> ignores I<CHECK> and it always croaks on error. 653 654=head2 List of I<CHECK> values 655 656=head3 FB_DEFAULT 657 658 I<CHECK> = Encode::FB_DEFAULT ( == 0) 659 660If I<CHECK> is 0, encoding and decoding replace any malformed character 661with a I<substitution character>. When you encode, I<SUBCHAR> is used. 662When you decode, the Unicode REPLACEMENT CHARACTER, code point U+FFFD, is 663used. If the data is supposed to be UTF-8, an optional lexical warning of 664warning category C<"utf8"> is given. 665 666=head3 FB_CROAK 667 668 I<CHECK> = Encode::FB_CROAK ( == 1) 669 670If I<CHECK> is 1, methods immediately die with an error 671message. Therefore, when I<CHECK> is 1, you should trap 672exceptions with C<eval{}>, unless you really want to let it C<die>. 673 674=head3 FB_QUIET 675 676 I<CHECK> = Encode::FB_QUIET 677 678If I<CHECK> is set to C<Encode::FB_QUIET>, encoding and decoding immediately 679return the portion of the data that has been processed so far when an 680error occurs. The data argument is overwritten with everything 681after that point; that is, the unprocessed portion of the data. This is 682handy when you have to call C<decode> repeatedly in the case where your 683source data may contain partial multi-byte character sequences, 684(that is, you are reading with a fixed-width buffer). Here's some sample 685code to do exactly that: 686 687 my($buffer, $string) = ("", ""); 688 while (read($fh, $buffer, 256, length($buffer))) { 689 $string .= decode($encoding, $buffer, Encode::FB_QUIET); 690 # $buffer now contains the unprocessed partial character 691 } 692 693=head3 FB_WARN 694 695 I<CHECK> = Encode::FB_WARN 696 697This is the same as C<FB_QUIET> above, except that instead of being silent 698on errors, it issues a warning. 
This is handy for when you are debugging.

B<CAVEAT>: All warnings from the Encode module are reported, independently
of the L<warnings pragma|warnings> settings. If you want Encode to follow
the lexical warnings settings configured by the
L<warnings pragma|warnings>, then also append the check value
C<Encode::ONLY_PRAGMA_WARNINGS>. This value is available
since Encode version 2.99.

=head3 FB_PERLQQ FB_HTMLCREF FB_XMLCREF

=over 2

=item perlqq mode (I<CHECK> = Encode::FB_PERLQQ)

=item HTML charref mode (I<CHECK> = Encode::FB_HTMLCREF)

=item XML charref mode (I<CHECK> = Encode::FB_XMLCREF)

=back

For encodings that are implemented by the C<Encode::XS> module, C<CHECK> C<==>
C<Encode::FB_PERLQQ> puts C<encode> and C<decode> into C<perlqq> fallback mode.

When you decode, C<\xI<HH>> is inserted for a malformed character, where
I<HH> is the hex representation of the octet that could not be decoded to
utf8. When you encode, C<\x{I<HHHH>}> will be inserted, where I<HHHH> is
the Unicode code point (in any number of hex digits) of the character that
cannot be found in the character repertoire of the encoding.

The HTML/XML character reference modes are about the same. In place of
C<\x{I<HHHH>}>, HTML uses C<&#I<NNN>;> where I<NNN> is a decimal number, and
XML uses C<&#xI<HHHH>;> where I<HHHH> is the hexadecimal number.

In C<Encode> 2.10 or later, C<LEAVE_SRC> is also implied.

=head3 The bitmask

These modes are all actually set via a bitmask. Here is how the C<FB_I<XXX>>
constants are laid out. You can import the C<FB_I<XXX>> constants via
C<use Encode qw(:fallbacks)>, and you can import the generic bitmask
constants via C<use Encode qw(:fallback_all)>.

                     FB_DEFAULT FB_CROAK FB_QUIET FB_WARN  FB_PERLQQ
 DIE_ON_ERR    0x0001             X
 WARN_ON_ERR   0x0002                               X
 RETURN_ON_ERR 0x0004                      X        X
 LEAVE_SRC     0x0008                                        X
 PERLQQ        0x0100                                        X
 HTMLCREF      0x0200
 XMLCREF       0x0400

=head3 LEAVE_SRC

  Encode::LEAVE_SRC

If the C<Encode::LEAVE_SRC> bit is I<not> set but I<CHECK> is set, then the
source string to encode() or decode() will be overwritten in place.
If you're not interested in this, then bitwise-OR it with the bitmask.

=head2 coderef for CHECK

As of C<Encode> 2.12, C<CHECK> can also be a code reference which takes the
ordinal value of the unmapped character as an argument and returns
octets that represent the fallback character. For instance:

  $ascii = encode("ascii", $utf8, sub{ sprintf "<U+%04X>", shift });

Acts like C<FB_PERLQQ> but U+I<XXXX> is used instead of C<\x{I<XXXX>}>.

Fallback for C<decode> must return decoded string (sequence of characters)
and takes a list of ordinal values as its arguments. So for
example if you wish to decode octets as UTF-8, and use ISO-8859-15 as
a fallback for bytes that are not valid UTF-8, you could write

    $str = decode 'UTF-8', $octets, sub {
        my $tmp = join '', map chr, @_;
        return decode 'ISO-8859-15', $tmp;
    };

=head1 Defining Encodings

To define a new encoding, use:

    use Encode qw(define_encoding);
    define_encoding($object, CANONICAL_NAME [, alias...]);

I<CANONICAL_NAME> will be associated with I<$object>. The object
should provide the interface described in L<Encode::Encoding>.
If more than two arguments are provided, additional
arguments are considered aliases for I<$object>.

See L<Encode::Encoding> for details.

=head1 The UTF8 flag

Before the introduction of Unicode support in Perl, the C<eq> operator
just compared the strings represented by two scalars.
Beginning with 795Perl 5.8, C<eq> compares two strings with simultaneous consideration of 796I<the UTF8 flag>. To explain why we made it so, I quote from page 402 of 797I<Programming Perl, 3rd ed.> 798 799=over 2 800 801=item Goal #1: 802 803Old byte-oriented programs should not spontaneously break on the old 804byte-oriented data they used to work on. 805 806=item Goal #2: 807 808Old byte-oriented programs should magically start working on the new 809character-oriented data when appropriate. 810 811=item Goal #3: 812 813Programs should run just as fast in the new character-oriented mode 814as in the old byte-oriented mode. 815 816=item Goal #4: 817 818Perl should remain one language, rather than forking into a 819byte-oriented Perl and a character-oriented Perl. 820 821=back 822 823When I<Programming Perl, 3rd ed.> was written, not even Perl 5.6.0 had been 824born yet, many features documented in the book remained unimplemented for a 825long time. Perl 5.8 corrected much of this, and the introduction of the 826UTF8 flag is one of them. You can think of there being two fundamentally 827different kinds of strings and string-operations in Perl: one a 828byte-oriented mode for when the internal UTF8 flag is off, and the other a 829character-oriented mode for when the internal UTF8 flag is on. 830 831This UTF8 flag is not visible in Perl scripts, exactly for the same reason 832you cannot (or rather, you I<don't have to>) see whether a scalar contains 833a string, an integer, or a floating-point number. But you can still peek 834and poke these if you will. See the next section. 835 836=head2 Messing with Perl's Internals 837 838The following API uses parts of Perl's internals in the current 839implementation. As such, they are efficient but may change in a future 840release. 841 842=head3 is_utf8 843 844 is_utf8(STRING [, CHECK]) 845 846[INTERNAL] Tests whether the UTF8 flag is turned on in the I<STRING>. 
847If I<CHECK> is true, also checks whether I<STRING> contains well-formed 848UTF-8. Returns true if successful, false otherwise. 849 850Typically only necessary for debugging and testing. Don't use this flag as 851a marker to distinguish character and binary data, that should be decided 852for each variable when you write your code. 853 854B<CAVEAT>: If I<STRING> has UTF8 flag set, it does B<NOT> mean that 855I<STRING> is UTF-8 encoded and vice-versa. 856 857As of Perl 5.8.1, L<utf8> also has the C<utf8::is_utf8> function. 858 859=head3 _utf8_on 860 861 _utf8_on(STRING) 862 863[INTERNAL] Turns the I<STRING>'s internal UTF8 flag B<on>. The I<STRING> 864is I<not> checked for containing only well-formed UTF-8. Do not use this 865unless you I<know with absolute certainty> that the STRING holds only 866well-formed UTF-8. Returns the previous state of the UTF8 flag (so please 867don't treat the return value as indicating success or failure), or C<undef> 868if I<STRING> is not a string. 869 870B<NOTE>: For security reasons, this function does not work on tainted values. 871 872=head3 _utf8_off 873 874 _utf8_off(STRING) 875 876[INTERNAL] Turns the I<STRING>'s internal UTF8 flag B<off>. Do not use 877frivolously. Returns the previous state of the UTF8 flag, or C<undef> if 878I<STRING> is not a string. Do not treat the return value as indicative of 879success or failure, because that isn't what it means: it is only the 880previous setting. 881 882B<NOTE>: For security reasons, this function does not work on tainted values. 883 884=head1 UTF-8 vs. utf8 vs. UTF8 885 886 ....We now view strings not as sequences of bytes, but as sequences 887 of numbers in the range 0 .. 2**32-1 (or in the case of 64-bit 888 computers, 0 .. 2**64-1) -- Programming Perl, 3rd ed. 889 890That has historically been Perl's notion of UTF-8, as that is how UTF-8 was 891first conceived by Ken Thompson when he invented it. 
However, thanks to
later revisions to the applicable standards, official UTF-8 is now rather
stricter than that. For example, its range is much narrower (0 .. 0x10_FFFF
to cover only 21 bits instead of 32 or 64 bits) and some sequences
are not allowed, like those used in surrogate pairs, the 32 non-character
code points 0xFDD0 .. 0xFDEF, the last two code points in I<any> plane
(0xI<XX>_FFFE and 0xI<XX>_FFFF), all non-shortest encodings, etc.

The former default in which Perl would always use a loose interpretation of
UTF-8 has now been overruled:

  From: Larry Wall <larry@wall.org>
  Date: December 04, 2004 11:51:58 JST
  To: perl-unicode@perl.org
  Subject: Re: Make Encode.pm support the real UTF-8
  Message-Id: <20041204025158.GA28754@wall.org>

  On Fri, Dec 03, 2004 at 10:12:12PM +0000, Tim Bunce wrote:
  : I've no problem with 'utf8' being perl's unrestricted uft8 encoding,
  : but "UTF-8" is the name of the standard and should give the
  : corresponding behaviour.

  For what it's worth, that's how I've always kept them straight in my
  head.

  Also for what it's worth, Perl 6 will mostly default to strict but
  make it easy to switch back to lax.

  Larry

Got that? As of Perl 5.8.7, B<"UTF-8"> means UTF-8 in its current
sense, which is conservative and strict and security-conscious, whereas
B<"utf8"> means UTF-8 in its former sense, which was liberal and loose and
lax. C<Encode> version 2.10 or later thus groks this subtle but critically
important distinction between C<"UTF-8"> and C<"utf8">.

  encode("utf8", "\x{FFFF_FFFF}", 1); # okay
  encode("UTF-8", "\x{FFFF_FFFF}", 1); # croaks

In the C<Encode> module, C<"UTF-8"> is actually an alias for the canonical
name C<"utf-8-strict">.
That hyphen between the C<"UTF"> and the C<"8"> is
critical; without it, C<Encode> goes "liberal" and (perhaps overly-)permissive:

  find_encoding("UTF-8")->name # is 'utf-8-strict'
  find_encoding("utf-8")->name # ditto. names are case insensitive
  find_encoding("utf_8")->name # ditto. "_" are treated as "-"
  find_encoding("UTF8")->name # is 'utf8'.

Perl's internal UTF8 flag is called "UTF8", without a hyphen. It indicates
whether a string is internally encoded as "utf8", also without a hyphen.

=head1 SEE ALSO

L<Encode::Encoding>,
L<Encode::Supported>,
L<Encode::PerlIO>,
L<encoding>,
L<perlebcdic>,
L<perlfunc/open>,
L<perlunicode>, L<perluniintro>, L<perlunifaq>, L<perlunitut>,
L<utf8>,
the Perl Unicode Mailing List L<http://lists.perl.org/list/perl-unicode.html>

=head1 MAINTAINER

This project was originated by the late Nick Ing-Simmons and later
maintained by Dan Kogai I<< <dankogai@cpan.org> >>. See AUTHORS
for a full list of people involved. For any questions, send mail to
I<< <perl-unicode@perl.org> >> so that we can all share.

While Dan Kogai retains the copyright as a maintainer, credit
should go to all those involved. See AUTHORS for a list of those
who submitted code to the project.

=head1 COPYRIGHT

Copyright 2002-2014 Dan Kogai I<< <dankogai@cpan.org> >>.

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=cut