#
# $Id: Encode.pm,v 1.99 2003/12/29 02:47:16 dankogai Exp dankogai $
#
package Encode;
use strict;
our $VERSION = "1.99_01";
sub DEBUG () { 0 }
use XSLoader ();
XSLoader::load(__PACKAGE__, $VERSION);

require Exporter;
use base qw/Exporter/;

# Public, encouraged API is exported by default

our @EXPORT = qw(
  decode  decode_utf8  encode  encode_utf8
  encodings  find_encoding clone_encoding
);

our @FB_FLAGS  = qw(DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC
                    PERLQQ HTMLCREF XMLCREF);
our @FB_CONSTS = qw(FB_DEFAULT FB_CROAK FB_QUIET FB_WARN
                    FB_PERLQQ FB_HTMLCREF FB_XMLCREF);

our @EXPORT_OK =
    (
     qw(
       _utf8_off _utf8_on define_encoding from_to is_16bit is_8bit
       is_utf8 perlio_ok resolve_alias utf8_downgrade utf8_upgrade
      ),
     @FB_FLAGS, @FB_CONSTS,
    );

our %EXPORT_TAGS =
    (
     all          =>  [ @EXPORT, @EXPORT_OK ],
     fallbacks    =>  [ @FB_CONSTS ],
     fallback_all =>  [ @FB_CONSTS, @FB_FLAGS ],
    );

# Documentation moved after __END__ for speed - NI-S

our $ON_EBCDIC = (ord("A") == 193);

# define_alias() and find_alias(), used below, are expected to be
# imported from here.
use Encode::Alias;

# Make a %Encoding package variable to allow a certain amount of cheating
our %Encoding;     # encoding name  => encoding object (already loaded)
our %ExtModule;    # encoding name  => module that defines it on demand
require Encode::Config;
eval { require Encode::ConfigLocal };    # optional; failure is ignored

#
# Encode->encodings([":all" | MODULE, ...])
#
# Returns the names of the known encodings, sorted case-insensitively.
# With no arguments only the encodings already present in %Encoding are
# listed.  ":all" adds every name in %ExtModule.  Any other argument is
# taken as a module name ("Encode::" is prepended when it contains no
# "::") and adds the %ExtModule names mapped to that module.
# The names "Internal", "Unicode" and "Guess" are never reported.
#
sub encodings
{
    my $class = shift;
    my %enc;
    if (@_ and $_[0] eq ":all"){
        %enc = ( %Encoding, %ExtModule );
    }else{
        %enc = %Encoding;
        for my $mod (map {m/::/o ? $_ : "Encode::$_" } @_){
            DEBUG and warn $mod;
            for my $enc (keys %ExtModule){
                $ExtModule{$enc} eq $mod and $enc{$enc} = $mod;
            }
        }
    }
    return
        sort { lc $a cmp lc $b }
             grep {!/^(?:Internal|Unicode|Guess)$/o} keys %enc;
}

#
# perlio_ok(ENCODING)
#
# ENCODING is either an encoding object or an encoding name.  Returns
# whatever the object's own perlio_ok() method returns, or 0 when the
# encoding is unknown or the object has no such method.
#
sub perlio_ok{
    my $obj = ref($_[0]) ? $_[0] : find_encoding($_[0]);
    # find_encoding() returns nothing for an unknown name; calling
    # ->can on that would die, so answer "no" instead.
    defined $obj or return 0;
    $obj->can("perlio_ok") and return $obj->perlio_ok();
    return 0; # safety net
}

#
# define_encoding(OBJECT, NAME [, ALIAS, ...])
#
# Registers OBJECT under NAME in %Encoding.  The lower-cased NAME is
# added as an alias when it differs from NAME, followed by every extra
# ALIAS given.  Returns OBJECT.
#
sub define_encoding
{
    my $obj  = shift;
    my $name = shift;
    $Encoding{$name} = $obj;
    my $lc = lc($name);
    define_alias($lc => $obj) unless $lc eq $name;
    while (@_){
        my $alias = shift;
        define_alias($alias, $obj);
    }
    return $obj;
}

#
# CLASS->getEncoding(NAME [, SKIP_EXTERNAL])
#
# Resolves NAME to an encoding object.  Lookup order:
#   1. NAME is already an object that can('renew') -> returned as is
#   2. %Encoding under NAME, then under lc(NAME)
#   3. CLASS->find_alias for NAME, then for lc(NAME)
#   4. unless SKIP_EXTERNAL is true, the module listed in %ExtModule is
#      loaded and %Encoding is consulted again
# Returns nothing (undef / empty list) when NAME cannot be resolved.
#
sub getEncoding
{
    my ($class, $name, $skip_external) = @_;

    # An undefined name can never resolve; bail out before it is used
    # as a hash key or handed to the alias machinery.
    defined($name) or return;

    ref($name) && $name->can('renew') and return $name;
    exists $Encoding{$name} and return $Encoding{$name};
    my $lc = lc $name;
    exists $Encoding{$lc} and return $Encoding{$lc};

    my $oc = $class->find_alias($name);
    defined($oc) and return $oc;
    $lc ne $name and $oc = $class->find_alias($lc);
    defined($oc) and return $oc;

    unless ($skip_external)
    {
        if (my $mod = $ExtModule{$name} || $ExtModule{$lc}){
            $mod =~ s,::,/,g ; $mod .= '.pm';
            eval{ require $mod; };    # best effort; a load failure means "not found"
            exists $Encoding{$name} and return $Encoding{$name};
            # The module may have been located through the lower-cased
            # key, in which case that is the key to look under.
            exists $Encoding{$lc} and return $Encoding{$lc};
        }
    }
    return;
}

#
# find_encoding(NAME [, SKIP_EXTERNAL])
#
# Functional front end to Encode->getEncoding().
#
sub find_encoding($;$)
{
    my ($name, $skip_external) = @_;
    return __PACKAGE__->getEncoding($name,$skip_external);
}

#
# resolve_alias(NAME)
#
# Returns the ->name of the encoding object NAME resolves to, or nothing
# when NAME is unknown.
#
sub resolve_alias($){
    my $obj = find_encoding(shift);
    defined $obj and return $obj->name;
    return;
}

#
# clone_encoding(NAME)
#
# Returns a deep copy (Storable::dclone) of the encoding object NAME
# resolves to.  Returns nothing when NAME is unknown or Storable cannot
# be loaded.
#
sub clone_encoding($){
    my $obj = find_encoding(shift);
    ref $obj or return;
    # Test the eval's own result rather than $@.
    eval { require Storable; 1 } or return;
    return Storable::dclone($obj);
}

#
# encode(ENCODING, STRING [, CHECK])
#
# Encodes STRING with the named encoding and returns the octets.
# Returns undef when STRING is undefined; croaks when ENCODING is
# unknown.  When CHECK is true the caller's STRING variable is
# overwritten with what the encoder left in its copy, unless CHECK has
# the LEAVE_SRC bit set.
#
sub encode($$;$)
{
    my ($name, $string, $check) = @_;
    return undef unless defined $string;
    $check ||=0;
    my $enc = find_encoding($name);
    unless(defined $enc){
        require Carp;
        Carp::croak("Unknown encoding '$name'");
    }
    my $octets = $enc->encode($string,$check);
    # Honour LEAVE_SRC: do not touch the caller's argument.  Besides
    # matching the flag's name, this keeps read-only arguments (string
    # constants) from dying with "Modification of a read-only value".
    $_[1] = $string if $check and !($check & LEAVE_SRC());
    return $octets;
}

#
# decode(ENCODING, OCTETS [, CHECK])
#
# Decodes OCTETS with the named encoding and returns the string.
# Returns undef when OCTETS is undefined; croaks when ENCODING is
# unknown.  When CHECK is true the caller's OCTETS variable is
# overwritten with what the decoder left in its copy, unless CHECK has
# the LEAVE_SRC bit set.
#
sub decode($$;$)
{
    my ($name,$octets,$check) = @_;
    return undef unless defined $octets;
    $check ||=0;
    my $enc = find_encoding($name);
    unless(defined $enc){
        require Carp;
        Carp::croak("Unknown encoding '$name'");
    }
    my $string = $enc->decode($octets,$check);
    # See encode() for why LEAVE_SRC suppresses the write-back.
    $_[1] = $octets if $check and !($check & LEAVE_SRC());
    return $string;
}

#
# from_to(OCTETS, FROM_ENC, TO_ENC [, CHECK])
#
# Converts OCTETS in place from FROM_ENC to TO_ENC by decoding and then
# re-encoding.  Returns the length of the converted data, or undef when
# OCTETS is undefined, when the conversion yields undef, or when CHECK
# is true and either step left unprocessed data behind.  Croaks when
# either encoding is unknown.
#
sub from_to($$$;$)
{
    my ($string,$from,$to,$check) = @_;
    return undef unless defined $string;
    $check ||=0;
    my $f = find_encoding($from);
    unless (defined $f){
        require Carp;
        Carp::croak("Unknown encoding '$from'");
    }
    my $t = find_encoding($to);
    unless (defined $t){
        require Carp;
        Carp::croak("Unknown encoding '$to'");
    }
    my $uni = $f->decode($string,$check);
    # With CHECK set, anything still left in the source means the step
    # stopped early; the caller's variable is left unmodified.
    return undef if ($check && length($string));
    $string = $t->encode($uni,$check);
    return undef if ($check && length($uni));
    return defined($_[0] = $string) ? length($string) : undef ;
}

#
# encode_utf8(STRING)
#
# Returns a copy of STRING passed through utf8::encode().
#
sub encode_utf8($)
{
    my ($str) = @_;
    utf8::encode($str);
    return $str;
}

#
# decode_utf8(OCTETS [, CHECK])
#
# With a true CHECK this is decode("utf8", OCTETS, CHECK).  Otherwise a
# copy of OCTETS is passed through utf8::decode() and returned, or undef
# is returned when utf8::decode() reports failure.
#
sub decode_utf8($;$)
{
    my ($str, $check) = @_;
    if ($check){
        return decode("utf8", $str, $check);
    }else{
        return undef unless utf8::decode($str);
        return $str;
    }
}

predefine_encodings(1);

#
# This is to restore %Encoding if really needed;
#
# predefine_encodings(USE_XS)
#
# (Re)installs the built-in encodings in %Encoding:
#   "Unicode" - Encode::UTF_EBCDIC on EBCDIC platforms, Encode::Internal
#               everywhere else
#   "utf8"    - Encode::utf8, using decode_xs/encode_xs when USE_XS is
#               true (presumably supplied by the XS code loaded above -
#               they are not defined in this file) and pure-Perl
#               closures otherwise
# The sub may be called more than once, so @ISA is only extended when
# the base class is not already listed.
#

sub predefine_encodings{
    use Encode::Encoding;
    no warnings 'redefine';
    my $use_xs = shift;
    if ($ON_EBCDIC) {
        # was in Encode::UTF_EBCDIC
        package Encode::UTF_EBCDIC;
        push @Encode::UTF_EBCDIC::ISA, 'Encode::Encoding'
            unless grep { $_ eq 'Encode::Encoding' } @Encode::UTF_EBCDIC::ISA;
        *decode = sub{
            my ($obj,$str,$chk) = @_;
            my $res = '';
            for (my $i = 0; $i < length($str); $i++) {
                $res .=
                    chr(utf8::unicode_to_native(ord(substr($str,$i,1))));
            }
            $_[1] = '' if $chk;    # everything was consumed
            return $res;
        };
        *encode = sub{
            my ($obj,$str,$chk) = @_;
            my $res = '';
            for (my $i = 0; $i < length($str); $i++) {
                $res .=
                    chr(utf8::native_to_unicode(ord(substr($str,$i,1))));
            }
            $_[1] = '' if $chk;    # everything was consumed
            return $res;
        };
        $Encode::Encoding{Unicode} =
            bless {Name => "UTF_EBCDIC"} => "Encode::UTF_EBCDIC";
    } else {
        package Encode::Internal;
        push @Encode::Internal::ISA, 'Encode::Encoding'
            unless grep { $_ eq 'Encode::Encoding' } @Encode::Internal::ISA;
        *decode = sub{
            my ($obj,$str,$chk) = @_;
            utf8::upgrade($str);
            $_[1] = '' if $chk;    # everything was consumed
            return $str;
        };
        # Encoding to the internal form is the same operation.
        *encode = \&decode;
        $Encode::Encoding{Unicode} =
            bless {Name => "Internal"} => "Encode::Internal";
    }

    {
        # was in Encode::utf8
        package Encode::utf8;
        push @Encode::utf8::ISA, 'Encode::Encoding'
            unless grep { $_ eq 'Encode::Encoding' } @Encode::utf8::ISA;
        #
        if ($use_xs){
            Encode::DEBUG and warn __PACKAGE__, " XS on";
            *decode = \&decode_xs;
            *encode = \&encode_xs;
        }else{
            Encode::DEBUG and warn __PACKAGE__, " XS off";
            *decode = sub{
                my ($obj,$octets,$chk) = @_;
                my $str = Encode::decode_utf8($octets);
                if (defined $str) {
                    $_[1] = '' if $chk;
                    return $str;
                }
                return undef;
            };
            *encode = sub {
                my ($obj,$string,$chk) = @_;
                my $octets = Encode::encode_utf8($string);
                $_[1] = '' if $chk;
                return $octets;
            };
        }
        # Appends to $dst the part of $src from $pos up to and including
        # the next $trm and advances $pos past it (returns 1); when $trm
        # is not found the rest of $src is appended and '' is returned.
        # $dst, $src and $pos are modified through @_ aliases.
        *cat_decode = sub{ # ($obj, $dst, $src, $pos, $trm, $chk)
            my ($obj, undef, undef, $pos, $trm) = @_; # currently ignores $chk
            my ($rdst, $rsrc, $rpos) = \@_[1,2,3];
            use bytes;
            if ((my $npos = index($$rsrc, $trm, $pos)) >= 0) {
                $$rdst .= substr($$rsrc, $pos, $npos - $pos + length($trm));
                $$rpos = $npos + length($trm);
                return 1;
            }
            $$rdst .= substr($$rsrc, $pos);
            $$rpos = length($$rsrc);
            return '';
        };
        $Encode::Encoding{utf8} =
            bless {Name => "utf8"} => "Encode::utf8";
    }
}

1;

__END__

=head1 NAME

Encode - character encodings

=head1 SYNOPSIS

    use Encode;

=head2 Table of Contents

Encode consists of a collection of modules whose details are too big
to fit in one document.  This POD itself explains the top-level APIs
and general topics at a glance.
For other topics and more details,
see the PODs below:

  Name                        Description
  --------------------------------------------------------
  Encode::Alias               Alias definitions to encodings
  Encode::Encoding            Encode Implementation Base Class
  Encode::Supported           List of Supported Encodings
  Encode::CN                  Simplified Chinese Encodings
  Encode::JP                  Japanese Encodings
  Encode::KR                  Korean Encodings
  Encode::TW                  Traditional Chinese Encodings
  --------------------------------------------------------

=head1 DESCRIPTION

The C<Encode> module provides the interfaces between Perl's strings
and the rest of the system.  Perl strings are sequences of
B<characters>.

The repertoire of characters that Perl can represent is at least that
defined by the Unicode Consortium. On most platforms the ordinal
value of a character (as returned by C<ord(ch)>) is the "Unicode
codepoint" for the character (the exceptions are those platforms where
the legacy encoding is some variant of EBCDIC rather than a super-set
of ASCII - see L<perlebcdic>).

Traditionally, computer data has been moved around in 8-bit chunks
often called "bytes". These chunks are also known as "octets" in
networking standards. Perl is widely used to manipulate data of many
types - not only strings of characters representing human or computer
languages but also "binary" data being the machine's representation of
numbers, pixels in an image - or just about anything.

When Perl is processing "binary data", the programmer wants Perl to
process "sequences of bytes". This is not a problem for Perl - as a
byte has 256 possible values, it easily fits in Perl's much larger
"logical character".

=head2 TERMINOLOGY

=over 2

=item *

I<character>: a character in the range 0..(2**32-1) (or more).
(What Perl's strings are made of.)

=item *

I<byte>: a character in the range 0..255
(A special case of a Perl character.)

=item *

I<octet>: 8 bits of data, with ordinal values 0..255
(Term for bytes passed to or from a non-Perl context, e.g. a disk file.)

=back

=head1 PERL ENCODING API

=over 2

=item $octets  = encode(ENCODING, $string [, CHECK])

Encodes a string from Perl's internal form into I<ENCODING> and returns
a sequence of octets.  ENCODING can be either a canonical name or
an alias.  For encoding names and aliases, see L</"Defining Aliases">.
For CHECK, see L</"Handling Malformed Data">.

For example, to convert a string from Perl's internal format to
iso-8859-1 (also known as Latin1),

  $octets = encode("iso-8859-1", $string);

B<CAVEAT>: When you run C<$octets = encode("utf8", $string)>, then $octets
B<may not be equal to> $string.  Though they both contain the same data,
the utf8 flag for $octets is B<always> off.  When you encode anything, the
utf8 flag of the result is always off, even when it contains a completely
valid utf8 string. See L</"The UTF-8 flag"> below.

encode($valid_encoding, undef) is harmless: it returns C<undef> without
invoking the encoder, so no warning is issued.
encode($valid_encoding, '') is harmless and warnless.

=item $string = decode(ENCODING, $octets [, CHECK])

Decodes a sequence of octets assumed to be in I<ENCODING> into Perl's
internal form and returns the resulting string.  As in encode(),
ENCODING can be either a canonical name or an alias. For encoding names
and aliases, see L</"Defining Aliases">.  For CHECK, see
L</"Handling Malformed Data">.

For example, to convert ISO-8859-1 data to a string in Perl's internal format:

  $string = decode("iso-8859-1", $octets);

B<CAVEAT>: When you run C<$string = decode("utf8", $octets)>, then $string
B<may not be equal to> $octets.
Though they both contain the same data,
the utf8 flag for $string is on unless $octets entirely consists of
ASCII data (or EBCDIC on EBCDIC machines).  See L</"The UTF-8 flag">
below.

decode($valid_encoding, undef) is harmless: it returns C<undef> without
invoking the decoder, so no warning is issued.
decode($valid_encoding, '') is harmless and warnless.

=item [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])

Converts B<in-place> data between two encodings. The data in $octets
must be encoded as octets and not as characters in Perl's internal
format. For example, to convert ISO-8859-1 data to Microsoft's CP1250 encoding:

  from_to($octets, "iso-8859-1", "cp1250");

and to convert it back:

  from_to($octets, "cp1250", "iso-8859-1");

Note that because the conversion happens in place, the data to be
converted cannot be a string constant; it must be a scalar variable.

from_to() returns the length of the converted string in octets on success,
undef otherwise.

B<CAVEAT>: The following operations look the same but are not quite so:

  from_to($data, "iso-8859-1", "utf8"); #1
  $data = decode("iso-8859-1", $data);  #2

Both #1 and #2 make $data consist of a completely valid UTF-8 string
but only #2 turns the utf8 flag on.  #1 is equivalent to

  $data = encode("utf8", decode("iso-8859-1", $data));

See L</"The UTF-8 flag"> below.

=item $octets = encode_utf8($string);

Equivalent to C<$octets = encode("utf8", $string);>.  The characters
that comprise $string are encoded in Perl's internal format and the
result is returned as a sequence of octets. All possible
characters have a UTF-8 representation so this function cannot fail.


=item $string = decode_utf8($octets [, CHECK]);

Equivalent to C<$string = decode("utf8", $octets [, CHECK])>.
The sequence of octets represented by
$octets is decoded from UTF-8 into a sequence of logical
characters. Not all sequences of octets form valid UTF-8 encodings, so
it is possible for this call to fail.  For CHECK, see
L</"Handling Malformed Data">.

=back

=head2 Listing available encodings

  use Encode;
  @list = Encode->encodings();

Returns a list of the canonical names of the available encodings that
are loaded.  To get a list of all available encodings including the
ones that are not loaded yet, say

  @all_encodings = Encode->encodings(":all");

Or you can give the name of a specific module.

  @with_jp = Encode->encodings("Encode::JP");

When "::" is not in the name, "Encode::" is assumed.

  @ebcdic = Encode->encodings("EBCDIC");

To find out in detail which encodings are supported by this package,
see L<Encode::Supported>.

=head2 Defining Aliases

To add a new alias to a given encoding, use:

  use Encode;
  use Encode::Alias;
  define_alias(newName => ENCODING);

After that, newName can be used as an alias for ENCODING.
ENCODING may be either the name of an encoding or an
I<encoding object>.

But before you do so, make sure the alias is nonexistent with
C<resolve_alias()>, which returns the canonical name thereof.
For example:

  Encode::resolve_alias("latin1") eq "iso-8859-1" # true
  Encode::resolve_alias("iso-8859-12")   # false; nonexistent
  Encode::resolve_alias($name) eq $name  # true if $name is canonical

resolve_alias() does not need C<use Encode::Alias>; it can be
imported via C<use Encode qw(resolve_alias)>.

See L<Encode::Alias> for details.

=head1 Encoding via PerlIO

If your perl supports I<PerlIO> (which is the default), you can use a
PerlIO layer to decode and encode directly via a filehandle.  The
following two examples are totally identical in their functionality.

  # via PerlIO
  open my $in,  "<:encoding(shiftjis)", $infile  or die;
  open my $out, ">:encoding(euc-jp)",   $outfile or die;
  while(<$in>){ print $out $_; }

  # via from_to
  open my $in,  "<", $infile  or die;
  open my $out, ">", $outfile or die;
  while(<$in>){
    from_to($_, "shiftjis", "euc-jp", 1);
    print $out $_;
  }

Unfortunately, it may be that not all encodings are PerlIO-savvy.  You
can check if your encoding is supported by PerlIO by calling the
C<perlio_ok> method.

  Encode::perlio_ok("hz");             # False
  find_encoding("euc-cn")->perlio_ok;  # True where PerlIO is available

  use Encode qw(perlio_ok);            # exported upon request
  perlio_ok("euc-jp")

Fortunately, all encodings that come with Encode core are PerlIO-savvy
except for hz and ISO-2022-kr.  For gory details, see
L<Encode::Encoding> and L<Encode::PerlIO>.

=head1 Handling Malformed Data

The I<CHECK> argument is used as follows.  When you omit it,
the behaviour is the same as if you had passed a value of 0 for
I<CHECK>.

=over 2

=item I<CHECK> = Encode::FB_DEFAULT ( == 0)

If I<CHECK> is 0, (en|de)code will put a I<substitution character>
in place of a malformed character.  For UCM-based encodings,
E<lt>subcharE<gt> will be used.  For Unicode, the code point C<0xFFFD> is used.
If the data is supposed to be UTF-8, an optional lexical warning
(category utf8) is given.

=item I<CHECK> = Encode::FB_CROAK ( == 1)

If I<CHECK> is 1, methods will die on error immediately with an error
message.  Therefore, when I<CHECK> is set to 1,  you should trap the
fatal error with eval{} unless you really want to let it die on error.

=item I<CHECK> = Encode::FB_QUIET

If I<CHECK> is set to Encode::FB_QUIET, (en|de)code will immediately
return the portion of the data that has been processed so far when
an error occurs.
The data argument will be overwritten with
everything after that point (that is, the unprocessed part of data).
This is handy when you have to call decode repeatedly in the case
where your source data may contain partial multi-byte character
sequences, for example because you are reading with a fixed-width
buffer. Here is some sample code that does exactly this:

  my $data = ''; my $utf8 = '';
  while(read $fh, $buffer, 256){
    # buffer may end in a partial character so we append
    $data .= $buffer;
    $utf8 .= decode($encoding, $data, Encode::FB_QUIET);
    # $data now contains the unprocessed partial character
  }

(The loop tests the value returned by C<read> rather than whether it is
defined: C<read> returns 0 at end of file, which is defined, so a
C<defined> test would never terminate.)

=item I<CHECK> = Encode::FB_WARN

This is the same as above, except that it warns on error.  Handy when
you are debugging the mode above.

=item perlqq mode (I<CHECK> = Encode::FB_PERLQQ)

=item HTML charref mode (I<CHECK> = Encode::FB_HTMLCREF)

=item XML charref mode (I<CHECK> = Encode::FB_XMLCREF)

For encodings that are implemented by Encode::XS, CHECK ==
Encode::FB_PERLQQ turns (en|de)code into C<perlqq> fallback mode.

When you decode, C<\xI<HH>> will be inserted for a malformed character,
where I<HH> is the hex representation of the octet that could not be
decoded to utf8.  And when you encode, C<\x{I<HHHH>}> will be inserted,
where I<HHHH> is the Unicode ID of the character that cannot be found
in the character repertoire of the encoding.

HTML/XML character reference modes are about the same; in place of
C<\x{I<HHHH>}>, HTML uses C<&#I<NNNN>;> where I<NNNN> is a decimal number and
XML uses C<&#xI<HHHH>;> where I<HHHH> is a hexadecimal number.

=item The bitmask

These modes are actually set via a bitmask.  Here is how the FB_XX
constants are laid out.  You can import the FB_XX constants via
C<use Encode qw(:fallbacks)>; you can import the generic bitmask
constants via C<use Encode qw(:fallback_all)>.

                     FB_DEFAULT FB_CROAK FB_QUIET FB_WARN  FB_PERLQQ
 DIE_ON_ERR    0x0001             X
 WARN_ON_ERR   0x0002                               X
 RETURN_ON_ERR 0x0004                      X        X
 LEAVE_SRC     0x0008
 PERLQQ        0x0100                                        X
 HTMLCREF      0x0200
 XMLCREF       0x0400

=back

=head2 Unimplemented fallback schemes

In the future, you will be able to use a code reference to a callback
function for the value of I<CHECK> but its API is still undecided.

The fallback scheme does not work on EBCDIC platforms.

=head1 Defining Encodings

To define a new encoding, use:

    use Encode qw(define_encoding);
    define_encoding($object, 'canonicalName' [, alias...]);

I<canonicalName> will be associated with I<$object>.  The object
should provide the interface described in L<Encode::Encoding>.
If more than two arguments are provided then additional
arguments are taken as aliases for I<$object>.

See L<Encode::Encoding> for more details.

=head1 The UTF-8 flag

Before the introduction of utf8 support in perl, the C<eq> operator
just compared the strings represented by two scalars. Beginning with
perl 5.8, C<eq> compares two strings with simultaneous consideration
of I<the utf8 flag>. To explain why we made it so, I will quote page
402 of C<Programming Perl, 3rd ed.>

=over 2

=item Goal #1:

Old byte-oriented programs should not spontaneously break on the old
byte-oriented data they used to work on.

=item Goal #2:

Old byte-oriented programs should magically start working on the new
character-oriented data when appropriate.

=item Goal #3:

Programs should run just as fast in the new character-oriented mode
as in the old byte-oriented mode.

=item Goal #4:

Perl should remain one language, rather than forking into a
byte-oriented Perl and a character-oriented Perl.

=back

Back when C<Programming Perl, 3rd ed.> was written, not even Perl 5.6.0
was born and many features documented in the book remained
unimplemented for a long time.  Perl 5.8 corrected this, and the
introduction of the UTF-8 flag is one of those corrections.  You can think
of this perl notion as a byte-oriented mode (utf8 flag off) and a
character-oriented mode (utf8 flag on).

Here is how Encode takes care of the utf8 flag.

=over 2

=item *

When you encode, the resulting utf8 flag is always off.

=item *

When you decode, the resulting utf8 flag is on unless you can
unambiguously represent data.  Here is the definition of
dis-ambiguity.

After C<$utf8 = decode('foo', $octet);>,

  When $octet is...   The utf8 flag in $utf8 is
  ---------------------------------------------
  In ASCII only (or EBCDIC only)            OFF
  In ISO-8859-1                              ON
  In any other Encoding                      ON
  ---------------------------------------------

As you see, there is one exception: in ASCII.  That way you can assume
Goal #1.  And with Encode Goal #2 is assumed but you still have to be
careful in such cases mentioned in B<CAVEAT> paragraphs.

This utf8 flag is not visible in perl scripts, exactly for the same
reason you cannot (or you I<don't have to>) see if a scalar contains a
string, integer, or floating point number.   But you can still peek
and poke these if you will.  See the section below.

=back

=head2 Messing with Perl's Internals

The following API uses parts of Perl's internals in the current
implementation.  As such, it is efficient but may change.

=over 2

=item is_utf8(STRING [, CHECK])

[INTERNAL] Tests whether the UTF-8 flag is turned on in the STRING.
If CHECK is true, also checks the data in STRING for being well-formed
UTF-8.  Returns true if successful, false otherwise.

As of perl 5.8.1, L<utf8> also has utf8::is_utf8().

=item _utf8_on(STRING)

[INTERNAL] Turns on the UTF-8 flag in STRING.  The data in STRING is
B<not> checked for being well-formed UTF-8.  Do not use unless you
B<know> that the STRING is well-formed UTF-8.  Returns the previous
state of the UTF-8 flag (so please don't treat the return value as
indicating success or failure), or C<undef> if STRING is not a string.

=item _utf8_off(STRING)

[INTERNAL] Turns off the UTF-8 flag in STRING.  Do not use frivolously.
Returns the previous state of the UTF-8 flag (so please don't treat the
return value as indicating success or failure), or C<undef> if STRING is
not a string.

=back

=head1 SEE ALSO

L<Encode::Encoding>,
L<Encode::Supported>,
L<Encode::PerlIO>,
L<encoding>,
L<perlebcdic>,
L<perlfunc/open>,
L<perlunicode>,
L<utf8>,
the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>

=head1 MAINTAINER

This project was originated by Nick Ing-Simmons and later maintained
by Dan Kogai E<lt>dankogai@dan.co.jpE<gt>.  See AUTHORS for a full
list of people involved.  For any questions, use
E<lt>perl-unicode@perl.orgE<gt> so we can all share.

=cut