Encode/Unicode/Unicode.pm

b39c5158Smillertpackage Encode::Unicode;
b39c5158Smillert
b39c5158Smillertuse strict;
b39c5158Smillertuse warnings;
b39c5158Smillert
*eac174f2Safresh1our $VERSION = do { my @r = ( q$Revision: 2.20 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
b39c5158Smillert
b39c5158Smillertuse XSLoader;
b39c5158SmillertXSLoader::load( __PACKAGE__, $VERSION );
b39c5158Smillert
#
# Object generator: creates all 8 transcoders at once!
#
b39c5158Smillert
9f11ffb7Safresh1use Encode ();
b39c5158Smillert
# Names of the encodings whose byte order is not fixed by the name itself.
# renew() consults this table to decide whether to hand back a fresh clone.
our %BOM_Unknown = map { $_ => 1 } qw(UTF-16 UTF-32);

# Build one encoding object per supported name and register each of them
# with Encode under that name.
for my $name (
    qw(UTF-16 UTF-16BE UTF-16LE
    UTF-32 UTF-32BE UTF-32LE
    UCS-2BE  UCS-2LE)
  )
{
    # Split e.g. "UTF-16LE" into family ("UTF"), bit width ("16") and an
    # optional byte-order suffix ("LE").  Capturing into lexicals (instead
    # of reading $1/$2/$3 later) guarantees that a failed match can never
    # silently reuse the captures of a previous iteration.
    my ( $family, $bits, $order ) = $name =~ /^(\w+)-(\d+)(\w*)$/
      or die "Unrecognized encoding name: $name";

    # UCS-2 is stored as two octets per character; for UTF-N the size in
    # octets is N / 8.
    my $ucs2 = ( $family eq 'UCS' );
    my $size = $ucs2 ? 2 : $bits / 8;

    # Byte-order letter: 'n' for BE, 'v' for LE, '' when the name leaves the
    # order open.  Four-octet encodings use the upper-case letter instead.
    my $endian =
        $order eq 'BE' ? 'n'
      : $order eq 'LE' ? 'v'
      :                  '';
    $endian = uc $endian if $size == 4;

    my $obj = bless {
        Name   => $name,
        size   => $size,
        endian => $endian,
        ucs2   => $ucs2,
    } => __PACKAGE__;
    Encode::define_encoding( $obj, $name );
}
b39c5158Smillert
e5157e49Safresh1use parent qw(Encode::Encoding);
b39c5158Smillert
# renew() -- return the object a caller should use for a new conversion.
#
# For encodings not listed in %BOM_Unknown the object itself is returned.
# For the others a shallow copy, blessed into the same class, is returned
# with its 'renewed' counter incremented.
sub renew {
    my ($self) = @_;

    # Encodings absent from %BOM_Unknown are handed back unchanged.
    return $self unless $BOM_Unknown{ $self->name };

    # Copy the top-level fields into a new object of the same class.
    my %fields = %$self;
    my $copy   = bless \%fields, ref $self;

    # Mark the copy so the caller can tell it received a renewed object.
    $copy->{renewed}++;

    return $copy;
}
b39c5158Smillert
b39c5158Smillert1;
b39c5158Smillert__END__
b39c5158Smillert
b39c5158Smillert=head1 NAME
b39c5158Smillert
b39c5158SmillertEncode::Unicode -- Various Unicode Transformation Formats
b39c5158Smillert
b39c5158Smillert=cut
b39c5158Smillert
b39c5158Smillert=head1 SYNOPSIS
b39c5158Smillert
b39c5158Smillert    use Encode qw/encode decode/;
b39c5158Smillert    $ucs2 = encode("UCS-2BE", $utf8);
b39c5158Smillert    $utf8 = decode("UCS-2BE", $ucs2);
b39c5158Smillert
b39c5158Smillert=head1 ABSTRACT
b39c5158Smillert
b39c5158SmillertThis module implements all Character Encoding Schemes of Unicode that
b39c5158Smillertare officially documented by Unicode Consortium (except, of course,
b39c5158Smillertfor UTF-8, which is a native format in perl).
b39c5158Smillert
b39c5158Smillert=over 4
b39c5158Smillert
b39c5158Smillert=item L<http://www.unicode.org/glossary/> says:
b39c5158Smillert
b39c5158SmillertI<Character Encoding Scheme> A character encoding form plus byte
b39c5158Smillertserialization. There are Seven character encoding schemes in Unicode:
b39c5158SmillertUTF-8, UTF-16, UTF-16BE, UTF-16LE, UTF-32 (UCS-4), UTF-32BE (UCS-4BE) and
b39c5158SmillertUTF-32LE (UCS-4LE), and UTF-7.
b39c5158Smillert
Since UTF-7 is a 7-bit (re)encoded version of UTF-16BE, it is not part of
Unicode's Character Encoding Scheme.  It is separately implemented in
Encode::Unicode::UTF7.  For details see L<Encode::Unicode::UTF7>.
b39c5158Smillert
b39c5158Smillert=item Quick Reference
b39c5158Smillert
                Decodes from ord(N)           Encodes chr(N) to...
       octet/char BOM S.P d800-dfff  ord > 0xffff     \x{1abcd} ==
  ---------------+-----------------+------------------------------
  UCS-2BE       2   N   N  is bogus                  Not Available
  UCS-2LE       2   N   N     bogus                  Not Available
  UTF-16      2/4   Y   Y  is   S.P           S.P            BE/LE
  UTF-16BE    2/4   N   Y       S.P           S.P    0xd82a,0xdfcd
  UTF-16LE    2/4   N   Y       S.P           S.P    0x2ad8,0xcddf
  UTF-32        4   Y   -  is bogus         As is            BE/LE
  UTF-32BE      4   N   -     bogus         As is       0x0001abcd
  UTF-32LE      4   N   -     bogus         As is       0xcdab0100
  UTF-8       1-4   -   -     bogus   >= 4 octets \xf0\x9a\xaf\x8d
  ---------------+-----------------+------------------------------
b39c5158Smillert
b39c5158Smillert=back
b39c5158Smillert
b39c5158Smillert=head1 Size, Endianness, and BOM
b39c5158Smillert
b39c5158SmillertYou can categorize these CES by 3 criteria:  size of each character,
b39c5158Smillertendianness, and Byte Order Mark.
b39c5158Smillert
b39c5158Smillert=head2 by size
b39c5158Smillert
b39c5158SmillertUCS-2 is a fixed-length encoding with each character taking 16 bits.
b39c5158SmillertIt B<does not> support I<surrogate pairs>.  When a surrogate pair
b39c5158Smillertis encountered during decode(), its place is filled with \x{FFFD}
b39c5158Smillertif I<CHECK> is 0, or the routine croaks if I<CHECK> is 1.  When a
b39c5158Smillertcharacter whose ord value is larger than 0xFFFF is encountered,
b39c5158Smillertits place is filled with \x{FFFD} if I<CHECK> is 0, or the routine
b39c5158Smillertcroaks if I<CHECK> is 1.
b39c5158Smillert
b39c5158SmillertUTF-16 is almost the same as UCS-2 but it supports I<surrogate pairs>.
b39c5158SmillertWhen it encounters a high surrogate (0xD800-0xDBFF), it fetches the
b39c5158Smillertfollowing low surrogate (0xDC00-0xDFFF) and C<desurrogate>s them to
b39c5158Smillertform a character.  Bogus surrogates result in death.  When \x{10000}
b39c5158Smillertor above is encountered during encode(), it C<ensurrogate>s them and
b39c5158Smillertpushes the surrogate pair to the output stream.
b39c5158Smillert
b39c5158SmillertUTF-32 (UCS-4) is a fixed-length encoding with each character taking 32 bits.
b39c5158SmillertSince it is 32-bit, there is no need for I<surrogate pairs>.
b39c5158Smillert
b39c5158Smillert=head2 by endianness
b39c5158Smillert
b39c5158SmillertThe first (and now failed) goal of Unicode was to map all character
b39c5158Smillertrepertoires into a fixed-length integer so that programmers are happy.
b39c5158SmillertSince each character is either a I<short> or I<long> in C, you have to
b39c5158Smillertpay attention to the endianness of each platform when you pass data
b39c5158Smillertto one another.
b39c5158Smillert
b39c5158SmillertAnything marked as BE is Big Endian (or network byte order) and LE is
b39c5158SmillertLittle Endian (aka VAX byte order).  For anything not marked either
b39c5158SmillertBE or LE, a character called Byte Order Mark (BOM) indicating the
b39c5158Smillertendianness is prepended to the string.
b39c5158Smillert
CAVEAT: Though BOM in utf8 (\xEF\xBB\xBF) is valid, it is meaningless
and as of this writing the Encode suite just leaves it as is (\x{FeFF}).
b39c5158Smillert
b39c5158Smillert=over 4
b39c5158Smillert
b39c5158Smillert=item BOM as integer when fetched in network byte order
b39c5158Smillert
b39c5158Smillert              16         32 bits/char
b39c5158Smillert  -------------------------
b39c5158Smillert  BE      0xFeFF 0x0000FeFF
b39c5158Smillert  LE      0xFFFe 0xFFFe0000
b39c5158Smillert  -------------------------
b39c5158Smillert
b39c5158Smillert=back
b39c5158Smillert
This module handles the BOM as follows.
b39c5158Smillert
b39c5158Smillert=over 4
b39c5158Smillert
b39c5158Smillert=item *
b39c5158Smillert
b39c5158SmillertWhen BE or LE is explicitly stated as the name of encoding, BOM is
b39c5158Smillertsimply treated as a normal character (ZERO WIDTH NO-BREAK SPACE).
b39c5158Smillert
b39c5158Smillert=item *
b39c5158Smillert
b39c5158SmillertWhen BE or LE is omitted during decode(), it checks if BOM is at the
b39c5158Smillertbeginning of the string; if one is found, the endianness is set to
b8851fccSafresh1what the BOM says.
b8851fccSafresh1
b8851fccSafresh1=item *
b8851fccSafresh1
b8851fccSafresh1Default Byte Order
b8851fccSafresh1
When no BOM is found, Encode 2.76 and below croaked.  Since Encode
2.77, it falls back to BE according to RFC 2781 and the Unicode
Standard version 8.0.
b39c5158Smillert
b39c5158Smillert=item *
b39c5158Smillert
When BE or LE is omitted during encode(), it returns a BE-encoded
string with BOM prepended.  So when you want to encode a whole text
file, make sure you encode() the whole text at once, not line by line;
otherwise each line, rather than the file, will have a BOM prepended.
b39c5158Smillert
b39c5158Smillert=item *
b39c5158Smillert
b39c5158SmillertC<UCS-2> is an exception.  Unlike others, this is an alias of UCS-2BE.
b39c5158SmillertUCS-2 is already registered by IANA and others that way.
b39c5158Smillert
b39c5158Smillert=back
b39c5158Smillert
b39c5158Smillert=head1 Surrogate Pairs
b39c5158Smillert
b39c5158SmillertTo say the least, surrogate pairs were the biggest mistake of the
b39c5158SmillertUnicode Consortium.  But according to the late Douglas Adams in I<The
b39c5158SmillertHitchhiker's Guide to the Galaxy> Trilogy, C<In the beginning the
b39c5158SmillertUniverse was created. This has made a lot of people very angry and
b39c5158Smillertbeen widely regarded as a bad move>.  Their mistake was not of this
b39c5158Smillertmagnitude so let's forgive them.
b39c5158Smillert
b39c5158Smillert(I don't dare make any comparison with Unicode Consortium and the
b39c5158SmillertVogons here ;)  Or, comparing Encode to Babel Fish is completely
b39c5158Smillertappropriate -- if you can only stick this into your ear :)
b39c5158Smillert
b39c5158SmillertSurrogate pairs were born when the Unicode Consortium finally
b39c5158Smillertadmitted that 16 bits were not big enough to hold all the world's
b39c5158Smillertcharacter repertoires.  But they already made UCS-2 16-bit.  What
b39c5158Smillertdo we do?
b39c5158Smillert
Back then, the range 0xD800-0xDFFF was not allocated.  Let's split
that range in half and use the first half to represent the C<upper
half of a character> and the second half to represent the C<lower
half of a character>.  That way, you can represent 1024 * 1024 =
1048576 more characters.  Now we can store character ranges up to
\x{10ffff} even with 16-bit encodings.  This pair of half-characters is
now called a I<surrogate pair> and UTF-16 is the name of the encoding
that embraces them.
b39c5158Smillert
Here is a formula to ensurrogate a Unicode character \x{10000} and
above:

  $hi = int( ($uni - 0x10000) / 0x400 ) + 0xD800;
  $lo = ($uni - 0x10000) % 0x400 + 0xDC00;

And to desurrogate:

  $uni = 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00);
b39c5158Smillert
b39c5158SmillertNote this move has made \x{D800}-\x{DFFF} into a forbidden zone but
b39c5158Smillertperl does not prohibit the use of characters within this range.  To perl,
b39c5158Smillertevery one of \x{0000_0000} up to \x{ffff_ffff} (*) is I<a character>.
b39c5158Smillert
b39c5158Smillert  (*) or \x{ffff_ffff_ffff_ffff} if your perl is compiled with 64-bit
b39c5158Smillert  integer support!
b39c5158Smillert
b39c5158Smillert=head1 Error Checking
b39c5158Smillert
Unlike most encodings which accept various ways to handle errors,
Unicode encodings simply croak.
b39c5158Smillert
b39c5158Smillert  % perl -MEncode -e'$_ = "\xfe\xff\xd8\xd9\xda\xdb\0\n"' \
b39c5158Smillert         -e'Encode::from_to($_, "utf16","shift_jis", 0); print'
b39c5158Smillert  UTF-16:Malformed LO surrogate d8d9 at /path/to/Encode.pm line 184.
b39c5158Smillert  % perl -MEncode -e'$a = "BOM missing"' \
b39c5158Smillert         -e' Encode::from_to($a, "utf16", "shift_jis", 0); print'
b39c5158Smillert  UTF-16:Unrecognised BOM 424f at /path/to/Encode.pm line 184.
b39c5158Smillert
b39c5158SmillertUnlike other encodings where mappings are not one-to-one against
b39c5158SmillertUnicode, UTFs are supposed to map 100% against one another.  So Encode
b39c5158Smillertis more strict on UTFs.
b39c5158Smillert
b39c5158SmillertConsider that "division by zero" of Encode :)
b39c5158Smillert
b39c5158Smillert=head1 SEE ALSO
b39c5158Smillert
*eac174f2Safresh1L<Encode>, L<Encode::Unicode::UTF7>, L<https://www.unicode.org/glossary/>,
*eac174f2Safresh1L<https://www.unicode.org/faq/utf_bom.html>,
b39c5158Smillert
b39c5158SmillertRFC 2781 L<http://www.ietf.org/rfc/rfc2781.txt>,
b39c5158Smillert
*eac174f2Safresh1The whole Unicode standard L<https://www.unicode.org/standard/standard.html>
b39c5158Smillert
*eac174f2Safresh1Ch. 6 pp. 275 of C<Programming Perl (3rd Edition)>
*eac174f2Safresh1by Tom Christiansen, brian d foy & Larry Wall;
*eac174f2Safresh1O'Reilly & Associates; ISBN 978-0-596-00492-7
b39c5158Smillert
b39c5158Smillert=cut