Encode/Unicode/Unicode.pm

package Encode::Unicode;

use strict;
use warnings;
# NOTE(review): 'redefine' warnings are silenced for the whole file,
# presumably because subs get (re)installed further down -- confirm.
no warnings 'redefine';

# Build the module version from the RCS "Revision" keyword: the first
# number is printed as-is, every following number is zero-padded to two
# digits and appended ("1.40" for the keyword below).
our $VERSION = do {
    my ( $major, @minor ) = q$Revision: 1.40 $ =~ /\d+/g;
    sprintf( '%d.', $major ) . join( '', map { sprintf '%02d', $_ } @minor );
};

# Load the compiled (XS) part of this package, passing our version along.
use XSLoader;
XSLoader::load( __PACKAGE__, $VERSION );

#
# Object generator: registers all 8 transcoders at once.
#

require Encode;

# Encoding names that carry no explicit BE/LE suffix.
our %BOM_Unknown = map { $_ => 1 } qw(UTF-16 UTF-32);

for my $name (qw(UTF-16 UTF-16BE UTF-16LE
                 UTF-32 UTF-32BE UTF-32LE
                        UCS-2BE  UCS-2LE))
{
    # Split e.g. "UTF-16LE" into family ("UTF"), number ("16") and the
    # optional byte-order suffix ("LE").  Die instead of silently using
    # stale capture variables should a name ever fail to parse.
    my ( $family, $number, $order ) = $name =~ /^(\w+)-(\d+)(\w*)$/
        or die "Encode::Unicode: cannot parse encoding name '$name'";

    # UCS-2 is always 2 octets wide; for UTF-n the number is a bit count.
    my $ucs2 = ( $family eq 'UCS' );
    my $size = $ucs2 ? 2 : $number / 8;

    # One-letter byte-order tag: 'n'/'v' for BE/LE (the same letters
    # pack() uses for 16-bit values), upper-cased for 4-octet encodings,
    # and empty when the name states no byte order.
    my $endian = $order eq 'BE' ? 'n'
               : $order eq 'LE' ? 'v'
               :                  '';
    $endian = uc $endian if $size == 4;

    # Register the encoding object under its name in Encode's table.
    $Encode::Encoding{$name} = bless {
        Name   => $name,
        size   => $size,
        endian => $endian,
        ucs2   => $ucs2,
    } => __PACKAGE__;
}

use base qw(Encode::Encoding);

# renew($self)
#
# Returns $self unchanged unless the encoding's name is listed in
# %BOM_Unknown.  For those encodings it returns a shallow copy of the
# object, blessed into the same class and carrying clone => 1 so the
# caller knows it has been renewed.
sub renew {
    my ($self) = @_;
    return $self unless $BOM_Unknown{ $self->name };
    return bless { %$self, clone => 1 }, ref $self;
}

# There used to be a pure-Perl implementation of (en|de)code, but now
# that the XS version is ripe the Perl version has been zapped for
# optimal speed.  decode() and encode() are plain aliases for
# decode_xs() and encode_xs().

for my $method (qw(decode encode)) {
    no strict 'refs';    # symbolic lookup of names in the current package
    *{$method} = \&{ $method . '_xs' };
}

1;
__END__

=head1 NAME

Encode::Unicode -- Various Unicode Transformation Formats

=cut

=head1 SYNOPSIS

    use Encode qw/encode decode/;
    $ucs2 = encode("UCS-2BE", $utf8);
    $utf8 = decode("UCS-2BE", $ucs2);

=head1 ABSTRACT

This module implements all Character Encoding Schemes of Unicode that
are officially documented by the Unicode Consortium (except, of course,
for UTF-8, which is a native format in perl).

=over 4

=item L<http://www.unicode.org/glossary/> says:

I<Character Encoding Scheme> A character encoding form plus byte
serialization. There are seven character encoding schemes in Unicode:
UTF-8, UTF-16, UTF-16BE, UTF-16LE, UTF-32 (UCS-4), UTF-32BE (UCS-4BE) and
UTF-32LE (UCS-4LE), and UTF-7.

Since UTF-7 is a 7-bit (re)encoded version of UTF-16BE, it is not part of
Unicode's Character Encoding Scheme.  It is separately implemented in
Encode::Unicode::UTF7.  For details see L<Encode::Unicode::UTF7>.

=item Quick Reference

                Decodes from ord(N)           Encodes chr(N) to...
       octet/char BOM S.P d800-dfff  ord > 0xffff     \x{1abcd} ==
  ---------------+-----------------+------------------------------
  UCS-2BE       2   N   N  is bogus                  Not Available
  UCS-2LE       2   N   N     bogus                  Not Available
  UTF-16      2/4   Y   Y  is   S.P           S.P            BE/LE
  UTF-16BE    2/4   N   Y       S.P           S.P    0xd82a,0xdfcd
  UTF-16LE    2/4   N   Y       S.P           S.P    0x2ad8,0xcddf
  UTF-32        4   Y   -  is bogus         As is            BE/LE
  UTF-32BE      4   N   -     bogus         As is       0x0001abcd
  UTF-32LE      4   N   -     bogus         As is       0xcdab0100
  UTF-8       1-4   -   -     bogus   >= 4 octets \xf0\x9a\xaf\x8d
  ---------------+-----------------+------------------------------

=back

=head1 Size, Endianness, and BOM

You can categorize these CES by 3 criteria:  size of each character,
endianness, and Byte Order Mark.

=head2 by size

UCS-2 is a fixed-length encoding with each character taking 16 bits.
It B<does not> support I<surrogate pairs>.  When a surrogate pair
is encountered during decode(), its place is filled with \x{FFFD}
if I<CHECK> is 0, or the routine croaks if I<CHECK> is 1.  When a
character whose ord value is larger than 0xFFFF is encountered,
its place is filled with \x{FFFD} if I<CHECK> is 0, or the routine
croaks if I<CHECK> is 1.

UTF-16 is almost the same as UCS-2 but it supports I<surrogate pairs>.
When it encounters a high surrogate (0xD800-0xDBFF), it fetches the
following low surrogate (0xDC00-0xDFFF) and C<desurrogate>s them to
form a character.  Bogus surrogates result in death.  When \x{10000}
or above is encountered during encode(), it C<ensurrogate>s them and
pushes the surrogate pair to the output stream.

UTF-32 (UCS-4) is a fixed-length encoding with each character taking 32 bits.
Since it is 32-bit, there is no need for I<surrogate pairs>.

=head2 by endianness

The first (and now failed) goal of Unicode was to map all character
repertoires into a fixed-length integer so that programmers are happy.
Since each character is either a I<short> or I<long> in C, you have to
pay attention to the endianness of each platform when you pass data
to one another.

Anything marked as BE is Big Endian (or network byte order) and LE is
Little Endian (aka VAX byte order).  For anything not marked either
BE or LE, a character called Byte Order Mark (BOM) indicating the
endianness is prepended to the string.

=over 4

=item BOM as integer when fetched in network byte order

              16         32 bits/char
  -------------------------
  BE      0xFeFF 0x0000FeFF
  LE      0xFFFe 0xFFFe0000
  -------------------------

=back

This module handles the BOM as follows.

=over 4

=item *

When BE or LE is explicitly stated as the name of encoding, BOM is
simply treated as a normal character (ZERO WIDTH NO-BREAK SPACE).

=item *

When BE or LE is omitted during decode(), it checks if BOM is at the
beginning of the string; if one is found, the endianness is set to
what the BOM says.  If no BOM is found, the routine dies.

=item *

When BE or LE is omitted during encode(), it returns a BE-encoded
string with BOM prepended.  So when you want to encode a whole text
file, make sure you encode() the whole text at once, not line by line;
otherwise each line, not the file, will have a BOM prepended.

=item *

C<UCS-2> is an exception.  Unlike others, this is an alias of UCS-2BE.
UCS-2 is already registered by IANA and others that way.

=back

=head1 Surrogate Pairs

To say the least, surrogate pairs were the biggest mistake of the
Unicode Consortium.  But according to the late Douglas Adams in I<The
Hitchhiker's Guide to the Galaxy> Trilogy, C<In the beginning the
Universe was created. This has made a lot of people very angry and
been widely regarded as a bad move>.  Their mistake was not of this
magnitude so let's forgive them.

(I don't dare make any comparison between the Unicode Consortium and
the Vogons here ;)  Or, comparing Encode to Babel Fish is completely
appropriate -- if only you could stick this into your ear :)

Surrogate pairs were born when the Unicode Consortium finally
admitted that 16 bits were not big enough to hold all the world's
character repertoires.  But they already made UCS-2 16-bit.  What
do we do?

Back then, the range 0xD800-0xDFFF was not allocated.  Let's split
that range in half and use the first half to represent the C<upper
half of a character> and the second half to represent the C<lower
half of a character>.  That way, you can represent 1024 * 1024 =
1048576 more characters.  Now we can store character ranges up to
\x{10ffff} even with 16-bit encodings.  This pair of half-characters is
now called a I<surrogate pair> and UTF-16 is the name of the encoding
that embraces them.

Here is a formula to ensurrogate a Unicode character \x{10000} and
above (note the int(), since perl's C</> is not integer division):

  $hi = int(($uni - 0x10000) / 0x400) + 0xD800;
  $lo = ($uni - 0x10000) % 0x400 + 0xDC00;

And to desurrogate:

  $uni = 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00);

Note this move has made \x{D800}-\x{DFFF} into a forbidden zone but
perl does not prohibit the use of characters within this range.  To perl,
every one of \x{0000_0000} up to \x{ffff_ffff} (*) is I<a character>.

  (*) or \x{ffff_ffff_ffff_ffff} if your perl is compiled with 64-bit
  integer support!

=head1 SEE ALSO

L<Encode>, L<Encode::Unicode::UTF7>, L<http://www.unicode.org/glossary/>,
L<http://www.unicode.org/unicode/faq/utf_bom.html>,

RFC 2781 L<http://rfc.net/rfc2781.html>,

The whole Unicode standard L<http://www.unicode.org/unicode/uni2book/u2.html>

Ch. 15, p. 403 of C<Programming Perl (3rd Edition)>
by Larry Wall, Tom Christiansen, Jon Orwant;
O'Reilly & Associates; ISBN 0-596-00027-8

=cut