xref: /openbsd-src/gnu/usr.bin/perl/cpan/Encode/Unicode/Unicode.pm (revision eac174f2741a08d8deb8aae59a7f778ef9b5d770)
1b39c5158Smillertpackage Encode::Unicode;
2b39c5158Smillert
3b39c5158Smillertuse strict;
4b39c5158Smillertuse warnings;
5b39c5158Smillert
6*eac174f2Safresh1our $VERSION = do { my @r = ( q$Revision: 2.20 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
7b39c5158Smillert
8b39c5158Smillertuse XSLoader;
9b39c5158SmillertXSLoader::load( __PACKAGE__, $VERSION );
10b39c5158Smillert
11b39c5158Smillert#
12b39c5158Smillert# Object Generator 8 transcoders all at once!
13b39c5158Smillert#
14b39c5158Smillert
159f11ffb7Safresh1use Encode ();
16b39c5158Smillert
# Names of the encodings whose byte order is not stated in the name
# itself.  renew() consults this table to decide whether to clone.
our %BOM_Unknown = map { $_ => 1 } qw(UTF-16 UTF-32);

# Build one transcoder object per supported encoding name and register
# it with Encode.  Each object carries:
#   Name   - the encoding name it is registered under
#   size   - 2 for UCS-2 names, otherwise the bit width in the name / 8
#   endian - 'n' (BE) or 'v' (LE), upper-cased to 'N'/'V' when size is 4,
#            or '' when the name has no BE/LE suffix
#            (these look like pack() template letters; presumably they
#            are consumed by the XS side -- confirm there)
#   ucs2   - true when the name starts with "UCS"
for my $name (
    qw(UTF-16 UTF-16BE UTF-16LE
    UTF-32 UTF-32BE UTF-32LE
    UCS-2BE  UCS-2LE)
  )
{
    # Capture into lexicals and fail loudly on a name that does not
    # parse, instead of reading $1/$2/$3 left over from an earlier match.
    my ( $family, $bits, $order ) = $name =~ /^(\w+)-(\d+)(\w*)$/
      or die "Encode::Unicode: cannot parse encoding name '$name'";

    my $ucs2 = ( $family eq 'UCS' );
    my $size = $ucs2 ? 2 : $bits / 8;

    my $endian =
        $order eq 'BE' ? 'n'
      : $order eq 'LE' ? 'v'
      :                  '';
    $endian = uc $endian if $size == 4;

    my $obj = bless {
        Name   => $name,
        size   => $size,
        endian => $endian,
        ucs2   => $ucs2,
    } => __PACKAGE__;
    Encode::define_encoding( $obj, $name );
}
44b39c5158Smillert
45e5157e49Safresh1use parent qw(Encode::Encoding);
46b39c5158Smillert
# renew() -- return an encoding object for the caller to work with.
#
# For encodings not listed in %BOM_Unknown the object itself is returned
# unchanged.  For those that are listed (UTF-16 and UTF-32), a shallow
# copy of the object's hash is returned, blessed into the same class,
# with its "renewed" counter incremented so the caller can tell that it
# received a copy.
# NOTE(review): presumably the copy exists so that per-stream state
# (such as a byte order detected from a BOM) does not leak into the
# shared registered object -- confirm against the XS code.
sub renew {
    my ($self) = @_;

    return $self unless $BOM_Unknown{ $self->name };

    my %copy = %{$self};
    $copy{renewed}++;    # marks the object as a renewed copy
    return bless \%copy, ref $self;
}
54b39c5158Smillert
55b39c5158Smillert1;
56b39c5158Smillert__END__
57b39c5158Smillert
58b39c5158Smillert=head1 NAME
59b39c5158Smillert
60b39c5158SmillertEncode::Unicode -- Various Unicode Transformation Formats
61b39c5158Smillert
62b39c5158Smillert=cut
63b39c5158Smillert
64b39c5158Smillert=head1 SYNOPSIS
65b39c5158Smillert
66b39c5158Smillert    use Encode qw/encode decode/;
67b39c5158Smillert    $ucs2 = encode("UCS-2BE", $utf8);
68b39c5158Smillert    $utf8 = decode("UCS-2BE", $ucs2);
69b39c5158Smillert
70b39c5158Smillert=head1 ABSTRACT
71b39c5158Smillert
72b39c5158SmillertThis module implements all Character Encoding Schemes of Unicode that
73b39c5158Smillertare officially documented by Unicode Consortium (except, of course,
74b39c5158Smillertfor UTF-8, which is a native format in perl).
75b39c5158Smillert
76b39c5158Smillert=over 4
77b39c5158Smillert
78b39c5158Smillert=item L<http://www.unicode.org/glossary/> says:
79b39c5158Smillert
80b39c5158SmillertI<Character Encoding Scheme> A character encoding form plus byte
81b39c5158Smillertserialization. There are Seven character encoding schemes in Unicode:
82b39c5158SmillertUTF-8, UTF-16, UTF-16BE, UTF-16LE, UTF-32 (UCS-4), UTF-32BE (UCS-4BE) and
83b39c5158SmillertUTF-32LE (UCS-4LE), and UTF-7.
84b39c5158Smillert
Since UTF-7 is a 7-bit (re)encoded version of UTF-16BE, it is not part of
Unicode's Character Encoding Scheme.  It is separately implemented in
Encode::Unicode::UTF7.  For details see L<Encode::Unicode::UTF7>.
88b39c5158Smillert
89b39c5158Smillert=item Quick Reference
90b39c5158Smillert
91b39c5158Smillert                Decodes from ord(N)           Encodes chr(N) to...
92b39c5158Smillert       octet/char BOM S.P d800-dfff  ord > 0xffff     \x{1abcd} ==
93b39c5158Smillert  ---------------+-----------------+------------------------------
94b39c5158Smillert  UCS-2BE       2   N   N  is bogus                  Not Available
95b39c5158Smillert  UCS-2LE       2   N   N     bogus                  Not Available
96b39c5158Smillert  UTF-16      2/4   Y   Y  is   S.P           S.P            BE/LE
97b39c5158Smillert  UTF-16BE    2/4   N   Y       S.P           S.P    0xd82a,0xdfcd
98b39c5158Smillert  UTF-16LE    2/4   N   Y       S.P           S.P    0x2ad8,0xcddf
99b39c5158Smillert  UTF-32        4   Y   -  is bogus         As is            BE/LE
100b39c5158Smillert  UTF-32BE      4   N   -     bogus         As is       0x0001abcd
101b39c5158Smillert  UTF-32LE      4   N   -     bogus         As is       0xcdab0100
  UTF-8       1-4   -   -     bogus   >= 4 octets   \xf0\x9a\xaf\x8d
103b39c5158Smillert  ---------------+-----------------+------------------------------
104b39c5158Smillert
105b39c5158Smillert=back
106b39c5158Smillert
107b39c5158Smillert=head1 Size, Endianness, and BOM
108b39c5158Smillert
109b39c5158SmillertYou can categorize these CES by 3 criteria:  size of each character,
110b39c5158Smillertendianness, and Byte Order Mark.
111b39c5158Smillert
112b39c5158Smillert=head2 by size
113b39c5158Smillert
114b39c5158SmillertUCS-2 is a fixed-length encoding with each character taking 16 bits.
115b39c5158SmillertIt B<does not> support I<surrogate pairs>.  When a surrogate pair
116b39c5158Smillertis encountered during decode(), its place is filled with \x{FFFD}
117b39c5158Smillertif I<CHECK> is 0, or the routine croaks if I<CHECK> is 1.  When a
118b39c5158Smillertcharacter whose ord value is larger than 0xFFFF is encountered,
119b39c5158Smillertits place is filled with \x{FFFD} if I<CHECK> is 0, or the routine
120b39c5158Smillertcroaks if I<CHECK> is 1.
121b39c5158Smillert
122b39c5158SmillertUTF-16 is almost the same as UCS-2 but it supports I<surrogate pairs>.
123b39c5158SmillertWhen it encounters a high surrogate (0xD800-0xDBFF), it fetches the
124b39c5158Smillertfollowing low surrogate (0xDC00-0xDFFF) and C<desurrogate>s them to
125b39c5158Smillertform a character.  Bogus surrogates result in death.  When \x{10000}
126b39c5158Smillertor above is encountered during encode(), it C<ensurrogate>s them and
127b39c5158Smillertpushes the surrogate pair to the output stream.
128b39c5158Smillert
129b39c5158SmillertUTF-32 (UCS-4) is a fixed-length encoding with each character taking 32 bits.
130b39c5158SmillertSince it is 32-bit, there is no need for I<surrogate pairs>.
131b39c5158Smillert
132b39c5158Smillert=head2 by endianness
133b39c5158Smillert
134b39c5158SmillertThe first (and now failed) goal of Unicode was to map all character
135b39c5158Smillertrepertoires into a fixed-length integer so that programmers are happy.
136b39c5158SmillertSince each character is either a I<short> or I<long> in C, you have to
137b39c5158Smillertpay attention to the endianness of each platform when you pass data
138b39c5158Smillertto one another.
139b39c5158Smillert
140b39c5158SmillertAnything marked as BE is Big Endian (or network byte order) and LE is
141b39c5158SmillertLittle Endian (aka VAX byte order).  For anything not marked either
142b39c5158SmillertBE or LE, a character called Byte Order Mark (BOM) indicating the
143b39c5158Smillertendianness is prepended to the string.
144b39c5158Smillert
CAVEAT: Though BOM in utf8 (\xEF\xBB\xBF) is valid, it is meaningless
and as of this writing the Encode suite just leaves it as is (\x{FeFF}).
147b39c5158Smillert
148b39c5158Smillert=over 4
149b39c5158Smillert
150b39c5158Smillert=item BOM as integer when fetched in network byte order
151b39c5158Smillert
152b39c5158Smillert              16         32 bits/char
153b39c5158Smillert  -------------------------
154b39c5158Smillert  BE      0xFeFF 0x0000FeFF
155b39c5158Smillert  LE      0xFFFe 0xFFFe0000
156b39c5158Smillert  -------------------------
157b39c5158Smillert
158b39c5158Smillert=back
159b39c5158Smillert
This module handles the BOM as follows.
161b39c5158Smillert
162b39c5158Smillert=over 4
163b39c5158Smillert
164b39c5158Smillert=item *
165b39c5158Smillert
166b39c5158SmillertWhen BE or LE is explicitly stated as the name of encoding, BOM is
167b39c5158Smillertsimply treated as a normal character (ZERO WIDTH NO-BREAK SPACE).
168b39c5158Smillert
169b39c5158Smillert=item *
170b39c5158Smillert
171b39c5158SmillertWhen BE or LE is omitted during decode(), it checks if BOM is at the
172b39c5158Smillertbeginning of the string; if one is found, the endianness is set to
173b8851fccSafresh1what the BOM says.
174b8851fccSafresh1
175b8851fccSafresh1=item *
176b8851fccSafresh1
177b8851fccSafresh1Default Byte Order
178b8851fccSafresh1
When no BOM is found, Encode 2.76 and below croaked.  Since Encode
2.77, it falls back to BE according to RFC 2781 and the Unicode
Standard version 8.0.
182b39c5158Smillert
183b39c5158Smillert=item *
184b39c5158Smillert
185b39c5158SmillertWhen BE or LE is omitted during encode(), it returns a BE-encoded
186b39c5158Smillertstring with BOM prepended.  So when you want to encode a whole text
187b39c5158Smillertfile, make sure you encode() the whole text at once, not line by line
188b39c5158Smillertor each line, not file, will have a BOM prepended.
189b39c5158Smillert
190b39c5158Smillert=item *
191b39c5158Smillert
192b39c5158SmillertC<UCS-2> is an exception.  Unlike others, this is an alias of UCS-2BE.
193b39c5158SmillertUCS-2 is already registered by IANA and others that way.
194b39c5158Smillert
195b39c5158Smillert=back
196b39c5158Smillert
197b39c5158Smillert=head1 Surrogate Pairs
198b39c5158Smillert
199b39c5158SmillertTo say the least, surrogate pairs were the biggest mistake of the
200b39c5158SmillertUnicode Consortium.  But according to the late Douglas Adams in I<The
201b39c5158SmillertHitchhiker's Guide to the Galaxy> Trilogy, C<In the beginning the
202b39c5158SmillertUniverse was created. This has made a lot of people very angry and
203b39c5158Smillertbeen widely regarded as a bad move>.  Their mistake was not of this
204b39c5158Smillertmagnitude so let's forgive them.
205b39c5158Smillert
206b39c5158Smillert(I don't dare make any comparison with Unicode Consortium and the
207b39c5158SmillertVogons here ;)  Or, comparing Encode to Babel Fish is completely
208b39c5158Smillertappropriate -- if you can only stick this into your ear :)
209b39c5158Smillert
210b39c5158SmillertSurrogate pairs were born when the Unicode Consortium finally
211b39c5158Smillertadmitted that 16 bits were not big enough to hold all the world's
212b39c5158Smillertcharacter repertoires.  But they already made UCS-2 16-bit.  What
213b39c5158Smillertdo we do?
214b39c5158Smillert
215b39c5158SmillertBack then, the range 0xD800-0xDFFF was not allocated.  Let's split
216b39c5158Smillertthat range in half and use the first half to represent the C<upper
217b39c5158Smillerthalf of a character> and the second half to represent the C<lower
218b39c5158Smillerthalf of a character>.  That way, you can represent 1024 * 1024 =
219b39c5158Smillert1048576 more characters.  Now we can store character ranges up to
\x{10ffff} even with 16-bit encodings.  This pair of half-characters is
221b39c5158Smillertnow called a I<surrogate pair> and UTF-16 is the name of the encoding
222b39c5158Smillertthat embraces them.
223b39c5158Smillert
224b39c5158SmillertHere is a formula to ensurrogate a Unicode character \x{10000} and
225b39c5158Smillertabove;
226b39c5158Smillert
  $hi = int(($uni - 0x10000) / 0x400) + 0xD800;
  $lo = ($uni - 0x10000) % 0x400 + 0xDC00;
229b39c5158Smillert
230b39c5158SmillertAnd to desurrogate;
231b39c5158Smillert
232b39c5158Smillert $uni = 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00);
233b39c5158Smillert
234b39c5158SmillertNote this move has made \x{D800}-\x{DFFF} into a forbidden zone but
235b39c5158Smillertperl does not prohibit the use of characters within this range.  To perl,
236b39c5158Smillertevery one of \x{0000_0000} up to \x{ffff_ffff} (*) is I<a character>.
237b39c5158Smillert
238b39c5158Smillert  (*) or \x{ffff_ffff_ffff_ffff} if your perl is compiled with 64-bit
239b39c5158Smillert  integer support!
240b39c5158Smillert
241b39c5158Smillert=head1 Error Checking
242b39c5158Smillert
243b39c5158SmillertUnlike most encodings which accept various ways to handle errors,
Unicode encodings simply croak.
245b39c5158Smillert
246b39c5158Smillert  % perl -MEncode -e'$_ = "\xfe\xff\xd8\xd9\xda\xdb\0\n"' \
247b39c5158Smillert         -e'Encode::from_to($_, "utf16","shift_jis", 0); print'
248b39c5158Smillert  UTF-16:Malformed LO surrogate d8d9 at /path/to/Encode.pm line 184.
249b39c5158Smillert  % perl -MEncode -e'$a = "BOM missing"' \
250b39c5158Smillert         -e' Encode::from_to($a, "utf16", "shift_jis", 0); print'
251b39c5158Smillert  UTF-16:Unrecognised BOM 424f at /path/to/Encode.pm line 184.
252b39c5158Smillert
253b39c5158SmillertUnlike other encodings where mappings are not one-to-one against
254b39c5158SmillertUnicode, UTFs are supposed to map 100% against one another.  So Encode
255b39c5158Smillertis more strict on UTFs.
256b39c5158Smillert
257b39c5158SmillertConsider that "division by zero" of Encode :)
258b39c5158Smillert
259b39c5158Smillert=head1 SEE ALSO
260b39c5158Smillert
261*eac174f2Safresh1L<Encode>, L<Encode::Unicode::UTF7>, L<https://www.unicode.org/glossary/>,
262*eac174f2Safresh1L<https://www.unicode.org/faq/utf_bom.html>,
263b39c5158Smillert
264b39c5158SmillertRFC 2781 L<http://www.ietf.org/rfc/rfc2781.txt>,
265b39c5158Smillert
266*eac174f2Safresh1The whole Unicode standard L<https://www.unicode.org/standard/standard.html>
267b39c5158Smillert
268*eac174f2Safresh1Ch. 6 pp. 275 of C<Programming Perl (3rd Edition)>
269*eac174f2Safresh1by Tom Christiansen, brian d foy & Larry Wall;
270*eac174f2Safresh1O'Reilly & Associates; ISBN 978-0-596-00492-7
271b39c5158Smillert
272b39c5158Smillert=cut
273