xref: /openbsd-src/gnu/usr.bin/perl/lib/Unicode/UCD.pm (revision 55745691c11d58794cc2bb4d620ee3985f4381e6)
1package Unicode::UCD;
2
3use strict;
4use warnings;
5
6our $VERSION = '0.2';
7
8require Exporter;
9
10our @ISA = qw(Exporter);
11
12our @EXPORT_OK = qw(charinfo
13		    charblock charscript
14		    charblocks charscripts
15		    charinrange
16		    compexcl
17		    casefold casespec);
18
19use Carp;
20
21=head1 NAME
22
23Unicode::UCD - Unicode character database
24
25=head1 SYNOPSIS
26
27    use Unicode::UCD 'charinfo';
28    my $charinfo   = charinfo($codepoint);
29
30    use Unicode::UCD 'charblock';
31    my $charblock  = charblock($codepoint);
32
33    use Unicode::UCD 'charscript';
34    my $charscript = charblock($codepoint);
35
36    use Unicode::UCD 'charblocks';
37    my $charblocks = charblocks();
38
39    use Unicode::UCD 'charscripts';
40    my %charscripts = charscripts();
41
42    use Unicode::UCD qw(charscript charinrange);
43    my $range = charscript($script);
44    print "looks like $script\n" if charinrange($range, $codepoint);
45
46    use Unicode::UCD 'compexcl';
47    my $compexcl = compexcl($codepoint);
48
49    my $unicode_version = Unicode::UCD::UnicodeVersion();
50
51=head1 DESCRIPTION
52
53The Unicode::UCD module offers a simple interface to the Unicode
54Character Database.
55
56=cut
57
58my $UNICODEFH;
59my $BLOCKSFH;
60my $SCRIPTSFH;
61my $VERSIONFH;
62my $COMPEXCLFH;
63my $CASEFOLDFH;
64my $CASESPECFH;
65
66sub openunicode {
67    my ($rfh, @path) = @_;
68    my $f;
69    unless (defined $$rfh) {
70	for my $d (@INC) {
71	    use File::Spec;
72	    $f = File::Spec->catfile($d, "unicore", @path);
73	    last if open($$rfh, $f);
74	    undef $f;
75	}
76	croak __PACKAGE__, ": failed to find ",
77              File::Spec->catfile(@path), " in @INC"
78	    unless defined $f;
79    }
80    return $f;
81}
82
83=head2 charinfo
84
85    use Unicode::UCD 'charinfo';
86
87    my $charinfo = charinfo(0x41);
88
89charinfo() returns a reference to a hash that has the following fields
90as defined by the Unicode standard:
91
92    key
93
94    code             code point with at least four hexdigits
95    name             name of the character IN UPPER CASE
96    category         general category of the character
97    combining        classes used in the Canonical Ordering Algorithm
98    bidi             bidirectional category
99    decomposition    character decomposition mapping
100    decimal          if decimal digit this is the integer numeric value
101    digit            if digit this is the numeric value
102    numeric          if numeric is the integer or rational numeric value
103    mirrored         if mirrored in bidirectional text
104    unicode10        Unicode 1.0 name if existed and different
105    comment          ISO 10646 comment field
106    upper            uppercase equivalent mapping
107    lower            lowercase equivalent mapping
108    title            titlecase equivalent mapping
109
110    block            block the character belongs to (used in \p{In...})
111    script           script the character belongs to
112
113If no match is found, a reference to an empty hash is returned.
114
115The C<block> property is the same as returned by charinfo().  It is
116not defined in the Unicode Character Database proper (Chapter 4 of the
117Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database
118(Chapter 14 of TUS3).  Similarly for the C<script> property.
119
120Note that you cannot do (de)composition and casing based solely on the
121above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
122you will need also the compexcl(), casefold(), and casespec() functions.
123
124=cut
125
126sub _getcode {
127    my $arg = shift;
128
129    if ($arg =~ /^\d+$/) {
130	return $arg;
131    } elsif ($arg =~ /^(?:U\+|0x)?([[:xdigit:]]+)$/) {
132	return hex($1);
133    }
134
135    return;
136}
137
138# Lingua::KO::Hangul::Util not part of the standard distribution
139# but it will be used if available.
140
141eval { require Lingua::KO::Hangul::Util };
142my $hasHangulUtil = ! $@;
143if ($hasHangulUtil) {
144    Lingua::KO::Hangul::Util->import();
145}
146
147sub hangul_decomp { # internal: called from charinfo
148    if ($hasHangulUtil) {
149	my @tmp = decomposeHangul(shift);
150	return sprintf("%04X %04X",      @tmp) if @tmp == 2;
151	return sprintf("%04X %04X %04X", @tmp) if @tmp == 3;
152    }
153    return;
154}
155
156sub hangul_charname { # internal: called from charinfo
157    return sprintf("HANGUL SYLLABLE-%04X", shift);
158}
159
160sub han_charname { # internal: called from charinfo
161    return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);
162}
163
164my @CharinfoRanges = (
165# block name
166# [ first, last, coderef to name, coderef to decompose ],
167# CJK Ideographs Extension A
168  [ 0x3400,   0x4DB5,   \&han_charname,   undef  ],
169# CJK Ideographs
170  [ 0x4E00,   0x9FA5,   \&han_charname,   undef  ],
171# Hangul Syllables
172  [ 0xAC00,   0xD7A3,   $hasHangulUtil ? \&getHangulName : \&hangul_charname,  \&hangul_decomp ],
173# Non-Private Use High Surrogates
174  [ 0xD800,   0xDB7F,   undef,   undef  ],
175# Private Use High Surrogates
176  [ 0xDB80,   0xDBFF,   undef,   undef  ],
177# Low Surrogates
178  [ 0xDC00,   0xDFFF,   undef,   undef  ],
179# The Private Use Area
180  [ 0xE000,   0xF8FF,   undef,   undef  ],
181# CJK Ideographs Extension B
182  [ 0x20000,  0x2A6D6,  \&han_charname,   undef  ],
183# Plane 15 Private Use Area
184  [ 0xF0000,  0xFFFFD,  undef,   undef  ],
185# Plane 16 Private Use Area
186  [ 0x100000, 0x10FFFD, undef,   undef  ],
187);
188
189sub charinfo {
190    my $arg  = shift;
191    my $code = _getcode($arg);
192    croak __PACKAGE__, "::charinfo: unknown code '$arg'"
193	unless defined $code;
194    my $hexk = sprintf("%06X", $code);
195    my($rcode,$rname,$rdec);
196    foreach my $range (@CharinfoRanges){
197      if ($range->[0] <= $code && $code <= $range->[1]) {
198        $rcode = $hexk;
199	$rcode =~ s/^0+//;
200	$rcode =  sprintf("%04X", hex($rcode));
201        $rname = $range->[2] ? $range->[2]->($code) : '';
202        $rdec  = $range->[3] ? $range->[3]->($code) : '';
203        $hexk  = sprintf("%06X", $range->[0]); # replace by the first
204        last;
205      }
206    }
207    openunicode(\$UNICODEFH, "UnicodeData.txt");
208    if (defined $UNICODEFH) {
209	use Search::Dict 1.02;
210	if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
211	    my $line = <$UNICODEFH>;
212	    chomp $line;
213	    my %prop;
214	    @prop{qw(
215		     code name category
216		     combining bidi decomposition
217		     decimal digit numeric
218		     mirrored unicode10 comment
219		     upper lower title
220		    )} = split(/;/, $line, -1);
221	    $hexk =~ s/^0+//;
222	    $hexk =  sprintf("%04X", hex($hexk));
223	    if ($prop{code} eq $hexk) {
224		$prop{block}  = charblock($code);
225		$prop{script} = charscript($code);
226		if(defined $rname){
227                    $prop{code} = $rcode;
228                    $prop{name} = $rname;
229                    $prop{decomposition} = $rdec;
230                }
231		return \%prop;
232	    }
233	}
234    }
235    return;
236}
237
238sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
239    my ($table, $lo, $hi, $code) = @_;
240
241    return if $lo > $hi;
242
243    my $mid = int(($lo+$hi) / 2);
244
245    if ($table->[$mid]->[0] < $code) {
246	if ($table->[$mid]->[1] >= $code) {
247	    return $table->[$mid]->[2];
248	} else {
249	    _search($table, $mid + 1, $hi, $code);
250	}
251    } elsif ($table->[$mid]->[0] > $code) {
252	_search($table, $lo, $mid - 1, $code);
253    } else {
254	return $table->[$mid]->[2];
255    }
256}
257
258sub charinrange {
259    my ($range, $arg) = @_;
260    my $code = _getcode($arg);
261    croak __PACKAGE__, "::charinrange: unknown code '$arg'"
262	unless defined $code;
263    _search($range, 0, $#$range, $code);
264}
265
266=head2 charblock
267
268    use Unicode::UCD 'charblock';
269
270    my $charblock = charblock(0x41);
271    my $charblock = charblock(1234);
272    my $charblock = charblock("0x263a");
273    my $charblock = charblock("U+263a");
274
275    my $range     = charblock('Armenian');
276
277With a B<code point argument> charblock() returns the I<block> the character
278belongs to, e.g.  C<Basic Latin>.  Note that not all the character
279positions within all blocks are defined.
280
281See also L</Blocks versus Scripts>.
282
283If supplied with an argument that can't be a code point, charblock() tries
284to do the opposite and interpret the argument as a character block. The
285return value is a I<range>: an anonymous list of lists that contain
286I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
287code point is in a range using the L</charinrange> function. If the
288argument is not a known charater block, C<undef> is returned.
289
290=cut
291
292my @BLOCKS;
293my %BLOCKS;
294
295sub _charblocks {
296    unless (@BLOCKS) {
297	if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
298	    while (<$BLOCKSFH>) {
299		if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
300		    my ($lo, $hi) = (hex($1), hex($2));
301		    my $subrange = [ $lo, $hi, $3 ];
302		    push @BLOCKS, $subrange;
303		    push @{$BLOCKS{$3}}, $subrange;
304		}
305	    }
306	    close($BLOCKSFH);
307	}
308    }
309}
310
311sub charblock {
312    my $arg = shift;
313
314    _charblocks() unless @BLOCKS;
315
316    my $code = _getcode($arg);
317
318    if (defined $code) {
319	_search(\@BLOCKS, 0, $#BLOCKS, $code);
320    } else {
321	if (exists $BLOCKS{$arg}) {
322	    return $BLOCKS{$arg};
323	} else {
324	    return;
325	}
326    }
327}
328
329=head2 charscript
330
331    use Unicode::UCD 'charscript';
332
333    my $charscript = charscript(0x41);
334    my $charscript = charscript(1234);
335    my $charscript = charscript("U+263a");
336
337    my $range      = charscript('Thai');
338
339With a B<code point argument> charscript() returns the I<script> the
340character belongs to, e.g.  C<Latin>, C<Greek>, C<Han>.
341
342See also L</Blocks versus Scripts>.
343
344If supplied with an argument that can't be a code point, charscript() tries
345to do the opposite and interpret the argument as a character script. The
346return value is a I<range>: an anonymous list of lists that contain
347I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
348code point is in a range using the L</charinrange> function. If the
349argument is not a known charater script, C<undef> is returned.
350
351=cut
352
353my @SCRIPTS;
354my %SCRIPTS;
355
356sub _charscripts {
357    unless (@SCRIPTS) {
358	if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
359	    while (<$SCRIPTSFH>) {
360		if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
361		    my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
362		    my $script = lc($3);
363		    $script =~ s/\b(\w)/uc($1)/ge;
364		    my $subrange = [ $lo, $hi, $script ];
365		    push @SCRIPTS, $subrange;
366		    push @{$SCRIPTS{$script}}, $subrange;
367		}
368	    }
369	    close($SCRIPTSFH);
370	    @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
371	}
372    }
373}
374
375sub charscript {
376    my $arg = shift;
377
378    _charscripts() unless @SCRIPTS;
379
380    my $code = _getcode($arg);
381
382    if (defined $code) {
383	_search(\@SCRIPTS, 0, $#SCRIPTS, $code);
384    } else {
385	if (exists $SCRIPTS{$arg}) {
386	    return $SCRIPTS{$arg};
387	} else {
388	    return;
389	}
390    }
391}
392
393=head2 charblocks
394
395    use Unicode::UCD 'charblocks';
396
397    my $charblocks = charblocks();
398
399charblocks() returns a reference to a hash with the known block names
400as the keys, and the code point ranges (see L</charblock>) as the values.
401
402See also L</Blocks versus Scripts>.
403
404=cut
405
406sub charblocks {
407    _charblocks() unless %BLOCKS;
408    return \%BLOCKS;
409}
410
411=head2 charscripts
412
413    use Unicode::UCD 'charscripts';
414
415    my %charscripts = charscripts();
416
417charscripts() returns a hash with the known script names as the keys,
418and the code point ranges (see L</charscript>) as the values.
419
420See also L</Blocks versus Scripts>.
421
422=cut
423
424sub charscripts {
425    _charscripts() unless %SCRIPTS;
426    return \%SCRIPTS;
427}
428
429=head2 Blocks versus Scripts
430
431The difference between a block and a script is that scripts are closer
432to the linguistic notion of a set of characters required to present
433languages, while block is more of an artifact of the Unicode character
434numbering and separation into blocks of (mostly) 256 characters.
435
436For example the Latin B<script> is spread over several B<blocks>, such
437as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
438C<Latin Extended-B>.  On the other hand, the Latin script does not
439contain all the characters of the C<Basic Latin> block (also known as
440the ASCII): it includes only the letters, and not, for example, the digits
441or the punctuation.
442
443For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
444
445For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
446
447=head2 Matching Scripts and Blocks
448
449Scripts are matched with the regular-expression construct
450C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
451while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches
452any of the 256 code points in the Tibetan block).
453
454=head2 Code Point Arguments
455
456A I<code point argument> is either a decimal or a hexadecimal scalar
457designating a Unicode character, or C<U+> followed by hexadecimals
458designating a Unicode character.  Note that Unicode is B<not> limited
459to 16 bits (the number of Unicode characters is open-ended, in theory
460unlimited): you may have more than 4 hexdigits.
461
462=head2 charinrange
463
464In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
465can also test whether a code point is in the I<range> as returned by
466L</charblock> and L</charscript> or as the values of the hash returned
467by L</charblocks> and L</charscripts> by using charinrange():
468
469    use Unicode::UCD qw(charscript charinrange);
470
471    $range = charscript('Hiragana');
472    print "looks like hiragana\n" if charinrange($range, $codepoint);
473
474=cut
475
476=head2 compexcl
477
478    use Unicode::UCD 'compexcl';
479
480    my $compexcl = compexcl("09dc");
481
482The compexcl() returns the composition exclusion (that is, if the
483character should not be produced during a precomposition) of the
484character specified by a B<code point argument>.
485
486If there is a composition exclusion for the character, true is
487returned.  Otherwise, false is returned.
488
489=cut
490
491my %COMPEXCL;
492
493sub _compexcl {
494    unless (%COMPEXCL) {
495	if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) {
496	    while (<$COMPEXCLFH>) {
497		if (/^([0-9A-F]+)\s+\#\s+/) {
498		    my $code = hex($1);
499		    $COMPEXCL{$code} = undef;
500		}
501	    }
502	    close($COMPEXCLFH);
503	}
504    }
505}
506
507sub compexcl {
508    my $arg  = shift;
509    my $code = _getcode($arg);
510    croak __PACKAGE__, "::compexcl: unknown code '$arg'"
511	unless defined $code;
512
513    _compexcl() unless %COMPEXCL;
514
515    return exists $COMPEXCL{$code};
516}
517
518=head2 casefold
519
520    use Unicode::UCD 'casefold';
521
522    my %casefold = casefold("09dc");
523
524The casefold() returns the locale-independent case folding of the
525character specified by a B<code point argument>.
526
527If there is a case folding for that character, a reference to a hash
528with the following fields is returned:
529
530    key
531
532    code             code point with at least four hexdigits
533    status           "C", "F", "S", or "I"
534    mapping          one or more codes separated by spaces
535
536The meaning of the I<status> is as follows:
537
538   C                 common case folding, common mappings shared
539                     by both simple and full mappings
540   F                 full case folding, mappings that cause strings
541                     to grow in length. Multiple characters are separated
542                     by spaces
543   S                 simple case folding, mappings to single characters
544                     where different from F
545   I                 special case for dotted uppercase I and
546                     dotless lowercase i
547                     - If this mapping is included, the result is
548                       case-insensitive, but dotless and dotted I's
549                       are not distinguished
550                     - If this mapping is excluded, the result is not
551                       fully case-insensitive, but dotless and dotted
552                       I's are distinguished
553
554If there is no case folding for that character, C<undef> is returned.
555
556For more information about case mappings see
557http://www.unicode.org/unicode/reports/tr21/
558
559=cut
560
561my %CASEFOLD;
562
563sub _casefold {
564    unless (%CASEFOLD) {
565	if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) {
566	    while (<$CASEFOLDFH>) {
567		if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
568		    my $code = hex($1);
569		    $CASEFOLD{$code} = { code    => $1,
570					 status  => $2,
571					 mapping => $3 };
572		}
573	    }
574	    close($CASEFOLDFH);
575	}
576    }
577}
578
579sub casefold {
580    my $arg  = shift;
581    my $code = _getcode($arg);
582    croak __PACKAGE__, "::casefold: unknown code '$arg'"
583	unless defined $code;
584
585    _casefold() unless %CASEFOLD;
586
587    return $CASEFOLD{$code};
588}
589
590=head2 casespec
591
592    use Unicode::UCD 'casespec';
593
594    my %casespec = casespec("09dc");
595
596The casespec() returns the potentially locale-dependent case mapping
597of the character specified by a B<code point argument>.  The mapping
598may change the length of the string (which the basic Unicode case
599mappings as returned by charinfo() never do).
600
601If there is a case folding for that character, a reference to a hash
602with the following fields is returned:
603
604    key
605
606    code             code point with at least four hexdigits
607    lower            lowercase
608    title            titlecase
609    upper            uppercase
610    condition        condition list (may be undef)
611
612The C<condition> is optional.  Where present, it consists of one or
613more I<locales> or I<contexts>, separated by spaces (other than as
614used to separate elements, spaces are to be ignored).  A condition
615list overrides the normal behavior if all of the listed conditions are
616true.  Case distinctions in the condition list are not significant.
617Conditions preceded by "NON_" represent the negation of the condition
618
619Note that when there are multiple case folding definitions for a
620single code point because of different locales, the value returned by
621casespec() is a hash reference which has the locales as the keys and
622hash references as described above as the values.
623
624A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
625followed by a "_" and a 2-letter ISO language code (possibly followed
626by a "_" and a variant code).  You can find the lists of those codes,
627see L<Locale::Country> and L<Locale::Language>.
628
629A I<context> is one of the following choices:
630
631    FINAL            The letter is not followed by a letter of
632                     general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
633    MODERN           The mapping is only used for modern text
634    AFTER_i          The last base character was "i" (U+0069)
635
636For more information about case mappings see
637http://www.unicode.org/unicode/reports/tr21/
638
639=cut
640
641my %CASESPEC;
642
643sub _casespec {
644    unless (%CASESPEC) {
645	if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
646	    while (<$CASESPECFH>) {
647		if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
648		    my ($hexcode, $lower, $title, $upper, $condition) =
649			($1, $2, $3, $4, $5);
650		    my $code = hex($hexcode);
651		    if (exists $CASESPEC{$code}) {
652			if (exists $CASESPEC{$code}->{code}) {
653			    my ($oldlower,
654				$oldtitle,
655				$oldupper,
656				$oldcondition) =
657				    @{$CASESPEC{$code}}{qw(lower
658							   title
659							   upper
660							   condition)};
661			    if (defined $oldcondition) {
662				my ($oldlocale) =
663				($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
664				delete $CASESPEC{$code};
665				$CASESPEC{$code}->{$oldlocale} =
666				{ code      => $hexcode,
667				  lower     => $oldlower,
668				  title     => $oldtitle,
669				  upper     => $oldupper,
670				  condition => $oldcondition };
671			    }
672			}
673			my ($locale) =
674			    ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
675			$CASESPEC{$code}->{$locale} =
676			{ code      => $hexcode,
677			  lower     => $lower,
678			  title     => $title,
679			  upper     => $upper,
680			  condition => $condition };
681		    } else {
682			$CASESPEC{$code} =
683			{ code      => $hexcode,
684			  lower     => $lower,
685			  title     => $title,
686			  upper     => $upper,
687			  condition => $condition };
688		    }
689		}
690	    }
691	    close($CASESPECFH);
692	}
693    }
694}
695
696sub casespec {
697    my $arg  = shift;
698    my $code = _getcode($arg);
699    croak __PACKAGE__, "::casespec: unknown code '$arg'"
700	unless defined $code;
701
702    _casespec() unless %CASESPEC;
703
704    return $CASESPEC{$code};
705}
706
707=head2 Unicode::UCD::UnicodeVersion
708
709Unicode::UCD::UnicodeVersion() returns the version of the Unicode
710Character Database, in other words, the version of the Unicode
711standard the database implements.  The version is a string
712of numbers delimited by dots (C<'.'>).
713
714=cut
715
716my $UNICODEVERSION;
717
718sub UnicodeVersion {
719    unless (defined $UNICODEVERSION) {
720	openunicode(\$VERSIONFH, "version");
721	chomp($UNICODEVERSION = <$VERSIONFH>);
722	close($VERSIONFH);
723	croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
724	    unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
725    }
726    return $UNICODEVERSION;
727}
728
729=head2 Implementation Note
730
731The first use of charinfo() opens a read-only filehandle to the Unicode
732Character Database (the database is included in the Perl distribution).
733The filehandle is then kept open for further queries.  In other words,
734if you are wondering where one of your filehandles went, that's where.
735
736=head1 BUGS
737
738Does not yet support EBCDIC platforms.
739
740=head1 AUTHOR
741
742Jarkko Hietaniemi
743
744=cut
745
7461;
747