xref: /openbsd-src/gnu/usr.bin/perl/lib/Unicode/UCD.pm (revision a28daedfc357b214be5c701aa8ba8adb29a7f1c2)
1package Unicode::UCD;
2
3use strict;
4use warnings;
5
6our $VERSION = '0.25';
7
8use Storable qw(dclone);
9
10require Exporter;
11
12our @ISA = qw(Exporter);
13
14our @EXPORT_OK = qw(charinfo
15		    charblock charscript
16		    charblocks charscripts
17		    charinrange
18		    general_categories bidi_types
19		    compexcl
20		    casefold casespec
21		    namedseq);
22
23use Carp;
24
25=head1 NAME
26
27Unicode::UCD - Unicode character database
28
29=head1 SYNOPSIS
30
31    use Unicode::UCD 'charinfo';
32    my $charinfo   = charinfo($codepoint);
33
34    use Unicode::UCD 'charblock';
35    my $charblock  = charblock($codepoint);
36
37    use Unicode::UCD 'charscript';
38    my $charscript = charscript($codepoint);
39
40    use Unicode::UCD 'charblocks';
41    my $charblocks = charblocks();
42
43    use Unicode::UCD 'charscripts';
44    my $charscripts = charscripts();
45
46    use Unicode::UCD qw(charscript charinrange);
47    my $range = charscript($script);
48    print "looks like $script\n" if charinrange($range, $codepoint);
49
50    use Unicode::UCD qw(general_categories bidi_types);
51    my $categories = general_categories();
52    my $types = bidi_types();
53
54    use Unicode::UCD 'compexcl';
55    my $compexcl = compexcl($codepoint);
56
57    use Unicode::UCD 'namedseq';
58    my $namedseq = namedseq($named_sequence_name);
59
60    my $unicode_version = Unicode::UCD::UnicodeVersion();
61
62=head1 DESCRIPTION
63
64The Unicode::UCD module offers a simple interface to the Unicode
65Character Database.
66
67=cut
68
69my $UNICODEFH;
70my $BLOCKSFH;
71my $SCRIPTSFH;
72my $VERSIONFH;
73my $COMPEXCLFH;
74my $CASEFOLDFH;
75my $CASESPECFH;
76my $NAMEDSEQFH;
77
78sub openunicode {
79    my ($rfh, @path) = @_;
80    my $f;
81    unless (defined $$rfh) {
82	for my $d (@INC) {
83	    use File::Spec;
84	    $f = File::Spec->catfile($d, "unicore", @path);
85	    last if open($$rfh, $f);
86	    undef $f;
87	}
88	croak __PACKAGE__, ": failed to find ",
89              File::Spec->catfile(@path), " in @INC"
90	    unless defined $f;
91    }
92    return $f;
93}
94
95=head2 charinfo
96
97    use Unicode::UCD 'charinfo';
98
99    my $charinfo = charinfo(0x41);
100
101charinfo() returns a reference to a hash that has the following fields
102as defined by the Unicode standard:
103
104    key
105
106    code             code point with at least four hexdigits
107    name             name of the character IN UPPER CASE
108    category         general category of the character
109    combining        classes used in the Canonical Ordering Algorithm
110    bidi             bidirectional type
111    decomposition    character decomposition mapping
112    decimal          if decimal digit this is the integer numeric value
113    digit            if digit this is the numeric value
114    numeric          if numeric is the integer or rational numeric value
115    mirrored         if mirrored in bidirectional text
116    unicode10        Unicode 1.0 name if existed and different
117    comment          ISO 10646 comment field
118    upper            uppercase equivalent mapping
119    lower            lowercase equivalent mapping
120    title            titlecase equivalent mapping
121
122    block            block the character belongs to (used in \p{In...})
123    script           script the character belongs to
124
125If no match is found, a reference to an empty hash is returned.
126
127The C<block> property is the same as returned by charinfo().  It is
128not defined in the Unicode Character Database proper (Chapter 4 of the
129Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database
130(Chapter 14 of TUS3).  Similarly for the C<script> property.
131
132Note that you cannot do (de)composition and casing based solely on the
133above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
134you will need also the compexcl(), casefold(), and casespec() functions.
135
136=cut
137
138# NB: This function is duplicated in charnames.pm
139sub _getcode {
140    my $arg = shift;
141
142    if ($arg =~ /^[1-9]\d*$/) {
143	return $arg;
144    } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) {
145	return hex($1);
146    }
147
148    return;
149}
150
151# Lingua::KO::Hangul::Util not part of the standard distribution
152# but it will be used if available.
153
154eval { require Lingua::KO::Hangul::Util };
155my $hasHangulUtil = ! $@;
156if ($hasHangulUtil) {
157    Lingua::KO::Hangul::Util->import();
158}
159
160sub hangul_decomp { # internal: called from charinfo
161    if ($hasHangulUtil) {
162	my @tmp = decomposeHangul(shift);
163	return sprintf("%04X %04X",      @tmp) if @tmp == 2;
164	return sprintf("%04X %04X %04X", @tmp) if @tmp == 3;
165    }
166    return;
167}
168
169sub hangul_charname { # internal: called from charinfo
170    return sprintf("HANGUL SYLLABLE-%04X", shift);
171}
172
173sub han_charname { # internal: called from charinfo
174    return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);
175}
176
177my @CharinfoRanges = (
178# block name
179# [ first, last, coderef to name, coderef to decompose ],
180# CJK Ideographs Extension A
181  [ 0x3400,   0x4DB5,   \&han_charname,   undef  ],
182# CJK Ideographs
183  [ 0x4E00,   0x9FA5,   \&han_charname,   undef  ],
184# Hangul Syllables
185  [ 0xAC00,   0xD7A3,   $hasHangulUtil ? \&getHangulName : \&hangul_charname,  \&hangul_decomp ],
186# Non-Private Use High Surrogates
187  [ 0xD800,   0xDB7F,   undef,   undef  ],
188# Private Use High Surrogates
189  [ 0xDB80,   0xDBFF,   undef,   undef  ],
190# Low Surrogates
191  [ 0xDC00,   0xDFFF,   undef,   undef  ],
192# The Private Use Area
193  [ 0xE000,   0xF8FF,   undef,   undef  ],
194# CJK Ideographs Extension B
195  [ 0x20000,  0x2A6D6,  \&han_charname,   undef  ],
196# Plane 15 Private Use Area
197  [ 0xF0000,  0xFFFFD,  undef,   undef  ],
198# Plane 16 Private Use Area
199  [ 0x100000, 0x10FFFD, undef,   undef  ],
200);
201
202sub charinfo {
203    my $arg  = shift;
204    my $code = _getcode($arg);
205    croak __PACKAGE__, "::charinfo: unknown code '$arg'"
206	unless defined $code;
207    my $hexk = sprintf("%06X", $code);
208    my($rcode,$rname,$rdec);
209    foreach my $range (@CharinfoRanges){
210      if ($range->[0] <= $code && $code <= $range->[1]) {
211        $rcode = $hexk;
212	$rcode =~ s/^0+//;
213	$rcode =  sprintf("%04X", hex($rcode));
214        $rname = $range->[2] ? $range->[2]->($code) : '';
215        $rdec  = $range->[3] ? $range->[3]->($code) : '';
216        $hexk  = sprintf("%06X", $range->[0]); # replace by the first
217        last;
218      }
219    }
220    openunicode(\$UNICODEFH, "UnicodeData.txt");
221    if (defined $UNICODEFH) {
222	use Search::Dict 1.02;
223	if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
224	    my $line = <$UNICODEFH>;
225	    return unless defined $line;
226	    chomp $line;
227	    my %prop;
228	    @prop{qw(
229		     code name category
230		     combining bidi decomposition
231		     decimal digit numeric
232		     mirrored unicode10 comment
233		     upper lower title
234		    )} = split(/;/, $line, -1);
235	    $hexk =~ s/^0+//;
236	    $hexk =  sprintf("%04X", hex($hexk));
237	    if ($prop{code} eq $hexk) {
238		$prop{block}  = charblock($code);
239		$prop{script} = charscript($code);
240		if(defined $rname){
241                    $prop{code} = $rcode;
242                    $prop{name} = $rname;
243                    $prop{decomposition} = $rdec;
244                }
245		return \%prop;
246	    }
247	}
248    }
249    return;
250}
251
252sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
253    my ($table, $lo, $hi, $code) = @_;
254
255    return if $lo > $hi;
256
257    my $mid = int(($lo+$hi) / 2);
258
259    if ($table->[$mid]->[0] < $code) {
260	if ($table->[$mid]->[1] >= $code) {
261	    return $table->[$mid]->[2];
262	} else {
263	    _search($table, $mid + 1, $hi, $code);
264	}
265    } elsif ($table->[$mid]->[0] > $code) {
266	_search($table, $lo, $mid - 1, $code);
267    } else {
268	return $table->[$mid]->[2];
269    }
270}
271
272sub charinrange {
273    my ($range, $arg) = @_;
274    my $code = _getcode($arg);
275    croak __PACKAGE__, "::charinrange: unknown code '$arg'"
276	unless defined $code;
277    _search($range, 0, $#$range, $code);
278}
279
280=head2 charblock
281
282    use Unicode::UCD 'charblock';
283
284    my $charblock = charblock(0x41);
285    my $charblock = charblock(1234);
286    my $charblock = charblock("0x263a");
287    my $charblock = charblock("U+263a");
288
289    my $range     = charblock('Armenian');
290
291With a B<code point argument> charblock() returns the I<block> the character
292belongs to, e.g.  C<Basic Latin>.  Note that not all the character
293positions within all blocks are defined.
294
295See also L</Blocks versus Scripts>.
296
297If supplied with an argument that can't be a code point, charblock() tries
298to do the opposite and interpret the argument as a character block. The
299return value is a I<range>: an anonymous list of lists that contain
300I<start-of-range>, I<end-of-range> code point pairs. You can test whether
301a code point is in a range using the L</charinrange> function. If the
302argument is not a known character block, C<undef> is returned.
303
304=cut
305
306my @BLOCKS;
307my %BLOCKS;
308
309sub _charblocks {
310    unless (@BLOCKS) {
311	if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
312	    local $_;
313	    while (<$BLOCKSFH>) {
314		if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
315		    my ($lo, $hi) = (hex($1), hex($2));
316		    my $subrange = [ $lo, $hi, $3 ];
317		    push @BLOCKS, $subrange;
318		    push @{$BLOCKS{$3}}, $subrange;
319		}
320	    }
321	    close($BLOCKSFH);
322	}
323    }
324}
325
326sub charblock {
327    my $arg = shift;
328
329    _charblocks() unless @BLOCKS;
330
331    my $code = _getcode($arg);
332
333    if (defined $code) {
334	_search(\@BLOCKS, 0, $#BLOCKS, $code);
335    } else {
336	if (exists $BLOCKS{$arg}) {
337	    return dclone $BLOCKS{$arg};
338	} else {
339	    return;
340	}
341    }
342}
343
344=head2 charscript
345
346    use Unicode::UCD 'charscript';
347
348    my $charscript = charscript(0x41);
349    my $charscript = charscript(1234);
350    my $charscript = charscript("U+263a");
351
352    my $range      = charscript('Thai');
353
354With a B<code point argument> charscript() returns the I<script> the
355character belongs to, e.g.  C<Latin>, C<Greek>, C<Han>.
356
357See also L</Blocks versus Scripts>.
358
359If supplied with an argument that can't be a code point, charscript() tries
360to do the opposite and interpret the argument as a character script. The
361return value is a I<range>: an anonymous list of lists that contain
362I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
363code point is in a range using the L</charinrange> function. If the
364argument is not a known character script, C<undef> is returned.
365
366=cut
367
368my @SCRIPTS;
369my %SCRIPTS;
370
371sub _charscripts {
372    unless (@SCRIPTS) {
373	if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
374	    local $_;
375	    while (<$SCRIPTSFH>) {
376		if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
377		    my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
378		    my $script = lc($3);
379		    $script =~ s/\b(\w)/uc($1)/ge;
380		    my $subrange = [ $lo, $hi, $script ];
381		    push @SCRIPTS, $subrange;
382		    push @{$SCRIPTS{$script}}, $subrange;
383		}
384	    }
385	    close($SCRIPTSFH);
386	    @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
387	}
388    }
389}
390
391sub charscript {
392    my $arg = shift;
393
394    _charscripts() unless @SCRIPTS;
395
396    my $code = _getcode($arg);
397
398    if (defined $code) {
399	_search(\@SCRIPTS, 0, $#SCRIPTS, $code);
400    } else {
401	if (exists $SCRIPTS{$arg}) {
402	    return dclone $SCRIPTS{$arg};
403	} else {
404	    return;
405	}
406    }
407}
408
409=head2 charblocks
410
411    use Unicode::UCD 'charblocks';
412
413    my $charblocks = charblocks();
414
415charblocks() returns a reference to a hash with the known block names
416as the keys, and the code point ranges (see L</charblock>) as the values.
417
418See also L</Blocks versus Scripts>.
419
420=cut
421
422sub charblocks {
423    _charblocks() unless %BLOCKS;
424    return dclone \%BLOCKS;
425}
426
427=head2 charscripts
428
429    use Unicode::UCD 'charscripts';
430
431    my $charscripts = charscripts();
432
433charscripts() returns a reference to a hash with the known script
434names as the keys, and the code point ranges (see L</charscript>) as
435the values.
436
437See also L</Blocks versus Scripts>.
438
439=cut
440
441sub charscripts {
442    _charscripts() unless %SCRIPTS;
443    return dclone \%SCRIPTS;
444}
445
446=head2 Blocks versus Scripts
447
448The difference between a block and a script is that scripts are closer
449to the linguistic notion of a set of characters required to present
450languages, while block is more of an artifact of the Unicode character
451numbering and separation into blocks of (mostly) 256 characters.
452
453For example the Latin B<script> is spread over several B<blocks>, such
454as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
455C<Latin Extended-B>.  On the other hand, the Latin script does not
456contain all the characters of the C<Basic Latin> block (also known as
457the ASCII): it includes only the letters, and not, for example, the digits
458or the punctuation.
459
460For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
461
462For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
463
464=head2 Matching Scripts and Blocks
465
466Scripts are matched with the regular-expression construct
467C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
468while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches
469any of the 256 code points in the Tibetan block).
470
471=head2 Code Point Arguments
472
473A I<code point argument> is either a decimal or a hexadecimal scalar
474designating a Unicode character, or C<U+> followed by hexadecimals
475designating a Unicode character.  In other words, if you want a code
476point to be interpreted as a hexadecimal number, you must prefix it
477with either C<0x> or C<U+>, because a string like e.g. C<123> will
478be interpreted as a decimal code point.  Also note that Unicode is
479B<not> limited to 16 bits (the number of Unicode characters is
480open-ended, in theory unlimited): you may have more than 4 hexdigits.
481
482=head2 charinrange
483
484In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
485can also test whether a code point is in the I<range> as returned by
486L</charblock> and L</charscript> or as the values of the hash returned
487by L</charblocks> and L</charscripts> by using charinrange():
488
489    use Unicode::UCD qw(charscript charinrange);
490
491    $range = charscript('Hiragana');
492    print "looks like hiragana\n" if charinrange($range, $codepoint);
493
494=cut
495
496my %GENERAL_CATEGORIES =
497 (
498    'L'  =>         'Letter',
499    'LC' =>         'CasedLetter',
500    'Lu' =>         'UppercaseLetter',
501    'Ll' =>         'LowercaseLetter',
502    'Lt' =>         'TitlecaseLetter',
503    'Lm' =>         'ModifierLetter',
504    'Lo' =>         'OtherLetter',
505    'M'  =>         'Mark',
506    'Mn' =>         'NonspacingMark',
507    'Mc' =>         'SpacingMark',
508    'Me' =>         'EnclosingMark',
509    'N'  =>         'Number',
510    'Nd' =>         'DecimalNumber',
511    'Nl' =>         'LetterNumber',
512    'No' =>         'OtherNumber',
513    'P'  =>         'Punctuation',
514    'Pc' =>         'ConnectorPunctuation',
515    'Pd' =>         'DashPunctuation',
516    'Ps' =>         'OpenPunctuation',
517    'Pe' =>         'ClosePunctuation',
518    'Pi' =>         'InitialPunctuation',
519    'Pf' =>         'FinalPunctuation',
520    'Po' =>         'OtherPunctuation',
521    'S'  =>         'Symbol',
522    'Sm' =>         'MathSymbol',
523    'Sc' =>         'CurrencySymbol',
524    'Sk' =>         'ModifierSymbol',
525    'So' =>         'OtherSymbol',
526    'Z'  =>         'Separator',
527    'Zs' =>         'SpaceSeparator',
528    'Zl' =>         'LineSeparator',
529    'Zp' =>         'ParagraphSeparator',
530    'C'  =>         'Other',
531    'Cc' =>         'Control',
532    'Cf' =>         'Format',
533    'Cs' =>         'Surrogate',
534    'Co' =>         'PrivateUse',
535    'Cn' =>         'Unassigned',
536 );
537
538sub general_categories {
539    return dclone \%GENERAL_CATEGORIES;
540}
541
542=head2 general_categories
543
544    use Unicode::UCD 'general_categories';
545
546    my $categories = general_categories();
547
548The general_categories() returns a reference to a hash which has short
549general category names (such as C<Lu>, C<Nd>, C<Zs>, C<S>) as keys and long
550names (such as C<UppercaseLetter>, C<DecimalNumber>, C<SpaceSeparator>,
551C<Symbol>) as values.  The hash is reversible in case you need to go
552from the long names to the short names.  The general category is the
553one returned from charinfo() under the C<category> key.
554
555=cut
556
557my %BIDI_TYPES =
558 (
559   'L'   => 'Left-to-Right',
560   'LRE' => 'Left-to-Right Embedding',
561   'LRO' => 'Left-to-Right Override',
562   'R'   => 'Right-to-Left',
563   'AL'  => 'Right-to-Left Arabic',
564   'RLE' => 'Right-to-Left Embedding',
565   'RLO' => 'Right-to-Left Override',
566   'PDF' => 'Pop Directional Format',
567   'EN'  => 'European Number',
568   'ES'  => 'European Number Separator',
569   'ET'  => 'European Number Terminator',
570   'AN'  => 'Arabic Number',
571   'CS'  => 'Common Number Separator',
572   'NSM' => 'Non-Spacing Mark',
573   'BN'  => 'Boundary Neutral',
574   'B'   => 'Paragraph Separator',
575   'S'   => 'Segment Separator',
576   'WS'  => 'Whitespace',
577   'ON'  => 'Other Neutrals',
578 );
579
580sub bidi_types {
581    return dclone \%BIDI_TYPES;
582}
583
584=head2 bidi_types
585
586    use Unicode::UCD 'bidi_types';
587
588    my $categories = bidi_types();
589
590The bidi_types() returns a reference to a hash which has the short
591bidi (bidirectional) type names (such as C<L>, C<R>) as keys and long
592names (such as C<Left-to-Right>, C<Right-to-Left>) as values.  The
593hash is reversible in case you need to go from the long names to the
594short names.  The bidi type is the one returned from charinfo()
595under the C<bidi> key.  For the exact meaning of the various bidi classes
596the Unicode TR9 is recommended reading:
597http://www.unicode.org/reports/tr9/tr9-17.html
598(as of Unicode 5.0.0)
599
600=cut
601
602=head2 compexcl
603
604    use Unicode::UCD 'compexcl';
605
606    my $compexcl = compexcl("09dc");
607
608The compexcl() returns the composition exclusion (that is, if the
609character should not be produced during a precomposition) of the
610character specified by a B<code point argument>.
611
612If there is a composition exclusion for the character, true is
613returned.  Otherwise, false is returned.
614
615=cut
616
617my %COMPEXCL;
618
619sub _compexcl {
620    unless (%COMPEXCL) {
621	if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) {
622	    local $_;
623	    while (<$COMPEXCLFH>) {
624		if (/^([0-9A-F]+)\s+\#\s+/) {
625		    my $code = hex($1);
626		    $COMPEXCL{$code} = undef;
627		}
628	    }
629	    close($COMPEXCLFH);
630	}
631    }
632}
633
634sub compexcl {
635    my $arg  = shift;
636    my $code = _getcode($arg);
637    croak __PACKAGE__, "::compexcl: unknown code '$arg'"
638	unless defined $code;
639
640    _compexcl() unless %COMPEXCL;
641
642    return exists $COMPEXCL{$code};
643}
644
645=head2 casefold
646
647    use Unicode::UCD 'casefold';
648
649    my $casefold = casefold("00DF");
650
651The casefold() returns the locale-independent case folding of the
652character specified by a B<code point argument>.
653
654If there is a case folding for that character, a reference to a hash
655with the following fields is returned:
656
657    key
658
659    code             code point with at least four hexdigits
660    status           "C", "F", "S", or "I"
661    mapping          one or more codes separated by spaces
662
663The meaning of the I<status> is as follows:
664
665   C                 common case folding, common mappings shared
666                     by both simple and full mappings
667   F                 full case folding, mappings that cause strings
668                     to grow in length. Multiple characters are separated
669                     by spaces
670   S                 simple case folding, mappings to single characters
671                     where different from F
672   I                 special case for dotted uppercase I and
673                     dotless lowercase i
674                     - If this mapping is included, the result is
675                       case-insensitive, but dotless and dotted I's
676                       are not distinguished
677                     - If this mapping is excluded, the result is not
678                       fully case-insensitive, but dotless and dotted
679                       I's are distinguished
680
681If there is no case folding for that character, C<undef> is returned.
682
683For more information about case mappings see
684http://www.unicode.org/unicode/reports/tr21/
685
686=cut
687
688my %CASEFOLD;
689
690sub _casefold {
691    unless (%CASEFOLD) {
692	if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) {
693	    local $_;
694	    while (<$CASEFOLDFH>) {
695		if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
696		    my $code = hex($1);
697		    $CASEFOLD{$code} = { code    => $1,
698					 status  => $2,
699					 mapping => $3 };
700		}
701	    }
702	    close($CASEFOLDFH);
703	}
704    }
705}
706
707sub casefold {
708    my $arg  = shift;
709    my $code = _getcode($arg);
710    croak __PACKAGE__, "::casefold: unknown code '$arg'"
711	unless defined $code;
712
713    _casefold() unless %CASEFOLD;
714
715    return $CASEFOLD{$code};
716}
717
718=head2 casespec
719
720    use Unicode::UCD 'casespec';
721
722    my $casespec = casespec("FB00");
723
724The casespec() returns the potentially locale-dependent case mapping
725of the character specified by a B<code point argument>.  The mapping
726may change the length of the string (which the basic Unicode case
727mappings as returned by charinfo() never do).
728
729If there is a case folding for that character, a reference to a hash
730with the following fields is returned:
731
732    key
733
734    code             code point with at least four hexdigits
735    lower            lowercase
736    title            titlecase
737    upper            uppercase
738    condition        condition list (may be undef)
739
740The C<condition> is optional.  Where present, it consists of one or
741more I<locales> or I<contexts>, separated by spaces (other than as
742used to separate elements, spaces are to be ignored).  A condition
743list overrides the normal behavior if all of the listed conditions are
744true.  Case distinctions in the condition list are not significant.
745Conditions preceded by "NON_" represent the negation of the condition.
746
747Note that when there are multiple case folding definitions for a
748single code point because of different locales, the value returned by
749casespec() is a hash reference which has the locales as the keys and
750hash references as described above as the values.
751
752A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
753followed by a "_" and a 2-letter ISO language code (possibly followed
754by a "_" and a variant code).  You can find the lists of those codes,
755see L<Locale::Country> and L<Locale::Language>.
756
757A I<context> is one of the following choices:
758
759    FINAL            The letter is not followed by a letter of
760                     general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
761    MODERN           The mapping is only used for modern text
762    AFTER_i          The last base character was "i" (U+0069)
763
764For more information about case mappings see
765http://www.unicode.org/unicode/reports/tr21/
766
767=cut
768
769my %CASESPEC;
770
771sub _casespec {
772    unless (%CASESPEC) {
773	if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
774	    local $_;
775	    while (<$CASESPECFH>) {
776		if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
777		    my ($hexcode, $lower, $title, $upper, $condition) =
778			($1, $2, $3, $4, $5);
779		    my $code = hex($hexcode);
780		    if (exists $CASESPEC{$code}) {
781			if (exists $CASESPEC{$code}->{code}) {
782			    my ($oldlower,
783				$oldtitle,
784				$oldupper,
785				$oldcondition) =
786				    @{$CASESPEC{$code}}{qw(lower
787							   title
788							   upper
789							   condition)};
790			    if (defined $oldcondition) {
791				my ($oldlocale) =
792				($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
793				delete $CASESPEC{$code};
794				$CASESPEC{$code}->{$oldlocale} =
795				{ code      => $hexcode,
796				  lower     => $oldlower,
797				  title     => $oldtitle,
798				  upper     => $oldupper,
799				  condition => $oldcondition };
800			    }
801			}
802			my ($locale) =
803			    ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
804			$CASESPEC{$code}->{$locale} =
805			{ code      => $hexcode,
806			  lower     => $lower,
807			  title     => $title,
808			  upper     => $upper,
809			  condition => $condition };
810		    } else {
811			$CASESPEC{$code} =
812			{ code      => $hexcode,
813			  lower     => $lower,
814			  title     => $title,
815			  upper     => $upper,
816			  condition => $condition };
817		    }
818		}
819	    }
820	    close($CASESPECFH);
821	}
822    }
823}
824
825sub casespec {
826    my $arg  = shift;
827    my $code = _getcode($arg);
828    croak __PACKAGE__, "::casespec: unknown code '$arg'"
829	unless defined $code;
830
831    _casespec() unless %CASESPEC;
832
833    return ref $CASESPEC{$code} ? dclone $CASESPEC{$code} : $CASESPEC{$code};
834}
835
836=head2 namedseq()
837
838    use Unicode::UCD 'namedseq';
839
840    my $namedseq = namedseq("KATAKANA LETTER AINU P");
841    my @namedseq = namedseq("KATAKANA LETTER AINU P");
842    my %namedseq = namedseq();
843
844If used with a single argument in a scalar context, returns the string
845consisting of the code points of the named sequence, or C<undef> if no
846named sequence by that name exists.  If used with a single argument in
847a list context, returns list of the code points.  If used with no
848arguments in a list context, returns a hash with the names of the
849named sequences as the keys and the named sequences as strings as
850the values.  Otherwise, returns C<undef> or empty list depending
851on the context.
852
853(New from Unicode 4.1.0)
854
855=cut
856
857my %NAMEDSEQ;
858
859sub _namedseq {
860    unless (%NAMEDSEQ) {
861	if (openunicode(\$NAMEDSEQFH, "NamedSequences.txt")) {
862	    local $_;
863	    while (<$NAMEDSEQFH>) {
864		if (/^(.+)\s*;\s*([0-9A-F]+(?: [0-9A-F]+)*)$/) {
865		    my ($n, $s) = ($1, $2);
866		    my @s = map { chr(hex($_)) } split(' ', $s);
867		    $NAMEDSEQ{$n} = join("", @s);
868		}
869	    }
870	    close($NAMEDSEQFH);
871	}
872    }
873}
874
875sub namedseq {
876    _namedseq() unless %NAMEDSEQ;
877    my $wantarray = wantarray();
878    if (defined $wantarray) {
879	if ($wantarray) {
880	    if (@_ == 0) {
881		return %NAMEDSEQ;
882	    } elsif (@_ == 1) {
883		my $s = $NAMEDSEQ{ $_[0] };
884		return defined $s ? map { ord($_) } split('', $s) : ();
885	    }
886	} elsif (@_ == 1) {
887	    return $NAMEDSEQ{ $_[0] };
888	}
889    }
890    return;
891}
892
893=head2 Unicode::UCD::UnicodeVersion
894
895Unicode::UCD::UnicodeVersion() returns the version of the Unicode
896Character Database, in other words, the version of the Unicode
897standard the database implements.  The version is a string
898of numbers delimited by dots (C<'.'>).
899
900=cut
901
902my $UNICODEVERSION;
903
904sub UnicodeVersion {
905    unless (defined $UNICODEVERSION) {
906	openunicode(\$VERSIONFH, "version");
907	chomp($UNICODEVERSION = <$VERSIONFH>);
908	close($VERSIONFH);
909	croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
910	    unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
911    }
912    return $UNICODEVERSION;
913}
914
915=head2 Implementation Note
916
917The first use of charinfo() opens a read-only filehandle to the Unicode
918Character Database (the database is included in the Perl distribution).
919The filehandle is then kept open for further queries.  In other words,
920if you are wondering where one of your filehandles went, that's where.
921
922=head1 BUGS
923
924Does not yet support EBCDIC platforms.
925
926=head1 AUTHOR
927
928Jarkko Hietaniemi
929
930=cut
931
9321;
933