xref: /onnv-gate/usr/src/cmd/perl/5.8.4/distrib/lib/Unicode/UCD.pm (revision 0:68f95e015346)
1*0Sstevel@tonic-gatepackage Unicode::UCD;
2*0Sstevel@tonic-gate
3*0Sstevel@tonic-gateuse strict;
4*0Sstevel@tonic-gateuse warnings;
5*0Sstevel@tonic-gate
6*0Sstevel@tonic-gateour $VERSION = '0.22';
7*0Sstevel@tonic-gate
8*0Sstevel@tonic-gateuse Storable qw(dclone);
9*0Sstevel@tonic-gate
10*0Sstevel@tonic-gaterequire Exporter;
11*0Sstevel@tonic-gate
12*0Sstevel@tonic-gateour @ISA = qw(Exporter);
13*0Sstevel@tonic-gate
14*0Sstevel@tonic-gateour @EXPORT_OK = qw(charinfo
15*0Sstevel@tonic-gate		    charblock charscript
16*0Sstevel@tonic-gate		    charblocks charscripts
17*0Sstevel@tonic-gate		    charinrange
18*0Sstevel@tonic-gate		    compexcl
19*0Sstevel@tonic-gate		    casefold casespec);
20*0Sstevel@tonic-gate
21*0Sstevel@tonic-gateuse Carp;
22*0Sstevel@tonic-gate
23*0Sstevel@tonic-gate=head1 NAME
24*0Sstevel@tonic-gate
25*0Sstevel@tonic-gateUnicode::UCD - Unicode character database
26*0Sstevel@tonic-gate
27*0Sstevel@tonic-gate=head1 SYNOPSIS
28*0Sstevel@tonic-gate
29*0Sstevel@tonic-gate    use Unicode::UCD 'charinfo';
30*0Sstevel@tonic-gate    my $charinfo   = charinfo($codepoint);
31*0Sstevel@tonic-gate
32*0Sstevel@tonic-gate    use Unicode::UCD 'charblock';
33*0Sstevel@tonic-gate    my $charblock  = charblock($codepoint);
34*0Sstevel@tonic-gate
35*0Sstevel@tonic-gate    use Unicode::UCD 'charscript';
36*0Sstevel@tonic-gate    my $charscript = charscript($codepoint);
37*0Sstevel@tonic-gate
38*0Sstevel@tonic-gate    use Unicode::UCD 'charblocks';
39*0Sstevel@tonic-gate    my $charblocks = charblocks();
40*0Sstevel@tonic-gate
41*0Sstevel@tonic-gate    use Unicode::UCD 'charscripts';
42*0Sstevel@tonic-gate    my %charscripts = charscripts();
43*0Sstevel@tonic-gate
44*0Sstevel@tonic-gate    use Unicode::UCD qw(charscript charinrange);
45*0Sstevel@tonic-gate    my $range = charscript($script);
46*0Sstevel@tonic-gate    print "looks like $script\n" if charinrange($range, $codepoint);
47*0Sstevel@tonic-gate
48*0Sstevel@tonic-gate    use Unicode::UCD 'compexcl';
49*0Sstevel@tonic-gate    my $compexcl = compexcl($codepoint);
50*0Sstevel@tonic-gate
51*0Sstevel@tonic-gate    my $unicode_version = Unicode::UCD::UnicodeVersion();
52*0Sstevel@tonic-gate
53*0Sstevel@tonic-gate=head1 DESCRIPTION
54*0Sstevel@tonic-gate
55*0Sstevel@tonic-gateThe Unicode::UCD module offers a simple interface to the Unicode
56*0Sstevel@tonic-gateCharacter Database.
57*0Sstevel@tonic-gate
58*0Sstevel@tonic-gate=cut
59*0Sstevel@tonic-gate
60*0Sstevel@tonic-gatemy $UNICODEFH;
61*0Sstevel@tonic-gatemy $BLOCKSFH;
62*0Sstevel@tonic-gatemy $SCRIPTSFH;
63*0Sstevel@tonic-gatemy $VERSIONFH;
64*0Sstevel@tonic-gatemy $COMPEXCLFH;
65*0Sstevel@tonic-gatemy $CASEFOLDFH;
66*0Sstevel@tonic-gatemy $CASESPECFH;
67*0Sstevel@tonic-gate
68*0Sstevel@tonic-gatesub openunicode {
69*0Sstevel@tonic-gate    my ($rfh, @path) = @_;
70*0Sstevel@tonic-gate    my $f;
71*0Sstevel@tonic-gate    unless (defined $$rfh) {
72*0Sstevel@tonic-gate	for my $d (@INC) {
73*0Sstevel@tonic-gate	    use File::Spec;
74*0Sstevel@tonic-gate	    $f = File::Spec->catfile($d, "unicore", @path);
75*0Sstevel@tonic-gate	    last if open($$rfh, $f);
76*0Sstevel@tonic-gate	    undef $f;
77*0Sstevel@tonic-gate	}
78*0Sstevel@tonic-gate	croak __PACKAGE__, ": failed to find ",
79*0Sstevel@tonic-gate              File::Spec->catfile(@path), " in @INC"
80*0Sstevel@tonic-gate	    unless defined $f;
81*0Sstevel@tonic-gate    }
82*0Sstevel@tonic-gate    return $f;
83*0Sstevel@tonic-gate}
84*0Sstevel@tonic-gate
85*0Sstevel@tonic-gate=head2 charinfo
86*0Sstevel@tonic-gate
87*0Sstevel@tonic-gate    use Unicode::UCD 'charinfo';
88*0Sstevel@tonic-gate
89*0Sstevel@tonic-gate    my $charinfo = charinfo(0x41);
90*0Sstevel@tonic-gate
91*0Sstevel@tonic-gatecharinfo() returns a reference to a hash that has the following fields
92*0Sstevel@tonic-gateas defined by the Unicode standard:
93*0Sstevel@tonic-gate
94*0Sstevel@tonic-gate    key
95*0Sstevel@tonic-gate
96*0Sstevel@tonic-gate    code             code point with at least four hexdigits
97*0Sstevel@tonic-gate    name             name of the character IN UPPER CASE
98*0Sstevel@tonic-gate    category         general category of the character
99*0Sstevel@tonic-gate    combining        classes used in the Canonical Ordering Algorithm
100*0Sstevel@tonic-gate    bidi             bidirectional category
101*0Sstevel@tonic-gate    decomposition    character decomposition mapping
102*0Sstevel@tonic-gate    decimal          if decimal digit this is the integer numeric value
103*0Sstevel@tonic-gate    digit            if digit this is the numeric value
104*0Sstevel@tonic-gate    numeric          if numeric is the integer or rational numeric value
105*0Sstevel@tonic-gate    mirrored         if mirrored in bidirectional text
106*0Sstevel@tonic-gate    unicode10        Unicode 1.0 name if existed and different
107*0Sstevel@tonic-gate    comment          ISO 10646 comment field
108*0Sstevel@tonic-gate    upper            uppercase equivalent mapping
109*0Sstevel@tonic-gate    lower            lowercase equivalent mapping
110*0Sstevel@tonic-gate    title            titlecase equivalent mapping
111*0Sstevel@tonic-gate
112*0Sstevel@tonic-gate    block            block the character belongs to (used in \p{In...})
113*0Sstevel@tonic-gate    script           script the character belongs to
114*0Sstevel@tonic-gate
115*0Sstevel@tonic-gateIf no match is found, a reference to an empty hash is returned.
116*0Sstevel@tonic-gate
117*0Sstevel@tonic-gateThe C<block> property is the same as returned by charinfo().  It is
118*0Sstevel@tonic-gatenot defined in the Unicode Character Database proper (Chapter 4 of the
119*0Sstevel@tonic-gateUnicode 3.0 Standard, aka TUS3) but instead in an auxiliary database
120*0Sstevel@tonic-gate(Chapter 14 of TUS3).  Similarly for the C<script> property.
121*0Sstevel@tonic-gate
122*0Sstevel@tonic-gateNote that you cannot do (de)composition and casing based solely on the
123*0Sstevel@tonic-gateabove C<decomposition> and C<lower>, C<upper>, C<title>, properties,
124*0Sstevel@tonic-gateyou will need also the compexcl(), casefold(), and casespec() functions.
125*0Sstevel@tonic-gate
126*0Sstevel@tonic-gate=cut
127*0Sstevel@tonic-gate
128*0Sstevel@tonic-gate# NB: This function is duplicated in charnames.pm
129*0Sstevel@tonic-gatesub _getcode {
130*0Sstevel@tonic-gate    my $arg = shift;
131*0Sstevel@tonic-gate
132*0Sstevel@tonic-gate    if ($arg =~ /^[1-9]\d*$/) {
133*0Sstevel@tonic-gate	return $arg;
134*0Sstevel@tonic-gate    } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) {
135*0Sstevel@tonic-gate	return hex($1);
136*0Sstevel@tonic-gate    }
137*0Sstevel@tonic-gate
138*0Sstevel@tonic-gate    return;
139*0Sstevel@tonic-gate}
140*0Sstevel@tonic-gate
141*0Sstevel@tonic-gate# Lingua::KO::Hangul::Util not part of the standard distribution
142*0Sstevel@tonic-gate# but it will be used if available.
143*0Sstevel@tonic-gate
144*0Sstevel@tonic-gateeval { require Lingua::KO::Hangul::Util };
145*0Sstevel@tonic-gatemy $hasHangulUtil = ! $@;
146*0Sstevel@tonic-gateif ($hasHangulUtil) {
147*0Sstevel@tonic-gate    Lingua::KO::Hangul::Util->import();
148*0Sstevel@tonic-gate}
149*0Sstevel@tonic-gate
150*0Sstevel@tonic-gatesub hangul_decomp { # internal: called from charinfo
151*0Sstevel@tonic-gate    if ($hasHangulUtil) {
152*0Sstevel@tonic-gate	my @tmp = decomposeHangul(shift);
153*0Sstevel@tonic-gate	return sprintf("%04X %04X",      @tmp) if @tmp == 2;
154*0Sstevel@tonic-gate	return sprintf("%04X %04X %04X", @tmp) if @tmp == 3;
155*0Sstevel@tonic-gate    }
156*0Sstevel@tonic-gate    return;
157*0Sstevel@tonic-gate}
158*0Sstevel@tonic-gate
159*0Sstevel@tonic-gatesub hangul_charname { # internal: called from charinfo
160*0Sstevel@tonic-gate    return sprintf("HANGUL SYLLABLE-%04X", shift);
161*0Sstevel@tonic-gate}
162*0Sstevel@tonic-gate
163*0Sstevel@tonic-gatesub han_charname { # internal: called from charinfo
164*0Sstevel@tonic-gate    return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);
165*0Sstevel@tonic-gate}
166*0Sstevel@tonic-gate
167*0Sstevel@tonic-gatemy @CharinfoRanges = (
168*0Sstevel@tonic-gate# block name
169*0Sstevel@tonic-gate# [ first, last, coderef to name, coderef to decompose ],
170*0Sstevel@tonic-gate# CJK Ideographs Extension A
171*0Sstevel@tonic-gate  [ 0x3400,   0x4DB5,   \&han_charname,   undef  ],
172*0Sstevel@tonic-gate# CJK Ideographs
173*0Sstevel@tonic-gate  [ 0x4E00,   0x9FA5,   \&han_charname,   undef  ],
174*0Sstevel@tonic-gate# Hangul Syllables
175*0Sstevel@tonic-gate  [ 0xAC00,   0xD7A3,   $hasHangulUtil ? \&getHangulName : \&hangul_charname,  \&hangul_decomp ],
176*0Sstevel@tonic-gate# Non-Private Use High Surrogates
177*0Sstevel@tonic-gate  [ 0xD800,   0xDB7F,   undef,   undef  ],
178*0Sstevel@tonic-gate# Private Use High Surrogates
179*0Sstevel@tonic-gate  [ 0xDB80,   0xDBFF,   undef,   undef  ],
180*0Sstevel@tonic-gate# Low Surrogates
181*0Sstevel@tonic-gate  [ 0xDC00,   0xDFFF,   undef,   undef  ],
182*0Sstevel@tonic-gate# The Private Use Area
183*0Sstevel@tonic-gate  [ 0xE000,   0xF8FF,   undef,   undef  ],
184*0Sstevel@tonic-gate# CJK Ideographs Extension B
185*0Sstevel@tonic-gate  [ 0x20000,  0x2A6D6,  \&han_charname,   undef  ],
186*0Sstevel@tonic-gate# Plane 15 Private Use Area
187*0Sstevel@tonic-gate  [ 0xF0000,  0xFFFFD,  undef,   undef  ],
188*0Sstevel@tonic-gate# Plane 16 Private Use Area
189*0Sstevel@tonic-gate  [ 0x100000, 0x10FFFD, undef,   undef  ],
190*0Sstevel@tonic-gate);
191*0Sstevel@tonic-gate
192*0Sstevel@tonic-gatesub charinfo {
193*0Sstevel@tonic-gate    my $arg  = shift;
194*0Sstevel@tonic-gate    my $code = _getcode($arg);
195*0Sstevel@tonic-gate    croak __PACKAGE__, "::charinfo: unknown code '$arg'"
196*0Sstevel@tonic-gate	unless defined $code;
197*0Sstevel@tonic-gate    my $hexk = sprintf("%06X", $code);
198*0Sstevel@tonic-gate    my($rcode,$rname,$rdec);
199*0Sstevel@tonic-gate    foreach my $range (@CharinfoRanges){
200*0Sstevel@tonic-gate      if ($range->[0] <= $code && $code <= $range->[1]) {
201*0Sstevel@tonic-gate        $rcode = $hexk;
202*0Sstevel@tonic-gate	$rcode =~ s/^0+//;
203*0Sstevel@tonic-gate	$rcode =  sprintf("%04X", hex($rcode));
204*0Sstevel@tonic-gate        $rname = $range->[2] ? $range->[2]->($code) : '';
205*0Sstevel@tonic-gate        $rdec  = $range->[3] ? $range->[3]->($code) : '';
206*0Sstevel@tonic-gate        $hexk  = sprintf("%06X", $range->[0]); # replace by the first
207*0Sstevel@tonic-gate        last;
208*0Sstevel@tonic-gate      }
209*0Sstevel@tonic-gate    }
210*0Sstevel@tonic-gate    openunicode(\$UNICODEFH, "UnicodeData.txt");
211*0Sstevel@tonic-gate    if (defined $UNICODEFH) {
212*0Sstevel@tonic-gate	use Search::Dict 1.02;
213*0Sstevel@tonic-gate	if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
214*0Sstevel@tonic-gate	    my $line = <$UNICODEFH>;
215*0Sstevel@tonic-gate	    return unless defined $line;
216*0Sstevel@tonic-gate	    chomp $line;
217*0Sstevel@tonic-gate	    my %prop;
218*0Sstevel@tonic-gate	    @prop{qw(
219*0Sstevel@tonic-gate		     code name category
220*0Sstevel@tonic-gate		     combining bidi decomposition
221*0Sstevel@tonic-gate		     decimal digit numeric
222*0Sstevel@tonic-gate		     mirrored unicode10 comment
223*0Sstevel@tonic-gate		     upper lower title
224*0Sstevel@tonic-gate		    )} = split(/;/, $line, -1);
225*0Sstevel@tonic-gate	    $hexk =~ s/^0+//;
226*0Sstevel@tonic-gate	    $hexk =  sprintf("%04X", hex($hexk));
227*0Sstevel@tonic-gate	    if ($prop{code} eq $hexk) {
228*0Sstevel@tonic-gate		$prop{block}  = charblock($code);
229*0Sstevel@tonic-gate		$prop{script} = charscript($code);
230*0Sstevel@tonic-gate		if(defined $rname){
231*0Sstevel@tonic-gate                    $prop{code} = $rcode;
232*0Sstevel@tonic-gate                    $prop{name} = $rname;
233*0Sstevel@tonic-gate                    $prop{decomposition} = $rdec;
234*0Sstevel@tonic-gate                }
235*0Sstevel@tonic-gate		return \%prop;
236*0Sstevel@tonic-gate	    }
237*0Sstevel@tonic-gate	}
238*0Sstevel@tonic-gate    }
239*0Sstevel@tonic-gate    return;
240*0Sstevel@tonic-gate}
241*0Sstevel@tonic-gate
242*0Sstevel@tonic-gatesub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
243*0Sstevel@tonic-gate    my ($table, $lo, $hi, $code) = @_;
244*0Sstevel@tonic-gate
245*0Sstevel@tonic-gate    return if $lo > $hi;
246*0Sstevel@tonic-gate
247*0Sstevel@tonic-gate    my $mid = int(($lo+$hi) / 2);
248*0Sstevel@tonic-gate
249*0Sstevel@tonic-gate    if ($table->[$mid]->[0] < $code) {
250*0Sstevel@tonic-gate	if ($table->[$mid]->[1] >= $code) {
251*0Sstevel@tonic-gate	    return $table->[$mid]->[2];
252*0Sstevel@tonic-gate	} else {
253*0Sstevel@tonic-gate	    _search($table, $mid + 1, $hi, $code);
254*0Sstevel@tonic-gate	}
255*0Sstevel@tonic-gate    } elsif ($table->[$mid]->[0] > $code) {
256*0Sstevel@tonic-gate	_search($table, $lo, $mid - 1, $code);
257*0Sstevel@tonic-gate    } else {
258*0Sstevel@tonic-gate	return $table->[$mid]->[2];
259*0Sstevel@tonic-gate    }
260*0Sstevel@tonic-gate}
261*0Sstevel@tonic-gate
262*0Sstevel@tonic-gatesub charinrange {
263*0Sstevel@tonic-gate    my ($range, $arg) = @_;
264*0Sstevel@tonic-gate    my $code = _getcode($arg);
265*0Sstevel@tonic-gate    croak __PACKAGE__, "::charinrange: unknown code '$arg'"
266*0Sstevel@tonic-gate	unless defined $code;
267*0Sstevel@tonic-gate    _search($range, 0, $#$range, $code);
268*0Sstevel@tonic-gate}
269*0Sstevel@tonic-gate
270*0Sstevel@tonic-gate=head2 charblock
271*0Sstevel@tonic-gate
272*0Sstevel@tonic-gate    use Unicode::UCD 'charblock';
273*0Sstevel@tonic-gate
274*0Sstevel@tonic-gate    my $charblock = charblock(0x41);
275*0Sstevel@tonic-gate    my $charblock = charblock(1234);
276*0Sstevel@tonic-gate    my $charblock = charblock("0x263a");
277*0Sstevel@tonic-gate    my $charblock = charblock("U+263a");
278*0Sstevel@tonic-gate
279*0Sstevel@tonic-gate    my $range     = charblock('Armenian');
280*0Sstevel@tonic-gate
281*0Sstevel@tonic-gateWith a B<code point argument> charblock() returns the I<block> the character
282*0Sstevel@tonic-gatebelongs to, e.g.  C<Basic Latin>.  Note that not all the character
283*0Sstevel@tonic-gatepositions within all blocks are defined.
284*0Sstevel@tonic-gate
285*0Sstevel@tonic-gateSee also L</Blocks versus Scripts>.
286*0Sstevel@tonic-gate
287*0Sstevel@tonic-gateIf supplied with an argument that can't be a code point, charblock() tries
288*0Sstevel@tonic-gateto do the opposite and interpret the argument as a character block. The
289*0Sstevel@tonic-gatereturn value is a I<range>: an anonymous list of lists that contain
290*0Sstevel@tonic-gateI<start-of-range>, I<end-of-range> code point pairs. You can test whether a
291*0Sstevel@tonic-gatecode point is in a range using the L</charinrange> function. If the
292*0Sstevel@tonic-gateargument is not a known charater block, C<undef> is returned.
293*0Sstevel@tonic-gate
294*0Sstevel@tonic-gate=cut
295*0Sstevel@tonic-gate
296*0Sstevel@tonic-gatemy @BLOCKS;
297*0Sstevel@tonic-gatemy %BLOCKS;
298*0Sstevel@tonic-gate
299*0Sstevel@tonic-gatesub _charblocks {
300*0Sstevel@tonic-gate    unless (@BLOCKS) {
301*0Sstevel@tonic-gate	if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
302*0Sstevel@tonic-gate	    local $_;
303*0Sstevel@tonic-gate	    while (<$BLOCKSFH>) {
304*0Sstevel@tonic-gate		if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
305*0Sstevel@tonic-gate		    my ($lo, $hi) = (hex($1), hex($2));
306*0Sstevel@tonic-gate		    my $subrange = [ $lo, $hi, $3 ];
307*0Sstevel@tonic-gate		    push @BLOCKS, $subrange;
308*0Sstevel@tonic-gate		    push @{$BLOCKS{$3}}, $subrange;
309*0Sstevel@tonic-gate		}
310*0Sstevel@tonic-gate	    }
311*0Sstevel@tonic-gate	    close($BLOCKSFH);
312*0Sstevel@tonic-gate	}
313*0Sstevel@tonic-gate    }
314*0Sstevel@tonic-gate}
315*0Sstevel@tonic-gate
316*0Sstevel@tonic-gatesub charblock {
317*0Sstevel@tonic-gate    my $arg = shift;
318*0Sstevel@tonic-gate
319*0Sstevel@tonic-gate    _charblocks() unless @BLOCKS;
320*0Sstevel@tonic-gate
321*0Sstevel@tonic-gate    my $code = _getcode($arg);
322*0Sstevel@tonic-gate
323*0Sstevel@tonic-gate    if (defined $code) {
324*0Sstevel@tonic-gate	_search(\@BLOCKS, 0, $#BLOCKS, $code);
325*0Sstevel@tonic-gate    } else {
326*0Sstevel@tonic-gate	if (exists $BLOCKS{$arg}) {
327*0Sstevel@tonic-gate	    return dclone $BLOCKS{$arg};
328*0Sstevel@tonic-gate	} else {
329*0Sstevel@tonic-gate	    return;
330*0Sstevel@tonic-gate	}
331*0Sstevel@tonic-gate    }
332*0Sstevel@tonic-gate}
333*0Sstevel@tonic-gate
334*0Sstevel@tonic-gate=head2 charscript
335*0Sstevel@tonic-gate
336*0Sstevel@tonic-gate    use Unicode::UCD 'charscript';
337*0Sstevel@tonic-gate
338*0Sstevel@tonic-gate    my $charscript = charscript(0x41);
339*0Sstevel@tonic-gate    my $charscript = charscript(1234);
340*0Sstevel@tonic-gate    my $charscript = charscript("U+263a");
341*0Sstevel@tonic-gate
342*0Sstevel@tonic-gate    my $range      = charscript('Thai');
343*0Sstevel@tonic-gate
344*0Sstevel@tonic-gateWith a B<code point argument> charscript() returns the I<script> the
345*0Sstevel@tonic-gatecharacter belongs to, e.g.  C<Latin>, C<Greek>, C<Han>.
346*0Sstevel@tonic-gate
347*0Sstevel@tonic-gateSee also L</Blocks versus Scripts>.
348*0Sstevel@tonic-gate
349*0Sstevel@tonic-gateIf supplied with an argument that can't be a code point, charscript() tries
350*0Sstevel@tonic-gateto do the opposite and interpret the argument as a character script. The
351*0Sstevel@tonic-gatereturn value is a I<range>: an anonymous list of lists that contain
352*0Sstevel@tonic-gateI<start-of-range>, I<end-of-range> code point pairs. You can test whether a
353*0Sstevel@tonic-gatecode point is in a range using the L</charinrange> function. If the
354*0Sstevel@tonic-gateargument is not a known charater script, C<undef> is returned.
355*0Sstevel@tonic-gate
356*0Sstevel@tonic-gate=cut
357*0Sstevel@tonic-gate
358*0Sstevel@tonic-gatemy @SCRIPTS;
359*0Sstevel@tonic-gatemy %SCRIPTS;
360*0Sstevel@tonic-gate
361*0Sstevel@tonic-gatesub _charscripts {
362*0Sstevel@tonic-gate    unless (@SCRIPTS) {
363*0Sstevel@tonic-gate	if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
364*0Sstevel@tonic-gate	    local $_;
365*0Sstevel@tonic-gate	    while (<$SCRIPTSFH>) {
366*0Sstevel@tonic-gate		if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
367*0Sstevel@tonic-gate		    my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
368*0Sstevel@tonic-gate		    my $script = lc($3);
369*0Sstevel@tonic-gate		    $script =~ s/\b(\w)/uc($1)/ge;
370*0Sstevel@tonic-gate		    my $subrange = [ $lo, $hi, $script ];
371*0Sstevel@tonic-gate		    push @SCRIPTS, $subrange;
372*0Sstevel@tonic-gate		    push @{$SCRIPTS{$script}}, $subrange;
373*0Sstevel@tonic-gate		}
374*0Sstevel@tonic-gate	    }
375*0Sstevel@tonic-gate	    close($SCRIPTSFH);
376*0Sstevel@tonic-gate	    @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
377*0Sstevel@tonic-gate	}
378*0Sstevel@tonic-gate    }
379*0Sstevel@tonic-gate}
380*0Sstevel@tonic-gate
381*0Sstevel@tonic-gatesub charscript {
382*0Sstevel@tonic-gate    my $arg = shift;
383*0Sstevel@tonic-gate
384*0Sstevel@tonic-gate    _charscripts() unless @SCRIPTS;
385*0Sstevel@tonic-gate
386*0Sstevel@tonic-gate    my $code = _getcode($arg);
387*0Sstevel@tonic-gate
388*0Sstevel@tonic-gate    if (defined $code) {
389*0Sstevel@tonic-gate	_search(\@SCRIPTS, 0, $#SCRIPTS, $code);
390*0Sstevel@tonic-gate    } else {
391*0Sstevel@tonic-gate	if (exists $SCRIPTS{$arg}) {
392*0Sstevel@tonic-gate	    return dclone $SCRIPTS{$arg};
393*0Sstevel@tonic-gate	} else {
394*0Sstevel@tonic-gate	    return;
395*0Sstevel@tonic-gate	}
396*0Sstevel@tonic-gate    }
397*0Sstevel@tonic-gate}
398*0Sstevel@tonic-gate
399*0Sstevel@tonic-gate=head2 charblocks
400*0Sstevel@tonic-gate
401*0Sstevel@tonic-gate    use Unicode::UCD 'charblocks';
402*0Sstevel@tonic-gate
403*0Sstevel@tonic-gate    my $charblocks = charblocks();
404*0Sstevel@tonic-gate
405*0Sstevel@tonic-gatecharblocks() returns a reference to a hash with the known block names
406*0Sstevel@tonic-gateas the keys, and the code point ranges (see L</charblock>) as the values.
407*0Sstevel@tonic-gate
408*0Sstevel@tonic-gateSee also L</Blocks versus Scripts>.
409*0Sstevel@tonic-gate
410*0Sstevel@tonic-gate=cut
411*0Sstevel@tonic-gate
412*0Sstevel@tonic-gatesub charblocks {
413*0Sstevel@tonic-gate    _charblocks() unless %BLOCKS;
414*0Sstevel@tonic-gate    return dclone \%BLOCKS;
415*0Sstevel@tonic-gate}
416*0Sstevel@tonic-gate
417*0Sstevel@tonic-gate=head2 charscripts
418*0Sstevel@tonic-gate
419*0Sstevel@tonic-gate    use Unicode::UCD 'charscripts';
420*0Sstevel@tonic-gate
421*0Sstevel@tonic-gate    my %charscripts = charscripts();
422*0Sstevel@tonic-gate
423*0Sstevel@tonic-gatecharscripts() returns a hash with the known script names as the keys,
424*0Sstevel@tonic-gateand the code point ranges (see L</charscript>) as the values.
425*0Sstevel@tonic-gate
426*0Sstevel@tonic-gateSee also L</Blocks versus Scripts>.
427*0Sstevel@tonic-gate
428*0Sstevel@tonic-gate=cut
429*0Sstevel@tonic-gate
430*0Sstevel@tonic-gatesub charscripts {
431*0Sstevel@tonic-gate    _charscripts() unless %SCRIPTS;
432*0Sstevel@tonic-gate    return dclone \%SCRIPTS;
433*0Sstevel@tonic-gate}
434*0Sstevel@tonic-gate
435*0Sstevel@tonic-gate=head2 Blocks versus Scripts
436*0Sstevel@tonic-gate
437*0Sstevel@tonic-gateThe difference between a block and a script is that scripts are closer
438*0Sstevel@tonic-gateto the linguistic notion of a set of characters required to present
439*0Sstevel@tonic-gatelanguages, while block is more of an artifact of the Unicode character
440*0Sstevel@tonic-gatenumbering and separation into blocks of (mostly) 256 characters.
441*0Sstevel@tonic-gate
442*0Sstevel@tonic-gateFor example the Latin B<script> is spread over several B<blocks>, such
443*0Sstevel@tonic-gateas C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
444*0Sstevel@tonic-gateC<Latin Extended-B>.  On the other hand, the Latin script does not
445*0Sstevel@tonic-gatecontain all the characters of the C<Basic Latin> block (also known as
446*0Sstevel@tonic-gatethe ASCII): it includes only the letters, and not, for example, the digits
447*0Sstevel@tonic-gateor the punctuation.
448*0Sstevel@tonic-gate
449*0Sstevel@tonic-gateFor blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
450*0Sstevel@tonic-gate
451*0Sstevel@tonic-gateFor scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
452*0Sstevel@tonic-gate
453*0Sstevel@tonic-gate=head2 Matching Scripts and Blocks
454*0Sstevel@tonic-gate
455*0Sstevel@tonic-gateScripts are matched with the regular-expression construct
456*0Sstevel@tonic-gateC<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
457*0Sstevel@tonic-gatewhile C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches
458*0Sstevel@tonic-gateany of the 256 code points in the Tibetan block).
459*0Sstevel@tonic-gate
460*0Sstevel@tonic-gate=head2 Code Point Arguments
461*0Sstevel@tonic-gate
462*0Sstevel@tonic-gateA I<code point argument> is either a decimal or a hexadecimal scalar
463*0Sstevel@tonic-gatedesignating a Unicode character, or C<U+> followed by hexadecimals
464*0Sstevel@tonic-gatedesignating a Unicode character.  In other words, if you want a code
465*0Sstevel@tonic-gatepoint to be interpreted as a hexadecimal number, you must prefix it
466*0Sstevel@tonic-gatewith either C<0x> or C<U+>, because a string like e.g. C<123> will
467*0Sstevel@tonic-gatebe interpreted as a decimal code point.  Also note that Unicode is
468*0Sstevel@tonic-gateB<not> limited to 16 bits (the number of Unicode characters is
469*0Sstevel@tonic-gateopen-ended, in theory unlimited): you may have more than 4 hexdigits.
470*0Sstevel@tonic-gate
471*0Sstevel@tonic-gate=head2 charinrange
472*0Sstevel@tonic-gate
473*0Sstevel@tonic-gateIn addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
474*0Sstevel@tonic-gatecan also test whether a code point is in the I<range> as returned by
475*0Sstevel@tonic-gateL</charblock> and L</charscript> or as the values of the hash returned
476*0Sstevel@tonic-gateby L</charblocks> and L</charscripts> by using charinrange():
477*0Sstevel@tonic-gate
478*0Sstevel@tonic-gate    use Unicode::UCD qw(charscript charinrange);
479*0Sstevel@tonic-gate
480*0Sstevel@tonic-gate    $range = charscript('Hiragana');
481*0Sstevel@tonic-gate    print "looks like hiragana\n" if charinrange($range, $codepoint);
482*0Sstevel@tonic-gate
483*0Sstevel@tonic-gate=cut
484*0Sstevel@tonic-gate
485*0Sstevel@tonic-gate=head2 compexcl
486*0Sstevel@tonic-gate
487*0Sstevel@tonic-gate    use Unicode::UCD 'compexcl';
488*0Sstevel@tonic-gate
489*0Sstevel@tonic-gate    my $compexcl = compexcl("09dc");
490*0Sstevel@tonic-gate
491*0Sstevel@tonic-gateThe compexcl() returns the composition exclusion (that is, if the
492*0Sstevel@tonic-gatecharacter should not be produced during a precomposition) of the
493*0Sstevel@tonic-gatecharacter specified by a B<code point argument>.
494*0Sstevel@tonic-gate
495*0Sstevel@tonic-gateIf there is a composition exclusion for the character, true is
496*0Sstevel@tonic-gatereturned.  Otherwise, false is returned.
497*0Sstevel@tonic-gate
498*0Sstevel@tonic-gate=cut
499*0Sstevel@tonic-gate
500*0Sstevel@tonic-gatemy %COMPEXCL;
501*0Sstevel@tonic-gate
502*0Sstevel@tonic-gatesub _compexcl {
503*0Sstevel@tonic-gate    unless (%COMPEXCL) {
504*0Sstevel@tonic-gate	if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) {
505*0Sstevel@tonic-gate	    local $_;
506*0Sstevel@tonic-gate	    while (<$COMPEXCLFH>) {
507*0Sstevel@tonic-gate		if (/^([0-9A-F]+)\s+\#\s+/) {
508*0Sstevel@tonic-gate		    my $code = hex($1);
509*0Sstevel@tonic-gate		    $COMPEXCL{$code} = undef;
510*0Sstevel@tonic-gate		}
511*0Sstevel@tonic-gate	    }
512*0Sstevel@tonic-gate	    close($COMPEXCLFH);
513*0Sstevel@tonic-gate	}
514*0Sstevel@tonic-gate    }
515*0Sstevel@tonic-gate}
516*0Sstevel@tonic-gate
517*0Sstevel@tonic-gatesub compexcl {
518*0Sstevel@tonic-gate    my $arg  = shift;
519*0Sstevel@tonic-gate    my $code = _getcode($arg);
520*0Sstevel@tonic-gate    croak __PACKAGE__, "::compexcl: unknown code '$arg'"
521*0Sstevel@tonic-gate	unless defined $code;
522*0Sstevel@tonic-gate
523*0Sstevel@tonic-gate    _compexcl() unless %COMPEXCL;
524*0Sstevel@tonic-gate
525*0Sstevel@tonic-gate    return exists $COMPEXCL{$code};
526*0Sstevel@tonic-gate}
527*0Sstevel@tonic-gate
528*0Sstevel@tonic-gate=head2 casefold
529*0Sstevel@tonic-gate
530*0Sstevel@tonic-gate    use Unicode::UCD 'casefold';
531*0Sstevel@tonic-gate
532*0Sstevel@tonic-gate    my $casefold = casefold("00DF");
533*0Sstevel@tonic-gate
534*0Sstevel@tonic-gateThe casefold() returns the locale-independent case folding of the
535*0Sstevel@tonic-gatecharacter specified by a B<code point argument>.
536*0Sstevel@tonic-gate
537*0Sstevel@tonic-gateIf there is a case folding for that character, a reference to a hash
538*0Sstevel@tonic-gatewith the following fields is returned:
539*0Sstevel@tonic-gate
540*0Sstevel@tonic-gate    key
541*0Sstevel@tonic-gate
542*0Sstevel@tonic-gate    code             code point with at least four hexdigits
543*0Sstevel@tonic-gate    status           "C", "F", "S", or "I"
544*0Sstevel@tonic-gate    mapping          one or more codes separated by spaces
545*0Sstevel@tonic-gate
546*0Sstevel@tonic-gateThe meaning of the I<status> is as follows:
547*0Sstevel@tonic-gate
548*0Sstevel@tonic-gate   C                 common case folding, common mappings shared
549*0Sstevel@tonic-gate                     by both simple and full mappings
550*0Sstevel@tonic-gate   F                 full case folding, mappings that cause strings
551*0Sstevel@tonic-gate                     to grow in length. Multiple characters are separated
552*0Sstevel@tonic-gate                     by spaces
553*0Sstevel@tonic-gate   S                 simple case folding, mappings to single characters
554*0Sstevel@tonic-gate                     where different from F
555*0Sstevel@tonic-gate   I                 special case for dotted uppercase I and
556*0Sstevel@tonic-gate                     dotless lowercase i
557*0Sstevel@tonic-gate                     - If this mapping is included, the result is
558*0Sstevel@tonic-gate                       case-insensitive, but dotless and dotted I's
559*0Sstevel@tonic-gate                       are not distinguished
560*0Sstevel@tonic-gate                     - If this mapping is excluded, the result is not
561*0Sstevel@tonic-gate                       fully case-insensitive, but dotless and dotted
562*0Sstevel@tonic-gate                       I's are distinguished
563*0Sstevel@tonic-gate
564*0Sstevel@tonic-gateIf there is no case folding for that character, C<undef> is returned.
565*0Sstevel@tonic-gate
566*0Sstevel@tonic-gateFor more information about case mappings see
567*0Sstevel@tonic-gatehttp://www.unicode.org/unicode/reports/tr21/
568*0Sstevel@tonic-gate
569*0Sstevel@tonic-gate=cut
570*0Sstevel@tonic-gate
571*0Sstevel@tonic-gatemy %CASEFOLD;
572*0Sstevel@tonic-gate
573*0Sstevel@tonic-gatesub _casefold {
574*0Sstevel@tonic-gate    unless (%CASEFOLD) {
575*0Sstevel@tonic-gate	if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) {
576*0Sstevel@tonic-gate	    local $_;
577*0Sstevel@tonic-gate	    while (<$CASEFOLDFH>) {
578*0Sstevel@tonic-gate		if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
579*0Sstevel@tonic-gate		    my $code = hex($1);
580*0Sstevel@tonic-gate		    $CASEFOLD{$code} = { code    => $1,
581*0Sstevel@tonic-gate					 status  => $2,
582*0Sstevel@tonic-gate					 mapping => $3 };
583*0Sstevel@tonic-gate		}
584*0Sstevel@tonic-gate	    }
585*0Sstevel@tonic-gate	    close($CASEFOLDFH);
586*0Sstevel@tonic-gate	}
587*0Sstevel@tonic-gate    }
588*0Sstevel@tonic-gate}
589*0Sstevel@tonic-gate
590*0Sstevel@tonic-gatesub casefold {
591*0Sstevel@tonic-gate    my $arg  = shift;
592*0Sstevel@tonic-gate    my $code = _getcode($arg);
593*0Sstevel@tonic-gate    croak __PACKAGE__, "::casefold: unknown code '$arg'"
594*0Sstevel@tonic-gate	unless defined $code;
595*0Sstevel@tonic-gate
596*0Sstevel@tonic-gate    _casefold() unless %CASEFOLD;
597*0Sstevel@tonic-gate
598*0Sstevel@tonic-gate    return $CASEFOLD{$code};
599*0Sstevel@tonic-gate}
600*0Sstevel@tonic-gate
601*0Sstevel@tonic-gate=head2 casespec
602*0Sstevel@tonic-gate
603*0Sstevel@tonic-gate    use Unicode::UCD 'casespec';
604*0Sstevel@tonic-gate
605*0Sstevel@tonic-gate    my $casespec = casespec("FB00");
606*0Sstevel@tonic-gate
607*0Sstevel@tonic-gateThe casespec() returns the potentially locale-dependent case mapping
608*0Sstevel@tonic-gateof the character specified by a B<code point argument>.  The mapping
609*0Sstevel@tonic-gatemay change the length of the string (which the basic Unicode case
610*0Sstevel@tonic-gatemappings as returned by charinfo() never do).
611*0Sstevel@tonic-gate
612*0Sstevel@tonic-gateIf there is a case folding for that character, a reference to a hash
613*0Sstevel@tonic-gatewith the following fields is returned:
614*0Sstevel@tonic-gate
615*0Sstevel@tonic-gate    key
616*0Sstevel@tonic-gate
617*0Sstevel@tonic-gate    code             code point with at least four hexdigits
618*0Sstevel@tonic-gate    lower            lowercase
619*0Sstevel@tonic-gate    title            titlecase
620*0Sstevel@tonic-gate    upper            uppercase
621*0Sstevel@tonic-gate    condition        condition list (may be undef)
622*0Sstevel@tonic-gate
623*0Sstevel@tonic-gateThe C<condition> is optional.  Where present, it consists of one or
624*0Sstevel@tonic-gatemore I<locales> or I<contexts>, separated by spaces (other than as
625*0Sstevel@tonic-gateused to separate elements, spaces are to be ignored).  A condition
626*0Sstevel@tonic-gatelist overrides the normal behavior if all of the listed conditions are
627*0Sstevel@tonic-gatetrue.  Case distinctions in the condition list are not significant.
628*0Sstevel@tonic-gateConditions preceded by "NON_" represent the negation of the condition.
629*0Sstevel@tonic-gate
630*0Sstevel@tonic-gateNote that when there are multiple case folding definitions for a
631*0Sstevel@tonic-gatesingle code point because of different locales, the value returned by
632*0Sstevel@tonic-gatecasespec() is a hash reference which has the locales as the keys and
633*0Sstevel@tonic-gatehash references as described above as the values.
634*0Sstevel@tonic-gate
635*0Sstevel@tonic-gateA I<locale> is defined as a 2-letter ISO 3166 country code, possibly
636*0Sstevel@tonic-gatefollowed by a "_" and a 2-letter ISO language code (possibly followed
637*0Sstevel@tonic-gateby a "_" and a variant code).  You can find the lists of those codes,
638*0Sstevel@tonic-gatesee L<Locale::Country> and L<Locale::Language>.
639*0Sstevel@tonic-gate
640*0Sstevel@tonic-gateA I<context> is one of the following choices:
641*0Sstevel@tonic-gate
642*0Sstevel@tonic-gate    FINAL            The letter is not followed by a letter of
643*0Sstevel@tonic-gate                     general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
644*0Sstevel@tonic-gate    MODERN           The mapping is only used for modern text
645*0Sstevel@tonic-gate    AFTER_i          The last base character was "i" (U+0069)
646*0Sstevel@tonic-gate
647*0Sstevel@tonic-gateFor more information about case mappings see
648*0Sstevel@tonic-gatehttp://www.unicode.org/unicode/reports/tr21/
649*0Sstevel@tonic-gate
650*0Sstevel@tonic-gate=cut
651*0Sstevel@tonic-gate
652*0Sstevel@tonic-gatemy %CASESPEC;
653*0Sstevel@tonic-gate
654*0Sstevel@tonic-gatesub _casespec {
655*0Sstevel@tonic-gate    unless (%CASESPEC) {
656*0Sstevel@tonic-gate	if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
657*0Sstevel@tonic-gate	    local $_;
658*0Sstevel@tonic-gate	    while (<$CASESPECFH>) {
659*0Sstevel@tonic-gate		if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
660*0Sstevel@tonic-gate		    my ($hexcode, $lower, $title, $upper, $condition) =
661*0Sstevel@tonic-gate			($1, $2, $3, $4, $5);
662*0Sstevel@tonic-gate		    my $code = hex($hexcode);
663*0Sstevel@tonic-gate		    if (exists $CASESPEC{$code}) {
664*0Sstevel@tonic-gate			if (exists $CASESPEC{$code}->{code}) {
665*0Sstevel@tonic-gate			    my ($oldlower,
666*0Sstevel@tonic-gate				$oldtitle,
667*0Sstevel@tonic-gate				$oldupper,
668*0Sstevel@tonic-gate				$oldcondition) =
669*0Sstevel@tonic-gate				    @{$CASESPEC{$code}}{qw(lower
670*0Sstevel@tonic-gate							   title
671*0Sstevel@tonic-gate							   upper
672*0Sstevel@tonic-gate							   condition)};
673*0Sstevel@tonic-gate			    if (defined $oldcondition) {
674*0Sstevel@tonic-gate				my ($oldlocale) =
675*0Sstevel@tonic-gate				($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
676*0Sstevel@tonic-gate				delete $CASESPEC{$code};
677*0Sstevel@tonic-gate				$CASESPEC{$code}->{$oldlocale} =
678*0Sstevel@tonic-gate				{ code      => $hexcode,
679*0Sstevel@tonic-gate				  lower     => $oldlower,
680*0Sstevel@tonic-gate				  title     => $oldtitle,
681*0Sstevel@tonic-gate				  upper     => $oldupper,
682*0Sstevel@tonic-gate				  condition => $oldcondition };
683*0Sstevel@tonic-gate			    }
684*0Sstevel@tonic-gate			}
685*0Sstevel@tonic-gate			my ($locale) =
686*0Sstevel@tonic-gate			    ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
687*0Sstevel@tonic-gate			$CASESPEC{$code}->{$locale} =
688*0Sstevel@tonic-gate			{ code      => $hexcode,
689*0Sstevel@tonic-gate			  lower     => $lower,
690*0Sstevel@tonic-gate			  title     => $title,
691*0Sstevel@tonic-gate			  upper     => $upper,
692*0Sstevel@tonic-gate			  condition => $condition };
693*0Sstevel@tonic-gate		    } else {
694*0Sstevel@tonic-gate			$CASESPEC{$code} =
695*0Sstevel@tonic-gate			{ code      => $hexcode,
696*0Sstevel@tonic-gate			  lower     => $lower,
697*0Sstevel@tonic-gate			  title     => $title,
698*0Sstevel@tonic-gate			  upper     => $upper,
699*0Sstevel@tonic-gate			  condition => $condition };
700*0Sstevel@tonic-gate		    }
701*0Sstevel@tonic-gate		}
702*0Sstevel@tonic-gate	    }
703*0Sstevel@tonic-gate	    close($CASESPECFH);
704*0Sstevel@tonic-gate	}
705*0Sstevel@tonic-gate    }
706*0Sstevel@tonic-gate}
707*0Sstevel@tonic-gate
708*0Sstevel@tonic-gatesub casespec {
709*0Sstevel@tonic-gate    my $arg  = shift;
710*0Sstevel@tonic-gate    my $code = _getcode($arg);
711*0Sstevel@tonic-gate    croak __PACKAGE__, "::casespec: unknown code '$arg'"
712*0Sstevel@tonic-gate	unless defined $code;
713*0Sstevel@tonic-gate
714*0Sstevel@tonic-gate    _casespec() unless %CASESPEC;
715*0Sstevel@tonic-gate
716*0Sstevel@tonic-gate    return ref $CASESPEC{$code} ? dclone $CASESPEC{$code} : $CASESPEC{$code};
717*0Sstevel@tonic-gate}
718*0Sstevel@tonic-gate
719*0Sstevel@tonic-gate=head2 Unicode::UCD::UnicodeVersion
720*0Sstevel@tonic-gate
721*0Sstevel@tonic-gateUnicode::UCD::UnicodeVersion() returns the version of the Unicode
722*0Sstevel@tonic-gateCharacter Database, in other words, the version of the Unicode
723*0Sstevel@tonic-gatestandard the database implements.  The version is a string
724*0Sstevel@tonic-gateof numbers delimited by dots (C<'.'>).
725*0Sstevel@tonic-gate
726*0Sstevel@tonic-gate=cut
727*0Sstevel@tonic-gate
728*0Sstevel@tonic-gatemy $UNICODEVERSION;
729*0Sstevel@tonic-gate
730*0Sstevel@tonic-gatesub UnicodeVersion {
731*0Sstevel@tonic-gate    unless (defined $UNICODEVERSION) {
732*0Sstevel@tonic-gate	openunicode(\$VERSIONFH, "version");
733*0Sstevel@tonic-gate	chomp($UNICODEVERSION = <$VERSIONFH>);
734*0Sstevel@tonic-gate	close($VERSIONFH);
735*0Sstevel@tonic-gate	croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
736*0Sstevel@tonic-gate	    unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
737*0Sstevel@tonic-gate    }
738*0Sstevel@tonic-gate    return $UNICODEVERSION;
739*0Sstevel@tonic-gate}
740*0Sstevel@tonic-gate
741*0Sstevel@tonic-gate=head2 Implementation Note
742*0Sstevel@tonic-gate
743*0Sstevel@tonic-gateThe first use of charinfo() opens a read-only filehandle to the Unicode
744*0Sstevel@tonic-gateCharacter Database (the database is included in the Perl distribution).
745*0Sstevel@tonic-gateThe filehandle is then kept open for further queries.  In other words,
746*0Sstevel@tonic-gateif you are wondering where one of your filehandles went, that's where.
747*0Sstevel@tonic-gate
748*0Sstevel@tonic-gate=head1 BUGS
749*0Sstevel@tonic-gate
750*0Sstevel@tonic-gateDoes not yet support EBCDIC platforms.
751*0Sstevel@tonic-gate
752*0Sstevel@tonic-gate=head1 AUTHOR
753*0Sstevel@tonic-gate
754*0Sstevel@tonic-gateJarkko Hietaniemi
755*0Sstevel@tonic-gate
756*0Sstevel@tonic-gate=cut
757*0Sstevel@tonic-gate
758*0Sstevel@tonic-gate1;
759