1package Unicode::UCD; 2 3use strict; 4use warnings; 5 6our $VERSION = '0.24'; 7 8use Storable qw(dclone); 9 10require Exporter; 11 12our @ISA = qw(Exporter); 13 14our @EXPORT_OK = qw(charinfo 15 charblock charscript 16 charblocks charscripts 17 charinrange 18 compexcl 19 casefold casespec 20 namedseq); 21 22use Carp; 23 24=head1 NAME 25 26Unicode::UCD - Unicode character database 27 28=head1 SYNOPSIS 29 30 use Unicode::UCD 'charinfo'; 31 my $charinfo = charinfo($codepoint); 32 33 use Unicode::UCD 'charblock'; 34 my $charblock = charblock($codepoint); 35 36 use Unicode::UCD 'charscript'; 37 my $charscript = charscript($codepoint); 38 39 use Unicode::UCD 'charblocks'; 40 my $charblocks = charblocks(); 41 42 use Unicode::UCD 'charscripts'; 43 my %charscripts = charscripts(); 44 45 use Unicode::UCD qw(charscript charinrange); 46 my $range = charscript($script); 47 print "looks like $script\n" if charinrange($range, $codepoint); 48 49 use Unicode::UCD 'compexcl'; 50 my $compexcl = compexcl($codepoint); 51 52 use Unicode::UCD 'namedseq'; 53 my $namedseq = namedseq($named_sequence_name); 54 55 my $unicode_version = Unicode::UCD::UnicodeVersion(); 56 57=head1 DESCRIPTION 58 59The Unicode::UCD module offers a simple interface to the Unicode 60Character Database. 61 62=cut 63 64my $UNICODEFH; 65my $BLOCKSFH; 66my $SCRIPTSFH; 67my $VERSIONFH; 68my $COMPEXCLFH; 69my $CASEFOLDFH; 70my $CASESPECFH; 71my $NAMEDSEQFH; 72 73sub openunicode { 74 my ($rfh, @path) = @_; 75 my $f; 76 unless (defined $$rfh) { 77 for my $d (@INC) { 78 use File::Spec; 79 $f = File::Spec->catfile($d, "unicore", @path); 80 last if open($$rfh, $f); 81 undef $f; 82 } 83 croak __PACKAGE__, ": failed to find ", 84 File::Spec->catfile(@path), " in @INC" 85 unless defined $f; 86 } 87 return $f; 88} 89 90=head2 charinfo 91 92 use Unicode::UCD 'charinfo'; 93 94 my $charinfo = charinfo(0x41); 95 96charinfo() returns a reference to a hash that has the following fields 97as defined by the Unicode standard: 98 99 key 100 101 code code point with at least four hexdigits 102 name name of the character IN UPPER CASE 103 category general category of the character 104 combining classes used in the Canonical Ordering Algorithm 105 bidi bidirectional category 106 decomposition character decomposition mapping 107 decimal if decimal digit this is the integer numeric value 108 digit if digit this is the numeric value 109 numeric if numeric is the integer or rational numeric value 110 mirrored if mirrored in bidirectional text 111 unicode10 Unicode 1.0 name if existed and different 112 comment ISO 10646 comment field 113 upper uppercase equivalent mapping 114 lower lowercase equivalent mapping 115 title titlecase equivalent mapping 116 117 block block the character belongs to (used in \p{In...}) 118 script script the character belongs to 119 120If no match is found, a reference to an empty hash is returned. 121 122The C<block> property is the same as returned by charinfo(). It is 123not defined in the Unicode Character Database proper (Chapter 4 of the 124Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database 125(Chapter 14 of TUS3). Similarly for the C<script> property. 126 127Note that you cannot do (de)composition and casing based solely on the 128above C<decomposition> and C<lower>, C<upper>, C<title>, properties, 129you will need also the compexcl(), casefold(), and casespec() functions. 130 131=cut 132 133# NB: This function is duplicated in charnames.pm 134sub _getcode { 135 my $arg = shift; 136 137 if ($arg =~ /^[1-9]\d*$/) { 138 return $arg; 139 } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) { 140 return hex($1); 141 } 142 143 return; 144} 145 146# Lingua::KO::Hangul::Util not part of the standard distribution 147# but it will be used if available. 148 149eval { require Lingua::KO::Hangul::Util }; 150my $hasHangulUtil = ! $@; 151if ($hasHangulUtil) { 152 Lingua::KO::Hangul::Util->import(); 153} 154 155sub hangul_decomp { # internal: called from charinfo 156 if ($hasHangulUtil) { 157 my @tmp = decomposeHangul(shift); 158 return sprintf("%04X %04X", @tmp) if @tmp == 2; 159 return sprintf("%04X %04X %04X", @tmp) if @tmp == 3; 160 } 161 return; 162} 163 164sub hangul_charname { # internal: called from charinfo 165 return sprintf("HANGUL SYLLABLE-%04X", shift); 166} 167 168sub han_charname { # internal: called from charinfo 169 return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift); 170} 171 172my @CharinfoRanges = ( 173# block name 174# [ first, last, coderef to name, coderef to decompose ], 175# CJK Ideographs Extension A 176 [ 0x3400, 0x4DB5, \&han_charname, undef ], 177# CJK Ideographs 178 [ 0x4E00, 0x9FA5, \&han_charname, undef ], 179# Hangul Syllables 180 [ 0xAC00, 0xD7A3, $hasHangulUtil ? \&getHangulName : \&hangul_charname, \&hangul_decomp ], 181# Non-Private Use High Surrogates 182 [ 0xD800, 0xDB7F, undef, undef ], 183# Private Use High Surrogates 184 [ 0xDB80, 0xDBFF, undef, undef ], 185# Low Surrogates 186 [ 0xDC00, 0xDFFF, undef, undef ], 187# The Private Use Area 188 [ 0xE000, 0xF8FF, undef, undef ], 189# CJK Ideographs Extension B 190 [ 0x20000, 0x2A6D6, \&han_charname, undef ], 191# Plane 15 Private Use Area 192 [ 0xF0000, 0xFFFFD, undef, undef ], 193# Plane 16 Private Use Area 194 [ 0x100000, 0x10FFFD, undef, undef ], 195); 196 197sub charinfo { 198 my $arg = shift; 199 my $code = _getcode($arg); 200 croak __PACKAGE__, "::charinfo: unknown code '$arg'" 201 unless defined $code; 202 my $hexk = sprintf("%06X", $code); 203 my($rcode,$rname,$rdec); 204 foreach my $range (@CharinfoRanges){ 205 if ($range->[0] <= $code && $code <= $range->[1]) { 206 $rcode = $hexk; 207 $rcode =~ s/^0+//; 208 $rcode = sprintf("%04X", hex($rcode)); 209 $rname = $range->[2] ? $range->[2]->($code) : ''; 210 $rdec = $range->[3] ? $range->[3]->($code) : ''; 211 $hexk = sprintf("%06X", $range->[0]); # replace by the first 212 last; 213 } 214 } 215 openunicode(\$UNICODEFH, "UnicodeData.txt"); 216 if (defined $UNICODEFH) { 217 use Search::Dict 1.02; 218 if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) { 219 my $line = <$UNICODEFH>; 220 return unless defined $line; 221 chomp $line; 222 my %prop; 223 @prop{qw( 224 code name category 225 combining bidi decomposition 226 decimal digit numeric 227 mirrored unicode10 comment 228 upper lower title 229 )} = split(/;/, $line, -1); 230 $hexk =~ s/^0+//; 231 $hexk = sprintf("%04X", hex($hexk)); 232 if ($prop{code} eq $hexk) { 233 $prop{block} = charblock($code); 234 $prop{script} = charscript($code); 235 if(defined $rname){ 236 $prop{code} = $rcode; 237 $prop{name} = $rname; 238 $prop{decomposition} = $rdec; 239 } 240 return \%prop; 241 } 242 } 243 } 244 return; 245} 246 247sub _search { # Binary search in a [[lo,hi,prop],[...],...] table. 248 my ($table, $lo, $hi, $code) = @_; 249 250 return if $lo > $hi; 251 252 my $mid = int(($lo+$hi) / 2); 253 254 if ($table->[$mid]->[0] < $code) { 255 if ($table->[$mid]->[1] >= $code) { 256 return $table->[$mid]->[2]; 257 } else { 258 _search($table, $mid + 1, $hi, $code); 259 } 260 } elsif ($table->[$mid]->[0] > $code) { 261 _search($table, $lo, $mid - 1, $code); 262 } else { 263 return $table->[$mid]->[2]; 264 } 265} 266 267sub charinrange { 268 my ($range, $arg) = @_; 269 my $code = _getcode($arg); 270 croak __PACKAGE__, "::charinrange: unknown code '$arg'" 271 unless defined $code; 272 _search($range, 0, $#$range, $code); 273} 274 275=head2 charblock 276 277 use Unicode::UCD 'charblock'; 278 279 my $charblock = charblock(0x41); 280 my $charblock = charblock(1234); 281 my $charblock = charblock("0x263a"); 282 my $charblock = charblock("U+263a"); 283 284 my $range = charblock('Armenian'); 285 286With a B<code point argument> charblock() returns the I<block> the character 287belongs to, e.g. C<Basic Latin>. Note that not all the character 288positions within all blocks are defined. 289 290See also L</Blocks versus Scripts>. 291 292If supplied with an argument that can't be a code point, charblock() tries 293to do the opposite and interpret the argument as a character block. The 294return value is a I<range>: an anonymous list of lists that contain 295I<start-of-range>, I<end-of-range> code point pairs. You can test whether 296a code point is in a range using the L</charinrange> function. If the 297argument is not a known character block, C<undef> is returned. 298 299=cut 300 301my @BLOCKS; 302my %BLOCKS; 303 304sub _charblocks { 305 unless (@BLOCKS) { 306 if (openunicode(\$BLOCKSFH, "Blocks.txt")) { 307 local $_; 308 while (<$BLOCKSFH>) { 309 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) { 310 my ($lo, $hi) = (hex($1), hex($2)); 311 my $subrange = [ $lo, $hi, $3 ]; 312 push @BLOCKS, $subrange; 313 push @{$BLOCKS{$3}}, $subrange; 314 } 315 } 316 close($BLOCKSFH); 317 } 318 } 319} 320 321sub charblock { 322 my $arg = shift; 323 324 _charblocks() unless @BLOCKS; 325 326 my $code = _getcode($arg); 327 328 if (defined $code) { 329 _search(\@BLOCKS, 0, $#BLOCKS, $code); 330 } else { 331 if (exists $BLOCKS{$arg}) { 332 return dclone $BLOCKS{$arg}; 333 } else { 334 return; 335 } 336 } 337} 338 339=head2 charscript 340 341 use Unicode::UCD 'charscript'; 342 343 my $charscript = charscript(0x41); 344 my $charscript = charscript(1234); 345 my $charscript = charscript("U+263a"); 346 347 my $range = charscript('Thai'); 348 349With a B<code point argument> charscript() returns the I<script> the 350character belongs to, e.g. C<Latin>, C<Greek>, C<Han>. 351 352See also L</Blocks versus Scripts>. 353 354If supplied with an argument that can't be a code point, charscript() tries 355to do the opposite and interpret the argument as a character script. The 356return value is a I<range>: an anonymous list of lists that contain 357I<start-of-range>, I<end-of-range> code point pairs. You can test whether a 358code point is in a range using the L</charinrange> function. If the 359argument is not a known character script, C<undef> is returned. 360 361=cut 362 363my @SCRIPTS; 364my %SCRIPTS; 365 366sub _charscripts { 367 unless (@SCRIPTS) { 368 if (openunicode(\$SCRIPTSFH, "Scripts.txt")) { 369 local $_; 370 while (<$SCRIPTSFH>) { 371 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) { 372 my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1)); 373 my $script = lc($3); 374 $script =~ s/\b(\w)/uc($1)/ge; 375 my $subrange = [ $lo, $hi, $script ]; 376 push @SCRIPTS, $subrange; 377 push @{$SCRIPTS{$script}}, $subrange; 378 } 379 } 380 close($SCRIPTSFH); 381 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS; 382 } 383 } 384} 385 386sub charscript { 387 my $arg = shift; 388 389 _charscripts() unless @SCRIPTS; 390 391 my $code = _getcode($arg); 392 393 if (defined $code) { 394 _search(\@SCRIPTS, 0, $#SCRIPTS, $code); 395 } else { 396 if (exists $SCRIPTS{$arg}) { 397 return dclone $SCRIPTS{$arg}; 398 } else { 399 return; 400 } 401 } 402} 403 404=head2 charblocks 405 406 use Unicode::UCD 'charblocks'; 407 408 my $charblocks = charblocks(); 409 410charblocks() returns a reference to a hash with the known block names 411as the keys, and the code point ranges (see L</charblock>) as the values. 412 413See also L</Blocks versus Scripts>. 414 415=cut 416 417sub charblocks { 418 _charblocks() unless %BLOCKS; 419 return dclone \%BLOCKS; 420} 421 422=head2 charscripts 423 424 use Unicode::UCD 'charscripts'; 425 426 my %charscripts = charscripts(); 427 428charscripts() returns a hash with the known script names as the keys, 429and the code point ranges (see L</charscript>) as the values. 430 431See also L</Blocks versus Scripts>. 432 433=cut 434 435sub charscripts { 436 _charscripts() unless %SCRIPTS; 437 return dclone \%SCRIPTS; 438} 439 440=head2 Blocks versus Scripts 441 442The difference between a block and a script is that scripts are closer 443to the linguistic notion of a set of characters required to present 444languages, while block is more of an artifact of the Unicode character 445numbering and separation into blocks of (mostly) 256 characters. 446 447For example the Latin B<script> is spread over several B<blocks>, such 448as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and 449C<Latin Extended-B>. On the other hand, the Latin script does not 450contain all the characters of the C<Basic Latin> block (also known as 451the ASCII): it includes only the letters, and not, for example, the digits 452or the punctuation. 453 454For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt 455 456For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/ 457 458=head2 Matching Scripts and Blocks 459 460Scripts are matched with the regular-expression construct 461C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script), 462while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches 463any of the 256 code points in the Tibetan block). 464 465=head2 Code Point Arguments 466 467A I<code point argument> is either a decimal or a hexadecimal scalar 468designating a Unicode character, or C<U+> followed by hexadecimals 469designating a Unicode character. In other words, if you want a code 470point to be interpreted as a hexadecimal number, you must prefix it 471with either C<0x> or C<U+>, because a string like e.g. C<123> will 472be interpreted as a decimal code point. Also note that Unicode is 473B<not> limited to 16 bits (the number of Unicode characters is 474open-ended, in theory unlimited): you may have more than 4 hexdigits. 475 476=head2 charinrange 477 478In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you 479can also test whether a code point is in the I<range> as returned by 480L</charblock> and L</charscript> or as the values of the hash returned 481by L</charblocks> and L</charscripts> by using charinrange(): 482 483 use Unicode::UCD qw(charscript charinrange); 484 485 $range = charscript('Hiragana'); 486 print "looks like hiragana\n" if charinrange($range, $codepoint); 487 488=cut 489 490=head2 compexcl 491 492 use Unicode::UCD 'compexcl'; 493 494 my $compexcl = compexcl("09dc"); 495 496The compexcl() returns the composition exclusion (that is, if the 497character should not be produced during a precomposition) of the 498character specified by a B<code point argument>. 499 500If there is a composition exclusion for the character, true is 501returned. Otherwise, false is returned. 502 503=cut 504 505my %COMPEXCL; 506 507sub _compexcl { 508 unless (%COMPEXCL) { 509 if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) { 510 local $_; 511 while (<$COMPEXCLFH>) { 512 if (/^([0-9A-F]+)\s+\#\s+/) { 513 my $code = hex($1); 514 $COMPEXCL{$code} = undef; 515 } 516 } 517 close($COMPEXCLFH); 518 } 519 } 520} 521 522sub compexcl { 523 my $arg = shift; 524 my $code = _getcode($arg); 525 croak __PACKAGE__, "::compexcl: unknown code '$arg'" 526 unless defined $code; 527 528 _compexcl() unless %COMPEXCL; 529 530 return exists $COMPEXCL{$code}; 531} 532 533=head2 casefold 534 535 use Unicode::UCD 'casefold'; 536 537 my $casefold = casefold("00DF"); 538 539The casefold() returns the locale-independent case folding of the 540character specified by a B<code point argument>. 541 542If there is a case folding for that character, a reference to a hash 543with the following fields is returned: 544 545 key 546 547 code code point with at least four hexdigits 548 status "C", "F", "S", or "I" 549 mapping one or more codes separated by spaces 550 551The meaning of the I<status> is as follows: 552 553 C common case folding, common mappings shared 554 by both simple and full mappings 555 F full case folding, mappings that cause strings 556 to grow in length. Multiple characters are separated 557 by spaces 558 S simple case folding, mappings to single characters 559 where different from F 560 I special case for dotted uppercase I and 561 dotless lowercase i 562 - If this mapping is included, the result is 563 case-insensitive, but dotless and dotted I's 564 are not distinguished 565 - If this mapping is excluded, the result is not 566 fully case-insensitive, but dotless and dotted 567 I's are distinguished 568 569If there is no case folding for that character, C<undef> is returned. 570 571For more information about case mappings see 572http://www.unicode.org/unicode/reports/tr21/ 573 574=cut 575 576my %CASEFOLD; 577 578sub _casefold { 579 unless (%CASEFOLD) { 580 if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) { 581 local $_; 582 while (<$CASEFOLDFH>) { 583 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) { 584 my $code = hex($1); 585 $CASEFOLD{$code} = { code => $1, 586 status => $2, 587 mapping => $3 }; 588 } 589 } 590 close($CASEFOLDFH); 591 } 592 } 593} 594 595sub casefold { 596 my $arg = shift; 597 my $code = _getcode($arg); 598 croak __PACKAGE__, "::casefold: unknown code '$arg'" 599 unless defined $code; 600 601 _casefold() unless %CASEFOLD; 602 603 return $CASEFOLD{$code}; 604} 605 606=head2 casespec 607 608 use Unicode::UCD 'casespec'; 609 610 my $casespec = casespec("FB00"); 611 612The casespec() returns the potentially locale-dependent case mapping 613of the character specified by a B<code point argument>. The mapping 614may change the length of the string (which the basic Unicode case 615mappings as returned by charinfo() never do). 616 617If there is a case folding for that character, a reference to a hash 618with the following fields is returned: 619 620 key 621 622 code code point with at least four hexdigits 623 lower lowercase 624 title titlecase 625 upper uppercase 626 condition condition list (may be undef) 627 628The C<condition> is optional. Where present, it consists of one or 629more I<locales> or I<contexts>, separated by spaces (other than as 630used to separate elements, spaces are to be ignored). A condition 631list overrides the normal behavior if all of the listed conditions are 632true. Case distinctions in the condition list are not significant. 633Conditions preceded by "NON_" represent the negation of the condition. 634 635Note that when there are multiple case folding definitions for a 636single code point because of different locales, the value returned by 637casespec() is a hash reference which has the locales as the keys and 638hash references as described above as the values. 639 640A I<locale> is defined as a 2-letter ISO 3166 country code, possibly 641followed by a "_" and a 2-letter ISO language code (possibly followed 642by a "_" and a variant code). You can find the lists of those codes, 643see L<Locale::Country> and L<Locale::Language>. 644 645A I<context> is one of the following choices: 646 647 FINAL The letter is not followed by a letter of 648 general category L (e.g. Ll, Lt, Lu, Lm, or Lo) 649 MODERN The mapping is only used for modern text 650 AFTER_i The last base character was "i" (U+0069) 651 652For more information about case mappings see 653http://www.unicode.org/unicode/reports/tr21/ 654 655=cut 656 657my %CASESPEC; 658 659sub _casespec { 660 unless (%CASESPEC) { 661 if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) { 662 local $_; 663 while (<$CASESPECFH>) { 664 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) { 665 my ($hexcode, $lower, $title, $upper, $condition) = 666 ($1, $2, $3, $4, $5); 667 my $code = hex($hexcode); 668 if (exists $CASESPEC{$code}) { 669 if (exists $CASESPEC{$code}->{code}) { 670 my ($oldlower, 671 $oldtitle, 672 $oldupper, 673 $oldcondition) = 674 @{$CASESPEC{$code}}{qw(lower 675 title 676 upper 677 condition)}; 678 if (defined $oldcondition) { 679 my ($oldlocale) = 680 ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/); 681 delete $CASESPEC{$code}; 682 $CASESPEC{$code}->{$oldlocale} = 683 { code => $hexcode, 684 lower => $oldlower, 685 title => $oldtitle, 686 upper => $oldupper, 687 condition => $oldcondition }; 688 } 689 } 690 my ($locale) = 691 ($condition =~ /^([a-z][a-z](?:_\S+)?)/); 692 $CASESPEC{$code}->{$locale} = 693 { code => $hexcode, 694 lower => $lower, 695 title => $title, 696 upper => $upper, 697 condition => $condition }; 698 } else { 699 $CASESPEC{$code} = 700 { code => $hexcode, 701 lower => $lower, 702 title => $title, 703 upper => $upper, 704 condition => $condition }; 705 } 706 } 707 } 708 close($CASESPECFH); 709 } 710 } 711} 712 713sub casespec { 714 my $arg = shift; 715 my $code = _getcode($arg); 716 croak __PACKAGE__, "::casespec: unknown code '$arg'" 717 unless defined $code; 718 719 _casespec() unless %CASESPEC; 720 721 return ref $CASESPEC{$code} ? dclone $CASESPEC{$code} : $CASESPEC{$code}; 722} 723 724=head2 namedseq() 725 726 use Unicode::UCD 'namedseq'; 727 728 my $namedseq = namedseq("KATAKANA LETTER AINU P"); 729 my @namedseq = namedseq("KATAKANA LETTER AINU P"); 730 my %namedseq = namedseq(); 731 732If used with a single argument in a scalar context, returns the string 733consisting of the code points of the named sequence, or C<undef> if no 734named sequence by that name exists. If used with a single argument in 735a list context, returns list of the code points. If used with no 736arguments in a list context, returns a hash with the names of the 737named sequences as the keys and the named sequences as strings as 738the values. Otherwise, returns C<undef> or empty list depending 739on the context. 740 741(New from Unicode 4.1.0) 742 743=cut 744 745my %NAMEDSEQ; 746 747sub _namedseq { 748 unless (%NAMEDSEQ) { 749 if (openunicode(\$NAMEDSEQFH, "NamedSequences.txt")) { 750 local $_; 751 while (<$NAMEDSEQFH>) { 752 if (/^(.+)\s*;\s*([0-9A-F]+(?: [0-9A-F]+)*)$/) { 753 my ($n, $s) = ($1, $2); 754 my @s = map { chr(hex($_)) } split(' ', $s); 755 $NAMEDSEQ{$n} = join("", @s); 756 } 757 } 758 close($NAMEDSEQFH); 759 } 760 } 761} 762 763sub namedseq { 764 _namedseq() unless %NAMEDSEQ; 765 my $wantarray = wantarray(); 766 if (defined $wantarray) { 767 if ($wantarray) { 768 if (@_ == 0) { 769 return %NAMEDSEQ; 770 } elsif (@_ == 1) { 771 my $s = $NAMEDSEQ{ $_[0] }; 772 return defined $s ? map { ord($_) } split('', $s) : (); 773 } 774 } elsif (@_ == 1) { 775 return $NAMEDSEQ{ $_[0] }; 776 } 777 } 778 return; 779} 780 781=head2 Unicode::UCD::UnicodeVersion 782 783Unicode::UCD::UnicodeVersion() returns the version of the Unicode 784Character Database, in other words, the version of the Unicode 785standard the database implements. The version is a string 786of numbers delimited by dots (C<'.'>). 787 788=cut 789 790my $UNICODEVERSION; 791 792sub UnicodeVersion { 793 unless (defined $UNICODEVERSION) { 794 openunicode(\$VERSIONFH, "version"); 795 chomp($UNICODEVERSION = <$VERSIONFH>); 796 close($VERSIONFH); 797 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'" 798 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/; 799 } 800 return $UNICODEVERSION; 801} 802 803=head2 Implementation Note 804 805The first use of charinfo() opens a read-only filehandle to the Unicode 806Character Database (the database is included in the Perl distribution). 807The filehandle is then kept open for further queries. In other words, 808if you are wondering where one of your filehandles went, that's where. 809 810=head1 BUGS 811 812Does not yet support EBCDIC platforms. 813 814=head1 AUTHOR 815 816Jarkko Hietaniemi 817 818=cut 819 8201; 821