1package Unicode::UCD; 2 3use strict; 4use warnings; 5 6our $VERSION = '0.25'; 7 8use Storable qw(dclone); 9 10require Exporter; 11 12our @ISA = qw(Exporter); 13 14our @EXPORT_OK = qw(charinfo 15 charblock charscript 16 charblocks charscripts 17 charinrange 18 general_categories bidi_types 19 compexcl 20 casefold casespec 21 namedseq); 22 23use Carp; 24 25=head1 NAME 26 27Unicode::UCD - Unicode character database 28 29=head1 SYNOPSIS 30 31 use Unicode::UCD 'charinfo'; 32 my $charinfo = charinfo($codepoint); 33 34 use Unicode::UCD 'charblock'; 35 my $charblock = charblock($codepoint); 36 37 use Unicode::UCD 'charscript'; 38 my $charscript = charscript($codepoint); 39 40 use Unicode::UCD 'charblocks'; 41 my $charblocks = charblocks(); 42 43 use Unicode::UCD 'charscripts'; 44 my $charscripts = charscripts(); 45 46 use Unicode::UCD qw(charscript charinrange); 47 my $range = charscript($script); 48 print "looks like $script\n" if charinrange($range, $codepoint); 49 50 use Unicode::UCD qw(general_categories bidi_types); 51 my $categories = general_categories(); 52 my $types = bidi_types(); 53 54 use Unicode::UCD 'compexcl'; 55 my $compexcl = compexcl($codepoint); 56 57 use Unicode::UCD 'namedseq'; 58 my $namedseq = namedseq($named_sequence_name); 59 60 my $unicode_version = Unicode::UCD::UnicodeVersion(); 61 62=head1 DESCRIPTION 63 64The Unicode::UCD module offers a simple interface to the Unicode 65Character Database. 66 67=cut 68 69my $UNICODEFH; 70my $BLOCKSFH; 71my $SCRIPTSFH; 72my $VERSIONFH; 73my $COMPEXCLFH; 74my $CASEFOLDFH; 75my $CASESPECFH; 76my $NAMEDSEQFH; 77 78sub openunicode { 79 my ($rfh, @path) = @_; 80 my $f; 81 unless (defined $$rfh) { 82 for my $d (@INC) { 83 use File::Spec; 84 $f = File::Spec->catfile($d, "unicore", @path); 85 last if open($$rfh, $f); 86 undef $f; 87 } 88 croak __PACKAGE__, ": failed to find ", 89 File::Spec->catfile(@path), " in @INC" 90 unless defined $f; 91 } 92 return $f; 93} 94 95=head2 charinfo 96 97 use Unicode::UCD 'charinfo'; 98 99 my $charinfo = charinfo(0x41); 100 101charinfo() returns a reference to a hash that has the following fields 102as defined by the Unicode standard: 103 104 key 105 106 code code point with at least four hexdigits 107 name name of the character IN UPPER CASE 108 category general category of the character 109 combining classes used in the Canonical Ordering Algorithm 110 bidi bidirectional type 111 decomposition character decomposition mapping 112 decimal if decimal digit this is the integer numeric value 113 digit if digit this is the numeric value 114 numeric if numeric is the integer or rational numeric value 115 mirrored if mirrored in bidirectional text 116 unicode10 Unicode 1.0 name if existed and different 117 comment ISO 10646 comment field 118 upper uppercase equivalent mapping 119 lower lowercase equivalent mapping 120 title titlecase equivalent mapping 121 122 block block the character belongs to (used in \p{In...}) 123 script script the character belongs to 124 125If no match is found, a reference to an empty hash is returned. 126 127The C<block> property is the same as returned by charinfo(). It is 128not defined in the Unicode Character Database proper (Chapter 4 of the 129Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database 130(Chapter 14 of TUS3). Similarly for the C<script> property. 131 132Note that you cannot do (de)composition and casing based solely on the 133above C<decomposition> and C<lower>, C<upper>, C<title>, properties, 134you will need also the compexcl(), casefold(), and casespec() functions. 135 136=cut 137 138# NB: This function is duplicated in charnames.pm 139sub _getcode { 140 my $arg = shift; 141 142 if ($arg =~ /^[1-9]\d*$/) { 143 return $arg; 144 } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) { 145 return hex($1); 146 } 147 148 return; 149} 150 151# Lingua::KO::Hangul::Util not part of the standard distribution 152# but it will be used if available. 153 154eval { require Lingua::KO::Hangul::Util }; 155my $hasHangulUtil = ! $@; 156if ($hasHangulUtil) { 157 Lingua::KO::Hangul::Util->import(); 158} 159 160sub hangul_decomp { # internal: called from charinfo 161 if ($hasHangulUtil) { 162 my @tmp = decomposeHangul(shift); 163 return sprintf("%04X %04X", @tmp) if @tmp == 2; 164 return sprintf("%04X %04X %04X", @tmp) if @tmp == 3; 165 } 166 return; 167} 168 169sub hangul_charname { # internal: called from charinfo 170 return sprintf("HANGUL SYLLABLE-%04X", shift); 171} 172 173sub han_charname { # internal: called from charinfo 174 return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift); 175} 176 177my @CharinfoRanges = ( 178# block name 179# [ first, last, coderef to name, coderef to decompose ], 180# CJK Ideographs Extension A 181 [ 0x3400, 0x4DB5, \&han_charname, undef ], 182# CJK Ideographs 183 [ 0x4E00, 0x9FA5, \&han_charname, undef ], 184# Hangul Syllables 185 [ 0xAC00, 0xD7A3, $hasHangulUtil ? \&getHangulName : \&hangul_charname, \&hangul_decomp ], 186# Non-Private Use High Surrogates 187 [ 0xD800, 0xDB7F, undef, undef ], 188# Private Use High Surrogates 189 [ 0xDB80, 0xDBFF, undef, undef ], 190# Low Surrogates 191 [ 0xDC00, 0xDFFF, undef, undef ], 192# The Private Use Area 193 [ 0xE000, 0xF8FF, undef, undef ], 194# CJK Ideographs Extension B 195 [ 0x20000, 0x2A6D6, \&han_charname, undef ], 196# Plane 15 Private Use Area 197 [ 0xF0000, 0xFFFFD, undef, undef ], 198# Plane 16 Private Use Area 199 [ 0x100000, 0x10FFFD, undef, undef ], 200); 201 202sub charinfo { 203 my $arg = shift; 204 my $code = _getcode($arg); 205 croak __PACKAGE__, "::charinfo: unknown code '$arg'" 206 unless defined $code; 207 my $hexk = sprintf("%06X", $code); 208 my($rcode,$rname,$rdec); 209 foreach my $range (@CharinfoRanges){ 210 if ($range->[0] <= $code && $code <= $range->[1]) { 211 $rcode = $hexk; 212 $rcode =~ s/^0+//; 213 $rcode = sprintf("%04X", hex($rcode)); 214 $rname = $range->[2] ? $range->[2]->($code) : ''; 215 $rdec = $range->[3] ? $range->[3]->($code) : ''; 216 $hexk = sprintf("%06X", $range->[0]); # replace by the first 217 last; 218 } 219 } 220 openunicode(\$UNICODEFH, "UnicodeData.txt"); 221 if (defined $UNICODEFH) { 222 use Search::Dict 1.02; 223 if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) { 224 my $line = <$UNICODEFH>; 225 return unless defined $line; 226 chomp $line; 227 my %prop; 228 @prop{qw( 229 code name category 230 combining bidi decomposition 231 decimal digit numeric 232 mirrored unicode10 comment 233 upper lower title 234 )} = split(/;/, $line, -1); 235 $hexk =~ s/^0+//; 236 $hexk = sprintf("%04X", hex($hexk)); 237 if ($prop{code} eq $hexk) { 238 $prop{block} = charblock($code); 239 $prop{script} = charscript($code); 240 if(defined $rname){ 241 $prop{code} = $rcode; 242 $prop{name} = $rname; 243 $prop{decomposition} = $rdec; 244 } 245 return \%prop; 246 } 247 } 248 } 249 return; 250} 251 252sub _search { # Binary search in a [[lo,hi,prop],[...],...] table. 253 my ($table, $lo, $hi, $code) = @_; 254 255 return if $lo > $hi; 256 257 my $mid = int(($lo+$hi) / 2); 258 259 if ($table->[$mid]->[0] < $code) { 260 if ($table->[$mid]->[1] >= $code) { 261 return $table->[$mid]->[2]; 262 } else { 263 _search($table, $mid + 1, $hi, $code); 264 } 265 } elsif ($table->[$mid]->[0] > $code) { 266 _search($table, $lo, $mid - 1, $code); 267 } else { 268 return $table->[$mid]->[2]; 269 } 270} 271 272sub charinrange { 273 my ($range, $arg) = @_; 274 my $code = _getcode($arg); 275 croak __PACKAGE__, "::charinrange: unknown code '$arg'" 276 unless defined $code; 277 _search($range, 0, $#$range, $code); 278} 279 280=head2 charblock 281 282 use Unicode::UCD 'charblock'; 283 284 my $charblock = charblock(0x41); 285 my $charblock = charblock(1234); 286 my $charblock = charblock("0x263a"); 287 my $charblock = charblock("U+263a"); 288 289 my $range = charblock('Armenian'); 290 291With a B<code point argument> charblock() returns the I<block> the character 292belongs to, e.g. C<Basic Latin>. Note that not all the character 293positions within all blocks are defined. 294 295See also L</Blocks versus Scripts>. 296 297If supplied with an argument that can't be a code point, charblock() tries 298to do the opposite and interpret the argument as a character block. The 299return value is a I<range>: an anonymous list of lists that contain 300I<start-of-range>, I<end-of-range> code point pairs. You can test whether 301a code point is in a range using the L</charinrange> function. If the 302argument is not a known character block, C<undef> is returned. 303 304=cut 305 306my @BLOCKS; 307my %BLOCKS; 308 309sub _charblocks { 310 unless (@BLOCKS) { 311 if (openunicode(\$BLOCKSFH, "Blocks.txt")) { 312 local $_; 313 while (<$BLOCKSFH>) { 314 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) { 315 my ($lo, $hi) = (hex($1), hex($2)); 316 my $subrange = [ $lo, $hi, $3 ]; 317 push @BLOCKS, $subrange; 318 push @{$BLOCKS{$3}}, $subrange; 319 } 320 } 321 close($BLOCKSFH); 322 } 323 } 324} 325 326sub charblock { 327 my $arg = shift; 328 329 _charblocks() unless @BLOCKS; 330 331 my $code = _getcode($arg); 332 333 if (defined $code) { 334 _search(\@BLOCKS, 0, $#BLOCKS, $code); 335 } else { 336 if (exists $BLOCKS{$arg}) { 337 return dclone $BLOCKS{$arg}; 338 } else { 339 return; 340 } 341 } 342} 343 344=head2 charscript 345 346 use Unicode::UCD 'charscript'; 347 348 my $charscript = charscript(0x41); 349 my $charscript = charscript(1234); 350 my $charscript = charscript("U+263a"); 351 352 my $range = charscript('Thai'); 353 354With a B<code point argument> charscript() returns the I<script> the 355character belongs to, e.g. C<Latin>, C<Greek>, C<Han>. 356 357See also L</Blocks versus Scripts>. 358 359If supplied with an argument that can't be a code point, charscript() tries 360to do the opposite and interpret the argument as a character script. The 361return value is a I<range>: an anonymous list of lists that contain 362I<start-of-range>, I<end-of-range> code point pairs. You can test whether a 363code point is in a range using the L</charinrange> function. If the 364argument is not a known character script, C<undef> is returned. 365 366=cut 367 368my @SCRIPTS; 369my %SCRIPTS; 370 371sub _charscripts { 372 unless (@SCRIPTS) { 373 if (openunicode(\$SCRIPTSFH, "Scripts.txt")) { 374 local $_; 375 while (<$SCRIPTSFH>) { 376 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) { 377 my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1)); 378 my $script = lc($3); 379 $script =~ s/\b(\w)/uc($1)/ge; 380 my $subrange = [ $lo, $hi, $script ]; 381 push @SCRIPTS, $subrange; 382 push @{$SCRIPTS{$script}}, $subrange; 383 } 384 } 385 close($SCRIPTSFH); 386 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS; 387 } 388 } 389} 390 391sub charscript { 392 my $arg = shift; 393 394 _charscripts() unless @SCRIPTS; 395 396 my $code = _getcode($arg); 397 398 if (defined $code) { 399 _search(\@SCRIPTS, 0, $#SCRIPTS, $code); 400 } else { 401 if (exists $SCRIPTS{$arg}) { 402 return dclone $SCRIPTS{$arg}; 403 } else { 404 return; 405 } 406 } 407} 408 409=head2 charblocks 410 411 use Unicode::UCD 'charblocks'; 412 413 my $charblocks = charblocks(); 414 415charblocks() returns a reference to a hash with the known block names 416as the keys, and the code point ranges (see L</charblock>) as the values. 417 418See also L</Blocks versus Scripts>. 419 420=cut 421 422sub charblocks { 423 _charblocks() unless %BLOCKS; 424 return dclone \%BLOCKS; 425} 426 427=head2 charscripts 428 429 use Unicode::UCD 'charscripts'; 430 431 my $charscripts = charscripts(); 432 433charscripts() returns a reference to a hash with the known script 434names as the keys, and the code point ranges (see L</charscript>) as 435the values. 436 437See also L</Blocks versus Scripts>. 438 439=cut 440 441sub charscripts { 442 _charscripts() unless %SCRIPTS; 443 return dclone \%SCRIPTS; 444} 445 446=head2 Blocks versus Scripts 447 448The difference between a block and a script is that scripts are closer 449to the linguistic notion of a set of characters required to present 450languages, while block is more of an artifact of the Unicode character 451numbering and separation into blocks of (mostly) 256 characters. 452 453For example the Latin B<script> is spread over several B<blocks>, such 454as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and 455C<Latin Extended-B>. On the other hand, the Latin script does not 456contain all the characters of the C<Basic Latin> block (also known as 457the ASCII): it includes only the letters, and not, for example, the digits 458or the punctuation. 459 460For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt 461 462For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/ 463 464=head2 Matching Scripts and Blocks 465 466Scripts are matched with the regular-expression construct 467C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script), 468while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches 469any of the 256 code points in the Tibetan block). 470 471=head2 Code Point Arguments 472 473A I<code point argument> is either a decimal or a hexadecimal scalar 474designating a Unicode character, or C<U+> followed by hexadecimals 475designating a Unicode character. In other words, if you want a code 476point to be interpreted as a hexadecimal number, you must prefix it 477with either C<0x> or C<U+>, because a string like e.g. C<123> will 478be interpreted as a decimal code point. Also note that Unicode is 479B<not> limited to 16 bits (the number of Unicode characters is 480open-ended, in theory unlimited): you may have more than 4 hexdigits. 481 482=head2 charinrange 483 484In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you 485can also test whether a code point is in the I<range> as returned by 486L</charblock> and L</charscript> or as the values of the hash returned 487by L</charblocks> and L</charscripts> by using charinrange(): 488 489 use Unicode::UCD qw(charscript charinrange); 490 491 $range = charscript('Hiragana'); 492 print "looks like hiragana\n" if charinrange($range, $codepoint); 493 494=cut 495 496my %GENERAL_CATEGORIES = 497 ( 498 'L' => 'Letter', 499 'LC' => 'CasedLetter', 500 'Lu' => 'UppercaseLetter', 501 'Ll' => 'LowercaseLetter', 502 'Lt' => 'TitlecaseLetter', 503 'Lm' => 'ModifierLetter', 504 'Lo' => 'OtherLetter', 505 'M' => 'Mark', 506 'Mn' => 'NonspacingMark', 507 'Mc' => 'SpacingMark', 508 'Me' => 'EnclosingMark', 509 'N' => 'Number', 510 'Nd' => 'DecimalNumber', 511 'Nl' => 'LetterNumber', 512 'No' => 'OtherNumber', 513 'P' => 'Punctuation', 514 'Pc' => 'ConnectorPunctuation', 515 'Pd' => 'DashPunctuation', 516 'Ps' => 'OpenPunctuation', 517 'Pe' => 'ClosePunctuation', 518 'Pi' => 'InitialPunctuation', 519 'Pf' => 'FinalPunctuation', 520 'Po' => 'OtherPunctuation', 521 'S' => 'Symbol', 522 'Sm' => 'MathSymbol', 523 'Sc' => 'CurrencySymbol', 524 'Sk' => 'ModifierSymbol', 525 'So' => 'OtherSymbol', 526 'Z' => 'Separator', 527 'Zs' => 'SpaceSeparator', 528 'Zl' => 'LineSeparator', 529 'Zp' => 'ParagraphSeparator', 530 'C' => 'Other', 531 'Cc' => 'Control', 532 'Cf' => 'Format', 533 'Cs' => 'Surrogate', 534 'Co' => 'PrivateUse', 535 'Cn' => 'Unassigned', 536 ); 537 538sub general_categories { 539 return dclone \%GENERAL_CATEGORIES; 540} 541 542=head2 general_categories 543 544 use Unicode::UCD 'general_categories'; 545 546 my $categories = general_categories(); 547 548The general_categories() returns a reference to a hash which has short 549general category names (such as C<Lu>, C<Nd>, C<Zs>, C<S>) as keys and long 550names (such as C<UppercaseLetter>, C<DecimalNumber>, C<SpaceSeparator>, 551C<Symbol>) as values. The hash is reversible in case you need to go 552from the long names to the short names. The general category is the 553one returned from charinfo() under the C<category> key. 554 555=cut 556 557my %BIDI_TYPES = 558 ( 559 'L' => 'Left-to-Right', 560 'LRE' => 'Left-to-Right Embedding', 561 'LRO' => 'Left-to-Right Override', 562 'R' => 'Right-to-Left', 563 'AL' => 'Right-to-Left Arabic', 564 'RLE' => 'Right-to-Left Embedding', 565 'RLO' => 'Right-to-Left Override', 566 'PDF' => 'Pop Directional Format', 567 'EN' => 'European Number', 568 'ES' => 'European Number Separator', 569 'ET' => 'European Number Terminator', 570 'AN' => 'Arabic Number', 571 'CS' => 'Common Number Separator', 572 'NSM' => 'Non-Spacing Mark', 573 'BN' => 'Boundary Neutral', 574 'B' => 'Paragraph Separator', 575 'S' => 'Segment Separator', 576 'WS' => 'Whitespace', 577 'ON' => 'Other Neutrals', 578 ); 579 580sub bidi_types { 581 return dclone \%BIDI_TYPES; 582} 583 584=head2 bidi_types 585 586 use Unicode::UCD 'bidi_types'; 587 588 my $categories = bidi_types(); 589 590The bidi_types() returns a reference to a hash which has the short 591bidi (bidirectional) type names (such as C<L>, C<R>) as keys and long 592names (such as C<Left-to-Right>, C<Right-to-Left>) as values. The 593hash is reversible in case you need to go from the long names to the 594short names. The bidi type is the one returned from charinfo() 595under the C<bidi> key. For the exact meaning of the various bidi classes 596the Unicode TR9 is recommended reading: 597http://www.unicode.org/reports/tr9/tr9-17.html 598(as of Unicode 5.0.0) 599 600=cut 601 602=head2 compexcl 603 604 use Unicode::UCD 'compexcl'; 605 606 my $compexcl = compexcl("09dc"); 607 608The compexcl() returns the composition exclusion (that is, if the 609character should not be produced during a precomposition) of the 610character specified by a B<code point argument>. 611 612If there is a composition exclusion for the character, true is 613returned. Otherwise, false is returned. 614 615=cut 616 617my %COMPEXCL; 618 619sub _compexcl { 620 unless (%COMPEXCL) { 621 if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) { 622 local $_; 623 while (<$COMPEXCLFH>) { 624 if (/^([0-9A-F]+)\s+\#\s+/) { 625 my $code = hex($1); 626 $COMPEXCL{$code} = undef; 627 } 628 } 629 close($COMPEXCLFH); 630 } 631 } 632} 633 634sub compexcl { 635 my $arg = shift; 636 my $code = _getcode($arg); 637 croak __PACKAGE__, "::compexcl: unknown code '$arg'" 638 unless defined $code; 639 640 _compexcl() unless %COMPEXCL; 641 642 return exists $COMPEXCL{$code}; 643} 644 645=head2 casefold 646 647 use Unicode::UCD 'casefold'; 648 649 my $casefold = casefold("00DF"); 650 651The casefold() returns the locale-independent case folding of the 652character specified by a B<code point argument>. 653 654If there is a case folding for that character, a reference to a hash 655with the following fields is returned: 656 657 key 658 659 code code point with at least four hexdigits 660 status "C", "F", "S", or "I" 661 mapping one or more codes separated by spaces 662 663The meaning of the I<status> is as follows: 664 665 C common case folding, common mappings shared 666 by both simple and full mappings 667 F full case folding, mappings that cause strings 668 to grow in length. Multiple characters are separated 669 by spaces 670 S simple case folding, mappings to single characters 671 where different from F 672 I special case for dotted uppercase I and 673 dotless lowercase i 674 - If this mapping is included, the result is 675 case-insensitive, but dotless and dotted I's 676 are not distinguished 677 - If this mapping is excluded, the result is not 678 fully case-insensitive, but dotless and dotted 679 I's are distinguished 680 681If there is no case folding for that character, C<undef> is returned. 682 683For more information about case mappings see 684http://www.unicode.org/unicode/reports/tr21/ 685 686=cut 687 688my %CASEFOLD; 689 690sub _casefold { 691 unless (%CASEFOLD) { 692 if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) { 693 local $_; 694 while (<$CASEFOLDFH>) { 695 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) { 696 my $code = hex($1); 697 $CASEFOLD{$code} = { code => $1, 698 status => $2, 699 mapping => $3 }; 700 } 701 } 702 close($CASEFOLDFH); 703 } 704 } 705} 706 707sub casefold { 708 my $arg = shift; 709 my $code = _getcode($arg); 710 croak __PACKAGE__, "::casefold: unknown code '$arg'" 711 unless defined $code; 712 713 _casefold() unless %CASEFOLD; 714 715 return $CASEFOLD{$code}; 716} 717 718=head2 casespec 719 720 use Unicode::UCD 'casespec'; 721 722 my $casespec = casespec("FB00"); 723 724The casespec() returns the potentially locale-dependent case mapping 725of the character specified by a B<code point argument>. The mapping 726may change the length of the string (which the basic Unicode case 727mappings as returned by charinfo() never do). 728 729If there is a case folding for that character, a reference to a hash 730with the following fields is returned: 731 732 key 733 734 code code point with at least four hexdigits 735 lower lowercase 736 title titlecase 737 upper uppercase 738 condition condition list (may be undef) 739 740The C<condition> is optional. Where present, it consists of one or 741more I<locales> or I<contexts>, separated by spaces (other than as 742used to separate elements, spaces are to be ignored). A condition 743list overrides the normal behavior if all of the listed conditions are 744true. Case distinctions in the condition list are not significant. 745Conditions preceded by "NON_" represent the negation of the condition. 746 747Note that when there are multiple case folding definitions for a 748single code point because of different locales, the value returned by 749casespec() is a hash reference which has the locales as the keys and 750hash references as described above as the values. 751 752A I<locale> is defined as a 2-letter ISO 3166 country code, possibly 753followed by a "_" and a 2-letter ISO language code (possibly followed 754by a "_" and a variant code). You can find the lists of those codes, 755see L<Locale::Country> and L<Locale::Language>. 756 757A I<context> is one of the following choices: 758 759 FINAL The letter is not followed by a letter of 760 general category L (e.g. Ll, Lt, Lu, Lm, or Lo) 761 MODERN The mapping is only used for modern text 762 AFTER_i The last base character was "i" (U+0069) 763 764For more information about case mappings see 765http://www.unicode.org/unicode/reports/tr21/ 766 767=cut 768 769my %CASESPEC; 770 771sub _casespec { 772 unless (%CASESPEC) { 773 if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) { 774 local $_; 775 while (<$CASESPECFH>) { 776 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) { 777 my ($hexcode, $lower, $title, $upper, $condition) = 778 ($1, $2, $3, $4, $5); 779 my $code = hex($hexcode); 780 if (exists $CASESPEC{$code}) { 781 if (exists $CASESPEC{$code}->{code}) { 782 my ($oldlower, 783 $oldtitle, 784 $oldupper, 785 $oldcondition) = 786 @{$CASESPEC{$code}}{qw(lower 787 title 788 upper 789 condition)}; 790 if (defined $oldcondition) { 791 my ($oldlocale) = 792 ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/); 793 delete $CASESPEC{$code}; 794 $CASESPEC{$code}->{$oldlocale} = 795 { code => $hexcode, 796 lower => $oldlower, 797 title => $oldtitle, 798 upper => $oldupper, 799 condition => $oldcondition }; 800 } 801 } 802 my ($locale) = 803 ($condition =~ /^([a-z][a-z](?:_\S+)?)/); 804 $CASESPEC{$code}->{$locale} = 805 { code => $hexcode, 806 lower => $lower, 807 title => $title, 808 upper => $upper, 809 condition => $condition }; 810 } else { 811 $CASESPEC{$code} = 812 { code => $hexcode, 813 lower => $lower, 814 title => $title, 815 upper => $upper, 816 condition => $condition }; 817 } 818 } 819 } 820 close($CASESPECFH); 821 } 822 } 823} 824 825sub casespec { 826 my $arg = shift; 827 my $code = _getcode($arg); 828 croak __PACKAGE__, "::casespec: unknown code '$arg'" 829 unless defined $code; 830 831 _casespec() unless %CASESPEC; 832 833 return ref $CASESPEC{$code} ? dclone $CASESPEC{$code} : $CASESPEC{$code}; 834} 835 836=head2 namedseq() 837 838 use Unicode::UCD 'namedseq'; 839 840 my $namedseq = namedseq("KATAKANA LETTER AINU P"); 841 my @namedseq = namedseq("KATAKANA LETTER AINU P"); 842 my %namedseq = namedseq(); 843 844If used with a single argument in a scalar context, returns the string 845consisting of the code points of the named sequence, or C<undef> if no 846named sequence by that name exists. If used with a single argument in 847a list context, returns list of the code points. If used with no 848arguments in a list context, returns a hash with the names of the 849named sequences as the keys and the named sequences as strings as 850the values. Otherwise, returns C<undef> or empty list depending 851on the context. 852 853(New from Unicode 4.1.0) 854 855=cut 856 857my %NAMEDSEQ; 858 859sub _namedseq { 860 unless (%NAMEDSEQ) { 861 if (openunicode(\$NAMEDSEQFH, "NamedSequences.txt")) { 862 local $_; 863 while (<$NAMEDSEQFH>) { 864 if (/^(.+)\s*;\s*([0-9A-F]+(?: [0-9A-F]+)*)$/) { 865 my ($n, $s) = ($1, $2); 866 my @s = map { chr(hex($_)) } split(' ', $s); 867 $NAMEDSEQ{$n} = join("", @s); 868 } 869 } 870 close($NAMEDSEQFH); 871 } 872 } 873} 874 875sub namedseq { 876 _namedseq() unless %NAMEDSEQ; 877 my $wantarray = wantarray(); 878 if (defined $wantarray) { 879 if ($wantarray) { 880 if (@_ == 0) { 881 return %NAMEDSEQ; 882 } elsif (@_ == 1) { 883 my $s = $NAMEDSEQ{ $_[0] }; 884 return defined $s ? map { ord($_) } split('', $s) : (); 885 } 886 } elsif (@_ == 1) { 887 return $NAMEDSEQ{ $_[0] }; 888 } 889 } 890 return; 891} 892 893=head2 Unicode::UCD::UnicodeVersion 894 895Unicode::UCD::UnicodeVersion() returns the version of the Unicode 896Character Database, in other words, the version of the Unicode 897standard the database implements. The version is a string 898of numbers delimited by dots (C<'.'>). 899 900=cut 901 902my $UNICODEVERSION; 903 904sub UnicodeVersion { 905 unless (defined $UNICODEVERSION) { 906 openunicode(\$VERSIONFH, "version"); 907 chomp($UNICODEVERSION = <$VERSIONFH>); 908 close($VERSIONFH); 909 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'" 910 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/; 911 } 912 return $UNICODEVERSION; 913} 914 915=head2 Implementation Note 916 917The first use of charinfo() opens a read-only filehandle to the Unicode 918Character Database (the database is included in the Perl distribution). 919The filehandle is then kept open for further queries. In other words, 920if you are wondering where one of your filehandles went, that's where. 921 922=head1 BUGS 923 924Does not yet support EBCDIC platforms. 925 926=head1 AUTHOR 927 928Jarkko Hietaniemi 929 930=cut 931 9321; 933