1package Unicode::UCD; 2 3use strict; 4use warnings; 5 6our $VERSION = '0.22'; 7 8use Storable qw(dclone); 9 10require Exporter; 11 12our @ISA = qw(Exporter); 13 14our @EXPORT_OK = qw(charinfo 15 charblock charscript 16 charblocks charscripts 17 charinrange 18 compexcl 19 casefold casespec); 20 21use Carp; 22 23=head1 NAME 24 25Unicode::UCD - Unicode character database 26 27=head1 SYNOPSIS 28 29 use Unicode::UCD 'charinfo'; 30 my $charinfo = charinfo($codepoint); 31 32 use Unicode::UCD 'charblock'; 33 my $charblock = charblock($codepoint); 34 35 use Unicode::UCD 'charscript'; 36 my $charscript = charscript($codepoint); 37 38 use Unicode::UCD 'charblocks'; 39 my $charblocks = charblocks(); 40 41 use Unicode::UCD 'charscripts'; 42 my %charscripts = charscripts(); 43 44 use Unicode::UCD qw(charscript charinrange); 45 my $range = charscript($script); 46 print "looks like $script\n" if charinrange($range, $codepoint); 47 48 use Unicode::UCD 'compexcl'; 49 my $compexcl = compexcl($codepoint); 50 51 my $unicode_version = Unicode::UCD::UnicodeVersion(); 52 53=head1 DESCRIPTION 54 55The Unicode::UCD module offers a simple interface to the Unicode 56Character Database. 57 58=cut 59 60my $UNICODEFH; 61my $BLOCKSFH; 62my $SCRIPTSFH; 63my $VERSIONFH; 64my $COMPEXCLFH; 65my $CASEFOLDFH; 66my $CASESPECFH; 67 68sub openunicode { 69 my ($rfh, @path) = @_; 70 my $f; 71 unless (defined $$rfh) { 72 for my $d (@INC) { 73 use File::Spec; 74 $f = File::Spec->catfile($d, "unicore", @path); 75 last if open($$rfh, $f); 76 undef $f; 77 } 78 croak __PACKAGE__, ": failed to find ", 79 File::Spec->catfile(@path), " in @INC" 80 unless defined $f; 81 } 82 return $f; 83} 84 85=head2 charinfo 86 87 use Unicode::UCD 'charinfo'; 88 89 my $charinfo = charinfo(0x41); 90 91charinfo() returns a reference to a hash that has the following fields 92as defined by the Unicode standard: 93 94 key 95 96 code code point with at least four hexdigits 97 name name of the character IN UPPER CASE 98 category general category of the character 99 combining classes used in the Canonical Ordering Algorithm 100 bidi bidirectional category 101 decomposition character decomposition mapping 102 decimal if decimal digit this is the integer numeric value 103 digit if digit this is the numeric value 104 numeric if numeric is the integer or rational numeric value 105 mirrored if mirrored in bidirectional text 106 unicode10 Unicode 1.0 name if existed and different 107 comment ISO 10646 comment field 108 upper uppercase equivalent mapping 109 lower lowercase equivalent mapping 110 title titlecase equivalent mapping 111 112 block block the character belongs to (used in \p{In...}) 113 script script the character belongs to 114 115If no match is found, a reference to an empty hash is returned. 116 117The C<block> property is the same as returned by charinfo(). It is 118not defined in the Unicode Character Database proper (Chapter 4 of the 119Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database 120(Chapter 14 of TUS3). Similarly for the C<script> property. 121 122Note that you cannot do (de)composition and casing based solely on the 123above C<decomposition> and C<lower>, C<upper>, C<title>, properties, 124you will need also the compexcl(), casefold(), and casespec() functions. 125 126=cut 127 128# NB: This function is duplicated in charnames.pm 129sub _getcode { 130 my $arg = shift; 131 132 if ($arg =~ /^[1-9]\d*$/) { 133 return $arg; 134 } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) { 135 return hex($1); 136 } 137 138 return; 139} 140 141# Lingua::KO::Hangul::Util not part of the standard distribution 142# but it will be used if available. 143 144eval { require Lingua::KO::Hangul::Util }; 145my $hasHangulUtil = ! $@; 146if ($hasHangulUtil) { 147 Lingua::KO::Hangul::Util->import(); 148} 149 150sub hangul_decomp { # internal: called from charinfo 151 if ($hasHangulUtil) { 152 my @tmp = decomposeHangul(shift); 153 return sprintf("%04X %04X", @tmp) if @tmp == 2; 154 return sprintf("%04X %04X %04X", @tmp) if @tmp == 3; 155 } 156 return; 157} 158 159sub hangul_charname { # internal: called from charinfo 160 return sprintf("HANGUL SYLLABLE-%04X", shift); 161} 162 163sub han_charname { # internal: called from charinfo 164 return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift); 165} 166 167my @CharinfoRanges = ( 168# block name 169# [ first, last, coderef to name, coderef to decompose ], 170# CJK Ideographs Extension A 171 [ 0x3400, 0x4DB5, \&han_charname, undef ], 172# CJK Ideographs 173 [ 0x4E00, 0x9FA5, \&han_charname, undef ], 174# Hangul Syllables 175 [ 0xAC00, 0xD7A3, $hasHangulUtil ? \&getHangulName : \&hangul_charname, \&hangul_decomp ], 176# Non-Private Use High Surrogates 177 [ 0xD800, 0xDB7F, undef, undef ], 178# Private Use High Surrogates 179 [ 0xDB80, 0xDBFF, undef, undef ], 180# Low Surrogates 181 [ 0xDC00, 0xDFFF, undef, undef ], 182# The Private Use Area 183 [ 0xE000, 0xF8FF, undef, undef ], 184# CJK Ideographs Extension B 185 [ 0x20000, 0x2A6D6, \&han_charname, undef ], 186# Plane 15 Private Use Area 187 [ 0xF0000, 0xFFFFD, undef, undef ], 188# Plane 16 Private Use Area 189 [ 0x100000, 0x10FFFD, undef, undef ], 190); 191 192sub charinfo { 193 my $arg = shift; 194 my $code = _getcode($arg); 195 croak __PACKAGE__, "::charinfo: unknown code '$arg'" 196 unless defined $code; 197 my $hexk = sprintf("%06X", $code); 198 my($rcode,$rname,$rdec); 199 foreach my $range (@CharinfoRanges){ 200 if ($range->[0] <= $code && $code <= $range->[1]) { 201 $rcode = $hexk; 202 $rcode =~ s/^0+//; 203 $rcode = sprintf("%04X", hex($rcode)); 204 $rname = $range->[2] ? $range->[2]->($code) : ''; 205 $rdec = $range->[3] ? $range->[3]->($code) : ''; 206 $hexk = sprintf("%06X", $range->[0]); # replace by the first 207 last; 208 } 209 } 210 openunicode(\$UNICODEFH, "UnicodeData.txt"); 211 if (defined $UNICODEFH) { 212 use Search::Dict 1.02; 213 if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) { 214 my $line = <$UNICODEFH>; 215 return unless defined $line; 216 chomp $line; 217 my %prop; 218 @prop{qw( 219 code name category 220 combining bidi decomposition 221 decimal digit numeric 222 mirrored unicode10 comment 223 upper lower title 224 )} = split(/;/, $line, -1); 225 $hexk =~ s/^0+//; 226 $hexk = sprintf("%04X", hex($hexk)); 227 if ($prop{code} eq $hexk) { 228 $prop{block} = charblock($code); 229 $prop{script} = charscript($code); 230 if(defined $rname){ 231 $prop{code} = $rcode; 232 $prop{name} = $rname; 233 $prop{decomposition} = $rdec; 234 } 235 return \%prop; 236 } 237 } 238 } 239 return; 240} 241 242sub _search { # Binary search in a [[lo,hi,prop],[...],...] table. 243 my ($table, $lo, $hi, $code) = @_; 244 245 return if $lo > $hi; 246 247 my $mid = int(($lo+$hi) / 2); 248 249 if ($table->[$mid]->[0] < $code) { 250 if ($table->[$mid]->[1] >= $code) { 251 return $table->[$mid]->[2]; 252 } else { 253 _search($table, $mid + 1, $hi, $code); 254 } 255 } elsif ($table->[$mid]->[0] > $code) { 256 _search($table, $lo, $mid - 1, $code); 257 } else { 258 return $table->[$mid]->[2]; 259 } 260} 261 262sub charinrange { 263 my ($range, $arg) = @_; 264 my $code = _getcode($arg); 265 croak __PACKAGE__, "::charinrange: unknown code '$arg'" 266 unless defined $code; 267 _search($range, 0, $#$range, $code); 268} 269 270=head2 charblock 271 272 use Unicode::UCD 'charblock'; 273 274 my $charblock = charblock(0x41); 275 my $charblock = charblock(1234); 276 my $charblock = charblock("0x263a"); 277 my $charblock = charblock("U+263a"); 278 279 my $range = charblock('Armenian'); 280 281With a B<code point argument> charblock() returns the I<block> the character 282belongs to, e.g. C<Basic Latin>. Note that not all the character 283positions within all blocks are defined. 284 285See also L</Blocks versus Scripts>. 286 287If supplied with an argument that can't be a code point, charblock() tries 288to do the opposite and interpret the argument as a character block. The 289return value is a I<range>: an anonymous list of lists that contain 290I<start-of-range>, I<end-of-range> code point pairs. You can test whether a 291code point is in a range using the L</charinrange> function. If the 292argument is not a known charater block, C<undef> is returned. 293 294=cut 295 296my @BLOCKS; 297my %BLOCKS; 298 299sub _charblocks { 300 unless (@BLOCKS) { 301 if (openunicode(\$BLOCKSFH, "Blocks.txt")) { 302 local $_; 303 while (<$BLOCKSFH>) { 304 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) { 305 my ($lo, $hi) = (hex($1), hex($2)); 306 my $subrange = [ $lo, $hi, $3 ]; 307 push @BLOCKS, $subrange; 308 push @{$BLOCKS{$3}}, $subrange; 309 } 310 } 311 close($BLOCKSFH); 312 } 313 } 314} 315 316sub charblock { 317 my $arg = shift; 318 319 _charblocks() unless @BLOCKS; 320 321 my $code = _getcode($arg); 322 323 if (defined $code) { 324 _search(\@BLOCKS, 0, $#BLOCKS, $code); 325 } else { 326 if (exists $BLOCKS{$arg}) { 327 return dclone $BLOCKS{$arg}; 328 } else { 329 return; 330 } 331 } 332} 333 334=head2 charscript 335 336 use Unicode::UCD 'charscript'; 337 338 my $charscript = charscript(0x41); 339 my $charscript = charscript(1234); 340 my $charscript = charscript("U+263a"); 341 342 my $range = charscript('Thai'); 343 344With a B<code point argument> charscript() returns the I<script> the 345character belongs to, e.g. C<Latin>, C<Greek>, C<Han>. 346 347See also L</Blocks versus Scripts>. 348 349If supplied with an argument that can't be a code point, charscript() tries 350to do the opposite and interpret the argument as a character script. The 351return value is a I<range>: an anonymous list of lists that contain 352I<start-of-range>, I<end-of-range> code point pairs. You can test whether a 353code point is in a range using the L</charinrange> function. If the 354argument is not a known charater script, C<undef> is returned. 355 356=cut 357 358my @SCRIPTS; 359my %SCRIPTS; 360 361sub _charscripts { 362 unless (@SCRIPTS) { 363 if (openunicode(\$SCRIPTSFH, "Scripts.txt")) { 364 local $_; 365 while (<$SCRIPTSFH>) { 366 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) { 367 my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1)); 368 my $script = lc($3); 369 $script =~ s/\b(\w)/uc($1)/ge; 370 my $subrange = [ $lo, $hi, $script ]; 371 push @SCRIPTS, $subrange; 372 push @{$SCRIPTS{$script}}, $subrange; 373 } 374 } 375 close($SCRIPTSFH); 376 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS; 377 } 378 } 379} 380 381sub charscript { 382 my $arg = shift; 383 384 _charscripts() unless @SCRIPTS; 385 386 my $code = _getcode($arg); 387 388 if (defined $code) { 389 _search(\@SCRIPTS, 0, $#SCRIPTS, $code); 390 } else { 391 if (exists $SCRIPTS{$arg}) { 392 return dclone $SCRIPTS{$arg}; 393 } else { 394 return; 395 } 396 } 397} 398 399=head2 charblocks 400 401 use Unicode::UCD 'charblocks'; 402 403 my $charblocks = charblocks(); 404 405charblocks() returns a reference to a hash with the known block names 406as the keys, and the code point ranges (see L</charblock>) as the values. 407 408See also L</Blocks versus Scripts>. 409 410=cut 411 412sub charblocks { 413 _charblocks() unless %BLOCKS; 414 return dclone \%BLOCKS; 415} 416 417=head2 charscripts 418 419 use Unicode::UCD 'charscripts'; 420 421 my %charscripts = charscripts(); 422 423charscripts() returns a hash with the known script names as the keys, 424and the code point ranges (see L</charscript>) as the values. 425 426See also L</Blocks versus Scripts>. 427 428=cut 429 430sub charscripts { 431 _charscripts() unless %SCRIPTS; 432 return dclone \%SCRIPTS; 433} 434 435=head2 Blocks versus Scripts 436 437The difference between a block and a script is that scripts are closer 438to the linguistic notion of a set of characters required to present 439languages, while block is more of an artifact of the Unicode character 440numbering and separation into blocks of (mostly) 256 characters. 441 442For example the Latin B<script> is spread over several B<blocks>, such 443as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and 444C<Latin Extended-B>. On the other hand, the Latin script does not 445contain all the characters of the C<Basic Latin> block (also known as 446the ASCII): it includes only the letters, and not, for example, the digits 447or the punctuation. 448 449For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt 450 451For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/ 452 453=head2 Matching Scripts and Blocks 454 455Scripts are matched with the regular-expression construct 456C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script), 457while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches 458any of the 256 code points in the Tibetan block). 459 460=head2 Code Point Arguments 461 462A I<code point argument> is either a decimal or a hexadecimal scalar 463designating a Unicode character, or C<U+> followed by hexadecimals 464designating a Unicode character. In other words, if you want a code 465point to be interpreted as a hexadecimal number, you must prefix it 466with either C<0x> or C<U+>, because a string like e.g. C<123> will 467be interpreted as a decimal code point. Also note that Unicode is 468B<not> limited to 16 bits (the number of Unicode characters is 469open-ended, in theory unlimited): you may have more than 4 hexdigits. 470 471=head2 charinrange 472 473In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you 474can also test whether a code point is in the I<range> as returned by 475L</charblock> and L</charscript> or as the values of the hash returned 476by L</charblocks> and L</charscripts> by using charinrange(): 477 478 use Unicode::UCD qw(charscript charinrange); 479 480 $range = charscript('Hiragana'); 481 print "looks like hiragana\n" if charinrange($range, $codepoint); 482 483=cut 484 485=head2 compexcl 486 487 use Unicode::UCD 'compexcl'; 488 489 my $compexcl = compexcl("09dc"); 490 491The compexcl() returns the composition exclusion (that is, if the 492character should not be produced during a precomposition) of the 493character specified by a B<code point argument>. 494 495If there is a composition exclusion for the character, true is 496returned. Otherwise, false is returned. 497 498=cut 499 500my %COMPEXCL; 501 502sub _compexcl { 503 unless (%COMPEXCL) { 504 if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) { 505 local $_; 506 while (<$COMPEXCLFH>) { 507 if (/^([0-9A-F]+)\s+\#\s+/) { 508 my $code = hex($1); 509 $COMPEXCL{$code} = undef; 510 } 511 } 512 close($COMPEXCLFH); 513 } 514 } 515} 516 517sub compexcl { 518 my $arg = shift; 519 my $code = _getcode($arg); 520 croak __PACKAGE__, "::compexcl: unknown code '$arg'" 521 unless defined $code; 522 523 _compexcl() unless %COMPEXCL; 524 525 return exists $COMPEXCL{$code}; 526} 527 528=head2 casefold 529 530 use Unicode::UCD 'casefold'; 531 532 my $casefold = casefold("00DF"); 533 534The casefold() returns the locale-independent case folding of the 535character specified by a B<code point argument>. 536 537If there is a case folding for that character, a reference to a hash 538with the following fields is returned: 539 540 key 541 542 code code point with at least four hexdigits 543 status "C", "F", "S", or "I" 544 mapping one or more codes separated by spaces 545 546The meaning of the I<status> is as follows: 547 548 C common case folding, common mappings shared 549 by both simple and full mappings 550 F full case folding, mappings that cause strings 551 to grow in length. Multiple characters are separated 552 by spaces 553 S simple case folding, mappings to single characters 554 where different from F 555 I special case for dotted uppercase I and 556 dotless lowercase i 557 - If this mapping is included, the result is 558 case-insensitive, but dotless and dotted I's 559 are not distinguished 560 - If this mapping is excluded, the result is not 561 fully case-insensitive, but dotless and dotted 562 I's are distinguished 563 564If there is no case folding for that character, C<undef> is returned. 565 566For more information about case mappings see 567http://www.unicode.org/unicode/reports/tr21/ 568 569=cut 570 571my %CASEFOLD; 572 573sub _casefold { 574 unless (%CASEFOLD) { 575 if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) { 576 local $_; 577 while (<$CASEFOLDFH>) { 578 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) { 579 my $code = hex($1); 580 $CASEFOLD{$code} = { code => $1, 581 status => $2, 582 mapping => $3 }; 583 } 584 } 585 close($CASEFOLDFH); 586 } 587 } 588} 589 590sub casefold { 591 my $arg = shift; 592 my $code = _getcode($arg); 593 croak __PACKAGE__, "::casefold: unknown code '$arg'" 594 unless defined $code; 595 596 _casefold() unless %CASEFOLD; 597 598 return $CASEFOLD{$code}; 599} 600 601=head2 casespec 602 603 use Unicode::UCD 'casespec'; 604 605 my $casespec = casespec("FB00"); 606 607The casespec() returns the potentially locale-dependent case mapping 608of the character specified by a B<code point argument>. The mapping 609may change the length of the string (which the basic Unicode case 610mappings as returned by charinfo() never do). 611 612If there is a case folding for that character, a reference to a hash 613with the following fields is returned: 614 615 key 616 617 code code point with at least four hexdigits 618 lower lowercase 619 title titlecase 620 upper uppercase 621 condition condition list (may be undef) 622 623The C<condition> is optional. Where present, it consists of one or 624more I<locales> or I<contexts>, separated by spaces (other than as 625used to separate elements, spaces are to be ignored). A condition 626list overrides the normal behavior if all of the listed conditions are 627true. Case distinctions in the condition list are not significant. 628Conditions preceded by "NON_" represent the negation of the condition. 629 630Note that when there are multiple case folding definitions for a 631single code point because of different locales, the value returned by 632casespec() is a hash reference which has the locales as the keys and 633hash references as described above as the values. 634 635A I<locale> is defined as a 2-letter ISO 3166 country code, possibly 636followed by a "_" and a 2-letter ISO language code (possibly followed 637by a "_" and a variant code). You can find the lists of those codes, 638see L<Locale::Country> and L<Locale::Language>. 639 640A I<context> is one of the following choices: 641 642 FINAL The letter is not followed by a letter of 643 general category L (e.g. Ll, Lt, Lu, Lm, or Lo) 644 MODERN The mapping is only used for modern text 645 AFTER_i The last base character was "i" (U+0069) 646 647For more information about case mappings see 648http://www.unicode.org/unicode/reports/tr21/ 649 650=cut 651 652my %CASESPEC; 653 654sub _casespec { 655 unless (%CASESPEC) { 656 if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) { 657 local $_; 658 while (<$CASESPECFH>) { 659 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) { 660 my ($hexcode, $lower, $title, $upper, $condition) = 661 ($1, $2, $3, $4, $5); 662 my $code = hex($hexcode); 663 if (exists $CASESPEC{$code}) { 664 if (exists $CASESPEC{$code}->{code}) { 665 my ($oldlower, 666 $oldtitle, 667 $oldupper, 668 $oldcondition) = 669 @{$CASESPEC{$code}}{qw(lower 670 title 671 upper 672 condition)}; 673 if (defined $oldcondition) { 674 my ($oldlocale) = 675 ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/); 676 delete $CASESPEC{$code}; 677 $CASESPEC{$code}->{$oldlocale} = 678 { code => $hexcode, 679 lower => $oldlower, 680 title => $oldtitle, 681 upper => $oldupper, 682 condition => $oldcondition }; 683 } 684 } 685 my ($locale) = 686 ($condition =~ /^([a-z][a-z](?:_\S+)?)/); 687 $CASESPEC{$code}->{$locale} = 688 { code => $hexcode, 689 lower => $lower, 690 title => $title, 691 upper => $upper, 692 condition => $condition }; 693 } else { 694 $CASESPEC{$code} = 695 { code => $hexcode, 696 lower => $lower, 697 title => $title, 698 upper => $upper, 699 condition => $condition }; 700 } 701 } 702 } 703 close($CASESPECFH); 704 } 705 } 706} 707 708sub casespec { 709 my $arg = shift; 710 my $code = _getcode($arg); 711 croak __PACKAGE__, "::casespec: unknown code '$arg'" 712 unless defined $code; 713 714 _casespec() unless %CASESPEC; 715 716 return ref $CASESPEC{$code} ? dclone $CASESPEC{$code} : $CASESPEC{$code}; 717} 718 719=head2 Unicode::UCD::UnicodeVersion 720 721Unicode::UCD::UnicodeVersion() returns the version of the Unicode 722Character Database, in other words, the version of the Unicode 723standard the database implements. The version is a string 724of numbers delimited by dots (C<'.'>). 725 726=cut 727 728my $UNICODEVERSION; 729 730sub UnicodeVersion { 731 unless (defined $UNICODEVERSION) { 732 openunicode(\$VERSIONFH, "version"); 733 chomp($UNICODEVERSION = <$VERSIONFH>); 734 close($VERSIONFH); 735 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'" 736 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/; 737 } 738 return $UNICODEVERSION; 739} 740 741=head2 Implementation Note 742 743The first use of charinfo() opens a read-only filehandle to the Unicode 744Character Database (the database is included in the Perl distribution). 745The filehandle is then kept open for further queries. In other words, 746if you are wondering where one of your filehandles went, that's where. 747 748=head1 BUGS 749 750Does not yet support EBCDIC platforms. 751 752=head1 AUTHOR 753 754Jarkko Hietaniemi 755 756=cut 757 7581; 759