1package Unicode::UCD; 2 3use strict; 4use warnings; 5 6our $VERSION = '0.21'; 7 8use Storable qw(dclone); 9 10require Exporter; 11 12our @ISA = qw(Exporter); 13 14our @EXPORT_OK = qw(charinfo 15 charblock charscript 16 charblocks charscripts 17 charinrange 18 compexcl 19 casefold casespec); 20 21use Carp; 22 23=head1 NAME 24 25Unicode::UCD - Unicode character database 26 27=head1 SYNOPSIS 28 29 use Unicode::UCD 'charinfo'; 30 my $charinfo = charinfo($codepoint); 31 32 use Unicode::UCD 'charblock'; 33 my $charblock = charblock($codepoint); 34 35 use Unicode::UCD 'charscript'; 36 my $charscript = charscript($codepoint); 37 38 use Unicode::UCD 'charblocks'; 39 my $charblocks = charblocks(); 40 41 use Unicode::UCD 'charscripts'; 42 my %charscripts = charscripts(); 43 44 use Unicode::UCD qw(charscript charinrange); 45 my $range = charscript($script); 46 print "looks like $script\n" if charinrange($range, $codepoint); 47 48 use Unicode::UCD 'compexcl'; 49 my $compexcl = compexcl($codepoint); 50 51 my $unicode_version = Unicode::UCD::UnicodeVersion(); 52 53=head1 DESCRIPTION 54 55The Unicode::UCD module offers a simple interface to the Unicode 56Character Database. 57 58=cut 59 60my $UNICODEFH; 61my $BLOCKSFH; 62my $SCRIPTSFH; 63my $VERSIONFH; 64my $COMPEXCLFH; 65my $CASEFOLDFH; 66my $CASESPECFH; 67 68sub openunicode { 69 my ($rfh, @path) = @_; 70 my $f; 71 unless (defined $$rfh) { 72 for my $d (@INC) { 73 use File::Spec; 74 $f = File::Spec->catfile($d, "unicore", @path); 75 last if open($$rfh, $f); 76 undef $f; 77 } 78 croak __PACKAGE__, ": failed to find ", 79 File::Spec->catfile(@path), " in @INC" 80 unless defined $f; 81 } 82 return $f; 83} 84 85=head2 charinfo 86 87 use Unicode::UCD 'charinfo'; 88 89 my $charinfo = charinfo(0x41); 90 91charinfo() returns a reference to a hash that has the following fields 92as defined by the Unicode standard: 93 94 key 95 96 code code point with at least four hexdigits 97 name name of the character IN UPPER CASE 98 category general category of the character 99 combining classes used in the Canonical Ordering Algorithm 100 bidi bidirectional category 101 decomposition character decomposition mapping 102 decimal if decimal digit this is the integer numeric value 103 digit if digit this is the numeric value 104 numeric if numeric is the integer or rational numeric value 105 mirrored if mirrored in bidirectional text 106 unicode10 Unicode 1.0 name if existed and different 107 comment ISO 10646 comment field 108 upper uppercase equivalent mapping 109 lower lowercase equivalent mapping 110 title titlecase equivalent mapping 111 112 block block the character belongs to (used in \p{In...}) 113 script script the character belongs to 114 115If no match is found, a reference to an empty hash is returned. 116 117The C<block> property is the same as returned by charinfo(). It is 118not defined in the Unicode Character Database proper (Chapter 4 of the 119Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database 120(Chapter 14 of TUS3). Similarly for the C<script> property. 121 122Note that you cannot do (de)composition and casing based solely on the 123above C<decomposition> and C<lower>, C<upper>, C<title>, properties, 124you will need also the compexcl(), casefold(), and casespec() functions. 125 126=cut 127 128sub _getcode { 129 my $arg = shift; 130 131 if ($arg =~ /^[1-9]\d*$/) { 132 return $arg; 133 } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) { 134 return hex($1); 135 } 136 137 return; 138} 139 140# Lingua::KO::Hangul::Util not part of the standard distribution 141# but it will be used if available. 142 143eval { require Lingua::KO::Hangul::Util }; 144my $hasHangulUtil = ! $@; 145if ($hasHangulUtil) { 146 Lingua::KO::Hangul::Util->import(); 147} 148 149sub hangul_decomp { # internal: called from charinfo 150 if ($hasHangulUtil) { 151 my @tmp = decomposeHangul(shift); 152 return sprintf("%04X %04X", @tmp) if @tmp == 2; 153 return sprintf("%04X %04X %04X", @tmp) if @tmp == 3; 154 } 155 return; 156} 157 158sub hangul_charname { # internal: called from charinfo 159 return sprintf("HANGUL SYLLABLE-%04X", shift); 160} 161 162sub han_charname { # internal: called from charinfo 163 return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift); 164} 165 166my @CharinfoRanges = ( 167# block name 168# [ first, last, coderef to name, coderef to decompose ], 169# CJK Ideographs Extension A 170 [ 0x3400, 0x4DB5, \&han_charname, undef ], 171# CJK Ideographs 172 [ 0x4E00, 0x9FA5, \&han_charname, undef ], 173# Hangul Syllables 174 [ 0xAC00, 0xD7A3, $hasHangulUtil ? \&getHangulName : \&hangul_charname, \&hangul_decomp ], 175# Non-Private Use High Surrogates 176 [ 0xD800, 0xDB7F, undef, undef ], 177# Private Use High Surrogates 178 [ 0xDB80, 0xDBFF, undef, undef ], 179# Low Surrogates 180 [ 0xDC00, 0xDFFF, undef, undef ], 181# The Private Use Area 182 [ 0xE000, 0xF8FF, undef, undef ], 183# CJK Ideographs Extension B 184 [ 0x20000, 0x2A6D6, \&han_charname, undef ], 185# Plane 15 Private Use Area 186 [ 0xF0000, 0xFFFFD, undef, undef ], 187# Plane 16 Private Use Area 188 [ 0x100000, 0x10FFFD, undef, undef ], 189); 190 191sub charinfo { 192 my $arg = shift; 193 my $code = _getcode($arg); 194 croak __PACKAGE__, "::charinfo: unknown code '$arg'" 195 unless defined $code; 196 my $hexk = sprintf("%06X", $code); 197 my($rcode,$rname,$rdec); 198 foreach my $range (@CharinfoRanges){ 199 if ($range->[0] <= $code && $code <= $range->[1]) { 200 $rcode = $hexk; 201 $rcode =~ s/^0+//; 202 $rcode = sprintf("%04X", hex($rcode)); 203 $rname = $range->[2] ? $range->[2]->($code) : ''; 204 $rdec = $range->[3] ? $range->[3]->($code) : ''; 205 $hexk = sprintf("%06X", $range->[0]); # replace by the first 206 last; 207 } 208 } 209 openunicode(\$UNICODEFH, "UnicodeData.txt"); 210 if (defined $UNICODEFH) { 211 use Search::Dict 1.02; 212 if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) { 213 my $line = <$UNICODEFH>; 214 return unless defined $line; 215 chomp $line; 216 my %prop; 217 @prop{qw( 218 code name category 219 combining bidi decomposition 220 decimal digit numeric 221 mirrored unicode10 comment 222 upper lower title 223 )} = split(/;/, $line, -1); 224 $hexk =~ s/^0+//; 225 $hexk = sprintf("%04X", hex($hexk)); 226 if ($prop{code} eq $hexk) { 227 $prop{block} = charblock($code); 228 $prop{script} = charscript($code); 229 if(defined $rname){ 230 $prop{code} = $rcode; 231 $prop{name} = $rname; 232 $prop{decomposition} = $rdec; 233 } 234 return \%prop; 235 } 236 } 237 } 238 return; 239} 240 241sub _search { # Binary search in a [[lo,hi,prop],[...],...] table. 242 my ($table, $lo, $hi, $code) = @_; 243 244 return if $lo > $hi; 245 246 my $mid = int(($lo+$hi) / 2); 247 248 if ($table->[$mid]->[0] < $code) { 249 if ($table->[$mid]->[1] >= $code) { 250 return $table->[$mid]->[2]; 251 } else { 252 _search($table, $mid + 1, $hi, $code); 253 } 254 } elsif ($table->[$mid]->[0] > $code) { 255 _search($table, $lo, $mid - 1, $code); 256 } else { 257 return $table->[$mid]->[2]; 258 } 259} 260 261sub charinrange { 262 my ($range, $arg) = @_; 263 my $code = _getcode($arg); 264 croak __PACKAGE__, "::charinrange: unknown code '$arg'" 265 unless defined $code; 266 _search($range, 0, $#$range, $code); 267} 268 269=head2 charblock 270 271 use Unicode::UCD 'charblock'; 272 273 my $charblock = charblock(0x41); 274 my $charblock = charblock(1234); 275 my $charblock = charblock("0x263a"); 276 my $charblock = charblock("U+263a"); 277 278 my $range = charblock('Armenian'); 279 280With a B<code point argument> charblock() returns the I<block> the character 281belongs to, e.g. C<Basic Latin>. Note that not all the character 282positions within all blocks are defined. 283 284See also L</Blocks versus Scripts>. 285 286If supplied with an argument that can't be a code point, charblock() tries 287to do the opposite and interpret the argument as a character block. The 288return value is a I<range>: an anonymous list of lists that contain 289I<start-of-range>, I<end-of-range> code point pairs. You can test whether a 290code point is in a range using the L</charinrange> function. If the 291argument is not a known charater block, C<undef> is returned. 292 293=cut 294 295my @BLOCKS; 296my %BLOCKS; 297 298sub _charblocks { 299 unless (@BLOCKS) { 300 if (openunicode(\$BLOCKSFH, "Blocks.txt")) { 301 local $_; 302 while (<$BLOCKSFH>) { 303 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) { 304 my ($lo, $hi) = (hex($1), hex($2)); 305 my $subrange = [ $lo, $hi, $3 ]; 306 push @BLOCKS, $subrange; 307 push @{$BLOCKS{$3}}, $subrange; 308 } 309 } 310 close($BLOCKSFH); 311 } 312 } 313} 314 315sub charblock { 316 my $arg = shift; 317 318 _charblocks() unless @BLOCKS; 319 320 my $code = _getcode($arg); 321 322 if (defined $code) { 323 _search(\@BLOCKS, 0, $#BLOCKS, $code); 324 } else { 325 if (exists $BLOCKS{$arg}) { 326 return dclone $BLOCKS{$arg}; 327 } else { 328 return; 329 } 330 } 331} 332 333=head2 charscript 334 335 use Unicode::UCD 'charscript'; 336 337 my $charscript = charscript(0x41); 338 my $charscript = charscript(1234); 339 my $charscript = charscript("U+263a"); 340 341 my $range = charscript('Thai'); 342 343With a B<code point argument> charscript() returns the I<script> the 344character belongs to, e.g. C<Latin>, C<Greek>, C<Han>. 345 346See also L</Blocks versus Scripts>. 347 348If supplied with an argument that can't be a code point, charscript() tries 349to do the opposite and interpret the argument as a character script. The 350return value is a I<range>: an anonymous list of lists that contain 351I<start-of-range>, I<end-of-range> code point pairs. You can test whether a 352code point is in a range using the L</charinrange> function. If the 353argument is not a known charater script, C<undef> is returned. 354 355=cut 356 357my @SCRIPTS; 358my %SCRIPTS; 359 360sub _charscripts { 361 unless (@SCRIPTS) { 362 if (openunicode(\$SCRIPTSFH, "Scripts.txt")) { 363 local $_; 364 while (<$SCRIPTSFH>) { 365 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) { 366 my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1)); 367 my $script = lc($3); 368 $script =~ s/\b(\w)/uc($1)/ge; 369 my $subrange = [ $lo, $hi, $script ]; 370 push @SCRIPTS, $subrange; 371 push @{$SCRIPTS{$script}}, $subrange; 372 } 373 } 374 close($SCRIPTSFH); 375 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS; 376 } 377 } 378} 379 380sub charscript { 381 my $arg = shift; 382 383 _charscripts() unless @SCRIPTS; 384 385 my $code = _getcode($arg); 386 387 if (defined $code) { 388 _search(\@SCRIPTS, 0, $#SCRIPTS, $code); 389 } else { 390 if (exists $SCRIPTS{$arg}) { 391 return dclone $SCRIPTS{$arg}; 392 } else { 393 return; 394 } 395 } 396} 397 398=head2 charblocks 399 400 use Unicode::UCD 'charblocks'; 401 402 my $charblocks = charblocks(); 403 404charblocks() returns a reference to a hash with the known block names 405as the keys, and the code point ranges (see L</charblock>) as the values. 406 407See also L</Blocks versus Scripts>. 408 409=cut 410 411sub charblocks { 412 _charblocks() unless %BLOCKS; 413 return dclone \%BLOCKS; 414} 415 416=head2 charscripts 417 418 use Unicode::UCD 'charscripts'; 419 420 my %charscripts = charscripts(); 421 422charscripts() returns a hash with the known script names as the keys, 423and the code point ranges (see L</charscript>) as the values. 424 425See also L</Blocks versus Scripts>. 426 427=cut 428 429sub charscripts { 430 _charscripts() unless %SCRIPTS; 431 return dclone \%SCRIPTS; 432} 433 434=head2 Blocks versus Scripts 435 436The difference between a block and a script is that scripts are closer 437to the linguistic notion of a set of characters required to present 438languages, while block is more of an artifact of the Unicode character 439numbering and separation into blocks of (mostly) 256 characters. 440 441For example the Latin B<script> is spread over several B<blocks>, such 442as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and 443C<Latin Extended-B>. On the other hand, the Latin script does not 444contain all the characters of the C<Basic Latin> block (also known as 445the ASCII): it includes only the letters, and not, for example, the digits 446or the punctuation. 447 448For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt 449 450For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/ 451 452=head2 Matching Scripts and Blocks 453 454Scripts are matched with the regular-expression construct 455C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script), 456while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches 457any of the 256 code points in the Tibetan block). 458 459=head2 Code Point Arguments 460 461A I<code point argument> is either a decimal or a hexadecimal scalar 462designating a Unicode character, or C<U+> followed by hexadecimals 463designating a Unicode character. In other words, if you want a code 464point to be interpreted as a hexadecimal number, you must prefix it 465with either C<0x> or C<U+>, because a string like e.g. C<123> will 466be interpreted as a decimal code point. Also note that Unicode is 467B<not> limited to 16 bits (the number of Unicode characters is 468open-ended, in theory unlimited): you may have more than 4 hexdigits. 469 470=head2 charinrange 471 472In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you 473can also test whether a code point is in the I<range> as returned by 474L</charblock> and L</charscript> or as the values of the hash returned 475by L</charblocks> and L</charscripts> by using charinrange(): 476 477 use Unicode::UCD qw(charscript charinrange); 478 479 $range = charscript('Hiragana'); 480 print "looks like hiragana\n" if charinrange($range, $codepoint); 481 482=cut 483 484=head2 compexcl 485 486 use Unicode::UCD 'compexcl'; 487 488 my $compexcl = compexcl("09dc"); 489 490The compexcl() returns the composition exclusion (that is, if the 491character should not be produced during a precomposition) of the 492character specified by a B<code point argument>. 493 494If there is a composition exclusion for the character, true is 495returned. Otherwise, false is returned. 496 497=cut 498 499my %COMPEXCL; 500 501sub _compexcl { 502 unless (%COMPEXCL) { 503 if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) { 504 local $_; 505 while (<$COMPEXCLFH>) { 506 if (/^([0-9A-F]+)\s+\#\s+/) { 507 my $code = hex($1); 508 $COMPEXCL{$code} = undef; 509 } 510 } 511 close($COMPEXCLFH); 512 } 513 } 514} 515 516sub compexcl { 517 my $arg = shift; 518 my $code = _getcode($arg); 519 croak __PACKAGE__, "::compexcl: unknown code '$arg'" 520 unless defined $code; 521 522 _compexcl() unless %COMPEXCL; 523 524 return exists $COMPEXCL{$code}; 525} 526 527=head2 casefold 528 529 use Unicode::UCD 'casefold'; 530 531 my $casefold = casefold("00DF"); 532 533The casefold() returns the locale-independent case folding of the 534character specified by a B<code point argument>. 535 536If there is a case folding for that character, a reference to a hash 537with the following fields is returned: 538 539 key 540 541 code code point with at least four hexdigits 542 status "C", "F", "S", or "I" 543 mapping one or more codes separated by spaces 544 545The meaning of the I<status> is as follows: 546 547 C common case folding, common mappings shared 548 by both simple and full mappings 549 F full case folding, mappings that cause strings 550 to grow in length. Multiple characters are separated 551 by spaces 552 S simple case folding, mappings to single characters 553 where different from F 554 I special case for dotted uppercase I and 555 dotless lowercase i 556 - If this mapping is included, the result is 557 case-insensitive, but dotless and dotted I's 558 are not distinguished 559 - If this mapping is excluded, the result is not 560 fully case-insensitive, but dotless and dotted 561 I's are distinguished 562 563If there is no case folding for that character, C<undef> is returned. 564 565For more information about case mappings see 566http://www.unicode.org/unicode/reports/tr21/ 567 568=cut 569 570my %CASEFOLD; 571 572sub _casefold { 573 unless (%CASEFOLD) { 574 if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) { 575 local $_; 576 while (<$CASEFOLDFH>) { 577 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) { 578 my $code = hex($1); 579 $CASEFOLD{$code} = { code => $1, 580 status => $2, 581 mapping => $3 }; 582 } 583 } 584 close($CASEFOLDFH); 585 } 586 } 587} 588 589sub casefold { 590 my $arg = shift; 591 my $code = _getcode($arg); 592 croak __PACKAGE__, "::casefold: unknown code '$arg'" 593 unless defined $code; 594 595 _casefold() unless %CASEFOLD; 596 597 return $CASEFOLD{$code}; 598} 599 600=head2 casespec 601 602 use Unicode::UCD 'casespec'; 603 604 my $casespec = casespec("FB00"); 605 606The casespec() returns the potentially locale-dependent case mapping 607of the character specified by a B<code point argument>. The mapping 608may change the length of the string (which the basic Unicode case 609mappings as returned by charinfo() never do). 610 611If there is a case folding for that character, a reference to a hash 612with the following fields is returned: 613 614 key 615 616 code code point with at least four hexdigits 617 lower lowercase 618 title titlecase 619 upper uppercase 620 condition condition list (may be undef) 621 622The C<condition> is optional. Where present, it consists of one or 623more I<locales> or I<contexts>, separated by spaces (other than as 624used to separate elements, spaces are to be ignored). A condition 625list overrides the normal behavior if all of the listed conditions are 626true. Case distinctions in the condition list are not significant. 627Conditions preceded by "NON_" represent the negation of the condition. 628 629Note that when there are multiple case folding definitions for a 630single code point because of different locales, the value returned by 631casespec() is a hash reference which has the locales as the keys and 632hash references as described above as the values. 633 634A I<locale> is defined as a 2-letter ISO 3166 country code, possibly 635followed by a "_" and a 2-letter ISO language code (possibly followed 636by a "_" and a variant code). You can find the lists of those codes, 637see L<Locale::Country> and L<Locale::Language>. 638 639A I<context> is one of the following choices: 640 641 FINAL The letter is not followed by a letter of 642 general category L (e.g. Ll, Lt, Lu, Lm, or Lo) 643 MODERN The mapping is only used for modern text 644 AFTER_i The last base character was "i" (U+0069) 645 646For more information about case mappings see 647http://www.unicode.org/unicode/reports/tr21/ 648 649=cut 650 651my %CASESPEC; 652 653sub _casespec { 654 unless (%CASESPEC) { 655 if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) { 656 local $_; 657 while (<$CASESPECFH>) { 658 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) { 659 my ($hexcode, $lower, $title, $upper, $condition) = 660 ($1, $2, $3, $4, $5); 661 my $code = hex($hexcode); 662 if (exists $CASESPEC{$code}) { 663 if (exists $CASESPEC{$code}->{code}) { 664 my ($oldlower, 665 $oldtitle, 666 $oldupper, 667 $oldcondition) = 668 @{$CASESPEC{$code}}{qw(lower 669 title 670 upper 671 condition)}; 672 if (defined $oldcondition) { 673 my ($oldlocale) = 674 ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/); 675 delete $CASESPEC{$code}; 676 $CASESPEC{$code}->{$oldlocale} = 677 { code => $hexcode, 678 lower => $oldlower, 679 title => $oldtitle, 680 upper => $oldupper, 681 condition => $oldcondition }; 682 } 683 } 684 my ($locale) = 685 ($condition =~ /^([a-z][a-z](?:_\S+)?)/); 686 $CASESPEC{$code}->{$locale} = 687 { code => $hexcode, 688 lower => $lower, 689 title => $title, 690 upper => $upper, 691 condition => $condition }; 692 } else { 693 $CASESPEC{$code} = 694 { code => $hexcode, 695 lower => $lower, 696 title => $title, 697 upper => $upper, 698 condition => $condition }; 699 } 700 } 701 } 702 close($CASESPECFH); 703 } 704 } 705} 706 707sub casespec { 708 my $arg = shift; 709 my $code = _getcode($arg); 710 croak __PACKAGE__, "::casespec: unknown code '$arg'" 711 unless defined $code; 712 713 _casespec() unless %CASESPEC; 714 715 return ref $CASESPEC{$code} ? dclone $CASESPEC{$code} : $CASESPEC{$code}; 716} 717 718=head2 Unicode::UCD::UnicodeVersion 719 720Unicode::UCD::UnicodeVersion() returns the version of the Unicode 721Character Database, in other words, the version of the Unicode 722standard the database implements. The version is a string 723of numbers delimited by dots (C<'.'>). 724 725=cut 726 727my $UNICODEVERSION; 728 729sub UnicodeVersion { 730 unless (defined $UNICODEVERSION) { 731 openunicode(\$VERSIONFH, "version"); 732 chomp($UNICODEVERSION = <$VERSIONFH>); 733 close($VERSIONFH); 734 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'" 735 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/; 736 } 737 return $UNICODEVERSION; 738} 739 740=head2 Implementation Note 741 742The first use of charinfo() opens a read-only filehandle to the Unicode 743Character Database (the database is included in the Perl distribution). 744The filehandle is then kept open for further queries. In other words, 745if you are wondering where one of your filehandles went, that's where. 746 747=head1 BUGS 748 749Does not yet support EBCDIC platforms. 750 751=head1 AUTHOR 752 753Jarkko Hietaniemi 754 755=cut 756 7571; 758