1package Unicode::UCD; 2 3use strict; 4use warnings; 5 6our $VERSION = '0.2'; 7 8require Exporter; 9 10our @ISA = qw(Exporter); 11 12our @EXPORT_OK = qw(charinfo 13 charblock charscript 14 charblocks charscripts 15 charinrange 16 compexcl 17 casefold casespec); 18 19use Carp; 20 21=head1 NAME 22 23Unicode::UCD - Unicode character database 24 25=head1 SYNOPSIS 26 27 use Unicode::UCD 'charinfo'; 28 my $charinfo = charinfo($codepoint); 29 30 use Unicode::UCD 'charblock'; 31 my $charblock = charblock($codepoint); 32 33 use Unicode::UCD 'charscript'; 34 my $charscript = charblock($codepoint); 35 36 use Unicode::UCD 'charblocks'; 37 my $charblocks = charblocks(); 38 39 use Unicode::UCD 'charscripts'; 40 my %charscripts = charscripts(); 41 42 use Unicode::UCD qw(charscript charinrange); 43 my $range = charscript($script); 44 print "looks like $script\n" if charinrange($range, $codepoint); 45 46 use Unicode::UCD 'compexcl'; 47 my $compexcl = compexcl($codepoint); 48 49 my $unicode_version = Unicode::UCD::UnicodeVersion(); 50 51=head1 DESCRIPTION 52 53The Unicode::UCD module offers a simple interface to the Unicode 54Character Database. 55 56=cut 57 58my $UNICODEFH; 59my $BLOCKSFH; 60my $SCRIPTSFH; 61my $VERSIONFH; 62my $COMPEXCLFH; 63my $CASEFOLDFH; 64my $CASESPECFH; 65 66sub openunicode { 67 my ($rfh, @path) = @_; 68 my $f; 69 unless (defined $$rfh) { 70 for my $d (@INC) { 71 use File::Spec; 72 $f = File::Spec->catfile($d, "unicore", @path); 73 last if open($$rfh, $f); 74 undef $f; 75 } 76 croak __PACKAGE__, ": failed to find ", 77 File::Spec->catfile(@path), " in @INC" 78 unless defined $f; 79 } 80 return $f; 81} 82 83=head2 charinfo 84 85 use Unicode::UCD 'charinfo'; 86 87 my $charinfo = charinfo(0x41); 88 89charinfo() returns a reference to a hash that has the following fields 90as defined by the Unicode standard: 91 92 key 93 94 code code point with at least four hexdigits 95 name name of the character IN UPPER CASE 96 category general category of the character 97 combining classes used in the Canonical Ordering Algorithm 98 bidi bidirectional category 99 decomposition character decomposition mapping 100 decimal if decimal digit this is the integer numeric value 101 digit if digit this is the numeric value 102 numeric if numeric is the integer or rational numeric value 103 mirrored if mirrored in bidirectional text 104 unicode10 Unicode 1.0 name if existed and different 105 comment ISO 10646 comment field 106 upper uppercase equivalent mapping 107 lower lowercase equivalent mapping 108 title titlecase equivalent mapping 109 110 block block the character belongs to (used in \p{In...}) 111 script script the character belongs to 112 113If no match is found, a reference to an empty hash is returned. 114 115The C<block> property is the same as returned by charinfo(). It is 116not defined in the Unicode Character Database proper (Chapter 4 of the 117Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database 118(Chapter 14 of TUS3). Similarly for the C<script> property. 119 120Note that you cannot do (de)composition and casing based solely on the 121above C<decomposition> and C<lower>, C<upper>, C<title>, properties, 122you will need also the compexcl(), casefold(), and casespec() functions. 123 124=cut 125 126sub _getcode { 127 my $arg = shift; 128 129 if ($arg =~ /^\d+$/) { 130 return $arg; 131 } elsif ($arg =~ /^(?:U\+|0x)?([[:xdigit:]]+)$/) { 132 return hex($1); 133 } 134 135 return; 136} 137 138# Lingua::KO::Hangul::Util not part of the standard distribution 139# but it will be used if available. 140 141eval { require Lingua::KO::Hangul::Util }; 142my $hasHangulUtil = ! $@; 143if ($hasHangulUtil) { 144 Lingua::KO::Hangul::Util->import(); 145} 146 147sub hangul_decomp { # internal: called from charinfo 148 if ($hasHangulUtil) { 149 my @tmp = decomposeHangul(shift); 150 return sprintf("%04X %04X", @tmp) if @tmp == 2; 151 return sprintf("%04X %04X %04X", @tmp) if @tmp == 3; 152 } 153 return; 154} 155 156sub hangul_charname { # internal: called from charinfo 157 return sprintf("HANGUL SYLLABLE-%04X", shift); 158} 159 160sub han_charname { # internal: called from charinfo 161 return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift); 162} 163 164my @CharinfoRanges = ( 165# block name 166# [ first, last, coderef to name, coderef to decompose ], 167# CJK Ideographs Extension A 168 [ 0x3400, 0x4DB5, \&han_charname, undef ], 169# CJK Ideographs 170 [ 0x4E00, 0x9FA5, \&han_charname, undef ], 171# Hangul Syllables 172 [ 0xAC00, 0xD7A3, $hasHangulUtil ? \&getHangulName : \&hangul_charname, \&hangul_decomp ], 173# Non-Private Use High Surrogates 174 [ 0xD800, 0xDB7F, undef, undef ], 175# Private Use High Surrogates 176 [ 0xDB80, 0xDBFF, undef, undef ], 177# Low Surrogates 178 [ 0xDC00, 0xDFFF, undef, undef ], 179# The Private Use Area 180 [ 0xE000, 0xF8FF, undef, undef ], 181# CJK Ideographs Extension B 182 [ 0x20000, 0x2A6D6, \&han_charname, undef ], 183# Plane 15 Private Use Area 184 [ 0xF0000, 0xFFFFD, undef, undef ], 185# Plane 16 Private Use Area 186 [ 0x100000, 0x10FFFD, undef, undef ], 187); 188 189sub charinfo { 190 my $arg = shift; 191 my $code = _getcode($arg); 192 croak __PACKAGE__, "::charinfo: unknown code '$arg'" 193 unless defined $code; 194 my $hexk = sprintf("%06X", $code); 195 my($rcode,$rname,$rdec); 196 foreach my $range (@CharinfoRanges){ 197 if ($range->[0] <= $code && $code <= $range->[1]) { 198 $rcode = $hexk; 199 $rcode =~ s/^0+//; 200 $rcode = sprintf("%04X", hex($rcode)); 201 $rname = $range->[2] ? $range->[2]->($code) : ''; 202 $rdec = $range->[3] ? $range->[3]->($code) : ''; 203 $hexk = sprintf("%06X", $range->[0]); # replace by the first 204 last; 205 } 206 } 207 openunicode(\$UNICODEFH, "UnicodeData.txt"); 208 if (defined $UNICODEFH) { 209 use Search::Dict 1.02; 210 if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) { 211 my $line = <$UNICODEFH>; 212 chomp $line; 213 my %prop; 214 @prop{qw( 215 code name category 216 combining bidi decomposition 217 decimal digit numeric 218 mirrored unicode10 comment 219 upper lower title 220 )} = split(/;/, $line, -1); 221 $hexk =~ s/^0+//; 222 $hexk = sprintf("%04X", hex($hexk)); 223 if ($prop{code} eq $hexk) { 224 $prop{block} = charblock($code); 225 $prop{script} = charscript($code); 226 if(defined $rname){ 227 $prop{code} = $rcode; 228 $prop{name} = $rname; 229 $prop{decomposition} = $rdec; 230 } 231 return \%prop; 232 } 233 } 234 } 235 return; 236} 237 238sub _search { # Binary search in a [[lo,hi,prop],[...],...] table. 239 my ($table, $lo, $hi, $code) = @_; 240 241 return if $lo > $hi; 242 243 my $mid = int(($lo+$hi) / 2); 244 245 if ($table->[$mid]->[0] < $code) { 246 if ($table->[$mid]->[1] >= $code) { 247 return $table->[$mid]->[2]; 248 } else { 249 _search($table, $mid + 1, $hi, $code); 250 } 251 } elsif ($table->[$mid]->[0] > $code) { 252 _search($table, $lo, $mid - 1, $code); 253 } else { 254 return $table->[$mid]->[2]; 255 } 256} 257 258sub charinrange { 259 my ($range, $arg) = @_; 260 my $code = _getcode($arg); 261 croak __PACKAGE__, "::charinrange: unknown code '$arg'" 262 unless defined $code; 263 _search($range, 0, $#$range, $code); 264} 265 266=head2 charblock 267 268 use Unicode::UCD 'charblock'; 269 270 my $charblock = charblock(0x41); 271 my $charblock = charblock(1234); 272 my $charblock = charblock("0x263a"); 273 my $charblock = charblock("U+263a"); 274 275 my $range = charblock('Armenian'); 276 277With a B<code point argument> charblock() returns the I<block> the character 278belongs to, e.g. C<Basic Latin>. Note that not all the character 279positions within all blocks are defined. 280 281See also L</Blocks versus Scripts>. 282 283If supplied with an argument that can't be a code point, charblock() tries 284to do the opposite and interpret the argument as a character block. The 285return value is a I<range>: an anonymous list of lists that contain 286I<start-of-range>, I<end-of-range> code point pairs. You can test whether a 287code point is in a range using the L</charinrange> function. If the 288argument is not a known charater block, C<undef> is returned. 289 290=cut 291 292my @BLOCKS; 293my %BLOCKS; 294 295sub _charblocks { 296 unless (@BLOCKS) { 297 if (openunicode(\$BLOCKSFH, "Blocks.txt")) { 298 while (<$BLOCKSFH>) { 299 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) { 300 my ($lo, $hi) = (hex($1), hex($2)); 301 my $subrange = [ $lo, $hi, $3 ]; 302 push @BLOCKS, $subrange; 303 push @{$BLOCKS{$3}}, $subrange; 304 } 305 } 306 close($BLOCKSFH); 307 } 308 } 309} 310 311sub charblock { 312 my $arg = shift; 313 314 _charblocks() unless @BLOCKS; 315 316 my $code = _getcode($arg); 317 318 if (defined $code) { 319 _search(\@BLOCKS, 0, $#BLOCKS, $code); 320 } else { 321 if (exists $BLOCKS{$arg}) { 322 return $BLOCKS{$arg}; 323 } else { 324 return; 325 } 326 } 327} 328 329=head2 charscript 330 331 use Unicode::UCD 'charscript'; 332 333 my $charscript = charscript(0x41); 334 my $charscript = charscript(1234); 335 my $charscript = charscript("U+263a"); 336 337 my $range = charscript('Thai'); 338 339With a B<code point argument> charscript() returns the I<script> the 340character belongs to, e.g. C<Latin>, C<Greek>, C<Han>. 341 342See also L</Blocks versus Scripts>. 343 344If supplied with an argument that can't be a code point, charscript() tries 345to do the opposite and interpret the argument as a character script. The 346return value is a I<range>: an anonymous list of lists that contain 347I<start-of-range>, I<end-of-range> code point pairs. You can test whether a 348code point is in a range using the L</charinrange> function. If the 349argument is not a known charater script, C<undef> is returned. 350 351=cut 352 353my @SCRIPTS; 354my %SCRIPTS; 355 356sub _charscripts { 357 unless (@SCRIPTS) { 358 if (openunicode(\$SCRIPTSFH, "Scripts.txt")) { 359 while (<$SCRIPTSFH>) { 360 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) { 361 my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1)); 362 my $script = lc($3); 363 $script =~ s/\b(\w)/uc($1)/ge; 364 my $subrange = [ $lo, $hi, $script ]; 365 push @SCRIPTS, $subrange; 366 push @{$SCRIPTS{$script}}, $subrange; 367 } 368 } 369 close($SCRIPTSFH); 370 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS; 371 } 372 } 373} 374 375sub charscript { 376 my $arg = shift; 377 378 _charscripts() unless @SCRIPTS; 379 380 my $code = _getcode($arg); 381 382 if (defined $code) { 383 _search(\@SCRIPTS, 0, $#SCRIPTS, $code); 384 } else { 385 if (exists $SCRIPTS{$arg}) { 386 return $SCRIPTS{$arg}; 387 } else { 388 return; 389 } 390 } 391} 392 393=head2 charblocks 394 395 use Unicode::UCD 'charblocks'; 396 397 my $charblocks = charblocks(); 398 399charblocks() returns a reference to a hash with the known block names 400as the keys, and the code point ranges (see L</charblock>) as the values. 401 402See also L</Blocks versus Scripts>. 403 404=cut 405 406sub charblocks { 407 _charblocks() unless %BLOCKS; 408 return \%BLOCKS; 409} 410 411=head2 charscripts 412 413 use Unicode::UCD 'charscripts'; 414 415 my %charscripts = charscripts(); 416 417charscripts() returns a hash with the known script names as the keys, 418and the code point ranges (see L</charscript>) as the values. 419 420See also L</Blocks versus Scripts>. 421 422=cut 423 424sub charscripts { 425 _charscripts() unless %SCRIPTS; 426 return \%SCRIPTS; 427} 428 429=head2 Blocks versus Scripts 430 431The difference between a block and a script is that scripts are closer 432to the linguistic notion of a set of characters required to present 433languages, while block is more of an artifact of the Unicode character 434numbering and separation into blocks of (mostly) 256 characters. 435 436For example the Latin B<script> is spread over several B<blocks>, such 437as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and 438C<Latin Extended-B>. On the other hand, the Latin script does not 439contain all the characters of the C<Basic Latin> block (also known as 440the ASCII): it includes only the letters, and not, for example, the digits 441or the punctuation. 442 443For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt 444 445For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/ 446 447=head2 Matching Scripts and Blocks 448 449Scripts are matched with the regular-expression construct 450C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script), 451while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches 452any of the 256 code points in the Tibetan block). 453 454=head2 Code Point Arguments 455 456A I<code point argument> is either a decimal or a hexadecimal scalar 457designating a Unicode character, or C<U+> followed by hexadecimals 458designating a Unicode character. Note that Unicode is B<not> limited 459to 16 bits (the number of Unicode characters is open-ended, in theory 460unlimited): you may have more than 4 hexdigits. 461 462=head2 charinrange 463 464In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you 465can also test whether a code point is in the I<range> as returned by 466L</charblock> and L</charscript> or as the values of the hash returned 467by L</charblocks> and L</charscripts> by using charinrange(): 468 469 use Unicode::UCD qw(charscript charinrange); 470 471 $range = charscript('Hiragana'); 472 print "looks like hiragana\n" if charinrange($range, $codepoint); 473 474=cut 475 476=head2 compexcl 477 478 use Unicode::UCD 'compexcl'; 479 480 my $compexcl = compexcl("09dc"); 481 482The compexcl() returns the composition exclusion (that is, if the 483character should not be produced during a precomposition) of the 484character specified by a B<code point argument>. 485 486If there is a composition exclusion for the character, true is 487returned. Otherwise, false is returned. 488 489=cut 490 491my %COMPEXCL; 492 493sub _compexcl { 494 unless (%COMPEXCL) { 495 if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) { 496 while (<$COMPEXCLFH>) { 497 if (/^([0-9A-F]+)\s+\#\s+/) { 498 my $code = hex($1); 499 $COMPEXCL{$code} = undef; 500 } 501 } 502 close($COMPEXCLFH); 503 } 504 } 505} 506 507sub compexcl { 508 my $arg = shift; 509 my $code = _getcode($arg); 510 croak __PACKAGE__, "::compexcl: unknown code '$arg'" 511 unless defined $code; 512 513 _compexcl() unless %COMPEXCL; 514 515 return exists $COMPEXCL{$code}; 516} 517 518=head2 casefold 519 520 use Unicode::UCD 'casefold'; 521 522 my %casefold = casefold("09dc"); 523 524The casefold() returns the locale-independent case folding of the 525character specified by a B<code point argument>. 526 527If there is a case folding for that character, a reference to a hash 528with the following fields is returned: 529 530 key 531 532 code code point with at least four hexdigits 533 status "C", "F", "S", or "I" 534 mapping one or more codes separated by spaces 535 536The meaning of the I<status> is as follows: 537 538 C common case folding, common mappings shared 539 by both simple and full mappings 540 F full case folding, mappings that cause strings 541 to grow in length. Multiple characters are separated 542 by spaces 543 S simple case folding, mappings to single characters 544 where different from F 545 I special case for dotted uppercase I and 546 dotless lowercase i 547 - If this mapping is included, the result is 548 case-insensitive, but dotless and dotted I's 549 are not distinguished 550 - If this mapping is excluded, the result is not 551 fully case-insensitive, but dotless and dotted 552 I's are distinguished 553 554If there is no case folding for that character, C<undef> is returned. 555 556For more information about case mappings see 557http://www.unicode.org/unicode/reports/tr21/ 558 559=cut 560 561my %CASEFOLD; 562 563sub _casefold { 564 unless (%CASEFOLD) { 565 if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) { 566 while (<$CASEFOLDFH>) { 567 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) { 568 my $code = hex($1); 569 $CASEFOLD{$code} = { code => $1, 570 status => $2, 571 mapping => $3 }; 572 } 573 } 574 close($CASEFOLDFH); 575 } 576 } 577} 578 579sub casefold { 580 my $arg = shift; 581 my $code = _getcode($arg); 582 croak __PACKAGE__, "::casefold: unknown code '$arg'" 583 unless defined $code; 584 585 _casefold() unless %CASEFOLD; 586 587 return $CASEFOLD{$code}; 588} 589 590=head2 casespec 591 592 use Unicode::UCD 'casespec'; 593 594 my %casespec = casespec("09dc"); 595 596The casespec() returns the potentially locale-dependent case mapping 597of the character specified by a B<code point argument>. The mapping 598may change the length of the string (which the basic Unicode case 599mappings as returned by charinfo() never do). 600 601If there is a case folding for that character, a reference to a hash 602with the following fields is returned: 603 604 key 605 606 code code point with at least four hexdigits 607 lower lowercase 608 title titlecase 609 upper uppercase 610 condition condition list (may be undef) 611 612The C<condition> is optional. Where present, it consists of one or 613more I<locales> or I<contexts>, separated by spaces (other than as 614used to separate elements, spaces are to be ignored). A condition 615list overrides the normal behavior if all of the listed conditions are 616true. Case distinctions in the condition list are not significant. 617Conditions preceded by "NON_" represent the negation of the condition 618 619Note that when there are multiple case folding definitions for a 620single code point because of different locales, the value returned by 621casespec() is a hash reference which has the locales as the keys and 622hash references as described above as the values. 623 624A I<locale> is defined as a 2-letter ISO 3166 country code, possibly 625followed by a "_" and a 2-letter ISO language code (possibly followed 626by a "_" and a variant code). You can find the lists of those codes, 627see L<Locale::Country> and L<Locale::Language>. 628 629A I<context> is one of the following choices: 630 631 FINAL The letter is not followed by a letter of 632 general category L (e.g. Ll, Lt, Lu, Lm, or Lo) 633 MODERN The mapping is only used for modern text 634 AFTER_i The last base character was "i" (U+0069) 635 636For more information about case mappings see 637http://www.unicode.org/unicode/reports/tr21/ 638 639=cut 640 641my %CASESPEC; 642 643sub _casespec { 644 unless (%CASESPEC) { 645 if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) { 646 while (<$CASESPECFH>) { 647 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) { 648 my ($hexcode, $lower, $title, $upper, $condition) = 649 ($1, $2, $3, $4, $5); 650 my $code = hex($hexcode); 651 if (exists $CASESPEC{$code}) { 652 if (exists $CASESPEC{$code}->{code}) { 653 my ($oldlower, 654 $oldtitle, 655 $oldupper, 656 $oldcondition) = 657 @{$CASESPEC{$code}}{qw(lower 658 title 659 upper 660 condition)}; 661 if (defined $oldcondition) { 662 my ($oldlocale) = 663 ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/); 664 delete $CASESPEC{$code}; 665 $CASESPEC{$code}->{$oldlocale} = 666 { code => $hexcode, 667 lower => $oldlower, 668 title => $oldtitle, 669 upper => $oldupper, 670 condition => $oldcondition }; 671 } 672 } 673 my ($locale) = 674 ($condition =~ /^([a-z][a-z](?:_\S+)?)/); 675 $CASESPEC{$code}->{$locale} = 676 { code => $hexcode, 677 lower => $lower, 678 title => $title, 679 upper => $upper, 680 condition => $condition }; 681 } else { 682 $CASESPEC{$code} = 683 { code => $hexcode, 684 lower => $lower, 685 title => $title, 686 upper => $upper, 687 condition => $condition }; 688 } 689 } 690 } 691 close($CASESPECFH); 692 } 693 } 694} 695 696sub casespec { 697 my $arg = shift; 698 my $code = _getcode($arg); 699 croak __PACKAGE__, "::casespec: unknown code '$arg'" 700 unless defined $code; 701 702 _casespec() unless %CASESPEC; 703 704 return $CASESPEC{$code}; 705} 706 707=head2 Unicode::UCD::UnicodeVersion 708 709Unicode::UCD::UnicodeVersion() returns the version of the Unicode 710Character Database, in other words, the version of the Unicode 711standard the database implements. The version is a string 712of numbers delimited by dots (C<'.'>). 713 714=cut 715 716my $UNICODEVERSION; 717 718sub UnicodeVersion { 719 unless (defined $UNICODEVERSION) { 720 openunicode(\$VERSIONFH, "version"); 721 chomp($UNICODEVERSION = <$VERSIONFH>); 722 close($VERSIONFH); 723 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'" 724 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/; 725 } 726 return $UNICODEVERSION; 727} 728 729=head2 Implementation Note 730 731The first use of charinfo() opens a read-only filehandle to the Unicode 732Character Database (the database is included in the Perl distribution). 733The filehandle is then kept open for further queries. In other words, 734if you are wondering where one of your filehandles went, that's where. 735 736=head1 BUGS 737 738Does not yet support EBCDIC platforms. 739 740=head1 AUTHOR 741 742Jarkko Hietaniemi 743 744=cut 745 7461; 747