1package Unicode::UCD; 2 3use strict; 4use warnings; 5no warnings 'surrogate'; # surrogates can be inputs to this 6use charnames (); 7 8our $VERSION = '0.64'; 9 10require Exporter; 11 12our @ISA = qw(Exporter); 13 14our @EXPORT_OK = qw(charinfo 15 charblock charscript 16 charblocks charscripts 17 charinrange 18 charprop 19 charprops_all 20 general_categories bidi_types 21 compexcl 22 casefold all_casefolds casespec 23 namedseq 24 num 25 prop_aliases 26 prop_value_aliases 27 prop_values 28 prop_invlist 29 prop_invmap 30 search_invlist 31 MAX_CP 32 ); 33 34use Carp; 35 36sub IS_ASCII_PLATFORM { ord("A") == 65 } 37 38=head1 NAME 39 40Unicode::UCD - Unicode character database 41 42=head1 SYNOPSIS 43 44 use Unicode::UCD 'charinfo'; 45 my $charinfo = charinfo($codepoint); 46 47 use Unicode::UCD 'charprop'; 48 my $value = charprop($codepoint, $property); 49 50 use Unicode::UCD 'charprops_all'; 51 my $all_values_hash_ref = charprops_all($codepoint); 52 53 use Unicode::UCD 'casefold'; 54 my $casefold = casefold($codepoint); 55 56 use Unicode::UCD 'all_casefolds'; 57 my $all_casefolds_ref = all_casefolds(); 58 59 use Unicode::UCD 'casespec'; 60 my $casespec = casespec($codepoint); 61 62 use Unicode::UCD 'charblock'; 63 my $charblock = charblock($codepoint); 64 65 use Unicode::UCD 'charscript'; 66 my $charscript = charscript($codepoint); 67 68 use Unicode::UCD 'charblocks'; 69 my $charblocks = charblocks(); 70 71 use Unicode::UCD 'charscripts'; 72 my $charscripts = charscripts(); 73 74 use Unicode::UCD qw(charscript charinrange); 75 my $range = charscript($script); 76 print "looks like $script\n" if charinrange($range, $codepoint); 77 78 use Unicode::UCD qw(general_categories bidi_types); 79 my $categories = general_categories(); 80 my $types = bidi_types(); 81 82 use Unicode::UCD 'prop_aliases'; 83 my @space_names = prop_aliases("space"); 84 85 use Unicode::UCD 'prop_value_aliases'; 86 my @gc_punct_names = prop_value_aliases("Gc", "Punct"); 87 88 use Unicode::UCD 'prop_values'; 89 my @all_EA_short_names = prop_values("East_Asian_Width"); 90 91 use Unicode::UCD 'prop_invlist'; 92 my @puncts = prop_invlist("gc=punctuation"); 93 94 use Unicode::UCD 'prop_invmap'; 95 my ($list_ref, $map_ref, $format, $missing) 96 = prop_invmap("General Category"); 97 98 use Unicode::UCD 'search_invlist'; 99 my $index = search_invlist(\@invlist, $code_point); 100 101 use Unicode::UCD 'compexcl'; 102 my $compexcl = compexcl($codepoint); 103 104 use Unicode::UCD 'namedseq'; 105 my $namedseq = namedseq($named_sequence_name); 106 107 my $unicode_version = Unicode::UCD::UnicodeVersion(); 108 109 my $convert_to_numeric = 110 Unicode::UCD::num("\N{RUMI DIGIT ONE}\N{RUMI DIGIT TWO}"); 111 112=head1 DESCRIPTION 113 114The Unicode::UCD module offers a series of functions that 115provide a simple interface to the Unicode 116Character Database. 117 118=head2 code point argument 119 120Some of the functions are called with a I<code point argument>, which is either 121a decimal or a hexadecimal scalar designating a code point in the platform's 122native character set (extended to Unicode), or a string containing C<U+> 123followed by hexadecimals 124designating a Unicode code point. A leading 0 will force a hexadecimal 125interpretation, as will a hexadecimal digit that isn't a decimal digit. 126 127Examples: 128 129 223 # Decimal 223 in native character set 130 0223 # Hexadecimal 223, native (= 547 decimal) 131 0xDF # Hexadecimal DF, native (= 223 decimal 132 'U+DF' # Hexadecimal DF, in Unicode's character set 133 (= LATIN SMALL LETTER SHARP S) 134 135Note that the largest code point in Unicode is U+10FFFF. 136 137=cut 138 139my $BLOCKSFH; 140my $VERSIONFH; 141my $CASEFOLDFH; 142my $CASESPECFH; 143my $NAMEDSEQFH; 144my $v_unicode_version; # v-string. 145 146sub openunicode { 147 my ($rfh, @path) = @_; 148 my $f; 149 unless (defined $$rfh) { 150 for my $d (@INC) { 151 use File::Spec; 152 $f = File::Spec->catfile($d, "unicore", @path); 153 last if open($$rfh, $f); 154 undef $f; 155 } 156 croak __PACKAGE__, ": failed to find ", 157 File::Spec->catfile(@path), " in @INC" 158 unless defined $f; 159 } 160 return $f; 161} 162 163sub _dclone ($) { # Use Storable::dclone if available; otherwise emulate it. 164 165 use if defined &DynaLoader::boot_DynaLoader, Storable => qw(dclone); 166 167 return dclone(shift) if defined &dclone; 168 169 my $arg = shift; 170 my $type = ref $arg; 171 return $arg unless $type; # No deep cloning needed for scalars 172 173 if ($type eq 'ARRAY') { 174 my @return; 175 foreach my $element (@$arg) { 176 push @return, &_dclone($element); 177 } 178 return \@return; 179 } 180 elsif ($type eq 'HASH') { 181 my %return; 182 foreach my $key (keys %$arg) { 183 $return{$key} = &_dclone($arg->{$key}); 184 } 185 return \%return; 186 } 187 else { 188 croak "_dclone can't handle " . $type; 189 } 190} 191 192=head2 B<charinfo()> 193 194 use Unicode::UCD 'charinfo'; 195 196 my $charinfo = charinfo(0x41); 197 198This returns information about the input L</code point argument> 199as a reference to a hash of fields as defined by the Unicode 200standard. If the L</code point argument> is not assigned in the standard 201(i.e., has the general category C<Cn> meaning C<Unassigned>) 202or is a non-character (meaning it is guaranteed to never be assigned in 203the standard), 204C<undef> is returned. 205 206Fields that aren't applicable to the particular code point argument exist in the 207returned hash, and are empty. 208 209For results that are less "raw" than this function returns, or to get the values for 210any property, not just the few covered by this function, use the 211L</charprop()> function. 212 213The keys in the hash with the meanings of their values are: 214 215=over 216 217=item B<code> 218 219the input native L</code point argument> expressed in hexadecimal, with 220leading zeros 221added if necessary to make it contain at least four hexdigits 222 223=item B<name> 224 225name of I<code>, all IN UPPER CASE. 226Some control-type code points do not have names. 227This field will be empty for C<Surrogate> and C<Private Use> code points, 228and for the others without a name, 229it will contain a description enclosed in angle brackets, like 230C<E<lt>controlE<gt>>. 231 232 233=item B<category> 234 235The short name of the general category of I<code>. 236This will match one of the keys in the hash returned by L</general_categories()>. 237 238The L</prop_value_aliases()> function can be used to get all the synonyms 239of the category name. 240 241=item B<combining> 242 243the combining class number for I<code> used in the Canonical Ordering Algorithm. 244For Unicode 5.1, this is described in Section 3.11 C<Canonical Ordering Behavior> 245available at 246L<http://www.unicode.org/versions/Unicode5.1.0/> 247 248The L</prop_value_aliases()> function can be used to get all the synonyms 249of the combining class number. 250 251=item B<bidi> 252 253bidirectional type of I<code>. 254This will match one of the keys in the hash returned by L</bidi_types()>. 255 256The L</prop_value_aliases()> function can be used to get all the synonyms 257of the bidi type name. 258 259=item B<decomposition> 260 261is empty if I<code> has no decomposition; or is one or more codes 262(separated by spaces) that, taken in order, represent a decomposition for 263I<code>. Each has at least four hexdigits. 264The codes may be preceded by a word enclosed in angle brackets, then a space, 265like C<E<lt>compatE<gt> >, giving the type of decomposition 266 267This decomposition may be an intermediate one whose components are also 268decomposable. Use L<Unicode::Normalize> to get the final decomposition in one 269step. 270 271=item B<decimal> 272 273if I<code> represents a decimal digit this is its integer numeric value 274 275=item B<digit> 276 277if I<code> represents some other digit-like number, this is its integer 278numeric value 279 280=item B<numeric> 281 282if I<code> represents a whole or rational number, this is its numeric value. 283Rational values are expressed as a string like C<1/4>. 284 285=item B<mirrored> 286 287C<Y> or C<N> designating if I<code> is mirrored in bidirectional text 288 289=item B<unicode10> 290 291name of I<code> in the Unicode 1.0 standard if one 292existed for this code point and is different from the current name 293 294=item B<comment> 295 296As of Unicode 6.0, this is always empty. 297 298=item B<upper> 299 300is, if non-empty, the uppercase mapping for I<code> expressed as at least four 301hexdigits. This indicates that the full uppercase mapping is a single 302character, and is identical to the simple (single-character only) mapping. 303When this field is empty, it means that the simple uppercase mapping is 304I<code> itself; you'll need some other means, (like L</charprop()> or 305L</casespec()> to get the full mapping. 306 307=item B<lower> 308 309is, if non-empty, the lowercase mapping for I<code> expressed as at least four 310hexdigits. This indicates that the full lowercase mapping is a single 311character, and is identical to the simple (single-character only) mapping. 312When this field is empty, it means that the simple lowercase mapping is 313I<code> itself; you'll need some other means, (like L</charprop()> or 314L</casespec()> to get the full mapping. 315 316=item B<title> 317 318is, if non-empty, the titlecase mapping for I<code> expressed as at least four 319hexdigits. This indicates that the full titlecase mapping is a single 320character, and is identical to the simple (single-character only) mapping. 321When this field is empty, it means that the simple titlecase mapping is 322I<code> itself; you'll need some other means, (like L</charprop()> or 323L</casespec()> to get the full mapping. 324 325=item B<block> 326 327the block I<code> belongs to (used in C<\p{Blk=...}>). 328The L</prop_value_aliases()> function can be used to get all the synonyms 329of the block name. 330 331See L</Blocks versus Scripts>. 332 333=item B<script> 334 335the script I<code> belongs to. 336The L</prop_value_aliases()> function can be used to get all the synonyms 337of the script name. 338 339See L</Blocks versus Scripts>. 340 341=back 342 343Note that you cannot do (de)composition and casing based solely on the 344I<decomposition>, I<combining>, I<lower>, I<upper>, and I<title> fields; you 345will need also the L</casespec()> function and the C<Composition_Exclusion> 346property. (Or you could just use the L<lc()|perlfunc/lc>, 347L<uc()|perlfunc/uc>, and L<ucfirst()|perlfunc/ucfirst> functions, and the 348L<Unicode::Normalize> module.) 349 350=cut 351 352# NB: This function is nearly duplicated in charnames.pm 353sub _getcode { 354 my $arg = shift; 355 356 if ($arg =~ /^[1-9]\d*$/) { 357 return $arg; 358 } 359 elsif ($arg =~ /^(?:0[xX])?([[:xdigit:]]+)$/) { 360 return CORE::hex($1); 361 } 362 elsif ($arg =~ /^[Uu]\+([[:xdigit:]]+)$/) { # Is of form U+0000, means 363 # wants the Unicode code 364 # point, not the native one 365 my $decimal = CORE::hex($1); 366 return $decimal if IS_ASCII_PLATFORM; 367 return utf8::unicode_to_native($decimal); 368 } 369 370 return; 371} 372 373# Populated by _num. Converts real number back to input rational 374my %real_to_rational; 375 376# To store the contents of files found on disk. 377my @BIDIS; 378my @CATEGORIES; 379my @DECOMPOSITIONS; 380my @NUMERIC_TYPES; 381my %SIMPLE_LOWER; 382my %SIMPLE_TITLE; 383my %SIMPLE_UPPER; 384my %UNICODE_1_NAMES; 385my %ISO_COMMENT; 386 387# Eval'd so can run on versions earlier than the property is available in 388my $Hangul_Syllables_re = eval 'qr/\p{Block=Hangul_Syllables}/'; 389 390sub charinfo { 391 392 # This function has traditionally mimicked what is in UnicodeData.txt, 393 # warts and all. This is a re-write that avoids UnicodeData.txt so that 394 # it can be removed to save disk space. Instead, this assembles 395 # information gotten by other methods that get data from various other 396 # files. It uses charnames to get the character name; and various 397 # mktables tables. 398 399 use feature 'unicode_strings'; 400 401 # Will fail if called under minitest 402 use if defined &DynaLoader::boot_DynaLoader, "Unicode::Normalize" => qw(getCombinClass NFD); 403 404 my $arg = shift; 405 my $code = _getcode($arg); 406 croak __PACKAGE__, "::charinfo: unknown code '$arg'" unless defined $code; 407 408 # Non-unicode implies undef. 409 return if $code > 0x10FFFF; 410 411 my %prop; 412 my $char = chr($code); 413 414 @CATEGORIES =_read_table("To/Gc.pl") unless @CATEGORIES; 415 $prop{'category'} = _search(\@CATEGORIES, 0, $#CATEGORIES, $code) 416 // $utf8::SwashInfo{'ToGc'}{'missing'}; 417 # Return undef if category value is 'Unassigned' or one of its synonyms 418 return if grep { lc $_ eq 'unassigned' } 419 prop_value_aliases('Gc', $prop{'category'}); 420 421 $prop{'code'} = sprintf "%04X", $code; 422 $prop{'name'} = ($char =~ /\p{Cntrl}/) ? '<control>' 423 : (charnames::viacode($code) // ""); 424 425 $prop{'combining'} = getCombinClass($code); 426 427 @BIDIS =_read_table("To/Bc.pl") unless @BIDIS; 428 $prop{'bidi'} = _search(\@BIDIS, 0, $#BIDIS, $code) 429 // $utf8::SwashInfo{'ToBc'}{'missing'}; 430 431 # For most code points, we can just read in "unicore/Decomposition.pl", as 432 # its contents are exactly what should be output. But that file doesn't 433 # contain the data for the Hangul syllable decompositions, which can be 434 # algorithmically computed, and NFD() does that, so we call NFD() for 435 # those. We can't use NFD() for everything, as it does a complete 436 # recursive decomposition, and what this function has always done is to 437 # return what's in UnicodeData.txt which doesn't show that recursiveness. 438 # Fortunately, the NFD() of the Hanguls doesn't have any recursion 439 # issues. 440 # Having no decomposition implies an empty field; otherwise, all but 441 # "Canonical" imply a compatible decomposition, and the type is prefixed 442 # to that, as it is in UnicodeData.txt 443 UnicodeVersion() unless defined $v_unicode_version; 444 if ($v_unicode_version ge v2.0.0 && $char =~ $Hangul_Syllables_re) { 445 # The code points of the decomposition are output in standard Unicode 446 # hex format, separated by blanks. 447 $prop{'decomposition'} = join " ", map { sprintf("%04X", $_)} 448 unpack "U*", NFD($char); 449 } 450 else { 451 @DECOMPOSITIONS = _read_table("Decomposition.pl") 452 unless @DECOMPOSITIONS; 453 $prop{'decomposition'} = _search(\@DECOMPOSITIONS, 0, $#DECOMPOSITIONS, 454 $code) // ""; 455 } 456 457 # Can use num() to get the numeric values, if any. 458 if (! defined (my $value = num($char))) { 459 $prop{'decimal'} = $prop{'digit'} = $prop{'numeric'} = ""; 460 } 461 else { 462 if ($char =~ /\d/) { 463 $prop{'decimal'} = $prop{'digit'} = $prop{'numeric'} = $value; 464 } 465 else { 466 467 # For non-decimal-digits, we have to read in the Numeric type 468 # to distinguish them. It is not just a matter of integer vs. 469 # rational, as some whole number values are not considered digits, 470 # e.g., TAMIL NUMBER TEN. 471 $prop{'decimal'} = ""; 472 473 @NUMERIC_TYPES =_read_table("To/Nt.pl") unless @NUMERIC_TYPES; 474 if ((_search(\@NUMERIC_TYPES, 0, $#NUMERIC_TYPES, $code) // "") 475 eq 'Digit') 476 { 477 $prop{'digit'} = $prop{'numeric'} = $value; 478 } 479 else { 480 $prop{'digit'} = ""; 481 $prop{'numeric'} = $real_to_rational{$value} // $value; 482 } 483 } 484 } 485 486 $prop{'mirrored'} = ($char =~ /\p{Bidi_Mirrored}/) ? 'Y' : 'N'; 487 488 %UNICODE_1_NAMES =_read_table("To/Na1.pl", "use_hash") unless %UNICODE_1_NAMES; 489 $prop{'unicode10'} = $UNICODE_1_NAMES{$code} // ""; 490 491 UnicodeVersion() unless defined $v_unicode_version; 492 if ($v_unicode_version ge v6.0.0) { 493 $prop{'comment'} = ""; 494 } 495 else { 496 %ISO_COMMENT = _read_table("To/Isc.pl", "use_hash") unless %ISO_COMMENT; 497 $prop{'comment'} = (defined $ISO_COMMENT{$code}) 498 ? $ISO_COMMENT{$code} 499 : ""; 500 } 501 502 %SIMPLE_UPPER = _read_table("To/Uc.pl", "use_hash") unless %SIMPLE_UPPER; 503 $prop{'upper'} = (defined $SIMPLE_UPPER{$code}) 504 ? sprintf("%04X", $SIMPLE_UPPER{$code}) 505 : ""; 506 507 %SIMPLE_LOWER = _read_table("To/Lc.pl", "use_hash") unless %SIMPLE_LOWER; 508 $prop{'lower'} = (defined $SIMPLE_LOWER{$code}) 509 ? sprintf("%04X", $SIMPLE_LOWER{$code}) 510 : ""; 511 512 %SIMPLE_TITLE = _read_table("To/Tc.pl", "use_hash") unless %SIMPLE_TITLE; 513 $prop{'title'} = (defined $SIMPLE_TITLE{$code}) 514 ? sprintf("%04X", $SIMPLE_TITLE{$code}) 515 : ""; 516 517 $prop{block} = charblock($code); 518 $prop{script} = charscript($code); 519 return \%prop; 520} 521 522sub _search { # Binary search in a [[lo,hi,prop],[...],...] table. 523 my ($table, $lo, $hi, $code) = @_; 524 525 return if $lo > $hi; 526 527 my $mid = int(($lo+$hi) / 2); 528 529 if ($table->[$mid]->[0] < $code) { 530 if ($table->[$mid]->[1] >= $code) { 531 return $table->[$mid]->[2]; 532 } else { 533 _search($table, $mid + 1, $hi, $code); 534 } 535 } elsif ($table->[$mid]->[0] > $code) { 536 _search($table, $lo, $mid - 1, $code); 537 } else { 538 return $table->[$mid]->[2]; 539 } 540} 541 542sub _read_table ($;$) { 543 544 # Returns the contents of the mktables generated table file located at $1 545 # in the form of either an array of arrays or a hash, depending on if the 546 # optional second parameter is true (for hash return) or not. In the case 547 # of a hash return, each key is a code point, and its corresponding value 548 # is what the table gives as the code point's corresponding value. In the 549 # case of an array return, each outer array denotes a range with [0] the 550 # start point of that range; [1] the end point; and [2] the value that 551 # every code point in the range has. The hash return is useful for fast 552 # lookup when the table contains only single code point ranges. The array 553 # return takes much less memory when there are large ranges. 554 # 555 # This function has the side effect of setting 556 # $utf8::SwashInfo{$property}{'format'} to be the mktables format of the 557 # table; and 558 # $utf8::SwashInfo{$property}{'missing'} to be the value for all entries 559 # not listed in the table. 560 # where $property is the Unicode property name, preceded by 'To' for map 561 # properties., e.g., 'ToSc'. 562 # 563 # Table entries look like one of: 564 # 0000 0040 Common # [65] 565 # 00AA Latin 566 567 my $table = shift; 568 my $return_hash = shift; 569 $return_hash = 0 unless defined $return_hash; 570 my @return; 571 my %return; 572 local $_; 573 my $list = do "unicore/$table"; 574 575 # Look up if this property requires adjustments, which we do below if it 576 # does. 577 require "unicore/Heavy.pl"; 578 my $property = $table =~ s/\.pl//r; 579 $property = $utf8::file_to_swash_name{$property}; 580 my $to_adjust = defined $property 581 && $utf8::SwashInfo{$property}{'format'} =~ / ^ a /x; 582 583 for (split /^/m, $list) { 584 my ($start, $end, $value) = / ^ (.+?) \t (.*?) \t (.+?) 585 \s* ( \# .* )? # Optional comment 586 $ /x; 587 my $decimal_start = hex $start; 588 my $decimal_end = ($end eq "") ? $decimal_start : hex $end; 589 $value = hex $value if $to_adjust 590 && $utf8::SwashInfo{$property}{'format'} eq 'ax'; 591 if ($return_hash) { 592 foreach my $i ($decimal_start .. $decimal_end) { 593 $return{$i} = ($to_adjust) 594 ? $value + $i - $decimal_start 595 : $value; 596 } 597 } 598 elsif (! $to_adjust 599 && @return 600 && $return[-1][1] == $decimal_start - 1 601 && $return[-1][2] eq $value) 602 { 603 # If this is merely extending the previous range, do just that. 604 $return[-1]->[1] = $decimal_end; 605 } 606 else { 607 push @return, [ $decimal_start, $decimal_end, $value ]; 608 } 609 } 610 return ($return_hash) ? %return : @return; 611} 612 613sub charinrange { 614 my ($range, $arg) = @_; 615 my $code = _getcode($arg); 616 croak __PACKAGE__, "::charinrange: unknown code '$arg'" 617 unless defined $code; 618 _search($range, 0, $#$range, $code); 619} 620 621=head2 B<charprop()> 622 623 use Unicode::UCD 'charprop'; 624 625 print charprop(0x41, "Gc"), "\n"; 626 print charprop(0x61, "General_Category"), "\n"; 627 628 prints 629 Lu 630 Ll 631 632This returns the value of the Unicode property given by the second parameter 633for the L</code point argument> given by the first. 634 635The passed-in property may be specified as any of the synonyms returned by 636L</prop_aliases()>. 637 638The return value is always a scalar, either a string or a number. For 639properties where there are synonyms for the values, the synonym returned by 640this function is the longest, most descriptive form, the one returned by 641L</prop_value_aliases()> when called in a scalar context. Of course, you can 642call L</prop_value_aliases()> on the result to get other synonyms. 643 644The return values are more "cooked" than the L</charinfo()> ones. For 645example, the C<"uc"> property value is the actual string containing the full 646uppercase mapping of the input code point. You have to go to extra trouble 647with C<charinfo> to get this value from its C<upper> hash element when the 648full mapping differs from the simple one. 649 650Special note should be made of the return values for a few properties: 651 652=over 653 654=item Block 655 656The value returned is the new-style (see L</Old-style versus new-style block 657names>). 658 659=item Decomposition_Mapping 660 661Like L</charinfo()>, the result may be an intermediate decomposition whose 662components are also decomposable. Use L<Unicode::Normalize> to get the final 663decomposition in one step. 664 665Unlike L</charinfo()>, this does not include the decomposition type. Use the 666C<Decomposition_Type> property to get that. 667 668=item Name_Alias 669 670If the input code point's name has more than one synonym, they are returned 671joined into a single comma-separated string. 672 673=item Numeric_Value 674 675If the result is a fraction, it is converted into a floating point number to 676the accuracy of your platform. 677 678=item Script_Extensions 679 680If the result is multiple script names, they are returned joined into a single 681comma-separated string. 682 683=back 684 685When called with a property that is a Perl extension that isn't expressible in 686a compound form, this function currently returns C<undef>, as the only two 687possible values are I<true> or I<false> (1 or 0 I suppose). This behavior may 688change in the future, so don't write code that relies on it. C<Present_In> is 689a Perl extension that is expressible in a bipartite or compound form (for 690example, C<\p{Present_In=4.0}>), so C<charprop> accepts it. But C<Any> is a 691Perl extension that isn't expressible that way, so C<charprop> returns 692C<undef> for it. Also C<charprop> returns C<undef> for all Perl extensions 693that are internal-only. 694 695=cut 696 697sub charprop ($$) { 698 my ($input_cp, $prop) = @_; 699 700 my $cp = _getcode($input_cp); 701 croak __PACKAGE__, "::charprop: unknown code point '$input_cp'" unless defined $cp; 702 703 my ($list_ref, $map_ref, $format, $default) 704 = prop_invmap($prop); 705 return undef unless defined $list_ref; 706 707 my $i = search_invlist($list_ref, $cp); 708 croak __PACKAGE__, "::charprop: prop_invmap return is invalid for charprop('$input_cp', '$prop)" unless defined $i; 709 710 # $i is the index into both the inversion list and map of $cp. 711 my $map = $map_ref->[$i]; 712 713 # Convert enumeration values to their most complete form. 714 if (! ref $map) { 715 my $long_form = prop_value_aliases($prop, $map); 716 $map = $long_form if defined $long_form; 717 } 718 719 if ($format =~ / ^ s /x) { # Scalars 720 return join ",", @$map if ref $map; # Convert to scalar with comma 721 # separated array elements 722 723 # Resolve ambiguity as to whether an all digit value is a code point 724 # that should be converted to a character, or whether it is really 725 # just a number. To do this, look at the default. If it is a 726 # non-empty number, we can safely assume the result is also a number. 727 if ($map =~ / ^ \d+ $ /ax && $default !~ / ^ \d+ $ /ax) { 728 $map = chr $map; 729 } 730 elsif ($map =~ / ^ (?: Y | N ) $ /x) { 731 732 # prop_invmap() returns these values for properties that are Perl 733 # extensions. But this is misleading. For now, return undef for 734 # these, as currently documented. 735 undef $map unless 736 exists $Unicode::UCD::prop_aliases{utf8::_loose_name(lc $prop)}; 737 } 738 return $map; 739 } 740 elsif ($format eq 'ar') { # numbers, including rationals 741 my $offset = $cp - $list_ref->[$i]; 742 return $map if $map =~ /nan/i; 743 return $map + $offset if $offset != 0; # If needs adjustment 744 return eval $map; # Convert e.g., 1/2 to 0.5 745 } 746 elsif ($format =~ /^a/) { # Some entries need adjusting 747 748 # Linearize sequences into a string. 749 return join "", map { chr $_ } @$map if ref $map; # XXX && $format =~ /^ a [dl] /x; 750 751 return "" if $map eq "" && $format =~ /^a.*e/; 752 753 # These are all character mappings. Return the chr if no adjustment 754 # is needed 755 return chr $cp if $map eq "0"; 756 757 # Convert special entry. 758 if ($map eq '<hangul syllable>' && $format eq 'ad') { 759 use Unicode::Normalize qw(NFD); 760 return NFD(chr $cp); 761 } 762 763 # The rest need adjustment from the first entry in the inversion list 764 # corresponding to this map. 765 my $offset = $cp - $list_ref->[$i]; 766 return chr($map + $cp - $list_ref->[$i]); 767 } 768 elsif ($format eq 'n') { # The name property 769 770 # There are two special cases, handled here. 771 if ($map =~ / ( .+ ) <code\ point> $ /x) { 772 $map = sprintf("$1%04X", $cp); 773 } 774 elsif ($map eq '<hangul syllable>') { 775 $map = charnames::viacode($cp); 776 } 777 return $map; 778 } 779 else { 780 croak __PACKAGE__, "::charprop: Internal error: unknown format '$format'. Please perlbug this"; 781 } 782} 783 784=head2 B<charprops_all()> 785 786 use Unicode::UCD 'charprops_all'; 787 788 my $%properties_of_A_hash_ref = charprops_all("U+41"); 789 790This returns a reference to a hash whose keys are all the distinct Unicode (no 791Perl extension) properties, and whose values are the respective values for 792those properties for the input L</code point argument>. 793 794Each key is the property name in its longest, most descriptive form. The 795values are what L</charprop()> would return. 796 797This function is expensive in time and memory. 798 799=cut 800 801sub charprops_all($) { 802 my $input_cp = shift; 803 804 my $cp = _getcode($input_cp); 805 croak __PACKAGE__, "::charprops_all: unknown code point '$input_cp'" unless defined $cp; 806 807 my %return; 808 809 require "unicore/UCD.pl"; 810 811 foreach my $prop (keys %Unicode::UCD::prop_aliases) { 812 813 # Don't return a Perl extension. (This is the only one that 814 # %prop_aliases has in it.) 815 next if $prop eq 'perldecimaldigit'; 816 817 # Use long name for $prop in the hash 818 $return{scalar prop_aliases($prop)} = charprop($cp, $prop); 819 } 820 821 return \%return; 822} 823 824=head2 B<charblock()> 825 826 use Unicode::UCD 'charblock'; 827 828 my $charblock = charblock(0x41); 829 my $charblock = charblock(1234); 830 my $charblock = charblock(0x263a); 831 my $charblock = charblock("U+263a"); 832 833 my $range = charblock('Armenian'); 834 835With a L</code point argument> C<charblock()> returns the I<block> the code point 836belongs to, e.g. C<Basic Latin>. The old-style block name is returned (see 837L</Old-style versus new-style block names>). 838The L</prop_value_aliases()> function can be used to get all the synonyms 839of the block name. 840 841If the code point is unassigned, this returns the block it would belong to if 842it were assigned. (If the Unicode version being used is so early as to not 843have blocks, all code points are considered to be in C<No_Block>.) 844 845See also L</Blocks versus Scripts>. 846 847If supplied with an argument that can't be a code point, C<charblock()> tries to 848do the opposite and interpret the argument as an old-style block name. On an 849ASCII platform, the return value is a I<range set> with one range: an 850anonymous array with a single element that consists of another anonymous array 851whose first element is the first code point in the block, and whose second 852element is the final code point in the block. On an EBCDIC 853platform, the first two Unicode blocks are not contiguous. Their range sets 854are lists containing I<start-of-range>, I<end-of-range> code point pairs. You 855can test whether a code point is in a range set using the L</charinrange()> 856function. (To be precise, each I<range set> contains a third array element, 857after the range boundary ones: the old_style block name.) 858 859If the argument to C<charblock()> is not a known block, C<undef> is 860returned. 861 862=cut 863 864my @BLOCKS; 865my %BLOCKS; 866 867sub _charblocks { 868 869 # Can't read from the mktables table because it loses the hyphens in the 870 # original. 871 unless (@BLOCKS) { 872 UnicodeVersion() unless defined $v_unicode_version; 873 if ($v_unicode_version lt v2.0.0) { 874 my $subrange = [ 0, 0x10FFFF, 'No_Block' ]; 875 push @BLOCKS, $subrange; 876 push @{$BLOCKS{'No_Block'}}, $subrange; 877 } 878 elsif (openunicode(\$BLOCKSFH, "Blocks.txt")) { 879 local $_; 880 local $/ = "\n"; 881 while (<$BLOCKSFH>) { 882 883 # Old versions used a different syntax to mark the range. 884 $_ =~ s/;\s+/../ if $v_unicode_version lt v3.1.0; 885 886 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) { 887 my ($lo, $hi) = (hex($1), hex($2)); 888 my $subrange = [ $lo, $hi, $3 ]; 889 push @BLOCKS, $subrange; 890 push @{$BLOCKS{$3}}, $subrange; 891 } 892 } 893 close($BLOCKSFH); 894 if (! IS_ASCII_PLATFORM) { 895 # The first two blocks, through 0xFF, are wrong on EBCDIC 896 # platforms. 897 898 my @new_blocks = _read_table("To/Blk.pl"); 899 900 # Get rid of the first two ranges in the Unicode version, and 901 # replace them with the ones computed by mktables. 902 shift @BLOCKS; 903 shift @BLOCKS; 904 delete $BLOCKS{'Basic Latin'}; 905 delete $BLOCKS{'Latin-1 Supplement'}; 906 907 # But there are multiple entries in the computed versions, and 908 # we change their names to (which we know) to be the old-style 909 # ones. 910 for my $i (0.. @new_blocks - 1) { 911 if ($new_blocks[$i][2] =~ s/Basic_Latin/Basic Latin/ 912 or $new_blocks[$i][2] =~ 913 s/Latin_1_Supplement/Latin-1 Supplement/) 914 { 915 push @{$BLOCKS{$new_blocks[$i][2]}}, $new_blocks[$i]; 916 } 917 else { 918 splice @new_blocks, $i; 919 last; 920 } 921 } 922 unshift @BLOCKS, @new_blocks; 923 } 924 } 925 } 926} 927 928sub charblock { 929 my $arg = shift; 930 931 _charblocks() unless @BLOCKS; 932 933 my $code = _getcode($arg); 934 935 if (defined $code) { 936 my $result = _search(\@BLOCKS, 0, $#BLOCKS, $code); 937 return $result if defined $result; 938 return 'No_Block'; 939 } 940 elsif (exists $BLOCKS{$arg}) { 941 return _dclone $BLOCKS{$arg}; 942 } 943 944 carp __PACKAGE__, "::charblock: unknown code '$arg'"; 945 return; 946} 947 948=head2 B<charscript()> 949 950 use Unicode::UCD 'charscript'; 951 952 my $charscript = charscript(0x41); 953 my $charscript = charscript(1234); 954 my $charscript = charscript("U+263a"); 955 956 my $range = charscript('Thai'); 957 958With a L</code point argument>, C<charscript()> returns the I<script> the 959code point belongs to, e.g., C<Latin>, C<Greek>, C<Han>. 960If the code point is unassigned or the Unicode version being used is so early 961that it doesn't have scripts, this function returns C<"Unknown">. 962The L</prop_value_aliases()> function can be used to get all the synonyms 963of the script name. 964 965If supplied with an argument that can't be a code point, charscript() tries 966to do the opposite and interpret the argument as a script name. The 967return value is a I<range set>: an anonymous array of arrays that contain 968I<start-of-range>, I<end-of-range> code point pairs. You can test whether a 969code point is in a range set using the L</charinrange()> function. 970(To be precise, each I<range set> contains a third array element, 971after the range boundary ones: the script name.) 972 973If the C<charscript()> argument is not a known script, C<undef> is returned. 974 975See also L</Blocks versus Scripts>. 976 977=cut 978 979my @SCRIPTS; 980my %SCRIPTS; 981 982sub _charscripts { 983 unless (@SCRIPTS) { 984 UnicodeVersion() unless defined $v_unicode_version; 985 if ($v_unicode_version lt v3.1.0) { 986 push @SCRIPTS, [ 0, 0x10FFFF, 'Unknown' ]; 987 } 988 else { 989 @SCRIPTS =_read_table("To/Sc.pl"); 990 } 991 } 992 foreach my $entry (@SCRIPTS) { 993 $entry->[2] =~ s/(_\w)/\L$1/g; # Preserve old-style casing 994 push @{$SCRIPTS{$entry->[2]}}, $entry; 995 } 996} 997 998sub charscript { 999 my $arg = shift; 1000 1001 _charscripts() unless @SCRIPTS; 1002 1003 my $code = _getcode($arg); 1004 1005 if (defined $code) { 1006 my $result = _search(\@SCRIPTS, 0, $#SCRIPTS, $code); 1007 return $result if defined $result; 1008 return $utf8::SwashInfo{'ToSc'}{'missing'}; 1009 } elsif (exists $SCRIPTS{$arg}) { 1010 return _dclone $SCRIPTS{$arg}; 1011 } 1012 1013 carp __PACKAGE__, "::charscript: unknown code '$arg'"; 1014 return; 1015} 1016 1017=head2 B<charblocks()> 1018 1019 use Unicode::UCD 'charblocks'; 1020 1021 my $charblocks = charblocks(); 1022 1023C<charblocks()> returns a reference to a hash with the known block names 1024as the keys, and the code point ranges (see L</charblock()>) as the values. 1025 1026The names are in the old-style (see L</Old-style versus new-style block 1027names>). 1028 1029L<prop_invmap("block")|/prop_invmap()> can be used to get this same data in a 1030different type of data structure. 1031 1032L<prop_values("Block")|/prop_values()> can be used to get all 1033the known new-style block names as a list, without the code point ranges. 1034 1035See also L</Blocks versus Scripts>. 1036 1037=cut 1038 1039sub charblocks { 1040 _charblocks() unless %BLOCKS; 1041 return _dclone \%BLOCKS; 1042} 1043 1044=head2 B<charscripts()> 1045 1046 use Unicode::UCD 'charscripts'; 1047 1048 my $charscripts = charscripts(); 1049 1050C<charscripts()> returns a reference to a hash with the known script 1051names as the keys, and the code point ranges (see L</charscript()>) as 1052the values. 1053 1054L<prop_invmap("script")|/prop_invmap()> can be used to get this same data in a 1055different type of data structure. 1056 1057L<C<prop_values("Script")>|/prop_values()> can be used to get all 1058the known script names as a list, without the code point ranges. 1059 1060See also L</Blocks versus Scripts>. 1061 1062=cut 1063 1064sub charscripts { 1065 _charscripts() unless %SCRIPTS; 1066 return _dclone \%SCRIPTS; 1067} 1068 1069=head2 B<charinrange()> 1070 1071In addition to using the C<\p{Blk=...}> and C<\P{Blk=...}> constructs, you 1072can also test whether a code point is in the I<range> as returned by 1073L</charblock()> and L</charscript()> or as the values of the hash returned 1074by L</charblocks()> and L</charscripts()> by using C<charinrange()>: 1075 1076 use Unicode::UCD qw(charscript charinrange); 1077 1078 $range = charscript('Hiragana'); 1079 print "looks like hiragana\n" if charinrange($range, $codepoint); 1080 1081=cut 1082 1083my %GENERAL_CATEGORIES = 1084 ( 1085 'L' => 'Letter', 1086 'LC' => 'CasedLetter', 1087 'Lu' => 'UppercaseLetter', 1088 'Ll' => 'LowercaseLetter', 1089 'Lt' => 'TitlecaseLetter', 1090 'Lm' => 'ModifierLetter', 1091 'Lo' => 'OtherLetter', 1092 'M' => 'Mark', 1093 'Mn' => 'NonspacingMark', 1094 'Mc' => 'SpacingMark', 1095 'Me' => 'EnclosingMark', 1096 'N' => 'Number', 1097 'Nd' => 'DecimalNumber', 1098 'Nl' => 'LetterNumber', 1099 'No' => 'OtherNumber', 1100 'P' => 'Punctuation', 1101 'Pc' => 'ConnectorPunctuation', 1102 'Pd' => 'DashPunctuation', 1103 'Ps' => 'OpenPunctuation', 1104 'Pe' => 'ClosePunctuation', 1105 'Pi' => 'InitialPunctuation', 1106 'Pf' => 'FinalPunctuation', 1107 'Po' => 'OtherPunctuation', 1108 'S' => 'Symbol', 1109 'Sm' => 'MathSymbol', 1110 'Sc' => 'CurrencySymbol', 1111 'Sk' => 'ModifierSymbol', 1112 'So' => 'OtherSymbol', 1113 'Z' => 'Separator', 1114 'Zs' => 'SpaceSeparator', 1115 'Zl' => 'LineSeparator', 1116 'Zp' => 'ParagraphSeparator', 1117 'C' => 'Other', 1118 'Cc' => 'Control', 1119 'Cf' => 'Format', 1120 'Cs' => 'Surrogate', 1121 'Co' => 'PrivateUse', 1122 'Cn' => 'Unassigned', 1123 ); 1124 1125sub general_categories { 1126 return _dclone \%GENERAL_CATEGORIES; 1127} 1128 1129=head2 B<general_categories()> 1130 1131 use Unicode::UCD 'general_categories'; 1132 1133 my $categories = general_categories(); 1134 1135This returns a reference to a hash which has short 1136general category names (such as C<Lu>, C<Nd>, C<Zs>, C<S>) as keys and long 1137names (such as C<UppercaseLetter>, C<DecimalNumber>, C<SpaceSeparator>, 1138C<Symbol>) as values. The hash is reversible in case you need to go 1139from the long names to the short names. The general category is the 1140one returned from 1141L</charinfo()> under the C<category> key. 1142 1143The L</prop_values()> and L</prop_value_aliases()> functions can be used as an 1144alternative to this function; the first returning a simple list of the short 1145category names; and the second gets all the synonyms of a given category name. 1146 1147=cut 1148 1149my %BIDI_TYPES = 1150 ( 1151 'L' => 'Left-to-Right', 1152 'LRE' => 'Left-to-Right Embedding', 1153 'LRO' => 'Left-to-Right Override', 1154 'R' => 'Right-to-Left', 1155 'AL' => 'Right-to-Left Arabic', 1156 'RLE' => 'Right-to-Left Embedding', 1157 'RLO' => 'Right-to-Left Override', 1158 'PDF' => 'Pop Directional Format', 1159 'EN' => 'European Number', 1160 'ES' => 'European Number Separator', 1161 'ET' => 'European Number Terminator', 1162 'AN' => 'Arabic Number', 1163 'CS' => 'Common Number Separator', 1164 'NSM' => 'Non-Spacing Mark', 1165 'BN' => 'Boundary Neutral', 1166 'B' => 'Paragraph Separator', 1167 'S' => 'Segment Separator', 1168 'WS' => 'Whitespace', 1169 'ON' => 'Other Neutrals', 1170 ); 1171 1172=head2 B<bidi_types()> 1173 1174 use Unicode::UCD 'bidi_types'; 1175 1176 my $categories = bidi_types(); 1177 1178This returns a reference to a hash which has the short 1179bidi (bidirectional) type names (such as C<L>, C<R>) as keys and long 1180names (such as C<Left-to-Right>, C<Right-to-Left>) as values. The 1181hash is reversible in case you need to go from the long names to the 1182short names. The bidi type is the one returned from 1183L</charinfo()> 1184under the C<bidi> key. For the exact meaning of the various bidi classes 1185the Unicode TR9 is recommended reading: 1186L<http://www.unicode.org/reports/tr9/> 1187(as of Unicode 5.0.0) 1188 1189The L</prop_values()> and L</prop_value_aliases()> functions can be used as an 1190alternative to this function; the first returning a simple list of the short 1191bidi type names; and the second gets all the synonyms of a given bidi type 1192name. 1193 1194=cut 1195 1196sub bidi_types { 1197 return _dclone \%BIDI_TYPES; 1198} 1199 1200=head2 B<compexcl()> 1201 1202 use Unicode::UCD 'compexcl'; 1203 1204 my $compexcl = compexcl(0x09dc); 1205 1206This routine returns C<undef> if the Unicode version being used is so early 1207that it doesn't have this property. 1208 1209C<compexcl()> is included for backwards 1210compatibility, but as of Perl 5.12 and more modern Unicode versions, for 1211most purposes it is probably more convenient to use one of the following 1212instead: 1213 1214 my $compexcl = chr(0x09dc) =~ /\p{Comp_Ex}; 1215 my $compexcl = chr(0x09dc) =~ /\p{Full_Composition_Exclusion}; 1216 1217or even 1218 1219 my $compexcl = chr(0x09dc) =~ /\p{CE}; 1220 my $compexcl = chr(0x09dc) =~ /\p{Composition_Exclusion}; 1221 1222The first two forms return B<true> if the L</code point argument> should not 1223be produced by composition normalization. For the final two forms to return 1224B<true>, it is additionally required that this fact not otherwise be 1225determinable from the Unicode data base. 1226 1227This routine behaves identically to the final two forms. That is, 1228it does not return B<true> if the code point has a decomposition 1229consisting of another single code point, nor if its decomposition starts 1230with a code point whose combining class is non-zero. Code points that meet 1231either of these conditions should also not be produced by composition 1232normalization, which is probably why you should use the 1233C<Full_Composition_Exclusion> property instead, as shown above. 1234 1235The routine returns B<false> otherwise. 1236 1237=cut 1238 1239# Eval'd so can run on versions earlier than the property is available in 1240my $Composition_Exclusion_re = eval 'qr/\p{Composition_Exclusion}/'; 1241 1242sub compexcl { 1243 my $arg = shift; 1244 my $code = _getcode($arg); 1245 croak __PACKAGE__, "::compexcl: unknown code '$arg'" 1246 unless defined $code; 1247 1248 UnicodeVersion() unless defined $v_unicode_version; 1249 return if $v_unicode_version lt v3.0.0; 1250 1251 no warnings "non_unicode"; # So works on non-Unicode code points 1252 return chr($code) =~ $Composition_Exclusion_re 1253} 1254 1255=head2 B<casefold()> 1256 1257 use Unicode::UCD 'casefold'; 1258 1259 my $casefold = casefold(0xDF); 1260 if (defined $casefold) { 1261 my @full_fold_hex = split / /, $casefold->{'full'}; 1262 my $full_fold_string = 1263 join "", map {chr(hex($_))} @full_fold_hex; 1264 my @turkic_fold_hex = 1265 split / /, ($casefold->{'turkic'} ne "") 1266 ? $casefold->{'turkic'} 1267 : $casefold->{'full'}; 1268 my $turkic_fold_string = 1269 join "", map {chr(hex($_))} @turkic_fold_hex; 1270 } 1271 if (defined $casefold && $casefold->{'simple'} ne "") { 1272 my $simple_fold_hex = $casefold->{'simple'}; 1273 my $simple_fold_string = chr(hex($simple_fold_hex)); 1274 } 1275 1276This returns the (almost) locale-independent case folding of the 1277character specified by the L</code point argument>. (Starting in Perl v5.16, 1278the core function C<fc()> returns the C<full> mapping (described below) 1279faster than this does, and for entire strings.) 1280 1281If there is no case folding for the input code point, C<undef> is returned. 1282 1283If there is a case folding for that code point, a reference to a hash 1284with the following fields is returned: 1285 1286=over 1287 1288=item B<code> 1289 1290the input native L</code point argument> expressed in hexadecimal, with 1291leading zeros 1292added if necessary to make it contain at least four hexdigits 1293 1294=item B<full> 1295 1296one or more codes (separated by spaces) that, taken in order, give the 1297code points for the case folding for I<code>. 1298Each has at least four hexdigits. 1299 1300=item B<simple> 1301 1302is empty, or is exactly one code with at least four hexdigits which can be used 1303as an alternative case folding when the calling program cannot cope with the 1304fold being a sequence of multiple code points. If I<full> is just one code 1305point, then I<simple> equals I<full>. If there is no single code point folding 1306defined for I<code>, then I<simple> is the empty string. Otherwise, it is an 1307inferior, but still better-than-nothing alternative folding to I<full>. 1308 1309=item B<mapping> 1310 1311is the same as I<simple> if I<simple> is not empty, and it is the same as I<full> 1312otherwise. It can be considered to be the simplest possible folding for 1313I<code>. It is defined primarily for backwards compatibility. 1314 1315=item B<status> 1316 1317is C<C> (for C<common>) if the best possible fold is a single code point 1318(I<simple> equals I<full> equals I<mapping>). It is C<S> if there are distinct 1319folds, I<simple> and I<full> (I<mapping> equals I<simple>). And it is C<F> if 1320there is only a I<full> fold (I<mapping> equals I<full>; I<simple> is empty). 1321Note that this 1322describes the contents of I<mapping>. It is defined primarily for backwards 1323compatibility. 1324 1325For Unicode versions between 3.1 and 3.1.1 inclusive, I<status> can also be 1326C<I> which is the same as C<C> but is a special case for dotted uppercase I and 1327dotless lowercase i: 1328 1329=over 1330 1331=item Z<>B<*> If you use this C<I> mapping 1332 1333the result is case-insensitive, 1334but dotless and dotted I's are not distinguished 1335 1336=item Z<>B<*> If you exclude this C<I> mapping 1337 1338the result is not fully case-insensitive, but 1339dotless and dotted I's are distinguished 1340 1341=back 1342 1343=item B<turkic> 1344 1345contains any special folding for Turkic languages. For versions of Unicode 1346starting with 3.2, this field is empty unless I<code> has a different folding 1347in Turkic languages, in which case it is one or more codes (separated by 1348spaces) that, taken in order, give the code points for the case folding for 1349I<code> in those languages. 1350Each code has at least four hexdigits. 1351Note that this folding does not maintain canonical equivalence without 1352additional processing. 1353 1354For Unicode versions between 3.1 and 3.1.1 inclusive, this field is empty unless 1355there is a 1356special folding for Turkic languages, in which case I<status> is C<I>, and 1357I<mapping>, I<full>, I<simple>, and I<turkic> are all equal. 1358 1359=back 1360 1361Programs that want complete generality and the best folding results should use 1362the folding contained in the I<full> field. But note that the fold for some 1363code points will be a sequence of multiple code points. 1364 1365Programs that can't cope with the fold mapping being multiple code points can 1366use the folding contained in the I<simple> field, with the loss of some 1367generality. In Unicode 5.1, about 7% of the defined foldings have no single 1368code point folding. 1369 1370The I<mapping> and I<status> fields are provided for backwards compatibility for 1371existing programs. They contain the same values as in previous versions of 1372this function. 1373 1374Locale is not completely independent. The I<turkic> field contains results to 1375use when the locale is a Turkic language. 1376 1377For more information about case mappings see 1378L<http://www.unicode.org/unicode/reports/tr21> 1379 1380=cut 1381 1382my %CASEFOLD; 1383 1384sub _casefold { 1385 unless (%CASEFOLD) { # Populate the hash 1386 my ($full_invlist_ref, $full_invmap_ref, undef, $default) 1387 = prop_invmap('Case_Folding'); 1388 1389 # Use the recipe given in the prop_invmap() pod to convert the 1390 # inversion map into the hash. 1391 for my $i (0 .. @$full_invlist_ref - 1 - 1) { 1392 next if $full_invmap_ref->[$i] == $default; 1393 my $adjust = -1; 1394 for my $j ($full_invlist_ref->[$i] .. $full_invlist_ref->[$i+1] -1) { 1395 $adjust++; 1396 if (! ref $full_invmap_ref->[$i]) { 1397 1398 # This is a single character mapping 1399 $CASEFOLD{$j}{'status'} = 'C'; 1400 $CASEFOLD{$j}{'simple'} 1401 = $CASEFOLD{$j}{'full'} 1402 = $CASEFOLD{$j}{'mapping'} 1403 = sprintf("%04X", $full_invmap_ref->[$i] + $adjust); 1404 $CASEFOLD{$j}{'code'} = sprintf("%04X", $j); 1405 $CASEFOLD{$j}{'turkic'} = ""; 1406 } 1407 else { # prop_invmap ensures that $adjust is 0 for a ref 1408 $CASEFOLD{$j}{'status'} = 'F'; 1409 $CASEFOLD{$j}{'full'} 1410 = $CASEFOLD{$j}{'mapping'} 1411 = join " ", map { sprintf "%04X", $_ } 1412 @{$full_invmap_ref->[$i]}; 1413 $CASEFOLD{$j}{'simple'} = ""; 1414 $CASEFOLD{$j}{'code'} = sprintf("%04X", $j); 1415 $CASEFOLD{$j}{'turkic'} = ""; 1416 } 1417 } 1418 } 1419 1420 # We have filled in the full mappings above, assuming there were no 1421 # simple ones for the ones with multi-character maps. Now, we find 1422 # and fix the cases where that assumption was false. 1423 (my ($simple_invlist_ref, $simple_invmap_ref, undef), $default) 1424 = prop_invmap('Simple_Case_Folding'); 1425 for my $i (0 .. @$simple_invlist_ref - 1 - 1) { 1426 next if $simple_invmap_ref->[$i] == $default; 1427 my $adjust = -1; 1428 for my $j ($simple_invlist_ref->[$i] 1429 .. $simple_invlist_ref->[$i+1] -1) 1430 { 1431 $adjust++; 1432 next if $CASEFOLD{$j}{'status'} eq 'C'; 1433 $CASEFOLD{$j}{'status'} = 'S'; 1434 $CASEFOLD{$j}{'simple'} 1435 = $CASEFOLD{$j}{'mapping'} 1436 = sprintf("%04X", $simple_invmap_ref->[$i] + $adjust); 1437 $CASEFOLD{$j}{'code'} = sprintf("%04X", $j); 1438 $CASEFOLD{$j}{'turkic'} = ""; 1439 } 1440 } 1441 1442 # We hard-code in the turkish rules 1443 UnicodeVersion() unless defined $v_unicode_version; 1444 if ($v_unicode_version ge v3.2.0) { 1445 1446 # These two code points should already have regular entries, so 1447 # just fill in the turkish fields 1448 $CASEFOLD{ord('I')}{'turkic'} = '0131'; 1449 $CASEFOLD{0x130}{'turkic'} = sprintf "%04X", ord('i'); 1450 } 1451 elsif ($v_unicode_version ge v3.1.0) { 1452 1453 # These two code points don't have entries otherwise. 1454 $CASEFOLD{0x130}{'code'} = '0130'; 1455 $CASEFOLD{0x131}{'code'} = '0131'; 1456 $CASEFOLD{0x130}{'status'} = $CASEFOLD{0x131}{'status'} = 'I'; 1457 $CASEFOLD{0x130}{'turkic'} 1458 = $CASEFOLD{0x130}{'mapping'} 1459 = $CASEFOLD{0x130}{'full'} 1460 = $CASEFOLD{0x130}{'simple'} 1461 = $CASEFOLD{0x131}{'turkic'} 1462 = $CASEFOLD{0x131}{'mapping'} 1463 = $CASEFOLD{0x131}{'full'} 1464 = $CASEFOLD{0x131}{'simple'} 1465 = sprintf "%04X", ord('i'); 1466 } 1467 } 1468} 1469 1470sub casefold { 1471 my $arg = shift; 1472 my $code = _getcode($arg); 1473 croak __PACKAGE__, "::casefold: unknown code '$arg'" 1474 unless defined $code; 1475 1476 _casefold() unless %CASEFOLD; 1477 1478 return $CASEFOLD{$code}; 1479} 1480 1481=head2 B<all_casefolds()> 1482 1483 1484 use Unicode::UCD 'all_casefolds'; 1485 1486 my $all_folds_ref = all_casefolds(); 1487 foreach my $char_with_casefold (sort { $a <=> $b } 1488 keys %$all_folds_ref) 1489 { 1490 printf "%04X:", $char_with_casefold; 1491 my $casefold = $all_folds_ref->{$char_with_casefold}; 1492 1493 # Get folds for $char_with_casefold 1494 1495 my @full_fold_hex = split / /, $casefold->{'full'}; 1496 my $full_fold_string = 1497 join "", map {chr(hex($_))} @full_fold_hex; 1498 print " full=", join " ", @full_fold_hex; 1499 my @turkic_fold_hex = 1500 split / /, ($casefold->{'turkic'} ne "") 1501 ? $casefold->{'turkic'} 1502 : $casefold->{'full'}; 1503 my $turkic_fold_string = 1504 join "", map {chr(hex($_))} @turkic_fold_hex; 1505 print "; turkic=", join " ", @turkic_fold_hex; 1506 if (defined $casefold && $casefold->{'simple'} ne "") { 1507 my $simple_fold_hex = $casefold->{'simple'}; 1508 my $simple_fold_string = chr(hex($simple_fold_hex)); 1509 print "; simple=$simple_fold_hex"; 1510 } 1511 print "\n"; 1512 } 1513 1514This returns all the case foldings in the current version of Unicode in the 1515form of a reference to a hash. Each key to the hash is the decimal 1516representation of a Unicode character that has a casefold to other than 1517itself. The casefold of a semi-colon is itself, so it isn't in the hash; 1518likewise for a lowercase "a", but there is an entry for a capital "A". The 1519hash value for each key is another hash, identical to what is returned by 1520L</casefold()> if called with that code point as its argument. So the value 1521C<< all_casefolds()->{ord("A")}' >> is equivalent to C<casefold(ord("A"))>; 1522 1523=cut 1524 1525sub all_casefolds () { 1526 _casefold() unless %CASEFOLD; 1527 return _dclone \%CASEFOLD; 1528} 1529 1530=head2 B<casespec()> 1531 1532 use Unicode::UCD 'casespec'; 1533 1534 my $casespec = casespec(0xFB00); 1535 1536This returns the potentially locale-dependent case mappings of the L</code point 1537argument>. The mappings may be longer than a single code point (which the basic 1538Unicode case mappings as returned by L</charinfo()> never are). 1539 1540If there are no case mappings for the L</code point argument>, or if all three 1541possible mappings (I<lower>, I<title> and I<upper>) result in single code 1542points and are locale independent and unconditional, C<undef> is returned 1543(which means that the case mappings, if any, for the code point are those 1544returned by L</charinfo()>). 1545 1546Otherwise, a reference to a hash giving the mappings (or a reference to a hash 1547of such hashes, explained below) is returned with the following keys and their 1548meanings: 1549 1550The keys in the bottom layer hash with the meanings of their values are: 1551 1552=over 1553 1554=item B<code> 1555 1556the input native L</code point argument> expressed in hexadecimal, with 1557leading zeros 1558added if necessary to make it contain at least four hexdigits 1559 1560=item B<lower> 1561 1562one or more codes (separated by spaces) that, taken in order, give the 1563code points for the lower case of I<code>. 1564Each has at least four hexdigits. 1565 1566=item B<title> 1567 1568one or more codes (separated by spaces) that, taken in order, give the 1569code points for the title case of I<code>. 1570Each has at least four hexdigits. 1571 1572=item B<upper> 1573 1574one or more codes (separated by spaces) that, taken in order, give the 1575code points for the upper case of I<code>. 1576Each has at least four hexdigits. 1577 1578=item B<condition> 1579 1580the conditions for the mappings to be valid. 1581If C<undef>, the mappings are always valid. 1582When defined, this field is a list of conditions, 1583all of which must be true for the mappings to be valid. 1584The list consists of one or more 1585I<locales> (see below) 1586and/or I<contexts> (explained in the next paragraph), 1587separated by spaces. 1588(Other than as used to separate elements, spaces are to be ignored.) 1589Case distinctions in the condition list are not significant. 1590Conditions preceded by "NON_" represent the negation of the condition. 1591 1592A I<context> is one of those defined in the Unicode standard. 1593For Unicode 5.1, they are defined in Section 3.13 C<Default Case Operations> 1594available at 1595L<http://www.unicode.org/versions/Unicode5.1.0/>. 1596These are for context-sensitive casing. 1597 1598=back 1599 1600The hash described above is returned for locale-independent casing, where 1601at least one of the mappings has length longer than one. If C<undef> is 1602returned, the code point may have mappings, but if so, all are length one, 1603and are returned by L</charinfo()>. 1604Note that when this function does return a value, it will be for the complete 1605set of mappings for a code point, even those whose length is one. 1606 1607If there are additional casing rules that apply only in certain locales, 1608an additional key for each will be defined in the returned hash. Each such key 1609will be its locale name, defined as a 2-letter ISO 3166 country code, possibly 1610followed by a "_" and a 2-letter ISO language code (possibly followed by a "_" 1611and a variant code). You can find the lists of all possible locales, see 1612L<Locale::Country> and L<Locale::Language>. 1613(In Unicode 6.0, the only locales returned by this function 1614are C<lt>, C<tr>, and C<az>.) 1615 1616Each locale key is a reference to a hash that has the form above, and gives 1617the casing rules for that particular locale, which take precedence over the 1618locale-independent ones when in that locale. 1619 1620If the only casing for a code point is locale-dependent, then the returned 1621hash will not have any of the base keys, like C<code>, C<upper>, etc., but 1622will contain only locale keys. 1623 1624For more information about case mappings see 1625L<http://www.unicode.org/unicode/reports/tr21/> 1626 1627=cut 1628 1629my %CASESPEC; 1630 1631sub _casespec { 1632 unless (%CASESPEC) { 1633 UnicodeVersion() unless defined $v_unicode_version; 1634 if ($v_unicode_version lt v2.1.8) { 1635 %CASESPEC = {}; 1636 } 1637 elsif (openunicode(\$CASESPECFH, "SpecialCasing.txt")) { 1638 local $_; 1639 local $/ = "\n"; 1640 while (<$CASESPECFH>) { 1641 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) { 1642 1643 my ($hexcode, $lower, $title, $upper, $condition) = 1644 ($1, $2, $3, $4, $5); 1645 if (! IS_ASCII_PLATFORM) { # Remap entry to native 1646 foreach my $var_ref (\$hexcode, 1647 \$lower, 1648 \$title, 1649 \$upper) 1650 { 1651 next unless defined $$var_ref; 1652 $$var_ref = join " ", 1653 map { sprintf("%04X", 1654 utf8::unicode_to_native(hex $_)) } 1655 split " ", $$var_ref; 1656 } 1657 } 1658 1659 my $code = hex($hexcode); 1660 1661 # In 2.1.8, there were duplicate entries; ignore all but 1662 # the first one -- there were no conditions in the file 1663 # anyway. 1664 if (exists $CASESPEC{$code} && $v_unicode_version ne v2.1.8) 1665 { 1666 if (exists $CASESPEC{$code}->{code}) { 1667 my ($oldlower, 1668 $oldtitle, 1669 $oldupper, 1670 $oldcondition) = 1671 @{$CASESPEC{$code}}{qw(lower 1672 title 1673 upper 1674 condition)}; 1675 if (defined $oldcondition) { 1676 my ($oldlocale) = 1677 ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/); 1678 delete $CASESPEC{$code}; 1679 $CASESPEC{$code}->{$oldlocale} = 1680 { code => $hexcode, 1681 lower => $oldlower, 1682 title => $oldtitle, 1683 upper => $oldupper, 1684 condition => $oldcondition }; 1685 } 1686 } 1687 my ($locale) = 1688 ($condition =~ /^([a-z][a-z](?:_\S+)?)/); 1689 $CASESPEC{$code}->{$locale} = 1690 { code => $hexcode, 1691 lower => $lower, 1692 title => $title, 1693 upper => $upper, 1694 condition => $condition }; 1695 } else { 1696 $CASESPEC{$code} = 1697 { code => $hexcode, 1698 lower => $lower, 1699 title => $title, 1700 upper => $upper, 1701 condition => $condition }; 1702 } 1703 } 1704 } 1705 close($CASESPECFH); 1706 } 1707 } 1708} 1709 1710sub casespec { 1711 my $arg = shift; 1712 my $code = _getcode($arg); 1713 croak __PACKAGE__, "::casespec: unknown code '$arg'" 1714 unless defined $code; 1715 1716 _casespec() unless %CASESPEC; 1717 1718 return ref $CASESPEC{$code} ? _dclone $CASESPEC{$code} : $CASESPEC{$code}; 1719} 1720 1721=head2 B<namedseq()> 1722 1723 use Unicode::UCD 'namedseq'; 1724 1725 my $namedseq = namedseq("KATAKANA LETTER AINU P"); 1726 my @namedseq = namedseq("KATAKANA LETTER AINU P"); 1727 my %namedseq = namedseq(); 1728 1729If used with a single argument in a scalar context, returns the string 1730consisting of the code points of the named sequence, or C<undef> if no 1731named sequence by that name exists. If used with a single argument in 1732a list context, it returns the list of the ordinals of the code points. 1733 1734If used with no 1735arguments in a list context, it returns a hash with the names of all the 1736named sequences as the keys and their sequences as strings as 1737the values. Otherwise, it returns C<undef> or an empty list depending 1738on the context. 1739 1740This function only operates on officially approved (not provisional) named 1741sequences. 1742 1743Note that as of Perl 5.14, C<\N{KATAKANA LETTER AINU P}> will insert the named 1744sequence into double-quoted strings, and C<charnames::string_vianame("KATAKANA 1745LETTER AINU P")> will return the same string this function does, but will also 1746operate on character names that aren't named sequences, without you having to 1747know which are which. See L<charnames>. 1748 1749=cut 1750 1751my %NAMEDSEQ; 1752 1753sub _namedseq { 1754 unless (%NAMEDSEQ) { 1755 if (openunicode(\$NAMEDSEQFH, "Name.pl")) { 1756 local $_; 1757 local $/ = "\n"; 1758 while (<$NAMEDSEQFH>) { 1759 if (/^ [0-9A-F]+ \ /x) { 1760 chomp; 1761 my ($sequence, $name) = split /\t/; 1762 my @s = map { chr(hex($_)) } split(' ', $sequence); 1763 $NAMEDSEQ{$name} = join("", @s); 1764 } 1765 } 1766 close($NAMEDSEQFH); 1767 } 1768 } 1769} 1770 1771sub namedseq { 1772 1773 # Use charnames::string_vianame() which now returns this information, 1774 # unless the caller wants the hash returned, in which case we read it in, 1775 # and thereafter use it instead of calling charnames, as it is faster. 1776 1777 my $wantarray = wantarray(); 1778 if (defined $wantarray) { 1779 if ($wantarray) { 1780 if (@_ == 0) { 1781 _namedseq() unless %NAMEDSEQ; 1782 return %NAMEDSEQ; 1783 } elsif (@_ == 1) { 1784 my $s; 1785 if (%NAMEDSEQ) { 1786 $s = $NAMEDSEQ{ $_[0] }; 1787 } 1788 else { 1789 $s = charnames::string_vianame($_[0]); 1790 } 1791 return defined $s ? map { ord($_) } split('', $s) : (); 1792 } 1793 } elsif (@_ == 1) { 1794 return $NAMEDSEQ{ $_[0] } if %NAMEDSEQ; 1795 return charnames::string_vianame($_[0]); 1796 } 1797 } 1798 return; 1799} 1800 1801my %NUMERIC; 1802 1803sub _numeric { 1804 my @numbers = _read_table("To/Nv.pl"); 1805 foreach my $entry (@numbers) { 1806 my ($start, $end, $value) = @$entry; 1807 1808 # If value contains a slash, convert to decimal, add a reverse hash 1809 # used by charinfo. 1810 if ((my @rational = split /\//, $value) == 2) { 1811 my $real = $rational[0] / $rational[1]; 1812 $real_to_rational{$real} = $value; 1813 $value = $real; 1814 1815 # Should only be single element, but just in case... 1816 for my $i ($start .. $end) { 1817 $NUMERIC{$i} = $value; 1818 } 1819 } 1820 else { 1821 # The values require adjusting, as is in 'a' format 1822 for my $i ($start .. $end) { 1823 $NUMERIC{$i} = $value + $i - $start; 1824 } 1825 } 1826 } 1827 1828 # Decided unsafe to use these that aren't officially part of the Unicode 1829 # standard. 1830 #use Math::Trig; 1831 #my $pi = acos(-1.0); 1832 #$NUMERIC{0x03C0} = $pi; 1833 1834 # Euler's constant, not to be confused with Euler's number 1835 #$NUMERIC{0x2107} = 0.57721566490153286060651209008240243104215933593992; 1836 1837 # Euler's number 1838 #$NUMERIC{0x212F} = 2.7182818284590452353602874713526624977572; 1839 1840 return; 1841} 1842 1843=pod 1844 1845=head2 B<num()> 1846 1847 use Unicode::UCD 'num'; 1848 1849 my $val = num("123"); 1850 my $one_quarter = num("\N{VULGAR FRACTION 1/4}"); 1851 1852C<num()> returns the numeric value of the input Unicode string; or C<undef> if it 1853doesn't think the entire string has a completely valid, safe numeric value. 1854 1855If the string is just one character in length, the Unicode numeric value 1856is returned if it has one, or C<undef> otherwise. Note that this need 1857not be a whole number. C<num("\N{TIBETAN DIGIT HALF ZERO}")>, for 1858example returns -0.5. 1859 1860=cut 1861 1862#A few characters to which Unicode doesn't officially 1863#assign a numeric value are considered numeric by C<num>. 1864#These are: 1865 1866# EULER CONSTANT 0.5772... (this is NOT Euler's number) 1867# SCRIPT SMALL E 2.71828... (this IS Euler's number) 1868# GREEK SMALL LETTER PI 3.14159... 1869 1870=pod 1871 1872If the string is more than one character, C<undef> is returned unless 1873all its characters are decimal digits (that is, they would match C<\d+>), 1874from the same script. For example if you have an ASCII '0' and a Bengali 1875'3', mixed together, they aren't considered a valid number, and C<undef> 1876is returned. A further restriction is that the digits all have to be of 1877the same form. A half-width digit mixed with a full-width one will 1878return C<undef>. The Arabic script has two sets of digits; C<num> will 1879return C<undef> unless all the digits in the string come from the same 1880set. 1881 1882C<num> errs on the side of safety, and there may be valid strings of 1883decimal digits that it doesn't recognize. Note that Unicode defines 1884a number of "digit" characters that aren't "decimal digit" characters. 1885"Decimal digits" have the property that they have a positional value, i.e., 1886there is a units position, a 10's position, a 100's, etc, AND they are 1887arranged in Unicode in blocks of 10 contiguous code points. The Chinese 1888digits, for example, are not in such a contiguous block, and so Unicode 1889doesn't view them as decimal digits, but merely digits, and so C<\d> will not 1890match them. A single-character string containing one of these digits will 1891have its decimal value returned by C<num>, but any longer string containing 1892only these digits will return C<undef>. 1893 1894Strings of multiple sub- and superscripts are not recognized as numbers. You 1895can use either of the compatibility decompositions in Unicode::Normalize to 1896change these into digits, and then call C<num> on the result. 1897 1898=cut 1899 1900# To handle sub, superscripts, this could if called in list context, 1901# consider those, and return the <decomposition> type in the second 1902# array element. 1903 1904sub num { 1905 my $string = $_[0]; 1906 1907 _numeric unless %NUMERIC; 1908 1909 my $length = length($string); 1910 return $NUMERIC{ord($string)} if $length == 1; 1911 return if $string =~ /\D/; 1912 my $first_ord = ord(substr($string, 0, 1)); 1913 my $value = $NUMERIC{$first_ord}; 1914 1915 # To be a valid decimal number, it should be in a block of 10 consecutive 1916 # characters, whose values are 0, 1, 2, ... 9. Therefore this digit's 1917 # value is its offset in that block from the character that means zero. 1918 my $zero_ord = $first_ord - $value; 1919 1920 # Unicode 6.0 instituted the rule that only digits in a consecutive 1921 # block of 10 would be considered decimal digits. If this is an earlier 1922 # release, we verify that this first character is a member of such a 1923 # block. That is, that the block of characters surrounding this one 1924 # consists of all \d characters whose numeric values are the expected 1925 # ones. 1926 UnicodeVersion() unless defined $v_unicode_version; 1927 if ($v_unicode_version lt v6.0.0) { 1928 for my $i (0 .. 9) { 1929 my $ord = $zero_ord + $i; 1930 return unless chr($ord) =~ /\d/; 1931 my $numeric = $NUMERIC{$ord}; 1932 return unless defined $numeric; 1933 return unless $numeric == $i; 1934 } 1935 } 1936 1937 for my $i (1 .. $length -1) { 1938 1939 # Here we know either by verifying, or by fact of the first character 1940 # being a \d in Unicode 6.0 or later, that any character between the 1941 # character that means 0, and 9 positions above it must be \d, and 1942 # must have its value correspond to its offset from the zero. Any 1943 # characters outside these 10 do not form a legal number for this 1944 # function. 1945 my $ord = ord(substr($string, $i, 1)); 1946 my $digit = $ord - $zero_ord; 1947 return unless $digit >= 0 && $digit <= 9; 1948 $value = $value * 10 + $digit; 1949 } 1950 1951 return $value; 1952} 1953 1954=pod 1955 1956=head2 B<prop_aliases()> 1957 1958 use Unicode::UCD 'prop_aliases'; 1959 1960 my ($short_name, $full_name, @other_names) = prop_aliases("space"); 1961 my $same_full_name = prop_aliases("Space"); # Scalar context 1962 my ($same_short_name) = prop_aliases("Space"); # gets 0th element 1963 print "The full name is $full_name\n"; 1964 print "The short name is $short_name\n"; 1965 print "The other aliases are: ", join(", ", @other_names), "\n"; 1966 1967 prints: 1968 The full name is White_Space 1969 The short name is WSpace 1970 The other aliases are: Space 1971 1972Most Unicode properties have several synonymous names. Typically, there is at 1973least a short name, convenient to type, and a long name that more fully 1974describes the property, and hence is more easily understood. 1975 1976If you know one name for a Unicode property, you can use C<prop_aliases> to find 1977either the long name (when called in scalar context), or a list of all of the 1978names, somewhat ordered so that the short name is in the 0th element, the long 1979name in the next element, and any other synonyms are in the remaining 1980elements, in no particular order. 1981 1982The long name is returned in a form nicely capitalized, suitable for printing. 1983 1984The input parameter name is loosely matched, which means that white space, 1985hyphens, and underscores are ignored (except for the trailing underscore in 1986the old_form grandfathered-in C<"L_">, which is better written as C<"LC">, and 1987both of which mean C<General_Category=Cased Letter>). 1988 1989If the name is unknown, C<undef> is returned (or an empty list in list 1990context). Note that Perl typically recognizes property names in regular 1991expressions with an optional C<"Is_>" (with or without the underscore) 1992prefixed to them, such as C<\p{isgc=punct}>. This function does not recognize 1993those in the input, returning C<undef>. Nor are they included in the output 1994as possible synonyms. 1995 1996C<prop_aliases> does know about the Perl extensions to Unicode properties, 1997such as C<Any> and C<XPosixAlpha>, and the single form equivalents to Unicode 1998properties such as C<XDigit>, C<Greek>, C<In_Greek>, and C<Is_Greek>. The 1999final example demonstrates that the C<"Is_"> prefix is recognized for these 2000extensions; it is needed to resolve ambiguities. For example, 2001C<prop_aliases('lc')> returns the list C<(lc, Lowercase_Mapping)>, but 2002C<prop_aliases('islc')> returns C<(Is_LC, Cased_Letter)>. This is 2003because C<islc> is a Perl extension which is short for 2004C<General_Category=Cased Letter>. The lists returned for the Perl extensions 2005will not include the C<"Is_"> prefix (whether or not the input had it) unless 2006needed to resolve ambiguities, as shown in the C<"islc"> example, where the 2007returned list had one element containing C<"Is_">, and the other without. 2008 2009It is also possible for the reverse to happen: C<prop_aliases('isc')> returns 2010the list C<(isc, ISO_Comment)>; whereas C<prop_aliases('c')> returns 2011C<(C, Other)> (the latter being a Perl extension meaning 2012C<General_Category=Other>. 2013L<perluniprops/Properties accessible through Unicode::UCD> lists the available 2014forms, including which ones are discouraged from use. 2015 2016Those discouraged forms are accepted as input to C<prop_aliases>, but are not 2017returned in the lists. C<prop_aliases('isL&')> and C<prop_aliases('isL_')>, 2018which are old synonyms for C<"Is_LC"> and should not be used in new code, are 2019examples of this. These both return C<(Is_LC, Cased_Letter)>. Thus this 2020function allows you to take a discouraged form, and find its acceptable 2021alternatives. The same goes with single-form Block property equivalences. 2022Only the forms that begin with C<"In_"> are not discouraged; if you pass 2023C<prop_aliases> a discouraged form, you will get back the equivalent ones that 2024begin with C<"In_">. It will otherwise look like a new-style block name (see. 2025L</Old-style versus new-style block names>). 2026 2027C<prop_aliases> does not know about any user-defined properties, and will 2028return C<undef> if called with one of those. Likewise for Perl internal 2029properties, with the exception of "Perl_Decimal_Digit" which it does know 2030about (and which is documented below in L</prop_invmap()>). 2031 2032=cut 2033 2034# It may be that there are use cases where the discouraged forms should be 2035# returned. If that comes up, an optional boolean second parameter to the 2036# function could be created, for example. 2037 2038# These are created by mktables for this routine and stored in unicore/UCD.pl 2039# where their structures are described. 2040our %string_property_loose_to_name; 2041our %ambiguous_names; 2042our %loose_perlprop_to_name; 2043our %prop_aliases; 2044 2045sub prop_aliases ($) { 2046 my $prop = $_[0]; 2047 return unless defined $prop; 2048 2049 require "unicore/UCD.pl"; 2050 require "unicore/Heavy.pl"; 2051 require "utf8_heavy.pl"; 2052 2053 # The property name may be loosely or strictly matched; we don't know yet. 2054 # But both types use lower-case. 2055 $prop = lc $prop; 2056 2057 # It is loosely matched if its lower case isn't known to be strict. 2058 my $list_ref; 2059 if (! exists $utf8::stricter_to_file_of{$prop}) { 2060 my $loose = utf8::_loose_name($prop); 2061 2062 # There is a hash that converts from any loose name to its standard 2063 # form, mapping all synonyms for a name to one name that can be used 2064 # as a key into another hash. The whole concept is for memory 2065 # savings, as the second hash doesn't have to have all the 2066 # combinations. Actually, there are two hashes that do the 2067 # converstion. One is used in utf8_heavy.pl (stored in Heavy.pl) for 2068 # looking up properties matchable in regexes. This function needs to 2069 # access string properties, which aren't available in regexes, so a 2070 # second conversion hash is made for them (stored in UCD.pl). Look in 2071 # the string one now, as the rest can have an optional 'is' prefix, 2072 # which these don't. 2073 if (exists $string_property_loose_to_name{$loose}) { 2074 2075 # Convert to its standard loose name. 2076 $prop = $string_property_loose_to_name{$loose}; 2077 } 2078 else { 2079 my $retrying = 0; # bool. ? Has an initial 'is' been stripped 2080 RETRY: 2081 if (exists $utf8::loose_property_name_of{$loose} 2082 && (! $retrying 2083 || ! exists $ambiguous_names{$loose})) 2084 { 2085 # Found an entry giving the standard form. We don't get here 2086 # (in the test above) when we've stripped off an 2087 # 'is' and the result is an ambiguous name. That is because 2088 # these are official Unicode properties (though Perl can have 2089 # an optional 'is' prefix meaning the official property), and 2090 # all ambiguous cases involve a Perl single-form extension 2091 # for the gc, script, or block properties, and the stripped 2092 # 'is' means that they mean one of those, and not one of 2093 # these 2094 $prop = $utf8::loose_property_name_of{$loose}; 2095 } 2096 elsif (exists $loose_perlprop_to_name{$loose}) { 2097 2098 # This hash is specifically for this function to list Perl 2099 # extensions that aren't in the earlier hashes. If there is 2100 # only one element, the short and long names are identical. 2101 # Otherwise the form is already in the same form as 2102 # %prop_aliases, which is handled at the end of the function. 2103 $list_ref = $loose_perlprop_to_name{$loose}; 2104 if (@$list_ref == 1) { 2105 my @list = ($list_ref->[0], $list_ref->[0]); 2106 $list_ref = \@list; 2107 } 2108 } 2109 elsif (! exists $utf8::loose_to_file_of{$loose}) { 2110 2111 # loose_to_file_of is a complete list of loose names. If not 2112 # there, the input is unknown. 2113 return; 2114 } 2115 elsif ($loose =~ / [:=] /x) { 2116 2117 # Here we found the name but not its aliases, so it has to 2118 # exist. Exclude property-value combinations. (This shows up 2119 # for something like ccc=vr which matches loosely, but is a 2120 # synonym for ccc=9 which matches only strictly. 2121 return; 2122 } 2123 else { 2124 2125 # Here it has to exist, and isn't a property-value 2126 # combination. This means it must be one of the Perl 2127 # single-form extensions. First see if it is for a 2128 # property-value combination in one of the following 2129 # properties. 2130 my @list; 2131 foreach my $property ("gc", "script") { 2132 @list = prop_value_aliases($property, $loose); 2133 last if @list; 2134 } 2135 if (@list) { 2136 2137 # Here, it is one of those property-value combination 2138 # single-form synonyms. There are ambiguities with some 2139 # of these. Check against the list for these, and adjust 2140 # if necessary. 2141 for my $i (0 .. @list -1) { 2142 if (exists $ambiguous_names 2143 {utf8::_loose_name(lc $list[$i])}) 2144 { 2145 # The ambiguity is resolved by toggling whether or 2146 # not it has an 'is' prefix 2147 $list[$i] =~ s/^Is_// or $list[$i] =~ s/^/Is_/; 2148 } 2149 } 2150 return @list; 2151 } 2152 2153 # Here, it wasn't one of the gc or script single-form 2154 # extensions. It could be a block property single-form 2155 # extension. An 'in' prefix definitely means that, and should 2156 # be looked up without the prefix. However, starting in 2157 # Unicode 6.1, we have to special case 'indic...', as there 2158 # is a property that begins with that name. We shouldn't 2159 # strip the 'in' from that. I'm (khw) generalizing this to 2160 # 'indic' instead of the single property, because I suspect 2161 # that others of this class may come along in the future. 2162 # However, this could backfire and a block created whose name 2163 # begins with 'dic...', and we would want to strip the 'in'. 2164 # At which point this would have to be tweaked. 2165 my $began_with_in = $loose =~ s/^in(?!dic)//; 2166 @list = prop_value_aliases("block", $loose); 2167 if (@list) { 2168 map { $_ =~ s/^/In_/ } @list; 2169 return @list; 2170 } 2171 2172 # Here still haven't found it. The last opportunity for it 2173 # being valid is only if it began with 'is'. We retry without 2174 # the 'is', setting a flag to that effect so that we don't 2175 # accept things that begin with 'isis...' 2176 if (! $retrying && ! $began_with_in && $loose =~ s/^is//) { 2177 $retrying = 1; 2178 goto RETRY; 2179 } 2180 2181 # Here, didn't find it. Since it was in %loose_to_file_of, we 2182 # should have been able to find it. 2183 carp __PACKAGE__, "::prop_aliases: Unexpectedly could not find '$prop'. Send bug report to perlbug\@perl.org"; 2184 return; 2185 } 2186 } 2187 } 2188 2189 if (! $list_ref) { 2190 # Here, we have set $prop to a standard form name of the input. Look 2191 # it up in the structure created by mktables for this purpose, which 2192 # contains both strict and loosely matched properties. Avoid 2193 # autovivifying. 2194 $list_ref = $prop_aliases{$prop} if exists $prop_aliases{$prop}; 2195 return unless $list_ref; 2196 } 2197 2198 # The full name is in element 1. 2199 return $list_ref->[1] unless wantarray; 2200 2201 return @{_dclone $list_ref}; 2202} 2203 2204=pod 2205 2206=head2 B<prop_values()> 2207 2208 use Unicode::UCD 'prop_values'; 2209 2210 print "AHex values are: ", join(", ", prop_values("AHex")), 2211 "\n"; 2212 prints: 2213 AHex values are: N, Y 2214 2215Some Unicode properties have a restricted set of legal values. For example, 2216all binary properties are restricted to just C<true> or C<false>; and there 2217are only a few dozen possible General Categories. Use C<prop_values> 2218to find out if a given property is one such, and if so, to get a list of the 2219values: 2220 2221 print join ", ", prop_values("NFC_Quick_Check"); 2222 prints: 2223 M, N, Y 2224 2225If the property doesn't have such a restricted set, C<undef> is returned. 2226 2227There are usually several synonyms for each possible value. Use 2228L</prop_value_aliases()> to access those. 2229 2230Case, white space, hyphens, and underscores are ignored in the input property 2231name (except for the trailing underscore in the old-form grandfathered-in 2232general category property value C<"L_">, which is better written as C<"LC">). 2233 2234If the property name is unknown, C<undef> is returned. Note that Perl typically 2235recognizes property names in regular expressions with an optional C<"Is_>" 2236(with or without the underscore) prefixed to them, such as C<\p{isgc=punct}>. 2237This function does not recognize those in the property parameter, returning 2238C<undef>. 2239 2240For the block property, new-style block names are returned (see 2241L</Old-style versus new-style block names>). 2242 2243C<prop_values> does not know about any user-defined properties, and 2244will return C<undef> if called with one of those. 2245 2246=cut 2247 2248# These are created by mktables for this module and stored in unicore/UCD.pl 2249# where their structures are described. 2250our %loose_to_standard_value; 2251our %prop_value_aliases; 2252 2253sub prop_values ($) { 2254 my $prop = shift; 2255 return undef unless defined $prop; 2256 2257 require "unicore/UCD.pl"; 2258 require "utf8_heavy.pl"; 2259 2260 # Find the property name synonym that's used as the key in other hashes, 2261 # which is element 0 in the returned list. 2262 ($prop) = prop_aliases($prop); 2263 return undef if ! $prop; 2264 $prop = utf8::_loose_name(lc $prop); 2265 2266 # Here is a legal property. 2267 return undef unless exists $prop_value_aliases{$prop}; 2268 my @return; 2269 foreach my $value_key (sort { lc $a cmp lc $b } 2270 keys %{$prop_value_aliases{$prop}}) 2271 { 2272 push @return, $prop_value_aliases{$prop}{$value_key}[0]; 2273 } 2274 return @return; 2275} 2276 2277=pod 2278 2279=head2 B<prop_value_aliases()> 2280 2281 use Unicode::UCD 'prop_value_aliases'; 2282 2283 my ($short_name, $full_name, @other_names) 2284 = prop_value_aliases("Gc", "Punct"); 2285 my $same_full_name = prop_value_aliases("Gc", "P"); # Scalar cntxt 2286 my ($same_short_name) = prop_value_aliases("Gc", "P"); # gets 0th 2287 # element 2288 print "The full name is $full_name\n"; 2289 print "The short name is $short_name\n"; 2290 print "The other aliases are: ", join(", ", @other_names), "\n"; 2291 2292 prints: 2293 The full name is Punctuation 2294 The short name is P 2295 The other aliases are: Punct 2296 2297Some Unicode properties have a restricted set of legal values. For example, 2298all binary properties are restricted to just C<true> or C<false>; and there 2299are only a few dozen possible General Categories. 2300 2301You can use L</prop_values()> to find out if a given property is one which has 2302a restricted set of values, and if so, what those values are. But usually 2303each value actually has several synonyms. For example, in Unicode binary 2304properties, I<truth> can be represented by any of the strings "Y", "Yes", "T", 2305or "True"; and the General Category "Punctuation" by that string, or "Punct", 2306or simply "P". 2307 2308Like property names, there is typically at least a short name for each such 2309property-value, and a long name. If you know any name of the property-value 2310(which you can get by L</prop_values()>, you can use C<prop_value_aliases>() 2311to get the long name (when called in scalar context), or a list of all the 2312names, with the short name in the 0th element, the long name in the next 2313element, and any other synonyms in the remaining elements, in no particular 2314order, except that any all-numeric synonyms will be last. 2315 2316The long name is returned in a form nicely capitalized, suitable for printing. 2317 2318Case, white space, hyphens, and underscores are ignored in the input parameters 2319(except for the trailing underscore in the old-form grandfathered-in general 2320category property value C<"L_">, which is better written as C<"LC">). 2321 2322If either name is unknown, C<undef> is returned. Note that Perl typically 2323recognizes property names in regular expressions with an optional C<"Is_>" 2324(with or without the underscore) prefixed to them, such as C<\p{isgc=punct}>. 2325This function does not recognize those in the property parameter, returning 2326C<undef>. 2327 2328If called with a property that doesn't have synonyms for its values, it 2329returns the input value, possibly normalized with capitalization and 2330underscores, but not necessarily checking that the input value is valid. 2331 2332For the block property, new-style block names are returned (see 2333L</Old-style versus new-style block names>). 2334 2335To find the synonyms for single-forms, such as C<\p{Any}>, use 2336L</prop_aliases()> instead. 2337 2338C<prop_value_aliases> does not know about any user-defined properties, and 2339will return C<undef> if called with one of those. 2340 2341=cut 2342 2343sub prop_value_aliases ($$) { 2344 my ($prop, $value) = @_; 2345 return unless defined $prop && defined $value; 2346 2347 require "unicore/UCD.pl"; 2348 require "utf8_heavy.pl"; 2349 2350 # Find the property name synonym that's used as the key in other hashes, 2351 # which is element 0 in the returned list. 2352 ($prop) = prop_aliases($prop); 2353 return if ! $prop; 2354 $prop = utf8::_loose_name(lc $prop); 2355 2356 # Here is a legal property, but the hash below (created by mktables for 2357 # this purpose) only knows about the properties that have a very finite 2358 # number of potential values, that is not ones whose value could be 2359 # anything, like most (if not all) string properties. These don't have 2360 # synonyms anyway. Simply return the input. For example, there is no 2361 # synonym for ('Uppercase_Mapping', A'). 2362 if (! exists $prop_value_aliases{$prop}) { 2363 2364 # Here, we have a legal property, but an unknown value. Since the 2365 # property is legal, if it isn't in the prop_aliases hash, it must be 2366 # a Perl-extension All perl extensions are binary, hence are 2367 # enumerateds, which means that we know that the input unknown value 2368 # is illegal. 2369 return if ! exists $Unicode::UCD::prop_aliases{$prop}; 2370 2371 # Otherwise, we assume it's valid, as documented. 2372 return $value; 2373 } 2374 2375 # The value name may be loosely or strictly matched; we don't know yet. 2376 # But both types use lower-case. 2377 $value = lc $value; 2378 2379 # If the name isn't found under loose matching, it certainly won't be 2380 # found under strict 2381 my $loose_value = utf8::_loose_name($value); 2382 return unless exists $loose_to_standard_value{"$prop=$loose_value"}; 2383 2384 # Similarly if the combination under loose matching doesn't exist, it 2385 # won't exist under strict. 2386 my $standard_value = $loose_to_standard_value{"$prop=$loose_value"}; 2387 return unless exists $prop_value_aliases{$prop}{$standard_value}; 2388 2389 # Here we did find a combination under loose matching rules. But it could 2390 # be that is a strict property match that shouldn't have matched. 2391 # %prop_value_aliases is set up so that the strict matches will appear as 2392 # if they were in loose form. Thus, if the non-loose version is legal, 2393 # we're ok, can skip the further check. 2394 if (! exists $utf8::stricter_to_file_of{"$prop=$value"} 2395 2396 # We're also ok and skip the further check if value loosely matches. 2397 # mktables has verified that no strict name under loose rules maps to 2398 # an existing loose name. This code relies on the very limited 2399 # circumstances that strict names can be here. Strict name matching 2400 # happens under two conditions: 2401 # 1) when the name begins with an underscore. But this function 2402 # doesn't accept those, and %prop_value_aliases doesn't have 2403 # them. 2404 # 2) When the values are numeric, in which case we need to look 2405 # further, but their squeezed-out loose values will be in 2406 # %stricter_to_file_of 2407 && exists $utf8::stricter_to_file_of{"$prop=$loose_value"}) 2408 { 2409 # The only thing that's legal loosely under strict is that can have an 2410 # underscore between digit pairs XXX 2411 while ($value =~ s/(\d)_(\d)/$1$2/g) {} 2412 return unless exists $utf8::stricter_to_file_of{"$prop=$value"}; 2413 } 2414 2415 # Here, we know that the combination exists. Return it. 2416 my $list_ref = $prop_value_aliases{$prop}{$standard_value}; 2417 if (@$list_ref > 1) { 2418 # The full name is in element 1. 2419 return $list_ref->[1] unless wantarray; 2420 2421 return @{_dclone $list_ref}; 2422 } 2423 2424 return $list_ref->[0] unless wantarray; 2425 2426 # Only 1 element means that it repeats 2427 return ( $list_ref->[0], $list_ref->[0] ); 2428} 2429 2430# All 1 bits is the largest possible UV. 2431$Unicode::UCD::MAX_CP = ~0; 2432 2433=pod 2434 2435=head2 B<prop_invlist()> 2436 2437C<prop_invlist> returns an inversion list (described below) that defines all the 2438code points for the binary Unicode property (or "property=value" pair) given 2439by the input parameter string: 2440 2441 use feature 'say'; 2442 use Unicode::UCD 'prop_invlist'; 2443 say join ", ", prop_invlist("Any"); 2444 2445 prints: 2446 0, 1114112 2447 2448If the input is unknown C<undef> is returned in scalar context; an empty-list 2449in list context. If the input is known, the number of elements in 2450the list is returned if called in scalar context. 2451 2452L<perluniprops|perluniprops/Properties accessible through \p{} and \P{}> gives 2453the list of properties that this function accepts, as well as all the possible 2454forms for them (including with the optional "Is_" prefixes). (Except this 2455function doesn't accept any Perl-internal properties, some of which are listed 2456there.) This function uses the same loose or tighter matching rules for 2457resolving the input property's name as is done for regular expressions. These 2458are also specified in L<perluniprops|perluniprops/Properties accessible 2459through \p{} and \P{}>. Examples of using the "property=value" form are: 2460 2461 say join ", ", prop_invlist("Script=Shavian"); 2462 2463 prints: 2464 66640, 66688 2465 2466 say join ", ", prop_invlist("ASCII_Hex_Digit=No"); 2467 2468 prints: 2469 0, 48, 58, 65, 71, 97, 103 2470 2471 say join ", ", prop_invlist("ASCII_Hex_Digit=Yes"); 2472 2473 prints: 2474 48, 58, 65, 71, 97, 103 2475 2476Inversion lists are a compact way of specifying Unicode property-value 2477definitions. The 0th item in the list is the lowest code point that has the 2478property-value. The next item (item [1]) is the lowest code point beyond that 2479one that does NOT have the property-value. And the next item beyond that 2480([2]) is the lowest code point beyond that one that does have the 2481property-value, and so on. Put another way, each element in the list gives 2482the beginning of a range that has the property-value (for even numbered 2483elements), or doesn't have the property-value (for odd numbered elements). 2484The name for this data structure stems from the fact that each element in the 2485list toggles (or inverts) whether the corresponding range is or isn't on the 2486list. 2487 2488In the final example above, the first ASCII Hex digit is code point 48, the 2489character "0", and all code points from it through 57 (a "9") are ASCII hex 2490digits. Code points 58 through 64 aren't, but 65 (an "A") through 70 (an "F") 2491are, as are 97 ("a") through 102 ("f"). 103 starts a range of code points 2492that aren't ASCII hex digits. That range extends to infinity, which on your 2493computer can be found in the variable C<$Unicode::UCD::MAX_CP>. (This 2494variable is as close to infinity as Perl can get on your platform, and may be 2495too high for some operations to work; you may wish to use a smaller number for 2496your purposes.) 2497 2498Note that the inversion lists returned by this function can possibly include 2499non-Unicode code points, that is anything above 0x10FFFF. Unicode properties 2500are not defined on such code points. You might wish to change the output to 2501not include these. Simply add 0x110000 at the end of the non-empty returned 2502list if it isn't already that value; and pop that value if it is; like: 2503 2504 my @list = prop_invlist("foo"); 2505 if (@list) { 2506 if ($list[-1] == 0x110000) { 2507 pop @list; # Defeat the turning on for above Unicode 2508 } 2509 else { 2510 push @list, 0x110000; # Turn off for above Unicode 2511 } 2512 } 2513 2514It is a simple matter to expand out an inversion list to a full list of all 2515code points that have the property-value: 2516 2517 my @invlist = prop_invlist($property_name); 2518 die "empty" unless @invlist; 2519 my @full_list; 2520 for (my $i = 0; $i < @invlist; $i += 2) { 2521 my $upper = ($i + 1) < @invlist 2522 ? $invlist[$i+1] - 1 # In range 2523 : $Unicode::UCD::MAX_CP; # To infinity. You may want 2524 # to stop much much earlier; 2525 # going this high may expose 2526 # perl deficiencies with very 2527 # large numbers. 2528 for my $j ($invlist[$i] .. $upper) { 2529 push @full_list, $j; 2530 } 2531 } 2532 2533C<prop_invlist> does not know about any user-defined nor Perl internal-only 2534properties, and will return C<undef> if called with one of those. 2535 2536The L</search_invlist()> function is provided for finding a code point within 2537an inversion list. 2538 2539=cut 2540 2541# User-defined properties could be handled with some changes to utf8_heavy.pl; 2542# and implementing here of dealing with EXTRAS. If done, consideration should 2543# be given to the fact that the user subroutine could return different results 2544# with each call; security issues need to be thought about. 2545 2546# These are created by mktables for this routine and stored in unicore/UCD.pl 2547# where their structures are described. 2548our %loose_defaults; 2549our $MAX_UNICODE_CODEPOINT; 2550 2551sub prop_invlist ($;$) { 2552 my $prop = $_[0]; 2553 2554 # Undocumented way to get at Perl internal properties; it may be changed 2555 # or removed without notice at any time. 2556 my $internal_ok = defined $_[1] && $_[1] eq '_perl_core_internal_ok'; 2557 2558 return if ! defined $prop; 2559 2560 require "utf8_heavy.pl"; 2561 2562 # Warnings for these are only for regexes, so not applicable to us 2563 no warnings 'deprecated'; 2564 2565 # Get the swash definition of the property-value. 2566 my $swash = utf8::SWASHNEW(__PACKAGE__, $prop, undef, 1, 0); 2567 2568 # Fail if not found, or isn't a boolean property-value, or is a 2569 # user-defined property, or is internal-only. 2570 return if ! $swash 2571 || ref $swash eq "" 2572 || $swash->{'BITS'} != 1 2573 || $swash->{'USER_DEFINED'} 2574 || (! $internal_ok && $prop =~ /^\s*_/); 2575 2576 if ($swash->{'EXTRAS'}) { 2577 carp __PACKAGE__, "::prop_invlist: swash returned for $prop unexpectedly has EXTRAS magic"; 2578 return; 2579 } 2580 if ($swash->{'SPECIALS'}) { 2581 carp __PACKAGE__, "::prop_invlist: swash returned for $prop unexpectedly has SPECIALS magic"; 2582 return; 2583 } 2584 2585 my @invlist; 2586 2587 if ($swash->{'LIST'} =~ /^V/) { 2588 2589 # A 'V' as the first character marks the input as already an inversion 2590 # list, in which case, all we need to do is put the remaining lines 2591 # into our array. 2592 @invlist = split "\n", $swash->{'LIST'} =~ s/ \s* (?: \# .* )? $ //xmgr; 2593 shift @invlist; 2594 } 2595 else { 2596 # The input lines look like: 2597 # 0041\t005A # [26] 2598 # 005F 2599 2600 # Split into lines, stripped of trailing comments 2601 foreach my $range (split "\n", 2602 $swash->{'LIST'} =~ s/ \s* (?: \# .* )? $ //xmgr) 2603 { 2604 # And find the beginning and end of the range on the line 2605 my ($hex_begin, $hex_end) = split "\t", $range; 2606 my $begin = hex $hex_begin; 2607 2608 # If the new range merely extends the old, we remove the marker 2609 # created the last time through the loop for the old's end, which 2610 # causes the new one's end to be used instead. 2611 if (@invlist && $begin == $invlist[-1]) { 2612 pop @invlist; 2613 } 2614 else { 2615 # Add the beginning of the range 2616 push @invlist, $begin; 2617 } 2618 2619 if (defined $hex_end) { # The next item starts with the code point 1 2620 # beyond the end of the range. 2621 no warnings 'portable'; 2622 my $end = hex $hex_end; 2623 last if $end == $Unicode::UCD::MAX_CP; 2624 push @invlist, $end + 1; 2625 } 2626 else { # No end of range, is a single code point. 2627 push @invlist, $begin + 1; 2628 } 2629 } 2630 } 2631 2632 # Could need to be inverted: add or subtract a 0 at the beginning of the 2633 # list. 2634 if ($swash->{'INVERT_IT'}) { 2635 if (@invlist && $invlist[0] == 0) { 2636 shift @invlist; 2637 } 2638 else { 2639 unshift @invlist, 0; 2640 } 2641 } 2642 2643 return @invlist; 2644} 2645 2646=pod 2647 2648=head2 B<prop_invmap()> 2649 2650 use Unicode::UCD 'prop_invmap'; 2651 my ($list_ref, $map_ref, $format, $default) 2652 = prop_invmap("General Category"); 2653 2654C<prop_invmap> is used to get the complete mapping definition for a property, 2655in the form of an inversion map. An inversion map consists of two parallel 2656arrays. One is an ordered list of code points that mark range beginnings, and 2657the other gives the value (or mapping) that all code points in the 2658corresponding range have. 2659 2660C<prop_invmap> is called with the name of the desired property. The name is 2661loosely matched, meaning that differences in case, white-space, hyphens, and 2662underscores are not meaningful (except for the trailing underscore in the 2663old-form grandfathered-in property C<"L_">, which is better written as C<"LC">, 2664or even better, C<"Gc=LC">). 2665 2666Many Unicode properties have more than one name (or alias). C<prop_invmap> 2667understands all of these, including Perl extensions to them. Ambiguities are 2668resolved as described above for L</prop_aliases()> (except if a property has 2669both a complete mapping, and a binary C<Y>/C<N> mapping, then specifying the 2670property name prefixed by C<"is"> causes the binary one to be returned). The 2671Perl internal property "Perl_Decimal_Digit, described below, is also accepted. 2672An empty list is returned if the property name is unknown. 2673See L<perluniprops/Properties accessible through Unicode::UCD> for the 2674properties acceptable as inputs to this function. 2675 2676It is a fatal error to call this function except in list context. 2677 2678In addition to the two arrays that form the inversion map, C<prop_invmap> 2679returns two other values; one is a scalar that gives some details as to the 2680format of the entries of the map array; the other is a default value, useful 2681in maps whose format name begins with the letter C<"a">, as described 2682L<below in its subsection|/a>; and for specialized purposes, such as 2683converting to another data structure, described at the end of this main 2684section. 2685 2686This means that C<prop_invmap> returns a 4 element list. For example, 2687 2688 my ($blocks_ranges_ref, $blocks_maps_ref, $format, $default) 2689 = prop_invmap("Block"); 2690 2691In this call, the two arrays will be populated as shown below (for Unicode 26926.0): 2693 2694 Index @blocks_ranges @blocks_maps 2695 0 0x0000 Basic Latin 2696 1 0x0080 Latin-1 Supplement 2697 2 0x0100 Latin Extended-A 2698 3 0x0180 Latin Extended-B 2699 4 0x0250 IPA Extensions 2700 5 0x02B0 Spacing Modifier Letters 2701 6 0x0300 Combining Diacritical Marks 2702 7 0x0370 Greek and Coptic 2703 8 0x0400 Cyrillic 2704 ... 2705 233 0x2B820 No_Block 2706 234 0x2F800 CJK Compatibility Ideographs Supplement 2707 235 0x2FA20 No_Block 2708 236 0xE0000 Tags 2709 237 0xE0080 No_Block 2710 238 0xE0100 Variation Selectors Supplement 2711 239 0xE01F0 No_Block 2712 240 0xF0000 Supplementary Private Use Area-A 2713 241 0x100000 Supplementary Private Use Area-B 2714 242 0x110000 No_Block 2715 2716The first line (with Index [0]) means that the value for code point 0 is "Basic 2717Latin". The entry "0x0080" in the @blocks_ranges column in the second line 2718means that the value from the first line, "Basic Latin", extends to all code 2719points in the range from 0 up to but not including 0x0080, that is, through 2720127. In other words, the code points from 0 to 127 are all in the "Basic 2721Latin" block. Similarly, all code points in the range from 0x0080 up to (but 2722not including) 0x0100 are in the block named "Latin-1 Supplement", etc. 2723(Notice that the return is the old-style block names; see L</Old-style versus 2724new-style block names>). 2725 2726The final line (with Index [242]) means that the value for all code points above 2727the legal Unicode maximum code point have the value "No_Block", which is the 2728term Unicode uses for a non-existing block. 2729 2730The arrays completely specify the mappings for all possible code points. 2731The final element in an inversion map returned by this function will always be 2732for the range that consists of all the code points that aren't legal Unicode, 2733but that are expressible on the platform. (That is, it starts with code point 27340x110000, the first code point above the legal Unicode maximum, and extends to 2735infinity.) The value for that range will be the same that any typical 2736unassigned code point has for the specified property. (Certain unassigned 2737code points are not "typical"; for example the non-character code points, or 2738those in blocks that are to be written right-to-left. The above-Unicode 2739range's value is not based on these atypical code points.) It could be argued 2740that, instead of treating these as unassigned Unicode code points, the value 2741for this range should be C<undef>. If you wish, you can change the returned 2742arrays accordingly. 2743 2744The maps for almost all properties are simple scalars that should be 2745interpreted as-is. 2746These values are those given in the Unicode-supplied data files, which may be 2747inconsistent as to capitalization and as to which synonym for a property-value 2748is given. The results may be normalized by using the L</prop_value_aliases()> 2749function. 2750 2751There are exceptions to the simple scalar maps. Some properties have some 2752elements in their map list that are themselves lists of scalars; and some 2753special strings are returned that are not to be interpreted as-is. Element 2754[2] (placed into C<$format> in the example above) of the returned four element 2755list tells you if the map has any of these special elements or not, as follows: 2756 2757=over 2758 2759=item B<C<s>> 2760 2761means all the elements of the map array are simple scalars, with no special 2762elements. Almost all properties are like this, like the C<block> example 2763above. 2764 2765=item B<C<sl>> 2766 2767means that some of the map array elements have the form given by C<"s">, and 2768the rest are lists of scalars. For example, here is a portion of the output 2769of calling C<prop_invmap>() with the "Script Extensions" property: 2770 2771 @scripts_ranges @scripts_maps 2772 ... 2773 0x0953 Devanagari 2774 0x0964 [ Bengali, Devanagari, Gurumukhi, Oriya ] 2775 0x0966 Devanagari 2776 0x0970 Common 2777 2778Here, the code points 0x964 and 0x965 are both used in Bengali, 2779Devanagari, Gurmukhi, and Oriya, but no other scripts. 2780 2781The Name_Alias property is also of this form. But each scalar consists of two 2782components: 1) the name, and 2) the type of alias this is. They are 2783separated by a colon and a space. In Unicode 6.1, there are several alias types: 2784 2785=over 2786 2787=item C<correction> 2788 2789indicates that the name is a corrected form for the 2790original name (which remains valid) for the same code point. 2791 2792=item C<control> 2793 2794adds a new name for a control character. 2795 2796=item C<alternate> 2797 2798is an alternate name for a character 2799 2800=item C<figment> 2801 2802is a name for a character that has been documented but was never in any 2803actual standard. 2804 2805=item C<abbreviation> 2806 2807is a common abbreviation for a character 2808 2809=back 2810 2811The lists are ordered (roughly) so the most preferred names come before less 2812preferred ones. 2813 2814For example, 2815 2816 @aliases_ranges @alias_maps 2817 ... 2818 0x009E [ 'PRIVACY MESSAGE: control', 'PM: abbreviation' ] 2819 0x009F [ 'APPLICATION PROGRAM COMMAND: control', 2820 'APC: abbreviation' 2821 ] 2822 0x00A0 'NBSP: abbreviation' 2823 0x00A1 "" 2824 0x00AD 'SHY: abbreviation' 2825 0x00AE "" 2826 0x01A2 'LATIN CAPITAL LETTER GHA: correction' 2827 0x01A3 'LATIN SMALL LETTER GHA: correction' 2828 0x01A4 "" 2829 ... 2830 2831A map to the empty string means that there is no alias defined for the code 2832point. 2833 2834=item B<C<a>> 2835 2836is like C<"s"> in that all the map array elements are scalars, but here they are 2837restricted to all being integers, and some have to be adjusted (hence the name 2838C<"a">) to get the correct result. For example, in: 2839 2840 my ($uppers_ranges_ref, $uppers_maps_ref, $format, $default) 2841 = prop_invmap("Simple_Uppercase_Mapping"); 2842 2843the returned arrays look like this: 2844 2845 @$uppers_ranges_ref @$uppers_maps_ref Note 2846 0 0 2847 97 65 'a' maps to 'A', b => B ... 2848 123 0 2849 181 924 MICRO SIGN => Greek Cap MU 2850 182 0 2851 ... 2852 2853and C<$default> is 0. 2854 2855Let's start with the second line. It says that the uppercase of code point 97 2856is 65; or C<uc("a")> == "A". But the line is for the entire range of code 2857points 97 through 122. To get the mapping for any code point in this range, 2858you take the offset it has from the beginning code point of the range, and add 2859that to the mapping for that first code point. So, the mapping for 122 ("z") 2860is derived by taking the offset of 122 from 97 (=25) and adding that to 65, 2861yielding 90 ("z"). Likewise for everything in between. 2862 2863Requiring this simple adjustment allows the returned arrays to be 2864significantly smaller than otherwise, up to a factor of 10, speeding up 2865searching through them. 2866 2867Ranges that map to C<$default>, C<"0">, behave somewhat differently. For 2868these, each code point maps to itself. So, in the first line in the example, 2869S<C<ord(uc(chr(0)))>> is 0, S<C<ord(uc(chr(1)))>> is 1, .. 2870S<C<ord(uc(chr(96)))>> is 96. 2871 2872=item B<C<al>> 2873 2874means that some of the map array elements have the form given by C<"a">, and 2875the rest are ordered lists of code points. 2876For example, in: 2877 2878 my ($uppers_ranges_ref, $uppers_maps_ref, $format, $default) 2879 = prop_invmap("Uppercase_Mapping"); 2880 2881the returned arrays look like this: 2882 2883 @$uppers_ranges_ref @$uppers_maps_ref 2884 0 0 2885 97 65 2886 123 0 2887 181 924 2888 182 0 2889 ... 2890 0x0149 [ 0x02BC 0x004E ] 2891 0x014A 0 2892 0x014B 330 2893 ... 2894 2895This is the full Uppercase_Mapping property (as opposed to the 2896Simple_Uppercase_Mapping given in the example for format C<"a">). The only 2897difference between the two in the ranges shown is that the code point at 28980x0149 (LATIN SMALL LETTER N PRECEDED BY APOSTROPHE) maps to a string of two 2899characters, 0x02BC (MODIFIER LETTER APOSTROPHE) followed by 0x004E (LATIN 2900CAPITAL LETTER N). 2901 2902No adjustments are needed to entries that are references to arrays; each such 2903entry will have exactly one element in its range, so the offset is always 0. 2904 2905The fourth (index [3]) element (C<$default>) in the list returned for this 2906format is 0. 2907 2908=item B<C<ae>> 2909 2910This is like C<"a">, but some elements are the empty string, and should not be 2911adjusted. 2912The one internal Perl property accessible by C<prop_invmap> is of this type: 2913"Perl_Decimal_Digit" returns an inversion map which gives the numeric values 2914that are represented by the Unicode decimal digit characters. Characters that 2915don't represent decimal digits map to the empty string, like so: 2916 2917 @digits @values 2918 0x0000 "" 2919 0x0030 0 2920 0x003A: "" 2921 0x0660: 0 2922 0x066A: "" 2923 0x06F0: 0 2924 0x06FA: "" 2925 0x07C0: 0 2926 0x07CA: "" 2927 0x0966: 0 2928 ... 2929 2930This means that the code points from 0 to 0x2F do not represent decimal digits; 2931the code point 0x30 (DIGIT ZERO) represents 0; code point 0x31, (DIGIT ONE), 2932represents 0+1-0 = 1; ... code point 0x39, (DIGIT NINE), represents 0+9-0 = 9; 2933... code points 0x3A through 0x65F do not represent decimal digits; 0x660 2934(ARABIC-INDIC DIGIT ZERO), represents 0; ... 0x07C1 (NKO DIGIT ONE), 2935represents 0+1-0 = 1 ... 2936 2937The fourth (index [3]) element (C<$default>) in the list returned for this 2938format is the empty string. 2939 2940=item B<C<ale>> 2941 2942is a combination of the C<"al"> type and the C<"ae"> type. Some of 2943the map array elements have the forms given by C<"al">, and 2944the rest are the empty string. The property C<NFKC_Casefold> has this form. 2945An example slice is: 2946 2947 @$ranges_ref @$maps_ref Note 2948 ... 2949 0x00AA 97 FEMININE ORDINAL INDICATOR => 'a' 2950 0x00AB 0 2951 0x00AD SOFT HYPHEN => "" 2952 0x00AE 0 2953 0x00AF [ 0x0020, 0x0304 ] MACRON => SPACE . COMBINING MACRON 2954 0x00B0 0 2955 ... 2956 2957The fourth (index [3]) element (C<$default>) in the list returned for this 2958format is 0. 2959 2960=item B<C<ar>> 2961 2962means that all the elements of the map array are either rational numbers or 2963the string C<"NaN">, meaning "Not a Number". A rational number is either an 2964integer, or two integers separated by a solidus (C<"/">). The second integer 2965represents the denominator of the division implied by the solidus, and is 2966actually always positive, so it is guaranteed not to be 0 and to not be 2967signed. When the element is a plain integer (without the 2968solidus), it may need to be adjusted to get the correct value by adding the 2969offset, just as other C<"a"> properties. No adjustment is needed for 2970fractions, as the range is guaranteed to have just a single element, and so 2971the offset is always 0. 2972 2973If you want to convert the returned map to entirely scalar numbers, you 2974can use something like this: 2975 2976 my ($invlist_ref, $invmap_ref, $format) = prop_invmap($property); 2977 if ($format && $format eq "ar") { 2978 map { $_ = eval $_ if $_ ne 'NaN' } @$map_ref; 2979 } 2980 2981Here's some entries from the output of the property "Nv", which has format 2982C<"ar">. 2983 2984 @numerics_ranges @numerics_maps Note 2985 0x00 "NaN" 2986 0x30 0 DIGIT 0 .. DIGIT 9 2987 0x3A "NaN" 2988 0xB2 2 SUPERSCRIPTs 2 and 3 2989 0xB4 "NaN" 2990 0xB9 1 SUPERSCRIPT 1 2991 0xBA "NaN" 2992 0xBC 1/4 VULGAR FRACTION 1/4 2993 0xBD 1/2 VULGAR FRACTION 1/2 2994 0xBE 3/4 VULGAR FRACTION 3/4 2995 0xBF "NaN" 2996 0x660 0 ARABIC-INDIC DIGIT ZERO .. NINE 2997 0x66A "NaN" 2998 2999The fourth (index [3]) element (C<$default>) in the list returned for this 3000format is C<"NaN">. 3001 3002=item B<C<n>> 3003 3004means the Name property. All the elements of the map array are simple 3005scalars, but some of them contain special strings that require more work to 3006get the actual name. 3007 3008Entries such as: 3009 3010 CJK UNIFIED IDEOGRAPH-<code point> 3011 3012mean that the name for the code point is "CJK UNIFIED IDEOGRAPH-" 3013with the code point (expressed in hexadecimal) appended to it, like "CJK 3014UNIFIED IDEOGRAPH-3403" (similarly for S<C<CJK COMPATIBILITY IDEOGRAPH-E<lt>code 3015pointE<gt>>>). 3016 3017Also, entries like 3018 3019 <hangul syllable> 3020 3021means that the name is algorithmically calculated. This is easily done by 3022the function L<charnames/charnames::viacode(code)>. 3023 3024Note that for control characters (C<Gc=cc>), Unicode's data files have the 3025string "C<E<lt>controlE<gt>>", but the real name of each of these characters is the empty 3026string. This function returns that real name, the empty string. (There are 3027names for these characters, but they are considered aliases, not the Name 3028property name, and are contained in the C<Name_Alias> property.) 3029 3030=item B<C<ad>> 3031 3032means the Decomposition_Mapping property. This property is like C<"al"> 3033properties, except that one of the scalar elements is of the form: 3034 3035 <hangul syllable> 3036 3037This signifies that this entry should be replaced by the decompositions for 3038all the code points whose decomposition is algorithmically calculated. (All 3039of them are currently in one range and no others outside the range are likely 3040to ever be added to Unicode; the C<"n"> format 3041has this same entry.) These can be generated via the function 3042L<Unicode::Normalize::NFD()|Unicode::Normalize>. 3043 3044Note that the mapping is the one that is specified in the Unicode data files, 3045and to get the final decomposition, it may need to be applied recursively. 3046 3047The fourth (index [3]) element (C<$default>) in the list returned for this 3048format is 0. 3049 3050=back 3051 3052Note that a format begins with the letter "a" if and only the property it is 3053for requires adjustments by adding the offsets in multi-element ranges. For 3054all these properties, an entry should be adjusted only if the map is a scalar 3055which is an integer. That is, it must match the regular expression: 3056 3057 / ^ -? \d+ $ /xa 3058 3059Further, the first element in a range never needs adjustment, as the 3060adjustment would be just adding 0. 3061 3062A binary search such as that provided by L</search_invlist()>, can be used to 3063quickly find a code point in the inversion list, and hence its corresponding 3064mapping. 3065 3066The final, fourth element (index [3], assigned to C<$default> in the "block" 3067example) in the four element list returned by this function is used with the 3068C<"a"> format types; it may also be useful for applications 3069that wish to convert the returned inversion map data structure into some 3070other, such as a hash. It gives the mapping that most code points map to 3071under the property. If you establish the convention that any code point not 3072explicitly listed in your data structure maps to this value, you can 3073potentially make your data structure much smaller. As you construct your data 3074structure from the one returned by this function, simply ignore those ranges 3075that map to this value. For example, to 3076convert to the data structure searchable by L</charinrange()>, you can follow 3077this recipe for properties that don't require adjustments: 3078 3079 my ($list_ref, $map_ref, $format, $default) = prop_invmap($property); 3080 my @range_list; 3081 3082 # Look at each element in the list, but the -2 is needed because we 3083 # look at $i+1 in the loop, and the final element is guaranteed to map 3084 # to $default by prop_invmap(), so we would skip it anyway. 3085 for my $i (0 .. @$list_ref - 2) { 3086 next if $map_ref->[$i] eq $default; 3087 push @range_list, [ $list_ref->[$i], 3088 $list_ref->[$i+1], 3089 $map_ref->[$i] 3090 ]; 3091 } 3092 3093 print charinrange(\@range_list, $code_point), "\n"; 3094 3095With this, C<charinrange()> will return C<undef> if its input code point maps 3096to C<$default>. You can avoid this by omitting the C<next> statement, and adding 3097a line after the loop to handle the final element of the inversion map. 3098 3099Similarly, this recipe can be used for properties that do require adjustments: 3100 3101 for my $i (0 .. @$list_ref - 2) { 3102 next if $map_ref->[$i] eq $default; 3103 3104 # prop_invmap() guarantees that if the mapping is to an array, the 3105 # range has just one element, so no need to worry about adjustments. 3106 if (ref $map_ref->[$i]) { 3107 push @range_list, 3108 [ $list_ref->[$i], $list_ref->[$i], $map_ref->[$i] ]; 3109 } 3110 else { # Otherwise each element is actually mapped to a separate 3111 # value, so the range has to be split into single code point 3112 # ranges. 3113 3114 my $adjustment = 0; 3115 3116 # For each code point that gets mapped to something... 3117 for my $j ($list_ref->[$i] .. $list_ref->[$i+1] -1 ) { 3118 3119 # ... add a range consisting of just it mapping to the 3120 # original plus the adjustment, which is incremented for the 3121 # next time through the loop, as the offset increases by 1 3122 # for each element in the range 3123 push @range_list, 3124 [ $j, $j, $map_ref->[$i] + $adjustment++ ]; 3125 } 3126 } 3127 } 3128 3129Note that the inversion maps returned for the C<Case_Folding> and 3130C<Simple_Case_Folding> properties do not include the Turkic-locale mappings. 3131Use L</casefold()> for these. 3132 3133C<prop_invmap> does not know about any user-defined properties, and will 3134return C<undef> if called with one of those. 3135 3136The returned values for the Perl extension properties, such as C<Any> and 3137C<Greek> are somewhat misleading. The values are either C<"Y"> or C<"N>". 3138All Unicode properties are bipartite, so you can actually use the C<"Y"> or 3139C<"N>" in a Perl regular rexpression for these, like C<qr/\p{ID_Start=Y/}> or 3140C<qr/\p{Upper=N/}>. But the Perl extensions aren't specified this way, only 3141like C</qr/\p{Any}>, I<etc>. You can't actually use the C<"Y"> and C<"N>" in 3142them. 3143 3144=cut 3145 3146# User-defined properties could be handled with some changes to utf8_heavy.pl; 3147# if done, consideration should be given to the fact that the user subroutine 3148# could return different results with each call, which could lead to some 3149# security issues. 3150 3151# One could store things in memory so they don't have to be recalculated, but 3152# it is unlikely this will be called often, and some properties would take up 3153# significant memory. 3154 3155# These are created by mktables for this routine and stored in unicore/UCD.pl 3156# where their structures are described. 3157our @algorithmic_named_code_points; 3158our $HANGUL_BEGIN; 3159our $HANGUL_COUNT; 3160 3161sub prop_invmap ($;$) { 3162 3163 croak __PACKAGE__, "::prop_invmap: must be called in list context" unless wantarray; 3164 3165 my $prop = $_[0]; 3166 return unless defined $prop; 3167 3168 # Undocumented way to get at Perl internal properties; it may be changed 3169 # or removed without notice at any time. It currently also changes the 3170 # output to use the format specified in the file rather than the one we 3171 # normally compute and return 3172 my $internal_ok = defined $_[1] && $_[1] eq '_perl_core_internal_ok'; 3173 3174 # Fail internal properties 3175 return if $prop =~ /^_/ && ! $internal_ok; 3176 3177 # The values returned by this function. 3178 my (@invlist, @invmap, $format, $missing); 3179 3180 # The swash has two components we look at, the base list, and a hash, 3181 # named 'SPECIALS', containing any additional members whose mappings don't 3182 # fit into the base list scheme of things. These generally 'override' 3183 # any value in the base list for the same code point. 3184 my $overrides; 3185 3186 require "utf8_heavy.pl"; 3187 require "unicore/UCD.pl"; 3188 3189RETRY: 3190 3191 # If there are multiple entries for a single code point 3192 my $has_multiples = 0; 3193 3194 # Try to get the map swash for the property. They have 'To' prepended to 3195 # the property name, and 32 means we will accept 32 bit return values. 3196 # The 0 means we aren't calling this from tr///. 3197 my $swash = utf8::SWASHNEW(__PACKAGE__, "To$prop", undef, 32, 0); 3198 3199 # If didn't find it, could be because needs a proxy. And if was the 3200 # 'Block' or 'Name' property, use a proxy even if did find it. Finding it 3201 # in these cases would be the result of the installation changing mktables 3202 # to output the Block or Name tables. The Block table gives block names 3203 # in the new-style, and this routine is supposed to return old-style block 3204 # names. The Name table is valid, but we need to execute the special code 3205 # below to add in the algorithmic-defined name entries. 3206 # And NFKCCF needs conversion, so handle that here too. 3207 if (ref $swash eq "" 3208 || $swash->{'TYPE'} =~ / ^ To (?: Blk | Na | NFKCCF ) $ /x) 3209 { 3210 3211 # Get the short name of the input property, in standard form 3212 my ($second_try) = prop_aliases($prop); 3213 return unless $second_try; 3214 $second_try = utf8::_loose_name(lc $second_try); 3215 3216 if ($second_try eq "in") { 3217 3218 # This property is identical to age for inversion map purposes 3219 $prop = "age"; 3220 goto RETRY; 3221 } 3222 elsif ($second_try =~ / ^ s ( cf | fc | [ltu] c ) $ /x) { 3223 3224 # These properties use just the LIST part of the full mapping, 3225 # which includes the simple maps that are otherwise overridden by 3226 # the SPECIALS. So all we need do is to not look at the SPECIALS; 3227 # set $overrides to indicate that 3228 $overrides = -1; 3229 3230 # The full name is the simple name stripped of its initial 's' 3231 $prop = $1; 3232 3233 # .. except for this case 3234 $prop = 'cf' if $prop eq 'fc'; 3235 3236 goto RETRY; 3237 } 3238 elsif ($second_try eq "blk") { 3239 3240 # We use the old block names. Just create a fake swash from its 3241 # data. 3242 _charblocks(); 3243 my %blocks; 3244 $blocks{'LIST'} = ""; 3245 $blocks{'TYPE'} = "ToBlk"; 3246 $utf8::SwashInfo{ToBlk}{'missing'} = "No_Block"; 3247 $utf8::SwashInfo{ToBlk}{'format'} = "s"; 3248 3249 foreach my $block (@BLOCKS) { 3250 $blocks{'LIST'} .= sprintf "%x\t%x\t%s\n", 3251 $block->[0], 3252 $block->[1], 3253 $block->[2]; 3254 } 3255 $swash = \%blocks; 3256 } 3257 elsif ($second_try eq "na") { 3258 3259 # Use the combo file that has all the Name-type properties in it, 3260 # extracting just the ones that are for the actual 'Name' 3261 # property. And create a fake swash from it. 3262 my %names; 3263 $names{'LIST'} = ""; 3264 my $original = do "unicore/Name.pl"; 3265 my $algorithm_names = \@algorithmic_named_code_points; 3266 3267 # We need to remove the names from it that are aliases. For that 3268 # we need to also read in that table. Create a hash with the keys 3269 # being the code points, and the values being a list of the 3270 # aliases for the code point key. 3271 my ($aliases_code_points, $aliases_maps, undef, undef) 3272 = &prop_invmap("_Perl_Name_Alias", '_perl_core_internal_ok'); 3273 my %aliases; 3274 for (my $i = 0; $i < @$aliases_code_points; $i++) { 3275 my $code_point = $aliases_code_points->[$i]; 3276 $aliases{$code_point} = $aliases_maps->[$i]; 3277 3278 # If not already a list, make it into one, so that later we 3279 # can treat things uniformly 3280 if (! ref $aliases{$code_point}) { 3281 $aliases{$code_point} = [ $aliases{$code_point} ]; 3282 } 3283 3284 # Remove the alias type from the entry, retaining just the 3285 # name. 3286 map { s/:.*// } @{$aliases{$code_point}}; 3287 } 3288 3289 my $i = 0; 3290 foreach my $line (split "\n", $original) { 3291 my ($hex_code_point, $name) = split "\t", $line; 3292 3293 # Weeds out all comments, blank lines, and named sequences 3294 next if $hex_code_point =~ /[^[:xdigit:]]/a; 3295 3296 my $code_point = hex $hex_code_point; 3297 3298 # The name of all controls is the default: the empty string. 3299 # The set of controls is immutable 3300 next if chr($code_point) =~ /[[:cntrl:]]/u; 3301 3302 # If this is a name_alias, it isn't a name 3303 next if grep { $_ eq $name } @{$aliases{$code_point}}; 3304 3305 # If we are beyond where one of the special lines needs to 3306 # be inserted ... 3307 while ($i < @$algorithm_names 3308 && $code_point > $algorithm_names->[$i]->{'low'}) 3309 { 3310 3311 # ... then insert it, ahead of what we were about to 3312 # output 3313 $names{'LIST'} .= sprintf "%x\t%x\t%s\n", 3314 $algorithm_names->[$i]->{'low'}, 3315 $algorithm_names->[$i]->{'high'}, 3316 $algorithm_names->[$i]->{'name'}; 3317 3318 # Done with this range. 3319 $i++; 3320 3321 # We loop until all special lines that precede the next 3322 # regular one are output. 3323 } 3324 3325 # Here, is a normal name. 3326 $names{'LIST'} .= sprintf "%x\t\t%s\n", $code_point, $name; 3327 } # End of loop through all the names 3328 3329 $names{'TYPE'} = "ToNa"; 3330 $utf8::SwashInfo{ToNa}{'missing'} = ""; 3331 $utf8::SwashInfo{ToNa}{'format'} = "n"; 3332 $swash = \%names; 3333 } 3334 elsif ($second_try =~ / ^ ( d [mt] ) $ /x) { 3335 3336 # The file is a combination of dt and dm properties. Create a 3337 # fake swash from the portion that we want. 3338 my $original = do "unicore/Decomposition.pl"; 3339 my %decomps; 3340 3341 if ($second_try eq 'dt') { 3342 $decomps{'TYPE'} = "ToDt"; 3343 $utf8::SwashInfo{'ToDt'}{'missing'} = "None"; 3344 $utf8::SwashInfo{'ToDt'}{'format'} = "s"; 3345 } # 'dm' is handled below, with 'nfkccf' 3346 3347 $decomps{'LIST'} = ""; 3348 3349 # This property has one special range not in the file: for the 3350 # hangul syllables. But not in Unicode version 1. 3351 UnicodeVersion() unless defined $v_unicode_version; 3352 my $done_hangul = ($v_unicode_version lt v2.0.0) 3353 ? 1 3354 : 0; # Have we done the hangul range ? 3355 foreach my $line (split "\n", $original) { 3356 my ($hex_lower, $hex_upper, $type_and_map) = split "\t", $line; 3357 my $code_point = hex $hex_lower; 3358 my $value; 3359 my $redo = 0; 3360 3361 # The type, enclosed in <...>, precedes the mapping separated 3362 # by blanks 3363 if ($type_and_map =~ / ^ < ( .* ) > \s+ (.*) $ /x) { 3364 $value = ($second_try eq 'dt') ? $1 : $2 3365 } 3366 else { # If there is no type specified, it's canonical 3367 $value = ($second_try eq 'dt') 3368 ? "Canonical" : 3369 $type_and_map; 3370 } 3371 3372 # Insert the hangul range at the appropriate spot. 3373 if (! $done_hangul && $code_point > $HANGUL_BEGIN) { 3374 $done_hangul = 1; 3375 $decomps{'LIST'} .= 3376 sprintf "%x\t%x\t%s\n", 3377 $HANGUL_BEGIN, 3378 $HANGUL_BEGIN + $HANGUL_COUNT - 1, 3379 ($second_try eq 'dt') 3380 ? "Canonical" 3381 : "<hangul syllable>"; 3382 } 3383 3384 if ($value =~ / / && $hex_upper ne "" && $hex_upper ne $hex_lower) { 3385 $line = sprintf("%04X\t%s\t%s", hex($hex_lower) + 1, $hex_upper, $value); 3386 $hex_upper = ""; 3387 $redo = 1; 3388 } 3389 3390 # And append this to our constructed LIST. 3391 $decomps{'LIST'} .= "$hex_lower\t$hex_upper\t$value\n"; 3392 3393 redo if $redo; 3394 } 3395 $swash = \%decomps; 3396 } 3397 elsif ($second_try ne 'nfkccf') { # Don't know this property. Fail. 3398 return; 3399 } 3400 3401 if ($second_try eq 'nfkccf' || $second_try eq 'dm') { 3402 3403 # The 'nfkccf' property is stored in the old format for backwards 3404 # compatibility for any applications that has read its file 3405 # directly before prop_invmap() existed. 3406 # And the code above has extracted the 'dm' property from its file 3407 # yielding the same format. So here we convert them to adjusted 3408 # format for compatibility with the other properties similar to 3409 # them. 3410 my %revised_swash; 3411 3412 # We construct a new converted list. 3413 my $list = ""; 3414 3415 my @ranges = split "\n", $swash->{'LIST'}; 3416 for (my $i = 0; $i < @ranges; $i++) { 3417 my ($hex_begin, $hex_end, $map) = split "\t", $ranges[$i]; 3418 3419 # The dm property has maps that are space separated sequences 3420 # of code points, as well as the special entry "<hangul 3421 # syllable>, which also contains a blank. 3422 my @map = split " ", $map; 3423 if (@map > 1) { 3424 3425 # If it's just the special entry, append as-is. 3426 if ($map eq '<hangul syllable>') { 3427 $list .= "$ranges[$i]\n"; 3428 } 3429 else { 3430 3431 # These should all be single-element ranges. 3432 croak __PACKAGE__, "::prop_invmap: Not expecting a mapping with multiple code points in a multi-element range, $ranges[$i]" if $hex_end ne "" && $hex_end ne $hex_begin; 3433 3434 # Convert them to decimal, as that's what's expected. 3435 $list .= "$hex_begin\t\t" 3436 . join(" ", map { hex } @map) 3437 . "\n"; 3438 } 3439 next; 3440 } 3441 3442 # Here, the mapping doesn't have a blank, is for a single code 3443 # point. 3444 my $begin = hex $hex_begin; 3445 my $end = (defined $hex_end && $hex_end ne "") 3446 ? hex $hex_end 3447 : $begin; 3448 3449 # Again, the output is to be in decimal. 3450 my $decimal_map = hex $map; 3451 3452 # We know that multi-element ranges with the same mapping 3453 # should not be adjusted, as after the adjustment 3454 # multi-element ranges are for consecutive increasing code 3455 # points. Further, the final element in the list won't be 3456 # adjusted, as there is nothing after it to include in the 3457 # adjustment 3458 if ($begin != $end || $i == @ranges -1) { 3459 3460 # So just convert these to single-element ranges 3461 foreach my $code_point ($begin .. $end) { 3462 $list .= sprintf("%04X\t\t%d\n", 3463 $code_point, $decimal_map); 3464 } 3465 } 3466 else { 3467 3468 # Here, we have a candidate for adjusting. What we do is 3469 # look through the subsequent adjacent elements in the 3470 # input. If the map to the next one differs by 1 from the 3471 # one before, then we combine into a larger range with the 3472 # initial map. Loop doing this until we find one that 3473 # can't be combined. 3474 3475 my $offset = 0; # How far away are we from the initial 3476 # map 3477 my $squished = 0; # ? Did we squish at least two 3478 # elements together into one range 3479 for ( ; $i < @ranges; $i++) { 3480 my ($next_hex_begin, $next_hex_end, $next_map) 3481 = split "\t", $ranges[$i+1]; 3482 3483 # In the case of 'dm', the map may be a sequence of 3484 # multiple code points, which are never combined with 3485 # another range 3486 last if $next_map =~ / /; 3487 3488 $offset++; 3489 my $next_decimal_map = hex $next_map; 3490 3491 # If the next map is not next in sequence, it 3492 # shouldn't be combined. 3493 last if $next_decimal_map != $decimal_map + $offset; 3494 3495 my $next_begin = hex $next_hex_begin; 3496 3497 # Likewise, if the next element isn't adjacent to the 3498 # previous one, it shouldn't be combined. 3499 last if $next_begin != $begin + $offset; 3500 3501 my $next_end = (defined $next_hex_end 3502 && $next_hex_end ne "") 3503 ? hex $next_hex_end 3504 : $next_begin; 3505 3506 # And finally, if the next element is a multi-element 3507 # range, it shouldn't be combined. 3508 last if $next_end != $next_begin; 3509 3510 # Here, we will combine. Loop to see if we should 3511 # combine the next element too. 3512 $squished = 1; 3513 } 3514 3515 if ($squished) { 3516 3517 # Here, 'i' is the element number of the last element to 3518 # be combined, and the range is single-element, or we 3519 # wouldn't be combining. Get it's code point. 3520 my ($hex_end, undef, undef) = split "\t", $ranges[$i]; 3521 $list .= "$hex_begin\t$hex_end\t$decimal_map\n"; 3522 } else { 3523 3524 # Here, no combining done. Just append the initial 3525 # (and current) values. 3526 $list .= "$hex_begin\t\t$decimal_map\n"; 3527 } 3528 } 3529 } # End of loop constructing the converted list 3530 3531 # Finish up the data structure for our converted swash 3532 my $type = ($second_try eq 'nfkccf') ? 'ToNFKCCF' : 'ToDm'; 3533 $revised_swash{'LIST'} = $list; 3534 $revised_swash{'TYPE'} = $type; 3535 $revised_swash{'SPECIALS'} = $swash->{'SPECIALS'}; 3536 $swash = \%revised_swash; 3537 3538 $utf8::SwashInfo{$type}{'missing'} = 0; 3539 $utf8::SwashInfo{$type}{'format'} = 'a'; 3540 } 3541 } 3542 3543 if ($swash->{'EXTRAS'}) { 3544 carp __PACKAGE__, "::prop_invmap: swash returned for $prop unexpectedly has EXTRAS magic"; 3545 return; 3546 } 3547 3548 # Here, have a valid swash return. Examine it. 3549 my $returned_prop = $swash->{'TYPE'}; 3550 3551 # All properties but binary ones should have 'missing' and 'format' 3552 # entries 3553 $missing = $utf8::SwashInfo{$returned_prop}{'missing'}; 3554 $missing = 'N' unless defined $missing; 3555 3556 $format = $utf8::SwashInfo{$returned_prop}{'format'}; 3557 $format = 'b' unless defined $format; 3558 3559 my $requires_adjustment = $format =~ /^a/; 3560 3561 if ($swash->{'LIST'} =~ /^V/) { 3562 @invlist = split "\n", $swash->{'LIST'} =~ s/ \s* (?: \# .* )? $ //xmgr; 3563 3564 shift @invlist; # Get rid of 'V'; 3565 3566 # Could need to be inverted: add or subtract a 0 at the beginning of 3567 # the list. 3568 if ($swash->{'INVERT_IT'}) { 3569 if (@invlist && $invlist[0] == 0) { 3570 shift @invlist; 3571 } 3572 else { 3573 unshift @invlist, 0; 3574 } 3575 } 3576 foreach my $i (0 .. @invlist - 1) { 3577 $invmap[$i] = ($i % 2 == 0) ? 'Y' : 'N' 3578 } 3579 3580 # The map includes lines for all code points; add one for the range 3581 # from 0 to the first Y. 3582 if ($invlist[0] != 0) { 3583 unshift @invlist, 0; 3584 unshift @invmap, 'N'; 3585 } 3586 } 3587 else { 3588 if ($swash->{'INVERT_IT'}) { 3589 croak __PACKAGE__, ":prop_invmap: Don't know how to deal with inverted"; 3590 } 3591 3592 # The LIST input lines look like: 3593 # ... 3594 # 0374\t\tCommon 3595 # 0375\t0377\tGreek # [3] 3596 # 037A\t037D\tGreek # [4] 3597 # 037E\t\tCommon 3598 # 0384\t\tGreek 3599 # ... 3600 # 3601 # Convert them to like 3602 # 0374 => Common 3603 # 0375 => Greek 3604 # 0378 => $missing 3605 # 037A => Greek 3606 # 037E => Common 3607 # 037F => $missing 3608 # 0384 => Greek 3609 # 3610 # For binary properties, the final non-comment column is absent, and 3611 # assumed to be 'Y'. 3612 3613 foreach my $range (split "\n", $swash->{'LIST'}) { 3614 $range =~ s/ \s* (?: \# .* )? $ //xg; # rmv trailing space, comments 3615 3616 # Find the beginning and end of the range on the line 3617 my ($hex_begin, $hex_end, $map) = split "\t", $range; 3618 my $begin = hex $hex_begin; 3619 no warnings 'portable'; 3620 my $end = (defined $hex_end && $hex_end ne "") 3621 ? hex $hex_end 3622 : $begin; 3623 3624 # Each time through the loop (after the first): 3625 # $invlist[-2] contains the beginning of the previous range processed 3626 # $invlist[-1] contains the end+1 of the previous range processed 3627 # $invmap[-2] contains the value of the previous range processed 3628 # $invmap[-1] contains the default value for missing ranges 3629 # ($missing) 3630 # 3631 # Thus, things are set up for the typical case of a new 3632 # non-adjacent range of non-missings to be added. But, if the new 3633 # range is adjacent, it needs to replace the [-1] element; and if 3634 # the new range is a multiple value of the previous one, it needs 3635 # to be added to the [-2] map element. 3636 3637 # The first time through, everything will be empty. If the 3638 # property doesn't have a range that begins at 0, add one that 3639 # maps to $missing 3640 if (! @invlist) { 3641 if ($begin != 0) { 3642 push @invlist, 0; 3643 push @invmap, $missing; 3644 } 3645 } 3646 elsif (@invlist > 1 && $invlist[-2] == $begin) { 3647 3648 # Here we handle the case where the input has multiple entries 3649 # for each code point. mktables should have made sure that 3650 # each such range contains only one code point. At this 3651 # point, $invlist[-1] is the $missing that was added at the 3652 # end of the last loop iteration, and [-2] is the last real 3653 # input code point, and that code point is the same as the one 3654 # we are adding now, making the new one a multiple entry. Add 3655 # it to the existing entry, either by pushing it to the 3656 # existing list of multiple entries, or converting the single 3657 # current entry into a list with both on it. This is all we 3658 # need do for this iteration. 3659 3660 if ($end != $begin) { 3661 croak __PACKAGE__, ":prop_invmap: Multiple maps per code point in '$prop' require single-element ranges: begin=$begin, end=$end, map=$map"; 3662 } 3663 if (! ref $invmap[-2]) { 3664 $invmap[-2] = [ $invmap[-2], $map ]; 3665 } 3666 else { 3667 push @{$invmap[-2]}, $map; 3668 } 3669 $has_multiples = 1; 3670 next; 3671 } 3672 elsif ($invlist[-1] == $begin) { 3673 3674 # If the input isn't in the most compact form, so that there 3675 # are two adjacent ranges that map to the same thing, they 3676 # should be combined (EXCEPT where the arrays require 3677 # adjustments, in which case everything is already set up 3678 # correctly). This happens in our constructed dt mapping, as 3679 # Element [-2] is the map for the latest range so far 3680 # processed. Just set the beginning point of the map to 3681 # $missing (in invlist[-1]) to 1 beyond where this range ends. 3682 # For example, in 3683 # 12\t13\tXYZ 3684 # 14\t17\tXYZ 3685 # we have set it up so that it looks like 3686 # 12 => XYZ 3687 # 14 => $missing 3688 # 3689 # We now see that it should be 3690 # 12 => XYZ 3691 # 18 => $missing 3692 if (! $requires_adjustment && @invlist > 1 && ( (defined $map) 3693 ? $invmap[-2] eq $map 3694 : $invmap[-2] eq 'Y')) 3695 { 3696 $invlist[-1] = $end + 1; 3697 next; 3698 } 3699 3700 # Here, the range started in the previous iteration that maps 3701 # to $missing starts at the same code point as this range. 3702 # That means there is no gap to fill that that range was 3703 # intended for, so we just pop it off the parallel arrays. 3704 pop @invlist; 3705 pop @invmap; 3706 } 3707 3708 # Add the range beginning, and the range's map. 3709 push @invlist, $begin; 3710 if ($returned_prop eq 'ToDm') { 3711 3712 # The decomposition maps are either a line like <hangul 3713 # syllable> which are to be taken as is; or a sequence of code 3714 # points in hex and separated by blanks. Convert them to 3715 # decimal, and if there is more than one, use an anonymous 3716 # array as the map. 3717 if ($map =~ /^ < /x) { 3718 push @invmap, $map; 3719 } 3720 else { 3721 my @map = split " ", $map; 3722 if (@map == 1) { 3723 push @invmap, $map[0]; 3724 } 3725 else { 3726 push @invmap, \@map; 3727 } 3728 } 3729 } 3730 else { 3731 3732 # Otherwise, convert hex formatted list entries to decimal; 3733 # add a 'Y' map for the missing value in binary properties, or 3734 # otherwise, use the input map unchanged. 3735 $map = ($format eq 'x' || $format eq 'ax') 3736 ? hex $map 3737 : $format eq 'b' 3738 ? 'Y' 3739 : $map; 3740 push @invmap, $map; 3741 } 3742 3743 # We just started a range. It ends with $end. The gap between it 3744 # and the next element in the list must be filled with a range 3745 # that maps to the default value. If there is no gap, the next 3746 # iteration will pop this, unless there is no next iteration, and 3747 # we have filled all of the Unicode code space, so check for that 3748 # and skip. 3749 if ($end < $Unicode::UCD::MAX_CP) { 3750 push @invlist, $end + 1; 3751 push @invmap, $missing; 3752 } 3753 } 3754 } 3755 3756 # If the property is empty, make all code points use the value for missing 3757 # ones. 3758 if (! @invlist) { 3759 push @invlist, 0; 3760 push @invmap, $missing; 3761 } 3762 3763 # The final element is always for just the above-Unicode code points. If 3764 # not already there, add it. It merely splits the current final range 3765 # that extends to infinity into two elements, each with the same map. 3766 # (This is to conform with the API that says the final element is for 3767 # $MAX_UNICODE_CODEPOINT + 1 .. INFINITY.) 3768 if ($invlist[-1] != $MAX_UNICODE_CODEPOINT + 1) { 3769 push @invmap, $invmap[-1]; 3770 push @invlist, $MAX_UNICODE_CODEPOINT + 1; 3771 } 3772 3773 # The second component of the map are those values that require 3774 # non-standard specification, stored in SPECIALS. These override any 3775 # duplicate code points in LIST. If we are using a proxy, we may have 3776 # already set $overrides based on the proxy. 3777 $overrides = $swash->{'SPECIALS'} unless defined $overrides; 3778 if ($overrides) { 3779 3780 # A negative $overrides implies that the SPECIALS should be ignored, 3781 # and a simple 'a' list is the value. 3782 if ($overrides < 0) { 3783 $format = 'a'; 3784 } 3785 else { 3786 3787 # Currently, all overrides are for properties that normally map to 3788 # single code points, but now some will map to lists of code 3789 # points (but there is an exception case handled below). 3790 $format = 'al'; 3791 3792 # Look through the overrides. 3793 foreach my $cp_maybe_utf8 (keys %$overrides) { 3794 my $cp; 3795 my @map; 3796 3797 # If the overrides came from SPECIALS, the code point keys are 3798 # packed UTF-8. 3799 if ($overrides == $swash->{'SPECIALS'}) { 3800 $cp = $cp_maybe_utf8; 3801 if (! utf8::decode($cp)) { 3802 croak __PACKAGE__, "::prop_invmap: Malformed UTF-8: ", 3803 map { sprintf("\\x{%02X}", unpack("C", $_)) } 3804 split "", $cp; 3805 } 3806 3807 $cp = unpack("W", $cp); 3808 @map = unpack "W*", $swash->{'SPECIALS'}{$cp_maybe_utf8}; 3809 3810 # The empty string will show up unpacked as an empty 3811 # array. 3812 $format = 'ale' if @map == 0; 3813 } 3814 else { 3815 3816 # But if we generated the overrides, we didn't bother to 3817 # pack them, and we, so far, do this only for properties 3818 # that are 'a' ones. 3819 $cp = $cp_maybe_utf8; 3820 @map = hex $overrides->{$cp}; 3821 $format = 'a'; 3822 } 3823 3824 # Find the range that the override applies to. 3825 my $i = search_invlist(\@invlist, $cp); 3826 if ($cp < $invlist[$i] || $cp >= $invlist[$i + 1]) { 3827 croak __PACKAGE__, "::prop_invmap: wrong_range, cp=$cp; i=$i, current=$invlist[$i]; next=$invlist[$i + 1]" 3828 } 3829 3830 # And what that range currently maps to 3831 my $cur_map = $invmap[$i]; 3832 3833 # If there is a gap between the next range and the code point 3834 # we are overriding, we have to add elements to both arrays to 3835 # fill that gap, using the map that applies to it, which is 3836 # $cur_map, since it is part of the current range. 3837 if ($invlist[$i + 1] > $cp + 1) { 3838 #use feature 'say'; 3839 #say "Before splice:"; 3840 #say 'i-2=[', $i-2, ']', sprintf("%04X maps to %s", $invlist[$i-2], $invmap[$i-2]) if $i >= 2; 3841 #say 'i-1=[', $i-1, ']', sprintf("%04X maps to %s", $invlist[$i-1], $invmap[$i-1]) if $i >= 1; 3842 #say 'i =[', $i, ']', sprintf("%04X maps to %s", $invlist[$i], $invmap[$i]); 3843 #say 'i+1=[', $i+1, ']', sprintf("%04X maps to %s", $invlist[$i+1], $invmap[$i+1]) if $i < @invlist + 1; 3844 #say 'i+2=[', $i+2, ']', sprintf("%04X maps to %s", $invlist[$i+2], $invmap[$i+2]) if $i < @invlist + 2; 3845 3846 splice @invlist, $i + 1, 0, $cp + 1; 3847 splice @invmap, $i + 1, 0, $cur_map; 3848 3849 #say "After splice:"; 3850 #say 'i-2=[', $i-2, ']', sprintf("%04X maps to %s", $invlist[$i-2], $invmap[$i-2]) if $i >= 2; 3851 #say 'i-1=[', $i-1, ']', sprintf("%04X maps to %s", $invlist[$i-1], $invmap[$i-1]) if $i >= 1; 3852 #say 'i =[', $i, ']', sprintf("%04X maps to %s", $invlist[$i], $invmap[$i]); 3853 #say 'i+1=[', $i+1, ']', sprintf("%04X maps to %s", $invlist[$i+1], $invmap[$i+1]) if $i < @invlist + 1; 3854 #say 'i+2=[', $i+2, ']', sprintf("%04X maps to %s", $invlist[$i+2], $invmap[$i+2]) if $i < @invlist + 2; 3855 } 3856 3857 # If the remaining portion of the range is multiple code 3858 # points (ending with the one we are replacing, guaranteed by 3859 # the earlier splice). We must split it into two 3860 if ($invlist[$i] < $cp) { 3861 $i++; # Compensate for the new element 3862 3863 #use feature 'say'; 3864 #say "Before splice:"; 3865 #say 'i-2=[', $i-2, ']', sprintf("%04X maps to %s", $invlist[$i-2], $invmap[$i-2]) if $i >= 2; 3866 #say 'i-1=[', $i-1, ']', sprintf("%04X maps to %s", $invlist[$i-1], $invmap[$i-1]) if $i >= 1; 3867 #say 'i =[', $i, ']', sprintf("%04X maps to %s", $invlist[$i], $invmap[$i]); 3868 #say 'i+1=[', $i+1, ']', sprintf("%04X maps to %s", $invlist[$i+1], $invmap[$i+1]) if $i < @invlist + 1; 3869 #say 'i+2=[', $i+2, ']', sprintf("%04X maps to %s", $invlist[$i+2], $invmap[$i+2]) if $i < @invlist + 2; 3870 3871 splice @invlist, $i, 0, $cp; 3872 splice @invmap, $i, 0, 'dummy'; 3873 3874 #say "After splice:"; 3875 #say 'i-2=[', $i-2, ']', sprintf("%04X maps to %s", $invlist[$i-2], $invmap[$i-2]) if $i >= 2; 3876 #say 'i-1=[', $i-1, ']', sprintf("%04X maps to %s", $invlist[$i-1], $invmap[$i-1]) if $i >= 1; 3877 #say 'i =[', $i, ']', sprintf("%04X maps to %s", $invlist[$i], $invmap[$i]); 3878 #say 'i+1=[', $i+1, ']', sprintf("%04X maps to %s", $invlist[$i+1], $invmap[$i+1]) if $i < @invlist + 1; 3879 #say 'i+2=[', $i+2, ']', sprintf("%04X maps to %s", $invlist[$i+2], $invmap[$i+2]) if $i < @invlist + 2; 3880 } 3881 3882 # Here, the range we are overriding contains a single code 3883 # point. The result could be the empty string, a single 3884 # value, or a list. If the last case, we use an anonymous 3885 # array. 3886 $invmap[$i] = (scalar @map == 0) 3887 ? "" 3888 : (scalar @map > 1) 3889 ? \@map 3890 : $map[0]; 3891 } 3892 } 3893 } 3894 elsif ($format eq 'x') { 3895 3896 # All hex-valued properties are really to code points, and have been 3897 # converted to decimal. 3898 $format = 's'; 3899 } 3900 elsif ($returned_prop eq 'ToDm') { 3901 $format = 'ad'; 3902 } 3903 elsif ($format eq 'sw') { # blank-separated elements to form a list. 3904 map { $_ = [ split " ", $_ ] if $_ =~ / / } @invmap; 3905 $format = 'sl'; 3906 } 3907 elsif ($returned_prop =~ / To ( _Perl )? NameAlias/x) { 3908 3909 # This property currently doesn't have any lists, but theoretically 3910 # could 3911 $format = 'sl'; 3912 } 3913 elsif ($returned_prop eq 'ToPerlDecimalDigit') { 3914 $format = 'ae'; 3915 } 3916 elsif ($returned_prop eq 'ToNv') { 3917 3918 # The one property that has this format is stored as a delta, so needs 3919 # to indicate that need to add code point to it. 3920 $format = 'ar'; 3921 } 3922 elsif ($format eq 'ax') { 3923 3924 # Normally 'ax' properties have overrides, and will have been handled 3925 # above, but if not, they still need adjustment, and the hex values 3926 # have already been converted to decimal 3927 $format = 'a'; 3928 } 3929 elsif ($format ne 'n' && $format !~ / ^ a /x) { 3930 3931 # All others are simple scalars 3932 $format = 's'; 3933 } 3934 if ($has_multiples && $format !~ /l/) { 3935 croak __PACKAGE__, "::prop_invmap: Wrong format '$format' for prop_invmap('$prop'); should indicate has lists"; 3936 } 3937 3938 return (\@invlist, \@invmap, $format, $missing); 3939} 3940 3941sub search_invlist { 3942 3943=pod 3944 3945=head2 B<search_invlist()> 3946 3947 use Unicode::UCD qw(prop_invmap prop_invlist); 3948 use Unicode::UCD 'search_invlist'; 3949 3950 my @invlist = prop_invlist($property_name); 3951 print $code_point, ((search_invlist(\@invlist, $code_point) // -1) % 2) 3952 ? " isn't" 3953 : " is", 3954 " in $property_name\n"; 3955 3956 my ($blocks_ranges_ref, $blocks_map_ref) = prop_invmap("Block"); 3957 my $index = search_invlist($blocks_ranges_ref, $code_point); 3958 print "$code_point is in block ", $blocks_map_ref->[$index], "\n"; 3959 3960C<search_invlist> is used to search an inversion list returned by 3961C<prop_invlist> or C<prop_invmap> for a particular L</code point argument>. 3962C<undef> is returned if the code point is not found in the inversion list 3963(this happens only when it is not a legal L<code point argument>, or is less 3964than the list's first element). A warning is raised in the first instance. 3965 3966Otherwise, it returns the index into the list of the range that contains the 3967code point.; that is, find C<i> such that 3968 3969 list[i]<= code_point < list[i+1]. 3970 3971As explained in L</prop_invlist()>, whether a code point is in the list or not 3972depends on if the index is even (in) or odd (not in). And as explained in 3973L</prop_invmap()>, the index is used with the returned parallel array to find 3974the mapping. 3975 3976=cut 3977 3978 3979 my $list_ref = shift; 3980 my $input_code_point = shift; 3981 my $code_point = _getcode($input_code_point); 3982 3983 if (! defined $code_point) { 3984 carp __PACKAGE__, "::search_invlist: unknown code '$input_code_point'"; 3985 return; 3986 } 3987 3988 my $max_element = @$list_ref - 1; 3989 3990 # Return undef if list is empty or requested item is before the first element. 3991 return if $max_element < 0; 3992 return if $code_point < $list_ref->[0]; 3993 3994 # Short cut something at the far-end of the table. This also allows us to 3995 # refer to element [$i+1] without fear of being out-of-bounds in the loop 3996 # below. 3997 return $max_element if $code_point >= $list_ref->[$max_element]; 3998 3999 use integer; # want integer division 4000 4001 my $i = $max_element / 2; 4002 4003 my $lower = 0; 4004 my $upper = $max_element; 4005 while (1) { 4006 4007 if ($code_point >= $list_ref->[$i]) { 4008 4009 # Here we have met the lower constraint. We can quit if we 4010 # also meet the upper one. 4011 last if $code_point < $list_ref->[$i+1]; 4012 4013 $lower = $i; # Still too low. 4014 4015 } 4016 else { 4017 4018 # Here, $code_point < $list_ref[$i], so look lower down. 4019 $upper = $i; 4020 } 4021 4022 # Split search domain in half to try again. 4023 my $temp = ($upper + $lower) / 2; 4024 4025 # No point in continuing unless $i changes for next time 4026 # in the loop. 4027 return $i if $temp == $i; 4028 $i = $temp; 4029 } # End of while loop 4030 4031 # Here we have found the offset 4032 return $i; 4033} 4034 4035=head2 Unicode::UCD::UnicodeVersion 4036 4037This returns the version of the Unicode Character Database, in other words, the 4038version of the Unicode standard the database implements. The version is a 4039string of numbers delimited by dots (C<'.'>). 4040 4041=cut 4042 4043my $UNICODEVERSION; 4044 4045sub UnicodeVersion { 4046 unless (defined $UNICODEVERSION) { 4047 openunicode(\$VERSIONFH, "version"); 4048 local $/ = "\n"; 4049 chomp($UNICODEVERSION = <$VERSIONFH>); 4050 close($VERSIONFH); 4051 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'" 4052 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/; 4053 } 4054 $v_unicode_version = pack "C*", split /\./, $UNICODEVERSION; 4055 return $UNICODEVERSION; 4056} 4057 4058=head2 B<Blocks versus Scripts> 4059 4060The difference between a block and a script is that scripts are closer 4061to the linguistic notion of a set of code points required to represent 4062languages, while block is more of an artifact of the Unicode code point 4063numbering and separation into blocks of consecutive code points (so far the 4064size of a block is some multiple of 16, like 128 or 256). 4065 4066For example the Latin B<script> is spread over several B<blocks>, such 4067as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and 4068C<Latin Extended-B>. On the other hand, the Latin script does not 4069contain all the characters of the C<Basic Latin> block (also known as 4070ASCII): it includes only the letters, and not, for example, the digits 4071nor the punctuation. 4072 4073For blocks see L<http://www.unicode.org/Public/UNIDATA/Blocks.txt> 4074 4075For scripts see UTR #24: L<http://www.unicode.org/unicode/reports/tr24/> 4076 4077=head2 B<Matching Scripts and Blocks> 4078 4079Scripts are matched with the regular-expression construct 4080C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script), 4081while C<\p{Blk=...}> is used for blocks (e.g. C<\p{Blk=Tibetan}> matches 4082any of the 256 code points in the Tibetan block). 4083 4084=head2 Old-style versus new-style block names 4085 4086Unicode publishes the names of blocks in two different styles, though the two 4087are equivalent under Unicode's loose matching rules. 4088 4089The original style uses blanks and hyphens in the block names (except for 4090C<No_Block>), like so: 4091 4092 Miscellaneous Mathematical Symbols-B 4093 4094The newer style replaces these with underscores, like this: 4095 4096 Miscellaneous_Mathematical_Symbols_B 4097 4098This newer style is consistent with the values of other Unicode properties. 4099To preserve backward compatibility, all the functions in Unicode::UCD that 4100return block names (except as noted) return the old-style ones. 4101L</prop_value_aliases()> returns the new-style and can be used to convert from 4102old-style to new-style: 4103 4104 my $new_style = prop_values_aliases("block", $old_style); 4105 4106Perl also has single-form extensions that refer to blocks, C<In_Cyrillic>, 4107meaning C<Block=Cyrillic>. These have always been written in the new style. 4108 4109To convert from new-style to old-style, follow this recipe: 4110 4111 $old_style = charblock((prop_invlist("block=$new_style"))[0]); 4112 4113(which finds the range of code points in the block using C<prop_invlist>, 4114gets the lower end of the range (0th element) and then looks up the old name 4115for its block using C<charblock>). 4116 4117Note that starting in Unicode 6.1, many of the block names have shorter 4118synonyms. These are always given in the new style. 4119 4120=head2 Use with older Unicode versions 4121 4122The functions in this module work as well as can be expected when 4123used on earlier Unicode versions. But, obviously, they use the available data 4124from that Unicode version. For example, if the Unicode version predates the 4125definition of the script property (Unicode 3.1), then any function that deals 4126with scripts is going to return C<undef> for the script portion of the return 4127value. 4128 4129=head1 AUTHOR 4130 4131Jarkko Hietaniemi. Now maintained by perl5 porters. 4132 4133=cut 4134 41351; 4136