# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
# This file is machine-generated by lib/unicore/mktables from the Unicode
# database, Version 13.0.0.  Any changes made here will be lost!


# !!!!!!! INTERNAL PERL USE ONLY !!!!!!!
# This file is for internal use by core Perl only.  The format and even the
# name or existence of this file are subject to change without notice.  Don't
# use it directly.  Use Unicode::UCD to access the Unicode character data
# base.


package charnames;

use strict;

# This module contains machine-generated tables and code for the
# algorithmically-determinable Unicode character names.  The following
# routines can be used to translate between name and code point and vice versa

{ # Closure

    # Matches legal code point.  4-6 hex digits.  If there are 6, the first
    # two must be 10; if there are 5, the first must not be a 0.  Written this
    # way to decrease backtracking.  The first regex allows the code point to
    # be at the end of a word, but to work properly, the word shouldn't end
    # with a valid hex character.  The second one won't match a code point at
    # the end of a word, and doesn't have the run-on issue
    my $run_on_code_point_re = qr/(?^aax: (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b)/;
    my $code_point_re = qr/(?^aa:\b(?^aax: (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b))/;

    # Single source of truth for every range of code points whose name is
    # "<base name>-<code point in hex>", like CJK UNIFIED IDEOGRAPH-4E01.
    # Each row is [ low, high, base name ]; the rows MUST stay sorted by
    # ascending 'low', because both lookup routines below stop scanning as
    # soon as they see a range that starts above the code point being sought.
    my @name_ranges = (
        [  13312,  19903, 'CJK UNIFIED IDEOGRAPH'         ],
        [  19968,  40956, 'CJK UNIFIED IDEOGRAPH'         ],
        [  63744,  64109, 'CJK COMPATIBILITY IDEOGRAPH'   ],
        [  64112,  64217, 'CJK COMPATIBILITY IDEOGRAPH'   ],
        [  94208, 100343, 'TANGUT IDEOGRAPH'              ],
        [ 101120, 101589, 'KHITAN SMALL SCRIPT CHARACTER' ],
        [ 101632, 101640, 'TANGUT IDEOGRAPH SUPPLEMENT'   ],
        [ 110960, 111355, 'NUSHU CHARACTER'               ],
        [ 131072, 173789, 'CJK UNIFIED IDEOGRAPH'         ],
        [ 173824, 177972, 'CJK UNIFIED IDEOGRAPH'         ],
        [ 177984, 178205, 'CJK UNIFIED IDEOGRAPH'         ],
        [ 178208, 183969, 'CJK UNIFIED IDEOGRAPH'         ],
        [ 183984, 191456, 'CJK UNIFIED IDEOGRAPH'         ],
        [ 194560, 195101, 'CJK COMPATIBILITY IDEOGRAPH'   ],
        [ 196608, 201546, 'CJK UNIFIED IDEOGRAPH'         ],
    );

    # The 'legal' string published for each base name in the inverse-mapping
    # array below.  Each value is a newline, a blank, a dash, the hex digits
    # and the letters of the base name, in sorted order.
    # NOTE(review): presumably consumers use this as the set of characters
    # that may legally appear in such a name -- confirm against the callers.
    my %legal_for = (
        'CJK COMPATIBILITY IDEOGRAPH'   => "\n -0123456789ABCDEFGHIJKLMOPRTY",
        'CJK UNIFIED IDEOGRAPH'         => "\n -0123456789ABCDEFGHIJKNOPRU",
        'KHITAN SMALL SCRIPT CHARACTER' => "\n -0123456789ABCDEFHIKLMNPRST",
        'NUSHU CHARACTER'               => "\n -0123456789ABCDEFHNRSTU",
        'TANGUT IDEOGRAPH'              => "\n -0123456789ABCDEFGHINOPRTU",
        'TANGUT IDEOGRAPH SUPPLEMENT'   => "\n -0123456789ABCDEFGHILMNOPRSTU",
    );

    # In the following hash, the keys are the bases of names which include
    # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01.  The value
    # of each key is another hash which is used to get the low and high ends
    # for each range of code points that apply to the name:
    #   { 'low' => [ ... ], 'high' => [ ... ] }     (parallel arrays)
    my %names_ending_in_code_point;

    # The following hash is a copy of the previous one, except is for loose
    # matching, so each name has blanks and dashes squeezed out
    my %loose_names_ending_in_code_point;

    for my $range (@name_ranges) {
        my ($low, $high, $base_name) = @$range;
        (my $loose_name = $base_name) =~ tr/ -//d;

        push @{ $names_ending_in_code_point{$base_name}{'low'} },  $low;
        push @{ $names_ending_in_code_point{$base_name}{'high'} }, $high;
        push @{ $loose_names_ending_in_code_point{$loose_name}{'low'} },  $low;
        push @{ $loose_names_ending_in_code_point{$loose_name}{'high'} }, $high;
    }

    # And the following array gives the inverse mapping from code points to
    # names.  Lowest code points are first.  It is a package variable
    # (@charnames::code_points_ending_in_code_point) so that it remains
    # reachable from outside this closure.
    our @code_points_ending_in_code_point = map {
        my ($low, $high, $base_name) = @$_;
        {
            'high'  => $high,
            'legal' => $legal_for{$base_name},
            'low'   => $low,
            'name'  => $base_name,
        };
    } @name_ranges;

    # Is exportable, make read-only
    Internals::SvREADONLY(@code_points_ending_in_code_point, 1);

    # Convert from code point to Jamo short name for use in composing Hangul
    # syllable names
    my %Jamo = (
        # Leading consonants
        4352 => 'G',   4353 => 'GG',  4354 => 'N',   4355 => 'D',
        4356 => 'DD',  4357 => 'R',   4358 => 'M',   4359 => 'B',
        4360 => 'BB',  4361 => 'S',   4362 => 'SS',  4363 => '',
        4364 => 'J',   4365 => 'JJ',  4366 => 'C',   4367 => 'K',
        4368 => 'T',   4369 => 'P',   4370 => 'H',

        # Vowels
        4449 => 'A',   4450 => 'AE',  4451 => 'YA',  4452 => 'YAE',
        4453 => 'EO',  4454 => 'E',   4455 => 'YEO', 4456 => 'YE',
        4457 => 'O',   4458 => 'WA',  4459 => 'WAE', 4460 => 'OE',
        4461 => 'YO',  4462 => 'U',   4463 => 'WEO', 4464 => 'WE',
        4465 => 'WI',  4466 => 'YU',  4467 => 'EU',  4468 => 'YI',
        4469 => 'I',

        # Trailing consonants
        4520 => 'G',   4521 => 'GG',  4522 => 'GS',  4523 => 'N',
        4524 => 'NJ',  4525 => 'NH',  4526 => 'D',   4527 => 'L',
        4528 => 'LG',  4529 => 'LM',  4530 => 'LB',  4531 => 'LS',
        4532 => 'LT',  4533 => 'LP',  4534 => 'LH',  4535 => 'M',
        4536 => 'B',   4537 => 'BS',  4538 => 'S',   4539 => 'SS',
        4540 => 'NG',  4541 => 'J',   4542 => 'C',   4543 => 'K',
        4544 => 'T',   4545 => 'P',   4546 => 'H',
    );

    # Leading consonant (can be null)
    my %Jamo_L = (
        ''   => 11,
        'B'  => 7,
        'BB' => 8,
        'C'  => 14,
        'D'  => 3,
        'DD' => 4,
        'G'  => 0,
        'GG' => 1,
        'H'  => 18,
        'J'  => 12,
        'JJ' => 13,
        'K'  => 15,
        'M'  => 6,
        'N'  => 2,
        'P'  => 17,
        'R'  => 5,
        'S'  => 9,
        'SS' => 10,
        'T'  => 16,
    );

    # Vowel
    my %Jamo_V = (
        'A'   => 0,
        'AE'  => 1,
        'E'   => 5,
        'EO'  => 4,
        'EU'  => 18,
        'I'   => 20,
        'O'   => 8,
        'OE'  => 11,
        'U'   => 13,
        'WA'  => 9,
        'WAE' => 10,
        'WE'  => 15,
        'WEO' => 14,
        'WI'  => 16,
        'YA'  => 2,
        'YAE' => 3,
        'YE'  => 7,
        'YEO' => 6,
        'YI'  => 19,
        'YO'  => 12,
        'YU'  => 17,
    );

    # Optional trailing consonant
    my %Jamo_T = (
        'B'  => 17,
        'BS' => 18,
        'C'  => 23,
        'D'  => 7,
        'G'  => 1,
        'GG' => 2,
        'GS' => 3,
        'H'  => 27,
        'J'  => 22,
        'K'  => 24,
        'L'  => 8,
        'LB' => 11,
        'LG' => 9,
        'LH' => 15,
        'LM' => 10,
        'LP' => 14,
        'LS' => 12,
        'LT' => 13,
        'M'  => 16,
        'N'  => 4,
        'NG' => 21,
        'NH' => 6,
        'NJ' => 5,
        'P'  => 26,
        'S'  => 19,
        'SS' => 20,
        'T'  => 25,
    );

    # Computed re that splits up a Hangul name into LVT or LV syllables
    my $syllable_re = qr/(|B|BB|C|D|DD|G|GG|H|J|JJ|K|M|N|P|R|S|SS|T)(A|AE|E|EO|EU|I|O|OE|U|WA|WAE|WE|WEO|WI|YA|YAE|YE|YEO|YI|YO|YU)(B|BS|C|D|G|GG|GS|H|J|K|L|LB|LG|LH|LM|LP|LS|LT|M|N|NG|NH|NJ|P|S|SS|T)?/;

    my $HANGUL_SYLLABLE = "HANGUL SYLLABLE ";
    my $loose_HANGUL_SYLLABLE = "HANGULSYLLABLE";

    # These constants names and values were taken from the Unicode standard,
    # version 5.1, section 3.12.  They are used in conjunction with Hangul
    # syllables
    my $SBase = 0xAC00;
    my $LBase = 0x1100;
    my $VBase = 0x1161;
    my $TBase = 0x11A7;
    my $SCount = 11172;
    my $LCount = 19;
    my $VCount = 21;
    my $TCount = 28;
    my $NCount = $VCount * $TCount;

    sub name_to_code_point_special {
        my ($name, $loose) = @_;

        # Returns undef if not one of the specially handled names; otherwise
        # returns the code point equivalent to the input name
        # $loose is non-zero if to use loose matching, 'name' in that case
        # must be input as upper case with all blanks and dashes squeezed out.

        # A Hangul syllable name must *begin* with the Hangul prefix.  The
        # prefix is anchored so that junk in front of it (for example
        # "GHANGUL SYLLABLE A") cannot be glued onto the remainder and then
        # mistaken for a valid syllable.
        my $hangul_prefix = $loose ? $loose_HANGUL_SYLLABLE
                                   : $HANGUL_SYLLABLE;
        if ($name =~ s/\A\Q$hangul_prefix\E//) {
            return if $name !~ /^$syllable_re$/;
            my $L = $Jamo_L{$1};
            my $V = $Jamo_V{$2};
            my $T = (defined $3) ? $Jamo_T{$3} : 0;     # LV syllable: no T
            return ($L * $VCount + $V) * $TCount + $T + $SBase;
        }

        # Name must end in 'code_point' for this to handle.  Loose names
        # have no separator left, so the base is matched minimally and the
        # run-on form of the code point pattern is used.
        return if (($loose && $name !~ /^ (.*?) ($run_on_code_point_re) $/x)
                   || (! $loose && $name !~ /^ (.*) ($code_point_re) $/x));

        my $base = $1;
        my $code_point = CORE::hex $2;
        my $names_ref;

        if ($loose) {
            $names_ref = \%loose_names_ending_in_code_point;
        }
        else {
            # Strict names must have a dash between base and code point.
            return if $base !~ s/-$//;
            $names_ref = \%names_ending_in_code_point;
        }

        # Name must be one of the ones which has the code point in it.
        my $ranges = $names_ref->{$base};
        return if ! $ranges;

        # Look through the list of ranges that apply to this name to see if
        # the code point is in one of them.  The ranges are in ascending
        # order, so can give up as soon as one starts above the code point.
        for my $i (0 .. $#{ $ranges->{'low'} }) {
            return if $ranges->{'low'}[$i] > $code_point;
            next if $ranges->{'high'}[$i] < $code_point;

            # Here, the code point is in the range.
            return $code_point;
        }

        # Here, looked like the name had a code point number in it, but
        # did not match one of the valid ones.
        return;
    }

    sub code_point_to_name_special {
        my $code_point = shift;

        # Returns the name of a code point if algorithmically determinable;
        # undef if not

        # If in the Hangul range, calculate the name based on Unicode's
        # algorithm
        if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) {
            use integer;
            my $SIndex = $code_point - $SBase;
            my $L = $LBase + $SIndex / $NCount;
            my $V = $VBase + ($SIndex % $NCount) / $TCount;
            my $T = $TBase + $SIndex % $TCount;

            # Lexical, so that nothing leaks into $charnames::name
            my $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}";
            $name .= $Jamo{$T} if $T != $TBase;
            return $name;
        }

        # Look through list of these code points for one in range.
        foreach my $hash (@code_points_ending_in_code_point) {
            return if $code_point < $hash->{'low'};
            if ($code_point <= $hash->{'high'}) {
                return sprintf("%s-%04X", $hash->{'name'}, $code_point);
            }
        }
        return;            # None found
    }
} # End closure

1;