# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
# This file is machine-generated by lib/unicore/mktables from the Unicode
# database, Version 14.0.0.  Any changes made here will be lost!


# !!!!!!! INTERNAL PERL USE ONLY !!!!!!!
# This file is for internal use by core Perl only.  The format and even the
# name or existence of this file are subject to change without notice.  Don't
# use it directly.  Use Unicode::UCD to access the Unicode character data
# base.

# NOTE(review): because this file is regenerated, the corrections made here
# (lexical $name in code_point_to_name_special, anchored Hangul prefix in
# name_to_code_point_special, explicit 'our' on the exported array) also need
# to be applied to the templates in lib/unicore/mktables to be permanent.


=head1 NAME -- Internal generated file for use by charnames

=cut


package charnames;

# This module contains machine-generated tables and code for the
# algorithmically-determinable Unicode character names.  The following
# routines can be used to translate between name and code point and vice versa

{ # Closure

    # Matches legal code point.  4-6 hex digits.  If there are 6, the first
    # two must be 10; if there are 5, the first must not be a 0.  Written this
    # way to decrease backtracking.  The first regex allows the code point to
    # be at the end of a word, but to work properly, the word shouldn't end
    # with a valid hex character.  The second one won't match a code point at
    # the end of a word, and doesn't have the run-on issue
    my $run_on_code_point_re = qr/(?^aax: (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b)/;
    my $code_point_re = qr/(?^aa:\b(?^aax: (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b))/;

    # In the following hash, the keys are the bases of names which include
    # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01.  The value
    # of each key is another hash which is used to get the low and high ends
    # for each range of code points that apply to the name.  The 'low' and
    # 'high' arrays are parallel: element i of each describes one range, and
    # the ranges are listed in ascending order.
    my %names_ending_in_code_point = (
        'CJK COMPATIBILITY IDEOGRAPH' => {
            'high' => [ 64109, 64217, 195101 ],
            'low'  => [ 63744, 64112, 194560 ],
        },
        'CJK UNIFIED IDEOGRAPH' => {
            'high' => [ 19903, 40959, 173791, 177976, 178205, 183969, 191456, 201546 ],
            'low'  => [ 13312, 19968, 131072, 173824, 177984, 178208, 183984, 196608 ],
        },
        'KHITAN SMALL SCRIPT CHARACTER' => {
            'high' => [ 101589 ],
            'low'  => [ 101120 ],
        },
        'NUSHU CHARACTER' => {
            'high' => [ 111355 ],
            'low'  => [ 110960 ],
        },
        'TANGUT IDEOGRAPH' => {
            'high' => [ 100343 ],
            'low'  => [ 94208 ],
        },
        'TANGUT IDEOGRAPH SUPPLEMENT' => {
            'high' => [ 101640 ],
            'low'  => [ 101632 ],
        },
    );

    # The following hash is a copy of the previous one, except is for loose
    # matching, so each name has blanks and dashes squeezed out
    my %loose_names_ending_in_code_point = (
        'CJKCOMPATIBILITYIDEOGRAPH' => {
            'high' => [ 64109, 64217, 195101 ],
            'low'  => [ 63744, 64112, 194560 ],
        },
        'CJKUNIFIEDIDEOGRAPH' => {
            'high' => [ 19903, 40959, 173791, 177976, 178205, 183969, 191456, 201546 ],
            'low'  => [ 13312, 19968, 131072, 173824, 177984, 178208, 183984, 196608 ],
        },
        'KHITANSMALLSCRIPTCHARACTER' => {
            'high' => [ 101589 ],
            'low'  => [ 101120 ],
        },
        'NUSHUCHARACTER' => {
            'high' => [ 111355 ],
            'low'  => [ 110960 ],
        },
        'TANGUTIDEOGRAPH' => {
            'high' => [ 100343 ],
            'low'  => [ 94208 ],
        },
        'TANGUTIDEOGRAPHSUPPLEMENT' => {
            'high' => [ 101640 ],
            'low'  => [ 101632 ],
        },
    );

    # And the following array gives the inverse mapping from code points to
    # names.  Lowest code points are first.  Each 'legal' string starts with a
    # newline and a blank, followed by '-', the hex digits, and the letters
    # that occur in the name.  This is deliberately a package variable (it is
    # exportable); 'our' merely makes that explicit.
    our @code_points_ending_in_code_point = (
        { 'high' => 19903,  'legal' => "\n -0123456789ABCDEFGHIJKNOPRU",
          'low'  => 13312,  'name'  => 'CJK UNIFIED IDEOGRAPH', },
        { 'high' => 40959,  'legal' => "\n -0123456789ABCDEFGHIJKNOPRU",
          'low'  => 19968,  'name'  => 'CJK UNIFIED IDEOGRAPH', },
        { 'high' => 64109,  'legal' => "\n -0123456789ABCDEFGHIJKLMOPRTY",
          'low'  => 63744,  'name'  => 'CJK COMPATIBILITY IDEOGRAPH', },
        { 'high' => 64217,  'legal' => "\n -0123456789ABCDEFGHIJKLMOPRTY",
          'low'  => 64112,  'name'  => 'CJK COMPATIBILITY IDEOGRAPH', },
        { 'high' => 100343, 'legal' => "\n -0123456789ABCDEFGHINOPRTU",
          'low'  => 94208,  'name'  => 'TANGUT IDEOGRAPH', },
        { 'high' => 101589, 'legal' => "\n -0123456789ABCDEFHIKLMNPRST",
          'low'  => 101120, 'name'  => 'KHITAN SMALL SCRIPT CHARACTER', },
        { 'high' => 101640, 'legal' => "\n -0123456789ABCDEFGHILMNOPRSTU",
          'low'  => 101632, 'name'  => 'TANGUT IDEOGRAPH SUPPLEMENT', },
        { 'high' => 111355, 'legal' => "\n -0123456789ABCDEFHNRSTU",
          'low'  => 110960, 'name'  => 'NUSHU CHARACTER', },
        { 'high' => 173791, 'legal' => "\n -0123456789ABCDEFGHIJKNOPRU",
          'low'  => 131072, 'name'  => 'CJK UNIFIED IDEOGRAPH', },
        { 'high' => 177976, 'legal' => "\n -0123456789ABCDEFGHIJKNOPRU",
          'low'  => 173824, 'name'  => 'CJK UNIFIED IDEOGRAPH', },
        { 'high' => 178205, 'legal' => "\n -0123456789ABCDEFGHIJKNOPRU",
          'low'  => 177984, 'name'  => 'CJK UNIFIED IDEOGRAPH', },
        { 'high' => 183969, 'legal' => "\n -0123456789ABCDEFGHIJKNOPRU",
          'low'  => 178208, 'name'  => 'CJK UNIFIED IDEOGRAPH', },
        { 'high' => 191456, 'legal' => "\n -0123456789ABCDEFGHIJKNOPRU",
          'low'  => 183984, 'name'  => 'CJK UNIFIED IDEOGRAPH', },
        { 'high' => 195101, 'legal' => "\n -0123456789ABCDEFGHIJKLMOPRTY",
          'low'  => 194560, 'name'  => 'CJK COMPATIBILITY IDEOGRAPH', },
        { 'high' => 201546, 'legal' => "\n -0123456789ABCDEFGHIJKNOPRU",
          'low'  => 196608, 'name'  => 'CJK UNIFIED IDEOGRAPH', },
    );

    # Is exportable, make read-only
    Internals::SvREADONLY(@code_points_ending_in_code_point, 1);

    # Convert from code point to Jamo short name for use in composing Hangul
    # syllable names
    my %Jamo = (
        # Keys 4352 .. 4370 (indexed from $LBase below)
        4352 => 'G',   4353 => 'GG',  4354 => 'N',   4355 => 'D',
        4356 => 'DD',  4357 => 'R',   4358 => 'M',   4359 => 'B',
        4360 => 'BB',  4361 => 'S',   4362 => 'SS',  4363 => '',
        4364 => 'J',   4365 => 'JJ',  4366 => 'C',   4367 => 'K',
        4368 => 'T',   4369 => 'P',   4370 => 'H',

        # Keys 4449 .. 4469 (indexed from $VBase below)
        4449 => 'A',   4450 => 'AE',  4451 => 'YA',  4452 => 'YAE',
        4453 => 'EO',  4454 => 'E',   4455 => 'YEO', 4456 => 'YE',
        4457 => 'O',   4458 => 'WA',  4459 => 'WAE', 4460 => 'OE',
        4461 => 'YO',  4462 => 'U',   4463 => 'WEO', 4464 => 'WE',
        4465 => 'WI',  4466 => 'YU',  4467 => 'EU',  4468 => 'YI',
        4469 => 'I',

        # Keys 4520 .. 4546 (indexed from $TBase + 1 below)
        4520 => 'G',   4521 => 'GG',  4522 => 'GS',  4523 => 'N',
        4524 => 'NJ',  4525 => 'NH',  4526 => 'D',   4527 => 'L',
        4528 => 'LG',  4529 => 'LM',  4530 => 'LB',  4531 => 'LS',
        4532 => 'LT',  4533 => 'LP',  4534 => 'LH',  4535 => 'M',
        4536 => 'B',   4537 => 'BS',  4538 => 'S',   4539 => 'SS',
        4540 => 'NG',  4541 => 'J',   4542 => 'C',   4543 => 'K',
        4544 => 'T',   4545 => 'P',   4546 => 'H',
    );

    # Leading consonant (can be null)
    my %Jamo_L = (
        ''   => 11, 'B'  => 7,  'BB' => 8,  'C'  => 14, 'D'  => 3,
        'DD' => 4,  'G'  => 0,  'GG' => 1,  'H'  => 18, 'J'  => 12,
        'JJ' => 13, 'K'  => 15, 'M'  => 6,  'N'  => 2,  'P'  => 17,
        'R'  => 5,  'S'  => 9,  'SS' => 10, 'T'  => 16,
    );

    # Vowel
    my %Jamo_V = (
        'A'   => 0,  'AE'  => 1,  'E'   => 5,  'EO'  => 4,  'EU'  => 18,
        'I'   => 20, 'O'   => 8,  'OE'  => 11, 'U'   => 13, 'WA'  => 9,
        'WAE' => 10, 'WE'  => 15, 'WEO' => 14, 'WI'  => 16, 'YA'  => 2,
        'YAE' => 3,  'YE'  => 7,  'YEO' => 6,  'YI'  => 19, 'YO'  => 12,
        'YU'  => 17,
    );

    # Optional trailing consonant
    my %Jamo_T = (
        'B'  => 17, 'BS' => 18, 'C'  => 23, 'D'  => 7,  'G'  => 1,
        'GG' => 2,  'GS' => 3,  'H'  => 27, 'J'  => 22, 'K'  => 24,
        'L'  => 8,  'LB' => 11, 'LG' => 9,  'LH' => 15, 'LM' => 10,
        'LP' => 14, 'LS' => 12, 'LT' => 13, 'M'  => 16, 'N'  => 4,
        'NG' => 21, 'NH' => 6,  'NJ' => 5,  'P'  => 26, 'S'  => 19,
        'SS' => 20, 'T'  => 25,
    );

    # Computed re that splits up a Hangul name into LVT or LV syllables
    my $syllable_re = qr/(|B|BB|C|D|DD|G|GG|H|J|JJ|K|M|N|P|R|S|SS|T)(A|AE|E|EO|EU|I|O|OE|U|WA|WAE|WE|WEO|WI|YA|YAE|YE|YEO|YI|YO|YU)(B|BS|C|D|G|GG|GS|H|J|K|L|LB|LG|LH|LM|LP|LS|LT|M|N|NG|NH|NJ|P|S|SS|T)?/;

    my $HANGUL_SYLLABLE = "HANGUL SYLLABLE ";
    my $loose_HANGUL_SYLLABLE = "HANGULSYLLABLE";

    # These constants names and values were taken from the Unicode standard,
    # version 5.1, section 3.12.  They are used in conjunction with Hangul
    # syllables
    my $SBase = 0xAC00;
    my $LBase = 0x1100;
    my $VBase = 0x1161;
    my $TBase = 0x11A7;
    my $SCount = 11172;
    my $LCount = 19;
    my $VCount = 21;
    my $TCount = 28;
    my $NCount = $VCount * $TCount;

    sub name_to_code_point_special {
        my ($name, $loose) = @_;

        # Returns undef if not one of the specially handled names; otherwise
        # returns the code point equivalent to the input name
        # $loose is non-zero if to use loose matching, 'name' in that case
        # must be input as upper case with all blanks and dashes squeezed out.

        # Hangul syllable names.  The prefix must be at the very start of the
        # name; stripping it from an arbitrary position would let a bogus
        # name such as "GHANGUL SYLLABLE A" collapse to "GA" and be accepted.
        my $hangul_prefix = $loose ? $loose_HANGUL_SYLLABLE
                                   : $HANGUL_SYLLABLE;
        if ($name =~ s/^\Q$hangul_prefix\E//) {
            return if $name !~ qr/^$syllable_re$/;
            my $L = $Jamo_L{$1};
            my $V = $Jamo_V{$2};
            my $T = (defined $3) ? $Jamo_T{$3} : 0;
            return ($L * $VCount + $V) * $TCount + $T + $SBase;
        }

        # Name must end in 'code_point' for this to handle.
        return if (($loose && $name !~ /^ (.*?) ($run_on_code_point_re) $/x)
                    || (! $loose && $name !~ /^ (.*) ($code_point_re) $/x));

        my $base = $1;
        my $code_point = CORE::hex $2;
        my $names_ref;

        if ($loose) {
            $names_ref = \%loose_names_ending_in_code_point;
        }
        else {
            # In strict matching a dash separates the base from the code
            # point; it is not part of the table key.
            return if $base !~ s/-$//;
            $names_ref = \%names_ending_in_code_point;
        }

        # Name must be one of the ones which has the code point in it.
        my $ranges = $names_ref->{$base};
        return if ! $ranges;

        # Look through the list of ranges that apply to this name to see if
        # the code point is in one of them.  The ranges are in ascending
        # order, so we can give up as soon as one starts above the code point.
        my $lows  = $ranges->{'low'};
        my $highs = $ranges->{'high'};
        for my $i (0 .. $#{$lows}) {
            return if $lows->[$i] > $code_point;
            next if $highs->[$i] < $code_point;

            # Here, the code point is in the range.
            return $code_point;
        }

        # Here, looked like the name had a code point number in it, but
        # did not match one of the valid ones.
        return;
    }

    sub code_point_to_name_special {
        my $code_point = shift;

        # Returns the name of a code point if algorithmically determinable;
        # undef if not

        # If in the Hangul range, calculate the name based on Unicode's
        # algorithm
        if ($code_point >= $SBase && $code_point <= $SBase + $SCount - 1) {
            use integer;
            my $SIndex = $code_point - $SBase;
            my $L = $LBase + $SIndex / $NCount;
            my $V = $VBase + ($SIndex % $NCount) / $TCount;
            my $T = $TBase + $SIndex % $TCount;

            # Lexical, so the package variable $charnames::name is not
            # overwritten as a side effect of the lookup.
            my $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}";
            $name .= $Jamo{$T} if $T != $TBase;   # $TBase means no trailing
            return $name;
        }

        # Look through list of these code points for one in range.  The list
        # is sorted by 'low', so stop once a range starts above the code
        # point.
        foreach my $hash (@code_points_ending_in_code_point) {
            return if $code_point < $hash->{'low'};
            if ($code_point <= $hash->{'high'}) {
                return sprintf("%s-%04X", $hash->{'name'}, $code_point);
            }
        }
        return;            # None found
    }
} # End closure

1;