# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
# This file is machine-generated by lib/unicore/mktables from the Unicode
# database, Version 6.2.0.  Any changes made here will be lost!


# !!!!!!! INTERNAL PERL USE ONLY !!!!!!!
# This file is for internal use by core Perl only.  The format and even the
# name or existence of this file are subject to change without notice.  Don't
# use it directly.


package charnames;

# Every variable in this file is a lexical; strict makes an accidental
# package global a compile-time error.
use strict;

# This module contains machine-generated tables and code for the
# algorithmically-determinable Unicode character names.  The following
# routines can be used to translate between name and code point and vice
# versa.
#
# NOTE(review): the header says this file is generated, so any change made
# here presumably has to be mirrored in the generator to survive a rebuild.

{ # Closure

    # Matches a legal code point: 4-6 hex digits.  If there are 6, the first
    # two must be 10; if there are 5, the first must not be a 0.  Written
    # this way to decrease backtracking.  The first regex allows the code
    # point to be at the end of a word, but to work properly, the word
    # shouldn't end with a valid hex character.  The second one won't match a
    # code point at the end of a word, and doesn't have the run-on issue.
    my $run_on_code_point_re = qr/(?^aax: (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b)/;
    my $code_point_re = qr/(?^aa:\b(?^aax: (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b))/;

    # In the following hash, the keys are the bases of names which include
    # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01.  The value
    # of each key is another hash which is used to get the low and high ends
    # for each range of code points that apply to the name.  'low' and 'high'
    # are parallel arrays: element i of each describes the same range.
    my %names_ending_in_code_point = (
        'CJK COMPATIBILITY IDEOGRAPH' => {
            'high' => [ 64109, 64217, 195101 ],
            'low'  => [ 63744, 64112, 194560 ],
        },
        'CJK UNIFIED IDEOGRAPH' => {
            'high' => [ 19893, 40908, 173782, 177972, 178205 ],
            'low'  => [ 13312, 19968, 131072, 173824, 177984 ],
        },
    );

    # The following hash is a copy of the previous one, except it is for
    # loose matching, so each name has blanks and dashes squeezed out.
    my %loose_names_ending_in_code_point = (
        'CJKCOMPATIBILITYIDEOGRAPH' => {
            'high' => [ 64109, 64217, 195101 ],
            'low'  => [ 63744, 64112, 194560 ],
        },
        'CJKUNIFIEDIDEOGRAPH' => {
            'high' => [ 19893, 40908, 173782, 177972, 178205 ],
            'low'  => [ 13312, 19968, 131072, 173824, 177984 ],
        },
    );

    # And the following array gives the inverse mapping from code points to
    # names.  Lowest code points are first; code_point_to_name_special()
    # relies on that ordering to stop searching early.
    my @code_points_ending_in_code_point = (
        { 'high' => 19893,  'low' => 13312,  'name' => 'CJK UNIFIED IDEOGRAPH' },
        { 'high' => 40908,  'low' => 19968,  'name' => 'CJK UNIFIED IDEOGRAPH' },
        { 'high' => 64109,  'low' => 63744,  'name' => 'CJK COMPATIBILITY IDEOGRAPH' },
        { 'high' => 64217,  'low' => 64112,  'name' => 'CJK COMPATIBILITY IDEOGRAPH' },
        { 'high' => 173782, 'low' => 131072, 'name' => 'CJK UNIFIED IDEOGRAPH' },
        { 'high' => 177972, 'low' => 173824, 'name' => 'CJK UNIFIED IDEOGRAPH' },
        { 'high' => 178205, 'low' => 177984, 'name' => 'CJK UNIFIED IDEOGRAPH' },
        { 'high' => 195101, 'low' => 194560, 'name' => 'CJK COMPATIBILITY IDEOGRAPH' },
    );

    # Convert from code point to Jamo short name for use in composing Hangul
    # syllable names.
    my %Jamo = (
        # Keys starting at $LBase (0x1100 == 4352)
        4352 => 'G',   4353 => 'GG',  4354 => 'N',   4355 => 'D',
        4356 => 'DD',  4357 => 'R',   4358 => 'M',   4359 => 'B',
        4360 => 'BB',  4361 => 'S',   4362 => 'SS',  4363 => '',
        4364 => 'J',   4365 => 'JJ',  4366 => 'C',   4367 => 'K',
        4368 => 'T',   4369 => 'P',   4370 => 'H',

        # Keys starting at $VBase (0x1161 == 4449)
        4449 => 'A',   4450 => 'AE',  4451 => 'YA',  4452 => 'YAE',
        4453 => 'EO',  4454 => 'E',   4455 => 'YEO', 4456 => 'YE',
        4457 => 'O',   4458 => 'WA',  4459 => 'WAE', 4460 => 'OE',
        4461 => 'YO',  4462 => 'U',   4463 => 'WEO', 4464 => 'WE',
        4465 => 'WI',  4466 => 'YU',  4467 => 'EU',  4468 => 'YI',
        4469 => 'I',

        # Keys starting at $TBase + 1 (0x11A8 == 4520); $TBase itself means
        # "no trailing consonant" and so has no entry
        4520 => 'G',   4521 => 'GG',  4522 => 'GS',  4523 => 'N',
        4524 => 'NJ',  4525 => 'NH',  4526 => 'D',   4527 => 'L',
        4528 => 'LG',  4529 => 'LM',  4530 => 'LB',  4531 => 'LS',
        4532 => 'LT',  4533 => 'LP',  4534 => 'LH',  4535 => 'M',
        4536 => 'B',   4537 => 'BS',  4538 => 'S',   4539 => 'SS',
        4540 => 'NG',  4541 => 'J',   4542 => 'C',   4543 => 'K',
        4544 => 'T',   4545 => 'P',   4546 => 'H',
    );

    # Leading consonant (can be null)
    my %Jamo_L = (
        ''   => 11, 'B'  => 7,  'BB' => 8,  'C'  => 14, 'D'  => 3,
        'DD' => 4,  'G'  => 0,  'GG' => 1,  'H'  => 18, 'J'  => 12,
        'JJ' => 13, 'K'  => 15, 'M'  => 6,  'N'  => 2,  'P'  => 17,
        'R'  => 5,  'S'  => 9,  'SS' => 10, 'T'  => 16,
    );

    # Vowel
    my %Jamo_V = (
        'A'   => 0,  'AE'  => 1,  'E'   => 5,  'EO'  => 4,  'EU'  => 18,
        'I'   => 20, 'O'   => 8,  'OE'  => 11, 'U'   => 13, 'WA'  => 9,
        'WAE' => 10, 'WE'  => 15, 'WEO' => 14, 'WI'  => 16, 'YA'  => 2,
        'YAE' => 3,  'YE'  => 7,  'YEO' => 6,  'YI'  => 19, 'YO'  => 12,
        'YU'  => 17,
    );

    # Optional trailing consonant
    my %Jamo_T = (
        'B'  => 17, 'BS' => 18, 'C'  => 23, 'D'  => 7,  'G'  => 1,
        'GG' => 2,  'GS' => 3,  'H'  => 27, 'J'  => 22, 'K'  => 24,
        'L'  => 8,  'LB' => 11, 'LG' => 9,  'LH' => 15, 'LM' => 10,
        'LP' => 14, 'LS' => 12, 'LT' => 13, 'M'  => 16, 'N'  => 4,
        'NG' => 21, 'NH' => 6,  'NJ' => 5,  'P'  => 26, 'S'  => 19,
        'SS' => 20, 'T'  => 25,
    );

    # Computed re that splits up a Hangul name into LVT or LV syllables
    my $syllable_re = qr/(|B|BB|C|D|DD|G|GG|H|J|JJ|K|M|N|P|R|S|SS|T)(A|AE|E|EO|EU|I|O|OE|U|WA|WAE|WE|WEO|WI|YA|YAE|YE|YEO|YI|YO|YU)(B|BS|C|D|G|GG|GS|H|J|K|L|LB|LG|LH|LM|LP|LS|LT|M|N|NG|NH|NJ|P|S|SS|T)?/;

    my $HANGUL_SYLLABLE = "HANGUL SYLLABLE ";
    my $loose_HANGUL_SYLLABLE = "HANGULSYLLABLE";

    # These constant names and values were taken from the Unicode standard,
    # version 5.1, section 3.12.  They are used in conjunction with Hangul
    # syllables.
    my $SBase = 0xAC00;
    my $LBase = 0x1100;
    my $VBase = 0x1161;
    my $TBase = 0x11A7;
    my $SCount = 11172;
    my $LCount = 19;
    my $VCount = 21;
    my $TCount = 28;
    my $NCount = $VCount * $TCount;

    # name_to_code_point_special($name, $loose)
    #
    # Returns undef (empty list in list context) if $name is not one of the
    # specially handled names; otherwise returns the code point equivalent to
    # the input name.
    #
    # $loose is true if loose matching is to be used; $name in that case must
    # be input as upper case with all blanks and dashes squeezed out.
    sub name_to_code_point_special {
        my ($name, $loose) = @_;

        # A Hangul syllable name must BEGIN with the prefix, so the
        # substitution is anchored.  Unanchored, an input such as
        # 'GHANGUL SYLLABLE A' would collapse to 'GA' and be wrongly
        # accepted as U+AC00.
        my $hangul_prefix = $loose ? $loose_HANGUL_SYLLABLE : $HANGUL_SYLLABLE;
        if ($name =~ s/\A\Q$hangul_prefix\E//) {

            # What remains must be exactly one LV or LVT syllable.
            return if $name !~ /^$syllable_re$/;
            my ($lead, $vowel, $trail) = ($1, $2, $3);

            my $L = $Jamo_L{$lead};
            my $V = $Jamo_V{$vowel};
            my $T = (defined $trail) ? $Jamo_T{$trail} : 0;
            return ($L * $VCount + $V) * $TCount + $T + $SBase;
        }

        # Name must end in 'code_point' for this to handle.  Loose names have
        # had their separators removed, so they need the run-on form of the
        # pattern and a minimal-match base.
        my $has_code_point = $loose
            ? $name =~ /^ (.*?) ($run_on_code_point_re) $/x
            : $name =~ /^ (.*) ($code_point_re) $/x;
        return if ! $has_code_point;

        my $base = $1;
        my $code_point = CORE::hex $2;
        my $names_ref;

        if ($loose) {
            $names_ref = \%loose_names_ending_in_code_point;
        }
        else {
            # In strict matching the base and the code point must be
            # separated by a dash, which is not part of the hash key.
            return if $base !~ s/-$//;
            $names_ref = \%names_ending_in_code_point;
        }

        # Name must be one of the ones which has the code point in it.
        my $ranges = $names_ref->{$base};
        return if ! $ranges;

        # Look through the list of ranges that apply to this name to see if
        # the code point is in one of them.  The ranges are in ascending
        # order, so once a range starts above the code point, none can match.
        my $lows  = $ranges->{'low'};
        my $highs = $ranges->{'high'};
        for my $i (0 .. $#{$lows}) {
            return if $lows->[$i] > $code_point;
            next if $highs->[$i] < $code_point;

            # Here, the code point is in the range.
            return $code_point;
        }

        # Here, looked like the name had a code point number in it, but
        # did not match one of the valid ones.
        return;
    }

    # code_point_to_name_special($code_point)
    #
    # Returns the name of a code point if algorithmically determinable;
    # undef (empty list in list context) if not.
    sub code_point_to_name_special {
        my $code_point = shift;

        # If in the Hangul range, calculate the name based on Unicode's
        # algorithm
        if ($code_point >= $SBase && $code_point <= $SBase + $SCount - 1) {
            use integer;    # the algorithm needs truncating division
            my $SIndex = $code_point - $SBase;
            my $L = $LBase + $SIndex / $NCount;
            my $V = $VBase + ($SIndex % $NCount) / $TCount;
            my $T = $TBase + $SIndex % $TCount;

            # Lexical, so that the lookup does not write to the package
            # global $charnames::name.
            my $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}";

            # $T == $TBase means the syllable has no trailing consonant.
            $name .= $Jamo{$T} if $T != $TBase;
            return $name;
        }

        # Look through list of these code points for one in range.  The list
        # is sorted by 'low', so falling below a range's start means the code
        # point lies in a gap and nothing later can match.
        foreach my $range (@code_points_ending_in_code_point) {
            return if $code_point < $range->{'low'};
            if ($code_point <= $range->{'high'}) {
                return sprintf("%s-%04X", $range->{'name'}, $code_point);
            }
        }
        return;            # None found
    }
} # End closure

1;