lib/unicore/Name.pm

# !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
# This file is machine-generated by lib/unicore/mktables from the Unicode
# database, Version 13.0.0.  Any changes made here will be lost!


# !!!!!!!   INTERNAL PERL USE ONLY   !!!!!!!
# This file is for internal use by core Perl only.  The format and even the
# name or existence of this file are subject to change without notice.  Don't
# use it directly.  Use Unicode::UCD to access the Unicode character data
# base.


package charnames;

# This module contains machine-generated tables and code for the
# algorithmically-determinable Unicode character names.  The following
# routines can be used to translate between name and code point, in either
# direction.

{ # Closure

    # Regexes that recognize the hexadecimal code point that ends an
    # algorithmically-derived name.  A legal code point is 4 to 6 upper-case
    # hex digits: if there are 6, the first two must be "10"; if there are 5,
    # the first must not be "0".  Written this way to decrease backtracking.
    # Both patterns require a word boundary after the last digit.
    #
    # $run_on_code_point_re has no word-boundary assertion in front of the
    # digits, so the code point may run on directly from the end of a word;
    # to work properly, that word shouldn't itself end with a valid hex
    # character.
    #
    # $code_point_re also requires a word boundary before the first digit, so
    # it won't match a code point glued onto the end of a word, and doesn't
    # have the run-on issue.
    my $run_on_code_point_re = qr/(?^aax: (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b)/;
    my $code_point_re = qr/(?^aa:\b(?^aax: (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b))/;

    # Names that are formed as "<base>-<code point in hex>", such as
    # CJK UNIFIED IDEOGRAPH-4E01.  Each key is one such base.  Its value holds
    # two parallel arrays: 'low'->[i] and 'high'->[i] are the inclusive ends
    # of the i-th range of code points that use the base.  The ranges are
    # listed in ascending order.
    my %names_ending_in_code_point = (
        'CJK COMPATIBILITY IDEOGRAPH' => {
            'low'  => [ 63744, 64112, 194560 ],
            'high' => [ 64109, 64217, 195101 ],
        },
        'CJK UNIFIED IDEOGRAPH' => {
            'low'  => [ 13312, 19968, 131072, 173824,
                        177984, 178208, 183984, 196608 ],
            'high' => [ 19903, 40956, 173789, 177972,
                        178205, 183969, 191456, 201546 ],
        },
        'KHITAN SMALL SCRIPT CHARACTER' => {
            'low'  => [ 101120 ],
            'high' => [ 101589 ],
        },
        'NUSHU CHARACTER' => {
            'low'  => [ 110960 ],
            'high' => [ 111355 ],
        },
        'TANGUT IDEOGRAPH' => {
            'low'  => [ 94208 ],
            'high' => [ 100343 ],
        },
        'TANGUT IDEOGRAPH SUPPLEMENT' => {
            'low'  => [ 101632 ],
            'high' => [ 101640 ],
        },
    );

    # Loose-matching counterpart of %names_ending_in_code_point: the same
    # ranges, but each key is the base name with its blanks and dashes
    # squeezed out.
    my %loose_names_ending_in_code_point = (
        'CJKCOMPATIBILITYIDEOGRAPH' => {
            'low'  => [ 63744, 64112, 194560 ],
            'high' => [ 64109, 64217, 195101 ],
        },
        'CJKUNIFIEDIDEOGRAPH' => {
            'low'  => [ 13312, 19968, 131072, 173824,
                        177984, 178208, 183984, 196608 ],
            'high' => [ 19903, 40956, 173789, 177972,
                        178205, 183969, 191456, 201546 ],
        },
        'KHITANSMALLSCRIPTCHARACTER' => {
            'low'  => [ 101120 ],
            'high' => [ 101589 ],
        },
        'NUSHUCHARACTER' => {
            'low'  => [ 110960 ],
            'high' => [ 111355 ],
        },
        'TANGUTIDEOGRAPH' => {
            'low'  => [ 94208 ],
            'high' => [ 100343 ],
        },
        'TANGUTIDEOGRAPHSUPPLEMENT' => {
            'low'  => [ 101632 ],
            'high' => [ 101640 ],
        },
    );

    # Inverse mapping, from code point to name.  Each element describes one
    # range: 'low' and 'high' are its inclusive ends and 'name' is the base
    # that gets "-<hex code point>" appended.  Elements are ordered by
    # ascending code point.  'legal' is not used in this file; it appears to
    # list the characters that can occur in a name from the range (plus a
    # leading newline) -- NOTE(review): confirm against the consumer.
    @code_points_ending_in_code_point = (
        {
            'low' => 13312, 'high' => 19903,
            'name'  => 'CJK UNIFIED IDEOGRAPH',
            'legal' => "\n -0123456789ABCDEFGHIJKNOPRU",
        },
        {
            'low' => 19968, 'high' => 40956,
            'name'  => 'CJK UNIFIED IDEOGRAPH',
            'legal' => "\n -0123456789ABCDEFGHIJKNOPRU",
        },
        {
            'low' => 63744, 'high' => 64109,
            'name'  => 'CJK COMPATIBILITY IDEOGRAPH',
            'legal' => "\n -0123456789ABCDEFGHIJKLMOPRTY",
        },
        {
            'low' => 64112, 'high' => 64217,
            'name'  => 'CJK COMPATIBILITY IDEOGRAPH',
            'legal' => "\n -0123456789ABCDEFGHIJKLMOPRTY",
        },
        {
            'low' => 94208, 'high' => 100343,
            'name'  => 'TANGUT IDEOGRAPH',
            'legal' => "\n -0123456789ABCDEFGHINOPRTU",
        },
        {
            'low' => 101120, 'high' => 101589,
            'name'  => 'KHITAN SMALL SCRIPT CHARACTER',
            'legal' => "\n -0123456789ABCDEFHIKLMNPRST",
        },
        {
            'low' => 101632, 'high' => 101640,
            'name'  => 'TANGUT IDEOGRAPH SUPPLEMENT',
            'legal' => "\n -0123456789ABCDEFGHILMNOPRSTU",
        },
        {
            'low' => 110960, 'high' => 111355,
            'name'  => 'NUSHU CHARACTER',
            'legal' => "\n -0123456789ABCDEFHNRSTU",
        },
        {
            'low' => 131072, 'high' => 173789,
            'name'  => 'CJK UNIFIED IDEOGRAPH',
            'legal' => "\n -0123456789ABCDEFGHIJKNOPRU",
        },
        {
            'low' => 173824, 'high' => 177972,
            'name'  => 'CJK UNIFIED IDEOGRAPH',
            'legal' => "\n -0123456789ABCDEFGHIJKNOPRU",
        },
        {
            'low' => 177984, 'high' => 178205,
            'name'  => 'CJK UNIFIED IDEOGRAPH',
            'legal' => "\n -0123456789ABCDEFGHIJKNOPRU",
        },
        {
            'low' => 178208, 'high' => 183969,
            'name'  => 'CJK UNIFIED IDEOGRAPH',
            'legal' => "\n -0123456789ABCDEFGHIJKNOPRU",
        },
        {
            'low' => 183984, 'high' => 191456,
            'name'  => 'CJK UNIFIED IDEOGRAPH',
            'legal' => "\n -0123456789ABCDEFGHIJKNOPRU",
        },
        {
            'low' => 194560, 'high' => 195101,
            'name'  => 'CJK COMPATIBILITY IDEOGRAPH',
            'legal' => "\n -0123456789ABCDEFGHIJKLMOPRTY",
        },
        {
            'low' => 196608, 'high' => 201546,
            'name'  => 'CJK UNIFIED IDEOGRAPH',
            'legal' => "\n -0123456789ABCDEFGHIJKNOPRU",
        },
    );

    # The array is exportable, so make it read-only
    Internals::SvREADONLY(@code_points_ending_in_code_point, 1);

    # Maps the code point of a Jamo to its short name; the short names are
    # concatenated to compose Hangul syllable names.
    my %Jamo = (

        # Leading consonants (one of them has an empty short name)
        4352 => 'G',   4353 => 'GG',  4354 => 'N',   4355 => 'D',
        4356 => 'DD',  4357 => 'R',   4358 => 'M',   4359 => 'B',
        4360 => 'BB',  4361 => 'S',   4362 => 'SS',  4363 => '',
        4364 => 'J',   4365 => 'JJ',  4366 => 'C',   4367 => 'K',
        4368 => 'T',   4369 => 'P',   4370 => 'H',

        # Vowels
        4449 => 'A',   4450 => 'AE',  4451 => 'YA',  4452 => 'YAE',
        4453 => 'EO',  4454 => 'E',   4455 => 'YEO', 4456 => 'YE',
        4457 => 'O',   4458 => 'WA',  4459 => 'WAE', 4460 => 'OE',
        4461 => 'YO',  4462 => 'U',   4463 => 'WEO', 4464 => 'WE',
        4465 => 'WI',  4466 => 'YU',  4467 => 'EU',  4468 => 'YI',
        4469 => 'I',

        # Trailing consonants
        4520 => 'G',   4521 => 'GG',  4522 => 'GS',  4523 => 'N',
        4524 => 'NJ',  4525 => 'NH',  4526 => 'D',   4527 => 'L',
        4528 => 'LG',  4529 => 'LM',  4530 => 'LB',  4531 => 'LS',
        4532 => 'LT',  4533 => 'LP',  4534 => 'LH',  4535 => 'M',
        4536 => 'B',   4537 => 'BS',  4538 => 'S',   4539 => 'SS',
        4540 => 'NG',  4541 => 'J',   4542 => 'C',   4543 => 'K',
        4544 => 'T',   4545 => 'P',   4546 => 'H',
    );

    # Index of each leading consonant, keyed by its short name (which can be
    # the empty string).  Listed here in index order.
    my %Jamo_L = (
        'G'  => 0,   'GG' => 1,   'N'  => 2,   'D'  => 3,   'DD' => 4,
        'R'  => 5,   'M'  => 6,   'B'  => 7,   'BB' => 8,   'S'  => 9,
        'SS' => 10,  ''   => 11,  'J'  => 12,  'JJ' => 13,  'C'  => 14,
        'K'  => 15,  'T'  => 16,  'P'  => 17,  'H'  => 18,
    );

    # Index of each vowel, keyed by its short name.  Listed here in index
    # order.
    my %Jamo_V = (
        'A'   => 0,   'AE'  => 1,   'YA'  => 2,   'YAE' => 3,
        'EO'  => 4,   'E'   => 5,   'YEO' => 6,   'YE'  => 7,
        'O'   => 8,   'WA'  => 9,   'WAE' => 10,  'OE'  => 11,
        'YO'  => 12,  'U'   => 13,  'WEO' => 14,  'WE'  => 15,
        'WI'  => 16,  'YU'  => 17,  'EU'  => 18,  'YI'  => 19,
        'I'   => 20,
    );

    # Index of each trailing consonant, keyed by its short name.  The
    # trailing consonant is optional in a syllable; index 0 is not in this
    # table because it stands for "none".  Listed here in index order.
    my %Jamo_T = (
        'G'  => 1,   'GG' => 2,   'GS' => 3,   'N'  => 4,   'NJ' => 5,
        'NH' => 6,   'D'  => 7,   'L'  => 8,   'LG' => 9,   'LM' => 10,
        'LB' => 11,  'LS' => 12,  'LT' => 13,  'LP' => 14,  'LH' => 15,
        'M'  => 16,  'B'  => 17,  'BS' => 18,  'S'  => 19,  'SS' => 20,
        'NG' => 21,  'J'  => 22,  'C'  => 23,  'K'  => 24,  'T'  => 25,
        'P'  => 26,  'H'  => 27,
    );

    # Computed re that breaks the part of a Hangul name that follows the
    # prefix into its leading consonant ($1, may be empty), vowel ($2) and
    # optional trailing consonant ($3)
    my $syllable_re = qr/(|B|BB|C|D|DD|G|GG|H|J|JJ|K|M|N|P|R|S|SS|T)(A|AE|E|EO|EU|I|O|OE|U|WA|WAE|WE|WEO|WI|YA|YAE|YE|YEO|YI|YO|YU)(B|BS|C|D|G|GG|GS|H|J|K|L|LB|LG|LH|LM|LP|LS|LT|M|N|NG|NH|NJ|P|S|SS|T)?/;

    # Prefix of every Hangul syllable name, in strict and in loose form
    my ($HANGUL_SYLLABLE, $loose_HANGUL_SYLLABLE)
                                    = ("HANGUL SYLLABLE ", "HANGULSYLLABLE");

    # The names and values of these constants come from the Unicode standard,
    # version 5.1, section 3.12.  They are used together with the Hangul
    # syllable tables.
    my ($SBase, $LBase, $VBase, $TBase) = (0xAC00, 0x1100, 0x1161, 0x11A7);
    my ($LCount, $VCount, $TCount) = (19, 21, 28);
    my $NCount = $VCount * $TCount;
    my $SCount = 11172;

    sub name_to_code_point_special {
        my ($name, $loose) = @_;

        # Returns undef if not one of the specially handled names; otherwise
        # returns the code point equivalent to the input name.
        #
        # $name  is the character name to look up.
        # $loose is non-zero if to use loose matching; 'name' in that case
        #        must be input as upper case with all blanks and dashes
        #        squeezed out.
        #
        # Two families of names are handled: Hangul syllables, whose code
        # point is computed from the Jamo short names that make up the name;
        # and names that end in their own code point in hex, which must fall
        # in one of the ranges listed for the name's base.

        # The Hangul prefix has to start the name, so anchor the removal.
        # Taking it out of the middle would let a name such as
        # "GHANGUL SYLLABLE A" be read as the syllable "GA".
        my $hangul_prefix = $loose ? $loose_HANGUL_SYLLABLE : $HANGUL_SYLLABLE;
        if ($name =~ s/^\Q$hangul_prefix\E//) {

            # What is left must be exactly one LV or LVT syllable
            return if $name !~ /^$syllable_re$/;
            my $L = $Jamo_L{$1};
            my $V = $Jamo_V{$2};
            my $T = (defined $3) ? $Jamo_T{$3} : 0;   # 0 is "no trailing"
            return ($L * $VCount + $V) * $TCount + $T + $SBase;
        }

        # Name must end in 'code_point' for this to handle.  Loose names have
        # had their dashes removed, so there the code point runs on directly
        # from the base; the minimal match gives it the longest legal run of
        # hex digits.
        my $ends_in_code_point
                        = ($loose)
                          ? $name =~ /^ (.*?) ($run_on_code_point_re) $/x
                          : $name =~ /^ (.*) ($code_point_re) $/x;
        return if ! $ends_in_code_point;

        my $base = $1;
        my $code_point = CORE::hex $2;
        my $names_ref;

        if ($loose) {
            $names_ref = \%loose_names_ending_in_code_point;
        }
        else {

            # A strict name separates base and code point with a dash
            return if $base !~ s/-$//;
            $names_ref = \%names_ending_in_code_point;
        }

        # Name must be one of the ones which has the code point in it.
        my $ranges = $names_ref->{$base};
        return if ! $ranges;

        # Look through the list of ranges that apply to this name to see if
        # the code point is in one of them.  The ranges are in ascending
        # order, so the search can stop at the first one that starts above
        # the code point.
        for my $i (0 .. $#{ $ranges->{'low'} }) {
            return if $ranges->{'low'}[$i] > $code_point;
            next if $ranges->{'high'}[$i] < $code_point;

            # Here, the code point is in the range.
            return $code_point;
        }

        # Here, looked like the name had a code point number in it, but
        # did not match one of the valid ones.
        return;
    }

    sub code_point_to_name_special {
        my $code_point = shift;

        # Returns the name of a code point if algorithmically determinable;
        # undef if not.
        #
        # $code_point is the numeric code point to name.

        # If in the Hangul range, calculate the name based on Unicode's
        # algorithm
        if ($code_point >= $SBase && $code_point <= $SBase + $SCount - 1) {
            use integer;    # The divisions below must truncate
            my $SIndex = $code_point - $SBase;
            my $L = $LBase + $SIndex / $NCount;
            my $V = $VBase + ($SIndex % $NCount) / $TCount;
            my $T = $TBase + $SIndex % $TCount;

            # Keep the name lexical; without 'my' each call would write to
            # the package global $name.
            my $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}";

            # $T equal to $TBase means there is no trailing consonant
            $name .= $Jamo{$T} if $T != $TBase;
            return $name;
        }

        # Look through list of these code points for one in range.  The list
        # is in ascending order, so stop at the first range that starts above
        # the code point.
        foreach my $range (@code_points_ending_in_code_point) {
            return if $code_point < $range->{'low'};
            if ($code_point <= $range->{'high'}) {
                return sprintf("%s-%04X", $range->{'name'}, $code_point);
            }
        }
        return;            # None found
    }
} # End closure

1;