xref: /openbsd-src/gnu/usr.bin/perl/lib/unicore/Name.pm (revision 50b7afb2c2c0993b0894d4e34bf857cb13ed9c80)
1# !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
2# This file is machine-generated by lib/unicore/mktables from the Unicode
3# database, Version 6.2.0.  Any changes made here will be lost!
4
5
6# !!!!!!!   INTERNAL PERL USE ONLY   !!!!!!!
7# This file is for internal use by core Perl only.  The format and even the
8# name or existence of this file are subject to change without notice.  Don't
9# use it directly.
10
11
12package charnames;
13
# This module contains machine-generated tables and code for the
# algorithmically-determinable Unicode character names.  The routines below
# translate from a name to its code point, and from a code point to its name.
17
{ # Closure

    # Everything in this closure is lexical; strict catches any accidental
    # use of a package global.
    use strict;

    # Matches legal code point.  4-6 hex digits.  If there are 6, the first
    # two must be 10; if there are 5, the first must not be a 0.  Written this
    # way to decrease backtracking.  The first regex allows the code point to
    # be at the end of a word, but to work properly, the word shouldn't end
    # with a valid hex character.  The second one won't match a code point at
    # the end of a word, and doesn't have the run-on issue.
    my $run_on_code_point_re = qr/(?^aax: (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b)/;
    my $code_point_re = qr/(?^aa:\b(?^aax: (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b))/;

    # In the following hash, the keys are the bases of names which include
    # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01.  The value
    # of each key is another hash which is used to get the low and high ends
    # for each range of code points that apply to the name.  The 'low' and
    # 'high' arrays are parallel and sorted in ascending order.
    my %names_ending_in_code_point = (
        'CJK COMPATIBILITY IDEOGRAPH' => {
            'high' => [
                64109,
                64217,
                195101,
            ],
            'low' => [
                63744,
                64112,
                194560,
            ],
        },
        'CJK UNIFIED IDEOGRAPH' => {
            'high' => [
                19893,
                40908,
                173782,
                177972,
                178205,
            ],
            'low' => [
                13312,
                19968,
                131072,
                173824,
                177984,
            ],
        },
    );

    # The following hash is a copy of the previous one, except is for loose
    # matching, so each name has blanks and dashes squeezed out.
    my %loose_names_ending_in_code_point = (
        'CJKCOMPATIBILITYIDEOGRAPH' => {
            'high' => [
                64109,
                64217,
                195101,
            ],
            'low' => [
                63744,
                64112,
                194560,
            ],
        },
        'CJKUNIFIEDIDEOGRAPH' => {
            'high' => [
                19893,
                40908,
                173782,
                177972,
                178205,
            ],
            'low' => [
                13312,
                19968,
                131072,
                173824,
                177984,
            ],
        },
    );

    # And the following array gives the inverse mapping from code points to
    # names.  Lowest code points are first.
    my @code_points_ending_in_code_point = (
        {
            'high' => 19893,
            'low'  => 13312,
            'name' => 'CJK UNIFIED IDEOGRAPH',
        },
        {
            'high' => 40908,
            'low'  => 19968,
            'name' => 'CJK UNIFIED IDEOGRAPH',
        },
        {
            'high' => 64109,
            'low'  => 63744,
            'name' => 'CJK COMPATIBILITY IDEOGRAPH',
        },
        {
            'high' => 64217,
            'low'  => 64112,
            'name' => 'CJK COMPATIBILITY IDEOGRAPH',
        },
        {
            'high' => 173782,
            'low'  => 131072,
            'name' => 'CJK UNIFIED IDEOGRAPH',
        },
        {
            'high' => 177972,
            'low'  => 173824,
            'name' => 'CJK UNIFIED IDEOGRAPH',
        },
        {
            'high' => 178205,
            'low'  => 177984,
            'name' => 'CJK UNIFIED IDEOGRAPH',
        },
        {
            'high' => 195101,
            'low'  => 194560,
            'name' => 'CJK COMPATIBILITY IDEOGRAPH',
        },
    );

    # Convert from code point to Jamo short name for use in composing Hangul
    # syllable names.
    my %Jamo = (
        # Leading consonants
        4352 => 'G',
        4353 => 'GG',
        4354 => 'N',
        4355 => 'D',
        4356 => 'DD',
        4357 => 'R',
        4358 => 'M',
        4359 => 'B',
        4360 => 'BB',
        4361 => 'S',
        4362 => 'SS',
        4363 => '',
        4364 => 'J',
        4365 => 'JJ',
        4366 => 'C',
        4367 => 'K',
        4368 => 'T',
        4369 => 'P',
        4370 => 'H',

        # Vowels
        4449 => 'A',
        4450 => 'AE',
        4451 => 'YA',
        4452 => 'YAE',
        4453 => 'EO',
        4454 => 'E',
        4455 => 'YEO',
        4456 => 'YE',
        4457 => 'O',
        4458 => 'WA',
        4459 => 'WAE',
        4460 => 'OE',
        4461 => 'YO',
        4462 => 'U',
        4463 => 'WEO',
        4464 => 'WE',
        4465 => 'WI',
        4466 => 'YU',
        4467 => 'EU',
        4468 => 'YI',
        4469 => 'I',

        # Trailing consonants
        4520 => 'G',
        4521 => 'GG',
        4522 => 'GS',
        4523 => 'N',
        4524 => 'NJ',
        4525 => 'NH',
        4526 => 'D',
        4527 => 'L',
        4528 => 'LG',
        4529 => 'LM',
        4530 => 'LB',
        4531 => 'LS',
        4532 => 'LT',
        4533 => 'LP',
        4534 => 'LH',
        4535 => 'M',
        4536 => 'B',
        4537 => 'BS',
        4538 => 'S',
        4539 => 'SS',
        4540 => 'NG',
        4541 => 'J',
        4542 => 'C',
        4543 => 'K',
        4544 => 'T',
        4545 => 'P',
        4546 => 'H',
    );

    # Leading consonant (can be null)
    my %Jamo_L = (
        ''   => 11,
        'B'  => 7,
        'BB' => 8,
        'C'  => 14,
        'D'  => 3,
        'DD' => 4,
        'G'  => 0,
        'GG' => 1,
        'H'  => 18,
        'J'  => 12,
        'JJ' => 13,
        'K'  => 15,
        'M'  => 6,
        'N'  => 2,
        'P'  => 17,
        'R'  => 5,
        'S'  => 9,
        'SS' => 10,
        'T'  => 16,
    );

    # Vowel
    my %Jamo_V = (
        'A'   => 0,
        'AE'  => 1,
        'E'   => 5,
        'EO'  => 4,
        'EU'  => 18,
        'I'   => 20,
        'O'   => 8,
        'OE'  => 11,
        'U'   => 13,
        'WA'  => 9,
        'WAE' => 10,
        'WE'  => 15,
        'WEO' => 14,
        'WI'  => 16,
        'YA'  => 2,
        'YAE' => 3,
        'YE'  => 7,
        'YEO' => 6,
        'YI'  => 19,
        'YO'  => 12,
        'YU'  => 17,
    );

    # Optional trailing consonant
    my %Jamo_T = (
        'B'  => 17,
        'BS' => 18,
        'C'  => 23,
        'D'  => 7,
        'G'  => 1,
        'GG' => 2,
        'GS' => 3,
        'H'  => 27,
        'J'  => 22,
        'K'  => 24,
        'L'  => 8,
        'LB' => 11,
        'LG' => 9,
        'LH' => 15,
        'LM' => 10,
        'LP' => 14,
        'LS' => 12,
        'LT' => 13,
        'M'  => 16,
        'N'  => 4,
        'NG' => 21,
        'NH' => 6,
        'NJ' => 5,
        'P'  => 26,
        'S'  => 19,
        'SS' => 20,
        'T'  => 25,
    );

    # Computed re that splits up a Hangul name into LVT or LV syllables
    my $syllable_re = qr/(|B|BB|C|D|DD|G|GG|H|J|JJ|K|M|N|P|R|S|SS|T)(A|AE|E|EO|EU|I|O|OE|U|WA|WAE|WE|WEO|WI|YA|YAE|YE|YEO|YI|YO|YU)(B|BS|C|D|G|GG|GS|H|J|K|L|LB|LG|LH|LM|LP|LS|LT|M|N|NG|NH|NJ|P|S|SS|T)?/;

    # The same re, required to match the whole string.  Compiled once here
    # rather than on every call.
    my $whole_syllable_re = qr/^$syllable_re$/;

    my $HANGUL_SYLLABLE = "HANGUL SYLLABLE ";
    my $loose_HANGUL_SYLLABLE = "HANGULSYLLABLE";

    # These constants names and values were taken from the Unicode standard,
    # version 5.1, section 3.12.  They are used in conjunction with Hangul
    # syllables
    my $SBase = 0xAC00;
    my $LBase = 0x1100;
    my $VBase = 0x1161;
    my $TBase = 0x11A7;
    my $SCount = 11172;
    my $LCount = 19;
    my $VCount = 21;
    my $TCount = 28;
    my $NCount = $VCount * $TCount;

    # name_to_code_point_special($name, $loose)
    #
    # Returns the code point equivalent to the input name if it is one of the
    # specially handled (algorithmically determinable) names; otherwise
    # returns undef (an empty list in list context).
    #
    # $loose is non-zero if to use loose matching; 'name' in that case must
    # be input as upper case with all blanks and dashes squeezed out.
    sub name_to_code_point_special {
        my ($name, $loose) = @_;

        # Hangul syllables.  The prefix must be at the very beginning of the
        # name; stripping it from the middle would let a bogus name such as
        # "GHANGUL SYLLABLE A" collapse into the valid syllable "GA".
        my $hangul_prefix = $loose ? $loose_HANGUL_SYLLABLE
                                   : $HANGUL_SYLLABLE;
        if ($name =~ s/^\Q$hangul_prefix\E//) {

            # What remains must be exactly one L?VT? Jamo sequence.
            return if $name !~ $whole_syllable_re;
            my $L = $Jamo_L{$1};
            my $V = $Jamo_V{$2};
            my $T = (defined $3) ? $Jamo_T{$3} : 0;
            return ($L * $VCount + $V) * $TCount + $T + $SBase;
        }

        # Name must end in 'code_point' for this to handle.  Loose names have
        # no separator before the code point, so the run-on form of the
        # pattern with a minimal prefix is used for them.
        my $ends_in_code_point = $loose
                    ? $name =~ /^ (.*?) ($run_on_code_point_re) $/x
                    : $name =~ /^ (.*) ($code_point_re) $/x;
        return if ! $ends_in_code_point;

        my $base = $1;
        my $code_point = CORE::hex $2;
        my $names_ref;

        if ($loose) {
            $names_ref = \%loose_names_ending_in_code_point;
        }
        else {

            # A strict name must separate the base and the code point with a
            # single hyphen.
            return if $base !~ s/-$//;
            $names_ref = \%names_ending_in_code_point;
        }

        # Name must be one of the ones which has the code point in it.
        my $ranges = $names_ref->{$base};
        return if ! $ranges;

        # Look through the list of ranges that apply to this name to see if
        # the code point is in one of them.  The ranges are in ascending
        # order, so the search can stop at the first range that begins above
        # the code point.
        my $lows  = $ranges->{'low'};
        my $highs = $ranges->{'high'};
        for my $i (0 .. $#{$lows}) {
            return if $lows->[$i] > $code_point;
            next if $highs->[$i] < $code_point;

            # Here, the code point is in the range.
            return $code_point;
        }

        # Here, looked like the name had a code point number in it, but
        # did not match one of the valid ones.
        return;
    }

    # code_point_to_name_special($code_point)
    #
    # Returns the name of a code point if algorithmically determinable;
    # undef (an empty list in list context) if not.
    sub code_point_to_name_special {
        my $code_point = shift;

        # If in the Hangul range, calculate the name based on Unicode's
        # algorithm
        if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) {
            use integer;
            my $SIndex = $code_point - $SBase;
            my $L = $LBase + $SIndex / $NCount;
            my $V = $VBase + ($SIndex % $NCount) / $TCount;
            my $T = $TBase + $SIndex % $TCount;

            # $name must be lexical; without 'my' this would overwrite the
            # package variable of the same name.
            my $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}";

            # $T == $TBase means the syllable has no trailing consonant.
            $name .= $Jamo{$T} if $T != $TBase;
            return $name;
        }

        # Look through list of these code points for one in range.  The list
        # is in ascending order, so stop at the first range that begins above
        # the code point.
        foreach my $hash (@code_points_ending_in_code_point) {
            return if $code_point < $hash->{'low'};
            if ($code_point <= $hash->{'high'}) {
                return sprintf("%s-%04X", $hash->{'name'}, $code_point);
            }
        }
        return;            # None found
    }
} # End closure
414
4151;
416