xref: /openbsd-src/gnu/usr.bin/perl/lib/unicore/Name.pm (revision f1dd7b858388b4a23f4f67a4957ec5ff656ebbe8)
1# !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
2# This file is machine-generated by lib/unicore/mktables from the Unicode
3# database, Version 13.0.0.  Any changes made here will be lost!
4
5
6# !!!!!!!   INTERNAL PERL USE ONLY   !!!!!!!
7# This file is for internal use by core Perl only.  The format and even the
8# name or existence of this file are subject to change without notice.  Don't
9# use it directly.  Use Unicode::UCD to access the Unicode character data
10# base.
11
12
13package charnames;
14
15# This module contains machine-generated tables and code for the
16# algorithmically-determinable Unicode character names.  The following
17# routines can be used to translate between name and code point and vice versa
18
19{ # Closure
20
    # Matches a legal code point: 4-6 hex digits.  If there are 6, the first
    # two must be 10; if there are 5, the first must not be a 0.  Written this
    # way to decrease backtracking.  The first regex allows the code point to
    # be at the end of a word, but to work properly, the word shouldn't end
    # with a valid hex character.  The second one won't match a code point at
    # the end of a word, and doesn't have the run-on issue.
27    my $run_on_code_point_re = qr/(?^aax: (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b)/;
28    my $code_point_re = qr/(?^aa:\b(?^aax: (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b))/;
29
30    # In the following hash, the keys are the bases of names which include
31    # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01.  The value
32    # of each key is another hash which is used to get the low and high ends
33    # for each range of code points that apply to the name.
34    my %names_ending_in_code_point = (
35'CJK COMPATIBILITY IDEOGRAPH' =>
36{
37'high' =>
38[
3964109,
4064217,
41195101,
42],
43'low' =>
44[
4563744,
4664112,
47194560,
48],
49},
50'CJK UNIFIED IDEOGRAPH' =>
51{
52'high' =>
53[
5419903,
5540956,
56173789,
57177972,
58178205,
59183969,
60191456,
61201546,
62],
63'low' =>
64[
6513312,
6619968,
67131072,
68173824,
69177984,
70178208,
71183984,
72196608,
73],
74},
75'KHITAN SMALL SCRIPT CHARACTER' =>
76{
77'high' =>
78[
79101589,
80],
81'low' =>
82[
83101120,
84],
85},
86'NUSHU CHARACTER' =>
87{
88'high' =>
89[
90111355,
91],
92'low' =>
93[
94110960,
95],
96},
97'TANGUT IDEOGRAPH' =>
98{
99'high' =>
100[
101100343,
102],
103'low' =>
104[
10594208,
106],
107},
108'TANGUT IDEOGRAPH SUPPLEMENT' =>
109{
110'high' =>
111[
112101640,
113],
114'low' =>
115[
116101632,
117],
118},
119
120    );
121
    # The following hash is a copy of the previous one, except that it is for
    # loose matching, so each name has blanks and dashes squeezed out
124    my %loose_names_ending_in_code_point = (
125'CJKCOMPATIBILITYIDEOGRAPH' =>
126{
127'high' =>
128[
12964109,
13064217,
131195101,
132],
133'low' =>
134[
13563744,
13664112,
137194560,
138],
139},
140'CJKUNIFIEDIDEOGRAPH' =>
141{
142'high' =>
143[
14419903,
14540956,
146173789,
147177972,
148178205,
149183969,
150191456,
151201546,
152],
153'low' =>
154[
15513312,
15619968,
157131072,
158173824,
159177984,
160178208,
161183984,
162196608,
163],
164},
165'KHITANSMALLSCRIPTCHARACTER' =>
166{
167'high' =>
168[
169101589,
170],
171'low' =>
172[
173101120,
174],
175},
176'NUSHUCHARACTER' =>
177{
178'high' =>
179[
180111355,
181],
182'low' =>
183[
184110960,
185],
186},
187'TANGUTIDEOGRAPH' =>
188{
189'high' =>
190[
191100343,
192],
193'low' =>
194[
19594208,
196],
197},
198'TANGUTIDEOGRAPHSUPPLEMENT' =>
199{
200'high' =>
201[
202101640,
203],
204'low' =>
205[
206101632,
207],
208},
209
210    );
211
212    # And the following array gives the inverse mapping from code points to
213    # names.  Lowest code points are first
214    @code_points_ending_in_code_point = (
215
216{
217'high' => 19903,
218'legal' =>
219'
220 -0123456789ABCDEFGHIJKNOPRU',
221'low' => 13312,
222'name' => 'CJK UNIFIED IDEOGRAPH',
223},
224{
225'high' => 40956,
226'legal' =>
227'
228 -0123456789ABCDEFGHIJKNOPRU',
229'low' => 19968,
230'name' => 'CJK UNIFIED IDEOGRAPH',
231},
232{
233'high' => 64109,
234'legal' =>
235'
236 -0123456789ABCDEFGHIJKLMOPRTY',
237'low' => 63744,
238'name' => 'CJK COMPATIBILITY IDEOGRAPH',
239},
240{
241'high' => 64217,
242'legal' =>
243'
244 -0123456789ABCDEFGHIJKLMOPRTY',
245'low' => 64112,
246'name' => 'CJK COMPATIBILITY IDEOGRAPH',
247},
248{
249'high' => 100343,
250'legal' =>
251'
252 -0123456789ABCDEFGHINOPRTU',
253'low' => 94208,
254'name' => 'TANGUT IDEOGRAPH',
255},
256{
257'high' => 101589,
258'legal' =>
259'
260 -0123456789ABCDEFHIKLMNPRST',
261'low' => 101120,
262'name' => 'KHITAN SMALL SCRIPT CHARACTER',
263},
264{
265'high' => 101640,
266'legal' =>
267'
268 -0123456789ABCDEFGHILMNOPRSTU',
269'low' => 101632,
270'name' => 'TANGUT IDEOGRAPH SUPPLEMENT',
271},
272{
273'high' => 111355,
274'legal' =>
275'
276 -0123456789ABCDEFHNRSTU',
277'low' => 110960,
278'name' => 'NUSHU CHARACTER',
279},
280{
281'high' => 173789,
282'legal' =>
283'
284 -0123456789ABCDEFGHIJKNOPRU',
285'low' => 131072,
286'name' => 'CJK UNIFIED IDEOGRAPH',
287},
288{
289'high' => 177972,
290'legal' =>
291'
292 -0123456789ABCDEFGHIJKNOPRU',
293'low' => 173824,
294'name' => 'CJK UNIFIED IDEOGRAPH',
295},
296{
297'high' => 178205,
298'legal' =>
299'
300 -0123456789ABCDEFGHIJKNOPRU',
301'low' => 177984,
302'name' => 'CJK UNIFIED IDEOGRAPH',
303},
304{
305'high' => 183969,
306'legal' =>
307'
308 -0123456789ABCDEFGHIJKNOPRU',
309'low' => 178208,
310'name' => 'CJK UNIFIED IDEOGRAPH',
311},
312{
313'high' => 191456,
314'legal' =>
315'
316 -0123456789ABCDEFGHIJKNOPRU',
317'low' => 183984,
318'name' => 'CJK UNIFIED IDEOGRAPH',
319},
320{
321'high' => 195101,
322'legal' =>
323'
324 -0123456789ABCDEFGHIJKLMOPRTY',
325'low' => 194560,
326'name' => 'CJK COMPATIBILITY IDEOGRAPH',
327},
328{
329'high' => 201546,
330'legal' =>
331'
332 -0123456789ABCDEFGHIJKNOPRU',
333'low' => 196608,
334'name' => 'CJK UNIFIED IDEOGRAPH',
335},
336,
337
338    );
339
340    # Is exportable, make read-only
341    Internals::SvREADONLY(@code_points_ending_in_code_point, 1);
342
343    # Convert from code point to Jamo short name for use in composing Hangul
344    # syllable names
345    my %Jamo = (
3464352 => 'G',
3474353 => 'GG',
3484354 => 'N',
3494355 => 'D',
3504356 => 'DD',
3514357 => 'R',
3524358 => 'M',
3534359 => 'B',
3544360 => 'BB',
3554361 => 'S',
3564362 => 'SS',
3574363 => '',
3584364 => 'J',
3594365 => 'JJ',
3604366 => 'C',
3614367 => 'K',
3624368 => 'T',
3634369 => 'P',
3644370 => 'H',
3654449 => 'A',
3664450 => 'AE',
3674451 => 'YA',
3684452 => 'YAE',
3694453 => 'EO',
3704454 => 'E',
3714455 => 'YEO',
3724456 => 'YE',
3734457 => 'O',
3744458 => 'WA',
3754459 => 'WAE',
3764460 => 'OE',
3774461 => 'YO',
3784462 => 'U',
3794463 => 'WEO',
3804464 => 'WE',
3814465 => 'WI',
3824466 => 'YU',
3834467 => 'EU',
3844468 => 'YI',
3854469 => 'I',
3864520 => 'G',
3874521 => 'GG',
3884522 => 'GS',
3894523 => 'N',
3904524 => 'NJ',
3914525 => 'NH',
3924526 => 'D',
3934527 => 'L',
3944528 => 'LG',
3954529 => 'LM',
3964530 => 'LB',
3974531 => 'LS',
3984532 => 'LT',
3994533 => 'LP',
4004534 => 'LH',
4014535 => 'M',
4024536 => 'B',
4034537 => 'BS',
4044538 => 'S',
4054539 => 'SS',
4064540 => 'NG',
4074541 => 'J',
4084542 => 'C',
4094543 => 'K',
4104544 => 'T',
4114545 => 'P',
4124546 => 'H',
413
414    );
415
416    # Leading consonant (can be null)
417    my %Jamo_L = (
418'' => 11,
419'B' => 7,
420'BB' => 8,
421'C' => 14,
422'D' => 3,
423'DD' => 4,
424'G' => 0,
425'GG' => 1,
426'H' => 18,
427'J' => 12,
428'JJ' => 13,
429'K' => 15,
430'M' => 6,
431'N' => 2,
432'P' => 17,
433'R' => 5,
434'S' => 9,
435'SS' => 10,
436'T' => 16,
437
438    );
439
440    # Vowel
441    my %Jamo_V = (
442'A' => 0,
443'AE' => 1,
444'E' => 5,
445'EO' => 4,
446'EU' => 18,
447'I' => 20,
448'O' => 8,
449'OE' => 11,
450'U' => 13,
451'WA' => 9,
452'WAE' => 10,
453'WE' => 15,
454'WEO' => 14,
455'WI' => 16,
456'YA' => 2,
457'YAE' => 3,
458'YE' => 7,
459'YEO' => 6,
460'YI' => 19,
461'YO' => 12,
462'YU' => 17,
463
464    );
465
466    # Optional trailing consonant
467    my %Jamo_T = (
468'B' => 17,
469'BS' => 18,
470'C' => 23,
471'D' => 7,
472'G' => 1,
473'GG' => 2,
474'GS' => 3,
475'H' => 27,
476'J' => 22,
477'K' => 24,
478'L' => 8,
479'LB' => 11,
480'LG' => 9,
481'LH' => 15,
482'LM' => 10,
483'LP' => 14,
484'LS' => 12,
485'LT' => 13,
486'M' => 16,
487'N' => 4,
488'NG' => 21,
489'NH' => 6,
490'NJ' => 5,
491'P' => 26,
492'S' => 19,
493'SS' => 20,
494'T' => 25,
495
496    );
497
498    # Computed re that splits up a Hangul name into LVT or LV syllables
499    my $syllable_re = qr/(|B|BB|C|D|DD|G|GG|H|J|JJ|K|M|N|P|R|S|SS|T)(A|AE|E|EO|EU|I|O|OE|U|WA|WAE|WE|WEO|WI|YA|YAE|YE|YEO|YI|YO|YU)(B|BS|C|D|G|GG|GS|H|J|K|L|LB|LG|LH|LM|LP|LS|LT|M|N|NG|NH|NJ|P|S|SS|T)?/;
500
501    my $HANGUL_SYLLABLE = "HANGUL SYLLABLE ";
502    my $loose_HANGUL_SYLLABLE = "HANGULSYLLABLE";
503
    # These constant names and values were taken from the Unicode Standard,
    # version 5.1, section 3.12.  They are used in conjunction with Hangul
    # syllables
507    my $SBase = 0xAC00;
508    my $LBase = 0x1100;
509    my $VBase = 0x1161;
510    my $TBase = 0x11A7;
511    my $SCount = 11172;
512    my $LCount = 19;
513    my $VCount = 21;
514    my $TCount = 28;
515    my $NCount = $VCount * $TCount;
516
517    sub name_to_code_point_special {
518        my ($name, $loose) = @_;
519
520        # Returns undef if not one of the specially handled names; otherwise
521        # returns the code point equivalent to the input name
522        # $loose is non-zero if to use loose matching, 'name' in that case
523        # must be input as upper case with all blanks and dashes squeezed out.
524
525        if ((! $loose && $name =~ s/$HANGUL_SYLLABLE//)
526            || ($loose && $name =~ s/$loose_HANGUL_SYLLABLE//))
527        {
528            return if $name !~ qr/^$syllable_re$/;
529            my $L = $Jamo_L{$1};
530            my $V = $Jamo_V{$2};
531            my $T = (defined $3) ? $Jamo_T{$3} : 0;
532            return ($L * $VCount + $V) * $TCount + $T + $SBase;
533        }
534
535        # Name must end in 'code_point' for this to handle.
536        return if (($loose && $name !~ /^ (.*?) ($run_on_code_point_re) $/x)
537                   || (! $loose && $name !~ /^ (.*) ($code_point_re) $/x));
538
539        my $base = $1;
540        my $code_point = CORE::hex $2;
541        my $names_ref;
542
543        if ($loose) {
544            $names_ref = \%loose_names_ending_in_code_point;
545        }
546        else {
547            return if $base !~ s/-$//;
548            $names_ref = \%names_ending_in_code_point;
549        }
550
551        # Name must be one of the ones which has the code point in it.
552        return if ! $names_ref->{$base};
553
554        # Look through the list of ranges that apply to this name to see if
555        # the code point is in one of them.
556        for (my $i = 0; $i < scalar @{$names_ref->{$base}{'low'}}; $i++) {
557            return if $names_ref->{$base}{'low'}->[$i] > $code_point;
558            next if $names_ref->{$base}{'high'}->[$i] < $code_point;
559
560            # Here, the code point is in the range.
561            return $code_point;
562        }
563
564        # Here, looked like the name had a code point number in it, but
565        # did not match one of the valid ones.
566        return;
567    }
568
569    sub code_point_to_name_special {
570        my $code_point = shift;
571
572        # Returns the name of a code point if algorithmically determinable;
573        # undef if not
574
575        # If in the Hangul range, calculate the name based on Unicode's
576        # algorithm
577        if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) {
578            use integer;
579            my $SIndex = $code_point - $SBase;
580            my $L = $LBase + $SIndex / $NCount;
581            my $V = $VBase + ($SIndex % $NCount) / $TCount;
582            my $T = $TBase + $SIndex % $TCount;
583            $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}";
584            $name .= $Jamo{$T} if $T != $TBase;
585            return $name;
586        }
587
588        # Look through list of these code points for one in range.
589        foreach my $hash (@code_points_ending_in_code_point) {
590            return if $code_point < $hash->{'low'};
591            if ($code_point <= $hash->{'high'}) {
592                return sprintf("%s-%04X", $hash->{'name'}, $code_point);
593            }
594        }
595        return;            # None found
596    }
597} # End closure
598
5991;
600