xref: /openbsd-src/gnu/usr.bin/perl/lib/unicore/Name.pm (revision fc405d53b73a2d73393cb97f684863d17b583e38)
1# !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
2# This file is machine-generated by lib/unicore/mktables from the Unicode
3# database, Version 14.0.0.  Any changes made here will be lost!
4
5
6# !!!!!!!   INTERNAL PERL USE ONLY   !!!!!!!
7# This file is for internal use by core Perl only.  The format and even the
8# name or existence of this file are subject to change without notice.  Don't
9# use it directly.  Use Unicode::UCD to access the Unicode character data
10# base.
11
12
13=head1 NAME -- Internal generated file for use by charnames
14
15=cut
16
17
18package charnames;
19
20# This module contains machine-generated tables and code for the
21# algorithmically-determinable Unicode character names.  The following
22# routines can be used to translate between name and code point and vice versa
23
24{ # Closure
25
26    # Match a legal code point: 4-6 upper-case hex digits.  If there are 6,
27    # the first two must be 10; if there are 5, the first must not be 0.  The
28    # alternation is written this way to decrease backtracking.  The first
29    # regex lets the code point run on from the end of a word, but to work
30    # properly that word shouldn't end with a valid hex character.  The second
31    # requires a word boundary first, so it doesn't have the run-on issue.
32    my $run_on_code_point_re = qr/(?^aax: (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b)/;
33    my $code_point_re = qr/(?^aa:\b(?^aax: (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b))/;
34
35    # In the following hash, the keys are the bases of names which include
36    # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01.  The value
37    # of each key is another hash whose parallel 'low' and 'high' arrays give
38    # the ends of each code point range that the name applies to, in order.
39    my %names_ending_in_code_point = (
40'CJK COMPATIBILITY IDEOGRAPH' =>
41{
42'high' =>
43[
4464109,
4564217,
46195101,
47],
48'low' =>
49[
5063744,
5164112,
52194560,
53],
54},
55'CJK UNIFIED IDEOGRAPH' =>
56{
57'high' =>
58[
5919903,
6040959,
61173791,
62177976,
63178205,
64183969,
65191456,
66201546,
67],
68'low' =>
69[
7013312,
7119968,
72131072,
73173824,
74177984,
75178208,
76183984,
77196608,
78],
79},
80'KHITAN SMALL SCRIPT CHARACTER' =>
81{
82'high' =>
83[
84101589,
85],
86'low' =>
87[
88101120,
89],
90},
91'NUSHU CHARACTER' =>
92{
93'high' =>
94[
95111355,
96],
97'low' =>
98[
99110960,
100],
101},
102'TANGUT IDEOGRAPH' =>
103{
104'high' =>
105[
106100343,
107],
108'low' =>
109[
11094208,
111],
112},
113'TANGUT IDEOGRAPH SUPPLEMENT' =>
114{
115'high' =>
116[
117101640,
118],
119'low' =>
120[
121101632,
122],
123},
124
125    );
126
127    # The following hash is a copy of the previous one, except that it is for
128    # loose matching, so each name has blanks and dashes squeezed out.
129    my %loose_names_ending_in_code_point = (
130'CJKCOMPATIBILITYIDEOGRAPH' =>
131{
132'high' =>
133[
13464109,
13564217,
136195101,
137],
138'low' =>
139[
14063744,
14164112,
142194560,
143],
144},
145'CJKUNIFIEDIDEOGRAPH' =>
146{
147'high' =>
148[
14919903,
15040959,
151173791,
152177976,
153178205,
154183969,
155191456,
156201546,
157],
158'low' =>
159[
16013312,
16119968,
162131072,
163173824,
164177984,
165178208,
166183984,
167196608,
168],
169},
170'KHITANSMALLSCRIPTCHARACTER' =>
171{
172'high' =>
173[
174101589,
175],
176'low' =>
177[
178101120,
179],
180},
181'NUSHUCHARACTER' =>
182{
183'high' =>
184[
185111355,
186],
187'low' =>
188[
189110960,
190],
191},
192'TANGUTIDEOGRAPH' =>
193{
194'high' =>
195[
196100343,
197],
198'low' =>
199[
20094208,
201],
202},
203'TANGUTIDEOGRAPHSUPPLEMENT' =>
204{
205'high' =>
206[
207101640,
208],
209'low' =>
210[
211101632,
212],
213},
214
215    );
216
217    # And the following array gives the inverse mapping from code points to
218    # names.  Entries are in ascending code point order; lookups rely on that.
219    @code_points_ending_in_code_point = (
220
221{
222'high' => 19903,
223'legal' =>
224'
225 -0123456789ABCDEFGHIJKNOPRU',
226'low' => 13312,
227'name' => 'CJK UNIFIED IDEOGRAPH',
228},
229{
230'high' => 40959,
231'legal' =>
232'
233 -0123456789ABCDEFGHIJKNOPRU',
234'low' => 19968,
235'name' => 'CJK UNIFIED IDEOGRAPH',
236},
237{
238'high' => 64109,
239'legal' =>
240'
241 -0123456789ABCDEFGHIJKLMOPRTY',
242'low' => 63744,
243'name' => 'CJK COMPATIBILITY IDEOGRAPH',
244},
245{
246'high' => 64217,
247'legal' =>
248'
249 -0123456789ABCDEFGHIJKLMOPRTY',
250'low' => 64112,
251'name' => 'CJK COMPATIBILITY IDEOGRAPH',
252},
253{
254'high' => 100343,
255'legal' =>
256'
257 -0123456789ABCDEFGHINOPRTU',
258'low' => 94208,
259'name' => 'TANGUT IDEOGRAPH',
260},
261{
262'high' => 101589,
263'legal' =>
264'
265 -0123456789ABCDEFHIKLMNPRST',
266'low' => 101120,
267'name' => 'KHITAN SMALL SCRIPT CHARACTER',
268},
269{
270'high' => 101640,
271'legal' =>
272'
273 -0123456789ABCDEFGHILMNOPRSTU',
274'low' => 101632,
275'name' => 'TANGUT IDEOGRAPH SUPPLEMENT',
276},
277{
278'high' => 111355,
279'legal' =>
280'
281 -0123456789ABCDEFHNRSTU',
282'low' => 110960,
283'name' => 'NUSHU CHARACTER',
284},
285{
286'high' => 173791,
287'legal' =>
288'
289 -0123456789ABCDEFGHIJKNOPRU',
290'low' => 131072,
291'name' => 'CJK UNIFIED IDEOGRAPH',
292},
293{
294'high' => 177976,
295'legal' =>
296'
297 -0123456789ABCDEFGHIJKNOPRU',
298'low' => 173824,
299'name' => 'CJK UNIFIED IDEOGRAPH',
300},
301{
302'high' => 178205,
303'legal' =>
304'
305 -0123456789ABCDEFGHIJKNOPRU',
306'low' => 177984,
307'name' => 'CJK UNIFIED IDEOGRAPH',
308},
309{
310'high' => 183969,
311'legal' =>
312'
313 -0123456789ABCDEFGHIJKNOPRU',
314'low' => 178208,
315'name' => 'CJK UNIFIED IDEOGRAPH',
316},
317{
318'high' => 191456,
319'legal' =>
320'
321 -0123456789ABCDEFGHIJKNOPRU',
322'low' => 183984,
323'name' => 'CJK UNIFIED IDEOGRAPH',
324},
325{
326'high' => 195101,
327'legal' =>
328'
329 -0123456789ABCDEFGHIJKLMOPRTY',
330'low' => 194560,
331'name' => 'CJK COMPATIBILITY IDEOGRAPH',
332},
333{
334'high' => 201546,
335'legal' =>
336'
337 -0123456789ABCDEFGHIJKNOPRU',
338'low' => 196608,
339'name' => 'CJK UNIFIED IDEOGRAPH',
340},
341,
342
343    );
344
345    # The array is exportable, so make it read-only
346    Internals::SvREADONLY(@code_points_ending_in_code_point, 1);
347
348    # Convert from code point to Jamo short name for use in composing Hangul
349    # syllable names
350    my %Jamo = (
3514352 => 'G',
3524353 => 'GG',
3534354 => 'N',
3544355 => 'D',
3554356 => 'DD',
3564357 => 'R',
3574358 => 'M',
3584359 => 'B',
3594360 => 'BB',
3604361 => 'S',
3614362 => 'SS',
3624363 => '',
3634364 => 'J',
3644365 => 'JJ',
3654366 => 'C',
3664367 => 'K',
3674368 => 'T',
3684369 => 'P',
3694370 => 'H',
3704449 => 'A',
3714450 => 'AE',
3724451 => 'YA',
3734452 => 'YAE',
3744453 => 'EO',
3754454 => 'E',
3764455 => 'YEO',
3774456 => 'YE',
3784457 => 'O',
3794458 => 'WA',
3804459 => 'WAE',
3814460 => 'OE',
3824461 => 'YO',
3834462 => 'U',
3844463 => 'WEO',
3854464 => 'WE',
3864465 => 'WI',
3874466 => 'YU',
3884467 => 'EU',
3894468 => 'YI',
3904469 => 'I',
3914520 => 'G',
3924521 => 'GG',
3934522 => 'GS',
3944523 => 'N',
3954524 => 'NJ',
3964525 => 'NH',
3974526 => 'D',
3984527 => 'L',
3994528 => 'LG',
4004529 => 'LM',
4014530 => 'LB',
4024531 => 'LS',
4034532 => 'LT',
4044533 => 'LP',
4054534 => 'LH',
4064535 => 'M',
4074536 => 'B',
4084537 => 'BS',
4094538 => 'S',
4104539 => 'SS',
4114540 => 'NG',
4124541 => 'J',
4134542 => 'C',
4144543 => 'K',
4154544 => 'T',
4164545 => 'P',
4174546 => 'H',
418
419    );
420
421    # Leading consonant short name (can be the empty string) => L index
422    my %Jamo_L = (
423'' => 11,
424'B' => 7,
425'BB' => 8,
426'C' => 14,
427'D' => 3,
428'DD' => 4,
429'G' => 0,
430'GG' => 1,
431'H' => 18,
432'J' => 12,
433'JJ' => 13,
434'K' => 15,
435'M' => 6,
436'N' => 2,
437'P' => 17,
438'R' => 5,
439'S' => 9,
440'SS' => 10,
441'T' => 16,
442
443    );
444
445    # Vowel short name => V index
446    my %Jamo_V = (
447'A' => 0,
448'AE' => 1,
449'E' => 5,
450'EO' => 4,
451'EU' => 18,
452'I' => 20,
453'O' => 8,
454'OE' => 11,
455'U' => 13,
456'WA' => 9,
457'WAE' => 10,
458'WE' => 15,
459'WEO' => 14,
460'WI' => 16,
461'YA' => 2,
462'YAE' => 3,
463'YE' => 7,
464'YEO' => 6,
465'YI' => 19,
466'YO' => 12,
467'YU' => 17,
468
469    );
470
471    # Trailing consonant short name => T index (0 is reserved for "none")
472    my %Jamo_T = (
473'B' => 17,
474'BS' => 18,
475'C' => 23,
476'D' => 7,
477'G' => 1,
478'GG' => 2,
479'GS' => 3,
480'H' => 27,
481'J' => 22,
482'K' => 24,
483'L' => 8,
484'LB' => 11,
485'LG' => 9,
486'LH' => 15,
487'LM' => 10,
488'LP' => 14,
489'LS' => 12,
490'LT' => 13,
491'M' => 16,
492'N' => 4,
493'NG' => 21,
494'NH' => 6,
495'NJ' => 5,
496'P' => 26,
497'S' => 19,
498'SS' => 20,
499'T' => 25,
500
501    );
502
503    # Computed re splitting a prefix-less Hangul name into L, V, and optional T
504    my $syllable_re = qr/(|B|BB|C|D|DD|G|GG|H|J|JJ|K|M|N|P|R|S|SS|T)(A|AE|E|EO|EU|I|O|OE|U|WA|WAE|WE|WEO|WI|YA|YAE|YE|YEO|YI|YO|YU)(B|BS|C|D|G|GG|GS|H|J|K|L|LB|LG|LH|LM|LP|LS|LT|M|N|NG|NH|NJ|P|S|SS|T)?/;
505    # Prefix of every Hangul syllable name, in strict and loose-matching forms
506    my $HANGUL_SYLLABLE = "HANGUL SYLLABLE ";
507    my $loose_HANGUL_SYLLABLE = "HANGULSYLLABLE";
508
509    # These constant names and values were taken from the Unicode standard,
510    # version 5.1, section 3.12.  They are used in conjunction with Hangul
511    # syllables.
512    my $SBase = 0xAC00;    # Code point of the first Hangul syllable
513    my $LBase = 0x1100;    # Code point of the first leading consonant Jamo
514    my $VBase = 0x1161;    # Code point of the first vowel Jamo
515    my $TBase = 0x11A7;    # One below the first trailing Jamo; means "no T"
516    my $SCount = 11172;    # Number of syllables (LCount * NCount)
517    my $LCount = 19;       # Number of leading consonants
518    my $VCount = 21;       # Number of vowels
519    my $TCount = 28;       # Number of trailing slots, including "none"
520    my $NCount = $VCount * $TCount;    # Syllables per leading consonant
521
sub name_to_code_point_special {
    my ($name, $loose) = @_;

    # Translates an algorithmically-determinable character name into its
    # code point.
    #
    #   $name   the character name to look up
    #   $loose  true to use loose matching; $name must then already be
    #           upper case with all blanks and dashes squeezed out
    #
    # Returns the code point, or nothing (undef in scalar context) if $name
    # is not one of the specially handled names.

    # Hangul syllable names.  The prefix is only meaningful at the very start
    # of the name, so anchor it there; stripping it from the middle would let
    # a malformed name such as "GHANGUL SYLLABLE A" masquerade as "GA".
    my $hangul_prefix = $loose ? $loose_HANGUL_SYLLABLE : $HANGUL_SYLLABLE;
    if ($name =~ s/^\Q$hangul_prefix\E//) {

        # What remains must be exactly L + V + optional T short names.
        return if $name !~ /^$syllable_re$/;
        my $L = $Jamo_L{$1};
        my $V = $Jamo_V{$2};
        my $T = (defined $3) ? $Jamo_T{$3} : 0;
        return ($L * $VCount + $V) * $TCount + $T + $SBase;
    }

    # Otherwise the name must end in a code point for this to handle it.
    # Both matches are kept at this block level (not inside an inner block)
    # so that $1 and $2 are still set for the statements below.
    my $ends_in_code_point = $loose
                             ? $name =~ /^ (.*?) ($run_on_code_point_re) $/x
                             : $name =~ /^ (.*) ($code_point_re) $/x;
    return if ! $ends_in_code_point;

    my $base = $1;
    my $code_point = CORE::hex $2;
    my $names_ref;

    if ($loose) {
        $names_ref = \%loose_names_ending_in_code_point;
    }
    else {
        # A strict name separates its base from the code point with a
        # hyphen, which is not part of the hash key.
        return if $base !~ s/-$//;
        $names_ref = \%names_ending_in_code_point;
    }

    # Name must be one of the ones which has the code point in it.
    my $ranges = $names_ref->{$base};
    return if ! $ranges;

    # Look through the ranges that apply to this name (they are in ascending
    # order) to see if the code point is in one of them.
    my $lows  = $ranges->{'low'};
    my $highs = $ranges->{'high'};
    for my $i (0 .. $#{$lows}) {
        return if $lows->[$i] > $code_point;    # Below every remaining range
        next if $highs->[$i] < $code_point;     # Above this range; try next

        # Here, the code point is in the range.
        return $code_point;
    }

    # Here, looked like the name had a code point number in it, but
    # did not match one of the valid ones.
    return;
}
573
sub code_point_to_name_special {
    my $code_point = shift;

    # Returns the name of a code point if algorithmically determinable;
    # nothing (undef in scalar context) if not.

    # If in the Hangul range, calculate the name based on Unicode's
    # algorithm: decompose the syllable index into the code points of its
    # leading consonant, vowel, and trailing consonant Jamo, and concatenate
    # their short names.
    if ($code_point >= $SBase && $code_point <= $SBase + $SCount - 1) {
        use integer;    # The divisions below must truncate
        my $SIndex = $code_point - $SBase;
        my $L = $LBase + $SIndex / $NCount;
        my $V = $VBase + ($SIndex % $NCount) / $TCount;
        my $T = $TBase + $SIndex % $TCount;

        # Lexical, so that a lookup does not write to a package variable.
        my $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}";

        # $T equal to $TBase means the syllable has no trailing consonant.
        $name .= $Jamo{$T} if $T != $TBase;
        return $name;
    }

    # Look through the list of ranges whose names end in the code point.  The
    # list is in ascending order, so stop as soon as a range starts above the
    # code point.
    foreach my $hash (@code_points_ending_in_code_point) {
        return if $code_point < $hash->{'low'};
        if ($code_point <= $hash->{'high'}) {
            return sprintf("%s-%04X", $hash->{'name'}, $code_point);
        }
    }
    return;            # None found
}
602} # End closure
603
6041;
605