xref: /openbsd-src/gnu/usr.bin/perl/lib/Unicode/UCD.pm (revision c90a81c56dcebd6a1b73fe4aff9b03385b8e63b3)
1package Unicode::UCD;
2
3use strict;
4use warnings;
5no warnings 'surrogate';    # surrogates can be inputs to this
6use charnames ();
7
8our $VERSION = '0.64';
9
10require Exporter;
11
12our @ISA = qw(Exporter);
13
14our @EXPORT_OK = qw(charinfo
15		    charblock charscript
16		    charblocks charscripts
17		    charinrange
18		    charprop
19		    charprops_all
20		    general_categories bidi_types
21		    compexcl
22		    casefold all_casefolds casespec
23		    namedseq
24                    num
25                    prop_aliases
26                    prop_value_aliases
27                    prop_values
28                    prop_invlist
29                    prop_invmap
30                    search_invlist
31                    MAX_CP
32                );
33
34use Carp;
35
36sub IS_ASCII_PLATFORM { ord("A") == 65 }
37
38=head1 NAME
39
40Unicode::UCD - Unicode character database
41
42=head1 SYNOPSIS
43
44    use Unicode::UCD 'charinfo';
45    my $charinfo   = charinfo($codepoint);
46
47    use Unicode::UCD 'charprop';
48    my $value  = charprop($codepoint, $property);
49
50    use Unicode::UCD 'charprops_all';
51    my $all_values_hash_ref = charprops_all($codepoint);
52
53    use Unicode::UCD 'casefold';
54    my $casefold = casefold($codepoint);
55
56    use Unicode::UCD 'all_casefolds';
57    my $all_casefolds_ref = all_casefolds();
58
59    use Unicode::UCD 'casespec';
60    my $casespec = casespec($codepoint);
61
62    use Unicode::UCD 'charblock';
63    my $charblock  = charblock($codepoint);
64
65    use Unicode::UCD 'charscript';
66    my $charscript = charscript($codepoint);
67
68    use Unicode::UCD 'charblocks';
69    my $charblocks = charblocks();
70
71    use Unicode::UCD 'charscripts';
72    my $charscripts = charscripts();
73
74    use Unicode::UCD qw(charscript charinrange);
75    my $range = charscript($script);
76    print "looks like $script\n" if charinrange($range, $codepoint);
77
78    use Unicode::UCD qw(general_categories bidi_types);
79    my $categories = general_categories();
80    my $types = bidi_types();
81
82    use Unicode::UCD 'prop_aliases';
83    my @space_names = prop_aliases("space");
84
85    use Unicode::UCD 'prop_value_aliases';
86    my @gc_punct_names = prop_value_aliases("Gc", "Punct");
87
88    use Unicode::UCD 'prop_values';
89    my @all_EA_short_names = prop_values("East_Asian_Width");
90
91    use Unicode::UCD 'prop_invlist';
92    my @puncts = prop_invlist("gc=punctuation");
93
94    use Unicode::UCD 'prop_invmap';
95    my ($list_ref, $map_ref, $format, $missing)
96                                      = prop_invmap("General Category");
97
98    use Unicode::UCD 'search_invlist';
99    my $index = search_invlist(\@invlist, $code_point);
100
101    use Unicode::UCD 'compexcl';
102    my $compexcl = compexcl($codepoint);
103
104    use Unicode::UCD 'namedseq';
105    my $namedseq = namedseq($named_sequence_name);
106
107    my $unicode_version = Unicode::UCD::UnicodeVersion();
108
109    my $convert_to_numeric =
110              Unicode::UCD::num("\N{RUMI DIGIT ONE}\N{RUMI DIGIT TWO}");
111
112=head1 DESCRIPTION
113
114The Unicode::UCD module offers a series of functions that
115provide a simple interface to the Unicode
116Character Database.
117
118=head2 code point argument
119
120Some of the functions are called with a I<code point argument>, which is either
121a decimal or a hexadecimal scalar designating a code point in the platform's
122native character set (extended to Unicode), or a string containing C<U+>
123followed by hexadecimals
124designating a Unicode code point.  A leading 0 will force a hexadecimal
125interpretation, as will a hexadecimal digit that isn't a decimal digit.
126
127Examples:
128
129    223     # Decimal 223 in native character set
130    0223    # Hexadecimal 223, native (= 547 decimal)
131    0xDF    # Hexadecimal DF, native (= 223 decimal
132    'U+DF'  # Hexadecimal DF, in Unicode's character set
133                              (= LATIN SMALL LETTER SHARP S)
134
135Note that the largest code point in Unicode is U+10FFFF.
136
137=cut
138
139my $BLOCKSFH;
140my $VERSIONFH;
141my $CASEFOLDFH;
142my $CASESPECFH;
143my $NAMEDSEQFH;
144my $v_unicode_version;  # v-string.
145
146sub openunicode {
147    my ($rfh, @path) = @_;
148    my $f;
149    unless (defined $$rfh) {
150	for my $d (@INC) {
151	    use File::Spec;
152	    $f = File::Spec->catfile($d, "unicore", @path);
153	    last if open($$rfh, $f);
154	    undef $f;
155	}
156	croak __PACKAGE__, ": failed to find ",
157              File::Spec->catfile(@path), " in @INC"
158	    unless defined $f;
159    }
160    return $f;
161}
162
163sub _dclone ($) {   # Use Storable::dclone if available; otherwise emulate it.
164
165    use if defined &DynaLoader::boot_DynaLoader, Storable => qw(dclone);
166
167    return dclone(shift) if defined &dclone;
168
169    my $arg = shift;
170    my $type = ref $arg;
171    return $arg unless $type;   # No deep cloning needed for scalars
172
173    if ($type eq 'ARRAY') {
174        my @return;
175        foreach my $element (@$arg) {
176            push @return, &_dclone($element);
177        }
178        return \@return;
179    }
180    elsif ($type eq 'HASH') {
181        my %return;
182        foreach my $key (keys %$arg) {
183            $return{$key} = &_dclone($arg->{$key});
184        }
185        return \%return;
186    }
187    else {
188        croak "_dclone can't handle " . $type;
189    }
190}
191
192=head2 B<charinfo()>
193
194    use Unicode::UCD 'charinfo';
195
196    my $charinfo = charinfo(0x41);
197
198This returns information about the input L</code point argument>
199as a reference to a hash of fields as defined by the Unicode
200standard.  If the L</code point argument> is not assigned in the standard
201(i.e., has the general category C<Cn> meaning C<Unassigned>)
202or is a non-character (meaning it is guaranteed to never be assigned in
203the standard),
204C<undef> is returned.
205
206Fields that aren't applicable to the particular code point argument exist in the
207returned hash, and are empty.
208
209For results that are less "raw" than this function returns, or to get the values for
210any property, not just the few covered by this function, use the
211L</charprop()> function.
212
213The keys in the hash with the meanings of their values are:
214
215=over
216
217=item B<code>
218
219the input native L</code point argument> expressed in hexadecimal, with
220leading zeros
221added if necessary to make it contain at least four hexdigits
222
223=item B<name>
224
225name of I<code>, all IN UPPER CASE.
226Some control-type code points do not have names.
227This field will be empty for C<Surrogate> and C<Private Use> code points,
228and for the others without a name,
229it will contain a description enclosed in angle brackets, like
230C<E<lt>controlE<gt>>.
231
232
233=item B<category>
234
235The short name of the general category of I<code>.
236This will match one of the keys in the hash returned by L</general_categories()>.
237
238The L</prop_value_aliases()> function can be used to get all the synonyms
239of the category name.
240
241=item B<combining>
242
243the combining class number for I<code> used in the Canonical Ordering Algorithm.
244For Unicode 5.1, this is described in Section 3.11 C<Canonical Ordering Behavior>
245available at
246L<http://www.unicode.org/versions/Unicode5.1.0/>
247
248The L</prop_value_aliases()> function can be used to get all the synonyms
249of the combining class number.
250
251=item B<bidi>
252
253bidirectional type of I<code>.
254This will match one of the keys in the hash returned by L</bidi_types()>.
255
256The L</prop_value_aliases()> function can be used to get all the synonyms
257of the bidi type name.
258
259=item B<decomposition>
260
261is empty if I<code> has no decomposition; or is one or more codes
262(separated by spaces) that, taken in order, represent a decomposition for
263I<code>.  Each has at least four hexdigits.
264The codes may be preceded by a word enclosed in angle brackets, then a space,
265like C<E<lt>compatE<gt> >, giving the type of decomposition
266
267This decomposition may be an intermediate one whose components are also
268decomposable.  Use L<Unicode::Normalize> to get the final decomposition in one
269step.
270
271=item B<decimal>
272
273if I<code> represents a decimal digit this is its integer numeric value
274
275=item B<digit>
276
277if I<code> represents some other digit-like number, this is its integer
278numeric value
279
280=item B<numeric>
281
282if I<code> represents a whole or rational number, this is its numeric value.
283Rational values are expressed as a string like C<1/4>.
284
285=item B<mirrored>
286
287C<Y> or C<N> designating if I<code> is mirrored in bidirectional text
288
289=item B<unicode10>
290
291name of I<code> in the Unicode 1.0 standard if one
292existed for this code point and is different from the current name
293
294=item B<comment>
295
296As of Unicode 6.0, this is always empty.
297
298=item B<upper>
299
300is, if non-empty, the uppercase mapping for I<code> expressed as at least four
301hexdigits.  This indicates that the full uppercase mapping is a single
302character, and is identical to the simple (single-character only) mapping.
303When this field is empty, it means that the simple uppercase mapping is
304I<code> itself; you'll need some other means, (like L</charprop()> or
305L</casespec()> to get the full mapping.
306
307=item B<lower>
308
309is, if non-empty, the lowercase mapping for I<code> expressed as at least four
310hexdigits.  This indicates that the full lowercase mapping is a single
311character, and is identical to the simple (single-character only) mapping.
312When this field is empty, it means that the simple lowercase mapping is
313I<code> itself; you'll need some other means, (like L</charprop()> or
314L</casespec()> to get the full mapping.
315
316=item B<title>
317
318is, if non-empty, the titlecase mapping for I<code> expressed as at least four
319hexdigits.  This indicates that the full titlecase mapping is a single
320character, and is identical to the simple (single-character only) mapping.
321When this field is empty, it means that the simple titlecase mapping is
322I<code> itself; you'll need some other means, (like L</charprop()> or
323L</casespec()> to get the full mapping.
324
325=item B<block>
326
327the block I<code> belongs to (used in C<\p{Blk=...}>).
328The L</prop_value_aliases()> function can be used to get all the synonyms
329of the block name.
330
331See L</Blocks versus Scripts>.
332
333=item B<script>
334
335the script I<code> belongs to.
336The L</prop_value_aliases()> function can be used to get all the synonyms
337of the script name.
338
339See L</Blocks versus Scripts>.
340
341=back
342
343Note that you cannot do (de)composition and casing based solely on the
344I<decomposition>, I<combining>, I<lower>, I<upper>, and I<title> fields; you
345will need also the L</casespec()> function and the C<Composition_Exclusion>
346property.  (Or you could just use the L<lc()|perlfunc/lc>,
347L<uc()|perlfunc/uc>, and L<ucfirst()|perlfunc/ucfirst> functions, and the
348L<Unicode::Normalize> module.)
349
350=cut
351
352# NB: This function is nearly duplicated in charnames.pm
353sub _getcode {
354    my $arg = shift;
355
356    if ($arg =~ /^[1-9]\d*$/) {
357	return $arg;
358    }
359    elsif ($arg =~ /^(?:0[xX])?([[:xdigit:]]+)$/) {
360	return CORE::hex($1);
361    }
362    elsif ($arg =~ /^[Uu]\+([[:xdigit:]]+)$/) { # Is of form U+0000, means
363                                                # wants the Unicode code
364                                                # point, not the native one
365        my $decimal = CORE::hex($1);
366        return $decimal if IS_ASCII_PLATFORM;
367        return utf8::unicode_to_native($decimal);
368    }
369
370    return;
371}
372
373# Populated by _num.  Converts real number back to input rational
374my %real_to_rational;
375
376# To store the contents of files found on disk.
377my @BIDIS;
378my @CATEGORIES;
379my @DECOMPOSITIONS;
380my @NUMERIC_TYPES;
381my %SIMPLE_LOWER;
382my %SIMPLE_TITLE;
383my %SIMPLE_UPPER;
384my %UNICODE_1_NAMES;
385my %ISO_COMMENT;
386
387# Eval'd so can run on versions earlier than the property is available in
388my $Hangul_Syllables_re = eval 'qr/\p{Block=Hangul_Syllables}/';
389
390sub charinfo {
391
392    # This function has traditionally mimicked what is in UnicodeData.txt,
393    # warts and all.  This is a re-write that avoids UnicodeData.txt so that
394    # it can be removed to save disk space.  Instead, this assembles
395    # information gotten by other methods that get data from various other
396    # files.  It uses charnames to get the character name; and various
397    # mktables tables.
398
399    use feature 'unicode_strings';
400
401    # Will fail if called under minitest
402    use if defined &DynaLoader::boot_DynaLoader, "Unicode::Normalize" => qw(getCombinClass NFD);
403
404    my $arg  = shift;
405    my $code = _getcode($arg);
406    croak __PACKAGE__, "::charinfo: unknown code '$arg'" unless defined $code;
407
408    # Non-unicode implies undef.
409    return if $code > 0x10FFFF;
410
411    my %prop;
412    my $char = chr($code);
413
414    @CATEGORIES =_read_table("To/Gc.pl") unless @CATEGORIES;
415    $prop{'category'} = _search(\@CATEGORIES, 0, $#CATEGORIES, $code)
416                        // $utf8::SwashInfo{'ToGc'}{'missing'};
417    # Return undef if category value is 'Unassigned' or one of its synonyms
418    return if grep { lc $_ eq 'unassigned' }
419                                    prop_value_aliases('Gc', $prop{'category'});
420
421    $prop{'code'} = sprintf "%04X", $code;
422    $prop{'name'} = ($char =~ /\p{Cntrl}/) ? '<control>'
423                                           : (charnames::viacode($code) // "");
424
425    $prop{'combining'} = getCombinClass($code);
426
427    @BIDIS =_read_table("To/Bc.pl") unless @BIDIS;
428    $prop{'bidi'} = _search(\@BIDIS, 0, $#BIDIS, $code)
429                    // $utf8::SwashInfo{'ToBc'}{'missing'};
430
431    # For most code points, we can just read in "unicore/Decomposition.pl", as
432    # its contents are exactly what should be output.  But that file doesn't
433    # contain the data for the Hangul syllable decompositions, which can be
434    # algorithmically computed, and NFD() does that, so we call NFD() for
435    # those.  We can't use NFD() for everything, as it does a complete
436    # recursive decomposition, and what this function has always done is to
437    # return what's in UnicodeData.txt which doesn't show that recursiveness.
438    # Fortunately, the NFD() of the Hanguls doesn't have any recursion
439    # issues.
440    # Having no decomposition implies an empty field; otherwise, all but
441    # "Canonical" imply a compatible decomposition, and the type is prefixed
442    # to that, as it is in UnicodeData.txt
443    UnicodeVersion() unless defined $v_unicode_version;
444    if ($v_unicode_version ge v2.0.0 && $char =~ $Hangul_Syllables_re) {
445        # The code points of the decomposition are output in standard Unicode
446        # hex format, separated by blanks.
447        $prop{'decomposition'} = join " ", map { sprintf("%04X", $_)}
448                                           unpack "U*", NFD($char);
449    }
450    else {
451        @DECOMPOSITIONS = _read_table("Decomposition.pl")
452                          unless @DECOMPOSITIONS;
453        $prop{'decomposition'} = _search(\@DECOMPOSITIONS, 0, $#DECOMPOSITIONS,
454                                                                $code) // "";
455    }
456
457    # Can use num() to get the numeric values, if any.
458    if (! defined (my $value = num($char))) {
459        $prop{'decimal'} = $prop{'digit'} = $prop{'numeric'} = "";
460    }
461    else {
462        if ($char =~ /\d/) {
463            $prop{'decimal'} = $prop{'digit'} = $prop{'numeric'} = $value;
464        }
465        else {
466
467            # For non-decimal-digits, we have to read in the Numeric type
468            # to distinguish them.  It is not just a matter of integer vs.
469            # rational, as some whole number values are not considered digits,
470            # e.g., TAMIL NUMBER TEN.
471            $prop{'decimal'} = "";
472
473            @NUMERIC_TYPES =_read_table("To/Nt.pl") unless @NUMERIC_TYPES;
474            if ((_search(\@NUMERIC_TYPES, 0, $#NUMERIC_TYPES, $code) // "")
475                eq 'Digit')
476            {
477                $prop{'digit'} = $prop{'numeric'} = $value;
478            }
479            else {
480                $prop{'digit'} = "";
481                $prop{'numeric'} = $real_to_rational{$value} // $value;
482            }
483        }
484    }
485
486    $prop{'mirrored'} = ($char =~ /\p{Bidi_Mirrored}/) ? 'Y' : 'N';
487
488    %UNICODE_1_NAMES =_read_table("To/Na1.pl", "use_hash") unless %UNICODE_1_NAMES;
489    $prop{'unicode10'} = $UNICODE_1_NAMES{$code} // "";
490
491    UnicodeVersion() unless defined $v_unicode_version;
492    if ($v_unicode_version ge v6.0.0) {
493        $prop{'comment'} = "";
494    }
495    else {
496        %ISO_COMMENT = _read_table("To/Isc.pl", "use_hash") unless %ISO_COMMENT;
497        $prop{'comment'} = (defined $ISO_COMMENT{$code})
498                           ? $ISO_COMMENT{$code}
499                           : "";
500    }
501
502    %SIMPLE_UPPER = _read_table("To/Uc.pl", "use_hash") unless %SIMPLE_UPPER;
503    $prop{'upper'} = (defined $SIMPLE_UPPER{$code})
504                     ? sprintf("%04X", $SIMPLE_UPPER{$code})
505                     : "";
506
507    %SIMPLE_LOWER = _read_table("To/Lc.pl", "use_hash") unless %SIMPLE_LOWER;
508    $prop{'lower'} = (defined $SIMPLE_LOWER{$code})
509                     ? sprintf("%04X", $SIMPLE_LOWER{$code})
510                     : "";
511
512    %SIMPLE_TITLE = _read_table("To/Tc.pl", "use_hash") unless %SIMPLE_TITLE;
513    $prop{'title'} = (defined $SIMPLE_TITLE{$code})
514                     ? sprintf("%04X", $SIMPLE_TITLE{$code})
515                     : "";
516
517    $prop{block}  = charblock($code);
518    $prop{script} = charscript($code);
519    return \%prop;
520}
521
522sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
523    my ($table, $lo, $hi, $code) = @_;
524
525    return if $lo > $hi;
526
527    my $mid = int(($lo+$hi) / 2);
528
529    if ($table->[$mid]->[0] < $code) {
530	if ($table->[$mid]->[1] >= $code) {
531	    return $table->[$mid]->[2];
532	} else {
533	    _search($table, $mid + 1, $hi, $code);
534	}
535    } elsif ($table->[$mid]->[0] > $code) {
536	_search($table, $lo, $mid - 1, $code);
537    } else {
538	return $table->[$mid]->[2];
539    }
540}
541
542sub _read_table ($;$) {
543
544    # Returns the contents of the mktables generated table file located at $1
545    # in the form of either an array of arrays or a hash, depending on if the
546    # optional second parameter is true (for hash return) or not.  In the case
547    # of a hash return, each key is a code point, and its corresponding value
548    # is what the table gives as the code point's corresponding value.  In the
549    # case of an array return, each outer array denotes a range with [0] the
550    # start point of that range; [1] the end point; and [2] the value that
551    # every code point in the range has.  The hash return is useful for fast
552    # lookup when the table contains only single code point ranges.  The array
553    # return takes much less memory when there are large ranges.
554    #
555    # This function has the side effect of setting
556    # $utf8::SwashInfo{$property}{'format'} to be the mktables format of the
557    #                                       table; and
558    # $utf8::SwashInfo{$property}{'missing'} to be the value for all entries
559    #                                        not listed in the table.
560    # where $property is the Unicode property name, preceded by 'To' for map
561    # properties., e.g., 'ToSc'.
562    #
563    # Table entries look like one of:
564    # 0000	0040	Common	# [65]
565    # 00AA		Latin
566
567    my $table = shift;
568    my $return_hash = shift;
569    $return_hash = 0 unless defined $return_hash;
570    my @return;
571    my %return;
572    local $_;
573    my $list = do "unicore/$table";
574
575    # Look up if this property requires adjustments, which we do below if it
576    # does.
577    require "unicore/Heavy.pl";
578    my $property = $table =~ s/\.pl//r;
579    $property = $utf8::file_to_swash_name{$property};
580    my $to_adjust = defined $property
581                    && $utf8::SwashInfo{$property}{'format'} =~ / ^ a /x;
582
583    for (split /^/m, $list) {
584        my ($start, $end, $value) = / ^ (.+?) \t (.*?) \t (.+?)
585                                        \s* ( \# .* )?  # Optional comment
586                                        $ /x;
587        my $decimal_start = hex $start;
588        my $decimal_end = ($end eq "") ? $decimal_start : hex $end;
589        $value = hex $value if $to_adjust
590                               && $utf8::SwashInfo{$property}{'format'} eq 'ax';
591        if ($return_hash) {
592            foreach my $i ($decimal_start .. $decimal_end) {
593                $return{$i} = ($to_adjust)
594                              ? $value + $i - $decimal_start
595                              : $value;
596            }
597        }
598        elsif (! $to_adjust
599               && @return
600               && $return[-1][1] == $decimal_start - 1
601               && $return[-1][2] eq $value)
602        {
603            # If this is merely extending the previous range, do just that.
604            $return[-1]->[1] = $decimal_end;
605        }
606        else {
607            push @return, [ $decimal_start, $decimal_end, $value ];
608        }
609    }
610    return ($return_hash) ? %return : @return;
611}
612
613sub charinrange {
614    my ($range, $arg) = @_;
615    my $code = _getcode($arg);
616    croak __PACKAGE__, "::charinrange: unknown code '$arg'"
617	unless defined $code;
618    _search($range, 0, $#$range, $code);
619}
620
621=head2 B<charprop()>
622
623    use Unicode::UCD 'charprop';
624
625    print charprop(0x41, "Gc"), "\n";
626    print charprop(0x61, "General_Category"), "\n";
627
628  prints
629    Lu
630    Ll
631
632This returns the value of the Unicode property given by the second parameter
633for the  L</code point argument> given by the first.
634
635The passed-in property may be specified as any of the synonyms returned by
636L</prop_aliases()>.
637
638The return value is always a scalar, either a string or a number.  For
639properties where there are synonyms for the values, the synonym returned by
640this function is the longest, most descriptive form, the one returned by
641L</prop_value_aliases()> when called in a scalar context.  Of course, you can
642call L</prop_value_aliases()> on the result to get other synonyms.
643
644The return values are more "cooked" than the L</charinfo()> ones.  For
645example, the C<"uc"> property value is the actual string containing the full
646uppercase mapping of the input code point.  You have to go to extra trouble
647with C<charinfo> to get this value from its C<upper> hash element when the
648full mapping differs from the simple one.
649
650Special note should be made of the return values for a few properties:
651
652=over
653
654=item Block
655
656The value returned is the new-style (see L</Old-style versus new-style block
657names>).
658
659=item Decomposition_Mapping
660
661Like L</charinfo()>, the result may be an intermediate decomposition whose
662components are also decomposable.  Use L<Unicode::Normalize> to get the final
663decomposition in one step.
664
665Unlike L</charinfo()>, this does not include the decomposition type.  Use the
666C<Decomposition_Type> property to get that.
667
668=item Name_Alias
669
670If the input code point's name has more than one synonym, they are returned
671joined into a single comma-separated string.
672
673=item Numeric_Value
674
675If the result is a fraction, it is converted into a floating point number to
676the accuracy of your platform.
677
678=item Script_Extensions
679
680If the result is multiple script names, they are returned joined into a single
681comma-separated string.
682
683=back
684
685When called with a property that is a Perl extension that isn't expressible in
686a compound form, this function currently returns C<undef>, as the only two
687possible values are I<true> or I<false> (1 or 0 I suppose).  This behavior may
688change in the future, so don't write code that relies on it.  C<Present_In> is
689a Perl extension that is expressible in a bipartite or compound form (for
690example, C<\p{Present_In=4.0}>), so C<charprop> accepts it.  But C<Any> is a
691Perl extension that isn't expressible that way, so C<charprop> returns
692C<undef> for it.  Also C<charprop> returns C<undef> for all Perl extensions
693that are internal-only.
694
695=cut
696
697sub charprop ($$) {
698    my ($input_cp, $prop) = @_;
699
700    my $cp = _getcode($input_cp);
701    croak __PACKAGE__, "::charprop: unknown code point '$input_cp'" unless defined $cp;
702
703    my ($list_ref, $map_ref, $format, $default)
704                                      = prop_invmap($prop);
705    return undef unless defined $list_ref;
706
707    my $i = search_invlist($list_ref, $cp);
708    croak __PACKAGE__, "::charprop: prop_invmap return is invalid for charprop('$input_cp', '$prop)" unless defined $i;
709
710    # $i is the index into both the inversion list and map of $cp.
711    my $map = $map_ref->[$i];
712
713    # Convert enumeration values to their most complete form.
714    if (! ref $map) {
715        my $long_form = prop_value_aliases($prop, $map);
716        $map = $long_form if defined $long_form;
717    }
718
719    if ($format =~ / ^ s /x) {  # Scalars
720        return join ",", @$map if ref $map; # Convert to scalar with comma
721                                            # separated array elements
722
723        # Resolve ambiguity as to whether an all digit value is a code point
724        # that should be converted to a character, or whether it is really
725        # just a number.  To do this, look at the default.  If it is a
726        # non-empty number, we can safely assume the result is also a number.
727        if ($map =~ / ^ \d+ $ /ax && $default !~ / ^ \d+ $ /ax) {
728            $map = chr $map;
729        }
730        elsif ($map =~ / ^ (?: Y | N ) $ /x) {
731
732            # prop_invmap() returns these values for properties that are Perl
733            # extensions.  But this is misleading.  For now, return undef for
734            # these, as currently documented.
735            undef $map unless
736                exists $Unicode::UCD::prop_aliases{utf8::_loose_name(lc $prop)};
737        }
738        return $map;
739    }
740    elsif ($format eq 'ar') {   # numbers, including rationals
741        my $offset = $cp - $list_ref->[$i];
742        return $map if $map =~ /nan/i;
743        return $map + $offset if $offset != 0;  # If needs adjustment
744        return eval $map;   # Convert e.g., 1/2 to 0.5
745    }
746    elsif ($format =~ /^a/) {   # Some entries need adjusting
747
748        # Linearize sequences into a string.
749        return join "", map { chr $_ } @$map if ref $map; # XXX && $format =~ /^ a [dl] /x;
750
751        return "" if $map eq "" && $format =~ /^a.*e/;
752
753        # These are all character mappings.  Return the chr if no adjustment
754        # is needed
755        return chr $cp if $map eq "0";
756
757        # Convert special entry.
758        if ($map eq '<hangul syllable>' && $format eq 'ad') {
759            use Unicode::Normalize qw(NFD);
760            return NFD(chr $cp);
761        }
762
763        # The rest need adjustment from the first entry in the inversion list
764        # corresponding to this map.
765        my $offset = $cp - $list_ref->[$i];
766        return chr($map + $cp - $list_ref->[$i]);
767    }
768    elsif ($format eq 'n') {    # The name property
769
770        # There are two special cases, handled here.
771        if ($map =~ / ( .+ ) <code\ point> $ /x) {
772            $map = sprintf("$1%04X", $cp);
773        }
774        elsif ($map eq '<hangul syllable>') {
775            $map = charnames::viacode($cp);
776        }
777        return $map;
778    }
779    else {
780        croak __PACKAGE__, "::charprop: Internal error: unknown format '$format'.  Please perlbug this";
781    }
782}
783
784=head2 B<charprops_all()>
785
786    use Unicode::UCD 'charprops_all';
787
788    my $%properties_of_A_hash_ref = charprops_all("U+41");
789
790This returns a reference to a hash whose keys are all the distinct Unicode (no
791Perl extension) properties, and whose values are the respective values for
792those properties for the input L</code point argument>.
793
794Each key is the property name in its longest, most descriptive form.  The
795values are what L</charprop()> would return.
796
797This function is expensive in time and memory.
798
799=cut
800
801sub charprops_all($) {
802    my $input_cp = shift;
803
804    my $cp = _getcode($input_cp);
805    croak __PACKAGE__, "::charprops_all: unknown code point '$input_cp'" unless defined $cp;
806
807    my %return;
808
809    require "unicore/UCD.pl";
810
811    foreach my $prop (keys %Unicode::UCD::prop_aliases) {
812
813        # Don't return a Perl extension.  (This is the only one that
814        # %prop_aliases has in it.)
815        next if $prop eq 'perldecimaldigit';
816
817        # Use long name for $prop in the hash
818        $return{scalar prop_aliases($prop)} = charprop($cp, $prop);
819    }
820
821    return \%return;
822}
823
824=head2 B<charblock()>
825
826    use Unicode::UCD 'charblock';
827
828    my $charblock = charblock(0x41);
829    my $charblock = charblock(1234);
830    my $charblock = charblock(0x263a);
831    my $charblock = charblock("U+263a");
832
833    my $range     = charblock('Armenian');
834
835With a L</code point argument> C<charblock()> returns the I<block> the code point
836belongs to, e.g.  C<Basic Latin>.  The old-style block name is returned (see
837L</Old-style versus new-style block names>).
838The L</prop_value_aliases()> function can be used to get all the synonyms
839of the block name.
840
841If the code point is unassigned, this returns the block it would belong to if
842it were assigned.  (If the Unicode version being used is so early as to not
843have blocks, all code points are considered to be in C<No_Block>.)
844
845See also L</Blocks versus Scripts>.
846
847If supplied with an argument that can't be a code point, C<charblock()> tries to
848do the opposite and interpret the argument as an old-style block name.  On an
849ASCII platform, the return value is a I<range set> with one range: an
850anonymous array with a single element that consists of another anonymous array
851whose first element is the first code point in the block, and whose second
852element is the final code point in the block.  On an EBCDIC
853platform, the first two Unicode blocks are not contiguous.  Their range sets
854are lists containing I<start-of-range>, I<end-of-range> code point pairs.  You
855can test whether a code point is in a range set using the L</charinrange()>
856function.  (To be precise, each I<range set> contains a third array element,
857after the range boundary ones: the old_style block name.)
858
859If the argument to C<charblock()> is not a known block, C<undef> is
860returned.
861
862=cut
863
864my @BLOCKS;
865my %BLOCKS;
866
867sub _charblocks {
868
869    # Can't read from the mktables table because it loses the hyphens in the
870    # original.
871    unless (@BLOCKS) {
872        UnicodeVersion() unless defined $v_unicode_version;
873        if ($v_unicode_version lt v2.0.0) {
874            my $subrange = [ 0, 0x10FFFF, 'No_Block' ];
875            push @BLOCKS, $subrange;
876            push @{$BLOCKS{'No_Block'}}, $subrange;
877        }
878        elsif (openunicode(\$BLOCKSFH, "Blocks.txt")) {
879	    local $_;
880	    local $/ = "\n";
881	    while (<$BLOCKSFH>) {
882
883                # Old versions used a different syntax to mark the range.
884                $_ =~ s/;\s+/../ if $v_unicode_version lt v3.1.0;
885
886		if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
887		    my ($lo, $hi) = (hex($1), hex($2));
888		    my $subrange = [ $lo, $hi, $3 ];
889		    push @BLOCKS, $subrange;
890		    push @{$BLOCKS{$3}}, $subrange;
891		}
892	    }
893	    close($BLOCKSFH);
894            if (! IS_ASCII_PLATFORM) {
895                # The first two blocks, through 0xFF, are wrong on EBCDIC
896                # platforms.
897
898                my @new_blocks = _read_table("To/Blk.pl");
899
900                # Get rid of the first two ranges in the Unicode version, and
901                # replace them with the ones computed by mktables.
902                shift @BLOCKS;
903                shift @BLOCKS;
904                delete $BLOCKS{'Basic Latin'};
905                delete $BLOCKS{'Latin-1 Supplement'};
906
907                # But there are multiple entries in the computed versions, and
908                # we change their names to (which we know) to be the old-style
909                # ones.
910                for my $i (0.. @new_blocks - 1) {
911                    if ($new_blocks[$i][2] =~ s/Basic_Latin/Basic Latin/
912                        or $new_blocks[$i][2] =~
913                                    s/Latin_1_Supplement/Latin-1 Supplement/)
914                    {
915                        push @{$BLOCKS{$new_blocks[$i][2]}}, $new_blocks[$i];
916                    }
917                    else {
918                        splice @new_blocks, $i;
919                        last;
920                    }
921                }
922                unshift @BLOCKS, @new_blocks;
923            }
924	}
925    }
926}
927
928sub charblock {
929    my $arg = shift;
930
931    _charblocks() unless @BLOCKS;
932
933    my $code = _getcode($arg);
934
935    if (defined $code) {
936	my $result = _search(\@BLOCKS, 0, $#BLOCKS, $code);
937        return $result if defined $result;
938        return 'No_Block';
939    }
940    elsif (exists $BLOCKS{$arg}) {
941        return _dclone $BLOCKS{$arg};
942    }
943
944    carp __PACKAGE__, "::charblock: unknown code '$arg'";
945    return;
946}
947
948=head2 B<charscript()>
949
950    use Unicode::UCD 'charscript';
951
952    my $charscript = charscript(0x41);
953    my $charscript = charscript(1234);
954    my $charscript = charscript("U+263a");
955
956    my $range      = charscript('Thai');
957
958With a L</code point argument>, C<charscript()> returns the I<script> the
959code point belongs to, e.g., C<Latin>, C<Greek>, C<Han>.
960If the code point is unassigned or the Unicode version being used is so early
961that it doesn't have scripts, this function returns C<"Unknown">.
962The L</prop_value_aliases()> function can be used to get all the synonyms
963of the script name.
964
965If supplied with an argument that can't be a code point, charscript() tries
966to do the opposite and interpret the argument as a script name. The
967return value is a I<range set>: an anonymous array of arrays that contain
968I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
969code point is in a range set using the L</charinrange()> function.
970(To be precise, each I<range set> contains a third array element,
971after the range boundary ones: the script name.)
972
973If the C<charscript()> argument is not a known script, C<undef> is returned.
974
975See also L</Blocks versus Scripts>.
976
977=cut
978
979my @SCRIPTS;
980my %SCRIPTS;
981
982sub _charscripts {
983    unless (@SCRIPTS) {
984        UnicodeVersion() unless defined $v_unicode_version;
985        if ($v_unicode_version lt v3.1.0) {
986            push @SCRIPTS, [ 0, 0x10FFFF, 'Unknown' ];
987        }
988        else {
989            @SCRIPTS =_read_table("To/Sc.pl");
990        }
991    }
992    foreach my $entry (@SCRIPTS) {
993        $entry->[2] =~ s/(_\w)/\L$1/g;  # Preserve old-style casing
994        push @{$SCRIPTS{$entry->[2]}}, $entry;
995    }
996}
997
998sub charscript {
999    my $arg = shift;
1000
1001    _charscripts() unless @SCRIPTS;
1002
1003    my $code = _getcode($arg);
1004
1005    if (defined $code) {
1006	my $result = _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
1007        return $result if defined $result;
1008        return $utf8::SwashInfo{'ToSc'}{'missing'};
1009    } elsif (exists $SCRIPTS{$arg}) {
1010        return _dclone $SCRIPTS{$arg};
1011    }
1012
1013    carp __PACKAGE__, "::charscript: unknown code '$arg'";
1014    return;
1015}
1016
1017=head2 B<charblocks()>
1018
1019    use Unicode::UCD 'charblocks';
1020
1021    my $charblocks = charblocks();
1022
1023C<charblocks()> returns a reference to a hash with the known block names
1024as the keys, and the code point ranges (see L</charblock()>) as the values.
1025
1026The names are in the old-style (see L</Old-style versus new-style block
1027names>).
1028
1029L<prop_invmap("block")|/prop_invmap()> can be used to get this same data in a
1030different type of data structure.
1031
1032L<prop_values("Block")|/prop_values()> can be used to get all
1033the known new-style block names as a list, without the code point ranges.
1034
1035See also L</Blocks versus Scripts>.
1036
1037=cut
1038
1039sub charblocks {
1040    _charblocks() unless %BLOCKS;
1041    return _dclone \%BLOCKS;
1042}
1043
1044=head2 B<charscripts()>
1045
1046    use Unicode::UCD 'charscripts';
1047
1048    my $charscripts = charscripts();
1049
1050C<charscripts()> returns a reference to a hash with the known script
1051names as the keys, and the code point ranges (see L</charscript()>) as
1052the values.
1053
1054L<prop_invmap("script")|/prop_invmap()> can be used to get this same data in a
1055different type of data structure.
1056
1057L<C<prop_values("Script")>|/prop_values()> can be used to get all
1058the known script names as a list, without the code point ranges.
1059
1060See also L</Blocks versus Scripts>.
1061
1062=cut
1063
1064sub charscripts {
1065    _charscripts() unless %SCRIPTS;
1066    return _dclone \%SCRIPTS;
1067}
1068
1069=head2 B<charinrange()>
1070
1071In addition to using the C<\p{Blk=...}> and C<\P{Blk=...}> constructs, you
1072can also test whether a code point is in the I<range> as returned by
1073L</charblock()> and L</charscript()> or as the values of the hash returned
1074by L</charblocks()> and L</charscripts()> by using C<charinrange()>:
1075
1076    use Unicode::UCD qw(charscript charinrange);
1077
1078    $range = charscript('Hiragana');
1079    print "looks like hiragana\n" if charinrange($range, $codepoint);
1080
1081=cut
1082
1083my %GENERAL_CATEGORIES =
1084 (
1085    'L'  =>         'Letter',
1086    'LC' =>         'CasedLetter',
1087    'Lu' =>         'UppercaseLetter',
1088    'Ll' =>         'LowercaseLetter',
1089    'Lt' =>         'TitlecaseLetter',
1090    'Lm' =>         'ModifierLetter',
1091    'Lo' =>         'OtherLetter',
1092    'M'  =>         'Mark',
1093    'Mn' =>         'NonspacingMark',
1094    'Mc' =>         'SpacingMark',
1095    'Me' =>         'EnclosingMark',
1096    'N'  =>         'Number',
1097    'Nd' =>         'DecimalNumber',
1098    'Nl' =>         'LetterNumber',
1099    'No' =>         'OtherNumber',
1100    'P'  =>         'Punctuation',
1101    'Pc' =>         'ConnectorPunctuation',
1102    'Pd' =>         'DashPunctuation',
1103    'Ps' =>         'OpenPunctuation',
1104    'Pe' =>         'ClosePunctuation',
1105    'Pi' =>         'InitialPunctuation',
1106    'Pf' =>         'FinalPunctuation',
1107    'Po' =>         'OtherPunctuation',
1108    'S'  =>         'Symbol',
1109    'Sm' =>         'MathSymbol',
1110    'Sc' =>         'CurrencySymbol',
1111    'Sk' =>         'ModifierSymbol',
1112    'So' =>         'OtherSymbol',
1113    'Z'  =>         'Separator',
1114    'Zs' =>         'SpaceSeparator',
1115    'Zl' =>         'LineSeparator',
1116    'Zp' =>         'ParagraphSeparator',
1117    'C'  =>         'Other',
1118    'Cc' =>         'Control',
1119    'Cf' =>         'Format',
1120    'Cs' =>         'Surrogate',
1121    'Co' =>         'PrivateUse',
1122    'Cn' =>         'Unassigned',
1123 );
1124
1125sub general_categories {
1126    return _dclone \%GENERAL_CATEGORIES;
1127}
1128
1129=head2 B<general_categories()>
1130
1131    use Unicode::UCD 'general_categories';
1132
1133    my $categories = general_categories();
1134
1135This returns a reference to a hash which has short
1136general category names (such as C<Lu>, C<Nd>, C<Zs>, C<S>) as keys and long
1137names (such as C<UppercaseLetter>, C<DecimalNumber>, C<SpaceSeparator>,
1138C<Symbol>) as values.  The hash is reversible in case you need to go
1139from the long names to the short names.  The general category is the
1140one returned from
1141L</charinfo()> under the C<category> key.
1142
1143The L</prop_values()> and L</prop_value_aliases()> functions can be used as an
1144alternative to this function; the first returning a simple list of the short
1145category names; and the second gets all the synonyms of a given category name.
1146
1147=cut
1148
1149my %BIDI_TYPES =
1150 (
1151   'L'   => 'Left-to-Right',
1152   'LRE' => 'Left-to-Right Embedding',
1153   'LRO' => 'Left-to-Right Override',
1154   'R'   => 'Right-to-Left',
1155   'AL'  => 'Right-to-Left Arabic',
1156   'RLE' => 'Right-to-Left Embedding',
1157   'RLO' => 'Right-to-Left Override',
1158   'PDF' => 'Pop Directional Format',
1159   'EN'  => 'European Number',
1160   'ES'  => 'European Number Separator',
1161   'ET'  => 'European Number Terminator',
1162   'AN'  => 'Arabic Number',
1163   'CS'  => 'Common Number Separator',
1164   'NSM' => 'Non-Spacing Mark',
1165   'BN'  => 'Boundary Neutral',
1166   'B'   => 'Paragraph Separator',
1167   'S'   => 'Segment Separator',
1168   'WS'  => 'Whitespace',
1169   'ON'  => 'Other Neutrals',
1170 );
1171
1172=head2 B<bidi_types()>
1173
1174    use Unicode::UCD 'bidi_types';
1175
1176    my $categories = bidi_types();
1177
1178This returns a reference to a hash which has the short
1179bidi (bidirectional) type names (such as C<L>, C<R>) as keys and long
1180names (such as C<Left-to-Right>, C<Right-to-Left>) as values.  The
1181hash is reversible in case you need to go from the long names to the
1182short names.  The bidi type is the one returned from
1183L</charinfo()>
1184under the C<bidi> key.  For the exact meaning of the various bidi classes
1185the Unicode TR9 is recommended reading:
1186L<http://www.unicode.org/reports/tr9/>
1187(as of Unicode 5.0.0)
1188
1189The L</prop_values()> and L</prop_value_aliases()> functions can be used as an
1190alternative to this function; the first returning a simple list of the short
1191bidi type names; and the second gets all the synonyms of a given bidi type
1192name.
1193
1194=cut
1195
1196sub bidi_types {
1197    return _dclone \%BIDI_TYPES;
1198}
1199
1200=head2 B<compexcl()>
1201
1202    use Unicode::UCD 'compexcl';
1203
1204    my $compexcl = compexcl(0x09dc);
1205
1206This routine returns C<undef> if the Unicode version being used is so early
1207that it doesn't have this property.
1208
1209C<compexcl()> is included for backwards
1210compatibility, but as of Perl 5.12 and more modern Unicode versions, for
1211most purposes it is probably more convenient to use one of the following
1212instead:
1213
1214    my $compexcl = chr(0x09dc) =~ /\p{Comp_Ex};
1215    my $compexcl = chr(0x09dc) =~ /\p{Full_Composition_Exclusion};
1216
1217or even
1218
1219    my $compexcl = chr(0x09dc) =~ /\p{CE};
1220    my $compexcl = chr(0x09dc) =~ /\p{Composition_Exclusion};
1221
1222The first two forms return B<true> if the L</code point argument> should not
1223be produced by composition normalization.  For the final two forms to return
1224B<true>, it is additionally required that this fact not otherwise be
1225determinable from the Unicode data base.
1226
1227This routine behaves identically to the final two forms.  That is,
1228it does not return B<true> if the code point has a decomposition
1229consisting of another single code point, nor if its decomposition starts
1230with a code point whose combining class is non-zero.  Code points that meet
1231either of these conditions should also not be produced by composition
1232normalization, which is probably why you should use the
1233C<Full_Composition_Exclusion> property instead, as shown above.
1234
1235The routine returns B<false> otherwise.
1236
1237=cut
1238
1239# Eval'd so can run on versions earlier than the property is available in
1240my $Composition_Exclusion_re = eval 'qr/\p{Composition_Exclusion}/';
1241
1242sub compexcl {
1243    my $arg  = shift;
1244    my $code = _getcode($arg);
1245    croak __PACKAGE__, "::compexcl: unknown code '$arg'"
1246	unless defined $code;
1247
1248    UnicodeVersion() unless defined $v_unicode_version;
1249    return if $v_unicode_version lt v3.0.0;
1250
1251    no warnings "non_unicode";     # So works on non-Unicode code points
1252    return chr($code) =~ $Composition_Exclusion_re
1253}
1254
1255=head2 B<casefold()>
1256
1257    use Unicode::UCD 'casefold';
1258
1259    my $casefold = casefold(0xDF);
1260    if (defined $casefold) {
1261        my @full_fold_hex = split / /, $casefold->{'full'};
1262        my $full_fold_string =
1263                    join "", map {chr(hex($_))} @full_fold_hex;
1264        my @turkic_fold_hex =
1265                        split / /, ($casefold->{'turkic'} ne "")
1266                                        ? $casefold->{'turkic'}
1267                                        : $casefold->{'full'};
1268        my $turkic_fold_string =
1269                        join "", map {chr(hex($_))} @turkic_fold_hex;
1270    }
1271    if (defined $casefold && $casefold->{'simple'} ne "") {
1272        my $simple_fold_hex = $casefold->{'simple'};
1273        my $simple_fold_string = chr(hex($simple_fold_hex));
1274    }
1275
1276This returns the (almost) locale-independent case folding of the
1277character specified by the L</code point argument>.  (Starting in Perl v5.16,
1278the core function C<fc()> returns the C<full> mapping (described below)
1279faster than this does, and for entire strings.)
1280
1281If there is no case folding for the input code point, C<undef> is returned.
1282
1283If there is a case folding for that code point, a reference to a hash
1284with the following fields is returned:
1285
1286=over
1287
1288=item B<code>
1289
1290the input native L</code point argument> expressed in hexadecimal, with
1291leading zeros
1292added if necessary to make it contain at least four hexdigits
1293
1294=item B<full>
1295
1296one or more codes (separated by spaces) that, taken in order, give the
1297code points for the case folding for I<code>.
1298Each has at least four hexdigits.
1299
1300=item B<simple>
1301
1302is empty, or is exactly one code with at least four hexdigits which can be used
1303as an alternative case folding when the calling program cannot cope with the
1304fold being a sequence of multiple code points.  If I<full> is just one code
1305point, then I<simple> equals I<full>.  If there is no single code point folding
1306defined for I<code>, then I<simple> is the empty string.  Otherwise, it is an
1307inferior, but still better-than-nothing alternative folding to I<full>.
1308
1309=item B<mapping>
1310
1311is the same as I<simple> if I<simple> is not empty, and it is the same as I<full>
1312otherwise.  It can be considered to be the simplest possible folding for
1313I<code>.  It is defined primarily for backwards compatibility.
1314
1315=item B<status>
1316
1317is C<C> (for C<common>) if the best possible fold is a single code point
1318(I<simple> equals I<full> equals I<mapping>).  It is C<S> if there are distinct
1319folds, I<simple> and I<full> (I<mapping> equals I<simple>).  And it is C<F> if
1320there is only a I<full> fold (I<mapping> equals I<full>; I<simple> is empty).
1321Note that this
1322describes the contents of I<mapping>.  It is defined primarily for backwards
1323compatibility.
1324
1325For Unicode versions between 3.1 and 3.1.1 inclusive, I<status> can also be
1326C<I> which is the same as C<C> but is a special case for dotted uppercase I and
1327dotless lowercase i:
1328
1329=over
1330
1331=item Z<>B<*> If you use this C<I> mapping
1332
1333the result is case-insensitive,
1334but dotless and dotted I's are not distinguished
1335
1336=item Z<>B<*> If you exclude this C<I> mapping
1337
1338the result is not fully case-insensitive, but
1339dotless and dotted I's are distinguished
1340
1341=back
1342
1343=item B<turkic>
1344
1345contains any special folding for Turkic languages.  For versions of Unicode
1346starting with 3.2, this field is empty unless I<code> has a different folding
1347in Turkic languages, in which case it is one or more codes (separated by
1348spaces) that, taken in order, give the code points for the case folding for
1349I<code> in those languages.
1350Each code has at least four hexdigits.
1351Note that this folding does not maintain canonical equivalence without
1352additional processing.
1353
1354For Unicode versions between 3.1 and 3.1.1 inclusive, this field is empty unless
1355there is a
1356special folding for Turkic languages, in which case I<status> is C<I>, and
1357I<mapping>, I<full>, I<simple>, and I<turkic> are all equal.
1358
1359=back
1360
1361Programs that want complete generality and the best folding results should use
1362the folding contained in the I<full> field.  But note that the fold for some
1363code points will be a sequence of multiple code points.
1364
1365Programs that can't cope with the fold mapping being multiple code points can
1366use the folding contained in the I<simple> field, with the loss of some
1367generality.  In Unicode 5.1, about 7% of the defined foldings have no single
1368code point folding.
1369
1370The I<mapping> and I<status> fields are provided for backwards compatibility for
1371existing programs.  They contain the same values as in previous versions of
1372this function.
1373
1374Locale is not completely independent.  The I<turkic> field contains results to
1375use when the locale is a Turkic language.
1376
1377For more information about case mappings see
1378L<http://www.unicode.org/unicode/reports/tr21>
1379
1380=cut
1381
1382my %CASEFOLD;
1383
1384sub _casefold {
1385    unless (%CASEFOLD) {   # Populate the hash
1386        my ($full_invlist_ref, $full_invmap_ref, undef, $default)
1387                                                = prop_invmap('Case_Folding');
1388
1389        # Use the recipe given in the prop_invmap() pod to convert the
1390        # inversion map into the hash.
1391        for my $i (0 .. @$full_invlist_ref - 1 - 1) {
1392            next if $full_invmap_ref->[$i] == $default;
1393            my $adjust = -1;
1394            for my $j ($full_invlist_ref->[$i] .. $full_invlist_ref->[$i+1] -1) {
1395                $adjust++;
1396                if (! ref $full_invmap_ref->[$i]) {
1397
1398                    # This is a single character mapping
1399                    $CASEFOLD{$j}{'status'} = 'C';
1400                    $CASEFOLD{$j}{'simple'}
1401                        = $CASEFOLD{$j}{'full'}
1402                        = $CASEFOLD{$j}{'mapping'}
1403                        = sprintf("%04X", $full_invmap_ref->[$i] + $adjust);
1404                    $CASEFOLD{$j}{'code'} = sprintf("%04X", $j);
1405                    $CASEFOLD{$j}{'turkic'} = "";
1406                }
1407                else {  # prop_invmap ensures that $adjust is 0 for a ref
1408                    $CASEFOLD{$j}{'status'} = 'F';
1409                    $CASEFOLD{$j}{'full'}
1410                    = $CASEFOLD{$j}{'mapping'}
1411                    = join " ", map { sprintf "%04X", $_ }
1412                                                    @{$full_invmap_ref->[$i]};
1413                    $CASEFOLD{$j}{'simple'} = "";
1414                    $CASEFOLD{$j}{'code'} = sprintf("%04X", $j);
1415                    $CASEFOLD{$j}{'turkic'} = "";
1416                }
1417            }
1418        }
1419
1420        # We have filled in the full mappings above, assuming there were no
1421        # simple ones for the ones with multi-character maps.  Now, we find
1422        # and fix the cases where that assumption was false.
1423        (my ($simple_invlist_ref, $simple_invmap_ref, undef), $default)
1424                                        = prop_invmap('Simple_Case_Folding');
1425        for my $i (0 .. @$simple_invlist_ref - 1 - 1) {
1426            next if $simple_invmap_ref->[$i] == $default;
1427            my $adjust = -1;
1428            for my $j ($simple_invlist_ref->[$i]
1429                       .. $simple_invlist_ref->[$i+1] -1)
1430            {
1431                $adjust++;
1432                next if $CASEFOLD{$j}{'status'} eq 'C';
1433                $CASEFOLD{$j}{'status'} = 'S';
1434                $CASEFOLD{$j}{'simple'}
1435                    = $CASEFOLD{$j}{'mapping'}
1436                    = sprintf("%04X", $simple_invmap_ref->[$i] + $adjust);
1437                $CASEFOLD{$j}{'code'} = sprintf("%04X", $j);
1438                $CASEFOLD{$j}{'turkic'} = "";
1439            }
1440        }
1441
1442        # We hard-code in the turkish rules
1443        UnicodeVersion() unless defined $v_unicode_version;
1444        if ($v_unicode_version ge v3.2.0) {
1445
1446            # These two code points should already have regular entries, so
1447            # just fill in the turkish fields
1448            $CASEFOLD{ord('I')}{'turkic'} = '0131';
1449            $CASEFOLD{0x130}{'turkic'} = sprintf "%04X", ord('i');
1450        }
1451        elsif ($v_unicode_version ge v3.1.0) {
1452
1453            # These two code points don't have entries otherwise.
1454            $CASEFOLD{0x130}{'code'} = '0130';
1455            $CASEFOLD{0x131}{'code'} = '0131';
1456            $CASEFOLD{0x130}{'status'} = $CASEFOLD{0x131}{'status'} = 'I';
1457            $CASEFOLD{0x130}{'turkic'}
1458                = $CASEFOLD{0x130}{'mapping'}
1459                = $CASEFOLD{0x130}{'full'}
1460                = $CASEFOLD{0x130}{'simple'}
1461                = $CASEFOLD{0x131}{'turkic'}
1462                = $CASEFOLD{0x131}{'mapping'}
1463                = $CASEFOLD{0x131}{'full'}
1464                = $CASEFOLD{0x131}{'simple'}
1465                = sprintf "%04X", ord('i');
1466        }
1467    }
1468}
1469
1470sub casefold {
1471    my $arg  = shift;
1472    my $code = _getcode($arg);
1473    croak __PACKAGE__, "::casefold: unknown code '$arg'"
1474	unless defined $code;
1475
1476    _casefold() unless %CASEFOLD;
1477
1478    return $CASEFOLD{$code};
1479}
1480
1481=head2 B<all_casefolds()>
1482
1483
1484    use Unicode::UCD 'all_casefolds';
1485
1486    my $all_folds_ref = all_casefolds();
1487    foreach my $char_with_casefold (sort { $a <=> $b }
1488                                    keys %$all_folds_ref)
1489    {
1490        printf "%04X:", $char_with_casefold;
1491        my $casefold = $all_folds_ref->{$char_with_casefold};
1492
1493        # Get folds for $char_with_casefold
1494
1495        my @full_fold_hex = split / /, $casefold->{'full'};
1496        my $full_fold_string =
1497                    join "", map {chr(hex($_))} @full_fold_hex;
1498        print " full=", join " ", @full_fold_hex;
1499        my @turkic_fold_hex =
1500                        split / /, ($casefold->{'turkic'} ne "")
1501                                        ? $casefold->{'turkic'}
1502                                        : $casefold->{'full'};
1503        my $turkic_fold_string =
1504                        join "", map {chr(hex($_))} @turkic_fold_hex;
1505        print "; turkic=", join " ", @turkic_fold_hex;
1506        if (defined $casefold && $casefold->{'simple'} ne "") {
1507            my $simple_fold_hex = $casefold->{'simple'};
1508            my $simple_fold_string = chr(hex($simple_fold_hex));
1509            print "; simple=$simple_fold_hex";
1510        }
1511        print "\n";
1512    }
1513
1514This returns all the case foldings in the current version of Unicode in the
1515form of a reference to a hash.  Each key to the hash is the decimal
1516representation of a Unicode character that has a casefold to other than
1517itself.  The casefold of a semi-colon is itself, so it isn't in the hash;
1518likewise for a lowercase "a", but there is an entry for a capital "A".  The
1519hash value for each key is another hash, identical to what is returned by
1520L</casefold()> if called with that code point as its argument.  So the value
1521C<< all_casefolds()->{ord("A")}' >> is equivalent to C<casefold(ord("A"))>;
1522
1523=cut
1524
1525sub all_casefolds () {
1526    _casefold() unless %CASEFOLD;
1527    return _dclone \%CASEFOLD;
1528}
1529
1530=head2 B<casespec()>
1531
1532    use Unicode::UCD 'casespec';
1533
1534    my $casespec = casespec(0xFB00);
1535
1536This returns the potentially locale-dependent case mappings of the L</code point
1537argument>.  The mappings may be longer than a single code point (which the basic
1538Unicode case mappings as returned by L</charinfo()> never are).
1539
1540If there are no case mappings for the L</code point argument>, or if all three
1541possible mappings (I<lower>, I<title> and I<upper>) result in single code
1542points and are locale independent and unconditional, C<undef> is returned
1543(which means that the case mappings, if any, for the code point are those
1544returned by L</charinfo()>).
1545
1546Otherwise, a reference to a hash giving the mappings (or a reference to a hash
1547of such hashes, explained below) is returned with the following keys and their
1548meanings:
1549
1550The keys in the bottom layer hash with the meanings of their values are:
1551
1552=over
1553
1554=item B<code>
1555
1556the input native L</code point argument> expressed in hexadecimal, with
1557leading zeros
1558added if necessary to make it contain at least four hexdigits
1559
1560=item B<lower>
1561
1562one or more codes (separated by spaces) that, taken in order, give the
1563code points for the lower case of I<code>.
1564Each has at least four hexdigits.
1565
1566=item B<title>
1567
1568one or more codes (separated by spaces) that, taken in order, give the
1569code points for the title case of I<code>.
1570Each has at least four hexdigits.
1571
1572=item B<upper>
1573
1574one or more codes (separated by spaces) that, taken in order, give the
1575code points for the upper case of I<code>.
1576Each has at least four hexdigits.
1577
1578=item B<condition>
1579
1580the conditions for the mappings to be valid.
1581If C<undef>, the mappings are always valid.
1582When defined, this field is a list of conditions,
1583all of which must be true for the mappings to be valid.
1584The list consists of one or more
1585I<locales> (see below)
1586and/or I<contexts> (explained in the next paragraph),
1587separated by spaces.
1588(Other than as used to separate elements, spaces are to be ignored.)
1589Case distinctions in the condition list are not significant.
1590Conditions preceded by "NON_" represent the negation of the condition.
1591
1592A I<context> is one of those defined in the Unicode standard.
1593For Unicode 5.1, they are defined in Section 3.13 C<Default Case Operations>
1594available at
1595L<http://www.unicode.org/versions/Unicode5.1.0/>.
1596These are for context-sensitive casing.
1597
1598=back
1599
1600The hash described above is returned for locale-independent casing, where
1601at least one of the mappings has length longer than one.  If C<undef> is
1602returned, the code point may have mappings, but if so, all are length one,
1603and are returned by L</charinfo()>.
1604Note that when this function does return a value, it will be for the complete
1605set of mappings for a code point, even those whose length is one.
1606
1607If there are additional casing rules that apply only in certain locales,
1608an additional key for each will be defined in the returned hash.  Each such key
1609will be its locale name, defined as a 2-letter ISO 3166 country code, possibly
1610followed by a "_" and a 2-letter ISO language code (possibly followed by a "_"
1611and a variant code).  You can find the lists of all possible locales, see
1612L<Locale::Country> and L<Locale::Language>.
1613(In Unicode 6.0, the only locales returned by this function
1614are C<lt>, C<tr>, and C<az>.)
1615
1616Each locale key is a reference to a hash that has the form above, and gives
1617the casing rules for that particular locale, which take precedence over the
1618locale-independent ones when in that locale.
1619
1620If the only casing for a code point is locale-dependent, then the returned
1621hash will not have any of the base keys, like C<code>, C<upper>, etc., but
1622will contain only locale keys.
1623
1624For more information about case mappings see
1625L<http://www.unicode.org/unicode/reports/tr21/>
1626
1627=cut
1628
1629my %CASESPEC;
1630
1631sub _casespec {
1632    unless (%CASESPEC) {
1633        UnicodeVersion() unless defined $v_unicode_version;
1634        if ($v_unicode_version lt v2.1.8) {
1635            %CASESPEC = {};
1636        }
1637	elsif (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
1638	    local $_;
1639	    local $/ = "\n";
1640	    while (<$CASESPECFH>) {
1641		if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
1642
1643		    my ($hexcode, $lower, $title, $upper, $condition) =
1644			($1, $2, $3, $4, $5);
1645                    if (! IS_ASCII_PLATFORM) { # Remap entry to native
1646                        foreach my $var_ref (\$hexcode,
1647                                             \$lower,
1648                                             \$title,
1649                                             \$upper)
1650                        {
1651                            next unless defined $$var_ref;
1652                            $$var_ref = join " ",
1653                                        map { sprintf("%04X",
1654                                              utf8::unicode_to_native(hex $_)) }
1655                                        split " ", $$var_ref;
1656                        }
1657                    }
1658
1659		    my $code = hex($hexcode);
1660
1661                    # In 2.1.8, there were duplicate entries; ignore all but
1662                    # the first one -- there were no conditions in the file
1663                    # anyway.
1664		    if (exists $CASESPEC{$code} && $v_unicode_version ne v2.1.8)
1665                    {
1666			if (exists $CASESPEC{$code}->{code}) {
1667			    my ($oldlower,
1668				$oldtitle,
1669				$oldupper,
1670				$oldcondition) =
1671				    @{$CASESPEC{$code}}{qw(lower
1672							   title
1673							   upper
1674							   condition)};
1675			    if (defined $oldcondition) {
1676				my ($oldlocale) =
1677				($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
1678				delete $CASESPEC{$code};
1679				$CASESPEC{$code}->{$oldlocale} =
1680				{ code      => $hexcode,
1681				  lower     => $oldlower,
1682				  title     => $oldtitle,
1683				  upper     => $oldupper,
1684				  condition => $oldcondition };
1685			    }
1686			}
1687			my ($locale) =
1688			    ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
1689			$CASESPEC{$code}->{$locale} =
1690			{ code      => $hexcode,
1691			  lower     => $lower,
1692			  title     => $title,
1693			  upper     => $upper,
1694			  condition => $condition };
1695		    } else {
1696			$CASESPEC{$code} =
1697			{ code      => $hexcode,
1698			  lower     => $lower,
1699			  title     => $title,
1700			  upper     => $upper,
1701			  condition => $condition };
1702		    }
1703		}
1704	    }
1705	    close($CASESPECFH);
1706	}
1707    }
1708}
1709
1710sub casespec {
1711    my $arg  = shift;
1712    my $code = _getcode($arg);
1713    croak __PACKAGE__, "::casespec: unknown code '$arg'"
1714	unless defined $code;
1715
1716    _casespec() unless %CASESPEC;
1717
1718    return ref $CASESPEC{$code} ? _dclone $CASESPEC{$code} : $CASESPEC{$code};
1719}
1720
1721=head2 B<namedseq()>
1722
1723    use Unicode::UCD 'namedseq';
1724
1725    my $namedseq = namedseq("KATAKANA LETTER AINU P");
1726    my @namedseq = namedseq("KATAKANA LETTER AINU P");
1727    my %namedseq = namedseq();
1728
1729If used with a single argument in a scalar context, returns the string
1730consisting of the code points of the named sequence, or C<undef> if no
1731named sequence by that name exists.  If used with a single argument in
1732a list context, it returns the list of the ordinals of the code points.
1733
1734If used with no
1735arguments in a list context, it returns a hash with the names of all the
1736named sequences as the keys and their sequences as strings as
1737the values.  Otherwise, it returns C<undef> or an empty list depending
1738on the context.
1739
1740This function only operates on officially approved (not provisional) named
1741sequences.
1742
1743Note that as of Perl 5.14, C<\N{KATAKANA LETTER AINU P}> will insert the named
1744sequence into double-quoted strings, and C<charnames::string_vianame("KATAKANA
1745LETTER AINU P")> will return the same string this function does, but will also
1746operate on character names that aren't named sequences, without you having to
1747know which are which.  See L<charnames>.
1748
1749=cut
1750
1751my %NAMEDSEQ;
1752
1753sub _namedseq {
1754    unless (%NAMEDSEQ) {
1755	if (openunicode(\$NAMEDSEQFH, "Name.pl")) {
1756	    local $_;
1757	    local $/ = "\n";
1758	    while (<$NAMEDSEQFH>) {
1759		if (/^ [0-9A-F]+ \  /x) {
1760                    chomp;
1761                    my ($sequence, $name) = split /\t/;
1762		    my @s = map { chr(hex($_)) } split(' ', $sequence);
1763		    $NAMEDSEQ{$name} = join("", @s);
1764		}
1765	    }
1766	    close($NAMEDSEQFH);
1767	}
1768    }
1769}
1770
1771sub namedseq {
1772
1773    # Use charnames::string_vianame() which now returns this information,
1774    # unless the caller wants the hash returned, in which case we read it in,
1775    # and thereafter use it instead of calling charnames, as it is faster.
1776
1777    my $wantarray = wantarray();
1778    if (defined $wantarray) {
1779	if ($wantarray) {
1780	    if (@_ == 0) {
1781                _namedseq() unless %NAMEDSEQ;
1782		return %NAMEDSEQ;
1783	    } elsif (@_ == 1) {
1784		my $s;
1785                if (%NAMEDSEQ) {
1786                    $s = $NAMEDSEQ{ $_[0] };
1787                }
1788                else {
1789                    $s = charnames::string_vianame($_[0]);
1790                }
1791		return defined $s ? map { ord($_) } split('', $s) : ();
1792	    }
1793	} elsif (@_ == 1) {
1794            return $NAMEDSEQ{ $_[0] } if %NAMEDSEQ;
1795            return charnames::string_vianame($_[0]);
1796	}
1797    }
1798    return;
1799}
1800
1801my %NUMERIC;
1802
1803sub _numeric {
1804    my @numbers = _read_table("To/Nv.pl");
1805    foreach my $entry (@numbers) {
1806        my ($start, $end, $value) = @$entry;
1807
1808        # If value contains a slash, convert to decimal, add a reverse hash
1809        # used by charinfo.
1810        if ((my @rational = split /\//, $value) == 2) {
1811            my $real = $rational[0] / $rational[1];
1812            $real_to_rational{$real} = $value;
1813            $value = $real;
1814
1815            # Should only be single element, but just in case...
1816            for my $i ($start .. $end) {
1817                $NUMERIC{$i} = $value;
1818            }
1819        }
1820        else {
1821            # The values require adjusting, as is in 'a' format
1822            for my $i ($start .. $end) {
1823                $NUMERIC{$i} = $value + $i - $start;
1824            }
1825        }
1826    }
1827
1828    # Decided unsafe to use these that aren't officially part of the Unicode
1829    # standard.
1830    #use Math::Trig;
1831    #my $pi = acos(-1.0);
1832    #$NUMERIC{0x03C0} = $pi;
1833
1834    # Euler's constant, not to be confused with Euler's number
1835    #$NUMERIC{0x2107} = 0.57721566490153286060651209008240243104215933593992;
1836
1837    # Euler's number
1838    #$NUMERIC{0x212F} = 2.7182818284590452353602874713526624977572;
1839
1840    return;
1841}
1842
1843=pod
1844
1845=head2 B<num()>
1846
1847    use Unicode::UCD 'num';
1848
1849    my $val = num("123");
1850    my $one_quarter = num("\N{VULGAR FRACTION 1/4}");
1851
1852C<num()> returns the numeric value of the input Unicode string; or C<undef> if it
1853doesn't think the entire string has a completely valid, safe numeric value.
1854
1855If the string is just one character in length, the Unicode numeric value
1856is returned if it has one, or C<undef> otherwise.  Note that this need
1857not be a whole number.  C<num("\N{TIBETAN DIGIT HALF ZERO}")>, for
1858example returns -0.5.
1859
1860=cut
1861
1862#A few characters to which Unicode doesn't officially
1863#assign a numeric value are considered numeric by C<num>.
1864#These are:
1865
1866# EULER CONSTANT             0.5772...  (this is NOT Euler's number)
1867# SCRIPT SMALL E             2.71828... (this IS Euler's number)
1868# GREEK SMALL LETTER PI      3.14159...
1869
1870=pod
1871
1872If the string is more than one character, C<undef> is returned unless
1873all its characters are decimal digits (that is, they would match C<\d+>),
1874from the same script.  For example if you have an ASCII '0' and a Bengali
1875'3', mixed together, they aren't considered a valid number, and C<undef>
1876is returned.  A further restriction is that the digits all have to be of
1877the same form.  A half-width digit mixed with a full-width one will
1878return C<undef>.  The Arabic script has two sets of digits;  C<num> will
1879return C<undef> unless all the digits in the string come from the same
1880set.
1881
1882C<num> errs on the side of safety, and there may be valid strings of
1883decimal digits that it doesn't recognize.  Note that Unicode defines
1884a number of "digit" characters that aren't "decimal digit" characters.
1885"Decimal digits" have the property that they have a positional value, i.e.,
1886there is a units position, a 10's position, a 100's, etc, AND they are
1887arranged in Unicode in blocks of 10 contiguous code points.  The Chinese
1888digits, for example, are not in such a contiguous block, and so Unicode
1889doesn't view them as decimal digits, but merely digits, and so C<\d> will not
1890match them.  A single-character string containing one of these digits will
1891have its decimal value returned by C<num>, but any longer string containing
1892only these digits will return C<undef>.
1893
1894Strings of multiple sub- and superscripts are not recognized as numbers.  You
1895can use either of the compatibility decompositions in Unicode::Normalize to
1896change these into digits, and then call C<num> on the result.
1897
1898=cut
1899
1900# To handle sub, superscripts, this could if called in list context,
1901# consider those, and return the <decomposition> type in the second
1902# array element.
1903
1904sub num {
1905    my $string = $_[0];
1906
1907    _numeric unless %NUMERIC;
1908
1909    my $length = length($string);
1910    return $NUMERIC{ord($string)} if $length == 1;
1911    return if $string =~ /\D/;
1912    my $first_ord = ord(substr($string, 0, 1));
1913    my $value = $NUMERIC{$first_ord};
1914
1915    # To be a valid decimal number, it should be in a block of 10 consecutive
1916    # characters, whose values are 0, 1, 2, ... 9.  Therefore this digit's
1917    # value is its offset in that block from the character that means zero.
1918    my $zero_ord = $first_ord - $value;
1919
1920    # Unicode 6.0 instituted the rule that only digits in a consecutive
1921    # block of 10 would be considered decimal digits.  If this is an earlier
1922    # release, we verify that this first character is a member of such a
1923    # block.  That is, that the block of characters surrounding this one
1924    # consists of all \d characters whose numeric values are the expected
1925    # ones.
1926    UnicodeVersion() unless defined $v_unicode_version;
1927    if ($v_unicode_version lt v6.0.0) {
1928        for my $i (0 .. 9) {
1929            my $ord = $zero_ord + $i;
1930            return unless chr($ord) =~ /\d/;
1931            my $numeric = $NUMERIC{$ord};
1932            return unless defined $numeric;
1933            return unless $numeric == $i;
1934        }
1935    }
1936
1937    for my $i (1 .. $length -1) {
1938
1939        # Here we know either by verifying, or by fact of the first character
1940        # being a \d in Unicode 6.0 or later, that any character between the
1941        # character that means 0, and 9 positions above it must be \d, and
1942        # must have its value correspond to its offset from the zero.  Any
1943        # characters outside these 10 do not form a legal number for this
1944        # function.
1945        my $ord = ord(substr($string, $i, 1));
1946        my $digit = $ord - $zero_ord;
1947        return unless $digit >= 0 && $digit <= 9;
1948        $value = $value * 10 + $digit;
1949    }
1950
1951    return $value;
1952}
1953
1954=pod
1955
1956=head2 B<prop_aliases()>
1957
1958    use Unicode::UCD 'prop_aliases';
1959
1960    my ($short_name, $full_name, @other_names) = prop_aliases("space");
1961    my $same_full_name = prop_aliases("Space");     # Scalar context
1962    my ($same_short_name) = prop_aliases("Space");  # gets 0th element
1963    print "The full name is $full_name\n";
1964    print "The short name is $short_name\n";
1965    print "The other aliases are: ", join(", ", @other_names), "\n";
1966
1967    prints:
1968    The full name is White_Space
1969    The short name is WSpace
1970    The other aliases are: Space
1971
1972Most Unicode properties have several synonymous names.  Typically, there is at
1973least a short name, convenient to type, and a long name that more fully
1974describes the property, and hence is more easily understood.
1975
1976If you know one name for a Unicode property, you can use C<prop_aliases> to find
1977either the long name (when called in scalar context), or a list of all of the
1978names, somewhat ordered so that the short name is in the 0th element, the long
1979name in the next element, and any other synonyms are in the remaining
1980elements, in no particular order.
1981
1982The long name is returned in a form nicely capitalized, suitable for printing.
1983
1984The input parameter name is loosely matched, which means that white space,
1985hyphens, and underscores are ignored (except for the trailing underscore in
1986the old_form grandfathered-in C<"L_">, which is better written as C<"LC">, and
1987both of which mean C<General_Category=Cased Letter>).
1988
1989If the name is unknown, C<undef> is returned (or an empty list in list
1990context).  Note that Perl typically recognizes property names in regular
1991expressions with an optional C<"Is_>" (with or without the underscore)
1992prefixed to them, such as C<\p{isgc=punct}>.  This function does not recognize
1993those in the input, returning C<undef>.  Nor are they included in the output
1994as possible synonyms.
1995
1996C<prop_aliases> does know about the Perl extensions to Unicode properties,
1997such as C<Any> and C<XPosixAlpha>, and the single form equivalents to Unicode
1998properties such as C<XDigit>, C<Greek>, C<In_Greek>, and C<Is_Greek>.  The
1999final example demonstrates that the C<"Is_"> prefix is recognized for these
2000extensions; it is needed to resolve ambiguities.  For example,
2001C<prop_aliases('lc')> returns the list C<(lc, Lowercase_Mapping)>, but
2002C<prop_aliases('islc')> returns C<(Is_LC, Cased_Letter)>.  This is
2003because C<islc> is a Perl extension which is short for
2004C<General_Category=Cased Letter>.  The lists returned for the Perl extensions
2005will not include the C<"Is_"> prefix (whether or not the input had it) unless
2006needed to resolve ambiguities, as shown in the C<"islc"> example, where the
2007returned list had one element containing C<"Is_">, and the other without.
2008
2009It is also possible for the reverse to happen:  C<prop_aliases('isc')> returns
2010the list C<(isc, ISO_Comment)>; whereas C<prop_aliases('c')> returns
2011C<(C, Other)> (the latter being a Perl extension meaning
2012C<General_Category=Other>.
2013L<perluniprops/Properties accessible through Unicode::UCD> lists the available
2014forms, including which ones are discouraged from use.
2015
2016Those discouraged forms are accepted as input to C<prop_aliases>, but are not
2017returned in the lists.  C<prop_aliases('isL&')> and C<prop_aliases('isL_')>,
2018which are old synonyms for C<"Is_LC"> and should not be used in new code, are
2019examples of this.  These both return C<(Is_LC, Cased_Letter)>.  Thus this
2020function allows you to take a discouraged form, and find its acceptable
2021alternatives.  The same goes with single-form Block property equivalences.
2022Only the forms that begin with C<"In_"> are not discouraged; if you pass
2023C<prop_aliases> a discouraged form, you will get back the equivalent ones that
2024begin with C<"In_">.  It will otherwise look like a new-style block name (see.
2025L</Old-style versus new-style block names>).
2026
2027C<prop_aliases> does not know about any user-defined properties, and will
2028return C<undef> if called with one of those.  Likewise for Perl internal
2029properties, with the exception of "Perl_Decimal_Digit" which it does know
2030about (and which is documented below in L</prop_invmap()>).
2031
2032=cut
2033
2034# It may be that there are use cases where the discouraged forms should be
2035# returned.  If that comes up, an optional boolean second parameter to the
2036# function could be created, for example.
2037
2038# These are created by mktables for this routine and stored in unicore/UCD.pl
2039# where their structures are described.
2040our %string_property_loose_to_name;
2041our %ambiguous_names;
2042our %loose_perlprop_to_name;
2043our %prop_aliases;
2044
2045sub prop_aliases ($) {
2046    my $prop = $_[0];
2047    return unless defined $prop;
2048
2049    require "unicore/UCD.pl";
2050    require "unicore/Heavy.pl";
2051    require "utf8_heavy.pl";
2052
2053    # The property name may be loosely or strictly matched; we don't know yet.
2054    # But both types use lower-case.
2055    $prop = lc $prop;
2056
2057    # It is loosely matched if its lower case isn't known to be strict.
2058    my $list_ref;
2059    if (! exists $utf8::stricter_to_file_of{$prop}) {
2060        my $loose = utf8::_loose_name($prop);
2061
2062        # There is a hash that converts from any loose name to its standard
2063        # form, mapping all synonyms for a  name to one name that can be used
2064        # as a key into another hash.  The whole concept is for memory
2065        # savings, as the second hash doesn't have to have all the
2066        # combinations.  Actually, there are two hashes that do the
2067        # converstion.  One is used in utf8_heavy.pl (stored in Heavy.pl) for
2068        # looking up properties matchable in regexes.  This function needs to
2069        # access string properties, which aren't available in regexes, so a
2070        # second conversion hash is made for them (stored in UCD.pl).  Look in
2071        # the string one now, as the rest can have an optional 'is' prefix,
2072        # which these don't.
2073        if (exists $string_property_loose_to_name{$loose}) {
2074
2075            # Convert to its standard loose name.
2076            $prop = $string_property_loose_to_name{$loose};
2077        }
2078        else {
2079            my $retrying = 0;   # bool.  ? Has an initial 'is' been stripped
2080        RETRY:
2081            if (exists $utf8::loose_property_name_of{$loose}
2082                && (! $retrying
2083                    || ! exists $ambiguous_names{$loose}))
2084            {
2085                # Found an entry giving the standard form.  We don't get here
2086                # (in the test above) when we've stripped off an
2087                # 'is' and the result is an ambiguous name.  That is because
2088                # these are official Unicode properties (though Perl can have
2089                # an optional 'is' prefix meaning the official property), and
2090                # all ambiguous cases involve a Perl single-form extension
2091                # for the gc, script, or block properties, and the stripped
2092                # 'is' means that they mean one of those, and not one of
2093                # these
2094                $prop = $utf8::loose_property_name_of{$loose};
2095            }
2096            elsif (exists $loose_perlprop_to_name{$loose}) {
2097
2098                # This hash is specifically for this function to list Perl
2099                # extensions that aren't in the earlier hashes.  If there is
2100                # only one element, the short and long names are identical.
2101                # Otherwise the form is already in the same form as
2102                # %prop_aliases, which is handled at the end of the function.
2103                $list_ref = $loose_perlprop_to_name{$loose};
2104                if (@$list_ref == 1) {
2105                    my @list = ($list_ref->[0], $list_ref->[0]);
2106                    $list_ref = \@list;
2107                }
2108            }
2109            elsif (! exists $utf8::loose_to_file_of{$loose}) {
2110
2111                # loose_to_file_of is a complete list of loose names.  If not
2112                # there, the input is unknown.
2113                return;
2114            }
2115            elsif ($loose =~ / [:=] /x) {
2116
2117                # Here we found the name but not its aliases, so it has to
2118                # exist.  Exclude property-value combinations.  (This shows up
2119                # for something like ccc=vr which matches loosely, but is a
2120                # synonym for ccc=9 which matches only strictly.
2121                return;
2122            }
2123            else {
2124
2125                # Here it has to exist, and isn't a property-value
2126                # combination.  This means it must be one of the Perl
2127                # single-form extensions.  First see if it is for a
2128                # property-value combination in one of the following
2129                # properties.
2130                my @list;
2131                foreach my $property ("gc", "script") {
2132                    @list = prop_value_aliases($property, $loose);
2133                    last if @list;
2134                }
2135                if (@list) {
2136
2137                    # Here, it is one of those property-value combination
2138                    # single-form synonyms.  There are ambiguities with some
2139                    # of these.  Check against the list for these, and adjust
2140                    # if necessary.
2141                    for my $i (0 .. @list -1) {
2142                        if (exists $ambiguous_names
2143                                   {utf8::_loose_name(lc $list[$i])})
2144                        {
2145                            # The ambiguity is resolved by toggling whether or
2146                            # not it has an 'is' prefix
2147                            $list[$i] =~ s/^Is_// or $list[$i] =~ s/^/Is_/;
2148                        }
2149                    }
2150                    return @list;
2151                }
2152
2153                # Here, it wasn't one of the gc or script single-form
2154                # extensions.  It could be a block property single-form
2155                # extension.  An 'in' prefix definitely means that, and should
2156                # be looked up without the prefix.  However, starting in
2157                # Unicode 6.1, we have to special case 'indic...', as there
2158                # is a property that begins with that name.   We shouldn't
2159                # strip the 'in' from that.   I'm (khw) generalizing this to
2160                # 'indic' instead of the single property, because I suspect
2161                # that others of this class may come along in the future.
2162                # However, this could backfire and a block created whose name
2163                # begins with 'dic...', and we would want to strip the 'in'.
2164                # At which point this would have to be tweaked.
2165                my $began_with_in = $loose =~ s/^in(?!dic)//;
2166                @list = prop_value_aliases("block", $loose);
2167                if (@list) {
2168                    map { $_ =~ s/^/In_/ } @list;
2169                    return @list;
2170                }
2171
2172                # Here still haven't found it.  The last opportunity for it
2173                # being valid is only if it began with 'is'.  We retry without
2174                # the 'is', setting a flag to that effect so that we don't
2175                # accept things that begin with 'isis...'
2176                if (! $retrying && ! $began_with_in && $loose =~ s/^is//) {
2177                    $retrying = 1;
2178                    goto RETRY;
2179                }
2180
2181                # Here, didn't find it.  Since it was in %loose_to_file_of, we
2182                # should have been able to find it.
2183                carp __PACKAGE__, "::prop_aliases: Unexpectedly could not find '$prop'.  Send bug report to perlbug\@perl.org";
2184                return;
2185            }
2186        }
2187    }
2188
2189    if (! $list_ref) {
2190        # Here, we have set $prop to a standard form name of the input.  Look
2191        # it up in the structure created by mktables for this purpose, which
2192        # contains both strict and loosely matched properties.  Avoid
2193        # autovivifying.
2194        $list_ref = $prop_aliases{$prop} if exists $prop_aliases{$prop};
2195        return unless $list_ref;
2196    }
2197
2198    # The full name is in element 1.
2199    return $list_ref->[1] unless wantarray;
2200
2201    return @{_dclone $list_ref};
2202}
2203
2204=pod
2205
2206=head2 B<prop_values()>
2207
2208    use Unicode::UCD 'prop_values';
2209
2210    print "AHex values are: ", join(", ", prop_values("AHex")),
2211                               "\n";
2212  prints:
2213    AHex values are: N, Y
2214
2215Some Unicode properties have a restricted set of legal values.  For example,
2216all binary properties are restricted to just C<true> or C<false>; and there
2217are only a few dozen possible General Categories.  Use C<prop_values>
2218to find out if a given property is one such, and if so, to get a list of the
2219values:
2220
2221    print join ", ", prop_values("NFC_Quick_Check");
2222  prints:
2223    M, N, Y
2224
2225If the property doesn't have such a restricted set, C<undef> is returned.
2226
2227There are usually several synonyms for each possible value.  Use
2228L</prop_value_aliases()> to access those.
2229
2230Case, white space, hyphens, and underscores are ignored in the input property
2231name (except for the trailing underscore in the old-form grandfathered-in
2232general category property value C<"L_">, which is better written as C<"LC">).
2233
2234If the property name is unknown, C<undef> is returned.  Note that Perl typically
2235recognizes property names in regular expressions with an optional C<"Is_>"
2236(with or without the underscore) prefixed to them, such as C<\p{isgc=punct}>.
2237This function does not recognize those in the property parameter, returning
2238C<undef>.
2239
2240For the block property, new-style block names are returned (see
2241L</Old-style versus new-style block names>).
2242
2243C<prop_values> does not know about any user-defined properties, and
2244will return C<undef> if called with one of those.
2245
2246=cut
2247
2248# These are created by mktables for this module and stored in unicore/UCD.pl
2249# where their structures are described.
2250our %loose_to_standard_value;
2251our %prop_value_aliases;
2252
2253sub prop_values ($) {
2254    my $prop = shift;
2255    return undef unless defined $prop;
2256
2257    require "unicore/UCD.pl";
2258    require "utf8_heavy.pl";
2259
2260    # Find the property name synonym that's used as the key in other hashes,
2261    # which is element 0 in the returned list.
2262    ($prop) = prop_aliases($prop);
2263    return undef if ! $prop;
2264    $prop = utf8::_loose_name(lc $prop);
2265
2266    # Here is a legal property.
2267    return undef unless exists $prop_value_aliases{$prop};
2268    my @return;
2269    foreach my $value_key (sort { lc $a cmp lc $b }
2270                            keys %{$prop_value_aliases{$prop}})
2271    {
2272        push @return, $prop_value_aliases{$prop}{$value_key}[0];
2273    }
2274    return @return;
2275}
2276
2277=pod
2278
2279=head2 B<prop_value_aliases()>
2280
2281    use Unicode::UCD 'prop_value_aliases';
2282
2283    my ($short_name, $full_name, @other_names)
2284                                   = prop_value_aliases("Gc", "Punct");
2285    my $same_full_name = prop_value_aliases("Gc", "P");   # Scalar cntxt
2286    my ($same_short_name) = prop_value_aliases("Gc", "P"); # gets 0th
2287                                                           # element
2288    print "The full name is $full_name\n";
2289    print "The short name is $short_name\n";
2290    print "The other aliases are: ", join(", ", @other_names), "\n";
2291
2292  prints:
2293    The full name is Punctuation
2294    The short name is P
2295    The other aliases are: Punct
2296
2297Some Unicode properties have a restricted set of legal values.  For example,
2298all binary properties are restricted to just C<true> or C<false>; and there
2299are only a few dozen possible General Categories.
2300
2301You can use L</prop_values()> to find out if a given property is one which has
2302a restricted set of values, and if so, what those values are.  But usually
2303each value actually has several synonyms.  For example, in Unicode binary
2304properties, I<truth> can be represented by any of the strings "Y", "Yes", "T",
2305or "True"; and the General Category "Punctuation" by that string, or "Punct",
2306or simply "P".
2307
2308Like property names, there is typically at least a short name for each such
2309property-value, and a long name.  If you know any name of the property-value
2310(which you can get by L</prop_values()>, you can use C<prop_value_aliases>()
2311to get the long name (when called in scalar context), or a list of all the
2312names, with the short name in the 0th element, the long name in the next
2313element, and any other synonyms in the remaining elements, in no particular
2314order, except that any all-numeric synonyms will be last.
2315
2316The long name is returned in a form nicely capitalized, suitable for printing.
2317
2318Case, white space, hyphens, and underscores are ignored in the input parameters
2319(except for the trailing underscore in the old-form grandfathered-in general
2320category property value C<"L_">, which is better written as C<"LC">).
2321
2322If either name is unknown, C<undef> is returned.  Note that Perl typically
2323recognizes property names in regular expressions with an optional C<"Is_>"
2324(with or without the underscore) prefixed to them, such as C<\p{isgc=punct}>.
2325This function does not recognize those in the property parameter, returning
2326C<undef>.
2327
2328If called with a property that doesn't have synonyms for its values, it
2329returns the input value, possibly normalized with capitalization and
2330underscores, but not necessarily checking that the input value is valid.
2331
2332For the block property, new-style block names are returned (see
2333L</Old-style versus new-style block names>).
2334
2335To find the synonyms for single-forms, such as C<\p{Any}>, use
2336L</prop_aliases()> instead.
2337
2338C<prop_value_aliases> does not know about any user-defined properties, and
2339will return C<undef> if called with one of those.
2340
2341=cut
2342
2343sub prop_value_aliases ($$) {
2344    my ($prop, $value) = @_;
2345    return unless defined $prop && defined $value;
2346
2347    require "unicore/UCD.pl";
2348    require "utf8_heavy.pl";
2349
2350    # Find the property name synonym that's used as the key in other hashes,
2351    # which is element 0 in the returned list.
2352    ($prop) = prop_aliases($prop);
2353    return if ! $prop;
2354    $prop = utf8::_loose_name(lc $prop);
2355
2356    # Here is a legal property, but the hash below (created by mktables for
2357    # this purpose) only knows about the properties that have a very finite
2358    # number of potential values, that is not ones whose value could be
2359    # anything, like most (if not all) string properties.  These don't have
2360    # synonyms anyway.  Simply return the input.  For example, there is no
2361    # synonym for ('Uppercase_Mapping', A').
2362    if (! exists $prop_value_aliases{$prop}) {
2363
2364        # Here, we have a legal property, but an unknown value.  Since the
2365        # property is legal, if it isn't in the prop_aliases hash, it must be
2366        # a Perl-extension All perl extensions are binary, hence are
2367        # enumerateds, which means that we know that the input unknown value
2368        # is illegal.
2369        return if ! exists $Unicode::UCD::prop_aliases{$prop};
2370
2371        # Otherwise, we assume it's valid, as documented.
2372        return $value;
2373    }
2374
2375    # The value name may be loosely or strictly matched; we don't know yet.
2376    # But both types use lower-case.
2377    $value = lc $value;
2378
2379    # If the name isn't found under loose matching, it certainly won't be
2380    # found under strict
2381    my $loose_value = utf8::_loose_name($value);
2382    return unless exists $loose_to_standard_value{"$prop=$loose_value"};
2383
2384    # Similarly if the combination under loose matching doesn't exist, it
2385    # won't exist under strict.
2386    my $standard_value = $loose_to_standard_value{"$prop=$loose_value"};
2387    return unless exists $prop_value_aliases{$prop}{$standard_value};
2388
2389    # Here we did find a combination under loose matching rules.  But it could
2390    # be that is a strict property match that shouldn't have matched.
2391    # %prop_value_aliases is set up so that the strict matches will appear as
2392    # if they were in loose form.  Thus, if the non-loose version is legal,
2393    # we're ok, can skip the further check.
2394    if (! exists $utf8::stricter_to_file_of{"$prop=$value"}
2395
2396        # We're also ok and skip the further check if value loosely matches.
2397        # mktables has verified that no strict name under loose rules maps to
2398        # an existing loose name.  This code relies on the very limited
2399        # circumstances that strict names can be here.  Strict name matching
2400        # happens under two conditions:
2401        # 1) when the name begins with an underscore.  But this function
2402        #    doesn't accept those, and %prop_value_aliases doesn't have
2403        #    them.
2404        # 2) When the values are numeric, in which case we need to look
2405        #    further, but their squeezed-out loose values will be in
2406        #    %stricter_to_file_of
2407        && exists $utf8::stricter_to_file_of{"$prop=$loose_value"})
2408    {
2409        # The only thing that's legal loosely under strict is that can have an
2410        # underscore between digit pairs XXX
2411        while ($value =~ s/(\d)_(\d)/$1$2/g) {}
2412        return unless exists $utf8::stricter_to_file_of{"$prop=$value"};
2413    }
2414
2415    # Here, we know that the combination exists.  Return it.
2416    my $list_ref = $prop_value_aliases{$prop}{$standard_value};
2417    if (@$list_ref > 1) {
2418        # The full name is in element 1.
2419        return $list_ref->[1] unless wantarray;
2420
2421        return @{_dclone $list_ref};
2422    }
2423
2424    return $list_ref->[0] unless wantarray;
2425
2426    # Only 1 element means that it repeats
2427    return ( $list_ref->[0], $list_ref->[0] );
2428}
2429
2430# All 1 bits is the largest possible UV.
2431$Unicode::UCD::MAX_CP = ~0;
2432
2433=pod
2434
2435=head2 B<prop_invlist()>
2436
2437C<prop_invlist> returns an inversion list (described below) that defines all the
2438code points for the binary Unicode property (or "property=value" pair) given
2439by the input parameter string:
2440
2441 use feature 'say';
2442 use Unicode::UCD 'prop_invlist';
2443 say join ", ", prop_invlist("Any");
2444
2445 prints:
2446 0, 1114112
2447
2448If the input is unknown C<undef> is returned in scalar context; an empty-list
2449in list context.  If the input is known, the number of elements in
2450the list is returned if called in scalar context.
2451
2452L<perluniprops|perluniprops/Properties accessible through \p{} and \P{}> gives
2453the list of properties that this function accepts, as well as all the possible
2454forms for them (including with the optional "Is_" prefixes).  (Except this
2455function doesn't accept any Perl-internal properties, some of which are listed
2456there.) This function uses the same loose or tighter matching rules for
2457resolving the input property's name as is done for regular expressions.  These
2458are also specified in L<perluniprops|perluniprops/Properties accessible
2459through \p{} and \P{}>.  Examples of using the "property=value" form are:
2460
2461 say join ", ", prop_invlist("Script=Shavian");
2462
2463 prints:
2464 66640, 66688
2465
2466 say join ", ", prop_invlist("ASCII_Hex_Digit=No");
2467
2468 prints:
2469 0, 48, 58, 65, 71, 97, 103
2470
2471 say join ", ", prop_invlist("ASCII_Hex_Digit=Yes");
2472
2473 prints:
2474 48, 58, 65, 71, 97, 103
2475
2476Inversion lists are a compact way of specifying Unicode property-value
2477definitions.  The 0th item in the list is the lowest code point that has the
2478property-value.  The next item (item [1]) is the lowest code point beyond that
2479one that does NOT have the property-value.  And the next item beyond that
2480([2]) is the lowest code point beyond that one that does have the
2481property-value, and so on.  Put another way, each element in the list gives
2482the beginning of a range that has the property-value (for even numbered
2483elements), or doesn't have the property-value (for odd numbered elements).
2484The name for this data structure stems from the fact that each element in the
2485list toggles (or inverts) whether the corresponding range is or isn't on the
2486list.
2487
2488In the final example above, the first ASCII Hex digit is code point 48, the
2489character "0", and all code points from it through 57 (a "9") are ASCII hex
2490digits.  Code points 58 through 64 aren't, but 65 (an "A") through 70 (an "F")
2491are, as are 97 ("a") through 102 ("f").  103 starts a range of code points
2492that aren't ASCII hex digits.  That range extends to infinity, which on your
2493computer can be found in the variable C<$Unicode::UCD::MAX_CP>.  (This
2494variable is as close to infinity as Perl can get on your platform, and may be
2495too high for some operations to work; you may wish to use a smaller number for
2496your purposes.)
2497
2498Note that the inversion lists returned by this function can possibly include
2499non-Unicode code points, that is anything above 0x10FFFF.  Unicode properties
2500are not defined on such code points.  You might wish to change the output to
2501not include these.  Simply add 0x110000 at the end of the non-empty returned
2502list if it isn't already that value; and pop that value if it is; like:
2503
2504 my @list = prop_invlist("foo");
2505 if (@list) {
2506     if ($list[-1] == 0x110000) {
2507         pop @list;  # Defeat the turning on for above Unicode
2508     }
2509     else {
2510         push @list, 0x110000; # Turn off for above Unicode
2511     }
2512 }
2513
2514It is a simple matter to expand out an inversion list to a full list of all
2515code points that have the property-value:
2516
2517 my @invlist = prop_invlist($property_name);
2518 die "empty" unless @invlist;
2519 my @full_list;
2520 for (my $i = 0; $i < @invlist; $i += 2) {
2521    my $upper = ($i + 1) < @invlist
2522                ? $invlist[$i+1] - 1      # In range
2523                : $Unicode::UCD::MAX_CP;  # To infinity.  You may want
2524                                          # to stop much much earlier;
2525                                          # going this high may expose
2526                                          # perl deficiencies with very
2527                                          # large numbers.
2528    for my $j ($invlist[$i] .. $upper) {
2529        push @full_list, $j;
2530    }
2531 }
2532
2533C<prop_invlist> does not know about any user-defined nor Perl internal-only
2534properties, and will return C<undef> if called with one of those.
2535
2536The L</search_invlist()> function is provided for finding a code point within
2537an inversion list.
2538
2539=cut
2540
2541# User-defined properties could be handled with some changes to utf8_heavy.pl;
2542# and implementing here of dealing with EXTRAS.  If done, consideration should
2543# be given to the fact that the user subroutine could return different results
2544# with each call; security issues need to be thought about.
2545
2546# These are created by mktables for this routine and stored in unicore/UCD.pl
2547# where their structures are described.
2548our %loose_defaults;
2549our $MAX_UNICODE_CODEPOINT;
2550
2551sub prop_invlist ($;$) {
2552    my $prop = $_[0];
2553
2554    # Undocumented way to get at Perl internal properties; it may be changed
2555    # or removed without notice at any time.
2556    my $internal_ok = defined $_[1] && $_[1] eq '_perl_core_internal_ok';
2557
2558    return if ! defined $prop;
2559
2560    require "utf8_heavy.pl";
2561
2562    # Warnings for these are only for regexes, so not applicable to us
2563    no warnings 'deprecated';
2564
2565    # Get the swash definition of the property-value.
2566    my $swash = utf8::SWASHNEW(__PACKAGE__, $prop, undef, 1, 0);
2567
2568    # Fail if not found, or isn't a boolean property-value, or is a
2569    # user-defined property, or is internal-only.
2570    return if ! $swash
2571              || ref $swash eq ""
2572              || $swash->{'BITS'} != 1
2573              || $swash->{'USER_DEFINED'}
2574              || (! $internal_ok && $prop =~ /^\s*_/);
2575
2576    if ($swash->{'EXTRAS'}) {
2577        carp __PACKAGE__, "::prop_invlist: swash returned for $prop unexpectedly has EXTRAS magic";
2578        return;
2579    }
2580    if ($swash->{'SPECIALS'}) {
2581        carp __PACKAGE__, "::prop_invlist: swash returned for $prop unexpectedly has SPECIALS magic";
2582        return;
2583    }
2584
2585    my @invlist;
2586
2587    if ($swash->{'LIST'} =~ /^V/) {
2588
2589        # A 'V' as the first character marks the input as already an inversion
2590        # list, in which case, all we need to do is put the remaining lines
2591        # into our array.
2592        @invlist = split "\n", $swash->{'LIST'} =~ s/ \s* (?: \# .* )? $ //xmgr;
2593        shift @invlist;
2594    }
2595    else {
2596        # The input lines look like:
2597        # 0041\t005A   # [26]
2598        # 005F
2599
2600        # Split into lines, stripped of trailing comments
2601        foreach my $range (split "\n",
2602                              $swash->{'LIST'} =~ s/ \s* (?: \# .* )? $ //xmgr)
2603        {
2604            # And find the beginning and end of the range on the line
2605            my ($hex_begin, $hex_end) = split "\t", $range;
2606            my $begin = hex $hex_begin;
2607
2608            # If the new range merely extends the old, we remove the marker
2609            # created the last time through the loop for the old's end, which
2610            # causes the new one's end to be used instead.
2611            if (@invlist && $begin == $invlist[-1]) {
2612                pop @invlist;
2613            }
2614            else {
2615                # Add the beginning of the range
2616                push @invlist, $begin;
2617            }
2618
2619            if (defined $hex_end) { # The next item starts with the code point 1
2620                                    # beyond the end of the range.
2621                no warnings 'portable';
2622                my $end = hex $hex_end;
2623                last if $end == $Unicode::UCD::MAX_CP;
2624                push @invlist, $end + 1;
2625            }
2626            else {  # No end of range, is a single code point.
2627                push @invlist, $begin + 1;
2628            }
2629        }
2630    }
2631
2632    # Could need to be inverted: add or subtract a 0 at the beginning of the
2633    # list.
2634    if ($swash->{'INVERT_IT'}) {
2635        if (@invlist && $invlist[0] == 0) {
2636            shift @invlist;
2637        }
2638        else {
2639            unshift @invlist, 0;
2640        }
2641    }
2642
2643    return @invlist;
2644}
2645
2646=pod
2647
2648=head2 B<prop_invmap()>
2649
2650 use Unicode::UCD 'prop_invmap';
2651 my ($list_ref, $map_ref, $format, $default)
2652                                      = prop_invmap("General Category");
2653
2654C<prop_invmap> is used to get the complete mapping definition for a property,
2655in the form of an inversion map.  An inversion map consists of two parallel
2656arrays.  One is an ordered list of code points that mark range beginnings, and
2657the other gives the value (or mapping) that all code points in the
2658corresponding range have.
2659
2660C<prop_invmap> is called with the name of the desired property.  The name is
2661loosely matched, meaning that differences in case, white-space, hyphens, and
2662underscores are not meaningful (except for the trailing underscore in the
2663old-form grandfathered-in property C<"L_">, which is better written as C<"LC">,
2664or even better, C<"Gc=LC">).
2665
2666Many Unicode properties have more than one name (or alias).  C<prop_invmap>
2667understands all of these, including Perl extensions to them.  Ambiguities are
2668resolved as described above for L</prop_aliases()> (except if a property has
2669both a complete mapping, and a binary C<Y>/C<N> mapping, then specifying the
2670property name prefixed by C<"is"> causes the binary one to be returned).  The
2671Perl internal property "Perl_Decimal_Digit, described below, is also accepted.
2672An empty list is returned if the property name is unknown.
2673See L<perluniprops/Properties accessible through Unicode::UCD> for the
2674properties acceptable as inputs to this function.
2675
2676It is a fatal error to call this function except in list context.
2677
2678In addition to the two arrays that form the inversion map, C<prop_invmap>
2679returns two other values; one is a scalar that gives some details as to the
2680format of the entries of the map array; the other is a default value, useful
2681in maps whose format name begins with the letter C<"a">, as described
2682L<below in its subsection|/a>; and for specialized purposes, such as
2683converting to another data structure, described at the end of this main
2684section.
2685
2686This means that C<prop_invmap> returns a 4 element list.  For example,
2687
2688 my ($blocks_ranges_ref, $blocks_maps_ref, $format, $default)
2689                                                 = prop_invmap("Block");
2690
2691In this call, the two arrays will be populated as shown below (for Unicode
26926.0):
2693
2694 Index  @blocks_ranges  @blocks_maps
2695   0        0x0000      Basic Latin
2696   1        0x0080      Latin-1 Supplement
2697   2        0x0100      Latin Extended-A
2698   3        0x0180      Latin Extended-B
2699   4        0x0250      IPA Extensions
2700   5        0x02B0      Spacing Modifier Letters
2701   6        0x0300      Combining Diacritical Marks
2702   7        0x0370      Greek and Coptic
2703   8        0x0400      Cyrillic
2704  ...
2705 233        0x2B820     No_Block
2706 234        0x2F800     CJK Compatibility Ideographs Supplement
2707 235        0x2FA20     No_Block
2708 236        0xE0000     Tags
2709 237        0xE0080     No_Block
2710 238        0xE0100     Variation Selectors Supplement
2711 239        0xE01F0     No_Block
2712 240        0xF0000     Supplementary Private Use Area-A
2713 241        0x100000    Supplementary Private Use Area-B
2714 242        0x110000    No_Block
2715
2716The first line (with Index [0]) means that the value for code point 0 is "Basic
2717Latin".  The entry "0x0080" in the @blocks_ranges column in the second line
2718means that the value from the first line, "Basic Latin", extends to all code
2719points in the range from 0 up to but not including 0x0080, that is, through
2720127.  In other words, the code points from 0 to 127 are all in the "Basic
2721Latin" block.  Similarly, all code points in the range from 0x0080 up to (but
2722not including) 0x0100 are in the block named "Latin-1 Supplement", etc.
2723(Notice that the return is the old-style block names; see L</Old-style versus
2724new-style block names>).
2725
2726The final line (with Index [242]) means that the value for all code points above
2727the legal Unicode maximum code point have the value "No_Block", which is the
2728term Unicode uses for a non-existing block.
2729
2730The arrays completely specify the mappings for all possible code points.
2731The final element in an inversion map returned by this function will always be
2732for the range that consists of all the code points that aren't legal Unicode,
2733but that are expressible on the platform.  (That is, it starts with code point
27340x110000, the first code point above the legal Unicode maximum, and extends to
2735infinity.) The value for that range will be the same that any typical
2736unassigned code point has for the specified property.  (Certain unassigned
2737code points are not "typical"; for example the non-character code points, or
2738those in blocks that are to be written right-to-left.  The above-Unicode
2739range's value is not based on these atypical code points.)  It could be argued
2740that, instead of treating these as unassigned Unicode code points, the value
2741for this range should be C<undef>.  If you wish, you can change the returned
2742arrays accordingly.
2743
2744The maps for almost all properties are simple scalars that should be
2745interpreted as-is.
2746These values are those given in the Unicode-supplied data files, which may be
2747inconsistent as to capitalization and as to which synonym for a property-value
2748is given.  The results may be normalized by using the L</prop_value_aliases()>
2749function.
2750
2751There are exceptions to the simple scalar maps.  Some properties have some
2752elements in their map list that are themselves lists of scalars; and some
2753special strings are returned that are not to be interpreted as-is.  Element
2754[2] (placed into C<$format> in the example above) of the returned four element
2755list tells you if the map has any of these special elements or not, as follows:
2756
2757=over
2758
2759=item B<C<s>>
2760
2761means all the elements of the map array are simple scalars, with no special
2762elements.  Almost all properties are like this, like the C<block> example
2763above.
2764
2765=item B<C<sl>>
2766
2767means that some of the map array elements have the form given by C<"s">, and
2768the rest are lists of scalars.  For example, here is a portion of the output
2769of calling C<prop_invmap>() with the "Script Extensions" property:
2770
2771 @scripts_ranges  @scripts_maps
2772      ...
2773      0x0953      Devanagari
2774      0x0964      [ Bengali, Devanagari, Gurumukhi, Oriya ]
2775      0x0966      Devanagari
2776      0x0970      Common
2777
2778Here, the code points 0x964 and 0x965 are both used in Bengali,
2779Devanagari, Gurmukhi, and Oriya, but no other scripts.
2780
2781The Name_Alias property is also of this form.  But each scalar consists of two
2782components:  1) the name, and 2) the type of alias this is.  They are
2783separated by a colon and a space.  In Unicode 6.1, there are several alias types:
2784
2785=over
2786
2787=item C<correction>
2788
2789indicates that the name is a corrected form for the
2790original name (which remains valid) for the same code point.
2791
2792=item C<control>
2793
2794adds a new name for a control character.
2795
2796=item C<alternate>
2797
2798is an alternate name for a character
2799
2800=item C<figment>
2801
2802is a name for a character that has been documented but was never in any
2803actual standard.
2804
2805=item C<abbreviation>
2806
2807is a common abbreviation for a character
2808
2809=back
2810
2811The lists are ordered (roughly) so the most preferred names come before less
2812preferred ones.
2813
2814For example,
2815
2816 @aliases_ranges        @alias_maps
2817    ...
2818    0x009E        [ 'PRIVACY MESSAGE: control', 'PM: abbreviation' ]
2819    0x009F        [ 'APPLICATION PROGRAM COMMAND: control',
2820                    'APC: abbreviation'
2821                  ]
2822    0x00A0        'NBSP: abbreviation'
2823    0x00A1        ""
2824    0x00AD        'SHY: abbreviation'
2825    0x00AE        ""
2826    0x01A2        'LATIN CAPITAL LETTER GHA: correction'
2827    0x01A3        'LATIN SMALL LETTER GHA: correction'
2828    0x01A4        ""
2829    ...
2830
2831A map to the empty string means that there is no alias defined for the code
2832point.
2833
2834=item B<C<a>>
2835
2836is like C<"s"> in that all the map array elements are scalars, but here they are
2837restricted to all being integers, and some have to be adjusted (hence the name
2838C<"a">) to get the correct result.  For example, in:
2839
2840 my ($uppers_ranges_ref, $uppers_maps_ref, $format, $default)
2841                          = prop_invmap("Simple_Uppercase_Mapping");
2842
2843the returned arrays look like this:
2844
2845 @$uppers_ranges_ref    @$uppers_maps_ref   Note
2846       0                      0
2847      97                     65          'a' maps to 'A', b => B ...
2848     123                      0
2849     181                    924          MICRO SIGN => Greek Cap MU
2850     182                      0
2851     ...
2852
2853and C<$default> is 0.
2854
2855Let's start with the second line.  It says that the uppercase of code point 97
2856is 65; or C<uc("a")> == "A".  But the line is for the entire range of code
2857points 97 through 122.  To get the mapping for any code point in this range,
2858you take the offset it has from the beginning code point of the range, and add
2859that to the mapping for that first code point.  So, the mapping for 122 ("z")
2860is derived by taking the offset of 122 from 97 (=25) and adding that to 65,
2861yielding 90 ("z").  Likewise for everything in between.
2862
2863Requiring this simple adjustment allows the returned arrays to be
2864significantly smaller than otherwise, up to a factor of 10, speeding up
2865searching through them.
2866
2867Ranges that map to C<$default>, C<"0">, behave somewhat differently.  For
2868these, each code point maps to itself.  So, in the first line in the example,
2869S<C<ord(uc(chr(0)))>> is 0, S<C<ord(uc(chr(1)))>> is 1, ..
2870S<C<ord(uc(chr(96)))>> is 96.
2871
2872=item B<C<al>>
2873
2874means that some of the map array elements have the form given by C<"a">, and
2875the rest are ordered lists of code points.
2876For example, in:
2877
2878 my ($uppers_ranges_ref, $uppers_maps_ref, $format, $default)
2879                                 = prop_invmap("Uppercase_Mapping");
2880
2881the returned arrays look like this:
2882
2883 @$uppers_ranges_ref    @$uppers_maps_ref
2884       0                      0
2885      97                     65
2886     123                      0
2887     181                    924
2888     182                      0
2889     ...
2890    0x0149              [ 0x02BC 0x004E ]
2891    0x014A                    0
2892    0x014B                  330
2893     ...
2894
2895This is the full Uppercase_Mapping property (as opposed to the
2896Simple_Uppercase_Mapping given in the example for format C<"a">).  The only
2897difference between the two in the ranges shown is that the code point at
28980x0149 (LATIN SMALL LETTER N PRECEDED BY APOSTROPHE) maps to a string of two
2899characters, 0x02BC (MODIFIER LETTER APOSTROPHE) followed by 0x004E (LATIN
2900CAPITAL LETTER N).
2901
2902No adjustments are needed to entries that are references to arrays; each such
2903entry will have exactly one element in its range, so the offset is always 0.
2904
2905The fourth (index [3]) element (C<$default>) in the list returned for this
2906format is 0.
2907
2908=item B<C<ae>>
2909
2910This is like C<"a">, but some elements are the empty string, and should not be
2911adjusted.
2912The one internal Perl property accessible by C<prop_invmap> is of this type:
2913"Perl_Decimal_Digit" returns an inversion map which gives the numeric values
2914that are represented by the Unicode decimal digit characters.  Characters that
2915don't represent decimal digits map to the empty string, like so:
2916
2917 @digits    @values
2918 0x0000       ""
2919 0x0030        0
2920 0x003A:      ""
2921 0x0660:       0
2922 0x066A:      ""
2923 0x06F0:       0
2924 0x06FA:      ""
2925 0x07C0:       0
2926 0x07CA:      ""
2927 0x0966:       0
2928 ...
2929
2930This means that the code points from 0 to 0x2F do not represent decimal digits;
2931the code point 0x30 (DIGIT ZERO) represents 0;  code point 0x31, (DIGIT ONE),
2932represents 0+1-0 = 1; ... code point 0x39, (DIGIT NINE), represents 0+9-0 = 9;
2933... code points 0x3A through 0x65F do not represent decimal digits; 0x660
2934(ARABIC-INDIC DIGIT ZERO), represents 0; ... 0x07C1 (NKO DIGIT ONE),
2935represents 0+1-0 = 1 ...
2936
2937The fourth (index [3]) element (C<$default>) in the list returned for this
2938format is the empty string.
2939
2940=item B<C<ale>>
2941
2942is a combination of the C<"al"> type and the C<"ae"> type.  Some of
2943the map array elements have the forms given by C<"al">, and
2944the rest are the empty string.  The property C<NFKC_Casefold> has this form.
2945An example slice is:
2946
2947 @$ranges_ref  @$maps_ref         Note
2948    ...
2949   0x00AA       97                FEMININE ORDINAL INDICATOR => 'a'
2950   0x00AB        0
2951   0x00AD                         SOFT HYPHEN => ""
2952   0x00AE        0
2953   0x00AF     [ 0x0020, 0x0304 ]  MACRON => SPACE . COMBINING MACRON
2954   0x00B0        0
2955   ...
2956
2957The fourth (index [3]) element (C<$default>) in the list returned for this
2958format is 0.
2959
2960=item B<C<ar>>
2961
2962means that all the elements of the map array are either rational numbers or
2963the string C<"NaN">, meaning "Not a Number".  A rational number is either an
2964integer, or two integers separated by a solidus (C<"/">).  The second integer
2965represents the denominator of the division implied by the solidus, and is
2966actually always positive, so it is guaranteed not to be 0 and to not be
2967signed.  When the element is a plain integer (without the
2968solidus), it may need to be adjusted to get the correct value by adding the
2969offset, just as other C<"a"> properties.  No adjustment is needed for
2970fractions, as the range is guaranteed to have just a single element, and so
2971the offset is always 0.
2972
2973If you want to convert the returned map to entirely scalar numbers, you
2974can use something like this:
2975
2976 my ($invlist_ref, $invmap_ref, $format) = prop_invmap($property);
2977 if ($format && $format eq "ar") {
2978     map { $_ = eval $_ if $_ ne 'NaN' } @$map_ref;
2979 }
2980
2981Here's some entries from the output of the property "Nv", which has format
2982C<"ar">.
2983
2984 @numerics_ranges  @numerics_maps       Note
2985        0x00           "NaN"
2986        0x30             0           DIGIT 0 .. DIGIT 9
2987        0x3A           "NaN"
2988        0xB2             2           SUPERSCRIPTs 2 and 3
2989        0xB4           "NaN"
2990        0xB9             1           SUPERSCRIPT 1
2991        0xBA           "NaN"
2992        0xBC            1/4          VULGAR FRACTION 1/4
2993        0xBD            1/2          VULGAR FRACTION 1/2
2994        0xBE            3/4          VULGAR FRACTION 3/4
2995        0xBF           "NaN"
2996        0x660            0           ARABIC-INDIC DIGIT ZERO .. NINE
2997        0x66A          "NaN"
2998
2999The fourth (index [3]) element (C<$default>) in the list returned for this
3000format is C<"NaN">.
3001
3002=item B<C<n>>
3003
3004means the Name property.  All the elements of the map array are simple
3005scalars, but some of them contain special strings that require more work to
3006get the actual name.
3007
3008Entries such as:
3009
3010 CJK UNIFIED IDEOGRAPH-<code point>
3011
3012mean that the name for the code point is "CJK UNIFIED IDEOGRAPH-"
3013with the code point (expressed in hexadecimal) appended to it, like "CJK
3014UNIFIED IDEOGRAPH-3403" (similarly for S<C<CJK COMPATIBILITY IDEOGRAPH-E<lt>code
3015pointE<gt>>>).
3016
3017Also, entries like
3018
3019 <hangul syllable>
3020
3021means that the name is algorithmically calculated.  This is easily done by
3022the function L<charnames/charnames::viacode(code)>.
3023
3024Note that for control characters (C<Gc=cc>), Unicode's data files have the
3025string "C<E<lt>controlE<gt>>", but the real name of each of these characters is the empty
3026string.  This function returns that real name, the empty string.  (There are
3027names for these characters, but they are considered aliases, not the Name
3028property name, and are contained in the C<Name_Alias> property.)
3029
3030=item B<C<ad>>
3031
3032means the Decomposition_Mapping property.  This property is like C<"al">
3033properties, except that one of the scalar elements is of the form:
3034
3035 <hangul syllable>
3036
3037This signifies that this entry should be replaced by the decompositions for
3038all the code points whose decomposition is algorithmically calculated.  (All
3039of them are currently in one range and no others outside the range are likely
3040to ever be added to Unicode; the C<"n"> format
3041has this same entry.)  These can be generated via the function
3042L<Unicode::Normalize::NFD()|Unicode::Normalize>.
3043
3044Note that the mapping is the one that is specified in the Unicode data files,
3045and to get the final decomposition, it may need to be applied recursively.
3046
3047The fourth (index [3]) element (C<$default>) in the list returned for this
3048format is 0.
3049
3050=back
3051
3052Note that a format begins with the letter "a" if and only the property it is
3053for requires adjustments by adding the offsets in multi-element ranges.  For
3054all these properties, an entry should be adjusted only if the map is a scalar
3055which is an integer.  That is, it must match the regular expression:
3056
3057    / ^ -? \d+ $ /xa
3058
3059Further, the first element in a range never needs adjustment, as the
3060adjustment would be just adding 0.
3061
3062A binary search such as that provided by L</search_invlist()>, can be used to
3063quickly find a code point in the inversion list, and hence its corresponding
3064mapping.
3065
3066The final, fourth element (index [3], assigned to C<$default> in the "block"
3067example) in the four element list returned by this function is used with the
3068C<"a"> format types; it may also be useful for applications
3069that wish to convert the returned inversion map data structure into some
3070other, such as a hash.  It gives the mapping that most code points map to
3071under the property.  If you establish the convention that any code point not
3072explicitly listed in your data structure maps to this value, you can
3073potentially make your data structure much smaller.  As you construct your data
3074structure from the one returned by this function, simply ignore those ranges
3075that map to this value.  For example, to
3076convert to the data structure searchable by L</charinrange()>, you can follow
3077this recipe for properties that don't require adjustments:
3078
3079 my ($list_ref, $map_ref, $format, $default) = prop_invmap($property);
3080 my @range_list;
3081
3082 # Look at each element in the list, but the -2 is needed because we
3083 # look at $i+1 in the loop, and the final element is guaranteed to map
3084 # to $default by prop_invmap(), so we would skip it anyway.
3085 for my $i (0 .. @$list_ref - 2) {
3086    next if $map_ref->[$i] eq $default;
3087    push @range_list, [ $list_ref->[$i],
3088                        $list_ref->[$i+1],
3089                        $map_ref->[$i]
3090                      ];
3091 }
3092
3093 print charinrange(\@range_list, $code_point), "\n";
3094
3095With this, C<charinrange()> will return C<undef> if its input code point maps
3096to C<$default>.  You can avoid this by omitting the C<next> statement, and adding
3097a line after the loop to handle the final element of the inversion map.
3098
3099Similarly, this recipe can be used for properties that do require adjustments:
3100
3101 for my $i (0 .. @$list_ref - 2) {
3102    next if $map_ref->[$i] eq $default;
3103
3104    # prop_invmap() guarantees that if the mapping is to an array, the
3105    # range has just one element, so no need to worry about adjustments.
3106    if (ref $map_ref->[$i]) {
3107        push @range_list,
3108                   [ $list_ref->[$i], $list_ref->[$i], $map_ref->[$i] ];
3109    }
3110    else {  # Otherwise each element is actually mapped to a separate
3111            # value, so the range has to be split into single code point
3112            # ranges.
3113
3114        my $adjustment = 0;
3115
3116        # For each code point that gets mapped to something...
3117        for my $j ($list_ref->[$i] .. $list_ref->[$i+1] -1 ) {
3118
3119            # ... add a range consisting of just it mapping to the
3120            # original plus the adjustment, which is incremented for the
3121            # next time through the loop, as the offset increases by 1
3122            # for each element in the range
3123            push @range_list,
3124                             [ $j, $j, $map_ref->[$i] + $adjustment++ ];
3125        }
3126    }
3127 }
3128
3129Note that the inversion maps returned for the C<Case_Folding> and
3130C<Simple_Case_Folding> properties do not include the Turkic-locale mappings.
3131Use L</casefold()> for these.
3132
3133C<prop_invmap> does not know about any user-defined properties, and will
3134return C<undef> if called with one of those.
3135
3136The returned values for the Perl extension properties, such as C<Any> and
3137C<Greek> are somewhat misleading.  The values are either C<"Y"> or C<"N>".
3138All Unicode properties are bipartite, so you can actually use the C<"Y"> or
3139C<"N>" in a Perl regular rexpression for these, like C<qr/\p{ID_Start=Y/}> or
3140C<qr/\p{Upper=N/}>.  But the Perl extensions aren't specified this way, only
3141like C</qr/\p{Any}>, I<etc>.  You can't actually use the C<"Y"> and C<"N>" in
3142them.
3143
3144=cut
3145
3146# User-defined properties could be handled with some changes to utf8_heavy.pl;
3147# if done, consideration should be given to the fact that the user subroutine
3148# could return different results with each call, which could lead to some
3149# security issues.
3150
3151# One could store things in memory so they don't have to be recalculated, but
3152# it is unlikely this will be called often, and some properties would take up
3153# significant memory.
3154
3155# These are created by mktables for this routine and stored in unicore/UCD.pl
3156# where their structures are described.
3157our @algorithmic_named_code_points;
3158our $HANGUL_BEGIN;
3159our $HANGUL_COUNT;
3160
3161sub prop_invmap ($;$) {
3162
3163    croak __PACKAGE__, "::prop_invmap: must be called in list context" unless wantarray;
3164
3165    my $prop = $_[0];
3166    return unless defined $prop;
3167
3168    # Undocumented way to get at Perl internal properties; it may be changed
3169    # or removed without notice at any time.  It currently also changes the
3170    # output to use the format specified in the file rather than the one we
3171    # normally compute and return
3172    my $internal_ok = defined $_[1] && $_[1] eq '_perl_core_internal_ok';
3173
3174    # Fail internal properties
3175    return if $prop =~ /^_/ && ! $internal_ok;
3176
3177    # The values returned by this function.
3178    my (@invlist, @invmap, $format, $missing);
3179
3180    # The swash has two components we look at, the base list, and a hash,
3181    # named 'SPECIALS', containing any additional members whose mappings don't
3182    # fit into the base list scheme of things.  These generally 'override'
3183    # any value in the base list for the same code point.
3184    my $overrides;
3185
3186    require "utf8_heavy.pl";
3187    require "unicore/UCD.pl";
3188
3189RETRY:
3190
3191    # If there are multiple entries for a single code point
3192    my $has_multiples = 0;
3193
3194    # Try to get the map swash for the property.  They have 'To' prepended to
3195    # the property name, and 32 means we will accept 32 bit return values.
3196    # The 0 means we aren't calling this from tr///.
3197    my $swash = utf8::SWASHNEW(__PACKAGE__, "To$prop", undef, 32, 0);
3198
3199    # If didn't find it, could be because needs a proxy.  And if was the
3200    # 'Block' or 'Name' property, use a proxy even if did find it.  Finding it
3201    # in these cases would be the result of the installation changing mktables
3202    # to output the Block or Name tables.  The Block table gives block names
3203    # in the new-style, and this routine is supposed to return old-style block
3204    # names.  The Name table is valid, but we need to execute the special code
3205    # below to add in the algorithmic-defined name entries.
3206    # And NFKCCF needs conversion, so handle that here too.
3207    if (ref $swash eq ""
3208        || $swash->{'TYPE'} =~ / ^ To (?: Blk | Na | NFKCCF ) $ /x)
3209    {
3210
3211        # Get the short name of the input property, in standard form
3212        my ($second_try) = prop_aliases($prop);
3213        return unless $second_try;
3214        $second_try = utf8::_loose_name(lc $second_try);
3215
3216        if ($second_try eq "in") {
3217
3218            # This property is identical to age for inversion map purposes
3219            $prop = "age";
3220            goto RETRY;
3221        }
3222        elsif ($second_try =~ / ^ s ( cf | fc | [ltu] c ) $ /x) {
3223
3224            # These properties use just the LIST part of the full mapping,
3225            # which includes the simple maps that are otherwise overridden by
3226            # the SPECIALS.  So all we need do is to not look at the SPECIALS;
3227            # set $overrides to indicate that
3228            $overrides = -1;
3229
3230            # The full name is the simple name stripped of its initial 's'
3231            $prop = $1;
3232
3233            # .. except for this case
3234            $prop = 'cf' if $prop eq 'fc';
3235
3236            goto RETRY;
3237        }
3238        elsif ($second_try eq "blk") {
3239
3240            # We use the old block names.  Just create a fake swash from its
3241            # data.
3242            _charblocks();
3243            my %blocks;
3244            $blocks{'LIST'} = "";
3245            $blocks{'TYPE'} = "ToBlk";
3246            $utf8::SwashInfo{ToBlk}{'missing'} = "No_Block";
3247            $utf8::SwashInfo{ToBlk}{'format'} = "s";
3248
3249            foreach my $block (@BLOCKS) {
3250                $blocks{'LIST'} .= sprintf "%x\t%x\t%s\n",
3251                                           $block->[0],
3252                                           $block->[1],
3253                                           $block->[2];
3254            }
3255            $swash = \%blocks;
3256        }
3257        elsif ($second_try eq "na") {
3258
3259            # Use the combo file that has all the Name-type properties in it,
3260            # extracting just the ones that are for the actual 'Name'
3261            # property.  And create a fake swash from it.
3262            my %names;
3263            $names{'LIST'} = "";
3264            my $original = do "unicore/Name.pl";
3265            my $algorithm_names = \@algorithmic_named_code_points;
3266
3267            # We need to remove the names from it that are aliases.  For that
3268            # we need to also read in that table.  Create a hash with the keys
3269            # being the code points, and the values being a list of the
3270            # aliases for the code point key.
3271            my ($aliases_code_points, $aliases_maps, undef, undef)
3272                  = &prop_invmap("_Perl_Name_Alias", '_perl_core_internal_ok');
3273            my %aliases;
3274            for (my $i = 0; $i < @$aliases_code_points; $i++) {
3275                my $code_point = $aliases_code_points->[$i];
3276                $aliases{$code_point} = $aliases_maps->[$i];
3277
3278                # If not already a list, make it into one, so that later we
3279                # can treat things uniformly
3280                if (! ref $aliases{$code_point}) {
3281                    $aliases{$code_point} = [ $aliases{$code_point} ];
3282                }
3283
3284                # Remove the alias type from the entry, retaining just the
3285                # name.
3286                map { s/:.*// } @{$aliases{$code_point}};
3287            }
3288
3289            my $i = 0;
3290            foreach my $line (split "\n", $original) {
3291                my ($hex_code_point, $name) = split "\t", $line;
3292
3293                # Weeds out all comments, blank lines, and named sequences
3294                next if $hex_code_point =~ /[^[:xdigit:]]/a;
3295
3296                my $code_point = hex $hex_code_point;
3297
3298                # The name of all controls is the default: the empty string.
3299                # The set of controls is immutable
3300                next if chr($code_point) =~ /[[:cntrl:]]/u;
3301
3302                # If this is a name_alias, it isn't a name
3303                next if grep { $_ eq $name } @{$aliases{$code_point}};
3304
3305                # If we are beyond where one of the special lines needs to
3306                # be inserted ...
3307                while ($i < @$algorithm_names
3308                    && $code_point > $algorithm_names->[$i]->{'low'})
3309                {
3310
3311                    # ... then insert it, ahead of what we were about to
3312                    # output
3313                    $names{'LIST'} .= sprintf "%x\t%x\t%s\n",
3314                                            $algorithm_names->[$i]->{'low'},
3315                                            $algorithm_names->[$i]->{'high'},
3316                                            $algorithm_names->[$i]->{'name'};
3317
3318                    # Done with this range.
3319                    $i++;
3320
3321                    # We loop until all special lines that precede the next
3322                    # regular one are output.
3323                }
3324
3325                # Here, is a normal name.
3326                $names{'LIST'} .= sprintf "%x\t\t%s\n", $code_point, $name;
3327            } # End of loop through all the names
3328
3329            $names{'TYPE'} = "ToNa";
3330            $utf8::SwashInfo{ToNa}{'missing'} = "";
3331            $utf8::SwashInfo{ToNa}{'format'} = "n";
3332            $swash = \%names;
3333        }
3334        elsif ($second_try =~ / ^ ( d [mt] ) $ /x) {
3335
3336            # The file is a combination of dt and dm properties.  Create a
3337            # fake swash from the portion that we want.
3338            my $original = do "unicore/Decomposition.pl";
3339            my %decomps;
3340
3341            if ($second_try eq 'dt') {
3342                $decomps{'TYPE'} = "ToDt";
3343                $utf8::SwashInfo{'ToDt'}{'missing'} = "None";
3344                $utf8::SwashInfo{'ToDt'}{'format'} = "s";
3345            }   # 'dm' is handled below, with 'nfkccf'
3346
3347            $decomps{'LIST'} = "";
3348
3349            # This property has one special range not in the file: for the
3350            # hangul syllables.  But not in Unicode version 1.
3351            UnicodeVersion() unless defined $v_unicode_version;
3352            my $done_hangul = ($v_unicode_version lt v2.0.0)
3353                              ? 1
3354                              : 0;    # Have we done the hangul range ?
3355            foreach my $line (split "\n", $original) {
3356                my ($hex_lower, $hex_upper, $type_and_map) = split "\t", $line;
3357                my $code_point = hex $hex_lower;
3358                my $value;
3359                my $redo = 0;
3360
3361                # The type, enclosed in <...>, precedes the mapping separated
3362                # by blanks
3363                if ($type_and_map =~ / ^ < ( .* ) > \s+ (.*) $ /x) {
3364                    $value = ($second_try eq 'dt') ? $1 : $2
3365                }
3366                else {  # If there is no type specified, it's canonical
3367                    $value = ($second_try eq 'dt')
3368                             ? "Canonical" :
3369                             $type_and_map;
3370                }
3371
3372                # Insert the hangul range at the appropriate spot.
3373                if (! $done_hangul && $code_point > $HANGUL_BEGIN) {
3374                    $done_hangul = 1;
3375                    $decomps{'LIST'} .=
3376                                sprintf "%x\t%x\t%s\n",
3377                                        $HANGUL_BEGIN,
3378                                        $HANGUL_BEGIN + $HANGUL_COUNT - 1,
3379                                        ($second_try eq 'dt')
3380                                        ? "Canonical"
3381                                        : "<hangul syllable>";
3382                }
3383
3384                if ($value =~ / / && $hex_upper ne "" && $hex_upper ne $hex_lower) {
3385                    $line = sprintf("%04X\t%s\t%s", hex($hex_lower) + 1, $hex_upper, $value);
3386                    $hex_upper = "";
3387                    $redo = 1;
3388                }
3389
3390                # And append this to our constructed LIST.
3391                $decomps{'LIST'} .= "$hex_lower\t$hex_upper\t$value\n";
3392
3393                redo if $redo;
3394            }
3395            $swash = \%decomps;
3396        }
3397        elsif ($second_try ne 'nfkccf') { # Don't know this property. Fail.
3398            return;
3399        }
3400
3401        if ($second_try eq 'nfkccf' || $second_try eq 'dm') {
3402
3403            # The 'nfkccf' property is stored in the old format for backwards
3404            # compatibility for any applications that has read its file
3405            # directly before prop_invmap() existed.
3406            # And the code above has extracted the 'dm' property from its file
3407            # yielding the same format.  So here we convert them to adjusted
3408            # format for compatibility with the other properties similar to
3409            # them.
3410            my %revised_swash;
3411
3412            # We construct a new converted list.
3413            my $list = "";
3414
3415            my @ranges = split "\n", $swash->{'LIST'};
3416            for (my $i = 0; $i < @ranges; $i++) {
3417                my ($hex_begin, $hex_end, $map) = split "\t", $ranges[$i];
3418
3419                # The dm property has maps that are space separated sequences
3420                # of code points, as well as the special entry "<hangul
3421                # syllable>, which also contains a blank.
3422                my @map = split " ", $map;
3423                if (@map > 1) {
3424
3425                    # If it's just the special entry, append as-is.
3426                    if ($map eq '<hangul syllable>') {
3427                        $list .= "$ranges[$i]\n";
3428                    }
3429                    else {
3430
3431                        # These should all be single-element ranges.
3432                        croak __PACKAGE__, "::prop_invmap: Not expecting a mapping with multiple code points in a multi-element range, $ranges[$i]" if $hex_end ne "" && $hex_end ne $hex_begin;
3433
3434                        # Convert them to decimal, as that's what's expected.
3435                        $list .= "$hex_begin\t\t"
3436                            . join(" ", map { hex } @map)
3437                            . "\n";
3438                    }
3439                    next;
3440                }
3441
3442                # Here, the mapping doesn't have a blank, is for a single code
3443                # point.
3444                my $begin = hex $hex_begin;
3445                my $end = (defined $hex_end && $hex_end ne "")
3446                        ? hex $hex_end
3447                        : $begin;
3448
3449                # Again, the output is to be in decimal.
3450                my $decimal_map = hex $map;
3451
3452                # We know that multi-element ranges with the same mapping
3453                # should not be adjusted, as after the adjustment
3454                # multi-element ranges are for consecutive increasing code
3455                # points.  Further, the final element in the list won't be
3456                # adjusted, as there is nothing after it to include in the
3457                # adjustment
3458                if ($begin != $end || $i == @ranges -1) {
3459
3460                    # So just convert these to single-element ranges
3461                    foreach my $code_point ($begin .. $end) {
3462                        $list .= sprintf("%04X\t\t%d\n",
3463                                        $code_point, $decimal_map);
3464                    }
3465                }
3466                else {
3467
3468                    # Here, we have a candidate for adjusting.  What we do is
3469                    # look through the subsequent adjacent elements in the
3470                    # input.  If the map to the next one differs by 1 from the
3471                    # one before, then we combine into a larger range with the
3472                    # initial map.  Loop doing this until we find one that
3473                    # can't be combined.
3474
3475                    my $offset = 0;     # How far away are we from the initial
3476                                        # map
3477                    my $squished = 0;   # ? Did we squish at least two
3478                                        # elements together into one range
3479                    for ( ; $i < @ranges; $i++) {
3480                        my ($next_hex_begin, $next_hex_end, $next_map)
3481                                                = split "\t", $ranges[$i+1];
3482
3483                        # In the case of 'dm', the map may be a sequence of
3484                        # multiple code points, which are never combined with
3485                        # another range
3486                        last if $next_map =~ / /;
3487
3488                        $offset++;
3489                        my $next_decimal_map = hex $next_map;
3490
3491                        # If the next map is not next in sequence, it
3492                        # shouldn't be combined.
3493                        last if $next_decimal_map != $decimal_map + $offset;
3494
3495                        my $next_begin = hex $next_hex_begin;
3496
3497                        # Likewise, if the next element isn't adjacent to the
3498                        # previous one, it shouldn't be combined.
3499                        last if $next_begin != $begin + $offset;
3500
3501                        my $next_end = (defined $next_hex_end
3502                                        && $next_hex_end ne "")
3503                                            ? hex $next_hex_end
3504                                            : $next_begin;
3505
3506                        # And finally, if the next element is a multi-element
3507                        # range, it shouldn't be combined.
3508                        last if $next_end != $next_begin;
3509
3510                        # Here, we will combine.  Loop to see if we should
3511                        # combine the next element too.
3512                        $squished = 1;
3513                    }
3514
3515                    if ($squished) {
3516
3517                        # Here, 'i' is the element number of the last element to
3518                        # be combined, and the range is single-element, or we
3519                        # wouldn't be combining.  Get it's code point.
3520                        my ($hex_end, undef, undef) = split "\t", $ranges[$i];
3521                        $list .= "$hex_begin\t$hex_end\t$decimal_map\n";
3522                    } else {
3523
3524                        # Here, no combining done.  Just append the initial
3525                        # (and current) values.
3526                        $list .= "$hex_begin\t\t$decimal_map\n";
3527                    }
3528                }
3529            } # End of loop constructing the converted list
3530
3531            # Finish up the data structure for our converted swash
3532            my $type = ($second_try eq 'nfkccf') ? 'ToNFKCCF' : 'ToDm';
3533            $revised_swash{'LIST'} = $list;
3534            $revised_swash{'TYPE'} = $type;
3535            $revised_swash{'SPECIALS'} = $swash->{'SPECIALS'};
3536            $swash = \%revised_swash;
3537
3538            $utf8::SwashInfo{$type}{'missing'} = 0;
3539            $utf8::SwashInfo{$type}{'format'} = 'a';
3540        }
3541    }
3542
3543    if ($swash->{'EXTRAS'}) {
3544        carp __PACKAGE__, "::prop_invmap: swash returned for $prop unexpectedly has EXTRAS magic";
3545        return;
3546    }
3547
3548    # Here, have a valid swash return.  Examine it.
3549    my $returned_prop = $swash->{'TYPE'};
3550
3551    # All properties but binary ones should have 'missing' and 'format'
3552    # entries
3553    $missing = $utf8::SwashInfo{$returned_prop}{'missing'};
3554    $missing = 'N' unless defined $missing;
3555
3556    $format = $utf8::SwashInfo{$returned_prop}{'format'};
3557    $format = 'b' unless defined $format;
3558
3559    my $requires_adjustment = $format =~ /^a/;
3560
3561    if ($swash->{'LIST'} =~ /^V/) {
3562        @invlist = split "\n", $swash->{'LIST'} =~ s/ \s* (?: \# .* )? $ //xmgr;
3563
3564        shift @invlist;     # Get rid of 'V';
3565
3566        # Could need to be inverted: add or subtract a 0 at the beginning of
3567        # the list.
3568        if ($swash->{'INVERT_IT'}) {
3569            if (@invlist && $invlist[0] == 0) {
3570                shift @invlist;
3571            }
3572            else {
3573                unshift @invlist, 0;
3574            }
3575        }
3576        foreach my $i (0 .. @invlist - 1) {
3577            $invmap[$i] = ($i % 2 == 0) ? 'Y' : 'N'
3578        }
3579
3580        # The map includes lines for all code points; add one for the range
3581        # from 0 to the first Y.
3582        if ($invlist[0] != 0) {
3583            unshift @invlist, 0;
3584            unshift @invmap, 'N';
3585        }
3586    }
3587    else {
3588        if ($swash->{'INVERT_IT'}) {
3589            croak __PACKAGE__, ":prop_invmap: Don't know how to deal with inverted";
3590        }
3591
3592        # The LIST input lines look like:
3593        # ...
3594        # 0374\t\tCommon
3595        # 0375\t0377\tGreek   # [3]
3596        # 037A\t037D\tGreek   # [4]
3597        # 037E\t\tCommon
3598        # 0384\t\tGreek
3599        # ...
3600        #
3601        # Convert them to like
3602        # 0374 => Common
3603        # 0375 => Greek
3604        # 0378 => $missing
3605        # 037A => Greek
3606        # 037E => Common
3607        # 037F => $missing
3608        # 0384 => Greek
3609        #
3610        # For binary properties, the final non-comment column is absent, and
3611        # assumed to be 'Y'.
3612
3613        foreach my $range (split "\n", $swash->{'LIST'}) {
3614            $range =~ s/ \s* (?: \# .* )? $ //xg; # rmv trailing space, comments
3615
3616            # Find the beginning and end of the range on the line
3617            my ($hex_begin, $hex_end, $map) = split "\t", $range;
3618            my $begin = hex $hex_begin;
3619            no warnings 'portable';
3620            my $end = (defined $hex_end && $hex_end ne "")
3621                    ? hex $hex_end
3622                    : $begin;
3623
3624            # Each time through the loop (after the first):
3625            # $invlist[-2] contains the beginning of the previous range processed
3626            # $invlist[-1] contains the end+1 of the previous range processed
3627            # $invmap[-2] contains the value of the previous range processed
3628            # $invmap[-1] contains the default value for missing ranges
3629            #                                                       ($missing)
3630            #
3631            # Thus, things are set up for the typical case of a new
3632            # non-adjacent range of non-missings to be added.  But, if the new
3633            # range is adjacent, it needs to replace the [-1] element; and if
3634            # the new range is a multiple value of the previous one, it needs
3635            # to be added to the [-2] map element.
3636
3637            # The first time through, everything will be empty.  If the
3638            # property doesn't have a range that begins at 0, add one that
3639            # maps to $missing
3640            if (! @invlist) {
3641                if ($begin != 0) {
3642                    push @invlist, 0;
3643                    push @invmap, $missing;
3644                }
3645            }
3646            elsif (@invlist > 1 && $invlist[-2] == $begin) {
3647
3648                # Here we handle the case where the input has multiple entries
3649                # for each code point.  mktables should have made sure that
3650                # each such range contains only one code point.  At this
3651                # point, $invlist[-1] is the $missing that was added at the
3652                # end of the last loop iteration, and [-2] is the last real
3653                # input code point, and that code point is the same as the one
3654                # we are adding now, making the new one a multiple entry.  Add
3655                # it to the existing entry, either by pushing it to the
3656                # existing list of multiple entries, or converting the single
3657                # current entry into a list with both on it.  This is all we
3658                # need do for this iteration.
3659
3660                if ($end != $begin) {
3661                    croak __PACKAGE__, ":prop_invmap: Multiple maps per code point in '$prop' require single-element ranges: begin=$begin, end=$end, map=$map";
3662                }
3663                if (! ref $invmap[-2]) {
3664                    $invmap[-2] = [ $invmap[-2], $map ];
3665                }
3666                else {
3667                    push @{$invmap[-2]}, $map;
3668                }
3669                $has_multiples = 1;
3670                next;
3671            }
3672            elsif ($invlist[-1] == $begin) {
3673
3674                # If the input isn't in the most compact form, so that there
3675                # are two adjacent ranges that map to the same thing, they
3676                # should be combined (EXCEPT where the arrays require
3677                # adjustments, in which case everything is already set up
3678                # correctly).  This happens in our constructed dt mapping, as
3679                # Element [-2] is the map for the latest range so far
3680                # processed.  Just set the beginning point of the map to
3681                # $missing (in invlist[-1]) to 1 beyond where this range ends.
3682                # For example, in
3683                # 12\t13\tXYZ
3684                # 14\t17\tXYZ
3685                # we have set it up so that it looks like
3686                # 12 => XYZ
3687                # 14 => $missing
3688                #
3689                # We now see that it should be
3690                # 12 => XYZ
3691                # 18 => $missing
3692                if (! $requires_adjustment && @invlist > 1 && ( (defined $map)
3693                                    ? $invmap[-2] eq $map
3694                                    : $invmap[-2] eq 'Y'))
3695                {
3696                    $invlist[-1] = $end + 1;
3697                    next;
3698                }
3699
3700                # Here, the range started in the previous iteration that maps
3701                # to $missing starts at the same code point as this range.
3702                # That means there is no gap to fill that that range was
3703                # intended for, so we just pop it off the parallel arrays.
3704                pop @invlist;
3705                pop @invmap;
3706            }
3707
3708            # Add the range beginning, and the range's map.
3709            push @invlist, $begin;
3710            if ($returned_prop eq 'ToDm') {
3711
3712                # The decomposition maps are either a line like <hangul
3713                # syllable> which are to be taken as is; or a sequence of code
3714                # points in hex and separated by blanks.  Convert them to
3715                # decimal, and if there is more than one, use an anonymous
3716                # array as the map.
3717                if ($map =~ /^ < /x) {
3718                    push @invmap, $map;
3719                }
3720                else {
3721                    my @map = split " ", $map;
3722                    if (@map == 1) {
3723                        push @invmap, $map[0];
3724                    }
3725                    else {
3726                        push @invmap, \@map;
3727                    }
3728                }
3729            }
3730            else {
3731
3732                # Otherwise, convert hex formatted list entries to decimal;
3733                # add a 'Y' map for the missing value in binary properties, or
3734                # otherwise, use the input map unchanged.
3735                $map = ($format eq 'x' || $format eq 'ax')
3736                    ? hex $map
3737                    : $format eq 'b'
3738                    ? 'Y'
3739                    : $map;
3740                push @invmap, $map;
3741            }
3742
3743            # We just started a range.  It ends with $end.  The gap between it
3744            # and the next element in the list must be filled with a range
3745            # that maps to the default value.  If there is no gap, the next
3746            # iteration will pop this, unless there is no next iteration, and
3747            # we have filled all of the Unicode code space, so check for that
3748            # and skip.
3749            if ($end < $Unicode::UCD::MAX_CP) {
3750                push @invlist, $end + 1;
3751                push @invmap, $missing;
3752            }
3753        }
3754    }
3755
3756    # If the property is empty, make all code points use the value for missing
3757    # ones.
3758    if (! @invlist) {
3759        push @invlist, 0;
3760        push @invmap, $missing;
3761    }
3762
3763    # The final element is always for just the above-Unicode code points.  If
3764    # not already there, add it.  It merely splits the current final range
3765    # that extends to infinity into two elements, each with the same map.
3766    # (This is to conform with the API that says the final element is for
3767    # $MAX_UNICODE_CODEPOINT + 1 .. INFINITY.)
3768    if ($invlist[-1] != $MAX_UNICODE_CODEPOINT + 1) {
3769        push @invmap, $invmap[-1];
3770        push @invlist, $MAX_UNICODE_CODEPOINT + 1;
3771    }
3772
3773    # The second component of the map are those values that require
3774    # non-standard specification, stored in SPECIALS.  These override any
3775    # duplicate code points in LIST.  If we are using a proxy, we may have
3776    # already set $overrides based on the proxy.
3777    $overrides = $swash->{'SPECIALS'} unless defined $overrides;
3778    if ($overrides) {
3779
3780        # A negative $overrides implies that the SPECIALS should be ignored,
3781        # and a simple 'a' list is the value.
3782        if ($overrides < 0) {
3783            $format = 'a';
3784        }
3785        else {
3786
3787            # Currently, all overrides are for properties that normally map to
3788            # single code points, but now some will map to lists of code
3789            # points (but there is an exception case handled below).
3790            $format = 'al';
3791
3792            # Look through the overrides.
3793            foreach my $cp_maybe_utf8 (keys %$overrides) {
3794                my $cp;
3795                my @map;
3796
3797                # If the overrides came from SPECIALS, the code point keys are
3798                # packed UTF-8.
3799                if ($overrides == $swash->{'SPECIALS'}) {
3800                    $cp = $cp_maybe_utf8;
3801                    if (! utf8::decode($cp)) {
3802                        croak __PACKAGE__, "::prop_invmap: Malformed UTF-8: ",
3803                              map { sprintf("\\x{%02X}", unpack("C", $_)) }
3804                                                                split "", $cp;
3805                    }
3806
3807                    $cp = unpack("W", $cp);
3808                    @map = unpack "W*", $swash->{'SPECIALS'}{$cp_maybe_utf8};
3809
3810                    # The empty string will show up unpacked as an empty
3811                    # array.
3812                    $format = 'ale' if @map == 0;
3813                }
3814                else {
3815
3816                    # But if we generated the overrides, we didn't bother to
3817                    # pack them, and we, so far, do this only for properties
3818                    # that are 'a' ones.
3819                    $cp = $cp_maybe_utf8;
3820                    @map = hex $overrides->{$cp};
3821                    $format = 'a';
3822                }
3823
3824                # Find the range that the override applies to.
3825                my $i = search_invlist(\@invlist, $cp);
3826                if ($cp < $invlist[$i] || $cp >= $invlist[$i + 1]) {
3827                    croak __PACKAGE__, "::prop_invmap: wrong_range, cp=$cp; i=$i, current=$invlist[$i]; next=$invlist[$i + 1]"
3828                }
3829
3830                # And what that range currently maps to
3831                my $cur_map = $invmap[$i];
3832
3833                # If there is a gap between the next range and the code point
3834                # we are overriding, we have to add elements to both arrays to
3835                # fill that gap, using the map that applies to it, which is
3836                # $cur_map, since it is part of the current range.
3837                if ($invlist[$i + 1] > $cp + 1) {
3838                    #use feature 'say';
3839                    #say "Before splice:";
3840                    #say 'i-2=[', $i-2, ']', sprintf("%04X maps to %s", $invlist[$i-2], $invmap[$i-2]) if $i >= 2;
3841                    #say 'i-1=[', $i-1, ']', sprintf("%04X maps to %s", $invlist[$i-1], $invmap[$i-1]) if $i >= 1;
3842                    #say 'i  =[', $i, ']', sprintf("%04X maps to %s", $invlist[$i], $invmap[$i]);
3843                    #say 'i+1=[', $i+1, ']', sprintf("%04X maps to %s", $invlist[$i+1], $invmap[$i+1]) if $i < @invlist + 1;
3844                    #say 'i+2=[', $i+2, ']', sprintf("%04X maps to %s", $invlist[$i+2], $invmap[$i+2]) if $i < @invlist + 2;
3845
3846                    splice @invlist, $i + 1, 0, $cp + 1;
3847                    splice @invmap, $i + 1, 0, $cur_map;
3848
3849                    #say "After splice:";
3850                    #say 'i-2=[', $i-2, ']', sprintf("%04X maps to %s", $invlist[$i-2], $invmap[$i-2]) if $i >= 2;
3851                    #say 'i-1=[', $i-1, ']', sprintf("%04X maps to %s", $invlist[$i-1], $invmap[$i-1]) if $i >= 1;
3852                    #say 'i  =[', $i, ']', sprintf("%04X maps to %s", $invlist[$i], $invmap[$i]);
3853                    #say 'i+1=[', $i+1, ']', sprintf("%04X maps to %s", $invlist[$i+1], $invmap[$i+1]) if $i < @invlist + 1;
3854                    #say 'i+2=[', $i+2, ']', sprintf("%04X maps to %s", $invlist[$i+2], $invmap[$i+2]) if $i < @invlist + 2;
3855                }
3856
3857                # If the remaining portion of the range is multiple code
3858                # points (ending with the one we are replacing, guaranteed by
3859                # the earlier splice).  We must split it into two
3860                if ($invlist[$i] < $cp) {
3861                    $i++;   # Compensate for the new element
3862
3863                    #use feature 'say';
3864                    #say "Before splice:";
3865                    #say 'i-2=[', $i-2, ']', sprintf("%04X maps to %s", $invlist[$i-2], $invmap[$i-2]) if $i >= 2;
3866                    #say 'i-1=[', $i-1, ']', sprintf("%04X maps to %s", $invlist[$i-1], $invmap[$i-1]) if $i >= 1;
3867                    #say 'i  =[', $i, ']', sprintf("%04X maps to %s", $invlist[$i], $invmap[$i]);
3868                    #say 'i+1=[', $i+1, ']', sprintf("%04X maps to %s", $invlist[$i+1], $invmap[$i+1]) if $i < @invlist + 1;
3869                    #say 'i+2=[', $i+2, ']', sprintf("%04X maps to %s", $invlist[$i+2], $invmap[$i+2]) if $i < @invlist + 2;
3870
3871                    splice @invlist, $i, 0, $cp;
3872                    splice @invmap, $i, 0, 'dummy';
3873
3874                    #say "After splice:";
3875                    #say 'i-2=[', $i-2, ']', sprintf("%04X maps to %s", $invlist[$i-2], $invmap[$i-2]) if $i >= 2;
3876                    #say 'i-1=[', $i-1, ']', sprintf("%04X maps to %s", $invlist[$i-1], $invmap[$i-1]) if $i >= 1;
3877                    #say 'i  =[', $i, ']', sprintf("%04X maps to %s", $invlist[$i], $invmap[$i]);
3878                    #say 'i+1=[', $i+1, ']', sprintf("%04X maps to %s", $invlist[$i+1], $invmap[$i+1]) if $i < @invlist + 1;
3879                    #say 'i+2=[', $i+2, ']', sprintf("%04X maps to %s", $invlist[$i+2], $invmap[$i+2]) if $i < @invlist + 2;
3880                }
3881
3882                # Here, the range we are overriding contains a single code
3883                # point.  The result could be the empty string, a single
3884                # value, or a list.  If the last case, we use an anonymous
3885                # array.
3886                $invmap[$i] = (scalar @map == 0)
3887                               ? ""
3888                               : (scalar @map > 1)
3889                                  ? \@map
3890                                  : $map[0];
3891            }
3892        }
3893    }
3894    elsif ($format eq 'x') {
3895
3896        # All hex-valued properties are really to code points, and have been
3897        # converted to decimal.
3898        $format = 's';
3899    }
3900    elsif ($returned_prop eq 'ToDm') {
3901        $format = 'ad';
3902    }
3903    elsif ($format eq 'sw') { # blank-separated elements to form a list.
3904        map { $_ = [ split " ", $_  ] if $_ =~ / / } @invmap;
3905        $format = 'sl';
3906    }
3907    elsif ($returned_prop =~ / To ( _Perl )? NameAlias/x) {
3908
3909        # This property currently doesn't have any lists, but theoretically
3910        # could
3911        $format = 'sl';
3912    }
3913    elsif ($returned_prop eq 'ToPerlDecimalDigit') {
3914        $format = 'ae';
3915    }
3916    elsif ($returned_prop eq 'ToNv') {
3917
3918        # The one property that has this format is stored as a delta, so needs
3919        # to indicate that need to add code point to it.
3920        $format = 'ar';
3921    }
3922    elsif ($format eq 'ax') {
3923
3924        # Normally 'ax' properties have overrides, and will have been handled
3925        # above, but if not, they still need adjustment, and the hex values
3926        # have already been converted to decimal
3927        $format = 'a';
3928    }
3929    elsif ($format ne 'n' && $format !~ / ^ a /x) {
3930
3931        # All others are simple scalars
3932        $format = 's';
3933    }
3934    if ($has_multiples &&  $format !~ /l/) {
3935	croak __PACKAGE__, "::prop_invmap: Wrong format '$format' for prop_invmap('$prop'); should indicate has lists";
3936    }
3937
3938    return (\@invlist, \@invmap, $format, $missing);
3939}
3940
3941sub search_invlist {
3942
3943=pod
3944
3945=head2 B<search_invlist()>
3946
3947 use Unicode::UCD qw(prop_invmap prop_invlist);
3948 use Unicode::UCD 'search_invlist';
3949
3950 my @invlist = prop_invlist($property_name);
3951 print $code_point, ((search_invlist(\@invlist, $code_point) // -1) % 2)
3952                     ? " isn't"
3953                     : " is",
3954     " in $property_name\n";
3955
3956 my ($blocks_ranges_ref, $blocks_map_ref) = prop_invmap("Block");
3957 my $index = search_invlist($blocks_ranges_ref, $code_point);
3958 print "$code_point is in block ", $blocks_map_ref->[$index], "\n";
3959
3960C<search_invlist> is used to search an inversion list returned by
3961C<prop_invlist> or C<prop_invmap> for a particular L</code point argument>.
3962C<undef> is returned if the code point is not found in the inversion list
3963(this happens only when it is not a legal L<code point argument>, or is less
3964than the list's first element).  A warning is raised in the first instance.
3965
3966Otherwise, it returns the index into the list of the range that contains the
3967code point.; that is, find C<i> such that
3968
3969    list[i]<= code_point < list[i+1].
3970
3971As explained in L</prop_invlist()>, whether a code point is in the list or not
3972depends on if the index is even (in) or odd (not in).  And as explained in
3973L</prop_invmap()>, the index is used with the returned parallel array to find
3974the mapping.
3975
3976=cut
3977
3978
3979    my $list_ref = shift;
3980    my $input_code_point = shift;
3981    my $code_point = _getcode($input_code_point);
3982
3983    if (! defined $code_point) {
3984        carp __PACKAGE__, "::search_invlist: unknown code '$input_code_point'";
3985        return;
3986    }
3987
3988    my $max_element = @$list_ref - 1;
3989
3990    # Return undef if list is empty or requested item is before the first element.
3991    return if $max_element < 0;
3992    return if $code_point < $list_ref->[0];
3993
3994    # Short cut something at the far-end of the table.  This also allows us to
3995    # refer to element [$i+1] without fear of being out-of-bounds in the loop
3996    # below.
3997    return $max_element if $code_point >= $list_ref->[$max_element];
3998
3999    use integer;        # want integer division
4000
4001    my $i = $max_element / 2;
4002
4003    my $lower = 0;
4004    my $upper = $max_element;
4005    while (1) {
4006
4007        if ($code_point >= $list_ref->[$i]) {
4008
4009            # Here we have met the lower constraint.  We can quit if we
4010            # also meet the upper one.
4011            last if $code_point < $list_ref->[$i+1];
4012
4013            $lower = $i;        # Still too low.
4014
4015        }
4016        else {
4017
4018            # Here, $code_point < $list_ref[$i], so look lower down.
4019            $upper = $i;
4020        }
4021
4022        # Split search domain in half to try again.
4023        my $temp = ($upper + $lower) / 2;
4024
4025        # No point in continuing unless $i changes for next time
4026        # in the loop.
4027        return $i if $temp == $i;
4028        $i = $temp;
4029    } # End of while loop
4030
4031    # Here we have found the offset
4032    return $i;
4033}
4034
4035=head2 Unicode::UCD::UnicodeVersion
4036
4037This returns the version of the Unicode Character Database, in other words, the
4038version of the Unicode standard the database implements.  The version is a
4039string of numbers delimited by dots (C<'.'>).
4040
4041=cut
4042
4043my $UNICODEVERSION;
4044
4045sub UnicodeVersion {
4046    unless (defined $UNICODEVERSION) {
4047	openunicode(\$VERSIONFH, "version");
4048	local $/ = "\n";
4049	chomp($UNICODEVERSION = <$VERSIONFH>);
4050	close($VERSIONFH);
4051	croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
4052	    unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
4053    }
4054    $v_unicode_version = pack "C*", split /\./, $UNICODEVERSION;
4055    return $UNICODEVERSION;
4056}
4057
4058=head2 B<Blocks versus Scripts>
4059
4060The difference between a block and a script is that scripts are closer
4061to the linguistic notion of a set of code points required to represent
4062languages, while block is more of an artifact of the Unicode code point
4063numbering and separation into blocks of consecutive code points (so far the
4064size of a block is some multiple of 16, like 128 or 256).
4065
4066For example the Latin B<script> is spread over several B<blocks>, such
4067as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
4068C<Latin Extended-B>.  On the other hand, the Latin script does not
4069contain all the characters of the C<Basic Latin> block (also known as
4070ASCII): it includes only the letters, and not, for example, the digits
4071nor the punctuation.
4072
4073For blocks see L<http://www.unicode.org/Public/UNIDATA/Blocks.txt>
4074
4075For scripts see UTR #24: L<http://www.unicode.org/unicode/reports/tr24/>
4076
4077=head2 B<Matching Scripts and Blocks>
4078
4079Scripts are matched with the regular-expression construct
4080C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
4081while C<\p{Blk=...}> is used for blocks (e.g. C<\p{Blk=Tibetan}> matches
4082any of the 256 code points in the Tibetan block).
4083
4084=head2 Old-style versus new-style block names
4085
4086Unicode publishes the names of blocks in two different styles, though the two
4087are equivalent under Unicode's loose matching rules.
4088
4089The original style uses blanks and hyphens in the block names (except for
4090C<No_Block>), like so:
4091
4092 Miscellaneous Mathematical Symbols-B
4093
4094The newer style replaces these with underscores, like this:
4095
4096 Miscellaneous_Mathematical_Symbols_B
4097
4098This newer style is consistent with the values of other Unicode properties.
4099To preserve backward compatibility, all the functions in Unicode::UCD that
4100return block names (except as noted) return the old-style ones.
4101L</prop_value_aliases()> returns the new-style and can be used to convert from
4102old-style to new-style:
4103
4104 my $new_style = prop_values_aliases("block", $old_style);
4105
4106Perl also has single-form extensions that refer to blocks, C<In_Cyrillic>,
4107meaning C<Block=Cyrillic>.  These have always been written in the new style.
4108
4109To convert from new-style to old-style, follow this recipe:
4110
4111 $old_style = charblock((prop_invlist("block=$new_style"))[0]);
4112
4113(which finds the range of code points in the block using C<prop_invlist>,
4114gets the lower end of the range (0th element) and then looks up the old name
4115for its block using C<charblock>).
4116
4117Note that starting in Unicode 6.1, many of the block names have shorter
4118synonyms.  These are always given in the new style.
4119
4120=head2 Use with older Unicode versions
4121
4122The functions in this module work as well as can be expected when
4123used on earlier Unicode versions.  But, obviously, they use the available data
4124from that Unicode version.  For example, if the Unicode version predates the
4125definition of the script property (Unicode 3.1), then any function that deals
4126with scripts is going to return C<undef> for the script portion of the return
4127value.
4128
4129=head1 AUTHOR
4130
4131Jarkko Hietaniemi.  Now maintained by perl5 porters.
4132
4133=cut
4134
41351;
4136