1898184e3Ssthen# !!!!!!! INTERNAL PERL USE ONLY !!!!!!! 2898184e3Ssthen# This helper module is for internal use by core Perl only. This module is 3898184e3Ssthen# subject to change or removal at any time without notice. Don't use it 4898184e3Ssthen# directly. Use the public <charnames> module instead. 5898184e3Ssthen 6898184e3Ssthenpackage _charnames; 7898184e3Ssthenuse strict; 8898184e3Ssthenuse warnings; 9*eac174f2Safresh1our $VERSION = '1.50'; 10898184e3Ssthenuse unicore::Name; # mktables-generated algorithmically-defined names 11898184e3Ssthen 12898184e3Ssthenuse bytes (); # for $bytes::hint_bits 13898184e3Ssthenuse re "/aa"; # Everything in here should be ASCII 14898184e3Ssthen 15898184e3Ssthen$Carp::Internal{ (__PACKAGE__) } = 1; 16898184e3Ssthen 17898184e3Ssthen# Translate between Unicode character names and their code points. This is a 18898184e3Ssthen# submodule of package <charnames>, used to allow \N{...} to be autoloaded, 19898184e3Ssthen# but it was decided not to autoload the various functions in charnames; the 20898184e3Ssthen# splitting allows this behavior. 21898184e3Ssthen# 22898184e3Ssthen# The official names with their code points are stored in a table in 23898184e3Ssthen# lib/unicore/Name.pl which is read in as a large string (almost 3/4 Mb in 2456d68f1eSafresh1# Unicode 6.0). Each code point sequence appears on a line by itself, with 2556d68f1eSafresh1# its corresponding name occupying the next line in the string. (Some of the 2656d68f1eSafresh1# CJK and the Hangul syllable names are instead determined algorithmically via 2756d68f1eSafresh1# subroutines stored instead in lib/unicore/Name.pm). Because of the large 2856d68f1eSafresh1# size of this table, it isn't converted into hashes for faster lookup. 29898184e3Ssthen# 30898184e3Ssthen# But, user defined aliases are stored in their own hashes, as are Perl 31898184e3Ssthen# extensions to the official names. These are checked first before looking at 32898184e3Ssthen# the official table. 33898184e3Ssthen# 34898184e3Ssthen# Basically, the table is grepped for the input code point (viacode()) or 3556d68f1eSafresh1# name (the other functions), and the corresponding value on the next or 3656d68f1eSafresh1# previous line is returned. The grepping is done by turning the input into a 3756d68f1eSafresh1# regular expression. Thus, the same table does double duty, used by both 3856d68f1eSafresh1# name and code point lookup. (If we were to have hashes, we would need two, 3956d68f1eSafresh1# one for each lookup direction.) 40898184e3Ssthen# 41898184e3Ssthen# For loose name matching, the logical thing would be to have a table 42898184e3Ssthen# with all the ignorable characters squeezed out, and then grep it with the 43898184e3Ssthen# similiarly-squeezed input name. (And this is in fact how the lookups are 44898184e3Ssthen# done with the small Perl extension hashes.) But since we need to be able to 45898184e3Ssthen# go from code point to official name, the original table would still need to 46898184e3Ssthen# exist. Due to the large size of the table, it was decided to not read 47898184e3Ssthen# another very large string into memory for a second table. Instead, the 48898184e3Ssthen# regular expression of the input name is modified to have optional spaces and 49898184e3Ssthen# dashes between characters. For example, in strict matching, the regular 50898184e3Ssthen# expression would be: 5156d68f1eSafresh1# qr/^DIGIT ONE$/m 52898184e3Ssthen# Under loose matching, the blank would be squeezed out, and the re would be: 5356d68f1eSafresh1# qr/^D[- ]?I[- ]?G[- ]?I[- ]?T[- ]?O[- ]?N[- ]?E$/m 54898184e3Ssthen# which matches a blank or dash between any characters in the official table. 55898184e3Ssthen# 56898184e3Ssthen# This is also how script lookup is done. Basically the re looks like 57898184e3Ssthen# qr/ (?:LATIN|GREEK|CYRILLIC) (?:SMALL )?LETTER $name/ 58898184e3Ssthen# where $name is the loose or strict regex for the remainder of the name. 59898184e3Ssthen 60898184e3Ssthen# The hashes are stored as utf8 strings. This makes it easier to deal with 61898184e3Ssthen# sequences. I (khw) also tried making Name.pl utf8, but it slowed things 6256d68f1eSafresh1# down by a factor of 7. I then tried making Name.pl store the utf8 63898184e3Ssthen# equivalents but not calling them utf8. That led to similar speed as leaving 64898184e3Ssthen# it alone, but since that is harder for a human to parse, I left it as-is. 65898184e3Ssthen 66898184e3Ssthenmy %system_aliases = ( 67898184e3Ssthen 68b8851fccSafresh1 'SINGLE-SHIFT 2' => chr utf8::unicode_to_native(0x8E), 69b8851fccSafresh1 'SINGLE-SHIFT 3' => chr utf8::unicode_to_native(0x8F), 70b8851fccSafresh1 'PRIVATE USE 1' => chr utf8::unicode_to_native(0x91), 71b8851fccSafresh1 'PRIVATE USE 2' => chr utf8::unicode_to_native(0x92), 72898184e3Ssthen); 73898184e3Ssthen 74898184e3Ssthen# These are the aliases above that differ under :loose and :full matching 75898184e3Ssthen# because the :full versions have blanks or hyphens in them. 76898184e3Ssthen#my %loose_system_aliases = ( 77898184e3Ssthen#); 78898184e3Ssthen 7991f110e0Safresh1#my %deprecated_aliases; 80b8851fccSafresh1#$deprecated_aliases{'BELL'} = chr utf8::unicode_to_native(0x07) if $^V lt v5.17.0; 81898184e3Ssthen 82898184e3Ssthen#my %loose_deprecated_aliases = ( 83898184e3Ssthen#); 84898184e3Ssthen 85898184e3Ssthen# These are special cased in :loose matching, differing only in a medial 86898184e3Ssthen# hyphen 87b8851fccSafresh1my $HANGUL_JUNGSEONG_O_E_utf8 = chr 0x1180; 88b8851fccSafresh1my $HANGUL_JUNGSEONG_OE_utf8 = chr 0x116C; 89898184e3Ssthen 90898184e3Ssthen 91898184e3Ssthenmy $txt; # The table of official character names 92898184e3Ssthen 93898184e3Ssthenmy %full_names_cache; # Holds already-looked-up names, so don't have to 94898184e3Ssthen# re-look them up again. The previous versions of charnames had scoping 95898184e3Ssthen# bugs. For example if we use script A in one scope and find and cache 96898184e3Ssthen# what Z resolves to, we can't use that cache in a different scope that 97898184e3Ssthen# uses script B instead of A, as Z might be an entirely different letter 98898184e3Ssthen# there; or there might be different aliases in effect in different 99898184e3Ssthen# scopes, or :short may be in effect or not effect in different scopes, 100898184e3Ssthen# or various combinations thereof. This was solved in this version 101898184e3Ssthen# mostly by moving things to %^H. But some things couldn't be moved 102898184e3Ssthen# there. One of them was the cache of runtime looked-up names, in part 103898184e3Ssthen# because %^H is read-only at runtime. I (khw) don't know why the cache 104898184e3Ssthen# was run-time only in the previous versions: perhaps oversight; perhaps 105898184e3Ssthen# that compile time looking doesn't happen in a loop so didn't think it 106898184e3Ssthen# was worthwhile; perhaps not wanting to make the cache too large. But 107898184e3Ssthen# I decided to make it compile time as well; this could easily be 108898184e3Ssthen# changed. 109898184e3Ssthen# Anyway, this hash is not scoped, and is added to at runtime. It 110898184e3Ssthen# doesn't have scoping problems because the data in it is restricted to 111898184e3Ssthen# official names, which are always invariant, and we only set it and 112898184e3Ssthen# look at it at during :full lookups, so is unaffected by any other 113898184e3Ssthen# scoped options. I put this in to maintain parity with the older 114898184e3Ssthen# version. If desired, a %short_names cache could also be made, as well 115898184e3Ssthen# as one for each script, say in %script_names_cache, with each key 116898184e3Ssthen# being a hash for a script named in a 'use charnames' statement. I 117898184e3Ssthen# decided not to do that for now, just because it's added complication, 118898184e3Ssthen# and because I'm just trying to maintain parity, not extend it. 119898184e3Ssthen 120898184e3Ssthen# Like %full_names_cache, but for use when :loose is in effect. There needs 121898184e3Ssthen# to be two caches because :loose may not be in effect for a scope, and a 122898184e3Ssthen# loose name could inappropriately be returned when only exact matching is 123898184e3Ssthen# called for. 124898184e3Ssthenmy %loose_names_cache; 125898184e3Ssthen 126898184e3Ssthen# Designed so that test decimal first, and then hex. Leading zeros 127898184e3Ssthen# imply non-decimal, as do non-[0-9] 128898184e3Ssthenmy $decimal_qr = qr/^[1-9]\d*$/; 129898184e3Ssthen 130898184e3Ssthen# Returns the hex number in $1. 131898184e3Ssthenmy $hex_qr = qr/^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/; 132898184e3Ssthen 133898184e3Ssthensub croak 134898184e3Ssthen{ 135898184e3Ssthen require Carp; goto &Carp::croak; 136898184e3Ssthen} # croak 137898184e3Ssthen 138898184e3Ssthensub carp 139898184e3Ssthen{ 140898184e3Ssthen require Carp; goto &Carp::carp; 141898184e3Ssthen} # carp 142898184e3Ssthen 14356d68f1eSafresh1sub populate_txt() 14456d68f1eSafresh1{ 14556d68f1eSafresh1 return if $txt; 14656d68f1eSafresh1 14756d68f1eSafresh1 $txt = do "unicore/Name.pl"; 14856d68f1eSafresh1 Internals::SvREADONLY($txt, 1); 14956d68f1eSafresh1} 15056d68f1eSafresh1 151898184e3Ssthensub alias (@) # Set up a single alias 152898184e3Ssthen{ 15391f110e0Safresh1 my @errors; 154b8851fccSafresh1 my $nbsp = chr utf8::unicode_to_native(0xA0); 15591f110e0Safresh1 156898184e3Ssthen my $alias = ref $_[0] ? $_[0] : { @_ }; 15791f110e0Safresh1 foreach my $name (sort keys %$alias) { # Sort only because it helps having 15891f110e0Safresh1 # deterministic output for 15991f110e0Safresh1 # t/lib/charnames/alias 160898184e3Ssthen my $value = $alias->{$name}; 161898184e3Ssthen next unless defined $value; # Omit if screwed up. 162898184e3Ssthen 163898184e3Ssthen # Is slightly slower to just after this statement see if it is 164898184e3Ssthen # decimal, since we already know it is after having converted from 165898184e3Ssthen # hex, but makes the code easier to maintain, and is called 166898184e3Ssthen # infrequently, only at compile-time 167898184e3Ssthen if ($value !~ $decimal_qr && $value =~ $hex_qr) { 1686fb12b70Safresh1 my $temp = CORE::hex $1; 1696fb12b70Safresh1 $temp = utf8::unicode_to_native($temp) if $value =~ /^[Uu]\+/; 1706fb12b70Safresh1 $value = $temp; 171898184e3Ssthen } 172898184e3Ssthen if ($value =~ $decimal_qr) { 17391f110e0Safresh1 no warnings qw(non_unicode surrogate nonchar); # Allow any of these 174b8851fccSafresh1 $^H{charnames_ord_aliases}{$name} = chr $value; 175898184e3Ssthen 176898184e3Ssthen # Use a canonical form. 177898184e3Ssthen $^H{charnames_inverse_ords}{sprintf("%05X", $value)} = $name; 178898184e3Ssthen } 179898184e3Ssthen else { 180b8851fccSafresh1 my $ok_portion = ""; 181b8851fccSafresh1 $ok_portion = $1 if $name =~ / ^ ( 18291f110e0Safresh1 \p{_Perl_Charname_Begin} 18391f110e0Safresh1 \p{_Perl_Charname_Continue}* 184b8851fccSafresh1 ) /x; 18591f110e0Safresh1 186b8851fccSafresh1 # If the name was fully correct, the above should have matched all of 187b8851fccSafresh1 # it. 188b8851fccSafresh1 if (length $ok_portion < length $name) { 189b8851fccSafresh1 my $first_bad = substr($name, length($ok_portion), 1); 190b8851fccSafresh1 push @errors, "Invalid character in charnames alias definition; " 191b8851fccSafresh1 . "marked by <-- HERE in '$ok_portion$first_bad<-- HERE " 192b8851fccSafresh1 . substr($name, length($ok_portion) + 1) 193b8851fccSafresh1 . "'"; 19491f110e0Safresh1 } 19591f110e0Safresh1 else { 19691f110e0Safresh1 if ($name =~ / ( .* \s ) ( \s* ) $ /x) { 197b8851fccSafresh1 push @errors, "charnames alias definitions may not contain " 198b8851fccSafresh1 . "trailing white-space; marked by <-- HERE in " 199b8851fccSafresh1 . "'$1 <-- HERE " . $2 . "'"; 200b8851fccSafresh1 next; 20191f110e0Safresh1 } 20291f110e0Safresh1 20391f110e0Safresh1 # Use '+' instead of '*' in this regex, because any trailing 204b8851fccSafresh1 # blanks have already been found 20591f110e0Safresh1 if ($name =~ / ( .*? \s{2} ) ( .+ ) /x) { 206b8851fccSafresh1 push @errors, "charnames alias definitions may not contain a " 207b8851fccSafresh1 . "sequence of multiple spaces; marked by <-- HERE " 208b8851fccSafresh1 . "in '$1 <-- HERE " . $2 . "'"; 209b8851fccSafresh1 next; 210898184e3Ssthen } 211b8851fccSafresh1 212b8851fccSafresh1 $^H{charnames_name_aliases}{$name} = $value; 21391f110e0Safresh1 } 21491f110e0Safresh1 } 21591f110e0Safresh1 } 21691f110e0Safresh1 21791f110e0Safresh1 # We find and output all errors from this :alias definition, rather than 21891f110e0Safresh1 # failing on the first one, so fewer runs are needed to get it to compile 21991f110e0Safresh1 if (@errors) { 22091f110e0Safresh1 croak join "\n", @errors; 22191f110e0Safresh1 } 22291f110e0Safresh1 22391f110e0Safresh1 return; 224898184e3Ssthen} # alias 225898184e3Ssthen 226898184e3Ssthensub not_legal_use_bytes_msg { 227898184e3Ssthen my ($name, $utf8) = @_; 228898184e3Ssthen my $return; 229898184e3Ssthen 230898184e3Ssthen if (length($utf8) == 1) { 231898184e3Ssthen $return = sprintf("Character 0x%04x with name '%s' is", ord $utf8, $name); 232898184e3Ssthen } else { 233898184e3Ssthen $return = sprintf("String with name '%s' (and ordinals %s) contains character(s)", $name, join(" ", map { sprintf "0x%04X", ord $_ } split(//, $utf8))); 234898184e3Ssthen } 235898184e3Ssthen return $return . " above 0xFF with 'use bytes' in effect"; 236898184e3Ssthen} 237898184e3Ssthen 238898184e3Ssthensub alias_file ($) # Reads a file containing alias definitions 239898184e3Ssthen{ 240b8851fccSafresh1 require File::Spec; 241898184e3Ssthen my ($arg, $file) = @_; 242898184e3Ssthen if (-f $arg && File::Spec->file_name_is_absolute ($arg)) { 243898184e3Ssthen $file = $arg; 244898184e3Ssthen } 24591f110e0Safresh1 elsif ($arg =~ m/ ^ \p{_Perl_IDStart} \p{_Perl_IDCont}* $/x) { 246898184e3Ssthen $file = "unicore/${arg}_alias.pl"; 247898184e3Ssthen } 248898184e3Ssthen else { 24991f110e0Safresh1 croak "Charnames alias file names can only have identifier characters"; 250898184e3Ssthen } 251898184e3Ssthen if (my @alias = do $file) { 252898184e3Ssthen @alias == 1 && !defined $alias[0] and 253898184e3Ssthen croak "$file cannot be used as alias file for charnames"; 254898184e3Ssthen @alias % 2 and 255898184e3Ssthen croak "$file did not return a (valid) list of alias pairs"; 256898184e3Ssthen alias (@alias); 257898184e3Ssthen return (1); 258898184e3Ssthen } 259898184e3Ssthen 0; 260898184e3Ssthen} # alias_file 261898184e3Ssthen 262898184e3Ssthen# For use when don't import anything. This structure must be kept in 263898184e3Ssthen# sync with the one that import() fills up. 264898184e3Ssthenmy %dummy_H = ( 265898184e3Ssthen charnames_stringified_names => "", 266898184e3Ssthen charnames_stringified_ords => "", 267898184e3Ssthen charnames_scripts => "", 268898184e3Ssthen charnames_full => 1, 269898184e3Ssthen charnames_loose => 0, 270898184e3Ssthen charnames_short => 0, 271898184e3Ssthen ); 272898184e3Ssthen 273898184e3Ssthen 27456d68f1eSafresh1sub lookup_name ($$$;$) { 27556d68f1eSafresh1 my ($name, $wants_ord, $runtime, $regex_loose) = @_; 27656d68f1eSafresh1 $regex_loose //= 0; 277898184e3Ssthen 278898184e3Ssthen # Lookup the name or sequence $name in the tables. If $wants_ord is false, 279898184e3Ssthen # returns the string equivalent of $name; if true, returns the ordinal value 280898184e3Ssthen # instead, but in this case $name must not be a sequence; otherwise undef is 281898184e3Ssthen # returned and a warning raised. $runtime is 0 if compiletime, otherwise 282898184e3Ssthen # gives the number of stack frames to go back to get the application caller 283898184e3Ssthen # info. 284898184e3Ssthen # If $name is not found, returns undef in runtime with no warning; and in 285898184e3Ssthen # compiletime, the Unicode replacement character, with a warning. 286898184e3Ssthen 287898184e3Ssthen # It looks first in the aliases, then in the large table of official Unicode 288898184e3Ssthen # names. 289898184e3Ssthen 290b8851fccSafresh1 my $result; # The string result 291898184e3Ssthen my $save_input; 292898184e3Ssthen 29356d68f1eSafresh1 if ($runtime && ! $regex_loose) { 294898184e3Ssthen 295898184e3Ssthen my $hints_ref = (caller($runtime))[10]; 296898184e3Ssthen 297898184e3Ssthen # If we didn't import anything (which happens with 'use charnames ()', 298898184e3Ssthen # substitute a dummy structure. 299898184e3Ssthen $hints_ref = \%dummy_H if ! defined $hints_ref 300898184e3Ssthen || (! defined $hints_ref->{charnames_full} 301898184e3Ssthen && ! defined $hints_ref->{charnames_loose}); 302898184e3Ssthen 303b8851fccSafresh1 # At runtime, but currently not at compile time, %^H gets 304898184e3Ssthen # stringified, so un-stringify back to the original data structures. 305898184e3Ssthen # These get thrown away by perl before the next invocation 306898184e3Ssthen # Also fill in the hash with the non-stringified data. 307898184e3Ssthen # N.B. New fields must be also added to %dummy_H 308898184e3Ssthen 309898184e3Ssthen %{$^H{charnames_name_aliases}} = split ',', 310898184e3Ssthen $hints_ref->{charnames_stringified_names}; 311898184e3Ssthen %{$^H{charnames_ord_aliases}} = split ',', 312898184e3Ssthen $hints_ref->{charnames_stringified_ords}; 313898184e3Ssthen $^H{charnames_scripts} = $hints_ref->{charnames_scripts}; 314898184e3Ssthen $^H{charnames_full} = $hints_ref->{charnames_full}; 315898184e3Ssthen $^H{charnames_loose} = $hints_ref->{charnames_loose}; 316898184e3Ssthen $^H{charnames_short} = $hints_ref->{charnames_short}; 317898184e3Ssthen } 318898184e3Ssthen 31956d68f1eSafresh1 my $loose = $regex_loose || $^H{charnames_loose}; 320898184e3Ssthen my $lookup_name; # Input name suitably modified for grepping for in the 321898184e3Ssthen # table 322898184e3Ssthen 323898184e3Ssthen # User alias should be checked first or else can't override ours, and if we 324898184e3Ssthen # were to add any, could conflict with theirs. 32556d68f1eSafresh1 if (! $regex_loose && exists $^H{charnames_ord_aliases}{$name}) { 326b8851fccSafresh1 $result = $^H{charnames_ord_aliases}{$name}; 327898184e3Ssthen } 32856d68f1eSafresh1 elsif (! $regex_loose && exists $^H{charnames_name_aliases}{$name}) { 329898184e3Ssthen $name = $^H{charnames_name_aliases}{$name}; 330898184e3Ssthen $save_input = $lookup_name = $name; # Cache the result for any error 331898184e3Ssthen # message 332898184e3Ssthen # The aliases are documented to not match loosely, so change loose match 333898184e3Ssthen # into full. 334898184e3Ssthen if ($loose) { 335898184e3Ssthen $loose = 0; 336898184e3Ssthen $^H{charnames_full} = 1; 337898184e3Ssthen } 338898184e3Ssthen } 339898184e3Ssthen else { 340898184e3Ssthen 341898184e3Ssthen # Here, not a user alias. That means that loose matching may be in 342898184e3Ssthen # effect; will have to modify the input name. 343898184e3Ssthen $lookup_name = $name; 344898184e3Ssthen if ($loose) { 345898184e3Ssthen $lookup_name = uc $lookup_name; 346898184e3Ssthen 347898184e3Ssthen # Squeeze out all underscores 348898184e3Ssthen $lookup_name =~ s/_//g; 349898184e3Ssthen 350898184e3Ssthen # Remove all medial hyphens 351898184e3Ssthen $lookup_name =~ s/ (?<= \S ) - (?= \S )//gx; 352898184e3Ssthen 353898184e3Ssthen # Squeeze out all spaces 354898184e3Ssthen $lookup_name =~ s/\s//g; 355898184e3Ssthen } 356898184e3Ssthen 357898184e3Ssthen # Here, $lookup_name has been modified as necessary for looking in the 358898184e3Ssthen # hashes. Check the system alias files next. Most of these aliases are 359898184e3Ssthen # the same for both strict and loose matching. To save space, the ones 360898184e3Ssthen # which differ are in their own separate hash, which is checked if loose 361898184e3Ssthen # matching is selected and the regular match fails. To save time, the 362898184e3Ssthen # loose hashes could be expanded to include all aliases, and there would 363898184e3Ssthen # only have to be one check. But if someone specifies :loose, they are 364898184e3Ssthen # interested in convenience over speed, and the time for this second check 365898184e3Ssthen # is miniscule compared to the rest of the routine. 366898184e3Ssthen if (exists $system_aliases{$lookup_name}) { 367b8851fccSafresh1 $result = $system_aliases{$lookup_name}; 368898184e3Ssthen } 369898184e3Ssthen # There are currently no entries in this hash, so don't waste time looking 370898184e3Ssthen # for them. But the code is retained for the unlikely possibility that 371898184e3Ssthen # some will be added in the future. 372898184e3Ssthen# elsif ($loose && exists $loose_system_aliases{$lookup_name}) { 373b8851fccSafresh1# $result = $loose_system_aliases{$lookup_name}; 374898184e3Ssthen# } 37591f110e0Safresh1# if (exists $deprecated_aliases{$lookup_name}) { 37691f110e0Safresh1# require warnings; 37791f110e0Safresh1# warnings::warnif('deprecated', 37891f110e0Safresh1# "Unicode character name \"$name\" is deprecated, use \"" 37991f110e0Safresh1# . viacode(ord $deprecated_aliases{$lookup_name}) 38091f110e0Safresh1# . "\" instead"); 381b8851fccSafresh1# $result = $deprecated_aliases{$lookup_name}; 38291f110e0Safresh1# } 383898184e3Ssthen # There are currently no entries in this hash, so don't waste time looking 384898184e3Ssthen # for them. But the code is retained for the unlikely possibility that 385898184e3Ssthen # some will be added in the future. 386898184e3Ssthen# elsif ($loose && exists $loose_deprecated_aliases{$lookup_name}) { 387898184e3Ssthen# require warnings; 388898184e3Ssthen# warnings::warnif('deprecated', 389898184e3Ssthen# "Unicode character name \"$name\" is deprecated, use \"" 390898184e3Ssthen# . viacode(ord $loose_deprecated_aliases{$lookup_name}) 391898184e3Ssthen# . "\" instead"); 392b8851fccSafresh1# $result = $loose_deprecated_aliases{$lookup_name}; 393898184e3Ssthen# } 394898184e3Ssthen } 395898184e3Ssthen 396898184e3Ssthen my @off; # Offsets into table of pattern match begin and end 397898184e3Ssthen 398898184e3Ssthen # If haven't found it yet... 399b8851fccSafresh1 if (! defined $result) { 400898184e3Ssthen 401898184e3Ssthen # See if has looked this input up earlier. 402898184e3Ssthen if (! $loose && $^H{charnames_full} && exists $full_names_cache{$name}) { 403b8851fccSafresh1 $result = $full_names_cache{$name}; 404898184e3Ssthen } 405898184e3Ssthen elsif ($loose && exists $loose_names_cache{$name}) { 406b8851fccSafresh1 $result = $loose_names_cache{$name}; 407898184e3Ssthen } 408898184e3Ssthen else { # Here, must do a look-up 409898184e3Ssthen 410898184e3Ssthen # If full or loose matching succeeded, points to where to cache the 411898184e3Ssthen # result 412898184e3Ssthen my $cache_ref; 413898184e3Ssthen 414898184e3Ssthen ## Suck in the code/name list as a big string. 41556d68f1eSafresh1 ## Entries look like: 41656d68f1eSafresh1 ## "00052\nLATIN CAPITAL LETTER R\n\n" 417898184e3Ssthen # or 41856d68f1eSafresh1 # "0052 0303\nLATIN CAPITAL LETTER R WITH TILDE\n\n" 41956d68f1eSafresh1 populate_txt() unless $txt; 420898184e3Ssthen 421898184e3Ssthen ## @off will hold the index into the code/name string of the start and 422898184e3Ssthen ## end of the name as we find it. 423898184e3Ssthen 424898184e3Ssthen ## If :loose, look for a loose match; if :full, look for the name 425898184e3Ssthen ## exactly 426898184e3Ssthen # First, see if the name is one which is algorithmically determinable. 427898184e3Ssthen # The subroutine is included in Name.pl. The table contained in 428898184e3Ssthen # $txt doesn't contain these. Experiments show that checking 429898184e3Ssthen # for these before checking for the regular names has no 430898184e3Ssthen # noticeable impact on performance for the regular names, but 431898184e3Ssthen # the other way around slows down finding these immensely. 432898184e3Ssthen # Algorithmically determinables are not placed in the cache because 433898184e3Ssthen # that uses up memory, and finding these again is fast. 434898184e3Ssthen if ( ($loose || $^H{charnames_full}) 435898184e3Ssthen && (defined (my $ord = charnames::name_to_code_point_special($lookup_name, $loose)))) 436898184e3Ssthen { 437b8851fccSafresh1 $result = chr $ord; 438898184e3Ssthen } 439898184e3Ssthen else { 440898184e3Ssthen 441898184e3Ssthen # Not algorithmically determinable; look up in the table. The name 442898184e3Ssthen # will be turned into a regex, so quote any meta characters. 443898184e3Ssthen $lookup_name = quotemeta $lookup_name; 444898184e3Ssthen 445898184e3Ssthen if ($loose) { 446898184e3Ssthen 447898184e3Ssthen # For loose matches, $lookup_name has already squeezed out the 448898184e3Ssthen # non-essential characters. We have to add in code to make the 449898184e3Ssthen # squeezed version match the non-squeezed equivalent in the table. 450898184e3Ssthen # The only remaining hyphens are ones that start or end a word in 451898184e3Ssthen # the original. They have been quoted in $lookup_name so they look 452898184e3Ssthen # like "\-". Change all other characters except the backslash 453898184e3Ssthen # quotes for any metacharacters, and the final character, so that 454898184e3Ssthen # e.g., COLON gets transformed into: /C[- ]?O[- ]?L[- ]?O[- ]?N/ 455898184e3Ssthen $lookup_name =~ s/ (?! \\ -) # Don't do this to the \- sequence 456898184e3Ssthen ( [^-\\] ) # Nor the "-" within that sequence, 457898184e3Ssthen # nor the "\" that quotes metachars, 458898184e3Ssthen # but otherwise put the char into $1 459898184e3Ssthen (?=.) # And don't do it for the final char 460898184e3Ssthen /$1\[- \]?/gx; # And add an optional blank or 461898184e3Ssthen # '-' after each $1 char 462898184e3Ssthen 463898184e3Ssthen # Those remaining hyphens were originally at the beginning or end of 464898184e3Ssthen # a word, so they can match either a blank before or after, but not 465898184e3Ssthen # both. (Keep in mind that they have been quoted, so are a '\-' 466898184e3Ssthen # sequence) 467898184e3Ssthen $lookup_name =~ s/\\ -/(?:- | -)/xg; 468898184e3Ssthen } 469898184e3Ssthen 470898184e3Ssthen # Do the lookup in the full table if asked for, and if succeeds 471898184e3Ssthen # save the offsets and set where to cache the result. 47256d68f1eSafresh1 if (($loose || $^H{charnames_full}) && $txt =~ /^$lookup_name$/m) { 47356d68f1eSafresh1 @off = ($-[0], $+[0]); 474898184e3Ssthen $cache_ref = ($loose) ? \%loose_names_cache : \%full_names_cache; 475898184e3Ssthen } 47656d68f1eSafresh1 elsif ($regex_loose) { 47756d68f1eSafresh1 # Currently don't allow :short when this is set 47856d68f1eSafresh1 return; 47956d68f1eSafresh1 } 480898184e3Ssthen else { 481898184e3Ssthen 482898184e3Ssthen # Here, didn't look for, or didn't find the name. 483898184e3Ssthen # If :short is allowed, see if input is like "greek:Sigma". 484898184e3Ssthen # Keep in mind that $lookup_name has had the metas quoted. 485898184e3Ssthen my $scripts_trie = ""; 486898184e3Ssthen my $name_has_uppercase; 487*eac174f2Safresh1 my @scripts; 488898184e3Ssthen if (($^H{charnames_short}) 489898184e3Ssthen && $lookup_name =~ /^ (?: \\ \s)* # Quoted space 490898184e3Ssthen (.+?) # $1 = the script 491898184e3Ssthen (?: \\ \s)* 492898184e3Ssthen \\ : # Quoted colon 493898184e3Ssthen (?: \\ \s)* 494898184e3Ssthen (.+?) # $2 = the name 495898184e3Ssthen (?: \\ \s)* $ 496898184e3Ssthen /xs) 497898184e3Ssthen { 498898184e3Ssthen # Even in non-loose matching, the script traditionally has been 4996fb12b70Safresh1 # case insensitive 500898184e3Ssthen $scripts_trie = "\U$1"; 501898184e3Ssthen $lookup_name = $2; 502898184e3Ssthen 503898184e3Ssthen # Use original name to find its input casing, but ignore the 504898184e3Ssthen # script part of that to make the determination. 505898184e3Ssthen $save_input = $name if ! defined $save_input; 506898184e3Ssthen $name =~ s/.*?://; 507898184e3Ssthen $name_has_uppercase = $name =~ /[[:upper:]]/; 508898184e3Ssthen } 509898184e3Ssthen else { # Otherwise look in allowed scripts 510*eac174f2Safresh1 # We want to search first by script name then by letter name, so that 511*eac174f2Safresh1 # if the user imported `use charnames qw(arabic hebrew)` and asked for 512*eac174f2Safresh1 # \N{alef} they get ARABIC LETTER ALEF, and if they imported 513*eac174f2Safresh1 # `... (hebrew arabic)` and ask for \N{alef} they get HEBREW LETTER ALEF. 514*eac174f2Safresh1 # We can't rely on the regex engine to preserve ordering like that, so 515*eac174f2Safresh1 # pick the pipe-seperated string apart so we can iterate over it. 516*eac174f2Safresh1 @scripts = split(/\|/, $^H{charnames_scripts}); 517898184e3Ssthen 518898184e3Ssthen # Use original name to find its input casing 519898184e3Ssthen $name_has_uppercase = $name =~ /[[:upper:]]/; 520898184e3Ssthen } 521898184e3Ssthen my $case = $name_has_uppercase ? "CAPITAL" : "SMALL"; 522*eac174f2Safresh1 523*eac174f2Safresh1 if(@scripts) { 524*eac174f2Safresh1 SCRIPTS: foreach my $script (@scripts) { 525*eac174f2Safresh1 if($txt =~ /^ (?: $script ) \ (?:$case\ )? LETTER \ \U$lookup_name $/xm) { 526*eac174f2Safresh1 @off = ($-[0], $+[0]); 527*eac174f2Safresh1 last SCRIPTS; 528*eac174f2Safresh1 } 529*eac174f2Safresh1 } 530*eac174f2Safresh1 return unless(@off); 531*eac174f2Safresh1 } 532*eac174f2Safresh1 else { 53391f110e0Safresh1 return if (! $scripts_trie || $txt !~ 53456d68f1eSafresh1 /^ (?: $scripts_trie ) \ (?:$case\ )? LETTER \ \U$lookup_name $/xm); 53556d68f1eSafresh1 @off = ($-[0], $+[0]); 536898184e3Ssthen } 537*eac174f2Safresh1 } 538898184e3Ssthen 539898184e3Ssthen # Here, the input name has been found; we haven't set up the output, 540898184e3Ssthen # but we know where in the string 541898184e3Ssthen # the name starts. The string is set up so that for single characters 54256d68f1eSafresh1 # (and not named sequences), the name is on a line by itself, and the 54356d68f1eSafresh1 # previous line contains precisely 5 hex digits for its code point. 54456d68f1eSafresh1 # Named sequences won't have the 7th preceding character be a \n. 545898184e3Ssthen # (Actually, for the very first entry in the table this isn't strictly 546898184e3Ssthen # true: subtracting 7 will yield -1, and the substr below will 547898184e3Ssthen # therefore yield the very last character in the table, which should 548898184e3Ssthen # also be a \n, so the statement works anyway.) 549898184e3Ssthen if (substr($txt, $off[0] - 7, 1) eq "\n") { 550b8851fccSafresh1 $result = chr CORE::hex substr($txt, $off[0] - 6, 5); 551898184e3Ssthen 552898184e3Ssthen # Handle the single loose matching special case, in which two names 553898184e3Ssthen # differ only by a single medial hyphen. If the original had a 554898184e3Ssthen # hyphen (or more) in the right place, then it is that one. 555b8851fccSafresh1 $result = $HANGUL_JUNGSEONG_O_E_utf8 556898184e3Ssthen if $loose 557b8851fccSafresh1 && $result eq $HANGUL_JUNGSEONG_OE_utf8 558898184e3Ssthen && $name =~ m/O \s* - [-\s]* E/ix; 559898184e3Ssthen # Note that this wouldn't work if there were a 2nd 560898184e3Ssthen # OE in the name 561898184e3Ssthen } 562898184e3Ssthen else { 563898184e3Ssthen 564898184e3Ssthen # Here, is a named sequence. Need to go looking for the beginning, 565898184e3Ssthen # which is just after the \n from the previous entry in the table. 566898184e3Ssthen # The +1 skips past that newline, or, if the rindex() fails, to put 567898184e3Ssthen # us to an offset of zero. 568898184e3Ssthen my $charstart = rindex($txt, "\n", $off[0] - 7) + 1; 569b8851fccSafresh1 $result = pack("W*", map { CORE::hex } 570898184e3Ssthen split " ", substr($txt, $charstart, $off[0] - $charstart - 1)); 571898184e3Ssthen } 572898184e3Ssthen } 573898184e3Ssthen 574898184e3Ssthen # Cache the input so as to not have to search the large table 575898184e3Ssthen # again, but only if it came from the one search that we cache. 576898184e3Ssthen # (Haven't bothered with the pain of sorting out scoping issues for the 577898184e3Ssthen # scripts searches.) 578b8851fccSafresh1 $cache_ref->{$name} = $result if defined $cache_ref; 579898184e3Ssthen } 580898184e3Ssthen } 581898184e3Ssthen 582b8851fccSafresh1 # Here, have the result character. If the return is to be an ord, must be 583b8851fccSafresh1 # any single character. 584898184e3Ssthen if ($wants_ord) { 585b8851fccSafresh1 return ord($result) if length $result == 1; 586b8851fccSafresh1 } 587b8851fccSafresh1 elsif (! utf8::is_utf8($result)) { 588b8851fccSafresh1 589b8851fccSafresh1 # Here isn't UTF-8. That's OK if it is all ASCII, or we are being called 590b8851fccSafresh1 # at compile time where we know we can guarantee that Unicode rules are 591b8851fccSafresh1 # correctly imposed on the result, or under 'bytes' where we don't want 592b8851fccSafresh1 # those rules. But otherwise we have to make it UTF8 to guarantee Unicode 593b8851fccSafresh1 # rules on the returned string. 594b8851fccSafresh1 return $result if ! $runtime 595b8851fccSafresh1 || (caller $runtime)[8] & $bytes::hint_bits 596b8851fccSafresh1 || $result !~ /[[:^ascii:]]/; 597b8851fccSafresh1 utf8::upgrade($result); 598b8851fccSafresh1 return $result; 599898184e3Ssthen } 600898184e3Ssthen else { 601898184e3Ssthen 602898184e3Ssthen # Here, wants string output. If utf8 is acceptable, just return what 603898184e3Ssthen # we've got; otherwise attempt to convert it to non-utf8 and return that. 60456d68f1eSafresh1 my $in_bytes = ! $regex_loose # \p{name=} doesn't currently care if 60556d68f1eSafresh1 # in bytes or not 60656d68f1eSafresh1 && (($runtime) 607898184e3Ssthen ? (caller $runtime)[8] & $bytes::hint_bits 60856d68f1eSafresh1 : $^H & $bytes::hint_bits); 609b8851fccSafresh1 return $result if (! $in_bytes || utf8::downgrade($result, 1)) # The 1 arg 610898184e3Ssthen # means don't die on failure 611898184e3Ssthen } 612898184e3Ssthen 613898184e3Ssthen # Here, there is an error: either there are too many characters, or the 614898184e3Ssthen # result string needs to be non-utf8, and at least one character requires 615898184e3Ssthen # utf8. Prefer any official name over the input one for the error message. 616898184e3Ssthen if (@off) { 617898184e3Ssthen $name = substr($txt, $off[0], $off[1] - $off[0]) if @off; 618898184e3Ssthen } 619898184e3Ssthen else { 620898184e3Ssthen $name = (defined $save_input) ? $save_input : $_[0]; 621898184e3Ssthen } 622898184e3Ssthen 623898184e3Ssthen if ($wants_ord) { 624898184e3Ssthen # Only way to get here in this case is if result too long. Message 625898184e3Ssthen # assumes that our only caller that requires single char result is 626898184e3Ssthen # vianame. 627898184e3Ssthen carp "charnames::vianame() doesn't handle named sequences ($name). Use charnames::string_vianame() instead"; 628898184e3Ssthen return; 629898184e3Ssthen } 630898184e3Ssthen 631898184e3Ssthen # Only other possible failure here is from use bytes. 632898184e3Ssthen if ($runtime) { 633b8851fccSafresh1 carp not_legal_use_bytes_msg($name, $result); 634898184e3Ssthen return; 635898184e3Ssthen } else { 636b8851fccSafresh1 croak not_legal_use_bytes_msg($name, $result); 637898184e3Ssthen } 638898184e3Ssthen 639898184e3Ssthen} # lookup_name 640898184e3Ssthen 641898184e3Ssthensub charnames { 642898184e3Ssthen 643898184e3Ssthen # For \N{...}. Looks up the character name and returns the string 644898184e3Ssthen # representation of it. 645898184e3Ssthen 646898184e3Ssthen # The first 0 arg means wants a string returned; the second that we are in 647898184e3Ssthen # compile time 648898184e3Ssthen return lookup_name($_[0], 0, 0); 649898184e3Ssthen} 650898184e3Ssthen 65156d68f1eSafresh1sub _loose_regcomp_lookup { 65256d68f1eSafresh1 # For use only by regcomp.c to compile \p{name=...} 65356d68f1eSafresh1 # khw thinks it best to not do :short matching, and only official names. 65456d68f1eSafresh1 # But that is only a guess, and if demand warrants, could be changed 65556d68f1eSafresh1 return lookup_name($_[0], 0, 1, 65656d68f1eSafresh1 1 # Always use :loose matching 65756d68f1eSafresh1 ); 65856d68f1eSafresh1} 65956d68f1eSafresh1 66056d68f1eSafresh1sub _get_names_info { 66156d68f1eSafresh1 # For use only by regcomp.c to compile \p{name=/.../} 66256d68f1eSafresh1 populate_txt() unless $txt; 66356d68f1eSafresh1 66456d68f1eSafresh1 66556d68f1eSafresh1 return ( \$txt, \@charnames::code_points_ending_in_code_point ); 66656d68f1eSafresh1} 66756d68f1eSafresh1 668898184e3Ssthensub import 669898184e3Ssthen{ 670898184e3Ssthen shift; ## ignore class name 671898184e3Ssthen 672*eac174f2Safresh1 populate_txt() unless $txt; 673*eac174f2Safresh1 674898184e3Ssthen if (not @_) { 675898184e3Ssthen carp("'use charnames' needs explicit imports list"); 676898184e3Ssthen } 677898184e3Ssthen $^H{charnames} = \&charnames ; 678898184e3Ssthen $^H{charnames_ord_aliases} = {}; 679898184e3Ssthen $^H{charnames_name_aliases} = {}; 680898184e3Ssthen $^H{charnames_inverse_ords} = {}; 681898184e3Ssthen # New fields must be added to %dummy_H, and the code in lookup_name() 682898184e3Ssthen # that copies fields from the runtime structure 683898184e3Ssthen 684898184e3Ssthen ## 685898184e3Ssthen ## fill %h keys with our @_ args. 686898184e3Ssthen ## 687898184e3Ssthen my ($promote, %h, @args) = (0); 688898184e3Ssthen while (my $arg = shift) { 689898184e3Ssthen if ($arg eq ":alias") { 690898184e3Ssthen @_ or 691898184e3Ssthen croak ":alias needs an argument in charnames"; 692898184e3Ssthen my $alias = shift; 693898184e3Ssthen if (ref $alias) { 694898184e3Ssthen ref $alias eq "HASH" or 695898184e3Ssthen croak "Only HASH reference supported as argument to :alias"; 696898184e3Ssthen alias ($alias); 69791f110e0Safresh1 $promote = 1; 698898184e3Ssthen next; 699898184e3Ssthen } 700898184e3Ssthen if ($alias =~ m{:(\w+)$}) { 701898184e3Ssthen $1 eq "full" || $1 eq "loose" || $1 eq "short" and 702898184e3Ssthen croak ":alias cannot use existing pragma :$1 (reversed order?)"; 703898184e3Ssthen alias_file ($1) and $promote = 1; 704898184e3Ssthen next; 705898184e3Ssthen } 70691f110e0Safresh1 alias_file ($alias) and $promote = 1; 707898184e3Ssthen next; 708898184e3Ssthen } 709898184e3Ssthen if (substr($arg, 0, 1) eq ':' 710898184e3Ssthen and ! ($arg eq ":full" || $arg eq ":short" || $arg eq ":loose")) 711898184e3Ssthen { 712898184e3Ssthen warn "unsupported special '$arg' in charnames"; 713898184e3Ssthen next; 714898184e3Ssthen } 715898184e3Ssthen push @args, $arg; 716898184e3Ssthen } 717898184e3Ssthen 718898184e3Ssthen @args == 0 && $promote and @args = (":full"); 719898184e3Ssthen @h{@args} = (1) x @args; 720898184e3Ssthen 721898184e3Ssthen # Don't leave these undefined as are tested for in lookup_names 722898184e3Ssthen $^H{charnames_full} = delete $h{':full'} || 0; 723898184e3Ssthen $^H{charnames_loose} = delete $h{':loose'} || 0; 724898184e3Ssthen $^H{charnames_short} = delete $h{':short'} || 0; 725*eac174f2Safresh1 my @scripts = map { uc quotemeta } grep { /^[^:]/ } @args; 726898184e3Ssthen 727898184e3Ssthen ## 728898184e3Ssthen ## If utf8? warnings are enabled, and some scripts were given, 729898184e3Ssthen ## see if at least we can find one letter from each script. 730898184e3Ssthen ## 731898184e3Ssthen if (warnings::enabled('utf8') && @scripts) { 732898184e3Ssthen for my $script (@scripts) { 73356d68f1eSafresh1 if (not $txt =~ m/^$script (?:CAPITAL |SMALL )?LETTER /m) { 734898184e3Ssthen warnings::warn('utf8', "No such script: '$script'"); 735898184e3Ssthen $script = quotemeta $script; # Escape it, for use in the re. 736898184e3Ssthen } 737898184e3Ssthen } 738898184e3Ssthen } 739898184e3Ssthen 740898184e3Ssthen # %^H gets stringified, so serialize it ourselves so can extract the 741898184e3Ssthen # real data back later. 742898184e3Ssthen $^H{charnames_stringified_ords} = join ",", %{$^H{charnames_ord_aliases}}; 743898184e3Ssthen $^H{charnames_stringified_names} = join ",", %{$^H{charnames_name_aliases}}; 744898184e3Ssthen $^H{charnames_stringified_inverse_ords} = join ",", %{$^H{charnames_inverse_ords}}; 745898184e3Ssthen 746898184e3Ssthen # Modify the input script names for loose name matching if that is also 747898184e3Ssthen # specified, similar to the way the base character name is prepared. They 748898184e3Ssthen # don't (currently, and hopefully never will) have dashes. These go into a 749898184e3Ssthen # regex, and have already been uppercased and quotemeta'd. Squeeze out all 750898184e3Ssthen # input underscores, blanks, and dashes. Then convert so will match a blank 751898184e3Ssthen # between any characters. 752898184e3Ssthen if ($^H{charnames_loose}) { 753898184e3Ssthen for (my $i = 0; $i < @scripts; $i++) { 754898184e3Ssthen $scripts[$i] =~ s/[_ -]//g; 755898184e3Ssthen $scripts[$i] =~ s/ ( [^\\] ) (?= . ) /$1\\ ?/gx; 756898184e3Ssthen } 757898184e3Ssthen } 758898184e3Ssthen 759*eac174f2Safresh1 my %letters_by_script = map { 760*eac174f2Safresh1 $_ => [ 761*eac174f2Safresh1 ($txt =~ m/$_(?: (?:small|capital))? letter (.*)/ig) 762*eac174f2Safresh1 ] 763*eac174f2Safresh1 } @scripts; 764*eac174f2Safresh1 SCRIPTS: foreach my $this_script (@scripts) { 765*eac174f2Safresh1 my @other_scripts = grep { $_ ne $this_script } @scripts; 766*eac174f2Safresh1 my @this_script_letters = @{$letters_by_script{$this_script}}; 767*eac174f2Safresh1 my @other_script_letters = map { @{$letters_by_script{$_}} } @other_scripts; 768*eac174f2Safresh1 foreach my $this_letter (@this_script_letters) { 769*eac174f2Safresh1 if(grep { $_ eq $this_letter } @other_script_letters) { 770*eac174f2Safresh1 warn "charnames: some short character names may clash in [".join(', ', sort @scripts)."], for example $this_letter\n"; 771*eac174f2Safresh1 last SCRIPTS; 772*eac174f2Safresh1 } 773*eac174f2Safresh1 } 774*eac174f2Safresh1 } 775*eac174f2Safresh1 776898184e3Ssthen $^H{charnames_scripts} = join "|", @scripts; # Stringifiy them as a trie 777898184e3Ssthen} # import 778898184e3Ssthen 779898184e3Ssthen# Cache of already looked-up values. This is set to only contain 780898184e3Ssthen# official values, and user aliases can't override them, so scoping is 781898184e3Ssthen# not an issue. 782898184e3Ssthenmy %viacode; 783898184e3Ssthen 7846fb12b70Safresh1my $no_name_code_points_re = join "|", map { sprintf("%05X", 7856fb12b70Safresh1 utf8::unicode_to_native($_)) } 7866fb12b70Safresh1 0x80, 0x81, 0x84, 0x99; 7876fb12b70Safresh1$no_name_code_points_re = qr/$no_name_code_points_re/; 7886fb12b70Safresh1 789898184e3Ssthensub viacode { 790898184e3Ssthen 791898184e3Ssthen # Returns the name of the code point argument 792898184e3Ssthen 793898184e3Ssthen if (@_ != 1) { 794898184e3Ssthen carp "charnames::viacode() expects one argument"; 795898184e3Ssthen return; 796898184e3Ssthen } 797898184e3Ssthen 798898184e3Ssthen my $arg = shift; 799898184e3Ssthen 800898184e3Ssthen # This is derived from Unicode::UCD, where it is nearly the same as the 801898184e3Ssthen # function _getcode(), but here it makes sure that even a hex argument 802898184e3Ssthen # has the proper number of leading zeros, which is critical in 803898184e3Ssthen # matching against $txt below 804898184e3Ssthen # Must check if decimal first; see comments at that definition 805898184e3Ssthen my $hex; 806898184e3Ssthen if ($arg =~ $decimal_qr) { 807898184e3Ssthen $hex = sprintf "%05X", $arg; 808898184e3Ssthen } elsif ($arg =~ $hex_qr) { 8096fb12b70Safresh1 $hex = CORE::hex $1; 8106fb12b70Safresh1 $hex = utf8::unicode_to_native($hex) if $arg =~ /^[Uu]\+/; 811898184e3Ssthen # Below is the line that differs from the _getcode() source 8126fb12b70Safresh1 $hex = sprintf "%05X", $hex; 813898184e3Ssthen } else { 814898184e3Ssthen carp("unexpected arg \"$arg\" to charnames::viacode()"); 815898184e3Ssthen return; 816898184e3Ssthen } 817898184e3Ssthen 818898184e3Ssthen return $viacode{$hex} if exists $viacode{$hex}; 819898184e3Ssthen 820898184e3Ssthen my $return; 821898184e3Ssthen 822898184e3Ssthen # If the code point is above the max in the table, there's no point 823898184e3Ssthen # looking through it. Checking the length first is slightly faster 824898184e3Ssthen if (length($hex) <= 5 || CORE::hex($hex) <= 0x10FFFF) { 82556d68f1eSafresh1 populate_txt() unless $txt; 826898184e3Ssthen 827898184e3Ssthen # See if the name is algorithmically determinable. 828898184e3Ssthen my $algorithmic = charnames::code_point_to_name_special(CORE::hex $hex); 829898184e3Ssthen if (defined $algorithmic) { 830898184e3Ssthen $viacode{$hex} = $algorithmic; 831898184e3Ssthen return $algorithmic; 832898184e3Ssthen } 833898184e3Ssthen 834898184e3Ssthen # Return the official name, if exists. It's unclear to me (khw) at 835898184e3Ssthen # this juncture if it is better to return a user-defined override, so 836898184e3Ssthen # leaving it as is for now. 83756d68f1eSafresh1 if ($txt =~ m/^$hex\n/m) { 838898184e3Ssthen 839898184e3Ssthen # The name starts with the next character and goes up to the 840898184e3Ssthen # next new-line. Using capturing parentheses above instead of 841898184e3Ssthen # @+ more than doubles the execution time in Perl 5.13 842898184e3Ssthen $return = substr($txt, $+[0], index($txt, "\n", $+[0]) - $+[0]); 843898184e3Ssthen 844898184e3Ssthen # If not one of these 4 code points, return what we've found. 8456fb12b70Safresh1 if ($hex !~ / ^ $no_name_code_points_re $ /x) { 846898184e3Ssthen $viacode{$hex} = $return; 847898184e3Ssthen return $return; 848898184e3Ssthen } 849898184e3Ssthen 850898184e3Ssthen # For backwards compatibility, we don't return the official name of 851898184e3Ssthen # the 4 code points if there are user-defined aliases for them -- so 852898184e3Ssthen # continue looking. 853898184e3Ssthen } 854898184e3Ssthen } 855898184e3Ssthen 856898184e3Ssthen # See if there is a user name for it, before giving up completely. 857898184e3Ssthen # First get the scoped aliases, give up if have none. 858898184e3Ssthen my $H_ref = (caller(1))[10]; 859898184e3Ssthen return if ! defined $return 860898184e3Ssthen && (! defined $H_ref 861898184e3Ssthen || ! exists $H_ref->{charnames_stringified_inverse_ords}); 862898184e3Ssthen 863898184e3Ssthen my %code_point_aliases; 864898184e3Ssthen if (defined $H_ref->{charnames_stringified_inverse_ords}) { 865898184e3Ssthen %code_point_aliases = split ',', 866898184e3Ssthen $H_ref->{charnames_stringified_inverse_ords}; 867898184e3Ssthen return $code_point_aliases{$hex} if exists $code_point_aliases{$hex}; 868898184e3Ssthen } 869898184e3Ssthen 870898184e3Ssthen # Here there is no user-defined alias, return any official one. 871898184e3Ssthen return $return if defined $return; 872898184e3Ssthen 87391f110e0Safresh1 if (CORE::hex($hex) > 0x10FFFF 87491f110e0Safresh1 && warnings::enabled('non_unicode')) 87591f110e0Safresh1 { 876898184e3Ssthen carp "Unicode characters only allocated up to U+10FFFF (you asked for U+$hex)"; 877898184e3Ssthen } 878898184e3Ssthen return; 879898184e3Ssthen 8806fb12b70Safresh1} # viacode 881898184e3Ssthen 882898184e3Ssthen1; 883898184e3Ssthen 884898184e3Ssthen# ex: set ts=8 sts=2 sw=2 et: 885