1*0Sstevel@tonic-gatepackage charnames; 2*0Sstevel@tonic-gateuse strict; 3*0Sstevel@tonic-gateuse warnings; 4*0Sstevel@tonic-gateuse Carp; 5*0Sstevel@tonic-gateuse File::Spec; 6*0Sstevel@tonic-gateour $VERSION = '1.03'; 7*0Sstevel@tonic-gate 8*0Sstevel@tonic-gateuse bytes (); # for $bytes::hint_bits 9*0Sstevel@tonic-gate$charnames::hint_bits = 0x20000; # HINT_LOCALIZE_HH 10*0Sstevel@tonic-gate 11*0Sstevel@tonic-gatemy %alias1 = ( 12*0Sstevel@tonic-gate # Icky 3.2 names with parentheses. 13*0Sstevel@tonic-gate 'LINE FEED' => 'LINE FEED (LF)', 14*0Sstevel@tonic-gate 'FORM FEED' => 'FORM FEED (FF)', 15*0Sstevel@tonic-gate 'CARRIAGE RETURN' => 'CARRIAGE RETURN (CR)', 16*0Sstevel@tonic-gate 'NEXT LINE' => 'NEXT LINE (NEL)', 17*0Sstevel@tonic-gate # Convenience. 18*0Sstevel@tonic-gate 'LF' => 'LINE FEED (LF)', 19*0Sstevel@tonic-gate 'FF' => 'FORM FEED (FF)', 20*0Sstevel@tonic-gate 'CR' => 'CARRIAGE RETURN (CR)', 21*0Sstevel@tonic-gate 'NEL' => 'NEXT LINE (NEL)', 22*0Sstevel@tonic-gate # More convenience. For futher convencience, 23*0Sstevel@tonic-gate # it is suggested some way using using the NamesList 24*0Sstevel@tonic-gate # aliases is implemented. 25*0Sstevel@tonic-gate 'ZWNJ' => 'ZERO WIDTH NON-JOINER', 26*0Sstevel@tonic-gate 'ZWJ' => 'ZERO WIDTH JOINER', 27*0Sstevel@tonic-gate 'BOM' => 'BYTE ORDER MARK', 28*0Sstevel@tonic-gate ); 29*0Sstevel@tonic-gate 30*0Sstevel@tonic-gatemy %alias2 = ( 31*0Sstevel@tonic-gate # Pre-3.2 compatibility (only for the first 256 characters). 32*0Sstevel@tonic-gate 'HORIZONTAL TABULATION' => 'CHARACTER TABULATION', 33*0Sstevel@tonic-gate 'VERTICAL TABULATION' => 'LINE TABULATION', 34*0Sstevel@tonic-gate 'FILE SEPARATOR' => 'INFORMATION SEPARATOR FOUR', 35*0Sstevel@tonic-gate 'GROUP SEPARATOR' => 'INFORMATION SEPARATOR THREE', 36*0Sstevel@tonic-gate 'RECORD SEPARATOR' => 'INFORMATION SEPARATOR TWO', 37*0Sstevel@tonic-gate 'UNIT SEPARATOR' => 'INFORMATION SEPARATOR ONE', 38*0Sstevel@tonic-gate 'PARTIAL LINE DOWN' => 'PARTIAL LINE FORWARD', 39*0Sstevel@tonic-gate 'PARTIAL LINE UP' => 'PARTIAL LINE BACKWARD', 40*0Sstevel@tonic-gate ); 41*0Sstevel@tonic-gate 42*0Sstevel@tonic-gatemy %alias3 = ( 43*0Sstevel@tonic-gate # User defined aliasses. Even more convenient :) 44*0Sstevel@tonic-gate ); 45*0Sstevel@tonic-gatemy $txt; 46*0Sstevel@tonic-gate 47*0Sstevel@tonic-gatesub alias (@) 48*0Sstevel@tonic-gate{ 49*0Sstevel@tonic-gate @_ or return %alias3; 50*0Sstevel@tonic-gate my $alias = ref $_[0] ? $_[0] : { @_ }; 51*0Sstevel@tonic-gate @alias3{keys %$alias} = values %$alias; 52*0Sstevel@tonic-gate} # alias 53*0Sstevel@tonic-gate 54*0Sstevel@tonic-gatesub alias_file ($) 55*0Sstevel@tonic-gate{ 56*0Sstevel@tonic-gate my ($arg, $file) = @_; 57*0Sstevel@tonic-gate if (-f $arg && File::Spec->file_name_is_absolute ($arg)) { 58*0Sstevel@tonic-gate $file = $arg; 59*0Sstevel@tonic-gate } 60*0Sstevel@tonic-gate elsif ($arg =~ m/^\w+$/) { 61*0Sstevel@tonic-gate $file = "unicore/${arg}_alias.pl"; 62*0Sstevel@tonic-gate } 63*0Sstevel@tonic-gate else { 64*0Sstevel@tonic-gate croak "Charnames alias files can only have identifier characters"; 65*0Sstevel@tonic-gate } 66*0Sstevel@tonic-gate if (my @alias = do $file) { 67*0Sstevel@tonic-gate @alias == 1 && !defined $alias[0] and 68*0Sstevel@tonic-gate croak "$file cannot be used as alias file for charnames"; 69*0Sstevel@tonic-gate @alias % 2 and 70*0Sstevel@tonic-gate croak "$file did not return a (valid) list of alias pairs"; 71*0Sstevel@tonic-gate alias (@alias); 72*0Sstevel@tonic-gate return (1); 73*0Sstevel@tonic-gate } 74*0Sstevel@tonic-gate 0; 75*0Sstevel@tonic-gate} # alias_file 76*0Sstevel@tonic-gate 77*0Sstevel@tonic-gate# This is not optimized in any way yet 78*0Sstevel@tonic-gatesub charnames 79*0Sstevel@tonic-gate{ 80*0Sstevel@tonic-gate my $name = shift; 81*0Sstevel@tonic-gate 82*0Sstevel@tonic-gate if (exists $alias1{$name}) { 83*0Sstevel@tonic-gate $name = $alias1{$name}; 84*0Sstevel@tonic-gate } 85*0Sstevel@tonic-gate elsif (exists $alias2{$name}) { 86*0Sstevel@tonic-gate require warnings; 87*0Sstevel@tonic-gate warnings::warnif('deprecated', qq{Unicode character name "$name" is deprecated, use "$alias2{$name}" instead}); 88*0Sstevel@tonic-gate $name = $alias2{$name}; 89*0Sstevel@tonic-gate } 90*0Sstevel@tonic-gate elsif (exists $alias3{$name}) { 91*0Sstevel@tonic-gate $name = $alias3{$name}; 92*0Sstevel@tonic-gate } 93*0Sstevel@tonic-gate 94*0Sstevel@tonic-gate my $ord; 95*0Sstevel@tonic-gate my @off; 96*0Sstevel@tonic-gate my $fname; 97*0Sstevel@tonic-gate 98*0Sstevel@tonic-gate if ($name eq "BYTE ORDER MARK") { 99*0Sstevel@tonic-gate $fname = $name; 100*0Sstevel@tonic-gate $ord = 0xFEFF; 101*0Sstevel@tonic-gate } else { 102*0Sstevel@tonic-gate ## Suck in the code/name list as a big string. 103*0Sstevel@tonic-gate ## Lines look like: 104*0Sstevel@tonic-gate ## "0052\t\tLATIN CAPITAL LETTER R\n" 105*0Sstevel@tonic-gate $txt = do "unicore/Name.pl" unless $txt; 106*0Sstevel@tonic-gate 107*0Sstevel@tonic-gate ## @off will hold the index into the code/name string of the start and 108*0Sstevel@tonic-gate ## end of the name as we find it. 109*0Sstevel@tonic-gate 110*0Sstevel@tonic-gate ## If :full, look for the name exactly 111*0Sstevel@tonic-gate if ($^H{charnames_full} and $txt =~ /\t\t\Q$name\E$/m) { 112*0Sstevel@tonic-gate @off = ($-[0], $+[0]); 113*0Sstevel@tonic-gate } 114*0Sstevel@tonic-gate 115*0Sstevel@tonic-gate ## If we didn't get above, and :short allowed, look for the short name. 116*0Sstevel@tonic-gate ## The short name is like "greek:Sigma" 117*0Sstevel@tonic-gate unless (@off) { 118*0Sstevel@tonic-gate if ($^H{charnames_short} and $name =~ /^(.+?):(.+)/s) { 119*0Sstevel@tonic-gate my ($script, $cname) = ($1, $2); 120*0Sstevel@tonic-gate my $case = $cname =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL"; 121*0Sstevel@tonic-gate if ($txt =~ m/\t\t\U$script\E (?:$case )?LETTER \U\Q$cname\E$/m) { 122*0Sstevel@tonic-gate @off = ($-[0], $+[0]); 123*0Sstevel@tonic-gate } 124*0Sstevel@tonic-gate } 125*0Sstevel@tonic-gate } 126*0Sstevel@tonic-gate 127*0Sstevel@tonic-gate ## If we still don't have it, check for the name among the loaded 128*0Sstevel@tonic-gate ## scripts. 129*0Sstevel@tonic-gate if (not @off) { 130*0Sstevel@tonic-gate my $case = $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL"; 131*0Sstevel@tonic-gate for my $script (@{$^H{charnames_scripts}}) { 132*0Sstevel@tonic-gate if ($txt =~ m/\t\t$script (?:$case )?LETTER \U\Q$name\E$/m) { 133*0Sstevel@tonic-gate @off = ($-[0], $+[0]); 134*0Sstevel@tonic-gate last; 135*0Sstevel@tonic-gate } 136*0Sstevel@tonic-gate } 137*0Sstevel@tonic-gate } 138*0Sstevel@tonic-gate 139*0Sstevel@tonic-gate ## If we don't have it by now, give up. 140*0Sstevel@tonic-gate unless (@off) { 141*0Sstevel@tonic-gate carp "Unknown charname '$name'"; 142*0Sstevel@tonic-gate return "\x{FFFD}"; 143*0Sstevel@tonic-gate } 144*0Sstevel@tonic-gate 145*0Sstevel@tonic-gate ## 146*0Sstevel@tonic-gate ## Now know where in the string the name starts. 147*0Sstevel@tonic-gate ## The code, in hex, is before that. 148*0Sstevel@tonic-gate ## 149*0Sstevel@tonic-gate ## The code can be 4-6 characters long, so we've got to sort of 150*0Sstevel@tonic-gate ## go look for it, just after the newline that comes before $off[0]. 151*0Sstevel@tonic-gate ## 152*0Sstevel@tonic-gate ## This would be much easier if unicore/Name.pl had info in 153*0Sstevel@tonic-gate ## a name/code order, instead of code/name order. 154*0Sstevel@tonic-gate ## 155*0Sstevel@tonic-gate ## The +1 after the rindex() is to skip past the newline we're finding, 156*0Sstevel@tonic-gate ## or, if the rindex() fails, to put us to an offset of zero. 157*0Sstevel@tonic-gate ## 158*0Sstevel@tonic-gate my $hexstart = rindex($txt, "\n", $off[0]) + 1; 159*0Sstevel@tonic-gate 160*0Sstevel@tonic-gate ## we know where it starts, so turn into number - 161*0Sstevel@tonic-gate ## the ordinal for the char. 162*0Sstevel@tonic-gate $ord = hex substr($txt, $hexstart, $off[0] - $hexstart); 163*0Sstevel@tonic-gate } 164*0Sstevel@tonic-gate 165*0Sstevel@tonic-gate if ($^H & $bytes::hint_bits) { # "use bytes" in effect? 166*0Sstevel@tonic-gate use bytes; 167*0Sstevel@tonic-gate return chr $ord if $ord <= 255; 168*0Sstevel@tonic-gate my $hex = sprintf "%04x", $ord; 169*0Sstevel@tonic-gate if (not defined $fname) { 170*0Sstevel@tonic-gate $fname = substr $txt, $off[0] + 2, $off[1] - $off[0] - 2; 171*0Sstevel@tonic-gate } 172*0Sstevel@tonic-gate croak "Character 0x$hex with name '$fname' is above 0xFF"; 173*0Sstevel@tonic-gate } 174*0Sstevel@tonic-gate 175*0Sstevel@tonic-gate no warnings 'utf8'; # allow even illegal characters 176*0Sstevel@tonic-gate return pack "U", $ord; 177*0Sstevel@tonic-gate} # charnames 178*0Sstevel@tonic-gate 179*0Sstevel@tonic-gatesub import 180*0Sstevel@tonic-gate{ 181*0Sstevel@tonic-gate shift; ## ignore class name 182*0Sstevel@tonic-gate 183*0Sstevel@tonic-gate if (not @_) { 184*0Sstevel@tonic-gate carp("`use charnames' needs explicit imports list"); 185*0Sstevel@tonic-gate } 186*0Sstevel@tonic-gate $^H |= $charnames::hint_bits; 187*0Sstevel@tonic-gate $^H{charnames} = \&charnames ; 188*0Sstevel@tonic-gate 189*0Sstevel@tonic-gate ## 190*0Sstevel@tonic-gate ## fill %h keys with our @_ args. 191*0Sstevel@tonic-gate ## 192*0Sstevel@tonic-gate my ($promote, %h, @args) = (0); 193*0Sstevel@tonic-gate while (@_ and $_ = shift) { 194*0Sstevel@tonic-gate if ($_ eq ":alias") { 195*0Sstevel@tonic-gate @_ or 196*0Sstevel@tonic-gate croak ":alias needs an argument in charnames"; 197*0Sstevel@tonic-gate my $alias = shift; 198*0Sstevel@tonic-gate if (ref $alias) { 199*0Sstevel@tonic-gate ref $alias eq "HASH" or 200*0Sstevel@tonic-gate croak "Only HASH reference supported as argument to :alias"; 201*0Sstevel@tonic-gate alias ($alias); 202*0Sstevel@tonic-gate next; 203*0Sstevel@tonic-gate } 204*0Sstevel@tonic-gate if ($alias =~ m{:(\w+)$}) { 205*0Sstevel@tonic-gate $1 eq "full" || $1 eq "short" and 206*0Sstevel@tonic-gate croak ":alias cannot use existing pragma :$1 (reversed order?)"; 207*0Sstevel@tonic-gate alias_file ($1) and $promote = 1; 208*0Sstevel@tonic-gate next; 209*0Sstevel@tonic-gate } 210*0Sstevel@tonic-gate alias_file ($alias); 211*0Sstevel@tonic-gate next; 212*0Sstevel@tonic-gate } 213*0Sstevel@tonic-gate if (m/^:/ and ! ($_ eq ":full" || $_ eq ":short")) { 214*0Sstevel@tonic-gate warn "unsupported special '$_' in charnames"; 215*0Sstevel@tonic-gate next; 216*0Sstevel@tonic-gate } 217*0Sstevel@tonic-gate push @args, $_; 218*0Sstevel@tonic-gate } 219*0Sstevel@tonic-gate @args == 0 && $promote and @args = (":full"); 220*0Sstevel@tonic-gate @h{@args} = (1) x @args; 221*0Sstevel@tonic-gate 222*0Sstevel@tonic-gate $^H{charnames_full} = delete $h{':full'}; 223*0Sstevel@tonic-gate $^H{charnames_short} = delete $h{':short'}; 224*0Sstevel@tonic-gate $^H{charnames_scripts} = [map uc, keys %h]; 225*0Sstevel@tonic-gate 226*0Sstevel@tonic-gate ## 227*0Sstevel@tonic-gate ## If utf8? warnings are enabled, and some scripts were given, 228*0Sstevel@tonic-gate ## see if at least we can find one letter of each script. 229*0Sstevel@tonic-gate ## 230*0Sstevel@tonic-gate if (warnings::enabled('utf8') && @{$^H{charnames_scripts}}) { 231*0Sstevel@tonic-gate $txt = do "unicore/Name.pl" unless $txt; 232*0Sstevel@tonic-gate 233*0Sstevel@tonic-gate for my $script (@{$^H{charnames_scripts}}) { 234*0Sstevel@tonic-gate if (not $txt =~ m/\t\t$script (?:CAPITAL |SMALL )?LETTER /) { 235*0Sstevel@tonic-gate warnings::warn('utf8', "No such script: '$script'"); 236*0Sstevel@tonic-gate } 237*0Sstevel@tonic-gate } 238*0Sstevel@tonic-gate } 239*0Sstevel@tonic-gate} # import 240*0Sstevel@tonic-gate 241*0Sstevel@tonic-gate# this comes actually from Unicode::UCD, but it avoids the 242*0Sstevel@tonic-gate# overhead of loading it 243*0Sstevel@tonic-gatesub _getcode { 244*0Sstevel@tonic-gate my $arg = shift; 245*0Sstevel@tonic-gate 246*0Sstevel@tonic-gate if ($arg =~ /^[1-9]\d*$/) { 247*0Sstevel@tonic-gate return $arg; 248*0Sstevel@tonic-gate } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) { 249*0Sstevel@tonic-gate return hex($1); 250*0Sstevel@tonic-gate } 251*0Sstevel@tonic-gate 252*0Sstevel@tonic-gate return; 253*0Sstevel@tonic-gate} 254*0Sstevel@tonic-gate 255*0Sstevel@tonic-gatemy %viacode; 256*0Sstevel@tonic-gate 257*0Sstevel@tonic-gatesub viacode 258*0Sstevel@tonic-gate{ 259*0Sstevel@tonic-gate if (@_ != 1) { 260*0Sstevel@tonic-gate carp "charnames::viacode() expects one argument"; 261*0Sstevel@tonic-gate return () 262*0Sstevel@tonic-gate } 263*0Sstevel@tonic-gate 264*0Sstevel@tonic-gate my $arg = shift; 265*0Sstevel@tonic-gate my $code = _getcode($arg); 266*0Sstevel@tonic-gate 267*0Sstevel@tonic-gate my $hex; 268*0Sstevel@tonic-gate 269*0Sstevel@tonic-gate if (defined $code) { 270*0Sstevel@tonic-gate $hex = sprintf "%04X", $arg; 271*0Sstevel@tonic-gate } else { 272*0Sstevel@tonic-gate carp("unexpected arg \"$arg\" to charnames::viacode()"); 273*0Sstevel@tonic-gate return; 274*0Sstevel@tonic-gate } 275*0Sstevel@tonic-gate 276*0Sstevel@tonic-gate if ($code > 0x10FFFF) { 277*0Sstevel@tonic-gate carp sprintf "Unicode characters only allocated up to U+10FFFF (you asked for U+%X)", $hex; 278*0Sstevel@tonic-gate return; 279*0Sstevel@tonic-gate } 280*0Sstevel@tonic-gate 281*0Sstevel@tonic-gate return $viacode{$hex} if exists $viacode{$hex}; 282*0Sstevel@tonic-gate 283*0Sstevel@tonic-gate $txt = do "unicore/Name.pl" unless $txt; 284*0Sstevel@tonic-gate 285*0Sstevel@tonic-gate if ($txt =~ m/^$hex\t\t(.+)/m) { 286*0Sstevel@tonic-gate return $viacode{$hex} = $1; 287*0Sstevel@tonic-gate } else { 288*0Sstevel@tonic-gate return; 289*0Sstevel@tonic-gate } 290*0Sstevel@tonic-gate} # viacode 291*0Sstevel@tonic-gate 292*0Sstevel@tonic-gatemy %vianame; 293*0Sstevel@tonic-gate 294*0Sstevel@tonic-gatesub vianame 295*0Sstevel@tonic-gate{ 296*0Sstevel@tonic-gate if (@_ != 1) { 297*0Sstevel@tonic-gate carp "charnames::vianame() expects one name argument"; 298*0Sstevel@tonic-gate return () 299*0Sstevel@tonic-gate } 300*0Sstevel@tonic-gate 301*0Sstevel@tonic-gate my $arg = shift; 302*0Sstevel@tonic-gate 303*0Sstevel@tonic-gate return chr hex $1 if $arg =~ /^U\+([0-9a-fA-F]+)$/; 304*0Sstevel@tonic-gate 305*0Sstevel@tonic-gate return $vianame{$arg} if exists $vianame{$arg}; 306*0Sstevel@tonic-gate 307*0Sstevel@tonic-gate $txt = do "unicore/Name.pl" unless $txt; 308*0Sstevel@tonic-gate 309*0Sstevel@tonic-gate my $pos = index $txt, "\t\t$arg\n"; 310*0Sstevel@tonic-gate if ($[ <= $pos) { 311*0Sstevel@tonic-gate my $posLF = rindex $txt, "\n", $pos; 312*0Sstevel@tonic-gate (my $code = substr $txt, $posLF + 1, 6) =~ tr/\t//d; 313*0Sstevel@tonic-gate return $vianame{$arg} = hex $code; 314*0Sstevel@tonic-gate 315*0Sstevel@tonic-gate # If $pos is at the 1st line, $posLF must be $[ - 1 (not found); 316*0Sstevel@tonic-gate # then $posLF + 1 equals to $[ (at the beginning of $txt). 317*0Sstevel@tonic-gate # Otherwise $posLF is the position of "\n"; 318*0Sstevel@tonic-gate # then $posLF + 1 must be the position of the next to "\n" 319*0Sstevel@tonic-gate # (the beginning of the line). 320*0Sstevel@tonic-gate # substr($txt, $posLF + 1, 6) may be "0000\t\t", "00A1\t\t", 321*0Sstevel@tonic-gate # "10300\t", "100000", etc. So we can get the code via removing TAB. 322*0Sstevel@tonic-gate } else { 323*0Sstevel@tonic-gate return; 324*0Sstevel@tonic-gate } 325*0Sstevel@tonic-gate} # vianame 326*0Sstevel@tonic-gate 327*0Sstevel@tonic-gate 328*0Sstevel@tonic-gate1; 329*0Sstevel@tonic-gate__END__ 330*0Sstevel@tonic-gate 331*0Sstevel@tonic-gate=head1 NAME 332*0Sstevel@tonic-gate 333*0Sstevel@tonic-gatecharnames - define character names for C<\N{named}> string literal escapes 334*0Sstevel@tonic-gate 335*0Sstevel@tonic-gate=head1 SYNOPSIS 336*0Sstevel@tonic-gate 337*0Sstevel@tonic-gate use charnames ':full'; 338*0Sstevel@tonic-gate print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n"; 339*0Sstevel@tonic-gate 340*0Sstevel@tonic-gate use charnames ':short'; 341*0Sstevel@tonic-gate print "\N{greek:Sigma} is an upper-case sigma.\n"; 342*0Sstevel@tonic-gate 343*0Sstevel@tonic-gate use charnames qw(cyrillic greek); 344*0Sstevel@tonic-gate print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n"; 345*0Sstevel@tonic-gate 346*0Sstevel@tonic-gate use charnames ":full", ":alias" => { 347*0Sstevel@tonic-gate e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE", 348*0Sstevel@tonic-gate }; 349*0Sstevel@tonic-gate print "\N{e_ACUTE} is a small letter e with an acute.\n"; 350*0Sstevel@tonic-gate 351*0Sstevel@tonic-gate use charnames (); 352*0Sstevel@tonic-gate print charnames::viacode(0x1234); # prints "ETHIOPIC SYLLABLE SEE" 353*0Sstevel@tonic-gate printf "%04X", charnames::vianame("GOTHIC LETTER AHSA"); # prints "10330" 354*0Sstevel@tonic-gate 355*0Sstevel@tonic-gate=head1 DESCRIPTION 356*0Sstevel@tonic-gate 357*0Sstevel@tonic-gatePragma C<use charnames> supports arguments C<:full>, C<:short>, script 358*0Sstevel@tonic-gatenames and customized aliases. If C<:full> is present, for expansion of 359*0Sstevel@tonic-gateC<\N{CHARNAME}>, the string C<CHARNAME> is first looked up in the list of 360*0Sstevel@tonic-gatestandard Unicode character names. If C<:short> is present, and 361*0Sstevel@tonic-gateC<CHARNAME> has the form C<SCRIPT:CNAME>, then C<CNAME> is looked up 362*0Sstevel@tonic-gateas a letter in script C<SCRIPT>. If pragma C<use charnames> is used 363*0Sstevel@tonic-gatewith script name arguments, then for C<\N{CHARNAME}> the name 364*0Sstevel@tonic-gateC<CHARNAME> is looked up as a letter in the given scripts (in the 365*0Sstevel@tonic-gatespecified order). Customized aliases are explained in L</CUSTOM ALIASES>. 366*0Sstevel@tonic-gate 367*0Sstevel@tonic-gateFor lookup of C<CHARNAME> inside a given script C<SCRIPTNAME> 368*0Sstevel@tonic-gatethis pragma looks for the names 369*0Sstevel@tonic-gate 370*0Sstevel@tonic-gate SCRIPTNAME CAPITAL LETTER CHARNAME 371*0Sstevel@tonic-gate SCRIPTNAME SMALL LETTER CHARNAME 372*0Sstevel@tonic-gate SCRIPTNAME LETTER CHARNAME 373*0Sstevel@tonic-gate 374*0Sstevel@tonic-gatein the table of standard Unicode names. If C<CHARNAME> is lowercase, 375*0Sstevel@tonic-gatethen the C<CAPITAL> variant is ignored, otherwise the C<SMALL> variant 376*0Sstevel@tonic-gateis ignored. 377*0Sstevel@tonic-gate 378*0Sstevel@tonic-gateNote that C<\N{...}> is compile-time, it's a special form of string 379*0Sstevel@tonic-gateconstant used inside double-quoted strings: in other words, you cannot 380*0Sstevel@tonic-gateuse variables inside the C<\N{...}>. If you want similar run-time 381*0Sstevel@tonic-gatefunctionality, use charnames::vianame(). 382*0Sstevel@tonic-gate 383*0Sstevel@tonic-gateFor the C0 and C1 control characters (U+0000..U+001F, U+0080..U+009F) 384*0Sstevel@tonic-gateas of Unicode 3.1, there are no official Unicode names but you can use 385*0Sstevel@tonic-gateinstead the ISO 6429 names (LINE FEED, ESCAPE, and so forth). In 386*0Sstevel@tonic-gateUnicode 3.2 (as of Perl 5.8) some naming changes take place ISO 6429 387*0Sstevel@tonic-gatehas been updated, see L</ALIASES>. Also note that the U+UU80, U+0081, 388*0Sstevel@tonic-gateU+0084, and U+0099 do not have names even in ISO 6429. 389*0Sstevel@tonic-gate 390*0Sstevel@tonic-gateSince the Unicode standard uses "U+HHHH", so can you: "\N{U+263a}" 391*0Sstevel@tonic-gateis the Unicode smiley face, or "\N{WHITE SMILING FACE}". 392*0Sstevel@tonic-gate 393*0Sstevel@tonic-gate=head1 CUSTOM TRANSLATORS 394*0Sstevel@tonic-gate 395*0Sstevel@tonic-gateThe mechanism of translation of C<\N{...}> escapes is general and not 396*0Sstevel@tonic-gatehardwired into F<charnames.pm>. A module can install custom 397*0Sstevel@tonic-gatetranslations (inside the scope which C<use>s the module) with the 398*0Sstevel@tonic-gatefollowing magic incantation: 399*0Sstevel@tonic-gate 400*0Sstevel@tonic-gate use charnames (); # for $charnames::hint_bits 401*0Sstevel@tonic-gate sub import { 402*0Sstevel@tonic-gate shift; 403*0Sstevel@tonic-gate $^H |= $charnames::hint_bits; 404*0Sstevel@tonic-gate $^H{charnames} = \&translator; 405*0Sstevel@tonic-gate } 406*0Sstevel@tonic-gate 407*0Sstevel@tonic-gateHere translator() is a subroutine which takes C<CHARNAME> as an 408*0Sstevel@tonic-gateargument, and returns text to insert into the string instead of the 409*0Sstevel@tonic-gateC<\N{CHARNAME}> escape. Since the text to insert should be different 410*0Sstevel@tonic-gatein C<bytes> mode and out of it, the function should check the current 411*0Sstevel@tonic-gatestate of C<bytes>-flag as in: 412*0Sstevel@tonic-gate 413*0Sstevel@tonic-gate use bytes (); # for $bytes::hint_bits 414*0Sstevel@tonic-gate sub translator { 415*0Sstevel@tonic-gate if ($^H & $bytes::hint_bits) { 416*0Sstevel@tonic-gate return bytes_translator(@_); 417*0Sstevel@tonic-gate } 418*0Sstevel@tonic-gate else { 419*0Sstevel@tonic-gate return utf8_translator(@_); 420*0Sstevel@tonic-gate } 421*0Sstevel@tonic-gate } 422*0Sstevel@tonic-gate 423*0Sstevel@tonic-gate=head1 CUSTOM ALIASES 424*0Sstevel@tonic-gate 425*0Sstevel@tonic-gateThis version of charnames supports three mechanisms of adding local 426*0Sstevel@tonic-gateor customized aliases to standard Unicode naming conventions (:full) 427*0Sstevel@tonic-gate 428*0Sstevel@tonic-gate=head2 Anonymous hashes 429*0Sstevel@tonic-gate 430*0Sstevel@tonic-gate use charnames ":full", ":alias" => { 431*0Sstevel@tonic-gate e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE", 432*0Sstevel@tonic-gate }; 433*0Sstevel@tonic-gate my $str = "\N{e_ACUTE}"; 434*0Sstevel@tonic-gate 435*0Sstevel@tonic-gate=head2 Alias file 436*0Sstevel@tonic-gate 437*0Sstevel@tonic-gate use charnames ":full", ":alias" => "pro"; 438*0Sstevel@tonic-gate 439*0Sstevel@tonic-gate will try to read "unicore/pro_alias.pl" from the @INC path. This 440*0Sstevel@tonic-gate file should return a list in plain perl: 441*0Sstevel@tonic-gate 442*0Sstevel@tonic-gate ( 443*0Sstevel@tonic-gate A_GRAVE => "LATIN CAPITAL LETTER A WITH GRAVE", 444*0Sstevel@tonic-gate A_CIRCUM => "LATIN CAPITAL LETTER A WITH CIRCUMFLEX", 445*0Sstevel@tonic-gate A_DIAERES => "LATIN CAPITAL LETTER A WITH DIAERESIS", 446*0Sstevel@tonic-gate A_TILDE => "LATIN CAPITAL LETTER A WITH TILDE", 447*0Sstevel@tonic-gate A_BREVE => "LATIN CAPITAL LETTER A WITH BREVE", 448*0Sstevel@tonic-gate A_RING => "LATIN CAPITAL LETTER A WITH RING ABOVE", 449*0Sstevel@tonic-gate A_MACRON => "LATIN CAPITAL LETTER A WITH MACRON", 450*0Sstevel@tonic-gate ); 451*0Sstevel@tonic-gate 452*0Sstevel@tonic-gate=head2 Alias shortcut 453*0Sstevel@tonic-gate 454*0Sstevel@tonic-gate use charnames ":alias" => ":pro"; 455*0Sstevel@tonic-gate 456*0Sstevel@tonic-gate works exactly the same as the alias pairs, only this time, 457*0Sstevel@tonic-gate ":full" is inserted automatically as first argument (if no 458*0Sstevel@tonic-gate other argument is given). 459*0Sstevel@tonic-gate 460*0Sstevel@tonic-gate=head1 charnames::viacode(code) 461*0Sstevel@tonic-gate 462*0Sstevel@tonic-gateReturns the full name of the character indicated by the numeric code. 463*0Sstevel@tonic-gateThe example 464*0Sstevel@tonic-gate 465*0Sstevel@tonic-gate print charnames::viacode(0x2722); 466*0Sstevel@tonic-gate 467*0Sstevel@tonic-gateprints "FOUR TEARDROP-SPOKED ASTERISK". 468*0Sstevel@tonic-gate 469*0Sstevel@tonic-gateReturns undef if no name is known for the code. 470*0Sstevel@tonic-gate 471*0Sstevel@tonic-gateThis works only for the standard names, and does not yet apply 472*0Sstevel@tonic-gateto custom translators. 473*0Sstevel@tonic-gate 474*0Sstevel@tonic-gateNotice that the name returned for of U+FEFF is "ZERO WIDTH NO-BREAK 475*0Sstevel@tonic-gateSPACE", not "BYTE ORDER MARK". 476*0Sstevel@tonic-gate 477*0Sstevel@tonic-gate=head1 charnames::vianame(name) 478*0Sstevel@tonic-gate 479*0Sstevel@tonic-gateReturns the code point indicated by the name. 480*0Sstevel@tonic-gateThe example 481*0Sstevel@tonic-gate 482*0Sstevel@tonic-gate printf "%04X", charnames::vianame("FOUR TEARDROP-SPOKED ASTERISK"); 483*0Sstevel@tonic-gate 484*0Sstevel@tonic-gateprints "2722". 485*0Sstevel@tonic-gate 486*0Sstevel@tonic-gateReturns undef if the name is unknown. 487*0Sstevel@tonic-gate 488*0Sstevel@tonic-gateThis works only for the standard names, and does not yet apply 489*0Sstevel@tonic-gateto custom translators. 490*0Sstevel@tonic-gate 491*0Sstevel@tonic-gate=head1 ALIASES 492*0Sstevel@tonic-gate 493*0Sstevel@tonic-gateA few aliases have been defined for convenience: instead of having 494*0Sstevel@tonic-gateto use the official names 495*0Sstevel@tonic-gate 496*0Sstevel@tonic-gate LINE FEED (LF) 497*0Sstevel@tonic-gate FORM FEED (FF) 498*0Sstevel@tonic-gate CARRIAGE RETURN (CR) 499*0Sstevel@tonic-gate NEXT LINE (NEL) 500*0Sstevel@tonic-gate 501*0Sstevel@tonic-gate(yes, with parentheses) one can use 502*0Sstevel@tonic-gate 503*0Sstevel@tonic-gate LINE FEED 504*0Sstevel@tonic-gate FORM FEED 505*0Sstevel@tonic-gate CARRIAGE RETURN 506*0Sstevel@tonic-gate NEXT LINE 507*0Sstevel@tonic-gate LF 508*0Sstevel@tonic-gate FF 509*0Sstevel@tonic-gate CR 510*0Sstevel@tonic-gate NEL 511*0Sstevel@tonic-gate 512*0Sstevel@tonic-gateOne can also use 513*0Sstevel@tonic-gate 514*0Sstevel@tonic-gate BYTE ORDER MARK 515*0Sstevel@tonic-gate BOM 516*0Sstevel@tonic-gate 517*0Sstevel@tonic-gateand 518*0Sstevel@tonic-gate 519*0Sstevel@tonic-gate ZWNJ 520*0Sstevel@tonic-gate ZWJ 521*0Sstevel@tonic-gate 522*0Sstevel@tonic-gatefor ZERO WIDTH NON-JOINER and ZERO WIDTH JOINER. 523*0Sstevel@tonic-gate 524*0Sstevel@tonic-gateFor backward compatibility one can use the old names for 525*0Sstevel@tonic-gatecertain C0 and C1 controls 526*0Sstevel@tonic-gate 527*0Sstevel@tonic-gate old new 528*0Sstevel@tonic-gate 529*0Sstevel@tonic-gate HORIZONTAL TABULATION CHARACTER TABULATION 530*0Sstevel@tonic-gate VERTICAL TABULATION LINE TABULATION 531*0Sstevel@tonic-gate FILE SEPARATOR INFORMATION SEPARATOR FOUR 532*0Sstevel@tonic-gate GROUP SEPARATOR INFORMATION SEPARATOR THREE 533*0Sstevel@tonic-gate RECORD SEPARATOR INFORMATION SEPARATOR TWO 534*0Sstevel@tonic-gate UNIT SEPARATOR INFORMATION SEPARATOR ONE 535*0Sstevel@tonic-gate PARTIAL LINE DOWN PARTIAL LINE FORWARD 536*0Sstevel@tonic-gate PARTIAL LINE UP PARTIAL LINE BACKWARD 537*0Sstevel@tonic-gate 538*0Sstevel@tonic-gatebut the old names in addition to giving the character 539*0Sstevel@tonic-gatewill also give a warning about being deprecated. 540*0Sstevel@tonic-gate 541*0Sstevel@tonic-gate=head1 ILLEGAL CHARACTERS 542*0Sstevel@tonic-gate 543*0Sstevel@tonic-gateIf you ask by name for a character that does not exist, a warning is 544*0Sstevel@tonic-gategiven and the Unicode I<replacement character> "\x{FFFD}" is returned. 545*0Sstevel@tonic-gate 546*0Sstevel@tonic-gateIf you ask by code for a character that does not exist, no warning is 547*0Sstevel@tonic-gategiven and C<undef> is returned. (Though if you ask for a code point 548*0Sstevel@tonic-gatepast U+10FFFF you do get a warning.) 549*0Sstevel@tonic-gate 550*0Sstevel@tonic-gate=head1 BUGS 551*0Sstevel@tonic-gate 552*0Sstevel@tonic-gateSince evaluation of the translation function happens in a middle of 553*0Sstevel@tonic-gatecompilation (of a string literal), the translation function should not 554*0Sstevel@tonic-gatedo any C<eval>s or C<require>s. This restriction should be lifted in 555*0Sstevel@tonic-gatea future version of Perl. 556*0Sstevel@tonic-gate 557*0Sstevel@tonic-gate=cut 558