xref: /onnv-gate/usr/src/cmd/perl/5.8.4/distrib/lib/charnames.pm (revision 0:68f95e015346)
1*0Sstevel@tonic-gatepackage charnames;
2*0Sstevel@tonic-gateuse strict;
3*0Sstevel@tonic-gateuse warnings;
4*0Sstevel@tonic-gateuse Carp;
5*0Sstevel@tonic-gateuse File::Spec;
6*0Sstevel@tonic-gateour $VERSION = '1.03';
7*0Sstevel@tonic-gate
8*0Sstevel@tonic-gateuse bytes ();		# for $bytes::hint_bits
9*0Sstevel@tonic-gate$charnames::hint_bits = 0x20000; # HINT_LOCALIZE_HH
10*0Sstevel@tonic-gate
11*0Sstevel@tonic-gatemy %alias1 = (
12*0Sstevel@tonic-gate		# Icky 3.2 names with parentheses.
13*0Sstevel@tonic-gate		'LINE FEED'		=> 'LINE FEED (LF)',
14*0Sstevel@tonic-gate		'FORM FEED'		=> 'FORM FEED (FF)',
15*0Sstevel@tonic-gate		'CARRIAGE RETURN'	=> 'CARRIAGE RETURN (CR)',
16*0Sstevel@tonic-gate		'NEXT LINE'		=> 'NEXT LINE (NEL)',
17*0Sstevel@tonic-gate		# Convenience.
18*0Sstevel@tonic-gate		'LF'			=> 'LINE FEED (LF)',
19*0Sstevel@tonic-gate		'FF'			=> 'FORM FEED (FF)',
20*0Sstevel@tonic-gate		'CR'			=> 'CARRIAGE RETURN (CR)',
21*0Sstevel@tonic-gate		'NEL'			=> 'NEXT LINE (NEL)',
22*0Sstevel@tonic-gate	        # More convenience.  For futher convencience,
23*0Sstevel@tonic-gate	        # it is suggested some way using using the NamesList
24*0Sstevel@tonic-gate		# aliases is implemented.
25*0Sstevel@tonic-gate	        'ZWNJ'			=> 'ZERO WIDTH NON-JOINER',
26*0Sstevel@tonic-gate	        'ZWJ'			=> 'ZERO WIDTH JOINER',
27*0Sstevel@tonic-gate		'BOM'			=> 'BYTE ORDER MARK',
28*0Sstevel@tonic-gate	    );
29*0Sstevel@tonic-gate
30*0Sstevel@tonic-gatemy %alias2 = (
31*0Sstevel@tonic-gate		# Pre-3.2 compatibility (only for the first 256 characters).
32*0Sstevel@tonic-gate		'HORIZONTAL TABULATION'	=> 'CHARACTER TABULATION',
33*0Sstevel@tonic-gate		'VERTICAL TABULATION'	=> 'LINE TABULATION',
34*0Sstevel@tonic-gate		'FILE SEPARATOR'	=> 'INFORMATION SEPARATOR FOUR',
35*0Sstevel@tonic-gate		'GROUP SEPARATOR'	=> 'INFORMATION SEPARATOR THREE',
36*0Sstevel@tonic-gate		'RECORD SEPARATOR'	=> 'INFORMATION SEPARATOR TWO',
37*0Sstevel@tonic-gate		'UNIT SEPARATOR'	=> 'INFORMATION SEPARATOR ONE',
38*0Sstevel@tonic-gate		'PARTIAL LINE DOWN'	=> 'PARTIAL LINE FORWARD',
39*0Sstevel@tonic-gate		'PARTIAL LINE UP'	=> 'PARTIAL LINE BACKWARD',
40*0Sstevel@tonic-gate	    );
41*0Sstevel@tonic-gate
42*0Sstevel@tonic-gatemy %alias3 = (
43*0Sstevel@tonic-gate		# User defined aliasses. Even more convenient :)
44*0Sstevel@tonic-gate	    );
45*0Sstevel@tonic-gatemy $txt;
46*0Sstevel@tonic-gate
47*0Sstevel@tonic-gatesub alias (@)
48*0Sstevel@tonic-gate{
49*0Sstevel@tonic-gate  @_ or return %alias3;
50*0Sstevel@tonic-gate  my $alias = ref $_[0] ? $_[0] : { @_ };
51*0Sstevel@tonic-gate  @alias3{keys %$alias} = values %$alias;
52*0Sstevel@tonic-gate} # alias
53*0Sstevel@tonic-gate
54*0Sstevel@tonic-gatesub alias_file ($)
55*0Sstevel@tonic-gate{
56*0Sstevel@tonic-gate  my ($arg, $file) = @_;
57*0Sstevel@tonic-gate  if (-f $arg && File::Spec->file_name_is_absolute ($arg)) {
58*0Sstevel@tonic-gate    $file = $arg;
59*0Sstevel@tonic-gate  }
60*0Sstevel@tonic-gate  elsif ($arg =~ m/^\w+$/) {
61*0Sstevel@tonic-gate    $file = "unicore/${arg}_alias.pl";
62*0Sstevel@tonic-gate  }
63*0Sstevel@tonic-gate  else {
64*0Sstevel@tonic-gate    croak "Charnames alias files can only have identifier characters";
65*0Sstevel@tonic-gate  }
66*0Sstevel@tonic-gate  if (my @alias = do $file) {
67*0Sstevel@tonic-gate    @alias == 1 && !defined $alias[0] and
68*0Sstevel@tonic-gate      croak "$file cannot be used as alias file for charnames";
69*0Sstevel@tonic-gate    @alias % 2 and
70*0Sstevel@tonic-gate      croak "$file did not return a (valid) list of alias pairs";
71*0Sstevel@tonic-gate    alias (@alias);
72*0Sstevel@tonic-gate    return (1);
73*0Sstevel@tonic-gate  }
74*0Sstevel@tonic-gate  0;
75*0Sstevel@tonic-gate} # alias_file
76*0Sstevel@tonic-gate
77*0Sstevel@tonic-gate# This is not optimized in any way yet
78*0Sstevel@tonic-gatesub charnames
79*0Sstevel@tonic-gate{
80*0Sstevel@tonic-gate  my $name = shift;
81*0Sstevel@tonic-gate
82*0Sstevel@tonic-gate  if (exists $alias1{$name}) {
83*0Sstevel@tonic-gate    $name = $alias1{$name};
84*0Sstevel@tonic-gate  }
85*0Sstevel@tonic-gate  elsif (exists $alias2{$name}) {
86*0Sstevel@tonic-gate    require warnings;
87*0Sstevel@tonic-gate    warnings::warnif('deprecated', qq{Unicode character name "$name" is deprecated, use "$alias2{$name}" instead});
88*0Sstevel@tonic-gate    $name = $alias2{$name};
89*0Sstevel@tonic-gate  }
90*0Sstevel@tonic-gate  elsif (exists $alias3{$name}) {
91*0Sstevel@tonic-gate    $name = $alias3{$name};
92*0Sstevel@tonic-gate  }
93*0Sstevel@tonic-gate
94*0Sstevel@tonic-gate  my $ord;
95*0Sstevel@tonic-gate  my @off;
96*0Sstevel@tonic-gate  my $fname;
97*0Sstevel@tonic-gate
98*0Sstevel@tonic-gate  if ($name eq "BYTE ORDER MARK") {
99*0Sstevel@tonic-gate    $fname = $name;
100*0Sstevel@tonic-gate    $ord = 0xFEFF;
101*0Sstevel@tonic-gate  } else {
102*0Sstevel@tonic-gate    ## Suck in the code/name list as a big string.
103*0Sstevel@tonic-gate    ## Lines look like:
104*0Sstevel@tonic-gate    ##     "0052\t\tLATIN CAPITAL LETTER R\n"
105*0Sstevel@tonic-gate    $txt = do "unicore/Name.pl" unless $txt;
106*0Sstevel@tonic-gate
107*0Sstevel@tonic-gate    ## @off will hold the index into the code/name string of the start and
108*0Sstevel@tonic-gate    ## end of the name as we find it.
109*0Sstevel@tonic-gate
110*0Sstevel@tonic-gate    ## If :full, look for the name exactly
111*0Sstevel@tonic-gate    if ($^H{charnames_full} and $txt =~ /\t\t\Q$name\E$/m) {
112*0Sstevel@tonic-gate      @off = ($-[0], $+[0]);
113*0Sstevel@tonic-gate    }
114*0Sstevel@tonic-gate
115*0Sstevel@tonic-gate    ## If we didn't get above, and :short allowed, look for the short name.
116*0Sstevel@tonic-gate    ## The short name is like "greek:Sigma"
117*0Sstevel@tonic-gate    unless (@off) {
118*0Sstevel@tonic-gate      if ($^H{charnames_short} and $name =~ /^(.+?):(.+)/s) {
119*0Sstevel@tonic-gate	my ($script, $cname) = ($1, $2);
120*0Sstevel@tonic-gate	my $case = $cname =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL";
121*0Sstevel@tonic-gate	if ($txt =~ m/\t\t\U$script\E (?:$case )?LETTER \U\Q$cname\E$/m) {
122*0Sstevel@tonic-gate	  @off = ($-[0], $+[0]);
123*0Sstevel@tonic-gate	}
124*0Sstevel@tonic-gate      }
125*0Sstevel@tonic-gate    }
126*0Sstevel@tonic-gate
127*0Sstevel@tonic-gate    ## If we still don't have it, check for the name among the loaded
128*0Sstevel@tonic-gate    ## scripts.
129*0Sstevel@tonic-gate    if (not @off) {
130*0Sstevel@tonic-gate      my $case = $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL";
131*0Sstevel@tonic-gate      for my $script (@{$^H{charnames_scripts}}) {
132*0Sstevel@tonic-gate	if ($txt =~ m/\t\t$script (?:$case )?LETTER \U\Q$name\E$/m) {
133*0Sstevel@tonic-gate	  @off = ($-[0], $+[0]);
134*0Sstevel@tonic-gate	  last;
135*0Sstevel@tonic-gate	}
136*0Sstevel@tonic-gate      }
137*0Sstevel@tonic-gate    }
138*0Sstevel@tonic-gate
139*0Sstevel@tonic-gate    ## If we don't have it by now, give up.
140*0Sstevel@tonic-gate    unless (@off) {
141*0Sstevel@tonic-gate      carp "Unknown charname '$name'";
142*0Sstevel@tonic-gate      return "\x{FFFD}";
143*0Sstevel@tonic-gate    }
144*0Sstevel@tonic-gate
145*0Sstevel@tonic-gate    ##
146*0Sstevel@tonic-gate    ## Now know where in the string the name starts.
147*0Sstevel@tonic-gate    ## The code, in hex, is before that.
148*0Sstevel@tonic-gate    ##
149*0Sstevel@tonic-gate    ## The code can be 4-6 characters long, so we've got to sort of
150*0Sstevel@tonic-gate    ## go look for it, just after the newline that comes before $off[0].
151*0Sstevel@tonic-gate    ##
152*0Sstevel@tonic-gate    ## This would be much easier if unicore/Name.pl had info in
153*0Sstevel@tonic-gate    ## a name/code order, instead of code/name order.
154*0Sstevel@tonic-gate    ##
155*0Sstevel@tonic-gate    ## The +1 after the rindex() is to skip past the newline we're finding,
156*0Sstevel@tonic-gate    ## or, if the rindex() fails, to put us to an offset of zero.
157*0Sstevel@tonic-gate    ##
158*0Sstevel@tonic-gate    my $hexstart = rindex($txt, "\n", $off[0]) + 1;
159*0Sstevel@tonic-gate
160*0Sstevel@tonic-gate    ## we know where it starts, so turn into number -
161*0Sstevel@tonic-gate    ## the ordinal for the char.
162*0Sstevel@tonic-gate    $ord = hex substr($txt, $hexstart, $off[0] - $hexstart);
163*0Sstevel@tonic-gate  }
164*0Sstevel@tonic-gate
165*0Sstevel@tonic-gate  if ($^H & $bytes::hint_bits) {	# "use bytes" in effect?
166*0Sstevel@tonic-gate    use bytes;
167*0Sstevel@tonic-gate    return chr $ord if $ord <= 255;
168*0Sstevel@tonic-gate    my $hex = sprintf "%04x", $ord;
169*0Sstevel@tonic-gate    if (not defined $fname) {
170*0Sstevel@tonic-gate      $fname = substr $txt, $off[0] + 2, $off[1] - $off[0] - 2;
171*0Sstevel@tonic-gate    }
172*0Sstevel@tonic-gate    croak "Character 0x$hex with name '$fname' is above 0xFF";
173*0Sstevel@tonic-gate  }
174*0Sstevel@tonic-gate
175*0Sstevel@tonic-gate  no warnings 'utf8'; # allow even illegal characters
176*0Sstevel@tonic-gate  return pack "U", $ord;
177*0Sstevel@tonic-gate} # charnames
178*0Sstevel@tonic-gate
179*0Sstevel@tonic-gatesub import
180*0Sstevel@tonic-gate{
181*0Sstevel@tonic-gate  shift; ## ignore class name
182*0Sstevel@tonic-gate
183*0Sstevel@tonic-gate  if (not @_) {
184*0Sstevel@tonic-gate    carp("`use charnames' needs explicit imports list");
185*0Sstevel@tonic-gate  }
186*0Sstevel@tonic-gate  $^H |= $charnames::hint_bits;
187*0Sstevel@tonic-gate  $^H{charnames} = \&charnames ;
188*0Sstevel@tonic-gate
189*0Sstevel@tonic-gate  ##
190*0Sstevel@tonic-gate  ## fill %h keys with our @_ args.
191*0Sstevel@tonic-gate  ##
192*0Sstevel@tonic-gate  my ($promote, %h, @args) = (0);
193*0Sstevel@tonic-gate  while (@_ and $_ = shift) {
194*0Sstevel@tonic-gate    if ($_ eq ":alias") {
195*0Sstevel@tonic-gate      @_ or
196*0Sstevel@tonic-gate	croak ":alias needs an argument in charnames";
197*0Sstevel@tonic-gate      my $alias = shift;
198*0Sstevel@tonic-gate      if (ref $alias) {
199*0Sstevel@tonic-gate	ref $alias eq "HASH" or
200*0Sstevel@tonic-gate	  croak "Only HASH reference supported as argument to :alias";
201*0Sstevel@tonic-gate	alias ($alias);
202*0Sstevel@tonic-gate	next;
203*0Sstevel@tonic-gate      }
204*0Sstevel@tonic-gate      if ($alias =~ m{:(\w+)$}) {
205*0Sstevel@tonic-gate	$1 eq "full" || $1 eq "short" and
206*0Sstevel@tonic-gate	  croak ":alias cannot use existing pragma :$1 (reversed order?)";
207*0Sstevel@tonic-gate	alias_file ($1) and $promote = 1;
208*0Sstevel@tonic-gate	next;
209*0Sstevel@tonic-gate      }
210*0Sstevel@tonic-gate      alias_file ($alias);
211*0Sstevel@tonic-gate      next;
212*0Sstevel@tonic-gate    }
213*0Sstevel@tonic-gate    if (m/^:/ and ! ($_ eq ":full" || $_ eq ":short")) {
214*0Sstevel@tonic-gate      warn "unsupported special '$_' in charnames";
215*0Sstevel@tonic-gate      next;
216*0Sstevel@tonic-gate    }
217*0Sstevel@tonic-gate    push @args, $_;
218*0Sstevel@tonic-gate  }
219*0Sstevel@tonic-gate  @args == 0 && $promote and @args = (":full");
220*0Sstevel@tonic-gate  @h{@args} = (1) x @args;
221*0Sstevel@tonic-gate
222*0Sstevel@tonic-gate  $^H{charnames_full} = delete $h{':full'};
223*0Sstevel@tonic-gate  $^H{charnames_short} = delete $h{':short'};
224*0Sstevel@tonic-gate  $^H{charnames_scripts} = [map uc, keys %h];
225*0Sstevel@tonic-gate
226*0Sstevel@tonic-gate  ##
227*0Sstevel@tonic-gate  ## If utf8? warnings are enabled, and some scripts were given,
228*0Sstevel@tonic-gate  ## see if at least we can find one letter of each script.
229*0Sstevel@tonic-gate  ##
230*0Sstevel@tonic-gate  if (warnings::enabled('utf8') && @{$^H{charnames_scripts}}) {
231*0Sstevel@tonic-gate    $txt = do "unicore/Name.pl" unless $txt;
232*0Sstevel@tonic-gate
233*0Sstevel@tonic-gate    for my $script (@{$^H{charnames_scripts}}) {
234*0Sstevel@tonic-gate      if (not $txt =~ m/\t\t$script (?:CAPITAL |SMALL )?LETTER /) {
235*0Sstevel@tonic-gate	warnings::warn('utf8',  "No such script: '$script'");
236*0Sstevel@tonic-gate      }
237*0Sstevel@tonic-gate    }
238*0Sstevel@tonic-gate  }
239*0Sstevel@tonic-gate} # import
240*0Sstevel@tonic-gate
241*0Sstevel@tonic-gate# this comes actually from Unicode::UCD, but it avoids the
242*0Sstevel@tonic-gate# overhead of loading it
243*0Sstevel@tonic-gatesub _getcode {
244*0Sstevel@tonic-gate    my $arg = shift;
245*0Sstevel@tonic-gate
246*0Sstevel@tonic-gate    if ($arg =~ /^[1-9]\d*$/) {
247*0Sstevel@tonic-gate	return $arg;
248*0Sstevel@tonic-gate    } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) {
249*0Sstevel@tonic-gate	return hex($1);
250*0Sstevel@tonic-gate    }
251*0Sstevel@tonic-gate
252*0Sstevel@tonic-gate    return;
253*0Sstevel@tonic-gate}
254*0Sstevel@tonic-gate
255*0Sstevel@tonic-gatemy %viacode;
256*0Sstevel@tonic-gate
257*0Sstevel@tonic-gatesub viacode
258*0Sstevel@tonic-gate{
259*0Sstevel@tonic-gate  if (@_ != 1) {
260*0Sstevel@tonic-gate    carp "charnames::viacode() expects one argument";
261*0Sstevel@tonic-gate    return ()
262*0Sstevel@tonic-gate  }
263*0Sstevel@tonic-gate
264*0Sstevel@tonic-gate  my $arg = shift;
265*0Sstevel@tonic-gate  my $code = _getcode($arg);
266*0Sstevel@tonic-gate
267*0Sstevel@tonic-gate  my $hex;
268*0Sstevel@tonic-gate
269*0Sstevel@tonic-gate  if (defined $code) {
270*0Sstevel@tonic-gate    $hex = sprintf "%04X", $arg;
271*0Sstevel@tonic-gate  } else {
272*0Sstevel@tonic-gate    carp("unexpected arg \"$arg\" to charnames::viacode()");
273*0Sstevel@tonic-gate    return;
274*0Sstevel@tonic-gate  }
275*0Sstevel@tonic-gate
276*0Sstevel@tonic-gate  if ($code > 0x10FFFF) {
277*0Sstevel@tonic-gate    carp sprintf "Unicode characters only allocated up to U+10FFFF (you asked for U+%X)", $hex;
278*0Sstevel@tonic-gate    return;
279*0Sstevel@tonic-gate  }
280*0Sstevel@tonic-gate
281*0Sstevel@tonic-gate  return $viacode{$hex} if exists $viacode{$hex};
282*0Sstevel@tonic-gate
283*0Sstevel@tonic-gate  $txt = do "unicore/Name.pl" unless $txt;
284*0Sstevel@tonic-gate
285*0Sstevel@tonic-gate  if ($txt =~ m/^$hex\t\t(.+)/m) {
286*0Sstevel@tonic-gate    return $viacode{$hex} = $1;
287*0Sstevel@tonic-gate  } else {
288*0Sstevel@tonic-gate    return;
289*0Sstevel@tonic-gate  }
290*0Sstevel@tonic-gate} # viacode
291*0Sstevel@tonic-gate
292*0Sstevel@tonic-gatemy %vianame;
293*0Sstevel@tonic-gate
294*0Sstevel@tonic-gatesub vianame
295*0Sstevel@tonic-gate{
296*0Sstevel@tonic-gate  if (@_ != 1) {
297*0Sstevel@tonic-gate    carp "charnames::vianame() expects one name argument";
298*0Sstevel@tonic-gate    return ()
299*0Sstevel@tonic-gate  }
300*0Sstevel@tonic-gate
301*0Sstevel@tonic-gate  my $arg = shift;
302*0Sstevel@tonic-gate
303*0Sstevel@tonic-gate  return chr hex $1 if $arg =~ /^U\+([0-9a-fA-F]+)$/;
304*0Sstevel@tonic-gate
305*0Sstevel@tonic-gate  return $vianame{$arg} if exists $vianame{$arg};
306*0Sstevel@tonic-gate
307*0Sstevel@tonic-gate  $txt = do "unicore/Name.pl" unless $txt;
308*0Sstevel@tonic-gate
309*0Sstevel@tonic-gate  my $pos = index $txt, "\t\t$arg\n";
310*0Sstevel@tonic-gate  if ($[ <= $pos) {
311*0Sstevel@tonic-gate    my $posLF = rindex $txt, "\n", $pos;
312*0Sstevel@tonic-gate    (my $code = substr $txt, $posLF + 1, 6) =~ tr/\t//d;
313*0Sstevel@tonic-gate    return $vianame{$arg} = hex $code;
314*0Sstevel@tonic-gate
315*0Sstevel@tonic-gate    # If $pos is at the 1st line, $posLF must be $[ - 1 (not found);
316*0Sstevel@tonic-gate    # then $posLF + 1 equals to $[ (at the beginning of $txt).
317*0Sstevel@tonic-gate    # Otherwise $posLF is the position of "\n";
318*0Sstevel@tonic-gate    # then $posLF + 1 must be the position of the next to "\n"
319*0Sstevel@tonic-gate    # (the beginning of the line).
320*0Sstevel@tonic-gate    # substr($txt, $posLF + 1, 6) may be "0000\t\t", "00A1\t\t",
321*0Sstevel@tonic-gate    # "10300\t", "100000", etc. So we can get the code via removing TAB.
322*0Sstevel@tonic-gate  } else {
323*0Sstevel@tonic-gate    return;
324*0Sstevel@tonic-gate  }
325*0Sstevel@tonic-gate} # vianame
326*0Sstevel@tonic-gate
327*0Sstevel@tonic-gate
328*0Sstevel@tonic-gate1;
329*0Sstevel@tonic-gate__END__
330*0Sstevel@tonic-gate
331*0Sstevel@tonic-gate=head1 NAME
332*0Sstevel@tonic-gate
333*0Sstevel@tonic-gatecharnames - define character names for C<\N{named}> string literal escapes
334*0Sstevel@tonic-gate
335*0Sstevel@tonic-gate=head1 SYNOPSIS
336*0Sstevel@tonic-gate
337*0Sstevel@tonic-gate  use charnames ':full';
338*0Sstevel@tonic-gate  print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n";
339*0Sstevel@tonic-gate
340*0Sstevel@tonic-gate  use charnames ':short';
341*0Sstevel@tonic-gate  print "\N{greek:Sigma} is an upper-case sigma.\n";
342*0Sstevel@tonic-gate
343*0Sstevel@tonic-gate  use charnames qw(cyrillic greek);
344*0Sstevel@tonic-gate  print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n";
345*0Sstevel@tonic-gate
346*0Sstevel@tonic-gate  use charnames ":full", ":alias" => {
347*0Sstevel@tonic-gate    e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE",
348*0Sstevel@tonic-gate  };
349*0Sstevel@tonic-gate  print "\N{e_ACUTE} is a small letter e with an acute.\n";
350*0Sstevel@tonic-gate
351*0Sstevel@tonic-gate  use charnames ();
352*0Sstevel@tonic-gate  print charnames::viacode(0x1234); # prints "ETHIOPIC SYLLABLE SEE"
353*0Sstevel@tonic-gate  printf "%04X", charnames::vianame("GOTHIC LETTER AHSA"); # prints "10330"
354*0Sstevel@tonic-gate
355*0Sstevel@tonic-gate=head1 DESCRIPTION
356*0Sstevel@tonic-gate
357*0Sstevel@tonic-gatePragma C<use charnames> supports arguments C<:full>, C<:short>, script
358*0Sstevel@tonic-gatenames and customized aliases.  If C<:full> is present, for expansion of
359*0Sstevel@tonic-gateC<\N{CHARNAME}>, the string C<CHARNAME> is first looked up in the list of
360*0Sstevel@tonic-gatestandard Unicode character names.  If C<:short> is present, and
361*0Sstevel@tonic-gateC<CHARNAME> has the form C<SCRIPT:CNAME>, then C<CNAME> is looked up
362*0Sstevel@tonic-gateas a letter in script C<SCRIPT>.  If pragma C<use charnames> is used
363*0Sstevel@tonic-gatewith script name arguments, then for C<\N{CHARNAME}> the name
364*0Sstevel@tonic-gateC<CHARNAME> is looked up as a letter in the given scripts (in the
365*0Sstevel@tonic-gatespecified order). Customized aliases are explained in L</CUSTOM ALIASES>.
366*0Sstevel@tonic-gate
367*0Sstevel@tonic-gateFor lookup of C<CHARNAME> inside a given script C<SCRIPTNAME>
368*0Sstevel@tonic-gatethis pragma looks for the names
369*0Sstevel@tonic-gate
370*0Sstevel@tonic-gate  SCRIPTNAME CAPITAL LETTER CHARNAME
371*0Sstevel@tonic-gate  SCRIPTNAME SMALL LETTER CHARNAME
372*0Sstevel@tonic-gate  SCRIPTNAME LETTER CHARNAME
373*0Sstevel@tonic-gate
374*0Sstevel@tonic-gatein the table of standard Unicode names.  If C<CHARNAME> is lowercase,
375*0Sstevel@tonic-gatethen the C<CAPITAL> variant is ignored, otherwise the C<SMALL> variant
376*0Sstevel@tonic-gateis ignored.
377*0Sstevel@tonic-gate
378*0Sstevel@tonic-gateNote that C<\N{...}> is compile-time, it's a special form of string
379*0Sstevel@tonic-gateconstant used inside double-quoted strings: in other words, you cannot
380*0Sstevel@tonic-gateuse variables inside the C<\N{...}>.  If you want similar run-time
381*0Sstevel@tonic-gatefunctionality, use charnames::vianame().
382*0Sstevel@tonic-gate
383*0Sstevel@tonic-gateFor the C0 and C1 control characters (U+0000..U+001F, U+0080..U+009F)
384*0Sstevel@tonic-gateas of Unicode 3.1, there are no official Unicode names but you can use
385*0Sstevel@tonic-gateinstead the ISO 6429 names (LINE FEED, ESCAPE, and so forth).  In
386*0Sstevel@tonic-gateUnicode 3.2 (as of Perl 5.8) some naming changes take place ISO 6429
387*0Sstevel@tonic-gatehas been updated, see L</ALIASES>.  Also note that the U+UU80, U+0081,
388*0Sstevel@tonic-gateU+0084, and U+0099 do not have names even in ISO 6429.
389*0Sstevel@tonic-gate
390*0Sstevel@tonic-gateSince the Unicode standard uses "U+HHHH", so can you: "\N{U+263a}"
391*0Sstevel@tonic-gateis the Unicode smiley face, or "\N{WHITE SMILING FACE}".
392*0Sstevel@tonic-gate
393*0Sstevel@tonic-gate=head1 CUSTOM TRANSLATORS
394*0Sstevel@tonic-gate
395*0Sstevel@tonic-gateThe mechanism of translation of C<\N{...}> escapes is general and not
396*0Sstevel@tonic-gatehardwired into F<charnames.pm>.  A module can install custom
397*0Sstevel@tonic-gatetranslations (inside the scope which C<use>s the module) with the
398*0Sstevel@tonic-gatefollowing magic incantation:
399*0Sstevel@tonic-gate
400*0Sstevel@tonic-gate    use charnames ();		# for $charnames::hint_bits
401*0Sstevel@tonic-gate    sub import {
402*0Sstevel@tonic-gate	shift;
403*0Sstevel@tonic-gate	$^H |= $charnames::hint_bits;
404*0Sstevel@tonic-gate	$^H{charnames} = \&translator;
405*0Sstevel@tonic-gate    }
406*0Sstevel@tonic-gate
407*0Sstevel@tonic-gateHere translator() is a subroutine which takes C<CHARNAME> as an
408*0Sstevel@tonic-gateargument, and returns text to insert into the string instead of the
409*0Sstevel@tonic-gateC<\N{CHARNAME}> escape.  Since the text to insert should be different
410*0Sstevel@tonic-gatein C<bytes> mode and out of it, the function should check the current
411*0Sstevel@tonic-gatestate of C<bytes>-flag as in:
412*0Sstevel@tonic-gate
413*0Sstevel@tonic-gate    use bytes ();			# for $bytes::hint_bits
414*0Sstevel@tonic-gate    sub translator {
415*0Sstevel@tonic-gate	if ($^H & $bytes::hint_bits) {
416*0Sstevel@tonic-gate	    return bytes_translator(@_);
417*0Sstevel@tonic-gate	}
418*0Sstevel@tonic-gate	else {
419*0Sstevel@tonic-gate	    return utf8_translator(@_);
420*0Sstevel@tonic-gate	}
421*0Sstevel@tonic-gate    }
422*0Sstevel@tonic-gate
423*0Sstevel@tonic-gate=head1 CUSTOM ALIASES
424*0Sstevel@tonic-gate
425*0Sstevel@tonic-gateThis version of charnames supports three mechanisms of adding local
426*0Sstevel@tonic-gateor customized aliases to standard Unicode naming conventions (:full)
427*0Sstevel@tonic-gate
428*0Sstevel@tonic-gate=head2 Anonymous hashes
429*0Sstevel@tonic-gate
430*0Sstevel@tonic-gate    use charnames ":full", ":alias" => {
431*0Sstevel@tonic-gate        e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE",
432*0Sstevel@tonic-gate        };
433*0Sstevel@tonic-gate    my $str = "\N{e_ACUTE}";
434*0Sstevel@tonic-gate
435*0Sstevel@tonic-gate=head2 Alias file
436*0Sstevel@tonic-gate
437*0Sstevel@tonic-gate    use charnames ":full", ":alias" => "pro";
438*0Sstevel@tonic-gate
439*0Sstevel@tonic-gate    will try to read "unicore/pro_alias.pl" from the @INC path. This
440*0Sstevel@tonic-gate    file should return a list in plain perl:
441*0Sstevel@tonic-gate
442*0Sstevel@tonic-gate    (
443*0Sstevel@tonic-gate    A_GRAVE         => "LATIN CAPITAL LETTER A WITH GRAVE",
444*0Sstevel@tonic-gate    A_CIRCUM        => "LATIN CAPITAL LETTER A WITH CIRCUMFLEX",
445*0Sstevel@tonic-gate    A_DIAERES       => "LATIN CAPITAL LETTER A WITH DIAERESIS",
446*0Sstevel@tonic-gate    A_TILDE         => "LATIN CAPITAL LETTER A WITH TILDE",
447*0Sstevel@tonic-gate    A_BREVE         => "LATIN CAPITAL LETTER A WITH BREVE",
448*0Sstevel@tonic-gate    A_RING          => "LATIN CAPITAL LETTER A WITH RING ABOVE",
449*0Sstevel@tonic-gate    A_MACRON        => "LATIN CAPITAL LETTER A WITH MACRON",
450*0Sstevel@tonic-gate    );
451*0Sstevel@tonic-gate
452*0Sstevel@tonic-gate=head2 Alias shortcut
453*0Sstevel@tonic-gate
454*0Sstevel@tonic-gate    use charnames ":alias" => ":pro";
455*0Sstevel@tonic-gate
456*0Sstevel@tonic-gate    works exactly the same as the alias pairs, only this time,
457*0Sstevel@tonic-gate    ":full" is inserted automatically as first argument (if no
458*0Sstevel@tonic-gate    other argument is given).
459*0Sstevel@tonic-gate
460*0Sstevel@tonic-gate=head1 charnames::viacode(code)
461*0Sstevel@tonic-gate
462*0Sstevel@tonic-gateReturns the full name of the character indicated by the numeric code.
463*0Sstevel@tonic-gateThe example
464*0Sstevel@tonic-gate
465*0Sstevel@tonic-gate    print charnames::viacode(0x2722);
466*0Sstevel@tonic-gate
467*0Sstevel@tonic-gateprints "FOUR TEARDROP-SPOKED ASTERISK".
468*0Sstevel@tonic-gate
469*0Sstevel@tonic-gateReturns undef if no name is known for the code.
470*0Sstevel@tonic-gate
471*0Sstevel@tonic-gateThis works only for the standard names, and does not yet apply
472*0Sstevel@tonic-gateto custom translators.
473*0Sstevel@tonic-gate
474*0Sstevel@tonic-gateNotice that the name returned for of U+FEFF is "ZERO WIDTH NO-BREAK
475*0Sstevel@tonic-gateSPACE", not "BYTE ORDER MARK".
476*0Sstevel@tonic-gate
477*0Sstevel@tonic-gate=head1 charnames::vianame(name)
478*0Sstevel@tonic-gate
479*0Sstevel@tonic-gateReturns the code point indicated by the name.
480*0Sstevel@tonic-gateThe example
481*0Sstevel@tonic-gate
482*0Sstevel@tonic-gate    printf "%04X", charnames::vianame("FOUR TEARDROP-SPOKED ASTERISK");
483*0Sstevel@tonic-gate
484*0Sstevel@tonic-gateprints "2722".
485*0Sstevel@tonic-gate
486*0Sstevel@tonic-gateReturns undef if the name is unknown.
487*0Sstevel@tonic-gate
488*0Sstevel@tonic-gateThis works only for the standard names, and does not yet apply
489*0Sstevel@tonic-gateto custom translators.
490*0Sstevel@tonic-gate
491*0Sstevel@tonic-gate=head1 ALIASES
492*0Sstevel@tonic-gate
493*0Sstevel@tonic-gateA few aliases have been defined for convenience: instead of having
494*0Sstevel@tonic-gateto use the official names
495*0Sstevel@tonic-gate
496*0Sstevel@tonic-gate    LINE FEED (LF)
497*0Sstevel@tonic-gate    FORM FEED (FF)
498*0Sstevel@tonic-gate    CARRIAGE RETURN (CR)
499*0Sstevel@tonic-gate    NEXT LINE (NEL)
500*0Sstevel@tonic-gate
501*0Sstevel@tonic-gate(yes, with parentheses) one can use
502*0Sstevel@tonic-gate
503*0Sstevel@tonic-gate    LINE FEED
504*0Sstevel@tonic-gate    FORM FEED
505*0Sstevel@tonic-gate    CARRIAGE RETURN
506*0Sstevel@tonic-gate    NEXT LINE
507*0Sstevel@tonic-gate    LF
508*0Sstevel@tonic-gate    FF
509*0Sstevel@tonic-gate    CR
510*0Sstevel@tonic-gate    NEL
511*0Sstevel@tonic-gate
512*0Sstevel@tonic-gateOne can also use
513*0Sstevel@tonic-gate
514*0Sstevel@tonic-gate    BYTE ORDER MARK
515*0Sstevel@tonic-gate    BOM
516*0Sstevel@tonic-gate
517*0Sstevel@tonic-gateand
518*0Sstevel@tonic-gate
519*0Sstevel@tonic-gate    ZWNJ
520*0Sstevel@tonic-gate    ZWJ
521*0Sstevel@tonic-gate
522*0Sstevel@tonic-gatefor ZERO WIDTH NON-JOINER and ZERO WIDTH JOINER.
523*0Sstevel@tonic-gate
524*0Sstevel@tonic-gateFor backward compatibility one can use the old names for
525*0Sstevel@tonic-gatecertain C0 and C1 controls
526*0Sstevel@tonic-gate
527*0Sstevel@tonic-gate    old                         new
528*0Sstevel@tonic-gate
529*0Sstevel@tonic-gate    HORIZONTAL TABULATION       CHARACTER TABULATION
530*0Sstevel@tonic-gate    VERTICAL TABULATION         LINE TABULATION
531*0Sstevel@tonic-gate    FILE SEPARATOR              INFORMATION SEPARATOR FOUR
532*0Sstevel@tonic-gate    GROUP SEPARATOR             INFORMATION SEPARATOR THREE
533*0Sstevel@tonic-gate    RECORD SEPARATOR            INFORMATION SEPARATOR TWO
534*0Sstevel@tonic-gate    UNIT SEPARATOR              INFORMATION SEPARATOR ONE
535*0Sstevel@tonic-gate    PARTIAL LINE DOWN           PARTIAL LINE FORWARD
536*0Sstevel@tonic-gate    PARTIAL LINE UP             PARTIAL LINE BACKWARD
537*0Sstevel@tonic-gate
538*0Sstevel@tonic-gatebut the old names in addition to giving the character
539*0Sstevel@tonic-gatewill also give a warning about being deprecated.
540*0Sstevel@tonic-gate
541*0Sstevel@tonic-gate=head1 ILLEGAL CHARACTERS
542*0Sstevel@tonic-gate
543*0Sstevel@tonic-gateIf you ask by name for a character that does not exist, a warning is
544*0Sstevel@tonic-gategiven and the Unicode I<replacement character> "\x{FFFD}" is returned.
545*0Sstevel@tonic-gate
546*0Sstevel@tonic-gateIf you ask by code for a character that does not exist, no warning is
547*0Sstevel@tonic-gategiven and C<undef> is returned.  (Though if you ask for a code point
548*0Sstevel@tonic-gatepast U+10FFFF you do get a warning.)
549*0Sstevel@tonic-gate
550*0Sstevel@tonic-gate=head1 BUGS
551*0Sstevel@tonic-gate
552*0Sstevel@tonic-gateSince evaluation of the translation function happens in a middle of
553*0Sstevel@tonic-gatecompilation (of a string literal), the translation function should not
554*0Sstevel@tonic-gatedo any C<eval>s or C<require>s.  This restriction should be lifted in
555*0Sstevel@tonic-gatea future version of Perl.
556*0Sstevel@tonic-gate
557*0Sstevel@tonic-gate=cut
558