xref: /openbsd-src/gnu/usr.bin/perl/lib/charnames.pm (revision b2ea75c1b17e1a9a339660e7ed45cd24946b230e)
1package charnames;
2use bytes ();		# for $bytes::hint_bits
3use warnings();
4$charnames::hint_bits = 0x20000;
5
6my $txt;
7
8# This is not optimized in any way yet
9sub charnames {
10  $name = shift;
11  $txt = do "unicode/Name.pl" unless $txt;
12  my @off;
13  if ($^H{charnames_full} and $txt =~ /\t\t$name$/m) {
14    @off = ($-[0], $+[0]);
15  }
16  unless (@off) {
17    if ($^H{charnames_short} and $name =~ /^(.*?):(.*)/s) {
18      my ($script, $cname) = ($1,$2);
19      my $case = ( $cname =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL");
20      if ($txt =~ m/\t\t\U$script\E (?:$case )?LETTER \U$cname$/m) {
21	@off = ($-[0], $+[0]);
22      }
23    }
24  }
25  unless (@off) {
26    my $case = ( $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL");
27    for ( @{$^H{charnames_scripts}} ) {
28      (@off = ($-[0], $+[0])), last
29	if $txt =~ m/\t\t$_ (?:$case )?LETTER \U$name$/m;
30    }
31  }
32  die "Unknown charname '$name'" unless @off;
33
34  my $hexlen = 4; # Unicode guarantees 4-, 5-, or 6-digit format
35  $hexlen++ while
36      $hexlen < 6 && substr($txt, $off[0] - $hexlen - 1, 1) =~ /[0-9a-f]/;
37  my $ord = hex substr $txt, $off[0] - $hexlen, $hexlen;
38  if ($^H & $bytes::hint_bits) {	# "use bytes" in effect?
39    use bytes;
40    return chr $ord if $ord <= 255;
41    my $hex = sprintf '%X=0%o', $ord, $ord;
42    my $fname = substr $txt, $off[0] + 2, $off[1] - $off[0] - 2;
43    die "Character 0x$hex with name '$fname' is above 0xFF";
44  }
45  return chr $ord;
46}
47
48sub import {
49  shift;
50  die "`use charnames' needs explicit imports list" unless @_;
51  $^H |= $charnames::hint_bits;
52  $^H{charnames} = \&charnames ;
53  my %h;
54  @h{@_} = (1) x @_;
55  $^H{charnames_full} = delete $h{':full'};
56  $^H{charnames_short} = delete $h{':short'};
57  $^H{charnames_scripts} = [map uc, keys %h];
58  if (warnings::enabled('utf8') && @{$^H{charnames_scripts}}) {
59	$txt = do "unicode/Name.pl" unless $txt;
60    for (@{$^H{charnames_scripts}}) {
61        warnings::warn('utf8',  "No such script: '$_'") unless
62	    $txt =~ m/\t\t$_ (?:CAPITAL |SMALL )?LETTER /;
63	}
64  }
65}
66
67
681;
69__END__
70
71=head1 NAME
72
73charnames - define character names for C<\N{named}> string literal escape.
74
75=head1 SYNOPSIS
76
77  use charnames ':full';
78  print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n";
79
80  use charnames ':short';
81  print "\N{greek:Sigma} is an upper-case sigma.\n";
82
83  use charnames qw(cyrillic greek);
84  print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n";
85
86=head1 DESCRIPTION
87
88Pragma C<use charnames> supports arguments C<:full>, C<:short> and
89script names.  If C<:full> is present, for expansion of
90C<\N{CHARNAME}}> string C<CHARNAME> is first looked in the list of
91standard Unicode names of chars.  If C<:short> is present, and
92C<CHARNAME> has the form C<SCRIPT:CNAME>, then C<CNAME> is looked up
93as a letter in script C<SCRIPT>.  If pragma C<use charnames> is used
94with script name arguments, then for C<\N{CHARNAME}}> the name
95C<CHARNAME> is looked up as a letter in the given scripts (in the
96specified order).
97
98For lookup of C<CHARNAME> inside a given script C<SCRIPTNAME>
99this pragma looks for the names
100
101  SCRIPTNAME CAPITAL LETTER CHARNAME
102  SCRIPTNAME SMALL LETTER CHARNAME
103  SCRIPTNAME LETTER CHARNAME
104
105in the table of standard Unicode names.  If C<CHARNAME> is lowercase,
106then the C<CAPITAL> variant is ignored, otherwise the C<SMALL> variant is
107ignored.
108
109=head1 CUSTOM TRANSLATORS
110
111The mechanism of translation of C<\N{...}> escapes is general and not
112hardwired into F<charnames.pm>.  A module can install custom
113translations (inside the scope which C<use>s the module) with the
114following magic incantation:
115
116    use charnames ();		# for $charnames::hint_bits
117    sub import {
118	shift;
119	$^H |= $charnames::hint_bits;
120	$^H{charnames} = \&translator;
121    }
122
123Here translator() is a subroutine which takes C<CHARNAME> as an
124argument, and returns text to insert into the string instead of the
125C<\N{CHARNAME}> escape.  Since the text to insert should be different
126in C<bytes> mode and out of it, the function should check the current
127state of C<bytes>-flag as in:
128
129    use bytes ();			# for $bytes::hint_bits
130    sub translator {
131	if ($^H & $bytes::hint_bits) {
132	    return bytes_translator(@_);
133	}
134	else {
135	    return utf8_translator(@_);
136	}
137    }
138
139=head1 BUGS
140
141Since evaluation of the translation function happens in a middle of
142compilation (of a string literal), the translation function should not
143do any C<eval>s or C<require>s.  This restriction should be lifted in
144a future version of Perl.
145
146=cut
147