1package utf8; 2use strict; 3use warnings; 4 5sub DEBUG () { 0 } 6 7sub DESTROY {} 8 9my %Cache; 10 11sub croak { require Carp; Carp::croak(@_) } 12 13## 14## "SWASH" == "SWATCH HASH". A "swatch" is a swatch of the Unicode landscape. 15## It's a data structure that encodes a set of Unicode characters. 16## 17 18sub SWASHNEW { 19 my ($class, $type, $list, $minbits, $none) = @_; 20 local $^D = 0 if $^D; 21 22 print STDERR "SWASHNEW @_\n" if DEBUG; 23 24 ## 25 ## Get the list of codepoints for the type. 26 ## Called from utf8.c 27 ## 28 ## Given a $type, our goal is to fill $list with the set of codepoint 29 ## ranges. 30 ## 31 ## To make the parsing of $type clear, this code takes the a rather 32 ## unorthodox approach of last'ing out of the block once we have the 33 ## info we need. Were this to be a subroutine, the 'last' would just 34 ## be a 'return'. 35 ## 36 my $file; ## file to load data from, and also part of the %Cache key. 37 my $ListSorted = 0; 38 39 if ($type) 40 { 41 $type =~ s/^\s+//; 42 $type =~ s/\s+$//; 43 44 print "type = $type\n" if DEBUG; 45 46 GETFILE: 47 { 48 ## 49 ## 'Is' is always optional, so if it's there, remove it. 50 ## Same with 'Category=' and 'Script='. 51 ## 52 ## 'Block=' is replaced by 'In'. 53 ## 54 my $wasIs; 55 56 ($wasIs = $type =~ s/^Is(?:\s+|[-_])?//i) 57 or 58 $type =~ s/^Category\s*=\s*//i 59 or 60 $type =~ s/^Script\s*=\s*//i 61 or 62 $type =~ s/^Block\s*=\s*/In/i; 63 64 ## 65 ## See if it's in the direct mapping table. 66 ## 67 require "unicore/Exact.pl"; 68 if (my $base = $utf8::Exact{$type}) { 69 $file = "unicore/lib/$base.pl"; 70 last GETFILE; 71 } 72 73 ## 74 ## If not there exactly, try the canonical form. The canonical 75 ## form is lowercased, with any separators (\s+|[-_]) removed. 76 ## 77 my $canonical = lc $type; 78 $canonical =~ s/(?<=[a-z\d])(?:\s+|[-_])(?=[a-z\d])//g; 79 print "canonical = $canonical\n" if DEBUG; 80 81 require "unicore/Canonical.pl"; 82 if (my $base = $utf8::Canonical{$canonical}) { 83 $file = "unicore/lib/$base.pl"; 84 last GETFILE; 85 } 86 87 ## 88 ## It could be a user-defined property. 89 ## 90 91 my $caller1 = caller(1); 92 93 if (defined $caller1 && $type =~ /^(?:\w+)$/) { 94 my $prop = $caller1 . "::" . ( $wasIs ? "Is" : "" ) . $type; 95 if (exists &{$prop}) { 96 no strict 'refs'; 97 98 $list = &{$prop}; 99 last GETFILE; 100 } 101 } 102 103 ## 104 ## See if it's a user-level "To". 105 ## 106 107 my $caller0 = caller(0); 108 109 if (defined $caller0 && $type =~ /^To(?:\w+)$/) { 110 my $map = $caller0 . "::" . $type; 111 if (exists &{$map}) { 112 no strict 'refs'; 113 114 $list = &{$map}; 115 last GETFILE; 116 } 117 } 118 119 ## 120 ## Last attempt -- see if it's a standard "To" name 121 ## (e.g. "ToLower") ToTitle is used by ucfirst(). 122 ## The user-level way to access ToDigit() and ToFold() 123 ## is to use Unicode::UCD. 124 ## 125 if ($type =~ /^To(Digit|Fold|Lower|Title|Upper)$/) 126 { 127 $file = "unicore/To/$1.pl"; 128 ## would like to test to see if $file actually exists.... 129 last GETFILE; 130 } 131 132 ## 133 ## If we reach this line, it's because we couldn't figure 134 ## out what to do with $type. Ouch. 135 ## 136 137 return $type; 138 } 139 140 if (defined $file) { 141 print "found it (file='$file')\n" if DEBUG; 142 143 ## 144 ## If we reach here, it was due to a 'last GETFILE' above 145 ## (exception: user-defined properties and mappings), so we 146 ## have a filename, so now we load it if we haven't already. 147 ## If we have, return the cached results. The cache key is the 148 ## file to load. 149 ## 150 if ($Cache{$file} and ref($Cache{$file}) eq $class) 151 { 152 print "Returning cached '$file' for \\p{$type}\n" if DEBUG; 153 return $Cache{$class, $file}; 154 } 155 156 $list = do $file; 157 } 158 159 $ListSorted = 1; ## we know that these lists are sorted 160 } 161 162 my $extras; 163 my $bits = 0; 164 165 my $ORIG = $list; 166 if ($list) { 167 my @tmp = split(/^/m, $list); 168 my %seen; 169 no warnings; 170 $extras = join '', grep /^[^0-9a-fA-F]/, @tmp; 171 $list = join '', 172 map { $_->[1] } 173 sort { $a->[0] <=> $b->[0] } 174 map { /^([0-9a-fA-F]+)/; [ hex($1), $_ ] } 175 grep { /^([0-9a-fA-F]+)/ and not $seen{$1}++ } @tmp; # XXX doesn't do ranges right 176 } 177 178 if ($none) { 179 my $hextra = sprintf "%04x", $none + 1; 180 $list =~ s/\tXXXX$/\t$hextra/mg; 181 } 182 183 if ($minbits < 32) { 184 my $top = 0; 185 while ($list =~ /^([0-9a-fA-F]+)(?:[\t]([0-9a-fA-F]+)?)(?:[ \t]([0-9a-fA-F]+))?/mg) { 186 my $min = hex $1; 187 my $max = defined $2 ? hex $2 : $min; 188 my $val = defined $3 ? hex $3 : 0; 189 $val += $max - $min if defined $3; 190 $top = $val if $val > $top; 191 } 192 $bits = 193 $top > 0xffff ? 32 : 194 $top > 0xff ? 16 : 195 $top > 1 ? 8 : 1 196 } 197 $bits = $minbits if $bits < $minbits; 198 199 my @extras; 200 for my $x ($extras) { 201 pos $x = 0; 202 while ($x =~ /^([^0-9a-fA-F\n])(.*)/mg) { 203 my $char = $1; 204 my $name = $2; 205 print STDERR "$1 => $2\n" if DEBUG; 206 if ($char =~ /[-+!]/) { 207 my ($c,$t) = split(/::/, $name, 2); # bogus use of ::, really 208 my $subobj; 209 if ($c eq 'utf8') { 210 $subobj = $c->SWASHNEW($t, "", 0, 0, 0); 211 } 212 elsif ($c =~ /^([0-9a-fA-F]+)/) { 213 $subobj = utf8->SWASHNEW("", $c, 0, 0, 0); 214 } 215 return $subobj unless ref $subobj; 216 push @extras, $name => $subobj; 217 $bits = $subobj->{BITS} if $bits < $subobj->{BITS}; 218 } 219 } 220 } 221 222 print STDERR "CLASS = $class, TYPE => $type, BITS => $bits, NONE => $none\nEXTRAS =>\n$extras\nLIST =>\n$list\n" if DEBUG; 223 224 my $SWASH = bless { 225 TYPE => $type, 226 BITS => $bits, 227 EXTRAS => $extras, 228 LIST => $list, 229 NONE => $none, 230 @extras, 231 } => $class; 232 233 if ($file) { 234 $Cache{$class, $file} = $SWASH; 235 } 236 237 return $SWASH; 238} 239 240# NOTE: utf8.c:swash_init() assumes entries are never modified once generated. 241 242sub SWASHGET { 243 # See utf8.c:Perl_swash_fetch for problems with this interface. 244 my ($self, $start, $len) = @_; 245 local $^D = 0 if $^D; 246 my $type = $self->{TYPE}; 247 my $bits = $self->{BITS}; 248 my $none = $self->{NONE}; 249 print STDERR "SWASHGET @_ [$type/$bits/$none]\n" if DEBUG; 250 my $end = $start + $len; 251 my $swatch = ""; 252 my $key; 253 vec($swatch, $len - 1, $bits) = 0; # Extend to correct length. 254 if ($none) { 255 for $key (0 .. $len - 1) { vec($swatch, $key, $bits) = $none } 256 } 257 258 for ($self->{LIST}) { 259 pos $_ = 0; 260 if ($bits > 1) { 261 LINE: 262 while (/^([0-9a-fA-F]+)(?:[ \t]([0-9a-fA-F]+)?)?(?:[ \t]([0-9a-fA-F]+))?/mg) { 263 chomp; 264 my ($a, $b, $c) = ($1, $2, $3); 265 croak "$type: illegal mapping '$_'" 266 if $type =~ /^To/ && 267 !(defined $a && defined $c); 268 my $min = hex $a; 269 my $max = defined $b ? hex $b : $min; 270 my $val = defined $c ? hex $c : 0; 271 next if $max < $start; 272 print "$min $max $val\n" if DEBUG; 273 if ($none) { 274 if ($min < $start) { 275 $val += $start - $min if $val < $none; 276 $min = $start; 277 } 278 for ($key = $min; $key <= $max; $key++) { 279 last LINE if $key >= $end; 280 print STDERR "$key => $val\n" if DEBUG; 281 vec($swatch, $key - $start, $bits) = $val; 282 ++$val if $val < $none; 283 } 284 } 285 else { 286 if ($min < $start) { 287 $val += $start - $min; 288 $min = $start; 289 } 290 for ($key = $min; $key <= $max; $key++, $val++) { 291 last LINE if $key >= $end; 292 print STDERR "$key => $val\n" if DEBUG; 293 vec($swatch, $key - $start, $bits) = $val; 294 } 295 } 296 } 297 } 298 else { 299 LINE: 300 while (/^([0-9a-fA-F]+)(?:[ \t]+([0-9a-fA-F]+))?/mg) { 301 chomp; 302 my $min = hex $1; 303 my $max = defined $2 ? hex $2 : $min; 304 next if $max < $start; 305 if ($min < $start) { 306 $min = $start; 307 } 308 for ($key = $min; $key <= $max; $key++) { 309 last LINE if $key >= $end; 310 print STDERR "$key => 1\n" if DEBUG; 311 vec($swatch, $key - $start, 1) = 1; 312 } 313 } 314 } 315 } 316 for my $x ($self->{EXTRAS}) { 317 pos $x = 0; 318 while ($x =~ /^([-+!])(.*)/mg) { 319 my $char = $1; 320 my $name = $2; 321 print STDERR "INDIRECT $1 $2\n" if DEBUG; 322 my $otherbits = $self->{$name}->{BITS}; 323 croak("SWASHGET size mismatch") if $bits < $otherbits; 324 my $other = $self->{$name}->SWASHGET($start, $len); 325 if ($char eq '+') { 326 if ($bits == 1 and $otherbits == 1) { 327 $swatch |= $other; 328 } 329 else { 330 for ($key = 0; $key < $len; $key++) { 331 vec($swatch, $key, $bits) = vec($other, $key, $otherbits); 332 } 333 } 334 } 335 elsif ($char eq '!') { 336 if ($bits == 1 and $otherbits == 1) { 337 $swatch |= ~$other; 338 } 339 else { 340 for ($key = 0; $key < $len; $key++) { 341 if (!vec($other, $key, $otherbits)) { 342 vec($swatch, $key, $bits) = 1; 343 } 344 } 345 } 346 } 347 elsif ($char eq '-') { 348 if ($bits == 1 and $otherbits == 1) { 349 $swatch &= ~$other; 350 } 351 else { 352 for ($key = 0; $key < $len; $key++) { 353 if (vec($other, $key, $otherbits)) { 354 vec($swatch, $key, $bits) = 0; 355 } 356 } 357 } 358 } 359 } 360 } 361 if (DEBUG) { 362 print STDERR "CELLS "; 363 for ($key = 0; $key < $len; $key++) { 364 print STDERR vec($swatch, $key, $bits), " "; 365 } 366 print STDERR "\n"; 367 } 368 $swatch; 369} 370 3711; 372