1*0Sstevel@tonic-gatepackage utf8; 2*0Sstevel@tonic-gateuse strict; 3*0Sstevel@tonic-gateuse warnings; 4*0Sstevel@tonic-gate 5*0Sstevel@tonic-gatesub DEBUG () { 0 } 6*0Sstevel@tonic-gate 7*0Sstevel@tonic-gatesub DESTROY {} 8*0Sstevel@tonic-gate 9*0Sstevel@tonic-gatemy %Cache; 10*0Sstevel@tonic-gate 11*0Sstevel@tonic-gatesub croak { require Carp; Carp::croak(@_) } 12*0Sstevel@tonic-gate 13*0Sstevel@tonic-gate## 14*0Sstevel@tonic-gate## "SWASH" == "SWATCH HASH". A "swatch" is a swatch of the Unicode landscape. 15*0Sstevel@tonic-gate## It's a data structure that encodes a set of Unicode characters. 16*0Sstevel@tonic-gate## 17*0Sstevel@tonic-gate 18*0Sstevel@tonic-gatesub SWASHNEW { 19*0Sstevel@tonic-gate my ($class, $type, $list, $minbits, $none) = @_; 20*0Sstevel@tonic-gate local $^D = 0 if $^D; 21*0Sstevel@tonic-gate 22*0Sstevel@tonic-gate print STDERR "SWASHNEW @_\n" if DEBUG; 23*0Sstevel@tonic-gate 24*0Sstevel@tonic-gate ## 25*0Sstevel@tonic-gate ## Get the list of codepoints for the type. 26*0Sstevel@tonic-gate ## Called from utf8.c 27*0Sstevel@tonic-gate ## 28*0Sstevel@tonic-gate ## Given a $type, our goal is to fill $list with the set of codepoint 29*0Sstevel@tonic-gate ## ranges. 30*0Sstevel@tonic-gate ## 31*0Sstevel@tonic-gate ## To make the parsing of $type clear, this code takes the a rather 32*0Sstevel@tonic-gate ## unorthodox approach of last'ing out of the block once we have the 33*0Sstevel@tonic-gate ## info we need. Were this to be a subroutine, the 'last' would just 34*0Sstevel@tonic-gate ## be a 'return'. 35*0Sstevel@tonic-gate ## 36*0Sstevel@tonic-gate my $file; ## file to load data from, and also part of the %Cache key. 37*0Sstevel@tonic-gate my $ListSorted = 0; 38*0Sstevel@tonic-gate 39*0Sstevel@tonic-gate if ($type) 40*0Sstevel@tonic-gate { 41*0Sstevel@tonic-gate $type =~ s/^\s+//; 42*0Sstevel@tonic-gate $type =~ s/\s+$//; 43*0Sstevel@tonic-gate 44*0Sstevel@tonic-gate print "type = $type\n" if DEBUG; 45*0Sstevel@tonic-gate 46*0Sstevel@tonic-gate GETFILE: 47*0Sstevel@tonic-gate { 48*0Sstevel@tonic-gate ## 49*0Sstevel@tonic-gate ## 'Is' is always optional, so if it's there, remove it. 50*0Sstevel@tonic-gate ## Same with 'Category=' and 'Script='. 51*0Sstevel@tonic-gate ## 52*0Sstevel@tonic-gate ## 'Block=' is replaced by 'In'. 53*0Sstevel@tonic-gate ## 54*0Sstevel@tonic-gate my $wasIs; 55*0Sstevel@tonic-gate 56*0Sstevel@tonic-gate ($wasIs = $type =~ s/^Is(?:\s+|[-_])?//i) 57*0Sstevel@tonic-gate or 58*0Sstevel@tonic-gate $type =~ s/^Category\s*=\s*//i 59*0Sstevel@tonic-gate or 60*0Sstevel@tonic-gate $type =~ s/^Script\s*=\s*//i 61*0Sstevel@tonic-gate or 62*0Sstevel@tonic-gate $type =~ s/^Block\s*=\s*/In/i; 63*0Sstevel@tonic-gate 64*0Sstevel@tonic-gate ## 65*0Sstevel@tonic-gate ## See if it's in the direct mapping table. 66*0Sstevel@tonic-gate ## 67*0Sstevel@tonic-gate require "unicore/Exact.pl"; 68*0Sstevel@tonic-gate if (my $base = $utf8::Exact{$type}) { 69*0Sstevel@tonic-gate $file = "unicore/lib/$base.pl"; 70*0Sstevel@tonic-gate last GETFILE; 71*0Sstevel@tonic-gate } 72*0Sstevel@tonic-gate 73*0Sstevel@tonic-gate ## 74*0Sstevel@tonic-gate ## If not there exactly, try the canonical form. The canonical 75*0Sstevel@tonic-gate ## form is lowercased, with any separators (\s+|[-_]) removed. 76*0Sstevel@tonic-gate ## 77*0Sstevel@tonic-gate my $canonical = lc $type; 78*0Sstevel@tonic-gate $canonical =~ s/(?<=[a-z\d])(?:\s+|[-_])(?=[a-z\d])//g; 79*0Sstevel@tonic-gate print "canonical = $canonical\n" if DEBUG; 80*0Sstevel@tonic-gate 81*0Sstevel@tonic-gate require "unicore/Canonical.pl"; 82*0Sstevel@tonic-gate if (my $base = $utf8::Canonical{$canonical}) { 83*0Sstevel@tonic-gate $file = "unicore/lib/$base.pl"; 84*0Sstevel@tonic-gate last GETFILE; 85*0Sstevel@tonic-gate } 86*0Sstevel@tonic-gate 87*0Sstevel@tonic-gate ## 88*0Sstevel@tonic-gate ## It could be a user-defined property. 89*0Sstevel@tonic-gate ## 90*0Sstevel@tonic-gate 91*0Sstevel@tonic-gate my $caller1 = caller(1); 92*0Sstevel@tonic-gate 93*0Sstevel@tonic-gate if (defined $caller1 && $type =~ /^(?:\w+)$/) { 94*0Sstevel@tonic-gate my $prop = $caller1 . "::" . ( $wasIs ? "Is" : "" ) . $type; 95*0Sstevel@tonic-gate if (exists &{$prop}) { 96*0Sstevel@tonic-gate no strict 'refs'; 97*0Sstevel@tonic-gate 98*0Sstevel@tonic-gate $list = &{$prop}; 99*0Sstevel@tonic-gate last GETFILE; 100*0Sstevel@tonic-gate } 101*0Sstevel@tonic-gate } 102*0Sstevel@tonic-gate 103*0Sstevel@tonic-gate ## 104*0Sstevel@tonic-gate ## See if it's a user-level "To". 105*0Sstevel@tonic-gate ## 106*0Sstevel@tonic-gate 107*0Sstevel@tonic-gate my $caller0 = caller(0); 108*0Sstevel@tonic-gate 109*0Sstevel@tonic-gate if (defined $caller0 && $type =~ /^To(?:\w+)$/) { 110*0Sstevel@tonic-gate my $map = $caller0 . "::" . $type; 111*0Sstevel@tonic-gate if (exists &{$map}) { 112*0Sstevel@tonic-gate no strict 'refs'; 113*0Sstevel@tonic-gate 114*0Sstevel@tonic-gate $list = &{$map}; 115*0Sstevel@tonic-gate last GETFILE; 116*0Sstevel@tonic-gate } 117*0Sstevel@tonic-gate } 118*0Sstevel@tonic-gate 119*0Sstevel@tonic-gate ## 120*0Sstevel@tonic-gate ## Last attempt -- see if it's a standard "To" name 121*0Sstevel@tonic-gate ## (e.g. "ToLower") ToTitle is used by ucfirst(). 122*0Sstevel@tonic-gate ## The user-level way to access ToDigit() and ToFold() 123*0Sstevel@tonic-gate ## is to use Unicode::UCD. 124*0Sstevel@tonic-gate ## 125*0Sstevel@tonic-gate if ($type =~ /^To(Digit|Fold|Lower|Title|Upper)$/) 126*0Sstevel@tonic-gate { 127*0Sstevel@tonic-gate $file = "unicore/To/$1.pl"; 128*0Sstevel@tonic-gate ## would like to test to see if $file actually exists.... 129*0Sstevel@tonic-gate last GETFILE; 130*0Sstevel@tonic-gate } 131*0Sstevel@tonic-gate 132*0Sstevel@tonic-gate ## 133*0Sstevel@tonic-gate ## If we reach this line, it's because we couldn't figure 134*0Sstevel@tonic-gate ## out what to do with $type. Ouch. 135*0Sstevel@tonic-gate ## 136*0Sstevel@tonic-gate 137*0Sstevel@tonic-gate return $type; 138*0Sstevel@tonic-gate } 139*0Sstevel@tonic-gate 140*0Sstevel@tonic-gate if (defined $file) { 141*0Sstevel@tonic-gate print "found it (file='$file')\n" if DEBUG; 142*0Sstevel@tonic-gate 143*0Sstevel@tonic-gate ## 144*0Sstevel@tonic-gate ## If we reach here, it was due to a 'last GETFILE' above 145*0Sstevel@tonic-gate ## (exception: user-defined properties and mappings), so we 146*0Sstevel@tonic-gate ## have a filename, so now we load it if we haven't already. 147*0Sstevel@tonic-gate ## If we have, return the cached results. The cache key is the 148*0Sstevel@tonic-gate ## file to load. 149*0Sstevel@tonic-gate ## 150*0Sstevel@tonic-gate if ($Cache{$file} and ref($Cache{$file}) eq $class) 151*0Sstevel@tonic-gate { 152*0Sstevel@tonic-gate print "Returning cached '$file' for \\p{$type}\n" if DEBUG; 153*0Sstevel@tonic-gate return $Cache{$class, $file}; 154*0Sstevel@tonic-gate } 155*0Sstevel@tonic-gate 156*0Sstevel@tonic-gate $list = do $file; 157*0Sstevel@tonic-gate } 158*0Sstevel@tonic-gate 159*0Sstevel@tonic-gate $ListSorted = 1; ## we know that these lists are sorted 160*0Sstevel@tonic-gate } 161*0Sstevel@tonic-gate 162*0Sstevel@tonic-gate my $extras; 163*0Sstevel@tonic-gate my $bits = 0; 164*0Sstevel@tonic-gate 165*0Sstevel@tonic-gate my $ORIG = $list; 166*0Sstevel@tonic-gate if ($list) { 167*0Sstevel@tonic-gate my @tmp = split(/^/m, $list); 168*0Sstevel@tonic-gate my %seen; 169*0Sstevel@tonic-gate no warnings; 170*0Sstevel@tonic-gate $extras = join '', grep /^[^0-9a-fA-F]/, @tmp; 171*0Sstevel@tonic-gate $list = join '', 172*0Sstevel@tonic-gate map { $_->[1] } 173*0Sstevel@tonic-gate sort { $a->[0] <=> $b->[0] } 174*0Sstevel@tonic-gate map { /^([0-9a-fA-F]+)/; [ hex($1), $_ ] } 175*0Sstevel@tonic-gate grep { /^([0-9a-fA-F]+)/ and not $seen{$1}++ } @tmp; # XXX doesn't do ranges right 176*0Sstevel@tonic-gate } 177*0Sstevel@tonic-gate 178*0Sstevel@tonic-gate if ($none) { 179*0Sstevel@tonic-gate my $hextra = sprintf "%04x", $none + 1; 180*0Sstevel@tonic-gate $list =~ s/\tXXXX$/\t$hextra/mg; 181*0Sstevel@tonic-gate } 182*0Sstevel@tonic-gate 183*0Sstevel@tonic-gate if ($minbits < 32) { 184*0Sstevel@tonic-gate my $top = 0; 185*0Sstevel@tonic-gate while ($list =~ /^([0-9a-fA-F]+)(?:[\t]([0-9a-fA-F]+)?)(?:[ \t]([0-9a-fA-F]+))?/mg) { 186*0Sstevel@tonic-gate my $min = hex $1; 187*0Sstevel@tonic-gate my $max = defined $2 ? hex $2 : $min; 188*0Sstevel@tonic-gate my $val = defined $3 ? hex $3 : 0; 189*0Sstevel@tonic-gate $val += $max - $min if defined $3; 190*0Sstevel@tonic-gate $top = $val if $val > $top; 191*0Sstevel@tonic-gate } 192*0Sstevel@tonic-gate $bits = 193*0Sstevel@tonic-gate $top > 0xffff ? 32 : 194*0Sstevel@tonic-gate $top > 0xff ? 16 : 195*0Sstevel@tonic-gate $top > 1 ? 8 : 1 196*0Sstevel@tonic-gate } 197*0Sstevel@tonic-gate $bits = $minbits if $bits < $minbits; 198*0Sstevel@tonic-gate 199*0Sstevel@tonic-gate my @extras; 200*0Sstevel@tonic-gate for my $x ($extras) { 201*0Sstevel@tonic-gate pos $x = 0; 202*0Sstevel@tonic-gate while ($x =~ /^([^0-9a-fA-F\n])(.*)/mg) { 203*0Sstevel@tonic-gate my $char = $1; 204*0Sstevel@tonic-gate my $name = $2; 205*0Sstevel@tonic-gate print STDERR "$1 => $2\n" if DEBUG; 206*0Sstevel@tonic-gate if ($char =~ /[-+!]/) { 207*0Sstevel@tonic-gate my ($c,$t) = split(/::/, $name, 2); # bogus use of ::, really 208*0Sstevel@tonic-gate my $subobj; 209*0Sstevel@tonic-gate if ($c eq 'utf8') { 210*0Sstevel@tonic-gate $subobj = $c->SWASHNEW($t, "", 0, 0, 0); 211*0Sstevel@tonic-gate } 212*0Sstevel@tonic-gate elsif ($c =~ /^([0-9a-fA-F]+)/) { 213*0Sstevel@tonic-gate $subobj = utf8->SWASHNEW("", $c, 0, 0, 0); 214*0Sstevel@tonic-gate } 215*0Sstevel@tonic-gate return $subobj unless ref $subobj; 216*0Sstevel@tonic-gate push @extras, $name => $subobj; 217*0Sstevel@tonic-gate $bits = $subobj->{BITS} if $bits < $subobj->{BITS}; 218*0Sstevel@tonic-gate } 219*0Sstevel@tonic-gate } 220*0Sstevel@tonic-gate } 221*0Sstevel@tonic-gate 222*0Sstevel@tonic-gate print STDERR "CLASS = $class, TYPE => $type, BITS => $bits, NONE => $none\nEXTRAS =>\n$extras\nLIST =>\n$list\n" if DEBUG; 223*0Sstevel@tonic-gate 224*0Sstevel@tonic-gate my $SWASH = bless { 225*0Sstevel@tonic-gate TYPE => $type, 226*0Sstevel@tonic-gate BITS => $bits, 227*0Sstevel@tonic-gate EXTRAS => $extras, 228*0Sstevel@tonic-gate LIST => $list, 229*0Sstevel@tonic-gate NONE => $none, 230*0Sstevel@tonic-gate @extras, 231*0Sstevel@tonic-gate } => $class; 232*0Sstevel@tonic-gate 233*0Sstevel@tonic-gate if ($file) { 234*0Sstevel@tonic-gate $Cache{$class, $file} = $SWASH; 235*0Sstevel@tonic-gate } 236*0Sstevel@tonic-gate 237*0Sstevel@tonic-gate return $SWASH; 238*0Sstevel@tonic-gate} 239*0Sstevel@tonic-gate 240*0Sstevel@tonic-gate# NOTE: utf8.c:swash_init() assumes entries are never modified once generated. 241*0Sstevel@tonic-gate 242*0Sstevel@tonic-gatesub SWASHGET { 243*0Sstevel@tonic-gate # See utf8.c:Perl_swash_fetch for problems with this interface. 244*0Sstevel@tonic-gate my ($self, $start, $len) = @_; 245*0Sstevel@tonic-gate local $^D = 0 if $^D; 246*0Sstevel@tonic-gate my $type = $self->{TYPE}; 247*0Sstevel@tonic-gate my $bits = $self->{BITS}; 248*0Sstevel@tonic-gate my $none = $self->{NONE}; 249*0Sstevel@tonic-gate print STDERR "SWASHGET @_ [$type/$bits/$none]\n" if DEBUG; 250*0Sstevel@tonic-gate my $end = $start + $len; 251*0Sstevel@tonic-gate my $swatch = ""; 252*0Sstevel@tonic-gate my $key; 253*0Sstevel@tonic-gate vec($swatch, $len - 1, $bits) = 0; # Extend to correct length. 254*0Sstevel@tonic-gate if ($none) { 255*0Sstevel@tonic-gate for $key (0 .. $len - 1) { vec($swatch, $key, $bits) = $none } 256*0Sstevel@tonic-gate } 257*0Sstevel@tonic-gate 258*0Sstevel@tonic-gate for ($self->{LIST}) { 259*0Sstevel@tonic-gate pos $_ = 0; 260*0Sstevel@tonic-gate if ($bits > 1) { 261*0Sstevel@tonic-gate LINE: 262*0Sstevel@tonic-gate while (/^([0-9a-fA-F]+)(?:[ \t]([0-9a-fA-F]+)?)?(?:[ \t]([0-9a-fA-F]+))?/mg) { 263*0Sstevel@tonic-gate chomp; 264*0Sstevel@tonic-gate my ($a, $b, $c) = ($1, $2, $3); 265*0Sstevel@tonic-gate croak "$type: illegal mapping '$_'" 266*0Sstevel@tonic-gate if $type =~ /^To/ && 267*0Sstevel@tonic-gate !(defined $a && defined $c); 268*0Sstevel@tonic-gate my $min = hex $a; 269*0Sstevel@tonic-gate my $max = defined $b ? hex $b : $min; 270*0Sstevel@tonic-gate my $val = defined $c ? hex $c : 0; 271*0Sstevel@tonic-gate next if $max < $start; 272*0Sstevel@tonic-gate print "$min $max $val\n" if DEBUG; 273*0Sstevel@tonic-gate if ($none) { 274*0Sstevel@tonic-gate if ($min < $start) { 275*0Sstevel@tonic-gate $val += $start - $min if $val < $none; 276*0Sstevel@tonic-gate $min = $start; 277*0Sstevel@tonic-gate } 278*0Sstevel@tonic-gate for ($key = $min; $key <= $max; $key++) { 279*0Sstevel@tonic-gate last LINE if $key >= $end; 280*0Sstevel@tonic-gate print STDERR "$key => $val\n" if DEBUG; 281*0Sstevel@tonic-gate vec($swatch, $key - $start, $bits) = $val; 282*0Sstevel@tonic-gate ++$val if $val < $none; 283*0Sstevel@tonic-gate } 284*0Sstevel@tonic-gate } 285*0Sstevel@tonic-gate else { 286*0Sstevel@tonic-gate if ($min < $start) { 287*0Sstevel@tonic-gate $val += $start - $min; 288*0Sstevel@tonic-gate $min = $start; 289*0Sstevel@tonic-gate } 290*0Sstevel@tonic-gate for ($key = $min; $key <= $max; $key++, $val++) { 291*0Sstevel@tonic-gate last LINE if $key >= $end; 292*0Sstevel@tonic-gate print STDERR "$key => $val\n" if DEBUG; 293*0Sstevel@tonic-gate vec($swatch, $key - $start, $bits) = $val; 294*0Sstevel@tonic-gate } 295*0Sstevel@tonic-gate } 296*0Sstevel@tonic-gate } 297*0Sstevel@tonic-gate } 298*0Sstevel@tonic-gate else { 299*0Sstevel@tonic-gate LINE: 300*0Sstevel@tonic-gate while (/^([0-9a-fA-F]+)(?:[ \t]+([0-9a-fA-F]+))?/mg) { 301*0Sstevel@tonic-gate chomp; 302*0Sstevel@tonic-gate my $min = hex $1; 303*0Sstevel@tonic-gate my $max = defined $2 ? hex $2 : $min; 304*0Sstevel@tonic-gate next if $max < $start; 305*0Sstevel@tonic-gate if ($min < $start) { 306*0Sstevel@tonic-gate $min = $start; 307*0Sstevel@tonic-gate } 308*0Sstevel@tonic-gate for ($key = $min; $key <= $max; $key++) { 309*0Sstevel@tonic-gate last LINE if $key >= $end; 310*0Sstevel@tonic-gate print STDERR "$key => 1\n" if DEBUG; 311*0Sstevel@tonic-gate vec($swatch, $key - $start, 1) = 1; 312*0Sstevel@tonic-gate } 313*0Sstevel@tonic-gate } 314*0Sstevel@tonic-gate } 315*0Sstevel@tonic-gate } 316*0Sstevel@tonic-gate for my $x ($self->{EXTRAS}) { 317*0Sstevel@tonic-gate pos $x = 0; 318*0Sstevel@tonic-gate while ($x =~ /^([-+!])(.*)/mg) { 319*0Sstevel@tonic-gate my $char = $1; 320*0Sstevel@tonic-gate my $name = $2; 321*0Sstevel@tonic-gate print STDERR "INDIRECT $1 $2\n" if DEBUG; 322*0Sstevel@tonic-gate my $otherbits = $self->{$name}->{BITS}; 323*0Sstevel@tonic-gate croak("SWASHGET size mismatch") if $bits < $otherbits; 324*0Sstevel@tonic-gate my $other = $self->{$name}->SWASHGET($start, $len); 325*0Sstevel@tonic-gate if ($char eq '+') { 326*0Sstevel@tonic-gate if ($bits == 1 and $otherbits == 1) { 327*0Sstevel@tonic-gate $swatch |= $other; 328*0Sstevel@tonic-gate } 329*0Sstevel@tonic-gate else { 330*0Sstevel@tonic-gate for ($key = 0; $key < $len; $key++) { 331*0Sstevel@tonic-gate vec($swatch, $key, $bits) = vec($other, $key, $otherbits); 332*0Sstevel@tonic-gate } 333*0Sstevel@tonic-gate } 334*0Sstevel@tonic-gate } 335*0Sstevel@tonic-gate elsif ($char eq '!') { 336*0Sstevel@tonic-gate if ($bits == 1 and $otherbits == 1) { 337*0Sstevel@tonic-gate $swatch |= ~$other; 338*0Sstevel@tonic-gate } 339*0Sstevel@tonic-gate else { 340*0Sstevel@tonic-gate for ($key = 0; $key < $len; $key++) { 341*0Sstevel@tonic-gate if (!vec($other, $key, $otherbits)) { 342*0Sstevel@tonic-gate vec($swatch, $key, $bits) = 1; 343*0Sstevel@tonic-gate } 344*0Sstevel@tonic-gate } 345*0Sstevel@tonic-gate } 346*0Sstevel@tonic-gate } 347*0Sstevel@tonic-gate elsif ($char eq '-') { 348*0Sstevel@tonic-gate if ($bits == 1 and $otherbits == 1) { 349*0Sstevel@tonic-gate $swatch &= ~$other; 350*0Sstevel@tonic-gate } 351*0Sstevel@tonic-gate else { 352*0Sstevel@tonic-gate for ($key = 0; $key < $len; $key++) { 353*0Sstevel@tonic-gate if (vec($other, $key, $otherbits)) { 354*0Sstevel@tonic-gate vec($swatch, $key, $bits) = 0; 355*0Sstevel@tonic-gate } 356*0Sstevel@tonic-gate } 357*0Sstevel@tonic-gate } 358*0Sstevel@tonic-gate } 359*0Sstevel@tonic-gate } 360*0Sstevel@tonic-gate } 361*0Sstevel@tonic-gate if (DEBUG) { 362*0Sstevel@tonic-gate print STDERR "CELLS "; 363*0Sstevel@tonic-gate for ($key = 0; $key < $len; $key++) { 364*0Sstevel@tonic-gate print STDERR vec($swatch, $key, $bits), " "; 365*0Sstevel@tonic-gate } 366*0Sstevel@tonic-gate print STDERR "\n"; 367*0Sstevel@tonic-gate } 368*0Sstevel@tonic-gate $swatch; 369*0Sstevel@tonic-gate} 370*0Sstevel@tonic-gate 371*0Sstevel@tonic-gate1; 372