1#!perl 2# 3# This script generates "unfcan.h", "unfcpt.h", "unfcmb.h", 4# "unfcmp.h", and "unfexc.h" 5# from CombiningClass.pl, Decomposition.pl, CompositionExclusions.txt 6# in lib/unicore or unicode directory 7# for Unicode::Normalize.xs. (cf. Makefile.PL) 8# 9# Usage: <perl mkheader> in command line 10# or <do 'mkheader'> in perl 11# 12use 5.006; 13use strict; 14use warnings; 15use Carp; 16use File::Spec; 17 18BEGIN { 19 unless ("A" eq pack('U', 0x41)) { 20 die "Unicode::Normalize cannot stringify a Unicode code point\n"; 21 } 22} 23 24our $PACKAGE = 'Unicode::Normalize, mkheader'; 25 26our $Combin = do "unicore/CombiningClass.pl" 27 || do "unicode/CombiningClass.pl" 28 || croak "$PACKAGE: CombiningClass.pl not found"; 29 30our $Decomp = do "unicore/Decomposition.pl" 31 || do "unicode/Decomposition.pl" 32 || croak "$PACKAGE: Decomposition.pl not found"; 33 34our %Combin; # $codepoint => $number : combination class 35our %Canon; # $codepoint => \@codepoints : canonical decomp. 36our %Compat; # $codepoint => \@codepoints : compat. decomp. 37# after _U_stringify(), ($codepoint => $hexstring) for %Canon and %Compat 38our %Exclus; # $codepoint => 1 : composition exclusions 39our %Single; # $codepoint => 1 : singletons 40our %NonStD; # $codepoint => 1 : non-starter decompositions 41 42our %Comp1st; # $codepoint => $listname : may be composed with a next char. 43our %Comp2nd; # $codepoint => 1 : may be composed with a prev char. 44our %CompList; # $listname,$2nd => $codepoint : composite 45 46our $prefix = "UNF_"; 47our $structname = "${prefix}complist"; 48 49########## definition of Hangul constants ########## 50use constant SBase => 0xAC00; 51use constant SFinal => 0xD7A3; # SBase -1 + SCount 52use constant SCount => 11172; # LCount * NCount 53use constant NCount => 588; # VCount * TCount 54use constant LBase => 0x1100; 55use constant LFinal => 0x1112; 56use constant LCount => 19; 57use constant VBase => 0x1161; 58use constant VFinal => 0x1175; 59use constant VCount => 21; 60use constant TBase => 0x11A7; 61use constant TFinal => 0x11C2; 62use constant TCount => 28; 63 64sub decomposeHangul { 65 my $SIndex = $_[0] - SBase; 66 my $LIndex = int( $SIndex / NCount); 67 my $VIndex = int(($SIndex % NCount) / TCount); 68 my $TIndex = $SIndex % TCount; 69 my @ret = ( 70 LBase + $LIndex, 71 VBase + $VIndex, 72 $TIndex ? (TBase + $TIndex) : (), 73 ); 74 wantarray ? @ret : pack('U*', @ret); 75 # any element in @ret greater than 0xFF, so no need of u2n conversion. 76} 77 78########## getting full decomposion ########## 79{ 80 my($f, $fh); 81 foreach my $d (@INC) { 82 $f = File::Spec->catfile($d, "unicore", "CompositionExclusions.txt"); 83 last if open($fh, $f); 84 $f = File::Spec->catfile($d, "unicode", "CompExcl.txt"); 85 last if open($fh, $f); 86 $f = undef; 87 } 88 croak "$PACKAGE: neither unicore/CompositionExclusions.txt " 89 . "nor unicode/CompExcl.txt is found in @INC" unless defined $f; 90 91 while (<$fh>) { 92 next if /^#/ or /^$/; 93 s/#.*//; 94 $Exclus{ hex($1) } = 1 if /([0-9A-Fa-f]+)/; 95 } 96 close $fh; 97} 98 99## 100## converts string "hhhh hhhh hhhh" to a numeric list 101## 102sub _getHexArray { map hex, $_[0] =~ /([0-9A-Fa-f]+)/g } 103 104while ($Combin =~ /(.+)/g) { 105 my @tab = split /\t/, $1; 106 my $ini = hex $tab[0]; 107 if ($tab[1] eq '') { 108 $Combin{ $ini } = $tab[2]; 109 } else { 110 $Combin{ $_ } = $tab[2] foreach $ini .. hex($tab[1]); 111 } 112} 113 114while ($Decomp =~ /(.+)/g) { 115 my @tab = split /\t/, $1; 116 my $compat = $tab[2] =~ s/<[^>]+>//; 117 my $dec = [ _getHexArray($tab[2]) ]; # decomposition 118 my $ini = hex($tab[0]); # initial decomposable character 119 120 my $listname = 121 @$dec == 2 ? sprintf("${structname}_%06x", $dec->[0]) : 'USELESS'; 122 # %04x is bad since it'd place _3046 after _1d157. 123 124 if ($tab[1] eq '') { 125 $Compat{ $ini } = $dec; 126 127 if (! $compat) { 128 $Canon{ $ini } = $dec; 129 130 if (@$dec == 2) { 131 if ($Combin{ $dec->[0] }) { 132 $NonStD{ $ini } = 1; 133 } else { 134 $CompList{ $listname }{ $dec->[1] } = $ini; 135 $Comp1st{ $dec->[0] } = $listname; 136 $Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$ini}; 137 } 138 } elsif (@$dec == 1) { 139 $Single{ $ini } = 1; 140 } else { 141 croak("Weird Canonical Decomposition of U+$tab[0]"); 142 } 143 } 144 } else { 145 foreach my $u ($ini .. hex($tab[1])) { 146 $Compat{ $u } = $dec; 147 148 if (! $compat) { 149 $Canon{ $u } = $dec; 150 151 if (@$dec == 2) { 152 if ($Combin{ $dec->[0] }) { 153 $NonStD{ $u } = 1; 154 } else { 155 $CompList{ $listname }{ $dec->[1] } = $u; 156 $Comp1st{ $dec->[0] } = $listname; 157 $Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$u}; 158 } 159 } elsif (@$dec == 1) { 160 $Single{ $u } = 1; 161 } else { 162 croak("Weird Canonical Decomposition of U+$tab[0]"); 163 } 164 } 165 } 166 } 167} 168 169# modern HANGUL JUNGSEONG and HANGUL JONGSEONG jamo 170foreach my $j (0x1161..0x1175, 0x11A8..0x11C2) { 171 $Comp2nd{$j} = 1; 172} 173 174sub getCanonList { 175 my @src = @_; 176 my @dec = map { 177 (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_) 178 : $Canon{$_} ? @{ $Canon{$_} } : $_ 179 } @src; 180 return join(" ",@src) eq join(" ",@dec) ? @dec : getCanonList(@dec); 181 # condition @src == @dec is not ok. 182} 183 184sub getCompatList { 185 my @src = @_; 186 my @dec = map { 187 (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_) 188 : $Compat{$_} ? @{ $Compat{$_} } : $_ 189 } @src; 190 return join(" ",@src) eq join(" ",@dec) ? @dec : getCompatList(@dec); 191 # condition @src == @dec is not ok. 192} 193 194# exhaustive decomposition 195foreach my $key (keys %Canon) { 196 $Canon{$key} = [ getCanonList($key) ]; 197} 198 199# exhaustive decomposition 200foreach my $key (keys %Compat) { 201 $Compat{$key} = [ getCompatList($key) ]; 202} 203 204sub _pack_U { 205 return pack('U*', @_); 206} 207 208sub _U_stringify { 209 sprintf '"%s"', join '', 210 map sprintf("\\x%02x", $_), unpack 'C*', _pack_U(@_); 211} 212 213foreach my $hash (\%Canon, \%Compat) { 214 foreach my $key (keys %$hash) { 215 $hash->{$key} = _U_stringify( @{ $hash->{$key} } ); 216 } 217} 218 219########## writing header files ########## 220 221my @boolfunc = ( 222 { 223 name => "Exclusion", 224 type => "bool", 225 hash => \%Exclus, 226 }, 227 { 228 name => "Singleton", 229 type => "bool", 230 hash => \%Single, 231 }, 232 { 233 name => "NonStDecomp", 234 type => "bool", 235 hash => \%NonStD, 236 }, 237 { 238 name => "Comp2nd", 239 type => "bool", 240 hash => \%Comp2nd, 241 }, 242); 243 244my $file = "unfexc.h"; 245open FH, ">$file" or croak "$PACKAGE: $file can't be made"; 246binmode FH; select FH; 247 248 print << 'EOF'; 249/* 250 * This file is auto-generated by mkheader. 251 * Any changes here will be lost! 252 */ 253EOF 254 255foreach my $tbl (@boolfunc) { 256 my @temp = sort {$a <=> $b} keys %{$tbl->{hash}}; 257 my $type = $tbl->{type}; 258 my $name = $tbl->{name}; 259 print "$type is$name (UV uv)\n{\nreturn\n\t"; 260 261 while (@temp) { 262 my $cur = shift @temp; 263 if (@temp && $cur + 1 == $temp[0]) { 264 print "($cur <= uv && uv <= "; 265 while (@temp && $cur + 1 == $temp[0]) { 266 $cur = shift @temp; 267 } 268 print "$cur)"; 269 print "\n\t|| " if @temp; 270 } else { 271 print "uv == $cur"; 272 print "\n\t|| " if @temp; 273 } 274 } 275 print "\n\t? TRUE : FALSE;\n}\n\n"; 276} 277 278close FH; 279 280#################################### 281 282my $compinit = 283 "typedef struct { UV nextchar; UV composite; } $structname;\n\n"; 284 285foreach my $i (sort keys %CompList) { 286 $compinit .= "$structname $i [] = {\n"; 287 $compinit .= join ",\n", 288 map sprintf("\t{ %d, %d }", $_, $CompList{$i}{$_}), 289 sort {$a <=> $b } keys %{ $CompList{$i} }; 290 $compinit .= ",\n{0,0}\n};\n\n"; # with sentinel 291} 292 293my @tripletable = ( 294 { 295 file => "unfcmb", 296 name => "combin", 297 type => "STDCHAR", 298 hash => \%Combin, 299 null => 0, 300 }, 301 { 302 file => "unfcan", 303 name => "canon", 304 type => "char*", 305 hash => \%Canon, 306 null => "NULL", 307 }, 308 { 309 file => "unfcpt", 310 name => "compat", 311 type => "char*", 312 hash => \%Compat, 313 null => "NULL", 314 }, 315 { 316 file => "unfcmp", 317 name => "compos", 318 type => "$structname *", 319 hash => \%Comp1st, 320 null => "NULL", 321 init => $compinit, 322 }, 323); 324 325foreach my $tbl (@tripletable) { 326 my $file = "$tbl->{file}.h"; 327 my $head = "${prefix}$tbl->{name}"; 328 my $type = $tbl->{type}; 329 my $hash = $tbl->{hash}; 330 my $null = $tbl->{null}; 331 my $init = $tbl->{init}; 332 333 open FH, ">$file" or croak "$PACKAGE: $file can't be made"; 334 binmode FH; select FH; 335 my %val; 336 337 print FH << 'EOF'; 338/* 339 * This file is auto-generated by mkheader. 340 * Any changes here will be lost! 341 */ 342EOF 343 344 print $init if defined $init; 345 346 foreach my $uv (keys %$hash) { 347 croak sprintf("a Unicode code point 0x%04X over 0x10FFFF.", $uv) 348 unless $uv <= 0x10FFFF; 349 my @c = unpack 'CCCC', pack 'N', $uv; 350 $val{ $c[1] }{ $c[2] }{ $c[3] } = $hash->{$uv}; 351 } 352 353 foreach my $p (sort { $a <=> $b } keys %val) { 354 next if ! $val{ $p }; 355 for (my $r = 0; $r < 256; $r++) { 356 next if ! $val{ $p }{ $r }; 357 printf "$type ${head}_%02x_%02x [256] = {\n", $p, $r; 358 for (my $c = 0; $c < 256; $c++) { 359 print "\t", defined $val{$p}{$r}{$c} 360 ? "($type)".$val{$p}{$r}{$c} 361 : $null; 362 print ',' if $c != 255; 363 print "\n" if $c % 8 == 7; 364 } 365 print "};\n\n"; 366 } 367 } 368 foreach my $p (sort { $a <=> $b } keys %val) { 369 next if ! $val{ $p }; 370 printf "$type* ${head}_%02x [256] = {\n", $p; 371 for (my $r = 0; $r < 256; $r++) { 372 print $val{ $p }{ $r } 373 ? sprintf("${head}_%02x_%02x", $p, $r) 374 : "NULL"; 375 print ',' if $r != 255; 376 print "\n" if $val{ $p }{ $r } || ($r+1) % 8 == 0; 377 } 378 print "};\n\n"; 379 } 380 print "$type** $head [] = {\n"; 381 for (my $p = 0; $p <= 0x10; $p++) { 382 print $val{ $p } ? sprintf("${head}_%02x", $p) : "NULL"; 383 print ',' if $p != 0x10; 384 print "\n"; 385 } 386 print "};\n\n"; 387 close FH; 388} 389 390__END__ 391