1#!perl
2#
3# This script generates "unfcan.h", "unfcpt.h", "unfcmb.h",
4# "unfcmp.h", and "unfexc.h"
5# from CombiningClass.pl, Decomposition.pl, CompositionExclusions.txt
6# in lib/unicore or unicode directory
7# for Unicode::Normalize.xs. (cf. Makefile.PL)
8#
9#  Usage: <perl mkheader> in command line
10#      or <do 'mkheader'> in perl
11#
12use 5.006;
13use strict;
14use warnings;
15use Carp;
16use File::Spec;
17
18BEGIN {
19    unless ("A" eq pack('U', 0x41)) {
20	die "Unicode::Normalize cannot stringify a Unicode code point\n";
21    }
22}
23
24our $PACKAGE = 'Unicode::Normalize, mkheader';
25
26our $Combin = do "unicore/CombiningClass.pl"
27    || do "unicode/CombiningClass.pl"
28    || croak "$PACKAGE: CombiningClass.pl not found";
29
30our $Decomp = do "unicore/Decomposition.pl"
31    || do "unicode/Decomposition.pl"
32    || croak "$PACKAGE: Decomposition.pl not found";
33
34our %Combin;	# $codepoint => $number    : combination class
35our %Canon;	# $codepoint => \@codepoints : canonical decomp.
36our %Compat;	# $codepoint => \@codepoints : compat. decomp.
37# after _U_stringify(), ($codepoint => $hexstring) for %Canon and %Compat
38our %Exclus;	# $codepoint => 1          : composition exclusions
39our %Single;	# $codepoint => 1          : singletons
40our %NonStD;	# $codepoint => 1          : non-starter decompositions
41
42our %Comp1st;	# $codepoint => $listname  : may be composed with a next char.
43our %Comp2nd;	# $codepoint => 1          : may be composed with a prev char.
44our %CompList;	# $listname,$2nd  => $codepoint : composite
45
46our $prefix = "UNF_";
47our $structname = "${prefix}complist";
48
49########## definition of Hangul constants ##########
50use constant SBase  => 0xAC00;
51use constant SFinal => 0xD7A3; # SBase -1 + SCount
52use constant SCount =>  11172; # LCount * NCount
53use constant NCount =>    588; # VCount * TCount
54use constant LBase  => 0x1100;
55use constant LFinal => 0x1112;
56use constant LCount =>     19;
57use constant VBase  => 0x1161;
58use constant VFinal => 0x1175;
59use constant VCount =>     21;
60use constant TBase  => 0x11A7;
61use constant TFinal => 0x11C2;
62use constant TCount =>     28;
63
64sub decomposeHangul {
65    my $SIndex = $_[0] - SBase;
66    my $LIndex = int( $SIndex / NCount);
67    my $VIndex = int(($SIndex % NCount) / TCount);
68    my $TIndex =      $SIndex % TCount;
69    my @ret = (
70       LBase + $LIndex,
71       VBase + $VIndex,
72      $TIndex ? (TBase + $TIndex) : (),
73    );
74    wantarray ? @ret : pack('U*', @ret);
75     # any element in @ret greater than 0xFF, so no need of u2n conversion.
76}
77
78########## getting full decomposion ##########
79{
80    my($f, $fh);
81    foreach my $d (@INC) {
82	$f = File::Spec->catfile($d, "unicore", "CompositionExclusions.txt");
83	last if open($fh, $f);
84	$f = File::Spec->catfile($d, "unicode", "CompExcl.txt");
85	last if open($fh, $f);
86	$f = undef;
87    }
88    croak "$PACKAGE: neither unicore/CompositionExclusions.txt "
89	. "nor unicode/CompExcl.txt is found in @INC" unless defined $f;
90
91    while (<$fh>) {
92	next if /^#/ or /^$/;
93	s/#.*//;
94	$Exclus{ hex($1) } = 1 if /([0-9A-Fa-f]+)/;
95    }
96    close $fh;
97}
98
99##
100## converts string "hhhh hhhh hhhh" to a numeric list
101##
102sub _getHexArray { map hex, $_[0] =~ /([0-9A-Fa-f]+)/g }
103
104while ($Combin =~ /(.+)/g) {
105    my @tab = split /\t/, $1;
106    my $ini = hex $tab[0];
107    if ($tab[1] eq '') {
108	$Combin{ $ini } = $tab[2];
109    } else {
110	$Combin{ $_ } = $tab[2] foreach $ini .. hex($tab[1]);
111    }
112}
113
114while ($Decomp =~ /(.+)/g) {
115    my @tab = split /\t/, $1;
116    my $compat = $tab[2] =~ s/<[^>]+>//;
117    my $dec = [ _getHexArray($tab[2]) ]; # decomposition
118    my $ini = hex($tab[0]); # initial decomposable character
119
120    my $listname =
121	@$dec == 2 ? sprintf("${structname}_%06x", $dec->[0]) : 'USELESS';
122		# %04x is bad since it'd place _3046 after _1d157.
123
124    if ($tab[1] eq '') {
125	$Compat{ $ini } = $dec;
126
127	if (! $compat) {
128	    $Canon{ $ini } = $dec;
129
130	    if (@$dec == 2) {
131		if ($Combin{ $dec->[0] }) {
132		    $NonStD{ $ini } = 1;
133		} else {
134		    $CompList{ $listname }{ $dec->[1] } = $ini;
135		    $Comp1st{ $dec->[0] } = $listname;
136		    $Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$ini};
137		}
138	    } elsif (@$dec == 1) {
139		$Single{ $ini } = 1;
140	    } else {
141		croak("Weird Canonical Decomposition of U+$tab[0]");
142	    }
143	}
144    } else {
145	foreach my $u ($ini .. hex($tab[1])) {
146	    $Compat{ $u } = $dec;
147
148	    if (! $compat) {
149		$Canon{ $u } = $dec;
150
151		if (@$dec == 2) {
152		    if ($Combin{ $dec->[0] }) {
153			$NonStD{ $u } = 1;
154		    } else {
155			$CompList{ $listname }{ $dec->[1] } = $u;
156			$Comp1st{ $dec->[0] } = $listname;
157			$Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$u};
158		    }
159		} elsif (@$dec == 1) {
160		    $Single{ $u } = 1;
161		} else {
162		    croak("Weird Canonical Decomposition of U+$tab[0]");
163		}
164	    }
165	}
166    }
167}
168
169# modern HANGUL JUNGSEONG and HANGUL JONGSEONG jamo
170foreach my $j (0x1161..0x1175, 0x11A8..0x11C2) {
171    $Comp2nd{$j} = 1;
172}
173
174sub getCanonList {
175    my @src = @_;
176    my @dec = map {
177	(SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_)
178	    : $Canon{$_} ? @{ $Canon{$_} } : $_
179		} @src;
180    return join(" ",@src) eq join(" ",@dec) ? @dec : getCanonList(@dec);
181    # condition @src == @dec is not ok.
182}
183
184sub getCompatList {
185    my @src = @_;
186    my @dec = map {
187	(SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_)
188	    : $Compat{$_} ? @{ $Compat{$_} } : $_
189		} @src;
190    return join(" ",@src) eq join(" ",@dec) ? @dec : getCompatList(@dec);
191    # condition @src == @dec is not ok.
192}
193
194# exhaustive decomposition
195foreach my $key (keys %Canon) {
196    $Canon{$key}  = [ getCanonList($key) ];
197}
198
199# exhaustive decomposition
200foreach my $key (keys %Compat) {
201    $Compat{$key} = [ getCompatList($key) ];
202}
203
204sub _pack_U {
205    return pack('U*', @_);
206}
207
208sub _U_stringify {
209    sprintf '"%s"', join '',
210	map sprintf("\\x%02x", $_), unpack 'C*', _pack_U(@_);
211}
212
213foreach my $hash (\%Canon, \%Compat) {
214    foreach my $key (keys %$hash) {
215	$hash->{$key} = _U_stringify( @{ $hash->{$key} } );
216    }
217}
218
219########## writing header files ##########
220
221my @boolfunc = (
222    {
223	name => "Exclusion",
224	type => "bool",
225	hash => \%Exclus,
226    },
227    {
228	name => "Singleton",
229	type => "bool",
230	hash => \%Single,
231    },
232    {
233	name => "NonStDecomp",
234	type => "bool",
235	hash => \%NonStD,
236    },
237    {
238	name => "Comp2nd",
239	type => "bool",
240	hash => \%Comp2nd,
241    },
242);
243
244my $file = "unfexc.h";
245open FH, ">$file" or croak "$PACKAGE: $file can't be made";
246binmode FH; select FH;
247
248    print << 'EOF';
249/*
250 * This file is auto-generated by mkheader.
251 * Any changes here will be lost!
252 */
253EOF
254
255foreach my $tbl (@boolfunc) {
256    my @temp = sort {$a <=> $b} keys %{$tbl->{hash}};
257    my $type = $tbl->{type};
258    my $name = $tbl->{name};
259    print "$type is$name (UV uv)\n{\nreturn\n\t";
260
261    while (@temp) {
262	my $cur = shift @temp;
263	if (@temp && $cur + 1 == $temp[0]) {
264	    print "($cur <= uv && uv <= ";
265	    while (@temp && $cur + 1 == $temp[0]) {
266		$cur = shift @temp;
267	    }
268	    print "$cur)";
269	    print "\n\t|| " if @temp;
270	} else {
271	    print "uv == $cur";
272	    print "\n\t|| " if @temp;
273	}
274    }
275    print "\n\t? TRUE : FALSE;\n}\n\n";
276}
277
278close FH;
279
280####################################
281
282my $compinit =
283    "typedef struct { UV nextchar; UV composite; } $structname;\n\n";
284
285foreach my $i (sort keys %CompList) {
286    $compinit .= "$structname $i [] = {\n";
287    $compinit .= join ",\n",
288	map sprintf("\t{ %d, %d }", $_, $CompList{$i}{$_}),
289	    sort {$a <=> $b } keys %{ $CompList{$i} };
290    $compinit .= ",\n{0,0}\n};\n\n"; # with sentinel
291}
292
293my @tripletable = (
294    {
295	file => "unfcmb",
296	name => "combin",
297	type => "STDCHAR",
298	hash => \%Combin,
299	null =>  0,
300    },
301    {
302	file => "unfcan",
303	name => "canon",
304	type => "char*",
305	hash => \%Canon,
306	null => "NULL",
307    },
308    {
309	file => "unfcpt",
310	name => "compat",
311	type => "char*",
312	hash => \%Compat,
313	null => "NULL",
314    },
315    {
316	file => "unfcmp",
317	name => "compos",
318	type => "$structname *",
319	hash => \%Comp1st,
320	null => "NULL",
321	init => $compinit,
322    },
323);
324
325foreach my $tbl (@tripletable) {
326    my $file = "$tbl->{file}.h";
327    my $head = "${prefix}$tbl->{name}";
328    my $type = $tbl->{type};
329    my $hash = $tbl->{hash};
330    my $null = $tbl->{null};
331    my $init = $tbl->{init};
332
333    open FH, ">$file" or croak "$PACKAGE: $file can't be made";
334    binmode FH; select FH;
335    my %val;
336
337    print FH << 'EOF';
338/*
339 * This file is auto-generated by mkheader.
340 * Any changes here will be lost!
341 */
342EOF
343
344    print $init if defined $init;
345
346    foreach my $uv (keys %$hash) {
347	croak sprintf("a Unicode code point 0x%04X over 0x10FFFF.", $uv)
348	    unless $uv <= 0x10FFFF;
349	my @c = unpack 'CCCC', pack 'N', $uv;
350	$val{ $c[1] }{ $c[2] }{ $c[3] } = $hash->{$uv};
351    }
352
353    foreach my $p (sort { $a <=> $b } keys %val) {
354	next if ! $val{ $p };
355	for (my $r = 0; $r < 256; $r++) {
356	    next if ! $val{ $p }{ $r };
357	    printf "$type ${head}_%02x_%02x [256] = {\n", $p, $r;
358	    for (my $c = 0; $c < 256; $c++) {
359		print "\t", defined $val{$p}{$r}{$c}
360		    ? "($type)".$val{$p}{$r}{$c}
361		    : $null;
362		print ','  if $c != 255;
363		print "\n" if $c % 8 == 7;
364	    }
365	    print "};\n\n";
366	}
367    }
368    foreach my $p (sort { $a <=> $b } keys %val) {
369	next if ! $val{ $p };
370	printf "$type* ${head}_%02x [256] = {\n", $p;
371	for (my $r = 0; $r < 256; $r++) {
372	    print $val{ $p }{ $r }
373		? sprintf("${head}_%02x_%02x", $p, $r)
374		: "NULL";
375	    print ','  if $r != 255;
376	    print "\n" if $val{ $p }{ $r } || ($r+1) % 8 == 0;
377	}
378	print "};\n\n";
379    }
380    print "$type** $head [] = {\n";
381    for (my $p = 0; $p <= 0x10; $p++) {
382	print $val{ $p } ? sprintf("${head}_%02x", $p) : "NULL";
383	print ','  if $p != 0x10;
384	print "\n";
385    }
386    print "};\n\n";
387    close FH;
388}
389
390__END__
391