xref: /netbsd-src/external/bsd/less/dist/mkutable (revision e4a6e799a67c2028562d75b4e61407b22434aa36)
1*e4a6e799Ssimonb#!/usr/bin/env perl
2*e4a6e799Ssimonbuse strict;
3*e4a6e799Ssimonb
4*e4a6e799Ssimonbmy $USAGE = <<__EOF__;
5*e4a6e799Ssimonb   usage: mkutable [-n] [-f#] type... [--] [<] UnicodeData.txt
6*e4a6e799Ssimonb          -n = take non-matching types
7*e4a6e799Ssimonb          -f = zero-based type field (default 2)
8*e4a6e799Ssimonb__EOF__
9*e4a6e799Ssimonb
10*e4a6e799Ssimonbuse Getopt::Std;
11*e4a6e799Ssimonbuse vars qw( $opt_f $opt_n );
12*e4a6e799Ssimonb
13*e4a6e799Ssimonbmy $type_field = 2;
14*e4a6e799Ssimonb
15*e4a6e799Ssimonb# Override Unicode tables for certain control chars
16*e4a6e799Ssimonb# that are expected to be found in normal text files.
17*e4a6e799Ssimonbmy %force_space = (
18*e4a6e799Ssimonb    0x08 => 1, # backspace
19*e4a6e799Ssimonb    0x09 => 1, # tab
20*e4a6e799Ssimonb    0x0a => 1, # newline
21*e4a6e799Ssimonb    0x0c => 1, # form feed
22*e4a6e799Ssimonb    0x0d => 1, # carriage return
23*e4a6e799Ssimonb);
24*e4a6e799Ssimonb
25*e4a6e799Ssimonb# Hangul Jamo medial vowels and final consonants should be zero width.
26*e4a6e799Ssimonbmy @force_compose = (
27*e4a6e799Ssimonb    [0x1160, 0x11ff],
28*e4a6e799Ssimonb    [0xd7b0, 0xd7c6],
29*e4a6e799Ssimonb    [0xd7cb, 0xd7fb]
30*e4a6e799Ssimonb);
31*e4a6e799Ssimonb
32*e4a6e799Ssimonbexit (main() ? 0 : 1);
33*e4a6e799Ssimonb
34*e4a6e799Ssimonbsub main {
35*e4a6e799Ssimonb    my $args = join ' ', @ARGV;
36*e4a6e799Ssimonb    die $USAGE if not getopts('f:n');
37*e4a6e799Ssimonb    $type_field = $opt_f if $opt_f;
38*e4a6e799Ssimonb
39*e4a6e799Ssimonb    my %types;
40*e4a6e799Ssimonb    my $arg;
41*e4a6e799Ssimonb    while ($arg = shift @ARGV) {
42*e4a6e799Ssimonb        last if $arg eq '--';
43*e4a6e799Ssimonb        $types{$arg} = 1;
44*e4a6e799Ssimonb    }
45*e4a6e799Ssimonb    my %out = ( 'types' => \%types );
46*e4a6e799Ssimonb
47*e4a6e799Ssimonb    my %force_compose;
48*e4a6e799Ssimonb    foreach my $comp (@force_compose) {
49*e4a6e799Ssimonb        my ($lo,$hi) = @$comp;
50*e4a6e799Ssimonb        for (my $ch = $lo; $ch <= $hi; ++$ch) {
51*e4a6e799Ssimonb            $force_compose{$ch} = 1;
52*e4a6e799Ssimonb        }
53*e4a6e799Ssimonb    }
54*e4a6e799Ssimonb
55*e4a6e799Ssimonb    my $date = `date`;
56*e4a6e799Ssimonb    chomp $date;
57*e4a6e799Ssimonb    print "/* Generated by \"$0 $args\" on $date */\n";
58*e4a6e799Ssimonb
59*e4a6e799Ssimonb    my $last_code = 0;
60*e4a6e799Ssimonb    my $start_range = 0;
61*e4a6e799Ssimonb    while (<>) {
62*e4a6e799Ssimonb        chomp;
63*e4a6e799Ssimonb        s/#.*//;
64*e4a6e799Ssimonb        my @fields = split /;/;
65*e4a6e799Ssimonb        next if not @fields;
66*e4a6e799Ssimonb        my ($lo_code, $hi_code);
67*e4a6e799Ssimonb        my $codes = $fields[0];
68*e4a6e799Ssimonb        if ($codes =~ /(\w+)\.\.(\w+)/) {
69*e4a6e799Ssimonb            $lo_code = hex $1;
70*e4a6e799Ssimonb            $hi_code = hex $2;
71*e4a6e799Ssimonb        } else {
72*e4a6e799Ssimonb            $lo_code = $hi_code = hex $codes;
73*e4a6e799Ssimonb        }
74*e4a6e799Ssimonb        if ($fields[1] =~ /, First>$/) {
75*e4a6e799Ssimonb            die "invalid Unicode data: First with range" if $hi_code != $lo_code;
76*e4a6e799Ssimonb            $start_range = $lo_code;
77*e4a6e799Ssimonb            next;
78*e4a6e799Ssimonb        }
79*e4a6e799Ssimonb        if ($fields[1] =~ /, Last>$/) {
80*e4a6e799Ssimonb            die "invalid Unicode data: Last without First" if not $start_range;
81*e4a6e799Ssimonb            $lo_code = $start_range;
82*e4a6e799Ssimonb            $start_range = 0;
83*e4a6e799Ssimonb        } elsif ($start_range) {
84*e4a6e799Ssimonb            die "invalid Unicode data: First without Last";
85*e4a6e799Ssimonb        }
86*e4a6e799Ssimonb        my $type = $fields[$type_field];
87*e4a6e799Ssimonb        $type =~ s/\s//g;
88*e4a6e799Ssimonb        for ($last_code = $lo_code; $last_code <= $hi_code; ++$last_code) {
89*e4a6e799Ssimonb            output(\%out, $last_code,
90*e4a6e799Ssimonb                $force_space{$last_code} ? 'Zs' : $force_compose{$last_code} ? 'Mn' : $type);
91*e4a6e799Ssimonb        }
92*e4a6e799Ssimonb    }
93*e4a6e799Ssimonb    output(\%out, $last_code);
94*e4a6e799Ssimonb    return 1;
95*e4a6e799Ssimonb}
96*e4a6e799Ssimonb
97*e4a6e799Ssimonbsub output {
98*e4a6e799Ssimonb    my ($out, $code, $type) = @_;
99*e4a6e799Ssimonb    my $type_ok = ($type and ${${$out}{types}}{$type});
100*e4a6e799Ssimonb    $type_ok = not $type_ok if $opt_n;
101*e4a6e799Ssimonb    my $prev_code = $$out{prev_code};
102*e4a6e799Ssimonb
103*e4a6e799Ssimonb    if (not $type_ok) {
104*e4a6e799Ssimonb        end_run($out, $prev_code);
105*e4a6e799Ssimonb    } elsif (not $$out{in_run} or $type ne $$out{run_type} or $code != $prev_code+1) {
106*e4a6e799Ssimonb        end_run($out, $prev_code);
107*e4a6e799Ssimonb        start_run($out, $code, $type);
108*e4a6e799Ssimonb    }
109*e4a6e799Ssimonb    $$out{prev_code} = $code;
110*e4a6e799Ssimonb}
111*e4a6e799Ssimonb
112*e4a6e799Ssimonbsub start_run {
113*e4a6e799Ssimonb    my ($out, $code, $type) = @_;
114*e4a6e799Ssimonb    $$out{start_code} = $code;
115*e4a6e799Ssimonb    $$out{prev_code} = $code;
116*e4a6e799Ssimonb    $$out{run_type} = $type;
117*e4a6e799Ssimonb    $$out{in_run} = 1;
118*e4a6e799Ssimonb}
119*e4a6e799Ssimonb
120*e4a6e799Ssimonbsub end_run {
121*e4a6e799Ssimonb    my ($out, $code) = @_;
122*e4a6e799Ssimonb    return if not $$out{in_run};
123*e4a6e799Ssimonb    printf "\t{ 0x%04x, 0x%04x }, /* %s */\n", $$out{start_code}, $code, $$out{run_type};
124*e4a6e799Ssimonb    $$out{in_run} = 0;
125*e4a6e799Ssimonb}
126