1*e4a6e799Ssimonb#!/usr/bin/env perl 2*e4a6e799Ssimonbuse strict; 3*e4a6e799Ssimonb 4*e4a6e799Ssimonbmy $USAGE = <<__EOF__; 5*e4a6e799Ssimonb usage: mkutable [-n] [-f#] type... [--] [<] UnicodeData.txt 6*e4a6e799Ssimonb -n = take non-matching types 7*e4a6e799Ssimonb -f = zero-based type field (default 2) 8*e4a6e799Ssimonb__EOF__ 9*e4a6e799Ssimonb 10*e4a6e799Ssimonbuse Getopt::Std; 11*e4a6e799Ssimonbuse vars qw( $opt_f $opt_n ); 12*e4a6e799Ssimonb 13*e4a6e799Ssimonbmy $type_field = 2; 14*e4a6e799Ssimonb 15*e4a6e799Ssimonb# Override Unicode tables for certain control chars 16*e4a6e799Ssimonb# that are expected to be found in normal text files. 17*e4a6e799Ssimonbmy %force_space = ( 18*e4a6e799Ssimonb 0x08 => 1, # backspace 19*e4a6e799Ssimonb 0x09 => 1, # tab 20*e4a6e799Ssimonb 0x0a => 1, # newline 21*e4a6e799Ssimonb 0x0c => 1, # form feed 22*e4a6e799Ssimonb 0x0d => 1, # carriage return 23*e4a6e799Ssimonb); 24*e4a6e799Ssimonb 25*e4a6e799Ssimonb# Hangul Jamo medial vowels and final consonants should be zero width. 26*e4a6e799Ssimonbmy @force_compose = ( 27*e4a6e799Ssimonb [0x1160, 0x11ff], 28*e4a6e799Ssimonb [0xd7b0, 0xd7c6], 29*e4a6e799Ssimonb [0xd7cb, 0xd7fb] 30*e4a6e799Ssimonb); 31*e4a6e799Ssimonb 32*e4a6e799Ssimonbexit (main() ? 0 : 1); 33*e4a6e799Ssimonb 34*e4a6e799Ssimonbsub main { 35*e4a6e799Ssimonb my $args = join ' ', @ARGV; 36*e4a6e799Ssimonb die $USAGE if not getopts('f:n'); 37*e4a6e799Ssimonb $type_field = $opt_f if $opt_f; 38*e4a6e799Ssimonb 39*e4a6e799Ssimonb my %types; 40*e4a6e799Ssimonb my $arg; 41*e4a6e799Ssimonb while ($arg = shift @ARGV) { 42*e4a6e799Ssimonb last if $arg eq '--'; 43*e4a6e799Ssimonb $types{$arg} = 1; 44*e4a6e799Ssimonb } 45*e4a6e799Ssimonb my %out = ( 'types' => \%types ); 46*e4a6e799Ssimonb 47*e4a6e799Ssimonb my %force_compose; 48*e4a6e799Ssimonb foreach my $comp (@force_compose) { 49*e4a6e799Ssimonb my ($lo,$hi) = @$comp; 50*e4a6e799Ssimonb for (my $ch = $lo; $ch <= $hi; ++$ch) { 51*e4a6e799Ssimonb $force_compose{$ch} = 1; 52*e4a6e799Ssimonb } 53*e4a6e799Ssimonb } 54*e4a6e799Ssimonb 55*e4a6e799Ssimonb my $date = `date`; 56*e4a6e799Ssimonb chomp $date; 57*e4a6e799Ssimonb print "/* Generated by \"$0 $args\" on $date */\n"; 58*e4a6e799Ssimonb 59*e4a6e799Ssimonb my $last_code = 0; 60*e4a6e799Ssimonb my $start_range = 0; 61*e4a6e799Ssimonb while (<>) { 62*e4a6e799Ssimonb chomp; 63*e4a6e799Ssimonb s/#.*//; 64*e4a6e799Ssimonb my @fields = split /;/; 65*e4a6e799Ssimonb next if not @fields; 66*e4a6e799Ssimonb my ($lo_code, $hi_code); 67*e4a6e799Ssimonb my $codes = $fields[0]; 68*e4a6e799Ssimonb if ($codes =~ /(\w+)\.\.(\w+)/) { 69*e4a6e799Ssimonb $lo_code = hex $1; 70*e4a6e799Ssimonb $hi_code = hex $2; 71*e4a6e799Ssimonb } else { 72*e4a6e799Ssimonb $lo_code = $hi_code = hex $codes; 73*e4a6e799Ssimonb } 74*e4a6e799Ssimonb if ($fields[1] =~ /, First>$/) { 75*e4a6e799Ssimonb die "invalid Unicode data: First with range" if $hi_code != $lo_code; 76*e4a6e799Ssimonb $start_range = $lo_code; 77*e4a6e799Ssimonb next; 78*e4a6e799Ssimonb } 79*e4a6e799Ssimonb if ($fields[1] =~ /, Last>$/) { 80*e4a6e799Ssimonb die "invalid Unicode data: Last without First" if not $start_range; 81*e4a6e799Ssimonb $lo_code = $start_range; 82*e4a6e799Ssimonb $start_range = 0; 83*e4a6e799Ssimonb } elsif ($start_range) { 84*e4a6e799Ssimonb die "invalid Unicode data: First without Last"; 85*e4a6e799Ssimonb } 86*e4a6e799Ssimonb my $type = $fields[$type_field]; 87*e4a6e799Ssimonb $type =~ s/\s//g; 88*e4a6e799Ssimonb for ($last_code = $lo_code; $last_code <= $hi_code; ++$last_code) { 89*e4a6e799Ssimonb output(\%out, $last_code, 90*e4a6e799Ssimonb $force_space{$last_code} ? 'Zs' : $force_compose{$last_code} ? 'Mn' : $type); 91*e4a6e799Ssimonb } 92*e4a6e799Ssimonb } 93*e4a6e799Ssimonb output(\%out, $last_code); 94*e4a6e799Ssimonb return 1; 95*e4a6e799Ssimonb} 96*e4a6e799Ssimonb 97*e4a6e799Ssimonbsub output { 98*e4a6e799Ssimonb my ($out, $code, $type) = @_; 99*e4a6e799Ssimonb my $type_ok = ($type and ${${$out}{types}}{$type}); 100*e4a6e799Ssimonb $type_ok = not $type_ok if $opt_n; 101*e4a6e799Ssimonb my $prev_code = $$out{prev_code}; 102*e4a6e799Ssimonb 103*e4a6e799Ssimonb if (not $type_ok) { 104*e4a6e799Ssimonb end_run($out, $prev_code); 105*e4a6e799Ssimonb } elsif (not $$out{in_run} or $type ne $$out{run_type} or $code != $prev_code+1) { 106*e4a6e799Ssimonb end_run($out, $prev_code); 107*e4a6e799Ssimonb start_run($out, $code, $type); 108*e4a6e799Ssimonb } 109*e4a6e799Ssimonb $$out{prev_code} = $code; 110*e4a6e799Ssimonb} 111*e4a6e799Ssimonb 112*e4a6e799Ssimonbsub start_run { 113*e4a6e799Ssimonb my ($out, $code, $type) = @_; 114*e4a6e799Ssimonb $$out{start_code} = $code; 115*e4a6e799Ssimonb $$out{prev_code} = $code; 116*e4a6e799Ssimonb $$out{run_type} = $type; 117*e4a6e799Ssimonb $$out{in_run} = 1; 118*e4a6e799Ssimonb} 119*e4a6e799Ssimonb 120*e4a6e799Ssimonbsub end_run { 121*e4a6e799Ssimonb my ($out, $code) = @_; 122*e4a6e799Ssimonb return if not $$out{in_run}; 123*e4a6e799Ssimonb printf "\t{ 0x%04x, 0x%04x }, /* %s */\n", $$out{start_code}, $code, $$out{run_type}; 124*e4a6e799Ssimonb $$out{in_run} = 0; 125*e4a6e799Ssimonb} 126