1#!/usr/local/bin/perl 2# 3# $Id: ucmlint,v 2.2 2008/03/12 09:51:11 dankogai Exp $ 4# 5 6BEGIN { pop @INC if $INC[-1] eq '.' } 7use strict; 8our $VERSION = do { my @r = (q$Revision: 2.2 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; 9 10use Getopt::Std; 11our %Opt; 12getopts("Dehfv", \%Opt); 13 14if ($Opt{e}){ 15 eval{ require Encode; }; 16 $@ and die "can't load Encode : $@"; 17} 18 19$Opt{h} and help(); 20@ARGV or help(); 21 22sub help{ 23 print <<""; 24$0 -[Dehfv] [ucm files ...] 25 -D debug mode on 26 -e test with Encode module also (requires perl 5.7.3 or higher) 27 -h shows this message 28 -f forces roundtrip check even for |[123] 29 -v verbose mode 30 31} 32 33$| = 1; 34my (%Hdr, %U2E, %E2U, %Fallback); 35my $in_charmap = 0; 36my $nerror = 0; 37my $nwarning = 0; 38 39sub nit($;$){ 40 my ($msg, $level) = @_; 41 my $lstr; 42 if ($level == 2){ 43 $lstr = 'notice'; 44 }elsif ($level == 1){ 45 $lstr = 'warning'; $nwarning++; 46 }else{ 47 $lstr = 'error'; $nerror++; 48 } 49 print "$ARGV:$lstr in line $.: $msg\n"; 50} 51 52for $ARGV (@ARGV){ 53 open UCM, $ARGV or die "$ARGV:$!"; 54 %Hdr = %U2E = %E2U = %Fallback = (); 55 $in_charmap = $nerror = $nwarning = 0; 56 $. = 0; 57 while(<UCM>){ 58 chomp; 59 s/\s*#.*$//o; /^$/ and next; 60 if ($_ eq "CHARMAP"){ 61 $in_charmap = 1; 62 for my $must (qw/code_set_name mb_cur_min mb_cur_max/){ 63 exists $Hdr{$must} or nit "<$must> nonexistent"; 64 } 65 $Hdr{mb_cur_min} > $Hdr{mb_cur_max} 66 and nit sprintf("mb_cur_min(%d) > mb_cur_max(%d)", 67 $Hdr{mb_cur_min},$Hdr{mb_cur_max}); 68 $in_charmap = 1; 69 next; 70 } 71 unless ($in_charmap){ 72 my($hkey, $hvalue) = /^<(\S+)>\s+[\"\']?([^\"\']+)/o or next; 73 $Opt{D} and warn "$hkey => $hvalue"; 74 if ($hkey eq "code_set_name"){ # name check 75 exists $Hdr{code_set_name} 76 and nit "Duplicate <code_set_name>: $hkey"; 77 } 78 if ($hkey eq "code_set_alias"){ # alias check 79 $hvalue eq $Hdr{code_set_name} 80 and nit qq(alias "$hvalue" is already in <code_set_name>); 81 } 82 $Hdr{$hkey} = $hvalue; 83 }else{ 84 my $name = $Hdr{code_set_name}; 85 my($unistr, $encstr, $fb) = /^(\S+)\s+(\S+)\s(\S+)/o or next; 86 $Opt{v} and nit $_, 2; 87 my $uni = uniparse($unistr); 88 my $enc = encparse($encstr); 89 $fb =~ /^\|([0123])$/ or nit "malformed fallback: $fb"; 90 $fb = $1; 91 $Opt{f} and $fb = 0; 92 unless ($fb == 3){ # check uni -> enc 93 if (exists $U2E{$uni}){ 94 nit "dupe encode map: U$uni => $U2E{$uni} and $enc", 1; 95 }else{ 96 $U2E{$uni} = $enc; 97 $Fallback{$uni}{$enc} = 1 if $fb == 1; 98 if ($Opt{e}) { 99 my $e = hex2enc($enc); 100 my $u = hex2uni($uni); 101 my $eu = Encode::encode($name, $u); 102 $e eq $eu 103 or nit qq(encode('$name', $uni) != $enc); 104 } 105 } 106 } 107 unless ($fb == 1){ # check enc -> uni 108 if (exists $E2U{$enc}){ 109 nit "dupe decode map: $enc => U$E2U{$enc} and U$uni", 1; 110 }else{ 111 $E2U{$enc} = $uni; 112 $Fallback{$enc}{$uni} = 1 if $fb == 3; 113 if ($Opt{e}) { 114 my $e = hex2enc($enc); 115 my $u = hex2uni($uni); 116 $Opt{D} and warn "$uni, $enc"; 117 my $de = Encode::decode($name, $e); 118 $de eq $u 119 or nit qq(decode('$name', $enc) != $uni); 120 } 121 } 122 } 123 # warn "$uni, $enc, $fb"; 124 } 125 } 126 $in_charmap or nit "Where is CHARMAP?"; 127 checkRT(); 128 printf ("$ARGV: %s error%s found\n", 129 ($nerror == 0 ? 'no' : $nerror), 130 ($nerror > 1 ? 's' : '')); 131} 132 133exit; 134 135sub hex2enc{ 136 pack("C*", map {hex($_)} split(",", shift)); 137} 138sub hex2uni{ 139 join("", map { chr(hex($_)) } split(",", shift)); 140} 141 142sub checkRT{ 143 for my $uni (keys %E2U){ 144 my $enc = $U2E{$uni} or next; # okay 145 $E2U{$U2E{$uni}} eq $uni or $Fallback{$uni}{$enc} or 146 nit "RT failure: U$uni => $enc =>U$E2U{$U2E{$uni}}"; 147 } 148 for my $enc (keys %E2U){ 149 my $uni = $E2U{$enc} or next; # okay 150 $U2E{$E2U{$enc}} eq $enc or $Fallback{$enc}{$uni} or 151 nit "RT failure: $enc => U$uni => $U2E{$E2U{$enc}}"; 152 } 153} 154 155 156sub uniparse{ 157 my $str = shift; 158 my @u; 159 push @u, $1 while($str =~ /\G<U(.*?)>/ig); 160 for my $u (@u){ 161 $u =~ /^([0-9A-Za-z]+)$/o 162 or nit "malformed Unicode character: $u"; 163 } 164 return join(',', @u); 165} 166 167sub encparse{ 168 my $str = shift; 169 my @e; 170 for my $e (split /\\x/io, $str){ 171 $e or next; # first \x 172 $e =~ /^([0-9A-Za-z]{1,2})$/io 173 or nit "Hex $e in $str is bogus"; 174 push @e, $1; 175 } 176 return join(',', @e); 177} 178 179 180 181__END__ 182 183A UCM file looks like this. 184 185 # 186 # Comments 187 # 188 <code_set_name> "US-ascii" # Required 189 <code_set_alias> "ascii" # Optional 190 <mb_cur_min> 1 # Required; usually 1 191 <mb_cur_max> 1 # Max. # of bytes/char 192 <subchar> \x3F # Substitution char 193 # 194 CHARMAP 195 <U0000> \x00 |0 # <control> 196 <U0001> \x01 |0 # <control> 197 <U0002> \x02 |0 # <control> 198 .... 199 <U007C> \x7C |0 # VERTICAL LINE 200 <U007D> \x7D |0 # RIGHT CURLY BRACKET 201 <U007E> \x7E |0 # TILDE 202 <U007F> \x7F |0 # <control> 203 END CHARMAP 204 205