xref: /openbsd-src/gnu/usr.bin/perl/cpan/Encode/bin/ucmlint (revision f2da64fbbbf1b03f09f390ab01267c93dfd77c4c)
1#!/usr/local/bin/perl
2#
3# $Id: ucmlint,v 2.2 2008/03/12 09:51:11 dankogai Exp $
4#
5
6BEGIN { pop @INC if $INC[-1] eq '.' }
7use strict;
8our  $VERSION = do { my @r = (q$Revision: 2.2 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
9
10use Getopt::Std;
11our %Opt;
12getopts("Dehfv", \%Opt);
13
14if ($Opt{e}){
15   eval{ require Encode; };
16   $@ and die "can't load Encode : $@";
17}
18
19$Opt{h} and help();
20@ARGV or help();
21
22sub help{
23    print <<"";
24$0 -[Dehfv] [ucm files ...]
25  -D debug mode on
26  -e test with Encode module also (requires perl 5.7.3 or higher)
27  -h shows this message
28  -f forces roundtrip check even for |[123]
29  -v verbose mode
30
31}
32
33$| = 1;
34my (%Hdr, %U2E, %E2U, %Fallback);
35my $in_charmap = 0;
36my $nerror = 0;
37my $nwarning = 0;
38
39sub nit($;$){
40    my ($msg, $level) = @_;
41    my $lstr;
42    if ($level == 2){
43        $lstr = 'notice';
44    }elsif ($level == 1){
45        $lstr = 'warning'; $nwarning++;
46    }else{
47        $lstr = 'error'; $nerror++;
48    }
49    print "$ARGV:$lstr in line $.: $msg\n";
50}
51
52for $ARGV (@ARGV){
53    open UCM, $ARGV or die "$ARGV:$!";
54    %Hdr = %U2E = %E2U = %Fallback = ();
55    $in_charmap = $nerror = $nwarning = 0;
56    $. = 0;
57    while(<UCM>){
58        chomp;
59        s/\s*#.*$//o; /^$/ and next;
60        if ($_ eq "CHARMAP"){
61            $in_charmap = 1;
62            for my $must (qw/code_set_name mb_cur_min mb_cur_max/){
63                exists $Hdr{$must} or nit "<$must> nonexistent";
64            }
65            $Hdr{mb_cur_min} > $Hdr{mb_cur_max}
66                and nit sprintf("mb_cur_min(%d) > mb_cur_max(%d)",
67                                $Hdr{mb_cur_min},$Hdr{mb_cur_max});
68            $in_charmap = 1;
69            next;
70        }
71        unless ($in_charmap){
72            my($hkey, $hvalue) = /^<(\S+)>\s+[\"\']?([^\"\']+)/o or next;
73            $Opt{D} and warn "$hkey => $hvalue";
74            if ($hkey eq "code_set_name"){ # name check
75                exists $Hdr{code_set_name}
76                    and nit "Duplicate <code_set_name>: $hkey";
77            }
78            if ($hkey eq "code_set_alias"){ # alias check
79                $hvalue eq $Hdr{code_set_name}
80                    and nit qq(alias "$hvalue" is already in <code_set_name>);
81            }
82            $Hdr{$hkey} = $hvalue;
83        }else{
84            my $name = $Hdr{code_set_name};
85            my($unistr, $encstr, $fb) = /^(\S+)\s+(\S+)\s(\S+)/o or next;
86            $Opt{v} and nit $_, 2;
87            my $uni = uniparse($unistr);
88            my $enc = encparse($encstr);
89            $fb =~ /^\|([0123])$/ or nit "malformed fallback: $fb";
90            $fb = $1;
91            $Opt{f} and $fb = 0;
92            unless ($fb == 3){ # check uni -> enc
93                if (exists $U2E{$uni}){
94                    nit "dupe encode map: U$uni => $U2E{$uni} and $enc", 1;
95                }else{
96                    $U2E{$uni} = $enc;
97                    $Fallback{$uni}{$enc} = 1 if $fb == 1;
98                    if ($Opt{e}) {
99                        my $e = hex2enc($enc);
100                        my $u = hex2uni($uni);
101                        my $eu = Encode::encode($name, $u);
102                        $e eq $eu
103                            or nit qq(encode('$name', $uni) != $enc);
104                    }
105                }
106            }
107            unless ($fb == 1){  # check enc -> uni
108                if (exists $E2U{$enc}){
109                    nit "dupe decode map: $enc => U$E2U{$enc} and U$uni", 1;
110                }else{
111                    $E2U{$enc} = $uni;
112                    $Fallback{$enc}{$uni} = 1 if $fb == 3;
113                    if ($Opt{e}) {
114                        my $e = hex2enc($enc);
115                        my $u = hex2uni($uni);
116                        $Opt{D} and warn "$uni, $enc";
117                        my $de = Encode::decode($name, $e);
118                        $de eq $u
119                            or nit qq(decode('$name', $enc) != $uni);
120                    }
121                }
122            }
123            # warn "$uni, $enc, $fb";
124        }
125    }
126    $in_charmap or nit "Where is CHARMAP?";
127    checkRT();
128    printf ("$ARGV: %s error%s found\n",
129            ($nerror == 0 ? 'no' : $nerror),
130            ($nerror > 1 ? 's' : ''));
131}
132
133exit;
134
135sub hex2enc{
136    pack("C*", map {hex($_)} split(",", shift));
137}
138sub hex2uni{
139    join("", map { chr(hex($_)) } split(",", shift));
140}
141
142sub checkRT{
143    for my $uni (keys %E2U){
144        my $enc = $U2E{$uni} or next; # okay
145        $E2U{$U2E{$uni}} eq $uni or $Fallback{$uni}{$enc} or
146            nit "RT failure: U$uni => $enc =>U$E2U{$U2E{$uni}}";
147    }
148    for my $enc (keys %E2U){
149        my $uni = $E2U{$enc} or next; # okay
150        $U2E{$E2U{$enc}} eq $enc or $Fallback{$enc}{$uni} or
151            nit "RT failure: $enc => U$uni => $U2E{$E2U{$enc}}";
152    }
153}
154
155
156sub uniparse{
157    my $str = shift;
158    my @u;
159    push @u, $1 while($str =~ /\G<U(.*?)>/ig);
160    for my $u (@u){
161        $u =~ /^([0-9A-Za-z]+)$/o
162            or nit "malformed Unicode character: $u";
163    }
164    return join(',', @u);
165}
166
167sub encparse{
168    my $str = shift;
169    my @e;
170    for my $e (split /\\x/io, $str){
171        $e or next; # first \x
172        $e =~ /^([0-9A-Za-z]{1,2})$/io
173            or nit "Hex $e in $str is bogus";
174        push @e, $1;
175    }
176    return join(',', @e);
177}
178
179
180
181__END__
182
183A UCM file looks like this.
184
185  #
186  # Comments
187  #
188  <code_set_name> "US-ascii" # Required
189  <code_set_alias> "ascii"   # Optional
190  <mb_cur_min> 1             # Required; usually 1
191  <mb_cur_max> 1             # Max. # of bytes/char
192  <subchar> \x3F             # Substitution char
193  #
194  CHARMAP
195  <U0000> \x00 |0 # <control>
196  <U0001> \x01 |0 # <control>
197  <U0002> \x02 |0 # <control>
198  ....
199  <U007C> \x7C |0 # VERTICAL LINE
200  <U007D> \x7D |0 # RIGHT CURLY BRACKET
201  <U007E> \x7E |0 # TILDE
202  <U007F> \x7F |0 # <control>
203  END CHARMAP
204
205