xref: /netbsd-src/external/gpl3/gdb.old/dist/gdb/contrib/words.sh (revision 70f7362772ba52b749c976fb5e86e39a8b2c9afc)
1#!/bin/sh
2
3# Copyright (C) 2019-2020 Free Software Foundation, Inc.
4# This program is free software; you can redistribute it and/or modify
5# it under the terms of the GNU General Public License as published by
6# the Free Software Foundation; either version 3 of the License, or
7# (at your option) any later version.
8#
9# This program is distributed in the hope that it will be useful,
10# but WITHOUT ANY WARRANTY; without even the implied warranty of
11# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12# GNU General Public License for more details.
13#
14# You should have received a copy of the GNU General Public License
15# along with this program.  If not, see <http://www.gnu.org/licenses/>.
16
17# This script intends to facilitate spell checking of source/doc files.
18# It:
19# - transforms the files into a list of lowercase words
20# - prefixes each word with the frequency
21# - filters out words within a frequency range
22# - sorts the words, longest first
23#
24# If '-c' is passed as option, it operates on the C comments only, rather than
25# on the entire file.
26#
27# For:
28# ...
29# $ files=$(find gdb -type f -name "*.c" -o -name "*.h")
30# $ ./gdb/contrib/words.sh -c $files
31# ...
32# it generates a list of ~15000 words prefixed with frequency.
33#
34# This could be used to generate a dictionary that is kept as part of the
35# sources, against which new code can be checked, generating a warning or
36# error.  The hope is that misspellings would trigger this frequently, and rare
37# words rarely, otherwise the burden of updating the dictionary would be too
38# much.
39#
40# And for:
41# ...
42# $ files=$(find gdb -type f -name "*.c" -o -name "*.h")
43# $ ./gdb/contrib/words.sh -c -f 1 $files
44# ...
45# it generates a list of ~5000 words with frequency 1.
46#
47# This can be used to scan for misspellings manually.
48#
49
# Abort with a usage error unless an option is followed by a value.
# Arguments: $1 - the option as typed on the command line
#            $2 - number of arguments left, counting the option itself
require_value () {
    if [ "$2" -lt 2 ]; then
	echo "$0: option '$1' requires an argument" >&2
	exit 2
    fi
}

# Succeed only if $1 is a non-empty string of decimal digits.
is_count () {
    case "$1" in
	'' | *[!0-9]*)
	    return 1
	    ;;
    esac
    return 0
}

# Frequency bounds.  Empty means "not specified yet", 0 means "unbounded".
minfreq=
maxfreq=
# true if only the C comments should be examined.
c=false
while [ $# -gt 0 ]; do
    case "$1" in
	-c)
	    c=true
	    shift
	    ;;
	--freq|-f)
	    # Without this check 'shift 2' fails when the option is the
	    # last argument, which either loops forever or kills the shell
	    # depending on the sh implementation.
	    require_value "$1" $#
	    minfreq=$2
	    maxfreq=$2
	    shift 2
	    ;;
	--min)
	    require_value "$1" $#
	    minfreq=$2
	    if [ "$maxfreq" = "" ]; then
		maxfreq=0
	    fi
	    shift 2
	    ;;
	--max)
	    require_value "$1" $#
	    maxfreq=$2
	    if [ "$minfreq" = "" ]; then
		minfreq=0
	    fi
	    shift 2
	    ;;
	*)
	    # First non-option argument: the rest are input files.
	    break
	    ;;
    esac
done

# No frequency option at all: do not filter.
if [ "$minfreq" = "" ] && [ "$maxfreq" = "" ]; then
    minfreq=0
    maxfreq=0
fi

# The bounds are compared numerically later on; reject anything else now
# with a readable message.
for freq in "$minfreq" "$maxfreq"; do
    if ! is_count "$freq"; then
	echo "$0: invalid frequency '$freq'" >&2
	exit 2
    fi
done
88
# Write the awk program used for '-c' to a temporary file, which is
# removed again when the script exits.
awkfile=$(mktemp) || exit 1
trap 'rm -f "$awkfile"' EXIT

# The here-document delimiter is quoted so the shell passes the awk text
# through untouched.
cat > "$awkfile" <<'EOF'
# Print the text found inside C comments (/* ... */).  One line is
# printed for every input line that is at least partly inside a comment;
# several comments on the same input line are joined with a space.
BEGIN {
    in_comment = 0
}

{
    rest = $0
    text = ""
    nfrags = 0

    # Consume the line delimiter by delimiter, so that any number of
    # comment starts and ends on one line is handled.
    while (1) {
	if (in_comment) {
	    pos = index(rest, "*/")
	    frag = pos ? substr(rest, 1, pos - 1) : rest
	    text = (nfrags ? text " " frag : frag)
	    nfrags++
	    if (!pos)
		break
	    rest = substr(rest, pos + 2)
	    in_comment = 0
	} else {
	    pos = index(rest, "/*")
	    if (!pos)
		break
	    rest = substr(rest, pos + 2)
	    in_comment = 1
	}
    }

    if (nfrags)
	print text
}
EOF
119
# Stabilize sort.
export LC_ALL=C

# The pipeline below:
# - selects the input text: C comments only with '-c', else whole files,
# - replaces punctuation and digits by newlines, giving one word per line
#   (NOTE: this relies on sed treating \n and \t as escapes, as GNU sed
#   does),
# - lowercases, sorts and counts the words,
# - keeps the words within the requested frequency range and drops the
#   empty "word" that adjacent separators produce,
# - sorts the "<count> <word>" lines, longest first.
#
# The frequency bounds are handed to awk with -v rather than pasted into
# the program text, so an odd value cannot change the awk code.
if $c; then
    awk \
	-f "$awkfile" \
	-- "$@"
else
    cat -- "$@"
fi \
    | sed \
	  -e 's/[!"?;:%^$~#{}`&=@,. \t\/_()|<>\+\*-]/\n/g' \
	  -e 's/\[/\n/g' \
	  -e 's/\]/\n/g' \
	  -e "s/'/\n/g" \
	  -e 's/[0-9][0-9]*/\n/g' \
	  -e 's/[ \t]*//g' \
    | tr '[:upper:]' '[:lower:]' \
    | sort \
    | uniq -c \
    | awk -v min="$minfreq" -v max="$maxfreq" '
	  BEGIN {
	      # Force numeric comparison; 0 means "no bound".
	      min += 0
	      max += 0
	  }
	  # NF is 1 when uniq counted empty lines: there is no word then.
	  NF > 1 && (min == 0 || min <= $1) && (max == 0 || $1 <= max) {
	      # Prefix the line length, used only as the sort key.
	      print length($0) " " $0
	  }' \
    | sort -n -r \
    | cut -d ' ' -f 2-
145