xref: /plan9/sys/src/cmd/dict/comfix.awk (revision 219b2ee8daee37f4aad58d63f21287faa8e4ffdc)
# When the raw index has a lot of entries like
# 1578324	problematico, a, ci, che
# apply this algorithm:
#  treat things after a comma as suffixes
#  for each suffix:
#      if single letter, replace last letter
#      else search backwards for beginning of suffix
#      and if it leads to an old suffix of approximately
#      the same length, replace that suffix
# This will still leave some commas to fix by hand
# Usage: awk -F'	' -f comfix.awk rawindex > newrawindex
12
# Records with exactly two fields: $1 is copied to every output line,
# $2 is the headword text.
# When $2 contains a comma it is split at each comma (together with any
# blanks that follow it).  The first piece is the base word and is printed
# as is; each later piece is a suffix.  matchsuflen() says how many trailing
# characters of the base word the suffix should replace; when it returns 0
# the suffix cannot be placed, so the base word and suffix are printed
# joined by ", " and left for fixing by hand.
# When $2 has no comma the record is printed unchanged.
NF == 2	{
		if(index($2, ",") != 0 && length($2) != 0) {
			nparts = split($2, part, /,[ ]*/)
			base = part[1]
			printf "%s\t%s\n", $1, base
			for(j = 2; j <= nparts; j++) {
				ending = part[j]
				cut = matchsuflen(base, ending)
				if(cut == 0)
					printf "%s\t%s\n", $1, base ", " ending
				else
					printf "%s\t%s\n", $1, substr(base, 1, length(base)-cut) ending
			}
		} else
			print $0
	}
# Any record that does not have exactly two fields is copied through unchanged.
NF != 2 {
	print
	}
35
# matchsuflen(w, suf): number of trailing characters of w that suf should
# replace, or 0 if no acceptable replacement point is found.
#   - a one-character suf always replaces exactly the last character
#   - otherwise w is scanned from its end for the first character of suf;
#     the count of characters from that position to the end of w is the
#     candidate, accepted only if it differs from length(suf) by at most 3
# The names after the wide gap in the parameter list are local variables.
function matchsuflen(w, suf,		nw,ns,first,pos,tail,diff)
{
	ns = length(suf)
	if(ns == 1)
		return 1

	nw = length(w)
	first = substr(suf, 1, 1)
	# walk from the last character of w back towards the first
	for(pos = nw; pos >= 1; pos--)
		if(substr(w, pos, 1) == first)
			break
	if(pos < 1)
		return 0

	tail = nw - pos + 1
	diff = tail - ns
	if(diff < 0)
		diff = -diff
	return (diff > 3) ? 0 : tail
}
57