1*4246b616SDavid du Colombier #include <u.h>
2*4246b616SDavid du Colombier #include <libc.h>
3*4246b616SDavid du Colombier #include <bio.h>
4*4246b616SDavid du Colombier #include <regexp.h>
5*4246b616SDavid du Colombier #include "hash.h"
6*4246b616SDavid du Colombier
7*4246b616SDavid du Colombier enum
8*4246b616SDavid du Colombier {
9*4246b616SDavid du Colombier MAXTAB = 256,
10*4246b616SDavid du Colombier MAXBEST = 32,
11*4246b616SDavid du Colombier };
12*4246b616SDavid du Colombier
13*4246b616SDavid du Colombier typedef struct Table Table;
14*4246b616SDavid du Colombier struct Table
15*4246b616SDavid du Colombier {
16*4246b616SDavid du Colombier char *file;
17*4246b616SDavid du Colombier Hash *hash;
18*4246b616SDavid du Colombier int nmsg;
19*4246b616SDavid du Colombier };
20*4246b616SDavid du Colombier
21*4246b616SDavid du Colombier typedef struct Word Word;
22*4246b616SDavid du Colombier struct Word
23*4246b616SDavid du Colombier {
24*4246b616SDavid du Colombier Stringtab *s; /* from hmsg */
25*4246b616SDavid du Colombier int count[MAXTAB]; /* counts from each table */
26*4246b616SDavid du Colombier double p[MAXTAB]; /* probabilities from each table */
27*4246b616SDavid du Colombier double mp; /* max probability */
28*4246b616SDavid du Colombier int mi; /* w.p[w.mi] = w.mp */
29*4246b616SDavid du Colombier };
30*4246b616SDavid du Colombier
31*4246b616SDavid du Colombier Table tab[MAXTAB];
32*4246b616SDavid du Colombier int ntab;
33*4246b616SDavid du Colombier
34*4246b616SDavid du Colombier Word best[MAXBEST];
35*4246b616SDavid du Colombier int mbest;
36*4246b616SDavid du Colombier int nbest;
37*4246b616SDavid du Colombier
38*4246b616SDavid du Colombier int debug;
39*4246b616SDavid du Colombier
40*4246b616SDavid du Colombier void
usage(void)41*4246b616SDavid du Colombier usage(void)
42*4246b616SDavid du Colombier {
43*4246b616SDavid du Colombier fprint(2, "usage: bayes [-D] [-m maxword] boxhash ... ~ msghash ...\n");
44*4246b616SDavid du Colombier exits("usage");
45*4246b616SDavid du Colombier }
46*4246b616SDavid du Colombier
47*4246b616SDavid du Colombier void*
emalloc(int n)48*4246b616SDavid du Colombier emalloc(int n)
49*4246b616SDavid du Colombier {
50*4246b616SDavid du Colombier void *v;
51*4246b616SDavid du Colombier
52*4246b616SDavid du Colombier v = mallocz(n, 1);
53*4246b616SDavid du Colombier if(v == nil)
54*4246b616SDavid du Colombier sysfatal("out of memory");
55*4246b616SDavid du Colombier return v;
56*4246b616SDavid du Colombier }
57*4246b616SDavid du Colombier
58*4246b616SDavid du Colombier void
noteword(Word * w)59*4246b616SDavid du Colombier noteword(Word *w)
60*4246b616SDavid du Colombier {
61*4246b616SDavid du Colombier int i;
62*4246b616SDavid du Colombier
63*4246b616SDavid du Colombier for(i=nbest-1; i>=0; i--)
64*4246b616SDavid du Colombier if(w->mp < best[i].mp)
65*4246b616SDavid du Colombier break;
66*4246b616SDavid du Colombier i++;
67*4246b616SDavid du Colombier
68*4246b616SDavid du Colombier if(i >= mbest)
69*4246b616SDavid du Colombier return;
70*4246b616SDavid du Colombier if(nbest == mbest)
71*4246b616SDavid du Colombier nbest--;
72*4246b616SDavid du Colombier if(i < nbest)
73*4246b616SDavid du Colombier memmove(&best[i+1], &best[i], (nbest-i)*sizeof(best[0]));
74*4246b616SDavid du Colombier best[i] = *w;
75*4246b616SDavid du Colombier nbest++;
76*4246b616SDavid du Colombier }
77*4246b616SDavid du Colombier
78*4246b616SDavid du Colombier Hash*
hread(char * s)79*4246b616SDavid du Colombier hread(char *s)
80*4246b616SDavid du Colombier {
81*4246b616SDavid du Colombier Hash *h;
82*4246b616SDavid du Colombier Biobuf *b;
83*4246b616SDavid du Colombier
84*4246b616SDavid du Colombier if((b = Bopenlock(s, OREAD)) == nil)
85*4246b616SDavid du Colombier sysfatal("open %s: %r", s);
86*4246b616SDavid du Colombier
87*4246b616SDavid du Colombier h = emalloc(sizeof(Hash));
88*4246b616SDavid du Colombier Breadhash(b, h, 1);
89*4246b616SDavid du Colombier Bterm(b);
90*4246b616SDavid du Colombier return h;
91*4246b616SDavid du Colombier }
92*4246b616SDavid du Colombier
93*4246b616SDavid du Colombier void
main(int argc,char ** argv)94*4246b616SDavid du Colombier main(int argc, char **argv)
95*4246b616SDavid du Colombier {
96*4246b616SDavid du Colombier int i, j, a, mi, oi, tot, keywords;
97*4246b616SDavid du Colombier double totp, p, xp[MAXTAB];
98*4246b616SDavid du Colombier Hash *hmsg;
99*4246b616SDavid du Colombier Word w;
100*4246b616SDavid du Colombier Stringtab *s, *t;
101*4246b616SDavid du Colombier Biobuf bout;
102*4246b616SDavid du Colombier
103*4246b616SDavid du Colombier mbest = 15;
104*4246b616SDavid du Colombier keywords = 0;
105*4246b616SDavid du Colombier ARGBEGIN{
106*4246b616SDavid du Colombier case 'D':
107*4246b616SDavid du Colombier debug = 1;
108*4246b616SDavid du Colombier break;
109*4246b616SDavid du Colombier case 'k':
110*4246b616SDavid du Colombier keywords = 1;
111*4246b616SDavid du Colombier break;
112*4246b616SDavid du Colombier case 'm':
113*4246b616SDavid du Colombier mbest = atoi(EARGF(usage()));
114*4246b616SDavid du Colombier if(mbest > MAXBEST)
115*4246b616SDavid du Colombier sysfatal("cannot keep more than %d words", MAXBEST);
116*4246b616SDavid du Colombier break;
117*4246b616SDavid du Colombier default:
118*4246b616SDavid du Colombier usage();
119*4246b616SDavid du Colombier }ARGEND
120*4246b616SDavid du Colombier
121*4246b616SDavid du Colombier for(i=0; i<argc; i++)
122*4246b616SDavid du Colombier if(strcmp(argv[i], "~") == 0)
123*4246b616SDavid du Colombier break;
124*4246b616SDavid du Colombier
125*4246b616SDavid du Colombier if(i > MAXTAB)
126*4246b616SDavid du Colombier sysfatal("cannot handle more than %d tables", MAXTAB);
127*4246b616SDavid du Colombier
128*4246b616SDavid du Colombier if(i+1 >= argc)
129*4246b616SDavid du Colombier usage();
130*4246b616SDavid du Colombier
131*4246b616SDavid du Colombier for(i=0; i<argc; i++){
132*4246b616SDavid du Colombier if(strcmp(argv[i], "~") == 0)
133*4246b616SDavid du Colombier break;
134*4246b616SDavid du Colombier tab[ntab].file = argv[i];
135*4246b616SDavid du Colombier tab[ntab].hash = hread(argv[i]);
136*4246b616SDavid du Colombier s = findstab(tab[ntab].hash, "*nmsg*", 6, 1);
137*4246b616SDavid du Colombier if(s == nil || s->count == 0)
138*4246b616SDavid du Colombier tab[ntab].nmsg = 1;
139*4246b616SDavid du Colombier else
140*4246b616SDavid du Colombier tab[ntab].nmsg = s->count;
141*4246b616SDavid du Colombier ntab++;
142*4246b616SDavid du Colombier }
143*4246b616SDavid du Colombier
144*4246b616SDavid du Colombier Binit(&bout, 1, OWRITE);
145*4246b616SDavid du Colombier
146*4246b616SDavid du Colombier oi = ++i;
147*4246b616SDavid du Colombier for(a=i; a<argc; a++){
148*4246b616SDavid du Colombier hmsg = hread(argv[a]);
149*4246b616SDavid du Colombier nbest = 0;
150*4246b616SDavid du Colombier for(s=hmsg->all; s; s=s->link){
151*4246b616SDavid du Colombier w.s = s;
152*4246b616SDavid du Colombier tot = 0;
153*4246b616SDavid du Colombier totp = 0.0;
154*4246b616SDavid du Colombier for(i=0; i<ntab; i++){
155*4246b616SDavid du Colombier t = findstab(tab[i].hash, s->str, s->n, 0);
156*4246b616SDavid du Colombier if(t == nil)
157*4246b616SDavid du Colombier w.count[i] = 0;
158*4246b616SDavid du Colombier else
159*4246b616SDavid du Colombier w.count[i] = t->count;
160*4246b616SDavid du Colombier tot += w.count[i];
161*4246b616SDavid du Colombier p = w.count[i]/(double)tab[i].nmsg;
162*4246b616SDavid du Colombier if(p >= 1.0)
163*4246b616SDavid du Colombier p = 1.0;
164*4246b616SDavid du Colombier w.p[i] = p;
165*4246b616SDavid du Colombier totp += p;
166*4246b616SDavid du Colombier }
167*4246b616SDavid du Colombier
168*4246b616SDavid du Colombier if(tot < 5){ /* word does not appear enough; give to box 0 */
169*4246b616SDavid du Colombier w.p[0] = 0.5;
170*4246b616SDavid du Colombier for(i=1; i<ntab; i++)
171*4246b616SDavid du Colombier w.p[i] = 0.1;
172*4246b616SDavid du Colombier w.mp = 0.5;
173*4246b616SDavid du Colombier w.mi = 0;
174*4246b616SDavid du Colombier noteword(&w);
175*4246b616SDavid du Colombier continue;
176*4246b616SDavid du Colombier }
177*4246b616SDavid du Colombier
178*4246b616SDavid du Colombier w.mp = 0.0;
179*4246b616SDavid du Colombier for(i=0; i<ntab; i++){
180*4246b616SDavid du Colombier p = w.p[i];
181*4246b616SDavid du Colombier p /= totp;
182*4246b616SDavid du Colombier if(p < 0.01)
183*4246b616SDavid du Colombier p = 0.01;
184*4246b616SDavid du Colombier else if(p > 0.99)
185*4246b616SDavid du Colombier p = 0.99;
186*4246b616SDavid du Colombier if(p > w.mp){
187*4246b616SDavid du Colombier w.mp = p;
188*4246b616SDavid du Colombier w.mi = i;
189*4246b616SDavid du Colombier }
190*4246b616SDavid du Colombier w.p[i] = p;
191*4246b616SDavid du Colombier }
192*4246b616SDavid du Colombier noteword(&w);
193*4246b616SDavid du Colombier }
194*4246b616SDavid du Colombier
195*4246b616SDavid du Colombier totp = 0.0;
196*4246b616SDavid du Colombier for(i=0; i<ntab; i++){
197*4246b616SDavid du Colombier p = 1.0;
198*4246b616SDavid du Colombier for(j=0; j<nbest; j++)
199*4246b616SDavid du Colombier p *= best[j].p[i];
200*4246b616SDavid du Colombier xp[i] = p;
201*4246b616SDavid du Colombier totp += p;
202*4246b616SDavid du Colombier }
203*4246b616SDavid du Colombier for(i=0; i<ntab; i++)
204*4246b616SDavid du Colombier xp[i] /= totp;
205*4246b616SDavid du Colombier mi = 0;
206*4246b616SDavid du Colombier for(i=1; i<ntab; i++)
207*4246b616SDavid du Colombier if(xp[i] > xp[mi])
208*4246b616SDavid du Colombier mi = i;
209*4246b616SDavid du Colombier if(oi != argc-1)
210*4246b616SDavid du Colombier Bprint(&bout, "%s: ", argv[a]);
211*4246b616SDavid du Colombier Bprint(&bout, "%s %f", tab[mi].file, xp[mi]);
212*4246b616SDavid du Colombier if(keywords){
213*4246b616SDavid du Colombier for(i=0; i<nbest; i++){
214*4246b616SDavid du Colombier Bprint(&bout, " ");
215*4246b616SDavid du Colombier Bwrite(&bout, best[i].s->str, best[i].s->n);
216*4246b616SDavid du Colombier Bprint(&bout, " %f", best[i].p[mi]);
217*4246b616SDavid du Colombier }
218*4246b616SDavid du Colombier }
219*4246b616SDavid du Colombier freehash(hmsg);
220*4246b616SDavid du Colombier Bprint(&bout, "\n");
221*4246b616SDavid du Colombier if(debug){
222*4246b616SDavid du Colombier for(i=0; i<nbest; i++){
223*4246b616SDavid du Colombier Bwrite(&bout, best[i].s->str, best[i].s->n);
224*4246b616SDavid du Colombier Bprint(&bout, " %f", best[i].p[mi]);
225*4246b616SDavid du Colombier if(best[i].p[mi] < best[i].mp)
226*4246b616SDavid du Colombier Bprint(&bout, " (%f %s)", best[i].mp, tab[best[i].mi].file);
227*4246b616SDavid du Colombier Bprint(&bout, "\n");
228*4246b616SDavid du Colombier }
229*4246b616SDavid du Colombier }
230*4246b616SDavid du Colombier }
231*4246b616SDavid du Colombier Bterm(&bout);
232*4246b616SDavid du Colombier }
233