/*
 * RFC822 message tokenizer (really feature generator) for spam filter.
 *
 * See Paul Graham's musings on spam filtering for theory.
 */
6*4246b616SDavid du Colombier
7*4246b616SDavid du Colombier #include <u.h>
8*4246b616SDavid du Colombier #include <libc.h>
9*4246b616SDavid du Colombier #include <bio.h>
10*4246b616SDavid du Colombier #include <regexp.h>
11*4246b616SDavid du Colombier #include <ctype.h>
12*4246b616SDavid du Colombier #include "dfa.h"
13*4246b616SDavid du Colombier
14*4246b616SDavid du Colombier void buildre(Dreprog*[3]);
15*4246b616SDavid du Colombier int debug;
16*4246b616SDavid du Colombier char *refile = "/mail/lib/classify.re";
17*4246b616SDavid du Colombier int maxtoklen = 20;
18*4246b616SDavid du Colombier int trim(char*);
19*4246b616SDavid du Colombier
/*
 * Print the command-line synopsis on standard error and exit with
 * status "usage".  The synopsis lists every option the ARGBEGIN
 * switch in main accepts: -D, -n and -r.
 */
void
usage(void)
{
	fprint(2, "usage: msgtok [-D] [-n maxtoklen] [-r /mail/lib/classify.re] [file]\n");
	exits("usage");
}
26*4246b616SDavid du Colombier
27*4246b616SDavid du Colombier void
main(int argc,char ** argv)28*4246b616SDavid du Colombier main(int argc, char **argv)
29*4246b616SDavid du Colombier {
30*4246b616SDavid du Colombier int i, hdr, n, eof, off;
31*4246b616SDavid du Colombier Dreprog *re[3];
32*4246b616SDavid du Colombier int m[3];
33*4246b616SDavid du Colombier char *p, *ep, *tag;
34*4246b616SDavid du Colombier Biobuf bout, bin;
35*4246b616SDavid du Colombier char msg[1024+1];
36*4246b616SDavid du Colombier char buf[1024];
37*4246b616SDavid du Colombier
38*4246b616SDavid du Colombier buildre(re);
39*4246b616SDavid du Colombier ARGBEGIN{
40*4246b616SDavid du Colombier case 'D':
41*4246b616SDavid du Colombier debug = 1;
42*4246b616SDavid du Colombier break;
43*4246b616SDavid du Colombier case 'n':
44*4246b616SDavid du Colombier maxtoklen = atoi(EARGF(usage()));
45*4246b616SDavid du Colombier break;
46*4246b616SDavid du Colombier case 'r':
47*4246b616SDavid du Colombier refile = EARGF(usage());
48*4246b616SDavid du Colombier break;
49*4246b616SDavid du Colombier default:
50*4246b616SDavid du Colombier usage();
51*4246b616SDavid du Colombier }ARGEND;
52*4246b616SDavid du Colombier
53*4246b616SDavid du Colombier if(argc > 1)
54*4246b616SDavid du Colombier usage();
55*4246b616SDavid du Colombier if(argc == 1){
56*4246b616SDavid du Colombier close(0);
57*4246b616SDavid du Colombier if(open(argv[0], OREAD) < 0)
58*4246b616SDavid du Colombier sysfatal("open %s: %r", argv[0]);
59*4246b616SDavid du Colombier }
60*4246b616SDavid du Colombier
61*4246b616SDavid du Colombier tag = nil;
62*4246b616SDavid du Colombier Binit(&bin, 0, OREAD);
63*4246b616SDavid du Colombier Binit(&bout, 1, OWRITE);
64*4246b616SDavid du Colombier ep = msg;
65*4246b616SDavid du Colombier p = msg;
66*4246b616SDavid du Colombier eof = 0;
67*4246b616SDavid du Colombier off = 0;
68*4246b616SDavid du Colombier hdr = 1;
69*4246b616SDavid du Colombier for(;;){
70*4246b616SDavid du Colombier /* replenish buffer */
71*4246b616SDavid du Colombier if(ep - p < 512 && !eof){
72*4246b616SDavid du Colombier if(p > msg + 1){
73*4246b616SDavid du Colombier n = ep - p;
74*4246b616SDavid du Colombier memmove(msg, p-1, ep-(p-1));
75*4246b616SDavid du Colombier off += (p-1) - msg;
76*4246b616SDavid du Colombier p = msg+1;
77*4246b616SDavid du Colombier ep = p + n;
78*4246b616SDavid du Colombier }
79*4246b616SDavid du Colombier n = Bread(&bin, ep, msg+(sizeof msg - 1)- ep);
80*4246b616SDavid du Colombier if(n < 0)
81*4246b616SDavid du Colombier sysfatal("read error: %r");
82*4246b616SDavid du Colombier if(n == 0)
83*4246b616SDavid du Colombier eof = 1;
84*4246b616SDavid du Colombier ep += n;
85*4246b616SDavid du Colombier *ep = 0;
86*4246b616SDavid du Colombier }
87*4246b616SDavid du Colombier if(p >= ep)
88*4246b616SDavid du Colombier break;
89*4246b616SDavid du Colombier
90*4246b616SDavid du Colombier if(*p == 0){
91*4246b616SDavid du Colombier p++;
92*4246b616SDavid du Colombier continue;
93*4246b616SDavid du Colombier }
94*4246b616SDavid du Colombier
95*4246b616SDavid du Colombier if(hdr && p[-1]=='\n'){
96*4246b616SDavid du Colombier if(p[0]=='\n')
97*4246b616SDavid du Colombier hdr = 0;
98*4246b616SDavid du Colombier else if(cistrncmp(p-1, "\nfrom:", 6) == 0)
99*4246b616SDavid du Colombier tag = "From*";
100*4246b616SDavid du Colombier else if(cistrncmp(p-1, "\nto:", 4) == 0)
101*4246b616SDavid du Colombier tag = "To*";
102*4246b616SDavid du Colombier else if(cistrncmp(p-1, "\nsubject:", 9) == 0)
103*4246b616SDavid du Colombier tag = "Subject*";
104*4246b616SDavid du Colombier else if(cistrncmp(p-1, "\nreturn-path:", 13) == 0)
105*4246b616SDavid du Colombier tag = "Return-Path*";
106*4246b616SDavid du Colombier else
107*4246b616SDavid du Colombier tag = nil;
108*4246b616SDavid du Colombier }
109*4246b616SDavid du Colombier m[0] = dregexec(re[0], p, p==msg || p[-1]=='\n');
110*4246b616SDavid du Colombier m[1] = dregexec(re[1], p, p==msg || p[-1]=='\n');
111*4246b616SDavid du Colombier m[2] = dregexec(re[2], p, p==msg || p[-1]=='\n');
112*4246b616SDavid du Colombier
113*4246b616SDavid du Colombier n = m[0];
114*4246b616SDavid du Colombier if(n < m[1])
115*4246b616SDavid du Colombier n = m[1];
116*4246b616SDavid du Colombier if(n < m[2])
117*4246b616SDavid du Colombier n = m[2];
118*4246b616SDavid du Colombier if(n <= 0){
119*4246b616SDavid du Colombier fprint(2, "«%s» %.2ux", p, p[0]);
120*4246b616SDavid du Colombier sysfatal("no regexps matched at %ld", off + (p-msg));
121*4246b616SDavid du Colombier }
122*4246b616SDavid du Colombier
123*4246b616SDavid du Colombier if(m[0] >= m[1] && m[0] >= m[2]){
124*4246b616SDavid du Colombier /* "From " marks start of new message */
125*4246b616SDavid du Colombier Bprint(&bout, "*From*\n");
126*4246b616SDavid du Colombier n = m[0];
127*4246b616SDavid du Colombier hdr = 1;
128*4246b616SDavid du Colombier }else if(m[2] > 1){
129*4246b616SDavid du Colombier /* ignore */
130*4246b616SDavid du Colombier n = m[2];
131*4246b616SDavid du Colombier }else if(m[1] >= m[0] && m[1] >= m[2] && m[1] > 2 && m[1] <= maxtoklen){
132*4246b616SDavid du Colombier /* keyword */
133*4246b616SDavid du Colombier /* should do UTF-aware lowercasing, too much bother */
134*4246b616SDavid du Colombier /*
135*4246b616SDavid du Colombier for(i=0; i<n; i++)
136*4246b616SDavid du Colombier if('A' <= p[i] && p[i] <= 'Z')
137*4246b616SDavid du Colombier p[i] += 'a' - 'A';
138*4246b616SDavid du Colombier */
139*4246b616SDavid du Colombier if(tag){
140*4246b616SDavid du Colombier i = strlen(tag);
141*4246b616SDavid du Colombier memmove(buf, tag, i);
142*4246b616SDavid du Colombier memmove(buf+i, p, m[1]);
143*4246b616SDavid du Colombier buf[i+m[1]] = 0;
144*4246b616SDavid du Colombier }else{
145*4246b616SDavid du Colombier memmove(buf, p, m[1]);
146*4246b616SDavid du Colombier buf[m[1]] = 0;
147*4246b616SDavid du Colombier }
148*4246b616SDavid du Colombier Bprint(&bout, "%s\n", buf);
149*4246b616SDavid du Colombier while(trim(buf) >= 0)
150*4246b616SDavid du Colombier Bprint(&bout, "stem*%s\n", buf);
151*4246b616SDavid du Colombier n = m[1];
152*4246b616SDavid du Colombier }else
153*4246b616SDavid du Colombier n = m[2];
154*4246b616SDavid du Colombier if(debug)
155*4246b616SDavid du Colombier fprint(2, "%.*s¦", utfnlen(p, n), p);
156*4246b616SDavid du Colombier p += n;
157*4246b616SDavid du Colombier }
158*4246b616SDavid du Colombier Bterm(&bout);
159*4246b616SDavid du Colombier exits(0);
160*4246b616SDavid du Colombier }
161*4246b616SDavid du Colombier
162*4246b616SDavid du Colombier void
buildre(Dreprog * re[3])163*4246b616SDavid du Colombier buildre(Dreprog *re[3])
164*4246b616SDavid du Colombier {
165*4246b616SDavid du Colombier Biobuf *b;
166*4246b616SDavid du Colombier
167*4246b616SDavid du Colombier if((b = Bopen(refile, OREAD)) == nil)
168*4246b616SDavid du Colombier sysfatal("open %s: %r", refile);
169*4246b616SDavid du Colombier
170*4246b616SDavid du Colombier re[0] = Breaddfa(b);
171*4246b616SDavid du Colombier re[1] = Breaddfa(b);
172*4246b616SDavid du Colombier re[2] = Breaddfa(b);
173*4246b616SDavid du Colombier
174*4246b616SDavid du Colombier if(re[0]==nil || re[1]==nil || re[2]==nil)
175*4246b616SDavid du Colombier sysfatal("Breaddfa: %r");
176*4246b616SDavid du Colombier Bterm(b);
177*4246b616SDavid du Colombier }
178*4246b616SDavid du Colombier
/*
 * perhaps this belongs in the tokenizer
 *
 * Rewrite the token s in place into its next simpler variant.
 * Returns 0 when s now holds a new variant for the caller to emit,
 * -1 when there is nothing further to derive.  Call repeatedly
 * until it returns -1.
 *
 * Each call performs the first step below that applies:
 *	1. drop everything up to and including a "tag*" prefix, plus
 *	   any leading non-letters (this step alone does not count
 *	   as a new variant, so s can change even when -1 is returned)
 *	2. free!!! -> free!	(collapse trailing punctuation to one)
 *	3. free!   -> free	(drop the last trailing punctuation)
 *	4. FREE    -> Free	(lower-case all but the first byte)
 *	5. Free    -> free	(lower-case the first byte)
 *
 * A token beginning with '*', or with fewer than two bytes left
 * after step 1, yields -1.
 *
 * Bytes are cast to unsigned char before being handed to the
 * <ctype.h> tests, so bytes >= 0x80 are never passed as negative
 * values.
 * NOTE(review): step 1 treats bytes >= 0x80 as non-letters and
 * strips them, while steps 2-3 stop at such bytes and keep them.
 */
int
trim(char *s)
{
	char *p, *end;
	unsigned char c;
	int changed;
	size_t len;

	if(*s == '*')
		return -1;

	/* strip tag prefix and leading punctuation */
	p = strchr(s, '*');
	if(p == 0)
		p = s;
	while(*p != 0 && !isalpha((unsigned char)*p))
		p++;
	len = strlen(p);
	if(len < 2)
		return -1;
	memmove(s, p, len+1);

	/* find where the ASCII punctuation suffix begins */
	end = s + len;
	p = end;
	while(p > s){
		c = (unsigned char)p[-1];
		if(c >= 0x80 || isalpha(c))
			break;
		p--;
	}

	/* chop punctuation */
	if(p > s){
		/* free!!! -> free! */
		if(p+1 < end){
			p[1] = 0;
			return 0;
		}
		/* free! -> free */
		if(p < end){
			p[0] = 0;
			return 0;
		}
	}

	/* turn FREE into Free */
	changed = 0;
	for(p = s+1; *p != 0; p++){
		if(isupper((unsigned char)*p)){
			*p = tolower((unsigned char)*p);
			changed = 1;
		}
	}
	if(changed)
		return 0;

	/* turn Free into free */
	if(isupper((unsigned char)s[0])){
		s[0] = tolower((unsigned char)s[0]);
		return 0;
	}
	return -1;
}
245*4246b616SDavid du Colombier
246