xref: /plan9-contrib/sys/src/cmd/upas/bayes/msgtok.c (revision 4246b6162acdbb658503b8bdc98024362bfbf0fe)
/*
 * RFC822 message tokenizer (really a feature generator) for the spam filter.
 *
 * See Paul Graham's musings on spam filtering for the theory.
 */
6 
7 #include <u.h>
8 #include <libc.h>
9 #include <bio.h>
10 #include <regexp.h>
11 #include <ctype.h>
12 #include "dfa.h"
13 
14 void buildre(Dreprog*[3]);
15 int debug;
16 char *refile = "/mail/lib/classify.re";
17 int maxtoklen = 20;
18 int trim(char*);
19 
/*
 * Print the command synopsis on standard error and exit with
 * status "usage".  Does not return.
 */
void
usage(void)
{
	fprint(2, "usage: msgtok [-D] [-n maxtoklen] [-r /mail/lib/classify.re] [file]\n");
	exits("usage");
}
26 
27 void
main(int argc,char ** argv)28 main(int argc, char **argv)
29 {
30 	int i, hdr, n, eof, off;
31 	Dreprog *re[3];
32 	int m[3];
33 	char *p, *ep, *tag;
34 	Biobuf bout, bin;
35 	char msg[1024+1];
36 	char buf[1024];
37 
38 	buildre(re);
39 	ARGBEGIN{
40 	case 'D':
41 		debug = 1;
42 		break;
43 	case 'n':
44 		maxtoklen = atoi(EARGF(usage()));
45 		break;
46 	case 'r':
47 		refile = EARGF(usage());
48 		break;
49 	default:
50 		usage();
51 	}ARGEND;
52 
53 	if(argc > 1)
54 		usage();
55 	if(argc == 1){
56 		close(0);
57 		if(open(argv[0], OREAD) < 0)
58 			sysfatal("open %s: %r", argv[0]);
59 	}
60 
61 	tag = nil;
62 	Binit(&bin, 0, OREAD);
63 	Binit(&bout, 1, OWRITE);
64 	ep = msg;
65 	p = msg;
66 	eof = 0;
67 	off = 0;
68 	hdr = 1;
69 	for(;;){
70 		/* replenish buffer */
71 		if(ep - p < 512 && !eof){
72 			if(p > msg + 1){
73 				n = ep - p;
74 				memmove(msg, p-1, ep-(p-1));
75 				off += (p-1) - msg;
76 				p = msg+1;
77 				ep = p + n;
78 			}
79 			n = Bread(&bin, ep, msg+(sizeof msg - 1)- ep);
80 			if(n < 0)
81 				sysfatal("read error: %r");
82 			if(n == 0)
83 				eof = 1;
84 			ep += n;
85 			*ep = 0;
86 		}
87 		if(p >= ep)
88 			break;
89 
90 		if(*p == 0){
91 			p++;
92 			continue;
93 		}
94 
95 		if(hdr && p[-1]=='\n'){
96 			if(p[0]=='\n')
97 				hdr = 0;
98 			else if(cistrncmp(p-1, "\nfrom:", 6) == 0)
99 				tag = "From*";
100 			else if(cistrncmp(p-1, "\nto:", 4) == 0)
101 				tag = "To*";
102 			else if(cistrncmp(p-1, "\nsubject:", 9) == 0)
103 				tag = "Subject*";
104 			else if(cistrncmp(p-1, "\nreturn-path:", 13) == 0)
105 				tag = "Return-Path*";
106 			else
107 				tag = nil;
108 		}
109 		m[0] = dregexec(re[0], p, p==msg || p[-1]=='\n');
110 		m[1] = dregexec(re[1], p, p==msg || p[-1]=='\n');
111 		m[2] = dregexec(re[2], p, p==msg || p[-1]=='\n');
112 
113 		n = m[0];
114 		if(n < m[1])
115 			n = m[1];
116 		if(n < m[2])
117 			n = m[2];
118 		if(n <= 0){
119 fprint(2, "«%s» %.2ux", p, p[0]);
120 			sysfatal("no regexps matched at %ld", off + (p-msg));
121 		}
122 
123 		if(m[0] >= m[1] && m[0] >= m[2]){
124 			/* "From " marks start of new message */
125 			Bprint(&bout, "*From*\n");
126 			n = m[0];
127 			hdr = 1;
128 		}else if(m[2] > 1){
129 			/* ignore */
130 			n = m[2];
131 		}else if(m[1] >= m[0] && m[1] >= m[2] && m[1] > 2 && m[1] <= maxtoklen){
132 			/* keyword */
133 			/* should do UTF-aware lowercasing, too much bother */
134 /*
135 			for(i=0; i<n; i++)
136 				if('A' <= p[i] && p[i] <= 'Z')
137 					p[i] += 'a' - 'A';
138 */
139 			if(tag){
140 				i = strlen(tag);
141 				memmove(buf, tag, i);
142 				memmove(buf+i, p, m[1]);
143 				buf[i+m[1]] = 0;
144 			}else{
145 				memmove(buf, p, m[1]);
146 				buf[m[1]] = 0;
147 			}
148 			Bprint(&bout, "%s\n", buf);
149 			while(trim(buf) >= 0)
150 				Bprint(&bout, "stem*%s\n", buf);
151 			n = m[1];
152 		}else
153 			n = m[2];
154 		if(debug)
155 			fprint(2, "%.*s¦", utfnlen(p, n), p);
156 		p += n;
157 	}
158 	Bterm(&bout);
159 	exits(0);
160 }
161 
162 void
buildre(Dreprog * re[3])163 buildre(Dreprog *re[3])
164 {
165 	Biobuf *b;
166 
167 	if((b = Bopen(refile, OREAD)) == nil)
168 		sysfatal("open %s: %r", refile);
169 
170 	re[0] = Breaddfa(b);
171 	re[1] = Breaddfa(b);
172 	re[2] = Breaddfa(b);
173 
174 	if(re[0]==nil || re[1]==nil || re[2]==nil)
175 		sysfatal("Breaddfa: %r");
176 	Bterm(b);
177 }
178 
/*
 * Reduce the token in s, in place, by one step towards its stem.
 * (Perhaps this belongs in the tokenizer.)
 *
 * Any "Tag*" prefix and leading ASCII punctuation are removed first.
 * Then exactly one of these reductions is applied, in order of
 * preference:
 *	free!!!	-> free!	(several trailing punctuation chars to one)
 *	free!	-> free		(last trailing punctuation char dropped)
 *	FREE	-> Free		(upper case after the first byte folded)
 *	Free	-> free		(leading upper case folded)
 *
 * Returns 0 if a reduction was applied and s holds a new stem,
 * -1 if s starts with '*', is shorter than two bytes after
 * stripping, or cannot be reduced further.  s may have had its
 * prefix stripped even when -1 is returned.
 */
int
trim(char *s)
{
	char *p, *op;
	int mix, mix1;

	if(*s == '*')
		return -1;

	/*
	 * Strip tag and leading punctuation.  Bytes >= 0x80 are left
	 * alone, exactly as in the suffix loop below: they are
	 * presumably parts of UTF-8 sequences, not punctuation.
	 */
	p = strchr(s, '*');
	if(!p)
		p = s;
	while(*p && (unsigned char)*p < 0x80 && !isalpha((unsigned char)*p))
		p++;
	if(strlen(p) < 2)
		return -1;
	memmove(s, p, strlen(p)+1);

	/* find the start of the punctuation suffix */
	p = s+strlen(s);
	op = p;
	while(p > s && (unsigned char)p[-1] < 0x80 && !isalpha((unsigned char)p[-1]))
		p--;

	/* chop punctuation */
	if(p > s){
		/* free!!! -> free! */
		if(p+1 < op){
			p[1] = 0;
			return 0;
		}
		/* free! -> free */
		if(p < op){
			p[0] = 0;
			return 0;
		}
	}

	/* mix: first byte is upper case; mix1: some later byte is */
	mix = isupper((unsigned char)s[0]) != 0;
	mix1 = 0;
	for(p=s+1; *p; p++)
		if(isupper((unsigned char)*p)){
			mix1 = 1;
			break;
		}

	/* turn FREE into Free */
	if(mix1){
		for(p=s+1; *p; p++)
			if(isupper((unsigned char)*p))
				*p += 'a'-'A';
		return 0;
	}

	/* turn Free into free */
	if(mix){
		*s += 'a'-'A';
		return 0;
	}
	return -1;
}
245 
246