1 /*
2 * RFC822 message tokenizer (really feature generator) for spam filter.
3 *
4 * See Paul Graham's musings on spam filtering for theory.
5 */
6
7 #include <u.h>
8 #include <libc.h>
9 #include <bio.h>
10 #include <regexp.h>
11 #include <ctype.h>
12 #include "dfa.h"
13
14 void buildre(Dreprog*[3]);
15 int debug;
16 char *refile = "/mail/lib/classify.re";
17 int maxtoklen = 20;
18 int trim(char*);
19
/*
 * Print the command synopsis on file descriptor 2 and exit with
 * status "usage".  The synopsis lists every option main's parser
 * accepts: -D, -n and -r.
 */
void
usage(void)
{
	fprint(2, "usage: msgtok [-D] [-n maxtoklen] [-r /mail/lib/classify.re] [file]\n");
	exits("usage");
}
26
27 void
main(int argc,char ** argv)28 main(int argc, char **argv)
29 {
30 int i, hdr, n, eof, off;
31 Dreprog *re[3];
32 int m[3];
33 char *p, *ep, *tag;
34 Biobuf bout, bin;
35 char msg[1024+1];
36 char buf[1024];
37
38 buildre(re);
39 ARGBEGIN{
40 case 'D':
41 debug = 1;
42 break;
43 case 'n':
44 maxtoklen = atoi(EARGF(usage()));
45 break;
46 case 'r':
47 refile = EARGF(usage());
48 break;
49 default:
50 usage();
51 }ARGEND;
52
53 if(argc > 1)
54 usage();
55 if(argc == 1){
56 close(0);
57 if(open(argv[0], OREAD) < 0)
58 sysfatal("open %s: %r", argv[0]);
59 }
60
61 tag = nil;
62 Binit(&bin, 0, OREAD);
63 Binit(&bout, 1, OWRITE);
64 ep = msg;
65 p = msg;
66 eof = 0;
67 off = 0;
68 hdr = 1;
69 for(;;){
70 /* replenish buffer */
71 if(ep - p < 512 && !eof){
72 if(p > msg + 1){
73 n = ep - p;
74 memmove(msg, p-1, ep-(p-1));
75 off += (p-1) - msg;
76 p = msg+1;
77 ep = p + n;
78 }
79 n = Bread(&bin, ep, msg+(sizeof msg - 1)- ep);
80 if(n < 0)
81 sysfatal("read error: %r");
82 if(n == 0)
83 eof = 1;
84 ep += n;
85 *ep = 0;
86 }
87 if(p >= ep)
88 break;
89
90 if(*p == 0){
91 p++;
92 continue;
93 }
94
95 if(hdr && p[-1]=='\n'){
96 if(p[0]=='\n')
97 hdr = 0;
98 else if(cistrncmp(p-1, "\nfrom:", 6) == 0)
99 tag = "From*";
100 else if(cistrncmp(p-1, "\nto:", 4) == 0)
101 tag = "To*";
102 else if(cistrncmp(p-1, "\nsubject:", 9) == 0)
103 tag = "Subject*";
104 else if(cistrncmp(p-1, "\nreturn-path:", 13) == 0)
105 tag = "Return-Path*";
106 else
107 tag = nil;
108 }
109 m[0] = dregexec(re[0], p, p==msg || p[-1]=='\n');
110 m[1] = dregexec(re[1], p, p==msg || p[-1]=='\n');
111 m[2] = dregexec(re[2], p, p==msg || p[-1]=='\n');
112
113 n = m[0];
114 if(n < m[1])
115 n = m[1];
116 if(n < m[2])
117 n = m[2];
118 if(n <= 0){
119 fprint(2, "«%s» %.2ux", p, p[0]);
120 sysfatal("no regexps matched at %ld", off + (p-msg));
121 }
122
123 if(m[0] >= m[1] && m[0] >= m[2]){
124 /* "From " marks start of new message */
125 Bprint(&bout, "*From*\n");
126 n = m[0];
127 hdr = 1;
128 }else if(m[2] > 1){
129 /* ignore */
130 n = m[2];
131 }else if(m[1] >= m[0] && m[1] >= m[2] && m[1] > 2 && m[1] <= maxtoklen){
132 /* keyword */
133 /* should do UTF-aware lowercasing, too much bother */
134 /*
135 for(i=0; i<n; i++)
136 if('A' <= p[i] && p[i] <= 'Z')
137 p[i] += 'a' - 'A';
138 */
139 if(tag){
140 i = strlen(tag);
141 memmove(buf, tag, i);
142 memmove(buf+i, p, m[1]);
143 buf[i+m[1]] = 0;
144 }else{
145 memmove(buf, p, m[1]);
146 buf[m[1]] = 0;
147 }
148 Bprint(&bout, "%s\n", buf);
149 while(trim(buf) >= 0)
150 Bprint(&bout, "stem*%s\n", buf);
151 n = m[1];
152 }else
153 n = m[2];
154 if(debug)
155 fprint(2, "%.*s¦", utfnlen(p, n), p);
156 p += n;
157 }
158 Bterm(&bout);
159 exits(0);
160 }
161
162 void
buildre(Dreprog * re[3])163 buildre(Dreprog *re[3])
164 {
165 Biobuf *b;
166
167 if((b = Bopen(refile, OREAD)) == nil)
168 sysfatal("open %s: %r", refile);
169
170 re[0] = Breaddfa(b);
171 re[1] = Breaddfa(b);
172 re[2] = Breaddfa(b);
173
174 if(re[0]==nil || re[1]==nil || re[2]==nil)
175 sysfatal("Breaddfa: %r");
176 Bterm(b);
177 }
178
/*
 * Rewrite the token in s, in place, into its next more general form.
 * (Perhaps this belongs in the tokenizer.)
 *
 * Each call performs at most one of these steps, tried in order:
 *	1. drop everything up to and including the first '*' (a tag
 *	   prefix), plus any non-letters that follow;
 *	2. shorten a trailing run of ASCII non-letters:
 *	   free!!! -> free!, then free! -> free;
 *	3. lower-case every upper-case byte after the first: FREE -> Free;
 *	4. lower-case the first byte: Free -> free.
 * Step 1 always runs and the first later step that applies ends the call.
 *
 * Returns 0 when one of steps 2-4 changed s, so the caller can emit
 * it and call again; returns -1 when s starts with '*', when fewer
 * than two bytes remain after step 1, or when steps 2-4 found nothing
 * to do.  Note that step 1 may already have rewritten s when -1 is
 * returned for the last reason.
 *
 * Bytes >= 0x80 (UTF-8) are never treated as trailing punctuation and
 * are never case-converted.
 */
int
trim(char *s)
{
	char *p, *end;
	int n, upfirst, uprest;

	if(*s == '*')
		return -1;

	/* strip tag prefix and leading punctuation */
	p = strchr(s, '*');
	if(!p)
		p = s;
	/* ctype arguments are cast: a plain char may be negative */
	while(*p && !isalpha((unsigned char)*p))
		p++;
	n = strlen(p);
	if(n < 2)
		return -1;
	memmove(s, p, n+1);

	/* find where the trailing run of ASCII punctuation starts */
	end = s+n;
	p = end;
	while(p > s && (unsigned char)p[-1] < 0x80 && !isalpha((unsigned char)p[-1]))
		p--;

	/* chop punctuation */
	if(p > s){
		/* free!!! -> free! */
		if(p+1 < end){
			p[1] = 0;
			return 0;
		}
		/* free! -> free */
		if(p < end){
			p[0] = 0;
			return 0;
		}
	}

	/* note upper case in the first byte and anywhere in the rest */
	upfirst = isupper((unsigned char)s[0]) != 0;
	uprest = 0;
	for(p = s+1; *p; p++)
		if(isupper((unsigned char)*p)){
			uprest = 1;
			break;
		}

	/* turn FREE into Free */
	if(uprest){
		for(p = s+1; *p; p++)
			if(isupper((unsigned char)*p))
				*p += 'a'-'A';
		return 0;
	}

	/* turn Free into free */
	if(upfirst){
		*s += 'a'-'A';
		return 0;
	}
	return -1;
}
245
246