xref: /plan9/sys/src/cmd/upas/scanmail/common.c (revision 9a747e4fd48b9f4522c70c07e8f882a15030f964)
1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
4 #include <regexp.h>
5 #include "spam.h"
6 
/*
 * Sizing constants used while reading a message.
 */
enum {
	Quanta	= 8*1024,	/* bytes requested from Bread per chunk */
	Minbody	= 6000,		/* minimum number of body bytes to try to read */
	HdrMax	= 15,		/* size of the scratch buffer for a header keyword */
};
12 
typedef struct word Word;
typedef struct keyword Keyword;

/*
 * A literal string together with its length, so that prefix
 * comparisons do not have to recompute it.
 */
struct word
{
	char	*string;	/* text to compare against; 0 ends a table */
	int	n;		/* number of characters in string */
};

/*
 * Maps an action name from the pattern file to its numeric code.
 */
struct keyword
{
	char	*string;	/* action name; 0 ends the table */
	int	value;		/* action code returned by findkey */
};
25 
26 Word	htmlcmds[] =
27 {
28 	"html",		4,
29 	"!doctype html", 13,
30 	0,
31 
32 };
33 
34 Word	hrefs[] =
35 {
36 	"a href=",	7,
37 	"a title=",	8,
38 	"a target=",	9,
39 	"base href=",	10,
40 	"img src=",	8,
41 	"img border=",	11,
42 	"form action=", 12,
43 	"!--",		3,
44 	0,
45 
46 };
47 
48 /*
49  *	RFC822 header keywords to look for for fractured header.
50  *	all lengths must be less than HdrMax defined above.
51  */
52 Word	hdrwords[] =
53 {
54 	"cc:",			3,
55 	"bcc:", 		4,
56 	"to:",			3,
57 	0,			0,
58 
59 };
60 
61 Keyword	keywords[] =
62 {
63 	"header",	HoldHeader,
64 	"line",		SaveLine,
65 	"hold",		Hold,
66 	"dump",		Dump,
67 	"loff",		Lineoff,
68 	0,		Nactions,
69 };
70 
71 Patterns patterns[] = {
72 [Dump]		{ "DUMP:", 0, 0 },
73 [HoldHeader]	{ "HEADER:", 0, 0 },
74 [Hold]		{ "HOLD:", 0, 0 },
75 [SaveLine]	{ "LINE:", 0, 0 },
76 [Lineoff]	{ "LINEOFF:", 0, 0 },
77 [Nactions]	{ 0, 0, 0 },
78 };
79 
80 static char*	endofhdr(char*, char*);
81 static	int	escape(char**);
82 static	int	extract(char*);
83 static	int	findkey(char*);
84 static	int	hash(int);
85 static	int	isword(Word*, char*, int);
86 static	void	parsealt(Biobuf*, char*, Spat**);
87 
88 /*
89  *	The canonicalizer: convert input to canonical representation
90  */
91 char*
readmsg(Biobuf * bp,int * hsize,int * bufsize)92 readmsg(Biobuf *bp, int *hsize, int *bufsize)
93 {
94 	char *p, *buf;
95 	int n, offset, eoh, bsize, delta;
96 
97 	buf = 0;
98 	offset = 0;
99 	if(bufsize)
100 		*bufsize = 0;
101 	if(hsize)
102 		*hsize = 0;
103 	for(;;) {
104 		buf = Realloc(buf, offset+Quanta+1);
105 		n = Bread(bp, buf+offset, Quanta);
106 		if(n < 0){
107 			free(buf);
108 			return 0;
109 		}
110 		p = buf+offset;			/* start of this chunk */
111 		offset += n;			/* end of this chunk */
112 		buf[offset] = 0;
113 		if(n == 0){
114 			if(offset == 0)
115 				return 0;
116 			break;
117 		}
118 
119 		if(hsize == 0)			/* don't process header */
120 			break;
121 		if(p != buf && p[-1] == '\n')	/* check for EOH across buffer split */
122 			p--;
123 		p = endofhdr(p, buf+offset);
124 		if(p)
125 			break;
126 		if(offset >= Maxread)		/* gargantuan header - just punt*/
127 		{
128 			if(hsize)
129 				*hsize = offset;
130 			if(bufsize)
131 				*bufsize = offset;
132 			return buf;
133 		}
134 	}
135 	eoh = p-buf;				/* End of header */
136 	bsize = offset - eoh;			/* amount of body already read */
137 
138 		/* Read at least Minbody bytes of the body */
139 	if (bsize < Minbody){
140 		delta = Minbody-bsize;
141 		buf = Realloc(buf, offset+delta+1);
142 		n = Bread(bp, buf+offset, delta);
143 		if(n > 0) {
144 			offset += n;
145 			buf[offset] = 0;
146 		}
147 	}
148 	if(hsize)
149 		*hsize = eoh;
150 	if(bufsize)
151 		*bufsize = offset;
152 	return buf;
153 }
154 
155 static	int
isword(Word * wp,char * text,int len)156 isword(Word *wp, char *text, int len)
157 {
158 	for(;wp->string; wp++)
159 		if(len >= wp->n && strncmp(text, wp->string, wp->n) == 0)
160 			return 1;
161 	return 0;
162 }
163 
164 static char*
endofhdr(char * raw,char * end)165 endofhdr(char *raw, char *end)
166 {
167 	int i;
168 	char *p, *q;
169 	char buf[HdrMax];
170 
171 	/*
172  	 * can't use strchr to search for newlines because
173 	 * there may be embedded NULL's.
174 	 */
175 	for(p = raw; p < end; p++){
176 		if(*p != '\n' || p[1] != '\n')
177 			continue;
178 		p++;
179 		for(i = 0, q = p+1; i < sizeof(buf) && *q; q++){
180 			buf[i++] = tolower(*q);
181 			if(*q == ':' || *q == '\n')
182 				break;
183 		}
184 		if(!isword(hdrwords, buf, i))
185 			return p+1;
186 	}
187 	return 0;
188 }
189 
190 static	int
htmlmatch(Word * wp,char * text,char * end,int * n)191 htmlmatch(Word *wp, char *text, char *end, int *n)
192 {
193 	char *cp;
194 	int i, c, lastc;
195 	char buf[MaxHtml];
196 
197 	/*
198 	 * extract a string up to '>'
199 	 */
200 
201 	i = lastc = 0;
202 	cp = text;
203 	while (cp < end && i < sizeof(buf)-1){
204 		c = *cp++;
205 		if(c == '=')
206 			c = escape(&cp);
207 		switch(c){
208 		case 0:
209 		case '\r':
210 			continue;
211 		case '>':
212 			goto out;
213 		case '\n':
214 		case ' ':
215 		case '\t':
216 			if(lastc == ' ')
217 				continue;
218 			c = ' ';
219 			break;
220 		default:
221 			c = tolower(c);
222 			break;
223 		}
224 		buf[i++] = lastc = c;
225 	}
226 out:
227 	buf[i] = 0;
228 	if(n)
229 		*n = cp-text;
230 	return isword(wp, buf, i);
231 }
232 
/*
 * Decode what follows an '='.  On entry *msg points just past the '='.
 *
 *	"=\n"	the newline is skipped and the character after it is
 *		returned (and consumed)
 *	"=2e"	returns '.'
 *	"=2f"	returns '/'
 *	"=20"	returns ' '
 *	"=3d"	returns '='
 *
 * The hex letters may be in either case.  Anything else returns '='
 * and consumes nothing.  *msg is advanced past whatever was consumed.
 */
static int
escape(char **msg)
{
	char *s;
	int r;

	s = *msg;
	r = '=';
	switch(*s){
	case '\n':
		r = s[1];
		s += 2;
		break;
	case '2':
		switch(tolower(s[1])){
		case 'e':
			r = '.';
			s += 2;
			break;
		case 'f':
			r = '/';
			s += 2;
			break;
		case '0':
			r = ' ';
			s += 2;
			break;
		}
		break;
	case '3':
		if(tolower(s[1]) == 'd')
			s += 2;
		break;
	}
	*msg = s;
	return r;
}
268 
269 static int
htmlchk(char ** msg,char * end)270 htmlchk(char **msg, char *end)
271 {
272 	int n;
273 	char *p;
274 
275 	static int ishtml;
276 
277 	p = *msg;
278 	if(ishtml == 0){
279 		ishtml = htmlmatch(htmlcmds, p, end, &n);
280 
281 		/* If not an HTML keyword, check if it's
282 		 * an HTML comment (<!comment>).  if so,
283 		 * skip over it; otherwise copy it in.
284 		 */
285 		if(ishtml == 0 && *p != '!')	/* not comment */
286 			return '<';		/* copy it */
287 
288 	} else if(htmlmatch(hrefs, p, end, &n))	/* if special HTML string  */
289 		return '<';			/* copy it */
290 
291 	/*
292 	 * this is an uninteresting HTML command; skip over it.
293 	 */
294 	p += n;
295 	*msg = p+1;
296 	return *p;
297 }
298 
299 /*
300  * decode a base 64 encode body
301  */
302 void
conv64(char * msg,char * end,char * buf,int bufsize)303 conv64(char *msg, char *end, char *buf, int bufsize)
304 {
305 	int len, i;
306 	char *cp;
307 
308 	len = end - msg;
309 	i = (len*3)/4+1;	// room for max chars + null
310 	cp = Malloc(i);
311 	len = dec64((uchar*)cp, i, msg, len);
312 	convert(cp, cp+len, buf, bufsize, 1);
313 	free(cp);
314 }
315 
/*
 * Copy [msg, end) into buf in canonical form: NUL and '\r' are
 * dropped, runs of blanks, tabs and newlines become a single space,
 * and everything else is lower-cased.  When isbody is non-zero, '<'
 * is first passed to htmlchk and '=' to escape so that most HTML is
 * stripped and some MIME escapes are replaced by their characters.
 *
 * At most bufsize characters are stored, followed by a NUL, so buf
 * must have room for bufsize+1 bytes.
 *
 * Returns 1 if, while scanning a header (isbody == 0), a
 * "content-transfer-encoding: base64" line was seen; otherwise 0.
 */
int
convert(char *msg, char *end, char *buf, int bufsize, int isbody)
{
	char *mark, *out;
	int c, prev, isb64, room;

	out = buf;
	room = bufsize;
	prev = 0;
	isb64 = 0;
	for(; msg < end && room > 0; ){
		c = *msg++;

		/*
		 * Body only: repeat while htmlchk or escape keeps
		 * consuming input, since what they return may itself
		 * begin another tag or escape.
		 */
		if(isbody){
			do{
				mark = msg;
				if(c == '<')
					c = htmlchk(&msg, end);
				if(c == '=')
					c = escape(&msg);
			}while(mark != msg && mark < end);
		}

		if(c == 0 || c == '\r')
			continue;
		if(c == '\t' || c == ' ' || c == '\n'){
			if(prev == ' ')
				continue;
			c = ' ';
		}else if(c == 'C' || c == 'c'){
			/* check for MIME base 64 encoding in header */
			if(isbody == 0 && msg < end-32 && msg[0] == 'o' && msg[1] == 'n'
			&& cistrncmp(msg+2, "tent-transfer-encoding: base64", 30) == 0)
				isb64 = 1;
			c = 'c';
		}else
			c = tolower(c);

		*out++ = c;
		prev = c;
		room--;
	}
	*out = 0;
	return isb64;
}
371 
372 /*
373  *	The pattern parser: build data structures from the pattern file
374  */
375 
/*
 * Bucket index for a string pattern's first character: its low
 * seven bits, so the result is always in the range 0 to 127.
 */
static int
hash(int c)
{
	enum { Hashmask = 0x7f };

	return c & Hashmask;
}
381 
382 static	int
findkey(char * val)383 findkey(char *val)
384 {
385 	Keyword *kp;
386 
387 	for(kp = keywords; kp->string; kp++)
388 		if(strcmp(val, kp->string) == 0)
389 				break;
390 	return kp->value;
391 }
392 
/* true for a blank or a tab; the argument may be evaluated twice */
#define	whitespace(c)	((c) == ' ' || (c) == '\t')
394 
395 void
parsepats(Biobuf * bp)396 parsepats(Biobuf *bp)
397 {
398 	Pattern *p, *new;
399 	char *cp, *qp;
400 	int type, action, n, h;
401 	Spat *spat;
402 
403 	for(;;){
404 		cp = Brdline(bp, '\n');
405 		if(cp == 0)
406 			break;
407 		cp[Blinelen(bp)-1] = 0;
408 		while(*cp == ' ' || *cp == '\t')
409 			cp++;
410 		if(*cp == '#' || *cp == 0)
411 			continue;
412 		type = regexp;
413 		if(*cp == '*'){
414 			type = string;
415 			cp++;
416 		}
417 		qp = strchr(cp, ':');
418 		if(qp == 0)
419 			continue;
420 		*qp = 0;
421 		if(debug)
422 			fprint(2, "action = %s\n", cp);
423 		action = findkey(cp);
424 		if(action >= Nactions)
425 			continue;
426 		cp = qp+1;
427 		n = extract(cp);
428 		if(n <= 0 || *cp == 0)
429 			continue;
430 
431 		qp = strstr(cp, "~~");
432 		if(qp){
433 			*qp = 0;
434 			n = strlen(cp);
435 		}
436 		if(debug)
437 			fprint(2, " Pattern: `%s'\n", cp);
438 
439 			/* Hook regexps into a chain */
440 		if(type == regexp) {
441 			new = Malloc(sizeof(Pattern));
442 			new->action = action;
443 			new->pat = regcomp(cp);
444 			if(new->pat == 0){
445 				free(new);
446 				continue;
447 			}
448 			new->type = regexp;
449 			new->alt = 0;
450 			new->next = 0;
451 
452 			if(qp)
453 				parsealt(bp, qp+2, &new->alt);
454 
455 			new->next = patterns[action].regexps;
456 			patterns[action].regexps = new;
457 			continue;
458 
459 		}
460 			/* not a Regexp - hook strings into Pattern hash chain */
461 		spat = Malloc(sizeof(*spat));
462 		spat->next = 0;
463 		spat->alt = 0;
464 		spat->len = n;
465 		spat->string = Malloc(n+1);
466 		spat->c1 = cp[1];
467 		strcpy(spat->string, cp);
468 
469 		if(qp)
470 			parsealt(bp, qp+2, &spat->alt);
471 
472 		p = patterns[action].strings;
473 		if(p == 0) {
474 			p = Malloc(sizeof(Pattern));
475 			memset(p, 0, sizeof(*p));
476 			p->action = action;
477 			p->type = string;
478 			patterns[action].strings = p;
479 		}
480 		h = hash(*spat->string);
481 		spat->next = p->spat[h];
482 		p->spat[h] = spat;
483 	}
484 }
485 
486 static void
parsealt(Biobuf * bp,char * cp,Spat ** head)487 parsealt(Biobuf *bp, char *cp, Spat** head)
488 {
489 	char *p;
490 	Spat *alt;
491 
492 	while(cp){
493 		if(*cp == 0){		/*escaped newline*/
494 			do{
495 				cp = Brdline(bp, '\n');
496 				if(cp == 0)
497 					return;
498 				cp[Blinelen(bp)-1] = 0;
499 			} while(extract(cp) <= 0 || *cp == 0);
500 		}
501 
502 		p = cp;
503 		cp = strstr(p, "~~");
504 		if(cp){
505 			*cp = 0;
506 			cp += 2;
507 		}
508 		if(strlen(p)){
509 			alt = Malloc(sizeof(*alt));
510 			alt->string = strdup(p);
511 			alt->next = *head;
512 			*head = alt;
513 		}
514 	}
515 }
516 
/*
 * Canonicalise a pattern in place and return its new length.
 *
 * Leading blanks and tabs are skipped, upper-case ASCII letters are
 * folded to lower case, and text from an unquoted '#' onward is
 * dropped.  Inside double quotes everything is taken literally
 * (apart from case folding) and \" stands for a quote; the quotes
 * themselves are removed.  Trailing blanks and tabs are trimmed,
 * but never back into text that came from a quoted string.
 */
static int
extract(char *cp)
{
	char *src, *dst, *keep;
	int c;

	src = cp;
	dst = cp;
	keep = cp;		/* trimming may not go below this point */
	while(*src == ' ' || *src == '\t')
		src++;
	for(;;){
		c = *src++;
		if(c == 0 || c == '#')
			break;
		if(c != '"'){
			if(c >= 'A' && c <= 'Z')
				c += 'a'-'A';
			*dst++ = c;
			continue;
		}

		/* quoted section: copy up to the closing quote or end of text */
		for(; *src != 0 && *src != '"'; src++){
			if(src[0] == '\\' && src[1] == '"')
				src++;
			c = *src;
			if(c >= 'A' && c <= 'Z')
				c += 'a'-'A';
			*dst++ = c;
		}
		if(*src != 0)
			src++;
		keep = dst;	/* never back up over a quoted string */
	}
	while(dst > keep && (dst[-1] == ' ' || dst[-1] == '\t'))
		dst--;
	*dst = 0;
	return dst-cp;
}
552 
553 /*
554  *	The matching engine: compare canonical input to pattern structures
555  */
556 
557 static Spat*
isalt(char * message,Spat * alt)558 isalt(char *message, Spat *alt)
559 {
560 	while(alt) {
561 		if(*cmd)
562 		if(message != cmd && strstr(cmd, alt->string))
563 			break;
564 		if(message != header+1 && strstr(header+1, alt->string))
565 			break;
566 		if(strstr(message, alt->string))
567 			break;
568 		alt = alt->next;
569 	}
570 	return alt;
571 }
572 
573 int
matchpat(Pattern * p,char * message,Resub * m)574 matchpat(Pattern *p, char *message, Resub *m)
575 {
576 	Spat *spat;
577 	char *s;
578 	int c, c1;
579 
580 	if(p->type == string){
581 		c1 = *message;
582 		for(s=message; c=c1; s++){
583 			c1 = s[1];
584 			for(spat=p->spat[hash(c)]; spat; spat=spat->next){
585 				if(c1 == spat->c1)
586 				if(memcmp(s, spat->string, spat->len) == 0)
587 				if(!isalt(message, spat->alt)){
588 					m->sp = s;
589 					m->ep = s + spat->len;
590 					return 1;
591 				}
592 			}
593 		}
594 		return 0;
595 	}
596 	m->sp = m->ep = 0;
597 	if(regexec(p->pat, message, m, 1) == 0)
598 		return 0;
599 	if(isalt(message, p->alt))
600 		return 0;
601 	return 1;
602 }
603 
604 
605 void
xprint(int fd,char * type,Resub * m)606 xprint(int fd, char *type, Resub *m)
607 {
608 	char *p, *q;
609 	int i;
610 
611 	if(m->sp == 0 || m->ep == 0)
612 		return;
613 
614 		/* back up approx 30 characters to whitespace */
615 	for(p = m->sp, i = 0; *p && i < 30; i++, p--)
616 			;
617 	while(*p && *p != ' ')
618 		p--;
619 	p++;
620 
621 		/* grab about 30 more chars beyond the end of the match */
622 	for(q = m->ep, i = 0; *q && i < 30; i++, q++)
623 			;
624 	while(*q && *q != ' ')
625 		q++;
626 
627 	fprint(fd, "%s %.*s~%.*s~%.*s\n", type, (int)(m->sp-p), p, (int)(m->ep-m->sp), m->sp, (int)(q-m->ep), m->ep);
628 }
629 
enum {
	INVAL	= 0xff,		/* t64d entry for a byte outside the base64 alphabet */
};
633 
634 static uchar t64d[256] = {
635 /*00 */	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
636 	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
637 /*10*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
638 	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
639 /*20*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
640 	INVAL, INVAL, INVAL,    62, INVAL, INVAL, INVAL,    63,
641 /*30*/	   52,	  53,	 54,	55,    56,    57,    58,    59,
642 	   60,	  61, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
643 /*40*/	INVAL,    0,      1,     2,     3,     4,     5,     6,
644 	    7,    8,      9,    10,    11,    12,    13,    14,
645 /*50*/	   15,   16,     17,    18,    19,    20,    21,    22,
646 	   23,   24,     25, INVAL, INVAL, INVAL, INVAL, INVAL,
647 /*60*/	INVAL,   26,     27,    28,    29,    30,    31,    32,
648 	   33,   34,     35,    36,    37,    38,    39,    40,
649 /*70*/	   41,   42,     43,    44,    45,    46,    47,    48,
650 	   49,   50,     51, INVAL, INVAL, INVAL, INVAL, INVAL,
651 /*80*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
652 	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
653 /*90*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
654 	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
655 /*A0*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
656 	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
657 /*B0*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
658 	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
659 /*C0*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
660 	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
661 /*D0*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
662 	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
663 /*E0*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
664 	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
665 /*F0*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
666 	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
667 };
668