xref: /plan9/sys/src/cmd/upas/smtp/rfc822.y (revision 426fe5994a5cc658001a4bec2c6d67ca434c4ae9)
%{
#include "common.h"
#include "smtp.h"
#include <ctype.h>

#define YYMAXDEPTH	500		/* was default 150 */

/*
 *  Lexer state.  yylex() and yywhite() walk the buffer from yylp up to
 *  yyend; yyinit() sets yybuffer, yylp and yyend.
 */
char	*yylp;		/* next character to be lex'd */
int	yydone;		/* tell yylex to give up */
char	*yybuffer;	/* first parsed character */
char	*yyend;		/* end of buffer to be parsed */
Node	*root;
/* list of parsed fields; newfield() and missing() append at lastfield */
Field	*firstfield;
Field	*lastfield;
/* sender, system and date nodes of a unix "From " line (set by the unixfrom rule) */
Node	*usender;
Node	*usys;
Node	*udate;
/* newfield() moves startfield into endfield and sets startfield to yylp */
char	*startfield, *endfield;
/* set to 1 by the field rule when a field of that class has been parsed */
int	originator;
int	destination;
int	date;
int	received;	/* incremented per Received field; zeroed by yyinit() */
int	messageid;	/* set to 1 when a MESSAGEID token is reduced by ignoredhdr */
%}
25 
/*
 *  Token types returned by yylex().  Every token from DATE through MAILER
 *  has an entry in the key[] table below; WORD is returned for any other
 *  string.  Single punctuation characters and newline are returned as
 *  their own character value and need no declaration here.
 */
%term WORD
%term DATE
%term RESENT_DATE
%term RETURN_PATH
%term FROM
%term SENDER
%term REPLY_TO
%term RESENT_FROM
%term RESENT_SENDER
%term RESENT_REPLY_TO
%term SUBJECT
%term TO
%term CC
%term BCC
%term RESENT_TO
%term RESENT_CC
%term RESENT_BCC
%term REMOTE
%term PRECEDENCE
%term MIMEVERSION
%term CONTENTTYPE
%term MESSAGEID
%term RECEIVED
%term MAILER
%term BADTOKEN
%start msg
52 %%
53 
/*
 *  A header is a sequence of newline-terminated fields, optionally
 *  preceded by a unix "From " line.  A newline in the position where a
 *  field would start sets yydone, after which yylex() returns 0.
 */
msg		: fields
		| unixfrom '\n' fields
		;
fields		: '\n'
			{ yydone = 1; }
		| field '\n'
		| field '\n' fields
		;
/*
 *  One header field.  The date, originator and destination globals are
 *  set to 1 when a field of that class is reduced.  The error alternative
 *  resynchronizes on a newline and then expects another field.
 */
field		: dates
			{ date = 1; }
		| originator
			{ originator = 1; }
		| destination
			{ destination = 1; }
		| subject
		| optional
		| ignored
		| received
		| precedence
		| error '\n' field
		;
/*
 *  Unix "From " line of the form
 *	From route_addr unix_date_time remote from word
 *  The address, date and trailing word are kept in usender, udate and
 *  usys; the three keyword nodes are freed.
 */
unixfrom	: FROM route_addr unix_date_time REMOTE FROM word
			{ freenode($1); freenode($4); freenode($5);
			  usender = $2; udate = $3; usys = $6;
			}
		;
/*
 *  Originator fields.  Each is linked into one node list (name, colon,
 *  body) and recorded by newfield() with its source argument set to 1.
 */
originator	: REPLY_TO ':' address_list
			{ newfield(link3($1, $2, $3), 1); }
		| RETURN_PATH ':' route_addr
			{ newfield(link3($1, $2, $3), 1); }
		| FROM ':' mailbox_list
			{ newfield(link3($1, $2, $3), 1); }
		| SENDER ':' mailbox
			{ newfield(link3($1, $2, $3), 1); }
		| RESENT_REPLY_TO ':' address_list
			{ newfield(link3($1, $2, $3), 1); }
		| RESENT_SENDER ':' mailbox
			{ newfield(link3($1, $2, $3), 1); }
		| RESENT_FROM ':' mailbox
			{ newfield(link3($1, $2, $3), 1); }
		;
/*
 *  Date fields; the body is an unstructured list of things.  Recorded
 *  with source set to 0.
 */
dates 		: DATE ':' date_time
			{ newfield(link3($1, $2, $3), 0); }
		| RESENT_DATE ':' date_time
			{ newfield(link3($1, $2, $3), 0); }
		;
/*
 *  Destination fields.  Each may have an empty body (name and colon
 *  only, linked with link2) or an address list (linked with link3).
 *  All are recorded with source set to 0.
 */
destination	: TO ':'
			{ newfield(link2($1, $2), 0); }
		| TO ':' address_list
			{ newfield(link3($1, $2, $3), 0); }
		| RESENT_TO ':'
			{ newfield(link2($1, $2), 0); }
		| RESENT_TO ':' address_list
			{ newfield(link3($1, $2, $3), 0); }
		| CC ':'
			{ newfield(link2($1, $2), 0); }
		| CC ':' address_list
			{ newfield(link3($1, $2, $3), 0); }
		| RESENT_CC ':'
			{ newfield(link2($1, $2), 0); }
		| RESENT_CC ':' address_list
			{ newfield(link3($1, $2, $3), 0); }
		| BCC ':'
			{ newfield(link2($1, $2), 0); }
		| BCC ':' address_list
			{ newfield(link3($1, $2, $3), 0); }
		| RESENT_BCC ':'
			{ newfield(link2($1, $2), 0); }
		| RESENT_BCC ':' address_list
			{ newfield(link3($1, $2, $3), 0); }
		;
/*
 *  Fields whose body is an unstructured list of things, possibly empty.
 *  All are recorded with source set to 0.
 */
subject		: SUBJECT ':' things
			{ newfield(link3($1, $2, $3), 0); }
		| SUBJECT ':'
			{ newfield(link2($1, $2), 0); }
		;
/* each Received field also bumps the received counter */
received	: RECEIVED ':' things
			{ newfield(link3($1, $2, $3), 0); received++; }
		| RECEIVED ':'
			{ newfield(link2($1, $2), 0); received++; }
		;
precedence	: PRECEDENCE ':' things
			{ newfield(link3($1, $2, $3), 0); }
		| PRECEDENCE ':'
			{ newfield(link2($1, $2), 0); }
		;
ignored		: ignoredhdr ':' things
			{ newfield(link3($1, $2, $3), 0); }
		| ignoredhdr ':'
			{ newfield(link2($1, $2), 0); }
		;
/* recognized field names with no special handling, except that MESSAGEID sets messageid */
ignoredhdr	: MIMEVERSION | CONTENTTYPE | MESSAGEID { messageid = 1; } | MAILER
		;
/*
 *  Any other field: a name made of fieldwords, a colon and an optional
 *  body.  Because the lexer does not distinguish field names from body
 *  text, the name is checked afterwards with badfieldname(); if it is
 *  rejected, the nodes are freed and the action returns 1 directly from
 *  the parser function.
 */
optional	: fieldwords ':' things
			{ /* hack to allow same lex for field names and the rest */
			 if(badfieldname($1)){
				freenode($1);
				freenode($2);
				freenode($3);
				return 1;
			 }
			 newfield(link3($1, $2, $3), 0);
			}
		| fieldwords ':'
			{ /* hack to allow same lex for field names and the rest */
			 if(badfieldname($1)){
				freenode($1);
				freenode($2);
				return 1;
			 }
			 newfield(link2($1, $2), 0);
			}
		;
/*
 *  Address syntax.  link2()/link3() chain tokens together as separate
 *  nodes; concat() merges tokens into a single node; address() sets the
 *  addr flag on the resulting node.
 */
address_list	: address
		| address_list ',' address
			{ $$ = link3($1, $2, $3); }
		;
address		: mailbox
		| group
		;
/* phrase ':' [address_list] ';' */
group		: phrase ':' address_list ';'
			{ $$ = link2($1, link3($2, $3, $4)); }
		| phrase ':' ';'
			{ $$ = link3($1, $2, $3); }
		;
mailbox_list	: mailbox
		| mailbox_list ',' mailbox
			{ $$ = link3($1, $2, $3); }
		;
mailbox		: route_addr
		| phrase brak_addr
			{ $$ = link2($1, $2); }
		| brak_addr
		;
/* an empty "<>" is replaced by the address produced by nobody() */
brak_addr	: '<' route_addr '>'
			{ $$ = link3($1, $2, $3); }
		| '<' '>'
			{ $$ = nobody($2); freenode($1); }
		;
/* a route and its address are merged into one address node */
route_addr	: route ':' at_addr
			{ $$ = address(concat($1, concat($2, $3))); }
		| addr_spec
		;
route		: '@' domain
			{ $$ = concat($1, $2); }
		| route ',' '@' domain
			{ $$ = concat($1, concat($2, concat($3, $4))); }
		;
addr_spec	: local_part
			{ $$ = address($1); }
		| at_addr
		;
/* at_addr is left recursive, so a@b@c is accepted and merged into one node */
at_addr		: local_part '@' domain
			{ $$ = address(concat($1, concat($2, $3)));}
		| at_addr '@' domain
			{ $$ = address(concat($1, concat($2, $3)));}
		;
local_part	: word
		;
domain		: word
		;
phrase		: word
		| phrase word
			{ $$ = link2($1, $2); }
		;
/*
 *  Unstructured text: any run of words and punctuation tokens, chained
 *  with link2().
 */
things		: thing
		| things thing
			{ $$ = link2($1, $2); }
		;
thing		: word | '<' | '>' | '@' | ':' | ';' | ','
		;
date_time	: things
		;
/*
 *  Date of a unix "From " line: six components, the fourth of which may
 *  contain colons.  The action links them in the order 1 3 2 6 4 5.
 */
unix_date_time	: word word word unix_time word word
			{ $$ = link3($1, $3, link3($2, $6, link2($4, $5))); }
		;
unix_time	: word
		| unix_time ':' word
			{ $$ = link3($1, $2, $3); }
		;
/* a keyword token appearing where a plain word is expected is treated as a word */
word		: WORD | DATE | RESENT_DATE | RETURN_PATH | FROM | SENDER
		| REPLY_TO | RESENT_FROM | RESENT_SENDER | RESENT_REPLY_TO
		| TO | CC | BCC | RESENT_TO | RESENT_CC | RESENT_BCC | REMOTE | SUBJECT
		| PRECEDENCE | MIMEVERSION | CONTENTTYPE | MESSAGEID | RECEIVED | MAILER
		;
/*
 *  Name of an optional field.  It must start with a WORD or a fieldword;
 *  later components may be any word or fieldword.  The result is vetted
 *  by badfieldname() in the optional rule.
 */
fieldwords	: fieldword
		| WORD
		| fieldwords fieldword
			{ $$ = link2($1, $2); }
		| fieldwords word
			{ $$ = link2($1, $2); }
		;
fieldword	: '<' | '>' | '@' | ';' | ','
		;
248 %%
249 
250 /*
251  *  Initialize the parsing.  Done once for each header field.
252  */
253 void
254 yyinit(char *p, int len)
255 {
256 	yybuffer = p;
257 	yylp = p;
258 	yyend = p + len;
259 	firstfield = lastfield = 0;
260 	received = 0;
261 }
262 
/*
 *  keywords identifying header fields we care about:
 *  pairs a field name with the token type yylex() returns for it
 */
typedef struct Keyword	Keyword;
struct Keyword {
	char	*rep;	/* field name; matched with cistrcmp() in yylex() */
	int	val;	/* token type returned when the name matches */
};
271 
/*
 *  Field names that we need to recognize.  yylex() scans the table in
 *  order and stops at the first entry whose val is WORD, so the final
 *  entry acts as the terminator and must remain last.
 *  NOTE(review): "return_path" is spelled with an underscore, unlike the
 *  hyphenated names around it -- confirm whether "return-path" was
 *  intended before changing it, since the RETURN_PATH rule only accepts
 *  a route_addr body.
 */
Keyword key[] = {
	{ "date", DATE },
	{ "resent-date", RESENT_DATE },
	{ "return_path", RETURN_PATH },
	{ "from", FROM },
	{ "sender", SENDER },
	{ "reply-to", REPLY_TO },
	{ "resent-from", RESENT_FROM },
	{ "resent-sender", RESENT_SENDER },
	{ "resent-reply-to", RESENT_REPLY_TO },
	{ "to", TO },
	{ "cc", CC },
	{ "bcc", BCC },
	{ "resent-to", RESENT_TO },
	{ "resent-cc", RESENT_CC },
	{ "resent-bcc", RESENT_BCC },
	{ "remote", REMOTE },
	{ "subject", SUBJECT },
	{ "precedence", PRECEDENCE },
	{ "mime-version", MIMEVERSION },
	{ "content-type", CONTENTTYPE },
	{ "message-id", MESSAGEID },
	{ "received", RECEIVED },
	{ "mailer", MAILER },
	{ "who-the-hell-cares", WORD }
};
299 
300 /*
301  *  Lexical analysis for an rfc822 header field.  Continuation lines
302  *  are handled in yywhite() when skipping over white space.
303  *
304  */
yylex(void)305 yylex(void)
306 {
307 	String *t;
308 	int quoting;
309 	int escaping;
310 	char *start;
311 	Keyword *kp;
312 	int c, d;
313 
314 /*	print("lexing\n"); /**/
315 	if(yylp >= yyend)
316 		return 0;
317 	if(yydone)
318 		return 0;
319 
320 	quoting = escaping = 0;
321 	start = yylp;
322 	yylval = malloc(sizeof(Node));
323 	yylval->white = yylval->s = 0;
324 	yylval->next = 0;
325 	yylval->addr = 0;
326 	yylval->start = yylp;
327 	for(t = 0; yylp < yyend; yylp++){
328 		c = *yylp & 0xff;
329 
330 		/* dump nulls, they can't be in header */
331 		if(c == 0)
332 			continue;
333 
334 		if(escaping) {
335 			escaping = 0;
336 		} else if(quoting) {
337 			switch(c){
338 			case '\\':
339 				escaping = 1;
340 				break;
341 			case '\n':
342 				d = (*(yylp+1))&0xff;
343 				if(d != ' ' && d != '\t'){
344 					quoting = 0;
345 					yylp--;
346 					continue;
347 				}
348 				break;
349 			case '"':
350 				quoting = 0;
351 				break;
352 			}
353 		} else {
354 			switch(c){
355 			case '\\':
356 				escaping = 1;
357 				break;
358 			case '(':
359 			case ' ':
360 			case '\t':
361 			case '\r':
362 				goto out;
363 			case '\n':
364 				if(yylp == start){
365 					yylp++;
366 /*					print("lex(c %c)\n", c); /**/
367 					yylval->end = yylp;
368 					return yylval->c = c;
369 				}
370 				goto out;
371 			case '@':
372 			case '>':
373 			case '<':
374 			case ':':
375 			case ',':
376 			case ';':
377 				if(yylp == start){
378 					yylp++;
379 					yylval->white = yywhite();
380 /*					print("lex(c %c)\n", c); /**/
381 					yylval->end = yylp;
382 					return yylval->c = c;
383 				}
384 				goto out;
385 			case '"':
386 				quoting = 1;
387 				break;
388 			default:
389 				break;
390 			}
391 		}
392 		if(t == 0)
393 			t = s_new();
394 		s_putc(t, c);
395 	}
396 out:
397 	yylval->white = yywhite();
398 	if(t) {
399 		s_terminate(t);
400 	} else				/* message begins with white-space! */
401 		return yylval->c = '\n';
402 	yylval->s = t;
403 	for(kp = key; kp->val != WORD; kp++)
404 		if(cistrcmp(s_to_c(t), kp->rep)==0)
405 			break;
406 /*	print("lex(%d) %s\n", kp->val-WORD, s_to_c(t)); /**/
407 	yylval->end = yylp;
408 	return yylval->c = kp->val;
409 }
410 
/*
 *  Error callback with the usual yacc yyerror signature.  The message is
 *  ignored and nothing is printed.
 */
void
yyerror(char *x)
{
	USED(x);	/* presumably marks x as used so the compiler does not complain */

	/*fprint(2, "parse err: %s\n", x);/**/
}
418 
419 /*
420  *  parse white space and comments
421  */
422 String *
yywhite(void)423 yywhite(void)
424 {
425 	String *w;
426 	int clevel;
427 	int c;
428 	int escaping;
429 
430 	escaping = clevel = 0;
431 	for(w = 0; yylp < yyend; yylp++){
432 		c = *yylp & 0xff;
433 
434 		/* dump nulls, they can't be in header */
435 		if(c == 0)
436 			continue;
437 
438 		if(escaping){
439 			escaping = 0;
440 		} else if(clevel) {
441 			switch(c){
442 			case '\n':
443 				/*
444 				 *  look for multiline fields
445 				 */
446 				if(*(yylp+1)==' ' || *(yylp+1)=='\t')
447 					break;
448 				else
449 					goto out;
450 			case '\\':
451 				escaping = 1;
452 				break;
453 			case '(':
454 				clevel++;
455 				break;
456 			case ')':
457 				clevel--;
458 				break;
459 			}
460 		} else {
461 			switch(c){
462 			case '\\':
463 				escaping = 1;
464 				break;
465 			case '(':
466 				clevel++;
467 				break;
468 			case ' ':
469 			case '\t':
470 			case '\r':
471 				break;
472 			case '\n':
473 				/*
474 				 *  look for multiline fields
475 				 */
476 				if(*(yylp+1)==' ' || *(yylp+1)=='\t')
477 					break;
478 				else
479 					goto out;
480 			default:
481 				goto out;
482 			}
483 		}
484 		if(w == 0)
485 			w = s_new();
486 		s_putc(w, c);
487 	}
488 out:
489 	if(w)
490 		s_terminate(w);
491 	return w;
492 }
493 
494 /*
495  *  link two parsed entries together
496  */
497 Node*
link2(Node * p1,Node * p2)498 link2(Node *p1, Node *p2)
499 {
500 	Node *p;
501 
502 	for(p = p1; p->next; p = p->next)
503 		;
504 	p->next = p2;
505 	return p1;
506 }
507 
508 /*
509  *  link three parsed entries together
510  */
511 Node*
link3(Node * p1,Node * p2,Node * p3)512 link3(Node *p1, Node *p2, Node *p3)
513 {
514 	Node *p;
515 
516 	for(p = p2; p->next; p = p->next)
517 		;
518 	p->next = p3;
519 
520 	for(p = p1; p->next; p = p->next)
521 		;
522 	p->next = p2;
523 
524 	return p1;
525 }
526 
527 /*
528  *  make a:b, move all white space after both
529  */
530 Node*
colon(Node * p1,Node * p2)531 colon(Node *p1, Node *p2)
532 {
533 	if(p1->white){
534 		if(p2->white)
535 			s_append(p1->white, s_to_c(p2->white));
536 	} else {
537 		p1->white = p2->white;
538 		p2->white = 0;
539 	}
540 
541 	s_append(p1->s, ":");
542 	if(p2->s)
543 		s_append(p1->s, s_to_c(p2->s));
544 
545 	if(p1->end < p2->end)
546 		p1->end = p2->end;
547 	freenode(p2);
548 	return p1;
549 }
550 
551 /*
552  *  concatenate two fields, move all white space after both
553  */
554 Node*
concat(Node * p1,Node * p2)555 concat(Node *p1, Node *p2)
556 {
557 	char buf[2];
558 
559 	if(p1->white){
560 		if(p2->white)
561 			s_append(p1->white, s_to_c(p2->white));
562 	} else {
563 		p1->white = p2->white;
564 		p2->white = 0;
565 	}
566 
567 	if(p1->s == nil){
568 		buf[0] = p1->c;
569 		buf[1] = 0;
570 		p1->s = s_new();
571 		s_append(p1->s, buf);
572 	}
573 
574 	if(p2->s)
575 		s_append(p1->s, s_to_c(p2->s));
576 	else {
577 		buf[0] = p2->c;
578 		buf[1] = 0;
579 		s_append(p1->s, buf);
580 	}
581 
582 	if(p1->end < p2->end)
583 		p1->end = p2->end;
584 	freenode(p2);
585 	return p1;
586 }
587 
588 /*
589  *  look for disallowed chars in the field name
590  */
591 int
badfieldname(Node * p)592 badfieldname(Node *p)
593 {
594 	for(; p; p = p->next){
595 		/* field name can't contain white space */
596 		if(p->white && p->next)
597 			return 1;
598 	}
599 	return 0;
600 }
601 
602 /*
603  *  mark as an address
604  */
605 Node *
address(Node * p)606 address(Node *p)
607 {
608 	p->addr = 1;
609 	return p;
610 }
611 
/*
 *  case independent string compare
 *
 *  Returns 0 if s1 and s2 are equal ignoring case.  Any non-zero
 *  result means "different"; the value carries no ordering.
 *
 *  Bytes are converted to unsigned char before being handed to
 *  tolower(), since header text may contain bytes with the high bit set
 *  and the ctype functions are only defined for unsigned char values
 *  and EOF.
 */
int
cistrcmp(char *s1, char *s2)
{
	int c1, c2;

	for(; *s1; s1++, s2++){
		c1 = tolower((unsigned char)*s1);
		c2 = tolower((unsigned char)*s2);
		/* also catches s2 ending first: c2 is then 0 and c1 is not */
		if(c1 != c2)
			return -1;
	}
	/* s1 is exhausted; equal only if s2 is exhausted too */
	return *s2;
}
628 
629 /*
630  *  free a node
631  */
632 void
freenode(Node * p)633 freenode(Node *p)
634 {
635 	Node *tp;
636 
637 	while(p){
638 		tp = p->next;
639 		if(p->s)
640 			s_free(p->s);
641 		if(p->white)
642 			s_free(p->white);
643 		free(p);
644 		p = tp;
645 	}
646 }
647 
648 
649 /*
650  *  an anonymous user
651  */
652 Node*
nobody(Node * p)653 nobody(Node *p)
654 {
655 	if(p->s)
656 		s_free(p->s);
657 	p->s = s_copy("pOsTmAsTeR");
658 	p->addr = 1;
659 	return p;
660 }
661 
662 /*
663  *  add anything that was dropped because of a parse error
664  */
665 void
missing(Node * p)666 missing(Node *p)
667 {
668 	Node *np;
669 	char *start, *end;
670 	Field *f;
671 	String *s;
672 
673 	start = yybuffer;
674 	if(lastfield != nil){
675 		for(np = lastfield->node; np; np = np->next)
676 			start = np->end+1;
677 	}
678 
679 	end = p->start-1;
680 
681 	if(end <= start)
682 		return;
683 
684 	if(strncmp(start, "From ", 5) == 0)
685 		return;
686 
687 	np = malloc(sizeof(Node));
688 	np->start = start;
689 	np->end = end;
690 	np->white = nil;
691 	s = s_copy("BadHeader: ");
692 	np->s = s_nappend(s, start, end-start);
693 	np->next = nil;
694 
695 	f = malloc(sizeof(Field));
696 	f->next = 0;
697 	f->node = np;
698 	f->source = 0;
699 	if(firstfield)
700 		lastfield->next = f;
701 	else
702 		firstfield = f;
703 	lastfield = f;
704 }
705 
706 /*
707  *  create a new field
708  */
709 void
newfield(Node * p,int source)710 newfield(Node *p, int source)
711 {
712 	Field *f;
713 
714 	missing(p);
715 
716 	f = malloc(sizeof(Field));
717 	f->next = 0;
718 	f->node = p;
719 	f->source = source;
720 	if(firstfield)
721 		lastfield->next = f;
722 	else
723 		firstfield = f;
724 	lastfield = f;
725 	endfield = startfield;
726 	startfield = yylp;
727 }
728 
729 /*
730  *  fee a list of fields
731  */
732 void
freefield(Field * f)733 freefield(Field *f)
734 {
735 	Field *tf;
736 
737 	while(f){
738 		tf = f->next;
739 		freenode(f->node);
740 		free(f);
741 		f = tf;
742 	}
743 }
744 
745 /*
746  *  add some white space to a node
747  */
748 Node*
whiten(Node * p)749 whiten(Node *p)
750 {
751 	Node *tp;
752 
753 	for(tp = p; tp->next; tp = tp->next)
754 		;
755 	if(tp->white == 0)
756 		tp->white = s_copy(" ");
757 	return p;
758 }
759 
760 void
yycleanup(void)761 yycleanup(void)
762 {
763 	Field *f, *fnext;
764 	Node *np, *next;
765 
766 	for(f = firstfield; f; f = fnext){
767 		for(np = f->node; np; np = next){
768 			if(np->s)
769 				s_free(np->s);
770 			if(np->white)
771 				s_free(np->white);
772 			next = np->next;
773 			free(np);
774 		}
775 		fnext = f->next;
776 		free(f);
777 	}
778 	firstfield = lastfield = 0;
779 }
780