xref: /netbsd-src/usr.bin/mail/mime_header.c (revision 8b0f9554ff8762542c4defc4f70e1eb76fb508fa)
1 /*	$NetBSD: mime_header.c,v 1.4 2007/10/23 14:58:44 christos Exp $	*/
2 
3 /*-
4  * Copyright (c) 2006 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Anon Ymous.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *        This product includes software developed by the NetBSD
21  *        Foundation, Inc. and its contributors.
22  * 4. Neither the name of The NetBSD Foundation nor the names of its
23  *    contributors may be used to endorse or promote products derived
24  *    from this software without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
27  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
28  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
30  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36  * POSSIBILITY OF SUCH DAMAGE.
37  */
38 
39 
40 /*
41  * This module contains the core MIME header decoding routines.
42  * Please refer to RFC 2047 and RFC 2822.
43  */
44 
45 #ifdef MIME_SUPPORT
46 
47 #include <sys/cdefs.h>
48 #ifndef __lint__
49 __RCSID("$NetBSD: mime_header.c,v 1.4 2007/10/23 14:58:44 christos Exp $");
50 #endif /* not __lint__ */
51 
52 #include <stdio.h>
53 #include <stdlib.h>
54 #include <string.h>
55 
56 #include "def.h"
57 #include "extern.h"
58 #include "mime.h"
59 #include "mime_header.h"
60 #include "mime_codecs.h"
61 
/*
 * Size-checked front end to mime_b64tobin().
 *
 * Refuses to decode (returning -1) unless 'outlen' is at least three
 * bytes for every group of four input characters, with 'inlen' rounded
 * up to a whole number of groups.  Otherwise the result of
 * mime_b64tobin() is passed straight back to the caller.
 *
 * XXX - This should move to mime_codecs.c.
 */
static ssize_t
mime_B64_decode(char *outbuf, size_t outlen, const char *inbuf, size_t inlen)
{
	size_t needed;

	needed = 3 * roundup(inlen, 4) / 4;
	if (needed > outlen)
		return -1;

	return mime_b64tobin(outbuf, inbuf, inlen);
}
75 
76 
/*
 * Value of a single hexadecimal digit, or -1 if 'c' is not one.
 */
static inline int
qph_hexval(int c)
{
	if (c >= '0' && c <= '9')
		return c - '0';
	if (c >= 'a' && c <= 'f')
		return c - 'a' + 10;
	if (c >= 'A' && c <= 'F')
		return c - 'A' + 10;
	return -1;
}

/*
 * Header specific "quoted-printable" decode!
 * Differences with body QP decoding (see rfc 2047, sec 4.2):
 * 1) '=' occurs _only_ when followed by two hex digits (FWS is not allowed).
 * 2) Spaces can be encoded as '_' in headers for readability.
 *
 * Decodes 'inlen' bytes from 'inbuf' into 'outbuf', which has room for
 * 'outlen' bytes.  No terminating NUL is written.
 *
 * Returns the number of bytes stored, or -1 if the output buffer is too
 * small or an '=' is not followed by exactly two hexadecimal digits.
 *
 * XXX - This should move to mime_codecs.c.
 */
static ssize_t
mime_QPh_decode(char *outbuf, size_t outlen, const char *inbuf, size_t inlen)
{
	const char *p, *inend;
	char *q, *outend;

	outend = outbuf + outlen;
	inend = inbuf + inlen;
	q = outbuf;
	for (p = inbuf; p < inend; p++) {
		if (q >= outend)
			return -1;
		if (*p == '=') {
			int hi, lo;

			/* need the '=' plus two more characters */
			if (inend - p < 3)
				return -1;
			/*
			 * Only bare hex digits are acceptable here; a
			 * general number parser would also swallow a
			 * leading sign or white space.
			 */
			hi = qph_hexval((unsigned char)p[1]);
			lo = qph_hexval((unsigned char)p[2]);
			if (hi < 0 || lo < 0)
				return -1;
			*q++ = (char)((hi << 4) | lo);
			p += 2;	/* the loop increment steps past the 2nd digit */
		}
		else if (*p == '_')  /* headers may encode ' ' as '_' */
			*q++ = ' ';
		else
			*q++ = *p;
	}
	return q - outbuf;
}
122 
/*
 * Copy the charset name that starts at 'p' and runs up to the next '?'
 * into 'from_cs' (capacity 'from_cs_len', including the NUL).
 *
 * Returns a pointer to the character following the '?', or NULL if the
 * string ends before a '?' is seen or the name does not fit.
 */
static const char *
grab_charset(char *from_cs, size_t from_cs_len, const char *p)
{
	char *dst = from_cs;
	char *limit = from_cs + from_cs_len - 1;  /* keep room for the NUL */

	while (*p != '?') {
		if (*p == '\0' || dst >= limit)
			return NULL;
		*dst++ = *p++;
	}
	*dst = '\0';
	return p + 1;	/* step over the '?' */
}
136 
137 /*
138  * An encoded word is a string of at most 75 non-white space
139  * characters of the following form:
140  *
141  *  =?charset?X?encoding?=
142  *
143  * where:
144  *   'charset'	is the original character set of the unencoded string.
145  *
146  *   'X'	is the encoding type 'B' or 'Q' for "base64" or
147  *              "quoted-printable", respectively,
148  *   'encoding'	is the encoded string.
149  *
150  * Both 'charset' and 'X' are case independent and 'encoding' cannot
151  * contain any whitespace or '?' characters.  The 'encoding' must also
152  * be fully contained within the encoded words, i.e., it cannot be
153  * split between encoded words.
154  *
155  * Note: the 'B' encoding is a slightly modified "quoted-printable"
156  * encoding.  In particular, spaces (' ') may be encoded as '_' to
157  * improve undecoded readability.
158  */
159 static int
160 decode_word(const char **ibuf, char **obuf, char *oend, const char *to_cs)
161 {
162 	ssize_t declen;
163 	size_t enclen, dstlen;
164 	char decword[LINESIZE];
165 	char from_cs[LINESIZE];
166 	const char *encword, *iend, *p;
167 	char *dstend;
168 	char enctype;
169 
170 	p = *ibuf;
171 	if (p[0] != '=' && p[1] != '?')
172 		return -1;
173 	if (strlen(p) <  2 + 1 + 3 + 1 + 2)
174 		return -1;
175 	p = grab_charset(from_cs, sizeof(from_cs), p + 2);
176 	if (p == NULL)
177 		return -1;
178 	enctype = *p++;
179 	if (*p++ != '?')
180 		return -1;
181 	encword = p;
182 	p = strchr(p, '?');
183 	if (p == NULL || p[1] != '=')
184 		return -1;
185 	enclen = p - encword;	/* length of encoded substring */
186 	iend = p + 2;
187 	/* encoded words are at most 75 characters (RFC 2047, sec 2) */
188 	if (iend > *ibuf + 75)
189 		return -1;
190 
191 	dstend = to_cs ? decword : *obuf;
192 	dstlen = (to_cs ? sizeof(decword): oend - *obuf) - 1;
193 
194 	if (enctype == 'B' || enctype == 'b')
195 		declen = mime_B64_decode(dstend, dstlen, encword, enclen);
196 	else if (enctype == 'Q' || enctype == 'q')
197 		declen = mime_QPh_decode(dstend, dstlen, encword, enclen);
198 	else
199 		return -1;
200 
201 	if (declen == -1)
202 		return -1;
203 
204 	dstend += declen;
205 #ifdef CHARSET_SUPPORT
206 	if (to_cs != NULL) {
207 		iconv_t cd;
208 		const char *src;
209 		size_t srclen;
210 		size_t cnt;
211 
212 		cd = iconv_open(to_cs, from_cs);
213 		if (cd == (iconv_t)-1)
214 			return -1;
215 
216 		src = decword;
217 		srclen = declen;
218 		dstend = *obuf;
219 		dstlen = oend - *obuf - 1;
220 		cnt = mime_iconv(cd, &src, &srclen, &dstend, &dstlen);
221 
222 		(void)iconv_close(cd);
223 		if (cnt == (size_t)-1)
224 			return -1;
225 	}
226 #endif /* CHARSET_SUPPORT */
227 	*dstend = '\0';
228 	*ibuf = iend;
229 	*obuf = dstend;
230 	return 0;
231 }
232 
233 
234 /*
235  * Folding White Space.  See RFC 2822.
236  *
237  * Note: RFC 2822 specifies that '\n' and '\r' only occur as CRLF
238  * pairs (i.e., "\r\n") and never separately.  However, by the time
239  * mail(1) sees the messages, all CRLF pairs have been converted to
240  * '\n' characters.
241  *
242  * XXX - pull is_FWS() and skip_FWS() up to def.h?
243  */
/*
 * True (1) if 'c' is a space, a tab or a newline; otherwise 0.
 */
static inline int
is_FWS(int c)
{
	switch (c) {
	case ' ':
	case '\t':
	case '\n':
		return 1;
	default:
		return 0;
	}
}
249 
/*
 * Return a pointer to the first character at or after 'p' that is not
 * folding white space.  The terminating NUL is not white space, so the
 * scan stops there at the latest.
 */
static inline const char *
skip_FWS(const char *p)
{
	for (/*EMPTY*/; is_FWS(*p); p++)
		continue;
	return p;
}
257 
/*
 * Flush pending input to the output.
 *
 * If *src is non-NULL, copy the bytes in [*src, srcend) to *dst, never
 * writing at or beyond 'dstend', then advance *dst past what was
 * written and reset *src to NULL.  If *src is NULL nothing happens.
 */
static inline void
copy_skipped_FWS(char **dst, char *dstend, const char **src, const char *srcend)
{
	const char *from = *src;
	char *to;

	if (from == NULL)
		return;		/* nothing pending */

	for (to = *dst; from < srcend && to < dstend; from++, to++)
		*to = *from;

	*dst = to;
	*src = NULL;
}
276 
277 /*
278  * Decode an unstructured field.
279  *
280  * See RFC 2822 Sec 2.2.1 and 3.6.5.
281  * Encoded words may occur anywhere in unstructured fields provided
282  * they are separated from any other text or encoded words by at least
283  * one linear-white-space character. (See RFC 2047 sec 5.1.)  If two
284  * encoded words occur sequentially (separated by only FWS) then the
285  * separating FWS is removed.
286  *
287  * NOTE: unstructured fields cannot contain 'quoted-pairs' (see
288  * RFC2822 sec 3.2.6 and RFC 2047), but that is no problem as a '\\'
289  * (or any non-whitespace character) immediately before an
290  * encoded-word will prevent it from being decoded.
291  *
292  * hstring should be a NULL terminated string.
293  * outbuf should be sufficiently large to hold the result.
294  */
295 static void
296 mime_decode_usfield(char *outbuf, size_t outsize, const char *hstring)
297 {
298 	const char *p, *p0;
299 	char *q, *qend;
300 	int lastc;
301 	const char *charset;
302 
303 	charset = value(ENAME_MIME_CHARSET);
304 	qend = outbuf + outsize - 1; /* Make sure there is room for the trailing NULL! */
305 	q = outbuf;
306 	p = hstring;
307 	p0 = NULL;
308 	lastc = (unsigned char)' ';
309 	while (*p && q < qend) {
310 		const char *p1;
311 		char *q1;
312 		if (is_FWS(lastc) && p[0] == '=' && p[1] == '?' &&
313 		    decode_word((p1 = p, &p1), (q1 = q, &q1), qend, charset) == 0 &&
314 		    (*p1 == '\0' || is_FWS(*p1))) {
315 			p0 = p1;  /* pointer to first character after encoded word */
316 			q = q1;
317 			p = skip_FWS(p1);
318 			lastc = (unsigned char)*p0;
319 		}
320 		else {
321 			copy_skipped_FWS(&q, qend, &p0, p);
322 			lastc = (unsigned char)*p;
323 			if (q < qend)
324 				*q++ = *p++;
325 		}
326 	}
327 	copy_skipped_FWS(&q, qend, &p0, p);
328 	*q = '\0';
329 }
330 
/*
 * Decode a field comment.
 *
 * Comments only occur in structured fields, can be nested (rfc 2822,
 * sec 3.2.3), and can contain 'encoded-words' and 'quoted-pairs'.
 * Otherwise, they can be regarded as unstructured fields that are
 * bounded by '(' and ')' characters.
 *
 * Decoding starts at *ibuf and stops after the matching ')' has been
 * copied, or when either the input ('iend') or the output ('oend') is
 * exhausted.  *ibuf and *obuf are updated to where decoding stopped.
 * A quoted-pair keeps its backslash only in front of '(', ')' or '\\'.
 *
 * Returns 0; -1 is only passed up from a nested call.
 */
static int
decode_comment(char **obuf, char *oend, const char **ibuf, const char *iend, const char *charset)
{
	const char *in = *ibuf;
	const char *skipped = NULL;	/* FWS held back after an encoded word */
	char *out = *obuf;
	int prev = ' ';

	while (in < iend && out < oend) {
		const char *word_in = in;
		char *word_out = out;

		if (is_FWS(prev) && in[0] == '=' && in[1] == '?' &&
		    decode_word(&word_in, &word_out, oend, charset) == 0 &&
		    (*word_in == ')' || is_FWS(*word_in))) {
			prev = (unsigned char)*word_in;
			skipped = word_in;
			out = word_out;
			in = skip_FWS(word_in);
			/*
			 * XXX - this check should be unnecessary as *iend
			 * should be '\0' which will stop skip_FWS()
			 */
			if (in > iend)
				in = iend;
			continue;
		}

		copy_skipped_FWS(&out, oend, &skipped, in);
		if (out >= oend)	/* XXX - out > oend cannot happen */
			break;

		if (*in == ')') {
			*out++ = *in++;	/* copy the closing ')' */
			break;		/* and we are done */
		}

		if (*in == '(') {
			*out++ = *in++;	/* copy the opening '(' */
			if (decode_comment(&out, oend, &in, iend, charset) == -1)
				return -1;	/* is this right or should we update? */
			prev = ')';
		}
		else if (*in == '\\' && in + 1 < iend) {	/* quoted-pair */
			/* the backslash survives only where it is needed */
			if (in[1] == '(' || in[1] == ')' || in[1] == '\\')
				*out++ = *in;
			in++;
			prev = (unsigned char)*in;
			if (out < oend)
				*out++ = *in++;
		}
		else {
			prev = (unsigned char)*in;
			*out++ = *in++;
		}
	}
	*ibuf = in;
	*obuf = out;
	return 0;
}
404 
/*
 * Decode a quoted-string or no-fold-quote.
 *
 * These cannot contain encoded words.  They can contain quoted-pairs,
 * making '\\' special.  They have no other structure.  See RFC 2822
 * sec 3.2.5 and 3.6.4.
 *
 * Copying starts at *ibuf and stops after the closing '"' has been
 * copied, or when the input ('iend') or output ('oend') runs out.  The
 * backslash of a quoted-pair is kept only in front of '"' or '\\'.
 * *ibuf and *obuf are updated to where copying stopped.
 */
static void
decode_quoted_string(char **obuf, char *oend, const char **ibuf, const char *iend)
{
	const char *in = *ibuf;
	char *out = *obuf;

	while (in < iend && out < oend) {
		char c = *in;

		if (c == '"') {
			*out++ = *in++;	/* the closing '"' ends the string */
			break;
		}
		if (c == '\\' && iend - in > 1) {	/* quoted-pair */
			char quoted = in[1];

			if (quoted == '"' || quoted == '\\') {
				*out++ = c;
				if (out >= oend)
					break;
			}
			in++;
		}
		*out++ = *in++;
	}
	*ibuf = in;
	*obuf = out;
}
440 
/*
 * Decode a domain-literal or no-fold-literal.
 *
 * These cannot contain encoded words.  They can have quoted pairs and
 * are delimited by '[' and ']' making '\\', '[', and ']' special.
 * They have no other structure.  See RFC 2822 sec 3.4.1 and 3.6.4.
 *
 * Copying starts at *ibuf and stops after the closing ']' has been
 * copied, or when the input ('iend') or output ('oend') runs out.  The
 * backslash of a quoted-pair is kept only in front of '[', ']' or
 * '\\'.  *ibuf and *obuf are updated to where copying stopped.
 */
static void
decode_domain_literal(char **obuf, char *oend, const char **ibuf, const char *iend)
{
	const char *in = *ibuf;
	char *out = *obuf;

	while (in < iend && out < oend) {
		char c = *in;

		if (c == ']') {
			*out++ = *in++;	/* the closing ']' ends the literal */
			break;
		}
		if (c == '\\' && iend - in > 1) {	/* quoted-pair */
			char quoted = in[1];

			if (quoted == '[' || quoted == ']' || quoted == '\\') {
				*out++ = c;
				if (out >= oend)
					break;
			}
			in++;
		}
		*out++ = *in++;
	}
	*ibuf = in;
	*obuf = out;
}
476 
/*
 * Specials: see RFC 2822 sec 3.2.1.
 *
 * Returns 1 if 'c' is one of
 *	" ( ) , . : ; < > @ [ \ ]
 * and 0 for every other value, including anything outside 0..127.
 */
static inline int
is_specials(int c)
{
	switch (c) {
	case '"':
	case '(':
	case ')':
	case ',':
	case '.':
	case ':':
	case ';':
	case '<':
	case '>':
	case '@':
	case '[':
	case '\\':
	case ']':
		return 1;
	default:
		return 0;
	}
}
496 
497 /*
498  * Decode a structured field.
499  *
500  * At the top level, structured fields can only contain encoded-words
501  * via 'phrases' and 'comments'.  See RFC 2047 sec 5.
502  */
503 static void
504 mime_decode_sfield(char *linebuf, size_t bufsize, const char *hstring)
505 {
506 	const char *p, *pend, *p0;
507 	char *q, *qend;
508 	const char *charset;
509 	int lastc;
510 
511 	charset = value(ENAME_MIME_CHARSET);
512 
513 	p = hstring;
514 	q = linebuf;
515 	pend = hstring + strlen(hstring);
516 	qend = linebuf + bufsize - 1;	/* save room for the NULL terminator */
517 	lastc = (unsigned char)' ';
518 	p0 = NULL;
519 	while (p < pend && q < qend) {
520 		const char *p1;
521 		char *q1;
522 
523 		if (*p != '=') {
524 			copy_skipped_FWS(&q, qend, &p0, p);
525 			if (q >= qend)
526 				break;
527 		}
528 
529 		switch (*p) {
530 		case '(':	/* start of comment */
531 			*q++ = *p++;	/* copy the opening '(' */
532 			(void)decode_comment(&q, qend, &p, pend, charset);
533 			lastc = (unsigned char)p[-1];
534 			break;
535 
536 		case '"':	/* start of quoted-string or no-fold-quote */
537 			*q++ = *p++;	/* copy the opening '"' */
538 			decode_quoted_string(&q, qend, &p, pend);
539 			lastc = (unsigned char)p[-1];
540 			break;
541 
542 		case '[':	/* start of domain-literal or no-fold-literal */
543 			*q++ = *p++;	/* copy the opening '[' */
544 			decode_domain_literal(&q, qend, &p, pend);
545 			lastc = (unsigned char)p[-1];
546 			break;
547 
548 		case '\\':	/* start of quoted-pair */
549 			if (p + 1 < pend) {		/* quoted pair */
550 				if (is_specials(p[1])) {
551 					*q++ = *p;
552 					if (q >= qend)
553 						break;
554 				}
555 				p++;	/* skip the '\\' */
556 			}
557 			goto copy_char;
558 
559 		case '=':
560 			/*
561 			 * At this level encoded words can appear via
562 			 * 'phrases' (possibly delimited by ',' as in
563 			 * 'keywords').  Thus we handle them as such.
564 			 * Hopefully this is sufficient.
565 			 */
566 			if ((lastc == ',' || is_FWS(lastc)) && p[1] == '?' &&
567 			    decode_word((p1 = p, &p1), (q1 = q, &q1), qend, charset) == 0 &&
568 			    (*p1 == '\0' || *p1 == ',' || is_FWS(*p1))) {
569 				lastc = (unsigned char)*p1;
570 				p0 = p1;
571 				q = q1;
572 				p = skip_FWS(p1);
573 				/*
574 				 * XXX - this check should be
575 				 * unnecessary as *pend should be '\0'
576 				 * which will stop skip_FWS()
577 				 */
578 				if (p > pend)
579 					p = pend;
580 				break;
581 			}
582 			else {
583 				copy_skipped_FWS(&q, qend, &p0, p);
584 				if (q >= qend)
585 					break;
586 				goto copy_char;
587 			}
588 
589 		case '<':	/* start of angle-addr, msg-id, or path. */
590 			/*
591 			 * A msg-id cannot contain encoded-pairs or
592 			 * encoded-words, but angle-addr and path can.
593 			 * Distinguishing between them seems to be
594 			 * unnecessary, so let's be loose and just
595 			 * decode them as if they were all the same.
596 			 */
597 		default:
598 	copy_char:
599 			lastc = (unsigned char)*p;
600 			*q++ = *p++;
601 			break;
602 		}
603 	}
604 	copy_skipped_FWS(&q, qend, &p0, p);
605 	*q = '\0';	/* null terminate the result! */
606 }
607 
608 
609 /*
610  * Returns the correct hfield decoder, or NULL if none.
611  * Info extracted from RFC 2822.
612  */
613 PUBLIC hfield_decoder_t
614 mime_hfield_decoder(char *name)
615 {
616 	static const struct field_decoder_tbl_s {
617 		const char *field_name;
618 		hfield_decoder_t decoder;
619 	} field_decoder_tbl[] = {
620 		{ "Received:",			NULL },
621 		{ "Content-Type:",		NULL },
622 		{ "Content-Disposition:",	NULL },
623 		{ "Content-Transfer-Encoding:",	NULL },
624 		{ "Content-Description:",	mime_decode_sfield },
625 		{ "Content-ID:",		mime_decode_sfield },
626 		{ "MIME-Version:",		mime_decode_sfield },
627 		{ "Bcc:",			mime_decode_sfield },
628 		{ "Cc:",			mime_decode_sfield },
629 		{ "Date:",			mime_decode_sfield },
630 		{ "From:",			mime_decode_sfield },
631 		{ "In-Reply-To:",		mime_decode_sfield },
632 		{ "Keywords:",			mime_decode_sfield },
633 		{ "Message-ID:",		mime_decode_sfield },
634 		{ "References:",		mime_decode_sfield },
635 		{ "Reply-To:",			mime_decode_sfield },
636 		{ "Return-Path:",		mime_decode_sfield },
637 		{ "Sender:",			mime_decode_sfield },
638 		{ "To:",			mime_decode_sfield },
639 		{ "Subject:",			mime_decode_usfield },
640 		{ "Comments:",			mime_decode_usfield },
641 		{ "X-",				mime_decode_usfield },
642 		{ NULL,				mime_decode_usfield },	/* optional-fields */
643 	};
644 	const struct field_decoder_tbl_s *fp;
645 
646 	/* XXX - this begs for a hash table! */
647 	for (fp = field_decoder_tbl; fp->field_name; fp++)
648 		if (strncasecmp(name, fp->field_name, strlen(fp->field_name)) == 0)
649 			return fp->decoder;
650 	return fp->decoder;
651 }
652 
653 #endif /* MIME_SUPPORT */
654