xref: /netbsd-src/usr.bin/mail/mime_header.c (revision c8a35b6227034951e874c2def577388e79ede4a5)
1 /*	$NetBSD: mime_header.c,v 1.7 2009/01/18 01:29:57 lukem Exp $	*/
2 
3 /*-
4  * Copyright (c) 2006 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Anon Ymous.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 
33 /*
34  * This module contains the core MIME header decoding routines.
35  * Please refer to RFC 2047 and RFC 2822.
36  */
37 
38 #ifdef MIME_SUPPORT
39 
40 #include <sys/cdefs.h>
41 #ifndef __lint__
42 __RCSID("$NetBSD: mime_header.c,v 1.7 2009/01/18 01:29:57 lukem Exp $");
43 #endif /* not __lint__ */
44 
45 #include <stdio.h>
46 #include <stdlib.h>
47 #include <string.h>
48 
49 #include "def.h"
50 #include "extern.h"
51 #include "mime.h"
52 #include "mime_header.h"
53 #include "mime_codecs.h"
54 
/*
 * Bounds-checked front end to mime_b64tobin().
 *
 * Base64 turns every group of four input characters into at most
 * three output bytes, so the worst-case decoded size is computed from
 * inlen rounded up to a whole group.  If outbuf (outlen bytes) cannot
 * hold that much, -1 is returned and nothing is decoded; otherwise
 * the value returned by mime_b64tobin() is passed back to the caller.
 *
 * XXX - This should move to mime_codecs.c.
 */
static ssize_t
mime_B64_decode(char *outbuf, size_t outlen, const char *inbuf, size_t inlen)
{
	const size_t worst_case = 3 * roundup(inlen, 4) / 4;

	if (worst_case > outlen)
		return -1;

	return mime_b64tobin(outbuf, inbuf, inlen);
}
68 
69 
/*
 * Return the value (0-15) of the hexadecimal digit c, or -1 if c is
 * not a hexadecimal digit.
 */
static int
qph_hexval(int c)
{
	if (c >= '0' && c <= '9')
		return c - '0';
	if (c >= 'a' && c <= 'f')
		return c - 'a' + 10;
	if (c >= 'A' && c <= 'F')
		return c - 'A' + 10;
	return -1;
}

/*
 * Header specific "quoted-printable" decode!
 * Differences with body QP decoding (see rfc 2047, sec 4.2):
 * 1) '=' occurs _only_ when followed by two hex digits (FWS is not allowed).
 * 2) Spaces can be encoded as '_' in headers for readability.
 *
 * Decodes the inlen bytes at inbuf into outbuf, which has room for
 * outlen bytes.  The output is not NUL terminated.
 *
 * Returns the number of bytes stored in outbuf, or -1 if the output
 * does not fit or if an '=' is not followed by exactly two hexadecimal
 * digits that both lie inside the input.
 *
 * XXX - This should move to mime_codecs.c.
 */
static ssize_t
mime_QPh_decode(char *outbuf, size_t outlen, const char *inbuf, size_t inlen)
{
	const char *p, *inend;
	char *outend;
	char *q;

	outend = outbuf + outlen;
	inend = inbuf + inlen;
	q = outbuf;
	for (p = inbuf; p < inend; p++) {
		if (q >= outend)
			return -1;
		if (*p == '=') {
			int hi, lo;

			/* '=' plus both digits must lie inside the input */
			if (inend - p < 3)
				return -1;
			/*
			 * Check each digit explicitly: strtol() would also
			 * accept leading white space and a sign, letting
			 * things like "= 5" or "=-1" through.
			 */
			hi = qph_hexval((unsigned char)p[1]);
			lo = qph_hexval((unsigned char)p[2]);
			if (hi < 0 || lo < 0)
				return -1;
			*q++ = (char)((hi << 4) | lo);
			p += 2;	/* the loop increment steps past the 2nd digit */
		}
		else if (*p == '_')  /* headers may encode ' ' as '_' */
			*q++ = ' ';
		else
			*q++ = *p;
	}
	return q - outbuf;
}
115 
/*
 * Copy the charset name that starts at p, and is terminated by '?',
 * into from_cs (a buffer of from_cs_len bytes) as a NUL terminated
 * string.
 *
 * Returns a pointer to the character following the '?', or NULL if
 * the input ends before a '?' is seen or the name does not fit.
 */
static const char *
grab_charset(char *from_cs, size_t from_cs_len, const char *p)
{
	size_t n = 0;

	while (*p != '?') {
		/* always keep one byte free for the terminating NUL */
		if (*p == '\0' || n + 1 >= from_cs_len)
			return NULL;
		from_cs[n++] = *p++;
	}
	from_cs[n] = '\0';
	return p + 1;	/* step over the '?' that ended the name */
}
129 
130 /*
131  * An encoded word is a string of at most 75 non-white space
132  * characters of the following form:
133  *
134  *  =?charset?X?encoding?=
135  *
136  * where:
137  *   'charset'	is the original character set of the unencoded string.
138  *
139  *   'X'	is the encoding type 'B' or 'Q' for "base64" or
140  *              "quoted-printable", respectively,
141  *   'encoding'	is the encoded string.
142  *
143  * Both 'charset' and 'X' are case independent and 'encoding' cannot
144  * contain any whitespace or '?' characters.  The 'encoding' must also
145  * be fully contained within the encoded words, i.e., it cannot be
146  * split between encoded words.
147  *
148  * Note: the 'B' encoding is a slightly modified "quoted-printable"
149  * encoding.  In particular, spaces (' ') may be encoded as '_' to
150  * improve undecoded readability.
151  */
152 static int
153 decode_word(const char **ibuf, char **obuf, char *oend, const char *to_cs)
154 {
155 	ssize_t declen;
156 	size_t enclen, dstlen;
157 	char decword[LINESIZE];
158 	char from_cs[LINESIZE];
159 	const char *encword, *iend, *p;
160 	char *dstend;
161 	char enctype;
162 
163 	p = *ibuf;
164 	if (p[0] != '=' && p[1] != '?')
165 		return -1;
166 	if (strlen(p) <  2 + 1 + 3 + 1 + 2)
167 		return -1;
168 	p = grab_charset(from_cs, sizeof(from_cs), p + 2);
169 	if (p == NULL)
170 		return -1;
171 	enctype = *p++;
172 	if (*p++ != '?')
173 		return -1;
174 	encword = p;
175 	p = strchr(p, '?');
176 	if (p == NULL || p[1] != '=')
177 		return -1;
178 	enclen = p - encword;	/* length of encoded substring */
179 	iend = p + 2;
180 	/* encoded words are at most 75 characters (RFC 2047, sec 2) */
181 	if (iend > *ibuf + 75)
182 		return -1;
183 
184 	dstend = to_cs ? decword : *obuf;
185 /* XXX: what if oend <= *obuf, or decword == "" ? */
186 	dstlen = (to_cs ? sizeof(decword) : (size_t)(oend - *obuf)) - 1;
187 
188 	if (enctype == 'B' || enctype == 'b')
189 		declen = mime_B64_decode(dstend, dstlen, encword, enclen);
190 	else if (enctype == 'Q' || enctype == 'q')
191 		declen = mime_QPh_decode(dstend, dstlen, encword, enclen);
192 	else
193 		return -1;
194 
195 	if (declen == -1)
196 		return -1;
197 
198 	dstend += declen;
199 #ifdef CHARSET_SUPPORT
200 	if (to_cs != NULL) {
201 		iconv_t cd;
202 		const char *src;
203 		size_t srclen;
204 		size_t cnt;
205 
206 		cd = iconv_open(to_cs, from_cs);
207 		if (cd == (iconv_t)-1)
208 			return -1;
209 
210 		src = decword;
211 		srclen = declen;
212 		dstend = *obuf;
213 		dstlen = oend - *obuf - 1;
214 		cnt = mime_iconv(cd, &src, &srclen, &dstend, &dstlen);
215 
216 		(void)iconv_close(cd);
217 		if (cnt == (size_t)-1)
218 			return -1;
219 	}
220 #endif /* CHARSET_SUPPORT */
221 	*dstend = '\0';
222 	*ibuf = iend;
223 	*obuf = dstend;
224 	return 0;
225 }
226 
227 
228 /*
229  * Folding White Space.  See RFC 2822.
230  *
231  * Note: RFC 2822 specifies that '\n' and '\r' only occur as CRLF
232  * pairs (i.e., "\r\n") and never separately.  However, by the time
233  * mail(1) sees the messages, all CRLF pairs have been converted to
234  * '\n' characters.
235  *
236  * XXX - pull is_FWS() and skip_FWS() up to def.h?
237  */
/*
 * Return 1 if c is a folding-white-space character (space, tab or
 * newline), 0 otherwise.
 */
static inline int
is_FWS(int c)
{
	switch (c) {
	case ' ':
	case '\t':
	case '\n':
		return 1;
	default:
		return 0;
	}
}
243 
/*
 * Return a pointer to the first character at or after p that is not
 * folding white space.  The terminating NUL is not white space, so
 * the scan never runs off the end of a NUL terminated string.
 */
static inline const char *
skip_FWS(const char *p)
{
	for (; is_FWS(*p); p++)
		continue;
	return p;
}
251 
/*
 * Flush pending white space to the output.
 *
 * If *src is not NULL, copy the bytes [*src, srcend) to *dst without
 * writing at or beyond dstend, advance *dst past what was copied and
 * reset *src to NULL.  If *src is NULL nothing is done.
 */
static inline void
copy_skipped_FWS(char **dst, char *dstend, const char **src, const char *srcend)
{
	const char *from = *src;
	char *to;

	if (from == NULL)
		return;		/* nothing pending */

	for (to = *dst; from < srcend && to < dstend; from++, to++)
		*to = *from;

	*dst = to;
	*src = NULL;
}
270 
271 /*
272  * Decode an unstructured field.
273  *
274  * See RFC 2822 Sec 2.2.1 and 3.6.5.
275  * Encoded words may occur anywhere in unstructured fields provided
276  * they are separated from any other text or encoded words by at least
277  * one linear-white-space character. (See RFC 2047 sec 5.1.)  If two
278  * encoded words occur sequentially (separated by only FWS) then the
279  * separating FWS is removed.
280  *
281  * NOTE: unstructured fields cannot contain 'quoted-pairs' (see
282  * RFC2822 sec 3.2.6 and RFC 2047), but that is no problem as a '\\'
283  * (or any non-whitespace character) immediately before an
284  * encoded-word will prevent it from being decoded.
285  *
286  * hstring should be a NULL terminated string.
287  * outbuf should be sufficiently large to hold the result.
288  */
289 static void
290 mime_decode_usfield(char *outbuf, size_t outsize, const char *hstring)
291 {
292 	const char *p, *p0;
293 	char *q, *qend;
294 	int lastc;
295 	const char *charset;
296 
297 	charset = value(ENAME_MIME_CHARSET);
298 	qend = outbuf + outsize - 1; /* Make sure there is room for the trailing NULL! */
299 	q = outbuf;
300 	p = hstring;
301 	p0 = NULL;
302 	lastc = (unsigned char)' ';
303 	while (*p && q < qend) {
304 		const char *p1;
305 		char *q1;
306 		if (is_FWS(lastc) && p[0] == '=' && p[1] == '?' &&
307 		    decode_word((p1 = p, &p1), (q1 = q, &q1), qend, charset) == 0 &&
308 		    (*p1 == '\0' || is_FWS(*p1))) {
309 			p0 = p1;  /* pointer to first character after encoded word */
310 			q = q1;
311 			p = skip_FWS(p1);
312 			lastc = (unsigned char)*p0;
313 		}
314 		else {
315 			copy_skipped_FWS(&q, qend, &p0, p);
316 			lastc = (unsigned char)*p;
317 			if (q < qend)
318 				*q++ = *p++;
319 		}
320 	}
321 	copy_skipped_FWS(&q, qend, &p0, p);
322 	*q = '\0';
323 }
324 
/*
 * Decode a field comment.
 *
 * Comments only occur in structured fields, can be nested (rfc 2822,
 * sec 3.2.3), and can contain 'encoded-words' and 'quoted-pairs'.
 * Otherwise, they can be regarded as unstructured fields that are
 * bounded by '(' and ')' characters.
 *
 * On entry *ibuf points just past the opening '(' (which the caller
 * has already copied) and *obuf at the next free output byte.  Input
 * is consumed up to and including the matching ')', or until iend or
 * oend is reached.  On return *ibuf and *obuf are advanced past what
 * was consumed and produced.  Returns 0.
 */
static int
decode_comment(char **obuf, char *oend, const char **ibuf, const char *iend, const char *charset)
{
	const char *src = *ibuf;
	const char *fws = NULL;	/* held-back white space after an encoded word */
	char *dst = *obuf;
	int prev = ' ';		/* previous input character */
	int closed = 0;		/* set once the closing ')' has been copied */

	while (!closed && src < iend && dst < oend) {
		if (is_FWS(prev) && src[0] == '=' && src[1] == '?') {
			const char *wend = src;
			char *wout = dst;

			if (decode_word(&wend, &wout, oend, charset) == 0 &&
			    (*wend == ')' || is_FWS(*wend))) {
				prev = (unsigned char)*wend;
				fws = wend;
				dst = wout;
				src = skip_FWS(wend);
				/*
				 * XXX - this check should be unnecessary as
				 * *iend should be '\0' which will stop
				 * skip_FWS()
				 */
				if (src > iend)
					src = iend;
				continue;
			}
		}

		/* not an encoded word: flush any held-back white space */
		copy_skipped_FWS(&dst, oend, &fws, src);
		if (dst >= oend)	/* XXX - dst > oend cannot happen */
			break;

		switch (*src) {
		case ')':
			*dst++ = *src++;	/* copy the closing ')' */
			closed = 1;		/* and get out of here! */
			break;

		case '(':
			*dst++ = *src++;	/* copy the opening '(' */
			if (decode_comment(&dst, oend, &src, iend, charset) == -1)
				return -1;	/* is this right or should we update? */
			prev = ')';
			break;

		case '\\':
			if (src + 1 < iend) {	/* quoted-pair */
				/* keep the '\\' only where it is still needed */
				if (src[1] == '(' || src[1] == ')' || src[1] == '\\')
					*dst++ = *src;
				src++;
				prev = (unsigned char)*src;
				if (dst < oend)
					*dst++ = *src++;
				break;
			}
			/* FALLTHROUGH */

		default:
			prev = (unsigned char)*src;
			*dst++ = *src++;
			break;
		}
	}
	*ibuf = src;
	*obuf = dst;
	return 0;
}
398 
/*
 * Decode a quoted-string or no-fold-quote.
 *
 * These cannot contain encoded words.  They can contain quoted-pairs,
 * making '\\' special.  They have no other structure.  See RFC 2822
 * sec 3.2.5 and 3.6.4.
 *
 * On entry *ibuf points just past the opening '"'.  Characters are
 * copied to *obuf up to and including the closing '"', or until iend
 * or oend is reached.  The '\\' of a quoted-pair is kept only in
 * front of '"' and '\\'.  *ibuf and *obuf are advanced past what was
 * consumed and produced.
 */
static void
decode_quoted_string(char **obuf, char *oend, const char **ibuf, const char *iend)
{
	const char *src = *ibuf;
	char *dst = *obuf;

	while (src < iend && dst < oend) {
		const char c = *src;

		if (c == '"') {
			*dst++ = *src++;	/* copy the closing '"' */
			break;
		}
		if (c == '\\' && src + 1 < iend) { /* quoted-pair */
			const char next = src[1];

			if (next == '"' || next == '\\') {
				*dst++ = c;	/* the '\\' is still needed */
				if (dst >= oend)
					break;
			}
			src++;
		}
		*dst++ = *src++;
	}
	*ibuf = src;
	*obuf = dst;
}
434 
/*
 * Decode a domain-literal or no-fold-literal.
 *
 * These cannot contain encoded words.  They can have quoted pairs and
 * are delimited by '[' and ']' making '\\', '[', and ']' special.
 * They have no other structure.  See RFC 2822 sec 3.4.1 and 3.6.4.
 *
 * On entry *ibuf points just past the opening '['.  Characters are
 * copied to *obuf up to and including the closing ']', or until iend
 * or oend is reached.  The '\\' of a quoted-pair is kept only in
 * front of '[', ']' and '\\'.  *ibuf and *obuf are advanced past what
 * was consumed and produced.
 */
static void
decode_domain_literal(char **obuf, char *oend, const char **ibuf, const char *iend)
{
	const char *src = *ibuf;
	char *dst = *obuf;

	while (src < iend && dst < oend) {
		const char c = *src;

		if (c == ']') {
			*dst++ = *src++;	/* copy the closing ']' */
			break;
		}
		if (c == '\\' && src + 1 < iend) { /* quoted-pair */
			const char next = src[1];

			if (next == '[' || next == ']' || next == '\\') {
				*dst++ = c;	/* the '\\' is still needed */
				if (dst >= oend)
					break;
			}
			src++;
		}
		*dst++ = *src++;
	}
	*ibuf = src;
	*obuf = dst;
}
470 
/*
 * Specials: see RFC 2822 sec 3.2.1.
 *
 * Return 1 if c is one of the characters
 *	( ) < > [ ] : ; @ \ , . "
 * and 0 for anything else, including values outside the ASCII range.
 */
static inline int
is_specials(int c)
{
	switch (c) {
	case '(': case ')':
	case '<': case '>':
	case '[': case ']':
	case ':': case ';':
	case '@': case '\\':
	case ',': case '.':
	case '"':
		return 1;
	default:
		return 0;
	}
}
490 
491 /*
492  * Decode a structured field.
493  *
494  * At the top level, structured fields can only contain encoded-words
495  * via 'phrases' and 'comments'.  See RFC 2047 sec 5.
496  */
497 static void
498 mime_decode_sfield(char *linebuf, size_t bufsize, const char *hstring)
499 {
500 	const char *p, *pend, *p0;
501 	char *q, *qend;
502 	const char *charset;
503 	int lastc;
504 
505 	charset = value(ENAME_MIME_CHARSET);
506 
507 	p = hstring;
508 	q = linebuf;
509 	pend = hstring + strlen(hstring);
510 	qend = linebuf + bufsize - 1;	/* save room for the NULL terminator */
511 	lastc = (unsigned char)' ';
512 	p0 = NULL;
513 	while (p < pend && q < qend) {
514 		const char *p1;
515 		char *q1;
516 
517 		if (*p != '=') {
518 			copy_skipped_FWS(&q, qend, &p0, p);
519 			if (q >= qend)
520 				break;
521 		}
522 
523 		switch (*p) {
524 		case '(':	/* start of comment */
525 			*q++ = *p++;	/* copy the opening '(' */
526 			(void)decode_comment(&q, qend, &p, pend, charset);
527 			lastc = (unsigned char)p[-1];
528 			break;
529 
530 		case '"':	/* start of quoted-string or no-fold-quote */
531 			*q++ = *p++;	/* copy the opening '"' */
532 			decode_quoted_string(&q, qend, &p, pend);
533 			lastc = (unsigned char)p[-1];
534 			break;
535 
536 		case '[':	/* start of domain-literal or no-fold-literal */
537 			*q++ = *p++;	/* copy the opening '[' */
538 			decode_domain_literal(&q, qend, &p, pend);
539 			lastc = (unsigned char)p[-1];
540 			break;
541 
542 		case '\\':	/* start of quoted-pair */
543 			if (p + 1 < pend) {		/* quoted pair */
544 				if (is_specials(p[1])) {
545 					*q++ = *p;
546 					if (q >= qend)
547 						break;
548 				}
549 				p++;	/* skip the '\\' */
550 			}
551 			goto copy_char;
552 
553 		case '=':
554 			/*
555 			 * At this level encoded words can appear via
556 			 * 'phrases' (possibly delimited by ',' as in
557 			 * 'keywords').  Thus we handle them as such.
558 			 * Hopefully this is sufficient.
559 			 */
560 			if ((lastc == ',' || is_FWS(lastc)) && p[1] == '?' &&
561 			    decode_word((p1 = p, &p1), (q1 = q, &q1), qend, charset) == 0 &&
562 			    (*p1 == '\0' || *p1 == ',' || is_FWS(*p1))) {
563 				lastc = (unsigned char)*p1;
564 				p0 = p1;
565 				q = q1;
566 				p = skip_FWS(p1);
567 				/*
568 				 * XXX - this check should be
569 				 * unnecessary as *pend should be '\0'
570 				 * which will stop skip_FWS()
571 				 */
572 				if (p > pend)
573 					p = pend;
574 				break;
575 			}
576 			else {
577 				copy_skipped_FWS(&q, qend, &p0, p);
578 				if (q >= qend)
579 					break;
580 				goto copy_char;
581 			}
582 
583 		case '<':	/* start of angle-addr, msg-id, or path. */
584 			/*
585 			 * A msg-id cannot contain encoded-pairs or
586 			 * encoded-words, but angle-addr and path can.
587 			 * Distinguishing between them seems to be
588 			 * unnecessary, so let's be loose and just
589 			 * decode them as if they were all the same.
590 			 */
591 		default:
592 	copy_char:
593 			lastc = (unsigned char)*p;
594 			*q++ = *p++;
595 			break;
596 		}
597 	}
598 	copy_skipped_FWS(&q, qend, &p0, p);
599 	*q = '\0';	/* null terminate the result! */
600 }
601 
602 /*
603  * Returns the correct hfield decoder, or NULL if none.
604  * Info extracted from RFC 2822.
605  *
606  * name - pointer to field name of header line (with colon).
607  */
608 PUBLIC hfield_decoder_t
609 mime_hfield_decoder(const char *name)
610 {
611 	static const struct field_decoder_tbl_s {
612 		const char *field_name;
613 		size_t field_len;
614 		hfield_decoder_t decoder;
615 	} field_decoder_tbl[] = {
616 #define X(s)	s, sizeof(s) - 1
617 		{ X("Received:"),			NULL },
618 
619 		{ X("Content-Type:"),			NULL },
620 		{ X("Content-Disposition:"),		NULL },
621 		{ X("Content-Transfer-Encoding:"),	NULL },
622 		{ X("Content-Description:"),		mime_decode_sfield },
623 		{ X("Content-ID:"),			mime_decode_sfield },
624 		{ X("MIME-Version:"),			mime_decode_sfield },
625 
626 		{ X("Bcc:"),				mime_decode_sfield },
627 		{ X("Cc:"),				mime_decode_sfield },
628 		{ X("Date:"),				mime_decode_sfield },
629 		{ X("From:"),				mime_decode_sfield },
630 		{ X("In-Reply-To:"),			mime_decode_sfield },
631 		{ X("Keywords:"),			mime_decode_sfield },
632 		{ X("Message-ID:"),			mime_decode_sfield },
633 		{ X("References:"),			mime_decode_sfield },
634 		{ X("Reply-To:"),			mime_decode_sfield },
635 		{ X("Return-Path:"),			mime_decode_sfield },
636 		{ X("Sender:"),				mime_decode_sfield },
637 		{ X("To:"),				mime_decode_sfield },
638 		{ X("Subject:"),			mime_decode_usfield },
639 		{ X("Comments:"),			mime_decode_usfield },
640 		{ X("X-"),				mime_decode_usfield },
641 		{ NULL, 0,				mime_decode_usfield },	/* optional-fields */
642 #undef X
643 	};
644 	const struct field_decoder_tbl_s *fp;
645 
646 	/* XXX - this begs for a hash table! */
647 	for (fp = field_decoder_tbl; fp->field_name; fp++)
648 		if (strncasecmp(name, fp->field_name, fp->field_len) == 0)
649 			break;
650 	return fp->decoder;
651 }
652 
653 #endif /* MIME_SUPPORT */
654