xref: /netbsd-src/usr.bin/mail/mime_header.c (revision 404fbe5fb94ca1e054339640cabb2801ce52dd30)
1 /*	$NetBSD: mime_header.c,v 1.6 2008/04/28 20:24:14 martin Exp $	*/
2 
3 /*-
4  * Copyright (c) 2006 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Anon Ymous.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 
33 /*
34  * This module contains the core MIME header decoding routines.
35  * Please refer to RFC 2047 and RFC 2822.
36  */
37 
38 #ifdef MIME_SUPPORT
39 
40 #include <sys/cdefs.h>
41 #ifndef __lint__
42 __RCSID("$NetBSD: mime_header.c,v 1.6 2008/04/28 20:24:14 martin Exp $");
43 #endif /* not __lint__ */
44 
45 #include <stdio.h>
46 #include <stdlib.h>
47 #include <string.h>
48 
49 #include "def.h"
50 #include "extern.h"
51 #include "mime.h"
52 #include "mime_header.h"
53 #include "mime_codecs.h"
54 
/*
 * Bounds-checked front end to mime_b64tobin().
 *
 * The input length is rounded up to a whole number of four character
 * groups and three output bytes are budgeted per group.  If outbuf
 * (outlen bytes) is smaller than that budget, -1 is returned and
 * nothing is decoded; otherwise the result of mime_b64tobin() is
 * passed straight back to the caller.
 *
 * XXX - This should move to mime_codecs.c.
 */
static ssize_t
mime_B64_decode(char *outbuf, size_t outlen, const char *inbuf, size_t inlen)
{
	const size_t needed = 3 * roundup(inlen, 4) / 4;

	if (needed > outlen)
		return -1;

	return mime_b64tobin(outbuf, inbuf, inlen);
}
68 
69 
/*
 * Header specific "quoted-printable" decode!
 * Differences with body QP decoding (see rfc 2047, sec 4.2):
 * 1) '=' occurs _only_ when followed by two hex digits (FWS is not allowed).
 * 2) Spaces can be encoded as '_' in headers for readability.
 *
 * Decodes inlen bytes from inbuf into outbuf (capacity outlen bytes).
 * No NUL terminator is appended.
 *
 * Returns the number of bytes written to outbuf, or -1 if the output
 * does not fit or if an '=' is not followed by exactly two hex digits.
 *
 * XXX - This should move to mime_codecs.c.
 */

/* Value of the hex digit c (either case), or -1 if c is not a hex digit. */
static int
qph_hexval(int c)
{
	if (c >= '0' && c <= '9')
		return c - '0';
	if (c >= 'a' && c <= 'f')
		return c - 'a' + 10;
	if (c >= 'A' && c <= 'F')
		return c - 'A' + 10;
	return -1;
}

static ssize_t
mime_QPh_decode(char *outbuf, size_t outlen, const char *inbuf, size_t inlen)
{
	const char *p, *inend;
	char *q, *outend;

	outend = outbuf + outlen;
	inend = inbuf + inlen;
	q = outbuf;
	for (p = inbuf; p < inend; p++) {
		if (q >= outend)
			return -1;
		if (*p == '=') {
			int hi, lo;

			/* both hex digits must lie inside the input */
			if (inend - p < 3)
				return -1;
			/*
			 * Check each digit explicitly: strtol(3) would
			 * also accept a leading blank or sign, letting
			 * things like "=+A" or "=-1" through.
			 */
			hi = qph_hexval((unsigned char)p[1]);
			lo = qph_hexval((unsigned char)p[2]);
			if (hi < 0 || lo < 0)
				return -1;
			*q++ = (char)((hi << 4) | lo);
			p += 2;	/* the loop increment steps past the 2nd digit */
		}
		else if (*p == '_')  /* headers may encode ' ' as '_' */
			*q++ = ' ';
		else
			*q++ = *p;
	}
	return q - outbuf;
}
115 
/*
 * Copy the charset name that starts at p, and runs up to the next '?',
 * into from_cs as a NUL terminated string.
 *
 * Returns a pointer to the character following the '?', or NULL if
 * the string ends before a '?' is seen or the name does not fit in
 * from_cs_len bytes (terminator included).
 */
static const char *
grab_charset(char *from_cs, size_t from_cs_len, const char *p)
{
	char *dst = from_cs;
	char *const last = from_cs + from_cs_len - 1;	/* slot kept for the NUL */

	while (*p != '?') {
		if (*p == '\0')
			return NULL;
		if (dst >= last)
			return NULL;
		*dst++ = *p++;
	}
	*dst = '\0';
	return p + 1;	/* step over the '?' */
}
129 
130 /*
131  * An encoded word is a string of at most 75 non-white space
132  * characters of the following form:
133  *
134  *  =?charset?X?encoding?=
135  *
136  * where:
137  *   'charset'	is the original character set of the unencoded string.
138  *
139  *   'X'	is the encoding type 'B' or 'Q' for "base64" or
140  *              "quoted-printable", respectively,
141  *   'encoding'	is the encoded string.
142  *
143  * Both 'charset' and 'X' are case independent and 'encoding' cannot
144  * contain any whitespace or '?' characters.  The 'encoding' must also
145  * be fully contained within the encoded words, i.e., it cannot be
146  * split between encoded words.
147  *
148  * Note: the 'B' encoding is a slightly modified "quoted-printable"
149  * encoding.  In particular, spaces (' ') may be encoded as '_' to
150  * improve undecoded readability.
151  */
152 static int
153 decode_word(const char **ibuf, char **obuf, char *oend, const char *to_cs)
154 {
155 	ssize_t declen;
156 	size_t enclen, dstlen;
157 	char decword[LINESIZE];
158 	char from_cs[LINESIZE];
159 	const char *encword, *iend, *p;
160 	char *dstend;
161 	char enctype;
162 
163 	p = *ibuf;
164 	if (p[0] != '=' && p[1] != '?')
165 		return -1;
166 	if (strlen(p) <  2 + 1 + 3 + 1 + 2)
167 		return -1;
168 	p = grab_charset(from_cs, sizeof(from_cs), p + 2);
169 	if (p == NULL)
170 		return -1;
171 	enctype = *p++;
172 	if (*p++ != '?')
173 		return -1;
174 	encword = p;
175 	p = strchr(p, '?');
176 	if (p == NULL || p[1] != '=')
177 		return -1;
178 	enclen = p - encword;	/* length of encoded substring */
179 	iend = p + 2;
180 	/* encoded words are at most 75 characters (RFC 2047, sec 2) */
181 	if (iend > *ibuf + 75)
182 		return -1;
183 
184 	dstend = to_cs ? decword : *obuf;
185 	dstlen = (to_cs ? sizeof(decword): oend - *obuf) - 1;
186 
187 	if (enctype == 'B' || enctype == 'b')
188 		declen = mime_B64_decode(dstend, dstlen, encword, enclen);
189 	else if (enctype == 'Q' || enctype == 'q')
190 		declen = mime_QPh_decode(dstend, dstlen, encword, enclen);
191 	else
192 		return -1;
193 
194 	if (declen == -1)
195 		return -1;
196 
197 	dstend += declen;
198 #ifdef CHARSET_SUPPORT
199 	if (to_cs != NULL) {
200 		iconv_t cd;
201 		const char *src;
202 		size_t srclen;
203 		size_t cnt;
204 
205 		cd = iconv_open(to_cs, from_cs);
206 		if (cd == (iconv_t)-1)
207 			return -1;
208 
209 		src = decword;
210 		srclen = declen;
211 		dstend = *obuf;
212 		dstlen = oend - *obuf - 1;
213 		cnt = mime_iconv(cd, &src, &srclen, &dstend, &dstlen);
214 
215 		(void)iconv_close(cd);
216 		if (cnt == (size_t)-1)
217 			return -1;
218 	}
219 #endif /* CHARSET_SUPPORT */
220 	*dstend = '\0';
221 	*ibuf = iend;
222 	*obuf = dstend;
223 	return 0;
224 }
225 
226 
/*
 * Folding White Space.  See RFC 2822.
 *
 * Note: RFC 2822 specifies that '\n' and '\r' only occur as CRLF
 * pairs (i.e., "\r\n") and never separately.  However, by the time
 * mail(1) sees the messages, all CRLF pairs have been converted to
 * '\n' characters.
 *
 * is_FWS() returns 1 if c is a space, tab, or newline, and 0 otherwise.
 *
 * XXX - pull is_FWS() and skip_FWS() up to def.h?
 */
static inline int
is_FWS(int c)
{
	switch (c) {
	case ' ':
	case '\t':
	case '\n':
		return 1;
	default:
		return 0;
	}
}
242 
/*
 * Return a pointer to the first character at or after p that is not
 * folding white space.  The terminating NUL is not white space, so
 * the scan stops there at the latest.
 */
static inline const char *
skip_FWS(const char *p)
{
	for (; is_FWS(*p); p++)
		continue;
	return p;
}
250 
/*
 * Flush white space that was skipped after an encoded word.
 *
 * If *src is non-NULL, the bytes in [*src, srcend) are copied to *dst,
 * stopping early when dstend is reached.  *dst is advanced past the
 * copied bytes and *src is reset to NULL to mark that nothing is
 * pending any more.  If *src is already NULL nothing happens.
 */
static inline void
copy_skipped_FWS(char **dst, char *dstend, const char **src, const char *srcend)
{
	const char *in;
	char *out;

	if (*src == NULL)
		return;		/* nothing pending */

	out = *dst;
	for (in = *src; in < srcend && out < dstend; in++, out++)
		*out = *in;

	*dst = out;
	*src = NULL;
}
269 
270 /*
271  * Decode an unstructured field.
272  *
273  * See RFC 2822 Sec 2.2.1 and 3.6.5.
274  * Encoded words may occur anywhere in unstructured fields provided
275  * they are separated from any other text or encoded words by at least
276  * one linear-white-space character. (See RFC 2047 sec 5.1.)  If two
277  * encoded words occur sequentially (separated by only FWS) then the
278  * separating FWS is removed.
279  *
280  * NOTE: unstructured fields cannot contain 'quoted-pairs' (see
281  * RFC2822 sec 3.2.6 and RFC 2047), but that is no problem as a '\\'
282  * (or any non-whitespace character) immediately before an
283  * encoded-word will prevent it from being decoded.
284  *
285  * hstring should be a NULL terminated string.
286  * outbuf should be sufficiently large to hold the result.
287  */
288 static void
289 mime_decode_usfield(char *outbuf, size_t outsize, const char *hstring)
290 {
291 	const char *p, *p0;
292 	char *q, *qend;
293 	int lastc;
294 	const char *charset;
295 
296 	charset = value(ENAME_MIME_CHARSET);
297 	qend = outbuf + outsize - 1; /* Make sure there is room for the trailing NULL! */
298 	q = outbuf;
299 	p = hstring;
300 	p0 = NULL;
301 	lastc = (unsigned char)' ';
302 	while (*p && q < qend) {
303 		const char *p1;
304 		char *q1;
305 		if (is_FWS(lastc) && p[0] == '=' && p[1] == '?' &&
306 		    decode_word((p1 = p, &p1), (q1 = q, &q1), qend, charset) == 0 &&
307 		    (*p1 == '\0' || is_FWS(*p1))) {
308 			p0 = p1;  /* pointer to first character after encoded word */
309 			q = q1;
310 			p = skip_FWS(p1);
311 			lastc = (unsigned char)*p0;
312 		}
313 		else {
314 			copy_skipped_FWS(&q, qend, &p0, p);
315 			lastc = (unsigned char)*p;
316 			if (q < qend)
317 				*q++ = *p++;
318 		}
319 	}
320 	copy_skipped_FWS(&q, qend, &p0, p);
321 	*q = '\0';
322 }
323 
/*
 * Decode a field comment.
 *
 * Comments only occur in structured fields, can be nested (rfc 2822,
 * sec 3.2.3), and can contain 'encoded-words' and 'quoted-pairs'.
 * Otherwise, they can be regarded as unstructured fields that are
 * bounded by '(' and ')' characters.
 *
 * On entry *ibuf points just past the opening '(' (which the caller
 * has already copied).  Input is consumed up to and including the
 * matching ')', or until iend or oend is reached.  *ibuf and *obuf
 * are advanced accordingly.  Always returns 0 at present.
 */
static int
decode_comment(char **obuf, char *oend, const char **ibuf, const char *iend, const char *charset)
{
	const char *in = *ibuf;
	char *out = *obuf;
	const char *pending = NULL;	/* FWS skipped after an encoded word */
	int prev = ' ';			/* previous input character */

	while (in < iend && out < oend) {
		const char *in_after = in;
		char *out_after = out;

		/* encoded word: preceded by FWS, followed by FWS or ')' */
		if (is_FWS(prev) && in[0] == '=' && in[1] == '?' &&
		    decode_word(&in_after, &out_after, oend, charset) == 0 &&
		    (*in_after == ')' || is_FWS(*in_after))) {
			prev = (unsigned char)*in_after;
			pending = in_after;
			out = out_after;
			in = skip_FWS(in_after);
			/*
			 * XXX - this check should be unnecessary as *iend
			 * should be '\0' which will stop skip_FWS()
			 */
			if (in > iend)
				in = iend;
			continue;
		}

		copy_skipped_FWS(&out, oend, &pending, in);
		if (out >= oend)	/* XXX - out > oend cannot happen */
			break;

		if (*in == ')') {
			*out++ = *in++;	/* copy the closing ')' */
			break;		/* this comment is finished */
		}

		if (*in == '(') {	/* nested comment */
			*out++ = *in++;	/* copy the opening '(' */
			if (decode_comment(&out, oend, &in, iend, charset) == -1)
				return -1;	/* is this right or should we update? */
			prev = ')';
			continue;
		}

		if (*in == '\\' && in + 1 < iend) {	/* quoted-pair */
			/* keep the '\\' only where it is still needed */
			if (in[1] == '(' || in[1] == ')' || in[1] == '\\')
				*out++ = *in;
			in++;
			prev = (unsigned char)*in;
			if (out < oend)
				*out++ = *in++;
			continue;
		}

		prev = (unsigned char)*in;
		*out++ = *in++;
	}
	*ibuf = in;
	*obuf = out;
	return 0;
}
397 
/*
 * Decode a quoted-string or no-fold-quote.
 *
 * These cannot contain encoded words.  They can contain quoted-pairs,
 * making '\\' special.  They have no other structure.  See RFC 2822
 * sec 3.2.5 and 3.6.4.
 *
 * On entry *ibuf points just past the opening '"'.  Characters are
 * copied up to and including the closing '"', or until iend or oend
 * is reached.  The '\\' of a quoted-pair is kept only in front of
 * '"' and '\\'; elsewhere it is dropped and the quoted character is
 * copied alone.  *ibuf and *obuf are advanced past what was handled.
 */
static void
decode_quoted_string(char **obuf, char *oend, const char **ibuf, const char *iend)
{
	const char *in = *ibuf;
	char *out = *obuf;

	while (in < iend && out < oend) {
		const char c = *in;

		if (c == '"') {
			*out++ = *in++;	/* copy the closing '"' */
			break;
		}
		if (c == '\\' && iend - in > 1) {	/* quoted-pair */
			const char quoted = in[1];

			if (quoted == '"' || quoted == '\\') {
				*out++ = '\\';
				if (out >= oend)
					break;
			}
			in++;	/* move on to the quoted character */
		}
		*out++ = *in++;
	}
	*ibuf = in;
	*obuf = out;
}
433 
/*
 * Decode a domain-literal or no-fold-literal.
 *
 * These cannot contain encoded words.  They can have quoted pairs and
 * are delimited by '[' and ']' making '\\', '[', and ']' special.
 * They have no other structure.  See RFC 2822 sec 3.4.1 and 3.6.4.
 *
 * On entry *ibuf points just past the opening '['.  Characters are
 * copied up to and including the closing ']', or until iend or oend
 * is reached.  The '\\' of a quoted-pair is kept only in front of
 * '[', ']' and '\\'; elsewhere it is dropped and the quoted character
 * is copied alone.  *ibuf and *obuf are advanced past what was handled.
 */
static void
decode_domain_literal(char **obuf, char *oend, const char **ibuf, const char *iend)
{
	const char *in = *ibuf;
	char *out = *obuf;

	while (in < iend && out < oend) {
		const char c = *in;

		if (c == ']') {
			*out++ = *in++;	/* copy the closing ']' */
			break;
		}
		if (c == '\\' && iend - in > 1) {	/* quoted-pair */
			const char quoted = in[1];

			if (quoted == '[' || quoted == ']' || quoted == '\\') {
				*out++ = '\\';
				if (out >= oend)
					break;
			}
			in++;	/* move on to the quoted character */
		}
		*out++ = *in++;
	}
	*ibuf = in;
	*obuf = out;
}
469 
/*
 * Specials: see RFC 2822 sec 3.2.1.
 *
 * Returns 1 if c is one of
 *	( ) < > [ ] : ; @ \ , . "
 * and 0 for every other value, including anything outside 0..127.
 */
static inline int
is_specials(int c)
{
	switch (c) {
	case '(': case ')':
	case '<': case '>':
	case '[': case ']':
	case ':': case ';':
	case '@': case '\\':
	case ',': case '.':
	case '"':
		return 1;
	default:
		return 0;
	}
}
489 
490 /*
491  * Decode a structured field.
492  *
493  * At the top level, structured fields can only contain encoded-words
494  * via 'phrases' and 'comments'.  See RFC 2047 sec 5.
495  */
496 static void
497 mime_decode_sfield(char *linebuf, size_t bufsize, const char *hstring)
498 {
499 	const char *p, *pend, *p0;
500 	char *q, *qend;
501 	const char *charset;
502 	int lastc;
503 
504 	charset = value(ENAME_MIME_CHARSET);
505 
506 	p = hstring;
507 	q = linebuf;
508 	pend = hstring + strlen(hstring);
509 	qend = linebuf + bufsize - 1;	/* save room for the NULL terminator */
510 	lastc = (unsigned char)' ';
511 	p0 = NULL;
512 	while (p < pend && q < qend) {
513 		const char *p1;
514 		char *q1;
515 
516 		if (*p != '=') {
517 			copy_skipped_FWS(&q, qend, &p0, p);
518 			if (q >= qend)
519 				break;
520 		}
521 
522 		switch (*p) {
523 		case '(':	/* start of comment */
524 			*q++ = *p++;	/* copy the opening '(' */
525 			(void)decode_comment(&q, qend, &p, pend, charset);
526 			lastc = (unsigned char)p[-1];
527 			break;
528 
529 		case '"':	/* start of quoted-string or no-fold-quote */
530 			*q++ = *p++;	/* copy the opening '"' */
531 			decode_quoted_string(&q, qend, &p, pend);
532 			lastc = (unsigned char)p[-1];
533 			break;
534 
535 		case '[':	/* start of domain-literal or no-fold-literal */
536 			*q++ = *p++;	/* copy the opening '[' */
537 			decode_domain_literal(&q, qend, &p, pend);
538 			lastc = (unsigned char)p[-1];
539 			break;
540 
541 		case '\\':	/* start of quoted-pair */
542 			if (p + 1 < pend) {		/* quoted pair */
543 				if (is_specials(p[1])) {
544 					*q++ = *p;
545 					if (q >= qend)
546 						break;
547 				}
548 				p++;	/* skip the '\\' */
549 			}
550 			goto copy_char;
551 
552 		case '=':
553 			/*
554 			 * At this level encoded words can appear via
555 			 * 'phrases' (possibly delimited by ',' as in
556 			 * 'keywords').  Thus we handle them as such.
557 			 * Hopefully this is sufficient.
558 			 */
559 			if ((lastc == ',' || is_FWS(lastc)) && p[1] == '?' &&
560 			    decode_word((p1 = p, &p1), (q1 = q, &q1), qend, charset) == 0 &&
561 			    (*p1 == '\0' || *p1 == ',' || is_FWS(*p1))) {
562 				lastc = (unsigned char)*p1;
563 				p0 = p1;
564 				q = q1;
565 				p = skip_FWS(p1);
566 				/*
567 				 * XXX - this check should be
568 				 * unnecessary as *pend should be '\0'
569 				 * which will stop skip_FWS()
570 				 */
571 				if (p > pend)
572 					p = pend;
573 				break;
574 			}
575 			else {
576 				copy_skipped_FWS(&q, qend, &p0, p);
577 				if (q >= qend)
578 					break;
579 				goto copy_char;
580 			}
581 
582 		case '<':	/* start of angle-addr, msg-id, or path. */
583 			/*
584 			 * A msg-id cannot contain encoded-pairs or
585 			 * encoded-words, but angle-addr and path can.
586 			 * Distinguishing between them seems to be
587 			 * unnecessary, so let's be loose and just
588 			 * decode them as if they were all the same.
589 			 */
590 		default:
591 	copy_char:
592 			lastc = (unsigned char)*p;
593 			*q++ = *p++;
594 			break;
595 		}
596 	}
597 	copy_skipped_FWS(&q, qend, &p0, p);
598 	*q = '\0';	/* null terminate the result! */
599 }
600 
601 /*
602  * Returns the correct hfield decoder, or NULL if none.
603  * Info extracted from RFC 2822.
604  *
605  * name - pointer to field name of header line (with colon).
606  */
607 PUBLIC hfield_decoder_t
608 mime_hfield_decoder(const char *name)
609 {
610 	static const struct field_decoder_tbl_s {
611 		const char *field_name;
612 		size_t field_len;
613 		hfield_decoder_t decoder;
614 	} field_decoder_tbl[] = {
615 #define X(s)	s, sizeof(s) - 1
616 		{ X("Received:"),			NULL },
617 
618 		{ X("Content-Type:"),			NULL },
619 		{ X("Content-Disposition:"),		NULL },
620 		{ X("Content-Transfer-Encoding:"),	NULL },
621 		{ X("Content-Description:"),		mime_decode_sfield },
622 		{ X("Content-ID:"),			mime_decode_sfield },
623 		{ X("MIME-Version:"),			mime_decode_sfield },
624 
625 		{ X("Bcc:"),				mime_decode_sfield },
626 		{ X("Cc:"),				mime_decode_sfield },
627 		{ X("Date:"),				mime_decode_sfield },
628 		{ X("From:"),				mime_decode_sfield },
629 		{ X("In-Reply-To:"),			mime_decode_sfield },
630 		{ X("Keywords:"),			mime_decode_sfield },
631 		{ X("Message-ID:"),			mime_decode_sfield },
632 		{ X("References:"),			mime_decode_sfield },
633 		{ X("Reply-To:"),			mime_decode_sfield },
634 		{ X("Return-Path:"),			mime_decode_sfield },
635 		{ X("Sender:"),				mime_decode_sfield },
636 		{ X("To:"),				mime_decode_sfield },
637 		{ X("Subject:"),			mime_decode_usfield },
638 		{ X("Comments:"),			mime_decode_usfield },
639 		{ X("X-"),				mime_decode_usfield },
640 		{ NULL, 0,				mime_decode_usfield },	/* optional-fields */
641 #undef X
642 	};
643 	const struct field_decoder_tbl_s *fp;
644 
645 	/* XXX - this begs for a hash table! */
646 	for (fp = field_decoder_tbl; fp->field_name; fp++)
647 		if (strncasecmp(name, fp->field_name, fp->field_len) == 0)
648 			break;
649 	return fp->decoder;
650 }
651 
652 #endif /* MIME_SUPPORT */
653