1 /* $NetBSD: mime_header.c,v 1.7 2009/01/18 01:29:57 lukem Exp $ */ 2 3 /*- 4 * Copyright (c) 2006 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Anon Ymous. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 33 /* 34 * This module contains the core MIME header decoding routines. 35 * Please refer to RFC 2047 and RFC 2822. 
36 */ 37 38 #ifdef MIME_SUPPORT 39 40 #include <sys/cdefs.h> 41 #ifndef __lint__ 42 __RCSID("$NetBSD: mime_header.c,v 1.7 2009/01/18 01:29:57 lukem Exp $"); 43 #endif /* not __lint__ */ 44 45 #include <stdio.h> 46 #include <stdlib.h> 47 #include <string.h> 48 49 #include "def.h" 50 #include "extern.h" 51 #include "mime.h" 52 #include "mime_header.h" 53 #include "mime_codecs.h" 54 55 /* 56 * Our interface to mime_b64tobin() 57 * 58 * XXX - This should move to mime_codecs.c. 59 */ 60 static ssize_t 61 mime_B64_decode(char *outbuf, size_t outlen, const char *inbuf, size_t inlen) 62 { 63 if (outlen < 3 * roundup(inlen, 4) / 4) 64 return -1; 65 66 return mime_b64tobin(outbuf, inbuf, inlen); 67 } 68 69 70 /* 71 * Header specific "quoted-printable" decode! 72 * Differences with body QP decoding (see rfc 2047, sec 4.2): 73 * 1) '=' occurs _only_ when followed by two hex digits (FWS is not allowed). 74 * 2) Spaces can be encoded as '_' in headers for readability. 75 * 76 * XXX - This should move to mime_codecs.c. 
/*
 * Header specific "quoted-printable" decode!
 * Differences with body QP decoding (see rfc 2047, sec 4.2):
 * 1) '=' occurs _only_ when followed by two hex digits (FWS is not allowed).
 * 2) Spaces can be encoded as '_' in headers for readability.
 *
 * Decodes the inlen bytes at inbuf into outbuf, which can hold outlen
 * bytes.  Returns the number of bytes stored, or -1 if the output does
 * not fit or if an '=' is not followed by exactly two hexadecimal
 * digits.  The output is not NUL terminated.
 *
 * XXX - This should move to mime_codecs.c.
 */

/*
 * Value of a single hexadecimal digit, or -1 if c is not one.
 */
static int
qph_hexval(int c)
{
	if (c >= '0' && c <= '9')
		return c - '0';
	if (c >= 'A' && c <= 'F')
		return c - 'A' + 10;
	if (c >= 'a' && c <= 'f')
		return c - 'a' + 10;
	return -1;
}

static ssize_t
mime_QPh_decode(char *outbuf, size_t outlen, const char *inbuf, size_t inlen)
{
	const char *p, *inend;
	char *outend;
	char *q;

	outend = outbuf + outlen;
	inend = inbuf + inlen;
	q = outbuf;
	for (p = inbuf; p < inend; p++) {
		if (q >= outend)
			return -1;
		if (*p == '=') {
			int hi, lo;

			/* both digits must lie inside the input */
			if (inend - p < 3)
				return -1;
			/*
			 * Check the digits by hand: strtol() would also
			 * accept a sign or leading blank ("=-1", "= 1").
			 */
			hi = qph_hexval((unsigned char)p[1]);
			lo = qph_hexval((unsigned char)p[2]);
			if (hi < 0 || lo < 0)
				return -1;
			*q++ = (char)((hi << 4) | lo);
			p += 2;		/* the loop steps over the second digit */
		}
		else if (*p == '_')	/* header's may encode ' ' as '_' */
			*q++ = ' ';
		else
			*q++ = *p;
	}
	return q - outbuf;
}

/*
 * Copy the charset name that starts at p, and runs up to the next '?',
 * into from_cs (capacity from_cs_len, including the NUL).
 *
 * Returns a pointer to the character following the '?', or NULL if the
 * string ends before a '?' is seen or the name does not fit.
 */
static const char *
grab_charset(char *from_cs, size_t from_cs_len, const char *p)
{
	char *q, *qlimit;

	if (from_cs_len == 0)	/* no room even for the NUL */
		return NULL;

	qlimit = from_cs + from_cs_len - 1;
	for (q = from_cs; *p != '?'; p++) {
		if (*p == '\0' || q >= qlimit)
			return NULL;
		*q++ = *p;
	}
	*q = '\0';
	return p + 1;	/* if here, then we got the '?' */
}

151 */ 152 static int 153 decode_word(const char **ibuf, char **obuf, char *oend, const char *to_cs) 154 { 155 ssize_t declen; 156 size_t enclen, dstlen; 157 char decword[LINESIZE]; 158 char from_cs[LINESIZE]; 159 const char *encword, *iend, *p; 160 char *dstend; 161 char enctype; 162 163 p = *ibuf; 164 if (p[0] != '=' && p[1] != '?') 165 return -1; 166 if (strlen(p) < 2 + 1 + 3 + 1 + 2) 167 return -1; 168 p = grab_charset(from_cs, sizeof(from_cs), p + 2); 169 if (p == NULL) 170 return -1; 171 enctype = *p++; 172 if (*p++ != '?') 173 return -1; 174 encword = p; 175 p = strchr(p, '?'); 176 if (p == NULL || p[1] != '=') 177 return -1; 178 enclen = p - encword; /* length of encoded substring */ 179 iend = p + 2; 180 /* encoded words are at most 75 characters (RFC 2047, sec 2) */ 181 if (iend > *ibuf + 75) 182 return -1; 183 184 dstend = to_cs ? decword : *obuf; 185 /* XXX: what if oend <= *obuf, or decword == "" ? */ 186 dstlen = (to_cs ? sizeof(decword) : (size_t)(oend - *obuf)) - 1; 187 188 if (enctype == 'B' || enctype == 'b') 189 declen = mime_B64_decode(dstend, dstlen, encword, enclen); 190 else if (enctype == 'Q' || enctype == 'q') 191 declen = mime_QPh_decode(dstend, dstlen, encword, enclen); 192 else 193 return -1; 194 195 if (declen == -1) 196 return -1; 197 198 dstend += declen; 199 #ifdef CHARSET_SUPPORT 200 if (to_cs != NULL) { 201 iconv_t cd; 202 const char *src; 203 size_t srclen; 204 size_t cnt; 205 206 cd = iconv_open(to_cs, from_cs); 207 if (cd == (iconv_t)-1) 208 return -1; 209 210 src = decword; 211 srclen = declen; 212 dstend = *obuf; 213 dstlen = oend - *obuf - 1; 214 cnt = mime_iconv(cd, &src, &srclen, &dstend, &dstlen); 215 216 (void)iconv_close(cd); 217 if (cnt == (size_t)-1) 218 return -1; 219 } 220 #endif /* CHARSET_SUPPORT */ 221 *dstend = '\0'; 222 *ibuf = iend; 223 *obuf = dstend; 224 return 0; 225 } 226 227 228 /* 229 * Folding White Space. See RFC 2822. 
/*
 * Folding White Space.  See RFC 2822.
 *
 * Note: RFC 2822 specifies that '\n' and '\r' only occur as CRLF
 * pairs (i.e., "\r\n") and never separately.  However, by the time
 * mail(1) sees the messages, all CRLF pairs have been converted to
 * '\n' characters.
 *
 * XXX - pull is_FWS() and skip_FWS() up to def.h?
 */

/*
 * True (1) for space, tab and newline; false (0) for anything else.
 */
static inline int
is_FWS(int c)
{
	switch (c) {
	case ' ':
	case '\t':
	case '\n':
		return 1;
	default:
		return 0;
	}
}

/*
 * Return a pointer to the first character at or after p that is not
 * folding white space.  The terminating NUL stops the scan.
 */
static inline const char *
skip_FWS(const char *p)
{
	for (; is_FWS(*p); p++)
		continue;
	return p;
}

/*
 * If *src is not NULL, copy the bytes from *src up to (not including)
 * srcend to *dst, stopping early when dstend is reached.  *dst is
 * advanced past the copied bytes and *src is reset to NULL so the same
 * run is not copied twice.  Does nothing when *src is NULL.
 */
static inline void
copy_skipped_FWS(char **dst, char *dstend, const char **src, const char *srcend)
{
	const char *from;
	char *to;

	from = *src;
	if (from == NULL)
		return;		/* no skipped white space pending */

	/* copy any skipped linear-white-space */
	for (to = *dst; from < srcend && to < dstend; from++, to++)
		*to = *from;

	*dst = to;
	*src = NULL;
}

/*
 * Decode an unstructured field.
 *
 * See RFC 2822 Sec 2.2.1 and 3.6.5.
 * Encoded words may occur anywhere in unstructured fields provided
 * they are separated from any other text or encoded words by at least
 * one linear-white-space character. (See RFC 2047 sec 5.1.)  If two
 * encoded words occur sequentially (separated by only FWS) then the
 * separating FWS is removed.
 *
 * NOTE: unstructured fields cannot contain 'quoted-pairs' (see
 * RFC2822 sec 3.2.6 and RFC 2047), but that is no problem as a '\\'
 * (or any non-whitespace character) immediately before an
 * encoded-word will prevent it from being decoded.
 *
 * hstring should be a NUL terminated string.
 * outbuf should be sufficiently large to hold the result; at most
 * outsize - 1 bytes plus a terminating NUL are stored.
 *
 * NOTE(review): outsize == 0 is not handled; the final NUL is stored
 * unconditionally.
 */
static void
mime_decode_usfield(char *outbuf, size_t outsize, const char *hstring)
{
	const char *p, *p0;
	char *q, *qend;
	int lastc;
	const char *charset;

	charset = value(ENAME_MIME_CHARSET);
	qend = outbuf + outsize - 1; /* Make sure there is room for the trailing NUL! */
	q = outbuf;
	p = hstring;
	/*
	 * p0 marks the start of white space that was skipped after a
	 * decoded word but not yet copied; NULL when nothing is pending.
	 */
	p0 = NULL;
	/* Start as if preceded by a space so a leading encoded word decodes. */
	lastc = (unsigned char)' ';
	while (*p && q < qend) {
		const char *p1;
		char *q1;
		/*
		 * decode_word() works on the scratch copies p1/q1, so p and
		 * q are only committed when the word decodes and is followed
		 * by white space or the end of the string.
		 */
		if (is_FWS(lastc) && p[0] == '=' && p[1] == '?' &&
		    decode_word((p1 = p, &p1), (q1 = q, &q1), qend, charset) == 0 &&
		    (*p1 == '\0' || is_FWS(*p1))) {
			p0 = p1;  /* pointer to first character after encoded word */
			q = q1;
			p = skip_FWS(p1);
			lastc = (unsigned char)*p0;
		}
		else {
			/* Plain text follows: emit the white space held back. */
			copy_skipped_FWS(&q, qend, &p0, p);
			lastc = (unsigned char)*p;
			if (q < qend)
				*q++ = *p++;
		}
	}
	/* Trailing white space after a final encoded word is kept. */
	copy_skipped_FWS(&q, qend, &p0, p);
	*q = '\0';
}

/*
 * Decode a field comment.
 *
 * Comments only occur in structured fields, can be nested (rfc 2822,
 * sec 3.2.3), and can contain 'encoded-words' and 'quoted-pairs'.
 * Otherwise, they can be regarded as unstructured fields that are
 * bounded by '(' and ')' characters.
 *
 * On entry *ibuf points just past the opening '(' (the caller copies
 * it).  Input is read up to iend and output written up to oend.  On
 * return *ibuf and *obuf have been advanced; when a closing ')' was
 * found it has been copied and *ibuf points past it.
 *
 * As written every path returns 0: the -1 below can only be passed on
 * from a nested call, and no call ever produces it.
 */
static int
decode_comment(char **obuf, char *oend, const char **ibuf, const char *iend, const char *charset)
{
	const char *p, *pend, *p0;
	char *q, *qend;
	int lastc;

	p = *ibuf;
	q = *obuf;
	pend = iend;
	qend = oend;
	lastc = ' ';	/* the opening '(' counts as a word boundary */
	p0 = NULL;	/* start of skipped, not yet copied, white space */
	while (p < pend && q < qend) {
		const char *p1;
		char *q1;

		/*
		 * Same scheme as for unstructured fields, except that an
		 * encoded word may also be ended by the closing ')'.
		 */
		if (is_FWS(lastc) && p[0] == '=' && p[1] == '?' &&
		    decode_word((p1 = p, &p1), (q1 = q, &q1), qend, charset) == 0 &&
		    (*p1 == ')' || is_FWS(*p1))) {
			lastc = (unsigned char)*p1;
			p0 = p1;
			q = q1;
			p = skip_FWS(p1);
			/*
			 * XXX - this check should be unnecessary as *pend should
			 * be '\0' which will stop skip_FWS()
			 */
			if (p > pend)
				p = pend;
		}
		else {
			copy_skipped_FWS(&q, qend, &p0, p);
			if (q >= qend)	/* XXX - q > qend cannot happen */
				break;

			if (*p == ')') {
				*q++ = *p++;	/* copy the closing ')' */
				break;		/* and get out of here! */
			}

			if (*p == '(') {
				*q++ = *p++;	/* copy the opening '(' */
				/* nested comment: recurse until its ')' */
				if (decode_comment(&q, qend, &p, pend, charset) == -1)
					return -1;	/* is this right or should we update? */
				lastc = ')';
			}
			else if (*p == '\\' && p + 1 < pend) {	/* quoted-pair */
				/*
				 * Keep the backslash only where dropping it
				 * would change how the comment is parsed.
				 */
				if (p[1] == '(' || p[1] == ')' || p[1] == '\\') /* need quoted-pair*/
					*q++ = *p;
				p++;
				lastc = (unsigned char)*p;
				/* the backslash may have used the last free byte */
				if (q < qend)
					*q++ = *p++;
			}
			else {
				lastc = (unsigned char)*p;
				*q++ = *p++;
			}
		}
	}
	*ibuf = p;
	*obuf = q;
	return 0;
}

/*
 * Decode a quoted-string or no-fold-quote.
 *
 * These cannot contain encoded words.  They can contain quoted-pairs,
 * making '\\' special.  They have no other structure.  See RFC 2822
 * sec 3.2.5 and 3.6.4.
 *
 * On entry *ibuf points just past the opening '"' (the caller copies
 * it).  Characters are copied until the closing '"' has been copied,
 * or iend or oend is reached.  *ibuf and *obuf are advanced to match.
 */
static void
decode_quoted_string(char **obuf, char *oend, const char **ibuf, const char *iend)
{
	const char *p, *pend;
	char *q, *qend;

	qend = oend;
	pend = iend;
	p = *ibuf;
	q = *obuf;
	while (p < pend && q < qend) {
		if (*p == '"') {
			*q++ = *p++;	/* copy the closing '"' */
			break;
		}
		if (*p == '\\' && p + 1 < pend) { /* quoted-pair */
			/* the backslash is only kept before '"' and '\\' */
			if (p[1] == '"' || p[1] == '\\') {
				*q++ = *p;
				/* out of room: stop with p still at the '\\' */
				if (q >= qend)
					break;
			}
			p++;
		}
		*q++ = *p++;
	}
	*ibuf = p;
	*obuf = q;
}

/*
 * Decode a domain-literal or no-fold-literal.
 *
 * These cannot contain encoded words.  They can have quoted pairs and
 * are delimited by '[' and ']' making '\\', '[', and ']' special.
 * They have no other structure.  See RFC 2822 sec 3.4.1 and 3.6.4.
 *
 * On entry *ibuf points just past the opening '['.  Characters are
 * copied until the closing ']' has been copied, or iend or oend is
 * reached.  *ibuf and *obuf are advanced to match.
 */
static void
decode_domain_literal(char **obuf, char *oend, const char **ibuf, const char *iend)
{
	const char *in = *ibuf;
	char *out = *obuf;

	while (in < iend && out < oend) {
		const char c = *in;

		if (c == ']') {
			*out++ = *in++;	/* copy the closing ']' */
			break;
		}
		if (c == '\\' && in + 1 < iend) {	/* quoted-pair */
			const char next = in[1];

			/* the backslash survives only where it is needed */
			if (next == '[' || next == ']' || next == '\\') {
				*out++ = c;
				if (out >= oend)
					break;	/* full; input stays at the '\\' */
			}
			in++;
		}
		*out++ = *in++;
	}
	*ibuf = in;
	*obuf = out;
}

/*
 * Specials: see RFC 2822 sec 3.2.1.
 *
 * Returns 1 if c is one of  ( ) < > [ ] : ; @ \ , . "  and 0 for any
 * other value, including values outside the 7-bit range.
 */
static inline int
is_specials(int c)
{
	switch (c) {
	case '(': case ')':
	case '<': case '>':
	case '[': case ']':
	case ':': case ';':
	case '@': case '\\':
	case ',': case '.':
	case '"':
		return 1;
	default:
		return 0;
	}
}

/*
 * Decode a structured field.
 *
 * At the top level, structured fields can only contain encoded-words
 * via 'phrases' and 'comments'.  See RFC 2047 sec 5.
 *
 * hstring is the NUL terminated field text.  The result is written to
 * linebuf: at most bufsize - 1 bytes followed by a terminating NUL.
 *
 * NOTE(review): bufsize == 0 is not handled; the final NUL is stored
 * unconditionally.
 */
static void
mime_decode_sfield(char *linebuf, size_t bufsize, const char *hstring)
{
	const char *p, *pend, *p0;
	char *q, *qend;
	const char *charset;
	int lastc;

	charset = value(ENAME_MIME_CHARSET);

	p = hstring;
	q = linebuf;
	pend = hstring + strlen(hstring);
	qend = linebuf + bufsize - 1;	/* save room for the NUL terminator */
	/* Start as if preceded by a space so a leading encoded word decodes. */
	lastc = (unsigned char)' ';
	/*
	 * p0 marks the start of white space that was skipped after a
	 * decoded word but not yet copied; NULL when nothing is pending.
	 */
	p0 = NULL;
	while (p < pend && q < qend) {
		const char *p1;
		char *q1;

		/*
		 * Anything but '=' cannot start an encoded word, so white
		 * space held back after the previous word is emitted now.
		 * For '=' that is decided in the switch below.
		 */
		if (*p != '=') {
			copy_skipped_FWS(&q, qend, &p0, p);
			if (q >= qend)
				break;
		}

		switch (*p) {
		case '(':	/* start of comment */
			*q++ = *p++;	/* copy the opening '(' */
			(void)decode_comment(&q, qend, &p, pend, charset);
			lastc = (unsigned char)p[-1];
			break;

		case '"':	/* start of quoted-string or no-fold-quote */
			*q++ = *p++;	/* copy the opening '"' */
			decode_quoted_string(&q, qend, &p, pend);
			lastc = (unsigned char)p[-1];
			break;

		case '[':	/* start of domain-literal or no-fold-literal */
			*q++ = *p++;	/* copy the opening '[' */
			decode_domain_literal(&q, qend, &p, pend);
			lastc = (unsigned char)p[-1];
			break;

		case '\\':	/* start of quoted-pair */
			if (p + 1 < pend) {		/* quoted pair */
				/* keep the backslash only before a special */
				if (is_specials(p[1])) {
					*q++ = *p;
					/*
					 * This break leaves the switch; the
					 * loop then ends because q == qend.
					 */
					if (q >= qend)
						break;
				}
				p++;	/* skip the '\\' */
			}
			goto copy_char;

		case '=':
			/*
			 * At this level encoded words can appear via
			 * 'phrases' (possibly delimited by ',' as in
			 * 'keywords').  Thus we handle them as such.
			 * Hopefully this is sufficient.
			 *
			 * decode_word() works on the scratch copies p1/q1,
			 * so p and q are only committed when the word
			 * decodes and is properly delimited.
			 */
			if ((lastc == ',' || is_FWS(lastc)) && p[1] == '?' &&
			    decode_word((p1 = p, &p1), (q1 = q, &q1), qend, charset) == 0 &&
			    (*p1 == '\0' || *p1 == ',' || is_FWS(*p1))) {
				lastc = (unsigned char)*p1;
				p0 = p1;
				q = q1;
				p = skip_FWS(p1);
				/*
				 * XXX - this check should be
				 * unnecessary as *pend should be '\0'
				 * which will stop skip_FWS()
				 */
				if (p > pend)
					p = pend;
				break;
			}
			else {
				/* not an encoded word: treat '=' as plain text */
				copy_skipped_FWS(&q, qend, &p0, p);
				if (q >= qend)
					break;
				goto copy_char;
			}

		case '<':	/* start of angle-addr, msg-id, or path. */
			/*
			 * A msg-id cannot contain encoded-pairs or
			 * encoded-words, but angle-addr and path can.
			 * Distinguishing between them seems to be
			 * unnecessary, so let's be loose and just
			 * decode them as if they were all the same.
			 */
		default:
	copy_char:
			lastc = (unsigned char)*p;
			*q++ = *p++;
			break;
		}
	}
	/* Trailing white space after a final encoded word is kept. */
	copy_skipped_FWS(&q, qend, &p0, p);
	*q = '\0';	/* null terminate the result! */
}

607 */ 608 PUBLIC hfield_decoder_t 609 mime_hfield_decoder(const char *name) 610 { 611 static const struct field_decoder_tbl_s { 612 const char *field_name; 613 size_t field_len; 614 hfield_decoder_t decoder; 615 } field_decoder_tbl[] = { 616 #define X(s) s, sizeof(s) - 1 617 { X("Received:"), NULL }, 618 619 { X("Content-Type:"), NULL }, 620 { X("Content-Disposition:"), NULL }, 621 { X("Content-Transfer-Encoding:"), NULL }, 622 { X("Content-Description:"), mime_decode_sfield }, 623 { X("Content-ID:"), mime_decode_sfield }, 624 { X("MIME-Version:"), mime_decode_sfield }, 625 626 { X("Bcc:"), mime_decode_sfield }, 627 { X("Cc:"), mime_decode_sfield }, 628 { X("Date:"), mime_decode_sfield }, 629 { X("From:"), mime_decode_sfield }, 630 { X("In-Reply-To:"), mime_decode_sfield }, 631 { X("Keywords:"), mime_decode_sfield }, 632 { X("Message-ID:"), mime_decode_sfield }, 633 { X("References:"), mime_decode_sfield }, 634 { X("Reply-To:"), mime_decode_sfield }, 635 { X("Return-Path:"), mime_decode_sfield }, 636 { X("Sender:"), mime_decode_sfield }, 637 { X("To:"), mime_decode_sfield }, 638 { X("Subject:"), mime_decode_usfield }, 639 { X("Comments:"), mime_decode_usfield }, 640 { X("X-"), mime_decode_usfield }, 641 { NULL, 0, mime_decode_usfield }, /* optional-fields */ 642 #undef X 643 }; 644 const struct field_decoder_tbl_s *fp; 645 646 /* XXX - this begs for a hash table! */ 647 for (fp = field_decoder_tbl; fp->field_name; fp++) 648 if (strncasecmp(name, fp->field_name, fp->field_len) == 0) 649 break; 650 return fp->decoder; 651 } 652 653 #endif /* MIME_SUPPORT */ 654