/*	$NetBSD: mime_header.c,v 1.6 2008/04/28 20:24:14 martin Exp $	*/

/*-
 * Copyright (c) 2006 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Anon Ymous.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */


/*
 * This module contains the core MIME header decoding routines.
 * Please refer to RFC 2047 and RFC 2822.
 */
36 */ 37 38 #ifdef MIME_SUPPORT 39 40 #include <sys/cdefs.h> 41 #ifndef __lint__ 42 __RCSID("$NetBSD: mime_header.c,v 1.6 2008/04/28 20:24:14 martin Exp $"); 43 #endif /* not __lint__ */ 44 45 #include <stdio.h> 46 #include <stdlib.h> 47 #include <string.h> 48 49 #include "def.h" 50 #include "extern.h" 51 #include "mime.h" 52 #include "mime_header.h" 53 #include "mime_codecs.h" 54 55 /* 56 * Our interface to mime_b64tobin() 57 * 58 * XXX - This should move to mime_codecs.c. 59 */ 60 static ssize_t 61 mime_B64_decode(char *outbuf, size_t outlen, const char *inbuf, size_t inlen) 62 { 63 if (outlen < 3 * roundup(inlen, 4) / 4) 64 return -1; 65 66 return mime_b64tobin(outbuf, inbuf, inlen); 67 } 68 69 70 /* 71 * Header specific "quoted-printable" decode! 72 * Differences with body QP decoding (see rfc 2047, sec 4.2): 73 * 1) '=' occurs _only_ when followed by two hex digits (FWS is not allowed). 74 * 2) Spaces can be encoded as '_' in headers for readability. 75 * 76 * XXX - This should move to mime_codecs.c. 
/*
 * The two helpers below serve decode_word(): mime_QPh_decode() undoes
 * the RFC 2047 "Q" encoding of a header and grab_charset() extracts
 * the charset name of an encoded-word.
 */

/*
 * Value (0..15) of the hexadecimal digit 'c', or -1 if 'c' is not a
 * hexadecimal digit.  Both upper and lower case digits are accepted.
 */
static int
qph_hexval(int c)
{
	if (c >= '0' && c <= '9')
		return c - '0';
	if (c >= 'a' && c <= 'f')
		return c - 'a' + 10;
	if (c >= 'A' && c <= 'F')
		return c - 'A' + 10;
	return -1;
}

/*
 * Decode 'inlen' bytes of "Q"-encoded header text from 'inbuf' into
 * 'outbuf', which has room for 'outlen' bytes.  "=XX" (two hex
 * digits) becomes the byte 0xXX, '_' becomes ' ', and every other
 * byte is copied unchanged.  Neither buffer is treated as NUL
 * terminated and no terminator is written.
 *
 * Returns the number of bytes stored, or -1 if the output does not
 * fit or a '=' is not followed by two hex digits.
 *
 * The digits are validated one by one rather than with strtol(3):
 * strtol() also accepts leading white space and a sign, which would
 * let malformed escapes such as "=-1" or "= 1" through.
 */
static ssize_t
mime_QPh_decode(char *outbuf, size_t outlen, const char *inbuf, size_t inlen)
{
	const char *p, *inend;
	char *q, *outend;

	inend = inbuf + inlen;
	outend = outbuf + outlen;
	q = outbuf;
	for (p = inbuf; p < inend; p++) {
		if (q >= outend)
			return -1;
		if (*p == '=') {
			int hi, lo;

			if (inend - p < 3)	/* '=' plus two digits */
				return -1;
			hi = qph_hexval((unsigned char)p[1]);
			lo = qph_hexval((unsigned char)p[2]);
			if (hi < 0 || lo < 0)
				return -1;
			*q++ = (char)((hi << 4) | lo);
			p += 2;		/* the loop steps over the 2nd digit */
		}
		else if (*p == '_')	/* header's may encode ' ' as '_' */
			*q++ = ' ';
		else
			*q++ = *p;
	}
	return q - outbuf;
}

/*
 * Copy the charset name that starts at 'p' and runs up to the next
 * '?' into 'from_cs', which has room for 'from_cs_len' bytes including
 * the NUL terminator.
 *
 * Returns a pointer to the character following the '?', or NULL if
 * the string ends before a '?' is found or the name does not fit.
 */
static const char *
grab_charset(char *from_cs, size_t from_cs_len, const char *p)
{
	size_t n;

	if (from_cs_len == 0)	/* not even room for the terminator */
		return NULL;

	for (n = 0; *p != '?'; p++) {
		if (*p == '\0' || n >= from_cs_len - 1)
			return NULL;
		from_cs[n++] = *p;
	}
	from_cs[n] = '\0';
	return p + 1;	/* if here, then we got the '?' */
}

/*
 * An encoded word is a string of at most 75 non-white space
 * characters of the following form:
 *
 *  =?charset?X?encoding?=
 *
 * where:
 *   'charset'	is the original character set of the unencoded string.
 *
 *   'X'	is the encoding type 'B' or 'Q' for "base64" or
 *		"quoted-printable", respectively,
 *   'encoding'	is the encoded string.
 *
 * Both 'charset' and 'X' are case independent and 'encoding' cannot
 * contain any whitespace or '?' characters.  The 'encoding' must also
 * be fully contained within the encoded words, i.e., it cannot be
 * split between encoded words.
 *
 * Note: the 'Q' encoding is a slightly modified "quoted-printable"
 * encoding.  In particular, spaces (' ') may be encoded as '_' to
 * improve undecoded readability.
 */
151 */ 152 static int 153 decode_word(const char **ibuf, char **obuf, char *oend, const char *to_cs) 154 { 155 ssize_t declen; 156 size_t enclen, dstlen; 157 char decword[LINESIZE]; 158 char from_cs[LINESIZE]; 159 const char *encword, *iend, *p; 160 char *dstend; 161 char enctype; 162 163 p = *ibuf; 164 if (p[0] != '=' && p[1] != '?') 165 return -1; 166 if (strlen(p) < 2 + 1 + 3 + 1 + 2) 167 return -1; 168 p = grab_charset(from_cs, sizeof(from_cs), p + 2); 169 if (p == NULL) 170 return -1; 171 enctype = *p++; 172 if (*p++ != '?') 173 return -1; 174 encword = p; 175 p = strchr(p, '?'); 176 if (p == NULL || p[1] != '=') 177 return -1; 178 enclen = p - encword; /* length of encoded substring */ 179 iend = p + 2; 180 /* encoded words are at most 75 characters (RFC 2047, sec 2) */ 181 if (iend > *ibuf + 75) 182 return -1; 183 184 dstend = to_cs ? decword : *obuf; 185 dstlen = (to_cs ? sizeof(decword): oend - *obuf) - 1; 186 187 if (enctype == 'B' || enctype == 'b') 188 declen = mime_B64_decode(dstend, dstlen, encword, enclen); 189 else if (enctype == 'Q' || enctype == 'q') 190 declen = mime_QPh_decode(dstend, dstlen, encword, enclen); 191 else 192 return -1; 193 194 if (declen == -1) 195 return -1; 196 197 dstend += declen; 198 #ifdef CHARSET_SUPPORT 199 if (to_cs != NULL) { 200 iconv_t cd; 201 const char *src; 202 size_t srclen; 203 size_t cnt; 204 205 cd = iconv_open(to_cs, from_cs); 206 if (cd == (iconv_t)-1) 207 return -1; 208 209 src = decword; 210 srclen = declen; 211 dstend = *obuf; 212 dstlen = oend - *obuf - 1; 213 cnt = mime_iconv(cd, &src, &srclen, &dstend, &dstlen); 214 215 (void)iconv_close(cd); 216 if (cnt == (size_t)-1) 217 return -1; 218 } 219 #endif /* CHARSET_SUPPORT */ 220 *dstend = '\0'; 221 *ibuf = iend; 222 *obuf = dstend; 223 return 0; 224 } 225 226 227 /* 228 * Folding White Space. See RFC 2822. 229 * 230 * Note: RFC 2822 specifies that '\n' and '\r' only occur as CRLF 231 * pairs (i.e., "\r\n") and never separately. 
However, by the time 232 * mail(1) sees the messages, all CRLF pairs have been converted to 233 * '\n' characters. 234 * 235 * XXX - pull is_FWS() and skip_FWS() up to def.h? 236 */ 237 static inline int 238 is_FWS(int c) 239 { 240 return c == ' ' || c == '\t' || c == '\n'; 241 } 242 243 static inline const char * 244 skip_FWS(const char *p) 245 { 246 while (is_FWS(*p)) 247 p++; 248 return p; 249 } 250 251 static inline void 252 copy_skipped_FWS(char **dst, char *dstend, const char **src, const char *srcend) 253 { 254 const char *p, *pend; 255 char *q, *qend; 256 257 p = *src; 258 q = *dst; 259 pend = srcend; 260 qend = dstend; 261 262 if (p) { /* copy any skipped linear-white-space */ 263 while (p < pend && q < qend) 264 *q++ = *p++; 265 *dst = q; 266 *src = NULL; 267 } 268 } 269 270 /* 271 * Decode an unstructured field. 272 * 273 * See RFC 2822 Sec 2.2.1 and 3.6.5. 274 * Encoded words may occur anywhere in unstructured fields provided 275 * they are separated from any other text or encoded words by at least 276 * one linear-white-space character. (See RFC 2047 sec 5.1.) If two 277 * encoded words occur sequentially (separated by only FWS) then the 278 * separating FWS is removed. 279 * 280 * NOTE: unstructured fields cannot contain 'quoted-pairs' (see 281 * RFC2822 sec 3.2.6 and RFC 2047), but that is no problem as a '\\' 282 * (or any non-whitespace character) immediately before an 283 * encoded-word will prevent it from being decoded. 284 * 285 * hstring should be a NULL terminated string. 286 * outbuf should be sufficiently large to hold the result. 287 */ 288 static void 289 mime_decode_usfield(char *outbuf, size_t outsize, const char *hstring) 290 { 291 const char *p, *p0; 292 char *q, *qend; 293 int lastc; 294 const char *charset; 295 296 charset = value(ENAME_MIME_CHARSET); 297 qend = outbuf + outsize - 1; /* Make sure there is room for the trailing NULL! 
*/ 298 q = outbuf; 299 p = hstring; 300 p0 = NULL; 301 lastc = (unsigned char)' '; 302 while (*p && q < qend) { 303 const char *p1; 304 char *q1; 305 if (is_FWS(lastc) && p[0] == '=' && p[1] == '?' && 306 decode_word((p1 = p, &p1), (q1 = q, &q1), qend, charset) == 0 && 307 (*p1 == '\0' || is_FWS(*p1))) { 308 p0 = p1; /* pointer to first character after encoded word */ 309 q = q1; 310 p = skip_FWS(p1); 311 lastc = (unsigned char)*p0; 312 } 313 else { 314 copy_skipped_FWS(&q, qend, &p0, p); 315 lastc = (unsigned char)*p; 316 if (q < qend) 317 *q++ = *p++; 318 } 319 } 320 copy_skipped_FWS(&q, qend, &p0, p); 321 *q = '\0'; 322 } 323 324 /* 325 * Decode a field comment. 326 * 327 * Comments only occur in structured fields, can be nested (rfc 2822, 328 * sec 3.2.3), and can contain 'encoded-words' and 'quoted-pairs'. 329 * Otherwise, they can be regarded as unstructured fields that are 330 * bounded by '(' and ')' characters. 331 */ 332 static int 333 decode_comment(char **obuf, char *oend, const char **ibuf, const char *iend, const char *charset) 334 { 335 const char *p, *pend, *p0; 336 char *q, *qend; 337 int lastc; 338 339 p = *ibuf; 340 q = *obuf; 341 pend = iend; 342 qend = oend; 343 lastc = ' '; 344 p0 = NULL; 345 while (p < pend && q < qend) { 346 const char *p1; 347 char *q1; 348 349 if (is_FWS(lastc) && p[0] == '=' && p[1] == '?' && 350 decode_word((p1 = p, &p1), (q1 = q, &q1), qend, charset) == 0 && 351 (*p1 == ')' || is_FWS(*p1))) { 352 lastc = (unsigned char)*p1; 353 p0 = p1; 354 q = q1; 355 p = skip_FWS(p1); 356 /* 357 * XXX - this check should be unnecessary as *pend should 358 * be '\0' which will stop skip_FWS() 359 */ 360 if (p > pend) 361 p = pend; 362 } 363 else { 364 copy_skipped_FWS(&q, qend, &p0, p); 365 if (q >= qend) /* XXX - q > qend cannot happen */ 366 break; 367 368 if (*p == ')') { 369 *q++ = *p++; /* copy the closing ')' */ 370 break; /* and get out of here! 
*/ 371 } 372 373 if (*p == '(') { 374 *q++ = *p++; /* copy the opening '(' */ 375 if (decode_comment(&q, qend, &p, pend, charset) == -1) 376 return -1; /* is this right or should we update? */ 377 lastc = ')'; 378 } 379 else if (*p == '\\' && p + 1 < pend) { /* quoted-pair */ 380 if (p[1] == '(' || p[1] == ')' || p[1] == '\\') /* need quoted-pair*/ 381 *q++ = *p; 382 p++; 383 lastc = (unsigned char)*p; 384 if (q < qend) 385 *q++ = *p++; 386 } 387 else { 388 lastc = (unsigned char)*p; 389 *q++ = *p++; 390 } 391 } 392 } 393 *ibuf = p; 394 *obuf = q; 395 return 0; 396 } 397 398 /* 399 * Decode a quoted-string or no-fold-quote. 400 * 401 * These cannot contain encoded words. They can contain quoted-pairs, 402 * making '\\' special. They have no other structure. See RFC 2822 403 * sec 3.2.5 and 3.6.4. 404 */ 405 static void 406 decode_quoted_string(char **obuf, char *oend, const char **ibuf, const char *iend) 407 { 408 const char *p, *pend; 409 char *q, *qend; 410 411 qend = oend; 412 pend = iend; 413 p = *ibuf; 414 q = *obuf; 415 while (p < pend && q < qend) { 416 if (*p == '"') { 417 *q++ = *p++; /* copy the closing '"' */ 418 break; 419 } 420 if (*p == '\\' && p + 1 < pend) { /* quoted-pair */ 421 if (p[1] == '"' || p[1] == '\\') { 422 *q++ = *p; 423 if (q >= qend) 424 break; 425 } 426 p++; 427 } 428 *q++ = *p++; 429 } 430 *ibuf = p; 431 *obuf = q; 432 } 433 434 /* 435 * Decode a domain-literal or no-fold-literal. 436 * 437 * These cannot contain encoded words. They can have quoted pairs and 438 * are delimited by '[' and ']' making '\\', '[', and ']' special. 439 * They have no other structure. See RFC 2822 sec 3.4.1 and 3.6.4. 
/*
 * decode_domain_literal(): on entry *ibuf is the first character
 * after the opening '['.  Text is copied from *ibuf (bounded by
 * 'iend') to *obuf (bounded by 'oend') up to and including the
 * closing ']'.  The '\\' of a quoted-pair is kept only in front of
 * '[', ']' and '\\'.  On return *ibuf and *obuf have been advanced
 * past the text consumed and produced.
 */
static void
decode_domain_literal(char **obuf, char *oend, const char **ibuf, const char *iend)
{
	const char *in = *ibuf;
	char *out = *obuf;

	while (in < iend && out < oend) {
		const char c = *in;

		if (c == ']') {
			*out++ = c;	/* copy the closing ']' */
			in++;
			break;
		}
		if (c == '\\' && in + 1 < iend) {	/* quoted-pair */
			const char escaped = in[1];

			if (escaped == '[' || escaped == ']' || escaped == '\\') {
				*out++ = c;
				if (out >= oend)
					break;
			}
			in++;
		}
		*out++ = *in++;
	}
	*ibuf = in;
	*obuf = out;
}

/*
 * Specials: see RFC 2822 sec 3.2.1.
 *
 * Returns 1 if 'c' is one of  ( ) < > [ ] : ; @ \ , . "  and 0 for
 * every other value, including values outside the 7-bit range.
 */
static inline int
is_specials(int c)
{
	switch (c) {
	case '(': case ')':
	case '<': case '>':
	case '[': case ']':
	case ':': case ';':
	case '@': case '\\':
	case ',': case '.':
	case '"':
		return 1;
	default:
		return 0;
	}
}

/*
 * Decode a structured field.
 *
 * At the top level, structured fields can only contain encoded-words
 * via 'phrases' and 'comments'.  See RFC 2047 sec 5.
 */
495 */ 496 static void 497 mime_decode_sfield(char *linebuf, size_t bufsize, const char *hstring) 498 { 499 const char *p, *pend, *p0; 500 char *q, *qend; 501 const char *charset; 502 int lastc; 503 504 charset = value(ENAME_MIME_CHARSET); 505 506 p = hstring; 507 q = linebuf; 508 pend = hstring + strlen(hstring); 509 qend = linebuf + bufsize - 1; /* save room for the NULL terminator */ 510 lastc = (unsigned char)' '; 511 p0 = NULL; 512 while (p < pend && q < qend) { 513 const char *p1; 514 char *q1; 515 516 if (*p != '=') { 517 copy_skipped_FWS(&q, qend, &p0, p); 518 if (q >= qend) 519 break; 520 } 521 522 switch (*p) { 523 case '(': /* start of comment */ 524 *q++ = *p++; /* copy the opening '(' */ 525 (void)decode_comment(&q, qend, &p, pend, charset); 526 lastc = (unsigned char)p[-1]; 527 break; 528 529 case '"': /* start of quoted-string or no-fold-quote */ 530 *q++ = *p++; /* copy the opening '"' */ 531 decode_quoted_string(&q, qend, &p, pend); 532 lastc = (unsigned char)p[-1]; 533 break; 534 535 case '[': /* start of domain-literal or no-fold-literal */ 536 *q++ = *p++; /* copy the opening '[' */ 537 decode_domain_literal(&q, qend, &p, pend); 538 lastc = (unsigned char)p[-1]; 539 break; 540 541 case '\\': /* start of quoted-pair */ 542 if (p + 1 < pend) { /* quoted pair */ 543 if (is_specials(p[1])) { 544 *q++ = *p; 545 if (q >= qend) 546 break; 547 } 548 p++; /* skip the '\\' */ 549 } 550 goto copy_char; 551 552 case '=': 553 /* 554 * At this level encoded words can appear via 555 * 'phrases' (possibly delimited by ',' as in 556 * 'keywords'). Thus we handle them as such. 557 * Hopefully this is sufficient. 558 */ 559 if ((lastc == ',' || is_FWS(lastc)) && p[1] == '?' 
&& 560 decode_word((p1 = p, &p1), (q1 = q, &q1), qend, charset) == 0 && 561 (*p1 == '\0' || *p1 == ',' || is_FWS(*p1))) { 562 lastc = (unsigned char)*p1; 563 p0 = p1; 564 q = q1; 565 p = skip_FWS(p1); 566 /* 567 * XXX - this check should be 568 * unnecessary as *pend should be '\0' 569 * which will stop skip_FWS() 570 */ 571 if (p > pend) 572 p = pend; 573 break; 574 } 575 else { 576 copy_skipped_FWS(&q, qend, &p0, p); 577 if (q >= qend) 578 break; 579 goto copy_char; 580 } 581 582 case '<': /* start of angle-addr, msg-id, or path. */ 583 /* 584 * A msg-id cannot contain encoded-pairs or 585 * encoded-words, but angle-addr and path can. 586 * Distinguishing between them seems to be 587 * unnecessary, so let's be loose and just 588 * decode them as if they were all the same. 589 */ 590 default: 591 copy_char: 592 lastc = (unsigned char)*p; 593 *q++ = *p++; 594 break; 595 } 596 } 597 copy_skipped_FWS(&q, qend, &p0, p); 598 *q = '\0'; /* null terminate the result! */ 599 } 600 601 /* 602 * Returns the correct hfield decoder, or NULL if none. 603 * Info extracted from RFC 2822. 604 * 605 * name - pointer to field name of header line (with colon). 
606 */ 607 PUBLIC hfield_decoder_t 608 mime_hfield_decoder(const char *name) 609 { 610 static const struct field_decoder_tbl_s { 611 const char *field_name; 612 size_t field_len; 613 hfield_decoder_t decoder; 614 } field_decoder_tbl[] = { 615 #define X(s) s, sizeof(s) - 1 616 { X("Received:"), NULL }, 617 618 { X("Content-Type:"), NULL }, 619 { X("Content-Disposition:"), NULL }, 620 { X("Content-Transfer-Encoding:"), NULL }, 621 { X("Content-Description:"), mime_decode_sfield }, 622 { X("Content-ID:"), mime_decode_sfield }, 623 { X("MIME-Version:"), mime_decode_sfield }, 624 625 { X("Bcc:"), mime_decode_sfield }, 626 { X("Cc:"), mime_decode_sfield }, 627 { X("Date:"), mime_decode_sfield }, 628 { X("From:"), mime_decode_sfield }, 629 { X("In-Reply-To:"), mime_decode_sfield }, 630 { X("Keywords:"), mime_decode_sfield }, 631 { X("Message-ID:"), mime_decode_sfield }, 632 { X("References:"), mime_decode_sfield }, 633 { X("Reply-To:"), mime_decode_sfield }, 634 { X("Return-Path:"), mime_decode_sfield }, 635 { X("Sender:"), mime_decode_sfield }, 636 { X("To:"), mime_decode_sfield }, 637 { X("Subject:"), mime_decode_usfield }, 638 { X("Comments:"), mime_decode_usfield }, 639 { X("X-"), mime_decode_usfield }, 640 { NULL, 0, mime_decode_usfield }, /* optional-fields */ 641 #undef X 642 }; 643 const struct field_decoder_tbl_s *fp; 644 645 /* XXX - this begs for a hash table! */ 646 for (fp = field_decoder_tbl; fp->field_name; fp++) 647 if (strncasecmp(name, fp->field_name, fp->field_len) == 0) 648 break; 649 return fp->decoder; 650 } 651 652 #endif /* MIME_SUPPORT */ 653