1 /* $NetBSD: mime_header.c,v 1.4 2007/10/23 14:58:44 christos Exp $ */ 2 3 /*- 4 * Copyright (c) 2006 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Anon Ymous. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the NetBSD 21 * Foundation, Inc. and its contributors. 22 * 4. Neither the name of The NetBSD Foundation nor the names of its 23 * contributors may be used to endorse or promote products derived 24 * from this software without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 27 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 28 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 29 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 30 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 31 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 32 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 33 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 34 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 35 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 36 * POSSIBILITY OF SUCH DAMAGE. 37 */ 38 39 40 /* 41 * This module contains the core MIME header decoding routines. 42 * Please refer to RFC 2047 and RFC 2822. 43 */ 44 45 #ifdef MIME_SUPPORT 46 47 #include <sys/cdefs.h> 48 #ifndef __lint__ 49 __RCSID("$NetBSD: mime_header.c,v 1.4 2007/10/23 14:58:44 christos Exp $"); 50 #endif /* not __lint__ */ 51 52 #include <stdio.h> 53 #include <stdlib.h> 54 #include <string.h> 55 56 #include "def.h" 57 #include "extern.h" 58 #include "mime.h" 59 #include "mime_header.h" 60 #include "mime_codecs.h" 61 62 /* 63 * Our interface to mime_b64tobin() 64 * 65 * XXX - This should move to mime_codecs.c. 66 */ 67 static ssize_t 68 mime_B64_decode(char *outbuf, size_t outlen, const char *inbuf, size_t inlen) 69 { 70 if (outlen < 3 * roundup(inlen, 4) / 4) 71 return -1; 72 73 return mime_b64tobin(outbuf, inbuf, inlen); 74 } 75 76 77 /* 78 * Header specific "quoted-printable" decode! 79 * Differences with body QP decoding (see rfc 2047, sec 4.2): 80 * 1) '=' occurs _only_ when followed by two hex digits (FWS is not allowed). 81 * 2) Spaces can be encoded as '_' in headers for readability. 82 * 83 * XXX - This should move to mime_codecs.c. 84 */ 85 static ssize_t 86 mime_QPh_decode(char *outbuf, size_t outlen, const char *inbuf, size_t inlen) 87 { 88 const char *p, *inend; 89 char *outend; 90 char *q; 91 92 outend = outbuf + outlen; 93 inend = inbuf + inlen; 94 q = outbuf; 95 for (p = inbuf; p < inend; p++) { 96 if (q >= outend) 97 return -1; 98 if (*p == '=') { 99 p++; 100 if (p + 1 < inend) { 101 int c; 102 char *bufend; 103 char buf[3]; 104 buf[0] = *p++; 105 buf[1] = *p; 106 buf[2] = '\0'; 107 c = strtol(buf, &bufend, 16); 108 if (bufend != &buf[2]) 109 return -1; 110 *q++ = c; 111 } 112 else 113 return -1; 114 } 115 else if (*p == '_') /* header's may encode ' ' as '_' */ 116 *q++ = ' '; 117 else 118 *q++ = *p; 119 } 120 return q - outbuf; 121 } 122 123 static const char * 124 grab_charset(char *from_cs, size_t from_cs_len, const char *p) 125 { 126 char *q; 127 q = from_cs; 128 for (/*EMPTY*/; *p != '?'; p++) { 129 if (*p == '\0' || q >= from_cs + from_cs_len - 1) 130 return NULL; 131 *q++ = *p; 132 } 133 *q = '\0'; 134 return ++p; /* if here, then we got the '?' */ 135 } 136 137 /* 138 * An encoded word is a string of at most 75 non-white space 139 * characters of the following form: 140 * 141 * =?charset?X?encoding?= 142 * 143 * where: 144 * 'charset' is the original character set of the unencoded string. 145 * 146 * 'X' is the encoding type 'B' or 'Q' for "base64" or 147 * "quoted-printable", respectively, 148 * 'encoding' is the encoded string. 149 * 150 * Both 'charset' and 'X' are case independent and 'encoding' cannot 151 * contain any whitespace or '?' characters. The 'encoding' must also 152 * be fully contained within the encoded words, i.e., it cannot be 153 * split between encoded words. 154 * 155 * Note: the 'B' encoding is a slightly modified "quoted-printable" 156 * encoding. In particular, spaces (' ') may be encoded as '_' to 157 * improve undecoded readability. 158 */ 159 static int 160 decode_word(const char **ibuf, char **obuf, char *oend, const char *to_cs) 161 { 162 ssize_t declen; 163 size_t enclen, dstlen; 164 char decword[LINESIZE]; 165 char from_cs[LINESIZE]; 166 const char *encword, *iend, *p; 167 char *dstend; 168 char enctype; 169 170 p = *ibuf; 171 if (p[0] != '=' && p[1] != '?') 172 return -1; 173 if (strlen(p) < 2 + 1 + 3 + 1 + 2) 174 return -1; 175 p = grab_charset(from_cs, sizeof(from_cs), p + 2); 176 if (p == NULL) 177 return -1; 178 enctype = *p++; 179 if (*p++ != '?') 180 return -1; 181 encword = p; 182 p = strchr(p, '?'); 183 if (p == NULL || p[1] != '=') 184 return -1; 185 enclen = p - encword; /* length of encoded substring */ 186 iend = p + 2; 187 /* encoded words are at most 75 characters (RFC 2047, sec 2) */ 188 if (iend > *ibuf + 75) 189 return -1; 190 191 dstend = to_cs ? decword : *obuf; 192 dstlen = (to_cs ? sizeof(decword): oend - *obuf) - 1; 193 194 if (enctype == 'B' || enctype == 'b') 195 declen = mime_B64_decode(dstend, dstlen, encword, enclen); 196 else if (enctype == 'Q' || enctype == 'q') 197 declen = mime_QPh_decode(dstend, dstlen, encword, enclen); 198 else 199 return -1; 200 201 if (declen == -1) 202 return -1; 203 204 dstend += declen; 205 #ifdef CHARSET_SUPPORT 206 if (to_cs != NULL) { 207 iconv_t cd; 208 const char *src; 209 size_t srclen; 210 size_t cnt; 211 212 cd = iconv_open(to_cs, from_cs); 213 if (cd == (iconv_t)-1) 214 return -1; 215 216 src = decword; 217 srclen = declen; 218 dstend = *obuf; 219 dstlen = oend - *obuf - 1; 220 cnt = mime_iconv(cd, &src, &srclen, &dstend, &dstlen); 221 222 (void)iconv_close(cd); 223 if (cnt == (size_t)-1) 224 return -1; 225 } 226 #endif /* CHARSET_SUPPORT */ 227 *dstend = '\0'; 228 *ibuf = iend; 229 *obuf = dstend; 230 return 0; 231 } 232 233 234 /* 235 * Folding White Space. See RFC 2822. 236 * 237 * Note: RFC 2822 specifies that '\n' and '\r' only occur as CRLF 238 * pairs (i.e., "\r\n") and never separately. However, by the time 239 * mail(1) sees the messages, all CRLF pairs have been converted to 240 * '\n' characters. 241 * 242 * XXX - pull is_FWS() and skip_FWS() up to def.h? 243 */ 244 static inline int 245 is_FWS(int c) 246 { 247 return c == ' ' || c == '\t' || c == '\n'; 248 } 249 250 static inline const char * 251 skip_FWS(const char *p) 252 { 253 while (is_FWS(*p)) 254 p++; 255 return p; 256 } 257 258 static inline void 259 copy_skipped_FWS(char **dst, char *dstend, const char **src, const char *srcend) 260 { 261 const char *p, *pend; 262 char *q, *qend; 263 264 p = *src; 265 q = *dst; 266 pend = srcend; 267 qend = dstend; 268 269 if (p) { /* copy any skipped linear-white-space */ 270 while (p < pend && q < qend) 271 *q++ = *p++; 272 *dst = q; 273 *src = NULL; 274 } 275 } 276 277 /* 278 * Decode an unstructured field. 279 * 280 * See RFC 2822 Sec 2.2.1 and 3.6.5. 281 * Encoded words may occur anywhere in unstructured fields provided 282 * they are separated from any other text or encoded words by at least 283 * one linear-white-space character. (See RFC 2047 sec 5.1.) If two 284 * encoded words occur sequentially (separated by only FWS) then the 285 * separating FWS is removed. 286 * 287 * NOTE: unstructured fields cannot contain 'quoted-pairs' (see 288 * RFC2822 sec 3.2.6 and RFC 2047), but that is no problem as a '\\' 289 * (or any non-whitespace character) immediately before an 290 * encoded-word will prevent it from being decoded. 291 * 292 * hstring should be a NULL terminated string. 293 * outbuf should be sufficiently large to hold the result. 294 */ 295 static void 296 mime_decode_usfield(char *outbuf, size_t outsize, const char *hstring) 297 { 298 const char *p, *p0; 299 char *q, *qend; 300 int lastc; 301 const char *charset; 302 303 charset = value(ENAME_MIME_CHARSET); 304 qend = outbuf + outsize - 1; /* Make sure there is room for the trailing NULL! */ 305 q = outbuf; 306 p = hstring; 307 p0 = NULL; 308 lastc = (unsigned char)' '; 309 while (*p && q < qend) { 310 const char *p1; 311 char *q1; 312 if (is_FWS(lastc) && p[0] == '=' && p[1] == '?' && 313 decode_word((p1 = p, &p1), (q1 = q, &q1), qend, charset) == 0 && 314 (*p1 == '\0' || is_FWS(*p1))) { 315 p0 = p1; /* pointer to first character after encoded word */ 316 q = q1; 317 p = skip_FWS(p1); 318 lastc = (unsigned char)*p0; 319 } 320 else { 321 copy_skipped_FWS(&q, qend, &p0, p); 322 lastc = (unsigned char)*p; 323 if (q < qend) 324 *q++ = *p++; 325 } 326 } 327 copy_skipped_FWS(&q, qend, &p0, p); 328 *q = '\0'; 329 } 330 331 /* 332 * Decode a field comment. 333 * 334 * Comments only occur in structured fields, can be nested (rfc 2822, 335 * sec 3.2.3), and can contain 'encoded-words' and 'quoted-pairs'. 336 * Otherwise, they can be regarded as unstructured fields that are 337 * bounded by '(' and ')' characters. 338 */ 339 static int 340 decode_comment(char **obuf, char *oend, const char **ibuf, const char *iend, const char *charset) 341 { 342 const char *p, *pend, *p0; 343 char *q, *qend; 344 int lastc; 345 346 p = *ibuf; 347 q = *obuf; 348 pend = iend; 349 qend = oend; 350 lastc = ' '; 351 p0 = NULL; 352 while (p < pend && q < qend) { 353 const char *p1; 354 char *q1; 355 356 if (is_FWS(lastc) && p[0] == '=' && p[1] == '?' && 357 decode_word((p1 = p, &p1), (q1 = q, &q1), qend, charset) == 0 && 358 (*p1 == ')' || is_FWS(*p1))) { 359 lastc = (unsigned char)*p1; 360 p0 = p1; 361 q = q1; 362 p = skip_FWS(p1); 363 /* 364 * XXX - this check should be unnecessary as *pend should 365 * be '\0' which will stop skip_FWS() 366 */ 367 if (p > pend) 368 p = pend; 369 } 370 else { 371 copy_skipped_FWS(&q, qend, &p0, p); 372 if (q >= qend) /* XXX - q > qend cannot happen */ 373 break; 374 375 if (*p == ')') { 376 *q++ = *p++; /* copy the closing ')' */ 377 break; /* and get out of here! */ 378 } 379 380 if (*p == '(') { 381 *q++ = *p++; /* copy the opening '(' */ 382 if (decode_comment(&q, qend, &p, pend, charset) == -1) 383 return -1; /* is this right or should we update? */ 384 lastc = ')'; 385 } 386 else if (*p == '\\' && p + 1 < pend) { /* quoted-pair */ 387 if (p[1] == '(' || p[1] == ')' || p[1] == '\\') /* need quoted-pair*/ 388 *q++ = *p; 389 p++; 390 lastc = (unsigned char)*p; 391 if (q < qend) 392 *q++ = *p++; 393 } 394 else { 395 lastc = (unsigned char)*p; 396 *q++ = *p++; 397 } 398 } 399 } 400 *ibuf = p; 401 *obuf = q; 402 return 0; 403 } 404 405 /* 406 * Decode a quoted-string or no-fold-quote. 407 * 408 * These cannot contain encoded words. They can contain quoted-pairs, 409 * making '\\' special. They have no other structure. See RFC 2822 410 * sec 3.2.5 and 3.6.4. 411 */ 412 static void 413 decode_quoted_string(char **obuf, char *oend, const char **ibuf, const char *iend) 414 { 415 const char *p, *pend; 416 char *q, *qend; 417 418 qend = oend; 419 pend = iend; 420 p = *ibuf; 421 q = *obuf; 422 while (p < pend && q < qend) { 423 if (*p == '"') { 424 *q++ = *p++; /* copy the closing '"' */ 425 break; 426 } 427 if (*p == '\\' && p + 1 < pend) { /* quoted-pair */ 428 if (p[1] == '"' || p[1] == '\\') { 429 *q++ = *p; 430 if (q >= qend) 431 break; 432 } 433 p++; 434 } 435 *q++ = *p++; 436 } 437 *ibuf = p; 438 *obuf = q; 439 } 440 441 /* 442 * Decode a domain-literal or no-fold-literal. 443 * 444 * These cannot contain encoded words. They can have quoted pairs and 445 * are delimited by '[' and ']' making '\\', '[', and ']' special. 446 * They have no other structure. See RFC 2822 sec 3.4.1 and 3.6.4. 447 */ 448 static void 449 decode_domain_literal(char **obuf, char *oend, const char **ibuf, const char *iend) 450 { 451 const char *p, *pend; 452 char *q, *qend; 453 454 qend = oend; 455 pend = iend; 456 p = *ibuf; 457 q = *obuf; 458 while (p < pend && q < qend) { 459 if (*p == ']') { 460 *q++ = *p++; /* copy the closing ']' */ 461 break; 462 } 463 if (*p == '\\' && p + 1 < pend) { /* quoted-pair */ 464 if (p[1] == '[' || p[1] == ']' || p[1] == '\\') { 465 *q++ = *p; 466 if (q >= qend) 467 break; 468 } 469 p++; 470 } 471 *q++ = *p++; 472 } 473 *ibuf = p; 474 *obuf = q; 475 } 476 477 /* 478 * Specials: see RFC 2822 sec 3.2.1. 479 */ 480 static inline int 481 is_specials(int c) 482 { 483 static const char specialtab[] = { 484 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 485 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 486 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 487 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 488 489 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 490 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 491 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 492 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 493 }; 494 return !(c & ~0x7f) ? specialtab[c] : 0; 495 } 496 497 /* 498 * Decode a structured field. 499 * 500 * At the top level, structured fields can only contain encoded-words 501 * via 'phrases' and 'comments'. See RFC 2047 sec 5. 502 */ 503 static void 504 mime_decode_sfield(char *linebuf, size_t bufsize, const char *hstring) 505 { 506 const char *p, *pend, *p0; 507 char *q, *qend; 508 const char *charset; 509 int lastc; 510 511 charset = value(ENAME_MIME_CHARSET); 512 513 p = hstring; 514 q = linebuf; 515 pend = hstring + strlen(hstring); 516 qend = linebuf + bufsize - 1; /* save room for the NULL terminator */ 517 lastc = (unsigned char)' '; 518 p0 = NULL; 519 while (p < pend && q < qend) { 520 const char *p1; 521 char *q1; 522 523 if (*p != '=') { 524 copy_skipped_FWS(&q, qend, &p0, p); 525 if (q >= qend) 526 break; 527 } 528 529 switch (*p) { 530 case '(': /* start of comment */ 531 *q++ = *p++; /* copy the opening '(' */ 532 (void)decode_comment(&q, qend, &p, pend, charset); 533 lastc = (unsigned char)p[-1]; 534 break; 535 536 case '"': /* start of quoted-string or no-fold-quote */ 537 *q++ = *p++; /* copy the opening '"' */ 538 decode_quoted_string(&q, qend, &p, pend); 539 lastc = (unsigned char)p[-1]; 540 break; 541 542 case '[': /* start of domain-literal or no-fold-literal */ 543 *q++ = *p++; /* copy the opening '[' */ 544 decode_domain_literal(&q, qend, &p, pend); 545 lastc = (unsigned char)p[-1]; 546 break; 547 548 case '\\': /* start of quoted-pair */ 549 if (p + 1 < pend) { /* quoted pair */ 550 if (is_specials(p[1])) { 551 *q++ = *p; 552 if (q >= qend) 553 break; 554 } 555 p++; /* skip the '\\' */ 556 } 557 goto copy_char; 558 559 case '=': 560 /* 561 * At this level encoded words can appear via 562 * 'phrases' (possibly delimited by ',' as in 563 * 'keywords'). Thus we handle them as such. 564 * Hopefully this is sufficient. 565 */ 566 if ((lastc == ',' || is_FWS(lastc)) && p[1] == '?' && 567 decode_word((p1 = p, &p1), (q1 = q, &q1), qend, charset) == 0 && 568 (*p1 == '\0' || *p1 == ',' || is_FWS(*p1))) { 569 lastc = (unsigned char)*p1; 570 p0 = p1; 571 q = q1; 572 p = skip_FWS(p1); 573 /* 574 * XXX - this check should be 575 * unnecessary as *pend should be '\0' 576 * which will stop skip_FWS() 577 */ 578 if (p > pend) 579 p = pend; 580 break; 581 } 582 else { 583 copy_skipped_FWS(&q, qend, &p0, p); 584 if (q >= qend) 585 break; 586 goto copy_char; 587 } 588 589 case '<': /* start of angle-addr, msg-id, or path. */ 590 /* 591 * A msg-id cannot contain encoded-pairs or 592 * encoded-words, but angle-addr and path can. 593 * Distinguishing between them seems to be 594 * unnecessary, so let's be loose and just 595 * decode them as if they were all the same. 596 */ 597 default: 598 copy_char: 599 lastc = (unsigned char)*p; 600 *q++ = *p++; 601 break; 602 } 603 } 604 copy_skipped_FWS(&q, qend, &p0, p); 605 *q = '\0'; /* null terminate the result! */ 606 } 607 608 609 /* 610 * Returns the correct hfield decoder, or NULL if none. 611 * Info extracted from RFC 2822. 612 */ 613 PUBLIC hfield_decoder_t 614 mime_hfield_decoder(char *name) 615 { 616 static const struct field_decoder_tbl_s { 617 const char *field_name; 618 hfield_decoder_t decoder; 619 } field_decoder_tbl[] = { 620 { "Received:", NULL }, 621 { "Content-Type:", NULL }, 622 { "Content-Disposition:", NULL }, 623 { "Content-Transfer-Encoding:", NULL }, 624 { "Content-Description:", mime_decode_sfield }, 625 { "Content-ID:", mime_decode_sfield }, 626 { "MIME-Version:", mime_decode_sfield }, 627 { "Bcc:", mime_decode_sfield }, 628 { "Cc:", mime_decode_sfield }, 629 { "Date:", mime_decode_sfield }, 630 { "From:", mime_decode_sfield }, 631 { "In-Reply-To:", mime_decode_sfield }, 632 { "Keywords:", mime_decode_sfield }, 633 { "Message-ID:", mime_decode_sfield }, 634 { "References:", mime_decode_sfield }, 635 { "Reply-To:", mime_decode_sfield }, 636 { "Return-Path:", mime_decode_sfield }, 637 { "Sender:", mime_decode_sfield }, 638 { "To:", mime_decode_sfield }, 639 { "Subject:", mime_decode_usfield }, 640 { "Comments:", mime_decode_usfield }, 641 { "X-", mime_decode_usfield }, 642 { NULL, mime_decode_usfield }, /* optional-fields */ 643 }; 644 const struct field_decoder_tbl_s *fp; 645 646 /* XXX - this begs for a hash table! */ 647 for (fp = field_decoder_tbl; fp->field_name; fp++) 648 if (strncasecmp(name, fp->field_name, strlen(fp->field_name)) == 0) 649 return fp->decoder; 650 return fp->decoder; 651 } 652 653 #endif /* MIME_SUPPORT */ 654