/* $NetBSD: mime_header.c,v 1.8 2009/04/10 13:08:25 christos Exp $ */

/*-
 * Copyright (c) 2006 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Anon Ymous.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */


/*
 * This module contains the core MIME header decoding routines.
 * Please refer to RFC 2047 and RFC 2822.
 */

#ifdef MIME_SUPPORT

#include <sys/cdefs.h>
#ifndef __lint__
__RCSID("$NetBSD: mime_header.c,v 1.8 2009/04/10 13:08:25 christos Exp $");
#endif /* not __lint__ */

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "def.h"
#include "extern.h"
#include "mime.h"
#include "mime_header.h"
#include "mime_codecs.h"

/*
 * Copy the charset name of an encoded-word into from_cs.
 *
 * p points just past the leading "=?".  Characters are copied up to,
 * but not including, the next '?' and the copy is NUL-terminated.
 *
 * Returns a pointer to the character following that '?', or NULL if
 * the string ends before a '?' is seen or the name (plus its NUL) does
 * not fit in from_cs_len bytes.
 */
static const char *
grab_charset(char *from_cs, size_t from_cs_len, const char *p)
{
	char *q;
	char *qlast;

	/*
	 * A zero-sized buffer cannot even hold the NUL; bail out before
	 * forming a pointer in front of the buffer.
	 */
	if (from_cs_len == 0)
		return NULL;

	qlast = from_cs + from_cs_len - 1;	/* slot reserved for the NUL */
	for (q = from_cs; *p != '?'; p++) {
		if (*p == '\0' || q >= qlast)
			return NULL;
		*q++ = *p;
	}
	*q = '\0';
	return p + 1;	/* if here, then we got the '?' */
}

/*
 * An encoded word is a string of at most 75 non-white space
 * characters of the following form:
 *
 *  =?charset?X?encoding?=
 *
 * where:
 *   'charset'	is the original character set of the unencoded string.
 *
 *   'X'	is the encoding type 'B' or 'Q' for "base64" or
 *              "quoted-printable", respectively,
 *   'encoding'	is the encoded string.
 *
 * Both 'charset' and 'X' are case independent and 'encoding' cannot
 * contain any whitespace or '?' characters.  The 'encoding' must also
 * be fully contained within the encoded words, i.e., it cannot be
 * split between encoded words.
 *
 * Note: the 'Q' encoding is a slightly modified "quoted-printable"
 * encoding.  In particular, spaces (' ') may be encoded as '_' to
 * improve undecoded readability.
 *
 * decode_word() decodes the single encoded word starting at *ibuf into
 * the output area [*obuf, oend).
 *
 *   ibuf   in/out: on success advanced past the closing "?=".
 *   obuf   in/out: on success advanced to the NUL written after the
 *          decoded text.
 *   oend   one past the last usable output byte.
 *   to_cs  target charset for iconv conversion, or NULL for none.  It
 *          is only honoured when built with CHARSET_SUPPORT.
 *
 * Returns 0 on success and -1 on any failure; *ibuf and *obuf are left
 * untouched on failure (the output area itself may have been written).
 */
static int
decode_word(const char **ibuf, char **obuf, char *oend, const char *to_cs)
{
	ssize_t declen;
	size_t enclen, dstlen;
	char decword[LINESIZE];
	char from_cs[LINESIZE];
	const char *encword, *iend, *p;
	char *dstend;
	char enctype;

#ifndef CHARSET_SUPPORT
	/*
	 * Without iconv there is no conversion pass to move the text out
	 * of decword[], so decode straight into the caller's buffer.
	 * Otherwise *obuf would be left pointing into this stack frame.
	 */
	to_cs = NULL;
#endif

	p = *ibuf;
	/* Must start with "=?"; '||' also avoids reading p[1] of "". */
	if (p[0] != '=' || p[1] != '?')
		return -1;
	/* "=?" + charset(>=1) + "?X?" + encoding(>=1) + "?=" */
	if (strlen(p) < 2 + 1 + 3 + 1 + 2)
		return -1;
	p = grab_charset(from_cs, sizeof(from_cs), p + 2);
	if (p == NULL)
		return -1;
	enctype = *p++;
	/*
	 * If the string ended right after the charset, enctype is the
	 * terminating NUL and p is already past it; do not look further.
	 */
	if (enctype == '\0' || *p++ != '?')
		return -1;
	encword = p;
	p = strchr(p, '?');
	if (p == NULL || p[1] != '=')
		return -1;
	enclen = p - encword;	/* length of encoded substring */
	iend = p + 2;
	/* encoded words are at most 75 characters (RFC 2047, sec 2) */
	if (iend > *ibuf + 75)
		return -1;

	if (oend < *obuf + 1) {
		assert(/*CONSTCOND*/ 0);	/* We have a coding error! */
		return -1;
	}
	/* With a target charset, decode to scratch space and convert later. */
	dstend = to_cs ? decword : *obuf;
	dstlen = (to_cs ? sizeof(decword) : (size_t)(oend - *obuf)) - 1;

	declen = mime_rfc2047_decode(enctype, dstend, dstlen, encword, enclen);
	if (declen == -1)
		return -1;

	dstend += declen;
#ifdef CHARSET_SUPPORT
	if (to_cs != NULL) {
		iconv_t cd;
		const char *src;
		size_t srclen;
		size_t cnt;

		cd = iconv_open(to_cs, from_cs);
		if (cd == (iconv_t)-1)
			return -1;

		src = decword;
		srclen = declen;
		dstend = *obuf;
		dstlen = oend - *obuf - 1;
		cnt = mime_iconv(cd, &src, &srclen, &dstend, &dstlen);

		(void)iconv_close(cd);
		if (cnt == (size_t)-1)
			return -1;
	}
#endif /* CHARSET_SUPPORT */
	*dstend = '\0';
	*ibuf = iend;
	*obuf = dstend;
	return 0;
}


/*
 * Folding White Space.  See RFC 2822.
 *
 * Note: RFC 2822 specifies that '\n' and '\r' only occur as CRLF
 * pairs (i.e., "\r\n") and never separately.  However, by the time
 * mail(1) sees the messages, all CRLF pairs have been converted to
 * '\n' characters.
 *
 * XXX - pull is_FWS() and skip_FWS() up to def.h?
 */

/* Return non-zero if c is a space, a tab or a newline. */
static inline int
is_FWS(int c)
{
	return c == ' ' || c == '\t' || c == '\n';
}

/* Return a pointer to the first character at or after p that is not FWS. */
static inline const char *
skip_FWS(const char *p)
{
	while (is_FWS(*p))
		p++;
	return p;
}

/*
 * Flush white space that was provisionally skipped after an encoded
 * word.
 *
 * If *src is non-NULL, copy the bytes [*src, srcend) to *dst, never
 * writing at or beyond dstend, advance *dst past what was written and
 * reset *src to NULL.  If *src is NULL nothing happens.
 */
static inline void
copy_skipped_FWS(char **dst, char *dstend, const char **src, const char *srcend)
{
	const char *p, *pend;
	char *q, *qend;

	p = *src;
	q = *dst;
	pend = srcend;
	qend = dstend;

	if (p) {  /* copy any skipped linear-white-space */
		while (p < pend && q < qend)
			*q++ = *p++;
		*dst = q;
		*src = NULL;
	}
}

/*
 * Decode an unstructured field.
 *
 * See RFC 2822 Sec 2.2.1 and 3.6.5.
 * Encoded words may occur anywhere in unstructured fields provided
 * they are separated from any other text or encoded words by at least
 * one linear-white-space character. (See RFC 2047 sec 5.1.)  If two
 * encoded words occur sequentially (separated by only FWS) then the
 * separating FWS is removed.
 *
 * NOTE: unstructured fields cannot contain 'quoted-pairs' (see
 * RFC2822 sec 3.2.6 and RFC 2047), but that is no problem as a '\\'
 * (or any non-whitespace character) immediately before an
 * encoded-word will prevent it from being decoded.
 *
 * hstring should be a NUL-terminated string.
 * outbuf should be sufficiently large to hold the result; at most
 * outsize bytes (including the terminating NUL) are written.
 */
static void
mime_decode_usfield(char *outbuf, size_t outsize, const char *hstring)
{
	const char *p, *p0;
	char *q, *qend;
	int lastc;
	const char *charset;

	/* No room for even the NUL: writing anything would overflow. */
	if (outsize == 0)
		return;

	charset = value(ENAME_MIME_CHARSET);
	qend = outbuf + outsize - 1; /* Make sure there is room for the trailing NUL! */
	q = outbuf;
	p = hstring;
	p0 = NULL;	/* start of FWS skipped after the last encoded word */
	lastc = (unsigned char)' ';
	while (*p && q < qend) {
		/* Scratch cursors; committed only if the word is accepted. */
		const char *p1 = p;
		char *q1 = q;

		if (is_FWS(lastc) && p[0] == '=' && p[1] == '?' &&
		    decode_word(&p1, &q1, qend, charset) == 0 &&
		    (*p1 == '\0' || is_FWS(*p1))) {
			p0 = p1;  /* pointer to first character after encoded word */
			q = q1;
			p = skip_FWS(p1);
			lastc = (unsigned char)*p0;
		}
		else {
			/* Ordinary text: the skipped FWS is significant after all. */
			copy_skipped_FWS(&q, qend, &p0, p);
			lastc = (unsigned char)*p;
			if (q < qend)
				*q++ = *p++;
		}
	}
	copy_skipped_FWS(&q, qend, &p0, p);
	*q = '\0';
}

/*
 * Decode a field comment.
 *
 * Comments only occur in structured fields, can be nested (rfc 2822,
 * sec 3.2.3), and can contain 'encoded-words' and 'quoted-pairs'.
 * Otherwise, they can be regarded as unstructured fields that are
 * bounded by '(' and ')' characters.
 *
 * On entry *ibuf points just past the opening '(' (which the caller
 * has already copied).  On return *ibuf and *obuf are advanced past
 * what was consumed and produced, including the closing ')' if it was
 * reached.  Input is read from [*ibuf, iend) and output is confined to
 * [*obuf, oend).
 *
 * Returns 0.  The -1 path only propagates a failure from a nested
 * call, and no path in this function currently originates one.
 */
static int
decode_comment(char **obuf, char *oend, const char **ibuf, const char *iend, const char *charset)
{
	const char *p, *pend, *p0;
	char *q, *qend;
	int lastc;

	p = *ibuf;
	q = *obuf;
	pend = iend;
	qend = oend;
	lastc = ' ';
	p0 = NULL;
	while (p < pend && q < qend) {
		/* Scratch cursors; committed only if the word is accepted. */
		const char *p1 = p;
		char *q1 = q;

		if (is_FWS(lastc) && p[0] == '=' && p[1] == '?' &&
		    decode_word(&p1, &q1, qend, charset) == 0 &&
		    (*p1 == ')' || is_FWS(*p1))) {
			lastc = (unsigned char)*p1;
			p0 = p1;
			q = q1;
			p = skip_FWS(p1);
			/*
			 * XXX - this check should be unnecessary as *pend should
			 * be '\0' which will stop skip_FWS()
			 */
			if (p > pend)
				p = pend;
		}
		else {
			copy_skipped_FWS(&q, qend, &p0, p);
			if (q >= qend)	/* XXX - q > qend cannot happen */
				break;

			if (*p == ')') {
				*q++ = *p++;	/* copy the closing ')' */
				break;		/* and get out of here! */
			}

			if (*p == '(') {
				*q++ = *p++;	/* copy the opening '(' */
				if (decode_comment(&q, qend, &p, pend, charset) == -1)
					return -1;	/* is this right or should we update? */
				lastc = ')';
			}
			else if (*p == '\\' && p + 1 < pend) {	/* quoted-pair */
				/* keep the '\\' only where it is still needed */
				if (p[1] == '(' || p[1] == ')' || p[1] == '\\') /* need quoted-pair*/
					*q++ = *p;
				p++;
				lastc = (unsigned char)*p;
				if (q < qend)
					*q++ = *p++;
			}
			else {
				lastc = (unsigned char)*p;
				*q++ = *p++;
			}
		}
	}
	*ibuf = p;
	*obuf = q;
	return 0;
}

/*
 * Decode a quoted-string or no-fold-quote.
 *
 * These cannot contain encoded words.  They can contain quoted-pairs,
 * making '\\' special.  They have no other structure.  See RFC 2822
 * sec 3.2.5 and 3.6.4.
 *
 * On entry *ibuf points just past the opening '"'.  Text is copied up
 * to and including the closing '"'; a '\\' is dropped unless it
 * escapes '"' or '\\'.  *ibuf and *obuf are advanced accordingly and
 * output never reaches oend.
 */
static void
decode_quoted_string(char **obuf, char *oend, const char **ibuf, const char *iend)
{
	const char *p, *pend;
	char *q, *qend;

	qend = oend;
	pend = iend;
	p = *ibuf;
	q = *obuf;
	while (p < pend && q < qend) {
		if (*p == '"') {
			*q++ = *p++;	/* copy the closing '"' */
			break;
		}
		if (*p == '\\' && p + 1 < pend) { /* quoted-pair */
			if (p[1] == '"' || p[1] == '\\') {
				*q++ = *p;
				if (q >= qend)
					break;
			}
			p++;
		}
		*q++ = *p++;
	}
	*ibuf = p;
	*obuf = q;
}

/*
 * Decode a domain-literal or no-fold-literal.
 *
 * These cannot contain encoded words.  They can have quoted pairs and
 * are delimited by '[' and ']' making '\\', '[', and ']' special.
 * They have no other structure.  See RFC 2822 sec 3.4.1 and 3.6.4.
 *
 * On entry *ibuf points just past the opening '['.  Text is copied up
 * to and including the closing ']'; a '\\' is dropped unless it
 * escapes '[', ']' or '\\'.  *ibuf and *obuf are advanced accordingly
 * and output never reaches oend.
 */
static void
decode_domain_literal(char **obuf, char *oend, const char **ibuf, const char *iend)
{
	const char *p, *pend;
	char *q, *qend;

	qend = oend;
	pend = iend;
	p = *ibuf;
	q = *obuf;
	while (p < pend && q < qend) {
		if (*p == ']') {
			*q++ = *p++;	/* copy the closing ']' */
			break;
		}
		if (*p == '\\' && p + 1 < pend) { /* quoted-pair */
			if (p[1] == '[' || p[1] == ']' || p[1] == '\\') {
				*q++ = *p;
				if (q >= qend)
					break;
			}
			p++;
		}
		*q++ = *p++;
	}
	*ibuf = p;
	*obuf = q;
}

/*
 * Specials: see RFC 2822 sec 3.2.1.
 *
 * Returns 1 if c is one of  " ( ) , . : ; < > @ [ \ ]  and 0 for every
 * other value, including anything outside 0..127.
 */
static inline int
is_specials(int c)
{
	static const char specialtab[] = {
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
		0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0,
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0,

		1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	};
	/* Only index the table when no bit above 0x7f is set. */
	return !(c & ~0x7f) ? specialtab[c] : 0;
}

/*
 * Decode a structured field.
 *
 * At the top level, structured fields can only contain encoded-words
 * via 'phrases' and 'comments'.  See RFC 2047 sec 5.
 *
 * hstring should be a NUL-terminated string.  At most bufsize bytes
 * (including the terminating NUL) are written to linebuf.
 */
static void
mime_decode_sfield(char *linebuf, size_t bufsize, const char *hstring)
{
	const char *p, *pend, *p0;
	char *q, *qend;
	const char *charset;
	int lastc;

	/* No room for even the NUL: writing anything would overflow. */
	if (bufsize == 0)
		return;

	charset = value(ENAME_MIME_CHARSET);

	p = hstring;
	q = linebuf;
	pend = hstring + strlen(hstring);
	qend = linebuf + bufsize - 1; /* save room for the NUL terminator */
	lastc = (unsigned char)' ';
	p0 = NULL;	/* start of FWS skipped after the last encoded word */
	while (p < pend && q < qend) {
		const char *p1;
		char *q1;

		/*
		 * Anything but a possible encoded word makes the skipped
		 * FWS significant; for '=' the decision is made below.
		 */
		if (*p != '=') {
			copy_skipped_FWS(&q, qend, &p0, p);
			if (q >= qend)
				break;
		}

		switch (*p) {
		case '(':	/* start of comment */
			*q++ = *p++;	/* copy the opening '(' */
			(void)decode_comment(&q, qend, &p, pend, charset);
			lastc = (unsigned char)p[-1];
			break;

		case '"':	/* start of quoted-string or no-fold-quote */
			*q++ = *p++;	/* copy the opening '"' */
			decode_quoted_string(&q, qend, &p, pend);
			lastc = (unsigned char)p[-1];
			break;

		case '[':	/* start of domain-literal or no-fold-literal */
			*q++ = *p++;	/* copy the opening '[' */
			decode_domain_literal(&q, qend, &p, pend);
			lastc = (unsigned char)p[-1];
			break;

		case '\\':	/* start of quoted-pair */
			if (p + 1 < pend) {		/* quoted pair */
				if (is_specials(p[1])) {
					*q++ = *p;
					if (q >= qend)
						break;	/* leaves the switch; loop then ends */
				}
				p++;	/* skip the '\\' */
			}
			goto copy_char;

		case '=':
			/*
			 * At this level encoded words can appear via
			 * 'phrases' (possibly delimited by ',' as in
			 * 'keywords').  Thus we handle them as such.
			 * Hopefully this is sufficient.
			 */
			/* Scratch cursors; committed only if the word is accepted. */
			p1 = p;
			q1 = q;
			if ((lastc == ',' || is_FWS(lastc)) && p[1] == '?' &&
			    decode_word(&p1, &q1, qend, charset) == 0 &&
			    (*p1 == '\0' || *p1 == ',' || is_FWS(*p1))) {
				lastc = (unsigned char)*p1;
				p0 = p1;
				q = q1;
				p = skip_FWS(p1);
				/*
				 * XXX - this check should be
				 * unnecessary as *pend should be '\0'
				 * which will stop skip_FWS()
				 */
				if (p > pend)
					p = pend;
				break;
			}
			else {
				copy_skipped_FWS(&q, qend, &p0, p);
				if (q >= qend)
					break;
				goto copy_char;
			}

		case '<':	/* start of angle-addr, msg-id, or path. */
			/*
			 * A msg-id cannot contain quoted-pairs or
			 * encoded-words, but angle-addr and path can.
			 * Distinguishing between them seems to be
			 * unnecessary, so let's be loose and just
			 * decode them as if they were all the same.
			 */
		default:
	copy_char:
			lastc = (unsigned char)*p;
			*q++ = *p++;
			break;
		}
	}
	copy_skipped_FWS(&q, qend, &p0, p);
	*q = '\0';	/* NUL-terminate the result! */
}

/*
 * Returns the correct hfield decoder, or NULL if none.
 * Info extracted from RFC 2822.
 *
 * name - pointer to field name of header line (with colon).
 *
 * The name is compared case-insensitively against each table prefix in
 * order and the first match wins.  Entries whose decoder is NULL
 * ("Received:", "Content-Type:", ...) yield NULL; a name that matches
 * nothing falls through to the sentinel entry and is treated as an
 * unstructured optional-field.
 */
PUBLIC hfield_decoder_t
mime_hfield_decoder(const char *name)
{
	static const struct field_decoder_tbl_s {
		const char *field_name;
		size_t field_len;
		hfield_decoder_t decoder;
	} field_decoder_tbl[] = {
#define X(s)	s, sizeof(s) - 1
		{ X("Received:"),			NULL },

		{ X("Content-Type:"),			NULL },
		{ X("Content-Disposition:"),		NULL },
		{ X("Content-Transfer-Encoding:"),	NULL },
		{ X("Content-Description:"),		mime_decode_sfield },
		{ X("Content-ID:"),			mime_decode_sfield },
		{ X("MIME-Version:"),			mime_decode_sfield },

		{ X("Bcc:"),				mime_decode_sfield },
		{ X("Cc:"),				mime_decode_sfield },
		{ X("Date:"),				mime_decode_sfield },
		{ X("From:"),				mime_decode_sfield },
		{ X("In-Reply-To:"),			mime_decode_sfield },
		{ X("Keywords:"),			mime_decode_sfield },
		{ X("Message-ID:"),			mime_decode_sfield },
		{ X("References:"),			mime_decode_sfield },
		{ X("Reply-To:"),			mime_decode_sfield },
		{ X("Return-Path:"),			mime_decode_sfield },
		{ X("Sender:"),				mime_decode_sfield },
		{ X("To:"),				mime_decode_sfield },
		{ X("Subject:"),			mime_decode_usfield },
		{ X("Comments:"),			mime_decode_usfield },
		{ X("X-"),				mime_decode_usfield },
		{ NULL, 0,				mime_decode_usfield },	/* optional-fields */
#undef X
	};
	const struct field_decoder_tbl_s *fp;

	/* XXX - this begs for a hash table! */
	for (fp = field_decoder_tbl; fp->field_name; fp++)
		if (strncasecmp(name, fp->field_name, fp->field_len) == 0)
			break;
	/* fp is either the matching entry or the NULL-named sentinel. */
	return fp->decoder;
}

#endif /* MIME_SUPPORT */