1/*	$NetBSD: mime_header.c,v 1.8 2009/04/10 13:08:25 christos Exp $	*/
2
3/*-
4 * Copyright (c) 2006 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Anon Ymous.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32
33/*
34 * This module contains the core MIME header decoding routines.
35 * Please refer to RFC 2047 and RFC 2822.
36 */
37
38#ifdef MIME_SUPPORT
39
40#include <sys/cdefs.h>
41#ifndef __lint__
42__RCSID("$NetBSD: mime_header.c,v 1.8 2009/04/10 13:08:25 christos Exp $");
43#endif /* not __lint__ */
44
45#include <assert.h>
46#include <stdio.h>
47#include <stdlib.h>
48#include <string.h>
49
50#include "def.h"
51#include "extern.h"
52#include "mime.h"
53#include "mime_header.h"
54#include "mime_codecs.h"
55
/*
 * Copy the charset name of an encoded-word into from_cs.
 *
 * p is expected to point at the first character of the charset name
 * (the caller passes the position just after the leading "=?").
 * Characters are copied up to, but not including, the next '?' and
 * the result is NUL-terminated.
 *
 * Returns a pointer to the character following that '?', or NULL if
 * the input ends before a '?' is found or the name together with its
 * terminator does not fit in from_cs_len bytes.
 */
static const char *
grab_charset(char *from_cs, size_t from_cs_len, const char *p)
{
	size_t n;

	/* A zero-length buffer cannot even hold the terminator. */
	if (from_cs_len == 0)
		return NULL;

	for (n = 0; *p != '?'; p++) {
		/* Always keep one byte in reserve for the trailing NUL. */
		if (*p == '\0' || n >= from_cs_len - 1)
			return NULL;
		from_cs[n++] = *p;
	}
	from_cs[n] = '\0';
	return p + 1;	/* if here, then we got the '?' */
}
69
70/*
71 * An encoded word is a string of at most 75 non-white space
72 * characters of the following form:
73 *
74 *  =?charset?X?encoding?=
75 *
76 * where:
77 *   'charset'	is the original character set of the unencoded string.
78 *
79 *   'X'	is the encoding type 'B' or 'Q' for "base64" or
80 *              "quoted-printable", respectively,
81 *   'encoding'	is the encoded string.
82 *
83 * Both 'charset' and 'X' are case independent and 'encoding' cannot
84 * contain any whitespace or '?' characters.  The 'encoding' must also
85 * be fully contained within the encoded words, i.e., it cannot be
86 * split between encoded words.
87 *
88 * Note: the 'B' encoding is a slightly modified "quoted-printable"
89 * encoding.  In particular, spaces (' ') may be encoded as '_' to
90 * improve undecoded readability.
91 */
92static int
93decode_word(const char **ibuf, char **obuf, char *oend, const char *to_cs)
94{
95	ssize_t declen;
96	size_t enclen, dstlen;
97	char decword[LINESIZE];
98	char from_cs[LINESIZE];
99	const char *encword, *iend, *p;
100	char *dstend;
101	char enctype;
102
103	p = *ibuf;
104	if (p[0] != '=' && p[1] != '?')
105		return -1;
106	if (strlen(p) <  2 + 1 + 3 + 1 + 2)
107		return -1;
108	p = grab_charset(from_cs, sizeof(from_cs), p + 2);
109	if (p == NULL)
110		return -1;
111	enctype = *p++;
112	if (*p++ != '?')
113		return -1;
114	encword = p;
115	p = strchr(p, '?');
116	if (p == NULL || p[1] != '=')
117		return -1;
118	enclen = p - encword;	/* length of encoded substring */
119	iend = p + 2;
120	/* encoded words are at most 75 characters (RFC 2047, sec 2) */
121	if (iend > *ibuf + 75)
122		return -1;
123
124	if (oend < *obuf + 1) {
125		assert(/*CONSTCOND*/ 0);	/* We have a coding error! */
126		return -1;
127	}
128	dstend = to_cs ? decword : *obuf;
129	dstlen = (to_cs ? sizeof(decword) : (size_t)(oend - *obuf)) - 1;
130
131	declen = mime_rfc2047_decode(enctype, dstend, dstlen, encword, enclen);
132	if (declen == -1)
133		return -1;
134
135	dstend += declen;
136#ifdef CHARSET_SUPPORT
137	if (to_cs != NULL) {
138		iconv_t cd;
139		const char *src;
140		size_t srclen;
141		size_t cnt;
142
143		cd = iconv_open(to_cs, from_cs);
144		if (cd == (iconv_t)-1)
145			return -1;
146
147		src = decword;
148		srclen = declen;
149		dstend = *obuf;
150		dstlen = oend - *obuf - 1;
151		cnt = mime_iconv(cd, &src, &srclen, &dstend, &dstlen);
152
153		(void)iconv_close(cd);
154		if (cnt == (size_t)-1)
155			return -1;
156	}
157#endif /* CHARSET_SUPPORT */
158	*dstend = '\0';
159	*ibuf = iend;
160	*obuf = dstend;
161	return 0;
162}
163
164
165/*
166 * Folding White Space.  See RFC 2822.
167 *
168 * Note: RFC 2822 specifies that '\n' and '\r' only occur as CRLF
169 * pairs (i.e., "\r\n") and never separately.  However, by the time
170 * mail(1) sees the messages, all CRLF pairs have been converted to
171 * '\n' characters.
172 *
173 * XXX - pull is_FWS() and skip_FWS() up to def.h?
174 */
/*
 * Return 1 if c is a folding-white-space character (space, tab or
 * newline), 0 otherwise.
 */
static inline int
is_FWS(int c)
{
	switch (c) {
	case ' ':
	case '\t':
	case '\n':
		return 1;
	default:
		return 0;
	}
}
180
/*
 * Return a pointer to the first character at or after p that is not
 * folding white space.  The terminating NUL is not white space, so
 * the scan never runs past the end of a NUL-terminated string.
 */
static inline const char *
skip_FWS(const char *p)
{
	for (/*EMPTY*/; is_FWS(*p); p++)
		continue;
	return p;
}
188
/*
 * Flush white space that was skipped after an encoded word.
 *
 * If *src is non-NULL, the bytes in [*src, srcend) are copied to *dst,
 * stopping early when dstend is reached.  *dst is advanced past the
 * copied bytes and *src is reset to NULL to mark the white space as
 * handled.  If *src is already NULL nothing happens.
 */
static inline void
copy_skipped_FWS(char **dst, char *dstend, const char **src, const char *srcend)
{
	const char *from = *src;
	char *to;

	if (from == NULL)
		return;		/* nothing pending */

	for (to = *dst; from < srcend && to < dstend; from++, to++)
		*to = *from;

	*dst = to;
	*src = NULL;
}
207
208/*
209 * Decode an unstructured field.
210 *
211 * See RFC 2822 Sec 2.2.1 and 3.6.5.
212 * Encoded words may occur anywhere in unstructured fields provided
213 * they are separated from any other text or encoded words by at least
214 * one linear-white-space character. (See RFC 2047 sec 5.1.)  If two
215 * encoded words occur sequentially (separated by only FWS) then the
216 * separating FWS is removed.
217 *
218 * NOTE: unstructured fields cannot contain 'quoted-pairs' (see
219 * RFC2822 sec 3.2.6 and RFC 2047), but that is no problem as a '\\'
220 * (or any non-whitespace character) immediately before an
221 * encoded-word will prevent it from being decoded.
222 *
223 * hstring should be a NULL terminated string.
224 * outbuf should be sufficiently large to hold the result.
225 */
226static void
227mime_decode_usfield(char *outbuf, size_t outsize, const char *hstring)
228{
229	const char *p, *p0;
230	char *q, *qend;
231	int lastc;
232	const char *charset;
233
234	charset = value(ENAME_MIME_CHARSET);
235	qend = outbuf + outsize - 1; /* Make sure there is room for the trailing NULL! */
236	q = outbuf;
237	p = hstring;
238	p0 = NULL;
239	lastc = (unsigned char)' ';
240	while (*p && q < qend) {
241		const char *p1;
242		char *q1;
243		if (is_FWS(lastc) && p[0] == '=' && p[1] == '?' &&
244		    decode_word((p1 = p, &p1), (q1 = q, &q1), qend, charset) == 0 &&
245		    (*p1 == '\0' || is_FWS(*p1))) {
246			p0 = p1;  /* pointer to first character after encoded word */
247			q = q1;
248			p = skip_FWS(p1);
249			lastc = (unsigned char)*p0;
250		}
251		else {
252			copy_skipped_FWS(&q, qend, &p0, p);
253			lastc = (unsigned char)*p;
254			if (q < qend)
255				*q++ = *p++;
256		}
257	}
258	copy_skipped_FWS(&q, qend, &p0, p);
259	*q = '\0';
260}
261
/*
 * Decode a field comment.
 *
 * Comments only occur in structured fields, can be nested (rfc 2822,
 * sec 3.2.3), and can contain 'encoded-words' and 'quoted-pairs'.
 * Otherwise, they can be regarded as unstructured fields that are
 * bounded by '(' and ')' characters.
 *
 * On entry *ibuf points just past the opening '(' (which the caller
 * has already copied).  Decoding stops after the matching ')' has
 * been copied, or when either the input (iend) or the output (oend)
 * is exhausted.  *ibuf and *obuf are advanced accordingly.
 *
 * Returns 0; -1 is only propagated from a nested call.
 */
static int
decode_comment(char **obuf, char *oend, const char **ibuf, const char *iend, const char *charset)
{
	const char *in = *ibuf;
	char *out = *obuf;
	/* Start of FWS skipped after an encoded word, or NULL if none. */
	const char *held_ws = NULL;
	int prev = ' ';

	while (in < iend && out < oend) {
		if (is_FWS(prev) && in[0] == '=' && in[1] == '?') {
			const char *word_end = in;
			char *word_out = out;

			/*
			 * Inside a comment an encoded word must be followed
			 * by white space or the closing ')'.
			 */
			if (decode_word(&word_end, &word_out, oend, charset) == 0 &&
			    (*word_end == ')' || is_FWS(*word_end))) {
				prev = (unsigned char)*word_end;
				held_ws = word_end;
				out = word_out;
				in = skip_FWS(word_end);
				/*
				 * XXX - this check should be unnecessary as *iend
				 * should be '\0' which will stop skip_FWS()
				 */
				if (in > iend)
					in = iend;
				continue;
			}
		}

		copy_skipped_FWS(&out, oend, &held_ws, in);
		if (out >= oend)	/* XXX - out > oend cannot happen */
			break;

		if (*in == ')') {
			*out++ = *in++;	/* copy the closing ')' */
			break;		/* and get out of here! */
		}

		if (*in == '(') {
			*out++ = *in++;	/* copy the opening '(' */
			if (decode_comment(&out, oend, &in, iend, charset) == -1)
				return -1;	/* is this right or should we update? */
			prev = ')';
			continue;
		}

		if (*in == '\\' && in + 1 < iend) {	/* quoted-pair */
			/* keep the backslash only where it is still needed */
			if (in[1] == '(' || in[1] == ')' || in[1] == '\\')
				*out++ = *in;
			in++;
			prev = (unsigned char)*in;
			if (out < oend)
				*out++ = *in++;
			continue;
		}

		prev = (unsigned char)*in;
		*out++ = *in++;
	}
	*ibuf = in;
	*obuf = out;
	return 0;
}
335
/*
 * Decode a quoted-string or no-fold-quote.
 *
 * These cannot contain encoded words.  They can contain quoted-pairs,
 * making '\\' special.  They have no other structure.  See RFC 2822
 * sec 3.2.5 and 3.6.4.
 *
 * On entry *ibuf points just past the opening '"'.  Input is copied
 * up to and including the closing '"', or until iend or oend is
 * reached.  A backslash is kept only in front of '"' or '\\'; in any
 * other quoted-pair it is dropped.  *ibuf and *obuf are advanced past
 * what was consumed and produced.
 */
static void
decode_quoted_string(char **obuf, char *oend, const char **ibuf, const char *iend)
{
	const char *in = *ibuf;
	char *out = *obuf;

	while (in < iend && out < oend) {
		const char c = *in;

		if (c == '"') {
			*out++ = *in++;	/* copy the closing '"' */
			break;
		}
		if (c == '\\' && in + 1 < iend) { /* quoted-pair */
			const char quoted = in[1];

			if (quoted == '"' || quoted == '\\') {
				*out++ = c;
				if (out >= oend)
					break;
			}
			in++;	/* step onto the quoted character */
		}
		*out++ = *in++;
	}
	*ibuf = in;
	*obuf = out;
}
371
/*
 * Decode a domain-literal or no-fold-literal.
 *
 * These cannot contain encoded words.  They can have quoted pairs and
 * are delimited by '[' and ']' making '\\', '[', and ']' special.
 * They have no other structure.  See RFC 2822 sec 3.4.1 and 3.6.4.
 *
 * On entry *ibuf points just past the opening '['.  Input is copied
 * up to and including the closing ']', or until iend or oend is
 * reached.  A backslash is kept only in front of '[', ']' or '\\';
 * in any other quoted-pair it is dropped.  *ibuf and *obuf are
 * advanced past what was consumed and produced.
 */
static void
decode_domain_literal(char **obuf, char *oend, const char **ibuf, const char *iend)
{
	const char *in = *ibuf;
	char *out = *obuf;

	while (in < iend && out < oend) {
		const char c = *in;

		if (c == ']') {
			*out++ = *in++;	/* copy the closing ']' */
			break;
		}
		if (c == '\\' && in + 1 < iend) { /* quoted-pair */
			const char quoted = in[1];

			if (quoted == '[' || quoted == ']' || quoted == '\\') {
				*out++ = c;
				if (out >= oend)
					break;
			}
			in++;	/* step onto the quoted character */
		}
		*out++ = *in++;
	}
	*ibuf = in;
	*obuf = out;
}
407
/*
 * Specials: see RFC 2822 sec 3.2.1.
 *
 * Returns 1 if c is one of
 *	( ) < > [ ] : ; @ \ , . "
 * and 0 for every other value, including anything outside 0..127.
 */
static inline int
is_specials(int c)
{
	switch (c) {
	case '(': case ')':
	case '<': case '>':
	case '[': case ']':
	case ':': case ';':
	case '@': case '\\':
	case ',': case '.':
	case '"':
		return 1;
	default:
		return 0;
	}
}
427
428/*
429 * Decode a structured field.
430 *
431 * At the top level, structured fields can only contain encoded-words
432 * via 'phrases' and 'comments'.  See RFC 2047 sec 5.
433 */
434static void
435mime_decode_sfield(char *linebuf, size_t bufsize, const char *hstring)
436{
437	const char *p, *pend, *p0;
438	char *q, *qend;
439	const char *charset;
440	int lastc;
441
442	charset = value(ENAME_MIME_CHARSET);
443
444	p = hstring;
445	q = linebuf;
446	pend = hstring + strlen(hstring);
447	qend = linebuf + bufsize - 1;	/* save room for the NULL terminator */
448	lastc = (unsigned char)' ';
449	p0 = NULL;
450	while (p < pend && q < qend) {
451		const char *p1;
452		char *q1;
453
454		if (*p != '=') {
455			copy_skipped_FWS(&q, qend, &p0, p);
456			if (q >= qend)
457				break;
458		}
459
460		switch (*p) {
461		case '(':	/* start of comment */
462			*q++ = *p++;	/* copy the opening '(' */
463			(void)decode_comment(&q, qend, &p, pend, charset);
464			lastc = (unsigned char)p[-1];
465			break;
466
467		case '"':	/* start of quoted-string or no-fold-quote */
468			*q++ = *p++;	/* copy the opening '"' */
469			decode_quoted_string(&q, qend, &p, pend);
470			lastc = (unsigned char)p[-1];
471			break;
472
473		case '[':	/* start of domain-literal or no-fold-literal */
474			*q++ = *p++;	/* copy the opening '[' */
475			decode_domain_literal(&q, qend, &p, pend);
476			lastc = (unsigned char)p[-1];
477			break;
478
479		case '\\':	/* start of quoted-pair */
480			if (p + 1 < pend) {		/* quoted pair */
481				if (is_specials(p[1])) {
482					*q++ = *p;
483					if (q >= qend)
484						break;
485				}
486				p++;	/* skip the '\\' */
487			}
488			goto copy_char;
489
490		case '=':
491			/*
492			 * At this level encoded words can appear via
493			 * 'phrases' (possibly delimited by ',' as in
494			 * 'keywords').  Thus we handle them as such.
495			 * Hopefully this is sufficient.
496			 */
497			if ((lastc == ',' || is_FWS(lastc)) && p[1] == '?' &&
498			    decode_word((p1 = p, &p1), (q1 = q, &q1), qend, charset) == 0 &&
499			    (*p1 == '\0' || *p1 == ',' || is_FWS(*p1))) {
500				lastc = (unsigned char)*p1;
501				p0 = p1;
502				q = q1;
503				p = skip_FWS(p1);
504				/*
505				 * XXX - this check should be
506				 * unnecessary as *pend should be '\0'
507				 * which will stop skip_FWS()
508				 */
509				if (p > pend)
510					p = pend;
511				break;
512			}
513			else {
514				copy_skipped_FWS(&q, qend, &p0, p);
515				if (q >= qend)
516					break;
517				goto copy_char;
518			}
519
520		case '<':	/* start of angle-addr, msg-id, or path. */
521			/*
522			 * A msg-id cannot contain encoded-pairs or
523			 * encoded-words, but angle-addr and path can.
524			 * Distinguishing between them seems to be
525			 * unnecessary, so let's be loose and just
526			 * decode them as if they were all the same.
527			 */
528		default:
529	copy_char:
530			lastc = (unsigned char)*p;
531			*q++ = *p++;
532			break;
533		}
534	}
535	copy_skipped_FWS(&q, qend, &p0, p);
536	*q = '\0';	/* null terminate the result! */
537}
538
539/*
540 * Returns the correct hfield decoder, or NULL if none.
541 * Info extracted from RFC 2822.
542 *
543 * name - pointer to field name of header line (with colon).
544 */
545PUBLIC hfield_decoder_t
546mime_hfield_decoder(const char *name)
547{
548	static const struct field_decoder_tbl_s {
549		const char *field_name;
550		size_t field_len;
551		hfield_decoder_t decoder;
552	} field_decoder_tbl[] = {
553#define X(s)	s, sizeof(s) - 1
554		{ X("Received:"),			NULL },
555
556		{ X("Content-Type:"),			NULL },
557		{ X("Content-Disposition:"),		NULL },
558		{ X("Content-Transfer-Encoding:"),	NULL },
559		{ X("Content-Description:"),		mime_decode_sfield },
560		{ X("Content-ID:"),			mime_decode_sfield },
561		{ X("MIME-Version:"),			mime_decode_sfield },
562
563		{ X("Bcc:"),				mime_decode_sfield },
564		{ X("Cc:"),				mime_decode_sfield },
565		{ X("Date:"),				mime_decode_sfield },
566		{ X("From:"),				mime_decode_sfield },
567		{ X("In-Reply-To:"),			mime_decode_sfield },
568		{ X("Keywords:"),			mime_decode_sfield },
569		{ X("Message-ID:"),			mime_decode_sfield },
570		{ X("References:"),			mime_decode_sfield },
571		{ X("Reply-To:"),			mime_decode_sfield },
572		{ X("Return-Path:"),			mime_decode_sfield },
573		{ X("Sender:"),				mime_decode_sfield },
574		{ X("To:"),				mime_decode_sfield },
575		{ X("Subject:"),			mime_decode_usfield },
576		{ X("Comments:"),			mime_decode_usfield },
577		{ X("X-"),				mime_decode_usfield },
578		{ NULL, 0,				mime_decode_usfield },	/* optional-fields */
579#undef X
580	};
581	const struct field_decoder_tbl_s *fp;
582
583	/* XXX - this begs for a hash table! */
584	for (fp = field_decoder_tbl; fp->field_name; fp++)
585		if (strncasecmp(name, fp->field_name, fp->field_len) == 0)
586			break;
587	return fp->decoder;
588}
589
590#endif /* MIME_SUPPORT */
591