usr.bin/mail/mime_header.c

/*	$NetBSD: mime_header.c,v 1.8 2009/04/10 13:08:25 christos Exp $	*/

/*-
 * Copyright (c) 2006 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Anon Ymous.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */


/*
 * This module contains the core MIME header decoding routines.
 * Please refer to RFC 2047 and RFC 2822.
 */

#ifdef MIME_SUPPORT

#include <sys/cdefs.h>
#ifndef __lint__
__RCSID("$NetBSD: mime_header.c,v 1.8 2009/04/10 13:08:25 christos Exp $");
#endif /* not __lint__ */

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "def.h"
#include "extern.h"
#include "mime.h"
#include "mime_header.h"
#include "mime_codecs.h"

static const char *
grab_charset(char *from_cs, size_t from_cs_len, const char *p)
{
	char *q;
	q = from_cs;
	for (/*EMPTY*/; *p != '?'; p++) {
		if (*p == '\0' || q >= from_cs + from_cs_len - 1)
			return NULL;
		*q++ = *p;
	}
	*q = '\0';
	return ++p;	/* if here, then we got the '?' */
}

/*
 * An encoded word is a string of at most 75 non-white space
 * characters of the following form:
 *
 *  =?charset?X?encoding?=
 *
 * where:
 *   'charset'	is the original character set of the unencoded string.
 *
 *   'X'	is the encoding type 'B' or 'Q' for "base64" or
 *              "quoted-printable", respectively,
 *   'encoding'	is the encoded string.
 *
 * Both 'charset' and 'X' are case independent and 'encoding' cannot
 * contain any whitespace or '?' characters.  The 'encoding' must also
 * be fully contained within the encoded words, i.e., it cannot be
 * split between encoded words.
 *
 * Note: the 'B' encoding is a slightly modified "quoted-printable"
 * encoding.  In particular, spaces (' ') may be encoded as '_' to
 * improve undecoded readability.
 */
static int
decode_word(const char **ibuf, char **obuf, char *oend, const char *to_cs)
{
	ssize_t declen;
	size_t enclen, dstlen;
	char decword[LINESIZE];
	char from_cs[LINESIZE];
	const char *encword, *iend, *p;
	char *dstend;
	char enctype;

	p = *ibuf;
	if (p[0] != '=' && p[1] != '?')
		return -1;
	if (strlen(p) <  2 + 1 + 3 + 1 + 2)
		return -1;
	p = grab_charset(from_cs, sizeof(from_cs), p + 2);
	if (p == NULL)
		return -1;
	enctype = *p++;
	if (*p++ != '?')
		return -1;
	encword = p;
	p = strchr(p, '?');
	if (p == NULL || p[1] != '=')
		return -1;
	enclen = p - encword;	/* length of encoded substring */
	iend = p + 2;
	/* encoded words are at most 75 characters (RFC 2047, sec 2) */
	if (iend > *ibuf + 75)
		return -1;

	if (oend < *obuf + 1) {
		assert(/*CONSTCOND*/ 0);	/* We have a coding error! */
		return -1;
	}
	dstend = to_cs ? decword : *obuf;
	dstlen = (to_cs ? sizeof(decword) : (size_t)(oend - *obuf)) - 1;

	declen = mime_rfc2047_decode(enctype, dstend, dstlen, encword, enclen);
	if (declen == -1)
		return -1;

	dstend += declen;
#ifdef CHARSET_SUPPORT
	if (to_cs != NULL) {
		iconv_t cd;
		const char *src;
		size_t srclen;
		size_t cnt;

		cd = iconv_open(to_cs, from_cs);
		if (cd == (iconv_t)-1)
			return -1;

		src = decword;
		srclen = declen;
		dstend = *obuf;
		dstlen = oend - *obuf - 1;
		cnt = mime_iconv(cd, &src, &srclen, &dstend, &dstlen);

		(void)iconv_close(cd);
		if (cnt == (size_t)-1)
			return -1;
	}
#endif /* CHARSET_SUPPORT */
	*dstend = '\0';
	*ibuf = iend;
	*obuf = dstend;
	return 0;
}


/*
 * Folding White Space.  See RFC 2822.
 *
 * Note: RFC 2822 specifies that '\n' and '\r' only occur as CRLF
 * pairs (i.e., "\r\n") and never separately.  However, by the time
 * mail(1) sees the messages, all CRLF pairs have been converted to
 * '\n' characters.
 *
 * XXX - pull is_FWS() and skip_FWS() up to def.h?
 */
static inline int
is_FWS(int c)
{
	return c == ' ' || c == '\t' || c == '\n';
}

static inline const char *
skip_FWS(const char *p)
{
	while (is_FWS(*p))
		p++;
	return p;
}

static inline void
copy_skipped_FWS(char **dst, char *dstend, const char **src, const char *srcend)
{
	const char *p, *pend;
	char *q, *qend;

	p = *src;
	q = *dst;
	pend = srcend;
	qend = dstend;

	if (p) {  /* copy any skipped linear-white-space */
		while (p < pend && q < qend)
			*q++ = *p++;
		*dst = q;
		*src = NULL;
	}
}

/*
 * Decode an unstructured field.
 *
 * See RFC 2822 Sec 2.2.1 and 3.6.5.
 * Encoded words may occur anywhere in unstructured fields provided
 * they are separated from any other text or encoded words by at least
 * one linear-white-space character. (See RFC 2047 sec 5.1.)  If two
 * encoded words occur sequentially (separated by only FWS) then the
 * separating FWS is removed.
 *
 * NOTE: unstructured fields cannot contain 'quoted-pairs' (see
 * RFC2822 sec 3.2.6 and RFC 2047), but that is no problem as a '\\'
 * (or any non-whitespace character) immediately before an
 * encoded-word will prevent it from being decoded.
 *
 * hstring should be a NULL terminated string.
 * outbuf should be sufficiently large to hold the result.
 */
static void
mime_decode_usfield(char *outbuf, size_t outsize, const char *hstring)
{
	const char *p, *p0;
	char *q, *qend;
	int lastc;
	const char *charset;

	charset = value(ENAME_MIME_CHARSET);
	qend = outbuf + outsize - 1; /* Make sure there is room for the trailing NULL! */
	q = outbuf;
	p = hstring;
	p0 = NULL;
	lastc = (unsigned char)' ';
	while (*p && q < qend) {
		const char *p1;
		char *q1;
		if (is_FWS(lastc) && p[0] == '=' && p[1] == '?' &&
		    decode_word((p1 = p, &p1), (q1 = q, &q1), qend, charset) == 0 &&
		    (*p1 == '\0' || is_FWS(*p1))) {
			p0 = p1;  /* pointer to first character after encoded word */
			q = q1;
			p = skip_FWS(p1);
			lastc = (unsigned char)*p0;
		}
		else {
			copy_skipped_FWS(&q, qend, &p0, p);
			lastc = (unsigned char)*p;
			if (q < qend)
				*q++ = *p++;
		}
	}
	copy_skipped_FWS(&q, qend, &p0, p);
	*q = '\0';
}

/*
 * Decode a field comment.
 *
 * Comments only occur in structured fields, can be nested (rfc 2822,
 * sec 3.2.3), and can contain 'encoded-words' and 'quoted-pairs'.
 * Otherwise, they can be regarded as unstructured fields that are
 * bounded by '(' and ')' characters.
 */
static int
decode_comment(char **obuf, char *oend, const char **ibuf, const char *iend, const char *charset)
{
	const char *p, *pend, *p0;
	char *q, *qend;
	int lastc;

	p = *ibuf;
	q = *obuf;
	pend = iend;
	qend = oend;
	lastc = ' ';
	p0 = NULL;
	while (p < pend && q < qend) {
		const char *p1;
		char *q1;

		if (is_FWS(lastc) && p[0] == '=' && p[1] == '?' &&
		    decode_word((p1 = p, &p1), (q1 = q, &q1), qend, charset) == 0 &&
		    (*p1 == ')' || is_FWS(*p1))) {
			lastc = (unsigned char)*p1;
			p0 = p1;
			q = q1;
			p = skip_FWS(p1);
			/*
			 * XXX - this check should be unnecessary as *pend should
			 * be '\0' which will stop skip_FWS()
			 */
			if (p > pend)
				p = pend;
		}
		else {
			copy_skipped_FWS(&q, qend, &p0, p);
			if (q >= qend)	/* XXX - q > qend cannot happen */
				break;

			if (*p == ')') {
				*q++ = *p++;	/* copy the closing ')' */
				break;		/* and get out of here! */
			}

			if (*p == '(') {
				*q++ = *p++;	/* copy the opening '(' */
				if (decode_comment(&q, qend, &p, pend, charset) == -1)
					return -1;	/* is this right or should we update? */
				lastc = ')';
			}
			else if (*p == '\\' && p + 1 < pend) {	/* quoted-pair */
				if (p[1] == '(' || p[1] == ')' || p[1] == '\\') /* need quoted-pair*/
					*q++ = *p;
				p++;
				lastc = (unsigned char)*p;
				if (q < qend)
					*q++ = *p++;
			}
			else {
				lastc = (unsigned char)*p;
				*q++ = *p++;
			}
		}
	}
	*ibuf = p;
	*obuf = q;
	return 0;
}

/*
 * Decode a quoted-string or no-fold-quote.
 *
 * These cannot contain encoded words.  They can contain quoted-pairs,
 * making '\\' special.  They have no other structure.  See RFC 2822
 * sec 3.2.5 and 3.6.4.
 */
static void
decode_quoted_string(char **obuf, char *oend, const char **ibuf, const char *iend)
{
	const char *p, *pend;
	char *q, *qend;

	qend = oend;
	pend = iend;
	p = *ibuf;
	q = *obuf;
	while (p < pend && q < qend) {
		if (*p == '"') {
			*q++ = *p++;	/* copy the closing '"' */
			break;
		}
		if (*p == '\\' && p + 1 < pend) { /* quoted-pair */
			if (p[1] == '"' || p[1] == '\\') {
				*q++ = *p;
				if (q >= qend)
					break;
			}
			p++;
		}
		*q++ = *p++;
	}
	*ibuf = p;
	*obuf = q;
}

/*
 * Decode a domain-literal or no-fold-literal.
 *
 * These cannot contain encoded words.  They can have quoted pairs and
 * are delimited by '[' and ']' making '\\', '[', and ']' special.
 * They have no other structure.  See RFC 2822 sec 3.4.1 and 3.6.4.
 */
static void
decode_domain_literal(char **obuf, char *oend, const char **ibuf, const char *iend)
{
	const char *p, *pend;
	char *q, *qend;

	qend = oend;
	pend = iend;
	p = *ibuf;
	q = *obuf;
	while (p < pend && q < qend) {
		if (*p == ']') {
			*q++ = *p++;	/* copy the closing ']' */
			break;
		}
		if (*p == '\\' && p + 1 < pend) { /* quoted-pair */
			if (p[1] == '[' || p[1] == ']' || p[1] == '\\') {
				*q++ = *p;
				if (q >= qend)
					break;
			}
			p++;
		}
		*q++ = *p++;
	}
	*ibuf = p;
	*obuf = q;
}

/*
 * Specials: see RFC 2822 sec 3.2.1.
 */
static inline int
is_specials(int c)
{
	static const char specialtab[] = {
		0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
		0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
		0, 0, 1, 0,  0, 0, 0, 0,  1, 1, 0, 0,  1, 0, 1, 0,
		0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 1, 1,  1, 0, 1, 0,

		1, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
		0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 1,  1, 1, 0, 0,
		0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
		0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
	};
	return !(c & ~0x7f) ? specialtab[c] : 0;
}

/*
 * Decode a structured field.
 *
 * At the top level, structured fields can only contain encoded-words
 * via 'phrases' and 'comments'.  See RFC 2047 sec 5.
 */
static void
mime_decode_sfield(char *linebuf, size_t bufsize, const char *hstring)
{
	const char *p, *pend, *p0;
	char *q, *qend;
	const char *charset;
	int lastc;

	charset = value(ENAME_MIME_CHARSET);

	p = hstring;
	q = linebuf;
	pend = hstring + strlen(hstring);
	qend = linebuf + bufsize - 1;	/* save room for the NULL terminator */
	lastc = (unsigned char)' ';
	p0 = NULL;
	while (p < pend && q < qend) {
		const char *p1;
		char *q1;

		if (*p != '=') {
			copy_skipped_FWS(&q, qend, &p0, p);
			if (q >= qend)
				break;
		}

		switch (*p) {
		case '(':	/* start of comment */
			*q++ = *p++;	/* copy the opening '(' */
			(void)decode_comment(&q, qend, &p, pend, charset);
			lastc = (unsigned char)p[-1];
			break;

		case '"':	/* start of quoted-string or no-fold-quote */
			*q++ = *p++;	/* copy the opening '"' */
			decode_quoted_string(&q, qend, &p, pend);
			lastc = (unsigned char)p[-1];
			break;

		case '[':	/* start of domain-literal or no-fold-literal */
			*q++ = *p++;	/* copy the opening '[' */
			decode_domain_literal(&q, qend, &p, pend);
			lastc = (unsigned char)p[-1];
			break;

		case '\\':	/* start of quoted-pair */
			if (p + 1 < pend) {		/* quoted pair */
				if (is_specials(p[1])) {
					*q++ = *p;
					if (q >= qend)
						break;
				}
				p++;	/* skip the '\\' */
			}
			goto copy_char;

		case '=':
			/*
			 * At this level encoded words can appear via
			 * 'phrases' (possibly delimited by ',' as in
			 * 'keywords').  Thus we handle them as such.
			 * Hopefully this is sufficient.
			 */
			if ((lastc == ',' || is_FWS(lastc)) && p[1] == '?' &&
			    decode_word((p1 = p, &p1), (q1 = q, &q1), qend, charset) == 0 &&
			    (*p1 == '\0' || *p1 == ',' || is_FWS(*p1))) {
				lastc = (unsigned char)*p1;
				p0 = p1;
				q = q1;
				p = skip_FWS(p1);
				/*
				 * XXX - this check should be
				 * unnecessary as *pend should be '\0'
				 * which will stop skip_FWS()
				 */
				if (p > pend)
					p = pend;
				break;
			}
			else {
				copy_skipped_FWS(&q, qend, &p0, p);
				if (q >= qend)
					break;
				goto copy_char;
			}

		case '<':	/* start of angle-addr, msg-id, or path. */
			/*
			 * A msg-id cannot contain encoded-pairs or
			 * encoded-words, but angle-addr and path can.
			 * Distinguishing between them seems to be
			 * unnecessary, so let's be loose and just
			 * decode them as if they were all the same.
			 */
		default:
	copy_char:
			lastc = (unsigned char)*p;
			*q++ = *p++;
			break;
		}
	}
	copy_skipped_FWS(&q, qend, &p0, p);
	*q = '\0';	/* null terminate the result! */
}

/*
 * Returns the correct hfield decoder, or NULL if none.
 * Info extracted from RFC 2822.
 *
 * name - pointer to field name of header line (with colon).
 */
PUBLIC hfield_decoder_t
mime_hfield_decoder(const char *name)
{
	static const struct field_decoder_tbl_s {
		const char *field_name;
		size_t field_len;
		hfield_decoder_t decoder;
	} field_decoder_tbl[] = {
#define X(s)	s, sizeof(s) - 1
		{ X("Received:"),			NULL },

		{ X("Content-Type:"),			NULL },
		{ X("Content-Disposition:"),		NULL },
		{ X("Content-Transfer-Encoding:"),	NULL },
		{ X("Content-Description:"),		mime_decode_sfield },
		{ X("Content-ID:"),			mime_decode_sfield },
		{ X("MIME-Version:"),			mime_decode_sfield },

		{ X("Bcc:"),				mime_decode_sfield },
		{ X("Cc:"),				mime_decode_sfield },
		{ X("Date:"),				mime_decode_sfield },
		{ X("From:"),				mime_decode_sfield },
		{ X("In-Reply-To:"),			mime_decode_sfield },
		{ X("Keywords:"),			mime_decode_sfield },
		{ X("Message-ID:"),			mime_decode_sfield },
		{ X("References:"),			mime_decode_sfield },
		{ X("Reply-To:"),			mime_decode_sfield },
		{ X("Return-Path:"),			mime_decode_sfield },
		{ X("Sender:"),				mime_decode_sfield },
		{ X("To:"),				mime_decode_sfield },
		{ X("Subject:"),			mime_decode_usfield },
		{ X("Comments:"),			mime_decode_usfield },
		{ X("X-"),				mime_decode_usfield },
		{ NULL, 0,				mime_decode_usfield },	/* optional-fields */
#undef X
	};
	const struct field_decoder_tbl_s *fp;

	/* XXX - this begs for a hash table! */
	for (fp = field_decoder_tbl; fp->field_name; fp++)
		if (strncasecmp(name, fp->field_name, fp->field_len) == 0)
			break;
	return fp->decoder;
}

#endif /* MIME_SUPPORT */