src/global/header_token.c

/*	$NetBSD$	*/

/*++
/* NAME
/*	header_token 3
/* SUMMARY
/*	mail header parser
/* SYNOPSIS
/*	#include <header_token.h>
/*
/*	typedef struct {
/* .in +4
/*	    int     type;
/*	    const char *u.value;
/*	    /* ... */
/* .in
/*	} HEADER_TOKEN;
/*
/*	ssize_t	header_token(token, token_len, token_buffer, ptr,
/*				specials, terminator)
/*	HEADER_TOKEN *token;
/*	ssize_t	token_len;
/*	VSTRING *token_buffer;
/*	const char **ptr;
/*	const char *specials;
/*	int	terminator;
/* DESCRIPTION
/*	This module parses a mail header value (text after field-name:)
/*	into tokens. The parser understands RFC 822 linear white space,
/*	quoted-string, comment, control characters, and a set of
/*	user-specified special characters.
/*
/*	A result token type is one of the following:
/* .IP HEADER_TOK_QSTRING
/*	Quoted string as per RFC 822.
/* .IP HEADER_TOK_TOKEN
/*	Token as per RFC 822, and the special characters supplied by the
/*	caller.
/* .IP other
/*	The value of a control character or special character.
/* .PP
/*	header_token() tokenizes the input and stops after a user-specified
/*	terminator (ignoring all tokens that exceed the capacity of
/*	the result storage), or when it runs out of space for the result.
/*	The terminator is not stored. The result value is the number of
/*	tokens stored, or -1 when the input was exhausted before any tokens
/*	were found.
/*
/*	Arguments:
/* .IP token
/*	Result array of HEADER_TOKEN structures. Token string values
/*	are pointers to null-terminated substrings in the token_buffer.
/* .IP token_len
/*	Length of the array of HEADER_TOKEN structures.
/* .IP token_buffer
/*	Storage for result token string values.
/* .IP ptr
/*	Input/output read position. The input is a null-terminated string.
/* .IP specials
/*	Special characters according to the relevant RFC, or a
/*	null pointer (default to the RFC 822 special characters).
/*	This must include the optional terminator if one is specified.
/* .IP terminator
/*	The special character to stop after, or zero.
/* BUGS
/*	Eight-bit characters are not given special treatment.
/* SEE ALSO
/*	RFC 822 (ARPA Internet Text Messages)
/* DIAGNOSTICS
/*	Fatal errors: memory allocation problem.
/* LICENSE
/* .ad
/* .fi
/*	The Secure Mailer license must be distributed with this software.
/* AUTHOR(S)
/*	Wietse Venema
/*	IBM T.J. Watson Research
/*	P.O. Box 704
/*	Yorktown Heights, NY 10598, USA
/*--*/

/* System library. */

#include <sys_defs.h>
#include <string.h>
#include <ctype.h>

/* Utility library. */

#include <msg.h>
#include <vstring.h>

/* Global library. */

#include <lex_822.h>
#include <header_token.h>

/* Application-specific. */

 /*
  * Silly little macros.
  * 
  * STR() and LEN() are shorthands for vstring_str() and VSTRING_LEN().
  * CU_CHAR_PTR() casts a pointer to (const unsigned char *). The parser
  * below reads its input through such a pointer, so that the characters it
  * examines never have a negative value.
  */
#define STR(x)	vstring_str(x)
#define LEN(x)	VSTRING_LEN(x)
#define CU_CHAR_PTR(x)	((const unsigned char *) (x))

/* header_token - split the next part of a message header into tokens */

ssize_t header_token(HEADER_TOKEN *token, ssize_t token_len,
		             VSTRING *token_buffer, const char **ptr,
		             const char *user_specials, int user_terminator)
{
    const unsigned char *cp;
    ssize_t tok_count;
    ssize_t depth;
    ssize_t keep;
    ssize_t i;
    int     ch;
    int     storing;
    int     special;

    /*
     * Start with an empty result buffer, and fall back to the RFC 822
     * special characters when the caller does not supply a set.
     */
    VSTRING_RESET(token_buffer);
    cp = CU_CHAR_PTR(*ptr);
    tok_count = 0;
    if (user_specials == 0)
	user_specials = LEX_822_SPECIALS;

    /*
     * Main parsing loop. Each iteration consumes one character, and then
     * whatever else belongs to the same lexical item. Without a terminator
     * the loop stops as soon as the result array is full; with a terminator
     * it keeps parsing (without storing) until the terminator or the end of
     * the input.
     * 
     * XXX What was the reason to continue parsing when user_terminator is
     * specified? Perhaps this was needed at some intermediate stage of
     * development?
     */
    for (;;) {
	ch = *cp;
	if (ch == 0)
	    break;
	if (user_terminator == 0 && tok_count >= token_len)
	    break;
	cp++;

	/*
	 * RFC 822 linear white space separates items and is never stored.
	 */
	if (IS_SPACE_TAB_CR_LF(ch))
	    continue;

	/*
	 * The terminator is consumed, but it is not stored.
	 */
	if (ch == user_terminator)
	    break;

	/*
	 * RFC 822 comment. Skip up to the matching close parenthesis, or up
	 * to the end of the input, whichever comes first.
	 */
	if (ch == '(') {
	    depth = 1;
	    while (depth > 0 && *cp != 0) {
		ch = *cp++;
		switch (ch) {
		case '(':			/* comments can nest! */
		    depth++;
		    break;
		case ')':
		    depth--;
		    break;
		case '\\':			/* skip the quoted character */
		    if (*cp != 0)
			cp++;
		    break;
		default:
		    break;
		}
	    }
	    continue;
	}

	/*
	 * Anything else produces a token. A token that does not fit in the
	 * result array is still parsed, but nothing is stored for it. The
	 * token count does not change until a token is complete, so this
	 * needs to be decided only once per token.
	 */
	storing = (tok_count < token_len);

	/*
	 * RFC 822 quoted string. Store the text without the enclosing
	 * quotes and without the backslashes that quote a character.
	 */
	if (ch == '"') {
	    if (storing) {
		token[tok_count].u.offset = LEN(token_buffer);
		token[tok_count].type = HEADER_TOK_QSTRING;
	    }
	    for (;;) {
		ch = *cp;
		if (ch == 0)
		    break;
		cp++;
		if (ch == '"')
		    break;

		/*
		 * Unfold: drop the newline, together with the white space
		 * that was stored just before it.
		 */
		if (ch == '\n') {
		    if (storing) {
			keep = LEN(token_buffer);
			while (keep > 0) {
			    if (!IS_SPACE_TAB_CR_LF(STR(token_buffer)[keep - 1]))
				break;
			    keep--;
			}
			if (keep < LEN(token_buffer))
			    vstring_truncate(token_buffer, keep);
		    }
		    continue;
		}

		/*
		 * A backslash is replaced by the character that follows it.
		 */
		if (ch == '\\') {
		    ch = *cp;
		    if (ch == 0)
			break;
		    cp++;
		}
		if (storing)
		    VSTRING_ADDCH(token_buffer, ch);
	    }
	    if (storing) {
		VSTRING_ADDCH(token_buffer, 0);
		tok_count++;
	    }
	    continue;
	}

	/*
	 * A control or special character is a one-character token that has
	 * the character itself as its type. Any other character starts an
	 * RFC 822 token that extends up to the next white space, control or
	 * special character, or the end of the input.
	 */
	special = (strchr(user_specials, ch) != 0 || ISCNTRL(ch));
	if (storing) {
	    token[tok_count].u.offset = LEN(token_buffer);
	    token[tok_count].type = (special ? ch : HEADER_TOK_TOKEN);
	    VSTRING_ADDCH(token_buffer, ch);
	}
	if (!special) {
	    for (;;) {
		ch = *cp;
		if (ch == 0 || IS_SPACE_TAB_CR_LF(ch) || ISCNTRL(ch)
		    || strchr(user_specials, ch) != 0)
		    break;
		cp++;
		if (storing)
		    VSTRING_ADDCH(token_buffer, ch);
	    }
	}
	if (storing) {
	    VSTRING_ADDCH(token_buffer, 0);
	    tok_count++;
	}
    }

    /*
     * The input ran out before anything was stored. Report this, and leave
     * the caller's read position alone.
     */
    if (ch == 0 && tok_count == 0)
	return (-1);

    /*
     * Finalize. The token buffer may have been reallocated while it was
     * growing, which is why only offsets were recorded so far. Update the
     * read position, and convert the offsets into string pointers.
     */
    *ptr = (const char *) cp;
    for (i = 0; i < tok_count; i++)
	token[i].u.value = STR(token_buffer) + token[i].u.offset;

    if (msg_verbose)
	msg_info("header_token: %s %s %s",
		 tok_count > 0 ? token[0].u.value : "",
		 tok_count > 1 ? token[1].u.value : "",
		 tok_count > 2 ? token[2].u.value : "");

    return (tok_count);
}