1/*	$NetBSD$	*/
2
3/*++
4/* NAME
5/*	header_token 3
6/* SUMMARY
7/*	mail header parser
8/* SYNOPSIS
9/*	#include <header_token.h>
10/*
11/*	typedef struct {
12/* .in +4
13/*	    int     type;
14/*	    const char *u.value;
15/*	    /* ... */
16/* .in
17/*	} HEADER_TOKEN;
18/*
19/*	ssize_t	header_token(token, token_len, token_buffer, ptr,
20/*				specials, terminator)
21/*	HEADER_TOKEN *token;
22/*	ssize_t	token_len;
23/*	VSTRING *token_buffer;
24/*	const char **ptr;
25/*	const char *specials;
26/*	int	terminator;
27/* DESCRIPTION
28/*	This module parses a mail header value (text after field-name:)
29/*	into tokens. The parser understands RFC 822 linear white space,
30/*	quoted-string, comment, control characters, and a set of
31/*	user-specified special characters.
32/*
33/*	A result token type is one of the following:
34/* .IP HEADER_TOK_QSTRING
35/*	Quoted string as per RFC 822.
36/* .IP HEADER_TOK_TOKEN
37/*	Token as per RFC 822, and the special characters supplied by the
38/*	caller.
39/* .IP other
40/*	The value of a control character or special character.
41/* .PP
42/*	header_token() tokenizes the input and stops after a user-specified
43/*	terminator (ignoring all tokens that exceed the capacity of
44/*	the result storage), or when it runs out of space for the result.
45/*	The terminator is not stored. The result value is the number of
46/*	tokens stored, or -1 when the input was exhausted before any tokens
47/*	were found.
48/*
49/*	Arguments:
50/* .IP token
51/*	Result array of HEADER_TOKEN structures. Token string values
52/*	are pointers to null-terminated substrings in the token_buffer.
53/* .IP token_len
54/*	Length of the array of HEADER_TOKEN structures.
55/* .IP token_buffer
56/*	Storage for result token string values.
57/* .IP ptr
58/*	Input/output read position. The input is a null-terminated string.
59/* .IP specials
60/*	Special characters according to the relevant RFC, or a
61/*	null pointer (default to the RFC 822 special characters).
62/*	This must include the optional terminator if one is specified.
63/* .IP terminator
64/*	The special character to stop after, or zero.
65/* BUGS
66/*	Eight-bit characters are not given special treatment.
67/* SEE ALSO
68/*	RFC 822 (ARPA Internet Text Messages)
69/* DIAGNOSTICS
70/*	Fatal errors: memory allocation problem.
71/* LICENSE
72/* .ad
73/* .fi
74/*	The Secure Mailer license must be distributed with this software.
75/* AUTHOR(S)
76/*	Wietse Venema
77/*	IBM T.J. Watson Research
78/*	P.O. Box 704
79/*	Yorktown Heights, NY 10598, USA
80/*--*/
81
82/* System library. */
83
84#include <sys_defs.h>
85#include <string.h>
86#include <ctype.h>
87
88/* Utility library. */
89
90#include <msg.h>
91#include <vstring.h>
92
93/* Global library. */
94
95#include <lex_822.h>
96#include <header_token.h>
97
98/* Application-specific. */
99
100 /*
101  * Silly little macros.
102  */
103#define STR(x)	vstring_str(x)
104#define LEN(x)	VSTRING_LEN(x)
105#define CU_CHAR_PTR(x)	((const unsigned char *) (x))
106
107/* header_token - parse out the next item in a message header */
108
109ssize_t header_token(HEADER_TOKEN *token, ssize_t token_len,
110		             VSTRING *token_buffer, const char **ptr,
111		             const char *user_specials, int user_terminator)
112{
113    ssize_t comment_level;
114    const unsigned char *cp;
115    ssize_t len;
116    int     ch;
117    ssize_t tok_count;
118    ssize_t n;
119
120    /*
121     * Initialize.
122     */
123    VSTRING_RESET(token_buffer);
124    cp = CU_CHAR_PTR(*ptr);
125    tok_count = 0;
126    if (user_specials == 0)
127	user_specials = LEX_822_SPECIALS;
128
129    /*
130     * Main parsing loop.
131     *
132     * XXX What was the reason to continue parsing when user_terminator is
133     * specified? Perhaps this was needed at some intermediate stage of
134     * development?
135     */
136    while ((ch = *cp) != 0 && (user_terminator != 0 || tok_count < token_len)) {
137	cp++;
138
139	/*
140	 * Skip RFC 822 linear white space.
141	 */
142	if (IS_SPACE_TAB_CR_LF(ch))
143	    continue;
144
145	/*
146	 * Terminator.
147	 */
148	if (ch == user_terminator)
149	    break;
150
151	/*
152	 * Skip RFC 822 comment.
153	 */
154	if (ch == '(') {
155	    comment_level = 1;
156	    while ((ch = *cp) != 0) {
157		cp++;
158		if (ch == '(') {		/* comments can nest! */
159		    comment_level++;
160		} else if (ch == ')') {
161		    if (--comment_level == 0)
162			break;
163		} else if (ch == '\\') {
164		    if ((ch = *cp) == 0)
165			break;
166		    cp++;
167		}
168	    }
169	    continue;
170	}
171
172	/*
173	 * Copy quoted text according to RFC 822.
174	 */
175	if (ch == '"') {
176	    if (tok_count < token_len) {
177		token[tok_count].u.offset = LEN(token_buffer);
178		token[tok_count].type = HEADER_TOK_QSTRING;
179	    }
180	    while ((ch = *cp) != 0) {
181		cp++;
182		if (ch == '"')
183		    break;
184		if (ch == '\n') {		/* unfold */
185		    if (tok_count < token_len) {
186			len = LEN(token_buffer);
187			while (len > 0
188			  && IS_SPACE_TAB_CR_LF(STR(token_buffer)[len - 1]))
189			    len--;
190			if (len < LEN(token_buffer))
191			    vstring_truncate(token_buffer, len);
192		    }
193		    continue;
194		}
195		if (ch == '\\') {
196		    if ((ch = *cp) == 0)
197			break;
198		    cp++;
199		}
200		if (tok_count < token_len)
201		    VSTRING_ADDCH(token_buffer, ch);
202	    }
203	    if (tok_count < token_len) {
204		VSTRING_ADDCH(token_buffer, 0);
205		tok_count++;
206	    }
207	    continue;
208	}
209
210	/*
211	 * Control, or special.
212	 */
213	if (strchr(user_specials, ch) || ISCNTRL(ch)) {
214	    if (tok_count < token_len) {
215		token[tok_count].u.offset = LEN(token_buffer);
216		token[tok_count].type = ch;
217		VSTRING_ADDCH(token_buffer, ch);
218		VSTRING_ADDCH(token_buffer, 0);
219		tok_count++;
220	    }
221	    continue;
222	}
223
224	/*
225	 * Token.
226	 */
227	else {
228	    if (tok_count < token_len) {
229		token[tok_count].u.offset = LEN(token_buffer);
230		token[tok_count].type = HEADER_TOK_TOKEN;
231		VSTRING_ADDCH(token_buffer, ch);
232	    }
233	    while ((ch = *cp) != 0 && !IS_SPACE_TAB_CR_LF(ch)
234		   && !ISCNTRL(ch) && !strchr(user_specials, ch)) {
235		cp++;
236		if (tok_count < token_len)
237		    VSTRING_ADDCH(token_buffer, ch);
238	    }
239	    if (tok_count < token_len) {
240		VSTRING_ADDCH(token_buffer, 0);
241		tok_count++;
242	    }
243	    continue;
244	}
245    }
246
247    /*
248     * Ignore a zero-length item after the last terminator.
249     */
250    if (tok_count == 0 && ch == 0)
251	return (-1);
252
253    /*
254     * Finalize. Fill in the string pointer array, now that the token buffer
255     * is no longer dynamically reallocated as it grows.
256     */
257    *ptr = (const char *) cp;
258    for (n = 0; n < tok_count; n++)
259	token[n].u.value = STR(token_buffer) + token[n].u.offset;
260
261    if (msg_verbose)
262	msg_info("header_token: %s %s %s",
263		 tok_count > 0 ? token[0].u.value : "",
264		 tok_count > 1 ? token[1].u.value : "",
265		 tok_count > 2 ? token[2].u.value : "");
266
267    return (tok_count);
268}
269