1/*++
2/* NAME
3/*	header_token 3
4/* SUMMARY
5/*	mail header parser
6/* SYNOPSIS
7/*	#include <header_token.h>
8/*
9/*	typedef struct {
10/* .in +4
11/*	    int     type;
12/*	    const char *u.value;
13/*	    /* ... */
14/* .in
15/*	} HEADER_TOKEN;
16/*
17/*	ssize_t	header_token(token, token_len, token_buffer, ptr,
18/*				specials, terminator)
19/*	HEADER_TOKEN *token;
20/*	ssize_t	token_len;
21/*	VSTRING *token_buffer;
22/*	const char **ptr;
23/*	const char *specials;
24/*	int	terminator;
25/* DESCRIPTION
26/*	This module parses a mail header value (text after field-name:)
27/*	into tokens. The parser understands RFC 822 linear white space,
28/*	quoted-string, comment, control characters, and a set of
29/*	user-specified special characters.
30/*
31/*	A result token type is one of the following:
32/* .IP HEADER_TOK_QSTRING
33/*	Quoted string as per RFC 822.
34/* .IP HEADER_TOK_TOKEN
35/*	Token as per RFC 822, and the special characters supplied by the
36/*	caller.
37/* .IP other
38/*	The value of a control character or special character.
39/* .PP
40/*	header_token() tokenizes the input and stops after a user-specified
41/*	terminator (ignoring all tokens that exceed the capacity of
42/*	the result storage), or when it runs out of space for the result.
43/*	The terminator is not stored. The result value is the number of
44/*	tokens stored, or -1 when the input was exhausted before any tokens
45/*	were found.
46/*
47/*	Arguments:
48/* .IP token
49/*	Result array of HEADER_TOKEN structures. Token string values
50/*	are pointers to null-terminated substrings in the token_buffer.
51/* .IP token_len
52/*	Length of the array of HEADER_TOKEN structures.
53/* .IP token_buffer
54/*	Storage for result token string values.
55/* .IP ptr
56/*	Input/output read position. The input is a null-terminated string.
57/* .IP specials
58/*	Special characters according to the relevant RFC, or a
59/*	null pointer (default to the RFC 822 special characters).
60/*	This must include the optional terminator if one is specified.
61/* .IP terminator
62/*	The special character to stop after, or zero.
63/* BUGS
64/*	Eight-bit characters are not given special treatment.
65/* SEE ALSO
66/*	RFC 822 (ARPA Internet Text Messages)
67/* DIAGNOSTICS
68/*	Fatal errors: memory allocation problem.
69/* LICENSE
70/* .ad
71/* .fi
72/*	The Secure Mailer license must be distributed with this software.
73/* AUTHOR(S)
74/*	Wietse Venema
75/*	IBM T.J. Watson Research
76/*	P.O. Box 704
77/*	Yorktown Heights, NY 10598, USA
78/*--*/
79
80/* System library. */
81
82#include <sys_defs.h>
83#include <string.h>
84#include <ctype.h>
85
86/* Utility library. */
87
88#include <msg.h>
89#include <vstring.h>
90
91/* Global library. */
92
93#include <lex_822.h>
94#include <header_token.h>
95
96/* Application-specific. */
97
98 /*
99  * Silly little macros.
100  */
101#define STR(x)	vstring_str(x)
102#define LEN(x)	VSTRING_LEN(x)
103#define CU_CHAR_PTR(x)	((const unsigned char *) (x))
104
105/* header_token - parse out the next item in a message header */
106
107ssize_t header_token(HEADER_TOKEN *token, ssize_t token_len,
108		             VSTRING *token_buffer, const char **ptr,
109		             const char *user_specials, int user_terminator)
110{
111    ssize_t comment_level;
112    const unsigned char *cp;
113    ssize_t len;
114    int     ch;
115    ssize_t tok_count;
116    ssize_t n;
117
118    /*
119     * Initialize.
120     */
121    VSTRING_RESET(token_buffer);
122    cp = CU_CHAR_PTR(*ptr);
123    tok_count = 0;
124    if (user_specials == 0)
125	user_specials = LEX_822_SPECIALS;
126
127    /*
128     * Main parsing loop.
129     *
130     * XXX What was the reason to continue parsing when user_terminator is
131     * specified? Perhaps this was needed at some intermediate stage of
132     * development?
133     */
134    while ((ch = *cp) != 0 && (user_terminator != 0 || tok_count < token_len)) {
135	cp++;
136
137	/*
138	 * Skip RFC 822 linear white space.
139	 */
140	if (IS_SPACE_TAB_CR_LF(ch))
141	    continue;
142
143	/*
144	 * Terminator.
145	 */
146	if (ch == user_terminator)
147	    break;
148
149	/*
150	 * Skip RFC 822 comment.
151	 */
152	if (ch == '(') {
153	    comment_level = 1;
154	    while ((ch = *cp) != 0) {
155		cp++;
156		if (ch == '(') {		/* comments can nest! */
157		    comment_level++;
158		} else if (ch == ')') {
159		    if (--comment_level == 0)
160			break;
161		} else if (ch == '\\') {
162		    if ((ch = *cp) == 0)
163			break;
164		    cp++;
165		}
166	    }
167	    continue;
168	}
169
170	/*
171	 * Copy quoted text according to RFC 822.
172	 */
173	if (ch == '"') {
174	    if (tok_count < token_len) {
175		token[tok_count].u.offset = LEN(token_buffer);
176		token[tok_count].type = HEADER_TOK_QSTRING;
177	    }
178	    while ((ch = *cp) != 0) {
179		cp++;
180		if (ch == '"')
181		    break;
182		if (ch == '\n') {		/* unfold */
183		    if (tok_count < token_len) {
184			len = LEN(token_buffer);
185			while (len > 0
186			  && IS_SPACE_TAB_CR_LF(STR(token_buffer)[len - 1]))
187			    len--;
188			if (len < LEN(token_buffer))
189			    vstring_truncate(token_buffer, len);
190		    }
191		    continue;
192		}
193		if (ch == '\\') {
194		    if ((ch = *cp) == 0)
195			break;
196		    cp++;
197		}
198		if (tok_count < token_len)
199		    VSTRING_ADDCH(token_buffer, ch);
200	    }
201	    if (tok_count < token_len) {
202		VSTRING_ADDCH(token_buffer, 0);
203		tok_count++;
204	    }
205	    continue;
206	}
207
208	/*
209	 * Control, or special.
210	 */
211	if (strchr(user_specials, ch) || ISCNTRL(ch)) {
212	    if (tok_count < token_len) {
213		token[tok_count].u.offset = LEN(token_buffer);
214		token[tok_count].type = ch;
215		VSTRING_ADDCH(token_buffer, ch);
216		VSTRING_ADDCH(token_buffer, 0);
217		tok_count++;
218	    }
219	    continue;
220	}
221
222	/*
223	 * Token.
224	 */
225	else {
226	    if (tok_count < token_len) {
227		token[tok_count].u.offset = LEN(token_buffer);
228		token[tok_count].type = HEADER_TOK_TOKEN;
229		VSTRING_ADDCH(token_buffer, ch);
230	    }
231	    while ((ch = *cp) != 0 && !IS_SPACE_TAB_CR_LF(ch)
232		   && !ISCNTRL(ch) && !strchr(user_specials, ch)) {
233		cp++;
234		if (tok_count < token_len)
235		    VSTRING_ADDCH(token_buffer, ch);
236	    }
237	    if (tok_count < token_len) {
238		VSTRING_ADDCH(token_buffer, 0);
239		tok_count++;
240	    }
241	    continue;
242	}
243    }
244
245    /*
246     * Ignore a zero-length item after the last terminator.
247     */
248    if (tok_count == 0 && ch == 0)
249	return (-1);
250
251    /*
252     * Finalize. Fill in the string pointer array, now that the token buffer
253     * is no longer dynamically reallocated as it grows.
254     */
255    *ptr = (const char *) cp;
256    for (n = 0; n < tok_count; n++)
257	token[n].u.value = STR(token_buffer) + token[n].u.offset;
258
259    if (msg_verbose)
260	msg_info("header_token: %s %s %s",
261		 tok_count > 0 ? token[0].u.value : "",
262		 tok_count > 1 ? token[1].u.value : "",
263		 tok_count > 2 ? token[2].u.value : "");
264
265    return (tok_count);
266}
267