/*++
/* NAME
/*	header_token 3
/* SUMMARY
/*	mail header parser
/* SYNOPSIS
/*	#include <header_token.h>
/*
/*	typedef struct {
/* .in +4
/*	    int     type;
/*	    const char *u.value;
/*	    /* ... */
/* .in
/*	} HEADER_TOKEN;
/*
/*	ssize_t	header_token(token, token_len, token_buffer, ptr,
/*				specials, terminator)
/*	HEADER_TOKEN *token;
/*	ssize_t	token_len;
/*	VSTRING	*token_buffer;
/*	const char **ptr;
/*	const char *specials;
/*	int	terminator;
/* DESCRIPTION
/*	This module parses a mail header value (text after field-name:)
/*	into tokens. The parser understands RFC 822 linear white space,
/*	quoted-string, comment, control characters, and a set of
/*	user-specified special characters.
/*
/*	A result token type is one of the following:
/* .IP HEADER_TOK_QSTRING
/*	Quoted string as per RFC 822.
/* .IP HEADER_TOK_TOKEN
/*	Token as per RFC 822, and the special characters supplied by the
/*	caller.
/* .IP other
/*	The value of a control character or special character.
/* .PP
/*	header_token() tokenizes the input and stops after a user-specified
/*	terminator (ignoring all tokens that exceed the capacity of
/*	the result storage), or when it runs out of space for the result.
/*	The terminator is not stored. The result value is the number of
/*	tokens stored, or -1 when the input was exhausted before any tokens
/*	were found.
/*
/*	Arguments:
/* .IP token
/*	Result array of HEADER_TOKEN structures. Token string values
/*	are pointers to null-terminated substrings in the token_buffer.
/* .IP token_len
/*	Length of the array of HEADER_TOKEN structures.
/* .IP token_buffer
/*	Storage for result token string values.
/* .IP ptr
/*	Input/output read position. The input is a null-terminated string.
/* .IP specials
/*	Special characters according to the relevant RFC, or a
/*	null pointer (default to the RFC 822 special characters).
60/* This must include the optional terminator if one is specified. 61/* .IP terminator 62/* The special character to stop after, or zero. 63/* BUGS 64/* Eight-bit characters are not given special treatment. 65/* SEE ALSO 66/* RFC 822 (ARPA Internet Text Messages) 67/* DIAGNOSTICS 68/* Fatal errors: memory allocation problem. 69/* LICENSE 70/* .ad 71/* .fi 72/* The Secure Mailer license must be distributed with this software. 73/* AUTHOR(S) 74/* Wietse Venema 75/* IBM T.J. Watson Research 76/* P.O. Box 704 77/* Yorktown Heights, NY 10598, USA 78/*--*/ 79 80/* System library. */ 81 82#include <sys_defs.h> 83#include <string.h> 84#include <ctype.h> 85 86/* Utility library. */ 87 88#include <msg.h> 89#include <vstring.h> 90 91/* Global library. */ 92 93#include <lex_822.h> 94#include <header_token.h> 95 96/* Application-specific. */ 97 98 /* 99 * Silly little macros. 100 */ 101#define STR(x) vstring_str(x) 102#define LEN(x) VSTRING_LEN(x) 103#define CU_CHAR_PTR(x) ((const unsigned char *) (x)) 104 105/* header_token - parse out the next item in a message header */ 106 107ssize_t header_token(HEADER_TOKEN *token, ssize_t token_len, 108 VSTRING *token_buffer, const char **ptr, 109 const char *user_specials, int user_terminator) 110{ 111 ssize_t comment_level; 112 const unsigned char *cp; 113 ssize_t len; 114 int ch; 115 ssize_t tok_count; 116 ssize_t n; 117 118 /* 119 * Initialize. 120 */ 121 VSTRING_RESET(token_buffer); 122 cp = CU_CHAR_PTR(*ptr); 123 tok_count = 0; 124 if (user_specials == 0) 125 user_specials = LEX_822_SPECIALS; 126 127 /* 128 * Main parsing loop. 129 * 130 * XXX What was the reason to continue parsing when user_terminator is 131 * specified? Perhaps this was needed at some intermediate stage of 132 * development? 133 */ 134 while ((ch = *cp) != 0 && (user_terminator != 0 || tok_count < token_len)) { 135 cp++; 136 137 /* 138 * Skip RFC 822 linear white space. 139 */ 140 if (IS_SPACE_TAB_CR_LF(ch)) 141 continue; 142 143 /* 144 * Terminator. 
145 */ 146 if (ch == user_terminator) 147 break; 148 149 /* 150 * Skip RFC 822 comment. 151 */ 152 if (ch == '(') { 153 comment_level = 1; 154 while ((ch = *cp) != 0) { 155 cp++; 156 if (ch == '(') { /* comments can nest! */ 157 comment_level++; 158 } else if (ch == ')') { 159 if (--comment_level == 0) 160 break; 161 } else if (ch == '\\') { 162 if ((ch = *cp) == 0) 163 break; 164 cp++; 165 } 166 } 167 continue; 168 } 169 170 /* 171 * Copy quoted text according to RFC 822. 172 */ 173 if (ch == '"') { 174 if (tok_count < token_len) { 175 token[tok_count].u.offset = LEN(token_buffer); 176 token[tok_count].type = HEADER_TOK_QSTRING; 177 } 178 while ((ch = *cp) != 0) { 179 cp++; 180 if (ch == '"') 181 break; 182 if (ch == '\n') { /* unfold */ 183 if (tok_count < token_len) { 184 len = LEN(token_buffer); 185 while (len > 0 186 && IS_SPACE_TAB_CR_LF(STR(token_buffer)[len - 1])) 187 len--; 188 if (len < LEN(token_buffer)) 189 vstring_truncate(token_buffer, len); 190 } 191 continue; 192 } 193 if (ch == '\\') { 194 if ((ch = *cp) == 0) 195 break; 196 cp++; 197 } 198 if (tok_count < token_len) 199 VSTRING_ADDCH(token_buffer, ch); 200 } 201 if (tok_count < token_len) { 202 VSTRING_ADDCH(token_buffer, 0); 203 tok_count++; 204 } 205 continue; 206 } 207 208 /* 209 * Control, or special. 210 */ 211 if (strchr(user_specials, ch) || ISCNTRL(ch)) { 212 if (tok_count < token_len) { 213 token[tok_count].u.offset = LEN(token_buffer); 214 token[tok_count].type = ch; 215 VSTRING_ADDCH(token_buffer, ch); 216 VSTRING_ADDCH(token_buffer, 0); 217 tok_count++; 218 } 219 continue; 220 } 221 222 /* 223 * Token. 
224 */ 225 else { 226 if (tok_count < token_len) { 227 token[tok_count].u.offset = LEN(token_buffer); 228 token[tok_count].type = HEADER_TOK_TOKEN; 229 VSTRING_ADDCH(token_buffer, ch); 230 } 231 while ((ch = *cp) != 0 && !IS_SPACE_TAB_CR_LF(ch) 232 && !ISCNTRL(ch) && !strchr(user_specials, ch)) { 233 cp++; 234 if (tok_count < token_len) 235 VSTRING_ADDCH(token_buffer, ch); 236 } 237 if (tok_count < token_len) { 238 VSTRING_ADDCH(token_buffer, 0); 239 tok_count++; 240 } 241 continue; 242 } 243 } 244 245 /* 246 * Ignore a zero-length item after the last terminator. 247 */ 248 if (tok_count == 0 && ch == 0) 249 return (-1); 250 251 /* 252 * Finalize. Fill in the string pointer array, now that the token buffer 253 * is no longer dynamically reallocated as it grows. 254 */ 255 *ptr = (const char *) cp; 256 for (n = 0; n < tok_count; n++) 257 token[n].u.value = STR(token_buffer) + token[n].u.offset; 258 259 if (msg_verbose) 260 msg_info("header_token: %s %s %s", 261 tok_count > 0 ? token[0].u.value : "", 262 tok_count > 1 ? token[1].u.value : "", 263 tok_count > 2 ? token[2].u.value : ""); 264 265 return (tok_count); 266} 267