1/* $NetBSD$ */ 2 3/*++ 4/* NAME 5/* header_token 3 6/* SUMMARY 7/* mail header parser 8/* SYNOPSIS 9/* #include <header_token.h> 10/* 11/* typedef struct { 12/* .in +4 13/* int type; 14/* const char *u.value; 15/* /* ... */ 16/* .in 17/* } HEADER_TOKEN; 18/* 19/* ssize_t header_token(token, token_len, token_buffer, ptr, 20/* specials, terminator) 21/* HEADER_TOKEN *token; 22/* ssize_t token_len; 23/* VSTRING *token_buffer; 24/* const char **ptr; 25/* const char *specials; 26/* int terminator; 27/* DESCRIPTION 28/* This module parses a mail header value (text after field-name:) 29/* into tokens. The parser understands RFC 822 linear white space, 30/* quoted-string, comment, control characters, and a set of 31/* user-specified special characters. 32/* 33/* A result token type is one of the following: 34/* .IP HEADER_TOK_QSTRING 35/* Quoted string as per RFC 822. 36/* .IP HEADER_TOK_TOKEN 37/* Token as per RFC 822, and the special characters supplied by the 38/* caller. 39/* .IP other 40/* The value of a control character or special character. 41/* .PP 42/* header_token() tokenizes the input and stops after a user-specified 43/* terminator (ignoring all tokens that exceed the capacity of 44/* the result storage), or when it runs out of space for the result. 45/* The terminator is not stored. The result value is the number of 46/* tokens stored, or -1 when the input was exhausted before any tokens 47/* were found. 48/* 49/* Arguments: 50/* .IP token 51/* Result array of HEADER_TOKEN structures. Token string values 52/* are pointers to null-terminated substrings in the token_buffer. 53/* .IP token_len 54/* Length of the array of HEADER_TOKEN structures. 55/* .IP token_buffer 56/* Storage for result token string values. 57/* .IP ptr 58/* Input/output read position. The input is a null-terminated string. 59/* .IP specials 60/* Special characters according to the relevant RFC, or a 61/* null pointer (default to the RFC 822 special characters). 62/* This must include the optional terminator if one is specified. 63/* .IP terminator 64/* The special character to stop after, or zero. 65/* BUGS 66/* Eight-bit characters are not given special treatment. 67/* SEE ALSO 68/* RFC 822 (ARPA Internet Text Messages) 69/* DIAGNOSTICS 70/* Fatal errors: memory allocation problem. 71/* LICENSE 72/* .ad 73/* .fi 74/* The Secure Mailer license must be distributed with this software. 75/* AUTHOR(S) 76/* Wietse Venema 77/* IBM T.J. Watson Research 78/* P.O. Box 704 79/* Yorktown Heights, NY 10598, USA 80/*--*/ 81 82/* System library. */ 83 84#include <sys_defs.h> 85#include <string.h> 86#include <ctype.h> 87 88/* Utility library. */ 89 90#include <msg.h> 91#include <vstring.h> 92 93/* Global library. */ 94 95#include <lex_822.h> 96#include <header_token.h> 97 98/* Application-specific. */ 99 100 /* 101 * Silly little macros. 102 */ 103#define STR(x) vstring_str(x) 104#define LEN(x) VSTRING_LEN(x) 105#define CU_CHAR_PTR(x) ((const unsigned char *) (x)) 106 107/* header_token - parse out the next item in a message header */ 108 109ssize_t header_token(HEADER_TOKEN *token, ssize_t token_len, 110 VSTRING *token_buffer, const char **ptr, 111 const char *user_specials, int user_terminator) 112{ 113 ssize_t comment_level; 114 const unsigned char *cp; 115 ssize_t len; 116 int ch; 117 ssize_t tok_count; 118 ssize_t n; 119 120 /* 121 * Initialize. 122 */ 123 VSTRING_RESET(token_buffer); 124 cp = CU_CHAR_PTR(*ptr); 125 tok_count = 0; 126 if (user_specials == 0) 127 user_specials = LEX_822_SPECIALS; 128 129 /* 130 * Main parsing loop. 131 * 132 * XXX What was the reason to continue parsing when user_terminator is 133 * specified? Perhaps this was needed at some intermediate stage of 134 * development? 135 */ 136 while ((ch = *cp) != 0 && (user_terminator != 0 || tok_count < token_len)) { 137 cp++; 138 139 /* 140 * Skip RFC 822 linear white space. 141 */ 142 if (IS_SPACE_TAB_CR_LF(ch)) 143 continue; 144 145 /* 146 * Terminator. 147 */ 148 if (ch == user_terminator) 149 break; 150 151 /* 152 * Skip RFC 822 comment. 153 */ 154 if (ch == '(') { 155 comment_level = 1; 156 while ((ch = *cp) != 0) { 157 cp++; 158 if (ch == '(') { /* comments can nest! */ 159 comment_level++; 160 } else if (ch == ')') { 161 if (--comment_level == 0) 162 break; 163 } else if (ch == '\\') { 164 if ((ch = *cp) == 0) 165 break; 166 cp++; 167 } 168 } 169 continue; 170 } 171 172 /* 173 * Copy quoted text according to RFC 822. 174 */ 175 if (ch == '"') { 176 if (tok_count < token_len) { 177 token[tok_count].u.offset = LEN(token_buffer); 178 token[tok_count].type = HEADER_TOK_QSTRING; 179 } 180 while ((ch = *cp) != 0) { 181 cp++; 182 if (ch == '"') 183 break; 184 if (ch == '\n') { /* unfold */ 185 if (tok_count < token_len) { 186 len = LEN(token_buffer); 187 while (len > 0 188 && IS_SPACE_TAB_CR_LF(STR(token_buffer)[len - 1])) 189 len--; 190 if (len < LEN(token_buffer)) 191 vstring_truncate(token_buffer, len); 192 } 193 continue; 194 } 195 if (ch == '\\') { 196 if ((ch = *cp) == 0) 197 break; 198 cp++; 199 } 200 if (tok_count < token_len) 201 VSTRING_ADDCH(token_buffer, ch); 202 } 203 if (tok_count < token_len) { 204 VSTRING_ADDCH(token_buffer, 0); 205 tok_count++; 206 } 207 continue; 208 } 209 210 /* 211 * Control, or special. 212 */ 213 if (strchr(user_specials, ch) || ISCNTRL(ch)) { 214 if (tok_count < token_len) { 215 token[tok_count].u.offset = LEN(token_buffer); 216 token[tok_count].type = ch; 217 VSTRING_ADDCH(token_buffer, ch); 218 VSTRING_ADDCH(token_buffer, 0); 219 tok_count++; 220 } 221 continue; 222 } 223 224 /* 225 * Token. 226 */ 227 else { 228 if (tok_count < token_len) { 229 token[tok_count].u.offset = LEN(token_buffer); 230 token[tok_count].type = HEADER_TOK_TOKEN; 231 VSTRING_ADDCH(token_buffer, ch); 232 } 233 while ((ch = *cp) != 0 && !IS_SPACE_TAB_CR_LF(ch) 234 && !ISCNTRL(ch) && !strchr(user_specials, ch)) { 235 cp++; 236 if (tok_count < token_len) 237 VSTRING_ADDCH(token_buffer, ch); 238 } 239 if (tok_count < token_len) { 240 VSTRING_ADDCH(token_buffer, 0); 241 tok_count++; 242 } 243 continue; 244 } 245 } 246 247 /* 248 * Ignore a zero-length item after the last terminator. 249 */ 250 if (tok_count == 0 && ch == 0) 251 return (-1); 252 253 /* 254 * Finalize. Fill in the string pointer array, now that the token buffer 255 * is no longer dynamically reallocated as it grows. 256 */ 257 *ptr = (const char *) cp; 258 for (n = 0; n < tok_count; n++) 259 token[n].u.value = STR(token_buffer) + token[n].u.offset; 260 261 if (msg_verbose) 262 msg_info("header_token: %s %s %s", 263 tok_count > 0 ? token[0].u.value : "", 264 tok_count > 1 ? token[1].u.value : "", 265 tok_count > 2 ? token[2].u.value : ""); 266 267 return (tok_count); 268} 269