/*++ /* NAME /* tok822_parse 3 /* SUMMARY /* RFC 822 address parser /* SYNOPSIS /* #include /* /* TOK822 *tok822_scan_limit(str, tailp, limit) /* const char *str; /* TOK822 **tailp; /* int limit; /* /* TOK822 *tok822_scan(str, tailp) /* const char *str; /* TOK822 **tailp; /* /* TOK822 *tok822_parse_limit(str, limit) /* const char *str; /* int limit; /* /* TOK822 *tok822_parse(str) /* const char *str; /* /* TOK822 *tok822_scan_addr(str) /* const char *str; /* /* VSTRING *tok822_externalize(buffer, tree, flags) /* VSTRING *buffer; /* TOK822 *tree; /* int flags; /* /* VSTRING *tok822_internalize(buffer, tree, flags) /* VSTRING *buffer; /* TOK822 *tree; /* int flags; /* DESCRIPTION /* This module converts address lists between string form and parse /* tree formats. The string form can appear in two different ways: /* external (or quoted) form, as used in message headers, and internal /* (unquoted) form, as used internally by the mail software. /* Although RFC 822 expects 7-bit data, these routines pay no /* special attention to 8-bit characters. /* /* tok822_scan() converts the external-form string in \fIstr\fR /* to a linear token list. The \fItailp\fR argument is a null pointer /* or receives the pointer value of the last result list element. /* /* tok822_scan_limit() implements tok822_scan(), which is a macro. /* The \fIlimit\fR argument is either zero or an upper bound on the /* number of tokens produced. /* /* tok822_parse() converts the external-form address list in /* \fIstr\fR to the corresponding token tree. The parser is permissive /* and will not throw away information that it does not understand. /* The parser adds missing commas between addresses. /* /* tok822_parse_limit() implements tok822_parse(), which is a macro. /* The \fIlimit\fR argument is either zero or an upper bound on the /* number of tokens produced. /* /* tok822_scan_addr() converts the external-form string in /* \fIstr\fR to an address token tree. This is just string to /* token list conversion; no parsing is done. This routine is /* suitable for data that should contain just one address and no /* other information. /* /* tok822_externalize() converts a token list to external form. /* Where appropriate, characters and strings are quoted and white /* space is inserted. The \fIflags\fR argument is the binary OR of /* zero or more of the following: /* .IP TOK822_STR_WIPE /* Initially, truncate the result to zero length. /* .IP TOK822_STR_TERM /* Append a null terminator to the result when done. /* .IP TOK822_STR_LINE /* Append a line break after each comma token, instead of appending /* whitespace. It is up to the caller to concatenate short lines to /* produce longer ones. /* .IP TOK822_STR_TRNC /* Truncate non-address information to 250 characters per address, to /* protect Sendmail systems that are vulnerable to the problem in CERT /* advisory CA-2003-07. /* This flag has effect with tok822_externalize() only. /* .PP /* The macro TOK_822_NONE expresses that none of the above features /* should be activated. /* /* The macro TOK822_STR_DEFL combines the TOK822_STR_WIPE and /* TOK822_STR_TERM flags. This is useful for most token to string /* conversions. /* /* The macro TOK822_STR_HEAD combines the TOK822_STR_TERM, /* TOK822_STR_LINE and TOK822_STR_TRNC flags. This is useful for /* the special case of token to mail header conversion. /* /* tok822_internalize() converts a token list to string form, /* without quoting. White space is inserted where appropriate. /* The \fIflags\fR argument is as with tok822_externalize(). /* STANDARDS /* .ad /* .fi /* RFC 822 (ARPA Internet Text Messages). In addition to this standard /* this module implements additional operators such as % and !. These /* are needed because the real world is not all RFC 822. Also, the ':' /* operator is allowed to appear inside addresses, to accommodate DECnet. /* In addition, 8-bit data is not given special treatment. /* LICENSE /* .ad /* .fi /* The Secure Mailer license must be distributed with this software. /* AUTHOR(S) /* Wietse Venema /* IBM T.J. Watson Research /* P.O. Box 704 /* Yorktown Heights, NY 10598, USA /*--*/ /* System library. */ #include #include #include /* Utility library. */ #include #include #include /* Global library. */ #include "lex_822.h" #include "quote_822_local.h" #include "tok822.h" /* * I suppose this is my favorite macro. Used heavily for tokenizing. */ #define COLLECT(t,s,c,cond) { \ while ((c = *(unsigned char *) s) != 0) { \ if (c == '\\') { \ if ((c = *(unsigned char *)++s) == 0) \ break; \ } else if (!(cond)) { \ break; \ } \ VSTRING_ADDCH(t->vstr, IS_SPACE_TAB_CR_LF(c) ? ' ' : c); \ s++; \ } \ VSTRING_TERMINATE(t->vstr); \ } #define COLLECT_SKIP_LAST(t,s,c,cond) { COLLECT(t,s,c,cond); if (*s) s++; } /* * Not quite as complex. The parser depends heavily on it. */ #define SKIP(tp, cond) { \ while (tp->type && (cond)) \ tp = tp->prev; \ } #define MOVE_COMMENT_AND_CONTINUE(tp, right) { \ TOK822 *prev = tok822_unlink(tp); \ right = tok822_prepend(right, tp); \ tp = prev; \ continue; \ } #define SKIP_MOVE_COMMENT(tp, cond, right) { \ while (tp->type && (cond)) { \ if (tp->type == TOK822_COMMENT) \ MOVE_COMMENT_AND_CONTINUE(tp, right); \ tp = tp->prev; \ } \ } /* * Single-character operators. We include the % and ! operators because not * all the world is RFC822. XXX Make this operator list configurable when we * have a real rewriting language. Include | for aliases file parsing. */ static char tok822_opchar[] = "|%!" LEX_822_SPECIALS; static void tok822_quote_atom(TOK822 *); static const char *tok822_comment(TOK822 *, const char *); static TOK822 *tok822_group(int, TOK822 *, TOK822 *, int); static void tok822_copy_quoted(VSTRING *, char *, char *); static int tok822_append_space(TOK822 *); #define DO_WORD (1<<0) /* finding a word is ok here */ #define DO_GROUP (1<<1) /* doing an address group */ #define ADD_COMMA ',' /* resynchronize */ #define NO_MISSING_COMMA 0 /* tok822_internalize - token tree to string, internal form */ VSTRING *tok822_internalize(VSTRING *vp, TOK822 *tree, int flags) { TOK822 *tp; if (flags & TOK822_STR_WIPE) VSTRING_RESET(vp); for (tp = tree; tp; tp = tp->next) { switch (tp->type) { case ',': VSTRING_ADDCH(vp, tp->type); if (flags & TOK822_STR_LINE) { VSTRING_ADDCH(vp, '\n'); continue; } break; case TOK822_ADDR: tok822_internalize(vp, tp->head, TOK822_STR_NONE); break; case TOK822_COMMENT: case TOK822_ATOM: case TOK822_QSTRING: vstring_strcat(vp, vstring_str(tp->vstr)); break; case TOK822_DOMLIT: VSTRING_ADDCH(vp, '['); vstring_strcat(vp, vstring_str(tp->vstr)); VSTRING_ADDCH(vp, ']'); break; case TOK822_STARTGRP: VSTRING_ADDCH(vp, ':'); break; default: if (tp->type >= TOK822_MINTOK) msg_panic("tok822_internalize: unknown operator %d", tp->type); VSTRING_ADDCH(vp, tp->type); } if (tok822_append_space(tp)) VSTRING_ADDCH(vp, ' '); } if (flags & TOK822_STR_TERM) VSTRING_TERMINATE(vp); return (vp); } /* strip_address - strip non-address text from address expression */ static void strip_address(VSTRING *vp, ssize_t start, TOK822 *addr) { VSTRING *tmp; /* * Emit plain
. Discard any comments or phrases. */ VSTRING_TERMINATE(vp); msg_warn("stripping too many comments from address: %.100s...", printable(vstring_str(vp) + start, '?')); vstring_truncate(vp, start); VSTRING_ADDCH(vp, '<'); if (addr) { tmp = vstring_alloc(100); tok822_internalize(tmp, addr, TOK822_STR_TERM); quote_822_local_flags(vp, vstring_str(tmp), QUOTE_FLAG_8BITCLEAN | QUOTE_FLAG_APPEND); vstring_free(tmp); } VSTRING_ADDCH(vp, '>'); } /* tok822_externalize - token tree to string, external form */ VSTRING *tok822_externalize(VSTRING *vp, TOK822 *tree, int flags) { VSTRING *tmp; TOK822 *tp; ssize_t start; TOK822 *addr; ssize_t addr_len; /* * Guard against a Sendmail buffer overflow (CERT advisory CA-2003-07). * The problem was that Sendmail could store too much non-address text * (comments, phrases, etc.) into a static 256-byte buffer. * * When the buffer fills up, fixed Sendmail versions remove comments etc. * and reduce the information to just <$g>, which expands to
. * No change is made when an address expression (text separated by * commas) contains no address. This fix reportedly also protects * Sendmail systems that are still vulnerable to this problem. * * Postfix takes the same approach, grudgingly. To avoid unnecessary damage, * Postfix removes comments etc. only when the amount of non-address text * in an address expression (text separated by commas) exceeds 250 bytes. * * With Sendmail, the address part of an address expression is the * right-most <> instance in that expression. If an address expression * contains no <>, then Postfix guarantees that it contains at most one * non-comment string; that string is the address part of the address * expression, so there is no ambiguity. * * Finally, we note that stress testing shows that other code in Sendmail * 8.12.8 bluntly truncates ``text
'' to 256 bytes even when * this means chopping the
somewhere in the middle. This is a * loss of control that we're not entirely comfortable with. However, * unbalanced quotes and dangling backslash do not seem to influence the * way that Sendmail parses headers, so this is not an urgent problem. */ #define MAX_NONADDR_LENGTH 250 #define RESET_NONADDR_LENGTH { \ start = VSTRING_LEN(vp); \ addr = 0; \ addr_len = 0; \ } #define ENFORCE_NONADDR_LENGTH do { \ if (addr && VSTRING_LEN(vp) - addr_len > start + MAX_NONADDR_LENGTH) \ strip_address(vp, start, addr->head); \ } while(0) if (flags & TOK822_STR_WIPE) VSTRING_RESET(vp); if (flags & TOK822_STR_TRNC) RESET_NONADDR_LENGTH; for (tp = tree; tp; tp = tp->next) { switch (tp->type) { case ',': if (flags & TOK822_STR_TRNC) ENFORCE_NONADDR_LENGTH; VSTRING_ADDCH(vp, tp->type); VSTRING_ADDCH(vp, (flags & TOK822_STR_LINE) ? '\n' : ' '); if (flags & TOK822_STR_TRNC) RESET_NONADDR_LENGTH; continue; /* * XXX In order to correctly externalize an address, it is not * sufficient to quote individual atoms. There are higher-level * rules that say when an address localpart needs to be quoted. * We wing it with the quote_822_local() routine, which ignores * the issue of atoms in the domain part that would need quoting. */ case TOK822_ADDR: addr = tp; tmp = vstring_alloc(100); tok822_internalize(tmp, tp->head, TOK822_STR_TERM); addr_len = VSTRING_LEN(vp); quote_822_local_flags(vp, vstring_str(tmp), QUOTE_FLAG_8BITCLEAN | QUOTE_FLAG_APPEND); addr_len = VSTRING_LEN(vp) - addr_len; vstring_free(tmp); break; case TOK822_ATOM: case TOK822_COMMENT: vstring_strcat(vp, vstring_str(tp->vstr)); break; case TOK822_QSTRING: VSTRING_ADDCH(vp, '"'); tok822_copy_quoted(vp, vstring_str(tp->vstr), "\"\\\r\n"); VSTRING_ADDCH(vp, '"'); break; case TOK822_DOMLIT: VSTRING_ADDCH(vp, '['); tok822_copy_quoted(vp, vstring_str(tp->vstr), "\\\r\n"); VSTRING_ADDCH(vp, ']'); break; case TOK822_STARTGRP: VSTRING_ADDCH(vp, ':'); break; case '<': if (tp->next && tp->next->type == '>') { addr = tp; addr_len = 0; } VSTRING_ADDCH(vp, '<'); break; default: if (tp->type >= TOK822_MINTOK) msg_panic("tok822_externalize: unknown operator %d", tp->type); VSTRING_ADDCH(vp, tp->type); } if (tok822_append_space(tp)) VSTRING_ADDCH(vp, ' '); } if (flags & TOK822_STR_TRNC) ENFORCE_NONADDR_LENGTH; if (flags & TOK822_STR_TERM) VSTRING_TERMINATE(vp); return (vp); } /* tok822_copy_quoted - copy a string while quoting */ static void tok822_copy_quoted(VSTRING *vp, char *str, char *quote_set) { int ch; while ((ch = *(unsigned char *) str++) != 0) { if (strchr(quote_set, ch)) VSTRING_ADDCH(vp, '\\'); VSTRING_ADDCH(vp, ch); } } /* tok822_append_space - see if space is needed after this token */ static int tok822_append_space(TOK822 *tp) { TOK822 *next; if (tp == 0 || (next = tp->next) == 0 || tp->owner != 0) return (0); if (tp->type == ',' || tp->type == TOK822_STARTGRP || next->type == '<') return (1); #define NON_OPERATOR(x) \ (x->type == TOK822_ATOM || x->type == TOK822_QSTRING \ || x->type == TOK822_COMMENT || x->type == TOK822_DOMLIT \ || x->type == TOK822_ADDR) return (NON_OPERATOR(tp) && NON_OPERATOR(next)); } /* tok822_scan_limit - tokenize string */ TOK822 *tok822_scan_limit(const char *str, TOK822 **tailp, int tok_count_limit) { TOK822 *head = 0; TOK822 *tail = 0; TOK822 *tp; int ch; int tok_count = 0; /* * XXX 2822 new feature: Section 4.1 allows "." to appear in a phrase (to * allow for forms such as: Johnny B. Goode . I cannot * handle that at the tokenizer level - it is not context sensitive. And * to fix this at the parser level requires radical changes to preserve * white space as part of the token stream. Thanks a lot, people. */ while ((ch = *(unsigned char *) str++) != 0) { if (IS_SPACE_TAB_CR_LF(ch)) continue; if (ch == '(') { tp = tok822_alloc(TOK822_COMMENT, (char *) 0); str = tok822_comment(tp, str); } else if (ch == '[') { tp = tok822_alloc(TOK822_DOMLIT, (char *) 0); COLLECT_SKIP_LAST(tp, str, ch, ch != ']'); } else if (ch == '"') { tp = tok822_alloc(TOK822_QSTRING, (char *) 0); COLLECT_SKIP_LAST(tp, str, ch, ch != '"'); } else if (ch != '\\' && strchr(tok822_opchar, ch)) { tp = tok822_alloc(ch, (char *) 0); } else { tp = tok822_alloc(TOK822_ATOM, (char *) 0); str -= 1; /* \ may be first */ COLLECT(tp, str, ch, !IS_SPACE_TAB_CR_LF(ch) && !strchr(tok822_opchar, ch)); tok822_quote_atom(tp); } if (head == 0) { head = tail = tp; while (tail->next) tail = tail->next; } else { tail = tok822_append(tail, tp); } if (tok_count_limit > 0 && ++tok_count >= tok_count_limit) break; } if (tailp) *tailp = tail; return (head); } /* tok822_parse_limit - translate external string to token tree */ TOK822 *tok822_parse_limit(const char *str, int tok_count_limit) { TOK822 *head; TOK822 *tail; TOK822 *right; TOK822 *first_token; TOK822 *last_token; TOK822 *tp; int state; /* * First, tokenize the string, from left to right. We are not allowed to * throw away any information that we do not understand. With a flat * token list that contains all tokens, we can always convert back to * string form. */ if ((first_token = tok822_scan_limit(str, &last_token, tok_count_limit)) == 0) return (0); /* * For convenience, sandwich the token list between two sentinel tokens. */ #define GLUE(left,rite) { left->next = rite; rite->prev = left; } head = tok822_alloc(0, (char *) 0); GLUE(head, first_token); tail = tok822_alloc(0, (char *) 0); GLUE(last_token, tail); /* * Next step is to transform the token list into a parse tree. This is * done most conveniently from right to left. If there is something that * we do not understand, just leave it alone, don't throw it away. The * address information that we're looking for sits in-between the current * node (tp) and the one called right. Add missing commas on the fly. */ state = DO_WORD; right = tail; tp = tail->prev; while (tp->type) { if (tp->type == TOK822_COMMENT) { /* move comment to the side */ MOVE_COMMENT_AND_CONTINUE(tp, right); } else if (tp->type == ';') { /* rh side of named group */ right = tok822_group(TOK822_ADDR, tp, right, ADD_COMMA); state = DO_GROUP | DO_WORD; } else if (tp->type == ':' && (state & DO_GROUP) != 0) { tp->type = TOK822_STARTGRP; (void) tok822_group(TOK822_ADDR, tp, right, NO_MISSING_COMMA); SKIP(tp, tp->type != ','); right = tp; continue; } else if (tp->type == '>') { /* rh side of */ right = tok822_group(TOK822_ADDR, tp, right, ADD_COMMA); SKIP_MOVE_COMMENT(tp, tp->type != '<', right); (void) tok822_group(TOK822_ADDR, tp, right, NO_MISSING_COMMA); SKIP(tp, tp->type > 0xff || strchr(">;,:", tp->type) == 0); right = tp; state |= DO_WORD; continue; } else if (tp->type == TOK822_ATOM || tp->type == TOK822_QSTRING || tp->type == TOK822_DOMLIT) { if ((state & DO_WORD) == 0) right = tok822_group(TOK822_ADDR, tp, right, ADD_COMMA)->next; state &= ~DO_WORD; } else if (tp->type == ',') { right = tok822_group(TOK822_ADDR, tp, right, NO_MISSING_COMMA); state |= DO_WORD; } else { state |= DO_WORD; } tp = tp->prev; } (void) tok822_group(TOK822_ADDR, tp, right, NO_MISSING_COMMA); /* * Discard the sentinel tokens on the left and right extremes. Properly * terminate the resulting list. */ tp = (head->next != tail ? head->next : 0); tok822_cut_before(head->next); tok822_free(head); tok822_cut_before(tail); tok822_free(tail); return (tp); } /* tok822_quote_atom - see if an atom needs quoting when externalized */ static void tok822_quote_atom(TOK822 *tp) { char *cp; int ch; /* * RFC 822 expects 7-bit data. Rather than quoting every 8-bit character * (and still passing it on as 8-bit data) we leave 8-bit data alone. */ for (cp = vstring_str(tp->vstr); (ch = *(unsigned char *) cp) != 0; cp++) { if ( /* !ISASCII(ch) || */ ch == ' ' || ISCNTRL(ch) || strchr(tok822_opchar, ch)) { tp->type = TOK822_QSTRING; break; } } } /* tok822_comment - tokenize comment */ static const char *tok822_comment(TOK822 *tp, const char *str) { int level = 1; int ch; /* * XXX We cheat by storing comments in their external form. Otherwise it * would be a royal pain to preserve \ before (. That would require a * recursive parser; the easy to implement stack-based recursion would be * too expensive. */ VSTRING_ADDCH(tp->vstr, '('); while ((ch = *(unsigned char *) str) != 0) { VSTRING_ADDCH(tp->vstr, ch); str++; if (ch == '(') { /* comments can nest! */ level++; } else if (ch == ')') { if (--level == 0) break; } else if (ch == '\\') { if ((ch = *(unsigned char *) str) == 0) break; VSTRING_ADDCH(tp->vstr, ch); str++; } } VSTRING_TERMINATE(tp->vstr); return (str); } /* tok822_group - cluster a group of tokens */ static TOK822 *tok822_group(int group_type, TOK822 *left, TOK822 *right, int sync_type) { TOK822 *group; TOK822 *sync; TOK822 *first; /* * Cluster the tokens between left and right under their own parse tree * node. Optionally insert a resync token. */ if (left != right && (first = left->next) != right) { tok822_cut_before(right); tok822_cut_before(first); group = tok822_alloc(group_type, (char *) 0); tok822_sub_append(group, first); tok822_append(left, group); tok822_append(group, right); if (sync_type) { sync = tok822_alloc(sync_type, (char *) 0); tok822_append(left, sync); } } return (left); } /* tok822_scan_addr - convert external address string to address token */ TOK822 *tok822_scan_addr(const char *addr) { TOK822 *tree = tok822_alloc(TOK822_ADDR, (char *) 0); tree->head = tok822_scan(addr, &tree->tail); return (tree); } #ifdef TEST #include #include #include /* tok822_print - display token */ static void tok822_print(TOK822 *list, int indent) { TOK822 *tp; for (tp = list; tp; tp = tp->next) { if (tp->type < TOK822_MINTOK) { vstream_printf("%*s %s \"%c\"\n", indent, "", "OP", tp->type); } else if (tp->type == TOK822_ADDR) { vstream_printf("%*s %s\n", indent, "", "address"); tok822_print(tp->head, indent + 2); } else if (tp->type == TOK822_STARTGRP) { vstream_printf("%*s %s\n", indent, "", "group \":\""); } else { vstream_printf("%*s %s \"%s\"\n", indent, "", tp->type == TOK822_COMMENT ? "comment" : tp->type == TOK822_ATOM ? "atom" : tp->type == TOK822_QSTRING ? "quoted string" : tp->type == TOK822_DOMLIT ? "domain literal" : tp->type == TOK822_ADDR ? "address" : "unknown\n", vstring_str(tp->vstr)); } } } int main(int unused_argc, char **unused_argv) { VSTRING *vp = vstring_alloc(100); TOK822 *list; VSTRING *buf = vstring_alloc(100); #define TEST_TOKEN_LIMIT 20 while (readlline(buf, VSTREAM_IN, (int *) 0)) { while (VSTRING_LEN(buf) > 0 && vstring_end(buf)[-1] == '\n') { vstring_end(buf)[-1] = 0; vstring_truncate(buf, VSTRING_LEN(buf) - 1); } if (!isatty(vstream_fileno(VSTREAM_IN))) vstream_printf(">>>%s<<<\n\n", vstring_str(buf)); list = tok822_parse_limit(vstring_str(buf), TEST_TOKEN_LIMIT); vstream_printf("Parse tree:\n"); tok822_print(list, 0); vstream_printf("\n"); vstream_printf("Internalized:\n%s\n\n", vstring_str(tok822_internalize(vp, list, TOK822_STR_DEFL))); vstream_fflush(VSTREAM_OUT); vstream_printf("Externalized, no newlines inserted:\n%s\n\n", vstring_str(tok822_externalize(vp, list, TOK822_STR_DEFL | TOK822_STR_TRNC))); vstream_fflush(VSTREAM_OUT); vstream_printf("Externalized, newlines inserted:\n%s\n\n", vstring_str(tok822_externalize(vp, list, TOK822_STR_DEFL | TOK822_STR_LINE | TOK822_STR_TRNC))); vstream_fflush(VSTREAM_OUT); tok822_free_tree(list); } vstring_free(vp); vstring_free(buf); return (0); } #endif