1/*++
2/* NAME
3/*	tok822_parse 3
4/* SUMMARY
5/*	RFC 822 address parser
6/* SYNOPSIS
7/*	#include <tok822.h>
8/*
9/*	TOK822 *tok822_scan_limit(str, tailp, limit)
10/*	const char *str;
11/*	TOK822	**tailp;
12/*	int	limit;
13/*
14/*	TOK822 *tok822_scan(str, tailp)
15/*	const char *str;
16/*	TOK822	**tailp;
17/*
18/*	TOK822	*tok822_parse_limit(str, limit)
19/*	const char *str;
20/*	int	limit;
21/*
22/*	TOK822	*tok822_parse(str)
23/*	const char *str;
24/*
25/*	TOK822	*tok822_scan_addr(str)
26/*	const char *str;
27/*
28/*	VSTRING	*tok822_externalize(buffer, tree, flags)
29/*	VSTRING	*buffer;
30/*	TOK822	*tree;
31/*	int	flags;
32/*
33/*	VSTRING	*tok822_internalize(buffer, tree, flags)
34/*	VSTRING	*buffer;
35/*	TOK822	*tree;
36/*	int	flags;
37/* DESCRIPTION
38/*	This module converts address lists between string form and parse
39/*	tree formats. The string form can appear in two different ways:
40/*	external (or quoted) form, as used in message headers, and internal
41/*	(unquoted) form, as used internally by the mail software.
42/*	Although RFC 822 expects 7-bit data, these routines pay no
43/*	special attention to 8-bit characters.
44/*
45/*	tok822_scan() converts the external-form string in \fIstr\fR
46/*	to a linear token list. The \fItailp\fR argument is a null pointer
47/*	or receives the pointer value of the last result list element.
48/*
49/*	tok822_scan_limit() implements tok822_scan(), which is a macro.
50/*	The \fIlimit\fR argument is either zero or an upper bound on the
51/*	number of tokens produced.
52/*
53/*	tok822_parse() converts the external-form address list in
54/*	\fIstr\fR to the corresponding token tree. The parser is permissive
55/*	and will not throw away information that it does not understand.
56/*	The parser adds missing commas between addresses.
57/*
58/*	tok822_parse_limit() implements tok822_parse(), which is a macro.
59/*	The \fIlimit\fR argument is either zero or an upper bound on the
60/*	number of tokens produced.
61/*
62/*	tok822_scan_addr() converts the external-form string in
63/*	\fIstr\fR to an address token tree. This is just string to
64/*	token list conversion; no parsing is done. This routine is
65/*	suitable for data that should contain just one address and no
66/*	other information.
67/*
68/*	tok822_externalize() converts a token list to external form.
69/*	Where appropriate, characters and strings are quoted and white
70/*	space is inserted. The \fIflags\fR argument is the binary OR of
71/*	zero or more of the following:
72/* .IP TOK822_STR_WIPE
73/*	Initially, truncate the result to zero length.
74/* .IP TOK822_STR_TERM
75/*	Append a null terminator to the result when done.
76/* .IP TOK822_STR_LINE
77/*	Append a line break after each comma token, instead of appending
78/*	whitespace.  It is up to the caller to concatenate short lines to
79/*	produce longer ones.
80/* .IP TOK822_STR_TRNC
81/*	Truncate non-address information to 250 characters per address, to
82/*	protect Sendmail systems that are vulnerable to the problem in CERT
83/*	advisory CA-2003-07.
84/*	This flag has effect with tok822_externalize() only.
85/* .PP
/*	The macro TOK822_STR_NONE expresses that none of the above features
/*	should be activated.
88/*
89/*	The macro TOK822_STR_DEFL combines the TOK822_STR_WIPE and
90/*	TOK822_STR_TERM flags. This is useful for most token to string
91/*	conversions.
92/*
93/*	The macro TOK822_STR_HEAD combines the TOK822_STR_TERM,
94/*	TOK822_STR_LINE and TOK822_STR_TRNC flags. This is useful for
95/*	the special case of token to mail header conversion.
96/*
97/*	tok822_internalize() converts a token list to string form,
98/*	without quoting. White space is inserted where appropriate.
99/*	The \fIflags\fR argument is as with tok822_externalize().
100/* STANDARDS
101/* .ad
102/* .fi
103/*	RFC 822 (ARPA Internet Text Messages). In addition to this standard
104/*	this module implements additional operators such as % and !. These
105/*	are needed because the real world is not all RFC 822. Also, the ':'
106/*	operator is allowed to appear inside addresses, to accommodate DECnet.
107/*	In addition, 8-bit data is not given special treatment.
108/* LICENSE
109/* .ad
110/* .fi
111/*	The Secure Mailer license must be distributed with this software.
112/* AUTHOR(S)
113/*	Wietse Venema
114/*	IBM T.J. Watson Research
115/*	P.O. Box 704
116/*	Yorktown Heights, NY 10598, USA
117/*--*/
118
119/* System library. */
120
121#include <sys_defs.h>
122#include <ctype.h>
123#include <string.h>
124
125/* Utility library. */
126
127#include <vstring.h>
128#include <msg.h>
129#include <stringops.h>
130
131/* Global library. */
132
133#include "lex_822.h"
134#include "quote_822_local.h"
135#include "tok822.h"
136
 /*
  * I suppose this is my favorite macro. Used heavily for tokenizing.
  * 
  * COLLECT(t,s,c,cond) appends characters from the string at s to t->vstr
  * for as long as cond holds. c receives the current character, and cond is
  * an expression (normally in terms of c) that is evaluated for every
  * character that is not a backslash. A backslash itself is not copied; the
  * character that follows it is copied without evaluating cond. A dangling
  * backslash at the end of the string is dropped. Each character that
  * matches IS_SPACE_TAB_CR_LF() is stored as a space character. On exit, s
  * points at the character that stopped the scan (or at the string
  * terminator) and t->vstr is null terminated.
  * 
  * COLLECT_SKIP_LAST() does the same, and then steps over the character that
  * stopped the scan unless that character is the string terminator.
  * 
  * The arguments are expanded more than once, and both s and c are assigned
  * to; pass plain variables only.
  */
#define COLLECT(t,s,c,cond) { \
	while ((c = *(unsigned char *) s) != 0) { \
	    if (c == '\\') { \
		if ((c = *(unsigned char *)++s) == 0) \
		    break; \
	    } else if (!(cond)) { \
		break; \
	    } \
	    VSTRING_ADDCH(t->vstr, IS_SPACE_TAB_CR_LF(c) ? ' ' : c); \
	    s++; \
	} \
	VSTRING_TERMINATE(t->vstr); \
    }

#define COLLECT_SKIP_LAST(t,s,c,cond) { COLLECT(t,s,c,cond); if (*s) s++; }
155
 /*
  * Not quite as complex. The parser depends heavily on it.
  * 
  * SKIP(tp, cond) moves tp towards the start of the list (following the prev
  * links) for as long as cond holds. It also stops at a token whose type is
  * zero; tok822_parse_limit() places such a sentinel token at each end of
  * the list.
  */
#define SKIP(tp, cond) { \
	while (tp->type && (cond)) \
	    tp = tp->prev; \
    }

 /*
  * Unlink the token at tp, hand it to tok822_prepend() together with right
  * (the result becomes the new value of right), and resume with the token
  * that tok822_unlink() returned. This macro ends with a "continue"
  * statement that must reach the loop of the code that expands the macro.
  * Therefore it can be used only directly inside a loop body, and it must
  * not be wrapped in do { ... } while (0).
  */
#define MOVE_COMMENT_AND_CONTINUE(tp, right) { \
	TOK822 *prev = tok822_unlink(tp); \
	right = tok822_prepend(right, tp); \
	tp = prev; \
	continue; \
    }

 /*
  * Like SKIP(), except that each comment token that is encountered on the
  * way is moved aside with MOVE_COMMENT_AND_CONTINUE(). The "continue" in
  * that macro applies to the while loop below.
  */
#define SKIP_MOVE_COMMENT(tp, cond, right) { \
	while (tp->type && (cond)) { \
	    if (tp->type == TOK822_COMMENT) \
		MOVE_COMMENT_AND_CONTINUE(tp, right); \
	    tp = tp->prev; \
	} \
    }
178
179 /*
180  * Single-character operators. We include the % and ! operators because not
181  * all the world is RFC822. XXX Make this operator list configurable when we
182  * have a real rewriting language. Include | for aliases file parsing.
183  */
184static char tok822_opchar[] = "|%!" LEX_822_SPECIALS;
185static void tok822_quote_atom(TOK822 *);
186static const char *tok822_comment(TOK822 *, const char *);
187static TOK822 *tok822_group(int, TOK822 *, TOK822 *, int);
188static void tok822_copy_quoted(VSTRING *, char *, char *);
189static int tok822_append_space(TOK822 *);
190
191#define DO_WORD		(1<<0)		/* finding a word is ok here */
192#define DO_GROUP	(1<<1)		/* doing an address group */
193
194#define ADD_COMMA	','		/* resynchronize */
195#define NO_MISSING_COMMA 0
196
197/* tok822_internalize - token tree to string, internal form */
198
199VSTRING *tok822_internalize(VSTRING *vp, TOK822 *tree, int flags)
200{
201    TOK822 *tp;
202
203    if (flags & TOK822_STR_WIPE)
204	VSTRING_RESET(vp);
205
206    for (tp = tree; tp; tp = tp->next) {
207	switch (tp->type) {
208	case ',':
209	    VSTRING_ADDCH(vp, tp->type);
210	    if (flags & TOK822_STR_LINE) {
211		VSTRING_ADDCH(vp, '\n');
212		continue;
213	    }
214	    break;
215	case TOK822_ADDR:
216	    tok822_internalize(vp, tp->head, TOK822_STR_NONE);
217	    break;
218	case TOK822_COMMENT:
219	case TOK822_ATOM:
220	case TOK822_QSTRING:
221	    vstring_strcat(vp, vstring_str(tp->vstr));
222	    break;
223	case TOK822_DOMLIT:
224	    VSTRING_ADDCH(vp, '[');
225	    vstring_strcat(vp, vstring_str(tp->vstr));
226	    VSTRING_ADDCH(vp, ']');
227	    break;
228	case TOK822_STARTGRP:
229	    VSTRING_ADDCH(vp, ':');
230	    break;
231	default:
232	    if (tp->type >= TOK822_MINTOK)
233		msg_panic("tok822_internalize: unknown operator %d", tp->type);
234	    VSTRING_ADDCH(vp, tp->type);
235	}
236	if (tok822_append_space(tp))
237	    VSTRING_ADDCH(vp, ' ');
238    }
239    if (flags & TOK822_STR_TERM)
240	VSTRING_TERMINATE(vp);
241    return (vp);
242}
243
244/* strip_address - strip non-address text from address expression */
245
246static void strip_address(VSTRING *vp, ssize_t start, TOK822 *addr)
247{
248    VSTRING *tmp;
249
250    /*
251     * Emit plain <address>. Discard any comments or phrases.
252     */
253    VSTRING_TERMINATE(vp);
254    msg_warn("stripping too many comments from address: %.100s...",
255	     printable(vstring_str(vp) + start, '?'));
256    vstring_truncate(vp, start);
257    VSTRING_ADDCH(vp, '<');
258    if (addr) {
259	tmp = vstring_alloc(100);
260	tok822_internalize(tmp, addr, TOK822_STR_TERM);
261	quote_822_local_flags(vp, vstring_str(tmp),
262			      QUOTE_FLAG_8BITCLEAN | QUOTE_FLAG_APPEND);
263	vstring_free(tmp);
264    }
265    VSTRING_ADDCH(vp, '>');
266}
267
268/* tok822_externalize - token tree to string, external form */
269
270VSTRING *tok822_externalize(VSTRING *vp, TOK822 *tree, int flags)
271{
272    VSTRING *tmp;
273    TOK822 *tp;
274    ssize_t start;
275    TOK822 *addr;
276    ssize_t addr_len;
277
278    /*
279     * Guard against a Sendmail buffer overflow (CERT advisory CA-2003-07).
280     * The problem was that Sendmail could store too much non-address text
281     * (comments, phrases, etc.) into a static 256-byte buffer.
282     *
283     * When the buffer fills up, fixed Sendmail versions remove comments etc.
284     * and reduce the information to just <$g>, which expands to <address>.
285     * No change is made when an address expression (text separated by
286     * commas) contains no address. This fix reportedly also protects
287     * Sendmail systems that are still vulnerable to this problem.
288     *
289     * Postfix takes the same approach, grudgingly. To avoid unnecessary damage,
290     * Postfix removes comments etc. only when the amount of non-address text
291     * in an address expression (text separated by commas) exceeds 250 bytes.
292     *
293     * With Sendmail, the address part of an address expression is the
294     * right-most <> instance in that expression. If an address expression
295     * contains no <>, then Postfix guarantees that it contains at most one
296     * non-comment string; that string is the address part of the address
297     * expression, so there is no ambiguity.
298     *
299     * Finally, we note that stress testing shows that other code in Sendmail
300     * 8.12.8 bluntly truncates ``text <address>'' to 256 bytes even when
301     * this means chopping the <address> somewhere in the middle. This is a
302     * loss of control that we're not entirely comfortable with. However,
303     * unbalanced quotes and dangling backslash do not seem to influence the
304     * way that Sendmail parses headers, so this is not an urgent problem.
305     */
306#define MAX_NONADDR_LENGTH 250
307
308#define RESET_NONADDR_LENGTH { \
309	start = VSTRING_LEN(vp); \
310	addr = 0; \
311	addr_len = 0; \
312    }
313
314#define ENFORCE_NONADDR_LENGTH do { \
315	if (addr && VSTRING_LEN(vp) - addr_len > start + MAX_NONADDR_LENGTH) \
316	    strip_address(vp, start, addr->head); \
317    } while(0)
318
319    if (flags & TOK822_STR_WIPE)
320	VSTRING_RESET(vp);
321
322    if (flags & TOK822_STR_TRNC)
323	RESET_NONADDR_LENGTH;
324
325    for (tp = tree; tp; tp = tp->next) {
326	switch (tp->type) {
327	case ',':
328	    if (flags & TOK822_STR_TRNC)
329		ENFORCE_NONADDR_LENGTH;
330	    VSTRING_ADDCH(vp, tp->type);
331	    VSTRING_ADDCH(vp, (flags & TOK822_STR_LINE) ? '\n' : ' ');
332	    if (flags & TOK822_STR_TRNC)
333		RESET_NONADDR_LENGTH;
334	    continue;
335
336	    /*
337	     * XXX In order to correctly externalize an address, it is not
338	     * sufficient to quote individual atoms. There are higher-level
339	     * rules that say when an address localpart needs to be quoted.
340	     * We wing it with the quote_822_local() routine, which ignores
341	     * the issue of atoms in the domain part that would need quoting.
342	     */
343	case TOK822_ADDR:
344	    addr = tp;
345	    tmp = vstring_alloc(100);
346	    tok822_internalize(tmp, tp->head, TOK822_STR_TERM);
347	    addr_len = VSTRING_LEN(vp);
348	    quote_822_local_flags(vp, vstring_str(tmp),
349				  QUOTE_FLAG_8BITCLEAN | QUOTE_FLAG_APPEND);
350	    addr_len = VSTRING_LEN(vp) - addr_len;
351	    vstring_free(tmp);
352	    break;
353	case TOK822_ATOM:
354	case TOK822_COMMENT:
355	    vstring_strcat(vp, vstring_str(tp->vstr));
356	    break;
357	case TOK822_QSTRING:
358	    VSTRING_ADDCH(vp, '"');
359	    tok822_copy_quoted(vp, vstring_str(tp->vstr), "\"\\\r\n");
360	    VSTRING_ADDCH(vp, '"');
361	    break;
362	case TOK822_DOMLIT:
363	    VSTRING_ADDCH(vp, '[');
364	    tok822_copy_quoted(vp, vstring_str(tp->vstr), "\\\r\n");
365	    VSTRING_ADDCH(vp, ']');
366	    break;
367	case TOK822_STARTGRP:
368	    VSTRING_ADDCH(vp, ':');
369	    break;
370	case '<':
371	    if (tp->next && tp->next->type == '>') {
372		addr = tp;
373		addr_len = 0;
374	    }
375	    VSTRING_ADDCH(vp, '<');
376	    break;
377	default:
378	    if (tp->type >= TOK822_MINTOK)
379		msg_panic("tok822_externalize: unknown operator %d", tp->type);
380	    VSTRING_ADDCH(vp, tp->type);
381	}
382	if (tok822_append_space(tp))
383	    VSTRING_ADDCH(vp, ' ');
384    }
385    if (flags & TOK822_STR_TRNC)
386	ENFORCE_NONADDR_LENGTH;
387
388    if (flags & TOK822_STR_TERM)
389	VSTRING_TERMINATE(vp);
390    return (vp);
391}
392
393/* tok822_copy_quoted - copy a string while quoting */
394
395static void tok822_copy_quoted(VSTRING *vp, char *str, char *quote_set)
396{
397    int     ch;
398
399    while ((ch = *(unsigned char *) str++) != 0) {
400	if (strchr(quote_set, ch))
401	    VSTRING_ADDCH(vp, '\\');
402	VSTRING_ADDCH(vp, ch);
403    }
404}
405
406/* tok822_append_space - see if space is needed after this token */
407
408static int tok822_append_space(TOK822 *tp)
409{
410    TOK822 *next;
411
412    if (tp == 0 || (next = tp->next) == 0 || tp->owner != 0)
413	return (0);
414    if (tp->type == ',' || tp->type == TOK822_STARTGRP || next->type == '<')
415	return (1);
416
417#define NON_OPERATOR(x) \
418    (x->type == TOK822_ATOM || x->type == TOK822_QSTRING \
419     || x->type == TOK822_COMMENT || x->type == TOK822_DOMLIT \
420     || x->type == TOK822_ADDR)
421
422    return (NON_OPERATOR(tp) && NON_OPERATOR(next));
423}
424
425/* tok822_scan_limit - tokenize string */
426
427TOK822 *tok822_scan_limit(const char *str, TOK822 **tailp, int tok_count_limit)
428{
429    TOK822 *head = 0;
430    TOK822 *tail = 0;
431    TOK822 *tp;
432    int     ch;
433    int     tok_count = 0;
434
435    /*
436     * XXX 2822 new feature: Section 4.1 allows "." to appear in a phrase (to
437     * allow for forms such as: Johnny B. Goode <johhny@domain.org>. I cannot
438     * handle that at the tokenizer level - it is not context sensitive. And
439     * to fix this at the parser level requires radical changes to preserve
440     * white space as part of the token stream. Thanks a lot, people.
441     */
442    while ((ch = *(unsigned char *) str++) != 0) {
443	if (IS_SPACE_TAB_CR_LF(ch))
444	    continue;
445	if (ch == '(') {
446	    tp = tok822_alloc(TOK822_COMMENT, (char *) 0);
447	    str = tok822_comment(tp, str);
448	} else if (ch == '[') {
449	    tp = tok822_alloc(TOK822_DOMLIT, (char *) 0);
450	    COLLECT_SKIP_LAST(tp, str, ch, ch != ']');
451	} else if (ch == '"') {
452	    tp = tok822_alloc(TOK822_QSTRING, (char *) 0);
453	    COLLECT_SKIP_LAST(tp, str, ch, ch != '"');
454	} else if (ch != '\\' && strchr(tok822_opchar, ch)) {
455	    tp = tok822_alloc(ch, (char *) 0);
456	} else {
457	    tp = tok822_alloc(TOK822_ATOM, (char *) 0);
458	    str -= 1;				/* \ may be first */
459	    COLLECT(tp, str, ch, !IS_SPACE_TAB_CR_LF(ch) && !strchr(tok822_opchar, ch));
460	    tok822_quote_atom(tp);
461	}
462	if (head == 0) {
463	    head = tail = tp;
464	    while (tail->next)
465		tail = tail->next;
466	} else {
467	    tail = tok822_append(tail, tp);
468	}
469	if (tok_count_limit > 0 && ++tok_count >= tok_count_limit)
470	    break;
471    }
472    if (tailp)
473	*tailp = tail;
474    return (head);
475}
476
/* tok822_parse_limit - translate external string to token tree */

 /*
  * The string is tokenized with tok822_scan_limit(), which also receives the
  * tok_count_limit argument. The flat token list is then scanned from right
  * to left, and address information is clustered under TOK822_ADDR nodes
  * with tok822_group(). The result is the first token of the resulting
  * list, or a null pointer when the tokenizer produced no tokens.
  */
TOK822 *tok822_parse_limit(const char *str, int tok_count_limit)
{
    TOK822 *head;
    TOK822 *tail;
    TOK822 *right;
    TOK822 *first_token;
    TOK822 *last_token;
    TOK822 *tp;
    int     state;

    /*
     * First, tokenize the string, from left to right. We are not allowed to
     * throw away any information that we do not understand. With a flat
     * token list that contains all tokens, we can always convert back to
     * string form.
     */
    if ((first_token = tok822_scan_limit(str, &last_token, tok_count_limit)) == 0)
	return (0);

    /*
     * For convenience, sandwich the token list between two sentinel tokens.
     * The sentinels have type zero, which is what terminates the main loop
     * below as well as the SKIP() and SKIP_MOVE_COMMENT() macros.
     */
#define GLUE(left,rite) { left->next = rite; rite->prev = left; }

    head = tok822_alloc(0, (char *) 0);
    GLUE(head, first_token);
    tail = tok822_alloc(0, (char *) 0);
    GLUE(last_token, tail);

    /*
     * Next step is to transform the token list into a parse tree. This is
     * done most conveniently from right to left. If there is something that
     * we do not understand, just leave it alone, don't throw it away. The
     * address information that we're looking for sits in-between the current
     * node (tp) and the one called right. Add missing commas on the fly.
     * 
     * The DO_WORD state bit is cleared after a word (atom, quoted string or
     * domain literal) and set again after any other token. A word that is
     * found while DO_WORD is clear follows another word without an operator
     * in between; this is where a comma is considered missing. The DO_GROUP
     * state bit is set after a ';' and is what allows a ':' to be treated
     * as the start of a named group.
     * 
     * Branches that reposition tp themselves end with "continue", in order to
     * bypass the "tp = tp->prev" at the bottom of the loop.
     */
    state = DO_WORD;
    right = tail;
    tp = tail->prev;
    while (tp->type) {
	if (tp->type == TOK822_COMMENT) {	/* move comment to the side */
	    MOVE_COMMENT_AND_CONTINUE(tp, right);
	} else if (tp->type == ';') {		/* rh side of named group */
	    right = tok822_group(TOK822_ADDR, tp, right, ADD_COMMA);
	    state = DO_GROUP | DO_WORD;
	} else if (tp->type == ':' && (state & DO_GROUP) != 0) {
	    /* lh side of named group: retype the ':' and go back to ',' */
	    tp->type = TOK822_STARTGRP;
	    (void) tok822_group(TOK822_ADDR, tp, right, NO_MISSING_COMMA);
	    SKIP(tp, tp->type != ',');
	    right = tp;
	    continue;
	} else if (tp->type == '>') {		/* rh side of <route> */
	    right = tok822_group(TOK822_ADDR, tp, right, ADD_COMMA);
	    /* go back to the matching '<', moving comments aside */
	    SKIP_MOVE_COMMENT(tp, tp->type != '<', right);
	    (void) tok822_group(TOK822_ADDR, tp, right, NO_MISSING_COMMA);
	    /* leave the text before '<' alone, up to the nearest > ; , or : */
	    SKIP(tp, tp->type > 0xff || strchr(">;,:", tp->type) == 0);
	    right = tp;
	    state |= DO_WORD;
	    continue;
	} else if (tp->type == TOK822_ATOM || tp->type == TOK822_QSTRING
		   || tp->type == TOK822_DOMLIT) {
	    /* word after word: group what is on the right, add a comma */
	    if ((state & DO_WORD) == 0)
		right = tok822_group(TOK822_ADDR, tp, right, ADD_COMMA)->next;
	    state &= ~DO_WORD;
	} else if (tp->type == ',') {
	    right = tok822_group(TOK822_ADDR, tp, right, NO_MISSING_COMMA);
	    state |= DO_WORD;
	} else {
	    /* any other operator: a word may precede it */
	    state |= DO_WORD;
	}
	tp = tp->prev;
    }
    /* tp is the left sentinel here; group whatever is left over */
    (void) tok822_group(TOK822_ADDR, tp, right, NO_MISSING_COMMA);

    /*
     * Discard the sentinel tokens on the left and right extremes. Properly
     * terminate the resulting list.
     */
    tp = (head->next != tail ? head->next : 0);
    tok822_cut_before(head->next);
    tok822_free(head);
    tok822_cut_before(tail);
    tok822_free(tail);
    return (tp);
}
564
565/* tok822_quote_atom - see if an atom needs quoting when externalized */
566
567static void tok822_quote_atom(TOK822 *tp)
568{
569    char   *cp;
570    int     ch;
571
572    /*
573     * RFC 822 expects 7-bit data. Rather than quoting every 8-bit character
574     * (and still passing it on as 8-bit data) we leave 8-bit data alone.
575     */
576    for (cp = vstring_str(tp->vstr); (ch = *(unsigned char *) cp) != 0; cp++) {
577	if ( /* !ISASCII(ch) || */ ch == ' '
578	    || ISCNTRL(ch) || strchr(tok822_opchar, ch)) {
579	    tp->type = TOK822_QSTRING;
580	    break;
581	}
582    }
583}
584
585/* tok822_comment - tokenize comment */
586
587static const char *tok822_comment(TOK822 *tp, const char *str)
588{
589    int     level = 1;
590    int     ch;
591
592    /*
593     * XXX We cheat by storing comments in their external form. Otherwise it
594     * would be a royal pain to preserve \ before (. That would require a
595     * recursive parser; the easy to implement stack-based recursion would be
596     * too expensive.
597     */
598    VSTRING_ADDCH(tp->vstr, '(');
599
600    while ((ch = *(unsigned char *) str) != 0) {
601	VSTRING_ADDCH(tp->vstr, ch);
602	str++;
603	if (ch == '(') {			/* comments can nest! */
604	    level++;
605	} else if (ch == ')') {
606	    if (--level == 0)
607		break;
608	} else if (ch == '\\') {
609	    if ((ch = *(unsigned char *) str) == 0)
610		break;
611	    VSTRING_ADDCH(tp->vstr, ch);
612	    str++;
613	}
614    }
615    VSTRING_TERMINATE(tp->vstr);
616    return (str);
617}
618
619/* tok822_group - cluster a group of tokens */
620
621static TOK822 *tok822_group(int group_type, TOK822 *left, TOK822 *right, int sync_type)
622{
623    TOK822 *group;
624    TOK822 *sync;
625    TOK822 *first;
626
627    /*
628     * Cluster the tokens between left and right under their own parse tree
629     * node. Optionally insert a resync token.
630     */
631    if (left != right && (first = left->next) != right) {
632	tok822_cut_before(right);
633	tok822_cut_before(first);
634	group = tok822_alloc(group_type, (char *) 0);
635	tok822_sub_append(group, first);
636	tok822_append(left, group);
637	tok822_append(group, right);
638	if (sync_type) {
639	    sync = tok822_alloc(sync_type, (char *) 0);
640	    tok822_append(left, sync);
641	}
642    }
643    return (left);
644}
645
646/* tok822_scan_addr - convert external address string to address token */
647
648TOK822 *tok822_scan_addr(const char *addr)
649{
650    TOK822 *tree = tok822_alloc(TOK822_ADDR, (char *) 0);
651
652    tree->head = tok822_scan(addr, &tree->tail);
653    return (tree);
654}
655
656#ifdef TEST
657
658#include <unistd.h>
659#include <vstream.h>
660#include <readlline.h>
661
662/* tok822_print - display token */
663
664static void tok822_print(TOK822 *list, int indent)
665{
666    TOK822 *tp;
667
668    for (tp = list; tp; tp = tp->next) {
669	if (tp->type < TOK822_MINTOK) {
670	    vstream_printf("%*s %s \"%c\"\n", indent, "", "OP", tp->type);
671	} else if (tp->type == TOK822_ADDR) {
672	    vstream_printf("%*s %s\n", indent, "", "address");
673	    tok822_print(tp->head, indent + 2);
674	} else if (tp->type == TOK822_STARTGRP) {
675	    vstream_printf("%*s %s\n", indent, "", "group \":\"");
676	} else {
677	    vstream_printf("%*s %s \"%s\"\n", indent, "",
678			   tp->type == TOK822_COMMENT ? "comment" :
679			   tp->type == TOK822_ATOM ? "atom" :
680			   tp->type == TOK822_QSTRING ? "quoted string" :
681			   tp->type == TOK822_DOMLIT ? "domain literal" :
682			   tp->type == TOK822_ADDR ? "address" :
683			   "unknown\n", vstring_str(tp->vstr));
684	}
685    }
686}
687
688int     main(int unused_argc, char **unused_argv)
689{
690    VSTRING *vp = vstring_alloc(100);
691    TOK822 *list;
692    VSTRING *buf = vstring_alloc(100);
693
694#define TEST_TOKEN_LIMIT 20
695
696    while (readlline(buf, VSTREAM_IN, (int *) 0)) {
697	while (VSTRING_LEN(buf) > 0 && vstring_end(buf)[-1] == '\n') {
698	    vstring_end(buf)[-1] = 0;
699	    vstring_truncate(buf, VSTRING_LEN(buf) - 1);
700	}
701	if (!isatty(vstream_fileno(VSTREAM_IN)))
702	    vstream_printf(">>>%s<<<\n\n", vstring_str(buf));
703	list = tok822_parse_limit(vstring_str(buf), TEST_TOKEN_LIMIT);
704	vstream_printf("Parse tree:\n");
705	tok822_print(list, 0);
706	vstream_printf("\n");
707
708	vstream_printf("Internalized:\n%s\n\n",
709		vstring_str(tok822_internalize(vp, list, TOK822_STR_DEFL)));
710	vstream_fflush(VSTREAM_OUT);
711	vstream_printf("Externalized, no newlines inserted:\n%s\n\n",
712		       vstring_str(tok822_externalize(vp, list,
713				       TOK822_STR_DEFL | TOK822_STR_TRNC)));
714	vstream_fflush(VSTREAM_OUT);
715	vstream_printf("Externalized, newlines inserted:\n%s\n\n",
716		       vstring_str(tok822_externalize(vp, list,
717		     TOK822_STR_DEFL | TOK822_STR_LINE | TOK822_STR_TRNC)));
718	vstream_fflush(VSTREAM_OUT);
719	tok822_free_tree(list);
720    }
721    vstring_free(vp);
722    vstring_free(buf);
723    return (0);
724}
725
726#endif
727