compile.c revision 171206
1164640Sflz/*-
298186Sgordon * Copyright (c) 1992 Diomidis Spinellis.
378344Sobrien * Copyright (c) 1992, 1993
4157473Sflz *	The Regents of the University of California.  All rights reserved.
578344Sobrien *
678344Sobrien * This code is derived from software contributed to Berkeley by
778344Sobrien * Diomidis Spinellis of Imperial College, University of London.
878344Sobrien *
978344Sobrien * Redistribution and use in source and binary forms, with or without
1078344Sobrien * modification, are permitted provided that the following conditions
1178344Sobrien * are met:
1278344Sobrien * 1. Redistributions of source code must retain the above copyright
1378344Sobrien *    notice, this list of conditions and the following disclaimer.
1478344Sobrien * 2. Redistributions in binary form must reproduce the above copyright
1578344Sobrien *    notice, this list of conditions and the following disclaimer in the
1678344Sobrien *    documentation and/or other materials provided with the distribution.
1778344Sobrien * 4. Neither the name of the University nor the names of its contributors
1878344Sobrien *    may be used to endorse or promote products derived from this software
1978344Sobrien *    without specific prior written permission.
2078344Sobrien *
2178344Sobrien * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
2278344Sobrien * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
2378344Sobrien * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
2478344Sobrien * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
2578344Sobrien * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
2678344Sobrien * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
2778344Sobrien * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
2878344Sobrien * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
2978344Sobrien * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
3078344Sobrien * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
3178344Sobrien * SUCH DAMAGE.
3278344Sobrien */
3378344Sobrien
3478344Sobrien#include <sys/cdefs.h>
35157473Sflz__FBSDID("$FreeBSD: head/usr.bin/sed/compile.c 171206 2007-07-04 16:42:41Z ssouhlal $");
36169668Smtm
37157473Sflz#ifndef lint
3878344Sobrienstatic const char sccsid[] = "@(#)compile.c	8.1 (Berkeley) 6/6/93";
3998186Sgordon#endif
4098186Sgordon
4198186Sgordon#include <sys/types.h>
42131550Scperciva#include <sys/stat.h>
43131550Scperciva
44131550Scperciva#include <ctype.h>
45131550Scperciva#include <err.h>
4698186Sgordon#include <errno.h>
4798186Sgordon#include <fcntl.h>
48202988Semaste#include <limits.h>
49124832Smtm#include <regex.h>
50124832Smtm#include <stdio.h>
51161435Syar#include <stdlib.h>
52161435Syar#include <string.h>
5398186Sgordon#include <wchar.h>
5498186Sgordon
5578344Sobrien#include "defs.h"
5678344Sobrien#include "extern.h"
5778344Sobrien
58197144Shrs#define LHSZ	128
5978344Sobrien#define	LHMASK	(LHSZ - 1)
60197144Shrsstatic struct labhash {
61197144Shrs	struct	labhash *lh_next;
6298186Sgordon	u_int	lh_hash;
63197144Shrs	struct	s_command *lh_cmd;
64197144Shrs	int	lh_ref;
65197144Shrs} *labels[LHSZ];
66197144Shrs
/* Forward declarations of the file-local helpers. */
static char	 *compile_addr(char *p, struct s_addr *a);
static char	 *compile_ccl(char **sp, char *t);
static char	 *compile_delimited(char *p, char *d);
static char	 *compile_flags(char *p, struct s_subst *s);
static regex_t	 *compile_re(char *re, int case_insensitive);
static char	 *compile_subst(char *p, struct s_subst *s);
static char	 *compile_text(void);
static char	 *compile_tr(char *p, struct s_tr **py);
static struct s_command
		**compile_stream(struct s_command **link);
static char	 *duptoeol(char *s, const char *ctype);
static void	  enterlabel(struct s_command *cp);
static struct s_command
		 *findlabel(char *name);
static void	  fixuplabel(struct s_command *cp, struct s_command *end);
static void	  uselabel(void);
83197144Shrs
84197144Shrs/*
85197144Shrs * Command specification.  This is used to drive the command parser.
86197144Shrs */
87197144Shrsstruct s_format {
88197144Shrs	char code;				/* Command code */
89197144Shrs	int naddr;				/* Number of address args */
90197144Shrs	enum e_args args;			/* Argument type */
91197144Shrs};
92197144Shrs
93197144Shrsstatic struct s_format cmd_fmts[] = {
94197144Shrs	{'{', 2, GROUP},
9598186Sgordon	{'}', 0, ENDGROUP},
9698186Sgordon	{'a', 1, TEXT},
9798186Sgordon	{'b', 2, BRANCH},
9898186Sgordon	{'c', 2, TEXT},
99197144Shrs	{'d', 2, EMPTY},
100197144Shrs	{'D', 2, EMPTY},
101197144Shrs	{'g', 2, EMPTY},
10298186Sgordon	{'G', 2, EMPTY},
103197144Shrs	{'h', 2, EMPTY},
104197144Shrs	{'H', 2, EMPTY},
105197144Shrs	{'i', 1, TEXT},
106197144Shrs	{'l', 2, EMPTY},
107197144Shrs	{'n', 2, EMPTY},
108197144Shrs	{'N', 2, EMPTY},
109197144Shrs	{'p', 2, EMPTY},
110197144Shrs	{'P', 2, EMPTY},
111197144Shrs	{'q', 1, EMPTY},
112197144Shrs	{'r', 1, RFILE},
113197144Shrs	{'s', 2, SUBST},
114197144Shrs	{'t', 2, BRANCH},
115197144Shrs	{'w', 2, WFILE},
11698186Sgordon	{'x', 2, EMPTY},
11798186Sgordon	{'y', 2, TR},
11898186Sgordon	{'!', 2, NONSEL},
11998186Sgordon	{':', 0, LABEL},
12098186Sgordon	{'#', 0, COMMENT},
12198186Sgordon	{'=', 1, EMPTY},
12298186Sgordon	{'\0', 0, COMMENT},
12398186Sgordon};
12498186Sgordon
/* Head of the compiled program; filled in by compile(). */
struct s_command *prog;
127146490Sschweikh
12898186Sgordon/*
12998186Sgordon * Compile the program into prog.
13098186Sgordon * Initialise appends.
13198186Sgordon */
13298186Sgordonvoid
13398186Sgordoncompile(void)
13498186Sgordon{
13578344Sobrien	*compile_stream(&prog) = NULL;
13678344Sobrien	fixuplabel(prog, NULL);
13778344Sobrien	uselabel();
13878344Sobrien	if (appendnum == 0)
13978344Sobrien		appends = NULL;
14078344Sobrien	else if ((appends = malloc(sizeof(struct s_appends) * appendnum)) ==
14178344Sobrien	    NULL)
14298186Sgordon		err(1, "malloc");
14378344Sobrien	if ((match = malloc((maxnsub + 1) * sizeof(regmatch_t))) == NULL)
14478344Sobrien		err(1, "malloc");
14578344Sobrien}
14678344Sobrien
/*
 * Advance the local variable `p' of the invoking function past any
 * leading whitespace.  A NULL `p' is left untouched.
 */
#define EATSPACE() do {							\
	if (p != NULL)							\
		for (; *p != '\0' && isspace((unsigned char)*p); p++)	\
			continue;					\
} while (0)
15278344Sobrien
15378344Sobrienstatic struct s_command **
15478344Sobriencompile_stream(struct s_command **link)
155157473Sflz{
15678344Sobrien	char *p;
15778344Sobrien	static char lbuf[_POSIX2_LINE_MAX + 1];	/* To save stack */
15878344Sobrien	struct s_command *cmd, *cmd2, *stack;
15978344Sobrien	struct s_format *fp;
16078344Sobrien	char re[_POSIX2_LINE_MAX + 1];
161157473Sflz	int naddr;				/* Number of addresses */
16298186Sgordon
16398186Sgordon	stack = 0;
16478344Sobrien	for (;;) {
16598186Sgordon		if ((p = cu_fgets(lbuf, sizeof(lbuf), NULL)) == NULL) {
16698186Sgordon			if (stack != 0)
16798186Sgordon				errx(1, "%lu: %s: unexpected EOF (pending }'s)",
168126286Smtm							linenum, fname);
16998186Sgordon			return (link);
17098186Sgordon		}
17198186Sgordon
17298186Sgordonsemicolon:	EATSPACE();
17398186Sgordon		if (p) {
174169668Smtm			if (*p == '#' || *p == '\0')
175169668Smtm				continue;
176169668Smtm			else if (*p == ';') {
177169668Smtm				p++;
17878344Sobrien				goto semicolon;
179169668Smtm			}
180169668Smtm		}
181169668Smtm		if ((*link = cmd = malloc(sizeof(struct s_command))) == NULL)
182169668Smtm			err(1, "malloc");
183178776Smaxim		link = &cmd->next;
184178776Smaxim		cmd->nonsel = cmd->inrange = 0;
185178770Smtm		/* First parse the addresses */
186169668Smtm		naddr = 0;
187178770Smtm
188178770Smtm/* Valid characters to start an address */
189169668Smtm#define	addrchar(c)	(strchr("0123456789/\\$", (c)))
190178770Smtm		if (addrchar(*p)) {
191178775Smaxim			naddr++;
192169668Smtm			if ((cmd->a1 = malloc(sizeof(struct s_addr))) == NULL)
193169668Smtm				err(1, "malloc");
194169668Smtm			p = compile_addr(p, cmd->a1);
195169668Smtm			EATSPACE();				/* EXTENSION */
196169668Smtm			if (*p == ',') {
197169668Smtm				p++;
198169668Smtm				EATSPACE();			/* EXTENSION */
199169668Smtm				naddr++;
20098186Sgordon				if ((cmd->a2 = malloc(sizeof(struct s_addr)))
20198186Sgordon				    == NULL)
20298186Sgordon					err(1, "malloc");
20398186Sgordon				p = compile_addr(p, cmd->a2);
20498186Sgordon				EATSPACE();
20578344Sobrien			} else
20678344Sobrien				cmd->a2 = 0;
20798186Sgordon		} else
20878344Sobrien			cmd->a1 = cmd->a2 = 0;
20978344Sobrien
210126285Smtmnonsel:		/* Now parse the command */
21178344Sobrien		if (!*p)
21278344Sobrien			errx(1, "%lu: %s: command expected", linenum, fname);
213126285Smtm		cmd->code = *p;
21478344Sobrien		for (fp = cmd_fmts; fp->code; fp++)
21578344Sobrien			if (fp->code == *p)
216126285Smtm				break;
217126285Smtm		if (!fp->code)
218126285Smtm			errx(1, "%lu: %s: invalid command code %c", linenum, fname, *p);
21978344Sobrien		if (naddr > fp->naddr)
22078344Sobrien			errx(1,
22198186Sgordon				"%lu: %s: command %c expects up to %d address(es), found %d",
22278344Sobrien				linenum, fname, *p, fp->naddr, naddr);
22378344Sobrien		switch (fp->args) {
22478344Sobrien		case NONSEL:			/* ! */
22578344Sobrien			p++;
22698186Sgordon			EATSPACE();
22798186Sgordon			cmd->nonsel = ! cmd->nonsel;
22878344Sobrien			goto nonsel;
22998186Sgordon		case GROUP:			/* { */
23098186Sgordon			p++;
23178344Sobrien			EATSPACE();
23278344Sobrien			cmd->next = stack;
23378344Sobrien			stack = cmd;
23478344Sobrien			link = &cmd->u.c;
23578344Sobrien			if (*p)
23698186Sgordon				goto semicolon;
23778344Sobrien			break;
23898186Sgordon		case ENDGROUP:
23978344Sobrien			/*
24078344Sobrien			 * Short-circuit command processing, since end of
241131061Smtm			 * group is really just a noop.
24278344Sobrien			 */
24378344Sobrien			cmd->nonsel = 1;
24478344Sobrien			if (stack == 0)
24578344Sobrien				errx(1, "%lu: %s: unexpected }", linenum, fname);
246139949Skeramida			cmd2 = stack;
24778344Sobrien			stack = cmd2->next;
24878344Sobrien			cmd2->next = cmd;
24998186Sgordon			/*FALLTHROUGH*/
25078344Sobrien		case EMPTY:		/* d D g G h H l n N p P q x = \0 */
25178344Sobrien			p++;
25278344Sobrien			EATSPACE();
25398186Sgordon			if (*p == ';') {
25478344Sobrien				p++;
25598186Sgordon				link = &cmd->next;
25698186Sgordon				goto semicolon;
25778344Sobrien			}
25878344Sobrien			if (*p)
25978344Sobrien				errx(1, "%lu: %s: extra characters at the end of %c command",
26078344Sobrien						linenum, fname, cmd->code);
26198186Sgordon			break;
26278344Sobrien		case TEXT:			/* a c i */
26398186Sgordon			p++;
26478344Sobrien			EATSPACE();
26598186Sgordon			if (*p != '\\')
26698186Sgordon				errx(1,
26798186Sgordon"%lu: %s: command %c expects \\ followed by text", linenum, fname, cmd->code);
26898186Sgordon			p++;
26998186Sgordon			EATSPACE();
27098186Sgordon			if (*p)
27198186Sgordon				errx(1,
27298186Sgordon				"%lu: %s: extra characters after \\ at the end of %c command",
27398186Sgordon				linenum, fname, cmd->code);
27498186Sgordon			cmd->t = compile_text();
27598186Sgordon			break;
27698186Sgordon		case COMMENT:			/* \0 # */
27798186Sgordon			break;
27898186Sgordon		case WFILE:			/* w */
279155719Sceri			p++;
28098186Sgordon			EATSPACE();
28198186Sgordon			if (*p == '\0')
28298186Sgordon				errx(1, "%lu: %s: filename expected", linenum, fname);
28398186Sgordon			cmd->t = duptoeol(p, "w command");
284157841Sflz			if (aflag)
285157841Sflz				cmd->u.fd = -1;
286157841Sflz			else if ((cmd->u.fd = open(p,
28798186Sgordon			    O_WRONLY|O_APPEND|O_CREAT|O_TRUNC,
28898186Sgordon			    DEFFILEMODE)) == -1)
28998186Sgordon				err(1, "%s", p);
29098186Sgordon			break;
29198186Sgordon		case RFILE:			/* r */
29298186Sgordon			p++;
29398186Sgordon			EATSPACE();
29498186Sgordon			if (*p == '\0')
29598186Sgordon				errx(1, "%lu: %s: filename expected", linenum, fname);
29698186Sgordon			else
29778344Sobrien				cmd->t = duptoeol(p, "read command");
29898186Sgordon			break;
299170282Syar		case BRANCH:			/* b t */
300170282Syar			p++;
301170282Syar			EATSPACE();
302170282Syar			if (*p == '\0')
303170282Syar				cmd->t = NULL;
304170282Syar			else
305170282Syar				cmd->t = duptoeol(p, "branch");
306170282Syar			break;
307170282Syar		case LABEL:			/* : */
308170282Syar			p++;
309170282Syar			EATSPACE();
310170282Syar			cmd->t = duptoeol(p, "label");
311170282Syar			if (strlen(p) == 0)
312170282Syar				errx(1, "%lu: %s: empty label", linenum, fname);
313170282Syar			enterlabel(cmd);
314170282Syar			break;
315170282Syar		case SUBST:			/* s */
316170282Syar			p++;
317170282Syar			if (*p == '\0' || *p == '\\')
318170282Syar				errx(1,
319170282Syar"%lu: %s: substitute pattern can not be delimited by newline or backslash",
320170282Syar					linenum, fname);
321170282Syar			if ((cmd->u.s = malloc(sizeof(struct s_subst))) == NULL)
322170282Syar				err(1, "malloc");
32378344Sobrien			p = compile_delimited(p, re);
32498186Sgordon			if (p == NULL)
325157841Sflz				errx(1,
32698186Sgordon				"%lu: %s: unterminated substitute pattern", linenum, fname);
32798186Sgordon			if (*re == '\0')
328157841Sflz				cmd->u.s->re = NULL;
32998186Sgordon			else
33098186Sgordon				cmd->u.s->re = compile_re(re, cmd->u.s->icase);
33198186Sgordon			--p;
33298186Sgordon			p = compile_subst(p, cmd->u.s);
333151426Sjhb			p = compile_flags(p, cmd->u.s);
33498186Sgordon			EATSPACE();
33598186Sgordon			if (*p == ';') {
336161435Syar				p++;
337161436Syar				link = &cmd->next;
338157657Sflz				goto semicolon;
339161436Syar			}
340157657Sflz			break;
341157657Sflz		case TR:			/* y */
342157657Sflz			p++;
343157657Sflz			p = compile_tr(p, &cmd->u.y);
34498186Sgordon			EATSPACE();
34598186Sgordon			if (*p == ';') {
34698186Sgordon				p++;
34798186Sgordon				link = &cmd->next;
348114272Smtm				goto semicolon;
34998186Sgordon			}
35098186Sgordon			if (*p)
35198186Sgordon				errx(1,
35298186Sgordon"%lu: %s: extra text at the end of a transform command", linenum, fname);
35398186Sgordon			break;
35498186Sgordon		}
35598186Sgordon	}
35698186Sgordon}
35798186Sgordon
358206248Sdougb/*
359206248Sdougb * Get a delimited string.  P points to the delimeter of the string; d points
360126286Smtm * to a buffer area.  Newline and delimiter escapes are processed; other
36198186Sgordon * escapes are ignored.
36298186Sgordon *
36398186Sgordon * Returns a pointer to the first character after the final delimiter or NULL
36498186Sgordon * in the case of a non-terminated string.  The character array d is filled
36598186Sgordon * with the processed string.
36698186Sgordon */
36798186Sgordonstatic char *
36898186Sgordoncompile_delimited(char *p, char *d)
36998186Sgordon{
370206248Sdougb	char c;
37198186Sgordon
37298186Sgordon	c = *p++;
37398186Sgordon	if (c == '\0')
37498186Sgordon		return (NULL);
37578344Sobrien	else if (c == '\\')
37698186Sgordon		errx(1, "%lu: %s: \\ can not be used as a string delimiter",
37798186Sgordon				linenum, fname);
37898186Sgordon	else if (c == '\n')
379206248Sdougb		errx(1, "%lu: %s: newline can not be used as a string delimiter",
38078344Sobrien				linenum, fname);
38198186Sgordon	while (*p) {
38298186Sgordon		if (*p == '[') {
38398186Sgordon			if ((d = compile_ccl(&p, d)) == NULL)
38478344Sobrien				errx(1, "%lu: %s: unbalanced brackets ([])", linenum, fname);
38578344Sobrien			continue;
38678344Sobrien		} else if (*p == '\\' && p[1] == '[') {
387220962Sdougb			*d++ = *p++;
388220962Sdougb		} else if (*p == '\\' && p[1] == c)
389220962Sdougb			p++;
390220962Sdougb		else if (*p == '\\' && p[1] == 'n') {
391220962Sdougb			*d++ = '\n';
392220962Sdougb			p += 2;
393220962Sdougb			continue;
394220962Sdougb		} else if (*p == '\\' && p[1] == '\\')
395220962Sdougb			*d++ = *p++;
396220962Sdougb		else if (*p == c) {
397220962Sdougb			*d = '\0';
398220962Sdougb			return (p + 1);
399220962Sdougb		}
400220962Sdougb		*d++ = *p++;
401220962Sdougb	}
402220962Sdougb	return (NULL);
403220962Sdougb}
404220962Sdougb
405220962Sdougb
/*
 * compile_ccl: copy one bracket expression.
 *
 * `*sp' points at the opening '['; the expression is copied to `t',
 * including nested "[.x.]", "[:x:]" and "[=x=]" elements, with "\n"
 * replaced by a newline character.  On success `*sp' is advanced past
 * the closing ']' and the position just past the copied ']' in the
 * destination is returned.  Returns NULL, leaving `*sp' alone, if the
 * expression is not terminated.
 */
static char *
compile_ccl(char **sp, char *t)
{
	char *src = *sp;
	char *dst = t;
	int prev, kind;

	*dst++ = *src++;		/* the opening '[' */
	if (*src == '^')
		*dst++ = *src++;
	if (*src == ']')		/* a leading ']' is a literal */
		*dst++ = *src++;

	while (*src != '\0') {
		*dst = *src;
		if (*src == ']')
			break;
		if (*src == '[' &&
		    (src[1] == '.' || src[1] == ':' || src[1] == '=')) {
			kind = src[1];
			src++, dst++;
			*dst = *src;	/* the '.', ':' or '=' */
			src++, dst++;
			/* Copy up to the ']' preceded by the same character. */
			prev = *src;
			for (;;) {
				*dst = *src;
				if (*src == ']' && prev == kind)
					break;
				prev = *src;
				if (prev == '\0')
					return NULL;
				src++, dst++;
			}
		} else if (*src == '\\' && src[1] == 'n') {
			*dst = '\n';
			src++;
		}
		src++, dst++;
	}
	if (*src != ']')
		return NULL;
	*sp = src + 1;
	return dst + 1;
}
428197947Sdougb
429197947Sdougb/*
430197947Sdougb * Compiles the regular expression in RE and returns a pointer to the compiled
431197947Sdougb * regular expression.
432197947Sdougb * Cflags are passed to regcomp.
433197947Sdougb */
434197947Sdougbstatic regex_t *
435197947Sdougbcompile_re(char *re, int case_insensitive)
436197947Sdougb{
437197947Sdougb	regex_t *rep;
438197947Sdougb	int eval, flags;
439197947Sdougb
44098186Sgordon
44198186Sgordon	flags = rflags;
44298186Sgordon	if (case_insensitive)
44398186Sgordon		flags |= REG_ICASE;
44498186Sgordon	if ((rep = malloc(sizeof(regex_t))) == NULL)
44578344Sobrien		err(1, "malloc");
44698186Sgordon	if (eval = regcomp(rep, re, flags) != 0)
44798186Sgordon		errx(1, "%lu: %s: RE error: %s",
44878344Sobrien				linenum, fname, strregerror(eval, rep));
449175676Smtm	if (maxnsub < rep->re_nsub)
45098186Sgordon		maxnsub = rep->re_nsub;
451126303Smtm	return (rep);
452175676Smtm}
45378344Sobrien
45478344Sobrien/*
45578344Sobrien * Compile the substitution string of a regular expression and set res to
45698186Sgordon * point to a saved copy of it.  Nsub is the number of parenthesized regular
45798186Sgordon * expressions.
45878344Sobrien */
45978344Sobrienstatic char *
46078344Sobriencompile_subst(char *p, struct s_subst *s)
46198186Sgordon{
46278344Sobrien	static char lbuf[_POSIX2_LINE_MAX + 1];
46378344Sobrien	int asize, size;
46478344Sobrien	u_char ref;
46578344Sobrien	char c, *text, *op, *sp;
46698186Sgordon	int more = 1, sawesc = 0;
46798186Sgordon
46898186Sgordon	c = *p++;			/* Terminator character */
469197144Shrs	if (c == '\0')
470197144Shrs		return (NULL);
47178344Sobrien
47278344Sobrien	s->maxbref = 0;
47398186Sgordon	s->linenum = linenum;
47498186Sgordon	asize = 2 * _POSIX2_LINE_MAX + 1;
47598186Sgordon	if ((text = malloc(asize)) == NULL)
47678344Sobrien		err(1, "malloc");
47798186Sgordon	size = 0;
47898186Sgordon	do {
47978344Sobrien		op = sp = text + size;
48078344Sobrien		for (; *p; p++) {
48178344Sobrien			if (*p == '\\' || sawesc) {
482157653Sflz				/*
483157653Sflz				 * If this is a continuation from the last
484157653Sflz				 * buffer, we won't have a character to
485157653Sflz				 * skip over.
48678344Sobrien				 */
48798186Sgordon				if (sawesc)
48878344Sobrien					sawesc = 0;
48978344Sobrien				else
49078344Sobrien					p++;
49178344Sobrien
49278344Sobrien				if (*p == '\0') {
49378344Sobrien					/*
49478344Sobrien					 * This escaped character is continued
49578344Sobrien					 * in the next part of the line.  Note
49678344Sobrien					 * this fact, then cause the loop to
49778344Sobrien					 * exit w/ normal EOL case and reenter
49878344Sobrien					 * above with the new buffer.
49978344Sobrien					 */
50098186Sgordon					sawesc = 1;
50178344Sobrien					p--;
50278344Sobrien					continue;
50398186Sgordon				} else if (strchr("123456789", *p) != NULL) {
50478344Sobrien					*sp++ = '\\';
50598186Sgordon					ref = *p - '0';
50698186Sgordon					if (s->re != NULL &&
50798186Sgordon					    ref > s->re->re_nsub)
50878344Sobrien						errx(1, "%lu: %s: \\%c not defined in the RE",
50998186Sgordon								linenum, fname, *p);
51078344Sobrien					if (s->maxbref < ref)
51178344Sobrien						s->maxbref = ref;
51298186Sgordon				} else if (*p == '&' || *p == '\\')
51398186Sgordon					*sp++ = '\\';
51498186Sgordon			} else if (*p == c) {
51598186Sgordon				if (*++p == '\0' && more) {
51678344Sobrien					if (cu_fgets(lbuf, sizeof(lbuf), &more))
51798186Sgordon						p = lbuf;
51878344Sobrien				}
51998186Sgordon				*sp++ = '\0';
52098186Sgordon				size += sp - op;
52198186Sgordon				if ((s->new = realloc(text, size)) == NULL)
52298186Sgordon					err(1, "realloc");
52378344Sobrien				return (p);
524165565Syar			} else if (*p == '\n') {
52578344Sobrien				errx(1,
52678344Sobrien"%lu: %s: unescaped newline inside substitute pattern", linenum, fname);
527165565Syar				/* NOTREACHED */
52878344Sobrien			}
529165565Syar			*sp++ = *p;
530165565Syar		}
531165565Syar		size += sp - op;
532165565Syar		if (asize - size < _POSIX2_LINE_MAX + 1) {
533165565Syar			asize *= 2;
534165565Syar			if ((text = realloc(text, asize)) == NULL)
535165565Syar				err(1, "realloc");
536165565Syar		}
537165565Syar	} while (cu_fgets(p = lbuf, sizeof(lbuf), &more));
538165565Syar	errx(1, "%lu: %s: unterminated substitute in regular expression",
539165565Syar			linenum, fname);
540165565Syar	/* NOTREACHED */
541165565Syar}
542165565Syar
543165565Syar/*
54478344Sobrien * Compile the flags of the s command
54578344Sobrien */
54678344Sobrienstatic char *
54778344Sobriencompile_flags(char *p, struct s_subst *s)
54898186Sgordon{
54998186Sgordon	int gn;			/* True if we have seen g or n */
55078344Sobrien	unsigned long nval;
55198186Sgordon	char wfile[_POSIX2_LINE_MAX + 1], *q;
55298186Sgordon
55378344Sobrien	s->n = 1;				/* Default */
55478344Sobrien	s->p = 0;
55578344Sobrien	s->wfile = NULL;
55678344Sobrien	s->wfd = -1;
55798186Sgordon	s->icase = 0;
55878344Sobrien	for (gn = 0;;) {
55998186Sgordon		EATSPACE();			/* EXTENSION */
56098186Sgordon		switch (*p) {
56198186Sgordon		case 'g':
56298186Sgordon			if (gn)
56378344Sobrien				errx(1,
56498186Sgordon"%lu: %s: more than one number or 'g' in substitute flags", linenum, fname);
56598186Sgordon			gn = 1;
56678344Sobrien			s->n = 0;
567151685Syar			break;
568151685Syar		case '\0':
56978344Sobrien		case '\n':
57078344Sobrien		case ';':
57178344Sobrien			return (p);
57298186Sgordon		case 'p':
57378344Sobrien			s->p = 1;
57498186Sgordon			break;
57598186Sgordon		case 'I':
57698186Sgordon			s->icase = 1;
57798186Sgordon			break;
57898186Sgordon		case '1': case '2': case '3':
57998186Sgordon		case '4': case '5': case '6':
58098186Sgordon		case '7': case '8': case '9':
58198186Sgordon			if (gn)
58298186Sgordon				errx(1,
583126303Smtm"%lu: %s: more than one number or 'g' in substitute flags", linenum, fname);
58498186Sgordon			gn = 1;
58598186Sgordon			errno = 0;
58698186Sgordon			nval = strtol(p, &p, 10);
58798186Sgordon			if (errno == ERANGE || nval > INT_MAX)
58898186Sgordon				errx(1,
58998186Sgordon"%lu: %s: overflow in the 'N' substitute flag", linenum, fname);
59098186Sgordon			s->n = nval;
59198186Sgordon			p--;
59298186Sgordon			break;
59398186Sgordon		case 'w':
59498186Sgordon			p++;
59598186Sgordon#ifdef HISTORIC_PRACTICE
59698186Sgordon			if (*p != ' ') {
597175676Smtm				warnx("%lu: %s: space missing before w wfile", linenum, fname);
59898186Sgordon				return (p);
599175676Smtm			}
60078344Sobrien#endif
60178344Sobrien			EATSPACE();
602116097Smtm			q = wfile;
60398186Sgordon			while (*p) {
60478344Sobrien				if (*p == '\n')
60598186Sgordon					break;
60678344Sobrien				*q++ = *p++;
60778344Sobrien			}
608132892Smtm			*q = '\0';
609132892Smtm			if (q == wfile)
610132892Smtm				errx(1, "%lu: %s: no wfile specified", linenum, fname);
611132892Smtm			s->wfile = strdup(wfile);
612132892Smtm			if (!aflag && (s->wfd = open(wfile,
613132892Smtm			    O_WRONLY|O_APPEND|O_CREAT|O_TRUNC,
614126303Smtm			    DEFFILEMODE)) == -1)
61598186Sgordon				err(1, "%s", wfile);
61678344Sobrien			return (p);
61798186Sgordon		default:
61898186Sgordon			errx(1, "%lu: %s: bad flag in substitute command: '%c'",
619175676Smtm					linenum, fname, *p);
62078344Sobrien			break;
621198216Sed		}
62298186Sgordon		p++;
623126303Smtm	}
624126303Smtm}
62578344Sobrien
62678344Sobrien/*
62778344Sobrien * Compile a translation set of strings into a lookup table.
62878344Sobrien */
629126303Smtmstatic char *
630126303Smtmcompile_tr(char *p, struct s_tr **py)
631126303Smtm{
632126303Smtm	struct s_tr *y;
633126303Smtm	int i;
634126303Smtm	const char *op, *np;
635126303Smtm	char old[_POSIX2_LINE_MAX + 1];
636175676Smtm	char new[_POSIX2_LINE_MAX + 1];
637175676Smtm	size_t oclen, oldlen, nclen, newlen;
638175676Smtm	mbstate_t mbs1, mbs2;
639175676Smtm
640175676Smtm	if ((*py = y = malloc(sizeof(*y))) == NULL)
64178344Sobrien		err(1, NULL);
64278344Sobrien	y->multis = NULL;
643161530Sflz	y->nmultis = 0;
644198162Sdougb
645161530Sflz	if (*p == '\0' || *p == '\\')
64678344Sobrien		errx(1,
64798186Sgordon	"%lu: %s: transform pattern can not be delimited by newline or backslash",
64878344Sobrien			linenum, fname);
64998186Sgordon	p = compile_delimited(p, old);
65098186Sgordon	if (p == NULL)
651131135Smtm		errx(1, "%lu: %s: unterminated transform source string",
652131135Smtm				linenum, fname);
65378344Sobrien	p = compile_delimited(p - 1, new);
65498186Sgordon	if (p == NULL)
65598186Sgordon		errx(1, "%lu: %s: unterminated transform target string",
65698186Sgordon				linenum, fname);
65778344Sobrien	EATSPACE();
65878344Sobrien	op = old;
65998186Sgordon	oldlen = mbsrtowcs(NULL, &op, 0, NULL);
66078344Sobrien	if (oldlen == (size_t)-1)
66178344Sobrien		err(1, NULL);
66278344Sobrien	np = new;
66398186Sgordon	newlen = mbsrtowcs(NULL, &np, 0, NULL);
664150796Syar	if (newlen == (size_t)-1)
66578344Sobrien		err(1, NULL);
66678344Sobrien	if (newlen != oldlen)
66778344Sobrien		errx(1, "%lu: %s: transform strings are not the same length",
66898186Sgordon				linenum, fname);
66978344Sobrien	if (MB_CUR_MAX == 1) {
67098186Sgordon		/*
67178344Sobrien		 * The single-byte encoding case is easy: generate a
67298186Sgordon		 * lookup table.
67398186Sgordon		 */
67498186Sgordon		for (i = 0; i <= UCHAR_MAX; i++)
67578344Sobrien			y->bytetab[i] = (char)i;
67698186Sgordon		for (; *op; op++, np++)
677124832Smtm			y->bytetab[(u_char)*op] = *np;
67898186Sgordon	} else {
67998186Sgordon		/*
68098186Sgordon		 * Multi-byte encoding case: generate a lookup table as
68198186Sgordon		 * above, but only for single-byte characters. The first
682179870Smtm		 * bytes of multi-byte characters have their lookup table
683179870Smtm		 * entries set to 0, which causes do_tr() to search through
684179870Smtm		 * an auxiliary vector of multi-byte mappings.
685179870Smtm		 */
686179870Smtm		memset(&mbs1, 0, sizeof(mbs1));
687179870Smtm		memset(&mbs2, 0, sizeof(mbs2));
688206686Sdougb		for (i = 0; i <= UCHAR_MAX; i++)
689206686Sdougb			y->bytetab[i] = (btowc(i) != WEOF) ? i : 0;
69078344Sobrien		while (*op != '\0') {
69178344Sobrien			oclen = mbrlen(op, MB_LEN_MAX, &mbs1);
69278344Sobrien			if (oclen == (size_t)-1 || oclen == (size_t)-2)
693220760Sdougb				errc(1, EILSEQ, NULL);
694220760Sdougb			nclen = mbrlen(np, MB_LEN_MAX, &mbs2);
695179870Smtm			if (nclen == (size_t)-1 || nclen == (size_t)-2)
696179870Smtm				errc(1, EILSEQ, NULL);
697179870Smtm			if (oclen == 1 && nclen == 1)
698179870Smtm				y->bytetab[(u_char)*op] = *np;
699179870Smtm			else {
700179870Smtm				y->bytetab[(u_char)*op] = 0;
701179870Smtm				y->multis = realloc(y->multis,
702175676Smtm				    (y->nmultis + 1) * sizeof(*y->multis));
703175676Smtm				if (y->multis == NULL)
70478344Sobrien					err(1, NULL);
70578344Sobrien				i = y->nmultis++;
70678344Sobrien				y->multis[i].fromlen = oclen;
70778344Sobrien				memcpy(y->multis[i].from, op, oclen);
70878344Sobrien				y->multis[i].tolen = nclen;
709165565Syar				memcpy(y->multis[i].to, np, nclen);
710165565Syar			}
711165565Syar			op += oclen;
712165565Syar			np += nclen;
71378344Sobrien		}
714165565Syar	}
715165565Syar	return (p);
716165565Syar}
717116097Smtm
71878344Sobrien/*
71978344Sobrien * Compile the text following an a or i command.
72098186Sgordon */
72178344Sobrienstatic char *
72278344Sobriencompile_text(void)
723165565Syar{
72498186Sgordon	int asize, esc_nl, size;
72598186Sgordon	char *text, *p, *op, *s;
72678344Sobrien	char lbuf[_POSIX2_LINE_MAX + 1];
72778344Sobrien
72878344Sobrien	asize = 2 * _POSIX2_LINE_MAX + 1;
72978344Sobrien	if ((text = malloc(asize)) == NULL)
730165565Syar		err(1, "malloc");
73178344Sobrien	size = 0;
73278344Sobrien	while (cu_fgets(lbuf, sizeof(lbuf), NULL)) {
73378344Sobrien		op = s = text + size;
734131135Smtm		p = lbuf;
735157473Sflz		EATSPACE();
736153152Syar		for (esc_nl = 0; *p != '\0'; p++) {
73778344Sobrien			if (*p == '\\' && p[1] != '\0' && *++p == '\n')
73878344Sobrien				esc_nl = 1;
739167413Syar			*s++ = *p;
740160667Syar		}
741153152Syar		size += s - op;
74278344Sobrien		if (!esc_nl) {
74378344Sobrien			*s = '\0';
744179946Smtm			break;
745179946Smtm		}
746179946Smtm		if (asize - size < _POSIX2_LINE_MAX + 1) {
747179946Smtm			asize *= 2;
74878344Sobrien			if ((text = realloc(text, asize)) == NULL)
749160668Syar				err(1, "realloc");
75078344Sobrien		}
751197947Sdougb	}
75278344Sobrien	text[size] = '\0';
75378344Sobrien	if ((p = realloc(text, size + 1)) == NULL)
75478344Sobrien		err(1, "realloc");
75578344Sobrien	return (p);
75698186Sgordon}
75778344Sobrien
75878344Sobrien/*
759161396Syar * Get an address and return a pointer to the first character after
76098186Sgordon * it.  Fill the structure pointed to according to the address.
76198186Sgordon */
76298186Sgordonstatic char *
76398186Sgordoncompile_addr(char *p, struct s_addr *a)
764161396Syar{
765161396Syar	char *end, re[_POSIX2_LINE_MAX + 1];
766161396Syar	int icase;
767201036Sdougb
768161396Syar	icase = 0;
769161396Syar
77078344Sobrien	switch (*p) {
77198186Sgordon	case '\\':				/* Context address */
772165565Syar		++p;
77398186Sgordon		/* FALLTHROUGH */
774179946Smtm	case '/':				/* Context address */
775179946Smtm		p = compile_delimited(p, re);
776179946Smtm		if (p == NULL)
777179946Smtm			errx(1, "%lu: %s: unterminated regular expression", linenum, fname);
77898186Sgordon		/* Check for case insensitive regexp flag */
77998186Sgordon		if (*p == 'I') {
78098186Sgordon			icase = 1;
781165565Syar			p++;
78278344Sobrien		}
78378344Sobrien		if (*re == '\0')
78478344Sobrien			a->u.r = NULL;
78598186Sgordon		else
786153152Syar			a->u.r = compile_re(re, icase);
787165565Syar		a->type = AT_RE;
788153152Syar		return (p);
78978344Sobrien
79078344Sobrien	case '$':				/* Last line */
791165565Syar		a->type = AT_LAST;
79298186Sgordon		return (p + 1);
79398186Sgordon						/* Line number */
79498186Sgordon	case '0': case '1': case '2': case '3': case '4':
79578344Sobrien	case '5': case '6': case '7': case '8': case '9':
796165565Syar		a->type = AT_LINE;
797165565Syar		a->u.l = strtol(p, &end, 10);
79898186Sgordon		return (end);
79998186Sgordon	default:
80098186Sgordon		errx(1, "%lu: %s: expected context address", linenum, fname);
80198186Sgordon		return (NULL);
802165565Syar	}
803165565Syar}
80478344Sobrien
80578344Sobrien/*
80678344Sobrien * duptoeol --
80798186Sgordon *	Return a copy of all the characters up to \n or \0.
808165565Syar */
809153152Syarstatic char *
81078344Sobrienduptoeol(char *s, const char *ctype)
811165565Syar{
812165565Syar	size_t len;
813165565Syar	int ws;
814165565Syar	char *p, *start;
815165565Syar
816165565Syar	ws = 0;
817165565Syar	for (start = s; *s != '\0' && *s != '\n'; ++s)
81878344Sobrien		ws = isspace((unsigned char)*s);
81978344Sobrien	*s = '\0';
82078344Sobrien	if (ws)
82178344Sobrien		warnx("%lu: %s: whitespace after %s", linenum, fname, ctype);
82278344Sobrien	len = s - start + 1;
82378344Sobrien	if ((p = malloc(len)) == NULL)
824126285Smtm		err(1, "malloc");
82578344Sobrien	return (memmove(p, start, len));
82678344Sobrien}
827126285Smtm
82878344Sobrien/*
829165565Syar * Convert goto label names to addresses, and count a and r commands, in
830165565Syar * the given subset of the script.  Free the memory used by labels in b
831165565Syar * and t commands (but not by :).
832152519Syar *
833165565Syar * TODO: Remove } nodes
834165565Syar */
835165565Syarstatic void
83698186Sgordonfixuplabel(struct s_command *cp, struct s_command *end)
837165565Syar{
83878344Sobrien
83978344Sobrien	for (; cp != end; cp = cp->next)
84098186Sgordon		switch (cp->code) {
841165565Syar		case 'a':
84298186Sgordon		case 'r':
84398186Sgordon			appendnum++;
84498186Sgordon			break;
845165565Syar		case 'b':
84698186Sgordon		case 't':
84798186Sgordon			/* Resolve branch target. */
84878344Sobrien			if (cp->t == NULL) {
849197144Shrs				cp->u.c = NULL;
850197144Shrs				break;
851197144Shrs			}
852197144Shrs			if ((cp->u.c = findlabel(cp->t)) == NULL)
853197144Shrs				errx(1, "%lu: %s: undefined label '%s'", linenum, fname, cp->t);
854197144Shrs			free(cp->t);
855197144Shrs			break;
856197144Shrs		case '{':
857197144Shrs			/* Do interior commands. */
858197144Shrs			fixuplabel(cp->u.c, cp->next);
859197144Shrs			break;
860197144Shrs		}
861197144Shrs}
862197144Shrs
863197144Shrs/*
864197144Shrs * Associate the given command label for later lookup.
865197144Shrs */
866197144Shrsstatic void
867197144Shrsenterlabel(struct s_command *cp)
86878344Sobrien{
869197144Shrs	struct labhash **lhp, *lh;
870197144Shrs	u_char *p;
871197144Shrs	u_int h, c;
872197144Shrs
873197144Shrs	for (h = 0, p = (u_char *)cp->t; (c = *p) != 0; p++)
874197144Shrs		h = (h << 5) + h + c;
875197144Shrs	lhp = &labels[h & LHMASK];
876197144Shrs	for (lh = *lhp; lh != NULL; lh = lh->lh_next)
877197144Shrs		if (lh->lh_hash == h && strcmp(cp->t, lh->lh_cmd->t) == 0)
878197144Shrs			errx(1, "%lu: %s: duplicate label '%s'", linenum, fname, cp->t);
879197144Shrs	if ((lh = malloc(sizeof *lh)) == NULL)
880197144Shrs		err(1, "malloc");
881197144Shrs	lh->lh_next = *lhp;
882197144Shrs	lh->lh_hash = h;
883197144Shrs	lh->lh_cmd = cp;
884197144Shrs	lh->lh_ref = 0;
885197144Shrs	*lhp = lh;
886197144Shrs}
887197144Shrs
888197144Shrs/*
889197144Shrs * Find the label contained in the command l in the command linked
890197144Shrs * list cp.  L is excluded from the search.  Return NULL if not found.
891197144Shrs */
89278344Sobrienstatic struct s_command *
89378344Sobrienfindlabel(char *name)
89478344Sobrien{
895150796Syar	struct labhash *lh;
89678344Sobrien	u_char *p;
89778344Sobrien	u_int h, c;
89878344Sobrien
899116097Smtm	for (h = 0, p = (u_char *)name; (c = *p) != 0; p++)
90078344Sobrien		h = (h << 5) + h + c;
90178344Sobrien	for (lh = labels[h & LHMASK]; lh != NULL; lh = lh->lh_next) {
90298186Sgordon		if (lh->lh_hash == h && strcmp(name, lh->lh_cmd->t) == 0) {
903150796Syar			lh->lh_ref = 1;
904153152Syar			return (lh->lh_cmd);
90578344Sobrien		}
90678344Sobrien	}
90778344Sobrien	return (NULL);
908165565Syar}
909165565Syar
910165565Syar/*
911165565Syar * Warn about any unused labels.  As a side effect, release the label hash
912165565Syar * table space.
913165565Syar */
914165565Syarstatic void
915165565Syaruselabel(void)
916165565Syar{
917165565Syar	struct labhash *lh, *next;
918165565Syar	int i;
919165565Syar
920165565Syar	for (i = 0; i < LHSZ; i++) {
921165565Syar		for (lh = labels[i]; lh != NULL; lh = next) {
922165565Syar			next = lh->lh_next;
923165565Syar			if (!lh->lh_ref)
924165565Syar				warnx("%lu: %s: unused label '%s'",
925165565Syar				    linenum, fname, lh->lh_cmd->t);
926165565Syar			free(lh);
927165565Syar		}
928165565Syar	}
929165565Syar}
930165565Syar