11590Srgrimes/*-
21590Srgrimes * Copyright (c) 1992 Diomidis Spinellis.
31590Srgrimes * Copyright (c) 1992, 1993
41590Srgrimes *	The Regents of the University of California.  All rights reserved.
51590Srgrimes *
61590Srgrimes * This code is derived from software contributed to Berkeley by
71590Srgrimes * Diomidis Spinellis of Imperial College, University of London.
81590Srgrimes *
91590Srgrimes * Redistribution and use in source and binary forms, with or without
101590Srgrimes * modification, are permitted provided that the following conditions
111590Srgrimes * are met:
121590Srgrimes * 1. Redistributions of source code must retain the above copyright
131590Srgrimes *    notice, this list of conditions and the following disclaimer.
141590Srgrimes * 2. Redistributions in binary form must reproduce the above copyright
151590Srgrimes *    notice, this list of conditions and the following disclaimer in the
161590Srgrimes *    documentation and/or other materials provided with the distribution.
171590Srgrimes * 4. Neither the name of the University nor the names of its contributors
181590Srgrimes *    may be used to endorse or promote products derived from this software
191590Srgrimes *    without specific prior written permission.
201590Srgrimes *
211590Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
221590Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
231590Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
241590Srgrimes * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
251590Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
261590Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
271590Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
281590Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
291590Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
301590Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
311590Srgrimes * SUCH DAMAGE.
321590Srgrimes */
331590Srgrimes
3487766Smarkm#include <sys/cdefs.h>
3587766Smarkm__FBSDID("$FreeBSD: releng/10.2/usr.bin/sed/compile.c 276099 2014-12-23 02:46:00Z pfg $");
3687766Smarkm
371590Srgrimes#ifndef lint
3887766Smarkmstatic const char sccsid[] = "@(#)compile.c	8.1 (Berkeley) 6/6/93";
3928066Scharnier#endif
401590Srgrimes
411590Srgrimes#include <sys/types.h>
421590Srgrimes#include <sys/stat.h>
431590Srgrimes
441590Srgrimes#include <ctype.h>
4528066Scharnier#include <err.h>
46132145Stjr#include <errno.h>
471590Srgrimes#include <fcntl.h>
481590Srgrimes#include <limits.h>
491590Srgrimes#include <regex.h>
501590Srgrimes#include <stdio.h>
511590Srgrimes#include <stdlib.h>
521590Srgrimes#include <string.h>
53132145Stjr#include <wchar.h>
541590Srgrimes
551590Srgrimes#include "defs.h"
561590Srgrimes#include "extern.h"
571590Srgrimes
581590Srgrimes#define LHSZ	128
591590Srgrimes#define	LHMASK	(LHSZ - 1)
601590Srgrimesstatic struct labhash {
611590Srgrimes	struct	labhash *lh_next;
621590Srgrimes	u_int	lh_hash;
631590Srgrimes	struct	s_command *lh_cmd;
641590Srgrimes	int	lh_ref;
651590Srgrimes} *labels[LHSZ];
661590Srgrimes
/* Forward declarations of the file-local helpers. */
static char	 *compile_addr(char *p, struct s_addr *a);
static char	 *compile_ccl(char **sp, char *t);
static char	 *compile_delimited(char *p, char *d, int is_tr);
static char	 *compile_flags(char *p, struct s_subst *s);
static regex_t	 *compile_re(char *re, int case_insensitive);
static char	 *compile_subst(char *p, struct s_subst *s);
static char	 *compile_text(void);
static char	 *compile_tr(char *p, struct s_tr **py);
static struct s_command
		**compile_stream(struct s_command **link);
static char	 *duptoeol(char *s, const char *ctype);
static void	  enterlabel(struct s_command *);
static struct s_command
		 *findlabel(char *);
static void	  fixuplabel(struct s_command *cp, struct s_command *end);
static void	  uselabel(void);
831590Srgrimes
841590Srgrimes/*
851590Srgrimes * Command specification.  This is used to drive the command parser.
861590Srgrimes */
871590Srgrimesstruct s_format {
881590Srgrimes	char code;				/* Command code */
891590Srgrimes	int naddr;				/* Number of address args */
901590Srgrimes	enum e_args args;			/* Argument type */
911590Srgrimes};
921590Srgrimes
931590Srgrimesstatic struct s_format cmd_fmts[] = {
941590Srgrimes	{'{', 2, GROUP},
9510075Sjkh	{'}', 0, ENDGROUP},
961590Srgrimes	{'a', 1, TEXT},
971590Srgrimes	{'b', 2, BRANCH},
981590Srgrimes	{'c', 2, TEXT},
991590Srgrimes	{'d', 2, EMPTY},
1001590Srgrimes	{'D', 2, EMPTY},
1011590Srgrimes	{'g', 2, EMPTY},
1021590Srgrimes	{'G', 2, EMPTY},
1031590Srgrimes	{'h', 2, EMPTY},
1041590Srgrimes	{'H', 2, EMPTY},
1051590Srgrimes	{'i', 1, TEXT},
1061590Srgrimes	{'l', 2, EMPTY},
1071590Srgrimes	{'n', 2, EMPTY},
1081590Srgrimes	{'N', 2, EMPTY},
1091590Srgrimes	{'p', 2, EMPTY},
1101590Srgrimes	{'P', 2, EMPTY},
1111590Srgrimes	{'q', 1, EMPTY},
1121590Srgrimes	{'r', 1, RFILE},
1131590Srgrimes	{'s', 2, SUBST},
1141590Srgrimes	{'t', 2, BRANCH},
1151590Srgrimes	{'w', 2, WFILE},
1161590Srgrimes	{'x', 2, EMPTY},
1171590Srgrimes	{'y', 2, TR},
1181590Srgrimes	{'!', 2, NONSEL},
1191590Srgrimes	{':', 0, LABEL},
1201590Srgrimes	{'#', 0, COMMENT},
1211590Srgrimes	{'=', 1, EMPTY},
1221590Srgrimes	{'\0', 0, COMMENT},
1231590Srgrimes};
1241590Srgrimes
/* Head of the compiled program; filled in by compile(). */
struct s_command *prog;
1271590Srgrimes
1281590Srgrimes/*
1291590Srgrimes * Compile the program into prog.
1301590Srgrimes * Initialise appends.
1311590Srgrimes */
1321590Srgrimesvoid
133122044Sdescompile(void)
1341590Srgrimes{
13510075Sjkh	*compile_stream(&prog) = NULL;
1361590Srgrimes	fixuplabel(prog, NULL);
1371590Srgrimes	uselabel();
13886193Smikeh	if (appendnum == 0)
13986193Smikeh		appends = NULL;
14086193Smikeh	else if ((appends = malloc(sizeof(struct s_appends) * appendnum)) ==
14186193Smikeh	    NULL)
14280286Sobrien		err(1, "malloc");
14380286Sobrien	if ((match = malloc((maxnsub + 1) * sizeof(regmatch_t))) == NULL)
14480286Sobrien		err(1, "malloc");
1451590Srgrimes}
1461590Srgrimes
/*
 * Advance the local variable `p' past any leading white space.  Does
 * nothing when p is NULL.  The macro refers to `p' by name, so it can only
 * be used where a char pointer with that name is in scope.
 */
#define EATSPACE() do {							\
	while (p != NULL && *p != '\0' && isspace((unsigned char)*p))	\
		p++;							\
} while (0)
1521590Srgrimes
1531590Srgrimesstatic struct s_command **
154122044Sdescompile_stream(struct s_command **link)
15510075Sjkh{
15687766Smarkm	char *p;
1571590Srgrimes	static char lbuf[_POSIX2_LINE_MAX + 1];	/* To save stack */
15810075Sjkh	struct s_command *cmd, *cmd2, *stack;
1591590Srgrimes	struct s_format *fp;
160171206Sssouhlal	char re[_POSIX2_LINE_MAX + 1];
1611590Srgrimes	int naddr;				/* Number of addresses */
1621590Srgrimes
16310075Sjkh	stack = 0;
1641590Srgrimes	for (;;) {
16541602Sarchie		if ((p = cu_fgets(lbuf, sizeof(lbuf), NULL)) == NULL) {
16610075Sjkh			if (stack != 0)
16728066Scharnier				errx(1, "%lu: %s: unexpected EOF (pending }'s)",
16828066Scharnier							linenum, fname);
1691590Srgrimes			return (link);
1701590Srgrimes		}
1711590Srgrimes
1721590Srgrimessemicolon:	EATSPACE();
173122045Sdes		if (p) {
174122045Sdes			if (*p == '#' || *p == '\0')
175122045Sdes				continue;
176122045Sdes			else if (*p == ';') {
177122045Sdes				p++;
178122045Sdes				goto semicolon;
179122045Sdes			}
180122045Sdes		}
18180286Sobrien		if ((*link = cmd = malloc(sizeof(struct s_command))) == NULL)
18280286Sobrien			err(1, "malloc");
1831590Srgrimes		link = &cmd->next;
184192732Sbrian		cmd->startline = cmd->nonsel = 0;
1851590Srgrimes		/* First parse the addresses */
1861590Srgrimes		naddr = 0;
1871590Srgrimes
1881590Srgrimes/* Valid characters to start an address */
1891590Srgrimes#define	addrchar(c)	(strchr("0123456789/\\$", (c)))
1901590Srgrimes		if (addrchar(*p)) {
1911590Srgrimes			naddr++;
19280286Sobrien			if ((cmd->a1 = malloc(sizeof(struct s_addr))) == NULL)
19380286Sobrien				err(1, "malloc");
1941590Srgrimes			p = compile_addr(p, cmd->a1);
1951590Srgrimes			EATSPACE();				/* EXTENSION */
1961590Srgrimes			if (*p == ',') {
1971590Srgrimes				p++;
1981590Srgrimes				EATSPACE();			/* EXTENSION */
19910075Sjkh				naddr++;
20080286Sobrien				if ((cmd->a2 = malloc(sizeof(struct s_addr)))
20180286Sobrien				    == NULL)
20280286Sobrien					err(1, "malloc");
2031590Srgrimes				p = compile_addr(p, cmd->a2);
20410075Sjkh				EATSPACE();
20510075Sjkh			} else
20610075Sjkh				cmd->a2 = 0;
20710075Sjkh		} else
20810075Sjkh			cmd->a1 = cmd->a2 = 0;
2091590Srgrimes
2101590Srgrimesnonsel:		/* Now parse the command */
2111590Srgrimes		if (!*p)
21228066Scharnier			errx(1, "%lu: %s: command expected", linenum, fname);
2131590Srgrimes		cmd->code = *p;
2141590Srgrimes		for (fp = cmd_fmts; fp->code; fp++)
2151590Srgrimes			if (fp->code == *p)
2161590Srgrimes				break;
2171590Srgrimes		if (!fp->code)
21828066Scharnier			errx(1, "%lu: %s: invalid command code %c", linenum, fname, *p);
2191590Srgrimes		if (naddr > fp->naddr)
22028066Scharnier			errx(1,
22128066Scharnier				"%lu: %s: command %c expects up to %d address(es), found %d",
22228066Scharnier				linenum, fname, *p, fp->naddr, naddr);
2231590Srgrimes		switch (fp->args) {
2241590Srgrimes		case NONSEL:			/* ! */
22510075Sjkh			p++;
22610075Sjkh			EATSPACE();
227184854Sdds			cmd->nonsel = 1;
2281590Srgrimes			goto nonsel;
2291590Srgrimes		case GROUP:			/* { */
2301590Srgrimes			p++;
2311590Srgrimes			EATSPACE();
23210075Sjkh			cmd->next = stack;
23310075Sjkh			stack = cmd;
23410075Sjkh			link = &cmd->u.c;
23510075Sjkh			if (*p)
23610075Sjkh				goto semicolon;
2371590Srgrimes			break;
23810075Sjkh		case ENDGROUP:
23910075Sjkh			/*
24010075Sjkh			 * Short-circuit command processing, since end of
24110075Sjkh			 * group is really just a noop.
24210075Sjkh			 */
24310075Sjkh			cmd->nonsel = 1;
24410075Sjkh			if (stack == 0)
24528066Scharnier				errx(1, "%lu: %s: unexpected }", linenum, fname);
24610075Sjkh			cmd2 = stack;
24710075Sjkh			stack = cmd2->next;
24810075Sjkh			cmd2->next = cmd;
24910075Sjkh			/*FALLTHROUGH*/
2501590Srgrimes		case EMPTY:		/* d D g G h H l n N p P q x = \0 */
2511590Srgrimes			p++;
2521590Srgrimes			EATSPACE();
2531590Srgrimes			if (*p == ';') {
2541590Srgrimes				p++;
2551590Srgrimes				link = &cmd->next;
2561590Srgrimes				goto semicolon;
2571590Srgrimes			}
2581590Srgrimes			if (*p)
25928066Scharnier				errx(1, "%lu: %s: extra characters at the end of %c command",
26028066Scharnier						linenum, fname, cmd->code);
2611590Srgrimes			break;
2621590Srgrimes		case TEXT:			/* a c i */
2631590Srgrimes			p++;
2641590Srgrimes			EATSPACE();
2651590Srgrimes			if (*p != '\\')
26628066Scharnier				errx(1,
26728066Scharnier"%lu: %s: command %c expects \\ followed by text", linenum, fname, cmd->code);
2681590Srgrimes			p++;
2691590Srgrimes			EATSPACE();
2701590Srgrimes			if (*p)
27128066Scharnier				errx(1,
27228066Scharnier				"%lu: %s: extra characters after \\ at the end of %c command",
27328066Scharnier				linenum, fname, cmd->code);
2741590Srgrimes			cmd->t = compile_text();
2751590Srgrimes			break;
2761590Srgrimes		case COMMENT:			/* \0 # */
2771590Srgrimes			break;
2781590Srgrimes		case WFILE:			/* w */
2791590Srgrimes			p++;
2801590Srgrimes			EATSPACE();
2811590Srgrimes			if (*p == '\0')
28228066Scharnier				errx(1, "%lu: %s: filename expected", linenum, fname);
2831590Srgrimes			cmd->t = duptoeol(p, "w command");
2841590Srgrimes			if (aflag)
2851590Srgrimes				cmd->u.fd = -1;
286122045Sdes			else if ((cmd->u.fd = open(p,
2871590Srgrimes			    O_WRONLY|O_APPEND|O_CREAT|O_TRUNC,
2881590Srgrimes			    DEFFILEMODE)) == -1)
28928066Scharnier				err(1, "%s", p);
2901590Srgrimes			break;
2911590Srgrimes		case RFILE:			/* r */
2921590Srgrimes			p++;
2931590Srgrimes			EATSPACE();
2941590Srgrimes			if (*p == '\0')
29528066Scharnier				errx(1, "%lu: %s: filename expected", linenum, fname);
2961590Srgrimes			else
2971590Srgrimes				cmd->t = duptoeol(p, "read command");
2981590Srgrimes			break;
2991590Srgrimes		case BRANCH:			/* b t */
3001590Srgrimes			p++;
3011590Srgrimes			EATSPACE();
3021590Srgrimes			if (*p == '\0')
3031590Srgrimes				cmd->t = NULL;
3041590Srgrimes			else
3051590Srgrimes				cmd->t = duptoeol(p, "branch");
3061590Srgrimes			break;
3071590Srgrimes		case LABEL:			/* : */
3081590Srgrimes			p++;
3091590Srgrimes			EATSPACE();
3101590Srgrimes			cmd->t = duptoeol(p, "label");
3111590Srgrimes			if (strlen(p) == 0)
31228066Scharnier				errx(1, "%lu: %s: empty label", linenum, fname);
3131590Srgrimes			enterlabel(cmd);
3141590Srgrimes			break;
3151590Srgrimes		case SUBST:			/* s */
3161590Srgrimes			p++;
3171590Srgrimes			if (*p == '\0' || *p == '\\')
31828066Scharnier				errx(1,
319122045Sdes"%lu: %s: substitute pattern can not be delimited by newline or backslash",
32028066Scharnier					linenum, fname);
321171284Sdelphij			if ((cmd->u.s = calloc(1, sizeof(struct s_subst))) == NULL)
32280286Sobrien				err(1, "malloc");
323197361Sdds			p = compile_delimited(p, re, 0);
3241590Srgrimes			if (p == NULL)
32528066Scharnier				errx(1,
32628066Scharnier				"%lu: %s: unterminated substitute pattern", linenum, fname);
327184777Shrs
328184777Shrs			/* Compile RE with no case sensitivity temporarily */
329184777Shrs			if (*re == '\0')
330184777Shrs				cmd->u.s->re = NULL;
331184777Shrs			else
332184777Shrs				cmd->u.s->re = compile_re(re, 0);
333171284Sdelphij			--p;
334171284Sdelphij			p = compile_subst(p, cmd->u.s);
335171284Sdelphij			p = compile_flags(p, cmd->u.s);
336184777Shrs
337184777Shrs			/* Recompile RE with case sensitivity from "I" flag if any */
338171206Sssouhlal			if (*re == '\0')
339171206Sssouhlal				cmd->u.s->re = NULL;
340171206Sssouhlal			else
341171206Sssouhlal				cmd->u.s->re = compile_re(re, cmd->u.s->icase);
3421590Srgrimes			EATSPACE();
3431590Srgrimes			if (*p == ';') {
3441590Srgrimes				p++;
3451590Srgrimes				link = &cmd->next;
3461590Srgrimes				goto semicolon;
3471590Srgrimes			}
3481590Srgrimes			break;
3491590Srgrimes		case TR:			/* y */
3501590Srgrimes			p++;
351132145Stjr			p = compile_tr(p, &cmd->u.y);
3521590Srgrimes			EATSPACE();
3531590Srgrimes			if (*p == ';') {
3541590Srgrimes				p++;
3551590Srgrimes				link = &cmd->next;
3561590Srgrimes				goto semicolon;
3571590Srgrimes			}
3581590Srgrimes			if (*p)
35928066Scharnier				errx(1,
36028066Scharnier"%lu: %s: extra text at the end of a transform command", linenum, fname);
3611590Srgrimes			break;
3621590Srgrimes		}
3631590Srgrimes	}
3641590Srgrimes}
3651590Srgrimes
3661590Srgrimes/*
3671590Srgrimes * Get a delimited string.  P points to the delimeter of the string; d points
3681590Srgrimes * to a buffer area.  Newline and delimiter escapes are processed; other
3691590Srgrimes * escapes are ignored.
3701590Srgrimes *
3711590Srgrimes * Returns a pointer to the first character after the final delimiter or NULL
3721590Srgrimes * in the case of a non-terminated string.  The character array d is filled
3731590Srgrimes * with the processed string.
3741590Srgrimes */
3751590Srgrimesstatic char *
376197361Sddscompile_delimited(char *p, char *d, int is_tr)
3771590Srgrimes{
3781590Srgrimes	char c;
3791590Srgrimes
3801590Srgrimes	c = *p++;
3811590Srgrimes	if (c == '\0')
3821590Srgrimes		return (NULL);
3831590Srgrimes	else if (c == '\\')
38428066Scharnier		errx(1, "%lu: %s: \\ can not be used as a string delimiter",
38528066Scharnier				linenum, fname);
3861590Srgrimes	else if (c == '\n')
38728066Scharnier		errx(1, "%lu: %s: newline can not be used as a string delimiter",
38828066Scharnier				linenum, fname);
3891590Srgrimes	while (*p) {
390197356Sdds		if (*p == '[' && *p != c) {
39110075Sjkh			if ((d = compile_ccl(&p, d)) == NULL)
39228066Scharnier				errx(1, "%lu: %s: unbalanced brackets ([])", linenum, fname);
39310075Sjkh			continue;
39410075Sjkh		} else if (*p == '\\' && p[1] == '[') {
39510075Sjkh			*d++ = *p++;
39610075Sjkh		} else if (*p == '\\' && p[1] == c)
3971590Srgrimes			p++;
3981590Srgrimes		else if (*p == '\\' && p[1] == 'n') {
3991590Srgrimes			*d++ = '\n';
4001590Srgrimes			p += 2;
4011590Srgrimes			continue;
402197361Sdds		} else if (*p == '\\' && p[1] == '\\') {
403197361Sdds			if (is_tr)
404197361Sdds				p++;
405197361Sdds			else
406197361Sdds				*d++ = *p++;
407197361Sdds		} else if (*p == c) {
4081590Srgrimes			*d = '\0';
4091590Srgrimes			return (p + 1);
4101590Srgrimes		}
4111590Srgrimes		*d++ = *p++;
4121590Srgrimes	}
4131590Srgrimes	return (NULL);
4141590Srgrimes}
4151590Srgrimes
41610075Sjkh
/*
 * compile_ccl: copy one bracket expression verbatim.
 *
 * *sp points at the opening `['; the expression is copied to t, honouring
 * a leading `^', a literal `]' in first position, and nested `[.' `.]',
 * `[:' `:]' and `[=' `=]' elements.
 *
 * On success *sp is advanced past the closing `]' and the position in t
 * just after the copied `]' is returned.  NULL is returned, with *sp left
 * untouched, when the expression is not terminated.
 */
static char *
compile_ccl(char **sp, char *t)
{
	char *src = *sp;
	int kind, prev;

	*t++ = *src++;			/* the opening bracket */
	if (*src == '^')
		*t++ = *src++;
	if (*src == ']')		/* `]' first: an ordinary member */
		*t++ = *src++;

	while (*src != '\0') {
		*t = *src;
		if (*src == ']')
			break;
		if (*src == '[') {
			kind = src[1];
			if (kind == '.' || kind == ':' || kind == '=') {
				/* Copy the introducer, then scan for "kind]". */
				t[1] = src[1];
				t += 2;
				src += 2;
				prev = *src;
				for (;;) {
					*t = *src;
					if (*src == ']' && prev == kind)
						break;
					prev = *src;
					if (prev == '\0')
						return (NULL);
					src++;
					t++;
				}
			}
		}
		src++;
		t++;
	}

	if (*src != ']')
		return (NULL);
	*sp = src + 1;
	return (t + 1);
}
43810075Sjkh
4391590Srgrimes/*
440171206Sssouhlal * Compiles the regular expression in RE and returns a pointer to the compiled
441171206Sssouhlal * regular expression.
4421590Srgrimes * Cflags are passed to regcomp.
4431590Srgrimes */
444171206Sssouhlalstatic regex_t *
445171206Sssouhlalcompile_re(char *re, int case_insensitive)
4461590Srgrimes{
447171206Sssouhlal	regex_t *rep;
448171206Sssouhlal	int eval, flags;
4491590Srgrimes
450171206Sssouhlal
451171206Sssouhlal	flags = rflags;
452171206Sssouhlal	if (case_insensitive)
453171206Sssouhlal		flags |= REG_ICASE;
454171206Sssouhlal	if ((rep = malloc(sizeof(regex_t))) == NULL)
45580286Sobrien		err(1, "malloc");
456176126Sdwmalone	if ((eval = regcomp(rep, re, flags)) != 0)
45728066Scharnier		errx(1, "%lu: %s: RE error: %s",
458171206Sssouhlal				linenum, fname, strregerror(eval, rep));
459171206Sssouhlal	if (maxnsub < rep->re_nsub)
460171206Sssouhlal		maxnsub = rep->re_nsub;
461171206Sssouhlal	return (rep);
4621590Srgrimes}
4631590Srgrimes
4641590Srgrimes/*
4651590Srgrimes * Compile the substitution string of a regular expression and set res to
4661590Srgrimes * point to a saved copy of it.  Nsub is the number of parenthesized regular
4671590Srgrimes * expressions.
4681590Srgrimes */
4691590Srgrimesstatic char *
470122044Sdescompile_subst(char *p, struct s_subst *s)
4711590Srgrimes{
4721590Srgrimes	static char lbuf[_POSIX2_LINE_MAX + 1];
47387766Smarkm	int asize, size;
47487766Smarkm	u_char ref;
4751590Srgrimes	char c, *text, *op, *sp;
47697703Sgreen	int more = 1, sawesc = 0;
4771590Srgrimes
4781590Srgrimes	c = *p++;			/* Terminator character */
4791590Srgrimes	if (c == '\0')
4801590Srgrimes		return (NULL);
4811590Srgrimes
4821590Srgrimes	s->maxbref = 0;
4831590Srgrimes	s->linenum = linenum;
4841590Srgrimes	asize = 2 * _POSIX2_LINE_MAX + 1;
48580286Sobrien	if ((text = malloc(asize)) == NULL)
48680286Sobrien		err(1, "malloc");
4871590Srgrimes	size = 0;
4881590Srgrimes	do {
4891590Srgrimes		op = sp = text + size;
4901590Srgrimes		for (; *p; p++) {
49197703Sgreen			if (*p == '\\' || sawesc) {
49297703Sgreen				/*
49397703Sgreen				 * If this is a continuation from the last
49497703Sgreen				 * buffer, we won't have a character to
49597703Sgreen				 * skip over.
49697703Sgreen				 */
49797703Sgreen				if (sawesc)
49897703Sgreen					sawesc = 0;
49997703Sgreen				else
50097703Sgreen					p++;
50197703Sgreen
50297703Sgreen				if (*p == '\0') {
50397703Sgreen					/*
50497703Sgreen					 * This escaped character is continued
50597703Sgreen					 * in the next part of the line.  Note
50697703Sgreen					 * this fact, then cause the loop to
50797703Sgreen					 * exit w/ normal EOL case and reenter
50897703Sgreen					 * above with the new buffer.
50997703Sgreen					 */
51097703Sgreen					sawesc = 1;
51197703Sgreen					p--;
51297703Sgreen					continue;
51397703Sgreen				} else if (strchr("123456789", *p) != NULL) {
5141590Srgrimes					*sp++ = '\\';
5151590Srgrimes					ref = *p - '0';
5161590Srgrimes					if (s->re != NULL &&
5171590Srgrimes					    ref > s->re->re_nsub)
51828066Scharnier						errx(1, "%lu: %s: \\%c not defined in the RE",
51928066Scharnier								linenum, fname, *p);
5201590Srgrimes					if (s->maxbref < ref)
5211590Srgrimes						s->maxbref = ref;
5221590Srgrimes				} else if (*p == '&' || *p == '\\')
5231590Srgrimes					*sp++ = '\\';
5241590Srgrimes			} else if (*p == c) {
52541602Sarchie				if (*++p == '\0' && more) {
52641602Sarchie					if (cu_fgets(lbuf, sizeof(lbuf), &more))
52741573Sarchie						p = lbuf;
52841573Sarchie				}
5291590Srgrimes				*sp++ = '\0';
5301590Srgrimes				size += sp - op;
53180286Sobrien				if ((s->new = realloc(text, size)) == NULL)
53280286Sobrien					err(1, "realloc");
5331590Srgrimes				return (p);
5341590Srgrimes			} else if (*p == '\n') {
53528066Scharnier				errx(1,
53628066Scharnier"%lu: %s: unescaped newline inside substitute pattern", linenum, fname);
5371590Srgrimes				/* NOTREACHED */
5381590Srgrimes			}
5391590Srgrimes			*sp++ = *p;
5401590Srgrimes		}
5411590Srgrimes		size += sp - op;
5421590Srgrimes		if (asize - size < _POSIX2_LINE_MAX + 1) {
5431590Srgrimes			asize *= 2;
54480286Sobrien			if ((text = realloc(text, asize)) == NULL)
54580286Sobrien				err(1, "realloc");
5461590Srgrimes		}
54741602Sarchie	} while (cu_fgets(p = lbuf, sizeof(lbuf), &more));
54828066Scharnier	errx(1, "%lu: %s: unterminated substitute in regular expression",
54928066Scharnier			linenum, fname);
5501590Srgrimes	/* NOTREACHED */
5511590Srgrimes}
5521590Srgrimes
5531590Srgrimes/*
5541590Srgrimes * Compile the flags of the s command
5551590Srgrimes */
5561590Srgrimesstatic char *
557122044Sdescompile_flags(char *p, struct s_subst *s)
5581590Srgrimes{
5591590Srgrimes	int gn;			/* True if we have seen g or n */
560148692Sdds	unsigned long nval;
561276099Spfg	char wfile[_POSIX2_LINE_MAX + 1], *q, *eq;
5621590Srgrimes
5631590Srgrimes	s->n = 1;				/* Default */
5641590Srgrimes	s->p = 0;
5651590Srgrimes	s->wfile = NULL;
5661590Srgrimes	s->wfd = -1;
567171206Sssouhlal	s->icase = 0;
5681590Srgrimes	for (gn = 0;;) {
5691590Srgrimes		EATSPACE();			/* EXTENSION */
5701590Srgrimes		switch (*p) {
5711590Srgrimes		case 'g':
5721590Srgrimes			if (gn)
57328066Scharnier				errx(1,
57428066Scharnier"%lu: %s: more than one number or 'g' in substitute flags", linenum, fname);
5751590Srgrimes			gn = 1;
5761590Srgrimes			s->n = 0;
5771590Srgrimes			break;
5781590Srgrimes		case '\0':
5791590Srgrimes		case '\n':
5801590Srgrimes		case ';':
5811590Srgrimes			return (p);
5821590Srgrimes		case 'p':
5831590Srgrimes			s->p = 1;
5841590Srgrimes			break;
585259443Seadler		case 'i':
586171206Sssouhlal		case 'I':
587171206Sssouhlal			s->icase = 1;
588171206Sssouhlal			break;
5891590Srgrimes		case '1': case '2': case '3':
5901590Srgrimes		case '4': case '5': case '6':
5911590Srgrimes		case '7': case '8': case '9':
5921590Srgrimes			if (gn)
59328066Scharnier				errx(1,
59428066Scharnier"%lu: %s: more than one number or 'g' in substitute flags", linenum, fname);
5951590Srgrimes			gn = 1;
596148692Sdds			errno = 0;
597148692Sdds			nval = strtol(p, &p, 10);
598148692Sdds			if (errno == ERANGE || nval > INT_MAX)
599148692Sdds				errx(1,
600148692Sdds"%lu: %s: overflow in the 'N' substitute flag", linenum, fname);
601148692Sdds			s->n = nval;
602148692Sdds			p--;
6031590Srgrimes			break;
6041590Srgrimes		case 'w':
6051590Srgrimes			p++;
6061590Srgrimes#ifdef HISTORIC_PRACTICE
6071590Srgrimes			if (*p != ' ') {
60828066Scharnier				warnx("%lu: %s: space missing before w wfile", linenum, fname);
6091590Srgrimes				return (p);
6101590Srgrimes			}
6111590Srgrimes#endif
6121590Srgrimes			EATSPACE();
6131590Srgrimes			q = wfile;
614276099Spfg			eq = wfile + sizeof(wfile) - 1;
6151590Srgrimes			while (*p) {
6161590Srgrimes				if (*p == '\n')
6171590Srgrimes					break;
618276099Spfg				if (q >= eq)
619276099Spfg					err(1, "wfile too long");
6201590Srgrimes				*q++ = *p++;
6211590Srgrimes			}
6221590Srgrimes			*q = '\0';
6231590Srgrimes			if (q == wfile)
62428066Scharnier				errx(1, "%lu: %s: no wfile specified", linenum, fname);
6251590Srgrimes			s->wfile = strdup(wfile);
6261590Srgrimes			if (!aflag && (s->wfd = open(wfile,
6271590Srgrimes			    O_WRONLY|O_APPEND|O_CREAT|O_TRUNC,
6281590Srgrimes			    DEFFILEMODE)) == -1)
62928066Scharnier				err(1, "%s", wfile);
6301590Srgrimes			return (p);
6311590Srgrimes		default:
63228066Scharnier			errx(1, "%lu: %s: bad flag in substitute command: '%c'",
63328066Scharnier					linenum, fname, *p);
6341590Srgrimes			break;
6351590Srgrimes		}
6361590Srgrimes		p++;
6371590Srgrimes	}
6381590Srgrimes}
6391590Srgrimes
6401590Srgrimes/*
6411590Srgrimes * Compile a translation set of strings into a lookup table.
6421590Srgrimes */
6431590Srgrimesstatic char *
644132145Stjrcompile_tr(char *p, struct s_tr **py)
6451590Srgrimes{
646132145Stjr	struct s_tr *y;
6471590Srgrimes	int i;
648132145Stjr	const char *op, *np;
6491590Srgrimes	char old[_POSIX2_LINE_MAX + 1];
6501590Srgrimes	char new[_POSIX2_LINE_MAX + 1];
651132145Stjr	size_t oclen, oldlen, nclen, newlen;
652132145Stjr	mbstate_t mbs1, mbs2;
6531590Srgrimes
654132145Stjr	if ((*py = y = malloc(sizeof(*y))) == NULL)
655132145Stjr		err(1, NULL);
656132145Stjr	y->multis = NULL;
657132145Stjr	y->nmultis = 0;
658132145Stjr
6591590Srgrimes	if (*p == '\0' || *p == '\\')
66028066Scharnier		errx(1,
66128066Scharnier	"%lu: %s: transform pattern can not be delimited by newline or backslash",
66228066Scharnier			linenum, fname);
663197361Sdds	p = compile_delimited(p, old, 1);
66428066Scharnier	if (p == NULL)
66528066Scharnier		errx(1, "%lu: %s: unterminated transform source string",
66628066Scharnier				linenum, fname);
667197361Sdds	p = compile_delimited(p - 1, new, 1);
66828066Scharnier	if (p == NULL)
66928066Scharnier		errx(1, "%lu: %s: unterminated transform target string",
67028066Scharnier				linenum, fname);
6711590Srgrimes	EATSPACE();
672132145Stjr	op = old;
673132145Stjr	oldlen = mbsrtowcs(NULL, &op, 0, NULL);
674132145Stjr	if (oldlen == (size_t)-1)
675132145Stjr		err(1, NULL);
676132145Stjr	np = new;
677132145Stjr	newlen = mbsrtowcs(NULL, &np, 0, NULL);
678132145Stjr	if (newlen == (size_t)-1)
679132145Stjr		err(1, NULL);
680132145Stjr	if (newlen != oldlen)
68128066Scharnier		errx(1, "%lu: %s: transform strings are not the same length",
68228066Scharnier				linenum, fname);
683132145Stjr	if (MB_CUR_MAX == 1) {
684132145Stjr		/*
685132145Stjr		 * The single-byte encoding case is easy: generate a
686132145Stjr		 * lookup table.
687132145Stjr		 */
688132145Stjr		for (i = 0; i <= UCHAR_MAX; i++)
689132145Stjr			y->bytetab[i] = (char)i;
690132145Stjr		for (; *op; op++, np++)
691132145Stjr			y->bytetab[(u_char)*op] = *np;
692132145Stjr	} else {
693132145Stjr		/*
694132145Stjr		 * Multi-byte encoding case: generate a lookup table as
695132145Stjr		 * above, but only for single-byte characters. The first
696132145Stjr		 * bytes of multi-byte characters have their lookup table
697132145Stjr		 * entries set to 0, which causes do_tr() to search through
698132145Stjr		 * an auxiliary vector of multi-byte mappings.
699132145Stjr		 */
700132145Stjr		memset(&mbs1, 0, sizeof(mbs1));
701132145Stjr		memset(&mbs2, 0, sizeof(mbs2));
702132145Stjr		for (i = 0; i <= UCHAR_MAX; i++)
703132145Stjr			y->bytetab[i] = (btowc(i) != WEOF) ? i : 0;
704132145Stjr		while (*op != '\0') {
705132145Stjr			oclen = mbrlen(op, MB_LEN_MAX, &mbs1);
706132145Stjr			if (oclen == (size_t)-1 || oclen == (size_t)-2)
707132145Stjr				errc(1, EILSEQ, NULL);
708132145Stjr			nclen = mbrlen(np, MB_LEN_MAX, &mbs2);
709132145Stjr			if (nclen == (size_t)-1 || nclen == (size_t)-2)
710132145Stjr				errc(1, EILSEQ, NULL);
711132145Stjr			if (oclen == 1 && nclen == 1)
712132145Stjr				y->bytetab[(u_char)*op] = *np;
713132145Stjr			else {
714132145Stjr				y->bytetab[(u_char)*op] = 0;
715132145Stjr				y->multis = realloc(y->multis,
716132145Stjr				    (y->nmultis + 1) * sizeof(*y->multis));
717132145Stjr				if (y->multis == NULL)
718132145Stjr					err(1, NULL);
719132145Stjr				i = y->nmultis++;
720132145Stjr				y->multis[i].fromlen = oclen;
721132145Stjr				memcpy(y->multis[i].from, op, oclen);
722132145Stjr				y->multis[i].tolen = nclen;
723132145Stjr				memcpy(y->multis[i].to, np, nclen);
724132145Stjr			}
725132145Stjr			op += oclen;
726132145Stjr			np += nclen;
727132145Stjr		}
728132145Stjr	}
7291590Srgrimes	return (p);
7301590Srgrimes}
7311590Srgrimes
7321590Srgrimes/*
7331590Srgrimes * Compile the text following an a or i command.
7341590Srgrimes */
7351590Srgrimesstatic char *
736122044Sdescompile_text(void)
7371590Srgrimes{
73817195Sbde	int asize, esc_nl, size;
7391590Srgrimes	char *text, *p, *op, *s;
7401590Srgrimes	char lbuf[_POSIX2_LINE_MAX + 1];
7411590Srgrimes
7421590Srgrimes	asize = 2 * _POSIX2_LINE_MAX + 1;
74380286Sobrien	if ((text = malloc(asize)) == NULL)
74480286Sobrien		err(1, "malloc");
7451590Srgrimes	size = 0;
74641602Sarchie	while (cu_fgets(lbuf, sizeof(lbuf), NULL)) {
7471590Srgrimes		op = s = text + size;
7481590Srgrimes		p = lbuf;
7491590Srgrimes		EATSPACE();
75017195Sbde		for (esc_nl = 0; *p != '\0'; p++) {
75117195Sbde			if (*p == '\\' && p[1] != '\0' && *++p == '\n')
75217195Sbde				esc_nl = 1;
7531590Srgrimes			*s++ = *p;
7541590Srgrimes		}
7551590Srgrimes		size += s - op;
75617195Sbde		if (!esc_nl) {
7571590Srgrimes			*s = '\0';
7581590Srgrimes			break;
7591590Srgrimes		}
7601590Srgrimes		if (asize - size < _POSIX2_LINE_MAX + 1) {
7611590Srgrimes			asize *= 2;
76280286Sobrien			if ((text = realloc(text, asize)) == NULL)
76380286Sobrien				err(1, "realloc");
7641590Srgrimes		}
7651590Srgrimes	}
76639571Sbrian	text[size] = '\0';
76780286Sobrien	if ((p = realloc(text, size + 1)) == NULL)
76880286Sobrien		err(1, "realloc");
76980286Sobrien	return (p);
7701590Srgrimes}
7711590Srgrimes
7721590Srgrimes/*
7731590Srgrimes * Get an address and return a pointer to the first character after
7741590Srgrimes * it.  Fill the structure pointed to according to the address.
7751590Srgrimes */
7761590Srgrimesstatic char *
777122044Sdescompile_addr(char *p, struct s_addr *a)
7781590Srgrimes{
779171206Sssouhlal	char *end, re[_POSIX2_LINE_MAX + 1];
780171206Sssouhlal	int icase;
7811590Srgrimes
782171206Sssouhlal	icase = 0;
783171206Sssouhlal
784192732Sbrian	a->type = 0;
7851590Srgrimes	switch (*p) {
7861590Srgrimes	case '\\':				/* Context address */
7871590Srgrimes		++p;
7881590Srgrimes		/* FALLTHROUGH */
7891590Srgrimes	case '/':				/* Context address */
790197361Sdds		p = compile_delimited(p, re, 0);
7911590Srgrimes		if (p == NULL)
79228066Scharnier			errx(1, "%lu: %s: unterminated regular expression", linenum, fname);
793171206Sssouhlal		/* Check for case insensitive regexp flag */
794171206Sssouhlal		if (*p == 'I') {
795171206Sssouhlal			icase = 1;
796171206Sssouhlal			p++;
797171206Sssouhlal		}
798171206Sssouhlal		if (*re == '\0')
799171206Sssouhlal			a->u.r = NULL;
800171206Sssouhlal		else
801171206Sssouhlal			a->u.r = compile_re(re, icase);
8021590Srgrimes		a->type = AT_RE;
8031590Srgrimes		return (p);
8041590Srgrimes
8051590Srgrimes	case '$':				/* Last line */
8061590Srgrimes		a->type = AT_LAST;
8071590Srgrimes		return (p + 1);
808192732Sbrian
809192732Sbrian	case '+':				/* Relative line number */
810192732Sbrian		a->type = AT_RELLINE;
811192732Sbrian		p++;
812192732Sbrian		/* FALLTHROUGH */
8131590Srgrimes						/* Line number */
8148874Srgrimes	case '0': case '1': case '2': case '3': case '4':
8151590Srgrimes	case '5': case '6': case '7': case '8': case '9':
816192732Sbrian		if (a->type == 0)
817192732Sbrian			a->type = AT_LINE;
8181590Srgrimes		a->u.l = strtol(p, &end, 10);
8191590Srgrimes		return (end);
8201590Srgrimes	default:
82128066Scharnier		errx(1, "%lu: %s: expected context address", linenum, fname);
8221590Srgrimes		return (NULL);
8231590Srgrimes	}
8241590Srgrimes}
8251590Srgrimes
8261590Srgrimes/*
8271590Srgrimes * duptoeol --
8281590Srgrimes *	Return a copy of all the characters up to \n or \0.
8291590Srgrimes */
8301590Srgrimesstatic char *
831122044Sdesduptoeol(char *s, const char *ctype)
8321590Srgrimes{
8331590Srgrimes	size_t len;
8341590Srgrimes	int ws;
83580286Sobrien	char *p, *start;
8361590Srgrimes
8371590Srgrimes	ws = 0;
8381590Srgrimes	for (start = s; *s != '\0' && *s != '\n'; ++s)
83917522Sache		ws = isspace((unsigned char)*s);
8401590Srgrimes	*s = '\0';
8411590Srgrimes	if (ws)
84228066Scharnier		warnx("%lu: %s: whitespace after %s", linenum, fname, ctype);
8431590Srgrimes	len = s - start + 1;
84480286Sobrien	if ((p = malloc(len)) == NULL)
84580286Sobrien		err(1, "malloc");
84680286Sobrien	return (memmove(p, start, len));
8471590Srgrimes}
8481590Srgrimes
8491590Srgrimes/*
8501590Srgrimes * Convert goto label names to addresses, and count a and r commands, in
8511590Srgrimes * the given subset of the script.  Free the memory used by labels in b
8521590Srgrimes * and t commands (but not by :).
8531590Srgrimes *
8541590Srgrimes * TODO: Remove } nodes
8551590Srgrimes */
8561590Srgrimesstatic void
857122044Sdesfixuplabel(struct s_command *cp, struct s_command *end)
8581590Srgrimes{
8591590Srgrimes
8601590Srgrimes	for (; cp != end; cp = cp->next)
8611590Srgrimes		switch (cp->code) {
8621590Srgrimes		case 'a':
8631590Srgrimes		case 'r':
8641590Srgrimes			appendnum++;
8651590Srgrimes			break;
8661590Srgrimes		case 'b':
8671590Srgrimes		case 't':
8681590Srgrimes			/* Resolve branch target. */
8691590Srgrimes			if (cp->t == NULL) {
8701590Srgrimes				cp->u.c = NULL;
8711590Srgrimes				break;
8721590Srgrimes			}
8731590Srgrimes			if ((cp->u.c = findlabel(cp->t)) == NULL)
87428066Scharnier				errx(1, "%lu: %s: undefined label '%s'", linenum, fname, cp->t);
8751590Srgrimes			free(cp->t);
8761590Srgrimes			break;
8771590Srgrimes		case '{':
8781590Srgrimes			/* Do interior commands. */
8791590Srgrimes			fixuplabel(cp->u.c, cp->next);
8801590Srgrimes			break;
8811590Srgrimes		}
8821590Srgrimes}
8831590Srgrimes
8841590Srgrimes/*
8851590Srgrimes * Associate the given command label for later lookup.
8861590Srgrimes */
8871590Srgrimesstatic void
888122044Sdesenterlabel(struct s_command *cp)
8891590Srgrimes{
89087766Smarkm	struct labhash **lhp, *lh;
89187766Smarkm	u_char *p;
89287766Smarkm	u_int h, c;
8931590Srgrimes
8941590Srgrimes	for (h = 0, p = (u_char *)cp->t; (c = *p) != 0; p++)
8951590Srgrimes		h = (h << 5) + h + c;
8961590Srgrimes	lhp = &labels[h & LHMASK];
8971590Srgrimes	for (lh = *lhp; lh != NULL; lh = lh->lh_next)
8981590Srgrimes		if (lh->lh_hash == h && strcmp(cp->t, lh->lh_cmd->t) == 0)
89928066Scharnier			errx(1, "%lu: %s: duplicate label '%s'", linenum, fname, cp->t);
90080286Sobrien	if ((lh = malloc(sizeof *lh)) == NULL)
90180286Sobrien		err(1, "malloc");
9021590Srgrimes	lh->lh_next = *lhp;
9031590Srgrimes	lh->lh_hash = h;
9041590Srgrimes	lh->lh_cmd = cp;
9051590Srgrimes	lh->lh_ref = 0;
9061590Srgrimes	*lhp = lh;
9071590Srgrimes}
9081590Srgrimes
9091590Srgrimes/*
9101590Srgrimes * Find the label contained in the command l in the command linked
9111590Srgrimes * list cp.  L is excluded from the search.  Return NULL if not found.
9121590Srgrimes */
9131590Srgrimesstatic struct s_command *
914122044Sdesfindlabel(char *name)
9151590Srgrimes{
91687766Smarkm	struct labhash *lh;
91787766Smarkm	u_char *p;
91887766Smarkm	u_int h, c;
9191590Srgrimes
9201590Srgrimes	for (h = 0, p = (u_char *)name; (c = *p) != 0; p++)
9211590Srgrimes		h = (h << 5) + h + c;
9221590Srgrimes	for (lh = labels[h & LHMASK]; lh != NULL; lh = lh->lh_next) {
9231590Srgrimes		if (lh->lh_hash == h && strcmp(name, lh->lh_cmd->t) == 0) {
9241590Srgrimes			lh->lh_ref = 1;
9251590Srgrimes			return (lh->lh_cmd);
9261590Srgrimes		}
9271590Srgrimes	}
9281590Srgrimes	return (NULL);
9291590Srgrimes}
9301590Srgrimes
9318874Srgrimes/*
9321590Srgrimes * Warn about any unused labels.  As a side effect, release the label hash
9331590Srgrimes * table space.
9341590Srgrimes */
9351590Srgrimesstatic void
936122044Sdesuselabel(void)
9371590Srgrimes{
93887766Smarkm	struct labhash *lh, *next;
93987766Smarkm	int i;
9401590Srgrimes
9411590Srgrimes	for (i = 0; i < LHSZ; i++) {
9421590Srgrimes		for (lh = labels[i]; lh != NULL; lh = next) {
9431590Srgrimes			next = lh->lh_next;
9441590Srgrimes			if (!lh->lh_ref)
94528066Scharnier				warnx("%lu: %s: unused label '%s'",
94628066Scharnier				    linenum, fname, lh->lh_cmd->t);
9471590Srgrimes			free(lh);
9481590Srgrimes		}
9491590Srgrimes	}
9501590Srgrimes}
951