ex_subst.c revision 267654
1/*-
2 * Copyright (c) 1992, 1993, 1994
3 *	The Regents of the University of California.  All rights reserved.
4 * Copyright (c) 1992, 1993, 1994, 1995, 1996
5 *	Keith Bostic.  All rights reserved.
6 *
7 * See the LICENSE file for redistribution information.
8 */
9
10#include "config.h"
11
12#ifndef lint
13static const char sccsid[] = "@(#)ex_subst.c	10.37 (Berkeley) 9/15/96";
14#endif /* not lint */
15
16#include <sys/types.h>
17#include <sys/queue.h>
18#include <sys/time.h>
19
20#include <bitstring.h>
21#include <ctype.h>
22#include <errno.h>
23#include <limits.h>
24#include <stdio.h>
25#include <stdlib.h>
26#include <string.h>
27#include <unistd.h>
28
29#include "../common/common.h"
30#include "../vi/vi.h"
31
/*
 * Flags passed by the ex_s()/ex_subagain()/ex_subtilde() front ends to s().
 */
#define	SUB_FIRST	0x01		/* The 'r' flag isn't reasonable. */
#define	SUB_MUSTSETR	0x02		/* The 'r' flag is required. */

/*
 * File-local helpers.  The re_*_conv routines rewrite a pattern before it
 * is compiled by re_compile(); s() is the shared substitution engine.
 */
static int re_conv __P((SCR *, char **, size_t *, int *));
static int re_cscope_conv __P((SCR *, char **, size_t *, int *));
static int re_sub __P((SCR *,
		char *, char **, size_t *, size_t *, regmatch_t [10]));
static int re_tag_conv __P((SCR *, char **, size_t *, int *));
static int s __P((SCR *, EXCMD *, char *, regex_t *, u_int));
41
42/*
43 * ex_s --
44 *	[line [,line]] s[ubstitute] [[/;]pat[/;]/repl[/;] [cgr] [count] [#lp]]
45 *
46 *	Substitute on lines matching a pattern.
47 *
48 * PUBLIC: int ex_s __P((SCR *, EXCMD *));
49 */
50int
51ex_s(sp, cmdp)
52	SCR *sp;
53	EXCMD *cmdp;
54{
55	regex_t *re;
56	size_t blen, len;
57	u_int flags;
58	int delim;
59	char *bp, *ptrn, *rep, *p, *t;
60
61	/*
62	 * Skip leading white space.
63	 *
64	 * !!!
65	 * Historic vi allowed any non-alphanumeric to serve as the
66	 * substitution command delimiter.
67	 *
68	 * !!!
69	 * If the arguments are empty, it's the same as &, i.e. we
70	 * repeat the last substitution.
71	 */
72	if (cmdp->argc == 0)
73		goto subagain;
74	for (p = cmdp->argv[0]->bp,
75	    len = cmdp->argv[0]->len; len > 0; --len, ++p) {
76		if (!isblank(*p))
77			break;
78	}
79	if (len == 0)
80subagain:	return (ex_subagain(sp, cmdp));
81
82	delim = *p++;
83	if (isalnum(delim) || delim == '\\')
84		return (s(sp, cmdp, p, &sp->subre_c, SUB_MUSTSETR));
85
86	/*
87	 * !!!
88	 * The full-blown substitute command reset the remembered
89	 * state of the 'c' and 'g' suffices.
90	 */
91	sp->c_suffix = sp->g_suffix = 0;
92
93	/*
94	 * Get the pattern string, toss escaping characters.
95	 *
96	 * !!!
97	 * Historic vi accepted any of the following forms:
98	 *
99	 *	:s/abc/def/		change "abc" to "def"
100	 *	:s/abc/def		change "abc" to "def"
101	 *	:s/abc/			delete "abc"
102	 *	:s/abc			delete "abc"
103	 *
104	 * QUOTING NOTE:
105	 *
106	 * Only toss an escaping character if it escapes a delimiter.
107	 * This means that "s/A/\\\\f" replaces "A" with "\\f".  It
108	 * would be nice to be more regular, i.e. for each layer of
109	 * escaping a single escaping character is removed, but that's
110	 * not how the historic vi worked.
111	 */
112	for (ptrn = t = p;;) {
113		if (p[0] == '\0' || p[0] == delim) {
114			if (p[0] == delim)
115				++p;
116			/*
117			 * !!!
118			 * Nul terminate the pattern string -- it's passed
119			 * to regcomp which doesn't understand anything else.
120			 */
121			*t = '\0';
122			break;
123		}
124		if (p[0] == '\\')
125			if (p[1] == delim)
126				++p;
127			else if (p[1] == '\\')
128				*t++ = *p++;
129		*t++ = *p++;
130	}
131
132	/*
133	 * If the pattern string is empty, use the last RE (not just the
134	 * last substitution RE).
135	 */
136	if (*ptrn == '\0') {
137		if (sp->re == NULL) {
138			ex_emsg(sp, NULL, EXM_NOPREVRE);
139			return (1);
140		}
141
142		/* Re-compile the RE if necessary. */
143		if (!F_ISSET(sp, SC_RE_SEARCH) && re_compile(sp,
144		    sp->re, sp->re_len, NULL, NULL, &sp->re_c, RE_C_SEARCH))
145			return (1);
146		flags = 0;
147	} else {
148		/*
149		 * !!!
150		 * Compile the RE.  Historic practice is that substitutes set
151		 * the search direction as well as both substitute and search
152		 * RE's.  We compile the RE twice, as we don't want to bother
153		 * ref counting the pattern string and (opaque) structure.
154		 */
155		if (re_compile(sp, ptrn, t - ptrn,
156		    &sp->re, &sp->re_len, &sp->re_c, RE_C_SEARCH))
157			return (1);
158		if (re_compile(sp, ptrn, t - ptrn,
159		    &sp->subre, &sp->subre_len, &sp->subre_c, RE_C_SUBST))
160			return (1);
161
162		flags = SUB_FIRST;
163		sp->searchdir = FORWARD;
164	}
165	re = &sp->re_c;
166
167	/*
168	 * Get the replacement string.
169	 *
170	 * The special character & (\& if O_MAGIC not set) matches the
171	 * entire RE.  No handling of & is required here, it's done by
172	 * re_sub().
173	 *
174	 * The special character ~ (\~ if O_MAGIC not set) inserts the
175	 * previous replacement string into this replacement string.
176	 * Count ~'s to figure out how much space we need.  We could
177	 * special case nonexistent last patterns or whether or not
178	 * O_MAGIC is set, but it's probably not worth the effort.
179	 *
180	 * QUOTING NOTE:
181	 *
182	 * Only toss an escaping character if it escapes a delimiter or
183	 * if O_MAGIC is set and it escapes a tilde.
184	 *
185	 * !!!
186	 * If the entire replacement pattern is "%", then use the last
187	 * replacement pattern.  This semantic was added to vi in System
188	 * V and then percolated elsewhere, presumably around the time
189	 * that it was added to their version of ed(1).
190	 */
191	if (p[0] == '\0' || p[0] == delim) {
192		if (p[0] == delim)
193			++p;
194		if (sp->repl != NULL)
195			free(sp->repl);
196		sp->repl = NULL;
197		sp->repl_len = 0;
198	} else if (p[0] == '%' && (p[1] == '\0' || p[1] == delim))
199		p += p[1] == delim ? 2 : 1;
200	else {
201		for (rep = p, len = 0;
202		    p[0] != '\0' && p[0] != delim; ++p, ++len)
203			if (p[0] == '~')
204				len += sp->repl_len;
205		GET_SPACE_RET(sp, bp, blen, len);
206		for (t = bp, len = 0, p = rep;;) {
207			if (p[0] == '\0' || p[0] == delim) {
208				if (p[0] == delim)
209					++p;
210				break;
211			}
212			if (p[0] == '\\') {
213				if (p[1] == delim)
214					++p;
215				else if (p[1] == '\\') {
216					*t++ = *p++;
217					++len;
218				} else if (p[1] == '~') {
219					++p;
220					if (!O_ISSET(sp, O_MAGIC))
221						goto tilde;
222				}
223			} else if (p[0] == '~' && O_ISSET(sp, O_MAGIC)) {
224tilde:				++p;
225				memcpy(t, sp->repl, sp->repl_len);
226				t += sp->repl_len;
227				len += sp->repl_len;
228				continue;
229			}
230			*t++ = *p++;
231			++len;
232		}
233		if ((sp->repl_len = len) != 0) {
234			if (sp->repl != NULL)
235				free(sp->repl);
236			if ((sp->repl = malloc(len)) == NULL) {
237				msgq(sp, M_SYSERR, NULL);
238				FREE_SPACE(sp, bp, blen);
239				return (1);
240			}
241			memcpy(sp->repl, bp, len);
242		}
243		FREE_SPACE(sp, bp, blen);
244	}
245	return (s(sp, cmdp, p, re, flags));
246}
247
248/*
249 * ex_subagain --
250 *	[line [,line]] & [cgr] [count] [#lp]]
251 *
252 *	Substitute using the last substitute RE and replacement pattern.
253 *
254 * PUBLIC: int ex_subagain __P((SCR *, EXCMD *));
255 */
256int
257ex_subagain(sp, cmdp)
258	SCR *sp;
259	EXCMD *cmdp;
260{
261	if (sp->subre == NULL) {
262		ex_emsg(sp, NULL, EXM_NOPREVRE);
263		return (1);
264	}
265	if (!F_ISSET(sp, SC_RE_SUBST) && re_compile(sp,
266	    sp->subre, sp->subre_len, NULL, NULL, &sp->subre_c, RE_C_SUBST))
267		return (1);
268	return (s(sp,
269	    cmdp, cmdp->argc ? cmdp->argv[0]->bp : NULL, &sp->subre_c, 0));
270}
271
272/*
273 * ex_subtilde --
274 *	[line [,line]] ~ [cgr] [count] [#lp]]
275 *
276 *	Substitute using the last RE and last substitute replacement pattern.
277 *
278 * PUBLIC: int ex_subtilde __P((SCR *, EXCMD *));
279 */
280int
281ex_subtilde(sp, cmdp)
282	SCR *sp;
283	EXCMD *cmdp;
284{
285	if (sp->re == NULL) {
286		ex_emsg(sp, NULL, EXM_NOPREVRE);
287		return (1);
288	}
289	if (!F_ISSET(sp, SC_RE_SEARCH) && re_compile(sp,
290	    sp->re, sp->re_len, NULL, NULL, &sp->re_c, RE_C_SEARCH))
291		return (1);
292	return (s(sp,
293	    cmdp, cmdp->argc ? cmdp->argv[0]->bp : NULL, &sp->re_c, 0));
294}
295
296/*
297 * s --
298 * Do the substitution.  This stuff is *really* tricky.  There are lots of
299 * special cases, and general nastiness.  Don't mess with it unless you're
300 * pretty confident.
301 *
302 * The nasty part of the substitution is what happens when the replacement
303 * string contains newlines.  It's a bit tricky -- consider the information
304 * that has to be retained for "s/f\(o\)o/^M\1^M\1/".  The solution here is
305 * to build a set of newline offsets which we use to break the line up later,
306 * when the replacement is done.  Don't change it unless you're *damned*
307 * confident.
308 */
/*
 * NEEDNEWLINE --
 *	Grow the sp->newl array by 25 entries when it is full (newl_len ==
 *	newl_cnt).  If the array is NULL after the REALLOC, newl_len is
 *	reset and the ENCLOSING FUNCTION returns 1.
 *
 * The macros below expand to a brace block rather than do/while(0)
 * because existing call sites invoke them without a trailing semicolon.
 */
#define	NEEDNEWLINE(sp) {						\
	if ((sp)->newl_len == (sp)->newl_cnt) {				\
		(sp)->newl_len += 25;					\
		REALLOC((sp), (sp)->newl, size_t *,			\
		    (sp)->newl_len * sizeof(size_t));			\
		if ((sp)->newl == NULL) {				\
			(sp)->newl_len = 0;				\
			return (1);					\
		}							\
	}								\
}

/*
 * BUILD --
 *	Append len bytes from l to the caller's build buffer.  Uses the
 *	caller's locals lb (buffer), lbclen (bytes used) and lblen (bytes
 *	allocated), growing lb first if needed.  If lb is NULL after the
 *	REALLOC, the ENCLOSING FUNCTION returns 1.
 */
#define	BUILD(sp, l, len) {						\
	if (lbclen + (len) > lblen) {					\
		lblen += MAX(lbclen + (len), 256);			\
		REALLOC((sp), lb, char *, lblen);			\
		if (lb == NULL) {					\
			lbclen = 0;					\
			return (1);					\
		}							\
	}								\
	memcpy(lb + lbclen, (l), (len));				\
	lbclen += (len);						\
}

/*
 * NEEDSP --
 *	Make sure the caller's build buffer has room for len more bytes,
 *	without copying anything.  If the buffer is reallocated, pnt is
 *	reset to the current end of the built data (lb + lbclen).  If lb
 *	is NULL after the REALLOC, the ENCLOSING FUNCTION returns 1.
 */
#define	NEEDSP(sp, len, pnt) {						\
	if (lbclen + (len) > lblen) {					\
		lblen += MAX(lbclen + (len), 256);			\
		REALLOC((sp), lb, char *, lblen);			\
		if (lb == NULL) {					\
			lbclen = 0;					\
			return (1);					\
		}							\
		(pnt) = lb + lbclen;					\
	}								\
}
345
346static int
347s(sp, cmdp, s, re, flags)
348	SCR *sp;
349	EXCMD *cmdp;
350	char *s;
351	regex_t *re;
352	u_int flags;
353{
354	EVENT ev;
355	MARK from, to;
356	TEXTH tiq;
357	recno_t elno, lno, slno;
358	regmatch_t match[10];
359	size_t blen, cnt, last, lbclen, lblen, len, llen;
360	size_t offset, saved_offset, scno;
361	int cflag, lflag, nflag, pflag, rflag;
362	int didsub, do_eol_match, eflags, empty_ok, eval;
363	int linechanged, matched, quit, rval;
364	char *bp, *lb;
365
366	NEEDFILE(sp, cmdp);
367
368	slno = sp->lno;
369	scno = sp->cno;
370
371	/*
372	 * !!!
373	 * Historically, the 'g' and 'c' suffices were always toggled as flags,
374	 * so ":s/A/B/" was the same as ":s/A/B/ccgg".  If O_EDCOMPATIBLE was
375	 * not set, they were initialized to 0 for all substitute commands.  If
376	 * O_EDCOMPATIBLE was set, they were initialized to 0 only if the user
377	 * specified substitute/replacement patterns (see ex_s()).
378	 */
379	if (!O_ISSET(sp, O_EDCOMPATIBLE))
380		sp->c_suffix = sp->g_suffix = 0;
381
382	/*
383	 * Historic vi permitted the '#', 'l' and 'p' options in vi mode, but
384	 * it only displayed the last change.  I'd disallow them, but they are
385	 * useful in combination with the [v]global commands.  In the current
386	 * model the problem is combining them with the 'c' flag -- the screen
387	 * would have to flip back and forth between the confirm screen and the
388	 * ex print screen, which would be pretty awful.  We do display all
389	 * changes, though, for what that's worth.
390	 *
391	 * !!!
392	 * Historic vi was fairly strict about the order of "options", the
393	 * count, and "flags".  I'm somewhat fuzzy on the difference between
394	 * options and flags, anyway, so this is a simpler approach, and we
395	 * just take it them in whatever order the user gives them.  (The ex
396	 * usage statement doesn't reflect this.)
397	 */
398	cflag = lflag = nflag = pflag = rflag = 0;
399	if (s == NULL)
400		goto noargs;
401	for (lno = OOBLNO; *s != '\0'; ++s)
402		switch (*s) {
403		case ' ':
404		case '\t':
405			continue;
406		case '+':
407			++cmdp->flagoff;
408			break;
409		case '-':
410			--cmdp->flagoff;
411			break;
412		case '0': case '1': case '2': case '3': case '4':
413		case '5': case '6': case '7': case '8': case '9':
414			if (lno != OOBLNO)
415				goto usage;
416			errno = 0;
417			lno = strtoul(s, &s, 10);
418			if (*s == '\0')		/* Loop increment correction. */
419				--s;
420			if (errno == ERANGE) {
421				if (lno == LONG_MAX)
422					msgq(sp, M_ERR, "153|Count overflow");
423				else if (lno == LONG_MIN)
424					msgq(sp, M_ERR, "154|Count underflow");
425				else
426					msgq(sp, M_SYSERR, NULL);
427				return (1);
428			}
429			/*
430			 * In historic vi, the count was inclusive from the
431			 * second address.
432			 */
433			cmdp->addr1.lno = cmdp->addr2.lno;
434			cmdp->addr2.lno += lno - 1;
435			if (!db_exist(sp, cmdp->addr2.lno) &&
436			    db_last(sp, &cmdp->addr2.lno))
437				return (1);
438			break;
439		case '#':
440			nflag = 1;
441			break;
442		case 'c':
443			sp->c_suffix = !sp->c_suffix;
444
445			/* Ex text structure initialization. */
446			if (F_ISSET(sp, SC_EX)) {
447				memset(&tiq, 0, sizeof(TEXTH));
448				CIRCLEQ_INIT(&tiq);
449			}
450			break;
451		case 'g':
452			sp->g_suffix = !sp->g_suffix;
453			break;
454		case 'l':
455			lflag = 1;
456			break;
457		case 'p':
458			pflag = 1;
459			break;
460		case 'r':
461			if (LF_ISSET(SUB_FIRST)) {
462				msgq(sp, M_ERR,
463		    "155|Regular expression specified; r flag meaningless");
464				return (1);
465			}
466			if (!F_ISSET(sp, SC_RE_SEARCH)) {
467				ex_emsg(sp, NULL, EXM_NOPREVRE);
468				return (1);
469			}
470			rflag = 1;
471			re = &sp->re_c;
472			break;
473		default:
474			goto usage;
475		}
476
477	if (*s != '\0' || !rflag && LF_ISSET(SUB_MUSTSETR)) {
478usage:		ex_emsg(sp, cmdp->cmd->usage, EXM_USAGE);
479		return (1);
480	}
481
482noargs:	if (F_ISSET(sp, SC_VI) && sp->c_suffix && (lflag || nflag || pflag)) {
483		msgq(sp, M_ERR,
484"156|The #, l and p flags may not be combined with the c flag in vi mode");
485		return (1);
486	}
487
488	/*
489	 * bp:		if interactive, line cache
490	 * blen:	if interactive, line cache length
491	 * lb:		build buffer pointer.
492	 * lbclen:	current length of built buffer.
493	 * lblen;	length of build buffer.
494	 */
495	bp = lb = NULL;
496	blen = lbclen = lblen = 0;
497
498	/* For each line... */
499	for (matched = quit = 0, lno = cmdp->addr1.lno,
500	    elno = cmdp->addr2.lno; !quit && lno <= elno; ++lno) {
501
502		/* Someone's unhappy, time to stop. */
503		if (INTERRUPTED(sp))
504			break;
505
506		/* Get the line. */
507		if (db_get(sp, lno, DBG_FATAL, &s, &llen))
508			goto err;
509
510		/*
511		 * Make a local copy if doing confirmation -- when calling
512		 * the confirm routine we're likely to lose the cached copy.
513		 */
514		if (sp->c_suffix) {
515			if (bp == NULL) {
516				GET_SPACE_RET(sp, bp, blen, llen);
517			} else
518				ADD_SPACE_RET(sp, bp, blen, llen);
519			memcpy(bp, s, llen);
520			s = bp;
521		}
522
523		/* Start searching from the beginning. */
524		offset = 0;
525		len = llen;
526
527		/* Reset the build buffer offset. */
528		lbclen = 0;
529
530		/* Reset empty match flag. */
531		empty_ok = 1;
532
533		/*
534		 * We don't want to have to do a setline if the line didn't
535		 * change -- keep track of whether or not this line changed.
536		 * If doing confirmations, don't want to keep setting the
537		 * line if change is refused -- keep track of substitutions.
538		 */
539		didsub = linechanged = 0;
540
541		/* New line, do an EOL match. */
542		do_eol_match = 1;
543
544		/* It's not nul terminated, but we pretend it is. */
545		eflags = REG_STARTEND;
546
547		/*
548		 * The search area is from s + offset to the EOL.
549		 *
550		 * Generally, match[0].rm_so is the offset of the start
551		 * of the match from the start of the search, and offset
552		 * is the offset of the start of the last search.
553		 */
554nextmatch:	match[0].rm_so = 0;
555		match[0].rm_eo = len;
556
557		/* Get the next match. */
558		eval = regexec(re, (char *)s + offset, 10, match, eflags);
559
560		/*
561		 * There wasn't a match or if there was an error, deal with
562		 * it.  If there was a previous match in this line, resolve
563		 * the changes into the database.  Otherwise, just move on.
564		 */
565		if (eval == REG_NOMATCH)
566			goto endmatch;
567		if (eval != 0) {
568			re_error(sp, eval, re);
569			goto err;
570		}
571		matched = 1;
572
573		/* Only the first search can match an anchored expression. */
574		eflags |= REG_NOTBOL;
575
576		/*
577		 * !!!
578		 * It's possible to match 0-length strings -- for example, the
579		 * command s;a*;X;, when matched against the string "aabb" will
580		 * result in "XbXbX", i.e. the matches are "aa", the space
581		 * between the b's and the space between the b's and the end of
582		 * the string.  There is a similar space between the beginning
583		 * of the string and the a's.  The rule that we use (because vi
584		 * historically used it) is that any 0-length match, occurring
585		 * immediately after a match, is ignored.  Otherwise, the above
586		 * example would have resulted in "XXbXbX".  Another example is
587		 * incorrectly using " *" to replace groups of spaces with one
588		 * space.
589		 *
590		 * The way we do this is that if we just had a successful match,
591		 * the starting offset does not skip characters, and the match
592		 * is empty, ignore the match and move forward.  If there's no
593		 * more characters in the string, we were attempting to match
594		 * after the last character, so quit.
595		 */
596		if (!empty_ok && match[0].rm_so == 0 && match[0].rm_eo == 0) {
597			empty_ok = 1;
598			if (len == 0)
599				goto endmatch;
600			BUILD(sp, s + offset, 1)
601			++offset;
602			--len;
603			goto nextmatch;
604		}
605
606		/* Confirm change. */
607		if (sp->c_suffix) {
608			/*
609			 * Set the cursor position for confirmation.  Note,
610			 * if we matched on a '$', the cursor may be past
611			 * the end of line.
612			 */
613			from.lno = to.lno = lno;
614			from.cno = match[0].rm_so + offset;
615			to.cno = match[0].rm_eo + offset;
616			/*
617			 * Both ex and vi have to correct for a change before
618			 * the first character in the line.
619			 */
620			if (llen == 0)
621				from.cno = to.cno = 0;
622			if (F_ISSET(sp, SC_VI)) {
623				/*
624				 * Only vi has to correct for a change after
625				 * the last character in the line.
626				 *
627				 * XXX
628				 * It would be nice to change the vi code so
629				 * that we could display a cursor past EOL.
630				 */
631				if (to.cno >= llen)
632					to.cno = llen - 1;
633				if (from.cno >= llen)
634					from.cno = llen - 1;
635
636				sp->lno = from.lno;
637				sp->cno = from.cno;
638				if (vs_refresh(sp, 1))
639					goto err;
640
641				vs_update(sp, msg_cat(sp,
642				    "169|Confirm change? [n]", NULL), NULL);
643
644				if (v_event_get(sp, &ev, 0, 0))
645					goto err;
646				switch (ev.e_event) {
647				case E_CHARACTER:
648					break;
649				case E_EOF:
650				case E_ERR:
651				case E_INTERRUPT:
652					goto lquit;
653				default:
654					v_event_err(sp, &ev);
655					goto lquit;
656				}
657			} else {
658				if (ex_print(sp, cmdp, &from, &to, 0) ||
659				    ex_scprint(sp, &from, &to))
660					goto lquit;
661				if (ex_txt(sp, &tiq, 0, TXT_CR))
662					goto err;
663				ev.e_c = tiq.cqh_first->lb[0];
664			}
665
666			switch (ev.e_c) {
667			case CH_YES:
668				break;
669			default:
670			case CH_NO:
671				didsub = 0;
672				BUILD(sp, s +offset, match[0].rm_eo);
673				goto skip;
674			case CH_QUIT:
675				/* Set the quit/interrupted flags. */
676lquit:				quit = 1;
677				F_SET(sp->gp, G_INTERRUPTED);
678
679				/*
680				 * Resolve any changes, then return to (and
681				 * exit from) the main loop.
682				 */
683				goto endmatch;
684			}
685		}
686
687		/*
688		 * Set the cursor to the last position changed, converting
689		 * from 1-based to 0-based.
690		 */
691		sp->lno = lno;
692		sp->cno = match[0].rm_so;
693
694		/* Copy the bytes before the match into the build buffer. */
695		BUILD(sp, s + offset, match[0].rm_so);
696
697		/* Substitute the matching bytes. */
698		didsub = 1;
699		if (re_sub(sp, s + offset, &lb, &lbclen, &lblen, match))
700			goto err;
701
702		/* Set the change flag so we know this line was modified. */
703		linechanged = 1;
704
705		/* Move past the matched bytes. */
706skip:		offset += match[0].rm_eo;
707		len -= match[0].rm_eo;
708
709		/* A match cannot be followed by an empty pattern. */
710		empty_ok = 0;
711
712		/*
713		 * If doing a global change with confirmation, we have to
714		 * update the screen.  The basic idea is to store the line
715		 * so the screen update routines can find it, and restart.
716		 */
717		if (didsub && sp->c_suffix && sp->g_suffix) {
718			/*
719			 * The new search offset will be the end of the
720			 * modified line.
721			 */
722			saved_offset = lbclen;
723
724			/* Copy the rest of the line. */
725			if (len)
726				BUILD(sp, s + offset, len)
727
728			/* Set the new offset. */
729			offset = saved_offset;
730
731			/* Store inserted lines, adjusting the build buffer. */
732			last = 0;
733			if (sp->newl_cnt) {
734				for (cnt = 0;
735				    cnt < sp->newl_cnt; ++cnt, ++lno, ++elno) {
736					if (db_insert(sp, lno,
737					    lb + last, sp->newl[cnt] - last))
738						goto err;
739					last = sp->newl[cnt] + 1;
740					++sp->rptlines[L_ADDED];
741				}
742				lbclen -= last;
743				offset -= last;
744				sp->newl_cnt = 0;
745			}
746
747			/* Store and retrieve the line. */
748			if (db_set(sp, lno, lb + last, lbclen))
749				goto err;
750			if (db_get(sp, lno, DBG_FATAL, &s, &llen))
751				goto err;
752			ADD_SPACE_RET(sp, bp, blen, llen)
753			memcpy(bp, s, llen);
754			s = bp;
755			len = llen - offset;
756
757			/* Restart the build. */
758			lbclen = 0;
759			BUILD(sp, s, offset);
760
761			/*
762			 * If we haven't already done the after-the-string
763			 * match, do one.  Set REG_NOTEOL so the '$' pattern
764			 * only matches once.
765			 */
766			if (!do_eol_match)
767				goto endmatch;
768			if (offset == len) {
769				do_eol_match = 0;
770				eflags |= REG_NOTEOL;
771			}
772			goto nextmatch;
773		}
774
775		/*
776		 * If it's a global:
777		 *
778		 * If at the end of the string, do a test for the after
779		 * the string match.  Set REG_NOTEOL so the '$' pattern
780		 * only matches once.
781		 */
782		if (sp->g_suffix && do_eol_match) {
783			if (len == 0) {
784				do_eol_match = 0;
785				eflags |= REG_NOTEOL;
786			}
787			goto nextmatch;
788		}
789
790endmatch:	if (!linechanged)
791			continue;
792
793		/* Copy any remaining bytes into the build buffer. */
794		if (len)
795			BUILD(sp, s + offset, len)
796
797		/* Store inserted lines, adjusting the build buffer. */
798		last = 0;
799		if (sp->newl_cnt) {
800			for (cnt = 0;
801			    cnt < sp->newl_cnt; ++cnt, ++lno, ++elno) {
802				if (db_insert(sp,
803				    lno, lb + last, sp->newl[cnt] - last))
804					goto err;
805				last = sp->newl[cnt] + 1;
806				++sp->rptlines[L_ADDED];
807			}
808			lbclen -= last;
809			sp->newl_cnt = 0;
810		}
811
812		/* Store the changed line. */
813		if (db_set(sp, lno, lb + last, lbclen))
814			goto err;
815
816		/* Update changed line counter. */
817		if (sp->rptlchange != lno) {
818			sp->rptlchange = lno;
819			++sp->rptlines[L_CHANGED];
820		}
821
822		/*
823		 * !!!
824		 * Display as necessary.  Historic practice is to only
825		 * display the last line of a line split into multiple
826		 * lines.
827		 */
828		if (lflag || nflag || pflag) {
829			from.lno = to.lno = lno;
830			from.cno = to.cno = 0;
831			if (lflag)
832				(void)ex_print(sp, cmdp, &from, &to, E_C_LIST);
833			if (nflag)
834				(void)ex_print(sp, cmdp, &from, &to, E_C_HASH);
835			if (pflag)
836				(void)ex_print(sp, cmdp, &from, &to, E_C_PRINT);
837		}
838	}
839
840	/*
841	 * !!!
842	 * Historically, vi attempted to leave the cursor at the same place if
843	 * the substitution was done at the current cursor position.  Otherwise
844	 * it moved it to the first non-blank of the last line changed.  There
845	 * were some problems: for example, :s/$/foo/ with the cursor on the
846	 * last character of the line left the cursor on the last character, or
847	 * the & command with multiple occurrences of the matching string in the
848	 * line usually left the cursor in a fairly random position.
849	 *
850	 * We try to do the same thing, with the exception that if the user is
851	 * doing substitution with confirmation, we move to the last line about
852	 * which the user was consulted, as opposed to the last line that they
853	 * actually changed.  This prevents a screen flash if the user doesn't
854	 * change many of the possible lines.
855	 */
856	if (!sp->c_suffix && (sp->lno != slno || sp->cno != scno)) {
857		sp->cno = 0;
858		(void)nonblank(sp, sp->lno, &sp->cno);
859	}
860
861	/*
862	 * If not in a global command, and nothing matched, say so.
863	 * Else, if none of the lines displayed, put something up.
864	 */
865	rval = 0;
866	if (!matched) {
867		if (!F_ISSET(sp, SC_EX_GLOBAL)) {
868			msgq(sp, M_ERR, "157|No match found");
869			goto err;
870		}
871	} else if (!lflag && !nflag && !pflag)
872		F_SET(cmdp, E_AUTOPRINT);
873
874	if (0) {
875err:		rval = 1;
876	}
877
878	if (bp != NULL)
879		FREE_SPACE(sp, bp, blen);
880	if (lb != NULL)
881		free(lb);
882	return (rval);
883}
884
885/*
886 * re_compile --
887 *	Compile the RE.
888 *
889 * PUBLIC: int re_compile __P((SCR *,
890 * PUBLIC:     char *, size_t, char **, size_t *, regex_t *, u_int));
891 */
892int
893re_compile(sp, ptrn, plen, ptrnp, lenp, rep, flags)
894	SCR *sp;
895	char *ptrn, **ptrnp;
896	size_t plen, *lenp;
897	regex_t *rep;
898	u_int flags;
899{
900	size_t len;
901	int reflags, replaced, rval;
902	char *p;
903
904	/* Set RE flags. */
905	reflags = 0;
906	if (!LF_ISSET(RE_C_CSCOPE | RE_C_TAG)) {
907		if (O_ISSET(sp, O_EXTENDED))
908			reflags |= REG_EXTENDED;
909		if (O_ISSET(sp, O_IGNORECASE))
910			reflags |= REG_ICASE;
911		if (O_ISSET(sp, O_ICLOWER)) {
912			for (p = ptrn, len = plen; len > 0; ++p, --len)
913				if (isupper(*p))
914					break;
915			if (len == 0)
916				reflags |= REG_ICASE;
917		}
918	}
919
920	/* If we're replacing a saved value, clear the old one. */
921	if (LF_ISSET(RE_C_SEARCH) && F_ISSET(sp, SC_RE_SEARCH)) {
922		regfree(&sp->re_c);
923		F_CLR(sp, SC_RE_SEARCH);
924	}
925	if (LF_ISSET(RE_C_SUBST) && F_ISSET(sp, SC_RE_SUBST)) {
926		regfree(&sp->subre_c);
927		F_CLR(sp, SC_RE_SUBST);
928	}
929
930	/*
931	 * If we're saving the string, it's a pattern we haven't seen before,
932	 * so convert the vi-style RE's to POSIX 1003.2 RE's.  Save a copy for
933	 * later recompilation.   Free any previously saved value.
934	 */
935	if (ptrnp != NULL) {
936		if (LF_ISSET(RE_C_CSCOPE)) {
937			if (re_cscope_conv(sp, &ptrn, &plen, &replaced))
938				return (1);
939			/*
940			 * XXX
941			 * Currently, the match-any-<blank> expression used in
942			 * re_cscope_conv() requires extended RE's.  This may
943			 * not be right or safe.
944			 */
945			reflags |= REG_EXTENDED;
946		} else if (LF_ISSET(RE_C_TAG)) {
947			if (re_tag_conv(sp, &ptrn, &plen, &replaced))
948				return (1);
949		} else
950			if (re_conv(sp, &ptrn, &plen, &replaced))
951				return (1);
952
953		/* Discard previous pattern. */
954		if (*ptrnp != NULL) {
955			free(*ptrnp);
956			*ptrnp = NULL;
957		}
958		if (lenp != NULL)
959			*lenp = plen;
960
961		/*
962		 * Copy the string into allocated memory.
963		 *
964		 * XXX
965		 * Regcomp isn't 8-bit clean, so the pattern is nul-terminated
966		 * for now.  There's just no other solution.
967		 */
968		MALLOC(sp, *ptrnp, char *, plen + 1);
969		if (*ptrnp != NULL) {
970			memcpy(*ptrnp, ptrn, plen);
971			(*ptrnp)[plen] = '\0';
972		}
973
974		/* Free up conversion-routine-allocated memory. */
975		if (replaced)
976			FREE_SPACE(sp, ptrn, 0);
977
978		if (*ptrnp == NULL)
979			return (1);
980
981		ptrn = *ptrnp;
982	}
983
984	/*
985	 * XXX
986	 * Regcomp isn't 8-bit clean, so we just lost if the pattern
987	 * contained a nul.  Bummer!
988	 */
989	if ((rval = regcomp(rep, ptrn, /* plen, */ reflags)) != 0) {
990		if (!LF_ISSET(RE_C_SILENT))
991			re_error(sp, rval, rep);
992		return (1);
993	}
994
995	if (LF_ISSET(RE_C_SEARCH))
996		F_SET(sp, SC_RE_SEARCH);
997	if (LF_ISSET(RE_C_SUBST))
998		F_SET(sp, SC_RE_SUBST);
999
1000	return (0);
1001}
1002
1003/*
1004 * re_conv --
1005 *	Convert vi's regular expressions into something that the
1006 *	the POSIX 1003.2 RE functions can handle.
1007 *
1008 * There are three conversions we make to make vi's RE's (specifically
1009 * the global, search, and substitute patterns) work with POSIX RE's.
1010 *
1011 * 1: If O_MAGIC is not set, strip backslashes from the magic character
1012 *    set (.[*~) that have them, and add them to the ones that don't.
1013 * 2: If O_MAGIC is not set, the string "\~" is replaced with the text
1014 *    from the last substitute command's replacement string.  If O_MAGIC
1015 *    is set, it's the string "~".
1016 * 3: The pattern \<ptrn\> does "word" searches, convert it to use the
1017 *    new RE escapes.
1018 *
1019 * !!!/XXX
1020 * This doesn't exactly match the historic behavior of vi because we do
1021 * the ~ substitution before calling the RE engine, so magic characters
1022 * in the replacement string will be expanded by the RE engine, and they
1023 * weren't historically.  It's a bug.
1024 */
1025static int
1026re_conv(sp, ptrnp, plenp, replacedp)
1027	SCR *sp;
1028	char **ptrnp;
1029	size_t *plenp;
1030	int *replacedp;
1031{
1032	size_t blen, len, needlen;
1033	int magic;
1034	char *bp, *p, *t;
1035
1036	/*
1037	 * First pass through, we figure out how much space we'll need.
1038	 * We do it in two passes, on the grounds that most of the time
1039	 * the user is doing a search and won't have magic characters.
1040	 * That way we can skip most of the memory allocation and copies.
1041	 */
1042	magic = 0;
1043	for (p = *ptrnp, len = *plenp, needlen = 0; len > 0; ++p, --len)
1044		switch (*p) {
1045		case '\\':
1046			if (len > 1) {
1047				--len;
1048				switch (*++p) {
1049				case '<':
1050					magic = 1;
1051					needlen += sizeof(RE_WSTART);
1052					break;
1053				case '>':
1054					magic = 1;
1055					needlen += sizeof(RE_WSTOP);
1056					break;
1057				case '~':
1058					if (!O_ISSET(sp, O_MAGIC)) {
1059						magic = 1;
1060						needlen += sp->repl_len;
1061					}
1062					break;
1063				case '.':
1064				case '[':
1065				case '*':
1066					if (!O_ISSET(sp, O_MAGIC)) {
1067						magic = 1;
1068						needlen += 1;
1069					}
1070					break;
1071				default:
1072					needlen += 2;
1073				}
1074			} else
1075				needlen += 1;
1076			break;
1077		case '~':
1078			if (O_ISSET(sp, O_MAGIC)) {
1079				magic = 1;
1080				needlen += sp->repl_len;
1081			}
1082			break;
1083		case '.':
1084		case '[':
1085		case '*':
1086			if (!O_ISSET(sp, O_MAGIC)) {
1087				magic = 1;
1088				needlen += 2;
1089			}
1090			break;
1091		default:
1092			needlen += 1;
1093			break;
1094		}
1095
1096	if (!magic) {
1097		*replacedp = 0;
1098		return (0);
1099	}
1100
1101	/* Get enough memory to hold the final pattern. */
1102	*replacedp = 1;
1103	GET_SPACE_RET(sp, bp, blen, needlen);
1104
1105	for (p = *ptrnp, len = *plenp, t = bp; len > 0; ++p, --len)
1106		switch (*p) {
1107		case '\\':
1108			if (len > 1) {
1109				--len;
1110				switch (*++p) {
1111				case '<':
1112					memcpy(t,
1113					    RE_WSTART, sizeof(RE_WSTART) - 1);
1114					t += sizeof(RE_WSTART) - 1;
1115					break;
1116				case '>':
1117					memcpy(t,
1118					    RE_WSTOP, sizeof(RE_WSTOP) - 1);
1119					t += sizeof(RE_WSTOP) - 1;
1120					break;
1121				case '~':
1122					if (O_ISSET(sp, O_MAGIC))
1123						*t++ = '~';
1124					else {
1125						memcpy(t,
1126						    sp->repl, sp->repl_len);
1127						t += sp->repl_len;
1128					}
1129					break;
1130				case '.':
1131				case '[':
1132				case '*':
1133					if (O_ISSET(sp, O_MAGIC))
1134						*t++ = '\\';
1135					*t++ = *p;
1136					break;
1137				default:
1138					*t++ = '\\';
1139					*t++ = *p;
1140				}
1141			} else
1142				*t++ = '\\';
1143			break;
1144		case '~':
1145			if (O_ISSET(sp, O_MAGIC)) {
1146				memcpy(t, sp->repl, sp->repl_len);
1147				t += sp->repl_len;
1148			} else
1149				*t++ = '~';
1150			break;
1151		case '.':
1152		case '[':
1153		case '*':
1154			if (!O_ISSET(sp, O_MAGIC))
1155				*t++ = '\\';
1156			*t++ = *p;
1157			break;
1158		default:
1159			*t++ = *p;
1160			break;
1161		}
1162
1163	*ptrnp = bp;
1164	*plenp = t - bp;
1165	return (0);
1166}
1167
1168/*
1169 * re_tag_conv --
1170 *	Convert a tags search path into something that the POSIX
1171 *	1003.2 RE functions can handle.
1172 */
1173static int
1174re_tag_conv(sp, ptrnp, plenp, replacedp)
1175	SCR *sp;
1176	char **ptrnp;
1177	size_t *plenp;
1178	int *replacedp;
1179{
1180	size_t blen, len;
1181	int lastdollar;
1182	char *bp, *p, *t;
1183
1184	len = *plenp;
1185
1186	/* Max memory usage is 2 times the length of the string. */
1187	*replacedp = 1;
1188	GET_SPACE_RET(sp, bp, blen, len * 2);
1189
1190	p = *ptrnp;
1191	t = bp;
1192
1193	/* If the last character is a '/' or '?', we just strip it. */
1194	if (len > 0 && (p[len - 1] == '/' || p[len - 1] == '?'))
1195		--len;
1196
1197	/* If the next-to-last or last character is a '$', it's magic. */
1198	if (len > 0 && p[len - 1] == '$') {
1199		--len;
1200		lastdollar = 1;
1201	} else
1202		lastdollar = 0;
1203
1204	/* If the first character is a '/' or '?', we just strip it. */
1205	if (len > 0 && (p[0] == '/' || p[0] == '?')) {
1206		++p;
1207		--len;
1208	}
1209
1210	/* If the first or second character is a '^', it's magic. */
1211	if (p[0] == '^') {
1212		*t++ = *p++;
1213		--len;
1214	}
1215
1216	/*
1217	 * Escape every other magic character we can find, meanwhile stripping
1218	 * the backslashes ctags inserts when escaping the search delimiter
1219	 * characters.
1220	 */
1221	for (; len > 0; --len) {
1222		if (p[0] == '\\' && (p[1] == '/' || p[1] == '?')) {
1223			++p;
1224			--len;
1225		} else if (strchr("^.[]$*", p[0]))
1226			*t++ = '\\';
1227		*t++ = *p++;
1228	}
1229	if (lastdollar)
1230		*t++ = '$';
1231
1232	*ptrnp = bp;
1233	*plenp = t - bp;
1234	return (0);
1235}
1236
1237/*
1238 * re_cscope_conv --
1239 *	 Convert a cscope search path into something that the POSIX
1240 *      1003.2 RE functions can handle.
1241 */
1242static int
1243re_cscope_conv(sp, ptrnp, plenp, replacedp)
1244	SCR *sp;
1245	char **ptrnp;
1246	size_t *plenp;
1247	int *replacedp;
1248{
1249	size_t blen, len, nspaces;
1250	char *bp, *p, *t;
1251
1252	/*
1253	 * Each space in the source line printed by cscope represents an
1254	 * arbitrary sequence of spaces, tabs, and comments.
1255	 */
1256#define	CSCOPE_RE_SPACE		"([ \t]|/\\*([^*]|\\*/)*\\*/)*"
1257	for (nspaces = 0, p = *ptrnp, len = *plenp; len > 0; ++p, --len)
1258		if (*p == ' ')
1259			++nspaces;
1260
1261	/*
1262	 * Allocate plenty of space:
1263	 *	the string, plus potential escaping characters;
1264	 *	nspaces + 2 copies of CSCOPE_RE_SPACE;
1265	 *	^, $, nul terminator characters.
1266	 */
1267	*replacedp = 1;
1268	len = (p - *ptrnp) * 2 + (nspaces + 2) * sizeof(CSCOPE_RE_SPACE) + 3;
1269	GET_SPACE_RET(sp, bp, blen, len);
1270
1271	p = *ptrnp;
1272	t = bp;
1273
1274	*t++ = '^';
1275	memcpy(t, CSCOPE_RE_SPACE, sizeof(CSCOPE_RE_SPACE) - 1);
1276	t += sizeof(CSCOPE_RE_SPACE) - 1;
1277
1278	for (len = *plenp; len > 0; ++p, --len)
1279		if (*p == ' ') {
1280			memcpy(t, CSCOPE_RE_SPACE, sizeof(CSCOPE_RE_SPACE) - 1);
1281			t += sizeof(CSCOPE_RE_SPACE) - 1;
1282		} else {
1283			if (strchr("\\^.[]$*+?()|{}", *p))
1284				*t++ = '\\';
1285			*t++ = *p;
1286		}
1287
1288	memcpy(t, CSCOPE_RE_SPACE, sizeof(CSCOPE_RE_SPACE) - 1);
1289	t += sizeof(CSCOPE_RE_SPACE) - 1;
1290	*t++ = '$';
1291
1292	*ptrnp = bp;
1293	*plenp = t - bp;
1294	return (0);
1295}
1296
1297/*
1298 * re_error --
1299 *	Report a regular expression error.
1300 *
1301 * PUBLIC: void re_error __P((SCR *, int, regex_t *));
1302 */
1303void
1304re_error(sp, errcode, preg)
1305	SCR *sp;
1306	int errcode;
1307	regex_t *preg;
1308{
1309	size_t s;
1310	char *oe;
1311
1312	s = regerror(errcode, preg, "", 0);
1313	if ((oe = malloc(s)) == NULL)
1314		msgq(sp, M_SYSERR, NULL);
1315	else {
1316		(void)regerror(errcode, preg, oe, s);
1317		msgq(sp, M_ERR, "RE error: %s", oe);
1318		free(oe);
1319	}
1320}
1321
1322/*
1323 * re_sub --
1324 * 	Do the substitution for a regular expression.
1325 */
1326static int
1327re_sub(sp, ip, lbp, lbclenp, lblenp, match)
1328	SCR *sp;
1329	char *ip;			/* Input line. */
1330	char **lbp;
1331	size_t *lbclenp, *lblenp;
1332	regmatch_t match[10];
1333{
1334	enum { C_NOTSET, C_LOWER, C_ONELOWER, C_ONEUPPER, C_UPPER } conv;
1335	size_t lbclen, lblen;		/* Local copies. */
1336	size_t mlen;			/* Match length. */
1337	size_t rpl;			/* Remaining replacement length. */
1338	char *rp;			/* Replacement pointer. */
1339	int ch;
1340	int no;				/* Match replacement offset. */
1341	char *p, *t;			/* Buffer pointers. */
1342	char *lb;			/* Local copies. */
1343
1344	lb = *lbp;			/* Get local copies. */
1345	lbclen = *lbclenp;
1346	lblen = *lblenp;
1347
1348	/*
1349	 * QUOTING NOTE:
1350	 *
1351	 * There are some special sequences that vi provides in the
1352	 * replacement patterns.
1353	 *	 & string the RE matched (\& if nomagic set)
1354	 *	\# n-th regular subexpression
1355	 *	\E end \U, \L conversion
1356	 *	\e end \U, \L conversion
1357	 *	\l convert the next character to lower-case
1358	 *	\L convert to lower-case, until \E, \e, or end of replacement
1359	 *	\u convert the next character to upper-case
1360	 *	\U convert to upper-case, until \E, \e, or end of replacement
1361	 *
1362	 * Otherwise, since this is the lowest level of replacement, discard
1363	 * all escaping characters.  This (hopefully) matches historic practice.
1364	 */
1365#define	OUTCH(ch, nltrans) {						\
1366	CHAR_T __ch = (ch);						\
1367	u_int __value = KEY_VAL(sp, __ch);				\
1368	if (nltrans && (__value == K_CR || __value == K_NL)) {		\
1369		NEEDNEWLINE(sp);					\
1370		sp->newl[sp->newl_cnt++] = lbclen;			\
1371	} else if (conv != C_NOTSET) {					\
1372		switch (conv) {						\
1373		case C_ONELOWER:					\
1374			conv = C_NOTSET;				\
1375			/* FALLTHROUGH */				\
1376		case C_LOWER:						\
1377			if (isupper(__ch))				\
1378				__ch = tolower(__ch);			\
1379			break;						\
1380		case C_ONEUPPER:					\
1381			conv = C_NOTSET;				\
1382			/* FALLTHROUGH */				\
1383		case C_UPPER:						\
1384			if (islower(__ch))				\
1385				__ch = toupper(__ch);			\
1386			break;						\
1387		default:						\
1388			abort();					\
1389		}							\
1390	}								\
1391	NEEDSP(sp, 1, p);						\
1392	*p++ = __ch;							\
1393	++lbclen;							\
1394}
1395	conv = C_NOTSET;
1396	for (rp = sp->repl, rpl = sp->repl_len, p = lb + lbclen; rpl--;) {
1397		switch (ch = *rp++) {
1398		case '&':
1399			if (O_ISSET(sp, O_MAGIC)) {
1400				no = 0;
1401				goto subzero;
1402			}
1403			break;
1404		case '\\':
1405			if (rpl == 0)
1406				break;
1407			--rpl;
1408			switch (ch = *rp) {
1409			case '&':
1410				++rp;
1411				if (!O_ISSET(sp, O_MAGIC)) {
1412					no = 0;
1413					goto subzero;
1414				}
1415				break;
1416			case '0': case '1': case '2': case '3': case '4':
1417			case '5': case '6': case '7': case '8': case '9':
1418				no = *rp++ - '0';
1419subzero:			if (match[no].rm_so == -1 ||
1420			    	    match[no].rm_eo == -1)
1421					break;
1422				mlen = match[no].rm_eo - match[no].rm_so;
1423				for (t = ip + match[no].rm_so; mlen--; ++t)
1424					OUTCH(*t, 0);
1425				continue;
1426			case 'e':
1427			case 'E':
1428				++rp;
1429				conv = C_NOTSET;
1430				continue;
1431			case 'l':
1432				++rp;
1433				conv = C_ONELOWER;
1434				continue;
1435			case 'L':
1436				++rp;
1437				conv = C_LOWER;
1438				continue;
1439			case 'u':
1440				++rp;
1441				conv = C_ONEUPPER;
1442				continue;
1443			case 'U':
1444				++rp;
1445				conv = C_UPPER;
1446				continue;
1447			default:
1448				++rp;
1449				break;
1450			}
1451		}
1452		OUTCH(ch, 1);
1453	}
1454
1455	*lbp = lb;			/* Update caller's information. */
1456	*lbclenp = lbclen;
1457	*lblenp = lblen;
1458	return (0);
1459}
1460