1/*-
2 * Copyright (c) 1992, 1993, 1994
3 *	The Regents of the University of California.  All rights reserved.
4 * Copyright (c) 1992, 1993, 1994, 1995, 1996
5 *	Keith Bostic.  All rights reserved.
6 *
7 * See the LICENSE file for redistribution information.
8 */
9
10#include "config.h"
11
12#include <sys/types.h>
13#include <sys/queue.h>
14#include <sys/time.h>
15
16#include <bitstring.h>
17#include <ctype.h>
18#include <errno.h>
19#include <limits.h>
20#include <stdio.h>
21#include <stdlib.h>
22#include <string.h>
23#include <unistd.h>
24
25#include "../common/common.h"
26#include "../vi/vi.h"
27
28#define	SUB_FIRST	0x01		/* The 'r' flag isn't reasonable. */
29#define	SUB_MUSTSETR	0x02		/* The 'r' flag is required. */
30
31static int re_conv(SCR *, CHAR_T **, size_t *, int *);
32static int re_cscope_conv(SCR *, CHAR_T **, size_t *, int *);
33static int re_sub(SCR *,
34		CHAR_T *, CHAR_T **, size_t *, size_t *, regmatch_t [10]);
35static int re_tag_conv(SCR *, CHAR_T **, size_t *, int *);
36static int s(SCR *, EXCMD *, CHAR_T *, regex_t *, u_int);
37
38/*
39 * ex_s --
40 *	[line [,line]] s[ubstitute] [[/;]pat[/;]/repl[/;] [cgr] [count] [#lp]]
41 *
42 *	Substitute on lines matching a pattern.
43 *
44 * PUBLIC: int ex_s(SCR *, EXCMD *);
45 */
46int
47ex_s(SCR *sp, EXCMD *cmdp)
48{
49	regex_t *re;
50	size_t blen, len;
51	u_int flags;
52	int delim;
53	CHAR_T *bp, *p, *ptrn, *rep, *t;
54
55	/*
56	 * Skip leading white space.
57	 *
58	 * !!!
59	 * Historic vi allowed any non-alphanumeric to serve as the
60	 * substitution command delimiter.
61	 *
62	 * !!!
63	 * If the arguments are empty, it's the same as &, i.e. we
64	 * repeat the last substitution.
65	 */
66	if (cmdp->argc == 0)
67		goto subagain;
68	for (p = cmdp->argv[0]->bp,
69	    len = cmdp->argv[0]->len; len > 0; --len, ++p) {
70		if (!cmdskip(*p))
71			break;
72	}
73	if (len == 0)
74subagain:	return (ex_subagain(sp, cmdp));
75
76	delim = *p++;
77	if (is09azAZ(delim) || delim == '\\')
78		return (s(sp, cmdp, p, &sp->subre_c, SUB_MUSTSETR));
79
80	/*
81	 * !!!
82	 * The full-blown substitute command reset the remembered
83	 * state of the 'c' and 'g' suffices.
84	 */
85	sp->c_suffix = sp->g_suffix = 0;
86
87	/*
88	 * Get the pattern string, toss escaping characters.
89	 *
90	 * !!!
91	 * Historic vi accepted any of the following forms:
92	 *
93	 *	:s/abc/def/		change "abc" to "def"
94	 *	:s/abc/def		change "abc" to "def"
95	 *	:s/abc/			delete "abc"
96	 *	:s/abc			delete "abc"
97	 *
98	 * QUOTING NOTE:
99	 *
100	 * Only toss an escaping character if it escapes a delimiter.
101	 * This means that "s/A/\\\\f" replaces "A" with "\\f".  It
102	 * would be nice to be more regular, i.e. for each layer of
103	 * escaping a single escaping character is removed, but that's
104	 * not how the historic vi worked.
105	 */
106	for (ptrn = t = p;;) {
107		if (p[0] == '\0' || p[0] == delim) {
108			if (p[0] == delim)
109				++p;
110			/*
111			 * !!!
112			 * Nul terminate the pattern string -- it's passed
113			 * to regcomp which doesn't understand anything else.
114			 */
115			*t = '\0';
116			break;
117		}
118		if (p[0] == '\\') {
119			if (p[1] == delim)
120				++p;
121			else if (p[1] == '\\')
122				*t++ = *p++;
123		}
124		*t++ = *p++;
125	}
126
127	/*
128	 * If the pattern string is empty, use the last RE (not just the
129	 * last substitution RE).
130	 */
131	if (*ptrn == '\0') {
132		if (sp->re == NULL) {
133			ex_emsg(sp, NULL, EXM_NOPREVRE);
134			return (1);
135		}
136
137		/* Re-compile the RE if necessary. */
138		if (!F_ISSET(sp, SC_RE_SEARCH) &&
139		    re_compile(sp, sp->re, sp->re_len,
140		    NULL, NULL, &sp->re_c, RE_C_SEARCH))
141			return (1);
142		flags = 0;
143	} else {
144		/*
145		 * !!!
146		 * Compile the RE.  Historic practice is that substitutes set
147		 * the search direction as well as both substitute and search
148		 * RE's.  We compile the RE twice, as we don't want to bother
149		 * ref counting the pattern string and (opaque) structure.
150		 */
151		if (re_compile(sp, ptrn, t - ptrn, &sp->re,
152		    &sp->re_len, &sp->re_c, RE_C_SEARCH))
153			return (1);
154		if (re_compile(sp, ptrn, t - ptrn, &sp->subre,
155		    &sp->subre_len, &sp->subre_c, RE_C_SUBST))
156			return (1);
157
158		flags = SUB_FIRST;
159		sp->searchdir = FORWARD;
160	}
161	re = &sp->re_c;
162
163	/*
164	 * Get the replacement string.
165	 *
166	 * The special character & (\& if O_MAGIC not set) matches the
167	 * entire RE.  No handling of & is required here, it's done by
168	 * re_sub().
169	 *
170	 * The special character ~ (\~ if O_MAGIC not set) inserts the
171	 * previous replacement string into this replacement string.
172	 * Count ~'s to figure out how much space we need.  We could
173	 * special case nonexistent last patterns or whether or not
174	 * O_MAGIC is set, but it's probably not worth the effort.
175	 *
176	 * QUOTING NOTE:
177	 *
178	 * Only toss an escaping character if it escapes a delimiter or
179	 * if O_MAGIC is set and it escapes a tilde.
180	 *
181	 * !!!
182	 * If the entire replacement pattern is "%", then use the last
183	 * replacement pattern.  This semantic was added to vi in System
184	 * V and then percolated elsewhere, presumably around the time
185	 * that it was added to their version of ed(1).
186	 */
187	if (p[0] == '\0' || p[0] == delim) {
188		if (p[0] == delim)
189			++p;
190		free(sp->repl);
191		sp->repl = NULL;
192		sp->repl_len = 0;
193	} else if (p[0] == '%' && (p[1] == '\0' || p[1] == delim))
194		p += p[1] == delim ? 2 : 1;
195	else {
196		for (rep = p, len = 0;
197		    p[0] != '\0' && p[0] != delim; ++p, ++len)
198			if (p[0] == '~')
199				len += sp->repl_len;
200		GET_SPACE_RETW(sp, bp, blen, len);
201		for (t = bp, len = 0, p = rep;;) {
202			if (p[0] == '\0' || p[0] == delim) {
203				if (p[0] == delim)
204					++p;
205				break;
206			}
207			if (p[0] == '\\') {
208				if (p[1] == delim)
209					++p;
210				else if (p[1] == '\\') {
211					*t++ = *p++;
212					++len;
213				} else if (p[1] == '~') {
214					++p;
215					if (!O_ISSET(sp, O_MAGIC))
216						goto tilde;
217				}
218			} else if (p[0] == '~' && O_ISSET(sp, O_MAGIC)) {
219tilde:				++p;
220				MEMCPY(t, sp->repl, sp->repl_len);
221				t += sp->repl_len;
222				len += sp->repl_len;
223				continue;
224			}
225			*t++ = *p++;
226			++len;
227		}
228		if ((sp->repl_len = len) != 0) {
229			free(sp->repl);
230			MALLOC(sp, sp->repl, len * sizeof(CHAR_T));
231			if (sp->repl == NULL) {
232				FREE_SPACEW(sp, bp, blen);
233				return (1);
234			}
235			MEMCPY(sp->repl, bp, len);
236		}
237		FREE_SPACEW(sp, bp, blen);
238	}
239	return (s(sp, cmdp, p, re, flags));
240}
241
242/*
243 * ex_subagain --
244 *	[line [,line]] & [cgr] [count] [#lp]]
245 *
246 *	Substitute using the last substitute RE and replacement pattern.
247 *
248 * PUBLIC: int ex_subagain(SCR *, EXCMD *);
249 */
250int
251ex_subagain(SCR *sp, EXCMD *cmdp)
252{
253	if (sp->subre == NULL) {
254		ex_emsg(sp, NULL, EXM_NOPREVRE);
255		return (1);
256	}
257	if (!F_ISSET(sp, SC_RE_SUBST) &&
258	    re_compile(sp, sp->subre, sp->subre_len,
259	    NULL, NULL, &sp->subre_c, RE_C_SUBST))
260		return (1);
261	return (s(sp,
262	    cmdp, cmdp->argc ? cmdp->argv[0]->bp : NULL, &sp->subre_c, 0));
263}
264
265/*
266 * ex_subtilde --
267 *	[line [,line]] ~ [cgr] [count] [#lp]]
268 *
269 *	Substitute using the last RE and last substitute replacement pattern.
270 *
271 * PUBLIC: int ex_subtilde(SCR *, EXCMD *);
272 */
273int
274ex_subtilde(SCR *sp, EXCMD *cmdp)
275{
276	if (sp->re == NULL) {
277		ex_emsg(sp, NULL, EXM_NOPREVRE);
278		return (1);
279	}
280	if (!F_ISSET(sp, SC_RE_SEARCH) && re_compile(sp, sp->re,
281	    sp->re_len, NULL, NULL, &sp->re_c, RE_C_SEARCH))
282		return (1);
283	return (s(sp,
284	    cmdp, cmdp->argc ? cmdp->argv[0]->bp : NULL, &sp->re_c, 0));
285}
286
287/*
288 * s --
289 * Do the substitution.  This stuff is *really* tricky.  There are lots of
290 * special cases, and general nastiness.  Don't mess with it unless you're
291 * pretty confident.
292 *
293 * The nasty part of the substitution is what happens when the replacement
294 * string contains newlines.  It's a bit tricky -- consider the information
295 * that has to be retained for "s/f\(o\)o/^M\1^M\1/".  The solution here is
296 * to build a set of newline offsets which we use to break the line up later,
297 * when the replacement is done.  Don't change it unless you're *damned*
298 * confident.
299 */
/*
 * NEEDNEWLINE --
 *	Make sure sp->newl has room for one more newline offset, growing
 *	the array 25 entries at a time.
 *
 * !!!
 * On allocation failure this resets sp->newl_len and executes
 * "return (1)" in the *calling* function.
 */
#define	NEEDNEWLINE(sp) do {						\
	if ((sp)->newl_len == (sp)->newl_cnt) {				\
		(sp)->newl_len += 25;					\
		REALLOC(sp, (sp)->newl, size_t *,			\
		    (sp)->newl_len * sizeof(size_t));			\
		if ((sp)->newl == NULL) {				\
			(sp)->newl_len = 0;				\
			return (1);					\
		}							\
	}								\
} while (0)

/*
 * BUILD --
 *	Append len characters starting at l to the build buffer.
 *
 * !!!
 * Uses the caller's local variables lb (buffer), lbclen (characters in
 * use) and lblen (characters allocated).  The len argument is evaluated
 * more than once, so it must not have side effects.  On allocation
 * failure this executes "return (1)" in the *calling* function.
 */
#define	BUILD(sp, l, len) do {						\
	if (lbclen + (len) > lblen) {					\
		lblen = p2roundup(MAX(lbclen + (len), 256));		\
		REALLOC(sp, lb, CHAR_T *, lblen * sizeof(CHAR_T));	\
		if (lb == NULL) {					\
			lbclen = 0;					\
			return (1);					\
		}							\
	}								\
	MEMCPY(lb + lbclen, (l), (len));				\
	lbclen += (len);						\
} while (0)

/*
 * NEEDSP --
 *	Make sure the build buffer has room for len more characters.  If
 *	the buffer had to be reallocated, pnt is re-pointed at the current
 *	end of the built text (lb + lbclen); otherwise pnt is left alone.
 *
 * !!!
 * Uses the caller's local variables lb, lbclen and lblen, and executes
 * "return (1)" in the *calling* function on allocation failure.
 */
#define	NEEDSP(sp, len, pnt) do {					\
	if (lbclen + (len) > lblen) {					\
		lblen = p2roundup(MAX(lbclen + (len), 256));		\
		REALLOC(sp, lb, CHAR_T *, lblen * sizeof(CHAR_T));	\
		if (lb == NULL) {					\
			lbclen = 0;					\
			return (1);					\
		}							\
		(pnt) = lb + lbclen;					\
	}								\
} while (0)
336
337static int
338s(SCR *sp, EXCMD *cmdp, CHAR_T *s, regex_t *re, u_int flags)
339{
340	EVENT ev;
341	MARK from, to;
342	TEXTH tiq[] = {{ 0 }};
343	recno_t elno, lno, slno;
344	u_long ul;
345	regmatch_t match[10];
346	size_t blen, cnt, last, lbclen, lblen, len, llen;
347	size_t offset, saved_offset, scno;
348	int cflag, lflag, nflag, pflag, rflag;
349	int didsub, do_eol_match, eflags, empty_ok, eval;
350	int linechanged, matched, quit, rval;
351	CHAR_T *bp, *lb;
352	enum nresult nret;
353
354	NEEDFILE(sp, cmdp);
355
356	slno = sp->lno;
357	scno = sp->cno;
358
359	/*
360	 * !!!
361	 * Historically, the 'g' and 'c' suffices were always toggled as flags,
362	 * so ":s/A/B/" was the same as ":s/A/B/ccgg".  If O_EDCOMPATIBLE was
363	 * not set, they were initialized to 0 for all substitute commands.  If
364	 * O_EDCOMPATIBLE was set, they were initialized to 0 only if the user
365	 * specified substitute/replacement patterns (see ex_s()).
366	 */
367	if (!O_ISSET(sp, O_EDCOMPATIBLE))
368		sp->c_suffix = sp->g_suffix = 0;
369
370	/*
371	 * Historic vi permitted the '#', 'l' and 'p' options in vi mode, but
372	 * it only displayed the last change.  I'd disallow them, but they are
373	 * useful in combination with the [v]global commands.  In the current
374	 * model the problem is combining them with the 'c' flag -- the screen
375	 * would have to flip back and forth between the confirm screen and the
376	 * ex print screen, which would be pretty awful.  We do display all
377	 * changes, though, for what that's worth.
378	 *
379	 * !!!
380	 * Historic vi was fairly strict about the order of "options", the
381	 * count, and "flags".  I'm somewhat fuzzy on the difference between
382	 * options and flags, anyway, so this is a simpler approach, and we
383	 * just take it them in whatever order the user gives them.  (The ex
384	 * usage statement doesn't reflect this.)
385	 */
386	cflag = lflag = nflag = pflag = rflag = 0;
387	if (s == NULL)
388		goto noargs;
389	for (lno = OOBLNO; *s != '\0'; ++s)
390		switch (*s) {
391		case ' ':
392		case '\t':
393			continue;
394		case '+':
395			++cmdp->flagoff;
396			break;
397		case '-':
398			--cmdp->flagoff;
399			break;
400		case '0': case '1': case '2': case '3': case '4':
401		case '5': case '6': case '7': case '8': case '9':
402			if (lno != OOBLNO)
403				goto usage;
404			errno = 0;
405			nret = nget_uslong(&ul, s, &s, 10);
406			lno = ul;
407			if (*s == '\0')		/* Loop increment correction. */
408				--s;
409			if (nret != NUM_OK) {
410				if (nret == NUM_OVER)
411					msgq(sp, M_ERR, "153|Count overflow");
412				else if (nret == NUM_UNDER)
413					msgq(sp, M_ERR, "154|Count underflow");
414				else
415					msgq(sp, M_SYSERR, NULL);
416				return (1);
417			}
418			/*
419			 * In historic vi, the count was inclusive from the
420			 * second address.
421			 */
422			cmdp->addr1.lno = cmdp->addr2.lno;
423			cmdp->addr2.lno += lno - 1;
424			if (!db_exist(sp, cmdp->addr2.lno) &&
425			    db_last(sp, &cmdp->addr2.lno))
426				return (1);
427			break;
428		case '#':
429			nflag = 1;
430			break;
431		case 'c':
432			sp->c_suffix = !sp->c_suffix;
433
434			/* Ex text structure initialization. */
435			if (F_ISSET(sp, SC_EX))
436				TAILQ_INIT(tiq);
437			break;
438		case 'g':
439			sp->g_suffix = !sp->g_suffix;
440			break;
441		case 'l':
442			lflag = 1;
443			break;
444		case 'p':
445			pflag = 1;
446			break;
447		case 'r':
448			if (LF_ISSET(SUB_FIRST)) {
449				msgq(sp, M_ERR,
450		    "155|Regular expression specified; r flag meaningless");
451				return (1);
452			}
453			if (!F_ISSET(sp, SC_RE_SEARCH)) {
454				ex_emsg(sp, NULL, EXM_NOPREVRE);
455				return (1);
456			}
457			rflag = 1;
458			re = &sp->re_c;
459			break;
460		default:
461			goto usage;
462		}
463
464	if (*s != '\0' || (!rflag && LF_ISSET(SUB_MUSTSETR))) {
465usage:		ex_emsg(sp, cmdp->cmd->usage, EXM_USAGE);
466		return (1);
467	}
468
469noargs:	if (F_ISSET(sp, SC_VI) && sp->c_suffix && (lflag || nflag || pflag)) {
470		msgq(sp, M_ERR,
471"156|The #, l and p flags may not be combined with the c flag in vi mode");
472		return (1);
473	}
474
475	/*
476	 * bp:		if interactive, line cache
477	 * blen:	if interactive, line cache length
478	 * lb:		build buffer pointer.
479	 * lbclen:	current length of built buffer.
480	 * lblen;	length of build buffer.
481	 */
482	bp = lb = NULL;
483	blen = lbclen = lblen = 0;
484
485	/* For each line... */
486	lno = cmdp->addr1.lno == 0 ? 1 : cmdp->addr1.lno;
487	for (matched = quit = 0,
488	    elno = cmdp->addr2.lno; !quit && lno <= elno; ++lno) {
489
490		/* Someone's unhappy, time to stop. */
491		if (INTERRUPTED(sp))
492			break;
493
494		/* Get the line. */
495		if (db_get(sp, lno, DBG_FATAL, &s, &llen))
496			goto err;
497
498		/*
499		 * Make a local copy if doing confirmation -- when calling
500		 * the confirm routine we're likely to lose the cached copy.
501		 */
502		if (sp->c_suffix) {
503			if (bp == NULL) {
504				GET_SPACE_RETW(sp, bp, blen, llen);
505			} else
506				ADD_SPACE_RETW(sp, bp, blen, llen);
507			MEMCPY(bp, s, llen);
508			s = bp;
509		}
510
511		/* Start searching from the beginning. */
512		offset = 0;
513		len = llen;
514
515		/* Reset the build buffer offset. */
516		lbclen = 0;
517
518		/* Reset empty match flag. */
519		empty_ok = 1;
520
521		/*
522		 * We don't want to have to do a setline if the line didn't
523		 * change -- keep track of whether or not this line changed.
524		 * If doing confirmations, don't want to keep setting the
525		 * line if change is refused -- keep track of substitutions.
526		 */
527		didsub = linechanged = 0;
528
529		/* New line, do an EOL match. */
530		do_eol_match = 1;
531
532		/* It's not nul terminated, but we pretend it is. */
533		eflags = REG_STARTEND;
534
535		/*
536		 * The search area is from s + offset to the EOL.
537		 *
538		 * Generally, match[0].rm_so is the offset of the start
539		 * of the match from the start of the search, and offset
540		 * is the offset of the start of the last search.
541		 */
542nextmatch:	match[0].rm_so = 0;
543		match[0].rm_eo = len;
544
545		/* Get the next match. */
546		eval = regexec(re, s + offset, 10, match, eflags);
547
548		/*
549		 * There wasn't a match or if there was an error, deal with
550		 * it.  If there was a previous match in this line, resolve
551		 * the changes into the database.  Otherwise, just move on.
552		 */
553		if (eval == REG_NOMATCH)
554			goto endmatch;
555		if (eval != 0) {
556			re_error(sp, eval, re);
557			goto err;
558		}
559		matched = 1;
560
561		/* Only the first search can match an anchored expression. */
562		eflags |= REG_NOTBOL;
563
564		/*
565		 * !!!
566		 * It's possible to match 0-length strings -- for example, the
567		 * command s;a*;X;, when matched against the string "aabb" will
568		 * result in "XbXbX", i.e. the matches are "aa", the space
569		 * between the b's and the space between the b's and the end of
570		 * the string.  There is a similar space between the beginning
571		 * of the string and the a's.  The rule that we use (because vi
572		 * historically used it) is that any 0-length match, occurring
573		 * immediately after a match, is ignored.  Otherwise, the above
574		 * example would have resulted in "XXbXbX".  Another example is
575		 * incorrectly using " *" to replace groups of spaces with one
576		 * space.
577		 *
578		 * The way we do this is that if we just had a successful match,
579		 * the starting offset does not skip characters, and the match
580		 * is empty, ignore the match and move forward.  If there's no
581		 * more characters in the string, we were attempting to match
582		 * after the last character, so quit.
583		 */
584		if (!empty_ok && match[0].rm_so == 0 && match[0].rm_eo == 0) {
585			empty_ok = 1;
586			if (len == 0)
587				goto endmatch;
588			BUILD(sp, s + offset, 1);
589			++offset;
590			--len;
591			goto nextmatch;
592		}
593
594		/* Confirm change. */
595		if (sp->c_suffix) {
596			/*
597			 * Set the cursor position for confirmation.  Note,
598			 * if we matched on a '$', the cursor may be past
599			 * the end of line.
600			 */
601			from.lno = to.lno = lno;
602			from.cno = match[0].rm_so + offset;
603			to.cno = match[0].rm_eo + offset;
604			/*
605			 * Both ex and vi have to correct for a change before
606			 * the first character in the line.
607			 */
608			if (llen == 0)
609				from.cno = to.cno = 0;
610			if (F_ISSET(sp, SC_VI)) {
611				/*
612				 * Only vi has to correct for a change after
613				 * the last character in the line.
614				 *
615				 * XXX
616				 * It would be nice to change the vi code so
617				 * that we could display a cursor past EOL.
618				 */
619				if (to.cno >= llen)
620					to.cno = llen - 1;
621				if (from.cno >= llen)
622					from.cno = llen - 1;
623
624				sp->lno = from.lno;
625				sp->cno = from.cno;
626				if (vs_refresh(sp, 1))
627					goto err;
628
629				vs_update(sp, msg_cat(sp,
630				    "169|Confirm change? [n]", NULL), NULL);
631
632				if (v_event_get(sp, &ev, 0, 0))
633					goto err;
634				switch (ev.e_event) {
635				case E_CHARACTER:
636					break;
637				case E_EOF:
638				case E_ERR:
639				case E_INTERRUPT:
640					goto lquit;
641				default:
642					v_event_err(sp, &ev);
643					goto lquit;
644				}
645			} else {
646				if (ex_print(sp, cmdp, &from, &to, 0) ||
647				    ex_scprint(sp, &from, &to))
648					goto lquit;
649				if (ex_txt(sp, tiq, 0, TXT_CR))
650					goto err;
651				ev.e_c = TAILQ_FIRST(tiq)->lb[0];
652			}
653
654			switch (ev.e_c) {
655			case CH_YES:
656				break;
657			default:
658			case CH_NO:
659				didsub = 0;
660				BUILD(sp, s +offset, match[0].rm_eo);
661				goto skip;
662			case CH_QUIT:
663				/* Set the quit/interrupted flags. */
664lquit:				quit = 1;
665				F_SET(sp->gp, G_INTERRUPTED);
666
667				/*
668				 * Resolve any changes, then return to (and
669				 * exit from) the main loop.
670				 */
671				goto endmatch;
672			}
673		}
674
675		/*
676		 * Set the cursor to the last position changed, converting
677		 * from 1-based to 0-based.
678		 */
679		sp->lno = lno;
680		sp->cno = match[0].rm_so;
681
682		/* Copy the bytes before the match into the build buffer. */
683		BUILD(sp, s + offset, match[0].rm_so);
684
685		/* Substitute the matching bytes. */
686		didsub = 1;
687		if (re_sub(sp, s + offset, &lb, &lbclen, &lblen, match))
688			goto err;
689
690		/* Set the change flag so we know this line was modified. */
691		linechanged = 1;
692
693		/* Move past the matched bytes. */
694skip:		offset += match[0].rm_eo;
695		len -= match[0].rm_eo;
696
697		/* A match cannot be followed by an empty pattern. */
698		empty_ok = 0;
699
700		/*
701		 * If doing a global change with confirmation, we have to
702		 * update the screen.  The basic idea is to store the line
703		 * so the screen update routines can find it, and restart.
704		 */
705		if (didsub && sp->c_suffix && sp->g_suffix) {
706			/*
707			 * The new search offset will be the end of the
708			 * modified line.
709			 */
710			saved_offset = lbclen;
711
712			/* Copy the rest of the line. */
713			if (len)
714				BUILD(sp, s + offset, len);
715
716			/* Set the new offset. */
717			offset = saved_offset;
718
719			/* Store inserted lines, adjusting the build buffer. */
720			last = 0;
721			if (sp->newl_cnt) {
722				for (cnt = 0;
723				    cnt < sp->newl_cnt; ++cnt, ++lno, ++elno) {
724					if (db_insert(sp, lno,
725					    lb + last, sp->newl[cnt] - last))
726						goto err;
727					last = sp->newl[cnt] + 1;
728					++sp->rptlines[L_ADDED];
729				}
730				lbclen -= last;
731				offset -= last;
732				sp->newl_cnt = 0;
733			}
734
735			/* Store and retrieve the line. */
736			if (db_set(sp, lno, lb + last, lbclen))
737				goto err;
738			if (db_get(sp, lno, DBG_FATAL, &s, &llen))
739				goto err;
740			ADD_SPACE_RETW(sp, bp, blen, llen);
741			MEMCPY(bp, s, llen);
742			s = bp;
743			len = llen - offset;
744
745			/* Restart the build. */
746			lbclen = 0;
747			BUILD(sp, s, offset);
748
749			/*
750			 * If we haven't already done the after-the-string
751			 * match, do one.  Set REG_NOTEOL so the '$' pattern
752			 * only matches once.
753			 */
754			if (!do_eol_match)
755				goto endmatch;
756			if (offset == len) {
757				do_eol_match = 0;
758				eflags |= REG_NOTEOL;
759			}
760			goto nextmatch;
761		}
762
763		/*
764		 * If it's a global:
765		 *
766		 * If at the end of the string, do a test for the after
767		 * the string match.  Set REG_NOTEOL so the '$' pattern
768		 * only matches once.
769		 */
770		if (sp->g_suffix && do_eol_match) {
771			if (len == 0) {
772				do_eol_match = 0;
773				eflags |= REG_NOTEOL;
774			}
775			goto nextmatch;
776		}
777
778endmatch:	if (!linechanged)
779			continue;
780
781		/* Copy any remaining bytes into the build buffer. */
782		if (len)
783			BUILD(sp, s + offset, len);
784
785		/* Store inserted lines, adjusting the build buffer. */
786		last = 0;
787		if (sp->newl_cnt) {
788			for (cnt = 0;
789			    cnt < sp->newl_cnt; ++cnt, ++lno, ++elno) {
790				if (db_insert(sp,
791				    lno, lb + last, sp->newl[cnt] - last))
792					goto err;
793				last = sp->newl[cnt] + 1;
794				++sp->rptlines[L_ADDED];
795			}
796			lbclen -= last;
797			sp->newl_cnt = 0;
798		}
799
800		/* Store the changed line. */
801		if (db_set(sp, lno, lb + last, lbclen))
802			goto err;
803
804		/* Update changed line counter. */
805		if (sp->rptlchange != lno) {
806			sp->rptlchange = lno;
807			++sp->rptlines[L_CHANGED];
808		}
809
810		/*
811		 * !!!
812		 * Display as necessary.  Historic practice is to only
813		 * display the last line of a line split into multiple
814		 * lines.
815		 */
816		if (lflag || nflag || pflag) {
817			from.lno = to.lno = lno;
818			from.cno = to.cno = 0;
819			if (lflag)
820				(void)ex_print(sp, cmdp, &from, &to, E_C_LIST);
821			if (nflag)
822				(void)ex_print(sp, cmdp, &from, &to, E_C_HASH);
823			if (pflag)
824				(void)ex_print(sp, cmdp, &from, &to, E_C_PRINT);
825		}
826	}
827
828	/*
829	 * !!!
830	 * Historically, vi attempted to leave the cursor at the same place if
831	 * the substitution was done at the current cursor position.  Otherwise
832	 * it moved it to the first non-blank of the last line changed.  There
833	 * were some problems: for example, :s/$/foo/ with the cursor on the
834	 * last character of the line left the cursor on the last character, or
835	 * the & command with multiple occurrences of the matching string in the
836	 * line usually left the cursor in a fairly random position.
837	 *
838	 * We try to do the same thing, with the exception that if the user is
839	 * doing substitution with confirmation, we move to the last line about
840	 * which the user was consulted, as opposed to the last line that they
841	 * actually changed.  This prevents a screen flash if the user doesn't
842	 * change many of the possible lines.
843	 */
844	if (!sp->c_suffix && (sp->lno != slno || sp->cno != scno)) {
845		sp->cno = 0;
846		(void)nonblank(sp, sp->lno, &sp->cno);
847	}
848
849	/*
850	 * If not in a global command, and nothing matched, say so.
851	 * Else, if none of the lines displayed, put something up.
852	 */
853	rval = 0;
854	if (!matched) {
855		if (!F_ISSET(sp, SC_EX_GLOBAL)) {
856			msgq(sp, M_ERR, "157|No match found");
857			goto err;
858		}
859	} else if (!lflag && !nflag && !pflag)
860		F_SET(cmdp, E_AUTOPRINT);
861
862	if (0) {
863err:		rval = 1;
864	}
865
866	if (bp != NULL)
867		FREE_SPACEW(sp, bp, blen);
868	free(lb);
869	return (rval);
870}
871
872/*
873 * re_compile --
874 *	Compile the RE.
875 *
876 * PUBLIC: int re_compile(SCR *,
877 * PUBLIC:     CHAR_T *, size_t, CHAR_T **, size_t *, regex_t *, u_int);
878 */
879int
880re_compile(SCR *sp, CHAR_T *ptrn, size_t plen, CHAR_T **ptrnp, size_t *lenp, regex_t *rep, u_int flags)
881{
882	size_t len;
883	int reflags, replaced, rval;
884	CHAR_T *p;
885
886	/* Set RE flags. */
887	reflags = 0;
888	if (!LF_ISSET(RE_C_CSCOPE | RE_C_TAG)) {
889		if (O_ISSET(sp, O_EXTENDED))
890			reflags |= REG_EXTENDED;
891		if (O_ISSET(sp, O_IGNORECASE))
892			reflags |= REG_ICASE;
893		if (O_ISSET(sp, O_ICLOWER)) {
894			for (p = ptrn, len = plen; len > 0; ++p, --len)
895				if (ISUPPER(*p))
896					break;
897			if (len == 0)
898				reflags |= REG_ICASE;
899		}
900	}
901
902	/* If we're replacing a saved value, clear the old one. */
903	if (LF_ISSET(RE_C_SEARCH) && F_ISSET(sp, SC_RE_SEARCH)) {
904		regfree(&sp->re_c);
905		F_CLR(sp, SC_RE_SEARCH);
906	}
907	if (LF_ISSET(RE_C_SUBST) && F_ISSET(sp, SC_RE_SUBST)) {
908		regfree(&sp->subre_c);
909		F_CLR(sp, SC_RE_SUBST);
910	}
911
912	/*
913	 * If we're saving the string, it's a pattern we haven't seen before,
914	 * so convert the vi-style RE's to POSIX 1003.2 RE's.  Save a copy for
915	 * later recompilation.   Free any previously saved value.
916	 */
917	if (ptrnp != NULL) {
918		replaced = 0;
919		if (LF_ISSET(RE_C_CSCOPE)) {
920			if (re_cscope_conv(sp, &ptrn, &plen, &replaced))
921				return (1);
922			/*
923			 * XXX
924			 * Currently, the match-any-<blank> expression used in
925			 * re_cscope_conv() requires extended RE's.  This may
926			 * not be right or safe.
927			 */
928			reflags |= REG_EXTENDED;
929		} else if (LF_ISSET(RE_C_TAG)) {
930			if (re_tag_conv(sp, &ptrn, &plen, &replaced))
931				return (1);
932		} else
933			if (re_conv(sp, &ptrn, &plen, &replaced))
934				return (1);
935
936		/* Discard previous pattern. */
937		free(*ptrnp);
938		*ptrnp = NULL;
939
940		if (lenp != NULL)
941			*lenp = plen;
942
943		/*
944		 * Copy the string into allocated memory.
945		 *
946		 * XXX
947		 * Regcomp isn't 8-bit clean, so the pattern is nul-terminated
948		 * for now.  There's just no other solution.
949		 */
950		MALLOC(sp, *ptrnp, (plen + 1) * sizeof(CHAR_T));
951		if (*ptrnp != NULL) {
952			MEMCPY(*ptrnp, ptrn, plen);
953			(*ptrnp)[plen] = '\0';
954		}
955
956		/* Free up conversion-routine-allocated memory. */
957		if (replaced)
958			FREE_SPACEW(sp, ptrn, 0);
959
960		if (*ptrnp == NULL)
961			return (1);
962
963		ptrn = *ptrnp;
964	}
965
966	/*
967	 * XXX
968	 * Regcomp isn't 8-bit clean, so we just lost if the pattern
969	 * contained a nul.  Bummer!
970	 */
971	if ((rval = regcomp(rep, ptrn, /* plen, */ reflags)) != 0) {
972		if (!LF_ISSET(RE_C_SILENT))
973			re_error(sp, rval, rep);
974		return (1);
975	}
976
977	if (LF_ISSET(RE_C_SEARCH))
978		F_SET(sp, SC_RE_SEARCH);
979	if (LF_ISSET(RE_C_SUBST))
980		F_SET(sp, SC_RE_SUBST);
981
982	return (0);
983}
984
985/*
986 * re_conv --
987 *	Convert vi's regular expressions into something that the
988 *	the POSIX 1003.2 RE functions can handle.
989 *
990 * There are three conversions we make to make vi's RE's (specifically
991 * the global, search, and substitute patterns) work with POSIX RE's.
992 *
993 * 1: If O_MAGIC is not set, strip backslashes from the magic character
994 *    set (.[*~) that have them, and add them to the ones that don't.
995 * 2: If O_MAGIC is not set, the string "\~" is replaced with the text
996 *    from the last substitute command's replacement string.  If O_MAGIC
997 *    is set, it's the string "~".
998 * 3: The pattern \<ptrn\> does "word" searches, convert it to use the
999 *    new RE escapes.
1000 *
1001 * !!!/XXX
1002 * This doesn't exactly match the historic behavior of vi because we do
1003 * the ~ substitution before calling the RE engine, so magic characters
1004 * in the replacement string will be expanded by the RE engine, and they
1005 * weren't historically.  It's a bug.
1006 */
1007static int
1008re_conv(SCR *sp, CHAR_T **ptrnp, size_t *plenp, int *replacedp)
1009{
1010	size_t blen, len, needlen;
1011	int magic;
1012	CHAR_T *bp, *p, *t;
1013
1014	/*
1015	 * First pass through, we figure out how much space we'll need.
1016	 * We do it in two passes, on the grounds that most of the time
1017	 * the user is doing a search and won't have magic characters.
1018	 * That way we can skip most of the memory allocation and copies.
1019	 */
1020	magic = 0;
1021	for (p = *ptrnp, len = *plenp, needlen = 0; len > 0; ++p, --len)
1022		switch (*p) {
1023		case '\\':
1024			if (len > 1) {
1025				--len;
1026				switch (*++p) {
1027				case '<':
1028					magic = 1;
1029					needlen += RE_WSTART_LEN + 1;
1030					break;
1031				case '>':
1032					magic = 1;
1033					needlen += RE_WSTOP_LEN + 1;
1034					break;
1035				case '~':
1036					if (!O_ISSET(sp, O_MAGIC)) {
1037						magic = 1;
1038						needlen += sp->repl_len;
1039					}
1040					break;
1041				case '.':
1042				case '[':
1043				case '*':
1044					if (!O_ISSET(sp, O_MAGIC)) {
1045						magic = 1;
1046						needlen += 1;
1047					}
1048					break;
1049				default:
1050					needlen += 2;
1051				}
1052			} else
1053				needlen += 1;
1054			break;
1055		case '~':
1056			if (O_ISSET(sp, O_MAGIC)) {
1057				magic = 1;
1058				needlen += sp->repl_len;
1059			}
1060			break;
1061		case '.':
1062		case '[':
1063		case '*':
1064			if (!O_ISSET(sp, O_MAGIC)) {
1065				magic = 1;
1066				needlen += 2;
1067			}
1068			break;
1069		default:
1070			needlen += 1;
1071			break;
1072		}
1073
1074	if (!magic) {
1075		*replacedp = 0;
1076		return (0);
1077	}
1078
1079	/* Get enough memory to hold the final pattern. */
1080	*replacedp = 1;
1081	GET_SPACE_RETW(sp, bp, blen, needlen);
1082
1083	for (p = *ptrnp, len = *plenp, t = bp; len > 0; ++p, --len)
1084		switch (*p) {
1085		case '\\':
1086			if (len > 1) {
1087				--len;
1088				switch (*++p) {
1089				case '<':
1090					MEMCPY(t,
1091					    RE_WSTART, RE_WSTART_LEN);
1092					t += RE_WSTART_LEN;
1093					break;
1094				case '>':
1095					MEMCPY(t,
1096					    RE_WSTOP, RE_WSTOP_LEN);
1097					t += RE_WSTOP_LEN;
1098					break;
1099				case '~':
1100					if (O_ISSET(sp, O_MAGIC))
1101						*t++ = '~';
1102					else {
1103						MEMCPY(t,
1104						    sp->repl, sp->repl_len);
1105						t += sp->repl_len;
1106					}
1107					break;
1108				case '.':
1109				case '[':
1110				case '*':
1111					if (O_ISSET(sp, O_MAGIC))
1112						*t++ = '\\';
1113					*t++ = *p;
1114					break;
1115				default:
1116					*t++ = '\\';
1117					*t++ = *p;
1118				}
1119			} else
1120				*t++ = '\\';
1121			break;
1122		case '~':
1123			if (O_ISSET(sp, O_MAGIC)) {
1124				MEMCPY(t, sp->repl, sp->repl_len);
1125				t += sp->repl_len;
1126			} else
1127				*t++ = '~';
1128			break;
1129		case '.':
1130		case '[':
1131		case '*':
1132			if (!O_ISSET(sp, O_MAGIC))
1133				*t++ = '\\';
1134			*t++ = *p;
1135			break;
1136		default:
1137			*t++ = *p;
1138			break;
1139		}
1140
1141	*ptrnp = bp;
1142	*plenp = t - bp;
1143	return (0);
1144}
1145
1146/*
1147 * re_tag_conv --
1148 *	Convert a tags search path into something that the POSIX
1149 *	1003.2 RE functions can handle.
1150 */
1151static int
1152re_tag_conv(SCR *sp, CHAR_T **ptrnp, size_t *plenp, int *replacedp)
1153{
1154	size_t blen, len;
1155	int lastdollar;
1156	CHAR_T *bp, *p, *t;
1157
1158	len = *plenp;
1159
1160	/* Max memory usage is 2 times the length of the string. */
1161	*replacedp = 1;
1162	GET_SPACE_RETW(sp, bp, blen, len * 2);
1163
1164	p = *ptrnp;
1165	t = bp;
1166
1167	/* If the last character is a '/' or '?', we just strip it. */
1168	if (len > 0 && (p[len - 1] == '/' || p[len - 1] == '?'))
1169		--len;
1170
1171	/* If the next-to-last or last character is a '$', it's magic. */
1172	if (len > 0 && p[len - 1] == '$') {
1173		--len;
1174		lastdollar = 1;
1175	} else
1176		lastdollar = 0;
1177
1178	/* If the first character is a '/' or '?', we just strip it. */
1179	if (len > 0 && (p[0] == '/' || p[0] == '?')) {
1180		++p;
1181		--len;
1182	}
1183
1184	/* If the first or second character is a '^', it's magic. */
1185	if (p[0] == '^') {
1186		*t++ = *p++;
1187		--len;
1188	}
1189
1190	/*
1191	 * Escape every other magic character we can find, meanwhile stripping
1192	 * the backslashes ctags inserts when escaping the search delimiter
1193	 * characters.
1194	 */
1195	for (; len > 0; --len) {
1196		if (p[0] == '\\' && (p[1] == '/' || p[1] == '?')) {
1197			++p;
1198			--len;
1199		} else if (STRCHR(L("^.[]$*"), p[0]))
1200			*t++ = '\\';
1201		*t++ = *p++;
1202	}
1203	if (lastdollar)
1204		*t++ = '$';
1205
1206	*ptrnp = bp;
1207	*plenp = t - bp;
1208	return (0);
1209}
1210
1211/*
1212 * re_cscope_conv --
1213 *	 Convert a cscope search path into something that the POSIX
1214 *      1003.2 RE functions can handle.
1215 */
1216static int
1217re_cscope_conv(SCR *sp, CHAR_T **ptrnp, size_t *plenp, int *replacedp)
1218{
1219	size_t blen, len, nspaces;
1220	CHAR_T *bp, *t;
1221	CHAR_T *p;
1222	CHAR_T *wp;
1223	size_t wlen;
1224
1225	/*
1226	 * Each space in the source line printed by cscope represents an
1227	 * arbitrary sequence of spaces, tabs, and comments.
1228	 */
1229#define	CSCOPE_RE_SPACE		"([ \t]|/\\*([^*]|\\*/)*\\*/)*"
1230#define CSCOPE_LEN	sizeof(CSCOPE_RE_SPACE) - 1
1231	CHAR2INT(sp, CSCOPE_RE_SPACE, CSCOPE_LEN, wp, wlen);
1232	for (nspaces = 0, p = *ptrnp, len = *plenp; len > 0; ++p, --len)
1233		if (*p == ' ')
1234			++nspaces;
1235
1236	/*
1237	 * Allocate plenty of space:
1238	 *	the string, plus potential escaping characters;
1239	 *	nspaces + 2 copies of CSCOPE_RE_SPACE;
1240	 *	^, $, nul terminator characters.
1241	 */
1242	*replacedp = 1;
1243	len = (p - *ptrnp) * 2 + (nspaces + 2) * sizeof(CSCOPE_RE_SPACE) + 3;
1244	GET_SPACE_RETW(sp, bp, blen, len);
1245
1246	p = *ptrnp;
1247	t = bp;
1248
1249	*t++ = '^';
1250	MEMCPY(t, wp, wlen);
1251	t += wlen;
1252
1253	for (len = *plenp; len > 0; ++p, --len)
1254		if (*p == ' ') {
1255			MEMCPY(t, wp, wlen);
1256			t += wlen;
1257		} else {
1258			if (STRCHR(L("\\^.[]$*+?()|{}"), *p))
1259				*t++ = '\\';
1260			*t++ = *p;
1261		}
1262
1263	MEMCPY(t, wp, wlen);
1264	t += wlen;
1265	*t++ = '$';
1266
1267	*ptrnp = bp;
1268	*plenp = t - bp;
1269	return (0);
1270}
1271
1272/*
1273 * re_error --
1274 *	Report a regular expression error.
1275 *
1276 * PUBLIC: void re_error(SCR *, int, regex_t *);
1277 */
1278void
1279re_error(SCR *sp, int errcode, regex_t *preg)
1280{
1281	size_t s;
1282	char *oe;
1283
1284	s = regerror(errcode, preg, "", 0);
1285	MALLOC(sp, oe, s);
1286	if (oe != NULL) {
1287		(void)regerror(errcode, preg, oe, s);
1288		msgq(sp, M_ERR, "RE error: %s", oe);
1289		free(oe);
1290	}
1291}
1292
1293/*
1294 * re_sub --
1295 * 	Do the substitution for a regular expression.
1296 */
1297static int
1298re_sub(
1299	SCR *sp,
1300	CHAR_T *ip,			/* Input line. */
1301	CHAR_T **lbp,
1302	size_t *lbclenp,
1303	size_t *lblenp,
1304	regmatch_t match[10])
1305{
1306	enum { C_NOTSET, C_LOWER, C_ONELOWER, C_ONEUPPER, C_UPPER } conv;
1307	size_t lbclen, lblen;		/* Local copies. */
1308	size_t mlen;			/* Match length. */
1309	size_t rpl;			/* Remaining replacement length. */
1310	CHAR_T *rp;			/* Replacement pointer. */
1311	int ch;
1312	int no;				/* Match replacement offset. */
1313	CHAR_T *p, *t;			/* Buffer pointers. */
1314	CHAR_T *lb;			/* Local copies. */
1315
1316	lb = *lbp;			/* Get local copies. */
1317	lbclen = *lbclenp;
1318	lblen = *lblenp;
1319
1320	/*
1321	 * QUOTING NOTE:
1322	 *
1323	 * There are some special sequences that vi provides in the
1324	 * replacement patterns.
1325	 *	 & string the RE matched (\& if nomagic set)
1326	 *	\# n-th regular subexpression
1327	 *	\E end \U, \L conversion
1328	 *	\e end \U, \L conversion
1329	 *	\l convert the next character to lower-case
1330	 *	\L convert to lower-case, until \E, \e, or end of replacement
1331	 *	\u convert the next character to upper-case
1332	 *	\U convert to upper-case, until \E, \e, or end of replacement
1333	 *
1334	 * Otherwise, since this is the lowest level of replacement, discard
1335	 * all escaping characters.  This (hopefully) matches historic practice.
1336	 */
1337#define	OUTCH(ch, nltrans) do {						\
1338	ARG_CHAR_T __ch = (ch);						\
1339	e_key_t __value = KEY_VAL(sp, __ch);				\
1340	if (nltrans && (__value == K_CR || __value == K_NL)) {		\
1341		NEEDNEWLINE(sp);					\
1342		sp->newl[sp->newl_cnt++] = lbclen;			\
1343	} else if (conv != C_NOTSET) {					\
1344		switch (conv) {						\
1345		case C_ONELOWER:					\
1346			conv = C_NOTSET;				\
1347			/* FALLTHROUGH */				\
1348		case C_LOWER:						\
1349			if (ISUPPER(__ch))				\
1350				__ch = TOLOWER(__ch);			\
1351			break;						\
1352		case C_ONEUPPER:					\
1353			conv = C_NOTSET;				\
1354			/* FALLTHROUGH */				\
1355		case C_UPPER:						\
1356			if (ISLOWER(__ch))				\
1357				__ch = TOUPPER(__ch);			\
1358			break;						\
1359		default:						\
1360			abort();					\
1361		}							\
1362	}								\
1363	NEEDSP(sp, 1, p);						\
1364	*p++ = __ch;							\
1365	++lbclen;							\
1366} while (0)
1367	conv = C_NOTSET;
1368	for (rp = sp->repl, rpl = sp->repl_len, p = lb + lbclen; rpl--;) {
1369		switch (ch = *rp++) {
1370		case '&':
1371			if (O_ISSET(sp, O_MAGIC)) {
1372				no = 0;
1373				goto subzero;
1374			}
1375			break;
1376		case '\\':
1377			if (rpl == 0)
1378				break;
1379			--rpl;
1380			switch (ch = *rp) {
1381			case '&':
1382				++rp;
1383				if (!O_ISSET(sp, O_MAGIC)) {
1384					no = 0;
1385					goto subzero;
1386				}
1387				break;
1388			case '0': case '1': case '2': case '3': case '4':
1389			case '5': case '6': case '7': case '8': case '9':
1390				no = *rp++ - '0';
1391subzero:			if (match[no].rm_so == -1 ||
1392				    match[no].rm_eo == -1)
1393					break;
1394				mlen = match[no].rm_eo - match[no].rm_so;
1395				for (t = ip + match[no].rm_so; mlen--; ++t)
1396					OUTCH(*t, 0);
1397				continue;
1398			case 'e':
1399			case 'E':
1400				++rp;
1401				conv = C_NOTSET;
1402				continue;
1403			case 'l':
1404				++rp;
1405				conv = C_ONELOWER;
1406				continue;
1407			case 'L':
1408				++rp;
1409				conv = C_LOWER;
1410				continue;
1411			case 'u':
1412				++rp;
1413				conv = C_ONEUPPER;
1414				continue;
1415			case 'U':
1416				++rp;
1417				conv = C_UPPER;
1418				continue;
1419			case '\r':
1420				OUTCH(ch, 0);
1421				continue;
1422			default:
1423				++rp;
1424				break;
1425			}
1426		}
1427		OUTCH(ch, 1);
1428	}
1429
1430	*lbp = lb;			/* Update caller's information. */
1431	*lbclenp = lbclen;
1432	*lblenp = lblen;
1433	return (0);
1434}
1435