1/*-
2 * Copyright (c) 1992, 1993, 1994
3 *	The Regents of the University of California.  All rights reserved.
4 * Copyright (c) 1992, 1993, 1994, 1995, 1996
5 *	Keith Bostic.  All rights reserved.
6 *
7 * See the LICENSE file for redistribution information.
8 */
9
10#include "config.h"
11
12#ifndef lint
13static const char sccsid[] = "$Id: ex_subst.c,v 10.53 2011/12/21 20:40:35 zy Exp $";
14#endif /* not lint */
15
16#include <sys/types.h>
17#include <sys/queue.h>
18#include <sys/time.h>
19
20#include <bitstring.h>
21#include <ctype.h>
22#include <errno.h>
23#include <limits.h>
24#include <stdio.h>
25#include <stdlib.h>
26#include <string.h>
27#include <unistd.h>
28
29#include "../common/common.h"
30#include "../vi/vi.h"
31
/*
 * Flags handed by the entry points in this file to s():
 *
 * SUB_FIRST	ex_s() compiled a new pattern for this command, so an 'r'
 *		suffix is rejected as meaningless.
 * SUB_MUSTSETR	ex_s() found no usable pattern delimiter, so the suffix
 *		list must contain 'r' or the command is a usage error.
 */
#define	SUB_FIRST	0x01		/* The 'r' flag isn't reasonable. */
#define	SUB_MUSTSETR	0x02		/* The 'r' flag is required. */
34
35static int re_conv __P((SCR *, CHAR_T **, size_t *, int *));
36static int re_cscope_conv __P((SCR *, CHAR_T **, size_t *, int *));
37static int re_sub __P((SCR *,
38		CHAR_T *, CHAR_T **, size_t *, size_t *, regmatch_t [10]));
39static int re_tag_conv __P((SCR *, CHAR_T **, size_t *, int *));
40static int s __P((SCR *, EXCMD *, CHAR_T *, regex_t *, u_int));
41
42/*
43 * ex_s --
44 *	[line [,line]] s[ubstitute] [[/;]pat[/;]/repl[/;] [cgr] [count] [#lp]]
45 *
46 *	Substitute on lines matching a pattern.
47 *
48 * PUBLIC: int ex_s __P((SCR *, EXCMD *));
49 */
50int
51ex_s(SCR *sp, EXCMD *cmdp)
52{
53	regex_t *re;
54	size_t blen, len;
55	u_int flags;
56	int delim;
57	CHAR_T *bp, *p, *ptrn, *rep, *t;
58
59	/*
60	 * Skip leading white space.
61	 *
62	 * !!!
63	 * Historic vi allowed any non-alphanumeric to serve as the
64	 * substitution command delimiter.
65	 *
66	 * !!!
67	 * If the arguments are empty, it's the same as &, i.e. we
68	 * repeat the last substitution.
69	 */
70	if (cmdp->argc == 0)
71		goto subagain;
72	for (p = cmdp->argv[0]->bp,
73	    len = cmdp->argv[0]->len; len > 0; --len, ++p) {
74		if (!cmdskip(*p))
75			break;
76	}
77	if (len == 0)
78subagain:	return (ex_subagain(sp, cmdp));
79
80	delim = *p++;
81	if (!isascii(delim) || isalnum(delim) || delim == '\\')
82		return (s(sp, cmdp, p, &sp->subre_c, SUB_MUSTSETR));
83
84	/*
85	 * !!!
86	 * The full-blown substitute command reset the remembered
87	 * state of the 'c' and 'g' suffices.
88	 */
89	sp->c_suffix = sp->g_suffix = 0;
90
91	/*
92	 * Get the pattern string, toss escaping characters.
93	 *
94	 * !!!
95	 * Historic vi accepted any of the following forms:
96	 *
97	 *	:s/abc/def/		change "abc" to "def"
98	 *	:s/abc/def		change "abc" to "def"
99	 *	:s/abc/			delete "abc"
100	 *	:s/abc			delete "abc"
101	 *
102	 * QUOTING NOTE:
103	 *
104	 * Only toss an escaping character if it escapes a delimiter.
105	 * This means that "s/A/\\\\f" replaces "A" with "\\f".  It
106	 * would be nice to be more regular, i.e. for each layer of
107	 * escaping a single escaping character is removed, but that's
108	 * not how the historic vi worked.
109	 */
110	for (ptrn = t = p;;) {
111		if (p[0] == '\0' || p[0] == delim) {
112			if (p[0] == delim)
113				++p;
114			/*
115			 * !!!
116			 * Nul terminate the pattern string -- it's passed
117			 * to regcomp which doesn't understand anything else.
118			 */
119			*t = '\0';
120			break;
121		}
122		if (p[0] == '\\')
123			if (p[1] == delim)
124				++p;
125			else if (p[1] == '\\')
126				*t++ = *p++;
127		*t++ = *p++;
128	}
129
130	/*
131	 * If the pattern string is empty, use the last RE (not just the
132	 * last substitution RE).
133	 */
134	if (*ptrn == '\0') {
135		if (sp->re == NULL) {
136			ex_emsg(sp, NULL, EXM_NOPREVRE);
137			return (1);
138		}
139
140		/* Re-compile the RE if necessary. */
141		if (!F_ISSET(sp, SC_RE_SEARCH) &&
142		    re_compile(sp, sp->re, sp->re_len,
143		    NULL, NULL, &sp->re_c, RE_C_SEARCH))
144			return (1);
145		flags = 0;
146	} else {
147		/*
148		 * !!!
149		 * Compile the RE.  Historic practice is that substitutes set
150		 * the search direction as well as both substitute and search
151		 * RE's.  We compile the RE twice, as we don't want to bother
152		 * ref counting the pattern string and (opaque) structure.
153		 */
154		if (re_compile(sp, ptrn, t - ptrn, &sp->re,
155		    &sp->re_len, &sp->re_c, RE_C_SEARCH))
156			return (1);
157		if (re_compile(sp, ptrn, t - ptrn, &sp->subre,
158		    &sp->subre_len, &sp->subre_c, RE_C_SUBST))
159			return (1);
160
161		flags = SUB_FIRST;
162		sp->searchdir = FORWARD;
163	}
164	re = &sp->re_c;
165
166	/*
167	 * Get the replacement string.
168	 *
169	 * The special character & (\& if O_MAGIC not set) matches the
170	 * entire RE.  No handling of & is required here, it's done by
171	 * re_sub().
172	 *
173	 * The special character ~ (\~ if O_MAGIC not set) inserts the
174	 * previous replacement string into this replacement string.
175	 * Count ~'s to figure out how much space we need.  We could
176	 * special case nonexistent last patterns or whether or not
177	 * O_MAGIC is set, but it's probably not worth the effort.
178	 *
179	 * QUOTING NOTE:
180	 *
181	 * Only toss an escaping character if it escapes a delimiter or
182	 * if O_MAGIC is set and it escapes a tilde.
183	 *
184	 * !!!
185	 * If the entire replacement pattern is "%", then use the last
186	 * replacement pattern.  This semantic was added to vi in System
187	 * V and then percolated elsewhere, presumably around the time
188	 * that it was added to their version of ed(1).
189	 */
190	if (p[0] == '\0' || p[0] == delim) {
191		if (p[0] == delim)
192			++p;
193		if (sp->repl != NULL)
194			free(sp->repl);
195		sp->repl = NULL;
196		sp->repl_len = 0;
197	} else if (p[0] == '%' && (p[1] == '\0' || p[1] == delim))
198		p += p[1] == delim ? 2 : 1;
199	else {
200		for (rep = p, len = 0;
201		    p[0] != '\0' && p[0] != delim; ++p, ++len)
202			if (p[0] == '~')
203				len += sp->repl_len;
204		GET_SPACE_RETW(sp, bp, blen, len);
205		for (t = bp, len = 0, p = rep;;) {
206			if (p[0] == '\0' || p[0] == delim) {
207				if (p[0] == delim)
208					++p;
209				break;
210			}
211			if (p[0] == '\\') {
212				if (p[1] == delim)
213					++p;
214				else if (p[1] == '\\') {
215					*t++ = *p++;
216					++len;
217				} else if (p[1] == '~') {
218					++p;
219					if (!O_ISSET(sp, O_MAGIC))
220						goto tilde;
221				}
222			} else if (p[0] == '~' && O_ISSET(sp, O_MAGIC)) {
223tilde:				++p;
224				MEMCPY(t, sp->repl, sp->repl_len);
225				t += sp->repl_len;
226				len += sp->repl_len;
227				continue;
228			}
229			*t++ = *p++;
230			++len;
231		}
232		if ((sp->repl_len = len) != 0) {
233			if (sp->repl != NULL)
234				free(sp->repl);
235			MALLOC(sp, sp->repl, CHAR_T *, len * sizeof(CHAR_T));
236			if (sp->repl == NULL) {
237				FREE_SPACEW(sp, bp, blen);
238				return (1);
239			}
240			MEMCPY(sp->repl, bp, len);
241		}
242		FREE_SPACEW(sp, bp, blen);
243	}
244	return (s(sp, cmdp, p, re, flags));
245}
246
247/*
248 * ex_subagain --
249 *	[line [,line]] & [cgr] [count] [#lp]]
250 *
251 *	Substitute using the last substitute RE and replacement pattern.
252 *
253 * PUBLIC: int ex_subagain __P((SCR *, EXCMD *));
254 */
255int
256ex_subagain(SCR *sp, EXCMD *cmdp)
257{
258	if (sp->subre == NULL) {
259		ex_emsg(sp, NULL, EXM_NOPREVRE);
260		return (1);
261	}
262	if (!F_ISSET(sp, SC_RE_SUBST) &&
263	    re_compile(sp, sp->subre, sp->subre_len,
264	    NULL, NULL, &sp->subre_c, RE_C_SUBST))
265		return (1);
266	return (s(sp,
267	    cmdp, cmdp->argc ? cmdp->argv[0]->bp : NULL, &sp->subre_c, 0));
268}
269
270/*
271 * ex_subtilde --
272 *	[line [,line]] ~ [cgr] [count] [#lp]]
273 *
274 *	Substitute using the last RE and last substitute replacement pattern.
275 *
276 * PUBLIC: int ex_subtilde __P((SCR *, EXCMD *));
277 */
278int
279ex_subtilde(SCR *sp, EXCMD *cmdp)
280{
281	if (sp->re == NULL) {
282		ex_emsg(sp, NULL, EXM_NOPREVRE);
283		return (1);
284	}
285	if (!F_ISSET(sp, SC_RE_SEARCH) && re_compile(sp, sp->re,
286	    sp->re_len, NULL, NULL, &sp->re_c, RE_C_SEARCH))
287		return (1);
288	return (s(sp,
289	    cmdp, cmdp->argc ? cmdp->argv[0]->bp : NULL, &sp->re_c, 0));
290}
291
292/*
293 * s --
294 * Do the substitution.  This stuff is *really* tricky.  There are lots of
295 * special cases, and general nastiness.  Don't mess with it unless you're
296 * pretty confident.
297 *
298 * The nasty part of the substitution is what happens when the replacement
299 * string contains newlines.  It's a bit tricky -- consider the information
300 * that has to be retained for "s/f\(o\)o/^M\1^M\1/".  The solution here is
301 * to build a set of newline offsets which we use to break the line up later,
302 * when the replacement is done.  Don't change it unless you're *damned*
303 * confident.
304 */
/*
 * Helper macros for the substitution code.  Each one expands to a brace
 * block (callers may omit the trailing semicolon) and executes
 * "return (1)" in the *enclosing function* if memory can't be obtained.
 * BUILD and NEEDSP operate on the caller's local variables lb (buffer),
 * lbclen (characters used) and lblen (characters allocated).
 *
 * NOTE(review): the failure tests assume REALLOC leaves its pointer
 * argument NULL when the allocation fails -- confirm against its
 * definition.
 */

/*
 * NEEDNEWLINE --
 *	Make sure sp->newl has room for one more newline offset, growing
 *	it 25 entries at a time.  On failure both the capacity and the
 *	count are cleared so no stale count is left to index a NULL array.
 */
#define	NEEDNEWLINE(sp) {						\
	if ((sp)->newl_len == (sp)->newl_cnt) {				\
		(sp)->newl_len += 25;					\
		REALLOC((sp), (sp)->newl, size_t *,			\
		    (sp)->newl_len * sizeof(size_t));			\
		if ((sp)->newl == NULL) {				\
			(sp)->newl_len = 0;				\
			(sp)->newl_cnt = 0;				\
			return (1);					\
		}							\
	}								\
}

/*
 * BUILD --
 *	Append len characters starting at l to the build buffer, growing
 *	the buffer first if they would not fit.
 */
#define	BUILD(sp, l, len) {						\
	if (lbclen + (len) > lblen) {					\
		lblen = p2roundup(MAX(lbclen + (len), 256));		\
		REALLOC((sp), lb, CHAR_T *, lblen * sizeof(CHAR_T));	\
		if (lb == NULL) {					\
			lbclen = 0;					\
			return (1);					\
		}							\
	}								\
	MEMCPY(lb + lbclen, (l), (len));				\
	lbclen += (len);						\
}

/*
 * NEEDSP --
 *	Make sure the build buffer has room for len more characters.  If
 *	the buffer had to be reallocated, pnt is re-pointed at the current
 *	end of the built text (lb + lbclen); otherwise pnt is untouched.
 */
#define	NEEDSP(sp, len, pnt) {						\
	if (lbclen + (len) > lblen) {					\
		lblen = p2roundup(MAX(lbclen + (len), 256));		\
		REALLOC((sp), lb, CHAR_T *, lblen * sizeof(CHAR_T));	\
		if (lb == NULL) {					\
			lbclen = 0;					\
			return (1);					\
		}							\
		(pnt) = lb + lbclen;					\
	}								\
}
341
342static int
343s(SCR *sp, EXCMD *cmdp, CHAR_T *s, regex_t *re, u_int flags)
344{
345	EVENT ev;
346	MARK from, to;
347	TEXTH tiq[] = {{ 0 }};
348	recno_t elno, lno, slno;
349	u_long ul;
350	regmatch_t match[10];
351	size_t blen, cnt, last, lbclen, lblen, len, llen;
352	size_t offset, saved_offset, scno;
353	int cflag, lflag, nflag, pflag, rflag;
354	int didsub, do_eol_match, eflags, empty_ok, eval;
355	int linechanged, matched, quit, rval;
356	CHAR_T *bp, *lb;
357	enum nresult nret;
358
359	NEEDFILE(sp, cmdp);
360
361	slno = sp->lno;
362	scno = sp->cno;
363
364	/*
365	 * !!!
366	 * Historically, the 'g' and 'c' suffices were always toggled as flags,
367	 * so ":s/A/B/" was the same as ":s/A/B/ccgg".  If O_EDCOMPATIBLE was
368	 * not set, they were initialized to 0 for all substitute commands.  If
369	 * O_EDCOMPATIBLE was set, they were initialized to 0 only if the user
370	 * specified substitute/replacement patterns (see ex_s()).
371	 */
372	if (!O_ISSET(sp, O_EDCOMPATIBLE))
373		sp->c_suffix = sp->g_suffix = 0;
374
375	/*
376	 * Historic vi permitted the '#', 'l' and 'p' options in vi mode, but
377	 * it only displayed the last change.  I'd disallow them, but they are
378	 * useful in combination with the [v]global commands.  In the current
379	 * model the problem is combining them with the 'c' flag -- the screen
380	 * would have to flip back and forth between the confirm screen and the
381	 * ex print screen, which would be pretty awful.  We do display all
382	 * changes, though, for what that's worth.
383	 *
384	 * !!!
385	 * Historic vi was fairly strict about the order of "options", the
386	 * count, and "flags".  I'm somewhat fuzzy on the difference between
387	 * options and flags, anyway, so this is a simpler approach, and we
388	 * just take it them in whatever order the user gives them.  (The ex
389	 * usage statement doesn't reflect this.)
390	 */
391	cflag = lflag = nflag = pflag = rflag = 0;
392	if (s == NULL)
393		goto noargs;
394	for (lno = OOBLNO; *s != '\0'; ++s)
395		switch (*s) {
396		case ' ':
397		case '\t':
398			continue;
399		case '+':
400			++cmdp->flagoff;
401			break;
402		case '-':
403			--cmdp->flagoff;
404			break;
405		case '0': case '1': case '2': case '3': case '4':
406		case '5': case '6': case '7': case '8': case '9':
407			if (lno != OOBLNO)
408				goto usage;
409			errno = 0;
410			nret = nget_uslong(&ul, s, &s, 10);
411			lno = ul;
412			if (*s == '\0')		/* Loop increment correction. */
413				--s;
414			if (nret != NUM_OK) {
415				if (nret == NUM_OVER)
416					msgq(sp, M_ERR, "153|Count overflow");
417				else if (nret == NUM_UNDER)
418					msgq(sp, M_ERR, "154|Count underflow");
419				else
420					msgq(sp, M_SYSERR, NULL);
421				return (1);
422			}
423			/*
424			 * In historic vi, the count was inclusive from the
425			 * second address.
426			 */
427			cmdp->addr1.lno = cmdp->addr2.lno;
428			cmdp->addr2.lno += lno - 1;
429			if (!db_exist(sp, cmdp->addr2.lno) &&
430			    db_last(sp, &cmdp->addr2.lno))
431				return (1);
432			break;
433		case '#':
434			nflag = 1;
435			break;
436		case 'c':
437			sp->c_suffix = !sp->c_suffix;
438
439			/* Ex text structure initialization. */
440			if (F_ISSET(sp, SC_EX))
441				TAILQ_INIT(tiq);
442			break;
443		case 'g':
444			sp->g_suffix = !sp->g_suffix;
445			break;
446		case 'l':
447			lflag = 1;
448			break;
449		case 'p':
450			pflag = 1;
451			break;
452		case 'r':
453			if (LF_ISSET(SUB_FIRST)) {
454				msgq(sp, M_ERR,
455		    "155|Regular expression specified; r flag meaningless");
456				return (1);
457			}
458			if (!F_ISSET(sp, SC_RE_SEARCH)) {
459				ex_emsg(sp, NULL, EXM_NOPREVRE);
460				return (1);
461			}
462			rflag = 1;
463			re = &sp->re_c;
464			break;
465		default:
466			goto usage;
467		}
468
469	if (*s != '\0' || (!rflag && LF_ISSET(SUB_MUSTSETR))) {
470usage:		ex_emsg(sp, cmdp->cmd->usage, EXM_USAGE);
471		return (1);
472	}
473
474noargs:	if (F_ISSET(sp, SC_VI) && sp->c_suffix && (lflag || nflag || pflag)) {
475		msgq(sp, M_ERR,
476"156|The #, l and p flags may not be combined with the c flag in vi mode");
477		return (1);
478	}
479
480	/*
481	 * bp:		if interactive, line cache
482	 * blen:	if interactive, line cache length
483	 * lb:		build buffer pointer.
484	 * lbclen:	current length of built buffer.
485	 * lblen;	length of build buffer.
486	 */
487	bp = lb = NULL;
488	blen = lbclen = lblen = 0;
489
490	/* For each line... */
491	lno = cmdp->addr1.lno == 0 ? 1 : cmdp->addr1.lno;
492	for (matched = quit = 0,
493	    elno = cmdp->addr2.lno; !quit && lno <= elno; ++lno) {
494
495		/* Someone's unhappy, time to stop. */
496		if (INTERRUPTED(sp))
497			break;
498
499		/* Get the line. */
500		if (db_get(sp, lno, DBG_FATAL, &s, &llen))
501			goto err;
502
503		/*
504		 * Make a local copy if doing confirmation -- when calling
505		 * the confirm routine we're likely to lose the cached copy.
506		 */
507		if (sp->c_suffix) {
508			if (bp == NULL) {
509				GET_SPACE_RETW(sp, bp, blen, llen);
510			} else
511				ADD_SPACE_RETW(sp, bp, blen, llen);
512			MEMCPY(bp, s, llen);
513			s = bp;
514		}
515
516		/* Start searching from the beginning. */
517		offset = 0;
518		len = llen;
519
520		/* Reset the build buffer offset. */
521		lbclen = 0;
522
523		/* Reset empty match flag. */
524		empty_ok = 1;
525
526		/*
527		 * We don't want to have to do a setline if the line didn't
528		 * change -- keep track of whether or not this line changed.
529		 * If doing confirmations, don't want to keep setting the
530		 * line if change is refused -- keep track of substitutions.
531		 */
532		didsub = linechanged = 0;
533
534		/* New line, do an EOL match. */
535		do_eol_match = 1;
536
537		/* It's not nul terminated, but we pretend it is. */
538		eflags = REG_STARTEND;
539
540		/*
541		 * The search area is from s + offset to the EOL.
542		 *
543		 * Generally, match[0].rm_so is the offset of the start
544		 * of the match from the start of the search, and offset
545		 * is the offset of the start of the last search.
546		 */
547nextmatch:	match[0].rm_so = 0;
548		match[0].rm_eo = len;
549
550		/* Get the next match. */
551		eval = regexec(re, s + offset, 10, match, eflags);
552
553		/*
554		 * There wasn't a match or if there was an error, deal with
555		 * it.  If there was a previous match in this line, resolve
556		 * the changes into the database.  Otherwise, just move on.
557		 */
558		if (eval == REG_NOMATCH)
559			goto endmatch;
560		if (eval != 0) {
561			re_error(sp, eval, re);
562			goto err;
563		}
564		matched = 1;
565
566		/* Only the first search can match an anchored expression. */
567		eflags |= REG_NOTBOL;
568
569		/*
570		 * !!!
571		 * It's possible to match 0-length strings -- for example, the
572		 * command s;a*;X;, when matched against the string "aabb" will
573		 * result in "XbXbX", i.e. the matches are "aa", the space
574		 * between the b's and the space between the b's and the end of
575		 * the string.  There is a similar space between the beginning
576		 * of the string and the a's.  The rule that we use (because vi
577		 * historically used it) is that any 0-length match, occurring
578		 * immediately after a match, is ignored.  Otherwise, the above
579		 * example would have resulted in "XXbXbX".  Another example is
580		 * incorrectly using " *" to replace groups of spaces with one
581		 * space.
582		 *
583		 * The way we do this is that if we just had a successful match,
584		 * the starting offset does not skip characters, and the match
585		 * is empty, ignore the match and move forward.  If there's no
586		 * more characters in the string, we were attempting to match
587		 * after the last character, so quit.
588		 */
589		if (!empty_ok && match[0].rm_so == 0 && match[0].rm_eo == 0) {
590			empty_ok = 1;
591			if (len == 0)
592				goto endmatch;
593			BUILD(sp, s + offset, 1)
594			++offset;
595			--len;
596			goto nextmatch;
597		}
598
599		/* Confirm change. */
600		if (sp->c_suffix) {
601			/*
602			 * Set the cursor position for confirmation.  Note,
603			 * if we matched on a '$', the cursor may be past
604			 * the end of line.
605			 */
606			from.lno = to.lno = lno;
607			from.cno = match[0].rm_so + offset;
608			to.cno = match[0].rm_eo + offset;
609			/*
610			 * Both ex and vi have to correct for a change before
611			 * the first character in the line.
612			 */
613			if (llen == 0)
614				from.cno = to.cno = 0;
615			if (F_ISSET(sp, SC_VI)) {
616				/*
617				 * Only vi has to correct for a change after
618				 * the last character in the line.
619				 *
620				 * XXX
621				 * It would be nice to change the vi code so
622				 * that we could display a cursor past EOL.
623				 */
624				if (to.cno >= llen)
625					to.cno = llen - 1;
626				if (from.cno >= llen)
627					from.cno = llen - 1;
628
629				sp->lno = from.lno;
630				sp->cno = from.cno;
631				if (vs_refresh(sp, 1))
632					goto err;
633
634				vs_update(sp, msg_cat(sp,
635				    "169|Confirm change? [n]", NULL), NULL);
636
637				if (v_event_get(sp, &ev, 0, 0))
638					goto err;
639				switch (ev.e_event) {
640				case E_CHARACTER:
641					break;
642				case E_EOF:
643				case E_ERR:
644				case E_INTERRUPT:
645					goto lquit;
646				default:
647					v_event_err(sp, &ev);
648					goto lquit;
649				}
650			} else {
651				if (ex_print(sp, cmdp, &from, &to, 0) ||
652				    ex_scprint(sp, &from, &to))
653					goto lquit;
654				if (ex_txt(sp, tiq, 0, TXT_CR))
655					goto err;
656				ev.e_c = TAILQ_FIRST(tiq)->lb[0];
657			}
658
659			switch (ev.e_c) {
660			case CH_YES:
661				break;
662			default:
663			case CH_NO:
664				didsub = 0;
665				BUILD(sp, s +offset, match[0].rm_eo);
666				goto skip;
667			case CH_QUIT:
668				/* Set the quit/interrupted flags. */
669lquit:				quit = 1;
670				F_SET(sp->gp, G_INTERRUPTED);
671
672				/*
673				 * Resolve any changes, then return to (and
674				 * exit from) the main loop.
675				 */
676				goto endmatch;
677			}
678		}
679
680		/*
681		 * Set the cursor to the last position changed, converting
682		 * from 1-based to 0-based.
683		 */
684		sp->lno = lno;
685		sp->cno = match[0].rm_so;
686
687		/* Copy the bytes before the match into the build buffer. */
688		BUILD(sp, s + offset, match[0].rm_so);
689
690		/* Substitute the matching bytes. */
691		didsub = 1;
692		if (re_sub(sp, s + offset, &lb, &lbclen, &lblen, match))
693			goto err;
694
695		/* Set the change flag so we know this line was modified. */
696		linechanged = 1;
697
698		/* Move past the matched bytes. */
699skip:		offset += match[0].rm_eo;
700		len -= match[0].rm_eo;
701
702		/* A match cannot be followed by an empty pattern. */
703		empty_ok = 0;
704
705		/*
706		 * If doing a global change with confirmation, we have to
707		 * update the screen.  The basic idea is to store the line
708		 * so the screen update routines can find it, and restart.
709		 */
710		if (didsub && sp->c_suffix && sp->g_suffix) {
711			/*
712			 * The new search offset will be the end of the
713			 * modified line.
714			 */
715			saved_offset = lbclen;
716
717			/* Copy the rest of the line. */
718			if (len)
719				BUILD(sp, s + offset, len)
720
721			/* Set the new offset. */
722			offset = saved_offset;
723
724			/* Store inserted lines, adjusting the build buffer. */
725			last = 0;
726			if (sp->newl_cnt) {
727				for (cnt = 0;
728				    cnt < sp->newl_cnt; ++cnt, ++lno, ++elno) {
729					if (db_insert(sp, lno,
730					    lb + last, sp->newl[cnt] - last))
731						goto err;
732					last = sp->newl[cnt] + 1;
733					++sp->rptlines[L_ADDED];
734				}
735				lbclen -= last;
736				offset -= last;
737				sp->newl_cnt = 0;
738			}
739
740			/* Store and retrieve the line. */
741			if (db_set(sp, lno, lb + last, lbclen))
742				goto err;
743			if (db_get(sp, lno, DBG_FATAL, &s, &llen))
744				goto err;
745			ADD_SPACE_RETW(sp, bp, blen, llen)
746			MEMCPY(bp, s, llen);
747			s = bp;
748			len = llen - offset;
749
750			/* Restart the build. */
751			lbclen = 0;
752			BUILD(sp, s, offset);
753
754			/*
755			 * If we haven't already done the after-the-string
756			 * match, do one.  Set REG_NOTEOL so the '$' pattern
757			 * only matches once.
758			 */
759			if (!do_eol_match)
760				goto endmatch;
761			if (offset == len) {
762				do_eol_match = 0;
763				eflags |= REG_NOTEOL;
764			}
765			goto nextmatch;
766		}
767
768		/*
769		 * If it's a global:
770		 *
771		 * If at the end of the string, do a test for the after
772		 * the string match.  Set REG_NOTEOL so the '$' pattern
773		 * only matches once.
774		 */
775		if (sp->g_suffix && do_eol_match) {
776			if (len == 0) {
777				do_eol_match = 0;
778				eflags |= REG_NOTEOL;
779			}
780			goto nextmatch;
781		}
782
783endmatch:	if (!linechanged)
784			continue;
785
786		/* Copy any remaining bytes into the build buffer. */
787		if (len)
788			BUILD(sp, s + offset, len)
789
790		/* Store inserted lines, adjusting the build buffer. */
791		last = 0;
792		if (sp->newl_cnt) {
793			for (cnt = 0;
794			    cnt < sp->newl_cnt; ++cnt, ++lno, ++elno) {
795				if (db_insert(sp,
796				    lno, lb + last, sp->newl[cnt] - last))
797					goto err;
798				last = sp->newl[cnt] + 1;
799				++sp->rptlines[L_ADDED];
800			}
801			lbclen -= last;
802			sp->newl_cnt = 0;
803		}
804
805		/* Store the changed line. */
806		if (db_set(sp, lno, lb + last, lbclen))
807			goto err;
808
809		/* Update changed line counter. */
810		if (sp->rptlchange != lno) {
811			sp->rptlchange = lno;
812			++sp->rptlines[L_CHANGED];
813		}
814
815		/*
816		 * !!!
817		 * Display as necessary.  Historic practice is to only
818		 * display the last line of a line split into multiple
819		 * lines.
820		 */
821		if (lflag || nflag || pflag) {
822			from.lno = to.lno = lno;
823			from.cno = to.cno = 0;
824			if (lflag)
825				(void)ex_print(sp, cmdp, &from, &to, E_C_LIST);
826			if (nflag)
827				(void)ex_print(sp, cmdp, &from, &to, E_C_HASH);
828			if (pflag)
829				(void)ex_print(sp, cmdp, &from, &to, E_C_PRINT);
830		}
831	}
832
833	/*
834	 * !!!
835	 * Historically, vi attempted to leave the cursor at the same place if
836	 * the substitution was done at the current cursor position.  Otherwise
837	 * it moved it to the first non-blank of the last line changed.  There
838	 * were some problems: for example, :s/$/foo/ with the cursor on the
839	 * last character of the line left the cursor on the last character, or
840	 * the & command with multiple occurrences of the matching string in the
841	 * line usually left the cursor in a fairly random position.
842	 *
843	 * We try to do the same thing, with the exception that if the user is
844	 * doing substitution with confirmation, we move to the last line about
845	 * which the user was consulted, as opposed to the last line that they
846	 * actually changed.  This prevents a screen flash if the user doesn't
847	 * change many of the possible lines.
848	 */
849	if (!sp->c_suffix && (sp->lno != slno || sp->cno != scno)) {
850		sp->cno = 0;
851		(void)nonblank(sp, sp->lno, &sp->cno);
852	}
853
854	/*
855	 * If not in a global command, and nothing matched, say so.
856	 * Else, if none of the lines displayed, put something up.
857	 */
858	rval = 0;
859	if (!matched) {
860		if (!F_ISSET(sp, SC_EX_GLOBAL)) {
861			msgq(sp, M_ERR, "157|No match found");
862			goto err;
863		}
864	} else if (!lflag && !nflag && !pflag)
865		F_SET(cmdp, E_AUTOPRINT);
866
867	if (0) {
868err:		rval = 1;
869	}
870
871	if (bp != NULL)
872		FREE_SPACEW(sp, bp, blen);
873	if (lb != NULL)
874		free(lb);
875	return (rval);
876}
877
878/*
879 * re_compile --
880 *	Compile the RE.
881 *
882 * PUBLIC: int re_compile __P((SCR *,
883 * PUBLIC:     CHAR_T *, size_t, CHAR_T **, size_t *, regex_t *, u_int));
884 */
885int
886re_compile(SCR *sp, CHAR_T *ptrn, size_t plen, CHAR_T **ptrnp, size_t *lenp, regex_t *rep, u_int flags)
887{
888	size_t len;
889	int reflags, replaced, rval;
890	CHAR_T *p;
891
892	/* Set RE flags. */
893	reflags = 0;
894	if (!LF_ISSET(RE_C_CSCOPE | RE_C_TAG)) {
895		if (O_ISSET(sp, O_EXTENDED))
896			reflags |= REG_EXTENDED;
897		if (O_ISSET(sp, O_IGNORECASE))
898			reflags |= REG_ICASE;
899		if (O_ISSET(sp, O_ICLOWER)) {
900			for (p = ptrn, len = plen; len > 0; ++p, --len)
901				if (ISUPPER(*p))
902					break;
903			if (len == 0)
904				reflags |= REG_ICASE;
905		}
906	}
907
908	/* If we're replacing a saved value, clear the old one. */
909	if (LF_ISSET(RE_C_SEARCH) && F_ISSET(sp, SC_RE_SEARCH)) {
910		regfree(&sp->re_c);
911		F_CLR(sp, SC_RE_SEARCH);
912	}
913	if (LF_ISSET(RE_C_SUBST) && F_ISSET(sp, SC_RE_SUBST)) {
914		regfree(&sp->subre_c);
915		F_CLR(sp, SC_RE_SUBST);
916	}
917
918	/*
919	 * If we're saving the string, it's a pattern we haven't seen before,
920	 * so convert the vi-style RE's to POSIX 1003.2 RE's.  Save a copy for
921	 * later recompilation.   Free any previously saved value.
922	 */
923	if (ptrnp != NULL) {
924		replaced = 0;
925		if (LF_ISSET(RE_C_CSCOPE)) {
926			if (re_cscope_conv(sp, &ptrn, &plen, &replaced))
927				return (1);
928			/*
929			 * XXX
930			 * Currently, the match-any-<blank> expression used in
931			 * re_cscope_conv() requires extended RE's.  This may
932			 * not be right or safe.
933			 */
934			reflags |= REG_EXTENDED;
935		} else if (LF_ISSET(RE_C_TAG)) {
936			if (re_tag_conv(sp, &ptrn, &plen, &replaced))
937				return (1);
938		} else
939			if (re_conv(sp, &ptrn, &plen, &replaced))
940				return (1);
941
942		/* Discard previous pattern. */
943		if (*ptrnp != NULL) {
944			free(*ptrnp);
945			*ptrnp = NULL;
946		}
947		if (lenp != NULL)
948			*lenp = plen;
949
950		/*
951		 * Copy the string into allocated memory.
952		 *
953		 * XXX
954		 * Regcomp isn't 8-bit clean, so the pattern is nul-terminated
955		 * for now.  There's just no other solution.
956		 */
957		MALLOC(sp, *ptrnp, CHAR_T *, (plen + 1) * sizeof(CHAR_T));
958		if (*ptrnp != NULL) {
959			MEMCPY(*ptrnp, ptrn, plen);
960			(*ptrnp)[plen] = '\0';
961		}
962
963		/* Free up conversion-routine-allocated memory. */
964		if (replaced)
965			FREE_SPACEW(sp, ptrn, 0);
966
967		if (*ptrnp == NULL)
968			return (1);
969
970		ptrn = *ptrnp;
971	}
972
973	/*
974	 * XXX
975	 * Regcomp isn't 8-bit clean, so we just lost if the pattern
976	 * contained a nul.  Bummer!
977	 */
978	if ((rval = regcomp(rep, ptrn, /* plen, */ reflags)) != 0) {
979		if (!LF_ISSET(RE_C_SILENT))
980			re_error(sp, rval, rep);
981		return (1);
982	}
983
984	if (LF_ISSET(RE_C_SEARCH))
985		F_SET(sp, SC_RE_SEARCH);
986	if (LF_ISSET(RE_C_SUBST))
987		F_SET(sp, SC_RE_SUBST);
988
989	return (0);
990}
991
992/*
993 * re_conv --
994 *	Convert vi's regular expressions into something that the
995 *	the POSIX 1003.2 RE functions can handle.
996 *
997 * There are three conversions we make to make vi's RE's (specifically
998 * the global, search, and substitute patterns) work with POSIX RE's.
999 *
1000 * 1: If O_MAGIC is not set, strip backslashes from the magic character
1001 *    set (.[*~) that have them, and add them to the ones that don't.
1002 * 2: If O_MAGIC is not set, the string "\~" is replaced with the text
1003 *    from the last substitute command's replacement string.  If O_MAGIC
1004 *    is set, it's the string "~".
1005 * 3: The pattern \<ptrn\> does "word" searches, convert it to use the
1006 *    new RE escapes.
1007 *
1008 * !!!/XXX
1009 * This doesn't exactly match the historic behavior of vi because we do
1010 * the ~ substitution before calling the RE engine, so magic characters
1011 * in the replacement string will be expanded by the RE engine, and they
1012 * weren't historically.  It's a bug.
1013 */
1014static int
1015re_conv(SCR *sp, CHAR_T **ptrnp, size_t *plenp, int *replacedp)
1016{
1017	size_t blen, len, needlen;
1018	int magic;
1019	CHAR_T *bp, *p, *t;
1020
1021	/*
1022	 * First pass through, we figure out how much space we'll need.
1023	 * We do it in two passes, on the grounds that most of the time
1024	 * the user is doing a search and won't have magic characters.
1025	 * That way we can skip most of the memory allocation and copies.
1026	 */
1027	magic = 0;
1028	for (p = *ptrnp, len = *plenp, needlen = 0; len > 0; ++p, --len)
1029		switch (*p) {
1030		case '\\':
1031			if (len > 1) {
1032				--len;
1033				switch (*++p) {
1034				case '<':
1035					magic = 1;
1036					needlen += RE_WSTART_LEN + 1;
1037					break;
1038				case '>':
1039					magic = 1;
1040					needlen += RE_WSTOP_LEN + 1;
1041					break;
1042				case '~':
1043					if (!O_ISSET(sp, O_MAGIC)) {
1044						magic = 1;
1045						needlen += sp->repl_len;
1046					}
1047					break;
1048				case '.':
1049				case '[':
1050				case '*':
1051					if (!O_ISSET(sp, O_MAGIC)) {
1052						magic = 1;
1053						needlen += 1;
1054					}
1055					break;
1056				default:
1057					needlen += 2;
1058				}
1059			} else
1060				needlen += 1;
1061			break;
1062		case '~':
1063			if (O_ISSET(sp, O_MAGIC)) {
1064				magic = 1;
1065				needlen += sp->repl_len;
1066			}
1067			break;
1068		case '.':
1069		case '[':
1070		case '*':
1071			if (!O_ISSET(sp, O_MAGIC)) {
1072				magic = 1;
1073				needlen += 2;
1074			}
1075			break;
1076		default:
1077			needlen += 1;
1078			break;
1079		}
1080
1081	if (!magic) {
1082		*replacedp = 0;
1083		return (0);
1084	}
1085
1086	/* Get enough memory to hold the final pattern. */
1087	*replacedp = 1;
1088	GET_SPACE_RETW(sp, bp, blen, needlen);
1089
1090	for (p = *ptrnp, len = *plenp, t = bp; len > 0; ++p, --len)
1091		switch (*p) {
1092		case '\\':
1093			if (len > 1) {
1094				--len;
1095				switch (*++p) {
1096				case '<':
1097					MEMCPY(t,
1098					    RE_WSTART, RE_WSTART_LEN);
1099					t += RE_WSTART_LEN;
1100					break;
1101				case '>':
1102					MEMCPY(t,
1103					    RE_WSTOP, RE_WSTOP_LEN);
1104					t += RE_WSTOP_LEN;
1105					break;
1106				case '~':
1107					if (O_ISSET(sp, O_MAGIC))
1108						*t++ = '~';
1109					else {
1110						MEMCPY(t,
1111						    sp->repl, sp->repl_len);
1112						t += sp->repl_len;
1113					}
1114					break;
1115				case '.':
1116				case '[':
1117				case '*':
1118					if (O_ISSET(sp, O_MAGIC))
1119						*t++ = '\\';
1120					*t++ = *p;
1121					break;
1122				default:
1123					*t++ = '\\';
1124					*t++ = *p;
1125				}
1126			} else
1127				*t++ = '\\';
1128			break;
1129		case '~':
1130			if (O_ISSET(sp, O_MAGIC)) {
1131				MEMCPY(t, sp->repl, sp->repl_len);
1132				t += sp->repl_len;
1133			} else
1134				*t++ = '~';
1135			break;
1136		case '.':
1137		case '[':
1138		case '*':
1139			if (!O_ISSET(sp, O_MAGIC))
1140				*t++ = '\\';
1141			*t++ = *p;
1142			break;
1143		default:
1144			*t++ = *p;
1145			break;
1146		}
1147
1148	*ptrnp = bp;
1149	*plenp = t - bp;
1150	return (0);
1151}
1152
1153/*
1154 * re_tag_conv --
1155 *	Convert a tags search path into something that the POSIX
1156 *	1003.2 RE functions can handle.
1157 */
1158static int
1159re_tag_conv(SCR *sp, CHAR_T **ptrnp, size_t *plenp, int *replacedp)
1160{
1161	size_t blen, len;
1162	int lastdollar;
1163	CHAR_T *bp, *p, *t;
1164
1165	len = *plenp;
1166
1167	/* Max memory usage is 2 times the length of the string. */
1168	*replacedp = 1;
1169	GET_SPACE_RETW(sp, bp, blen, len * 2);
1170
1171	p = *ptrnp;
1172	t = bp;
1173
1174	/* If the last character is a '/' or '?', we just strip it. */
1175	if (len > 0 && (p[len - 1] == '/' || p[len - 1] == '?'))
1176		--len;
1177
1178	/* If the next-to-last or last character is a '$', it's magic. */
1179	if (len > 0 && p[len - 1] == '$') {
1180		--len;
1181		lastdollar = 1;
1182	} else
1183		lastdollar = 0;
1184
1185	/* If the first character is a '/' or '?', we just strip it. */
1186	if (len > 0 && (p[0] == '/' || p[0] == '?')) {
1187		++p;
1188		--len;
1189	}
1190
1191	/* If the first or second character is a '^', it's magic. */
1192	if (p[0] == '^') {
1193		*t++ = *p++;
1194		--len;
1195	}
1196
1197	/*
1198	 * Escape every other magic character we can find, meanwhile stripping
1199	 * the backslashes ctags inserts when escaping the search delimiter
1200	 * characters.
1201	 */
1202	for (; len > 0; --len) {
1203		if (p[0] == '\\' && (p[1] == '/' || p[1] == '?')) {
1204			++p;
1205			--len;
1206		} else if (STRCHR(L("^.[]$*"), p[0]))
1207			*t++ = '\\';
1208		*t++ = *p++;
1209	}
1210	if (lastdollar)
1211		*t++ = '$';
1212
1213	*ptrnp = bp;
1214	*plenp = t - bp;
1215	return (0);
1216}
1217
1218/*
1219 * re_cscope_conv --
1220 *	 Convert a cscope search path into something that the POSIX
1221 *      1003.2 RE functions can handle.
1222 */
1223static int
1224re_cscope_conv(SCR *sp, CHAR_T **ptrnp, size_t *plenp, int *replacedp)
1225{
1226	size_t blen, len, nspaces;
1227	CHAR_T *bp, *t;
1228	CHAR_T *p;
1229	CHAR_T *wp;
1230	size_t wlen;
1231
1232	/*
1233	 * Each space in the source line printed by cscope represents an
1234	 * arbitrary sequence of spaces, tabs, and comments.
1235	 */
1236#define	CSCOPE_RE_SPACE		"([ \t]|/\\*([^*]|\\*/)*\\*/)*"
1237#define CSCOPE_LEN	sizeof(CSCOPE_RE_SPACE) - 1
1238	CHAR2INT(sp, CSCOPE_RE_SPACE, CSCOPE_LEN, wp, wlen);
1239	for (nspaces = 0, p = *ptrnp, len = *plenp; len > 0; ++p, --len)
1240		if (*p == ' ')
1241			++nspaces;
1242
1243	/*
1244	 * Allocate plenty of space:
1245	 *	the string, plus potential escaping characters;
1246	 *	nspaces + 2 copies of CSCOPE_RE_SPACE;
1247	 *	^, $, nul terminator characters.
1248	 */
1249	*replacedp = 1;
1250	len = (p - *ptrnp) * 2 + (nspaces + 2) * sizeof(CSCOPE_RE_SPACE) + 3;
1251	GET_SPACE_RETW(sp, bp, blen, len);
1252
1253	p = *ptrnp;
1254	t = bp;
1255
1256	*t++ = '^';
1257	MEMCPY(t, wp, wlen);
1258	t += wlen;
1259
1260	for (len = *plenp; len > 0; ++p, --len)
1261		if (*p == ' ') {
1262			MEMCPY(t, wp, wlen);
1263			t += wlen;
1264		} else {
1265			if (STRCHR(L("\\^.[]$*+?()|{}"), *p))
1266				*t++ = '\\';
1267			*t++ = *p;
1268		}
1269
1270	MEMCPY(t, wp, wlen);
1271	t += wlen;
1272	*t++ = '$';
1273
1274	*ptrnp = bp;
1275	*plenp = t - bp;
1276	return (0);
1277}
1278
1279/*
1280 * re_error --
1281 *	Report a regular expression error.
1282 *
1283 * PUBLIC: void re_error __P((SCR *, int, regex_t *));
1284 */
1285void
1286re_error(SCR *sp, int errcode, regex_t *preg)
1287{
1288	size_t s;
1289	char *oe;
1290
1291	s = regerror(errcode, preg, "", 0);
1292	MALLOC(sp, oe, char *, s);
1293	if (oe != NULL) {
1294		(void)regerror(errcode, preg, oe, s);
1295		msgq(sp, M_ERR, "RE error: %s", oe);
1296		free(oe);
1297	}
1298}
1299
1300/*
1301 * re_sub --
1302 * 	Do the substitution for a regular expression.
1303 */
1304static int
1305re_sub(
1306	SCR *sp,
1307	CHAR_T *ip,			/* Input line. */
1308	CHAR_T **lbp,
1309	size_t *lbclenp,
1310	size_t *lblenp,
1311	regmatch_t match[10])
1312{
1313	enum { C_NOTSET, C_LOWER, C_ONELOWER, C_ONEUPPER, C_UPPER } conv;
1314	size_t lbclen, lblen;		/* Local copies. */
1315	size_t mlen;			/* Match length. */
1316	size_t rpl;			/* Remaining replacement length. */
1317	CHAR_T *rp;			/* Replacement pointer. */
1318	int ch;
1319	int no;				/* Match replacement offset. */
1320	CHAR_T *p, *t;			/* Buffer pointers. */
1321	CHAR_T *lb;			/* Local copies. */
1322
1323	lb = *lbp;			/* Get local copies. */
1324	lbclen = *lbclenp;
1325	lblen = *lblenp;
1326
1327	/*
1328	 * QUOTING NOTE:
1329	 *
1330	 * There are some special sequences that vi provides in the
1331	 * replacement patterns.
1332	 *	 & string the RE matched (\& if nomagic set)
1333	 *	\# n-th regular subexpression
1334	 *	\E end \U, \L conversion
1335	 *	\e end \U, \L conversion
1336	 *	\l convert the next character to lower-case
1337	 *	\L convert to lower-case, until \E, \e, or end of replacement
1338	 *	\u convert the next character to upper-case
1339	 *	\U convert to upper-case, until \E, \e, or end of replacement
1340	 *
1341	 * Otherwise, since this is the lowest level of replacement, discard
1342	 * all escaping characters.  This (hopefully) matches historic practice.
1343	 */
1344#define	OUTCH(ch, nltrans) {						\
1345	ARG_CHAR_T __ch = (ch);						\
1346	e_key_t __value = KEY_VAL(sp, __ch);				\
1347	if (nltrans && (__value == K_CR || __value == K_NL)) {		\
1348		NEEDNEWLINE(sp);					\
1349		sp->newl[sp->newl_cnt++] = lbclen;			\
1350	} else if (conv != C_NOTSET) {					\
1351		switch (conv) {						\
1352		case C_ONELOWER:					\
1353			conv = C_NOTSET;				\
1354			/* FALLTHROUGH */				\
1355		case C_LOWER:						\
1356			if (ISUPPER(__ch))				\
1357				__ch = TOLOWER(__ch);			\
1358			break;						\
1359		case C_ONEUPPER:					\
1360			conv = C_NOTSET;				\
1361			/* FALLTHROUGH */				\
1362		case C_UPPER:						\
1363			if (ISLOWER(__ch))				\
1364				__ch = TOUPPER(__ch);			\
1365			break;						\
1366		default:						\
1367			abort();					\
1368		}							\
1369	}								\
1370	NEEDSP(sp, 1, p);						\
1371	*p++ = __ch;							\
1372	++lbclen;							\
1373}
1374	conv = C_NOTSET;
1375	for (rp = sp->repl, rpl = sp->repl_len, p = lb + lbclen; rpl--;) {
1376		switch (ch = *rp++) {
1377		case '&':
1378			if (O_ISSET(sp, O_MAGIC)) {
1379				no = 0;
1380				goto subzero;
1381			}
1382			break;
1383		case '\\':
1384			if (rpl == 0)
1385				break;
1386			--rpl;
1387			switch (ch = *rp) {
1388			case '&':
1389				++rp;
1390				if (!O_ISSET(sp, O_MAGIC)) {
1391					no = 0;
1392					goto subzero;
1393				}
1394				break;
1395			case '0': case '1': case '2': case '3': case '4':
1396			case '5': case '6': case '7': case '8': case '9':
1397				no = *rp++ - '0';
1398subzero:			if (match[no].rm_so == -1 ||
1399			    	    match[no].rm_eo == -1)
1400					break;
1401				mlen = match[no].rm_eo - match[no].rm_so;
1402				for (t = ip + match[no].rm_so; mlen--; ++t)
1403					OUTCH(*t, 0);
1404				continue;
1405			case 'e':
1406			case 'E':
1407				++rp;
1408				conv = C_NOTSET;
1409				continue;
1410			case 'l':
1411				++rp;
1412				conv = C_ONELOWER;
1413				continue;
1414			case 'L':
1415				++rp;
1416				conv = C_LOWER;
1417				continue;
1418			case 'u':
1419				++rp;
1420				conv = C_ONEUPPER;
1421				continue;
1422			case 'U':
1423				++rp;
1424				conv = C_UPPER;
1425				continue;
1426			case '\r':
1427				OUTCH(ch, 0);
1428				continue;
1429			default:
1430				++rp;
1431				break;
1432			}
1433		}
1434		OUTCH(ch, 1);
1435	}
1436
1437	*lbp = lb;			/* Update caller's information. */
1438	*lbclenp = lbclen;
1439	*lblenp = lblen;
1440	return (0);
1441}
1442