1/* vi: set sw=4 ts=4: */
2/*
3 * sed.c - very minimalist version of sed
4 *
5 * Copyright (C) 1999,2000,2001 by Lineo, inc. and Mark Whitley
6 * Copyright (C) 1999,2000,2001 by Mark Whitley <markw@codepoet.org>
7 * Copyright (C) 2002  Matt Kraai
8 * Copyright (C) 2003 by Glenn McGrath <bug1@iinet.net.au>
9 * Copyright (C) 2003,2004 by Rob Landley <rob@landley.net>
10 *
11 * MAINTAINER: Rob Landley <rob@landley.net>
12 *
13 * Licensed under GPL version 2, see file LICENSE in this tarball for details.
14 */
15
16/* Code overview.
17
18  Files are laid out to avoid unnecessary function declarations.  So for
19  example, every function add_cmd calls occurs before add_cmd in this file.
20
21  add_cmd() is called on each line of sed command text (from a file or from
22  the command line).  It calls get_address() and parse_cmd_args().  The
23  resulting sed_cmd_t structures are appended to a linked list
24  (G.sed_cmd_head/G.sed_cmd_tail).
25
26  add_input_file() adds a FILE * to the list of input files.  We need to
27  know all input sources ahead of time to find the last line for the $ match.
28
29  process_files() does actual sedding, reading data lines from each input FILE *
30  (which could be stdin) and applying the sed command list (sed_cmd_head) to
31  each of the resulting lines.
32
33  sed_main() is where external code calls into this, with a command line.
34*/
35
36
37/*
38	Supported features and commands in this version of sed:
39
40	 - comments ('#')
41	 - address matching: num|/matchstr/[,num|/matchstr/|$]command
	 - commands: (p)rint, (d)elete, (s)ubstitute (with g & I flags)
43	 - edit commands: (a)ppend, (i)nsert, (c)hange
44	 - file commands: (r)ead
45	 - backreferences in substitution expressions (\0, \1, \2...\9)
46	 - grouped commands: {cmd1;cmd2}
47	 - transliteration (y/source-chars/dest-chars/)
48	 - pattern space hold space storing / swapping (g, h, x)
49	 - labels / branching (: label, b, t, T)
50
51	 (Note: Specifying an address (range) to match is *optional*; commands
52	 default to the whole pattern space if no specific address match was
53	 requested.)
54
55	Todo:
56	 - Create a wrapper around regex to make libc's regex conform with sed
57
58	Reference http://www.opengroup.org/onlinepubs/007904975/utilities/sed.html
59*/
60
61#include "libbb.h"
62#include "xregex.h"
63
/* Each sed command turns into one of these structures.
 *
 * A node holds the optional address (or address range) that selects lines,
 * the command character itself, and whatever argument data that command
 * needs.  Nodes are chained through 'next' in the order they were parsed. */
typedef struct sed_cmd_s {
	/* Ordered by alignment requirements: currently 36 bytes on x86 */
	struct sed_cmd_s *next; /* Next command (linked list, NULL terminated) */

	/* address storage */
	regex_t *beg_match;     /* sed -e '/match/cmd' */
	regex_t *end_match;     /* sed -e '/match/,/end_match/cmd' */
	regex_t *sub_match;     /* For 's/sub_match/string/'; NULL means "reuse the previous regex" */
	int beg_line;           /* 'sed 1p'   0 == apply commands to all lines, -1 = last line ($) */
	int end_line;           /* 'sed 1,3p' 0 == one line only. -1 = last line ($) */

	FILE *sw_file;          /* File the (s///w and w) commands write to, NULL for none. */
	char *string;           /* Data string for (saicytb) commands. */

	unsigned short which_match; /* (s) Which match to replace (0 for all) */

	/* Bitfields (gcc won't group them if we don't) */
	unsigned invert:1;      /* the '!' after the address */
	unsigned in_match:1;    /* Next line also included in match? */
	unsigned sub_p:1;       /* (s) print option */

	char sw_last_char;      /* Last char written to sw_file; starts as '\n', lets the
	                         * writer notice a previous line that lacked its '\n' */

	/* GENERAL FIELDS */
	char cmd;               /* The command char: abcdDgGhHilnNpPqrstwxy:={} */
} sed_cmd_t;
91
/* Characters skipped between commands on a script line; the same set also
 * terminates a label name. */
static const char semicolon_whitespace[] ALIGN1 = "; \n\r\t\v";
93
/* All of the applet's mutable state, accessed through the G macro. */
struct globals {
	/* options */
	int be_quiet, regex_type;	/* be_quiet: suppress automatic printing; regex_type: flags OR-ed into regcomp() cflags */
	FILE *nonstdout;		/* stream the edited output is written to */
	char *outname, *hold_space;	/* outname: temp file unlinked by cleanup_outname(); hold_space: the sed hold buffer (may be NULL) */

	/* List of input files */
	int input_file_count, current_input_file;
	FILE **input_file_list;

	regmatch_t regmatch[10];	/* whole match + up to 9 groups from the last regexec() in (s) */
	regex_t *previous_regex_ptr;	/* last regex used, for commands given an empty regex */

	/* linked list of sed commands */
	sed_cmd_t sed_cmd_head, *sed_cmd_tail;	/* head is a dummy node; real commands start at head.next */

	/* Linked list of append lines */
	llist_t *append_head;

	char *add_cmd_line;		/* partial script line awaiting its backslash continuation */

	/* Growable output buffer used while building a substituted line */
	struct pipeline {
		char *buf;	/* Space to hold string */
		int idx;	/* Space used */
		int len;	/* Space allocated */
	} pipeline;
};
/* G overlays struct globals on top of bb_common_bufsiz1. */
#define G (*(struct globals*)&bb_common_bufsiz1)
/* Only declared here, never defined in this file.  It is referenced solely
 * when struct globals does not fit in COMMON_BUFSIZE -- presumably so that
 * an oversized struct shows up as a build failure (TODO confirm). */
void BUG_sed_globals_too_big(void);
/* Size check plus the one non-zero initial value: the command list tail
 * starts out pointing at the dummy head node. */
#define INIT_G() do { \
	if (sizeof(struct globals) > COMMON_BUFSIZE) \
		BUG_sed_globals_too_big(); \
	G.sed_cmd_tail = &G.sed_cmd_head; \
} while (0)
128
129
130#if ENABLE_FEATURE_CLEAN_UP
131static void sed_free_and_close_stuff(void)
132{
133	sed_cmd_t *sed_cmd = G.sed_cmd_head.next;
134
135	llist_free(G.append_head, free);
136
137	while (sed_cmd) {
138		sed_cmd_t *sed_cmd_next = sed_cmd->next;
139
140		if (sed_cmd->sw_file)
141			xprint_and_close_file(sed_cmd->sw_file);
142
143		if (sed_cmd->beg_match) {
144			regfree(sed_cmd->beg_match);
145			free(sed_cmd->beg_match);
146		}
147		if (sed_cmd->end_match) {
148			regfree(sed_cmd->end_match);
149			free(sed_cmd->end_match);
150		}
151		if (sed_cmd->sub_match) {
152			regfree(sed_cmd->sub_match);
153			free(sed_cmd->sub_match);
154		}
155		free(sed_cmd->string);
156		free(sed_cmd);
157		sed_cmd = sed_cmd_next;
158	}
159
160	if (G.hold_space) free(G.hold_space);
161
162	while (G.current_input_file < G.input_file_count)
163		fclose(G.input_file_list[G.current_input_file++]);
164}
165#else
166void sed_free_and_close_stuff(void);
167#endif
168
169/* If something bad happens during -i operation, delete temp file */
170
171static void cleanup_outname(void)
172{
173	if (G.outname) unlink(G.outname);
174}
175
/*
 * Copy the first 'len' bytes of 'string' into 'dest' while processing
 * backslash escapes, then NUL-terminate 'dest'.
 *
 *   to == 0:  every backslash is dropped and the byte following it is
 *             copied verbatim ("\x" -> "x").
 *   to != 0:  the pair backslash + 'from' is replaced by the single byte
 *             'to' (e.g. "\n" -> newline, or "\/" -> "/"); every other
 *             backslash pair is copied through unchanged.
 *
 * A lone backslash that is the very last byte inside 'len' has nothing to
 * pair with: it is dropped when to == 0 and kept literally otherwise.  The
 * byte at string[len] is never examined.
 *
 * The output is never longer than the input, so 'dest' needs len + 1 bytes
 * and may be the same buffer as 'string'.
 */
static void parse_escapes(char *dest, const char *string, int len, char from, char to)
{
	int i = 0;

	while (i < len) {
		if (string[i] != '\\') {
			*dest++ = string[i++];
			continue;
		}

		if (i + 1 >= len) {
			/* trailing backslash: do not reach past the caller's length */
			if (to)
				*dest++ = '\\';
			break;
		}

		if (!to) {
			/* strip mode: keep only the escaped byte */
			*dest++ = string[i + 1];
		} else if (string[i + 1] == from) {
			*dest++ = to;
		} else {
			/* some other escape: leave it for a later stage (e.g. regcomp) */
			*dest++ = '\\';
			*dest++ = string[i + 1];
		}
		i += 2;
	}
	*dest = 0;
}
195
/*
 * Return a freshly allocated copy of the first 'len' bytes of 'string' in
 * which each "\n" escape has been turned into a real newline.  The caller
 * owns (and must free) the result.
 */
static char *copy_parsing_escapes(const char *string, int len)
{
	char *copy;

	/* escapes only ever shrink the text, so len + 1 bytes always suffice */
	copy = xmalloc(len + 1);
	parse_escapes(copy, string, len, 'n', '\n');

	return copy;
}
203
204
/*
 * index_of_next_unescaped_regexp_delim - scan 'str' from its start and
 * return the index of the first occurrence of the delimiter (typically a
 * forward slash, '/') that is neither preceded by a backslash nor located
 * inside a [...] bracket expression.
 *
 * Passing the delimiter negated (-delimiter) turns bracket tracking off.
 * Dies with an error if the end of the string is reached first.
 */
static int index_of_next_unescaped_regexp_delim(int delimiter, const char *str)
{
	/* -1: outside brackets, >= 0: index of the '[' that is currently open,
	 * -2: bracket tracking disabled */
	int open_at = -1;
	int after_backslash = 0;
	int pos;

	if (delimiter < 0) {
		open_at = -2;
		delimiter = -delimiter;
	}

	for (pos = 0; str[pos] != '\0'; pos++) {
		char c = str[pos];

		if (open_at >= 0) {
			/* a ']' right after "[" or "[^" is a literal member;
			 * any later ']' closes the bracket expression */
			int first_member = (open_at == pos - 1)
				|| (open_at == pos - 2 && str[pos - 1] == '^');

			if (c == ']' && !first_member)
				open_at = -1;
			continue;
		}
		if (after_backslash) {
			after_backslash = 0;
			continue;
		}
		if (c == '\\') {
			after_backslash = 1;
			continue;
		}
		if (open_at == -1 && c == '[') {
			open_at = pos;
			continue;
		}
		if (c == delimiter)
			return pos;
	}

	/* ran off the end of the string without finding the delimiter */
	bb_error_msg_and_die("unmatched '%c'", delimiter);
}
241
/*
 * Split the "XmatchXreplaceX" argument of an 's' or 'y' command, where X is
 * whatever character follows the command letter.
 *
 * On return *match and *replace are newly allocated strings (with "\n"
 * escapes already expanded) owned by the caller.
 *
 * Returns the index, relative to cmdstr, of the third delimiter.
 */
static int parse_regex_delim(const char *cmdstr, char **match, char **replace)
{
	const char *field;
	char delim;
	int field_len;

	/* the character right after 's'/'y' is the delimiter; it must exist */
	if (cmdstr[0] == '\0')
		bb_error_msg_and_die("bad format in substitution expression");
	delim = cmdstr[0];

	/* first field: the pattern (bracket expressions may hide the delimiter) */
	field = cmdstr + 1;
	field_len = index_of_next_unescaped_regexp_delim(delim, field);
	*match = copy_parsing_escapes(field, field_len);

	/* second field: the replacement (no bracket handling, hence -delim) */
	field += field_len + 1;
	field_len = index_of_next_unescaped_regexp_delim(-delim, field);
	*replace = copy_parsing_escapes(field, field_len);

	return (field - cmdstr) + field_len;
}
268
269/*
270 * returns the index in the string just past where the address ends.
271 */
272static int get_address(const char *my_str, int *linenum, regex_t ** regex)
273{
274	const char *pos = my_str;
275
276	if (isdigit(*my_str)) {
277		*linenum = strtol(my_str, (char**)&pos, 10);
278		/* endstr shouldnt ever equal NULL */
279	} else if (*my_str == '$') {
280		*linenum = -1;
281		pos++;
282	} else if (*my_str == '/' || *my_str == '\\') {
283		int next;
284		char delimiter;
285		char *temp;
286
287		delimiter = '/';
288		if (*my_str == '\\') delimiter = *++pos;
289		next = index_of_next_unescaped_regexp_delim(delimiter, ++pos);
290		temp = copy_parsing_escapes(pos, next);
291		*regex = xmalloc(sizeof(regex_t));
292		xregcomp(*regex, temp, G.regex_type|REG_NEWLINE);
293		free(temp);
294		/* Move position to next character after last delimiter */
295		pos += (next+1);
296	}
297	return pos - my_str;
298}
299
300/* Grab a filename.  Whitespace at start is skipped, then goes to EOL. */
301static int parse_file_cmd(sed_cmd_t *sed_cmd, const char *filecmdstr, char **retval)
302{
303	int start = 0, idx, hack = 0;
304
305	/* Skip whitespace, then grab filename to end of line */
306	while (isspace(filecmdstr[start]))
307		start++;
308	idx = start;
309	while (filecmdstr[idx] && filecmdstr[idx] != '\n')
310		idx++;
311
312	/* If lines glued together, put backslash back. */
313	if (filecmdstr[idx] == '\n')
314		hack = 1;
315	if (idx == start)
316		bb_error_msg_and_die("empty filename");
317	*retval = xstrndup(filecmdstr+start, idx-start+hack+1);
318	if (hack)
319		(*retval)[idx] = '\\';
320
321	return idx;
322}
323
324static int parse_subst_cmd(sed_cmd_t *sed_cmd, const char *substr)
325{
326	int cflags = G.regex_type;
327	char *match;
328	int idx;
329
330	/*
331	 * A substitution command should look something like this:
332	 *    s/match/replace/ #gIpw
333	 *    ||     |        |||
334	 *    mandatory       optional
335	 */
336	idx = parse_regex_delim(substr, &match, &sed_cmd->string);
337
338	/* determine the number of back references in the match string */
339	/* Note: we compute this here rather than in the do_subst_command()
340	 * function to save processor time, at the expense of a little more memory
341	 * (4 bits) per sed_cmd */
342
343	/* process the flags */
344
345	sed_cmd->which_match = 1;
346	while (substr[++idx]) {
347		/* Parse match number */
348		if (isdigit(substr[idx])) {
349			if (match[0] != '^') {
350				/* Match 0 treated as all, multiple matches we take the last one. */
351				const char *pos = substr + idx;
352				sed_cmd->which_match = (unsigned short)strtol(substr+idx, (char**) &pos, 10);
353				idx = pos - substr;
354			}
355			continue;
356		}
357		/* Skip spaces */
358		if (isspace(substr[idx])) continue;
359
360		switch (substr[idx]) {
361		/* Replace all occurrences */
362		case 'g':
363			if (match[0] != '^') sed_cmd->which_match = 0;
364			break;
365		/* Print pattern space */
366		case 'p':
367			sed_cmd->sub_p = 1;
368			break;
369		/* Write to file */
370		case 'w':
371		{
372			char *temp;
373			idx += parse_file_cmd(sed_cmd, substr+idx, &temp);
374			break;
375		}
376		/* Ignore case (gnu exension) */
377		case 'I':
378			cflags |= REG_ICASE;
379			break;
380		/* Comment */
381		case '#':
382			while (substr[++idx]) /*skip all*/;
383			/* Fall through */
384		/* End of command */
385		case ';':
386		case '}':
387			goto out;
388		default:
389			bb_error_msg_and_die("bad option in substitution expression");
390		}
391	}
392out:
393	/* compile the match string into a regex */
394	if (*match != '\0') {
395		/* If match is empty, we use last regex used at runtime */
396		sed_cmd->sub_match = xmalloc(sizeof(regex_t));
397		xregcomp(sed_cmd->sub_match, match, cflags);
398	}
399	free(match);
400
401	return idx;
402}
403
404/*
405 *  Process the commands arguments
406 */
407static const char *parse_cmd_args(sed_cmd_t *sed_cmd, const char *cmdstr)
408{
409	/* handle (s)ubstitution command */
410	if (sed_cmd->cmd == 's')
411		cmdstr += parse_subst_cmd(sed_cmd, cmdstr);
412	/* handle edit cmds: (a)ppend, (i)nsert, and (c)hange */
413	else if (strchr("aic", sed_cmd->cmd)) {
414		if ((sed_cmd->end_line || sed_cmd->end_match) && sed_cmd->cmd != 'c')
415			bb_error_msg_and_die
416				("only a beginning address can be specified for edit commands");
417		for (;;) {
418			if (*cmdstr == '\n' || *cmdstr == '\\') {
419				cmdstr++;
420				break;
421			} else if (isspace(*cmdstr))
422				cmdstr++;
423			else
424				break;
425		}
426		sed_cmd->string = xstrdup(cmdstr);
427		parse_escapes(sed_cmd->string, sed_cmd->string, strlen(cmdstr), 0, 0);
428		cmdstr += strlen(cmdstr);
429	/* handle file cmds: (r)ead */
430	} else if (strchr("rw", sed_cmd->cmd)) {
431		if (sed_cmd->end_line || sed_cmd->end_match)
432			bb_error_msg_and_die("command only uses one address");
433		cmdstr += parse_file_cmd(sed_cmd, cmdstr, &sed_cmd->string);
434		if (sed_cmd->cmd == 'w') {
435			sed_cmd->sw_file = xfopen(sed_cmd->string, "w");
436			sed_cmd->sw_last_char = '\n';
437		}
438	/* handle branch commands */
439	} else if (strchr(":btT", sed_cmd->cmd)) {
440		int length;
441
442		cmdstr = skip_whitespace(cmdstr);
443		length = strcspn(cmdstr, semicolon_whitespace);
444		if (length) {
445			sed_cmd->string = xstrndup(cmdstr, length);
446			cmdstr += length;
447		}
448	}
449	/* translation command */
450	else if (sed_cmd->cmd == 'y') {
451		char *match, *replace;
452		int i = cmdstr[0];
453
454		cmdstr += parse_regex_delim(cmdstr, &match, &replace)+1;
455		/* \n already parsed, but \delimiter needs unescaping. */
456		parse_escapes(match, match, strlen(match), i, i);
457		parse_escapes(replace, replace, strlen(replace), i, i);
458
459		sed_cmd->string = xzalloc((strlen(match) + 1) * 2);
460		for (i = 0; match[i] && replace[i]; i++) {
461			sed_cmd->string[i*2] = match[i];
462			sed_cmd->string[i*2+1] = replace[i];
463		}
464		free(match);
465		free(replace);
466	}
467	/* if it wasnt a single-letter command that takes no arguments
468	 * then it must be an invalid command.
469	 */
470	else if (strchr("dDgGhHlnNpPqx={}", sed_cmd->cmd) == 0) {
471		bb_error_msg_and_die("unsupported command %c", sed_cmd->cmd);
472	}
473
474	/* give back whatever's left over */
475	return cmdstr;
476}
477
478
479/* Parse address+command sets, skipping comment lines. */
480
481static void add_cmd(const char *cmdstr)
482{
483	sed_cmd_t *sed_cmd;
484	int temp;
485
486	/* Append this line to any unfinished line from last time. */
487	if (G.add_cmd_line) {
488		char *tp = xasprintf("%s\n%s", G.add_cmd_line, cmdstr);
489		free(G.add_cmd_line);
490		cmdstr = G.add_cmd_line = tp;
491	}
492
493	/* If this line ends with backslash, request next line. */
494	temp = strlen(cmdstr);
495	if (temp && cmdstr[--temp] == '\\') {
496		if (!G.add_cmd_line)
497			G.add_cmd_line = xstrdup(cmdstr);
498		G.add_cmd_line[temp] = '\0';
499		return;
500	}
501
502	/* Loop parsing all commands in this line. */
503	while (*cmdstr) {
504		/* Skip leading whitespace and semicolons */
505		cmdstr += strspn(cmdstr, semicolon_whitespace);
506
507		/* If no more commands, exit. */
508		if (!*cmdstr) break;
509
510		/* if this is a comment, jump past it and keep going */
511		if (*cmdstr == '#') {
512			/* "#n" is the same as using -n on the command line */
513			if (cmdstr[1] == 'n')
514				G.be_quiet++;
515			cmdstr = strpbrk(cmdstr, "\n\r");
516			if (!cmdstr) break;
517			continue;
518		}
519
520		/* parse the command
521		 * format is: [addr][,addr][!]cmd
522		 *            |----||-----||-|
523		 *            part1 part2  part3
524		 */
525
526		sed_cmd = xzalloc(sizeof(sed_cmd_t));
527
528		/* first part (if present) is an address: either a '$', a number or a /regex/ */
529		cmdstr += get_address(cmdstr, &sed_cmd->beg_line, &sed_cmd->beg_match);
530
531		/* second part (if present) will begin with a comma */
532		if (*cmdstr == ',') {
533			int idx;
534
535			cmdstr++;
536			idx = get_address(cmdstr, &sed_cmd->end_line, &sed_cmd->end_match);
537			if (!idx)
538				bb_error_msg_and_die("no address after comma");
539			cmdstr += idx;
540		}
541
542		/* skip whitespace before the command */
543		cmdstr = skip_whitespace(cmdstr);
544
545		/* Check for inversion flag */
546		if (*cmdstr == '!') {
547			sed_cmd->invert = 1;
548			cmdstr++;
549
550			/* skip whitespace before the command */
551			cmdstr = skip_whitespace(cmdstr);
552		}
553
554		/* last part (mandatory) will be a command */
555		if (!*cmdstr)
556			bb_error_msg_and_die("missing command");
557		sed_cmd->cmd = *(cmdstr++);
558		cmdstr = parse_cmd_args(sed_cmd, cmdstr);
559
560		/* Add the command to the command array */
561		G.sed_cmd_tail->next = sed_cmd;
562		G.sed_cmd_tail = G.sed_cmd_tail->next;
563	}
564
565	/* If we glued multiple lines together, free the memory. */
566	free(G.add_cmd_line);
567	G.add_cmd_line = NULL;
568}
569
570/* Append to a string, reallocating memory as necessary. */
571
572#define PIPE_GROW 64
573
574static void pipe_putc(char c)
575{
576	if (G.pipeline.idx == G.pipeline.len) {
577		G.pipeline.buf = xrealloc(G.pipeline.buf,
578				G.pipeline.len + PIPE_GROW);
579		G.pipeline.len += PIPE_GROW;
580	}
581	G.pipeline.buf[G.pipeline.idx++] = c;
582}
583
584static void do_subst_w_backrefs(char *line, char *replace)
585{
586	int i,j;
587
588	/* go through the replacement string */
589	for (i = 0; replace[i]; i++) {
590		/* if we find a backreference (\1, \2, etc.) print the backref'ed * text */
591		if (replace[i] == '\\') {
592			unsigned backref = replace[++i] - '0';
593			if (backref <= 9) {
594				/* print out the text held in G.regmatch[backref] */
595				if (G.regmatch[backref].rm_so != -1) {
596					j = G.regmatch[backref].rm_so;
597					while (j < G.regmatch[backref].rm_eo)
598						pipe_putc(line[j++]);
599				}
600				continue;
601			}
602			/* I _think_ it is impossible to get '\' to be
603			 * the last char in replace string. Thus we dont check
604			 * for replace[i] == NUL. (counterexample anyone?) */
605			/* if we find a backslash escaped character, print the character */
606			pipe_putc(replace[i]);
607			continue;
608		}
609		/* if we find an unescaped '&' print out the whole matched text. */
610		if (replace[i] == '&') {
611			j = G.regmatch[0].rm_so;
612			while (j < G.regmatch[0].rm_eo)
613				pipe_putc(line[j++]);
614			continue;
615		}
616		/* Otherwise just output the character. */
617		pipe_putc(replace[i]);
618	}
619}
620
/*
 * Apply an 's' command to *line.
 *
 * If the regex does not match at all, *line is left untouched and 0 is
 * returned.  Otherwise the result is built in the G.pipeline buffer, the
 * old *line is freed and replaced by that buffer, and the number of
 * replacements actually performed is returned (this can still be 0 when
 * the requested Nth match does not exist).
 *
 * An 's' command parsed with an empty pattern (sub_match == NULL) reuses
 * G.previous_regex_ptr and dies if there is none.
 */
static int do_subst_command(sed_cmd_t *sed_cmd, char **line)
{
	char *oldline = *line;
	int altered = 0;
	int match_count = 0;
	regex_t *current_regex;

	/* Handle empty regex. */
	if (sed_cmd->sub_match == NULL) {
		current_regex = G.previous_regex_ptr;
		if (!current_regex)
			bb_error_msg_and_die("no previous regexp");
	} else
		G.previous_regex_ptr = current_regex = sed_cmd->sub_match;

	/* Find the first match */
	if (REG_NOMATCH == regexec(current_regex, oldline, 10, G.regmatch, 0))
		return 0;

	/* Initialize temporary output buffer. */
	G.pipeline.buf = xmalloc(PIPE_GROW);
	G.pipeline.len = PIPE_GROW;
	G.pipeline.idx = 0;

	/* Now loop through, substituting for matches.
	 * oldline always points at the not-yet-copied remainder of the input,
	 * and G.regmatch offsets are relative to it.  Note that 'continue'
	 * inside a do/while jumps to the loop condition, which runs the next
	 * regexec() on the advanced oldline. */
	do {
		int i;

		/* An empty match at the current position, after some match has
		 * already been counted: copy one character and retry further on,
		 * so the scan keeps moving forward. */
		if (!G.regmatch[0].rm_so && !G.regmatch[0].rm_eo && match_count) {
			pipe_putc(*oldline++);
			continue;
		}

		match_count++;

		/* If we aren't interested in this match, output old line to
		   end of match and continue */
		if (sed_cmd->which_match && sed_cmd->which_match != match_count) {
			for (i = 0; i < G.regmatch[0].rm_eo; i++)
				pipe_putc(*oldline++);
			continue;
		}

		/* print everything before the match */
		for (i = 0; i < G.regmatch[0].rm_so; i++)
			pipe_putc(oldline[i]);

		/* then print the substitution string */
		do_subst_w_backrefs(oldline, sed_cmd->string);

		/* advance past the match */
		oldline += G.regmatch[0].rm_eo;
		/* flag that something has changed */
		altered++;

		/* if we're not doing this globally, get out now
		 * (which_match == 0 means "replace every match") */
		if (sed_cmd->which_match) break;
	} while (*oldline && (regexec(current_regex, oldline, 10, G.regmatch, 0) != REG_NOMATCH));

	/* Copy rest of string into output pipeline */

	while (*oldline)
		pipe_putc(*oldline++);
	pipe_putc(0);

	/* hand the pipeline buffer over to the caller as the new line */
	free(*line);
	*line = G.pipeline.buf;
	return altered;
}
690
691/* Set command pointer to point to this label.  (Does not handle null label.) */
692static sed_cmd_t *branch_to(char *label)
693{
694	sed_cmd_t *sed_cmd;
695
696	for (sed_cmd = G.sed_cmd_head.next; sed_cmd; sed_cmd = sed_cmd->next) {
697		if (sed_cmd->cmd == ':' && sed_cmd->string && !strcmp(sed_cmd->string, label)) {
698			return sed_cmd;
699		}
700	}
701	bb_error_msg_and_die("can't find label for jump to '%s'", label);
702}
703
704static void append(char *s)
705{
706	llist_add_to_end(&G.append_head, xstrdup(s));
707}
708
709static void flush_append(void)
710{
711	char *data;
712
713	/* Output appended lines. */
714	while ((data = (char *)llist_pop(&G.append_head))) {
715		fprintf(G.nonstdout, "%s\n", data);
716		free(data);
717	}
718}
719
720static void add_input_file(FILE *file)
721{
722	G.input_file_list = xrealloc(G.input_file_list,
723			(G.input_file_count + 1) * sizeof(FILE *));
724	G.input_file_list[G.input_file_count++] = file;
725}
726
727/* Get next line of input from G.input_file_list, flushing append buffer and
728 * noting if we ran out of files without a newline on the last line we read.
729 */
730enum {
731	NO_EOL_CHAR = 1,
732	LAST_IS_NUL = 2,
733};
734static char *get_next_line(char *gets_char)
735{
736	char *temp = NULL;
737	int len;
738	char gc;
739
740	flush_append();
741
742	/* will be returned if last line in the file
743	 * doesn't end with either '\n' or '\0' */
744	gc = NO_EOL_CHAR;
745	while (G.current_input_file < G.input_file_count) {
746		FILE *fp = G.input_file_list[G.current_input_file];
747		/* Read line up to a newline or NUL byte, inclusive,
748		 * return malloc'ed char[]. length of the chunk read
749		 * is stored in len. NULL if EOF/error */
750		temp = bb_get_chunk_from_file(fp, &len);
751		if (temp) {
752			/* len > 0 here, it's ok to do temp[len-1] */
753			char c = temp[len-1];
754			if (c == '\n' || c == '\0') {
755				temp[len-1] = '\0';
756				gc = c;
757				if (c == '\0') {
758					int ch = fgetc(fp);
759					if (ch != EOF)
760						ungetc(ch, fp);
761					else
762						gc = LAST_IS_NUL;
763				}
764			}
765			/* else we put NO_EOL_CHAR into *gets_char */
766			break;
767
768		/* NB: I had the idea of peeking next file(s) and returning
769		 * NO_EOL_CHAR only if it is the *last* non-empty
770		 * input file. But there is a case where this won't work:
771		 * file1: "a woo\nb woo"
772		 * file2: "c no\nd no"
773		 * sed -ne 's/woo/bang/p' input1 input2 => "a bang\nb bang"
774		 * (note: *no* newline after "b bang"!) */
775		}
776		/* Close this file and advance to next one */
777		fclose(fp);
778		G.current_input_file++;
779	}
780	*gets_char = gc;
781	return temp;
782}
783
784/* Output line of text. */
785/* Note:
786 * The tricks with NO_EOL_CHAR and last_puts_char are there to emulate gnu sed.
787 * Without them, we had this:
788 * echo -n thingy >z1
789 * echo -n again >z2
790 * >znull
791 * sed "s/i/z/" z1 z2 znull | hexdump -vC
792 * output:
793 * gnu sed 4.1.5:
794 * 00000000  74 68 7a 6e 67 79 0a 61  67 61 7a 6e              |thzngy.agazn|
795 * bbox:
796 * 00000000  74 68 7a 6e 67 79 61 67  61 7a 6e                 |thzngyagazn|
797 */
798static void puts_maybe_newline(char *s, FILE *file, char *last_puts_char, char last_gets_char)
799{
800	char lpc = *last_puts_char;
801
802	/* Need to insert a '\n' between two files because first file's
803	 * last line wasn't terminated? */
804	if (lpc != '\n' && lpc != '\0') {
805		fputc('\n', file);
806		lpc = '\n';
807	}
808	fputs(s, file);
809
810	/* 'x' - just something which is not '\n', '\0' or NO_EOL_CHAR */
811	if (s[0])
812		lpc = 'x';
813
814	/* had trailing '\0' and it was last char of file? */
815	if (last_gets_char == LAST_IS_NUL) {
816		fputc('\0', file);
817		lpc = 'x'; /* */
818	} else
819	/* had trailing '\n' or '\0'? */
820	if (last_gets_char != NO_EOL_CHAR) {
821		fputc(last_gets_char, file);
822		lpc = last_gets_char;
823	}
824
825	if (ferror(file)) {
826		xfunc_error_retval = 4;  /* It's what gnu sed exits with... */
827		bb_error_msg_and_die(bb_msg_write_error);
828	}
829	*last_puts_char = lpc;
830}
831
/* Write s to the main output stream with terminator code n.  Expands to a
 * use of a variable named last_puts_char, which must be in scope at the
 * point of use. */
#define sed_puts(s, n) (puts_maybe_newline(s, G.nonstdout, &last_puts_char, n))
833
834static int beg_match(sed_cmd_t *sed_cmd, const char *pattern_space)
835{
836	int retval = sed_cmd->beg_match && !regexec(sed_cmd->beg_match, pattern_space, 0, NULL, 0);
837	if (retval)
838		G.previous_regex_ptr = sed_cmd->beg_match;
839	return retval;
840}
841
842/* Process all the lines in all the files */
843
844static void process_files(void)
845{
846	char *pattern_space, *next_line;
847	int linenum = 0;
848	char last_puts_char = '\n';
849	char last_gets_char, next_gets_char;
850	sed_cmd_t *sed_cmd;
851	int substituted;
852
853	/* Prime the pump */
854	next_line = get_next_line(&next_gets_char);
855
856	/* go through every line in each file */
857again:
858	substituted = 0;
859
860	/* Advance to next line.  Stop if out of lines. */
861	pattern_space = next_line;
862	if (!pattern_space) return;
863	last_gets_char = next_gets_char;
864
865	/* Read one line in advance so we can act on the last line,
866	 * the '$' address */
867	next_line = get_next_line(&next_gets_char);
868	linenum++;
869restart:
870	/* for every line, go through all the commands */
871	for (sed_cmd = G.sed_cmd_head.next; sed_cmd; sed_cmd = sed_cmd->next) {
872		int old_matched, matched;
873
874		old_matched = sed_cmd->in_match;
875
876		/* Determine if this command matches this line: */
877
878		/* Are we continuing a previous multi-line match? */
879		sed_cmd->in_match = sed_cmd->in_match
880			/* Or is no range necessary? */
881			|| (!sed_cmd->beg_line && !sed_cmd->end_line
882				&& !sed_cmd->beg_match && !sed_cmd->end_match)
883			/* Or did we match the start of a numerical range? */
884			|| (sed_cmd->beg_line > 0 && (sed_cmd->beg_line == linenum))
885			/* Or does this line match our begin address regex? */
886			|| (beg_match(sed_cmd, pattern_space))
887			/* Or did we match last line of input? */
888			|| (sed_cmd->beg_line == -1 && next_line == NULL);
889
890		/* Snapshot the value */
891
892		matched = sed_cmd->in_match;
893
894		/* Is this line the end of the current match? */
895
896		if (matched) {
897			sed_cmd->in_match = !(
898				/* has the ending line come, or is this a single address command? */
899				(sed_cmd->end_line ?
900					sed_cmd->end_line == -1 ?
901						!next_line
902						: (sed_cmd->end_line <= linenum)
903					: !sed_cmd->end_match
904				)
905				/* or does this line matches our last address regex */
906				|| (sed_cmd->end_match && old_matched
907				     && (regexec(sed_cmd->end_match,
908				                 pattern_space, 0, NULL, 0) == 0))
909			);
910		}
911
912		/* Skip blocks of commands we didn't match. */
913		if (sed_cmd->cmd == '{') {
914			if (sed_cmd->invert ? matched : !matched) {
915				while (sed_cmd->cmd != '}') {
916					sed_cmd = sed_cmd->next;
917					if (!sed_cmd)
918						bb_error_msg_and_die("unterminated {");
919				}
920			}
921			continue;
922		}
923
924		/* Okay, so did this line match? */
925		if (sed_cmd->invert ? !matched : matched) {
926			/* Update last used regex in case a blank substitute BRE is found */
927			if (sed_cmd->beg_match) {
928				G.previous_regex_ptr = sed_cmd->beg_match;
929			}
930
931			/* actual sedding */
932			switch (sed_cmd->cmd) {
933
934			/* Print line number */
935			case '=':
936				fprintf(G.nonstdout, "%d\n", linenum);
937				break;
938
939			/* Write the current pattern space up to the first newline */
940			case 'P':
941			{
942				char *tmp = strchr(pattern_space, '\n');
943
944				if (tmp) {
945					*tmp = '\0';
946					/* TODO: explain why '\n' below */
947					sed_puts(pattern_space, '\n');
948					*tmp = '\n';
949					break;
950				}
951				/* Fall Through */
952			}
953
954			/* Write the current pattern space to output */
955			case 'p':
956				/* NB: we print this _before_ the last line
957				 * (of current file) is printed. Even if
958				 * that line is nonterminated, we print
959				 * '\n' here (gnu sed does the same) */
960				sed_puts(pattern_space, '\n');
961				break;
962			/* Delete up through first newline */
963			case 'D':
964			{
965				char *tmp = strchr(pattern_space, '\n');
966
967				if (tmp) {
968					tmp = xstrdup(tmp+1);
969					free(pattern_space);
970					pattern_space = tmp;
971					goto restart;
972				}
973			}
974			/* discard this line. */
975			case 'd':
976				goto discard_line;
977
978			/* Substitute with regex */
979			case 's':
980				if (!do_subst_command(sed_cmd, &pattern_space))
981					break;
982				substituted |= 1;
983
984				/* handle p option */
985				if (sed_cmd->sub_p)
986					sed_puts(pattern_space, last_gets_char);
987				/* handle w option */
988				if (sed_cmd->sw_file)
989					puts_maybe_newline(
990						pattern_space, sed_cmd->sw_file,
991						&sed_cmd->sw_last_char, last_gets_char);
992				break;
993
994			/* Append line to linked list to be printed later */
995			case 'a':
996				append(sed_cmd->string);
997				break;
998
999			/* Insert text before this line */
1000			case 'i':
1001				sed_puts(sed_cmd->string, '\n');
1002				break;
1003
1004			/* Cut and paste text (replace) */
1005			case 'c':
1006				/* Only triggers on last line of a matching range. */
1007				if (!sed_cmd->in_match)
1008					sed_puts(sed_cmd->string, NO_EOL_CHAR);
1009				goto discard_line;
1010
1011			/* Read file, append contents to output */
1012			case 'r':
1013			{
1014				FILE *rfile;
1015
1016				rfile = fopen(sed_cmd->string, "r");
1017				if (rfile) {
1018					char *line;
1019
1020					while ((line = xmalloc_getline(rfile))
1021							!= NULL)
1022						append(line);
1023					xprint_and_close_file(rfile);
1024				}
1025
1026				break;
1027			}
1028
1029			/* Write pattern space to file. */
1030			case 'w':
1031				puts_maybe_newline(
1032					pattern_space, sed_cmd->sw_file,
1033					&sed_cmd->sw_last_char, last_gets_char);
1034				break;
1035
1036			/* Read next line from input */
1037			case 'n':
1038				if (!G.be_quiet)
1039					sed_puts(pattern_space, last_gets_char);
1040				if (next_line) {
1041					free(pattern_space);
1042					pattern_space = next_line;
1043					last_gets_char = next_gets_char;
1044					next_line = get_next_line(&next_gets_char);
1045					linenum++;
1046					break;
1047				}
1048				/* fall through */
1049
1050			/* Quit.  End of script, end of input. */
1051			case 'q':
1052				/* Exit the outer while loop */
1053				free(next_line);
1054				next_line = NULL;
1055				goto discard_commands;
1056
1057			/* Append the next line to the current line */
1058			case 'N':
1059			{
1060				int len;
1061				/* If no next line, jump to end of script and exit. */
1062				if (next_line == NULL) {
1063					/* Jump to end of script and exit */
1064					free(next_line);
1065					next_line = NULL;
1066					goto discard_line;
1067				/* append next_line, read new next_line. */
1068				}
1069				len = strlen(pattern_space);
1070				pattern_space = realloc(pattern_space, len + strlen(next_line) + 2);
1071				pattern_space[len] = '\n';
1072				strcpy(pattern_space + len+1, next_line);
1073				last_gets_char = next_gets_char;
1074				next_line = get_next_line(&next_gets_char);
1075				linenum++;
1076				break;
1077			}
1078
1079			/* Test/branch if substitution occurred */
1080			case 't':
1081				if (!substituted) break;
1082				substituted = 0;
1083				/* Fall through */
1084			/* Test/branch if substitution didn't occur */
1085			case 'T':
1086				if (substituted) break;
1087				/* Fall through */
1088			/* Branch to label */
1089			case 'b':
1090				if (!sed_cmd->string) goto discard_commands;
1091				else sed_cmd = branch_to(sed_cmd->string);
1092				break;
1093			/* Transliterate characters */
1094			case 'y':
1095			{
1096				int i, j;
1097
1098				for (i = 0; pattern_space[i]; i++) {
1099					for (j = 0; sed_cmd->string[j]; j += 2) {
1100						if (pattern_space[i] == sed_cmd->string[j]) {
1101							pattern_space[i] = sed_cmd->string[j + 1];
1102							break;
1103						}
1104					}
1105				}
1106
1107				break;
1108			}
1109			case 'g':	/* Replace pattern space with hold space */
1110				free(pattern_space);
1111				pattern_space = xstrdup(G.hold_space ? G.hold_space : "");
1112				break;
1113			case 'G':	/* Append newline and hold space to pattern space */
1114			{
1115				int pattern_space_size = 2;
1116				int hold_space_size = 0;
1117
1118				if (pattern_space)
1119					pattern_space_size += strlen(pattern_space);
1120				if (G.hold_space)
1121					hold_space_size = strlen(G.hold_space);
1122				pattern_space = xrealloc(pattern_space,
1123						pattern_space_size + hold_space_size);
1124				if (pattern_space_size == 2)
1125					pattern_space[0] = 0;
1126				strcat(pattern_space, "\n");
1127				if (G.hold_space)
1128					strcat(pattern_space, G.hold_space);
1129				last_gets_char = '\n';
1130
1131				break;
1132			}
1133			case 'h':	/* Replace hold space with pattern space */
1134				free(G.hold_space);
1135				G.hold_space = xstrdup(pattern_space);
1136				break;
1137			case 'H':	/* Append newline and pattern space to hold space */
1138			{
1139				int hold_space_size = 2;
1140				int pattern_space_size = 0;
1141
1142				if (G.hold_space)
1143					hold_space_size += strlen(G.hold_space);
1144				if (pattern_space)
1145					pattern_space_size = strlen(pattern_space);
1146				G.hold_space = xrealloc(G.hold_space,
1147						hold_space_size + pattern_space_size);
1148
1149				if (hold_space_size == 2)
1150					*G.hold_space = 0;
1151				strcat(G.hold_space, "\n");
1152				if (pattern_space)
1153					strcat(G.hold_space, pattern_space);
1154
1155				break;
1156			}
1157			case 'x': /* Exchange hold and pattern space */
1158			{
1159				char *tmp = pattern_space;
1160				pattern_space = G.hold_space ? : xzalloc(1);
1161				last_gets_char = '\n';
1162				G.hold_space = tmp;
1163				break;
1164			}
1165			}
1166		}
1167	}
1168
1169	/*
1170	 * exit point from sedding...
1171	 */
1172 discard_commands:
1173	/* we will print the line unless we were told to be quiet ('-n')
1174	   or if the line was suppressed (ala 'd'elete) */
1175	if (!G.be_quiet)
1176		sed_puts(pattern_space, last_gets_char);
1177
1178	/* Delete and such jump here. */
1179 discard_line:
1180	flush_append();
1181	free(pattern_space);
1182
1183	goto again;
1184}
1185
1186/* It is possible to have a command line argument with embedded
1187 * newlines.  This counts as multiple command lines.
1188 * However, newline can be escaped: 's/e/z\<newline>z/'
1189 * We check for this.
1190 */
1191
/* Split cmdstr at unescaped newlines and hand each piece to add_cmd().
 *
 * cmdstr: NUL-terminated command text, possibly containing several lines.
 *         All editing is done on a private copy (made with xstrdup and
 *         freed before returning), so the caller's buffer is left alone.
 *
 * A newline preceded by an odd number of backslashes is escaped: the
 * backslash directly in front of it is removed and the newline remains
 * part of the current piece.
 */
static void add_cmd_block(char *cmdstr)
{
	char *sv, *eol;

	cmdstr = sv = xstrdup(cmdstr);
	for (;;) {
		eol = strchr(cmdstr, '\n');
		while (eol) {
			/* Count backslashes immediately preceding the newline */
			int slashes = 0;
			char *sl = eol;

			while (sl != cmdstr && *--sl == '\\')
				slashes++;
			/* Even number of backslashes - this newline ends the piece */
			if (!(slashes & 1))
				break;
			/* Odd number - newline is escaped.  Shift the tail (newline,
			 * rest of the text and the NUL) one byte left over the
			 * escaping backslash.  Source and destination overlap, so
			 * memmove is required; strcpy would be undefined here. */
			memmove(eol - 1, eol, strlen(eol) + 1);
			/* The kept newline now sits at eol[-1]; search after it */
			eol = strchr(eol, '\n');
		}
		if (eol)
			*eol = '\0';
		add_cmd(cmdstr);
		/* No newline left: that was the final piece.  Stop before
		 * advancing so we never do arithmetic on a NULL pointer. */
		if (!eol)
			break;
		cmdstr = eol + 1;
	}
	free(sv);
}
1220
1221int sed_main(int argc, char **argv);
1222int sed_main(int argc, char **argv)
1223{
1224	enum {
1225		OPT_in_place = 1 << 0,
1226	};
1227	unsigned opt;
1228	llist_t *opt_e, *opt_f;
1229	int status = EXIT_SUCCESS;
1230
1231	INIT_G();
1232
1233	/* destroy command strings on exit */
1234	if (ENABLE_FEATURE_CLEAN_UP) atexit(sed_free_and_close_stuff);
1235
1236	/* Lie to autoconf when it starts asking stupid questions. */
1237	if (argc == 2 && !strcmp(argv[1], "--version")) {
1238		puts("This is not GNU sed version 4.0");
1239		return 0;
1240	}
1241
1242	/* do normal option parsing */
1243	opt_e = opt_f = NULL;
1244	opt_complementary = "e::f::" /* can occur multiple times */
1245	                    "nn"; /* count -n */
1246	opt = getopt32(argv, "irne:f:", &opt_e, &opt_f,
1247			    &G.be_quiet); /* counter for -n */
1248	argc -= optind;
1249	argv += optind;
1250	if (opt & OPT_in_place) { // -i
1251		atexit(cleanup_outname);
1252	}
1253	if (opt & 0x2) G.regex_type |= REG_EXTENDED; // -r
1254	//if (opt & 0x4) G.be_quiet++; // -n
1255	while (opt_e) { // -e
1256		add_cmd_block(opt_e->data);
1257		opt_e = opt_e->link;
1258		/* we leak opt_e here... */
1259	}
1260	while (opt_f) { // -f
1261		char *line;
1262		FILE *cmdfile;
1263		cmdfile = xfopen(opt_f->data, "r");
1264		while ((line = xmalloc_getline(cmdfile)) != NULL) {
1265			add_cmd(line);
1266			free(line);
1267		}
1268		fclose(cmdfile);
1269		opt_f = opt_f->link;
1270		/* we leak opt_f here... */
1271	}
1272	/* if we didn't get a pattern from -e or -f, use argv[0] */
1273	if (!(opt & 0x18)) {
1274		if (!argc)
1275			bb_show_usage();
1276		add_cmd_block(*argv++);
1277		argc--;
1278	}
1279	/* Flush any unfinished commands. */
1280	add_cmd("");
1281
1282	/* By default, we write to stdout */
1283	G.nonstdout = stdout;
1284
1285	/* argv[0..(argc-1)] should be names of file to process. If no
1286	 * files were specified or '-' was specified, take input from stdin.
1287	 * Otherwise, we process all the files specified. */
1288	if (argv[0] == NULL) {
1289		if (opt & OPT_in_place)
1290			bb_error_msg_and_die(bb_msg_requires_arg, "-i");
1291		add_input_file(stdin);
1292		process_files();
1293	} else {
1294		int i;
1295		FILE *file;
1296
1297		for (i = 0; i < argc; i++) {
1298			struct stat statbuf;
1299			int nonstdoutfd;
1300
1301			if (LONE_DASH(argv[i]) && !(opt & OPT_in_place)) {
1302				add_input_file(stdin);
1303				process_files();
1304				continue;
1305			}
1306			file = fopen_or_warn(argv[i], "r");
1307			if (!file) {
1308				status = EXIT_FAILURE;
1309				continue;
1310			}
1311			if (!(opt & OPT_in_place)) {
1312				add_input_file(file);
1313				continue;
1314			}
1315
1316			G.outname = xasprintf("%sXXXXXX", argv[i]);
1317			nonstdoutfd = mkstemp(G.outname);
1318			if (-1 == nonstdoutfd)
1319				bb_perror_msg_and_die("cannot create temp file %s", G.outname);
1320			G.nonstdout = fdopen(nonstdoutfd, "w");
1321
1322			/* Set permissions of output file */
1323
1324			fstat(fileno(file), &statbuf);
1325			fchmod(nonstdoutfd, statbuf.st_mode);
1326			add_input_file(file);
1327			process_files();
1328			fclose(G.nonstdout);
1329
1330			G.nonstdout = stdout;
1331			/* unlink(argv[i]); */
1332			rename(G.outname, argv[i]);
1333			free(G.outname);
1334			G.outname = 0;
1335		}
1336		if (G.input_file_count > G.current_input_file)
1337			process_files();
1338	}
1339
1340	return status;
1341}
1342