split.c revision 272345
1/*	$NetBSD: split.c,v 1.1 2011/01/08 18:10:31 pgoyette Exp $	*/
2
3/*-
4 * Copyright (c) 1993 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
#include <regex.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "test_regex.h"
34
/*
 * split - divide a string into fields, like awk split()
 *
 * The string is cut up in place: a separator that ends a stored field is
 * overwritten with '\0', and fields[] receives pointers into string.
 *
 * returns number of fields, including overflow; an empty string (or, in
 * white-space mode, one holding only blanks and tabs) yields 0
 *
 * string	text to split; modified in place
 * fields[]	list is not NULL-terminated
 * nfields	number of entries available in fields[]; if <= 0 nothing
 *		is stored in fields[] and the fields are only counted
 * sep		"" white, "c" single char, "ab" [ab]+
 *
 * Separator modes:
 *	""	fields are separated by runs of blanks and tabs; leading and
 *		trailing white space is ignored
 *	"c"	every occurrence of c ends a field, so adjacent separators
 *		produce empty fields
 *	"ab..."	any run of the listed characters is one separator
 *
 * When there are more fields than nfields, the last stored entry points at
 * the rest of the string, which is left unsplit.
 */
int
split(char *string, char *fields[], int nfields, const char *sep)
{
	char *p = string;
	char c;			/* latest character */
	char sepc = *sep;
	char sepc2;
	int fn;
	char **fp = fields;
	char *spare;		/* scratch slot used when only counting */
	const char *sepp;
	int trimtrail;

	/*
	 * The one- and two-separator loops below store the first field
	 * before looking at the remaining capacity.  With no capacity at
	 * all, aim that store at a local slot so the caller's vector is
	 * never written; the returned count does not depend on nfields.
	 */
	if (nfields <= 0) {
		fp = &spare;
		nfields = 1;
	}

	/* white space */
	if (sepc == '\0') {
		/* skip leading blanks and tabs, then back up to the first keeper */
		while ((c = *p++) == ' ' || c == '\t')
			continue;
		p--;
		trimtrail = 1;
		sep = " \t";	/* note, code below knows this is 2 long */
		sepc = ' ';
	} else
		trimtrail = 0;
	sepc2 = sep[1];		/* now we can safely pick this up */

	/* catch empties */
	if (*p == '\0')
		return(0);

	/* single separator */
	if (sepc2 == '\0') {
		/* fn counts the slots still free */
		fn = nfields;
		for (;;) {
			*fp++ = p;
			fn--;
			if (fn == 0)
				break;	/* last slot keeps the unsplit rest */
			while ((c = *p++) != sepc)
				if (c == '\0')
					return(nfields - fn);
			*(p-1) = '\0';	/* terminate the field just scanned */
		}
		/* we have overflowed the fields vector -- just count them */
		fn = nfields;
		for (;;) {
			while ((c = *p++) != sepc)
				if (c == '\0')
					return(fn);
			fn++;
		}
		/* not reached */
	}

	/* two separators */
	if (sep[2] == '\0') {
		/* fn counts the slots still free */
		fn = nfields;
		for (;;) {
			*fp++ = p;
			fn--;
			while ((c = *p++) != sepc && c != sepc2)
				if (c == '\0') {
					/*
					 * In white-space mode an empty final
					 * field is just trailing white space:
					 * do not count it.
					 */
					if (trimtrail && **(fp-1) == '\0')
						fn++;
					return(nfields - fn);
				}
			if (fn == 0)
				break;	/* last slot keeps the unsplit rest */
			*(p-1) = '\0';
			/* swallow the rest of the separator run */
			while ((c = *p++) == sepc || c == sepc2)
				continue;
			p--;
		}
		/* we have overflowed the fields vector -- just count them */
		fn = nfields;
		/* c still holds the separator that ended the last stored field */
		while (c != '\0') {
			while ((c = *p++) == sepc || c == sepc2)
				continue;
			p--;
			fn++;
			while ((c = *p++) != '\0' && c != sepc && c != sepc2)
				continue;
		}
		/* might have to trim trailing white space */
		if (trimtrail) {
			/* p is one past the terminator; walk back over the blanks */
			p--;
			while ((c = *--p) == sepc || c == sepc2)
				continue;
			p++;
			if (*p != '\0') {
				/*
				 * There was trailing white space, so the last
				 * field counted was empty.  If it was the only
				 * overflow field, the white space belongs to
				 * the last stored field: cut it off there.
				 */
				if (fn == nfields+1)
					*p = '\0';
				fn--;
			}
		}
		return(fn);
	}

	/* n separators */
	fn = 0;			/* here fn counts fields found so far */
	for (;;) {
		if (fn < nfields)
			*fp++ = p;
		fn++;
		/* scan to the end of this field */
		for (;;) {
			c = *p++;
			if (c == '\0')
				return(fn);
			sepp = sep;
			while ((sepc = *sepp++) != '\0' && sepc != c)
				continue;
			if (sepc != '\0')	/* it was a separator */
				break;
		}
		/* only terminate fields that are not the last stored one */
		if (fn < nfields)
			*(p-1) = '\0';
		/* skip the rest of the separator run */
		for (;;) {
			c = *p++;
			sepp = sep;
			while ((sepc = *sepp++) != '\0' && sepc != c)
				continue;
			if (sepc == '\0')	/* it wasn't a separator */
				break;
		}
		p--;
	}

	/* not reached */
}
171
172#ifdef TEST_SPLIT
173
174
/*
 * test program
 * pgm		runs regression
 * pgm sep	splits stdin lines by sep
 * pgm str sep	splits str by sep
 * pgm str sep n	splits str by sep n times
 * pgm str sep n x	only copies str n times, never calling split()
 *			(presumably a baseline for timing -- confirm)
 *
 * str is copied into a fixed 512-byte buffer for the repeated forms and is
 * truncated if it does not fit.
 */

/* defined further down in this file, used here first */
void dosplit(char *string, char *seps);
void regress(void);

int
main(int argc, char *argv[])
{
	char buf[512];
	int n;
#	define	MNF	10
	char *fields[MNF];

	if (argc > 4)
		for (n = atoi(argv[3]); n > 0; n--) {
			/* bounded copy: a long argument is truncated, not overrun */
			(void) snprintf(buf, sizeof(buf), "%s", argv[1]);
		}
	else if (argc > 3)
		for (n = atoi(argv[3]); n > 0; n--) {
			/* split() edits its input, so start from a fresh copy */
			(void) snprintf(buf, sizeof(buf), "%s", argv[1]);
			(void) split(buf, fields, MNF, argv[2]);
		}
	else if (argc > 2)
		dosplit(argv[1], argv[2]);
	else if (argc > 1)
		while (fgets(buf, sizeof(buf), stdin) != NULL) {
			/*
			 * Stomp the newline if there is one.  A final line
			 * without a newline, or a line longer than the
			 * buffer, is left intact.
			 */
			buf[strcspn(buf, "\n")] = '\0';
			dosplit(buf, argv[1]);
		}
	else
		regress();

	exit(0);
}
211
/*
 * dosplit - split one string and print the outcome
 *
 * string	text to split; split() modifies it in place
 * seps		separator specification handed to split() unchanged
 *
 * At most NF field pointers are collected; the count returned by split()
 * is passed on to print() together with that capacity.
 */
void
dosplit(char *string, char *seps)
{
#	define	NF	5
	char *slots[NF];
	int found = split(string, slots, NF, seps);

	print(found, NF, slots);
}
222
/*
 * print - show the result of one split() call on stdout
 *
 * nf		number of fields split() reported
 * nfp		number of entries that were available in fields[]
 * fields[]	the field pointers; only the first min(nf, nfp) are read
 *
 * Output is the count, a tab, then each field in double quotes separated
 * by ", ".  The newline is emitted after field number nf, so when nf is 0
 * or larger than nfp the line is left unterminated.
 */
void
print(int nf, int nfp, char *fields[])
{
	int fn;
	int bound;

	/* never read past what split() was allowed to store */
	bound = (nf > nfp) ? nfp : nf;
	printf("%d:\t", nf);
	for (fn = 0; fn < bound; fn++)
		printf("\"%s\"%s", fields[fn], (fn+1 < nf) ? ", " : "\n");
}
234
/*
 * Regression table: each row gives an input string, a separator
 * specification, the field count split() should return, and the expected
 * contents of the first RNF fields.  A row with a NULL str ends the table.
 */
#define	RNF	5		/* some table entries know this */
struct {
	char *str;
	char *seps;
	int nf;
	char *fi[RNF];
} tests[] = {
	/* one separator character: every occurrence ends a field */
	{ "",			" ",	0,	{ "" } },
	{ " ",			" ",	2,	{ "", "" } },
	{ "x",			" ",	1,	{ "x" } },
	{ "xy",			" ",	1,	{ "xy" } },
	{ "x y",		" ",	2,	{ "x", "y" } },
	{ "abc def  g ",	" ",	5,	{ "abc", "def", "", "g", "" } },
	{ "  a bcd",		" ",	4,	{ "", "", "a", "bcd" } },
	{ "a b c d e f",	" ",	6,	{ "a", "b", "c", "d", "e f" } },
	{ " a b c d ",		" ",	6,	{ "", "a", "b", "c", "d " } },

	/* two separator characters: runs collapse into one separator */
	{ "",			" _",	0,	{ "" } },
	{ " ",			" _",	2,	{ "", "" } },
	{ "x",			" _",	1,	{ "x" } },
	{ "x y",		" _",	2,	{ "x", "y" } },
	{ "ab _ cd",		" _",	2,	{ "ab", "cd" } },
	{ " a_b  c ",		" _",	5,	{ "", "a", "b", "c", "" } },
	{ "a b c_d e f",	" _",	6,	{ "a", "b", "c", "d", "e f" } },
	{ " a b c d ",		" _",	6,	{ "", "a", "b", "c", "d " } },

	/* three separator characters */
	{ "",			" _~",	0,	{ "" } },
	{ " ",			" _~",	2,	{ "", "" } },
	{ "x",			" _~",	1,	{ "x" } },
	{ "x y",		" _~",	2,	{ "x", "y" } },
	{ "ab _~ cd",		" _~",	2,	{ "ab", "cd" } },
	{ " a_b  c~",		" _~",	5,	{ "", "a", "b", "c", "" } },
	{ "a b_c d~e f",	" _~",	6,	{ "a", "b", "c", "d", "e f" } },
	{ "~a b c d ",		" _~",	6,	{ "", "a", "b", "c", "d " } },

	/* four separator characters */
	{ "",			" _~-",	0,	{ "" } },
	{ " ",			" _~-",	2,	{ "", "" } },
	{ "x",			" _~-",	1,	{ "x" } },
	{ "x y",		" _~-",	2,	{ "x", "y" } },
	{ "ab _~- cd",		" _~-",	2,	{ "ab", "cd" } },
	{ " a_b  c~",		" _~-",	5,	{ "", "a", "b", "c", "" } },
	{ "a b_c-d~e f",	" _~-",	6,	{ "a", "b", "c", "d", "e f" } },
	{ "~a-b c d ",		" _~-",	6,	{ "", "a", "b", "c", "d " } },

	/* the same character twice: still a two-character specification */
	{ "",			"  ",	0,	{ "" } },
	{ " ",			"  ",	2,	{ "", "" } },
	{ "x",			"  ",	1,	{ "x" } },
	{ "xy",			"  ",	1,	{ "xy" } },
	{ "x y",		"  ",	2,	{ "x", "y" } },
	{ "abc def  g ",	"  ",	4,	{ "abc", "def", "g", "" } },
	{ "  a bcd",		"  ",	3,	{ "", "a", "bcd" } },
	{ "a b c d e f",	"  ",	6,	{ "a", "b", "c", "d", "e f" } },
	{ " a b c d ",		"  ",	6,	{ "", "a", "b", "c", "d " } },

	/* empty specification: white-space mode */
	{ "",			"",	0,	{ "" } },
	{ " ",			"",	0,	{ "" } },
	{ "x",			"",	1,	{ "x" } },
	{ "xy",			"",	1,	{ "xy" } },
	{ "x y",		"",	2,	{ "x", "y" } },
	{ "abc def  g ",	"",	3,	{ "abc", "def", "g" } },
	{ "\t a bcd",		"",	2,	{ "a", "bcd" } },
	{ "  a \tb\t c ",	"",	3,	{ "a", "b", "c" } },
	{ "a b c d e ",		"",	5,	{ "a", "b", "c", "d", "e" } },
	{ "a b\tc d e f",	"",	6,	{ "a", "b", "c", "d", "e f" } },
	{ " a b c d e f ",	"",	6,	{ "a", "b", "c", "d", "e f " } },

	/* end marker */
	{ NULL,			NULL,	0,	{ NULL } },
};
303
304void
305regress(void)
306{
307	char buf[512];
308	int n;
309	char *fields[RNF+1];
310	int nf;
311	int i;
312	int printit;
313	char *f;
314
315	for (n = 0; tests[n].str != NULL; n++) {
316		(void) strcpy(buf, tests[n].str);
317		fields[RNF] = NULL;
318		nf = split(buf, fields, RNF, tests[n].seps);
319		printit = 0;
320		if (nf != tests[n].nf) {
321			printf("split `%s' by `%s' gave %d fields, not %d\n",
322				tests[n].str, tests[n].seps, nf, tests[n].nf);
323			printit = 1;
324		} else if (fields[RNF] != NULL) {
325			printf("split() went beyond array end\n");
326			printit = 1;
327		} else {
328			for (i = 0; i < nf && i < RNF; i++) {
329				f = fields[i];
330				if (f == NULL)
331					f = "(NULL)";
332				if (strcmp(f, tests[n].fi[i]) != 0) {
333					printf("split `%s' by `%s', field %d is `%s', not `%s'\n",
334						tests[n].str, tests[n].seps,
335						i, fields[i], tests[n].fi[i]);
336					printit = 1;
337				}
338			}
339		}
340		if (printit)
341			print(nf, RNF, fields);
342	}
343}
344#endif
345