tokenizer.c revision 84201
1/*-
2 * Copyright (c) 1992, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * Christos Zoulas of Cornell University.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 3. All advertising materials mentioning features or use of this software
17 *    must display the following acknowledgement:
18 *	This product includes software developed by the University of
19 *	California, Berkeley and its contributors.
20 * 4. Neither the name of the University nor the names of its contributors
21 *    may be used to endorse or promote products derived from this software
22 *    without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 */
36
37#include <sys/cdefs.h>
38__FBSDID("$FreeBSD: head/lib/libedit/tokenizer.c 84201 2001-09-30 21:21:36Z dillon $");
39#if !defined(lint) && !defined(SCCSID)
40static char sccsid[] = "@(#)tokenizer.c	8.1 (Berkeley) 6/4/93";
41#endif /* not lint && not SCCSID */
42
43/*
 * tokenizer.c: Bourne shell like tokenizer
45 */
46#include "sys.h"
47#include <string.h>
48#include <stdlib.h>
49#include "tokenizer.h"
50
/*
 * Quoting state of the scanner
 */
typedef enum {
    Q_none,		/* Not inside any quote			*/
    Q_single,		/* Inside '...'				*/
    Q_double,		/* Inside "..."				*/
    Q_one,		/* Backslash seen outside quotes	*/
    Q_doubleone		/* Backslash seen inside "..."		*/
} quote_t;
52
/* Field separators used when tok_init() is not given any */
#define IFS		"\t \n"

/* Bits stored in the tokenizer's flags word */
#define TOK_KEEP	0x1	/* Emit the current word even if it is empty */
#define TOK_EAT		0x2	/* A backslash-newline pair was just swallowed */

/* Growth steps: bytes of word buffer, slots of argument vector */
#define WINCR		20
#define AINCR		10

/* Allocator hooks; every allocation in this file goes through these */
#define tok_malloc(sz)		malloc(sz)
#define tok_free(p)		free(p)
#define tok_realloc(p, sz)	realloc(p, sz)
#define tok_reallocf(p, sz)	reallocf(p, sz)
65
66
67struct tokenizer {
68    char   *ifs;		/* In field separator			*/
69    int     argc, amax;		/* Current and maximum number of args	*/
70    char  **argv;		/* Argument list			*/
71    char   *wptr, *wmax;	/* Space and limit on the word buffer	*/
72    char   *wstart;		/* Beginning of next word		*/
73    char   *wspace;		/* Space of word buffer			*/
74    quote_t quote;		/* Quoting state			*/
75    int	    flags;		/* flags;				*/
76};
77
78
79private void tok_finish	__P((Tokenizer *));
80
81
82/* tok_finish():
83 *	Finish a word in the tokenizer.
84 */
85private void
86tok_finish(tok)
87    Tokenizer *tok;
88{
89    *tok->wptr = '\0';
90    if ((tok->flags & TOK_KEEP) || tok->wptr != tok->wstart) {
91	tok->argv[tok->argc++] = tok->wstart;
92	tok->argv[tok->argc] = NULL;
93	tok->wstart = ++tok->wptr;
94    }
95    tok->flags &= ~TOK_KEEP;
96}
97
98
99/* tok_init():
100 *	Initialize the tokenizer
101 */
102public Tokenizer *
103tok_init(ifs)
104    const char *ifs;
105{
106    Tokenizer* tok = (Tokenizer*) tok_malloc(sizeof(Tokenizer));
107
108    tok->ifs     = strdup(ifs ? ifs : IFS);
109    tok->argc    = 0;
110    tok->amax    = AINCR;
111    tok->argv    = (char **) tok_malloc(sizeof(char *) * tok->amax);
112    tok->argv[0] = NULL;
113    tok->wspace  = (char *) tok_malloc(WINCR);
114    tok->wmax    = tok->wspace + WINCR;
115    tok->wstart  = tok->wspace;
116    tok->wptr    = tok->wspace;
117    tok->flags   = 0;
118    tok->quote   = Q_none;
119
120    return tok;
121}
122
123
124/* tok_reset():
125 *	Reset the tokenizer
126 */
127public void
128tok_reset(tok)
129    Tokenizer *tok;
130{
131    tok->argc  = 0;
132    tok->wstart = tok->wspace;
133    tok->wptr = tok->wspace;
134    tok->flags = 0;
135    tok->quote = Q_none;
136}
137
138
139/* tok_end():
140 *	Clean up
141 */
142public void
143tok_end(tok)
144    Tokenizer *tok;
145{
146    tok_free((ptr_t) tok->ifs);
147    tok_free((ptr_t) tok->wspace);
148    tok_free((ptr_t) tok->argv);
149    tok_free((ptr_t) tok);
150}
151
152
153
154/* tok_line():
155 *	Bourne shell like tokenizing
156 *	Return:
157 *		-1: Internal error
158 *		 3: Quoted return
159 *		 2: Unmatched double quote
160 *		 1: Unmatched single quote
161 *		 0: Ok
162 */
163public int
164tok_line(tok, line, argc, argv)
165    Tokenizer *tok;
166    const char* line;
167    int *argc;
168    char ***argv;
169{
170    const char *ptr;
171
172    while (1) {
173	switch (*(ptr = line++)) {
174	case '\'':
175	    tok->flags |= TOK_KEEP;
176	    tok->flags &= ~TOK_EAT;
177	    switch (tok->quote) {
178	    case Q_none:
179		tok->quote = Q_single;	/* Enter single quote mode */
180		break;
181
182	    case Q_single:		/* Exit single quote mode */
183		tok->quote = Q_none;
184		break;
185
186	    case Q_one:			/* Quote this ' */
187		tok->quote = Q_none;
188		*tok->wptr++ = *ptr;
189		break;
190
191	    case Q_double:		/* Stay in double quote mode */
192		*tok->wptr++ = *ptr;
193		break;
194
195	    case Q_doubleone:		/* Quote this ' */
196		tok->quote = Q_double;
197		*tok->wptr++ = *ptr;
198		break;
199
200	    default:
201		return(-1);
202	    }
203	    break;
204
205	case '"':
206	    tok->flags &= ~TOK_EAT;
207	    tok->flags |= TOK_KEEP;
208	    switch (tok->quote) {
209	    case Q_none:		/* Enter double quote mode */
210		tok->quote = Q_double;
211		break;
212
213	    case Q_double:
214		tok->quote = Q_none;	/* Exit double quote mode */
215		break;
216
217	    case Q_one:			/* Quote this " */
218		tok->quote = Q_none;
219		*tok->wptr++ = *ptr;
220		break;
221
222	    case Q_single:		/* Stay in single quote mode */
223		*tok->wptr++ = *ptr;
224		break;
225
226	    case Q_doubleone:		/* Quote this " */
227		tok->quote = Q_double;
228		*tok->wptr++ = *ptr;
229		break;
230
231	    default:
232		return(-1);
233	    }
234	    break;
235
236	case '\\':
237	    tok->flags |= TOK_KEEP;
238	    tok->flags &= ~TOK_EAT;
239	    switch (tok->quote) {
240	    case Q_none:		/* Quote next character */
241		tok->quote = Q_one;
242		break;
243
244	    case Q_double:
245		tok->quote = Q_doubleone;/* Quote next character */
246		break;
247
248	    case Q_one:
249		*tok->wptr++ = *ptr;
250		tok->quote = Q_none;	/* Quote this, restore state */
251		break;
252
253	    case Q_single:		/* Stay in single quote mode */
254		*tok->wptr++ = *ptr;
255		break;
256
257	    case Q_doubleone:		/* Quote this \ */
258		tok->quote = Q_double;
259		*tok->wptr++ = *ptr;
260		break;
261
262	    default:
263		return(-1);
264	    }
265	    break;
266
267	case '\n':
268	    tok->flags &= ~TOK_EAT;
269	    switch (tok->quote) {
270	    case Q_none:
271		tok_finish(tok);
272		*argv = tok->argv;
273		*argc = tok->argc;
274		return(0);
275
276	    case Q_single:
277	    case Q_double:
278		*tok->wptr++ = *ptr;	/* Add the return		*/
279		break;
280
281	    case Q_doubleone:
282		tok->flags |= TOK_EAT;
283		tok->quote = Q_double;	/* Back to double, eat the '\n' */
284		break;
285
286	    case Q_one:
287		tok->flags |= TOK_EAT;
288		tok->quote = Q_none;	/* No quote, more eat the '\n' */
289		break;
290
291	    default:
292		return(0);
293	    }
294	    break;
295
296	case '\0':
297	    switch (tok->quote) {
298	    case Q_none:
299		/* Finish word and return */
300		if (tok->flags & TOK_EAT) {
301		    tok->flags &= ~TOK_EAT;
302		    return 3;
303		}
304		tok_finish(tok);
305		*argv = tok->argv;
306		*argc = tok->argc;
307		return(0);
308
309	    case Q_single:
310		return(1);
311
312	    case Q_double:
313		return(2);
314
315	    case Q_doubleone:
316		tok->quote = Q_double;
317		*tok->wptr++ = *ptr;
318		break;
319
320	    case Q_one:
321		tok->quote = Q_none;
322		*tok->wptr++ = *ptr;
323		break;
324
325	    default:
326		return(-1);
327	    }
328	    break;
329
330	default:
331	    tok->flags &= ~TOK_EAT;
332	    switch (tok->quote) {
333	    case Q_none:
334		if (strchr(tok->ifs, *ptr) != NULL)
335		    tok_finish(tok);
336		else
337		    *tok->wptr++ = *ptr;
338		break;
339
340	    case Q_single:
341	    case Q_double:
342		*tok->wptr++ = *ptr;
343		break;
344
345
346	    case Q_doubleone:
347		*tok->wptr++ = '\\';
348		tok->quote = Q_double;
349		*tok->wptr++ = *ptr;
350		break;
351
352	    case Q_one:
353		tok->quote = Q_none;
354		*tok->wptr++ = *ptr;
355		break;
356
357	    default:
358		return(-1);
359
360	    }
361	    break;
362	}
363
364	if (tok->wptr >= tok->wmax - 4) {
365	    size_t size = tok->wmax - tok->wspace + WINCR;
366	    char *s = (char *) tok_realloc(tok->wspace, size);
367	    /*SUPPRESS 22*/
368	    int offs = s - tok->wspace;
369
370	    if (offs != 0) {
371		int i;
372		for (i = 0; i < tok->argc; i++)
373		    tok->argv[i] = tok->argv[i] + offs;
374		tok->wptr   = tok->wptr + offs;
375		tok->wstart = tok->wstart + offs;
376		tok->wmax   = s + size;
377		tok->wspace = s;
378	    }
379	}
380
381	if (tok->argc >= tok->amax - 4) {
382	    tok->amax += AINCR;
383	    tok->argv = (char **) tok_reallocf(tok->argv,
384					       tok->amax * sizeof(char*));
385	}
386
387    }
388}
389