tokenizer.c revision 1573
1/*-
2 * Copyright (c) 1992, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * Christos Zoulas of Cornell University.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 3. All advertising materials mentioning features or use of this software
17 *    must display the following acknowledgement:
18 *	This product includes software developed by the University of
19 *	California, Berkeley and its contributors.
20 * 4. Neither the name of the University nor the names of its contributors
21 *    may be used to endorse or promote products derived from this software
22 *    without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 */
36
/*
 * SCCS identification string; left out when linting or when the build
 * defines SCCSID.
 */
#if !(defined(lint) || defined(SCCSID))
static char sccsid[] = "@(#)tokenizer.c	8.1 (Berkeley) 6/4/93";
#endif /* !(lint || SCCSID) */
40
/*
 * tokenizer.c: Bourne shell-like tokenizer
 */
44#include "sys.h"
45#include <string.h>
46#include <stdlib.h>
47#include "tokenizer.h"
48
/* Quoting state of the scanner. */
typedef enum {
    Q_none,		/* not inside any quoting			*/
    Q_single,		/* inside '...'					*/
    Q_double,		/* inside "..."					*/
    Q_one,		/* backslash seen outside quotes		*/
    Q_doubleone		/* backslash seen inside "..."			*/
} quote_t;

/* Separator set used when tok_init() is given a NULL ifs. */
#define IFS "\t \n"

/* Bits stored in the tokenizer's flags member. */
#define TOK_KEEP	0x1	/* emit the current word even if it is empty */
#define TOK_EAT		0x2	/* an escaped newline was just swallowed     */

/* Growth steps, in bytes for the word buffer and slots for argv. */
#define WINCR 20
#define AINCR 10

/* Allocator hooks; mapped straight onto the C library. */
#define tok_malloc(a)		malloc(a)
#define tok_free(a)		free(a)
#define tok_realloc(a, b)	realloc(a, b)
62
63
64struct tokenizer {
65    char   *ifs;		/* In field separator			*/
66    int     argc, amax;		/* Current and maximum number of args	*/
67    char  **argv;		/* Argument list			*/
68    char   *wptr, *wmax;	/* Space and limit on the word buffer	*/
69    char   *wstart;		/* Beginning of next word		*/
70    char   *wspace;		/* Space of word buffer			*/
71    quote_t quote;		/* Quoting state			*/
72    int	    flags;		/* flags;				*/
73};
74
75
76private void tok_finish	__P((Tokenizer *));
77
78
79/* tok_finish():
80 *	Finish a word in the tokenizer.
81 */
82private void
83tok_finish(tok)
84    Tokenizer *tok;
85{
86    *tok->wptr = '\0';
87    if ((tok->flags & TOK_KEEP) || tok->wptr != tok->wstart) {
88	tok->argv[tok->argc++] = tok->wstart;
89	tok->argv[tok->argc] = NULL;
90	tok->wstart = ++tok->wptr;
91    }
92    tok->flags &= ~TOK_KEEP;
93}
94
95
96/* tok_init():
97 *	Initialize the tokenizer
98 */
99public Tokenizer *
100tok_init(ifs)
101    const char *ifs;
102{
103    Tokenizer* tok = (Tokenizer*) tok_malloc(sizeof(Tokenizer));
104
105    tok->ifs     = strdup(ifs ? ifs : IFS);
106    tok->argc    = 0;
107    tok->amax    = AINCR;
108    tok->argv    = (char **) tok_malloc(sizeof(char *) * tok->amax);
109    tok->argv[0] = NULL;
110    tok->wspace  = (char *) tok_malloc(WINCR);
111    tok->wmax    = tok->wspace + WINCR;
112    tok->wstart  = tok->wspace;
113    tok->wptr    = tok->wspace;
114    tok->flags   = 0;
115    tok->quote   = Q_none;
116
117    return tok;
118}
119
120
121/* tok_reset():
122 *	Reset the tokenizer
123 */
124public void
125tok_reset(tok)
126    Tokenizer *tok;
127{
128    tok->argc  = 0;
129    tok->wstart = tok->wspace;
130    tok->wptr = tok->wspace;
131    tok->flags = 0;
132    tok->quote = Q_none;
133}
134
135
136/* tok_end():
137 *	Clean up
138 */
139public void
140tok_end(tok)
141    Tokenizer *tok;
142{
143    tok_free((ptr_t) tok->ifs);
144    tok_free((ptr_t) tok->wspace);
145    tok_free((ptr_t) tok->argv);
146    tok_free((ptr_t) tok);
147}
148
149
150
151/* tok_line():
152 *	Bourne shell like tokenizing
153 *	Return:
154 *		-1: Internal error
155 *		 3: Quoted return
156 *		 2: Unmatched double quote
157 *		 1: Unmatched single quote
158 *		 0: Ok
159 */
160public int
161tok_line(tok, line, argc, argv)
162    Tokenizer *tok;
163    const char* line;
164    int *argc;
165    char ***argv;
166{
167    const char *ptr;
168
169    while (1) {
170	switch (*(ptr = line++)) {
171	case '\'':
172	    tok->flags |= TOK_KEEP;
173	    tok->flags &= ~TOK_EAT;
174	    switch (tok->quote) {
175	    case Q_none:
176		tok->quote = Q_single;	/* Enter single quote mode */
177		break;
178
179	    case Q_single:		/* Exit single quote mode */
180		tok->quote = Q_none;
181		break;
182
183	    case Q_one:			/* Quote this ' */
184		tok->quote = Q_none;
185		*tok->wptr++ = *ptr;
186		break;
187
188	    case Q_double:		/* Stay in double quote mode */
189		*tok->wptr++ = *ptr;
190		break;
191
192	    case Q_doubleone:		/* Quote this ' */
193		tok->quote = Q_double;
194		*tok->wptr++ = *ptr;
195		break;
196
197	    default:
198		return(-1);
199	    }
200	    break;
201
202	case '"':
203	    tok->flags &= ~TOK_EAT;
204	    tok->flags |= TOK_KEEP;
205	    switch (tok->quote) {
206	    case Q_none:		/* Enter double quote mode */
207		tok->quote = Q_double;
208		break;
209
210	    case Q_double:
211		tok->quote = Q_none;	/* Exit double quote mode */
212		break;
213
214	    case Q_one:			/* Quote this " */
215		tok->quote = Q_none;
216		*tok->wptr++ = *ptr;
217		break;
218
219	    case Q_single:		/* Stay in single quote mode */
220		*tok->wptr++ = *ptr;
221		break;
222
223	    case Q_doubleone:		/* Quote this " */
224		tok->quote = Q_double;
225		*tok->wptr++ = *ptr;
226		break;
227
228	    default:
229		return(-1);
230	    }
231	    break;
232
233	case '\\':
234	    tok->flags |= TOK_KEEP;
235	    tok->flags &= ~TOK_EAT;
236	    switch (tok->quote) {
237	    case Q_none:		/* Quote next character */
238		tok->quote = Q_one;
239		break;
240
241	    case Q_double:
242		tok->quote = Q_doubleone;/* Quote next character */
243		break;
244
245	    case Q_one:
246		*tok->wptr++ = *ptr;
247		tok->quote = Q_none;	/* Quote this, restore state */
248		break;
249
250	    case Q_single:		/* Stay in single quote mode */
251		*tok->wptr++ = *ptr;
252		break;
253
254	    case Q_doubleone:		/* Quote this \ */
255		tok->quote = Q_double;
256		*tok->wptr++ = *ptr;
257		break;
258
259	    default:
260		return(-1);
261	    }
262	    break;
263
264	case '\n':
265	    tok->flags &= ~TOK_EAT;
266	    switch (tok->quote) {
267	    case Q_none:
268		tok_finish(tok);
269		*argv = tok->argv;
270		*argc = tok->argc;
271		return(0);
272
273	    case Q_single:
274	    case Q_double:
275		*tok->wptr++ = *ptr;	/* Add the return		*/
276		break;
277
278	    case Q_doubleone:
279		tok->flags |= TOK_EAT;
280		tok->quote = Q_double;	/* Back to double, eat the '\n' */
281		break;
282
283	    case Q_one:
284		tok->flags |= TOK_EAT;
285		tok->quote = Q_none;	/* No quote, more eat the '\n' */
286		break;
287
288	    default:
289		return(0);
290	    }
291	    break;
292
293	case '\0':
294	    switch (tok->quote) {
295	    case Q_none:
296		/* Finish word and return */
297		if (tok->flags & TOK_EAT) {
298		    tok->flags &= ~TOK_EAT;
299		    return 3;
300		}
301		tok_finish(tok);
302		*argv = tok->argv;
303		*argc = tok->argc;
304		return(0);
305
306	    case Q_single:
307		return(1);
308
309	    case Q_double:
310		return(2);
311
312	    case Q_doubleone:
313		tok->quote = Q_double;
314		*tok->wptr++ = *ptr;
315		break;
316
317	    case Q_one:
318		tok->quote = Q_none;
319		*tok->wptr++ = *ptr;
320		break;
321
322	    default:
323		return(-1);
324	    }
325	    break;
326
327	default:
328	    tok->flags &= ~TOK_EAT;
329	    switch (tok->quote) {
330	    case Q_none:
331		if (strchr(tok->ifs, *ptr) != NULL)
332		    tok_finish(tok);
333		else
334		    *tok->wptr++ = *ptr;
335		break;
336
337	    case Q_single:
338	    case Q_double:
339		*tok->wptr++ = *ptr;
340		break;
341
342
343	    case Q_doubleone:
344		*tok->wptr++ = '\\';
345		tok->quote = Q_double;
346		*tok->wptr++ = *ptr;
347		break;
348
349	    case Q_one:
350		tok->quote = Q_none;
351		*tok->wptr++ = *ptr;
352		break;
353
354	    default:
355		return(-1);
356
357	    }
358	    break;
359	}
360
361	if (tok->wptr >= tok->wmax - 4) {
362	    size_t size = tok->wmax - tok->wspace + WINCR;
363	    char *s = (char *) tok_realloc(tok->wspace, size);
364	    /*SUPPRESS 22*/
365	    int offs = s - tok->wspace;
366
367	    if (offs != 0) {
368		int i;
369		for (i = 0; i < tok->argc; i++)
370		    tok->argv[i] = tok->argv[i] + offs;
371		tok->wptr   = tok->wptr + offs;
372		tok->wstart = tok->wstart + offs;
373		tok->wmax   = s + size;
374		tok->wspace = s;
375	    }
376	}
377
378	if (tok->argc >= tok->amax - 4) {
379	    tok->amax += AINCR;
380	    tok->argv = (char **) tok_realloc(tok->argv,
381					      tok->amax * sizeof(char*));
382	}
383
384    }
385}
386