tokenizer.c revision 39327
1/*-
2 * Copyright (c) 1992, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * Christos Zoulas of Cornell University.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 3. All advertising materials mentioning features or use of this software
17 *    must display the following acknowledgement:
18 *	This product includes software developed by the University of
19 *	California, Berkeley and its contributors.
20 * 4. Neither the name of the University nor the names of its contributors
21 *    may be used to endorse or promote products derived from this software
22 *    without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 */
36
37#if !defined(lint) && !defined(SCCSID)
38static char sccsid[] = "@(#)tokenizer.c	8.1 (Berkeley) 6/4/93";
39#endif /* not lint && not SCCSID */
40
/*
 * tokenizer.c: Bourne shell like tokenizer
 */
44#include "sys.h"
45#include <string.h>
46#include <stdlib.h>
47#include "tokenizer.h"
48
/*
 * Quoting state of the tokenizer while it scans a line.
 */
typedef enum {
    Q_none,		/* Not inside any quote				*/
    Q_single,		/* Inside '...'					*/
    Q_double,		/* Inside "..."					*/
    Q_one,		/* Backslash seen outside quotes		*/
    Q_doubleone		/* Backslash seen inside double quotes		*/
} quote_t;
50
/* Default set of field separator characters, used when tok_init() gets NULL */
#define IFS "\t \n"

/* Bits kept in the tokenizer's flags word */
#define TOK_KEEP	1	/* Emit the current word even if it is empty */
#define TOK_EAT		2	/* A quoted newline has just been swallowed  */

/* Growth increments */
#define WINCR 20		/* Bytes added to the word buffer	     */
#define AINCR 10		/* Slots added to the argument vector	     */

/* Allocator hooks; all tokenizer storage goes through these */
#define tok_malloc(a)		malloc(a)
#define tok_free(a)		free(a)
#define tok_realloc(a, b)	realloc(a, b)
#define tok_reallocf(a, b)	reallocf(a, b)
63
64
65struct tokenizer {
66    char   *ifs;		/* In field separator			*/
67    int     argc, amax;		/* Current and maximum number of args	*/
68    char  **argv;		/* Argument list			*/
69    char   *wptr, *wmax;	/* Space and limit on the word buffer	*/
70    char   *wstart;		/* Beginning of next word		*/
71    char   *wspace;		/* Space of word buffer			*/
72    quote_t quote;		/* Quoting state			*/
73    int	    flags;		/* flags;				*/
74};
75
76
77private void tok_finish	__P((Tokenizer *));
78
79
80/* tok_finish():
81 *	Finish a word in the tokenizer.
82 */
83private void
84tok_finish(tok)
85    Tokenizer *tok;
86{
87    *tok->wptr = '\0';
88    if ((tok->flags & TOK_KEEP) || tok->wptr != tok->wstart) {
89	tok->argv[tok->argc++] = tok->wstart;
90	tok->argv[tok->argc] = NULL;
91	tok->wstart = ++tok->wptr;
92    }
93    tok->flags &= ~TOK_KEEP;
94}
95
96
97/* tok_init():
98 *	Initialize the tokenizer
99 */
100public Tokenizer *
101tok_init(ifs)
102    const char *ifs;
103{
104    Tokenizer* tok = (Tokenizer*) tok_malloc(sizeof(Tokenizer));
105
106    tok->ifs     = strdup(ifs ? ifs : IFS);
107    tok->argc    = 0;
108    tok->amax    = AINCR;
109    tok->argv    = (char **) tok_malloc(sizeof(char *) * tok->amax);
110    tok->argv[0] = NULL;
111    tok->wspace  = (char *) tok_malloc(WINCR);
112    tok->wmax    = tok->wspace + WINCR;
113    tok->wstart  = tok->wspace;
114    tok->wptr    = tok->wspace;
115    tok->flags   = 0;
116    tok->quote   = Q_none;
117
118    return tok;
119}
120
121
122/* tok_reset():
123 *	Reset the tokenizer
124 */
125public void
126tok_reset(tok)
127    Tokenizer *tok;
128{
129    tok->argc  = 0;
130    tok->wstart = tok->wspace;
131    tok->wptr = tok->wspace;
132    tok->flags = 0;
133    tok->quote = Q_none;
134}
135
136
137/* tok_end():
138 *	Clean up
139 */
140public void
141tok_end(tok)
142    Tokenizer *tok;
143{
144    tok_free((ptr_t) tok->ifs);
145    tok_free((ptr_t) tok->wspace);
146    tok_free((ptr_t) tok->argv);
147    tok_free((ptr_t) tok);
148}
149
150
151
152/* tok_line():
153 *	Bourne shell like tokenizing
154 *	Return:
155 *		-1: Internal error
156 *		 3: Quoted return
157 *		 2: Unmatched double quote
158 *		 1: Unmatched single quote
159 *		 0: Ok
160 */
161public int
162tok_line(tok, line, argc, argv)
163    Tokenizer *tok;
164    const char* line;
165    int *argc;
166    char ***argv;
167{
168    const char *ptr;
169
170    while (1) {
171	switch (*(ptr = line++)) {
172	case '\'':
173	    tok->flags |= TOK_KEEP;
174	    tok->flags &= ~TOK_EAT;
175	    switch (tok->quote) {
176	    case Q_none:
177		tok->quote = Q_single;	/* Enter single quote mode */
178		break;
179
180	    case Q_single:		/* Exit single quote mode */
181		tok->quote = Q_none;
182		break;
183
184	    case Q_one:			/* Quote this ' */
185		tok->quote = Q_none;
186		*tok->wptr++ = *ptr;
187		break;
188
189	    case Q_double:		/* Stay in double quote mode */
190		*tok->wptr++ = *ptr;
191		break;
192
193	    case Q_doubleone:		/* Quote this ' */
194		tok->quote = Q_double;
195		*tok->wptr++ = *ptr;
196		break;
197
198	    default:
199		return(-1);
200	    }
201	    break;
202
203	case '"':
204	    tok->flags &= ~TOK_EAT;
205	    tok->flags |= TOK_KEEP;
206	    switch (tok->quote) {
207	    case Q_none:		/* Enter double quote mode */
208		tok->quote = Q_double;
209		break;
210
211	    case Q_double:
212		tok->quote = Q_none;	/* Exit double quote mode */
213		break;
214
215	    case Q_one:			/* Quote this " */
216		tok->quote = Q_none;
217		*tok->wptr++ = *ptr;
218		break;
219
220	    case Q_single:		/* Stay in single quote mode */
221		*tok->wptr++ = *ptr;
222		break;
223
224	    case Q_doubleone:		/* Quote this " */
225		tok->quote = Q_double;
226		*tok->wptr++ = *ptr;
227		break;
228
229	    default:
230		return(-1);
231	    }
232	    break;
233
234	case '\\':
235	    tok->flags |= TOK_KEEP;
236	    tok->flags &= ~TOK_EAT;
237	    switch (tok->quote) {
238	    case Q_none:		/* Quote next character */
239		tok->quote = Q_one;
240		break;
241
242	    case Q_double:
243		tok->quote = Q_doubleone;/* Quote next character */
244		break;
245
246	    case Q_one:
247		*tok->wptr++ = *ptr;
248		tok->quote = Q_none;	/* Quote this, restore state */
249		break;
250
251	    case Q_single:		/* Stay in single quote mode */
252		*tok->wptr++ = *ptr;
253		break;
254
255	    case Q_doubleone:		/* Quote this \ */
256		tok->quote = Q_double;
257		*tok->wptr++ = *ptr;
258		break;
259
260	    default:
261		return(-1);
262	    }
263	    break;
264
265	case '\n':
266	    tok->flags &= ~TOK_EAT;
267	    switch (tok->quote) {
268	    case Q_none:
269		tok_finish(tok);
270		*argv = tok->argv;
271		*argc = tok->argc;
272		return(0);
273
274	    case Q_single:
275	    case Q_double:
276		*tok->wptr++ = *ptr;	/* Add the return		*/
277		break;
278
279	    case Q_doubleone:
280		tok->flags |= TOK_EAT;
281		tok->quote = Q_double;	/* Back to double, eat the '\n' */
282		break;
283
284	    case Q_one:
285		tok->flags |= TOK_EAT;
286		tok->quote = Q_none;	/* No quote, more eat the '\n' */
287		break;
288
289	    default:
290		return(0);
291	    }
292	    break;
293
294	case '\0':
295	    switch (tok->quote) {
296	    case Q_none:
297		/* Finish word and return */
298		if (tok->flags & TOK_EAT) {
299		    tok->flags &= ~TOK_EAT;
300		    return 3;
301		}
302		tok_finish(tok);
303		*argv = tok->argv;
304		*argc = tok->argc;
305		return(0);
306
307	    case Q_single:
308		return(1);
309
310	    case Q_double:
311		return(2);
312
313	    case Q_doubleone:
314		tok->quote = Q_double;
315		*tok->wptr++ = *ptr;
316		break;
317
318	    case Q_one:
319		tok->quote = Q_none;
320		*tok->wptr++ = *ptr;
321		break;
322
323	    default:
324		return(-1);
325	    }
326	    break;
327
328	default:
329	    tok->flags &= ~TOK_EAT;
330	    switch (tok->quote) {
331	    case Q_none:
332		if (strchr(tok->ifs, *ptr) != NULL)
333		    tok_finish(tok);
334		else
335		    *tok->wptr++ = *ptr;
336		break;
337
338	    case Q_single:
339	    case Q_double:
340		*tok->wptr++ = *ptr;
341		break;
342
343
344	    case Q_doubleone:
345		*tok->wptr++ = '\\';
346		tok->quote = Q_double;
347		*tok->wptr++ = *ptr;
348		break;
349
350	    case Q_one:
351		tok->quote = Q_none;
352		*tok->wptr++ = *ptr;
353		break;
354
355	    default:
356		return(-1);
357
358	    }
359	    break;
360	}
361
362	if (tok->wptr >= tok->wmax - 4) {
363	    size_t size = tok->wmax - tok->wspace + WINCR;
364	    char *s = (char *) tok_realloc(tok->wspace, size);
365	    /*SUPPRESS 22*/
366	    int offs = s - tok->wspace;
367
368	    if (offs != 0) {
369		int i;
370		for (i = 0; i < tok->argc; i++)
371		    tok->argv[i] = tok->argv[i] + offs;
372		tok->wptr   = tok->wptr + offs;
373		tok->wstart = tok->wstart + offs;
374		tok->wmax   = s + size;
375		tok->wspace = s;
376	    }
377	}
378
379	if (tok->argc >= tok->amax - 4) {
380	    tok->amax += AINCR;
381	    tok->argv = (char **) tok_reallocf(tok->argv,
382					       tok->amax * sizeof(char*));
383	}
384
385    }
386}
387