1/*	$NetBSD: tokenizer.c,v 1.5 2005/06/09 16:48:58 lukem Exp $	*/
2/*	from	NetBSD: tokenizer.c,v 1.14 2003/12/05 13:37:48 lukem Exp	*/
3
4/*-
5 * Copyright (c) 1992, 1993
6 *	The Regents of the University of California.  All rights reserved.
7 *
8 * This code is derived from software contributed to Berkeley by
9 * Christos Zoulas of Cornell University.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 *    notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 *    notice, this list of conditions and the following disclaimer in the
18 *    documentation and/or other materials provided with the distribution.
19 * 3. Neither the name of the University nor the names of its contributors
20 *    may be used to endorse or promote products derived from this software
21 *    without specific prior written permission.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * SUCH DAMAGE.
34 */
35
36#include "tnftp.h"
37#include "sys.h"
38
/*
 * tokenizer.c: Bourne shell like tokenizer
 */
42#include <string.h>
43#include <stdlib.h>
44#include "histedit.h"
45
/*
 * Quoting state of the tokenizer while it scans a line.
 */
typedef enum {
	Q_none = 0,		/* not inside any quoting		 */
	Q_single = 1,		/* inside '...'				 */
	Q_double = 2,		/* inside "..."				 */
	Q_one = 3,		/* a backslash was seen outside quotes	 */
	Q_doubleone = 4		/* a backslash was seen inside "..."	 */
} quote_t;
49
/* Word separators used when tok_init() is called with a NULL ifs. */
#define	IFS		"\t \n"

/*
 * Bits stored in tokenizer.flags.
 *	TOK_KEEP  a quote or backslash was seen, so tok_finish() keeps the
 *		  current word even if it is empty
 *	TOK_EAT	  the previous character was a backslash-escaped newline
 */
#define	TOK_KEEP	0x01
#define	TOK_EAT		0x02

/* Initial size and growth step of the word buffer (bytes) and of argv. */
#define	WINCR		20
#define	AINCR		10

/* Allocation hooks; all of the tokenizer's memory goes through these. */
#define	tok_strdup(s)		strdup(s)
#define	tok_malloc(n)		malloc(n)
#define	tok_free(p)		free(p)
#define	tok_realloc(p, n)	realloc(p, n)
62
63
64struct tokenizer {
65	char	*ifs;		/* In field separator			 */
66	int	 argc, amax;	/* Current and maximum number of args	 */
67	char   **argv;		/* Argument list			 */
68	char	*wptr, *wmax;	/* Space and limit on the word buffer	 */
69	char	*wstart;	/* Beginning of next word		 */
70	char	*wspace;	/* Space of word buffer			 */
71	quote_t	 quote;		/* Quoting state			 */
72	int	 flags;		/* flags;				 */
73};
74
75
76private void tok_finish(Tokenizer *);
77
78
79/* tok_finish():
80 *	Finish a word in the tokenizer.
81 */
82private void
83tok_finish(Tokenizer *tok)
84{
85
86	*tok->wptr = '\0';
87	if ((tok->flags & TOK_KEEP) || tok->wptr != tok->wstart) {
88		tok->argv[tok->argc++] = tok->wstart;
89		tok->argv[tok->argc] = NULL;
90		tok->wstart = ++tok->wptr;
91	}
92	tok->flags &= ~TOK_KEEP;
93}
94
95
96/* tok_init():
97 *	Initialize the tokenizer
98 */
99public Tokenizer *
100tok_init(const char *ifs)
101{
102	Tokenizer *tok = (Tokenizer *) tok_malloc(sizeof(Tokenizer));
103
104	if (tok == NULL)
105		return NULL;
106	tok->ifs = tok_strdup(ifs ? ifs : IFS);
107	if (tok->ifs == NULL) {
108		tok_free((ptr_t)tok);
109		return NULL;
110	}
111	tok->argc = 0;
112	tok->amax = AINCR;
113	tok->argv = (char **) tok_malloc(sizeof(char *) * tok->amax);
114	if (tok->argv == NULL) {
115		tok_free((ptr_t)tok->ifs);
116		tok_free((ptr_t)tok);
117		return NULL;
118	}
119	tok->argv[0] = NULL;
120	tok->wspace = (char *) tok_malloc(WINCR);
121	if (tok->wspace == NULL) {
122		tok_free((ptr_t)tok->argv);
123		tok_free((ptr_t)tok->ifs);
124		tok_free((ptr_t)tok);
125		return NULL;
126	}
127	tok->wmax = tok->wspace + WINCR;
128	tok->wstart = tok->wspace;
129	tok->wptr = tok->wspace;
130	tok->flags = 0;
131	tok->quote = Q_none;
132
133	return (tok);
134}
135
136
137/* tok_reset():
138 *	Reset the tokenizer
139 */
140public void
141tok_reset(Tokenizer *tok)
142{
143
144	tok->argc = 0;
145	tok->wstart = tok->wspace;
146	tok->wptr = tok->wspace;
147	tok->flags = 0;
148	tok->quote = Q_none;
149}
150
151
152/* tok_end():
153 *	Clean up
154 */
155public void
156tok_end(Tokenizer *tok)
157{
158
159	tok_free((ptr_t) tok->ifs);
160	tok_free((ptr_t) tok->wspace);
161	tok_free((ptr_t) tok->argv);
162	tok_free((ptr_t) tok);
163}
164
165
166
167/* tok_line():
168 *	Bourne shell (sh(1)) like tokenizing
169 *	Arguments:
170 *		tok	current tokenizer state (setup with tok_init())
171 *		line	line to parse
172 *	Returns:
173 *		-1	Internal error
174 *		 3	Quoted return
175 *		 2	Unmatched double quote
176 *		 1	Unmatched single quote
177 *		 0	Ok
178 *	Modifies (if return value is 0):
179 *		argc	number of arguments
180 *		argv	argument array
181 *		cursorc	if !NULL, argv element containing cursor
182 *		cursorv	if !NULL, offset in argv[cursorc] of cursor
183 */
184public int
185tok_line(Tokenizer *tok, const LineInfo *line,
186    int *argc, const char ***argv, int *cursorc, int *cursoro)
187{
188	const char *ptr;
189	int cc, co;
190
191	cc = co = -1;
192	ptr = line->buffer;
193	for (ptr = line->buffer; ;ptr++) {
194		if (ptr >= line->lastchar)
195			ptr = "";
196		if (ptr == line->cursor) {
197			cc = tok->argc;
198			co = tok->wptr - tok->wstart;
199		}
200		switch (*ptr) {
201		case '\'':
202			tok->flags |= TOK_KEEP;
203			tok->flags &= ~TOK_EAT;
204			switch (tok->quote) {
205			case Q_none:
206				tok->quote = Q_single;	/* Enter single quote
207							 * mode */
208				break;
209
210			case Q_single:	/* Exit single quote mode */
211				tok->quote = Q_none;
212				break;
213
214			case Q_one:	/* Quote this ' */
215				tok->quote = Q_none;
216				*tok->wptr++ = *ptr;
217				break;
218
219			case Q_double:	/* Stay in double quote mode */
220				*tok->wptr++ = *ptr;
221				break;
222
223			case Q_doubleone:	/* Quote this ' */
224				tok->quote = Q_double;
225				*tok->wptr++ = *ptr;
226				break;
227
228			default:
229				return (-1);
230			}
231			break;
232
233		case '"':
234			tok->flags &= ~TOK_EAT;
235			tok->flags |= TOK_KEEP;
236			switch (tok->quote) {
237			case Q_none:	/* Enter double quote mode */
238				tok->quote = Q_double;
239				break;
240
241			case Q_double:	/* Exit double quote mode */
242				tok->quote = Q_none;
243				break;
244
245			case Q_one:	/* Quote this " */
246				tok->quote = Q_none;
247				*tok->wptr++ = *ptr;
248				break;
249
250			case Q_single:	/* Stay in single quote mode */
251				*tok->wptr++ = *ptr;
252				break;
253
254			case Q_doubleone:	/* Quote this " */
255				tok->quote = Q_double;
256				*tok->wptr++ = *ptr;
257				break;
258
259			default:
260				return (-1);
261			}
262			break;
263
264		case '\\':
265			tok->flags |= TOK_KEEP;
266			tok->flags &= ~TOK_EAT;
267			switch (tok->quote) {
268			case Q_none:	/* Quote next character */
269				tok->quote = Q_one;
270				break;
271
272			case Q_double:	/* Quote next character */
273				tok->quote = Q_doubleone;
274				break;
275
276			case Q_one:	/* Quote this, restore state */
277				*tok->wptr++ = *ptr;
278				tok->quote = Q_none;
279				break;
280
281			case Q_single:	/* Stay in single quote mode */
282				*tok->wptr++ = *ptr;
283				break;
284
285			case Q_doubleone:	/* Quote this \ */
286				tok->quote = Q_double;
287				*tok->wptr++ = *ptr;
288				break;
289
290			default:
291				return (-1);
292			}
293			break;
294
295		case '\n':
296			tok->flags &= ~TOK_EAT;
297			switch (tok->quote) {
298			case Q_none:
299				goto tok_line_outok;
300
301			case Q_single:
302			case Q_double:
303				*tok->wptr++ = *ptr;	/* Add the return */
304				break;
305
306			case Q_doubleone:   /* Back to double, eat the '\n' */
307				tok->flags |= TOK_EAT;
308				tok->quote = Q_double;
309				break;
310
311			case Q_one:	/* No quote, more eat the '\n' */
312				tok->flags |= TOK_EAT;
313				tok->quote = Q_none;
314				break;
315
316			default:
317				return (0);
318			}
319			break;
320
321		case '\0':
322			switch (tok->quote) {
323			case Q_none:
324				/* Finish word and return */
325				if (tok->flags & TOK_EAT) {
326					tok->flags &= ~TOK_EAT;
327					return (3);
328				}
329				goto tok_line_outok;
330
331			case Q_single:
332				return (1);
333
334			case Q_double:
335				return (2);
336
337			case Q_doubleone:
338				tok->quote = Q_double;
339				*tok->wptr++ = *ptr;
340				break;
341
342			case Q_one:
343				tok->quote = Q_none;
344				*tok->wptr++ = *ptr;
345				break;
346
347			default:
348				return (-1);
349			}
350			break;
351
352		default:
353			tok->flags &= ~TOK_EAT;
354			switch (tok->quote) {
355			case Q_none:
356				if (strchr(tok->ifs, *ptr) != NULL)
357					tok_finish(tok);
358				else
359					*tok->wptr++ = *ptr;
360				break;
361
362			case Q_single:
363			case Q_double:
364				*tok->wptr++ = *ptr;
365				break;
366
367
368			case Q_doubleone:
369				*tok->wptr++ = '\\';
370				tok->quote = Q_double;
371				*tok->wptr++ = *ptr;
372				break;
373
374			case Q_one:
375				tok->quote = Q_none;
376				*tok->wptr++ = *ptr;
377				break;
378
379			default:
380				return (-1);
381
382			}
383			break;
384		}
385
386		if (tok->wptr >= tok->wmax - 4) {
387			size_t size = tok->wmax - tok->wspace + WINCR;
388			char *s = (char *) tok_realloc(tok->wspace, size);
389			if (s == NULL)
390				return (-1);
391
392			if (s != tok->wspace) {
393				int i;
394				for (i = 0; i < tok->argc; i++) {
395				    tok->argv[i] =
396					(tok->argv[i] - tok->wspace) + s;
397				}
398				tok->wptr = (tok->wptr - tok->wspace) + s;
399				tok->wstart = (tok->wstart - tok->wspace) + s;
400				tok->wspace = s;
401			}
402			tok->wmax = s + size;
403		}
404		if (tok->argc >= tok->amax - 4) {
405			char **p;
406			tok->amax += AINCR;
407			p = (char **) tok_realloc(tok->argv,
408			    tok->amax * sizeof(char *));
409			if (p == NULL)
410				return (-1);
411			tok->argv = p;
412		}
413	}
414 tok_line_outok:
415	if (cc == -1 && co == -1) {
416		cc = tok->argc;
417		co = tok->wptr - tok->wstart;
418	}
419	if (cursorc != NULL)
420		*cursorc = cc;
421	if (cursoro != NULL)
422		*cursoro = co;
423	tok_finish(tok);
424	*argv = (const char **)tok->argv;
425	*argc = tok->argc;
426	return (0);
427}
428
429/* tok_str():
430 *	Simpler version of tok_line, taking a NUL terminated line
431 *	and splitting into words, ignoring cursor state.
432 */
433public int
434tok_str(Tokenizer *tok, const char *line, int *argc, const char ***argv)
435{
436	LineInfo li;
437
438	memset(&li, 0, sizeof(li));
439	li.buffer = line;
440	li.cursor = li.lastchar = strchr(line, '\0');
441	return (tok_line(tok, &li, argc, argv, NULL, NULL));
442}
443