/* tokenizer.c revision 1573 */
1/*- 2 * Copyright (c) 1992, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * This code is derived from software contributed to Berkeley by 6 * Christos Zoulas of Cornell University. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. All advertising materials mentioning features or use of this software 17 * must display the following acknowledgement: 18 * This product includes software developed by the University of 19 * California, Berkeley and its contributors. 20 * 4. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 
35 */ 36 37#if !defined(lint) && !defined(SCCSID) 38static char sccsid[] = "@(#)tokenizer.c 8.1 (Berkeley) 6/4/93"; 39#endif /* not lint && not SCCSID */ 40 41/* 42 * tokenize.c: Bourne shell like tokenizer 43 */ 44#include "sys.h" 45#include <string.h> 46#include <stdlib.h> 47#include "tokenizer.h" 48 49typedef enum { Q_none, Q_single, Q_double, Q_one, Q_doubleone } quote_t; 50 51#define IFS "\t \n" 52 53#define TOK_KEEP 1 54#define TOK_EAT 2 55 56#define WINCR 20 57#define AINCR 10 58 59#define tok_malloc(a) malloc(a) 60#define tok_free(a) free(a) 61#define tok_realloc(a, b) realloc(a, b) 62 63 64struct tokenizer { 65 char *ifs; /* In field separator */ 66 int argc, amax; /* Current and maximum number of args */ 67 char **argv; /* Argument list */ 68 char *wptr, *wmax; /* Space and limit on the word buffer */ 69 char *wstart; /* Beginning of next word */ 70 char *wspace; /* Space of word buffer */ 71 quote_t quote; /* Quoting state */ 72 int flags; /* flags; */ 73}; 74 75 76private void tok_finish __P((Tokenizer *)); 77 78 79/* tok_finish(): 80 * Finish a word in the tokenizer. 81 */ 82private void 83tok_finish(tok) 84 Tokenizer *tok; 85{ 86 *tok->wptr = '\0'; 87 if ((tok->flags & TOK_KEEP) || tok->wptr != tok->wstart) { 88 tok->argv[tok->argc++] = tok->wstart; 89 tok->argv[tok->argc] = NULL; 90 tok->wstart = ++tok->wptr; 91 } 92 tok->flags &= ~TOK_KEEP; 93} 94 95 96/* tok_init(): 97 * Initialize the tokenizer 98 */ 99public Tokenizer * 100tok_init(ifs) 101 const char *ifs; 102{ 103 Tokenizer* tok = (Tokenizer*) tok_malloc(sizeof(Tokenizer)); 104 105 tok->ifs = strdup(ifs ? 
ifs : IFS); 106 tok->argc = 0; 107 tok->amax = AINCR; 108 tok->argv = (char **) tok_malloc(sizeof(char *) * tok->amax); 109 tok->argv[0] = NULL; 110 tok->wspace = (char *) tok_malloc(WINCR); 111 tok->wmax = tok->wspace + WINCR; 112 tok->wstart = tok->wspace; 113 tok->wptr = tok->wspace; 114 tok->flags = 0; 115 tok->quote = Q_none; 116 117 return tok; 118} 119 120 121/* tok_reset(): 122 * Reset the tokenizer 123 */ 124public void 125tok_reset(tok) 126 Tokenizer *tok; 127{ 128 tok->argc = 0; 129 tok->wstart = tok->wspace; 130 tok->wptr = tok->wspace; 131 tok->flags = 0; 132 tok->quote = Q_none; 133} 134 135 136/* tok_end(): 137 * Clean up 138 */ 139public void 140tok_end(tok) 141 Tokenizer *tok; 142{ 143 tok_free((ptr_t) tok->ifs); 144 tok_free((ptr_t) tok->wspace); 145 tok_free((ptr_t) tok->argv); 146 tok_free((ptr_t) tok); 147} 148 149 150 151/* tok_line(): 152 * Bourne shell like tokenizing 153 * Return: 154 * -1: Internal error 155 * 3: Quoted return 156 * 2: Unmatched double quote 157 * 1: Unmatched single quote 158 * 0: Ok 159 */ 160public int 161tok_line(tok, line, argc, argv) 162 Tokenizer *tok; 163 const char* line; 164 int *argc; 165 char ***argv; 166{ 167 const char *ptr; 168 169 while (1) { 170 switch (*(ptr = line++)) { 171 case '\'': 172 tok->flags |= TOK_KEEP; 173 tok->flags &= ~TOK_EAT; 174 switch (tok->quote) { 175 case Q_none: 176 tok->quote = Q_single; /* Enter single quote mode */ 177 break; 178 179 case Q_single: /* Exit single quote mode */ 180 tok->quote = Q_none; 181 break; 182 183 case Q_one: /* Quote this ' */ 184 tok->quote = Q_none; 185 *tok->wptr++ = *ptr; 186 break; 187 188 case Q_double: /* Stay in double quote mode */ 189 *tok->wptr++ = *ptr; 190 break; 191 192 case Q_doubleone: /* Quote this ' */ 193 tok->quote = Q_double; 194 *tok->wptr++ = *ptr; 195 break; 196 197 default: 198 return(-1); 199 } 200 break; 201 202 case '"': 203 tok->flags &= ~TOK_EAT; 204 tok->flags |= TOK_KEEP; 205 switch (tok->quote) { 206 case Q_none: /* Enter 
double quote mode */ 207 tok->quote = Q_double; 208 break; 209 210 case Q_double: 211 tok->quote = Q_none; /* Exit double quote mode */ 212 break; 213 214 case Q_one: /* Quote this " */ 215 tok->quote = Q_none; 216 *tok->wptr++ = *ptr; 217 break; 218 219 case Q_single: /* Stay in single quote mode */ 220 *tok->wptr++ = *ptr; 221 break; 222 223 case Q_doubleone: /* Quote this " */ 224 tok->quote = Q_double; 225 *tok->wptr++ = *ptr; 226 break; 227 228 default: 229 return(-1); 230 } 231 break; 232 233 case '\\': 234 tok->flags |= TOK_KEEP; 235 tok->flags &= ~TOK_EAT; 236 switch (tok->quote) { 237 case Q_none: /* Quote next character */ 238 tok->quote = Q_one; 239 break; 240 241 case Q_double: 242 tok->quote = Q_doubleone;/* Quote next character */ 243 break; 244 245 case Q_one: 246 *tok->wptr++ = *ptr; 247 tok->quote = Q_none; /* Quote this, restore state */ 248 break; 249 250 case Q_single: /* Stay in single quote mode */ 251 *tok->wptr++ = *ptr; 252 break; 253 254 case Q_doubleone: /* Quote this \ */ 255 tok->quote = Q_double; 256 *tok->wptr++ = *ptr; 257 break; 258 259 default: 260 return(-1); 261 } 262 break; 263 264 case '\n': 265 tok->flags &= ~TOK_EAT; 266 switch (tok->quote) { 267 case Q_none: 268 tok_finish(tok); 269 *argv = tok->argv; 270 *argc = tok->argc; 271 return(0); 272 273 case Q_single: 274 case Q_double: 275 *tok->wptr++ = *ptr; /* Add the return */ 276 break; 277 278 case Q_doubleone: 279 tok->flags |= TOK_EAT; 280 tok->quote = Q_double; /* Back to double, eat the '\n' */ 281 break; 282 283 case Q_one: 284 tok->flags |= TOK_EAT; 285 tok->quote = Q_none; /* No quote, more eat the '\n' */ 286 break; 287 288 default: 289 return(0); 290 } 291 break; 292 293 case '\0': 294 switch (tok->quote) { 295 case Q_none: 296 /* Finish word and return */ 297 if (tok->flags & TOK_EAT) { 298 tok->flags &= ~TOK_EAT; 299 return 3; 300 } 301 tok_finish(tok); 302 *argv = tok->argv; 303 *argc = tok->argc; 304 return(0); 305 306 case Q_single: 307 return(1); 308 309 case 
Q_double: 310 return(2); 311 312 case Q_doubleone: 313 tok->quote = Q_double; 314 *tok->wptr++ = *ptr; 315 break; 316 317 case Q_one: 318 tok->quote = Q_none; 319 *tok->wptr++ = *ptr; 320 break; 321 322 default: 323 return(-1); 324 } 325 break; 326 327 default: 328 tok->flags &= ~TOK_EAT; 329 switch (tok->quote) { 330 case Q_none: 331 if (strchr(tok->ifs, *ptr) != NULL) 332 tok_finish(tok); 333 else 334 *tok->wptr++ = *ptr; 335 break; 336 337 case Q_single: 338 case Q_double: 339 *tok->wptr++ = *ptr; 340 break; 341 342 343 case Q_doubleone: 344 *tok->wptr++ = '\\'; 345 tok->quote = Q_double; 346 *tok->wptr++ = *ptr; 347 break; 348 349 case Q_one: 350 tok->quote = Q_none; 351 *tok->wptr++ = *ptr; 352 break; 353 354 default: 355 return(-1); 356 357 } 358 break; 359 } 360 361 if (tok->wptr >= tok->wmax - 4) { 362 size_t size = tok->wmax - tok->wspace + WINCR; 363 char *s = (char *) tok_realloc(tok->wspace, size); 364 /*SUPPRESS 22*/ 365 int offs = s - tok->wspace; 366 367 if (offs != 0) { 368 int i; 369 for (i = 0; i < tok->argc; i++) 370 tok->argv[i] = tok->argv[i] + offs; 371 tok->wptr = tok->wptr + offs; 372 tok->wstart = tok->wstart + offs; 373 tok->wmax = s + size; 374 tok->wspace = s; 375 } 376 } 377 378 if (tok->argc >= tok->amax - 4) { 379 tok->amax += AINCR; 380 tok->argv = (char **) tok_realloc(tok->argv, 381 tok->amax * sizeof(char*)); 382 } 383 384 } 385} 386