1/* $NetBSD: tokenize.c,v 1.1.1.1 2009/12/13 16:55:15 kardel Exp $ */ 2 3/* 4 * This file defines the string_tokenize interface 5 * Time-stamp: "2007-11-12 20:40:36 bkorb" 6 * 7 * This file is part of AutoOpts, a companion to AutoGen. 8 * AutoOpts is free software. 9 * AutoOpts is copyright (c) 1992-2009 by Bruce Korb - all rights reserved 10 * 11 * AutoOpts is available under any one of two licenses. The license 12 * in use must be one of these two and the choice is under the control 13 * of the user of the license. 14 * 15 * The GNU Lesser General Public License, version 3 or later 16 * See the files "COPYING.lgplv3" and "COPYING.gplv3" 17 * 18 * The Modified Berkeley Software Distribution License 19 * See the file "COPYING.mbsd" 20 * 21 * These files have the following md5sums: 22 * 23 * 43b91e8ca915626ed3818ffb1b71248b pkg/libopts/COPYING.gplv3 24 * 06a1a2e4760c90ea5e1dad8dfaac4d39 pkg/libopts/COPYING.lgplv3 25 * 66a5cedaf62c4b2637025f049f9b826f pkg/libopts/COPYING.mbsd 26 */ 27 28#include <errno.h> 29#include <stdlib.h> 30 31#define cc_t const unsigned char 32#define ch_t unsigned char 33 34/* = = = START-STATIC-FORWARD = = = */ 35/* static forward declarations maintained by mk-fwd */ 36static void 37copy_cooked( ch_t** ppDest, char const ** ppSrc ); 38 39static void 40copy_raw( ch_t** ppDest, char const ** ppSrc ); 41/* = = = END-STATIC-FORWARD = = = */ 42 43static void 44copy_cooked( ch_t** ppDest, char const ** ppSrc ) 45{ 46 ch_t* pDest = (ch_t*)*ppDest; 47 const ch_t* pSrc = (const ch_t*)(*ppSrc + 1); 48 49 for (;;) { 50 ch_t ch = *(pSrc++); 51 switch (ch) { 52 case NUL: *ppSrc = NULL; return; 53 case '"': goto done; 54 case '\\': 55 pSrc += ao_string_cook_escape_char( (const char*)pSrc, (char*)&ch, 0x7F ); 56 if (ch == 0x7F) 57 break; 58 /* FALLTHROUGH */ 59 60 default: 61 *(pDest++) = ch; 62 } 63 } 64 65 done: 66 *ppDest = (ch_t*)pDest; /* next spot for storing character */ 67 *ppSrc = (char const *)pSrc; /* char following closing quote */ 68} 69 70 71static void 72copy_raw( ch_t** ppDest, char const ** ppSrc ) 73{ 74 ch_t* pDest = *ppDest; 75 cc_t* pSrc = (cc_t*) (*ppSrc + 1); 76 77 for (;;) { 78 ch_t ch = *(pSrc++); 79 switch (ch) { 80 case NUL: *ppSrc = NULL; return; 81 case '\'': goto done; 82 case '\\': 83 /* 84 * *Four* escapes are handled: newline removal, escape char 85 * quoting and apostrophe quoting 86 */ 87 switch (*pSrc) { 88 case NUL: *ppSrc = NULL; return; 89 case '\r': 90 if (*(++pSrc) == '\n') 91 ++pSrc; 92 continue; 93 94 case '\n': 95 ++pSrc; 96 continue; 97 98 case '\'': 99 ch = '\''; 100 /* FALLTHROUGH */ 101 102 case '\\': 103 ++pSrc; 104 break; 105 } 106 /* FALLTHROUGH */ 107 108 default: 109 *(pDest++) = ch; 110 } 111 } 112 113 done: 114 *ppDest = pDest; /* next spot for storing character */ 115 *ppSrc = (char const *) pSrc; /* char following closing quote */ 116} 117 118 119/*=export_func ao_string_tokenize 120 * 121 * what: tokenize an input string 122 * 123 * arg: + char const* + string + string to be tokenized + 124 * 125 * ret_type: token_list_t* 126 * ret_desc: pointer to a structure that lists each token 127 * 128 * doc: 129 * 130 * This function will convert one input string into a list of strings. 131 * The list of strings is derived by separating the input based on 132 * white space separation. However, if the input contains either single 133 * or double quote characters, then the text after that character up to 134 * a matching quote will become the string in the list. 135 * 136 * The returned pointer should be deallocated with @code{free(3C)} when 137 * are done using the data. The data are placed in a single block of 138 * allocated memory. Do not deallocate individual token/strings. 139 * 140 * The structure pointed to will contain at least these two fields: 141 * @table @samp 142 * @item tkn_ct 143 * The number of tokens found in the input string. 144 * @item tok_list 145 * An array of @code{tkn_ct + 1} pointers to substring tokens, with 146 * the last pointer set to NULL. 147 * @end table 148 * 149 * There are two types of quoted strings: single quoted (@code{'}) and 150 * double quoted (@code{"}). Singly quoted strings are fairly raw in that 151 * escape characters (@code{\\}) are simply another character, except when 152 * preceding the following characters: 153 * @example 154 * @code{\\} double backslashes reduce to one 155 * @code{'} incorporates the single quote into the string 156 * @code{\n} suppresses both the backslash and newline character 157 * @end example 158 * 159 * Double quote strings are formed according to the rules of string 160 * constants in ANSI-C programs. 161 * 162 * example: 163 * @example 164 * #include <stdlib.h> 165 * int ix; 166 * token_list_t* ptl = ao_string_tokenize( some_string ) 167 * for (ix = 0; ix < ptl->tkn_ct; ix++) 168 * do_something_with_tkn( ptl->tkn_list[ix] ); 169 * free( ptl ); 170 * @end example 171 * Note that everything is freed with the one call to @code{free(3C)}. 172 * 173 * err: 174 * NULL is returned and @code{errno} will be set to indicate the problem: 175 * @itemize @bullet 176 * @item 177 * @code{EINVAL} - There was an unterminated quoted string. 178 * @item 179 * @code{ENOENT} - The input string was empty. 180 * @item 181 * @code{ENOMEM} - There is not enough memory. 182 * @end itemize 183=*/ 184token_list_t* 185ao_string_tokenize( char const* str ) 186{ 187 int max_token_ct = 1; /* allow for trailing NUL on string */ 188 token_list_t* res; 189 190 if (str == NULL) goto bogus_str; 191 192 /* 193 * Trim leading white space. Use "ENOENT" and a NULL return to indicate 194 * an empty string was passed. 195 */ 196 while (IS_WHITESPACE_CHAR(*str)) str++; 197 if (*str == NUL) { 198 bogus_str: 199 errno = ENOENT; 200 return NULL; 201 } 202 203 /* 204 * Take an approximate count of tokens. If no quoted strings are used, 205 * it will be accurate. If quoted strings are used, it will be a little 206 * high and we'll squander the space for a few extra pointers. 207 */ 208 { 209 cc_t* pz = (cc_t*)str; 210 211 do { 212 max_token_ct++; 213 while (! IS_WHITESPACE_CHAR(*++pz)) 214 if (*pz == NUL) goto found_nul; 215 while (IS_WHITESPACE_CHAR(*pz)) pz++; 216 } while (*pz != NUL); 217 218 found_nul: 219 ; 220 } 221 222 res = malloc( sizeof(*res) + strlen(str) + (max_token_ct * sizeof(ch_t*)) ); 223 if (res == NULL) { 224 errno = ENOMEM; 225 return res; 226 } 227 228 /* 229 * Now copy each token into the output buffer. 230 */ 231 { 232 ch_t* pzDest = (ch_t*)(res->tkn_list + (max_token_ct + 1)); 233 res->tkn_ct = 0; 234 235 do { 236 res->tkn_list[ res->tkn_ct++ ] = pzDest; 237 for (;;) { 238 int ch = (ch_t)*str; 239 if (IS_WHITESPACE_CHAR(ch)) { 240 found_white_space: 241 while (IS_WHITESPACE_CHAR(*++str)) ; 242 break; 243 } 244 245 switch (ch) { 246 case '"': 247 copy_cooked( &pzDest, &str ); 248 if (str == NULL) { 249 free(res); 250 errno = EINVAL; 251 return NULL; 252 } 253 if (IS_WHITESPACE_CHAR(*str)) 254 goto found_white_space; 255 break; 256 257 case '\'': 258 copy_raw( &pzDest, &str ); 259 if (str == NULL) { 260 free(res); 261 errno = EINVAL; 262 return NULL; 263 } 264 if (IS_WHITESPACE_CHAR(*str)) 265 goto found_white_space; 266 break; 267 268 case NUL: 269 goto copy_done; 270 271 default: 272 str++; 273 *(pzDest++) = ch; 274 } 275 } copy_done:; 276 277 /* 278 * NUL terminate the last token and see if we have any more tokens. 279 */ 280 *(pzDest++) = NUL; 281 } while (*str != NUL); 282 283 res->tkn_list[ res->tkn_ct ] = NULL; 284 } 285 286 return res; 287} 288 289#ifdef TEST 290#include <stdio.h> 291#include <string.h> 292 293int 294main( int argc, char** argv ) 295{ 296 if (argc == 1) { 297 printf("USAGE: %s arg [ ... ]\n", *argv); 298 return 1; 299 } 300 while (--argc > 0) { 301 char* arg = *(++argv); 302 token_list_t* p = ao_string_tokenize( arg ); 303 if (p == NULL) { 304 printf( "Parsing string ``%s'' failed:\n\terrno %d (%s)\n", 305 arg, errno, strerror( errno )); 306 } else { 307 int ix = 0; 308 printf( "Parsed string ``%s''\ninto %d tokens:\n", arg, p->tkn_ct ); 309 do { 310 printf( " %3d: ``%s''\n", ix+1, p->tkn_list[ix] ); 311 } while (++ix < p->tkn_ct); 312 free(p); 313 } 314 } 315 return 0; 316} 317#endif 318 319/* 320 * Local Variables: 321 * mode: C 322 * c-file-style: "stroustrup" 323 * indent-tabs-mode: nil 324 * End: 325 * end of autoopts/tokenize.c */ 326