1/* 2 * This file defines the string_tokenize interface 3 * Time-stamp: "2007-11-12 20:40:36 bkorb" 4 * 5 * This file is part of AutoOpts, a companion to AutoGen. 6 * AutoOpts is free software. 7 * AutoOpts is copyright (c) 1992-2009 by Bruce Korb - all rights reserved 8 * 9 * AutoOpts is available under any one of two licenses. The license 10 * in use must be one of these two and the choice is under the control 11 * of the user of the license. 12 * 13 * The GNU Lesser General Public License, version 3 or later 14 * See the files "COPYING.lgplv3" and "COPYING.gplv3" 15 * 16 * The Modified Berkeley Software Distribution License 17 * See the file "COPYING.mbsd" 18 * 19 * These files have the following md5sums: 20 * 21 * 43b91e8ca915626ed3818ffb1b71248b pkg/libopts/COPYING.gplv3 22 * 06a1a2e4760c90ea5e1dad8dfaac4d39 pkg/libopts/COPYING.lgplv3 23 * 66a5cedaf62c4b2637025f049f9b826f pkg/libopts/COPYING.mbsd 24 */ 25 26#include <errno.h> 27#include <stdlib.h> 28 29#define cc_t const unsigned char 30#define ch_t unsigned char 31 32/* = = = START-STATIC-FORWARD = = = */ 33/* static forward declarations maintained by mk-fwd */ 34static void 35copy_cooked( ch_t** ppDest, char const ** ppSrc ); 36 37static void 38copy_raw( ch_t** ppDest, char const ** ppSrc ); 39/* = = = END-STATIC-FORWARD = = = */ 40 41static void 42copy_cooked( ch_t** ppDest, char const ** ppSrc ) 43{ 44 ch_t* pDest = (ch_t*)*ppDest; 45 const ch_t* pSrc = (const ch_t*)(*ppSrc + 1); 46 47 for (;;) { 48 ch_t ch = *(pSrc++); 49 switch (ch) { 50 case NUL: *ppSrc = NULL; return; 51 case '"': goto done; 52 case '\\': 53 pSrc += ao_string_cook_escape_char( (char*)pSrc, (char*)&ch, 0x7F ); 54 if (ch == 0x7F) 55 break; 56 /* FALLTHROUGH */ 57 58 default: 59 *(pDest++) = ch; 60 } 61 } 62 63 done: 64 *ppDest = (ch_t*)pDest; /* next spot for storing character */ 65 *ppSrc = (char const *)pSrc; /* char following closing quote */ 66} 67 68 69static void 70copy_raw( ch_t** ppDest, char const ** ppSrc ) 71{ 72 ch_t* pDest = *ppDest; 73 cc_t* pSrc = (cc_t*) (*ppSrc + 1); 74 75 for (;;) { 76 ch_t ch = *(pSrc++); 77 switch (ch) { 78 case NUL: *ppSrc = NULL; return; 79 case '\'': goto done; 80 case '\\': 81 /* 82 * *Four* escapes are handled: newline removal, escape char 83 * quoting and apostrophe quoting 84 */ 85 switch (*pSrc) { 86 case NUL: *ppSrc = NULL; return; 87 case '\r': 88 if (*(++pSrc) == '\n') 89 ++pSrc; 90 continue; 91 92 case '\n': 93 ++pSrc; 94 continue; 95 96 case '\'': 97 ch = '\''; 98 /* FALLTHROUGH */ 99 100 case '\\': 101 ++pSrc; 102 break; 103 } 104 /* FALLTHROUGH */ 105 106 default: 107 *(pDest++) = ch; 108 } 109 } 110 111 done: 112 *ppDest = pDest; /* next spot for storing character */ 113 *ppSrc = (char const *) pSrc; /* char following closing quote */ 114} 115 116 117/*=export_func ao_string_tokenize 118 * 119 * what: tokenize an input string 120 * 121 * arg: + char const* + string + string to be tokenized + 122 * 123 * ret_type: token_list_t* 124 * ret_desc: pointer to a structure that lists each token 125 * 126 * doc: 127 * 128 * This function will convert one input string into a list of strings. 129 * The list of strings is derived by separating the input based on 130 * white space separation. However, if the input contains either single 131 * or double quote characters, then the text after that character up to 132 * a matching quote will become the string in the list. 133 * 134 * The returned pointer should be deallocated with @code{free(3C)} when 135 * are done using the data. The data are placed in a single block of 136 * allocated memory. Do not deallocate individual token/strings. 137 * 138 * The structure pointed to will contain at least these two fields: 139 * @table @samp 140 * @item tkn_ct 141 * The number of tokens found in the input string. 142 * @item tok_list 143 * An array of @code{tkn_ct + 1} pointers to substring tokens, with 144 * the last pointer set to NULL. 145 * @end table 146 * 147 * There are two types of quoted strings: single quoted (@code{'}) and 148 * double quoted (@code{"}). Singly quoted strings are fairly raw in that 149 * escape characters (@code{\\}) are simply another character, except when 150 * preceding the following characters: 151 * @example 152 * @code{\\} double backslashes reduce to one 153 * @code{'} incorporates the single quote into the string 154 * @code{\n} suppresses both the backslash and newline character 155 * @end example 156 * 157 * Double quote strings are formed according to the rules of string 158 * constants in ANSI-C programs. 159 * 160 * example: 161 * @example 162 * #include <stdlib.h> 163 * int ix; 164 * token_list_t* ptl = ao_string_tokenize( some_string ) 165 * for (ix = 0; ix < ptl->tkn_ct; ix++) 166 * do_something_with_tkn( ptl->tkn_list[ix] ); 167 * free( ptl ); 168 * @end example 169 * Note that everything is freed with the one call to @code{free(3C)}. 170 * 171 * err: 172 * NULL is returned and @code{errno} will be set to indicate the problem: 173 * @itemize @bullet 174 * @item 175 * @code{EINVAL} - There was an unterminated quoted string. 176 * @item 177 * @code{ENOENT} - The input string was empty. 178 * @item 179 * @code{ENOMEM} - There is not enough memory. 180 * @end itemize 181=*/ 182token_list_t* 183ao_string_tokenize( char const* str ) 184{ 185 int max_token_ct = 1; /* allow for trailing NUL on string */ 186 token_list_t* res; 187 188 if (str == NULL) goto bogus_str; 189 190 /* 191 * Trim leading white space. Use "ENOENT" and a NULL return to indicate 192 * an empty string was passed. 193 */ 194 while (IS_WHITESPACE_CHAR(*str)) str++; 195 if (*str == NUL) { 196 bogus_str: 197 errno = ENOENT; 198 return NULL; 199 } 200 201 /* 202 * Take an approximate count of tokens. If no quoted strings are used, 203 * it will be accurate. If quoted strings are used, it will be a little 204 * high and we'll squander the space for a few extra pointers. 205 */ 206 { 207 cc_t* pz = (cc_t*)str; 208 209 do { 210 max_token_ct++; 211 while (! IS_WHITESPACE_CHAR(*++pz)) 212 if (*pz == NUL) goto found_nul; 213 while (IS_WHITESPACE_CHAR(*pz)) pz++; 214 } while (*pz != NUL); 215 216 found_nul: 217 ; 218 } 219 220 res = malloc( sizeof(*res) + strlen(str) + (max_token_ct * sizeof(ch_t*)) ); 221 if (res == NULL) { 222 errno = ENOMEM; 223 return res; 224 } 225 226 /* 227 * Now copy each token into the output buffer. 228 */ 229 { 230 ch_t* pzDest = (ch_t*)(res->tkn_list + (max_token_ct + 1)); 231 res->tkn_ct = 0; 232 233 do { 234 res->tkn_list[ res->tkn_ct++ ] = pzDest; 235 for (;;) { 236 int ch = (ch_t)*str; 237 if (IS_WHITESPACE_CHAR(ch)) { 238 found_white_space: 239 while (IS_WHITESPACE_CHAR(*++str)) ; 240 break; 241 } 242 243 switch (ch) { 244 case '"': 245 copy_cooked( &pzDest, &str ); 246 if (str == NULL) { 247 free(res); 248 errno = EINVAL; 249 return NULL; 250 } 251 if (IS_WHITESPACE_CHAR(*str)) 252 goto found_white_space; 253 break; 254 255 case '\'': 256 copy_raw( &pzDest, &str ); 257 if (str == NULL) { 258 free(res); 259 errno = EINVAL; 260 return NULL; 261 } 262 if (IS_WHITESPACE_CHAR(*str)) 263 goto found_white_space; 264 break; 265 266 case NUL: 267 goto copy_done; 268 269 default: 270 str++; 271 *(pzDest++) = ch; 272 } 273 } copy_done:; 274 275 /* 276 * NUL terminate the last token and see if we have any more tokens. 277 */ 278 *(pzDest++) = NUL; 279 } while (*str != NUL); 280 281 res->tkn_list[ res->tkn_ct ] = NULL; 282 } 283 284 return res; 285} 286 287#ifdef TEST 288#include <stdio.h> 289#include <string.h> 290 291int 292main( int argc, char** argv ) 293{ 294 if (argc == 1) { 295 printf("USAGE: %s arg [ ... ]\n", *argv); 296 return 1; 297 } 298 while (--argc > 0) { 299 char* arg = *(++argv); 300 token_list_t* p = ao_string_tokenize( arg ); 301 if (p == NULL) { 302 printf( "Parsing string ``%s'' failed:\n\terrno %d (%s)\n", 303 arg, errno, strerror( errno )); 304 } else { 305 int ix = 0; 306 printf( "Parsed string ``%s''\ninto %d tokens:\n", arg, p->tkn_ct ); 307 do { 308 printf( " %3d: ``%s''\n", ix+1, p->tkn_list[ix] ); 309 } while (++ix < p->tkn_ct); 310 free(p); 311 } 312 } 313 return 0; 314} 315#endif 316 317/* 318 * Local Variables: 319 * mode: C 320 * c-file-style: "stroustrup" 321 * indent-tabs-mode: nil 322 * End: 323 * end of autoopts/tokenize.c */ 324