Deleted Added
sdiff udiff text old ( 285612 ) new ( 294569 )
full compact
1/** \file tokenize.c
2 *
3 * Tokenize a string, accommodating quoted strings.
4 *
5 * @addtogroup autoopts
6 * @{
7 */
8/*
9 * This file defines the string_tokenize interface
10 * This file is part of AutoOpts, a companion to AutoGen.
11 * AutoOpts is free software.
12 * AutoOpts is Copyright (C) 1992-2015 by Bruce Korb - all rights reserved
13 *
14 * AutoOpts is available under any one of two licenses. The license
15 * in use must be one of these two and the choice is under the control
16 * of the user of the license.
17 *
18 * The GNU Lesser General Public License, version 3 or later
19 * See the files "COPYING.lgplv3" and "COPYING.gplv3"
20 *
21 * The Modified Berkeley Software Distribution License
22 * See the file "COPYING.mbsd"
23 *
24 * These files have the following sha256 sums:
25 *
26 * 8584710e9b04216a394078dc156b781d0b47e1729104d666658aecef8ee32e95 COPYING.gplv3
27 * 4379e7444a0e2ce2b12dd6f5a52a27a4d02d39d247901d3285c88cf0d37f477b COPYING.lgplv3
28 * 13aa749a5b0a454917a944ed8fffc530b784f5ead522b1aacaf4ec8aa55a6239 COPYING.mbsd
29 */
30
31#include <errno.h>
32#include <stdlib.h>
33
34#define cc_t const unsigned char
35#define ch_t unsigned char
36
37/* = = = START-STATIC-FORWARD = = = */
38static void
39copy_cooked(ch_t ** ppDest, char const ** ppSrc);
40
41static void
42copy_raw(ch_t ** ppDest, char const ** ppSrc);
43
44static token_list_t *
45alloc_token_list(char const * str);
46/* = = = END-STATIC-FORWARD = = = */
47
48static void
49copy_cooked(ch_t ** ppDest, char const ** ppSrc)
50{
51 ch_t * pDest = (ch_t *)*ppDest;
52 const ch_t * pSrc = (const ch_t *)(*ppSrc + 1);
53
54 for (;;) {
55 ch_t ch = *(pSrc++);
56 switch (ch) {
57 case NUL: *ppSrc = NULL; return;
58 case '"': goto done;
59 case '\\':
60 pSrc += ao_string_cook_escape_char((char *)pSrc, (char *)&ch, 0x7F);
61 if (ch == 0x7F)
62 break;
63 /* FALLTHROUGH */
64
65 default:
66 *(pDest++) = ch;
67 }
68 }
69
70 done:
71 *ppDest = (ch_t *)pDest; /* next spot for storing character */
72 *ppSrc = (char const *)pSrc; /* char following closing quote */
73}
74
75
76static void
77copy_raw(ch_t ** ppDest, char const ** ppSrc)
78{
79 ch_t * pDest = *ppDest;
80 cc_t * pSrc = (cc_t *) (*ppSrc + 1);
81
82 for (;;) {
83 ch_t ch = *(pSrc++);
84 switch (ch) {
85 case NUL: *ppSrc = NULL; return;
86 case '\'': goto done;
87 case '\\':
88 /*
89 * *Four* escapes are handled: newline removal, escape char
90 * quoting and apostrophe quoting
91 */
92 switch (*pSrc) {
93 case NUL: *ppSrc = NULL; return;
94 case '\r':
95 if (*(++pSrc) == NL)
96 ++pSrc;
97 continue;
98
99 case NL:
100 ++pSrc;
101 continue;
102
103 case '\'':
104 ch = '\'';
105 /* FALLTHROUGH */
106
107 case '\\':
108 ++pSrc;
109 break;
110 }
111 /* FALLTHROUGH */
112
113 default:
114 *(pDest++) = ch;
115 }
116 }
117
118 done:
119 *ppDest = pDest; /* next spot for storing character */
120 *ppSrc = (char const *) pSrc; /* char following closing quote */
121}
122
123static token_list_t *
124alloc_token_list(char const * str)
125{
126 token_list_t * res;
127
128 int max_token_ct = 2; /* allow for trailing NULL pointer & NUL on string */
129
130 if (str == NULL) goto enoent_res;
131
132 /*
133 * Trim leading white space. Use "ENOENT" and a NULL return to indicate
134 * an empty string was passed.
135 */
136 str = SPN_WHITESPACE_CHARS(str);
137 if (*str == NUL) goto enoent_res;
138
139 /*
140 * Take an approximate count of tokens. If no quoted strings are used,
141 * it will be accurate. If quoted strings are used, it will be a little
142 * high and we'll squander the space for a few extra pointers.
143 */
144 {
145 char const * pz = str;
146
147 do {
148 max_token_ct++;
149 pz = BRK_WHITESPACE_CHARS(pz+1);
150 pz = SPN_WHITESPACE_CHARS(pz);
151 } while (*pz != NUL);
152
153 res = malloc(sizeof(*res) + (size_t)(pz - str)
154 + ((size_t)max_token_ct * sizeof(ch_t *)));
155 }
156
157 if (res == NULL)
158 errno = ENOMEM;
159 else res->tkn_list[0] = (ch_t *)(res->tkn_list + (max_token_ct - 1));
160
161 return res;
162
163 enoent_res:
164
165 errno = ENOENT;
166 return NULL;
167}
168
169/*=export_func ao_string_tokenize
170 *
171 * what: tokenize an input string
172 *
173 * arg: + char const * + string + string to be tokenized +
174 *
175 * ret_type: token_list_t *
176 * ret_desc: pointer to a structure that lists each token
177 *
178 * doc:
179 *
180 * This function will convert one input string into a list of strings.
181 * The list of strings is derived by separating the input based on
182 * white space separation. However, if the input contains either single
183 * or double quote characters, then the text after that character up to
184 * a matching quote will become the string in the list.
185 *
186 * The returned pointer should be deallocated with @code{free(3C)} when
187 * are done using the data. The data are placed in a single block of
188 * allocated memory. Do not deallocate individual token/strings.
189 *
190 * The structure pointed to will contain at least these two fields:
191 * @table @samp
192 * @item tkn_ct
193 * The number of tokens found in the input string.
194 * @item tok_list
195 * An array of @code{tkn_ct + 1} pointers to substring tokens, with
196 * the last pointer set to NULL.
197 * @end table
198 *
199 * There are two types of quoted strings: single quoted (@code{'}) and
200 * double quoted (@code{"}). Singly quoted strings are fairly raw in that
201 * escape characters (@code{\\}) are simply another character, except when
202 * preceding the following characters:
203 * @example
204 * @code{\\} double backslashes reduce to one
205 * @code{'} incorporates the single quote into the string
206 * @code{\n} suppresses both the backslash and newline character
207 * @end example
208 *
209 * Double quote strings are formed according to the rules of string
210 * constants in ANSI-C programs.
211 *
212 * example:
213 * @example
214 * #include <stdlib.h>
215 * int ix;
216 * token_list_t * ptl = ao_string_tokenize(some_string)
217 * for (ix = 0; ix < ptl->tkn_ct; ix++)
218 * do_something_with_tkn(ptl->tkn_list[ix]);
219 * free(ptl);
220 * @end example
221 * Note that everything is freed with the one call to @code{free(3C)}.
222 *
223 * err:
224 * NULL is returned and @code{errno} will be set to indicate the problem:
225 * @itemize @bullet
226 * @item
227 * @code{EINVAL} - There was an unterminated quoted string.
228 * @item
229 * @code{ENOENT} - The input string was empty.
230 * @item
231 * @code{ENOMEM} - There is not enough memory.
232 * @end itemize
233=*/
234token_list_t *
235ao_string_tokenize(char const * str)
236{
237 token_list_t * res = alloc_token_list(str);
238 ch_t * pzDest;
239
240 /*
241 * Now copy each token into the output buffer.
242 */
243 if (res == NULL)
244 return res;
245
246 pzDest = (ch_t *)(res->tkn_list[0]);
247 res->tkn_ct = 0;
248
249 do {
250 res->tkn_list[ res->tkn_ct++ ] = pzDest;
251 for (;;) {
252 int ch = (ch_t)*str;
253 if (IS_WHITESPACE_CHAR(ch)) {
254 found_white_space:
255 str = SPN_WHITESPACE_CHARS(str+1);
256 break;
257 }
258
259 switch (ch) {
260 case '"':
261 copy_cooked(&pzDest, &str);
262 if (str == NULL) {
263 free(res);
264 errno = EINVAL;
265 return NULL;
266 }
267 if (IS_WHITESPACE_CHAR(*str))
268 goto found_white_space;
269 break;
270
271 case '\'':
272 copy_raw(&pzDest, &str);
273 if (str == NULL) {
274 free(res);
275 errno = EINVAL;
276 return NULL;
277 }
278 if (IS_WHITESPACE_CHAR(*str))
279 goto found_white_space;
280 break;
281
282 case NUL:
283 goto copy_done;
284
285 default:
286 str++;
287 *(pzDest++) = (unsigned char)ch;
288 }
289 } copy_done:;
290
291 /*
292 * NUL terminate the last token and see if we have any more tokens.
293 */
294 *(pzDest++) = NUL;
295 } while (*str != NUL);
296
297 res->tkn_list[ res->tkn_ct ] = NULL;
298
299 return res;
300}
301
302#ifdef TEST
303#include <stdio.h>
304#include <string.h>
305
306int
307main(int argc, char ** argv)
308{
309 if (argc == 1) {
310 printf("USAGE: %s arg [ ... ]\n", *argv);
311 return 1;
312 }
313 while (--argc > 0) {
314 char * arg = *(++argv);
315 token_list_t * p = ao_string_tokenize(arg);
316 if (p == NULL) {
317 printf("Parsing string ``%s'' failed:\n\terrno %d (%s)\n",
318 arg, errno, strerror(errno));
319 } else {
320 int ix = 0;
321 printf("Parsed string ``%s''\ninto %d tokens:\n", arg, p->tkn_ct);
322 do {
323 printf(" %3d: ``%s''\n", ix+1, p->tkn_list[ix]);
324 } while (++ix < p->tkn_ct);
325 free(p);
326 }
327 }
328 return 0;
329}
330#endif
331
332/** @}
333 *
334 * Local Variables:
335 * mode: C
336 * c-file-style: "stroustrup"
337 * indent-tabs-mode: nil
338 * End:
339 * end of autoopts/tokenize.c */