tokenize.c revision 294905
1/** \file tokenize.c
2 *
3 *  Tokenize a string, accommodating quoted strings.
4 *
5 * @addtogroup autoopts
6 * @{
7 */
8/*
9 *  This file defines the string_tokenize interface
10 *  This file is part of AutoOpts, a companion to AutoGen.
11 *  AutoOpts is free software.
12 *  AutoOpts is Copyright (C) 1992-2015 by Bruce Korb - all rights reserved
13 *
14 *  AutoOpts is available under any one of two licenses.  The license
15 *  in use must be one of these two and the choice is under the control
16 *  of the user of the license.
17 *
18 *   The GNU Lesser General Public License, version 3 or later
19 *      See the files "COPYING.lgplv3" and "COPYING.gplv3"
20 *
21 *   The Modified Berkeley Software Distribution License
22 *      See the file "COPYING.mbsd"
23 *
24 *  These files have the following sha256 sums:
25 *
26 *  8584710e9b04216a394078dc156b781d0b47e1729104d666658aecef8ee32e95  COPYING.gplv3
27 *  4379e7444a0e2ce2b12dd6f5a52a27a4d02d39d247901d3285c88cf0d37f477b  COPYING.lgplv3
28 *  13aa749a5b0a454917a944ed8fffc530b784f5ead522b1aacaf4ec8aa55a6239  COPYING.mbsd
29 */
30
31#include <errno.h>
32#include <stdlib.h>
33
34#define cc_t   const unsigned char
35#define ch_t   unsigned char
36
37/* = = = START-STATIC-FORWARD = = = */
38static void
39copy_cooked(ch_t ** ppDest, char const ** ppSrc);
40
41static void
42copy_raw(ch_t ** ppDest, char const ** ppSrc);
43
44static token_list_t *
45alloc_token_list(char const * str);
46/* = = = END-STATIC-FORWARD = = = */
47
48static void
49copy_cooked(ch_t ** ppDest, char const ** ppSrc)
50{
51    ch_t * pDest = (ch_t *)*ppDest;
52    const ch_t * pSrc  = (const ch_t *)(*ppSrc + 1);
53
54    for (;;) {
55        ch_t ch = *(pSrc++);
56        switch (ch) {
57        case NUL:   *ppSrc = NULL; return;
58        case '"':   goto done;
59        case '\\':
60            pSrc += ao_string_cook_escape_char((const char *)pSrc, (char *)&ch, 0x7F);
61            if (ch == 0x7F)
62                break;
63            /* FALLTHROUGH */
64
65        default:
66            *(pDest++) = ch;
67        }
68    }
69
70 done:
71    *ppDest = (ch_t *)pDest; /* next spot for storing character */
72    *ppSrc  = (char const *)pSrc;  /* char following closing quote    */
73}
74
75
76static void
77copy_raw(ch_t ** ppDest, char const ** ppSrc)
78{
79    ch_t * pDest = *ppDest;
80    cc_t * pSrc  = (cc_t *) (*ppSrc + 1);
81
82    for (;;) {
83        ch_t ch = *(pSrc++);
84        switch (ch) {
85        case NUL:   *ppSrc = NULL; return;
86        case '\'':  goto done;
87        case '\\':
88            /*
89             *  *Four* escapes are handled:  newline removal, escape char
90             *  quoting and apostrophe quoting
91             */
92            switch (*pSrc) {
93            case NUL:   *ppSrc = NULL; return;
94            case '\r':
95                if (*(++pSrc) == NL)
96                    ++pSrc;
97                continue;
98
99            case NL:
100                ++pSrc;
101                continue;
102
103            case '\'':
104                ch = '\'';
105                /* FALLTHROUGH */
106
107            case '\\':
108                ++pSrc;
109                break;
110            }
111            /* FALLTHROUGH */
112
113        default:
114            *(pDest++) = ch;
115        }
116    }
117
118 done:
119    *ppDest = pDest; /* next spot for storing character */
120    *ppSrc  = (char const *) pSrc;  /* char following closing quote    */
121}
122
123static token_list_t *
124alloc_token_list(char const * str)
125{
126    token_list_t * res;
127
128    int max_token_ct = 2; /* allow for trailing NULL pointer & NUL on string */
129
130    if (str == NULL) goto enoent_res;
131
132    /*
133     *  Trim leading white space.  Use "ENOENT" and a NULL return to indicate
134     *  an empty string was passed.
135     */
136    str = SPN_WHITESPACE_CHARS(str);
137    if (*str == NUL)  goto enoent_res;
138
139    /*
140     *  Take an approximate count of tokens.  If no quoted strings are used,
141     *  it will be accurate.  If quoted strings are used, it will be a little
142     *  high and we'll squander the space for a few extra pointers.
143     */
144    {
145        char const * pz = str;
146
147        do {
148            max_token_ct++;
149            pz = BRK_WHITESPACE_CHARS(pz+1);
150            pz = SPN_WHITESPACE_CHARS(pz);
151        } while (*pz != NUL);
152
153        res = malloc(sizeof(*res) + (size_t)(pz - str)
154                     + ((size_t)max_token_ct * sizeof(ch_t *)));
155    }
156
157    if (res == NULL)
158        errno = ENOMEM;
159    else res->tkn_list[0] = (ch_t *)(res->tkn_list + (max_token_ct - 1));
160
161    return res;
162
163    enoent_res:
164
165    errno = ENOENT;
166    return NULL;
167}
168
169/*=export_func ao_string_tokenize
170 *
171 * what: tokenize an input string
172 *
173 * arg:  + char const * + string + string to be tokenized +
174 *
175 * ret_type:  token_list_t *
176 * ret_desc:  pointer to a structure that lists each token
177 *
178 * doc:
179 *
180 * This function will convert one input string into a list of strings.
181 * The list of strings is derived by separating the input based on
182 * white space separation.  However, if the input contains either single
183 * or double quote characters, then the text after that character up to
184 * a matching quote will become the string in the list.
185 *
186 *  The returned pointer should be deallocated with @code{free(3C)} when
187 *  are done using the data.  The data are placed in a single block of
188 *  allocated memory.  Do not deallocate individual token/strings.
189 *
190 *  The structure pointed to will contain at least these two fields:
191 *  @table @samp
192 *  @item tkn_ct
193 *  The number of tokens found in the input string.
194 *  @item tok_list
195 *  An array of @code{tkn_ct + 1} pointers to substring tokens, with
196 *  the last pointer set to NULL.
197 *  @end table
198 *
199 * There are two types of quoted strings: single quoted (@code{'}) and
200 * double quoted (@code{"}).  Singly quoted strings are fairly raw in that
201 * escape characters (@code{\\}) are simply another character, except when
202 * preceding the following characters:
203 * @example
204 * @code{\\}  double backslashes reduce to one
205 * @code{'}   incorporates the single quote into the string
206 * @code{\n}  suppresses both the backslash and newline character
207 * @end example
208 *
209 * Double quote strings are formed according to the rules of string
210 * constants in ANSI-C programs.
211 *
212 * example:
213 * @example
214 *    #include <stdlib.h>
215 *    int ix;
216 *    token_list_t * ptl = ao_string_tokenize(some_string)
217 *    for (ix = 0; ix < ptl->tkn_ct; ix++)
218 *       do_something_with_tkn(ptl->tkn_list[ix]);
219 *    free(ptl);
220 * @end example
221 * Note that everything is freed with the one call to @code{free(3C)}.
222 *
223 * err:
224 *  NULL is returned and @code{errno} will be set to indicate the problem:
225 *  @itemize @bullet
226 *  @item
227 *  @code{EINVAL} - There was an unterminated quoted string.
228 *  @item
229 *  @code{ENOENT} - The input string was empty.
230 *  @item
231 *  @code{ENOMEM} - There is not enough memory.
232 *  @end itemize
233=*/
234token_list_t *
235ao_string_tokenize(char const * str)
236{
237    token_list_t * res = alloc_token_list(str);
238    ch_t * pzDest;
239
240    /*
241     *  Now copy each token into the output buffer.
242     */
243    if (res == NULL)
244        return res;
245
246    pzDest = (ch_t *)(res->tkn_list[0]);
247    res->tkn_ct  = 0;
248
249    do  {
250        res->tkn_list[ res->tkn_ct++ ] = pzDest;
251        for (;;) {
252            int ch = (ch_t)*str;
253            if (IS_WHITESPACE_CHAR(ch)) {
254            found_white_space:
255                str = SPN_WHITESPACE_CHARS(str+1);
256                break;
257            }
258
259            switch (ch) {
260            case '"':
261                copy_cooked(&pzDest, &str);
262                if (str == NULL) {
263                    free(res);
264                    errno = EINVAL;
265                    return NULL;
266                }
267                if (IS_WHITESPACE_CHAR(*str))
268                    goto found_white_space;
269                break;
270
271            case '\'':
272                copy_raw(&pzDest, &str);
273                if (str == NULL) {
274                    free(res);
275                    errno = EINVAL;
276                    return NULL;
277                }
278                if (IS_WHITESPACE_CHAR(*str))
279                    goto found_white_space;
280                break;
281
282            case NUL:
283                goto copy_done;
284
285            default:
286                str++;
287                *(pzDest++) = (unsigned char)ch;
288            }
289        } copy_done:;
290
291        /*
292         * NUL terminate the last token and see if we have any more tokens.
293         */
294        *(pzDest++) = NUL;
295    } while (*str != NUL);
296
297    res->tkn_list[ res->tkn_ct ] = NULL;
298
299    return res;
300}
301
#ifdef TEST
#include <stdio.h>
#include <string.h>

/**
 *  Test driver, compiled only when TEST is defined.  Each command line
 *  argument is tokenized and either the resulting tokens or the errno
 *  from the failure is printed.
 *
 *  @returns 1 when no arguments were supplied, otherwise 0.
 */
int
main(int argc, char ** argv)
{
    if (argc == 1) {
        printf("USAGE:  %s arg [ ... ]\n", *argv);
        return 1;
    }
    while (--argc > 0) {
        char * arg = *(++argv);
        token_list_t * p = ao_string_tokenize(arg);
        if (p == NULL) {
            printf("Parsing string ``%s'' failed:\n\terrno %d (%s)\n",
                   arg, errno, strerror(errno));
        } else {
            /*
             *  token_list_t is declared outside this file, so the count is
             *  converted to "int" once here; that keeps the "%d" below
             *  correct whatever integer type the field has.
             */
            int tkn_ct = (int)p->tkn_ct;
            int ix = 0;
            printf("Parsed string ``%s''\ninto %d tokens:\n", arg, tkn_ct);
            /* a successful parse always yields at least one token */
            do {
                printf(" %3d:  ``%s''\n", ix+1,
                       (char const *)p->tkn_list[ix]);
            } while (++ix < tkn_ct);
            free(p);
        }
    }
    return 0;
}
#endif
331
332/** @}
333 *
334 * Local Variables:
335 * mode: C
336 * c-file-style: "stroustrup"
337 * indent-tabs-mode: nil
338 * End:
339 * end of autoopts/tokenize.c */
340