1/** \file tokenize.c
2 *
3 *  Tokenize a string, accommodating quoted strings.
4 *
5 * @addtogroup autoopts
6 * @{
7 */
8/*
9 *  This file defines the string_tokenize interface
10 *  This file is part of AutoOpts, a companion to AutoGen.
11 *  AutoOpts is free software.
12 *  AutoOpts is Copyright (C) 1992-2018 by Bruce Korb - all rights reserved
13 *
14 *  AutoOpts is available under any one of two licenses.  The license
15 *  in use must be one of these two and the choice is under the control
16 *  of the user of the license.
17 *
18 *   The GNU Lesser General Public License, version 3 or later
19 *      See the files "COPYING.lgplv3" and "COPYING.gplv3"
20 *
21 *   The Modified Berkeley Software Distribution License
22 *      See the file "COPYING.mbsd"
23 *
24 *  These files have the following sha256 sums:
25 *
26 *  8584710e9b04216a394078dc156b781d0b47e1729104d666658aecef8ee32e95  COPYING.gplv3
27 *  4379e7444a0e2ce2b12dd6f5a52a27a4d02d39d247901d3285c88cf0d37f477b  COPYING.lgplv3
28 *  13aa749a5b0a454917a944ed8fffc530b784f5ead522b1aacaf4ec8aa55a6239  COPYING.mbsd
29 */
30
31static void
32copy_cooked(ch_t ** ppDest, char const ** ppSrc)
33{
34    ch_t * pDest = (ch_t *)*ppDest;
35    const ch_t * pSrc  = (const ch_t *)(*ppSrc + 1);
36
37    for (;;) {
38        ch_t ch = *(pSrc++);
39        switch (ch) {
40        case NUL:   *ppSrc = NULL; return;
41        case '"':   goto done;
42        case '\\':
43            pSrc += ao_string_cook_escape_char((char *)pSrc, (char *)&ch, 0x7F);
44            if (ch == 0x7F)
45                break;
46            /* FALLTHROUGH */
47
48        default:
49            *(pDest++) = ch;
50        }
51    }
52
53 done:
54    *ppDest = (ch_t *)pDest; /* next spot for storing character */
55    *ppSrc  = (char const *)pSrc;  /* char following closing quote    */
56}
57
58
59static void
60copy_raw(ch_t ** ppDest, char const ** ppSrc)
61{
62    ch_t * pDest = *ppDest;
63    cc_t * pSrc  = (cc_t *) (*ppSrc + 1);
64
65    for (;;) {
66        ch_t ch = *(pSrc++);
67        switch (ch) {
68        case NUL:   *ppSrc = NULL; return;
69        case '\'':  goto done;
70        case '\\':
71            /*
72             *  *Four* escapes are handled:  newline removal, escape char
73             *  quoting and apostrophe quoting
74             */
75            switch (*pSrc) {
76            case NUL:   *ppSrc = NULL; return;
77            case '\r':
78                if (*(++pSrc) == NL)
79                    ++pSrc;
80                continue;
81
82            case NL:
83                ++pSrc;
84                continue;
85
86            case '\'':
87                ch = '\'';
88                /* FALLTHROUGH */
89
90            case '\\':
91                ++pSrc;
92                break;
93            }
94            /* FALLTHROUGH */
95
96        default:
97            *(pDest++) = ch;
98        }
99    }
100
101 done:
102    *ppDest = pDest; /* next spot for storing character */
103    *ppSrc  = (char const *) pSrc;  /* char following closing quote    */
104}
105
106static token_list_t *
107alloc_token_list(char const * str)
108{
109    token_list_t * res;
110
111    int max_token_ct = 2; /* allow for trailing NULL pointer & NUL on string */
112
113    if (str == NULL) goto enoent_res;
114
115    /*
116     *  Trim leading white space.  Use "ENOENT" and a NULL return to indicate
117     *  an empty string was passed.
118     */
119    str = SPN_WHITESPACE_CHARS(str);
120    if (*str == NUL)  goto enoent_res;
121
122    /*
123     *  Take an approximate count of tokens.  If no quoted strings are used,
124     *  it will be accurate.  If quoted strings are used, it will be a little
125     *  high and we'll squander the space for a few extra pointers.
126     */
127    {
128        char const * pz = str;
129
130        do {
131            max_token_ct++;
132            pz = BRK_WHITESPACE_CHARS(pz+1);
133            pz = SPN_WHITESPACE_CHARS(pz);
134        } while (*pz != NUL);
135
136        res = malloc(sizeof(*res) + (size_t)(pz - str)
137                     + ((size_t)max_token_ct * sizeof(ch_t *)));
138    }
139
140    if (res == NULL)
141        errno = ENOMEM;
142    else res->tkn_list[0] = (ch_t *)(res->tkn_list + (max_token_ct - 1));
143
144    return res;
145
146    enoent_res:
147
148    errno = ENOENT;
149    return NULL;
150}
151
152/*=export_func ao_string_tokenize
153 *
154 * what: tokenize an input string
155 *
156 * arg:  + char const * + string + string to be tokenized +
157 *
158 * ret_type:  token_list_t *
159 * ret_desc:  pointer to a structure that lists each token
160 *
161 * doc:
162 *
163 * This function will convert one input string into a list of strings.
164 * The list of strings is derived by separating the input based on
165 * white space separation.  However, if the input contains either single
166 * or double quote characters, then the text after that character up to
167 * a matching quote will become the string in the list.
168 *
 *  The returned pointer should be deallocated with @code{free(3C)} when
 *  you are done using the data.  The data are placed in a single block of
171 *  allocated memory.  Do not deallocate individual token/strings.
172 *
173 *  The structure pointed to will contain at least these two fields:
174 *  @table @samp
175 *  @item tkn_ct
176 *  The number of tokens found in the input string.
 *  @item tkn_list
178 *  An array of @code{tkn_ct + 1} pointers to substring tokens, with
179 *  the last pointer set to NULL.
180 *  @end table
181 *
182 * There are two types of quoted strings: single quoted (@code{'}) and
183 * double quoted (@code{"}).  Singly quoted strings are fairly raw in that
184 * escape characters (@code{\\}) are simply another character, except when
185 * preceding the following characters:
186 * @example
187 * @code{\\}  double backslashes reduce to one
188 * @code{'}   incorporates the single quote into the string
189 * @code{\n}  suppresses both the backslash and newline character
190 * @end example
191 *
192 * Double quote strings are formed according to the rules of string
193 * constants in ANSI-C programs.
194 *
195 * example:
196 * @example
197 *    #include <stdlib.h>
198 *    int ix;
 *    token_list_t * ptl = ao_string_tokenize(some_string);
200 *    for (ix = 0; ix < ptl->tkn_ct; ix++)
201 *       do_something_with_tkn(ptl->tkn_list[ix]);
202 *    free(ptl);
203 * @end example
204 * Note that everything is freed with the one call to @code{free(3C)}.
205 *
206 * err:
207 *  NULL is returned and @code{errno} will be set to indicate the problem:
208 *  @itemize @bullet
209 *  @item
210 *  @code{EINVAL} - There was an unterminated quoted string.
211 *  @item
212 *  @code{ENOENT} - The input string was empty.
213 *  @item
214 *  @code{ENOMEM} - There is not enough memory.
215 *  @end itemize
216=*/
217token_list_t *
218ao_string_tokenize(char const * str)
219{
220    token_list_t * res = alloc_token_list(str);
221    ch_t * pzDest;
222
223    /*
224     *  Now copy each token into the output buffer.
225     */
226    if (res == NULL)
227        return res;
228
229    pzDest = (ch_t *)(res->tkn_list[0]);
230    res->tkn_ct  = 0;
231
232    do  {
233        res->tkn_list[ res->tkn_ct++ ] = pzDest;
234        for (;;) {
235            int ch = (ch_t)*str;
236            if (IS_WHITESPACE_CHAR(ch)) {
237            found_white_space:
238                str = SPN_WHITESPACE_CHARS(str+1);
239                break;
240            }
241
242            switch (ch) {
243            case '"':
244                copy_cooked(&pzDest, &str);
245                if (str == NULL) {
246                    free(res);
247                    errno = EINVAL;
248                    return NULL;
249                }
250                if (IS_WHITESPACE_CHAR(*str))
251                    goto found_white_space;
252                break;
253
254            case '\'':
255                copy_raw(&pzDest, &str);
256                if (str == NULL) {
257                    free(res);
258                    errno = EINVAL;
259                    return NULL;
260                }
261                if (IS_WHITESPACE_CHAR(*str))
262                    goto found_white_space;
263                break;
264
265            case NUL:
266                goto copy_done;
267
268            default:
269                str++;
270                *(pzDest++) = (unsigned char)ch;
271            }
272        } copy_done:;
273
274        /*
275         * NUL terminate the last token and see if we have any more tokens.
276         */
277        *(pzDest++) = NUL;
278    } while (*str != NUL);
279
280    res->tkn_list[ res->tkn_ct ] = NULL;
281
282    return res;
283}
284
#ifdef TEST
#include <stdio.h>
#include <string.h>

/**
 * Test driver: tokenize every command line argument and print either the
 * resulting tokens or the errno value describing the failure.
 *
 * @returns 1 when no arguments were supplied (a usage line is printed),
 *          otherwise 0.
 */
int
main(int argc, char ** argv)
{
    int arg_ix;

    if (argc == 1) {
        printf("USAGE:  %s arg [ ... ]\n", argv[0]);
        return 1;
    }

    for (arg_ix = 1; arg_ix < argc; arg_ix++) {
        char *         text = argv[arg_ix];
        token_list_t * tkns = ao_string_tokenize(text);
        int            tkn_ix = 0;

        if (tkns == NULL) {
            printf("Parsing string ``%s'' failed:\n\terrno %d (%s)\n",
                   text, errno, strerror(errno));
            continue;
        }

        printf("Parsed string ``%s''\ninto %d tokens:\n", text, tkns->tkn_ct);

        /* the first entry is always printed, as in a do/while loop */
        do {
            printf(" %3d:  ``%s''\n", tkn_ix + 1, tkns->tkn_list[tkn_ix]);
            tkn_ix++;
        } while (tkn_ix < tkns->tkn_ct);

        /* pointers and token text live in the one block */
        free(tkns);
    }

    return 0;
}
#endif
314
315/** @}
316 *
317 * Local Variables:
318 * mode: C
319 * c-file-style: "stroustrup"
320 * indent-tabs-mode: nil
321 * End:
322 * end of autoopts/tokenize.c */
323