1/*
2 *  This file defines the string_tokenize interface
3 * Time-stamp:      "2007-11-12 20:40:36 bkorb"
4 *
5 *  This file is part of AutoOpts, a companion to AutoGen.
6 *  AutoOpts is free software.
7 *  AutoOpts is copyright (c) 1992-2009 by Bruce Korb - all rights reserved
8 *
9 *  AutoOpts is available under any one of two licenses.  The license
10 *  in use must be one of these two and the choice is under the control
11 *  of the user of the license.
12 *
13 *   The GNU Lesser General Public License, version 3 or later
14 *      See the files "COPYING.lgplv3" and "COPYING.gplv3"
15 *
16 *   The Modified Berkeley Software Distribution License
17 *      See the file "COPYING.mbsd"
18 *
19 *  These files have the following md5sums:
20 *
21 *  43b91e8ca915626ed3818ffb1b71248b pkg/libopts/COPYING.gplv3
22 *  06a1a2e4760c90ea5e1dad8dfaac4d39 pkg/libopts/COPYING.lgplv3
23 *  66a5cedaf62c4b2637025f049f9b826f pkg/libopts/COPYING.mbsd
24 */
25
26#include <errno.h>
27#include <stdlib.h>
28
/*
 *  Abbreviations for the unsigned character types used in this file:
 *  cc_t is a read-only unsigned character, ch_t a writable one.
 */
#define cc_t   const unsigned char
#define ch_t   unsigned char
31
32/* = = = START-STATIC-FORWARD = = = */
33/* static forward declarations maintained by mk-fwd */
34static void
35copy_cooked( ch_t** ppDest, char const ** ppSrc );
36
37static void
38copy_raw( ch_t** ppDest, char const ** ppSrc );
39/* = = = END-STATIC-FORWARD = = = */
40
41static void
42copy_cooked( ch_t** ppDest, char const ** ppSrc )
43{
44    ch_t* pDest = (ch_t*)*ppDest;
45    const ch_t* pSrc  = (const ch_t*)(*ppSrc + 1);
46
47    for (;;) {
48        ch_t ch = *(pSrc++);
49        switch (ch) {
50        case NUL:   *ppSrc = NULL; return;
51        case '"':   goto done;
52        case '\\':
53            pSrc += ao_string_cook_escape_char( (char*)pSrc, (char*)&ch, 0x7F );
54            if (ch == 0x7F)
55                break;
56            /* FALLTHROUGH */
57
58        default:
59            *(pDest++) = ch;
60        }
61    }
62
63 done:
64    *ppDest = (ch_t*)pDest; /* next spot for storing character */
65    *ppSrc  = (char const *)pSrc;  /* char following closing quote    */
66}
67
68
69static void
70copy_raw( ch_t** ppDest, char const ** ppSrc )
71{
72    ch_t* pDest = *ppDest;
73    cc_t* pSrc  = (cc_t*) (*ppSrc + 1);
74
75    for (;;) {
76        ch_t ch = *(pSrc++);
77        switch (ch) {
78        case NUL:   *ppSrc = NULL; return;
79        case '\'':  goto done;
80        case '\\':
81            /*
82             *  *Four* escapes are handled:  newline removal, escape char
83             *  quoting and apostrophe quoting
84             */
85            switch (*pSrc) {
86            case NUL:   *ppSrc = NULL; return;
87            case '\r':
88                if (*(++pSrc) == '\n')
89                    ++pSrc;
90                continue;
91
92            case '\n':
93                ++pSrc;
94                continue;
95
96            case '\'':
97                ch = '\'';
98                /* FALLTHROUGH */
99
100            case '\\':
101                ++pSrc;
102                break;
103            }
104            /* FALLTHROUGH */
105
106        default:
107            *(pDest++) = ch;
108        }
109    }
110
111 done:
112    *ppDest = pDest; /* next spot for storing character */
113    *ppSrc  = (char const *) pSrc;  /* char following closing quote    */
114}
115
116
117/*=export_func ao_string_tokenize
118 *
119 * what: tokenize an input string
120 *
121 * arg:  + char const* + string + string to be tokenized +
122 *
123 * ret_type:  token_list_t*
124 * ret_desc:  pointer to a structure that lists each token
125 *
126 * doc:
127 *
128 * This function will convert one input string into a list of strings.
129 * The list of strings is derived by separating the input based on
130 * white space separation.  However, if the input contains either single
131 * or double quote characters, then the text after that character up to
132 * a matching quote will become the string in the list.
133 *
 *  The returned pointer should be deallocated with @code{free(3C)} when
 *  you are done using the data.  The data are placed in a single block of
136 *  allocated memory.  Do not deallocate individual token/strings.
137 *
138 *  The structure pointed to will contain at least these two fields:
139 *  @table @samp
140 *  @item tkn_ct
141 *  The number of tokens found in the input string.
 *  @item tkn_list
143 *  An array of @code{tkn_ct + 1} pointers to substring tokens, with
144 *  the last pointer set to NULL.
145 *  @end table
146 *
147 * There are two types of quoted strings: single quoted (@code{'}) and
148 * double quoted (@code{"}).  Singly quoted strings are fairly raw in that
149 * escape characters (@code{\\}) are simply another character, except when
150 * preceding the following characters:
151 * @example
152 * @code{\\}  double backslashes reduce to one
153 * @code{'}   incorporates the single quote into the string
154 * @code{\n}  suppresses both the backslash and newline character
155 * @end example
156 *
157 * Double quote strings are formed according to the rules of string
158 * constants in ANSI-C programs.
159 *
160 * example:
161 * @example
162 *    #include <stdlib.h>
163 *    int ix;
 *    token_list_t* ptl = ao_string_tokenize( some_string );
165 *    for (ix = 0; ix < ptl->tkn_ct; ix++)
166 *       do_something_with_tkn( ptl->tkn_list[ix] );
167 *    free( ptl );
168 * @end example
169 * Note that everything is freed with the one call to @code{free(3C)}.
170 *
171 * err:
172 *  NULL is returned and @code{errno} will be set to indicate the problem:
173 *  @itemize @bullet
174 *  @item
175 *  @code{EINVAL} - There was an unterminated quoted string.
176 *  @item
 *  @code{ENOENT} - The input string was NULL, empty or all white space.
178 *  @item
179 *  @code{ENOMEM} - There is not enough memory.
180 *  @end itemize
181=*/
182token_list_t*
183ao_string_tokenize( char const* str )
184{
185    int max_token_ct = 1; /* allow for trailing NUL on string */
186    token_list_t* res;
187
188    if (str == NULL)  goto bogus_str;
189
190    /*
191     *  Trim leading white space.  Use "ENOENT" and a NULL return to indicate
192     *  an empty string was passed.
193     */
194    while (IS_WHITESPACE_CHAR(*str))  str++;
195    if (*str == NUL) {
196    bogus_str:
197        errno = ENOENT;
198        return NULL;
199    }
200
201    /*
202     *  Take an approximate count of tokens.  If no quoted strings are used,
203     *  it will be accurate.  If quoted strings are used, it will be a little
204     *  high and we'll squander the space for a few extra pointers.
205     */
206    {
207        cc_t* pz = (cc_t*)str;
208
209        do {
210            max_token_ct++;
211            while (! IS_WHITESPACE_CHAR(*++pz))
212                if (*pz == NUL) goto found_nul;
213            while (IS_WHITESPACE_CHAR(*pz))  pz++;
214        } while (*pz != NUL);
215
216    found_nul:
217        ;
218    }
219
220    res = malloc( sizeof(*res) + strlen(str) + (max_token_ct * sizeof(ch_t*)) );
221    if (res == NULL) {
222        errno = ENOMEM;
223        return res;
224    }
225
226    /*
227     *  Now copy each token into the output buffer.
228     */
229    {
230        ch_t* pzDest = (ch_t*)(res->tkn_list + (max_token_ct + 1));
231        res->tkn_ct  = 0;
232
233        do  {
234            res->tkn_list[ res->tkn_ct++ ] = pzDest;
235            for (;;) {
236                int ch = (ch_t)*str;
237                if (IS_WHITESPACE_CHAR(ch)) {
238                found_white_space:
239                    while (IS_WHITESPACE_CHAR(*++str))  ;
240                    break;
241                }
242
243                switch (ch) {
244                case '"':
245                    copy_cooked( &pzDest, &str );
246                    if (str == NULL) {
247                        free(res);
248                        errno = EINVAL;
249                        return NULL;
250                    }
251                    if (IS_WHITESPACE_CHAR(*str))
252                        goto found_white_space;
253                    break;
254
255                case '\'':
256                    copy_raw( &pzDest, &str );
257                    if (str == NULL) {
258                        free(res);
259                        errno = EINVAL;
260                        return NULL;
261                    }
262                    if (IS_WHITESPACE_CHAR(*str))
263                        goto found_white_space;
264                    break;
265
266                case NUL:
267                    goto copy_done;
268
269                default:
270                    str++;
271                    *(pzDest++) = ch;
272                }
273            } copy_done:;
274
275            /*
276             * NUL terminate the last token and see if we have any more tokens.
277             */
278            *(pzDest++) = NUL;
279        } while (*str != NUL);
280
281        res->tkn_list[ res->tkn_ct ] = NULL;
282    }
283
284    return res;
285}
286
287#ifdef TEST
288#include <stdio.h>
289#include <string.h>
290
291int
292main( int argc, char** argv )
293{
294    if (argc == 1) {
295        printf("USAGE:  %s arg [ ... ]\n", *argv);
296        return 1;
297    }
298    while (--argc > 0) {
299        char* arg = *(++argv);
300        token_list_t* p = ao_string_tokenize( arg );
301        if (p == NULL) {
302            printf( "Parsing string ``%s'' failed:\n\terrno %d (%s)\n",
303                    arg, errno, strerror( errno ));
304        } else {
305            int ix = 0;
306            printf( "Parsed string ``%s''\ninto %d tokens:\n", arg, p->tkn_ct );
307            do {
308                printf( " %3d:  ``%s''\n", ix+1, p->tkn_list[ix] );
309            } while (++ix < p->tkn_ct);
310            free(p);
311        }
312    }
313    return 0;
314}
315#endif
316
317/*
318 * Local Variables:
319 * mode: C
320 * c-file-style: "stroustrup"
321 * indent-tabs-mode: nil
322 * End:
323 * end of autoopts/tokenize.c */
324