1/*	$NetBSD: tokenize.c,v 1.2 2010/12/04 23:08:34 christos Exp $	*/
2
3/*
4 *  This file defines the string_tokenize interface
5 * Time-stamp:      "2007-11-12 20:40:36 bkorb"
6 *
7 *  This file is part of AutoOpts, a companion to AutoGen.
8 *  AutoOpts is free software.
9 *  AutoOpts is copyright (c) 1992-2009 by Bruce Korb - all rights reserved
10 *
11 *  AutoOpts is available under any one of two licenses.  The license
12 *  in use must be one of these two and the choice is under the control
13 *  of the user of the license.
14 *
15 *   The GNU Lesser General Public License, version 3 or later
16 *      See the files "COPYING.lgplv3" and "COPYING.gplv3"
17 *
18 *   The Modified Berkeley Software Distribution License
19 *      See the file "COPYING.mbsd"
20 *
21 *  These files have the following md5sums:
22 *
23 *  43b91e8ca915626ed3818ffb1b71248b pkg/libopts/COPYING.gplv3
24 *  06a1a2e4760c90ea5e1dad8dfaac4d39 pkg/libopts/COPYING.lgplv3
25 *  66a5cedaf62c4b2637025f049f9b826f pkg/libopts/COPYING.mbsd
26 */
27
28#include <errno.h>
29#include <stdlib.h>
30
/*
 *  Shorthand names for the byte types used in this file:
 *  "cc_t" expands to "const unsigned char" and "ch_t" to "unsigned char".
 *  These are textual macros, not typedefs.
 */
#define cc_t   const unsigned char
#define ch_t   unsigned char
33
/* = = = START-STATIC-FORWARD = = = */
/* static forward declarations maintained by mk-fwd */
/* copies the body of a double-quoted string, processing backslash escapes */
static void
copy_cooked( ch_t** ppDest, char const ** ppSrc );

/* copies the body of a single-quoted string with minimal escape handling */
static void
copy_raw( ch_t** ppDest, char const ** ppSrc );
/* = = = END-STATIC-FORWARD = = = */
42
43static void
44copy_cooked( ch_t** ppDest, char const ** ppSrc )
45{
46    ch_t* pDest = (ch_t*)*ppDest;
47    const ch_t* pSrc  = (const ch_t*)(*ppSrc + 1);
48
49    for (;;) {
50        ch_t ch = *(pSrc++);
51        switch (ch) {
52        case NUL:   *ppSrc = NULL; return;
53        case '"':   goto done;
54        case '\\':
55            pSrc += ao_string_cook_escape_char( (const char*)pSrc, (char*)&ch, 0x7F );
56            if (ch == 0x7F)
57                break;
58            /* FALLTHROUGH */
59
60        default:
61            *(pDest++) = ch;
62        }
63    }
64
65 done:
66    *ppDest = (ch_t*)pDest; /* next spot for storing character */
67    *ppSrc  = (char const *)pSrc;  /* char following closing quote    */
68}
69
70
71static void
72copy_raw( ch_t** ppDest, char const ** ppSrc )
73{
74    ch_t* pDest = *ppDest;
75    cc_t* pSrc  = (cc_t*) (*ppSrc + 1);
76
77    for (;;) {
78        ch_t ch = *(pSrc++);
79        switch (ch) {
80        case NUL:   *ppSrc = NULL; return;
81        case '\'':  goto done;
82        case '\\':
83            /*
84             *  *Four* escapes are handled:  newline removal, escape char
85             *  quoting and apostrophe quoting
86             */
87            switch (*pSrc) {
88            case NUL:   *ppSrc = NULL; return;
89            case '\r':
90                if (*(++pSrc) == '\n')
91                    ++pSrc;
92                continue;
93
94            case '\n':
95                ++pSrc;
96                continue;
97
98            case '\'':
99                ch = '\'';
100                /* FALLTHROUGH */
101
102            case '\\':
103                ++pSrc;
104                break;
105            }
106            /* FALLTHROUGH */
107
108        default:
109            *(pDest++) = ch;
110        }
111    }
112
113 done:
114    *ppDest = pDest; /* next spot for storing character */
115    *ppSrc  = (char const *) pSrc;  /* char following closing quote    */
116}
117
118
119/*=export_func ao_string_tokenize
120 *
121 * what: tokenize an input string
122 *
123 * arg:  + char const* + string + string to be tokenized +
124 *
125 * ret_type:  token_list_t*
126 * ret_desc:  pointer to a structure that lists each token
127 *
128 * doc:
129 *
130 * This function will convert one input string into a list of strings.
131 * The list of strings is derived by separating the input based on
132 * white space separation.  However, if the input contains either single
133 * or double quote characters, then the text after that character up to
134 * a matching quote will become the string in the list.
135 *
 *  The returned pointer should be deallocated with @code{free(3C)} when
 *  you are done using the data.  The data are placed in a single block of
138 *  allocated memory.  Do not deallocate individual token/strings.
139 *
140 *  The structure pointed to will contain at least these two fields:
141 *  @table @samp
142 *  @item tkn_ct
143 *  The number of tokens found in the input string.
 *  @item tkn_list
145 *  An array of @code{tkn_ct + 1} pointers to substring tokens, with
146 *  the last pointer set to NULL.
147 *  @end table
148 *
149 * There are two types of quoted strings: single quoted (@code{'}) and
150 * double quoted (@code{"}).  Singly quoted strings are fairly raw in that
151 * escape characters (@code{\\}) are simply another character, except when
152 * preceding the following characters:
153 * @example
154 * @code{\\}  double backslashes reduce to one
155 * @code{'}   incorporates the single quote into the string
156 * @code{\n}  suppresses both the backslash and newline character
157 * @end example
158 *
159 * Double quote strings are formed according to the rules of string
160 * constants in ANSI-C programs.
161 *
162 * example:
163 * @example
164 *    #include <stdlib.h>
165 *    int ix;
 *    token_list_t* ptl = ao_string_tokenize( some_string );
167 *    for (ix = 0; ix < ptl->tkn_ct; ix++)
168 *       do_something_with_tkn( ptl->tkn_list[ix] );
169 *    free( ptl );
170 * @end example
171 * Note that everything is freed with the one call to @code{free(3C)}.
172 *
173 * err:
174 *  NULL is returned and @code{errno} will be set to indicate the problem:
175 *  @itemize @bullet
176 *  @item
177 *  @code{EINVAL} - There was an unterminated quoted string.
178 *  @item
179 *  @code{ENOENT} - The input string was empty.
180 *  @item
181 *  @code{ENOMEM} - There is not enough memory.
182 *  @end itemize
183=*/
184token_list_t*
185ao_string_tokenize( char const* str )
186{
187    int max_token_ct = 1; /* allow for trailing NUL on string */
188    token_list_t* res;
189
190    if (str == NULL)  goto bogus_str;
191
192    /*
193     *  Trim leading white space.  Use "ENOENT" and a NULL return to indicate
194     *  an empty string was passed.
195     */
196    while (IS_WHITESPACE_CHAR(*str))  str++;
197    if (*str == NUL) {
198    bogus_str:
199        errno = ENOENT;
200        return NULL;
201    }
202
203    /*
204     *  Take an approximate count of tokens.  If no quoted strings are used,
205     *  it will be accurate.  If quoted strings are used, it will be a little
206     *  high and we'll squander the space for a few extra pointers.
207     */
208    {
209        cc_t* pz = (cc_t*)str;
210
211        do {
212            max_token_ct++;
213            while (! IS_WHITESPACE_CHAR(*++pz))
214                if (*pz == NUL) goto found_nul;
215            while (IS_WHITESPACE_CHAR(*pz))  pz++;
216        } while (*pz != NUL);
217
218    found_nul:
219        ;
220    }
221
222    res = malloc( sizeof(*res) + strlen(str) + (max_token_ct * sizeof(ch_t*)) );
223    if (res == NULL) {
224        errno = ENOMEM;
225        return res;
226    }
227
228    /*
229     *  Now copy each token into the output buffer.
230     */
231    {
232        ch_t* pzDest = (ch_t*)(res->tkn_list + (max_token_ct + 1));
233        res->tkn_ct  = 0;
234
235        do  {
236            res->tkn_list[ res->tkn_ct++ ] = pzDest;
237            for (;;) {
238                int ch = (ch_t)*str;
239                if (IS_WHITESPACE_CHAR(ch)) {
240                found_white_space:
241                    while (IS_WHITESPACE_CHAR(*++str))  ;
242                    break;
243                }
244
245                switch (ch) {
246                case '"':
247                    copy_cooked( &pzDest, &str );
248                    if (str == NULL) {
249                        free(res);
250                        errno = EINVAL;
251                        return NULL;
252                    }
253                    if (IS_WHITESPACE_CHAR(*str))
254                        goto found_white_space;
255                    break;
256
257                case '\'':
258                    copy_raw( &pzDest, &str );
259                    if (str == NULL) {
260                        free(res);
261                        errno = EINVAL;
262                        return NULL;
263                    }
264                    if (IS_WHITESPACE_CHAR(*str))
265                        goto found_white_space;
266                    break;
267
268                case NUL:
269                    goto copy_done;
270
271                default:
272                    str++;
273                    *(pzDest++) = ch;
274                }
275            } copy_done:;
276
277            /*
278             * NUL terminate the last token and see if we have any more tokens.
279             */
280            *(pzDest++) = NUL;
281        } while (*str != NUL);
282
283        res->tkn_list[ res->tkn_ct ] = NULL;
284    }
285
286    return res;
287}
288
289#ifdef TEST
290#include <stdio.h>
291#include <string.h>
292
293int
294main( int argc, char** argv )
295{
296    if (argc == 1) {
297        printf("USAGE:  %s arg [ ... ]\n", *argv);
298        return 1;
299    }
300    while (--argc > 0) {
301        char* arg = *(++argv);
302        token_list_t* p = ao_string_tokenize( arg );
303        if (p == NULL) {
304            printf( "Parsing string ``%s'' failed:\n\terrno %d (%s)\n",
305                    arg, errno, strerror( errno ));
306        } else {
307            int ix = 0;
308            printf( "Parsed string ``%s''\ninto %d tokens:\n", arg, p->tkn_ct );
309            do {
310                printf( " %3d:  ``%s''\n", ix+1, p->tkn_list[ix] );
311            } while (++ix < p->tkn_ct);
312            free(p);
313        }
314    }
315    return 0;
316}
317#endif
318
319/*
320 * Local Variables:
321 * mode: C
322 * c-file-style: "stroustrup"
323 * indent-tabs-mode: nil
324 * End:
325 * end of autoopts/tokenize.c */
326