1/*	$NetBSD: str.c,v 1.30 2018/05/26 11:20:30 leot Exp $	*/
2
3/*-
4 * Copyright (c) 1991, 1993
5 *	The Regents of the University of California.  All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 * 3. Neither the name of the University nor the names of its contributors
16 *    may be used to endorse or promote products derived from this software
17 *    without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 */
31
32#include <sys/cdefs.h>
33#ifndef lint
34#if 0
35static char sccsid[] = "@(#)str.c	8.2 (Berkeley) 4/28/95";
36#endif
37__RCSID("$NetBSD: str.c,v 1.30 2018/05/26 11:20:30 leot Exp $");
38#endif /* not lint */
39
40#include <sys/types.h>
41
42#include <err.h>
43#include <errno.h>
44#include <stddef.h>
45#include <stdio.h>
46#include <stdlib.h>
47#include <string.h>
48#include <ctype.h>
49#include <assert.h>
50
51#include "extern.h"
52
53struct str {
54	enum { STRING1, STRING2 } which;
55	enum { EOS, INFINITE, NORMAL, RANGE, SEQUENCE, SET } state;
56	int cnt;			/* character count */
57	int lastch;			/* last character */
58	int equiv[2];			/* equivalence set */
59	int *set;			/* set of characters */
60	const char *str;		/* user's string */
61};
62
63static int backslash(STR *);
64static int bracket(STR *);
65static int c_class(const void *, const void *);
66static int *genclass(const char *, size_t);
67static void genequiv(STR *);
68static int genrange(STR *);
69static void genseq(STR *);
70
71STR *
72str_create(int whichstring, const char *txt)
73{
74	STR *s;
75
76	s = malloc(sizeof(*s));
77	if (s == NULL) {
78		err(1, "Out of memory");
79	}
80
81	s->which = whichstring == 2 ? STRING2 : STRING1;
82	s->state = NORMAL;
83	s->cnt = 0;
84	s->lastch = OOBCH;
85	s->equiv[0] = 0;
86	s->equiv[1] = OOBCH;
87	s->set = NULL;
88	s->str = txt;
89
90	return s;
91}
92
93void
94str_destroy(STR *s)
95{
96	if (s->set != NULL && s->set != s->equiv) {
97		free(s->set);
98	}
99	free(s);
100}
101
102int
103next(STR *s, int *ret)
104{
105	int ch;
106
107	switch (s->state) {
108	case EOS:
109		*ret = s->lastch;
110		return 0;
111	case INFINITE:
112		*ret = s->lastch;
113		return 1;
114	case NORMAL:
115		ch = (unsigned char)s->str[0];
116		switch (ch) {
117		case '\0':
118			s->state = EOS;
119			*ret = s->lastch;
120			return 0;
121		case '\\':
122			s->lastch = backslash(s);
123			break;
124		case '[':
125			if (bracket(s)) {
126				return next(s, ret);
127			}
128			/* FALLTHROUGH */
129		default:
130			++s->str;
131			s->lastch = ch;
132			break;
133		}
134
135		/* We can start a range at any time. */
136		if (s->str[0] == '-' && genrange(s)) {
137			return next(s, ret);
138		}
139		*ret = s->lastch;
140		return 1;
141	case RANGE:
142		if (s->cnt == 0) {
143			s->state = NORMAL;
144			return next(s, ret);
145		}
146		s->cnt--;
147		++s->lastch;
148		*ret = s->lastch;
149		return 1;
150	case SEQUENCE:
151		if (s->cnt == 0) {
152			s->state = NORMAL;
153			return next(s, ret);
154		}
155		s->cnt--;
156		*ret = s->lastch;
157		return 1;
158	case SET:
159		s->lastch = s->set[s->cnt++];
160		if (s->lastch == OOBCH) {
161			s->state = NORMAL;
162			if (s->set != s->equiv) {
163				free(s->set);
164			}
165			s->set = NULL;
166			return next(s, ret);
167		}
168		*ret = s->lastch;
169		return 1;
170	}
171	/* NOTREACHED */
172	assert(0);
173	*ret = s->lastch;
174	return 0;
175}
176
177static int
178bracket(STR *s)
179{
180	const char *p;
181	int *q;
182
183	switch (s->str[1]) {
184	case ':':				/* "[:class:]" */
185		if ((p = strstr(s->str + 2, ":]")) == NULL)
186			return 0;
187		s->str += 2;
188		q = genclass(s->str, p - s->str);
189		s->state = SET;
190		s->set = q;
191		s->cnt = 0;
192		s->str = p + 2;
193		return 1;
194	case '=':				/* "[=equiv=]" */
195		if ((p = strstr(s->str + 2, "=]")) == NULL)
196			return 0;
197		s->str += 2;
198		genequiv(s);
199		s->str = p + 2;
200		return 1;
201	default:				/* "[\###*n]" or "[#*n]" */
202		if ((p = strpbrk(s->str + 2, "*]")) == NULL)
203			return 0;
204		if (p[0] != '*' || strchr(p, ']') == NULL)
205			return 0;
206		s->str += 1;
207		genseq(s);
208		return 1;
209	}
210	/* NOTREACHED */
211}
212
213typedef struct {
214	const char *name;
215	int (*func)(int);
216} CLASS;
217
218static const CLASS classes[] = {
219	{ "alnum",  isalnum  },
220	{ "alpha",  isalpha  },
221	{ "blank",  isblank  },
222	{ "cntrl",  iscntrl  },
223	{ "digit",  isdigit  },
224	{ "graph",  isgraph  },
225	{ "lower",  islower  },
226	{ "print",  isprint  },
227	{ "punct",  ispunct  },
228	{ "space",  isspace  },
229	{ "upper",  isupper  },
230	{ "xdigit", isxdigit },
231};
232
233typedef struct {
234	const char *name;
235	size_t len;
236} CLASSKEY;
237
238static int *
239genclass(const char *class, size_t len)
240{
241	int ch;
242	const CLASS *cp;
243	CLASSKEY key;
244	int *p;
245	unsigned pos, num;
246
247	/* Find the class */
248	key.name = class;
249	key.len = len;
250	cp = bsearch(&key, classes, __arraycount(classes), sizeof(classes[0]),
251		     c_class);
252	if (cp == NULL) {
253		errx(1, "unknown class %.*s", (int)len, class);
254	}
255
256	/*
257	 * Figure out what characters are in the class
258	 */
259
260	num = NCHARS + 1;
261	p = malloc(num * sizeof(*p));
262	if (p == NULL) {
263		err(1, "malloc");
264	}
265
266	pos = 0;
267	for (ch = 0; ch < NCHARS; ch++) {
268		if (cp->func(ch)) {
269			p[pos++] = ch;
270		}
271	}
272
273	p[pos++] = OOBCH;
274	for (; pos < num; pos++) {
275		p[pos] = 0;
276	}
277
278	return p;
279}
280
281static int
282c_class(const void *av, const void *bv)
283{
284	const CLASSKEY *a = av;
285	const CLASS *b = bv;
286	size_t blen;
287	int r;
288
289	blen = strlen(b->name);
290	r = strncmp(a->name, b->name, a->len);
291	if (r != 0) {
292		return r;
293	}
294	if (a->len < blen) {
295		/* someone gave us a prefix of the right name */
296		return -1;
297	}
298	assert(a-> len == blen);
299	return 0;
300}
301
302/*
303 * English doesn't have any equivalence classes, so for now
304 * we just syntax check and grab the character.
305 */
306static void
307genequiv(STR *s)
308{
309	int ch;
310
311	ch = (unsigned char)s->str[0];
312	if (ch == '\\') {
313		s->equiv[0] = backslash(s);
314	} else {
315		s->equiv[0] = ch;
316		s->str++;
317	}
318	if (s->str[0] != '=') {
319		errx(1, "Misplaced equivalence equals sign");
320	}
321	s->str++;
322	if (s->str[0] != ']') {
323		errx(1, "Misplaced equivalence right bracket");
324	}
325	s->str++;
326
327	s->cnt = 0;
328	s->state = SET;
329	s->set = s->equiv;
330}
331
332static int
333genrange(STR *s)
334{
335	int stopval;
336	const char *savestart;
337
338	savestart = s->str++;
339	stopval = s->str[0] == '\\' ? backslash(s) : (unsigned char)*s->str++;
340	if (stopval < (unsigned char)s->lastch) {
341		s->str = savestart;
342		return 0;
343	}
344	s->cnt = stopval - s->lastch + 1;
345	s->state = RANGE;
346	--s->lastch;
347	return 1;
348}
349
350static void
351genseq(STR *s)
352{
353	char *ep;
354
355	if (s->which == STRING1) {
356		errx(1, "Sequences only valid in string2");
357	}
358
359	if (*s->str == '\\') {
360		s->lastch = backslash(s);
361	} else {
362		s->lastch = (unsigned char)*s->str++;
363	}
364	if (*s->str != '*') {
365		errx(1, "Misplaced sequence asterisk");
366	}
367
368	s->str++;
369	switch (s->str[0]) {
370	case '\\':
371		s->cnt = backslash(s);
372		break;
373	case ']':
374		s->cnt = 0;
375		++s->str;
376		break;
377	default:
378		if (isdigit((unsigned char)s->str[0])) {
379			s->cnt = strtol(s->str, &ep, 0);
380			if (*ep == ']') {
381				s->str = ep + 1;
382				break;
383			}
384		}
385		errx(1, "illegal sequence count");
386		/* NOTREACHED */
387	}
388
389	s->state = s->cnt ? SEQUENCE : INFINITE;
390}
391
392/*
393 * Translate \??? into a character.  Up to 3 octal digits, if no digits either
394 * an escape code or a literal character.
395 */
396static int
397backslash(STR *s)
398{
399	int ch, cnt, val;
400
401	cnt = val = 0;
402	for (;;) {
403		/* Consume the character we're already on. */
404		s->str++;
405
406		/* Look at the next character. */
407		ch = (unsigned char)s->str[0];
408		if (!isascii(ch) || !isdigit(ch)) {
409			break;
410		}
411		val = val * 8 + ch - '0';
412		if (++cnt == 3) {
413			/* Enough digits; consume this one and stop */
414			++s->str;
415			break;
416		}
417	}
418	if (cnt) {
419		/* We saw digits, so return their value */
420		if (val >= OOBCH)
421			errx(1, "Invalid octal character value");
422		return val;
423	}
424	if (ch == '\0') {
425		/* \<end> -> \ */
426		s->state = EOS;
427		return '\\';
428	}
429
430	/* Consume the escaped character */
431	s->str++;
432
433	switch (ch) {
434	case 'a':			/* escape characters */
435		return '\7';
436	case 'b':
437		return '\b';
438	case 'e':
439		return '\033';
440	case 'f':
441		return '\f';
442	case 'n':
443		return '\n';
444	case 'r':
445		return '\r';
446	case 't':
447		return '\t';
448	case 'v':
449		return '\13';
450	default:			/* \q -> q */
451		return ch;
452	}
453}
454