1/*	$OpenBSD: str.c,v 1.15 2023/05/04 16:08:29 tb Exp $	*/
2/*	$NetBSD: str.c,v 1.7 1995/08/31 22:13:47 jtc Exp $	*/
3
4/*-
5 * Copyright (c) 1991, 1993
6 *	The Regents of the University of California.  All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 3. Neither the name of the University nor the names of its contributors
17 *    may be used to endorse or promote products derived from this software
18 *    without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 */
32
33#include <sys/types.h>
34
35#include <assert.h>
36#include <errno.h>
37#include <stddef.h>
38#include <stdio.h>
39#include <stdlib.h>
40#include <string.h>
41#include <ctype.h>
42#include <err.h>
43
44#include "extern.h"
45
46static int	backslash(STR *);
47static int	bracket(STR *);
48static int	c_class(const void *, const void *);
49static void	genclass(STR *);
50static void	genequiv(STR *);
51static int	genrange(STR *);
52static void	genseq(STR *);
53
54int
55next(STR *s)
56{
57	int ch;
58
59	switch (s->state) {
60	case EOS:
61		return (0);
62	case INFINITE:
63		return (1);
64	case NORMAL:
65		switch (ch = *s->str) {
66		case '\0':
67			s->state = EOS;
68			return (0);
69		case '\\':
70			s->lastch = backslash(s);
71			break;
72		case '[':
73			if (bracket(s))
74				return (next(s));
75			/* FALLTHROUGH */
76		default:
77			++s->str;
78			s->lastch = ch;
79			break;
80		}
81
82		/* We can start a range at any time. */
83		if (s->str[0] == '-' && genrange(s))
84			return (next(s));
85		return (1);
86	case RANGE:
87		if (s->cnt-- == 0) {
88			s->state = NORMAL;
89			return (next(s));
90		}
91		++s->lastch;
92		return (1);
93	case SEQUENCE:
94		if (s->cnt-- == 0) {
95			s->state = NORMAL;
96			return (next(s));
97		}
98		return (1);
99	case SET:
100		if ((s->lastch = s->set[s->cnt++]) == OOBCH) {
101			s->state = NORMAL;
102			return (next(s));
103		}
104		return (1);
105	default:
106		return 0;
107	}
108	/* NOTREACHED */
109}
110
111static int
112bracket(STR *s)
113{
114	char *p;
115
116	switch (s->str[1]) {
117	case ':':				/* "[:class:]" */
118		if ((p = strstr((char *)s->str + 2, ":]")) == NULL)
119			return (0);
120		*p = '\0';
121		s->str += 2;
122		genclass(s);
123		s->str = (unsigned char *)p + 2;
124		return (1);
125	case '=':				/* "[=equiv=]" */
126		if ((p = strstr((char *)s->str + 2, "=]")) == NULL)
127			return (0);
128		s->str += 2;
129		genequiv(s);
130		return (1);
131	default:				/* "[\###*n]" or "[#*n]" */
132		if ((p = strpbrk((char *)s->str + 2, "*]")) == NULL)
133			return (0);
134		if (p[0] != '*' || strchr(p, ']') == NULL)
135			return (0);
136		s->str += 1;
137		genseq(s);
138		return (1);
139	}
140	/* NOTREACHED */
141}
142
143typedef struct {
144	char *name;
145	int (*func)(int);
146	int *set;
147} CLASS;
148
149static CLASS classes[] = {
150	{ "alnum",  isalnum,  },
151	{ "alpha",  isalpha,  },
152	{ "blank",  isblank,  },
153	{ "cntrl",  iscntrl,  },
154	{ "digit",  isdigit,  },
155	{ "graph",  isgraph,  },
156	{ "lower",  islower,  },
157	{ "print",  isprint,  },
158	{ "punct",  ispunct,  },
159	{ "space",  isspace,  },
160	{ "upper",  isupper,  },
161	{ "xdigit", isxdigit, },
162};
163
164static void
165genclass(STR *s)
166{
167	CLASS *cp, tmp;
168	size_t len;
169	int i;
170
171	tmp.name = (char *)s->str;
172	if ((cp = (CLASS *)bsearch(&tmp, classes, sizeof(classes) /
173	    sizeof(CLASS), sizeof(CLASS), c_class)) == NULL)
174		errx(1, "unknown class %s", s->str);
175
176	/*
177	 * Generate the set of characters in the class if we haven't
178	 * already done so.
179	 */
180	if (cp->set == NULL) {
181		cp->set = reallocarray(NULL, NCHARS + 1, sizeof(*cp->set));
182		if (cp->set == NULL)
183			err(1, NULL);
184		len = 0;
185		for (i = 0; i < NCHARS; i++) {
186			if (cp->func(i)) {
187				cp->set[len] = i;
188				len++;
189			}
190		}
191		cp->set[len] = OOBCH;
192		len++;
193		cp->set = reallocarray(cp->set, len, sizeof(*cp->set));
194		if (cp->set == NULL)
195			err(1, NULL);
196	}
197
198	s->cnt = 0;
199	s->state = SET;
200	s->set = cp->set;
201}
202
203static int
204c_class(const void *a, const void *b)
205{
206	return (strcmp(((CLASS *)a)->name, ((CLASS *)b)->name));
207}
208
209/*
210 * English doesn't have any equivalence classes, so for now
211 * we just syntax check and grab the character.
212 */
213static void
214genequiv(STR *s)
215{
216	if (*s->str == '\\') {
217		s->equiv[0] = backslash(s);
218		if (*s->str != '=')
219			errx(1, "misplaced equivalence equals sign");
220	} else {
221		s->equiv[0] = s->str[0];
222		if (s->str[1] != '=')
223			errx(1, "misplaced equivalence equals sign");
224	}
225	s->str += 2;
226	s->cnt = 0;
227	s->state = SET;
228	s->set = s->equiv;
229}
230
231static int
232genrange(STR *s)
233{
234	int stopval;
235	unsigned char *savestart;
236
237	savestart = s->str;
238	stopval = *++s->str == '\\' ? backslash(s) : *s->str++;
239	if (stopval < (u_char)s->lastch) {
240		s->str = savestart;
241		return (0);
242	}
243	s->cnt = stopval - s->lastch + 1;
244	s->state = RANGE;
245	--s->lastch;
246	return (1);
247}
248
249static void
250genseq(STR *s)
251{
252	char *ep;
253
254	if (s->which == STRING1)
255		errx(1, "sequences only valid in string2");
256
257	if (*s->str == '\\')
258		s->lastch = backslash(s);
259	else
260		s->lastch = *s->str++;
261	if (*s->str != '*')
262		errx(1, "misplaced sequence asterisk");
263
264	switch (*++s->str) {
265	case '\\':
266		s->cnt = backslash(s);
267		break;
268	case ']':
269		s->cnt = 0;
270		++s->str;
271		break;
272	default:
273		if (isdigit(*s->str)) {
274			s->cnt = strtol((char *)s->str, &ep, 0);
275			if (*ep == ']') {
276				s->str = (unsigned char *)ep + 1;
277				break;
278			}
279		}
280		errx(1, "illegal sequence count");
281		/* NOTREACHED */
282	}
283
284	s->state = s->cnt ? SEQUENCE : INFINITE;
285}
286
287/*
288 * Translate \??? into a character.  Up to 3 octal digits, if no digits either
289 * an escape code or a literal character.
290 */
291static int
292backslash(STR *s)
293{
294	size_t i;
295	int ch, val;
296
297	assert(*s->str == '\\');
298	s->str++;
299
300	/* Empty escapes become plain backslashes. */
301	if (*s->str == '\0') {
302		s->state = EOS;
303		return ('\\');
304	}
305
306	val = 0;
307	for (i = 0; i < 3; i++) {
308		if (s->str[i] < '0' || '7' < s->str[i])
309			break;
310		val = val * 8 + s->str[i] - '0';
311	}
312	if (i > 0) {
313		if (val > UCHAR_MAX)
314			errx(1, "octal value out of range: %d", val);
315		s->str += i;
316		return (val);
317	}
318
319	ch = *s->str++;
320	switch (ch) {
321		case 'a':			/* escape characters */
322			return ('\7');
323		case 'b':
324			return ('\b');
325		case 'f':
326			return ('\f');
327		case 'n':
328			return ('\n');
329		case 'r':
330			return ('\r');
331		case 't':
332			return ('\t');
333		case 'v':
334			return ('\13');
335		default:			/* \x" -> x */
336			return (ch);
337	}
338}
339