1/*-
2 * Copyright (c) 1991, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 4. Neither the name of the University nor the names of its contributors
14 *    may be used to endorse or promote products derived from this software
15 *    without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30#include <sys/cdefs.h>
31
32__FBSDID("$FreeBSD: releng/10.2/usr.bin/tr/str.c 229403 2012-01-03 18:51:58Z ed $");
33
34#ifndef lint
35static const char sccsid[] = "@(#)str.c	8.2 (Berkeley) 4/28/95";
36#endif
37
38#include <sys/types.h>
39
40#include <ctype.h>
41#include <err.h>
42#include <errno.h>
43#include <stddef.h>
44#include <stdio.h>
45#include <stdlib.h>
46#include <string.h>
47#include <wchar.h>
48#include <wctype.h>
49
50#include "extern.h"
51
52static int      backslash(STR *, int *);
53static int	bracket(STR *);
54static void	genclass(STR *);
55static void	genequiv(STR *);
56static int      genrange(STR *, int);
57static void	genseq(STR *);
58
59wint_t
60next(STR *s)
61{
62	int is_octal;
63	wint_t ch;
64	wchar_t wch;
65	size_t clen;
66
67	switch (s->state) {
68	case EOS:
69		return (0);
70	case INFINITE:
71		return (1);
72	case NORMAL:
73		switch (*s->str) {
74		case '\0':
75			s->state = EOS;
76			return (0);
77		case '\\':
78			s->lastch = backslash(s, &is_octal);
79			break;
80		case '[':
81			if (bracket(s))
82				return (next(s));
83			/* FALLTHROUGH */
84		default:
85			clen = mbrtowc(&wch, s->str, MB_LEN_MAX, NULL);
86			if (clen == (size_t)-1 || clen == (size_t)-2 ||
87			    clen == 0)
88				errc(1, EILSEQ, NULL);
89			is_octal = 0;
90			s->lastch = wch;
91			s->str += clen;
92			break;
93		}
94
95		/* We can start a range at any time. */
96		if (s->str[0] == '-' && genrange(s, is_octal))
97			return (next(s));
98		return (1);
99	case RANGE:
100		if (s->cnt-- == 0) {
101			s->state = NORMAL;
102			return (next(s));
103		}
104		++s->lastch;
105		return (1);
106	case SEQUENCE:
107		if (s->cnt-- == 0) {
108			s->state = NORMAL;
109			return (next(s));
110		}
111		return (1);
112	case CCLASS:
113	case CCLASS_UPPER:
114	case CCLASS_LOWER:
115		s->cnt++;
116		ch = nextwctype(s->lastch, s->cclass);
117		if (ch == -1) {
118			s->state = NORMAL;
119			return (next(s));
120		}
121		s->lastch = ch;
122		return (1);
123	case SET:
124		if ((ch = s->set[s->cnt++]) == OOBCH) {
125			s->state = NORMAL;
126			return (next(s));
127		}
128		s->lastch = ch;
129		return (1);
130	default:
131		return (0);
132	}
133	/* NOTREACHED */
134}
135
136static int
137bracket(STR *s)
138{
139	char *p;
140
141	switch (s->str[1]) {
142	case ':':				/* "[:class:]" */
143		if ((p = strchr(s->str + 2, ']')) == NULL)
144			return (0);
145		if (*(p - 1) != ':' || p - s->str < 4)
146			goto repeat;
147		*(p - 1) = '\0';
148		s->str += 2;
149		genclass(s);
150		s->str = p + 1;
151		return (1);
152	case '=':				/* "[=equiv=]" */
153		if (s->str[2] == '\0' || (p = strchr(s->str + 3, ']')) == NULL)
154			return (0);
155		if (*(p - 1) != '=' || p - s->str < 4)
156			goto repeat;
157		s->str += 2;
158		genequiv(s);
159		return (1);
160	default:				/* "[\###*n]" or "[#*n]" */
161	repeat:
162		if ((p = strpbrk(s->str + 2, "*]")) == NULL)
163			return (0);
164		if (p[0] != '*' || strchr(p, ']') == NULL)
165			return (0);
166		s->str += 1;
167		genseq(s);
168		return (1);
169	}
170	/* NOTREACHED */
171}
172
173static void
174genclass(STR *s)
175{
176
177	if ((s->cclass = wctype(s->str)) == 0)
178		errx(1, "unknown class %s", s->str);
179	s->cnt = 0;
180	s->lastch = -1;		/* incremented before check in next() */
181	if (strcmp(s->str, "upper") == 0)
182		s->state = CCLASS_UPPER;
183	else if (strcmp(s->str, "lower") == 0)
184		s->state = CCLASS_LOWER;
185	else
186		s->state = CCLASS;
187}
188
189static void
190genequiv(STR *s)
191{
192	int i, p, pri;
193	char src[2], dst[3];
194	size_t clen;
195	wchar_t wc;
196
197	if (*s->str == '\\') {
198		s->equiv[0] = backslash(s, NULL);
199		if (*s->str != '=')
200			errx(1, "misplaced equivalence equals sign");
201		s->str += 2;
202	} else {
203		clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL);
204		if (clen == (size_t)-1 || clen == (size_t)-2 || clen == 0)
205			errc(1, EILSEQ, NULL);
206		s->equiv[0] = wc;
207		if (s->str[clen] != '=')
208			errx(1, "misplaced equivalence equals sign");
209		s->str += clen + 2;
210	}
211
212	/*
213	 * Calculate the set of all characters in the same equivalence class
214	 * as the specified character (they will have the same primary
215	 * collation weights).
216	 * XXX Knows too much about how strxfrm() is implemented. Assumes
217	 * it fills the string with primary collation weight bytes. Only one-
218	 * to-one mappings are supported.
219	 * XXX Equivalence classes not supported in multibyte locales.
220	 */
221	src[0] = (char)s->equiv[0];
222	src[1] = '\0';
223	if (MB_CUR_MAX == 1 && strxfrm(dst, src, sizeof(dst)) == 1) {
224		pri = (unsigned char)*dst;
225		for (p = 1, i = 1; i < NCHARS_SB; i++) {
226			*src = i;
227			if (strxfrm(dst, src, sizeof(dst)) == 1 && pri &&
228			    pri == (unsigned char)*dst)
229				s->equiv[p++] = i;
230		}
231		s->equiv[p] = OOBCH;
232	}
233
234	s->cnt = 0;
235	s->state = SET;
236	s->set = s->equiv;
237}
238
239static int
240genrange(STR *s, int was_octal)
241{
242	int stopval, octal;
243	char *savestart;
244	int n, cnt, *p;
245	size_t clen;
246	wchar_t wc;
247
248	octal = 0;
249	savestart = s->str;
250	if (*++s->str == '\\')
251		stopval = backslash(s, &octal);
252	else {
253		clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL);
254		if (clen == (size_t)-1 || clen == (size_t)-2)
255			errc(1, EILSEQ, NULL);
256		stopval = wc;
257		s->str += clen;
258	}
259	/*
260	 * XXX Characters are not ordered according to collating sequence in
261	 * multibyte locales.
262	 */
263	if (octal || was_octal || MB_CUR_MAX > 1) {
264		if (stopval < s->lastch) {
265			s->str = savestart;
266			return (0);
267		}
268		s->cnt = stopval - s->lastch + 1;
269		s->state = RANGE;
270		--s->lastch;
271		return (1);
272	}
273	if (charcoll((const void *)&stopval, (const void *)&(s->lastch)) < 0) {
274		s->str = savestart;
275		return (0);
276	}
277	if ((s->set = p = malloc((NCHARS_SB + 1) * sizeof(int))) == NULL)
278		err(1, "genrange() malloc");
279	for (cnt = 0; cnt < NCHARS_SB; cnt++)
280		if (charcoll((const void *)&cnt, (const void *)&(s->lastch)) >= 0 &&
281		    charcoll((const void *)&cnt, (const void *)&stopval) <= 0)
282			*p++ = cnt;
283	*p = OOBCH;
284	n = p - s->set;
285
286	s->cnt = 0;
287	s->state = SET;
288	if (n > 1)
289		mergesort(s->set, n, sizeof(*(s->set)), charcoll);
290	return (1);
291}
292
293static void
294genseq(STR *s)
295{
296	char *ep;
297	wchar_t wc;
298	size_t clen;
299
300	if (s->which == STRING1)
301		errx(1, "sequences only valid in string2");
302
303	if (*s->str == '\\')
304		s->lastch = backslash(s, NULL);
305	else {
306		clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL);
307		if (clen == (size_t)-1 || clen == (size_t)-2)
308			errc(1, EILSEQ, NULL);
309		s->lastch = wc;
310		s->str += clen;
311	}
312	if (*s->str != '*')
313		errx(1, "misplaced sequence asterisk");
314
315	switch (*++s->str) {
316	case '\\':
317		s->cnt = backslash(s, NULL);
318		break;
319	case ']':
320		s->cnt = 0;
321		++s->str;
322		break;
323	default:
324		if (isdigit((u_char)*s->str)) {
325			s->cnt = strtol(s->str, &ep, 0);
326			if (*ep == ']') {
327				s->str = ep + 1;
328				break;
329			}
330		}
331		errx(1, "illegal sequence count");
332		/* NOTREACHED */
333	}
334
335	s->state = s->cnt ? SEQUENCE : INFINITE;
336}
337
338/*
339 * Translate \??? into a character.  Up to 3 octal digits, if no digits either
340 * an escape code or a literal character.
341 */
342static int
343backslash(STR *s, int *is_octal)
344{
345	int ch, cnt, val;
346
347	if (is_octal != NULL)
348		*is_octal = 0;
349	for (cnt = val = 0;;) {
350		ch = (u_char)*++s->str;
351		if (!isdigit(ch) || ch > '7')
352			break;
353		val = val * 8 + ch - '0';
354		if (++cnt == 3) {
355			++s->str;
356			break;
357		}
358	}
359	if (cnt) {
360		if (is_octal != NULL)
361			*is_octal = 1;
362		return (val);
363	}
364	if (ch != '\0')
365		++s->str;
366	switch (ch) {
367		case 'a':			/* escape characters */
368			return ('\7');
369		case 'b':
370			return ('\b');
371		case 'f':
372			return ('\f');
373		case 'n':
374			return ('\n');
375		case 'r':
376			return ('\r');
377		case 't':
378			return ('\t');
379		case 'v':
380			return ('\13');
381		case '\0':			/*  \" -> \ */
382			s->state = EOS;
383			return ('\\');
384		default:			/* \x" -> x */
385			return (ch);
386	}
387}
388