1/*-
2 * Copyright (c) 1991, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 *    must display the following acknowledgement:
15 *	This product includes software developed by the University of
16 *	California, Berkeley and its contributors.
17 * 4. Neither the name of the University nor the names of its contributors
18 *    may be used to endorse or promote products derived from this software
19 *    without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 */
33
34#include <sys/cdefs.h>
35
36__FBSDID("$FreeBSD: src/usr.bin/tr/str.c,v 1.24 2004/11/14 05:15:25 jkh Exp $");
37
38#ifndef lint
39static const char sccsid[] = "@(#)str.c	8.2 (Berkeley) 4/28/95";
40#endif
41
42#include <sys/cdefs.h>
43#include <sys/types.h>
44
45#include <ctype.h>
46#include <err.h>
47#include <errno.h>
48#include <stddef.h>
49#include <stdio.h>
50#include <stdlib.h>
51#include <string.h>
52#include <wchar.h>
53#include <wctype.h>
54
55#include "extern.h"
56
57static int      backslash(STR *, int *);
58static int	bracket(STR *);
59static void	genclass(STR *);
60static void	genequiv(STR *);
61static int      genrange(STR *, int);
62static void	genseq(STR *);
63
64wint_t
65next(s)
66	STR *s;
67{
68	int is_octal;
69	wint_t ch;
70	wchar_t wch;
71	size_t clen;
72
73	switch (s->state) {
74	case EOS:
75		return (0);
76	case INFINITE:
77#ifdef __APPLE__
78		switch (ch = (u_char)*s->str) {
79		case '\0':
80			/*
81			 * force at least one postive return so setup() will
82			 * process lastch of a sequence like [a*]; but change
83			 * state so it won't get stuck in a while(next(s)) loop
84			 */
85			s->state = NORMAL;
86		}
87#endif /* __APPLE__ */
88		return (1);
89	case NORMAL:
90		switch (*s->str) {
91		case '\0':
92			s->state = EOS;
93			return (0);
94		case '\\':
95			s->lastch = backslash(s, &is_octal);
96			break;
97		case '[':
98			if (bracket(s))
99				return (next(s));
100			/* FALLTHROUGH */
101		default:
102			clen = mbrtowc(&wch, s->str, MB_LEN_MAX, NULL);
103			if (clen == (size_t)-1 || clen == (size_t)-2 ||
104			    clen == 0)
105				errc(1, EILSEQ, NULL);
106			is_octal = 0;
107			s->lastch = wch;
108			s->str += clen;
109			break;
110		}
111
112		/* We can start a range at any time. */
113		if (s->str[0] == '-' && genrange(s, is_octal))
114			return (next(s));
115		return (1);
116	case RANGE:
117		if (s->cnt-- == 0) {
118			s->state = NORMAL;
119			return (next(s));
120		}
121		++s->lastch;
122		return (1);
123	case SEQUENCE:
124		if (s->cnt-- == 0) {
125			s->state = NORMAL;
126			return (next(s));
127		}
128		return (1);
129	case CCLASS:
130	case CCLASS_UPPER:
131	case CCLASS_LOWER:
132		s->cnt++;
133		ch = nextwctype(s->lastch, s->cclass);
134		if (ch == -1) {
135			s->state = NORMAL;
136			return (next(s));
137		}
138		s->lastch = ch;
139		return (1);
140	case SET:
141		if ((ch = s->set[s->cnt++]) == OOBCH) {
142			s->state = NORMAL;
143			return (next(s));
144		}
145		s->lastch = ch;
146		return (1);
147	default:
148		return (0);
149	}
150	/* NOTREACHED */
151}
152
153static int
154bracket(s)
155	STR *s;
156{
157	char *p;
158
159	switch (s->str[1]) {
160	case ':':				/* "[:class:]" */
161		if ((p = strchr(s->str + 2, ']')) == NULL)
162			return (0);
163		if (*(p - 1) != ':' || p - s->str < 4)
164			goto repeat;
165		*(p - 1) = '\0';
166		s->str += 2;
167		genclass(s);
168		s->str = p + 1;
169		return (1);
170	case '=':				/* "[=equiv=]" */
171		if ((p = strchr(s->str + 2, ']')) == NULL)
172			return (0);
173		if (*(p - 1) != '=' || p - s->str < 4)
174			goto repeat;
175		s->str += 2;
176		genequiv(s);
177		return (1);
178	default:				/* "[\###*n]" or "[#*n]" */
179	repeat:
180		if ((p = strpbrk(s->str + 2, "*]")) == NULL)
181			return (0);
182		if (p[0] != '*' || index(p, ']') == NULL)
183			return (0);
184		s->str += 1;
185		genseq(s);
186		return (1);
187	}
188	/* NOTREACHED */
189}
190
191static void
192genclass(s)
193	STR *s;
194{
195
196	if ((s->cclass = wctype(s->str)) == 0)
197		errx(1, "unknown class %s", s->str);
198	s->cnt = 0;
199	s->lastch = -1;		/* incremented before check in next() */
200	if (strcmp(s->str, "upper") == 0)
201		s->state = CCLASS_UPPER;
202	else if (strcmp(s->str, "lower") == 0)
203		s->state = CCLASS_LOWER;
204	else
205		s->state = CCLASS;
206}
207
208static void
209genequiv(s)
210	STR *s;
211{
212	int i, p, pri;
213	char src[2], dst[3];
214	size_t clen;
215	wchar_t wc;
216
217	if (*s->str == '\\') {
218		s->equiv[0] = backslash(s, NULL);
219		if (*s->str != '=')
220			errx(1, "misplaced equivalence equals sign");
221		s->str += 2;
222	} else {
223		clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL);
224		if (clen == (size_t)-1 || clen == (size_t)-2 || clen == 0)
225			errc(1, EILSEQ, NULL);
226		s->equiv[0] = wc;
227		if (s->str[clen] != '=')
228			errx(1, "misplaced equivalence equals sign");
229		s->str += clen + 2;
230	}
231
232	/*
233	 * Calculate the set of all characters in the same equivalence class
234	 * as the specified character (they will have the same primary
235	 * collation weights).
236	 * XXX Knows too much about how strxfrm() is implemented. Assumes
237	 * it fills the string with primary collation weight bytes. Only one-
238	 * to-one mappings are supported.
239	 * XXX Equivalence classes not supported in multibyte locales.
240	 */
241	src[0] = (char)s->equiv[0];
242	src[1] = '\0';
243	if (MB_CUR_MAX == 1 && strxfrm(dst, src, sizeof(dst)) == 1) {
244		pri = (unsigned char)*dst;
245		for (p = 1, i = 1; i < NCHARS_SB; i++) {
246			*src = i;
247			if (strxfrm(dst, src, sizeof(dst)) == 1 && pri &&
248			    pri == (unsigned char)*dst)
249				s->equiv[p++] = i;
250		}
251		s->equiv[p] = OOBCH;
252	}
253
254	s->cnt = 0;
255	s->state = SET;
256	s->set = s->equiv;
257}
258
259static int
260genrange(STR *s, int was_octal)
261{
262	int stopval, octal;
263	char *savestart;
264	int n, cnt, *p;
265	size_t clen;
266	wchar_t wc;
267
268	octal = 0;
269	savestart = s->str;
270	if (*++s->str == '\\')
271		stopval = backslash(s, &octal);
272	else {
273		clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL);
274		if (clen == (size_t)-1 || clen == (size_t)-2)
275			errc(1, EILSEQ, NULL);
276		stopval = wc;
277		s->str += clen;
278	}
279	/*
280	 * XXX Characters are not ordered according to collating sequence in
281	 * multibyte locales.
282	 */
283	if (octal || was_octal || MB_CUR_MAX > 1) {
284		if (stopval < s->lastch) {
285			s->str = savestart;
286			return (0);
287		}
288		s->cnt = stopval - s->lastch + 1;
289		s->state = RANGE;
290		--s->lastch;
291		return (1);
292	}
293	if (charcoll((const void *)&stopval, (const void *)&(s->lastch)) < 0) {
294		s->str = savestart;
295		return (0);
296	}
297	if ((s->set = p = malloc((NCHARS_SB + 1) * sizeof(int))) == NULL)
298		err(1, "genrange() malloc");
299	for (cnt = 0; cnt < NCHARS_SB; cnt++)
300		if (charcoll((const void *)&cnt, (const void *)&(s->lastch)) >= 0 &&
301		    charcoll((const void *)&cnt, (const void *)&stopval) <= 0)
302			*p++ = cnt;
303	*p = OOBCH;
304	n = p - s->set;
305
306	s->cnt = 0;
307	s->state = SET;
308	if (n > 1)
309		mergesort(s->set, n, sizeof(*(s->set)), charcoll);
310	return (1);
311}
312
313static void
314genseq(s)
315	STR *s;
316{
317	char *ep;
318	wchar_t wc;
319	size_t clen;
320
321#ifndef __APPLE__
322	if (s->which == STRING1)
323		errx(1, "sequences only valid in string2");
324#endif /* !__APPLE__ */
325
326	if (*s->str == '\\')
327		s->lastch = backslash(s, NULL);
328	else {
329		clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL);
330		if (clen == (size_t)-1 || clen == (size_t)-2)
331			errc(1, EILSEQ, NULL);
332		s->lastch = wc;
333		s->str += clen;
334	}
335	if (*s->str != '*')
336		errx(1, "misplaced sequence asterisk");
337
338	switch (*++s->str) {
339	case '\\':
340		s->cnt = backslash(s, NULL);
341		break;
342	case ']':
343		s->cnt = 0;
344		++s->str;
345		break;
346	default:
347		if (isdigit((u_char)*s->str)) {
348			s->cnt = strtol(s->str, &ep, 0);
349			if (*ep == ']') {
350				s->str = ep + 1;
351				break;
352			}
353		}
354		errx(1, "illegal sequence count");
355		/* NOTREACHED */
356	}
357
358	s->state = s->cnt ? SEQUENCE : INFINITE;
359}
360
361/*
362 * Translate \??? into a character.  Up to 3 octal digits, if no digits either
363 * an escape code or a literal character.
364 */
365static int
366backslash(STR *s, int *is_octal)
367{
368	int ch, cnt, val;
369
370	if (is_octal != NULL)
371		*is_octal = 0;
372	for (cnt = val = 0;;) {
373		ch = (u_char)*++s->str;
374		if (!isdigit(ch) || ch > '7')
375			break;
376		val = val * 8 + ch - '0';
377		if (++cnt == 3) {
378			++s->str;
379			break;
380		}
381	}
382	if (cnt) {
383		if (is_octal != NULL)
384			*is_octal = 1;
385		return (val);
386	}
387	if (ch != '\0')
388		++s->str;
389	switch (ch) {
390		case 'a':			/* escape characters */
391			return ('\7');
392		case 'b':
393			return ('\b');
394		case 'f':
395			return ('\f');
396		case 'n':
397			return ('\n');
398		case 'r':
399			return ('\r');
400		case 't':
401			return ('\t');
402		case 'v':
403			return ('\13');
404		case '\0':			/*  \" -> \ */
405			s->state = EOS;
406			return ('\\');
407		default:			/* \x" -> x */
408			return (ch);
409	}
410}
411