str.c revision 216370
1275970Scy/*-
2275970Scy * Copyright (c) 1991, 1993
3275970Scy *	The Regents of the University of California.  All rights reserved.
4275970Scy *
5275970Scy * Redistribution and use in source and binary forms, with or without
6275970Scy * modification, are permitted provided that the following conditions
7275970Scy * are met:
8275970Scy * 1. Redistributions of source code must retain the above copyright
9275970Scy *    notice, this list of conditions and the following disclaimer.
10275970Scy * 2. Redistributions in binary form must reproduce the above copyright
11275970Scy *    notice, this list of conditions and the following disclaimer in the
12275970Scy *    documentation and/or other materials provided with the distribution.
13275970Scy * 4. Neither the name of the University nor the names of its contributors
14275970Scy *    may be used to endorse or promote products derived from this software
15275970Scy *    without specific prior written permission.
16275970Scy *
17275970Scy * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18285612Sdelphij * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30#include <sys/cdefs.h>
31
32__FBSDID("$FreeBSD: head/usr.bin/tr/str.c 216370 2010-12-11 08:32:16Z joel $");
33
34#ifndef lint
35static const char sccsid[] = "@(#)str.c	8.2 (Berkeley) 4/28/95";
36#endif
37
38#include <sys/types.h>
39
40#include <ctype.h>
41#include <err.h>
42#include <errno.h>
43#include <stddef.h>
44#include <stdio.h>
45#include <stdlib.h>
46#include <string.h>
47#include <wchar.h>
48#include <wctype.h>
49
50#include "extern.h"
51
52static int      backslash(STR *, int *);
53static int	bracket(STR *);
54static void	genclass(STR *);
55static void	genequiv(STR *);
56static int      genrange(STR *, int);
57static void	genseq(STR *);
58
59wint_t
60next(s)
61	STR *s;
62{
63	int is_octal;
64	wint_t ch;
65	wchar_t wch;
66	size_t clen;
67
68	switch (s->state) {
69	case EOS:
70		return (0);
71	case INFINITE:
72		return (1);
73	case NORMAL:
74		switch (*s->str) {
75		case '\0':
76			s->state = EOS;
77			return (0);
78		case '\\':
79			s->lastch = backslash(s, &is_octal);
80			break;
81		case '[':
82			if (bracket(s))
83				return (next(s));
84			/* FALLTHROUGH */
85		default:
86			clen = mbrtowc(&wch, s->str, MB_LEN_MAX, NULL);
87			if (clen == (size_t)-1 || clen == (size_t)-2 ||
88			    clen == 0)
89				errc(1, EILSEQ, NULL);
90			is_octal = 0;
91			s->lastch = wch;
92			s->str += clen;
93			break;
94		}
95
96		/* We can start a range at any time. */
97		if (s->str[0] == '-' && genrange(s, is_octal))
98			return (next(s));
99		return (1);
100	case RANGE:
101		if (s->cnt-- == 0) {
102			s->state = NORMAL;
103			return (next(s));
104		}
105		++s->lastch;
106		return (1);
107	case SEQUENCE:
108		if (s->cnt-- == 0) {
109			s->state = NORMAL;
110			return (next(s));
111		}
112		return (1);
113	case CCLASS:
114	case CCLASS_UPPER:
115	case CCLASS_LOWER:
116		s->cnt++;
117		ch = nextwctype(s->lastch, s->cclass);
118		if (ch == -1) {
119			s->state = NORMAL;
120			return (next(s));
121		}
122		s->lastch = ch;
123		return (1);
124	case SET:
125		if ((ch = s->set[s->cnt++]) == OOBCH) {
126			s->state = NORMAL;
127			return (next(s));
128		}
129		s->lastch = ch;
130		return (1);
131	default:
132		return (0);
133	}
134	/* NOTREACHED */
135}
136
137static int
138bracket(s)
139	STR *s;
140{
141	char *p;
142
143	switch (s->str[1]) {
144	case ':':				/* "[:class:]" */
145		if ((p = strchr(s->str + 2, ']')) == NULL)
146			return (0);
147		if (*(p - 1) != ':' || p - s->str < 4)
148			goto repeat;
149		*(p - 1) = '\0';
150		s->str += 2;
151		genclass(s);
152		s->str = p + 1;
153		return (1);
154	case '=':				/* "[=equiv=]" */
155		if (s->str[2] == '\0' || (p = strchr(s->str + 3, ']')) == NULL)
156			return (0);
157		if (*(p - 1) != '=' || p - s->str < 4)
158			goto repeat;
159		s->str += 2;
160		genequiv(s);
161		return (1);
162	default:				/* "[\###*n]" or "[#*n]" */
163	repeat:
164		if ((p = strpbrk(s->str + 2, "*]")) == NULL)
165			return (0);
166		if (p[0] != '*' || index(p, ']') == NULL)
167			return (0);
168		s->str += 1;
169		genseq(s);
170		return (1);
171	}
172	/* NOTREACHED */
173}
174
175static void
176genclass(s)
177	STR *s;
178{
179
180	if ((s->cclass = wctype(s->str)) == 0)
181		errx(1, "unknown class %s", s->str);
182	s->cnt = 0;
183	s->lastch = -1;		/* incremented before check in next() */
184	if (strcmp(s->str, "upper") == 0)
185		s->state = CCLASS_UPPER;
186	else if (strcmp(s->str, "lower") == 0)
187		s->state = CCLASS_LOWER;
188	else
189		s->state = CCLASS;
190}
191
192static void
193genequiv(s)
194	STR *s;
195{
196	int i, p, pri;
197	char src[2], dst[3];
198	size_t clen;
199	wchar_t wc;
200
201	if (*s->str == '\\') {
202		s->equiv[0] = backslash(s, NULL);
203		if (*s->str != '=')
204			errx(1, "misplaced equivalence equals sign");
205		s->str += 2;
206	} else {
207		clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL);
208		if (clen == (size_t)-1 || clen == (size_t)-2 || clen == 0)
209			errc(1, EILSEQ, NULL);
210		s->equiv[0] = wc;
211		if (s->str[clen] != '=')
212			errx(1, "misplaced equivalence equals sign");
213		s->str += clen + 2;
214	}
215
216	/*
217	 * Calculate the set of all characters in the same equivalence class
218	 * as the specified character (they will have the same primary
219	 * collation weights).
220	 * XXX Knows too much about how strxfrm() is implemented. Assumes
221	 * it fills the string with primary collation weight bytes. Only one-
222	 * to-one mappings are supported.
223	 * XXX Equivalence classes not supported in multibyte locales.
224	 */
225	src[0] = (char)s->equiv[0];
226	src[1] = '\0';
227	if (MB_CUR_MAX == 1 && strxfrm(dst, src, sizeof(dst)) == 1) {
228		pri = (unsigned char)*dst;
229		for (p = 1, i = 1; i < NCHARS_SB; i++) {
230			*src = i;
231			if (strxfrm(dst, src, sizeof(dst)) == 1 && pri &&
232			    pri == (unsigned char)*dst)
233				s->equiv[p++] = i;
234		}
235		s->equiv[p] = OOBCH;
236	}
237
238	s->cnt = 0;
239	s->state = SET;
240	s->set = s->equiv;
241}
242
243static int
244genrange(STR *s, int was_octal)
245{
246	int stopval, octal;
247	char *savestart;
248	int n, cnt, *p;
249	size_t clen;
250	wchar_t wc;
251
252	octal = 0;
253	savestart = s->str;
254	if (*++s->str == '\\')
255		stopval = backslash(s, &octal);
256	else {
257		clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL);
258		if (clen == (size_t)-1 || clen == (size_t)-2)
259			errc(1, EILSEQ, NULL);
260		stopval = wc;
261		s->str += clen;
262	}
263	/*
264	 * XXX Characters are not ordered according to collating sequence in
265	 * multibyte locales.
266	 */
267	if (octal || was_octal || MB_CUR_MAX > 1) {
268		if (stopval < s->lastch) {
269			s->str = savestart;
270			return (0);
271		}
272		s->cnt = stopval - s->lastch + 1;
273		s->state = RANGE;
274		--s->lastch;
275		return (1);
276	}
277	if (charcoll((const void *)&stopval, (const void *)&(s->lastch)) < 0) {
278		s->str = savestart;
279		return (0);
280	}
281	if ((s->set = p = malloc((NCHARS_SB + 1) * sizeof(int))) == NULL)
282		err(1, "genrange() malloc");
283	for (cnt = 0; cnt < NCHARS_SB; cnt++)
284		if (charcoll((const void *)&cnt, (const void *)&(s->lastch)) >= 0 &&
285		    charcoll((const void *)&cnt, (const void *)&stopval) <= 0)
286			*p++ = cnt;
287	*p = OOBCH;
288	n = p - s->set;
289
290	s->cnt = 0;
291	s->state = SET;
292	if (n > 1)
293		mergesort(s->set, n, sizeof(*(s->set)), charcoll);
294	return (1);
295}
296
297static void
298genseq(s)
299	STR *s;
300{
301	char *ep;
302	wchar_t wc;
303	size_t clen;
304
305	if (s->which == STRING1)
306		errx(1, "sequences only valid in string2");
307
308	if (*s->str == '\\')
309		s->lastch = backslash(s, NULL);
310	else {
311		clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL);
312		if (clen == (size_t)-1 || clen == (size_t)-2)
313			errc(1, EILSEQ, NULL);
314		s->lastch = wc;
315		s->str += clen;
316	}
317	if (*s->str != '*')
318		errx(1, "misplaced sequence asterisk");
319
320	switch (*++s->str) {
321	case '\\':
322		s->cnt = backslash(s, NULL);
323		break;
324	case ']':
325		s->cnt = 0;
326		++s->str;
327		break;
328	default:
329		if (isdigit((u_char)*s->str)) {
330			s->cnt = strtol(s->str, &ep, 0);
331			if (*ep == ']') {
332				s->str = ep + 1;
333				break;
334			}
335		}
336		errx(1, "illegal sequence count");
337		/* NOTREACHED */
338	}
339
340	s->state = s->cnt ? SEQUENCE : INFINITE;
341}
342
343/*
344 * Translate \??? into a character.  Up to 3 octal digits, if no digits either
345 * an escape code or a literal character.
346 */
347static int
348backslash(STR *s, int *is_octal)
349{
350	int ch, cnt, val;
351
352	if (is_octal != NULL)
353		*is_octal = 0;
354	for (cnt = val = 0;;) {
355		ch = (u_char)*++s->str;
356		if (!isdigit(ch) || ch > '7')
357			break;
358		val = val * 8 + ch - '0';
359		if (++cnt == 3) {
360			++s->str;
361			break;
362		}
363	}
364	if (cnt) {
365		if (is_octal != NULL)
366			*is_octal = 1;
367		return (val);
368	}
369	if (ch != '\0')
370		++s->str;
371	switch (ch) {
372		case 'a':			/* escape characters */
373			return ('\7');
374		case 'b':
375			return ('\b');
376		case 'f':
377			return ('\f');
378		case 'n':
379			return ('\n');
380		case 'r':
381			return ('\r');
382		case 't':
383			return ('\t');
384		case 'v':
385			return ('\13');
386		case '\0':			/*  \" -> \ */
387			s->state = EOS;
388			return ('\\');
389		default:			/* \x" -> x */
390			return (ch);
391	}
392}
393