str.c revision 118399
154359Sroberto/*-
254359Sroberto * Copyright (c) 1991, 1993
354359Sroberto *	The Regents of the University of California.  All rights reserved.
454359Sroberto *
554359Sroberto * Redistribution and use in source and binary forms, with or without
654359Sroberto * modification, are permitted provided that the following conditions
754359Sroberto * are met:
854359Sroberto * 1. Redistributions of source code must retain the above copyright
954359Sroberto *    notice, this list of conditions and the following disclaimer.
1082498Sroberto * 2. Redistributions in binary form must reproduce the above copyright
1182498Sroberto *    notice, this list of conditions and the following disclaimer in the
1282498Sroberto *    documentation and/or other materials provided with the distribution.
1382498Sroberto * 3. All advertising materials mentioning features or use of this software
1482498Sroberto *    must display the following acknowledgement:
1582498Sroberto *	This product includes software developed by the University of
1654359Sroberto *	California, Berkeley and its contributors.
1754359Sroberto * 4. Neither the name of the University nor the names of its contributors
1854359Sroberto *    may be used to endorse or promote products derived from this software
1954359Sroberto *    without specific prior written permission.
2054359Sroberto *
2154359Sroberto * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
2254359Sroberto * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
2354359Sroberto * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
2454359Sroberto * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
2554359Sroberto * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
2654359Sroberto * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
2754359Sroberto * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
2854359Sroberto * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
2954359Sroberto * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
3054359Sroberto * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
3154359Sroberto * SUCH DAMAGE.
3254359Sroberto */
3354359Sroberto
3454359Sroberto#include <sys/cdefs.h>
3554359Sroberto
3654359Sroberto__FBSDID("$FreeBSD: head/usr.bin/tr/str.c 118399 2003-08-03 22:02:49Z ache $");
3754359Sroberto
3854359Sroberto#ifndef lint
3954359Srobertostatic const char sccsid[] = "@(#)str.c	8.2 (Berkeley) 4/28/95";
4054359Sroberto#endif
4154359Sroberto
4254359Sroberto#include <sys/cdefs.h>
4354359Sroberto#include <sys/types.h>
4454359Sroberto
4554359Sroberto#include <ctype.h>
4654359Sroberto#include <err.h>
4754359Sroberto#include <stddef.h>
4854359Sroberto#include <stdio.h>
4954359Sroberto#include <stdlib.h>
5054359Sroberto#include <string.h>
5154359Sroberto
5254359Sroberto#include "extern.h"
5354359Sroberto
5454359Srobertostatic int	backslash(STR *);
5554359Srobertostatic int	bracket(STR *);
5654359Srobertostatic int	c_class(const void *, const void *);
5754359Srobertostatic void	genclass(STR *);
5854359Srobertostatic void	genequiv(STR *);
5954359Srobertostatic int	genrange(STR *);
6054359Srobertostatic void	genseq(STR *);
6154359Sroberto
6254359Srobertoint
6354359Srobertonext(s)
6454359Sroberto	STR *s;
6554359Sroberto{
6654359Sroberto	int ch;
6754359Sroberto
6854359Sroberto	switch (s->state) {
6954359Sroberto	case EOS:
7054359Sroberto		return (0);
7154359Sroberto	case INFINITE:
7254359Sroberto		return (1);
7354359Sroberto	case NORMAL:
7454359Sroberto		switch (ch = (u_char)*s->str) {
7554359Sroberto		case '\0':
7654359Sroberto			s->state = EOS;
7754359Sroberto			return (0);
7854359Sroberto		case '\\':
7954359Sroberto			s->lastch = backslash(s);
8054359Sroberto			break;
8154359Sroberto		case '[':
8254359Sroberto			if (bracket(s))
8354359Sroberto				return (next(s));
8454359Sroberto			/* FALLTHROUGH */
8554359Sroberto		default:
8654359Sroberto			++s->str;
8754359Sroberto			s->lastch = ch;
8854359Sroberto			break;
8954359Sroberto		}
9054359Sroberto
9154359Sroberto		/* We can start a range at any time. */
9254359Sroberto		if (s->str[0] == '-' && genrange(s))
9354359Sroberto			return (next(s));
9454359Sroberto		return (1);
9554359Sroberto	case SEQUENCE:
9654359Sroberto		if (s->cnt-- == 0) {
9754359Sroberto			s->state = NORMAL;
9854359Sroberto			return (next(s));
9954359Sroberto		}
10054359Sroberto		return (1);
10154359Sroberto	case SET:
10254359Sroberto	case SET_UPPER:
10354359Sroberto	case SET_LOWER:
10454359Sroberto		if ((ch = s->set[s->cnt++]) == OOBCH) {
10554359Sroberto			s->state = NORMAL;
10654359Sroberto			return (next(s));
10754359Sroberto		}
10854359Sroberto		s->lastch = ch;
10954359Sroberto		return (1);
11054359Sroberto	default:
11154359Sroberto		return (0);
11254359Sroberto	}
11354359Sroberto	/* NOTREACHED */
11454359Sroberto}
11554359Sroberto
11654359Srobertostatic int
11754359Srobertobracket(s)
11854359Sroberto	STR *s;
11954359Sroberto{
12054359Sroberto	char *p;
12154359Sroberto
12254359Sroberto	switch (s->str[1]) {
12354359Sroberto	case ':':				/* "[:class:]" */
12454359Sroberto		if ((p = strchr(s->str + 2, ']')) == NULL)
12554359Sroberto			return (0);
12654359Sroberto		if (*(p - 1) != ':' || p - s->str < 4)
12754359Sroberto			goto repeat;
12854359Sroberto		*(p - 1) = '\0';
12954359Sroberto		s->str += 2;
13054359Sroberto		genclass(s);
13154359Sroberto		s->str = p + 1;
13254359Sroberto		return (1);
13354359Sroberto	case '=':				/* "[=equiv=]" */
13454359Sroberto		if ((p = strchr(s->str + 2, ']')) == NULL)
13554359Sroberto			return (0);
13654359Sroberto		if (*(p - 1) != '=' || p - s->str < 4)
13754359Sroberto			goto repeat;
13854359Sroberto		s->str += 2;
13954359Sroberto		genequiv(s);
14054359Sroberto		return (1);
14154359Sroberto	default:				/* "[\###*n]" or "[#*n]" */
14254359Sroberto	repeat:
14354359Sroberto		if ((p = strpbrk(s->str + 2, "*]")) == NULL)
14454359Sroberto			return (0);
14582498Sroberto		if (p[0] != '*' || index(p, ']') == NULL)
14654359Sroberto			return (0);
14754359Sroberto		s->str += 1;
14854359Sroberto		genseq(s);
14954359Sroberto		return (1);
15082498Sroberto	}
15154359Sroberto	/* NOTREACHED */
15254359Sroberto}
15354359Sroberto
15454359Srobertotypedef struct {
15554359Sroberto	const char *name;
15654359Sroberto	int (*func)(int);
15754359Sroberto	int *set;
15854359Sroberto} CLASS;
15954359Sroberto
16054359Srobertostatic CLASS classes[] = {
16154359Sroberto#undef isalnum
16254359Sroberto	{ "alnum",  isalnum,  NULL },
16354359Sroberto#undef isalpha
16454359Sroberto	{ "alpha",  isalpha,  NULL },
16554359Sroberto#undef isblank
16654359Sroberto	{ "blank",  isblank,  NULL },
16754359Sroberto#undef iscntrl
16854359Sroberto	{ "cntrl",  iscntrl,  NULL },
16954359Sroberto#undef isdigit
17054359Sroberto	{ "digit",  isdigit,  NULL },
17154359Sroberto#undef isgraph
17254359Sroberto	{ "graph",  isgraph,  NULL },
17354359Sroberto#undef islower
17454359Sroberto	{ "lower",  islower,  NULL },
17554359Sroberto#undef isprint
17654359Sroberto	{ "print",  isprint,  NULL },
17754359Sroberto#undef ispunct
17854359Sroberto	{ "punct",  ispunct,  NULL },
17954359Sroberto#undef isspace
18054359Sroberto	{ "space",  isspace,  NULL },
18154359Sroberto#undef isupper
18254359Sroberto	{ "upper",  isupper,  NULL },
18354359Sroberto#undef isxdigit
18454359Sroberto	{ "xdigit", isxdigit, NULL },
18554359Sroberto};
18654359Sroberto
18754359Srobertostatic void
18854359Srobertogenclass(s)
18954359Sroberto	STR *s;
19054359Sroberto{
19154359Sroberto	int cnt, (*func)(int);
19254359Sroberto	CLASS *cp, tmp;
19354359Sroberto	int *p, n;
19454359Sroberto
19554359Sroberto	tmp.name = s->str;
19654359Sroberto	if ((cp = (CLASS *)bsearch(&tmp, classes, sizeof(classes) /
19754359Sroberto	    sizeof(CLASS), sizeof(CLASS), c_class)) == NULL)
19854359Sroberto		errx(1, "unknown class %s", s->str);
19954359Sroberto
20054359Sroberto	if ((cp->set = p = malloc((NCHARS + 1) * sizeof(int))) == NULL)
20154359Sroberto		err(1, "genclass() malloc");
20254359Sroberto	bzero(p, (NCHARS + 1) * sizeof(int));
20354359Sroberto	for (cnt = 0, func = cp->func; cnt < NCHARS; ++cnt)
20454359Sroberto		if ((func)(cnt))
20554359Sroberto			*p++ = cnt;
20654359Sroberto	*p = OOBCH;
20754359Sroberto	n = p - cp->set;
20854359Sroberto
20954359Sroberto	s->cnt = 0;
21054359Sroberto	s->set = cp->set;
21154359Sroberto	if (strcmp(s->str, "upper") == 0)
21254359Sroberto		s->state = SET_UPPER;
21354359Sroberto	else if (strcmp(s->str, "lower") == 0) {
21454359Sroberto		s->state = SET_LOWER;
21554359Sroberto	} else
21654359Sroberto		s->state = SET;
21754359Sroberto	if ((s->state == SET_LOWER || s->state == SET_UPPER) && n > 1)
21854359Sroberto		mergesort(s->set, n, sizeof(*(s->set)), charcoll);
21954359Sroberto}
22054359Sroberto
22154359Srobertostatic int
22254359Srobertoc_class(a, b)
22354359Sroberto	const void *a, *b;
22454359Sroberto{
22554359Sroberto	return (strcmp(((const CLASS *)a)->name, ((const CLASS *)b)->name));
22654359Sroberto}
22754359Sroberto
22854359Srobertostatic void
22954359Srobertogenequiv(s)
23054359Sroberto	STR *s;
23154359Sroberto{
23254359Sroberto	int i, p, pri;
23354359Sroberto	char src[2], dst[3];
23454359Sroberto
23554359Sroberto	if (*s->str == '\\') {
23654359Sroberto		s->equiv[0] = backslash(s);
23754359Sroberto		if (*s->str != '=')
23854359Sroberto			errx(1, "misplaced equivalence equals sign");
23954359Sroberto		s->str += 2;
24054359Sroberto	} else {
24154359Sroberto		s->equiv[0] = s->str[0];
24254359Sroberto		if (s->str[1] != '=')
24354359Sroberto			errx(1, "misplaced equivalence equals sign");
24454359Sroberto		s->str += 3;
24554359Sroberto	}
24654359Sroberto
24754359Sroberto	/*
24854359Sroberto	 * Calculate the set of all characters in the same equivalence class
24954359Sroberto	 * as the specified character (they will have the same primary
25054359Sroberto	 * collation weights).
25154359Sroberto	 * XXX Knows too much about how strxfrm() is implemented. Assumes
25254359Sroberto	 * it fills the string with primary collation weight bytes. Only one-
25354359Sroberto	 * to-one mappings are supported.
25454359Sroberto	 */
25554359Sroberto	src[0] = s->equiv[0];
25654359Sroberto	src[1] = '\0';
25754359Sroberto	if (strxfrm(dst, src, sizeof(dst)) == 1) {
25854359Sroberto		pri = (unsigned char)*dst;
25954359Sroberto		for (p = 1, i = 1; i < NCHARS; i++) {
26054359Sroberto			*src = i;
26154359Sroberto			if (strxfrm(dst, src, sizeof(dst)) == 1 && pri &&
26254359Sroberto			    pri == (unsigned char)*dst)
26354359Sroberto				s->equiv[p++] = i;
26454359Sroberto		}
26554359Sroberto		s->equiv[p] = OOBCH;
26654359Sroberto	}
26754359Sroberto
26854359Sroberto	s->cnt = 0;
26954359Sroberto	s->state = SET;
27054359Sroberto	s->set = s->equiv;
27154359Sroberto}
27254359Sroberto
27354359Srobertostatic int
27454359Srobertogenrange(s)
27554359Sroberto	STR *s;
27654359Sroberto{
27754359Sroberto	int stopval;
27854359Sroberto	char *savestart;
27954359Sroberto	int n, cnt, *p;
28054359Sroberto
28154359Sroberto	savestart = s->str;
28254359Sroberto	stopval = *++s->str == '\\' ? backslash(s) : (u_char)*s->str++;
28354359Sroberto	if (charcoll((const void *)&stopval, (const void *)&(s->lastch)) < 0) {
28454359Sroberto		s->str = savestart;
28554359Sroberto		return (0);
28654359Sroberto	}
28754359Sroberto	if ((s->set = p = malloc((NCHARS + 1) * sizeof(int))) == NULL)
28854359Sroberto		err(1, "genrange() malloc");
28954359Sroberto	bzero(p, (NCHARS + 1) * sizeof(int));
29054359Sroberto	for (cnt = 0; cnt < NCHARS; ++cnt)
29154359Sroberto		if (charcoll((const void *)&cnt, (const void *)&(s->lastch)) >= 0 &&
29254359Sroberto		    charcoll((const void *)&cnt, (const void *)&stopval) <= 0)
29354359Sroberto			*p++ = cnt;
29454359Sroberto	*p = OOBCH;
29554359Sroberto	n = p - s->set;
29654359Sroberto
29754359Sroberto	s->cnt = 0;
29854359Sroberto	s->state = SET;
29954359Sroberto	if (n > 1)
30054359Sroberto		mergesort(s->set, n, sizeof(*(s->set)), charcoll);
30154359Sroberto	return (1);
30254359Sroberto}
30354359Sroberto
30454359Srobertostatic void
30554359Srobertogenseq(s)
30654359Sroberto	STR *s;
30754359Sroberto{
30854359Sroberto	char *ep;
30954359Sroberto
31054359Sroberto	if (s->which == STRING1)
31154359Sroberto		errx(1, "sequences only valid in string2");
31254359Sroberto
31354359Sroberto	if (*s->str == '\\')
31454359Sroberto		s->lastch = backslash(s);
31554359Sroberto	else
31654359Sroberto		s->lastch = *s->str++;
31754359Sroberto	if (*s->str != '*')
31854359Sroberto		errx(1, "misplaced sequence asterisk");
31954359Sroberto
32054359Sroberto	switch (*++s->str) {
32154359Sroberto	case '\\':
32254359Sroberto		s->cnt = backslash(s);
32354359Sroberto		break;
32454359Sroberto	case ']':
32554359Sroberto		s->cnt = 0;
32654359Sroberto		++s->str;
32754359Sroberto		break;
32854359Sroberto	default:
32954359Sroberto		if (isdigit((u_char)*s->str)) {
33054359Sroberto			s->cnt = strtol(s->str, &ep, 0);
33154359Sroberto			if (*ep == ']') {
33254359Sroberto				s->str = ep + 1;
33354359Sroberto				break;
33454359Sroberto			}
33554359Sroberto		}
33654359Sroberto		errx(1, "illegal sequence count");
33754359Sroberto		/* NOTREACHED */
33854359Sroberto	}
33954359Sroberto
34054359Sroberto	s->state = s->cnt ? SEQUENCE : INFINITE;
34154359Sroberto}
34254359Sroberto
34354359Sroberto/*
34454359Sroberto * Translate \??? into a character.  Up to 3 octal digits, if no digits either
34554359Sroberto * an escape code or a literal character.
34654359Sroberto */
34754359Srobertostatic int
34854359Srobertobackslash(s)
34954359Sroberto	STR *s;
35054359Sroberto{
35154359Sroberto	int ch, cnt, val;
35254359Sroberto
35354359Sroberto	for (cnt = val = 0;;) {
35454359Sroberto		ch = (u_char)*++s->str;
35554359Sroberto		if (!isascii(ch) || !isdigit(ch))
35654359Sroberto			break;
35754359Sroberto		val = val * 8 + ch - '0';
35854359Sroberto		if (++cnt == 3) {
35954359Sroberto			++s->str;
36054359Sroberto			break;
36154359Sroberto		}
36254359Sroberto	}
36354359Sroberto	if (cnt)
36454359Sroberto		return (val);
36554359Sroberto	if (ch != '\0')
36654359Sroberto		++s->str;
36754359Sroberto	switch (ch) {
36854359Sroberto		case 'a':			/* escape characters */
36954359Sroberto			return ('\7');
37054359Sroberto		case 'b':
37154359Sroberto			return ('\b');
37254359Sroberto		case 'f':
37354359Sroberto			return ('\f');
37454359Sroberto		case 'n':
37554359Sroberto			return ('\n');
37654359Sroberto		case 'r':
37754359Sroberto			return ('\r');
37854359Sroberto		case 't':
37954359Sroberto			return ('\t');
38054359Sroberto		case 'v':
38154359Sroberto			return ('\13');
38254359Sroberto		case '\0':			/*  \" -> \ */
38354359Sroberto			s->state = EOS;
38454359Sroberto			return ('\\');
38554359Sroberto		default:			/* \x" -> x */
38654359Sroberto			return (ch);
38754359Sroberto	}
38854359Sroberto}
38954359Sroberto