str.c revision 98215
11590Srgrimes/*-
21590Srgrimes * Copyright (c) 1991, 1993
31590Srgrimes *	The Regents of the University of California.  All rights reserved.
41590Srgrimes *
51590Srgrimes * Redistribution and use in source and binary forms, with or without
61590Srgrimes * modification, are permitted provided that the following conditions
71590Srgrimes * are met:
81590Srgrimes * 1. Redistributions of source code must retain the above copyright
91590Srgrimes *    notice, this list of conditions and the following disclaimer.
101590Srgrimes * 2. Redistributions in binary form must reproduce the above copyright
111590Srgrimes *    notice, this list of conditions and the following disclaimer in the
121590Srgrimes *    documentation and/or other materials provided with the distribution.
131590Srgrimes * 3. All advertising materials mentioning features or use of this software
141590Srgrimes *    must display the following acknowledgement:
151590Srgrimes *	This product includes software developed by the University of
161590Srgrimes *	California, Berkeley and its contributors.
171590Srgrimes * 4. Neither the name of the University nor the names of its contributors
181590Srgrimes *    may be used to endorse or promote products derived from this software
191590Srgrimes *    without specific prior written permission.
201590Srgrimes *
211590Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
221590Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
231590Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
241590Srgrimes * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
251590Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
261590Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
271590Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
281590Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
291590Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
301590Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
311590Srgrimes * SUCH DAMAGE.
321590Srgrimes */
331590Srgrimes
3487705Smarkm#include <sys/cdefs.h>
3587705Smarkm
3687705Smarkm__FBSDID("$FreeBSD: head/usr.bin/tr/str.c 98215 2002-06-14 09:53:11Z tjr $");
3787705Smarkm
381590Srgrimes#ifndef lint
3987705Smarkmstatic const char sccsid[] = "@(#)str.c	8.2 (Berkeley) 4/28/95";
4028368Scharnier#endif
411590Srgrimes
421590Srgrimes#include <sys/cdefs.h>
431590Srgrimes#include <sys/types.h>
441590Srgrimes
4528368Scharnier#include <ctype.h>
4628368Scharnier#include <err.h>
471590Srgrimes#include <stddef.h>
481590Srgrimes#include <stdio.h>
491590Srgrimes#include <stdlib.h>
501590Srgrimes#include <string.h>
511590Srgrimes
521590Srgrimes#include "extern.h"
531590Srgrimes
5492922Simpstatic int	backslash(STR *);
5592922Simpstatic int	bracket(STR *);
5692922Simpstatic int	c_class(const void *, const void *);
5792922Simpstatic void	genclass(STR *);
5892922Simpstatic void	genequiv(STR *);
5992922Simpstatic int	genrange(STR *);
6092922Simpstatic void	genseq(STR *);
611590Srgrimes
621590Srgrimesint
631590Srgrimesnext(s)
6487705Smarkm	STR *s;
651590Srgrimes{
6687705Smarkm	int ch;
671590Srgrimes
681590Srgrimes	switch (s->state) {
691590Srgrimes	case EOS:
701590Srgrimes		return (0);
711590Srgrimes	case INFINITE:
721590Srgrimes		return (1);
731590Srgrimes	case NORMAL:
7414648Sjoerg		switch (ch = (u_char)*s->str) {
751590Srgrimes		case '\0':
761590Srgrimes			s->state = EOS;
771590Srgrimes			return (0);
781590Srgrimes		case '\\':
791590Srgrimes			s->lastch = backslash(s);
801590Srgrimes			break;
811590Srgrimes		case '[':
821590Srgrimes			if (bracket(s))
831590Srgrimes				return (next(s));
841590Srgrimes			/* FALLTHROUGH */
851590Srgrimes		default:
861590Srgrimes			++s->str;
871590Srgrimes			s->lastch = ch;
881590Srgrimes			break;
891590Srgrimes		}
901590Srgrimes
911590Srgrimes		/* We can start a range at any time. */
921590Srgrimes		if (s->str[0] == '-' && genrange(s))
931590Srgrimes			return (next(s));
941590Srgrimes		return (1);
951590Srgrimes	case RANGE:
961590Srgrimes		if (s->cnt-- == 0) {
971590Srgrimes			s->state = NORMAL;
981590Srgrimes			return (next(s));
991590Srgrimes		}
1001590Srgrimes		++s->lastch;
1011590Srgrimes		return (1);
1021590Srgrimes	case SEQUENCE:
1031590Srgrimes		if (s->cnt-- == 0) {
1041590Srgrimes			s->state = NORMAL;
1051590Srgrimes			return (next(s));
1061590Srgrimes		}
1071590Srgrimes		return (1);
1081590Srgrimes	case SET:
1091590Srgrimes		if ((s->lastch = s->set[s->cnt++]) == OOBCH) {
1101590Srgrimes			s->state = NORMAL;
1111590Srgrimes			return (next(s));
1121590Srgrimes		}
1131590Srgrimes		return (1);
11487705Smarkm	default:
11587705Smarkm		return (0);
1161590Srgrimes	}
1171590Srgrimes	/* NOTREACHED */
1181590Srgrimes}
1191590Srgrimes
1201590Srgrimesstatic int
1211590Srgrimesbracket(s)
12287705Smarkm	STR *s;
1231590Srgrimes{
12487705Smarkm	char *p;
1251590Srgrimes
1261590Srgrimes	switch (s->str[1]) {
1271590Srgrimes	case ':':				/* "[:class:]" */
1281590Srgrimes		if ((p = strstr(s->str + 2, ":]")) == NULL)
1291590Srgrimes			return (0);
1301590Srgrimes		*p = '\0';
1311590Srgrimes		s->str += 2;
1321590Srgrimes		genclass(s);
1331590Srgrimes		s->str = p + 2;
1341590Srgrimes		return (1);
1351590Srgrimes	case '=':				/* "[=equiv=]" */
1361590Srgrimes		if ((p = strstr(s->str + 2, "=]")) == NULL)
1371590Srgrimes			return (0);
1381590Srgrimes		s->str += 2;
1391590Srgrimes		genequiv(s);
1401590Srgrimes		return (1);
1411590Srgrimes	default:				/* "[\###*n]" or "[#*n]" */
1421590Srgrimes		if ((p = strpbrk(s->str + 2, "*]")) == NULL)
1431590Srgrimes			return (0);
1441590Srgrimes		if (p[0] != '*' || index(p, ']') == NULL)
1451590Srgrimes			return (0);
1461590Srgrimes		s->str += 1;
1471590Srgrimes		genseq(s);
1481590Srgrimes		return (1);
1491590Srgrimes	}
1501590Srgrimes	/* NOTREACHED */
1511590Srgrimes}
1521590Srgrimes
1531590Srgrimestypedef struct {
15487705Smarkm	const char *name;
15592922Simp	int (*func)(int);
1561590Srgrimes	int *set;
1571590Srgrimes} CLASS;
1581590Srgrimes
1591590Srgrimesstatic CLASS classes[] = {
16011895Sache#undef isalnum
16187705Smarkm	{ "alnum",  isalnum,  NULL },
16211895Sache#undef isalpha
16387705Smarkm	{ "alpha",  isalpha,  NULL },
16411895Sache#undef isblank
16587705Smarkm	{ "blank",  isblank,  NULL },
16611895Sache#undef iscntrl
16787705Smarkm	{ "cntrl",  iscntrl,  NULL },
16811895Sache#undef isdigit
16987705Smarkm	{ "digit",  isdigit,  NULL },
17011895Sache#undef isgraph
17187705Smarkm	{ "graph",  isgraph,  NULL },
17211895Sache#undef islower
17387705Smarkm	{ "lower",  islower,  NULL },
17411895Sache#undef isprint
17587705Smarkm	{ "print",  isprint,  NULL },
17611895Sache#undef ispunct
17787705Smarkm	{ "punct",  ispunct,  NULL },
17811895Sache#undef isspace
17987705Smarkm	{ "space",  isspace,  NULL },
18011895Sache#undef isupper
18187705Smarkm	{ "upper",  isupper,  NULL },
18211895Sache#undef isxdigit
18387705Smarkm	{ "xdigit", isxdigit, NULL },
1841590Srgrimes};
1851590Srgrimes
1861590Srgrimesstatic void
1871590Srgrimesgenclass(s)
1881590Srgrimes	STR *s;
1891590Srgrimes{
19092922Simp	int cnt, (*func)(int);
1911590Srgrimes	CLASS *cp, tmp;
1921590Srgrimes	int *p;
1931590Srgrimes
1941590Srgrimes	tmp.name = s->str;
1951590Srgrimes	if ((cp = (CLASS *)bsearch(&tmp, classes, sizeof(classes) /
1961590Srgrimes	    sizeof(CLASS), sizeof(CLASS), c_class)) == NULL)
19728368Scharnier		errx(1, "unknown class %s", s->str);
1981590Srgrimes
1991590Srgrimes	if ((cp->set = p = malloc((NCHARS + 1) * sizeof(int))) == NULL)
20028368Scharnier		errx(1, "malloc");
2011590Srgrimes	bzero(p, (NCHARS + 1) * sizeof(int));
2021590Srgrimes	for (cnt = 0, func = cp->func; cnt < NCHARS; ++cnt)
2031590Srgrimes		if ((func)(cnt))
2041590Srgrimes			*p++ = cnt;
2051590Srgrimes	*p = OOBCH;
2061590Srgrimes
2071590Srgrimes	s->cnt = 0;
2081590Srgrimes	s->state = SET;
2091590Srgrimes	s->set = cp->set;
2101590Srgrimes}
2111590Srgrimes
2121590Srgrimesstatic int
2131590Srgrimesc_class(a, b)
2141590Srgrimes	const void *a, *b;
2151590Srgrimes{
21687705Smarkm	return (strcmp(((const CLASS *)a)->name, ((const CLASS *)b)->name));
2171590Srgrimes}
2181590Srgrimes
2191590Srgrimesstatic void
2201590Srgrimesgenequiv(s)
2211590Srgrimes	STR *s;
2221590Srgrimes{
22398210Stjr	int i, p, pri;
22498210Stjr	char src[2], dst[3];
22598210Stjr
2261590Srgrimes	if (*s->str == '\\') {
2271590Srgrimes		s->equiv[0] = backslash(s);
2281590Srgrimes		if (*s->str != '=')
22928368Scharnier			errx(1, "misplaced equivalence equals sign");
23098215Stjr		s->str += 2;
2311590Srgrimes	} else {
2321590Srgrimes		s->equiv[0] = s->str[0];
2331590Srgrimes		if (s->str[1] != '=')
23428368Scharnier			errx(1, "misplaced equivalence equals sign");
23598215Stjr		s->str += 3;
2361590Srgrimes	}
23798210Stjr
23898210Stjr	/*
23998210Stjr	 * Calculate the set of all characters in the same equivalence class
24098210Stjr	 * as the specified character (they will have the same primary
24198210Stjr	 * collation weights).
24298210Stjr	 * XXX Knows too much about how strxfrm() is implemented. Assumes
24398210Stjr	 * it fills the string with primary collation weight bytes. Only one-
24498210Stjr	 * to-one mappings are supported.
24598210Stjr	 */
24698210Stjr	src[0] = s->equiv[0];
24798210Stjr	src[1] = '\0';
24898210Stjr	if (strxfrm(dst, src, sizeof(dst)) == 1) {
24998210Stjr		pri = (unsigned char)*dst;
25098210Stjr		for (p = 1, i = 1; i < NCHARS; i++) {
25198210Stjr			*src = i;
25298210Stjr			if (strxfrm(dst, src, sizeof(dst)) == 1 && pri &&
25398210Stjr			    pri == (unsigned char)*dst)
25498210Stjr				s->equiv[p++] = i;
25598210Stjr		}
25698210Stjr		s->equiv[p] = OOBCH;
25798210Stjr	}
25898210Stjr
2591590Srgrimes	s->cnt = 0;
2601590Srgrimes	s->state = SET;
2611590Srgrimes	s->set = s->equiv;
2621590Srgrimes}
2631590Srgrimes
2641590Srgrimesstatic int
2651590Srgrimesgenrange(s)
2661590Srgrimes	STR *s;
2671590Srgrimes{
2681590Srgrimes	int stopval;
2691590Srgrimes	char *savestart;
2701590Srgrimes
2711590Srgrimes	savestart = s->str;
27214648Sjoerg	stopval = *++s->str == '\\' ? backslash(s) : (u_char)*s->str++;
27312505Sbde	if (stopval < (u_char)s->lastch) {
2741590Srgrimes		s->str = savestart;
2751590Srgrimes		return (0);
2761590Srgrimes	}
2771590Srgrimes	s->cnt = stopval - s->lastch + 1;
2781590Srgrimes	s->state = RANGE;
2791590Srgrimes	--s->lastch;
2801590Srgrimes	return (1);
2811590Srgrimes}
2821590Srgrimes
2831590Srgrimesstatic void
2841590Srgrimesgenseq(s)
2851590Srgrimes	STR *s;
2861590Srgrimes{
2871590Srgrimes	char *ep;
2881590Srgrimes
2891590Srgrimes	if (s->which == STRING1)
29028368Scharnier		errx(1, "sequences only valid in string2");
2911590Srgrimes
2921590Srgrimes	if (*s->str == '\\')
2931590Srgrimes		s->lastch = backslash(s);
2941590Srgrimes	else
2951590Srgrimes		s->lastch = *s->str++;
2961590Srgrimes	if (*s->str != '*')
29728368Scharnier		errx(1, "misplaced sequence asterisk");
2981590Srgrimes
2991590Srgrimes	switch (*++s->str) {
3001590Srgrimes	case '\\':
3011590Srgrimes		s->cnt = backslash(s);
3021590Srgrimes		break;
3031590Srgrimes	case ']':
3041590Srgrimes		s->cnt = 0;
3051590Srgrimes		++s->str;
3061590Srgrimes		break;
3071590Srgrimes	default:
30814720Sjoerg		if (isdigit((u_char)*s->str)) {
3091590Srgrimes			s->cnt = strtol(s->str, &ep, 0);
3101590Srgrimes			if (*ep == ']') {
3111590Srgrimes				s->str = ep + 1;
3121590Srgrimes				break;
3131590Srgrimes			}
3141590Srgrimes		}
31528368Scharnier		errx(1, "illegal sequence count");
3161590Srgrimes		/* NOTREACHED */
3171590Srgrimes	}
3181590Srgrimes
3191590Srgrimes	s->state = s->cnt ? SEQUENCE : INFINITE;
3201590Srgrimes}
3211590Srgrimes
3221590Srgrimes/*
3231590Srgrimes * Translate \??? into a character.  Up to 3 octal digits, if no digits either
3241590Srgrimes * an escape code or a literal character.
3251590Srgrimes */
3261590Srgrimesstatic int
3271590Srgrimesbackslash(s)
32887705Smarkm	STR *s;
3291590Srgrimes{
33087705Smarkm	int ch, cnt, val;
3311590Srgrimes
3321590Srgrimes	for (cnt = val = 0;;) {
33314720Sjoerg		ch = (u_char)*++s->str;
3341590Srgrimes		if (!isascii(ch) || !isdigit(ch))
3351590Srgrimes			break;
3361590Srgrimes		val = val * 8 + ch - '0';
3371590Srgrimes		if (++cnt == 3) {
3381590Srgrimes			++s->str;
3391590Srgrimes			break;
3401590Srgrimes		}
3411590Srgrimes	}
3421590Srgrimes	if (cnt)
3431590Srgrimes		return (val);
3441590Srgrimes	if (ch != '\0')
3451590Srgrimes		++s->str;
3461590Srgrimes	switch (ch) {
3471590Srgrimes		case 'a':			/* escape characters */
3481590Srgrimes			return ('\7');
3491590Srgrimes		case 'b':
3501590Srgrimes			return ('\b');
3511590Srgrimes		case 'f':
3521590Srgrimes			return ('\f');
3531590Srgrimes		case 'n':
3541590Srgrimes			return ('\n');
3551590Srgrimes		case 'r':
3561590Srgrimes			return ('\r');
3571590Srgrimes		case 't':
3581590Srgrimes			return ('\t');
3591590Srgrimes		case 'v':
3601590Srgrimes			return ('\13');
3611590Srgrimes		case '\0':			/*  \" -> \ */
3621590Srgrimes			s->state = EOS;
3631590Srgrimes			return ('\\');
3641590Srgrimes		default:			/* \x" -> x */
3651590Srgrimes			return (ch);
3661590Srgrimes	}
3671590Srgrimes}
368