str.c revision 118399
154359Sroberto/*- 254359Sroberto * Copyright (c) 1991, 1993 354359Sroberto * The Regents of the University of California. All rights reserved. 454359Sroberto * 554359Sroberto * Redistribution and use in source and binary forms, with or without 654359Sroberto * modification, are permitted provided that the following conditions 754359Sroberto * are met: 854359Sroberto * 1. Redistributions of source code must retain the above copyright 954359Sroberto * notice, this list of conditions and the following disclaimer. 1082498Sroberto * 2. Redistributions in binary form must reproduce the above copyright 1182498Sroberto * notice, this list of conditions and the following disclaimer in the 1282498Sroberto * documentation and/or other materials provided with the distribution. 1382498Sroberto * 3. All advertising materials mentioning features or use of this software 1482498Sroberto * must display the following acknowledgement: 1582498Sroberto * This product includes software developed by the University of 1654359Sroberto * California, Berkeley and its contributors. 1754359Sroberto * 4. Neither the name of the University nor the names of its contributors 1854359Sroberto * may be used to endorse or promote products derived from this software 1954359Sroberto * without specific prior written permission. 2054359Sroberto * 2154359Sroberto * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 2254359Sroberto * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 2354359Sroberto * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 2454359Sroberto * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 2554359Sroberto * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 2654359Sroberto * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 2754359Sroberto * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 2854359Sroberto * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 2954359Sroberto * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 3054359Sroberto * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 3154359Sroberto * SUCH DAMAGE. 3254359Sroberto */ 3354359Sroberto 3454359Sroberto#include <sys/cdefs.h> 3554359Sroberto 3654359Sroberto__FBSDID("$FreeBSD: head/usr.bin/tr/str.c 118399 2003-08-03 22:02:49Z ache $"); 3754359Sroberto 3854359Sroberto#ifndef lint 3954359Srobertostatic const char sccsid[] = "@(#)str.c 8.2 (Berkeley) 4/28/95"; 4054359Sroberto#endif 4154359Sroberto 4254359Sroberto#include <sys/cdefs.h> 4354359Sroberto#include <sys/types.h> 4454359Sroberto 4554359Sroberto#include <ctype.h> 4654359Sroberto#include <err.h> 4754359Sroberto#include <stddef.h> 4854359Sroberto#include <stdio.h> 4954359Sroberto#include <stdlib.h> 5054359Sroberto#include <string.h> 5154359Sroberto 5254359Sroberto#include "extern.h" 5354359Sroberto 5454359Srobertostatic int backslash(STR *); 5554359Srobertostatic int bracket(STR *); 5654359Srobertostatic int c_class(const void *, const void *); 5754359Srobertostatic void genclass(STR *); 5854359Srobertostatic void genequiv(STR *); 5954359Srobertostatic int genrange(STR *); 6054359Srobertostatic void genseq(STR *); 6154359Sroberto 6254359Srobertoint 6354359Srobertonext(s) 6454359Sroberto STR *s; 6554359Sroberto{ 6654359Sroberto int ch; 6754359Sroberto 6854359Sroberto switch (s->state) { 6954359Sroberto case EOS: 7054359Sroberto return (0); 7154359Sroberto case INFINITE: 7254359Sroberto return (1); 7354359Sroberto case NORMAL: 7454359Sroberto switch (ch = (u_char)*s->str) { 7554359Sroberto case '\0': 7654359Sroberto s->state = EOS; 7754359Sroberto return (0); 7854359Sroberto case '\\': 7954359Sroberto s->lastch = backslash(s); 8054359Sroberto break; 8154359Sroberto case '[': 8254359Sroberto if (bracket(s)) 8354359Sroberto return (next(s)); 8454359Sroberto /* FALLTHROUGH */ 8554359Sroberto default: 8654359Sroberto ++s->str; 8754359Sroberto s->lastch = ch; 8854359Sroberto break; 8954359Sroberto } 9054359Sroberto 9154359Sroberto /* We can start a range at any time. */ 9254359Sroberto if (s->str[0] == '-' && genrange(s)) 9354359Sroberto return (next(s)); 9454359Sroberto return (1); 9554359Sroberto case SEQUENCE: 9654359Sroberto if (s->cnt-- == 0) { 9754359Sroberto s->state = NORMAL; 9854359Sroberto return (next(s)); 9954359Sroberto } 10054359Sroberto return (1); 10154359Sroberto case SET: 10254359Sroberto case SET_UPPER: 10354359Sroberto case SET_LOWER: 10454359Sroberto if ((ch = s->set[s->cnt++]) == OOBCH) { 10554359Sroberto s->state = NORMAL; 10654359Sroberto return (next(s)); 10754359Sroberto } 10854359Sroberto s->lastch = ch; 10954359Sroberto return (1); 11054359Sroberto default: 11154359Sroberto return (0); 11254359Sroberto } 11354359Sroberto /* NOTREACHED */ 11454359Sroberto} 11554359Sroberto 11654359Srobertostatic int 11754359Srobertobracket(s) 11854359Sroberto STR *s; 11954359Sroberto{ 12054359Sroberto char *p; 12154359Sroberto 12254359Sroberto switch (s->str[1]) { 12354359Sroberto case ':': /* "[:class:]" */ 12454359Sroberto if ((p = strchr(s->str + 2, ']')) == NULL) 12554359Sroberto return (0); 12654359Sroberto if (*(p - 1) != ':' || p - s->str < 4) 12754359Sroberto goto repeat; 12854359Sroberto *(p - 1) = '\0'; 12954359Sroberto s->str += 2; 13054359Sroberto genclass(s); 13154359Sroberto s->str = p + 1; 13254359Sroberto return (1); 13354359Sroberto case '=': /* "[=equiv=]" */ 13454359Sroberto if ((p = strchr(s->str + 2, ']')) == NULL) 13554359Sroberto return (0); 13654359Sroberto if (*(p - 1) != '=' || p - s->str < 4) 13754359Sroberto goto repeat; 13854359Sroberto s->str += 2; 13954359Sroberto genequiv(s); 14054359Sroberto return (1); 14154359Sroberto default: /* "[\###*n]" or "[#*n]" */ 14254359Sroberto repeat: 14354359Sroberto if ((p = strpbrk(s->str + 2, "*]")) == NULL) 14454359Sroberto return (0); 14582498Sroberto if (p[0] != '*' || index(p, ']') == NULL) 14654359Sroberto return (0); 14754359Sroberto s->str += 1; 14854359Sroberto genseq(s); 14954359Sroberto return (1); 15082498Sroberto } 15154359Sroberto /* NOTREACHED */ 15254359Sroberto} 15354359Sroberto 15454359Srobertotypedef struct { 15554359Sroberto const char *name; 15654359Sroberto int (*func)(int); 15754359Sroberto int *set; 15854359Sroberto} CLASS; 15954359Sroberto 16054359Srobertostatic CLASS classes[] = { 16154359Sroberto#undef isalnum 16254359Sroberto { "alnum", isalnum, NULL }, 16354359Sroberto#undef isalpha 16454359Sroberto { "alpha", isalpha, NULL }, 16554359Sroberto#undef isblank 16654359Sroberto { "blank", isblank, NULL }, 16754359Sroberto#undef iscntrl 16854359Sroberto { "cntrl", iscntrl, NULL }, 16954359Sroberto#undef isdigit 17054359Sroberto { "digit", isdigit, NULL }, 17154359Sroberto#undef isgraph 17254359Sroberto { "graph", isgraph, NULL }, 17354359Sroberto#undef islower 17454359Sroberto { "lower", islower, NULL }, 17554359Sroberto#undef isprint 17654359Sroberto { "print", isprint, NULL }, 17754359Sroberto#undef ispunct 17854359Sroberto { "punct", ispunct, NULL }, 17954359Sroberto#undef isspace 18054359Sroberto { "space", isspace, NULL }, 18154359Sroberto#undef isupper 18254359Sroberto { "upper", isupper, NULL }, 18354359Sroberto#undef isxdigit 18454359Sroberto { "xdigit", isxdigit, NULL }, 18554359Sroberto}; 18654359Sroberto 18754359Srobertostatic void 18854359Srobertogenclass(s) 18954359Sroberto STR *s; 19054359Sroberto{ 19154359Sroberto int cnt, (*func)(int); 19254359Sroberto CLASS *cp, tmp; 19354359Sroberto int *p, n; 19454359Sroberto 19554359Sroberto tmp.name = s->str; 19654359Sroberto if ((cp = (CLASS *)bsearch(&tmp, classes, sizeof(classes) / 19754359Sroberto sizeof(CLASS), sizeof(CLASS), c_class)) == NULL) 19854359Sroberto errx(1, "unknown class %s", s->str); 19954359Sroberto 20054359Sroberto if ((cp->set = p = malloc((NCHARS + 1) * sizeof(int))) == NULL) 20154359Sroberto err(1, "genclass() malloc"); 20254359Sroberto bzero(p, (NCHARS + 1) * sizeof(int)); 20354359Sroberto for (cnt = 0, func = cp->func; cnt < NCHARS; ++cnt) 20454359Sroberto if ((func)(cnt)) 20554359Sroberto *p++ = cnt; 20654359Sroberto *p = OOBCH; 20754359Sroberto n = p - cp->set; 20854359Sroberto 20954359Sroberto s->cnt = 0; 21054359Sroberto s->set = cp->set; 21154359Sroberto if (strcmp(s->str, "upper") == 0) 21254359Sroberto s->state = SET_UPPER; 21354359Sroberto else if (strcmp(s->str, "lower") == 0) { 21454359Sroberto s->state = SET_LOWER; 21554359Sroberto } else 21654359Sroberto s->state = SET; 21754359Sroberto if ((s->state == SET_LOWER || s->state == SET_UPPER) && n > 1) 21854359Sroberto mergesort(s->set, n, sizeof(*(s->set)), charcoll); 21954359Sroberto} 22054359Sroberto 22154359Srobertostatic int 22254359Srobertoc_class(a, b) 22354359Sroberto const void *a, *b; 22454359Sroberto{ 22554359Sroberto return (strcmp(((const CLASS *)a)->name, ((const CLASS *)b)->name)); 22654359Sroberto} 22754359Sroberto 22854359Srobertostatic void 22954359Srobertogenequiv(s) 23054359Sroberto STR *s; 23154359Sroberto{ 23254359Sroberto int i, p, pri; 23354359Sroberto char src[2], dst[3]; 23454359Sroberto 23554359Sroberto if (*s->str == '\\') { 23654359Sroberto s->equiv[0] = backslash(s); 23754359Sroberto if (*s->str != '=') 23854359Sroberto errx(1, "misplaced equivalence equals sign"); 23954359Sroberto s->str += 2; 24054359Sroberto } else { 24154359Sroberto s->equiv[0] = s->str[0]; 24254359Sroberto if (s->str[1] != '=') 24354359Sroberto errx(1, "misplaced equivalence equals sign"); 24454359Sroberto s->str += 3; 24554359Sroberto } 24654359Sroberto 24754359Sroberto /* 24854359Sroberto * Calculate the set of all characters in the same equivalence class 24954359Sroberto * as the specified character (they will have the same primary 25054359Sroberto * collation weights). 25154359Sroberto * XXX Knows too much about how strxfrm() is implemented. Assumes 25254359Sroberto * it fills the string with primary collation weight bytes. Only one- 25354359Sroberto * to-one mappings are supported. 25454359Sroberto */ 25554359Sroberto src[0] = s->equiv[0]; 25654359Sroberto src[1] = '\0'; 25754359Sroberto if (strxfrm(dst, src, sizeof(dst)) == 1) { 25854359Sroberto pri = (unsigned char)*dst; 25954359Sroberto for (p = 1, i = 1; i < NCHARS; i++) { 26054359Sroberto *src = i; 26154359Sroberto if (strxfrm(dst, src, sizeof(dst)) == 1 && pri && 26254359Sroberto pri == (unsigned char)*dst) 26354359Sroberto s->equiv[p++] = i; 26454359Sroberto } 26554359Sroberto s->equiv[p] = OOBCH; 26654359Sroberto } 26754359Sroberto 26854359Sroberto s->cnt = 0; 26954359Sroberto s->state = SET; 27054359Sroberto s->set = s->equiv; 27154359Sroberto} 27254359Sroberto 27354359Srobertostatic int 27454359Srobertogenrange(s) 27554359Sroberto STR *s; 27654359Sroberto{ 27754359Sroberto int stopval; 27854359Sroberto char *savestart; 27954359Sroberto int n, cnt, *p; 28054359Sroberto 28154359Sroberto savestart = s->str; 28254359Sroberto stopval = *++s->str == '\\' ? backslash(s) : (u_char)*s->str++; 28354359Sroberto if (charcoll((const void *)&stopval, (const void *)&(s->lastch)) < 0) { 28454359Sroberto s->str = savestart; 28554359Sroberto return (0); 28654359Sroberto } 28754359Sroberto if ((s->set = p = malloc((NCHARS + 1) * sizeof(int))) == NULL) 28854359Sroberto err(1, "genrange() malloc"); 28954359Sroberto bzero(p, (NCHARS + 1) * sizeof(int)); 29054359Sroberto for (cnt = 0; cnt < NCHARS; ++cnt) 29154359Sroberto if (charcoll((const void *)&cnt, (const void *)&(s->lastch)) >= 0 && 29254359Sroberto charcoll((const void *)&cnt, (const void *)&stopval) <= 0) 29354359Sroberto *p++ = cnt; 29454359Sroberto *p = OOBCH; 29554359Sroberto n = p - s->set; 29654359Sroberto 29754359Sroberto s->cnt = 0; 29854359Sroberto s->state = SET; 29954359Sroberto if (n > 1) 30054359Sroberto mergesort(s->set, n, sizeof(*(s->set)), charcoll); 30154359Sroberto return (1); 30254359Sroberto} 30354359Sroberto 30454359Srobertostatic void 30554359Srobertogenseq(s) 30654359Sroberto STR *s; 30754359Sroberto{ 30854359Sroberto char *ep; 30954359Sroberto 31054359Sroberto if (s->which == STRING1) 31154359Sroberto errx(1, "sequences only valid in string2"); 31254359Sroberto 31354359Sroberto if (*s->str == '\\') 31454359Sroberto s->lastch = backslash(s); 31554359Sroberto else 31654359Sroberto s->lastch = *s->str++; 31754359Sroberto if (*s->str != '*') 31854359Sroberto errx(1, "misplaced sequence asterisk"); 31954359Sroberto 32054359Sroberto switch (*++s->str) { 32154359Sroberto case '\\': 32254359Sroberto s->cnt = backslash(s); 32354359Sroberto break; 32454359Sroberto case ']': 32554359Sroberto s->cnt = 0; 32654359Sroberto ++s->str; 32754359Sroberto break; 32854359Sroberto default: 32954359Sroberto if (isdigit((u_char)*s->str)) { 33054359Sroberto s->cnt = strtol(s->str, &ep, 0); 33154359Sroberto if (*ep == ']') { 33254359Sroberto s->str = ep + 1; 33354359Sroberto break; 33454359Sroberto } 33554359Sroberto } 33654359Sroberto errx(1, "illegal sequence count"); 33754359Sroberto /* NOTREACHED */ 33854359Sroberto } 33954359Sroberto 34054359Sroberto s->state = s->cnt ? SEQUENCE : INFINITE; 34154359Sroberto} 34254359Sroberto 34354359Sroberto/* 34454359Sroberto * Translate \??? into a character. Up to 3 octal digits, if no digits either 34554359Sroberto * an escape code or a literal character. 34654359Sroberto */ 34754359Srobertostatic int 34854359Srobertobackslash(s) 34954359Sroberto STR *s; 35054359Sroberto{ 35154359Sroberto int ch, cnt, val; 35254359Sroberto 35354359Sroberto for (cnt = val = 0;;) { 35454359Sroberto ch = (u_char)*++s->str; 35554359Sroberto if (!isascii(ch) || !isdigit(ch)) 35654359Sroberto break; 35754359Sroberto val = val * 8 + ch - '0'; 35854359Sroberto if (++cnt == 3) { 35954359Sroberto ++s->str; 36054359Sroberto break; 36154359Sroberto } 36254359Sroberto } 36354359Sroberto if (cnt) 36454359Sroberto return (val); 36554359Sroberto if (ch != '\0') 36654359Sroberto ++s->str; 36754359Sroberto switch (ch) { 36854359Sroberto case 'a': /* escape characters */ 36954359Sroberto return ('\7'); 37054359Sroberto case 'b': 37154359Sroberto return ('\b'); 37254359Sroberto case 'f': 37354359Sroberto return ('\f'); 37454359Sroberto case 'n': 37554359Sroberto return ('\n'); 37654359Sroberto case 'r': 37754359Sroberto return ('\r'); 37854359Sroberto case 't': 37954359Sroberto return ('\t'); 38054359Sroberto case 'v': 38154359Sroberto return ('\13'); 38254359Sroberto case '\0': /* \" -> \ */ 38354359Sroberto s->state = EOS; 38454359Sroberto return ('\\'); 38554359Sroberto default: /* \x" -> x */ 38654359Sroberto return (ch); 38754359Sroberto } 38854359Sroberto} 38954359Sroberto