str.c revision 98215
11590Srgrimes/*- 21590Srgrimes * Copyright (c) 1991, 1993 31590Srgrimes * The Regents of the University of California. All rights reserved. 41590Srgrimes * 51590Srgrimes * Redistribution and use in source and binary forms, with or without 61590Srgrimes * modification, are permitted provided that the following conditions 71590Srgrimes * are met: 81590Srgrimes * 1. Redistributions of source code must retain the above copyright 91590Srgrimes * notice, this list of conditions and the following disclaimer. 101590Srgrimes * 2. Redistributions in binary form must reproduce the above copyright 111590Srgrimes * notice, this list of conditions and the following disclaimer in the 121590Srgrimes * documentation and/or other materials provided with the distribution. 131590Srgrimes * 3. All advertising materials mentioning features or use of this software 141590Srgrimes * must display the following acknowledgement: 151590Srgrimes * This product includes software developed by the University of 161590Srgrimes * California, Berkeley and its contributors. 171590Srgrimes * 4. Neither the name of the University nor the names of its contributors 181590Srgrimes * may be used to endorse or promote products derived from this software 191590Srgrimes * without specific prior written permission. 201590Srgrimes * 211590Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 221590Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 231590Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 241590Srgrimes * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 251590Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 261590Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 271590Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 281590Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 291590Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 301590Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 311590Srgrimes * SUCH DAMAGE. 321590Srgrimes */ 331590Srgrimes 3487705Smarkm#include <sys/cdefs.h> 3587705Smarkm 3687705Smarkm__FBSDID("$FreeBSD: head/usr.bin/tr/str.c 98215 2002-06-14 09:53:11Z tjr $"); 3787705Smarkm 381590Srgrimes#ifndef lint 3987705Smarkmstatic const char sccsid[] = "@(#)str.c 8.2 (Berkeley) 4/28/95"; 4028368Scharnier#endif 411590Srgrimes 421590Srgrimes#include <sys/cdefs.h> 431590Srgrimes#include <sys/types.h> 441590Srgrimes 4528368Scharnier#include <ctype.h> 4628368Scharnier#include <err.h> 471590Srgrimes#include <stddef.h> 481590Srgrimes#include <stdio.h> 491590Srgrimes#include <stdlib.h> 501590Srgrimes#include <string.h> 511590Srgrimes 521590Srgrimes#include "extern.h" 531590Srgrimes 5492922Simpstatic int backslash(STR *); 5592922Simpstatic int bracket(STR *); 5692922Simpstatic int c_class(const void *, const void *); 5792922Simpstatic void genclass(STR *); 5892922Simpstatic void genequiv(STR *); 5992922Simpstatic int genrange(STR *); 6092922Simpstatic void genseq(STR *); 611590Srgrimes 621590Srgrimesint 631590Srgrimesnext(s) 6487705Smarkm STR *s; 651590Srgrimes{ 6687705Smarkm int ch; 671590Srgrimes 681590Srgrimes switch (s->state) { 691590Srgrimes case EOS: 701590Srgrimes return (0); 711590Srgrimes case INFINITE: 721590Srgrimes return (1); 731590Srgrimes case NORMAL: 7414648Sjoerg switch (ch = (u_char)*s->str) { 751590Srgrimes case '\0': 761590Srgrimes s->state = EOS; 771590Srgrimes return (0); 781590Srgrimes case '\\': 791590Srgrimes s->lastch = backslash(s); 801590Srgrimes break; 811590Srgrimes case '[': 821590Srgrimes if (bracket(s)) 831590Srgrimes return (next(s)); 841590Srgrimes /* FALLTHROUGH */ 851590Srgrimes default: 861590Srgrimes ++s->str; 871590Srgrimes s->lastch = ch; 881590Srgrimes break; 891590Srgrimes } 901590Srgrimes 911590Srgrimes /* We can start a range at any time. */ 921590Srgrimes if (s->str[0] == '-' && genrange(s)) 931590Srgrimes return (next(s)); 941590Srgrimes return (1); 951590Srgrimes case RANGE: 961590Srgrimes if (s->cnt-- == 0) { 971590Srgrimes s->state = NORMAL; 981590Srgrimes return (next(s)); 991590Srgrimes } 1001590Srgrimes ++s->lastch; 1011590Srgrimes return (1); 1021590Srgrimes case SEQUENCE: 1031590Srgrimes if (s->cnt-- == 0) { 1041590Srgrimes s->state = NORMAL; 1051590Srgrimes return (next(s)); 1061590Srgrimes } 1071590Srgrimes return (1); 1081590Srgrimes case SET: 1091590Srgrimes if ((s->lastch = s->set[s->cnt++]) == OOBCH) { 1101590Srgrimes s->state = NORMAL; 1111590Srgrimes return (next(s)); 1121590Srgrimes } 1131590Srgrimes return (1); 11487705Smarkm default: 11587705Smarkm return (0); 1161590Srgrimes } 1171590Srgrimes /* NOTREACHED */ 1181590Srgrimes} 1191590Srgrimes 1201590Srgrimesstatic int 1211590Srgrimesbracket(s) 12287705Smarkm STR *s; 1231590Srgrimes{ 12487705Smarkm char *p; 1251590Srgrimes 1261590Srgrimes switch (s->str[1]) { 1271590Srgrimes case ':': /* "[:class:]" */ 1281590Srgrimes if ((p = strstr(s->str + 2, ":]")) == NULL) 1291590Srgrimes return (0); 1301590Srgrimes *p = '\0'; 1311590Srgrimes s->str += 2; 1321590Srgrimes genclass(s); 1331590Srgrimes s->str = p + 2; 1341590Srgrimes return (1); 1351590Srgrimes case '=': /* "[=equiv=]" */ 1361590Srgrimes if ((p = strstr(s->str + 2, "=]")) == NULL) 1371590Srgrimes return (0); 1381590Srgrimes s->str += 2; 1391590Srgrimes genequiv(s); 1401590Srgrimes return (1); 1411590Srgrimes default: /* "[\###*n]" or "[#*n]" */ 1421590Srgrimes if ((p = strpbrk(s->str + 2, "*]")) == NULL) 1431590Srgrimes return (0); 1441590Srgrimes if (p[0] != '*' || index(p, ']') == NULL) 1451590Srgrimes return (0); 1461590Srgrimes s->str += 1; 1471590Srgrimes genseq(s); 1481590Srgrimes return (1); 1491590Srgrimes } 1501590Srgrimes /* NOTREACHED */ 1511590Srgrimes} 1521590Srgrimes 1531590Srgrimestypedef struct { 15487705Smarkm const char *name; 15592922Simp int (*func)(int); 1561590Srgrimes int *set; 1571590Srgrimes} CLASS; 1581590Srgrimes 1591590Srgrimesstatic CLASS classes[] = { 16011895Sache#undef isalnum 16187705Smarkm { "alnum", isalnum, NULL }, 16211895Sache#undef isalpha 16387705Smarkm { "alpha", isalpha, NULL }, 16411895Sache#undef isblank 16587705Smarkm { "blank", isblank, NULL }, 16611895Sache#undef iscntrl 16787705Smarkm { "cntrl", iscntrl, NULL }, 16811895Sache#undef isdigit 16987705Smarkm { "digit", isdigit, NULL }, 17011895Sache#undef isgraph 17187705Smarkm { "graph", isgraph, NULL }, 17211895Sache#undef islower 17387705Smarkm { "lower", islower, NULL }, 17411895Sache#undef isprint 17587705Smarkm { "print", isprint, NULL }, 17611895Sache#undef ispunct 17787705Smarkm { "punct", ispunct, NULL }, 17811895Sache#undef isspace 17987705Smarkm { "space", isspace, NULL }, 18011895Sache#undef isupper 18187705Smarkm { "upper", isupper, NULL }, 18211895Sache#undef isxdigit 18387705Smarkm { "xdigit", isxdigit, NULL }, 1841590Srgrimes}; 1851590Srgrimes 1861590Srgrimesstatic void 1871590Srgrimesgenclass(s) 1881590Srgrimes STR *s; 1891590Srgrimes{ 19092922Simp int cnt, (*func)(int); 1911590Srgrimes CLASS *cp, tmp; 1921590Srgrimes int *p; 1931590Srgrimes 1941590Srgrimes tmp.name = s->str; 1951590Srgrimes if ((cp = (CLASS *)bsearch(&tmp, classes, sizeof(classes) / 1961590Srgrimes sizeof(CLASS), sizeof(CLASS), c_class)) == NULL) 19728368Scharnier errx(1, "unknown class %s", s->str); 1981590Srgrimes 1991590Srgrimes if ((cp->set = p = malloc((NCHARS + 1) * sizeof(int))) == NULL) 20028368Scharnier errx(1, "malloc"); 2011590Srgrimes bzero(p, (NCHARS + 1) * sizeof(int)); 2021590Srgrimes for (cnt = 0, func = cp->func; cnt < NCHARS; ++cnt) 2031590Srgrimes if ((func)(cnt)) 2041590Srgrimes *p++ = cnt; 2051590Srgrimes *p = OOBCH; 2061590Srgrimes 2071590Srgrimes s->cnt = 0; 2081590Srgrimes s->state = SET; 2091590Srgrimes s->set = cp->set; 2101590Srgrimes} 2111590Srgrimes 2121590Srgrimesstatic int 2131590Srgrimesc_class(a, b) 2141590Srgrimes const void *a, *b; 2151590Srgrimes{ 21687705Smarkm return (strcmp(((const CLASS *)a)->name, ((const CLASS *)b)->name)); 2171590Srgrimes} 2181590Srgrimes 2191590Srgrimesstatic void 2201590Srgrimesgenequiv(s) 2211590Srgrimes STR *s; 2221590Srgrimes{ 22398210Stjr int i, p, pri; 22498210Stjr char src[2], dst[3]; 22598210Stjr 2261590Srgrimes if (*s->str == '\\') { 2271590Srgrimes s->equiv[0] = backslash(s); 2281590Srgrimes if (*s->str != '=') 22928368Scharnier errx(1, "misplaced equivalence equals sign"); 23098215Stjr s->str += 2; 2311590Srgrimes } else { 2321590Srgrimes s->equiv[0] = s->str[0]; 2331590Srgrimes if (s->str[1] != '=') 23428368Scharnier errx(1, "misplaced equivalence equals sign"); 23598215Stjr s->str += 3; 2361590Srgrimes } 23798210Stjr 23898210Stjr /* 23998210Stjr * Calculate the set of all characters in the same equivalence class 24098210Stjr * as the specified character (they will have the same primary 24198210Stjr * collation weights). 24298210Stjr * XXX Knows too much about how strxfrm() is implemented. Assumes 24398210Stjr * it fills the string with primary collation weight bytes. Only one- 24498210Stjr * to-one mappings are supported. 24598210Stjr */ 24698210Stjr src[0] = s->equiv[0]; 24798210Stjr src[1] = '\0'; 24898210Stjr if (strxfrm(dst, src, sizeof(dst)) == 1) { 24998210Stjr pri = (unsigned char)*dst; 25098210Stjr for (p = 1, i = 1; i < NCHARS; i++) { 25198210Stjr *src = i; 25298210Stjr if (strxfrm(dst, src, sizeof(dst)) == 1 && pri && 25398210Stjr pri == (unsigned char)*dst) 25498210Stjr s->equiv[p++] = i; 25598210Stjr } 25698210Stjr s->equiv[p] = OOBCH; 25798210Stjr } 25898210Stjr 2591590Srgrimes s->cnt = 0; 2601590Srgrimes s->state = SET; 2611590Srgrimes s->set = s->equiv; 2621590Srgrimes} 2631590Srgrimes 2641590Srgrimesstatic int 2651590Srgrimesgenrange(s) 2661590Srgrimes STR *s; 2671590Srgrimes{ 2681590Srgrimes int stopval; 2691590Srgrimes char *savestart; 2701590Srgrimes 2711590Srgrimes savestart = s->str; 27214648Sjoerg stopval = *++s->str == '\\' ? backslash(s) : (u_char)*s->str++; 27312505Sbde if (stopval < (u_char)s->lastch) { 2741590Srgrimes s->str = savestart; 2751590Srgrimes return (0); 2761590Srgrimes } 2771590Srgrimes s->cnt = stopval - s->lastch + 1; 2781590Srgrimes s->state = RANGE; 2791590Srgrimes --s->lastch; 2801590Srgrimes return (1); 2811590Srgrimes} 2821590Srgrimes 2831590Srgrimesstatic void 2841590Srgrimesgenseq(s) 2851590Srgrimes STR *s; 2861590Srgrimes{ 2871590Srgrimes char *ep; 2881590Srgrimes 2891590Srgrimes if (s->which == STRING1) 29028368Scharnier errx(1, "sequences only valid in string2"); 2911590Srgrimes 2921590Srgrimes if (*s->str == '\\') 2931590Srgrimes s->lastch = backslash(s); 2941590Srgrimes else 2951590Srgrimes s->lastch = *s->str++; 2961590Srgrimes if (*s->str != '*') 29728368Scharnier errx(1, "misplaced sequence asterisk"); 2981590Srgrimes 2991590Srgrimes switch (*++s->str) { 3001590Srgrimes case '\\': 3011590Srgrimes s->cnt = backslash(s); 3021590Srgrimes break; 3031590Srgrimes case ']': 3041590Srgrimes s->cnt = 0; 3051590Srgrimes ++s->str; 3061590Srgrimes break; 3071590Srgrimes default: 30814720Sjoerg if (isdigit((u_char)*s->str)) { 3091590Srgrimes s->cnt = strtol(s->str, &ep, 0); 3101590Srgrimes if (*ep == ']') { 3111590Srgrimes s->str = ep + 1; 3121590Srgrimes break; 3131590Srgrimes } 3141590Srgrimes } 31528368Scharnier errx(1, "illegal sequence count"); 3161590Srgrimes /* NOTREACHED */ 3171590Srgrimes } 3181590Srgrimes 3191590Srgrimes s->state = s->cnt ? SEQUENCE : INFINITE; 3201590Srgrimes} 3211590Srgrimes 3221590Srgrimes/* 3231590Srgrimes * Translate \??? into a character. Up to 3 octal digits, if no digits either 3241590Srgrimes * an escape code or a literal character. 3251590Srgrimes */ 3261590Srgrimesstatic int 3271590Srgrimesbackslash(s) 32887705Smarkm STR *s; 3291590Srgrimes{ 33087705Smarkm int ch, cnt, val; 3311590Srgrimes 3321590Srgrimes for (cnt = val = 0;;) { 33314720Sjoerg ch = (u_char)*++s->str; 3341590Srgrimes if (!isascii(ch) || !isdigit(ch)) 3351590Srgrimes break; 3361590Srgrimes val = val * 8 + ch - '0'; 3371590Srgrimes if (++cnt == 3) { 3381590Srgrimes ++s->str; 3391590Srgrimes break; 3401590Srgrimes } 3411590Srgrimes } 3421590Srgrimes if (cnt) 3431590Srgrimes return (val); 3441590Srgrimes if (ch != '\0') 3451590Srgrimes ++s->str; 3461590Srgrimes switch (ch) { 3471590Srgrimes case 'a': /* escape characters */ 3481590Srgrimes return ('\7'); 3491590Srgrimes case 'b': 3501590Srgrimes return ('\b'); 3511590Srgrimes case 'f': 3521590Srgrimes return ('\f'); 3531590Srgrimes case 'n': 3541590Srgrimes return ('\n'); 3551590Srgrimes case 'r': 3561590Srgrimes return ('\r'); 3571590Srgrimes case 't': 3581590Srgrimes return ('\t'); 3591590Srgrimes case 'v': 3601590Srgrimes return ('\13'); 3611590Srgrimes case '\0': /* \" -> \ */ 3621590Srgrimes s->state = EOS; 3631590Srgrimes return ('\\'); 3641590Srgrimes default: /* \x" -> x */ 3651590Srgrimes return (ch); 3661590Srgrimes } 3671590Srgrimes} 368