11590Srgrimes/*- 21590Srgrimes * Copyright (c) 1991, 1993 31590Srgrimes * The Regents of the University of California. All rights reserved. 41590Srgrimes * 51590Srgrimes * Redistribution and use in source and binary forms, with or without 61590Srgrimes * modification, are permitted provided that the following conditions 71590Srgrimes * are met: 81590Srgrimes * 1. Redistributions of source code must retain the above copyright 91590Srgrimes * notice, this list of conditions and the following disclaimer. 101590Srgrimes * 2. Redistributions in binary form must reproduce the above copyright 111590Srgrimes * notice, this list of conditions and the following disclaimer in the 121590Srgrimes * documentation and/or other materials provided with the distribution. 131590Srgrimes * 4. Neither the name of the University nor the names of its contributors 141590Srgrimes * may be used to endorse or promote products derived from this software 151590Srgrimes * without specific prior written permission. 161590Srgrimes * 171590Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 181590Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 191590Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 201590Srgrimes * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 211590Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 221590Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 231590Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 241590Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 251590Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 261590Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 271590Srgrimes * SUCH DAMAGE. 281590Srgrimes */ 291590Srgrimes 3087705Smarkm#include <sys/cdefs.h> 3187705Smarkm 3287705Smarkm__FBSDID("$FreeBSD$"); 3387705Smarkm 341590Srgrimes#ifndef lint 3587705Smarkmstatic const char sccsid[] = "@(#)str.c 8.2 (Berkeley) 4/28/95"; 3628368Scharnier#endif 371590Srgrimes 381590Srgrimes#include <sys/types.h> 391590Srgrimes 4028368Scharnier#include <ctype.h> 4128368Scharnier#include <err.h> 42131846Stjr#include <errno.h> 43200462Sdelphij#include <stddef.h> 44200462Sdelphij#include <stdio.h> 451590Srgrimes#include <stdlib.h> 461590Srgrimes#include <string.h> 47131846Stjr#include <wchar.h> 48131846Stjr#include <wctype.h> 491590Srgrimes 501590Srgrimes#include "extern.h" 511590Srgrimes 52118412Sachestatic int backslash(STR *, int *); 5392922Simpstatic int bracket(STR *); 5492922Simpstatic void genclass(STR *); 5592922Simpstatic void genequiv(STR *); 56118412Sachestatic int genrange(STR *, int); 5792922Simpstatic void genseq(STR *); 581590Srgrimes 59131846Stjrwint_t 60226360Sednext(STR *s) 611590Srgrimes{ 62131846Stjr int is_octal; 63131846Stjr wint_t ch; 64131846Stjr wchar_t wch; 65131846Stjr size_t clen; 661590Srgrimes 671590Srgrimes switch (s->state) { 681590Srgrimes case EOS: 691590Srgrimes return (0); 701590Srgrimes case INFINITE: 711590Srgrimes return (1); 721590Srgrimes case NORMAL: 73131846Stjr switch (*s->str) { 741590Srgrimes case '\0': 751590Srgrimes s->state = EOS; 761590Srgrimes return (0); 771590Srgrimes case '\\': 78118412Sache s->lastch = backslash(s, &is_octal); 791590Srgrimes break; 801590Srgrimes case '[': 811590Srgrimes if (bracket(s)) 821590Srgrimes return (next(s)); 831590Srgrimes /* FALLTHROUGH */ 841590Srgrimes default: 85131846Stjr clen = mbrtowc(&wch, s->str, MB_LEN_MAX, NULL); 86131846Stjr if (clen == (size_t)-1 || clen == (size_t)-2 || 87131846Stjr clen == 0) 88131846Stjr errc(1, EILSEQ, NULL); 89118412Sache is_octal = 0; 90131846Stjr s->lastch = wch; 91131846Stjr s->str += clen; 921590Srgrimes break; 931590Srgrimes } 941590Srgrimes 951590Srgrimes /* We can start a range at any time. */ 96118412Sache if (s->str[0] == '-' && genrange(s, is_octal)) 971590Srgrimes return (next(s)); 981590Srgrimes return (1); 99118415Sache case RANGE: 100118415Sache if (s->cnt-- == 0) { 101118415Sache s->state = NORMAL; 102118415Sache return (next(s)); 103118415Sache } 104118415Sache ++s->lastch; 105118415Sache return (1); 1061590Srgrimes case SEQUENCE: 1071590Srgrimes if (s->cnt-- == 0) { 1081590Srgrimes s->state = NORMAL; 1091590Srgrimes return (next(s)); 1101590Srgrimes } 1111590Srgrimes return (1); 112131846Stjr case CCLASS: 113131846Stjr case CCLASS_UPPER: 114131846Stjr case CCLASS_LOWER: 115131846Stjr s->cnt++; 116131846Stjr ch = nextwctype(s->lastch, s->cclass); 117131846Stjr if (ch == -1) { 118131846Stjr s->state = NORMAL; 119131846Stjr return (next(s)); 120131846Stjr } 121131846Stjr s->lastch = ch; 122131846Stjr return (1); 1231590Srgrimes case SET: 124118399Sache if ((ch = s->set[s->cnt++]) == OOBCH) { 1251590Srgrimes s->state = NORMAL; 1261590Srgrimes return (next(s)); 1271590Srgrimes } 128118399Sache s->lastch = ch; 1291590Srgrimes return (1); 13087705Smarkm default: 13187705Smarkm return (0); 1321590Srgrimes } 1331590Srgrimes /* NOTREACHED */ 1341590Srgrimes} 1351590Srgrimes 1361590Srgrimesstatic int 137226360Sedbracket(STR *s) 1381590Srgrimes{ 13987705Smarkm char *p; 1401590Srgrimes 1411590Srgrimes switch (s->str[1]) { 1421590Srgrimes case ':': /* "[:class:]" */ 14398242Stjr if ((p = strchr(s->str + 2, ']')) == NULL) 1441590Srgrimes return (0); 14598242Stjr if (*(p - 1) != ':' || p - s->str < 4) 14698242Stjr goto repeat; 14798242Stjr *(p - 1) = '\0'; 1481590Srgrimes s->str += 2; 1491590Srgrimes genclass(s); 15098242Stjr s->str = p + 1; 1511590Srgrimes return (1); 1521590Srgrimes case '=': /* "[=equiv=]" */ 153213284Sjilles if (s->str[2] == '\0' || (p = strchr(s->str + 3, ']')) == NULL) 1541590Srgrimes return (0); 15598242Stjr if (*(p - 1) != '=' || p - s->str < 4) 15698242Stjr goto repeat; 1571590Srgrimes s->str += 2; 1581590Srgrimes genequiv(s); 1591590Srgrimes return (1); 1601590Srgrimes default: /* "[\###*n]" or "[#*n]" */ 16198242Stjr repeat: 1621590Srgrimes if ((p = strpbrk(s->str + 2, "*]")) == NULL) 1631590Srgrimes return (0); 164229403Sed if (p[0] != '*' || strchr(p, ']') == NULL) 1651590Srgrimes return (0); 1661590Srgrimes s->str += 1; 1671590Srgrimes genseq(s); 1681590Srgrimes return (1); 1691590Srgrimes } 1701590Srgrimes /* NOTREACHED */ 1711590Srgrimes} 1721590Srgrimes 1731590Srgrimesstatic void 174226360Sedgenclass(STR *s) 1751590Srgrimes{ 1761590Srgrimes 177131846Stjr if ((s->cclass = wctype(s->str)) == 0) 17828368Scharnier errx(1, "unknown class %s", s->str); 1791590Srgrimes s->cnt = 0; 180131846Stjr s->lastch = -1; /* incremented before check in next() */ 181118371Sache if (strcmp(s->str, "upper") == 0) 182131846Stjr s->state = CCLASS_UPPER; 183118475Sache else if (strcmp(s->str, "lower") == 0) 184131846Stjr s->state = CCLASS_LOWER; 185118475Sache else 186131846Stjr s->state = CCLASS; 1871590Srgrimes} 1881590Srgrimes 1891590Srgrimesstatic void 190226360Sedgenequiv(STR *s) 1911590Srgrimes{ 19298210Stjr int i, p, pri; 19398210Stjr char src[2], dst[3]; 194131846Stjr size_t clen; 195131846Stjr wchar_t wc; 19698210Stjr 1971590Srgrimes if (*s->str == '\\') { 198118412Sache s->equiv[0] = backslash(s, NULL); 1991590Srgrimes if (*s->str != '=') 20028368Scharnier errx(1, "misplaced equivalence equals sign"); 20198215Stjr s->str += 2; 2021590Srgrimes } else { 203131846Stjr clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL); 204131846Stjr if (clen == (size_t)-1 || clen == (size_t)-2 || clen == 0) 205131846Stjr errc(1, EILSEQ, NULL); 206131846Stjr s->equiv[0] = wc; 207131846Stjr if (s->str[clen] != '=') 20828368Scharnier errx(1, "misplaced equivalence equals sign"); 209131846Stjr s->str += clen + 2; 2101590Srgrimes } 21198210Stjr 21298210Stjr /* 21398210Stjr * Calculate the set of all characters in the same equivalence class 21498210Stjr * as the specified character (they will have the same primary 21598210Stjr * collation weights). 21698210Stjr * XXX Knows too much about how strxfrm() is implemented. Assumes 21798210Stjr * it fills the string with primary collation weight bytes. Only one- 21898210Stjr * to-one mappings are supported. 219131846Stjr * XXX Equivalence classes not supported in multibyte locales. 22098210Stjr */ 221131846Stjr src[0] = (char)s->equiv[0]; 22298210Stjr src[1] = '\0'; 223131846Stjr if (MB_CUR_MAX == 1 && strxfrm(dst, src, sizeof(dst)) == 1) { 22498210Stjr pri = (unsigned char)*dst; 225131846Stjr for (p = 1, i = 1; i < NCHARS_SB; i++) { 22698210Stjr *src = i; 22798210Stjr if (strxfrm(dst, src, sizeof(dst)) == 1 && pri && 22898210Stjr pri == (unsigned char)*dst) 22998210Stjr s->equiv[p++] = i; 23098210Stjr } 23198210Stjr s->equiv[p] = OOBCH; 23298210Stjr } 23398210Stjr 2341590Srgrimes s->cnt = 0; 2351590Srgrimes s->state = SET; 2361590Srgrimes s->set = s->equiv; 2371590Srgrimes} 2381590Srgrimes 2391590Srgrimesstatic int 240118412Sachegenrange(STR *s, int was_octal) 2411590Srgrimes{ 242118412Sache int stopval, octal; 2431590Srgrimes char *savestart; 244118372Sache int n, cnt, *p; 245131846Stjr size_t clen; 246131846Stjr wchar_t wc; 2471590Srgrimes 248118412Sache octal = 0; 2491590Srgrimes savestart = s->str; 250131846Stjr if (*++s->str == '\\') 251131846Stjr stopval = backslash(s, &octal); 252131846Stjr else { 253131846Stjr clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL); 254131846Stjr if (clen == (size_t)-1 || clen == (size_t)-2) 255131846Stjr errc(1, EILSEQ, NULL); 256131846Stjr stopval = wc; 257131846Stjr s->str += clen; 2581590Srgrimes } 259131846Stjr /* 260131846Stjr * XXX Characters are not ordered according to collating sequence in 261131846Stjr * multibyte locales. 262131846Stjr */ 263131846Stjr if (octal || was_octal || MB_CUR_MAX > 1) { 264131846Stjr if (stopval < s->lastch) { 265131846Stjr s->str = savestart; 266131846Stjr return (0); 267131846Stjr } 268118415Sache s->cnt = stopval - s->lastch + 1; 269118415Sache s->state = RANGE; 270118415Sache --s->lastch; 271118415Sache return (1); 272118415Sache } 273131846Stjr if (charcoll((const void *)&stopval, (const void *)&(s->lastch)) < 0) { 274131846Stjr s->str = savestart; 275131846Stjr return (0); 276131846Stjr } 277131846Stjr if ((s->set = p = malloc((NCHARS_SB + 1) * sizeof(int))) == NULL) 278118372Sache err(1, "genrange() malloc"); 279131846Stjr for (cnt = 0; cnt < NCHARS_SB; cnt++) 280118415Sache if (charcoll((const void *)&cnt, (const void *)&(s->lastch)) >= 0 && 281118415Sache charcoll((const void *)&cnt, (const void *)&stopval) <= 0) 282118372Sache *p++ = cnt; 283118372Sache *p = OOBCH; 284118372Sache n = p - s->set; 285118372Sache 286118372Sache s->cnt = 0; 287118372Sache s->state = SET; 288118415Sache if (n > 1) 289118372Sache mergesort(s->set, n, sizeof(*(s->set)), charcoll); 2901590Srgrimes return (1); 2911590Srgrimes} 2921590Srgrimes 2931590Srgrimesstatic void 294226360Sedgenseq(STR *s) 2951590Srgrimes{ 2961590Srgrimes char *ep; 297131846Stjr wchar_t wc; 298131846Stjr size_t clen; 2991590Srgrimes 3001590Srgrimes if (s->which == STRING1) 30128368Scharnier errx(1, "sequences only valid in string2"); 3021590Srgrimes 3031590Srgrimes if (*s->str == '\\') 304118412Sache s->lastch = backslash(s, NULL); 305131846Stjr else { 306131846Stjr clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL); 307131846Stjr if (clen == (size_t)-1 || clen == (size_t)-2) 308131846Stjr errc(1, EILSEQ, NULL); 309131846Stjr s->lastch = wc; 310131846Stjr s->str += clen; 311131846Stjr } 3121590Srgrimes if (*s->str != '*') 31328368Scharnier errx(1, "misplaced sequence asterisk"); 3141590Srgrimes 3151590Srgrimes switch (*++s->str) { 3161590Srgrimes case '\\': 317118412Sache s->cnt = backslash(s, NULL); 3181590Srgrimes break; 3191590Srgrimes case ']': 3201590Srgrimes s->cnt = 0; 3211590Srgrimes ++s->str; 3221590Srgrimes break; 3231590Srgrimes default: 32414720Sjoerg if (isdigit((u_char)*s->str)) { 3251590Srgrimes s->cnt = strtol(s->str, &ep, 0); 3261590Srgrimes if (*ep == ']') { 3271590Srgrimes s->str = ep + 1; 3281590Srgrimes break; 3291590Srgrimes } 3301590Srgrimes } 33128368Scharnier errx(1, "illegal sequence count"); 3321590Srgrimes /* NOTREACHED */ 3331590Srgrimes } 3341590Srgrimes 3351590Srgrimes s->state = s->cnt ? SEQUENCE : INFINITE; 3361590Srgrimes} 3371590Srgrimes 3381590Srgrimes/* 3391590Srgrimes * Translate \??? into a character. Up to 3 octal digits, if no digits either 3401590Srgrimes * an escape code or a literal character. 3411590Srgrimes */ 3421590Srgrimesstatic int 343118412Sachebackslash(STR *s, int *is_octal) 3441590Srgrimes{ 34587705Smarkm int ch, cnt, val; 3461590Srgrimes 347118412Sache if (is_octal != NULL) 348118412Sache *is_octal = 0; 3491590Srgrimes for (cnt = val = 0;;) { 35014720Sjoerg ch = (u_char)*++s->str; 351137685Sjkh if (!isdigit(ch) || ch > '7') 3521590Srgrimes break; 3531590Srgrimes val = val * 8 + ch - '0'; 3541590Srgrimes if (++cnt == 3) { 3551590Srgrimes ++s->str; 3561590Srgrimes break; 3571590Srgrimes } 3581590Srgrimes } 359118412Sache if (cnt) { 360118412Sache if (is_octal != NULL) 361118412Sache *is_octal = 1; 3621590Srgrimes return (val); 363118412Sache } 3641590Srgrimes if (ch != '\0') 3651590Srgrimes ++s->str; 3661590Srgrimes switch (ch) { 3671590Srgrimes case 'a': /* escape characters */ 3681590Srgrimes return ('\7'); 3691590Srgrimes case 'b': 3701590Srgrimes return ('\b'); 3711590Srgrimes case 'f': 3721590Srgrimes return ('\f'); 3731590Srgrimes case 'n': 3741590Srgrimes return ('\n'); 3751590Srgrimes case 'r': 3761590Srgrimes return ('\r'); 3771590Srgrimes case 't': 3781590Srgrimes return ('\t'); 3791590Srgrimes case 'v': 3801590Srgrimes return ('\13'); 3811590Srgrimes case '\0': /* \" -> \ */ 3821590Srgrimes s->state = EOS; 3831590Srgrimes return ('\\'); 3841590Srgrimes default: /* \x" -> x */ 3851590Srgrimes return (ch); 3861590Srgrimes } 3871590Srgrimes} 388