1/* $OpenBSD: str.c,v 1.15 2023/05/04 16:08:29 tb Exp $ */ 2/* $NetBSD: str.c,v 1.7 1995/08/31 22:13:47 jtc Exp $ */ 3 4/*- 5 * Copyright (c) 1991, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 */ 32 33#include <sys/types.h> 34 35#include <assert.h> 36#include <errno.h> 37#include <stddef.h> 38#include <stdio.h> 39#include <stdlib.h> 40#include <string.h> 41#include <ctype.h> 42#include <err.h> 43 44#include "extern.h" 45 46static int backslash(STR *); 47static int bracket(STR *); 48static int c_class(const void *, const void *); 49static void genclass(STR *); 50static void genequiv(STR *); 51static int genrange(STR *); 52static void genseq(STR *); 53 54int 55next(STR *s) 56{ 57 int ch; 58 59 switch (s->state) { 60 case EOS: 61 return (0); 62 case INFINITE: 63 return (1); 64 case NORMAL: 65 switch (ch = *s->str) { 66 case '\0': 67 s->state = EOS; 68 return (0); 69 case '\\': 70 s->lastch = backslash(s); 71 break; 72 case '[': 73 if (bracket(s)) 74 return (next(s)); 75 /* FALLTHROUGH */ 76 default: 77 ++s->str; 78 s->lastch = ch; 79 break; 80 } 81 82 /* We can start a range at any time. */ 83 if (s->str[0] == '-' && genrange(s)) 84 return (next(s)); 85 return (1); 86 case RANGE: 87 if (s->cnt-- == 0) { 88 s->state = NORMAL; 89 return (next(s)); 90 } 91 ++s->lastch; 92 return (1); 93 case SEQUENCE: 94 if (s->cnt-- == 0) { 95 s->state = NORMAL; 96 return (next(s)); 97 } 98 return (1); 99 case SET: 100 if ((s->lastch = s->set[s->cnt++]) == OOBCH) { 101 s->state = NORMAL; 102 return (next(s)); 103 } 104 return (1); 105 default: 106 return 0; 107 } 108 /* NOTREACHED */ 109} 110 111static int 112bracket(STR *s) 113{ 114 char *p; 115 116 switch (s->str[1]) { 117 case ':': /* "[:class:]" */ 118 if ((p = strstr((char *)s->str + 2, ":]")) == NULL) 119 return (0); 120 *p = '\0'; 121 s->str += 2; 122 genclass(s); 123 s->str = (unsigned char *)p + 2; 124 return (1); 125 case '=': /* "[=equiv=]" */ 126 if ((p = strstr((char *)s->str + 2, "=]")) == NULL) 127 return (0); 128 s->str += 2; 129 genequiv(s); 130 return (1); 131 default: /* "[\###*n]" or "[#*n]" */ 132 if ((p = strpbrk((char *)s->str + 2, "*]")) == NULL) 133 return (0); 134 if (p[0] != '*' || strchr(p, ']') == NULL) 135 return (0); 136 s->str += 1; 137 genseq(s); 138 return (1); 139 } 140 /* NOTREACHED */ 141} 142 143typedef struct { 144 char *name; 145 int (*func)(int); 146 int *set; 147} CLASS; 148 149static CLASS classes[] = { 150 { "alnum", isalnum, }, 151 { "alpha", isalpha, }, 152 { "blank", isblank, }, 153 { "cntrl", iscntrl, }, 154 { "digit", isdigit, }, 155 { "graph", isgraph, }, 156 { "lower", islower, }, 157 { "print", isprint, }, 158 { "punct", ispunct, }, 159 { "space", isspace, }, 160 { "upper", isupper, }, 161 { "xdigit", isxdigit, }, 162}; 163 164static void 165genclass(STR *s) 166{ 167 CLASS *cp, tmp; 168 size_t len; 169 int i; 170 171 tmp.name = (char *)s->str; 172 if ((cp = (CLASS *)bsearch(&tmp, classes, sizeof(classes) / 173 sizeof(CLASS), sizeof(CLASS), c_class)) == NULL) 174 errx(1, "unknown class %s", s->str); 175 176 /* 177 * Generate the set of characters in the class if we haven't 178 * already done so. 179 */ 180 if (cp->set == NULL) { 181 cp->set = reallocarray(NULL, NCHARS + 1, sizeof(*cp->set)); 182 if (cp->set == NULL) 183 err(1, NULL); 184 len = 0; 185 for (i = 0; i < NCHARS; i++) { 186 if (cp->func(i)) { 187 cp->set[len] = i; 188 len++; 189 } 190 } 191 cp->set[len] = OOBCH; 192 len++; 193 cp->set = reallocarray(cp->set, len, sizeof(*cp->set)); 194 if (cp->set == NULL) 195 err(1, NULL); 196 } 197 198 s->cnt = 0; 199 s->state = SET; 200 s->set = cp->set; 201} 202 203static int 204c_class(const void *a, const void *b) 205{ 206 return (strcmp(((CLASS *)a)->name, ((CLASS *)b)->name)); 207} 208 209/* 210 * English doesn't have any equivalence classes, so for now 211 * we just syntax check and grab the character. 212 */ 213static void 214genequiv(STR *s) 215{ 216 if (*s->str == '\\') { 217 s->equiv[0] = backslash(s); 218 if (*s->str != '=') 219 errx(1, "misplaced equivalence equals sign"); 220 } else { 221 s->equiv[0] = s->str[0]; 222 if (s->str[1] != '=') 223 errx(1, "misplaced equivalence equals sign"); 224 } 225 s->str += 2; 226 s->cnt = 0; 227 s->state = SET; 228 s->set = s->equiv; 229} 230 231static int 232genrange(STR *s) 233{ 234 int stopval; 235 unsigned char *savestart; 236 237 savestart = s->str; 238 stopval = *++s->str == '\\' ? backslash(s) : *s->str++; 239 if (stopval < (u_char)s->lastch) { 240 s->str = savestart; 241 return (0); 242 } 243 s->cnt = stopval - s->lastch + 1; 244 s->state = RANGE; 245 --s->lastch; 246 return (1); 247} 248 249static void 250genseq(STR *s) 251{ 252 char *ep; 253 254 if (s->which == STRING1) 255 errx(1, "sequences only valid in string2"); 256 257 if (*s->str == '\\') 258 s->lastch = backslash(s); 259 else 260 s->lastch = *s->str++; 261 if (*s->str != '*') 262 errx(1, "misplaced sequence asterisk"); 263 264 switch (*++s->str) { 265 case '\\': 266 s->cnt = backslash(s); 267 break; 268 case ']': 269 s->cnt = 0; 270 ++s->str; 271 break; 272 default: 273 if (isdigit(*s->str)) { 274 s->cnt = strtol((char *)s->str, &ep, 0); 275 if (*ep == ']') { 276 s->str = (unsigned char *)ep + 1; 277 break; 278 } 279 } 280 errx(1, "illegal sequence count"); 281 /* NOTREACHED */ 282 } 283 284 s->state = s->cnt ? SEQUENCE : INFINITE; 285} 286 287/* 288 * Translate \??? into a character. Up to 3 octal digits, if no digits either 289 * an escape code or a literal character. 290 */ 291static int 292backslash(STR *s) 293{ 294 size_t i; 295 int ch, val; 296 297 assert(*s->str == '\\'); 298 s->str++; 299 300 /* Empty escapes become plain backslashes. */ 301 if (*s->str == '\0') { 302 s->state = EOS; 303 return ('\\'); 304 } 305 306 val = 0; 307 for (i = 0; i < 3; i++) { 308 if (s->str[i] < '0' || '7' < s->str[i]) 309 break; 310 val = val * 8 + s->str[i] - '0'; 311 } 312 if (i > 0) { 313 if (val > UCHAR_MAX) 314 errx(1, "octal value out of range: %d", val); 315 s->str += i; 316 return (val); 317 } 318 319 ch = *s->str++; 320 switch (ch) { 321 case 'a': /* escape characters */ 322 return ('\7'); 323 case 'b': 324 return ('\b'); 325 case 'f': 326 return ('\f'); 327 case 'n': 328 return ('\n'); 329 case 'r': 330 return ('\r'); 331 case 't': 332 return ('\t'); 333 case 'v': 334 return ('\13'); 335 default: /* \x" -> x */ 336 return (ch); 337 } 338} 339