1/* $NetBSD: str.c,v 1.30 2018/05/26 11:20:30 leot Exp $ */ 2 3/*- 4 * Copyright (c) 1991, 1993 5 * The Regents of the University of California. All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 3. Neither the name of the University nor the names of its contributors 16 * may be used to endorse or promote products derived from this software 17 * without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 */ 31 32#include <sys/cdefs.h> 33#ifndef lint 34#if 0 35static char sccsid[] = "@(#)str.c 8.2 (Berkeley) 4/28/95"; 36#endif 37__RCSID("$NetBSD: str.c,v 1.30 2018/05/26 11:20:30 leot Exp $"); 38#endif /* not lint */ 39 40#include <sys/types.h> 41 42#include <err.h> 43#include <errno.h> 44#include <stddef.h> 45#include <stdio.h> 46#include <stdlib.h> 47#include <string.h> 48#include <ctype.h> 49#include <assert.h> 50 51#include "extern.h" 52 53struct str { 54 enum { STRING1, STRING2 } which; 55 enum { EOS, INFINITE, NORMAL, RANGE, SEQUENCE, SET } state; 56 int cnt; /* character count */ 57 int lastch; /* last character */ 58 int equiv[2]; /* equivalence set */ 59 int *set; /* set of characters */ 60 const char *str; /* user's string */ 61}; 62 63static int backslash(STR *); 64static int bracket(STR *); 65static int c_class(const void *, const void *); 66static int *genclass(const char *, size_t); 67static void genequiv(STR *); 68static int genrange(STR *); 69static void genseq(STR *); 70 71STR * 72str_create(int whichstring, const char *txt) 73{ 74 STR *s; 75 76 s = malloc(sizeof(*s)); 77 if (s == NULL) { 78 err(1, "Out of memory"); 79 } 80 81 s->which = whichstring == 2 ? STRING2 : STRING1; 82 s->state = NORMAL; 83 s->cnt = 0; 84 s->lastch = OOBCH; 85 s->equiv[0] = 0; 86 s->equiv[1] = OOBCH; 87 s->set = NULL; 88 s->str = txt; 89 90 return s; 91} 92 93void 94str_destroy(STR *s) 95{ 96 if (s->set != NULL && s->set != s->equiv) { 97 free(s->set); 98 } 99 free(s); 100} 101 102int 103next(STR *s, int *ret) 104{ 105 int ch; 106 107 switch (s->state) { 108 case EOS: 109 *ret = s->lastch; 110 return 0; 111 case INFINITE: 112 *ret = s->lastch; 113 return 1; 114 case NORMAL: 115 ch = (unsigned char)s->str[0]; 116 switch (ch) { 117 case '\0': 118 s->state = EOS; 119 *ret = s->lastch; 120 return 0; 121 case '\\': 122 s->lastch = backslash(s); 123 break; 124 case '[': 125 if (bracket(s)) { 126 return next(s, ret); 127 } 128 /* FALLTHROUGH */ 129 default: 130 ++s->str; 131 s->lastch = ch; 132 break; 133 } 134 135 /* We can start a range at any time. */ 136 if (s->str[0] == '-' && genrange(s)) { 137 return next(s, ret); 138 } 139 *ret = s->lastch; 140 return 1; 141 case RANGE: 142 if (s->cnt == 0) { 143 s->state = NORMAL; 144 return next(s, ret); 145 } 146 s->cnt--; 147 ++s->lastch; 148 *ret = s->lastch; 149 return 1; 150 case SEQUENCE: 151 if (s->cnt == 0) { 152 s->state = NORMAL; 153 return next(s, ret); 154 } 155 s->cnt--; 156 *ret = s->lastch; 157 return 1; 158 case SET: 159 s->lastch = s->set[s->cnt++]; 160 if (s->lastch == OOBCH) { 161 s->state = NORMAL; 162 if (s->set != s->equiv) { 163 free(s->set); 164 } 165 s->set = NULL; 166 return next(s, ret); 167 } 168 *ret = s->lastch; 169 return 1; 170 } 171 /* NOTREACHED */ 172 assert(0); 173 *ret = s->lastch; 174 return 0; 175} 176 177static int 178bracket(STR *s) 179{ 180 const char *p; 181 int *q; 182 183 switch (s->str[1]) { 184 case ':': /* "[:class:]" */ 185 if ((p = strstr(s->str + 2, ":]")) == NULL) 186 return 0; 187 s->str += 2; 188 q = genclass(s->str, p - s->str); 189 s->state = SET; 190 s->set = q; 191 s->cnt = 0; 192 s->str = p + 2; 193 return 1; 194 case '=': /* "[=equiv=]" */ 195 if ((p = strstr(s->str + 2, "=]")) == NULL) 196 return 0; 197 s->str += 2; 198 genequiv(s); 199 s->str = p + 2; 200 return 1; 201 default: /* "[\###*n]" or "[#*n]" */ 202 if ((p = strpbrk(s->str + 2, "*]")) == NULL) 203 return 0; 204 if (p[0] != '*' || strchr(p, ']') == NULL) 205 return 0; 206 s->str += 1; 207 genseq(s); 208 return 1; 209 } 210 /* NOTREACHED */ 211} 212 213typedef struct { 214 const char *name; 215 int (*func)(int); 216} CLASS; 217 218static const CLASS classes[] = { 219 { "alnum", isalnum }, 220 { "alpha", isalpha }, 221 { "blank", isblank }, 222 { "cntrl", iscntrl }, 223 { "digit", isdigit }, 224 { "graph", isgraph }, 225 { "lower", islower }, 226 { "print", isprint }, 227 { "punct", ispunct }, 228 { "space", isspace }, 229 { "upper", isupper }, 230 { "xdigit", isxdigit }, 231}; 232 233typedef struct { 234 const char *name; 235 size_t len; 236} CLASSKEY; 237 238static int * 239genclass(const char *class, size_t len) 240{ 241 int ch; 242 const CLASS *cp; 243 CLASSKEY key; 244 int *p; 245 unsigned pos, num; 246 247 /* Find the class */ 248 key.name = class; 249 key.len = len; 250 cp = bsearch(&key, classes, __arraycount(classes), sizeof(classes[0]), 251 c_class); 252 if (cp == NULL) { 253 errx(1, "unknown class %.*s", (int)len, class); 254 } 255 256 /* 257 * Figure out what characters are in the class 258 */ 259 260 num = NCHARS + 1; 261 p = malloc(num * sizeof(*p)); 262 if (p == NULL) { 263 err(1, "malloc"); 264 } 265 266 pos = 0; 267 for (ch = 0; ch < NCHARS; ch++) { 268 if (cp->func(ch)) { 269 p[pos++] = ch; 270 } 271 } 272 273 p[pos++] = OOBCH; 274 for (; pos < num; pos++) { 275 p[pos] = 0; 276 } 277 278 return p; 279} 280 281static int 282c_class(const void *av, const void *bv) 283{ 284 const CLASSKEY *a = av; 285 const CLASS *b = bv; 286 size_t blen; 287 int r; 288 289 blen = strlen(b->name); 290 r = strncmp(a->name, b->name, a->len); 291 if (r != 0) { 292 return r; 293 } 294 if (a->len < blen) { 295 /* someone gave us a prefix of the right name */ 296 return -1; 297 } 298 assert(a-> len == blen); 299 return 0; 300} 301 302/* 303 * English doesn't have any equivalence classes, so for now 304 * we just syntax check and grab the character. 305 */ 306static void 307genequiv(STR *s) 308{ 309 int ch; 310 311 ch = (unsigned char)s->str[0]; 312 if (ch == '\\') { 313 s->equiv[0] = backslash(s); 314 } else { 315 s->equiv[0] = ch; 316 s->str++; 317 } 318 if (s->str[0] != '=') { 319 errx(1, "Misplaced equivalence equals sign"); 320 } 321 s->str++; 322 if (s->str[0] != ']') { 323 errx(1, "Misplaced equivalence right bracket"); 324 } 325 s->str++; 326 327 s->cnt = 0; 328 s->state = SET; 329 s->set = s->equiv; 330} 331 332static int 333genrange(STR *s) 334{ 335 int stopval; 336 const char *savestart; 337 338 savestart = s->str++; 339 stopval = s->str[0] == '\\' ? backslash(s) : (unsigned char)*s->str++; 340 if (stopval < (unsigned char)s->lastch) { 341 s->str = savestart; 342 return 0; 343 } 344 s->cnt = stopval - s->lastch + 1; 345 s->state = RANGE; 346 --s->lastch; 347 return 1; 348} 349 350static void 351genseq(STR *s) 352{ 353 char *ep; 354 355 if (s->which == STRING1) { 356 errx(1, "Sequences only valid in string2"); 357 } 358 359 if (*s->str == '\\') { 360 s->lastch = backslash(s); 361 } else { 362 s->lastch = (unsigned char)*s->str++; 363 } 364 if (*s->str != '*') { 365 errx(1, "Misplaced sequence asterisk"); 366 } 367 368 s->str++; 369 switch (s->str[0]) { 370 case '\\': 371 s->cnt = backslash(s); 372 break; 373 case ']': 374 s->cnt = 0; 375 ++s->str; 376 break; 377 default: 378 if (isdigit((unsigned char)s->str[0])) { 379 s->cnt = strtol(s->str, &ep, 0); 380 if (*ep == ']') { 381 s->str = ep + 1; 382 break; 383 } 384 } 385 errx(1, "illegal sequence count"); 386 /* NOTREACHED */ 387 } 388 389 s->state = s->cnt ? SEQUENCE : INFINITE; 390} 391 392/* 393 * Translate \??? into a character. Up to 3 octal digits, if no digits either 394 * an escape code or a literal character. 395 */ 396static int 397backslash(STR *s) 398{ 399 int ch, cnt, val; 400 401 cnt = val = 0; 402 for (;;) { 403 /* Consume the character we're already on. */ 404 s->str++; 405 406 /* Look at the next character. */ 407 ch = (unsigned char)s->str[0]; 408 if (!isascii(ch) || !isdigit(ch)) { 409 break; 410 } 411 val = val * 8 + ch - '0'; 412 if (++cnt == 3) { 413 /* Enough digits; consume this one and stop */ 414 ++s->str; 415 break; 416 } 417 } 418 if (cnt) { 419 /* We saw digits, so return their value */ 420 if (val >= OOBCH) 421 errx(1, "Invalid octal character value"); 422 return val; 423 } 424 if (ch == '\0') { 425 /* \<end> -> \ */ 426 s->state = EOS; 427 return '\\'; 428 } 429 430 /* Consume the escaped character */ 431 s->str++; 432 433 switch (ch) { 434 case 'a': /* escape characters */ 435 return '\7'; 436 case 'b': 437 return '\b'; 438 case 'e': 439 return '\033'; 440 case 'f': 441 return '\f'; 442 case 'n': 443 return '\n'; 444 case 'r': 445 return '\r'; 446 case 't': 447 return '\t'; 448 case 'v': 449 return '\13'; 450 default: /* \q -> q */ 451 return ch; 452 } 453} 454