/* str.c revision 118412 */
1/*- 2 * Copyright (c) 1991, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 3. All advertising materials mentioning features or use of this software 14 * must display the following acknowledgement: 15 * This product includes software developed by the University of 16 * California, Berkeley and its contributors. 17 * 4. Neither the name of the University nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 
32 */ 33 34#include <sys/cdefs.h> 35 36__FBSDID("$FreeBSD: head/usr.bin/tr/str.c 118412 2003-08-04 04:20:04Z ache $"); 37 38#ifndef lint 39static const char sccsid[] = "@(#)str.c 8.2 (Berkeley) 4/28/95"; 40#endif 41 42#include <sys/cdefs.h> 43#include <sys/types.h> 44 45#include <ctype.h> 46#include <err.h> 47#include <stddef.h> 48#include <stdio.h> 49#include <stdlib.h> 50#include <string.h> 51 52#include "extern.h" 53 54static int backslash(STR *, int *); 55static int bracket(STR *); 56static int c_class(const void *, const void *); 57static void genclass(STR *); 58static void genequiv(STR *); 59static int genrange(STR *, int); 60static void genseq(STR *); 61 62int 63next(s) 64 STR *s; 65{ 66 int ch, is_octal; 67 68 switch (s->state) { 69 case EOS: 70 return (0); 71 case INFINITE: 72 return (1); 73 case NORMAL: 74 switch (ch = (u_char)*s->str) { 75 case '\0': 76 s->state = EOS; 77 return (0); 78 case '\\': 79 s->lastch = backslash(s, &is_octal); 80 break; 81 case '[': 82 if (bracket(s)) 83 return (next(s)); 84 /* FALLTHROUGH */ 85 default: 86 is_octal = 0; 87 ++s->str; 88 s->lastch = ch; 89 break; 90 } 91 92 /* We can start a range at any time. 
*/ 93 if (s->str[0] == '-' && genrange(s, is_octal)) 94 return (next(s)); 95 return (1); 96 case SEQUENCE: 97 if (s->cnt-- == 0) { 98 s->state = NORMAL; 99 return (next(s)); 100 } 101 return (1); 102 case SET: 103 case SET_UPPER: 104 case SET_LOWER: 105 if ((ch = s->set[s->cnt++]) == OOBCH) { 106 s->state = NORMAL; 107 return (next(s)); 108 } 109 s->lastch = ch; 110 return (1); 111 default: 112 return (0); 113 } 114 /* NOTREACHED */ 115} 116 117static int 118bracket(s) 119 STR *s; 120{ 121 char *p; 122 123 switch (s->str[1]) { 124 case ':': /* "[:class:]" */ 125 if ((p = strchr(s->str + 2, ']')) == NULL) 126 return (0); 127 if (*(p - 1) != ':' || p - s->str < 4) 128 goto repeat; 129 *(p - 1) = '\0'; 130 s->str += 2; 131 genclass(s); 132 s->str = p + 1; 133 return (1); 134 case '=': /* "[=equiv=]" */ 135 if ((p = strchr(s->str + 2, ']')) == NULL) 136 return (0); 137 if (*(p - 1) != '=' || p - s->str < 4) 138 goto repeat; 139 s->str += 2; 140 genequiv(s); 141 return (1); 142 default: /* "[\###*n]" or "[#*n]" */ 143 repeat: 144 if ((p = strpbrk(s->str + 2, "*]")) == NULL) 145 return (0); 146 if (p[0] != '*' || index(p, ']') == NULL) 147 return (0); 148 s->str += 1; 149 genseq(s); 150 return (1); 151 } 152 /* NOTREACHED */ 153} 154 155typedef struct { 156 const char *name; 157 int (*func)(int); 158 int *set; 159} CLASS; 160 161static CLASS classes[] = { 162#undef isalnum 163 { "alnum", isalnum, NULL }, 164#undef isalpha 165 { "alpha", isalpha, NULL }, 166#undef isblank 167 { "blank", isblank, NULL }, 168#undef iscntrl 169 { "cntrl", iscntrl, NULL }, 170#undef isdigit 171 { "digit", isdigit, NULL }, 172#undef isgraph 173 { "graph", isgraph, NULL }, 174#undef islower 175 { "lower", islower, NULL }, 176#undef isprint 177 { "print", isprint, NULL }, 178#undef ispunct 179 { "punct", ispunct, NULL }, 180#undef isspace 181 { "space", isspace, NULL }, 182#undef isupper 183 { "upper", isupper, NULL }, 184#undef isxdigit 185 { "xdigit", isxdigit, NULL }, 186}; 187 188static void 
189genclass(s) 190 STR *s; 191{ 192 int cnt, (*func)(int); 193 CLASS *cp, tmp; 194 int *p, n; 195 196 tmp.name = s->str; 197 if ((cp = (CLASS *)bsearch(&tmp, classes, sizeof(classes) / 198 sizeof(CLASS), sizeof(CLASS), c_class)) == NULL) 199 errx(1, "unknown class %s", s->str); 200 201 if ((cp->set = p = malloc((NCHARS + 1) * sizeof(int))) == NULL) 202 err(1, "genclass() malloc"); 203 bzero(p, (NCHARS + 1) * sizeof(int)); 204 for (cnt = 0, func = cp->func; cnt < NCHARS; ++cnt) 205 if ((func)(cnt)) 206 *p++ = cnt; 207 *p = OOBCH; 208 n = p - cp->set; 209 210 s->cnt = 0; 211 s->set = cp->set; 212 if (strcmp(s->str, "upper") == 0) 213 s->state = SET_UPPER; 214 else if (strcmp(s->str, "lower") == 0) { 215 s->state = SET_LOWER; 216 } else 217 s->state = SET; 218 if ((s->state == SET_LOWER || s->state == SET_UPPER) && n > 1) 219 mergesort(s->set, n, sizeof(*(s->set)), charcoll); 220} 221 222static int 223c_class(a, b) 224 const void *a, *b; 225{ 226 return (strcmp(((const CLASS *)a)->name, ((const CLASS *)b)->name)); 227} 228 229static void 230genequiv(s) 231 STR *s; 232{ 233 int i, p, pri; 234 char src[2], dst[3]; 235 236 if (*s->str == '\\') { 237 s->equiv[0] = backslash(s, NULL); 238 if (*s->str != '=') 239 errx(1, "misplaced equivalence equals sign"); 240 s->str += 2; 241 } else { 242 s->equiv[0] = s->str[0]; 243 if (s->str[1] != '=') 244 errx(1, "misplaced equivalence equals sign"); 245 s->str += 3; 246 } 247 248 /* 249 * Calculate the set of all characters in the same equivalence class 250 * as the specified character (they will have the same primary 251 * collation weights). 252 * XXX Knows too much about how strxfrm() is implemented. Assumes 253 * it fills the string with primary collation weight bytes. Only one- 254 * to-one mappings are supported. 
255 */ 256 src[0] = s->equiv[0]; 257 src[1] = '\0'; 258 if (strxfrm(dst, src, sizeof(dst)) == 1) { 259 pri = (unsigned char)*dst; 260 for (p = 1, i = 1; i < NCHARS; i++) { 261 *src = i; 262 if (strxfrm(dst, src, sizeof(dst)) == 1 && pri && 263 pri == (unsigned char)*dst) 264 s->equiv[p++] = i; 265 } 266 s->equiv[p] = OOBCH; 267 } 268 269 s->cnt = 0; 270 s->state = SET; 271 s->set = s->equiv; 272} 273 274static int 275genrange(STR *s, int was_octal) 276{ 277 int stopval, octal; 278 char *savestart; 279 int n, cnt, *p; 280 281 octal = 0; 282 savestart = s->str; 283 stopval = *++s->str == '\\' ? backslash(s, &octal) : (u_char)*s->str++; 284 if (!octal) 285 octal = was_octal; 286 287 if ((octal && stopval < s->lastch) || 288 (!octal && 289 charcoll((const void *)&stopval, (const void *)&(s->lastch)) < 0)) { 290 s->str = savestart; 291 return (0); 292 } 293 if ((s->set = p = malloc((NCHARS + 1) * sizeof(int))) == NULL) 294 err(1, "genrange() malloc"); 295 bzero(p, (NCHARS + 1) * sizeof(int)); 296 if (octal) { 297 for (cnt = s->lastch; cnt <= stopval; cnt++) 298 *p++ = cnt; 299 } else { 300 for (cnt = 0; cnt < NCHARS; cnt++) 301 if (charcoll((const void *)&cnt, (const void *)&(s->lastch)) >= 0 && 302 charcoll((const void *)&cnt, (const void *)&stopval) <= 0) 303 *p++ = cnt; 304 } 305 *p = OOBCH; 306 n = p - s->set; 307 308 s->cnt = 0; 309 s->state = SET; 310 if (!octal && n > 1) 311 mergesort(s->set, n, sizeof(*(s->set)), charcoll); 312 return (1); 313} 314 315static void 316genseq(s) 317 STR *s; 318{ 319 char *ep; 320 321 if (s->which == STRING1) 322 errx(1, "sequences only valid in string2"); 323 324 if (*s->str == '\\') 325 s->lastch = backslash(s, NULL); 326 else 327 s->lastch = *s->str++; 328 if (*s->str != '*') 329 errx(1, "misplaced sequence asterisk"); 330 331 switch (*++s->str) { 332 case '\\': 333 s->cnt = backslash(s, NULL); 334 break; 335 case ']': 336 s->cnt = 0; 337 ++s->str; 338 break; 339 default: 340 if (isdigit((u_char)*s->str)) { 341 s->cnt = 
strtol(s->str, &ep, 0); 342 if (*ep == ']') { 343 s->str = ep + 1; 344 break; 345 } 346 } 347 errx(1, "illegal sequence count"); 348 /* NOTREACHED */ 349 } 350 351 s->state = s->cnt ? SEQUENCE : INFINITE; 352} 353 354/* 355 * Translate \??? into a character. Up to 3 octal digits, if no digits either 356 * an escape code or a literal character. 357 */ 358static int 359backslash(STR *s, int *is_octal) 360{ 361 int ch, cnt, val; 362 363 if (is_octal != NULL) 364 *is_octal = 0; 365 for (cnt = val = 0;;) { 366 ch = (u_char)*++s->str; 367 if (!isdigit(ch)) 368 break; 369 val = val * 8 + ch - '0'; 370 if (++cnt == 3) { 371 ++s->str; 372 break; 373 } 374 } 375 if (cnt) { 376 if (is_octal != NULL) 377 *is_octal = 1; 378 return (val); 379 } 380 if (ch != '\0') 381 ++s->str; 382 switch (ch) { 383 case 'a': /* escape characters */ 384 return ('\7'); 385 case 'b': 386 return ('\b'); 387 case 'f': 388 return ('\f'); 389 case 'n': 390 return ('\n'); 391 case 'r': 392 return ('\r'); 393 case 't': 394 return ('\t'); 395 case 'v': 396 return ('\13'); 397 case '\0': /* \" -> \ */ 398 s->state = EOS; 399 return ('\\'); 400 default: /* \x" -> x */ 401 return (ch); 402 } 403} 404