regexp.c revision 282449
1/* 2 * Copyright (c) 1980, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 4. Neither the name of the University nor the names of its contributors 15 * may be used to endorse or promote products derived from this software 16 * without specific prior written permission. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 * SUCH DAMAGE. 29 */ 30 31#include <sys/cdefs.h> 32 33__FBSDID("$FreeBSD: head/usr.bin/vgrind/regexp.c 282449 2015-05-05 07:33:38Z bapt $"); 34 35#ifndef lint 36static const char copyright[] = 37"@(#) Copyright (c) 1980, 1993\n\ 38 The Regents of the University of California. All rights reserved.\n"; 39#endif 40 41#ifndef lint 42static const char sccsid[] = "@(#)regexp.c 8.1 (Berkeley) 6/6/93"; 43#endif 44 45#include <ctype.h> 46#include <stdlib.h> 47#include <string.h> 48 49#include "extern.h" 50 51#define FALSE 0 52#define TRUE !(FALSE) 53#define NIL 0 54 55static void expconv(void); 56 57boolean _escaped; /* true if we are currently _escaped */ 58char *s_start; /* start of string */ 59boolean l_onecase; /* true if upper and lower equivalent */ 60 61#define makelower(c) (isupper((c)) ? tolower((c)) : (c)) 62 63/* STRNCMP - like strncmp except that we convert the 64 * first string to lower case before comparing 65 * if l_onecase is set. 66 */ 67 68int 69STRNCMP(register char *s1, register char *s2, register int len) 70{ 71 if (l_onecase) { 72 do 73 if (*s2 - makelower(*s1)) 74 return (*s2 - makelower(*s1)); 75 else { 76 s2++; 77 s1++; 78 } 79 while (--len); 80 } else { 81 do 82 if (*s2 - *s1) 83 return (*s2 - *s1); 84 else { 85 s2++; 86 s1++; 87 } 88 while (--len); 89 } 90 return(0); 91} 92 93/* The following routine converts an irregular expression to 94 * internal format. 95 * 96 * Either meta symbols (\a \d or \p) or character strings or 97 * operations ( alternation or perenthesizing ) can be 98 * specified. Each starts with a descriptor byte. The descriptor 99 * byte has STR set for strings, META set for meta symbols 100 * and OPER set for operations. 101 * The descriptor byte can also have the OPT bit set if the object 102 * defined is optional. Also ALT can be set to indicate an alternation. 103 * 104 * For metasymbols the byte following the descriptor byte identities 105 * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For 106 * strings the byte after the descriptor is a character count for 107 * the string: 108 * 109 * meta symbols := descriptor 110 * symbol 111 * 112 * strings := descriptor 113 * character count 114 * the string 115 * 116 * operatins := descriptor 117 * symbol 118 * character count 119 */ 120 121/* 122 * handy macros for accessing parts of match blocks 123 */ 124#define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */ 125#define MNEXT(A) (A+2) /* character following a metasymbol block */ 126 127#define OSYM(A) (*(A+1)) /* symbol in an operation block */ 128#define OCNT(A) (*(A+2)) /* character count */ 129#define ONEXT(A) (A+3) /* next character after the operation */ 130#define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */ 131 132#define SCNT(A) (*(A+1)) /* byte count of a string */ 133#define SSTR(A) (A+2) /* address of the string */ 134#define SNEXT(A) (A+2+*(A+1)) /* character following the string */ 135 136/* 137 * bit flags in the descriptor 138 */ 139#define OPT 1 140#define STR 2 141#define META 4 142#define ALT 8 143#define OPER 16 144 145static char *ccre; /* pointer to current position in converted exp*/ 146static char *ure; /* pointer current position in unconverted exp */ 147 148/* re: unconverted irregular expression */ 149char * 150convexp(char *re) 151{ 152 register char *cre; /* pointer to converted regular expression */ 153 154 /* allocate room for the converted expression */ 155 if (re == NIL) 156 return (NIL); 157 if (*re == '\0') 158 return (NIL); 159 cre = malloc (4 * strlen(re) + 3); 160 ccre = cre; 161 ure = re; 162 163 /* start the conversion with a \a */ 164 *cre = META | OPT; 165 MSYM(cre) = 'a'; 166 ccre = MNEXT(cre); 167 168 /* start the conversion (its recursive) */ 169 expconv (); 170 *ccre = 0; 171 return (cre); 172} 173 174static void 175expconv() 176{ 177 register char *cs; /* pointer to current symbol in converted exp */ 178 register char c; /* character being processed */ 179 register char *acs; /* pinter to last alternate */ 180 register int temp; 181 182 /* let the conversion begin */ 183 acs = NIL; 184 cs = NIL; 185 while (*ure != NIL) { 186 switch (c = *ure++) { 187 188 case '\\': 189 switch (c = *ure++) { 190 191 /* escaped characters are just characters */ 192 default: 193 if (cs == NIL || (*cs & STR) == 0) { 194 cs = ccre; 195 *cs = STR; 196 SCNT(cs) = 1; 197 ccre += 2; 198 } else 199 SCNT(cs)++; 200 *ccre++ = c; 201 break; 202 203 /* normal(?) metacharacters */ 204 case 'a': 205 case 'd': 206 case 'e': 207 case 'p': 208 if (acs != NIL && acs != cs) { 209 do { 210 temp = OCNT(acs); 211 OCNT(acs) = ccre - acs; 212 acs -= temp; 213 } while (temp != 0); 214 acs = NIL; 215 } 216 cs = ccre; 217 *cs = META; 218 MSYM(cs) = c; 219 ccre = MNEXT(cs); 220 break; 221 } 222 break; 223 224 /* just put the symbol in */ 225 case '^': 226 case '$': 227 if (acs != NIL && acs != cs) { 228 do { 229 temp = OCNT(acs); 230 OCNT(acs) = ccre - acs; 231 acs -= temp; 232 } while (temp != 0); 233 acs = NIL; 234 } 235 cs = ccre; 236 *cs = META; 237 MSYM(cs) = c; 238 ccre = MNEXT(cs); 239 break; 240 241 /* mark the last match sequence as optional */ 242 case '?': 243 if (cs) 244 *cs = *cs | OPT; 245 break; 246 247 /* recurse and define a subexpression */ 248 case '(': 249 if (acs != NIL && acs != cs) { 250 do { 251 temp = OCNT(acs); 252 OCNT(acs) = ccre - acs; 253 acs -= temp; 254 } while (temp != 0); 255 acs = NIL; 256 } 257 cs = ccre; 258 *cs = OPER; 259 OSYM(cs) = '('; 260 ccre = ONEXT(cs); 261 expconv (); 262 OCNT(cs) = ccre - cs; /* offset to next symbol */ 263 break; 264 265 /* return from a recursion */ 266 case ')': 267 if (acs != NIL) { 268 do { 269 temp = OCNT(acs); 270 OCNT(acs) = ccre - acs; 271 acs -= temp; 272 } while (temp != 0); 273 acs = NIL; 274 } 275 cs = ccre; 276 *cs = META; 277 MSYM(cs) = c; 278 ccre = MNEXT(cs); 279 return; 280 281 /* mark the last match sequence as having an alternate */ 282 /* the third byte will contain an offset to jump over the */ 283 /* alternate match in case the first did not fail */ 284 case '|': 285 if (acs != NIL && acs != cs) 286 OCNT(ccre) = ccre - acs; /* make a back pointer */ 287 else 288 OCNT(ccre) = 0; 289 *cs |= ALT; 290 cs = ccre; 291 *cs = OPER; 292 OSYM(cs) = '|'; 293 ccre = ONEXT(cs); 294 acs = cs; /* remember that the pointer is to be filles */ 295 break; 296 297 /* if its not a metasymbol just build a scharacter string */ 298 default: 299 if (cs == NIL || (*cs & STR) == 0) { 300 cs = ccre; 301 *cs = STR; 302 SCNT(cs) = 1; 303 ccre = SSTR(cs); 304 } else 305 SCNT(cs)++; 306 *ccre++ = c; 307 break; 308 } 309 } 310 if (acs != NIL) { 311 do { 312 temp = OCNT(acs); 313 OCNT(acs) = ccre - acs; 314 acs -= temp; 315 } while (temp != 0); 316 acs = NIL; 317 } 318 return; 319} 320/* end of convertre */ 321 322 323/* 324 * The following routine recognises an irregular expresion 325 * with the following special characters: 326 * 327 * \? - means last match was optional 328 * \a - matches any number of characters 329 * \d - matches any number of spaces and tabs 330 * \p - matches any number of alphanumeric 331 * characters. The 332 * characters matched will be copied into 333 * the area pointed to by 'name'. 334 * \| - alternation 335 * \( \) - grouping used mostly for alternation and 336 * optionality 337 * 338 * The irregular expression must be translated to internal form 339 * prior to calling this routine 340 * 341 * The value returned is the pointer to the first non \a 342 * character matched. 343 */ 344 345/* 346 * s: string to check for a match in 347 * re: a converted irregular expression 348 * mstring: where to put whatever matches a \p 349 */ 350char * 351expmatch (register char *s, register char *re, register char *mstring) 352{ 353 register char *cs; /* the current symbol */ 354 register char *ptr,*s1; /* temporary pointer */ 355 boolean matched; /* a temporary boolean */ 356 357 /* initial conditions */ 358 if (re == NIL) 359 return (NIL); 360 cs = re; 361 matched = FALSE; 362 363 /* loop till expression string is exhausted (or at least pretty tired) */ 364 while (*cs) { 365 switch (*cs & (OPER | STR | META)) { 366 367 /* try to match a string */ 368 case STR: 369 matched = !STRNCMP (s, SSTR(cs), SCNT(cs)); 370 if (matched) { 371 372 /* hoorah it matches */ 373 s += SCNT(cs); 374 cs = SNEXT(cs); 375 } else if (*cs & ALT) { 376 377 /* alternation, skip to next expression */ 378 cs = SNEXT(cs); 379 } else if (*cs & OPT) { 380 381 /* the match is optional */ 382 cs = SNEXT(cs); 383 matched = 1; /* indicate a successful match */ 384 } else { 385 386 /* no match, error return */ 387 return (NIL); 388 } 389 break; 390 391 /* an operator, do something fancy */ 392 case OPER: 393 switch (OSYM(cs)) { 394 395 /* this is an alternation */ 396 case '|': 397 if (matched) 398 399 /* last thing in the alternation was a match, skip ahead */ 400 cs = OPTR(cs); 401 else 402 403 /* no match, keep trying */ 404 cs = ONEXT(cs); 405 break; 406 407 /* this is a grouping, recurse */ 408 case '(': 409 ptr = expmatch (s, ONEXT(cs), mstring); 410 if (ptr != NIL) { 411 412 /* the subexpression matched */ 413 matched = 1; 414 s = ptr; 415 } else if (*cs & ALT) { 416 417 /* alternation, skip to next expression */ 418 matched = 0; 419 } else if (*cs & OPT) { 420 421 /* the match is optional */ 422 matched = 1; /* indicate a successful match */ 423 } else { 424 425 /* no match, error return */ 426 return (NIL); 427 } 428 cs = OPTR(cs); 429 break; 430 } 431 break; 432 433 /* try to match a metasymbol */ 434 case META: 435 switch (MSYM(cs)) { 436 437 /* try to match anything and remember what was matched */ 438 case 'p': 439 /* 440 * This is really the same as trying the match the 441 * remaining parts of the expression to any subset 442 * of the string. 443 */ 444 s1 = s; 445 do { 446 ptr = expmatch (s1, MNEXT(cs), mstring); 447 if (ptr != NIL && s1 != s) { 448 449 /* we have a match, remember the match */ 450 strncpy (mstring, s, s1 - s); 451 mstring[s1 - s] = '\0'; 452 return (ptr); 453 } else if (ptr != NIL && (*cs & OPT)) { 454 455 /* it was aoptional so no match is ok */ 456 return (ptr); 457 } else if (ptr != NIL) { 458 459 /* not optional and we still matched */ 460 return (NIL); 461 } 462 if (!(isalnum(*s1) || *s1 == '_' || 463 /* C++ destructor */ 464 *s1 == '~' || 465 /* C++ scope operator */ 466 (strlen(s1) > 1 && *s1 == ':' && s1[1] == ':' && 467 (s1++, TRUE)))) 468 return (NIL); 469 if (*s1 == '\\') 470 _escaped = _escaped ? FALSE : TRUE; 471 else 472 _escaped = FALSE; 473 } while (*s1++); 474 return (NIL); 475 476 /* try to match anything */ 477 case 'a': 478 /* 479 * This is really the same as trying the match the 480 * remaining parts of the expression to any subset 481 * of the string. 482 */ 483 s1 = s; 484 do { 485 ptr = expmatch (s1, MNEXT(cs), mstring); 486 if (ptr != NIL && s1 != s) { 487 488 /* we have a match */ 489 return (ptr); 490 } else if (ptr != NIL && (*cs & OPT)) { 491 492 /* it was aoptional so no match is ok */ 493 return (ptr); 494 } else if (ptr != NIL) { 495 496 /* not optional and we still matched */ 497 return (NIL); 498 } 499 if (*s1 == '\\') 500 _escaped = _escaped ? FALSE : TRUE; 501 else 502 _escaped = FALSE; 503 } while (*s1++); 504 return (NIL); 505 506 /* fail if we are currently _escaped */ 507 case 'e': 508 if (_escaped) 509 return(NIL); 510 cs = MNEXT(cs); 511 break; 512 513 /* match any number of tabs and spaces */ 514 case 'd': 515 ptr = s; 516 while (*s == ' ' || *s == '\t') 517 s++; 518 if (s != ptr || s == s_start) { 519 520 /* match, be happy */ 521 matched = 1; 522 cs = MNEXT(cs); 523 } else if (*s == '\n' || *s == '\0') { 524 525 /* match, be happy */ 526 matched = 1; 527 cs = MNEXT(cs); 528 } else if (*cs & ALT) { 529 530 /* try the next part */ 531 matched = 0; 532 cs = MNEXT(cs); 533 } else if (*cs & OPT) { 534 535 /* doesn't matter */ 536 matched = 1; 537 cs = MNEXT(cs); 538 } else 539 540 /* no match, error return */ 541 return (NIL); 542 break; 543 544 /* check for end of line */ 545 case '$': 546 if (*s == '\0' || *s == '\n') { 547 548 /* match, be happy */ 549 s++; 550 matched = 1; 551 cs = MNEXT(cs); 552 } else if (*cs & ALT) { 553 554 /* try the next part */ 555 matched = 0; 556 cs = MNEXT(cs); 557 } else if (*cs & OPT) { 558 559 /* doesn't matter */ 560 matched = 1; 561 cs = MNEXT(cs); 562 } else 563 564 /* no match, error return */ 565 return (NIL); 566 break; 567 568 /* check for start of line */ 569 case '^': 570 if (s == s_start) { 571 572 /* match, be happy */ 573 matched = 1; 574 cs = MNEXT(cs); 575 } else if (*cs & ALT) { 576 577 /* try the next part */ 578 matched = 0; 579 cs = MNEXT(cs); 580 } else if (*cs & OPT) { 581 582 /* doesn't matter */ 583 matched = 1; 584 cs = MNEXT(cs); 585 } else 586 587 /* no match, error return */ 588 return (NIL); 589 break; 590 591 /* end of a subexpression, return success */ 592 case ')': 593 return (s); 594 } 595 break; 596 } 597 } 598 return (s); 599} 600