regexp.c revision 331722
1156230Smux/* 2156230Smux * Copyright (c) 1980, 1993 3156230Smux * The Regents of the University of California. All rights reserved. 4156230Smux * 5156230Smux * 6156230Smux * Redistribution and use in source and binary forms, with or without 7156230Smux * modification, are permitted provided that the following conditions 8156230Smux * are met: 9156230Smux * 1. Redistributions of source code must retain the above copyright 10156230Smux * notice, this list of conditions and the following disclaimer. 11156230Smux * 2. Redistributions in binary form must reproduce the above copyright 12156230Smux * notice, this list of conditions and the following disclaimer in the 13156230Smux * documentation and/or other materials provided with the distribution. 14156230Smux * 4. Neither the name of the University nor the names of its contributors 15156230Smux * may be used to endorse or promote products derived from this software 16156230Smux * without specific prior written permission. 17156230Smux * 18156230Smux * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 19156230Smux * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20156230Smux * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21156230Smux * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 22156230Smux * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23156230Smux * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24156230Smux * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25156230Smux * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26156230Smux * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27156230Smux * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28156230Smux * SUCH DAMAGE. 29156230Smux */ 30156230Smux 31156230Smux#include <sys/cdefs.h> 32156230Smux 33156230Smux__FBSDID("$FreeBSD: stable/11/usr.bin/vgrind/regexp.c 331722 2018-03-29 02:50:57Z eadler $"); 34156230Smux 35156230Smux#ifndef lint 36156230Smuxstatic const char copyright[] = 37156230Smux"@(#) Copyright (c) 1980, 1993\n\ 38156230Smux The Regents of the University of California. All rights reserved.\n"; 39156230Smux#endif 40156230Smux 41156230Smux#ifndef lint 42156230Smuxstatic const char sccsid[] = "@(#)regexp.c 8.1 (Berkeley) 6/6/93"; 43156230Smux#endif 44156230Smux 45156230Smux#include <ctype.h> 46156230Smux#include <stdlib.h> 47156230Smux#include <stdbool.h> 48156230Smux#include <string.h> 49156230Smux 50156230Smux#include "extern.h" 51156230Smux 52156230Smuxstatic void expconv(void); 53156230Smux 54156230Smuxbool _escaped; /* true if we are currently x_escaped */ 55156230Smuxchar *s_start; /* start of string */ 56156230Smuxbool l_onecase; /* true if upper and lower equivalent */ 57156230Smux 58156230Smux#define makelower(c) (isupper((c)) ? tolower((c)) : (c)) 59156230Smux 60156230Smux/* STRNCMP - like strncmp except that we convert the 61156230Smux * first string to lower case before comparing 62156230Smux * if l_onecase is set. 63156230Smux */ 64156230Smux 65156230Smuxint 66156230SmuxSTRNCMP(register char *s1, register char *s2, register int len) 67156230Smux{ 68156230Smux if (l_onecase) { 69156230Smux do 70156230Smux if (*s2 - makelower(*s1)) 71156230Smux return (*s2 - makelower(*s1)); 72156230Smux else { 73156230Smux s2++; 74156230Smux s1++; 75156230Smux } 76156230Smux while (--len); 77156230Smux } else { 78156230Smux do 79156230Smux if (*s2 - *s1) 80156230Smux return (*s2 - *s1); 81156230Smux else { 82156230Smux s2++; 83156230Smux s1++; 84156230Smux } 85156230Smux while (--len); 86156230Smux } 87156230Smux return(0); 88156230Smux} 89156230Smux 90156230Smux/* The following routine converts an irregular expression to 91156230Smux * internal format. 92156230Smux * 93156230Smux * Either meta symbols (\a \d or \p) or character strings or 94156230Smux * operations ( alternation or parenthesizing ) can be 95156230Smux * specified. Each starts with a descriptor byte. The descriptor 96156230Smux * byte has STR set for strings, META set for meta symbols 97156230Smux * and OPER set for operations. 98156230Smux * The descriptor byte can also have the OPT bit set if the object 99156230Smux * defined is optional. Also ALT can be set to indicate an alternation. 100156230Smux * 101156230Smux * For metasymbols the byte following the descriptor byte identities 102156230Smux * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For 103156230Smux * strings the byte after the descriptor is a character count for 104156230Smux * the string: 105156230Smux * 106156230Smux * meta symbols := descriptor 107156230Smux * symbol 108156230Smux * 109156230Smux * strings := descriptor 110156230Smux * character count 111156230Smux * the string 112156230Smux * 113156230Smux * operations := descriptor 114156230Smux * symbol 115156230Smux * character count 116156230Smux */ 117156230Smux 118156230Smux/* 119156230Smux * handy macros for accessing parts of match blocks 120156230Smux */ 121156230Smux#define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */ 122156230Smux#define MNEXT(A) (A+2) /* character following a metasymbol block */ 123156230Smux 124156230Smux#define OSYM(A) (*(A+1)) /* symbol in an operation block */ 125156230Smux#define OCNT(A) (*(A+2)) /* character count */ 126156230Smux#define ONEXT(A) (A+3) /* next character after the operation */ 127156230Smux#define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */ 128156230Smux 129156230Smux#define SCNT(A) (*(A+1)) /* byte count of a string */ 130156230Smux#define SSTR(A) (A+2) /* address of the string */ 131156230Smux#define SNEXT(A) (A+2+*(A+1)) /* character following the string */ 132156230Smux 133156230Smux/* 134156230Smux * bit flags in the descriptor 135156230Smux */ 136156230Smux#define OPT 1 137156230Smux#define STR 2 138156230Smux#define META 4 139156230Smux#define ALT 8 140156230Smux#define OPER 16 141156230Smux 142156230Smuxstatic char *ccre; /* pointer to current position in converted exp*/ 143156230Smuxstatic char *ure; /* pointer current position in unconverted exp */ 144156230Smux 145156230Smux/* re: unconverted irregular expression */ 146156230Smuxchar * 147156230Smuxconvexp(char *re) 148156230Smux{ 149156230Smux register char *cre; /* pointer to converted regular expression */ 150156230Smux 151156230Smux /* allocate room for the converted expression */ 152156230Smux if (re == NULL) 153156230Smux return (NULL); 154156230Smux if (*re == '\0') 155156230Smux return (NULL); 156156230Smux cre = malloc(4 * strlen(re) + 3); 157156230Smux ccre = cre; 158156230Smux ure = re; 159156230Smux 160156230Smux /* start the conversion with a \a */ 161156230Smux *cre = META | OPT; 162156230Smux MSYM(cre) = 'a'; 163156230Smux ccre = MNEXT(cre); 164156230Smux 165156230Smux /* start the conversion (its recursive) */ 166156230Smux expconv (); 167156230Smux *ccre = 0; 168156230Smux return (cre); 169156230Smux} 170156230Smux 171156230Smuxstatic void 172156230Smuxexpconv() 173156230Smux{ 174156230Smux register char *cs; /* pointer to current symbol in converted exp */ 175156230Smux register char c; /* character being processed */ 176156230Smux register char *acs; /* pinter to last alternate */ 177 register int temp; 178 179 /* let the conversion begin */ 180 acs = NULL; 181 cs = NULL; 182 while (*ure) { 183 switch (c = *ure++) { 184 185 case '\\': 186 switch (c = *ure++) { 187 188 /* escaped characters are just characters */ 189 default: 190 if (cs == NULL || (*cs & STR) == 0) { 191 cs = ccre; 192 *cs = STR; 193 SCNT(cs) = 1; 194 ccre += 2; 195 } else 196 SCNT(cs)++; 197 *ccre++ = c; 198 break; 199 200 /* normal(?) metacharacters */ 201 case 'a': 202 case 'd': 203 case 'e': 204 case 'p': 205 if (acs != NULL && acs != cs) { 206 do { 207 temp = OCNT(acs); 208 OCNT(acs) = ccre - acs; 209 acs -= temp; 210 } while (temp != 0); 211 acs = NULL; 212 } 213 cs = ccre; 214 *cs = META; 215 MSYM(cs) = c; 216 ccre = MNEXT(cs); 217 break; 218 } 219 break; 220 221 /* just put the symbol in */ 222 case '^': 223 case '$': 224 if (acs != NULL && acs != cs) { 225 do { 226 temp = OCNT(acs); 227 OCNT(acs) = ccre - acs; 228 acs -= temp; 229 } while (temp != 0); 230 acs = NULL; 231 } 232 cs = ccre; 233 *cs = META; 234 MSYM(cs) = c; 235 ccre = MNEXT(cs); 236 break; 237 238 /* mark the last match sequence as optional */ 239 case '?': 240 if (cs) 241 *cs = *cs | OPT; 242 break; 243 244 /* recurse and define a subexpression */ 245 case '(': 246 if (acs != NULL && acs != cs) { 247 do { 248 temp = OCNT(acs); 249 OCNT(acs) = ccre - acs; 250 acs -= temp; 251 } while (temp != 0); 252 acs = NULL; 253 } 254 cs = ccre; 255 *cs = OPER; 256 OSYM(cs) = '('; 257 ccre = ONEXT(cs); 258 expconv(); 259 OCNT(cs) = ccre - cs; /* offset to next symbol */ 260 break; 261 262 /* return from a recursion */ 263 case ')': 264 if (acs != NULL) { 265 do { 266 temp = OCNT(acs); 267 OCNT(acs) = ccre - acs; 268 acs -= temp; 269 } while (temp != 0); 270 acs = NULL; 271 } 272 cs = ccre; 273 *cs = META; 274 MSYM(cs) = c; 275 ccre = MNEXT(cs); 276 return; 277 278 /* mark the last match sequence as having an alternate */ 279 /* the third byte will contain an offset to jump over the */ 280 /* alternate match in case the first did not fail */ 281 case '|': 282 if (acs != NULL && acs != cs) 283 OCNT(ccre) = ccre - acs; /* make a back pointer */ 284 else 285 OCNT(ccre) = 0; 286 *cs |= ALT; 287 cs = ccre; 288 *cs = OPER; 289 OSYM(cs) = '|'; 290 ccre = ONEXT(cs); 291 acs = cs; /* remember that the pointer is to be filles */ 292 break; 293 294 /* if its not a metasymbol just build a scharacter string */ 295 default: 296 if (cs == NULL || (*cs & STR) == 0) { 297 cs = ccre; 298 *cs = STR; 299 SCNT(cs) = 1; 300 ccre = SSTR(cs); 301 } else 302 SCNT(cs)++; 303 *ccre++ = c; 304 break; 305 } 306 } 307 if (acs != NULL) { 308 do { 309 temp = OCNT(acs); 310 OCNT(acs) = ccre - acs; 311 acs -= temp; 312 } while (temp != 0); 313 acs = NULL; 314 } 315 return; 316} 317/* end of convertre */ 318 319 320/* 321 * The following routine recognises an irregular expression 322 * with the following special characters: 323 * 324 * \? - means last match was optional 325 * \a - matches any number of characters 326 * \d - matches any number of spaces and tabs 327 * \p - matches any number of alphanumeric 328 * characters. The 329 * characters matched will be copied into 330 * the area pointed to by 'name'. 331 * \| - alternation 332 * \( \) - grouping used mostly for alternation and 333 * optionality 334 * 335 * The irregular expression must be translated to internal form 336 * prior to calling this routine 337 * 338 * The value returned is the pointer to the first non \a 339 * character matched. 340 */ 341 342/* 343 * s: string to check for a match in 344 * re: a converted irregular expression 345 * mstring: where to put whatever matches a \p 346 */ 347char * 348expmatch (register char *s, register char *re, register char *mstring) 349{ 350 register char *cs; /* the current symbol */ 351 register char *ptr,*s1; /* temporary pointer */ 352 bool matched; /* a temporary bool */ 353 354 /* initial conditions */ 355 if (re == NULL) 356 return (NULL); 357 cs = re; 358 matched = false; 359 360 /* loop till expression string is exhausted (or at least pretty tired) */ 361 while (*cs) { 362 switch (*cs & (OPER | STR | META)) { 363 364 /* try to match a string */ 365 case STR: 366 matched = !STRNCMP (s, SSTR(cs), SCNT(cs)); 367 if (matched) { 368 369 /* hoorah it matches */ 370 s += SCNT(cs); 371 cs = SNEXT(cs); 372 } else if (*cs & ALT) { 373 374 /* alternation, skip to next expression */ 375 cs = SNEXT(cs); 376 } else if (*cs & OPT) { 377 378 /* the match is optional */ 379 cs = SNEXT(cs); 380 matched = 1; /* indicate a successful match */ 381 } else { 382 383 /* no match, error return */ 384 return (NULL); 385 } 386 break; 387 388 /* an operator, do something fancy */ 389 case OPER: 390 switch (OSYM(cs)) { 391 392 /* this is an alternation */ 393 case '|': 394 if (matched) 395 396 /* last thing in the alternation was a match, skip ahead */ 397 cs = OPTR(cs); 398 else 399 400 /* no match, keep trying */ 401 cs = ONEXT(cs); 402 break; 403 404 /* this is a grouping, recurse */ 405 case '(': 406 ptr = expmatch(s, ONEXT(cs), mstring); 407 if (ptr != NULL) { 408 409 /* the subexpression matched */ 410 matched = 1; 411 s = ptr; 412 } else if (*cs & ALT) { 413 414 /* alternation, skip to next expression */ 415 matched = 0; 416 } else if (*cs & OPT) { 417 418 /* the match is optional */ 419 matched = 1; /* indicate a successful match */ 420 } else { 421 422 /* no match, error return */ 423 return (NULL); 424 } 425 cs = OPTR(cs); 426 break; 427 } 428 break; 429 430 /* try to match a metasymbol */ 431 case META: 432 switch (MSYM(cs)) { 433 434 /* try to match anything and remember what was matched */ 435 case 'p': 436 /* 437 * This is really the same as trying the match the 438 * remaining parts of the expression to any subset 439 * of the string. 440 */ 441 s1 = s; 442 do { 443 ptr = expmatch(s1, MNEXT(cs), mstring); 444 if (ptr != NULL && s1 != s) { 445 446 /* we have a match, remember the match */ 447 strncpy (mstring, s, s1 - s); 448 mstring[s1 - s] = '\0'; 449 return (ptr); 450 } else if (ptr != NULL && (*cs & OPT)) { 451 452 /* it was aoptional so no match is ok */ 453 return (ptr); 454 } else if (ptr != NULL) { 455 456 /* not optional and we still matched */ 457 return (NULL); 458 } 459 if (!(isalnum(*s1) || *s1 == '_' || 460 /* C++ destructor */ 461 *s1 == '~' || 462 /* C++ scope operator */ 463 (strlen(s1) > 1 && *s1 == ':' && s1[1] == ':' && 464 (s1++, true)))) 465 return (NULL); 466 if (*s1 == '\\') 467 _escaped = _escaped ? false : true; 468 else 469 _escaped = false; 470 } while (*s1++); 471 return (NULL); 472 473 /* try to match anything */ 474 case 'a': 475 /* 476 * This is really the same as trying the match the 477 * remaining parts of the expression to any subset 478 * of the string. 479 */ 480 s1 = s; 481 do { 482 ptr = expmatch(s1, MNEXT(cs), mstring); 483 if (ptr != NULL && s1 != s) { 484 485 /* we have a match */ 486 return (ptr); 487 } else if (ptr != NULL && (*cs & OPT)) { 488 489 /* it was aoptional so no match is ok */ 490 return (ptr); 491 } else if (ptr != NULL) { 492 493 /* not optional and we still matched */ 494 return (NULL); 495 } 496 if (*s1 == '\\') 497 _escaped = _escaped ? false : true; 498 else 499 _escaped = false; 500 } while (*s1++); 501 return (NULL); 502 503 /* fail if we are currently _escaped */ 504 case 'e': 505 if (_escaped) 506 return(NULL); 507 cs = MNEXT(cs); 508 break; 509 510 /* match any number of tabs and spaces */ 511 case 'd': 512 ptr = s; 513 while (*s == ' ' || *s == '\t') 514 s++; 515 if (s != ptr || s == s_start) { 516 517 /* match, be happy */ 518 matched = 1; 519 cs = MNEXT(cs); 520 } else if (*s == '\n' || *s == '\0') { 521 522 /* match, be happy */ 523 matched = 1; 524 cs = MNEXT(cs); 525 } else if (*cs & ALT) { 526 527 /* try the next part */ 528 matched = 0; 529 cs = MNEXT(cs); 530 } else if (*cs & OPT) { 531 532 /* doesn't matter */ 533 matched = 1; 534 cs = MNEXT(cs); 535 } else 536 537 /* no match, error return */ 538 return (NULL); 539 break; 540 541 /* check for end of line */ 542 case '$': 543 if (*s == '\0' || *s == '\n') { 544 545 /* match, be happy */ 546 s++; 547 matched = 1; 548 cs = MNEXT(cs); 549 } else if (*cs & ALT) { 550 551 /* try the next part */ 552 matched = 0; 553 cs = MNEXT(cs); 554 } else if (*cs & OPT) { 555 556 /* doesn't matter */ 557 matched = 1; 558 cs = MNEXT(cs); 559 } else 560 561 /* no match, error return */ 562 return (NULL); 563 break; 564 565 /* check for start of line */ 566 case '^': 567 if (s == s_start) { 568 569 /* match, be happy */ 570 matched = 1; 571 cs = MNEXT(cs); 572 } else if (*cs & ALT) { 573 574 /* try the next part */ 575 matched = 0; 576 cs = MNEXT(cs); 577 } else if (*cs & OPT) { 578 579 /* doesn't matter */ 580 matched = 1; 581 cs = MNEXT(cs); 582 } else 583 584 /* no match, error return */ 585 return (NULL); 586 break; 587 588 /* end of a subexpression, return success */ 589 case ')': 590 return (s); 591 } 592 break; 593 } 594 } 595 return (s); 596} 597