tre-fastmatch.c revision 253810
1/* $FreeBSD: head/usr.bin/grep/regex/tre-fastmatch.c 253810 2013-07-30 18:16:43Z ache $ */ 2 3/*- 4 * Copyright (c) 1999 James Howard and Dag-Erling Co��dan Sm��rgrav 5 * Copyright (C) 2008-2011 Gabor Kovesdan <gabor@FreeBSD.org> 6 * All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30#include "glue.h" 31 32#include <ctype.h> 33#include <limits.h> 34#include <regex.h> 35#include <stdbool.h> 36#include <stdlib.h> 37#include <string.h> 38#ifdef TRE_WCHAR 39#include <wchar.h> 40#include <wctype.h> 41#endif 42 43#include "hashtable.h" 44#include "tre-fastmatch.h" 45#include "xmalloc.h" 46 47static int fastcmp(const fastmatch_t *fg, const void *data, 48 tre_str_type_t type); 49 50/* 51 * Clean up if pattern compilation fails. 52 */ 53#define FAIL_COMP(errcode) \ 54 { \ 55 if (fg->pattern) \ 56 xfree(fg->pattern); \ 57 if (fg->wpattern) \ 58 xfree(fg->wpattern); \ 59 if (fg->qsBc_table) \ 60 hashtable_free(fg->qsBc_table); \ 61 fg = NULL; \ 62 return errcode; \ 63 } 64 65/* 66 * Skips n characters in the input string and assigns the start 67 * address to startptr. Note: as per IEEE Std 1003.1-2008 68 * matching is based on bit pattern not character representations 69 * so we can handle MB strings as byte sequences just like 70 * SB strings. 71 */ 72#define SKIP_CHARS(n) \ 73 switch (type) \ 74 { \ 75 case STR_WIDE: \ 76 startptr = str_wide + n; \ 77 break; \ 78 default: \ 79 startptr = str_byte + n; \ 80 } 81 82/* 83 * Converts the wide string pattern to SB/MB string and stores 84 * it in fg->pattern. Sets fg->len to the byte length of the 85 * converted string. 86 */ 87#define STORE_MBS_PAT \ 88 { \ 89 size_t siz; \ 90 \ 91 siz = wcstombs(NULL, fg->wpattern, 0); \ 92 if (siz == (size_t)-1) \ 93 return REG_BADPAT; \ 94 fg->len = siz; \ 95 fg->pattern = xmalloc(siz + 1); \ 96 if (fg->pattern == NULL) \ 97 return REG_ESPACE; \ 98 wcstombs(fg->pattern, fg->wpattern, siz); \ 99 fg->pattern[siz] = '\0'; \ 100 } \ 101 102#define IS_OUT_OF_BOUNDS \ 103 ((!fg->reversed \ 104 ? ((type == STR_WIDE) ? ((j + fg->wlen) > len) \ 105 : ((j + fg->len) > len)) \ 106 : (j < 0))) 107 108/* 109 * Checks whether the new position after shifting in the input string 110 * is out of the bounds and break out from the loop if so. 111 */ 112#define CHECKBOUNDS \ 113 if (IS_OUT_OF_BOUNDS) \ 114 break; \ 115 116/* 117 * Shifts in the input string after a mismatch. The position of the 118 * mismatch is stored in the mismatch variable. 119 */ 120#define SHIFT \ 121 CHECKBOUNDS; \ 122 \ 123 { \ 124 int r = -1; \ 125 unsigned int bc = 0, gs = 0, ts; \ 126 \ 127 switch (type) \ 128 { \ 129 case STR_WIDE: \ 130 if (!fg->hasdot) \ 131 { \ 132 if (u != 0 && (unsigned)mismatch == fg->wlen - 1 - shift) \ 133 mismatch -= u; \ 134 v = fg->wlen - 1 - mismatch; \ 135 r = hashtable_get(fg->qsBc_table, \ 136 &str_wide[!fg->reversed ? (size_t)j + fg->wlen \ 137 : (size_t)j - 1], &bc); \ 138 gs = fg->bmGs[mismatch]; \ 139 } \ 140 bc = (r == HASH_OK) ? bc : fg->defBc; \ 141 DPRINT(("tre_fast_match: mismatch on character" CHF ", " \ 142 "BC %d, GS %d\n", \ 143 ((const tre_char_t *)startptr)[mismatch + 1], \ 144 bc, gs)); \ 145 break; \ 146 default: \ 147 if (!fg->hasdot) \ 148 { \ 149 if (u != 0 && (unsigned)mismatch == fg->len - 1 - shift) \ 150 mismatch -= u; \ 151 v = fg->len - 1 - mismatch; \ 152 gs = fg->sbmGs[mismatch]; \ 153 } \ 154 bc = fg->qsBc[((const unsigned char *)str_byte) \ 155 [!fg->reversed ? (size_t)j + fg->len \ 156 : (size_t)j - 1]]; \ 157 DPRINT(("tre_fast_match: mismatch on character %c, " \ 158 "BC %d, GS %d\n", \ 159 ((const unsigned char *)startptr)[mismatch + 1], \ 160 bc, gs)); \ 161 } \ 162 if (fg->hasdot) \ 163 shift = bc; \ 164 else \ 165 { \ 166 ts = (u >= v) ? (u - v) : 0; \ 167 shift = MAX(ts, bc); \ 168 shift = MAX(shift, gs); \ 169 if (shift == gs) \ 170 u = MIN((type == STR_WIDE ? fg->wlen : fg->len) - shift, v); \ 171 else \ 172 { \ 173 if (ts < bc) \ 174 shift = MAX(shift, u + 1); \ 175 u = 0; \ 176 } \ 177 } \ 178 DPRINT(("tre_fast_match: shifting %u characters\n", shift)); \ 179 j = !fg->reversed ? j + shift : j - shift; \ 180 } 181 182/* 183 * Normal Quick Search would require a shift based on the position the 184 * next character after the comparison is within the pattern. With 185 * wildcards, the position of the last dot effects the maximum shift 186 * distance. 187 * The closer to the end the wild card is the slower the search. 188 * 189 * Examples: 190 * Pattern Max shift 191 * ------- --------- 192 * this 5 193 * .his 4 194 * t.is 3 195 * th.s 2 196 * thi. 1 197 */ 198 199/* 200 * Fills in the bad character shift array for SB/MB strings. 201 */ 202#define FILL_QSBC \ 203 if (fg->reversed) \ 204 { \ 205 _FILL_QSBC_REVERSED \ 206 } \ 207 else \ 208 { \ 209 _FILL_QSBC \ 210 } 211 212#define _FILL_QSBC \ 213 for (unsigned int i = 0; i <= UCHAR_MAX; i++) \ 214 fg->qsBc[i] = fg->len - hasdot; \ 215 for (unsigned int i = hasdot + 1; i < fg->len; i++) \ 216 { \ 217 fg->qsBc[(unsigned char)fg->pattern[i]] = fg->len - i; \ 218 DPRINT(("BC shift for char %c is %zu\n", fg->pattern[i], \ 219 fg->len - i)); \ 220 if (fg->icase) \ 221 { \ 222 char c = islower((unsigned char)fg->pattern[i]) ? \ 223 toupper((unsigned char)fg->pattern[i]) : \ 224 tolower((unsigned char)fg->pattern[i]); \ 225 fg->qsBc[(unsigned char)c] = fg->len - i; \ 226 DPRINT(("BC shift for char %c is %zu\n", c, fg->len - i)); \ 227 } \ 228 } 229 230#define _FILL_QSBC_REVERSED \ 231 for (unsigned int i = 0; i <= UCHAR_MAX; i++) \ 232 fg->qsBc[i] = firstdot + 1; \ 233 for (int i = firstdot - 1; i >= 0; i--) \ 234 { \ 235 fg->qsBc[(unsigned char)fg->pattern[i]] = i + 1; \ 236 DPRINT(("Reverse BC shift for char %c is %d\n", fg->pattern[i], \ 237 i + 1)); \ 238 if (fg->icase) \ 239 { \ 240 char c = islower((unsigned char)fg->pattern[i]) ? \ 241 toupper((unsigned char)fg->pattern[i]) : \ 242 tolower((unsigned char)fg->pattern[i]); \ 243 fg->qsBc[(unsigned char)c] = i + 1; \ 244 DPRINT(("Reverse BC shift for char %c is %d\n", c, i + 1)); \ 245 } \ 246 } 247 248/* 249 * Fills in the bad character shifts into a hastable for wide strings. 250 * With wide characters it is not possible any more to use a normal 251 * array because there are too many characters and we could not 252 * provide enough memory. Fortunately, we only have to store distinct 253 * values for so many characters as the number of distinct characters 254 * in the pattern, so we can store them in a hashtable and store a 255 * default shift value for the rest. 256 */ 257#define FILL_QSBC_WIDE \ 258 if (fg->reversed) \ 259 { \ 260 _FILL_QSBC_WIDE_REVERSED \ 261 } \ 262 else \ 263 { \ 264 _FILL_QSBC_WIDE \ 265 } 266 267#define _FILL_QSBC_WIDE \ 268 /* Adjust the shift based on location of the last dot ('.'). */ \ 269 fg->defBc = fg->wlen - whasdot; \ 270 \ 271 /* Preprocess pattern. */ \ 272 fg->qsBc_table = hashtable_init(fg->wlen * (fg->icase ? 8 : 4), \ 273 sizeof(tre_char_t), sizeof(int)); \ 274 if (!fg->qsBc_table) \ 275 FAIL_COMP(REG_ESPACE); \ 276 for (unsigned int i = whasdot + 1; i < fg->wlen; i++) \ 277 { \ 278 int k = fg->wlen - i; \ 279 int r; \ 280 \ 281 r = hashtable_put(fg->qsBc_table, &fg->wpattern[i], &k); \ 282 if ((r == HASH_FAIL) || (r == HASH_FULL)) \ 283 FAIL_COMP(REG_ESPACE); \ 284 DPRINT(("BC shift for wide char " CHF " is %d\n", fg->wpattern[i],\ 285 k)); \ 286 if (fg->icase) \ 287 { \ 288 tre_char_t wc = iswlower(fg->wpattern[i]) ? \ 289 towupper(fg->wpattern[i]) : towlower(fg->wpattern[i]); \ 290 r = hashtable_put(fg->qsBc_table, &wc, &k); \ 291 if ((r == HASH_FAIL) || (r == HASH_FULL)) \ 292 FAIL_COMP(REG_ESPACE); \ 293 DPRINT(("BC shift for wide char " CHF " is %d\n", wc, k)); \ 294 } \ 295 } 296 297#define _FILL_QSBC_WIDE_REVERSED \ 298 /* Adjust the shift based on location of the last dot ('.'). */ \ 299 fg->defBc = (size_t)wfirstdot; \ 300 \ 301 /* Preprocess pattern. */ \ 302 fg->qsBc_table = hashtable_init(fg->wlen * (fg->icase ? 8 : 4), \ 303 sizeof(tre_char_t), sizeof(int)); \ 304 if (!fg->qsBc_table) \ 305 FAIL_COMP(REG_ESPACE); \ 306 for (int i = wfirstdot - 1; i >= 0; i--) \ 307 { \ 308 int k = i + 1; \ 309 int r; \ 310 \ 311 r = hashtable_put(fg->qsBc_table, &fg->wpattern[i], &k); \ 312 if ((r == HASH_FAIL) || (r == HASH_FULL)) \ 313 FAIL_COMP(REG_ESPACE); \ 314 DPRINT(("Reverse BC shift for wide char " CHF " is %d\n", \ 315 fg->wpattern[i], k)); \ 316 if (fg->icase) \ 317 { \ 318 tre_char_t wc = iswlower(fg->wpattern[i]) ? \ 319 towupper(fg->wpattern[i]) : towlower(fg->wpattern[i]); \ 320 r = hashtable_put(fg->qsBc_table, &wc, &k); \ 321 if ((r == HASH_FAIL) || (r == HASH_FULL)) \ 322 FAIL_COMP(REG_ESPACE); \ 323 DPRINT(("Reverse BC shift for wide char " CHF " is %d\n", wc, \ 324 k)); \ 325 } \ 326 } 327 328#ifdef _GREP_DEBUG 329#define DPRINT_BMGS(len, fmt_str, sh) \ 330 for (unsigned int i = 0; i < len; i++) \ 331 DPRINT((fmt_str, i, sh[i])); 332#else 333#define DPRINT_BMGS(len, fmt_str, sh) \ 334 do { } while(/*CONSTCOND*/0) 335#endif 336 337/* 338 * Fills in the good suffix table for SB/MB strings. 339 */ 340#define FILL_BMGS \ 341 if (!fg->hasdot) \ 342 { \ 343 fg->sbmGs = xmalloc(fg->len * sizeof(int)); \ 344 if (!fg->sbmGs) \ 345 return REG_ESPACE; \ 346 if (fg->len == 1) \ 347 fg->sbmGs[0] = 1; \ 348 else \ 349 _FILL_BMGS(fg->sbmGs, fg->pattern, fg->len, false); \ 350 DPRINT_BMGS(fg->len, "GS shift for pos %d is %d\n", fg->sbmGs); \ 351 } 352 353/* 354 * Fills in the good suffix table for wide strings. 355 */ 356#define FILL_BMGS_WIDE \ 357 if (!fg->hasdot) \ 358 { \ 359 fg->bmGs = xmalloc(fg->wlen * sizeof(int)); \ 360 if (!fg->bmGs) \ 361 return REG_ESPACE; \ 362 if (fg->wlen == 1) \ 363 fg->bmGs[0] = 1; \ 364 else \ 365 _FILL_BMGS(fg->bmGs, fg->wpattern, fg->wlen, true); \ 366 DPRINT_BMGS(fg->wlen, "GS shift (wide) for pos %d is %d\n", \ 367 fg->bmGs); \ 368 } 369 370#define _FILL_BMGS(arr, pat, plen, wide) \ 371 { \ 372 char *p; \ 373 tre_char_t *wp; \ 374 \ 375 if (wide) \ 376 { \ 377 if (fg->icase) \ 378 { \ 379 wp = xmalloc(plen * sizeof(tre_char_t)); \ 380 if (wp == NULL) \ 381 return REG_ESPACE; \ 382 for (unsigned int i = 0; i < plen; i++) \ 383 wp[i] = towlower(pat[i]); \ 384 _CALC_BMGS(arr, wp, plen); \ 385 xfree(wp); \ 386 } \ 387 else \ 388 _CALC_BMGS(arr, pat, plen); \ 389 } \ 390 else \ 391 { \ 392 if (fg->icase) \ 393 { \ 394 p = xmalloc(plen); \ 395 if (p == NULL) \ 396 return REG_ESPACE; \ 397 for (unsigned int i = 0; i < plen; i++) \ 398 p[i] = tolower((unsigned char)pat[i]); \ 399 _CALC_BMGS(arr, p, plen); \ 400 xfree(p); \ 401 } \ 402 else \ 403 _CALC_BMGS(arr, pat, plen); \ 404 } \ 405 } 406 407#define _CALC_BMGS(arr, pat, plen) \ 408 { \ 409 int f = 0, g; \ 410 \ 411 int *suff = xmalloc(plen * sizeof(int)); \ 412 if (suff == NULL) \ 413 return REG_ESPACE; \ 414 \ 415 suff[plen - 1] = plen; \ 416 g = plen - 1; \ 417 for (int i = plen - 2; i >= 0; i--) \ 418 { \ 419 if (i > g && suff[i + plen - 1 - f] < i - g) \ 420 suff[i] = suff[i + plen - 1 - f]; \ 421 else \ 422 { \ 423 if (i < g) \ 424 g = i; \ 425 f = i; \ 426 while (g >= 0 && pat[g] == pat[g + plen - 1 - f]) \ 427 g--; \ 428 suff[i] = f - g; \ 429 } \ 430 } \ 431 \ 432 for (unsigned int i = 0; i < plen; i++) \ 433 arr[i] = plen; \ 434 g = 0; \ 435 for (int i = plen - 1; i >= 0; i--) \ 436 if (suff[i] == i + 1) \ 437 for(; (unsigned long)g < plen - 1 - i; g++) \ 438 if (arr[g] == plen) \ 439 arr[g] = plen - 1 - i; \ 440 for (unsigned int i = 0; i <= plen - 2; i++) \ 441 arr[plen - 1 - suff[i]] = plen - 1 - i; \ 442 \ 443 xfree(suff); \ 444 } 445 446/* 447 * Copies the pattern pat having lenght n to p and stores 448 * the size in l. 449 */ 450#define SAVE_PATTERN(src, srclen, dst, dstlen) \ 451 dstlen = srclen; \ 452 dst = xmalloc((dstlen + 1) * sizeof(tre_char_t)); \ 453 if (dst == NULL) \ 454 return REG_ESPACE; \ 455 if (dstlen > 0) \ 456 memcpy(dst, src, dstlen * sizeof(tre_char_t)); \ 457 dst[dstlen] = TRE_CHAR('\0'); 458 459/* 460 * Initializes pattern compiling. 461 */ 462#define INIT_COMP \ 463 /* Initialize. */ \ 464 memset(fg, 0, sizeof(*fg)); \ 465 fg->icase = (cflags & REG_ICASE); \ 466 fg->word = (cflags & REG_WORD); \ 467 fg->newline = (cflags & REG_NEWLINE); \ 468 fg->nosub = (cflags & REG_NOSUB); \ 469 \ 470 /* Cannot handle REG_ICASE with MB string */ \ 471 if (fg->icase && (TRE_MB_CUR_MAX > 1) && n > 0) \ 472 { \ 473 DPRINT(("Cannot use fast matcher for MBS with REG_ICASE\n")); \ 474 return REG_BADPAT; \ 475 } 476 477/* 478 * Checks whether we have a 0-length pattern that will match 479 * anything. If literal is set to false, the EOL anchor is also 480 * taken into account. 481 */ 482#define CHECK_MATCHALL(literal) \ 483 if (!literal && n == 1 && pat[0] == TRE_CHAR('$')) \ 484 { \ 485 n--; \ 486 fg->eol = true; \ 487 } \ 488 \ 489 if (n == 0) \ 490 { \ 491 fg->matchall = true; \ 492 fg->pattern = xmalloc(sizeof(char)); \ 493 if (!fg->pattern) \ 494 FAIL_COMP(REG_ESPACE); \ 495 fg->pattern[0] = '\0'; \ 496 fg->wpattern = xmalloc(sizeof(tre_char_t)); \ 497 if (!fg->wpattern) \ 498 FAIL_COMP(REG_ESPACE); \ 499 fg->wpattern[0] = TRE_CHAR('\0'); \ 500 DPRINT(("Matching every input\n")); \ 501 return REG_OK; \ 502 } 503 504/* 505 * Returns: REG_OK on success, error code otherwise 506 */ 507int 508tre_compile_literal(fastmatch_t *fg, const tre_char_t *pat, size_t n, 509 int cflags) 510{ 511 size_t hasdot = 0, whasdot = 0; 512 ssize_t firstdot = -1, wfirstdot = -1; 513 514 INIT_COMP; 515 516 CHECK_MATCHALL(true); 517 518 /* Cannot handle word boundaries with MB string */ 519 if (fg->word && (TRE_MB_CUR_MAX > 1)) 520 return REG_BADPAT; 521 522#ifdef TRE_WCHAR 523 SAVE_PATTERN(pat, n, fg->wpattern, fg->wlen); 524 STORE_MBS_PAT; 525#else 526 SAVE_PATTERN(pat, n, fg->pattern, fg->len); 527#endif 528 529 DPRINT(("tre_compile_literal: pattern: %s, len %zu, icase: %c, word: %c, " 530 "newline %c\n", fg->pattern, fg->len, fg->icase ? 'y' : 'n', 531 fg->word ? 'y' : 'n', fg->newline ? 'y' : 'n')); 532 533 FILL_QSBC; 534 FILL_BMGS; 535#ifdef TRE_WCHAR 536 FILL_QSBC_WIDE; 537 FILL_BMGS_WIDE; 538#endif 539 540 return REG_OK; 541} 542 543/* 544 * Returns: REG_OK on success, error code otherwise 545 */ 546int 547tre_compile_fast(fastmatch_t *fg, const tre_char_t *pat, size_t n, 548 int cflags) 549{ 550 tre_char_t *tmp; 551 size_t pos = 0, hasdot = 0, whasdot = 0; 552 ssize_t firstdot = -1, wfirstdot = -1; 553 bool escaped = false; 554 bool *_escmap = NULL; 555 556 INIT_COMP; 557 558 /* Remove beginning-of-line character ('^'). */ 559 if (pat[0] == TRE_CHAR('^')) 560 { 561 fg->bol = true; 562 n--; 563 pat++; 564 } 565 566 CHECK_MATCHALL(false); 567 568 /* Handle word-boundary matching when GNU extensions are enabled */ 569 if ((cflags & REG_GNU) && (n >= 14) && 570 (memcmp(pat, TRE_CHAR("[[:<:]]"), 7 * sizeof(tre_char_t)) == 0) && 571 (memcmp(pat + n - 7, TRE_CHAR("[[:>:]]"), 572 7 * sizeof(tre_char_t)) == 0)) 573 { 574 n -= 14; 575 pat += 7; 576 fg->word = true; 577 } 578 579 /* Cannot handle word boundaries with MB string */ 580 if (fg->word && (TRE_MB_CUR_MAX > 1)) 581 return REG_BADPAT; 582 583 tmp = xmalloc((n + 1) * sizeof(tre_char_t)); 584 if (tmp == NULL) 585 return REG_ESPACE; 586 587/* Copies the char into the stored pattern and skips to the next char. */ 588#define STORE_CHAR \ 589 do \ 590 { \ 591 tmp[pos++] = pat[i]; \ 592 escaped = false; \ 593 continue; \ 594 } while (0) 595 596 /* Traverse the input pattern for processing */ 597 for (unsigned int i = 0; i < n; i++) 598 { 599 switch (pat[i]) 600 { 601 case TRE_CHAR('\\'): 602 if (escaped) 603 STORE_CHAR; 604 else if (i == n - 1) 605 goto badpat; 606 else 607 escaped = true; 608 continue; 609 case TRE_CHAR('['): 610 if (escaped) 611 STORE_CHAR; 612 else 613 goto badpat; 614 continue; 615 case TRE_CHAR('*'): 616 if (escaped || (!(cflags & REG_EXTENDED) && (i == 0))) 617 STORE_CHAR; 618 else 619 goto badpat; 620 continue; 621 case TRE_CHAR('+'): 622 case TRE_CHAR('?'): 623 if ((cflags & REG_EXTENDED) && (i == 0)) 624 continue; 625 else if ((cflags & REG_EXTENDED) ^ !escaped) 626 STORE_CHAR; 627 else 628 goto badpat; 629 continue; 630 case TRE_CHAR('.'): 631 if (escaped) 632 { 633 if (!_escmap) 634 _escmap = xmalloc(n * sizeof(bool)); 635 if (!_escmap) 636 { 637 xfree(tmp); 638 return REG_ESPACE; 639 } 640 _escmap[i] = true; 641 STORE_CHAR; 642 } 643 else 644 { 645 whasdot = i; 646 if (wfirstdot == -1) 647 wfirstdot = i; 648 STORE_CHAR; 649 } 650 continue; 651 case TRE_CHAR('^'): 652 STORE_CHAR; 653 continue; 654 case TRE_CHAR('$'): 655 if (!escaped && (i == n - 1)) 656 fg->eol = true; 657 else 658 STORE_CHAR; 659 continue; 660 case TRE_CHAR('('): 661 if ((cflags & REG_EXTENDED) ^ escaped) 662 goto badpat; 663 else 664 STORE_CHAR; 665 continue; 666 case TRE_CHAR('{'): 667 if (!(cflags & REG_EXTENDED) ^ escaped) 668 STORE_CHAR; 669 else if (!(cflags & REG_EXTENDED) && (i == 0)) 670 STORE_CHAR; 671 else if ((cflags & REG_EXTENDED) && (i == 0)) 672 continue; 673 else 674 goto badpat; 675 continue; 676 case TRE_CHAR('|'): 677 if ((cflags & REG_EXTENDED) ^ escaped) 678 goto badpat; 679 else 680 STORE_CHAR; 681 continue; 682 default: 683 if (escaped) 684 goto badpat; 685 else 686 STORE_CHAR; 687 continue; 688 } 689 continue; 690badpat: 691 xfree(tmp); 692 DPRINT(("tre_compile_fast: compilation of pattern failed, falling" 693 "back to NFA\n")); 694 return REG_BADPAT; 695 } 696 697 fg->hasdot = wfirstdot > -1; 698 699 /* 700 * The pattern has been processed and copied to tmp as a literal string 701 * with escapes, anchors (^$) and the word boundary match character 702 * classes stripped out. 703 */ 704#ifdef TRE_WCHAR 705 SAVE_PATTERN(tmp, pos, fg->wpattern, fg->wlen); 706 fg->wescmap = _escmap; 707 STORE_MBS_PAT; 708 709 /* 710 * The position of dots and escaped dots is different in the MB string 711 * than in to the wide string so traverse the converted string, as well, 712 * to store these positions. 713 */ 714 if (fg->hasdot || (fg->wescmap != NULL)) 715 { 716 if (fg->wescmap != NULL) 717 { 718 fg->escmap = xmalloc(fg->len * sizeof(bool)); 719 if (!fg->escmap) 720 { 721 tre_free_fast(fg); 722 return REG_ESPACE; 723 } 724 } 725 726 escaped = false; 727 for (unsigned int i = 0; i < fg->len; i++) 728 if (fg->pattern[i] == '\\') 729 escaped = !escaped; 730 else if (fg->pattern[i] == '.' && escaped) 731 { 732 fg->escmap[i] = true; 733 escaped = false; 734 } 735 else if (fg->pattern[i] == '.' && !escaped) 736 { 737 hasdot = i; 738 if (firstdot == -1) 739 firstdot = i; 740 } 741 else 742 escaped = false; 743 } 744#else 745 SAVE_PATTERN(tmp, pos, fg->pattern, fg->len); 746 fg->escmap = _escmap; 747#endif 748 749 xfree(tmp); 750 751 DPRINT(("tre_compile_fast: pattern: %s, len %zu, bol %c, eol %c, " 752 "icase: %c, word: %c, newline %c\n", fg->pattern, fg->len, 753 fg->bol ? 'y' : 'n', fg->eol ? 'y' : 'n', 754 fg->icase ? 'y' : 'n', fg->word ? 'y' : 'n', 755 fg->newline ? 'y' : 'n')); 756 757 /* Check whether reverse QS algorithm is more efficient */ 758 if ((wfirstdot > -1) && (fg->wlen - whasdot + 1 < (size_t)wfirstdot) && 759 fg->nosub) 760 { 761 fg->reversed = true; 762 DPRINT(("tre_compile_fast: using reverse QS algorithm\n")); 763 } 764 765 FILL_QSBC; 766 FILL_BMGS; 767#ifdef TRE_WCHAR 768 FILL_QSBC_WIDE; 769 FILL_BMGS_WIDE; 770#endif 771 772 return REG_OK; 773} 774 775#define _SHIFT_ONE \ 776 { \ 777 shift = 1; \ 778 j = !fg->reversed ? j + shift : j - shift; \ 779 continue; \ 780 } 781 782#define _BBOUND_COND \ 783 ((type == STR_WIDE) ? \ 784 ((j == 0) || !(tre_isalnum(str_wide[j - 1]) || \ 785 (str_wide[j - 1] == TRE_CHAR('_')))) : \ 786 ((j == 0) || !(tre_isalnum(str_byte[j - 1]) || \ 787 (str_byte[j - 1] == '_')))) 788 789#define _EBOUND_COND \ 790 ((type == STR_WIDE) ? \ 791 ((j + fg->wlen == len) || !(tre_isalnum(str_wide[j + fg->wlen]) || \ 792 (str_wide[j + fg->wlen] == TRE_CHAR('_')))) : \ 793 ((j + fg->len == len) || !(tre_isalnum(str_byte[j + fg->len]) || \ 794 (str_byte[j + fg->len] == '_')))) 795 796/* 797 * Condition to check whether the match on position j is on a 798 * word boundary. 799 */ 800#define IS_ON_WORD_BOUNDARY \ 801 (_BBOUND_COND && _EBOUND_COND) 802 803/* 804 * Checks word boundary and shifts one if match is not on a 805 * boundary. 806 */ 807#define CHECK_WORD_BOUNDARY \ 808 if (!IS_ON_WORD_BOUNDARY) \ 809 _SHIFT_ONE; 810 811#define _BOL_COND \ 812 ((j == 0) || ((type == STR_WIDE) ? (str_wide[j - 1] == TRE_CHAR('\n'))\ 813 : (str_byte[j - 1] == '\n'))) 814 815/* 816 * Checks BOL anchor and shifts one if match is not on a 817 * boundary. 818 */ 819#define CHECK_BOL_ANCHOR \ 820 if (!_BOL_COND) \ 821 _SHIFT_ONE; 822 823#define _EOL_COND \ 824 ((type == STR_WIDE) \ 825 ? ((j + fg->wlen == len) || \ 826 (str_wide[j + fg->wlen] == TRE_CHAR('\n'))) \ 827 : ((j + fg->len == len) || (str_byte[j + fg->wlen] == '\n'))) 828 829/* 830 * Checks EOL anchor and shifts one if match is not on a 831 * boundary. 832 */ 833#define CHECK_EOL_ANCHOR \ 834 if (!_EOL_COND) \ 835 _SHIFT_ONE; 836 837/* 838 * Executes matching of the precompiled pattern on the input string. 839 * Returns REG_OK or REG_NOMATCH depending on if we find a match or not. 840 */ 841int 842tre_match_fast(const fastmatch_t *fg, const void *data, size_t len, 843 tre_str_type_t type, int nmatch, regmatch_t pmatch[], int eflags) 844{ 845 unsigned int shift, u = 0, v = 0; 846 ssize_t j = 0; 847 int ret = REG_NOMATCH; 848 int mismatch; 849 const char *str_byte = data; 850 const void *startptr = NULL; 851 const tre_char_t *str_wide = data; 852 853 /* Calculate length if unspecified. */ 854 if (len == (size_t)-1) 855 switch (type) 856 { 857 case STR_WIDE: 858 len = tre_strlen(str_wide); 859 break; 860 default: 861 len = strlen(str_byte); 862 break; 863 } 864 865 /* Shortcut for empty pattern */ 866 if (fg->matchall) 867 { 868 if (!fg->nosub && nmatch >= 1) 869 { 870 pmatch[0].rm_so = 0; 871 pmatch[0].rm_eo = len; 872 } 873 if (fg->bol && fg->eol) 874 return (len == 0) ? REG_OK : REG_NOMATCH; 875 else 876 return REG_OK; 877 } 878 879 /* No point in going farther if we do not have enough data. */ 880 switch (type) 881 { 882 case STR_WIDE: 883 if (len < fg->wlen) 884 return ret; 885 shift = fg->wlen; 886 break; 887 default: 888 if (len < fg->len) 889 return ret; 890 shift = fg->len; 891 } 892 893 /* 894 * REG_NOTBOL means not anchoring ^ to the beginning of the line, so we 895 * can shift one because there can't be a match at the beginning. 896 */ 897 if (fg->bol && (eflags & REG_NOTBOL)) 898 j = 1; 899 900 /* 901 * Like above, we cannot have a match at the very end when anchoring to 902 * the end and REG_NOTEOL is specified. 903 */ 904 if (fg->eol && (eflags & REG_NOTEOL)) 905 len--; 906 907 if (fg->reversed) 908 j = len - (type == STR_WIDE ? fg->wlen : fg->len); 909 910 911 /* Only try once at the beginning or ending of the line. */ 912 if ((fg->bol || fg->eol) && !fg->newline && !(eflags & REG_NOTBOL) && 913 !(eflags & REG_NOTEOL)) 914 { 915 /* Simple text comparison. */ 916 if (!((fg->bol && fg->eol) && 917 (type == STR_WIDE ? (len != fg->wlen) : (len != fg->len)))) 918 { 919 /* Determine where in data to start search at. */ 920 j = fg->eol ? len - (type == STR_WIDE ? fg->wlen : fg->len) : 0; 921 SKIP_CHARS(j); 922 mismatch = fastcmp(fg, startptr, type); 923 if (mismatch == REG_OK) 924 { 925 if (fg->word && !IS_ON_WORD_BOUNDARY) 926 return ret; 927 if (!fg->nosub && nmatch >= 1) 928 { 929 pmatch[0].rm_so = j; 930 pmatch[0].rm_eo = j + (type == STR_WIDE ? fg->wlen : fg->len); 931 } 932 return REG_OK; 933 } 934 } 935 } 936 else 937 { 938 /* Quick Search / Turbo Boyer-Moore algorithm. */ 939 do 940 { 941 SKIP_CHARS(j); 942 mismatch = fastcmp(fg, startptr, type); 943 if (mismatch == REG_OK) 944 { 945 if (fg->word) 946 CHECK_WORD_BOUNDARY; 947 if (fg->bol) 948 CHECK_BOL_ANCHOR; 949 if (fg->eol) 950 CHECK_EOL_ANCHOR; 951 if (!fg->nosub && nmatch >= 1) 952 { 953 pmatch[0].rm_so = j; 954 pmatch[0].rm_eo = j + ((type == STR_WIDE) ? fg->wlen : fg->len); 955 } 956 return REG_OK; 957 } 958 else if (mismatch > 0) 959 return mismatch; 960 mismatch = -mismatch - 1; 961 SHIFT; 962 } while (!IS_OUT_OF_BOUNDS); 963 } 964 return ret; 965} 966 967/* 968 * Frees the resources that were allocated when the pattern was compiled. 969 */ 970void 971tre_free_fast(fastmatch_t *fg) 972{ 973 974 DPRINT(("tre_fast_free: freeing structures for pattern %s\n", 975 fg->pattern)); 976 977#ifdef TRE_WCHAR 978 hashtable_free(fg->qsBc_table); 979 if (!fg->hasdot) 980 xfree(fg->bmGs); 981 if (fg->wescmap) 982 xfree(fg->wescmap); 983 xfree(fg->wpattern); 984#endif 985 if (!fg->hasdot) 986 xfree(fg->sbmGs); 987 if (fg->escmap) 988 xfree(fg->escmap); 989 xfree(fg->pattern); 990} 991 992/* 993 * Returns: -(i + 1) on failure (position that it failed with minus sign) 994 * error code on error 995 * REG_OK on success 996 */ 997static inline int 998fastcmp(const fastmatch_t *fg, const void *data, tre_str_type_t type) 999{ 1000 const char *str_byte = data; 1001 const char *pat_byte = fg->pattern; 1002 const tre_char_t *str_wide = data; 1003 const tre_char_t *pat_wide = fg->wpattern; 1004 const bool *escmap = (type == STR_WIDE) ? fg->wescmap : fg->escmap; 1005 size_t len = (type == STR_WIDE) ? fg->wlen : fg->len; 1006 int ret = REG_OK; 1007 1008 /* Compare the pattern and the input char-by-char from the last position. */ 1009 for (int i = len - 1; i >= 0; i--) { 1010 switch (type) 1011 { 1012 case STR_WIDE: 1013 1014 /* Check dot */ 1015 if (fg->hasdot && pat_wide[i] == TRE_CHAR('.') && 1016 (!escmap || !escmap[i]) && 1017 (!fg->newline || (str_wide[i] != TRE_CHAR('\n')))) 1018 continue; 1019 1020 /* Compare */ 1021 if (fg->icase ? (towlower(pat_wide[i]) == towlower(str_wide[i])) 1022 : (pat_wide[i] == str_wide[i])) 1023 continue; 1024 break; 1025 default: 1026 /* Check dot */ 1027 if (fg->hasdot && pat_byte[i] == '.' && 1028 (!escmap || !escmap[i]) && 1029 (!fg->newline || (str_byte[i] != '\n'))) 1030 continue; 1031 1032 /* Compare */ 1033 if (fg->icase ? (tolower((unsigned char)pat_byte[i]) == tolower((unsigned char)str_byte[i])) 1034 : (pat_byte[i] == str_byte[i])) 1035 continue; 1036 } 1037 DPRINT(("fastcmp: mismatch at position %d\n", i)); 1038 ret = -(i + 1); 1039 break; 1040 } 1041 return ret; 1042} 1043