tre-fastmatch.c revision 265160
1/* $FreeBSD: stable/10/usr.bin/grep/regex/tre-fastmatch.c 265160 2014-04-30 20:39:08Z pfg $ */
2
3/*-
4 * Copyright (c) 1999 James Howard and Dag-Erling Co��dan Sm��rgrav
5 * Copyright (C) 2008-2011 Gabor Kovesdan <gabor@FreeBSD.org>
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30#include "glue.h"
31
32#include <ctype.h>
33#include <limits.h>
34#include <regex.h>
35#include <stdbool.h>
36#include <stdlib.h>
37#include <string.h>
38#ifdef TRE_WCHAR
39#include <wchar.h>
40#include <wctype.h>
41#endif
42
43#include "hashtable.h"
44#include "tre-fastmatch.h"
45#include "xmalloc.h"
46
47static int	fastcmp(const fastmatch_t *fg, const void *data,
48			tre_str_type_t type);
49
50/*
51 * Clean up if pattern compilation fails.
52 */
53#define FAIL_COMP(errcode)						\
54  {									\
55    if (fg->pattern)							\
56      xfree(fg->pattern);						\
57    if (fg->wpattern)							\
58      xfree(fg->wpattern);						\
59    if (fg->qsBc_table)							\
60      hashtable_free(fg->qsBc_table);					\
61    fg = NULL;								\
62    return errcode;							\
63  }
64
65/*
66 * Skips n characters in the input string and assigns the start
67 * address to startptr. Note: as per IEEE Std 1003.1-2008
68 * matching is based on bit pattern not character representations
69 * so we can handle MB strings as byte sequences just like
70 * SB strings.
71 */
72#define SKIP_CHARS(n)							\
73  switch (type)								\
74    {									\
75      case STR_WIDE:							\
76	startptr = str_wide + n;					\
77	break;								\
78      default:								\
79	startptr = str_byte + n;					\
80    }
81
82/*
83 * Converts the wide string pattern to SB/MB string and stores
84 * it in fg->pattern. Sets fg->len to the byte length of the
85 * converted string.
86 */
87#define STORE_MBS_PAT							\
88  {									\
89    size_t siz;								\
90									\
91    siz = wcstombs(NULL, fg->wpattern, 0);				\
92    if (siz == (size_t)-1)						\
93      return REG_BADPAT;						\
94    fg->len = siz;							\
95    fg->pattern = xmalloc(siz + 1);					\
96    if (fg->pattern == NULL)						\
97      return REG_ESPACE;						\
98    wcstombs(fg->pattern, fg->wpattern, siz);				\
99    fg->pattern[siz] = '\0';						\
100  }									\
101
102#define IS_OUT_OF_BOUNDS						\
103  ((!fg->reversed							\
104    ? ((type == STR_WIDE) ? ((j + fg->wlen) > len)			\
105			  : ((j + fg->len) > len))			\
106    : (j < 0)))
107
108/*
109 * Checks whether the new position after shifting in the input string
110 * is out of the bounds and break out from the loop if so.
111 */
112#define CHECKBOUNDS							\
113  if (IS_OUT_OF_BOUNDS)							\
114    break;								\
115
116/*
117 * Shifts in the input string after a mismatch. The position of the
118 * mismatch is stored in the mismatch variable.
119 */
120#define SHIFT								\
121  CHECKBOUNDS;								\
122									\
123  {									\
124    int r = -1;								\
125    unsigned int bc = 0, gs = 0, ts;					\
126									\
127    switch (type)							\
128      {									\
129	case STR_WIDE:							\
130	  if (!fg->hasdot)						\
131	    {								\
132	      if (u != 0 && (unsigned)mismatch == fg->wlen - 1 - shift)	\
133		mismatch -= u;						\
134	      v = fg->wlen - 1 - mismatch;				\
135	      r = hashtable_get(fg->qsBc_table,				\
136		&str_wide[!fg->reversed ? (size_t)j + fg->wlen		\
137			  : (size_t)j - 1], &bc);			\
138	      gs = fg->bmGs[mismatch];					\
139	    }								\
140	    bc = (r == HASH_OK) ? bc : fg->defBc;			\
141	    DPRINT(("tre_fast_match: mismatch on character" CHF ", "	\
142		    "BC %d, GS %d\n",					\
143		    ((const tre_char_t *)startptr)[mismatch + 1],	\
144		    bc, gs));						\
145            break;							\
146	default:							\
147	  if (!fg->hasdot)						\
148	    {								\
149	      if (u != 0 && (unsigned)mismatch == fg->len - 1 - shift)	\
150		mismatch -= u;						\
151	      v = fg->len - 1 - mismatch;				\
152	      gs = fg->sbmGs[mismatch];					\
153	    }								\
154	  bc = fg->qsBc[((const unsigned char *)str_byte)		\
155			[!fg->reversed ? (size_t)j + fg->len		\
156			 : (size_t)j - 1]];				\
157	  DPRINT(("tre_fast_match: mismatch on character %c, "		\
158		 "BC %d, GS %d\n",					\
159		 ((const unsigned char *)startptr)[mismatch + 1],	\
160		 bc, gs));						\
161      }									\
162    if (fg->hasdot)							\
163      shift = bc;							\
164    else								\
165      {									\
166	ts = (u >= v) ? (u - v) : 0;					\
167	shift = MAX(ts, bc);						\
168	shift = MAX(shift, gs);						\
169	if (shift == gs)						\
170	  u = MIN((type == STR_WIDE ? fg->wlen : fg->len) - shift, v);	\
171	else								\
172	  {								\
173	    if (ts < bc)						\
174	      shift = MAX(shift, u + 1);				\
175	    u = 0;							\
176	  }								\
177      }									\
178      DPRINT(("tre_fast_match: shifting %u characters\n", shift));	\
179      j = !fg->reversed ? j + shift : j - shift;			\
180  }
181
182/*
183 * Normal Quick Search would require a shift based on the position the
184 * next character after the comparison is within the pattern.  With
185 * wildcards, the position of the last dot effects the maximum shift
186 * distance.
187 * The closer to the end the wild card is the slower the search.
188 *
189 * Examples:
190 * Pattern    Max shift
191 * -------    ---------
192 * this               5
193 * .his               4
194 * t.is               3
195 * th.s               2
196 * thi.               1
197 */
198
199/*
200 * Fills in the bad character shift array for SB/MB strings.
201 */
202#define FILL_QSBC							\
203  if (fg->reversed)							\
204    {									\
205      _FILL_QSBC_REVERSED						\
206    }									\
207  else									\
208    {									\
209      _FILL_QSBC							\
210    }
211
212#define _FILL_QSBC							\
213  for (unsigned int i = 0; i <= UCHAR_MAX; i++)				\
214    fg->qsBc[i] = fg->len - hasdot;					\
215  for (unsigned int i = hasdot + 1; i < fg->len; i++)			\
216    {									\
217      fg->qsBc[(unsigned char)fg->pattern[i]] = fg->len - i;		\
218      DPRINT(("BC shift for char %c is %zu\n", fg->pattern[i],		\
219	     fg->len - i));						\
220      if (fg->icase)							\
221	{								\
222	  char c = islower((unsigned char)fg->pattern[i]) ?		\
223		   toupper((unsigned char)fg->pattern[i]) :		\
224		   tolower((unsigned char)fg->pattern[i]);		\
225	  fg->qsBc[(unsigned char)c] = fg->len - i;			\
226	  DPRINT(("BC shift for char %c is %zu\n", c, fg->len - i));	\
227	}								\
228    }
229
230#define _FILL_QSBC_REVERSED						\
231  for (unsigned int i = 0; i <= UCHAR_MAX; i++)				\
232    fg->qsBc[i] = firstdot + 1;						\
233  for (int i = firstdot - 1; i >= 0; i--)				\
234    {									\
235      fg->qsBc[(unsigned char)fg->pattern[i]] = i + 1;			\
236      DPRINT(("Reverse BC shift for char %c is %d\n", fg->pattern[i],	\
237	     i + 1));							\
238      if (fg->icase)							\
239        {								\
240          char c = islower((unsigned char)fg->pattern[i]) ?		\
241		   toupper((unsigned char)fg->pattern[i]) :		\
242		   tolower((unsigned char)fg->pattern[i]);		\
243          fg->qsBc[(unsigned char)c] = i + 1;				\
244	  DPRINT(("Reverse BC shift for char %c is %d\n", c, i + 1));	\
245        }								\
246    }
247
248/*
249 * Fills in the bad character shifts into a hastable for wide strings.
250 * With wide characters it is not possible any more to use a normal
251 * array because there are too many characters and we could not
252 * provide enough memory. Fortunately, we only have to store distinct
253 * values for so many characters as the number of distinct characters
254 * in the pattern, so we can store them in a hashtable and store a
255 * default shift value for the rest.
256 */
257#define FILL_QSBC_WIDE							\
258  if (fg->reversed)							\
259    {									\
260      _FILL_QSBC_WIDE_REVERSED						\
261    }									\
262  else									\
263    {									\
264      _FILL_QSBC_WIDE							\
265    }
266
267#define _FILL_QSBC_WIDE							\
268  /* Adjust the shift based on location of the last dot ('.'). */	\
269  fg->defBc = fg->wlen - whasdot;					\
270									\
271  /* Preprocess pattern. */						\
272  fg->qsBc_table = hashtable_init(fg->wlen * (fg->icase ? 8 : 4),	\
273				  sizeof(tre_char_t), sizeof(int));	\
274  if (!fg->qsBc_table)							\
275    FAIL_COMP(REG_ESPACE);						\
276  for (unsigned int i = whasdot + 1; i < fg->wlen; i++)			\
277    {									\
278      int k = fg->wlen - i;						\
279      int r;								\
280									\
281      r = hashtable_put(fg->qsBc_table, &fg->wpattern[i], &k);		\
282      if ((r == HASH_FAIL) || (r == HASH_FULL))				\
283	FAIL_COMP(REG_ESPACE);						\
284      DPRINT(("BC shift for wide char " CHF " is %d\n", fg->wpattern[i],\
285	     k));							\
286      if (fg->icase)							\
287	{								\
288	  tre_char_t wc = iswlower(fg->wpattern[i]) ?			\
289	    towupper(fg->wpattern[i]) : towlower(fg->wpattern[i]);	\
290	  r = hashtable_put(fg->qsBc_table, &wc, &k);			\
291	  if ((r == HASH_FAIL) || (r == HASH_FULL))			\
292	    FAIL_COMP(REG_ESPACE);					\
293	  DPRINT(("BC shift for wide char " CHF " is %d\n", wc, k));	\
294	}								\
295    }
296
297#define _FILL_QSBC_WIDE_REVERSED					\
298  /* Adjust the shift based on location of the last dot ('.'). */	\
299  fg->defBc = (size_t)wfirstdot;					\
300									\
301  /* Preprocess pattern. */						\
302  fg->qsBc_table = hashtable_init(fg->wlen * (fg->icase ? 8 : 4),	\
303				  sizeof(tre_char_t), sizeof(int));	\
304  if (!fg->qsBc_table)							\
305    FAIL_COMP(REG_ESPACE);						\
306  for (int i = wfirstdot - 1; i >= 0; i--)				\
307    {									\
308      int k = i + 1;							\
309      int r;								\
310									\
311      r = hashtable_put(fg->qsBc_table, &fg->wpattern[i], &k);		\
312      if ((r == HASH_FAIL) || (r == HASH_FULL))				\
313	FAIL_COMP(REG_ESPACE);						\
314      DPRINT(("Reverse BC shift for wide char " CHF " is %d\n",		\
315	     fg->wpattern[i], k));					\
316      if (fg->icase)							\
317	{								\
318	  tre_char_t wc = iswlower(fg->wpattern[i]) ?			\
319	    towupper(fg->wpattern[i]) : towlower(fg->wpattern[i]);	\
320	  r = hashtable_put(fg->qsBc_table, &wc, &k);			\
321	  if ((r == HASH_FAIL) || (r == HASH_FULL))			\
322	    FAIL_COMP(REG_ESPACE);					\
323	  DPRINT(("Reverse BC shift for wide char " CHF " is %d\n", wc,	\
324		 k));							\
325	}								\
326    }
327
328#ifdef _GREP_DEBUG
329#define DPRINT_BMGS(len, fmt_str, sh)					\
330  for (unsigned int i = 0; i < len; i++)				\
331    DPRINT((fmt_str, i, sh[i]));
332#else
333#define DPRINT_BMGS(len, fmt_str, sh)					\
334  do { } while(/*CONSTCOND*/0)
335#endif
336
337/*
338 * Fills in the good suffix table for SB/MB strings.
339 */
340#define FILL_BMGS							\
341  if (!fg->hasdot)							\
342    {									\
343      fg->sbmGs = xmalloc(fg->len * sizeof(int));			\
344      if (!fg->sbmGs)							\
345	return REG_ESPACE;						\
346      if (fg->len == 1)							\
347	fg->sbmGs[0] = 1;						\
348      else								\
349	_FILL_BMGS(fg->sbmGs, fg->pattern, fg->len, false);		\
350      DPRINT_BMGS(fg->len, "GS shift for pos %d is %d\n", fg->sbmGs);	\
351    }
352
353/*
354 * Fills in the good suffix table for wide strings.
355 */
356#define FILL_BMGS_WIDE							\
357  if (!fg->hasdot)							\
358    {									\
359      fg->bmGs = xmalloc(fg->wlen * sizeof(int));			\
360      if (!fg->bmGs)							\
361	return REG_ESPACE;						\
362      if (fg->wlen == 1)						\
363	fg->bmGs[0] = 1;						\
364      else								\
365	_FILL_BMGS(fg->bmGs, fg->wpattern, fg->wlen, true);		\
366      DPRINT_BMGS(fg->wlen, "GS shift (wide) for pos %d is %d\n",	\
367		  fg->bmGs);						\
368    }
369
370#define _FILL_BMGS(arr, pat, plen, wide)				\
371  {									\
372    char *p;								\
373    tre_char_t *wp;							\
374									\
375    if (wide)								\
376      {									\
377	if (fg->icase)							\
378	  {								\
379	    wp = xmalloc(plen * sizeof(tre_char_t));			\
380	    if (wp == NULL)						\
381	      return REG_ESPACE;					\
382	    for (unsigned int i = 0; i < plen; i++)			\
383	      wp[i] = towlower(pat[i]);					\
384	    _CALC_BMGS(arr, wp, plen);					\
385	    xfree(wp);							\
386	  }								\
387	else								\
388	  _CALC_BMGS(arr, pat, plen);					\
389      }									\
390    else								\
391      {									\
392	if (fg->icase)							\
393	  {								\
394	    p = xmalloc(plen);						\
395	    if (p == NULL)						\
396	      return REG_ESPACE;					\
397	    for (unsigned int i = 0; i < plen; i++)			\
398	      p[i] = tolower((unsigned char)pat[i]);                    \
399	    _CALC_BMGS(arr, p, plen);					\
400	    xfree(p);							\
401	  }								\
402	else								\
403	  _CALC_BMGS(arr, pat, plen);					\
404      }									\
405  }
406
407#define _CALC_BMGS(arr, pat, plen)					\
408  {									\
409    int f = 0, g;							\
410									\
411    int *suff = xmalloc(plen * sizeof(int));				\
412    if (suff == NULL)							\
413      return REG_ESPACE;						\
414									\
415    suff[plen - 1] = plen;						\
416    g = plen - 1;							\
417    for (int i = plen - 2; i >= 0; i--)					\
418      {									\
419	if (i > g && suff[i + plen - 1 - f] < i - g)			\
420	  suff[i] = suff[i + plen - 1 - f];				\
421	else								\
422	  {								\
423	    if (i < g)							\
424	      g = i;							\
425	    f = i;							\
426	    while (g >= 0 && pat[g] == pat[g + plen - 1 - f])		\
427	      g--;							\
428	    suff[i] = f - g;						\
429	  }								\
430      }									\
431									\
432    for (unsigned int i = 0; i < plen; i++)				\
433      arr[i] = plen;							\
434    g = 0;								\
435    for (int i = plen - 1; i >= 0; i--)					\
436      if (suff[i] == i + 1)						\
437	for(; (unsigned long)g < plen - 1 - i; g++)			\
438	  if (arr[g] == plen)						\
439	    arr[g] = plen - 1 - i;					\
440    for (unsigned int i = 0; i <= plen - 2; i++)			\
441      arr[plen - 1 - suff[i]] = plen - 1 - i;				\
442									\
443    xfree(suff);							\
444  }
445
446/*
447 * Copies the pattern pat having length n to p and stores
448 * the size in l.
449 */
450#define SAVE_PATTERN(src, srclen, dst, dstlen)				\
451  dstlen = srclen;							\
452  dst = xmalloc((dstlen + 1) * sizeof(tre_char_t));			\
453  if (dst == NULL)							\
454    return REG_ESPACE;							\
455  if (dstlen > 0)							\
456    memcpy(dst, src, dstlen * sizeof(tre_char_t));			\
457  dst[dstlen] = TRE_CHAR('\0');
458
459/*
460 * Initializes pattern compiling.
461 */
462#define INIT_COMP							\
463  /* Initialize. */							\
464  memset(fg, 0, sizeof(*fg));						\
465  fg->icase = (cflags & REG_ICASE);					\
466  fg->word = (cflags & REG_WORD);					\
467  fg->newline = (cflags & REG_NEWLINE);					\
468  fg->nosub = (cflags & REG_NOSUB);					\
469									\
470  /* Cannot handle REG_ICASE with MB string */				\
471  if (fg->icase && (TRE_MB_CUR_MAX > 1) && n > 0)			\
472    {									\
473      DPRINT(("Cannot use fast matcher for MBS with REG_ICASE\n"));	\
474      return REG_BADPAT;						\
475    }
476
477/*
478 * Checks whether we have a 0-length pattern that will match
479 * anything. If literal is set to false, the EOL anchor is also
480 * taken into account.
481 */
482#define CHECK_MATCHALL(literal)						\
483  if (!literal && n == 1 && pat[0] == TRE_CHAR('$'))			\
484    {									\
485      n--;								\
486      fg->eol = true;							\
487    }									\
488									\
489  if (n == 0)								\
490    {									\
491      fg->matchall = true;						\
492      fg->pattern = xmalloc(sizeof(char));				\
493      if (!fg->pattern)							\
494	FAIL_COMP(REG_ESPACE);						\
495      fg->pattern[0] = '\0';						\
496      fg->wpattern = xmalloc(sizeof(tre_char_t));			\
497      if (!fg->wpattern)						\
498	FAIL_COMP(REG_ESPACE);						\
499      fg->wpattern[0] = TRE_CHAR('\0');					\
500      DPRINT(("Matching every input\n"));				\
501      return REG_OK;							\
502    }
503
504/*
505 * Returns: REG_OK on success, error code otherwise
506 */
507int
508tre_compile_literal(fastmatch_t *fg, const tre_char_t *pat, size_t n,
509		    int cflags)
510{
511  size_t hasdot = 0, whasdot = 0;
512  ssize_t firstdot = -1, wfirstdot = -1;
513
514  INIT_COMP;
515
516  CHECK_MATCHALL(true);
517
518  /* Cannot handle word boundaries with MB string */
519  if (fg->word && (TRE_MB_CUR_MAX > 1))
520    return REG_BADPAT;
521
522#ifdef TRE_WCHAR
523  SAVE_PATTERN(pat, n, fg->wpattern, fg->wlen);
524  STORE_MBS_PAT;
525#else
526  SAVE_PATTERN(pat, n, fg->pattern, fg->len);
527#endif
528
529  DPRINT(("tre_compile_literal: pattern: %s, len %zu, icase: %c, word: %c, "
530	 "newline %c\n", fg->pattern, fg->len, fg->icase ? 'y' : 'n',
531	 fg->word ? 'y' : 'n', fg->newline ? 'y' : 'n'));
532
533  FILL_QSBC;
534  FILL_BMGS;
535#ifdef TRE_WCHAR
536  FILL_QSBC_WIDE;
537  FILL_BMGS_WIDE;
538#endif
539
540  return REG_OK;
541}
542
543/*
544 * Returns: REG_OK on success, error code otherwise
545 */
546int
547tre_compile_fast(fastmatch_t *fg, const tre_char_t *pat, size_t n,
548		 int cflags)
549{
550  tre_char_t *tmp;
551  size_t pos = 0, hasdot = 0, whasdot = 0;
552  ssize_t firstdot = -1, wfirstdot = -1;
553  bool escaped = false;
554  bool *_escmap = NULL;
555
556  INIT_COMP;
557
558  /* Remove beginning-of-line character ('^'). */
559  if (pat[0] == TRE_CHAR('^'))
560    {
561      fg->bol = true;
562      n--;
563      pat++;
564    }
565
566  CHECK_MATCHALL(false);
567
568  /* Handle word-boundary matching when GNU extensions are enabled */
569  if ((cflags & REG_GNU) && (n >= 14) &&
570      (memcmp(pat, TRE_CHAR("[[:<:]]"), 7 * sizeof(tre_char_t)) == 0) &&
571      (memcmp(pat + n - 7, TRE_CHAR("[[:>:]]"),
572	      7 * sizeof(tre_char_t)) == 0))
573    {
574      n -= 14;
575      pat += 7;
576      fg->word = true;
577    }
578
579  /* Cannot handle word boundaries with MB string */
580  if (fg->word && (TRE_MB_CUR_MAX > 1))
581    return REG_BADPAT;
582
583  tmp = xmalloc((n + 1) * sizeof(tre_char_t));
584  if (tmp == NULL)
585    return REG_ESPACE;
586
587/* Copies the char into the stored pattern and skips to the next char. */
588#define STORE_CHAR							\
589  do									\
590    {									\
591      tmp[pos++] = pat[i];						\
592      escaped = false;							\
593      continue;								\
594    } while (0)
595
596  /* Traverse the input pattern for processing */
597  for (unsigned int i = 0; i < n; i++)
598    {
599      switch (pat[i])
600	{
601	  case TRE_CHAR('\\'):
602	    if (escaped)
603	      STORE_CHAR;
604	    else if (i == n - 1)
605	      goto badpat;
606	    else
607	      escaped = true;
608	    continue;
609	  case TRE_CHAR('['):
610	    if (escaped)
611	      STORE_CHAR;
612	    else
613	      goto badpat;
614	    continue;
615	  case TRE_CHAR('*'):
616	    if (escaped || (!(cflags & REG_EXTENDED) && (i == 0)))
617	      STORE_CHAR;
618	    else
619	      goto badpat;
620	    continue;
621	  case TRE_CHAR('+'):
622	  case TRE_CHAR('?'):
623	    if ((cflags & REG_EXTENDED) && (i == 0))
624	      continue;
625	    else if ((cflags & REG_EXTENDED) ^ !escaped)
626	      STORE_CHAR;
627	    else
628	      goto badpat;
629	    continue;
630	  case TRE_CHAR('.'):
631	    if (escaped)
632	      {
633		if (!_escmap)
634		  _escmap = xmalloc(n * sizeof(bool));
635		if (!_escmap)
636		  {
637		    xfree(tmp);
638		    return REG_ESPACE;
639		  }
640		_escmap[i] = true;
641		STORE_CHAR;
642	      }
643	    else
644	      {
645		whasdot = i;
646		if (wfirstdot == -1)
647			wfirstdot = i;
648		STORE_CHAR;
649	      }
650	    continue;
651	  case TRE_CHAR('^'):
652	    STORE_CHAR;
653	    continue;
654	  case TRE_CHAR('$'):
655	    if (!escaped && (i == n - 1))
656	      fg->eol = true;
657	    else
658	      STORE_CHAR;
659	    continue;
660	  case TRE_CHAR('('):
661	    if ((cflags & REG_EXTENDED) ^ escaped)
662	      goto badpat;
663	    else
664	      STORE_CHAR;
665	    continue;
666	  case TRE_CHAR('{'):
667	    if (!(cflags & REG_EXTENDED) ^ escaped)
668	      STORE_CHAR;
669	    else if (!(cflags & REG_EXTENDED) && (i == 0))
670	      STORE_CHAR;
671	    else if ((cflags & REG_EXTENDED) && (i == 0))
672	      continue;
673	    else
674	      goto badpat;
675	    continue;
676	  case TRE_CHAR('|'):
677	    if ((cflags & REG_EXTENDED) ^ escaped)
678	      goto badpat;
679	    else
680	      STORE_CHAR;
681	    continue;
682	  default:
683	    if (escaped)
684	      goto badpat;
685	    else
686	      STORE_CHAR;
687	    continue;
688	}
689      continue;
690badpat:
691      xfree(tmp);
692      DPRINT(("tre_compile_fast: compilation of pattern failed, falling"
693	      "back to NFA\n"));
694      return REG_BADPAT;
695    }
696
697  fg->hasdot = wfirstdot > -1;
698
699  /*
700   * The pattern has been processed and copied to tmp as a literal string
701   * with escapes, anchors (^$) and the word boundary match character
702   * classes stripped out.
703   */
704#ifdef TRE_WCHAR
705  SAVE_PATTERN(tmp, pos, fg->wpattern, fg->wlen);
706  fg->wescmap = _escmap;
707  STORE_MBS_PAT;
708
709  /*
710   * The position of dots and escaped dots is different in the MB string
711   * than in to the wide string so traverse the converted string, as well,
712   * to store these positions.
713   */
714  if (fg->hasdot || (fg->wescmap != NULL))
715    {
716      if (fg->wescmap != NULL)
717	{
718	  fg->escmap = xmalloc(fg->len * sizeof(bool));
719	  if (!fg->escmap)
720	    {
721	      tre_free_fast(fg);
722	      return REG_ESPACE;
723	    }
724	}
725
726      escaped = false;
727      for (unsigned int i = 0; i < fg->len; i++)
728	if (fg->pattern[i] == '\\')
729	  escaped = !escaped;
730	else if (fg->pattern[i] == '.' && escaped)
731	  {
732	    fg->escmap[i] = true;
733	    escaped = false;
734	  }
735	else if (fg->pattern[i] == '.' && !escaped)
736	  {
737	    hasdot = i;
738	    if (firstdot == -1)
739	      firstdot = i;
740	  }
741	else
742	  escaped = false;
743    }
744#else
745  SAVE_PATTERN(tmp, pos, fg->pattern, fg->len);
746  fg->escmap = _escmap;
747#endif
748
749  xfree(tmp);
750
751  DPRINT(("tre_compile_fast: pattern: %s, len %zu, bol %c, eol %c, "
752	 "icase: %c, word: %c, newline %c\n", fg->pattern, fg->len,
753	 fg->bol ? 'y' : 'n', fg->eol ? 'y' : 'n',
754	 fg->icase ? 'y' : 'n', fg->word ? 'y' : 'n',
755	 fg->newline ? 'y' : 'n'));
756
757  /* Check whether reverse QS algorithm is more efficient */
758  if ((wfirstdot > -1) && (fg->wlen - whasdot + 1 < (size_t)wfirstdot) &&
759      fg->nosub)
760    {
761      fg->reversed = true;
762      DPRINT(("tre_compile_fast: using reverse QS algorithm\n"));
763    }
764
765  FILL_QSBC;
766  FILL_BMGS;
767#ifdef TRE_WCHAR
768  FILL_QSBC_WIDE;
769  FILL_BMGS_WIDE;
770#endif
771
772  return REG_OK;
773}
774
775#define _SHIFT_ONE							\
776  {									\
777    shift = 1;								\
778    j = !fg->reversed ? j + shift : j - shift;				\
779    continue;								\
780  }
781
782#define _BBOUND_COND							\
783  ((type == STR_WIDE) ?							\
784    ((j == 0) || !(tre_isalnum(str_wide[j - 1]) ||			\
785      (str_wide[j - 1] == TRE_CHAR('_')))) :				\
786    ((j == 0) || !(tre_isalnum(str_byte[j - 1]) ||			\
787      (str_byte[j - 1] == '_'))))
788
789#define _EBOUND_COND							\
790  ((type == STR_WIDE) ?							\
791    ((j + fg->wlen == len) || !(tre_isalnum(str_wide[j + fg->wlen]) ||	\
792      (str_wide[j + fg->wlen] == TRE_CHAR('_')))) :			\
793    ((j + fg->len == len) || !(tre_isalnum(str_byte[j + fg->len]) ||	\
794      (str_byte[j + fg->len] == '_'))))
795
796/*
797 * Condition to check whether the match on position j is on a
798 * word boundary.
799 */
800#define IS_ON_WORD_BOUNDARY						\
801  (_BBOUND_COND && _EBOUND_COND)
802
803/*
804 * Checks word boundary and shifts one if match is not on a
805 * boundary.
806 */
807#define CHECK_WORD_BOUNDARY						\
808    if (!IS_ON_WORD_BOUNDARY)						\
809      _SHIFT_ONE;
810
811#define _BOL_COND							\
812  ((j == 0) || ((type == STR_WIDE) ? (str_wide[j - 1] == TRE_CHAR('\n'))\
813				   : (str_byte[j - 1] == '\n')))
814
815/*
816 * Checks BOL anchor and shifts one if match is not on a
817 * boundary.
818 */
819#define CHECK_BOL_ANCHOR						\
820    if (!_BOL_COND)							\
821      _SHIFT_ONE;
822
823#define _EOL_COND							\
824  ((type == STR_WIDE)							\
825    ? ((j + fg->wlen == len) ||						\
826		(str_wide[j + fg->wlen] == TRE_CHAR('\n')))		\
827    : ((j + fg->len == len) || (str_byte[j + fg->wlen] == '\n')))
828
829/*
830 * Checks EOL anchor and shifts one if match is not on a
831 * boundary.
832 */
833#define CHECK_EOL_ANCHOR						\
834    if (!_EOL_COND)							\
835      _SHIFT_ONE;
836
837/*
838 * Executes matching of the precompiled pattern on the input string.
839 * Returns REG_OK or REG_NOMATCH depending on if we find a match or not.
840 */
841int
842tre_match_fast(const fastmatch_t *fg, const void *data, size_t len,
843    tre_str_type_t type, int nmatch, regmatch_t pmatch[], int eflags)
844{
845  unsigned int shift, u = 0, v = 0;
846  ssize_t j = 0;
847  int ret = REG_NOMATCH;
848  int mismatch;
849  const char *str_byte = data;
850  const void *startptr = NULL;
851  const tre_char_t *str_wide = data;
852
853  /* Calculate length if unspecified. */
854  if (len == (size_t)-1)
855    switch (type)
856      {
857	case STR_WIDE:
858	  len = tre_strlen(str_wide);
859	  break;
860	default:
861	  len = strlen(str_byte);
862	  break;
863      }
864
865  /* Shortcut for empty pattern */
866  if (fg->matchall)
867    {
868      if (!fg->nosub && nmatch >= 1)
869	{
870	  pmatch[0].rm_so = 0;
871	  pmatch[0].rm_eo = len;
872	}
873      if (fg->bol && fg->eol)
874	return (len == 0) ? REG_OK : REG_NOMATCH;
875      else
876	return REG_OK;
877    }
878
879  /* No point in going farther if we do not have enough data. */
880  switch (type)
881    {
882      case STR_WIDE:
883	if (len < fg->wlen)
884	  return ret;
885	shift = fg->wlen;
886	break;
887      default:
888	if (len < fg->len)
889	  return ret;
890	shift = fg->len;
891    }
892
893  /*
894   * REG_NOTBOL means not anchoring ^ to the beginning of the line, so we
895   * can shift one because there can't be a match at the beginning.
896   */
897  if (fg->bol && (eflags & REG_NOTBOL))
898    j = 1;
899
900  /*
901   * Like above, we cannot have a match at the very end when anchoring to
902   * the end and REG_NOTEOL is specified.
903   */
904  if (fg->eol && (eflags & REG_NOTEOL))
905    len--;
906
907  if (fg->reversed)
908    j = len - (type == STR_WIDE ? fg->wlen : fg->len);
909
910
911  /* Only try once at the beginning or ending of the line. */
912  if ((fg->bol || fg->eol) && !fg->newline && !(eflags & REG_NOTBOL) &&
913      !(eflags & REG_NOTEOL))
914    {
915      /* Simple text comparison. */
916      if (!((fg->bol && fg->eol) &&
917	  (type == STR_WIDE ? (len != fg->wlen) : (len != fg->len))))
918	{
919	  /* Determine where in data to start search at. */
920	  j = fg->eol ? len - (type == STR_WIDE ? fg->wlen : fg->len) : 0;
921	  SKIP_CHARS(j);
922	  mismatch = fastcmp(fg, startptr, type);
923	  if (mismatch == REG_OK)
924	    {
925	      if (fg->word && !IS_ON_WORD_BOUNDARY)
926		return ret;
927	      if (!fg->nosub && nmatch >= 1)
928		{
929		  pmatch[0].rm_so = j;
930		  pmatch[0].rm_eo = j + (type == STR_WIDE ? fg->wlen : fg->len);
931		}
932	      return REG_OK;
933            }
934        }
935    }
936  else
937    {
938      /* Quick Search / Turbo Boyer-Moore algorithm. */
939      do
940	{
941	  SKIP_CHARS(j);
942	  mismatch = fastcmp(fg, startptr, type);
943	  if (mismatch == REG_OK)
944	    {
945	      if (fg->word)
946		CHECK_WORD_BOUNDARY;
947	      if (fg->bol)
948		CHECK_BOL_ANCHOR;
949	      if (fg->eol)
950		CHECK_EOL_ANCHOR;
951	      if (!fg->nosub && nmatch >= 1)
952		{
953		  pmatch[0].rm_so = j;
954		  pmatch[0].rm_eo = j + ((type == STR_WIDE) ? fg->wlen : fg->len);
955		}
956	      return REG_OK;
957	    }
958	  else if (mismatch > 0)
959	    return mismatch;
960	  mismatch = -mismatch - 1;
961	  SHIFT;
962        } while (!IS_OUT_OF_BOUNDS);
963    }
964    return ret;
965}
966
967/*
968 * Frees the resources that were allocated when the pattern was compiled.
969 */
970void
971tre_free_fast(fastmatch_t *fg)
972{
973
974  DPRINT(("tre_fast_free: freeing structures for pattern %s\n",
975	 fg->pattern));
976
977#ifdef TRE_WCHAR
978  hashtable_free(fg->qsBc_table);
979  if (!fg->hasdot)
980    xfree(fg->bmGs);
981  if (fg->wescmap)
982    xfree(fg->wescmap);
983  xfree(fg->wpattern);
984#endif
985  if (!fg->hasdot)
986    xfree(fg->sbmGs);
987  if (fg->escmap)
988    xfree(fg->escmap);
989  xfree(fg->pattern);
990}
991
992/*
993 * Returns:	-(i + 1) on failure (position that it failed with minus sign)
994 *		error code on error
995 *		REG_OK on success
996 */
997static inline int
998fastcmp(const fastmatch_t *fg, const void *data, tre_str_type_t type)
999{
1000  const char *str_byte = data;
1001  const char *pat_byte = fg->pattern;
1002  const tre_char_t *str_wide = data;
1003  const tre_char_t *pat_wide = fg->wpattern;
1004  const bool *escmap = (type == STR_WIDE) ? fg->wescmap : fg->escmap;
1005  size_t len = (type == STR_WIDE) ? fg->wlen : fg->len;
1006  int ret = REG_OK;
1007
1008  /* Compare the pattern and the input char-by-char from the last position. */
1009  for (int i = len - 1; i >= 0; i--) {
1010    switch (type)
1011      {
1012	case STR_WIDE:
1013
1014	  /* Check dot */
1015	  if (fg->hasdot && pat_wide[i] == TRE_CHAR('.') &&
1016	      (!escmap || !escmap[i]) &&
1017	      (!fg->newline || (str_wide[i] != TRE_CHAR('\n'))))
1018	    continue;
1019
1020	  /* Compare */
1021	  if (fg->icase ? (towlower(pat_wide[i]) == towlower(str_wide[i]))
1022		    : (pat_wide[i] == str_wide[i]))
1023	    continue;
1024	  break;
1025	default:
1026	  /* Check dot */
1027	  if (fg->hasdot && pat_byte[i] == '.' &&
1028	      (!escmap || !escmap[i]) &&
1029	      (!fg->newline || (str_byte[i] != '\n')))
1030	    continue;
1031
1032	  /* Compare */
1033	  if (fg->icase ? (tolower((unsigned char)pat_byte[i]) == tolower((unsigned char)str_byte[i]))
1034		    : (pat_byte[i] == str_byte[i]))
1035	  continue;
1036      }
1037    DPRINT(("fastcmp: mismatch at position %d\n", i));
1038    ret = -(i + 1);
1039    break;
1040  }
1041  return ret;
1042}
1043