1/*
2  regexec.c - TRE POSIX compatible matching functions (and more).
3
4  Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi>
5  All rights reserved.
6
7  Redistribution and use in source and binary forms, with or without
8  modification, are permitted provided that the following conditions
9  are met:
10
11    1. Redistributions of source code must retain the above copyright
12       notice, this list of conditions and the following disclaimer.
13
14    2. Redistributions in binary form must reproduce the above copyright
15       notice, this list of conditions and the following disclaimer in the
16       documentation and/or other materials provided with the distribution.
17
18  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
19  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
22  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30*/
31
32#include <stdlib.h>
33#include <string.h>
34#include <wchar.h>
35#include <wctype.h>
36#include <limits.h>
37#include <stdint.h>
38
39#include <regex.h>
40
41#include "tre.h"
42
43#include <assert.h>
44
45static void
46tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
47		const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo);
48
49/***********************************************************************
50 from tre-match-utils.h
51***********************************************************************/
52
53#define GET_NEXT_WCHAR() do {                                                 \
54    prev_c = next_c; pos += pos_add_next;                                     \
55    if ((pos_add_next = mbtowc(&next_c, str_byte, MB_LEN_MAX)) <= 0) {        \
56        if (pos_add_next < 0) { ret = REG_NOMATCH; goto error_exit; }         \
57        else pos_add_next++;                                                  \
58    }                                                                         \
59    str_byte += pos_add_next;                                                 \
60  } while (0)
61
62#define IS_WORD_CHAR(c)	 ((c) == L'_' || tre_isalnum(c))
63
64#define CHECK_ASSERTIONS(assertions)					      \
65  (((assertions & ASSERT_AT_BOL)					      \
66    && (pos > 0 || reg_notbol)						      \
67    && (prev_c != L'\n' || !reg_newline))				      \
68   || ((assertions & ASSERT_AT_EOL)					      \
69       && (next_c != L'\0' || reg_noteol)				      \
70       && (next_c != L'\n' || !reg_newline))				      \
71   || ((assertions & ASSERT_AT_BOW)					      \
72       && (IS_WORD_CHAR(prev_c) || !IS_WORD_CHAR(next_c)))	              \
73   || ((assertions & ASSERT_AT_EOW)					      \
74       && (!IS_WORD_CHAR(prev_c) || IS_WORD_CHAR(next_c)))		      \
75   || ((assertions & ASSERT_AT_WB)					      \
76       && (pos != 0 && next_c != L'\0'					      \
77	   && IS_WORD_CHAR(prev_c) == IS_WORD_CHAR(next_c)))		      \
78   || ((assertions & ASSERT_AT_WB_NEG)					      \
79       && (pos == 0 || next_c == L'\0'					      \
80	   || IS_WORD_CHAR(prev_c) != IS_WORD_CHAR(next_c))))
81
82#define CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)                             \
83  (((trans_i->assertions & ASSERT_CHAR_CLASS)                                 \
84       && !(tnfa->cflags & REG_ICASE)                                         \
85       && !tre_isctype((tre_cint_t)prev_c, trans_i->u.class))                 \
86    || ((trans_i->assertions & ASSERT_CHAR_CLASS)                             \
87        && (tnfa->cflags & REG_ICASE)                                         \
88        && !tre_isctype(tre_tolower((tre_cint_t)prev_c),trans_i->u.class)     \
89	&& !tre_isctype(tre_toupper((tre_cint_t)prev_c),trans_i->u.class))    \
90    || ((trans_i->assertions & ASSERT_CHAR_CLASS_NEG)                         \
91        && tre_neg_char_classes_match(trans_i->neg_classes,(tre_cint_t)prev_c,\
92                                      tnfa->cflags & REG_ICASE)))
93
94
95
96
97/* Returns 1 if `t1' wins `t2', 0 otherwise. */
98static int
99tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions,
100	      regoff_t *t1, regoff_t *t2)
101{
102  int i;
103  for (i = 0; i < num_tags; i++)
104    {
105      if (tag_directions[i] == TRE_TAG_MINIMIZE)
106	{
107	  if (t1[i] < t2[i])
108	    return 1;
109	  if (t1[i] > t2[i])
110	    return 0;
111	}
112      else
113	{
114	  if (t1[i] > t2[i])
115	    return 1;
116	  if (t1[i] < t2[i])
117	    return 0;
118	}
119    }
120  /*  assert(0);*/
121  return 0;
122}
123
124static int
125tre_neg_char_classes_match(tre_ctype_t *classes, tre_cint_t wc, int icase)
126{
127  while (*classes != (tre_ctype_t)0)
128    if ((!icase && tre_isctype(wc, *classes))
129	|| (icase && (tre_isctype(tre_toupper(wc), *classes)
130		      || tre_isctype(tre_tolower(wc), *classes))))
131      return 1; /* Match. */
132    else
133      classes++;
134  return 0; /* No match. */
135}
136
137
138/***********************************************************************
139 from tre-match-parallel.c
140***********************************************************************/
141
142/*
143  This algorithm searches for matches basically by reading characters
144  in the searched string one by one, starting at the beginning.	 All
145  matching paths in the TNFA are traversed in parallel.	 When two or
146  more paths reach the same state, exactly one is chosen according to
147  tag ordering rules; if returning submatches is not required it does
148  not matter which path is chosen.
149
150  The worst case time required for finding the leftmost and longest
151  match, or determining that there is no match, is always linearly
152  dependent on the length of the text being searched.
153
154  This algorithm cannot handle TNFAs with back referencing nodes.
155  See `tre-match-backtrack.c'.
156*/
157
158typedef struct {
159  tre_tnfa_transition_t *state;
160  regoff_t *tags;
161} tre_tnfa_reach_t;
162
163typedef struct {
164  regoff_t pos;
165  regoff_t **tags;
166} tre_reach_pos_t;
167
168
169static reg_errcode_t
170tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string,
171		      regoff_t *match_tags, int eflags,
172		      regoff_t *match_end_ofs)
173{
174  /* State variables required by GET_NEXT_WCHAR. */
175  tre_char_t prev_c = 0, next_c = 0;
176  const char *str_byte = string;
177  regoff_t pos = -1;
178  regoff_t pos_add_next = 1;
179#ifdef TRE_MBSTATE
180  mbstate_t mbstate;
181#endif /* TRE_MBSTATE */
182  int reg_notbol = eflags & REG_NOTBOL;
183  int reg_noteol = eflags & REG_NOTEOL;
184  int reg_newline = tnfa->cflags & REG_NEWLINE;
185  reg_errcode_t ret;
186
187  char *buf;
188  tre_tnfa_transition_t *trans_i;
189  tre_tnfa_reach_t *reach, *reach_next, *reach_i, *reach_next_i;
190  tre_reach_pos_t *reach_pos;
191  int *tag_i;
192  int num_tags, i;
193
194  regoff_t match_eo = -1;	   /* end offset of match (-1 if no match found yet) */
195  int new_match = 0;
196  regoff_t *tmp_tags = NULL;
197  regoff_t *tmp_iptr;
198
199#ifdef TRE_MBSTATE
200  memset(&mbstate, '\0', sizeof(mbstate));
201#endif /* TRE_MBSTATE */
202
203  if (!match_tags)
204    num_tags = 0;
205  else
206    num_tags = tnfa->num_tags;
207
208  /* Allocate memory for temporary data required for matching.	This needs to
209     be done for every matching operation to be thread safe.  This allocates
210     everything in a single large block with calloc(). */
211  {
212    size_t tbytes, rbytes, pbytes, xbytes, total_bytes;
213    char *tmp_buf;
214
215    /* Ensure that tbytes and xbytes*num_states cannot overflow, and that
216     * they don't contribute more than 1/8 of SIZE_MAX to total_bytes. */
217    if (num_tags > SIZE_MAX/(8 * sizeof(regoff_t) * tnfa->num_states))
218      goto error_exit;
219
220    /* Likewise check rbytes. */
221    if (tnfa->num_states+1 > SIZE_MAX/(8 * sizeof(*reach_next)))
222      goto error_exit;
223
224    /* Likewise check pbytes. */
225    if (tnfa->num_states > SIZE_MAX/(8 * sizeof(*reach_pos)))
226      goto error_exit;
227
228    /* Compute the length of the block we need. */
229    tbytes = sizeof(*tmp_tags) * num_tags;
230    rbytes = sizeof(*reach_next) * (tnfa->num_states + 1);
231    pbytes = sizeof(*reach_pos) * tnfa->num_states;
232    xbytes = sizeof(regoff_t) * num_tags;
233    total_bytes =
234      (sizeof(long) - 1) * 4 /* for alignment paddings */
235      + (rbytes + xbytes * tnfa->num_states) * 2 + tbytes + pbytes;
236
237    /* Allocate the memory. */
238    buf = calloc(total_bytes, 1);
239    if (buf == NULL)
240      return REG_ESPACE;
241
242    /* Get the various pointers within tmp_buf (properly aligned). */
243    tmp_tags = (void *)buf;
244    tmp_buf = buf + tbytes;
245    tmp_buf += ALIGN(tmp_buf, long);
246    reach_next = (void *)tmp_buf;
247    tmp_buf += rbytes;
248    tmp_buf += ALIGN(tmp_buf, long);
249    reach = (void *)tmp_buf;
250    tmp_buf += rbytes;
251    tmp_buf += ALIGN(tmp_buf, long);
252    reach_pos = (void *)tmp_buf;
253    tmp_buf += pbytes;
254    tmp_buf += ALIGN(tmp_buf, long);
255    for (i = 0; i < tnfa->num_states; i++)
256      {
257	reach[i].tags = (void *)tmp_buf;
258	tmp_buf += xbytes;
259	reach_next[i].tags = (void *)tmp_buf;
260	tmp_buf += xbytes;
261      }
262  }
263
264  for (i = 0; i < tnfa->num_states; i++)
265    reach_pos[i].pos = -1;
266
267  GET_NEXT_WCHAR();
268  pos = 0;
269
270  reach_next_i = reach_next;
271  while (1)
272    {
273      /* If no match found yet, add the initial states to `reach_next'. */
274      if (match_eo < 0)
275	{
276	  trans_i = tnfa->initial;
277	  while (trans_i->state != NULL)
278	    {
279	      if (reach_pos[trans_i->state_id].pos < pos)
280		{
281		  if (trans_i->assertions
282		      && CHECK_ASSERTIONS(trans_i->assertions))
283		    {
284		      trans_i++;
285		      continue;
286		    }
287
288		  reach_next_i->state = trans_i->state;
289		  for (i = 0; i < num_tags; i++)
290		    reach_next_i->tags[i] = -1;
291		  tag_i = trans_i->tags;
292		  if (tag_i)
293		    while (*tag_i >= 0)
294		      {
295			if (*tag_i < num_tags)
296			  reach_next_i->tags[*tag_i] = pos;
297			tag_i++;
298		      }
299		  if (reach_next_i->state == tnfa->final)
300		    {
301		      match_eo = pos;
302		      new_match = 1;
303		      for (i = 0; i < num_tags; i++)
304			match_tags[i] = reach_next_i->tags[i];
305		    }
306		  reach_pos[trans_i->state_id].pos = pos;
307		  reach_pos[trans_i->state_id].tags = &reach_next_i->tags;
308		  reach_next_i++;
309		}
310	      trans_i++;
311	    }
312	  reach_next_i->state = NULL;
313	}
314      else
315	{
316	  if (num_tags == 0 || reach_next_i == reach_next)
317	    /* We have found a match. */
318	    break;
319	}
320
321      /* Check for end of string. */
322      if (!next_c) break;
323
324      GET_NEXT_WCHAR();
325
326      /* Swap `reach' and `reach_next'. */
327      reach_i = reach;
328      reach = reach_next;
329      reach_next = reach_i;
330
331      /* For each state in `reach', weed out states that don't fulfill the
332	 minimal matching conditions. */
333      if (tnfa->num_minimals && new_match)
334	{
335	  new_match = 0;
336	  reach_next_i = reach_next;
337	  for (reach_i = reach; reach_i->state; reach_i++)
338	    {
339	      int skip = 0;
340	      for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2)
341		{
342		  int end = tnfa->minimal_tags[i];
343		  int start = tnfa->minimal_tags[i + 1];
344		  if (end >= num_tags)
345		    {
346		      skip = 1;
347		      break;
348		    }
349		  else if (reach_i->tags[start] == match_tags[start]
350			   && reach_i->tags[end] < match_tags[end])
351		    {
352		      skip = 1;
353		      break;
354		    }
355		}
356	      if (!skip)
357		{
358		  reach_next_i->state = reach_i->state;
359		  tmp_iptr = reach_next_i->tags;
360		  reach_next_i->tags = reach_i->tags;
361		  reach_i->tags = tmp_iptr;
362		  reach_next_i++;
363		}
364	    }
365	  reach_next_i->state = NULL;
366
367	  /* Swap `reach' and `reach_next'. */
368	  reach_i = reach;
369	  reach = reach_next;
370	  reach_next = reach_i;
371	}
372
373      /* For each state in `reach' see if there is a transition leaving with
374	 the current input symbol to a state not yet in `reach_next', and
375	 add the destination states to `reach_next'. */
376      reach_next_i = reach_next;
377      for (reach_i = reach; reach_i->state; reach_i++)
378	{
379	  for (trans_i = reach_i->state; trans_i->state; trans_i++)
380	    {
381	      /* Does this transition match the input symbol? */
382	      if (trans_i->code_min <= (tre_cint_t)prev_c &&
383		  trans_i->code_max >= (tre_cint_t)prev_c)
384		{
385		  if (trans_i->assertions
386		      && (CHECK_ASSERTIONS(trans_i->assertions)
387			  || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)))
388		    {
389		      continue;
390		    }
391
392		  /* Compute the tags after this transition. */
393		  for (i = 0; i < num_tags; i++)
394		    tmp_tags[i] = reach_i->tags[i];
395		  tag_i = trans_i->tags;
396		  if (tag_i != NULL)
397		    while (*tag_i >= 0)
398		      {
399			if (*tag_i < num_tags)
400			  tmp_tags[*tag_i] = pos;
401			tag_i++;
402		      }
403
404		  if (reach_pos[trans_i->state_id].pos < pos)
405		    {
406		      /* Found an unvisited node. */
407		      reach_next_i->state = trans_i->state;
408		      tmp_iptr = reach_next_i->tags;
409		      reach_next_i->tags = tmp_tags;
410		      tmp_tags = tmp_iptr;
411		      reach_pos[trans_i->state_id].pos = pos;
412		      reach_pos[trans_i->state_id].tags = &reach_next_i->tags;
413
414		      if (reach_next_i->state == tnfa->final
415			  && (match_eo == -1
416			      || (num_tags > 0
417				  && reach_next_i->tags[0] <= match_tags[0])))
418			{
419			  match_eo = pos;
420			  new_match = 1;
421			  for (i = 0; i < num_tags; i++)
422			    match_tags[i] = reach_next_i->tags[i];
423			}
424		      reach_next_i++;
425
426		    }
427		  else
428		    {
429		      assert(reach_pos[trans_i->state_id].pos == pos);
430		      /* Another path has also reached this state.  We choose
431			 the winner by examining the tag values for both
432			 paths. */
433		      if (tre_tag_order(num_tags, tnfa->tag_directions,
434					tmp_tags,
435					*reach_pos[trans_i->state_id].tags))
436			{
437			  /* The new path wins. */
438			  tmp_iptr = *reach_pos[trans_i->state_id].tags;
439			  *reach_pos[trans_i->state_id].tags = tmp_tags;
440			  if (trans_i->state == tnfa->final)
441			    {
442			      match_eo = pos;
443			      new_match = 1;
444			      for (i = 0; i < num_tags; i++)
445				match_tags[i] = tmp_tags[i];
446			    }
447			  tmp_tags = tmp_iptr;
448			}
449		    }
450		}
451	    }
452	}
453      reach_next_i->state = NULL;
454    }
455
456  *match_end_ofs = match_eo;
457  ret = match_eo >= 0 ? REG_OK : REG_NOMATCH;
458error_exit:
459  xfree(buf);
460  return ret;
461}
462
463
464
465/***********************************************************************
466 from tre-match-backtrack.c
467***********************************************************************/
468
469/*
470  This matcher is for regexps that use back referencing.  Regexp matching
471  with back referencing is an NP-complete problem on the number of back
472  references.  The easiest way to match them is to use a backtracking
473  routine which basically goes through all possible paths in the TNFA
474  and chooses the one which results in the best (leftmost and longest)
475  match.  This can be spectacularly expensive and may run out of stack
476  space, but there really is no better known generic algorithm.	 Quoting
477  Henry Spencer from comp.compilers:
478  <URL: http://compilers.iecc.com/comparch/article/93-03-102>
479
480    POSIX.2 REs require longest match, which is really exciting to
481    implement since the obsolete ("basic") variant also includes
482    \<digit>.  I haven't found a better way of tackling this than doing
483    a preliminary match using a DFA (or simulation) on a modified RE
484    that just replicates subREs for \<digit>, and then doing a
485    backtracking match to determine whether the subRE matches were
486    right.  This can be rather slow, but I console myself with the
487    thought that people who use \<digit> deserve very slow execution.
488    (Pun unintentional but very appropriate.)
489
490*/
491
492typedef struct {
493  regoff_t pos;
494  const char *str_byte;
495  tre_tnfa_transition_t *state;
496  int state_id;
497  int next_c;
498  regoff_t *tags;
499#ifdef TRE_MBSTATE
500  mbstate_t mbstate;
501#endif /* TRE_MBSTATE */
502} tre_backtrack_item_t;
503
504typedef struct tre_backtrack_struct {
505  tre_backtrack_item_t item;
506  struct tre_backtrack_struct *prev;
507  struct tre_backtrack_struct *next;
508} *tre_backtrack_t;
509
510#ifdef TRE_MBSTATE
511#define BT_STACK_MBSTATE_IN  stack->item.mbstate = (mbstate)
512#define BT_STACK_MBSTATE_OUT (mbstate) = stack->item.mbstate
513#else /* !TRE_MBSTATE */
514#define BT_STACK_MBSTATE_IN
515#define BT_STACK_MBSTATE_OUT
516#endif /* !TRE_MBSTATE */
517
518#define tre_bt_mem_new		  tre_mem_new
519#define tre_bt_mem_alloc	  tre_mem_alloc
520#define tre_bt_mem_destroy	  tre_mem_destroy
521
522
523#define BT_STACK_PUSH(_pos, _str_byte, _str_wide, _state, _state_id, _next_c, _tags, _mbstate) \
524  do									      \
525    {									      \
526      int i;								      \
527      if (!stack->next)							      \
528	{								      \
529	  tre_backtrack_t s;						      \
530	  s = tre_bt_mem_alloc(mem, sizeof(*s));			      \
531	  if (!s)							      \
532	    {								      \
533	      tre_bt_mem_destroy(mem);					      \
534	      if (tags)							      \
535		xfree(tags);						      \
536	      if (pmatch)						      \
537		xfree(pmatch);						      \
538	      if (states_seen)						      \
539		xfree(states_seen);					      \
540	      return REG_ESPACE;					      \
541	    }								      \
542	  s->prev = stack;						      \
543	  s->next = NULL;						      \
544	  s->item.tags = tre_bt_mem_alloc(mem,				      \
545					  sizeof(*tags) * tnfa->num_tags);    \
546	  if (!s->item.tags)						      \
547	    {								      \
548	      tre_bt_mem_destroy(mem);					      \
549	      if (tags)							      \
550		xfree(tags);						      \
551	      if (pmatch)						      \
552		xfree(pmatch);						      \
553	      if (states_seen)						      \
554		xfree(states_seen);					      \
555	      return REG_ESPACE;					      \
556	    }								      \
557	  stack->next = s;						      \
558	  stack = s;							      \
559	}								      \
560      else								      \
561	stack = stack->next;						      \
562      stack->item.pos = (_pos);						      \
563      stack->item.str_byte = (_str_byte);				      \
564      stack->item.state = (_state);					      \
565      stack->item.state_id = (_state_id);				      \
566      stack->item.next_c = (_next_c);					      \
567      for (i = 0; i < tnfa->num_tags; i++)				      \
568	stack->item.tags[i] = (_tags)[i];				      \
569      BT_STACK_MBSTATE_IN;						      \
570    }									      \
571  while (0)
572
573#define BT_STACK_POP()							      \
574  do									      \
575    {									      \
576      int i;								      \
577      assert(stack->prev);						      \
578      pos = stack->item.pos;						      \
579      str_byte = stack->item.str_byte;					      \
580      state = stack->item.state;					      \
581      next_c = stack->item.next_c;					      \
582      for (i = 0; i < tnfa->num_tags; i++)				      \
583	tags[i] = stack->item.tags[i];					      \
584      BT_STACK_MBSTATE_OUT;						      \
585      stack = stack->prev;						      \
586    }									      \
587  while (0)
588
589#undef MIN
590#define MIN(a, b) ((a) <= (b) ? (a) : (b))
591
592static reg_errcode_t
593tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
594		       regoff_t *match_tags, int eflags, regoff_t *match_end_ofs)
595{
596  /* State variables required by GET_NEXT_WCHAR. */
597  tre_char_t prev_c = 0, next_c = 0;
598  const char *str_byte = string;
599  regoff_t pos = 0;
600  regoff_t pos_add_next = 1;
601#ifdef TRE_MBSTATE
602  mbstate_t mbstate;
603#endif /* TRE_MBSTATE */
604  int reg_notbol = eflags & REG_NOTBOL;
605  int reg_noteol = eflags & REG_NOTEOL;
606  int reg_newline = tnfa->cflags & REG_NEWLINE;
607
608  /* These are used to remember the necessary values of the above
609     variables to return to the position where the current search
610     started from. */
611  int next_c_start;
612  const char *str_byte_start;
613  regoff_t pos_start = -1;
614#ifdef TRE_MBSTATE
615  mbstate_t mbstate_start;
616#endif /* TRE_MBSTATE */
617
618  /* End offset of best match so far, or -1 if no match found yet. */
619  regoff_t match_eo = -1;
620  /* Tag arrays. */
621  int *next_tags;
622  regoff_t *tags = NULL;
623  /* Current TNFA state. */
624  tre_tnfa_transition_t *state;
625  int *states_seen = NULL;
626
627  /* Memory allocator to for allocating the backtracking stack. */
628  tre_mem_t mem = tre_bt_mem_new();
629
630  /* The backtracking stack. */
631  tre_backtrack_t stack;
632
633  tre_tnfa_transition_t *trans_i;
634  regmatch_t *pmatch = NULL;
635  int ret;
636
637#ifdef TRE_MBSTATE
638  memset(&mbstate, '\0', sizeof(mbstate));
639#endif /* TRE_MBSTATE */
640
641  if (!mem)
642    return REG_ESPACE;
643  stack = tre_bt_mem_alloc(mem, sizeof(*stack));
644  if (!stack)
645    {
646      ret = REG_ESPACE;
647      goto error_exit;
648    }
649  stack->prev = NULL;
650  stack->next = NULL;
651
652  if (tnfa->num_tags)
653    {
654      tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
655      if (!tags)
656	{
657	  ret = REG_ESPACE;
658	  goto error_exit;
659	}
660    }
661  if (tnfa->num_submatches)
662    {
663      pmatch = xmalloc(sizeof(*pmatch) * tnfa->num_submatches);
664      if (!pmatch)
665	{
666	  ret = REG_ESPACE;
667	  goto error_exit;
668	}
669    }
670  if (tnfa->num_states)
671    {
672      states_seen = xmalloc(sizeof(*states_seen) * tnfa->num_states);
673      if (!states_seen)
674	{
675	  ret = REG_ESPACE;
676	  goto error_exit;
677	}
678    }
679
680 retry:
681  {
682    int i;
683    for (i = 0; i < tnfa->num_tags; i++)
684      {
685	tags[i] = -1;
686	if (match_tags)
687	  match_tags[i] = -1;
688      }
689    for (i = 0; i < tnfa->num_states; i++)
690      states_seen[i] = 0;
691  }
692
693  state = NULL;
694  pos = pos_start;
695  GET_NEXT_WCHAR();
696  pos_start = pos;
697  next_c_start = next_c;
698  str_byte_start = str_byte;
699#ifdef TRE_MBSTATE
700  mbstate_start = mbstate;
701#endif /* TRE_MBSTATE */
702
703  /* Handle initial states. */
704  next_tags = NULL;
705  for (trans_i = tnfa->initial; trans_i->state; trans_i++)
706    {
707      if (trans_i->assertions && CHECK_ASSERTIONS(trans_i->assertions))
708	{
709	  continue;
710	}
711      if (state == NULL)
712	{
713	  /* Start from this state. */
714	  state = trans_i->state;
715	  next_tags = trans_i->tags;
716	}
717      else
718	{
719	  /* Backtrack to this state. */
720	  BT_STACK_PUSH(pos, str_byte, 0, trans_i->state,
721			trans_i->state_id, next_c, tags, mbstate);
722	  {
723	    int *tmp = trans_i->tags;
724	    if (tmp)
725	      while (*tmp >= 0)
726		stack->item.tags[*tmp++] = pos;
727	  }
728	}
729    }
730
731  if (next_tags)
732    for (; *next_tags >= 0; next_tags++)
733      tags[*next_tags] = pos;
734
735
736  if (state == NULL)
737    goto backtrack;
738
739  while (1)
740    {
741      tre_tnfa_transition_t *next_state;
742      int empty_br_match;
743
744      if (state == tnfa->final)
745	{
746	  if (match_eo < pos
747	      || (match_eo == pos
748		  && match_tags
749		  && tre_tag_order(tnfa->num_tags, tnfa->tag_directions,
750				   tags, match_tags)))
751	    {
752	      int i;
753	      /* This match wins the previous match. */
754	      match_eo = pos;
755	      if (match_tags)
756		for (i = 0; i < tnfa->num_tags; i++)
757		  match_tags[i] = tags[i];
758	    }
759	  /* Our TNFAs never have transitions leaving from the final state,
760	     so we jump right to backtracking. */
761	  goto backtrack;
762	}
763
764      /* Go to the next character in the input string. */
765      empty_br_match = 0;
766      trans_i = state;
767      if (trans_i->state && trans_i->assertions & ASSERT_BACKREF)
768	{
769	  /* This is a back reference state.  All transitions leaving from
770	     this state have the same back reference "assertion".  Instead
771	     of reading the next character, we match the back reference. */
772	  regoff_t so, eo;
773	  int bt = trans_i->u.backref;
774	  regoff_t bt_len;
775	  int result;
776
777	  /* Get the substring we need to match against.  Remember to
778	     turn off REG_NOSUB temporarily. */
779	  tre_fill_pmatch(bt + 1, pmatch, tnfa->cflags & ~REG_NOSUB,
780			  tnfa, tags, pos);
781	  so = pmatch[bt].rm_so;
782	  eo = pmatch[bt].rm_eo;
783	  bt_len = eo - so;
784
785	  result = strncmp((const char*)string + so, str_byte - 1,
786				 (size_t)bt_len);
787
788	  if (result == 0)
789	    {
790	      /* Back reference matched.  Check for infinite loop. */
791	      if (bt_len == 0)
792		empty_br_match = 1;
793	      if (empty_br_match && states_seen[trans_i->state_id])
794		{
795		  goto backtrack;
796		}
797
798	      states_seen[trans_i->state_id] = empty_br_match;
799
800	      /* Advance in input string and resync `prev_c', `next_c'
801		 and pos. */
802	      str_byte += bt_len - 1;
803	      pos += bt_len - 1;
804	      GET_NEXT_WCHAR();
805	    }
806	  else
807	    {
808	      goto backtrack;
809	    }
810	}
811      else
812	{
813	  /* Check for end of string. */
814	  if (next_c == L'\0')
815		goto backtrack;
816
817	  /* Read the next character. */
818	  GET_NEXT_WCHAR();
819	}
820
821      next_state = NULL;
822      for (trans_i = state; trans_i->state; trans_i++)
823	{
824	  if (trans_i->code_min <= (tre_cint_t)prev_c
825	      && trans_i->code_max >= (tre_cint_t)prev_c)
826	    {
827	      if (trans_i->assertions
828		  && (CHECK_ASSERTIONS(trans_i->assertions)
829		      || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)))
830		{
831		  continue;
832		}
833
834	      if (next_state == NULL)
835		{
836		  /* First matching transition. */
837		  next_state = trans_i->state;
838		  next_tags = trans_i->tags;
839		}
840	      else
841		{
842		  /* Second matching transition.  We may need to backtrack here
843		     to take this transition instead of the first one, so we
844		     push this transition in the backtracking stack so we can
845		     jump back here if needed. */
846		  BT_STACK_PUSH(pos, str_byte, 0, trans_i->state,
847				trans_i->state_id, next_c, tags, mbstate);
848		  {
849		    int *tmp;
850		    for (tmp = trans_i->tags; tmp && *tmp >= 0; tmp++)
851		      stack->item.tags[*tmp] = pos;
852		  }
853#if 0 /* XXX - it's important not to look at all transitions here to keep
854	 the stack small! */
855		  break;
856#endif
857		}
858	    }
859	}
860
861      if (next_state != NULL)
862	{
863	  /* Matching transitions were found.  Take the first one. */
864	  state = next_state;
865
866	  /* Update the tag values. */
867	  if (next_tags)
868	    while (*next_tags >= 0)
869	      tags[*next_tags++] = pos;
870	}
871      else
872	{
873	backtrack:
874	  /* A matching transition was not found.  Try to backtrack. */
875	  if (stack->prev)
876	    {
877	      if (stack->item.state->assertions & ASSERT_BACKREF)
878		{
879		  states_seen[stack->item.state_id] = 0;
880		}
881
882	      BT_STACK_POP();
883	    }
884	  else if (match_eo < 0)
885	    {
886	      /* Try starting from a later position in the input string. */
887	      /* Check for end of string. */
888	      if (next_c == L'\0')
889		    {
890		      break;
891		    }
892	      next_c = next_c_start;
893#ifdef TRE_MBSTATE
894	      mbstate = mbstate_start;
895#endif /* TRE_MBSTATE */
896	      str_byte = str_byte_start;
897	      goto retry;
898	    }
899	  else
900	    {
901	      break;
902	    }
903	}
904    }
905
906  ret = match_eo >= 0 ? REG_OK : REG_NOMATCH;
907  *match_end_ofs = match_eo;
908
909 error_exit:
910  tre_bt_mem_destroy(mem);
911#ifndef TRE_USE_ALLOCA
912  if (tags)
913    xfree(tags);
914  if (pmatch)
915    xfree(pmatch);
916  if (states_seen)
917    xfree(states_seen);
918#endif /* !TRE_USE_ALLOCA */
919
920  return ret;
921}
922
923/***********************************************************************
924 from regexec.c
925***********************************************************************/
926
927/* Fills the POSIX.2 regmatch_t array according to the TNFA tag and match
928   endpoint values. */
929static void
930tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
931		const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo)
932{
933  tre_submatch_data_t *submatch_data;
934  unsigned int i, j;
935  int *parents;
936
937  i = 0;
938  if (match_eo >= 0 && !(cflags & REG_NOSUB))
939    {
940      /* Construct submatch offsets from the tags. */
941      submatch_data = tnfa->submatch_data;
942      while (i < tnfa->num_submatches && i < nmatch)
943	{
944	  if (submatch_data[i].so_tag == tnfa->end_tag)
945	    pmatch[i].rm_so = match_eo;
946	  else
947	    pmatch[i].rm_so = tags[submatch_data[i].so_tag];
948
949	  if (submatch_data[i].eo_tag == tnfa->end_tag)
950	    pmatch[i].rm_eo = match_eo;
951	  else
952	    pmatch[i].rm_eo = tags[submatch_data[i].eo_tag];
953
954	  /* If either of the endpoints were not used, this submatch
955	     was not part of the match. */
956	  if (pmatch[i].rm_so == -1 || pmatch[i].rm_eo == -1)
957	    pmatch[i].rm_so = pmatch[i].rm_eo = -1;
958
959	  i++;
960	}
961      /* Reset all submatches that are not within all of their parent
962	 submatches. */
963      i = 0;
964      while (i < tnfa->num_submatches && i < nmatch)
965	{
966	  if (pmatch[i].rm_eo == -1)
967	    assert(pmatch[i].rm_so == -1);
968	  assert(pmatch[i].rm_so <= pmatch[i].rm_eo);
969
970	  parents = submatch_data[i].parents;
971	  if (parents != NULL)
972	    for (j = 0; parents[j] >= 0; j++)
973	      {
974		if (pmatch[i].rm_so < pmatch[parents[j]].rm_so
975		    || pmatch[i].rm_eo > pmatch[parents[j]].rm_eo)
976		  pmatch[i].rm_so = pmatch[i].rm_eo = -1;
977	      }
978	  i++;
979	}
980    }
981
982  while (i < nmatch)
983    {
984      pmatch[i].rm_so = -1;
985      pmatch[i].rm_eo = -1;
986      i++;
987    }
988}
989
990
991/*
992  Wrapper functions for POSIX compatible regexp matching.
993*/
994
995int
996regexec(const regex_t *restrict preg, const char *restrict string,
997	  size_t nmatch, regmatch_t pmatch[restrict], int eflags)
998{
999  tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
1000  reg_errcode_t status;
1001  regoff_t *tags = NULL, eo;
1002  if (tnfa->cflags & REG_NOSUB) nmatch = 0;
1003  if (tnfa->num_tags > 0 && nmatch > 0)
1004    {
1005      tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
1006      if (tags == NULL)
1007	return REG_ESPACE;
1008    }
1009
1010  /* Dispatch to the appropriate matcher. */
1011  if (tnfa->have_backrefs)
1012    {
1013      /* The regex has back references, use the backtracking matcher. */
1014      status = tre_tnfa_run_backtrack(tnfa, string, tags, eflags, &eo);
1015    }
1016  else
1017    {
1018      /* Exact matching, no back references, use the parallel matcher. */
1019      status = tre_tnfa_run_parallel(tnfa, string, tags, eflags, &eo);
1020    }
1021
1022  if (status == REG_OK)
1023    /* A match was found, so fill the submatch registers. */
1024    tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo);
1025  if (tags)
1026    xfree(tags);
1027  return status;
1028}
1029