1/*
2  tre-match-backtrack.c - TRE backtracking regex matching engine
3
4  This software is released under a BSD-style license.
5  See the file LICENSE for details and copyright.
6
7*/
8
9/*
10  This matcher is for regexps that use back referencing.  Regexp matching
11  with back referencing is an NP-complete problem on the number of back
12  references.  The easiest way to match them is to use a backtracking
13  routine which basically goes through all possible paths in the TNFA
14  and chooses the one which results in the best (leftmost and longest)
15  match.  This can be spectacularly expensive and may run out of stack
16  space, but there really is no better known generic algorithm.	 Quoting
17  Henry Spencer from comp.compilers:
18  <URL: http://compilers.iecc.com/comparch/article/93-03-102>
19
20    POSIX.2 REs require longest match, which is really exciting to
21    implement since the obsolete ("basic") variant also includes
22    \<digit>.  I haven't found a better way of tackling this than doing
23    a preliminary match using a DFA (or simulation) on a modified RE
24    that just replicates subREs for \<digit>, and then doing a
25    backtracking match to determine whether the subRE matches were
26    right.  This can be rather slow, but I console myself with the
27    thought that people who use \<digit> deserve very slow execution.
28    (Pun unintentional but very appropriate.)
29
30*/
31
32
33#ifdef HAVE_CONFIG_H
34#include <config.h>
35#endif /* HAVE_CONFIG_H */
36
37#include "tre-alloca.h"
38
39#include <assert.h>
40#include <stdlib.h>
41#include <string.h>
42#ifdef HAVE_WCHAR_H
43#include <wchar.h>
44#endif /* HAVE_WCHAR_H */
45#ifdef HAVE_WCTYPE_H
46#include <wctype.h>
47#endif /* HAVE_WCTYPE_H */
48#ifndef TRE_WCHAR
49#include <ctype.h>
50#endif /* !TRE_WCHAR */
51#ifdef HAVE_MALLOC_H
52#include <malloc.h>
53#endif /* HAVE_MALLOC_H */
54
55#include "tre-internal.h"
56#include "tre-mem.h"
57#include "tre-match-utils.h"
58#include "tre.h"
59#include "xmalloc.h"
60
61typedef struct {
62  int pos;
63  const char *str_byte;
64#ifdef TRE_WCHAR
65  const wchar_t *str_wide;
66#endif /* TRE_WCHAR */
67  tre_tnfa_transition_t *state;
68  int state_id;
69  int next_c;
70  int *tags;
71#ifdef TRE_MBSTATE
72  mbstate_t mbstate;
73#endif /* TRE_MBSTATE */
74} tre_backtrack_item_t;
75
76typedef struct tre_backtrack_struct {
77  tre_backtrack_item_t item;
78  struct tre_backtrack_struct *prev;
79  struct tre_backtrack_struct *next;
80} *tre_backtrack_t;
81
82#ifdef TRE_WCHAR
83#define BT_STACK_WIDE_IN(_str_wide)	stack->item.str_wide = (_str_wide)
84#define BT_STACK_WIDE_OUT		(str_wide) = stack->item.str_wide
85#else /* !TRE_WCHAR */
86#define BT_STACK_WIDE_IN(_str_wide)
87#define BT_STACK_WIDE_OUT
88#endif /* !TRE_WCHAR */
89
90#ifdef TRE_MBSTATE
91#define BT_STACK_MBSTATE_IN  stack->item.mbstate = (mbstate)
92#define BT_STACK_MBSTATE_OUT (mbstate) = stack->item.mbstate
93#else /* !TRE_MBSTATE */
94#define BT_STACK_MBSTATE_IN
95#define BT_STACK_MBSTATE_OUT
96#endif /* !TRE_MBSTATE */
97
98
99#ifdef TRE_USE_ALLOCA
100#define tre_bt_mem_new		  tre_mem_newa
101#define tre_bt_mem_alloc	  tre_mem_alloca
102#define tre_bt_mem_destroy(obj)	  do { } while (/*CONSTCOND*/(void)0,0)
103#else /* !TRE_USE_ALLOCA */
104#define tre_bt_mem_new		  tre_mem_new
105#define tre_bt_mem_alloc	  tre_mem_alloc
106#define tre_bt_mem_destroy	  tre_mem_destroy
107#endif /* !TRE_USE_ALLOCA */
108
109
110#define BT_STACK_PUSH(_pos, _str_byte, _str_wide, _state, _state_id, _next_c, _tags, _mbstate) \
111  do									      \
112    {									      \
113      size_t i;								      \
114      if (!stack->next)							      \
115	{								      \
116	  tre_backtrack_t s;						      \
117	  s = tre_bt_mem_alloc(mem, sizeof(*s));			      \
118	  if (!s)							      \
119	    {								      \
120	      tre_bt_mem_destroy(mem);					      \
121	      if (tags)							      \
122		xfree(tags);						      \
123	      if (pmatch)						      \
124		xfree(pmatch);						      \
125	      if (states_seen)						      \
126		xfree(states_seen);					      \
127	      return REG_ESPACE;					      \
128	    }								      \
129	  s->prev = stack;						      \
130	  s->next = NULL;						      \
131	  s->item.tags = tre_bt_mem_alloc(mem,				      \
132					  sizeof(*tags) * tnfa->num_tags);    \
133	  if (!s->item.tags)						      \
134	    {								      \
135	      tre_bt_mem_destroy(mem);					      \
136	      if (tags)							      \
137		xfree(tags);						      \
138	      if (pmatch)						      \
139		xfree(pmatch);						      \
140	      if (states_seen)						      \
141		xfree(states_seen);					      \
142	      return REG_ESPACE;					      \
143	    }								      \
144	  stack->next = s;						      \
145	  stack = s;							      \
146	}								      \
147      else								      \
148	stack = stack->next;						      \
149      stack->item.pos = (_pos);						      \
150      stack->item.str_byte = (_str_byte);				      \
151      BT_STACK_WIDE_IN(_str_wide);					      \
152      stack->item.state = (_state);					      \
153      stack->item.state_id = (_state_id);				      \
154      stack->item.next_c = (_next_c);					      \
155      for (i = 0; i < tnfa->num_tags; i++)				      \
156	stack->item.tags[i] = (_tags)[i];				      \
157      BT_STACK_MBSTATE_IN;						      \
158    }									      \
159  while (/*CONSTCOND*/(void)0,0)
160
161#define BT_STACK_POP()							      \
162  do									      \
163    {									      \
164      size_t i;								      \
165      assert(stack->prev);						      \
166      pos = stack->item.pos;						      \
167      if (type == STR_USER)                                                   \
168        str_source->rewind(pos + pos_add_next, str_source->context);          \
169      str_byte = stack->item.str_byte;					      \
170      BT_STACK_WIDE_OUT;						      \
171      state = stack->item.state;					      \
172      next_c = (tre_char_t) stack->item.next_c;					      \
173      for (i = 0; i < tnfa->num_tags; i++)				      \
174	tags[i] = stack->item.tags[i];					      \
175      BT_STACK_MBSTATE_OUT;						      \
176      stack = stack->prev;						      \
177    }									      \
178  while (/*CONSTCOND*/(void)0,0)
179
180#undef MIN
181#define MIN(a, b) ((a) <= (b) ? (a) : (b))
182
183reg_errcode_t
184tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
185		       int len, tre_str_type_t type, int *match_tags,
186		       int eflags, int *match_end_ofs)
187{
188  /* State variables required by GET_NEXT_WCHAR. */
189  tre_char_t prev_c = 0, next_c = 0;
190  const char *str_byte = string;
191  int pos = 0;
192  unsigned int pos_add_next = 1;
193#ifdef TRE_WCHAR
194  const wchar_t *str_wide = string;
195#ifdef TRE_MBSTATE
196  mbstate_t mbstate;
197#endif /* TRE_MBSTATE */
198#endif /* TRE_WCHAR */
199  int reg_notbol = eflags & REG_NOTBOL;
200  int reg_noteol = eflags & REG_NOTEOL;
201  int reg_newline = tnfa->cflags & REG_NEWLINE;
202  size_t str_user_end = 0;
203
204  /* These are used to remember the necessary values of the above
205     variables to return to the position where the current search
206     started from. */
207  int next_c_start;
208  const char *str_byte_start;
209  int pos_start = -1;
210#ifdef TRE_WCHAR
211  const wchar_t *str_wide_start;
212#endif /* TRE_WCHAR */
213#ifdef TRE_MBSTATE
214  mbstate_t mbstate_start;
215#endif /* TRE_MBSTATE */
216
217  /* End offset of best match so far, or -1 if no match found yet. */
218  int match_eo = -1;
219  /* Tag arrays. */
220  int *next_tags, *tags = NULL;
221  /* Current TNFA state. */
222  tre_tnfa_transition_t *state;
223  int *states_seen = NULL;
224
225  /* Memory allocator to for allocating the backtracking stack. */
226  tre_mem_t mem = tre_bt_mem_new();
227
228  /* The backtracking stack. */
229  tre_backtrack_t stack;
230
231  tre_tnfa_transition_t *trans_i;
232  regmatch_t *pmatch = NULL;
233  reg_errcode_t ret;
234
235#ifdef TRE_MBSTATE
236  memset(&mbstate, '\0', sizeof(mbstate));
237#endif /* TRE_MBSTATE */
238
239  if (!mem)
240    return REG_ESPACE;
241  stack = tre_bt_mem_alloc(mem, sizeof(*stack));
242  if (!stack)
243    {
244      ret = REG_ESPACE;
245      goto error_exit;
246    }
247  stack->prev = NULL;
248  stack->next = NULL;
249
250  DPRINT(("tnfa_execute_backtrack, input type %d\n", type));
251  DPRINT(("len = %d\n", len));
252
253#ifdef TRE_USE_ALLOCA
254  tags = alloca(sizeof(*tags) * tnfa->num_tags);
255  pmatch = alloca(sizeof(*pmatch) * tnfa->num_submatches);
256  states_seen = alloca(sizeof(*states_seen) * tnfa->num_states);
257#else /* !TRE_USE_ALLOCA */
258  if (tnfa->num_tags)
259    {
260      tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
261      if (!tags)
262	{
263	  ret = REG_ESPACE;
264	  goto error_exit;
265	}
266    }
267  if (tnfa->num_submatches)
268    {
269      pmatch = xmalloc(sizeof(*pmatch) * tnfa->num_submatches);
270      if (!pmatch)
271	{
272	  ret = REG_ESPACE;
273	  goto error_exit;
274	}
275    }
276  if (tnfa->num_states)
277    {
278      states_seen = xmalloc(sizeof(*states_seen) * tnfa->num_states);
279      if (!states_seen)
280	{
281	  ret = REG_ESPACE;
282	  goto error_exit;
283	}
284    }
285#endif /* !TRE_USE_ALLOCA */
286
287 retry:
288  {
289    size_t i;
290    for (i = 0; i < tnfa->num_tags; i++)
291      {
292	tags[i] = -1;
293	if (match_tags)
294	  match_tags[i] = -1;
295      }
296    for (i = 0; i < tnfa->num_states; i++)
297      states_seen[i] = 0;
298  }
299
300  state = NULL;
301  pos = pos_start;
302  if (type == STR_USER)
303    str_source->rewind(pos + pos_add_next, str_source->context);
304  GET_NEXT_WCHAR();
305  pos_start = pos;
306  next_c_start = next_c;
307  str_byte_start = str_byte;
308#ifdef TRE_WCHAR
309  str_wide_start = str_wide;
310#endif /* TRE_WCHAR */
311#ifdef TRE_MBSTATE
312  mbstate_start = mbstate;
313#endif /* TRE_MBSTATE */
314
315  /* Handle initial states. */
316  next_tags = NULL;
317  for (trans_i = tnfa->initial; trans_i->state; trans_i++)
318    {
319      DPRINT(("> init %p, prev_c %lc\n", trans_i->state, (tre_cint_t)prev_c));
320      if (trans_i->assertions && CHECK_ASSERTIONS(trans_i->assertions))
321	{
322	  DPRINT(("assert failed\n"));
323	  continue;
324	}
325      if (state == NULL)
326	{
327	  /* Start from this state. */
328	  state = trans_i->state;
329	  next_tags = trans_i->tags;
330	}
331      else
332	{
333	  /* Backtrack to this state. */
334	  DPRINT(("saving state %d for backtracking\n", trans_i->state_id));
335	  BT_STACK_PUSH(pos, str_byte, str_wide, trans_i->state,
336			trans_i->state_id, next_c, tags, mbstate);
337	  {
338	    int *tmp = trans_i->tags;
339	    if (tmp)
340	      while (*tmp >= 0)
341		stack->item.tags[*tmp++] = pos;
342	  }
343	}
344    }
345
346  if (next_tags)
347    for (; *next_tags >= 0; next_tags++)
348      tags[*next_tags] = pos;
349
350
351  DPRINT(("entering match loop, pos %d, str_byte %p\n", pos, str_byte));
352  DPRINT(("pos:chr/code | state and tags\n"));
353  DPRINT(("-------------+------------------------------------------------\n"));
354
355  if (state == NULL)
356    goto backtrack;
357
358  while (/*CONSTCOND*/(void)1,1)
359    {
360      tre_tnfa_transition_t *next_state;
361      int empty_br_match;
362
363      DPRINT(("start loop\n"));
364      if (state == tnfa->final)
365	{
366	  DPRINT(("  match found, %d %d\n", match_eo, pos));
367	  if (match_eo < pos
368	      || (match_eo == pos
369		  && match_tags
370		  && tre_tag_order(tnfa->num_tags, tnfa->tag_directions,
371				   tags, match_tags)))
372	    {
373	      size_t i;
374	      /* This match wins the previous match. */
375	      DPRINT(("	 win previous\n"));
376	      match_eo = pos;
377	      if (match_tags)
378		for (i = 0; i < tnfa->num_tags; i++)
379		  match_tags[i] = tags[i];
380	    }
381	  /* Our TNFAs never have transitions leaving from the final state,
382	     so we jump right to backtracking. */
383	  goto backtrack;
384	}
385
386#ifdef TRE_DEBUG
387      DPRINT(("%3d:%2lc/%05d | %p ", pos, (tre_cint_t)next_c, (int)next_c,
388	      state));
389      {
390	int i;
391	for (i = 0; i < tnfa->num_tags; i++)
392	  DPRINT(("%d%s", tags[i], i < tnfa->num_tags - 1 ? ", " : ""));
393	DPRINT(("\n"));
394      }
395#endif /* TRE_DEBUG */
396
397      /* Go to the next character in the input string. */
398      empty_br_match = 0;
399      trans_i = state;
400      if (trans_i->state && trans_i->assertions & ASSERT_BACKREF)
401	{
402	  /* This is a back reference state.  All transitions leaving from
403	     this state have the same back reference "assertion".  Instead
404	     of reading the next character, we match the back reference. */
405	  regoff_t so, eo, bt = trans_i->u.backref;
406	  regoff_t bt_len;
407	  regoff_t result;
408
409	  DPRINT(("  should match back reference %d\n", bt));
410	  /* Get the substring we need to match against.  Remember to
411	     turn off REG_NOSUB temporarily. */
412	  tre_fill_pmatch(bt + 1, pmatch, tnfa->cflags & ~REG_NOSUB,
413			  tnfa, tags, pos);
414	  /* LINTED */so = pmatch[bt].rm_so;
415	  /* LINTED */eo = pmatch[bt].rm_eo;
416	  bt_len = eo - so;
417
418#ifdef TRE_DEBUG
419	  {
420	    int slen;
421	    if (len < 0)
422	      slen = bt_len;
423	    else
424	      slen = MIN(bt_len, len - pos);
425
426	    if (type == STR_BYTE)
427	      {
428		DPRINT(("  substring (len %d) is [%d, %d[: '%.*s'\n",
429			bt_len, so, eo, bt_len, (char*)string + so));
430		DPRINT(("  current string is '%.*s'\n", slen, str_byte - 1));
431	      }
432#ifdef TRE_WCHAR
433	    else if (type == STR_WIDE)
434	      {
435		DPRINT(("  substring (len %d) is [%d, %d[: '%.*" STRF "'\n",
436			bt_len, so, eo, bt_len, (wchar_t*)string + so));
437		DPRINT(("  current string is '%.*" STRF "'\n",
438			slen, str_wide - 1));
439	      }
440#endif /* TRE_WCHAR */
441	  }
442#endif
443
444	  if (len < 0)
445	    {
446	      if (type == STR_USER)
447		result = str_source->compare((unsigned)so, (unsigned)pos,
448					     (unsigned)bt_len,
449					     str_source->context);
450#ifdef TRE_WCHAR
451	      else if (type == STR_WIDE)
452		result = wcsncmp((const wchar_t*)string + so, str_wide - 1,
453				 (size_t)bt_len);
454#endif /* TRE_WCHAR */
455	      else
456		/* LINTED */result = strncmp((const char*)string + so, str_byte - 1,
457				 (size_t)bt_len);
458	    }
459	  else if (len - pos < bt_len)
460	    result = 1;
461#ifdef TRE_WCHAR
462	  else if (type == STR_WIDE)
463	    result = wmemcmp((const wchar_t*)string + so, str_wide - 1,
464			     (size_t)bt_len);
465#endif /* TRE_WCHAR */
466	  else
467	    /* LINTED */result = memcmp((const char*)string + so, str_byte - 1,
468			    (size_t)bt_len);
469
470	  if (result == 0)
471	    {
472	      /* Back reference matched.  Check for infinite loop. */
473	      if (bt_len == 0)
474		empty_br_match = 1;
475	      if (empty_br_match && states_seen[trans_i->state_id])
476		{
477		  DPRINT(("  avoid loop\n"));
478		  goto backtrack;
479		}
480
481	      states_seen[trans_i->state_id] = empty_br_match;
482
483	      /* Advance in input string and resync `prev_c', `next_c'
484		 and pos. */
485	      DPRINT(("	 back reference matched\n"));
486	      /* LINTED */str_byte += bt_len - 1;
487#ifdef TRE_WCHAR
488	      /* LINTED */str_wide += bt_len - 1;
489#endif /* TRE_WCHAR */
490	      /* LINTED */pos += bt_len - 1;
491	      GET_NEXT_WCHAR();
492	      DPRINT(("	 pos now %d\n", pos));
493	    }
494	  else
495	    {
496	      DPRINT(("	 back reference did not match\n"));
497	      goto backtrack;
498	    }
499	}
500      else
501	{
502	  /* Check for end of string. */
503	  if (len < 0)
504	    {
505	      if (type == STR_USER)
506		{
507		  if (str_user_end)
508		    goto backtrack;
509		}
510	      else if (next_c == L'\0')
511		goto backtrack;
512	    }
513	  else
514	    {
515	      if (pos >= len)
516		goto backtrack;
517	    }
518
519	  /* Read the next character. */
520	  GET_NEXT_WCHAR();
521	}
522
523      next_state = NULL;
524      for (trans_i = state; trans_i->state; trans_i++)
525	{
526	  DPRINT(("  transition %d-%d (%c-%c) %d to %d\n",
527		  trans_i->code_min, trans_i->code_max,
528		  trans_i->code_min, trans_i->code_max,
529		  trans_i->assertions, trans_i->state_id));
530	  if (trans_i->code_min <= (tre_cint_t)prev_c
531	      && trans_i->code_max >= (tre_cint_t)prev_c)
532	    {
533	      if (trans_i->assertions
534		  && (CHECK_ASSERTIONS(trans_i->assertions)
535		      || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)))
536		{
537		  DPRINT(("  assertion failed\n"));
538		  continue;
539		}
540
541	      if (next_state == NULL)
542		{
543		  /* First matching transition. */
544		  DPRINT(("  Next state is %d\n", trans_i->state_id));
545		  next_state = trans_i->state;
546		  next_tags = trans_i->tags;
547		}
548	      else
549		{
550		  /* Second matching transition.  We may need to backtrack here
551		     to take this transition instead of the first one, so we
552		     push this transition in the backtracking stack so we can
553		     jump back here if needed. */
554		  DPRINT(("  saving state %d for backtracking\n",
555			  trans_i->state_id));
556		  BT_STACK_PUSH(pos, str_byte, str_wide, trans_i->state,
557				trans_i->state_id, next_c, tags, mbstate);
558		  {
559		    int *tmp;
560		    for (tmp = trans_i->tags; tmp && *tmp >= 0; tmp++)
561		      stack->item.tags[*tmp] = pos;
562		  }
563#if 0 /* XXX - it's important not to look at all transitions here to keep
564	 the stack small! */
565		  break;
566#endif
567		}
568	    }
569	}
570
571      if (next_state != NULL)
572	{
573	  /* Matching transitions were found.  Take the first one. */
574	  state = next_state;
575
576	  /* Update the tag values. */
577	  if (next_tags)
578	    while (*next_tags >= 0)
579	      tags[*next_tags++] = pos;
580	}
581      else
582	{
583	backtrack:
584	  /* A matching transition was not found.  Try to backtrack. */
585	  if (stack->prev)
586	    {
587	      DPRINT(("	 backtracking\n"));
588#if ASSERT_BACKREF
589	      if (stack->item.state->assertions)
590		{
591		  DPRINT(("  states_seen[%d] = 0\n",
592			  stack->item.state_id));
593		  states_seen[stack->item.state_id] = 0;
594		}
595#endif
596
597	      BT_STACK_POP();
598	    }
599	  else if (match_eo < 0)
600	    {
601	      /* Try starting from a later position in the input string. */
602	      /* Check for end of string. */
603	      if (len < 0)
604		{
605		  if (next_c == L'\0')
606		    {
607		      DPRINT(("end of string.\n"));
608		      break;
609		    }
610		}
611	      else
612		{
613		  if (pos >= len)
614		    {
615		      DPRINT(("end of string.\n"));
616		      break;
617		    }
618		}
619	      DPRINT(("restarting from next start position\n"));
620	      next_c = (tre_char_t) next_c_start;
621#ifdef TRE_MBSTATE
622	      mbstate = mbstate_start;
623#endif /* TRE_MBSTATE */
624	      str_byte = str_byte_start;
625#ifdef TRE_WCHAR
626	      str_wide = str_wide_start;
627#endif /* TRE_WCHAR */
628	      goto retry;
629	    }
630	  else
631	    {
632	      DPRINT(("finished\n"));
633	      break;
634	    }
635	}
636    }
637
638  ret = match_eo >= 0 ? REG_OK : REG_NOMATCH;
639  *match_end_ofs = match_eo;
640
641 error_exit:
642  tre_bt_mem_destroy(mem);
643#ifndef TRE_USE_ALLOCA
644  if (tags)
645    xfree(tags);
646  if (pmatch)
647    xfree(pmatch);
648  if (states_seen)
649    xfree(states_seen);
650#endif /* !TRE_USE_ALLOCA */
651
652  return ret;
653}
654