1/*
2  tre-match-parallel.c - TRE parallel regex matching engine
3
4  This software is released under a BSD-style license.
5  See the file LICENSE for details and copyright.
6
7*/
8
9/*
10  This algorithm searches for matches basically by reading characters
11  in the searched string one by one, starting at the beginning.	 All
12  matching paths in the TNFA are traversed in parallel.	 When two or
13  more paths reach the same state, exactly one is chosen according to
14  tag ordering rules; if returning submatches is not required it does
15  not matter which path is chosen.
16
17  The worst case time required for finding the leftmost and longest
18  match, or determining that there is no match, is always linearly
19  dependent on the length of the text being searched.
20
21  This algorithm cannot handle TNFAs with back referencing nodes.
22  See `tre-match-backtrack.c'.
23*/
24
25
26#ifdef HAVE_CONFIG_H
27#include <config.h>
28#endif /* HAVE_CONFIG_H */
29
30/* Unset TRE_USE_ALLOCA to avoid using the stack to hold all the state
31   info while running */
32#undef TRE_USE_ALLOCA
33
34#ifdef TRE_USE_ALLOCA
35/* AIX requires this to be the first thing in the file.	 */
36#ifndef __GNUC__
37# if HAVE_ALLOCA_H
38#  include <alloca.h>
39# else
40#  ifdef _AIX
41 #pragma alloca
42#  else
43#   ifndef alloca /* predefined by HP cc +Olibcalls */
44char *alloca ();
45#   endif
46#  endif
47# endif
48#endif
49#endif /* TRE_USE_ALLOCA */
50
51#include <assert.h>
52#include <stdlib.h>
53#include <string.h>
54#ifdef HAVE_WCHAR_H
55#include <wchar.h>
56#endif /* HAVE_WCHAR_H */
57#ifdef HAVE_WCTYPE_H
58#include <wctype.h>
59#endif /* HAVE_WCTYPE_H */
60#ifndef TRE_WCHAR
61#include <ctype.h>
62#endif /* !TRE_WCHAR */
63#ifdef HAVE_MALLOC_H
64#include <malloc.h>
65#endif /* HAVE_MALLOC_H */
66
67#include "tre-internal.h"
68#include "tre-match-utils.h"
69#include "tre.h"
70#include "xmalloc.h"
71
72
73
74typedef struct {
75  tre_tnfa_transition_t *state;
76  tre_tag_t *tags;
77} tre_tnfa_reach_t;
78
79typedef struct {
80  int pos;
81  tre_tag_t **tags;
82} tre_reach_pos_t;
83
84
85#ifdef TRE_DEBUG
86static void
87tre_print_reach1(tre_tnfa_transition_t *state, tre_tag_t *tags, int num_tags)
88{
89  DPRINT((" %p", (void *)state));
90  if (num_tags > 0)
91    {
92      DPRINT(("/"));
93      tre_print_tags(tags, num_tags);
94    }
95}
96
97static void
98tre_print_reach(const tre_tnfa_t *tnfa, tre_tnfa_reach_t *reach, int num_tags)
99{
100  while (reach->state != NULL)
101    {
102      tre_print_reach1(reach->state, reach->tags, num_tags);
103      reach++;
104    }
105  DPRINT(("\n"));
106
107}
108#endif /* TRE_DEBUG */
109
110reg_errcode_t
111tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len,
112		      tre_str_type_t type, tre_tag_t *match_tags, int eflags,
113		      int *match_end_ofs)
114{
115  /* State variables required by GET_NEXT_WCHAR. */
116  tre_char_t prev_c = 0, next_c = 0;
117  const char *str_byte = string;
118  int pos = -1;
119  unsigned int pos_add_next = 1;
120#ifdef TRE_WCHAR
121  const wchar_t *str_wide = string;
122#ifdef TRE_MBSTATE
123  mbstate_t mbstate;
124#endif /* TRE_MBSTATE */
125#endif /* TRE_WCHAR */
126  int reg_notbol = eflags & REG_NOTBOL;
127  int reg_noteol = eflags & REG_NOTEOL;
128  int reg_newline = tnfa->cflags & REG_NEWLINE;
129#ifdef TRE_STR_USER
130  int str_user_end = 0;
131#endif /* TRE_STR_USER */
132
133  char *buf;
134  tre_tnfa_transition_t *trans_i;
135  tre_tnfa_reach_t *reach, *reach_next, *reach_i, *reach_next_i;
136  tre_reach_pos_t *reach_pos;
137  int *tag_i;
138  int num_tags, i;
139
140  int match_eo = -1;	   /* end offset of match (-1 if no match found yet) */
141#ifdef TRE_DEBUG
142  int once;
143#endif /* TRE_DEBUG */
144  tre_tag_t *tmp_tags = NULL;
145  tre_tag_t *tmp_iptr;
146  int tbytes;
147  int touch = 1;
148
149#ifdef TRE_MBSTATE
150  memset(&mbstate, '\0', sizeof(mbstate));
151#endif /* TRE_MBSTATE */
152
153  DPRINT(("tre_tnfa_run_parallel, input type %d\n", type));
154
155  if (!match_tags)
156    num_tags = 0;
157  else
158    num_tags = tnfa->num_tags;
159
160  /* Allocate memory for temporary data required for matching.	This needs to
161     be done for every matching operation to be thread safe.  This allocates
162     everything in a single large block from the stack frame using alloca()
163     or with malloc() if alloca is unavailable. */
164  {
165    int rbytes, pbytes, total_bytes;
166    char *tmp_buf;
167    /* Compute the length of the block we need. */
168    tbytes = sizeof(*tmp_tags) * num_tags;
169    rbytes = sizeof(*reach_next) * (tnfa->num_states + 1);
170    pbytes = sizeof(*reach_pos) * tnfa->num_states;
171    total_bytes =
172      (sizeof(long) - 1) * 4 /* for alignment paddings */
173      + (rbytes + tbytes * tnfa->num_states) * 2 + tbytes + pbytes;
174
175    DPRINT(("tre_tnfa_run_parallel, allocate %d bytes\n", total_bytes));
176    /* Allocate the memory. */
177#ifdef TRE_USE_ALLOCA
178    buf = alloca(total_bytes);
179#else /* !TRE_USE_ALLOCA */
180    buf = xmalloc((unsigned)total_bytes);
181#endif /* !TRE_USE_ALLOCA */
182    if (buf == NULL)
183      return REG_ESPACE;
184    memset(buf, 0, (size_t)total_bytes);
185
186    /* Get the various pointers within tmp_buf (properly aligned). */
187    tmp_tags = (void *)buf;
188    tmp_buf = buf + tbytes;
189    tmp_buf += ALIGN(tmp_buf, long);
190    reach_next = (void *)tmp_buf;
191    tmp_buf += rbytes;
192    tmp_buf += ALIGN(tmp_buf, long);
193    reach = (void *)tmp_buf;
194    tmp_buf += rbytes;
195    tmp_buf += ALIGN(tmp_buf, long);
196    reach_pos = (void *)tmp_buf;
197    tmp_buf += pbytes;
198    tmp_buf += ALIGN(tmp_buf, long);
199    for (i = 0; i < tnfa->num_states; i++)
200      {
201	reach[i].tags = (void *)tmp_buf;
202	tmp_buf += tbytes;
203	reach_next[i].tags = (void *)tmp_buf;
204	tmp_buf += tbytes;
205      }
206  }
207
208  for (i = 0; i < tnfa->num_states; i++)
209    reach_pos[i].pos = -1;
210
211  /* If only one character can start a match, find it first. */
212  if (tnfa->first_char >= 0 && str_byte)
213    {
214      const char *orig_str = str_byte;
215      int first = tnfa->first_char;
216      int found_high_bit = 0;
217
218
219      if (type == STR_BYTE)
220	{
221	  if (len >= 0)
222	    str_byte = memchr(orig_str, first, (size_t)len);
223	  else
224	    str_byte = strchr(orig_str, first);
225	}
226      else if (type == STR_MBS)
227	{
228	  /*
229	   * If the match character is ASCII, try to match the character
230	   * directly, but if a high bit character is found, we stop there.
231	   */
232	  if (first < 0x80)
233	    {
234	      if (len >= 0)
235		{
236		  int i;
237		  for (i = 0; ; str_byte++, i++)
238		    {
239		      if (i >= len)
240			{
241			  str_byte = NULL;
242			  break;
243			}
244		      if (*str_byte == first)
245			break;
246		      if (*str_byte & 0x80)
247			{
248			  found_high_bit = 1;
249			  break;
250			}
251		    }
252		}
253	      else
254		{
255		  for (; ; str_byte++)
256		    {
257		      if (!*str_byte)
258			{
259			  str_byte = NULL;
260			  break;
261			}
262		      if (*str_byte == first)
263			break;
264		      if (*str_byte & 0x80)
265			{
266			  found_high_bit = 1;
267			  break;
268			}
269		    }
270		}
271	    }
272	  else
273	    {
274	      if (len >= 0)
275		{
276		  int i;
277		  for (i = 0; ; str_byte++, i++)
278		    {
279		      if (i >= len)
280			{
281			  str_byte = NULL;
282			  break;
283			}
284		      if (*str_byte & 0x80)
285			{
286			  found_high_bit = 1;
287			  break;
288			}
289		    }
290		}
291	      else
292		{
293		  for (; ; str_byte++)
294		    {
295		      if (!*str_byte)
296			{
297			  str_byte = NULL;
298			  break;
299			}
300		      if (*str_byte & 0x80)
301			{
302			  found_high_bit = 1;
303			  break;
304			}
305		    }
306		}
307	    }
308	}
309      if (str_byte == NULL)
310	{
311#ifndef TRE_USE_ALLOCA
312	  if (buf)
313	    xfree(buf);
314#endif /* !TRE_USE_ALLOCA */
315	  return REG_NOMATCH;
316	}
317      DPRINT(("skipped %lu chars\n", (unsigned long)(str_byte - orig_str)));
318      if (!found_high_bit)
319	{
320	  if (str_byte >= orig_str + 1)
321	    prev_c = (unsigned char)*(str_byte - 1);
322	  next_c = (unsigned char)*str_byte;
323	  pos = str_byte - orig_str;
324	  if (len < 0 || pos < len)
325	    str_byte++;
326	}
327      else
328	{
329	  if (str_byte == orig_str)
330	    goto no_first_optimization;
331	  /*
332	   * Back up one character, fix up the position, then call
333	   * GET_NEXT_WCHAR() to process the multibyte character.
334	   */
335	  /* no need to set prev_c, since GET_NEXT_WCHAR will overwrite */
336	  next_c = (unsigned char)*(str_byte - 1);
337	  pos = (str_byte - 1) - orig_str;
338	  GET_NEXT_WCHAR();
339	}
340    }
341  else
342    {
343no_first_optimization:
344      GET_NEXT_WCHAR();
345      pos = 0;
346    }
347
348#ifdef USE_FIRSTPOS_CHARS /* not defined */
349  /* Skip over characters that cannot possibly be the first character
350     of a match. */
351  if (tnfa->firstpos_chars != NULL)
352    {
353      char *chars = tnfa->firstpos_chars;
354
355      if (len < 0)
356	{
357	  const char *orig_str = str_byte;
358	  /* XXX - use strpbrk() and wcspbrk() because they might be
359	     optimized for the target architecture.  Try also strcspn()
360	     and wcscspn() and compare the speeds. */
361	  while (next_c != L'\0' && !chars[next_c])
362	    {
363	      next_c = *str_byte++;
364	    }
365	  prev_c = *(str_byte - 2);
366	  pos += str_byte - orig_str;
367	  DPRINT(("skipped %d chars\n", str_byte - orig_str));
368	}
369      else
370	{
371	  while (pos <= len && !chars[next_c])
372	    {
373	      prev_c = next_c;
374	      next_c = (unsigned char)(*str_byte++);
375	      pos++;
376	    }
377	}
378    }
379#endif /* USE_FIRSTPOS_CHARS */
380
381  DPRINT(("length: %d\n", len));
382  DPRINT(("pos:chr/code | states and tags\n"));
383  DPRINT(("-------------+------------------------------------------------\n"));
384
385  reach_next_i = reach_next;
386  while (/*CONSTCOND*/1)
387    {
388      /* If no match found yet, add the initial states to `reach_next'. */
389      if (match_eo < 0)
390	{
391	  DPRINT((" init >"));
392	  trans_i = tnfa->initial;
393	  while (trans_i->state != NULL)
394	    {
395	      if (reach_pos[trans_i->state_id].pos < pos)
396		{
397		  if (trans_i->assertions
398		      && CHECK_ASSERTIONS(trans_i->assertions))
399		    {
400		      DPRINT(("assertion failed\n"));
401		      trans_i++;
402		      continue;
403		    }
404
405		  DPRINT((" %p", (void *)trans_i->state));
406		  reach_next_i->state = trans_i->state;
407		  memset(reach_next_i->tags, 0, tbytes);
408		  tag_i = trans_i->tags;
409		  if (tag_i)
410		    {
411		      while (*tag_i >= 0)
412			{
413			  if (*tag_i < num_tags)
414			    tre_tag_set(reach_next_i->tags, *tag_i, pos, touch);
415			  tag_i++;
416			}
417			touch++;
418		    }
419		  if (reach_next_i->state == tnfa->final)
420		    {
421		      DPRINT(("	 found empty match\n"));
422		      match_eo = pos;
423		      memcpy(match_tags, reach_next_i->tags, tbytes);
424		    }
425		  reach_pos[trans_i->state_id].pos = pos;
426		  reach_pos[trans_i->state_id].tags = &reach_next_i->tags;
427		  reach_next_i++;
428		}
429	      trans_i++;
430	    }
431	  DPRINT(("\n"));
432	  reach_next_i->state = NULL;
433	}
434      else
435	{
436	  if (num_tags == 0 || reach_next_i == reach_next)
437	    /*�We have found a match. */
438	    break;
439	}
440
441      /* Check for end of string. */
442      if (len < 0)
443	{
444#ifdef TRE_STR_USER
445	  if (type == STR_USER)
446	    {
447	      if (str_user_end)
448		break;
449	    }
450	  else
451#endif /* TRE_STR_USER */
452	  if (next_c == L'\0')
453	    break;
454	}
455      else
456	{
457	  if (pos >= len)
458	    break;
459	}
460
461      GET_NEXT_WCHAR();
462
463#ifdef TRE_DEBUG
464      DPRINT(("%3d:%2lc/%05d |", pos - 1, (tre_cint_t)prev_c, (int)prev_c));
465      tre_print_reach(tnfa, reach_next, num_tags);
466      //DPRINT(("%3d:%2lc/%05d |", pos, (tre_cint_t)next_c, (int)next_c));
467      //tre_print_reach(tnfa, reach_next, num_tags);
468#endif /* TRE_DEBUG */
469
470      /* Swap `reach' and `reach_next'. */
471      reach_i = reach;
472      reach = reach_next;
473      reach_next = reach_i;
474
475#ifdef TRE_DEBUG
476      once = 0;
477#endif /* TRE_DEBUG */
478
479      /* For each state in `reach' see if there is a transition leaving with
480	 the current input symbol to a state not yet in `reach_next', and
481	 add the destination states to `reach_next'. */
482      reach_next_i = reach_next;
483      for (reach_i = reach; reach_i->state; reach_i++)
484	{
485	  for (trans_i = reach_i->state; trans_i->state; trans_i++)
486	    {
487	      /* Does this transition match the input symbol? */
488	      if (trans_i->code_min <= (tre_cint_t)prev_c &&
489		  trans_i->code_max >= (tre_cint_t)prev_c)
490		{
491		  if (trans_i->assertions
492		      && (CHECK_ASSERTIONS(trans_i->assertions)
493			  || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)))
494		    {
495		      DPRINT(("assertion failed\n"));
496		      continue;
497		    }
498
499		  /* Compute the tags after this transition. */
500		  memcpy(tmp_tags, reach_i->tags, tbytes);
501		  tag_i = trans_i->tags;
502		  if (tag_i != NULL)
503		    {
504		      while (*tag_i >= 0)
505			{
506			  if (*tag_i < num_tags)
507			    tre_tag_set(tmp_tags, *tag_i, pos, touch);
508			  tag_i++;
509			}
510			touch++;
511		    }
512
513		  /* For each new transition, weed out those that don't
514		     fulfill the minimal matching conditions. */
515		  if (tnfa->num_minimals && match_eo >= 0)
516		    {
517		      int skip = 0;
518#ifdef TRE_DEBUG
519		      if (!once)
520			{
521			  DPRINT(("Checking minimal conditions: match_eo=%d "
522				  "match_tags=", match_eo));
523			  tre_print_tags(match_tags, num_tags);
524			  DPRINT(("\n"));
525			  once++;
526			}
527#endif /* TRE_DEBUG */
528		      for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2)
529			{
530			   int end = tnfa->minimal_tags[i];
531			   int start = tnfa->minimal_tags[i + 1];
532			   DPRINT(("  Minimal start %d, end %d\n", start, end));
533			   if (tre_minimal_tag_order(start, end, match_tags,
534						     tmp_tags) > 0)
535			     {
536				skip = 1;
537				break;
538			     }
539			}
540		      if (skip)
541			{
542#ifdef TRE_DEBUG
543			   DPRINT(("	 Throwing out"));
544			   tre_print_reach1(reach_i->state, tmp_tags,
545					     num_tags);
546			   DPRINT(("\n"));
547#endif /* TRE_DEBUG */
548			   continue;
549			}
550		    }
551
552		  if (reach_pos[trans_i->state_id].pos < pos)
553		    {
554		      /* Found an unvisited node. */
555		      reach_next_i->state = trans_i->state;
556		      tmp_iptr = reach_next_i->tags;
557		      reach_next_i->tags = tmp_tags;
558		      tmp_tags = tmp_iptr;
559		      reach_pos[trans_i->state_id].pos = pos;
560		      reach_pos[trans_i->state_id].tags = &reach_next_i->tags;
561
562		      if (reach_next_i->state == tnfa->final
563			  && (match_eo == -1
564			      || (num_tags > 0
565				  && tre_tag_get(reach_next_i->tags, 0) <=
566				  tre_tag_get(match_tags, 0))))
567			{
568#ifdef TRE_DEBUG
569			  DPRINT(("  found match"));
570			  tre_print_reach1(trans_i->state, reach_next_i->tags, num_tags);
571			  DPRINT(("\n"));
572#endif /* TRE_DEBUG */
573			  match_eo = pos;
574			  memcpy(match_tags, reach_next_i->tags, tbytes);
575			}
576		      reach_next_i++;
577
578		    }
579		  else
580		    {
581		      assert(reach_pos[trans_i->state_id].pos == pos);
582		      /* Another path has also reached this state.  We choose
583			 the winner by examining the tag values for both
584			 paths. */
585		      if (tre_tag_order(num_tags, tnfa->tag_directions,
586					tmp_tags,
587					*reach_pos[trans_i->state_id].tags))
588			{
589			  /* The new path wins. */
590			  tmp_iptr = *reach_pos[trans_i->state_id].tags;
591			  *reach_pos[trans_i->state_id].tags = tmp_tags;
592			  if (trans_i->state == tnfa->final)
593			    {
594#ifdef TRE_DEBUG
595			      DPRINT(("	 found better match"));
596			      tre_print_reach1(trans_i->state, tmp_tags, num_tags);
597			      DPRINT(("\n"));
598#endif /* TRE_DEBUG */
599			      match_eo = pos;
600			      memcpy(match_tags, tmp_tags, tbytes);
601			    }
602			  tmp_tags = tmp_iptr;
603			}
604		    }
605		}
606	    }
607	}
608      reach_next_i->state = NULL;
609    }
610
611  DPRINT(("match end offset = %d\n", match_eo));
612
613  *match_end_ofs = match_eo;
614#ifdef TRE_DEBUG
615  if (match_tags)
616    {
617      DPRINT(("Winning tags="));
618      tre_print_tags_all(match_tags, num_tags);
619      DPRINT((" touch=%d\n", touch));
620    }
621#endif /* TRE_DEBUG */
622
623#ifndef TRE_USE_ALLOCA
624  if (buf)
625    xfree(buf);
626#endif /* !TRE_USE_ALLOCA */
627
628  return match_eo >= 0 ? REG_OK : REG_NOMATCH;
629}
630
631/* EOF */
632