1/*
2  tre_regexec.c - TRE POSIX compatible matching functions (and more).
3
4  This software is released under a BSD-style license.
5  See the file LICENSE for details and copyright.
6
7*/
8
9#ifdef HAVE_CONFIG_H
10#include <config.h>
11#endif /* HAVE_CONFIG_H */
12
13#ifdef TRE_USE_ALLOCA
14/* AIX requires this to be the first thing in the file.	 */
15#ifndef __GNUC__
16# if HAVE_ALLOCA_H
17#  include <alloca.h>
18# else
19#  ifdef _AIX
20 #pragma alloca
21#  else
22#   ifndef alloca /* predefined by HP cc +Olibcalls */
23char *alloca ();
24#   endif
25#  endif
26# endif
27#endif
28#endif /* TRE_USE_ALLOCA */
29
30#include <assert.h>
31#include <stdlib.h>
32#include <string.h>
33#ifdef HAVE_WCHAR_H
34#include <wchar.h>
35#endif /* HAVE_WCHAR_H */
36#ifdef HAVE_WCTYPE_H
37#include <wctype.h>
38#endif /* HAVE_WCTYPE_H */
39#ifndef TRE_WCHAR
40#include <ctype.h>
41#endif /* !TRE_WCHAR */
42#ifdef HAVE_MALLOC_H
43#include <malloc.h>
44#endif /* HAVE_MALLOC_H */
45#include <limits.h>
46
47#include "tre-internal.h"
48#include "tre-match-utils.h"
49#include "tre.h"
50#include "xmalloc.h"
51
52
53/* For each tre_last_matched_t in the lm array, find the last matched branch by
54   comparing the touch value of the cmp_tag's.  For all other branches, reset
55   the corresponding tags.  If reset_all is non-zero, reset all tags in all
56   branches.  Recurse into the nested last matched structures, clearing tags as
57   apprpriate. */
58static void
59tre_reset_last_matched_branches(tre_tag_t *tags, const tre_last_matched_t *lm,
60			        int n, int start_tag, int reset_all)
61{
62  int max, i, reset;
63  tre_last_matched_branch_t *b;
64
65  DPRINT(("tre_reset_last_matched_branches: n=%d start_tag=%d reset_all=%d\n",
66	  n, start_tag, reset_all));
67  for (; n-- > 0; lm++)
68    {
69      if (lm->n_branches == 1)
70	{
71	  b = lm->branches;
72	  if (start_tag > 0)
73	    {
74	      DPRINT(("  b->cmp_tag=%d %d <? %d\n", b->cmp_tag,
75		      tre_tag_touch_get(tags, b->cmp_tag),
76		      tre_tag_touch_get(tags, start_tag)));
77	      reset = (reset_all || tre_tag_touch_get(tags, b->cmp_tag) <
78		       tre_tag_touch_get(tags, start_tag));
79	    }
80	  else
81	    reset = 0;
82
83	  if (reset)
84	    {
85	      int *t;
86
87	      for (i = b->n_tags, t = b->tags; i > 0; i--, t++)
88		{
89		  DPRINT(("  Resetting t%d\n", *t));
90		  tre_tag_reset(tags, *t);
91		}
92	    }
93	  if (b->n_last_matched > 0)
94	    tre_reset_last_matched_branches(tags, b->last_matched,
95						b->n_last_matched,
96						lm->start_tag, reset);
97	}
98      else
99	{
100	  if (!reset_all)
101	    {
102#ifdef TRE_DEBUG
103	      int last;
104#endif /* TRE_DEBUG */
105	      max = 0;
106	      for (i = lm->n_branches, b = lm->branches; i > 0; i--, b++)
107		{
108		  int t = b->cmp_tag;
109		  int touch = tre_tag_touch_get(tags, t);
110		  if (touch > max)
111		    {
112		      max = touch;
113#ifdef TRE_DEBUG
114		      last = t;
115#endif /* TRE_DEBUG */
116		    }
117		}
118	      DPRINT(("  Last touched end tag t%d=%d\n", last, max));
119	    }
120
121	  for (i = lm->n_branches, b = lm->branches; i > 0; i--, b++)
122	    {
123	      reset = (reset_all || tre_tag_touch_get(tags, b->cmp_tag) < max);
124	      if (reset)
125		{
126		  int j;
127		  int *t;
128
129		  for (j = b->n_tags, t = b->tags; j > 0; j--, t++)
130		    {
131		      DPRINT(("  Resetting t%d\n", *t));
132		      tre_tag_reset(tags, *t);
133		    }
134		}
135	      if (b->n_last_matched > 0)
136		tre_reset_last_matched_branches(tags, b->last_matched,
137						b->n_last_matched,
138						lm->start_tag, reset);
139	    }
140	}
141    }
142}
143
144
145/* Fills the POSIX.2 regmatch_t array according to the TNFA tag and match
146   endpoint values. */
147reg_errcode_t
148tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
149		const tre_tnfa_t *tnfa, const tre_tag_t *intags, int match_eo)
150{
151  unsigned int i;
152
153  if (cflags & REG_NOSUB) return REG_OK;
154
155  i = 0;
156  if (match_eo >= 0 && intags)
157    {
158      const tre_tag_t *tags = intags;
159      tre_submatch_data_t *submatch_data;
160
161      if (tnfa->last_matched_branch &&
162	  tnfa->last_matched_branch->n_last_matched > 0)
163	{
164	  tre_tag_t *t;
165#ifdef TRE_USE_ALLOCA
166	  t = alloca(sizeof(*t) * tnfa->num_tags);
167#else /* !TRE_USE_ALLOCA */
168	  t = xmalloc(sizeof(*t) * tnfa->num_tags);
169#endif /* !TRE_USE_ALLOCA */
170	  if (!t) return REG_ESPACE;
171	  memcpy(t, intags, tnfa->num_tags * sizeof(tre_tag_t));
172	  tre_reset_last_matched_branches(t,
173				    tnfa->last_matched_branch->last_matched,
174				    tnfa->last_matched_branch->n_last_matched,
175				    0, 0);
176	  tags = t;
177	}
178      /* Construct submatch offsets from the tags. */
179      DPRINT(("end tag = t%d = %d\n", tnfa->end_tag, match_eo));
180      submatch_data = tnfa->submatch_data;
181      while (i < tnfa->num_submatches && i < nmatch)
182	{
183	  if (submatch_data[i].so_tag == tnfa->end_tag)
184	    pmatch[i].rm_so = match_eo;
185	  else
186	    pmatch[i].rm_so = tre_tag_get(tags, submatch_data[i].so_tag);
187
188	  if (submatch_data[i].eo_tag == tnfa->end_tag)
189	    pmatch[i].rm_eo = match_eo;
190	  else
191	    pmatch[i].rm_eo = tre_tag_get(tags, submatch_data[i].eo_tag);
192
193	  /* If either of the endpoints were not used, this submatch
194	     was not part of the match. */
195	  if (pmatch[i].rm_so == -1 || pmatch[i].rm_eo == -1)
196	    pmatch[i].rm_so = pmatch[i].rm_eo = -1;
197
198	  DPRINT(("pmatch[%d] = {t%d = %qd, t%d = %qd}\n", i,
199		  submatch_data[i].so_tag, pmatch[i].rm_so,
200		  submatch_data[i].eo_tag, pmatch[i].rm_eo));
201	  i++;
202	}
203#ifndef TRE_USE_ALLOCA
204	if (tags != intags) xfree(tags);
205#endif /* !TRE_USE_ALLOCA */
206    }
207
208  while (i < nmatch)
209    {
210      pmatch[i].rm_so = -1;
211      pmatch[i].rm_eo = -1;
212      i++;
213    }
214
215  return REG_OK;
216}
217
218
219/*
220  Wrapper functions for POSIX compatible regexp matching.
221*/
222
223#ifndef __LIBC__
224int
225tre_have_backrefs(const regex_t *preg)
226{
227  tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
228  return tnfa->have_backrefs;
229}
230
231#ifdef TRE_APPROX
232int
233tre_have_approx(const regex_t *preg)
234{
235  tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
236  return tnfa->have_approx;
237}
238#endif /* TRE_APPROX */
239#endif /* !__LIBC__ */
240
241static int
242tre_match(const tre_tnfa_t *tnfa, const void *string, size_t len,
243	  tre_str_type_t type, size_t nmatch, regmatch_t pmatch[],
244	  int eflags)
245{
246  reg_errcode_t status;
247  tre_tag_t *tags = NULL;
248  int eo;
249  size_t offset = 0, count = 0;
250  if (tnfa->num_tags > 0 && nmatch > 0)
251    {
252#ifdef TRE_USE_ALLOCA
253      tags = alloca(sizeof(*tags) * tnfa->num_tags);
254#else /* !TRE_USE_ALLOCA */
255      tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
256#endif /* !TRE_USE_ALLOCA */
257      if (tags == NULL)
258	return REG_ESPACE;
259    }
260
261  if (
262#ifdef TRE_STR_USER
263      type != STR_USER &&
264#endif /* TRE_STR_USER */
265      (eflags & REG_STARTEND) && pmatch)
266    {
267      if (pmatch->rm_so < 0)
268	return REG_INVARG;
269      if (len == (size_t)-1)
270	{
271	  if (pmatch->rm_eo < 0 || pmatch->rm_so > pmatch->rm_eo)
272	    return REG_INVARG;
273	  len = pmatch->rm_eo - pmatch->rm_so;
274	}
275      count = offset = pmatch->rm_so;
276      if (type == STR_WIDE) offset *= sizeof(wchar_t);
277    }
278
279  /* Dispatch to the appropriate matcher. */
280  if (tnfa->have_backrefs || eflags & REG_BACKTRACKING_MATCHER)
281    {
282      /* The regex has back references, use the backtracking matcher. */
283#ifdef TRE_STR_USER
284      if (type == STR_USER)
285	{
286	  const tre_str_source *source = string;
287	  if (source->rewind == NULL || source->compare == NULL)
288	    /* The backtracking matcher requires rewind and compare
289	       capabilities from the input stream. */
290	    return REG_BADPAT;
291	}
292#endif /* TRE_STR_USER */
293      status = tre_tnfa_run_backtrack(tnfa, string + offset, (int)len, type,
294				      tags, eflags, &eo);
295    }
296#ifdef TRE_APPROX
297  else if (tnfa->have_approx || eflags & REG_APPROX_MATCHER)
298    {
299      /* The regex uses approximate matching, use the approximate matcher. */
300      regamatch_t match;
301      regaparams_t params;
302      tre_regaparams_default(&params);
303      params.max_err = 0;
304      params.max_cost = 0;
305      status = tre_tnfa_run_approx(tnfa, string + offset, (int)len, type, tags,
306				   &match, params, eflags, &eo);
307    }
308#endif /* TRE_APPROX */
309  else
310    {
311      /* Exact matching, no back references, use the parallel matcher. */
312      status = tre_tnfa_run_parallel(tnfa, string + offset, (int)len, type,
313				     tags, eflags, &eo);
314    }
315
316  if (status == REG_OK)
317    {
318      /* A match was found, so fill the submatch registers. */
319      status = tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo);
320      /* If doing REG_STARTEND, adjust the pmatch array (we can't build
321         this into tre_fill_pmatch, because tre_tnfa_run_backtrack calls
322	 tre_fill_pmatch itself). */
323      if (status == REG_OK && !(tnfa->cflags & REG_NOSUB) &&
324#ifdef TRE_STR_USER
325	  type != STR_USER &&
326#endif /* TRE_STR_USER */
327	  (eflags & REG_STARTEND) && pmatch && nmatch > 0)
328	{
329	  size_t i;
330	  regmatch_t *p;
331	  for (i = nmatch, p = pmatch; i > 0; p++, i--)
332	    {
333	      if (p->rm_so >= 0) p->rm_so += count;
334	      if (p->rm_eo >= 0) p->rm_eo += count;
335	    }
336	}
337    }
338#ifndef TRE_USE_ALLOCA
339  if (tags)
340    xfree(tags);
341#endif /* !TRE_USE_ALLOCA */
342  return status;
343}
344
345int
346tre_regnexec(const regex_t *preg, const char *str, size_t len,
347	 size_t nmatch, regmatch_t pmatch[], int eflags)
348{
349  tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
350  tre_str_type_t type = (TRE_MB_CUR_MAX_L(tnfa->loc) == 1) ? STR_BYTE : STR_MBS;
351
352#ifdef TRE_USE_SYSTEM_REGEX_H
353  if (preg->re_magic != RE_MAGIC) return REG_BADPAT;
354#endif /* TRE_USE_SYSTEM_REGEX_H */
355
356  return tre_match(tnfa, str, len, type, nmatch, pmatch, eflags);
357}
358
359int
360tre_regexec(const regex_t *preg, const char *str,
361	size_t nmatch, regmatch_t pmatch[], int eflags)
362{
363  return tre_regnexec(preg, str, (size_t)-1, nmatch, pmatch, eflags);
364}
365
366
367#ifdef TRE_WCHAR
368
369int
370tre_regwnexec(const regex_t *preg, const wchar_t *str, size_t len,
371	  size_t nmatch, regmatch_t pmatch[], int eflags)
372{
373  tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
374
375#ifdef TRE_USE_SYSTEM_REGEX_H
376  if (preg->re_magic != RE_MAGIC) return REG_BADPAT;
377#endif /* TRE_USE_SYSTEM_REGEX_H */
378
379  return tre_match(tnfa, str, len, STR_WIDE, nmatch, pmatch, eflags);
380}
381
382int
383tre_regwexec(const regex_t *preg, const wchar_t *str,
384	 size_t nmatch, regmatch_t pmatch[], int eflags)
385{
386  return tre_regwnexec(preg, str, (size_t)-1, nmatch, pmatch, eflags);
387}
388
389#endif /* TRE_WCHAR */
390
391#ifdef TRE_STR_USER
392int
393tre_reguexec(const regex_t *preg, const tre_str_source *str,
394	 size_t nmatch, regmatch_t pmatch[], int eflags)
395{
396  tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
397
398#ifdef TRE_USE_SYSTEM_REGEX_H
399  if (preg->re_magic != RE_MAGIC) return REG_BADPAT;
400#endif /* TRE_USE_SYSTEM_REGEX_H */
401
402  return tre_match(tnfa, str, (size_t)-1, STR_USER, nmatch, pmatch, eflags);
403}
404#endif /* TRE_STR_USER */
405
406
407#ifdef TRE_APPROX
408
409/*
410  Wrapper functions for approximate regexp matching.
411*/
412
413static int
414tre_match_approx(const tre_tnfa_t *tnfa, const void *string, size_t len,
415		 tre_str_type_t type, regamatch_t *match, regaparams_t params,
416		 int eflags)
417{
418  reg_errcode_t status;
419  tre_tag_t *tags = NULL;
420  int eo;
421  size_t offset = 0, count = 0;
422
423  /* If the regexp does not use approximate matching features, the
424     maximum cost is zero, and the approximate matcher isn't forced,
425     use the exact matcher instead. */
426  if (params.max_cost == 0 && !tnfa->have_approx
427      && !(eflags & REG_APPROX_MATCHER))
428    return tre_match(tnfa, string, len, type, match->nmatch, match->pmatch,
429		     eflags);
430
431  /* Back references are not supported by the approximate matcher. */
432  if (tnfa->have_backrefs)
433    return REG_BADPAT;
434
435  if (tnfa->num_tags > 0 && match->nmatch > 0)
436    {
437#if TRE_USE_ALLOCA
438      tags = alloca(sizeof(*tags) * tnfa->num_tags);
439#else /* !TRE_USE_ALLOCA */
440      tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
441#endif /* !TRE_USE_ALLOCA */
442      if (tags == NULL)
443	return REG_ESPACE;
444    }
445
446  if (
447#ifdef TRE_STR_USER
448      type != STR_USER &&
449#endif /* TRE_STR_USER */
450      (eflags & REG_STARTEND) && match->pmatch)
451    {
452      if (match->pmatch->rm_so < 0)
453	return REG_INVARG;
454      if (len == (size_t)-1)
455	{
456	  if (match->pmatch->rm_eo < 0 || match->pmatch->rm_so >
457	      match->pmatch->rm_eo)
458	    return REG_INVARG;
459	  len = match->pmatch->rm_eo - match->pmatch->rm_so;
460	}
461      count = offset = match->pmatch->rm_so;
462      if (type == STR_WIDE) offset *= sizeof(wchar_t);
463    }
464
465  status = tre_tnfa_run_approx(tnfa, string, (int)len, type, tags,
466			       match, params, eflags, &eo);
467  if (status == REG_OK)
468    {
469      status = tre_fill_pmatch(match->nmatch, match->pmatch, tnfa->cflags,
470			       tnfa, tags, eo);
471      /* If doing REG_STARTEND, adjust the pmatch array (we can't build
472         this into tre_fill_pmatch, because tre_tnfa_run_backtrack call
473	 tre_fill_pmatch itself). */
474      if (status == REG_OK && !(tnfa->cflags & REG_NOSUB) &&
475#ifdef TRE_STR_USER
476	  type != STR_USER &&
477#endif /* TRE_STR_USER */
478	  (eflags & REG_STARTEND) && match->pmatch && match->nmatch > 0)
479	{
480	  size_t i;
481	  regmatch_t *p;
482	  for (i = match->nmatch, p = match->pmatch; i > 0; p++, i--)
483	    {
484	      if (p->rm_so >= 0) p->rm_so += count;
485	      if (p->rm_eo >= 0) p->rm_eo += count;
486	    }
487	}
488    }
489#ifndef TRE_USE_ALLOCA
490  if (tags)
491    xfree(tags);
492#endif /* !TRE_USE_ALLOCA */
493  return status;
494}
495
496int
497tre_reganexec(const regex_t *preg, const char *str, size_t len,
498	  regamatch_t *match, regaparams_t params, int eflags)
499{
500  tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
501  tre_str_type_t type = (TRE_MB_CUR_MAX_L(tnfa->loc) == 1) ? STR_BYTE : STR_MBS;
502
503#ifdef TRE_USE_SYSTEM_REGEX_H
504  if (preg->re_magic != RE_MAGIC) return REG_BADPAT;
505#endif /* TRE_USE_SYSTEM_REGEX_H */
506
507  return tre_match_approx(tnfa, str, len, type, match, params, eflags);
508}
509
510int
511tre_regaexec(const regex_t *preg, const char *str,
512	 regamatch_t *match, regaparams_t params, int eflags)
513{
514  return tre_reganexec(preg, str, (size_t)-1, match, params, eflags);
515}
516
517#ifdef TRE_WCHAR
518
519int
520tre_regawnexec(const regex_t *preg, const wchar_t *str, size_t len,
521	   regamatch_t *match, regaparams_t params, int eflags)
522{
523  tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
524
525#ifdef TRE_USE_SYSTEM_REGEX_H
526  if (preg->re_magic != RE_MAGIC) return REG_BADPAT;
527#endif /* TRE_USE_SYSTEM_REGEX_H */
528
529  return tre_match_approx(tnfa, str, len, STR_WIDE,
530			  match, params, eflags);
531}
532
533int
534tre_regawexec(const regex_t *preg, const wchar_t *str,
535	  regamatch_t *match, regaparams_t params, int eflags)
536{
537  return tre_regawnexec(preg, str, (size_t)-1, match, params, eflags);
538}
539
540#endif /* TRE_WCHAR */
541
542void
543tre_regaparams_default(regaparams_t *params)
544{
545  memset(params, 0, sizeof(*params));
546  params->cost_ins = 1;
547  params->cost_del = 1;
548  params->cost_subst = 1;
549  params->max_cost = INT_MAX;
550  params->max_ins = INT_MAX;
551  params->max_del = INT_MAX;
552  params->max_subst = INT_MAX;
553  params->max_err = INT_MAX;
554}
555
556#endif /* TRE_APPROX */
557
558/* EOF */
559