1/*
2  tre-match-utils.h - TRE matcher helper definitions
3
4  This software is released under a BSD-style license.
5  See the file LICENSE for details and copyright.
6
7*/
8
9#include "tre-internal.h"
10
11#define str_source ((const tre_str_source*)string)
12
13#ifdef TRE_WCHAR
14
15#ifdef TRE_MULTIBYTE
16
17/* Wide character and multibyte support. */
18
19#ifdef TRE_STR_USER
20#error TRE_STR_USER defined
21#define GET_NEXT_WCHAR()						      \
22  do {									      \
23    prev_c = next_c;							      \
24    if (type == STR_BYTE)						      \
25      {									      \
26	pos++;								      \
27	if (len >= 0 && pos >= len)					      \
28	  next_c = '\0';						      \
29	else								      \
30	  next_c = (unsigned char)(*str_byte++);			      \
31      }									      \
32    else if (type == STR_WIDE)						      \
33      {									      \
34	pos++;								      \
35	if (len >= 0 && pos >= len)					      \
36	  next_c = L'\0';						      \
37	else								      \
38	  next_c = *str_wide++;						      \
39      }									      \
40    else if (type == STR_MBS)						      \
41      {									      \
42        pos += pos_add_next;					      	      \
43	if (str_byte == NULL)						      \
44	  next_c = L'\0';						      \
45	else								      \
46	  {								      \
47	    size_t w;							      \
48	    int max;							      \
49	    if (len >= 0)						      \
50	      max = len - pos;						      \
51	    else							      \
52	      max = 32;							      \
53	    if (max <= 0)						      \
54	      {								      \
55		next_c = L'\0';						      \
56		pos_add_next = 1;					      \
57	      }								      \
58	    else							      \
59	      {								      \
60		w = tre_mbrtowc_l(&next_c, str_byte, (size_t)max, &mbstate,   \
61				  tnfa->loc);                                 \
62		if (w == (size_t)-1 || w == (size_t)-2)			      \
63		  return REG_ILLSEQ;					      \
64		if (w == 0 && len >= 0)					      \
65		  {							      \
66		    pos_add_next = 1;					      \
67		    next_c = 0;						      \
68		    str_byte++;						      \
69		  }							      \
70		else							      \
71		  {							      \
72		    pos_add_next = w;					      \
73		    str_byte += w;					      \
74		  }							      \
75	      }								      \
76	  }								      \
77      }									      \
78    else if (type == STR_USER)						      \
79      {									      \
80        pos += pos_add_next;					      	      \
81	str_user_end = str_source->get_next_char(&next_c, &pos_add_next,      \
82                                                 str_source->context);	      \
83      }									      \
84  } while(/*CONSTCOND*/0)
85#else /* !TRE_STR_USER */
86/*
87 * Because all multibyte encodings are exclusively single-shift encoding,
88 * with the shift codes having the high bit set, we can make an optimization
89 * for STR_MBS that only calls tre_mbrtowc_l() when a high-bit character
90 * is detected, and just do a direct copy for ASCII characters.
91 */
92#define GET_NEXT_WCHAR()						      \
93  do {									      \
94    prev_c = next_c;							      \
95    switch (type)							      \
96      {									      \
97      case STR_BYTE:							      \
98	pos++;								      \
99	if (len >= 0 && pos >= len)					      \
100	  next_c = '\0';						      \
101	else								      \
102	  next_c = (unsigned char)(*str_byte++);			      \
103	break;								      \
104      case STR_WIDE:							      \
105	pos++;								      \
106	if (len >= 0 && pos >= len)					      \
107	  next_c = L'\0';						      \
108	else								      \
109	  next_c = *str_wide++;						      \
110	break;								      \
111      case STR_MBS:							      \
112	pos += pos_add_next;					      	      \
113	if (__builtin_expect(len >= 0 && pos >= len, 0))		      \
114	  {								      \
115	    next_c = L'\0';						      \
116	    pos_add_next = 1;						      \
117	  }								      \
118	else if (__builtin_expect(!(*str_byte & 0x80), 1))		      \
119	  {								      \
120	    next_c = (unsigned char)(*str_byte++);			      \
121	    pos_add_next = 1;						      \
122	  }								      \
123	else								      \
124	  {								      \
125	    size_t w;							      \
126	    int max;							      \
127	    if (len >= 0)						      \
128	      max = len - pos;						      \
129	    else							      \
130	      max = 32;							      \
131	    w = tre_mbrtowc_l(&next_c, str_byte, (size_t)max, &mbstate,	      \
132			      tnfa->loc);				      \
133	    if (w == (size_t)-1 || w == (size_t)-2)			      \
134	      return REG_ILLSEQ;					      \
135	    if (w == 0 && len >= 0)					      \
136	      {								      \
137		pos_add_next = 1;					      \
138		next_c = 0;						      \
139		str_byte++;						      \
140	      }								      \
141	    else							      \
142	      {								      \
143		pos_add_next = w;					      \
144		str_byte += w;						      \
145	      }								      \
146	  }								      \
147	break;								      \
148      }									      \
149  } while(/*CONSTCOND*/0)
150#endif /* !TRE_STR_USER */
151
152#else /* !TRE_MULTIBYTE */
153
154/* Wide character support, no multibyte support. */
155#error TRE_MULTIBYTE undefined
156
157#ifdef TRE_STR_USER
158#define GET_NEXT_WCHAR()						      \
159  do {									      \
160    prev_c = next_c;							      \
161    if (type == STR_BYTE)						      \
162      {									      \
163	pos++;								      \
164	if (len >= 0 && pos >= len)					      \
165	  next_c = '\0';						      \
166	else								      \
167	  next_c = (unsigned char)(*str_byte++);			      \
168      }									      \
169    else if (type == STR_WIDE)						      \
170      {									      \
171	pos++;								      \
172	if (len >= 0 && pos >= len)					      \
173	  next_c = L'\0';						      \
174	else								      \
175	  next_c = *str_wide++;						      \
176      }									      \
177    else if (type == STR_USER)						      \
178      {									      \
179        pos += pos_add_next;					      	      \
180	str_user_end = str_source->get_next_char(&next_c, &pos_add_next,      \
181                                                 str_source->context);	      \
182      }									      \
183  } while(/*CONSTCOND*/0)
184#else /* !TRE_STR_USER */
185#define GET_NEXT_WCHAR()						      \
186  do {									      \
187    prev_c = next_c;							      \
188    if (type == STR_BYTE)						      \
189      {									      \
190	pos++;								      \
191	if (len >= 0 && pos >= len)					      \
192	  next_c = '\0';						      \
193	else								      \
194	  next_c = (unsigned char)(*str_byte++);			      \
195      }									      \
196    else if (type == STR_WIDE)						      \
197      {									      \
198	pos++;								      \
199	if (len >= 0 && pos >= len)					      \
200	  next_c = L'\0';						      \
201	else								      \
202	  next_c = *str_wide++;						      \
203      }									      \
204  } while(/*CONSTCOND*/0)
205#endif /* !TRE_STR_USER */
206
207#endif /* !TRE_MULTIBYTE */
208
209#else /* !TRE_WCHAR */
210
211/* No wide character or multibyte support. */
212#error TRE_WCHAR undefined
213
214#ifdef TRE_STR_USER
215#define GET_NEXT_WCHAR()						      \
216  do {									      \
217    prev_c = next_c;							      \
218    if (type == STR_BYTE)						      \
219      {									      \
220	pos++;								      \
221	if (len >= 0 && pos >= len)					      \
222	  next_c = '\0';						      \
223	else								      \
224	  next_c = (unsigned char)(*str_byte++);			      \
225      }									      \
226    else if (type == STR_USER)						      \
227      {									      \
228	pos += pos_add_next;						      \
229	str_user_end = str_source->get_next_char(&next_c, &pos_add_next,      \
230						 str_source->context);	      \
231      }									      \
232  } while(/*CONSTCOND*/0)
233#else /* !TRE_STR_USER */
234#define GET_NEXT_WCHAR()						      \
235  do {									      \
236    prev_c = next_c;							      \
237    if (type == STR_BYTE)						      \
238      {									      \
239	pos++;								      \
240	if (len >= 0 && pos >= len)					      \
241	  next_c = '\0';						      \
242	else								      \
243	  next_c = (unsigned char)(*str_byte++);			      \
244      }									      \
245  } while(/*CONSTCOND*/0)
246#endif /* !TRE_STR_USER */
247
248#endif /* !TRE_WCHAR */
249
250
251
252/* Assumes tre_tnfa_t *tnfa in scope */
253#define IS_WORD_CHAR(c)	 ((c) == L'_' || tre_isalnum_l(c, tnfa->loc))
254
255#define CHECK_ASSERTIONS(assertions)					      \
256  (((assertions & ASSERT_AT_BOL)					      \
257    && (pos > 0 || reg_notbol)						      \
258    && (prev_c != L'\n' || !reg_newline))				      \
259   || ((assertions & ASSERT_AT_EOL)					      \
260       && (next_c != L'\0' || reg_noteol)				      \
261       && (next_c != L'\n' || !reg_newline))				      \
262   || ((assertions & ASSERT_AT_BOW)					      \
263       && (IS_WORD_CHAR(prev_c) || !IS_WORD_CHAR(next_c)))	              \
264   || ((assertions & ASSERT_AT_EOW)					      \
265       && (!IS_WORD_CHAR(prev_c) || IS_WORD_CHAR(next_c)))		      \
266   || ((assertions & ASSERT_AT_WB)					      \
267       && (pos != 0 && next_c != L'\0'					      \
268	   && IS_WORD_CHAR(prev_c) == IS_WORD_CHAR(next_c)))		      \
269   || ((assertions & ASSERT_AT_WB_NEG)					      \
270       && (pos == 0 || next_c == L'\0'					      \
271	   || IS_WORD_CHAR(prev_c) != IS_WORD_CHAR(next_c))))
272
273#define CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)                             \
274  ((trans_i->assertions & ASSERT_BRACKET_MATCH)                               \
275    && !tre_bracket_match(trans_i->u.bracket_match_list,(tre_cint_t)prev_c,    \
276                                      tnfa))
277
278
279
280
281inline static int
282tre_tag_get(const tre_tag_t *tags, int i)
283{
284  tags += i;
285  return tags->count > 0 ? tags->value : -1;
286}
287
288inline static void
289tre_tag_set(tre_tag_t *tags, int i, int val, int touch)
290{
291  tags += i;
292  if (tags->count++ == 0)
293    tags->first = val;
294  tags->value = val;
295  tags->touch = touch;
296}
297
298inline static void
299tre_tag_reset(tre_tag_t *tags, int i)
300{
301  tags[i].count = 0;
302}
303
304inline static int
305tre_tag_touch_get(const tre_tag_t *tags, int i)
306{
307  return tags[i].touch;
308}
309
310#ifdef TRE_DEBUG
311inline static void
312tre_print_tags(const tre_tag_t *tags, int num_tags)
313{
314  int i;
315  for (i = 0; i < num_tags; i++, tags++)
316    {
317      switch(tags->count)
318	{
319	case 0:
320	  DPRINT(("%d:(0,-1)", i));
321	  break;
322	case 1:
323	  DPRINT(("%d:(1,%d)", i, tags->first));
324	  break;
325	default:
326	  DPRINT(("%d:(%d,%d,%d)", i, tags->count, tags->first,
327		  tags->value));
328	  break;
329	}
330      if (i < (num_tags - 1))
331	DPRINT((" "));
332    }
333}
334
335inline static void
336tre_print_tags_all(const tre_tag_t *tags, int num_tags)
337{
338  int i;
339  for (i = 0; i < num_tags; i++, tags++)
340    {
341      switch(tags->count)
342	{
343	case 0:
344	  DPRINT(("%d:(0,-1)/%d", i, tags->touch));
345	  break;
346	case 1:
347	  DPRINT(("%d:(1,%d)/%d", i, tags->first, tags->touch));
348	  break;
349	default:
350	  DPRINT(("%d:(%d,%d,%d)/%d", i, tags->count, tags->first,
351		  tags->value, tags->touch));
352	  break;
353	}
354      if (i < (num_tags - 1))
355	DPRINT((" "));
356    }
357}
358#endif /* TRE_DEBUG */
359
360/* Return < 0, = 0 or > 0 depending on how the start/end pairs of a minimal
361 * group between t1 and t2 compare (t1 loses if < 0, t1 wins if > 0) */
362inline static int
363tre_minimal_tag_order(int start, int end, const tre_tag_t *tags1,
364		      const tre_tag_t *tags2)
365{
366  const tre_tag_t *t1, *t2;
367
368  t1 = tags1 + start;
369  t2 = tags2 + start;
370  /* We need both start tags to be set */
371  if (t1->count == 0 || t2->count == 0)
372    return 0;
373
374  /* The start tags must be equal */
375  if (t1->value != t2->value)
376    return 0;
377
378  t1 = tags1 + end;
379  t2 = tags2 + end;
380  /* For the end tags, we prefer set over unset, because unset means that
381   * the end tag is still growing */
382  if (t1->count == 0)
383    {
384      /* if t2 is set, t1 loses since it is unset */
385      if (t2->count != 0)
386	return -1;
387    }
388  /* if t2 not set, t1 wins since it is set */
389  else if (t2->count == 0)
390    return 1;
391
392  /* least current value wins */
393  return t2->value - t1->value;
394}
395
396/* Return < 0, = 0 or > 0 depending on how the i-th item of t1 and t2 compare
397 * (t1 loses if < 0, t1 wins if > 0) */
398inline static int
399tre_tag_order_1(int i, tre_tag_direction_t dir, const tre_tag_t *t1,
400		const tre_tag_t *t2)
401{
402  int diff;
403
404  t1 += i;
405  t2 += i;
406  switch (dir)
407    {
408    case TRE_TAG_MINIMIZE:
409      /* least current value wins (because tags are initialized to all zeros,
410       * unset wins over set; also, tre_minimal_tag_order() will have already
411       * been run, which checks for being unset) */
412      return t2->value - t1->value;
413
414    case TRE_TAG_MAXIMIZE:
415      /* prefer set */
416      if (t1->count == 0)
417	{
418	  /* if neither t1 and t2 are set, try next tag */
419	  if (t2->count == 0)
420	    return 0;
421	  /* t2 is set, t1 loses since it is unset */
422	  return -1;
423	}
424      /* if t2 not set, t1 wins since it is set */
425      else if (t2->count == 0)
426	return 1;
427      /* greatest initial value wins */
428      if ((diff = t1->first - t2->first) != 0)
429	return diff;
430      /* least number of times the tag was set, wins */
431      if ((diff = t2->count - t1->count) != 0)
432	return diff;
433      /* if the tags were only set once, they only have initial values */
434      if (t1->count == 1)
435	return 0;
436      /* greatest current value wins */
437      return t1->value - t2->value;
438
439    case TRE_TAG_LEFT_MAXIMIZE:
440      /* prefer set */
441      if (t1->count == 0)
442	{
443	  /* if neither t1 and t2 are set, try next tag */
444	  if (t2->count == 0)
445	    return 0;
446	  /* t2 is set, t1 loses since it is unset */
447	  return -1;
448	}
449      /* if t2 not set, t1 wins since it is set */
450      else if (t2->count == 0)
451	return 1;
452      /* least initial value wins */
453      if ((diff = t2->first - t1->first) != 0)
454	return diff;
455      /* least number of times the tag was set, wins */
456      if ((diff = t2->count - t1->count) != 0)
457	return diff;
458      /* if the tags were only set once, they only have initial values */
459      if (t1->count == 1)
460	return 0;
461      /* greatest current value wins */
462      return t1->value - t2->value;
463
464    default:
465      /* Shouldn't happen: only assert if TRE_DEBUG defined */
466      assert(0);
467      break;
468    }
469  return 0;
470}
471
472#ifdef TRE_DEBUG
473#define _MORE_DEBUGGING
474#endif /* TRE_DEBUG */
475
476/* Returns 1 if `t1' wins `t2', 0 otherwise. */
477inline static int
478#ifdef _MORE_DEBUGGING
479_tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions,
480	      const tre_tag_t *t1, const tre_tag_t *t2)
481#else /* !_MORE_DEBUGGING */
482tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions,
483	      const tre_tag_t *t1, const tre_tag_t *t2)
484#endif /* !_MORE_DEBUGGING */
485{
486  int i, ret;
487
488  for (i = 0; i < num_tags; i++)
489    {
490      if ((ret = tre_tag_order_1(i, tag_directions[i], t1, t2)) != 0)
491	return (ret > 0);
492    }
493  /*  assert(0);*/
494  return 0;
495}
496
497#ifdef _MORE_DEBUGGING
498inline static int
499tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions,
500	      const tre_tag_t *t1, const tre_tag_t *t2)
501{
502  int ret = _tre_tag_order(num_tags, tag_directions, t1, t2);
503  DPRINT(("tre_tag_order: "));
504  tre_print_tags(t1, num_tags);
505  DPRINT((" %s ", ret ? "wins" : "doesn't win"));
506  tre_print_tags(t2, num_tags);
507  DPRINT(("\n"));
508  return ret;
509}
510#endif /* _MORE_DEBUGGING */
511
512#ifdef __LIBC__
513#include <xlocale_private.h>
514#else /* !__LIBC__ */
515#include <xlocale.h>
516#endif /* !__LIBC__ */
517
518int __collate_equiv_value(locale_t loc, const wchar_t *str, size_t len);
519
520inline static int
521tre_bracket_match(tre_bracket_match_list_t * __restrict list, tre_cint_t wc,
522		  const tre_tnfa_t * __restrict tnfa)
523{
524  int match = 0;
525  int i;
526  tre_bracket_match_t *b;
527  tre_cint_t uc, lc;
528  int we, ue, le, got_equiv = 0;
529  int icase = ((tnfa->cflags & REG_ICASE) != 0);
530
531  DPRINT(("tre_bracket_match: %p, %d, %d\n", list, wc, icase));
532  if (icase)
533    {
534      if (tre_islower_l(wc, tnfa->loc))
535	{
536	  lc = wc;
537	  uc = tre_toupper_l(wc, tnfa->loc);
538	}
539      else if (tre_isupper_l(wc, tnfa->loc))
540	{
541	  uc = wc;
542	  lc = tre_tolower_l(wc, tnfa->loc);
543	}
544      else
545	{
546	  icase = 0;
547	}
548    }
549  for (i = 0, b = list->bracket_matches; i < list->num_bracket_matches;
550       i++, b++)
551    {
552      switch (b->type)
553	{
554	case TRE_BRACKET_MATCH_TYPE_CHAR:
555	  if (icase)
556	    match = (b->value == uc || b->value == lc);
557	  else
558	    match = (b->value == wc);
559	  break;
560	case TRE_BRACKET_MATCH_TYPE_RANGE_BEGIN:
561	  {
562	    tre_cint_t start = b->value, end;
563	    if (++i >= list->num_bracket_matches ||
564		(++b)->type != TRE_BRACKET_MATCH_TYPE_RANGE_END)
565	      {
566		DPRINT(("tre_bracket_match: no following range end\n"));
567		assert(0);
568		goto error;
569	      }
570	    end = b->value;
571	    if (!got_equiv)
572	      {
573		if (icase)
574		  {
575		    ue = __collate_equiv_value(tnfa->loc, &uc, 1);
576		    le = __collate_equiv_value(tnfa->loc, &lc, 1);
577		  }
578		else
579		    we = __collate_equiv_value(tnfa->loc, &wc, 1);
580		got_equiv = 1;
581	      }
582	    if (icase)
583	      match = ((start <= ue && ue <= end) ||
584		      (start <= le && le <= end));
585	    else
586	      match = (start <= we && we <= end);
587	    break;
588	  }
589	case TRE_BRACKET_MATCH_TYPE_RANGE_END:
590	  DPRINT(("tre_bracket_match: range end without preceeding start\n"));
591	  assert(0);
592	  break;
593	case TRE_BRACKET_MATCH_TYPE_CLASS:
594	  if (icase)
595	    match = (tre_isctype_l(uc, b->value, tnfa->loc) ||
596		     tre_isctype_l(lc, b->value, tnfa->loc));
597	  else
598	    match = (tre_isctype_l(wc, b->value, tnfa->loc));
599	  break;
600	case TRE_BRACKET_MATCH_TYPE_EQUIVALENCE:
601	  if (!got_equiv)
602	    {
603	      if (icase)
604		{
605		  ue = __collate_equiv_value(tnfa->loc, &uc, 1);
606		  le = __collate_equiv_value(tnfa->loc, &lc, 1);
607		}
608	      else
609		  we = __collate_equiv_value(tnfa->loc, &wc, 1);
610	      got_equiv = 1;
611	    }
612	  if (icase)
613	    match = (b->value == ue || b->value == le);
614	  else
615	    match = (b->value == we);
616	  break;
617	default:
618	  DPRINT(("tre_bracket_match: unknown type %d\n", b->type));
619	  assert(0);
620	  break;
621	}
622	if (match)
623	  break;
624    }
625error:
626  if (list->flags & TRE_BRACKET_MATCH_FLAG_NEGATE) {
627    if ((tnfa->cflags & REG_NEWLINE) && wc == '\n') return 0;
628    match = !match;
629  }
630  return match;
631}
632