dfa.c revision 146199
1/* dfa.c - deterministic extended regexp routines for GNU
2   Copyright 1988, 1998, 2000 Free Software Foundation, Inc.
3
4   This program is free software; you can redistribute it and/or modify
5   it under the terms of the GNU General Public License as published by
6   the Free Software Foundation; either version 2, or (at your option)
7   any later version.
8
9   This program is distributed in the hope that it will be useful,
10   but WITHOUT ANY WARRANTY; without even the implied warranty of
11   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12   GNU General Public License for more details.
13
14   You should have received a copy of the GNU General Public License
15   along with this program; if not, write to the Free Software
16   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA */
17
18/* Written June, 1988 by Mike Haertel
19   Modified July, 1988 by Arthur David Olson to assist BMG speedups  */
20
21/* $FreeBSD: head/gnu/usr.bin/grep/dfa.c 146199 2005-05-14 03:02:22Z tjr $ */
22
23#ifdef HAVE_CONFIG_H
24#include <config.h>
25#endif
26
27#include <assert.h>
28#include <ctype.h>
29#include <stdio.h>
30
31#include <sys/types.h>
32#ifdef STDC_HEADERS
33#include <stdlib.h>
34#else
35extern char *calloc(), *malloc(), *realloc();
36extern void free();
37#endif
38
39#if defined(HAVE_STRING_H) || defined(STDC_HEADERS)
40#include <string.h>
41#else
42#include <strings.h>
43#endif
44
45#if HAVE_SETLOCALE
46# include <locale.h>
47#endif
48
49#if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC
50/* We can handle multibyte string.  */
51# define MBS_SUPPORT
52#endif
53
54#ifdef MBS_SUPPORT
55# include <wchar.h>
56# include <wctype.h>
57#endif
58
59#ifndef DEBUG	/* use the same approach as regex.c */
60#undef assert
61#define assert(e)
62#endif /* DEBUG */
63
64#ifndef isgraph
65#define isgraph(C) (isprint(C) && !isspace(C))
66#endif
67
68#if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
69#define ISALPHA(C) isalpha(C)
70#define ISUPPER(C) isupper(C)
71#define ISLOWER(C) islower(C)
72#define ISDIGIT(C) isdigit(C)
73#define ISXDIGIT(C) isxdigit(C)
74#define ISSPACE(C) isspace(C)
75#define ISPUNCT(C) ispunct(C)
76#define ISALNUM(C) isalnum(C)
77#define ISPRINT(C) isprint(C)
78#define ISGRAPH(C) isgraph(C)
79#define ISCNTRL(C) iscntrl(C)
80#else
81#define ISALPHA(C) (isascii(C) && isalpha(C))
82#define ISUPPER(C) (isascii(C) && isupper(C))
83#define ISLOWER(C) (isascii(C) && islower(C))
84#define ISDIGIT(C) (isascii(C) && isdigit(C))
85#define ISXDIGIT(C) (isascii(C) && isxdigit(C))
86#define ISSPACE(C) (isascii(C) && isspace(C))
87#define ISPUNCT(C) (isascii(C) && ispunct(C))
88#define ISALNUM(C) (isascii(C) && isalnum(C))
89#define ISPRINT(C) (isascii(C) && isprint(C))
90#define ISGRAPH(C) (isascii(C) && isgraph(C))
91#define ISCNTRL(C) (isascii(C) && iscntrl(C))
92#endif
93
94/* ISASCIIDIGIT differs from ISDIGIT, as follows:
95   - Its arg may be any int or unsigned int; it need not be an unsigned char.
96   - It's guaranteed to evaluate its argument exactly once.
97   - It's typically faster.
98   Posix 1003.2-1992 section 2.5.2.1 page 50 lines 1556-1558 says that
99   only '0' through '9' are digits.  Prefer ISASCIIDIGIT to ISDIGIT unless
100   it's important to use the locale's definition of `digit' even when the
101   host does not conform to Posix.  */
102#define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9)
103
104/* If we (don't) have I18N.  */
105/* glibc defines _ */
106#ifndef _
107# ifdef HAVE_LIBINTL_H
108#  include <libintl.h>
109#  ifndef _
110#   define _(Str) gettext (Str)
111#  endif
112# else
113#  define _(Str) (Str)
114# endif
115#endif
116
117#include "regex.h"
118#include "dfa.h"
119#include "hard-locale.h"
120
121/* HPUX, define those as macros in sys/param.h */
122#ifdef setbit
123# undef setbit
124#endif
125#ifdef clrbit
126# undef clrbit
127#endif
128
129static void dfamust PARAMS ((struct dfa *dfa));
130static void regexp PARAMS ((int toplevel));
131
132static ptr_t
133xcalloc (size_t n, size_t s)
134{
135  ptr_t r = calloc(n, s);
136
137  if (!r)
138    dfaerror(_("Memory exhausted"));
139  return r;
140}
141
142static ptr_t
143xmalloc (size_t n)
144{
145  ptr_t r = malloc(n);
146
147  assert(n != 0);
148  if (!r)
149    dfaerror(_("Memory exhausted"));
150  return r;
151}
152
153static ptr_t
154xrealloc (ptr_t p, size_t n)
155{
156  ptr_t r = realloc(p, n);
157
158  assert(n != 0);
159  if (!r)
160    dfaerror(_("Memory exhausted"));
161  return r;
162}
163
164#define CALLOC(p, t, n) ((p) = (t *) xcalloc((size_t)(n), sizeof (t)))
165#define MALLOC(p, t, n) ((p) = (t *) xmalloc((n) * sizeof (t)))
166#define REALLOC(p, t, n) ((p) = (t *) xrealloc((ptr_t) (p), (n) * sizeof (t)))
167
168/* Reallocate an array of type t if nalloc is too small for index. */
169#define REALLOC_IF_NECESSARY(p, t, nalloc, index) \
170  if ((index) >= (nalloc))			  \
171    {						  \
172      do					  \
173	(nalloc) *= 2;				  \
174      while ((index) >= (nalloc));		  \
175      REALLOC(p, t, nalloc);			  \
176    }
177
178#ifdef DEBUG
179
180static void
181prtok (token t)
182{
183  char const *s;
184
185  if (t < 0)
186    fprintf(stderr, "END");
187  else if (t < NOTCHAR)
188    fprintf(stderr, "%c", t);
189  else
190    {
191      switch (t)
192	{
193	case EMPTY: s = "EMPTY"; break;
194	case BACKREF: s = "BACKREF"; break;
195	case BEGLINE: s = "BEGLINE"; break;
196	case ENDLINE: s = "ENDLINE"; break;
197	case BEGWORD: s = "BEGWORD"; break;
198	case ENDWORD: s = "ENDWORD"; break;
199	case LIMWORD: s = "LIMWORD"; break;
200	case NOTLIMWORD: s = "NOTLIMWORD"; break;
201	case QMARK: s = "QMARK"; break;
202	case STAR: s = "STAR"; break;
203	case PLUS: s = "PLUS"; break;
204	case CAT: s = "CAT"; break;
205	case OR: s = "OR"; break;
206	case ORTOP: s = "ORTOP"; break;
207	case LPAREN: s = "LPAREN"; break;
208	case RPAREN: s = "RPAREN"; break;
209	case CRANGE: s = "CRANGE"; break;
210#ifdef MBS_SUPPORT
211	case ANYCHAR: s = "ANYCHAR"; break;
212	case MBCSET: s = "MBCSET"; break;
213#endif /* MBS_SUPPORT */
214	default: s = "CSET"; break;
215	}
216      fprintf(stderr, "%s", s);
217    }
218}
219#endif /* DEBUG */
220
221/* Stuff pertaining to charclasses. */
222
223static int
224tstbit (unsigned b, charclass c)
225{
226  return c[b / INTBITS] & 1 << b % INTBITS;
227}
228
229static void
230setbit (unsigned b, charclass c)
231{
232  c[b / INTBITS] |= 1 << b % INTBITS;
233}
234
235static void
236clrbit (unsigned b, charclass c)
237{
238  c[b / INTBITS] &= ~(1 << b % INTBITS);
239}
240
241static void
242copyset (charclass src, charclass dst)
243{
244  memcpy (dst, src, sizeof (charclass));
245}
246
247static void
248zeroset (charclass s)
249{
250  memset (s, 0, sizeof (charclass));
251}
252
253static void
254notset (charclass s)
255{
256  int i;
257
258  for (i = 0; i < CHARCLASS_INTS; ++i)
259    s[i] = ~s[i];
260}
261
262static int
263equal (charclass s1, charclass s2)
264{
265  return memcmp (s1, s2, sizeof (charclass)) == 0;
266}
267
268/* A pointer to the current dfa is kept here during parsing. */
269static struct dfa *dfa;
270
271/* Find the index of charclass s in dfa->charclasses, or allocate a new charclass. */
272static int
273charclass_index (charclass s)
274{
275  int i;
276
277  for (i = 0; i < dfa->cindex; ++i)
278    if (equal(s, dfa->charclasses[i]))
279      return i;
280  REALLOC_IF_NECESSARY(dfa->charclasses, charclass, dfa->calloc, dfa->cindex);
281  ++dfa->cindex;
282  copyset(s, dfa->charclasses[i]);
283  return i;
284}
285
286/* Syntax bits controlling the behavior of the lexical analyzer. */
287static reg_syntax_t syntax_bits, syntax_bits_set;
288
289/* Flag for case-folding letters into sets. */
290static int case_fold;
291
292/* End-of-line byte in data.  */
293static unsigned char eolbyte;
294
295/* Entry point to set syntax options. */
296void
297dfasyntax (reg_syntax_t bits, int fold, unsigned char eol)
298{
299  syntax_bits_set = 1;
300  syntax_bits = bits;
301  case_fold = fold;
302  eolbyte = eol;
303}
304
305/* Like setbit, but if case is folded, set both cases of a letter.  */
306static void
307setbit_case_fold (unsigned b, charclass c)
308{
309  setbit (b, c);
310  if (case_fold)
311    {
312      if (ISUPPER (b))
313	setbit (tolower (b), c);
314      else if (ISLOWER (b))
315	setbit (toupper (b), c);
316    }
317}
318
319/* Lexical analyzer.  All the dross that deals with the obnoxious
320   GNU Regex syntax bits is located here.  The poor, suffering
321   reader is referred to the GNU Regex documentation for the
322   meaning of the @#%!@#%^!@ syntax bits. */
323
324static char const *lexstart;	/* Pointer to beginning of input string. */
325static char const *lexptr;	/* Pointer to next input character. */
326static int lexleft;		/* Number of characters remaining. */
327static token lasttok;		/* Previous token returned; initially END. */
328static int laststart;		/* True if we're separated from beginning or (, |
329				   only by zero-width characters. */
330static int parens;		/* Count of outstanding left parens. */
331static int minrep, maxrep;	/* Repeat counts for {m,n}. */
332static int hard_LC_COLLATE;	/* Nonzero if LC_COLLATE is hard.  */
333
334#ifdef MBS_SUPPORT
335/* These variables are used only if (MB_CUR_MAX > 1).  */
336static mbstate_t mbs;		/* Mbstate for mbrlen().  */
337static int cur_mb_len;		/* Byte length of the current scanning
338				   multibyte character.  */
339static int cur_mb_index;        /* Byte index of the current scanning multibyte
340                                   character.
341
342				   singlebyte character : cur_mb_index = 0
343				   multibyte character
344				       1st byte : cur_mb_index = 1
345				       2nd byte : cur_mb_index = 2
346				         ...
347				       nth byte : cur_mb_index = n  */
348static unsigned char *mblen_buf;/* Correspond to the input buffer in dfaexec().
349                                  Each element store the amount of remain
350                                  byte of corresponding multibyte character
351                                  in the input string.  A element's value
352                                  is 0 if corresponding character is a
353                                  singlebyte chracter.
354                                  e.g. input : 'a', <mb(0)>, <mb(1)>, <mb(2)>
355                                   mblen_buf :   0,       3,       2,       1
356                               */
357static wchar_t *inputwcs;	/* Wide character representation of input
358				   string in dfaexec().
359				   The length of this array is same as
360				   the length of input string(char array).
361				   inputstring[i] is a single-byte char,
362				   or 1st byte of a multibyte char.
363				   And inputwcs[i] is the codepoint.  */
364static unsigned char const *buf_begin;/* refference to begin in dfaexec().  */
365static unsigned char const *buf_end;	/* refference to end in dfaexec().  */
366#endif /* MBS_SUPPORT  */
367
368#ifdef MBS_SUPPORT
369/* This function update cur_mb_len, and cur_mb_index.
370   p points current lexptr, len is the remaining buffer length.  */
371static void
372update_mb_len_index (unsigned char const *p, int len)
373{
374  /* If last character is a part of a multibyte character,
375     we update cur_mb_index.  */
376  if (cur_mb_index)
377    cur_mb_index = (cur_mb_index >= cur_mb_len)? 0
378			: cur_mb_index + 1;
379
380  /* If last character is a single byte character, or the
381     last portion of a multibyte character, we check whether
382     next character is a multibyte character or not.  */
383  if (! cur_mb_index)
384    {
385      cur_mb_len = mbrlen(p, len, &mbs);
386      if (cur_mb_len > 1)
387	/* It is a multibyte character.
388	   cur_mb_len was already set by mbrlen().  */
389	cur_mb_index = 1;
390      else if (cur_mb_len < 1)
391	/* Invalid sequence.  We treat it as a singlebyte character.
392	   cur_mb_index is aleady 0.  */
393	cur_mb_len = 1;
394      /* Otherwise, cur_mb_len == 1, it is a singlebyte character.
395	 cur_mb_index is aleady 0.  */
396    }
397}
398#endif /* MBS_SUPPORT */
399
400#ifdef MBS_SUPPORT
401/* Note that characters become unsigned here. */
402# define FETCH(c, eoferr)			\
403  {						\
404    if (! lexleft)				\
405     {						\
406	if (eoferr != 0)			\
407	  dfaerror (eoferr);			\
408	else					\
409	  return lasttok = END;			\
410      }						\
411    if (MB_CUR_MAX > 1)				\
412      update_mb_len_index(lexptr, lexleft);	\
413    (c) = (unsigned char) *lexptr++;		\
414    --lexleft;					\
415  }
416
417/* This function fetch a wide character, and update cur_mb_len,
418   used only if the current locale is a multibyte environment.  */
419static wint_t
420fetch_wc (char const *eoferr)
421{
422  wchar_t wc;
423  if (! lexleft)
424    {
425      if (eoferr != 0)
426	dfaerror (eoferr);
427      else
428	return WEOF;
429    }
430
431  cur_mb_len = mbrtowc(&wc, lexptr, lexleft, &mbs);
432  if (cur_mb_len <= 0)
433   {
434      cur_mb_len = 1;
435      wc = *lexptr;
436    }
437  lexptr += cur_mb_len;
438  lexleft -= cur_mb_len;
439  return wc;
440}
441#else
442/* Note that characters become unsigned here. */
443# define FETCH(c, eoferr)   	      \
444  {			   	      \
445    if (! lexleft)	   	      \
446      {				      \
447	if (eoferr != 0)	      \
448	  dfaerror (eoferr);	      \
449	else		   	      \
450	  return lasttok = END;	      \
451      }				      \
452    (c) = (unsigned char) *lexptr++;  \
453    --lexleft;		   	      \
454  }
455#endif /* MBS_SUPPORT */
456
457#ifdef MBS_SUPPORT
458/* Multibyte character handling sub-routin for lex.
459   This function  parse a bracket expression and build a struct
460   mb_char_classes.  */
461static void
462parse_bracket_exp_mb ()
463{
464  wint_t wc, wc1, wc2;
465
466  /* Work area to build a mb_char_classes.  */
467  struct mb_char_classes *work_mbc;
468  int chars_al, range_sts_al, range_ends_al, ch_classes_al,
469    equivs_al, coll_elems_al;
470
471  REALLOC_IF_NECESSARY(dfa->mbcsets, struct mb_char_classes,
472		       dfa->mbcsets_alloc, dfa->nmbcsets + 1);
473  /* dfa->multibyte_prop[] hold the index of dfa->mbcsets.
474     We will update dfa->multibyte_prop in addtok(), because we can't
475     decide the index in dfa->tokens[].  */
476
477  /* Initialize work are */
478  work_mbc = &(dfa->mbcsets[dfa->nmbcsets++]);
479
480  chars_al = 1;
481  range_sts_al = range_ends_al = 0;
482  ch_classes_al = equivs_al = coll_elems_al = 0;
483  MALLOC(work_mbc->chars, wchar_t, chars_al);
484
485  work_mbc->nchars = work_mbc->nranges = work_mbc->nch_classes = 0;
486  work_mbc->nequivs = work_mbc->ncoll_elems = 0;
487  work_mbc->chars = work_mbc->ch_classes = NULL;
488  work_mbc->range_sts = work_mbc->range_ends = NULL;
489  work_mbc->equivs = work_mbc->coll_elems = NULL;
490
491  wc = fetch_wc(_("Unbalanced ["));
492  if (wc == L'^')
493    {
494      wc = fetch_wc(_("Unbalanced ["));
495      work_mbc->invert = 1;
496    }
497  else
498    work_mbc->invert = 0;
499  do
500    {
501      wc1 = WEOF; /* mark wc1 is not initialized".  */
502
503      /* Note that if we're looking at some other [:...:] construct,
504	 we just treat it as a bunch of ordinary characters.  We can do
505	 this because we assume regex has checked for syntax errors before
506	 dfa is ever called. */
507      if (wc == L'[' && (syntax_bits & RE_CHAR_CLASSES))
508	{
509#define BRACKET_BUFFER_SIZE 128
510	  char str[BRACKET_BUFFER_SIZE];
511	  wc1 = wc;
512	  wc = fetch_wc(_("Unbalanced ["));
513
514	  /* If pattern contains `[[:', `[[.', or `[[='.  */
515	  if (cur_mb_len == 1 && (wc == L':' || wc == L'.' || wc == L'='))
516	    {
517	      unsigned char c;
518	      unsigned char delim = (unsigned char)wc;
519	      int len = 0;
520	      for (;;)
521		{
522		  if (! lexleft)
523		    dfaerror (_("Unbalanced ["));
524		  c = (unsigned char) *lexptr++;
525		  --lexleft;
526
527		  if ((c == delim && *lexptr == ']') || lexleft == 0)
528		    break;
529		  if (len < BRACKET_BUFFER_SIZE)
530		    str[len++] = c;
531		  else
532		    /* This is in any case an invalid class name.  */
533		    str[0] = '\0';
534		}
535	      str[len] = '\0';
536
537	      if (lexleft == 0)
538		{
539		  REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
540				       work_mbc->nchars + 2);
541		  work_mbc->chars[work_mbc->nchars++] = L'[';
542		  work_mbc->chars[work_mbc->nchars++] = delim;
543		  break;
544		}
545
546	      if (--lexleft, *lexptr++ != ']')
547		dfaerror (_("Unbalanced ["));
548	      if (delim == ':')
549		/* build character class.  */
550		{
551		  wctype_t wt;
552		  /* Query the character class as wctype_t.  */
553		  wt = wctype (str);
554
555		  if (ch_classes_al == 0)
556		    MALLOC(work_mbc->ch_classes, wchar_t, ++ch_classes_al);
557		  REALLOC_IF_NECESSARY(work_mbc->ch_classes, wctype_t,
558				       ch_classes_al,
559				       work_mbc->nch_classes + 1);
560		  work_mbc->ch_classes[work_mbc->nch_classes++] = wt;
561
562 		}
563	      else if (delim == '=' || delim == '.')
564		{
565		  char *elem;
566		  MALLOC(elem, char, len + 1);
567		  strncpy(elem, str, len + 1);
568
569		  if (delim == '=')
570		    /* build equivalent class.  */
571		    {
572		      if (equivs_al == 0)
573			MALLOC(work_mbc->equivs, char*, ++equivs_al);
574		      REALLOC_IF_NECESSARY(work_mbc->equivs, char*,
575					   equivs_al,
576					   work_mbc->nequivs + 1);
577		      work_mbc->equivs[work_mbc->nequivs++] = elem;
578		    }
579
580		  if (delim == '.')
581		    /* build collating element.  */
582		    {
583		      if (coll_elems_al == 0)
584			MALLOC(work_mbc->coll_elems, char*, ++coll_elems_al);
585		      REALLOC_IF_NECESSARY(work_mbc->coll_elems, char*,
586					   coll_elems_al,
587					   work_mbc->ncoll_elems + 1);
588		      work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem;
589		    }
590 		}
591	      wc1 = wc = WEOF;
592	    }
593	  else
594	    /* We treat '[' as a normal character here.  */
595	    {
596	      wc2 = wc1; wc1 = wc; wc = wc2; /* swap */
597	    }
598	}
599      else
600	{
601	  if (wc == L'\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
602	    wc = fetch_wc(("Unbalanced ["));
603	}
604
605      if (wc1 == WEOF)
606	wc1 = fetch_wc(_("Unbalanced ["));
607
608      if (wc1 == L'-')
609	/* build range characters.  */
610	{
611	  wc2 = fetch_wc(_("Unbalanced ["));
612	  if (wc2 == L']')
613	    {
614	      /* In the case [x-], the - is an ordinary hyphen,
615		 which is left in c1, the lookahead character. */
616	      lexptr -= cur_mb_len;
617	      lexleft += cur_mb_len;
618	      wc2 = wc;
619	    }
620	  else
621	    {
622	      if (wc2 == L'\\'
623		  && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
624		wc2 = fetch_wc(_("Unbalanced ["));
625	      wc1 = fetch_wc(_("Unbalanced ["));
626	    }
627
628	  if (range_sts_al == 0)
629	    {
630	      MALLOC(work_mbc->range_sts, wchar_t, ++range_sts_al);
631	      MALLOC(work_mbc->range_ends, wchar_t, ++range_ends_al);
632	    }
633	  REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
634			       range_sts_al, work_mbc->nranges + 1);
635	  work_mbc->range_sts[work_mbc->nranges] = (wchar_t)wc;
636	  REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
637			       range_ends_al, work_mbc->nranges + 1);
638	  work_mbc->range_ends[work_mbc->nranges++] = (wchar_t)wc2;
639	}
640      else if (wc != WEOF)
641	/* build normal characters.  */
642	{
643	  REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
644			       work_mbc->nchars + 1);
645	  work_mbc->chars[work_mbc->nchars++] = (wchar_t)wc;
646	}
647    }
648  while ((wc = wc1) != L']');
649}
650#endif /* MBS_SUPPORT */
651
652#ifdef __STDC__
653#define FUNC(F, P) static int F(int c) { return P(c); }
654#else
655#define FUNC(F, P) static int F(c) int c; { return P(c); }
656#endif
657
658FUNC(is_alpha, ISALPHA)
659FUNC(is_upper, ISUPPER)
660FUNC(is_lower, ISLOWER)
661FUNC(is_digit, ISDIGIT)
662FUNC(is_xdigit, ISXDIGIT)
663FUNC(is_space, ISSPACE)
664FUNC(is_punct, ISPUNCT)
665FUNC(is_alnum, ISALNUM)
666FUNC(is_print, ISPRINT)
667FUNC(is_graph, ISGRAPH)
668FUNC(is_cntrl, ISCNTRL)
669
670static int
671is_blank (int c)
672{
673   return (c == ' ' || c == '\t');
674}
675
676/* The following list maps the names of the Posix named character classes
677   to predicate functions that determine whether a given character is in
678   the class.  The leading [ has already been eaten by the lexical analyzer. */
679static struct {
680  const char *name;
681  int (*pred) PARAMS ((int));
682} const prednames[] = {
683  { ":alpha:]", is_alpha },
684  { ":upper:]", is_upper },
685  { ":lower:]", is_lower },
686  { ":digit:]", is_digit },
687  { ":xdigit:]", is_xdigit },
688  { ":space:]", is_space },
689  { ":punct:]", is_punct },
690  { ":alnum:]", is_alnum },
691  { ":print:]", is_print },
692  { ":graph:]", is_graph },
693  { ":cntrl:]", is_cntrl },
694  { ":blank:]", is_blank },
695  { 0 }
696};
697
698/* Return non-zero if C is a `word-constituent' byte; zero otherwise.  */
699#define IS_WORD_CONSTITUENT(C) (ISALNUM(C) || (C) == '_')
700
701static int
702looking_at (char const *s)
703{
704  size_t len;
705
706  len = strlen(s);
707  if (lexleft < len)
708    return 0;
709  return strncmp(s, lexptr, len) == 0;
710}
711
712static token
713lex (void)
714{
715  unsigned c, c1, c2;
716  int backslash = 0, invert;
717  charclass ccl;
718  int i;
719
720  /* Basic plan: We fetch a character.  If it's a backslash,
721     we set the backslash flag and go through the loop again.
722     On the plus side, this avoids having a duplicate of the
723     main switch inside the backslash case.  On the minus side,
724     it means that just about every case begins with
725     "if (backslash) ...".  */
726  for (i = 0; i < 2; ++i)
727    {
728      FETCH(c, 0);
729#ifdef MBS_SUPPORT
730      if (MB_CUR_MAX > 1 && cur_mb_index)
731	/* If this is a part of a multi-byte character, we must treat
732	   this byte data as a normal character.
733	   e.g. In case of SJIS encoding, some character contains '\',
734	        but they must not be backslash.  */
735	goto normal_char;
736#endif /* MBS_SUPPORT  */
737      switch (c)
738	{
739	case '\\':
740	  if (backslash)
741	    goto normal_char;
742	  if (lexleft == 0)
743	    dfaerror(_("Unfinished \\ escape"));
744	  backslash = 1;
745	  break;
746
747	case '^':
748	  if (backslash)
749	    goto normal_char;
750	  if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
751	      || lasttok == END
752	      || lasttok == LPAREN
753	      || lasttok == OR)
754	    return lasttok = BEGLINE;
755	  goto normal_char;
756
757	case '$':
758	  if (backslash)
759	    goto normal_char;
760	  if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
761	      || lexleft == 0
762	      || (syntax_bits & RE_NO_BK_PARENS
763		  ? lexleft > 0 && *lexptr == ')'
764		  : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == ')')
765	      || (syntax_bits & RE_NO_BK_VBAR
766		  ? lexleft > 0 && *lexptr == '|'
767		  : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == '|')
768	      || ((syntax_bits & RE_NEWLINE_ALT)
769	          && lexleft > 0 && *lexptr == '\n'))
770	    return lasttok = ENDLINE;
771	  goto normal_char;
772
773	case '1':
774	case '2':
775	case '3':
776	case '4':
777	case '5':
778	case '6':
779	case '7':
780	case '8':
781	case '9':
782	  if (backslash && !(syntax_bits & RE_NO_BK_REFS))
783	    {
784	      laststart = 0;
785	      return lasttok = BACKREF;
786	    }
787	  goto normal_char;
788
789	case '`':
790	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
791	    return lasttok = BEGLINE;	/* FIXME: should be beginning of string */
792	  goto normal_char;
793
794	case '\'':
795	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
796	    return lasttok = ENDLINE;	/* FIXME: should be end of string */
797	  goto normal_char;
798
799	case '<':
800	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
801	    return lasttok = BEGWORD;
802	  goto normal_char;
803
804	case '>':
805	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
806	    return lasttok = ENDWORD;
807	  goto normal_char;
808
809	case 'b':
810	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
811	    return lasttok = LIMWORD;
812	  goto normal_char;
813
814	case 'B':
815	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
816	    return lasttok = NOTLIMWORD;
817	  goto normal_char;
818
819	case '?':
820	  if (syntax_bits & RE_LIMITED_OPS)
821	    goto normal_char;
822	  if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
823	    goto normal_char;
824	  if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
825	    goto normal_char;
826	  return lasttok = QMARK;
827
828	case '*':
829	  if (backslash)
830	    goto normal_char;
831	  if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
832	    goto normal_char;
833	  return lasttok = STAR;
834
835	case '+':
836	  if (syntax_bits & RE_LIMITED_OPS)
837	    goto normal_char;
838	  if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
839	    goto normal_char;
840	  if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
841	    goto normal_char;
842	  return lasttok = PLUS;
843
844	case '{':
845	  if (!(syntax_bits & RE_INTERVALS))
846	    goto normal_char;
847	  if (backslash != ((syntax_bits & RE_NO_BK_BRACES) == 0))
848	    goto normal_char;
849	  if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
850	    goto normal_char;
851
852	  if (syntax_bits & RE_NO_BK_BRACES)
853	    {
854	      /* Scan ahead for a valid interval; if it's not valid,
855		 treat it as a literal '{'.  */
856	      int lo = -1, hi = -1;
857	      char const *p = lexptr;
858	      char const *lim = p + lexleft;
859	      for (;  p != lim && ISASCIIDIGIT (*p);  p++)
860		lo = (lo < 0 ? 0 : lo * 10) + *p - '0';
861	      if (p != lim && *p == ',')
862		while (++p != lim && ISASCIIDIGIT (*p))
863		  hi = (hi < 0 ? 0 : hi * 10) + *p - '0';
864	      else
865		hi = lo;
866	      if (p == lim || *p != '}'
867		  || lo < 0 || RE_DUP_MAX < hi || (0 <= hi && hi < lo))
868		goto normal_char;
869	    }
870
871	  minrep = 0;
872	  /* Cases:
873	     {M} - exact count
874	     {M,} - minimum count, maximum is infinity
875	     {M,N} - M through N */
876	  FETCH(c, _("unfinished repeat count"));
877	  if (ISASCIIDIGIT (c))
878	    {
879	      minrep = c - '0';
880	      for (;;)
881		{
882		  FETCH(c, _("unfinished repeat count"));
883		  if (! ISASCIIDIGIT (c))
884		    break;
885		  minrep = 10 * minrep + c - '0';
886		}
887	    }
888	  else
889	    dfaerror(_("malformed repeat count"));
890	  if (c == ',')
891	    {
892	      FETCH (c, _("unfinished repeat count"));
893	      if (! ISASCIIDIGIT (c))
894		maxrep = -1;
895	      else
896		{
897		  maxrep = c - '0';
898		  for (;;)
899		    {
900		      FETCH (c, _("unfinished repeat count"));
901		      if (! ISASCIIDIGIT (c))
902			break;
903		      maxrep = 10 * maxrep + c - '0';
904		    }
905		  if (0 <= maxrep && maxrep < minrep)
906		    dfaerror (_("malformed repeat count"));
907		}
908	    }
909	  else
910	    maxrep = minrep;
911	  if (!(syntax_bits & RE_NO_BK_BRACES))
912	    {
913	      if (c != '\\')
914		dfaerror(_("malformed repeat count"));
915	      FETCH(c, _("unfinished repeat count"));
916	    }
917	  if (c != '}')
918	    dfaerror(_("malformed repeat count"));
919	  laststart = 0;
920	  return lasttok = REPMN;
921
922	case '|':
923	  if (syntax_bits & RE_LIMITED_OPS)
924	    goto normal_char;
925	  if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0))
926	    goto normal_char;
927	  laststart = 1;
928	  return lasttok = OR;
929
930	case '\n':
931	  if (syntax_bits & RE_LIMITED_OPS
932	      || backslash
933	      || !(syntax_bits & RE_NEWLINE_ALT))
934	    goto normal_char;
935	  laststart = 1;
936	  return lasttok = OR;
937
938	case '(':
939	  if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
940	    goto normal_char;
941	  ++parens;
942	  laststart = 1;
943	  return lasttok = LPAREN;
944
945	case ')':
946	  if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
947	    goto normal_char;
948	  if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
949	    goto normal_char;
950	  --parens;
951	  laststart = 0;
952	  return lasttok = RPAREN;
953
954	case '.':
955	  if (backslash)
956	    goto normal_char;
957#ifdef MBS_SUPPORT
958	  if (MB_CUR_MAX > 1)
959	    {
960	      /* In multibyte environment period must match with a single
961		 character not a byte.  So we use ANYCHAR.  */
962	      laststart = 0;
963	      return lasttok = ANYCHAR;
964	    }
965#endif /* MBS_SUPPORT */
966	  zeroset(ccl);
967	  notset(ccl);
968	  if (!(syntax_bits & RE_DOT_NEWLINE))
969	    clrbit(eolbyte, ccl);
970	  if (syntax_bits & RE_DOT_NOT_NULL)
971	    clrbit('\0', ccl);
972	  laststart = 0;
973	  return lasttok = CSET + charclass_index(ccl);
974
975	case 'w':
976	case 'W':
977	  if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
978	    goto normal_char;
979	  zeroset(ccl);
980	  for (c2 = 0; c2 < NOTCHAR; ++c2)
981	    if (IS_WORD_CONSTITUENT(c2))
982	      setbit(c2, ccl);
983	  if (c == 'W')
984	    notset(ccl);
985	  laststart = 0;
986	  return lasttok = CSET + charclass_index(ccl);
987
988	case '[':
989	  if (backslash)
990	    goto normal_char;
991	  laststart = 0;
992#ifdef MBS_SUPPORT
993	  if (MB_CUR_MAX > 1)
994	    {
995	      /* In multibyte environment a bracket expression may contain
996		 multibyte characters, which must be treated as characters
997		 (not bytes).  So we parse it by parse_bracket_exp_mb().  */
998	      parse_bracket_exp_mb();
999	      return lasttok = MBCSET;
1000	    }
1001#endif
1002	  zeroset(ccl);
1003	  FETCH(c, _("Unbalanced ["));
1004	  if (c == '^')
1005	    {
1006	      FETCH(c, _("Unbalanced ["));
1007	      invert = 1;
1008	    }
1009	  else
1010	    invert = 0;
1011	  do
1012	    {
1013	      /* Nobody ever said this had to be fast. :-)
1014		 Note that if we're looking at some other [:...:]
1015		 construct, we just treat it as a bunch of ordinary
1016		 characters.  We can do this because we assume
1017		 regex has checked for syntax errors before
1018		 dfa is ever called. */
1019	      if (c == '[' && (syntax_bits & RE_CHAR_CLASSES))
1020		for (c1 = 0; prednames[c1].name; ++c1)
1021		  if (looking_at(prednames[c1].name))
1022		    {
1023		      int (*pred) PARAMS ((int)) = prednames[c1].pred;
1024
1025		      for (c2 = 0; c2 < NOTCHAR; ++c2)
1026			if ((*pred)(c2))
1027			  setbit_case_fold (c2, ccl);
1028		      lexptr += strlen(prednames[c1].name);
1029		      lexleft -= strlen(prednames[c1].name);
1030		      FETCH(c1, _("Unbalanced ["));
1031		      goto skip;
1032		    }
1033	      if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1034		FETCH(c, _("Unbalanced ["));
1035	      FETCH(c1, _("Unbalanced ["));
1036	      if (c1 == '-')
1037		{
1038		  FETCH(c2, _("Unbalanced ["));
1039		  if (c2 == ']')
1040		    {
1041		      /* In the case [x-], the - is an ordinary hyphen,
1042			 which is left in c1, the lookahead character. */
1043		      --lexptr;
1044		      ++lexleft;
1045		    }
1046		  else
1047		    {
1048		      if (c2 == '\\'
1049			  && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1050			FETCH(c2, _("Unbalanced ["));
1051		      FETCH(c1, _("Unbalanced ["));
1052		      if (!hard_LC_COLLATE) {
1053		        for (; c <= c2; c++)
1054			  setbit_case_fold (c, ccl);
1055		      } else {
1056			/* POSIX locales are painful - leave the decision to libc */
1057			char expr[6] = { '[', c, '-', c2, ']', '\0' };
1058			regex_t re;
1059			if (regcomp (&re, expr, case_fold ? REG_ICASE : 0) == REG_NOERROR) {
1060			  for (c = 0; c < NOTCHAR; ++c) {
1061			    char buf[2] = { c, '\0' };
1062			    regmatch_t mat;
1063			    if (regexec (&re, buf, 1, &mat, 0) == REG_NOERROR
1064                               && mat.rm_so == 0 && mat.rm_eo == 1)
1065                              setbit_case_fold (c, ccl);
1066			  }
1067			  regfree (&re);
1068			}
1069		      }
1070		      continue;
1071		    }
1072		}
1073
1074	      setbit_case_fold (c, ccl);
1075
1076	    skip:
1077	      ;
1078	    }
1079	  while ((c = c1) != ']');
1080	  if (invert)
1081	    {
1082	      notset(ccl);
1083	      if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
1084		clrbit(eolbyte, ccl);
1085	    }
1086	  return lasttok = CSET + charclass_index(ccl);
1087
1088	default:
1089	normal_char:
1090	  laststart = 0;
1091	  if (case_fold && ISALPHA(c))
1092	    {
1093	      zeroset(ccl);
1094	      setbit_case_fold (c, ccl);
1095	      return lasttok = CSET + charclass_index(ccl);
1096	    }
1097	  return c;
1098	}
1099    }
1100
1101  /* The above loop should consume at most a backslash
1102     and some other character. */
1103  abort();
1104  return END;	/* keeps pedantic compilers happy. */
1105}
1106
1107/* Recursive descent parser for regular expressions. */
1108
1109static token tok;		/* Lookahead token. */
1110static int depth;		/* Current depth of a hypothetical stack
1111				   holding deferred productions.  This is
1112				   used to determine the depth that will be
1113				   required of the real stack later on in
1114				   dfaanalyze(). */
1115
1116/* Add the given token to the parse tree, maintaining the depth count and
1117   updating the maximum depth if necessary. */
1118static void
1119addtok (token t)
1120{
1121#ifdef MBS_SUPPORT
1122  if (MB_CUR_MAX > 1)
1123    {
1124      REALLOC_IF_NECESSARY(dfa->multibyte_prop, int, dfa->nmultibyte_prop,
1125			   dfa->tindex);
1126      /* Set dfa->multibyte_prop.  See struct dfa in dfa.h.  */
1127      if (t == MBCSET)
1128	dfa->multibyte_prop[dfa->tindex] = ((dfa->nmbcsets - 1) << 2) + 3;
1129      else if (t < NOTCHAR)
1130	dfa->multibyte_prop[dfa->tindex]
1131	  = (cur_mb_len == 1)? 3 /* single-byte char */
1132	  : (((cur_mb_index == 1)? 1 : 0) /* 1st-byte of multibyte char */
1133	     + ((cur_mb_index == cur_mb_len)? 2 : 0)); /* last-byte */
1134      else
1135	/* It may be unnecesssary, but it is safer to treat other
1136	   symbols as singlebyte characters.  */
1137	dfa->multibyte_prop[dfa->tindex] = 3;
1138    }
1139#endif
1140
1141  REALLOC_IF_NECESSARY(dfa->tokens, token, dfa->talloc, dfa->tindex);
1142  dfa->tokens[dfa->tindex++] = t;
1143
1144  switch (t)
1145    {
1146    case QMARK:
1147    case STAR:
1148    case PLUS:
1149      break;
1150
1151    case CAT:
1152    case OR:
1153    case ORTOP:
1154      --depth;
1155      break;
1156
1157    default:
1158      ++dfa->nleaves;
1159    case EMPTY:
1160      ++depth;
1161      break;
1162    }
1163  if (depth > dfa->depth)
1164    dfa->depth = depth;
1165}
1166
1167/* The grammar understood by the parser is as follows.
1168
1169   regexp:
1170     regexp OR branch
1171     branch
1172
1173   branch:
1174     branch closure
1175     closure
1176
1177   closure:
1178     closure QMARK
1179     closure STAR
1180     closure PLUS
1181     closure REPMN
1182     atom
1183
1184   atom:
1185     <normal character>
1186     <multibyte character>
1187     ANYCHAR
1188     MBCSET
1189     CSET
1190     BACKREF
1191     BEGLINE
1192     ENDLINE
1193     BEGWORD
1194     ENDWORD
1195     LIMWORD
1196     NOTLIMWORD
1197     CRANGE
1198     LPAREN regexp RPAREN
1199     <empty>
1200
1201   The parser builds a parse tree in postfix form in an array of tokens. */
1202
1203static void
1204atom (void)
1205{
1206  if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
1207      || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
1208#ifdef MBS_SUPPORT
1209      || tok == ANYCHAR || tok == MBCSET /* MB_CUR_MAX > 1 */
1210#endif /* MBS_SUPPORT */
1211      || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD)
1212    {
1213      addtok(tok);
1214      tok = lex();
1215#ifdef MBS_SUPPORT
1216      /* We treat a multibyte character as a single atom, so that DFA
1217	 can treat a multibyte character as a single expression.
1218
1219         e.g. We construct following tree from "<mb1><mb2>".
1220              <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1221              <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT>
1222      */
1223      if (MB_CUR_MAX > 1)
1224	{
1225	  while (cur_mb_index > 1 && tok >= 0 && tok < NOTCHAR)
1226	    {
1227	      addtok(tok);
1228	      addtok(CAT);
1229	      tok = lex();
1230	    }
1231	}
1232#endif /* MBS_SUPPORT  */
1233    }
1234  else if (tok == CRANGE)
1235    {
1236      /* A character range like "[a-z]" in a locale other than "C" or
1237	 "POSIX".  This range might any sequence of one or more
1238	 characters.  Unfortunately the POSIX locale primitives give
1239	 us no practical way to find what character sequences might be
1240	 matched.  Treat this approximately like "(.\1)" -- i.e. match
1241	 one character, and then punt to the full matcher.  */
1242      charclass ccl;
1243      zeroset (ccl);
1244      notset (ccl);
1245      addtok (CSET + charclass_index (ccl));
1246      addtok (BACKREF);
1247      addtok (CAT);
1248      tok = lex ();
1249    }
1250  else if (tok == LPAREN)
1251    {
1252      tok = lex();
1253      regexp(0);
1254      if (tok != RPAREN)
1255	dfaerror(_("Unbalanced ("));
1256      tok = lex();
1257    }
1258  else
1259    addtok(EMPTY);
1260}
1261
1262/* Return the number of tokens in the given subexpression. */
1263static int
1264nsubtoks (int tindex)
1265{
1266  int ntoks1;
1267
1268  switch (dfa->tokens[tindex - 1])
1269    {
1270    default:
1271      return 1;
1272    case QMARK:
1273    case STAR:
1274    case PLUS:
1275      return 1 + nsubtoks(tindex - 1);
1276    case CAT:
1277    case OR:
1278    case ORTOP:
1279      ntoks1 = nsubtoks(tindex - 1);
1280      return 1 + ntoks1 + nsubtoks(tindex - 1 - ntoks1);
1281    }
1282}
1283
1284/* Copy the given subexpression to the top of the tree. */
1285static void
1286copytoks (int tindex, int ntokens)
1287{
1288  int i;
1289
1290  for (i = 0; i < ntokens; ++i)
1291    addtok(dfa->tokens[tindex + i]);
1292}
1293
1294static void
1295closure (void)
1296{
1297  int tindex, ntokens, i;
1298
1299  atom();
1300  while (tok == QMARK || tok == STAR || tok == PLUS || tok == REPMN)
1301    if (tok == REPMN)
1302      {
1303	ntokens = nsubtoks(dfa->tindex);
1304	tindex = dfa->tindex - ntokens;
1305	if (maxrep < 0)
1306	  addtok(PLUS);
1307	if (minrep == 0)
1308	  addtok(QMARK);
1309	for (i = 1; i < minrep; ++i)
1310	  {
1311	    copytoks(tindex, ntokens);
1312	    addtok(CAT);
1313	  }
1314	for (; i < maxrep; ++i)
1315	  {
1316	    copytoks(tindex, ntokens);
1317	    addtok(QMARK);
1318	    addtok(CAT);
1319	  }
1320	tok = lex();
1321      }
1322    else
1323      {
1324	addtok(tok);
1325	tok = lex();
1326      }
1327}
1328
1329static void
1330branch (void)
1331{
1332  closure();
1333  while (tok != RPAREN && tok != OR && tok >= 0)
1334    {
1335      closure();
1336      addtok(CAT);
1337    }
1338}
1339
1340static void
1341regexp (int toplevel)
1342{
1343  branch();
1344  while (tok == OR)
1345    {
1346      tok = lex();
1347      branch();
1348      if (toplevel)
1349	addtok(ORTOP);
1350      else
1351	addtok(OR);
1352    }
1353}
1354
1355/* Main entry point for the parser.  S is a string to be parsed, len is the
1356   length of the string, so s can include NUL characters.  D is a pointer to
1357   the struct dfa to parse into. */
1358void
1359dfaparse (char const *s, size_t len, struct dfa *d)
1360{
1361  dfa = d;
1362  lexstart = lexptr = s;
1363  lexleft = len;
1364  lasttok = END;
1365  laststart = 1;
1366  parens = 0;
1367  hard_LC_COLLATE = hard_locale (LC_COLLATE);
1368#ifdef MBS_SUPPORT
1369  if (MB_CUR_MAX > 1)
1370    {
1371      cur_mb_index = 0;
1372      cur_mb_len = 0;
1373      memset(&mbs, 0, sizeof(mbstate_t));
1374    }
1375#endif /* MBS_SUPPORT  */
1376
1377  if (! syntax_bits_set)
1378    dfaerror(_("No syntax specified"));
1379
1380  tok = lex();
1381  depth = d->depth;
1382
1383  regexp(1);
1384
1385  if (tok != END)
1386    dfaerror(_("Unbalanced )"));
1387
1388  addtok(END - d->nregexps);
1389  addtok(CAT);
1390
1391  if (d->nregexps)
1392    addtok(ORTOP);
1393
1394  ++d->nregexps;
1395}
1396
1397/* Some primitives for operating on sets of positions. */
1398
1399/* Copy one set to another; the destination must be large enough. */
1400static void
1401copy (position_set const *src, position_set *dst)
1402{
1403  int i;
1404
1405  for (i = 0; i < src->nelem; ++i)
1406    dst->elems[i] = src->elems[i];
1407  dst->nelem = src->nelem;
1408}
1409
1410/* Insert a position in a set.  Position sets are maintained in sorted
1411   order according to index.  If position already exists in the set with
1412   the same index then their constraints are logically or'd together.
1413   S->elems must point to an array large enough to hold the resulting set. */
1414static void
1415insert (position p, position_set *s)
1416{
1417  int i;
1418  position t1, t2;
1419
1420  for (i = 0; i < s->nelem && p.index < s->elems[i].index; ++i)
1421    continue;
1422  if (i < s->nelem && p.index == s->elems[i].index)
1423    s->elems[i].constraint |= p.constraint;
1424  else
1425    {
1426      t1 = p;
1427      ++s->nelem;
1428      while (i < s->nelem)
1429	{
1430	  t2 = s->elems[i];
1431	  s->elems[i++] = t1;
1432	  t1 = t2;
1433	}
1434    }
1435}
1436
1437/* Merge two sets of positions into a third.  The result is exactly as if
1438   the positions of both sets were inserted into an initially empty set. */
1439static void
1440merge (position_set const *s1, position_set const *s2, position_set *m)
1441{
1442  int i = 0, j = 0;
1443
1444  m->nelem = 0;
1445  while (i < s1->nelem && j < s2->nelem)
1446    if (s1->elems[i].index > s2->elems[j].index)
1447      m->elems[m->nelem++] = s1->elems[i++];
1448    else if (s1->elems[i].index < s2->elems[j].index)
1449      m->elems[m->nelem++] = s2->elems[j++];
1450    else
1451      {
1452	m->elems[m->nelem] = s1->elems[i++];
1453	m->elems[m->nelem++].constraint |= s2->elems[j++].constraint;
1454      }
1455  while (i < s1->nelem)
1456    m->elems[m->nelem++] = s1->elems[i++];
1457  while (j < s2->nelem)
1458    m->elems[m->nelem++] = s2->elems[j++];
1459}
1460
1461/* Delete a position from a set. */
1462static void
1463delete (position p, position_set *s)
1464{
1465  int i;
1466
1467  for (i = 0; i < s->nelem; ++i)
1468    if (p.index == s->elems[i].index)
1469      break;
1470  if (i < s->nelem)
1471    for (--s->nelem; i < s->nelem; ++i)
1472      s->elems[i] = s->elems[i + 1];
1473}
1474
1475/* Find the index of the state corresponding to the given position set with
1476   the given preceding context, or create a new state if there is no such
1477   state.  Newline and letter tell whether we got here on a newline or
1478   letter, respectively. */
1479static int
1480state_index (struct dfa *d, position_set const *s, int newline, int letter)
1481{
1482  int hash = 0;
1483  int constraint;
1484  int i, j;
1485
1486  newline = newline ? 1 : 0;
1487  letter = letter ? 1 : 0;
1488
1489  for (i = 0; i < s->nelem; ++i)
1490    hash ^= s->elems[i].index + s->elems[i].constraint;
1491
1492  /* Try to find a state that exactly matches the proposed one. */
1493  for (i = 0; i < d->sindex; ++i)
1494    {
1495      if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
1496	  || newline != d->states[i].newline || letter != d->states[i].letter)
1497	continue;
1498      for (j = 0; j < s->nelem; ++j)
1499	if (s->elems[j].constraint
1500	    != d->states[i].elems.elems[j].constraint
1501	    || s->elems[j].index != d->states[i].elems.elems[j].index)
1502	  break;
1503      if (j == s->nelem)
1504	return i;
1505    }
1506
1507  /* We'll have to create a new state. */
1508  REALLOC_IF_NECESSARY(d->states, dfa_state, d->salloc, d->sindex);
1509  d->states[i].hash = hash;
1510  MALLOC(d->states[i].elems.elems, position, s->nelem);
1511  copy(s, &d->states[i].elems);
1512  d->states[i].newline = newline;
1513  d->states[i].letter = letter;
1514  d->states[i].backref = 0;
1515  d->states[i].constraint = 0;
1516  d->states[i].first_end = 0;
1517#ifdef MBS_SUPPORT
1518  if (MB_CUR_MAX > 1)
1519    d->states[i].mbps.nelem = 0;
1520#endif
1521  for (j = 0; j < s->nelem; ++j)
1522    if (d->tokens[s->elems[j].index] < 0)
1523      {
1524	constraint = s->elems[j].constraint;
1525	if (SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 0)
1526	    || SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 1)
1527	    || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 0)
1528	    || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 1))
1529	  d->states[i].constraint |= constraint;
1530	if (! d->states[i].first_end)
1531	  d->states[i].first_end = d->tokens[s->elems[j].index];
1532      }
1533    else if (d->tokens[s->elems[j].index] == BACKREF)
1534      {
1535	d->states[i].constraint = NO_CONSTRAINT;
1536	d->states[i].backref = 1;
1537      }
1538
1539  ++d->sindex;
1540
1541  return i;
1542}
1543
1544/* Find the epsilon closure of a set of positions.  If any position of the set
1545   contains a symbol that matches the empty string in some context, replace
1546   that position with the elements of its follow labeled with an appropriate
1547   constraint.  Repeat exhaustively until no funny positions are left.
1548   S->elems must be large enough to hold the result. */
1549static void
1550epsclosure (position_set *s, struct dfa const *d)
1551{
1552  int i, j;
1553  int *visited;
1554  position p, old;
1555
1556  MALLOC(visited, int, d->tindex);
1557  for (i = 0; i < d->tindex; ++i)
1558    visited[i] = 0;
1559
1560  for (i = 0; i < s->nelem; ++i)
1561    if (d->tokens[s->elems[i].index] >= NOTCHAR
1562	&& d->tokens[s->elems[i].index] != BACKREF
1563#ifdef MBS_SUPPORT
1564	&& d->tokens[s->elems[i].index] != ANYCHAR
1565	&& d->tokens[s->elems[i].index] != MBCSET
1566#endif
1567	&& d->tokens[s->elems[i].index] < CSET)
1568      {
1569	old = s->elems[i];
1570	p.constraint = old.constraint;
1571	delete(s->elems[i], s);
1572	if (visited[old.index])
1573	  {
1574	    --i;
1575	    continue;
1576	  }
1577	visited[old.index] = 1;
1578	switch (d->tokens[old.index])
1579	  {
1580	  case BEGLINE:
1581	    p.constraint &= BEGLINE_CONSTRAINT;
1582	    break;
1583	  case ENDLINE:
1584	    p.constraint &= ENDLINE_CONSTRAINT;
1585	    break;
1586	  case BEGWORD:
1587	    p.constraint &= BEGWORD_CONSTRAINT;
1588	    break;
1589	  case ENDWORD:
1590	    p.constraint &= ENDWORD_CONSTRAINT;
1591	    break;
1592	  case LIMWORD:
1593	    p.constraint &= LIMWORD_CONSTRAINT;
1594	    break;
1595	  case NOTLIMWORD:
1596	    p.constraint &= NOTLIMWORD_CONSTRAINT;
1597	    break;
1598	  default:
1599	    break;
1600	  }
1601	for (j = 0; j < d->follows[old.index].nelem; ++j)
1602	  {
1603	    p.index = d->follows[old.index].elems[j].index;
1604	    insert(p, s);
1605	  }
1606	/* Force rescan to start at the beginning. */
1607	i = -1;
1608      }
1609
1610  free(visited);
1611}
1612
1613/* Perform bottom-up analysis on the parse tree, computing various functions.
1614   Note that at this point, we're pretending constructs like \< are real
1615   characters rather than constraints on what can follow them.
1616
1617   Nullable:  A node is nullable if it is at the root of a regexp that can
1618   match the empty string.
1619   *  EMPTY leaves are nullable.
1620   * No other leaf is nullable.
1621   * A QMARK or STAR node is nullable.
1622   * A PLUS node is nullable if its argument is nullable.
1623   * A CAT node is nullable if both its arguments are nullable.
1624   * An OR node is nullable if either argument is nullable.
1625
1626   Firstpos:  The firstpos of a node is the set of positions (nonempty leaves)
1627   that could correspond to the first character of a string matching the
1628   regexp rooted at the given node.
1629   * EMPTY leaves have empty firstpos.
1630   * The firstpos of a nonempty leaf is that leaf itself.
1631   * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
1632     argument.
1633   * The firstpos of a CAT node is the firstpos of the left argument, union
1634     the firstpos of the right if the left argument is nullable.
1635   * The firstpos of an OR node is the union of firstpos of each argument.
1636
1637   Lastpos:  The lastpos of a node is the set of positions that could
1638   correspond to the last character of a string matching the regexp at
1639   the given node.
1640   * EMPTY leaves have empty lastpos.
1641   * The lastpos of a nonempty leaf is that leaf itself.
1642   * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
1643     argument.
1644   * The lastpos of a CAT node is the lastpos of its right argument, union
1645     the lastpos of the left if the right argument is nullable.
1646   * The lastpos of an OR node is the union of the lastpos of each argument.
1647
1648   Follow:  The follow of a position is the set of positions that could
1649   correspond to the character following a character matching the node in
1650   a string matching the regexp.  At this point we consider special symbols
1651   that match the empty string in some context to be just normal characters.
1652   Later, if we find that a special symbol is in a follow set, we will
1653   replace it with the elements of its follow, labeled with an appropriate
1654   constraint.
1655   * Every node in the firstpos of the argument of a STAR or PLUS node is in
1656     the follow of every node in the lastpos.
1657   * Every node in the firstpos of the second argument of a CAT node is in
1658     the follow of every node in the lastpos of the first argument.
1659
1660   Because of the postfix representation of the parse tree, the depth-first
1661   analysis is conveniently done by a linear scan with the aid of a stack.
1662   Sets are stored as arrays of the elements, obeying a stack-like allocation
1663   scheme; the number of elements in each set deeper in the stack can be
1664   used to determine the address of a particular set's array. */
1665void
1666dfaanalyze (struct dfa *d, int searchflag)
1667{
1668  int *nullable;		/* Nullable stack. */
1669  int *nfirstpos;		/* Element count stack for firstpos sets. */
1670  position *firstpos;		/* Array where firstpos elements are stored. */
1671  int *nlastpos;		/* Element count stack for lastpos sets. */
1672  position *lastpos;		/* Array where lastpos elements are stored. */
1673  int *nalloc;			/* Sizes of arrays allocated to follow sets. */
1674  position_set tmp;		/* Temporary set for merging sets. */
1675  position_set merged;		/* Result of merging sets. */
1676  int wants_newline;		/* True if some position wants newline info. */
1677  int *o_nullable;
1678  int *o_nfirst, *o_nlast;
1679  position *o_firstpos, *o_lastpos;
1680  int i, j;
1681  position *pos;
1682
1683#ifdef DEBUG
1684  fprintf(stderr, "dfaanalyze:\n");
1685  for (i = 0; i < d->tindex; ++i)
1686    {
1687      fprintf(stderr, " %d:", i);
1688      prtok(d->tokens[i]);
1689    }
1690  putc('\n', stderr);
1691#endif
1692
1693  d->searchflag = searchflag;
1694
1695  MALLOC(nullable, int, d->depth);
1696  o_nullable = nullable;
1697  MALLOC(nfirstpos, int, d->depth);
1698  o_nfirst = nfirstpos;
1699  MALLOC(firstpos, position, d->nleaves);
1700  o_firstpos = firstpos, firstpos += d->nleaves;
1701  MALLOC(nlastpos, int, d->depth);
1702  o_nlast = nlastpos;
1703  MALLOC(lastpos, position, d->nleaves);
1704  o_lastpos = lastpos, lastpos += d->nleaves;
1705  MALLOC(nalloc, int, d->tindex);
1706  for (i = 0; i < d->tindex; ++i)
1707    nalloc[i] = 0;
1708  MALLOC(merged.elems, position, d->nleaves);
1709
1710  CALLOC(d->follows, position_set, d->tindex);
1711
1712  for (i = 0; i < d->tindex; ++i)
1713#ifdef DEBUG
1714    {				/* Nonsyntactic #ifdef goo... */
1715#endif
1716    switch (d->tokens[i])
1717      {
1718      case EMPTY:
1719	/* The empty set is nullable. */
1720	*nullable++ = 1;
1721
1722	/* The firstpos and lastpos of the empty leaf are both empty. */
1723	*nfirstpos++ = *nlastpos++ = 0;
1724	break;
1725
1726      case STAR:
1727      case PLUS:
1728	/* Every element in the firstpos of the argument is in the follow
1729	   of every element in the lastpos. */
1730	tmp.nelem = nfirstpos[-1];
1731	tmp.elems = firstpos;
1732	pos = lastpos;
1733	for (j = 0; j < nlastpos[-1]; ++j)
1734	  {
1735	    merge(&tmp, &d->follows[pos[j].index], &merged);
1736	    REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
1737				 nalloc[pos[j].index], merged.nelem - 1);
1738	    copy(&merged, &d->follows[pos[j].index]);
1739	  }
1740
1741      case QMARK:
1742	/* A QMARK or STAR node is automatically nullable. */
1743	if (d->tokens[i] != PLUS)
1744	  nullable[-1] = 1;
1745	break;
1746
1747      case CAT:
1748	/* Every element in the firstpos of the second argument is in the
1749	   follow of every element in the lastpos of the first argument. */
1750	tmp.nelem = nfirstpos[-1];
1751	tmp.elems = firstpos;
1752	pos = lastpos + nlastpos[-1];
1753	for (j = 0; j < nlastpos[-2]; ++j)
1754	  {
1755	    merge(&tmp, &d->follows[pos[j].index], &merged);
1756	    REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
1757				 nalloc[pos[j].index], merged.nelem - 1);
1758	    copy(&merged, &d->follows[pos[j].index]);
1759	  }
1760
1761	/* The firstpos of a CAT node is the firstpos of the first argument,
1762	   union that of the second argument if the first is nullable. */
1763	if (nullable[-2])
1764	  nfirstpos[-2] += nfirstpos[-1];
1765	else
1766	  firstpos += nfirstpos[-1];
1767	--nfirstpos;
1768
1769	/* The lastpos of a CAT node is the lastpos of the second argument,
1770	   union that of the first argument if the second is nullable. */
1771	if (nullable[-1])
1772	  nlastpos[-2] += nlastpos[-1];
1773	else
1774	  {
1775	    pos = lastpos + nlastpos[-2];
1776	    for (j = nlastpos[-1] - 1; j >= 0; --j)
1777	      pos[j] = lastpos[j];
1778	    lastpos += nlastpos[-2];
1779	    nlastpos[-2] = nlastpos[-1];
1780	  }
1781	--nlastpos;
1782
1783	/* A CAT node is nullable if both arguments are nullable. */
1784	nullable[-2] = nullable[-1] && nullable[-2];
1785	--nullable;
1786	break;
1787
1788      case OR:
1789      case ORTOP:
1790	/* The firstpos is the union of the firstpos of each argument. */
1791	nfirstpos[-2] += nfirstpos[-1];
1792	--nfirstpos;
1793
1794	/* The lastpos is the union of the lastpos of each argument. */
1795	nlastpos[-2] += nlastpos[-1];
1796	--nlastpos;
1797
1798	/* An OR node is nullable if either argument is nullable. */
1799	nullable[-2] = nullable[-1] || nullable[-2];
1800	--nullable;
1801	break;
1802
1803      default:
1804	/* Anything else is a nonempty position.  (Note that special
1805	   constructs like \< are treated as nonempty strings here;
1806	   an "epsilon closure" effectively makes them nullable later.
1807	   Backreferences have to get a real position so we can detect
1808	   transitions on them later.  But they are nullable. */
1809	*nullable++ = d->tokens[i] == BACKREF;
1810
1811	/* This position is in its own firstpos and lastpos. */
1812	*nfirstpos++ = *nlastpos++ = 1;
1813	--firstpos, --lastpos;
1814	firstpos->index = lastpos->index = i;
1815	firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
1816
1817	/* Allocate the follow set for this position. */
1818	nalloc[i] = 1;
1819	MALLOC(d->follows[i].elems, position, nalloc[i]);
1820	break;
1821      }
1822#ifdef DEBUG
1823    /* ... balance the above nonsyntactic #ifdef goo... */
1824      fprintf(stderr, "node %d:", i);
1825      prtok(d->tokens[i]);
1826      putc('\n', stderr);
1827      fprintf(stderr, nullable[-1] ? " nullable: yes\n" : " nullable: no\n");
1828      fprintf(stderr, " firstpos:");
1829      for (j = nfirstpos[-1] - 1; j >= 0; --j)
1830	{
1831	  fprintf(stderr, " %d:", firstpos[j].index);
1832	  prtok(d->tokens[firstpos[j].index]);
1833	}
1834      fprintf(stderr, "\n lastpos:");
1835      for (j = nlastpos[-1] - 1; j >= 0; --j)
1836	{
1837	  fprintf(stderr, " %d:", lastpos[j].index);
1838	  prtok(d->tokens[lastpos[j].index]);
1839	}
1840      putc('\n', stderr);
1841    }
1842#endif
1843
1844  /* For each follow set that is the follow set of a real position, replace
1845     it with its epsilon closure. */
1846  for (i = 0; i < d->tindex; ++i)
1847    if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF
1848#ifdef MBS_SUPPORT
1849        || d->tokens[i] == ANYCHAR
1850        || d->tokens[i] == MBCSET
1851#endif
1852	|| d->tokens[i] >= CSET)
1853      {
1854#ifdef DEBUG
1855	fprintf(stderr, "follows(%d:", i);
1856	prtok(d->tokens[i]);
1857	fprintf(stderr, "):");
1858	for (j = d->follows[i].nelem - 1; j >= 0; --j)
1859	  {
1860	    fprintf(stderr, " %d:", d->follows[i].elems[j].index);
1861	    prtok(d->tokens[d->follows[i].elems[j].index]);
1862	  }
1863	putc('\n', stderr);
1864#endif
1865	copy(&d->follows[i], &merged);
1866	epsclosure(&merged, d);
1867	if (d->follows[i].nelem < merged.nelem)
1868	  REALLOC(d->follows[i].elems, position, merged.nelem);
1869	copy(&merged, &d->follows[i]);
1870      }
1871
1872  /* Get the epsilon closure of the firstpos of the regexp.  The result will
1873     be the set of positions of state 0. */
1874  merged.nelem = 0;
1875  for (i = 0; i < nfirstpos[-1]; ++i)
1876    insert(firstpos[i], &merged);
1877  epsclosure(&merged, d);
1878
1879  /* Check if any of the positions of state 0 will want newline context. */
1880  wants_newline = 0;
1881  for (i = 0; i < merged.nelem; ++i)
1882    if (PREV_NEWLINE_DEPENDENT(merged.elems[i].constraint))
1883      wants_newline = 1;
1884
1885  /* Build the initial state. */
1886  d->salloc = 1;
1887  d->sindex = 0;
1888  MALLOC(d->states, dfa_state, d->salloc);
1889  state_index(d, &merged, wants_newline, 0);
1890
1891  free(o_nullable);
1892  free(o_nfirst);
1893  free(o_firstpos);
1894  free(o_nlast);
1895  free(o_lastpos);
1896  free(nalloc);
1897  free(merged.elems);
1898}
1899
1900/* Find, for each character, the transition out of state s of d, and store
1901   it in the appropriate slot of trans.
1902
1903   We divide the positions of s into groups (positions can appear in more
1904   than one group).  Each group is labeled with a set of characters that
1905   every position in the group matches (taking into account, if necessary,
1906   preceding context information of s).  For each group, find the union
1907   of the its elements' follows.  This set is the set of positions of the
1908   new state.  For each character in the group's label, set the transition
1909   on this character to be to a state corresponding to the set's positions,
1910   and its associated backward context information, if necessary.
1911
1912   If we are building a searching matcher, we include the positions of state
1913   0 in every state.
1914
1915   The collection of groups is constructed by building an equivalence-class
1916   partition of the positions of s.
1917
1918   For each position, find the set of characters C that it matches.  Eliminate
1919   any characters from C that fail on grounds of backward context.
1920
1921   Search through the groups, looking for a group whose label L has nonempty
1922   intersection with C.  If L - C is nonempty, create a new group labeled
1923   L - C and having the same positions as the current group, and set L to
1924   the intersection of L and C.  Insert the position in this group, set
1925   C = C - L, and resume scanning.
1926
1927   If after comparing with every group there are characters remaining in C,
1928   create a new group labeled with the characters of C and insert this
1929   position in that group. */
1930void
1931dfastate (int s, struct dfa *d, int trans[])
1932{
1933  position_set grps[NOTCHAR];	/* As many as will ever be needed. */
1934  charclass labels[NOTCHAR];	/* Labels corresponding to the groups. */
1935  int ngrps = 0;		/* Number of groups actually used. */
1936  position pos;			/* Current position being considered. */
1937  charclass matches;		/* Set of matching characters. */
1938  int matchesf;			/* True if matches is nonempty. */
1939  charclass intersect;		/* Intersection with some label set. */
1940  int intersectf;		/* True if intersect is nonempty. */
1941  charclass leftovers;		/* Stuff in the label that didn't match. */
1942  int leftoversf;		/* True if leftovers is nonempty. */
1943  static charclass letters;	/* Set of characters considered letters. */
1944  static charclass newline;	/* Set of characters that aren't newline. */
1945  position_set follows;		/* Union of the follows of some group. */
1946  position_set tmp;		/* Temporary space for merging sets. */
1947  int state;			/* New state. */
1948  int wants_newline;		/* New state wants to know newline context. */
1949  int state_newline;		/* New state on a newline transition. */
1950  int wants_letter;		/* New state wants to know letter context. */
1951  int state_letter;		/* New state on a letter transition. */
1952  static int initialized;	/* Flag for static initialization. */
1953#ifdef MBS_SUPPORT
1954  int next_isnt_1st_byte = 0;	/* Flag If we can't add state0.  */
1955#endif
1956  int i, j, k;
1957
1958  /* Initialize the set of letters, if necessary. */
1959  if (! initialized)
1960    {
1961      initialized = 1;
1962      for (i = 0; i < NOTCHAR; ++i)
1963	if (IS_WORD_CONSTITUENT(i))
1964	  setbit(i, letters);
1965      setbit(eolbyte, newline);
1966    }
1967
1968  zeroset(matches);
1969
1970  for (i = 0; i < d->states[s].elems.nelem; ++i)
1971    {
1972      pos = d->states[s].elems.elems[i];
1973      if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
1974	setbit(d->tokens[pos.index], matches);
1975      else if (d->tokens[pos.index] >= CSET)
1976	copyset(d->charclasses[d->tokens[pos.index] - CSET], matches);
1977#ifdef MBS_SUPPORT
1978      else if (d->tokens[pos.index] == ANYCHAR
1979               || d->tokens[pos.index] == MBCSET)
1980      /* MB_CUR_MAX > 1  */
1981	{
1982	  /* ANYCHAR and MBCSET must match with a single character, so we
1983	     must put it to d->states[s].mbps, which contains the positions
1984	     which can match with a single character not a byte.  */
1985	  if (d->states[s].mbps.nelem == 0)
1986	    {
1987	      MALLOC(d->states[s].mbps.elems, position,
1988		     d->states[s].elems.nelem);
1989	    }
1990	  insert(pos, &(d->states[s].mbps));
1991	  continue;
1992	}
1993#endif /* MBS_SUPPORT */
1994      else
1995	continue;
1996
1997      /* Some characters may need to be eliminated from matches because
1998	 they fail in the current context. */
1999      if (pos.constraint != 0xFF)
2000	{
2001	  if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
2002					 d->states[s].newline, 1))
2003	    clrbit(eolbyte, matches);
2004	  if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
2005					 d->states[s].newline, 0))
2006	    for (j = 0; j < CHARCLASS_INTS; ++j)
2007	      matches[j] &= newline[j];
2008	  if (! MATCHES_LETTER_CONTEXT(pos.constraint,
2009					d->states[s].letter, 1))
2010	    for (j = 0; j < CHARCLASS_INTS; ++j)
2011	      matches[j] &= ~letters[j];
2012	  if (! MATCHES_LETTER_CONTEXT(pos.constraint,
2013					d->states[s].letter, 0))
2014	    for (j = 0; j < CHARCLASS_INTS; ++j)
2015	      matches[j] &= letters[j];
2016
2017	  /* If there are no characters left, there's no point in going on. */
2018	  for (j = 0; j < CHARCLASS_INTS && !matches[j]; ++j)
2019	    continue;
2020	  if (j == CHARCLASS_INTS)
2021	    continue;
2022	}
2023
2024      for (j = 0; j < ngrps; ++j)
2025	{
2026	  /* If matches contains a single character only, and the current
2027	     group's label doesn't contain that character, go on to the
2028	     next group. */
2029	  if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR
2030	      && !tstbit(d->tokens[pos.index], labels[j]))
2031	    continue;
2032
2033	  /* Check if this group's label has a nonempty intersection with
2034	     matches. */
2035	  intersectf = 0;
2036	  for (k = 0; k < CHARCLASS_INTS; ++k)
2037	    (intersect[k] = matches[k] & labels[j][k]) ? (intersectf = 1) : 0;
2038	  if (! intersectf)
2039	    continue;
2040
2041	  /* It does; now find the set differences both ways. */
2042	  leftoversf = matchesf = 0;
2043	  for (k = 0; k < CHARCLASS_INTS; ++k)
2044	    {
2045	      /* Even an optimizing compiler can't know this for sure. */
2046	      int match = matches[k], label = labels[j][k];
2047
2048	      (leftovers[k] = ~match & label) ? (leftoversf = 1) : 0;
2049	      (matches[k] = match & ~label) ? (matchesf = 1) : 0;
2050	    }
2051
2052	  /* If there were leftovers, create a new group labeled with them. */
2053	  if (leftoversf)
2054	    {
2055	      copyset(leftovers, labels[ngrps]);
2056	      copyset(intersect, labels[j]);
2057	      MALLOC(grps[ngrps].elems, position, d->nleaves);
2058	      copy(&grps[j], &grps[ngrps]);
2059	      ++ngrps;
2060	    }
2061
2062	  /* Put the position in the current group.  Note that there is no
2063	     reason to call insert() here. */
2064	  grps[j].elems[grps[j].nelem++] = pos;
2065
2066	  /* If every character matching the current position has been
2067	     accounted for, we're done. */
2068	  if (! matchesf)
2069	    break;
2070	}
2071
2072      /* If we've passed the last group, and there are still characters
2073	 unaccounted for, then we'll have to create a new group. */
2074      if (j == ngrps)
2075	{
2076	  copyset(matches, labels[ngrps]);
2077	  zeroset(matches);
2078	  MALLOC(grps[ngrps].elems, position, d->nleaves);
2079	  grps[ngrps].nelem = 1;
2080	  grps[ngrps].elems[0] = pos;
2081	  ++ngrps;
2082	}
2083    }
2084
2085  MALLOC(follows.elems, position, d->nleaves);
2086  MALLOC(tmp.elems, position, d->nleaves);
2087
2088  /* If we are a searching matcher, the default transition is to a state
2089     containing the positions of state 0, otherwise the default transition
2090     is to fail miserably. */
2091  if (d->searchflag)
2092    {
2093      wants_newline = 0;
2094      wants_letter = 0;
2095      for (i = 0; i < d->states[0].elems.nelem; ++i)
2096	{
2097	  if (PREV_NEWLINE_DEPENDENT(d->states[0].elems.elems[i].constraint))
2098	    wants_newline = 1;
2099	  if (PREV_LETTER_DEPENDENT(d->states[0].elems.elems[i].constraint))
2100	    wants_letter = 1;
2101	}
2102      copy(&d->states[0].elems, &follows);
2103      state = state_index(d, &follows, 0, 0);
2104      if (wants_newline)
2105	state_newline = state_index(d, &follows, 1, 0);
2106      else
2107	state_newline = state;
2108      if (wants_letter)
2109	state_letter = state_index(d, &follows, 0, 1);
2110      else
2111	state_letter = state;
2112      for (i = 0; i < NOTCHAR; ++i)
2113	trans[i] = (IS_WORD_CONSTITUENT(i)) ? state_letter : state;
2114      trans[eolbyte] = state_newline;
2115    }
2116  else
2117    for (i = 0; i < NOTCHAR; ++i)
2118      trans[i] = -1;
2119
2120  for (i = 0; i < ngrps; ++i)
2121    {
2122      follows.nelem = 0;
2123
2124      /* Find the union of the follows of the positions of the group.
2125	 This is a hideously inefficient loop.  Fix it someday. */
2126      for (j = 0; j < grps[i].nelem; ++j)
2127	for (k = 0; k < d->follows[grps[i].elems[j].index].nelem; ++k)
2128	  insert(d->follows[grps[i].elems[j].index].elems[k], &follows);
2129
2130#ifdef MBS_SUPPORT
2131      if (MB_CUR_MAX > 1)
2132	{
2133	  /* If a token in follows.elems is not 1st byte of a multibyte
2134	     character, or the states of follows must accept the bytes
2135	     which are not 1st byte of the multibyte character.
2136	     Then, if a state of follows encounter a byte, it must not be
2137	     a 1st byte of a multibyte character nor singlebyte character.
2138	     We cansel to add state[0].follows to next state, because
2139	     state[0] must accept 1st-byte
2140
2141	     For example, we assume <sb a> is a certain singlebyte
2142	     character, <mb A> is a certain multibyte character, and the
2143	     codepoint of <sb a> equals the 2nd byte of the codepoint of
2144	     <mb A>.
2145	     When state[0] accepts <sb a>, state[i] transit to state[i+1]
2146	     by accepting accepts 1st byte of <mb A>, and state[i+1]
2147	     accepts 2nd byte of <mb A>, if state[i+1] encounter the
2148	     codepoint of <sb a>, it must not be <sb a> but 2nd byte of
2149	     <mb A>, so we can not add state[0].  */
2150
2151	  next_isnt_1st_byte = 0;
2152	  for (j = 0; j < follows.nelem; ++j)
2153	    {
2154	      if (!(d->multibyte_prop[follows.elems[j].index] & 1))
2155		{
2156		  next_isnt_1st_byte = 1;
2157		  break;
2158		}
2159	    }
2160	}
2161#endif
2162
2163      /* If we are building a searching matcher, throw in the positions
2164	 of state 0 as well. */
2165#ifdef MBS_SUPPORT
2166      if (d->searchflag && (MB_CUR_MAX == 1 || !next_isnt_1st_byte))
2167#else
2168      if (d->searchflag)
2169#endif
2170	for (j = 0; j < d->states[0].elems.nelem; ++j)
2171	  insert(d->states[0].elems.elems[j], &follows);
2172
2173      /* Find out if the new state will want any context information. */
2174      wants_newline = 0;
2175      if (tstbit(eolbyte, labels[i]))
2176	for (j = 0; j < follows.nelem; ++j)
2177	  if (PREV_NEWLINE_DEPENDENT(follows.elems[j].constraint))
2178	    wants_newline = 1;
2179
2180      wants_letter = 0;
2181      for (j = 0; j < CHARCLASS_INTS; ++j)
2182	if (labels[i][j] & letters[j])
2183	  break;
2184      if (j < CHARCLASS_INTS)
2185	for (j = 0; j < follows.nelem; ++j)
2186	  if (PREV_LETTER_DEPENDENT(follows.elems[j].constraint))
2187	    wants_letter = 1;
2188
2189      /* Find the state(s) corresponding to the union of the follows. */
2190      state = state_index(d, &follows, 0, 0);
2191      if (wants_newline)
2192	state_newline = state_index(d, &follows, 1, 0);
2193      else
2194	state_newline = state;
2195      if (wants_letter)
2196	state_letter = state_index(d, &follows, 0, 1);
2197      else
2198	state_letter = state;
2199
2200      /* Set the transitions for each character in the current label. */
2201      for (j = 0; j < CHARCLASS_INTS; ++j)
2202	for (k = 0; k < INTBITS; ++k)
2203	  if (labels[i][j] & 1 << k)
2204	    {
2205	      int c = j * INTBITS + k;
2206
2207	      if (c == eolbyte)
2208		trans[c] = state_newline;
2209	      else if (IS_WORD_CONSTITUENT(c))
2210		trans[c] = state_letter;
2211	      else if (c < NOTCHAR)
2212		trans[c] = state;
2213	    }
2214    }
2215
2216  for (i = 0; i < ngrps; ++i)
2217    free(grps[i].elems);
2218  free(follows.elems);
2219  free(tmp.elems);
2220}
2221
2222/* Some routines for manipulating a compiled dfa's transition tables.
2223   Each state may or may not have a transition table; if it does, and it
2224   is a non-accepting state, then d->trans[state] points to its table.
2225   If it is an accepting state then d->fails[state] points to its table.
2226   If it has no table at all, then d->trans[state] is NULL.
2227   TODO: Improve this comment, get rid of the unnecessary redundancy. */
2228
2229static void
2230build_state (int s, struct dfa *d)
2231{
2232  int *trans;			/* The new transition table. */
2233  int i;
2234
2235  /* Set an upper limit on the number of transition tables that will ever
2236     exist at once.  1024 is arbitrary.  The idea is that the frequently
2237     used transition tables will be quickly rebuilt, whereas the ones that
2238     were only needed once or twice will be cleared away. */
2239  if (d->trcount >= 1024)
2240    {
2241      for (i = 0; i < d->tralloc; ++i)
2242	if (d->trans[i])
2243	  {
2244	    free((ptr_t) d->trans[i]);
2245	    d->trans[i] = NULL;
2246	  }
2247	else if (d->fails[i])
2248	  {
2249	    free((ptr_t) d->fails[i]);
2250	    d->fails[i] = NULL;
2251	  }
2252      d->trcount = 0;
2253    }
2254
2255  ++d->trcount;
2256
2257  /* Set up the success bits for this state. */
2258  d->success[s] = 0;
2259  if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 1, d->states[s].letter, 0,
2260      s, *d))
2261    d->success[s] |= 4;
2262  if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 1,
2263      s, *d))
2264    d->success[s] |= 2;
2265  if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 0,
2266      s, *d))
2267    d->success[s] |= 1;
2268
2269  MALLOC(trans, int, NOTCHAR);
2270  dfastate(s, d, trans);
2271
2272  /* Now go through the new transition table, and make sure that the trans
2273     and fail arrays are allocated large enough to hold a pointer for the
2274     largest state mentioned in the table. */
2275  for (i = 0; i < NOTCHAR; ++i)
2276    if (trans[i] >= d->tralloc)
2277      {
2278	int oldalloc = d->tralloc;
2279
2280	while (trans[i] >= d->tralloc)
2281	  d->tralloc *= 2;
2282	REALLOC(d->realtrans, int *, d->tralloc + 1);
2283	d->trans = d->realtrans + 1;
2284	REALLOC(d->fails, int *, d->tralloc);
2285	REALLOC(d->success, int, d->tralloc);
2286	while (oldalloc < d->tralloc)
2287	  {
2288	    d->trans[oldalloc] = NULL;
2289	    d->fails[oldalloc++] = NULL;
2290	  }
2291      }
2292
2293  /* Newline is a sentinel.  */
2294  trans[eolbyte] = -1;
2295
2296  if (ACCEPTING(s, *d))
2297    d->fails[s] = trans;
2298  else
2299    d->trans[s] = trans;
2300}
2301
2302static void
2303build_state_zero (struct dfa *d)
2304{
2305  d->tralloc = 1;
2306  d->trcount = 0;
2307  CALLOC(d->realtrans, int *, d->tralloc + 1);
2308  d->trans = d->realtrans + 1;
2309  CALLOC(d->fails, int *, d->tralloc);
2310  MALLOC(d->success, int, d->tralloc);
2311  build_state(0, d);
2312}
2313
2314#ifdef MBS_SUPPORT
2315/* Multibyte character handling sub-routins for dfaexec.  */
2316
2317/* Initial state may encounter the byte which is not a singlebyte character
2318   nor 1st byte of a multibyte character.  But it is incorrect for initial
2319   state to accept such a byte.
2320   For example, in sjis encoding the regular expression like "\\" accepts
2321   the codepoint 0x5c, but should not accept the 2nd byte of the codepoint
2322   0x815c. Then Initial state must skip the bytes which are not a singlebyte
2323   character nor 1st byte of a multibyte character.  */
2324#define SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p)		\
2325  if (s == 0)						\
2326    {							\
2327      while (inputwcs[p - buf_begin] == 0		\
2328            && mblen_buf[p - buf_begin] > 0		\
2329	    && p < buf_end)				\
2330        ++p;						\
2331      if (p >= end)					\
2332	{						\
2333          free(mblen_buf);				\
2334          free(inputwcs);				\
2335	  return (size_t) -1;				\
2336	}						\
2337    }
2338
2339static void
2340realloc_trans_if_necessary(struct dfa *d, int new_state)
2341{
2342  /* Make sure that the trans and fail arrays are allocated large enough
2343     to hold a pointer for the new state. */
2344  if (new_state >= d->tralloc)
2345    {
2346      int oldalloc = d->tralloc;
2347
2348      while (new_state >= d->tralloc)
2349	d->tralloc *= 2;
2350      REALLOC(d->realtrans, int *, d->tralloc + 1);
2351      d->trans = d->realtrans + 1;
2352      REALLOC(d->fails, int *, d->tralloc);
2353      REALLOC(d->success, int, d->tralloc);
2354      while (oldalloc < d->tralloc)
2355	{
2356	  d->trans[oldalloc] = NULL;
2357	  d->fails[oldalloc++] = NULL;
2358	}
2359    }
2360}
2361
2362/* Return values of transit_state_singlebyte(), and
2363   transit_state_consume_1char.  */
2364typedef enum
2365{
2366  TRANSIT_STATE_IN_PROGRESS,	/* State transition has not finished.  */
2367  TRANSIT_STATE_DONE,		/* State transition has finished.  */
2368  TRANSIT_STATE_END_BUFFER	/* Reach the end of the buffer.  */
2369} status_transit_state;
2370
2371/* Consume a single byte and transit state from 's' to '*next_state'.
2372   This function is almost same as the state transition routin in dfaexec().
2373   But state transition is done just once, otherwise matching succeed or
2374   reach the end of the buffer.  */
2375static status_transit_state
2376transit_state_singlebyte (struct dfa *d, int s, unsigned char const *p,
2377				  int *next_state)
2378{
2379  int *t;
2380  int works = s;
2381
2382  status_transit_state rval = TRANSIT_STATE_IN_PROGRESS;
2383
2384  while (rval == TRANSIT_STATE_IN_PROGRESS)
2385    {
2386      if ((t = d->trans[works]) != NULL)
2387	{
2388	  works = t[*p];
2389	  rval = TRANSIT_STATE_DONE;
2390	  if (works < 0)
2391	    works = 0;
2392	}
2393      else if (works < 0)
2394	{
2395	  if (p == buf_end)
2396	    /* At the moment, it must not happen.  */
2397	    return TRANSIT_STATE_END_BUFFER;
2398	  works = 0;
2399	}
2400      else if (d->fails[works])
2401	{
2402	  works = d->fails[works][*p];
2403	  rval = TRANSIT_STATE_DONE;
2404	}
2405      else
2406	{
2407	  build_state(works, d);
2408	}
2409    }
2410  *next_state = works;
2411  return rval;
2412}
2413
2414/* Check whether period can match or not in the current context.  If it can,
2415   return the amount of the bytes with which period can match, otherwise
2416   return 0.
2417   `pos' is the position of the period.  `index' is the index from the
2418   buf_begin, and it is the current position in the buffer.  */
2419static int
2420match_anychar (struct dfa *d, int s, position pos, int index)
2421{
2422  int newline = 0;
2423  int letter = 0;
2424  wchar_t wc;
2425  int mbclen;
2426
2427  wc = inputwcs[index];
2428  mbclen = (mblen_buf[index] == 0)? 1 : mblen_buf[index];
2429
2430  /* Check context.  */
2431  if (wc == (wchar_t)eolbyte)
2432    {
2433      if (!(syntax_bits & RE_DOT_NEWLINE))
2434	return 0;
2435      newline = 1;
2436    }
2437  else if (wc == (wchar_t)'\0')
2438    {
2439      if (syntax_bits & RE_DOT_NOT_NULL)
2440	return 0;
2441      newline = 1;
2442    }
2443
2444  if (iswalnum(wc) || wc == L'_')
2445    letter = 1;
2446
2447  if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline,
2448			   newline, d->states[s].letter, letter))
2449    return 0;
2450
2451  return mbclen;
2452}
2453
2454/* Check whether bracket expression can match or not in the current context.
2455   If it can, return the amount of the bytes with which expression can match,
2456   otherwise return 0.
2457   `pos' is the position of the bracket expression.  `index' is the index
2458   from the buf_begin, and it is the current position in the buffer.  */
2459int
2460match_mb_charset (struct dfa *d, int s, position pos, int index)
2461{
2462  int i;
2463  int match;		/* Flag which represent that matching succeed.  */
2464  int match_len;	/* Length of the character (or collating element)
2465			   with which this operator match.  */
2466  int op_len;		/* Length of the operator.  */
2467  char buffer[128];
2468  wchar_t wcbuf[6];
2469
2470  /* Pointer to the structure to which we are currently reffering.  */
2471  struct mb_char_classes *work_mbc;
2472
2473  int newline = 0;
2474  int letter = 0;
2475  wchar_t wc;		/* Current reffering character.  */
2476
2477  wc = inputwcs[index];
2478
2479  /* Check context.  */
2480  if (wc == (wchar_t)eolbyte)
2481    {
2482      if (!(syntax_bits & RE_DOT_NEWLINE))
2483	return 0;
2484      newline = 1;
2485    }
2486  else if (wc == (wchar_t)'\0')
2487    {
2488      if (syntax_bits & RE_DOT_NOT_NULL)
2489	return 0;
2490      newline = 1;
2491    }
2492  if (iswalnum(wc) || wc == L'_')
2493    letter = 1;
2494  if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline,
2495			   newline, d->states[s].letter, letter))
2496    return 0;
2497
2498  /* Assign the current reffering operator to work_mbc.  */
2499  work_mbc = &(d->mbcsets[(d->multibyte_prop[pos.index]) >> 2]);
2500  match = !work_mbc->invert;
2501  match_len = (mblen_buf[index] == 0)? 1 : mblen_buf[index];
2502
2503  /* match with a character class?  */
2504  for (i = 0; i<work_mbc->nch_classes; i++)
2505    {
2506      if (iswctype((wint_t)wc, work_mbc->ch_classes[i]))
2507	goto charset_matched;
2508    }
2509
2510  strncpy(buffer, buf_begin + index, match_len);
2511  buffer[match_len] = '\0';
2512
2513  /* match with an equivalent class?  */
2514  for (i = 0; i<work_mbc->nequivs; i++)
2515    {
2516      op_len = strlen(work_mbc->equivs[i]);
2517      strncpy(buffer, buf_begin + index, op_len);
2518      buffer[op_len] = '\0';
2519      if (strcoll(work_mbc->equivs[i], buffer) == 0)
2520	{
2521	  match_len = op_len;
2522	  goto charset_matched;
2523	}
2524    }
2525
2526  /* match with a collating element?  */
2527  for (i = 0; i<work_mbc->ncoll_elems; i++)
2528    {
2529      op_len = strlen(work_mbc->coll_elems[i]);
2530      strncpy(buffer, buf_begin + index, op_len);
2531      buffer[op_len] = '\0';
2532
2533      if (strcoll(work_mbc->coll_elems[i], buffer) == 0)
2534	{
2535	  match_len = op_len;
2536	  goto charset_matched;
2537	}
2538    }
2539
2540  wcbuf[0] = wc;
2541  wcbuf[1] = wcbuf[3] = wcbuf[5] = '\0';
2542
2543  /* match with a range?  */
2544  for (i = 0; i<work_mbc->nranges; i++)
2545    {
2546      wcbuf[2] = work_mbc->range_sts[i];
2547      wcbuf[4] = work_mbc->range_ends[i];
2548
2549      if (wcscoll(wcbuf, wcbuf+2) >= 0 &&
2550	  wcscoll(wcbuf+4, wcbuf) >= 0)
2551	goto charset_matched;
2552    }
2553
2554  /* match with a character?  */
2555  for (i = 0; i<work_mbc->nchars; i++)
2556    {
2557      if (wc == work_mbc->chars[i])
2558	goto charset_matched;
2559    }
2560
2561  match = !match;
2562
2563 charset_matched:
2564  return match ? match_len : 0;
2565}
2566
2567/* Check each of `d->states[s].mbps.elem' can match or not. Then return the
2568   array which corresponds to `d->states[s].mbps.elem' and each element of
2569   the array contains the amount of the bytes with which the element can
2570   match.
2571   `index' is the index from the buf_begin, and it is the current position
2572   in the buffer.
2573   Caller MUST free the array which this function return.  */
2574static int*
2575check_matching_with_multibyte_ops (struct dfa *d, int s, int index)
2576{
2577  int i;
2578  int* rarray;
2579
2580  MALLOC(rarray, int, d->states[s].mbps.nelem);
2581  for (i = 0; i < d->states[s].mbps.nelem; ++i)
2582    {
2583      position pos = d->states[s].mbps.elems[i];
2584      switch(d->tokens[pos.index])
2585	{
2586	case ANYCHAR:
2587	  rarray[i] = match_anychar(d, s, pos, index);
2588	  break;
2589	case MBCSET:
2590	  rarray[i] = match_mb_charset(d, s, pos, index);
2591	  break;
2592	default:
2593	  break; /* can not happen.  */
2594	}
2595    }
2596  return rarray;
2597}
2598
2599/* Consume a single character and enumerate all of the positions which can
2600   be next position from the state `s'.
2601   `match_lens' is the input. It can be NULL, but it can also be the output
2602   of check_matching_with_multibyte_ops() for optimization.
2603   `mbclen' and `pps' are the output.  `mbclen' is the length of the
2604   character consumed, and `pps' is the set this function enumerate.  */
2605static status_transit_state
2606transit_state_consume_1char (struct dfa *d, int s, unsigned char const **pp,
2607			     int *match_lens, int *mbclen, position_set *pps)
2608{
2609  int i, j;
2610  int s1, s2;
2611  int* work_mbls;
2612  status_transit_state rs = TRANSIT_STATE_DONE;
2613
2614  /* Calculate the length of the (single/multi byte) character
2615     to which p points.  */
2616  *mbclen = (mblen_buf[*pp - buf_begin] == 0)? 1
2617    : mblen_buf[*pp - buf_begin];
2618
2619  /* Calculate the state which can be reached from the state `s' by
2620     consuming `*mbclen' single bytes from the buffer.  */
2621  s1 = s;
2622  for (i = 0; i < *mbclen; i++)
2623    {
2624      s2 = s1;
2625      rs = transit_state_singlebyte(d, s2, (*pp)++, &s1);
2626    }
2627  /* Copy the positions contained by `s1' to the set `pps'.  */
2628  copy(&(d->states[s1].elems), pps);
2629
2630  /* Check (inputed)match_lens, and initialize if it is NULL.  */
2631  if (match_lens == NULL && d->states[s].mbps.nelem != 0)
2632    work_mbls = check_matching_with_multibyte_ops(d, s, *pp - buf_begin);
2633  else
2634    work_mbls = match_lens;
2635
2636  /* Add all of the positions which can be reached from `s' by consuming
2637     a single character.  */
2638  for (i = 0; i < d->states[s].mbps.nelem ; i++)
2639   {
2640      if (work_mbls[i] == *mbclen)
2641	for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem;
2642	     j++)
2643	  insert(d->follows[d->states[s].mbps.elems[i].index].elems[j],
2644		 pps);
2645    }
2646
2647  if (match_lens == NULL && work_mbls != NULL)
2648    free(work_mbls);
2649  return rs;
2650}
2651
2652/* Transit state from s, then return new state and update the pointer of the
2653   buffer.  This function is for some operator which can match with a multi-
2654   byte character or a collating element(which may be multi characters).  */
2655static int
2656transit_state (struct dfa *d, int s, unsigned char const **pp)
2657{
2658  int s1;
2659  int mbclen;		/* The length of current input multibyte character. */
2660  int maxlen = 0;
2661  int i, j;
2662  int *match_lens = NULL;
2663  int nelem = d->states[s].mbps.nelem; /* Just a alias.  */
2664  position_set follows;
2665  unsigned char const *p1 = *pp;
2666  status_transit_state rs;
2667  wchar_t wc;
2668
2669  if (nelem > 0)
2670    /* This state has (a) multibyte operator(s).
2671       We check whether each of them can match or not.  */
2672    {
2673      /* Note: caller must free the return value of this function.  */
2674      match_lens = check_matching_with_multibyte_ops(d, s, *pp - buf_begin);
2675
2676      for (i = 0; i < nelem; i++)
2677	/* Search the operator which match the longest string,
2678	   in this state.  */
2679	{
2680	  if (match_lens[i] > maxlen)
2681	    maxlen = match_lens[i];
2682	}
2683    }
2684
2685  if (nelem == 0 || maxlen == 0)
2686    /* This state has no multibyte operator which can match.
2687       We need to  check only one singlebyte character.  */
2688    {
2689      status_transit_state rs;
2690      rs = transit_state_singlebyte(d, s, *pp, &s1);
2691
2692      /* We must update the pointer if state transition succeeded.  */
2693      if (rs == TRANSIT_STATE_DONE)
2694	++*pp;
2695
2696      if (match_lens != NULL)
2697	free(match_lens);
2698      return s1;
2699    }
2700
2701  /* This state has some operators which can match a multibyte character.  */
2702  follows.nelem = 0;
2703  MALLOC(follows.elems, position, d->nleaves);
2704
2705  /* `maxlen' may be longer than the length of a character, because it may
2706     not be a character but a (multi character) collating element.
2707     We enumerate all of the positions which `s' can reach by consuming
2708     `maxlen' bytes.  */
2709  rs = transit_state_consume_1char(d, s, pp, match_lens, &mbclen, &follows);
2710
2711  wc = inputwcs[*pp - mbclen - buf_begin];
2712  s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc));
2713  realloc_trans_if_necessary(d, s1);
2714
2715  while (*pp - p1 < maxlen)
2716    {
2717      follows.nelem = 0;
2718      rs = transit_state_consume_1char(d, s1, pp, NULL, &mbclen, &follows);
2719
2720      for (i = 0; i < nelem ; i++)
2721	{
2722	  if (match_lens[i] == *pp - p1)
2723	    for (j = 0;
2724		 j < d->follows[d->states[s1].mbps.elems[i].index].nelem; j++)
2725	      insert(d->follows[d->states[s1].mbps.elems[i].index].elems[j],
2726		     &follows);
2727	}
2728
2729      wc = inputwcs[*pp - mbclen - buf_begin];
2730      s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc));
2731      realloc_trans_if_necessary(d, s1);
2732    }
2733  free(match_lens);
2734  free(follows.elems);
2735  return s1;
2736}
2737
2738#endif
2739
2740/* Search through a buffer looking for a match to the given struct dfa.
2741   Find the first occurrence of a string matching the regexp in the buffer,
2742   and the shortest possible version thereof.  Return the offset of the first
2743   character after the match, or (size_t) -1 if none is found.  BEGIN points to
2744   the beginning of the buffer, and SIZE is the size of the buffer.  If SIZE
2745   is nonzero, BEGIN[SIZE - 1] must be a newline.  BACKREF points to a place
2746   where we're supposed to store a 1 if backreferencing happened and the
2747   match needs to be verified by a backtracking matcher.  Otherwise
2748   we store a 0 in *backref. */
2749size_t
2750dfaexec (struct dfa *d, char const *begin, size_t size, int *backref)
2751{
2752  register int s;	/* Current state. */
2753  register unsigned char const *p; /* Current input character. */
2754  register unsigned char const *end; /* One past the last input character.  */
2755  register int **trans, *t;	/* Copy of d->trans so it can be optimized
2756				   into a register. */
2757  register unsigned char eol = eolbyte;	/* Likewise for eolbyte.  */
2758  static int sbit[NOTCHAR];	/* Table for anding with d->success. */
2759  static int sbit_init;
2760
2761  if (! sbit_init)
2762    {
2763      int i;
2764
2765      sbit_init = 1;
2766      for (i = 0; i < NOTCHAR; ++i)
2767	sbit[i] = (IS_WORD_CONSTITUENT(i)) ? 2 : 1;
2768      sbit[eol] = 4;
2769    }
2770
2771  if (! d->tralloc)
2772    build_state_zero(d);
2773
2774  s = 0;
2775  p = (unsigned char const *) begin;
2776  end = p + size;
2777  trans = d->trans;
2778
2779#ifdef MBS_SUPPORT
2780  if (MB_CUR_MAX > 1)
2781    {
2782      int remain_bytes, i;
2783      buf_begin = begin;
2784      buf_end = end;
2785
2786      /* initialize mblen_buf, and inputwcs.  */
2787      MALLOC(mblen_buf, unsigned char, end - (unsigned char const *)begin + 2);
2788      MALLOC(inputwcs, wchar_t, end - (unsigned char const *)begin + 2);
2789      memset(&mbs, 0, sizeof(mbstate_t));
2790      remain_bytes = 0;
2791      for (i = 0; i < end - (unsigned char const *)begin + 1; i++)
2792	{
2793	  if (remain_bytes == 0)
2794	    {
2795	      remain_bytes
2796		= mbrtowc(inputwcs + i, begin + i,
2797			  end - (unsigned char const *)begin - i + 1, &mbs);
2798	      if (remain_bytes <= 1)
2799		{
2800		  remain_bytes = 0;
2801		  inputwcs[i] = (wchar_t)begin[i];
2802		  mblen_buf[i] = 0;
2803		}
2804	      else
2805		{
2806		  mblen_buf[i] = remain_bytes;
2807		  remain_bytes--;
2808		}
2809	    }
2810	  else
2811	    {
2812	      mblen_buf[i] = remain_bytes;
2813	      inputwcs[i] = 0;
2814	      remain_bytes--;
2815	    }
2816	}
2817      mblen_buf[i] = 0;
2818      inputwcs[i] = 0; /* sentinel */
2819    }
2820#endif /* MBS_SUPPORT */
2821
2822  for (;;)
2823    {
2824#ifdef MBS_SUPPORT
2825      if (MB_CUR_MAX > 1)
2826	while ((t = trans[s]))
2827	  {
2828	    if (d->states[s].mbps.nelem != 0)
2829	      {
2830		/* Can match with a multibyte character( and multi character
2831		   collating element).  */
2832		unsigned char const *nextp;
2833
2834		SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
2835
2836		nextp = p;
2837		s = transit_state(d, s, &nextp);
2838		p = nextp;
2839
2840		/* Trans table might be updated.  */
2841		trans = d->trans;
2842	      }
2843	    else
2844	      {
2845		SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
2846		s = t[*p++];
2847	      }
2848	  }
2849      else
2850#endif /* MBS_SUPPORT */
2851        while ((t = trans[s]))
2852	  s = t[*p++];
2853
2854      if (s < 0)
2855	{
2856	  if (p == end)
2857	    {
2858#ifdef MBS_SUPPORT
2859	      if (MB_CUR_MAX > 1)
2860		{
2861		  free(mblen_buf);
2862		  free(inputwcs);
2863		}
2864#endif /* MBS_SUPPORT */
2865	      return (size_t) -1;
2866	    }
2867	  s = 0;
2868	}
2869      else if ((t = d->fails[s]))
2870	{
2871	  if (d->success[s] & sbit[*p])
2872	    {
2873	      if (backref)
2874		*backref = (d->states[s].backref != 0);
2875#ifdef MBS_SUPPORT
2876	      if (MB_CUR_MAX > 1)
2877		{
2878		  free(mblen_buf);
2879		  free(inputwcs);
2880		}
2881#endif /* MBS_SUPPORT */
2882	      return (char const *) p - begin;
2883	    }
2884
2885#ifdef MBS_SUPPORT
2886	  if (MB_CUR_MAX > 1)
2887	    {
2888		SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
2889		if (d->states[s].mbps.nelem != 0)
2890		  {
2891		    /* Can match with a multibyte character( and multi
2892		       character collating element).  */
2893		    unsigned char const *nextp;
2894		    nextp = p;
2895		    s = transit_state(d, s, &nextp);
2896		    p = nextp;
2897
2898		    /* Trans table might be updated.  */
2899		    trans = d->trans;
2900		  }
2901		else
2902		s = t[*p++];
2903	    }
2904	  else
2905#endif /* MBS_SUPPORT */
2906	  s = t[*p++];
2907	}
2908      else
2909	{
2910	  build_state(s, d);
2911	  trans = d->trans;
2912	}
2913    }
2914}
2915
2916/* Initialize the components of a dfa that the other routines don't
2917   initialize for themselves. */
2918void
2919dfainit (struct dfa *d)
2920{
2921  d->calloc = 1;
2922  MALLOC(d->charclasses, charclass, d->calloc);
2923  d->cindex = 0;
2924
2925  d->talloc = 1;
2926  MALLOC(d->tokens, token, d->talloc);
2927  d->tindex = d->depth = d->nleaves = d->nregexps = 0;
2928#ifdef MBS_SUPPORT
2929  if (MB_CUR_MAX > 1)
2930    {
2931      d->nmultibyte_prop = 1;
2932      MALLOC(d->multibyte_prop, int, d->nmultibyte_prop);
2933      d->nmbcsets = 0;
2934      d->mbcsets_alloc = 1;
2935      MALLOC(d->mbcsets, struct mb_char_classes, d->mbcsets_alloc);
2936    }
2937#endif
2938
2939  d->searchflag = 0;
2940  d->tralloc = 0;
2941
2942  d->musts = 0;
2943}
2944
2945/* Parse and analyze a single string of the given length. */
2946void
2947dfacomp (char const *s, size_t len, struct dfa *d, int searchflag)
2948{
2949  if (case_fold)	/* dummy folding in service of dfamust() */
2950    {
2951      char *lcopy;
2952      int i;
2953
2954      lcopy = malloc(len);
2955      if (!lcopy)
2956	dfaerror(_("out of memory"));
2957
2958      /* This is a kludge. */
2959      case_fold = 0;
2960      for (i = 0; i < len; ++i)
2961	if (ISUPPER ((unsigned char) s[i]))
2962	  lcopy[i] = tolower ((unsigned char) s[i]);
2963	else
2964	  lcopy[i] = s[i];
2965
2966      dfainit(d);
2967      dfaparse(lcopy, len, d);
2968      free(lcopy);
2969      dfamust(d);
2970      d->cindex = d->tindex = d->depth = d->nleaves = d->nregexps = 0;
2971      case_fold = 1;
2972      dfaparse(s, len, d);
2973      dfaanalyze(d, searchflag);
2974    }
2975  else
2976    {
2977        dfainit(d);
2978        dfaparse(s, len, d);
2979	dfamust(d);
2980        dfaanalyze(d, searchflag);
2981    }
2982}
2983
2984/* Free the storage held by the components of a dfa. */
2985void
2986dfafree (struct dfa *d)
2987{
2988  int i;
2989  struct dfamust *dm, *ndm;
2990
2991  free((ptr_t) d->charclasses);
2992  free((ptr_t) d->tokens);
2993
2994#ifdef MBS_SUPPORT
2995  if (MB_CUR_MAX > 1)
2996    {
2997      free((ptr_t) d->multibyte_prop);
2998      for (i = 0; i < d->nmbcsets; ++i)
2999	{
3000	  int j;
3001	  struct mb_char_classes *p = &(d->mbcsets[i]);
3002	  if (p->chars != NULL)
3003	    free(p->chars);
3004	  if (p->ch_classes != NULL)
3005	    free(p->ch_classes);
3006	  if (p->range_sts != NULL)
3007	    free(p->range_sts);
3008	  if (p->range_ends != NULL)
3009	    free(p->range_ends);
3010
3011	  for (j = 0; j < p->nequivs; ++j)
3012	    free(p->equivs[j]);
3013	  if (p->equivs != NULL)
3014	    free(p->equivs);
3015
3016	  for (j = 0; j < p->ncoll_elems; ++j)
3017	    free(p->coll_elems[j]);
3018	  if (p->coll_elems != NULL)
3019	    free(p->coll_elems);
3020	}
3021      free((ptr_t) d->mbcsets);
3022    }
3023#endif /* MBS_SUPPORT */
3024
3025  for (i = 0; i < d->sindex; ++i)
3026    free((ptr_t) d->states[i].elems.elems);
3027  free((ptr_t) d->states);
3028  for (i = 0; i < d->tindex; ++i)
3029    if (d->follows[i].elems)
3030      free((ptr_t) d->follows[i].elems);
3031  free((ptr_t) d->follows);
3032  for (i = 0; i < d->tralloc; ++i)
3033    if (d->trans[i])
3034      free((ptr_t) d->trans[i]);
3035    else if (d->fails[i])
3036      free((ptr_t) d->fails[i]);
3037  if (d->realtrans) free((ptr_t) d->realtrans);
3038  if (d->fails) free((ptr_t) d->fails);
3039  if (d->success) free((ptr_t) d->success);
3040  for (dm = d->musts; dm; dm = ndm)
3041    {
3042      ndm = dm->next;
3043      free(dm->must);
3044      free((ptr_t) dm);
3045    }
3046}
3047
3048/* Having found the postfix representation of the regular expression,
3049   try to find a long sequence of characters that must appear in any line
3050   containing the r.e.
3051   Finding a "longest" sequence is beyond the scope here;
3052   we take an easy way out and hope for the best.
3053   (Take "(ab|a)b"--please.)
3054
3055   We do a bottom-up calculation of sequences of characters that must appear
3056   in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3057   representation:
3058	sequences that must appear at the left of the match ("left")
3059	sequences that must appear at the right of the match ("right")
3060	lists of sequences that must appear somewhere in the match ("in")
3061	sequences that must constitute the match ("is")
3062
3063   When we get to the root of the tree, we use one of the longest of its
3064   calculated "in" sequences as our answer.  The sequence we find is returned in
3065   d->must (where "d" is the single argument passed to "dfamust");
3066   the length of the sequence is returned in d->mustn.
3067
3068   The sequences calculated for the various types of node (in pseudo ANSI c)
3069   are shown below.  "p" is the operand of unary operators (and the left-hand
3070   operand of binary operators); "q" is the right-hand operand of binary
3071   operators.
3072
3073   "ZERO" means "a zero-length sequence" below.
3074
3075	Type	left		right		is		in
3076	----	----		-----		--		--
3077	char c	# c		# c		# c		# c
3078
3079	ANYCHAR	ZERO		ZERO		ZERO		ZERO
3080
3081	MBCSET	ZERO		ZERO		ZERO		ZERO
3082
3083	CSET	ZERO		ZERO		ZERO		ZERO
3084
3085	STAR	ZERO		ZERO		ZERO		ZERO
3086
3087	QMARK	ZERO		ZERO		ZERO		ZERO
3088
3089	PLUS	p->left		p->right	ZERO		p->in
3090
3091	CAT	(p->is==ZERO)?	(q->is==ZERO)?	(p->is!=ZERO &&	p->in plus
3092		p->left :	q->right :	q->is!=ZERO) ?	q->in plus
3093		p->is##q->left	p->right##q->is	p->is##q->is :	p->right##q->left
3094						ZERO
3095
3096	OR	longest common	longest common	(do p->is and	substrings common to
3097		leading		trailing	q->is have same	p->in and q->in
3098		(sub)sequence	(sub)sequence	length and
3099		of p->left	of p->right	content) ?
3100		and q->left	and q->right	p->is : NULL
3101
3102   If there's anything else we recognize in the tree, all four sequences get set
3103   to zero-length sequences.  If there's something we don't recognize in the tree,
3104   we just return a zero-length sequence.
3105
3106   Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3107   'aaa')?
3108
3109   And. . .is it here or someplace that we might ponder "optimizations" such as
3110	egrep 'psi|epsilon'	->	egrep 'psi'
3111	egrep 'pepsi|epsilon'	->	egrep 'epsi'
3112					(Yes, we now find "epsi" as a "string
3113					that must occur", but we might also
3114					simplify the *entire* r.e. being sought)
3115	grep '[c]'		->	grep 'c'
3116	grep '(ab|a)b'		->	grep 'ab'
3117	grep 'ab*'		->	grep 'a'
3118	grep 'a*b'		->	grep 'b'
3119
3120   There are several issues:
3121
3122   Is optimization easy (enough)?
3123
3124   Does optimization actually accomplish anything,
3125   or is the automaton you get from "psi|epsilon" (for example)
3126   the same as the one you get from "psi" (for example)?
3127
3128   Are optimizable r.e.'s likely to be used in real-life situations
3129   (something like 'ab*' is probably unlikely; something like is
3130   'psi|epsilon' is likelier)? */
3131
3132static char *
3133icatalloc (char *old, char *new)
3134{
3135  char *result;
3136  size_t oldsize, newsize;
3137
3138  newsize = (new == NULL) ? 0 : strlen(new);
3139  if (old == NULL)
3140    oldsize = 0;
3141  else if (newsize == 0)
3142    return old;
3143  else	oldsize = strlen(old);
3144  if (old == NULL)
3145    result = (char *) malloc(newsize + 1);
3146  else
3147    result = (char *) realloc((void *) old, oldsize + newsize + 1);
3148  if (result != NULL && new != NULL)
3149    (void) strcpy(result + oldsize, new);
3150  return result;
3151}
3152
3153static char *
3154icpyalloc (char *string)
3155{
3156  return icatalloc((char *) NULL, string);
3157}
3158
3159static char *
3160istrstr (char *lookin, char *lookfor)
3161{
3162  char *cp;
3163  size_t len;
3164
3165  len = strlen(lookfor);
3166  for (cp = lookin; *cp != '\0'; ++cp)
3167    if (strncmp(cp, lookfor, len) == 0)
3168      return cp;
3169  return NULL;
3170}
3171
3172static void
3173ifree (char *cp)
3174{
3175  if (cp != NULL)
3176    free(cp);
3177}
3178
3179static void
3180freelist (char **cpp)
3181{
3182  int i;
3183
3184  if (cpp == NULL)
3185    return;
3186  for (i = 0; cpp[i] != NULL; ++i)
3187    {
3188      free(cpp[i]);
3189      cpp[i] = NULL;
3190    }
3191}
3192
3193static char **
3194enlist (char **cpp, char *new, size_t len)
3195{
3196  int i, j;
3197
3198  if (cpp == NULL)
3199    return NULL;
3200  if ((new = icpyalloc(new)) == NULL)
3201    {
3202      freelist(cpp);
3203      return NULL;
3204    }
3205  new[len] = '\0';
3206  /* Is there already something in the list that's new (or longer)? */
3207  for (i = 0; cpp[i] != NULL; ++i)
3208    if (istrstr(cpp[i], new) != NULL)
3209      {
3210	free(new);
3211	return cpp;
3212      }
3213  /* Eliminate any obsoleted strings. */
3214  j = 0;
3215  while (cpp[j] != NULL)
3216    if (istrstr(new, cpp[j]) == NULL)
3217      ++j;
3218    else
3219      {
3220	free(cpp[j]);
3221	if (--i == j)
3222	  break;
3223	cpp[j] = cpp[i];
3224	cpp[i] = NULL;
3225      }
3226  /* Add the new string. */
3227  cpp = (char **) realloc((char *) cpp, (i + 2) * sizeof *cpp);
3228  if (cpp == NULL)
3229    return NULL;
3230  cpp[i] = new;
3231  cpp[i + 1] = NULL;
3232  return cpp;
3233}
3234
3235/* Given pointers to two strings, return a pointer to an allocated
3236   list of their distinct common substrings. Return NULL if something
3237   seems wild. */
3238static char **
3239comsubs (char *left, char *right)
3240{
3241  char **cpp;
3242  char *lcp;
3243  char *rcp;
3244  size_t i, len;
3245
3246  if (left == NULL || right == NULL)
3247    return NULL;
3248  cpp = (char **) malloc(sizeof *cpp);
3249  if (cpp == NULL)
3250    return NULL;
3251  cpp[0] = NULL;
3252  for (lcp = left; *lcp != '\0'; ++lcp)
3253    {
3254      len = 0;
3255      rcp = strchr (right, *lcp);
3256      while (rcp != NULL)
3257	{
3258	  for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
3259	    continue;
3260	  if (i > len)
3261	    len = i;
3262	  rcp = strchr (rcp + 1, *lcp);
3263	}
3264      if (len == 0)
3265	continue;
3266      if ((cpp = enlist(cpp, lcp, len)) == NULL)
3267	break;
3268    }
3269  return cpp;
3270}
3271
3272static char **
3273addlists (char **old, char **new)
3274{
3275  int i;
3276
3277  if (old == NULL || new == NULL)
3278    return NULL;
3279  for (i = 0; new[i] != NULL; ++i)
3280    {
3281      old = enlist(old, new[i], strlen(new[i]));
3282      if (old == NULL)
3283	break;
3284    }
3285  return old;
3286}
3287
3288/* Given two lists of substrings, return a new list giving substrings
3289   common to both. */
3290static char **
3291inboth (char **left, char **right)
3292{
3293  char **both;
3294  char **temp;
3295  int lnum, rnum;
3296
3297  if (left == NULL || right == NULL)
3298    return NULL;
3299  both = (char **) malloc(sizeof *both);
3300  if (both == NULL)
3301    return NULL;
3302  both[0] = NULL;
3303  for (lnum = 0; left[lnum] != NULL; ++lnum)
3304    {
3305      for (rnum = 0; right[rnum] != NULL; ++rnum)
3306	{
3307	  temp = comsubs(left[lnum], right[rnum]);
3308	  if (temp == NULL)
3309	    {
3310	      freelist(both);
3311	      return NULL;
3312	    }
3313	  both = addlists(both, temp);
3314	  freelist(temp);
3315	  free(temp);
3316	  if (both == NULL)
3317	    return NULL;
3318	}
3319    }
3320  return both;
3321}
3322
3323typedef struct
3324{
3325  char **in;
3326  char *left;
3327  char *right;
3328  char *is;
3329} must;
3330
3331static void
3332resetmust (must *mp)
3333{
3334  mp->left[0] = mp->right[0] = mp->is[0] = '\0';
3335  freelist(mp->in);
3336}
3337
3338static void
3339dfamust (struct dfa *dfa)
3340{
3341  must *musts;
3342  must *mp;
3343  char *result;
3344  int ri;
3345  int i;
3346  int exact;
3347  token t;
3348  static must must0;
3349  struct dfamust *dm;
3350  static char empty_string[] = "";
3351
3352  result = empty_string;
3353  exact = 0;
3354  musts = (must *) malloc((dfa->tindex + 1) * sizeof *musts);
3355  if (musts == NULL)
3356    return;
3357  mp = musts;
3358  for (i = 0; i <= dfa->tindex; ++i)
3359    mp[i] = must0;
3360  for (i = 0; i <= dfa->tindex; ++i)
3361    {
3362      mp[i].in = (char **) malloc(sizeof *mp[i].in);
3363      mp[i].left = malloc(2);
3364      mp[i].right = malloc(2);
3365      mp[i].is = malloc(2);
3366      if (mp[i].in == NULL || mp[i].left == NULL ||
3367	  mp[i].right == NULL || mp[i].is == NULL)
3368	goto done;
3369      mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0';
3370      mp[i].in[0] = NULL;
3371    }
3372#ifdef DEBUG
3373  fprintf(stderr, "dfamust:\n");
3374  for (i = 0; i < dfa->tindex; ++i)
3375    {
3376      fprintf(stderr, " %d:", i);
3377      prtok(dfa->tokens[i]);
3378    }
3379  putc('\n', stderr);
3380#endif
3381  for (ri = 0; ri < dfa->tindex; ++ri)
3382    {
3383      switch (t = dfa->tokens[ri])
3384	{
3385	case LPAREN:
3386	case RPAREN:
3387	  goto done;		/* "cannot happen" */
3388	case EMPTY:
3389	case BEGLINE:
3390	case ENDLINE:
3391	case BEGWORD:
3392	case ENDWORD:
3393	case LIMWORD:
3394	case NOTLIMWORD:
3395	case BACKREF:
3396	  resetmust(mp);
3397	  break;
3398	case STAR:
3399	case QMARK:
3400	  if (mp <= musts)
3401	    goto done;		/* "cannot happen" */
3402	  --mp;
3403	  resetmust(mp);
3404	  break;
3405	case OR:
3406	case ORTOP:
3407	  if (mp < &musts[2])
3408	    goto done;		/* "cannot happen" */
3409	  {
3410	    char **new;
3411	    must *lmp;
3412	    must *rmp;
3413	    int j, ln, rn, n;
3414
3415	    rmp = --mp;
3416	    lmp = --mp;
3417	    /* Guaranteed to be.  Unlikely, but. . . */
3418	    if (strcmp(lmp->is, rmp->is) != 0)
3419	      lmp->is[0] = '\0';
3420	    /* Left side--easy */
3421	    i = 0;
3422	    while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
3423	      ++i;
3424	    lmp->left[i] = '\0';
3425	    /* Right side */
3426	    ln = strlen(lmp->right);
3427	    rn = strlen(rmp->right);
3428	    n = ln;
3429	    if (n > rn)
3430	      n = rn;
3431	    for (i = 0; i < n; ++i)
3432	      if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
3433		break;
3434	    for (j = 0; j < i; ++j)
3435	      lmp->right[j] = lmp->right[(ln - i) + j];
3436	    lmp->right[j] = '\0';
3437	    new = inboth(lmp->in, rmp->in);
3438	    if (new == NULL)
3439	      goto done;
3440	    freelist(lmp->in);
3441	    free((char *) lmp->in);
3442	    lmp->in = new;
3443	  }
3444	  break;
3445	case PLUS:
3446	  if (mp <= musts)
3447	    goto done;		/* "cannot happen" */
3448	  --mp;
3449	  mp->is[0] = '\0';
3450	  break;
3451	case END:
3452	  if (mp != &musts[1])
3453	    goto done;		/* "cannot happen" */
3454	  for (i = 0; musts[0].in[i] != NULL; ++i)
3455	    if (strlen(musts[0].in[i]) > strlen(result))
3456	      result = musts[0].in[i];
3457	  if (strcmp(result, musts[0].is) == 0)
3458	    exact = 1;
3459	  goto done;
3460	case CAT:
3461	  if (mp < &musts[2])
3462	    goto done;		/* "cannot happen" */
3463	  {
3464	    must *lmp;
3465	    must *rmp;
3466
3467	    rmp = --mp;
3468	    lmp = --mp;
3469	    /* In.  Everything in left, plus everything in
3470	       right, plus catenation of
3471	       left's right and right's left. */
3472	    lmp->in = addlists(lmp->in, rmp->in);
3473	    if (lmp->in == NULL)
3474	      goto done;
3475	    if (lmp->right[0] != '\0' &&
3476		rmp->left[0] != '\0')
3477	      {
3478		char *tp;
3479
3480		tp = icpyalloc(lmp->right);
3481		if (tp == NULL)
3482		  goto done;
3483		tp = icatalloc(tp, rmp->left);
3484		if (tp == NULL)
3485		  goto done;
3486		lmp->in = enlist(lmp->in, tp,
3487				 strlen(tp));
3488		free(tp);
3489		if (lmp->in == NULL)
3490		  goto done;
3491	      }
3492	    /* Left-hand */
3493	    if (lmp->is[0] != '\0')
3494	      {
3495		lmp->left = icatalloc(lmp->left,
3496				      rmp->left);
3497		if (lmp->left == NULL)
3498		  goto done;
3499	      }
3500	    /* Right-hand */
3501	    if (rmp->is[0] == '\0')
3502	      lmp->right[0] = '\0';
3503	    lmp->right = icatalloc(lmp->right, rmp->right);
3504	    if (lmp->right == NULL)
3505	      goto done;
3506	    /* Guaranteed to be */
3507	    if (lmp->is[0] != '\0' && rmp->is[0] != '\0')
3508	      {
3509		lmp->is = icatalloc(lmp->is, rmp->is);
3510		if (lmp->is == NULL)
3511		  goto done;
3512	      }
3513	    else
3514	      lmp->is[0] = '\0';
3515	  }
3516	  break;
3517	default:
3518	  if (t < END)
3519	    {
3520	      /* "cannot happen" */
3521	      goto done;
3522	    }
3523	  else if (t == '\0')
3524	    {
3525	      /* not on *my* shift */
3526	      goto done;
3527	    }
3528	  else if (t >= CSET
3529#ifdef MBS_SUPPORT
3530		   || t == ANYCHAR
3531		   || t == MBCSET
3532#endif /* MBS_SUPPORT */
3533		   )
3534	    {
3535	      /* easy enough */
3536	      resetmust(mp);
3537	    }
3538	  else
3539	    {
3540	      /* plain character */
3541	      resetmust(mp);
3542	      mp->is[0] = mp->left[0] = mp->right[0] = t;
3543	      mp->is[1] = mp->left[1] = mp->right[1] = '\0';
3544	      mp->in = enlist(mp->in, mp->is, (size_t)1);
3545	      if (mp->in == NULL)
3546		goto done;
3547	    }
3548	  break;
3549	}
3550#ifdef DEBUG
3551      fprintf(stderr, " node: %d:", ri);
3552      prtok(dfa->tokens[ri]);
3553      fprintf(stderr, "\n  in:");
3554      for (i = 0; mp->in[i]; ++i)
3555	fprintf(stderr, " \"%s\"", mp->in[i]);
3556      fprintf(stderr, "\n  is: \"%s\"\n", mp->is);
3557      fprintf(stderr, "  left: \"%s\"\n", mp->left);
3558      fprintf(stderr, "  right: \"%s\"\n", mp->right);
3559#endif
3560      ++mp;
3561    }
3562 done:
3563  if (strlen(result))
3564    {
3565      dm = (struct dfamust *) malloc(sizeof (struct dfamust));
3566      dm->exact = exact;
3567      dm->must = malloc(strlen(result) + 1);
3568      strcpy(dm->must, result);
3569      dm->next = dfa->musts;
3570      dfa->musts = dm;
3571    }
3572  mp = musts;
3573  for (i = 0; i <= dfa->tindex; ++i)
3574    {
3575      freelist(mp[i].in);
3576      ifree((char *) mp[i].in);
3577      ifree(mp[i].left);
3578      ifree(mp[i].right);
3579      ifree(mp[i].is);
3580    }
3581  free((char *) mp);
3582}
3583/* vim:set shiftwidth=2: */
3584