1/* dfa.c - deterministic extended regexp routines for GNU
2   Copyright 1988, 1998, 2000 Free Software Foundation, Inc.
3
4   This program is free software; you can redistribute it and/or modify
5   it under the terms of the GNU General Public License as published by
6   the Free Software Foundation; either version 2, or (at your option)
7   any later version.
8
9   This program is distributed in the hope that it will be useful,
10   but WITHOUT ANY WARRANTY; without even the implied warranty of
11   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12   GNU General Public License for more details.
13
14   You should have received a copy of the GNU General Public License
15   along with this program; if not, write to the Free Software
16   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA */
17
18/* Written June, 1988 by Mike Haertel
19   Modified July, 1988 by Arthur David Olson to assist BMG speedups  */
20
21/* $FreeBSD$ */
22
23#ifdef HAVE_CONFIG_H
24#include <config.h>
25#endif
26
27#include <assert.h>
28#include <ctype.h>
29#include <stdio.h>
30
31#include <sys/types.h>
32#ifdef STDC_HEADERS
33#include <stdlib.h>
34#else
35extern char *calloc(), *malloc(), *realloc();
36extern void free();
37#endif
38
39#if defined(HAVE_STRING_H) || defined(STDC_HEADERS)
40#include <string.h>
41#else
42#include <strings.h>
43#endif
44
45#if HAVE_SETLOCALE
46# include <locale.h>
47#endif
48
49#if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC
50/* We can handle multibyte string.  */
51# define MBS_SUPPORT
52#endif
53
54#ifdef MBS_SUPPORT
55# include <wchar.h>
56# include <wctype.h>
57#endif
58
59#ifndef DEBUG	/* use the same approach as regex.c */
60#undef assert
61#define assert(e)
62#endif /* DEBUG */
63
64#ifndef isgraph
65#define isgraph(C) (isprint(C) && !isspace(C))
66#endif
67
68#if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
69#define ISALPHA(C) isalpha(C)
70#define ISUPPER(C) isupper(C)
71#define ISLOWER(C) islower(C)
72#define ISDIGIT(C) isdigit(C)
73#define ISXDIGIT(C) isxdigit(C)
74#define ISSPACE(C) isspace(C)
75#define ISPUNCT(C) ispunct(C)
76#define ISALNUM(C) isalnum(C)
77#define ISPRINT(C) isprint(C)
78#define ISGRAPH(C) isgraph(C)
79#define ISCNTRL(C) iscntrl(C)
80#else
81#define ISALPHA(C) (isascii(C) && isalpha(C))
82#define ISUPPER(C) (isascii(C) && isupper(C))
83#define ISLOWER(C) (isascii(C) && islower(C))
84#define ISDIGIT(C) (isascii(C) && isdigit(C))
85#define ISXDIGIT(C) (isascii(C) && isxdigit(C))
86#define ISSPACE(C) (isascii(C) && isspace(C))
87#define ISPUNCT(C) (isascii(C) && ispunct(C))
88#define ISALNUM(C) (isascii(C) && isalnum(C))
89#define ISPRINT(C) (isascii(C) && isprint(C))
90#define ISGRAPH(C) (isascii(C) && isgraph(C))
91#define ISCNTRL(C) (isascii(C) && iscntrl(C))
92#endif
93
94/* ISASCIIDIGIT differs from ISDIGIT, as follows:
95   - Its arg may be any int or unsigned int; it need not be an unsigned char.
96   - It's guaranteed to evaluate its argument exactly once.
97   - It's typically faster.
98   Posix 1003.2-1992 section 2.5.2.1 page 50 lines 1556-1558 says that
99   only '0' through '9' are digits.  Prefer ISASCIIDIGIT to ISDIGIT unless
100   it's important to use the locale's definition of `digit' even when the
101   host does not conform to Posix.  */
102#define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9)
103
104/* If we (don't) have I18N.  */
105/* glibc defines _ */
106#ifndef _
107# ifdef HAVE_LIBINTL_H
108#  include <libintl.h>
109#  ifndef _
110#   define _(Str) gettext (Str)
111#  endif
112# else
113#  define _(Str) (Str)
114# endif
115#endif
116
117#include "regex.h"
118#include "dfa.h"
119#include "hard-locale.h"
120
121/* HPUX, define those as macros in sys/param.h */
122#ifdef setbit
123# undef setbit
124#endif
125#ifdef clrbit
126# undef clrbit
127#endif
128
129static void dfamust PARAMS ((struct dfa *dfa));
130static void regexp PARAMS ((int toplevel));
131
132static ptr_t
133xcalloc (size_t n, size_t s)
134{
135  ptr_t r = calloc(n, s);
136
137  if (!r)
138    dfaerror(_("Memory exhausted"));
139  return r;
140}
141
142static ptr_t
143xmalloc (size_t n)
144{
145  ptr_t r = malloc(n);
146
147  assert(n != 0);
148  if (!r)
149    dfaerror(_("Memory exhausted"));
150  return r;
151}
152
153static ptr_t
154xrealloc (ptr_t p, size_t n)
155{
156  ptr_t r = realloc(p, n);
157
158  assert(n != 0);
159  if (!r)
160    dfaerror(_("Memory exhausted"));
161  return r;
162}
163
164#define CALLOC(p, t, n) ((p) = (t *) xcalloc((size_t)(n), sizeof (t)))
165#define MALLOC(p, t, n) ((p) = (t *) xmalloc((n) * sizeof (t)))
166#define REALLOC(p, t, n) ((p) = (t *) xrealloc((ptr_t) (p), (n) * sizeof (t)))
167
168/* Reallocate an array of type t if nalloc is too small for index. */
169#define REALLOC_IF_NECESSARY(p, t, nalloc, index) \
170  if ((index) >= (nalloc))			  \
171    {						  \
172      do					  \
173	(nalloc) *= 2;				  \
174      while ((index) >= (nalloc));		  \
175      REALLOC(p, t, nalloc);			  \
176    }
177
178#ifdef DEBUG
179
180static void
181prtok (token t)
182{
183  char const *s;
184
185  if (t < 0)
186    fprintf(stderr, "END");
187  else if (t < NOTCHAR)
188    fprintf(stderr, "%c", t);
189  else
190    {
191      switch (t)
192	{
193	case EMPTY: s = "EMPTY"; break;
194	case BACKREF: s = "BACKREF"; break;
195	case BEGLINE: s = "BEGLINE"; break;
196	case ENDLINE: s = "ENDLINE"; break;
197	case BEGWORD: s = "BEGWORD"; break;
198	case ENDWORD: s = "ENDWORD"; break;
199	case LIMWORD: s = "LIMWORD"; break;
200	case NOTLIMWORD: s = "NOTLIMWORD"; break;
201	case QMARK: s = "QMARK"; break;
202	case STAR: s = "STAR"; break;
203	case PLUS: s = "PLUS"; break;
204	case CAT: s = "CAT"; break;
205	case OR: s = "OR"; break;
206	case ORTOP: s = "ORTOP"; break;
207	case LPAREN: s = "LPAREN"; break;
208	case RPAREN: s = "RPAREN"; break;
209	case CRANGE: s = "CRANGE"; break;
210#ifdef MBS_SUPPORT
211	case ANYCHAR: s = "ANYCHAR"; break;
212	case MBCSET: s = "MBCSET"; break;
213#endif /* MBS_SUPPORT */
214	default: s = "CSET"; break;
215	}
216      fprintf(stderr, "%s", s);
217    }
218}
219#endif /* DEBUG */
220
221/* Stuff pertaining to charclasses. */
222
223static int
224tstbit (unsigned b, charclass c)
225{
226  return c[b / INTBITS] & 1 << b % INTBITS;
227}
228
229static void
230setbit (unsigned b, charclass c)
231{
232  c[b / INTBITS] |= 1 << b % INTBITS;
233}
234
235static void
236clrbit (unsigned b, charclass c)
237{
238  c[b / INTBITS] &= ~(1 << b % INTBITS);
239}
240
241static void
242copyset (charclass src, charclass dst)
243{
244  memcpy (dst, src, sizeof (charclass));
245}
246
247static void
248zeroset (charclass s)
249{
250  memset (s, 0, sizeof (charclass));
251}
252
253static void
254notset (charclass s)
255{
256  int i;
257
258  for (i = 0; i < CHARCLASS_INTS; ++i)
259    s[i] = ~s[i];
260}
261
262static int
263equal (charclass s1, charclass s2)
264{
265  return memcmp (s1, s2, sizeof (charclass)) == 0;
266}
267
268/* A pointer to the current dfa is kept here during parsing. */
269static struct dfa *dfa;
270
271/* Find the index of charclass s in dfa->charclasses, or allocate a new charclass. */
272static int
273charclass_index (charclass s)
274{
275  int i;
276
277  for (i = 0; i < dfa->cindex; ++i)
278    if (equal(s, dfa->charclasses[i]))
279      return i;
280  REALLOC_IF_NECESSARY(dfa->charclasses, charclass, dfa->calloc, dfa->cindex);
281  ++dfa->cindex;
282  copyset(s, dfa->charclasses[i]);
283  return i;
284}
285
286/* Syntax bits controlling the behavior of the lexical analyzer. */
287static reg_syntax_t syntax_bits, syntax_bits_set;
288
289/* Flag for case-folding letters into sets. */
290static int case_fold;
291
292/* End-of-line byte in data.  */
293static unsigned char eolbyte;
294
295/* Entry point to set syntax options. */
296void
297dfasyntax (reg_syntax_t bits, int fold, unsigned char eol)
298{
299  syntax_bits_set = 1;
300  syntax_bits = bits;
301  case_fold = fold;
302  eolbyte = eol;
303}
304
305/* Like setbit, but if case is folded, set both cases of a letter.  */
306static void
307setbit_case_fold (unsigned b, charclass c)
308{
309  setbit (b, c);
310  if (case_fold)
311    {
312      if (ISUPPER (b))
313	setbit (tolower (b), c);
314      else if (ISLOWER (b))
315	setbit (toupper (b), c);
316    }
317}
318
319/* Lexical analyzer.  All the dross that deals with the obnoxious
320   GNU Regex syntax bits is located here.  The poor, suffering
321   reader is referred to the GNU Regex documentation for the
322   meaning of the @#%!@#%^!@ syntax bits. */
323
324static char const *lexstart;	/* Pointer to beginning of input string. */
325static char const *lexptr;	/* Pointer to next input character. */
326static int lexleft;		/* Number of characters remaining. */
327static token lasttok;		/* Previous token returned; initially END. */
328static int laststart;		/* True if we're separated from beginning or (, |
329				   only by zero-width characters. */
330static int parens;		/* Count of outstanding left parens. */
331static int minrep, maxrep;	/* Repeat counts for {m,n}. */
332static int hard_LC_COLLATE;	/* Nonzero if LC_COLLATE is hard.  */
333
334#ifdef MBS_SUPPORT
335/* These variables are used only if (MB_CUR_MAX > 1).  */
336static mbstate_t mbs;		/* Mbstate for mbrlen().  */
337static ssize_t cur_mb_len;	/* Byte length of the current scanning
338				   multibyte character.  Must also handle
339				   negative result from mbrlen().  */
340static ssize_t cur_mb_index;	/* Byte index of the current scanning multibyte
341                                   character.
342
343				   singlebyte character : cur_mb_index = 0
344				   multibyte character
345				       1st byte : cur_mb_index = 1
346				       2nd byte : cur_mb_index = 2
347				         ...
348				       nth byte : cur_mb_index = n  */
349static unsigned char *mblen_buf;/* Correspond to the input buffer in dfaexec().
350                                  Each element store the amount of remain
351                                  byte of corresponding multibyte character
352                                  in the input string.  A element's value
353                                  is 0 if corresponding character is a
354                                  singlebyte chracter.
355                                  e.g. input : 'a', <mb(0)>, <mb(1)>, <mb(2)>
356                                   mblen_buf :   0,       3,       2,       1
357                               */
358static wchar_t *inputwcs;	/* Wide character representation of input
359				   string in dfaexec().
360				   The length of this array is same as
361				   the length of input string(char array).
362				   inputstring[i] is a single-byte char,
363				   or 1st byte of a multibyte char.
364				   And inputwcs[i] is the codepoint.  */
365static unsigned char const *buf_begin;/* refference to begin in dfaexec().  */
366static unsigned char const *buf_end;	/* refference to end in dfaexec().  */
367#endif /* MBS_SUPPORT  */
368
369#ifdef MBS_SUPPORT
370/* This function update cur_mb_len, and cur_mb_index.
371   p points current lexptr, len is the remaining buffer length.  */
372static void
373update_mb_len_index (unsigned char const *p, size_t len)
374{
375  /* If last character is a part of a multibyte character,
376     we update cur_mb_index.  */
377  if (cur_mb_index)
378    cur_mb_index = (cur_mb_index >= cur_mb_len)? 0
379			: cur_mb_index + 1;
380
381  /* If last character is a single byte character, or the
382     last portion of a multibyte character, we check whether
383     next character is a multibyte character or not.  */
384  if (! cur_mb_index)
385    {
386      cur_mb_len = mbrlen(p, len, &mbs);
387      if (cur_mb_len > 1)
388	/* It is a multibyte character.
389	   cur_mb_len was already set by mbrlen().  */
390	cur_mb_index = 1;
391      else if (cur_mb_len < 1)
392	/* Invalid sequence.  We treat it as a singlebyte character.
393	   cur_mb_index is aleady 0.  */
394	cur_mb_len = 1;
395      /* Otherwise, cur_mb_len == 1, it is a singlebyte character.
396	 cur_mb_index is aleady 0.  */
397    }
398}
399#endif /* MBS_SUPPORT */
400
401#ifdef MBS_SUPPORT
402/* Note that characters become unsigned here. */
403# define FETCH(c, eoferr)			\
404  {						\
405    if (! lexleft)				\
406     {						\
407	if (eoferr != 0)			\
408	  dfaerror (eoferr);			\
409	else					\
410	  return lasttok = END;			\
411      }						\
412    if (MB_CUR_MAX > 1)				\
413      update_mb_len_index(lexptr, lexleft);	\
414    (c) = (unsigned char) *lexptr++;		\
415    --lexleft;					\
416  }
417
418/* This function fetch a wide character, and update cur_mb_len,
419   used only if the current locale is a multibyte environment.  */
420static wint_t
421fetch_wc (char const *eoferr)
422{
423  wchar_t wc;
424  if (! lexleft)
425    {
426      if (eoferr != 0)
427	dfaerror (eoferr);
428      else
429	return WEOF;
430    }
431
432  cur_mb_len = mbrtowc(&wc, lexptr, lexleft, &mbs);
433  if (cur_mb_len <= 0)
434   {
435      cur_mb_len = 1;
436      wc = *lexptr;
437    }
438  lexptr += cur_mb_len;
439  lexleft -= cur_mb_len;
440  return wc;
441}
442#else
443/* Note that characters become unsigned here. */
444# define FETCH(c, eoferr)   	      \
445  {			   	      \
446    if (! lexleft)	   	      \
447      {				      \
448	if (eoferr != 0)	      \
449	  dfaerror (eoferr);	      \
450	else		   	      \
451	  return lasttok = END;	      \
452      }				      \
453    (c) = (unsigned char) *lexptr++;  \
454    --lexleft;		   	      \
455  }
456#endif /* MBS_SUPPORT */
457
458#ifdef MBS_SUPPORT
459/* Multibyte character handling sub-routin for lex.
460   This function  parse a bracket expression and build a struct
461   mb_char_classes.  */
462static void
463parse_bracket_exp_mb ()
464{
465  wint_t wc, wc1, wc2;
466
467  /* Work area to build a mb_char_classes.  */
468  struct mb_char_classes *work_mbc;
469  int chars_al, range_sts_al, range_ends_al, ch_classes_al,
470    equivs_al, coll_elems_al;
471
472  REALLOC_IF_NECESSARY(dfa->mbcsets, struct mb_char_classes,
473		       dfa->mbcsets_alloc, dfa->nmbcsets + 1);
474  /* dfa->multibyte_prop[] hold the index of dfa->mbcsets.
475     We will update dfa->multibyte_prop in addtok(), because we can't
476     decide the index in dfa->tokens[].  */
477
478  /* Initialize work are */
479  work_mbc = &(dfa->mbcsets[dfa->nmbcsets++]);
480
481  chars_al = 1;
482  range_sts_al = range_ends_al = 0;
483  ch_classes_al = equivs_al = coll_elems_al = 0;
484  MALLOC(work_mbc->chars, wchar_t, chars_al);
485
486  work_mbc->nchars = work_mbc->nranges = work_mbc->nch_classes = 0;
487  work_mbc->nequivs = work_mbc->ncoll_elems = 0;
488  work_mbc->chars = work_mbc->ch_classes = NULL;
489  work_mbc->range_sts = work_mbc->range_ends = NULL;
490  work_mbc->equivs = work_mbc->coll_elems = NULL;
491
492  wc = fetch_wc(_("Unbalanced ["));
493  if (wc == L'^')
494    {
495      wc = fetch_wc(_("Unbalanced ["));
496      work_mbc->invert = 1;
497    }
498  else
499    work_mbc->invert = 0;
500  do
501    {
502      wc1 = WEOF; /* mark wc1 is not initialized".  */
503
504      /* Note that if we're looking at some other [:...:] construct,
505	 we just treat it as a bunch of ordinary characters.  We can do
506	 this because we assume regex has checked for syntax errors before
507	 dfa is ever called. */
508      if (wc == L'[' && (syntax_bits & RE_CHAR_CLASSES))
509	{
510#define BRACKET_BUFFER_SIZE 128
511	  char str[BRACKET_BUFFER_SIZE];
512	  wc1 = wc;
513	  wc = fetch_wc(_("Unbalanced ["));
514
515	  /* If pattern contains `[[:', `[[.', or `[[='.  */
516	  if (cur_mb_len == 1 && (wc == L':' || wc == L'.' || wc == L'='))
517	    {
518	      unsigned char c;
519	      unsigned char delim = (unsigned char)wc;
520	      int len = 0;
521	      for (;;)
522		{
523		  if (! lexleft)
524		    dfaerror (_("Unbalanced ["));
525		  c = (unsigned char) *lexptr++;
526		  --lexleft;
527
528		  if ((c == delim && *lexptr == ']') || lexleft == 0)
529		    break;
530		  if (len < BRACKET_BUFFER_SIZE)
531		    str[len++] = c;
532		  else
533		    /* This is in any case an invalid class name.  */
534		    str[0] = '\0';
535		}
536	      str[len] = '\0';
537
538	      if (lexleft == 0)
539		{
540		  REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
541				       work_mbc->nchars + 2);
542		  work_mbc->chars[work_mbc->nchars++] = L'[';
543		  work_mbc->chars[work_mbc->nchars++] = delim;
544		  break;
545		}
546
547	      if (--lexleft, *lexptr++ != ']')
548		dfaerror (_("Unbalanced ["));
549	      if (delim == ':')
550		/* build character class.  */
551		{
552		  wctype_t wt;
553		  /* Query the character class as wctype_t.  */
554		  wt = wctype (str);
555
556		  if (ch_classes_al == 0)
557		    MALLOC(work_mbc->ch_classes, wchar_t, ++ch_classes_al);
558		  REALLOC_IF_NECESSARY(work_mbc->ch_classes, wctype_t,
559				       ch_classes_al,
560				       work_mbc->nch_classes + 1);
561		  work_mbc->ch_classes[work_mbc->nch_classes++] = wt;
562
563 		}
564	      else if (delim == '=' || delim == '.')
565		{
566		  char *elem;
567		  MALLOC(elem, char, len + 1);
568		  strncpy(elem, str, len + 1);
569
570		  if (delim == '=')
571		    /* build equivalent class.  */
572		    {
573		      if (equivs_al == 0)
574			MALLOC(work_mbc->equivs, char*, ++equivs_al);
575		      REALLOC_IF_NECESSARY(work_mbc->equivs, char*,
576					   equivs_al,
577					   work_mbc->nequivs + 1);
578		      work_mbc->equivs[work_mbc->nequivs++] = elem;
579		    }
580
581		  if (delim == '.')
582		    /* build collating element.  */
583		    {
584		      if (coll_elems_al == 0)
585			MALLOC(work_mbc->coll_elems, char*, ++coll_elems_al);
586		      REALLOC_IF_NECESSARY(work_mbc->coll_elems, char*,
587					   coll_elems_al,
588					   work_mbc->ncoll_elems + 1);
589		      work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem;
590		    }
591 		}
592	      wc1 = wc = WEOF;
593	    }
594	  else
595	    /* We treat '[' as a normal character here.  */
596	    {
597	      wc2 = wc1; wc1 = wc; wc = wc2; /* swap */
598	    }
599	}
600      else
601	{
602	  if (wc == L'\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
603	    wc = fetch_wc(("Unbalanced ["));
604	}
605
606      if (wc1 == WEOF)
607	wc1 = fetch_wc(_("Unbalanced ["));
608
609      if (wc1 == L'-')
610	/* build range characters.  */
611	{
612	  wc2 = fetch_wc(_("Unbalanced ["));
613	  if (wc2 == L']')
614	    {
615	      /* In the case [x-], the - is an ordinary hyphen,
616		 which is left in c1, the lookahead character. */
617	      lexptr -= cur_mb_len;
618	      lexleft += cur_mb_len;
619	      wc2 = wc;
620	    }
621	  else
622	    {
623	      if (wc2 == L'\\'
624		  && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
625		wc2 = fetch_wc(_("Unbalanced ["));
626	      wc1 = fetch_wc(_("Unbalanced ["));
627	    }
628
629	  if (range_sts_al == 0)
630	    {
631	      MALLOC(work_mbc->range_sts, wchar_t, ++range_sts_al);
632	      MALLOC(work_mbc->range_ends, wchar_t, ++range_ends_al);
633	    }
634	  REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
635			       range_sts_al, work_mbc->nranges + 1);
636	  work_mbc->range_sts[work_mbc->nranges] = (wchar_t)wc;
637	  REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
638			       range_ends_al, work_mbc->nranges + 1);
639	  work_mbc->range_ends[work_mbc->nranges++] = (wchar_t)wc2;
640	}
641      else if (wc != WEOF)
642	/* build normal characters.  */
643	{
644	  REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
645			       work_mbc->nchars + 1);
646	  work_mbc->chars[work_mbc->nchars++] = (wchar_t)wc;
647	}
648    }
649  while ((wc = wc1) != L']');
650}
651#endif /* MBS_SUPPORT */
652
653#ifdef __STDC__
654#define FUNC(F, P) static int F(int c) { return P(c); }
655#else
656#define FUNC(F, P) static int F(c) int c; { return P(c); }
657#endif
658
659FUNC(is_alpha, ISALPHA)
660FUNC(is_upper, ISUPPER)
661FUNC(is_lower, ISLOWER)
662FUNC(is_digit, ISDIGIT)
663FUNC(is_xdigit, ISXDIGIT)
664FUNC(is_space, ISSPACE)
665FUNC(is_punct, ISPUNCT)
666FUNC(is_alnum, ISALNUM)
667FUNC(is_print, ISPRINT)
668FUNC(is_graph, ISGRAPH)
669FUNC(is_cntrl, ISCNTRL)
670
671static int
672is_blank (int c)
673{
674   return (c == ' ' || c == '\t');
675}
676
677/* The following list maps the names of the Posix named character classes
678   to predicate functions that determine whether a given character is in
679   the class.  The leading [ has already been eaten by the lexical analyzer. */
680static struct {
681  const char *name;
682  int (*pred) PARAMS ((int));
683} const prednames[] = {
684  { ":alpha:]", is_alpha },
685  { ":upper:]", is_upper },
686  { ":lower:]", is_lower },
687  { ":digit:]", is_digit },
688  { ":xdigit:]", is_xdigit },
689  { ":space:]", is_space },
690  { ":punct:]", is_punct },
691  { ":alnum:]", is_alnum },
692  { ":print:]", is_print },
693  { ":graph:]", is_graph },
694  { ":cntrl:]", is_cntrl },
695  { ":blank:]", is_blank },
696  { 0 }
697};
698
699/* Return non-zero if C is a `word-constituent' byte; zero otherwise.  */
700#define IS_WORD_CONSTITUENT(C) (ISALNUM(C) || (C) == '_')
701
702static int
703looking_at (char const *s)
704{
705  size_t len;
706
707  len = strlen(s);
708  if (lexleft < len)
709    return 0;
710  return strncmp(s, lexptr, len) == 0;
711}
712
713static token
714lex (void)
715{
716  unsigned c, c1, c2;
717  int backslash = 0, invert;
718  charclass ccl;
719  int i;
720
721  /* Basic plan: We fetch a character.  If it's a backslash,
722     we set the backslash flag and go through the loop again.
723     On the plus side, this avoids having a duplicate of the
724     main switch inside the backslash case.  On the minus side,
725     it means that just about every case begins with
726     "if (backslash) ...".  */
727  for (i = 0; i < 2; ++i)
728    {
729      FETCH(c, 0);
730#ifdef MBS_SUPPORT
731      if (MB_CUR_MAX > 1 && cur_mb_index)
732	/* If this is a part of a multi-byte character, we must treat
733	   this byte data as a normal character.
734	   e.g. In case of SJIS encoding, some character contains '\',
735	        but they must not be backslash.  */
736	goto normal_char;
737#endif /* MBS_SUPPORT  */
738      switch (c)
739	{
740	case '\\':
741	  if (backslash)
742	    goto normal_char;
743	  if (lexleft == 0)
744	    dfaerror(_("Unfinished \\ escape"));
745	  backslash = 1;
746	  break;
747
748	case '^':
749	  if (backslash)
750	    goto normal_char;
751	  if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
752	      || lasttok == END
753	      || lasttok == LPAREN
754	      || lasttok == OR)
755	    return lasttok = BEGLINE;
756	  goto normal_char;
757
758	case '$':
759	  if (backslash)
760	    goto normal_char;
761	  if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
762	      || lexleft == 0
763	      || (syntax_bits & RE_NO_BK_PARENS
764		  ? lexleft > 0 && *lexptr == ')'
765		  : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == ')')
766	      || (syntax_bits & RE_NO_BK_VBAR
767		  ? lexleft > 0 && *lexptr == '|'
768		  : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == '|')
769	      || ((syntax_bits & RE_NEWLINE_ALT)
770	          && lexleft > 0 && *lexptr == '\n'))
771	    return lasttok = ENDLINE;
772	  goto normal_char;
773
774	case '1':
775	case '2':
776	case '3':
777	case '4':
778	case '5':
779	case '6':
780	case '7':
781	case '8':
782	case '9':
783	  if (backslash && !(syntax_bits & RE_NO_BK_REFS))
784	    {
785	      laststart = 0;
786	      return lasttok = BACKREF;
787	    }
788	  goto normal_char;
789
790	case '`':
791	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
792	    return lasttok = BEGLINE;	/* FIXME: should be beginning of string */
793	  goto normal_char;
794
795	case '\'':
796	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
797	    return lasttok = ENDLINE;	/* FIXME: should be end of string */
798	  goto normal_char;
799
800	case '<':
801	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
802	    return lasttok = BEGWORD;
803	  goto normal_char;
804
805	case '>':
806	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
807	    return lasttok = ENDWORD;
808	  goto normal_char;
809
810	case 'b':
811	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
812	    return lasttok = LIMWORD;
813	  goto normal_char;
814
815	case 'B':
816	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
817	    return lasttok = NOTLIMWORD;
818	  goto normal_char;
819
820	case '?':
821	  if (syntax_bits & RE_LIMITED_OPS)
822	    goto normal_char;
823	  if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
824	    goto normal_char;
825	  if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
826	    goto normal_char;
827	  return lasttok = QMARK;
828
829	case '*':
830	  if (backslash)
831	    goto normal_char;
832	  if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
833	    goto normal_char;
834	  return lasttok = STAR;
835
836	case '+':
837	  if (syntax_bits & RE_LIMITED_OPS)
838	    goto normal_char;
839	  if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
840	    goto normal_char;
841	  if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
842	    goto normal_char;
843	  return lasttok = PLUS;
844
845	case '{':
846	  if (!(syntax_bits & RE_INTERVALS))
847	    goto normal_char;
848	  if (backslash != ((syntax_bits & RE_NO_BK_BRACES) == 0))
849	    goto normal_char;
850	  if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
851	    goto normal_char;
852
853	  if (syntax_bits & RE_NO_BK_BRACES)
854	    {
855	      /* Scan ahead for a valid interval; if it's not valid,
856		 treat it as a literal '{'.  */
857	      int lo = -1, hi = -1;
858	      char const *p = lexptr;
859	      char const *lim = p + lexleft;
860	      for (;  p != lim && ISASCIIDIGIT (*p);  p++)
861		lo = (lo < 0 ? 0 : lo * 10) + *p - '0';
862	      if (p != lim && *p == ',')
863		while (++p != lim && ISASCIIDIGIT (*p))
864		  hi = (hi < 0 ? 0 : hi * 10) + *p - '0';
865	      else
866		hi = lo;
867	      if (p == lim || *p != '}'
868		  || lo < 0 || RE_DUP_MAX < hi || (0 <= hi && hi < lo))
869		goto normal_char;
870	    }
871
872	  minrep = 0;
873	  /* Cases:
874	     {M} - exact count
875	     {M,} - minimum count, maximum is infinity
876	     {M,N} - M through N */
877	  FETCH(c, _("unfinished repeat count"));
878	  if (ISASCIIDIGIT (c))
879	    {
880	      minrep = c - '0';
881	      for (;;)
882		{
883		  FETCH(c, _("unfinished repeat count"));
884		  if (! ISASCIIDIGIT (c))
885		    break;
886		  minrep = 10 * minrep + c - '0';
887		}
888	    }
889	  else
890	    dfaerror(_("malformed repeat count"));
891	  if (c == ',')
892	    {
893	      FETCH (c, _("unfinished repeat count"));
894	      if (! ISASCIIDIGIT (c))
895		maxrep = -1;
896	      else
897		{
898		  maxrep = c - '0';
899		  for (;;)
900		    {
901		      FETCH (c, _("unfinished repeat count"));
902		      if (! ISASCIIDIGIT (c))
903			break;
904		      maxrep = 10 * maxrep + c - '0';
905		    }
906		  if (0 <= maxrep && maxrep < minrep)
907		    dfaerror (_("malformed repeat count"));
908		}
909	    }
910	  else
911	    maxrep = minrep;
912	  if (!(syntax_bits & RE_NO_BK_BRACES))
913	    {
914	      if (c != '\\')
915		dfaerror(_("malformed repeat count"));
916	      FETCH(c, _("unfinished repeat count"));
917	    }
918	  if (c != '}')
919	    dfaerror(_("malformed repeat count"));
920	  laststart = 0;
921	  return lasttok = REPMN;
922
923	case '|':
924	  if (syntax_bits & RE_LIMITED_OPS)
925	    goto normal_char;
926	  if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0))
927	    goto normal_char;
928	  laststart = 1;
929	  return lasttok = OR;
930
931	case '\n':
932	  if (syntax_bits & RE_LIMITED_OPS
933	      || backslash
934	      || !(syntax_bits & RE_NEWLINE_ALT))
935	    goto normal_char;
936	  laststart = 1;
937	  return lasttok = OR;
938
939	case '(':
940	  if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
941	    goto normal_char;
942	  ++parens;
943	  laststart = 1;
944	  return lasttok = LPAREN;
945
946	case ')':
947	  if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
948	    goto normal_char;
949	  if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
950	    goto normal_char;
951	  --parens;
952	  laststart = 0;
953	  return lasttok = RPAREN;
954
955	case '.':
956	  if (backslash)
957	    goto normal_char;
958#ifdef MBS_SUPPORT
959	  if (MB_CUR_MAX > 1)
960	    {
961	      /* In multibyte environment period must match with a single
962		 character not a byte.  So we use ANYCHAR.  */
963	      laststart = 0;
964	      return lasttok = ANYCHAR;
965	    }
966#endif /* MBS_SUPPORT */
967	  zeroset(ccl);
968	  notset(ccl);
969	  if (!(syntax_bits & RE_DOT_NEWLINE))
970	    clrbit(eolbyte, ccl);
971	  if (syntax_bits & RE_DOT_NOT_NULL)
972	    clrbit('\0', ccl);
973	  laststart = 0;
974	  return lasttok = CSET + charclass_index(ccl);
975
976	case 'w':
977	case 'W':
978	  if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
979	    goto normal_char;
980	  zeroset(ccl);
981	  for (c2 = 0; c2 < NOTCHAR; ++c2)
982	    if (IS_WORD_CONSTITUENT(c2))
983	      setbit(c2, ccl);
984	  if (c == 'W')
985	    notset(ccl);
986	  laststart = 0;
987	  return lasttok = CSET + charclass_index(ccl);
988
989	case '[':
990	  if (backslash)
991	    goto normal_char;
992	  laststart = 0;
993#ifdef MBS_SUPPORT
994	  if (MB_CUR_MAX > 1)
995	    {
996	      /* In multibyte environment a bracket expression may contain
997		 multibyte characters, which must be treated as characters
998		 (not bytes).  So we parse it by parse_bracket_exp_mb().  */
999	      parse_bracket_exp_mb();
1000	      return lasttok = MBCSET;
1001	    }
1002#endif
1003	  zeroset(ccl);
1004	  FETCH(c, _("Unbalanced ["));
1005	  if (c == '^')
1006	    {
1007	      FETCH(c, _("Unbalanced ["));
1008	      invert = 1;
1009	    }
1010	  else
1011	    invert = 0;
1012	  do
1013	    {
1014	      /* Nobody ever said this had to be fast. :-)
1015		 Note that if we're looking at some other [:...:]
1016		 construct, we just treat it as a bunch of ordinary
1017		 characters.  We can do this because we assume
1018		 regex has checked for syntax errors before
1019		 dfa is ever called. */
1020	      if (c == '[' && (syntax_bits & RE_CHAR_CLASSES))
1021		for (c1 = 0; prednames[c1].name; ++c1)
1022		  if (looking_at(prednames[c1].name))
1023		    {
1024		      int (*pred) PARAMS ((int)) = prednames[c1].pred;
1025
1026		      for (c2 = 0; c2 < NOTCHAR; ++c2)
1027			if ((*pred)(c2))
1028			  setbit_case_fold (c2, ccl);
1029		      lexptr += strlen(prednames[c1].name);
1030		      lexleft -= strlen(prednames[c1].name);
1031		      FETCH(c1, _("Unbalanced ["));
1032		      goto skip;
1033		    }
1034	      if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1035		FETCH(c, _("Unbalanced ["));
1036	      FETCH(c1, _("Unbalanced ["));
1037	      if (c1 == '-')
1038		{
1039		  FETCH(c2, _("Unbalanced ["));
1040		  if (c2 == ']')
1041		    {
1042		      /* In the case [x-], the - is an ordinary hyphen,
1043			 which is left in c1, the lookahead character. */
1044		      --lexptr;
1045		      ++lexleft;
1046		    }
1047		  else
1048		    {
1049		      if (c2 == '\\'
1050			  && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1051			FETCH(c2, _("Unbalanced ["));
1052		      FETCH(c1, _("Unbalanced ["));
1053		      if (!hard_LC_COLLATE) {
1054		        for (; c <= c2; c++)
1055			  setbit_case_fold (c, ccl);
1056		      } else {
1057			/* POSIX locales are painful - leave the decision to libc */
1058			char expr[6] = { '[', c, '-', c2, ']', '\0' };
1059			regex_t re;
1060			if (regcomp (&re, expr, case_fold ? REG_ICASE : 0) == REG_NOERROR) {
1061			  for (c = 0; c < NOTCHAR; ++c) {
1062			    char buf[2] = { c, '\0' };
1063			    regmatch_t mat;
1064			    if (regexec (&re, buf, 1, &mat, 0) == REG_NOERROR
1065                               && mat.rm_so == 0 && mat.rm_eo == 1)
1066                              setbit_case_fold (c, ccl);
1067			  }
1068			  regfree (&re);
1069			}
1070		      }
1071		      continue;
1072		    }
1073		}
1074
1075	      setbit_case_fold (c, ccl);
1076
1077	    skip:
1078	      ;
1079	    }
1080	  while ((c = c1) != ']');
1081	  if (invert)
1082	    {
1083	      notset(ccl);
1084	      if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
1085		clrbit(eolbyte, ccl);
1086	    }
1087	  return lasttok = CSET + charclass_index(ccl);
1088
1089	default:
1090	normal_char:
1091	  laststart = 0;
1092	  if (case_fold && ISALPHA(c))
1093	    {
1094	      zeroset(ccl);
1095	      setbit_case_fold (c, ccl);
1096	      return lasttok = CSET + charclass_index(ccl);
1097	    }
1098	  return c;
1099	}
1100    }
1101
1102  /* The above loop should consume at most a backslash
1103     and some other character. */
1104  abort();
1105  return END;	/* keeps pedantic compilers happy. */
1106}
1107
1108/* Recursive descent parser for regular expressions. */
1109
1110static token tok;		/* Lookahead token. */
1111static int depth;		/* Current depth of a hypothetical stack
1112				   holding deferred productions.  This is
1113				   used to determine the depth that will be
1114				   required of the real stack later on in
1115				   dfaanalyze(). */
1116
1117/* Add the given token to the parse tree, maintaining the depth count and
1118   updating the maximum depth if necessary. */
1119static void
1120addtok (token t)
1121{
1122#ifdef MBS_SUPPORT
1123  if (MB_CUR_MAX > 1)
1124    {
1125      REALLOC_IF_NECESSARY(dfa->multibyte_prop, int, dfa->nmultibyte_prop,
1126			   dfa->tindex);
1127      /* Set dfa->multibyte_prop.  See struct dfa in dfa.h.  */
1128      if (t == MBCSET)
1129	dfa->multibyte_prop[dfa->tindex] = ((dfa->nmbcsets - 1) << 2) + 3;
1130      else if (t < NOTCHAR)
1131	dfa->multibyte_prop[dfa->tindex]
1132	  = (cur_mb_len == 1)? 3 /* single-byte char */
1133	  : (((cur_mb_index == 1)? 1 : 0) /* 1st-byte of multibyte char */
1134	     + ((cur_mb_index == cur_mb_len)? 2 : 0)); /* last-byte */
1135      else
1136	/* It may be unnecesssary, but it is safer to treat other
1137	   symbols as singlebyte characters.  */
1138	dfa->multibyte_prop[dfa->tindex] = 3;
1139    }
1140#endif
1141
1142  REALLOC_IF_NECESSARY(dfa->tokens, token, dfa->talloc, dfa->tindex);
1143  dfa->tokens[dfa->tindex++] = t;
1144
1145  switch (t)
1146    {
1147    case QMARK:
1148    case STAR:
1149    case PLUS:
1150      break;
1151
1152    case CAT:
1153    case OR:
1154    case ORTOP:
1155      --depth;
1156      break;
1157
1158    default:
1159      ++dfa->nleaves;
1160    case EMPTY:
1161      ++depth;
1162      break;
1163    }
1164  if (depth > dfa->depth)
1165    dfa->depth = depth;
1166}
1167
1168/* The grammar understood by the parser is as follows.
1169
1170   regexp:
1171     regexp OR branch
1172     branch
1173
1174   branch:
1175     branch closure
1176     closure
1177
1178   closure:
1179     closure QMARK
1180     closure STAR
1181     closure PLUS
1182     closure REPMN
1183     atom
1184
1185   atom:
1186     <normal character>
1187     <multibyte character>
1188     ANYCHAR
1189     MBCSET
1190     CSET
1191     BACKREF
1192     BEGLINE
1193     ENDLINE
1194     BEGWORD
1195     ENDWORD
1196     LIMWORD
1197     NOTLIMWORD
1198     CRANGE
1199     LPAREN regexp RPAREN
1200     <empty>
1201
1202   The parser builds a parse tree in postfix form in an array of tokens. */
1203
1204static void
1205atom (void)
1206{
1207  if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
1208      || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
1209#ifdef MBS_SUPPORT
1210      || tok == ANYCHAR || tok == MBCSET /* MB_CUR_MAX > 1 */
1211#endif /* MBS_SUPPORT */
1212      || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD)
1213    {
1214      addtok(tok);
1215      tok = lex();
1216#ifdef MBS_SUPPORT
1217      /* We treat a multibyte character as a single atom, so that DFA
1218	 can treat a multibyte character as a single expression.
1219
1220         e.g. We construct following tree from "<mb1><mb2>".
1221              <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1222              <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT>
1223      */
1224      if (MB_CUR_MAX > 1)
1225	{
1226	  while (cur_mb_index > 1 && tok >= 0 && tok < NOTCHAR)
1227	    {
1228	      addtok(tok);
1229	      addtok(CAT);
1230	      tok = lex();
1231	    }
1232	}
1233#endif /* MBS_SUPPORT  */
1234    }
1235  else if (tok == CRANGE)
1236    {
1237      /* A character range like "[a-z]" in a locale other than "C" or
1238	 "POSIX".  This range might any sequence of one or more
1239	 characters.  Unfortunately the POSIX locale primitives give
1240	 us no practical way to find what character sequences might be
1241	 matched.  Treat this approximately like "(.\1)" -- i.e. match
1242	 one character, and then punt to the full matcher.  */
1243      charclass ccl;
1244      zeroset (ccl);
1245      notset (ccl);
1246      addtok (CSET + charclass_index (ccl));
1247      addtok (BACKREF);
1248      addtok (CAT);
1249      tok = lex ();
1250    }
1251  else if (tok == LPAREN)
1252    {
1253      tok = lex();
1254      regexp(0);
1255      if (tok != RPAREN)
1256	dfaerror(_("Unbalanced ("));
1257      tok = lex();
1258    }
1259  else
1260    addtok(EMPTY);
1261}
1262
1263/* Return the number of tokens in the given subexpression. */
1264static int
1265nsubtoks (int tindex)
1266{
1267  int ntoks1;
1268
1269  switch (dfa->tokens[tindex - 1])
1270    {
1271    default:
1272      return 1;
1273    case QMARK:
1274    case STAR:
1275    case PLUS:
1276      return 1 + nsubtoks(tindex - 1);
1277    case CAT:
1278    case OR:
1279    case ORTOP:
1280      ntoks1 = nsubtoks(tindex - 1);
1281      return 1 + ntoks1 + nsubtoks(tindex - 1 - ntoks1);
1282    }
1283}
1284
1285/* Copy the given subexpression to the top of the tree. */
1286static void
1287copytoks (int tindex, int ntokens)
1288{
1289  int i;
1290
1291  for (i = 0; i < ntokens; ++i)
1292    addtok(dfa->tokens[tindex + i]);
1293}
1294
1295static void
1296closure (void)
1297{
1298  int tindex, ntokens, i;
1299
1300  atom();
1301  while (tok == QMARK || tok == STAR || tok == PLUS || tok == REPMN)
1302    if (tok == REPMN)
1303      {
1304	ntokens = nsubtoks(dfa->tindex);
1305	tindex = dfa->tindex - ntokens;
1306	if (maxrep < 0)
1307	  addtok(PLUS);
1308	if (minrep == 0)
1309	  addtok(QMARK);
1310	for (i = 1; i < minrep; ++i)
1311	  {
1312	    copytoks(tindex, ntokens);
1313	    addtok(CAT);
1314	  }
1315	for (; i < maxrep; ++i)
1316	  {
1317	    copytoks(tindex, ntokens);
1318	    addtok(QMARK);
1319	    addtok(CAT);
1320	  }
1321	tok = lex();
1322      }
1323    else
1324      {
1325	addtok(tok);
1326	tok = lex();
1327      }
1328}
1329
1330static void
1331branch (void)
1332{
1333  closure();
1334  while (tok != RPAREN && tok != OR && tok >= 0)
1335    {
1336      closure();
1337      addtok(CAT);
1338    }
1339}
1340
1341static void
1342regexp (int toplevel)
1343{
1344  branch();
1345  while (tok == OR)
1346    {
1347      tok = lex();
1348      branch();
1349      if (toplevel)
1350	addtok(ORTOP);
1351      else
1352	addtok(OR);
1353    }
1354}
1355
1356/* Main entry point for the parser.  S is a string to be parsed, len is the
1357   length of the string, so s can include NUL characters.  D is a pointer to
1358   the struct dfa to parse into. */
1359void
1360dfaparse (char const *s, size_t len, struct dfa *d)
1361{
1362  dfa = d;
1363  lexstart = lexptr = s;
1364  lexleft = len;
1365  lasttok = END;
1366  laststart = 1;
1367  parens = 0;
1368  hard_LC_COLLATE = hard_locale (LC_COLLATE);
1369#ifdef MBS_SUPPORT
1370  if (MB_CUR_MAX > 1)
1371    {
1372      cur_mb_index = 0;
1373      cur_mb_len = 0;
1374      memset(&mbs, 0, sizeof(mbstate_t));
1375    }
1376#endif /* MBS_SUPPORT  */
1377
1378  if (! syntax_bits_set)
1379    dfaerror(_("No syntax specified"));
1380
1381  tok = lex();
1382  depth = d->depth;
1383
1384  regexp(1);
1385
1386  if (tok != END)
1387    dfaerror(_("Unbalanced )"));
1388
1389  addtok(END - d->nregexps);
1390  addtok(CAT);
1391
1392  if (d->nregexps)
1393    addtok(ORTOP);
1394
1395  ++d->nregexps;
1396}
1397
1398/* Some primitives for operating on sets of positions. */
1399
1400/* Copy one set to another; the destination must be large enough. */
1401static void
1402copy (position_set const *src, position_set *dst)
1403{
1404  int i;
1405
1406  for (i = 0; i < src->nelem; ++i)
1407    dst->elems[i] = src->elems[i];
1408  dst->nelem = src->nelem;
1409}
1410
1411/* Insert a position in a set.  Position sets are maintained in sorted
1412   order according to index.  If position already exists in the set with
1413   the same index then their constraints are logically or'd together.
1414   S->elems must point to an array large enough to hold the resulting set. */
1415static void
1416insert (position p, position_set *s)
1417{
1418  int i;
1419  position t1, t2;
1420
1421  for (i = 0; i < s->nelem && p.index < s->elems[i].index; ++i)
1422    continue;
1423  if (i < s->nelem && p.index == s->elems[i].index)
1424    s->elems[i].constraint |= p.constraint;
1425  else
1426    {
1427      t1 = p;
1428      ++s->nelem;
1429      while (i < s->nelem)
1430	{
1431	  t2 = s->elems[i];
1432	  s->elems[i++] = t1;
1433	  t1 = t2;
1434	}
1435    }
1436}
1437
1438/* Merge two sets of positions into a third.  The result is exactly as if
1439   the positions of both sets were inserted into an initially empty set. */
1440static void
1441merge (position_set const *s1, position_set const *s2, position_set *m)
1442{
1443  int i = 0, j = 0;
1444
1445  m->nelem = 0;
1446  while (i < s1->nelem && j < s2->nelem)
1447    if (s1->elems[i].index > s2->elems[j].index)
1448      m->elems[m->nelem++] = s1->elems[i++];
1449    else if (s1->elems[i].index < s2->elems[j].index)
1450      m->elems[m->nelem++] = s2->elems[j++];
1451    else
1452      {
1453	m->elems[m->nelem] = s1->elems[i++];
1454	m->elems[m->nelem++].constraint |= s2->elems[j++].constraint;
1455      }
1456  while (i < s1->nelem)
1457    m->elems[m->nelem++] = s1->elems[i++];
1458  while (j < s2->nelem)
1459    m->elems[m->nelem++] = s2->elems[j++];
1460}
1461
1462/* Delete a position from a set. */
1463static void
1464delete (position p, position_set *s)
1465{
1466  int i;
1467
1468  for (i = 0; i < s->nelem; ++i)
1469    if (p.index == s->elems[i].index)
1470      break;
1471  if (i < s->nelem)
1472    for (--s->nelem; i < s->nelem; ++i)
1473      s->elems[i] = s->elems[i + 1];
1474}
1475
1476/* Find the index of the state corresponding to the given position set with
1477   the given preceding context, or create a new state if there is no such
1478   state.  Newline and letter tell whether we got here on a newline or
1479   letter, respectively. */
1480static int
1481state_index (struct dfa *d, position_set const *s, int newline, int letter)
1482{
1483  int hash = 0;
1484  int constraint;
1485  int i, j;
1486
1487  newline = newline ? 1 : 0;
1488  letter = letter ? 1 : 0;
1489
1490  for (i = 0; i < s->nelem; ++i)
1491    hash ^= s->elems[i].index + s->elems[i].constraint;
1492
1493  /* Try to find a state that exactly matches the proposed one. */
1494  for (i = 0; i < d->sindex; ++i)
1495    {
1496      if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
1497	  || newline != d->states[i].newline || letter != d->states[i].letter)
1498	continue;
1499      for (j = 0; j < s->nelem; ++j)
1500	if (s->elems[j].constraint
1501	    != d->states[i].elems.elems[j].constraint
1502	    || s->elems[j].index != d->states[i].elems.elems[j].index)
1503	  break;
1504      if (j == s->nelem)
1505	return i;
1506    }
1507
1508  /* We'll have to create a new state. */
1509  REALLOC_IF_NECESSARY(d->states, dfa_state, d->salloc, d->sindex);
1510  d->states[i].hash = hash;
1511  MALLOC(d->states[i].elems.elems, position, s->nelem);
1512  copy(s, &d->states[i].elems);
1513  d->states[i].newline = newline;
1514  d->states[i].letter = letter;
1515  d->states[i].backref = 0;
1516  d->states[i].constraint = 0;
1517  d->states[i].first_end = 0;
1518#ifdef MBS_SUPPORT
1519  if (MB_CUR_MAX > 1)
1520    d->states[i].mbps.nelem = 0;
1521#endif
1522  for (j = 0; j < s->nelem; ++j)
1523    if (d->tokens[s->elems[j].index] < 0)
1524      {
1525	constraint = s->elems[j].constraint;
1526	if (SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 0)
1527	    || SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 1)
1528	    || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 0)
1529	    || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 1))
1530	  d->states[i].constraint |= constraint;
1531	if (! d->states[i].first_end)
1532	  d->states[i].first_end = d->tokens[s->elems[j].index];
1533      }
1534    else if (d->tokens[s->elems[j].index] == BACKREF)
1535      {
1536	d->states[i].constraint = NO_CONSTRAINT;
1537	d->states[i].backref = 1;
1538      }
1539
1540  ++d->sindex;
1541
1542  return i;
1543}
1544
1545/* Find the epsilon closure of a set of positions.  If any position of the set
1546   contains a symbol that matches the empty string in some context, replace
1547   that position with the elements of its follow labeled with an appropriate
1548   constraint.  Repeat exhaustively until no funny positions are left.
1549   S->elems must be large enough to hold the result. */
1550static void
1551epsclosure (position_set *s, struct dfa const *d)
1552{
1553  int i, j;
1554  int *visited;
1555  position p, old;
1556
1557  MALLOC(visited, int, d->tindex);
1558  for (i = 0; i < d->tindex; ++i)
1559    visited[i] = 0;
1560
1561  for (i = 0; i < s->nelem; ++i)
1562    if (d->tokens[s->elems[i].index] >= NOTCHAR
1563	&& d->tokens[s->elems[i].index] != BACKREF
1564#ifdef MBS_SUPPORT
1565	&& d->tokens[s->elems[i].index] != ANYCHAR
1566	&& d->tokens[s->elems[i].index] != MBCSET
1567#endif
1568	&& d->tokens[s->elems[i].index] < CSET)
1569      {
1570	old = s->elems[i];
1571	p.constraint = old.constraint;
1572	delete(s->elems[i], s);
1573	if (visited[old.index])
1574	  {
1575	    --i;
1576	    continue;
1577	  }
1578	visited[old.index] = 1;
1579	switch (d->tokens[old.index])
1580	  {
1581	  case BEGLINE:
1582	    p.constraint &= BEGLINE_CONSTRAINT;
1583	    break;
1584	  case ENDLINE:
1585	    p.constraint &= ENDLINE_CONSTRAINT;
1586	    break;
1587	  case BEGWORD:
1588	    p.constraint &= BEGWORD_CONSTRAINT;
1589	    break;
1590	  case ENDWORD:
1591	    p.constraint &= ENDWORD_CONSTRAINT;
1592	    break;
1593	  case LIMWORD:
1594	    p.constraint &= LIMWORD_CONSTRAINT;
1595	    break;
1596	  case NOTLIMWORD:
1597	    p.constraint &= NOTLIMWORD_CONSTRAINT;
1598	    break;
1599	  default:
1600	    break;
1601	  }
1602	for (j = 0; j < d->follows[old.index].nelem; ++j)
1603	  {
1604	    p.index = d->follows[old.index].elems[j].index;
1605	    insert(p, s);
1606	  }
1607	/* Force rescan to start at the beginning. */
1608	i = -1;
1609      }
1610
1611  free(visited);
1612}
1613
1614/* Perform bottom-up analysis on the parse tree, computing various functions.
1615   Note that at this point, we're pretending constructs like \< are real
1616   characters rather than constraints on what can follow them.
1617
1618   Nullable:  A node is nullable if it is at the root of a regexp that can
1619   match the empty string.
1620   *  EMPTY leaves are nullable.
1621   * No other leaf is nullable.
1622   * A QMARK or STAR node is nullable.
1623   * A PLUS node is nullable if its argument is nullable.
1624   * A CAT node is nullable if both its arguments are nullable.
1625   * An OR node is nullable if either argument is nullable.
1626
1627   Firstpos:  The firstpos of a node is the set of positions (nonempty leaves)
1628   that could correspond to the first character of a string matching the
1629   regexp rooted at the given node.
1630   * EMPTY leaves have empty firstpos.
1631   * The firstpos of a nonempty leaf is that leaf itself.
1632   * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
1633     argument.
1634   * The firstpos of a CAT node is the firstpos of the left argument, union
1635     the firstpos of the right if the left argument is nullable.
1636   * The firstpos of an OR node is the union of firstpos of each argument.
1637
1638   Lastpos:  The lastpos of a node is the set of positions that could
1639   correspond to the last character of a string matching the regexp at
1640   the given node.
1641   * EMPTY leaves have empty lastpos.
1642   * The lastpos of a nonempty leaf is that leaf itself.
1643   * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
1644     argument.
1645   * The lastpos of a CAT node is the lastpos of its right argument, union
1646     the lastpos of the left if the right argument is nullable.
1647   * The lastpos of an OR node is the union of the lastpos of each argument.
1648
1649   Follow:  The follow of a position is the set of positions that could
1650   correspond to the character following a character matching the node in
1651   a string matching the regexp.  At this point we consider special symbols
1652   that match the empty string in some context to be just normal characters.
1653   Later, if we find that a special symbol is in a follow set, we will
1654   replace it with the elements of its follow, labeled with an appropriate
1655   constraint.
1656   * Every node in the firstpos of the argument of a STAR or PLUS node is in
1657     the follow of every node in the lastpos.
1658   * Every node in the firstpos of the second argument of a CAT node is in
1659     the follow of every node in the lastpos of the first argument.
1660
1661   Because of the postfix representation of the parse tree, the depth-first
1662   analysis is conveniently done by a linear scan with the aid of a stack.
1663   Sets are stored as arrays of the elements, obeying a stack-like allocation
1664   scheme; the number of elements in each set deeper in the stack can be
1665   used to determine the address of a particular set's array. */
1666void
1667dfaanalyze (struct dfa *d, int searchflag)
1668{
1669  int *nullable;		/* Nullable stack. */
1670  int *nfirstpos;		/* Element count stack for firstpos sets. */
1671  position *firstpos;		/* Array where firstpos elements are stored. */
1672  int *nlastpos;		/* Element count stack for lastpos sets. */
1673  position *lastpos;		/* Array where lastpos elements are stored. */
1674  int *nalloc;			/* Sizes of arrays allocated to follow sets. */
1675  position_set tmp;		/* Temporary set for merging sets. */
1676  position_set merged;		/* Result of merging sets. */
1677  int wants_newline;		/* True if some position wants newline info. */
1678  int *o_nullable;
1679  int *o_nfirst, *o_nlast;
1680  position *o_firstpos, *o_lastpos;
1681  int i, j;
1682  position *pos;
1683
1684#ifdef DEBUG
1685  fprintf(stderr, "dfaanalyze:\n");
1686  for (i = 0; i < d->tindex; ++i)
1687    {
1688      fprintf(stderr, " %d:", i);
1689      prtok(d->tokens[i]);
1690    }
1691  putc('\n', stderr);
1692#endif
1693
1694  d->searchflag = searchflag;
1695
1696  MALLOC(nullable, int, d->depth);
1697  o_nullable = nullable;
1698  MALLOC(nfirstpos, int, d->depth);
1699  o_nfirst = nfirstpos;
1700  MALLOC(firstpos, position, d->nleaves);
1701  o_firstpos = firstpos, firstpos += d->nleaves;
1702  MALLOC(nlastpos, int, d->depth);
1703  o_nlast = nlastpos;
1704  MALLOC(lastpos, position, d->nleaves);
1705  o_lastpos = lastpos, lastpos += d->nleaves;
1706  MALLOC(nalloc, int, d->tindex);
1707  for (i = 0; i < d->tindex; ++i)
1708    nalloc[i] = 0;
1709  MALLOC(merged.elems, position, d->nleaves);
1710
1711  CALLOC(d->follows, position_set, d->tindex);
1712
1713  for (i = 0; i < d->tindex; ++i)
1714#ifdef DEBUG
1715    {				/* Nonsyntactic #ifdef goo... */
1716#endif
1717    switch (d->tokens[i])
1718      {
1719      case EMPTY:
1720	/* The empty set is nullable. */
1721	*nullable++ = 1;
1722
1723	/* The firstpos and lastpos of the empty leaf are both empty. */
1724	*nfirstpos++ = *nlastpos++ = 0;
1725	break;
1726
1727      case STAR:
1728      case PLUS:
1729	/* Every element in the firstpos of the argument is in the follow
1730	   of every element in the lastpos. */
1731	tmp.nelem = nfirstpos[-1];
1732	tmp.elems = firstpos;
1733	pos = lastpos;
1734	for (j = 0; j < nlastpos[-1]; ++j)
1735	  {
1736	    merge(&tmp, &d->follows[pos[j].index], &merged);
1737	    REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
1738				 nalloc[pos[j].index], merged.nelem - 1);
1739	    copy(&merged, &d->follows[pos[j].index]);
1740	  }
1741
1742      case QMARK:
1743	/* A QMARK or STAR node is automatically nullable. */
1744	if (d->tokens[i] != PLUS)
1745	  nullable[-1] = 1;
1746	break;
1747
1748      case CAT:
1749	/* Every element in the firstpos of the second argument is in the
1750	   follow of every element in the lastpos of the first argument. */
1751	tmp.nelem = nfirstpos[-1];
1752	tmp.elems = firstpos;
1753	pos = lastpos + nlastpos[-1];
1754	for (j = 0; j < nlastpos[-2]; ++j)
1755	  {
1756	    merge(&tmp, &d->follows[pos[j].index], &merged);
1757	    REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
1758				 nalloc[pos[j].index], merged.nelem - 1);
1759	    copy(&merged, &d->follows[pos[j].index]);
1760	  }
1761
1762	/* The firstpos of a CAT node is the firstpos of the first argument,
1763	   union that of the second argument if the first is nullable. */
1764	if (nullable[-2])
1765	  nfirstpos[-2] += nfirstpos[-1];
1766	else
1767	  firstpos += nfirstpos[-1];
1768	--nfirstpos;
1769
1770	/* The lastpos of a CAT node is the lastpos of the second argument,
1771	   union that of the first argument if the second is nullable. */
1772	if (nullable[-1])
1773	  nlastpos[-2] += nlastpos[-1];
1774	else
1775	  {
1776	    pos = lastpos + nlastpos[-2];
1777	    for (j = nlastpos[-1] - 1; j >= 0; --j)
1778	      pos[j] = lastpos[j];
1779	    lastpos += nlastpos[-2];
1780	    nlastpos[-2] = nlastpos[-1];
1781	  }
1782	--nlastpos;
1783
1784	/* A CAT node is nullable if both arguments are nullable. */
1785	nullable[-2] = nullable[-1] && nullable[-2];
1786	--nullable;
1787	break;
1788
1789      case OR:
1790      case ORTOP:
1791	/* The firstpos is the union of the firstpos of each argument. */
1792	nfirstpos[-2] += nfirstpos[-1];
1793	--nfirstpos;
1794
1795	/* The lastpos is the union of the lastpos of each argument. */
1796	nlastpos[-2] += nlastpos[-1];
1797	--nlastpos;
1798
1799	/* An OR node is nullable if either argument is nullable. */
1800	nullable[-2] = nullable[-1] || nullable[-2];
1801	--nullable;
1802	break;
1803
1804      default:
1805	/* Anything else is a nonempty position.  (Note that special
1806	   constructs like \< are treated as nonempty strings here;
1807	   an "epsilon closure" effectively makes them nullable later.
1808	   Backreferences have to get a real position so we can detect
1809	   transitions on them later.  But they are nullable. */
1810	*nullable++ = d->tokens[i] == BACKREF;
1811
1812	/* This position is in its own firstpos and lastpos. */
1813	*nfirstpos++ = *nlastpos++ = 1;
1814	--firstpos, --lastpos;
1815	firstpos->index = lastpos->index = i;
1816	firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
1817
1818	/* Allocate the follow set for this position. */
1819	nalloc[i] = 1;
1820	MALLOC(d->follows[i].elems, position, nalloc[i]);
1821	break;
1822      }
1823#ifdef DEBUG
1824    /* ... balance the above nonsyntactic #ifdef goo... */
1825      fprintf(stderr, "node %d:", i);
1826      prtok(d->tokens[i]);
1827      putc('\n', stderr);
1828      fprintf(stderr, nullable[-1] ? " nullable: yes\n" : " nullable: no\n");
1829      fprintf(stderr, " firstpos:");
1830      for (j = nfirstpos[-1] - 1; j >= 0; --j)
1831	{
1832	  fprintf(stderr, " %d:", firstpos[j].index);
1833	  prtok(d->tokens[firstpos[j].index]);
1834	}
1835      fprintf(stderr, "\n lastpos:");
1836      for (j = nlastpos[-1] - 1; j >= 0; --j)
1837	{
1838	  fprintf(stderr, " %d:", lastpos[j].index);
1839	  prtok(d->tokens[lastpos[j].index]);
1840	}
1841      putc('\n', stderr);
1842    }
1843#endif
1844
1845  /* For each follow set that is the follow set of a real position, replace
1846     it with its epsilon closure. */
1847  for (i = 0; i < d->tindex; ++i)
1848    if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF
1849#ifdef MBS_SUPPORT
1850        || d->tokens[i] == ANYCHAR
1851        || d->tokens[i] == MBCSET
1852#endif
1853	|| d->tokens[i] >= CSET)
1854      {
1855#ifdef DEBUG
1856	fprintf(stderr, "follows(%d:", i);
1857	prtok(d->tokens[i]);
1858	fprintf(stderr, "):");
1859	for (j = d->follows[i].nelem - 1; j >= 0; --j)
1860	  {
1861	    fprintf(stderr, " %d:", d->follows[i].elems[j].index);
1862	    prtok(d->tokens[d->follows[i].elems[j].index]);
1863	  }
1864	putc('\n', stderr);
1865#endif
1866	copy(&d->follows[i], &merged);
1867	epsclosure(&merged, d);
1868	if (d->follows[i].nelem < merged.nelem)
1869	  REALLOC(d->follows[i].elems, position, merged.nelem);
1870	copy(&merged, &d->follows[i]);
1871      }
1872
1873  /* Get the epsilon closure of the firstpos of the regexp.  The result will
1874     be the set of positions of state 0. */
1875  merged.nelem = 0;
1876  for (i = 0; i < nfirstpos[-1]; ++i)
1877    insert(firstpos[i], &merged);
1878  epsclosure(&merged, d);
1879
1880  /* Check if any of the positions of state 0 will want newline context. */
1881  wants_newline = 0;
1882  for (i = 0; i < merged.nelem; ++i)
1883    if (PREV_NEWLINE_DEPENDENT(merged.elems[i].constraint))
1884      wants_newline = 1;
1885
1886  /* Build the initial state. */
1887  d->salloc = 1;
1888  d->sindex = 0;
1889  MALLOC(d->states, dfa_state, d->salloc);
1890  state_index(d, &merged, wants_newline, 0);
1891
1892  free(o_nullable);
1893  free(o_nfirst);
1894  free(o_firstpos);
1895  free(o_nlast);
1896  free(o_lastpos);
1897  free(nalloc);
1898  free(merged.elems);
1899}
1900
1901/* Find, for each character, the transition out of state s of d, and store
1902   it in the appropriate slot of trans.
1903
1904   We divide the positions of s into groups (positions can appear in more
1905   than one group).  Each group is labeled with a set of characters that
1906   every position in the group matches (taking into account, if necessary,
1907   preceding context information of s).  For each group, find the union
1908   of the its elements' follows.  This set is the set of positions of the
1909   new state.  For each character in the group's label, set the transition
1910   on this character to be to a state corresponding to the set's positions,
1911   and its associated backward context information, if necessary.
1912
1913   If we are building a searching matcher, we include the positions of state
1914   0 in every state.
1915
1916   The collection of groups is constructed by building an equivalence-class
1917   partition of the positions of s.
1918
1919   For each position, find the set of characters C that it matches.  Eliminate
1920   any characters from C that fail on grounds of backward context.
1921
1922   Search through the groups, looking for a group whose label L has nonempty
1923   intersection with C.  If L - C is nonempty, create a new group labeled
1924   L - C and having the same positions as the current group, and set L to
1925   the intersection of L and C.  Insert the position in this group, set
1926   C = C - L, and resume scanning.
1927
1928   If after comparing with every group there are characters remaining in C,
1929   create a new group labeled with the characters of C and insert this
1930   position in that group. */
1931void
1932dfastate (int s, struct dfa *d, int trans[])
1933{
1934  position_set grps[NOTCHAR];	/* As many as will ever be needed. */
1935  charclass labels[NOTCHAR];	/* Labels corresponding to the groups. */
1936  int ngrps = 0;		/* Number of groups actually used. */
1937  position pos;			/* Current position being considered. */
1938  charclass matches;		/* Set of matching characters. */
1939  int matchesf;			/* True if matches is nonempty. */
1940  charclass intersect;		/* Intersection with some label set. */
1941  int intersectf;		/* True if intersect is nonempty. */
1942  charclass leftovers;		/* Stuff in the label that didn't match. */
1943  int leftoversf;		/* True if leftovers is nonempty. */
1944  static charclass letters;	/* Set of characters considered letters. */
1945  static charclass newline;	/* Set of characters that aren't newline. */
1946  position_set follows;		/* Union of the follows of some group. */
1947  position_set tmp;		/* Temporary space for merging sets. */
1948  int state;			/* New state. */
1949  int wants_newline;		/* New state wants to know newline context. */
1950  int state_newline;		/* New state on a newline transition. */
1951  int wants_letter;		/* New state wants to know letter context. */
1952  int state_letter;		/* New state on a letter transition. */
1953  static int initialized;	/* Flag for static initialization. */
1954#ifdef MBS_SUPPORT
1955  int next_isnt_1st_byte = 0;	/* Flag If we can't add state0.  */
1956#endif
1957  int i, j, k;
1958
1959  /* Initialize the set of letters, if necessary. */
1960  if (! initialized)
1961    {
1962      initialized = 1;
1963      for (i = 0; i < NOTCHAR; ++i)
1964	if (IS_WORD_CONSTITUENT(i))
1965	  setbit(i, letters);
1966      setbit(eolbyte, newline);
1967    }
1968
1969  zeroset(matches);
1970
1971  for (i = 0; i < d->states[s].elems.nelem; ++i)
1972    {
1973      pos = d->states[s].elems.elems[i];
1974      if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
1975	setbit(d->tokens[pos.index], matches);
1976      else if (d->tokens[pos.index] >= CSET)
1977	copyset(d->charclasses[d->tokens[pos.index] - CSET], matches);
1978#ifdef MBS_SUPPORT
1979      else if (d->tokens[pos.index] == ANYCHAR
1980               || d->tokens[pos.index] == MBCSET)
1981      /* MB_CUR_MAX > 1  */
1982	{
1983	  /* ANYCHAR and MBCSET must match with a single character, so we
1984	     must put it to d->states[s].mbps, which contains the positions
1985	     which can match with a single character not a byte.  */
1986	  if (d->states[s].mbps.nelem == 0)
1987	    {
1988	      MALLOC(d->states[s].mbps.elems, position,
1989		     d->states[s].elems.nelem);
1990	    }
1991	  insert(pos, &(d->states[s].mbps));
1992	  continue;
1993	}
1994#endif /* MBS_SUPPORT */
1995      else
1996	continue;
1997
1998      /* Some characters may need to be eliminated from matches because
1999	 they fail in the current context. */
2000      if (pos.constraint != 0xFF)
2001	{
2002	  if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
2003					 d->states[s].newline, 1))
2004	    clrbit(eolbyte, matches);
2005	  if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
2006					 d->states[s].newline, 0))
2007	    for (j = 0; j < CHARCLASS_INTS; ++j)
2008	      matches[j] &= newline[j];
2009	  if (! MATCHES_LETTER_CONTEXT(pos.constraint,
2010					d->states[s].letter, 1))
2011	    for (j = 0; j < CHARCLASS_INTS; ++j)
2012	      matches[j] &= ~letters[j];
2013	  if (! MATCHES_LETTER_CONTEXT(pos.constraint,
2014					d->states[s].letter, 0))
2015	    for (j = 0; j < CHARCLASS_INTS; ++j)
2016	      matches[j] &= letters[j];
2017
2018	  /* If there are no characters left, there's no point in going on. */
2019	  for (j = 0; j < CHARCLASS_INTS && !matches[j]; ++j)
2020	    continue;
2021	  if (j == CHARCLASS_INTS)
2022	    continue;
2023	}
2024
2025      for (j = 0; j < ngrps; ++j)
2026	{
2027	  /* If matches contains a single character only, and the current
2028	     group's label doesn't contain that character, go on to the
2029	     next group. */
2030	  if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR
2031	      && !tstbit(d->tokens[pos.index], labels[j]))
2032	    continue;
2033
2034	  /* Check if this group's label has a nonempty intersection with
2035	     matches. */
2036	  intersectf = 0;
2037	  for (k = 0; k < CHARCLASS_INTS; ++k)
2038	    (intersect[k] = matches[k] & labels[j][k]) ? (intersectf = 1) : 0;
2039	  if (! intersectf)
2040	    continue;
2041
2042	  /* It does; now find the set differences both ways. */
2043	  leftoversf = matchesf = 0;
2044	  for (k = 0; k < CHARCLASS_INTS; ++k)
2045	    {
2046	      /* Even an optimizing compiler can't know this for sure. */
2047	      int match = matches[k], label = labels[j][k];
2048
2049	      (leftovers[k] = ~match & label) ? (leftoversf = 1) : 0;
2050	      (matches[k] = match & ~label) ? (matchesf = 1) : 0;
2051	    }
2052
2053	  /* If there were leftovers, create a new group labeled with them. */
2054	  if (leftoversf)
2055	    {
2056	      copyset(leftovers, labels[ngrps]);
2057	      copyset(intersect, labels[j]);
2058	      MALLOC(grps[ngrps].elems, position, d->nleaves);
2059	      copy(&grps[j], &grps[ngrps]);
2060	      ++ngrps;
2061	    }
2062
2063	  /* Put the position in the current group.  Note that there is no
2064	     reason to call insert() here. */
2065	  grps[j].elems[grps[j].nelem++] = pos;
2066
2067	  /* If every character matching the current position has been
2068	     accounted for, we're done. */
2069	  if (! matchesf)
2070	    break;
2071	}
2072
2073      /* If we've passed the last group, and there are still characters
2074	 unaccounted for, then we'll have to create a new group. */
2075      if (j == ngrps)
2076	{
2077	  copyset(matches, labels[ngrps]);
2078	  zeroset(matches);
2079	  MALLOC(grps[ngrps].elems, position, d->nleaves);
2080	  grps[ngrps].nelem = 1;
2081	  grps[ngrps].elems[0] = pos;
2082	  ++ngrps;
2083	}
2084    }
2085
2086  MALLOC(follows.elems, position, d->nleaves);
2087  MALLOC(tmp.elems, position, d->nleaves);
2088
2089  /* If we are a searching matcher, the default transition is to a state
2090     containing the positions of state 0, otherwise the default transition
2091     is to fail miserably. */
2092  if (d->searchflag)
2093    {
2094      wants_newline = 0;
2095      wants_letter = 0;
2096      for (i = 0; i < d->states[0].elems.nelem; ++i)
2097	{
2098	  if (PREV_NEWLINE_DEPENDENT(d->states[0].elems.elems[i].constraint))
2099	    wants_newline = 1;
2100	  if (PREV_LETTER_DEPENDENT(d->states[0].elems.elems[i].constraint))
2101	    wants_letter = 1;
2102	}
2103      copy(&d->states[0].elems, &follows);
2104      state = state_index(d, &follows, 0, 0);
2105      if (wants_newline)
2106	state_newline = state_index(d, &follows, 1, 0);
2107      else
2108	state_newline = state;
2109      if (wants_letter)
2110	state_letter = state_index(d, &follows, 0, 1);
2111      else
2112	state_letter = state;
2113      for (i = 0; i < NOTCHAR; ++i)
2114	trans[i] = (IS_WORD_CONSTITUENT(i)) ? state_letter : state;
2115      trans[eolbyte] = state_newline;
2116    }
2117  else
2118    for (i = 0; i < NOTCHAR; ++i)
2119      trans[i] = -1;
2120
2121  for (i = 0; i < ngrps; ++i)
2122    {
2123      follows.nelem = 0;
2124
2125      /* Find the union of the follows of the positions of the group.
2126	 This is a hideously inefficient loop.  Fix it someday. */
2127      for (j = 0; j < grps[i].nelem; ++j)
2128	for (k = 0; k < d->follows[grps[i].elems[j].index].nelem; ++k)
2129	  insert(d->follows[grps[i].elems[j].index].elems[k], &follows);
2130
2131#ifdef MBS_SUPPORT
2132      if (MB_CUR_MAX > 1)
2133	{
2134	  /* If a token in follows.elems is not 1st byte of a multibyte
2135	     character, or the states of follows must accept the bytes
2136	     which are not 1st byte of the multibyte character.
2137	     Then, if a state of follows encounter a byte, it must not be
2138	     a 1st byte of a multibyte character nor singlebyte character.
2139	     We cansel to add state[0].follows to next state, because
2140	     state[0] must accept 1st-byte
2141
2142	     For example, we assume <sb a> is a certain singlebyte
2143	     character, <mb A> is a certain multibyte character, and the
2144	     codepoint of <sb a> equals the 2nd byte of the codepoint of
2145	     <mb A>.
2146	     When state[0] accepts <sb a>, state[i] transit to state[i+1]
2147	     by accepting accepts 1st byte of <mb A>, and state[i+1]
2148	     accepts 2nd byte of <mb A>, if state[i+1] encounter the
2149	     codepoint of <sb a>, it must not be <sb a> but 2nd byte of
2150	     <mb A>, so we can not add state[0].  */
2151
2152	  next_isnt_1st_byte = 0;
2153	  for (j = 0; j < follows.nelem; ++j)
2154	    {
2155	      if (!(d->multibyte_prop[follows.elems[j].index] & 1))
2156		{
2157		  next_isnt_1st_byte = 1;
2158		  break;
2159		}
2160	    }
2161	}
2162#endif
2163
2164      /* If we are building a searching matcher, throw in the positions
2165	 of state 0 as well. */
2166#ifdef MBS_SUPPORT
2167      if (d->searchflag && (MB_CUR_MAX == 1 || !next_isnt_1st_byte))
2168#else
2169      if (d->searchflag)
2170#endif
2171	for (j = 0; j < d->states[0].elems.nelem; ++j)
2172	  insert(d->states[0].elems.elems[j], &follows);
2173
2174      /* Find out if the new state will want any context information. */
2175      wants_newline = 0;
2176      if (tstbit(eolbyte, labels[i]))
2177	for (j = 0; j < follows.nelem; ++j)
2178	  if (PREV_NEWLINE_DEPENDENT(follows.elems[j].constraint))
2179	    wants_newline = 1;
2180
2181      wants_letter = 0;
2182      for (j = 0; j < CHARCLASS_INTS; ++j)
2183	if (labels[i][j] & letters[j])
2184	  break;
2185      if (j < CHARCLASS_INTS)
2186	for (j = 0; j < follows.nelem; ++j)
2187	  if (PREV_LETTER_DEPENDENT(follows.elems[j].constraint))
2188	    wants_letter = 1;
2189
2190      /* Find the state(s) corresponding to the union of the follows. */
2191      state = state_index(d, &follows, 0, 0);
2192      if (wants_newline)
2193	state_newline = state_index(d, &follows, 1, 0);
2194      else
2195	state_newline = state;
2196      if (wants_letter)
2197	state_letter = state_index(d, &follows, 0, 1);
2198      else
2199	state_letter = state;
2200
2201      /* Set the transitions for each character in the current label. */
2202      for (j = 0; j < CHARCLASS_INTS; ++j)
2203	for (k = 0; k < INTBITS; ++k)
2204	  if (labels[i][j] & 1 << k)
2205	    {
2206	      int c = j * INTBITS + k;
2207
2208	      if (c == eolbyte)
2209		trans[c] = state_newline;
2210	      else if (IS_WORD_CONSTITUENT(c))
2211		trans[c] = state_letter;
2212	      else if (c < NOTCHAR)
2213		trans[c] = state;
2214	    }
2215    }
2216
2217  for (i = 0; i < ngrps; ++i)
2218    free(grps[i].elems);
2219  free(follows.elems);
2220  free(tmp.elems);
2221}
2222
2223/* Some routines for manipulating a compiled dfa's transition tables.
2224   Each state may or may not have a transition table; if it does, and it
2225   is a non-accepting state, then d->trans[state] points to its table.
2226   If it is an accepting state then d->fails[state] points to its table.
2227   If it has no table at all, then d->trans[state] is NULL.
2228   TODO: Improve this comment, get rid of the unnecessary redundancy. */
2229
2230static void
2231build_state (int s, struct dfa *d)
2232{
2233  int *trans;			/* The new transition table. */
2234  int i;
2235
2236  /* Set an upper limit on the number of transition tables that will ever
2237     exist at once.  1024 is arbitrary.  The idea is that the frequently
2238     used transition tables will be quickly rebuilt, whereas the ones that
2239     were only needed once or twice will be cleared away. */
2240  if (d->trcount >= 1024)
2241    {
2242      for (i = 0; i < d->tralloc; ++i)
2243	if (d->trans[i])
2244	  {
2245	    free((ptr_t) d->trans[i]);
2246	    d->trans[i] = NULL;
2247	  }
2248	else if (d->fails[i])
2249	  {
2250	    free((ptr_t) d->fails[i]);
2251	    d->fails[i] = NULL;
2252	  }
2253      d->trcount = 0;
2254    }
2255
2256  ++d->trcount;
2257
2258  /* Set up the success bits for this state. */
2259  d->success[s] = 0;
2260  if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 1, d->states[s].letter, 0,
2261      s, *d))
2262    d->success[s] |= 4;
2263  if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 1,
2264      s, *d))
2265    d->success[s] |= 2;
2266  if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 0,
2267      s, *d))
2268    d->success[s] |= 1;
2269
2270  MALLOC(trans, int, NOTCHAR);
2271  dfastate(s, d, trans);
2272
2273  /* Now go through the new transition table, and make sure that the trans
2274     and fail arrays are allocated large enough to hold a pointer for the
2275     largest state mentioned in the table. */
2276  for (i = 0; i < NOTCHAR; ++i)
2277    if (trans[i] >= d->tralloc)
2278      {
2279	int oldalloc = d->tralloc;
2280
2281	while (trans[i] >= d->tralloc)
2282	  d->tralloc *= 2;
2283	REALLOC(d->realtrans, int *, d->tralloc + 1);
2284	d->trans = d->realtrans + 1;
2285	REALLOC(d->fails, int *, d->tralloc);
2286	REALLOC(d->success, int, d->tralloc);
2287	while (oldalloc < d->tralloc)
2288	  {
2289	    d->trans[oldalloc] = NULL;
2290	    d->fails[oldalloc++] = NULL;
2291	  }
2292      }
2293
2294  /* Newline is a sentinel.  */
2295  trans[eolbyte] = -1;
2296
2297  if (ACCEPTING(s, *d))
2298    d->fails[s] = trans;
2299  else
2300    d->trans[s] = trans;
2301}
2302
2303static void
2304build_state_zero (struct dfa *d)
2305{
2306  d->tralloc = 1;
2307  d->trcount = 0;
2308  CALLOC(d->realtrans, int *, d->tralloc + 1);
2309  d->trans = d->realtrans + 1;
2310  CALLOC(d->fails, int *, d->tralloc);
2311  MALLOC(d->success, int, d->tralloc);
2312  build_state(0, d);
2313}
2314
2315#ifdef MBS_SUPPORT
2316/* Multibyte character handling sub-routins for dfaexec.  */
2317
2318/* Initial state may encounter the byte which is not a singlebyte character
2319   nor 1st byte of a multibyte character.  But it is incorrect for initial
2320   state to accept such a byte.
2321   For example, in sjis encoding the regular expression like "\\" accepts
2322   the codepoint 0x5c, but should not accept the 2nd byte of the codepoint
2323   0x815c. Then Initial state must skip the bytes which are not a singlebyte
2324   character nor 1st byte of a multibyte character.  */
2325#define SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p)		\
2326  if (s == 0)						\
2327    {							\
2328      while (inputwcs[p - buf_begin] == 0		\
2329            && mblen_buf[p - buf_begin] > 0		\
2330	    && p < buf_end)				\
2331        ++p;						\
2332      if (p >= end)					\
2333	{						\
2334          free(mblen_buf);				\
2335          free(inputwcs);				\
2336	  return (size_t) -1;				\
2337	}						\
2338    }
2339
2340static void
2341realloc_trans_if_necessary(struct dfa *d, int new_state)
2342{
2343  /* Make sure that the trans and fail arrays are allocated large enough
2344     to hold a pointer for the new state. */
2345  if (new_state >= d->tralloc)
2346    {
2347      int oldalloc = d->tralloc;
2348
2349      while (new_state >= d->tralloc)
2350	d->tralloc *= 2;
2351      REALLOC(d->realtrans, int *, d->tralloc + 1);
2352      d->trans = d->realtrans + 1;
2353      REALLOC(d->fails, int *, d->tralloc);
2354      REALLOC(d->success, int, d->tralloc);
2355      while (oldalloc < d->tralloc)
2356	{
2357	  d->trans[oldalloc] = NULL;
2358	  d->fails[oldalloc++] = NULL;
2359	}
2360    }
2361}
2362
2363/* Return values of transit_state_singlebyte(), and
2364   transit_state_consume_1char.  */
2365typedef enum
2366{
2367  TRANSIT_STATE_IN_PROGRESS,	/* State transition has not finished.  */
2368  TRANSIT_STATE_DONE,		/* State transition has finished.  */
2369  TRANSIT_STATE_END_BUFFER	/* Reach the end of the buffer.  */
2370} status_transit_state;
2371
2372/* Consume a single byte and transit state from 's' to '*next_state'.
2373   This function is almost same as the state transition routin in dfaexec().
2374   But state transition is done just once, otherwise matching succeed or
2375   reach the end of the buffer.  */
2376static status_transit_state
2377transit_state_singlebyte (struct dfa *d, int s, unsigned char const *p,
2378				  int *next_state)
2379{
2380  int *t;
2381  int works = s;
2382
2383  status_transit_state rval = TRANSIT_STATE_IN_PROGRESS;
2384
2385  while (rval == TRANSIT_STATE_IN_PROGRESS)
2386    {
2387      if ((t = d->trans[works]) != NULL)
2388	{
2389	  works = t[*p];
2390	  rval = TRANSIT_STATE_DONE;
2391	  if (works < 0)
2392	    works = 0;
2393	}
2394      else if (works < 0)
2395	{
2396	  if (p == buf_end)
2397	    /* At the moment, it must not happen.  */
2398	    return TRANSIT_STATE_END_BUFFER;
2399	  works = 0;
2400	}
2401      else if (d->fails[works])
2402	{
2403	  works = d->fails[works][*p];
2404	  rval = TRANSIT_STATE_DONE;
2405	}
2406      else
2407	{
2408	  build_state(works, d);
2409	}
2410    }
2411  *next_state = works;
2412  return rval;
2413}
2414
2415/* Check whether period can match or not in the current context.  If it can,
2416   return the amount of the bytes with which period can match, otherwise
2417   return 0.
2418   `pos' is the position of the period.  `index' is the index from the
2419   buf_begin, and it is the current position in the buffer.  */
2420static int
2421match_anychar (struct dfa *d, int s, position pos, int index)
2422{
2423  int newline = 0;
2424  int letter = 0;
2425  wchar_t wc;
2426  int mbclen;
2427
2428  wc = inputwcs[index];
2429  mbclen = (mblen_buf[index] == 0)? 1 : mblen_buf[index];
2430
2431  /* Check context.  */
2432  if (wc == (wchar_t)eolbyte)
2433    {
2434      if (!(syntax_bits & RE_DOT_NEWLINE))
2435	return 0;
2436      newline = 1;
2437    }
2438  else if (wc == (wchar_t)'\0')
2439    {
2440      if (syntax_bits & RE_DOT_NOT_NULL)
2441	return 0;
2442      newline = 1;
2443    }
2444
2445  if (iswalnum(wc) || wc == L'_')
2446    letter = 1;
2447
2448  if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline,
2449			   newline, d->states[s].letter, letter))
2450    return 0;
2451
2452  return mbclen;
2453}
2454
2455/* Check whether bracket expression can match or not in the current context.
2456   If it can, return the amount of the bytes with which expression can match,
2457   otherwise return 0.
2458   `pos' is the position of the bracket expression.  `index' is the index
2459   from the buf_begin, and it is the current position in the buffer.  */
2460int
2461match_mb_charset (struct dfa *d, int s, position pos, int index)
2462{
2463  int i;
2464  int match;		/* Flag which represent that matching succeed.  */
2465  int match_len;	/* Length of the character (or collating element)
2466			   with which this operator match.  */
2467  size_t op_len;	/* Length of the operator.  */
2468  char buffer[128];
2469  wchar_t wcbuf[6];
2470
2471  /* Pointer to the structure to which we are currently reffering.  */
2472  struct mb_char_classes *work_mbc;
2473
2474  int newline = 0;
2475  int letter = 0;
2476  wchar_t wc;		/* Current reffering character.  */
2477
2478  wc = inputwcs[index];
2479
2480  /* Check context.  */
2481  if (wc == (wchar_t)eolbyte)
2482    {
2483      if (!(syntax_bits & RE_DOT_NEWLINE))
2484	return 0;
2485      newline = 1;
2486    }
2487  else if (wc == (wchar_t)'\0')
2488    {
2489      if (syntax_bits & RE_DOT_NOT_NULL)
2490	return 0;
2491      newline = 1;
2492    }
2493  if (iswalnum(wc) || wc == L'_')
2494    letter = 1;
2495  if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline,
2496			   newline, d->states[s].letter, letter))
2497    return 0;
2498
2499  /* Assign the current reffering operator to work_mbc.  */
2500  work_mbc = &(d->mbcsets[(d->multibyte_prop[pos.index]) >> 2]);
2501  match = !work_mbc->invert;
2502  match_len = (mblen_buf[index] == 0)? 1 : mblen_buf[index];
2503
2504  /* match with a character class?  */
2505  for (i = 0; i<work_mbc->nch_classes; i++)
2506    {
2507      if (iswctype((wint_t)wc, work_mbc->ch_classes[i]))
2508	goto charset_matched;
2509    }
2510
2511  strncpy(buffer, buf_begin + index, match_len);
2512  buffer[match_len] = '\0';
2513
2514  /* match with an equivalent class?  */
2515  for (i = 0; i<work_mbc->nequivs; i++)
2516    {
2517      op_len = strlen(work_mbc->equivs[i]);
2518      strncpy(buffer, buf_begin + index, op_len);
2519      buffer[op_len] = '\0';
2520      if (strcoll(work_mbc->equivs[i], buffer) == 0)
2521	{
2522	  match_len = op_len;
2523	  goto charset_matched;
2524	}
2525    }
2526
2527  /* match with a collating element?  */
2528  for (i = 0; i<work_mbc->ncoll_elems; i++)
2529    {
2530      op_len = strlen(work_mbc->coll_elems[i]);
2531      strncpy(buffer, buf_begin + index, op_len);
2532      buffer[op_len] = '\0';
2533
2534      if (strcoll(work_mbc->coll_elems[i], buffer) == 0)
2535	{
2536	  match_len = op_len;
2537	  goto charset_matched;
2538	}
2539    }
2540
2541  wcbuf[0] = wc;
2542  wcbuf[1] = wcbuf[3] = wcbuf[5] = '\0';
2543
2544  /* match with a range?  */
2545  for (i = 0; i<work_mbc->nranges; i++)
2546    {
2547      wcbuf[2] = work_mbc->range_sts[i];
2548      wcbuf[4] = work_mbc->range_ends[i];
2549
2550      if (wcscoll(wcbuf, wcbuf+2) >= 0 &&
2551	  wcscoll(wcbuf+4, wcbuf) >= 0)
2552	goto charset_matched;
2553    }
2554
2555  /* match with a character?  */
2556  if (case_fold)
2557    wc = towlower (wc);
2558  for (i = 0; i<work_mbc->nchars; i++)
2559    {
2560      if (wc == work_mbc->chars[i])
2561	goto charset_matched;
2562    }
2563
2564  match = !match;
2565
2566 charset_matched:
2567  return match ? match_len : 0;
2568}
2569
2570/* Check each of `d->states[s].mbps.elem' can match or not. Then return the
2571   array which corresponds to `d->states[s].mbps.elem' and each element of
2572   the array contains the amount of the bytes with which the element can
2573   match.
2574   `index' is the index from the buf_begin, and it is the current position
2575   in the buffer.
2576   Caller MUST free the array which this function return.  */
2577static int*
2578check_matching_with_multibyte_ops (struct dfa *d, int s, int index)
2579{
2580  int i;
2581  int* rarray;
2582
2583  MALLOC(rarray, int, d->states[s].mbps.nelem);
2584  for (i = 0; i < d->states[s].mbps.nelem; ++i)
2585    {
2586      position pos = d->states[s].mbps.elems[i];
2587      switch(d->tokens[pos.index])
2588	{
2589	case ANYCHAR:
2590	  rarray[i] = match_anychar(d, s, pos, index);
2591	  break;
2592	case MBCSET:
2593	  rarray[i] = match_mb_charset(d, s, pos, index);
2594	  break;
2595	default:
2596	  break; /* can not happen.  */
2597	}
2598    }
2599  return rarray;
2600}
2601
2602/* Consume a single character and enumerate all of the positions which can
2603   be next position from the state `s'.
2604   `match_lens' is the input. It can be NULL, but it can also be the output
2605   of check_matching_with_multibyte_ops() for optimization.
2606   `mbclen' and `pps' are the output.  `mbclen' is the length of the
2607   character consumed, and `pps' is the set this function enumerate.  */
2608static status_transit_state
2609transit_state_consume_1char (struct dfa *d, int s, unsigned char const **pp,
2610			     int *match_lens, int *mbclen, position_set *pps)
2611{
2612  int i, j;
2613  int s1, s2;
2614  int* work_mbls;
2615  status_transit_state rs = TRANSIT_STATE_DONE;
2616
2617  /* Calculate the length of the (single/multi byte) character
2618     to which p points.  */
2619  *mbclen = (mblen_buf[*pp - buf_begin] == 0)? 1
2620    : mblen_buf[*pp - buf_begin];
2621
2622  /* Calculate the state which can be reached from the state `s' by
2623     consuming `*mbclen' single bytes from the buffer.  */
2624  s1 = s;
2625  for (i = 0; i < *mbclen; i++)
2626    {
2627      s2 = s1;
2628      rs = transit_state_singlebyte(d, s2, (*pp)++, &s1);
2629    }
2630  /* Copy the positions contained by `s1' to the set `pps'.  */
2631  copy(&(d->states[s1].elems), pps);
2632
2633  /* Check (inputed)match_lens, and initialize if it is NULL.  */
2634  if (match_lens == NULL && d->states[s].mbps.nelem != 0)
2635    work_mbls = check_matching_with_multibyte_ops(d, s, *pp - buf_begin);
2636  else
2637    work_mbls = match_lens;
2638
2639  /* Add all of the positions which can be reached from `s' by consuming
2640     a single character.  */
2641  for (i = 0; i < d->states[s].mbps.nelem ; i++)
2642   {
2643      if (work_mbls[i] == *mbclen)
2644	for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem;
2645	     j++)
2646	  insert(d->follows[d->states[s].mbps.elems[i].index].elems[j],
2647		 pps);
2648    }
2649
2650  if (match_lens == NULL && work_mbls != NULL)
2651    free(work_mbls);
2652  return rs;
2653}
2654
2655/* Transit state from s, then return new state and update the pointer of the
2656   buffer.  This function is for some operator which can match with a multi-
2657   byte character or a collating element(which may be multi characters).  */
2658static int
2659transit_state (struct dfa *d, int s, unsigned char const **pp)
2660{
2661  int s1;
2662  int mbclen;		/* The length of current input multibyte character. */
2663  int maxlen = 0;
2664  int i, j;
2665  int *match_lens = NULL;
2666  int nelem = d->states[s].mbps.nelem; /* Just a alias.  */
2667  position_set follows;
2668  unsigned char const *p1 = *pp;
2669  status_transit_state rs;
2670  wchar_t wc;
2671
2672  if (nelem > 0)
2673    /* This state has (a) multibyte operator(s).
2674       We check whether each of them can match or not.  */
2675    {
2676      /* Note: caller must free the return value of this function.  */
2677      match_lens = check_matching_with_multibyte_ops(d, s, *pp - buf_begin);
2678
2679      for (i = 0; i < nelem; i++)
2680	/* Search the operator which match the longest string,
2681	   in this state.  */
2682	{
2683	  if (match_lens[i] > maxlen)
2684	    maxlen = match_lens[i];
2685	}
2686    }
2687
2688  if (nelem == 0 || maxlen == 0)
2689    /* This state has no multibyte operator which can match.
2690       We need to  check only one singlebyte character.  */
2691    {
2692      status_transit_state rs;
2693      rs = transit_state_singlebyte(d, s, *pp, &s1);
2694
2695      /* We must update the pointer if state transition succeeded.  */
2696      if (rs == TRANSIT_STATE_DONE)
2697	++*pp;
2698
2699      if (match_lens != NULL)
2700	free(match_lens);
2701      return s1;
2702    }
2703
2704  /* This state has some operators which can match a multibyte character.  */
2705  follows.nelem = 0;
2706  MALLOC(follows.elems, position, d->nleaves);
2707
2708  /* `maxlen' may be longer than the length of a character, because it may
2709     not be a character but a (multi character) collating element.
2710     We enumerate all of the positions which `s' can reach by consuming
2711     `maxlen' bytes.  */
2712  rs = transit_state_consume_1char(d, s, pp, match_lens, &mbclen, &follows);
2713
2714  wc = inputwcs[*pp - mbclen - buf_begin];
2715  s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc));
2716  realloc_trans_if_necessary(d, s1);
2717
2718  while (*pp - p1 < maxlen)
2719    {
2720      follows.nelem = 0;
2721      rs = transit_state_consume_1char(d, s1, pp, NULL, &mbclen, &follows);
2722
2723      for (i = 0; i < nelem ; i++)
2724	{
2725	  if (match_lens[i] == *pp - p1)
2726	    for (j = 0;
2727		 j < d->follows[d->states[s1].mbps.elems[i].index].nelem; j++)
2728	      insert(d->follows[d->states[s1].mbps.elems[i].index].elems[j],
2729		     &follows);
2730	}
2731
2732      wc = inputwcs[*pp - mbclen - buf_begin];
2733      s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc));
2734      realloc_trans_if_necessary(d, s1);
2735    }
2736  free(match_lens);
2737  free(follows.elems);
2738  return s1;
2739}
2740
2741#endif
2742
2743/* Search through a buffer looking for a match to the given struct dfa.
2744   Find the first occurrence of a string matching the regexp in the buffer,
2745   and the shortest possible version thereof.  Return the offset of the first
2746   character after the match, or (size_t) -1 if none is found.  BEGIN points to
2747   the beginning of the buffer, and SIZE is the size of the buffer.  If SIZE
2748   is nonzero, BEGIN[SIZE - 1] must be a newline.  BACKREF points to a place
2749   where we're supposed to store a 1 if backreferencing happened and the
2750   match needs to be verified by a backtracking matcher.  Otherwise
2751   we store a 0 in *backref. */
2752size_t
2753dfaexec (struct dfa *d, char const *begin, size_t size, int *backref)
2754{
2755  register int s;	/* Current state. */
2756  register unsigned char const *p; /* Current input character. */
2757  register unsigned char const *end; /* One past the last input character.  */
2758  register int **trans, *t;	/* Copy of d->trans so it can be optimized
2759				   into a register. */
2760  register unsigned char eol = eolbyte;	/* Likewise for eolbyte.  */
2761  static int sbit[NOTCHAR];	/* Table for anding with d->success. */
2762  static int sbit_init;
2763
2764  if (! sbit_init)
2765    {
2766      int i;
2767
2768      sbit_init = 1;
2769      for (i = 0; i < NOTCHAR; ++i)
2770	sbit[i] = (IS_WORD_CONSTITUENT(i)) ? 2 : 1;
2771      sbit[eol] = 4;
2772    }
2773
2774  if (! d->tralloc)
2775    build_state_zero(d);
2776
2777  s = 0;
2778  p = (unsigned char const *) begin;
2779  end = p + size;
2780  trans = d->trans;
2781
2782#ifdef MBS_SUPPORT
2783  if (MB_CUR_MAX > 1)
2784    {
2785      int remain_bytes, i;
2786      buf_begin = begin;
2787      buf_end = end;
2788
2789      /* initialize mblen_buf, and inputwcs.  */
2790      MALLOC(mblen_buf, unsigned char, end - (unsigned char const *)begin + 2);
2791      MALLOC(inputwcs, wchar_t, end - (unsigned char const *)begin + 2);
2792      memset(&mbs, 0, sizeof(mbstate_t));
2793      remain_bytes = 0;
2794      for (i = 0; i < end - (unsigned char const *)begin + 1; i++)
2795	{
2796	  if (remain_bytes == 0)
2797	    {
2798	      remain_bytes
2799		= mbrtowc(inputwcs + i, begin + i,
2800			  end - (unsigned char const *)begin - i + 1, &mbs);
2801	      if (remain_bytes <= 1)
2802		{
2803		  remain_bytes = 0;
2804		  inputwcs[i] = (wchar_t)begin[i];
2805		  mblen_buf[i] = 0;
2806		}
2807	      else
2808		{
2809		  mblen_buf[i] = remain_bytes;
2810		  remain_bytes--;
2811		}
2812	    }
2813	  else
2814	    {
2815	      mblen_buf[i] = remain_bytes;
2816	      inputwcs[i] = 0;
2817	      remain_bytes--;
2818	    }
2819	}
2820      mblen_buf[i] = 0;
2821      inputwcs[i] = 0; /* sentinel */
2822    }
2823#endif /* MBS_SUPPORT */
2824
2825  for (;;)
2826    {
2827#ifdef MBS_SUPPORT
2828      if (MB_CUR_MAX > 1)
2829	while ((t = trans[s]))
2830	  {
2831	    if (d->states[s].mbps.nelem != 0)
2832	      {
2833		/* Can match with a multibyte character( and multi character
2834		   collating element).  */
2835		unsigned char const *nextp;
2836
2837		SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
2838
2839		nextp = p;
2840		s = transit_state(d, s, &nextp);
2841		p = nextp;
2842
2843		/* Trans table might be updated.  */
2844		trans = d->trans;
2845	      }
2846	    else
2847	      {
2848		SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
2849		s = t[*p++];
2850	      }
2851	  }
2852      else
2853#endif /* MBS_SUPPORT */
2854        while ((t = trans[s]))
2855	  s = t[*p++];
2856
2857      if (s < 0)
2858	{
2859	  if (p == end)
2860	    {
2861#ifdef MBS_SUPPORT
2862	      if (MB_CUR_MAX > 1)
2863		{
2864		  free(mblen_buf);
2865		  free(inputwcs);
2866		}
2867#endif /* MBS_SUPPORT */
2868	      return (size_t) -1;
2869	    }
2870	  s = 0;
2871	}
2872      else if ((t = d->fails[s]))
2873	{
2874	  if (d->success[s] & sbit[*p])
2875	    {
2876	      if (backref)
2877		*backref = (d->states[s].backref != 0);
2878#ifdef MBS_SUPPORT
2879	      if (MB_CUR_MAX > 1)
2880		{
2881		  free(mblen_buf);
2882		  free(inputwcs);
2883		}
2884#endif /* MBS_SUPPORT */
2885	      return (char const *) p - begin;
2886	    }
2887
2888#ifdef MBS_SUPPORT
2889	  if (MB_CUR_MAX > 1)
2890	    {
2891		SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
2892		if (d->states[s].mbps.nelem != 0)
2893		  {
2894		    /* Can match with a multibyte character( and multi
2895		       character collating element).  */
2896		    unsigned char const *nextp;
2897		    nextp = p;
2898		    s = transit_state(d, s, &nextp);
2899		    p = nextp;
2900
2901		    /* Trans table might be updated.  */
2902		    trans = d->trans;
2903		  }
2904		else
2905		s = t[*p++];
2906	    }
2907	  else
2908#endif /* MBS_SUPPORT */
2909	  s = t[*p++];
2910	}
2911      else
2912	{
2913	  build_state(s, d);
2914	  trans = d->trans;
2915	}
2916    }
2917}
2918
2919/* Initialize the components of a dfa that the other routines don't
2920   initialize for themselves. */
2921void
2922dfainit (struct dfa *d)
2923{
2924  d->calloc = 1;
2925  MALLOC(d->charclasses, charclass, d->calloc);
2926  d->cindex = 0;
2927
2928  d->talloc = 1;
2929  MALLOC(d->tokens, token, d->talloc);
2930  d->tindex = d->depth = d->nleaves = d->nregexps = 0;
2931#ifdef MBS_SUPPORT
2932  if (MB_CUR_MAX > 1)
2933    {
2934      d->nmultibyte_prop = 1;
2935      MALLOC(d->multibyte_prop, int, d->nmultibyte_prop);
2936      d->nmbcsets = 0;
2937      d->mbcsets_alloc = 1;
2938      MALLOC(d->mbcsets, struct mb_char_classes, d->mbcsets_alloc);
2939    }
2940#endif
2941
2942  d->searchflag = 0;
2943  d->tralloc = 0;
2944
2945  d->musts = 0;
2946}
2947
2948/* Parse and analyze a single string of the given length. */
2949void
2950dfacomp (char const *s, size_t len, struct dfa *d, int searchflag)
2951{
2952  if (case_fold)	/* dummy folding in service of dfamust() */
2953    {
2954      char *lcopy;
2955      int i;
2956
2957      lcopy = malloc(len);
2958      if (!lcopy)
2959	dfaerror(_("out of memory"));
2960
2961      /* This is a kludge. */
2962      case_fold = 0;
2963      for (i = 0; i < len; ++i)
2964	if (ISUPPER ((unsigned char) s[i]))
2965	  lcopy[i] = tolower ((unsigned char) s[i]);
2966	else
2967	  lcopy[i] = s[i];
2968
2969      dfainit(d);
2970      dfaparse(lcopy, len, d);
2971      free(lcopy);
2972      dfamust(d);
2973      d->cindex = d->tindex = d->depth = d->nleaves = d->nregexps = 0;
2974      case_fold = 1;
2975      dfaparse(s, len, d);
2976      dfaanalyze(d, searchflag);
2977    }
2978  else
2979    {
2980        dfainit(d);
2981        dfaparse(s, len, d);
2982	dfamust(d);
2983        dfaanalyze(d, searchflag);
2984    }
2985}
2986
2987/* Free the storage held by the components of a dfa. */
2988void
2989dfafree (struct dfa *d)
2990{
2991  int i;
2992  struct dfamust *dm, *ndm;
2993
2994  free((ptr_t) d->charclasses);
2995  free((ptr_t) d->tokens);
2996
2997#ifdef MBS_SUPPORT
2998  if (MB_CUR_MAX > 1)
2999    {
3000      free((ptr_t) d->multibyte_prop);
3001      for (i = 0; i < d->nmbcsets; ++i)
3002	{
3003	  int j;
3004	  struct mb_char_classes *p = &(d->mbcsets[i]);
3005	  if (p->chars != NULL)
3006	    free(p->chars);
3007	  if (p->ch_classes != NULL)
3008	    free(p->ch_classes);
3009	  if (p->range_sts != NULL)
3010	    free(p->range_sts);
3011	  if (p->range_ends != NULL)
3012	    free(p->range_ends);
3013
3014	  for (j = 0; j < p->nequivs; ++j)
3015	    free(p->equivs[j]);
3016	  if (p->equivs != NULL)
3017	    free(p->equivs);
3018
3019	  for (j = 0; j < p->ncoll_elems; ++j)
3020	    free(p->coll_elems[j]);
3021	  if (p->coll_elems != NULL)
3022	    free(p->coll_elems);
3023	}
3024      free((ptr_t) d->mbcsets);
3025    }
3026#endif /* MBS_SUPPORT */
3027
3028  for (i = 0; i < d->sindex; ++i)
3029    free((ptr_t) d->states[i].elems.elems);
3030  free((ptr_t) d->states);
3031  for (i = 0; i < d->tindex; ++i)
3032    if (d->follows[i].elems)
3033      free((ptr_t) d->follows[i].elems);
3034  free((ptr_t) d->follows);
3035  for (i = 0; i < d->tralloc; ++i)
3036    if (d->trans[i])
3037      free((ptr_t) d->trans[i]);
3038    else if (d->fails[i])
3039      free((ptr_t) d->fails[i]);
3040  if (d->realtrans) free((ptr_t) d->realtrans);
3041  if (d->fails) free((ptr_t) d->fails);
3042  if (d->success) free((ptr_t) d->success);
3043  for (dm = d->musts; dm; dm = ndm)
3044    {
3045      ndm = dm->next;
3046      free(dm->must);
3047      free((ptr_t) dm);
3048    }
3049}
3050
3051/* Having found the postfix representation of the regular expression,
3052   try to find a long sequence of characters that must appear in any line
3053   containing the r.e.
3054   Finding a "longest" sequence is beyond the scope here;
3055   we take an easy way out and hope for the best.
3056   (Take "(ab|a)b"--please.)
3057
3058   We do a bottom-up calculation of sequences of characters that must appear
3059   in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3060   representation:
3061	sequences that must appear at the left of the match ("left")
3062	sequences that must appear at the right of the match ("right")
3063	lists of sequences that must appear somewhere in the match ("in")
3064	sequences that must constitute the match ("is")
3065
3066   When we get to the root of the tree, we use one of the longest of its
3067   calculated "in" sequences as our answer.  The sequence we find is returned in
3068   d->must (where "d" is the single argument passed to "dfamust");
3069   the length of the sequence is returned in d->mustn.
3070
3071   The sequences calculated for the various types of node (in pseudo ANSI c)
3072   are shown below.  "p" is the operand of unary operators (and the left-hand
3073   operand of binary operators); "q" is the right-hand operand of binary
3074   operators.
3075
3076   "ZERO" means "a zero-length sequence" below.
3077
3078	Type	left		right		is		in
3079	----	----		-----		--		--
3080	char c	# c		# c		# c		# c
3081
3082	ANYCHAR	ZERO		ZERO		ZERO		ZERO
3083
3084	MBCSET	ZERO		ZERO		ZERO		ZERO
3085
3086	CSET	ZERO		ZERO		ZERO		ZERO
3087
3088	STAR	ZERO		ZERO		ZERO		ZERO
3089
3090	QMARK	ZERO		ZERO		ZERO		ZERO
3091
3092	PLUS	p->left		p->right	ZERO		p->in
3093
3094	CAT	(p->is==ZERO)?	(q->is==ZERO)?	(p->is!=ZERO &&	p->in plus
3095		p->left :	q->right :	q->is!=ZERO) ?	q->in plus
3096		p->is##q->left	p->right##q->is	p->is##q->is :	p->right##q->left
3097						ZERO
3098
3099	OR	longest common	longest common	(do p->is and	substrings common to
3100		leading		trailing	q->is have same	p->in and q->in
3101		(sub)sequence	(sub)sequence	length and
3102		of p->left	of p->right	content) ?
3103		and q->left	and q->right	p->is : NULL
3104
3105   If there's anything else we recognize in the tree, all four sequences get set
3106   to zero-length sequences.  If there's something we don't recognize in the tree,
3107   we just return a zero-length sequence.
3108
3109   Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3110   'aaa')?
3111
3112   And. . .is it here or someplace that we might ponder "optimizations" such as
3113	egrep 'psi|epsilon'	->	egrep 'psi'
3114	egrep 'pepsi|epsilon'	->	egrep 'epsi'
3115					(Yes, we now find "epsi" as a "string
3116					that must occur", but we might also
3117					simplify the *entire* r.e. being sought)
3118	grep '[c]'		->	grep 'c'
3119	grep '(ab|a)b'		->	grep 'ab'
3120	grep 'ab*'		->	grep 'a'
3121	grep 'a*b'		->	grep 'b'
3122
3123   There are several issues:
3124
3125   Is optimization easy (enough)?
3126
3127   Does optimization actually accomplish anything,
3128   or is the automaton you get from "psi|epsilon" (for example)
3129   the same as the one you get from "psi" (for example)?
3130
3131   Are optimizable r.e.'s likely to be used in real-life situations
3132   (something like 'ab*' is probably unlikely; something like is
3133   'psi|epsilon' is likelier)? */
3134
3135static char *
3136icatalloc (char *old, char *new)
3137{
3138  char *result;
3139  size_t oldsize, newsize;
3140
3141  newsize = (new == NULL) ? 0 : strlen(new);
3142  if (old == NULL)
3143    oldsize = 0;
3144  else if (newsize == 0)
3145    return old;
3146  else	oldsize = strlen(old);
3147  if (old == NULL)
3148    result = (char *) malloc(newsize + 1);
3149  else
3150    result = (char *) realloc((void *) old, oldsize + newsize + 1);
3151  if (result != NULL && new != NULL)
3152    (void) strcpy(result + oldsize, new);
3153  return result;
3154}
3155
3156static char *
3157icpyalloc (char *string)
3158{
3159  return icatalloc((char *) NULL, string);
3160}
3161
3162static char *
3163istrstr (char *lookin, char *lookfor)
3164{
3165  char *cp;
3166  size_t len;
3167
3168  len = strlen(lookfor);
3169  for (cp = lookin; *cp != '\0'; ++cp)
3170    if (strncmp(cp, lookfor, len) == 0)
3171      return cp;
3172  return NULL;
3173}
3174
3175static void
3176ifree (char *cp)
3177{
3178  if (cp != NULL)
3179    free(cp);
3180}
3181
3182static void
3183freelist (char **cpp)
3184{
3185  int i;
3186
3187  if (cpp == NULL)
3188    return;
3189  for (i = 0; cpp[i] != NULL; ++i)
3190    {
3191      free(cpp[i]);
3192      cpp[i] = NULL;
3193    }
3194}
3195
3196static char **
3197enlist (char **cpp, char *new, size_t len)
3198{
3199  int i, j;
3200
3201  if (cpp == NULL)
3202    return NULL;
3203  if ((new = icpyalloc(new)) == NULL)
3204    {
3205      freelist(cpp);
3206      return NULL;
3207    }
3208  new[len] = '\0';
3209  /* Is there already something in the list that's new (or longer)? */
3210  for (i = 0; cpp[i] != NULL; ++i)
3211    if (istrstr(cpp[i], new) != NULL)
3212      {
3213	free(new);
3214	return cpp;
3215      }
3216  /* Eliminate any obsoleted strings. */
3217  j = 0;
3218  while (cpp[j] != NULL)
3219    if (istrstr(new, cpp[j]) == NULL)
3220      ++j;
3221    else
3222      {
3223	free(cpp[j]);
3224	if (--i == j)
3225	  break;
3226	cpp[j] = cpp[i];
3227	cpp[i] = NULL;
3228      }
3229  /* Add the new string. */
3230  cpp = (char **) realloc((char *) cpp, (i + 2) * sizeof *cpp);
3231  if (cpp == NULL)
3232    return NULL;
3233  cpp[i] = new;
3234  cpp[i + 1] = NULL;
3235  return cpp;
3236}
3237
3238/* Given pointers to two strings, return a pointer to an allocated
3239   list of their distinct common substrings. Return NULL if something
3240   seems wild. */
3241static char **
3242comsubs (char *left, char *right)
3243{
3244  char **cpp;
3245  char *lcp;
3246  char *rcp;
3247  size_t i, len;
3248
3249  if (left == NULL || right == NULL)
3250    return NULL;
3251  cpp = (char **) malloc(sizeof *cpp);
3252  if (cpp == NULL)
3253    return NULL;
3254  cpp[0] = NULL;
3255  for (lcp = left; *lcp != '\0'; ++lcp)
3256    {
3257      len = 0;
3258      rcp = strchr (right, *lcp);
3259      while (rcp != NULL)
3260	{
3261	  for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
3262	    continue;
3263	  if (i > len)
3264	    len = i;
3265	  rcp = strchr (rcp + 1, *lcp);
3266	}
3267      if (len == 0)
3268	continue;
3269      if ((cpp = enlist(cpp, lcp, len)) == NULL)
3270	break;
3271    }
3272  return cpp;
3273}
3274
3275static char **
3276addlists (char **old, char **new)
3277{
3278  int i;
3279
3280  if (old == NULL || new == NULL)
3281    return NULL;
3282  for (i = 0; new[i] != NULL; ++i)
3283    {
3284      old = enlist(old, new[i], strlen(new[i]));
3285      if (old == NULL)
3286	break;
3287    }
3288  return old;
3289}
3290
3291/* Given two lists of substrings, return a new list giving substrings
3292   common to both. */
3293static char **
3294inboth (char **left, char **right)
3295{
3296  char **both;
3297  char **temp;
3298  int lnum, rnum;
3299
3300  if (left == NULL || right == NULL)
3301    return NULL;
3302  both = (char **) malloc(sizeof *both);
3303  if (both == NULL)
3304    return NULL;
3305  both[0] = NULL;
3306  for (lnum = 0; left[lnum] != NULL; ++lnum)
3307    {
3308      for (rnum = 0; right[rnum] != NULL; ++rnum)
3309	{
3310	  temp = comsubs(left[lnum], right[rnum]);
3311	  if (temp == NULL)
3312	    {
3313	      freelist(both);
3314	      return NULL;
3315	    }
3316	  both = addlists(both, temp);
3317	  freelist(temp);
3318	  free(temp);
3319	  if (both == NULL)
3320	    return NULL;
3321	}
3322    }
3323  return both;
3324}
3325
3326typedef struct
3327{
3328  char **in;
3329  char *left;
3330  char *right;
3331  char *is;
3332} must;
3333
3334static void
3335resetmust (must *mp)
3336{
3337  mp->left[0] = mp->right[0] = mp->is[0] = '\0';
3338  freelist(mp->in);
3339}
3340
3341static void
3342dfamust (struct dfa *dfa)
3343{
3344  must *musts;
3345  must *mp;
3346  char *result;
3347  int ri;
3348  int i;
3349  int exact;
3350  token t;
3351  static must must0;
3352  struct dfamust *dm;
3353  static char empty_string[] = "";
3354
3355  result = empty_string;
3356  exact = 0;
3357  musts = (must *) malloc((dfa->tindex + 1) * sizeof *musts);
3358  if (musts == NULL)
3359    return;
3360  mp = musts;
3361  for (i = 0; i <= dfa->tindex; ++i)
3362    mp[i] = must0;
3363  for (i = 0; i <= dfa->tindex; ++i)
3364    {
3365      mp[i].in = (char **) malloc(sizeof *mp[i].in);
3366      mp[i].left = malloc(2);
3367      mp[i].right = malloc(2);
3368      mp[i].is = malloc(2);
3369      if (mp[i].in == NULL || mp[i].left == NULL ||
3370	  mp[i].right == NULL || mp[i].is == NULL)
3371	goto done;
3372      mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0';
3373      mp[i].in[0] = NULL;
3374    }
3375#ifdef DEBUG
3376  fprintf(stderr, "dfamust:\n");
3377  for (i = 0; i < dfa->tindex; ++i)
3378    {
3379      fprintf(stderr, " %d:", i);
3380      prtok(dfa->tokens[i]);
3381    }
3382  putc('\n', stderr);
3383#endif
3384  for (ri = 0; ri < dfa->tindex; ++ri)
3385    {
3386      switch (t = dfa->tokens[ri])
3387	{
3388	case LPAREN:
3389	case RPAREN:
3390	  goto done;		/* "cannot happen" */
3391	case EMPTY:
3392	case BEGLINE:
3393	case ENDLINE:
3394	case BEGWORD:
3395	case ENDWORD:
3396	case LIMWORD:
3397	case NOTLIMWORD:
3398	case BACKREF:
3399	  resetmust(mp);
3400	  break;
3401	case STAR:
3402	case QMARK:
3403	  if (mp <= musts)
3404	    goto done;		/* "cannot happen" */
3405	  --mp;
3406	  resetmust(mp);
3407	  break;
3408	case OR:
3409	case ORTOP:
3410	  if (mp < &musts[2])
3411	    goto done;		/* "cannot happen" */
3412	  {
3413	    char **new;
3414	    must *lmp;
3415	    must *rmp;
3416	    int j, ln, rn, n;
3417
3418	    rmp = --mp;
3419	    lmp = --mp;
3420	    /* Guaranteed to be.  Unlikely, but. . . */
3421	    if (strcmp(lmp->is, rmp->is) != 0)
3422	      lmp->is[0] = '\0';
3423	    /* Left side--easy */
3424	    i = 0;
3425	    while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
3426	      ++i;
3427	    lmp->left[i] = '\0';
3428	    /* Right side */
3429	    ln = strlen(lmp->right);
3430	    rn = strlen(rmp->right);
3431	    n = ln;
3432	    if (n > rn)
3433	      n = rn;
3434	    for (i = 0; i < n; ++i)
3435	      if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
3436		break;
3437	    for (j = 0; j < i; ++j)
3438	      lmp->right[j] = lmp->right[(ln - i) + j];
3439	    lmp->right[j] = '\0';
3440	    new = inboth(lmp->in, rmp->in);
3441	    if (new == NULL)
3442	      goto done;
3443	    freelist(lmp->in);
3444	    free((char *) lmp->in);
3445	    lmp->in = new;
3446	  }
3447	  break;
3448	case PLUS:
3449	  if (mp <= musts)
3450	    goto done;		/* "cannot happen" */
3451	  --mp;
3452	  mp->is[0] = '\0';
3453	  break;
3454	case END:
3455	  if (mp != &musts[1])
3456	    goto done;		/* "cannot happen" */
3457	  for (i = 0; musts[0].in[i] != NULL; ++i)
3458	    if (strlen(musts[0].in[i]) > strlen(result))
3459	      result = musts[0].in[i];
3460	  if (strcmp(result, musts[0].is) == 0)
3461	    exact = 1;
3462	  goto done;
3463	case CAT:
3464	  if (mp < &musts[2])
3465	    goto done;		/* "cannot happen" */
3466	  {
3467	    must *lmp;
3468	    must *rmp;
3469
3470	    rmp = --mp;
3471	    lmp = --mp;
3472	    /* In.  Everything in left, plus everything in
3473	       right, plus catenation of
3474	       left's right and right's left. */
3475	    lmp->in = addlists(lmp->in, rmp->in);
3476	    if (lmp->in == NULL)
3477	      goto done;
3478	    if (lmp->right[0] != '\0' &&
3479		rmp->left[0] != '\0')
3480	      {
3481		char *tp;
3482
3483		tp = icpyalloc(lmp->right);
3484		if (tp == NULL)
3485		  goto done;
3486		tp = icatalloc(tp, rmp->left);
3487		if (tp == NULL)
3488		  goto done;
3489		lmp->in = enlist(lmp->in, tp,
3490				 strlen(tp));
3491		free(tp);
3492		if (lmp->in == NULL)
3493		  goto done;
3494	      }
3495	    /* Left-hand */
3496	    if (lmp->is[0] != '\0')
3497	      {
3498		lmp->left = icatalloc(lmp->left,
3499				      rmp->left);
3500		if (lmp->left == NULL)
3501		  goto done;
3502	      }
3503	    /* Right-hand */
3504	    if (rmp->is[0] == '\0')
3505	      lmp->right[0] = '\0';
3506	    lmp->right = icatalloc(lmp->right, rmp->right);
3507	    if (lmp->right == NULL)
3508	      goto done;
3509	    /* Guaranteed to be */
3510	    if (lmp->is[0] != '\0' && rmp->is[0] != '\0')
3511	      {
3512		lmp->is = icatalloc(lmp->is, rmp->is);
3513		if (lmp->is == NULL)
3514		  goto done;
3515	      }
3516	    else
3517	      lmp->is[0] = '\0';
3518	  }
3519	  break;
3520	default:
3521	  if (t < END)
3522	    {
3523	      /* "cannot happen" */
3524	      goto done;
3525	    }
3526	  else if (t == '\0')
3527	    {
3528	      /* not on *my* shift */
3529	      goto done;
3530	    }
3531	  else if (t >= CSET
3532#ifdef MBS_SUPPORT
3533		   || t == ANYCHAR
3534		   || t == MBCSET
3535#endif /* MBS_SUPPORT */
3536		   )
3537	    {
3538	      /* easy enough */
3539	      resetmust(mp);
3540	    }
3541	  else
3542	    {
3543	      /* plain character */
3544	      resetmust(mp);
3545	      mp->is[0] = mp->left[0] = mp->right[0] = t;
3546	      mp->is[1] = mp->left[1] = mp->right[1] = '\0';
3547	      mp->in = enlist(mp->in, mp->is, (size_t)1);
3548	      if (mp->in == NULL)
3549		goto done;
3550	    }
3551	  break;
3552	}
3553#ifdef DEBUG
3554      fprintf(stderr, " node: %d:", ri);
3555      prtok(dfa->tokens[ri]);
3556      fprintf(stderr, "\n  in:");
3557      for (i = 0; mp->in[i]; ++i)
3558	fprintf(stderr, " \"%s\"", mp->in[i]);
3559      fprintf(stderr, "\n  is: \"%s\"\n", mp->is);
3560      fprintf(stderr, "  left: \"%s\"\n", mp->left);
3561      fprintf(stderr, "  right: \"%s\"\n", mp->right);
3562#endif
3563      ++mp;
3564    }
3565 done:
3566  if (strlen(result))
3567    {
3568      dm = (struct dfamust *) malloc(sizeof (struct dfamust));
3569      dm->exact = exact;
3570      dm->must = malloc(strlen(result) + 1);
3571      strcpy(dm->must, result);
3572      dm->next = dfa->musts;
3573      dfa->musts = dm;
3574    }
3575  mp = musts;
3576  for (i = 0; i <= dfa->tindex; ++i)
3577    {
3578      freelist(mp[i].in);
3579      ifree((char *) mp[i].in);
3580      ifree(mp[i].left);
3581      ifree(mp[i].right);
3582      ifree(mp[i].is);
3583    }
3584  free((char *) mp);
3585}
3586/* vim:set shiftwidth=2: */
3587