• Home
  • History
  • Annotate
  • Line#
  • Navigate
  • Raw
  • Download
  • only in /netgear-WNDR4500v2-V1.0.0.60_1.0.38/ap/gpl/timemachine/gettext-0.17/gettext-tools/libgrep/
1/* dfa.c - deterministic extended regexp routines for GNU
2   Copyright 1988, 1998, 2000, 2005-2006 Free Software Foundation, Inc.
3
4   This program is free software: you can redistribute it and/or modify
5   it under the terms of the GNU General Public License as published by
6   the Free Software Foundation; either version 3 of the License, or
7   (at your option) any later version.
8
9   This program is distributed in the hope that it will be useful,
10   but WITHOUT ANY WARRANTY; without even the implied warranty of
11   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12   GNU General Public License for more details.
13
14   You should have received a copy of the GNU General Public License
15   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
16
17/* Written June, 1988 by Mike Haertel
18   Modified July, 1988 by Arthur David Olson to assist BMG speedups  */
19
20#ifdef HAVE_CONFIG_H
21#include <config.h>
22#endif
23
24#include <assert.h>
25#include <ctype.h>
26#include <stdio.h>
27
28#include <sys/types.h>
29#include <stdlib.h>
30#include <string.h>
31#include <locale.h>
32
33#if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC
34/* We can handle multibyte string.  */
35# define MBS_SUPPORT
36#endif
37
38#ifdef MBS_SUPPORT
39# include <wchar.h>
40# include <wctype.h>
41#endif
42
43#ifndef DEBUG	/* use the same approach as regex.c */
44#undef assert
45#define assert(e)
46#endif /* DEBUG */
47
48#ifndef isgraph
49#define isgraph(C) (isprint(C) && !isspace(C))
50#endif
51
52#if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
53#define ISALPHA(C) isalpha(C)
54#define ISUPPER(C) isupper(C)
55#define ISLOWER(C) islower(C)
56#define ISDIGIT(C) isdigit(C)
57#define ISXDIGIT(C) isxdigit(C)
58#define ISSPACE(C) isspace(C)
59#define ISPUNCT(C) ispunct(C)
60#define ISALNUM(C) isalnum(C)
61#define ISPRINT(C) isprint(C)
62#define ISGRAPH(C) isgraph(C)
63#define ISCNTRL(C) iscntrl(C)
64#else
65#define ISALPHA(C) (isascii(C) && isalpha(C))
66#define ISUPPER(C) (isascii(C) && isupper(C))
67#define ISLOWER(C) (isascii(C) && islower(C))
68#define ISDIGIT(C) (isascii(C) && isdigit(C))
69#define ISXDIGIT(C) (isascii(C) && isxdigit(C))
70#define ISSPACE(C) (isascii(C) && isspace(C))
71#define ISPUNCT(C) (isascii(C) && ispunct(C))
72#define ISALNUM(C) (isascii(C) && isalnum(C))
73#define ISPRINT(C) (isascii(C) && isprint(C))
74#define ISGRAPH(C) (isascii(C) && isgraph(C))
75#define ISCNTRL(C) (isascii(C) && iscntrl(C))
76#endif
77
78/* ISASCIIDIGIT differs from ISDIGIT, as follows:
79   - Its arg may be any int or unsigned int; it need not be an unsigned char.
80   - It's guaranteed to evaluate its argument exactly once.
81   - It's typically faster.
82   Posix 1003.2-1992 section 2.5.2.1 page 50 lines 1556-1558 says that
83   only '0' through '9' are digits.  Prefer ISASCIIDIGIT to ISDIGIT unless
84   it's important to use the locale's definition of `digit' even when the
85   host does not conform to Posix.  */
86#define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9)
87
88#include "regex.h"
89#include "dfa.h"
90#include "hard-locale.h"
91#include "gettext.h"
92#define _(str) gettext (str)
93
94/* HPUX, define those as macros in sys/param.h */
95#ifdef setbit
96# undef setbit
97#endif
98#ifdef clrbit
99# undef clrbit
100#endif
101
102static void dfamust (struct dfa *dfa);
103static void regexp (int toplevel);
104
105static ptr_t
106xcalloc (size_t n, size_t s)
107{
108  ptr_t r = calloc(n, s);
109
110  if (!r)
111    dfaerror(_("Memory exhausted"));
112  return r;
113}
114
115static ptr_t
116xmalloc (size_t n)
117{
118  ptr_t r = malloc(n);
119
120  assert(n != 0);
121  if (!r)
122    dfaerror(_("Memory exhausted"));
123  return r;
124}
125
126static ptr_t
127xrealloc (ptr_t p, size_t n)
128{
129  ptr_t r = realloc(p, n);
130
131  assert(n != 0);
132  if (!r)
133    dfaerror(_("Memory exhausted"));
134  return r;
135}
136
137#define CALLOC(p, t, n) ((p) = (t *) xcalloc((size_t)(n), sizeof (t)))
138#define MALLOC(p, t, n) ((p) = (t *) xmalloc((n) * sizeof (t)))
139#define REALLOC(p, t, n) ((p) = (t *) xrealloc((ptr_t) (p), (n) * sizeof (t)))
140
141/* Reallocate an array of type t if nalloc is too small for index. */
142#define REALLOC_IF_NECESSARY(p, t, nalloc, index) \
143  if ((index) >= (nalloc))			  \
144    {						  \
145      do					  \
146	(nalloc) *= 2;				  \
147      while ((index) >= (nalloc));		  \
148      REALLOC(p, t, nalloc);			  \
149    }
150
151#ifdef DEBUG
152
153static void
154prtok (token t)
155{
156  char const *s;
157
158  if (t < 0)
159    fprintf(stderr, "END");
160  else if (t < NOTCHAR)
161    fprintf(stderr, "%c", t);
162  else
163    {
164      switch (t)
165	{
166	case EMPTY: s = "EMPTY"; break;
167	case BACKREF: s = "BACKREF"; break;
168	case BEGLINE: s = "BEGLINE"; break;
169	case ENDLINE: s = "ENDLINE"; break;
170	case BEGWORD: s = "BEGWORD"; break;
171	case ENDWORD: s = "ENDWORD"; break;
172	case LIMWORD: s = "LIMWORD"; break;
173	case NOTLIMWORD: s = "NOTLIMWORD"; break;
174	case QMARK: s = "QMARK"; break;
175	case STAR: s = "STAR"; break;
176	case PLUS: s = "PLUS"; break;
177	case CAT: s = "CAT"; break;
178	case OR: s = "OR"; break;
179	case ORTOP: s = "ORTOP"; break;
180	case LPAREN: s = "LPAREN"; break;
181	case RPAREN: s = "RPAREN"; break;
182	case CRANGE: s = "CRANGE"; break;
183#ifdef MBS_SUPPORT
184	case ANYCHAR: s = "ANYCHAR"; break;
185	case MBCSET: s = "MBCSET"; break;
186#endif /* MBS_SUPPORT */
187	default: s = "CSET"; break;
188	}
189      fprintf(stderr, "%s", s);
190    }
191}
192#endif /* DEBUG */
193
194/* Stuff pertaining to charclasses. */
195
196static int
197tstbit (unsigned b, charclass c)
198{
199  return c[b / INTBITS] & 1 << b % INTBITS;
200}
201
202static void
203setbit (unsigned b, charclass c)
204{
205  c[b / INTBITS] |= 1 << b % INTBITS;
206}
207
208static void
209clrbit (unsigned b, charclass c)
210{
211  c[b / INTBITS] &= ~(1 << b % INTBITS);
212}
213
214static void
215copyset (charclass src, charclass dst)
216{
217  memcpy (dst, src, sizeof (charclass));
218}
219
220static void
221zeroset (charclass s)
222{
223  memset (s, 0, sizeof (charclass));
224}
225
226static void
227notset (charclass s)
228{
229  int i;
230
231  for (i = 0; i < CHARCLASS_INTS; ++i)
232    s[i] = ~s[i];
233}
234
235static int
236equal (charclass s1, charclass s2)
237{
238  return memcmp (s1, s2, sizeof (charclass)) == 0;
239}
240
241/* A pointer to the current dfa is kept here during parsing. */
242static struct dfa *dfa;
243
244/* Find the index of charclass s in dfa->charclasses, or allocate a new charclass. */
245static int
246charclass_index (charclass s)
247{
248  int i;
249
250  for (i = 0; i < dfa->cindex; ++i)
251    if (equal(s, dfa->charclasses[i]))
252      return i;
253  REALLOC_IF_NECESSARY(dfa->charclasses, charclass, dfa->calloc, dfa->cindex);
254  ++dfa->cindex;
255  copyset(s, dfa->charclasses[i]);
256  return i;
257}
258
259/* Syntax bits controlling the behavior of the lexical analyzer. */
260static reg_syntax_t syntax_bits, syntax_bits_set;
261
262/* Flag for case-folding letters into sets. */
263static int case_fold;
264
265/* End-of-line byte in data.  */
266static unsigned char eolbyte;
267
268/* Entry point to set syntax options. */
269void
270dfasyntax (reg_syntax_t bits, int fold, unsigned char eol)
271{
272  syntax_bits_set = 1;
273  syntax_bits = bits;
274  case_fold = fold;
275  eolbyte = eol;
276}
277
278/* Like setbit, but if case is folded, set both cases of a letter.  */
279static void
280setbit_case_fold (unsigned b, charclass c)
281{
282  setbit (b, c);
283  if (case_fold)
284    {
285      if (ISUPPER (b))
286	setbit (tolower (b), c);
287      else if (ISLOWER (b))
288	setbit (toupper (b), c);
289    }
290}
291
292/* Lexical analyzer.  All the dross that deals with the obnoxious
293   GNU Regex syntax bits is located here.  The poor, suffering
294   reader is referred to the GNU Regex documentation for the
295   meaning of the @#%!@#%^!@ syntax bits. */
296
297static char const *lexstart;	/* Pointer to beginning of input string. */
298static char const *lexptr;	/* Pointer to next input character. */
299static int lexleft;		/* Number of characters remaining. */
300static token lasttok;		/* Previous token returned; initially END. */
301static int laststart;		/* True if we're separated from beginning or (, |
302				   only by zero-width characters. */
303static int parens;		/* Count of outstanding left parens. */
304static int minrep, maxrep;	/* Repeat counts for {m,n}. */
305static int hard_LC_COLLATE;	/* Nonzero if LC_COLLATE is hard.  */
306
307#ifdef MBS_SUPPORT
308/* These variables are used only if (MB_CUR_MAX > 1).  */
309static mbstate_t mbs;		/* Mbstate for mbrlen().  */
310static int cur_mb_len;		/* Byte length of the current scanning
311				   multibyte character.  */
312static int cur_mb_index;        /* Byte index of the current scanning multibyte
313                                   character.
314
315				   singlebyte character : cur_mb_index = 0
316				   multibyte character
317				       1st byte : cur_mb_index = 1
318				       2nd byte : cur_mb_index = 2
319				         ...
320				       nth byte : cur_mb_index = n  */
321static unsigned char *mblen_buf;/* Correspond to the input buffer in dfaexec().
322                                  Each element store the amount of remain
323                                  byte of corresponding multibyte character
324                                  in the input string.  A element's value
325                                  is 0 if corresponding character is a
326                                  singlebyte chracter.
327                                  e.g. input : 'a', <mb(0)>, <mb(1)>, <mb(2)>
328                                   mblen_buf :   0,       3,       2,       1
329                               */
330static wchar_t *inputwcs;	/* Wide character representation of input
331				   string in dfaexec().
332				   The length of this array is same as
333				   the length of input string(char array).
334				   inputstring[i] is a single-byte char,
335				   or 1st byte of a multibyte char.
336				   And inputwcs[i] is the codepoint.  */
337static unsigned char const *buf_begin;/* refference to begin in dfaexec().  */
338static unsigned char const *buf_end;	/* refference to end in dfaexec().  */
339#endif /* MBS_SUPPORT  */
340
341#ifdef MBS_SUPPORT
342/* This function update cur_mb_len, and cur_mb_index.
343   p points current lexptr, len is the remaining buffer length.  */
344static void
345update_mb_len_index (char const *p, int len)
346{
347  /* If last character is a part of a multibyte character,
348     we update cur_mb_index.  */
349  if (cur_mb_index)
350    cur_mb_index = (cur_mb_index >= cur_mb_len)? 0
351			: cur_mb_index + 1;
352
353  /* If last character is a single byte character, or the
354     last portion of a multibyte character, we check whether
355     next character is a multibyte character or not.  */
356  if (! cur_mb_index)
357    {
358      cur_mb_len = mbrlen(p, len, &mbs);
359      if (cur_mb_len > 1)
360	/* It is a multibyte character.
361	   cur_mb_len was already set by mbrlen().  */
362	cur_mb_index = 1;
363      else if (cur_mb_len < 1)
364	/* Invalid sequence.  We treat it as a singlebyte character.
365	   cur_mb_index is aleady 0.  */
366	cur_mb_len = 1;
367      /* Otherwise, cur_mb_len == 1, it is a singlebyte character.
368	 cur_mb_index is aleady 0.  */
369    }
370}
371#endif /* MBS_SUPPORT */
372
373#ifdef MBS_SUPPORT
374/* Note that characters become unsigned here. */
375# define FETCH(c, eoferr)			\
376  {						\
377    if (! lexleft)				\
378     {						\
379	if (eoferr != 0)			\
380	  dfaerror (eoferr);			\
381	else					\
382	  return lasttok = END;			\
383      }						\
384    if (MB_CUR_MAX > 1)				\
385      update_mb_len_index(lexptr, lexleft);	\
386    (c) = (unsigned char) *lexptr++;		\
387    --lexleft;					\
388  }
389
390/* This function fetch a wide character, and update cur_mb_len,
391   used only if the current locale is a multibyte environment.  */
392static wchar_t
393fetch_wc (char const *eoferr)
394{
395  wchar_t wc;
396  if (! lexleft)
397    {
398      if (eoferr != 0)
399	dfaerror (eoferr);
400      else
401	return -1;
402    }
403
404  cur_mb_len = mbrtowc(&wc, lexptr, lexleft, &mbs);
405  if (cur_mb_len <= 0)
406   {
407      cur_mb_len = 1;
408      wc = *lexptr;
409    }
410  lexptr += cur_mb_len;
411  lexleft -= cur_mb_len;
412  return wc;
413}
414#else
415/* Note that characters become unsigned here. */
416# define FETCH(c, eoferr)   	      \
417  {			   	      \
418    if (! lexleft)	   	      \
419      {				      \
420	if (eoferr != 0)	      \
421	  dfaerror (eoferr);	      \
422	else		   	      \
423	  return lasttok = END;	      \
424      }				      \
425    (c) = (unsigned char) *lexptr++;  \
426    --lexleft;		   	      \
427  }
428#endif /* MBS_SUPPORT */
429
430#ifdef MBS_SUPPORT
431/* Multibyte character handling sub-routin for lex.
432   This function  parse a bracket expression and build a struct
433   mb_char_classes.  */
434static void
435parse_bracket_exp_mb ()
436{
437  wchar_t wc, wc1, wc2;
438
439  /* Work area to build a mb_char_classes.  */
440  struct mb_char_classes *work_mbc;
441  int chars_al, range_sts_al, range_ends_al, ch_classes_al,
442    equivs_al, coll_elems_al;
443
444  REALLOC_IF_NECESSARY(dfa->mbcsets, struct mb_char_classes,
445		       dfa->mbcsets_alloc, dfa->nmbcsets + 1);
446  /* dfa->multibyte_prop[] hold the index of dfa->mbcsets.
447     We will update dfa->multibyte_prop in addtok(), because we can't
448     decide the index in dfa->tokens[].  */
449
450  /* Initialize work are */
451  work_mbc = &(dfa->mbcsets[dfa->nmbcsets++]);
452
453  chars_al = 1;
454  range_sts_al = range_ends_al = 0;
455  ch_classes_al = equivs_al = coll_elems_al = 0;
456  MALLOC(work_mbc->chars, wchar_t, chars_al);
457
458  work_mbc->nchars = work_mbc->nranges = work_mbc->nch_classes = 0;
459  work_mbc->nequivs = work_mbc->ncoll_elems = 0;
460  work_mbc->chars = NULL;
461  work_mbc->ch_classes = NULL;
462  work_mbc->range_sts = work_mbc->range_ends = NULL;
463  work_mbc->equivs = work_mbc->coll_elems = NULL;
464
465  wc = fetch_wc(_("Unbalanced ["));
466  if (wc == L'^')
467    {
468      wc = fetch_wc(_("Unbalanced ["));
469      work_mbc->invert = 1;
470    }
471  else
472    work_mbc->invert = 0;
473  do
474    {
475      wc1 = -1; /* mark wc1 is not initialized".  */
476
477      /* Note that if we're looking at some other [:...:] construct,
478	 we just treat it as a bunch of ordinary characters.  We can do
479	 this because we assume regex has checked for syntax errors before
480	 dfa is ever called. */
481      if (wc == L'[' && (syntax_bits & RE_CHAR_CLASSES))
482	{
483#define BRACKET_BUFFER_SIZE 128
484	  char str[BRACKET_BUFFER_SIZE];
485	  wc1 = wc;
486	  wc = fetch_wc(_("Unbalanced ["));
487
488	  /* If pattern contains `[[:', `[[.', or `[[='.  */
489	  if (cur_mb_len == 1 && (wc == L':' || wc == L'.' || wc == L'='))
490	    {
491	      unsigned char c;
492	      unsigned char delim = (unsigned char)wc;
493	      int len = 0;
494	      for (;;)
495		{
496		  if (! lexleft)
497		    dfaerror (_("Unbalanced ["));
498		  c = (unsigned char) *lexptr++;
499		  --lexleft;
500
501		  if ((c == delim && *lexptr == ']') || lexleft == 0)
502		    break;
503		  if (len < BRACKET_BUFFER_SIZE)
504		    str[len++] = c;
505		  else
506		    /* This is in any case an invalid class name.  */
507		    str[0] = '\0';
508		}
509	      str[len] = '\0';
510
511	      if (lexleft == 0)
512		{
513		  REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
514				       work_mbc->nchars + 2);
515		  work_mbc->chars[work_mbc->nchars++] = L'[';
516		  work_mbc->chars[work_mbc->nchars++] = delim;
517		  break;
518		}
519
520	      if (--lexleft, *lexptr++ != ']')
521		dfaerror (_("Unbalanced ["));
522	      if (delim == ':')
523		/* build character class.  */
524		{
525		  wctype_t wt;
526		  /* Query the character class as wctype_t.  */
527		  wt = wctype (str);
528
529		  if (ch_classes_al == 0)
530		    MALLOC(work_mbc->ch_classes, wctype_t, ++ch_classes_al);
531		  REALLOC_IF_NECESSARY(work_mbc->ch_classes, wctype_t,
532				       ch_classes_al,
533				       work_mbc->nch_classes + 1);
534		  work_mbc->ch_classes[work_mbc->nch_classes++] = wt;
535
536 		}
537	      else if (delim == '=' || delim == '.')
538		{
539		  char *elem;
540		  MALLOC(elem, char, len + 1);
541		  strncpy(elem, str, len + 1);
542
543		  if (delim == '=')
544		    /* build equivalent class.  */
545		    {
546		      if (equivs_al == 0)
547			MALLOC(work_mbc->equivs, char*, ++equivs_al);
548		      REALLOC_IF_NECESSARY(work_mbc->equivs, char*,
549					   equivs_al,
550					   work_mbc->nequivs + 1);
551		      work_mbc->equivs[work_mbc->nequivs++] = elem;
552		    }
553
554		  if (delim == '.')
555		    /* build collating element.  */
556		    {
557		      if (coll_elems_al == 0)
558			MALLOC(work_mbc->coll_elems, char*, ++coll_elems_al);
559		      REALLOC_IF_NECESSARY(work_mbc->coll_elems, char*,
560					   coll_elems_al,
561					   work_mbc->ncoll_elems + 1);
562		      work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem;
563		    }
564 		}
565	      wc = -1;
566	    }
567	  else
568	    /* We treat '[' as a normal character here.  */
569	    {
570	      wc2 = wc1; wc1 = wc; wc = wc2; /* swap */
571	    }
572	}
573      else
574	{
575	  if (wc == L'\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
576	    wc = fetch_wc(("Unbalanced ["));
577	}
578
579      if (wc1 == -1)
580	wc1 = fetch_wc(_("Unbalanced ["));
581
582      if (wc1 == L'-')
583	/* build range characters.  */
584	{
585	  wc2 = fetch_wc(_("Unbalanced ["));
586	  if (wc2 == L']')
587	    {
588	      /* In the case [x-], the - is an ordinary hyphen,
589		 which is left in c1, the lookahead character. */
590	      lexptr -= cur_mb_len;
591	      lexleft += cur_mb_len;
592	      wc2 = wc;
593	    }
594	  else
595	    {
596	      if (wc2 == L'\\'
597		  && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
598		wc2 = fetch_wc(_("Unbalanced ["));
599	      wc1 = fetch_wc(_("Unbalanced ["));
600	    }
601
602	  if (range_sts_al == 0)
603	    {
604	      MALLOC(work_mbc->range_sts, wchar_t, ++range_sts_al);
605	      MALLOC(work_mbc->range_ends, wchar_t, ++range_ends_al);
606	    }
607	  REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
608			       range_sts_al, work_mbc->nranges + 1);
609	  work_mbc->range_sts[work_mbc->nranges] = wc;
610	  REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
611			       range_ends_al, work_mbc->nranges + 1);
612	  work_mbc->range_ends[work_mbc->nranges++] = wc2;
613	}
614      else if (wc != -1)
615	/* build normal characters.  */
616	{
617	  REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
618			       work_mbc->nchars + 1);
619	  work_mbc->chars[work_mbc->nchars++] = wc;
620	}
621    }
622  while ((wc = wc1) != L']');
623}
624#endif /* MBS_SUPPORT */
625
626#ifdef __STDC__
627#define FUNC(F, P) static int F(int c) { return P(c); }
628#else
629#define FUNC(F, P) static int F(c) int c; { return P(c); }
630#endif
631
632FUNC(is_alpha, ISALPHA)
633FUNC(is_upper, ISUPPER)
634FUNC(is_lower, ISLOWER)
635FUNC(is_digit, ISDIGIT)
636FUNC(is_xdigit, ISXDIGIT)
637FUNC(is_space, ISSPACE)
638FUNC(is_punct, ISPUNCT)
639FUNC(is_alnum, ISALNUM)
640FUNC(is_print, ISPRINT)
641FUNC(is_graph, ISGRAPH)
642FUNC(is_cntrl, ISCNTRL)
643
644static int
645is_blank (int c)
646{
647   return (c == ' ' || c == '\t');
648}
649
650/* The following list maps the names of the Posix named character classes
651   to predicate functions that determine whether a given character is in
652   the class.  The leading [ has already been eaten by the lexical analyzer. */
653static struct {
654  const char *name;
655  int (*pred) (int);
656} const prednames[] = {
657  { ":alpha:]", is_alpha },
658  { ":upper:]", is_upper },
659  { ":lower:]", is_lower },
660  { ":digit:]", is_digit },
661  { ":xdigit:]", is_xdigit },
662  { ":space:]", is_space },
663  { ":punct:]", is_punct },
664  { ":alnum:]", is_alnum },
665  { ":print:]", is_print },
666  { ":graph:]", is_graph },
667  { ":cntrl:]", is_cntrl },
668  { ":blank:]", is_blank },
669  { 0 }
670};
671
672/* Return non-zero if C is a `word-constituent' byte; zero otherwise.  */
673#define IS_WORD_CONSTITUENT(C) (ISALNUM(C) || (C) == '_')
674
675static int
676looking_at (char const *s)
677{
678  size_t len;
679
680  len = strlen(s);
681  if (lexleft < len)
682    return 0;
683  return strncmp(s, lexptr, len) == 0;
684}
685
686static token
687lex (void)
688{
689  unsigned c, c1, c2;
690  int backslash = 0, invert;
691  charclass ccl;
692  int i;
693
694  /* Basic plan: We fetch a character.  If it's a backslash,
695     we set the backslash flag and go through the loop again.
696     On the plus side, this avoids having a duplicate of the
697     main switch inside the backslash case.  On the minus side,
698     it means that just about every case begins with
699     "if (backslash) ...".  */
700  for (i = 0; i < 2; ++i)
701    {
702      FETCH(c, 0);
703#ifdef MBS_SUPPORT
704      if (MB_CUR_MAX > 1 && cur_mb_index)
705	/* If this is a part of a multi-byte character, we must treat
706	   this byte data as a normal character.
707	   e.g. In case of SJIS encoding, some character contains '\',
708	        but they must not be backslash.  */
709	goto normal_char;
710#endif /* MBS_SUPPORT  */
711      switch (c)
712	{
713	case '\\':
714	  if (backslash)
715	    goto normal_char;
716	  if (lexleft == 0)
717	    dfaerror(_("Unfinished \\ escape"));
718	  backslash = 1;
719	  break;
720
721	case '^':
722	  if (backslash)
723	    goto normal_char;
724	  if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
725	      || lasttok == END
726	      || lasttok == LPAREN
727	      || lasttok == OR)
728	    return lasttok = BEGLINE;
729	  goto normal_char;
730
731	case '$':
732	  if (backslash)
733	    goto normal_char;
734	  if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
735	      || lexleft == 0
736	      || (syntax_bits & RE_NO_BK_PARENS
737		  ? lexleft > 0 && *lexptr == ')'
738		  : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == ')')
739	      || (syntax_bits & RE_NO_BK_VBAR
740		  ? lexleft > 0 && *lexptr == '|'
741		  : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == '|')
742	      || ((syntax_bits & RE_NEWLINE_ALT)
743	          && lexleft > 0 && *lexptr == '\n'))
744	    return lasttok = ENDLINE;
745	  goto normal_char;
746
747	case '1':
748	case '2':
749	case '3':
750	case '4':
751	case '5':
752	case '6':
753	case '7':
754	case '8':
755	case '9':
756	  if (backslash && !(syntax_bits & RE_NO_BK_REFS))
757	    {
758	      laststart = 0;
759	      return lasttok = BACKREF;
760	    }
761	  goto normal_char;
762
763	case '`':
764	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
765	    return lasttok = BEGLINE;	/* FIXME: should be beginning of string */
766	  goto normal_char;
767
768	case '\'':
769	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
770	    return lasttok = ENDLINE;	/* FIXME: should be end of string */
771	  goto normal_char;
772
773	case '<':
774	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
775	    return lasttok = BEGWORD;
776	  goto normal_char;
777
778	case '>':
779	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
780	    return lasttok = ENDWORD;
781	  goto normal_char;
782
783	case 'b':
784	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
785	    return lasttok = LIMWORD;
786	  goto normal_char;
787
788	case 'B':
789	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
790	    return lasttok = NOTLIMWORD;
791	  goto normal_char;
792
793	case '?':
794	  if (syntax_bits & RE_LIMITED_OPS)
795	    goto normal_char;
796	  if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
797	    goto normal_char;
798	  if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
799	    goto normal_char;
800	  return lasttok = QMARK;
801
802	case '*':
803	  if (backslash)
804	    goto normal_char;
805	  if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
806	    goto normal_char;
807	  return lasttok = STAR;
808
809	case '+':
810	  if (syntax_bits & RE_LIMITED_OPS)
811	    goto normal_char;
812	  if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
813	    goto normal_char;
814	  if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
815	    goto normal_char;
816	  return lasttok = PLUS;
817
818	case '{':
819	  if (!(syntax_bits & RE_INTERVALS))
820	    goto normal_char;
821	  if (backslash != ((syntax_bits & RE_NO_BK_BRACES) == 0))
822	    goto normal_char;
823	  if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
824	    goto normal_char;
825
826	  if (syntax_bits & RE_NO_BK_BRACES)
827	    {
828	      /* Scan ahead for a valid interval; if it's not valid,
829		 treat it as a literal '{'.  */
830	      int lo = -1, hi = -1;
831	      char const *p = lexptr;
832	      char const *lim = p + lexleft;
833	      for (;  p != lim && ISASCIIDIGIT (*p);  p++)
834		lo = (lo < 0 ? 0 : lo * 10) + *p - '0';
835	      if (p != lim && *p == ',')
836		while (++p != lim && ISASCIIDIGIT (*p))
837		  hi = (hi < 0 ? 0 : hi * 10) + *p - '0';
838	      else
839		hi = lo;
840	      if (p == lim || *p != '}'
841		  || lo < 0 || RE_DUP_MAX < hi || (0 <= hi && hi < lo))
842		goto normal_char;
843	    }
844
845	  minrep = 0;
846	  /* Cases:
847	     {M} - exact count
848	     {M,} - minimum count, maximum is infinity
849	     {M,N} - M through N */
850	  FETCH(c, _("unfinished repeat count"));
851	  if (ISASCIIDIGIT (c))
852	    {
853	      minrep = c - '0';
854	      for (;;)
855		{
856		  FETCH(c, _("unfinished repeat count"));
857		  if (! ISASCIIDIGIT (c))
858		    break;
859		  minrep = 10 * minrep + c - '0';
860		}
861	    }
862	  else
863	    dfaerror(_("malformed repeat count"));
864	  if (c == ',')
865	    {
866	      FETCH (c, _("unfinished repeat count"));
867	      if (! ISASCIIDIGIT (c))
868		maxrep = -1;
869	      else
870		{
871		  maxrep = c - '0';
872		  for (;;)
873		    {
874		      FETCH (c, _("unfinished repeat count"));
875		      if (! ISASCIIDIGIT (c))
876			break;
877		      maxrep = 10 * maxrep + c - '0';
878		    }
879		  if (0 <= maxrep && maxrep < minrep)
880		    dfaerror (_("malformed repeat count"));
881		}
882	    }
883	  else
884	    maxrep = minrep;
885	  if (!(syntax_bits & RE_NO_BK_BRACES))
886	    {
887	      if (c != '\\')
888		dfaerror(_("malformed repeat count"));
889	      FETCH(c, _("unfinished repeat count"));
890	    }
891	  if (c != '}')
892	    dfaerror(_("malformed repeat count"));
893	  laststart = 0;
894	  return lasttok = REPMN;
895
896	case '|':
897	  if (syntax_bits & RE_LIMITED_OPS)
898	    goto normal_char;
899	  if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0))
900	    goto normal_char;
901	  laststart = 1;
902	  return lasttok = OR;
903
904	case '\n':
905	  if (syntax_bits & RE_LIMITED_OPS
906	      || backslash
907	      || !(syntax_bits & RE_NEWLINE_ALT))
908	    goto normal_char;
909	  laststart = 1;
910	  return lasttok = OR;
911
912	case '(':
913	  if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
914	    goto normal_char;
915	  ++parens;
916	  laststart = 1;
917	  return lasttok = LPAREN;
918
919	case ')':
920	  if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
921	    goto normal_char;
922	  if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
923	    goto normal_char;
924	  --parens;
925	  laststart = 0;
926	  return lasttok = RPAREN;
927
928	case '.':
929	  if (backslash)
930	    goto normal_char;
931#ifdef MBS_SUPPORT
932	  if (MB_CUR_MAX > 1)
933	    {
934	      /* In multibyte environment period must match with a single
935		 character not a byte.  So we use ANYCHAR.  */
936	      laststart = 0;
937	      return lasttok = ANYCHAR;
938	    }
939#endif /* MBS_SUPPORT */
940	  zeroset(ccl);
941	  notset(ccl);
942	  if (!(syntax_bits & RE_DOT_NEWLINE))
943	    clrbit(eolbyte, ccl);
944	  if (syntax_bits & RE_DOT_NOT_NULL)
945	    clrbit('\0', ccl);
946	  laststart = 0;
947	  return lasttok = CSET + charclass_index(ccl);
948
949	case 'w':
950	case 'W':
951	  if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
952	    goto normal_char;
953	  zeroset(ccl);
954	  for (c2 = 0; c2 < NOTCHAR; ++c2)
955	    if (IS_WORD_CONSTITUENT(c2))
956	      setbit(c2, ccl);
957	  if (c == 'W')
958	    notset(ccl);
959	  laststart = 0;
960	  return lasttok = CSET + charclass_index(ccl);
961
962	case '[':
963	  if (backslash)
964	    goto normal_char;
965	  laststart = 0;
966#ifdef MBS_SUPPORT
967	  if (MB_CUR_MAX > 1)
968	    {
969	      /* In multibyte environment a bracket expression may contain
970		 multibyte characters, which must be treated as characters
971		 (not bytes).  So we parse it by parse_bracket_exp_mb().  */
972	      parse_bracket_exp_mb();
973	      return lasttok = MBCSET;
974	    }
975#endif
976	  zeroset(ccl);
977	  FETCH(c, _("Unbalanced ["));
978	  if (c == '^')
979	    {
980	      FETCH(c, _("Unbalanced ["));
981	      invert = 1;
982	    }
983	  else
984	    invert = 0;
985	  do
986	    {
987	      /* Nobody ever said this had to be fast. :-)
988		 Note that if we're looking at some other [:...:]
989		 construct, we just treat it as a bunch of ordinary
990		 characters.  We can do this because we assume
991		 regex has checked for syntax errors before
992		 dfa is ever called. */
993	      if (c == '[' && (syntax_bits & RE_CHAR_CLASSES))
994		for (c1 = 0; prednames[c1].name; ++c1)
995		  if (looking_at(prednames[c1].name))
996		    {
997		      int (*pred) (int) = prednames[c1].pred;
998
999		      for (c2 = 0; c2 < NOTCHAR; ++c2)
1000			if ((*pred)(c2))
1001			  setbit_case_fold (c2, ccl);
1002		      lexptr += strlen(prednames[c1].name);
1003		      lexleft -= strlen(prednames[c1].name);
1004		      FETCH(c1, _("Unbalanced ["));
1005		      goto skip;
1006		    }
1007	      if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1008		FETCH(c, _("Unbalanced ["));
1009	      FETCH(c1, _("Unbalanced ["));
1010	      if (c1 == '-')
1011		{
1012		  FETCH(c2, _("Unbalanced ["));
1013		  if (c2 == ']')
1014		    {
1015		      /* In the case [x-], the - is an ordinary hyphen,
1016			 which is left in c1, the lookahead character. */
1017		      --lexptr;
1018		      ++lexleft;
1019		    }
1020		  else
1021		    {
1022		      if (c2 == '\\'
1023			  && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1024			FETCH(c2, _("Unbalanced ["));
1025		      FETCH(c1, _("Unbalanced ["));
1026		      if (!hard_LC_COLLATE) {
1027		        for (; c <= c2; c++)
1028			  setbit_case_fold (c, ccl);
1029		      } else {
1030			/* POSIX locales are painful - leave the decision to libc */
1031			regex_t re;
1032			char expr[6]; /* = { '[', c, '-', c2, ']', '\0' }; */
1033
1034			expr[0] = '['; expr[1] = c; expr[2] = '-';
1035			expr[3] = c2; expr[4] = ']'; expr[5] = '\0';
1036			if (regcomp (&re, expr, case_fold ? REG_ICASE : 0) == REG_NOERROR) {
1037			  for (c = 0; c < NOTCHAR; ++c) {
1038			    regmatch_t mat;
1039			    char buf[2]; /* = { c, '\0' }; */
1040
1041			    buf[0] = c; buf[1] = '\0';
1042			    if (regexec (&re, buf, 1, &mat, 0) == REG_NOERROR
1043                               && mat.rm_so == 0 && mat.rm_eo == 1)
1044                              setbit_case_fold (c, ccl);
1045			  }
1046			  regfree (&re);
1047			}
1048		      }
1049		      continue;
1050		    }
1051		}
1052
1053	      setbit_case_fold (c, ccl);
1054
1055	    skip:
1056	      ;
1057	    }
1058	  while ((c = c1) != ']');
1059	  if (invert)
1060	    {
1061	      notset(ccl);
1062	      if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
1063		clrbit(eolbyte, ccl);
1064	    }
1065	  return lasttok = CSET + charclass_index(ccl);
1066
1067	default:
1068	normal_char:
1069	  laststart = 0;
1070	  if (case_fold && ISALPHA(c))
1071	    {
1072	      zeroset(ccl);
1073	      setbit_case_fold (c, ccl);
1074	      return lasttok = CSET + charclass_index(ccl);
1075	    }
1076	  return c;
1077	}
1078    }
1079
1080  /* The above loop should consume at most a backslash
1081     and some other character. */
1082  abort();
1083  return END;	/* keeps pedantic compilers happy. */
1084}
1085
1086/* Recursive descent parser for regular expressions. */
1087
1088static token tok;		/* Lookahead token. */
1089static int depth;		/* Current depth of a hypothetical stack
1090				   holding deferred productions.  This is
1091				   used to determine the depth that will be
1092				   required of the real stack later on in
1093				   dfaanalyze(). */
1094
1095/* Add the given token to the parse tree, maintaining the depth count and
1096   updating the maximum depth if necessary. */
1097static void
1098addtok (token t)
1099{
1100#ifdef MBS_SUPPORT
1101  if (MB_CUR_MAX > 1)
1102    {
1103      REALLOC_IF_NECESSARY(dfa->multibyte_prop, int, dfa->nmultibyte_prop,
1104			   dfa->tindex);
1105      /* Set dfa->multibyte_prop.  See struct dfa in dfa.h.  */
1106      if (t == MBCSET)
1107	dfa->multibyte_prop[dfa->tindex] = ((dfa->nmbcsets - 1) << 2) + 3;
1108      else if (t < NOTCHAR)
1109	dfa->multibyte_prop[dfa->tindex]
1110	  = (cur_mb_len == 1)? 3 /* single-byte char */
1111	  : (((cur_mb_index == 1)? 1 : 0) /* 1st-byte of multibyte char */
1112	     + ((cur_mb_index == cur_mb_len)? 2 : 0)); /* last-byte */
1113      else
1114	/* It may be unnecesssary, but it is safer to treat other
1115	   symbols as singlebyte characters.  */
1116	dfa->multibyte_prop[dfa->tindex] = 3;
1117    }
1118#endif
1119
1120  REALLOC_IF_NECESSARY(dfa->tokens, token, dfa->talloc, dfa->tindex);
1121  dfa->tokens[dfa->tindex++] = t;
1122
1123  switch (t)
1124    {
1125    case QMARK:
1126    case STAR:
1127    case PLUS:
1128      break;
1129
1130    case CAT:
1131    case OR:
1132    case ORTOP:
1133      --depth;
1134      break;
1135
1136    default:
1137      ++dfa->nleaves;
1138    case EMPTY:
1139      ++depth;
1140      break;
1141    }
1142  if (depth > dfa->depth)
1143    dfa->depth = depth;
1144}
1145
1146/* The grammar understood by the parser is as follows.
1147
1148   regexp:
1149     regexp OR branch
1150     branch
1151
1152   branch:
1153     branch closure
1154     closure
1155
1156   closure:
1157     closure QMARK
1158     closure STAR
1159     closure PLUS
1160     closure REPMN
1161     atom
1162
1163   atom:
1164     <normal character>
1165     <multibyte character>
1166     ANYCHAR
1167     MBCSET
1168     CSET
1169     BACKREF
1170     BEGLINE
1171     ENDLINE
1172     BEGWORD
1173     ENDWORD
1174     LIMWORD
1175     NOTLIMWORD
1176     CRANGE
1177     LPAREN regexp RPAREN
1178     <empty>
1179
1180   The parser builds a parse tree in postfix form in an array of tokens. */
1181
1182static void
1183atom (void)
1184{
1185  if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
1186      || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
1187#ifdef MBS_SUPPORT
1188      || tok == ANYCHAR || tok == MBCSET /* MB_CUR_MAX > 1 */
1189#endif /* MBS_SUPPORT */
1190      || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD)
1191    {
1192      addtok(tok);
1193      tok = lex();
1194#ifdef MBS_SUPPORT
1195      /* We treat a multibyte character as a single atom, so that DFA
1196	 can treat a multibyte character as a single expression.
1197
1198         e.g. We construct following tree from "<mb1><mb2>".
1199              <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1200              <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT>
1201      */
1202      if (MB_CUR_MAX > 1)
1203	{
1204	  while (cur_mb_index > 1 && tok >= 0 && tok < NOTCHAR)
1205	    {
1206	      addtok(tok);
1207	      addtok(CAT);
1208	      tok = lex();
1209	    }
1210	}
1211#endif /* MBS_SUPPORT  */
1212    }
1213  else if (tok == CRANGE)
1214    {
1215      /* A character range like "[a-z]" in a locale other than "C" or
1216	 "POSIX".  This range might any sequence of one or more
1217	 characters.  Unfortunately the POSIX locale primitives give
1218	 us no practical way to find what character sequences might be
1219	 matched.  Treat this approximately like "(.\1)" -- i.e. match
1220	 one character, and then punt to the full matcher.  */
1221      charclass ccl;
1222      zeroset (ccl);
1223      notset (ccl);
1224      addtok (CSET + charclass_index (ccl));
1225      addtok (BACKREF);
1226      addtok (CAT);
1227      tok = lex ();
1228    }
1229  else if (tok == LPAREN)
1230    {
1231      tok = lex();
1232      regexp(0);
1233      if (tok != RPAREN)
1234	dfaerror(_("Unbalanced ("));
1235      tok = lex();
1236    }
1237  else
1238    addtok(EMPTY);
1239}
1240
1241/* Return the number of tokens in the given subexpression. */
1242static int
1243nsubtoks (int tindex)
1244{
1245  int ntoks1;
1246
1247  switch (dfa->tokens[tindex - 1])
1248    {
1249    default:
1250      return 1;
1251    case QMARK:
1252    case STAR:
1253    case PLUS:
1254      return 1 + nsubtoks(tindex - 1);
1255    case CAT:
1256    case OR:
1257    case ORTOP:
1258      ntoks1 = nsubtoks(tindex - 1);
1259      return 1 + ntoks1 + nsubtoks(tindex - 1 - ntoks1);
1260    }
1261}
1262
1263/* Copy the given subexpression to the top of the tree. */
1264static void
1265copytoks (int tindex, int ntokens)
1266{
1267  int i;
1268
1269  for (i = 0; i < ntokens; ++i)
1270    addtok(dfa->tokens[tindex + i]);
1271}
1272
1273static void
1274closure (void)
1275{
1276  int tindex, ntokens, i;
1277
1278  atom();
1279  while (tok == QMARK || tok == STAR || tok == PLUS || tok == REPMN)
1280    if (tok == REPMN)
1281      {
1282	ntokens = nsubtoks(dfa->tindex);
1283	tindex = dfa->tindex - ntokens;
1284	if (maxrep < 0)
1285	  addtok(PLUS);
1286	if (minrep == 0)
1287	  addtok(QMARK);
1288	for (i = 1; i < minrep; ++i)
1289	  {
1290	    copytoks(tindex, ntokens);
1291	    addtok(CAT);
1292	  }
1293	for (; i < maxrep; ++i)
1294	  {
1295	    copytoks(tindex, ntokens);
1296	    addtok(QMARK);
1297	    addtok(CAT);
1298	  }
1299	tok = lex();
1300      }
1301    else
1302      {
1303	addtok(tok);
1304	tok = lex();
1305      }
1306}
1307
1308static void
1309branch (void)
1310{
1311  closure();
1312  while (tok != RPAREN && tok != OR && tok >= 0)
1313    {
1314      closure();
1315      addtok(CAT);
1316    }
1317}
1318
1319static void
1320regexp (int toplevel)
1321{
1322  branch();
1323  while (tok == OR)
1324    {
1325      tok = lex();
1326      branch();
1327      if (toplevel)
1328	addtok(ORTOP);
1329      else
1330	addtok(OR);
1331    }
1332}
1333
1334/* Main entry point for the parser.  S is a string to be parsed, len is the
1335   length of the string, so s can include NUL characters.  D is a pointer to
1336   the struct dfa to parse into. */
1337void
1338dfaparse (char const *s, size_t len, struct dfa *d)
1339{
1340  dfa = d;
1341  lexstart = lexptr = s;
1342  lexleft = len;
1343  lasttok = END;
1344  laststart = 1;
1345  parens = 0;
1346#if ENABLE_NLS
1347  hard_LC_COLLATE = hard_locale (LC_COLLATE);
1348#endif
1349#ifdef MBS_SUPPORT
1350  if (MB_CUR_MAX > 1)
1351    {
1352      cur_mb_index = 0;
1353      cur_mb_len = 0;
1354      memset(&mbs, 0, sizeof(mbstate_t));
1355    }
1356#endif /* MBS_SUPPORT  */
1357
1358  if (! syntax_bits_set)
1359    dfaerror(_("No syntax specified"));
1360
1361  tok = lex();
1362  depth = d->depth;
1363
1364  regexp(1);
1365
1366  if (tok != END)
1367    dfaerror(_("Unbalanced )"));
1368
1369  addtok(END - d->nregexps);
1370  addtok(CAT);
1371
1372  if (d->nregexps)
1373    addtok(ORTOP);
1374
1375  ++d->nregexps;
1376}
1377
1378/* Some primitives for operating on sets of positions. */
1379
1380/* Copy one set to another; the destination must be large enough. */
1381static void
1382copy (position_set const *src, position_set *dst)
1383{
1384  int i;
1385
1386  for (i = 0; i < src->nelem; ++i)
1387    dst->elems[i] = src->elems[i];
1388  dst->nelem = src->nelem;
1389}
1390
1391/* Insert a position in a set.  Position sets are maintained in sorted
1392   order according to index.  If position already exists in the set with
1393   the same index then their constraints are logically or'd together.
1394   S->elems must point to an array large enough to hold the resulting set. */
1395static void
1396insert (position p, position_set *s)
1397{
1398  int i;
1399  position t1, t2;
1400
1401  for (i = 0; i < s->nelem && p.index < s->elems[i].index; ++i)
1402    continue;
1403  if (i < s->nelem && p.index == s->elems[i].index)
1404    s->elems[i].constraint |= p.constraint;
1405  else
1406    {
1407      t1 = p;
1408      ++s->nelem;
1409      while (i < s->nelem)
1410	{
1411	  t2 = s->elems[i];
1412	  s->elems[i++] = t1;
1413	  t1 = t2;
1414	}
1415    }
1416}
1417
1418/* Merge two sets of positions into a third.  The result is exactly as if
1419   the positions of both sets were inserted into an initially empty set. */
1420static void
1421merge (position_set const *s1, position_set const *s2, position_set *m)
1422{
1423  int i = 0, j = 0;
1424
1425  m->nelem = 0;
1426  while (i < s1->nelem && j < s2->nelem)
1427    if (s1->elems[i].index > s2->elems[j].index)
1428      m->elems[m->nelem++] = s1->elems[i++];
1429    else if (s1->elems[i].index < s2->elems[j].index)
1430      m->elems[m->nelem++] = s2->elems[j++];
1431    else
1432      {
1433	m->elems[m->nelem] = s1->elems[i++];
1434	m->elems[m->nelem++].constraint |= s2->elems[j++].constraint;
1435      }
1436  while (i < s1->nelem)
1437    m->elems[m->nelem++] = s1->elems[i++];
1438  while (j < s2->nelem)
1439    m->elems[m->nelem++] = s2->elems[j++];
1440}
1441
1442/* Delete a position from a set. */
1443static void
1444delete_pos (position p, position_set *s)
1445{
1446  int i;
1447
1448  for (i = 0; i < s->nelem; ++i)
1449    if (p.index == s->elems[i].index)
1450      break;
1451  if (i < s->nelem)
1452    for (--s->nelem; i < s->nelem; ++i)
1453      s->elems[i] = s->elems[i + 1];
1454}
1455
1456/* Find the index of the state corresponding to the given position set with
1457   the given preceding context, or create a new state if there is no such
1458   state.  Newline and letter tell whether we got here on a newline or
1459   letter, respectively. */
1460static int
1461state_index (struct dfa *d, position_set const *s, int newline, int letter)
1462{
1463  int hash = 0;
1464  int constraint;
1465  int i, j;
1466
1467  newline = newline ? 1 : 0;
1468  letter = letter ? 1 : 0;
1469
1470  for (i = 0; i < s->nelem; ++i)
1471    hash ^= s->elems[i].index + s->elems[i].constraint;
1472
1473  /* Try to find a state that exactly matches the proposed one. */
1474  for (i = 0; i < d->sindex; ++i)
1475    {
1476      if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
1477	  || newline != d->states[i].newline || letter != d->states[i].letter)
1478	continue;
1479      for (j = 0; j < s->nelem; ++j)
1480	if (s->elems[j].constraint
1481	    != d->states[i].elems.elems[j].constraint
1482	    || s->elems[j].index != d->states[i].elems.elems[j].index)
1483	  break;
1484      if (j == s->nelem)
1485	return i;
1486    }
1487
1488  /* We'll have to create a new state. */
1489  REALLOC_IF_NECESSARY(d->states, dfa_state, d->salloc, d->sindex);
1490  d->states[i].hash = hash;
1491  MALLOC(d->states[i].elems.elems, position, s->nelem);
1492  copy(s, &d->states[i].elems);
1493  d->states[i].newline = newline;
1494  d->states[i].letter = letter;
1495  d->states[i].backref = 0;
1496  d->states[i].constraint = 0;
1497  d->states[i].first_end = 0;
1498#ifdef MBS_SUPPORT
1499  if (MB_CUR_MAX > 1)
1500    d->states[i].mbps.nelem = 0;
1501#endif
1502  for (j = 0; j < s->nelem; ++j)
1503    if (d->tokens[s->elems[j].index] < 0)
1504      {
1505	constraint = s->elems[j].constraint;
1506	if (SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 0)
1507	    || SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 1)
1508	    || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 0)
1509	    || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 1))
1510	  d->states[i].constraint |= constraint;
1511	if (! d->states[i].first_end)
1512	  d->states[i].first_end = d->tokens[s->elems[j].index];
1513      }
1514    else if (d->tokens[s->elems[j].index] == BACKREF)
1515      {
1516	d->states[i].constraint = NO_CONSTRAINT;
1517	d->states[i].backref = 1;
1518      }
1519
1520  ++d->sindex;
1521
1522  return i;
1523}
1524
1525/* Find the epsilon closure of a set of positions.  If any position of the set
1526   contains a symbol that matches the empty string in some context, replace
1527   that position with the elements of its follow labeled with an appropriate
1528   constraint.  Repeat exhaustively until no funny positions are left.
1529   S->elems must be large enough to hold the result. */
1530static void
1531epsclosure (position_set *s, struct dfa const *d)
1532{
1533  int i, j;
1534  int *visited;
1535  position p, old;
1536
1537  MALLOC(visited, int, d->tindex);
1538  for (i = 0; i < d->tindex; ++i)
1539    visited[i] = 0;
1540
1541  for (i = 0; i < s->nelem; ++i)
1542    if (d->tokens[s->elems[i].index] >= NOTCHAR
1543	&& d->tokens[s->elems[i].index] != BACKREF
1544#ifdef MBS_SUPPORT
1545	&& d->tokens[s->elems[i].index] != ANYCHAR
1546	&& d->tokens[s->elems[i].index] != MBCSET
1547#endif
1548	&& d->tokens[s->elems[i].index] < CSET)
1549      {
1550	old = s->elems[i];
1551	p.constraint = old.constraint;
1552	delete_pos(s->elems[i], s);
1553	if (visited[old.index])
1554	  {
1555	    --i;
1556	    continue;
1557	  }
1558	visited[old.index] = 1;
1559	switch (d->tokens[old.index])
1560	  {
1561	  case BEGLINE:
1562	    p.constraint &= BEGLINE_CONSTRAINT;
1563	    break;
1564	  case ENDLINE:
1565	    p.constraint &= ENDLINE_CONSTRAINT;
1566	    break;
1567	  case BEGWORD:
1568	    p.constraint &= BEGWORD_CONSTRAINT;
1569	    break;
1570	  case ENDWORD:
1571	    p.constraint &= ENDWORD_CONSTRAINT;
1572	    break;
1573	  case LIMWORD:
1574	    p.constraint &= LIMWORD_CONSTRAINT;
1575	    break;
1576	  case NOTLIMWORD:
1577	    p.constraint &= NOTLIMWORD_CONSTRAINT;
1578	    break;
1579	  default:
1580	    break;
1581	  }
1582	for (j = 0; j < d->follows[old.index].nelem; ++j)
1583	  {
1584	    p.index = d->follows[old.index].elems[j].index;
1585	    insert(p, s);
1586	  }
1587	/* Force rescan to start at the beginning. */
1588	i = -1;
1589      }
1590
1591  free(visited);
1592}
1593
1594/* Perform bottom-up analysis on the parse tree, computing various functions.
1595   Note that at this point, we're pretending constructs like \< are real
1596   characters rather than constraints on what can follow them.
1597
1598   Nullable:  A node is nullable if it is at the root of a regexp that can
1599   match the empty string.
1600   *  EMPTY leaves are nullable.
1601   * No other leaf is nullable.
1602   * A QMARK or STAR node is nullable.
1603   * A PLUS node is nullable if its argument is nullable.
1604   * A CAT node is nullable if both its arguments are nullable.
1605   * An OR node is nullable if either argument is nullable.
1606
1607   Firstpos:  The firstpos of a node is the set of positions (nonempty leaves)
1608   that could correspond to the first character of a string matching the
1609   regexp rooted at the given node.
1610   * EMPTY leaves have empty firstpos.
1611   * The firstpos of a nonempty leaf is that leaf itself.
1612   * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
1613     argument.
1614   * The firstpos of a CAT node is the firstpos of the left argument, union
1615     the firstpos of the right if the left argument is nullable.
1616   * The firstpos of an OR node is the union of firstpos of each argument.
1617
1618   Lastpos:  The lastpos of a node is the set of positions that could
1619   correspond to the last character of a string matching the regexp at
1620   the given node.
1621   * EMPTY leaves have empty lastpos.
1622   * The lastpos of a nonempty leaf is that leaf itself.
1623   * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
1624     argument.
1625   * The lastpos of a CAT node is the lastpos of its right argument, union
1626     the lastpos of the left if the right argument is nullable.
1627   * The lastpos of an OR node is the union of the lastpos of each argument.
1628
1629   Follow:  The follow of a position is the set of positions that could
1630   correspond to the character following a character matching the node in
1631   a string matching the regexp.  At this point we consider special symbols
1632   that match the empty string in some context to be just normal characters.
1633   Later, if we find that a special symbol is in a follow set, we will
1634   replace it with the elements of its follow, labeled with an appropriate
1635   constraint.
1636   * Every node in the firstpos of the argument of a STAR or PLUS node is in
1637     the follow of every node in the lastpos.
1638   * Every node in the firstpos of the second argument of a CAT node is in
1639     the follow of every node in the lastpos of the first argument.
1640
1641   Because of the postfix representation of the parse tree, the depth-first
1642   analysis is conveniently done by a linear scan with the aid of a stack.
1643   Sets are stored as arrays of the elements, obeying a stack-like allocation
1644   scheme; the number of elements in each set deeper in the stack can be
1645   used to determine the address of a particular set's array. */
1646void
1647dfaanalyze (struct dfa *d, int searchflag)
1648{
1649  int *nullable;		/* Nullable stack. */
1650  int *nfirstpos;		/* Element count stack for firstpos sets. */
1651  position *firstpos;		/* Array where firstpos elements are stored. */
1652  int *nlastpos;		/* Element count stack for lastpos sets. */
1653  position *lastpos;		/* Array where lastpos elements are stored. */
1654  int *nalloc;			/* Sizes of arrays allocated to follow sets. */
1655  position_set tmp;		/* Temporary set for merging sets. */
1656  position_set merged;		/* Result of merging sets. */
1657  int wants_newline;		/* True if some position wants newline info. */
1658  int *o_nullable;
1659  int *o_nfirst, *o_nlast;
1660  position *o_firstpos, *o_lastpos;
1661  int i, j;
1662  position *pos;
1663
1664#ifdef DEBUG
1665  fprintf(stderr, "dfaanalyze:\n");
1666  for (i = 0; i < d->tindex; ++i)
1667    {
1668      fprintf(stderr, " %d:", i);
1669      prtok(d->tokens[i]);
1670    }
1671  putc('\n', stderr);
1672#endif
1673
1674  d->searchflag = searchflag;
1675
1676  MALLOC(nullable, int, d->depth);
1677  o_nullable = nullable;
1678  MALLOC(nfirstpos, int, d->depth);
1679  o_nfirst = nfirstpos;
1680  MALLOC(firstpos, position, d->nleaves);
1681  o_firstpos = firstpos, firstpos += d->nleaves;
1682  MALLOC(nlastpos, int, d->depth);
1683  o_nlast = nlastpos;
1684  MALLOC(lastpos, position, d->nleaves);
1685  o_lastpos = lastpos, lastpos += d->nleaves;
1686  MALLOC(nalloc, int, d->tindex);
1687  for (i = 0; i < d->tindex; ++i)
1688    nalloc[i] = 0;
1689  MALLOC(merged.elems, position, d->nleaves);
1690
1691  CALLOC(d->follows, position_set, d->tindex);
1692
1693  for (i = 0; i < d->tindex; ++i)
1694#ifdef DEBUG
1695    {				/* Nonsyntactic #ifdef goo... */
1696#endif
1697    switch (d->tokens[i])
1698      {
1699      case EMPTY:
1700	/* The empty set is nullable. */
1701	*nullable++ = 1;
1702
1703	/* The firstpos and lastpos of the empty leaf are both empty. */
1704	*nfirstpos++ = *nlastpos++ = 0;
1705	break;
1706
1707      case STAR:
1708      case PLUS:
1709	/* Every element in the firstpos of the argument is in the follow
1710	   of every element in the lastpos. */
1711	tmp.nelem = nfirstpos[-1];
1712	tmp.elems = firstpos;
1713	pos = lastpos;
1714	for (j = 0; j < nlastpos[-1]; ++j)
1715	  {
1716	    merge(&tmp, &d->follows[pos[j].index], &merged);
1717	    REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
1718				 nalloc[pos[j].index], merged.nelem - 1);
1719	    copy(&merged, &d->follows[pos[j].index]);
1720	  }
1721
1722      case QMARK:
1723	/* A QMARK or STAR node is automatically nullable. */
1724	if (d->tokens[i] != PLUS)
1725	  nullable[-1] = 1;
1726	break;
1727
1728      case CAT:
1729	/* Every element in the firstpos of the second argument is in the
1730	   follow of every element in the lastpos of the first argument. */
1731	tmp.nelem = nfirstpos[-1];
1732	tmp.elems = firstpos;
1733	pos = lastpos + nlastpos[-1];
1734	for (j = 0; j < nlastpos[-2]; ++j)
1735	  {
1736	    merge(&tmp, &d->follows[pos[j].index], &merged);
1737	    REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
1738				 nalloc[pos[j].index], merged.nelem - 1);
1739	    copy(&merged, &d->follows[pos[j].index]);
1740	  }
1741
1742	/* The firstpos of a CAT node is the firstpos of the first argument,
1743	   union that of the second argument if the first is nullable. */
1744	if (nullable[-2])
1745	  nfirstpos[-2] += nfirstpos[-1];
1746	else
1747	  firstpos += nfirstpos[-1];
1748	--nfirstpos;
1749
1750	/* The lastpos of a CAT node is the lastpos of the second argument,
1751	   union that of the first argument if the second is nullable. */
1752	if (nullable[-1])
1753	  nlastpos[-2] += nlastpos[-1];
1754	else
1755	  {
1756	    pos = lastpos + nlastpos[-2];
1757	    for (j = nlastpos[-1] - 1; j >= 0; --j)
1758	      pos[j] = lastpos[j];
1759	    lastpos += nlastpos[-2];
1760	    nlastpos[-2] = nlastpos[-1];
1761	  }
1762	--nlastpos;
1763
1764	/* A CAT node is nullable if both arguments are nullable. */
1765	nullable[-2] = nullable[-1] && nullable[-2];
1766	--nullable;
1767	break;
1768
1769      case OR:
1770      case ORTOP:
1771	/* The firstpos is the union of the firstpos of each argument. */
1772	nfirstpos[-2] += nfirstpos[-1];
1773	--nfirstpos;
1774
1775	/* The lastpos is the union of the lastpos of each argument. */
1776	nlastpos[-2] += nlastpos[-1];
1777	--nlastpos;
1778
1779	/* An OR node is nullable if either argument is nullable. */
1780	nullable[-2] = nullable[-1] || nullable[-2];
1781	--nullable;
1782	break;
1783
1784      default:
1785	/* Anything else is a nonempty position.  (Note that special
1786	   constructs like \< are treated as nonempty strings here;
1787	   an "epsilon closure" effectively makes them nullable later.
1788	   Backreferences have to get a real position so we can detect
1789	   transitions on them later.  But they are nullable. */
1790	*nullable++ = d->tokens[i] == BACKREF;
1791
1792	/* This position is in its own firstpos and lastpos. */
1793	*nfirstpos++ = *nlastpos++ = 1;
1794	--firstpos, --lastpos;
1795	firstpos->index = lastpos->index = i;
1796	firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
1797
1798	/* Allocate the follow set for this position. */
1799	nalloc[i] = 1;
1800	MALLOC(d->follows[i].elems, position, nalloc[i]);
1801	break;
1802      }
1803#ifdef DEBUG
1804    /* ... balance the above nonsyntactic #ifdef goo... */
1805      fprintf(stderr, "node %d:", i);
1806      prtok(d->tokens[i]);
1807      putc('\n', stderr);
1808      fprintf(stderr, nullable[-1] ? " nullable: yes\n" : " nullable: no\n");
1809      fprintf(stderr, " firstpos:");
1810      for (j = nfirstpos[-1] - 1; j >= 0; --j)
1811	{
1812	  fprintf(stderr, " %d:", firstpos[j].index);
1813	  prtok(d->tokens[firstpos[j].index]);
1814	}
1815      fprintf(stderr, "\n lastpos:");
1816      for (j = nlastpos[-1] - 1; j >= 0; --j)
1817	{
1818	  fprintf(stderr, " %d:", lastpos[j].index);
1819	  prtok(d->tokens[lastpos[j].index]);
1820	}
1821      putc('\n', stderr);
1822    }
1823#endif
1824
1825  /* For each follow set that is the follow set of a real position, replace
1826     it with its epsilon closure. */
1827  for (i = 0; i < d->tindex; ++i)
1828    if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF
1829#ifdef MBS_SUPPORT
1830        || d->tokens[i] == ANYCHAR
1831        || d->tokens[i] == MBCSET
1832#endif
1833	|| d->tokens[i] >= CSET)
1834      {
1835#ifdef DEBUG
1836	fprintf(stderr, "follows(%d:", i);
1837	prtok(d->tokens[i]);
1838	fprintf(stderr, "):");
1839	for (j = d->follows[i].nelem - 1; j >= 0; --j)
1840	  {
1841	    fprintf(stderr, " %d:", d->follows[i].elems[j].index);
1842	    prtok(d->tokens[d->follows[i].elems[j].index]);
1843	  }
1844	putc('\n', stderr);
1845#endif
1846	copy(&d->follows[i], &merged);
1847	epsclosure(&merged, d);
1848	if (d->follows[i].nelem < merged.nelem)
1849	  REALLOC(d->follows[i].elems, position, merged.nelem);
1850	copy(&merged, &d->follows[i]);
1851      }
1852
1853  /* Get the epsilon closure of the firstpos of the regexp.  The result will
1854     be the set of positions of state 0. */
1855  merged.nelem = 0;
1856  for (i = 0; i < nfirstpos[-1]; ++i)
1857    insert(firstpos[i], &merged);
1858  epsclosure(&merged, d);
1859
1860  /* Check if any of the positions of state 0 will want newline context. */
1861  wants_newline = 0;
1862  for (i = 0; i < merged.nelem; ++i)
1863    if (PREV_NEWLINE_DEPENDENT(merged.elems[i].constraint))
1864      wants_newline = 1;
1865
1866  /* Build the initial state. */
1867  d->salloc = 1;
1868  d->sindex = 0;
1869  MALLOC(d->states, dfa_state, d->salloc);
1870  state_index(d, &merged, wants_newline, 0);
1871
1872  free(o_nullable);
1873  free(o_nfirst);
1874  free(o_firstpos);
1875  free(o_nlast);
1876  free(o_lastpos);
1877  free(nalloc);
1878  free(merged.elems);
1879}
1880
1881/* Find, for each character, the transition out of state s of d, and store
1882   it in the appropriate slot of trans.
1883
1884   We divide the positions of s into groups (positions can appear in more
1885   than one group).  Each group is labeled with a set of characters that
1886   every position in the group matches (taking into account, if necessary,
1887   preceding context information of s).  For each group, find the union
1888   of the its elements' follows.  This set is the set of positions of the
1889   new state.  For each character in the group's label, set the transition
1890   on this character to be to a state corresponding to the set's positions,
1891   and its associated backward context information, if necessary.
1892
1893   If we are building a searching matcher, we include the positions of state
1894   0 in every state.
1895
1896   The collection of groups is constructed by building an equivalence-class
1897   partition of the positions of s.
1898
1899   For each position, find the set of characters C that it matches.  Eliminate
1900   any characters from C that fail on grounds of backward context.
1901
1902   Search through the groups, looking for a group whose label L has nonempty
1903   intersection with C.  If L - C is nonempty, create a new group labeled
1904   L - C and having the same positions as the current group, and set L to
1905   the intersection of L and C.  Insert the position in this group, set
1906   C = C - L, and resume scanning.
1907
1908   If after comparing with every group there are characters remaining in C,
1909   create a new group labeled with the characters of C and insert this
1910   position in that group. */
1911void
1912dfastate (int s, struct dfa *d, int trans[])
1913{
1914  position_set grps[NOTCHAR];	/* As many as will ever be needed. */
1915  charclass labels[NOTCHAR];	/* Labels corresponding to the groups. */
1916  int ngrps = 0;		/* Number of groups actually used. */
1917  position pos;			/* Current position being considered. */
1918  charclass matches;		/* Set of matching characters. */
1919  int matchesf;			/* True if matches is nonempty. */
1920  charclass intersect;		/* Intersection with some label set. */
1921  int intersectf;		/* True if intersect is nonempty. */
1922  charclass leftovers;		/* Stuff in the label that didn't match. */
1923  int leftoversf;		/* True if leftovers is nonempty. */
1924  static charclass letters;	/* Set of characters considered letters. */
1925  static charclass newline;	/* Set of characters that aren't newline. */
1926  position_set follows;		/* Union of the follows of some group. */
1927  position_set tmp;		/* Temporary space for merging sets. */
1928  int state;			/* New state. */
1929  int wants_newline;		/* New state wants to know newline context. */
1930  int state_newline;		/* New state on a newline transition. */
1931  int wants_letter;		/* New state wants to know letter context. */
1932  int state_letter;		/* New state on a letter transition. */
1933  static int initialized;	/* Flag for static initialization. */
1934#ifdef MBS_SUPPORT
1935  int next_isnt_1st_byte = 0;	/* Flag If we can't add state0.  */
1936#endif
1937  int i, j, k;
1938
1939  /* Initialize the set of letters, if necessary. */
1940  if (! initialized)
1941    {
1942      initialized = 1;
1943      for (i = 0; i < NOTCHAR; ++i)
1944	if (IS_WORD_CONSTITUENT(i))
1945	  setbit(i, letters);
1946      setbit(eolbyte, newline);
1947    }
1948
1949  zeroset(matches);
1950
1951  for (i = 0; i < d->states[s].elems.nelem; ++i)
1952    {
1953      pos = d->states[s].elems.elems[i];
1954      if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
1955	setbit(d->tokens[pos.index], matches);
1956      else if (d->tokens[pos.index] >= CSET)
1957	copyset(d->charclasses[d->tokens[pos.index] - CSET], matches);
1958#ifdef MBS_SUPPORT
1959      else if (d->tokens[pos.index] == ANYCHAR
1960               || d->tokens[pos.index] == MBCSET)
1961      /* MB_CUR_MAX > 1  */
1962	{
1963	  /* ANYCHAR and MBCSET must match with a single character, so we
1964	     must put it to d->states[s].mbps, which contains the positions
1965	     which can match with a single character not a byte.  */
1966	  if (d->states[s].mbps.nelem == 0)
1967	    {
1968	      MALLOC(d->states[s].mbps.elems, position,
1969		     d->states[s].elems.nelem);
1970	    }
1971	  insert(pos, &(d->states[s].mbps));
1972	  continue;
1973	}
1974#endif /* MBS_SUPPORT */
1975      else
1976	continue;
1977
1978      /* Some characters may need to be eliminated from matches because
1979	 they fail in the current context. */
1980      if (pos.constraint != 0xFF)
1981	{
1982	  if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
1983					 d->states[s].newline, 1))
1984	    clrbit(eolbyte, matches);
1985	  if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
1986					 d->states[s].newline, 0))
1987	    for (j = 0; j < CHARCLASS_INTS; ++j)
1988	      matches[j] &= newline[j];
1989	  if (! MATCHES_LETTER_CONTEXT(pos.constraint,
1990					d->states[s].letter, 1))
1991	    for (j = 0; j < CHARCLASS_INTS; ++j)
1992	      matches[j] &= ~letters[j];
1993	  if (! MATCHES_LETTER_CONTEXT(pos.constraint,
1994					d->states[s].letter, 0))
1995	    for (j = 0; j < CHARCLASS_INTS; ++j)
1996	      matches[j] &= letters[j];
1997
1998	  /* If there are no characters left, there's no point in going on. */
1999	  for (j = 0; j < CHARCLASS_INTS && !matches[j]; ++j)
2000	    continue;
2001	  if (j == CHARCLASS_INTS)
2002	    continue;
2003	}
2004
2005      for (j = 0; j < ngrps; ++j)
2006	{
2007	  /* If matches contains a single character only, and the current
2008	     group's label doesn't contain that character, go on to the
2009	     next group. */
2010	  if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR
2011	      && !tstbit(d->tokens[pos.index], labels[j]))
2012	    continue;
2013
2014	  /* Check if this group's label has a nonempty intersection with
2015	     matches. */
2016	  intersectf = 0;
2017	  for (k = 0; k < CHARCLASS_INTS; ++k)
2018	    (intersect[k] = matches[k] & labels[j][k]) ? (intersectf = 1) : 0;
2019	  if (! intersectf)
2020	    continue;
2021
2022	  /* It does; now find the set differences both ways. */
2023	  leftoversf = matchesf = 0;
2024	  for (k = 0; k < CHARCLASS_INTS; ++k)
2025	    {
2026	      /* Even an optimizing compiler can't know this for sure. */
2027	      int match = matches[k], label = labels[j][k];
2028
2029	      (leftovers[k] = ~match & label) ? (leftoversf = 1) : 0;
2030	      (matches[k] = match & ~label) ? (matchesf = 1) : 0;
2031	    }
2032
2033	  /* If there were leftovers, create a new group labeled with them. */
2034	  if (leftoversf)
2035	    {
2036	      copyset(leftovers, labels[ngrps]);
2037	      copyset(intersect, labels[j]);
2038	      MALLOC(grps[ngrps].elems, position, d->nleaves);
2039	      copy(&grps[j], &grps[ngrps]);
2040	      ++ngrps;
2041	    }
2042
2043	  /* Put the position in the current group.  Note that there is no
2044	     reason to call insert() here. */
2045	  grps[j].elems[grps[j].nelem++] = pos;
2046
2047	  /* If every character matching the current position has been
2048	     accounted for, we're done. */
2049	  if (! matchesf)
2050	    break;
2051	}
2052
2053      /* If we've passed the last group, and there are still characters
2054	 unaccounted for, then we'll have to create a new group. */
2055      if (j == ngrps)
2056	{
2057	  copyset(matches, labels[ngrps]);
2058	  zeroset(matches);
2059	  MALLOC(grps[ngrps].elems, position, d->nleaves);
2060	  grps[ngrps].nelem = 1;
2061	  grps[ngrps].elems[0] = pos;
2062	  ++ngrps;
2063	}
2064    }
2065
2066  MALLOC(follows.elems, position, d->nleaves);
2067  MALLOC(tmp.elems, position, d->nleaves);
2068
2069  /* If we are a searching matcher, the default transition is to a state
2070     containing the positions of state 0, otherwise the default transition
2071     is to fail miserably. */
2072  if (d->searchflag)
2073    {
2074      wants_newline = 0;
2075      wants_letter = 0;
2076      for (i = 0; i < d->states[0].elems.nelem; ++i)
2077	{
2078	  if (PREV_NEWLINE_DEPENDENT(d->states[0].elems.elems[i].constraint))
2079	    wants_newline = 1;
2080	  if (PREV_LETTER_DEPENDENT(d->states[0].elems.elems[i].constraint))
2081	    wants_letter = 1;
2082	}
2083      copy(&d->states[0].elems, &follows);
2084      state = state_index(d, &follows, 0, 0);
2085      if (wants_newline)
2086	state_newline = state_index(d, &follows, 1, 0);
2087      else
2088	state_newline = state;
2089      if (wants_letter)
2090	state_letter = state_index(d, &follows, 0, 1);
2091      else
2092	state_letter = state;
2093      for (i = 0; i < NOTCHAR; ++i)
2094	trans[i] = (IS_WORD_CONSTITUENT(i)) ? state_letter : state;
2095      trans[eolbyte] = state_newline;
2096    }
2097  else
2098    for (i = 0; i < NOTCHAR; ++i)
2099      trans[i] = -1;
2100
2101  for (i = 0; i < ngrps; ++i)
2102    {
2103      follows.nelem = 0;
2104
2105      /* Find the union of the follows of the positions of the group.
2106	 This is a hideously inefficient loop.  Fix it someday. */
2107      for (j = 0; j < grps[i].nelem; ++j)
2108	for (k = 0; k < d->follows[grps[i].elems[j].index].nelem; ++k)
2109	  insert(d->follows[grps[i].elems[j].index].elems[k], &follows);
2110
2111#ifdef MBS_SUPPORT
2112      if (MB_CUR_MAX > 1)
2113	{
2114	  /* If a token in follows.elems is not 1st byte of a multibyte
2115	     character, or the states of follows must accept the bytes
2116	     which are not 1st byte of the multibyte character.
2117	     Then, if a state of follows encounter a byte, it must not be
2118	     a 1st byte of a multibyte character nor singlebyte character.
2119	     We cansel to add state[0].follows to next state, because
2120	     state[0] must accept 1st-byte
2121
2122	     For example, we assume <sb a> is a certain singlebyte
2123	     character, <mb A> is a certain multibyte character, and the
2124	     codepoint of <sb a> equals the 2nd byte of the codepoint of
2125	     <mb A>.
2126	     When state[0] accepts <sb a>, state[i] transit to state[i+1]
2127	     by accepting accepts 1st byte of <mb A>, and state[i+1]
2128	     accepts 2nd byte of <mb A>, if state[i+1] encounter the
2129	     codepoint of <sb a>, it must not be <sb a> but 2nd byte of
2130	     <mb A>, so we can not add state[0].  */
2131
2132	  next_isnt_1st_byte = 0;
2133	  for (j = 0; j < follows.nelem; ++j)
2134	    {
2135	      if (!(d->multibyte_prop[follows.elems[j].index] & 1))
2136		{
2137		  next_isnt_1st_byte = 1;
2138		  break;
2139		}
2140	    }
2141	}
2142#endif
2143
2144      /* If we are building a searching matcher, throw in the positions
2145	 of state 0 as well. */
2146#ifdef MBS_SUPPORT
2147      if (d->searchflag && (MB_CUR_MAX == 1 || !next_isnt_1st_byte))
2148#else
2149      if (d->searchflag)
2150#endif
2151	for (j = 0; j < d->states[0].elems.nelem; ++j)
2152	  insert(d->states[0].elems.elems[j], &follows);
2153
2154      /* Find out if the new state will want any context information. */
2155      wants_newline = 0;
2156      if (tstbit(eolbyte, labels[i]))
2157	for (j = 0; j < follows.nelem; ++j)
2158	  if (PREV_NEWLINE_DEPENDENT(follows.elems[j].constraint))
2159	    wants_newline = 1;
2160
2161      wants_letter = 0;
2162      for (j = 0; j < CHARCLASS_INTS; ++j)
2163	if (labels[i][j] & letters[j])
2164	  break;
2165      if (j < CHARCLASS_INTS)
2166	for (j = 0; j < follows.nelem; ++j)
2167	  if (PREV_LETTER_DEPENDENT(follows.elems[j].constraint))
2168	    wants_letter = 1;
2169
2170      /* Find the state(s) corresponding to the union of the follows. */
2171      state = state_index(d, &follows, 0, 0);
2172      if (wants_newline)
2173	state_newline = state_index(d, &follows, 1, 0);
2174      else
2175	state_newline = state;
2176      if (wants_letter)
2177	state_letter = state_index(d, &follows, 0, 1);
2178      else
2179	state_letter = state;
2180
2181      /* Set the transitions for each character in the current label. */
2182      for (j = 0; j < CHARCLASS_INTS; ++j)
2183	for (k = 0; k < INTBITS; ++k)
2184	  if (labels[i][j] & 1 << k)
2185	    {
2186	      int c = j * INTBITS + k;
2187
2188	      if (c == eolbyte)
2189		trans[c] = state_newline;
2190	      else if (IS_WORD_CONSTITUENT(c))
2191		trans[c] = state_letter;
2192	      else if (c < NOTCHAR)
2193		trans[c] = state;
2194	    }
2195    }
2196
2197  for (i = 0; i < ngrps; ++i)
2198    free(grps[i].elems);
2199  free(follows.elems);
2200  free(tmp.elems);
2201}
2202
2203/* Some routines for manipulating a compiled dfa's transition tables.
2204   Each state may or may not have a transition table; if it does, and it
2205   is a non-accepting state, then d->trans[state] points to its table.
2206   If it is an accepting state then d->fails[state] points to its table.
2207   If it has no table at all, then d->trans[state] is NULL.
2208   TODO: Improve this comment, get rid of the unnecessary redundancy. */
2209
2210static void
2211build_state (int s, struct dfa *d)
2212{
2213  int *trans;			/* The new transition table. */
2214  int i;
2215
2216  /* Set an upper limit on the number of transition tables that will ever
2217     exist at once.  1024 is arbitrary.  The idea is that the frequently
2218     used transition tables will be quickly rebuilt, whereas the ones that
2219     were only needed once or twice will be cleared away. */
2220  if (d->trcount >= 1024)
2221    {
2222      for (i = 0; i < d->tralloc; ++i)
2223	if (d->trans[i])
2224	  {
2225	    free((ptr_t) d->trans[i]);
2226	    d->trans[i] = NULL;
2227	  }
2228	else if (d->fails[i])
2229	  {
2230	    free((ptr_t) d->fails[i]);
2231	    d->fails[i] = NULL;
2232	  }
2233      d->trcount = 0;
2234    }
2235
2236  ++d->trcount;
2237
2238  /* Set up the success bits for this state. */
2239  d->success[s] = 0;
2240  if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 1, d->states[s].letter, 0,
2241      s, *d))
2242    d->success[s] |= 4;
2243  if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 1,
2244      s, *d))
2245    d->success[s] |= 2;
2246  if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 0,
2247      s, *d))
2248    d->success[s] |= 1;
2249
2250  MALLOC(trans, int, NOTCHAR);
2251  dfastate(s, d, trans);
2252
2253  /* Now go through the new transition table, and make sure that the trans
2254     and fail arrays are allocated large enough to hold a pointer for the
2255     largest state mentioned in the table. */
2256  for (i = 0; i < NOTCHAR; ++i)
2257    if (trans[i] >= d->tralloc)
2258      {
2259	int oldalloc = d->tralloc;
2260
2261	while (trans[i] >= d->tralloc)
2262	  d->tralloc *= 2;
2263	REALLOC(d->realtrans, int *, d->tralloc + 1);
2264	d->trans = d->realtrans + 1;
2265	REALLOC(d->fails, int *, d->tralloc);
2266	REALLOC(d->success, int, d->tralloc);
2267	while (oldalloc < d->tralloc)
2268	  {
2269	    d->trans[oldalloc] = NULL;
2270	    d->fails[oldalloc++] = NULL;
2271	  }
2272      }
2273
2274  /* Newline is a sentinel.  */
2275  trans[eolbyte] = -1;
2276
2277  if (ACCEPTING(s, *d))
2278    d->fails[s] = trans;
2279  else
2280    d->trans[s] = trans;
2281}
2282
2283static void
2284build_state_zero (struct dfa *d)
2285{
2286  d->tralloc = 1;
2287  d->trcount = 0;
2288  CALLOC(d->realtrans, int *, d->tralloc + 1);
2289  d->trans = d->realtrans + 1;
2290  CALLOC(d->fails, int *, d->tralloc);
2291  MALLOC(d->success, int, d->tralloc);
2292  build_state(0, d);
2293}
2294
2295#ifdef MBS_SUPPORT
2296/* Multibyte character handling sub-routins for dfaexec.  */
2297
2298/* Initial state may encounter the byte which is not a singlebyte character
2299   nor 1st byte of a multibyte character.  But it is incorrect for initial
2300   state to accept such a byte.
2301   For example, in sjis encoding the regular expression like "\\" accepts
2302   the codepoint 0x5c, but should not accept the 2nd byte of the codepoint
2303   0x815c. Then Initial state must skip the bytes which are not a singlebyte
2304   character nor 1st byte of a multibyte character.  */
2305#define SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p)		\
2306  if (s == 0)						\
2307    {							\
2308      while (inputwcs[p - buf_begin] == 0		\
2309            && mblen_buf[p - buf_begin] > 0		\
2310	    && p < buf_end)				\
2311        ++p;						\
2312      if (p >= end)					\
2313	{						\
2314          free(mblen_buf);				\
2315          free(inputwcs);				\
2316	  return (size_t) -1;				\
2317	}						\
2318    }
2319
2320static void
2321realloc_trans_if_necessary(struct dfa *d, int new_state)
2322{
2323  /* Make sure that the trans and fail arrays are allocated large enough
2324     to hold a pointer for the new state. */
2325  if (new_state >= d->tralloc)
2326    {
2327      int oldalloc = d->tralloc;
2328
2329      while (new_state >= d->tralloc)
2330	d->tralloc *= 2;
2331      REALLOC(d->realtrans, int *, d->tralloc + 1);
2332      d->trans = d->realtrans + 1;
2333      REALLOC(d->fails, int *, d->tralloc);
2334      REALLOC(d->success, int, d->tralloc);
2335      while (oldalloc < d->tralloc)
2336	{
2337	  d->trans[oldalloc] = NULL;
2338	  d->fails[oldalloc++] = NULL;
2339	}
2340    }
2341}
2342
2343/* Return values of transit_state_singlebyte(), and
2344   transit_state_consume_1char.  */
2345typedef enum
2346{
2347  TRANSIT_STATE_IN_PROGRESS,	/* State transition has not finished.  */
2348  TRANSIT_STATE_DONE,		/* State transition has finished.  */
2349  TRANSIT_STATE_END_BUFFER	/* Reach the end of the buffer.  */
2350} status_transit_state;
2351
2352/* Consume a single byte and transit state from 's' to '*next_state'.
2353   This function is almost same as the state transition routin in dfaexec().
2354   But state transition is done just once, otherwise matching succeed or
2355   reach the end of the buffer.  */
2356static status_transit_state
2357transit_state_singlebyte (struct dfa *d, int s, unsigned char const *p,
2358				  int *next_state)
2359{
2360  int *t;
2361  int works = s;
2362
2363  status_transit_state rval = TRANSIT_STATE_IN_PROGRESS;
2364
2365  while (rval == TRANSIT_STATE_IN_PROGRESS)
2366    {
2367      if ((t = d->trans[works]) != NULL)
2368	{
2369	  works = t[*p];
2370	  rval = TRANSIT_STATE_DONE;
2371	  if (works < 0)
2372	    works = 0;
2373	}
2374      else if (works < 0)
2375	{
2376	  if (p == buf_end)
2377	    /* At the moment, it must not happen.  */
2378	    return TRANSIT_STATE_END_BUFFER;
2379	  works = 0;
2380	}
2381      else if (d->fails[works])
2382	{
2383	  works = d->fails[works][*p];
2384	  rval = TRANSIT_STATE_DONE;
2385	}
2386      else
2387	{
2388	  build_state(works, d);
2389	}
2390    }
2391  *next_state = works;
2392  return rval;
2393}
2394
2395/* Check whether period can match or not in the current context.  If it can,
2396   return the amount of the bytes with which period can match, otherwise
2397   return 0.
2398   `pos' is the position of the period.  `index' is the index from the
2399   buf_begin, and it is the current position in the buffer.  */
2400static int
2401match_anychar (struct dfa *d, int s, position pos, int index)
2402{
2403  int newline = 0;
2404  int letter = 0;
2405  wchar_t wc;
2406  int mbclen;
2407
2408  wc = inputwcs[index];
2409  mbclen = (mblen_buf[index] == 0)? 1 : mblen_buf[index];
2410
2411  /* Check context.  */
2412  if (wc == (wchar_t)eolbyte)
2413    {
2414      if (!(syntax_bits & RE_DOT_NEWLINE))
2415	return 0;
2416      newline = 1;
2417    }
2418  else if (wc == (wchar_t)'\0')
2419    {
2420      if (syntax_bits & RE_DOT_NOT_NULL)
2421	return 0;
2422      newline = 1;
2423    }
2424
2425  if (iswalnum(wc) || wc == L'_')
2426    letter = 1;
2427
2428  if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline,
2429			   newline, d->states[s].letter, letter))
2430    return 0;
2431
2432  return mbclen;
2433}
2434
2435/* Check whether bracket expression can match or not in the current context.
2436   If it can, return the amount of the bytes with which expression can match,
2437   otherwise return 0.
2438   `pos' is the position of the bracket expression.  `index' is the index
2439   from the buf_begin, and it is the current position in the buffer.  */
2440int
2441match_mb_charset (struct dfa *d, int s, position pos, int index)
2442{
2443  int i;
2444  int match;		/* Flag which represent that matching succeed.  */
2445  int match_len;	/* Length of the character (or collating element)
2446			   with which this operator match.  */
2447  int op_len;		/* Length of the operator.  */
2448  char buffer[128];
2449  wchar_t wcbuf[6];
2450
2451  /* Pointer to the structure to which we are currently reffering.  */
2452  struct mb_char_classes *work_mbc;
2453
2454  int newline = 0;
2455  int letter = 0;
2456  wchar_t wc;		/* Current reffering character.  */
2457
2458  wc = inputwcs[index];
2459
2460  /* Check context.  */
2461  if (wc == (wchar_t)eolbyte)
2462    {
2463      if (!(syntax_bits & RE_DOT_NEWLINE))
2464	return 0;
2465      newline = 1;
2466    }
2467  else if (wc == (wchar_t)'\0')
2468    {
2469      if (syntax_bits & RE_DOT_NOT_NULL)
2470	return 0;
2471      newline = 1;
2472    }
2473  if (iswalnum(wc) || wc == L'_')
2474    letter = 1;
2475  if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline,
2476			   newline, d->states[s].letter, letter))
2477    return 0;
2478
2479  /* Assign the current reffering operator to work_mbc.  */
2480  work_mbc = &(d->mbcsets[(d->multibyte_prop[pos.index]) >> 2]);
2481  match = !work_mbc->invert;
2482  match_len = (mblen_buf[index] == 0)? 1 : mblen_buf[index];
2483
2484  /* match with a character class?  */
2485  for (i = 0; i<work_mbc->nch_classes; i++)
2486    {
2487      if (iswctype((wint_t)wc, work_mbc->ch_classes[i]))
2488	goto charset_matched;
2489    }
2490
2491  strncpy(buffer, (const char *) (buf_begin + index), match_len);
2492  buffer[match_len] = '\0';
2493
2494  /* match with an equivalent class?  */
2495  for (i = 0; i<work_mbc->nequivs; i++)
2496    {
2497      op_len = strlen(work_mbc->equivs[i]);
2498      strncpy(buffer, (const char *) (buf_begin + index), op_len);
2499      buffer[op_len] = '\0';
2500      if (strcoll(work_mbc->equivs[i], buffer) == 0)
2501	{
2502	  match_len = op_len;
2503	  goto charset_matched;
2504	}
2505    }
2506
2507  /* match with a collating element?  */
2508  for (i = 0; i<work_mbc->ncoll_elems; i++)
2509    {
2510      op_len = strlen(work_mbc->coll_elems[i]);
2511      strncpy(buffer, (const char *) (buf_begin + index), op_len);
2512      buffer[op_len] = '\0';
2513
2514      if (strcoll(work_mbc->coll_elems[i], buffer) == 0)
2515	{
2516	  match_len = op_len;
2517	  goto charset_matched;
2518	}
2519    }
2520
2521  wcbuf[0] = wc;
2522  wcbuf[1] = wcbuf[3] = wcbuf[5] = '\0';
2523
2524  /* match with a range?  */
2525  for (i = 0; i<work_mbc->nranges; i++)
2526    {
2527      wcbuf[2] = work_mbc->range_sts[i];
2528      wcbuf[4] = work_mbc->range_ends[i];
2529
2530      if (wcscoll(wcbuf, wcbuf+2) >= 0 &&
2531	  wcscoll(wcbuf+4, wcbuf) >= 0)
2532	goto charset_matched;
2533    }
2534
2535  /* match with a character?  */
2536  for (i = 0; i<work_mbc->nchars; i++)
2537    {
2538      if (wc == work_mbc->chars[i])
2539	goto charset_matched;
2540    }
2541
2542  match = !match;
2543
2544 charset_matched:
2545  return match ? match_len : 0;
2546}
2547
2548/* Check each of `d->states[s].mbps.elem' can match or not. Then return the
2549   array which corresponds to `d->states[s].mbps.elem' and each element of
2550   the array contains the amount of the bytes with which the element can
2551   match.
2552   `index' is the index from the buf_begin, and it is the current position
2553   in the buffer.
2554   Caller MUST free the array which this function return.  */
2555static int*
2556check_matching_with_multibyte_ops (struct dfa *d, int s, int index)
2557{
2558  int i;
2559  int* rarray;
2560
2561  MALLOC(rarray, int, d->states[s].mbps.nelem);
2562  for (i = 0; i < d->states[s].mbps.nelem; ++i)
2563    {
2564      position pos = d->states[s].mbps.elems[i];
2565      switch(d->tokens[pos.index])
2566	{
2567	case ANYCHAR:
2568	  rarray[i] = match_anychar(d, s, pos, index);
2569	  break;
2570	case MBCSET:
2571	  rarray[i] = match_mb_charset(d, s, pos, index);
2572	  break;
2573	default:
2574	  break; /* can not happen.  */
2575	}
2576    }
2577  return rarray;
2578}
2579
2580/* Consume a single character and enumerate all of the positions which can
2581   be next position from the state `s'.
2582   `match_lens' is the input. It can be NULL, but it can also be the output
2583   of check_matching_with_multibyte_ops() for optimization.
2584   `mbclen' and `pps' are the output.  `mbclen' is the length of the
2585   character consumed, and `pps' is the set this function enumerate.  */
2586static status_transit_state
2587transit_state_consume_1char (struct dfa *d, int s, unsigned char const **pp,
2588			     int *match_lens, int *mbclen, position_set *pps)
2589{
2590  int i, j;
2591  int s1, s2;
2592  int* work_mbls;
2593  status_transit_state rs = TRANSIT_STATE_DONE;
2594
2595  /* Calculate the length of the (single/multi byte) character
2596     to which p points.  */
2597  *mbclen = (mblen_buf[*pp - buf_begin] == 0)? 1
2598    : mblen_buf[*pp - buf_begin];
2599
2600  /* Calculate the state which can be reached from the state `s' by
2601     consuming `*mbclen' single bytes from the buffer.  */
2602  s1 = s;
2603  for (i = 0; i < *mbclen; i++)
2604    {
2605      s2 = s1;
2606      rs = transit_state_singlebyte(d, s2, (*pp)++, &s1);
2607    }
2608  /* Copy the positions contained by `s1' to the set `pps'.  */
2609  copy(&(d->states[s1].elems), pps);
2610
2611  /* Check (inputed)match_lens, and initialize if it is NULL.  */
2612  if (match_lens == NULL && d->states[s].mbps.nelem != 0)
2613    work_mbls = check_matching_with_multibyte_ops(d, s, *pp - buf_begin);
2614  else
2615    work_mbls = match_lens;
2616
2617  /* Add all of the positions which can be reached from `s' by consuming
2618     a single character.  */
2619  for (i = 0; i < d->states[s].mbps.nelem ; i++)
2620   {
2621      if (work_mbls[i] == *mbclen)
2622	for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem;
2623	     j++)
2624	  insert(d->follows[d->states[s].mbps.elems[i].index].elems[j],
2625		 pps);
2626    }
2627
2628  if (match_lens == NULL && work_mbls != NULL)
2629    free(work_mbls);
2630  return rs;
2631}
2632
2633/* Transit state from s, then return new state and update the pointer of the
2634   buffer.  This function is for some operator which can match with a multi-
2635   byte character or a collating element(which may be multi characters).  */
2636static int
2637transit_state (struct dfa *d, int s, unsigned char const **pp)
2638{
2639  int s1;
2640  int mbclen;		/* The length of current input multibyte character. */
2641  int maxlen = 0;
2642  int i, j;
2643  int *match_lens = NULL;
2644  int nelem = d->states[s].mbps.nelem; /* Just a alias.  */
2645  position_set follows;
2646  unsigned char const *p1 = *pp;
2647  status_transit_state rs;
2648  wchar_t wc;
2649
2650  if (nelem > 0)
2651    /* This state has (a) multibyte operator(s).
2652       We check whether each of them can match or not.  */
2653    {
2654      /* Note: caller must free the return value of this function.  */
2655      match_lens = check_matching_with_multibyte_ops(d, s, *pp - buf_begin);
2656
2657      for (i = 0; i < nelem; i++)
2658	/* Search the operator which match the longest string,
2659	   in this state.  */
2660	{
2661	  if (match_lens[i] > maxlen)
2662	    maxlen = match_lens[i];
2663	}
2664    }
2665
2666  if (nelem == 0 || maxlen == 0)
2667    /* This state has no multibyte operator which can match.
2668       We need to  check only one singlebyte character.  */
2669    {
2670      status_transit_state rs;
2671      rs = transit_state_singlebyte(d, s, *pp, &s1);
2672
2673      /* We must update the pointer if state transition succeeded.  */
2674      if (rs == TRANSIT_STATE_DONE)
2675	++*pp;
2676
2677      if (match_lens != NULL)
2678	free(match_lens);
2679      return s1;
2680    }
2681
2682  /* This state has some operators which can match a multibyte character.  */
2683  follows.nelem = 0;
2684  MALLOC(follows.elems, position, d->nleaves);
2685
2686  /* `maxlen' may be longer than the length of a character, because it may
2687     not be a character but a (multi character) collating element.
2688     We enumerate all of the positions which `s' can reach by consuming
2689     `maxlen' bytes.  */
2690  rs = transit_state_consume_1char(d, s, pp, match_lens, &mbclen, &follows);
2691
2692  wc = inputwcs[*pp - mbclen - buf_begin];
2693  s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc));
2694  realloc_trans_if_necessary(d, s1);
2695
2696  while (*pp - p1 < maxlen)
2697    {
2698      follows.nelem = 0;
2699      rs = transit_state_consume_1char(d, s1, pp, NULL, &mbclen, &follows);
2700
2701      for (i = 0; i < nelem ; i++)
2702	{
2703	  if (match_lens[i] == *pp - p1)
2704	    for (j = 0;
2705		 j < d->follows[d->states[s1].mbps.elems[i].index].nelem; j++)
2706	      insert(d->follows[d->states[s1].mbps.elems[i].index].elems[j],
2707		     &follows);
2708	}
2709
2710      wc = inputwcs[*pp - mbclen - buf_begin];
2711      s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc));
2712      realloc_trans_if_necessary(d, s1);
2713    }
2714  free(match_lens);
2715  free(follows.elems);
2716  return s1;
2717}
2718
2719#endif
2720
2721/* Search through a buffer looking for a match to the given struct dfa.
2722   Find the first occurrence of a string matching the regexp in the buffer,
2723   and the shortest possible version thereof.  Return the offset of the first
2724   character after the match, or (size_t) -1 if none is found.  BEGIN points to
2725   the beginning of the buffer, and SIZE is the size of the buffer.  If SIZE
2726   is nonzero, BEGIN[SIZE - 1] must be a newline.  BACKREF points to a place
2727   where we're supposed to store a 1 if backreferencing happened and the
2728   match needs to be verified by a backtracking matcher.  Otherwise
2729   we store a 0 in *backref. */
2730size_t
2731dfaexec (struct dfa *d, char const *begin, size_t size, int *backref)
2732{
2733  register int s;	/* Current state. */
2734  register unsigned char const *p; /* Current input character. */
2735  register unsigned char const *end; /* One past the last input character.  */
2736  register int **trans, *t;	/* Copy of d->trans so it can be optimized
2737				   into a register. */
2738  register unsigned char eol = eolbyte;	/* Likewise for eolbyte.  */
2739  static int sbit[NOTCHAR];	/* Table for anding with d->success. */
2740  static int sbit_init;
2741
2742  if (! sbit_init)
2743    {
2744      int i;
2745
2746      sbit_init = 1;
2747      for (i = 0; i < NOTCHAR; ++i)
2748	sbit[i] = (IS_WORD_CONSTITUENT(i)) ? 2 : 1;
2749      sbit[eol] = 4;
2750    }
2751
2752  if (! d->tralloc)
2753    build_state_zero(d);
2754
2755  s = 0;
2756  p = (unsigned char const *) begin;
2757  end = p + size;
2758  trans = d->trans;
2759
2760#ifdef MBS_SUPPORT
2761  if (MB_CUR_MAX > 1)
2762    {
2763      int remain_bytes, i;
2764      buf_begin = (unsigned char const *)begin;
2765      buf_end = end;
2766
2767      /* initialize mblen_buf, and inputwcs.  */
2768      MALLOC(mblen_buf, unsigned char, end - (unsigned char const *)begin + 2);
2769      MALLOC(inputwcs, wchar_t, end - (unsigned char const *)begin + 2);
2770      memset(&mbs, 0, sizeof(mbstate_t));
2771      remain_bytes = 0;
2772      for (i = 0; i < end - (unsigned char const *)begin + 1; i++)
2773	{
2774	  if (remain_bytes == 0)
2775	    {
2776	      remain_bytes
2777		= mbrtowc(inputwcs + i, begin + i,
2778			  end - (unsigned char const *)begin - i + 1, &mbs);
2779	      if (remain_bytes <= 1)
2780		{
2781		  remain_bytes = 0;
2782		  inputwcs[i] = (wchar_t)begin[i];
2783		  mblen_buf[i] = 0;
2784		}
2785	      else
2786		{
2787		  mblen_buf[i] = remain_bytes;
2788		  remain_bytes--;
2789		}
2790	    }
2791	  else
2792	    {
2793	      mblen_buf[i] = remain_bytes;
2794	      inputwcs[i] = 0;
2795	      remain_bytes--;
2796	    }
2797	}
2798      mblen_buf[i] = 0;
2799      inputwcs[i] = 0; /* sentinel */
2800    }
2801#endif /* MBS_SUPPORT */
2802
2803  for (;;)
2804    {
2805#ifdef MBS_SUPPORT
2806      if (MB_CUR_MAX > 1)
2807	while ((t = trans[s]))
2808	  {
2809	    if (p == end)
2810	      {
2811		free(mblen_buf);
2812		free(inputwcs);
2813		return (size_t) -1;
2814	      }
2815	    if (d->states[s].mbps.nelem != 0)
2816	      {
2817		/* Can match with a multibyte character( and multi character
2818		   collating element).  */
2819		unsigned char const *nextp;
2820
2821		SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
2822
2823		nextp = p;
2824		s = transit_state(d, s, &nextp);
2825		p = nextp;
2826
2827		/* Trans table might be updated.  */
2828		trans = d->trans;
2829	      }
2830	    else
2831	      {
2832		SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
2833		s = t[*p++];
2834	      }
2835	  }
2836      else
2837#endif /* MBS_SUPPORT */
2838        while ((t = trans[s]))
2839	  {
2840	    if (p == end)
2841	      return (size_t) -1;
2842	    s = t[*p++];
2843	  }
2844
2845      if (s < 0)
2846	{
2847	  if (p == end)
2848	    {
2849#ifdef MBS_SUPPORT
2850	      if (MB_CUR_MAX > 1)
2851		{
2852		  free(mblen_buf);
2853		  free(inputwcs);
2854		}
2855#endif /* MBS_SUPPORT */
2856	      return (size_t) -1;
2857	    }
2858	  s = 0;
2859	}
2860      else if ((t = d->fails[s]))
2861	{
2862	  if (d->success[s] & sbit[*p])
2863	    {
2864	      if (backref)
2865		*backref = (d->states[s].backref != 0);
2866#ifdef MBS_SUPPORT
2867	      if (MB_CUR_MAX > 1)
2868		{
2869		  free(mblen_buf);
2870		  free(inputwcs);
2871		}
2872#endif /* MBS_SUPPORT */
2873	      return (char const *) p - begin;
2874	    }
2875
2876#ifdef MBS_SUPPORT
2877	  if (MB_CUR_MAX > 1)
2878	    {
2879		SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
2880		if (d->states[s].mbps.nelem != 0)
2881		  {
2882		    /* Can match with a multibyte character( and multi
2883		       character collating element).  */
2884		    unsigned char const *nextp;
2885		    nextp = p;
2886		    s = transit_state(d, s, &nextp);
2887		    p = nextp;
2888
2889		    /* Trans table might be updated.  */
2890		    trans = d->trans;
2891		  }
2892		else
2893		s = t[*p++];
2894	    }
2895	  else
2896#endif /* MBS_SUPPORT */
2897	  s = t[*p++];
2898	}
2899      else
2900	{
2901	  build_state(s, d);
2902	  trans = d->trans;
2903	}
2904    }
2905}
2906
2907/* Initialize the components of a dfa that the other routines don't
2908   initialize for themselves. */
2909void
2910dfainit (struct dfa *d)
2911{
2912  d->calloc = 1;
2913  MALLOC(d->charclasses, charclass, d->calloc);
2914  d->cindex = 0;
2915
2916  d->talloc = 1;
2917  MALLOC(d->tokens, token, d->talloc);
2918  d->tindex = d->depth = d->nleaves = d->nregexps = 0;
2919#ifdef MBS_SUPPORT
2920  if (MB_CUR_MAX > 1)
2921    {
2922      d->nmultibyte_prop = 1;
2923      MALLOC(d->multibyte_prop, int, d->nmultibyte_prop);
2924      d->nmbcsets = 0;
2925      d->mbcsets_alloc = 1;
2926      MALLOC(d->mbcsets, struct mb_char_classes, d->mbcsets_alloc);
2927    }
2928#endif
2929
2930  d->searchflag = 0;
2931  d->tralloc = 0;
2932
2933  d->musts = 0;
2934}
2935
2936/* Parse and analyze a single string of the given length. */
2937void
2938dfacomp (char const *s, size_t len, struct dfa *d, int searchflag)
2939{
2940  if (case_fold)	/* dummy folding in service of dfamust() */
2941    {
2942      char *lcopy;
2943      int i;
2944
2945      lcopy = (char *) malloc(len);
2946      if (!lcopy)
2947	dfaerror(_("out of memory"));
2948
2949      /* This is a kludge. */
2950      case_fold = 0;
2951      for (i = 0; i < len; ++i)
2952	if (ISUPPER ((unsigned char) s[i]))
2953	  lcopy[i] = tolower ((unsigned char) s[i]);
2954	else
2955	  lcopy[i] = s[i];
2956
2957      dfainit(d);
2958      dfaparse(lcopy, len, d);
2959      free(lcopy);
2960      dfamust(d);
2961      d->cindex = d->tindex = d->depth = d->nleaves = d->nregexps = 0;
2962      case_fold = 1;
2963      dfaparse(s, len, d);
2964      dfaanalyze(d, searchflag);
2965    }
2966  else
2967    {
2968        dfainit(d);
2969        dfaparse(s, len, d);
2970	dfamust(d);
2971        dfaanalyze(d, searchflag);
2972    }
2973}
2974
2975/* Free the storage held by the components of a dfa. */
2976void
2977dfafree (struct dfa *d)
2978{
2979  int i;
2980  struct dfamust *dm, *ndm;
2981
2982  free((ptr_t) d->charclasses);
2983  free((ptr_t) d->tokens);
2984
2985#ifdef MBS_SUPPORT
2986  if (MB_CUR_MAX > 1)
2987    {
2988      free((ptr_t) d->multibyte_prop);
2989      for (i = 0; i < d->nmbcsets; ++i)
2990	{
2991	  int j;
2992	  struct mb_char_classes *p = &(d->mbcsets[i]);
2993	  if (p->chars != NULL)
2994	    free(p->chars);
2995	  if (p->ch_classes != NULL)
2996	    free(p->ch_classes);
2997	  if (p->range_sts != NULL)
2998	    free(p->range_sts);
2999	  if (p->range_ends != NULL)
3000	    free(p->range_ends);
3001
3002	  for (j = 0; j < p->nequivs; ++j)
3003	    free(p->equivs[j]);
3004	  if (p->equivs != NULL)
3005	    free(p->equivs);
3006
3007	  for (j = 0; j < p->ncoll_elems; ++j)
3008	    free(p->coll_elems[j]);
3009	  if (p->coll_elems != NULL)
3010	    free(p->coll_elems);
3011	}
3012      free((ptr_t) d->mbcsets);
3013    }
3014#endif /* MBS_SUPPORT */
3015
3016  for (i = 0; i < d->sindex; ++i)
3017    free((ptr_t) d->states[i].elems.elems);
3018  free((ptr_t) d->states);
3019  for (i = 0; i < d->tindex; ++i)
3020    if (d->follows[i].elems)
3021      free((ptr_t) d->follows[i].elems);
3022  free((ptr_t) d->follows);
3023  for (i = 0; i < d->tralloc; ++i)
3024    if (d->trans[i])
3025      free((ptr_t) d->trans[i]);
3026    else if (d->fails[i])
3027      free((ptr_t) d->fails[i]);
3028  if (d->realtrans) free((ptr_t) d->realtrans);
3029  if (d->fails) free((ptr_t) d->fails);
3030  if (d->success) free((ptr_t) d->success);
3031  for (dm = d->musts; dm; dm = ndm)
3032    {
3033      ndm = dm->next;
3034      free(dm->must);
3035      free((ptr_t) dm);
3036    }
3037}
3038
3039/* Having found the postfix representation of the regular expression,
3040   try to find a long sequence of characters that must appear in any line
3041   containing the r.e.
3042   Finding a "longest" sequence is beyond the scope here;
3043   we take an easy way out and hope for the best.
3044   (Take "(ab|a)b"--please.)
3045
3046   We do a bottom-up calculation of sequences of characters that must appear
3047   in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3048   representation:
3049	sequences that must appear at the left of the match ("left")
3050	sequences that must appear at the right of the match ("right")
3051	lists of sequences that must appear somewhere in the match ("in")
3052	sequences that must constitute the match ("is")
3053
3054   When we get to the root of the tree, we use one of the longest of its
3055   calculated "in" sequences as our answer.  The sequence we find is returned in
3056   d->must (where "d" is the single argument passed to "dfamust");
3057   the length of the sequence is returned in d->mustn.
3058
3059   The sequences calculated for the various types of node (in pseudo ANSI c)
3060   are shown below.  "p" is the operand of unary operators (and the left-hand
3061   operand of binary operators); "q" is the right-hand operand of binary
3062   operators.
3063
3064   "ZERO" means "a zero-length sequence" below.
3065
3066	Type	left		right		is		in
3067	----	----		-----		--		--
3068	char c	# c		# c		# c		# c
3069
3070	ANYCHAR	ZERO		ZERO		ZERO		ZERO
3071
3072	MBCSET	ZERO		ZERO		ZERO		ZERO
3073
3074	CSET	ZERO		ZERO		ZERO		ZERO
3075
3076	STAR	ZERO		ZERO		ZERO		ZERO
3077
3078	QMARK	ZERO		ZERO		ZERO		ZERO
3079
3080	PLUS	p->left		p->right	ZERO		p->in
3081
3082	CAT	(p->is==ZERO)?	(q->is==ZERO)?	(p->is!=ZERO &&	p->in plus
3083		p->left :	q->right :	q->is!=ZERO) ?	q->in plus
3084		p->is##q->left	p->right##q->is	p->is##q->is :	p->right##q->left
3085						ZERO
3086
3087	OR	longest common	longest common	(do p->is and	substrings common to
3088		leading		trailing	q->is have same	p->in and q->in
3089		(sub)sequence	(sub)sequence	length and
3090		of p->left	of p->right	content) ?
3091		and q->left	and q->right	p->is : NULL
3092
3093   If there's anything else we recognize in the tree, all four sequences get set
3094   to zero-length sequences.  If there's something we don't recognize in the tree,
3095   we just return a zero-length sequence.
3096
3097   Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3098   'aaa')?
3099
3100   And. . .is it here or someplace that we might ponder "optimizations" such as
3101	egrep 'psi|epsilon'	->	egrep 'psi'
3102	egrep 'pepsi|epsilon'	->	egrep 'epsi'
3103					(Yes, we now find "epsi" as a "string
3104					that must occur", but we might also
3105					simplify the *entire* r.e. being sought)
3106	grep '[c]'		->	grep 'c'
3107	grep '(ab|a)b'		->	grep 'ab'
3108	grep 'ab*'		->	grep 'a'
3109	grep 'a*b'		->	grep 'b'
3110
3111   There are several issues:
3112
3113   Is optimization easy (enough)?
3114
3115   Does optimization actually accomplish anything,
3116   or is the automaton you get from "psi|epsilon" (for example)
3117   the same as the one you get from "psi" (for example)?
3118
3119   Are optimizable r.e.'s likely to be used in real-life situations
3120   (something like 'ab*' is probably unlikely; something like is
3121   'psi|epsilon' is likelier)? */
3122
3123static char *
3124icatalloc (char *stem, char *suffix)
3125{
3126  char *result;
3127  size_t stemsize, suffixsize;
3128
3129  suffixsize = (suffix == NULL) ? 0 : strlen(suffix);
3130  if (stem == NULL)
3131    stemsize = 0;
3132  else if (suffixsize == 0)
3133    return stem;
3134  else	stemsize = strlen(stem);
3135  if (stem == NULL)
3136    result = (char *) malloc(suffixsize + 1);
3137  else
3138    result = (char *) realloc((void *) stem, stemsize + suffixsize + 1);
3139  if (result != NULL && suffix != NULL)
3140    (void) strcpy(result + stemsize, suffix);
3141  return result;
3142}
3143
3144static char *
3145icpyalloc (char *string)
3146{
3147  return icatalloc((char *) NULL, string);
3148}
3149
3150static char *
3151istrstr (char *lookin, char *lookfor)
3152{
3153  char *cp;
3154  size_t len;
3155
3156  len = strlen(lookfor);
3157  for (cp = lookin; *cp != '\0'; ++cp)
3158    if (strncmp(cp, lookfor, len) == 0)
3159      return cp;
3160  return NULL;
3161}
3162
3163static void
3164ifree (char *cp)
3165{
3166  if (cp != NULL)
3167    free(cp);
3168}
3169
3170static void
3171freelist (char **cpp)
3172{
3173  int i;
3174
3175  if (cpp == NULL)
3176    return;
3177  for (i = 0; cpp[i] != NULL; ++i)
3178    {
3179      free(cpp[i]);
3180      cpp[i] = NULL;
3181    }
3182}
3183
3184static char **
3185enlist (char **cpp, char *suffix, size_t len)
3186{
3187  int i, j;
3188
3189  if (cpp == NULL)
3190    return NULL;
3191  if ((suffix = icpyalloc(suffix)) == NULL)
3192    {
3193      freelist(cpp);
3194      return NULL;
3195    }
3196  suffix[len] = '\0';
3197  /* Is there already something in the list that's suffix (or longer)? */
3198  for (i = 0; cpp[i] != NULL; ++i)
3199    if (istrstr(cpp[i], suffix) != NULL)
3200      {
3201	free(suffix);
3202	return cpp;
3203      }
3204  /* Eliminate any obsoleted strings. */
3205  j = 0;
3206  while (cpp[j] != NULL)
3207    if (istrstr(suffix, cpp[j]) == NULL)
3208      ++j;
3209    else
3210      {
3211	free(cpp[j]);
3212	if (--i == j)
3213	  break;
3214	cpp[j] = cpp[i];
3215	cpp[i] = NULL;
3216      }
3217  /* Add the new string. */
3218  cpp = (char **) realloc((char *) cpp, (i + 2) * sizeof *cpp);
3219  if (cpp == NULL)
3220    return NULL;
3221  cpp[i] = suffix;
3222  cpp[i + 1] = NULL;
3223  return cpp;
3224}
3225
3226/* Given pointers to two strings, return a pointer to an allocated
3227   list of their distinct common substrings. Return NULL if something
3228   seems wild. */
3229static char **
3230comsubs (char *left, char *right)
3231{
3232  char **cpp;
3233  char *lcp;
3234  char *rcp;
3235  size_t i, len;
3236
3237  if (left == NULL || right == NULL)
3238    return NULL;
3239  cpp = (char **) malloc(sizeof *cpp);
3240  if (cpp == NULL)
3241    return NULL;
3242  cpp[0] = NULL;
3243  for (lcp = left; *lcp != '\0'; ++lcp)
3244    {
3245      len = 0;
3246      rcp = strchr (right, *lcp);
3247      while (rcp != NULL)
3248	{
3249	  for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
3250	    continue;
3251	  if (i > len)
3252	    len = i;
3253	  rcp = strchr (rcp + 1, *lcp);
3254	}
3255      if (len == 0)
3256	continue;
3257      if ((cpp = enlist(cpp, lcp, len)) == NULL)
3258	break;
3259    }
3260  return cpp;
3261}
3262
3263static char **
3264addlists (char **old, char **suffixes)
3265{
3266  int i;
3267
3268  if (old == NULL || suffixes == NULL)
3269    return NULL;
3270  for (i = 0; suffixes[i] != NULL; ++i)
3271    {
3272      old = enlist(old, suffixes[i], strlen(suffixes[i]));
3273      if (old == NULL)
3274	break;
3275    }
3276  return old;
3277}
3278
3279/* Given two lists of substrings, return a new list giving substrings
3280   common to both. */
3281static char **
3282inboth (char **left, char **right)
3283{
3284  char **both;
3285  char **temp;
3286  int lnum, rnum;
3287
3288  if (left == NULL || right == NULL)
3289    return NULL;
3290  both = (char **) malloc(sizeof *both);
3291  if (both == NULL)
3292    return NULL;
3293  both[0] = NULL;
3294  for (lnum = 0; left[lnum] != NULL; ++lnum)
3295    {
3296      for (rnum = 0; right[rnum] != NULL; ++rnum)
3297	{
3298	  temp = comsubs(left[lnum], right[rnum]);
3299	  if (temp == NULL)
3300	    {
3301	      freelist(both);
3302	      return NULL;
3303	    }
3304	  both = addlists(both, temp);
3305	  freelist(temp);
3306	  free(temp);
3307	  if (both == NULL)
3308	    return NULL;
3309	}
3310    }
3311  return both;
3312}
3313
3314typedef struct
3315{
3316  char **in;
3317  char *left;
3318  char *right;
3319  char *is;
3320} must;
3321
3322static void
3323resetmust (must *mp)
3324{
3325  mp->left[0] = mp->right[0] = mp->is[0] = '\0';
3326  freelist(mp->in);
3327}
3328
3329static void
3330dfamust (struct dfa *dfa)
3331{
3332  must *musts;
3333  must *mp;
3334  char *result;
3335  int ri;
3336  int i;
3337  int exact;
3338  token t;
3339  static must must0;
3340  struct dfamust *dm;
3341  static char empty_string[] = "";
3342
3343  result = empty_string;
3344  exact = 0;
3345  musts = (must *) malloc((dfa->tindex + 1) * sizeof *musts);
3346  if (musts == NULL)
3347    return;
3348  mp = musts;
3349  for (i = 0; i <= dfa->tindex; ++i)
3350    mp[i] = must0;
3351  for (i = 0; i <= dfa->tindex; ++i)
3352    {
3353      mp[i].in = (char **) malloc(sizeof *mp[i].in);
3354      mp[i].left = (char *) malloc(2);
3355      mp[i].right = (char *) malloc(2);
3356      mp[i].is = (char *) malloc(2);
3357      if (mp[i].in == NULL || mp[i].left == NULL ||
3358	  mp[i].right == NULL || mp[i].is == NULL)
3359	goto done;
3360      mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0';
3361      mp[i].in[0] = NULL;
3362    }
3363#ifdef DEBUG
3364  fprintf(stderr, "dfamust:\n");
3365  for (i = 0; i < dfa->tindex; ++i)
3366    {
3367      fprintf(stderr, " %d:", i);
3368      prtok(dfa->tokens[i]);
3369    }
3370  putc('\n', stderr);
3371#endif
3372  for (ri = 0; ri < dfa->tindex; ++ri)
3373    {
3374      switch (t = dfa->tokens[ri])
3375	{
3376	case LPAREN:
3377	case RPAREN:
3378	  goto done;		/* "cannot happen" */
3379	case EMPTY:
3380	case BEGLINE:
3381	case ENDLINE:
3382	case BEGWORD:
3383	case ENDWORD:
3384	case LIMWORD:
3385	case NOTLIMWORD:
3386	case BACKREF:
3387	  resetmust(mp);
3388	  break;
3389	case STAR:
3390	case QMARK:
3391	  if (mp <= musts)
3392	    goto done;		/* "cannot happen" */
3393	  --mp;
3394	  resetmust(mp);
3395	  break;
3396	case OR:
3397	case ORTOP:
3398	  if (mp < &musts[2])
3399	    goto done;		/* "cannot happen" */
3400	  {
3401	    char **common;
3402	    must *lmp;
3403	    must *rmp;
3404	    int j, ln, rn, n;
3405
3406	    rmp = --mp;
3407	    lmp = --mp;
3408	    /* Guaranteed to be.  Unlikely, but. . . */
3409	    if (strcmp(lmp->is, rmp->is) != 0)
3410	      lmp->is[0] = '\0';
3411	    /* Left side--easy */
3412	    i = 0;
3413	    while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
3414	      ++i;
3415	    lmp->left[i] = '\0';
3416	    /* Right side */
3417	    ln = strlen(lmp->right);
3418	    rn = strlen(rmp->right);
3419	    n = ln;
3420	    if (n > rn)
3421	      n = rn;
3422	    for (i = 0; i < n; ++i)
3423	      if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
3424		break;
3425	    for (j = 0; j < i; ++j)
3426	      lmp->right[j] = lmp->right[(ln - i) + j];
3427	    lmp->right[j] = '\0';
3428	    common = inboth(lmp->in, rmp->in);
3429	    if (common == NULL)
3430	      goto done;
3431	    freelist(lmp->in);
3432	    free((char *) lmp->in);
3433	    lmp->in = common;
3434	  }
3435	  break;
3436	case PLUS:
3437	  if (mp <= musts)
3438	    goto done;		/* "cannot happen" */
3439	  --mp;
3440	  mp->is[0] = '\0';
3441	  break;
3442	case END:
3443	  if (mp != &musts[1])
3444	    goto done;		/* "cannot happen" */
3445	  for (i = 0; musts[0].in[i] != NULL; ++i)
3446	    if (strlen(musts[0].in[i]) > strlen(result))
3447	      result = musts[0].in[i];
3448	  if (strcmp(result, musts[0].is) == 0)
3449	    exact = 1;
3450	  goto done;
3451	case CAT:
3452	  if (mp < &musts[2])
3453	    goto done;		/* "cannot happen" */
3454	  {
3455	    must *lmp;
3456	    must *rmp;
3457
3458	    rmp = --mp;
3459	    lmp = --mp;
3460	    /* In.  Everything in left, plus everything in
3461	       right, plus catenation of
3462	       left's right and right's left. */
3463	    lmp->in = addlists(lmp->in, rmp->in);
3464	    if (lmp->in == NULL)
3465	      goto done;
3466	    if (lmp->right[0] != '\0' &&
3467		rmp->left[0] != '\0')
3468	      {
3469		char *tp;
3470
3471		tp = icpyalloc(lmp->right);
3472		if (tp == NULL)
3473		  goto done;
3474		tp = icatalloc(tp, rmp->left);
3475		if (tp == NULL)
3476		  goto done;
3477		lmp->in = enlist(lmp->in, tp,
3478				 strlen(tp));
3479		free(tp);
3480		if (lmp->in == NULL)
3481		  goto done;
3482	      }
3483	    /* Left-hand */
3484	    if (lmp->is[0] != '\0')
3485	      {
3486		lmp->left = icatalloc(lmp->left,
3487				      rmp->left);
3488		if (lmp->left == NULL)
3489		  goto done;
3490	      }
3491	    /* Right-hand */
3492	    if (rmp->is[0] == '\0')
3493	      lmp->right[0] = '\0';
3494	    lmp->right = icatalloc(lmp->right, rmp->right);
3495	    if (lmp->right == NULL)
3496	      goto done;
3497	    /* Guaranteed to be */
3498	    if (lmp->is[0] != '\0' && rmp->is[0] != '\0')
3499	      {
3500		lmp->is = icatalloc(lmp->is, rmp->is);
3501		if (lmp->is == NULL)
3502		  goto done;
3503	      }
3504	    else
3505	      lmp->is[0] = '\0';
3506	  }
3507	  break;
3508	default:
3509	  if (t < END)
3510	    {
3511	      /* "cannot happen" */
3512	      goto done;
3513	    }
3514	  else if (t == '\0')
3515	    {
3516	      /* not on *my* shift */
3517	      goto done;
3518	    }
3519	  else if (t >= CSET
3520#ifdef MBS_SUPPORT
3521		   || t == ANYCHAR
3522		   || t == MBCSET
3523#endif /* MBS_SUPPORT */
3524		   )
3525	    {
3526	      /* easy enough */
3527	      resetmust(mp);
3528	    }
3529	  else
3530	    {
3531	      /* plain character */
3532	      resetmust(mp);
3533	      mp->is[0] = mp->left[0] = mp->right[0] = t;
3534	      mp->is[1] = mp->left[1] = mp->right[1] = '\0';
3535	      mp->in = enlist(mp->in, mp->is, (size_t)1);
3536	      if (mp->in == NULL)
3537		goto done;
3538	    }
3539	  break;
3540	}
3541#ifdef DEBUG
3542      fprintf(stderr, " node: %d:", ri);
3543      prtok(dfa->tokens[ri]);
3544      fprintf(stderr, "\n  in:");
3545      for (i = 0; mp->in[i]; ++i)
3546	fprintf(stderr, " \"%s\"", mp->in[i]);
3547      fprintf(stderr, "\n  is: \"%s\"\n", mp->is);
3548      fprintf(stderr, "  left: \"%s\"\n", mp->left);
3549      fprintf(stderr, "  right: \"%s\"\n", mp->right);
3550#endif
3551      ++mp;
3552    }
3553 done:
3554  if (strlen(result))
3555    {
3556      dm = (struct dfamust *) malloc(sizeof (struct dfamust));
3557      dm->exact = exact;
3558      dm->must = (char *) malloc(strlen(result) + 1);
3559      strcpy(dm->must, result);
3560      dm->next = dfa->musts;
3561      dfa->musts = dm;
3562    }
3563  mp = musts;
3564  for (i = 0; i <= dfa->tindex; ++i)
3565    {
3566      freelist(mp[i].in);
3567      ifree((char *) mp[i].in);
3568      ifree(mp[i].left);
3569      ifree(mp[i].right);
3570      ifree(mp[i].is);
3571    }
3572  free((char *) mp);
3573}
3574/* vim:set shiftwidth=2: */
3575