dfa.c revision 56919
1/* dfa.c - deterministic extended regexp routines for GNU
2   Copyright (C) 1988, 1998 Free Software Foundation, Inc.
3
4   This program is free software; you can redistribute it and/or modify
5   it under the terms of the GNU General Public License as published by
6   the Free Software Foundation; either version 2, or (at your option)
7   any later version.
8
9   This program is distributed in the hope that it will be useful,
10   but WITHOUT ANY WARRANTY; without even the implied warranty of
11   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12   GNU General Public License for more details.
13
14   You should have received a copy of the GNU General Public License
15   along with this program; if not, write to the Free Software
16   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA */
17
18/* Written June, 1988 by Mike Haertel
19   Modified July, 1988 by Arthur David Olson to assist BMG speedups  */
20
21/* $FreeBSD: head/gnu/usr.bin/grep/dfa.c 56919 2000-01-31 13:28:08Z ru $ */
22
23#ifdef HAVE_CONFIG_H
24#include <config.h>
25#endif
26
27#include <assert.h>
28#include <ctype.h>
29#include <stdio.h>
30
31#include <sys/types.h>
32#ifdef STDC_HEADERS
33#include <stdlib.h>
34#else
35extern char *calloc(), *malloc(), *realloc();
36extern void free();
37#endif
38
39#if defined(HAVE_STRING_H) || defined(STDC_HEADERS)
40#include <string.h>
41#undef index
42#define index strchr
43#else
44#include <strings.h>
45#endif
46
47#ifndef DEBUG	/* use the same approach as regex.c */
48#undef assert
49#define assert(e)
50#endif /* DEBUG */
51
52#ifndef isgraph
53#define isgraph(C) (isprint(C) && !isspace(C))
54#endif
55
56#if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
57#define ISALPHA(C) isalpha(C)
58#define ISUPPER(C) isupper(C)
59#define ISLOWER(C) islower(C)
60#define ISDIGIT(C) isdigit(C)
61#define ISXDIGIT(C) isxdigit(C)
62#define ISSPACE(C) isspace(C)
63#define ISPUNCT(C) ispunct(C)
64#define ISALNUM(C) isalnum(C)
65#define ISPRINT(C) isprint(C)
66#define ISGRAPH(C) isgraph(C)
67#define ISCNTRL(C) iscntrl(C)
68#else
69#define ISALPHA(C) (isascii(C) && isalpha(C))
70#define ISUPPER(C) (isascii(C) && isupper(C))
71#define ISLOWER(C) (isascii(C) && islower(C))
72#define ISDIGIT(C) (isascii(C) && isdigit(C))
73#define ISXDIGIT(C) (isascii(C) && isxdigit(C))
74#define ISSPACE(C) (isascii(C) && isspace(C))
75#define ISPUNCT(C) (isascii(C) && ispunct(C))
76#define ISALNUM(C) (isascii(C) && isalnum(C))
77#define ISPRINT(C) (isascii(C) && isprint(C))
78#define ISGRAPH(C) (isascii(C) && isgraph(C))
79#define ISCNTRL(C) (isascii(C) && iscntrl(C))
80#endif
81
82/* ISASCIIDIGIT differs from ISDIGIT, as follows:
83   - Its arg may be any int or unsigned int; it need not be an unsigned char.
84   - It's guaranteed to evaluate its argument exactly once.
85   - It's typically faster.
86   Posix 1003.2-1992 section 2.5.2.1 page 50 lines 1556-1558 says that
87   only '0' through '9' are digits.  Prefer ISASCIIDIGIT to ISDIGIT unless
88   it's important to use the locale's definition of `digit' even when the
89   host does not conform to Posix.  */
90#define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9)
91
92/* If we (don't) have I18N.  */
93/* glibc defines _ */
94#ifndef _
95# ifdef HAVE_LIBINTL_H
96#  include <libintl.h>
97#  ifndef _
98#   define _(Str) gettext (Str)
99#  endif
100# else
101#  define _(Str) (Str)
102# endif
103#endif
104
105#ifdef __FreeBSD__
106#include <gnuregex.h>
107#else
108#include "regex.h"
109#endif
110#include "dfa.h"
111
112/* HPUX, define those as macros in sys/param.h */
113#ifdef setbit
114# undef setbit
115#endif
116#ifdef clrbit
117# undef clrbit
118#endif
119
120static void dfamust PARAMS ((struct dfa *dfa));
121
122static ptr_t xcalloc PARAMS ((size_t n, size_t s));
123static ptr_t xmalloc PARAMS ((size_t n));
124static ptr_t xrealloc PARAMS ((ptr_t p, size_t n));
125#ifdef DEBUG
126static void prtok PARAMS ((token t));
127#endif
128static int tstbit PARAMS ((int b, charclass c));
129static void setbit PARAMS ((int b, charclass c));
130static void clrbit PARAMS ((int b, charclass c));
131static void copyset PARAMS ((charclass src, charclass dst));
132static void zeroset PARAMS ((charclass s));
133static void notset PARAMS ((charclass s));
134static int equal PARAMS ((charclass s1, charclass s2));
135static int charclass_index PARAMS ((charclass s));
136static int looking_at PARAMS ((const char *s));
137static token lex PARAMS ((void));
138static void addtok PARAMS ((token t));
139static void atom PARAMS ((void));
140static int nsubtoks PARAMS ((int tindex));
141static void copytoks PARAMS ((int tindex, int ntokens));
142static void closure PARAMS ((void));
143static void branch PARAMS ((void));
144static void regexp PARAMS ((int toplevel));
145static void copy PARAMS ((position_set *src, position_set *dst));
146static void insert PARAMS ((position p, position_set *s));
147static void merge PARAMS ((position_set *s1, position_set *s2, position_set *m));
148static void delete PARAMS ((position p, position_set *s));
149static int state_index PARAMS ((struct dfa *d, position_set *s,
150			  int newline, int letter));
151static void build_state PARAMS ((int s, struct dfa *d));
152static void build_state_zero PARAMS ((struct dfa *d));
153static char *icatalloc PARAMS ((char *old, char *new));
154static char *icpyalloc PARAMS ((char *string));
155static char *istrstr PARAMS ((char *lookin, char *lookfor));
156static void ifree PARAMS ((char *cp));
157static void freelist PARAMS ((char **cpp));
158static char **enlist PARAMS ((char **cpp, char *new, size_t len));
159static char **comsubs PARAMS ((char *left, char *right));
160static char **addlists PARAMS ((char **old, char **new));
161static char **inboth PARAMS ((char **left, char **right));
162
163static ptr_t
164xcalloc (size_t n, size_t s)
165{
166  ptr_t r = calloc(n, s);
167
168  if (!r)
169    dfaerror(_("Memory exhausted"));
170  return r;
171}
172
173static ptr_t
174xmalloc (size_t n)
175{
176  ptr_t r = malloc(n);
177
178  assert(n != 0);
179  if (!r)
180    dfaerror(_("Memory exhausted"));
181  return r;
182}
183
184static ptr_t
185xrealloc (ptr_t p, size_t n)
186{
187  ptr_t r = realloc(p, n);
188
189  assert(n != 0);
190  if (!r)
191    dfaerror(_("Memory exhausted"));
192  return r;
193}
194
195#define CALLOC(p, t, n) ((p) = (t *) xcalloc((size_t)(n), sizeof (t)))
196#define MALLOC(p, t, n) ((p) = (t *) xmalloc((n) * sizeof (t)))
197#define REALLOC(p, t, n) ((p) = (t *) xrealloc((ptr_t) (p), (n) * sizeof (t)))
198
199/* Reallocate an array of type t if nalloc is too small for index. */
200#define REALLOC_IF_NECESSARY(p, t, nalloc, index) \
201  if ((index) >= (nalloc))			  \
202    {						  \
203      while ((index) >= (nalloc))		  \
204	(nalloc) *= 2;				  \
205      REALLOC(p, t, nalloc);			  \
206    }
207
208#ifdef DEBUG
209
210static void
211prtok (token t)
212{
213  char *s;
214
215  if (t < 0)
216    fprintf(stderr, "END");
217  else if (t < NOTCHAR)
218    fprintf(stderr, "%c", t);
219  else
220    {
221      switch (t)
222	{
223	case EMPTY: s = "EMPTY"; break;
224	case BACKREF: s = "BACKREF"; break;
225	case BEGLINE: s = "BEGLINE"; break;
226	case ENDLINE: s = "ENDLINE"; break;
227	case BEGWORD: s = "BEGWORD"; break;
228	case ENDWORD: s = "ENDWORD"; break;
229	case LIMWORD: s = "LIMWORD"; break;
230	case NOTLIMWORD: s = "NOTLIMWORD"; break;
231	case QMARK: s = "QMARK"; break;
232	case STAR: s = "STAR"; break;
233	case PLUS: s = "PLUS"; break;
234	case CAT: s = "CAT"; break;
235	case OR: s = "OR"; break;
236	case ORTOP: s = "ORTOP"; break;
237	case LPAREN: s = "LPAREN"; break;
238	case RPAREN: s = "RPAREN"; break;
239	default: s = "CSET"; break;
240	}
241      fprintf(stderr, "%s", s);
242    }
243}
244#endif /* DEBUG */
245
246/* Stuff pertaining to charclasses. */
247
248static int
249tstbit (int b, charclass c)
250{
251  return c[b / INTBITS] & 1 << b % INTBITS;
252}
253
254static void
255setbit (int b, charclass c)
256{
257  c[b / INTBITS] |= 1 << b % INTBITS;
258}
259
260static void
261clrbit (int b, charclass c)
262{
263  c[b / INTBITS] &= ~(1 << b % INTBITS);
264}
265
266static void
267copyset (charclass src, charclass dst)
268{
269  int i;
270
271  for (i = 0; i < CHARCLASS_INTS; ++i)
272    dst[i] = src[i];
273}
274
275static void
276zeroset (charclass s)
277{
278  int i;
279
280  for (i = 0; i < CHARCLASS_INTS; ++i)
281    s[i] = 0;
282}
283
284static void
285notset (charclass s)
286{
287  int i;
288
289  for (i = 0; i < CHARCLASS_INTS; ++i)
290    s[i] = ~s[i];
291}
292
293static int
294equal (charclass s1, charclass s2)
295{
296  int i;
297
298  for (i = 0; i < CHARCLASS_INTS; ++i)
299    if (s1[i] != s2[i])
300      return 0;
301  return 1;
302}
303
304/* A pointer to the current dfa is kept here during parsing. */
305static struct dfa *dfa;
306
307/* Find the index of charclass s in dfa->charclasses, or allocate a new charclass. */
308static int
309charclass_index (charclass s)
310{
311  int i;
312
313  for (i = 0; i < dfa->cindex; ++i)
314    if (equal(s, dfa->charclasses[i]))
315      return i;
316  REALLOC_IF_NECESSARY(dfa->charclasses, charclass, dfa->calloc, dfa->cindex);
317  ++dfa->cindex;
318  copyset(s, dfa->charclasses[i]);
319  return i;
320}
321
322/* Syntax bits controlling the behavior of the lexical analyzer. */
323static reg_syntax_t syntax_bits, syntax_bits_set;
324
325/* Flag for case-folding letters into sets. */
326static int case_fold;
327
328/* End-of-line byte in data.  */
329static unsigned char eolbyte;
330
331/* Entry point to set syntax options. */
332void
333dfasyntax (reg_syntax_t bits, int fold, int eol)
334{
335  syntax_bits_set = 1;
336  syntax_bits = bits;
337  case_fold = fold;
338  eolbyte = eol;
339}
340
341/* Lexical analyzer.  All the dross that deals with the obnoxious
342   GNU Regex syntax bits is located here.  The poor, suffering
343   reader is referred to the GNU Regex documentation for the
344   meaning of the @#%!@#%^!@ syntax bits. */
345
346static char *lexstart;		/* Pointer to beginning of input string. */
347static char *lexptr;		/* Pointer to next input character. */
348static int lexleft;		/* Number of characters remaining. */
349static token lasttok;		/* Previous token returned; initially END. */
350static int laststart;		/* True if we're separated from beginning or (, |
351				   only by zero-width characters. */
352static int parens;		/* Count of outstanding left parens. */
353static int minrep, maxrep;	/* Repeat counts for {m,n}. */
354
355/* Note that characters become unsigned here. */
356#define FETCH(c, eoferr)   	      \
357  {			   	      \
358    if (! lexleft)	   	      \
359      {				      \
360	if (eoferr != 0)	      \
361	  dfaerror (eoferr);	      \
362	else		   	      \
363	  return lasttok = END;	      \
364      }				      \
365    (c) = (unsigned char) *lexptr++;  \
366    --lexleft;		   	      \
367  }
368
369#ifdef __STDC__
370#define FUNC(F, P) static int F(int c) { return P(c); }
371#else
372#define FUNC(F, P) static int F(c) int c; { return P(c); }
373#endif
374
375FUNC(is_alpha, ISALPHA)
376FUNC(is_upper, ISUPPER)
377FUNC(is_lower, ISLOWER)
378FUNC(is_digit, ISDIGIT)
379FUNC(is_xdigit, ISXDIGIT)
380FUNC(is_space, ISSPACE)
381FUNC(is_punct, ISPUNCT)
382FUNC(is_alnum, ISALNUM)
383FUNC(is_print, ISPRINT)
384FUNC(is_graph, ISGRAPH)
385FUNC(is_cntrl, ISCNTRL)
386
387static int
388is_blank (int c)
389{
390   return (c == ' ' || c == '\t');
391}
392
393/* The following list maps the names of the Posix named character classes
394   to predicate functions that determine whether a given character is in
395   the class.  The leading [ has already been eaten by the lexical analyzer. */
396static struct {
397  const char *name;
398  int (*pred) PARAMS ((int));
399} prednames[] = {
400  { ":alpha:]", is_alpha },
401  { ":upper:]", is_upper },
402  { ":lower:]", is_lower },
403  { ":digit:]", is_digit },
404  { ":xdigit:]", is_xdigit },
405  { ":space:]", is_space },
406  { ":punct:]", is_punct },
407  { ":alnum:]", is_alnum },
408  { ":print:]", is_print },
409  { ":graph:]", is_graph },
410  { ":cntrl:]", is_cntrl },
411  { ":blank:]", is_blank },
412  { 0 }
413};
414
415/* Return non-zero if C is a `word-constituent' byte; zero otherwise.  */
416#define IS_WORD_CONSTITUENT(C) (ISALNUM(C) || (C) == '_')
417
418static int
419looking_at (char const *s)
420{
421  size_t len;
422
423  len = strlen(s);
424  if (lexleft < len)
425    return 0;
426  return strncmp(s, lexptr, len) == 0;
427}
428
429static token
430lex (void)
431{
432  token c, c1, c2;
433  int backslash = 0, invert;
434  charclass ccl;
435  int i;
436  char lo[2];
437  char hi[2];
438
439  /* Basic plan: We fetch a character.  If it's a backslash,
440     we set the backslash flag and go through the loop again.
441     On the plus side, this avoids having a duplicate of the
442     main switch inside the backslash case.  On the minus side,
443     it means that just about every case begins with
444     "if (backslash) ...".  */
445  for (i = 0; i < 2; ++i)
446    {
447      FETCH(c, 0);
448      switch (c)
449	{
450	case '\\':
451	  if (backslash)
452	    goto normal_char;
453	  if (lexleft == 0)
454	    dfaerror(_("Unfinished \\ escape"));
455	  backslash = 1;
456	  break;
457
458	case '^':
459	  if (backslash)
460	    goto normal_char;
461	  if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
462	      || lasttok == END
463	      || lasttok == LPAREN
464	      || lasttok == OR)
465	    return lasttok = BEGLINE;
466	  goto normal_char;
467
468	case '$':
469	  if (backslash)
470	    goto normal_char;
471	  if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
472	      || lexleft == 0
473	      || (syntax_bits & RE_NO_BK_PARENS
474		  ? lexleft > 0 && *lexptr == ')'
475		  : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == ')')
476	      || (syntax_bits & RE_NO_BK_VBAR
477		  ? lexleft > 0 && *lexptr == '|'
478		  : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == '|')
479	      || ((syntax_bits & RE_NEWLINE_ALT)
480	          && lexleft > 0 && *lexptr == '\n'))
481	    return lasttok = ENDLINE;
482	  goto normal_char;
483
484	case '1':
485	case '2':
486	case '3':
487	case '4':
488	case '5':
489	case '6':
490	case '7':
491	case '8':
492	case '9':
493	  if (backslash && !(syntax_bits & RE_NO_BK_REFS))
494	    {
495	      laststart = 0;
496	      return lasttok = BACKREF;
497	    }
498	  goto normal_char;
499
500	case '`':
501	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
502	    return lasttok = BEGLINE;	/* FIXME: should be beginning of string */
503	  goto normal_char;
504
505	case '\'':
506	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
507	    return lasttok = ENDLINE;	/* FIXME: should be end of string */
508	  goto normal_char;
509
510	case '<':
511	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
512	    return lasttok = BEGWORD;
513	  goto normal_char;
514
515	case '>':
516	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
517	    return lasttok = ENDWORD;
518	  goto normal_char;
519
520	case 'b':
521	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
522	    return lasttok = LIMWORD;
523	  goto normal_char;
524
525	case 'B':
526	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
527	    return lasttok = NOTLIMWORD;
528	  goto normal_char;
529
530	case '?':
531	  if (syntax_bits & RE_LIMITED_OPS)
532	    goto normal_char;
533	  if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
534	    goto normal_char;
535	  if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
536	    goto normal_char;
537	  return lasttok = QMARK;
538
539	case '*':
540	  if (backslash)
541	    goto normal_char;
542	  if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
543	    goto normal_char;
544	  return lasttok = STAR;
545
546	case '+':
547	  if (syntax_bits & RE_LIMITED_OPS)
548	    goto normal_char;
549	  if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
550	    goto normal_char;
551	  if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
552	    goto normal_char;
553	  return lasttok = PLUS;
554
555	case '{':
556	  if (!(syntax_bits & RE_INTERVALS))
557	    goto normal_char;
558	  if (backslash != ((syntax_bits & RE_NO_BK_BRACES) == 0))
559	    goto normal_char;
560	  if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
561	    goto normal_char;
562
563	  if (syntax_bits & RE_NO_BK_BRACES)
564	    {
565	      /* Scan ahead for a valid interval; if it's not valid,
566		 treat it as a literal '{'.  */
567	      int lo = -1, hi = -1;
568	      char const *p = lexptr;
569	      char const *lim = p + lexleft;
570	      for (;  p != lim && ISASCIIDIGIT (*p);  p++)
571		lo = (lo < 0 ? 0 : lo * 10) + *p - '0';
572	      if (p != lim && *p == ',')
573		while (++p != lim && ISASCIIDIGIT (*p))
574		  hi = (hi < 0 ? 0 : hi * 10) + *p - '0';
575	      else
576		hi = lo;
577	      if (p == lim || *p != '}'
578		  || lo < 0 || RE_DUP_MAX < hi || (0 <= hi && hi < lo))
579		goto normal_char;
580	    }
581
582	  minrep = 0;
583	  /* Cases:
584	     {M} - exact count
585	     {M,} - minimum count, maximum is infinity
586	     {M,N} - M through N */
587	  FETCH(c, _("unfinished repeat count"));
588	  if (ISASCIIDIGIT (c))
589	    {
590	      minrep = c - '0';
591	      for (;;)
592		{
593		  FETCH(c, _("unfinished repeat count"));
594		  if (! ISASCIIDIGIT (c))
595		    break;
596		  minrep = 10 * minrep + c - '0';
597		}
598	    }
599	  else
600	    dfaerror(_("malformed repeat count"));
601	  if (c == ',')
602	    {
603	      FETCH (c, _("unfinished repeat count"));
604	      if (! ISASCIIDIGIT (c))
605		maxrep = -1;
606	      else
607		{
608		  maxrep = c - '0';
609		  for (;;)
610		    {
611		      FETCH (c, _("unfinished repeat count"));
612		      if (! ISASCIIDIGIT (c))
613			break;
614		      maxrep = 10 * maxrep + c - '0';
615		    }
616		  if (0 <= maxrep && maxrep < minrep)
617		    dfaerror (_("malformed repeat count"));
618		}
619	    }
620	  else
621	    maxrep = minrep;
622	  if (!(syntax_bits & RE_NO_BK_BRACES))
623	    {
624	      if (c != '\\')
625		dfaerror(_("malformed repeat count"));
626	      FETCH(c, _("unfinished repeat count"));
627	    }
628	  if (c != '}')
629	    dfaerror(_("malformed repeat count"));
630	  laststart = 0;
631	  return lasttok = REPMN;
632
633	case '|':
634	  if (syntax_bits & RE_LIMITED_OPS)
635	    goto normal_char;
636	  if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0))
637	    goto normal_char;
638	  laststart = 1;
639	  return lasttok = OR;
640
641	case '\n':
642	  if (syntax_bits & RE_LIMITED_OPS
643	      || backslash
644	      || !(syntax_bits & RE_NEWLINE_ALT))
645	    goto normal_char;
646	  laststart = 1;
647	  return lasttok = OR;
648
649	case '(':
650	  if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
651	    goto normal_char;
652	  ++parens;
653	  laststart = 1;
654	  return lasttok = LPAREN;
655
656	case ')':
657	  if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
658	    goto normal_char;
659	  if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
660	    goto normal_char;
661	  --parens;
662	  laststart = 0;
663	  return lasttok = RPAREN;
664
665	case '.':
666	  if (backslash)
667	    goto normal_char;
668	  zeroset(ccl);
669	  notset(ccl);
670	  if (!(syntax_bits & RE_DOT_NEWLINE))
671	    clrbit(eolbyte, ccl);
672	  if (syntax_bits & RE_DOT_NOT_NULL)
673	    clrbit('\0', ccl);
674	  laststart = 0;
675	  return lasttok = CSET + charclass_index(ccl);
676
677	case 'w':
678	case 'W':
679	  if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
680	    goto normal_char;
681	  zeroset(ccl);
682	  for (c2 = 0; c2 < NOTCHAR; ++c2)
683	    if (IS_WORD_CONSTITUENT(c2))
684	      setbit(c2, ccl);
685	  if (c == 'W')
686	    notset(ccl);
687	  laststart = 0;
688	  return lasttok = CSET + charclass_index(ccl);
689
690	case '[':
691	  if (backslash)
692	    goto normal_char;
693	  zeroset(ccl);
694	  FETCH(c, _("Unbalanced ["));
695	  if (c == '^')
696	    {
697	      FETCH(c, _("Unbalanced ["));
698	      invert = 1;
699	    }
700	  else
701	    invert = 0;
702	  do
703	    {
704	      /* Nobody ever said this had to be fast. :-)
705		 Note that if we're looking at some other [:...:]
706		 construct, we just treat it as a bunch of ordinary
707		 characters.  We can do this because we assume
708		 regex has checked for syntax errors before
709		 dfa is ever called. */
710	      if (c == '[' && (syntax_bits & RE_CHAR_CLASSES))
711		for (c1 = 0; prednames[c1].name; ++c1)
712		  if (looking_at(prednames[c1].name))
713		    {
714			int (*pred)() = prednames[c1].pred;
715			if (case_fold
716			    && (pred == is_upper || pred == is_lower))
717				pred = is_alpha;
718
719		      for (c2 = 0; c2 < NOTCHAR; ++c2)
720			if ((*pred)(c2))
721			  setbit(c2, ccl);
722		      lexptr += strlen(prednames[c1].name);
723		      lexleft -= strlen(prednames[c1].name);
724		      FETCH(c1, _("Unbalanced ["));
725		      goto skip;
726		    }
727	      if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
728		FETCH(c, _("Unbalanced ["));
729	      FETCH(c1, _("Unbalanced ["));
730	      if (c1 == '-')
731		{
732		  FETCH(c2, _("Unbalanced ["));
733		  if (c2 == ']')
734		    {
735		      /* In the case [x-], the - is an ordinary hyphen,
736			 which is left in c1, the lookahead character. */
737		      --lexptr;
738		      ++lexleft;
739		      c2 = c;
740		    }
741		  else
742		    {
743		      if (c2 == '\\'
744			  && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
745			FETCH(c2, _("Unbalanced ["));
746		      FETCH(c1, _("Unbalanced ["));
747		    }
748		}
749	      else
750		c2 = c;
751
752	      lo[0] = c;  lo[1] = '\0';
753	      hi[0] = c2; hi[1] = '\0';
754	      for (c = 0; c < NOTCHAR; c++)
755		{
756		  char ch[2];
757		  ch[0] = c;  ch[1] = '\0';
758		  if (strcoll (lo, ch) <= 0 && strcoll (ch, hi) <= 0)
759		    {
760		      setbit (c, ccl);
761		      if (case_fold)
762			{
763			  if (ISUPPER (c))
764			    setbit (tolower (c), ccl);
765			  else if (ISLOWER (c))
766			    setbit (toupper (c), ccl);
767			}
768		    }
769		}
770
771	    skip:
772	      ;
773	    }
774	  while ((c = c1) != ']');
775	  if (invert)
776	    {
777	      notset(ccl);
778	      if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
779		clrbit(eolbyte, ccl);
780	    }
781	  laststart = 0;
782	  return lasttok = CSET + charclass_index(ccl);
783
784	default:
785	normal_char:
786	  laststart = 0;
787	  if (case_fold && ISALPHA(c))
788	    {
789	      zeroset(ccl);
790	      setbit(c, ccl);
791	      if (isupper(c))
792		setbit(tolower(c), ccl);
793	      else
794		setbit(toupper(c), ccl);
795	      return lasttok = CSET + charclass_index(ccl);
796	    }
797	  return c;
798	}
799    }
800
801  /* The above loop should consume at most a backslash
802     and some other character. */
803  abort();
804  return END;	/* keeps pedantic compilers happy. */
805}
806
807/* Recursive descent parser for regular expressions. */
808
809static token tok;		/* Lookahead token. */
810static int depth;		/* Current depth of a hypothetical stack
811				   holding deferred productions.  This is
812				   used to determine the depth that will be
813				   required of the real stack later on in
814				   dfaanalyze(). */
815
816/* Add the given token to the parse tree, maintaining the depth count and
817   updating the maximum depth if necessary. */
818static void
819addtok (token t)
820{
821  REALLOC_IF_NECESSARY(dfa->tokens, token, dfa->talloc, dfa->tindex);
822  dfa->tokens[dfa->tindex++] = t;
823
824  switch (t)
825    {
826    case QMARK:
827    case STAR:
828    case PLUS:
829      break;
830
831    case CAT:
832    case OR:
833    case ORTOP:
834      --depth;
835      break;
836
837    default:
838      ++dfa->nleaves;
839    case EMPTY:
840      ++depth;
841      break;
842    }
843  if (depth > dfa->depth)
844    dfa->depth = depth;
845}
846
847/* The grammar understood by the parser is as follows.
848
849   regexp:
850     regexp OR branch
851     branch
852
853   branch:
854     branch closure
855     closure
856
857   closure:
858     closure QMARK
859     closure STAR
860     closure PLUS
861     atom
862
863   atom:
864     <normal character>
865     CSET
866     BACKREF
867     BEGLINE
868     ENDLINE
869     BEGWORD
870     ENDWORD
871     LIMWORD
872     NOTLIMWORD
873     <empty>
874
875   The parser builds a parse tree in postfix form in an array of tokens. */
876
877static void
878atom (void)
879{
880  if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
881      || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
882      || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD)
883    {
884      addtok(tok);
885      tok = lex();
886    }
887  else if (tok == LPAREN)
888    {
889      tok = lex();
890      regexp(0);
891      if (tok != RPAREN)
892	dfaerror(_("Unbalanced ("));
893      tok = lex();
894    }
895  else
896    addtok(EMPTY);
897}
898
899/* Return the number of tokens in the given subexpression. */
900static int
901nsubtoks (int tindex)
902{
903  int ntoks1;
904
905  switch (dfa->tokens[tindex - 1])
906    {
907    default:
908      return 1;
909    case QMARK:
910    case STAR:
911    case PLUS:
912      return 1 + nsubtoks(tindex - 1);
913    case CAT:
914    case OR:
915    case ORTOP:
916      ntoks1 = nsubtoks(tindex - 1);
917      return 1 + ntoks1 + nsubtoks(tindex - 1 - ntoks1);
918    }
919}
920
921/* Copy the given subexpression to the top of the tree. */
922static void
923copytoks (int tindex, int ntokens)
924{
925  int i;
926
927  for (i = 0; i < ntokens; ++i)
928    addtok(dfa->tokens[tindex + i]);
929}
930
931static void
932closure (void)
933{
934  int tindex, ntokens, i;
935
936  atom();
937  while (tok == QMARK || tok == STAR || tok == PLUS || tok == REPMN)
938    if (tok == REPMN)
939      {
940	ntokens = nsubtoks(dfa->tindex);
941	tindex = dfa->tindex - ntokens;
942	if (maxrep < 0)
943	  addtok(PLUS);
944	if (minrep == 0)
945	  addtok(QMARK);
946	for (i = 1; i < minrep; ++i)
947	  {
948	    copytoks(tindex, ntokens);
949	    addtok(CAT);
950	  }
951	for (; i < maxrep; ++i)
952	  {
953	    copytoks(tindex, ntokens);
954	    addtok(QMARK);
955	    addtok(CAT);
956	  }
957	tok = lex();
958      }
959    else
960      {
961	addtok(tok);
962	tok = lex();
963      }
964}
965
966static void
967branch (void)
968{
969  closure();
970  while (tok != RPAREN && tok != OR && tok >= 0)
971    {
972      closure();
973      addtok(CAT);
974    }
975}
976
977static void
978regexp (int toplevel)
979{
980  branch();
981  while (tok == OR)
982    {
983      tok = lex();
984      branch();
985      if (toplevel)
986	addtok(ORTOP);
987      else
988	addtok(OR);
989    }
990}
991
992/* Main entry point for the parser.  S is a string to be parsed, len is the
993   length of the string, so s can include NUL characters.  D is a pointer to
994   the struct dfa to parse into. */
995void
996dfaparse (char *s, size_t len, struct dfa *d)
997{
998  dfa = d;
999  lexstart = lexptr = s;
1000  lexleft = len;
1001  lasttok = END;
1002  laststart = 1;
1003  parens = 0;
1004
1005  if (! syntax_bits_set)
1006    dfaerror(_("No syntax specified"));
1007
1008  tok = lex();
1009  depth = d->depth;
1010
1011  regexp(1);
1012
1013  if (tok != END)
1014    dfaerror(_("Unbalanced )"));
1015
1016  addtok(END - d->nregexps);
1017  addtok(CAT);
1018
1019  if (d->nregexps)
1020    addtok(ORTOP);
1021
1022  ++d->nregexps;
1023}
1024
1025/* Some primitives for operating on sets of positions. */
1026
1027/* Copy one set to another; the destination must be large enough. */
1028static void
1029copy (position_set *src, position_set *dst)
1030{
1031  int i;
1032
1033  for (i = 0; i < src->nelem; ++i)
1034    dst->elems[i] = src->elems[i];
1035  dst->nelem = src->nelem;
1036}
1037
1038/* Insert a position in a set.  Position sets are maintained in sorted
1039   order according to index.  If position already exists in the set with
1040   the same index then their constraints are logically or'd together.
1041   S->elems must point to an array large enough to hold the resulting set. */
1042static void
1043insert (position p, position_set *s)
1044{
1045  int i;
1046  position t1, t2;
1047
1048  for (i = 0; i < s->nelem && p.index < s->elems[i].index; ++i)
1049    continue;
1050  if (i < s->nelem && p.index == s->elems[i].index)
1051    s->elems[i].constraint |= p.constraint;
1052  else
1053    {
1054      t1 = p;
1055      ++s->nelem;
1056      while (i < s->nelem)
1057	{
1058	  t2 = s->elems[i];
1059	  s->elems[i++] = t1;
1060	  t1 = t2;
1061	}
1062    }
1063}
1064
1065/* Merge two sets of positions into a third.  The result is exactly as if
1066   the positions of both sets were inserted into an initially empty set. */
1067static void
1068merge (position_set *s1, position_set *s2, position_set *m)
1069{
1070  int i = 0, j = 0;
1071
1072  m->nelem = 0;
1073  while (i < s1->nelem && j < s2->nelem)
1074    if (s1->elems[i].index > s2->elems[j].index)
1075      m->elems[m->nelem++] = s1->elems[i++];
1076    else if (s1->elems[i].index < s2->elems[j].index)
1077      m->elems[m->nelem++] = s2->elems[j++];
1078    else
1079      {
1080	m->elems[m->nelem] = s1->elems[i++];
1081	m->elems[m->nelem++].constraint |= s2->elems[j++].constraint;
1082      }
1083  while (i < s1->nelem)
1084    m->elems[m->nelem++] = s1->elems[i++];
1085  while (j < s2->nelem)
1086    m->elems[m->nelem++] = s2->elems[j++];
1087}
1088
1089/* Delete a position from a set. */
1090static void
1091delete (position p, position_set *s)
1092{
1093  int i;
1094
1095  for (i = 0; i < s->nelem; ++i)
1096    if (p.index == s->elems[i].index)
1097      break;
1098  if (i < s->nelem)
1099    for (--s->nelem; i < s->nelem; ++i)
1100      s->elems[i] = s->elems[i + 1];
1101}
1102
1103/* Find the index of the state corresponding to the given position set with
1104   the given preceding context, or create a new state if there is no such
1105   state.  Newline and letter tell whether we got here on a newline or
1106   letter, respectively. */
1107static int
1108state_index (struct dfa *d, position_set *s, int newline, int letter)
1109{
1110  int hash = 0;
1111  int constraint;
1112  int i, j;
1113
1114  newline = newline ? 1 : 0;
1115  letter = letter ? 1 : 0;
1116
1117  for (i = 0; i < s->nelem; ++i)
1118    hash ^= s->elems[i].index + s->elems[i].constraint;
1119
1120  /* Try to find a state that exactly matches the proposed one. */
1121  for (i = 0; i < d->sindex; ++i)
1122    {
1123      if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
1124	  || newline != d->states[i].newline || letter != d->states[i].letter)
1125	continue;
1126      for (j = 0; j < s->nelem; ++j)
1127	if (s->elems[j].constraint
1128	    != d->states[i].elems.elems[j].constraint
1129	    || s->elems[j].index != d->states[i].elems.elems[j].index)
1130	  break;
1131      if (j == s->nelem)
1132	return i;
1133    }
1134
1135  /* We'll have to create a new state. */
1136  REALLOC_IF_NECESSARY(d->states, dfa_state, d->salloc, d->sindex);
1137  d->states[i].hash = hash;
1138  MALLOC(d->states[i].elems.elems, position, s->nelem);
1139  copy(s, &d->states[i].elems);
1140  d->states[i].newline = newline;
1141  d->states[i].letter = letter;
1142  d->states[i].backref = 0;
1143  d->states[i].constraint = 0;
1144  d->states[i].first_end = 0;
1145  for (j = 0; j < s->nelem; ++j)
1146    if (d->tokens[s->elems[j].index] < 0)
1147      {
1148	constraint = s->elems[j].constraint;
1149	if (SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 0)
1150	    || SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 1)
1151	    || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 0)
1152	    || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 1))
1153	  d->states[i].constraint |= constraint;
1154	if (! d->states[i].first_end)
1155	  d->states[i].first_end = d->tokens[s->elems[j].index];
1156      }
1157    else if (d->tokens[s->elems[j].index] == BACKREF)
1158      {
1159	d->states[i].constraint = NO_CONSTRAINT;
1160	d->states[i].backref = 1;
1161      }
1162
1163  ++d->sindex;
1164
1165  return i;
1166}
1167
1168/* Find the epsilon closure of a set of positions.  If any position of the set
1169   contains a symbol that matches the empty string in some context, replace
1170   that position with the elements of its follow labeled with an appropriate
1171   constraint.  Repeat exhaustively until no funny positions are left.
1172   S->elems must be large enough to hold the result. */
1173static void
1174epsclosure (position_set *s, struct dfa *d)
1175{
1176  int i, j;
1177  int *visited;
1178  position p, old;
1179
1180  MALLOC(visited, int, d->tindex);
1181  for (i = 0; i < d->tindex; ++i)
1182    visited[i] = 0;
1183
1184  for (i = 0; i < s->nelem; ++i)
1185    if (d->tokens[s->elems[i].index] >= NOTCHAR
1186	&& d->tokens[s->elems[i].index] != BACKREF
1187	&& d->tokens[s->elems[i].index] < CSET)
1188      {
1189	old = s->elems[i];
1190	p.constraint = old.constraint;
1191	delete(s->elems[i], s);
1192	if (visited[old.index])
1193	  {
1194	    --i;
1195	    continue;
1196	  }
1197	visited[old.index] = 1;
1198	switch (d->tokens[old.index])
1199	  {
1200	  case BEGLINE:
1201	    p.constraint &= BEGLINE_CONSTRAINT;
1202	    break;
1203	  case ENDLINE:
1204	    p.constraint &= ENDLINE_CONSTRAINT;
1205	    break;
1206	  case BEGWORD:
1207	    p.constraint &= BEGWORD_CONSTRAINT;
1208	    break;
1209	  case ENDWORD:
1210	    p.constraint &= ENDWORD_CONSTRAINT;
1211	    break;
1212	  case LIMWORD:
1213	    p.constraint &= LIMWORD_CONSTRAINT;
1214	    break;
1215	  case NOTLIMWORD:
1216	    p.constraint &= NOTLIMWORD_CONSTRAINT;
1217	    break;
1218	  default:
1219	    break;
1220	  }
1221	for (j = 0; j < d->follows[old.index].nelem; ++j)
1222	  {
1223	    p.index = d->follows[old.index].elems[j].index;
1224	    insert(p, s);
1225	  }
1226	/* Force rescan to start at the beginning. */
1227	i = -1;
1228      }
1229
1230  free(visited);
1231}
1232
1233/* Perform bottom-up analysis on the parse tree, computing various functions.
1234   Note that at this point, we're pretending constructs like \< are real
1235   characters rather than constraints on what can follow them.
1236
1237   Nullable:  A node is nullable if it is at the root of a regexp that can
1238   match the empty string.
1239   *  EMPTY leaves are nullable.
1240   * No other leaf is nullable.
1241   * A QMARK or STAR node is nullable.
1242   * A PLUS node is nullable if its argument is nullable.
1243   * A CAT node is nullable if both its arguments are nullable.
1244   * An OR node is nullable if either argument is nullable.
1245
1246   Firstpos:  The firstpos of a node is the set of positions (nonempty leaves)
1247   that could correspond to the first character of a string matching the
1248   regexp rooted at the given node.
1249   * EMPTY leaves have empty firstpos.
1250   * The firstpos of a nonempty leaf is that leaf itself.
1251   * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
1252     argument.
1253   * The firstpos of a CAT node is the firstpos of the left argument, union
1254     the firstpos of the right if the left argument is nullable.
1255   * The firstpos of an OR node is the union of firstpos of each argument.
1256
1257   Lastpos:  The lastpos of a node is the set of positions that could
1258   correspond to the last character of a string matching the regexp at
1259   the given node.
1260   * EMPTY leaves have empty lastpos.
1261   * The lastpos of a nonempty leaf is that leaf itself.
1262   * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
1263     argument.
1264   * The lastpos of a CAT node is the lastpos of its right argument, union
1265     the lastpos of the left if the right argument is nullable.
1266   * The lastpos of an OR node is the union of the lastpos of each argument.
1267
1268   Follow:  The follow of a position is the set of positions that could
1269   correspond to the character following a character matching the node in
1270   a string matching the regexp.  At this point we consider special symbols
1271   that match the empty string in some context to be just normal characters.
1272   Later, if we find that a special symbol is in a follow set, we will
1273   replace it with the elements of its follow, labeled with an appropriate
1274   constraint.
1275   * Every node in the firstpos of the argument of a STAR or PLUS node is in
1276     the follow of every node in the lastpos.
1277   * Every node in the firstpos of the second argument of a CAT node is in
1278     the follow of every node in the lastpos of the first argument.
1279
1280   Because of the postfix representation of the parse tree, the depth-first
1281   analysis is conveniently done by a linear scan with the aid of a stack.
1282   Sets are stored as arrays of the elements, obeying a stack-like allocation
1283   scheme; the number of elements in each set deeper in the stack can be
1284   used to determine the address of a particular set's array. */
1285void
1286dfaanalyze (struct dfa *d, int searchflag)
1287{
1288  int *nullable;		/* Nullable stack. */
1289  int *nfirstpos;		/* Element count stack for firstpos sets. */
1290  position *firstpos;		/* Array where firstpos elements are stored. */
1291  int *nlastpos;		/* Element count stack for lastpos sets. */
1292  position *lastpos;		/* Array where lastpos elements are stored. */
1293  int *nalloc;			/* Sizes of arrays allocated to follow sets. */
1294  position_set tmp;		/* Temporary set for merging sets. */
1295  position_set merged;		/* Result of merging sets. */
1296  int wants_newline;		/* True if some position wants newline info. */
1297  int *o_nullable;
1298  int *o_nfirst, *o_nlast;
1299  position *o_firstpos, *o_lastpos;
1300  int i, j;
1301  position *pos;
1302
1303#ifdef DEBUG
1304  fprintf(stderr, "dfaanalyze:\n");
1305  for (i = 0; i < d->tindex; ++i)
1306    {
1307      fprintf(stderr, " %d:", i);
1308      prtok(d->tokens[i]);
1309    }
1310  putc('\n', stderr);
1311#endif
1312
1313  d->searchflag = searchflag;
1314
1315  MALLOC(nullable, int, d->depth);
1316  o_nullable = nullable;
1317  MALLOC(nfirstpos, int, d->depth);
1318  o_nfirst = nfirstpos;
1319  MALLOC(firstpos, position, d->nleaves);
1320  o_firstpos = firstpos, firstpos += d->nleaves;
1321  MALLOC(nlastpos, int, d->depth);
1322  o_nlast = nlastpos;
1323  MALLOC(lastpos, position, d->nleaves);
1324  o_lastpos = lastpos, lastpos += d->nleaves;
1325  MALLOC(nalloc, int, d->tindex);
1326  for (i = 0; i < d->tindex; ++i)
1327    nalloc[i] = 0;
1328  MALLOC(merged.elems, position, d->nleaves);
1329
1330  CALLOC(d->follows, position_set, d->tindex);
1331
1332  for (i = 0; i < d->tindex; ++i)
1333#ifdef DEBUG
1334    {				/* Nonsyntactic #ifdef goo... */
1335#endif
1336    switch (d->tokens[i])
1337      {
1338      case EMPTY:
1339	/* The empty set is nullable. */
1340	*nullable++ = 1;
1341
1342	/* The firstpos and lastpos of the empty leaf are both empty. */
1343	*nfirstpos++ = *nlastpos++ = 0;
1344	break;
1345
1346      case STAR:
1347      case PLUS:
1348	/* Every element in the firstpos of the argument is in the follow
1349	   of every element in the lastpos. */
1350	tmp.nelem = nfirstpos[-1];
1351	tmp.elems = firstpos;
1352	pos = lastpos;
1353	for (j = 0; j < nlastpos[-1]; ++j)
1354	  {
1355	    merge(&tmp, &d->follows[pos[j].index], &merged);
1356	    REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
1357				 nalloc[pos[j].index], merged.nelem - 1);
1358	    copy(&merged, &d->follows[pos[j].index]);
1359	  }
1360
1361      case QMARK:
1362	/* A QMARK or STAR node is automatically nullable. */
1363	if (d->tokens[i] != PLUS)
1364	  nullable[-1] = 1;
1365	break;
1366
1367      case CAT:
1368	/* Every element in the firstpos of the second argument is in the
1369	   follow of every element in the lastpos of the first argument. */
1370	tmp.nelem = nfirstpos[-1];
1371	tmp.elems = firstpos;
1372	pos = lastpos + nlastpos[-1];
1373	for (j = 0; j < nlastpos[-2]; ++j)
1374	  {
1375	    merge(&tmp, &d->follows[pos[j].index], &merged);
1376	    REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
1377				 nalloc[pos[j].index], merged.nelem - 1);
1378	    copy(&merged, &d->follows[pos[j].index]);
1379	  }
1380
1381	/* The firstpos of a CAT node is the firstpos of the first argument,
1382	   union that of the second argument if the first is nullable. */
1383	if (nullable[-2])
1384	  nfirstpos[-2] += nfirstpos[-1];
1385	else
1386	  firstpos += nfirstpos[-1];
1387	--nfirstpos;
1388
1389	/* The lastpos of a CAT node is the lastpos of the second argument,
1390	   union that of the first argument if the second is nullable. */
1391	if (nullable[-1])
1392	  nlastpos[-2] += nlastpos[-1];
1393	else
1394	  {
1395	    pos = lastpos + nlastpos[-2];
1396	    for (j = nlastpos[-1] - 1; j >= 0; --j)
1397	      pos[j] = lastpos[j];
1398	    lastpos += nlastpos[-2];
1399	    nlastpos[-2] = nlastpos[-1];
1400	  }
1401	--nlastpos;
1402
1403	/* A CAT node is nullable if both arguments are nullable. */
1404	nullable[-2] = nullable[-1] && nullable[-2];
1405	--nullable;
1406	break;
1407
1408      case OR:
1409      case ORTOP:
1410	/* The firstpos is the union of the firstpos of each argument. */
1411	nfirstpos[-2] += nfirstpos[-1];
1412	--nfirstpos;
1413
1414	/* The lastpos is the union of the lastpos of each argument. */
1415	nlastpos[-2] += nlastpos[-1];
1416	--nlastpos;
1417
1418	/* An OR node is nullable if either argument is nullable. */
1419	nullable[-2] = nullable[-1] || nullable[-2];
1420	--nullable;
1421	break;
1422
1423      default:
1424	/* Anything else is a nonempty position.  (Note that special
1425	   constructs like \< are treated as nonempty strings here;
1426	   an "epsilon closure" effectively makes them nullable later.
1427	   Backreferences have to get a real position so we can detect
1428	   transitions on them later.  But they are nullable. */
1429	*nullable++ = d->tokens[i] == BACKREF;
1430
1431	/* This position is in its own firstpos and lastpos. */
1432	*nfirstpos++ = *nlastpos++ = 1;
1433	--firstpos, --lastpos;
1434	firstpos->index = lastpos->index = i;
1435	firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
1436
1437	/* Allocate the follow set for this position. */
1438	nalloc[i] = 1;
1439	MALLOC(d->follows[i].elems, position, nalloc[i]);
1440	break;
1441      }
1442#ifdef DEBUG
1443    /* ... balance the above nonsyntactic #ifdef goo... */
1444      fprintf(stderr, "node %d:", i);
1445      prtok(d->tokens[i]);
1446      putc('\n', stderr);
1447      fprintf(stderr, nullable[-1] ? " nullable: yes\n" : " nullable: no\n");
1448      fprintf(stderr, " firstpos:");
1449      for (j = nfirstpos[-1] - 1; j >= 0; --j)
1450	{
1451	  fprintf(stderr, " %d:", firstpos[j].index);
1452	  prtok(d->tokens[firstpos[j].index]);
1453	}
1454      fprintf(stderr, "\n lastpos:");
1455      for (j = nlastpos[-1] - 1; j >= 0; --j)
1456	{
1457	  fprintf(stderr, " %d:", lastpos[j].index);
1458	  prtok(d->tokens[lastpos[j].index]);
1459	}
1460      putc('\n', stderr);
1461    }
1462#endif
1463
1464  /* For each follow set that is the follow set of a real position, replace
1465     it with its epsilon closure. */
1466  for (i = 0; i < d->tindex; ++i)
1467    if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF
1468	|| d->tokens[i] >= CSET)
1469      {
1470#ifdef DEBUG
1471	fprintf(stderr, "follows(%d:", i);
1472	prtok(d->tokens[i]);
1473	fprintf(stderr, "):");
1474	for (j = d->follows[i].nelem - 1; j >= 0; --j)
1475	  {
1476	    fprintf(stderr, " %d:", d->follows[i].elems[j].index);
1477	    prtok(d->tokens[d->follows[i].elems[j].index]);
1478	  }
1479	putc('\n', stderr);
1480#endif
1481	copy(&d->follows[i], &merged);
1482	epsclosure(&merged, d);
1483	if (d->follows[i].nelem < merged.nelem)
1484	  REALLOC(d->follows[i].elems, position, merged.nelem);
1485	copy(&merged, &d->follows[i]);
1486      }
1487
1488  /* Get the epsilon closure of the firstpos of the regexp.  The result will
1489     be the set of positions of state 0. */
1490  merged.nelem = 0;
1491  for (i = 0; i < nfirstpos[-1]; ++i)
1492    insert(firstpos[i], &merged);
1493  epsclosure(&merged, d);
1494
1495  /* Check if any of the positions of state 0 will want newline context. */
1496  wants_newline = 0;
1497  for (i = 0; i < merged.nelem; ++i)
1498    if (PREV_NEWLINE_DEPENDENT(merged.elems[i].constraint))
1499      wants_newline = 1;
1500
1501  /* Build the initial state. */
1502  d->salloc = 1;
1503  d->sindex = 0;
1504  MALLOC(d->states, dfa_state, d->salloc);
1505  state_index(d, &merged, wants_newline, 0);
1506
1507  free(o_nullable);
1508  free(o_nfirst);
1509  free(o_firstpos);
1510  free(o_nlast);
1511  free(o_lastpos);
1512  free(nalloc);
1513  free(merged.elems);
1514}
1515
1516/* Find, for each character, the transition out of state s of d, and store
1517   it in the appropriate slot of trans.
1518
1519   We divide the positions of s into groups (positions can appear in more
1520   than one group).  Each group is labeled with a set of characters that
1521   every position in the group matches (taking into account, if necessary,
1522   preceding context information of s).  For each group, find the union
1523   of the its elements' follows.  This set is the set of positions of the
1524   new state.  For each character in the group's label, set the transition
1525   on this character to be to a state corresponding to the set's positions,
1526   and its associated backward context information, if necessary.
1527
1528   If we are building a searching matcher, we include the positions of state
1529   0 in every state.
1530
1531   The collection of groups is constructed by building an equivalence-class
1532   partition of the positions of s.
1533
1534   For each position, find the set of characters C that it matches.  Eliminate
1535   any characters from C that fail on grounds of backward context.
1536
1537   Search through the groups, looking for a group whose label L has nonempty
1538   intersection with C.  If L - C is nonempty, create a new group labeled
1539   L - C and having the same positions as the current group, and set L to
1540   the intersection of L and C.  Insert the position in this group, set
1541   C = C - L, and resume scanning.
1542
1543   If after comparing with every group there are characters remaining in C,
1544   create a new group labeled with the characters of C and insert this
1545   position in that group. */
1546void
1547dfastate (int s, struct dfa *d, int trans[])
1548{
1549  position_set grps[NOTCHAR];	/* As many as will ever be needed. */
1550  charclass labels[NOTCHAR];	/* Labels corresponding to the groups. */
1551  int ngrps = 0;		/* Number of groups actually used. */
1552  position pos;			/* Current position being considered. */
1553  charclass matches;		/* Set of matching characters. */
1554  int matchesf;			/* True if matches is nonempty. */
1555  charclass intersect;		/* Intersection with some label set. */
1556  int intersectf;		/* True if intersect is nonempty. */
1557  charclass leftovers;		/* Stuff in the label that didn't match. */
1558  int leftoversf;		/* True if leftovers is nonempty. */
1559  static charclass letters;	/* Set of characters considered letters. */
1560  static charclass newline;	/* Set of characters that aren't newline. */
1561  position_set follows;		/* Union of the follows of some group. */
1562  position_set tmp;		/* Temporary space for merging sets. */
1563  int state;			/* New state. */
1564  int wants_newline;		/* New state wants to know newline context. */
1565  int state_newline;		/* New state on a newline transition. */
1566  int wants_letter;		/* New state wants to know letter context. */
1567  int state_letter;		/* New state on a letter transition. */
1568  static int initialized;	/* Flag for static initialization. */
1569  int i, j, k;
1570
1571  /* Initialize the set of letters, if necessary. */
1572  if (! initialized)
1573    {
1574      initialized = 1;
1575      for (i = 0; i < NOTCHAR; ++i)
1576	if (IS_WORD_CONSTITUENT(i))
1577	  setbit(i, letters);
1578      setbit(eolbyte, newline);
1579    }
1580
1581  zeroset(matches);
1582
1583  for (i = 0; i < d->states[s].elems.nelem; ++i)
1584    {
1585      pos = d->states[s].elems.elems[i];
1586      if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
1587	setbit(d->tokens[pos.index], matches);
1588      else if (d->tokens[pos.index] >= CSET)
1589	copyset(d->charclasses[d->tokens[pos.index] - CSET], matches);
1590      else
1591	continue;
1592
1593      /* Some characters may need to be eliminated from matches because
1594	 they fail in the current context. */
1595      if (pos.constraint != 0xFF)
1596	{
1597	  if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
1598					 d->states[s].newline, 1))
1599	    clrbit(eolbyte, matches);
1600	  if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
1601					 d->states[s].newline, 0))
1602	    for (j = 0; j < CHARCLASS_INTS; ++j)
1603	      matches[j] &= newline[j];
1604	  if (! MATCHES_LETTER_CONTEXT(pos.constraint,
1605					d->states[s].letter, 1))
1606	    for (j = 0; j < CHARCLASS_INTS; ++j)
1607	      matches[j] &= ~letters[j];
1608	  if (! MATCHES_LETTER_CONTEXT(pos.constraint,
1609					d->states[s].letter, 0))
1610	    for (j = 0; j < CHARCLASS_INTS; ++j)
1611	      matches[j] &= letters[j];
1612
1613	  /* If there are no characters left, there's no point in going on. */
1614	  for (j = 0; j < CHARCLASS_INTS && !matches[j]; ++j)
1615	    continue;
1616	  if (j == CHARCLASS_INTS)
1617	    continue;
1618	}
1619
1620      for (j = 0; j < ngrps; ++j)
1621	{
1622	  /* If matches contains a single character only, and the current
1623	     group's label doesn't contain that character, go on to the
1624	     next group. */
1625	  if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR
1626	      && !tstbit(d->tokens[pos.index], labels[j]))
1627	    continue;
1628
1629	  /* Check if this group's label has a nonempty intersection with
1630	     matches. */
1631	  intersectf = 0;
1632	  for (k = 0; k < CHARCLASS_INTS; ++k)
1633	    (intersect[k] = matches[k] & labels[j][k]) ? (intersectf = 1) : 0;
1634	  if (! intersectf)
1635	    continue;
1636
1637	  /* It does; now find the set differences both ways. */
1638	  leftoversf = matchesf = 0;
1639	  for (k = 0; k < CHARCLASS_INTS; ++k)
1640	    {
1641	      /* Even an optimizing compiler can't know this for sure. */
1642	      int match = matches[k], label = labels[j][k];
1643
1644	      (leftovers[k] = ~match & label) ? (leftoversf = 1) : 0;
1645	      (matches[k] = match & ~label) ? (matchesf = 1) : 0;
1646	    }
1647
1648	  /* If there were leftovers, create a new group labeled with them. */
1649	  if (leftoversf)
1650	    {
1651	      copyset(leftovers, labels[ngrps]);
1652	      copyset(intersect, labels[j]);
1653	      MALLOC(grps[ngrps].elems, position, d->nleaves);
1654	      copy(&grps[j], &grps[ngrps]);
1655	      ++ngrps;
1656	    }
1657
1658	  /* Put the position in the current group.  Note that there is no
1659	     reason to call insert() here. */
1660	  grps[j].elems[grps[j].nelem++] = pos;
1661
1662	  /* If every character matching the current position has been
1663	     accounted for, we're done. */
1664	  if (! matchesf)
1665	    break;
1666	}
1667
1668      /* If we've passed the last group, and there are still characters
1669	 unaccounted for, then we'll have to create a new group. */
1670      if (j == ngrps)
1671	{
1672	  copyset(matches, labels[ngrps]);
1673	  zeroset(matches);
1674	  MALLOC(grps[ngrps].elems, position, d->nleaves);
1675	  grps[ngrps].nelem = 1;
1676	  grps[ngrps].elems[0] = pos;
1677	  ++ngrps;
1678	}
1679    }
1680
1681  MALLOC(follows.elems, position, d->nleaves);
1682  MALLOC(tmp.elems, position, d->nleaves);
1683
1684  /* If we are a searching matcher, the default transition is to a state
1685     containing the positions of state 0, otherwise the default transition
1686     is to fail miserably. */
1687  if (d->searchflag)
1688    {
1689      wants_newline = 0;
1690      wants_letter = 0;
1691      for (i = 0; i < d->states[0].elems.nelem; ++i)
1692	{
1693	  if (PREV_NEWLINE_DEPENDENT(d->states[0].elems.elems[i].constraint))
1694	    wants_newline = 1;
1695	  if (PREV_LETTER_DEPENDENT(d->states[0].elems.elems[i].constraint))
1696	    wants_letter = 1;
1697	}
1698      copy(&d->states[0].elems, &follows);
1699      state = state_index(d, &follows, 0, 0);
1700      if (wants_newline)
1701	state_newline = state_index(d, &follows, 1, 0);
1702      else
1703	state_newline = state;
1704      if (wants_letter)
1705	state_letter = state_index(d, &follows, 0, 1);
1706      else
1707	state_letter = state;
1708      for (i = 0; i < NOTCHAR; ++i)
1709	trans[i] = (IS_WORD_CONSTITUENT(i)) ? state_letter : state;
1710      trans[eolbyte] = state_newline;
1711    }
1712  else
1713    for (i = 0; i < NOTCHAR; ++i)
1714      trans[i] = -1;
1715
1716  for (i = 0; i < ngrps; ++i)
1717    {
1718      follows.nelem = 0;
1719
1720      /* Find the union of the follows of the positions of the group.
1721	 This is a hideously inefficient loop.  Fix it someday. */
1722      for (j = 0; j < grps[i].nelem; ++j)
1723	for (k = 0; k < d->follows[grps[i].elems[j].index].nelem; ++k)
1724	  insert(d->follows[grps[i].elems[j].index].elems[k], &follows);
1725
1726      /* If we are building a searching matcher, throw in the positions
1727	 of state 0 as well. */
1728      if (d->searchflag)
1729	for (j = 0; j < d->states[0].elems.nelem; ++j)
1730	  insert(d->states[0].elems.elems[j], &follows);
1731
1732      /* Find out if the new state will want any context information. */
1733      wants_newline = 0;
1734      if (tstbit(eolbyte, labels[i]))
1735	for (j = 0; j < follows.nelem; ++j)
1736	  if (PREV_NEWLINE_DEPENDENT(follows.elems[j].constraint))
1737	    wants_newline = 1;
1738
1739      wants_letter = 0;
1740      for (j = 0; j < CHARCLASS_INTS; ++j)
1741	if (labels[i][j] & letters[j])
1742	  break;
1743      if (j < CHARCLASS_INTS)
1744	for (j = 0; j < follows.nelem; ++j)
1745	  if (PREV_LETTER_DEPENDENT(follows.elems[j].constraint))
1746	    wants_letter = 1;
1747
1748      /* Find the state(s) corresponding to the union of the follows. */
1749      state = state_index(d, &follows, 0, 0);
1750      if (wants_newline)
1751	state_newline = state_index(d, &follows, 1, 0);
1752      else
1753	state_newline = state;
1754      if (wants_letter)
1755	state_letter = state_index(d, &follows, 0, 1);
1756      else
1757	state_letter = state;
1758
1759      /* Set the transitions for each character in the current label. */
1760      for (j = 0; j < CHARCLASS_INTS; ++j)
1761	for (k = 0; k < INTBITS; ++k)
1762	  if (labels[i][j] & 1 << k)
1763	    {
1764	      int c = j * INTBITS + k;
1765
1766	      if (c == eolbyte)
1767		trans[c] = state_newline;
1768	      else if (IS_WORD_CONSTITUENT(c))
1769		trans[c] = state_letter;
1770	      else if (c < NOTCHAR)
1771		trans[c] = state;
1772	    }
1773    }
1774
1775  for (i = 0; i < ngrps; ++i)
1776    free(grps[i].elems);
1777  free(follows.elems);
1778  free(tmp.elems);
1779}
1780
1781/* Some routines for manipulating a compiled dfa's transition tables.
1782   Each state may or may not have a transition table; if it does, and it
1783   is a non-accepting state, then d->trans[state] points to its table.
1784   If it is an accepting state then d->fails[state] points to its table.
1785   If it has no table at all, then d->trans[state] is NULL.
1786   TODO: Improve this comment, get rid of the unnecessary redundancy. */
1787
1788static void
1789build_state (int s, struct dfa *d)
1790{
1791  int *trans;			/* The new transition table. */
1792  int i;
1793
1794  /* Set an upper limit on the number of transition tables that will ever
1795     exist at once.  1024 is arbitrary.  The idea is that the frequently
1796     used transition tables will be quickly rebuilt, whereas the ones that
1797     were only needed once or twice will be cleared away. */
1798  if (d->trcount >= 1024)
1799    {
1800      for (i = 0; i < d->tralloc; ++i)
1801	if (d->trans[i])
1802	  {
1803	    free((ptr_t) d->trans[i]);
1804	    d->trans[i] = NULL;
1805	  }
1806	else if (d->fails[i])
1807	  {
1808	    free((ptr_t) d->fails[i]);
1809	    d->fails[i] = NULL;
1810	  }
1811      d->trcount = 0;
1812    }
1813
1814  ++d->trcount;
1815
1816  /* Set up the success bits for this state. */
1817  d->success[s] = 0;
1818  if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 1, d->states[s].letter, 0,
1819      s, *d))
1820    d->success[s] |= 4;
1821  if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 1,
1822      s, *d))
1823    d->success[s] |= 2;
1824  if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 0,
1825      s, *d))
1826    d->success[s] |= 1;
1827
1828  MALLOC(trans, int, NOTCHAR);
1829  dfastate(s, d, trans);
1830
1831  /* Now go through the new transition table, and make sure that the trans
1832     and fail arrays are allocated large enough to hold a pointer for the
1833     largest state mentioned in the table. */
1834  for (i = 0; i < NOTCHAR; ++i)
1835    if (trans[i] >= d->tralloc)
1836      {
1837	int oldalloc = d->tralloc;
1838
1839	while (trans[i] >= d->tralloc)
1840	  d->tralloc *= 2;
1841	REALLOC(d->realtrans, int *, d->tralloc + 1);
1842	d->trans = d->realtrans + 1;
1843	REALLOC(d->fails, int *, d->tralloc);
1844	REALLOC(d->success, int, d->tralloc);
1845	REALLOC(d->newlines, int, d->tralloc);
1846	while (oldalloc < d->tralloc)
1847	  {
1848	    d->trans[oldalloc] = NULL;
1849	    d->fails[oldalloc++] = NULL;
1850	  }
1851      }
1852
1853  /* Keep the newline transition in a special place so we can use it as
1854     a sentinel. */
1855  d->newlines[s] = trans[eolbyte];
1856  trans[eolbyte] = -1;
1857
1858  if (ACCEPTING(s, *d))
1859    d->fails[s] = trans;
1860  else
1861    d->trans[s] = trans;
1862}
1863
1864static void
1865build_state_zero (struct dfa *d)
1866{
1867  d->tralloc = 1;
1868  d->trcount = 0;
1869  CALLOC(d->realtrans, int *, d->tralloc + 1);
1870  d->trans = d->realtrans + 1;
1871  CALLOC(d->fails, int *, d->tralloc);
1872  MALLOC(d->success, int, d->tralloc);
1873  MALLOC(d->newlines, int, d->tralloc);
1874  build_state(0, d);
1875}
1876
1877/* Search through a buffer looking for a match to the given struct dfa.
1878   Find the first occurrence of a string matching the regexp in the buffer,
1879   and the shortest possible version thereof.  Return a pointer to the first
1880   character after the match, or NULL if none is found.  Begin points to
1881   the beginning of the buffer, and end points to the first character after
1882   its end.  We store a newline in *end to act as a sentinel, so end had
1883   better point somewhere valid.  Newline is a flag indicating whether to
1884   allow newlines to be in the matching string.  If count is non-
1885   NULL it points to a place we're supposed to increment every time we
1886   see a newline.  Finally, if backref is non-NULL it points to a place
1887   where we're supposed to store a 1 if backreferencing happened and the
1888   match needs to be verified by a backtracking matcher.  Otherwise
1889   we store a 0 in *backref. */
1890char *
1891dfaexec (struct dfa *d, char *begin, char *end,
1892	 int newline, int *count, int *backref)
1893{
1894  register int s, s1, tmp;	/* Current state. */
1895  register unsigned char *p;	/* Current input character. */
1896  register int **trans, *t;	/* Copy of d->trans so it can be optimized
1897				   into a register. */
1898  register unsigned char eol = eolbyte;	/* Likewise for eolbyte.  */
1899  static int sbit[NOTCHAR];	/* Table for anding with d->success. */
1900  static int sbit_init;
1901
1902  if (! sbit_init)
1903    {
1904      int i;
1905
1906      sbit_init = 1;
1907      for (i = 0; i < NOTCHAR; ++i)
1908	sbit[i] = (IS_WORD_CONSTITUENT(i)) ? 2 : 1;
1909      sbit[eol] = 4;
1910    }
1911
1912  if (! d->tralloc)
1913    build_state_zero(d);
1914
1915  s = s1 = 0;
1916  p = (unsigned char *) begin;
1917  trans = d->trans;
1918  *end = eol;
1919
1920  for (;;)
1921    {
1922      while ((t = trans[s]) != 0) { /* hand-optimized loop */
1923	s1 = t[*p++];
1924        if ((t = trans[s1]) == 0) {
1925           tmp = s ; s = s1 ; s1 = tmp ; /* swap */
1926           break;
1927        }
1928	s = t[*p++];
1929      }
1930
1931      if (s >= 0 && p <= (unsigned char *) end && d->fails[s])
1932	{
1933	  if (d->success[s] & sbit[*p])
1934	    {
1935	      if (backref)
1936		*backref = (d->states[s].backref != 0);
1937	      return (char *) p;
1938	    }
1939
1940	  s1 = s;
1941	  s = d->fails[s][*p++];
1942	  continue;
1943	}
1944
1945      /* If the previous character was a newline, count it. */
1946      if (count && (char *) p <= end && p[-1] == eol)
1947	++*count;
1948
1949      /* Check if we've run off the end of the buffer. */
1950      if ((char *) p > end)
1951	return NULL;
1952
1953      if (s >= 0)
1954	{
1955	  build_state(s, d);
1956	  trans = d->trans;
1957	  continue;
1958	}
1959
1960      if (p[-1] == eol && newline)
1961	{
1962	  s = d->newlines[s1];
1963	  continue;
1964	}
1965
1966      s = 0;
1967    }
1968}
1969
1970/* Initialize the components of a dfa that the other routines don't
1971   initialize for themselves. */
1972void
1973dfainit (struct dfa *d)
1974{
1975  d->calloc = 1;
1976  MALLOC(d->charclasses, charclass, d->calloc);
1977  d->cindex = 0;
1978
1979  d->talloc = 1;
1980  MALLOC(d->tokens, token, d->talloc);
1981  d->tindex = d->depth = d->nleaves = d->nregexps = 0;
1982
1983  d->searchflag = 0;
1984  d->tralloc = 0;
1985
1986  d->musts = 0;
1987}
1988
1989/* Parse and analyze a single string of the given length. */
1990void
1991dfacomp (char *s, size_t len, struct dfa *d, int searchflag)
1992{
1993  if (case_fold)	/* dummy folding in service of dfamust() */
1994    {
1995      char *lcopy;
1996      int i;
1997
1998      lcopy = malloc(len);
1999      if (!lcopy)
2000	dfaerror(_("out of memory"));
2001
2002      /* This is a kludge. */
2003      case_fold = 0;
2004      for (i = 0; i < len; ++i)
2005	if (ISUPPER ((unsigned char) s[i]))
2006	  lcopy[i] = tolower ((unsigned char) s[i]);
2007	else
2008	  lcopy[i] = s[i];
2009
2010      dfainit(d);
2011      dfaparse(lcopy, len, d);
2012      free(lcopy);
2013      dfamust(d);
2014      d->cindex = d->tindex = d->depth = d->nleaves = d->nregexps = 0;
2015      case_fold = 1;
2016      dfaparse(s, len, d);
2017      dfaanalyze(d, searchflag);
2018    }
2019  else
2020    {
2021        dfainit(d);
2022        dfaparse(s, len, d);
2023	dfamust(d);
2024        dfaanalyze(d, searchflag);
2025    }
2026}
2027
2028/* Free the storage held by the components of a dfa. */
2029void
2030dfafree (struct dfa *d)
2031{
2032  int i;
2033  struct dfamust *dm, *ndm;
2034
2035  free((ptr_t) d->charclasses);
2036  free((ptr_t) d->tokens);
2037  for (i = 0; i < d->sindex; ++i)
2038    free((ptr_t) d->states[i].elems.elems);
2039  free((ptr_t) d->states);
2040  for (i = 0; i < d->tindex; ++i)
2041    if (d->follows[i].elems)
2042      free((ptr_t) d->follows[i].elems);
2043  free((ptr_t) d->follows);
2044  for (i = 0; i < d->tralloc; ++i)
2045    if (d->trans[i])
2046      free((ptr_t) d->trans[i]);
2047    else if (d->fails[i])
2048      free((ptr_t) d->fails[i]);
2049  if (d->realtrans) free((ptr_t) d->realtrans);
2050  if (d->fails) free((ptr_t) d->fails);
2051  if (d->newlines) free((ptr_t) d->newlines);
2052  if (d->success) free((ptr_t) d->success);
2053  for (dm = d->musts; dm; dm = ndm)
2054    {
2055      ndm = dm->next;
2056      free(dm->must);
2057      free((ptr_t) dm);
2058    }
2059}
2060
2061/* Having found the postfix representation of the regular expression,
2062   try to find a long sequence of characters that must appear in any line
2063   containing the r.e.
2064   Finding a "longest" sequence is beyond the scope here;
2065   we take an easy way out and hope for the best.
2066   (Take "(ab|a)b"--please.)
2067
2068   We do a bottom-up calculation of sequences of characters that must appear
2069   in matches of r.e.'s represented by trees rooted at the nodes of the postfix
2070   representation:
2071	sequences that must appear at the left of the match ("left")
2072	sequences that must appear at the right of the match ("right")
2073	lists of sequences that must appear somewhere in the match ("in")
2074	sequences that must constitute the match ("is")
2075
2076   When we get to the root of the tree, we use one of the longest of its
2077   calculated "in" sequences as our answer.  The sequence we find is returned in
2078   d->must (where "d" is the single argument passed to "dfamust");
2079   the length of the sequence is returned in d->mustn.
2080
2081   The sequences calculated for the various types of node (in pseudo ANSI c)
2082   are shown below.  "p" is the operand of unary operators (and the left-hand
2083   operand of binary operators); "q" is the right-hand operand of binary
2084   operators.
2085
2086   "ZERO" means "a zero-length sequence" below.
2087
2088	Type	left		right		is		in
2089	----	----		-----		--		--
2090	char c	# c		# c		# c		# c
2091
2092	CSET	ZERO		ZERO		ZERO		ZERO
2093
2094	STAR	ZERO		ZERO		ZERO		ZERO
2095
2096	QMARK	ZERO		ZERO		ZERO		ZERO
2097
2098	PLUS	p->left		p->right	ZERO		p->in
2099
2100	CAT	(p->is==ZERO)?	(q->is==ZERO)?	(p->is!=ZERO &&	p->in plus
2101		p->left :	q->right :	q->is!=ZERO) ?	q->in plus
2102		p->is##q->left	p->right##q->is	p->is##q->is :	p->right##q->left
2103						ZERO
2104
2105	OR	longest common	longest common	(do p->is and	substrings common to
2106		leading		trailing	q->is have same	p->in and q->in
2107		(sub)sequence	(sub)sequence	length and
2108		of p->left	of p->right	content) ?
2109		and q->left	and q->right	p->is : NULL
2110
2111   If there's anything else we recognize in the tree, all four sequences get set
2112   to zero-length sequences.  If there's something we don't recognize in the tree,
2113   we just return a zero-length sequence.
2114
2115   Break ties in favor of infrequent letters (choosing 'zzz' in preference to
2116   'aaa')?
2117
2118   And. . .is it here or someplace that we might ponder "optimizations" such as
2119	egrep 'psi|epsilon'	->	egrep 'psi'
2120	egrep 'pepsi|epsilon'	->	egrep 'epsi'
2121					(Yes, we now find "epsi" as a "string
2122					that must occur", but we might also
2123					simplify the *entire* r.e. being sought)
2124	grep '[c]'		->	grep 'c'
2125	grep '(ab|a)b'		->	grep 'ab'
2126	grep 'ab*'		->	grep 'a'
2127	grep 'a*b'		->	grep 'b'
2128
2129   There are several issues:
2130
2131   Is optimization easy (enough)?
2132
2133   Does optimization actually accomplish anything,
2134   or is the automaton you get from "psi|epsilon" (for example)
2135   the same as the one you get from "psi" (for example)?
2136
2137   Are optimizable r.e.'s likely to be used in real-life situations
2138   (something like 'ab*' is probably unlikely; something like is
2139   'psi|epsilon' is likelier)? */
2140
2141static char *
2142icatalloc (char *old, char *new)
2143{
2144  char *result;
2145  size_t oldsize, newsize;
2146
2147  newsize = (new == NULL) ? 0 : strlen(new);
2148  if (old == NULL)
2149    oldsize = 0;
2150  else if (newsize == 0)
2151    return old;
2152  else	oldsize = strlen(old);
2153  if (old == NULL)
2154    result = (char *) malloc(newsize + 1);
2155  else
2156    result = (char *) realloc((void *) old, oldsize + newsize + 1);
2157  if (result != NULL && new != NULL)
2158    (void) strcpy(result + oldsize, new);
2159  return result;
2160}
2161
2162static char *
2163icpyalloc (char *string)
2164{
2165  return icatalloc((char *) NULL, string);
2166}
2167
2168static char *
2169istrstr (char *lookin, char *lookfor)
2170{
2171  char *cp;
2172  size_t len;
2173
2174  len = strlen(lookfor);
2175  for (cp = lookin; *cp != '\0'; ++cp)
2176    if (strncmp(cp, lookfor, len) == 0)
2177      return cp;
2178  return NULL;
2179}
2180
2181static void
2182ifree (char *cp)
2183{
2184  if (cp != NULL)
2185    free(cp);
2186}
2187
2188static void
2189freelist (char **cpp)
2190{
2191  int i;
2192
2193  if (cpp == NULL)
2194    return;
2195  for (i = 0; cpp[i] != NULL; ++i)
2196    {
2197      free(cpp[i]);
2198      cpp[i] = NULL;
2199    }
2200}
2201
2202static char **
2203enlist (char **cpp, char *new, size_t len)
2204{
2205  int i, j;
2206
2207  if (cpp == NULL)
2208    return NULL;
2209  if ((new = icpyalloc(new)) == NULL)
2210    {
2211      freelist(cpp);
2212      return NULL;
2213    }
2214  new[len] = '\0';
2215  /* Is there already something in the list that's new (or longer)? */
2216  for (i = 0; cpp[i] != NULL; ++i)
2217    if (istrstr(cpp[i], new) != NULL)
2218      {
2219	free(new);
2220	return cpp;
2221      }
2222  /* Eliminate any obsoleted strings. */
2223  j = 0;
2224  while (cpp[j] != NULL)
2225    if (istrstr(new, cpp[j]) == NULL)
2226      ++j;
2227    else
2228      {
2229	free(cpp[j]);
2230	if (--i == j)
2231	  break;
2232	cpp[j] = cpp[i];
2233	cpp[i] = NULL;
2234      }
2235  /* Add the new string. */
2236  cpp = (char **) realloc((char *) cpp, (i + 2) * sizeof *cpp);
2237  if (cpp == NULL)
2238    return NULL;
2239  cpp[i] = new;
2240  cpp[i + 1] = NULL;
2241  return cpp;
2242}
2243
2244/* Given pointers to two strings, return a pointer to an allocated
2245   list of their distinct common substrings. Return NULL if something
2246   seems wild. */
2247static char **
2248comsubs (char *left, char *right)
2249{
2250  char **cpp;
2251  char *lcp;
2252  char *rcp;
2253  size_t i, len;
2254
2255  if (left == NULL || right == NULL)
2256    return NULL;
2257  cpp = (char **) malloc(sizeof *cpp);
2258  if (cpp == NULL)
2259    return NULL;
2260  cpp[0] = NULL;
2261  for (lcp = left; *lcp != '\0'; ++lcp)
2262    {
2263      len = 0;
2264      rcp = index(right, *lcp);
2265      while (rcp != NULL)
2266	{
2267	  for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
2268	    continue;
2269	  if (i > len)
2270	    len = i;
2271	  rcp = index(rcp + 1, *lcp);
2272	}
2273      if (len == 0)
2274	continue;
2275      if ((cpp = enlist(cpp, lcp, len)) == NULL)
2276	break;
2277    }
2278  return cpp;
2279}
2280
2281static char **
2282addlists (char **old, char **new)
2283{
2284  int i;
2285
2286  if (old == NULL || new == NULL)
2287    return NULL;
2288  for (i = 0; new[i] != NULL; ++i)
2289    {
2290      old = enlist(old, new[i], strlen(new[i]));
2291      if (old == NULL)
2292	break;
2293    }
2294  return old;
2295}
2296
2297/* Given two lists of substrings, return a new list giving substrings
2298   common to both. */
2299static char **
2300inboth (char **left, char **right)
2301{
2302  char **both;
2303  char **temp;
2304  int lnum, rnum;
2305
2306  if (left == NULL || right == NULL)
2307    return NULL;
2308  both = (char **) malloc(sizeof *both);
2309  if (both == NULL)
2310    return NULL;
2311  both[0] = NULL;
2312  for (lnum = 0; left[lnum] != NULL; ++lnum)
2313    {
2314      for (rnum = 0; right[rnum] != NULL; ++rnum)
2315	{
2316	  temp = comsubs(left[lnum], right[rnum]);
2317	  if (temp == NULL)
2318	    {
2319	      freelist(both);
2320	      return NULL;
2321	    }
2322	  both = addlists(both, temp);
2323	  freelist(temp);
2324	  free(temp);
2325	  if (both == NULL)
2326	    return NULL;
2327	}
2328    }
2329  return both;
2330}
2331
2332typedef struct
2333{
2334  char **in;
2335  char *left;
2336  char *right;
2337  char *is;
2338} must;
2339
2340static void
2341resetmust (must *mp)
2342{
2343  mp->left[0] = mp->right[0] = mp->is[0] = '\0';
2344  freelist(mp->in);
2345}
2346
2347static void
2348dfamust (struct dfa *dfa)
2349{
2350  must *musts;
2351  must *mp;
2352  char *result;
2353  int ri;
2354  int i;
2355  int exact;
2356  token t;
2357  static must must0;
2358  struct dfamust *dm;
2359  static char empty_string[] = "";
2360
2361  result = empty_string;
2362  exact = 0;
2363  musts = (must *) malloc((dfa->tindex + 1) * sizeof *musts);
2364  if (musts == NULL)
2365    return;
2366  mp = musts;
2367  for (i = 0; i <= dfa->tindex; ++i)
2368    mp[i] = must0;
2369  for (i = 0; i <= dfa->tindex; ++i)
2370    {
2371      mp[i].in = (char **) malloc(sizeof *mp[i].in);
2372      mp[i].left = malloc(2);
2373      mp[i].right = malloc(2);
2374      mp[i].is = malloc(2);
2375      if (mp[i].in == NULL || mp[i].left == NULL ||
2376	  mp[i].right == NULL || mp[i].is == NULL)
2377	goto done;
2378      mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0';
2379      mp[i].in[0] = NULL;
2380    }
2381#ifdef DEBUG
2382  fprintf(stderr, "dfamust:\n");
2383  for (i = 0; i < dfa->tindex; ++i)
2384    {
2385      fprintf(stderr, " %d:", i);
2386      prtok(dfa->tokens[i]);
2387    }
2388  putc('\n', stderr);
2389#endif
2390  for (ri = 0; ri < dfa->tindex; ++ri)
2391    {
2392      switch (t = dfa->tokens[ri])
2393	{
2394	case LPAREN:
2395	case RPAREN:
2396	  goto done;		/* "cannot happen" */
2397	case EMPTY:
2398	case BEGLINE:
2399	case ENDLINE:
2400	case BEGWORD:
2401	case ENDWORD:
2402	case LIMWORD:
2403	case NOTLIMWORD:
2404	case BACKREF:
2405	  resetmust(mp);
2406	  break;
2407	case STAR:
2408	case QMARK:
2409	  if (mp <= musts)
2410	    goto done;		/* "cannot happen" */
2411	  --mp;
2412	  resetmust(mp);
2413	  break;
2414	case OR:
2415	case ORTOP:
2416	  if (mp < &musts[2])
2417	    goto done;		/* "cannot happen" */
2418	  {
2419	    char **new;
2420	    must *lmp;
2421	    must *rmp;
2422	    int j, ln, rn, n;
2423
2424	    rmp = --mp;
2425	    lmp = --mp;
2426	    /* Guaranteed to be.  Unlikely, but. . . */
2427	    if (strcmp(lmp->is, rmp->is) != 0)
2428	      lmp->is[0] = '\0';
2429	    /* Left side--easy */
2430	    i = 0;
2431	    while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
2432	      ++i;
2433	    lmp->left[i] = '\0';
2434	    /* Right side */
2435	    ln = strlen(lmp->right);
2436	    rn = strlen(rmp->right);
2437	    n = ln;
2438	    if (n > rn)
2439	      n = rn;
2440	    for (i = 0; i < n; ++i)
2441	      if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
2442		break;
2443	    for (j = 0; j < i; ++j)
2444	      lmp->right[j] = lmp->right[(ln - i) + j];
2445	    lmp->right[j] = '\0';
2446	    new = inboth(lmp->in, rmp->in);
2447	    if (new == NULL)
2448	      goto done;
2449	    freelist(lmp->in);
2450	    free((char *) lmp->in);
2451	    lmp->in = new;
2452	  }
2453	  break;
2454	case PLUS:
2455	  if (mp <= musts)
2456	    goto done;		/* "cannot happen" */
2457	  --mp;
2458	  mp->is[0] = '\0';
2459	  break;
2460	case END:
2461	  if (mp != &musts[1])
2462	    goto done;		/* "cannot happen" */
2463	  for (i = 0; musts[0].in[i] != NULL; ++i)
2464	    if (strlen(musts[0].in[i]) > strlen(result))
2465	      result = musts[0].in[i];
2466	  if (strcmp(result, musts[0].is) == 0)
2467	    exact = 1;
2468	  goto done;
2469	case CAT:
2470	  if (mp < &musts[2])
2471	    goto done;		/* "cannot happen" */
2472	  {
2473	    must *lmp;
2474	    must *rmp;
2475
2476	    rmp = --mp;
2477	    lmp = --mp;
2478	    /* In.  Everything in left, plus everything in
2479	       right, plus catenation of
2480	       left's right and right's left. */
2481	    lmp->in = addlists(lmp->in, rmp->in);
2482	    if (lmp->in == NULL)
2483	      goto done;
2484	    if (lmp->right[0] != '\0' &&
2485		rmp->left[0] != '\0')
2486	      {
2487		char *tp;
2488
2489		tp = icpyalloc(lmp->right);
2490		if (tp == NULL)
2491		  goto done;
2492		tp = icatalloc(tp, rmp->left);
2493		if (tp == NULL)
2494		  goto done;
2495		lmp->in = enlist(lmp->in, tp,
2496				 strlen(tp));
2497		free(tp);
2498		if (lmp->in == NULL)
2499		  goto done;
2500	      }
2501	    /* Left-hand */
2502	    if (lmp->is[0] != '\0')
2503	      {
2504		lmp->left = icatalloc(lmp->left,
2505				      rmp->left);
2506		if (lmp->left == NULL)
2507		  goto done;
2508	      }
2509	    /* Right-hand */
2510	    if (rmp->is[0] == '\0')
2511	      lmp->right[0] = '\0';
2512	    lmp->right = icatalloc(lmp->right, rmp->right);
2513	    if (lmp->right == NULL)
2514	      goto done;
2515	    /* Guaranteed to be */
2516	    if (lmp->is[0] != '\0' && rmp->is[0] != '\0')
2517	      {
2518		lmp->is = icatalloc(lmp->is, rmp->is);
2519		if (lmp->is == NULL)
2520		  goto done;
2521	      }
2522	    else
2523	      lmp->is[0] = '\0';
2524	  }
2525	  break;
2526	default:
2527	  if (t < END)
2528	    {
2529	      /* "cannot happen" */
2530	      goto done;
2531	    }
2532	  else if (t == '\0')
2533	    {
2534	      /* not on *my* shift */
2535	      goto done;
2536	    }
2537	  else if (t >= CSET)
2538	    {
2539	      /* easy enough */
2540	      resetmust(mp);
2541	    }
2542	  else
2543	    {
2544	      /* plain character */
2545	      resetmust(mp);
2546	      mp->is[0] = mp->left[0] = mp->right[0] = t;
2547	      mp->is[1] = mp->left[1] = mp->right[1] = '\0';
2548	      mp->in = enlist(mp->in, mp->is, (size_t)1);
2549	      if (mp->in == NULL)
2550		goto done;
2551	    }
2552	  break;
2553	}
2554#ifdef DEBUG
2555      fprintf(stderr, " node: %d:", ri);
2556      prtok(dfa->tokens[ri]);
2557      fprintf(stderr, "\n  in:");
2558      for (i = 0; mp->in[i]; ++i)
2559	fprintf(stderr, " \"%s\"", mp->in[i]);
2560      fprintf(stderr, "\n  is: \"%s\"\n", mp->is);
2561      fprintf(stderr, "  left: \"%s\"\n", mp->left);
2562      fprintf(stderr, "  right: \"%s\"\n", mp->right);
2563#endif
2564      ++mp;
2565    }
2566 done:
2567  if (strlen(result))
2568    {
2569      dm = (struct dfamust *) malloc(sizeof (struct dfamust));
2570      dm->exact = exact;
2571      dm->must = malloc(strlen(result) + 1);
2572      strcpy(dm->must, result);
2573      dm->next = dfa->musts;
2574      dfa->musts = dm;
2575    }
2576  mp = musts;
2577  for (i = 0; i <= dfa->tindex; ++i)
2578    {
2579      freelist(mp[i].in);
2580      ifree((char *) mp[i].in);
2581      ifree(mp[i].left);
2582      ifree(mp[i].right);
2583      ifree(mp[i].is);
2584    }
2585  free((char *) mp);
2586}
2587