dfa.c revision 126435
1/* dfa.c - deterministic extended regexp routines for GNU
2   Copyright 1988, 1998, 2000 Free Software Foundation, Inc.
3
4   This program is free software; you can redistribute it and/or modify
5   it under the terms of the GNU General Public License as published by
6   the Free Software Foundation; either version 2, or (at your option)
7   any later version.
8
9   This program is distributed in the hope that it will be useful,
10   but WITHOUT ANY WARRANTY; without even the implied warranty of
11   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12   GNU General Public License for more details.
13
14   You should have received a copy of the GNU General Public License
15   along with this program; if not, write to the Free Software
16   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA */
17
18/* Written June, 1988 by Mike Haertel
19   Modified July, 1988 by Arthur David Olson to assist BMG speedups  */
20
21/* $FreeBSD: head/gnu/usr.bin/grep/dfa.c 126435 2004-03-01 08:37:20Z ache $ */
22
23#ifdef HAVE_CONFIG_H
24#include <config.h>
25#endif
26
27#include <assert.h>
28#include <ctype.h>
29#include <stdio.h>
30
31#include <sys/types.h>
32#ifdef STDC_HEADERS
33#include <stdlib.h>
34#else
35extern char *calloc(), *malloc(), *realloc();
36extern void free();
37#endif
38
39#if defined(HAVE_STRING_H) || defined(STDC_HEADERS)
40#include <string.h>
41#undef index
42#define index strchr
43#else
44#include <strings.h>
45#endif
46
47#ifndef DEBUG	/* use the same approach as regex.c */
48#undef assert
49#define assert(e)
50#endif /* DEBUG */
51
52#ifndef isgraph
53#define isgraph(C) (isprint(C) && !isspace(C))
54#endif
55
56#if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
57#define ISALPHA(C) isalpha(C)
58#define ISUPPER(C) isupper(C)
59#define ISLOWER(C) islower(C)
60#define ISDIGIT(C) isdigit(C)
61#define ISXDIGIT(C) isxdigit(C)
62#define ISSPACE(C) isspace(C)
63#define ISPUNCT(C) ispunct(C)
64#define ISALNUM(C) isalnum(C)
65#define ISPRINT(C) isprint(C)
66#define ISGRAPH(C) isgraph(C)
67#define ISCNTRL(C) iscntrl(C)
68#else
69#define ISALPHA(C) (isascii(C) && isalpha(C))
70#define ISUPPER(C) (isascii(C) && isupper(C))
71#define ISLOWER(C) (isascii(C) && islower(C))
72#define ISDIGIT(C) (isascii(C) && isdigit(C))
73#define ISXDIGIT(C) (isascii(C) && isxdigit(C))
74#define ISSPACE(C) (isascii(C) && isspace(C))
75#define ISPUNCT(C) (isascii(C) && ispunct(C))
76#define ISALNUM(C) (isascii(C) && isalnum(C))
77#define ISPRINT(C) (isascii(C) && isprint(C))
78#define ISGRAPH(C) (isascii(C) && isgraph(C))
79#define ISCNTRL(C) (isascii(C) && iscntrl(C))
80#endif
81
82/* ISASCIIDIGIT differs from ISDIGIT, as follows:
83   - Its arg may be any int or unsigned int; it need not be an unsigned char.
84   - It's guaranteed to evaluate its argument exactly once.
85   - It's typically faster.
86   Posix 1003.2-1992 section 2.5.2.1 page 50 lines 1556-1558 says that
87   only '0' through '9' are digits.  Prefer ISASCIIDIGIT to ISDIGIT unless
88   it's important to use the locale's definition of `digit' even when the
89   host does not conform to Posix.  */
90#define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9)
91
92/* If we (don't) have I18N.  */
93/* glibc defines _ */
94#ifndef _
95# ifdef HAVE_LIBINTL_H
96#  include <libintl.h>
97#  ifndef _
98#   define _(Str) gettext (Str)
99#  endif
100# else
101#  define _(Str) (Str)
102# endif
103#endif
104
105#include "regex.h"
106#include "dfa.h"
107
108/* HPUX, define those as macros in sys/param.h */
109#ifdef setbit
110# undef setbit
111#endif
112#ifdef clrbit
113# undef clrbit
114#endif
115
116static void dfamust PARAMS ((struct dfa *dfa));
117
118static ptr_t xcalloc PARAMS ((size_t n, size_t s));
119static ptr_t xmalloc PARAMS ((size_t n));
120static ptr_t xrealloc PARAMS ((ptr_t p, size_t n));
121#ifdef DEBUG
122static void prtok PARAMS ((token t));
123#endif
124static int tstbit PARAMS ((int b, charclass c));
125static void setbit PARAMS ((int b, charclass c));
126static void clrbit PARAMS ((int b, charclass c));
127static void copyset PARAMS ((charclass src, charclass dst));
128static void zeroset PARAMS ((charclass s));
129static void notset PARAMS ((charclass s));
130static int equal PARAMS ((charclass s1, charclass s2));
131static int charclass_index PARAMS ((charclass s));
132static int looking_at PARAMS ((const char *s));
133static token lex PARAMS ((void));
134static void addtok PARAMS ((token t));
135static void atom PARAMS ((void));
136static int nsubtoks PARAMS ((int tindex));
137static void copytoks PARAMS ((int tindex, int ntokens));
138static void closure PARAMS ((void));
139static void branch PARAMS ((void));
140static void regexp PARAMS ((int toplevel));
141static void copy PARAMS ((position_set *src, position_set *dst));
142static void insert PARAMS ((position p, position_set *s));
143static void merge PARAMS ((position_set *s1, position_set *s2, position_set *m));
144static void delete PARAMS ((position p, position_set *s));
145static int state_index PARAMS ((struct dfa *d, position_set *s,
146			  int newline, int letter));
147static void build_state PARAMS ((int s, struct dfa *d));
148static void build_state_zero PARAMS ((struct dfa *d));
149static char *icatalloc PARAMS ((char *old, char *new));
150static char *icpyalloc PARAMS ((char *string));
151static char *istrstr PARAMS ((char *lookin, char *lookfor));
152static void ifree PARAMS ((char *cp));
153static void freelist PARAMS ((char **cpp));
154static char **enlist PARAMS ((char **cpp, char *new, size_t len));
155static char **comsubs PARAMS ((char *left, char *right));
156static char **addlists PARAMS ((char **old, char **new));
157static char **inboth PARAMS ((char **left, char **right));
158
159static ptr_t
160xcalloc (size_t n, size_t s)
161{
162  ptr_t r = calloc(n, s);
163
164  if (!r)
165    dfaerror(_("Memory exhausted"));
166  return r;
167}
168
169static ptr_t
170xmalloc (size_t n)
171{
172  ptr_t r = malloc(n);
173
174  assert(n != 0);
175  if (!r)
176    dfaerror(_("Memory exhausted"));
177  return r;
178}
179
180static ptr_t
181xrealloc (ptr_t p, size_t n)
182{
183  ptr_t r = realloc(p, n);
184
185  assert(n != 0);
186  if (!r)
187    dfaerror(_("Memory exhausted"));
188  return r;
189}
190
191#define CALLOC(p, t, n) ((p) = (t *) xcalloc((size_t)(n), sizeof (t)))
192#define MALLOC(p, t, n) ((p) = (t *) xmalloc((n) * sizeof (t)))
193#define REALLOC(p, t, n) ((p) = (t *) xrealloc((ptr_t) (p), (n) * sizeof (t)))
194
195/* Reallocate an array of type t if nalloc is too small for index. */
196#define REALLOC_IF_NECESSARY(p, t, nalloc, index) \
197  if ((index) >= (nalloc))			  \
198    {						  \
199      while ((index) >= (nalloc))		  \
200	(nalloc) *= 2;				  \
201      REALLOC(p, t, nalloc);			  \
202    }
203
204#ifdef DEBUG
205
206static void
207prtok (token t)
208{
209  char *s;
210
211  if (t < 0)
212    fprintf(stderr, "END");
213  else if (t < NOTCHAR)
214    fprintf(stderr, "%c", t);
215  else
216    {
217      switch (t)
218	{
219	case EMPTY: s = "EMPTY"; break;
220	case BACKREF: s = "BACKREF"; break;
221	case BEGLINE: s = "BEGLINE"; break;
222	case ENDLINE: s = "ENDLINE"; break;
223	case BEGWORD: s = "BEGWORD"; break;
224	case ENDWORD: s = "ENDWORD"; break;
225	case LIMWORD: s = "LIMWORD"; break;
226	case NOTLIMWORD: s = "NOTLIMWORD"; break;
227	case QMARK: s = "QMARK"; break;
228	case STAR: s = "STAR"; break;
229	case PLUS: s = "PLUS"; break;
230	case CAT: s = "CAT"; break;
231	case OR: s = "OR"; break;
232	case ORTOP: s = "ORTOP"; break;
233	case LPAREN: s = "LPAREN"; break;
234	case RPAREN: s = "RPAREN"; break;
235	default: s = "CSET"; break;
236	}
237      fprintf(stderr, "%s", s);
238    }
239}
240#endif /* DEBUG */
241
242/* Stuff pertaining to charclasses. */
243
244static int
245tstbit (int b, charclass c)
246{
247  return c[b / INTBITS] & 1 << b % INTBITS;
248}
249
250static void
251setbit (int b, charclass c)
252{
253  c[b / INTBITS] |= 1 << b % INTBITS;
254}
255
256static void
257clrbit (int b, charclass c)
258{
259  c[b / INTBITS] &= ~(1 << b % INTBITS);
260}
261
262static void
263copyset (charclass src, charclass dst)
264{
265  int i;
266
267  for (i = 0; i < CHARCLASS_INTS; ++i)
268    dst[i] = src[i];
269}
270
271static void
272zeroset (charclass s)
273{
274  int i;
275
276  for (i = 0; i < CHARCLASS_INTS; ++i)
277    s[i] = 0;
278}
279
280static void
281notset (charclass s)
282{
283  int i;
284
285  for (i = 0; i < CHARCLASS_INTS; ++i)
286    s[i] = ~s[i];
287}
288
289static int
290equal (charclass s1, charclass s2)
291{
292  int i;
293
294  for (i = 0; i < CHARCLASS_INTS; ++i)
295    if (s1[i] != s2[i])
296      return 0;
297  return 1;
298}
299
300/* A pointer to the current dfa is kept here during parsing. */
301static struct dfa *dfa;
302
303/* Find the index of charclass s in dfa->charclasses, or allocate a new charclass. */
304static int
305charclass_index (charclass s)
306{
307  int i;
308
309  for (i = 0; i < dfa->cindex; ++i)
310    if (equal(s, dfa->charclasses[i]))
311      return i;
312  REALLOC_IF_NECESSARY(dfa->charclasses, charclass, dfa->calloc, dfa->cindex);
313  ++dfa->cindex;
314  copyset(s, dfa->charclasses[i]);
315  return i;
316}
317
318/* Syntax bits controlling the behavior of the lexical analyzer. */
319static reg_syntax_t syntax_bits, syntax_bits_set;
320
321/* Flag for case-folding letters into sets. */
322static int case_fold;
323
324/* End-of-line byte in data.  */
325static unsigned char eolbyte;
326
327/* Entry point to set syntax options. */
328void
329dfasyntax (reg_syntax_t bits, int fold, int eol)
330{
331  syntax_bits_set = 1;
332  syntax_bits = bits;
333  case_fold = fold;
334  eolbyte = eol;
335}
336
337/* Lexical analyzer.  All the dross that deals with the obnoxious
338   GNU Regex syntax bits is located here.  The poor, suffering
339   reader is referred to the GNU Regex documentation for the
340   meaning of the @#%!@#%^!@ syntax bits. */
341
342static char *lexstart;		/* Pointer to beginning of input string. */
343static char *lexptr;		/* Pointer to next input character. */
344static int lexleft;		/* Number of characters remaining. */
345static token lasttok;		/* Previous token returned; initially END. */
346static int laststart;		/* True if we're separated from beginning or (, |
347				   only by zero-width characters. */
348static int parens;		/* Count of outstanding left parens. */
349static int minrep, maxrep;	/* Repeat counts for {m,n}. */
350
351/* Note that characters become unsigned here. */
352#define FETCH(c, eoferr)   	      \
353  {			   	      \
354    if (! lexleft)	   	      \
355      {				      \
356	if (eoferr != 0)	      \
357	  dfaerror (eoferr);	      \
358	else		   	      \
359	  return lasttok = END;	      \
360      }				      \
361    (c) = (unsigned char) *lexptr++;  \
362    --lexleft;		   	      \
363  }
364
365#ifdef __STDC__
366#define FUNC(F, P) static int F(int c) { return P(c); }
367#else
368#define FUNC(F, P) static int F(c) int c; { return P(c); }
369#endif
370
371FUNC(is_alpha, ISALPHA)
372FUNC(is_upper, ISUPPER)
373FUNC(is_lower, ISLOWER)
374FUNC(is_digit, ISDIGIT)
375FUNC(is_xdigit, ISXDIGIT)
376FUNC(is_space, ISSPACE)
377FUNC(is_punct, ISPUNCT)
378FUNC(is_alnum, ISALNUM)
379FUNC(is_print, ISPRINT)
380FUNC(is_graph, ISGRAPH)
381FUNC(is_cntrl, ISCNTRL)
382
383static int
384is_blank (int c)
385{
386   return (c == ' ' || c == '\t');
387}
388
389/* The following list maps the names of the Posix named character classes
390   to predicate functions that determine whether a given character is in
391   the class.  The leading [ has already been eaten by the lexical analyzer. */
392static struct {
393  const char *name;
394  int (*pred) PARAMS ((int));
395} prednames[] = {
396  { ":alpha:]", is_alpha },
397  { ":upper:]", is_upper },
398  { ":lower:]", is_lower },
399  { ":digit:]", is_digit },
400  { ":xdigit:]", is_xdigit },
401  { ":space:]", is_space },
402  { ":punct:]", is_punct },
403  { ":alnum:]", is_alnum },
404  { ":print:]", is_print },
405  { ":graph:]", is_graph },
406  { ":cntrl:]", is_cntrl },
407  { ":blank:]", is_blank },
408  { 0 }
409};
410
411/* Return non-zero if C is a `word-constituent' byte; zero otherwise.  */
412#define IS_WORD_CONSTITUENT(C) (ISALNUM(C) || (C) == '_')
413
414static int
415looking_at (char const *s)
416{
417  size_t len;
418
419  len = strlen(s);
420  if (lexleft < len)
421    return 0;
422  return strncmp(s, lexptr, len) == 0;
423}
424
425static token
426lex (void)
427{
428  token c, c1, c2;
429  int backslash = 0, invert;
430  charclass ccl;
431  int i;
432  char lo[2];
433  char hi[2];
434
435  /* Basic plan: We fetch a character.  If it's a backslash,
436     we set the backslash flag and go through the loop again.
437     On the plus side, this avoids having a duplicate of the
438     main switch inside the backslash case.  On the minus side,
439     it means that just about every case begins with
440     "if (backslash) ...".  */
441  for (i = 0; i < 2; ++i)
442    {
443      FETCH(c, 0);
444      switch (c)
445	{
446	case '\\':
447	  if (backslash)
448	    goto normal_char;
449	  if (lexleft == 0)
450	    dfaerror(_("Unfinished \\ escape"));
451	  backslash = 1;
452	  break;
453
454	case '^':
455	  if (backslash)
456	    goto normal_char;
457	  if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
458	      || lasttok == END
459	      || lasttok == LPAREN
460	      || lasttok == OR)
461	    return lasttok = BEGLINE;
462	  goto normal_char;
463
464	case '$':
465	  if (backslash)
466	    goto normal_char;
467	  if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
468	      || lexleft == 0
469	      || (syntax_bits & RE_NO_BK_PARENS
470		  ? lexleft > 0 && *lexptr == ')'
471		  : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == ')')
472	      || (syntax_bits & RE_NO_BK_VBAR
473		  ? lexleft > 0 && *lexptr == '|'
474		  : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == '|')
475	      || ((syntax_bits & RE_NEWLINE_ALT)
476	          && lexleft > 0 && *lexptr == '\n'))
477	    return lasttok = ENDLINE;
478	  goto normal_char;
479
480	case '1':
481	case '2':
482	case '3':
483	case '4':
484	case '5':
485	case '6':
486	case '7':
487	case '8':
488	case '9':
489	  if (backslash && !(syntax_bits & RE_NO_BK_REFS))
490	    {
491	      laststart = 0;
492	      return lasttok = BACKREF;
493	    }
494	  goto normal_char;
495
496	case '`':
497	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
498	    return lasttok = BEGLINE;	/* FIXME: should be beginning of string */
499	  goto normal_char;
500
501	case '\'':
502	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
503	    return lasttok = ENDLINE;	/* FIXME: should be end of string */
504	  goto normal_char;
505
506	case '<':
507	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
508	    return lasttok = BEGWORD;
509	  goto normal_char;
510
511	case '>':
512	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
513	    return lasttok = ENDWORD;
514	  goto normal_char;
515
516	case 'b':
517	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
518	    return lasttok = LIMWORD;
519	  goto normal_char;
520
521	case 'B':
522	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
523	    return lasttok = NOTLIMWORD;
524	  goto normal_char;
525
526	case '?':
527	  if (syntax_bits & RE_LIMITED_OPS)
528	    goto normal_char;
529	  if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
530	    goto normal_char;
531	  if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
532	    goto normal_char;
533	  return lasttok = QMARK;
534
535	case '*':
536	  if (backslash)
537	    goto normal_char;
538	  if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
539	    goto normal_char;
540	  return lasttok = STAR;
541
542	case '+':
543	  if (syntax_bits & RE_LIMITED_OPS)
544	    goto normal_char;
545	  if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
546	    goto normal_char;
547	  if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
548	    goto normal_char;
549	  return lasttok = PLUS;
550
551	case '{':
552	  if (!(syntax_bits & RE_INTERVALS))
553	    goto normal_char;
554	  if (backslash != ((syntax_bits & RE_NO_BK_BRACES) == 0))
555	    goto normal_char;
556	  if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
557	    goto normal_char;
558
559	  if (syntax_bits & RE_NO_BK_BRACES)
560	    {
561	      /* Scan ahead for a valid interval; if it's not valid,
562		 treat it as a literal '{'.  */
563	      int lo = -1, hi = -1;
564	      char const *p = lexptr;
565	      char const *lim = p + lexleft;
566	      for (;  p != lim && ISASCIIDIGIT (*p);  p++)
567		lo = (lo < 0 ? 0 : lo * 10) + *p - '0';
568	      if (p != lim && *p == ',')
569		while (++p != lim && ISASCIIDIGIT (*p))
570		  hi = (hi < 0 ? 0 : hi * 10) + *p - '0';
571	      else
572		hi = lo;
573	      if (p == lim || *p != '}'
574		  || lo < 0 || RE_DUP_MAX < hi || (0 <= hi && hi < lo))
575		goto normal_char;
576	    }
577
578	  minrep = 0;
579	  /* Cases:
580	     {M} - exact count
581	     {M,} - minimum count, maximum is infinity
582	     {M,N} - M through N */
583	  FETCH(c, _("unfinished repeat count"));
584	  if (ISASCIIDIGIT (c))
585	    {
586	      minrep = c - '0';
587	      for (;;)
588		{
589		  FETCH(c, _("unfinished repeat count"));
590		  if (! ISASCIIDIGIT (c))
591		    break;
592		  minrep = 10 * minrep + c - '0';
593		}
594	    }
595	  else
596	    dfaerror(_("malformed repeat count"));
597	  if (c == ',')
598	    {
599	      FETCH (c, _("unfinished repeat count"));
600	      if (! ISASCIIDIGIT (c))
601		maxrep = -1;
602	      else
603		{
604		  maxrep = c - '0';
605		  for (;;)
606		    {
607		      FETCH (c, _("unfinished repeat count"));
608		      if (! ISASCIIDIGIT (c))
609			break;
610		      maxrep = 10 * maxrep + c - '0';
611		    }
612		  if (0 <= maxrep && maxrep < minrep)
613		    dfaerror (_("malformed repeat count"));
614		}
615	    }
616	  else
617	    maxrep = minrep;
618	  if (!(syntax_bits & RE_NO_BK_BRACES))
619	    {
620	      if (c != '\\')
621		dfaerror(_("malformed repeat count"));
622	      FETCH(c, _("unfinished repeat count"));
623	    }
624	  if (c != '}')
625	    dfaerror(_("malformed repeat count"));
626	  laststart = 0;
627	  return lasttok = REPMN;
628
629	case '|':
630	  if (syntax_bits & RE_LIMITED_OPS)
631	    goto normal_char;
632	  if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0))
633	    goto normal_char;
634	  laststart = 1;
635	  return lasttok = OR;
636
637	case '\n':
638	  if (syntax_bits & RE_LIMITED_OPS
639	      || backslash
640	      || !(syntax_bits & RE_NEWLINE_ALT))
641	    goto normal_char;
642	  laststart = 1;
643	  return lasttok = OR;
644
645	case '(':
646	  if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
647	    goto normal_char;
648	  ++parens;
649	  laststart = 1;
650	  return lasttok = LPAREN;
651
652	case ')':
653	  if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
654	    goto normal_char;
655	  if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
656	    goto normal_char;
657	  --parens;
658	  laststart = 0;
659	  return lasttok = RPAREN;
660
661	case '.':
662	  if (backslash)
663	    goto normal_char;
664	  zeroset(ccl);
665	  notset(ccl);
666	  if (!(syntax_bits & RE_DOT_NEWLINE))
667	    clrbit(eolbyte, ccl);
668	  if (syntax_bits & RE_DOT_NOT_NULL)
669	    clrbit('\0', ccl);
670	  laststart = 0;
671	  return lasttok = CSET + charclass_index(ccl);
672
673	case 'w':
674	case 'W':
675	  if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
676	    goto normal_char;
677	  zeroset(ccl);
678	  for (c2 = 0; c2 < NOTCHAR; ++c2)
679	    if (IS_WORD_CONSTITUENT(c2))
680	      setbit(c2, ccl);
681	  if (c == 'W')
682	    notset(ccl);
683	  laststart = 0;
684	  return lasttok = CSET + charclass_index(ccl);
685
686	case '[':
687	  if (backslash)
688	    goto normal_char;
689	  zeroset(ccl);
690	  FETCH(c, _("Unbalanced ["));
691	  if (c == '^')
692	    {
693	      FETCH(c, _("Unbalanced ["));
694	      invert = 1;
695	    }
696	  else
697	    invert = 0;
698	  do
699	    {
700	      /* Nobody ever said this had to be fast. :-)
701		 Note that if we're looking at some other [:...:]
702		 construct, we just treat it as a bunch of ordinary
703		 characters.  We can do this because we assume
704		 regex has checked for syntax errors before
705		 dfa is ever called. */
706	      if (c == '[' && (syntax_bits & RE_CHAR_CLASSES))
707		for (c1 = 0; prednames[c1].name; ++c1)
708		  if (looking_at(prednames[c1].name))
709		    {
710			int (*pred)() = prednames[c1].pred;
711			if (case_fold
712			    && (pred == is_upper || pred == is_lower))
713				pred = is_alpha;
714
715		      for (c2 = 0; c2 < NOTCHAR; ++c2)
716			if ((*pred)(c2))
717			  setbit(c2, ccl);
718		      lexptr += strlen(prednames[c1].name);
719		      lexleft -= strlen(prednames[c1].name);
720		      FETCH(c1, _("Unbalanced ["));
721		      goto skip;
722		    }
723	      if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
724		FETCH(c, _("Unbalanced ["));
725	      FETCH(c1, _("Unbalanced ["));
726	      if (c1 == '-')
727		{
728		  FETCH(c2, _("Unbalanced ["));
729		  if (c2 == ']')
730		    {
731		      /* In the case [x-], the - is an ordinary hyphen,
732			 which is left in c1, the lookahead character. */
733		      --lexptr;
734		      ++lexleft;
735		      c2 = c;
736		    }
737		  else
738		    {
739		      if (c2 == '\\'
740			  && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
741			FETCH(c2, _("Unbalanced ["));
742		      FETCH(c1, _("Unbalanced ["));
743		    }
744		}
745	      else
746		c2 = c;
747
748	      lo[0] = c;  lo[1] = '\0';
749	      hi[0] = c2; hi[1] = '\0';
750	      for (c = 0; c < NOTCHAR; c++)
751		{
752		  char ch[2];
753		  ch[0] = c;  ch[1] = '\0';
754		  if (strcoll (lo, ch) <= 0 && strcoll (ch, hi) <= 0)
755		    {
756		      setbit (c, ccl);
757		      if (case_fold)
758			{
759			  if (ISUPPER (c))
760			    setbit (tolower (c), ccl);
761			  else if (ISLOWER (c))
762			    setbit (toupper (c), ccl);
763			}
764		    }
765		}
766
767	    skip:
768	      ;
769	    }
770	  while ((c = c1) != ']');
771	  if (invert)
772	    {
773	      notset(ccl);
774	      if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
775		clrbit(eolbyte, ccl);
776	    }
777	  laststart = 0;
778	  return lasttok = CSET + charclass_index(ccl);
779
780	default:
781	normal_char:
782	  laststart = 0;
783	  if (case_fold && ISALPHA(c))
784	    {
785	      zeroset(ccl);
786	      setbit(c, ccl);
787	      if (isupper(c))
788		setbit(tolower(c), ccl);
789	      else
790		setbit(toupper(c), ccl);
791	      return lasttok = CSET + charclass_index(ccl);
792	    }
793	  return c;
794	}
795    }
796
797  /* The above loop should consume at most a backslash
798     and some other character. */
799  abort();
800  return END;	/* keeps pedantic compilers happy. */
801}
802
803/* Recursive descent parser for regular expressions. */
804
805static token tok;		/* Lookahead token. */
806static int depth;		/* Current depth of a hypothetical stack
807				   holding deferred productions.  This is
808				   used to determine the depth that will be
809				   required of the real stack later on in
810				   dfaanalyze(). */
811
812/* Add the given token to the parse tree, maintaining the depth count and
813   updating the maximum depth if necessary. */
814static void
815addtok (token t)
816{
817  REALLOC_IF_NECESSARY(dfa->tokens, token, dfa->talloc, dfa->tindex);
818  dfa->tokens[dfa->tindex++] = t;
819
820  switch (t)
821    {
822    case QMARK:
823    case STAR:
824    case PLUS:
825      break;
826
827    case CAT:
828    case OR:
829    case ORTOP:
830      --depth;
831      break;
832
833    default:
834      ++dfa->nleaves;
835    case EMPTY:
836      ++depth;
837      break;
838    }
839  if (depth > dfa->depth)
840    dfa->depth = depth;
841}
842
843/* The grammar understood by the parser is as follows.
844
845   regexp:
846     regexp OR branch
847     branch
848
849   branch:
850     branch closure
851     closure
852
853   closure:
854     closure QMARK
855     closure STAR
856     closure PLUS
857     atom
858
859   atom:
860     <normal character>
861     CSET
862     BACKREF
863     BEGLINE
864     ENDLINE
865     BEGWORD
866     ENDWORD
867     LIMWORD
868     NOTLIMWORD
869     <empty>
870
871   The parser builds a parse tree in postfix form in an array of tokens. */
872
873static void
874atom (void)
875{
876  if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
877      || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
878      || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD)
879    {
880      addtok(tok);
881      tok = lex();
882    }
883  else if (tok == LPAREN)
884    {
885      tok = lex();
886      regexp(0);
887      if (tok != RPAREN)
888	dfaerror(_("Unbalanced ("));
889      tok = lex();
890    }
891  else
892    addtok(EMPTY);
893}
894
895/* Return the number of tokens in the given subexpression. */
896static int
897nsubtoks (int tindex)
898{
899  int ntoks1;
900
901  switch (dfa->tokens[tindex - 1])
902    {
903    default:
904      return 1;
905    case QMARK:
906    case STAR:
907    case PLUS:
908      return 1 + nsubtoks(tindex - 1);
909    case CAT:
910    case OR:
911    case ORTOP:
912      ntoks1 = nsubtoks(tindex - 1);
913      return 1 + ntoks1 + nsubtoks(tindex - 1 - ntoks1);
914    }
915}
916
917/* Copy the given subexpression to the top of the tree. */
918static void
919copytoks (int tindex, int ntokens)
920{
921  int i;
922
923  for (i = 0; i < ntokens; ++i)
924    addtok(dfa->tokens[tindex + i]);
925}
926
927static void
928closure (void)
929{
930  int tindex, ntokens, i;
931
932  atom();
933  while (tok == QMARK || tok == STAR || tok == PLUS || tok == REPMN)
934    if (tok == REPMN)
935      {
936	ntokens = nsubtoks(dfa->tindex);
937	tindex = dfa->tindex - ntokens;
938	if (maxrep < 0)
939	  addtok(PLUS);
940	if (minrep == 0)
941	  addtok(QMARK);
942	for (i = 1; i < minrep; ++i)
943	  {
944	    copytoks(tindex, ntokens);
945	    addtok(CAT);
946	  }
947	for (; i < maxrep; ++i)
948	  {
949	    copytoks(tindex, ntokens);
950	    addtok(QMARK);
951	    addtok(CAT);
952	  }
953	tok = lex();
954      }
955    else
956      {
957	addtok(tok);
958	tok = lex();
959      }
960}
961
962static void
963branch (void)
964{
965  closure();
966  while (tok != RPAREN && tok != OR && tok >= 0)
967    {
968      closure();
969      addtok(CAT);
970    }
971}
972
973static void
974regexp (int toplevel)
975{
976  branch();
977  while (tok == OR)
978    {
979      tok = lex();
980      branch();
981      if (toplevel)
982	addtok(ORTOP);
983      else
984	addtok(OR);
985    }
986}
987
988/* Main entry point for the parser.  S is a string to be parsed, len is the
989   length of the string, so s can include NUL characters.  D is a pointer to
990   the struct dfa to parse into. */
991void
992dfaparse (char *s, size_t len, struct dfa *d)
993{
994  dfa = d;
995  lexstart = lexptr = s;
996  lexleft = len;
997  lasttok = END;
998  laststart = 1;
999  parens = 0;
1000
1001  if (! syntax_bits_set)
1002    dfaerror(_("No syntax specified"));
1003
1004  tok = lex();
1005  depth = d->depth;
1006
1007  regexp(1);
1008
1009  if (tok != END)
1010    dfaerror(_("Unbalanced )"));
1011
1012  addtok(END - d->nregexps);
1013  addtok(CAT);
1014
1015  if (d->nregexps)
1016    addtok(ORTOP);
1017
1018  ++d->nregexps;
1019}
1020
1021/* Some primitives for operating on sets of positions. */
1022
1023/* Copy one set to another; the destination must be large enough. */
1024static void
1025copy (position_set *src, position_set *dst)
1026{
1027  int i;
1028
1029  for (i = 0; i < src->nelem; ++i)
1030    dst->elems[i] = src->elems[i];
1031  dst->nelem = src->nelem;
1032}
1033
1034/* Insert a position in a set.  Position sets are maintained in sorted
1035   order according to index.  If position already exists in the set with
1036   the same index then their constraints are logically or'd together.
1037   S->elems must point to an array large enough to hold the resulting set. */
1038static void
1039insert (position p, position_set *s)
1040{
1041  int i;
1042  position t1, t2;
1043
1044  for (i = 0; i < s->nelem && p.index < s->elems[i].index; ++i)
1045    continue;
1046  if (i < s->nelem && p.index == s->elems[i].index)
1047    s->elems[i].constraint |= p.constraint;
1048  else
1049    {
1050      t1 = p;
1051      ++s->nelem;
1052      while (i < s->nelem)
1053	{
1054	  t2 = s->elems[i];
1055	  s->elems[i++] = t1;
1056	  t1 = t2;
1057	}
1058    }
1059}
1060
1061/* Merge two sets of positions into a third.  The result is exactly as if
1062   the positions of both sets were inserted into an initially empty set. */
1063static void
1064merge (position_set *s1, position_set *s2, position_set *m)
1065{
1066  int i = 0, j = 0;
1067
1068  m->nelem = 0;
1069  while (i < s1->nelem && j < s2->nelem)
1070    if (s1->elems[i].index > s2->elems[j].index)
1071      m->elems[m->nelem++] = s1->elems[i++];
1072    else if (s1->elems[i].index < s2->elems[j].index)
1073      m->elems[m->nelem++] = s2->elems[j++];
1074    else
1075      {
1076	m->elems[m->nelem] = s1->elems[i++];
1077	m->elems[m->nelem++].constraint |= s2->elems[j++].constraint;
1078      }
1079  while (i < s1->nelem)
1080    m->elems[m->nelem++] = s1->elems[i++];
1081  while (j < s2->nelem)
1082    m->elems[m->nelem++] = s2->elems[j++];
1083}
1084
1085/* Delete a position from a set. */
1086static void
1087delete (position p, position_set *s)
1088{
1089  int i;
1090
1091  for (i = 0; i < s->nelem; ++i)
1092    if (p.index == s->elems[i].index)
1093      break;
1094  if (i < s->nelem)
1095    for (--s->nelem; i < s->nelem; ++i)
1096      s->elems[i] = s->elems[i + 1];
1097}
1098
1099/* Find the index of the state corresponding to the given position set with
1100   the given preceding context, or create a new state if there is no such
1101   state.  Newline and letter tell whether we got here on a newline or
1102   letter, respectively. */
1103static int
1104state_index (struct dfa *d, position_set *s, int newline, int letter)
1105{
1106  int hash = 0;
1107  int constraint;
1108  int i, j;
1109
1110  newline = newline ? 1 : 0;
1111  letter = letter ? 1 : 0;
1112
1113  for (i = 0; i < s->nelem; ++i)
1114    hash ^= s->elems[i].index + s->elems[i].constraint;
1115
1116  /* Try to find a state that exactly matches the proposed one. */
1117  for (i = 0; i < d->sindex; ++i)
1118    {
1119      if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
1120	  || newline != d->states[i].newline || letter != d->states[i].letter)
1121	continue;
1122      for (j = 0; j < s->nelem; ++j)
1123	if (s->elems[j].constraint
1124	    != d->states[i].elems.elems[j].constraint
1125	    || s->elems[j].index != d->states[i].elems.elems[j].index)
1126	  break;
1127      if (j == s->nelem)
1128	return i;
1129    }
1130
1131  /* We'll have to create a new state. */
1132  REALLOC_IF_NECESSARY(d->states, dfa_state, d->salloc, d->sindex);
1133  d->states[i].hash = hash;
1134  MALLOC(d->states[i].elems.elems, position, s->nelem);
1135  copy(s, &d->states[i].elems);
1136  d->states[i].newline = newline;
1137  d->states[i].letter = letter;
1138  d->states[i].backref = 0;
1139  d->states[i].constraint = 0;
1140  d->states[i].first_end = 0;
1141  for (j = 0; j < s->nelem; ++j)
1142    if (d->tokens[s->elems[j].index] < 0)
1143      {
1144	constraint = s->elems[j].constraint;
1145	if (SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 0)
1146	    || SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 1)
1147	    || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 0)
1148	    || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 1))
1149	  d->states[i].constraint |= constraint;
1150	if (! d->states[i].first_end)
1151	  d->states[i].first_end = d->tokens[s->elems[j].index];
1152      }
1153    else if (d->tokens[s->elems[j].index] == BACKREF)
1154      {
1155	d->states[i].constraint = NO_CONSTRAINT;
1156	d->states[i].backref = 1;
1157      }
1158
1159  ++d->sindex;
1160
1161  return i;
1162}
1163
1164/* Find the epsilon closure of a set of positions.  If any position of the set
1165   contains a symbol that matches the empty string in some context, replace
1166   that position with the elements of its follow labeled with an appropriate
1167   constraint.  Repeat exhaustively until no funny positions are left.
1168   S->elems must be large enough to hold the result. */
1169static void
1170epsclosure (position_set *s, struct dfa *d)
1171{
1172  int i, j;
1173  int *visited;
1174  position p, old;
1175
1176  MALLOC(visited, int, d->tindex);
1177  for (i = 0; i < d->tindex; ++i)
1178    visited[i] = 0;
1179
1180  for (i = 0; i < s->nelem; ++i)
1181    if (d->tokens[s->elems[i].index] >= NOTCHAR
1182	&& d->tokens[s->elems[i].index] != BACKREF
1183	&& d->tokens[s->elems[i].index] < CSET)
1184      {
1185	old = s->elems[i];
1186	p.constraint = old.constraint;
1187	delete(s->elems[i], s);
1188	if (visited[old.index])
1189	  {
1190	    --i;
1191	    continue;
1192	  }
1193	visited[old.index] = 1;
1194	switch (d->tokens[old.index])
1195	  {
1196	  case BEGLINE:
1197	    p.constraint &= BEGLINE_CONSTRAINT;
1198	    break;
1199	  case ENDLINE:
1200	    p.constraint &= ENDLINE_CONSTRAINT;
1201	    break;
1202	  case BEGWORD:
1203	    p.constraint &= BEGWORD_CONSTRAINT;
1204	    break;
1205	  case ENDWORD:
1206	    p.constraint &= ENDWORD_CONSTRAINT;
1207	    break;
1208	  case LIMWORD:
1209	    p.constraint &= LIMWORD_CONSTRAINT;
1210	    break;
1211	  case NOTLIMWORD:
1212	    p.constraint &= NOTLIMWORD_CONSTRAINT;
1213	    break;
1214	  default:
1215	    break;
1216	  }
1217	for (j = 0; j < d->follows[old.index].nelem; ++j)
1218	  {
1219	    p.index = d->follows[old.index].elems[j].index;
1220	    insert(p, s);
1221	  }
1222	/* Force rescan to start at the beginning. */
1223	i = -1;
1224      }
1225
1226  free(visited);
1227}
1228
1229/* Perform bottom-up analysis on the parse tree, computing various functions.
1230   Note that at this point, we're pretending constructs like \< are real
1231   characters rather than constraints on what can follow them.
1232
1233   Nullable:  A node is nullable if it is at the root of a regexp that can
1234   match the empty string.
1235   *  EMPTY leaves are nullable.
1236   * No other leaf is nullable.
1237   * A QMARK or STAR node is nullable.
1238   * A PLUS node is nullable if its argument is nullable.
1239   * A CAT node is nullable if both its arguments are nullable.
1240   * An OR node is nullable if either argument is nullable.
1241
1242   Firstpos:  The firstpos of a node is the set of positions (nonempty leaves)
1243   that could correspond to the first character of a string matching the
1244   regexp rooted at the given node.
1245   * EMPTY leaves have empty firstpos.
1246   * The firstpos of a nonempty leaf is that leaf itself.
1247   * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
1248     argument.
1249   * The firstpos of a CAT node is the firstpos of the left argument, union
1250     the firstpos of the right if the left argument is nullable.
1251   * The firstpos of an OR node is the union of firstpos of each argument.
1252
1253   Lastpos:  The lastpos of a node is the set of positions that could
1254   correspond to the last character of a string matching the regexp at
1255   the given node.
1256   * EMPTY leaves have empty lastpos.
1257   * The lastpos of a nonempty leaf is that leaf itself.
1258   * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
1259     argument.
1260   * The lastpos of a CAT node is the lastpos of its right argument, union
1261     the lastpos of the left if the right argument is nullable.
1262   * The lastpos of an OR node is the union of the lastpos of each argument.
1263
1264   Follow:  The follow of a position is the set of positions that could
1265   correspond to the character following a character matching the node in
1266   a string matching the regexp.  At this point we consider special symbols
1267   that match the empty string in some context to be just normal characters.
1268   Later, if we find that a special symbol is in a follow set, we will
1269   replace it with the elements of its follow, labeled with an appropriate
1270   constraint.
1271   * Every node in the firstpos of the argument of a STAR or PLUS node is in
1272     the follow of every node in the lastpos.
1273   * Every node in the firstpos of the second argument of a CAT node is in
1274     the follow of every node in the lastpos of the first argument.
1275
1276   Because of the postfix representation of the parse tree, the depth-first
1277   analysis is conveniently done by a linear scan with the aid of a stack.
1278   Sets are stored as arrays of the elements, obeying a stack-like allocation
1279   scheme; the number of elements in each set deeper in the stack can be
1280   used to determine the address of a particular set's array. */
1281void
1282dfaanalyze (struct dfa *d, int searchflag)
1283{
1284  int *nullable;		/* Nullable stack. */
1285  int *nfirstpos;		/* Element count stack for firstpos sets. */
1286  position *firstpos;		/* Array where firstpos elements are stored. */
1287  int *nlastpos;		/* Element count stack for lastpos sets. */
1288  position *lastpos;		/* Array where lastpos elements are stored. */
1289  int *nalloc;			/* Sizes of arrays allocated to follow sets. */
1290  position_set tmp;		/* Temporary set for merging sets. */
1291  position_set merged;		/* Result of merging sets. */
1292  int wants_newline;		/* True if some position wants newline info. */
1293  int *o_nullable;
1294  int *o_nfirst, *o_nlast;
1295  position *o_firstpos, *o_lastpos;
1296  int i, j;
1297  position *pos;
1298
1299#ifdef DEBUG
1300  fprintf(stderr, "dfaanalyze:\n");
1301  for (i = 0; i < d->tindex; ++i)
1302    {
1303      fprintf(stderr, " %d:", i);
1304      prtok(d->tokens[i]);
1305    }
1306  putc('\n', stderr);
1307#endif
1308
1309  d->searchflag = searchflag;
1310
1311  MALLOC(nullable, int, d->depth);
1312  o_nullable = nullable;
1313  MALLOC(nfirstpos, int, d->depth);
1314  o_nfirst = nfirstpos;
1315  MALLOC(firstpos, position, d->nleaves);
1316  o_firstpos = firstpos, firstpos += d->nleaves;
1317  MALLOC(nlastpos, int, d->depth);
1318  o_nlast = nlastpos;
1319  MALLOC(lastpos, position, d->nleaves);
1320  o_lastpos = lastpos, lastpos += d->nleaves;
1321  MALLOC(nalloc, int, d->tindex);
1322  for (i = 0; i < d->tindex; ++i)
1323    nalloc[i] = 0;
1324  MALLOC(merged.elems, position, d->nleaves);
1325
1326  CALLOC(d->follows, position_set, d->tindex);
1327
1328  for (i = 0; i < d->tindex; ++i)
1329#ifdef DEBUG
1330    {				/* Nonsyntactic #ifdef goo... */
1331#endif
1332    switch (d->tokens[i])
1333      {
1334      case EMPTY:
1335	/* The empty set is nullable. */
1336	*nullable++ = 1;
1337
1338	/* The firstpos and lastpos of the empty leaf are both empty. */
1339	*nfirstpos++ = *nlastpos++ = 0;
1340	break;
1341
1342      case STAR:
1343      case PLUS:
1344	/* Every element in the firstpos of the argument is in the follow
1345	   of every element in the lastpos. */
1346	tmp.nelem = nfirstpos[-1];
1347	tmp.elems = firstpos;
1348	pos = lastpos;
1349	for (j = 0; j < nlastpos[-1]; ++j)
1350	  {
1351	    merge(&tmp, &d->follows[pos[j].index], &merged);
1352	    REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
1353				 nalloc[pos[j].index], merged.nelem - 1);
1354	    copy(&merged, &d->follows[pos[j].index]);
1355	  }
1356
1357      case QMARK:
1358	/* A QMARK or STAR node is automatically nullable. */
1359	if (d->tokens[i] != PLUS)
1360	  nullable[-1] = 1;
1361	break;
1362
1363      case CAT:
1364	/* Every element in the firstpos of the second argument is in the
1365	   follow of every element in the lastpos of the first argument. */
1366	tmp.nelem = nfirstpos[-1];
1367	tmp.elems = firstpos;
1368	pos = lastpos + nlastpos[-1];
1369	for (j = 0; j < nlastpos[-2]; ++j)
1370	  {
1371	    merge(&tmp, &d->follows[pos[j].index], &merged);
1372	    REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
1373				 nalloc[pos[j].index], merged.nelem - 1);
1374	    copy(&merged, &d->follows[pos[j].index]);
1375	  }
1376
1377	/* The firstpos of a CAT node is the firstpos of the first argument,
1378	   union that of the second argument if the first is nullable. */
1379	if (nullable[-2])
1380	  nfirstpos[-2] += nfirstpos[-1];
1381	else
1382	  firstpos += nfirstpos[-1];
1383	--nfirstpos;
1384
1385	/* The lastpos of a CAT node is the lastpos of the second argument,
1386	   union that of the first argument if the second is nullable. */
1387	if (nullable[-1])
1388	  nlastpos[-2] += nlastpos[-1];
1389	else
1390	  {
1391	    pos = lastpos + nlastpos[-2];
1392	    for (j = nlastpos[-1] - 1; j >= 0; --j)
1393	      pos[j] = lastpos[j];
1394	    lastpos += nlastpos[-2];
1395	    nlastpos[-2] = nlastpos[-1];
1396	  }
1397	--nlastpos;
1398
1399	/* A CAT node is nullable if both arguments are nullable. */
1400	nullable[-2] = nullable[-1] && nullable[-2];
1401	--nullable;
1402	break;
1403
1404      case OR:
1405      case ORTOP:
1406	/* The firstpos is the union of the firstpos of each argument. */
1407	nfirstpos[-2] += nfirstpos[-1];
1408	--nfirstpos;
1409
1410	/* The lastpos is the union of the lastpos of each argument. */
1411	nlastpos[-2] += nlastpos[-1];
1412	--nlastpos;
1413
1414	/* An OR node is nullable if either argument is nullable. */
1415	nullable[-2] = nullable[-1] || nullable[-2];
1416	--nullable;
1417	break;
1418
1419      default:
1420	/* Anything else is a nonempty position.  (Note that special
1421	   constructs like \< are treated as nonempty strings here;
1422	   an "epsilon closure" effectively makes them nullable later.
1423	   Backreferences have to get a real position so we can detect
1424	   transitions on them later.  But they are nullable. */
1425	*nullable++ = d->tokens[i] == BACKREF;
1426
1427	/* This position is in its own firstpos and lastpos. */
1428	*nfirstpos++ = *nlastpos++ = 1;
1429	--firstpos, --lastpos;
1430	firstpos->index = lastpos->index = i;
1431	firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
1432
1433	/* Allocate the follow set for this position. */
1434	nalloc[i] = 1;
1435	MALLOC(d->follows[i].elems, position, nalloc[i]);
1436	break;
1437      }
1438#ifdef DEBUG
1439    /* ... balance the above nonsyntactic #ifdef goo... */
1440      fprintf(stderr, "node %d:", i);
1441      prtok(d->tokens[i]);
1442      putc('\n', stderr);
1443      fprintf(stderr, nullable[-1] ? " nullable: yes\n" : " nullable: no\n");
1444      fprintf(stderr, " firstpos:");
1445      for (j = nfirstpos[-1] - 1; j >= 0; --j)
1446	{
1447	  fprintf(stderr, " %d:", firstpos[j].index);
1448	  prtok(d->tokens[firstpos[j].index]);
1449	}
1450      fprintf(stderr, "\n lastpos:");
1451      for (j = nlastpos[-1] - 1; j >= 0; --j)
1452	{
1453	  fprintf(stderr, " %d:", lastpos[j].index);
1454	  prtok(d->tokens[lastpos[j].index]);
1455	}
1456      putc('\n', stderr);
1457    }
1458#endif
1459
1460  /* For each follow set that is the follow set of a real position, replace
1461     it with its epsilon closure. */
1462  for (i = 0; i < d->tindex; ++i)
1463    if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF
1464	|| d->tokens[i] >= CSET)
1465      {
1466#ifdef DEBUG
1467	fprintf(stderr, "follows(%d:", i);
1468	prtok(d->tokens[i]);
1469	fprintf(stderr, "):");
1470	for (j = d->follows[i].nelem - 1; j >= 0; --j)
1471	  {
1472	    fprintf(stderr, " %d:", d->follows[i].elems[j].index);
1473	    prtok(d->tokens[d->follows[i].elems[j].index]);
1474	  }
1475	putc('\n', stderr);
1476#endif
1477	copy(&d->follows[i], &merged);
1478	epsclosure(&merged, d);
1479	if (d->follows[i].nelem < merged.nelem)
1480	  REALLOC(d->follows[i].elems, position, merged.nelem);
1481	copy(&merged, &d->follows[i]);
1482      }
1483
1484  /* Get the epsilon closure of the firstpos of the regexp.  The result will
1485     be the set of positions of state 0. */
1486  merged.nelem = 0;
1487  for (i = 0; i < nfirstpos[-1]; ++i)
1488    insert(firstpos[i], &merged);
1489  epsclosure(&merged, d);
1490
1491  /* Check if any of the positions of state 0 will want newline context. */
1492  wants_newline = 0;
1493  for (i = 0; i < merged.nelem; ++i)
1494    if (PREV_NEWLINE_DEPENDENT(merged.elems[i].constraint))
1495      wants_newline = 1;
1496
1497  /* Build the initial state. */
1498  d->salloc = 1;
1499  d->sindex = 0;
1500  MALLOC(d->states, dfa_state, d->salloc);
1501  state_index(d, &merged, wants_newline, 0);
1502
1503  free(o_nullable);
1504  free(o_nfirst);
1505  free(o_firstpos);
1506  free(o_nlast);
1507  free(o_lastpos);
1508  free(nalloc);
1509  free(merged.elems);
1510}
1511
1512/* Find, for each character, the transition out of state s of d, and store
1513   it in the appropriate slot of trans.
1514
1515   We divide the positions of s into groups (positions can appear in more
1516   than one group).  Each group is labeled with a set of characters that
1517   every position in the group matches (taking into account, if necessary,
1518   preceding context information of s).  For each group, find the union
1519   of the its elements' follows.  This set is the set of positions of the
1520   new state.  For each character in the group's label, set the transition
1521   on this character to be to a state corresponding to the set's positions,
1522   and its associated backward context information, if necessary.
1523
1524   If we are building a searching matcher, we include the positions of state
1525   0 in every state.
1526
1527   The collection of groups is constructed by building an equivalence-class
1528   partition of the positions of s.
1529
1530   For each position, find the set of characters C that it matches.  Eliminate
1531   any characters from C that fail on grounds of backward context.
1532
1533   Search through the groups, looking for a group whose label L has nonempty
1534   intersection with C.  If L - C is nonempty, create a new group labeled
1535   L - C and having the same positions as the current group, and set L to
1536   the intersection of L and C.  Insert the position in this group, set
1537   C = C - L, and resume scanning.
1538
1539   If after comparing with every group there are characters remaining in C,
1540   create a new group labeled with the characters of C and insert this
1541   position in that group. */
1542void
1543dfastate (int s, struct dfa *d, int trans[])
1544{
1545  position_set grps[NOTCHAR];	/* As many as will ever be needed. */
1546  charclass labels[NOTCHAR];	/* Labels corresponding to the groups. */
1547  int ngrps = 0;		/* Number of groups actually used. */
1548  position pos;			/* Current position being considered. */
1549  charclass matches;		/* Set of matching characters. */
1550  int matchesf;			/* True if matches is nonempty. */
1551  charclass intersect;		/* Intersection with some label set. */
1552  int intersectf;		/* True if intersect is nonempty. */
1553  charclass leftovers;		/* Stuff in the label that didn't match. */
1554  int leftoversf;		/* True if leftovers is nonempty. */
1555  static charclass letters;	/* Set of characters considered letters. */
1556  static charclass newline;	/* Set of characters that aren't newline. */
1557  position_set follows;		/* Union of the follows of some group. */
1558  position_set tmp;		/* Temporary space for merging sets. */
1559  int state;			/* New state. */
1560  int wants_newline;		/* New state wants to know newline context. */
1561  int state_newline;		/* New state on a newline transition. */
1562  int wants_letter;		/* New state wants to know letter context. */
1563  int state_letter;		/* New state on a letter transition. */
1564  static int initialized;	/* Flag for static initialization. */
1565  int i, j, k;
1566
1567  /* Initialize the set of letters, if necessary. */
1568  if (! initialized)
1569    {
1570      initialized = 1;
1571      for (i = 0; i < NOTCHAR; ++i)
1572	if (IS_WORD_CONSTITUENT(i))
1573	  setbit(i, letters);
1574      setbit(eolbyte, newline);
1575    }
1576
1577  zeroset(matches);
1578
1579  for (i = 0; i < d->states[s].elems.nelem; ++i)
1580    {
1581      pos = d->states[s].elems.elems[i];
1582      if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
1583	setbit(d->tokens[pos.index], matches);
1584      else if (d->tokens[pos.index] >= CSET)
1585	copyset(d->charclasses[d->tokens[pos.index] - CSET], matches);
1586      else
1587	continue;
1588
1589      /* Some characters may need to be eliminated from matches because
1590	 they fail in the current context. */
1591      if (pos.constraint != 0xFF)
1592	{
1593	  if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
1594					 d->states[s].newline, 1))
1595	    clrbit(eolbyte, matches);
1596	  if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
1597					 d->states[s].newline, 0))
1598	    for (j = 0; j < CHARCLASS_INTS; ++j)
1599	      matches[j] &= newline[j];
1600	  if (! MATCHES_LETTER_CONTEXT(pos.constraint,
1601					d->states[s].letter, 1))
1602	    for (j = 0; j < CHARCLASS_INTS; ++j)
1603	      matches[j] &= ~letters[j];
1604	  if (! MATCHES_LETTER_CONTEXT(pos.constraint,
1605					d->states[s].letter, 0))
1606	    for (j = 0; j < CHARCLASS_INTS; ++j)
1607	      matches[j] &= letters[j];
1608
1609	  /* If there are no characters left, there's no point in going on. */
1610	  for (j = 0; j < CHARCLASS_INTS && !matches[j]; ++j)
1611	    continue;
1612	  if (j == CHARCLASS_INTS)
1613	    continue;
1614	}
1615
1616      for (j = 0; j < ngrps; ++j)
1617	{
1618	  /* If matches contains a single character only, and the current
1619	     group's label doesn't contain that character, go on to the
1620	     next group. */
1621	  if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR
1622	      && !tstbit(d->tokens[pos.index], labels[j]))
1623	    continue;
1624
1625	  /* Check if this group's label has a nonempty intersection with
1626	     matches. */
1627	  intersectf = 0;
1628	  for (k = 0; k < CHARCLASS_INTS; ++k)
1629	    (intersect[k] = matches[k] & labels[j][k]) ? (intersectf = 1) : 0;
1630	  if (! intersectf)
1631	    continue;
1632
1633	  /* It does; now find the set differences both ways. */
1634	  leftoversf = matchesf = 0;
1635	  for (k = 0; k < CHARCLASS_INTS; ++k)
1636	    {
1637	      /* Even an optimizing compiler can't know this for sure. */
1638	      int match = matches[k], label = labels[j][k];
1639
1640	      (leftovers[k] = ~match & label) ? (leftoversf = 1) : 0;
1641	      (matches[k] = match & ~label) ? (matchesf = 1) : 0;
1642	    }
1643
1644	  /* If there were leftovers, create a new group labeled with them. */
1645	  if (leftoversf)
1646	    {
1647	      copyset(leftovers, labels[ngrps]);
1648	      copyset(intersect, labels[j]);
1649	      MALLOC(grps[ngrps].elems, position, d->nleaves);
1650	      copy(&grps[j], &grps[ngrps]);
1651	      ++ngrps;
1652	    }
1653
1654	  /* Put the position in the current group.  Note that there is no
1655	     reason to call insert() here. */
1656	  grps[j].elems[grps[j].nelem++] = pos;
1657
1658	  /* If every character matching the current position has been
1659	     accounted for, we're done. */
1660	  if (! matchesf)
1661	    break;
1662	}
1663
1664      /* If we've passed the last group, and there are still characters
1665	 unaccounted for, then we'll have to create a new group. */
1666      if (j == ngrps)
1667	{
1668	  copyset(matches, labels[ngrps]);
1669	  zeroset(matches);
1670	  MALLOC(grps[ngrps].elems, position, d->nleaves);
1671	  grps[ngrps].nelem = 1;
1672	  grps[ngrps].elems[0] = pos;
1673	  ++ngrps;
1674	}
1675    }
1676
1677  MALLOC(follows.elems, position, d->nleaves);
1678  MALLOC(tmp.elems, position, d->nleaves);
1679
1680  /* If we are a searching matcher, the default transition is to a state
1681     containing the positions of state 0, otherwise the default transition
1682     is to fail miserably. */
1683  if (d->searchflag)
1684    {
1685      wants_newline = 0;
1686      wants_letter = 0;
1687      for (i = 0; i < d->states[0].elems.nelem; ++i)
1688	{
1689	  if (PREV_NEWLINE_DEPENDENT(d->states[0].elems.elems[i].constraint))
1690	    wants_newline = 1;
1691	  if (PREV_LETTER_DEPENDENT(d->states[0].elems.elems[i].constraint))
1692	    wants_letter = 1;
1693	}
1694      copy(&d->states[0].elems, &follows);
1695      state = state_index(d, &follows, 0, 0);
1696      if (wants_newline)
1697	state_newline = state_index(d, &follows, 1, 0);
1698      else
1699	state_newline = state;
1700      if (wants_letter)
1701	state_letter = state_index(d, &follows, 0, 1);
1702      else
1703	state_letter = state;
1704      for (i = 0; i < NOTCHAR; ++i)
1705	trans[i] = (IS_WORD_CONSTITUENT(i)) ? state_letter : state;
1706      trans[eolbyte] = state_newline;
1707    }
1708  else
1709    for (i = 0; i < NOTCHAR; ++i)
1710      trans[i] = -1;
1711
1712  for (i = 0; i < ngrps; ++i)
1713    {
1714      follows.nelem = 0;
1715
1716      /* Find the union of the follows of the positions of the group.
1717	 This is a hideously inefficient loop.  Fix it someday. */
1718      for (j = 0; j < grps[i].nelem; ++j)
1719	for (k = 0; k < d->follows[grps[i].elems[j].index].nelem; ++k)
1720	  insert(d->follows[grps[i].elems[j].index].elems[k], &follows);
1721
1722      /* If we are building a searching matcher, throw in the positions
1723	 of state 0 as well. */
1724      if (d->searchflag)
1725	for (j = 0; j < d->states[0].elems.nelem; ++j)
1726	  insert(d->states[0].elems.elems[j], &follows);
1727
1728      /* Find out if the new state will want any context information. */
1729      wants_newline = 0;
1730      if (tstbit(eolbyte, labels[i]))
1731	for (j = 0; j < follows.nelem; ++j)
1732	  if (PREV_NEWLINE_DEPENDENT(follows.elems[j].constraint))
1733	    wants_newline = 1;
1734
1735      wants_letter = 0;
1736      for (j = 0; j < CHARCLASS_INTS; ++j)
1737	if (labels[i][j] & letters[j])
1738	  break;
1739      if (j < CHARCLASS_INTS)
1740	for (j = 0; j < follows.nelem; ++j)
1741	  if (PREV_LETTER_DEPENDENT(follows.elems[j].constraint))
1742	    wants_letter = 1;
1743
1744      /* Find the state(s) corresponding to the union of the follows. */
1745      state = state_index(d, &follows, 0, 0);
1746      if (wants_newline)
1747	state_newline = state_index(d, &follows, 1, 0);
1748      else
1749	state_newline = state;
1750      if (wants_letter)
1751	state_letter = state_index(d, &follows, 0, 1);
1752      else
1753	state_letter = state;
1754
1755      /* Set the transitions for each character in the current label. */
1756      for (j = 0; j < CHARCLASS_INTS; ++j)
1757	for (k = 0; k < INTBITS; ++k)
1758	  if (labels[i][j] & 1 << k)
1759	    {
1760	      int c = j * INTBITS + k;
1761
1762	      if (c == eolbyte)
1763		trans[c] = state_newline;
1764	      else if (IS_WORD_CONSTITUENT(c))
1765		trans[c] = state_letter;
1766	      else if (c < NOTCHAR)
1767		trans[c] = state;
1768	    }
1769    }
1770
1771  for (i = 0; i < ngrps; ++i)
1772    free(grps[i].elems);
1773  free(follows.elems);
1774  free(tmp.elems);
1775}
1776
1777/* Some routines for manipulating a compiled dfa's transition tables.
1778   Each state may or may not have a transition table; if it does, and it
1779   is a non-accepting state, then d->trans[state] points to its table.
1780   If it is an accepting state then d->fails[state] points to its table.
1781   If it has no table at all, then d->trans[state] is NULL.
1782   TODO: Improve this comment, get rid of the unnecessary redundancy. */
1783
1784static void
1785build_state (int s, struct dfa *d)
1786{
1787  int *trans;			/* The new transition table. */
1788  int i;
1789
1790  /* Set an upper limit on the number of transition tables that will ever
1791     exist at once.  1024 is arbitrary.  The idea is that the frequently
1792     used transition tables will be quickly rebuilt, whereas the ones that
1793     were only needed once or twice will be cleared away. */
1794  if (d->trcount >= 1024)
1795    {
1796      for (i = 0; i < d->tralloc; ++i)
1797	if (d->trans[i])
1798	  {
1799	    free((ptr_t) d->trans[i]);
1800	    d->trans[i] = NULL;
1801	  }
1802	else if (d->fails[i])
1803	  {
1804	    free((ptr_t) d->fails[i]);
1805	    d->fails[i] = NULL;
1806	  }
1807      d->trcount = 0;
1808    }
1809
1810  ++d->trcount;
1811
1812  /* Set up the success bits for this state. */
1813  d->success[s] = 0;
1814  if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 1, d->states[s].letter, 0,
1815      s, *d))
1816    d->success[s] |= 4;
1817  if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 1,
1818      s, *d))
1819    d->success[s] |= 2;
1820  if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 0,
1821      s, *d))
1822    d->success[s] |= 1;
1823
1824  MALLOC(trans, int, NOTCHAR);
1825  dfastate(s, d, trans);
1826
1827  /* Now go through the new transition table, and make sure that the trans
1828     and fail arrays are allocated large enough to hold a pointer for the
1829     largest state mentioned in the table. */
1830  for (i = 0; i < NOTCHAR; ++i)
1831    if (trans[i] >= d->tralloc)
1832      {
1833	int oldalloc = d->tralloc;
1834
1835	while (trans[i] >= d->tralloc)
1836	  d->tralloc *= 2;
1837	REALLOC(d->realtrans, int *, d->tralloc + 1);
1838	d->trans = d->realtrans + 1;
1839	REALLOC(d->fails, int *, d->tralloc);
1840	REALLOC(d->success, int, d->tralloc);
1841	REALLOC(d->newlines, int, d->tralloc);
1842	while (oldalloc < d->tralloc)
1843	  {
1844	    d->trans[oldalloc] = NULL;
1845	    d->fails[oldalloc++] = NULL;
1846	  }
1847      }
1848
1849  /* Keep the newline transition in a special place so we can use it as
1850     a sentinel. */
1851  d->newlines[s] = trans[eolbyte];
1852  trans[eolbyte] = -1;
1853
1854  if (ACCEPTING(s, *d))
1855    d->fails[s] = trans;
1856  else
1857    d->trans[s] = trans;
1858}
1859
1860static void
1861build_state_zero (struct dfa *d)
1862{
1863  d->tralloc = 1;
1864  d->trcount = 0;
1865  CALLOC(d->realtrans, int *, d->tralloc + 1);
1866  d->trans = d->realtrans + 1;
1867  CALLOC(d->fails, int *, d->tralloc);
1868  MALLOC(d->success, int, d->tralloc);
1869  MALLOC(d->newlines, int, d->tralloc);
1870  build_state(0, d);
1871}
1872
1873/* Search through a buffer looking for a match to the given struct dfa.
1874   Find the first occurrence of a string matching the regexp in the buffer,
1875   and the shortest possible version thereof.  Return a pointer to the first
1876   character after the match, or NULL if none is found.  Begin points to
1877   the beginning of the buffer, and end points to the first character after
1878   its end.  We store a newline in *end to act as a sentinel, so end had
1879   better point somewhere valid.  Newline is a flag indicating whether to
1880   allow newlines to be in the matching string.  If count is non-
1881   NULL it points to a place we're supposed to increment every time we
1882   see a newline.  Finally, if backref is non-NULL it points to a place
1883   where we're supposed to store a 1 if backreferencing happened and the
1884   match needs to be verified by a backtracking matcher.  Otherwise
1885   we store a 0 in *backref. */
1886char *
1887dfaexec (struct dfa *d, char *begin, char *end,
1888	 int newline, int *count, int *backref)
1889{
1890  register int s, s1, tmp;	/* Current state. */
1891  register unsigned char *p;	/* Current input character. */
1892  register int **trans, *t;	/* Copy of d->trans so it can be optimized
1893				   into a register. */
1894  register unsigned char eol = eolbyte;	/* Likewise for eolbyte.  */
1895  static int sbit[NOTCHAR];	/* Table for anding with d->success. */
1896  static int sbit_init;
1897
1898  if (! sbit_init)
1899    {
1900      int i;
1901
1902      sbit_init = 1;
1903      for (i = 0; i < NOTCHAR; ++i)
1904	sbit[i] = (IS_WORD_CONSTITUENT(i)) ? 2 : 1;
1905      sbit[eol] = 4;
1906    }
1907
1908  if (! d->tralloc)
1909    build_state_zero(d);
1910
1911  s = s1 = 0;
1912  p = (unsigned char *) begin;
1913  trans = d->trans;
1914  *end = eol;
1915
1916  for (;;)
1917    {
1918      while ((t = trans[s]) != 0) { /* hand-optimized loop */
1919	s1 = t[*p++];
1920        if ((t = trans[s1]) == 0) {
1921           tmp = s ; s = s1 ; s1 = tmp ; /* swap */
1922           break;
1923        }
1924	s = t[*p++];
1925      }
1926
1927      if (s >= 0 && p <= (unsigned char *) end && d->fails[s])
1928	{
1929	  if (d->success[s] & sbit[*p])
1930	    {
1931	      if (backref)
1932		*backref = (d->states[s].backref != 0);
1933	      return (char *) p;
1934	    }
1935
1936	  s1 = s;
1937	  s = d->fails[s][*p++];
1938	  continue;
1939	}
1940
1941      /* If the previous character was a newline, count it. */
1942      if (count && (char *) p <= end && p[-1] == eol)
1943	++*count;
1944
1945      /* Check if we've run off the end of the buffer. */
1946      if ((char *) p > end)
1947	return NULL;
1948
1949      if (s >= 0)
1950	{
1951	  build_state(s, d);
1952	  trans = d->trans;
1953	  continue;
1954	}
1955
1956      if (p[-1] == eol && newline)
1957	{
1958	  s = d->newlines[s1];
1959	  continue;
1960	}
1961
1962      s = 0;
1963    }
1964}
1965
1966/* Initialize the components of a dfa that the other routines don't
1967   initialize for themselves. */
1968void
1969dfainit (struct dfa *d)
1970{
1971  d->calloc = 1;
1972  MALLOC(d->charclasses, charclass, d->calloc);
1973  d->cindex = 0;
1974
1975  d->talloc = 1;
1976  MALLOC(d->tokens, token, d->talloc);
1977  d->tindex = d->depth = d->nleaves = d->nregexps = 0;
1978
1979  d->searchflag = 0;
1980  d->tralloc = 0;
1981
1982  d->musts = 0;
1983}
1984
1985/* Parse and analyze a single string of the given length. */
1986void
1987dfacomp (char *s, size_t len, struct dfa *d, int searchflag)
1988{
1989  if (case_fold)	/* dummy folding in service of dfamust() */
1990    {
1991      char *lcopy;
1992      int i;
1993
1994      lcopy = malloc(len);
1995      if (!lcopy)
1996	dfaerror(_("out of memory"));
1997
1998      /* This is a kludge. */
1999      case_fold = 0;
2000      for (i = 0; i < len; ++i)
2001	if (ISUPPER ((unsigned char) s[i]))
2002	  lcopy[i] = tolower ((unsigned char) s[i]);
2003	else
2004	  lcopy[i] = s[i];
2005
2006      dfainit(d);
2007      dfaparse(lcopy, len, d);
2008      free(lcopy);
2009      dfamust(d);
2010      d->cindex = d->tindex = d->depth = d->nleaves = d->nregexps = 0;
2011      case_fold = 1;
2012      dfaparse(s, len, d);
2013      dfaanalyze(d, searchflag);
2014    }
2015  else
2016    {
2017        dfainit(d);
2018        dfaparse(s, len, d);
2019	dfamust(d);
2020        dfaanalyze(d, searchflag);
2021    }
2022}
2023
2024/* Free the storage held by the components of a dfa. */
2025void
2026dfafree (struct dfa *d)
2027{
2028  int i;
2029  struct dfamust *dm, *ndm;
2030
2031  free((ptr_t) d->charclasses);
2032  free((ptr_t) d->tokens);
2033  for (i = 0; i < d->sindex; ++i)
2034    free((ptr_t) d->states[i].elems.elems);
2035  free((ptr_t) d->states);
2036  for (i = 0; i < d->tindex; ++i)
2037    if (d->follows[i].elems)
2038      free((ptr_t) d->follows[i].elems);
2039  free((ptr_t) d->follows);
2040  for (i = 0; i < d->tralloc; ++i)
2041    if (d->trans[i])
2042      free((ptr_t) d->trans[i]);
2043    else if (d->fails[i])
2044      free((ptr_t) d->fails[i]);
2045  if (d->realtrans) free((ptr_t) d->realtrans);
2046  if (d->fails) free((ptr_t) d->fails);
2047  if (d->newlines) free((ptr_t) d->newlines);
2048  if (d->success) free((ptr_t) d->success);
2049  for (dm = d->musts; dm; dm = ndm)
2050    {
2051      ndm = dm->next;
2052      free(dm->must);
2053      free((ptr_t) dm);
2054    }
2055}
2056
2057/* Having found the postfix representation of the regular expression,
2058   try to find a long sequence of characters that must appear in any line
2059   containing the r.e.
2060   Finding a "longest" sequence is beyond the scope here;
2061   we take an easy way out and hope for the best.
2062   (Take "(ab|a)b"--please.)
2063
2064   We do a bottom-up calculation of sequences of characters that must appear
2065   in matches of r.e.'s represented by trees rooted at the nodes of the postfix
2066   representation:
2067	sequences that must appear at the left of the match ("left")
2068	sequences that must appear at the right of the match ("right")
2069	lists of sequences that must appear somewhere in the match ("in")
2070	sequences that must constitute the match ("is")
2071
2072   When we get to the root of the tree, we use one of the longest of its
2073   calculated "in" sequences as our answer.  The sequence we find is returned in
2074   d->must (where "d" is the single argument passed to "dfamust");
2075   the length of the sequence is returned in d->mustn.
2076
2077   The sequences calculated for the various types of node (in pseudo ANSI c)
2078   are shown below.  "p" is the operand of unary operators (and the left-hand
2079   operand of binary operators); "q" is the right-hand operand of binary
2080   operators.
2081
2082   "ZERO" means "a zero-length sequence" below.
2083
2084	Type	left		right		is		in
2085	----	----		-----		--		--
2086	char c	# c		# c		# c		# c
2087
2088	CSET	ZERO		ZERO		ZERO		ZERO
2089
2090	STAR	ZERO		ZERO		ZERO		ZERO
2091
2092	QMARK	ZERO		ZERO		ZERO		ZERO
2093
2094	PLUS	p->left		p->right	ZERO		p->in
2095
2096	CAT	(p->is==ZERO)?	(q->is==ZERO)?	(p->is!=ZERO &&	p->in plus
2097		p->left :	q->right :	q->is!=ZERO) ?	q->in plus
2098		p->is##q->left	p->right##q->is	p->is##q->is :	p->right##q->left
2099						ZERO
2100
2101	OR	longest common	longest common	(do p->is and	substrings common to
2102		leading		trailing	q->is have same	p->in and q->in
2103		(sub)sequence	(sub)sequence	length and
2104		of p->left	of p->right	content) ?
2105		and q->left	and q->right	p->is : NULL
2106
2107   If there's anything else we recognize in the tree, all four sequences get set
2108   to zero-length sequences.  If there's something we don't recognize in the tree,
2109   we just return a zero-length sequence.
2110
2111   Break ties in favor of infrequent letters (choosing 'zzz' in preference to
2112   'aaa')?
2113
2114   And. . .is it here or someplace that we might ponder "optimizations" such as
2115	egrep 'psi|epsilon'	->	egrep 'psi'
2116	egrep 'pepsi|epsilon'	->	egrep 'epsi'
2117					(Yes, we now find "epsi" as a "string
2118					that must occur", but we might also
2119					simplify the *entire* r.e. being sought)
2120	grep '[c]'		->	grep 'c'
2121	grep '(ab|a)b'		->	grep 'ab'
2122	grep 'ab*'		->	grep 'a'
2123	grep 'a*b'		->	grep 'b'
2124
2125   There are several issues:
2126
2127   Is optimization easy (enough)?
2128
2129   Does optimization actually accomplish anything,
2130   or is the automaton you get from "psi|epsilon" (for example)
2131   the same as the one you get from "psi" (for example)?
2132
2133   Are optimizable r.e.'s likely to be used in real-life situations
2134   (something like 'ab*' is probably unlikely; something like is
2135   'psi|epsilon' is likelier)? */
2136
2137static char *
2138icatalloc (char *old, char *new)
2139{
2140  char *result;
2141  size_t oldsize, newsize;
2142
2143  newsize = (new == NULL) ? 0 : strlen(new);
2144  if (old == NULL)
2145    oldsize = 0;
2146  else if (newsize == 0)
2147    return old;
2148  else	oldsize = strlen(old);
2149  if (old == NULL)
2150    result = (char *) malloc(newsize + 1);
2151  else
2152    result = (char *) realloc((void *) old, oldsize + newsize + 1);
2153  if (result != NULL && new != NULL)
2154    (void) strcpy(result + oldsize, new);
2155  return result;
2156}
2157
2158static char *
2159icpyalloc (char *string)
2160{
2161  return icatalloc((char *) NULL, string);
2162}
2163
2164static char *
2165istrstr (char *lookin, char *lookfor)
2166{
2167  char *cp;
2168  size_t len;
2169
2170  len = strlen(lookfor);
2171  for (cp = lookin; *cp != '\0'; ++cp)
2172    if (strncmp(cp, lookfor, len) == 0)
2173      return cp;
2174  return NULL;
2175}
2176
2177static void
2178ifree (char *cp)
2179{
2180  if (cp != NULL)
2181    free(cp);
2182}
2183
2184static void
2185freelist (char **cpp)
2186{
2187  int i;
2188
2189  if (cpp == NULL)
2190    return;
2191  for (i = 0; cpp[i] != NULL; ++i)
2192    {
2193      free(cpp[i]);
2194      cpp[i] = NULL;
2195    }
2196}
2197
2198static char **
2199enlist (char **cpp, char *new, size_t len)
2200{
2201  int i, j;
2202
2203  if (cpp == NULL)
2204    return NULL;
2205  if ((new = icpyalloc(new)) == NULL)
2206    {
2207      freelist(cpp);
2208      return NULL;
2209    }
2210  new[len] = '\0';
2211  /* Is there already something in the list that's new (or longer)? */
2212  for (i = 0; cpp[i] != NULL; ++i)
2213    if (istrstr(cpp[i], new) != NULL)
2214      {
2215	free(new);
2216	return cpp;
2217      }
2218  /* Eliminate any obsoleted strings. */
2219  j = 0;
2220  while (cpp[j] != NULL)
2221    if (istrstr(new, cpp[j]) == NULL)
2222      ++j;
2223    else
2224      {
2225	free(cpp[j]);
2226	if (--i == j)
2227	  break;
2228	cpp[j] = cpp[i];
2229	cpp[i] = NULL;
2230      }
2231  /* Add the new string. */
2232  cpp = (char **) realloc((char *) cpp, (i + 2) * sizeof *cpp);
2233  if (cpp == NULL)
2234    return NULL;
2235  cpp[i] = new;
2236  cpp[i + 1] = NULL;
2237  return cpp;
2238}
2239
2240/* Given pointers to two strings, return a pointer to an allocated
2241   list of their distinct common substrings. Return NULL if something
2242   seems wild. */
2243static char **
2244comsubs (char *left, char *right)
2245{
2246  char **cpp;
2247  char *lcp;
2248  char *rcp;
2249  size_t i, len;
2250
2251  if (left == NULL || right == NULL)
2252    return NULL;
2253  cpp = (char **) malloc(sizeof *cpp);
2254  if (cpp == NULL)
2255    return NULL;
2256  cpp[0] = NULL;
2257  for (lcp = left; *lcp != '\0'; ++lcp)
2258    {
2259      len = 0;
2260      rcp = index(right, *lcp);
2261      while (rcp != NULL)
2262	{
2263	  for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
2264	    continue;
2265	  if (i > len)
2266	    len = i;
2267	  rcp = index(rcp + 1, *lcp);
2268	}
2269      if (len == 0)
2270	continue;
2271      if ((cpp = enlist(cpp, lcp, len)) == NULL)
2272	break;
2273    }
2274  return cpp;
2275}
2276
2277static char **
2278addlists (char **old, char **new)
2279{
2280  int i;
2281
2282  if (old == NULL || new == NULL)
2283    return NULL;
2284  for (i = 0; new[i] != NULL; ++i)
2285    {
2286      old = enlist(old, new[i], strlen(new[i]));
2287      if (old == NULL)
2288	break;
2289    }
2290  return old;
2291}
2292
2293/* Given two lists of substrings, return a new list giving substrings
2294   common to both. */
2295static char **
2296inboth (char **left, char **right)
2297{
2298  char **both;
2299  char **temp;
2300  int lnum, rnum;
2301
2302  if (left == NULL || right == NULL)
2303    return NULL;
2304  both = (char **) malloc(sizeof *both);
2305  if (both == NULL)
2306    return NULL;
2307  both[0] = NULL;
2308  for (lnum = 0; left[lnum] != NULL; ++lnum)
2309    {
2310      for (rnum = 0; right[rnum] != NULL; ++rnum)
2311	{
2312	  temp = comsubs(left[lnum], right[rnum]);
2313	  if (temp == NULL)
2314	    {
2315	      freelist(both);
2316	      return NULL;
2317	    }
2318	  both = addlists(both, temp);
2319	  freelist(temp);
2320	  free(temp);
2321	  if (both == NULL)
2322	    return NULL;
2323	}
2324    }
2325  return both;
2326}
2327
2328typedef struct
2329{
2330  char **in;
2331  char *left;
2332  char *right;
2333  char *is;
2334} must;
2335
2336static void
2337resetmust (must *mp)
2338{
2339  mp->left[0] = mp->right[0] = mp->is[0] = '\0';
2340  freelist(mp->in);
2341}
2342
2343static void
2344dfamust (struct dfa *dfa)
2345{
2346  must *musts;
2347  must *mp;
2348  char *result;
2349  int ri;
2350  int i;
2351  int exact;
2352  token t;
2353  static must must0;
2354  struct dfamust *dm;
2355  static char empty_string[] = "";
2356
2357  result = empty_string;
2358  exact = 0;
2359  musts = (must *) malloc((dfa->tindex + 1) * sizeof *musts);
2360  if (musts == NULL)
2361    return;
2362  mp = musts;
2363  for (i = 0; i <= dfa->tindex; ++i)
2364    mp[i] = must0;
2365  for (i = 0; i <= dfa->tindex; ++i)
2366    {
2367      mp[i].in = (char **) malloc(sizeof *mp[i].in);
2368      mp[i].left = malloc(2);
2369      mp[i].right = malloc(2);
2370      mp[i].is = malloc(2);
2371      if (mp[i].in == NULL || mp[i].left == NULL ||
2372	  mp[i].right == NULL || mp[i].is == NULL)
2373	goto done;
2374      mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0';
2375      mp[i].in[0] = NULL;
2376    }
2377#ifdef DEBUG
2378  fprintf(stderr, "dfamust:\n");
2379  for (i = 0; i < dfa->tindex; ++i)
2380    {
2381      fprintf(stderr, " %d:", i);
2382      prtok(dfa->tokens[i]);
2383    }
2384  putc('\n', stderr);
2385#endif
2386  for (ri = 0; ri < dfa->tindex; ++ri)
2387    {
2388      switch (t = dfa->tokens[ri])
2389	{
2390	case LPAREN:
2391	case RPAREN:
2392	  goto done;		/* "cannot happen" */
2393	case EMPTY:
2394	case BEGLINE:
2395	case ENDLINE:
2396	case BEGWORD:
2397	case ENDWORD:
2398	case LIMWORD:
2399	case NOTLIMWORD:
2400	case BACKREF:
2401	  resetmust(mp);
2402	  break;
2403	case STAR:
2404	case QMARK:
2405	  if (mp <= musts)
2406	    goto done;		/* "cannot happen" */
2407	  --mp;
2408	  resetmust(mp);
2409	  break;
2410	case OR:
2411	case ORTOP:
2412	  if (mp < &musts[2])
2413	    goto done;		/* "cannot happen" */
2414	  {
2415	    char **new;
2416	    must *lmp;
2417	    must *rmp;
2418	    int j, ln, rn, n;
2419
2420	    rmp = --mp;
2421	    lmp = --mp;
2422	    /* Guaranteed to be.  Unlikely, but. . . */
2423	    if (strcmp(lmp->is, rmp->is) != 0)
2424	      lmp->is[0] = '\0';
2425	    /* Left side--easy */
2426	    i = 0;
2427	    while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
2428	      ++i;
2429	    lmp->left[i] = '\0';
2430	    /* Right side */
2431	    ln = strlen(lmp->right);
2432	    rn = strlen(rmp->right);
2433	    n = ln;
2434	    if (n > rn)
2435	      n = rn;
2436	    for (i = 0; i < n; ++i)
2437	      if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
2438		break;
2439	    for (j = 0; j < i; ++j)
2440	      lmp->right[j] = lmp->right[(ln - i) + j];
2441	    lmp->right[j] = '\0';
2442	    new = inboth(lmp->in, rmp->in);
2443	    if (new == NULL)
2444	      goto done;
2445	    freelist(lmp->in);
2446	    free((char *) lmp->in);
2447	    lmp->in = new;
2448	  }
2449	  break;
2450	case PLUS:
2451	  if (mp <= musts)
2452	    goto done;		/* "cannot happen" */
2453	  --mp;
2454	  mp->is[0] = '\0';
2455	  break;
2456	case END:
2457	  if (mp != &musts[1])
2458	    goto done;		/* "cannot happen" */
2459	  for (i = 0; musts[0].in[i] != NULL; ++i)
2460	    if (strlen(musts[0].in[i]) > strlen(result))
2461	      result = musts[0].in[i];
2462	  if (strcmp(result, musts[0].is) == 0)
2463	    exact = 1;
2464	  goto done;
2465	case CAT:
2466	  if (mp < &musts[2])
2467	    goto done;		/* "cannot happen" */
2468	  {
2469	    must *lmp;
2470	    must *rmp;
2471
2472	    rmp = --mp;
2473	    lmp = --mp;
2474	    /* In.  Everything in left, plus everything in
2475	       right, plus catenation of
2476	       left's right and right's left. */
2477	    lmp->in = addlists(lmp->in, rmp->in);
2478	    if (lmp->in == NULL)
2479	      goto done;
2480	    if (lmp->right[0] != '\0' &&
2481		rmp->left[0] != '\0')
2482	      {
2483		char *tp;
2484
2485		tp = icpyalloc(lmp->right);
2486		if (tp == NULL)
2487		  goto done;
2488		tp = icatalloc(tp, rmp->left);
2489		if (tp == NULL)
2490		  goto done;
2491		lmp->in = enlist(lmp->in, tp,
2492				 strlen(tp));
2493		free(tp);
2494		if (lmp->in == NULL)
2495		  goto done;
2496	      }
2497	    /* Left-hand */
2498	    if (lmp->is[0] != '\0')
2499	      {
2500		lmp->left = icatalloc(lmp->left,
2501				      rmp->left);
2502		if (lmp->left == NULL)
2503		  goto done;
2504	      }
2505	    /* Right-hand */
2506	    if (rmp->is[0] == '\0')
2507	      lmp->right[0] = '\0';
2508	    lmp->right = icatalloc(lmp->right, rmp->right);
2509	    if (lmp->right == NULL)
2510	      goto done;
2511	    /* Guaranteed to be */
2512	    if (lmp->is[0] != '\0' && rmp->is[0] != '\0')
2513	      {
2514		lmp->is = icatalloc(lmp->is, rmp->is);
2515		if (lmp->is == NULL)
2516		  goto done;
2517	      }
2518	    else
2519	      lmp->is[0] = '\0';
2520	  }
2521	  break;
2522	default:
2523	  if (t < END)
2524	    {
2525	      /* "cannot happen" */
2526	      goto done;
2527	    }
2528	  else if (t == '\0')
2529	    {
2530	      /* not on *my* shift */
2531	      goto done;
2532	    }
2533	  else if (t >= CSET)
2534	    {
2535	      /* easy enough */
2536	      resetmust(mp);
2537	    }
2538	  else
2539	    {
2540	      /* plain character */
2541	      resetmust(mp);
2542	      mp->is[0] = mp->left[0] = mp->right[0] = t;
2543	      mp->is[1] = mp->left[1] = mp->right[1] = '\0';
2544	      mp->in = enlist(mp->in, mp->is, (size_t)1);
2545	      if (mp->in == NULL)
2546		goto done;
2547	    }
2548	  break;
2549	}
2550#ifdef DEBUG
2551      fprintf(stderr, " node: %d:", ri);
2552      prtok(dfa->tokens[ri]);
2553      fprintf(stderr, "\n  in:");
2554      for (i = 0; mp->in[i]; ++i)
2555	fprintf(stderr, " \"%s\"", mp->in[i]);
2556      fprintf(stderr, "\n  is: \"%s\"\n", mp->is);
2557      fprintf(stderr, "  left: \"%s\"\n", mp->left);
2558      fprintf(stderr, "  right: \"%s\"\n", mp->right);
2559#endif
2560      ++mp;
2561    }
2562 done:
2563  if (strlen(result))
2564    {
2565      dm = (struct dfamust *) malloc(sizeof (struct dfamust));
2566      dm->exact = exact;
2567      dm->must = malloc(strlen(result) + 1);
2568      strcpy(dm->must, result);
2569      dm->next = dfa->musts;
2570      dfa->musts = dm;
2571    }
2572  mp = musts;
2573  for (i = 0; i <= dfa->tindex; ++i)
2574    {
2575      freelist(mp[i].in);
2576      ifree((char *) mp[i].in);
2577      ifree(mp[i].left);
2578      ifree(mp[i].right);
2579      ifree(mp[i].is);
2580    }
2581  free((char *) mp);
2582}
2583