dfa.c revision 55379
1/* dfa.c - deterministic extended regexp routines for GNU
2   Copyright (C) 1988, 1998 Free Software Foundation, Inc.
3
4   This program is free software; you can redistribute it and/or modify
5   it under the terms of the GNU General Public License as published by
6   the Free Software Foundation; either version 2, or (at your option)
7   any later version.
8
9   This program is distributed in the hope that it will be useful,
10   but WITHOUT ANY WARRANTY; without even the implied warranty of
11   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12   GNU General Public License for more details.
13
14   You should have received a copy of the GNU General Public License
15   along with this program; if not, write to the Free Software
16   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA */
17
18/* Written June, 1988 by Mike Haertel
19   Modified July, 1988 by Arthur David Olson to assist BMG speedups  */
20
21/* $FreeBSD: head/gnu/usr.bin/grep/dfa.c 55379 2000-01-04 03:25:40Z obrien $ */
22
23#ifdef HAVE_CONFIG_H
24#include <config.h>
25#endif
26
27#include <assert.h>
28#include <ctype.h>
29#include <stdio.h>
30
31#include <sys/types.h>
32#ifdef STDC_HEADERS
33#include <stdlib.h>
34#else
35extern char *calloc(), *malloc(), *realloc();
36extern void free();
37#endif
38
39#if defined(HAVE_STRING_H) || defined(STDC_HEADERS)
40#include <string.h>
41#undef index
42#define index strchr
43#else
44#include <strings.h>
45#endif
46
47#ifndef DEBUG	/* use the same approach as regex.c */
48#undef assert
49#define assert(e)
50#endif /* DEBUG */
51
52#ifndef isgraph
53#define isgraph(C) (isprint(C) && !isspace(C))
54#endif
55
56#if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
57#define ISALPHA(C) isalpha(C)
58#define ISUPPER(C) isupper(C)
59#define ISLOWER(C) islower(C)
60#define ISDIGIT(C) isdigit(C)
61#define ISXDIGIT(C) isxdigit(C)
62#define ISSPACE(C) isspace(C)
63#define ISPUNCT(C) ispunct(C)
64#define ISALNUM(C) isalnum(C)
65#define ISPRINT(C) isprint(C)
66#define ISGRAPH(C) isgraph(C)
67#define ISCNTRL(C) iscntrl(C)
68#else
69#define ISALPHA(C) (isascii(C) && isalpha(C))
70#define ISUPPER(C) (isascii(C) && isupper(C))
71#define ISLOWER(C) (isascii(C) && islower(C))
72#define ISDIGIT(C) (isascii(C) && isdigit(C))
73#define ISXDIGIT(C) (isascii(C) && isxdigit(C))
74#define ISSPACE(C) (isascii(C) && isspace(C))
75#define ISPUNCT(C) (isascii(C) && ispunct(C))
76#define ISALNUM(C) (isascii(C) && isalnum(C))
77#define ISPRINT(C) (isascii(C) && isprint(C))
78#define ISGRAPH(C) (isascii(C) && isgraph(C))
79#define ISCNTRL(C) (isascii(C) && iscntrl(C))
80#endif
81
82/* If we (don't) have I18N.  */
83/* glibc defines _ */
84#ifndef _
85# ifdef HAVE_LIBINTL_H
86#  include <libintl.h>
87#  ifndef _
88#   define _(Str) gettext (Str)
89#  endif
90# else
91#  define _(Str) (Str)
92# endif
93#endif
94
95#ifdef __FreeBSD__
96#include <gnuregex.h>
97#else
98#include "regex.h"
99#endif
100#include "dfa.h"
101
102/* HPUX, define those as macros in sys/param.h */
103#ifdef setbit
104# undef setbit
105#endif
106#ifdef clrbit
107# undef clrbit
108#endif
109
110static void dfamust PARAMS ((struct dfa *dfa));
111
112static ptr_t xcalloc PARAMS ((size_t n, size_t s));
113static ptr_t xmalloc PARAMS ((size_t n));
114static ptr_t xrealloc PARAMS ((ptr_t p, size_t n));
115#ifdef DEBUG
116static void prtok PARAMS ((token t));
117#endif
118static int tstbit PARAMS ((int b, charclass c));
119static void setbit PARAMS ((int b, charclass c));
120static void clrbit PARAMS ((int b, charclass c));
121static void copyset PARAMS ((charclass src, charclass dst));
122static void zeroset PARAMS ((charclass s));
123static void notset PARAMS ((charclass s));
124static int equal PARAMS ((charclass s1, charclass s2));
125static int charclass_index PARAMS ((charclass s));
126static int looking_at PARAMS ((const char *s));
127static token lex PARAMS ((void));
128static void addtok PARAMS ((token t));
129static void atom PARAMS ((void));
130static int nsubtoks PARAMS ((int tindex));
131static void copytoks PARAMS ((int tindex, int ntokens));
132static void closure PARAMS ((void));
133static void branch PARAMS ((void));
134static void regexp PARAMS ((int toplevel));
135static void copy PARAMS ((position_set *src, position_set *dst));
136static void insert PARAMS ((position p, position_set *s));
137static void merge PARAMS ((position_set *s1, position_set *s2, position_set *m));
138static void delete PARAMS ((position p, position_set *s));
139static int state_index PARAMS ((struct dfa *d, position_set *s,
140			  int newline, int letter));
141static void build_state PARAMS ((int s, struct dfa *d));
142static void build_state_zero PARAMS ((struct dfa *d));
143static char *icatalloc PARAMS ((char *old, char *new));
144static char *icpyalloc PARAMS ((char *string));
145static char *istrstr PARAMS ((char *lookin, char *lookfor));
146static void ifree PARAMS ((char *cp));
147static void freelist PARAMS ((char **cpp));
148static char **enlist PARAMS ((char **cpp, char *new, size_t len));
149static char **comsubs PARAMS ((char *left, char *right));
150static char **addlists PARAMS ((char **old, char **new));
151static char **inboth PARAMS ((char **left, char **right));
152
153#ifdef __FreeBSD__
154static int
155collate_range_cmp(c1, c2)
156	int c1, c2;
157{
158	static char s1[2], s2[2];
159	int r;
160
161	if (c1 == c2)
162		return 0;
163	s1[0] = c1;
164	s2[0] = c2;
165	if ((r = strcoll(s1, s2)) == 0)
166		r = c1 - c2;
167
168	return r;
169}
170#endif
171
172static ptr_t
173xcalloc(n, s)
174     size_t n;
175     size_t s;
176{
177  ptr_t r = calloc(n, s);
178
179  if (!r)
180    dfaerror(_("Memory exhausted"));
181  return r;
182}
183
184static ptr_t
185xmalloc(n)
186     size_t n;
187{
188  ptr_t r = malloc(n);
189
190  assert(n != 0);
191  if (!r)
192    dfaerror(_("Memory exhausted"));
193  return r;
194}
195
196static ptr_t
197xrealloc(p, n)
198     ptr_t p;
199     size_t n;
200{
201  ptr_t r = realloc(p, n);
202
203  assert(n != 0);
204  if (!r)
205    dfaerror(_("Memory exhausted"));
206  return r;
207}
208
209#define CALLOC(p, t, n) ((p) = (t *) xcalloc((size_t)(n), sizeof (t)))
210#define MALLOC(p, t, n) ((p) = (t *) xmalloc((n) * sizeof (t)))
211#define REALLOC(p, t, n) ((p) = (t *) xrealloc((ptr_t) (p), (n) * sizeof (t)))
212
213/* Reallocate an array of type t if nalloc is too small for index. */
214#define REALLOC_IF_NECESSARY(p, t, nalloc, index) \
215  if ((index) >= (nalloc))			  \
216    {						  \
217      while ((index) >= (nalloc))		  \
218	(nalloc) *= 2;				  \
219      REALLOC(p, t, nalloc);			  \
220    }
221
222#ifdef DEBUG
223
224static void
225prtok(t)
226     token t;
227{
228  char *s;
229
230  if (t < 0)
231    fprintf(stderr, "END");
232  else if (t < NOTCHAR)
233    fprintf(stderr, "%c", t);
234  else
235    {
236      switch (t)
237	{
238	case EMPTY: s = "EMPTY"; break;
239	case BACKREF: s = "BACKREF"; break;
240	case BEGLINE: s = "BEGLINE"; break;
241	case ENDLINE: s = "ENDLINE"; break;
242	case BEGWORD: s = "BEGWORD"; break;
243	case ENDWORD: s = "ENDWORD"; break;
244	case LIMWORD: s = "LIMWORD"; break;
245	case NOTLIMWORD: s = "NOTLIMWORD"; break;
246	case QMARK: s = "QMARK"; break;
247	case STAR: s = "STAR"; break;
248	case PLUS: s = "PLUS"; break;
249	case CAT: s = "CAT"; break;
250	case OR: s = "OR"; break;
251	case ORTOP: s = "ORTOP"; break;
252	case LPAREN: s = "LPAREN"; break;
253	case RPAREN: s = "RPAREN"; break;
254	default: s = "CSET"; break;
255	}
256      fprintf(stderr, "%s", s);
257    }
258}
259#endif /* DEBUG */
260
261/* Stuff pertaining to charclasses. */
262
263static int
264tstbit(b, c)
265     int b;
266     charclass c;
267{
268  return c[b / INTBITS] & 1 << b % INTBITS;
269}
270
271static void
272setbit(b, c)
273     int b;
274     charclass c;
275{
276  c[b / INTBITS] |= 1 << b % INTBITS;
277}
278
279static void
280clrbit(b, c)
281     int b;
282     charclass c;
283{
284  c[b / INTBITS] &= ~(1 << b % INTBITS);
285}
286
287static void
288copyset(src, dst)
289     charclass src;
290     charclass dst;
291{
292  int i;
293
294  for (i = 0; i < CHARCLASS_INTS; ++i)
295    dst[i] = src[i];
296}
297
298static void
299zeroset(s)
300     charclass s;
301{
302  int i;
303
304  for (i = 0; i < CHARCLASS_INTS; ++i)
305    s[i] = 0;
306}
307
308static void
309notset(s)
310     charclass s;
311{
312  int i;
313
314  for (i = 0; i < CHARCLASS_INTS; ++i)
315    s[i] = ~s[i];
316}
317
318static int
319equal(s1, s2)
320     charclass s1;
321     charclass s2;
322{
323  int i;
324
325  for (i = 0; i < CHARCLASS_INTS; ++i)
326    if (s1[i] != s2[i])
327      return 0;
328  return 1;
329}
330
331/* A pointer to the current dfa is kept here during parsing. */
332static struct dfa *dfa;
333
334/* Find the index of charclass s in dfa->charclasses, or allocate a new charclass. */
335static int
336charclass_index(s)
337     charclass s;
338{
339  int i;
340
341  for (i = 0; i < dfa->cindex; ++i)
342    if (equal(s, dfa->charclasses[i]))
343      return i;
344  REALLOC_IF_NECESSARY(dfa->charclasses, charclass, dfa->calloc, dfa->cindex);
345  ++dfa->cindex;
346  copyset(s, dfa->charclasses[i]);
347  return i;
348}
349
350/* Syntax bits controlling the behavior of the lexical analyzer. */
351static reg_syntax_t syntax_bits, syntax_bits_set;
352
353/* Flag for case-folding letters into sets. */
354static int case_fold;
355
356/* End-of-line byte in data.  */
357static unsigned char eolbyte;
358
359/* Entry point to set syntax options. */
360void
361dfasyntax(bits, fold, eol)
362     reg_syntax_t bits;
363     int fold;
364     int eol;
365{
366  syntax_bits_set = 1;
367  syntax_bits = bits;
368  case_fold = fold;
369  eolbyte = eol;
370}
371
372/* Lexical analyzer.  All the dross that deals with the obnoxious
373   GNU Regex syntax bits is located here.  The poor, suffering
374   reader is referred to the GNU Regex documentation for the
375   meaning of the @#%!@#%^!@ syntax bits. */
376
377static char *lexstart;		/* Pointer to beginning of input string. */
378static char *lexptr;		/* Pointer to next input character. */
379static int lexleft;		/* Number of characters remaining. */
380static token lasttok;		/* Previous token returned; initially END. */
381static int laststart;		/* True if we're separated from beginning or (, |
382				   only by zero-width characters. */
383static int parens;		/* Count of outstanding left parens. */
384static int minrep, maxrep;	/* Repeat counts for {m,n}. */
385
386/* Note that characters become unsigned here. */
387#define FETCH(c, eoferr)   	      \
388  {			   	      \
389    if (! lexleft)	   	      \
390      if (eoferr != 0)	   	      \
391	dfaerror(eoferr);  	      \
392      else		   	      \
393	return lasttok = END;	      \
394    (c) = (unsigned char) *lexptr++;  \
395    --lexleft;		   	      \
396  }
397
398#ifdef __STDC__
399#define FUNC(F, P) static int F(int c) { return P(c); }
400#else
401#define FUNC(F, P) static int F(c) int c; { return P(c); }
402#endif
403
404FUNC(is_alpha, ISALPHA)
405FUNC(is_upper, ISUPPER)
406FUNC(is_lower, ISLOWER)
407FUNC(is_digit, ISDIGIT)
408FUNC(is_xdigit, ISXDIGIT)
409FUNC(is_space, ISSPACE)
410FUNC(is_punct, ISPUNCT)
411FUNC(is_alnum, ISALNUM)
412FUNC(is_print, ISPRINT)
413FUNC(is_graph, ISGRAPH)
414FUNC(is_cntrl, ISCNTRL)
415
416static int is_blank(c)
417int c;
418{
419   return (c == ' ' || c == '\t');
420}
421
422/* The following list maps the names of the Posix named character classes
423   to predicate functions that determine whether a given character is in
424   the class.  The leading [ has already been eaten by the lexical analyzer. */
425static struct {
426  const char *name;
427  int (*pred) PARAMS ((int));
428} prednames[] = {
429  { ":alpha:]", is_alpha },
430  { ":upper:]", is_upper },
431  { ":lower:]", is_lower },
432  { ":digit:]", is_digit },
433  { ":xdigit:]", is_xdigit },
434  { ":space:]", is_space },
435  { ":punct:]", is_punct },
436  { ":alnum:]", is_alnum },
437  { ":print:]", is_print },
438  { ":graph:]", is_graph },
439  { ":cntrl:]", is_cntrl },
440  { ":blank:]", is_blank },
441  { 0 }
442};
443
444/* Return non-zero if C is a `word-constituent' byte; zero otherwise.  */
445#define IS_WORD_CONSTITUENT(C) (ISALNUM(C) || (C) == '_')
446
447static int
448looking_at(s)
449     const char *s;
450{
451  size_t len;
452
453  len = strlen(s);
454  if (lexleft < len)
455    return 0;
456  return strncmp(s, lexptr, len) == 0;
457}
458
459static token
460lex()
461{
462  token c, c1, c2;
463  int backslash = 0, invert;
464  charclass ccl;
465  int i;
466
467  /* Basic plan: We fetch a character.  If it's a backslash,
468     we set the backslash flag and go through the loop again.
469     On the plus side, this avoids having a duplicate of the
470     main switch inside the backslash case.  On the minus side,
471     it means that just about every case begins with
472     "if (backslash) ...".  */
473  for (i = 0; i < 2; ++i)
474    {
475      FETCH(c, 0);
476      switch (c)
477	{
478	case '\\':
479	  if (backslash)
480	    goto normal_char;
481	  if (lexleft == 0)
482	    dfaerror(_("Unfinished \\ escape"));
483	  backslash = 1;
484	  break;
485
486	case '^':
487	  if (backslash)
488	    goto normal_char;
489	  if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
490	      || lasttok == END
491	      || lasttok == LPAREN
492	      || lasttok == OR)
493	    return lasttok = BEGLINE;
494	  goto normal_char;
495
496	case '$':
497	  if (backslash)
498	    goto normal_char;
499	  if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
500	      || lexleft == 0
501	      || (syntax_bits & RE_NO_BK_PARENS
502		  ? lexleft > 0 && *lexptr == ')'
503		  : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == ')')
504	      || (syntax_bits & RE_NO_BK_VBAR
505		  ? lexleft > 0 && *lexptr == '|'
506		  : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == '|')
507	      || ((syntax_bits & RE_NEWLINE_ALT)
508	          && lexleft > 0 && *lexptr == '\n'))
509	    return lasttok = ENDLINE;
510	  goto normal_char;
511
512	case '1':
513	case '2':
514	case '3':
515	case '4':
516	case '5':
517	case '6':
518	case '7':
519	case '8':
520	case '9':
521	  if (backslash && !(syntax_bits & RE_NO_BK_REFS))
522	    {
523	      laststart = 0;
524	      return lasttok = BACKREF;
525	    }
526	  goto normal_char;
527
528	case '`':
529	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
530	    return lasttok = BEGLINE;	/* FIXME: should be beginning of string */
531	  goto normal_char;
532
533	case '\'':
534	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
535	    return lasttok = ENDLINE;	/* FIXME: should be end of string */
536	  goto normal_char;
537
538	case '<':
539	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
540	    return lasttok = BEGWORD;
541	  goto normal_char;
542
543	case '>':
544	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
545	    return lasttok = ENDWORD;
546	  goto normal_char;
547
548	case 'b':
549	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
550	    return lasttok = LIMWORD;
551	  goto normal_char;
552
553	case 'B':
554	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
555	    return lasttok = NOTLIMWORD;
556	  goto normal_char;
557
558	case '?':
559	  if (syntax_bits & RE_LIMITED_OPS)
560	    goto normal_char;
561	  if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
562	    goto normal_char;
563	  if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
564	    goto normal_char;
565	  return lasttok = QMARK;
566
567	case '*':
568	  if (backslash)
569	    goto normal_char;
570	  if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
571	    goto normal_char;
572	  return lasttok = STAR;
573
574	case '+':
575	  if (syntax_bits & RE_LIMITED_OPS)
576	    goto normal_char;
577	  if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
578	    goto normal_char;
579	  if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
580	    goto normal_char;
581	  return lasttok = PLUS;
582
583	case '{':
584	  if (!(syntax_bits & RE_INTERVALS))
585	    goto normal_char;
586	  if (backslash != ((syntax_bits & RE_NO_BK_BRACES) == 0))
587	    goto normal_char;
588	  if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
589	    goto normal_char;
590
591	  if (syntax_bits & RE_NO_BK_BRACES)
592	    {
593	      /* Scan ahead for a valid interval; if it's not valid,
594		 treat it as a literal '{'.  */
595	      int lo = -1, hi = -1;
596	      char const *p = lexptr;
597	      char const *lim = p + lexleft;
598	      for (;  p != lim && ISDIGIT (*p);  p++)
599		lo = (lo < 0 ? 0 : lo * 10) + *p - '0';
600	      if (p != lim && *p == ',')
601		while (++p != lim && ISDIGIT (*p))
602		  hi = (hi < 0 ? 0 : hi * 10) + *p - '0';
603	      else
604		hi = lo;
605	      if (p == lim || *p != '}'
606		  || lo < 0 || RE_DUP_MAX < hi || (0 <= hi && hi < lo))
607		goto normal_char;
608	    }
609
610	  minrep = 0;
611	  /* Cases:
612	     {M} - exact count
613	     {M,} - minimum count, maximum is infinity
614	     {M,N} - M through N */
615	  FETCH(c, _("unfinished repeat count"));
616	  if (ISDIGIT(c))
617	    {
618	      minrep = c - '0';
619	      for (;;)
620		{
621		  FETCH(c, _("unfinished repeat count"));
622		  if (!ISDIGIT(c))
623		    break;
624		  minrep = 10 * minrep + c - '0';
625		}
626	    }
627	  else
628	    dfaerror(_("malformed repeat count"));
629	  if (c == ',')
630	    {
631	      FETCH (c, _("unfinished repeat count"));
632	      if (! ISDIGIT (c))
633		maxrep = -1;
634	      else
635		{
636		  maxrep = c - '0';
637		  for (;;)
638		    {
639		      FETCH (c, _("unfinished repeat count"));
640		      if (! ISDIGIT (c))
641			break;
642		      maxrep = 10 * maxrep + c - '0';
643		    }
644		  if (0 <= maxrep && maxrep < minrep)
645		    dfaerror (_("malformed repeat count"));
646		}
647	    }
648	  else
649	    maxrep = minrep;
650	  if (!(syntax_bits & RE_NO_BK_BRACES))
651	    {
652	      if (c != '\\')
653		dfaerror(_("malformed repeat count"));
654	      FETCH(c, _("unfinished repeat count"));
655	    }
656	  if (c != '}')
657	    dfaerror(_("malformed repeat count"));
658	  laststart = 0;
659	  return lasttok = REPMN;
660
661	case '|':
662	  if (syntax_bits & RE_LIMITED_OPS)
663	    goto normal_char;
664	  if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0))
665	    goto normal_char;
666	  laststart = 1;
667	  return lasttok = OR;
668
669	case '\n':
670	  if (syntax_bits & RE_LIMITED_OPS
671	      || backslash
672	      || !(syntax_bits & RE_NEWLINE_ALT))
673	    goto normal_char;
674	  laststart = 1;
675	  return lasttok = OR;
676
677	case '(':
678	  if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
679	    goto normal_char;
680	  ++parens;
681	  laststart = 1;
682	  return lasttok = LPAREN;
683
684	case ')':
685	  if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
686	    goto normal_char;
687	  if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
688	    goto normal_char;
689	  --parens;
690	  laststart = 0;
691	  return lasttok = RPAREN;
692
693	case '.':
694	  if (backslash)
695	    goto normal_char;
696	  zeroset(ccl);
697	  notset(ccl);
698	  if (!(syntax_bits & RE_DOT_NEWLINE))
699	    clrbit(eolbyte, ccl);
700	  if (syntax_bits & RE_DOT_NOT_NULL)
701	    clrbit('\0', ccl);
702	  laststart = 0;
703	  return lasttok = CSET + charclass_index(ccl);
704
705	case 'w':
706	case 'W':
707	  if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
708	    goto normal_char;
709	  zeroset(ccl);
710	  for (c2 = 0; c2 < NOTCHAR; ++c2)
711	    if (IS_WORD_CONSTITUENT(c2))
712	      setbit(c2, ccl);
713	  if (c == 'W')
714	    notset(ccl);
715	  laststart = 0;
716	  return lasttok = CSET + charclass_index(ccl);
717
718	case '[':
719	  if (backslash)
720	    goto normal_char;
721	  zeroset(ccl);
722	  FETCH(c, _("Unbalanced ["));
723	  if (c == '^')
724	    {
725	      FETCH(c, _("Unbalanced ["));
726	      invert = 1;
727	    }
728	  else
729	    invert = 0;
730	  do
731	    {
732	      /* Nobody ever said this had to be fast. :-)
733		 Note that if we're looking at some other [:...:]
734		 construct, we just treat it as a bunch of ordinary
735		 characters.  We can do this because we assume
736		 regex has checked for syntax errors before
737		 dfa is ever called. */
738	      if (c == '[' && (syntax_bits & RE_CHAR_CLASSES))
739		for (c1 = 0; prednames[c1].name; ++c1)
740		  if (looking_at(prednames[c1].name))
741		    {
742			int (*pred)() = prednames[c1].pred;
743			if (case_fold
744			    && (pred == is_upper || pred == is_lower))
745				pred = is_alpha;
746
747		      for (c2 = 0; c2 < NOTCHAR; ++c2)
748			if ((*pred)(c2))
749			  setbit(c2, ccl);
750		      lexptr += strlen(prednames[c1].name);
751		      lexleft -= strlen(prednames[c1].name);
752		      FETCH(c1, _("Unbalanced ["));
753		      goto skip;
754		    }
755	      if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
756		FETCH(c, _("Unbalanced ["));
757	      FETCH(c1, _("Unbalanced ["));
758	      if (c1 == '-')
759		{
760		  FETCH(c2, _("Unbalanced ["));
761		  if (c2 == ']')
762		    {
763		      /* In the case [x-], the - is an ordinary hyphen,
764			 which is left in c1, the lookahead character. */
765		      --lexptr;
766		      ++lexleft;
767		      c2 = c;
768		    }
769		  else
770		    {
771		      if (c2 == '\\'
772			  && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
773			FETCH(c2, _("Unbalanced ["));
774		      FETCH(c1, _("Unbalanced ["));
775		    }
776		}
777	      else
778		c2 = c;
779#ifdef __FreeBSD__
780	      if (collate_range_cmp(c, c2) <= 0)
781	      {
782		token c3;
783
784		for (c3 = 0; c3 < NOTCHAR; ++c3) {
785		  if (collate_range_cmp(c, c3) <= 0 &&
786		      collate_range_cmp(c3, c2) <= 0) {
787		    setbit(c3, ccl);
788		    if (case_fold)
789		      if (ISUPPER(c3))
790			setbit(tolower(c3), ccl);
791		      else if (ISLOWER(c))
792			setbit(toupper(c3), ccl);
793		  }
794		}
795	      }
796#else
797	      while (c <= c2)
798		{
799		  setbit(c, ccl);
800		  if (case_fold)
801		    if (ISUPPER(c))
802		      setbit(tolower(c), ccl);
803		    else if (ISLOWER(c))
804		      setbit(toupper(c), ccl);
805		  ++c;
806		}
807#endif
808	    skip:
809	      ;
810	    }
811	  while ((c = c1) != ']');
812	  if (invert)
813	    {
814	      notset(ccl);
815	      if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
816		clrbit(eolbyte, ccl);
817	    }
818	  laststart = 0;
819	  return lasttok = CSET + charclass_index(ccl);
820
821	default:
822	normal_char:
823	  laststart = 0;
824	  if (case_fold && ISALPHA(c))
825	    {
826	      zeroset(ccl);
827	      setbit(c, ccl);
828	      if (isupper(c))
829		setbit(tolower(c), ccl);
830	      else
831		setbit(toupper(c), ccl);
832	      return lasttok = CSET + charclass_index(ccl);
833	    }
834	  return c;
835	}
836    }
837
838  /* The above loop should consume at most a backslash
839     and some other character. */
840  abort();
841  return END;	/* keeps pedantic compilers happy. */
842}
843
844/* Recursive descent parser for regular expressions. */
845
846static token tok;		/* Lookahead token. */
847static int depth;		/* Current depth of a hypothetical stack
848				   holding deferred productions.  This is
849				   used to determine the depth that will be
850				   required of the real stack later on in
851				   dfaanalyze(). */
852
853/* Add the given token to the parse tree, maintaining the depth count and
854   updating the maximum depth if necessary. */
855static void
856addtok(t)
857     token t;
858{
859  REALLOC_IF_NECESSARY(dfa->tokens, token, dfa->talloc, dfa->tindex);
860  dfa->tokens[dfa->tindex++] = t;
861
862  switch (t)
863    {
864    case QMARK:
865    case STAR:
866    case PLUS:
867      break;
868
869    case CAT:
870    case OR:
871    case ORTOP:
872      --depth;
873      break;
874
875    default:
876      ++dfa->nleaves;
877    case EMPTY:
878      ++depth;
879      break;
880    }
881  if (depth > dfa->depth)
882    dfa->depth = depth;
883}
884
885/* The grammar understood by the parser is as follows.
886
887   regexp:
888     regexp OR branch
889     branch
890
891   branch:
892     branch closure
893     closure
894
895   closure:
896     closure QMARK
897     closure STAR
898     closure PLUS
899     atom
900
901   atom:
902     <normal character>
903     CSET
904     BACKREF
905     BEGLINE
906     ENDLINE
907     BEGWORD
908     ENDWORD
909     LIMWORD
910     NOTLIMWORD
911     <empty>
912
913   The parser builds a parse tree in postfix form in an array of tokens. */
914
915static void
916atom()
917{
918  if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
919      || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
920      || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD)
921    {
922      addtok(tok);
923      tok = lex();
924    }
925  else if (tok == LPAREN)
926    {
927      tok = lex();
928      regexp(0);
929      if (tok != RPAREN)
930	dfaerror(_("Unbalanced ("));
931      tok = lex();
932    }
933  else
934    addtok(EMPTY);
935}
936
937/* Return the number of tokens in the given subexpression. */
938static int
939nsubtoks(tindex)
940int tindex;
941{
942  int ntoks1;
943
944  switch (dfa->tokens[tindex - 1])
945    {
946    default:
947      return 1;
948    case QMARK:
949    case STAR:
950    case PLUS:
951      return 1 + nsubtoks(tindex - 1);
952    case CAT:
953    case OR:
954    case ORTOP:
955      ntoks1 = nsubtoks(tindex - 1);
956      return 1 + ntoks1 + nsubtoks(tindex - 1 - ntoks1);
957    }
958}
959
960/* Copy the given subexpression to the top of the tree. */
961static void
962copytoks(tindex, ntokens)
963     int tindex, ntokens;
964{
965  int i;
966
967  for (i = 0; i < ntokens; ++i)
968    addtok(dfa->tokens[tindex + i]);
969}
970
971static void
972closure()
973{
974  int tindex, ntokens, i;
975
976  atom();
977  while (tok == QMARK || tok == STAR || tok == PLUS || tok == REPMN)
978    if (tok == REPMN)
979      {
980	ntokens = nsubtoks(dfa->tindex);
981	tindex = dfa->tindex - ntokens;
982	if (maxrep < 0)
983	  addtok(PLUS);
984	if (minrep == 0)
985	  addtok(QMARK);
986	for (i = 1; i < minrep; ++i)
987	  {
988	    copytoks(tindex, ntokens);
989	    addtok(CAT);
990	  }
991	for (; i < maxrep; ++i)
992	  {
993	    copytoks(tindex, ntokens);
994	    addtok(QMARK);
995	    addtok(CAT);
996	  }
997	tok = lex();
998      }
999    else
1000      {
1001	addtok(tok);
1002	tok = lex();
1003      }
1004}
1005
1006static void
1007branch()
1008{
1009  closure();
1010  while (tok != RPAREN && tok != OR && tok >= 0)
1011    {
1012      closure();
1013      addtok(CAT);
1014    }
1015}
1016
1017static void
1018regexp(toplevel)
1019     int toplevel;
1020{
1021  branch();
1022  while (tok == OR)
1023    {
1024      tok = lex();
1025      branch();
1026      if (toplevel)
1027	addtok(ORTOP);
1028      else
1029	addtok(OR);
1030    }
1031}
1032
1033/* Main entry point for the parser.  S is a string to be parsed, len is the
1034   length of the string, so s can include NUL characters.  D is a pointer to
1035   the struct dfa to parse into. */
1036void
1037dfaparse(s, len, d)
1038     char *s;
1039     size_t len;
1040     struct dfa *d;
1041
1042{
1043  dfa = d;
1044  lexstart = lexptr = s;
1045  lexleft = len;
1046  lasttok = END;
1047  laststart = 1;
1048  parens = 0;
1049
1050  if (! syntax_bits_set)
1051    dfaerror(_("No syntax specified"));
1052
1053  tok = lex();
1054  depth = d->depth;
1055
1056  regexp(1);
1057
1058  if (tok != END)
1059    dfaerror(_("Unbalanced )"));
1060
1061  addtok(END - d->nregexps);
1062  addtok(CAT);
1063
1064  if (d->nregexps)
1065    addtok(ORTOP);
1066
1067  ++d->nregexps;
1068}
1069
1070/* Some primitives for operating on sets of positions. */
1071
1072/* Copy one set to another; the destination must be large enough. */
1073static void
1074copy(src, dst)
1075     position_set *src;
1076     position_set *dst;
1077{
1078  int i;
1079
1080  for (i = 0; i < src->nelem; ++i)
1081    dst->elems[i] = src->elems[i];
1082  dst->nelem = src->nelem;
1083}
1084
1085/* Insert a position in a set.  Position sets are maintained in sorted
1086   order according to index.  If position already exists in the set with
1087   the same index then their constraints are logically or'd together.
1088   S->elems must point to an array large enough to hold the resulting set. */
1089static void
1090insert(p, s)
1091     position p;
1092     position_set *s;
1093{
1094  int i;
1095  position t1, t2;
1096
1097  for (i = 0; i < s->nelem && p.index < s->elems[i].index; ++i)
1098    continue;
1099  if (i < s->nelem && p.index == s->elems[i].index)
1100    s->elems[i].constraint |= p.constraint;
1101  else
1102    {
1103      t1 = p;
1104      ++s->nelem;
1105      while (i < s->nelem)
1106	{
1107	  t2 = s->elems[i];
1108	  s->elems[i++] = t1;
1109	  t1 = t2;
1110	}
1111    }
1112}
1113
1114/* Merge two sets of positions into a third.  The result is exactly as if
1115   the positions of both sets were inserted into an initially empty set. */
1116static void
1117merge(s1, s2, m)
1118     position_set *s1;
1119     position_set *s2;
1120     position_set *m;
1121{
1122  int i = 0, j = 0;
1123
1124  m->nelem = 0;
1125  while (i < s1->nelem && j < s2->nelem)
1126    if (s1->elems[i].index > s2->elems[j].index)
1127      m->elems[m->nelem++] = s1->elems[i++];
1128    else if (s1->elems[i].index < s2->elems[j].index)
1129      m->elems[m->nelem++] = s2->elems[j++];
1130    else
1131      {
1132	m->elems[m->nelem] = s1->elems[i++];
1133	m->elems[m->nelem++].constraint |= s2->elems[j++].constraint;
1134      }
1135  while (i < s1->nelem)
1136    m->elems[m->nelem++] = s1->elems[i++];
1137  while (j < s2->nelem)
1138    m->elems[m->nelem++] = s2->elems[j++];
1139}
1140
1141/* Delete a position from a set. */
1142static void
1143delete(p, s)
1144     position p;
1145     position_set *s;
1146{
1147  int i;
1148
1149  for (i = 0; i < s->nelem; ++i)
1150    if (p.index == s->elems[i].index)
1151      break;
1152  if (i < s->nelem)
1153    for (--s->nelem; i < s->nelem; ++i)
1154      s->elems[i] = s->elems[i + 1];
1155}
1156
1157/* Find the index of the state corresponding to the given position set with
1158   the given preceding context, or create a new state if there is no such
1159   state.  Newline and letter tell whether we got here on a newline or
1160   letter, respectively. */
1161static int
1162state_index(d, s, newline, letter)
1163     struct dfa *d;
1164     position_set *s;
1165     int newline;
1166     int letter;
1167{
1168  int hash = 0;
1169  int constraint;
1170  int i, j;
1171
1172  newline = newline ? 1 : 0;
1173  letter = letter ? 1 : 0;
1174
1175  for (i = 0; i < s->nelem; ++i)
1176    hash ^= s->elems[i].index + s->elems[i].constraint;
1177
1178  /* Try to find a state that exactly matches the proposed one. */
1179  for (i = 0; i < d->sindex; ++i)
1180    {
1181      if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
1182	  || newline != d->states[i].newline || letter != d->states[i].letter)
1183	continue;
1184      for (j = 0; j < s->nelem; ++j)
1185	if (s->elems[j].constraint
1186	    != d->states[i].elems.elems[j].constraint
1187	    || s->elems[j].index != d->states[i].elems.elems[j].index)
1188	  break;
1189      if (j == s->nelem)
1190	return i;
1191    }
1192
1193  /* We'll have to create a new state. */
1194  REALLOC_IF_NECESSARY(d->states, dfa_state, d->salloc, d->sindex);
1195  d->states[i].hash = hash;
1196  MALLOC(d->states[i].elems.elems, position, s->nelem);
1197  copy(s, &d->states[i].elems);
1198  d->states[i].newline = newline;
1199  d->states[i].letter = letter;
1200  d->states[i].backref = 0;
1201  d->states[i].constraint = 0;
1202  d->states[i].first_end = 0;
1203  for (j = 0; j < s->nelem; ++j)
1204    if (d->tokens[s->elems[j].index] < 0)
1205      {
1206	constraint = s->elems[j].constraint;
1207	if (SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 0)
1208	    || SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 1)
1209	    || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 0)
1210	    || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 1))
1211	  d->states[i].constraint |= constraint;
1212	if (! d->states[i].first_end)
1213	  d->states[i].first_end = d->tokens[s->elems[j].index];
1214      }
1215    else if (d->tokens[s->elems[j].index] == BACKREF)
1216      {
1217	d->states[i].constraint = NO_CONSTRAINT;
1218	d->states[i].backref = 1;
1219      }
1220
1221  ++d->sindex;
1222
1223  return i;
1224}
1225
1226/* Find the epsilon closure of a set of positions.  If any position of the set
1227   contains a symbol that matches the empty string in some context, replace
1228   that position with the elements of its follow labeled with an appropriate
1229   constraint.  Repeat exhaustively until no funny positions are left.
1230   S->elems must be large enough to hold the result. */
1231static void epsclosure PARAMS ((position_set *s, struct dfa *d));
1232
1233static void
1234epsclosure(s, d)
1235     position_set *s;
1236     struct dfa *d;
1237{
1238  int i, j;
1239  int *visited;
1240  position p, old;
1241
1242  MALLOC(visited, int, d->tindex);
1243  for (i = 0; i < d->tindex; ++i)
1244    visited[i] = 0;
1245
1246  for (i = 0; i < s->nelem; ++i)
1247    if (d->tokens[s->elems[i].index] >= NOTCHAR
1248	&& d->tokens[s->elems[i].index] != BACKREF
1249	&& d->tokens[s->elems[i].index] < CSET)
1250      {
1251	old = s->elems[i];
1252	p.constraint = old.constraint;
1253	delete(s->elems[i], s);
1254	if (visited[old.index])
1255	  {
1256	    --i;
1257	    continue;
1258	  }
1259	visited[old.index] = 1;
1260	switch (d->tokens[old.index])
1261	  {
1262	  case BEGLINE:
1263	    p.constraint &= BEGLINE_CONSTRAINT;
1264	    break;
1265	  case ENDLINE:
1266	    p.constraint &= ENDLINE_CONSTRAINT;
1267	    break;
1268	  case BEGWORD:
1269	    p.constraint &= BEGWORD_CONSTRAINT;
1270	    break;
1271	  case ENDWORD:
1272	    p.constraint &= ENDWORD_CONSTRAINT;
1273	    break;
1274	  case LIMWORD:
1275	    p.constraint &= LIMWORD_CONSTRAINT;
1276	    break;
1277	  case NOTLIMWORD:
1278	    p.constraint &= NOTLIMWORD_CONSTRAINT;
1279	    break;
1280	  default:
1281	    break;
1282	  }
1283	for (j = 0; j < d->follows[old.index].nelem; ++j)
1284	  {
1285	    p.index = d->follows[old.index].elems[j].index;
1286	    insert(p, s);
1287	  }
1288	/* Force rescan to start at the beginning. */
1289	i = -1;
1290      }
1291
1292  free(visited);
1293}
1294
1295/* Perform bottom-up analysis on the parse tree, computing various functions.
1296   Note that at this point, we're pretending constructs like \< are real
1297   characters rather than constraints on what can follow them.
1298
1299   Nullable:  A node is nullable if it is at the root of a regexp that can
1300   match the empty string.
1301   *  EMPTY leaves are nullable.
1302   * No other leaf is nullable.
1303   * A QMARK or STAR node is nullable.
1304   * A PLUS node is nullable if its argument is nullable.
1305   * A CAT node is nullable if both its arguments are nullable.
1306   * An OR node is nullable if either argument is nullable.
1307
1308   Firstpos:  The firstpos of a node is the set of positions (nonempty leaves)
1309   that could correspond to the first character of a string matching the
1310   regexp rooted at the given node.
1311   * EMPTY leaves have empty firstpos.
1312   * The firstpos of a nonempty leaf is that leaf itself.
1313   * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
1314     argument.
1315   * The firstpos of a CAT node is the firstpos of the left argument, union
1316     the firstpos of the right if the left argument is nullable.
1317   * The firstpos of an OR node is the union of firstpos of each argument.
1318
1319   Lastpos:  The lastpos of a node is the set of positions that could
1320   correspond to the last character of a string matching the regexp at
1321   the given node.
1322   * EMPTY leaves have empty lastpos.
1323   * The lastpos of a nonempty leaf is that leaf itself.
1324   * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
1325     argument.
1326   * The lastpos of a CAT node is the lastpos of its right argument, union
1327     the lastpos of the left if the right argument is nullable.
1328   * The lastpos of an OR node is the union of the lastpos of each argument.
1329
1330   Follow:  The follow of a position is the set of positions that could
1331   correspond to the character following a character matching the node in
1332   a string matching the regexp.  At this point we consider special symbols
1333   that match the empty string in some context to be just normal characters.
1334   Later, if we find that a special symbol is in a follow set, we will
1335   replace it with the elements of its follow, labeled with an appropriate
1336   constraint.
1337   * Every node in the firstpos of the argument of a STAR or PLUS node is in
1338     the follow of every node in the lastpos.
1339   * Every node in the firstpos of the second argument of a CAT node is in
1340     the follow of every node in the lastpos of the first argument.
1341
1342   Because of the postfix representation of the parse tree, the depth-first
1343   analysis is conveniently done by a linear scan with the aid of a stack.
1344   Sets are stored as arrays of the elements, obeying a stack-like allocation
1345   scheme; the number of elements in each set deeper in the stack can be
1346   used to determine the address of a particular set's array. */
1347void
1348dfaanalyze(d, searchflag)
1349     struct dfa *d;
1350     int searchflag;
1351{
1352  int *nullable;		/* Nullable stack. */
1353  int *nfirstpos;		/* Element count stack for firstpos sets. */
1354  position *firstpos;		/* Array where firstpos elements are stored. */
1355  int *nlastpos;		/* Element count stack for lastpos sets. */
1356  position *lastpos;		/* Array where lastpos elements are stored. */
1357  int *nalloc;			/* Sizes of arrays allocated to follow sets. */
1358  position_set tmp;		/* Temporary set for merging sets. */
1359  position_set merged;		/* Result of merging sets. */
1360  int wants_newline;		/* True if some position wants newline info. */
1361  int *o_nullable;
1362  int *o_nfirst, *o_nlast;
1363  position *o_firstpos, *o_lastpos;
1364  int i, j;
1365  position *pos;
1366
1367#ifdef DEBUG
1368  fprintf(stderr, "dfaanalyze:\n");
1369  for (i = 0; i < d->tindex; ++i)
1370    {
1371      fprintf(stderr, " %d:", i);
1372      prtok(d->tokens[i]);
1373    }
1374  putc('\n', stderr);
1375#endif
1376
1377  d->searchflag = searchflag;
1378
1379  MALLOC(nullable, int, d->depth);
1380  o_nullable = nullable;
1381  MALLOC(nfirstpos, int, d->depth);
1382  o_nfirst = nfirstpos;
1383  MALLOC(firstpos, position, d->nleaves);
1384  o_firstpos = firstpos, firstpos += d->nleaves;
1385  MALLOC(nlastpos, int, d->depth);
1386  o_nlast = nlastpos;
1387  MALLOC(lastpos, position, d->nleaves);
1388  o_lastpos = lastpos, lastpos += d->nleaves;
1389  MALLOC(nalloc, int, d->tindex);
1390  for (i = 0; i < d->tindex; ++i)
1391    nalloc[i] = 0;
1392  MALLOC(merged.elems, position, d->nleaves);
1393
1394  CALLOC(d->follows, position_set, d->tindex);
1395
1396  for (i = 0; i < d->tindex; ++i)
1397#ifdef DEBUG
1398    {				/* Nonsyntactic #ifdef goo... */
1399#endif
1400    switch (d->tokens[i])
1401      {
1402      case EMPTY:
1403	/* The empty set is nullable. */
1404	*nullable++ = 1;
1405
1406	/* The firstpos and lastpos of the empty leaf are both empty. */
1407	*nfirstpos++ = *nlastpos++ = 0;
1408	break;
1409
1410      case STAR:
1411      case PLUS:
1412	/* Every element in the firstpos of the argument is in the follow
1413	   of every element in the lastpos. */
1414	tmp.nelem = nfirstpos[-1];
1415	tmp.elems = firstpos;
1416	pos = lastpos;
1417	for (j = 0; j < nlastpos[-1]; ++j)
1418	  {
1419	    merge(&tmp, &d->follows[pos[j].index], &merged);
1420	    REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
1421				 nalloc[pos[j].index], merged.nelem - 1);
1422	    copy(&merged, &d->follows[pos[j].index]);
1423	  }
1424
1425      case QMARK:
1426	/* A QMARK or STAR node is automatically nullable. */
1427	if (d->tokens[i] != PLUS)
1428	  nullable[-1] = 1;
1429	break;
1430
1431      case CAT:
1432	/* Every element in the firstpos of the second argument is in the
1433	   follow of every element in the lastpos of the first argument. */
1434	tmp.nelem = nfirstpos[-1];
1435	tmp.elems = firstpos;
1436	pos = lastpos + nlastpos[-1];
1437	for (j = 0; j < nlastpos[-2]; ++j)
1438	  {
1439	    merge(&tmp, &d->follows[pos[j].index], &merged);
1440	    REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
1441				 nalloc[pos[j].index], merged.nelem - 1);
1442	    copy(&merged, &d->follows[pos[j].index]);
1443	  }
1444
1445	/* The firstpos of a CAT node is the firstpos of the first argument,
1446	   union that of the second argument if the first is nullable. */
1447	if (nullable[-2])
1448	  nfirstpos[-2] += nfirstpos[-1];
1449	else
1450	  firstpos += nfirstpos[-1];
1451	--nfirstpos;
1452
1453	/* The lastpos of a CAT node is the lastpos of the second argument,
1454	   union that of the first argument if the second is nullable. */
1455	if (nullable[-1])
1456	  nlastpos[-2] += nlastpos[-1];
1457	else
1458	  {
1459	    pos = lastpos + nlastpos[-2];
1460	    for (j = nlastpos[-1] - 1; j >= 0; --j)
1461	      pos[j] = lastpos[j];
1462	    lastpos += nlastpos[-2];
1463	    nlastpos[-2] = nlastpos[-1];
1464	  }
1465	--nlastpos;
1466
1467	/* A CAT node is nullable if both arguments are nullable. */
1468	nullable[-2] = nullable[-1] && nullable[-2];
1469	--nullable;
1470	break;
1471
1472      case OR:
1473      case ORTOP:
1474	/* The firstpos is the union of the firstpos of each argument. */
1475	nfirstpos[-2] += nfirstpos[-1];
1476	--nfirstpos;
1477
1478	/* The lastpos is the union of the lastpos of each argument. */
1479	nlastpos[-2] += nlastpos[-1];
1480	--nlastpos;
1481
1482	/* An OR node is nullable if either argument is nullable. */
1483	nullable[-2] = nullable[-1] || nullable[-2];
1484	--nullable;
1485	break;
1486
1487      default:
1488	/* Anything else is a nonempty position.  (Note that special
1489	   constructs like \< are treated as nonempty strings here;
1490	   an "epsilon closure" effectively makes them nullable later.
1491	   Backreferences have to get a real position so we can detect
1492	   transitions on them later.  But they are nullable. */
1493	*nullable++ = d->tokens[i] == BACKREF;
1494
1495	/* This position is in its own firstpos and lastpos. */
1496	*nfirstpos++ = *nlastpos++ = 1;
1497	--firstpos, --lastpos;
1498	firstpos->index = lastpos->index = i;
1499	firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
1500
1501	/* Allocate the follow set for this position. */
1502	nalloc[i] = 1;
1503	MALLOC(d->follows[i].elems, position, nalloc[i]);
1504	break;
1505      }
1506#ifdef DEBUG
1507    /* ... balance the above nonsyntactic #ifdef goo... */
1508      fprintf(stderr, "node %d:", i);
1509      prtok(d->tokens[i]);
1510      putc('\n', stderr);
1511      fprintf(stderr, nullable[-1] ? " nullable: yes\n" : " nullable: no\n");
1512      fprintf(stderr, " firstpos:");
1513      for (j = nfirstpos[-1] - 1; j >= 0; --j)
1514	{
1515	  fprintf(stderr, " %d:", firstpos[j].index);
1516	  prtok(d->tokens[firstpos[j].index]);
1517	}
1518      fprintf(stderr, "\n lastpos:");
1519      for (j = nlastpos[-1] - 1; j >= 0; --j)
1520	{
1521	  fprintf(stderr, " %d:", lastpos[j].index);
1522	  prtok(d->tokens[lastpos[j].index]);
1523	}
1524      putc('\n', stderr);
1525    }
1526#endif
1527
1528  /* For each follow set that is the follow set of a real position, replace
1529     it with its epsilon closure. */
1530  for (i = 0; i < d->tindex; ++i)
1531    if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF
1532	|| d->tokens[i] >= CSET)
1533      {
1534#ifdef DEBUG
1535	fprintf(stderr, "follows(%d:", i);
1536	prtok(d->tokens[i]);
1537	fprintf(stderr, "):");
1538	for (j = d->follows[i].nelem - 1; j >= 0; --j)
1539	  {
1540	    fprintf(stderr, " %d:", d->follows[i].elems[j].index);
1541	    prtok(d->tokens[d->follows[i].elems[j].index]);
1542	  }
1543	putc('\n', stderr);
1544#endif
1545	copy(&d->follows[i], &merged);
1546	epsclosure(&merged, d);
1547	if (d->follows[i].nelem < merged.nelem)
1548	  REALLOC(d->follows[i].elems, position, merged.nelem);
1549	copy(&merged, &d->follows[i]);
1550      }
1551
1552  /* Get the epsilon closure of the firstpos of the regexp.  The result will
1553     be the set of positions of state 0. */
1554  merged.nelem = 0;
1555  for (i = 0; i < nfirstpos[-1]; ++i)
1556    insert(firstpos[i], &merged);
1557  epsclosure(&merged, d);
1558
1559  /* Check if any of the positions of state 0 will want newline context. */
1560  wants_newline = 0;
1561  for (i = 0; i < merged.nelem; ++i)
1562    if (PREV_NEWLINE_DEPENDENT(merged.elems[i].constraint))
1563      wants_newline = 1;
1564
1565  /* Build the initial state. */
1566  d->salloc = 1;
1567  d->sindex = 0;
1568  MALLOC(d->states, dfa_state, d->salloc);
1569  state_index(d, &merged, wants_newline, 0);
1570
1571  free(o_nullable);
1572  free(o_nfirst);
1573  free(o_firstpos);
1574  free(o_nlast);
1575  free(o_lastpos);
1576  free(nalloc);
1577  free(merged.elems);
1578}
1579
1580/* Find, for each character, the transition out of state s of d, and store
1581   it in the appropriate slot of trans.
1582
1583   We divide the positions of s into groups (positions can appear in more
1584   than one group).  Each group is labeled with a set of characters that
1585   every position in the group matches (taking into account, if necessary,
1586   preceding context information of s).  For each group, find the union
1587   of the its elements' follows.  This set is the set of positions of the
1588   new state.  For each character in the group's label, set the transition
1589   on this character to be to a state corresponding to the set's positions,
1590   and its associated backward context information, if necessary.
1591
1592   If we are building a searching matcher, we include the positions of state
1593   0 in every state.
1594
1595   The collection of groups is constructed by building an equivalence-class
1596   partition of the positions of s.
1597
1598   For each position, find the set of characters C that it matches.  Eliminate
1599   any characters from C that fail on grounds of backward context.
1600
1601   Search through the groups, looking for a group whose label L has nonempty
1602   intersection with C.  If L - C is nonempty, create a new group labeled
1603   L - C and having the same positions as the current group, and set L to
1604   the intersection of L and C.  Insert the position in this group, set
1605   C = C - L, and resume scanning.
1606
1607   If after comparing with every group there are characters remaining in C,
1608   create a new group labeled with the characters of C and insert this
1609   position in that group. */
1610void
1611dfastate(s, d, trans)
1612     int s;
1613     struct dfa *d;
1614     int trans[];
1615{
1616  position_set grps[NOTCHAR];	/* As many as will ever be needed. */
1617  charclass labels[NOTCHAR];	/* Labels corresponding to the groups. */
1618  int ngrps = 0;		/* Number of groups actually used. */
1619  position pos;			/* Current position being considered. */
1620  charclass matches;		/* Set of matching characters. */
1621  int matchesf;			/* True if matches is nonempty. */
1622  charclass intersect;		/* Intersection with some label set. */
1623  int intersectf;		/* True if intersect is nonempty. */
1624  charclass leftovers;		/* Stuff in the label that didn't match. */
1625  int leftoversf;		/* True if leftovers is nonempty. */
1626  static charclass letters;	/* Set of characters considered letters. */
1627  static charclass newline;	/* Set of characters that aren't newline. */
1628  position_set follows;		/* Union of the follows of some group. */
1629  position_set tmp;		/* Temporary space for merging sets. */
1630  int state;			/* New state. */
1631  int wants_newline;		/* New state wants to know newline context. */
1632  int state_newline;		/* New state on a newline transition. */
1633  int wants_letter;		/* New state wants to know letter context. */
1634  int state_letter;		/* New state on a letter transition. */
1635  static int initialized;	/* Flag for static initialization. */
1636  int i, j, k;
1637
1638  /* Initialize the set of letters, if necessary. */
1639  if (! initialized)
1640    {
1641      initialized = 1;
1642      for (i = 0; i < NOTCHAR; ++i)
1643	if (IS_WORD_CONSTITUENT(i))
1644	  setbit(i, letters);
1645      setbit(eolbyte, newline);
1646    }
1647
1648  zeroset(matches);
1649
1650  for (i = 0; i < d->states[s].elems.nelem; ++i)
1651    {
1652      pos = d->states[s].elems.elems[i];
1653      if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
1654	setbit(d->tokens[pos.index], matches);
1655      else if (d->tokens[pos.index] >= CSET)
1656	copyset(d->charclasses[d->tokens[pos.index] - CSET], matches);
1657      else
1658	continue;
1659
1660      /* Some characters may need to be eliminated from matches because
1661	 they fail in the current context. */
1662      if (pos.constraint != 0xFF)
1663	{
1664	  if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
1665					 d->states[s].newline, 1))
1666	    clrbit(eolbyte, matches);
1667	  if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
1668					 d->states[s].newline, 0))
1669	    for (j = 0; j < CHARCLASS_INTS; ++j)
1670	      matches[j] &= newline[j];
1671	  if (! MATCHES_LETTER_CONTEXT(pos.constraint,
1672					d->states[s].letter, 1))
1673	    for (j = 0; j < CHARCLASS_INTS; ++j)
1674	      matches[j] &= ~letters[j];
1675	  if (! MATCHES_LETTER_CONTEXT(pos.constraint,
1676					d->states[s].letter, 0))
1677	    for (j = 0; j < CHARCLASS_INTS; ++j)
1678	      matches[j] &= letters[j];
1679
1680	  /* If there are no characters left, there's no point in going on. */
1681	  for (j = 0; j < CHARCLASS_INTS && !matches[j]; ++j)
1682	    continue;
1683	  if (j == CHARCLASS_INTS)
1684	    continue;
1685	}
1686
1687      for (j = 0; j < ngrps; ++j)
1688	{
1689	  /* If matches contains a single character only, and the current
1690	     group's label doesn't contain that character, go on to the
1691	     next group. */
1692	  if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR
1693	      && !tstbit(d->tokens[pos.index], labels[j]))
1694	    continue;
1695
1696	  /* Check if this group's label has a nonempty intersection with
1697	     matches. */
1698	  intersectf = 0;
1699	  for (k = 0; k < CHARCLASS_INTS; ++k)
1700	    (intersect[k] = matches[k] & labels[j][k]) ? (intersectf = 1) : 0;
1701	  if (! intersectf)
1702	    continue;
1703
1704	  /* It does; now find the set differences both ways. */
1705	  leftoversf = matchesf = 0;
1706	  for (k = 0; k < CHARCLASS_INTS; ++k)
1707	    {
1708	      /* Even an optimizing compiler can't know this for sure. */
1709	      int match = matches[k], label = labels[j][k];
1710
1711	      (leftovers[k] = ~match & label) ? (leftoversf = 1) : 0;
1712	      (matches[k] = match & ~label) ? (matchesf = 1) : 0;
1713	    }
1714
1715	  /* If there were leftovers, create a new group labeled with them. */
1716	  if (leftoversf)
1717	    {
1718	      copyset(leftovers, labels[ngrps]);
1719	      copyset(intersect, labels[j]);
1720	      MALLOC(grps[ngrps].elems, position, d->nleaves);
1721	      copy(&grps[j], &grps[ngrps]);
1722	      ++ngrps;
1723	    }
1724
1725	  /* Put the position in the current group.  Note that there is no
1726	     reason to call insert() here. */
1727	  grps[j].elems[grps[j].nelem++] = pos;
1728
1729	  /* If every character matching the current position has been
1730	     accounted for, we're done. */
1731	  if (! matchesf)
1732	    break;
1733	}
1734
1735      /* If we've passed the last group, and there are still characters
1736	 unaccounted for, then we'll have to create a new group. */
1737      if (j == ngrps)
1738	{
1739	  copyset(matches, labels[ngrps]);
1740	  zeroset(matches);
1741	  MALLOC(grps[ngrps].elems, position, d->nleaves);
1742	  grps[ngrps].nelem = 1;
1743	  grps[ngrps].elems[0] = pos;
1744	  ++ngrps;
1745	}
1746    }
1747
1748  MALLOC(follows.elems, position, d->nleaves);
1749  MALLOC(tmp.elems, position, d->nleaves);
1750
1751  /* If we are a searching matcher, the default transition is to a state
1752     containing the positions of state 0, otherwise the default transition
1753     is to fail miserably. */
1754  if (d->searchflag)
1755    {
1756      wants_newline = 0;
1757      wants_letter = 0;
1758      for (i = 0; i < d->states[0].elems.nelem; ++i)
1759	{
1760	  if (PREV_NEWLINE_DEPENDENT(d->states[0].elems.elems[i].constraint))
1761	    wants_newline = 1;
1762	  if (PREV_LETTER_DEPENDENT(d->states[0].elems.elems[i].constraint))
1763	    wants_letter = 1;
1764	}
1765      copy(&d->states[0].elems, &follows);
1766      state = state_index(d, &follows, 0, 0);
1767      if (wants_newline)
1768	state_newline = state_index(d, &follows, 1, 0);
1769      else
1770	state_newline = state;
1771      if (wants_letter)
1772	state_letter = state_index(d, &follows, 0, 1);
1773      else
1774	state_letter = state;
1775      for (i = 0; i < NOTCHAR; ++i)
1776	trans[i] = (IS_WORD_CONSTITUENT(i)) ? state_letter : state;
1777      trans[eolbyte] = state_newline;
1778    }
1779  else
1780    for (i = 0; i < NOTCHAR; ++i)
1781      trans[i] = -1;
1782
1783  for (i = 0; i < ngrps; ++i)
1784    {
1785      follows.nelem = 0;
1786
1787      /* Find the union of the follows of the positions of the group.
1788	 This is a hideously inefficient loop.  Fix it someday. */
1789      for (j = 0; j < grps[i].nelem; ++j)
1790	for (k = 0; k < d->follows[grps[i].elems[j].index].nelem; ++k)
1791	  insert(d->follows[grps[i].elems[j].index].elems[k], &follows);
1792
1793      /* If we are building a searching matcher, throw in the positions
1794	 of state 0 as well. */
1795      if (d->searchflag)
1796	for (j = 0; j < d->states[0].elems.nelem; ++j)
1797	  insert(d->states[0].elems.elems[j], &follows);
1798
1799      /* Find out if the new state will want any context information. */
1800      wants_newline = 0;
1801      if (tstbit(eolbyte, labels[i]))
1802	for (j = 0; j < follows.nelem; ++j)
1803	  if (PREV_NEWLINE_DEPENDENT(follows.elems[j].constraint))
1804	    wants_newline = 1;
1805
1806      wants_letter = 0;
1807      for (j = 0; j < CHARCLASS_INTS; ++j)
1808	if (labels[i][j] & letters[j])
1809	  break;
1810      if (j < CHARCLASS_INTS)
1811	for (j = 0; j < follows.nelem; ++j)
1812	  if (PREV_LETTER_DEPENDENT(follows.elems[j].constraint))
1813	    wants_letter = 1;
1814
1815      /* Find the state(s) corresponding to the union of the follows. */
1816      state = state_index(d, &follows, 0, 0);
1817      if (wants_newline)
1818	state_newline = state_index(d, &follows, 1, 0);
1819      else
1820	state_newline = state;
1821      if (wants_letter)
1822	state_letter = state_index(d, &follows, 0, 1);
1823      else
1824	state_letter = state;
1825
1826      /* Set the transitions for each character in the current label. */
1827      for (j = 0; j < CHARCLASS_INTS; ++j)
1828	for (k = 0; k < INTBITS; ++k)
1829	  if (labels[i][j] & 1 << k)
1830	    {
1831	      int c = j * INTBITS + k;
1832
1833	      if (c == eolbyte)
1834		trans[c] = state_newline;
1835	      else if (IS_WORD_CONSTITUENT(c))
1836		trans[c] = state_letter;
1837	      else if (c < NOTCHAR)
1838		trans[c] = state;
1839	    }
1840    }
1841
1842  for (i = 0; i < ngrps; ++i)
1843    free(grps[i].elems);
1844  free(follows.elems);
1845  free(tmp.elems);
1846}
1847
1848/* Some routines for manipulating a compiled dfa's transition tables.
1849   Each state may or may not have a transition table; if it does, and it
1850   is a non-accepting state, then d->trans[state] points to its table.
1851   If it is an accepting state then d->fails[state] points to its table.
1852   If it has no table at all, then d->trans[state] is NULL.
1853   TODO: Improve this comment, get rid of the unnecessary redundancy. */
1854
1855static void
1856build_state(s, d)
1857     int s;
1858     struct dfa *d;
1859{
1860  int *trans;			/* The new transition table. */
1861  int i;
1862
1863  /* Set an upper limit on the number of transition tables that will ever
1864     exist at once.  1024 is arbitrary.  The idea is that the frequently
1865     used transition tables will be quickly rebuilt, whereas the ones that
1866     were only needed once or twice will be cleared away. */
1867  if (d->trcount >= 1024)
1868    {
1869      for (i = 0; i < d->tralloc; ++i)
1870	if (d->trans[i])
1871	  {
1872	    free((ptr_t) d->trans[i]);
1873	    d->trans[i] = NULL;
1874	  }
1875	else if (d->fails[i])
1876	  {
1877	    free((ptr_t) d->fails[i]);
1878	    d->fails[i] = NULL;
1879	  }
1880      d->trcount = 0;
1881    }
1882
1883  ++d->trcount;
1884
1885  /* Set up the success bits for this state. */
1886  d->success[s] = 0;
1887  if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 1, d->states[s].letter, 0,
1888      s, *d))
1889    d->success[s] |= 4;
1890  if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 1,
1891      s, *d))
1892    d->success[s] |= 2;
1893  if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 0,
1894      s, *d))
1895    d->success[s] |= 1;
1896
1897  MALLOC(trans, int, NOTCHAR);
1898  dfastate(s, d, trans);
1899
1900  /* Now go through the new transition table, and make sure that the trans
1901     and fail arrays are allocated large enough to hold a pointer for the
1902     largest state mentioned in the table. */
1903  for (i = 0; i < NOTCHAR; ++i)
1904    if (trans[i] >= d->tralloc)
1905      {
1906	int oldalloc = d->tralloc;
1907
1908	while (trans[i] >= d->tralloc)
1909	  d->tralloc *= 2;
1910	REALLOC(d->realtrans, int *, d->tralloc + 1);
1911	d->trans = d->realtrans + 1;
1912	REALLOC(d->fails, int *, d->tralloc);
1913	REALLOC(d->success, int, d->tralloc);
1914	REALLOC(d->newlines, int, d->tralloc);
1915	while (oldalloc < d->tralloc)
1916	  {
1917	    d->trans[oldalloc] = NULL;
1918	    d->fails[oldalloc++] = NULL;
1919	  }
1920      }
1921
1922  /* Keep the newline transition in a special place so we can use it as
1923     a sentinel. */
1924  d->newlines[s] = trans[eolbyte];
1925  trans[eolbyte] = -1;
1926
1927  if (ACCEPTING(s, *d))
1928    d->fails[s] = trans;
1929  else
1930    d->trans[s] = trans;
1931}
1932
1933static void
1934build_state_zero(d)
1935     struct dfa *d;
1936{
1937  d->tralloc = 1;
1938  d->trcount = 0;
1939  CALLOC(d->realtrans, int *, d->tralloc + 1);
1940  d->trans = d->realtrans + 1;
1941  CALLOC(d->fails, int *, d->tralloc);
1942  MALLOC(d->success, int, d->tralloc);
1943  MALLOC(d->newlines, int, d->tralloc);
1944  build_state(0, d);
1945}
1946
1947/* Search through a buffer looking for a match to the given struct dfa.
1948   Find the first occurrence of a string matching the regexp in the buffer,
1949   and the shortest possible version thereof.  Return a pointer to the first
1950   character after the match, or NULL if none is found.  Begin points to
1951   the beginning of the buffer, and end points to the first character after
1952   its end.  We store a newline in *end to act as a sentinel, so end had
1953   better point somewhere valid.  Newline is a flag indicating whether to
1954   allow newlines to be in the matching string.  If count is non-
1955   NULL it points to a place we're supposed to increment every time we
1956   see a newline.  Finally, if backref is non-NULL it points to a place
1957   where we're supposed to store a 1 if backreferencing happened and the
1958   match needs to be verified by a backtracking matcher.  Otherwise
1959   we store a 0 in *backref. */
1960char *
1961dfaexec(d, begin, end, newline, count, backref)
1962     struct dfa *d;
1963     char *begin;
1964     char *end;
1965     int newline;
1966     int *count;
1967     int *backref;
1968{
1969  register int s, s1, tmp;	/* Current state. */
1970  register unsigned char *p;	/* Current input character. */
1971  register int **trans, *t;	/* Copy of d->trans so it can be optimized
1972				   into a register. */
1973  register unsigned char eol = eolbyte;	/* Likewise for eolbyte.  */
1974  static int sbit[NOTCHAR];	/* Table for anding with d->success. */
1975  static int sbit_init;
1976
1977  if (! sbit_init)
1978    {
1979      int i;
1980
1981      sbit_init = 1;
1982      for (i = 0; i < NOTCHAR; ++i)
1983	sbit[i] = (IS_WORD_CONSTITUENT(i)) ? 2 : 1;
1984      sbit[eol] = 4;
1985    }
1986
1987  if (! d->tralloc)
1988    build_state_zero(d);
1989
1990  s = s1 = 0;
1991  p = (unsigned char *) begin;
1992  trans = d->trans;
1993  *end = eol;
1994
1995  for (;;)
1996    {
1997      while ((t = trans[s]) != 0) { /* hand-optimized loop */
1998	s1 = t[*p++];
1999        if ((t = trans[s1]) == 0) {
2000           tmp = s ; s = s1 ; s1 = tmp ; /* swap */
2001           break;
2002        }
2003	s = t[*p++];
2004      }
2005
2006      if (s >= 0 && p <= (unsigned char *) end && d->fails[s])
2007	{
2008	  if (d->success[s] & sbit[*p])
2009	    {
2010	      if (backref)
2011		*backref = (d->states[s].backref != 0);
2012	      return (char *) p;
2013	    }
2014
2015	  s1 = s;
2016	  s = d->fails[s][*p++];
2017	  continue;
2018	}
2019
2020      /* If the previous character was a newline, count it. */
2021      if (count && (char *) p <= end && p[-1] == eol)
2022	++*count;
2023
2024      /* Check if we've run off the end of the buffer. */
2025      if ((char *) p > end)
2026	return NULL;
2027
2028      if (s >= 0)
2029	{
2030	  build_state(s, d);
2031	  trans = d->trans;
2032	  continue;
2033	}
2034
2035      if (p[-1] == eol && newline)
2036	{
2037	  s = d->newlines[s1];
2038	  continue;
2039	}
2040
2041      s = 0;
2042    }
2043}
2044
2045/* Initialize the components of a dfa that the other routines don't
2046   initialize for themselves. */
2047void
2048dfainit(d)
2049     struct dfa *d;
2050{
2051  d->calloc = 1;
2052  MALLOC(d->charclasses, charclass, d->calloc);
2053  d->cindex = 0;
2054
2055  d->talloc = 1;
2056  MALLOC(d->tokens, token, d->talloc);
2057  d->tindex = d->depth = d->nleaves = d->nregexps = 0;
2058
2059  d->searchflag = 0;
2060  d->tralloc = 0;
2061
2062  d->musts = 0;
2063}
2064
2065/* Parse and analyze a single string of the given length. */
2066void
2067dfacomp(s, len, d, searchflag)
2068     char *s;
2069     size_t len;
2070     struct dfa *d;
2071     int searchflag;
2072{
2073  if (case_fold)	/* dummy folding in service of dfamust() */
2074    {
2075      char *lcopy;
2076      int i;
2077
2078      lcopy = malloc(len);
2079      if (!lcopy)
2080	dfaerror(_("out of memory"));
2081
2082      /* This is a kludge. */
2083      case_fold = 0;
2084      for (i = 0; i < len; ++i)
2085	if (ISUPPER ((unsigned char) s[i]))
2086	  lcopy[i] = tolower ((unsigned char) s[i]);
2087	else
2088	  lcopy[i] = s[i];
2089
2090      dfainit(d);
2091      dfaparse(lcopy, len, d);
2092      free(lcopy);
2093      dfamust(d);
2094      d->cindex = d->tindex = d->depth = d->nleaves = d->nregexps = 0;
2095      case_fold = 1;
2096      dfaparse(s, len, d);
2097      dfaanalyze(d, searchflag);
2098    }
2099  else
2100    {
2101        dfainit(d);
2102        dfaparse(s, len, d);
2103	dfamust(d);
2104        dfaanalyze(d, searchflag);
2105    }
2106}
2107
2108/* Free the storage held by the components of a dfa. */
2109void
2110dfafree(d)
2111     struct dfa *d;
2112{
2113  int i;
2114  struct dfamust *dm, *ndm;
2115
2116  free((ptr_t) d->charclasses);
2117  free((ptr_t) d->tokens);
2118  for (i = 0; i < d->sindex; ++i)
2119    free((ptr_t) d->states[i].elems.elems);
2120  free((ptr_t) d->states);
2121  for (i = 0; i < d->tindex; ++i)
2122    if (d->follows[i].elems)
2123      free((ptr_t) d->follows[i].elems);
2124  free((ptr_t) d->follows);
2125  for (i = 0; i < d->tralloc; ++i)
2126    if (d->trans[i])
2127      free((ptr_t) d->trans[i]);
2128    else if (d->fails[i])
2129      free((ptr_t) d->fails[i]);
2130  if (d->realtrans) free((ptr_t) d->realtrans);
2131  if (d->fails) free((ptr_t) d->fails);
2132  if (d->newlines) free((ptr_t) d->newlines);
2133  if (d->success) free((ptr_t) d->success);
2134  for (dm = d->musts; dm; dm = ndm)
2135    {
2136      ndm = dm->next;
2137      free(dm->must);
2138      free((ptr_t) dm);
2139    }
2140}
2141
2142/* Having found the postfix representation of the regular expression,
2143   try to find a long sequence of characters that must appear in any line
2144   containing the r.e.
2145   Finding a "longest" sequence is beyond the scope here;
2146   we take an easy way out and hope for the best.
2147   (Take "(ab|a)b"--please.)
2148
2149   We do a bottom-up calculation of sequences of characters that must appear
2150   in matches of r.e.'s represented by trees rooted at the nodes of the postfix
2151   representation:
2152	sequences that must appear at the left of the match ("left")
2153	sequences that must appear at the right of the match ("right")
2154	lists of sequences that must appear somewhere in the match ("in")
2155	sequences that must constitute the match ("is")
2156
2157   When we get to the root of the tree, we use one of the longest of its
2158   calculated "in" sequences as our answer.  The sequence we find is returned in
2159   d->must (where "d" is the single argument passed to "dfamust");
2160   the length of the sequence is returned in d->mustn.
2161
2162   The sequences calculated for the various types of node (in pseudo ANSI c)
2163   are shown below.  "p" is the operand of unary operators (and the left-hand
2164   operand of binary operators); "q" is the right-hand operand of binary
2165   operators.
2166
2167   "ZERO" means "a zero-length sequence" below.
2168
2169	Type	left		right		is		in
2170	----	----		-----		--		--
2171	char c	# c		# c		# c		# c
2172
2173	CSET	ZERO		ZERO		ZERO		ZERO
2174
2175	STAR	ZERO		ZERO		ZERO		ZERO
2176
2177	QMARK	ZERO		ZERO		ZERO		ZERO
2178
2179	PLUS	p->left		p->right	ZERO		p->in
2180
2181	CAT	(p->is==ZERO)?	(q->is==ZERO)?	(p->is!=ZERO &&	p->in plus
2182		p->left :	q->right :	q->is!=ZERO) ?	q->in plus
2183		p->is##q->left	p->right##q->is	p->is##q->is :	p->right##q->left
2184						ZERO
2185
2186	OR	longest common	longest common	(do p->is and	substrings common to
2187		leading		trailing	q->is have same	p->in and q->in
2188		(sub)sequence	(sub)sequence	length and
2189		of p->left	of p->right	content) ?
2190		and q->left	and q->right	p->is : NULL
2191
2192   If there's anything else we recognize in the tree, all four sequences get set
2193   to zero-length sequences.  If there's something we don't recognize in the tree,
2194   we just return a zero-length sequence.
2195
2196   Break ties in favor of infrequent letters (choosing 'zzz' in preference to
2197   'aaa')?
2198
2199   And. . .is it here or someplace that we might ponder "optimizations" such as
2200	egrep 'psi|epsilon'	->	egrep 'psi'
2201	egrep 'pepsi|epsilon'	->	egrep 'epsi'
2202					(Yes, we now find "epsi" as a "string
2203					that must occur", but we might also
2204					simplify the *entire* r.e. being sought)
2205	grep '[c]'		->	grep 'c'
2206	grep '(ab|a)b'		->	grep 'ab'
2207	grep 'ab*'		->	grep 'a'
2208	grep 'a*b'		->	grep 'b'
2209
2210   There are several issues:
2211
2212   Is optimization easy (enough)?
2213
2214   Does optimization actually accomplish anything,
2215   or is the automaton you get from "psi|epsilon" (for example)
2216   the same as the one you get from "psi" (for example)?
2217
2218   Are optimizable r.e.'s likely to be used in real-life situations
2219   (something like 'ab*' is probably unlikely; something like is
2220   'psi|epsilon' is likelier)? */
2221
2222static char *
2223icatalloc(old, new)
2224     char *old;
2225     char *new;
2226{
2227  char *result;
2228  size_t oldsize, newsize;
2229
2230  newsize = (new == NULL) ? 0 : strlen(new);
2231  if (old == NULL)
2232    oldsize = 0;
2233  else if (newsize == 0)
2234    return old;
2235  else	oldsize = strlen(old);
2236  if (old == NULL)
2237    result = (char *) malloc(newsize + 1);
2238  else
2239    result = (char *) realloc((void *) old, oldsize + newsize + 1);
2240  if (result != NULL && new != NULL)
2241    (void) strcpy(result + oldsize, new);
2242  return result;
2243}
2244
2245static char *
2246icpyalloc(string)
2247     char *string;
2248{
2249  return icatalloc((char *) NULL, string);
2250}
2251
2252static char *
2253istrstr(lookin, lookfor)
2254     char *lookin;
2255     char *lookfor;
2256{
2257  char *cp;
2258  size_t len;
2259
2260  len = strlen(lookfor);
2261  for (cp = lookin; *cp != '\0'; ++cp)
2262    if (strncmp(cp, lookfor, len) == 0)
2263      return cp;
2264  return NULL;
2265}
2266
2267static void
2268ifree(cp)
2269     char *cp;
2270{
2271  if (cp != NULL)
2272    free(cp);
2273}
2274
2275static void
2276freelist(cpp)
2277     char **cpp;
2278{
2279  int i;
2280
2281  if (cpp == NULL)
2282    return;
2283  for (i = 0; cpp[i] != NULL; ++i)
2284    {
2285      free(cpp[i]);
2286      cpp[i] = NULL;
2287    }
2288}
2289
2290static char **
2291enlist(cpp, new, len)
2292     char **cpp;
2293     char *new;
2294     size_t len;
2295{
2296  int i, j;
2297
2298  if (cpp == NULL)
2299    return NULL;
2300  if ((new = icpyalloc(new)) == NULL)
2301    {
2302      freelist(cpp);
2303      return NULL;
2304    }
2305  new[len] = '\0';
2306  /* Is there already something in the list that's new (or longer)? */
2307  for (i = 0; cpp[i] != NULL; ++i)
2308    if (istrstr(cpp[i], new) != NULL)
2309      {
2310	free(new);
2311	return cpp;
2312      }
2313  /* Eliminate any obsoleted strings. */
2314  j = 0;
2315  while (cpp[j] != NULL)
2316    if (istrstr(new, cpp[j]) == NULL)
2317      ++j;
2318    else
2319      {
2320	free(cpp[j]);
2321	if (--i == j)
2322	  break;
2323	cpp[j] = cpp[i];
2324	cpp[i] = NULL;
2325      }
2326  /* Add the new string. */
2327  cpp = (char **) realloc((char *) cpp, (i + 2) * sizeof *cpp);
2328  if (cpp == NULL)
2329    return NULL;
2330  cpp[i] = new;
2331  cpp[i + 1] = NULL;
2332  return cpp;
2333}
2334
2335/* Given pointers to two strings, return a pointer to an allocated
2336   list of their distinct common substrings. Return NULL if something
2337   seems wild. */
2338static char **
2339comsubs(left, right)
2340     char *left;
2341     char *right;
2342{
2343  char **cpp;
2344  char *lcp;
2345  char *rcp;
2346  size_t i, len;
2347
2348  if (left == NULL || right == NULL)
2349    return NULL;
2350  cpp = (char **) malloc(sizeof *cpp);
2351  if (cpp == NULL)
2352    return NULL;
2353  cpp[0] = NULL;
2354  for (lcp = left; *lcp != '\0'; ++lcp)
2355    {
2356      len = 0;
2357      rcp = index(right, *lcp);
2358      while (rcp != NULL)
2359	{
2360	  for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
2361	    continue;
2362	  if (i > len)
2363	    len = i;
2364	  rcp = index(rcp + 1, *lcp);
2365	}
2366      if (len == 0)
2367	continue;
2368      if ((cpp = enlist(cpp, lcp, len)) == NULL)
2369	break;
2370    }
2371  return cpp;
2372}
2373
2374static char **
2375addlists(old, new)
2376char **old;
2377char **new;
2378{
2379  int i;
2380
2381  if (old == NULL || new == NULL)
2382    return NULL;
2383  for (i = 0; new[i] != NULL; ++i)
2384    {
2385      old = enlist(old, new[i], strlen(new[i]));
2386      if (old == NULL)
2387	break;
2388    }
2389  return old;
2390}
2391
2392/* Given two lists of substrings, return a new list giving substrings
2393   common to both. */
2394static char **
2395inboth(left, right)
2396     char **left;
2397     char **right;
2398{
2399  char **both;
2400  char **temp;
2401  int lnum, rnum;
2402
2403  if (left == NULL || right == NULL)
2404    return NULL;
2405  both = (char **) malloc(sizeof *both);
2406  if (both == NULL)
2407    return NULL;
2408  both[0] = NULL;
2409  for (lnum = 0; left[lnum] != NULL; ++lnum)
2410    {
2411      for (rnum = 0; right[rnum] != NULL; ++rnum)
2412	{
2413	  temp = comsubs(left[lnum], right[rnum]);
2414	  if (temp == NULL)
2415	    {
2416	      freelist(both);
2417	      return NULL;
2418	    }
2419	  both = addlists(both, temp);
2420	  freelist(temp);
2421	  free(temp);
2422	  if (both == NULL)
2423	    return NULL;
2424	}
2425    }
2426  return both;
2427}
2428
2429typedef struct
2430{
2431  char **in;
2432  char *left;
2433  char *right;
2434  char *is;
2435} must;
2436
2437static void
2438resetmust(mp)
2439must *mp;
2440{
2441  mp->left[0] = mp->right[0] = mp->is[0] = '\0';
2442  freelist(mp->in);
2443}
2444
2445static void
2446dfamust(dfa)
2447struct dfa *dfa;
2448{
2449  must *musts;
2450  must *mp;
2451  char *result;
2452  int ri;
2453  int i;
2454  int exact;
2455  token t;
2456  static must must0;
2457  struct dfamust *dm;
2458  static char empty_string[] = "";
2459
2460  result = empty_string;
2461  exact = 0;
2462  musts = (must *) malloc((dfa->tindex + 1) * sizeof *musts);
2463  if (musts == NULL)
2464    return;
2465  mp = musts;
2466  for (i = 0; i <= dfa->tindex; ++i)
2467    mp[i] = must0;
2468  for (i = 0; i <= dfa->tindex; ++i)
2469    {
2470      mp[i].in = (char **) malloc(sizeof *mp[i].in);
2471      mp[i].left = malloc(2);
2472      mp[i].right = malloc(2);
2473      mp[i].is = malloc(2);
2474      if (mp[i].in == NULL || mp[i].left == NULL ||
2475	  mp[i].right == NULL || mp[i].is == NULL)
2476	goto done;
2477      mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0';
2478      mp[i].in[0] = NULL;
2479    }
2480#ifdef DEBUG
2481  fprintf(stderr, "dfamust:\n");
2482  for (i = 0; i < dfa->tindex; ++i)
2483    {
2484      fprintf(stderr, " %d:", i);
2485      prtok(dfa->tokens[i]);
2486    }
2487  putc('\n', stderr);
2488#endif
2489  for (ri = 0; ri < dfa->tindex; ++ri)
2490    {
2491      switch (t = dfa->tokens[ri])
2492	{
2493	case LPAREN:
2494	case RPAREN:
2495	  goto done;		/* "cannot happen" */
2496	case EMPTY:
2497	case BEGLINE:
2498	case ENDLINE:
2499	case BEGWORD:
2500	case ENDWORD:
2501	case LIMWORD:
2502	case NOTLIMWORD:
2503	case BACKREF:
2504	  resetmust(mp);
2505	  break;
2506	case STAR:
2507	case QMARK:
2508	  if (mp <= musts)
2509	    goto done;		/* "cannot happen" */
2510	  --mp;
2511	  resetmust(mp);
2512	  break;
2513	case OR:
2514	case ORTOP:
2515	  if (mp < &musts[2])
2516	    goto done;		/* "cannot happen" */
2517	  {
2518	    char **new;
2519	    must *lmp;
2520	    must *rmp;
2521	    int j, ln, rn, n;
2522
2523	    rmp = --mp;
2524	    lmp = --mp;
2525	    /* Guaranteed to be.  Unlikely, but. . . */
2526	    if (strcmp(lmp->is, rmp->is) != 0)
2527	      lmp->is[0] = '\0';
2528	    /* Left side--easy */
2529	    i = 0;
2530	    while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
2531	      ++i;
2532	    lmp->left[i] = '\0';
2533	    /* Right side */
2534	    ln = strlen(lmp->right);
2535	    rn = strlen(rmp->right);
2536	    n = ln;
2537	    if (n > rn)
2538	      n = rn;
2539	    for (i = 0; i < n; ++i)
2540	      if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
2541		break;
2542	    for (j = 0; j < i; ++j)
2543	      lmp->right[j] = lmp->right[(ln - i) + j];
2544	    lmp->right[j] = '\0';
2545	    new = inboth(lmp->in, rmp->in);
2546	    if (new == NULL)
2547	      goto done;
2548	    freelist(lmp->in);
2549	    free((char *) lmp->in);
2550	    lmp->in = new;
2551	  }
2552	  break;
2553	case PLUS:
2554	  if (mp <= musts)
2555	    goto done;		/* "cannot happen" */
2556	  --mp;
2557	  mp->is[0] = '\0';
2558	  break;
2559	case END:
2560	  if (mp != &musts[1])
2561	    goto done;		/* "cannot happen" */
2562	  for (i = 0; musts[0].in[i] != NULL; ++i)
2563	    if (strlen(musts[0].in[i]) > strlen(result))
2564	      result = musts[0].in[i];
2565	  if (strcmp(result, musts[0].is) == 0)
2566	    exact = 1;
2567	  goto done;
2568	case CAT:
2569	  if (mp < &musts[2])
2570	    goto done;		/* "cannot happen" */
2571	  {
2572	    must *lmp;
2573	    must *rmp;
2574
2575	    rmp = --mp;
2576	    lmp = --mp;
2577	    /* In.  Everything in left, plus everything in
2578	       right, plus catenation of
2579	       left's right and right's left. */
2580	    lmp->in = addlists(lmp->in, rmp->in);
2581	    if (lmp->in == NULL)
2582	      goto done;
2583	    if (lmp->right[0] != '\0' &&
2584		rmp->left[0] != '\0')
2585	      {
2586		char *tp;
2587
2588		tp = icpyalloc(lmp->right);
2589		if (tp == NULL)
2590		  goto done;
2591		tp = icatalloc(tp, rmp->left);
2592		if (tp == NULL)
2593		  goto done;
2594		lmp->in = enlist(lmp->in, tp,
2595				 strlen(tp));
2596		free(tp);
2597		if (lmp->in == NULL)
2598		  goto done;
2599	      }
2600	    /* Left-hand */
2601	    if (lmp->is[0] != '\0')
2602	      {
2603		lmp->left = icatalloc(lmp->left,
2604				      rmp->left);
2605		if (lmp->left == NULL)
2606		  goto done;
2607	      }
2608	    /* Right-hand */
2609	    if (rmp->is[0] == '\0')
2610	      lmp->right[0] = '\0';
2611	    lmp->right = icatalloc(lmp->right, rmp->right);
2612	    if (lmp->right == NULL)
2613	      goto done;
2614	    /* Guaranteed to be */
2615	    if (lmp->is[0] != '\0' && rmp->is[0] != '\0')
2616	      {
2617		lmp->is = icatalloc(lmp->is, rmp->is);
2618		if (lmp->is == NULL)
2619		  goto done;
2620	      }
2621	    else
2622	      lmp->is[0] = '\0';
2623	  }
2624	  break;
2625	default:
2626	  if (t < END)
2627	    {
2628	      /* "cannot happen" */
2629	      goto done;
2630	    }
2631	  else if (t == '\0')
2632	    {
2633	      /* not on *my* shift */
2634	      goto done;
2635	    }
2636	  else if (t >= CSET)
2637	    {
2638	      /* easy enough */
2639	      resetmust(mp);
2640	    }
2641	  else
2642	    {
2643	      /* plain character */
2644	      resetmust(mp);
2645	      mp->is[0] = mp->left[0] = mp->right[0] = t;
2646	      mp->is[1] = mp->left[1] = mp->right[1] = '\0';
2647	      mp->in = enlist(mp->in, mp->is, (size_t)1);
2648	      if (mp->in == NULL)
2649		goto done;
2650	    }
2651	  break;
2652	}
2653#ifdef DEBUG
2654      fprintf(stderr, " node: %d:", ri);
2655      prtok(dfa->tokens[ri]);
2656      fprintf(stderr, "\n  in:");
2657      for (i = 0; mp->in[i]; ++i)
2658	fprintf(stderr, " \"%s\"", mp->in[i]);
2659      fprintf(stderr, "\n  is: \"%s\"\n", mp->is);
2660      fprintf(stderr, "  left: \"%s\"\n", mp->left);
2661      fprintf(stderr, "  right: \"%s\"\n", mp->right);
2662#endif
2663      ++mp;
2664    }
2665 done:
2666  if (strlen(result))
2667    {
2668      dm = (struct dfamust *) malloc(sizeof (struct dfamust));
2669      dm->exact = exact;
2670      dm->must = malloc(strlen(result) + 1);
2671      strcpy(dm->must, result);
2672      dm->next = dfa->musts;
2673      dfa->musts = dm;
2674    }
2675  mp = musts;
2676  for (i = 0; i <= dfa->tindex; ++i)
2677    {
2678      freelist(mp[i].in);
2679      ifree((char *) mp[i].in);
2680      ifree(mp[i].left);
2681      ifree(mp[i].right);
2682      ifree(mp[i].is);
2683    }
2684  free((char *) mp);
2685}
2686