dfa.h revision 55379
1/* dfa.h - declarations for GNU deterministic regexp compiler
2   Copyright (C) 1988, 1998 Free Software Foundation, Inc.
3
4   This program is free software; you can redistribute it and/or modify
5   it under the terms of the GNU General Public License as published by
6   the Free Software Foundation; either version 2, or (at your option)
7   any later version.
8
9   This program is distributed in the hope that it will be useful,
10   but WITHOUT ANY WARRANTY; without even the implied warranty of
11   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12   GNU General Public License for more details.
13
14   You should have received a copy of the GNU General Public License
15   along with this program; if not, write to the Free Software
16   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA */
17
18/* Written June, 1988 by Mike Haertel */
19
20/* $FreeBSD: head/gnu/usr.bin/grep/dfa.h 55379 2000-01-04 03:25:40Z obrien $ */
21
22/* FIXME:
23   2.  We should not export so much of the DFA internals.
24   In addition to clobbering modularity, we eat up valuable
25   name space. */
26
27# undef PARAMS
28#if __STDC__
29# ifndef _PTR_T
30# define _PTR_T
31  typedef void * ptr_t;
32# endif
33# define PARAMS(x) x
34#else
35# ifndef _PTR_T
36# define _PTR_T
37  typedef char * ptr_t;
38# endif
39# define PARAMS(x) ()
40#endif
41
42/* Number of bits in an unsigned char. */
43#ifndef CHARBITS
44#define CHARBITS 8
45#endif
46
47/* First integer value that is greater than any character code. */
48#define NOTCHAR (1 << CHARBITS)
49
50/* INTBITS need not be exact, just a lower bound. */
51#ifndef INTBITS
52#define INTBITS (CHARBITS * sizeof (int))
53#endif
54
55/* Number of ints required to hold a bit for every character. */
56#define CHARCLASS_INTS ((NOTCHAR + INTBITS - 1) / INTBITS)
57
58/* Sets of unsigned characters are stored as bit vectors in arrays of ints. */
59typedef int charclass[CHARCLASS_INTS];
60
61/* The regexp is parsed into an array of tokens in postfix form.  Some tokens
62   are operators and others are terminal symbols.  Most (but not all) of these
63   codes are returned by the lexical analyzer. */
64
65typedef enum
66{
67  END = -1,			/* END is a terminal symbol that matches the
68				   end of input; any value of END or less in
69				   the parse tree is such a symbol.  Accepting
70				   states of the DFA are those that would have
71				   a transition on END. */
72
73  /* Ordinary character values are terminal symbols that match themselves. */
74
75  EMPTY = NOTCHAR,		/* EMPTY is a terminal symbol that matches
76				   the empty string. */
77
78  BACKREF,			/* BACKREF is generated by \<digit>; it
79				   it not completely handled.  If the scanner
80				   detects a transition on backref, it returns
81				   a kind of "semi-success" indicating that
82				   the match will have to be verified with
83				   a backtracking matcher. */
84
85  BEGLINE,			/* BEGLINE is a terminal symbol that matches
86				   the empty string if it is at the beginning
87				   of a line. */
88
89  ENDLINE,			/* ENDLINE is a terminal symbol that matches
90				   the empty string if it is at the end of
91				   a line. */
92
93  BEGWORD,			/* BEGWORD is a terminal symbol that matches
94				   the empty string if it is at the beginning
95				   of a word. */
96
97  ENDWORD,			/* ENDWORD is a terminal symbol that matches
98				   the empty string if it is at the end of
99				   a word. */
100
101  LIMWORD,			/* LIMWORD is a terminal symbol that matches
102				   the empty string if it is at the beginning
103				   or the end of a word. */
104
105  NOTLIMWORD,			/* NOTLIMWORD is a terminal symbol that
106				   matches the empty string if it is not at
107				   the beginning or end of a word. */
108
109  QMARK,			/* QMARK is an operator of one argument that
110				   matches zero or one occurences of its
111				   argument. */
112
113  STAR,				/* STAR is an operator of one argument that
114				   matches the Kleene closure (zero or more
115				   occurrences) of its argument. */
116
117  PLUS,				/* PLUS is an operator of one argument that
118				   matches the positive closure (one or more
119				   occurrences) of its argument. */
120
121  REPMN,			/* REPMN is a lexical token corresponding
122				   to the {m,n} construct.  REPMN never
123				   appears in the compiled token vector. */
124
125  CAT,				/* CAT is an operator of two arguments that
126				   matches the concatenation of its
127				   arguments.  CAT is never returned by the
128				   lexical analyzer. */
129
130  OR,				/* OR is an operator of two arguments that
131				   matches either of its arguments. */
132
133  ORTOP,			/* OR at the toplevel in the parse tree.
134				   This is used for a boyer-moore heuristic. */
135
136  LPAREN,			/* LPAREN never appears in the parse tree,
137				   it is only a lexeme. */
138
139  RPAREN,			/* RPAREN never appears in the parse tree. */
140
141  CSET				/* CSET and (and any value greater) is a
142				   terminal symbol that matches any of a
143				   class of characters. */
144} token;
145
146/* Sets are stored in an array in the compiled dfa; the index of the
147   array corresponding to a given set token is given by SET_INDEX(t). */
148#define SET_INDEX(t) ((t) - CSET)
149
150/* Sometimes characters can only be matched depending on the surrounding
151   context.  Such context decisions depend on what the previous character
152   was, and the value of the current (lookahead) character.  Context
153   dependent constraints are encoded as 8 bit integers.  Each bit that
154   is set indicates that the constraint succeeds in the corresponding
155   context.
156
157   bit 7 - previous and current are newlines
158   bit 6 - previous was newline, current isn't
159   bit 5 - previous wasn't newline, current is
160   bit 4 - neither previous nor current is a newline
161   bit 3 - previous and current are word-constituents
162   bit 2 - previous was word-constituent, current isn't
163   bit 1 - previous wasn't word-constituent, current is
164   bit 0 - neither previous nor current is word-constituent
165
166   Word-constituent characters are those that satisfy isalnum().
167
168   The macro SUCCEEDS_IN_CONTEXT determines whether a a given constraint
169   succeeds in a particular context.  Prevn is true if the previous character
170   was a newline, currn is true if the lookahead character is a newline.
171   Prevl and currl similarly depend upon whether the previous and current
172   characters are word-constituent letters. */
173#define MATCHES_NEWLINE_CONTEXT(constraint, prevn, currn) \
174  ((constraint) & 1 << (((prevn) ? 2 : 0) + ((currn) ? 1 : 0) + 4))
175#define MATCHES_LETTER_CONTEXT(constraint, prevl, currl) \
176  ((constraint) & 1 << (((prevl) ? 2 : 0) + ((currl) ? 1 : 0)))
177#define SUCCEEDS_IN_CONTEXT(constraint, prevn, currn, prevl, currl) \
178  (MATCHES_NEWLINE_CONTEXT(constraint, prevn, currn)		     \
179   && MATCHES_LETTER_CONTEXT(constraint, prevl, currl))
180
181/* The following macros give information about what a constraint depends on. */
182#define PREV_NEWLINE_DEPENDENT(constraint) \
183  (((constraint) & 0xc0) >> 2 != ((constraint) & 0x30))
184#define PREV_LETTER_DEPENDENT(constraint) \
185  (((constraint) & 0x0c) >> 2 != ((constraint) & 0x03))
186
187/* Tokens that match the empty string subject to some constraint actually
188   work by applying that constraint to determine what may follow them,
189   taking into account what has gone before.  The following values are
190   the constraints corresponding to the special tokens previously defined. */
191#define NO_CONSTRAINT 0xff
192#define BEGLINE_CONSTRAINT 0xcf
193#define ENDLINE_CONSTRAINT 0xaf
194#define BEGWORD_CONSTRAINT 0xf2
195#define ENDWORD_CONSTRAINT 0xf4
196#define LIMWORD_CONSTRAINT 0xf6
197#define NOTLIMWORD_CONSTRAINT 0xf9
198
199/* States of the recognizer correspond to sets of positions in the parse
200   tree, together with the constraints under which they may be matched.
201   So a position is encoded as an index into the parse tree together with
202   a constraint. */
203typedef struct
204{
205  unsigned index;		/* Index into the parse array. */
206  unsigned constraint;		/* Constraint for matching this position. */
207} position;
208
209/* Sets of positions are stored as arrays. */
210typedef struct
211{
212  position *elems;		/* Elements of this position set. */
213  int nelem;			/* Number of elements in this set. */
214} position_set;
215
216/* A state of the dfa consists of a set of positions, some flags,
217   and the token value of the lowest-numbered position of the state that
218   contains an END token. */
219typedef struct
220{
221  int hash;			/* Hash of the positions of this state. */
222  position_set elems;		/* Positions this state could match. */
223  char newline;			/* True if previous state matched newline. */
224  char letter;			/* True if previous state matched a letter. */
225  char backref;			/* True if this state matches a \<digit>. */
226  unsigned char constraint;	/* Constraint for this state to accept. */
227  int first_end;		/* Token value of the first END in elems. */
228} dfa_state;
229
230/* Element of a list of strings, at least one of which is known to
231   appear in any R.E. matching the DFA. */
232struct dfamust
233{
234  int exact;
235  char *must;
236  struct dfamust *next;
237};
238
239/* A compiled regular expression. */
240struct dfa
241{
242  /* Stuff built by the scanner. */
243  charclass *charclasses;	/* Array of character sets for CSET tokens. */
244  int cindex;			/* Index for adding new charclasses. */
245  int calloc;			/* Number of charclasses currently allocated. */
246
247  /* Stuff built by the parser. */
248  token *tokens;		/* Postfix parse array. */
249  int tindex;			/* Index for adding new tokens. */
250  int talloc;			/* Number of tokens currently allocated. */
251  int depth;			/* Depth required of an evaluation stack
252				   used for depth-first traversal of the
253				   parse tree. */
254  int nleaves;			/* Number of leaves on the parse tree. */
255  int nregexps;			/* Count of parallel regexps being built
256				   with dfaparse(). */
257
258  /* Stuff owned by the state builder. */
259  dfa_state *states;		/* States of the dfa. */
260  int sindex;			/* Index for adding new states. */
261  int salloc;			/* Number of states currently allocated. */
262
263  /* Stuff built by the structure analyzer. */
264  position_set *follows;	/* Array of follow sets, indexed by position
265				   index.  The follow of a position is the set
266				   of positions containing characters that
267				   could conceivably follow a character
268				   matching the given position in a string
269				   matching the regexp.  Allocated to the
270				   maximum possible position index. */
271  int searchflag;		/* True if we are supposed to build a searching
272				   as opposed to an exact matcher.  A searching
273				   matcher finds the first and shortest string
274				   matching a regexp anywhere in the buffer,
275				   whereas an exact matcher finds the longest
276				   string matching, but anchored to the
277				   beginning of the buffer. */
278
279  /* Stuff owned by the executor. */
280  int tralloc;			/* Number of transition tables that have
281				   slots so far. */
282  int trcount;			/* Number of transition tables that have
283				   actually been built. */
284  int **trans;			/* Transition tables for states that can
285				   never accept.  If the transitions for a
286				   state have not yet been computed, or the
287				   state could possibly accept, its entry in
288				   this table is NULL. */
289  int **realtrans;		/* Trans always points to realtrans + 1; this
290				   is so trans[-1] can contain NULL. */
291  int **fails;			/* Transition tables after failing to accept
292				   on a state that potentially could do so. */
293  int *success;			/* Table of acceptance conditions used in
294				   dfaexec and computed in build_state. */
295  int *newlines;		/* Transitions on newlines.  The entry for a
296				   newline in any transition table is always
297				   -1 so we can count lines without wasting
298				   too many cycles.  The transition for a
299				   newline is stored separately and handled
300				   as a special case.  Newline is also used
301				   as a sentinel at the end of the buffer. */
302  struct dfamust *musts;	/* List of strings, at least one of which
303				   is known to appear in any r.e. matching
304				   the dfa. */
305};
306
307/* Some macros for user access to dfa internals. */
308
309/* ACCEPTING returns true if s could possibly be an accepting state of r. */
310#define ACCEPTING(s, r) ((r).states[s].constraint)
311
312/* ACCEPTS_IN_CONTEXT returns true if the given state accepts in the
313   specified context. */
314#define ACCEPTS_IN_CONTEXT(prevn, currn, prevl, currl, state, dfa) \
315  SUCCEEDS_IN_CONTEXT((dfa).states[state].constraint,		   \
316		       prevn, currn, prevl, currl)
317
318/* FIRST_MATCHING_REGEXP returns the index number of the first of parallel
319   regexps that a given state could accept.  Parallel regexps are numbered
320   starting at 1. */
321#define FIRST_MATCHING_REGEXP(state, dfa) (-(dfa).states[state].first_end)
322
323/* Entry points. */
324
325/* dfasyntax() takes three arguments; the first sets the syntax bits described
326   earlier in this file, the second sets the case-folding flag, and the
327   third specifies the line terminator. */
328extern void dfasyntax PARAMS ((reg_syntax_t, int, int));
329
330/* Compile the given string of the given length into the given struct dfa.
331   Final argument is a flag specifying whether to build a searching or an
332   exact matcher. */
333extern void dfacomp PARAMS ((char *, size_t, struct dfa *, int));
334
335/* Execute the given struct dfa on the buffer of characters.  The
336   first char * points to the beginning, and the second points to the
337   first character after the end of the buffer, which must be a writable
338   place so a sentinel end-of-buffer marker can be stored there.  The
339   second-to-last argument is a flag telling whether to allow newlines to
340   be part of a string matching the regexp.  The next-to-last argument,
341   if non-NULL, points to a place to increment every time we see a
342   newline.  The final argument, if non-NULL, points to a flag that will
343   be set if further examination by a backtracking matcher is needed in
344   order to verify backreferencing; otherwise the flag will be cleared.
345   Returns NULL if no match is found, or a pointer to the first
346   character after the first & shortest matching string in the buffer. */
347extern char *dfaexec PARAMS ((struct dfa *, char *, char *, int, int *, int *));
348
349/* Free the storage held by the components of a struct dfa. */
350extern void dfafree PARAMS ((struct dfa *));
351
352/* Entry points for people who know what they're doing. */
353
354/* Initialize the components of a struct dfa. */
355extern void dfainit PARAMS ((struct dfa *));
356
357/* Incrementally parse a string of given length into a struct dfa. */
358extern void dfaparse PARAMS ((char *, size_t, struct dfa *));
359
360/* Analyze a parsed regexp; second argument tells whether to build a searching
361   or an exact matcher. */
362extern void dfaanalyze PARAMS ((struct dfa *, int));
363
364/* Compute, for each possible character, the transitions out of a given
365   state, storing them in an array of integers. */
366extern void dfastate PARAMS ((int, struct dfa *, int []));
367
368/* Error handling. */
369
370/* dfaerror() is called by the regexp routines whenever an error occurs.  It
371   takes a single argument, a NUL-terminated string describing the error.
372   The default dfaerror() prints the error message to stderr and exits.
373   The user can provide a different dfafree() if so desired. */
374extern void dfaerror PARAMS ((const char *));
375