dfa.h revision 131576
1/* dfa.h - declarations for GNU deterministic regexp compiler
2   Copyright (C) 1988, 1998 Free Software Foundation, Inc.
3
4   This program is free software; you can redistribute it and/or modify
5   it under the terms of the GNU General Public License as published by
6   the Free Software Foundation; either version 2, or (at your option)
7   any later version.
8
9   This program is distributed in the hope that it will be useful,
10   but WITHOUT ANY WARRANTY; without even the implied warranty of
11   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12   GNU General Public License for more details.
13
14   You should have received a copy of the GNU General Public License
15   along with this program; if not, write to the Free Software
16   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA */
17
18/* Written June, 1988 by Mike Haertel */
19
20/* $FreeBSD: head/gnu/usr.bin/grep/dfa.h 131576 2004-07-04 16:16:59Z tjr $ */
21
22/* FIXME:
23   2.  We should not export so much of the DFA internals.
24   In addition to clobbering modularity, we eat up valuable
25   name space. */
26
27#include "mbcache.h"
28
29#ifdef __STDC__
30# ifndef _PTR_T
31# define _PTR_T
32  typedef void * ptr_t;
33# endif
34#else
35# ifndef _PTR_T
36# define _PTR_T
37  typedef char * ptr_t;
38# endif
39#endif
40
41#ifdef PARAMS
42# undef PARAMS
43#endif
44#if PROTOTYPES
45# define PARAMS(x) x
46#else
47# define PARAMS(x) ()
48#endif
49
50/* Number of bits in an unsigned char. */
51#ifndef CHARBITS
52#define CHARBITS 8
53#endif
54
55/* First integer value that is greater than any character code. */
56#define NOTCHAR (1 << CHARBITS)
57
58/* INTBITS need not be exact, just a lower bound. */
59#ifndef INTBITS
60#define INTBITS (CHARBITS * sizeof (int))
61#endif
62
63/* Number of ints required to hold a bit for every character. */
64#define CHARCLASS_INTS ((NOTCHAR + INTBITS - 1) / INTBITS)
65
66/* Sets of unsigned characters are stored as bit vectors in arrays of ints. */
67typedef int charclass[CHARCLASS_INTS];
68
69/* The regexp is parsed into an array of tokens in postfix form.  Some tokens
70   are operators and others are terminal symbols.  Most (but not all) of these
71   codes are returned by the lexical analyzer. */
72
73typedef enum
74{
75  END = -1,			/* END is a terminal symbol that matches the
76				   end of input; any value of END or less in
77				   the parse tree is such a symbol.  Accepting
78				   states of the DFA are those that would have
79				   a transition on END. */
80
81  /* Ordinary character values are terminal symbols that match themselves. */
82
83  EMPTY = NOTCHAR,		/* EMPTY is a terminal symbol that matches
84				   the empty string. */
85
86  BACKREF,			/* BACKREF is generated by \<digit>; it
87				   it not completely handled.  If the scanner
88				   detects a transition on backref, it returns
89				   a kind of "semi-success" indicating that
90				   the match will have to be verified with
91				   a backtracking matcher. */
92
93  BEGLINE,			/* BEGLINE is a terminal symbol that matches
94				   the empty string if it is at the beginning
95				   of a line. */
96
97  ENDLINE,			/* ENDLINE is a terminal symbol that matches
98				   the empty string if it is at the end of
99				   a line. */
100
101  BEGWORD,			/* BEGWORD is a terminal symbol that matches
102				   the empty string if it is at the beginning
103				   of a word. */
104
105  ENDWORD,			/* ENDWORD is a terminal symbol that matches
106				   the empty string if it is at the end of
107				   a word. */
108
109  LIMWORD,			/* LIMWORD is a terminal symbol that matches
110				   the empty string if it is at the beginning
111				   or the end of a word. */
112
113  NOTLIMWORD,			/* NOTLIMWORD is a terminal symbol that
114				   matches the empty string if it is not at
115				   the beginning or end of a word. */
116
117  QMARK,			/* QMARK is an operator of one argument that
118				   matches zero or one occurences of its
119				   argument. */
120
121  STAR,				/* STAR is an operator of one argument that
122				   matches the Kleene closure (zero or more
123				   occurrences) of its argument. */
124
125  PLUS,				/* PLUS is an operator of one argument that
126				   matches the positive closure (one or more
127				   occurrences) of its argument. */
128
129  REPMN,			/* REPMN is a lexical token corresponding
130				   to the {m,n} construct.  REPMN never
131				   appears in the compiled token vector. */
132
133  CAT,				/* CAT is an operator of two arguments that
134				   matches the concatenation of its
135				   arguments.  CAT is never returned by the
136				   lexical analyzer. */
137
138  OR,				/* OR is an operator of two arguments that
139				   matches either of its arguments. */
140
141  ORTOP,			/* OR at the toplevel in the parse tree.
142				   This is used for a boyer-moore heuristic. */
143
144  LPAREN,			/* LPAREN never appears in the parse tree,
145				   it is only a lexeme. */
146
147  RPAREN,			/* RPAREN never appears in the parse tree. */
148
149  CRANGE,			/* CRANGE never appears in the parse tree.
150				   It stands for a character range that can
151				   match a string of one or more characters.
152				   For example, [a-z] can match "ch" in
153				   a Spanish locale.  */
154
155#ifdef MBS_SUPPORT
156  ANYCHAR,                     /* ANYCHAR is a terminal symbol that matches
157                                  any multibyte(or singlebyte) characters.
158			          It is used only if MB_CUR_MAX > 1.  */
159
160  MBCSET,			/* MBCSET is similar to CSET, but for
161				   multibyte characters.  */
162#endif /* MBS_SUPPORT */
163
164  CSET				/* CSET and (and any value greater) is a
165				   terminal symbol that matches any of a
166				   class of characters. */
167} token;
168
169/* Sets are stored in an array in the compiled dfa; the index of the
170   array corresponding to a given set token is given by SET_INDEX(t). */
171#define SET_INDEX(t) ((t) - CSET)
172
173/* Sometimes characters can only be matched depending on the surrounding
174   context.  Such context decisions depend on what the previous character
175   was, and the value of the current (lookahead) character.  Context
176   dependent constraints are encoded as 8 bit integers.  Each bit that
177   is set indicates that the constraint succeeds in the corresponding
178   context.
179
180   bit 7 - previous and current are newlines
181   bit 6 - previous was newline, current isn't
182   bit 5 - previous wasn't newline, current is
183   bit 4 - neither previous nor current is a newline
184   bit 3 - previous and current are word-constituents
185   bit 2 - previous was word-constituent, current isn't
186   bit 1 - previous wasn't word-constituent, current is
187   bit 0 - neither previous nor current is word-constituent
188
189   Word-constituent characters are those that satisfy isalnum().
190
191   The macro SUCCEEDS_IN_CONTEXT determines whether a a given constraint
192   succeeds in a particular context.  Prevn is true if the previous character
193   was a newline, currn is true if the lookahead character is a newline.
194   Prevl and currl similarly depend upon whether the previous and current
195   characters are word-constituent letters. */
196#define MATCHES_NEWLINE_CONTEXT(constraint, prevn, currn) \
197  ((constraint) & 1 << (((prevn) ? 2 : 0) + ((currn) ? 1 : 0) + 4))
198#define MATCHES_LETTER_CONTEXT(constraint, prevl, currl) \
199  ((constraint) & 1 << (((prevl) ? 2 : 0) + ((currl) ? 1 : 0)))
200#define SUCCEEDS_IN_CONTEXT(constraint, prevn, currn, prevl, currl) \
201  (MATCHES_NEWLINE_CONTEXT(constraint, prevn, currn)		     \
202   && MATCHES_LETTER_CONTEXT(constraint, prevl, currl))
203
204/* The following macros give information about what a constraint depends on. */
205#define PREV_NEWLINE_DEPENDENT(constraint) \
206  (((constraint) & 0xc0) >> 2 != ((constraint) & 0x30))
207#define PREV_LETTER_DEPENDENT(constraint) \
208  (((constraint) & 0x0c) >> 2 != ((constraint) & 0x03))
209
210/* Tokens that match the empty string subject to some constraint actually
211   work by applying that constraint to determine what may follow them,
212   taking into account what has gone before.  The following values are
213   the constraints corresponding to the special tokens previously defined. */
214#define NO_CONSTRAINT 0xff
215#define BEGLINE_CONSTRAINT 0xcf
216#define ENDLINE_CONSTRAINT 0xaf
217#define BEGWORD_CONSTRAINT 0xf2
218#define ENDWORD_CONSTRAINT 0xf4
219#define LIMWORD_CONSTRAINT 0xf6
220#define NOTLIMWORD_CONSTRAINT 0xf9
221
222/* States of the recognizer correspond to sets of positions in the parse
223   tree, together with the constraints under which they may be matched.
224   So a position is encoded as an index into the parse tree together with
225   a constraint. */
226typedef struct
227{
228  unsigned index;		/* Index into the parse array. */
229  unsigned constraint;		/* Constraint for matching this position. */
230} position;
231
232/* Sets of positions are stored as arrays. */
233typedef struct
234{
235  position *elems;		/* Elements of this position set. */
236  int nelem;			/* Number of elements in this set. */
237} position_set;
238
239/* A state of the dfa consists of a set of positions, some flags,
240   and the token value of the lowest-numbered position of the state that
241   contains an END token. */
242typedef struct
243{
244  int hash;			/* Hash of the positions of this state. */
245  position_set elems;		/* Positions this state could match. */
246  char newline;			/* True if previous state matched newline. */
247  char letter;			/* True if previous state matched a letter. */
248  char backref;			/* True if this state matches a \<digit>. */
249  unsigned char constraint;	/* Constraint for this state to accept. */
250  int first_end;		/* Token value of the first END in elems. */
251#ifdef MBS_SUPPORT
252  position_set mbps;           /* Positions which can match multibyte
253                                  characters.  e.g. period.
254				  These staff are used only if
255				  MB_CUR_MAX > 1.  */
256#endif
257} dfa_state;
258
259/* Element of a list of strings, at least one of which is known to
260   appear in any R.E. matching the DFA. */
261struct dfamust
262{
263  int exact;
264  char *must;
265  struct dfamust *next;
266};
267
268#ifdef MBS_SUPPORT
269/* A bracket operator.
270   e.g. [a-c], [[:alpha:]], etc.  */
271struct mb_char_classes
272{
273  int invert;
274  wchar_t *chars;		/* Normal characters.  */
275  int nchars;
276  wctype_t *ch_classes;		/* Character classes.  */
277  int nch_classes;
278  wchar_t *range_sts;		/* Range characters (start of the range).  */
279  wchar_t *range_ends;		/* Range characters (end of the range).  */
280  int nranges;
281  char **equivs;		/* Equivalent classes.  */
282  int nequivs;
283  char **coll_elems;
284  int ncoll_elems;		/* Collating elements.  */
285};
286#endif
287
288/* A compiled regular expression. */
289struct dfa
290{
291  /* Stuff built by the scanner. */
292  charclass *charclasses;	/* Array of character sets for CSET tokens. */
293  int cindex;			/* Index for adding new charclasses. */
294  int calloc;			/* Number of charclasses currently allocated. */
295
296  /* Stuff built by the parser. */
297  token *tokens;		/* Postfix parse array. */
298  int tindex;			/* Index for adding new tokens. */
299  int talloc;			/* Number of tokens currently allocated. */
300  int depth;			/* Depth required of an evaluation stack
301				   used for depth-first traversal of the
302				   parse tree. */
303  int nleaves;			/* Number of leaves on the parse tree. */
304  int nregexps;			/* Count of parallel regexps being built
305				   with dfaparse(). */
306#ifdef MBS_SUPPORT
307  /* These stuff are used only if MB_CUR_MAX > 1 or multibyte environments.  */
308  int nmultibyte_prop;
309  int *multibyte_prop;
310  /* The value of multibyte_prop[i] is defined by following rule.
311       if tokens[i] < NOTCHAR
312         bit 1 : tokens[i] is a singlebyte character, or the last-byte of
313	         a multibyte character.
314	 bit 0 : tokens[i] is a singlebyte character, or the 1st-byte of
315	         a multibyte character.
316       if tokens[i] = MBCSET
317         ("the index of mbcsets correspnd to this operator" << 2) + 3
318
319     e.g.
320     tokens
321        = 'single_byte_a', 'multi_byte_A', single_byte_b'
322        = 'sb_a', 'mb_A(1st byte)', 'mb_A(2nd byte)', 'mb_A(3rd byte)', 'sb_b'
323     multibyte_prop
324        = 3     , 1               ,  0              ,  2              , 3
325  */
326
327  /* Array of the bracket expressoin in the DFA.  */
328  struct mb_char_classes *mbcsets;
329  int nmbcsets;
330  int mbcsets_alloc;
331#endif
332
333  /* Stuff owned by the state builder. */
334  dfa_state *states;		/* States of the dfa. */
335  int sindex;			/* Index for adding new states. */
336  int salloc;			/* Number of states currently allocated. */
337
338  /* Stuff built by the structure analyzer. */
339  position_set *follows;	/* Array of follow sets, indexed by position
340				   index.  The follow of a position is the set
341				   of positions containing characters that
342				   could conceivably follow a character
343				   matching the given position in a string
344				   matching the regexp.  Allocated to the
345				   maximum possible position index. */
346  int searchflag;		/* True if we are supposed to build a searching
347				   as opposed to an exact matcher.  A searching
348				   matcher finds the first and shortest string
349				   matching a regexp anywhere in the buffer,
350				   whereas an exact matcher finds the longest
351				   string matching, but anchored to the
352				   beginning of the buffer. */
353
354  /* Stuff owned by the executor. */
355  int tralloc;			/* Number of transition tables that have
356				   slots so far. */
357  int trcount;			/* Number of transition tables that have
358				   actually been built. */
359  int **trans;			/* Transition tables for states that can
360				   never accept.  If the transitions for a
361				   state have not yet been computed, or the
362				   state could possibly accept, its entry in
363				   this table is NULL. */
364  int **realtrans;		/* Trans always points to realtrans + 1; this
365				   is so trans[-1] can contain NULL. */
366  int **fails;			/* Transition tables after failing to accept
367				   on a state that potentially could do so. */
368  int *success;			/* Table of acceptance conditions used in
369				   dfaexec and computed in build_state. */
370  struct dfamust *musts;	/* List of strings, at least one of which
371				   is known to appear in any r.e. matching
372				   the dfa. */
373};
374
375/* Some macros for user access to dfa internals. */
376
377/* ACCEPTING returns true if s could possibly be an accepting state of r. */
378#define ACCEPTING(s, r) ((r).states[s].constraint)
379
380/* ACCEPTS_IN_CONTEXT returns true if the given state accepts in the
381   specified context. */
382#define ACCEPTS_IN_CONTEXT(prevn, currn, prevl, currl, state, dfa) \
383  SUCCEEDS_IN_CONTEXT((dfa).states[state].constraint,		   \
384		       prevn, currn, prevl, currl)
385
386/* FIRST_MATCHING_REGEXP returns the index number of the first of parallel
387   regexps that a given state could accept.  Parallel regexps are numbered
388   starting at 1. */
389#define FIRST_MATCHING_REGEXP(state, dfa) (-(dfa).states[state].first_end)
390
391/* Entry points. */
392
393/* dfasyntax() takes three arguments; the first sets the syntax bits described
394   earlier in this file, the second sets the case-folding flag, and the
395   third specifies the line terminator. */
396extern void dfasyntax PARAMS ((reg_syntax_t, int, unsigned char));
397
398/* Compile the given string of the given length into the given struct dfa.
399   Final argument is a flag specifying whether to build a searching or an
400   exact matcher. */
401extern void dfacomp PARAMS ((char const *, size_t, struct dfa *, int));
402
403/* Execute the given struct dfa on the buffer of characters.  The
404   last byte of the buffer must equal the end-of-line byte.
405   The final argument points to a flag that will
406   be set if further examination by a backtracking matcher is needed in
407   order to verify backreferencing; otherwise the flag will be cleared.
408   Returns (size_t) -1 if no match is found, or the offset of the first
409   character after the first & shortest matching string in the buffer. */
410extern size_t dfaexec PARAMS ((struct dfa *, char const *, size_t, int *,
411			       struct mb_cache *));
412
413/* Free the storage held by the components of a struct dfa. */
414extern void dfafree PARAMS ((struct dfa *));
415
416/* Entry points for people who know what they're doing. */
417
418/* Initialize the components of a struct dfa. */
419extern void dfainit PARAMS ((struct dfa *));
420
421/* Incrementally parse a string of given length into a struct dfa. */
422extern void dfaparse PARAMS ((char const *, size_t, struct dfa *));
423
424/* Analyze a parsed regexp; second argument tells whether to build a searching
425   or an exact matcher. */
426extern void dfaanalyze PARAMS ((struct dfa *, int));
427
428/* Compute, for each possible character, the transitions out of a given
429   state, storing them in an array of integers. */
430extern void dfastate PARAMS ((int, struct dfa *, int []));
431
432/* Error handling. */
433
434/* dfaerror() is called by the regexp routines whenever an error occurs.  It
435   takes a single argument, a NUL-terminated string describing the error.
436   The user must supply a dfaerror.  */
437extern void dfaerror PARAMS ((const char *));
438