dfa.h revision 55379
1/* dfa.h - declarations for GNU deterministic regexp compiler 2 Copyright (C) 1988, 1998 Free Software Foundation, Inc. 3 4 This program is free software; you can redistribute it and/or modify 5 it under the terms of the GNU General Public License as published by 6 the Free Software Foundation; either version 2, or (at your option) 7 any later version. 8 9 This program is distributed in the hope that it will be useful, 10 but WITHOUT ANY WARRANTY; without even the implied warranty of 11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 GNU General Public License for more details. 13 14 You should have received a copy of the GNU General Public License 15 along with this program; if not, write to the Free Software 16 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA */ 17 18/* Written June, 1988 by Mike Haertel */ 19 20/* $FreeBSD: head/gnu/usr.bin/grep/dfa.h 55379 2000-01-04 03:25:40Z obrien $ */ 21 22/* FIXME: 23 2. We should not export so much of the DFA internals. 24 In addition to clobbering modularity, we eat up valuable 25 name space. */ 26 27# undef PARAMS 28#if __STDC__ 29# ifndef _PTR_T 30# define _PTR_T 31 typedef void * ptr_t; 32# endif 33# define PARAMS(x) x 34#else 35# ifndef _PTR_T 36# define _PTR_T 37 typedef char * ptr_t; 38# endif 39# define PARAMS(x) () 40#endif 41 42/* Number of bits in an unsigned char. */ 43#ifndef CHARBITS 44#define CHARBITS 8 45#endif 46 47/* First integer value that is greater than any character code. */ 48#define NOTCHAR (1 << CHARBITS) 49 50/* INTBITS need not be exact, just a lower bound. */ 51#ifndef INTBITS 52#define INTBITS (CHARBITS * sizeof (int)) 53#endif 54 55/* Number of ints required to hold a bit for every character. */ 56#define CHARCLASS_INTS ((NOTCHAR + INTBITS - 1) / INTBITS) 57 58/* Sets of unsigned characters are stored as bit vectors in arrays of ints. */ 59typedef int charclass[CHARCLASS_INTS]; 60 61/* The regexp is parsed into an array of tokens in postfix form. Some tokens 62 are operators and others are terminal symbols. Most (but not all) of these 63 codes are returned by the lexical analyzer. */ 64 65typedef enum 66{ 67 END = -1, /* END is a terminal symbol that matches the 68 end of input; any value of END or less in 69 the parse tree is such a symbol. Accepting 70 states of the DFA are those that would have 71 a transition on END. */ 72 73 /* Ordinary character values are terminal symbols that match themselves. */ 74 75 EMPTY = NOTCHAR, /* EMPTY is a terminal symbol that matches 76 the empty string. */ 77 78 BACKREF, /* BACKREF is generated by \<digit>; it 79 it not completely handled. If the scanner 80 detects a transition on backref, it returns 81 a kind of "semi-success" indicating that 82 the match will have to be verified with 83 a backtracking matcher. */ 84 85 BEGLINE, /* BEGLINE is a terminal symbol that matches 86 the empty string if it is at the beginning 87 of a line. */ 88 89 ENDLINE, /* ENDLINE is a terminal symbol that matches 90 the empty string if it is at the end of 91 a line. */ 92 93 BEGWORD, /* BEGWORD is a terminal symbol that matches 94 the empty string if it is at the beginning 95 of a word. */ 96 97 ENDWORD, /* ENDWORD is a terminal symbol that matches 98 the empty string if it is at the end of 99 a word. */ 100 101 LIMWORD, /* LIMWORD is a terminal symbol that matches 102 the empty string if it is at the beginning 103 or the end of a word. */ 104 105 NOTLIMWORD, /* NOTLIMWORD is a terminal symbol that 106 matches the empty string if it is not at 107 the beginning or end of a word. */ 108 109 QMARK, /* QMARK is an operator of one argument that 110 matches zero or one occurences of its 111 argument. */ 112 113 STAR, /* STAR is an operator of one argument that 114 matches the Kleene closure (zero or more 115 occurrences) of its argument. */ 116 117 PLUS, /* PLUS is an operator of one argument that 118 matches the positive closure (one or more 119 occurrences) of its argument. */ 120 121 REPMN, /* REPMN is a lexical token corresponding 122 to the {m,n} construct. REPMN never 123 appears in the compiled token vector. */ 124 125 CAT, /* CAT is an operator of two arguments that 126 matches the concatenation of its 127 arguments. CAT is never returned by the 128 lexical analyzer. */ 129 130 OR, /* OR is an operator of two arguments that 131 matches either of its arguments. */ 132 133 ORTOP, /* OR at the toplevel in the parse tree. 134 This is used for a boyer-moore heuristic. */ 135 136 LPAREN, /* LPAREN never appears in the parse tree, 137 it is only a lexeme. */ 138 139 RPAREN, /* RPAREN never appears in the parse tree. */ 140 141 CSET /* CSET and (and any value greater) is a 142 terminal symbol that matches any of a 143 class of characters. */ 144} token; 145 146/* Sets are stored in an array in the compiled dfa; the index of the 147 array corresponding to a given set token is given by SET_INDEX(t). */ 148#define SET_INDEX(t) ((t) - CSET) 149 150/* Sometimes characters can only be matched depending on the surrounding 151 context. Such context decisions depend on what the previous character 152 was, and the value of the current (lookahead) character. Context 153 dependent constraints are encoded as 8 bit integers. Each bit that 154 is set indicates that the constraint succeeds in the corresponding 155 context. 156 157 bit 7 - previous and current are newlines 158 bit 6 - previous was newline, current isn't 159 bit 5 - previous wasn't newline, current is 160 bit 4 - neither previous nor current is a newline 161 bit 3 - previous and current are word-constituents 162 bit 2 - previous was word-constituent, current isn't 163 bit 1 - previous wasn't word-constituent, current is 164 bit 0 - neither previous nor current is word-constituent 165 166 Word-constituent characters are those that satisfy isalnum(). 167 168 The macro SUCCEEDS_IN_CONTEXT determines whether a a given constraint 169 succeeds in a particular context. Prevn is true if the previous character 170 was a newline, currn is true if the lookahead character is a newline. 171 Prevl and currl similarly depend upon whether the previous and current 172 characters are word-constituent letters. */ 173#define MATCHES_NEWLINE_CONTEXT(constraint, prevn, currn) \ 174 ((constraint) & 1 << (((prevn) ? 2 : 0) + ((currn) ? 1 : 0) + 4)) 175#define MATCHES_LETTER_CONTEXT(constraint, prevl, currl) \ 176 ((constraint) & 1 << (((prevl) ? 2 : 0) + ((currl) ? 1 : 0))) 177#define SUCCEEDS_IN_CONTEXT(constraint, prevn, currn, prevl, currl) \ 178 (MATCHES_NEWLINE_CONTEXT(constraint, prevn, currn) \ 179 && MATCHES_LETTER_CONTEXT(constraint, prevl, currl)) 180 181/* The following macros give information about what a constraint depends on. */ 182#define PREV_NEWLINE_DEPENDENT(constraint) \ 183 (((constraint) & 0xc0) >> 2 != ((constraint) & 0x30)) 184#define PREV_LETTER_DEPENDENT(constraint) \ 185 (((constraint) & 0x0c) >> 2 != ((constraint) & 0x03)) 186 187/* Tokens that match the empty string subject to some constraint actually 188 work by applying that constraint to determine what may follow them, 189 taking into account what has gone before. The following values are 190 the constraints corresponding to the special tokens previously defined. */ 191#define NO_CONSTRAINT 0xff 192#define BEGLINE_CONSTRAINT 0xcf 193#define ENDLINE_CONSTRAINT 0xaf 194#define BEGWORD_CONSTRAINT 0xf2 195#define ENDWORD_CONSTRAINT 0xf4 196#define LIMWORD_CONSTRAINT 0xf6 197#define NOTLIMWORD_CONSTRAINT 0xf9 198 199/* States of the recognizer correspond to sets of positions in the parse 200 tree, together with the constraints under which they may be matched. 201 So a position is encoded as an index into the parse tree together with 202 a constraint. */ 203typedef struct 204{ 205 unsigned index; /* Index into the parse array. */ 206 unsigned constraint; /* Constraint for matching this position. */ 207} position; 208 209/* Sets of positions are stored as arrays. */ 210typedef struct 211{ 212 position *elems; /* Elements of this position set. */ 213 int nelem; /* Number of elements in this set. */ 214} position_set; 215 216/* A state of the dfa consists of a set of positions, some flags, 217 and the token value of the lowest-numbered position of the state that 218 contains an END token. */ 219typedef struct 220{ 221 int hash; /* Hash of the positions of this state. */ 222 position_set elems; /* Positions this state could match. */ 223 char newline; /* True if previous state matched newline. */ 224 char letter; /* True if previous state matched a letter. */ 225 char backref; /* True if this state matches a \<digit>. */ 226 unsigned char constraint; /* Constraint for this state to accept. */ 227 int first_end; /* Token value of the first END in elems. */ 228} dfa_state; 229 230/* Element of a list of strings, at least one of which is known to 231 appear in any R.E. matching the DFA. */ 232struct dfamust 233{ 234 int exact; 235 char *must; 236 struct dfamust *next; 237}; 238 239/* A compiled regular expression. */ 240struct dfa 241{ 242 /* Stuff built by the scanner. */ 243 charclass *charclasses; /* Array of character sets for CSET tokens. */ 244 int cindex; /* Index for adding new charclasses. */ 245 int calloc; /* Number of charclasses currently allocated. */ 246 247 /* Stuff built by the parser. */ 248 token *tokens; /* Postfix parse array. */ 249 int tindex; /* Index for adding new tokens. */ 250 int talloc; /* Number of tokens currently allocated. */ 251 int depth; /* Depth required of an evaluation stack 252 used for depth-first traversal of the 253 parse tree. */ 254 int nleaves; /* Number of leaves on the parse tree. */ 255 int nregexps; /* Count of parallel regexps being built 256 with dfaparse(). */ 257 258 /* Stuff owned by the state builder. */ 259 dfa_state *states; /* States of the dfa. */ 260 int sindex; /* Index for adding new states. */ 261 int salloc; /* Number of states currently allocated. */ 262 263 /* Stuff built by the structure analyzer. */ 264 position_set *follows; /* Array of follow sets, indexed by position 265 index. The follow of a position is the set 266 of positions containing characters that 267 could conceivably follow a character 268 matching the given position in a string 269 matching the regexp. Allocated to the 270 maximum possible position index. */ 271 int searchflag; /* True if we are supposed to build a searching 272 as opposed to an exact matcher. A searching 273 matcher finds the first and shortest string 274 matching a regexp anywhere in the buffer, 275 whereas an exact matcher finds the longest 276 string matching, but anchored to the 277 beginning of the buffer. */ 278 279 /* Stuff owned by the executor. */ 280 int tralloc; /* Number of transition tables that have 281 slots so far. */ 282 int trcount; /* Number of transition tables that have 283 actually been built. */ 284 int **trans; /* Transition tables for states that can 285 never accept. If the transitions for a 286 state have not yet been computed, or the 287 state could possibly accept, its entry in 288 this table is NULL. */ 289 int **realtrans; /* Trans always points to realtrans + 1; this 290 is so trans[-1] can contain NULL. */ 291 int **fails; /* Transition tables after failing to accept 292 on a state that potentially could do so. */ 293 int *success; /* Table of acceptance conditions used in 294 dfaexec and computed in build_state. */ 295 int *newlines; /* Transitions on newlines. The entry for a 296 newline in any transition table is always 297 -1 so we can count lines without wasting 298 too many cycles. The transition for a 299 newline is stored separately and handled 300 as a special case. Newline is also used 301 as a sentinel at the end of the buffer. */ 302 struct dfamust *musts; /* List of strings, at least one of which 303 is known to appear in any r.e. matching 304 the dfa. */ 305}; 306 307/* Some macros for user access to dfa internals. */ 308 309/* ACCEPTING returns true if s could possibly be an accepting state of r. */ 310#define ACCEPTING(s, r) ((r).states[s].constraint) 311 312/* ACCEPTS_IN_CONTEXT returns true if the given state accepts in the 313 specified context. */ 314#define ACCEPTS_IN_CONTEXT(prevn, currn, prevl, currl, state, dfa) \ 315 SUCCEEDS_IN_CONTEXT((dfa).states[state].constraint, \ 316 prevn, currn, prevl, currl) 317 318/* FIRST_MATCHING_REGEXP returns the index number of the first of parallel 319 regexps that a given state could accept. Parallel regexps are numbered 320 starting at 1. */ 321#define FIRST_MATCHING_REGEXP(state, dfa) (-(dfa).states[state].first_end) 322 323/* Entry points. */ 324 325/* dfasyntax() takes three arguments; the first sets the syntax bits described 326 earlier in this file, the second sets the case-folding flag, and the 327 third specifies the line terminator. */ 328extern void dfasyntax PARAMS ((reg_syntax_t, int, int)); 329 330/* Compile the given string of the given length into the given struct dfa. 331 Final argument is a flag specifying whether to build a searching or an 332 exact matcher. */ 333extern void dfacomp PARAMS ((char *, size_t, struct dfa *, int)); 334 335/* Execute the given struct dfa on the buffer of characters. The 336 first char * points to the beginning, and the second points to the 337 first character after the end of the buffer, which must be a writable 338 place so a sentinel end-of-buffer marker can be stored there. The 339 second-to-last argument is a flag telling whether to allow newlines to 340 be part of a string matching the regexp. The next-to-last argument, 341 if non-NULL, points to a place to increment every time we see a 342 newline. The final argument, if non-NULL, points to a flag that will 343 be set if further examination by a backtracking matcher is needed in 344 order to verify backreferencing; otherwise the flag will be cleared. 345 Returns NULL if no match is found, or a pointer to the first 346 character after the first & shortest matching string in the buffer. */ 347extern char *dfaexec PARAMS ((struct dfa *, char *, char *, int, int *, int *)); 348 349/* Free the storage held by the components of a struct dfa. */ 350extern void dfafree PARAMS ((struct dfa *)); 351 352/* Entry points for people who know what they're doing. */ 353 354/* Initialize the components of a struct dfa. */ 355extern void dfainit PARAMS ((struct dfa *)); 356 357/* Incrementally parse a string of given length into a struct dfa. */ 358extern void dfaparse PARAMS ((char *, size_t, struct dfa *)); 359 360/* Analyze a parsed regexp; second argument tells whether to build a searching 361 or an exact matcher. */ 362extern void dfaanalyze PARAMS ((struct dfa *, int)); 363 364/* Compute, for each possible character, the transitions out of a given 365 state, storing them in an array of integers. */ 366extern void dfastate PARAMS ((int, struct dfa *, int [])); 367 368/* Error handling. */ 369 370/* dfaerror() is called by the regexp routines whenever an error occurs. It 371 takes a single argument, a NUL-terminated string describing the error. 372 The default dfaerror() prints the error message to stderr and exits. 373 The user can provide a different dfafree() if so desired. */ 374extern void dfaerror PARAMS ((const char *)); 375