dfa.h revision 53451
1/* dfa.h - declarations for GNU deterministic regexp compiler 2 Copyright (C) 1988 Free Software Foundation, Inc. 3 4 This program is free software; you can redistribute it and/or modify 5 it under the terms of the GNU General Public License as published by 6 the Free Software Foundation; either version 2, or (at your option) 7 any later version. 8 9 This program is distributed in the hope that it will be useful, 10 but WITHOUT ANY WARRANTY; without even the implied warranty of 11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 GNU General Public License for more details. 13 14 You should have received a copy of the GNU General Public License 15 along with this program; if not, write to the Free Software 16 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ 17 18/* Written June, 1988 by Mike Haertel */ 19 20/* FIXME: 21 2. We should not export so much of the DFA internals. 22 In addition to clobbering modularity, we eat up valuable 23 name space. */ 24 25/* Number of bits in an unsigned char. */ 26#define CHARBITS 8 27 28/* First integer value that is greater than any character code. */ 29#define NOTCHAR (1 << CHARBITS) 30 31/* INTBITS need not be exact, just a lower bound. */ 32#define INTBITS (CHARBITS * sizeof (int)) 33 34/* Number of ints required to hold a bit for every character. */ 35#define CHARCLASS_INTS ((NOTCHAR + INTBITS - 1) / INTBITS) 36 37/* Sets of unsigned characters are stored as bit vectors in arrays of ints. */ 38typedef int charclass[CHARCLASS_INTS]; 39 40/* The regexp is parsed into an array of tokens in postfix form. Some tokens 41 are operators and others are terminal symbols. Most (but not all) of these 42 codes are returned by the lexical analyzer. */ 43 44typedef enum 45{ 46 END = -1, /* END is a terminal symbol that matches the 47 end of input; any value of END or less in 48 the parse tree is such a symbol. Accepting 49 states of the DFA are those that would have 50 a transition on END. */ 51 52 /* Ordinary character values are terminal symbols that match themselves. */ 53 54 EMPTY = NOTCHAR, /* EMPTY is a terminal symbol that matches 55 the empty string. */ 56 57 BACKREF, /* BACKREF is generated by \<digit>; it 58 it not completely handled. If the scanner 59 detects a transition on backref, it returns 60 a kind of "semi-success" indicating that 61 the match will have to be verified with 62 a backtracking matcher. */ 63 64 BEGLINE, /* BEGLINE is a terminal symbol that matches 65 the empty string if it is at the beginning 66 of a line. */ 67 68 ENDLINE, /* ENDLINE is a terminal symbol that matches 69 the empty string if it is at the end of 70 a line. */ 71 72 BEGWORD, /* BEGWORD is a terminal symbol that matches 73 the empty string if it is at the beginning 74 of a word. */ 75 76 ENDWORD, /* ENDWORD is a terminal symbol that matches 77 the empty string if it is at the end of 78 a word. */ 79 80 LIMWORD, /* LIMWORD is a terminal symbol that matches 81 the empty string if it is at the beginning 82 or the end of a word. */ 83 84 NOTLIMWORD, /* NOTLIMWORD is a terminal symbol that 85 matches the empty string if it is not at 86 the beginning or end of a word. */ 87 88 QMARK, /* QMARK is an operator of one argument that 89 matches zero or one occurences of its 90 argument. */ 91 92 STAR, /* STAR is an operator of one argument that 93 matches the Kleene closure (zero or more 94 occurrences) of its argument. */ 95 96 PLUS, /* PLUS is an operator of one argument that 97 matches the positive closure (one or more 98 occurrences) of its argument. */ 99 100 REPMN, /* REPMN is a lexical token corresponding 101 to the {m,n} construct. REPMN never 102 appears in the compiled token vector. */ 103 104 CAT, /* CAT is an operator of two arguments that 105 matches the concatenation of its 106 arguments. CAT is never returned by the 107 lexical analyzer. */ 108 109 OR, /* OR is an operator of two arguments that 110 matches either of its arguments. */ 111 112 ORTOP, /* OR at the toplevel in the parse tree. 113 This is used for a boyer-moore heuristic. */ 114 115 LPAREN, /* LPAREN never appears in the parse tree, 116 it is only a lexeme. */ 117 118 RPAREN, /* RPAREN never appears in the parse tree. */ 119 120 CSET /* CSET and (and any value greater) is a 121 terminal symbol that matches any of a 122 class of characters. */ 123} token; 124 125/* Sets are stored in an array in the compiled dfa; the index of the 126 array corresponding to a given set token is given by SET_INDEX(t). */ 127#define SET_INDEX(t) ((t) - CSET) 128 129/* Sometimes characters can only be matched depending on the surrounding 130 context. Such context decisions depend on what the previous character 131 was, and the value of the current (lookahead) character. Context 132 dependent constraints are encoded as 8 bit integers. Each bit that 133 is set indicates that the constraint succeeds in the corresponding 134 context. 135 136 bit 7 - previous and current are newlines 137 bit 6 - previous was newline, current isn't 138 bit 5 - previous wasn't newline, current is 139 bit 4 - neither previous nor current is a newline 140 bit 3 - previous and current are word-constituents 141 bit 2 - previous was word-constituent, current isn't 142 bit 1 - previous wasn't word-constituent, current is 143 bit 0 - neither previous nor current is word-constituent 144 145 Word-constituent characters are those that satisfy isalnum(). 146 147 The macro SUCCEEDS_IN_CONTEXT determines whether a a given constraint 148 succeeds in a particular context. Prevn is true if the previous character 149 was a newline, currn is true if the lookahead character is a newline. 150 Prevl and currl similarly depend upon whether the previous and current 151 characters are word-constituent letters. */ 152#define MATCHES_NEWLINE_CONTEXT(constraint, prevn, currn) \ 153 ((constraint) & 1 << (((prevn) ? 2 : 0) + ((currn) ? 1 : 0) + 4)) 154#define MATCHES_LETTER_CONTEXT(constraint, prevl, currl) \ 155 ((constraint) & 1 << (((prevl) ? 2 : 0) + ((currl) ? 1 : 0))) 156#define SUCCEEDS_IN_CONTEXT(constraint, prevn, currn, prevl, currl) \ 157 (MATCHES_NEWLINE_CONTEXT(constraint, prevn, currn) \ 158 && MATCHES_LETTER_CONTEXT(constraint, prevl, currl)) 159 160/* The following macros give information about what a constraint depends on. */ 161#define PREV_NEWLINE_DEPENDENT(constraint) \ 162 (((constraint) & 0xc0) >> 2 != ((constraint) & 0x30)) 163#define PREV_LETTER_DEPENDENT(constraint) \ 164 (((constraint) & 0x0c) >> 2 != ((constraint) & 0x03)) 165 166/* Tokens that match the empty string subject to some constraint actually 167 work by applying that constraint to determine what may follow them, 168 taking into account what has gone before. The following values are 169 the constraints corresponding to the special tokens previously defined. */ 170#define NO_CONSTRAINT 0xff 171#define BEGLINE_CONSTRAINT 0xcf 172#define ENDLINE_CONSTRAINT 0xaf 173#define BEGWORD_CONSTRAINT 0xf2 174#define ENDWORD_CONSTRAINT 0xf4 175#define LIMWORD_CONSTRAINT 0xf6 176#define NOTLIMWORD_CONSTRAINT 0xf9 177 178/* States of the recognizer correspond to sets of positions in the parse 179 tree, together with the constraints under which they may be matched. 180 So a position is encoded as an index into the parse tree together with 181 a constraint. */ 182typedef struct 183{ 184 unsigned index; /* Index into the parse array. */ 185 unsigned constraint; /* Constraint for matching this position. */ 186} position; 187 188/* Sets of positions are stored as arrays. */ 189typedef struct 190{ 191 position *elems; /* Elements of this position set. */ 192 int nelem; /* Number of elements in this set. */ 193} position_set; 194 195/* A state of the dfa consists of a set of positions, some flags, 196 and the token value of the lowest-numbered position of the state that 197 contains an END token. */ 198typedef struct 199{ 200 int hash; /* Hash of the positions of this state. */ 201 position_set elems; /* Positions this state could match. */ 202 char newline; /* True if previous state matched newline. */ 203 char letter; /* True if previous state matched a letter. */ 204 char backref; /* True if this state matches a \<digit>. */ 205 unsigned char constraint; /* Constraint for this state to accept. */ 206 int first_end; /* Token value of the first END in elems. */ 207} dfa_state; 208 209/* Element of a list of strings, at least one of which is known to 210 appear in any R.E. matching the DFA. */ 211struct dfamust 212{ 213 int exact; 214 char *must; 215 struct dfamust *next; 216}; 217 218/* A compiled regular expression. */ 219struct dfa 220{ 221 /* Stuff built by the scanner. */ 222 charclass *charclasses; /* Array of character sets for CSET tokens. */ 223 int cindex; /* Index for adding new charclasses. */ 224 int calloc; /* Number of charclasses currently allocated. */ 225 226 /* Stuff built by the parser. */ 227 token *tokens; /* Postfix parse array. */ 228 int tindex; /* Index for adding new tokens. */ 229 int talloc; /* Number of tokens currently allocated. */ 230 int depth; /* Depth required of an evaluation stack 231 used for depth-first traversal of the 232 parse tree. */ 233 int nleaves; /* Number of leaves on the parse tree. */ 234 int nregexps; /* Count of parallel regexps being built 235 with dfaparse(). */ 236 237 /* Stuff owned by the state builder. */ 238 dfa_state *states; /* States of the dfa. */ 239 int sindex; /* Index for adding new states. */ 240 int salloc; /* Number of states currently allocated. */ 241 242 /* Stuff built by the structure analyzer. */ 243 position_set *follows; /* Array of follow sets, indexed by position 244 index. The follow of a position is the set 245 of positions containing characters that 246 could conceivably follow a character 247 matching the given position in a string 248 matching the regexp. Allocated to the 249 maximum possible position index. */ 250 int searchflag; /* True if we are supposed to build a searching 251 as opposed to an exact matcher. A searching 252 matcher finds the first and shortest string 253 matching a regexp anywhere in the buffer, 254 whereas an exact matcher finds the longest 255 string matching, but anchored to the 256 beginning of the buffer. */ 257 258 /* Stuff owned by the executor. */ 259 int tralloc; /* Number of transition tables that have 260 slots so far. */ 261 int trcount; /* Number of transition tables that have 262 actually been built. */ 263 int **trans; /* Transition tables for states that can 264 never accept. If the transitions for a 265 state have not yet been computed, or the 266 state could possibly accept, its entry in 267 this table is NULL. */ 268 int **realtrans; /* Trans always points to realtrans + 1; this 269 is so trans[-1] can contain NULL. */ 270 int **fails; /* Transition tables after failing to accept 271 on a state that potentially could do so. */ 272 int *success; /* Table of acceptance conditions used in 273 dfaexec and computed in build_state. */ 274 int *newlines; /* Transitions on newlines. The entry for a 275 newline in any transition table is always 276 -1 so we can count lines without wasting 277 too many cycles. The transition for a 278 newline is stored separately and handled 279 as a special case. Newline is also used 280 as a sentinel at the end of the buffer. */ 281 struct dfamust *musts; /* List of strings, at least one of which 282 is known to appear in any r.e. matching 283 the dfa. */ 284}; 285 286/* Some macros for user access to dfa internals. */ 287 288/* ACCEPTING returns true if s could possibly be an accepting state of r. */ 289#define ACCEPTING(s, r) ((r).states[s].constraint) 290 291/* ACCEPTS_IN_CONTEXT returns true if the given state accepts in the 292 specified context. */ 293#define ACCEPTS_IN_CONTEXT(prevn, currn, prevl, currl, state, dfa) \ 294 SUCCEEDS_IN_CONTEXT((dfa).states[state].constraint, \ 295 prevn, currn, prevl, currl) 296 297/* FIRST_MATCHING_REGEXP returns the index number of the first of parallel 298 regexps that a given state could accept. Parallel regexps are numbered 299 starting at 1. */ 300#define FIRST_MATCHING_REGEXP(state, dfa) (-(dfa).states[state].first_end) 301 302/* Entry points. */ 303 304#if __STDC__ 305 306/* dfasyntax() takes two arguments; the first sets the syntax bits described 307 earlier in this file, and the second sets the case-folding flag. */ 308extern void dfasyntax(int, int); 309 310/* Compile the given string of the given length into the given struct dfa. 311 Final argument is a flag specifying whether to build a searching or an 312 exact matcher. */ 313extern void dfacomp(char *, size_t, struct dfa *, int); 314 315/* Execute the given struct dfa on the buffer of characters. The 316 first char * points to the beginning, and the second points to the 317 first character after the end of the buffer, which must be a writable 318 place so a sentinel end-of-buffer marker can be stored there. The 319 second-to-last argument is a flag telling whether to allow newlines to 320 be part of a string matching the regexp. The next-to-last argument, 321 if non-NULL, points to a place to increment every time we see a 322 newline. The final argument, if non-NULL, points to a flag that will 323 be set if further examination by a backtracking matcher is needed in 324 order to verify backreferencing; otherwise the flag will be cleared. 325 Returns NULL if no match is found, or a pointer to the first 326 character after the first & shortest matching string in the buffer. */ 327extern char *dfaexec(struct dfa *, char *, char *, int, int *, int *); 328 329/* Free the storage held by the components of a struct dfa. */ 330extern void dfafree(struct dfa *); 331 332/* Entry points for people who know what they're doing. */ 333 334/* Initialize the components of a struct dfa. */ 335extern void dfainit(struct dfa *); 336 337/* Incrementally parse a string of given length into a struct dfa. */ 338extern void dfaparse(char *, size_t, struct dfa *); 339 340/* Analyze a parsed regexp; second argument tells whether to build a searching 341 or an exact matcher. */ 342extern void dfaanalyze(struct dfa *, int); 343 344/* Compute, for each possible character, the transitions out of a given 345 state, storing them in an array of integers. */ 346extern void dfastate(int, struct dfa *, int []); 347 348/* Error handling. */ 349 350/* dfaerror() is called by the regexp routines whenever an error occurs. It 351 takes a single argument, a NUL-terminated string describing the error. 352 The default dfaerror() prints the error message to stderr and exits. 353 The user can provide a different dfafree() if so desired. */ 354extern void dfaerror(char *); 355 356#else /* ! __STDC__ */ 357extern void dfasyntax(), dfacomp(), dfafree(), dfainit(), dfaparse(); 358extern void dfaanalyze(), dfastate(), dfaerror(); 359extern char *dfaexec(); 360#endif /* ! __STDC__ */ 361