1/* nfa - NFA construction routines */ 2 3/*- 4 * Copyright (c) 1990 The Regents of the University of California. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * Vern Paxson. 9 * 10 * The United States Government has rights in this work pursuant 11 * to contract no. DE-AC03-76SF00098 between the United States 12 * Department of Energy and the University of California. 13 * 14 * Redistribution and use in source and binary forms are permitted provided 15 * that: (1) source distributions retain this entire copyright notice and 16 * comment, and (2) distributions including binaries display the following 17 * acknowledgement: ``This product includes software developed by the 18 * University of California, Berkeley and its contributors'' in the 19 * documentation or other materials provided with the distribution and in 20 * all advertising materials mentioning features or use of this software. 21 * Neither the name of the University nor the names of its contributors may 22 * be used to endorse or promote products derived from this software without 23 * specific prior written permission. 24 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED 25 * WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF 26 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. 27 */ 28 29/* $Header: /home/daffy/u0/vern/flex/RCS/nfa.c,v 2.17 95/03/04 16:11:42 vern Exp $ */ 30#include <sys/cdefs.h>
|
31__FBSDID("$FreeBSD: head/usr.bin/lex/nfa.c 99112 2002-06-30 05:25:07Z obrien $");
|
31__FBSDID("$FreeBSD: head/usr.bin/lex/nfa.c 179549 2008-06-04 19:50:34Z dwmalone $"); |
32 33#include "flexdef.h" 34 35 36/* declare functions that have forward references */ 37 38int dupmachine PROTO((int)); 39void mkxtion PROTO((int, int)); 40 41 42/* add_accept - add an accepting state to a machine 43 * 44 * accepting_number becomes mach's accepting number. 45 */ 46 47void add_accept( mach, accepting_number ) 48int mach, accepting_number; 49 { 50 /* Hang the accepting number off an epsilon state. if it is associated 51 * with a state that has a non-epsilon out-transition, then the state 52 * will accept BEFORE it makes that transition, i.e., one character 53 * too soon. 54 */ 55 56 if ( transchar[finalst[mach]] == SYM_EPSILON ) 57 accptnum[finalst[mach]] = accepting_number; 58 59 else 60 { 61 int astate = mkstate( SYM_EPSILON ); 62 accptnum[astate] = accepting_number; 63 (void) link_machines( mach, astate ); 64 } 65 } 66 67 68/* copysingl - make a given number of copies of a singleton machine 69 * 70 * synopsis 71 * 72 * newsng = copysingl( singl, num ); 73 * 74 * newsng - a new singleton composed of num copies of singl 75 * singl - a singleton machine 76 * num - the number of copies of singl to be present in newsng 77 */ 78 79int copysingl( singl, num ) 80int singl, num; 81 { 82 int copy, i; 83 84 copy = mkstate( SYM_EPSILON ); 85 86 for ( i = 1; i <= num; ++i ) 87 copy = link_machines( copy, dupmachine( singl ) ); 88 89 return copy; 90 } 91 92 93/* dumpnfa - debugging routine to write out an nfa */ 94 95void dumpnfa( state1 ) 96int state1; 97 98 { 99 int sym, tsp1, tsp2, anum, ns; 100 101 fprintf( stderr, 102 _( "\n\n********** beginning dump of nfa with start state %d\n" ), 103 state1 ); 104 105 /* We probably should loop starting at firstst[state1] and going to 106 * lastst[state1], but they're not maintained properly when we "or" 107 * all of the rules together. So we use our knowledge that the machine 108 * starts at state 1 and ends at lastnfa. 109 */ 110 111 /* for ( ns = firstst[state1]; ns <= lastst[state1]; ++ns ) */ 112 for ( ns = 1; ns <= lastnfa; ++ns ) 113 { 114 fprintf( stderr, _( "state # %4d\t" ), ns ); 115 116 sym = transchar[ns]; 117 tsp1 = trans1[ns]; 118 tsp2 = trans2[ns]; 119 anum = accptnum[ns]; 120 121 fprintf( stderr, "%3d: %4d, %4d", sym, tsp1, tsp2 ); 122 123 if ( anum != NIL ) 124 fprintf( stderr, " [%d]", anum ); 125 126 fprintf( stderr, "\n" ); 127 } 128 129 fprintf( stderr, _( "********** end of dump\n" ) ); 130 } 131 132 133/* dupmachine - make a duplicate of a given machine 134 * 135 * synopsis 136 * 137 * copy = dupmachine( mach ); 138 * 139 * copy - holds duplicate of mach 140 * mach - machine to be duplicated 141 * 142 * note that the copy of mach is NOT an exact duplicate; rather, all the 143 * transition states values are adjusted so that the copy is self-contained, 144 * as the original should have been. 145 * 146 * also note that the original MUST be contiguous, with its low and high 147 * states accessible by the arrays firstst and lastst 148 */ 149 150int dupmachine( mach ) 151int mach; 152 { 153 int i, init, state_offset; 154 int state = 0; 155 int last = lastst[mach]; 156 157 for ( i = firstst[mach]; i <= last; ++i ) 158 { 159 state = mkstate( transchar[i] ); 160 161 if ( trans1[i] != NO_TRANSITION ) 162 { 163 mkxtion( finalst[state], trans1[i] + state - i ); 164 165 if ( transchar[i] == SYM_EPSILON && 166 trans2[i] != NO_TRANSITION ) 167 mkxtion( finalst[state], 168 trans2[i] + state - i ); 169 } 170 171 accptnum[state] = accptnum[i]; 172 } 173 174 if ( state == 0 ) 175 flexfatal( _( "empty machine in dupmachine()" ) ); 176 177 state_offset = state - i + 1; 178 179 init = mach + state_offset; 180 firstst[init] = firstst[mach] + state_offset; 181 finalst[init] = finalst[mach] + state_offset; 182 lastst[init] = lastst[mach] + state_offset; 183 184 return init; 185 } 186 187 188/* finish_rule - finish up the processing for a rule 189 * 190 * An accepting number is added to the given machine. If variable_trail_rule 191 * is true then the rule has trailing context and both the head and trail 192 * are variable size. Otherwise if headcnt or trailcnt is non-zero then 193 * the machine recognizes a pattern with trailing context and headcnt is 194 * the number of characters in the matched part of the pattern, or zero 195 * if the matched part has variable length. trailcnt is the number of 196 * trailing context characters in the pattern, or zero if the trailing 197 * context has variable length. 198 */ 199 200void finish_rule( mach, variable_trail_rule, headcnt, trailcnt ) 201int mach, variable_trail_rule, headcnt, trailcnt; 202 { 203 char action_text[MAXLINE]; 204 205 add_accept( mach, num_rules ); 206 207 /* We did this in new_rule(), but it often gets the wrong 208 * number because we do it before we start parsing the current rule. 209 */ 210 rule_linenum[num_rules] = linenum; 211 212 /* If this is a continued action, then the line-number has already 213 * been updated, giving us the wrong number. 214 */ 215 if ( continued_action ) 216 --rule_linenum[num_rules]; 217 218 sprintf( action_text, "case %d:\n", num_rules ); 219 add_action( action_text ); 220 221 if ( variable_trail_rule ) 222 { 223 rule_type[num_rules] = RULE_VARIABLE; 224 225 if ( performance_report > 0 ) 226 fprintf( stderr, 227 _( "Variable trailing context rule at line %d\n" ), 228 rule_linenum[num_rules] ); 229 230 variable_trailing_context_rules = true; 231 } 232 233 else 234 { 235 rule_type[num_rules] = RULE_NORMAL; 236 237 if ( headcnt > 0 || trailcnt > 0 ) 238 { 239 /* Do trailing context magic to not match the trailing 240 * characters. 241 */ 242 char *scanner_cp = "yy_c_buf_p = yy_cp"; 243 char *scanner_bp = "yy_bp"; 244 245 add_action( 246 "*yy_cp = yy_hold_char; /* undo effects of setting up yytext */\n" ); 247 248 if ( headcnt > 0 ) 249 { 250 sprintf( action_text, "%s = %s + %d;\n", 251 scanner_cp, scanner_bp, headcnt ); 252 add_action( action_text ); 253 } 254 255 else 256 { 257 sprintf( action_text, "%s -= %d;\n", 258 scanner_cp, trailcnt ); 259 add_action( action_text ); 260 } 261 262 add_action( 263 "YY_DO_BEFORE_ACTION; /* set up yytext again */\n" ); 264 } 265 } 266 267 /* Okay, in the action code at this point yytext and yyleng have 268 * their proper final values for this rule, so here's the point 269 * to do any user action. But don't do it for continued actions, 270 * as that'll result in multiple YY_RULE_SETUP's. 271 */ 272 if ( ! continued_action ) 273 add_action( "YY_RULE_SETUP\n" ); 274 275 line_directive_out( (FILE *) 0, 1 ); 276 } 277 278 279/* link_machines - connect two machines together 280 * 281 * synopsis 282 * 283 * new = link_machines( first, last ); 284 * 285 * new - a machine constructed by connecting first to last 286 * first - the machine whose successor is to be last 287 * last - the machine whose predecessor is to be first 288 * 289 * note: this routine concatenates the machine first with the machine 290 * last to produce a machine new which will pattern-match first first 291 * and then last, and will fail if either of the sub-patterns fails. 292 * FIRST is set to new by the operation. last is unmolested. 293 */ 294 295int link_machines( first, last ) 296int first, last; 297 { 298 if ( first == NIL ) 299 return last; 300 301 else if ( last == NIL ) 302 return first; 303 304 else 305 { 306 mkxtion( finalst[first], last ); 307 finalst[first] = finalst[last]; 308 lastst[first] = MAX( lastst[first], lastst[last] ); 309 firstst[first] = MIN( firstst[first], firstst[last] ); 310 311 return first; 312 } 313 } 314 315 316/* mark_beginning_as_normal - mark each "beginning" state in a machine 317 * as being a "normal" (i.e., not trailing context- 318 * associated) states 319 * 320 * The "beginning" states are the epsilon closure of the first state 321 */ 322 323void mark_beginning_as_normal( mach )
|
324register int mach;
|
324int mach; |
325 { 326 switch ( state_type[mach] ) 327 { 328 case STATE_NORMAL: 329 /* Oh, we've already visited here. */ 330 return; 331 332 case STATE_TRAILING_CONTEXT: 333 state_type[mach] = STATE_NORMAL; 334 335 if ( transchar[mach] == SYM_EPSILON ) 336 { 337 if ( trans1[mach] != NO_TRANSITION ) 338 mark_beginning_as_normal( 339 trans1[mach] ); 340 341 if ( trans2[mach] != NO_TRANSITION ) 342 mark_beginning_as_normal( 343 trans2[mach] ); 344 } 345 break; 346 347 default: 348 flexerror( 349 _( "bad state type in mark_beginning_as_normal()" ) ); 350 break; 351 } 352 } 353 354 355/* mkbranch - make a machine that branches to two machines 356 * 357 * synopsis 358 * 359 * branch = mkbranch( first, second ); 360 * 361 * branch - a machine which matches either first's pattern or second's 362 * first, second - machines whose patterns are to be or'ed (the | operator) 363 * 364 * Note that first and second are NEITHER destroyed by the operation. Also, 365 * the resulting machine CANNOT be used with any other "mk" operation except 366 * more mkbranch's. Compare with mkor() 367 */ 368 369int mkbranch( first, second ) 370int first, second; 371 { 372 int eps; 373 374 if ( first == NO_TRANSITION ) 375 return second; 376 377 else if ( second == NO_TRANSITION ) 378 return first; 379 380 eps = mkstate( SYM_EPSILON ); 381 382 mkxtion( eps, first ); 383 mkxtion( eps, second ); 384 385 return eps; 386 } 387 388 389/* mkclos - convert a machine into a closure 390 * 391 * synopsis 392 * new = mkclos( state ); 393 * 394 * new - a new state which matches the closure of "state" 395 */ 396 397int mkclos( state ) 398int state; 399 { 400 return mkopt( mkposcl( state ) ); 401 } 402 403 404/* mkopt - make a machine optional 405 * 406 * synopsis 407 * 408 * new = mkopt( mach ); 409 * 410 * new - a machine which optionally matches whatever mach matched 411 * mach - the machine to make optional 412 * 413 * notes: 414 * 1. mach must be the last machine created 415 * 2. mach is destroyed by the call 416 */ 417 418int mkopt( mach ) 419int mach; 420 { 421 int eps; 422 423 if ( ! SUPER_FREE_EPSILON(finalst[mach]) ) 424 { 425 eps = mkstate( SYM_EPSILON ); 426 mach = link_machines( mach, eps ); 427 } 428 429 /* Can't skimp on the following if FREE_EPSILON(mach) is true because 430 * some state interior to "mach" might point back to the beginning 431 * for a closure. 432 */ 433 eps = mkstate( SYM_EPSILON ); 434 mach = link_machines( eps, mach ); 435 436 mkxtion( mach, finalst[mach] ); 437 438 return mach; 439 } 440 441 442/* mkor - make a machine that matches either one of two machines 443 * 444 * synopsis 445 * 446 * new = mkor( first, second ); 447 * 448 * new - a machine which matches either first's pattern or second's 449 * first, second - machines whose patterns are to be or'ed (the | operator) 450 * 451 * note that first and second are both destroyed by the operation 452 * the code is rather convoluted because an attempt is made to minimize 453 * the number of epsilon states needed 454 */ 455 456int mkor( first, second ) 457int first, second; 458 { 459 int eps, orend; 460 461 if ( first == NIL ) 462 return second; 463 464 else if ( second == NIL ) 465 return first; 466 467 else 468 { 469 /* See comment in mkopt() about why we can't use the first 470 * state of "first" or "second" if they satisfy "FREE_EPSILON". 471 */ 472 eps = mkstate( SYM_EPSILON ); 473 474 first = link_machines( eps, first ); 475 476 mkxtion( first, second ); 477 478 if ( SUPER_FREE_EPSILON(finalst[first]) && 479 accptnum[finalst[first]] == NIL ) 480 { 481 orend = finalst[first]; 482 mkxtion( finalst[second], orend ); 483 } 484 485 else if ( SUPER_FREE_EPSILON(finalst[second]) && 486 accptnum[finalst[second]] == NIL ) 487 { 488 orend = finalst[second]; 489 mkxtion( finalst[first], orend ); 490 } 491 492 else 493 { 494 eps = mkstate( SYM_EPSILON ); 495 496 first = link_machines( first, eps ); 497 orend = finalst[first]; 498 499 mkxtion( finalst[second], orend ); 500 } 501 } 502 503 finalst[first] = orend; 504 return first; 505 } 506 507 508/* mkposcl - convert a machine into a positive closure 509 * 510 * synopsis 511 * new = mkposcl( state ); 512 * 513 * new - a machine matching the positive closure of "state" 514 */ 515 516int mkposcl( state ) 517int state; 518 { 519 int eps; 520 521 if ( SUPER_FREE_EPSILON(finalst[state]) ) 522 { 523 mkxtion( finalst[state], state ); 524 return state; 525 } 526 527 else 528 { 529 eps = mkstate( SYM_EPSILON ); 530 mkxtion( eps, state ); 531 return link_machines( state, eps ); 532 } 533 } 534 535 536/* mkrep - make a replicated machine 537 * 538 * synopsis 539 * new = mkrep( mach, lb, ub ); 540 * 541 * new - a machine that matches whatever "mach" matched from "lb" 542 * number of times to "ub" number of times 543 * 544 * note 545 * if "ub" is INFINITY then "new" matches "lb" or more occurrences of "mach" 546 */ 547 548int mkrep( mach, lb, ub ) 549int mach, lb, ub; 550 { 551 int base_mach, tail, copy, i; 552 553 base_mach = copysingl( mach, lb - 1 ); 554 555 if ( ub == INFINITY ) 556 { 557 copy = dupmachine( mach ); 558 mach = link_machines( mach, 559 link_machines( base_mach, mkclos( copy ) ) ); 560 } 561 562 else 563 { 564 tail = mkstate( SYM_EPSILON ); 565 566 for ( i = lb; i < ub; ++i ) 567 { 568 copy = dupmachine( mach ); 569 tail = mkopt( link_machines( copy, tail ) ); 570 } 571 572 mach = link_machines( mach, link_machines( base_mach, tail ) ); 573 } 574 575 return mach; 576 } 577 578 579/* mkstate - create a state with a transition on a given symbol 580 * 581 * synopsis 582 * 583 * state = mkstate( sym ); 584 * 585 * state - a new state matching sym 586 * sym - the symbol the new state is to have an out-transition on 587 * 588 * note that this routine makes new states in ascending order through the 589 * state array (and increments LASTNFA accordingly). The routine DUPMACHINE 590 * relies on machines being made in ascending order and that they are 591 * CONTIGUOUS. Change it and you will have to rewrite DUPMACHINE (kludge 592 * that it admittedly is) 593 */ 594 595int mkstate( sym ) 596int sym; 597 { 598 if ( ++lastnfa >= current_mns ) 599 { 600 if ( (current_mns += MNS_INCREMENT) >= MAXIMUM_MNS ) 601 lerrif( 602 _( "input rules are too complicated (>= %d NFA states)" ), 603 current_mns ); 604 605 ++num_reallocs; 606 607 firstst = reallocate_integer_array( firstst, current_mns ); 608 lastst = reallocate_integer_array( lastst, current_mns ); 609 finalst = reallocate_integer_array( finalst, current_mns ); 610 transchar = reallocate_integer_array( transchar, current_mns ); 611 trans1 = reallocate_integer_array( trans1, current_mns ); 612 trans2 = reallocate_integer_array( trans2, current_mns ); 613 accptnum = reallocate_integer_array( accptnum, current_mns ); 614 assoc_rule = 615 reallocate_integer_array( assoc_rule, current_mns ); 616 state_type = 617 reallocate_integer_array( state_type, current_mns ); 618 } 619 620 firstst[lastnfa] = lastnfa; 621 finalst[lastnfa] = lastnfa; 622 lastst[lastnfa] = lastnfa; 623 transchar[lastnfa] = sym; 624 trans1[lastnfa] = NO_TRANSITION; 625 trans2[lastnfa] = NO_TRANSITION; 626 accptnum[lastnfa] = NIL; 627 assoc_rule[lastnfa] = num_rules; 628 state_type[lastnfa] = current_state_type; 629 630 /* Fix up equivalence classes base on this transition. Note that any 631 * character which has its own transition gets its own equivalence 632 * class. Thus only characters which are only in character classes 633 * have a chance at being in the same equivalence class. E.g. "a|b" 634 * puts 'a' and 'b' into two different equivalence classes. "[ab]" 635 * puts them in the same equivalence class (barring other differences 636 * elsewhere in the input). 637 */ 638 639 if ( sym < 0 ) 640 { 641 /* We don't have to update the equivalence classes since 642 * that was already done when the ccl was created for the 643 * first time. 644 */ 645 } 646 647 else if ( sym == SYM_EPSILON ) 648 ++numeps; 649 650 else 651 { 652 check_char( sym ); 653 654 if ( useecs ) 655 /* Map NUL's to csize. */ 656 mkechar( sym ? sym : csize, nextecm, ecgroup ); 657 } 658 659 return lastnfa; 660 } 661 662 663/* mkxtion - make a transition from one state to another 664 * 665 * synopsis 666 * 667 * mkxtion( statefrom, stateto ); 668 * 669 * statefrom - the state from which the transition is to be made 670 * stateto - the state to which the transition is to be made 671 */ 672 673void mkxtion( statefrom, stateto ) 674int statefrom, stateto; 675 { 676 if ( trans1[statefrom] == NO_TRANSITION ) 677 trans1[statefrom] = stateto; 678 679 else if ( (transchar[statefrom] != SYM_EPSILON) || 680 (trans2[statefrom] != NO_TRANSITION) ) 681 flexfatal( _( "found too many transitions in mkxtion()" ) ); 682 683 else 684 { /* second out-transition for an epsilon state */ 685 ++eps2; 686 trans2[statefrom] = stateto; 687 } 688 } 689 690/* new_rule - initialize for a new rule */ 691 692void new_rule() 693 { 694 if ( ++num_rules >= current_max_rules ) 695 { 696 ++num_reallocs; 697 current_max_rules += MAX_RULES_INCREMENT; 698 rule_type = reallocate_integer_array( rule_type, 699 current_max_rules ); 700 rule_linenum = reallocate_integer_array( rule_linenum, 701 current_max_rules ); 702 rule_useful = reallocate_integer_array( rule_useful, 703 current_max_rules ); 704 } 705 706 if ( num_rules > MAX_RULE ) 707 lerrif( _( "too many rules (> %d)!" ), MAX_RULE ); 708 709 rule_linenum[num_rules] = linenum; 710 rule_useful[num_rules] = false; 711 }
|