1 /* $OpenBSD: nfa.c,v 1.12 2024/11/09 18:03:44 op Exp $ */ 2 3 /* nfa - NFA construction routines */ 4 5 /* Copyright (c) 1990 The Regents of the University of California. */ 6 /* All rights reserved. */ 7 8 /* This code is derived from software contributed to Berkeley by */ 9 /* Vern Paxson. */ 10 11 /* The United States Government has rights in this work pursuant */ 12 /* to contract no. DE-AC03-76SF00098 between the United States */ 13 /* Department of Energy and the University of California. */ 14 15 /* This file is part of flex. */ 16 17 /* Redistribution and use in source and binary forms, with or without */ 18 /* modification, are permitted provided that the following conditions */ 19 /* are met: */ 20 21 /* 1. Redistributions of source code must retain the above copyright */ 22 /* notice, this list of conditions and the following disclaimer. */ 23 /* 2. Redistributions in binary form must reproduce the above copyright */ 24 /* notice, this list of conditions and the following disclaimer in the */ 25 /* documentation and/or other materials provided with the distribution. */ 26 27 /* Neither the name of the University nor the names of its contributors */ 28 /* may be used to endorse or promote products derived from this software */ 29 /* without specific prior written permission. */ 30 31 /* THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR */ 32 /* IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED */ 33 /* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR */ 34 /* PURPOSE. */ 35 36 #include "flexdef.h" 37 38 39 /* declare functions that have forward references */ 40 41 int dupmachine PROTO((int)); 42 void mkxtion PROTO((int, int)); 43 44 45 /* add_accept - add an accepting state to a machine 46 * 47 * accepting_number becomes mach's accepting number. 48 */ 49 50 void 51 add_accept(int mach, int accepting_number) 52 { 53 /* 54 * Hang the accepting number off an epsilon state. if it is 55 * associated with a state that has a non-epsilon out-transition, 56 * then the state will accept BEFORE it makes that transition, i.e., 57 * one character too soon. 58 */ 59 60 if (transchar[finalst[mach]] == SYM_EPSILON) 61 accptnum[finalst[mach]] = accepting_number; 62 63 else { 64 int astate = mkstate(SYM_EPSILON); 65 66 accptnum[astate] = accepting_number; 67 (void) link_machines(mach, astate); 68 } 69 } 70 71 72 /* copysingl - make a given number of copies of a singleton machine 73 * 74 * synopsis 75 * 76 * newsng = copysingl( singl, num ); 77 * 78 * newsng - a new singleton composed of num copies of singl 79 * singl - a singleton machine 80 * num - the number of copies of singl to be present in newsng 81 */ 82 83 int 84 copysingl(int singl, int num) 85 { 86 int copy, i; 87 88 copy = mkstate(SYM_EPSILON); 89 90 for (i = 1; i <= num; ++i) 91 copy = link_machines(copy, dupmachine(singl)); 92 93 return copy; 94 } 95 96 97 /* dumpnfa - debugging routine to write out an nfa */ 98 99 void 100 dumpnfa(int state1) 101 { 102 int sym, tsp1, tsp2, anum, ns; 103 104 fprintf(stderr, 105 _ 106 ("\n\n********** beginning dump of nfa with start state %d\n"), 107 state1); 108 109 /* 110 * We probably should loop starting at firstst[state1] and going to 111 * lastst[state1], but they're not maintained properly when we "or" 112 * all of the rules together. So we use our knowledge that the 113 * machine starts at state 1 and ends at lastnfa. 114 */ 115 116 /* for ( ns = firstst[state1]; ns <= lastst[state1]; ++ns ) */ 117 for (ns = 1; ns <= lastnfa; ++ns) { 118 fprintf(stderr, _("state # %4d\t"), ns); 119 120 sym = transchar[ns]; 121 tsp1 = trans1[ns]; 122 tsp2 = trans2[ns]; 123 anum = accptnum[ns]; 124 125 fprintf(stderr, "%3d: %4d, %4d", sym, tsp1, tsp2); 126 127 if (anum != NIL) 128 fprintf(stderr, " [%d]", anum); 129 130 fprintf(stderr, "\n"); 131 } 132 133 fprintf(stderr, _("********** end of dump\n")); 134 } 135 136 137 /* dupmachine - make a duplicate of a given machine 138 * 139 * synopsis 140 * 141 * copy = dupmachine( mach ); 142 * 143 * copy - holds duplicate of mach 144 * mach - machine to be duplicated 145 * 146 * note that the copy of mach is NOT an exact duplicate; rather, all the 147 * transition states values are adjusted so that the copy is self-contained, 148 * as the original should have been. 149 * 150 * also note that the original MUST be contiguous, with its low and high 151 * states accessible by the arrays firstst and lastst 152 */ 153 154 int 155 dupmachine(int mach) 156 { 157 int i, init, state_offset; 158 int state = 0; 159 int last = lastst[mach]; 160 161 for (i = firstst[mach]; i <= last; ++i) { 162 state = mkstate(transchar[i]); 163 164 if (trans1[i] != NO_TRANSITION) { 165 mkxtion(finalst[state], trans1[i] + state - i); 166 167 if (transchar[i] == SYM_EPSILON && 168 trans2[i] != NO_TRANSITION) 169 mkxtion(finalst[state], 170 trans2[i] + state - i); 171 } 172 accptnum[state] = accptnum[i]; 173 } 174 175 if (state == 0) 176 flexfatal(_("empty machine in dupmachine()")); 177 178 state_offset = state - i + 1; 179 180 init = mach + state_offset; 181 firstst[init] = firstst[mach] + state_offset; 182 finalst[init] = finalst[mach] + state_offset; 183 lastst[init] = lastst[mach] + state_offset; 184 185 return init; 186 } 187 188 189 /* finish_rule - finish up the processing for a rule 190 * 191 * An accepting number is added to the given machine. If variable_trail_rule 192 * is true then the rule has trailing context and both the head and trail 193 * are variable size. Otherwise if headcnt or trailcnt is non-zero then 194 * the machine recognizes a pattern with trailing context and headcnt is 195 * the number of characters in the matched part of the pattern, or zero 196 * if the matched part has variable length. trailcnt is the number of 197 * trailing context characters in the pattern, or zero if the trailing 198 * context has variable length. 199 */ 200 201 void 202 finish_rule(int mach, int variable_trail_rule, int headcnt, int trailcnt, 203 int pcont_act) 204 { 205 char action_text[MAXLINE]; 206 207 add_accept(mach, num_rules); 208 209 /* 210 * We did this in new_rule(), but it often gets the wrong number 211 * because we do it before we start parsing the current rule. 212 */ 213 rule_linenum[num_rules] = linenum; 214 215 /* 216 * If this is a continued action, then the line-number has already 217 * been updated, giving us the wrong number. 218 */ 219 if (continued_action) 220 --rule_linenum[num_rules]; 221 222 223 /* 224 * If the previous rule was continued action, then we inherit the 225 * previous newline flag, possibly overriding the current one. 226 */ 227 if (pcont_act && rule_has_nl[num_rules - 1]) 228 rule_has_nl[num_rules] = true; 229 230 snprintf(action_text, sizeof(action_text), "case %d:\n", num_rules); 231 add_action(action_text); 232 if (rule_has_nl[num_rules]) { 233 snprintf(action_text, sizeof(action_text), "/* rule %d can match eol */\n", 234 num_rules); 235 add_action(action_text); 236 } 237 if (variable_trail_rule) { 238 rule_type[num_rules] = RULE_VARIABLE; 239 240 if (performance_report > 0) 241 fprintf(stderr, 242 _ 243 ("Variable trailing context rule at line %d\n"), 244 rule_linenum[num_rules]); 245 246 variable_trailing_context_rules = true; 247 } else { 248 rule_type[num_rules] = RULE_NORMAL; 249 250 if (headcnt > 0 || trailcnt > 0) { 251 /* 252 * Do trailing context magic to not match the 253 * trailing characters. 254 */ 255 char *scanner_cp = "YY_G(yy_c_buf_p) = yy_cp"; 256 char *scanner_bp = "yy_bp"; 257 258 add_action 259 ("*yy_cp = YY_G(yy_hold_char); /* undo effects of setting up yytext */\n"); 260 261 if (headcnt > 0) { 262 if (rule_has_nl[num_rules]) { 263 snprintf(action_text, sizeof(action_text), 264 "YY_LINENO_REWIND_TO(%s + %d);\n", scanner_bp, headcnt); 265 add_action(action_text); 266 } 267 snprintf(action_text, sizeof(action_text), "%s = %s + %d;\n", 268 scanner_cp, scanner_bp, headcnt); 269 add_action(action_text); 270 } else { 271 if (rule_has_nl[num_rules]) { 272 snprintf(action_text, sizeof(action_text), 273 "YY_LINENO_REWIND_TO(yy_cp - %d);\n", trailcnt); 274 add_action(action_text); 275 } 276 snprintf(action_text, sizeof(action_text), "%s -= %d;\n", 277 scanner_cp, trailcnt); 278 add_action(action_text); 279 } 280 281 add_action 282 ("YY_DO_BEFORE_ACTION; /* set up yytext again */\n"); 283 } 284 } 285 286 /* 287 * Okay, in the action code at this point yytext and yyleng have 288 * their proper final values for this rule, so here's the point to do 289 * any user action. But don't do it for continued actions, as 290 * that'll result in multiple YY_RULE_SETUP's. 291 */ 292 if (!continued_action) 293 add_action("YY_RULE_SETUP\n"); 294 295 line_directive_out((FILE *) 0, 1); 296 } 297 298 299 /* link_machines - connect two machines together 300 * 301 * synopsis 302 * 303 * new = link_machines( first, last ); 304 * 305 * new - a machine constructed by connecting first to last 306 * first - the machine whose successor is to be last 307 * last - the machine whose predecessor is to be first 308 * 309 * note: this routine concatenates the machine first with the machine 310 * last to produce a machine new which will pattern-match first first 311 * and then last, and will fail if either of the sub-patterns fails. 312 * FIRST is set to new by the operation. last is unmolested. 313 */ 314 315 int 316 link_machines(int first, int last) 317 { 318 if (first == NIL) 319 return last; 320 321 else if (last == NIL) 322 return first; 323 324 else { 325 mkxtion(finalst[first], last); 326 finalst[first] = finalst[last]; 327 lastst[first] = MAX(lastst[first], lastst[last]); 328 firstst[first] = MIN(firstst[first], firstst[last]); 329 330 return first; 331 } 332 } 333 334 335 /* mark_beginning_as_normal - mark each "beginning" state in a machine 336 * as being a "normal" (i.e., not trailing context- 337 * associated) states 338 * 339 * The "beginning" states are the epsilon closure of the first state 340 */ 341 342 void 343 mark_beginning_as_normal(int mach) 344 { 345 switch (state_type[mach]) { 346 case STATE_NORMAL: 347 /* Oh, we've already visited here. */ 348 return; 349 350 case STATE_TRAILING_CONTEXT: 351 state_type[mach] = STATE_NORMAL; 352 353 if (transchar[mach] == SYM_EPSILON) { 354 if (trans1[mach] != NO_TRANSITION) 355 mark_beginning_as_normal(trans1[mach]); 356 357 if (trans2[mach] != NO_TRANSITION) 358 mark_beginning_as_normal(trans2[mach]); 359 } 360 break; 361 362 default: 363 flexerror(_ 364 ("bad state type in mark_beginning_as_normal()")); 365 break; 366 } 367 } 368 369 370 /* mkbranch - make a machine that branches to two machines 371 * 372 * synopsis 373 * 374 * branch = mkbranch( first, second ); 375 * 376 * branch - a machine which matches either first's pattern or second's 377 * first, second - machines whose patterns are to be or'ed (the | operator) 378 * 379 * Note that first and second are NEITHER destroyed by the operation. Also, 380 * the resulting machine CANNOT be used with any other "mk" operation except 381 * more mkbranch's. Compare with mkor() 382 */ 383 384 int 385 mkbranch(int first, int second) 386 { 387 int eps; 388 389 if (first == NO_TRANSITION) 390 return second; 391 392 else if (second == NO_TRANSITION) 393 return first; 394 395 eps = mkstate(SYM_EPSILON); 396 397 mkxtion(eps, first); 398 mkxtion(eps, second); 399 400 return eps; 401 } 402 403 404 /* mkclos - convert a machine into a closure 405 * 406 * synopsis 407 * new = mkclos( state ); 408 * 409 * new - a new state which matches the closure of "state" 410 */ 411 412 int 413 mkclos(int state) 414 { 415 return mkopt(mkposcl(state)); 416 } 417 418 419 /* mkopt - make a machine optional 420 * 421 * synopsis 422 * 423 * new = mkopt( mach ); 424 * 425 * new - a machine which optionally matches whatever mach matched 426 * mach - the machine to make optional 427 * 428 * notes: 429 * 1. mach must be the last machine created 430 * 2. mach is destroyed by the call 431 */ 432 433 int 434 mkopt(int mach) 435 { 436 int eps; 437 438 if (!SUPER_FREE_EPSILON(finalst[mach])) { 439 eps = mkstate(SYM_EPSILON); 440 mach = link_machines(mach, eps); 441 } 442 /* 443 * Can't skimp on the following if FREE_EPSILON(mach) is true because 444 * some state interior to "mach" might point back to the beginning 445 * for a closure. 446 */ 447 eps = mkstate(SYM_EPSILON); 448 mach = link_machines(eps, mach); 449 450 mkxtion(mach, finalst[mach]); 451 452 return mach; 453 } 454 455 456 /* mkor - make a machine that matches either one of two machines 457 * 458 * synopsis 459 * 460 * new = mkor( first, second ); 461 * 462 * new - a machine which matches either first's pattern or second's 463 * first, second - machines whose patterns are to be or'ed (the | operator) 464 * 465 * note that first and second are both destroyed by the operation 466 * the code is rather convoluted because an attempt is made to minimize 467 * the number of epsilon states needed 468 */ 469 470 int 471 mkor(int first, int second) 472 { 473 int eps, orend; 474 475 if (first == NIL) 476 return second; 477 478 else if (second == NIL) 479 return first; 480 481 else { 482 /* 483 * See comment in mkopt() about why we can't use the first 484 * state of "first" or "second" if they satisfy 485 * "FREE_EPSILON". 486 */ 487 eps = mkstate(SYM_EPSILON); 488 489 first = link_machines(eps, first); 490 491 mkxtion(first, second); 492 493 if (SUPER_FREE_EPSILON(finalst[first]) && 494 accptnum[finalst[first]] == NIL) { 495 orend = finalst[first]; 496 mkxtion(finalst[second], orend); 497 } else if (SUPER_FREE_EPSILON(finalst[second]) && 498 accptnum[finalst[second]] == NIL) { 499 orend = finalst[second]; 500 mkxtion(finalst[first], orend); 501 } else { 502 eps = mkstate(SYM_EPSILON); 503 504 first = link_machines(first, eps); 505 orend = finalst[first]; 506 507 mkxtion(finalst[second], orend); 508 } 509 } 510 511 finalst[first] = orend; 512 return first; 513 } 514 515 516 /* mkposcl - convert a machine into a positive closure 517 * 518 * synopsis 519 * new = mkposcl( state ); 520 * 521 * new - a machine matching the positive closure of "state" 522 */ 523 524 int 525 mkposcl(int state) 526 { 527 int eps; 528 529 if (SUPER_FREE_EPSILON(finalst[state])) { 530 mkxtion(finalst[state], state); 531 return state; 532 } else { 533 eps = mkstate(SYM_EPSILON); 534 mkxtion(eps, state); 535 return link_machines(state, eps); 536 } 537 } 538 539 540 /* mkrep - make a replicated machine 541 * 542 * synopsis 543 * new = mkrep( mach, lb, ub ); 544 * 545 * new - a machine that matches whatever "mach" matched from "lb" 546 * number of times to "ub" number of times 547 * 548 * note 549 * if "ub" is INFINITE_REPEAT then "new" matches "lb" or more occurrences of "mach" 550 */ 551 552 int 553 mkrep(int mach, int lb, int ub) 554 { 555 int base_mach, tail, copy, i; 556 557 base_mach = copysingl(mach, lb - 1); 558 559 if (ub == INFINITE_REPEAT) { 560 copy = dupmachine(mach); 561 mach = link_machines(mach, 562 link_machines(base_mach, 563 mkclos(copy))); 564 } else { 565 tail = mkstate(SYM_EPSILON); 566 567 for (i = lb; i < ub; ++i) { 568 copy = dupmachine(mach); 569 tail = mkopt(link_machines(copy, tail)); 570 } 571 572 mach = 573 link_machines(mach, 574 link_machines(base_mach, tail)); 575 } 576 577 return mach; 578 } 579 580 581 /* mkstate - create a state with a transition on a given symbol 582 * 583 * synopsis 584 * 585 * state = mkstate( sym ); 586 * 587 * state - a new state matching sym 588 * sym - the symbol the new state is to have an out-transition on 589 * 590 * note that this routine makes new states in ascending order through the 591 * state array (and increments LASTNFA accordingly). The routine DUPMACHINE 592 * relies on machines being made in ascending order and that they are 593 * CONTIGUOUS. Change it and you will have to rewrite DUPMACHINE (kludge 594 * that it admittedly is) 595 */ 596 597 int 598 mkstate(int sym) 599 { 600 if (++lastnfa >= current_mns) { 601 if ((current_mns += MNS_INCREMENT) >= maximum_mns) 602 lerrif(_ 603 ("input rules are too complicated (>= %d NFA states)"), 604 current_mns); 605 606 ++num_reallocs; 607 608 firstst = reallocate_integer_array(firstst, current_mns); 609 lastst = reallocate_integer_array(lastst, current_mns); 610 finalst = reallocate_integer_array(finalst, current_mns); 611 transchar = 612 reallocate_integer_array(transchar, current_mns); 613 trans1 = reallocate_integer_array(trans1, current_mns); 614 trans2 = reallocate_integer_array(trans2, current_mns); 615 accptnum = 616 reallocate_integer_array(accptnum, current_mns); 617 assoc_rule = 618 reallocate_integer_array(assoc_rule, current_mns); 619 state_type = 620 reallocate_integer_array(state_type, current_mns); 621 } 622 firstst[lastnfa] = lastnfa; 623 finalst[lastnfa] = lastnfa; 624 lastst[lastnfa] = lastnfa; 625 transchar[lastnfa] = sym; 626 trans1[lastnfa] = NO_TRANSITION; 627 trans2[lastnfa] = NO_TRANSITION; 628 accptnum[lastnfa] = NIL; 629 assoc_rule[lastnfa] = num_rules; 630 state_type[lastnfa] = current_state_type; 631 632 /* 633 * Fix up equivalence classes base on this transition. Note that any 634 * character which has its own transition gets its own equivalence 635 * class. Thus only characters which are only in character classes 636 * have a chance at being in the same equivalence class. E.g. "a|b" 637 * puts 'a' and 'b' into two different equivalence classes. "[ab]" 638 * puts them in the same equivalence class (barring other differences 639 * elsewhere in the input). 640 */ 641 642 if (sym < 0) { 643 /* 644 * We don't have to update the equivalence classes since that 645 * was already done when the ccl was created for the first 646 * time. 647 */ 648 } else if (sym == SYM_EPSILON) 649 ++numeps; 650 651 else { 652 check_char(sym); 653 654 if (useecs) 655 /* Map NUL's to csize. */ 656 mkechar(sym ? sym : csize, nextecm, ecgroup); 657 } 658 659 return lastnfa; 660 } 661 662 663 /* mkxtion - make a transition from one state to another 664 * 665 * synopsis 666 * 667 * mkxtion( statefrom, stateto ); 668 * 669 * statefrom - the state from which the transition is to be made 670 * stateto - the state to which the transition is to be made 671 */ 672 673 void 674 mkxtion(int statefrom, int stateto) 675 { 676 if (trans1[statefrom] == NO_TRANSITION) 677 trans1[statefrom] = stateto; 678 679 else if ((transchar[statefrom] != SYM_EPSILON) || 680 (trans2[statefrom] != NO_TRANSITION)) 681 flexfatal(_("found too many transitions in mkxtion()")); 682 683 else { /* second out-transition for an epsilon state */ 684 ++eps2; 685 trans2[statefrom] = stateto; 686 } 687 } 688 689 /* new_rule - initialize for a new rule */ 690 691 void 692 new_rule(void) 693 { 694 if (++num_rules >= current_max_rules) { 695 ++num_reallocs; 696 current_max_rules += MAX_RULES_INCREMENT; 697 rule_type = reallocate_integer_array(rule_type, 698 current_max_rules); 699 rule_linenum = reallocate_integer_array(rule_linenum, 700 current_max_rules); 701 rule_useful = reallocate_integer_array(rule_useful, 702 current_max_rules); 703 rule_has_nl = reallocate_bool_array(rule_has_nl, 704 current_max_rules); 705 } 706 if (num_rules > MAX_RULE) 707 lerrif(_("too many rules (> %d)!"), MAX_RULE); 708 709 rule_linenum[num_rules] = linenum; 710 rule_useful[num_rules] = false; 711 rule_has_nl[num_rules] = false; 712 } 713