1 /* $OpenBSD: b.c,v 1.45 2023/10/30 17:52:54 millert Exp $ */ 2 /**************************************************************** 3 Copyright (C) Lucent Technologies 1997 4 All Rights Reserved 5 6 Permission to use, copy, modify, and distribute this software and 7 its documentation for any purpose and without fee is hereby 8 granted, provided that the above copyright notice appear in all 9 copies and that both that the copyright notice and this 10 permission notice and warranty disclaimer appear in supporting 11 documentation, and that the name Lucent Technologies or any of 12 its entities not be used in advertising or publicity pertaining 13 to distribution of the software without specific, written prior 14 permission. 15 16 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 17 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 18 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY 19 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 20 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 21 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 22 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 23 THIS SOFTWARE. 24 ****************************************************************/ 25 26 /* lasciate ogne speranza, voi ch'intrate. */ 27 28 #define DEBUG 29 30 #include <ctype.h> 31 #include <limits.h> 32 #include <stdio.h> 33 #include <string.h> 34 #include <stdlib.h> 35 #include "awk.h" 36 #include "awkgram.tab.h" 37 38 #define MAXLIN 22 39 40 #define type(v) (v)->nobj /* badly overloaded here */ 41 #define info(v) (v)->ntype /* badly overloaded here */ 42 #define left(v) (v)->narg[0] 43 #define right(v) (v)->narg[1] 44 #define parent(v) (v)->nnext 45 46 #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL: 47 #define ELEAF case EMPTYRE: /* empty string in regexp */ 48 #define UNARY case STAR: case PLUS: case QUEST: 49 50 /* encoding in tree Nodes: 51 leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE): 52 left is index, right contains value or pointer to value 53 unary (STAR, PLUS, QUEST): left is child, right is null 54 binary (CAT, OR): left and right are children 55 parent contains pointer to parent 56 */ 57 58 59 int *setvec; 60 int *tmpset; 61 int maxsetvec = 0; 62 63 int rtok; /* next token in current re */ 64 int rlxval; 65 static const uschar *rlxstr; 66 static const uschar *prestr; /* current position in current re */ 67 static const uschar *lastre; /* origin of last re */ 68 static const uschar *lastatom; /* origin of last Atom */ 69 static const uschar *starttok; 70 static const uschar *basestr; /* starts with original, replaced during 71 repetition processing */ 72 static const uschar *firstbasestr; 73 74 static int setcnt; 75 static int poscnt; 76 77 const char *patbeg; 78 int patlen; 79 80 #define NFA 128 /* cache this many dynamic fa's */ 81 fa *fatab[NFA]; 82 int nfatab = 0; /* entries in fatab */ 83 84 extern int u8_nextlen(const char *s); 85 86 87 /* utf-8 mechanism: 88 89 For most of Awk, utf-8 strings just "work", since they look like 90 null-terminated sequences of 8-bit bytes. 91 92 Functions like length(), index(), and substr() have to operate 93 in units of utf-8 characters. The u8_* functions in run.c 94 handle this. 95 96 Regular expressions are more complicated, since the basic 97 mechanism of the goto table used 8-bit byte indices into the 98 gototab entries to compute the next state. Unicode is a lot 99 bigger, so the gototab entries are now structs with a character 100 and a next state, and there is a linear search of the characters 101 to find the state. (Yes, this is slower, by a significant 102 amount. Tough.) 103 104 Throughout the RE mechanism in b.c, utf-8 characters are 105 converted to their utf-32 value. This mostly shows up in 106 cclenter, which expands character class ranges like a-z and now 107 alpha-omega. The size of a gototab array is still about 256. 108 This should be dynamic, but for now things work ok for a single 109 code page of Unicode, which is the most likely case. 110 111 The code changes are localized in run.c and b.c. I have added a 112 handful of functions to somewhat better hide the implementation, 113 but a lot more could be done. 114 115 */ 116 117 static int get_gototab(fa*, int, int); 118 static int set_gototab(fa*, int, int, int); 119 static void reset_gototab(fa*, int); 120 extern int u8_rune(int *, const uschar *); 121 122 static int * 123 intalloc(size_t n, const char *f) 124 { 125 int *p = (int *) calloc(n, sizeof(int)); 126 if (p == NULL) 127 overflo(f); 128 return p; 129 } 130 131 static void 132 allocsetvec(const char *f) 133 { 134 maxsetvec = MAXLIN; 135 setvec = (int *) reallocarray(setvec, maxsetvec, sizeof(*setvec)); 136 tmpset = (int *) reallocarray(tmpset, maxsetvec, sizeof(*tmpset)); 137 if (setvec == NULL || tmpset == NULL) 138 overflo(f); 139 } 140 141 static void 142 resizesetvec(const char *f) 143 { 144 setvec = (int *) reallocarray(setvec, maxsetvec, 4 * sizeof(*setvec)); 145 tmpset = (int *) reallocarray(tmpset, maxsetvec, 4 * sizeof(*tmpset)); 146 if (setvec == NULL || tmpset == NULL) 147 overflo(f); 148 maxsetvec *= 4; 149 } 150 151 static void 152 resize_state(fa *f, int state) 153 { 154 gtt **p; 155 uschar *p2; 156 int **p3; 157 int i, new_count; 158 159 if (++state < f->state_count) 160 return; 161 162 new_count = state + 10; /* needs to be tuned */ 163 164 p = (gtt **) reallocarray(f->gototab, new_count, sizeof(f->gototab[0])); 165 if (p == NULL) 166 goto out; 167 f->gototab = p; 168 169 p2 = (uschar *) reallocarray(f->out, new_count, sizeof(f->out[0])); 170 if (p2 == NULL) 171 goto out; 172 f->out = p2; 173 174 p3 = (int **) reallocarray(f->posns, new_count, sizeof(f->posns[0])); 175 if (p3 == NULL) 176 goto out; 177 f->posns = p3; 178 179 for (i = f->state_count; i < new_count; ++i) { 180 f->gototab[i] = (gtt *) calloc(NCHARS, sizeof(**f->gototab)); 181 if (f->gototab[i] == NULL) 182 goto out; 183 f->out[i] = 0; 184 f->posns[i] = NULL; 185 } 186 f->gototab_len = NCHARS; /* should be variable, growable */ 187 f->state_count = new_count; 188 return; 189 out: 190 overflo(__func__); 191 } 192 193 fa *makedfa(const char *s, bool anchor) /* returns dfa for reg expr s */ 194 { 195 int i, use, nuse; 196 fa *pfa; 197 static int now = 1; 198 199 if (setvec == NULL) { /* first time through any RE */ 200 allocsetvec(__func__); 201 } 202 203 if (compile_time != RUNNING) /* a constant for sure */ 204 return mkdfa(s, anchor); 205 for (i = 0; i < nfatab; i++) /* is it there already? */ 206 if (fatab[i]->anchor == anchor 207 && strcmp((const char *) fatab[i]->restr, s) == 0) { 208 fatab[i]->use = now++; 209 return fatab[i]; 210 } 211 pfa = mkdfa(s, anchor); 212 if (nfatab < NFA) { /* room for another */ 213 fatab[nfatab] = pfa; 214 fatab[nfatab]->use = now++; 215 nfatab++; 216 return pfa; 217 } 218 use = fatab[0]->use; /* replace least-recently used */ 219 nuse = 0; 220 for (i = 1; i < nfatab; i++) 221 if (fatab[i]->use < use) { 222 use = fatab[i]->use; 223 nuse = i; 224 } 225 freefa(fatab[nuse]); 226 fatab[nuse] = pfa; 227 pfa->use = now++; 228 return pfa; 229 } 230 231 fa *mkdfa(const char *s, bool anchor) /* does the real work of making a dfa */ 232 /* anchor = true for anchored matches, else false */ 233 { 234 Node *p, *p1; 235 fa *f; 236 237 firstbasestr = (const uschar *) s; 238 basestr = firstbasestr; 239 p = reparse(s); 240 p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p); 241 /* put ALL STAR in front of reg. exp. */ 242 p1 = op2(CAT, p1, op2(FINAL, NIL, NIL)); 243 /* put FINAL after reg. exp. */ 244 245 poscnt = 0; 246 penter(p1); /* enter parent pointers and leaf indices */ 247 if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL) 248 overflo(__func__); 249 f->accept = poscnt-1; /* penter has computed number of positions in re */ 250 cfoll(f, p1); /* set up follow sets */ 251 freetr(p1); 252 resize_state(f, 1); 253 f->posns[0] = intalloc(*(f->re[0].lfollow), __func__); 254 f->posns[1] = intalloc(1, __func__); 255 *f->posns[1] = 0; 256 f->initstat = makeinit(f, anchor); 257 f->anchor = anchor; 258 f->restr = (uschar *) tostring(s); 259 if (firstbasestr != basestr) { 260 if (basestr) 261 xfree(basestr); 262 } 263 return f; 264 } 265 266 int makeinit(fa *f, bool anchor) 267 { 268 int i, k; 269 270 f->curstat = 2; 271 f->out[2] = 0; 272 k = *(f->re[0].lfollow); 273 xfree(f->posns[2]); 274 f->posns[2] = intalloc(k + 1, __func__); 275 for (i = 0; i <= k; i++) { 276 (f->posns[2])[i] = (f->re[0].lfollow)[i]; 277 } 278 if ((f->posns[2])[1] == f->accept) 279 f->out[2] = 1; 280 reset_gototab(f, 2); 281 f->curstat = cgoto(f, 2, HAT); 282 if (anchor) { 283 *f->posns[2] = k-1; /* leave out position 0 */ 284 for (i = 0; i < k; i++) { 285 (f->posns[0])[i] = (f->posns[2])[i]; 286 } 287 288 f->out[0] = f->out[2]; 289 if (f->curstat != 2) 290 --(*f->posns[f->curstat]); 291 } 292 return f->curstat; 293 } 294 295 void penter(Node *p) /* set up parent pointers and leaf indices */ 296 { 297 switch (type(p)) { 298 ELEAF 299 LEAF 300 info(p) = poscnt; 301 poscnt++; 302 break; 303 UNARY 304 penter(left(p)); 305 parent(left(p)) = p; 306 break; 307 case CAT: 308 case OR: 309 penter(left(p)); 310 penter(right(p)); 311 parent(left(p)) = p; 312 parent(right(p)) = p; 313 break; 314 case ZERO: 315 break; 316 default: /* can't happen */ 317 FATAL("can't happen: unknown type %d in penter", type(p)); 318 break; 319 } 320 } 321 322 void freetr(Node *p) /* free parse tree */ 323 { 324 switch (type(p)) { 325 ELEAF 326 LEAF 327 xfree(p); 328 break; 329 UNARY 330 case ZERO: 331 freetr(left(p)); 332 xfree(p); 333 break; 334 case CAT: 335 case OR: 336 freetr(left(p)); 337 freetr(right(p)); 338 xfree(p); 339 break; 340 default: /* can't happen */ 341 FATAL("can't happen: unknown type %d in freetr", type(p)); 342 break; 343 } 344 } 345 346 /* in the parsing of regular expressions, metacharacters like . have */ 347 /* to be seen literally; \056 is not a metacharacter. */ 348 349 int hexstr(const uschar **pp, int max) /* find and eval hex string at pp, return new p */ 350 { /* only pick up one 8-bit byte (2 chars) */ 351 const uschar *p; 352 int n = 0; 353 int i; 354 355 for (i = 0, p = *pp; i < max && isxdigit(*p); i++, p++) { 356 if (isdigit(*p)) 357 n = 16 * n + *p - '0'; 358 else if (*p >= 'a' && *p <= 'f') 359 n = 16 * n + *p - 'a' + 10; 360 else if (*p >= 'A' && *p <= 'F') 361 n = 16 * n + *p - 'A' + 10; 362 } 363 *pp = p; 364 return n; 365 } 366 367 #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */ 368 369 int quoted(const uschar **pp) /* pick up next thing after a \\ */ 370 /* and increment *pp */ 371 { 372 const uschar *p = *pp; 373 int c; 374 375 /* BUG: should advance by utf-8 char even if makes no sense */ 376 377 if ((c = *p++) == 't') { 378 c = '\t'; 379 } else if (c == 'n') { 380 c = '\n'; 381 } else if (c == 'f') { 382 c = '\f'; 383 } else if (c == 'r') { 384 c = '\r'; 385 } else if (c == 'b') { 386 c = '\b'; 387 } else if (c == 'v') { 388 c = '\v'; 389 } else if (c == 'a') { 390 c = '\a'; 391 } else if (c == '\\') { 392 c = '\\'; 393 } else if (c == 'x') { /* 2 hex digits follow */ 394 c = hexstr(&p, 2); /* this adds a null if number is invalid */ 395 } else if (c == 'u') { /* unicode char number up to 8 hex digits */ 396 c = hexstr(&p, 8); 397 } else if (isoctdigit(c)) { /* \d \dd \ddd */ 398 int n = c - '0'; 399 if (isoctdigit(*p)) { 400 n = 8 * n + *p++ - '0'; 401 if (isoctdigit(*p)) 402 n = 8 * n + *p++ - '0'; 403 } 404 c = n; 405 } /* else */ 406 /* c = c; */ 407 *pp = p; 408 return c; 409 } 410 411 int *cclenter(const char *argp) /* add a character class */ 412 { 413 int i, c, c2; 414 int n; 415 const uschar *p = (const uschar *) argp; 416 int *bp, *retp; 417 static int *buf = NULL; 418 static int bufsz = 100; 419 420 if (buf == NULL && (buf = (int *) calloc(bufsz, sizeof(int))) == NULL) 421 FATAL("out of space for character class [%.10s...] 1", p); 422 bp = buf; 423 for (i = 0; *p != 0; ) { 424 n = u8_rune(&c, p); 425 p += n; 426 if (c == '\\') { 427 c = quoted(&p); 428 } else if (c == '-' && i > 0 && bp[-1] != 0) { 429 if (*p != 0) { 430 c = bp[-1]; 431 /* c2 = *p++; */ 432 n = u8_rune(&c2, p); 433 p += n; 434 if (c2 == '\\') 435 c2 = quoted(&p); /* BUG: sets p, has to be u8 size */ 436 if (c > c2) { /* empty; ignore */ 437 bp--; 438 i--; 439 continue; 440 } 441 while (c < c2) { 442 if (i >= bufsz) { 443 buf = (int *) reallocarray(buf, bufsz, 2 * sizeof(int)); 444 if (buf == NULL) 445 FATAL("out of space for character class [%.10s...] 2", p); 446 bufsz *= 2; 447 bp = buf + i; 448 } 449 *bp++ = ++c; 450 i++; 451 } 452 continue; 453 } 454 } 455 if (i >= bufsz) { 456 buf = (int *) reallocarray(buf, bufsz, 2 * sizeof(int)); 457 if (buf == NULL) 458 FATAL("out of space for character class [%.10s...] 2", p); 459 bufsz *= 2; 460 bp = buf + i; 461 } 462 *bp++ = c; 463 i++; 464 } 465 *bp = 0; 466 /* DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); BUG: can't print array of int */ 467 /* xfree(op); BUG: what are we freeing here? */ 468 retp = (int *) calloc(bp-buf+1, sizeof(int)); 469 for (i = 0; i < bp-buf+1; i++) 470 retp[i] = buf[i]; 471 return retp; 472 } 473 474 void overflo(const char *s) 475 { 476 FATAL("regular expression too big: out of space in %.30s...", s); 477 } 478 479 void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */ 480 { 481 int i; 482 int *p; 483 484 switch (type(v)) { 485 ELEAF 486 LEAF 487 f->re[info(v)].ltype = type(v); 488 f->re[info(v)].lval.np = right(v); 489 while (f->accept >= maxsetvec) { /* guessing here! */ 490 resizesetvec(__func__); 491 } 492 for (i = 0; i <= f->accept; i++) 493 setvec[i] = 0; 494 setcnt = 0; 495 follow(v); /* computes setvec and setcnt */ 496 p = intalloc(setcnt + 1, __func__); 497 f->re[info(v)].lfollow = p; 498 *p = setcnt; 499 for (i = f->accept; i >= 0; i--) 500 if (setvec[i] == 1) 501 *++p = i; 502 break; 503 UNARY 504 cfoll(f,left(v)); 505 break; 506 case CAT: 507 case OR: 508 cfoll(f,left(v)); 509 cfoll(f,right(v)); 510 break; 511 case ZERO: 512 break; 513 default: /* can't happen */ 514 FATAL("can't happen: unknown type %d in cfoll", type(v)); 515 } 516 } 517 518 int first(Node *p) /* collects initially active leaves of p into setvec */ 519 /* returns 0 if p matches empty string */ 520 { 521 int b, lp; 522 523 switch (type(p)) { 524 ELEAF 525 LEAF 526 lp = info(p); /* look for high-water mark of subscripts */ 527 while (setcnt >= maxsetvec || lp >= maxsetvec) { /* guessing here! */ 528 resizesetvec(__func__); 529 } 530 if (type(p) == EMPTYRE) { 531 setvec[lp] = 0; 532 return(0); 533 } 534 if (setvec[lp] != 1) { 535 setvec[lp] = 1; 536 setcnt++; 537 } 538 if (type(p) == CCL && (*(int *) right(p)) == 0) 539 return(0); /* empty CCL */ 540 return(1); 541 case PLUS: 542 if (first(left(p)) == 0) 543 return(0); 544 return(1); 545 case STAR: 546 case QUEST: 547 first(left(p)); 548 return(0); 549 case CAT: 550 if (first(left(p)) == 0 && first(right(p)) == 0) return(0); 551 return(1); 552 case OR: 553 b = first(right(p)); 554 if (first(left(p)) == 0 || b == 0) return(0); 555 return(1); 556 case ZERO: 557 return 0; 558 } 559 FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */ 560 return(-1); 561 } 562 563 void follow(Node *v) /* collects leaves that can follow v into setvec */ 564 { 565 Node *p; 566 567 if (type(v) == FINAL) 568 return; 569 p = parent(v); 570 switch (type(p)) { 571 case STAR: 572 case PLUS: 573 first(v); 574 follow(p); 575 return; 576 577 case OR: 578 case QUEST: 579 follow(p); 580 return; 581 582 case CAT: 583 if (v == left(p)) { /* v is left child of p */ 584 if (first(right(p)) == 0) { 585 follow(p); 586 return; 587 } 588 } else /* v is right child */ 589 follow(p); 590 return; 591 } 592 } 593 594 int member(int c, int *sarg) /* is c in s? */ 595 { 596 int *s = (int *) sarg; 597 598 while (*s) 599 if (c == *s++) 600 return(1); 601 return(0); 602 } 603 604 static int get_gototab(fa *f, int state, int ch) /* hide gototab inplementation */ 605 { 606 int i; 607 for (i = 0; i < f->gototab_len; i++) { 608 if (f->gototab[state][i].ch == 0) 609 break; 610 if (f->gototab[state][i].ch == ch) 611 return f->gototab[state][i].state; 612 } 613 return 0; 614 } 615 616 static void reset_gototab(fa *f, int state) /* hide gototab inplementation */ 617 { 618 memset(f->gototab[state], 0, f->gototab_len * sizeof(**f->gototab)); 619 } 620 621 static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab inplementation */ 622 { 623 int i; 624 for (i = 0; i < f->gototab_len; i++) { 625 if (f->gototab[state][i].ch == 0 || f->gototab[state][i].ch == ch) { 626 f->gototab[state][i].ch = ch; 627 f->gototab[state][i].state = val; 628 return val; 629 } 630 } 631 overflo(__func__); 632 return val; /* not used anywhere at the moment */ 633 } 634 635 int match(fa *f, const char *p0) /* shortest match ? */ 636 { 637 int s, ns; 638 int n; 639 int rune; 640 const uschar *p = (const uschar *) p0; 641 642 /* return pmatch(f, p0); does it matter whether longest or shortest? */ 643 644 s = f->initstat; 645 assert (s < f->state_count); 646 647 if (f->out[s]) 648 return(1); 649 do { 650 /* assert(*p < NCHARS); */ 651 n = u8_rune(&rune, p); 652 if ((ns = get_gototab(f, s, rune)) != 0) 653 s = ns; 654 else 655 s = cgoto(f, s, rune); 656 if (f->out[s]) 657 return(1); 658 if (*p == 0) 659 break; 660 p += n; 661 } while (1); /* was *p++ != 0 */ 662 return(0); 663 } 664 665 int pmatch(fa *f, const char *p0) /* longest match, for sub */ 666 { 667 int s, ns; 668 int n; 669 int rune; 670 const uschar *p = (const uschar *) p0; 671 const uschar *q; 672 673 s = f->initstat; 674 assert(s < f->state_count); 675 676 patbeg = (const char *)p; 677 patlen = -1; 678 do { 679 q = p; 680 do { 681 if (f->out[s]) /* final state */ 682 patlen = q-p; 683 /* assert(*q < NCHARS); */ 684 n = u8_rune(&rune, q); 685 if ((ns = get_gototab(f, s, rune)) != 0) 686 s = ns; 687 else 688 s = cgoto(f, s, rune); 689 690 assert(s < f->state_count); 691 692 if (s == 1) { /* no transition */ 693 if (patlen >= 0) { 694 patbeg = (const char *) p; 695 return(1); 696 } 697 else 698 goto nextin; /* no match */ 699 } 700 if (*q == 0) 701 break; 702 q += n; 703 } while (1); 704 q++; /* was *q++ */ 705 if (f->out[s]) 706 patlen = q-p-1; /* don't count $ */ 707 if (patlen >= 0) { 708 patbeg = (const char *) p; 709 return(1); 710 } 711 nextin: 712 s = 2; 713 if (*p == 0) 714 break; 715 n = u8_rune(&rune, p); 716 p += n; 717 } while (1); /* was *p++ */ 718 return (0); 719 } 720 721 int nematch(fa *f, const char *p0) /* non-empty match, for sub */ 722 { 723 int s, ns; 724 int n; 725 int rune; 726 const uschar *p = (const uschar *) p0; 727 const uschar *q; 728 729 s = f->initstat; 730 assert(s < f->state_count); 731 732 patbeg = (const char *)p; 733 patlen = -1; 734 while (*p) { 735 q = p; 736 do { 737 if (f->out[s]) /* final state */ 738 patlen = q-p; 739 /* assert(*q < NCHARS); */ 740 n = u8_rune(&rune, q); 741 if ((ns = get_gototab(f, s, rune)) != 0) 742 s = ns; 743 else 744 s = cgoto(f, s, rune); 745 if (s == 1) { /* no transition */ 746 if (patlen > 0) { 747 patbeg = (const char *) p; 748 return(1); 749 } else 750 goto nnextin; /* no nonempty match */ 751 } 752 if (*q == 0) 753 break; 754 q += n; 755 } while (1); 756 q++; 757 if (f->out[s]) 758 patlen = q-p-1; /* don't count $ */ 759 if (patlen > 0 ) { 760 patbeg = (const char *) p; 761 return(1); 762 } 763 nnextin: 764 s = 2; 765 p++; 766 } 767 return (0); 768 } 769 770 771 #define MAX_UTF_BYTES 4 // UTF-8 is up to 4 bytes long 772 773 // Read one rune at a time from the given FILE*. Return both 774 // the bytes and the actual rune. 775 776 struct runedata { 777 int rune; 778 size_t len; 779 char bytes[6]; 780 }; 781 782 struct runedata getrune(FILE *fp) 783 { 784 struct runedata result; 785 int c, i, next; 786 787 memset(&result, 0, sizeof(result)); 788 789 c = getc(fp); 790 if (c == EOF) 791 return result; // result.rune == 0 --> EOF 792 else if (c < 128 || awk_mb_cur_max == 1) { 793 result.bytes[0] = c; 794 result.len = 1; 795 result.rune = c; 796 797 return result; 798 } 799 800 // need to get bytes and fill things in 801 result.bytes[0] = c; 802 result.len = 1; 803 804 next = 1; 805 for (i = 1; i < MAX_UTF_BYTES; i++) { 806 c = getc(fp); 807 if (c == EOF) 808 break; 809 result.bytes[next++] = c; 810 result.len++; 811 } 812 813 // put back any extra input bytes 814 int actual_len = u8_nextlen(result.bytes); 815 while (result.len > actual_len) { 816 ungetc(result.bytes[--result.len], fp); 817 } 818 819 result.bytes[result.len] = '\0'; 820 (void) u8_rune(& result.rune, (uschar *) result.bytes); 821 822 return result; 823 } 824 825 826 /* 827 * NAME 828 * fnematch 829 * 830 * DESCRIPTION 831 * A stream-fed version of nematch which transfers characters to a 832 * null-terminated buffer. All characters up to and including the last 833 * character of the matching text or EOF are placed in the buffer. If 834 * a match is found, patbeg and patlen are set appropriately. 835 * 836 * RETURN VALUES 837 * false No match found. 838 * true Match found. 839 */ 840 841 bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum) 842 { 843 char *buf = *pbuf; 844 int bufsize = *pbufsize; 845 int i, j, k, ns, s; 846 struct runedata r; 847 848 s = pfa->initstat; 849 patlen = 0; 850 851 /* 852 * All indices relative to buf. 853 * i <= j <= k <= bufsize 854 * 855 * i: origin of active substring (first byte of first character) 856 * j: current character (last byte of current character) 857 * k: destination of next getc() 858 */ 859 i = -1, k = 0; 860 do { 861 j = i++; 862 do { 863 r = getrune(f); 864 if ((++j + r.len) >= k) { 865 if (k >= bufsize) 866 if (!adjbuf(&buf, &bufsize, bufsize+1, quantum, 0, "fnematch")) 867 FATAL("stream '%.30s...' too long", buf); 868 } 869 memcpy(buf + k, r.bytes, r.len); 870 j += r.len - 1; // incremented next time around the loop 871 k += r.len; 872 873 if ((ns = get_gototab(pfa, s, r.rune)) != 0) 874 s = ns; 875 else 876 s = cgoto(pfa, s, r.rune); 877 878 if (pfa->out[s]) { /* final state */ 879 patlen = j - i + 1; 880 if (r.rune == 0) /* don't count $ */ 881 patlen--; 882 } 883 } while (buf[j] && s != 1); 884 s = 2; 885 if (r.len > 1) 886 i += r.len - 1; // i incremented around the loop 887 } while (buf[i] && !patlen); 888 889 /* adjbuf() may have relocated a resized buffer. Inform the world. */ 890 *pbuf = buf; 891 *pbufsize = bufsize; 892 893 if (patlen) { 894 patbeg = buf + i; 895 /* 896 * Under no circumstances is the last character fed to 897 * the automaton part of the match. It is EOF's nullbyte, 898 * or it sent the automaton into a state with no further 899 * transitions available (s==1), or both. Room for a 900 * terminating nullbyte is guaranteed. 901 * 902 * ungetc any chars after the end of matching text 903 * (except for EOF's nullbyte, if present) and null 904 * terminate the buffer. 905 */ 906 do { 907 int ii; 908 for (ii = r.len; ii > 0; ii--) 909 if (buf[--k] && ungetc(buf[k], f) == EOF) 910 FATAL("unable to ungetc '%c'", buf[k]); 911 } while (k > i + patlen); 912 buf[k] = '\0'; 913 return true; 914 } 915 else 916 return false; 917 } 918 919 Node *reparse(const char *p) /* parses regular expression pointed to by p */ 920 { /* uses relex() to scan regular expression */ 921 Node *np; 922 923 DPRINTF("reparse <%s>\n", p); 924 lastre = prestr = (const uschar *) p; /* prestr points to string to be parsed */ 925 rtok = relex(); 926 /* GNU compatibility: an empty regexp matches anything */ 927 if (rtok == '\0') { 928 /* FATAL("empty regular expression"); previous */ 929 return(op2(EMPTYRE, NIL, NIL)); 930 } 931 np = regexp(); 932 if (rtok != '\0') 933 FATAL("syntax error in regular expression %s at %s", lastre, prestr); 934 return(np); 935 } 936 937 Node *regexp(void) /* top-level parse of reg expr */ 938 { 939 return (alt(concat(primary()))); 940 } 941 942 Node *primary(void) 943 { 944 Node *np; 945 int savelastatom; 946 947 switch (rtok) { 948 case CHAR: 949 lastatom = starttok; 950 np = op2(CHAR, NIL, itonp(rlxval)); 951 rtok = relex(); 952 return (unary(np)); 953 case ALL: 954 rtok = relex(); 955 return (unary(op2(ALL, NIL, NIL))); 956 case EMPTYRE: 957 rtok = relex(); 958 return (unary(op2(EMPTYRE, NIL, NIL))); 959 case DOT: 960 lastatom = starttok; 961 rtok = relex(); 962 return (unary(op2(DOT, NIL, NIL))); 963 case CCL: 964 np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr)); 965 lastatom = starttok; 966 rtok = relex(); 967 return (unary(np)); 968 case NCCL: 969 np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr)); 970 lastatom = starttok; 971 rtok = relex(); 972 return (unary(np)); 973 case '^': 974 rtok = relex(); 975 return (unary(op2(CHAR, NIL, itonp(HAT)))); 976 case '$': 977 rtok = relex(); 978 return (unary(op2(CHAR, NIL, NIL))); 979 case '(': 980 lastatom = starttok; 981 savelastatom = starttok - basestr; /* Retain over recursion */ 982 rtok = relex(); 983 if (rtok == ')') { /* special pleading for () */ 984 rtok = relex(); 985 return unary(op2(CCL, NIL, (Node *) cclenter(""))); 986 } 987 np = regexp(); 988 if (rtok == ')') { 989 lastatom = basestr + savelastatom; /* Restore */ 990 rtok = relex(); 991 return (unary(np)); 992 } 993 else 994 FATAL("syntax error in regular expression %s at %s", lastre, prestr); 995 default: 996 FATAL("illegal primary in regular expression %s at %s", lastre, prestr); 997 } 998 return 0; /*NOTREACHED*/ 999 } 1000 1001 Node *concat(Node *np) 1002 { 1003 switch (rtok) { 1004 case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(': 1005 return (concat(op2(CAT, np, primary()))); 1006 case EMPTYRE: 1007 rtok = relex(); 1008 return (concat(op2(CAT, op2(CCL, NIL, (Node *) cclenter("")), 1009 primary()))); 1010 } 1011 return (np); 1012 } 1013 1014 Node *alt(Node *np) 1015 { 1016 if (rtok == OR) { 1017 rtok = relex(); 1018 return (alt(op2(OR, np, concat(primary())))); 1019 } 1020 return (np); 1021 } 1022 1023 Node *unary(Node *np) 1024 { 1025 switch (rtok) { 1026 case STAR: 1027 rtok = relex(); 1028 return (unary(op2(STAR, np, NIL))); 1029 case PLUS: 1030 rtok = relex(); 1031 return (unary(op2(PLUS, np, NIL))); 1032 case QUEST: 1033 rtok = relex(); 1034 return (unary(op2(QUEST, np, NIL))); 1035 case ZERO: 1036 rtok = relex(); 1037 return (unary(op2(ZERO, np, NIL))); 1038 default: 1039 return (np); 1040 } 1041 } 1042 1043 /* 1044 * Character class definitions conformant to the POSIX locale as 1045 * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source 1046 * and operating character sets are both ASCII (ISO646) or supersets 1047 * thereof. 1048 * 1049 * Note that to avoid overflowing the temporary buffer used in 1050 * relex(), the expanded character class (prior to range expansion) 1051 * must be less than twice the size of their full name. 1052 */ 1053 1054 /* Because isblank doesn't show up in any of the header files on any 1055 * system i use, it's defined here. if some other locale has a richer 1056 * definition of "blank", define HAS_ISBLANK and provide your own 1057 * version. 1058 * the parentheses here are an attempt to find a path through the maze 1059 * of macro definition and/or function and/or version provided. thanks 1060 * to nelson beebe for the suggestion; let's see if it works everywhere. 1061 */ 1062 1063 #ifndef HAS_ISBLANK 1064 1065 int (xisblank)(int c) 1066 { 1067 return c==' ' || c=='\t'; 1068 } 1069 1070 #endif 1071 1072 static const struct charclass { 1073 const char *cc_name; 1074 int cc_namelen; 1075 int (*cc_func)(int); 1076 } charclasses[] = { 1077 { "alnum", 5, isalnum }, 1078 { "alpha", 5, isalpha }, 1079 #ifndef HAS_ISBLANK 1080 { "blank", 5, xisblank }, 1081 #else 1082 { "blank", 5, isblank }, 1083 #endif 1084 { "cntrl", 5, iscntrl }, 1085 { "digit", 5, isdigit }, 1086 { "graph", 5, isgraph }, 1087 { "lower", 5, islower }, 1088 { "print", 5, isprint }, 1089 { "punct", 5, ispunct }, 1090 { "space", 5, isspace }, 1091 { "upper", 5, isupper }, 1092 { "xdigit", 6, isxdigit }, 1093 { NULL, 0, NULL }, 1094 }; 1095 1096 #define REPEAT_SIMPLE 0 1097 #define REPEAT_PLUS_APPENDED 1 1098 #define REPEAT_WITH_Q 2 1099 #define REPEAT_ZERO 3 1100 1101 static int 1102 replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom, 1103 int atomlen, int firstnum, int secondnum, int special_case) 1104 { 1105 int i, j; 1106 uschar *buf = NULL; 1107 int ret = 1; 1108 int init_q = (firstnum == 0); /* first added char will be ? */ 1109 int n_q_reps = secondnum-firstnum; /* m>n, so reduce until {1,m-n} left */ 1110 int prefix_length = reptok - basestr; /* prefix includes first rep */ 1111 int suffix_length = strlen((const char *) reptok) - reptoklen; /* string after rep specifier */ 1112 int size = prefix_length + suffix_length; 1113 1114 if (firstnum > 1) { /* add room for reps 2 through firstnum */ 1115 size += atomlen*(firstnum-1); 1116 } 1117 1118 /* Adjust size of buffer for special cases */ 1119 if (special_case == REPEAT_PLUS_APPENDED) { 1120 size++; /* for the final + */ 1121 } else if (special_case == REPEAT_WITH_Q) { 1122 size += init_q + (atomlen+1)* (n_q_reps-init_q); 1123 } else if (special_case == REPEAT_ZERO) { 1124 size += 2; /* just a null ERE: () */ 1125 } 1126 if ((buf = (uschar *) malloc(size + 1)) == NULL) 1127 FATAL("out of space in reg expr %.10s..", lastre); 1128 memcpy(buf, basestr, prefix_length); /* copy prefix */ 1129 j = prefix_length; 1130 if (special_case == REPEAT_ZERO) { 1131 j -= atomlen; 1132 buf[j++] = '('; 1133 buf[j++] = ')'; 1134 } 1135 for (i = 1; i < firstnum; i++) { /* copy x reps */ 1136 memcpy(&buf[j], atom, atomlen); 1137 j += atomlen; 1138 } 1139 if (special_case == REPEAT_PLUS_APPENDED) { 1140 buf[j++] = '+'; 1141 } else if (special_case == REPEAT_WITH_Q) { 1142 if (init_q) 1143 buf[j++] = '?'; 1144 for (i = init_q; i < n_q_reps; i++) { /* copy x? reps */ 1145 memcpy(&buf[j], atom, atomlen); 1146 j += atomlen; 1147 buf[j++] = '?'; 1148 } 1149 } 1150 memcpy(&buf[j], reptok+reptoklen, suffix_length); 1151 j += suffix_length; 1152 buf[j] = '\0'; 1153 /* free old basestr */ 1154 if (firstbasestr != basestr) { 1155 if (basestr) 1156 xfree(basestr); 1157 } 1158 basestr = buf; 1159 prestr = buf + prefix_length; 1160 if (special_case == REPEAT_ZERO) { 1161 prestr -= atomlen; 1162 ret++; 1163 } 1164 return ret; 1165 } 1166 1167 static int repeat(const uschar *reptok, int reptoklen, const uschar *atom, 1168 int atomlen, int firstnum, int secondnum) 1169 { 1170 /* 1171 In general, the repetition specifier or "bound" is replaced here 1172 by an equivalent ERE string, repeating the immediately previous atom 1173 and appending ? and + as needed. Note that the first copy of the 1174 atom is left in place, except in the special_case of a zero-repeat 1175 (i.e., {0}). 1176 */ 1177 if (secondnum < 0) { /* means {n,} -> repeat n-1 times followed by PLUS */ 1178 if (firstnum < 2) { 1179 /* 0 or 1: should be handled before you get here */ 1180 FATAL("internal error"); 1181 } else { 1182 return replace_repeat(reptok, reptoklen, atom, atomlen, 1183 firstnum, secondnum, REPEAT_PLUS_APPENDED); 1184 } 1185 } else if (firstnum == secondnum) { /* {n} or {n,n} -> simply repeat n-1 times */ 1186 if (firstnum == 0) { /* {0} or {0,0} */ 1187 /* This case is unusual because the resulting 1188 replacement string might actually be SMALLER than 1189 the original ERE */ 1190 return replace_repeat(reptok, reptoklen, atom, atomlen, 1191 firstnum, secondnum, REPEAT_ZERO); 1192 } else { /* (firstnum >= 1) */ 1193 return replace_repeat(reptok, reptoklen, atom, atomlen, 1194 firstnum, secondnum, REPEAT_SIMPLE); 1195 } 1196 } else if (firstnum < secondnum) { /* {n,m} -> repeat n-1 times then alternate */ 1197 /* x{n,m} => xx...x{1, m-n+1} => xx...x?x?x?..x? */ 1198 return replace_repeat(reptok, reptoklen, atom, atomlen, 1199 firstnum, secondnum, REPEAT_WITH_Q); 1200 } else { /* Error - shouldn't be here (n>m) */ 1201 FATAL("internal error"); 1202 } 1203 return 0; 1204 } 1205 1206 extern int u8_rune(int *, const uschar *); /* run.c; should be in header file */ 1207 1208 int relex(void) /* lexical analyzer for reparse */ 1209 { 1210 int c, n; 1211 int cflag; 1212 static uschar *buf = NULL; 1213 static int bufsz = 100; 1214 uschar *bp; 1215 const struct charclass *cc; 1216 int i; 1217 int num, m; 1218 bool commafound, digitfound; 1219 const uschar *startreptok; 1220 static int parens = 0; 1221 1222 rescan: 1223 starttok = prestr; 1224 1225 if ((n = u8_rune(&rlxval, prestr)) > 1) { 1226 prestr += n; 1227 starttok = prestr; 1228 return CHAR; 1229 } 1230 1231 switch (c = *prestr++) { 1232 case '|': return OR; 1233 case '*': return STAR; 1234 case '+': return PLUS; 1235 case '?': return QUEST; 1236 case '.': return DOT; 1237 case '\0': prestr--; return '\0'; 1238 case '^': 1239 case '$': 1240 return c; 1241 case '(': 1242 parens++; 1243 return c; 1244 case ')': 1245 if (parens) { 1246 parens--; 1247 return c; 1248 } 1249 /* unmatched close parenthesis; per POSIX, treat as literal */ 1250 rlxval = c; 1251 return CHAR; 1252 case '\\': 1253 rlxval = quoted(&prestr); 1254 return CHAR; 1255 default: 1256 rlxval = c; 1257 return CHAR; 1258 case '[': 1259 if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL) 1260 FATAL("out of space in reg expr %.10s..", lastre); 1261 bp = buf; 1262 if (*prestr == '^') { 1263 cflag = 1; 1264 prestr++; 1265 } 1266 else 1267 cflag = 0; 1268 n = 5 * strlen((const char *) prestr)+1; /* BUG: was 2. what value? */ 1269 if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1")) 1270 FATAL("out of space for reg expr %.10s...", lastre); 1271 for (; ; ) { 1272 if ((n = u8_rune(&rlxval, prestr)) > 1) { 1273 for (i = 0; i < n; i++) 1274 *bp++ = *prestr++; 1275 continue; 1276 } 1277 if ((c = *prestr++) == '\\') { 1278 *bp++ = '\\'; 1279 if ((c = *prestr++) == '\0') 1280 FATAL("nonterminated character class %.20s...", lastre); 1281 *bp++ = c; 1282 /* } else if (c == '\n') { */ 1283 /* FATAL("newline in character class %.20s...", lastre); */ 1284 } else if (c == '[' && *prestr == ':') { 1285 /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */ 1286 for (cc = charclasses; cc->cc_name; cc++) 1287 if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0) 1288 break; 1289 if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' && 1290 prestr[2 + cc->cc_namelen] == ']') { 1291 prestr += cc->cc_namelen + 3; 1292 /* 1293 * BUG: We begin at 1, instead of 0, since we 1294 * would otherwise prematurely terminate the 1295 * string for classes like [[:cntrl:]]. This 1296 * means that we can't match the NUL character, 1297 * not without first adapting the entire 1298 * program to track each string's length. 1299 */ 1300 for (i = 1; i <= UCHAR_MAX; i++) { 1301 if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "relex2")) 1302 FATAL("out of space for reg expr %.10s...", lastre); 1303 if (cc->cc_func(i)) { 1304 /* escape backslash */ 1305 if (i == '\\') { 1306 *bp++ = '\\'; 1307 n++; 1308 } 1309 1310 *bp++ = i; 1311 n++; 1312 } 1313 } 1314 } else 1315 *bp++ = c; 1316 } else if (c == '[' && *prestr == '.') { 1317 char collate_char; 1318 prestr++; 1319 collate_char = *prestr++; 1320 if (*prestr == '.' && prestr[1] == ']') { 1321 prestr += 2; 1322 /* Found it: map via locale TBD: for 1323 now, simply return this char. This 1324 is sufficient to pass conformance 1325 test awk.ex 156 1326 */ 1327 if (*prestr == ']') { 1328 prestr++; 1329 rlxval = collate_char; 1330 return CHAR; 1331 } 1332 } 1333 } else if (c == '[' && *prestr == '=') { 1334 char equiv_char; 1335 prestr++; 1336 equiv_char = *prestr++; 1337 if (*prestr == '=' && prestr[1] == ']') { 1338 prestr += 2; 1339 /* Found it: map via locale TBD: for now 1340 simply return this char. This is 1341 sufficient to pass conformance test 1342 awk.ex 156 1343 */ 1344 if (*prestr == ']') { 1345 prestr++; 1346 rlxval = equiv_char; 1347 return CHAR; 1348 } 1349 } 1350 } else if (c == '\0') { 1351 FATAL("nonterminated character class %.20s", lastre); 1352 } else if (bp == buf) { /* 1st char is special */ 1353 *bp++ = c; 1354 } else if (c == ']') { 1355 *bp++ = 0; 1356 rlxstr = (uschar *) tostring((char *) buf); 1357 if (cflag == 0) 1358 return CCL; 1359 else 1360 return NCCL; 1361 } else 1362 *bp++ = c; 1363 } 1364 break; 1365 case '{': 1366 if (isdigit(*(prestr))) { 1367 num = 0; /* Process as a repetition */ 1368 n = -1; m = -1; 1369 commafound = false; 1370 digitfound = false; 1371 startreptok = prestr-1; 1372 /* Remember start of previous atom here ? */ 1373 } else { /* just a { char, not a repetition */ 1374 rlxval = c; 1375 return CHAR; 1376 } 1377 for (; ; ) { 1378 if ((c = *prestr++) == '}') { 1379 if (commafound) { 1380 if (digitfound) { /* {n,m} */ 1381 m = num; 1382 if (m < n) 1383 FATAL("illegal repetition expression: class %.20s", 1384 lastre); 1385 if (n == 0 && m == 1) { 1386 return QUEST; 1387 } 1388 } else { /* {n,} */ 1389 if (n == 0) 1390 return STAR; 1391 else if (n == 1) 1392 return PLUS; 1393 } 1394 } else { 1395 if (digitfound) { /* {n} same as {n,n} */ 1396 n = num; 1397 m = num; 1398 } else { /* {} */ 1399 FATAL("illegal repetition expression: class %.20s", 1400 lastre); 1401 } 1402 } 1403 if (repeat(starttok, prestr-starttok, lastatom, 1404 startreptok - lastatom, n, m) > 0) { 1405 if (n == 0 && m == 0) { 1406 return ZERO; 1407 } 1408 /* must rescan input for next token */ 1409 goto rescan; 1410 } 1411 /* Failed to replace: eat up {...} characters 1412 and treat like just PLUS */ 1413 return PLUS; 1414 } else if (c == '\0') { 1415 FATAL("nonterminated character class %.20s", 1416 lastre); 1417 } else if (isdigit(c)) { 1418 num = 10 * num + c - '0'; 1419 digitfound = true; 1420 } else if (c == ',') { 1421 if (commafound) 1422 FATAL("illegal repetition expression: class %.20s", 1423 lastre); 1424 /* looking for {n,} or {n,m} */ 1425 commafound = true; 1426 n = num; 1427 digitfound = false; /* reset */ 1428 num = 0; 1429 } else { 1430 FATAL("illegal repetition expression: class %.20s", 1431 lastre); 1432 } 1433 } 1434 break; 1435 } 1436 } 1437 1438 int cgoto(fa *f, int s, int c) 1439 { 1440 int *p, *q; 1441 int i, j, k; 1442 1443 /* assert(c == HAT || c < NCHARS); BUG: seg fault if disable test */ 1444 while (f->accept >= maxsetvec) { /* guessing here! */ 1445 resizesetvec(__func__); 1446 } 1447 for (i = 0; i <= f->accept; i++) 1448 setvec[i] = 0; 1449 setcnt = 0; 1450 resize_state(f, s); 1451 /* compute positions of gototab[s,c] into setvec */ 1452 p = f->posns[s]; 1453 for (i = 1; i <= *p; i++) { 1454 if ((k = f->re[p[i]].ltype) != FINAL) { 1455 if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np)) 1456 || (k == DOT && c != 0 && c != HAT) 1457 || (k == ALL && c != 0) 1458 || (k == EMPTYRE && c != 0) 1459 || (k == CCL && member(c, (int *) f->re[p[i]].lval.rp)) 1460 || (k == NCCL && !member(c, (int *) f->re[p[i]].lval.rp) && c != 0 && c != HAT)) { 1461 q = f->re[p[i]].lfollow; 1462 for (j = 1; j <= *q; j++) { 1463 if (q[j] >= maxsetvec) { 1464 resizesetvec(__func__); 1465 } 1466 if (setvec[q[j]] == 0) { 1467 setcnt++; 1468 setvec[q[j]] = 1; 1469 } 1470 } 1471 } 1472 } 1473 } 1474 /* determine if setvec is a previous state */ 1475 tmpset[0] = setcnt; 1476 j = 1; 1477 for (i = f->accept; i >= 0; i--) 1478 if (setvec[i]) { 1479 tmpset[j++] = i; 1480 } 1481 resize_state(f, f->curstat > s ? f->curstat : s); 1482 /* tmpset == previous state? */ 1483 for (i = 1; i <= f->curstat; i++) { 1484 p = f->posns[i]; 1485 if ((k = tmpset[0]) != p[0]) 1486 goto different; 1487 for (j = 1; j <= k; j++) 1488 if (tmpset[j] != p[j]) 1489 goto different; 1490 /* setvec is state i */ 1491 if (c != HAT) 1492 set_gototab(f, s, c, i); 1493 return i; 1494 different:; 1495 } 1496 1497 /* add tmpset to current set of states */ 1498 ++(f->curstat); 1499 resize_state(f, f->curstat); 1500 xfree(f->posns[f->curstat]); 1501 p = intalloc(setcnt + 1, __func__); 1502 1503 f->posns[f->curstat] = p; 1504 if (c != HAT) 1505 set_gototab(f, s, c, f->curstat); 1506 for (i = 0; i <= setcnt; i++) 1507 p[i] = tmpset[i]; 1508 if (setvec[f->accept]) 1509 f->out[f->curstat] = 1; 1510 else 1511 f->out[f->curstat] = 0; 1512 return f->curstat; 1513 } 1514 1515 1516 void freefa(fa *f) /* free a finite automaton */ 1517 { 1518 int i; 1519 1520 if (f == NULL) 1521 return; 1522 for (i = 0; i < f->state_count; i++) 1523 xfree(f->gototab[i]) 1524 for (i = 0; i <= f->curstat; i++) 1525 xfree(f->posns[i]); 1526 for (i = 0; i <= f->accept; i++) { 1527 xfree(f->re[i].lfollow); 1528 if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL) 1529 xfree(f->re[i].lval.np); 1530 } 1531 xfree(f->restr); 1532 xfree(f->out); 1533 xfree(f->posns); 1534 xfree(f->gototab); 1535 xfree(f); 1536 } 1537