1*14472Ssam #ifndef lint 2*14472Ssam static char sccsid[] = "@(#)b.c 4.2 08/11/83"; 3*14472Ssam #endif 46669Smckusick 56669Smckusick #include "awk.def" 66669Smckusick #include "stdio.h" 76669Smckusick #include "awk.h" 86669Smckusick 96669Smckusick extern node *op2(); 106669Smckusick extern struct fa *cgotofn(); 116669Smckusick #define MAXLIN 256 126669Smckusick #define NCHARS 128 136669Smckusick #define NSTATES 256 146669Smckusick 156669Smckusick #define type(v) v->nobj 166669Smckusick #define left(v) v->narg[0] 176669Smckusick #define right(v) v->narg[1] 186669Smckusick #define parent(v) v->nnext 196669Smckusick 206669Smckusick #define LEAF case CCL: case NCCL: case CHAR: case DOT: 216669Smckusick #define UNARY case FINAL: case STAR: case PLUS: case QUEST: 226669Smckusick 236669Smckusick /* encoding in tree nodes: 246669Smckusick leaf (CCL, NCCL, CHAR, DOT): left is index, right contains value or pointer to value 256669Smckusick unary (FINAL, STAR, PLUS, QUEST): left is child, right is null 266669Smckusick binary (CAT, OR): left and right are children 276669Smckusick parent contains pointer to parent 286669Smckusick */ 296669Smckusick 306669Smckusick struct fa { 316669Smckusick int cch; 326669Smckusick struct fa *st; 336669Smckusick }; 346669Smckusick 356669Smckusick int *state[NSTATES]; 366669Smckusick int *foll[MAXLIN]; 376669Smckusick char chars[MAXLIN]; 386669Smckusick int setvec[MAXLIN]; 396669Smckusick node *point[MAXLIN]; 406669Smckusick 416669Smckusick int setcnt; 426669Smckusick int line; 436669Smckusick 446669Smckusick 456669Smckusick struct fa *makedfa(p) /* returns dfa for tree pointed to by p */ 466669Smckusick node *p; 476669Smckusick { 486669Smckusick node *p1; 496669Smckusick struct fa *fap; 506669Smckusick p1 = op2(CAT, op2(STAR, op2(DOT, (node *) 0, (node *) 0), (node *) 0), p); 516669Smckusick /* put DOT STAR in front of reg. exp. */ 526669Smckusick p1 = op2(FINAL, p1, (node *) 0); /* install FINAL node */ 536669Smckusick 546669Smckusick line = 0; 556669Smckusick penter(p1); /* enter parent pointers and leaf indices */ 566669Smckusick point[line] = p1; /* FINAL node */ 576669Smckusick setvec[0] = 1; /* for initial DOT STAR */ 586669Smckusick cfoll(p1); /* set up follow sets */ 596669Smckusick fap = cgotofn(); 606669Smckusick freetr(p1); /* add this when alloc works */ 616669Smckusick return(fap); 626669Smckusick } 636669Smckusick 646669Smckusick penter(p) /* set up parent pointers and leaf indices */ 656669Smckusick node *p; 666669Smckusick { 676669Smckusick switch(type(p)) { 686669Smckusick LEAF 696669Smckusick left(p) = (node *) line; 706669Smckusick point[line++] = p; 716669Smckusick break; 726669Smckusick UNARY 736669Smckusick penter(left(p)); 746669Smckusick parent(left(p)) = p; 756669Smckusick break; 766669Smckusick case CAT: 776669Smckusick case OR: 786669Smckusick penter(left(p)); 796669Smckusick penter(right(p)); 806669Smckusick parent(left(p)) = p; 816669Smckusick parent(right(p)) = p; 826669Smckusick break; 836669Smckusick default: 846669Smckusick error(FATAL, "unknown type %d in penter\n", type(p)); 856669Smckusick break; 866669Smckusick } 876669Smckusick } 886669Smckusick 896669Smckusick freetr(p) /* free parse tree and follow sets */ 906669Smckusick node *p; 916669Smckusick { 926669Smckusick switch(type(p)) { 936669Smckusick LEAF 946669Smckusick xfree(foll[(int) left(p)]); 956669Smckusick xfree(p); 966669Smckusick break; 976669Smckusick UNARY 986669Smckusick freetr(left(p)); 996669Smckusick xfree(p); 1006669Smckusick break; 1016669Smckusick case CAT: 1026669Smckusick case OR: 1036669Smckusick freetr(left(p)); 1046669Smckusick freetr(right(p)); 1056669Smckusick xfree(p); 1066669Smckusick break; 1076669Smckusick default: 1086669Smckusick error(FATAL, "unknown type %d in freetr", type(p)); 1096669Smckusick break; 1106669Smckusick } 1116669Smckusick } 1126669Smckusick char *cclenter(p) 1136669Smckusick register char *p; 1146669Smckusick { 1156669Smckusick register i, c; 1166669Smckusick char *op; 1176669Smckusick 1186669Smckusick op = p; 1196669Smckusick i = 0; 1206669Smckusick while ((c = *p++) != 0) { 1216669Smckusick if (c == '-' && i > 0 && chars[i-1] != 0) { 1226669Smckusick if (*p != 0) { 1236669Smckusick c = chars[i-1]; 1246669Smckusick while (c < *p) { 1256669Smckusick if (i >= MAXLIN) 1266669Smckusick overflo(); 1276669Smckusick chars[i++] = ++c; 1286669Smckusick } 1296669Smckusick p++; 1306669Smckusick continue; 1316669Smckusick } 1326669Smckusick } 1336669Smckusick if (i >= MAXLIN) 1346669Smckusick overflo(); 1356669Smckusick chars[i++] = c; 1366669Smckusick } 1376669Smckusick chars[i++] = '\0'; 1386669Smckusick dprintf("cclenter: in = |%s|, out = |%s|\n", op, chars, NULL); 1396669Smckusick xfree(op); 1406669Smckusick return(tostring(chars)); 1416669Smckusick } 1426669Smckusick 1436669Smckusick overflo() 1446669Smckusick { 1456669Smckusick error(FATAL, "regular expression too long\n"); 1466669Smckusick } 1476669Smckusick 1486669Smckusick cfoll(v) /* enter follow set of each leaf of vertex v into foll[leaf] */ 1496669Smckusick register node *v; 1506669Smckusick { 1516669Smckusick register i; 1526669Smckusick int prev; 1536669Smckusick int *add(); 1546669Smckusick 1556669Smckusick switch(type(v)) { 1566669Smckusick LEAF 1576669Smckusick setcnt = 0; 1586669Smckusick for (i=1; i<=line; i++) 1596669Smckusick setvec[i] = 0; 1606669Smckusick follow(v); 1616669Smckusick if (notin(foll, ( (int) left(v))-1, &prev)) { 1626669Smckusick foll[(int) left(v)] = add(setcnt); 1636669Smckusick } 1646669Smckusick else 1656669Smckusick foll[ (int) left(v)] = foll[prev]; 1666669Smckusick break; 1676669Smckusick UNARY 1686669Smckusick cfoll(left(v)); 1696669Smckusick break; 1706669Smckusick case CAT: 1716669Smckusick case OR: 1726669Smckusick cfoll(left(v)); 1736669Smckusick cfoll(right(v)); 1746669Smckusick break; 1756669Smckusick default: 1766669Smckusick error(FATAL, "unknown type %d in cfoll", type(v)); 1776669Smckusick } 1786669Smckusick } 1796669Smckusick 1806669Smckusick first(p) /* collects initially active leaves of p into setvec */ 1816669Smckusick register node *p; /* returns 0 or 1 depending on whether p matches empty string */ 1826669Smckusick { 1836669Smckusick register b; 1846669Smckusick 1856669Smckusick switch(type(p)) { 1866669Smckusick LEAF 1876669Smckusick if (setvec[(int) left(p)] != 1) { 1886669Smckusick setvec[(int) left(p)] = 1; 1896669Smckusick setcnt++; 1906669Smckusick } 1916669Smckusick if (type(p) == CCL && (*(char *) right(p)) == '\0') 1926669Smckusick return(0); /* empty CCL */ 1936669Smckusick else return(1); 1946669Smckusick case FINAL: 1956669Smckusick case PLUS: 1966669Smckusick if (first(left(p)) == 0) return(0); 1976669Smckusick return(1); 1986669Smckusick case STAR: 1996669Smckusick case QUEST: 2006669Smckusick first(left(p)); 2016669Smckusick return(0); 2026669Smckusick case CAT: 2036669Smckusick if (first(left(p)) == 0 && first(right(p)) == 0) return(0); 2046669Smckusick return(1); 2056669Smckusick case OR: 2066669Smckusick b = first(right(p)); 2076669Smckusick if (first(left(p)) == 0 || b == 0) return(0); 2086669Smckusick return(1); 2096669Smckusick } 2106669Smckusick error(FATAL, "unknown type %d in first\n", type(p)); 2116669Smckusick return(-1); 2126669Smckusick } 2136669Smckusick 2146669Smckusick follow(v) 2156669Smckusick node *v; /* collects leaves that can follow v into setvec */ 2166669Smckusick { 2176669Smckusick node *p; 2186669Smckusick 2196669Smckusick if (type(v) == FINAL) 2206669Smckusick return; 2216669Smckusick p = parent(v); 2226669Smckusick switch (type(p)) { 2236669Smckusick case STAR: 2246669Smckusick case PLUS: first(v); 2256669Smckusick follow(p); 2266669Smckusick return; 2276669Smckusick 2286669Smckusick case OR: 2296669Smckusick case QUEST: follow(p); 2306669Smckusick return; 2316669Smckusick 2326669Smckusick case CAT: if (v == left(p)) { /* v is left child of p */ 2336669Smckusick if (first(right(p)) == 0) { 2346669Smckusick follow(p); 2356669Smckusick return; 2366669Smckusick } 2376669Smckusick } 2386669Smckusick else /* v is right child */ 2396669Smckusick follow(p); 2406669Smckusick return; 2416669Smckusick case FINAL: if (setvec[line] != 1) { 2426669Smckusick setvec[line] = 1; 2436669Smckusick setcnt++; 2446669Smckusick } 2456669Smckusick return; 2466669Smckusick } 2476669Smckusick } 2486669Smckusick 2496669Smckusick member(c, s) /* is c in s? */ 2506669Smckusick register char c, *s; 2516669Smckusick { 2526669Smckusick while (*s) 2536669Smckusick if (c == *s++) 2546669Smckusick return(1); 2556669Smckusick return(0); 2566669Smckusick } 2576669Smckusick 2586669Smckusick notin(array, n, prev) /* is setvec in array[0] thru array[n]? */ 2596669Smckusick int **array; 2606669Smckusick int *prev; { 2616669Smckusick register i, j; 2626669Smckusick int *ptr; 2636669Smckusick for (i=0; i<=n; i++) { 2646669Smckusick ptr = array[i]; 2656669Smckusick if (*ptr == setcnt) { 2666669Smckusick for (j=0; j < setcnt; j++) 2676669Smckusick if (setvec[*(++ptr)] != 1) goto nxt; 2686669Smckusick *prev = i; 2696669Smckusick return(0); 2706669Smckusick } 2716669Smckusick nxt: ; 2726669Smckusick } 2736669Smckusick return(1); 2746669Smckusick } 2756669Smckusick 2766669Smckusick int *add(n) { /* remember setvec */ 2776669Smckusick int *ptr, *p; 2786669Smckusick register i; 2796669Smckusick if ((p = ptr = (int *) malloc((n+1)*sizeof(int))) == NULL) 2806669Smckusick overflo(); 2816669Smckusick *ptr = n; 2826669Smckusick dprintf("add(%d)\n", n, NULL, NULL); 2836669Smckusick for (i=1; i <= line; i++) 2846669Smckusick if (setvec[i] == 1) { 2856669Smckusick *(++ptr) = i; 2866669Smckusick dprintf(" ptr = %o, *ptr = %d, i = %d\n", ptr, *ptr, i); 2876669Smckusick } 2886669Smckusick dprintf("\n", NULL, NULL, NULL); 2896669Smckusick return(p); 2906669Smckusick } 2916669Smckusick 2926669Smckusick struct fa *cgotofn() 2936669Smckusick { 2946669Smckusick register i, k; 2956669Smckusick register int *ptr; 2966669Smckusick char c; 2976669Smckusick char *p; 2986669Smckusick node *cp; 2996669Smckusick int j, n, s, ind, numtrans; 3006669Smckusick int finflg; 3016669Smckusick int curpos, num, prev; 3026669Smckusick struct fa *where[NSTATES]; 3036669Smckusick 3046669Smckusick int fatab[257]; 3056669Smckusick struct fa *pfa; 3066669Smckusick 3076669Smckusick char index[MAXLIN]; 3086669Smckusick char iposns[MAXLIN]; 3096669Smckusick int sposns[MAXLIN]; 3106669Smckusick int spmax, spinit; 3116669Smckusick 3126669Smckusick char symbol[NCHARS]; 3136669Smckusick char isyms[NCHARS]; 3146669Smckusick char ssyms[NCHARS]; 3156669Smckusick int ssmax, ssinit; 3166669Smckusick 3176669Smckusick for (i=0; i<=line; i++) index[i] = iposns[i] = setvec[i] = 0; 3186669Smckusick for (i=0; i<NCHARS; i++) isyms[i] = symbol[i] = 0; 3196669Smckusick setcnt = 0; 3206669Smckusick /* compute initial positions and symbols of state 0 */ 3216669Smckusick ssmax = 0; 3226669Smckusick ptr = state[0] = foll[0]; 3236669Smckusick spinit = *ptr; 3246669Smckusick for (i=0; i<spinit; i++) { 3256669Smckusick curpos = *(++ptr); 3266669Smckusick sposns[i] = curpos; 3276669Smckusick iposns[curpos] = 1; 3286669Smckusick cp = point[curpos]; 3296669Smckusick dprintf("i = %d, spinit = %d, curpos = %d\n", i, spinit, curpos); 3306669Smckusick switch (type(cp)) { 3316669Smckusick case CHAR: 3326669Smckusick k = (int) right(cp); 3336669Smckusick if (isyms[k] != 1) { 3346669Smckusick isyms[k] = 1; 3356669Smckusick ssyms[ssmax++] = k; 3366669Smckusick } 3376669Smckusick break; 3386669Smckusick case DOT: 3396669Smckusick for (k=1; k<NCHARS; k++) { 3406669Smckusick if (k != HAT) { 3416669Smckusick if (isyms[k] != 1) { 3426669Smckusick isyms[k] = 1; 3436669Smckusick ssyms[ssmax++] = k; 3446669Smckusick } 3456669Smckusick } 3466669Smckusick } 3476669Smckusick break; 3486669Smckusick case CCL: 3496669Smckusick for (p = (char *) right(cp); *p; p++) { 3506669Smckusick if (*p != HAT) { 3516669Smckusick if (isyms[*p] != 1) { 3526669Smckusick isyms[*p] = 1; 3536669Smckusick ssyms[ssmax++] = *p; 3546669Smckusick } 3556669Smckusick } 3566669Smckusick } 3576669Smckusick break; 3586669Smckusick case NCCL: 3596669Smckusick for (k=1; k<NCHARS; k++) { 3606669Smckusick if (k != HAT && !member(k, (char *) right(cp))) { 3616669Smckusick if (isyms[k] != 1) { 3626669Smckusick isyms[k] = 1; 3636669Smckusick ssyms[ssmax++] = k; 3646669Smckusick } 3656669Smckusick } 3666669Smckusick } 3676669Smckusick } 3686669Smckusick } 3696669Smckusick ssinit = ssmax; 3706669Smckusick n = 0; 3716669Smckusick for (s=0; s<=n; s++) { 3726669Smckusick dprintf("s = %d\n", s, NULL, NULL); 3736669Smckusick ind = 0; 3746669Smckusick numtrans = 0; 3756669Smckusick finflg = 0; 3766669Smckusick if (*(state[s] + *state[s]) == line) { /* s final? */ 3776669Smckusick finflg = 1; 3786669Smckusick goto tenter; 3796669Smckusick } 3806669Smckusick spmax = spinit; 3816669Smckusick ssmax = ssinit; 3826669Smckusick ptr = state[s]; 3836669Smckusick num = *ptr; 3846669Smckusick for (i=0; i<num; i++) { 3856669Smckusick curpos = *(++ptr); 3866669Smckusick if (iposns[curpos] != 1 && index[curpos] != 1) { 3876669Smckusick index[curpos] = 1; 3886669Smckusick sposns[spmax++] = curpos; 3896669Smckusick } 3906669Smckusick cp = point[curpos]; 3916669Smckusick switch (type(cp)) { 3926669Smckusick case CHAR: 3936669Smckusick k = (int) right(cp); 3946669Smckusick if (isyms[k] == 0 && symbol[k] == 0) { 3956669Smckusick symbol[k] = 1; 3966669Smckusick ssyms[ssmax++] = k; 3976669Smckusick } 3986669Smckusick break; 3996669Smckusick case DOT: 4006669Smckusick for (k=1; k<NCHARS; k++) { 4016669Smckusick if (k != HAT) { 4026669Smckusick if (isyms[k] == 0 && symbol[k] == 0) { 4036669Smckusick symbol[k] = 1; 4046669Smckusick ssyms[ssmax++] = k; 4056669Smckusick } 4066669Smckusick } 4076669Smckusick } 4086669Smckusick break; 4096669Smckusick case CCL: 4106669Smckusick for (p = (char *) right(cp); *p; p++) { 4116669Smckusick if (*p != HAT) { 4126669Smckusick if (isyms[*p] == 0 && symbol[*p] == 0) { 4136669Smckusick symbol[*p] = 1; 4146669Smckusick ssyms[ssmax++] = *p; 4156669Smckusick } 4166669Smckusick } 4176669Smckusick } 4186669Smckusick break; 4196669Smckusick case NCCL: 4206669Smckusick for (k=1; k<NCHARS; k++) { 4216669Smckusick if (k != HAT && !member(k, (char *) right(cp))) { 4226669Smckusick if (isyms[k] == 0 && symbol[k] == 0) { 4236669Smckusick symbol[k] = 1; 4246669Smckusick ssyms[ssmax++] = k; 4256669Smckusick } 4266669Smckusick } 4276669Smckusick } 4286669Smckusick } 4296669Smckusick } 4306669Smckusick for (j=0; j<ssmax; j++) { /* nextstate(s, ssyms[j]) */ 4316669Smckusick c = ssyms[j]; 4326669Smckusick symbol[c] = 0; 4336669Smckusick setcnt = 0; 4346669Smckusick for (k=0; k<=line; k++) setvec[k] = 0; 4356669Smckusick for (i=0; i<spmax; i++) { 4366669Smckusick index[sposns[i]] = 0; 4376669Smckusick cp = point[sposns[i]]; 4386669Smckusick if ((k = type(cp)) != FINAL) 4396669Smckusick if (k == CHAR && c == (int) right(cp) 4406669Smckusick || k == DOT 4416669Smckusick || k == CCL && member(c, (char *) right(cp)) 4426669Smckusick || k == NCCL && !member(c, (char *) right(cp))) { 4436669Smckusick ptr = foll[sposns[i]]; 4446669Smckusick num = *ptr; 4456669Smckusick for (k=0; k<num; k++) { 4466669Smckusick if (setvec[*(++ptr)] != 1 4476669Smckusick && iposns[*ptr] != 1) { 4486669Smckusick setvec[*ptr] = 1; 4496669Smckusick setcnt++; 4506669Smckusick } 4516669Smckusick } 4526669Smckusick } 4536669Smckusick } /* end nextstate */ 4546669Smckusick if (notin(state, n, &prev)) { 4556669Smckusick if (n >= NSTATES) { 4566669Smckusick dprintf("cgotofn: notin; state = %d, n = %d\n", state, n, NULL); 4576669Smckusick overflo(); 4586669Smckusick } 4596669Smckusick state[++n] = add(setcnt); 4606669Smckusick dprintf(" delta(%d,%o) = %d", s,c,n); 4616669Smckusick dprintf(", ind = %d\n", ind+1, NULL, NULL); 4626669Smckusick fatab[++ind] = c; 4636669Smckusick fatab[++ind] = n; 4646669Smckusick numtrans++; 4656669Smckusick } 4666669Smckusick else { 4676669Smckusick if (prev != 0) { 4686669Smckusick dprintf(" delta(%d,%o) = %d", s,c,prev); 4696669Smckusick dprintf(", ind = %d\n", ind+1, NULL, NULL); 4706669Smckusick fatab[++ind] = c; 4716669Smckusick fatab[++ind] = prev; 4726669Smckusick numtrans++; 4736669Smckusick } 4746669Smckusick } 4756669Smckusick } 4766669Smckusick tenter: 4776669Smckusick if ((pfa = (struct fa *) malloc((numtrans + 1) * sizeof(struct fa))) == NULL) 4786669Smckusick overflo(); 4796669Smckusick where[s] = pfa; 4806669Smckusick if (finflg) 4816669Smckusick pfa->cch = -1; /* s is a final state */ 4826669Smckusick else 4836669Smckusick pfa->cch = numtrans; 4846669Smckusick pfa->st = 0; 4856669Smckusick for (i=1, pfa += 1; i<=numtrans; i++, pfa++) { 4866669Smckusick pfa->cch = fatab[2*i-1]; 4876669Smckusick pfa->st = (struct fa *) fatab[2*i]; 4886669Smckusick } 4896669Smckusick } 4906669Smckusick for (i=0; i<=n; i++) { 4916669Smckusick xfree(state[i]); /* free state[i] */ 4926669Smckusick pfa = where[i]; 4936669Smckusick pfa->st = where[0]; 4946669Smckusick dprintf("state %d: (%o)\n", i, pfa, NULL); 4956669Smckusick dprintf(" numtrans = %d, default = %o\n", pfa->cch, pfa->st, NULL); 4966669Smckusick for (k=1; k<=pfa->cch; k++) { 4976669Smckusick (pfa+k)->st = where[ (int) (pfa+k)->st]; 4986669Smckusick dprintf(" char = %o, nextstate = %o\n",(pfa+k)->cch, (pfa+k)->st, NULL); 4996669Smckusick } 5006669Smckusick } 5016669Smckusick pfa = where[0]; 5026669Smckusick if ((num = pfa->cch) < 0) 5036669Smckusick return(where[0]); 5046669Smckusick for (pfa += num; num; num--, pfa--) 5056669Smckusick if (pfa->cch == HAT) { 5066669Smckusick return(pfa->st); 5076669Smckusick } 5086669Smckusick return(where[0]); 5096669Smckusick } 5106669Smckusick 5116669Smckusick match(pfa, p) 5126669Smckusick register struct fa *pfa; 5136669Smckusick register char *p; 5146669Smckusick { 5156669Smckusick register count; 5166669Smckusick char c; 5176669Smckusick if (p == 0) return(0); 5186669Smckusick if (pfa->cch == 1) { /* fast test for first character, if possible */ 5196669Smckusick c = (++pfa)->cch; 5206669Smckusick do 5216669Smckusick if (c == *p) { 5226669Smckusick p++; 5236669Smckusick pfa = pfa->st; 5246669Smckusick goto adv; 5256669Smckusick } 5266669Smckusick while (*p++ != 0); 5276669Smckusick return(0); 5286669Smckusick } 5296669Smckusick adv: if ((count = pfa->cch) < 0) return(1); 5306669Smckusick do { 5316669Smckusick for (pfa += count; count; count--, pfa--) 5326669Smckusick if (pfa->cch == *p) { 5336669Smckusick break; 5346669Smckusick } 5356669Smckusick pfa = pfa->st; 5366669Smckusick if ((count = pfa->cch) < 0) return(1); 5376669Smckusick } while(*p++ != 0); 5386669Smckusick return(0); 5396669Smckusick } 540