114472Ssam #ifndef lint 2*29543Smckusick static char sccsid[] = "@(#)b.c 4.3 07/07/86"; 314472Ssam #endif 46669Smckusick 56669Smckusick #include "awk.def" 66669Smckusick #include "stdio.h" 76669Smckusick #include "awk.h" 86669Smckusick 96669Smckusick extern node *op2(); 106669Smckusick extern struct fa *cgotofn(); 116669Smckusick #define MAXLIN 256 126669Smckusick #define NCHARS 128 136669Smckusick #define NSTATES 256 146669Smckusick 156669Smckusick #define type(v) v->nobj 166669Smckusick #define left(v) v->narg[0] 176669Smckusick #define right(v) v->narg[1] 186669Smckusick #define parent(v) v->nnext 196669Smckusick 206669Smckusick #define LEAF case CCL: case NCCL: case CHAR: case DOT: 216669Smckusick #define UNARY case FINAL: case STAR: case PLUS: case QUEST: 226669Smckusick 236669Smckusick /* encoding in tree nodes: 246669Smckusick leaf (CCL, NCCL, CHAR, DOT): left is index, right contains value or pointer to value 256669Smckusick unary (FINAL, STAR, PLUS, QUEST): left is child, right is null 266669Smckusick binary (CAT, OR): left and right are children 276669Smckusick parent contains pointer to parent 286669Smckusick */ 296669Smckusick 306669Smckusick struct fa { 316669Smckusick int cch; 326669Smckusick struct fa *st; 336669Smckusick }; 346669Smckusick 356669Smckusick int *state[NSTATES]; 366669Smckusick int *foll[MAXLIN]; 376669Smckusick char chars[MAXLIN]; 386669Smckusick int setvec[MAXLIN]; 396669Smckusick node *point[MAXLIN]; 406669Smckusick 416669Smckusick int setcnt; 426669Smckusick int line; 43*29543Smckusick int maxfoll; /* index of highest foll[] entry set by cfoll() */ 446669Smckusick 456669Smckusick 466669Smckusick struct fa *makedfa(p) /* returns dfa for tree pointed to by p */ 476669Smckusick node *p; 486669Smckusick { 496669Smckusick node *p1; 506669Smckusick struct fa *fap; 516669Smckusick p1 = op2(CAT, op2(STAR, op2(DOT, (node *) 0, (node *) 0), (node *) 0), p); 526669Smckusick /* put DOT STAR in front of reg. exp. */ 536669Smckusick p1 = op2(FINAL, p1, (node *) 0); /* install FINAL node */ 546669Smckusick 556669Smckusick line = 0; 566669Smckusick penter(p1); /* enter parent pointers and leaf indices */ 576669Smckusick point[line] = p1; /* FINAL node */ 586669Smckusick setvec[0] = 1; /* for initial DOT STAR */ 596669Smckusick cfoll(p1); /* set up follow sets */ 606669Smckusick fap = cgotofn(); 616669Smckusick freetr(p1); /* add this when alloc works */ 626669Smckusick return(fap); 636669Smckusick } 646669Smckusick 656669Smckusick penter(p) /* set up parent pointers and leaf indices */ 666669Smckusick node *p; 676669Smckusick { 686669Smckusick switch(type(p)) { 696669Smckusick LEAF 706669Smckusick left(p) = (node *) line; 716669Smckusick point[line++] = p; 726669Smckusick break; 736669Smckusick UNARY 746669Smckusick penter(left(p)); 756669Smckusick parent(left(p)) = p; 766669Smckusick break; 776669Smckusick case CAT: 786669Smckusick case OR: 796669Smckusick penter(left(p)); 806669Smckusick penter(right(p)); 816669Smckusick parent(left(p)) = p; 826669Smckusick parent(right(p)) = p; 836669Smckusick break; 846669Smckusick default: 856669Smckusick error(FATAL, "unknown type %d in penter\n", type(p)); 866669Smckusick break; 876669Smckusick } 886669Smckusick } 896669Smckusick 906669Smckusick freetr(p) /* free parse tree and follow sets */ 916669Smckusick node *p; 926669Smckusick { 936669Smckusick switch(type(p)) { 946669Smckusick LEAF 95*29543Smckusick foll_free((int) left(p)); 966669Smckusick xfree(p); 976669Smckusick break; 986669Smckusick UNARY 996669Smckusick freetr(left(p)); 1006669Smckusick xfree(p); 1016669Smckusick break; 1026669Smckusick case CAT: 1036669Smckusick case OR: 1046669Smckusick freetr(left(p)); 1056669Smckusick freetr(right(p)); 1066669Smckusick xfree(p); 1076669Smckusick break; 1086669Smckusick default: 1096669Smckusick error(FATAL, "unknown type %d in freetr", type(p)); 1106669Smckusick break; 1116669Smckusick } 1126669Smckusick } 1136669Smckusick char *cclenter(p) 1146669Smckusick register char *p; 1156669Smckusick { 1166669Smckusick register i, c; 1176669Smckusick char *op; 1186669Smckusick 1196669Smckusick op = p; 1206669Smckusick i = 0; 1216669Smckusick while ((c = *p++) != 0) { 1226669Smckusick if (c == '-' && i > 0 && chars[i-1] != 0) { 1236669Smckusick if (*p != 0) { 1246669Smckusick c = chars[i-1]; 1256669Smckusick while (c < *p) { 1266669Smckusick if (i >= MAXLIN) 1276669Smckusick overflo(); 1286669Smckusick chars[i++] = ++c; 1296669Smckusick } 1306669Smckusick p++; 1316669Smckusick continue; 1326669Smckusick } 1336669Smckusick } 1346669Smckusick if (i >= MAXLIN) 1356669Smckusick overflo(); 1366669Smckusick chars[i++] = c; 1376669Smckusick } 1386669Smckusick chars[i++] = '\0'; 1396669Smckusick dprintf("cclenter: in = |%s|, out = |%s|\n", op, chars, NULL); 1406669Smckusick xfree(op); 1416669Smckusick return(tostring(chars)); 1426669Smckusick } 1436669Smckusick 1446669Smckusick overflo() 1456669Smckusick { 1466669Smckusick error(FATAL, "regular expression too long\n"); 1476669Smckusick } 1486669Smckusick 1496669Smckusick cfoll(v) /* enter follow set of each leaf of vertex v into foll[leaf] */ 1506669Smckusick register node *v; 1516669Smckusick { 1526669Smckusick register i; 1536669Smckusick int prev; 1546669Smckusick int *add(); 1556669Smckusick 156*29543Smckusick maxfoll = -1; 1576669Smckusick switch(type(v)) { 1586669Smckusick LEAF 1596669Smckusick setcnt = 0; 1606669Smckusick for (i=1; i<=line; i++) 1616669Smckusick setvec[i] = 0; 1626669Smckusick follow(v); 1636669Smckusick if (notin(foll, ( (int) left(v))-1, &prev)) { 1646669Smckusick foll[(int) left(v)] = add(setcnt); 1656669Smckusick } 1666669Smckusick else 1676669Smckusick foll[ (int) left(v)] = foll[prev]; 168*29543Smckusick if ((int)left(v) > maxfoll) 169*29543Smckusick maxfoll = (int)left(v); 1706669Smckusick break; 1716669Smckusick UNARY 1726669Smckusick cfoll(left(v)); 1736669Smckusick break; 1746669Smckusick case CAT: 1756669Smckusick case OR: 1766669Smckusick cfoll(left(v)); 1776669Smckusick cfoll(right(v)); 1786669Smckusick break; 1796669Smckusick default: 1806669Smckusick error(FATAL, "unknown type %d in cfoll", type(v)); 1816669Smckusick } 1826669Smckusick } 1836669Smckusick 1846669Smckusick first(p) /* collects initially active leaves of p into setvec */ 1856669Smckusick register node *p; /* returns 0 or 1 depending on whether p matches empty string */ 1866669Smckusick { 1876669Smckusick register b; 1886669Smckusick 1896669Smckusick switch(type(p)) { 1906669Smckusick LEAF 1916669Smckusick if (setvec[(int) left(p)] != 1) { 1926669Smckusick setvec[(int) left(p)] = 1; 1936669Smckusick setcnt++; 1946669Smckusick } 1956669Smckusick if (type(p) == CCL && (*(char *) right(p)) == '\0') 1966669Smckusick return(0); /* empty CCL */ 1976669Smckusick else return(1); 1986669Smckusick case FINAL: 1996669Smckusick case PLUS: 2006669Smckusick if (first(left(p)) == 0) return(0); 2016669Smckusick return(1); 2026669Smckusick case STAR: 2036669Smckusick case QUEST: 2046669Smckusick first(left(p)); 2056669Smckusick return(0); 2066669Smckusick case CAT: 2076669Smckusick if (first(left(p)) == 0 && first(right(p)) == 0) return(0); 2086669Smckusick return(1); 2096669Smckusick case OR: 2106669Smckusick b = first(right(p)); 2116669Smckusick if (first(left(p)) == 0 || b == 0) return(0); 2126669Smckusick return(1); 2136669Smckusick } 2146669Smckusick error(FATAL, "unknown type %d in first\n", type(p)); 2156669Smckusick return(-1); 2166669Smckusick } 2176669Smckusick 2186669Smckusick follow(v) 2196669Smckusick node *v; /* collects leaves that can follow v into setvec */ 2206669Smckusick { 2216669Smckusick node *p; 2226669Smckusick 2236669Smckusick if (type(v) == FINAL) 2246669Smckusick return; 2256669Smckusick p = parent(v); 2266669Smckusick switch (type(p)) { 2276669Smckusick case STAR: 2286669Smckusick case PLUS: first(v); 2296669Smckusick follow(p); 2306669Smckusick return; 2316669Smckusick 2326669Smckusick case OR: 2336669Smckusick case QUEST: follow(p); 2346669Smckusick return; 2356669Smckusick 2366669Smckusick case CAT: if (v == left(p)) { /* v is left child of p */ 2376669Smckusick if (first(right(p)) == 0) { 2386669Smckusick follow(p); 2396669Smckusick return; 2406669Smckusick } 2416669Smckusick } 2426669Smckusick else /* v is right child */ 2436669Smckusick follow(p); 2446669Smckusick return; 2456669Smckusick case FINAL: if (setvec[line] != 1) { 2466669Smckusick setvec[line] = 1; 2476669Smckusick setcnt++; 2486669Smckusick } 2496669Smckusick return; 2506669Smckusick } 2516669Smckusick } 2526669Smckusick 2536669Smckusick member(c, s) /* is c in s? */ 2546669Smckusick register char c, *s; 2556669Smckusick { 2566669Smckusick while (*s) 2576669Smckusick if (c == *s++) 2586669Smckusick return(1); 2596669Smckusick return(0); 2606669Smckusick } 2616669Smckusick 2626669Smckusick notin(array, n, prev) /* is setvec in array[0] thru array[n]? */ 2636669Smckusick int **array; 2646669Smckusick int *prev; { 2656669Smckusick register i, j; 2666669Smckusick int *ptr; 2676669Smckusick for (i=0; i<=n; i++) { 2686669Smckusick ptr = array[i]; 2696669Smckusick if (*ptr == setcnt) { 2706669Smckusick for (j=0; j < setcnt; j++) 2716669Smckusick if (setvec[*(++ptr)] != 1) goto nxt; 2726669Smckusick *prev = i; 2736669Smckusick return(0); 2746669Smckusick } 2756669Smckusick nxt: ; 2766669Smckusick } 2776669Smckusick return(1); 2786669Smckusick } 2796669Smckusick 2806669Smckusick int *add(n) { /* remember setvec */ 2816669Smckusick int *ptr, *p; 2826669Smckusick register i; 2836669Smckusick if ((p = ptr = (int *) malloc((n+1)*sizeof(int))) == NULL) 2846669Smckusick overflo(); 2856669Smckusick *ptr = n; 2866669Smckusick dprintf("add(%d)\n", n, NULL, NULL); 2876669Smckusick for (i=1; i <= line; i++) 2886669Smckusick if (setvec[i] == 1) { 2896669Smckusick *(++ptr) = i; 2906669Smckusick dprintf(" ptr = %o, *ptr = %d, i = %d\n", ptr, *ptr, i); 2916669Smckusick } 2926669Smckusick dprintf("\n", NULL, NULL, NULL); 2936669Smckusick return(p); 2946669Smckusick } 2956669Smckusick 2966669Smckusick struct fa *cgotofn() 2976669Smckusick { 2986669Smckusick register i, k; 2996669Smckusick register int *ptr; 3006669Smckusick char c; 3016669Smckusick char *p; 3026669Smckusick node *cp; 3036669Smckusick int j, n, s, ind, numtrans; 3046669Smckusick int finflg; 3056669Smckusick int curpos, num, prev; 3066669Smckusick struct fa *where[NSTATES]; 3076669Smckusick 3086669Smckusick int fatab[257]; 3096669Smckusick struct fa *pfa; 3106669Smckusick 3116669Smckusick char index[MAXLIN]; 3126669Smckusick char iposns[MAXLIN]; 3136669Smckusick int sposns[MAXLIN]; 3146669Smckusick int spmax, spinit; 3156669Smckusick 3166669Smckusick char symbol[NCHARS]; 3176669Smckusick char isyms[NCHARS]; 3186669Smckusick char ssyms[NCHARS]; 3196669Smckusick int ssmax, ssinit; 3206669Smckusick 3216669Smckusick for (i=0; i<=line; i++) index[i] = iposns[i] = setvec[i] = 0; 3226669Smckusick for (i=0; i<NCHARS; i++) isyms[i] = symbol[i] = 0; 3236669Smckusick setcnt = 0; 3246669Smckusick /* compute initial positions and symbols of state 0 */ 3256669Smckusick ssmax = 0; 3266669Smckusick ptr = state[0] = foll[0]; 3276669Smckusick spinit = *ptr; 3286669Smckusick for (i=0; i<spinit; i++) { 3296669Smckusick curpos = *(++ptr); 3306669Smckusick sposns[i] = curpos; 3316669Smckusick iposns[curpos] = 1; 3326669Smckusick cp = point[curpos]; 3336669Smckusick dprintf("i = %d, spinit = %d, curpos = %d\n", i, spinit, curpos); 3346669Smckusick switch (type(cp)) { 3356669Smckusick case CHAR: 3366669Smckusick k = (int) right(cp); 3376669Smckusick if (isyms[k] != 1) { 3386669Smckusick isyms[k] = 1; 3396669Smckusick ssyms[ssmax++] = k; 3406669Smckusick } 3416669Smckusick break; 3426669Smckusick case DOT: 3436669Smckusick for (k=1; k<NCHARS; k++) { 3446669Smckusick if (k != HAT) { 3456669Smckusick if (isyms[k] != 1) { 3466669Smckusick isyms[k] = 1; 3476669Smckusick ssyms[ssmax++] = k; 3486669Smckusick } 3496669Smckusick } 3506669Smckusick } 3516669Smckusick break; 3526669Smckusick case CCL: 3536669Smckusick for (p = (char *) right(cp); *p; p++) { 3546669Smckusick if (*p != HAT) { 3556669Smckusick if (isyms[*p] != 1) { 3566669Smckusick isyms[*p] = 1; 3576669Smckusick ssyms[ssmax++] = *p; 3586669Smckusick } 3596669Smckusick } 3606669Smckusick } 3616669Smckusick break; 3626669Smckusick case NCCL: 3636669Smckusick for (k=1; k<NCHARS; k++) { 3646669Smckusick if (k != HAT && !member(k, (char *) right(cp))) { 3656669Smckusick if (isyms[k] != 1) { 3666669Smckusick isyms[k] = 1; 3676669Smckusick ssyms[ssmax++] = k; 3686669Smckusick } 3696669Smckusick } 3706669Smckusick } 3716669Smckusick } 3726669Smckusick } 3736669Smckusick ssinit = ssmax; 3746669Smckusick n = 0; 3756669Smckusick for (s=0; s<=n; s++) { 3766669Smckusick dprintf("s = %d\n", s, NULL, NULL); 3776669Smckusick ind = 0; 3786669Smckusick numtrans = 0; 3796669Smckusick finflg = 0; 3806669Smckusick if (*(state[s] + *state[s]) == line) { /* s final? */ 3816669Smckusick finflg = 1; 3826669Smckusick goto tenter; 3836669Smckusick } 3846669Smckusick spmax = spinit; 3856669Smckusick ssmax = ssinit; 3866669Smckusick ptr = state[s]; 3876669Smckusick num = *ptr; 3886669Smckusick for (i=0; i<num; i++) { 3896669Smckusick curpos = *(++ptr); 3906669Smckusick if (iposns[curpos] != 1 && index[curpos] != 1) { 3916669Smckusick index[curpos] = 1; 3926669Smckusick sposns[spmax++] = curpos; 3936669Smckusick } 3946669Smckusick cp = point[curpos]; 3956669Smckusick switch (type(cp)) { 3966669Smckusick case CHAR: 3976669Smckusick k = (int) right(cp); 3986669Smckusick if (isyms[k] == 0 && symbol[k] == 0) { 3996669Smckusick symbol[k] = 1; 4006669Smckusick ssyms[ssmax++] = k; 4016669Smckusick } 4026669Smckusick break; 4036669Smckusick case DOT: 4046669Smckusick for (k=1; k<NCHARS; k++) { 4056669Smckusick if (k != HAT) { 4066669Smckusick if (isyms[k] == 0 && symbol[k] == 0) { 4076669Smckusick symbol[k] = 1; 4086669Smckusick ssyms[ssmax++] = k; 4096669Smckusick } 4106669Smckusick } 4116669Smckusick } 4126669Smckusick break; 4136669Smckusick case CCL: 4146669Smckusick for (p = (char *) right(cp); *p; p++) { 4156669Smckusick if (*p != HAT) { 4166669Smckusick if (isyms[*p] == 0 && symbol[*p] == 0) { 4176669Smckusick symbol[*p] = 1; 4186669Smckusick ssyms[ssmax++] = *p; 4196669Smckusick } 4206669Smckusick } 4216669Smckusick } 4226669Smckusick break; 4236669Smckusick case NCCL: 4246669Smckusick for (k=1; k<NCHARS; k++) { 4256669Smckusick if (k != HAT && !member(k, (char *) right(cp))) { 4266669Smckusick if (isyms[k] == 0 && symbol[k] == 0) { 4276669Smckusick symbol[k] = 1; 4286669Smckusick ssyms[ssmax++] = k; 4296669Smckusick } 4306669Smckusick } 4316669Smckusick } 4326669Smckusick } 4336669Smckusick } 4346669Smckusick for (j=0; j<ssmax; j++) { /* nextstate(s, ssyms[j]) */ 4356669Smckusick c = ssyms[j]; 4366669Smckusick symbol[c] = 0; 4376669Smckusick setcnt = 0; 4386669Smckusick for (k=0; k<=line; k++) setvec[k] = 0; 4396669Smckusick for (i=0; i<spmax; i++) { 4406669Smckusick index[sposns[i]] = 0; 4416669Smckusick cp = point[sposns[i]]; 4426669Smckusick if ((k = type(cp)) != FINAL) 4436669Smckusick if (k == CHAR && c == (int) right(cp) 4446669Smckusick || k == DOT 4456669Smckusick || k == CCL && member(c, (char *) right(cp)) 4466669Smckusick || k == NCCL && !member(c, (char *) right(cp))) { 4476669Smckusick ptr = foll[sposns[i]]; 4486669Smckusick num = *ptr; 4496669Smckusick for (k=0; k<num; k++) { 4506669Smckusick if (setvec[*(++ptr)] != 1 4516669Smckusick && iposns[*ptr] != 1) { 4526669Smckusick setvec[*ptr] = 1; 4536669Smckusick setcnt++; 4546669Smckusick } 4556669Smckusick } 4566669Smckusick } 4576669Smckusick } /* end nextstate */ 4586669Smckusick if (notin(state, n, &prev)) { 4596669Smckusick if (n >= NSTATES) { 4606669Smckusick dprintf("cgotofn: notin; state = %d, n = %d\n", state, n, NULL); 4616669Smckusick overflo(); 4626669Smckusick } 4636669Smckusick state[++n] = add(setcnt); 4646669Smckusick dprintf(" delta(%d,%o) = %d", s,c,n); 4656669Smckusick dprintf(", ind = %d\n", ind+1, NULL, NULL); 4666669Smckusick fatab[++ind] = c; 4676669Smckusick fatab[++ind] = n; 4686669Smckusick numtrans++; 4696669Smckusick } 4706669Smckusick else { 4716669Smckusick if (prev != 0) { 4726669Smckusick dprintf(" delta(%d,%o) = %d", s,c,prev); 4736669Smckusick dprintf(", ind = %d\n", ind+1, NULL, NULL); 4746669Smckusick fatab[++ind] = c; 4756669Smckusick fatab[++ind] = prev; 4766669Smckusick numtrans++; 4776669Smckusick } 4786669Smckusick } 4796669Smckusick } 4806669Smckusick tenter: 4816669Smckusick if ((pfa = (struct fa *) malloc((numtrans + 1) * sizeof(struct fa))) == NULL) 4826669Smckusick overflo(); 4836669Smckusick where[s] = pfa; 4846669Smckusick if (finflg) 4856669Smckusick pfa->cch = -1; /* s is a final state */ 4866669Smckusick else 4876669Smckusick pfa->cch = numtrans; 4886669Smckusick pfa->st = 0; 4896669Smckusick for (i=1, pfa += 1; i<=numtrans; i++, pfa++) { 4906669Smckusick pfa->cch = fatab[2*i-1]; 4916669Smckusick pfa->st = (struct fa *) fatab[2*i]; 4926669Smckusick } 4936669Smckusick } 4946669Smckusick for (i=0; i<=n; i++) { 495*29543Smckusick /* N.b. state[0] == foll[0], not separately allocated */ 496*29543Smckusick if (i>0) 497*29543Smckusick xfree(state[i]); /* free state[i] */ 4986669Smckusick pfa = where[i]; 4996669Smckusick pfa->st = where[0]; 5006669Smckusick dprintf("state %d: (%o)\n", i, pfa, NULL); 5016669Smckusick dprintf(" numtrans = %d, default = %o\n", pfa->cch, pfa->st, NULL); 5026669Smckusick for (k=1; k<=pfa->cch; k++) { 5036669Smckusick (pfa+k)->st = where[ (int) (pfa+k)->st]; 5046669Smckusick dprintf(" char = %o, nextstate = %o\n",(pfa+k)->cch, (pfa+k)->st, NULL); 5056669Smckusick } 5066669Smckusick } 5076669Smckusick pfa = where[0]; 5086669Smckusick if ((num = pfa->cch) < 0) 5096669Smckusick return(where[0]); 5106669Smckusick for (pfa += num; num; num--, pfa--) 5116669Smckusick if (pfa->cch == HAT) { 5126669Smckusick return(pfa->st); 5136669Smckusick } 5146669Smckusick return(where[0]); 5156669Smckusick } 5166669Smckusick 5176669Smckusick match(pfa, p) 5186669Smckusick register struct fa *pfa; 5196669Smckusick register char *p; 5206669Smckusick { 5216669Smckusick register count; 5226669Smckusick char c; 5236669Smckusick if (p == 0) return(0); 5246669Smckusick if (pfa->cch == 1) { /* fast test for first character, if possible */ 5256669Smckusick c = (++pfa)->cch; 5266669Smckusick do 5276669Smckusick if (c == *p) { 5286669Smckusick p++; 5296669Smckusick pfa = pfa->st; 5306669Smckusick goto adv; 5316669Smckusick } 5326669Smckusick while (*p++ != 0); 5336669Smckusick return(0); 5346669Smckusick } 5356669Smckusick adv: if ((count = pfa->cch) < 0) return(1); 5366669Smckusick do { 5376669Smckusick for (pfa += count; count; count--, pfa--) 5386669Smckusick if (pfa->cch == *p) { 5396669Smckusick break; 5406669Smckusick } 5416669Smckusick pfa = pfa->st; 5426669Smckusick if ((count = pfa->cch) < 0) return(1); 5436669Smckusick } while(*p++ != 0); 5446669Smckusick return(0); 5456669Smckusick } 546*29543Smckusick 547*29543Smckusick /* 548*29543Smckusick * Free foll[i], taking into account identical foll[] entries. 549*29543Smckusick * This is necessary because cfoll() uses the same physical follow set for 550*29543Smckusick * several foll[] entries when the set is identical. Called by freetr(). 551*29543Smckusick */ 552*29543Smckusick foll_free(i) 553*29543Smckusick int i; 554*29543Smckusick { 555*29543Smckusick register int j; 556*29543Smckusick int *p = foll[i]; 557*29543Smckusick if (p==NULL) return; 558*29543Smckusick for (j=0; j<=maxfoll; j++) 559*29543Smckusick if (foll[j]==p) foll[j]=NULL; 560*29543Smckusick xfree(p); 561*29543Smckusick } 562