xref: /csrg-svn/old/awk/b.c (revision 29543)
1 #ifndef lint
2 static char sccsid[] = "@(#)b.c	4.3 07/07/86";
3 #endif
4 
5 #include "awk.def"
6 #include "stdio.h"
7 #include "awk.h"
8 
9 extern node *op2();
10 extern struct fa *cgotofn();
11 #define MAXLIN 256
12 #define NCHARS 128
13 #define NSTATES 256
14 
15 #define type(v)	v->nobj
16 #define left(v)	v->narg[0]
17 #define right(v)	v->narg[1]
18 #define parent(v)	v->nnext
19 
20 #define LEAF	case CCL: case NCCL: case CHAR: case DOT:
21 #define UNARY	case FINAL: case STAR: case PLUS: case QUEST:
22 
23 /* encoding in tree nodes:
24 	leaf (CCL, NCCL, CHAR, DOT): left is index, right contains value or pointer to value
25 	unary (FINAL, STAR, PLUS, QUEST): left is child, right is null
26 	binary (CAT, OR): left and right are children
27 	parent contains pointer to parent
28 */
29 
30 struct fa {
31 	int cch;
32 	struct fa *st;
33 };
34 
35 int	*state[NSTATES];
36 int	*foll[MAXLIN];
37 char	chars[MAXLIN];
38 int	setvec[MAXLIN];
39 node	*point[MAXLIN];
40 
41 int	setcnt;
42 int	line;
43 int	maxfoll;  /* index of highest foll[] entry set by cfoll() */
44 
45 
46 struct fa *makedfa(p)	/* returns dfa for tree pointed to by p */
47 node *p;
48 {
49 	node *p1;
50 	struct fa *fap;
51 	p1 = op2(CAT, op2(STAR, op2(DOT, (node *) 0, (node *) 0), (node *) 0), p);
52 		/* put DOT STAR in front of reg. exp. */
53 	p1 = op2(FINAL, p1, (node *) 0);		/* install FINAL node */
54 
55 	line = 0;
56 	penter(p1);	/* enter parent pointers and leaf indices */
57 	point[line] = p1;	/* FINAL node */
58 	setvec[0] = 1;		/* for initial DOT STAR */
59 	cfoll(p1);	/* set up follow sets */
60 	fap = cgotofn();
61 	freetr(p1);	/* add this when alloc works */
62 	return(fap);
63 }
64 
65 penter(p)	/* set up parent pointers and leaf indices */
66 node *p;
67 {
68 	switch(type(p)) {
69 		LEAF
70 			left(p) = (node *) line;
71 			point[line++] = p;
72 			break;
73 		UNARY
74 			penter(left(p));
75 			parent(left(p)) = p;
76 			break;
77 		case CAT:
78 		case OR:
79 			penter(left(p));
80 			penter(right(p));
81 			parent(left(p)) = p;
82 			parent(right(p)) = p;
83 			break;
84 		default:
85 			error(FATAL, "unknown type %d in penter\n", type(p));
86 			break;
87 	}
88 }
89 
90 freetr(p)	/* free parse tree and follow sets */
91 node *p;
92 {
93 	switch(type(p)) {
94 		LEAF
95 			foll_free((int) left(p));
96 			xfree(p);
97 			break;
98 		UNARY
99 			freetr(left(p));
100 			xfree(p);
101 			break;
102 		case CAT:
103 		case OR:
104 			freetr(left(p));
105 			freetr(right(p));
106 			xfree(p);
107 			break;
108 		default:
109 			error(FATAL, "unknown type %d in freetr", type(p));
110 			break;
111 	}
112 }
113 char *cclenter(p)
114 register char *p;
115 {
116 	register i, c;
117 	char *op;
118 
119 	op = p;
120 	i = 0;
121 	while ((c = *p++) != 0) {
122 		if (c == '-' && i > 0 && chars[i-1] != 0) {
123 			if (*p != 0) {
124 				c = chars[i-1];
125 				while (c < *p) {
126 					if (i >= MAXLIN)
127 						overflo();
128 					chars[i++] = ++c;
129 				}
130 				p++;
131 				continue;
132 			}
133 		}
134 		if (i >= MAXLIN)
135 			overflo();
136 		chars[i++] = c;
137 	}
138 	chars[i++] = '\0';
139 	dprintf("cclenter: in = |%s|, out = |%s|\n", op, chars, NULL);
140 	xfree(op);
141 	return(tostring(chars));
142 }
143 
144 overflo()
145 {
146 	error(FATAL, "regular expression too long\n");
147 }
148 
149 cfoll(v)		/* enter follow set of each leaf of vertex v into foll[leaf] */
150 register node *v;
151 {
152 	register i;
153 	int prev;
154 	int *add();
155 
156 	maxfoll = -1;
157 	switch(type(v)) {
158 		LEAF
159 			setcnt = 0;
160 			for (i=1; i<=line; i++)
161 				setvec[i] = 0;
162 			follow(v);
163 			if (notin(foll, ( (int) left(v))-1, &prev)) {
164 				foll[(int) left(v)] = add(setcnt);
165 			}
166 			else
167 				foll[ (int) left(v)] = foll[prev];
168 			if ((int)left(v) > maxfoll)
169 				maxfoll = (int)left(v);
170 			break;
171 		UNARY
172 			cfoll(left(v));
173 			break;
174 		case CAT:
175 		case OR:
176 			cfoll(left(v));
177 			cfoll(right(v));
178 			break;
179 		default:
180 			error(FATAL, "unknown type %d in cfoll", type(v));
181 	}
182 }
183 
184 first(p)			/* collects initially active leaves of p into setvec */
185 register node *p;		/* returns 0 or 1 depending on whether p matches empty string */
186 {
187 	register b;
188 
189 	switch(type(p)) {
190 		LEAF
191 			if (setvec[(int) left(p)] != 1) {
192 				setvec[(int) left(p)] = 1;
193 				setcnt++;
194 			}
195 			if (type(p) == CCL && (*(char *) right(p)) == '\0')
196 				return(0);		/* empty CCL */
197 			else return(1);
198 		case FINAL:
199 		case PLUS:
200 			if (first(left(p)) == 0) return(0);
201 			return(1);
202 		case STAR:
203 		case QUEST:
204 			first(left(p));
205 			return(0);
206 		case CAT:
207 			if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
208 			return(1);
209 		case OR:
210 			b = first(right(p));
211 			if (first(left(p)) == 0 || b == 0) return(0);
212 			return(1);
213 	}
214 	error(FATAL, "unknown type %d in first\n", type(p));
215 	return(-1);
216 }
217 
218 follow(v)
219 node *v;		/* collects leaves that can follow v into setvec */
220 {
221 	node *p;
222 
223 	if (type(v) == FINAL)
224 		return;
225 	p = parent(v);
226 	switch (type(p)) {
227 		case STAR:
228 		case PLUS:	first(v);
229 				follow(p);
230 				return;
231 
232 		case OR:
233 		case QUEST:	follow(p);
234 				return;
235 
236 		case CAT:	if (v == left(p)) {	/* v is left child of p */
237 					if (first(right(p)) == 0) {
238 						follow(p);
239 						return;
240 					}
241 				}
242 				else		/* v is right child */
243 					follow(p);
244 				return;
245 		case FINAL:	if (setvec[line] != 1) {
246 					setvec[line] = 1;
247 					setcnt++;
248 				}
249 				return;
250 	}
251 }
252 
253 member(c, s)	/* is c in s? */
254 register char c, *s;
255 {
256 	while (*s)
257 		if (c == *s++)
258 			return(1);
259 	return(0);
260 }
261 
262 notin(array, n, prev)		/* is setvec in array[0] thru array[n]? */
263 int **array;
264 int *prev; {
265 	register i, j;
266 	int *ptr;
267 	for (i=0; i<=n; i++) {
268 		ptr = array[i];
269 		if (*ptr == setcnt) {
270 			for (j=0; j < setcnt; j++)
271 				if (setvec[*(++ptr)] != 1) goto nxt;
272 			*prev = i;
273 			return(0);
274 		}
275 		nxt: ;
276 	}
277 	return(1);
278 }
279 
280 int *add(n) {		/* remember setvec */
281 	int *ptr, *p;
282 	register i;
283 	if ((p = ptr = (int *) malloc((n+1)*sizeof(int))) == NULL)
284 		overflo();
285 	*ptr = n;
286 	dprintf("add(%d)\n", n, NULL, NULL);
287 	for (i=1; i <= line; i++)
288 		if (setvec[i] == 1) {
289 			*(++ptr) = i;
290 			dprintf("  ptr = %o, *ptr = %d, i = %d\n", ptr, *ptr, i);
291 		}
292 	dprintf("\n", NULL, NULL, NULL);
293 	return(p);
294 }
295 
296 struct fa *cgotofn()
297 {
298 	register i, k;
299 	register int *ptr;
300 	char c;
301 	char *p;
302 	node *cp;
303 	int j, n, s, ind, numtrans;
304 	int finflg;
305 	int curpos, num, prev;
306 	struct fa *where[NSTATES];
307 
308 	int fatab[257];
309 	struct fa *pfa;
310 
311 	char index[MAXLIN];
312 	char iposns[MAXLIN];
313 	int sposns[MAXLIN];
314 	int spmax, spinit;
315 
316 	char symbol[NCHARS];
317 	char isyms[NCHARS];
318 	char ssyms[NCHARS];
319 	int ssmax, ssinit;
320 
321 	for (i=0; i<=line; i++) index[i] = iposns[i] = setvec[i] = 0;
322 	for (i=0; i<NCHARS; i++)  isyms[i] = symbol[i] = 0;
323 	setcnt = 0;
324 	/* compute initial positions and symbols of state 0 */
325 	ssmax = 0;
326 	ptr = state[0] = foll[0];
327 	spinit = *ptr;
328 	for (i=0; i<spinit; i++) {
329 		curpos = *(++ptr);
330 		sposns[i] = curpos;
331 		iposns[curpos] = 1;
332 		cp = point[curpos];
333 		dprintf("i = %d, spinit = %d, curpos = %d\n", i, spinit, curpos);
334 		switch (type(cp)) {
335 			case CHAR:
336 				k = (int) right(cp);
337 				if (isyms[k] != 1) {
338 					isyms[k] = 1;
339 					ssyms[ssmax++] = k;
340 				}
341 				break;
342 			case DOT:
343 				for (k=1; k<NCHARS; k++) {
344 					if (k != HAT) {
345 						if (isyms[k] != 1) {
346 							isyms[k] = 1;
347 							ssyms[ssmax++] = k;
348 						}
349 					}
350 				}
351 				break;
352 			case CCL:
353 				for (p = (char *) right(cp); *p; p++) {
354 					if (*p != HAT) {
355 						if (isyms[*p] != 1) {
356 							isyms[*p] = 1;
357 							ssyms[ssmax++] = *p;
358 						}
359 					}
360 				}
361 				break;
362 			case NCCL:
363 				for (k=1; k<NCHARS; k++) {
364 					if (k != HAT && !member(k, (char *) right(cp))) {
365 						if (isyms[k] != 1) {
366 							isyms[k] = 1;
367 							ssyms[ssmax++] = k;
368 						}
369 					}
370 				}
371 		}
372 	}
373 	ssinit = ssmax;
374 	n = 0;
375 	for (s=0; s<=n; s++)  {
376 	dprintf("s = %d\n", s, NULL, NULL);
377 		ind = 0;
378 		numtrans = 0;
379 		finflg = 0;
380 		if (*(state[s] + *state[s]) == line) {		/* s final? */
381 			finflg = 1;
382 			goto tenter;
383 		}
384 		spmax = spinit;
385 		ssmax = ssinit;
386 		ptr = state[s];
387 		num = *ptr;
388 		for (i=0; i<num; i++) {
389 			curpos = *(++ptr);
390 			if (iposns[curpos] != 1 && index[curpos] != 1) {
391 				index[curpos] = 1;
392 				sposns[spmax++] = curpos;
393 			}
394 			cp = point[curpos];
395 			switch (type(cp)) {
396 				case CHAR:
397 					k = (int) right(cp);
398 					if (isyms[k] == 0 && symbol[k] == 0) {
399 						symbol[k] = 1;
400 						ssyms[ssmax++] = k;
401 					}
402 					break;
403 				case DOT:
404 					for (k=1; k<NCHARS; k++) {
405 						if (k != HAT) {
406 							if (isyms[k] == 0 && symbol[k] == 0) {
407 								symbol[k] = 1;
408 								ssyms[ssmax++] = k;
409 							}
410 						}
411 					}
412 					break;
413 				case CCL:
414 					for (p = (char *) right(cp); *p; p++) {
415 						if (*p != HAT) {
416 							if (isyms[*p] == 0 && symbol[*p] == 0) {
417 								symbol[*p] = 1;
418 								ssyms[ssmax++] = *p;
419 							}
420 						}
421 					}
422 					break;
423 				case NCCL:
424 					for (k=1; k<NCHARS; k++) {
425 						if (k != HAT && !member(k, (char *) right(cp))) {
426 							if (isyms[k] == 0 && symbol[k] == 0) {
427 								symbol[k] = 1;
428 								ssyms[ssmax++] = k;
429 							}
430 						}
431 					}
432 			}
433 		}
434 		for (j=0; j<ssmax; j++) {	/* nextstate(s, ssyms[j]) */
435 			c = ssyms[j];
436 			symbol[c] = 0;
437 			setcnt = 0;
438 			for (k=0; k<=line; k++) setvec[k] = 0;
439 			for (i=0; i<spmax; i++) {
440 				index[sposns[i]] = 0;
441 				cp = point[sposns[i]];
442 				if ((k = type(cp)) != FINAL)
443 					if (k == CHAR && c == (int) right(cp)
444 					 || k == DOT
445 					 || k == CCL && member(c, (char *) right(cp))
446 					 || k == NCCL && !member(c, (char *) right(cp))) {
447 						ptr = foll[sposns[i]];
448 						num = *ptr;
449 						for (k=0; k<num; k++) {
450 							if (setvec[*(++ptr)] != 1
451 								&& iposns[*ptr] != 1) {
452 								setvec[*ptr] = 1;
453 								setcnt++;
454 							}
455 						}
456 					}
457 			} /* end nextstate */
458 			if (notin(state, n, &prev)) {
459 				if (n >= NSTATES) {
460 					dprintf("cgotofn: notin; state = %d, n = %d\n", state, n, NULL);
461 					overflo();
462 				}
463 				state[++n] = add(setcnt);
464 				dprintf("	delta(%d,%o) = %d", s,c,n);
465 				dprintf(", ind = %d\n", ind+1, NULL, NULL);
466 				fatab[++ind] = c;
467 				fatab[++ind] = n;
468 				numtrans++;
469 			}
470 			else {
471 				if (prev != 0) {
472 					dprintf("	delta(%d,%o) = %d", s,c,prev);
473 					dprintf(", ind = %d\n", ind+1, NULL, NULL);
474 					fatab[++ind] = c;
475 					fatab[++ind] = prev;
476 					numtrans++;
477 				}
478 			}
479 		}
480 	tenter:
481 		if ((pfa = (struct fa *) malloc((numtrans + 1) * sizeof(struct fa))) == NULL)
482 			overflo();
483 		where[s] = pfa;
484 		if (finflg)
485 			pfa->cch = -1;		/* s is a final state */
486 		else
487 			pfa->cch = numtrans;
488 		pfa->st = 0;
489 		for (i=1, pfa += 1; i<=numtrans; i++, pfa++) {
490 			pfa->cch = fatab[2*i-1];
491 			pfa->st = (struct fa *) fatab[2*i];
492 		}
493 	}
494 	for (i=0; i<=n; i++) {
495 		/* N.b. state[0] == foll[0], not separately allocated */
496 		if (i>0)
497 			xfree(state[i]);       /* free state[i] */
498 		pfa = where[i];
499 		pfa->st = where[0];
500 		dprintf("state %d: (%o)\n", i, pfa, NULL);
501 		dprintf("	numtrans = %d,	default = %o\n", pfa->cch, pfa->st, NULL);
502 		for (k=1; k<=pfa->cch; k++) {
503 			(pfa+k)->st = where[ (int) (pfa+k)->st];
504 			dprintf("	char = %o,	nextstate = %o\n",(pfa+k)->cch, (pfa+k)->st, NULL);
505 		}
506 	}
507 	pfa = where[0];
508 	if ((num = pfa->cch) < 0)
509 		return(where[0]);
510 	for (pfa += num; num; num--, pfa--)
511 		if (pfa->cch == HAT) {
512 			return(pfa->st);
513 		}
514 	return(where[0]);
515 }
516 
517 match(pfa, p)
518 register struct fa *pfa;
519 register char *p;
520 {
521 	register count;
522 	char c;
523 	if (p == 0) return(0);
524 	if (pfa->cch == 1) {		/* fast test for first character, if possible */
525 		c = (++pfa)->cch;
526 		do
527 			if (c == *p) {
528 				p++;
529 				pfa = pfa->st;
530 				goto adv;
531 			}
532 		while (*p++ != 0);
533 		return(0);
534 	}
535    adv: if ((count = pfa->cch) < 0) return(1);
536 	do {
537 		for (pfa += count; count; count--, pfa--)
538 			if (pfa->cch == *p) {
539 				break;
540 			}
541 		pfa = pfa->st;
542 		if ((count = pfa->cch) < 0) return(1);
543 	} while(*p++ != 0);
544 	return(0);
545 }
546 
547 /*
548  * Free foll[i], taking into account identical foll[] entries.
549  * This is necessary because cfoll() uses the same physical follow set for
550  * several foll[] entries when the set is identical.  Called by freetr().
551  */
552 foll_free(i)
553 int i;
554 {
555 	register int j;
556 	int *p = foll[i];
557 	if (p==NULL) return;
558 	for (j=0; j<=maxfoll; j++)
559 		if (foll[j]==p) foll[j]=NULL;
560 	xfree(p);
561 }
562