xref: /csrg-svn/old/awk/b.c (revision 14472)
1 #ifndef lint
2 static char sccsid[] = "@(#)b.c	4.2 08/11/83";
3 #endif
4 
5 #include "awk.def"
6 #include "stdio.h"
7 #include "awk.h"
8 
9 extern node *op2();
10 extern struct fa *cgotofn();
11 #define MAXLIN 256
12 #define NCHARS 128
13 #define NSTATES 256
14 
15 #define type(v)	v->nobj
16 #define left(v)	v->narg[0]
17 #define right(v)	v->narg[1]
18 #define parent(v)	v->nnext
19 
20 #define LEAF	case CCL: case NCCL: case CHAR: case DOT:
21 #define UNARY	case FINAL: case STAR: case PLUS: case QUEST:
22 
23 /* encoding in tree nodes:
24 	leaf (CCL, NCCL, CHAR, DOT): left is index, right contains value or pointer to value
25 	unary (FINAL, STAR, PLUS, QUEST): left is child, right is null
26 	binary (CAT, OR): left and right are children
27 	parent contains pointer to parent
28 */
29 
30 struct fa {
31 	int cch;
32 	struct fa *st;
33 };
34 
35 int	*state[NSTATES];
36 int	*foll[MAXLIN];
37 char	chars[MAXLIN];
38 int	setvec[MAXLIN];
39 node	*point[MAXLIN];
40 
41 int	setcnt;
42 int	line;
43 
44 
45 struct fa *makedfa(p)	/* returns dfa for tree pointed to by p */
46 node *p;
47 {
48 	node *p1;
49 	struct fa *fap;
50 	p1 = op2(CAT, op2(STAR, op2(DOT, (node *) 0, (node *) 0), (node *) 0), p);
51 		/* put DOT STAR in front of reg. exp. */
52 	p1 = op2(FINAL, p1, (node *) 0);		/* install FINAL node */
53 
54 	line = 0;
55 	penter(p1);	/* enter parent pointers and leaf indices */
56 	point[line] = p1;	/* FINAL node */
57 	setvec[0] = 1;		/* for initial DOT STAR */
58 	cfoll(p1);	/* set up follow sets */
59 	fap = cgotofn();
60 	freetr(p1);	/* add this when alloc works */
61 	return(fap);
62 }
63 
64 penter(p)	/* set up parent pointers and leaf indices */
65 node *p;
66 {
67 	switch(type(p)) {
68 		LEAF
69 			left(p) = (node *) line;
70 			point[line++] = p;
71 			break;
72 		UNARY
73 			penter(left(p));
74 			parent(left(p)) = p;
75 			break;
76 		case CAT:
77 		case OR:
78 			penter(left(p));
79 			penter(right(p));
80 			parent(left(p)) = p;
81 			parent(right(p)) = p;
82 			break;
83 		default:
84 			error(FATAL, "unknown type %d in penter\n", type(p));
85 			break;
86 	}
87 }
88 
89 freetr(p)	/* free parse tree and follow sets */
90 node *p;
91 {
92 	switch(type(p)) {
93 		LEAF
94 			xfree(foll[(int) left(p)]);
95 			xfree(p);
96 			break;
97 		UNARY
98 			freetr(left(p));
99 			xfree(p);
100 			break;
101 		case CAT:
102 		case OR:
103 			freetr(left(p));
104 			freetr(right(p));
105 			xfree(p);
106 			break;
107 		default:
108 			error(FATAL, "unknown type %d in freetr", type(p));
109 			break;
110 	}
111 }
112 char *cclenter(p)
113 register char *p;
114 {
115 	register i, c;
116 	char *op;
117 
118 	op = p;
119 	i = 0;
120 	while ((c = *p++) != 0) {
121 		if (c == '-' && i > 0 && chars[i-1] != 0) {
122 			if (*p != 0) {
123 				c = chars[i-1];
124 				while (c < *p) {
125 					if (i >= MAXLIN)
126 						overflo();
127 					chars[i++] = ++c;
128 				}
129 				p++;
130 				continue;
131 			}
132 		}
133 		if (i >= MAXLIN)
134 			overflo();
135 		chars[i++] = c;
136 	}
137 	chars[i++] = '\0';
138 	dprintf("cclenter: in = |%s|, out = |%s|\n", op, chars, NULL);
139 	xfree(op);
140 	return(tostring(chars));
141 }
142 
143 overflo()
144 {
145 	error(FATAL, "regular expression too long\n");
146 }
147 
148 cfoll(v)		/* enter follow set of each leaf of vertex v into foll[leaf] */
149 register node *v;
150 {
151 	register i;
152 	int prev;
153 	int *add();
154 
155 	switch(type(v)) {
156 		LEAF
157 			setcnt = 0;
158 			for (i=1; i<=line; i++)
159 				setvec[i] = 0;
160 			follow(v);
161 			if (notin(foll, ( (int) left(v))-1, &prev)) {
162 				foll[(int) left(v)] = add(setcnt);
163 			}
164 			else
165 				foll[ (int) left(v)] = foll[prev];
166 			break;
167 		UNARY
168 			cfoll(left(v));
169 			break;
170 		case CAT:
171 		case OR:
172 			cfoll(left(v));
173 			cfoll(right(v));
174 			break;
175 		default:
176 			error(FATAL, "unknown type %d in cfoll", type(v));
177 	}
178 }
179 
180 first(p)			/* collects initially active leaves of p into setvec */
181 register node *p;		/* returns 0 or 1 depending on whether p matches empty string */
182 {
183 	register b;
184 
185 	switch(type(p)) {
186 		LEAF
187 			if (setvec[(int) left(p)] != 1) {
188 				setvec[(int) left(p)] = 1;
189 				setcnt++;
190 			}
191 			if (type(p) == CCL && (*(char *) right(p)) == '\0')
192 				return(0);		/* empty CCL */
193 			else return(1);
194 		case FINAL:
195 		case PLUS:
196 			if (first(left(p)) == 0) return(0);
197 			return(1);
198 		case STAR:
199 		case QUEST:
200 			first(left(p));
201 			return(0);
202 		case CAT:
203 			if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
204 			return(1);
205 		case OR:
206 			b = first(right(p));
207 			if (first(left(p)) == 0 || b == 0) return(0);
208 			return(1);
209 	}
210 	error(FATAL, "unknown type %d in first\n", type(p));
211 	return(-1);
212 }
213 
214 follow(v)
215 node *v;		/* collects leaves that can follow v into setvec */
216 {
217 	node *p;
218 
219 	if (type(v) == FINAL)
220 		return;
221 	p = parent(v);
222 	switch (type(p)) {
223 		case STAR:
224 		case PLUS:	first(v);
225 				follow(p);
226 				return;
227 
228 		case OR:
229 		case QUEST:	follow(p);
230 				return;
231 
232 		case CAT:	if (v == left(p)) {	/* v is left child of p */
233 					if (first(right(p)) == 0) {
234 						follow(p);
235 						return;
236 					}
237 				}
238 				else		/* v is right child */
239 					follow(p);
240 				return;
241 		case FINAL:	if (setvec[line] != 1) {
242 					setvec[line] = 1;
243 					setcnt++;
244 				}
245 				return;
246 	}
247 }
248 
249 member(c, s)	/* is c in s? */
250 register char c, *s;
251 {
252 	while (*s)
253 		if (c == *s++)
254 			return(1);
255 	return(0);
256 }
257 
258 notin(array, n, prev)		/* is setvec in array[0] thru array[n]? */
259 int **array;
260 int *prev; {
261 	register i, j;
262 	int *ptr;
263 	for (i=0; i<=n; i++) {
264 		ptr = array[i];
265 		if (*ptr == setcnt) {
266 			for (j=0; j < setcnt; j++)
267 				if (setvec[*(++ptr)] != 1) goto nxt;
268 			*prev = i;
269 			return(0);
270 		}
271 		nxt: ;
272 	}
273 	return(1);
274 }
275 
276 int *add(n) {		/* remember setvec */
277 	int *ptr, *p;
278 	register i;
279 	if ((p = ptr = (int *) malloc((n+1)*sizeof(int))) == NULL)
280 		overflo();
281 	*ptr = n;
282 	dprintf("add(%d)\n", n, NULL, NULL);
283 	for (i=1; i <= line; i++)
284 		if (setvec[i] == 1) {
285 			*(++ptr) = i;
286 			dprintf("  ptr = %o, *ptr = %d, i = %d\n", ptr, *ptr, i);
287 		}
288 	dprintf("\n", NULL, NULL, NULL);
289 	return(p);
290 }
291 
292 struct fa *cgotofn()
293 {
294 	register i, k;
295 	register int *ptr;
296 	char c;
297 	char *p;
298 	node *cp;
299 	int j, n, s, ind, numtrans;
300 	int finflg;
301 	int curpos, num, prev;
302 	struct fa *where[NSTATES];
303 
304 	int fatab[257];
305 	struct fa *pfa;
306 
307 	char index[MAXLIN];
308 	char iposns[MAXLIN];
309 	int sposns[MAXLIN];
310 	int spmax, spinit;
311 
312 	char symbol[NCHARS];
313 	char isyms[NCHARS];
314 	char ssyms[NCHARS];
315 	int ssmax, ssinit;
316 
317 	for (i=0; i<=line; i++) index[i] = iposns[i] = setvec[i] = 0;
318 	for (i=0; i<NCHARS; i++)  isyms[i] = symbol[i] = 0;
319 	setcnt = 0;
320 	/* compute initial positions and symbols of state 0 */
321 	ssmax = 0;
322 	ptr = state[0] = foll[0];
323 	spinit = *ptr;
324 	for (i=0; i<spinit; i++) {
325 		curpos = *(++ptr);
326 		sposns[i] = curpos;
327 		iposns[curpos] = 1;
328 		cp = point[curpos];
329 		dprintf("i = %d, spinit = %d, curpos = %d\n", i, spinit, curpos);
330 		switch (type(cp)) {
331 			case CHAR:
332 				k = (int) right(cp);
333 				if (isyms[k] != 1) {
334 					isyms[k] = 1;
335 					ssyms[ssmax++] = k;
336 				}
337 				break;
338 			case DOT:
339 				for (k=1; k<NCHARS; k++) {
340 					if (k != HAT) {
341 						if (isyms[k] != 1) {
342 							isyms[k] = 1;
343 							ssyms[ssmax++] = k;
344 						}
345 					}
346 				}
347 				break;
348 			case CCL:
349 				for (p = (char *) right(cp); *p; p++) {
350 					if (*p != HAT) {
351 						if (isyms[*p] != 1) {
352 							isyms[*p] = 1;
353 							ssyms[ssmax++] = *p;
354 						}
355 					}
356 				}
357 				break;
358 			case NCCL:
359 				for (k=1; k<NCHARS; k++) {
360 					if (k != HAT && !member(k, (char *) right(cp))) {
361 						if (isyms[k] != 1) {
362 							isyms[k] = 1;
363 							ssyms[ssmax++] = k;
364 						}
365 					}
366 				}
367 		}
368 	}
369 	ssinit = ssmax;
370 	n = 0;
371 	for (s=0; s<=n; s++)  {
372 	dprintf("s = %d\n", s, NULL, NULL);
373 		ind = 0;
374 		numtrans = 0;
375 		finflg = 0;
376 		if (*(state[s] + *state[s]) == line) {		/* s final? */
377 			finflg = 1;
378 			goto tenter;
379 		}
380 		spmax = spinit;
381 		ssmax = ssinit;
382 		ptr = state[s];
383 		num = *ptr;
384 		for (i=0; i<num; i++) {
385 			curpos = *(++ptr);
386 			if (iposns[curpos] != 1 && index[curpos] != 1) {
387 				index[curpos] = 1;
388 				sposns[spmax++] = curpos;
389 			}
390 			cp = point[curpos];
391 			switch (type(cp)) {
392 				case CHAR:
393 					k = (int) right(cp);
394 					if (isyms[k] == 0 && symbol[k] == 0) {
395 						symbol[k] = 1;
396 						ssyms[ssmax++] = k;
397 					}
398 					break;
399 				case DOT:
400 					for (k=1; k<NCHARS; k++) {
401 						if (k != HAT) {
402 							if (isyms[k] == 0 && symbol[k] == 0) {
403 								symbol[k] = 1;
404 								ssyms[ssmax++] = k;
405 							}
406 						}
407 					}
408 					break;
409 				case CCL:
410 					for (p = (char *) right(cp); *p; p++) {
411 						if (*p != HAT) {
412 							if (isyms[*p] == 0 && symbol[*p] == 0) {
413 								symbol[*p] = 1;
414 								ssyms[ssmax++] = *p;
415 							}
416 						}
417 					}
418 					break;
419 				case NCCL:
420 					for (k=1; k<NCHARS; k++) {
421 						if (k != HAT && !member(k, (char *) right(cp))) {
422 							if (isyms[k] == 0 && symbol[k] == 0) {
423 								symbol[k] = 1;
424 								ssyms[ssmax++] = k;
425 							}
426 						}
427 					}
428 			}
429 		}
430 		for (j=0; j<ssmax; j++) {	/* nextstate(s, ssyms[j]) */
431 			c = ssyms[j];
432 			symbol[c] = 0;
433 			setcnt = 0;
434 			for (k=0; k<=line; k++) setvec[k] = 0;
435 			for (i=0; i<spmax; i++) {
436 				index[sposns[i]] = 0;
437 				cp = point[sposns[i]];
438 				if ((k = type(cp)) != FINAL)
439 					if (k == CHAR && c == (int) right(cp)
440 					 || k == DOT
441 					 || k == CCL && member(c, (char *) right(cp))
442 					 || k == NCCL && !member(c, (char *) right(cp))) {
443 						ptr = foll[sposns[i]];
444 						num = *ptr;
445 						for (k=0; k<num; k++) {
446 							if (setvec[*(++ptr)] != 1
447 								&& iposns[*ptr] != 1) {
448 								setvec[*ptr] = 1;
449 								setcnt++;
450 							}
451 						}
452 					}
453 			} /* end nextstate */
454 			if (notin(state, n, &prev)) {
455 				if (n >= NSTATES) {
456 					dprintf("cgotofn: notin; state = %d, n = %d\n", state, n, NULL);
457 					overflo();
458 				}
459 				state[++n] = add(setcnt);
460 				dprintf("	delta(%d,%o) = %d", s,c,n);
461 				dprintf(", ind = %d\n", ind+1, NULL, NULL);
462 				fatab[++ind] = c;
463 				fatab[++ind] = n;
464 				numtrans++;
465 			}
466 			else {
467 				if (prev != 0) {
468 					dprintf("	delta(%d,%o) = %d", s,c,prev);
469 					dprintf(", ind = %d\n", ind+1, NULL, NULL);
470 					fatab[++ind] = c;
471 					fatab[++ind] = prev;
472 					numtrans++;
473 				}
474 			}
475 		}
476 	tenter:
477 		if ((pfa = (struct fa *) malloc((numtrans + 1) * sizeof(struct fa))) == NULL)
478 			overflo();
479 		where[s] = pfa;
480 		if (finflg)
481 			pfa->cch = -1;		/* s is a final state */
482 		else
483 			pfa->cch = numtrans;
484 		pfa->st = 0;
485 		for (i=1, pfa += 1; i<=numtrans; i++, pfa++) {
486 			pfa->cch = fatab[2*i-1];
487 			pfa->st = (struct fa *) fatab[2*i];
488 		}
489 	}
490 	for (i=0; i<=n; i++) {
491 		xfree(state[i]);	/* free state[i] */
492 		pfa = where[i];
493 		pfa->st = where[0];
494 		dprintf("state %d: (%o)\n", i, pfa, NULL);
495 		dprintf("	numtrans = %d,	default = %o\n", pfa->cch, pfa->st, NULL);
496 		for (k=1; k<=pfa->cch; k++) {
497 			(pfa+k)->st = where[ (int) (pfa+k)->st];
498 			dprintf("	char = %o,	nextstate = %o\n",(pfa+k)->cch, (pfa+k)->st, NULL);
499 		}
500 	}
501 	pfa = where[0];
502 	if ((num = pfa->cch) < 0)
503 		return(where[0]);
504 	for (pfa += num; num; num--, pfa--)
505 		if (pfa->cch == HAT) {
506 			return(pfa->st);
507 		}
508 	return(where[0]);
509 }
510 
511 match(pfa, p)
512 register struct fa *pfa;
513 register char *p;
514 {
515 	register count;
516 	char c;
517 	if (p == 0) return(0);
518 	if (pfa->cch == 1) {		/* fast test for first character, if possible */
519 		c = (++pfa)->cch;
520 		do
521 			if (c == *p) {
522 				p++;
523 				pfa = pfa->st;
524 				goto adv;
525 			}
526 		while (*p++ != 0);
527 		return(0);
528 	}
529    adv: if ((count = pfa->cch) < 0) return(1);
530 	do {
531 		for (pfa += count; count; count--, pfa--)
532 			if (pfa->cch == *p) {
533 				break;
534 			}
535 		pfa = pfa->st;
536 		if ((count = pfa->cch) < 0) return(1);
537 	} while(*p++ != 0);
538 	return(0);
539 }
540