xref: /netbsd-src/external/historical/nawk/dist/b.c (revision b757af438b42b93f8c6571f026d8b8ef3eaf5fc9)
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4 
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14 
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24 
25 /* lasciate ogne speranza, voi ch'intrate. */
26 
27 #if HAVE_NBTOOL_CONFIG_H
28 #include "nbtool_config.h"
29 #endif
30 
31 #define	DEBUG
32 
33 #include <ctype.h>
34 #include <stdio.h>
35 #include <string.h>
36 #include <stdlib.h>
37 #include <assert.h>
38 #include "awk.h"
39 #include "awkgram.h"
40 
41 #define	HAT	(NCHARS+2)	/* matches ^ in regular expr */
42 				/* NCHARS is 2**n */
43 #define MAXLIN 22
44 
45 #define type(v)		(v)->nobj	/* badly overloaded here */
46 #define info(v)		(v)->ntype	/* badly overloaded here */
47 #define left(v)		(v)->narg[0]
48 #define right(v)	(v)->narg[1]
49 #define parent(v)	(v)->nnext
50 
51 #define LEAF	case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
52 #define ELEAF	case EMPTYRE:		/* empty string in regexp */
53 #define UNARY	case STAR: case PLUS: case QUEST:
54 
55 /* encoding in tree Nodes:
56 	leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
57 		left is index, right contains value or pointer to value
58 	unary (STAR, PLUS, QUEST): left is child, right is null
59 	binary (CAT, OR): left and right are children
60 	parent contains pointer to parent
61 */
62 
63 
64 int	*setvec;
65 int	*tmpset;
66 int	maxsetvec = 0;
67 
68 int	rtok;		/* next token in current re */
69 int	rlxval;
70 static const uschar	*rlxstr;
71 static const uschar	*prestr;	/* current position in current re */
72 static const uschar	*lastre;	/* origin of last re */
73 
74 static	int setcnt;
75 static	int poscnt;
76 
77 uschar	*patbeg;
78 int	patlen;
79 
80 #define	NFA	128	/* cache this many dynamic fa's */
81 fa	*fatab[NFA];
82 int	nfatab	= 0;	/* entries in fatab */
83 
84 static void
85 resizesetvec(const char *msg)
86 {
87 	if (maxsetvec == 0)
88 		maxsetvec = MAXLIN;
89 	else
90 		maxsetvec *= 4;
91 	setvec = realloc(setvec, maxsetvec * sizeof(*setvec));
92 	tmpset = realloc(tmpset, maxsetvec * sizeof(*tmpset));
93 	if (setvec == 0 || tmpset == 0)
94 	    overflo(msg);
95 }
96 
97 static void
98 resize_state(fa *f, int state)
99 {
100 	void *p;
101 	int i, new_count;
102 
103 	if (++state < f->state_count)
104 		return;
105 
106 	new_count = state + 10; /* needs to be tuned */
107 
108 	p = realloc(f->gototab, new_count * sizeof(f->gototab[0]));
109 	if (p == NULL)
110 		goto out;
111 	f->gototab = p;
112 
113 	p = realloc(f->out, new_count * sizeof(f->out[0]));
114 	if (p == NULL)
115 		goto out;
116 	f->out = p;
117 
118 	p = realloc(f->posns, new_count * sizeof(f->posns[0]));
119 	if (p == NULL)
120 		goto out;
121 	f->posns = p;
122 
123 	for (i = f->state_count; i < new_count; ++i) {
124 		f->gototab[i] = calloc(1, NCHARS * sizeof (**f->gototab));
125 		if (f->gototab[i] == NULL)
126 			goto out;
127 		f->out[i]  = 0;
128 		f->posns[i] = NULL;
129 	}
130 	f->state_count = new_count;
131 	return;
132 out:
133 	overflo("out of memory in resize_state");
134 }
135 
136 fa *makedfa(const char *s, int anchor)	/* returns dfa for reg expr s */
137 {
138 	int i, use, nuse;
139 	fa *pfa;
140 	static int now = 1;
141 
142 	if (setvec == 0)	/* first time through any RE */
143 		resizesetvec("out of space initializing makedfa");
144 
145 	if (compile_time)	/* a constant for sure */
146 		return mkdfa(s, anchor);
147 	for (i = 0; i < nfatab; i++)	/* is it there already? */
148 		if (fatab[i]->anchor == anchor
149 		  && strcmp((const char *) fatab[i]->restr, s) == 0) {
150 			fatab[i]->use = now++;
151 			return fatab[i];
152 		}
153 	pfa = mkdfa(s, anchor);
154 	if (nfatab < NFA) {	/* room for another */
155 		fatab[nfatab] = pfa;
156 		fatab[nfatab]->use = now++;
157 		nfatab++;
158 		return pfa;
159 	}
160 	use = fatab[0]->use;	/* replace least-recently used */
161 	nuse = 0;
162 	for (i = 1; i < nfatab; i++)
163 		if (fatab[i]->use < use) {
164 			use = fatab[i]->use;
165 			nuse = i;
166 		}
167 	freefa(fatab[nuse]);
168 	fatab[nuse] = pfa;
169 	pfa->use = now++;
170 	return pfa;
171 }
172 
173 fa *mkdfa(const char *s, int anchor)	/* does the real work of making a dfa */
174 				/* anchor = 1 for anchored matches, else 0 */
175 {
176 	Node *p, *p1;
177 	fa *f;
178 
179 	p = reparse(s);
180 	p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
181 		/* put ALL STAR in front of reg.  exp. */
182 	p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
183 		/* put FINAL after reg.  exp. */
184 
185 	poscnt = 0;
186 	penter(p1);	/* enter parent pointers and leaf indices */
187 	if ((f = calloc(1, sizeof(*f) + poscnt*sizeof(rrow))) == NULL)
188 		overflo("out of space for fa");
189 	f->accept = poscnt-1;	/* penter has computed number of positions in re */
190 	cfoll(f, p1);	/* set up follow sets */
191 	freetr(p1);
192 	resize_state(f, 1);
193 	if ((f->posns[0] = calloc(1, *(f->re[0].lfollow)*sizeof(int))) == NULL)
194 			overflo("out of space in makedfa");
195 	if ((f->posns[1] = calloc(1, sizeof(int))) == NULL)
196 		overflo("out of space in makedfa");
197 	*f->posns[1] = 0;
198 	f->initstat = makeinit(f, anchor);
199 	f->anchor = anchor;
200 	f->restr = (uschar *) tostring(s);
201 	return f;
202 }
203 
204 int makeinit(fa *f, int anchor)
205 {
206 	int i, k;
207 
208 	resize_state(f, 2);
209 	f->curstat = 2;
210 	f->out[2] = 0;
211 	k = *(f->re[0].lfollow);
212 	xfree(f->posns[2]);
213 	if ((f->posns[2] = calloc(1, (k+1)*sizeof(int))) == NULL)
214 		overflo("out of space in makeinit");
215 	for (i=0; i <= k; i++) {
216 		(f->posns[2])[i] = (f->re[0].lfollow)[i];
217 	}
218 	if ((f->posns[2])[1] == f->accept)
219 		f->out[2] = 1;
220 	for (i=0; i < NCHARS; i++)
221 		f->gototab[2][i] = 0;
222 	f->curstat = cgoto(f, 2, HAT);
223 	if (anchor) {
224 		*f->posns[2] = k-1;	/* leave out position 0 */
225 		for (i=0; i < k; i++) {
226 			(f->posns[0])[i] = (f->posns[2])[i];
227 		}
228 
229 		f->out[0] = f->out[2];
230 		if (f->curstat != 2) {
231 			resize_state(f, f->curstat);
232 			--(*f->posns[f->curstat]);
233 		}
234 	}
235 	return f->curstat;
236 }
237 
238 void penter(Node *p)	/* set up parent pointers and leaf indices */
239 {
240 	switch (type(p)) {
241 	ELEAF
242 	LEAF
243 		info(p) = poscnt;
244 		poscnt++;
245 		break;
246 	UNARY
247 		penter(left(p));
248 		parent(left(p)) = p;
249 		break;
250 	case CAT:
251 	case OR:
252 		penter(left(p));
253 		penter(right(p));
254 		parent(left(p)) = p;
255 		parent(right(p)) = p;
256 		break;
257 	default:	/* can't happen */
258 		FATAL("can't happen: unknown type %d in penter", type(p));
259 		break;
260 	}
261 }
262 
263 void freetr(Node *p)	/* free parse tree */
264 {
265 	switch (type(p)) {
266 	ELEAF
267 	LEAF
268 		xfree(p);
269 		break;
270 	UNARY
271 		freetr(left(p));
272 		xfree(p);
273 		break;
274 	case CAT:
275 	case OR:
276 		freetr(left(p));
277 		freetr(right(p));
278 		xfree(p);
279 		break;
280 	default:	/* can't happen */
281 		FATAL("can't happen: unknown type %d in freetr", type(p));
282 		break;
283 	}
284 }
285 
286 /* in the parsing of regular expressions, metacharacters like . have */
287 /* to be seen literally;  \056 is not a metacharacter. */
288 
289 int hexstr(const uschar **pp)	/* find and eval hex string at pp, return new p */
290 {			/* only pick up one 8-bit byte (2 chars) */
291 	const uschar *p;
292 	int n = 0;
293 	int i;
294 
295 	for (i = 0, p = *pp; i < 2 && isxdigit(*p); i++, p++) {
296 		if (isdigit(*p))
297 			n = 16 * n + *p - '0';
298 		else if (*p >= 'a' && *p <= 'f')
299 			n = 16 * n + *p - 'a' + 10;
300 		else if (*p >= 'A' && *p <= 'F')
301 			n = 16 * n + *p - 'A' + 10;
302 	}
303 	*pp = p;
304 	return n;
305 }
306 
307 #define isoctdigit(c) ((c) >= '0' && (c) <= '7')	/* multiple use of arg */
308 
309 int quoted(const uschar **pp)	/* pick up next thing after a \\ */
310 			/* and increment *pp */
311 {
312 	const uschar *p = *pp;
313 	int c;
314 
315 	if ((c = *p++) == 't')
316 		c = '\t';
317 	else if (c == 'n')
318 		c = '\n';
319 	else if (c == 'f')
320 		c = '\f';
321 	else if (c == 'r')
322 		c = '\r';
323 	else if (c == 'b')
324 		c = '\b';
325 	else if (c == '\\')
326 		c = '\\';
327 	else if (c == 'x') {	/* hexadecimal goo follows */
328 		c = hexstr(&p);	/* this adds a null if number is invalid */
329 	} else if (isoctdigit(c)) {	/* \d \dd \ddd */
330 		int n = c - '0';
331 		if (isoctdigit(*p)) {
332 			n = 8 * n + *p++ - '0';
333 			if (isoctdigit(*p))
334 				n = 8 * n + *p++ - '0';
335 		}
336 		c = n;
337 	} /* else */
338 		/* c = c; */
339 	*pp = p;
340 	return c;
341 }
342 
343 char *cclenter(const char *argp)	/* add a character class */
344 {
345 	int i, c, c2;
346 	const uschar *p = (const uschar *) argp;
347 	const uschar *op;
348 	uschar *bp;
349 	static uschar *buf = 0;
350 	static int bufsz = 100;
351 
352 	op = p;
353 	if (buf == 0 && (buf = malloc(bufsz)) == NULL)
354 		FATAL("out of space for character class [%.10s...] 1", p);
355 	bp = buf;
356 	for (i = 0; (c = *p++) != 0; ) {
357 		if (c == '\\') {
358 			c = quoted(&p);
359 		} else if (c == '-' && i > 0 && bp[-1] != 0) {
360 			if (*p != 0) {
361 				c = bp[-1];
362 				c2 = *p++;
363 				if (c2 == '\\')
364 					c2 = quoted(&p);
365 				if (c > c2) {	/* empty; ignore */
366 					bp--;
367 					i--;
368 					continue;
369 				}
370 				while (c < c2) {
371 					if (!adjbuf(&buf, &bufsz, bp-buf+2, 100, &bp, "cclenter1"))
372 						FATAL("out of space for character class [%.10s...] 2", p);
373 					*bp++ = ++c;
374 					i++;
375 				}
376 				continue;
377 			}
378 		}
379 		if (!adjbuf(&buf, &bufsz, bp-buf+2, 100, &bp, "cclenter2"))
380 			FATAL("out of space for character class [%.10s...] 3", p);
381 		*bp++ = c;
382 		i++;
383 	}
384 	*bp = 0;
385 	dprintf( ("cclenter: in = |%s|, out = |%s|\n", op, buf) );
386 	free(__UNCONST(op));
387 	return (char *) tostring((char *) buf);
388 }
389 
390 void overflo(const char *s)
391 {
392 	FATAL("regular expression too big: %.30s...", s);
393 }
394 
395 void cfoll(fa *f, Node *v)	/* enter follow set of each leaf of vertex v into lfollow[leaf] */
396 {
397 	int i;
398 	int *p;
399 
400 	switch (type(v)) {
401 	ELEAF
402 	LEAF
403 		f->re[info(v)].ltype = type(v);
404 		f->re[info(v)].lval.np = right(v);
405 		while (f->accept >= maxsetvec) /* guessing here! */
406 			resizesetvec("out of space in cfoll()");
407 		for (i = 0; i <= f->accept; i++)
408 			setvec[i] = 0;
409 		setcnt = 0;
410 		follow(v);	/* computes setvec and setcnt */
411 		if ((p = calloc(1, (setcnt+1)*sizeof(int))) == NULL)
412 			overflo("out of space building follow set");
413 		f->re[info(v)].lfollow = p;
414 		*p = setcnt;
415 		for (i = f->accept; i >= 0; i--)
416 			if (setvec[i] == 1)
417 				*++p = i;
418 		break;
419 	UNARY
420 		cfoll(f,left(v));
421 		break;
422 	case CAT:
423 	case OR:
424 		cfoll(f,left(v));
425 		cfoll(f,right(v));
426 		break;
427 	default:	/* can't happen */
428 		FATAL("can't happen: unknown type %d in cfoll", type(v));
429 	}
430 }
431 
432 int first(Node *p)	/* collects initially active leaves of p into setvec */
433 			/* returns 0 if p matches empty string */
434 {
435 	int b, lp;
436 
437 	switch (type(p)) {
438 	ELEAF
439 	LEAF
440 		lp = info(p);	/* look for high-water mark of subscripts */
441 		while (setcnt >= maxsetvec || lp >= maxsetvec) /* guessing here! */
442 			resizesetvec("out of space in first()");
443 		if (type(p) == EMPTYRE) {
444 			setvec[lp] = 0;
445 			return(0);
446 		}
447 		if (setvec[lp] != 1) {
448 			setvec[lp] = 1;
449 			setcnt++;
450 		}
451 		if (type(p) == CCL && (*(char *) right(p)) == '\0')
452 			return(0);		/* empty CCL */
453 		else return(1);
454 	case PLUS:
455 		if (first(left(p)) == 0) return(0);
456 		return(1);
457 	case STAR:
458 	case QUEST:
459 		first(left(p));
460 		return(0);
461 	case CAT:
462 		if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
463 		return(1);
464 	case OR:
465 		b = first(right(p));
466 		if (first(left(p)) == 0 || b == 0) return(0);
467 		return(1);
468 	}
469 	FATAL("can't happen: unknown type %d in first", type(p));	/* can't happen */
470 	return(-1);
471 }
472 
473 void follow(Node *v)	/* collects leaves that can follow v into setvec */
474 {
475 	Node *p;
476 
477 	if (type(v) == FINAL)
478 		return;
479 	p = parent(v);
480 	switch (type(p)) {
481 	case STAR:
482 	case PLUS:
483 		first(v);
484 		follow(p);
485 		return;
486 
487 	case OR:
488 	case QUEST:
489 		follow(p);
490 		return;
491 
492 	case CAT:
493 		if (v == left(p)) {	/* v is left child of p */
494 			if (first(right(p)) == 0) {
495 				follow(p);
496 				return;
497 			}
498 		} else		/* v is right child */
499 			follow(p);
500 		return;
501 	}
502 }
503 
504 int member(int c, const char *sarg)	/* is c in s? */
505 {
506 	const uschar *s = (const uschar *) sarg;
507 
508 	while (*s)
509 		if (c == *s++)
510 			return(1);
511 	return(0);
512 }
513 
514 int match(fa *f, const char *p0)	/* shortest match ? */
515 {
516 	int s, ns;
517 	const uschar *p = (const uschar *) p0;
518 
519 	s = f->initstat;
520 	assert (s < f->state_count);
521 
522 	if (f->out[s])
523 		return(1);
524 	do {
525 		/* assert(*p < NCHARS); */
526 		if ((ns = f->gototab[s][*p]) != 0)
527 			s = ns;
528 		else
529 			s = cgoto(f, s, *p);
530 
531 		assert (s < f->state_count);
532 
533 		if (f->out[s])
534 			return(1);
535 	} while (*p++ != 0);
536 	return(0);
537 }
538 
539 int pmatch(fa *f, const char *p0)	/* longest match, for sub */
540 {
541 	int s, ns;
542 	uschar *p = __UNCONST(p0);
543 	uschar *q;
544 
545 	s = f->initstat;
546 	assert(s < f->state_count);
547 	patbeg = p;
548 	patlen = -1;
549 	do {
550 		q = p;
551 		do {
552 			if (f->out[s])		/* final state */
553 				patlen = q-p;
554 			/* assert(*q < NCHARS); */
555 			if ((ns = f->gototab[s][*q]) != 0)
556 				s = ns;
557 			else
558 				s = cgoto(f, s, *q);
559 
560 			assert(s < f->state_count);
561 
562 			if (s == 1) {	/* no transition */
563 				if (patlen >= 0) {
564 					patbeg = p;
565 					return(1);
566 				}
567 				else
568 					goto nextin;	/* no match */
569 			}
570 		} while (*q++ != 0);
571 		if (f->out[s])
572 			patlen = q-p-1;	/* don't count $ */
573 		if (patlen >= 0) {
574 			patbeg = p;
575 			return(1);
576 		}
577 	nextin:
578 		s = 2;
579 	} while (*p++ != 0);
580 	return (0);
581 }
582 
583 int nematch(fa *f, const char *p0)	/* non-empty match, for sub */
584 {
585 	int s, ns;
586 	uschar *p = __UNCONST(p0);
587 	uschar *q;
588 
589 	s = f->initstat;
590 	assert(s < f->state_count);
591 
592 	patlen = -1;
593 	while (*p) {
594 		q = p;
595 		do {
596 			if (f->out[s])		/* final state */
597 				patlen = q-p;
598 			/* assert(*q < NCHARS); */
599 			if ((ns = f->gototab[s][*q]) != 0)
600 				s = ns;
601 			else
602 				s = cgoto(f, s, *q);
603 
604 			assert(s < f->state_count);
605 
606 			if (s == 1) {	/* no transition */
607 				if (patlen > 0) {
608 					patbeg = p;
609 					return(1);
610 				} else
611 					goto nnextin;	/* no nonempty match */
612 			}
613 		} while (*q++ != 0);
614 		if (f->out[s])
615 			patlen = q-p-1;	/* don't count $ */
616 		if (patlen > 0 ) {
617 			patbeg = p;
618 			return(1);
619 		}
620 	nnextin:
621 		s = 2;
622 		p++;
623 	}
624 	return (0);
625 }
626 
627 
628 /*
629  * NAME
630  *     fnematch
631  *
632  * DESCRIPTION
633  *     A stream-fed version of nematch which transfers characters to a
634  *     null-terminated buffer. All characters up to and including the last
635  *     character of the matching text or EOF are placed in the buffer. If
636  *     a match is found, patbeg and patlen are set appropriately.
637  *
638  * RETURN VALUES
639  *     0    No match found.
640  *     1    Match found.
641  */
642 
643 int fnematch(fa *pfa, FILE *f, uschar **pbuf, int *pbufsize, int quantum)
644 {
645 	uschar *buf = *pbuf;
646 	int bufsize = *pbufsize;
647 	int c, i, j, k, ns, s;
648 
649 	s = pfa->initstat;
650 	assert(s < pfa->state_count);
651 	patlen = 0;
652 
653 	/*
654 	 * All indices relative to buf.
655 	 * i <= j <= k <= bufsize
656 	 *
657 	 * i: origin of active substring
658 	 * j: current character
659 	 * k: destination of next getc()
660 	 */
661 	i = -1, k = 0;
662         do {
663 		j = i++;
664 		do {
665 			if (++j == k) {
666 				if (k == bufsize)
667 					if (!adjbuf(&buf, &bufsize, bufsize+1, quantum, 0, "fnematch"))
668 						FATAL("stream '%.30s...' too long", buf);
669 				buf[k++] = (c = getc(f)) != EOF ? c : 0;
670 			}
671 			c = buf[j];
672 			/* assert(c < NCHARS); */
673 
674 			if ((ns = pfa->gototab[s][c]) != 0)
675 				s = ns;
676 			else
677 				s = cgoto(pfa, s, c);
678 			assert(s < pfa->state_count);
679 
680 			if (pfa->out[s]) {	/* final state */
681 				patlen = j - i + 1;
682 				if (c == 0)	/* don't count $ */
683 					patlen--;
684 			}
685 		} while (buf[j] && s != 1);
686 		s = 2;
687 	} while (buf[i] && !patlen);
688 
689 	/* adjbuf() may have relocated a resized buffer. Inform the world. */
690 	*pbuf = buf;
691 	*pbufsize = bufsize;
692 
693 	if (patlen) {
694 		patbeg = buf + i;
695 		/*
696 		 * Under no circumstances is the last character fed to
697 		 * the automaton part of the match. It is EOF's nullbyte,
698 		 * or it sent the automaton into a state with no further
699 		 * transitions available (s==1), or both. Room for a
700 		 * terminating nullbyte is guaranteed.
701 		 *
702 		 * ungetc any chars after the end of matching text
703 		 * (except for EOF's nullbyte, if present) and null
704 		 * terminate the buffer.
705 		 */
706 		do
707 			if (buf[--k] && ungetc(buf[k], f) == EOF)
708 				FATAL("unable to ungetc '%c'", buf[k]);
709 		while (k > i + patlen);
710 		buf[k] = 0;
711 		return 1;
712 	}
713 	else
714 		return 0;
715 }
716 
717 Node *reparse(const char *p)	/* parses regular expression pointed to by p */
718 {			/* uses relex() to scan regular expression */
719 	Node *np;
720 
721 	dprintf( ("reparse <%s>\n", p) );
722 	lastre = prestr = (const uschar *) p;	/* prestr points to string to be parsed */
723 	rtok = relex();
724 	/* GNU compatibility: an empty regexp matches anything */
725 	if (rtok == '\0') {
726 		/* FATAL("empty regular expression"); previous */
727 		return(op2(EMPTYRE, NIL, NIL));
728 	}
729 	np = regexp();
730 	if (rtok != '\0')
731 		FATAL("syntax error in regular expression %s at %s", lastre, prestr);
732 	return(np);
733 }
734 
735 Node *regexp(void)	/* top-level parse of reg expr */
736 {
737 	return (alt(concat(primary())));
738 }
739 
740 Node *primary(void)
741 {
742 	Node *np;
743 
744 	switch (rtok) {
745 	case CHAR:
746 		np = op2(CHAR, NIL, itonp(rlxval));
747 		rtok = relex();
748 		return (unary(np));
749 	case ALL:
750 		rtok = relex();
751 		return (unary(op2(ALL, NIL, NIL)));
752 	case EMPTYRE:
753 		rtok = relex();
754 		return (unary(op2(ALL, NIL, NIL)));
755 	case DOT:
756 		rtok = relex();
757 		return (unary(op2(DOT, NIL, NIL)));
758 	case CCL:
759 		np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
760 		rtok = relex();
761 		return (unary(np));
762 	case NCCL:
763 		np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
764 		rtok = relex();
765 		return (unary(np));
766 	case '^':
767 		rtok = relex();
768 		return (unary(op2(CHAR, NIL, itonp(HAT))));
769 	case '$':
770 		rtok = relex();
771 		return (unary(op2(CHAR, NIL, NIL)));
772 	case '(':
773 		rtok = relex();
774 		if (rtok == ')') {	/* special pleading for () */
775 			rtok = relex();
776 			return unary(op2(CCL, NIL, (Node *) tostring("")));
777 		}
778 		np = regexp();
779 		if (rtok == ')') {
780 			rtok = relex();
781 			return (unary(np));
782 		}
783 		else
784 			FATAL("syntax error in regular expression %s at %s", lastre, prestr);
785 	default:
786 		FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
787 	}
788 	return 0;	/*NOTREACHED*/
789 }
790 
791 Node *concat(Node *np)
792 {
793 	switch (rtok) {
794 	case CHAR: case DOT: case ALL: case EMPTYRE: case CCL: case NCCL: case '$': case '(':
795 		return (concat(op2(CAT, np, primary())));
796 	}
797 	return (np);
798 }
799 
800 Node *alt(Node *np)
801 {
802 	if (rtok == OR) {
803 		rtok = relex();
804 		return (alt(op2(OR, np, concat(primary()))));
805 	}
806 	return (np);
807 }
808 
809 Node *unary(Node *np)
810 {
811 	switch (rtok) {
812 	case STAR:
813 		rtok = relex();
814 		return (unary(op2(STAR, np, NIL)));
815 	case PLUS:
816 		rtok = relex();
817 		return (unary(op2(PLUS, np, NIL)));
818 	case QUEST:
819 		rtok = relex();
820 		return (unary(op2(QUEST, np, NIL)));
821 	default:
822 		return (np);
823 	}
824 }
825 
826 /*
827  * Character class definitions conformant to the POSIX locale as
828  * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
829  * and operating character sets are both ASCII (ISO646) or supersets
830  * thereof.
831  *
832  * Note that to avoid overflowing the temporary buffer used in
833  * relex(), the expanded character class (prior to range expansion)
834  * must be less than twice the size of their full name.
835  */
836 
837 /* Because isblank doesn't show up in any of the header files on any
838  * system i use, it's defined here.  if some other locale has a richer
839  * definition of "blank", define HAS_ISBLANK and provide your own
840  * version.
841  * the parentheses here are an attempt to find a path through the maze
842  * of macro definition and/or function and/or version provided.  thanks
843  * to nelson beebe for the suggestion; let's see if it works everywhere.
844  */
845 
846 /* #define HAS_ISBLANK */
847 
848 static const struct charclass {
849 	const char *cc_name;
850 	int cc_namelen;
851 	int (*cc_func)(int);
852 } charclasses[] = {
853 	{ "alnum",	5,	isalnum },
854 	{ "alpha",	5,	isalpha },
855 	{ "blank",	5,	isblank },
856 	{ "cntrl",	5,	iscntrl },
857 	{ "digit",	5,	isdigit },
858 	{ "graph",	5,	isgraph },
859 	{ "lower",	5,	islower },
860 	{ "print",	5,	isprint },
861 	{ "punct",	5,	ispunct },
862 	{ "space",	5,	isspace },
863 	{ "upper",	5,	isupper },
864 	{ "xdigit",	6,	isxdigit },
865 	{ NULL,		0,	NULL },
866 };
867 
868 
869 int relex(void)		/* lexical analyzer for reparse */
870 {
871 	int c, n;
872 	int cflag;
873 	static uschar *buf = 0;
874 	static int bufsz = 100;
875 	uschar *bp;
876 	const struct charclass *cc;
877 	int i;
878 
879 	switch (c = *prestr++) {
880 	case '|': return OR;
881 	case '*': return STAR;
882 	case '+': return PLUS;
883 	case '?': return QUEST;
884 	case '.': return DOT;
885 	case '\0': prestr--; return '\0';
886 	case '^':
887 	case '$':
888 	case '(':
889 	case ')':
890 		return c;
891 	case '\\':
892 		rlxval = quoted(&prestr);
893 		return CHAR;
894 	default:
895 		rlxval = c;
896 		return CHAR;
897 	case '[':
898 		if (buf == 0 && (buf = malloc(bufsz)) == NULL)
899 			FATAL("out of space in reg expr %.10s..", lastre);
900 		bp = buf;
901 		if (*prestr == '^') {
902 			cflag = 1;
903 			prestr++;
904 		}
905 		else
906 			cflag = 0;
907 		n = 2 * strlen((const char *) prestr)+1;
908 		if (!adjbuf(&buf, &bufsz, n, n, &bp, "relex1"))
909 			FATAL("out of space for reg expr %.10s...", lastre);
910 		for (; ; ) {
911 			if ((c = *prestr++) == '\\') {
912 				*bp++ = '\\';
913 				if ((c = *prestr++) == '\0')
914 					FATAL("nonterminated character class %.20s...", lastre);
915 				*bp++ = c;
916 			/* } else if (c == '\n') { */
917 			/* 	FATAL("newline in character class %.20s...", lastre); */
918 			} else if (c == '[' && *prestr == ':') {
919 				/* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
920 				for (cc = charclasses; cc->cc_name; cc++)
921 					if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
922 						break;
923 				if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
924 				    prestr[2 + cc->cc_namelen] == ']') {
925 					prestr += cc->cc_namelen + 3;
926 					for (i = 1; i < NCHARS; i++) {
927 						if (!adjbuf(&buf, &bufsz, bp-buf+1, 100, &bp, "relex2"))
928 						    FATAL("out of space for reg expr %.10s...", lastre);
929 						if (cc->cc_func(i)) {
930 							*bp++ = i;
931 							n++;
932 						}
933 					}
934 				} else
935 					*bp++ = c;
936 			} else if (c == '\0') {
937 				FATAL("nonterminated character class %.20s", lastre);
938 			} else if (bp == buf) {	/* 1st char is special */
939 				*bp++ = c;
940 			} else if (c == ']') {
941 				*bp++ = 0;
942 				rlxstr = (uschar *) tostring((char *) buf);
943 				if (cflag == 0)
944 					return CCL;
945 				else
946 					return NCCL;
947 			} else
948 				*bp++ = c;
949 		}
950 	}
951 }
952 
953 int cgoto(fa *f, int s, int c)
954 {
955 	int i, j, k;
956 	int *p, *q;
957 
958 	assert(c == HAT || c < NCHARS);
959 	while (f->accept >= maxsetvec) 	/* guessing here! */
960 		resizesetvec("out of space in cgoto()");
961 	for (i = 0; i <= f->accept; i++)
962 		setvec[i] = 0;
963 	setcnt = 0;
964 	resize_state(f, s);
965 	/* compute positions of gototab[s,c] into setvec */
966 	p = f->posns[s];
967 	for (i = 1; i <= *p; i++) {
968 		if ((k = f->re[p[i]].ltype) != FINAL) {
969 			if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
970 			 || (k == DOT && c != 0 && c != HAT)
971 			 || (k == ALL && c != 0)
972 			 || (k == EMPTYRE && c != 0)
973 			 || (k == CCL && member(c, (char *) f->re[p[i]].lval.up))
974 			 || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) {
975 				q = f->re[p[i]].lfollow;
976 				for (j = 1; j <= *q; j++) {
977 					if (q[j] >= maxsetvec)
978 						resizesetvec("cgoto overflow");
979 					if (setvec[q[j]] == 0) {
980 						setcnt++;
981 						setvec[q[j]] = 1;
982 					}
983 				}
984 			}
985 		}
986 	}
987 	/* determine if setvec is a previous state */
988 	tmpset[0] = setcnt;
989 	j = 1;
990 	for (i = f->accept; i >= 0; i--)
991 		if (setvec[i]) {
992 			tmpset[j++] = i;
993 		}
994 
995 	resize_state(f, f->curstat > s ? f->curstat : s);
996 	/* tmpset == previous state? */
997 	for (i = 1; i <= f->curstat; i++) {
998 		p = f->posns[i];
999 		if ((k = tmpset[0]) != p[0])
1000 			goto different;
1001 		for (j = 1; j <= k; j++)
1002 			if (tmpset[j] != p[j])
1003 				goto different;
1004 		/* setvec is state i */
1005 		if (c != HAT)
1006 			f->gototab[s][c] = i;
1007 		return i;
1008 	  different:;
1009 	}
1010 
1011 	/* add tmpset to current set of states */
1012 	++(f->curstat);
1013 	resize_state(f, f->curstat);
1014 	for (i = 0; i < NCHARS; i++)
1015 		f->gototab[f->curstat][i] = 0;
1016 	xfree(f->posns[f->curstat]);
1017 	if ((p = calloc(1, (setcnt+1)*sizeof(int))) == NULL)
1018 		overflo("out of space in cgoto");
1019 
1020 	f->posns[f->curstat] = p;
1021 	if (c != HAT)
1022 		f->gototab[s][c] = f->curstat;
1023 	for (i = 0; i <= setcnt; i++)
1024 		p[i] = tmpset[i];
1025 	if (setvec[f->accept])
1026 		f->out[f->curstat] = 1;
1027 	else
1028 		f->out[f->curstat] = 0;
1029 	return f->curstat;
1030 }
1031 
1032 
1033 void freefa(fa *f)	/* free a finite automaton */
1034 {
1035 	int i;
1036 
1037 	if (f == NULL)
1038 		return;
1039 	for (i = 0; i < f->state_count; i++) {
1040 		xfree(f->gototab[i])
1041 		xfree(f->posns[i]);
1042 	}
1043 	for (i = 0; i <= f->accept; i++) {
1044 		xfree(f->re[i].lfollow);
1045 		if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1046 			xfree((f->re[i].lval.np));
1047 	}
1048 	xfree(f->restr);
1049 	xfree(f->out);
1050 	xfree(f->posns);
1051 	xfree(f->gototab);
1052 	xfree(f);
1053 }
1054