xref: /openbsd-src/usr.bin/awk/b.c (revision 1ad61ae0a79a724d2d3ec69e69c8e1d1ff6b53a0)
1 /*	$OpenBSD: b.c,v 1.45 2023/10/30 17:52:54 millert Exp $	*/
2 /****************************************************************
3 Copyright (C) Lucent Technologies 1997
4 All Rights Reserved
5 
6 Permission to use, copy, modify, and distribute this software and
7 its documentation for any purpose and without fee is hereby
8 granted, provided that the above copyright notice appear in all
9 copies and that both that the copyright notice and this
10 permission notice and warranty disclaimer appear in supporting
11 documentation, and that the name Lucent Technologies or any of
12 its entities not be used in advertising or publicity pertaining
13 to distribution of the software without specific, written prior
14 permission.
15 
16 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
17 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
18 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
19 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
21 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
22 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23 THIS SOFTWARE.
24 ****************************************************************/
25 
26 /* lasciate ogne speranza, voi ch'intrate. */
27 
28 #define	DEBUG
29 
30 #include <ctype.h>
31 #include <limits.h>
32 #include <stdio.h>
33 #include <string.h>
34 #include <stdlib.h>
35 #include "awk.h"
36 #include "awkgram.tab.h"
37 
38 #define MAXLIN 22
39 
40 #define type(v)		(v)->nobj	/* badly overloaded here */
41 #define info(v)		(v)->ntype	/* badly overloaded here */
42 #define left(v)		(v)->narg[0]
43 #define right(v)	(v)->narg[1]
44 #define parent(v)	(v)->nnext
45 
46 #define LEAF	case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
47 #define ELEAF	case EMPTYRE:		/* empty string in regexp */
48 #define UNARY	case STAR: case PLUS: case QUEST:
49 
50 /* encoding in tree Nodes:
51 	leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
52 		left is index, right contains value or pointer to value
53 	unary (STAR, PLUS, QUEST): left is child, right is null
54 	binary (CAT, OR): left and right are children
55 	parent contains pointer to parent
56 */
57 
58 
59 int	*setvec;
60 int	*tmpset;
61 int	maxsetvec = 0;
62 
63 int	rtok;		/* next token in current re */
64 int	rlxval;
65 static const uschar	*rlxstr;
66 static const uschar	*prestr;	/* current position in current re */
67 static const uschar	*lastre;	/* origin of last re */
68 static const uschar	*lastatom;	/* origin of last Atom */
69 static const uschar	*starttok;
70 static const uschar 	*basestr;	/* starts with original, replaced during
71 				   repetition processing */
72 static const uschar 	*firstbasestr;
73 
74 static	int setcnt;
75 static	int poscnt;
76 
77 const char	*patbeg;
78 int	patlen;
79 
80 #define	NFA	128	/* cache this many dynamic fa's */
81 fa	*fatab[NFA];
82 int	nfatab	= 0;	/* entries in fatab */
83 
84 extern int u8_nextlen(const char *s);
85 
86 
87 /* utf-8 mechanism:
88 
89    For most of Awk, utf-8 strings just "work", since they look like
90    null-terminated sequences of 8-bit bytes.
91 
92    Functions like length(), index(), and substr() have to operate
93    in units of utf-8 characters.  The u8_* functions in run.c
94    handle this.
95 
96    Regular expressions are more complicated, since the basic
97    mechanism of the goto table used 8-bit byte indices into the
98    gototab entries to compute the next state.  Unicode is a lot
99    bigger, so the gototab entries are now structs with a character
100    and a next state, and there is a linear search of the characters
101    to find the state.  (Yes, this is slower, by a significant
102    amount.  Tough.)
103 
104    Throughout the RE mechanism in b.c, utf-8 characters are
105    converted to their utf-32 value.  This mostly shows up in
106    cclenter, which expands character class ranges like a-z and now
107    alpha-omega.  The size of a gototab array is still about 256.
108    This should be dynamic, but for now things work ok for a single
109    code page of Unicode, which is the most likely case.
110 
111    The code changes are localized in run.c and b.c.  I have added a
112    handful of functions to somewhat better hide the implementation,
113    but a lot more could be done.
114 
115  */
116 
117 static int get_gototab(fa*, int, int);
118 static int set_gototab(fa*, int, int, int);
119 static void reset_gototab(fa*, int);
120 extern int u8_rune(int *, const uschar *);
121 
122 static int *
123 intalloc(size_t n, const char *f)
124 {
125 	int *p = (int *) calloc(n, sizeof(int));
126 	if (p == NULL)
127 		overflo(f);
128 	return p;
129 }
130 
131 static void
132 allocsetvec(const char *f)
133 {
134 	maxsetvec = MAXLIN;
135 	setvec = (int *) reallocarray(setvec, maxsetvec, sizeof(*setvec));
136 	tmpset = (int *) reallocarray(tmpset, maxsetvec, sizeof(*tmpset));
137 	if (setvec == NULL || tmpset == NULL)
138 		overflo(f);
139 }
140 
141 static void
142 resizesetvec(const char *f)
143 {
144 	setvec = (int *) reallocarray(setvec, maxsetvec, 4 * sizeof(*setvec));
145 	tmpset = (int *) reallocarray(tmpset, maxsetvec, 4 * sizeof(*tmpset));
146 	if (setvec == NULL || tmpset == NULL)
147 		overflo(f);
148 	maxsetvec *= 4;
149 }
150 
151 static void
152 resize_state(fa *f, int state)
153 {
154 	gtt **p;
155 	uschar *p2;
156 	int **p3;
157 	int i, new_count;
158 
159 	if (++state < f->state_count)
160 		return;
161 
162 	new_count = state + 10; /* needs to be tuned */
163 
164 	p = (gtt **) reallocarray(f->gototab, new_count, sizeof(f->gototab[0]));
165 	if (p == NULL)
166 		goto out;
167 	f->gototab = p;
168 
169 	p2 = (uschar *) reallocarray(f->out, new_count, sizeof(f->out[0]));
170 	if (p2 == NULL)
171 		goto out;
172 	f->out = p2;
173 
174 	p3 = (int **) reallocarray(f->posns, new_count, sizeof(f->posns[0]));
175 	if (p3 == NULL)
176 		goto out;
177 	f->posns = p3;
178 
179 	for (i = f->state_count; i < new_count; ++i) {
180 		f->gototab[i] = (gtt *) calloc(NCHARS, sizeof(**f->gototab));
181 		if (f->gototab[i] == NULL)
182 			goto out;
183 		f->out[i]  = 0;
184 		f->posns[i] = NULL;
185 	}
186 	f->gototab_len = NCHARS; /* should be variable, growable */
187 	f->state_count = new_count;
188 	return;
189 out:
190 	overflo(__func__);
191 }
192 
193 fa *makedfa(const char *s, bool anchor)	/* returns dfa for reg expr s */
194 {
195 	int i, use, nuse;
196 	fa *pfa;
197 	static int now = 1;
198 
199 	if (setvec == NULL) {	/* first time through any RE */
200 		allocsetvec(__func__);
201 	}
202 
203 	if (compile_time != RUNNING)	/* a constant for sure */
204 		return mkdfa(s, anchor);
205 	for (i = 0; i < nfatab; i++)	/* is it there already? */
206 		if (fatab[i]->anchor == anchor
207 		  && strcmp((const char *) fatab[i]->restr, s) == 0) {
208 			fatab[i]->use = now++;
209 			return fatab[i];
210 		}
211 	pfa = mkdfa(s, anchor);
212 	if (nfatab < NFA) {	/* room for another */
213 		fatab[nfatab] = pfa;
214 		fatab[nfatab]->use = now++;
215 		nfatab++;
216 		return pfa;
217 	}
218 	use = fatab[0]->use;	/* replace least-recently used */
219 	nuse = 0;
220 	for (i = 1; i < nfatab; i++)
221 		if (fatab[i]->use < use) {
222 			use = fatab[i]->use;
223 			nuse = i;
224 		}
225 	freefa(fatab[nuse]);
226 	fatab[nuse] = pfa;
227 	pfa->use = now++;
228 	return pfa;
229 }
230 
231 fa *mkdfa(const char *s, bool anchor)	/* does the real work of making a dfa */
232 				/* anchor = true for anchored matches, else false */
233 {
234 	Node *p, *p1;
235 	fa *f;
236 
237 	firstbasestr = (const uschar *) s;
238 	basestr = firstbasestr;
239 	p = reparse(s);
240 	p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
241 		/* put ALL STAR in front of reg.  exp. */
242 	p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
243 		/* put FINAL after reg.  exp. */
244 
245 	poscnt = 0;
246 	penter(p1);	/* enter parent pointers and leaf indices */
247 	if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL)
248 		overflo(__func__);
249 	f->accept = poscnt-1;	/* penter has computed number of positions in re */
250 	cfoll(f, p1);	/* set up follow sets */
251 	freetr(p1);
252 	resize_state(f, 1);
253 	f->posns[0] = intalloc(*(f->re[0].lfollow), __func__);
254 	f->posns[1] = intalloc(1, __func__);
255 	*f->posns[1] = 0;
256 	f->initstat = makeinit(f, anchor);
257 	f->anchor = anchor;
258 	f->restr = (uschar *) tostring(s);
259 	if (firstbasestr != basestr) {
260 		if (basestr)
261 			xfree(basestr);
262 	}
263 	return f;
264 }
265 
266 int makeinit(fa *f, bool anchor)
267 {
268 	int i, k;
269 
270 	f->curstat = 2;
271 	f->out[2] = 0;
272 	k = *(f->re[0].lfollow);
273 	xfree(f->posns[2]);
274 	f->posns[2] = intalloc(k + 1,  __func__);
275 	for (i = 0; i <= k; i++) {
276 		(f->posns[2])[i] = (f->re[0].lfollow)[i];
277 	}
278 	if ((f->posns[2])[1] == f->accept)
279 		f->out[2] = 1;
280 	reset_gototab(f, 2);
281 	f->curstat = cgoto(f, 2, HAT);
282 	if (anchor) {
283 		*f->posns[2] = k-1;	/* leave out position 0 */
284 		for (i = 0; i < k; i++) {
285 			(f->posns[0])[i] = (f->posns[2])[i];
286 		}
287 
288 		f->out[0] = f->out[2];
289 		if (f->curstat != 2)
290 			--(*f->posns[f->curstat]);
291 	}
292 	return f->curstat;
293 }
294 
295 void penter(Node *p)	/* set up parent pointers and leaf indices */
296 {
297 	switch (type(p)) {
298 	ELEAF
299 	LEAF
300 		info(p) = poscnt;
301 		poscnt++;
302 		break;
303 	UNARY
304 		penter(left(p));
305 		parent(left(p)) = p;
306 		break;
307 	case CAT:
308 	case OR:
309 		penter(left(p));
310 		penter(right(p));
311 		parent(left(p)) = p;
312 		parent(right(p)) = p;
313 		break;
314 	case ZERO:
315 		break;
316 	default:	/* can't happen */
317 		FATAL("can't happen: unknown type %d in penter", type(p));
318 		break;
319 	}
320 }
321 
322 void freetr(Node *p)	/* free parse tree */
323 {
324 	switch (type(p)) {
325 	ELEAF
326 	LEAF
327 		xfree(p);
328 		break;
329 	UNARY
330 	case ZERO:
331 		freetr(left(p));
332 		xfree(p);
333 		break;
334 	case CAT:
335 	case OR:
336 		freetr(left(p));
337 		freetr(right(p));
338 		xfree(p);
339 		break;
340 	default:	/* can't happen */
341 		FATAL("can't happen: unknown type %d in freetr", type(p));
342 		break;
343 	}
344 }
345 
346 /* in the parsing of regular expressions, metacharacters like . have */
347 /* to be seen literally;  \056 is not a metacharacter. */
348 
349 int hexstr(const uschar **pp, int max)	/* find and eval hex string at pp, return new p */
350 {			/* only pick up one 8-bit byte (2 chars) */
351 	const uschar *p;
352 	int n = 0;
353 	int i;
354 
355 	for (i = 0, p = *pp; i < max && isxdigit(*p); i++, p++) {
356 		if (isdigit(*p))
357 			n = 16 * n + *p - '0';
358 		else if (*p >= 'a' && *p <= 'f')
359 			n = 16 * n + *p - 'a' + 10;
360 		else if (*p >= 'A' && *p <= 'F')
361 			n = 16 * n + *p - 'A' + 10;
362 	}
363 	*pp = p;
364 	return n;
365 }
366 
367 #define isoctdigit(c) ((c) >= '0' && (c) <= '7')	/* multiple use of arg */
368 
369 int quoted(const uschar **pp)	/* pick up next thing after a \\ */
370 			/* and increment *pp */
371 {
372 	const uschar *p = *pp;
373 	int c;
374 
375 /* BUG: should advance by utf-8 char even if makes no sense */
376 
377 	if ((c = *p++) == 't') {
378 		c = '\t';
379 	} else if (c == 'n') {
380 		c = '\n';
381 	} else if (c == 'f') {
382 		c = '\f';
383 	} else if (c == 'r') {
384 		c = '\r';
385 	} else if (c == 'b') {
386 		c = '\b';
387 	} else if (c == 'v') {
388 		c = '\v';
389 	} else if (c == 'a') {
390 		c = '\a';
391 	} else if (c == '\\') {
392 		c = '\\';
393 	} else if (c == 'x') {	/* 2 hex digits follow */
394 		c = hexstr(&p, 2);	/* this adds a null if number is invalid */
395 	} else if (c == 'u') {	/* unicode char number up to 8 hex digits */
396 		c = hexstr(&p, 8);
397 	} else if (isoctdigit(c)) {	/* \d \dd \ddd */
398 		int n = c - '0';
399 		if (isoctdigit(*p)) {
400 			n = 8 * n + *p++ - '0';
401 			if (isoctdigit(*p))
402 				n = 8 * n + *p++ - '0';
403 		}
404 		c = n;
405 	} /* else */
406 		/* c = c; */
407 	*pp = p;
408 	return c;
409 }
410 
411 int *cclenter(const char *argp)	/* add a character class */
412 {
413 	int i, c, c2;
414 	int n;
415 	const uschar *p = (const uschar *) argp;
416 	int *bp, *retp;
417 	static int *buf = NULL;
418 	static int bufsz = 100;
419 
420 	if (buf == NULL && (buf = (int *) calloc(bufsz, sizeof(int))) == NULL)
421 		FATAL("out of space for character class [%.10s...] 1", p);
422 	bp = buf;
423 	for (i = 0; *p != 0; ) {
424 		n = u8_rune(&c, p);
425 		p += n;
426 		if (c == '\\') {
427 			c = quoted(&p);
428 		} else if (c == '-' && i > 0 && bp[-1] != 0) {
429 			if (*p != 0) {
430 				c = bp[-1];
431 				/* c2 = *p++; */
432 				n = u8_rune(&c2, p);
433 				p += n;
434 				if (c2 == '\\')
435 					c2 = quoted(&p); /* BUG: sets p, has to be u8 size */
436 				if (c > c2) {	/* empty; ignore */
437 					bp--;
438 					i--;
439 					continue;
440 				}
441 				while (c < c2) {
442 					if (i >= bufsz) {
443 						buf = (int *) reallocarray(buf, bufsz, 2 * sizeof(int));
444 						if (buf == NULL)
445 							FATAL("out of space for character class [%.10s...] 2", p);
446 						bufsz *= 2;
447 						bp = buf + i;
448 					}
449 					*bp++ = ++c;
450 					i++;
451 				}
452 				continue;
453 			}
454 		}
455 		if (i >= bufsz) {
456 			buf = (int *) reallocarray(buf, bufsz, 2 * sizeof(int));
457 			if (buf == NULL)
458 				FATAL("out of space for character class [%.10s...] 2", p);
459 			bufsz *= 2;
460 			bp = buf + i;
461 		}
462 		*bp++ = c;
463 		i++;
464 	}
465 	*bp = 0;
466 	/* DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); BUG: can't print array of int */
467 	/* xfree(op);  BUG: what are we freeing here? */
468 	retp = (int *) calloc(bp-buf+1, sizeof(int));
469 	for (i = 0; i < bp-buf+1; i++)
470 		retp[i] = buf[i];
471 	return retp;
472 }
473 
474 void overflo(const char *s)
475 {
476 	FATAL("regular expression too big: out of space in %.30s...", s);
477 }
478 
479 void cfoll(fa *f, Node *v)	/* enter follow set of each leaf of vertex v into lfollow[leaf] */
480 {
481 	int i;
482 	int *p;
483 
484 	switch (type(v)) {
485 	ELEAF
486 	LEAF
487 		f->re[info(v)].ltype = type(v);
488 		f->re[info(v)].lval.np = right(v);
489 		while (f->accept >= maxsetvec) {	/* guessing here! */
490 			resizesetvec(__func__);
491 		}
492 		for (i = 0; i <= f->accept; i++)
493 			setvec[i] = 0;
494 		setcnt = 0;
495 		follow(v);	/* computes setvec and setcnt */
496 		p = intalloc(setcnt + 1, __func__);
497 		f->re[info(v)].lfollow = p;
498 		*p = setcnt;
499 		for (i = f->accept; i >= 0; i--)
500 			if (setvec[i] == 1)
501 				*++p = i;
502 		break;
503 	UNARY
504 		cfoll(f,left(v));
505 		break;
506 	case CAT:
507 	case OR:
508 		cfoll(f,left(v));
509 		cfoll(f,right(v));
510 		break;
511 	case ZERO:
512 		break;
513 	default:	/* can't happen */
514 		FATAL("can't happen: unknown type %d in cfoll", type(v));
515 	}
516 }
517 
518 int first(Node *p)	/* collects initially active leaves of p into setvec */
519 			/* returns 0 if p matches empty string */
520 {
521 	int b, lp;
522 
523 	switch (type(p)) {
524 	ELEAF
525 	LEAF
526 		lp = info(p);	/* look for high-water mark of subscripts */
527 		while (setcnt >= maxsetvec || lp >= maxsetvec) {	/* guessing here! */
528 			resizesetvec(__func__);
529 		}
530 		if (type(p) == EMPTYRE) {
531 			setvec[lp] = 0;
532 			return(0);
533 		}
534 		if (setvec[lp] != 1) {
535 			setvec[lp] = 1;
536 			setcnt++;
537 		}
538 		if (type(p) == CCL && (*(int *) right(p)) == 0)
539 			return(0);		/* empty CCL */
540 		return(1);
541 	case PLUS:
542 		if (first(left(p)) == 0)
543 			return(0);
544 		return(1);
545 	case STAR:
546 	case QUEST:
547 		first(left(p));
548 		return(0);
549 	case CAT:
550 		if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
551 		return(1);
552 	case OR:
553 		b = first(right(p));
554 		if (first(left(p)) == 0 || b == 0) return(0);
555 		return(1);
556 	case ZERO:
557 		return 0;
558 	}
559 	FATAL("can't happen: unknown type %d in first", type(p));	/* can't happen */
560 	return(-1);
561 }
562 
563 void follow(Node *v)	/* collects leaves that can follow v into setvec */
564 {
565 	Node *p;
566 
567 	if (type(v) == FINAL)
568 		return;
569 	p = parent(v);
570 	switch (type(p)) {
571 	case STAR:
572 	case PLUS:
573 		first(v);
574 		follow(p);
575 		return;
576 
577 	case OR:
578 	case QUEST:
579 		follow(p);
580 		return;
581 
582 	case CAT:
583 		if (v == left(p)) {	/* v is left child of p */
584 			if (first(right(p)) == 0) {
585 				follow(p);
586 				return;
587 			}
588 		} else		/* v is right child */
589 			follow(p);
590 		return;
591 	}
592 }
593 
594 int member(int c, int *sarg)	/* is c in s? */
595 {
596 	int *s = (int *) sarg;
597 
598 	while (*s)
599 		if (c == *s++)
600 			return(1);
601 	return(0);
602 }
603 
604 static int get_gototab(fa *f, int state, int ch) /* hide gototab inplementation */
605 {
606 	int i;
607 	for (i = 0; i < f->gototab_len; i++) {
608 		if (f->gototab[state][i].ch == 0)
609 			break;
610 		if (f->gototab[state][i].ch == ch)
611 			return f->gototab[state][i].state;
612 	}
613 	return 0;
614 }
615 
616 static void reset_gototab(fa *f, int state) /* hide gototab inplementation */
617 {
618 	memset(f->gototab[state], 0, f->gototab_len * sizeof(**f->gototab));
619 }
620 
621 static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab inplementation */
622 {
623 	int i;
624 	for (i = 0; i < f->gototab_len; i++) {
625 		if (f->gototab[state][i].ch == 0 || f->gototab[state][i].ch == ch) {
626 			f->gototab[state][i].ch = ch;
627 			f->gototab[state][i].state = val;
628 			return val;
629 		}
630 	}
631 	overflo(__func__);
632 	return val; /* not used anywhere at the moment */
633 }
634 
635 int match(fa *f, const char *p0)	/* shortest match ? */
636 {
637 	int s, ns;
638 	int n;
639 	int rune;
640 	const uschar *p = (const uschar *) p0;
641 
642 	/* return pmatch(f, p0); does it matter whether longest or shortest? */
643 
644 	s = f->initstat;
645 	assert (s < f->state_count);
646 
647 	if (f->out[s])
648 		return(1);
649 	do {
650 		/* assert(*p < NCHARS); */
651 		n = u8_rune(&rune, p);
652 		if ((ns = get_gototab(f, s, rune)) != 0)
653 			s = ns;
654 		else
655 			s = cgoto(f, s, rune);
656 		if (f->out[s])
657 			return(1);
658 		if (*p == 0)
659 			break;
660 		p += n;
661 	} while (1);  /* was *p++ != 0 */
662 	return(0);
663 }
664 
665 int pmatch(fa *f, const char *p0)	/* longest match, for sub */
666 {
667 	int s, ns;
668 	int n;
669 	int rune;
670 	const uschar *p = (const uschar *) p0;
671 	const uschar *q;
672 
673 	s = f->initstat;
674 	assert(s < f->state_count);
675 
676 	patbeg = (const char *)p;
677 	patlen = -1;
678 	do {
679 		q = p;
680 		do {
681 			if (f->out[s])		/* final state */
682 				patlen = q-p;
683 			/* assert(*q < NCHARS); */
684 			n = u8_rune(&rune, q);
685 			if ((ns = get_gototab(f, s, rune)) != 0)
686 				s = ns;
687 			else
688 				s = cgoto(f, s, rune);
689 
690 			assert(s < f->state_count);
691 
692 			if (s == 1) {	/* no transition */
693 				if (patlen >= 0) {
694 					patbeg = (const char *) p;
695 					return(1);
696 				}
697 				else
698 					goto nextin;	/* no match */
699 			}
700 			if (*q == 0)
701 				break;
702 			q += n;
703 		} while (1);
704 		q++;  /* was *q++ */
705 		if (f->out[s])
706 			patlen = q-p-1;	/* don't count $ */
707 		if (patlen >= 0) {
708 			patbeg = (const char *) p;
709 			return(1);
710 		}
711 	nextin:
712 		s = 2;
713 		if (*p == 0)
714 			break;
715 		n = u8_rune(&rune, p);
716 		p += n;
717 	} while (1); /* was *p++ */
718 	return (0);
719 }
720 
721 int nematch(fa *f, const char *p0)	/* non-empty match, for sub */
722 {
723 	int s, ns;
724 	int n;
725 	int rune;
726 	const uschar *p = (const uschar *) p0;
727 	const uschar *q;
728 
729 	s = f->initstat;
730 	assert(s < f->state_count);
731 
732 	patbeg = (const char *)p;
733 	patlen = -1;
734 	while (*p) {
735 		q = p;
736 		do {
737 			if (f->out[s])		/* final state */
738 				patlen = q-p;
739 			/* assert(*q < NCHARS); */
740 			n = u8_rune(&rune, q);
741 			if ((ns = get_gototab(f, s, rune)) != 0)
742 				s = ns;
743 			else
744 				s = cgoto(f, s, rune);
745 			if (s == 1) {	/* no transition */
746 				if (patlen > 0) {
747 					patbeg = (const char *) p;
748 					return(1);
749 				} else
750 					goto nnextin;	/* no nonempty match */
751 			}
752 			if (*q == 0)
753 				break;
754 			q += n;
755 		} while (1);
756 		q++;
757 		if (f->out[s])
758 			patlen = q-p-1;	/* don't count $ */
759 		if (patlen > 0 ) {
760 			patbeg = (const char *) p;
761 			return(1);
762 		}
763 	nnextin:
764 		s = 2;
765 		p++;
766 	}
767 	return (0);
768 }
769 
770 
771 #define MAX_UTF_BYTES	4	// UTF-8 is up to 4 bytes long
772 
773 // Read one rune at a time from the given FILE*. Return both
774 // the bytes and the actual rune.
775 
776 struct runedata {
777 	int rune;
778 	size_t len;
779 	char bytes[6];
780 };
781 
782 struct runedata getrune(FILE *fp)
783 {
784 	struct runedata result;
785 	int c, i, next;
786 
787 	memset(&result, 0, sizeof(result));
788 
789 	c = getc(fp);
790 	if (c == EOF)
791 		return result;	// result.rune == 0 --> EOF
792 	else if (c < 128 || awk_mb_cur_max == 1) {
793 		result.bytes[0] = c;
794 		result.len = 1;
795 		result.rune = c;
796 
797 		return result;
798 	}
799 
800 	// need to get bytes and fill things in
801 	result.bytes[0] = c;
802 	result.len = 1;
803 
804 	next = 1;
805 	for (i = 1; i < MAX_UTF_BYTES; i++) {
806 		c = getc(fp);
807 		if (c == EOF)
808 			break;
809 		result.bytes[next++] = c;
810 		result.len++;
811 	}
812 
813 	// put back any extra input bytes
814 	int actual_len = u8_nextlen(result.bytes);
815 	while (result.len > actual_len) {
816 		ungetc(result.bytes[--result.len], fp);
817 	}
818 
819 	result.bytes[result.len] = '\0';
820 	(void) u8_rune(& result.rune, (uschar *) result.bytes);
821 
822 	return result;
823 }
824 
825 
826 /*
827  * NAME
828  *     fnematch
829  *
830  * DESCRIPTION
831  *     A stream-fed version of nematch which transfers characters to a
832  *     null-terminated buffer. All characters up to and including the last
833  *     character of the matching text or EOF are placed in the buffer. If
834  *     a match is found, patbeg and patlen are set appropriately.
835  *
836  * RETURN VALUES
837  *     false    No match found.
838  *     true     Match found.
839  */
840 
841 bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
842 {
843 	char *buf = *pbuf;
844 	int bufsize = *pbufsize;
845 	int i, j, k, ns, s;
846 	struct runedata r;
847 
848 	s = pfa->initstat;
849 	patlen = 0;
850 
851 	/*
852 	 * All indices relative to buf.
853 	 * i <= j <= k <= bufsize
854 	 *
855 	 * i: origin of active substring (first byte of first character)
856 	 * j: current character		(last byte of current character)
857 	 * k: destination of next getc()
858 	 */
859 	i = -1, k = 0;
860         do {
861 		j = i++;
862 		do {
863 			r = getrune(f);
864 			if ((++j + r.len) >= k) {
865 				if (k >= bufsize)
866 					if (!adjbuf(&buf, &bufsize, bufsize+1, quantum, 0, "fnematch"))
867 						FATAL("stream '%.30s...' too long", buf);
868 			}
869 			memcpy(buf + k, r.bytes, r.len);
870 			j += r.len - 1;	// incremented next time around the loop
871 			k += r.len;
872 
873 			if ((ns = get_gototab(pfa, s, r.rune)) != 0)
874 				s = ns;
875 			else
876 				s = cgoto(pfa, s, r.rune);
877 
878 			if (pfa->out[s]) {	/* final state */
879 				patlen = j - i + 1;
880 				if (r.rune == 0)	/* don't count $ */
881 					patlen--;
882 			}
883 		} while (buf[j] && s != 1);
884 		s = 2;
885 		if (r.len > 1)
886 			i += r.len - 1;	// i incremented around the loop
887 	} while (buf[i] && !patlen);
888 
889 	/* adjbuf() may have relocated a resized buffer. Inform the world. */
890 	*pbuf = buf;
891 	*pbufsize = bufsize;
892 
893 	if (patlen) {
894 		patbeg = buf + i;
895 		/*
896 		 * Under no circumstances is the last character fed to
897 		 * the automaton part of the match. It is EOF's nullbyte,
898 		 * or it sent the automaton into a state with no further
899 		 * transitions available (s==1), or both. Room for a
900 		 * terminating nullbyte is guaranteed.
901 		 *
902 		 * ungetc any chars after the end of matching text
903 		 * (except for EOF's nullbyte, if present) and null
904 		 * terminate the buffer.
905 		 */
906 		do {
907 			int ii;
908 			for (ii = r.len; ii > 0; ii--)
909 				if (buf[--k] && ungetc(buf[k], f) == EOF)
910 					FATAL("unable to ungetc '%c'", buf[k]);
911 		} while (k > i + patlen);
912 		buf[k] = '\0';
913 		return true;
914 	}
915 	else
916 		return false;
917 }
918 
919 Node *reparse(const char *p)	/* parses regular expression pointed to by p */
920 {			/* uses relex() to scan regular expression */
921 	Node *np;
922 
923 	DPRINTF("reparse <%s>\n", p);
924 	lastre = prestr = (const uschar *) p;	/* prestr points to string to be parsed */
925 	rtok = relex();
926 	/* GNU compatibility: an empty regexp matches anything */
927 	if (rtok == '\0') {
928 		/* FATAL("empty regular expression"); previous */
929 		return(op2(EMPTYRE, NIL, NIL));
930 	}
931 	np = regexp();
932 	if (rtok != '\0')
933 		FATAL("syntax error in regular expression %s at %s", lastre, prestr);
934 	return(np);
935 }
936 
937 Node *regexp(void)	/* top-level parse of reg expr */
938 {
939 	return (alt(concat(primary())));
940 }
941 
942 Node *primary(void)
943 {
944 	Node *np;
945 	int savelastatom;
946 
947 	switch (rtok) {
948 	case CHAR:
949 		lastatom = starttok;
950 		np = op2(CHAR, NIL, itonp(rlxval));
951 		rtok = relex();
952 		return (unary(np));
953 	case ALL:
954 		rtok = relex();
955 		return (unary(op2(ALL, NIL, NIL)));
956 	case EMPTYRE:
957 		rtok = relex();
958 		return (unary(op2(EMPTYRE, NIL, NIL)));
959 	case DOT:
960 		lastatom = starttok;
961 		rtok = relex();
962 		return (unary(op2(DOT, NIL, NIL)));
963 	case CCL:
964 		np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
965 		lastatom = starttok;
966 		rtok = relex();
967 		return (unary(np));
968 	case NCCL:
969 		np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
970 		lastatom = starttok;
971 		rtok = relex();
972 		return (unary(np));
973 	case '^':
974 		rtok = relex();
975 		return (unary(op2(CHAR, NIL, itonp(HAT))));
976 	case '$':
977 		rtok = relex();
978 		return (unary(op2(CHAR, NIL, NIL)));
979 	case '(':
980 		lastatom = starttok;
981 		savelastatom = starttok - basestr; /* Retain over recursion */
982 		rtok = relex();
983 		if (rtok == ')') {	/* special pleading for () */
984 			rtok = relex();
985 			return unary(op2(CCL, NIL, (Node *) cclenter("")));
986 		}
987 		np = regexp();
988 		if (rtok == ')') {
989 			lastatom = basestr + savelastatom; /* Restore */
990 			rtok = relex();
991 			return (unary(np));
992 		}
993 		else
994 			FATAL("syntax error in regular expression %s at %s", lastre, prestr);
995 	default:
996 		FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
997 	}
998 	return 0;	/*NOTREACHED*/
999 }
1000 
1001 Node *concat(Node *np)
1002 {
1003 	switch (rtok) {
1004 	case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
1005 		return (concat(op2(CAT, np, primary())));
1006 	case EMPTYRE:
1007 		rtok = relex();
1008 		return (concat(op2(CAT, op2(CCL, NIL, (Node *) cclenter("")),
1009 				primary())));
1010 	}
1011 	return (np);
1012 }
1013 
1014 Node *alt(Node *np)
1015 {
1016 	if (rtok == OR) {
1017 		rtok = relex();
1018 		return (alt(op2(OR, np, concat(primary()))));
1019 	}
1020 	return (np);
1021 }
1022 
1023 Node *unary(Node *np)
1024 {
1025 	switch (rtok) {
1026 	case STAR:
1027 		rtok = relex();
1028 		return (unary(op2(STAR, np, NIL)));
1029 	case PLUS:
1030 		rtok = relex();
1031 		return (unary(op2(PLUS, np, NIL)));
1032 	case QUEST:
1033 		rtok = relex();
1034 		return (unary(op2(QUEST, np, NIL)));
1035 	case ZERO:
1036 		rtok = relex();
1037 		return (unary(op2(ZERO, np, NIL)));
1038 	default:
1039 		return (np);
1040 	}
1041 }
1042 
1043 /*
1044  * Character class definitions conformant to the POSIX locale as
1045  * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
1046  * and operating character sets are both ASCII (ISO646) or supersets
1047  * thereof.
1048  *
1049  * Note that to avoid overflowing the temporary buffer used in
1050  * relex(), the expanded character class (prior to range expansion)
1051  * must be less than twice the size of their full name.
1052  */
1053 
1054 /* Because isblank doesn't show up in any of the header files on any
1055  * system i use, it's defined here.  if some other locale has a richer
1056  * definition of "blank", define HAS_ISBLANK and provide your own
1057  * version.
1058  * the parentheses here are an attempt to find a path through the maze
1059  * of macro definition and/or function and/or version provided.  thanks
1060  * to nelson beebe for the suggestion; let's see if it works everywhere.
1061  */
1062 
1063 #ifndef HAS_ISBLANK
1064 
1065 int (xisblank)(int c)
1066 {
1067 	return c==' ' || c=='\t';
1068 }
1069 
1070 #endif
1071 
1072 static const struct charclass {
1073 	const char *cc_name;
1074 	int cc_namelen;
1075 	int (*cc_func)(int);
1076 } charclasses[] = {
1077 	{ "alnum",	5,	isalnum },
1078 	{ "alpha",	5,	isalpha },
1079 #ifndef HAS_ISBLANK
1080 	{ "blank",	5,	xisblank },
1081 #else
1082 	{ "blank",	5,	isblank },
1083 #endif
1084 	{ "cntrl",	5,	iscntrl },
1085 	{ "digit",	5,	isdigit },
1086 	{ "graph",	5,	isgraph },
1087 	{ "lower",	5,	islower },
1088 	{ "print",	5,	isprint },
1089 	{ "punct",	5,	ispunct },
1090 	{ "space",	5,	isspace },
1091 	{ "upper",	5,	isupper },
1092 	{ "xdigit",	6,	isxdigit },
1093 	{ NULL,		0,	NULL },
1094 };
1095 
1096 #define REPEAT_SIMPLE		0
1097 #define REPEAT_PLUS_APPENDED	1
1098 #define REPEAT_WITH_Q		2
1099 #define REPEAT_ZERO		3
1100 
1101 static int
1102 replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1103 	       int atomlen, int firstnum, int secondnum, int special_case)
1104 {
1105 	int i, j;
1106 	uschar *buf = NULL;
1107 	int ret = 1;
1108 	int init_q = (firstnum == 0);		/* first added char will be ? */
1109 	int n_q_reps = secondnum-firstnum;	/* m>n, so reduce until {1,m-n} left  */
1110 	int prefix_length = reptok - basestr;	/* prefix includes first rep	*/
1111 	int suffix_length = strlen((const char *) reptok) - reptoklen;	/* string after rep specifier	*/
1112 	int size = prefix_length +  suffix_length;
1113 
1114 	if (firstnum > 1) {	/* add room for reps 2 through firstnum */
1115 		size += atomlen*(firstnum-1);
1116 	}
1117 
1118 	/* Adjust size of buffer for special cases */
1119 	if (special_case == REPEAT_PLUS_APPENDED) {
1120 		size++;		/* for the final + */
1121 	} else if (special_case == REPEAT_WITH_Q) {
1122 		size += init_q + (atomlen+1)* (n_q_reps-init_q);
1123 	} else if (special_case == REPEAT_ZERO) {
1124 		size += 2;	/* just a null ERE: () */
1125 	}
1126 	if ((buf = (uschar *) malloc(size + 1)) == NULL)
1127 		FATAL("out of space in reg expr %.10s..", lastre);
1128 	memcpy(buf, basestr, prefix_length);	/* copy prefix	*/
1129 	j = prefix_length;
1130 	if (special_case == REPEAT_ZERO) {
1131 		j -= atomlen;
1132 		buf[j++] = '(';
1133 		buf[j++] = ')';
1134 	}
1135 	for (i = 1; i < firstnum; i++) {	/* copy x reps 	*/
1136 		memcpy(&buf[j], atom, atomlen);
1137 		j += atomlen;
1138 	}
1139 	if (special_case == REPEAT_PLUS_APPENDED) {
1140 		buf[j++] = '+';
1141 	} else if (special_case == REPEAT_WITH_Q) {
1142 		if (init_q)
1143 			buf[j++] = '?';
1144 		for (i = init_q; i < n_q_reps; i++) {	/* copy x? reps */
1145 			memcpy(&buf[j], atom, atomlen);
1146 			j += atomlen;
1147 			buf[j++] = '?';
1148 		}
1149 	}
1150 	memcpy(&buf[j], reptok+reptoklen, suffix_length);
1151 	j += suffix_length;
1152 	buf[j] = '\0';
1153 	/* free old basestr */
1154 	if (firstbasestr != basestr) {
1155 		if (basestr)
1156 			xfree(basestr);
1157 	}
1158 	basestr = buf;
1159 	prestr  = buf + prefix_length;
1160 	if (special_case == REPEAT_ZERO) {
1161 		prestr  -= atomlen;
1162 		ret++;
1163 	}
1164 	return ret;
1165 }
1166 
1167 static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1168 		  int atomlen, int firstnum, int secondnum)
1169 {
1170 	/*
1171 	   In general, the repetition specifier or "bound" is replaced here
1172 	   by an equivalent ERE string, repeating the immediately previous atom
1173 	   and appending ? and + as needed. Note that the first copy of the
1174 	   atom is left in place, except in the special_case of a zero-repeat
1175 	   (i.e., {0}).
1176 	 */
1177 	if (secondnum < 0) {	/* means {n,} -> repeat n-1 times followed by PLUS */
1178 		if (firstnum < 2) {
1179 			/* 0 or 1: should be handled before you get here */
1180 			FATAL("internal error");
1181 		} else {
1182 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1183 				firstnum, secondnum, REPEAT_PLUS_APPENDED);
1184 		}
1185 	} else if (firstnum == secondnum) {	/* {n} or {n,n} -> simply repeat n-1 times */
1186 		if (firstnum == 0) {	/* {0} or {0,0} */
1187 			/* This case is unusual because the resulting
1188 			   replacement string might actually be SMALLER than
1189 			   the original ERE */
1190 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1191 					firstnum, secondnum, REPEAT_ZERO);
1192 		} else {		/* (firstnum >= 1) */
1193 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1194 					firstnum, secondnum, REPEAT_SIMPLE);
1195 		}
1196 	} else if (firstnum < secondnum) {	/* {n,m} -> repeat n-1 times then alternate  */
1197 		/*  x{n,m}  =>  xx...x{1, m-n+1}  =>  xx...x?x?x?..x?	*/
1198 		return replace_repeat(reptok, reptoklen, atom, atomlen,
1199 					firstnum, secondnum, REPEAT_WITH_Q);
1200 	} else {	/* Error - shouldn't be here (n>m) */
1201 		FATAL("internal error");
1202 	}
1203 	return 0;
1204 }
1205 
1206 extern int u8_rune(int *, const uschar *); /* run.c; should be in header file */
1207 
1208 int relex(void)		/* lexical analyzer for reparse */
1209 {
1210 	int c, n;
1211 	int cflag;
1212 	static uschar *buf = NULL;
1213 	static int bufsz = 100;
1214 	uschar *bp;
1215 	const struct charclass *cc;
1216 	int i;
1217 	int num, m;
1218 	bool commafound, digitfound;
1219 	const uschar *startreptok;
1220 	static int parens = 0;
1221 
1222 rescan:
1223 	starttok = prestr;
1224 
1225 	if ((n = u8_rune(&rlxval, prestr)) > 1) {
1226 		prestr += n;
1227 		starttok = prestr;
1228 		return CHAR;
1229 	}
1230 
1231 	switch (c = *prestr++) {
1232 	case '|': return OR;
1233 	case '*': return STAR;
1234 	case '+': return PLUS;
1235 	case '?': return QUEST;
1236 	case '.': return DOT;
1237 	case '\0': prestr--; return '\0';
1238 	case '^':
1239 	case '$':
1240 		return c;
1241 	case '(':
1242 		parens++;
1243 		return c;
1244 	case ')':
1245 		if (parens) {
1246 			parens--;
1247 			return c;
1248 		}
1249 		/* unmatched close parenthesis; per POSIX, treat as literal */
1250 		rlxval = c;
1251 		return CHAR;
1252 	case '\\':
1253 		rlxval = quoted(&prestr);
1254 		return CHAR;
1255 	default:
1256 		rlxval = c;
1257 		return CHAR;
1258 	case '[':
1259 		if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
1260 			FATAL("out of space in reg expr %.10s..", lastre);
1261 		bp = buf;
1262 		if (*prestr == '^') {
1263 			cflag = 1;
1264 			prestr++;
1265 		}
1266 		else
1267 			cflag = 0;
1268 		n = 5 * strlen((const char *) prestr)+1; /* BUG: was 2.  what value? */
1269 		if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
1270 			FATAL("out of space for reg expr %.10s...", lastre);
1271 		for (; ; ) {
1272 			if ((n = u8_rune(&rlxval, prestr)) > 1) {
1273 				for (i = 0; i < n; i++)
1274 					*bp++ = *prestr++;
1275 				continue;
1276 			}
1277 			if ((c = *prestr++) == '\\') {
1278 				*bp++ = '\\';
1279 				if ((c = *prestr++) == '\0')
1280 					FATAL("nonterminated character class %.20s...", lastre);
1281 				*bp++ = c;
1282 			/* } else if (c == '\n') { */
1283 			/* 	FATAL("newline in character class %.20s...", lastre); */
1284 			} else if (c == '[' && *prestr == ':') {
1285 				/* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
1286 				for (cc = charclasses; cc->cc_name; cc++)
1287 					if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
1288 						break;
1289 				if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
1290 				    prestr[2 + cc->cc_namelen] == ']') {
1291 					prestr += cc->cc_namelen + 3;
1292 					/*
1293 					 * BUG: We begin at 1, instead of 0, since we
1294 					 * would otherwise prematurely terminate the
1295 					 * string for classes like [[:cntrl:]]. This
1296 					 * means that we can't match the NUL character,
1297 					 * not without first adapting the entire
1298 					 * program to track each string's length.
1299 					 */
1300 					for (i = 1; i <= UCHAR_MAX; i++) {
1301 						if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "relex2"))
1302 						    FATAL("out of space for reg expr %.10s...", lastre);
1303 						if (cc->cc_func(i)) {
1304 							/* escape backslash */
1305 							if (i == '\\') {
1306 								*bp++ = '\\';
1307 								n++;
1308 							}
1309 
1310 							*bp++ = i;
1311 							n++;
1312 						}
1313 					}
1314 				} else
1315 					*bp++ = c;
1316 			} else if (c == '[' && *prestr == '.') {
1317 				char collate_char;
1318 				prestr++;
1319 				collate_char = *prestr++;
1320 				if (*prestr == '.' && prestr[1] == ']') {
1321 					prestr += 2;
1322 					/* Found it: map via locale TBD: for
1323 					   now, simply return this char.  This
1324 					   is sufficient to pass conformance
1325 					   test awk.ex 156
1326 					 */
1327 					if (*prestr == ']') {
1328 						prestr++;
1329 						rlxval = collate_char;
1330 						return CHAR;
1331 					}
1332 				}
1333 			} else if (c == '[' && *prestr == '=') {
1334 				char equiv_char;
1335 				prestr++;
1336 				equiv_char = *prestr++;
1337 				if (*prestr == '=' && prestr[1] == ']') {
1338 					prestr += 2;
1339 					/* Found it: map via locale TBD: for now
1340 					   simply return this char. This is
1341 					   sufficient to pass conformance test
1342 					   awk.ex 156
1343 					 */
1344 					if (*prestr == ']') {
1345 						prestr++;
1346 						rlxval = equiv_char;
1347 						return CHAR;
1348 					}
1349 				}
1350 			} else if (c == '\0') {
1351 				FATAL("nonterminated character class %.20s", lastre);
1352 			} else if (bp == buf) {	/* 1st char is special */
1353 				*bp++ = c;
1354 			} else if (c == ']') {
1355 				*bp++ = 0;
1356 				rlxstr = (uschar *) tostring((char *) buf);
1357 				if (cflag == 0)
1358 					return CCL;
1359 				else
1360 					return NCCL;
1361 			} else
1362 				*bp++ = c;
1363 		}
1364 		break;
1365 	case '{':
1366 		if (isdigit(*(prestr))) {
1367 			num = 0;	/* Process as a repetition */
1368 			n = -1; m = -1;
1369 			commafound = false;
1370 			digitfound = false;
1371 			startreptok = prestr-1;
1372 			/* Remember start of previous atom here ? */
1373 		} else {        	/* just a { char, not a repetition */
1374 			rlxval = c;
1375 			return CHAR;
1376                 }
1377 		for (; ; ) {
1378 			if ((c = *prestr++) == '}') {
1379 				if (commafound) {
1380 					if (digitfound) { /* {n,m} */
1381 						m = num;
1382 						if (m < n)
1383 							FATAL("illegal repetition expression: class %.20s",
1384 								lastre);
1385 						if (n == 0 && m == 1) {
1386 							return QUEST;
1387 						}
1388 					} else {	/* {n,} */
1389 						if (n == 0)
1390 							return STAR;
1391 						else if (n == 1)
1392 							return PLUS;
1393 					}
1394 				} else {
1395 					if (digitfound) { /* {n} same as {n,n} */
1396 						n = num;
1397 						m = num;
1398 					} else {	/* {} */
1399 						FATAL("illegal repetition expression: class %.20s",
1400 							lastre);
1401 					}
1402 				}
1403 				if (repeat(starttok, prestr-starttok, lastatom,
1404 					   startreptok - lastatom, n, m) > 0) {
1405 					if (n == 0 && m == 0) {
1406 						return ZERO;
1407 					}
1408 					/* must rescan input for next token */
1409 					goto rescan;
1410 				}
1411 				/* Failed to replace: eat up {...} characters
1412 				   and treat like just PLUS */
1413 				return PLUS;
1414 			} else if (c == '\0') {
1415 				FATAL("nonterminated character class %.20s",
1416 					lastre);
1417 			} else if (isdigit(c)) {
1418 				num = 10 * num + c - '0';
1419 				digitfound = true;
1420 			} else if (c == ',') {
1421 				if (commafound)
1422 					FATAL("illegal repetition expression: class %.20s",
1423 						lastre);
1424 				/* looking for {n,} or {n,m} */
1425 				commafound = true;
1426 				n = num;
1427 				digitfound = false; /* reset */
1428 				num = 0;
1429 			} else {
1430 				FATAL("illegal repetition expression: class %.20s",
1431 					lastre);
1432 			}
1433 		}
1434 		break;
1435 	}
1436 }
1437 
1438 int cgoto(fa *f, int s, int c)
1439 {
1440 	int *p, *q;
1441 	int i, j, k;
1442 
1443 	/* assert(c == HAT || c < NCHARS);  BUG: seg fault if disable test */
1444 	while (f->accept >= maxsetvec) {	/* guessing here! */
1445 		resizesetvec(__func__);
1446 	}
1447 	for (i = 0; i <= f->accept; i++)
1448 		setvec[i] = 0;
1449 	setcnt = 0;
1450 	resize_state(f, s);
1451 	/* compute positions of gototab[s,c] into setvec */
1452 	p = f->posns[s];
1453 	for (i = 1; i <= *p; i++) {
1454 		if ((k = f->re[p[i]].ltype) != FINAL) {
1455 			if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1456 			 || (k == DOT && c != 0 && c != HAT)
1457 			 || (k == ALL && c != 0)
1458 			 || (k == EMPTYRE && c != 0)
1459 			 || (k == CCL && member(c, (int *) f->re[p[i]].lval.rp))
1460 			 || (k == NCCL && !member(c, (int *) f->re[p[i]].lval.rp) && c != 0 && c != HAT)) {
1461 				q = f->re[p[i]].lfollow;
1462 				for (j = 1; j <= *q; j++) {
1463 					if (q[j] >= maxsetvec) {
1464 						resizesetvec(__func__);
1465 					}
1466 					if (setvec[q[j]] == 0) {
1467 						setcnt++;
1468 						setvec[q[j]] = 1;
1469 					}
1470 				}
1471 			}
1472 		}
1473 	}
1474 	/* determine if setvec is a previous state */
1475 	tmpset[0] = setcnt;
1476 	j = 1;
1477 	for (i = f->accept; i >= 0; i--)
1478 		if (setvec[i]) {
1479 			tmpset[j++] = i;
1480 		}
1481 	resize_state(f, f->curstat > s ? f->curstat : s);
1482 	/* tmpset == previous state? */
1483 	for (i = 1; i <= f->curstat; i++) {
1484 		p = f->posns[i];
1485 		if ((k = tmpset[0]) != p[0])
1486 			goto different;
1487 		for (j = 1; j <= k; j++)
1488 			if (tmpset[j] != p[j])
1489 				goto different;
1490 		/* setvec is state i */
1491 		if (c != HAT)
1492 			set_gototab(f, s, c, i);
1493 		return i;
1494 	  different:;
1495 	}
1496 
1497 	/* add tmpset to current set of states */
1498 	++(f->curstat);
1499 	resize_state(f, f->curstat);
1500 	xfree(f->posns[f->curstat]);
1501 	p = intalloc(setcnt + 1, __func__);
1502 
1503 	f->posns[f->curstat] = p;
1504 	if (c != HAT)
1505 		set_gototab(f, s, c, f->curstat);
1506 	for (i = 0; i <= setcnt; i++)
1507 		p[i] = tmpset[i];
1508 	if (setvec[f->accept])
1509 		f->out[f->curstat] = 1;
1510 	else
1511 		f->out[f->curstat] = 0;
1512 	return f->curstat;
1513 }
1514 
1515 
1516 void freefa(fa *f)	/* free a finite automaton */
1517 {
1518 	int i;
1519 
1520 	if (f == NULL)
1521 		return;
1522 	for (i = 0; i < f->state_count; i++)
1523 		xfree(f->gototab[i])
1524 	for (i = 0; i <= f->curstat; i++)
1525 		xfree(f->posns[i]);
1526 	for (i = 0; i <= f->accept; i++) {
1527 		xfree(f->re[i].lfollow);
1528 		if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1529 			xfree(f->re[i].lval.np);
1530 	}
1531 	xfree(f->restr);
1532 	xfree(f->out);
1533 	xfree(f->posns);
1534 	xfree(f->gototab);
1535 	xfree(f);
1536 }
1537