xref: /csrg-svn/lib/libc/regex/regcomp.c (revision 56357)
155847Sbostic /*-
255847Sbostic  * Copyright (c) 1992 Henry Spencer.
355847Sbostic  * Copyright (c) 1992 The Regents of the University of California.
455847Sbostic  * All rights reserved.
555847Sbostic  *
655847Sbostic  * This code is derived from software contributed to Berkeley by
755847Sbostic  * Henry Spencer of the University of Toronto.
855847Sbostic  *
955847Sbostic  * %sccs.include.redist.c%
1055847Sbostic  *
11*56357Sbostic  *	@(#)regcomp.c	5.3 (Berkeley) 09/30/92
1255847Sbostic  */
1355847Sbostic 
1455847Sbostic #if defined(LIBC_SCCS) && !defined(lint)
15*56357Sbostic static char sccsid[] = "@(#)regcomp.c	5.3 (Berkeley) 09/30/92";
1655847Sbostic #endif /* LIBC_SCCS and not lint */
1755847Sbostic 
1855847Sbostic #include <sys/types.h>
1955847Sbostic #include <stdio.h>
2055847Sbostic #include <string.h>
2155847Sbostic #include <ctype.h>
2255847Sbostic #include <limits.h>
2355847Sbostic #include <stdlib.h>
2455847Sbostic #include <regex.h>
2555847Sbostic 
2655847Sbostic #include "utils.h"
2755847Sbostic #include "regex2.h"
2855847Sbostic 
2955847Sbostic #include "cclass.h"
3055847Sbostic #include "cname.h"
3155847Sbostic 
3255847Sbostic /*
3355847Sbostic  * parse structure, passed up and down to avoid global variables and
3455847Sbostic  * other clumsinesses
3555847Sbostic  */
3655847Sbostic struct parse {
3755847Sbostic 	uchar *next;		/* next character in RE */
3855847Sbostic 	int error;		/* has an error been seen? */
3955847Sbostic 	sop *strip;		/* malloced strip */
4055847Sbostic 	sopno ssize;		/* malloced strip size (allocated) */
4155847Sbostic 	sopno slen;		/* malloced strip length (used) */
4255847Sbostic 	int ncsalloc;		/* number of csets allocated */
4355847Sbostic 	struct re_guts *g;
4455847Sbostic #	define	NPAREN	10	/* we need to remember () 1-9 for back refs */
4555847Sbostic 	sopno pbegin[NPAREN];	/* -> ( ([0] unused) */
4655847Sbostic 	sopno pend[NPAREN];	/* -> ) ([0] unused) */
4755847Sbostic };
4855847Sbostic 
4956355Sbostic static uchar nuls[10];		/* place to point scanner in event of error */
5056355Sbostic 
5155847Sbostic /*
5255847Sbostic  * macros for use with parse structure
5355847Sbostic  * BEWARE:  these know that the parse structure is named `p' !!!
5455847Sbostic  */
5555847Sbostic #define	PEEK()	((uchar)*p->next)
5655847Sbostic #define	PEEK2()	((uchar)*(p->next+1))
5755847Sbostic #define	SEE(c)	(PEEK() == (c))
5855847Sbostic #define	SEETWO(a, b)	(PEEK() == (a) && PEEK2() == (b))
5955847Sbostic #define	EAT(c)	((SEE(c)) ? (NEXT(), 1) : 0)
6055847Sbostic #define	EATTWO(a, b)	((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
6155847Sbostic #define	NEXT()	(p->next++)
6255847Sbostic #define	NEXT2()	(p->next += 2)
6355847Sbostic #define	NEXTn(n)	(p->next += (n))
6455847Sbostic #define	GETNEXT()	((uchar)*p->next++)
6555847Sbostic #define	SETERROR(e)	seterr(p, (e))
6655847Sbostic #define	REQUIRE(co, e)	((co) || SETERROR(e))
6755847Sbostic #define	MUSTSEE(c, e)	(REQUIRE(PEEK() == (c), e))
6855847Sbostic #define	MUSTEAT(c, e)	(REQUIRE(GETNEXT() == (c), e))
6955847Sbostic #define	MUSTNOTSEE(c, e)	(REQUIRE(PEEK() != (c), e))
7055847Sbostic #define	EMIT(sop, sopnd)	doemit(p, sop, (size_t)(sopnd))
7155847Sbostic #define	INSERT(sop, pos)	doinsert(p, sop, HERE()-(pos)+1, pos)
7255847Sbostic #define	FWD(pos)		dofwd(p, pos, HERE()-(pos))
7355847Sbostic #define	BACK(sop, pos)		EMIT(sop, HERE()-pos)
7455847Sbostic #define	HERE()		(p->slen)
7555847Sbostic #define	THERE()		(p->slen - 1)
7655847Sbostic #define	DROP(n)	(p->slen -= (n))
7755847Sbostic 
78*56357Sbostic static cset	*allocset __P((struct parse *));
79*56357Sbostic static void	 bothcases __P((struct parse *, u_int));
80*56357Sbostic static void	 categorize __P((struct parse *, struct re_guts *));
81*56357Sbostic static void	 doemit __P((struct parse *, sop, size_t));
82*56357Sbostic static void	 dofwd __P((struct parse *, sopno, sop));
83*56357Sbostic static void	 doinsert __P((struct parse *, sop, size_t, sopno));
84*56357Sbostic static sopno	 dupl __P((struct parse *, sopno, sopno));
85*56357Sbostic static void	 enlarge __P((struct parse *, sopno));
86*56357Sbostic static void	 findmust __P((struct parse *, struct re_guts *));
87*56357Sbostic static int	 freezeset __P((struct parse *, cset *));
88*56357Sbostic static int	 isinsets __P((struct re_guts *, u_int));
89*56357Sbostic static void	 mcadd __P((struct parse *, cset *, uchar *));
90*56357Sbostic static uchar	*mcfind __P((cset *, u_int *));
91*56357Sbostic static int	 mcin __P((struct parse *, cset *, u_int *));
92*56357Sbostic static void	 mcinvert __P((struct parse *, cset *));
93*56357Sbostic static void	 mcsub __P((struct parse *, cset *, u_int *));
94*56357Sbostic static void	 nonnewline __P((struct parse *));
95*56357Sbostic static void	 ordinary __P((struct parse *, u_int));
96*56357Sbostic static uchar	 othercase __P((u_int));
97*56357Sbostic static void	 p_b_cclass __P((struct parse *, cset *));
98*56357Sbostic static uchar	 p_b_coll_elem __P((struct parse *, u_int));
99*56357Sbostic static void	 p_b_eclass __P((struct parse *, cset *));
100*56357Sbostic static uchar	 p_b_symbol __P((struct parse *));
101*56357Sbostic static void	 p_b_term __P((struct parse *, cset *));
102*56357Sbostic static void	 p_bracket __P((struct parse *));
103*56357Sbostic static void	 p_bre  __P((struct parse *, u_int, u_int));
104*56357Sbostic static int	 p_count __P((struct parse *));
105*56357Sbostic static void	 p_ere  __P((struct parse *, u_int));
106*56357Sbostic static void	 p_ere_exp  __P((struct parse *));
107*56357Sbostic static int	 p_simp_re __P((struct parse *, int));
108*56357Sbostic static sopno	 pluscount __P((struct parse *, struct re_guts *));
109*56357Sbostic static void	 repeat __P((struct parse *, sopno, int, int));
110*56357Sbostic static int	 samesets __P((struct re_guts *, u_int, u_int));
111*56357Sbostic static int	 seterr __P((struct parse *, int));
112*56357Sbostic static void	 stripsnug __P((struct parse *, struct re_guts *));
113*56357Sbostic 
11455847Sbostic /*
11555847Sbostic  - regcomp - interface for parser and compilation
11655847Sbostic  */
11755847Sbostic int				/* 0 success, otherwise REG_something */
11855847Sbostic regcomp(preg, pattern, cflags)
11955847Sbostic regex_t *preg;
12055847Sbostic const char *pattern;
12155847Sbostic int cflags;
12255847Sbostic {
12355847Sbostic 	struct parse pa;
12455847Sbostic 	register struct re_guts *g;
12555847Sbostic 	register struct parse *p = &pa;
12655847Sbostic 	register int i;
12755847Sbostic 
12855847Sbostic 	/* do the mallocs early so failure handling is easy */
12955847Sbostic 	/* the +NUC here is for the category table */
13055847Sbostic 	g = (struct re_guts *)malloc(sizeof(struct re_guts) + NUC);
13155847Sbostic 	if (g == NULL)
13255847Sbostic 		return(REG_ESPACE);
13355847Sbostic 	p->ssize = strlen(pattern)/2*3 + 1;
13455847Sbostic 	p->strip = (sop *)malloc(p->ssize * sizeof(sop));
13555847Sbostic 	p->slen = 0;
13655847Sbostic 	if (p->strip == NULL) {
13755847Sbostic 		free((char *)g);
13855847Sbostic 		return(REG_ESPACE);
13955847Sbostic 	}
14055847Sbostic 
14155847Sbostic 	/* set things up */
14255847Sbostic 	p->g = g;
14355847Sbostic 	p->next = (uchar *)pattern;
14455847Sbostic 	p->error = 0;
14555847Sbostic 	p->ncsalloc = 0;
14655847Sbostic 	for (i = 0; i < NPAREN; i++) {
14755847Sbostic 		p->pbegin[i] = 0;
14855847Sbostic 		p->pend[i] = 0;
14955847Sbostic 	}
15055847Sbostic 	g->csetsize = NUC;
15155847Sbostic 	g->sets = NULL;
15255847Sbostic 	g->setbits = NULL;
15355847Sbostic 	g->ncsets = 0;
15455847Sbostic 	g->cflags = cflags;
15555847Sbostic 	g->iflags = 0;
15655847Sbostic 	g->must = NULL;
15755847Sbostic 	g->mlen = 0;
15855847Sbostic 	g->nsub = 0;
15955847Sbostic 	g->ncategories = 1;	/* category 0 is "everything else" */
16055847Sbostic 	g->categories = (uchar *)g + sizeof(struct re_guts);
16155847Sbostic 	(void) memset((char *)g->categories, 0, NUC);
16255847Sbostic 	g->backrefs = 0;
16355847Sbostic 	g->nplus = 0;
16455847Sbostic 
16555847Sbostic 	/* do it */
16655847Sbostic 	EMIT(OEND, 0);
16755847Sbostic 	g->firststate = THERE();
16855847Sbostic 	if (cflags&REG_EXTENDED)
16955847Sbostic 		p_ere(p, '\0');
17055847Sbostic 	else
17155847Sbostic 		p_bre(p, '\0', '\0');
17255847Sbostic 	EMIT(OEND, 0);
17355847Sbostic 	g->laststate = THERE();
17455847Sbostic 
17555847Sbostic 	/* tidy up loose ends and fill things in */
17655847Sbostic 	categorize(p, g);
17755847Sbostic 	stripsnug(p, g);
17855847Sbostic 	findmust(p, g);
17955847Sbostic 	g->nplus = pluscount(p, g);
18055847Sbostic 	g->magic = MAGIC2;
18155847Sbostic 	preg->re_nsub = g->nsub;
18255847Sbostic 	preg->re_g = g;
18355847Sbostic 	preg->re_magic = MAGIC1;
18456355Sbostic #ifndef REDEBUG
18555847Sbostic 	/* not debugging, so can't rely on the assert() in regexec() */
18655847Sbostic 	if (g->iflags&BAD)
18755847Sbostic 		SETERROR(REG_ASSERT);
18855847Sbostic #endif
18955847Sbostic 
19055847Sbostic 	/* win or lose, we're done */
19155847Sbostic 	if (p->error != 0)	/* lose */
19255847Sbostic 		regfree(preg);
19355847Sbostic 	return(p->error);
19455847Sbostic }
19555847Sbostic 
19655847Sbostic /*
19755847Sbostic  - p_ere - ERE parser top level, concatenation and alternation
19855847Sbostic  */
19955847Sbostic static void
20055847Sbostic p_ere(p, stop)
20155847Sbostic register struct parse *p;
202*56357Sbostic u_int stop;			/* character this ERE should end at */
20355847Sbostic {
20455847Sbostic 	register uchar c;
20555847Sbostic 	register sopno prevback;
20655847Sbostic 	register sopno prevfwd;
20755847Sbostic 	register sopno conc;
20855847Sbostic 	register int first = 1;		/* is this the first alternative? */
20955847Sbostic 
21055847Sbostic 	for (;;) {
21155847Sbostic 		/* do a bunch of concatenated expressions */
21255847Sbostic 		conc = HERE();
21355847Sbostic 		while ((c = PEEK()) != '|' && c != stop && c != '\0')
21455847Sbostic 			p_ere_exp(p);
21555847Sbostic 		REQUIRE(HERE() != conc, REG_EMPTY);	/* require nonempty */
21655847Sbostic 
21755847Sbostic 		if (!EAT('|'))
21855847Sbostic 			break;		/* NOTE BREAK OUT */
21955847Sbostic 
22055847Sbostic 		if (first) {
22155847Sbostic 			INSERT(OCH_, conc);	/* offset is wrong */
22255847Sbostic 			prevfwd = conc;
22355847Sbostic 			prevback = conc;
22455847Sbostic 			first = 0;
22555847Sbostic 		}
22655847Sbostic 		BACK(OOR1, prevback);
22755847Sbostic 		prevback = THERE();
22855847Sbostic 		FWD(prevfwd);			/* fix previous offset */
22955847Sbostic 		prevfwd = HERE();
23055847Sbostic 		EMIT(OOR2, 0);			/* offset is very wrong */
23155847Sbostic 	}
23255847Sbostic 
23355847Sbostic 	if (!first) {		/* tail-end fixups */
23455847Sbostic 		FWD(prevfwd);
23555847Sbostic 		BACK(O_CH, prevback);
23655847Sbostic 	}
23755847Sbostic 
23855847Sbostic 	assert(SEE(stop) || SEE('\0'));
23955847Sbostic }
24055847Sbostic 
24155847Sbostic /*
24255847Sbostic  - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
24355847Sbostic  */
24455847Sbostic static void
24555847Sbostic p_ere_exp(p)
24655847Sbostic register struct parse *p;
24755847Sbostic {
24855847Sbostic 	register uchar c;
24955847Sbostic 	register sopno pos;
25055847Sbostic 	register int count;
25155847Sbostic 	register int count2;
25255847Sbostic 	register sopno subno;
25355847Sbostic 	int wascaret = 0;
25455847Sbostic 	/* we call { a repetition if followed by a digit */
25555847Sbostic #	define	ISRPT(c1, c2)	(c1 == '*' || c1 == '+' || c1 == '?' || \
25655847Sbostic 						(c1 == '{' && isdigit(c2)))
25755847Sbostic 
25855847Sbostic 	c = GETNEXT();
25955847Sbostic 	assert(c != '\0');	/* caller should have ensured this */
26055847Sbostic 
26155847Sbostic 	pos = HERE();
26255847Sbostic 	switch (c) {
26355847Sbostic 	case '(':
26455847Sbostic 		MUSTNOTSEE('\0', REG_EPAREN);
26555847Sbostic 		p->g->nsub++;
26655847Sbostic 		subno = p->g->nsub;
26755847Sbostic 		if (subno < NPAREN)
26855847Sbostic 			p->pbegin[subno] = HERE();
26955847Sbostic 		EMIT(OLPAREN, subno);
27055847Sbostic 		if (!SEE(')'))
27155847Sbostic 			p_ere(p, ')');
27255847Sbostic 		if (subno < NPAREN) {
27355847Sbostic 			p->pend[subno] = HERE();
27455847Sbostic 			assert(p->pend[subno] != 0);
27555847Sbostic 		}
27655847Sbostic 		EMIT(ORPAREN, subno);
27755847Sbostic 		MUSTEAT(')', REG_EPAREN);
27855847Sbostic 		break;
27955847Sbostic #ifndef POSIX_MISTAKE
28055847Sbostic 	case ')':		/* happens only if no current unmatched ( */
28155847Sbostic 		/*
28255847Sbostic 		 * You may ask, why the ifndef?  Because I didn't notice
28355847Sbostic 		 * this until slightly too late for 1003.2, and none of the
28455847Sbostic 		 * other 1003.2 regular-expression reviewers noticed it at
28555847Sbostic 		 * all.  So an unmatched ) is legal POSIX, at least until
28655847Sbostic 		 * we can get it fixed.
28755847Sbostic 		 */
28855847Sbostic 		SETERROR(REG_EPAREN);
28955847Sbostic 		break;
29055847Sbostic #endif
29155847Sbostic 	case '^':
29255847Sbostic 		EMIT(OBOL, 0);
29355847Sbostic 		p->g->iflags |= USEBOL;
29455847Sbostic 		wascaret = 1;
29555847Sbostic 		break;
29655847Sbostic 	case '$':
29755847Sbostic 		EMIT(OEOL, 0);
29855847Sbostic 		p->g->iflags |= USEEOL;
29955847Sbostic 		break;
30055847Sbostic 	case '|':
30155847Sbostic 		SETERROR(REG_EMPTY);
30255847Sbostic 		break;
30355847Sbostic 	case '*':
30455847Sbostic 	case '+':
30555847Sbostic 	case '?':
30655847Sbostic 		SETERROR(REG_BADRPT);
30755847Sbostic 		break;
30855847Sbostic 	case '.':
30955847Sbostic 		if (p->g->cflags&REG_NEWLINE)
31055847Sbostic 			nonnewline(p);
31155847Sbostic 		else
31255847Sbostic 			EMIT(OANY, 0);
31355847Sbostic 		break;
31455847Sbostic 	case '[':
31555847Sbostic 		p_bracket(p);
31655847Sbostic 		break;
31755847Sbostic 	case '\\':
31855847Sbostic 		c = GETNEXT();
31956355Sbostic #ifdef xxx
32055847Sbostic 		if (c == '^' || c == '.' || c == '[' || c == '$' ||
32155847Sbostic 				c == '(' || c == ')' || c == '|' ||
32255847Sbostic 				c == '*' || c == '+' || c == '?' ||
32355847Sbostic 				c == '{' || c == '\\')
32456355Sbostic #else
32556355Sbostic 		if (c != '\0')
32656355Sbostic #endif
32755847Sbostic 			ordinary(p, c);
32855847Sbostic 		else
32955847Sbostic 			SETERROR(REG_EESCAPE);
33055847Sbostic 		break;
33155847Sbostic 	case '{':		/* okay as ordinary except if digit follows */
33255847Sbostic 		REQUIRE(!isdigit(PEEK()), REG_BADRPT);
33355847Sbostic 		/* FALLTHROUGH */
33455847Sbostic 	default:
33555847Sbostic 		ordinary(p, c);
33655847Sbostic 		break;
33755847Sbostic 	}
33855847Sbostic 
33955847Sbostic 	c = PEEK();
34055847Sbostic 	if (!ISRPT(c, PEEK2()))
34155847Sbostic 		return;		/* no repetition, we're done */
34255847Sbostic 	NEXT();
34355847Sbostic 
34455847Sbostic 	REQUIRE(!wascaret, REG_BADRPT);
34555847Sbostic 	switch (c) {
34655847Sbostic 	case '*':	/* implemented as +? */
34755847Sbostic 		INSERT(OPLUS_, pos);
34855847Sbostic 		BACK(O_PLUS, pos);
34955847Sbostic 		INSERT(OQUEST_, pos);
35055847Sbostic 		BACK(O_QUEST, pos);
35155847Sbostic 		break;
35255847Sbostic 	case '+':
35355847Sbostic 		INSERT(OPLUS_, pos);
35455847Sbostic 		BACK(O_PLUS, pos);
35555847Sbostic 		break;
35655847Sbostic 	case '?':
35755847Sbostic 		INSERT(OQUEST_, pos);
35855847Sbostic 		BACK(O_QUEST, pos);
35955847Sbostic 		break;
36055847Sbostic 	case '{':
36155847Sbostic 		count = p_count(p);
36255847Sbostic 		if (EAT(',')) {
36355847Sbostic 			if (isdigit(PEEK())) {
36455847Sbostic 				count2 = p_count(p);
36555847Sbostic 				REQUIRE(count <= count2, REG_BADBR);
36655847Sbostic 			} else		/* single number with comma */
36755847Sbostic 				count2 = INFINITY;
36855847Sbostic 		} else		/* just a single number */
36955847Sbostic 			count2 = count;
37055847Sbostic 		repeat(p, pos, count, count2);
37155847Sbostic 		if (!EAT('}')) {	/* error heuristics */
37255847Sbostic 			while ((c = PEEK()) != '\0' && c != '}')
37355847Sbostic 				NEXT();
37455847Sbostic 			if (c == '\0')
37555847Sbostic 				SETERROR(REG_EBRACE);
37655847Sbostic 			else
37755847Sbostic 				SETERROR(REG_BADBR);
37855847Sbostic 		}
37955847Sbostic 		break;
38055847Sbostic 	}
38155847Sbostic 
38255847Sbostic 	c = PEEK();
38355847Sbostic 	REQUIRE(!ISRPT(c, PEEK2()), REG_BADRPT);
38455847Sbostic }
38555847Sbostic 
38655847Sbostic /*
38755847Sbostic  - p_bre - BRE parser top level, anchoring and concatenation
38856355Sbostic  * Giving end1 as '\0' essentially eliminates the end1/end2 check.
38955847Sbostic  *
39056355Sbostic  * This implementation is a bit of a kludge, in that a trailing $ is first
39156355Sbostic  * taken as an ordinary character and then revised to be an anchor.  The
39256355Sbostic  * only undesirable side effect is that '$' gets included as a character
39356355Sbostic  * category in such cases.  This is fairly harmless; not worth fixing.
39456355Sbostic  * The amount of lookahead needed to avoid this kludge is excessive,
39556355Sbostic  * especially since things like "$*" appear to be legal. xxx
39655847Sbostic  */
39755847Sbostic static void
39855847Sbostic p_bre(p, end1, end2)
39955847Sbostic register struct parse *p;
400*56357Sbostic register u_int end1;		/* first terminating character */
401*56357Sbostic register u_int end2;		/* second terminating character */
40255847Sbostic {
40355847Sbostic 	register sopno start = HERE();
40455847Sbostic 	register int first = 1;			/* first subexpression? */
40556355Sbostic 	register int wasdollar = 0;
40655847Sbostic 
40755847Sbostic 	if (EAT('^')) {
40855847Sbostic 		EMIT(OBOL, 0);
40955847Sbostic 		p->g->iflags |= USEBOL;
41055847Sbostic 	}
41155847Sbostic 	while (!SEE('\0') && !SEETWO(end1, end2)) {
41255847Sbostic 		wasdollar = p_simp_re(p, first);
41355847Sbostic 		first = 0;
41455847Sbostic 	}
41555847Sbostic 	if (wasdollar) {	/* oops, that was a trailing anchor */
41655847Sbostic 		DROP(1);
41755847Sbostic 		EMIT(OEOL, 0);
41855847Sbostic 		p->g->iflags |= USEEOL;
41955847Sbostic 	}
42055847Sbostic 
42155847Sbostic 	REQUIRE(HERE() != start, REG_EMPTY);	/* require nonempty */
42255847Sbostic }
42355847Sbostic 
42455847Sbostic /*
42555847Sbostic  - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
42655847Sbostic  */
42755847Sbostic static int			/* was the simple RE an unbackslashed $? */
42855847Sbostic p_simp_re(p, starordinary)
42955847Sbostic register struct parse *p;
43055847Sbostic int starordinary;		/* is a leading * an ordinary character? */
43155847Sbostic {
43255847Sbostic 	register int c;
43355847Sbostic 	register int count;
43455847Sbostic 	register int count2;
43555847Sbostic 	register sopno pos;
43655847Sbostic 	register int i;
43755847Sbostic 	register sopno subno;
43855847Sbostic #	define	BACKSL	(1<<CHAR_BIT)
43955847Sbostic 
44055847Sbostic 	pos = HERE();		/* repetion op, if any, covers from here */
44155847Sbostic 
44255847Sbostic 	c = GETNEXT();
44355847Sbostic 	assert(c != '\0');	/* caller should have ensured this */
44455847Sbostic 	if (c == '\\')
44555847Sbostic 		c = BACKSL | PEEK();
44655847Sbostic 	switch (c) {
44755847Sbostic 	case '.':
44855847Sbostic 		if (p->g->cflags&REG_NEWLINE)
44955847Sbostic 			nonnewline(p);
45055847Sbostic 		else
45155847Sbostic 			EMIT(OANY, 0);
45255847Sbostic 		break;
45355847Sbostic 	case '[':
45455847Sbostic 		p_bracket(p);
45555847Sbostic 		break;
45655847Sbostic 	case BACKSL|'{':
45755847Sbostic 		SETERROR(REG_BADRPT);
45855847Sbostic 		break;
45955847Sbostic 	case BACKSL|'(':
46055847Sbostic 		NEXT();
46155847Sbostic 		p->g->nsub++;
46255847Sbostic 		subno = p->g->nsub;
46355847Sbostic 		if (subno < NPAREN)
46455847Sbostic 			p->pbegin[subno] = HERE();
46555847Sbostic 		EMIT(OLPAREN, subno);
46655847Sbostic 		/* the SEE here is an error heuristic */
46755847Sbostic 		if (!SEE('\0') && !SEETWO('\\', ')'))
46855847Sbostic 			p_bre(p, '\\', ')');
46955847Sbostic 		if (subno < NPAREN) {
47055847Sbostic 			p->pend[subno] = HERE();
47155847Sbostic 			assert(p->pend[subno] != 0);
47255847Sbostic 		}
47355847Sbostic 		EMIT(ORPAREN, subno);
47455847Sbostic 		REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
47555847Sbostic 		break;
47655847Sbostic 	case BACKSL|')':	/* should not get here -- must be user */
47755847Sbostic 	case BACKSL|'}':
47855847Sbostic 		SETERROR(REG_EPAREN);
47955847Sbostic 		break;
48055847Sbostic 	case BACKSL|'1':
48155847Sbostic 	case BACKSL|'2':
48255847Sbostic 	case BACKSL|'3':
48355847Sbostic 	case BACKSL|'4':
48455847Sbostic 	case BACKSL|'5':
48555847Sbostic 	case BACKSL|'6':
48655847Sbostic 	case BACKSL|'7':
48755847Sbostic 	case BACKSL|'8':
48855847Sbostic 	case BACKSL|'9':
48955847Sbostic 		i = (c&~BACKSL) - '0';
49055847Sbostic 		assert(i < NPAREN);
49155847Sbostic 		if (p->pend[i] != 0) {
49255847Sbostic 			assert(i <= p->g->nsub);
49355847Sbostic 			EMIT(OBACK_, i);
49455847Sbostic 			assert(p->pbegin[i] != 0);
49555847Sbostic 			assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
49655847Sbostic 			assert(OP(p->strip[p->pend[i]]) == ORPAREN);
49755847Sbostic 			(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
49855847Sbostic 			EMIT(O_BACK, i);
49955847Sbostic 		} else
50055847Sbostic 			SETERROR(REG_ESUBREG);
50155847Sbostic 		p->g->backrefs = 1;
50255847Sbostic 		NEXT();
50355847Sbostic 		break;
50456355Sbostic 	case BACKSL|'\0':
50556355Sbostic 		SETERROR(REG_EESCAPE);
50656355Sbostic 		break;
50755847Sbostic 	case '*':
50855847Sbostic 		REQUIRE(starordinary, REG_BADRPT);
50955847Sbostic 		/* FALLTHROUGH */
51055847Sbostic 	default:
51156355Sbostic 		if (c & BACKSL)
51256355Sbostic 			c = GETNEXT();
51355847Sbostic 		ordinary(p, (uchar)c);
51455847Sbostic 		break;
51555847Sbostic 	}
51655847Sbostic 
51755847Sbostic 	if (EAT('*')) {		/* implemented as +? */
51855847Sbostic 		INSERT(OPLUS_, pos);
51955847Sbostic 		BACK(O_PLUS, pos);
52055847Sbostic 		INSERT(OQUEST_, pos);
52155847Sbostic 		BACK(O_QUEST, pos);
52255847Sbostic 	} else if (EATTWO('\\', '{')) {
52355847Sbostic 		count = p_count(p);
52455847Sbostic 		if (EAT(',')) {
52555847Sbostic 			if (isdigit(PEEK())) {
52655847Sbostic 				count2 = p_count(p);
52755847Sbostic 				REQUIRE(count <= count2, REG_BADBR);
52855847Sbostic 			} else		/* single number with comma */
52955847Sbostic 				count2 = INFINITY;
53055847Sbostic 		} else		/* just a single number */
53155847Sbostic 			count2 = count;
53255847Sbostic 		repeat(p, pos, count, count2);
53355847Sbostic 		if (!EATTWO('\\', '}')) {	/* error heuristics */
53455847Sbostic 			while (!SEE('\0') && !SEETWO('\\', '}'))
53555847Sbostic 				NEXT();
53655847Sbostic 			if (SEE('\0'))
53755847Sbostic 				SETERROR(REG_EBRACE);
53855847Sbostic 			else
53955847Sbostic 				SETERROR(REG_BADBR);
54055847Sbostic 		}
54155847Sbostic 	} else if (c == '$')	/* unbackslashed $ not followed by reptn */
54255847Sbostic 		return(1);
54355847Sbostic 
54455847Sbostic 	return(0);
54555847Sbostic }
54655847Sbostic 
54755847Sbostic /*
54855847Sbostic  - p_count - parse a repetition count
54955847Sbostic  */
55055847Sbostic static int			/* the value */
55155847Sbostic p_count(p)
55255847Sbostic register struct parse *p;
55355847Sbostic {
55455847Sbostic 	register int count = 0;
55555847Sbostic 	register int ndigits = 0;
55655847Sbostic 
55755847Sbostic 	while (isdigit(PEEK()) && count <= DUPMAX) {
55855847Sbostic 		count = count*10 + (GETNEXT() - '0');
55955847Sbostic 		ndigits++;
56055847Sbostic 	}
56155847Sbostic 
56255847Sbostic 	REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
56355847Sbostic 	return(count);
56455847Sbostic }
56555847Sbostic 
56655847Sbostic /*
56755847Sbostic  - p_bracket - parse a bracketed character list
56855847Sbostic  *
56955847Sbostic  * Note a significant property of this code:  if the allocset() did SETERROR,
57055847Sbostic  * no set operations are done.
57155847Sbostic  */
57255847Sbostic static void
57355847Sbostic p_bracket(p)
57455847Sbostic register struct parse *p;
57555847Sbostic {
57655847Sbostic 	register uchar c;
57755847Sbostic 	register cset *cs = allocset(p);
57855847Sbostic 	register int invert = 0;
57955847Sbostic 
58055847Sbostic 	if (EAT('^'))
58155847Sbostic 		invert++;	/* make note to invert set at end */
58255847Sbostic 	if (EAT(']'))
58355847Sbostic 		CHadd(cs, ']');
58455847Sbostic 	while ((c = PEEK()) != '\0' && c != ']' && !SEETWO('-', ']'))
58555847Sbostic 		p_b_term(p, cs);
58655847Sbostic 	if (EAT('-'))
58755847Sbostic 		CHadd(cs, '-');
58855847Sbostic 	MUSTEAT(']', REG_EBRACK);
58955847Sbostic 
59055847Sbostic 	if (invert && p->error == 0) {
59155847Sbostic 		register int i;
59255847Sbostic 
59355847Sbostic 		for (i = p->g->csetsize - 1; i >= 0; i--)
59455847Sbostic 			if (CHIN(cs, i))
59555847Sbostic 				CHsub(cs, i);
59655847Sbostic 			else
59755847Sbostic 				CHadd(cs, i);
59855847Sbostic 		if (p->g->cflags&REG_NEWLINE)
59955847Sbostic 			CHsub(cs, '\n');
60055847Sbostic 		if (cs->multis != NULL)
60155847Sbostic 			mcinvert(p, cs);
60255847Sbostic 	}
60355847Sbostic 	assert(cs->multis == NULL);		/* xxx */
60455847Sbostic 	EMIT(OANYOF, freezeset(p, cs));
60555847Sbostic }
60655847Sbostic 
60755847Sbostic /*
60855847Sbostic  - p_b_term - parse one term of a bracketed character list
60955847Sbostic  */
61055847Sbostic static void
61155847Sbostic p_b_term(p, cs)
61255847Sbostic register struct parse *p;
61355847Sbostic register cset *cs;
61455847Sbostic {
61555847Sbostic 	register uchar c;
61655847Sbostic 	register uchar start, finish;
61755847Sbostic 	register int i;
61855847Sbostic 
61955847Sbostic 	/* classify what we've got */
62055847Sbostic 	switch (PEEK()) {
62155847Sbostic 	case '[':
62255847Sbostic 		c = PEEK2();
62355847Sbostic 		break;
62455847Sbostic 	case '-':
62555847Sbostic 		SETERROR(REG_ERANGE);
62655847Sbostic 		return;			/* NOTE RETURN */
62755847Sbostic 		break;
62855847Sbostic 	default:
62955847Sbostic 		c = '\0';
63055847Sbostic 		break;
63155847Sbostic 	}
63255847Sbostic 
63355847Sbostic 	switch (c) {
63455847Sbostic 	case ':':		/* character class */
63555847Sbostic 		NEXT2();
63655847Sbostic 		c = PEEK();
63755847Sbostic 		REQUIRE(c != '\0', REG_EBRACK);
63855847Sbostic 		REQUIRE(c != '-' && c != ']', REG_ECTYPE);
63955847Sbostic 		p_b_cclass(p, cs);
64055847Sbostic 		MUSTNOTSEE('\0', REG_EBRACK);
64155847Sbostic 		REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
64255847Sbostic 		break;
64355847Sbostic 	case '=':		/* equivalence class */
64455847Sbostic 		NEXT2();
64555847Sbostic 		c = PEEK();
64655847Sbostic 		REQUIRE(c != '\0', REG_EBRACK);
64755847Sbostic 		REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
64855847Sbostic 		p_b_eclass(p, cs);
64955847Sbostic 		MUSTNOTSEE('\0', REG_EBRACK);
65055847Sbostic 		REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
65155847Sbostic 		break;
65255847Sbostic 	default:		/* symbol, ordinary character, or range */
65355847Sbostic /* xxx revision needed for multichar stuff */
65455847Sbostic 		start = p_b_symbol(p);
65555847Sbostic 		if (PEEK() == '-' && (c = PEEK2()) != ']' && c != '\0') {
65655847Sbostic 			/* range */
65755847Sbostic 			NEXT();
65855847Sbostic 			if (EAT('-'))
65955847Sbostic 				finish = '-';
66055847Sbostic 			else
66155847Sbostic 				finish = p_b_symbol(p);
66255847Sbostic 		} else
66355847Sbostic 			finish = start;
66455847Sbostic 		REQUIRE(start <= finish, REG_ERANGE);
66555847Sbostic 		for (i = start; i <= finish; i++) {
66655847Sbostic 			CHadd(cs, i);
66755847Sbostic 			if ((p->g->cflags&REG_ICASE) && isalpha(i)) {
66855847Sbostic 				c = othercase((uchar)i);
66955847Sbostic 				CHadd(cs, c);
67055847Sbostic 			}
67155847Sbostic 		}
67255847Sbostic 		break;
67355847Sbostic 	}
67455847Sbostic }
67555847Sbostic 
67655847Sbostic /*
67755847Sbostic  - p_b_cclass - parse a character-class name and deal with it
67855847Sbostic  */
67955847Sbostic static void
68055847Sbostic p_b_cclass(p, cs)
68155847Sbostic register struct parse *p;
68255847Sbostic register cset *cs;
68355847Sbostic {
68455847Sbostic 	register uchar *sb = p->next;
68555847Sbostic 	register uchar *se = sb;
68655847Sbostic 	register struct cclass *cp;
68755847Sbostic 	register int len;
68855847Sbostic 	register uchar *u;
68955847Sbostic 	register uchar c;
69055847Sbostic 
69155847Sbostic 	while (isalpha(*se))
69255847Sbostic 		se++;
69355847Sbostic 	len = se - sb;
69455847Sbostic 	NEXTn(len);
69555847Sbostic 	for (cp = cclasses; cp->name != NULL; cp++)
69655847Sbostic 		if (strncmp(cp->name, (char *)sb, len) == 0 &&
69756355Sbostic 							cp->name[len] == '\0')
69855847Sbostic 			break;
69955847Sbostic 	if (cp->name == NULL) {
70055847Sbostic 		/* oops, didn't find it */
70155847Sbostic 		SETERROR(REG_ECTYPE);
70255847Sbostic 		return;
70355847Sbostic 	}
70455847Sbostic 
70555847Sbostic 	u = (uchar *)cp->chars;
70655847Sbostic 	while ((c = *u++) != '\0')
70755847Sbostic 		CHadd(cs, c);
70855847Sbostic 	for (u = (uchar *)cp->multis; *u != '\0'; u += strlen((char *)u) + 1)
70955847Sbostic 		MCadd(cs, u);
71055847Sbostic }
71155847Sbostic 
71255847Sbostic /*
71355847Sbostic  - p_b_eclass - parse an equivalence-class name and deal with it
71455847Sbostic  *
71555847Sbostic  * This implementation is incomplete. xxx
71655847Sbostic  */
71755847Sbostic static void
71855847Sbostic p_b_eclass(p, cs)
71955847Sbostic register struct parse *p;
72055847Sbostic register cset *cs;
72155847Sbostic {
72255847Sbostic 	register uchar c;
72355847Sbostic 
72455847Sbostic 	c = p_b_coll_elem(p, '=');
72555847Sbostic 	CHadd(cs, c);
72655847Sbostic }
72755847Sbostic 
72855847Sbostic /*
72955847Sbostic  - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
73055847Sbostic  */
73155847Sbostic static uchar			/* value of symbol */
73255847Sbostic p_b_symbol(p)
73355847Sbostic register struct parse *p;
73455847Sbostic {
73555847Sbostic 	register uchar value;
73655847Sbostic 
73755847Sbostic 	if (!EATTWO('[', '.')) {
73855847Sbostic 		MUSTNOTSEE('\0', REG_EBRACK);
73955847Sbostic 		return(GETNEXT());
74055847Sbostic 	}
74155847Sbostic 
74255847Sbostic 	/* collating symbol */
74355847Sbostic 	MUSTNOTSEE('\0', REG_EBRACK);
74455847Sbostic 	value = p_b_coll_elem(p, '.');
74555847Sbostic 	REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
74655847Sbostic 	return(value);
74755847Sbostic }
74855847Sbostic 
74955847Sbostic /*
75055847Sbostic  - p_b_coll_elem - parse a collating-element name and look it up
75155847Sbostic  */
75255847Sbostic static uchar			/* value of collating element */
75355847Sbostic p_b_coll_elem(p, endc)
75455847Sbostic register struct parse *p;
755*56357Sbostic u_int endc;			/* name ended by endc,']' */
75655847Sbostic {
75755847Sbostic 	register uchar *sp = p->next;
75855847Sbostic 	register struct cname *cp;
75955847Sbostic 	register int len;
76055847Sbostic 	register uchar c;
76155847Sbostic 
76255847Sbostic 	while ((c = PEEK()) != '\0' && !SEETWO(endc, ']'))
76355847Sbostic 		NEXT();
76455847Sbostic 	if (c == '\0') {
76555847Sbostic 		SETERROR(REG_EBRACK);
76655847Sbostic 		return(0);
76755847Sbostic 	}
76855847Sbostic 	len = p->next - sp;
76955847Sbostic 	for (cp = cnames; cp->name != NULL; cp++)
77055847Sbostic 		if (strncmp(cp->name, (char *)sp, len) == 0 &&
77156355Sbostic 							cp->name[len] == '\0')
77255847Sbostic 			return(cp->code);	/* known name */
77355847Sbostic 	if (len == 1)
77455847Sbostic 		return(*sp);	/* single character */
77555847Sbostic 	SETERROR(REG_ECOLLATE);			/* neither */
77655847Sbostic 	return(0);
77755847Sbostic }
77855847Sbostic 
77955847Sbostic /*
78055847Sbostic  - othercase - return the case counterpart of an alphabetic
78155847Sbostic  */
78255847Sbostic static uchar
78355847Sbostic othercase(ch)
784*56357Sbostic u_int ch;
78555847Sbostic {
78655847Sbostic 	assert(isalpha(ch));
78755847Sbostic 	if (isupper(ch))
78855847Sbostic 		return(tolower(ch));
78955847Sbostic 	else if (islower(ch))
79055847Sbostic 		return(toupper(ch));
79155847Sbostic 	else			/* peculiar, but could happen */
79255847Sbostic 		return(ch);
79355847Sbostic }
79455847Sbostic 
79555847Sbostic /*
79655847Sbostic  - bothcases - emit a dualcase version of a character
79756355Sbostic  *
79855847Sbostic  * Boy, is this implementation ever a kludge...
79955847Sbostic  */
80055847Sbostic static void
80155847Sbostic bothcases(p, ch)
80255847Sbostic register struct parse *p;
803*56357Sbostic u_int ch;
80455847Sbostic {
80555847Sbostic 	register uchar *oldnext;
80655847Sbostic 	uchar bracket[3];
80755847Sbostic 
80855847Sbostic 	oldnext = p->next;
80955847Sbostic 	p->next = bracket;
81055847Sbostic 	bracket[0] = ch;
81155847Sbostic 	bracket[1] = ']';
81255847Sbostic 	bracket[2] = '\0';
81355847Sbostic 	p_bracket(p);
81455847Sbostic 	assert(p->next == bracket+2);
81555847Sbostic 	p->next = oldnext;
81655847Sbostic }
81755847Sbostic 
81855847Sbostic /*
81955847Sbostic  - ordinary - emit an ordinary character
82055847Sbostic  */
82155847Sbostic static void
82255847Sbostic ordinary(p, ch)
82355847Sbostic register struct parse *p;
824*56357Sbostic register u_int ch;
82555847Sbostic {
82655847Sbostic 	register uchar *cap = p->g->categories;
82755847Sbostic 
82855847Sbostic 	if ((p->g->cflags&REG_ICASE) && isalpha(ch)) {
82955847Sbostic 		bothcases(p, ch);
83055847Sbostic 		return;
83155847Sbostic 	}
83255847Sbostic 
83355847Sbostic 	EMIT(OCHAR, ch);
83455847Sbostic 	if (cap[ch] == 0)
83555847Sbostic 		cap[ch] = p->g->ncategories++;
83655847Sbostic }
83755847Sbostic 
83855847Sbostic /*
83955847Sbostic  - nonnewline - emit REG_NEWLINE version of OANY
84056355Sbostic  *
84155847Sbostic  * Boy, is this implementation ever a kludge...
84255847Sbostic  */
84355847Sbostic static void
84455847Sbostic nonnewline(p)
84555847Sbostic register struct parse *p;
84655847Sbostic {
84755847Sbostic 	register uchar *oldnext;
84855847Sbostic 	uchar bracket[4];
84955847Sbostic 
85055847Sbostic 	oldnext = p->next;
85155847Sbostic 	p->next = bracket;
85255847Sbostic 	bracket[0] = '^';
85355847Sbostic 	bracket[1] = '\n';
85455847Sbostic 	bracket[2] = ']';
85555847Sbostic 	bracket[3] = '\0';
85655847Sbostic 	p_bracket(p);
85755847Sbostic 	assert(p->next == bracket+3);
85855847Sbostic 	p->next = oldnext;
85955847Sbostic }
86055847Sbostic 
86155847Sbostic /*
86255847Sbostic  - repeat - generate code for a bounded repetition, recursively if needed
86355847Sbostic  */
86455847Sbostic static void
86555847Sbostic repeat(p, start, from, to)
86655847Sbostic register struct parse *p;
86755847Sbostic sopno start;			/* operand from here to end of strip */
86855847Sbostic int from;			/* repeated from this number */
86955847Sbostic int to;				/* to this number of times (maybe INFINITY) */
87055847Sbostic {
87155847Sbostic 	register sopno finish = HERE();
87255847Sbostic #	define	N	2
87355847Sbostic #	define	INF	3
87455847Sbostic #	define	REP(f, t)	((f)*8 + (t))
87555847Sbostic #	define	MAP(n)	(((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
87655847Sbostic 	register sopno copy;
87755847Sbostic 
87855847Sbostic 	if (p->error != 0)	/* head off possible runaway recursion */
87955847Sbostic 		return;
88055847Sbostic 
88155847Sbostic 	assert(from <= to);
88255847Sbostic 
88355847Sbostic 	switch (REP(MAP(from), MAP(to))) {
88455847Sbostic 	case REP(0, 0):			/* must be user doing this */
88555847Sbostic 		DROP(finish-start);	/* drop the operand */
88655847Sbostic 		break;
88755847Sbostic 	case REP(0, 1):			/* as x{1,1}? */
88855847Sbostic 	case REP(0, N):			/* as x{1,n}? */
88955847Sbostic 	case REP(0, INF):		/* as x{1,}? */
89055847Sbostic 		INSERT(OQUEST_, start);		/* offset is wrong... */
89155847Sbostic 		repeat(p, start+1, 1, to);
89255847Sbostic 		FWD(start);			/* ... fix it */
89355847Sbostic 		BACK(O_QUEST, start);
89455847Sbostic 		break;
89555847Sbostic 	case REP(1, 1):			/* trivial case */
89655847Sbostic 		/* done */
89755847Sbostic 		break;
89855847Sbostic 	case REP(1, N):			/* as x?x{1,n-1} */
89955847Sbostic 		INSERT(OQUEST_, start);
90055847Sbostic 		BACK(O_QUEST, start);
90155847Sbostic 		copy = dupl(p, start+1, finish+1);
90255847Sbostic 		assert(copy == finish+2);
90355847Sbostic 		repeat(p, copy, 1, to-1);
90455847Sbostic 		break;
90555847Sbostic 	case REP(1, INF):		/* as x+ */
90655847Sbostic 		INSERT(OPLUS_, start);
90755847Sbostic 		BACK(O_PLUS, start);
90855847Sbostic 		break;
90955847Sbostic 	case REP(N, N):			/* as xx{m-1,n-1} */
91055847Sbostic 		copy = dupl(p, start, finish);
91155847Sbostic 		repeat(p, copy, from-1, to-1);
91255847Sbostic 		break;
91355847Sbostic 	case REP(N, INF):		/* as xx{n-1,INF} */
91455847Sbostic 		copy = dupl(p, start, finish);
91555847Sbostic 		repeat(p, copy, from-1, to);
91655847Sbostic 		break;
91755847Sbostic 	default:			/* "can't happen" */
91855847Sbostic 		SETERROR(REG_ASSERT);	/* just in case */
91955847Sbostic 		break;
92055847Sbostic 	}
92155847Sbostic }
92255847Sbostic 
92355847Sbostic /*
92455847Sbostic  - seterr - set an error condition
92555847Sbostic  */
92655847Sbostic static int			/* useless but makes type checking happy */
92755847Sbostic seterr(p, e)
92855847Sbostic register struct parse *p;
92955847Sbostic int e;
93055847Sbostic {
93155847Sbostic 	if (p->error == 0)	/* keep earliest error condition */
93255847Sbostic 		p->error = e;
93355847Sbostic 	p->next = nuls;		/* try to bring things to a halt */
93455847Sbostic 	return(0);		/* make the return value well-defined */
93555847Sbostic }
93655847Sbostic 
93755847Sbostic /*
93855847Sbostic  - allocset - allocate a set of characters for []
93955847Sbostic  */
94055847Sbostic static cset *
94155847Sbostic allocset(p)
94255847Sbostic register struct parse *p;
94355847Sbostic {
94455847Sbostic 	register int no = p->g->ncsets++;
94555847Sbostic 	register size_t nc;
94655847Sbostic 	register size_t nbytes;
94755847Sbostic 	register cset *cs;
94855847Sbostic 	register size_t css = (size_t)p->g->csetsize;
94955847Sbostic 
95055847Sbostic 	if (no >= p->ncsalloc) {	/* need another column of space */
95155847Sbostic 		p->ncsalloc += CHAR_BIT;
95255847Sbostic 		nc = p->ncsalloc;
95355847Sbostic 		assert(nc % CHAR_BIT == 0);
95455847Sbostic 		nbytes = nc / CHAR_BIT * css;
95555847Sbostic 		if (p->g->sets == NULL)
95655847Sbostic 			p->g->sets = (cset *)malloc(nc * sizeof(cset));
95755847Sbostic 		else
95855847Sbostic 			p->g->sets = (cset *)realloc((char *)p->g->sets,
95955847Sbostic 							nc * sizeof(cset));
96055847Sbostic 		if (p->g->setbits == NULL)
96155847Sbostic 			p->g->setbits = (uchar *)malloc(nbytes);
96255847Sbostic 		else
96355847Sbostic 			p->g->setbits = (uchar *)realloc((char *)p->g->setbits,
96455847Sbostic 								nbytes);
96555847Sbostic 		if (p->g->sets != NULL && p->g->setbits != NULL)
96655847Sbostic 			(void) memset((char *)p->g->setbits + (nbytes - css),
96755847Sbostic 								0, css);
96855847Sbostic 		else {
96955847Sbostic 			no = 0;
97055847Sbostic 			SETERROR(REG_ESPACE);
97155847Sbostic 			/* caller's responsibility not to do set ops */
97255847Sbostic 		}
97355847Sbostic 	}
97455847Sbostic 
97555847Sbostic 	assert(p->g->sets != NULL);	/* xxx */
97655847Sbostic 	cs = &p->g->sets[no];
97755847Sbostic 	cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
97855847Sbostic 	cs->mask = 1 << ((no) % CHAR_BIT);
97955847Sbostic 	cs->hash = 0;
98055847Sbostic 	cs->smultis = 0;
98155847Sbostic 	cs->multis = NULL;
98255847Sbostic 
98355847Sbostic 	return(cs);
98455847Sbostic }
98555847Sbostic 
98655847Sbostic /*
98755847Sbostic  - freezeset - final processing on a set of characters
98855847Sbostic  *
98955847Sbostic  * The main task here is merging identical sets.  This is usually a waste
99055847Sbostic  * of time (although the hash code minimizes the overhead), but can win
99155847Sbostic  * big if REG_ICASE is being used.  REG_ICASE, by the way, is why the hash
99255847Sbostic  * is done using addition rather than xor -- all ASCII [aA] sets xor to
99355847Sbostic  * the same value!
99455847Sbostic  */
99555847Sbostic static int			/* set number */
99655847Sbostic freezeset(p, cs)
99755847Sbostic register struct parse *p;
99855847Sbostic register cset *cs;
99955847Sbostic {
100055847Sbostic 	register uchar h = cs->hash;
100155847Sbostic 	register int i;
100255847Sbostic 	register cset *top = &p->g->sets[p->g->ncsets];
100355847Sbostic 	register cset *cs2;
100455847Sbostic 	register size_t css = (size_t)p->g->csetsize;
100555847Sbostic 
100655847Sbostic 	/* look for an earlier one which is the same */
100755847Sbostic 	for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
100855847Sbostic 		if (cs2->hash == h && cs2 != cs) {
100955847Sbostic 			/* maybe */
101055847Sbostic 			for (i = 0; i < css; i++)
101155847Sbostic 				if (!!CHIN(cs2, i) != !!CHIN(cs, i))
101255847Sbostic 					break;		/* no */
101355847Sbostic 			if (i == css)
101455847Sbostic 				break;			/* yes */
101555847Sbostic 		}
101655847Sbostic 
101755847Sbostic 	if (cs2 < top) {	/* found one */
101855847Sbostic 		assert(cs == top-1);
101955847Sbostic 		p->g->ncsets--;
102055847Sbostic 		for (i = 0; i < css; i++)
102155847Sbostic 			CHsub(cs, i);
102255847Sbostic 		cs = cs2;
102355847Sbostic 	}
102455847Sbostic 
102555847Sbostic 	return((int)(cs - p->g->sets));
102655847Sbostic }
102755847Sbostic 
102855847Sbostic /*
102955847Sbostic  - mcadd - add a collating element to a cset
103055847Sbostic  */
103155847Sbostic static void
103255847Sbostic mcadd(p, cs, cp)
103355847Sbostic register struct parse *p;
103455847Sbostic register cset *cs;
103555847Sbostic register uchar *cp;
103655847Sbostic {
103755847Sbostic 	register size_t oldend = cs->smultis;
103855847Sbostic 
103955847Sbostic 	cs->smultis += strlen((char *)cp) + 1;
104055847Sbostic 	if (cs->multis == NULL)
104155847Sbostic 		cs->multis = (uchar *)malloc(cs->smultis);
104255847Sbostic 	else
104355847Sbostic 		cs->multis = (uchar *)realloc(cs->multis, cs->smultis);
104455847Sbostic 	if (cs->multis == NULL) {
104555847Sbostic 		SETERROR(REG_ESPACE);
104655847Sbostic 		return;
104755847Sbostic 	}
104855847Sbostic 
104955847Sbostic 	(void) strcpy((char *)(cs->multis + oldend - 1), (char *)cp);
105055847Sbostic 	cs->multis[cs->smultis - 1] = '\0';
105155847Sbostic }
105255847Sbostic 
105355847Sbostic /*
105455847Sbostic  - mcsub - subtract a collating element from a cset
105555847Sbostic  */
105655847Sbostic static void
105755847Sbostic mcsub(p, cs, cp)
105855847Sbostic register struct parse *p;
105955847Sbostic register cset *cs;
1060*56357Sbostic register u_int *cp;
106155847Sbostic {
106255847Sbostic 	register uchar *fp = mcfind(cs, cp);
106355847Sbostic 	register size_t len = strlen((char *)fp);
106455847Sbostic 
106555847Sbostic 	assert(p != NULL);
106655847Sbostic 	(void) memmove((char *)fp, (char *)(fp + len + 1),
106755847Sbostic 				cs->smultis - (fp + len + 1 - cs->multis));
106855847Sbostic 	cs->smultis -= len;
106955847Sbostic 
107055847Sbostic 	if (cs->smultis == 0) {
107155847Sbostic 		free((char *)cs->multis);
107255847Sbostic 		cs->multis = NULL;
107355847Sbostic 		return;
107455847Sbostic 	}
107555847Sbostic 
107655847Sbostic 	cs->multis = (uchar *)realloc(cs->multis, cs->smultis);
107755847Sbostic 	assert(cs->multis != NULL);
107855847Sbostic }
107955847Sbostic 
108055847Sbostic /*
108155847Sbostic  - mcin - is a collating element in a cset?
108255847Sbostic  */
108355847Sbostic static int
108455847Sbostic mcin(p, cs, cp)
108555847Sbostic register struct parse *p;
108655847Sbostic register cset *cs;
1087*56357Sbostic register u_int *cp;
108855847Sbostic {
108955847Sbostic 	return(mcfind(cs, cp) != NULL);
109055847Sbostic }
109155847Sbostic 
109255847Sbostic /*
109355847Sbostic  - mcfind - find a collating element in a cset
109455847Sbostic  */
109555847Sbostic static uchar *
109655847Sbostic mcfind(cs, cp)
109755847Sbostic register cset *cs;
1098*56357Sbostic register u_int *cp;
109955847Sbostic {
110055847Sbostic 	register uchar *p;
110155847Sbostic 
110255847Sbostic 	if (cs->multis == NULL)
110355847Sbostic 		return(NULL);
110455847Sbostic 	for (p = cs->multis; *p != '\0'; p += strlen((char *)p) + 1)
110555847Sbostic 		if (strcmp((char *)cp, (char *)p) == 0)
110655847Sbostic 			return(p);
110755847Sbostic 	return(NULL);
110855847Sbostic }
110955847Sbostic 
111055847Sbostic /*
111155847Sbostic  - mcinvert - invert the list of collating elements in a cset
111255847Sbostic  *
111355847Sbostic  * This would have to know the set of possibilities.  Implementation
111455847Sbostic  * is deferred.
111555847Sbostic  */
111655847Sbostic static void
111755847Sbostic mcinvert(p, cs)
111855847Sbostic register struct parse *p;
111955847Sbostic register cset *cs;
112055847Sbostic {
112155847Sbostic 	assert(cs->multis == NULL);	/* xxx */
112255847Sbostic }
112355847Sbostic 
112455847Sbostic /*
112555847Sbostic  - isinsets - is this character in any sets?
112655847Sbostic  */
112755847Sbostic static int			/* predicate */
112855847Sbostic isinsets(g, c)
112955847Sbostic register struct re_guts *g;
1130*56357Sbostic u_int c;
113155847Sbostic {
113255847Sbostic 	register uchar *col;
113355847Sbostic 	register int i;
113455847Sbostic 	register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
113555847Sbostic 
113655847Sbostic 	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
113755847Sbostic 		if (col[c] != 0)
113855847Sbostic 			return(1);
113955847Sbostic 	return(0);
114055847Sbostic }
114155847Sbostic 
114255847Sbostic /*
114355847Sbostic  - samesets - are these two characters in exactly the same sets?
114455847Sbostic  */
114555847Sbostic static int			/* predicate */
114655847Sbostic samesets(g, c1, c2)
114755847Sbostic register struct re_guts *g;
1148*56357Sbostic register u_int c1;
1149*56357Sbostic register u_int c2;
115055847Sbostic {
115155847Sbostic 	register uchar *col;
115255847Sbostic 	register int i;
115355847Sbostic 	register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
115455847Sbostic 
115555847Sbostic 	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
115655847Sbostic 		if (col[c1] != col[c2])
115755847Sbostic 			return(0);
115855847Sbostic 	return(1);
115955847Sbostic }
116055847Sbostic 
116155847Sbostic /*
116255847Sbostic  - categorize - sort out character categories
116355847Sbostic  */
116455847Sbostic static void
116555847Sbostic categorize(p, g)
116655847Sbostic struct parse *p;
116755847Sbostic register struct re_guts *g;
116855847Sbostic {
116955847Sbostic 	register uchar *cats = g->categories;
117055847Sbostic 	register unsigned c;
117155847Sbostic 	register unsigned c2;
117255847Sbostic 	register uchar cat;
117355847Sbostic 
117455847Sbostic 	/* avoid making error situations worse */
117555847Sbostic 	if (p->error != 0)
117655847Sbostic 		return;
117755847Sbostic 
117855847Sbostic 	for (c = 0; c < g->csetsize; c++)
117955847Sbostic 		if (cats[c] == 0 && isinsets(g, c)) {
118055847Sbostic 			cat = g->ncategories++;
118155847Sbostic 			cats[c] = cat;
118255847Sbostic 			for (c2 = c+1; c2 < g->csetsize; c2++)
118355847Sbostic 				if (cats[c2] == 0 && samesets(g, c, c2))
118455847Sbostic 					cats[c2] = cat;
118555847Sbostic 		}
118655847Sbostic }
118755847Sbostic 
118855847Sbostic /*
118955847Sbostic  - dupl - emit a duplicate of a bunch of sops
119055847Sbostic  */
119155847Sbostic static sopno			/* start of duplicate */
119255847Sbostic dupl(p, start, finish)
119355847Sbostic register struct parse *p;
119455847Sbostic sopno start;			/* from here */
119555847Sbostic sopno finish;			/* to this less one */
119655847Sbostic {
119755847Sbostic 	register sopno ret = HERE();
119855847Sbostic 	register sopno len = finish - start;
119955847Sbostic 
120055847Sbostic 	assert(finish >= start);
120155847Sbostic 	if (len == 0)
120255847Sbostic 		return(ret);
120355847Sbostic 	enlarge(p, p->ssize + len);	/* this many unexpected additions */
120455847Sbostic 	assert(p->ssize >= p->slen + len);
120555847Sbostic 	(void) memcpy((char *)(p->strip + p->slen),
120655847Sbostic 		(char *)(p->strip + start), (size_t)len*sizeof(sop));
120755847Sbostic 	p->slen += len;
120855847Sbostic 	return(ret);
120955847Sbostic }
121055847Sbostic 
121155847Sbostic /*
121255847Sbostic  - doemit - emit a strip operator
121355847Sbostic  *
121455847Sbostic  * It might seem better to implement this as a macro with a function as
121555847Sbostic  * hard-case backup, but it's just too big and messy unless there are
121655847Sbostic  * some changes to the data structures.  Maybe later.
121755847Sbostic  */
121855847Sbostic static void
121955847Sbostic doemit(p, op, opnd)
122055847Sbostic register struct parse *p;
122155847Sbostic sop op;
122255847Sbostic size_t opnd;
122355847Sbostic {
122455847Sbostic 	/* avoid making error situations worse */
122555847Sbostic 	if (p->error != 0)
122655847Sbostic 		return;
122755847Sbostic 
122855847Sbostic 	/* deal with oversize operands ("can't happen", more or less) */
122955847Sbostic 	assert(opnd < 1<<OPSHIFT);
123055847Sbostic 
123155847Sbostic 	/* deal with undersized strip */
123255847Sbostic 	if (p->slen >= p->ssize)
123355847Sbostic 		enlarge(p, (p->ssize+1) / 2 * 3);	/* +50% */
123455847Sbostic 	assert(p->slen < p->ssize);
123555847Sbostic 
123655847Sbostic 	/* finally, it's all reduced to the easy case */
123755847Sbostic 	p->strip[p->slen++] = SOP(op, opnd);
123855847Sbostic }
123955847Sbostic 
124055847Sbostic /*
124155847Sbostic  - doinsert - insert a sop into the strip
124255847Sbostic  */
124355847Sbostic static void
124455847Sbostic doinsert(p, op, opnd, pos)
124555847Sbostic register struct parse *p;
124655847Sbostic sop op;
124755847Sbostic size_t opnd;
124855847Sbostic sopno pos;
124955847Sbostic {
125055847Sbostic 	register sopno sn;
125155847Sbostic 	register sop s;
125255847Sbostic 	register int i;
125355847Sbostic 
125455847Sbostic 	/* avoid making error situations worse */
125555847Sbostic 	if (p->error != 0)
125655847Sbostic 		return;
125755847Sbostic 
125855847Sbostic 	sn = HERE();
125955847Sbostic 	EMIT(op, opnd);		/* do checks, ensure space */
126055847Sbostic 	assert(HERE() == sn+1);
126155847Sbostic 	s = p->strip[sn];
126255847Sbostic 
126355847Sbostic 	/* adjust paren pointers */
126455847Sbostic 	assert(pos > 0);
126555847Sbostic 	for (i = 1; i < NPAREN; i++) {
126655847Sbostic 		if (p->pbegin[i] >= pos) {
126755847Sbostic 			p->pbegin[i]++;
126855847Sbostic 		}
126955847Sbostic 		if (p->pend[i] >= pos) {
127055847Sbostic 			p->pend[i]++;
127155847Sbostic 		}
127255847Sbostic 	}
127355847Sbostic 
127455847Sbostic 	memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
127555847Sbostic 						(HERE()-pos-1)*sizeof(sop));
127655847Sbostic 	p->strip[pos] = s;
127755847Sbostic }
127855847Sbostic 
127955847Sbostic /*
128055847Sbostic  - dofwd - complete a forward reference
128155847Sbostic  */
128255847Sbostic static void
128355847Sbostic dofwd(p, pos, value)
128455847Sbostic register struct parse *p;
128555847Sbostic register sopno pos;
128655847Sbostic sop value;
128755847Sbostic {
128855847Sbostic 	/* avoid making error situations worse */
128955847Sbostic 	if (p->error != 0)
129055847Sbostic 		return;
129155847Sbostic 
129255847Sbostic 	assert(value < 1<<OPSHIFT);
129355847Sbostic 	p->strip[pos] = OP(p->strip[pos]) | value;
129455847Sbostic }
129555847Sbostic 
129655847Sbostic /*
129755847Sbostic  - enlarge - enlarge the strip
129855847Sbostic  */
129955847Sbostic static void
130055847Sbostic enlarge(p, size)
130155847Sbostic register struct parse *p;
130255847Sbostic register sopno size;
130355847Sbostic {
130455847Sbostic 	register sop *sp;
130555847Sbostic 
130655847Sbostic 	if (p->ssize >= size)
130755847Sbostic 		return;
130855847Sbostic 
130955847Sbostic 	sp = (sop *)realloc(p->strip, size*sizeof(sop));
131055847Sbostic 	if (sp == NULL) {
131155847Sbostic 		SETERROR(REG_ESPACE);
131255847Sbostic 		return;
131355847Sbostic 	}
131455847Sbostic 	p->strip = sp;
131555847Sbostic 	p->ssize = size;
131655847Sbostic }
131755847Sbostic 
131855847Sbostic /*
131955847Sbostic  - stripsnug - compact the strip
132055847Sbostic  */
132155847Sbostic static void
132255847Sbostic stripsnug(p, g)
132355847Sbostic register struct parse *p;
132455847Sbostic register struct re_guts *g;
132555847Sbostic {
132655847Sbostic 	g->nstates = p->slen;
132755847Sbostic 	g->strip = (sop *)realloc((sop *)p->strip, p->slen * sizeof(sop));
132855847Sbostic 	if (g->strip == NULL) {
132955847Sbostic 		SETERROR(REG_ESPACE);
133055847Sbostic 		g->strip = p->strip;
133155847Sbostic 	}
133255847Sbostic }
133355847Sbostic 
133455847Sbostic /*
133555847Sbostic  - findmust - fill in must and mlen with longest mandatory literal string
133655847Sbostic  *
133755847Sbostic  * This algorithm could do fancy things like analyzing the operands of |
133855847Sbostic  * for common subsequences.  Someday.  This code is simple and finds most
133955847Sbostic  * of the interesting cases.
134055847Sbostic  *
134155847Sbostic  * Note that must and mlen got initialized during setup.
134255847Sbostic  */
134356355Sbostic static void
134455847Sbostic findmust(p, g)
134555847Sbostic struct parse *p;
134655847Sbostic register struct re_guts *g;
134755847Sbostic {
134855847Sbostic 	register sop *scan;
134955847Sbostic 	sop *start;
135055847Sbostic 	register sop *newstart;
135155847Sbostic 	register sopno newlen;
135255847Sbostic 	register sop s;
135355847Sbostic 	register char *cp;
135455847Sbostic 	register sopno i;
135555847Sbostic 
135655847Sbostic 	/* avoid making error situations worse */
135755847Sbostic 	if (p->error != 0)
135855847Sbostic 		return;
135955847Sbostic 
136055847Sbostic 	/* find the longest OCHAR sequence in strip */
136155847Sbostic 	newlen = 0;
136255847Sbostic 	scan = g->strip + 1;
136355847Sbostic 	do {
136455847Sbostic 		s = *scan++;
136555847Sbostic 		switch (OP(s)) {
136655847Sbostic 		case OCHAR:		/* sequence member */
136755847Sbostic 			if (newlen == 0)		/* new sequence */
136855847Sbostic 				newstart = scan - 1;
136955847Sbostic 			newlen++;
137055847Sbostic 			break;
137155847Sbostic 		case OPLUS_:		/* things that don't break one */
137255847Sbostic 		case OLPAREN:
137355847Sbostic 		case ORPAREN:
137455847Sbostic 			break;
137555847Sbostic 		case OQUEST_:		/* things that must be skipped */
137655847Sbostic 		case OCH_:
137755847Sbostic 			scan--;
137855847Sbostic 			do {
137955847Sbostic 				scan += OPND(s);
138055847Sbostic 				s = *scan;
138155847Sbostic 				/* assert() interferes w debug printouts */
138255847Sbostic 				if (OP(s) != O_QUEST && OP(s) != O_CH &&
138355847Sbostic 							OP(s) != OOR2) {
138455847Sbostic 					g->iflags |= BAD;
138555847Sbostic 					return;
138655847Sbostic 				}
138755847Sbostic 			} while (OP(s) != O_QUEST && OP(s) != O_CH);
138855847Sbostic 			/* fallthrough */
138955847Sbostic 		default:		/* things that break a sequence */
139055847Sbostic 			if (newlen > g->mlen) {		/* ends one */
139155847Sbostic 				start = newstart;
139255847Sbostic 				g->mlen = newlen;
139355847Sbostic 			}
139455847Sbostic 			newlen = 0;
139555847Sbostic 			break;
139655847Sbostic 		}
139755847Sbostic 	} while (OP(s) != OEND);
139855847Sbostic 
139955847Sbostic 	if (g->mlen == 0)		/* there isn't one */
140055847Sbostic 		return;
140155847Sbostic 
140255847Sbostic 	/* turn it into a character string */
140355847Sbostic 	g->must = malloc((size_t)g->mlen + 1);
140455847Sbostic 	if (g->must == NULL) {		/* argh; just forget it */
140555847Sbostic 		g->mlen = 0;
140655847Sbostic 		return;
140755847Sbostic 	}
140855847Sbostic 	cp = g->must;
140955847Sbostic 	scan = start;
141055847Sbostic 	for (i = g->mlen; i > 0; i--) {
141155847Sbostic 		while (OP(s = *scan++) != OCHAR)
141255847Sbostic 			continue;
141355847Sbostic 		*cp++ = OPND(s);
141455847Sbostic 	}
141555847Sbostic 	*cp++ = '\0';		/* just on general principles */
141655847Sbostic }
141755847Sbostic 
141855847Sbostic /*
141955847Sbostic  - pluscount - count + nesting
142055847Sbostic  */
142156355Sbostic static sopno			/* nesting depth */
142255847Sbostic pluscount(p, g)
142355847Sbostic struct parse *p;
142455847Sbostic register struct re_guts *g;
142555847Sbostic {
142655847Sbostic 	register sop *scan;
142755847Sbostic 	register sop s;
142855847Sbostic 	register sopno plusnest = 0;
142955847Sbostic 	register sopno maxnest = 0;
143055847Sbostic 
143155847Sbostic 	if (p->error != 0)
143255847Sbostic 		return(0);	/* there may not be an OEND */
143355847Sbostic 
143455847Sbostic 	scan = g->strip + 1;
143555847Sbostic 	do {
143655847Sbostic 		s = *scan++;
143755847Sbostic 		switch (OP(s)) {
143855847Sbostic 		case OPLUS_:
143955847Sbostic 			plusnest++;
144055847Sbostic 			break;
144155847Sbostic 		case O_PLUS:
144255847Sbostic 			if (plusnest > maxnest)
144355847Sbostic 				maxnest = plusnest;
144455847Sbostic 			plusnest--;
144555847Sbostic 			break;
144655847Sbostic 		}
144755847Sbostic 	} while (OP(s) != OEND);
144855847Sbostic 	if (plusnest != 0)
144955847Sbostic 		g->iflags |= BAD;
145055847Sbostic 	return(maxnest);
145155847Sbostic }
1452