xref: /csrg-svn/lib/libc/regex/regcomp.c (revision 56355)
155847Sbostic /*-
255847Sbostic  * Copyright (c) 1992 Henry Spencer.
355847Sbostic  * Copyright (c) 1992 The Regents of the University of California.
455847Sbostic  * All rights reserved.
555847Sbostic  *
655847Sbostic  * This code is derived from software contributed to Berkeley by
755847Sbostic  * Henry Spencer of the University of Toronto.
855847Sbostic  *
955847Sbostic  * %sccs.include.redist.c%
1055847Sbostic  *
11*56355Sbostic  *	@(#)regcomp.c	5.2 (Berkeley) 09/30/92
1255847Sbostic  */
1355847Sbostic 
1455847Sbostic #if defined(LIBC_SCCS) && !defined(lint)
15*56355Sbostic static char sccsid[] = "@(#)regcomp.c	5.2 (Berkeley) 09/30/92";
1655847Sbostic #endif /* LIBC_SCCS and not lint */
1755847Sbostic 
1855847Sbostic #include <sys/types.h>
1955847Sbostic #include <stdio.h>
2055847Sbostic #include <string.h>
2155847Sbostic #include <ctype.h>
2255847Sbostic #include <limits.h>
2355847Sbostic #include <stdlib.h>
2455847Sbostic #include <regex.h>
2555847Sbostic 
2655847Sbostic #include "utils.h"
2755847Sbostic #include "regex2.h"
2855847Sbostic 
2955847Sbostic #include "cclass.h"
3055847Sbostic #include "cname.h"
3155847Sbostic 
3255847Sbostic /*
3355847Sbostic  * parse structure, passed up and down to avoid global variables and
3455847Sbostic  * other clumsinesses
3555847Sbostic  */
3655847Sbostic struct parse {
3755847Sbostic 	uchar *next;		/* next character in RE */
3855847Sbostic 	int error;		/* has an error been seen? */
3955847Sbostic 	sop *strip;		/* malloced strip */
4055847Sbostic 	sopno ssize;		/* malloced strip size (allocated) */
4155847Sbostic 	sopno slen;		/* malloced strip length (used) */
4255847Sbostic 	int ncsalloc;		/* number of csets allocated */
4355847Sbostic 	struct re_guts *g;
4455847Sbostic #	define	NPAREN	10	/* we need to remember () 1-9 for back refs */
4555847Sbostic 	sopno pbegin[NPAREN];	/* -> ( ([0] unused) */
4655847Sbostic 	sopno pend[NPAREN];	/* -> ) ([0] unused) */
4755847Sbostic };
4855847Sbostic 
49*56355Sbostic #include "regcomp.ih"
5055847Sbostic 
51*56355Sbostic static uchar nuls[10];		/* place to point scanner in event of error */
52*56355Sbostic 
5355847Sbostic /*
5455847Sbostic  * macros for use with parse structure
5555847Sbostic  * BEWARE:  these know that the parse structure is named `p' !!!
5655847Sbostic  */
5755847Sbostic #define	PEEK()	((uchar)*p->next)
5855847Sbostic #define	PEEK2()	((uchar)*(p->next+1))
5955847Sbostic #define	SEE(c)	(PEEK() == (c))
6055847Sbostic #define	SEETWO(a, b)	(PEEK() == (a) && PEEK2() == (b))
6155847Sbostic #define	EAT(c)	((SEE(c)) ? (NEXT(), 1) : 0)
6255847Sbostic #define	EATTWO(a, b)	((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
6355847Sbostic #define	NEXT()	(p->next++)
6455847Sbostic #define	NEXT2()	(p->next += 2)
6555847Sbostic #define	NEXTn(n)	(p->next += (n))
6655847Sbostic #define	GETNEXT()	((uchar)*p->next++)
6755847Sbostic #define	SETERROR(e)	seterr(p, (e))
6855847Sbostic #define	REQUIRE(co, e)	((co) || SETERROR(e))
6955847Sbostic #define	MUSTSEE(c, e)	(REQUIRE(PEEK() == (c), e))
7055847Sbostic #define	MUSTEAT(c, e)	(REQUIRE(GETNEXT() == (c), e))
7155847Sbostic #define	MUSTNOTSEE(c, e)	(REQUIRE(PEEK() != (c), e))
7255847Sbostic #define	EMIT(sop, sopnd)	doemit(p, sop, (size_t)(sopnd))
7355847Sbostic #define	INSERT(sop, pos)	doinsert(p, sop, HERE()-(pos)+1, pos)
7455847Sbostic #define	FWD(pos)		dofwd(p, pos, HERE()-(pos))
7555847Sbostic #define	BACK(sop, pos)		EMIT(sop, HERE()-pos)
7655847Sbostic #define	HERE()		(p->slen)
7755847Sbostic #define	THERE()		(p->slen - 1)
7855847Sbostic #define	DROP(n)	(p->slen -= (n))
7955847Sbostic 
8055847Sbostic /*
8155847Sbostic  - regcomp - interface for parser and compilation
82*56355Sbostic  = extern int regcomp(regex_t *preg, const char *pattern, int cflags);
83*56355Sbostic  = #define	REG_EXTENDED	001
84*56355Sbostic  = #define	REG_ICASE	002
85*56355Sbostic  = #define	REG_NOSUB	004
86*56355Sbostic  = #define	REG_NEWLINE	010
8755847Sbostic  */
8855847Sbostic int				/* 0 success, otherwise REG_something */
8955847Sbostic regcomp(preg, pattern, cflags)
9055847Sbostic regex_t *preg;
9155847Sbostic const char *pattern;
9255847Sbostic int cflags;
9355847Sbostic {
9455847Sbostic 	struct parse pa;
9555847Sbostic 	register struct re_guts *g;
9655847Sbostic 	register struct parse *p = &pa;
9755847Sbostic 	register int i;
9855847Sbostic 
9955847Sbostic 	/* do the mallocs early so failure handling is easy */
10055847Sbostic 	/* the +NUC here is for the category table */
10155847Sbostic 	g = (struct re_guts *)malloc(sizeof(struct re_guts) + NUC);
10255847Sbostic 	if (g == NULL)
10355847Sbostic 		return(REG_ESPACE);
10455847Sbostic 	p->ssize = strlen(pattern)/2*3 + 1;
10555847Sbostic 	p->strip = (sop *)malloc(p->ssize * sizeof(sop));
10655847Sbostic 	p->slen = 0;
10755847Sbostic 	if (p->strip == NULL) {
10855847Sbostic 		free((char *)g);
10955847Sbostic 		return(REG_ESPACE);
11055847Sbostic 	}
11155847Sbostic 
11255847Sbostic 	/* set things up */
11355847Sbostic 	p->g = g;
11455847Sbostic 	p->next = (uchar *)pattern;
11555847Sbostic 	p->error = 0;
11655847Sbostic 	p->ncsalloc = 0;
11755847Sbostic 	for (i = 0; i < NPAREN; i++) {
11855847Sbostic 		p->pbegin[i] = 0;
11955847Sbostic 		p->pend[i] = 0;
12055847Sbostic 	}
12155847Sbostic 	g->csetsize = NUC;
12255847Sbostic 	g->sets = NULL;
12355847Sbostic 	g->setbits = NULL;
12455847Sbostic 	g->ncsets = 0;
12555847Sbostic 	g->cflags = cflags;
12655847Sbostic 	g->iflags = 0;
12755847Sbostic 	g->must = NULL;
12855847Sbostic 	g->mlen = 0;
12955847Sbostic 	g->nsub = 0;
13055847Sbostic 	g->ncategories = 1;	/* category 0 is "everything else" */
13155847Sbostic 	g->categories = (uchar *)g + sizeof(struct re_guts);
13255847Sbostic 	(void) memset((char *)g->categories, 0, NUC);
13355847Sbostic 	g->backrefs = 0;
13455847Sbostic 	g->nplus = 0;
13555847Sbostic 
13655847Sbostic 	/* do it */
13755847Sbostic 	EMIT(OEND, 0);
13855847Sbostic 	g->firststate = THERE();
13955847Sbostic 	if (cflags&REG_EXTENDED)
14055847Sbostic 		p_ere(p, '\0');
14155847Sbostic 	else
14255847Sbostic 		p_bre(p, '\0', '\0');
14355847Sbostic 	EMIT(OEND, 0);
14455847Sbostic 	g->laststate = THERE();
14555847Sbostic 
14655847Sbostic 	/* tidy up loose ends and fill things in */
14755847Sbostic 	categorize(p, g);
14855847Sbostic 	stripsnug(p, g);
14955847Sbostic 	findmust(p, g);
15055847Sbostic 	g->nplus = pluscount(p, g);
15155847Sbostic 	g->magic = MAGIC2;
15255847Sbostic 	preg->re_nsub = g->nsub;
15355847Sbostic 	preg->re_g = g;
15455847Sbostic 	preg->re_magic = MAGIC1;
155*56355Sbostic #ifndef REDEBUG
15655847Sbostic 	/* not debugging, so can't rely on the assert() in regexec() */
15755847Sbostic 	if (g->iflags&BAD)
15855847Sbostic 		SETERROR(REG_ASSERT);
15955847Sbostic #endif
16055847Sbostic 
16155847Sbostic 	/* win or lose, we're done */
16255847Sbostic 	if (p->error != 0)	/* lose */
16355847Sbostic 		regfree(preg);
16455847Sbostic 	return(p->error);
16555847Sbostic }
16655847Sbostic 
16755847Sbostic /*
16855847Sbostic  - p_ere - ERE parser top level, concatenation and alternation
169*56355Sbostic  == static void p_ere(register struct parse *p, uchar stop);
17055847Sbostic  */
17155847Sbostic static void
17255847Sbostic p_ere(p, stop)
17355847Sbostic register struct parse *p;
17455847Sbostic uchar stop;			/* character this ERE should end at */
17555847Sbostic {
17655847Sbostic 	register uchar c;
17755847Sbostic 	register sopno prevback;
17855847Sbostic 	register sopno prevfwd;
17955847Sbostic 	register sopno conc;
18055847Sbostic 	register int first = 1;		/* is this the first alternative? */
18155847Sbostic 
18255847Sbostic 	for (;;) {
18355847Sbostic 		/* do a bunch of concatenated expressions */
18455847Sbostic 		conc = HERE();
18555847Sbostic 		while ((c = PEEK()) != '|' && c != stop && c != '\0')
18655847Sbostic 			p_ere_exp(p);
18755847Sbostic 		REQUIRE(HERE() != conc, REG_EMPTY);	/* require nonempty */
18855847Sbostic 
18955847Sbostic 		if (!EAT('|'))
19055847Sbostic 			break;		/* NOTE BREAK OUT */
19155847Sbostic 
19255847Sbostic 		if (first) {
19355847Sbostic 			INSERT(OCH_, conc);	/* offset is wrong */
19455847Sbostic 			prevfwd = conc;
19555847Sbostic 			prevback = conc;
19655847Sbostic 			first = 0;
19755847Sbostic 		}
19855847Sbostic 		BACK(OOR1, prevback);
19955847Sbostic 		prevback = THERE();
20055847Sbostic 		FWD(prevfwd);			/* fix previous offset */
20155847Sbostic 		prevfwd = HERE();
20255847Sbostic 		EMIT(OOR2, 0);			/* offset is very wrong */
20355847Sbostic 	}
20455847Sbostic 
20555847Sbostic 	if (!first) {		/* tail-end fixups */
20655847Sbostic 		FWD(prevfwd);
20755847Sbostic 		BACK(O_CH, prevback);
20855847Sbostic 	}
20955847Sbostic 
21055847Sbostic 	assert(SEE(stop) || SEE('\0'));
21155847Sbostic }
21255847Sbostic 
21355847Sbostic /*
21455847Sbostic  - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
215*56355Sbostic  == static void p_ere_exp(register struct parse *p);
21655847Sbostic  */
21755847Sbostic static void
21855847Sbostic p_ere_exp(p)
21955847Sbostic register struct parse *p;
22055847Sbostic {
22155847Sbostic 	register uchar c;
22255847Sbostic 	register sopno pos;
22355847Sbostic 	register int count;
22455847Sbostic 	register int count2;
22555847Sbostic 	register sopno subno;
22655847Sbostic 	int wascaret = 0;
22755847Sbostic 	/* we call { a repetition if followed by a digit */
22855847Sbostic #	define	ISRPT(c1, c2)	(c1 == '*' || c1 == '+' || c1 == '?' || \
22955847Sbostic 						(c1 == '{' && isdigit(c2)))
23055847Sbostic 
23155847Sbostic 	c = GETNEXT();
23255847Sbostic 	assert(c != '\0');	/* caller should have ensured this */
23355847Sbostic 
23455847Sbostic 	pos = HERE();
23555847Sbostic 	switch (c) {
23655847Sbostic 	case '(':
23755847Sbostic 		MUSTNOTSEE('\0', REG_EPAREN);
23855847Sbostic 		p->g->nsub++;
23955847Sbostic 		subno = p->g->nsub;
24055847Sbostic 		if (subno < NPAREN)
24155847Sbostic 			p->pbegin[subno] = HERE();
24255847Sbostic 		EMIT(OLPAREN, subno);
24355847Sbostic 		if (!SEE(')'))
24455847Sbostic 			p_ere(p, ')');
24555847Sbostic 		if (subno < NPAREN) {
24655847Sbostic 			p->pend[subno] = HERE();
24755847Sbostic 			assert(p->pend[subno] != 0);
24855847Sbostic 		}
24955847Sbostic 		EMIT(ORPAREN, subno);
25055847Sbostic 		MUSTEAT(')', REG_EPAREN);
25155847Sbostic 		break;
25255847Sbostic #ifndef POSIX_MISTAKE
25355847Sbostic 	case ')':		/* happens only if no current unmatched ( */
25455847Sbostic 		/*
25555847Sbostic 		 * You may ask, why the ifndef?  Because I didn't notice
25655847Sbostic 		 * this until slightly too late for 1003.2, and none of the
25755847Sbostic 		 * other 1003.2 regular-expression reviewers noticed it at
25855847Sbostic 		 * all.  So an unmatched ) is legal POSIX, at least until
25955847Sbostic 		 * we can get it fixed.
26055847Sbostic 		 */
26155847Sbostic 		SETERROR(REG_EPAREN);
26255847Sbostic 		break;
26355847Sbostic #endif
26455847Sbostic 	case '^':
26555847Sbostic 		EMIT(OBOL, 0);
26655847Sbostic 		p->g->iflags |= USEBOL;
26755847Sbostic 		wascaret = 1;
26855847Sbostic 		break;
26955847Sbostic 	case '$':
27055847Sbostic 		EMIT(OEOL, 0);
27155847Sbostic 		p->g->iflags |= USEEOL;
27255847Sbostic 		break;
27355847Sbostic 	case '|':
27455847Sbostic 		SETERROR(REG_EMPTY);
27555847Sbostic 		break;
27655847Sbostic 	case '*':
27755847Sbostic 	case '+':
27855847Sbostic 	case '?':
27955847Sbostic 		SETERROR(REG_BADRPT);
28055847Sbostic 		break;
28155847Sbostic 	case '.':
28255847Sbostic 		if (p->g->cflags&REG_NEWLINE)
28355847Sbostic 			nonnewline(p);
28455847Sbostic 		else
28555847Sbostic 			EMIT(OANY, 0);
28655847Sbostic 		break;
28755847Sbostic 	case '[':
28855847Sbostic 		p_bracket(p);
28955847Sbostic 		break;
29055847Sbostic 	case '\\':
29155847Sbostic 		c = GETNEXT();
292*56355Sbostic #ifdef xxx
29355847Sbostic 		if (c == '^' || c == '.' || c == '[' || c == '$' ||
29455847Sbostic 				c == '(' || c == ')' || c == '|' ||
29555847Sbostic 				c == '*' || c == '+' || c == '?' ||
29655847Sbostic 				c == '{' || c == '\\')
297*56355Sbostic #else
298*56355Sbostic 		if (c != '\0')
299*56355Sbostic #endif
30055847Sbostic 			ordinary(p, c);
30155847Sbostic 		else
30255847Sbostic 			SETERROR(REG_EESCAPE);
30355847Sbostic 		break;
30455847Sbostic 	case '{':		/* okay as ordinary except if digit follows */
30555847Sbostic 		REQUIRE(!isdigit(PEEK()), REG_BADRPT);
30655847Sbostic 		/* FALLTHROUGH */
30755847Sbostic 	default:
30855847Sbostic 		ordinary(p, c);
30955847Sbostic 		break;
31055847Sbostic 	}
31155847Sbostic 
31255847Sbostic 	c = PEEK();
31355847Sbostic 	if (!ISRPT(c, PEEK2()))
31455847Sbostic 		return;		/* no repetition, we're done */
31555847Sbostic 	NEXT();
31655847Sbostic 
31755847Sbostic 	REQUIRE(!wascaret, REG_BADRPT);
31855847Sbostic 	switch (c) {
31955847Sbostic 	case '*':	/* implemented as +? */
32055847Sbostic 		INSERT(OPLUS_, pos);
32155847Sbostic 		BACK(O_PLUS, pos);
32255847Sbostic 		INSERT(OQUEST_, pos);
32355847Sbostic 		BACK(O_QUEST, pos);
32455847Sbostic 		break;
32555847Sbostic 	case '+':
32655847Sbostic 		INSERT(OPLUS_, pos);
32755847Sbostic 		BACK(O_PLUS, pos);
32855847Sbostic 		break;
32955847Sbostic 	case '?':
33055847Sbostic 		INSERT(OQUEST_, pos);
33155847Sbostic 		BACK(O_QUEST, pos);
33255847Sbostic 		break;
33355847Sbostic 	case '{':
33455847Sbostic 		count = p_count(p);
33555847Sbostic 		if (EAT(',')) {
33655847Sbostic 			if (isdigit(PEEK())) {
33755847Sbostic 				count2 = p_count(p);
33855847Sbostic 				REQUIRE(count <= count2, REG_BADBR);
33955847Sbostic 			} else		/* single number with comma */
34055847Sbostic 				count2 = INFINITY;
34155847Sbostic 		} else		/* just a single number */
34255847Sbostic 			count2 = count;
34355847Sbostic 		repeat(p, pos, count, count2);
34455847Sbostic 		if (!EAT('}')) {	/* error heuristics */
34555847Sbostic 			while ((c = PEEK()) != '\0' && c != '}')
34655847Sbostic 				NEXT();
34755847Sbostic 			if (c == '\0')
34855847Sbostic 				SETERROR(REG_EBRACE);
34955847Sbostic 			else
35055847Sbostic 				SETERROR(REG_BADBR);
35155847Sbostic 		}
35255847Sbostic 		break;
35355847Sbostic 	}
35455847Sbostic 
35555847Sbostic 	c = PEEK();
35655847Sbostic 	REQUIRE(!ISRPT(c, PEEK2()), REG_BADRPT);
35755847Sbostic }
35855847Sbostic 
35955847Sbostic /*
36055847Sbostic  - p_bre - BRE parser top level, anchoring and concatenation
361*56355Sbostic  == static void p_bre(register struct parse *p, register uchar end1, \
362*56355Sbostic  ==	register uchar end2);
363*56355Sbostic  * Giving end1 as '\0' essentially eliminates the end1/end2 check.
36455847Sbostic  *
365*56355Sbostic  * This implementation is a bit of a kludge, in that a trailing $ is first
366*56355Sbostic  * taken as an ordinary character and then revised to be an anchor.  The
367*56355Sbostic  * only undesirable side effect is that '$' gets included as a character
368*56355Sbostic  * category in such cases.  This is fairly harmless; not worth fixing.
369*56355Sbostic  * The amount of lookahead needed to avoid this kludge is excessive,
370*56355Sbostic  * especially since things like "$*" appear to be legal. xxx
37155847Sbostic  */
37255847Sbostic static void
37355847Sbostic p_bre(p, end1, end2)
37455847Sbostic register struct parse *p;
37555847Sbostic register uchar end1;		/* first terminating character */
37655847Sbostic register uchar end2;		/* second terminating character */
37755847Sbostic {
37855847Sbostic 	register sopno start = HERE();
37955847Sbostic 	register int first = 1;			/* first subexpression? */
380*56355Sbostic 	register int wasdollar = 0;
38155847Sbostic 
38255847Sbostic 	if (EAT('^')) {
38355847Sbostic 		EMIT(OBOL, 0);
38455847Sbostic 		p->g->iflags |= USEBOL;
38555847Sbostic 	}
38655847Sbostic 	while (!SEE('\0') && !SEETWO(end1, end2)) {
38755847Sbostic 		wasdollar = p_simp_re(p, first);
38855847Sbostic 		first = 0;
38955847Sbostic 	}
39055847Sbostic 	if (wasdollar) {	/* oops, that was a trailing anchor */
39155847Sbostic 		DROP(1);
39255847Sbostic 		EMIT(OEOL, 0);
39355847Sbostic 		p->g->iflags |= USEEOL;
39455847Sbostic 	}
39555847Sbostic 
39655847Sbostic 	REQUIRE(HERE() != start, REG_EMPTY);	/* require nonempty */
39755847Sbostic }
39855847Sbostic 
39955847Sbostic /*
40055847Sbostic  - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
401*56355Sbostic  == static int p_simp_re(register struct parse *p, int starordinary);
40255847Sbostic  */
40355847Sbostic static int			/* was the simple RE an unbackslashed $? */
40455847Sbostic p_simp_re(p, starordinary)
40555847Sbostic register struct parse *p;
40655847Sbostic int starordinary;		/* is a leading * an ordinary character? */
40755847Sbostic {
40855847Sbostic 	register int c;
40955847Sbostic 	register int count;
41055847Sbostic 	register int count2;
41155847Sbostic 	register sopno pos;
41255847Sbostic 	register int i;
41355847Sbostic 	register sopno subno;
41455847Sbostic #	define	BACKSL	(1<<CHAR_BIT)
41555847Sbostic 
41655847Sbostic 	pos = HERE();		/* repetion op, if any, covers from here */
41755847Sbostic 
41855847Sbostic 	c = GETNEXT();
41955847Sbostic 	assert(c != '\0');	/* caller should have ensured this */
42055847Sbostic 	if (c == '\\')
42155847Sbostic 		c = BACKSL | PEEK();
42255847Sbostic 	switch (c) {
42355847Sbostic 	case '.':
42455847Sbostic 		if (p->g->cflags&REG_NEWLINE)
42555847Sbostic 			nonnewline(p);
42655847Sbostic 		else
42755847Sbostic 			EMIT(OANY, 0);
42855847Sbostic 		break;
42955847Sbostic 	case '[':
43055847Sbostic 		p_bracket(p);
43155847Sbostic 		break;
43255847Sbostic 	case BACKSL|'{':
43355847Sbostic 		SETERROR(REG_BADRPT);
43455847Sbostic 		break;
43555847Sbostic 	case BACKSL|'(':
43655847Sbostic 		NEXT();
43755847Sbostic 		p->g->nsub++;
43855847Sbostic 		subno = p->g->nsub;
43955847Sbostic 		if (subno < NPAREN)
44055847Sbostic 			p->pbegin[subno] = HERE();
44155847Sbostic 		EMIT(OLPAREN, subno);
44255847Sbostic 		/* the SEE here is an error heuristic */
44355847Sbostic 		if (!SEE('\0') && !SEETWO('\\', ')'))
44455847Sbostic 			p_bre(p, '\\', ')');
44555847Sbostic 		if (subno < NPAREN) {
44655847Sbostic 			p->pend[subno] = HERE();
44755847Sbostic 			assert(p->pend[subno] != 0);
44855847Sbostic 		}
44955847Sbostic 		EMIT(ORPAREN, subno);
45055847Sbostic 		REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
45155847Sbostic 		break;
45255847Sbostic 	case BACKSL|')':	/* should not get here -- must be user */
45355847Sbostic 	case BACKSL|'}':
45455847Sbostic 		SETERROR(REG_EPAREN);
45555847Sbostic 		break;
45655847Sbostic 	case BACKSL|'1':
45755847Sbostic 	case BACKSL|'2':
45855847Sbostic 	case BACKSL|'3':
45955847Sbostic 	case BACKSL|'4':
46055847Sbostic 	case BACKSL|'5':
46155847Sbostic 	case BACKSL|'6':
46255847Sbostic 	case BACKSL|'7':
46355847Sbostic 	case BACKSL|'8':
46455847Sbostic 	case BACKSL|'9':
46555847Sbostic 		i = (c&~BACKSL) - '0';
46655847Sbostic 		assert(i < NPAREN);
46755847Sbostic 		if (p->pend[i] != 0) {
46855847Sbostic 			assert(i <= p->g->nsub);
46955847Sbostic 			EMIT(OBACK_, i);
47055847Sbostic 			assert(p->pbegin[i] != 0);
47155847Sbostic 			assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
47255847Sbostic 			assert(OP(p->strip[p->pend[i]]) == ORPAREN);
47355847Sbostic 			(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
47455847Sbostic 			EMIT(O_BACK, i);
47555847Sbostic 		} else
47655847Sbostic 			SETERROR(REG_ESUBREG);
47755847Sbostic 		p->g->backrefs = 1;
47855847Sbostic 		NEXT();
47955847Sbostic 		break;
480*56355Sbostic 	case BACKSL|'\0':
481*56355Sbostic 		SETERROR(REG_EESCAPE);
482*56355Sbostic 		break;
48355847Sbostic 	case '*':
48455847Sbostic 		REQUIRE(starordinary, REG_BADRPT);
48555847Sbostic 		/* FALLTHROUGH */
48655847Sbostic 	default:
487*56355Sbostic 		if (c & BACKSL)
488*56355Sbostic 			c = GETNEXT();
48955847Sbostic 		ordinary(p, (uchar)c);
49055847Sbostic 		break;
49155847Sbostic 	}
49255847Sbostic 
49355847Sbostic 	if (EAT('*')) {		/* implemented as +? */
49455847Sbostic 		INSERT(OPLUS_, pos);
49555847Sbostic 		BACK(O_PLUS, pos);
49655847Sbostic 		INSERT(OQUEST_, pos);
49755847Sbostic 		BACK(O_QUEST, pos);
49855847Sbostic 	} else if (EATTWO('\\', '{')) {
49955847Sbostic 		count = p_count(p);
50055847Sbostic 		if (EAT(',')) {
50155847Sbostic 			if (isdigit(PEEK())) {
50255847Sbostic 				count2 = p_count(p);
50355847Sbostic 				REQUIRE(count <= count2, REG_BADBR);
50455847Sbostic 			} else		/* single number with comma */
50555847Sbostic 				count2 = INFINITY;
50655847Sbostic 		} else		/* just a single number */
50755847Sbostic 			count2 = count;
50855847Sbostic 		repeat(p, pos, count, count2);
50955847Sbostic 		if (!EATTWO('\\', '}')) {	/* error heuristics */
51055847Sbostic 			while (!SEE('\0') && !SEETWO('\\', '}'))
51155847Sbostic 				NEXT();
51255847Sbostic 			if (SEE('\0'))
51355847Sbostic 				SETERROR(REG_EBRACE);
51455847Sbostic 			else
51555847Sbostic 				SETERROR(REG_BADBR);
51655847Sbostic 		}
51755847Sbostic 	} else if (c == '$')	/* unbackslashed $ not followed by reptn */
51855847Sbostic 		return(1);
51955847Sbostic 
52055847Sbostic 	return(0);
52155847Sbostic }
52255847Sbostic 
52355847Sbostic /*
52455847Sbostic  - p_count - parse a repetition count
525*56355Sbostic  == static int p_count(register struct parse *p);
52655847Sbostic  */
52755847Sbostic static int			/* the value */
52855847Sbostic p_count(p)
52955847Sbostic register struct parse *p;
53055847Sbostic {
53155847Sbostic 	register int count = 0;
53255847Sbostic 	register int ndigits = 0;
53355847Sbostic 
53455847Sbostic 	while (isdigit(PEEK()) && count <= DUPMAX) {
53555847Sbostic 		count = count*10 + (GETNEXT() - '0');
53655847Sbostic 		ndigits++;
53755847Sbostic 	}
53855847Sbostic 
53955847Sbostic 	REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
54055847Sbostic 	return(count);
54155847Sbostic }
54255847Sbostic 
54355847Sbostic /*
54455847Sbostic  - p_bracket - parse a bracketed character list
545*56355Sbostic  == static void p_bracket(register struct parse *p);
54655847Sbostic  *
54755847Sbostic  * Note a significant property of this code:  if the allocset() did SETERROR,
54855847Sbostic  * no set operations are done.
54955847Sbostic  */
55055847Sbostic static void
55155847Sbostic p_bracket(p)
55255847Sbostic register struct parse *p;
55355847Sbostic {
55455847Sbostic 	register uchar c;
55555847Sbostic 	register cset *cs = allocset(p);
55655847Sbostic 	register int invert = 0;
55755847Sbostic 
55855847Sbostic 	if (EAT('^'))
55955847Sbostic 		invert++;	/* make note to invert set at end */
56055847Sbostic 	if (EAT(']'))
56155847Sbostic 		CHadd(cs, ']');
56255847Sbostic 	while ((c = PEEK()) != '\0' && c != ']' && !SEETWO('-', ']'))
56355847Sbostic 		p_b_term(p, cs);
56455847Sbostic 	if (EAT('-'))
56555847Sbostic 		CHadd(cs, '-');
56655847Sbostic 	MUSTEAT(']', REG_EBRACK);
56755847Sbostic 
56855847Sbostic 	if (invert && p->error == 0) {
56955847Sbostic 		register int i;
57055847Sbostic 
57155847Sbostic 		for (i = p->g->csetsize - 1; i >= 0; i--)
57255847Sbostic 			if (CHIN(cs, i))
57355847Sbostic 				CHsub(cs, i);
57455847Sbostic 			else
57555847Sbostic 				CHadd(cs, i);
57655847Sbostic 		if (p->g->cflags&REG_NEWLINE)
57755847Sbostic 			CHsub(cs, '\n');
57855847Sbostic 		if (cs->multis != NULL)
57955847Sbostic 			mcinvert(p, cs);
58055847Sbostic 	}
58155847Sbostic 	assert(cs->multis == NULL);		/* xxx */
58255847Sbostic 	EMIT(OANYOF, freezeset(p, cs));
58355847Sbostic }
58455847Sbostic 
58555847Sbostic /*
58655847Sbostic  - p_b_term - parse one term of a bracketed character list
587*56355Sbostic  == static void p_b_term(register struct parse *p, register cset *cs);
58855847Sbostic  */
58955847Sbostic static void
59055847Sbostic p_b_term(p, cs)
59155847Sbostic register struct parse *p;
59255847Sbostic register cset *cs;
59355847Sbostic {
59455847Sbostic 	register uchar c;
59555847Sbostic 	register uchar start, finish;
59655847Sbostic 	register int i;
59755847Sbostic 
59855847Sbostic 	/* classify what we've got */
59955847Sbostic 	switch (PEEK()) {
60055847Sbostic 	case '[':
60155847Sbostic 		c = PEEK2();
60255847Sbostic 		break;
60355847Sbostic 	case '-':
60455847Sbostic 		SETERROR(REG_ERANGE);
60555847Sbostic 		return;			/* NOTE RETURN */
60655847Sbostic 		break;
60755847Sbostic 	default:
60855847Sbostic 		c = '\0';
60955847Sbostic 		break;
61055847Sbostic 	}
61155847Sbostic 
61255847Sbostic 	switch (c) {
61355847Sbostic 	case ':':		/* character class */
61455847Sbostic 		NEXT2();
61555847Sbostic 		c = PEEK();
61655847Sbostic 		REQUIRE(c != '\0', REG_EBRACK);
61755847Sbostic 		REQUIRE(c != '-' && c != ']', REG_ECTYPE);
61855847Sbostic 		p_b_cclass(p, cs);
61955847Sbostic 		MUSTNOTSEE('\0', REG_EBRACK);
62055847Sbostic 		REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
62155847Sbostic 		break;
62255847Sbostic 	case '=':		/* equivalence class */
62355847Sbostic 		NEXT2();
62455847Sbostic 		c = PEEK();
62555847Sbostic 		REQUIRE(c != '\0', REG_EBRACK);
62655847Sbostic 		REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
62755847Sbostic 		p_b_eclass(p, cs);
62855847Sbostic 		MUSTNOTSEE('\0', REG_EBRACK);
62955847Sbostic 		REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
63055847Sbostic 		break;
63155847Sbostic 	default:		/* symbol, ordinary character, or range */
63255847Sbostic /* xxx revision needed for multichar stuff */
63355847Sbostic 		start = p_b_symbol(p);
63455847Sbostic 		if (PEEK() == '-' && (c = PEEK2()) != ']' && c != '\0') {
63555847Sbostic 			/* range */
63655847Sbostic 			NEXT();
63755847Sbostic 			if (EAT('-'))
63855847Sbostic 				finish = '-';
63955847Sbostic 			else
64055847Sbostic 				finish = p_b_symbol(p);
64155847Sbostic 		} else
64255847Sbostic 			finish = start;
64355847Sbostic 		REQUIRE(start <= finish, REG_ERANGE);
64455847Sbostic 		for (i = start; i <= finish; i++) {
64555847Sbostic 			CHadd(cs, i);
64655847Sbostic 			if ((p->g->cflags&REG_ICASE) && isalpha(i)) {
64755847Sbostic 				c = othercase((uchar)i);
64855847Sbostic 				CHadd(cs, c);
64955847Sbostic 			}
65055847Sbostic 		}
65155847Sbostic 		break;
65255847Sbostic 	}
65355847Sbostic }
65455847Sbostic 
65555847Sbostic /*
65655847Sbostic  - p_b_cclass - parse a character-class name and deal with it
657*56355Sbostic  == static void p_b_cclass(register struct parse *p, register cset *cs);
65855847Sbostic  */
65955847Sbostic static void
66055847Sbostic p_b_cclass(p, cs)
66155847Sbostic register struct parse *p;
66255847Sbostic register cset *cs;
66355847Sbostic {
66455847Sbostic 	register uchar *sb = p->next;
66555847Sbostic 	register uchar *se = sb;
66655847Sbostic 	register struct cclass *cp;
66755847Sbostic 	register int len;
66855847Sbostic 	register uchar *u;
66955847Sbostic 	register uchar c;
67055847Sbostic 
67155847Sbostic 	while (isalpha(*se))
67255847Sbostic 		se++;
67355847Sbostic 	len = se - sb;
67455847Sbostic 	NEXTn(len);
67555847Sbostic 	for (cp = cclasses; cp->name != NULL; cp++)
67655847Sbostic 		if (strncmp(cp->name, (char *)sb, len) == 0 &&
677*56355Sbostic 							cp->name[len] == '\0')
67855847Sbostic 			break;
67955847Sbostic 	if (cp->name == NULL) {
68055847Sbostic 		/* oops, didn't find it */
68155847Sbostic 		SETERROR(REG_ECTYPE);
68255847Sbostic 		return;
68355847Sbostic 	}
68455847Sbostic 
68555847Sbostic 	u = (uchar *)cp->chars;
68655847Sbostic 	while ((c = *u++) != '\0')
68755847Sbostic 		CHadd(cs, c);
68855847Sbostic 	for (u = (uchar *)cp->multis; *u != '\0'; u += strlen((char *)u) + 1)
68955847Sbostic 		MCadd(cs, u);
69055847Sbostic }
69155847Sbostic 
69255847Sbostic /*
69355847Sbostic  - p_b_eclass - parse an equivalence-class name and deal with it
694*56355Sbostic  == static void p_b_eclass(register struct parse *p, register cset *cs);
69555847Sbostic  *
69655847Sbostic  * This implementation is incomplete. xxx
69755847Sbostic  */
69855847Sbostic static void
69955847Sbostic p_b_eclass(p, cs)
70055847Sbostic register struct parse *p;
70155847Sbostic register cset *cs;
70255847Sbostic {
70355847Sbostic 	register uchar c;
70455847Sbostic 
70555847Sbostic 	c = p_b_coll_elem(p, '=');
70655847Sbostic 	CHadd(cs, c);
70755847Sbostic }
70855847Sbostic 
70955847Sbostic /*
71055847Sbostic  - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
711*56355Sbostic  == static uchar p_b_symbol(register struct parse *p);
71255847Sbostic  */
71355847Sbostic static uchar			/* value of symbol */
71455847Sbostic p_b_symbol(p)
71555847Sbostic register struct parse *p;
71655847Sbostic {
71755847Sbostic 	register uchar value;
71855847Sbostic 
71955847Sbostic 	if (!EATTWO('[', '.')) {
72055847Sbostic 		MUSTNOTSEE('\0', REG_EBRACK);
72155847Sbostic 		return(GETNEXT());
72255847Sbostic 	}
72355847Sbostic 
72455847Sbostic 	/* collating symbol */
72555847Sbostic 	MUSTNOTSEE('\0', REG_EBRACK);
72655847Sbostic 	value = p_b_coll_elem(p, '.');
72755847Sbostic 	REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
72855847Sbostic 	return(value);
72955847Sbostic }
73055847Sbostic 
73155847Sbostic /*
73255847Sbostic  - p_b_coll_elem - parse a collating-element name and look it up
733*56355Sbostic  == static uchar p_b_coll_elem(register struct parse *p, uchar endc);
73455847Sbostic  */
73555847Sbostic static uchar			/* value of collating element */
73655847Sbostic p_b_coll_elem(p, endc)
73755847Sbostic register struct parse *p;
73855847Sbostic uchar endc;			/* name ended by endc,']' */
73955847Sbostic {
74055847Sbostic 	register uchar *sp = p->next;
74155847Sbostic 	register struct cname *cp;
74255847Sbostic 	register int len;
74355847Sbostic 	register uchar c;
74455847Sbostic 
74555847Sbostic 	while ((c = PEEK()) != '\0' && !SEETWO(endc, ']'))
74655847Sbostic 		NEXT();
74755847Sbostic 	if (c == '\0') {
74855847Sbostic 		SETERROR(REG_EBRACK);
74955847Sbostic 		return(0);
75055847Sbostic 	}
75155847Sbostic 	len = p->next - sp;
75255847Sbostic 	for (cp = cnames; cp->name != NULL; cp++)
75355847Sbostic 		if (strncmp(cp->name, (char *)sp, len) == 0 &&
754*56355Sbostic 							cp->name[len] == '\0')
75555847Sbostic 			return(cp->code);	/* known name */
75655847Sbostic 	if (len == 1)
75755847Sbostic 		return(*sp);	/* single character */
75855847Sbostic 	SETERROR(REG_ECOLLATE);			/* neither */
75955847Sbostic 	return(0);
76055847Sbostic }
76155847Sbostic 
76255847Sbostic /*
76355847Sbostic  - othercase - return the case counterpart of an alphabetic
764*56355Sbostic  == static uchar othercase(uchar ch);
76555847Sbostic  */
76655847Sbostic static uchar
76755847Sbostic othercase(ch)
76855847Sbostic uchar ch;
76955847Sbostic {
77055847Sbostic 	assert(isalpha(ch));
77155847Sbostic 	if (isupper(ch))
77255847Sbostic 		return(tolower(ch));
77355847Sbostic 	else if (islower(ch))
77455847Sbostic 		return(toupper(ch));
77555847Sbostic 	else			/* peculiar, but could happen */
77655847Sbostic 		return(ch);
77755847Sbostic }
77855847Sbostic 
77955847Sbostic /*
78055847Sbostic  - bothcases - emit a dualcase version of a character
781*56355Sbostic  == static void bothcases(register struct parse *p, uchar ch);
782*56355Sbostic  *
78355847Sbostic  * Boy, is this implementation ever a kludge...
78455847Sbostic  */
78555847Sbostic static void
78655847Sbostic bothcases(p, ch)
78755847Sbostic register struct parse *p;
78855847Sbostic uchar ch;
78955847Sbostic {
79055847Sbostic 	register uchar *oldnext;
79155847Sbostic 	uchar bracket[3];
79255847Sbostic 
79355847Sbostic 	oldnext = p->next;
79455847Sbostic 	p->next = bracket;
79555847Sbostic 	bracket[0] = ch;
79655847Sbostic 	bracket[1] = ']';
79755847Sbostic 	bracket[2] = '\0';
79855847Sbostic 	p_bracket(p);
79955847Sbostic 	assert(p->next == bracket+2);
80055847Sbostic 	p->next = oldnext;
80155847Sbostic }
80255847Sbostic 
80355847Sbostic /*
80455847Sbostic  - ordinary - emit an ordinary character
805*56355Sbostic  == static void ordinary(register struct parse *p, register uchar ch);
80655847Sbostic  */
80755847Sbostic static void
80855847Sbostic ordinary(p, ch)
80955847Sbostic register struct parse *p;
81055847Sbostic register uchar ch;
81155847Sbostic {
81255847Sbostic 	register uchar *cap = p->g->categories;
81355847Sbostic 
81455847Sbostic 	if ((p->g->cflags&REG_ICASE) && isalpha(ch)) {
81555847Sbostic 		bothcases(p, ch);
81655847Sbostic 		return;
81755847Sbostic 	}
81855847Sbostic 
81955847Sbostic 	EMIT(OCHAR, ch);
82055847Sbostic 	if (cap[ch] == 0)
82155847Sbostic 		cap[ch] = p->g->ncategories++;
82255847Sbostic }
82355847Sbostic 
82455847Sbostic /*
82555847Sbostic  - nonnewline - emit REG_NEWLINE version of OANY
826*56355Sbostic  == static void nonnewline(register struct parse *p);
827*56355Sbostic  *
82855847Sbostic  * Boy, is this implementation ever a kludge...
82955847Sbostic  */
83055847Sbostic static void
83155847Sbostic nonnewline(p)
83255847Sbostic register struct parse *p;
83355847Sbostic {
83455847Sbostic 	register uchar *oldnext;
83555847Sbostic 	uchar bracket[4];
83655847Sbostic 
83755847Sbostic 	oldnext = p->next;
83855847Sbostic 	p->next = bracket;
83955847Sbostic 	bracket[0] = '^';
84055847Sbostic 	bracket[1] = '\n';
84155847Sbostic 	bracket[2] = ']';
84255847Sbostic 	bracket[3] = '\0';
84355847Sbostic 	p_bracket(p);
84455847Sbostic 	assert(p->next == bracket+3);
84555847Sbostic 	p->next = oldnext;
84655847Sbostic }
84755847Sbostic 
84855847Sbostic /*
84955847Sbostic  - repeat - generate code for a bounded repetition, recursively if needed
850*56355Sbostic  == static void repeat(register struct parse *p, sopno start, int from, int to);
85155847Sbostic  */
85255847Sbostic static void
85355847Sbostic repeat(p, start, from, to)
85455847Sbostic register struct parse *p;
85555847Sbostic sopno start;			/* operand from here to end of strip */
85655847Sbostic int from;			/* repeated from this number */
85755847Sbostic int to;				/* to this number of times (maybe INFINITY) */
85855847Sbostic {
85955847Sbostic 	register sopno finish = HERE();
86055847Sbostic #	define	N	2
86155847Sbostic #	define	INF	3
86255847Sbostic #	define	REP(f, t)	((f)*8 + (t))
86355847Sbostic #	define	MAP(n)	(((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
86455847Sbostic 	register sopno copy;
86555847Sbostic 
86655847Sbostic 	if (p->error != 0)	/* head off possible runaway recursion */
86755847Sbostic 		return;
86855847Sbostic 
86955847Sbostic 	assert(from <= to);
87055847Sbostic 
87155847Sbostic 	switch (REP(MAP(from), MAP(to))) {
87255847Sbostic 	case REP(0, 0):			/* must be user doing this */
87355847Sbostic 		DROP(finish-start);	/* drop the operand */
87455847Sbostic 		break;
87555847Sbostic 	case REP(0, 1):			/* as x{1,1}? */
87655847Sbostic 	case REP(0, N):			/* as x{1,n}? */
87755847Sbostic 	case REP(0, INF):		/* as x{1,}? */
87855847Sbostic 		INSERT(OQUEST_, start);		/* offset is wrong... */
87955847Sbostic 		repeat(p, start+1, 1, to);
88055847Sbostic 		FWD(start);			/* ... fix it */
88155847Sbostic 		BACK(O_QUEST, start);
88255847Sbostic 		break;
88355847Sbostic 	case REP(1, 1):			/* trivial case */
88455847Sbostic 		/* done */
88555847Sbostic 		break;
88655847Sbostic 	case REP(1, N):			/* as x?x{1,n-1} */
88755847Sbostic 		INSERT(OQUEST_, start);
88855847Sbostic 		BACK(O_QUEST, start);
88955847Sbostic 		copy = dupl(p, start+1, finish+1);
89055847Sbostic 		assert(copy == finish+2);
89155847Sbostic 		repeat(p, copy, 1, to-1);
89255847Sbostic 		break;
89355847Sbostic 	case REP(1, INF):		/* as x+ */
89455847Sbostic 		INSERT(OPLUS_, start);
89555847Sbostic 		BACK(O_PLUS, start);
89655847Sbostic 		break;
89755847Sbostic 	case REP(N, N):			/* as xx{m-1,n-1} */
89855847Sbostic 		copy = dupl(p, start, finish);
89955847Sbostic 		repeat(p, copy, from-1, to-1);
90055847Sbostic 		break;
90155847Sbostic 	case REP(N, INF):		/* as xx{n-1,INF} */
90255847Sbostic 		copy = dupl(p, start, finish);
90355847Sbostic 		repeat(p, copy, from-1, to);
90455847Sbostic 		break;
90555847Sbostic 	default:			/* "can't happen" */
90655847Sbostic 		SETERROR(REG_ASSERT);	/* just in case */
90755847Sbostic 		break;
90855847Sbostic 	}
90955847Sbostic }
91055847Sbostic 
91155847Sbostic /*
91255847Sbostic  - seterr - set an error condition
913*56355Sbostic  == static int seterr(register struct parse *p, int e);
91455847Sbostic  */
91555847Sbostic static int			/* useless but makes type checking happy */
91655847Sbostic seterr(p, e)
91755847Sbostic register struct parse *p;
91855847Sbostic int e;
91955847Sbostic {
92055847Sbostic 	if (p->error == 0)	/* keep earliest error condition */
92155847Sbostic 		p->error = e;
92255847Sbostic 	p->next = nuls;		/* try to bring things to a halt */
92355847Sbostic 	return(0);		/* make the return value well-defined */
92455847Sbostic }
92555847Sbostic 
92655847Sbostic /*
92755847Sbostic  - allocset - allocate a set of characters for []
928*56355Sbostic  == static cset *allocset(register struct parse *p);
92955847Sbostic  */
93055847Sbostic static cset *
93155847Sbostic allocset(p)
93255847Sbostic register struct parse *p;
93355847Sbostic {
93455847Sbostic 	register int no = p->g->ncsets++;
93555847Sbostic 	register size_t nc;
93655847Sbostic 	register size_t nbytes;
93755847Sbostic 	register cset *cs;
93855847Sbostic 	register size_t css = (size_t)p->g->csetsize;
93955847Sbostic 
94055847Sbostic 	if (no >= p->ncsalloc) {	/* need another column of space */
94155847Sbostic 		p->ncsalloc += CHAR_BIT;
94255847Sbostic 		nc = p->ncsalloc;
94355847Sbostic 		assert(nc % CHAR_BIT == 0);
94455847Sbostic 		nbytes = nc / CHAR_BIT * css;
94555847Sbostic 		if (p->g->sets == NULL)
94655847Sbostic 			p->g->sets = (cset *)malloc(nc * sizeof(cset));
94755847Sbostic 		else
94855847Sbostic 			p->g->sets = (cset *)realloc((char *)p->g->sets,
94955847Sbostic 							nc * sizeof(cset));
95055847Sbostic 		if (p->g->setbits == NULL)
95155847Sbostic 			p->g->setbits = (uchar *)malloc(nbytes);
95255847Sbostic 		else
95355847Sbostic 			p->g->setbits = (uchar *)realloc((char *)p->g->setbits,
95455847Sbostic 								nbytes);
95555847Sbostic 		if (p->g->sets != NULL && p->g->setbits != NULL)
95655847Sbostic 			(void) memset((char *)p->g->setbits + (nbytes - css),
95755847Sbostic 								0, css);
95855847Sbostic 		else {
95955847Sbostic 			no = 0;
96055847Sbostic 			SETERROR(REG_ESPACE);
96155847Sbostic 			/* caller's responsibility not to do set ops */
96255847Sbostic 		}
96355847Sbostic 	}
96455847Sbostic 
96555847Sbostic 	assert(p->g->sets != NULL);	/* xxx */
96655847Sbostic 	cs = &p->g->sets[no];
96755847Sbostic 	cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
96855847Sbostic 	cs->mask = 1 << ((no) % CHAR_BIT);
96955847Sbostic 	cs->hash = 0;
97055847Sbostic 	cs->smultis = 0;
97155847Sbostic 	cs->multis = NULL;
97255847Sbostic 
97355847Sbostic 	return(cs);
97455847Sbostic }
97555847Sbostic 
97655847Sbostic /*
97755847Sbostic  - freezeset - final processing on a set of characters
978*56355Sbostic  == static int freezeset(register struct parse *p, register cset *cs);
97955847Sbostic  *
98055847Sbostic  * The main task here is merging identical sets.  This is usually a waste
98155847Sbostic  * of time (although the hash code minimizes the overhead), but can win
98255847Sbostic  * big if REG_ICASE is being used.  REG_ICASE, by the way, is why the hash
98355847Sbostic  * is done using addition rather than xor -- all ASCII [aA] sets xor to
98455847Sbostic  * the same value!
98555847Sbostic  */
98655847Sbostic static int			/* set number */
98755847Sbostic freezeset(p, cs)
98855847Sbostic register struct parse *p;
98955847Sbostic register cset *cs;
99055847Sbostic {
99155847Sbostic 	register uchar h = cs->hash;
99255847Sbostic 	register int i;
99355847Sbostic 	register cset *top = &p->g->sets[p->g->ncsets];
99455847Sbostic 	register cset *cs2;
99555847Sbostic 	register size_t css = (size_t)p->g->csetsize;
99655847Sbostic 
99755847Sbostic 	/* look for an earlier one which is the same */
99855847Sbostic 	for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
99955847Sbostic 		if (cs2->hash == h && cs2 != cs) {
100055847Sbostic 			/* maybe */
100155847Sbostic 			for (i = 0; i < css; i++)
100255847Sbostic 				if (!!CHIN(cs2, i) != !!CHIN(cs, i))
100355847Sbostic 					break;		/* no */
100455847Sbostic 			if (i == css)
100555847Sbostic 				break;			/* yes */
100655847Sbostic 		}
100755847Sbostic 
100855847Sbostic 	if (cs2 < top) {	/* found one */
100955847Sbostic 		assert(cs == top-1);
101055847Sbostic 		p->g->ncsets--;
101155847Sbostic 		for (i = 0; i < css; i++)
101255847Sbostic 			CHsub(cs, i);
101355847Sbostic 		cs = cs2;
101455847Sbostic 	}
101555847Sbostic 
101655847Sbostic 	return((int)(cs - p->g->sets));
101755847Sbostic }
101855847Sbostic 
101955847Sbostic /*
102055847Sbostic  - mcadd - add a collating element to a cset
1021*56355Sbostic  == static void mcadd(register struct parse *p, register cset *cs, \
1022*56355Sbostic  ==	register uchar *cp);
102355847Sbostic  */
102455847Sbostic static void
102555847Sbostic mcadd(p, cs, cp)
102655847Sbostic register struct parse *p;
102755847Sbostic register cset *cs;
102855847Sbostic register uchar *cp;
102955847Sbostic {
103055847Sbostic 	register size_t oldend = cs->smultis;
103155847Sbostic 
103255847Sbostic 	cs->smultis += strlen((char *)cp) + 1;
103355847Sbostic 	if (cs->multis == NULL)
103455847Sbostic 		cs->multis = (uchar *)malloc(cs->smultis);
103555847Sbostic 	else
103655847Sbostic 		cs->multis = (uchar *)realloc(cs->multis, cs->smultis);
103755847Sbostic 	if (cs->multis == NULL) {
103855847Sbostic 		SETERROR(REG_ESPACE);
103955847Sbostic 		return;
104055847Sbostic 	}
104155847Sbostic 
104255847Sbostic 	(void) strcpy((char *)(cs->multis + oldend - 1), (char *)cp);
104355847Sbostic 	cs->multis[cs->smultis - 1] = '\0';
104455847Sbostic }
104555847Sbostic 
104655847Sbostic /*
104755847Sbostic  - mcsub - subtract a collating element from a cset
1048*56355Sbostic  == static void mcsub(register struct parse *p, register cset *cs, \
1049*56355Sbostic  ==	register uchar *cp);
105055847Sbostic  */
105155847Sbostic static void
105255847Sbostic mcsub(p, cs, cp)
105355847Sbostic register struct parse *p;
105455847Sbostic register cset *cs;
105555847Sbostic register uchar *cp;
105655847Sbostic {
105755847Sbostic 	register uchar *fp = mcfind(cs, cp);
105855847Sbostic 	register size_t len = strlen((char *)fp);
105955847Sbostic 
106055847Sbostic 	assert(p != NULL);
106155847Sbostic 	(void) memmove((char *)fp, (char *)(fp + len + 1),
106255847Sbostic 				cs->smultis - (fp + len + 1 - cs->multis));
106355847Sbostic 	cs->smultis -= len;
106455847Sbostic 
106555847Sbostic 	if (cs->smultis == 0) {
106655847Sbostic 		free((char *)cs->multis);
106755847Sbostic 		cs->multis = NULL;
106855847Sbostic 		return;
106955847Sbostic 	}
107055847Sbostic 
107155847Sbostic 	cs->multis = (uchar *)realloc(cs->multis, cs->smultis);
107255847Sbostic 	assert(cs->multis != NULL);
107355847Sbostic }
107455847Sbostic 
107555847Sbostic /*
107655847Sbostic  - mcin - is a collating element in a cset?
1077*56355Sbostic  == static int mcin(register struct parse *p, register cset *cs, \
1078*56355Sbostic  ==	register uchar *cp);
107955847Sbostic  */
108055847Sbostic static int
108155847Sbostic mcin(p, cs, cp)
108255847Sbostic register struct parse *p;
108355847Sbostic register cset *cs;
108455847Sbostic register uchar *cp;
108555847Sbostic {
108655847Sbostic 	return(mcfind(cs, cp) != NULL);
108755847Sbostic }
108855847Sbostic 
108955847Sbostic /*
109055847Sbostic  - mcfind - find a collating element in a cset
1091*56355Sbostic  == static uchar *mcfind(register cset *cs, register uchar *cp);
109255847Sbostic  */
109355847Sbostic static uchar *
109455847Sbostic mcfind(cs, cp)
109555847Sbostic register cset *cs;
109655847Sbostic register uchar *cp;
109755847Sbostic {
109855847Sbostic 	register uchar *p;
109955847Sbostic 
110055847Sbostic 	if (cs->multis == NULL)
110155847Sbostic 		return(NULL);
110255847Sbostic 	for (p = cs->multis; *p != '\0'; p += strlen((char *)p) + 1)
110355847Sbostic 		if (strcmp((char *)cp, (char *)p) == 0)
110455847Sbostic 			return(p);
110555847Sbostic 	return(NULL);
110655847Sbostic }
110755847Sbostic 
110855847Sbostic /*
110955847Sbostic  - mcinvert - invert the list of collating elements in a cset
1110*56355Sbostic  == static void mcinvert(register struct parse *p, register cset *cs);
111155847Sbostic  *
111255847Sbostic  * This would have to know the set of possibilities.  Implementation
111355847Sbostic  * is deferred.
111455847Sbostic  */
111555847Sbostic static void
111655847Sbostic mcinvert(p, cs)
111755847Sbostic register struct parse *p;
111855847Sbostic register cset *cs;
111955847Sbostic {
112055847Sbostic 	assert(cs->multis == NULL);	/* xxx */
112155847Sbostic }
112255847Sbostic 
112355847Sbostic /*
112455847Sbostic  - isinsets - is this character in any sets?
1125*56355Sbostic  == static int isinsets(register struct re_guts *g, uchar c);
112655847Sbostic  */
112755847Sbostic static int			/* predicate */
112855847Sbostic isinsets(g, c)
112955847Sbostic register struct re_guts *g;
113055847Sbostic uchar c;
113155847Sbostic {
113255847Sbostic 	register uchar *col;
113355847Sbostic 	register int i;
113455847Sbostic 	register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
113555847Sbostic 
113655847Sbostic 	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
113755847Sbostic 		if (col[c] != 0)
113855847Sbostic 			return(1);
113955847Sbostic 	return(0);
114055847Sbostic }
114155847Sbostic 
114255847Sbostic /*
114355847Sbostic  - samesets - are these two characters in exactly the same sets?
1144*56355Sbostic  == static int samesets(register struct re_guts *g, uchar c1, uchar c2);
114555847Sbostic  */
114655847Sbostic static int			/* predicate */
114755847Sbostic samesets(g, c1, c2)
114855847Sbostic register struct re_guts *g;
114955847Sbostic register uchar c1;
115055847Sbostic register uchar c2;
115155847Sbostic {
115255847Sbostic 	register uchar *col;
115355847Sbostic 	register int i;
115455847Sbostic 	register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
115555847Sbostic 
115655847Sbostic 	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
115755847Sbostic 		if (col[c1] != col[c2])
115855847Sbostic 			return(0);
115955847Sbostic 	return(1);
116055847Sbostic }
116155847Sbostic 
116255847Sbostic /*
116355847Sbostic  - categorize - sort out character categories
1164*56355Sbostic  == static void categorize(struct parse *p, register struct re_guts *g);
116555847Sbostic  */
116655847Sbostic static void
116755847Sbostic categorize(p, g)
116855847Sbostic struct parse *p;
116955847Sbostic register struct re_guts *g;
117055847Sbostic {
117155847Sbostic 	register uchar *cats = g->categories;
117255847Sbostic 	register unsigned c;
117355847Sbostic 	register unsigned c2;
117455847Sbostic 	register uchar cat;
117555847Sbostic 
117655847Sbostic 	/* avoid making error situations worse */
117755847Sbostic 	if (p->error != 0)
117855847Sbostic 		return;
117955847Sbostic 
118055847Sbostic 	for (c = 0; c < g->csetsize; c++)
118155847Sbostic 		if (cats[c] == 0 && isinsets(g, c)) {
118255847Sbostic 			cat = g->ncategories++;
118355847Sbostic 			cats[c] = cat;
118455847Sbostic 			for (c2 = c+1; c2 < g->csetsize; c2++)
118555847Sbostic 				if (cats[c2] == 0 && samesets(g, c, c2))
118655847Sbostic 					cats[c2] = cat;
118755847Sbostic 		}
118855847Sbostic }
118955847Sbostic 
119055847Sbostic /*
119155847Sbostic  - dupl - emit a duplicate of a bunch of sops
1192*56355Sbostic  == static sopno dupl(register struct parse *p, sopno start, sopno finish);
119355847Sbostic  */
119455847Sbostic static sopno			/* start of duplicate */
119555847Sbostic dupl(p, start, finish)
119655847Sbostic register struct parse *p;
119755847Sbostic sopno start;			/* from here */
119855847Sbostic sopno finish;			/* to this less one */
119955847Sbostic {
120055847Sbostic 	register sopno ret = HERE();
120155847Sbostic 	register sopno len = finish - start;
120255847Sbostic 
120355847Sbostic 	assert(finish >= start);
120455847Sbostic 	if (len == 0)
120555847Sbostic 		return(ret);
120655847Sbostic 	enlarge(p, p->ssize + len);	/* this many unexpected additions */
120755847Sbostic 	assert(p->ssize >= p->slen + len);
120855847Sbostic 	(void) memcpy((char *)(p->strip + p->slen),
120955847Sbostic 		(char *)(p->strip + start), (size_t)len*sizeof(sop));
121055847Sbostic 	p->slen += len;
121155847Sbostic 	return(ret);
121255847Sbostic }
121355847Sbostic 
121455847Sbostic /*
121555847Sbostic  - doemit - emit a strip operator
1216*56355Sbostic  == static void doemit(register struct parse *p, sop op, size_t opnd);
121755847Sbostic  *
121855847Sbostic  * It might seem better to implement this as a macro with a function as
121955847Sbostic  * hard-case backup, but it's just too big and messy unless there are
122055847Sbostic  * some changes to the data structures.  Maybe later.
122155847Sbostic  */
122255847Sbostic static void
122355847Sbostic doemit(p, op, opnd)
122455847Sbostic register struct parse *p;
122555847Sbostic sop op;
122655847Sbostic size_t opnd;
122755847Sbostic {
122855847Sbostic 	/* avoid making error situations worse */
122955847Sbostic 	if (p->error != 0)
123055847Sbostic 		return;
123155847Sbostic 
123255847Sbostic 	/* deal with oversize operands ("can't happen", more or less) */
123355847Sbostic 	assert(opnd < 1<<OPSHIFT);
123455847Sbostic 
123555847Sbostic 	/* deal with undersized strip */
123655847Sbostic 	if (p->slen >= p->ssize)
123755847Sbostic 		enlarge(p, (p->ssize+1) / 2 * 3);	/* +50% */
123855847Sbostic 	assert(p->slen < p->ssize);
123955847Sbostic 
124055847Sbostic 	/* finally, it's all reduced to the easy case */
124155847Sbostic 	p->strip[p->slen++] = SOP(op, opnd);
124255847Sbostic }
124355847Sbostic 
124455847Sbostic /*
124555847Sbostic  - doinsert - insert a sop into the strip
1246*56355Sbostic  == static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos);
124755847Sbostic  */
124855847Sbostic static void
124955847Sbostic doinsert(p, op, opnd, pos)
125055847Sbostic register struct parse *p;
125155847Sbostic sop op;
125255847Sbostic size_t opnd;
125355847Sbostic sopno pos;
125455847Sbostic {
125555847Sbostic 	register sopno sn;
125655847Sbostic 	register sop s;
125755847Sbostic 	register int i;
125855847Sbostic 
125955847Sbostic 	/* avoid making error situations worse */
126055847Sbostic 	if (p->error != 0)
126155847Sbostic 		return;
126255847Sbostic 
126355847Sbostic 	sn = HERE();
126455847Sbostic 	EMIT(op, opnd);		/* do checks, ensure space */
126555847Sbostic 	assert(HERE() == sn+1);
126655847Sbostic 	s = p->strip[sn];
126755847Sbostic 
126855847Sbostic 	/* adjust paren pointers */
126955847Sbostic 	assert(pos > 0);
127055847Sbostic 	for (i = 1; i < NPAREN; i++) {
127155847Sbostic 		if (p->pbegin[i] >= pos) {
127255847Sbostic 			p->pbegin[i]++;
127355847Sbostic 		}
127455847Sbostic 		if (p->pend[i] >= pos) {
127555847Sbostic 			p->pend[i]++;
127655847Sbostic 		}
127755847Sbostic 	}
127855847Sbostic 
127955847Sbostic 	memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
128055847Sbostic 						(HERE()-pos-1)*sizeof(sop));
128155847Sbostic 	p->strip[pos] = s;
128255847Sbostic }
128355847Sbostic 
128455847Sbostic /*
128555847Sbostic  - dofwd - complete a forward reference
1286*56355Sbostic  == static void dofwd(register struct parse *p, sopno pos, sop value);
128755847Sbostic  */
128855847Sbostic static void
128955847Sbostic dofwd(p, pos, value)
129055847Sbostic register struct parse *p;
129155847Sbostic register sopno pos;
129255847Sbostic sop value;
129355847Sbostic {
129455847Sbostic 	/* avoid making error situations worse */
129555847Sbostic 	if (p->error != 0)
129655847Sbostic 		return;
129755847Sbostic 
129855847Sbostic 	assert(value < 1<<OPSHIFT);
129955847Sbostic 	p->strip[pos] = OP(p->strip[pos]) | value;
130055847Sbostic }
130155847Sbostic 
130255847Sbostic /*
130355847Sbostic  - enlarge - enlarge the strip
1304*56355Sbostic  == static void enlarge(register struct parse *p, sopno size);
130555847Sbostic  */
130655847Sbostic static void
130755847Sbostic enlarge(p, size)
130855847Sbostic register struct parse *p;
130955847Sbostic register sopno size;
131055847Sbostic {
131155847Sbostic 	register sop *sp;
131255847Sbostic 
131355847Sbostic 	if (p->ssize >= size)
131455847Sbostic 		return;
131555847Sbostic 
131655847Sbostic 	sp = (sop *)realloc(p->strip, size*sizeof(sop));
131755847Sbostic 	if (sp == NULL) {
131855847Sbostic 		SETERROR(REG_ESPACE);
131955847Sbostic 		return;
132055847Sbostic 	}
132155847Sbostic 	p->strip = sp;
132255847Sbostic 	p->ssize = size;
132355847Sbostic }
132455847Sbostic 
132555847Sbostic /*
132655847Sbostic  - stripsnug - compact the strip
1327*56355Sbostic  == static void stripsnug(register struct parse *p, register struct re_guts *g);
132855847Sbostic  */
132955847Sbostic static void
133055847Sbostic stripsnug(p, g)
133155847Sbostic register struct parse *p;
133255847Sbostic register struct re_guts *g;
133355847Sbostic {
133455847Sbostic 	g->nstates = p->slen;
133555847Sbostic 	g->strip = (sop *)realloc((sop *)p->strip, p->slen * sizeof(sop));
133655847Sbostic 	if (g->strip == NULL) {
133755847Sbostic 		SETERROR(REG_ESPACE);
133855847Sbostic 		g->strip = p->strip;
133955847Sbostic 	}
134055847Sbostic }
134155847Sbostic 
134255847Sbostic /*
134355847Sbostic  - findmust - fill in must and mlen with longest mandatory literal string
1344*56355Sbostic  == static void findmust(register struct parse *p, register struct re_guts *g);
134555847Sbostic  *
134655847Sbostic  * This algorithm could do fancy things like analyzing the operands of |
134755847Sbostic  * for common subsequences.  Someday.  This code is simple and finds most
134855847Sbostic  * of the interesting cases.
134955847Sbostic  *
135055847Sbostic  * Note that must and mlen got initialized during setup.
135155847Sbostic  */
1352*56355Sbostic static void
135355847Sbostic findmust(p, g)
135455847Sbostic struct parse *p;
135555847Sbostic register struct re_guts *g;
135655847Sbostic {
135755847Sbostic 	register sop *scan;
135855847Sbostic 	sop *start;
135955847Sbostic 	register sop *newstart;
136055847Sbostic 	register sopno newlen;
136155847Sbostic 	register sop s;
136255847Sbostic 	register char *cp;
136355847Sbostic 	register sopno i;
136455847Sbostic 
136555847Sbostic 	/* avoid making error situations worse */
136655847Sbostic 	if (p->error != 0)
136755847Sbostic 		return;
136855847Sbostic 
136955847Sbostic 	/* find the longest OCHAR sequence in strip */
137055847Sbostic 	newlen = 0;
137155847Sbostic 	scan = g->strip + 1;
137255847Sbostic 	do {
137355847Sbostic 		s = *scan++;
137455847Sbostic 		switch (OP(s)) {
137555847Sbostic 		case OCHAR:		/* sequence member */
137655847Sbostic 			if (newlen == 0)		/* new sequence */
137755847Sbostic 				newstart = scan - 1;
137855847Sbostic 			newlen++;
137955847Sbostic 			break;
138055847Sbostic 		case OPLUS_:		/* things that don't break one */
138155847Sbostic 		case OLPAREN:
138255847Sbostic 		case ORPAREN:
138355847Sbostic 			break;
138455847Sbostic 		case OQUEST_:		/* things that must be skipped */
138555847Sbostic 		case OCH_:
138655847Sbostic 			scan--;
138755847Sbostic 			do {
138855847Sbostic 				scan += OPND(s);
138955847Sbostic 				s = *scan;
139055847Sbostic 				/* assert() interferes w debug printouts */
139155847Sbostic 				if (OP(s) != O_QUEST && OP(s) != O_CH &&
139255847Sbostic 							OP(s) != OOR2) {
139355847Sbostic 					g->iflags |= BAD;
139455847Sbostic 					return;
139555847Sbostic 				}
139655847Sbostic 			} while (OP(s) != O_QUEST && OP(s) != O_CH);
139755847Sbostic 			/* fallthrough */
139855847Sbostic 		default:		/* things that break a sequence */
139955847Sbostic 			if (newlen > g->mlen) {		/* ends one */
140055847Sbostic 				start = newstart;
140155847Sbostic 				g->mlen = newlen;
140255847Sbostic 			}
140355847Sbostic 			newlen = 0;
140455847Sbostic 			break;
140555847Sbostic 		}
140655847Sbostic 	} while (OP(s) != OEND);
140755847Sbostic 
140855847Sbostic 	if (g->mlen == 0)		/* there isn't one */
140955847Sbostic 		return;
141055847Sbostic 
141155847Sbostic 	/* turn it into a character string */
141255847Sbostic 	g->must = malloc((size_t)g->mlen + 1);
141355847Sbostic 	if (g->must == NULL) {		/* argh; just forget it */
141455847Sbostic 		g->mlen = 0;
141555847Sbostic 		return;
141655847Sbostic 	}
141755847Sbostic 	cp = g->must;
141855847Sbostic 	scan = start;
141955847Sbostic 	for (i = g->mlen; i > 0; i--) {
142055847Sbostic 		while (OP(s = *scan++) != OCHAR)
142155847Sbostic 			continue;
142255847Sbostic 		*cp++ = OPND(s);
142355847Sbostic 	}
142455847Sbostic 	*cp++ = '\0';		/* just on general principles */
142555847Sbostic }
142655847Sbostic 
142755847Sbostic /*
142855847Sbostic  - pluscount - count + nesting
1429*56355Sbostic  == static sopno pluscount(register struct parse *p, register struct re_guts *g);
143055847Sbostic  */
1431*56355Sbostic static sopno			/* nesting depth */
143255847Sbostic pluscount(p, g)
143355847Sbostic struct parse *p;
143455847Sbostic register struct re_guts *g;
143555847Sbostic {
143655847Sbostic 	register sop *scan;
143755847Sbostic 	register sop s;
143855847Sbostic 	register sopno plusnest = 0;
143955847Sbostic 	register sopno maxnest = 0;
144055847Sbostic 
144155847Sbostic 	if (p->error != 0)
144255847Sbostic 		return(0);	/* there may not be an OEND */
144355847Sbostic 
144455847Sbostic 	scan = g->strip + 1;
144555847Sbostic 	do {
144655847Sbostic 		s = *scan++;
144755847Sbostic 		switch (OP(s)) {
144855847Sbostic 		case OPLUS_:
144955847Sbostic 			plusnest++;
145055847Sbostic 			break;
145155847Sbostic 		case O_PLUS:
145255847Sbostic 			if (plusnest > maxnest)
145355847Sbostic 				maxnest = plusnest;
145455847Sbostic 			plusnest--;
145555847Sbostic 			break;
145655847Sbostic 		}
145755847Sbostic 	} while (OP(s) != OEND);
145855847Sbostic 	if (plusnest != 0)
145955847Sbostic 		g->iflags |= BAD;
146055847Sbostic 	return(maxnest);
146155847Sbostic }
1462