xref: /netbsd-src/external/bsd/less/dist/regexp.c (revision 838f5788460f0f133b15d706e644d692a9d4d6ec)
1*838f5788Ssimonb /*	$NetBSD: regexp.c,v 1.4 2023/10/06 05:49:49 simonb Exp $	*/
220006a0bStron 
320006a0bStron /*
420006a0bStron  * regcomp and regexec -- regsub and regerror are elsewhere
520006a0bStron  *
620006a0bStron  *	Copyright (c) 1986 by University of Toronto.
720006a0bStron  *	Written by Henry Spencer.  Not derived from licensed software.
820006a0bStron  *
920006a0bStron  *	Permission is granted to anyone to use this software for any
1020006a0bStron  *	purpose on any computer system, and to redistribute it freely,
1120006a0bStron  *	subject to the following restrictions:
1220006a0bStron  *
1320006a0bStron  *	1. The author is not responsible for the consequences of use of
1420006a0bStron  *		this software, no matter how awful, even if they arise
1520006a0bStron  *		from defects in it.
1620006a0bStron  *
1720006a0bStron  *	2. The origin of this software must not be misrepresented, either
1820006a0bStron  *		by explicit claim or by omission.
1920006a0bStron  *
2020006a0bStron  *	3. Altered versions must be plainly marked as such, and must not
2120006a0bStron  *		be misrepresented as being the original software.
2220006a0bStron  *
2320006a0bStron  * Beware that some of this code is subtly aware of the way operator
2420006a0bStron  * precedence is structured in regular expressions.  Serious changes in
2520006a0bStron  * regular-expression syntax might require a total rethink.
2620006a0bStron  *
2720006a0bStron  * *** NOTE: this code has been altered slightly for use in Tcl. ***
2820006a0bStron  * Slightly modified by David MacKenzie to undo most of the changes for TCL.
2920006a0bStron  * Added regexec2 with notbol parameter. -- 4/19/99 Mark Nudelman
3020006a0bStron  */
3120006a0bStron 
3220006a0bStron #include "less.h"
3320006a0bStron #if HAVE_STDIO_H
3420006a0bStron #include <stdio.h>
3520006a0bStron #endif
3620006a0bStron #if HAVE_STDLIB_H
3720006a0bStron #include <stdlib.h>
3820006a0bStron #endif
3920006a0bStron #if HAVE_STRING_H
4020006a0bStron #include <string.h>
4120006a0bStron #endif
4220006a0bStron #include "regexp.h"
4320006a0bStron 
4420006a0bStron /*
4520006a0bStron  * The "internal use only" fields in regexp.h are present to pass info from
4620006a0bStron  * compile to execute that permits the execute phase to run lots faster on
4720006a0bStron  * simple cases.  They are:
4820006a0bStron  *
4920006a0bStron  * regstart	char that must begin a match; '\0' if none obvious
5020006a0bStron  * reganch	is the match anchored (at beginning-of-line only)?
5120006a0bStron  * regmust	string (pointer into program) that match must include, or NULL
5220006a0bStron  * regmlen	length of regmust string
5320006a0bStron  *
5420006a0bStron  * Regstart and reganch permit very fast decisions on suitable starting points
5520006a0bStron  * for a match, cutting down the work a lot.  Regmust permits fast rejection
5620006a0bStron  * of lines that cannot possibly match.  The regmust tests are costly enough
5720006a0bStron  * that regcomp() supplies a regmust only if the r.e. contains something
5820006a0bStron  * potentially expensive (at present, the only such thing detected is * or +
5920006a0bStron  * at the start of the r.e., which can involve a lot of backup).  Regmlen is
6020006a0bStron  * supplied because the test in regexec() needs it and regcomp() is
6120006a0bStron  * computing it anyway.
6220006a0bStron  */
6320006a0bStron 
6420006a0bStron /*
6520006a0bStron  * Structure for regexp "program".  This is essentially a linear encoding
6620006a0bStron  * of a nondeterministic finite-state machine (aka syntax charts or
6720006a0bStron  * "railroad normal form" in parsing technology).  Each node is an opcode
6820006a0bStron  * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
6920006a0bStron  * all nodes except BRANCH implement concatenation; a "next" pointer with
7020006a0bStron  * a BRANCH on both ends of it is connecting two alternatives.  (Here we
7120006a0bStron  * have one of the subtle syntax dependencies:  an individual BRANCH (as
7220006a0bStron  * opposed to a collection of them) is never concatenated with anything
7320006a0bStron  * because of operator precedence.)  The operand of some types of node is
7420006a0bStron  * a literal string; for others, it is a node leading into a sub-FSM.  In
7520006a0bStron  * particular, the operand of a BRANCH node is the first node of the branch.
7620006a0bStron  * (NB this is *not* a tree structure:  the tail of the branch connects
7720006a0bStron  * to the thing following the set of BRANCHes.)  The opcodes are:
7820006a0bStron  */
7920006a0bStron 
8020006a0bStron /* definition	number	opnd?	meaning */
8120006a0bStron #undef EOL
8220006a0bStron #define	END	0	/* no	End of program. */
8320006a0bStron #define	BOL	1	/* no	Match "" at beginning of line. */
8420006a0bStron #define	EOL	2	/* no	Match "" at end of line. */
8520006a0bStron #define	ANY	3	/* no	Match any one character. */
8620006a0bStron #define	ANYOF	4	/* str	Match any character in this string. */
8720006a0bStron #define	ANYBUT	5	/* str	Match any character not in this string. */
8820006a0bStron #define	BRANCH	6	/* node	Match this alternative, or the next... */
8920006a0bStron #define	BACK	7	/* no	Match "", "next" ptr points backward. */
9020006a0bStron #define	EXACTLY	8	/* str	Match this string. */
9120006a0bStron #define	NOTHING	9	/* no	Match empty string. */
9220006a0bStron #define	STAR	10	/* node	Match this (simple) thing 0 or more times. */
9320006a0bStron #define	PLUS	11	/* node	Match this (simple) thing 1 or more times. */
9420006a0bStron #define	OPEN	20	/* no	Mark this point in input as start of #n. */
9520006a0bStron 			/*	OPEN+1 is number 1, etc. */
9620006a0bStron #define	CLOSE	30	/* no	Analogous to OPEN. */
9720006a0bStron 
9820006a0bStron /*
9920006a0bStron  * Opcode notes:
10020006a0bStron  *
10120006a0bStron  * BRANCH	The set of branches constituting a single choice are hooked
10220006a0bStron  *		together with their "next" pointers, since precedence prevents
10320006a0bStron  *		anything being concatenated to any individual branch.  The
10420006a0bStron  *		"next" pointer of the last BRANCH in a choice points to the
10520006a0bStron  *		thing following the whole choice.  This is also where the
10620006a0bStron  *		final "next" pointer of each individual branch points; each
10720006a0bStron  *		branch starts with the operand node of a BRANCH node.
10820006a0bStron  *
10920006a0bStron  * BACK		Normal "next" pointers all implicitly point forward; BACK
11020006a0bStron  *		exists to make loop structures possible.
11120006a0bStron  *
11220006a0bStron  * STAR,PLUS	'?', and complex '*' and '+', are implemented as circular
11320006a0bStron  *		BRANCH structures using BACK.  Simple cases (one character
11420006a0bStron  *		per match) are implemented with STAR and PLUS for speed
11520006a0bStron  *		and to minimize recursive plunges.
11620006a0bStron  *
11720006a0bStron  * OPEN,CLOSE	...are numbered at compile time.
11820006a0bStron  */
11920006a0bStron 
12020006a0bStron /*
12120006a0bStron  * A node is one char of opcode followed by two chars of "next" pointer.
12220006a0bStron  * "Next" pointers are stored as two 8-bit pieces, high order first.  The
12320006a0bStron  * value is a positive offset from the opcode of the node containing it.
12420006a0bStron  * An operand, if any, simply follows the node.  (Note that much of the
12520006a0bStron  * code generation knows about this implicit relationship.)
12620006a0bStron  *
12720006a0bStron  * Using two bytes for the "next" pointer is vast overkill for most things,
12820006a0bStron  * but allows patterns to get big without disasters.
12920006a0bStron  */
13020006a0bStron #define	OP(p)	(*(p))
13120006a0bStron #define	NEXT(p)	(((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
13220006a0bStron #define	OPERAND(p)	((p) + 3)
13320006a0bStron 
13420006a0bStron /*
13520006a0bStron  * See regmagic.h for one further detail of program structure.
13620006a0bStron  */
13720006a0bStron 
13820006a0bStron 
13920006a0bStron /*
14020006a0bStron  * Utility definitions.
14120006a0bStron  */
14220006a0bStron #ifndef CHARBITS
14320006a0bStron #define	UCHARAT(p)	((int)*(unsigned char *)(p))
14420006a0bStron #else
14520006a0bStron #define	UCHARAT(p)	((int)*(p)&CHARBITS)
14620006a0bStron #endif
14720006a0bStron 
14820006a0bStron #define	FAIL(m)	{ regerror(m); return(NULL); }
14920006a0bStron #define	ISMULT(c)	((c) == '*' || (c) == '+' || (c) == '?')
15020006a0bStron #define	META	"^$.[()|?+*\\"
15120006a0bStron 
15220006a0bStron /*
15320006a0bStron  * Flags to be passed up and down.
15420006a0bStron  */
15520006a0bStron #define	HASWIDTH	01	/* Known never to match null string. */
15620006a0bStron #define	SIMPLE		02	/* Simple enough to be STAR/PLUS operand. */
15720006a0bStron #define	SPSTART		04	/* Starts with * or +. */
15820006a0bStron #define	WORST		0	/* Worst case. */
15920006a0bStron 
16020006a0bStron /*
16120006a0bStron  * Global work variables for regcomp().
16220006a0bStron  */
16320006a0bStron static char *regparse;		/* Input-scan pointer. */
16420006a0bStron static int regnpar;		/* () count. */
16520006a0bStron static char regdummy;
16620006a0bStron static char *regcode;		/* Code-emit pointer; &regdummy = don't. */
16720006a0bStron static long regsize;		/* Code size. */
16820006a0bStron 
16920006a0bStron /*
17020006a0bStron  * The first byte of the regexp internal "program" is actually this magic
17120006a0bStron  * number; the start node begins in the second byte.
17220006a0bStron  */
17320006a0bStron #define	MAGIC	0234
17420006a0bStron 
17520006a0bStron 
17620006a0bStron /*
17720006a0bStron  * Forward declarations for regcomp()'s friends.
17820006a0bStron  */
17920006a0bStron #ifndef STATIC
18020006a0bStron #define	STATIC	static
18120006a0bStron #endif
18220006a0bStron STATIC char *reg();
18320006a0bStron STATIC char *regbranch();
18420006a0bStron STATIC char *regpiece();
18520006a0bStron STATIC char *regatom();
18620006a0bStron STATIC char *regnode();
18720006a0bStron STATIC char *regnext();
18820006a0bStron STATIC void regc();
18920006a0bStron STATIC void reginsert();
19020006a0bStron STATIC void regtail();
19120006a0bStron STATIC void regoptail();
19220006a0bStron #ifdef STRCSPN
19320006a0bStron STATIC int strcspn();
19420006a0bStron #endif
19520006a0bStron 
19620006a0bStron /*
19720006a0bStron  - regcomp - compile a regular expression into internal code
19820006a0bStron  *
19920006a0bStron  * We can't allocate space until we know how big the compiled form will be,
20020006a0bStron  * but we can't compile it (and thus know how big it is) until we've got a
20120006a0bStron  * place to put the code.  So we cheat:  we compile it twice, once with code
20220006a0bStron  * generation turned off and size counting turned on, and once "for real".
20320006a0bStron  * This also means that we don't allocate space until we are sure that the
20420006a0bStron  * thing really will compile successfully, and we never have to move the
20520006a0bStron  * code and thus invalidate pointers into it.  (Note that it has to be in
20620006a0bStron  * one piece because free() must be able to free it all.)
20720006a0bStron  *
20820006a0bStron  * Beware that the optimization-preparation code in here knows about some
20920006a0bStron  * of the structure of the compiled regexp.
21020006a0bStron  */
21120006a0bStron regexp *
regcomp(exp)21220006a0bStron regcomp(exp)
21320006a0bStron char *exp;
21420006a0bStron {
21520006a0bStron 	register regexp *r;
21620006a0bStron 	register char *scan;
21720006a0bStron 	register char *longest;
21820006a0bStron 	register int len;
21920006a0bStron 	int flags;
22020006a0bStron 
22120006a0bStron 	if (exp == NULL)
22220006a0bStron 		FAIL("NULL argument");
22320006a0bStron 
22420006a0bStron 	/* First pass: determine size, legality. */
22520006a0bStron 	regparse = exp;
22620006a0bStron 	regnpar = 1;
22720006a0bStron 	regsize = 0L;
22820006a0bStron 	regcode = &regdummy;
22920006a0bStron 	regc(MAGIC);
23020006a0bStron 	if (reg(0, &flags) == NULL)
23120006a0bStron 		return(NULL);
23220006a0bStron 
23320006a0bStron 	/* Small enough for pointer-storage convention? */
23420006a0bStron 	if (regsize >= 32767L)		/* Probably could be 65535L. */
23520006a0bStron 		FAIL("regexp too big");
23620006a0bStron 
23720006a0bStron 	/* Allocate space. */
23820006a0bStron 	r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize);
23920006a0bStron 	if (r == NULL)
24020006a0bStron 		FAIL("out of space");
24120006a0bStron 
24220006a0bStron 	/* Second pass: emit code. */
24320006a0bStron 	regparse = exp;
24420006a0bStron 	regnpar = 1;
24520006a0bStron 	regcode = r->program;
24620006a0bStron 	regc(MAGIC);
24720006a0bStron 	if (reg(0, &flags) == NULL)
248*838f5788Ssimonb 	{
249*838f5788Ssimonb 		free(r);
25020006a0bStron 		return(NULL);
251*838f5788Ssimonb 	}
25220006a0bStron 
25320006a0bStron 	/* Dig out information for optimizations. */
25420006a0bStron 	r->regstart = '\0';	/* Worst-case defaults. */
25520006a0bStron 	r->reganch = 0;
25620006a0bStron 	r->regmust = NULL;
25720006a0bStron 	r->regmlen = 0;
25820006a0bStron 	scan = r->program+1;			/* First BRANCH. */
25920006a0bStron 	if (OP(regnext(scan)) == END) {		/* Only one top-level choice. */
26020006a0bStron 		scan = OPERAND(scan);
26120006a0bStron 
26220006a0bStron 		/* Starting-point info. */
26320006a0bStron 		if (OP(scan) == EXACTLY)
26420006a0bStron 			r->regstart = *OPERAND(scan);
26520006a0bStron 		else if (OP(scan) == BOL)
26620006a0bStron 			r->reganch++;
26720006a0bStron 
26820006a0bStron 		/*
26920006a0bStron 		 * If there's something expensive in the r.e., find the
27020006a0bStron 		 * longest literal string that must appear and make it the
27120006a0bStron 		 * regmust.  Resolve ties in favor of later strings, since
27220006a0bStron 		 * the regstart check works with the beginning of the r.e.
27320006a0bStron 		 * and avoiding duplication strengthens checking.  Not a
27420006a0bStron 		 * strong reason, but sufficient in the absence of others.
27520006a0bStron 		 */
27620006a0bStron 		if (flags&SPSTART) {
27720006a0bStron 			longest = NULL;
27820006a0bStron 			len = 0;
27920006a0bStron 			for (; scan != NULL; scan = regnext(scan))
28020006a0bStron 				if (OP(scan) == EXACTLY && ((int) strlen(OPERAND(scan))) >= len) {
28120006a0bStron 					longest = OPERAND(scan);
282*838f5788Ssimonb 					len = (int) strlen(OPERAND(scan));
28320006a0bStron 				}
28420006a0bStron 			r->regmust = longest;
28520006a0bStron 			r->regmlen = len;
28620006a0bStron 		}
28720006a0bStron 	}
28820006a0bStron 
28920006a0bStron 	return(r);
29020006a0bStron }
29120006a0bStron 
29220006a0bStron /*
29320006a0bStron  - reg - regular expression, i.e. main body or parenthesized thing
29420006a0bStron  *
29520006a0bStron  * Caller must absorb opening parenthesis.
29620006a0bStron  *
29720006a0bStron  * Combining parenthesis handling with the base level of regular expression
29820006a0bStron  * is a trifle forced, but the need to tie the tails of the branches to what
29920006a0bStron  * follows makes it hard to avoid.
30020006a0bStron  */
30120006a0bStron static char *
reg(paren,flagp)30220006a0bStron reg(paren, flagp)
30320006a0bStron int paren;			/* Parenthesized? */
30420006a0bStron int *flagp;
30520006a0bStron {
30620006a0bStron 	register char *ret;
30720006a0bStron 	register char *br;
30820006a0bStron 	register char *ender;
30920006a0bStron 	register int parno = 0;
31020006a0bStron 	int flags;
31120006a0bStron 
31220006a0bStron 	*flagp = HASWIDTH;	/* Tentatively. */
31320006a0bStron 
31420006a0bStron 	/* Make an OPEN node, if parenthesized. */
31520006a0bStron 	if (paren) {
31620006a0bStron 		if (regnpar >= NSUBEXP)
31720006a0bStron 			FAIL("too many ()");
31820006a0bStron 		parno = regnpar;
31920006a0bStron 		regnpar++;
32020006a0bStron 		ret = regnode(OPEN+parno);
32120006a0bStron 	} else
32220006a0bStron 		ret = NULL;
32320006a0bStron 
32420006a0bStron 	/* Pick up the branches, linking them together. */
32520006a0bStron 	br = regbranch(&flags);
32620006a0bStron 	if (br == NULL)
32720006a0bStron 		return(NULL);
32820006a0bStron 	if (ret != NULL)
32920006a0bStron 		regtail(ret, br);	/* OPEN -> first. */
33020006a0bStron 	else
33120006a0bStron 		ret = br;
33220006a0bStron 	if (!(flags&HASWIDTH))
33320006a0bStron 		*flagp &= ~HASWIDTH;
33420006a0bStron 	*flagp |= flags&SPSTART;
33520006a0bStron 	while (*regparse == '|') {
33620006a0bStron 		regparse++;
33720006a0bStron 		br = regbranch(&flags);
33820006a0bStron 		if (br == NULL)
33920006a0bStron 			return(NULL);
34020006a0bStron 		regtail(ret, br);	/* BRANCH -> BRANCH. */
34120006a0bStron 		if (!(flags&HASWIDTH))
34220006a0bStron 			*flagp &= ~HASWIDTH;
34320006a0bStron 		*flagp |= flags&SPSTART;
34420006a0bStron 	}
34520006a0bStron 
34620006a0bStron 	/* Make a closing node, and hook it on the end. */
34720006a0bStron 	ender = regnode((paren) ? CLOSE+parno : END);
34820006a0bStron 	regtail(ret, ender);
34920006a0bStron 
35020006a0bStron 	/* Hook the tails of the branches to the closing node. */
35120006a0bStron 	for (br = ret; br != NULL; br = regnext(br))
35220006a0bStron 		regoptail(br, ender);
35320006a0bStron 
35420006a0bStron 	/* Check for proper termination. */
35520006a0bStron 	if (paren && *regparse++ != ')') {
35620006a0bStron 		FAIL("unmatched ()");
35720006a0bStron 	} else if (!paren && *regparse != '\0') {
35820006a0bStron 		if (*regparse == ')') {
35920006a0bStron 			FAIL("unmatched ()");
36020006a0bStron 		} else
36120006a0bStron 			FAIL("junk on end");	/* "Can't happen". */
36220006a0bStron 		/* NOTREACHED */
36320006a0bStron 	}
36420006a0bStron 
36520006a0bStron 	return(ret);
36620006a0bStron }
36720006a0bStron 
36820006a0bStron /*
36920006a0bStron  - regbranch - one alternative of an | operator
37020006a0bStron  *
37120006a0bStron  * Implements the concatenation operator.
37220006a0bStron  */
37320006a0bStron static char *
regbranch(flagp)37420006a0bStron regbranch(flagp)
37520006a0bStron int *flagp;
37620006a0bStron {
37720006a0bStron 	register char *ret;
37820006a0bStron 	register char *chain;
37920006a0bStron 	register char *latest;
38020006a0bStron 	int flags;
38120006a0bStron 
38220006a0bStron 	*flagp = WORST;		/* Tentatively. */
38320006a0bStron 
38420006a0bStron 	ret = regnode(BRANCH);
38520006a0bStron 	chain = NULL;
38620006a0bStron 	while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
38720006a0bStron 		latest = regpiece(&flags);
38820006a0bStron 		if (latest == NULL)
38920006a0bStron 			return(NULL);
39020006a0bStron 		*flagp |= flags&HASWIDTH;
39120006a0bStron 		if (chain == NULL)	/* First piece. */
39220006a0bStron 			*flagp |= flags&SPSTART;
39320006a0bStron 		else
39420006a0bStron 			regtail(chain, latest);
39520006a0bStron 		chain = latest;
39620006a0bStron 	}
39720006a0bStron 	if (chain == NULL)	/* Loop ran zero times. */
39820006a0bStron 		(void) regnode(NOTHING);
39920006a0bStron 
40020006a0bStron 	return(ret);
40120006a0bStron }
40220006a0bStron 
40320006a0bStron /*
40420006a0bStron  - regpiece - something followed by possible [*+?]
40520006a0bStron  *
40620006a0bStron  * Note that the branching code sequences used for ? and the general cases
40720006a0bStron  * of * and + are somewhat optimized:  they use the same NOTHING node as
40820006a0bStron  * both the endmarker for their branch list and the body of the last branch.
40920006a0bStron  * It might seem that this node could be dispensed with entirely, but the
41020006a0bStron  * endmarker role is not redundant.
41120006a0bStron  */
41220006a0bStron static char *
regpiece(flagp)41320006a0bStron regpiece(flagp)
41420006a0bStron int *flagp;
41520006a0bStron {
41620006a0bStron 	register char *ret;
41720006a0bStron 	register char op;
41820006a0bStron 	register char *next;
41920006a0bStron 	int flags;
42020006a0bStron 
42120006a0bStron 	ret = regatom(&flags);
42220006a0bStron 	if (ret == NULL)
42320006a0bStron 		return(NULL);
42420006a0bStron 
42520006a0bStron 	op = *regparse;
42620006a0bStron 	if (!ISMULT(op)) {
42720006a0bStron 		*flagp = flags;
42820006a0bStron 		return(ret);
42920006a0bStron 	}
43020006a0bStron 
43120006a0bStron 	if (!(flags&HASWIDTH) && op != '?')
43220006a0bStron 		FAIL("*+ operand could be empty");
43320006a0bStron 	*flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
43420006a0bStron 
43520006a0bStron 	if (op == '*' && (flags&SIMPLE))
43620006a0bStron 		reginsert(STAR, ret);
43720006a0bStron 	else if (op == '*') {
43820006a0bStron 		/* Emit x* as (x&|), where & means "self". */
43920006a0bStron 		reginsert(BRANCH, ret);			/* Either x */
44020006a0bStron 		regoptail(ret, regnode(BACK));		/* and loop */
44120006a0bStron 		regoptail(ret, ret);			/* back */
44220006a0bStron 		regtail(ret, regnode(BRANCH));		/* or */
44320006a0bStron 		regtail(ret, regnode(NOTHING));		/* null. */
44420006a0bStron 	} else if (op == '+' && (flags&SIMPLE))
44520006a0bStron 		reginsert(PLUS, ret);
44620006a0bStron 	else if (op == '+') {
44720006a0bStron 		/* Emit x+ as x(&|), where & means "self". */
44820006a0bStron 		next = regnode(BRANCH);			/* Either */
44920006a0bStron 		regtail(ret, next);
45020006a0bStron 		regtail(regnode(BACK), ret);		/* loop back */
45120006a0bStron 		regtail(next, regnode(BRANCH));		/* or */
45220006a0bStron 		regtail(ret, regnode(NOTHING));		/* null. */
45320006a0bStron 	} else if (op == '?') {
45420006a0bStron 		/* Emit x? as (x|) */
45520006a0bStron 		reginsert(BRANCH, ret);			/* Either x */
45620006a0bStron 		regtail(ret, regnode(BRANCH));		/* or */
45720006a0bStron 		next = regnode(NOTHING);		/* null. */
45820006a0bStron 		regtail(ret, next);
45920006a0bStron 		regoptail(ret, next);
46020006a0bStron 	}
46120006a0bStron 	regparse++;
46220006a0bStron 	if (ISMULT(*regparse))
46320006a0bStron 		FAIL("nested *?+");
46420006a0bStron 
46520006a0bStron 	return(ret);
46620006a0bStron }
46720006a0bStron 
46820006a0bStron /*
46920006a0bStron  - regatom - the lowest level
47020006a0bStron  *
47120006a0bStron  * Optimization:  gobbles an entire sequence of ordinary characters so that
47220006a0bStron  * it can turn them into a single node, which is smaller to store and
47320006a0bStron  * faster to run.  Backslashed characters are exceptions, each becoming a
47420006a0bStron  * separate node; the code is simpler that way and it's not worth fixing.
47520006a0bStron  */
47620006a0bStron static char *
regatom(flagp)47720006a0bStron regatom(flagp)
47820006a0bStron int *flagp;
47920006a0bStron {
48020006a0bStron 	register char *ret;
48120006a0bStron 	int flags;
48220006a0bStron 
48320006a0bStron 	*flagp = WORST;		/* Tentatively. */
48420006a0bStron 
48520006a0bStron 	switch (*regparse++) {
48620006a0bStron 	case '^':
48720006a0bStron 		ret = regnode(BOL);
48820006a0bStron 		break;
48920006a0bStron 	case '$':
49020006a0bStron 		ret = regnode(EOL);
49120006a0bStron 		break;
49220006a0bStron 	case '.':
49320006a0bStron 		ret = regnode(ANY);
49420006a0bStron 		*flagp |= HASWIDTH|SIMPLE;
49520006a0bStron 		break;
49620006a0bStron 	case '[': {
49720006a0bStron 			register int clss;
49820006a0bStron 			register int classend;
49920006a0bStron 
50020006a0bStron 			if (*regparse == '^') {	/* Complement of range. */
50120006a0bStron 				ret = regnode(ANYBUT);
50220006a0bStron 				regparse++;
50320006a0bStron 			} else
50420006a0bStron 				ret = regnode(ANYOF);
50520006a0bStron 			if (*regparse == ']' || *regparse == '-')
50620006a0bStron 				regc(*regparse++);
50720006a0bStron 			while (*regparse != '\0' && *regparse != ']') {
50820006a0bStron 				if (*regparse == '-') {
50920006a0bStron 					regparse++;
51020006a0bStron 					if (*regparse == ']' || *regparse == '\0')
51120006a0bStron 						regc('-');
51220006a0bStron 					else {
51320006a0bStron 						clss = UCHARAT(regparse-2)+1;
51420006a0bStron 						classend = UCHARAT(regparse);
51520006a0bStron 						if (clss > classend+1)
51620006a0bStron 							FAIL("invalid [] range");
51720006a0bStron 						for (; clss <= classend; clss++)
51820006a0bStron 							regc(clss);
51920006a0bStron 						regparse++;
52020006a0bStron 					}
52120006a0bStron 				} else
52220006a0bStron 					regc(*regparse++);
52320006a0bStron 			}
52420006a0bStron 			regc('\0');
52520006a0bStron 			if (*regparse != ']')
52620006a0bStron 				FAIL("unmatched []");
52720006a0bStron 			regparse++;
52820006a0bStron 			*flagp |= HASWIDTH|SIMPLE;
52920006a0bStron 		}
53020006a0bStron 		break;
53120006a0bStron 	case '(':
53220006a0bStron 		ret = reg(1, &flags);
53320006a0bStron 		if (ret == NULL)
53420006a0bStron 			return(NULL);
53520006a0bStron 		*flagp |= flags&(HASWIDTH|SPSTART);
53620006a0bStron 		break;
53720006a0bStron 	case '\0':
53820006a0bStron 	case '|':
53920006a0bStron 	case ')':
54020006a0bStron 		FAIL("internal urp");	/* Supposed to be caught earlier. */
54120006a0bStron 		/* NOTREACHED */
54220006a0bStron 		break;
54320006a0bStron 	case '?':
54420006a0bStron 	case '+':
54520006a0bStron 	case '*':
54620006a0bStron 		FAIL("?+* follows nothing");
54720006a0bStron 		/* NOTREACHED */
54820006a0bStron 		break;
54920006a0bStron 	case '\\':
55020006a0bStron 		if (*regparse == '\0')
55120006a0bStron 			FAIL("trailing \\");
55220006a0bStron 		ret = regnode(EXACTLY);
55320006a0bStron 		regc(*regparse++);
55420006a0bStron 		regc('\0');
55520006a0bStron 		*flagp |= HASWIDTH|SIMPLE;
55620006a0bStron 		break;
55720006a0bStron 	default: {
55820006a0bStron 			register int len;
55920006a0bStron 			register char ender;
56020006a0bStron 
56120006a0bStron 			regparse--;
562*838f5788Ssimonb 			len = (int) strcspn(regparse, META);
56320006a0bStron 			if (len <= 0)
56420006a0bStron 				FAIL("internal disaster");
56520006a0bStron 			ender = *(regparse+len);
56620006a0bStron 			if (len > 1 && ISMULT(ender))
56720006a0bStron 				len--;		/* Back off clear of ?+* operand. */
56820006a0bStron 			*flagp |= HASWIDTH;
56920006a0bStron 			if (len == 1)
57020006a0bStron 				*flagp |= SIMPLE;
57120006a0bStron 			ret = regnode(EXACTLY);
57220006a0bStron 			while (len > 0) {
57320006a0bStron 				regc(*regparse++);
57420006a0bStron 				len--;
57520006a0bStron 			}
57620006a0bStron 			regc('\0');
57720006a0bStron 		}
57820006a0bStron 		break;
57920006a0bStron 	}
58020006a0bStron 
58120006a0bStron 	return(ret);
58220006a0bStron }
58320006a0bStron 
58420006a0bStron /*
58520006a0bStron  - regnode - emit a node
58620006a0bStron  */
58720006a0bStron static char *			/* Location. */
regnode(op)58820006a0bStron regnode(op)
58920006a0bStron char op;
59020006a0bStron {
59120006a0bStron 	register char *ret;
59220006a0bStron 	register char *ptr;
59320006a0bStron 
59420006a0bStron 	ret = regcode;
59520006a0bStron 	if (ret == &regdummy) {
59620006a0bStron 		regsize += 3;
59720006a0bStron 		return(ret);
59820006a0bStron 	}
59920006a0bStron 
60020006a0bStron 	ptr = ret;
60120006a0bStron 	*ptr++ = op;
60220006a0bStron 	*ptr++ = '\0';		/* Null "next" pointer. */
60320006a0bStron 	*ptr++ = '\0';
60420006a0bStron 	regcode = ptr;
60520006a0bStron 
60620006a0bStron 	return(ret);
60720006a0bStron }
60820006a0bStron 
60920006a0bStron /*
61020006a0bStron  - regc - emit (if appropriate) a byte of code
61120006a0bStron  */
61220006a0bStron static void
regc(b)61320006a0bStron regc(b)
61420006a0bStron char b;
61520006a0bStron {
61620006a0bStron 	if (regcode != &regdummy)
61720006a0bStron 		*regcode++ = b;
61820006a0bStron 	else
61920006a0bStron 		regsize++;
62020006a0bStron }
62120006a0bStron 
62220006a0bStron /*
62320006a0bStron  - reginsert - insert an operator in front of already-emitted operand
62420006a0bStron  *
62520006a0bStron  * Means relocating the operand.
62620006a0bStron  */
62720006a0bStron static void
reginsert(op,opnd)62820006a0bStron reginsert(op, opnd)
62920006a0bStron char op;
63020006a0bStron char *opnd;
63120006a0bStron {
63220006a0bStron 	register char *src;
63320006a0bStron 	register char *dst;
63420006a0bStron 	register char *place;
63520006a0bStron 
63620006a0bStron 	if (regcode == &regdummy) {
63720006a0bStron 		regsize += 3;
63820006a0bStron 		return;
63920006a0bStron 	}
64020006a0bStron 
64120006a0bStron 	src = regcode;
64220006a0bStron 	regcode += 3;
64320006a0bStron 	dst = regcode;
64420006a0bStron 	while (src > opnd)
64520006a0bStron 		*--dst = *--src;
64620006a0bStron 
64720006a0bStron 	place = opnd;		/* Op node, where operand used to be. */
64820006a0bStron 	*place++ = op;
64920006a0bStron 	*place++ = '\0';
65020006a0bStron 	*place++ = '\0';
65120006a0bStron }
65220006a0bStron 
65320006a0bStron /*
65420006a0bStron  - regtail - set the next-pointer at the end of a node chain
65520006a0bStron  */
65620006a0bStron static void
regtail(p,val)65720006a0bStron regtail(p, val)
65820006a0bStron char *p;
65920006a0bStron char *val;
66020006a0bStron {
66120006a0bStron 	register char *scan;
66220006a0bStron 	register char *temp;
66320006a0bStron 	register int offset;
66420006a0bStron 
66520006a0bStron 	if (p == &regdummy)
66620006a0bStron 		return;
66720006a0bStron 
66820006a0bStron 	/* Find last node. */
66920006a0bStron 	scan = p;
67020006a0bStron 	for (;;) {
67120006a0bStron 		temp = regnext(scan);
67220006a0bStron 		if (temp == NULL)
67320006a0bStron 			break;
67420006a0bStron 		scan = temp;
67520006a0bStron 	}
67620006a0bStron 
67720006a0bStron 	if (OP(scan) == BACK)
678*838f5788Ssimonb 		offset = (int) (scan - val);
67920006a0bStron 	else
680*838f5788Ssimonb 		offset = (int) (val - scan);
68120006a0bStron 	*(scan+1) = (offset>>8)&0377;
68220006a0bStron 	*(scan+2) = offset&0377;
68320006a0bStron }
68420006a0bStron 
68520006a0bStron /*
68620006a0bStron  - regoptail - regtail on operand of first argument; nop if operandless
68720006a0bStron  */
68820006a0bStron static void
regoptail(p,val)68920006a0bStron regoptail(p, val)
69020006a0bStron char *p;
69120006a0bStron char *val;
69220006a0bStron {
69320006a0bStron 	/* "Operandless" and "op != BRANCH" are synonymous in practice. */
69420006a0bStron 	if (p == NULL || p == &regdummy || OP(p) != BRANCH)
69520006a0bStron 		return;
69620006a0bStron 	regtail(OPERAND(p), val);
69720006a0bStron }
69820006a0bStron 
69920006a0bStron /*
70020006a0bStron  * regexec and friends
70120006a0bStron  */
70220006a0bStron 
70320006a0bStron /*
70420006a0bStron  * Global work variables for regexec().
70520006a0bStron  */
70620006a0bStron static char *reginput;		/* String-input pointer. */
70720006a0bStron static char *regbol;		/* Beginning of input, for ^ check. */
70820006a0bStron static char **regstartp;	/* Pointer to startp array. */
70920006a0bStron static char **regendp;		/* Ditto for endp. */
71020006a0bStron 
71120006a0bStron /*
71220006a0bStron  * Forwards.
71320006a0bStron  */
71420006a0bStron STATIC int regtry();
71520006a0bStron STATIC int regmatch();
71620006a0bStron STATIC int regrepeat();
71720006a0bStron 
71820006a0bStron #ifdef DEBUG
71920006a0bStron int regnarrate = 0;
72020006a0bStron void regdump();
72120006a0bStron STATIC char *regprop();
72220006a0bStron #endif
72320006a0bStron 
72420006a0bStron /*
72520006a0bStron  - regexec - match a regexp against a string
72620006a0bStron  */
72720006a0bStron int
regexec2(prog,string,notbol)72820006a0bStron regexec2(prog, string, notbol)
72920006a0bStron register regexp *prog;
73020006a0bStron register char *string;
73120006a0bStron int notbol;
73220006a0bStron {
73320006a0bStron 	register char *s;
73420006a0bStron 
73520006a0bStron 	/* Be paranoid... */
73620006a0bStron 	if (prog == NULL || string == NULL) {
73720006a0bStron 		regerror("NULL parameter");
73820006a0bStron 		return(0);
73920006a0bStron 	}
74020006a0bStron 
74120006a0bStron 	/* Check validity of program. */
74220006a0bStron 	if (UCHARAT(prog->program) != MAGIC) {
74320006a0bStron 		regerror("corrupted program");
74420006a0bStron 		return(0);
74520006a0bStron 	}
74620006a0bStron 
74720006a0bStron 	/* If there is a "must appear" string, look for it. */
74820006a0bStron 	if (prog->regmust != NULL) {
74920006a0bStron 		s = string;
75020006a0bStron 		while ((s = strchr(s, prog->regmust[0])) != NULL) {
75120006a0bStron 			if (strncmp(s, prog->regmust, prog->regmlen) == 0)
75220006a0bStron 				break;	/* Found it. */
75320006a0bStron 			s++;
75420006a0bStron 		}
75520006a0bStron 		if (s == NULL)	/* Not present. */
75620006a0bStron 			return(0);
75720006a0bStron 	}
75820006a0bStron 
75920006a0bStron 	/* Mark beginning of line for ^ . */
76020006a0bStron 	if (notbol)
76120006a0bStron 		regbol = NULL;
76220006a0bStron 	else
76320006a0bStron 		regbol = string;
76420006a0bStron 
76520006a0bStron 	/* Simplest case:  anchored match need be tried only once. */
76620006a0bStron 	if (prog->reganch)
76720006a0bStron 		return(regtry(prog, string));
76820006a0bStron 
76920006a0bStron 	/* Messy cases:  unanchored match. */
77020006a0bStron 	s = string;
77120006a0bStron 	if (prog->regstart != '\0')
77220006a0bStron 		/* We know what char it must start with. */
77320006a0bStron 		while ((s = strchr(s, prog->regstart)) != NULL) {
77420006a0bStron 			if (regtry(prog, s))
77520006a0bStron 				return(1);
77620006a0bStron 			s++;
77720006a0bStron 		}
77820006a0bStron 	else
77920006a0bStron 		/* We don't -- general case. */
78020006a0bStron 		do {
78120006a0bStron 			if (regtry(prog, s))
78220006a0bStron 				return(1);
78320006a0bStron 		} while (*s++ != '\0');
78420006a0bStron 
78520006a0bStron 	/* Failure. */
78620006a0bStron 	return(0);
78720006a0bStron }
78820006a0bStron 
78920006a0bStron int
regexec(prog,string)79020006a0bStron regexec(prog, string)
79120006a0bStron register regexp *prog;
79220006a0bStron register char *string;
79320006a0bStron {
79420006a0bStron 	return regexec2(prog, string, 0);
79520006a0bStron }
79620006a0bStron 
79720006a0bStron /*
79820006a0bStron  - regtry - try match at specific point
79920006a0bStron  */
80020006a0bStron static int			/* 0 failure, 1 success */
regtry(prog,string)80120006a0bStron regtry(prog, string)
80220006a0bStron regexp *prog;
80320006a0bStron char *string;
80420006a0bStron {
80520006a0bStron 	register int i;
80620006a0bStron 	register char **sp;
80720006a0bStron 	register char **ep;
80820006a0bStron 
80920006a0bStron 	reginput = string;
81020006a0bStron 	regstartp = prog->startp;
81120006a0bStron 	regendp = prog->endp;
81220006a0bStron 
81320006a0bStron 	sp = prog->startp;
81420006a0bStron 	ep = prog->endp;
81520006a0bStron 	for (i = NSUBEXP; i > 0; i--) {
81620006a0bStron 		*sp++ = NULL;
81720006a0bStron 		*ep++ = NULL;
81820006a0bStron 	}
81920006a0bStron 	if (regmatch(prog->program + 1)) {
82020006a0bStron 		prog->startp[0] = string;
82120006a0bStron 		prog->endp[0] = reginput;
82220006a0bStron 		return(1);
82320006a0bStron 	} else
82420006a0bStron 		return(0);
82520006a0bStron }
82620006a0bStron 
82720006a0bStron /*
82820006a0bStron  - regmatch - main matching routine
82920006a0bStron  *
83020006a0bStron  * Conceptually the strategy is simple:  check to see whether the current
83120006a0bStron  * node matches, call self recursively to see whether the rest matches,
83220006a0bStron  * and then act accordingly.  In practice we make some effort to avoid
83320006a0bStron  * recursion, in particular by going through "ordinary" nodes (that don't
83420006a0bStron  * need to know whether the rest of the match failed) by a loop instead of
83520006a0bStron  * by recursion.
83620006a0bStron  */
83720006a0bStron static int			/* 0 failure, 1 success */
regmatch(prog)83820006a0bStron regmatch(prog)
83920006a0bStron char *prog;
84020006a0bStron {
84120006a0bStron 	register char *scan;	/* Current node. */
84220006a0bStron 	char *next;		/* Next node. */
84320006a0bStron 
84420006a0bStron 	scan = prog;
84520006a0bStron #ifdef DEBUG
84620006a0bStron 	if (scan != NULL && regnarrate)
84720006a0bStron 		fprintf(stderr, "%s(\n", regprop(scan));
84820006a0bStron #endif
84920006a0bStron 	while (scan != NULL) {
85020006a0bStron #ifdef DEBUG
85120006a0bStron 		if (regnarrate)
85220006a0bStron 			fprintf(stderr, "%s...\n", regprop(scan));
85320006a0bStron #endif
85420006a0bStron 		next = regnext(scan);
85520006a0bStron 
85620006a0bStron 		switch (OP(scan)) {
85720006a0bStron 		case BOL:
85820006a0bStron 			if (reginput != regbol)
85920006a0bStron 				return(0);
86020006a0bStron 			break;
86120006a0bStron 		case EOL:
86220006a0bStron 			if (*reginput != '\0')
86320006a0bStron 				return(0);
86420006a0bStron 			break;
86520006a0bStron 		case ANY:
86620006a0bStron 			if (*reginput == '\0')
86720006a0bStron 				return(0);
86820006a0bStron 			reginput++;
86920006a0bStron 			break;
87020006a0bStron 		case EXACTLY: {
87120006a0bStron 				register int len;
87220006a0bStron 				register char *opnd;
87320006a0bStron 
87420006a0bStron 				opnd = OPERAND(scan);
87520006a0bStron 				/* Inline the first character, for speed. */
87620006a0bStron 				if (*opnd != *reginput)
87720006a0bStron 					return(0);
878*838f5788Ssimonb 				len = (int) strlen(opnd);
87920006a0bStron 				if (len > 1 && strncmp(opnd, reginput, len) != 0)
88020006a0bStron 					return(0);
88120006a0bStron 				reginput += len;
88220006a0bStron 			}
88320006a0bStron 			break;
88420006a0bStron 		case ANYOF:
88520006a0bStron  			if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
88620006a0bStron 				return(0);
88720006a0bStron 			reginput++;
88820006a0bStron 			break;
88920006a0bStron 		case ANYBUT:
89020006a0bStron  			if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
89120006a0bStron 				return(0);
89220006a0bStron 			reginput++;
89320006a0bStron 			break;
89420006a0bStron 		case NOTHING:
89520006a0bStron 			break;
89620006a0bStron 		case BACK:
89720006a0bStron 			break;
89820006a0bStron 		case OPEN+1:
89920006a0bStron 		case OPEN+2:
90020006a0bStron 		case OPEN+3:
90120006a0bStron 		case OPEN+4:
90220006a0bStron 		case OPEN+5:
90320006a0bStron 		case OPEN+6:
90420006a0bStron 		case OPEN+7:
90520006a0bStron 		case OPEN+8:
90620006a0bStron 		case OPEN+9: {
90720006a0bStron 				register int no;
90820006a0bStron 				register char *save;
90920006a0bStron 
91020006a0bStron 				no = OP(scan) - OPEN;
91120006a0bStron 				save = reginput;
91220006a0bStron 
91320006a0bStron 				if (regmatch(next)) {
91420006a0bStron 					/*
91520006a0bStron 					 * Don't set startp if some later
91620006a0bStron 					 * invocation of the same parentheses
91720006a0bStron 					 * already has.
91820006a0bStron 					 */
91920006a0bStron 					if (regstartp[no] == NULL)
92020006a0bStron 						regstartp[no] = save;
92120006a0bStron 					return(1);
92220006a0bStron 				} else
92320006a0bStron 					return(0);
92420006a0bStron 			}
92520006a0bStron 			/* NOTREACHED */
92620006a0bStron 			break;
92720006a0bStron 		case CLOSE+1:
92820006a0bStron 		case CLOSE+2:
92920006a0bStron 		case CLOSE+3:
93020006a0bStron 		case CLOSE+4:
93120006a0bStron 		case CLOSE+5:
93220006a0bStron 		case CLOSE+6:
93320006a0bStron 		case CLOSE+7:
93420006a0bStron 		case CLOSE+8:
93520006a0bStron 		case CLOSE+9: {
93620006a0bStron 				register int no;
93720006a0bStron 				register char *save;
93820006a0bStron 
93920006a0bStron 				no = OP(scan) - CLOSE;
94020006a0bStron 				save = reginput;
94120006a0bStron 
94220006a0bStron 				if (regmatch(next)) {
94320006a0bStron 					/*
94420006a0bStron 					 * Don't set endp if some later
94520006a0bStron 					 * invocation of the same parentheses
94620006a0bStron 					 * already has.
94720006a0bStron 					 */
94820006a0bStron 					if (regendp[no] == NULL)
94920006a0bStron 						regendp[no] = save;
95020006a0bStron 					return(1);
95120006a0bStron 				} else
95220006a0bStron 					return(0);
95320006a0bStron 			}
95420006a0bStron 			/* NOTREACHED */
95520006a0bStron 			break;
95620006a0bStron 		case BRANCH: {
95720006a0bStron 				register char *save;
95820006a0bStron 
95920006a0bStron 				if (OP(next) != BRANCH)		/* No choice. */
96020006a0bStron 					next = OPERAND(scan);	/* Avoid recursion. */
96120006a0bStron 				else {
96220006a0bStron 					do {
96320006a0bStron 						save = reginput;
96420006a0bStron 						if (regmatch(OPERAND(scan)))
96520006a0bStron 							return(1);
96620006a0bStron 						reginput = save;
96720006a0bStron 						scan = regnext(scan);
96820006a0bStron 					} while (scan != NULL && OP(scan) == BRANCH);
96920006a0bStron 					return(0);
97020006a0bStron 					/* NOTREACHED */
97120006a0bStron 				}
97220006a0bStron 			}
97320006a0bStron 			/* NOTREACHED */
97420006a0bStron 			break;
97520006a0bStron 		case STAR:
97620006a0bStron 		case PLUS: {
97720006a0bStron 				register char nextch;
97820006a0bStron 				register int no;
97920006a0bStron 				register char *save;
98020006a0bStron 				register int min;
98120006a0bStron 
98220006a0bStron 				/*
98320006a0bStron 				 * Lookahead to avoid useless match attempts
98420006a0bStron 				 * when we know what character comes next.
98520006a0bStron 				 */
98620006a0bStron 				nextch = '\0';
98720006a0bStron 				if (OP(next) == EXACTLY)
98820006a0bStron 					nextch = *OPERAND(next);
98920006a0bStron 				min = (OP(scan) == STAR) ? 0 : 1;
99020006a0bStron 				save = reginput;
99120006a0bStron 				no = regrepeat(OPERAND(scan));
99220006a0bStron 				while (no >= min) {
99320006a0bStron 					/* If it could work, try it. */
99420006a0bStron 					if (nextch == '\0' || *reginput == nextch)
99520006a0bStron 						if (regmatch(next))
99620006a0bStron 							return(1);
99720006a0bStron 					/* Couldn't or didn't -- back up. */
99820006a0bStron 					no--;
99920006a0bStron 					reginput = save + no;
100020006a0bStron 				}
100120006a0bStron 				return(0);
100220006a0bStron 			}
100320006a0bStron 			/* NOTREACHED */
100420006a0bStron 			break;
100520006a0bStron 		case END:
100620006a0bStron 			return(1);	/* Success! */
100720006a0bStron 			/* NOTREACHED */
100820006a0bStron 			break;
100920006a0bStron 		default:
101020006a0bStron 			regerror("memory corruption");
101120006a0bStron 			return(0);
101220006a0bStron 			/* NOTREACHED */
101320006a0bStron 			break;
101420006a0bStron 		}
101520006a0bStron 
101620006a0bStron 		scan = next;
101720006a0bStron 	}
101820006a0bStron 
101920006a0bStron 	/*
102020006a0bStron 	 * We get here only if there's trouble -- normally "case END" is
102120006a0bStron 	 * the terminating point.
102220006a0bStron 	 */
102320006a0bStron 	regerror("corrupted pointers");
102420006a0bStron 	return(0);
102520006a0bStron }
102620006a0bStron 
102720006a0bStron /*
102820006a0bStron  - regrepeat - repeatedly match something simple, report how many
102920006a0bStron  */
103020006a0bStron static int
regrepeat(p)103120006a0bStron regrepeat(p)
103220006a0bStron char *p;
103320006a0bStron {
103420006a0bStron 	register int count = 0;
103520006a0bStron 	register char *scan;
103620006a0bStron 	register char *opnd;
103720006a0bStron 
103820006a0bStron 	scan = reginput;
103920006a0bStron 	opnd = OPERAND(p);
104020006a0bStron 	switch (OP(p)) {
104120006a0bStron 	case ANY:
1042*838f5788Ssimonb 		count = (int) strlen(scan);
104320006a0bStron 		scan += count;
104420006a0bStron 		break;
104520006a0bStron 	case EXACTLY:
104620006a0bStron 		while (*opnd == *scan) {
104720006a0bStron 			count++;
104820006a0bStron 			scan++;
104920006a0bStron 		}
105020006a0bStron 		break;
105120006a0bStron 	case ANYOF:
105220006a0bStron 		while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
105320006a0bStron 			count++;
105420006a0bStron 			scan++;
105520006a0bStron 		}
105620006a0bStron 		break;
105720006a0bStron 	case ANYBUT:
105820006a0bStron 		while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
105920006a0bStron 			count++;
106020006a0bStron 			scan++;
106120006a0bStron 		}
106220006a0bStron 		break;
106320006a0bStron 	default:		/* Oh dear.  Called inappropriately. */
106420006a0bStron 		regerror("internal foulup");
106520006a0bStron 		count = 0;	/* Best compromise. */
106620006a0bStron 		break;
106720006a0bStron 	}
106820006a0bStron 	reginput = scan;
106920006a0bStron 
107020006a0bStron 	return(count);
107120006a0bStron }
107220006a0bStron 
107320006a0bStron /*
107420006a0bStron  - regnext - dig the "next" pointer out of a node
107520006a0bStron  */
107620006a0bStron static char *
regnext(p)107720006a0bStron regnext(p)
107820006a0bStron register char *p;
107920006a0bStron {
108020006a0bStron 	register int offset;
108120006a0bStron 
108220006a0bStron 	if (p == &regdummy)
108320006a0bStron 		return(NULL);
108420006a0bStron 
108520006a0bStron 	offset = NEXT(p);
108620006a0bStron 	if (offset == 0)
108720006a0bStron 		return(NULL);
108820006a0bStron 
108920006a0bStron 	if (OP(p) == BACK)
109020006a0bStron 		return(p-offset);
109120006a0bStron 	else
109220006a0bStron 		return(p+offset);
109320006a0bStron }
109420006a0bStron 
109520006a0bStron #ifdef DEBUG
109620006a0bStron 
109720006a0bStron STATIC char *regprop();
109820006a0bStron 
109920006a0bStron /*
110020006a0bStron  - regdump - dump a regexp onto stdout in vaguely comprehensible form
110120006a0bStron  */
110220006a0bStron void
regdump(r)110320006a0bStron regdump(r)
110420006a0bStron regexp *r;
110520006a0bStron {
110620006a0bStron 	register char *s;
110720006a0bStron 	register char op = EXACTLY;	/* Arbitrary non-END op. */
110820006a0bStron 	register char *next;
110920006a0bStron 
111020006a0bStron 
111120006a0bStron 	s = r->program + 1;
111220006a0bStron 	while (op != END) {	/* While that wasn't END last time... */
111320006a0bStron 		op = OP(s);
111420006a0bStron 		printf("%2d%s", s-r->program, regprop(s));	/* Where, what. */
111520006a0bStron 		next = regnext(s);
111620006a0bStron 		if (next == NULL)		/* Next ptr. */
111720006a0bStron 			printf("(0)");
111820006a0bStron 		else
111920006a0bStron 			printf("(%d)", (s-r->program)+(next-s));
112020006a0bStron 		s += 3;
112120006a0bStron 		if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
112220006a0bStron 			/* Literal string, where present. */
112320006a0bStron 			while (*s != '\0') {
112420006a0bStron 				putchar(*s);
112520006a0bStron 				s++;
112620006a0bStron 			}
112720006a0bStron 			s++;
112820006a0bStron 		}
112920006a0bStron 		putchar('\n');
113020006a0bStron 	}
113120006a0bStron 
113220006a0bStron 	/* Header fields of interest. */
113320006a0bStron 	if (r->regstart != '\0')
113420006a0bStron 		printf("start `%c' ", r->regstart);
113520006a0bStron 	if (r->reganch)
113620006a0bStron 		printf("anchored ");
113720006a0bStron 	if (r->regmust != NULL)
113820006a0bStron 		printf("must have \"%s\"", r->regmust);
113920006a0bStron 	printf("\n");
114020006a0bStron }
114120006a0bStron 
114220006a0bStron /*
114320006a0bStron  - regprop - printable representation of opcode
114420006a0bStron  */
114520006a0bStron static char *
regprop(op)114620006a0bStron regprop(op)
114720006a0bStron char *op;
114820006a0bStron {
114920006a0bStron 	register char *p;
115020006a0bStron 	static char buf[50];
115120006a0bStron 
115220006a0bStron 	(void) strcpy(buf, ":");
115320006a0bStron 
115420006a0bStron 	switch (OP(op)) {
115520006a0bStron 	case BOL:
115620006a0bStron 		p = "BOL";
115720006a0bStron 		break;
115820006a0bStron 	case EOL:
115920006a0bStron 		p = "EOL";
116020006a0bStron 		break;
116120006a0bStron 	case ANY:
116220006a0bStron 		p = "ANY";
116320006a0bStron 		break;
116420006a0bStron 	case ANYOF:
116520006a0bStron 		p = "ANYOF";
116620006a0bStron 		break;
116720006a0bStron 	case ANYBUT:
116820006a0bStron 		p = "ANYBUT";
116920006a0bStron 		break;
117020006a0bStron 	case BRANCH:
117120006a0bStron 		p = "BRANCH";
117220006a0bStron 		break;
117320006a0bStron 	case EXACTLY:
117420006a0bStron 		p = "EXACTLY";
117520006a0bStron 		break;
117620006a0bStron 	case NOTHING:
117720006a0bStron 		p = "NOTHING";
117820006a0bStron 		break;
117920006a0bStron 	case BACK:
118020006a0bStron 		p = "BACK";
118120006a0bStron 		break;
118220006a0bStron 	case END:
118320006a0bStron 		p = "END";
118420006a0bStron 		break;
118520006a0bStron 	case OPEN+1:
118620006a0bStron 	case OPEN+2:
118720006a0bStron 	case OPEN+3:
118820006a0bStron 	case OPEN+4:
118920006a0bStron 	case OPEN+5:
119020006a0bStron 	case OPEN+6:
119120006a0bStron 	case OPEN+7:
119220006a0bStron 	case OPEN+8:
119320006a0bStron 	case OPEN+9:
119420006a0bStron 		sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
119520006a0bStron 		p = NULL;
119620006a0bStron 		break;
119720006a0bStron 	case CLOSE+1:
119820006a0bStron 	case CLOSE+2:
119920006a0bStron 	case CLOSE+3:
120020006a0bStron 	case CLOSE+4:
120120006a0bStron 	case CLOSE+5:
120220006a0bStron 	case CLOSE+6:
120320006a0bStron 	case CLOSE+7:
120420006a0bStron 	case CLOSE+8:
120520006a0bStron 	case CLOSE+9:
120620006a0bStron 		sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
120720006a0bStron 		p = NULL;
120820006a0bStron 		break;
120920006a0bStron 	case STAR:
121020006a0bStron 		p = "STAR";
121120006a0bStron 		break;
121220006a0bStron 	case PLUS:
121320006a0bStron 		p = "PLUS";
121420006a0bStron 		break;
121520006a0bStron 	default:
121620006a0bStron 		regerror("corrupted opcode");
121720006a0bStron 		break;
121820006a0bStron 	}
121920006a0bStron 	if (p != NULL)
122020006a0bStron 		(void) strcat(buf, p);
122120006a0bStron 	return(buf);
122220006a0bStron }
122320006a0bStron #endif
122420006a0bStron 
122520006a0bStron /*
122620006a0bStron  * The following is provided for those people who do not have strcspn() in
122720006a0bStron  * their C libraries.  They should get off their butts and do something
122820006a0bStron  * about it; at least one public-domain implementation of those (highly
122920006a0bStron  * useful) string routines has been published on Usenet.
123020006a0bStron  */
123120006a0bStron #ifdef STRCSPN
123220006a0bStron /*
123320006a0bStron  * strcspn - find length of initial segment of s1 consisting entirely
123420006a0bStron  * of characters not from s2
123520006a0bStron  */
123620006a0bStron 
123720006a0bStron static int
strcspn(s1,s2)123820006a0bStron strcspn(s1, s2)
123920006a0bStron char *s1;
124020006a0bStron char *s2;
124120006a0bStron {
124220006a0bStron 	register char *scan1;
124320006a0bStron 	register char *scan2;
124420006a0bStron 	register int count;
124520006a0bStron 
124620006a0bStron 	count = 0;
124720006a0bStron 	for (scan1 = s1; *scan1 != '\0'; scan1++) {
124820006a0bStron 		for (scan2 = s2; *scan2 != '\0';)	/* ++ moved down. */
124920006a0bStron 			if (*scan1 == *scan2++)
125020006a0bStron 				return(count);
125120006a0bStron 		count++;
125220006a0bStron 	}
125320006a0bStron 	return(count);
125420006a0bStron }
125520006a0bStron #endif
1256