xref: /csrg-svn/lib/libcompat/regexp/regexp.c (revision 41349)
1*41349Sbostic /*
2*41349Sbostic  * regcomp and regexec -- regsub and regerror are elsewhere
3*41349Sbostic  *
4*41349Sbostic  *	Copyright (c) 1986 by University of Toronto.
5*41349Sbostic  *	Written by Henry Spencer.  Not derived from licensed software.
6*41349Sbostic  *
7*41349Sbostic  *	Permission is granted to anyone to use this software for any
8*41349Sbostic  *	purpose on any computer system, and to redistribute it freely,
9*41349Sbostic  *	subject to the following restrictions:
10*41349Sbostic  *
11*41349Sbostic  *	1. The author is not responsible for the consequences of use of
12*41349Sbostic  *		this software, no matter how awful, even if they arise
13*41349Sbostic  *		from defects in it.
14*41349Sbostic  *
15*41349Sbostic  *	2. The origin of this software must not be misrepresented, either
16*41349Sbostic  *		by explicit claim or by omission.
17*41349Sbostic  *
18*41349Sbostic  *	3. Altered versions must be plainly marked as such, and must not
19*41349Sbostic  *		be misrepresented as being the original software.
20*41349Sbostic  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
21*41349Sbostic  *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
22*41349Sbostic  *** to assist in implementing egrep.
23*41349Sbostic  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
24*41349Sbostic  *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
25*41349Sbostic  *** as in BSD grep and ex.
26*41349Sbostic  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
27*41349Sbostic  *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
28*41349Sbostic  *** THIS IS AN ALTERED VERSION.  It was altered by James A. Woods,
29*41349Sbostic  *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
30*41349Sbostic  *
31*41349Sbostic  * Beware that some of this code is subtly aware of the way operator
32*41349Sbostic  * precedence is structured in regular expressions.  Serious changes in
33*41349Sbostic  * regular-expression syntax might require a total rethink.
34*41349Sbostic  */
35*41349Sbostic #include <stdio.h>
36*41349Sbostic #include <ctype.h>
37*41349Sbostic #include <regexp.h>
38*41349Sbostic #include "regmagic.h"
39*41349Sbostic 
40*41349Sbostic /*
41*41349Sbostic  * The "internal use only" fields in regexp.h are present to pass info from
42*41349Sbostic  * compile to execute that permits the execute phase to run lots faster on
43*41349Sbostic  * simple cases.  They are:
44*41349Sbostic  *
45*41349Sbostic  * regstart	char that must begin a match; '\0' if none obvious
46*41349Sbostic  * reganch	is the match anchored (at beginning-of-line only)?
47*41349Sbostic  * regmust	string (pointer into program) that match must include, or NULL
48*41349Sbostic  * regmlen	length of regmust string
49*41349Sbostic  *
50*41349Sbostic  * Regstart and reganch permit very fast decisions on suitable starting points
51*41349Sbostic  * for a match, cutting down the work a lot.  Regmust permits fast rejection
52*41349Sbostic  * of lines that cannot possibly match.  The regmust tests are costly enough
53*41349Sbostic  * that regcomp() supplies a regmust only if the r.e. contains something
54*41349Sbostic  * potentially expensive (at present, the only such thing detected is * or +
55*41349Sbostic  * at the start of the r.e., which can involve a lot of backup).  Regmlen is
56*41349Sbostic  * supplied because the test in regexec() needs it and regcomp() is computing
57*41349Sbostic  * it anyway.
58*41349Sbostic  */
59*41349Sbostic 
60*41349Sbostic /*
61*41349Sbostic  * Structure for regexp "program".  This is essentially a linear encoding
62*41349Sbostic  * of a nondeterministic finite-state machine (aka syntax charts or
63*41349Sbostic  * "railroad normal form" in parsing technology).  Each node is an opcode
64*41349Sbostic  * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
65*41349Sbostic  * all nodes except BRANCH implement concatenation; a "next" pointer with
66*41349Sbostic  * a BRANCH on both ends of it is connecting two alternatives.  (Here we
67*41349Sbostic  * have one of the subtle syntax dependencies:  an individual BRANCH (as
68*41349Sbostic  * opposed to a collection of them) is never concatenated with anything
69*41349Sbostic  * because of operator precedence.)  The operand of some types of node is
70*41349Sbostic  * a literal string; for others, it is a node leading into a sub-FSM.  In
71*41349Sbostic  * particular, the operand of a BRANCH node is the first node of the branch.
72*41349Sbostic  * (NB this is *not* a tree structure:  the tail of the branch connects
73*41349Sbostic  * to the thing following the set of BRANCHes.)  The opcodes are:
74*41349Sbostic  */
75*41349Sbostic 
76*41349Sbostic /* definition	number	opnd?	meaning */
77*41349Sbostic #define	END	0	/* no	End of program. */
78*41349Sbostic #define	BOL	1	/* no	Match "" at beginning of line. */
79*41349Sbostic #define	EOL	2	/* no	Match "" at end of line. */
80*41349Sbostic #define	ANY	3	/* no	Match any one character. */
81*41349Sbostic #define	ANYOF	4	/* str	Match any character in this string. */
82*41349Sbostic #define	ANYBUT	5	/* str	Match any character not in this string. */
83*41349Sbostic #define	BRANCH	6	/* node	Match this alternative, or the next... */
84*41349Sbostic #define	BACK	7	/* no	Match "", "next" ptr points backward. */
85*41349Sbostic #define	EXACTLY	8	/* str	Match this string. */
86*41349Sbostic #define	NOTHING	9	/* no	Match empty string. */
87*41349Sbostic #define	STAR	10	/* node	Match this (simple) thing 0 or more times. */
88*41349Sbostic #define	PLUS	11	/* node	Match this (simple) thing 1 or more times. */
89*41349Sbostic #define	WORDA	12	/* no	Match "" at wordchar, where prev is nonword */
90*41349Sbostic #define	WORDZ	13	/* no	Match "" at nonwordchar, where prev is word */
91*41349Sbostic #define	OPEN	20	/* no	Mark this point in input as start of #n. */
92*41349Sbostic 			/*	OPEN+1 is number 1, etc. */
93*41349Sbostic #define	CLOSE	30	/* no	Analogous to OPEN. */
94*41349Sbostic 
95*41349Sbostic /*
96*41349Sbostic  * Opcode notes:
97*41349Sbostic  *
98*41349Sbostic  * BRANCH	The set of branches constituting a single choice are hooked
99*41349Sbostic  *		together with their "next" pointers, since precedence prevents
100*41349Sbostic  *		anything being concatenated to any individual branch.  The
101*41349Sbostic  *		"next" pointer of the last BRANCH in a choice points to the
102*41349Sbostic  *		thing following the whole choice.  This is also where the
103*41349Sbostic  *		final "next" pointer of each individual branch points; each
104*41349Sbostic  *		branch starts with the operand node of a BRANCH node.
105*41349Sbostic  *
106*41349Sbostic  * BACK		Normal "next" pointers all implicitly point forward; BACK
107*41349Sbostic  *		exists to make loop structures possible.
108*41349Sbostic  *
109*41349Sbostic  * STAR,PLUS	'?', and complex '*' and '+', are implemented as circular
110*41349Sbostic  *		BRANCH structures using BACK.  Simple cases (one character
111*41349Sbostic  *		per match) are implemented with STAR and PLUS for speed
112*41349Sbostic  *		and to minimize recursive plunges.
113*41349Sbostic  *
114*41349Sbostic  * OPEN,CLOSE	...are numbered at compile time.
115*41349Sbostic  */
116*41349Sbostic 
117*41349Sbostic /*
118*41349Sbostic  * A node is one char of opcode followed by two chars of "next" pointer.
119*41349Sbostic  * "Next" pointers are stored as two 8-bit pieces, high order first.  The
120*41349Sbostic  * value is a positive offset from the opcode of the node containing it.
121*41349Sbostic  * An operand, if any, simply follows the node.  (Note that much of the
122*41349Sbostic  * code generation knows about this implicit relationship.)
123*41349Sbostic  *
124*41349Sbostic  * Using two bytes for the "next" pointer is vast overkill for most things,
125*41349Sbostic  * but allows patterns to get big without disasters.
126*41349Sbostic  */
127*41349Sbostic #define	OP(p)	(*(p))
128*41349Sbostic #define	NEXT(p)	(((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
129*41349Sbostic #define	OPERAND(p)	((p) + 3)
130*41349Sbostic 
131*41349Sbostic /*
132*41349Sbostic  * See regmagic.h for one further detail of program structure.
133*41349Sbostic  */
134*41349Sbostic 
135*41349Sbostic 
136*41349Sbostic /*
137*41349Sbostic  * Utility definitions.
138*41349Sbostic  */
139*41349Sbostic #ifndef CHARBITS
140*41349Sbostic #define	UCHARAT(p)	((int)*(unsigned char *)(p))
141*41349Sbostic #else
142*41349Sbostic #define	UCHARAT(p)	((int)*(p)&CHARBITS)
143*41349Sbostic #endif
144*41349Sbostic 
145*41349Sbostic #define	FAIL(m)	{ regerror(m); return(NULL); }
146*41349Sbostic #define	ISMULT(c)	((c) == '*' || (c) == '+' || (c) == '?')
147*41349Sbostic 
148*41349Sbostic /*
149*41349Sbostic  * Flags to be passed up and down.
150*41349Sbostic  */
151*41349Sbostic #define	HASWIDTH	01	/* Known never to match null string. */
152*41349Sbostic #define	SIMPLE		02	/* Simple enough to be STAR/PLUS operand. */
153*41349Sbostic #define	SPSTART		04	/* Starts with * or +. */
154*41349Sbostic #define	WORST		0	/* Worst case. */
155*41349Sbostic 
156*41349Sbostic /*
157*41349Sbostic  * Global work variables for regcomp().
158*41349Sbostic  */
159*41349Sbostic static char *regparse;		/* Input-scan pointer. */
160*41349Sbostic static int regnpar;		/* () count. */
161*41349Sbostic static char regdummy;
162*41349Sbostic static char *regcode;		/* Code-emit pointer; &regdummy = don't. */
163*41349Sbostic static long regsize;		/* Code size. */
164*41349Sbostic 
165*41349Sbostic /*
166*41349Sbostic  * Forward declarations for regcomp()'s friends.
167*41349Sbostic  */
168*41349Sbostic #ifndef STATIC
169*41349Sbostic #define	STATIC	static
170*41349Sbostic #endif
171*41349Sbostic STATIC char *reg();
172*41349Sbostic STATIC char *regbranch();
173*41349Sbostic STATIC char *regpiece();
174*41349Sbostic STATIC char *regatom();
175*41349Sbostic STATIC char *regnode();
176*41349Sbostic STATIC char *regnext();
177*41349Sbostic STATIC void regc();
178*41349Sbostic STATIC void reginsert();
179*41349Sbostic STATIC void regtail();
180*41349Sbostic STATIC void regoptail();
181*41349Sbostic #ifdef STRCSPN
182*41349Sbostic STATIC int strcspn();
183*41349Sbostic #endif
184*41349Sbostic 
185*41349Sbostic /*
186*41349Sbostic  - regcomp - compile a regular expression into internal code
187*41349Sbostic  *
188*41349Sbostic  * We can't allocate space until we know how big the compiled form will be,
189*41349Sbostic  * but we can't compile it (and thus know how big it is) until we've got a
190*41349Sbostic  * place to put the code.  So we cheat:  we compile it twice, once with code
191*41349Sbostic  * generation turned off and size counting turned on, and once "for real".
192*41349Sbostic  * This also means that we don't allocate space until we are sure that the
193*41349Sbostic  * thing really will compile successfully, and we never have to move the
194*41349Sbostic  * code and thus invalidate pointers into it.  (Note that it has to be in
195*41349Sbostic  * one piece because free() must be able to free it all.)
196*41349Sbostic  *
197*41349Sbostic  * Beware that the optimization-preparation code in here knows about some
198*41349Sbostic  * of the structure of the compiled regexp.
199*41349Sbostic  */
200*41349Sbostic regexp *
201*41349Sbostic regcomp(exp)
202*41349Sbostic char *exp;
203*41349Sbostic {
204*41349Sbostic 	register regexp *r;
205*41349Sbostic 	register char *scan;
206*41349Sbostic 	register char *longest;
207*41349Sbostic 	register int len;
208*41349Sbostic 	int flags;
209*41349Sbostic 	extern char *malloc();
210*41349Sbostic 
211*41349Sbostic 	if (exp == NULL)
212*41349Sbostic 		FAIL("NULL argument");
213*41349Sbostic 
214*41349Sbostic 	/* First pass: determine size, legality. */
215*41349Sbostic 	if (exp[0] == '.' && exp[1] == '*') exp += 2;  /* aid grep */
216*41349Sbostic 	regparse = exp;
217*41349Sbostic 	regnpar = 1;
218*41349Sbostic 	regsize = 0L;
219*41349Sbostic 	regcode = &regdummy;
220*41349Sbostic 	regc(MAGIC);
221*41349Sbostic 	if (reg(0, &flags) == NULL)
222*41349Sbostic 		return(NULL);
223*41349Sbostic 
224*41349Sbostic 	/* Small enough for pointer-storage convention? */
225*41349Sbostic 	if (regsize >= 32767L)		/* Probably could be 65535L. */
226*41349Sbostic 		FAIL("regexp too big");
227*41349Sbostic 
228*41349Sbostic 	/* Allocate space. */
229*41349Sbostic 	r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize);
230*41349Sbostic 	if (r == NULL)
231*41349Sbostic 		FAIL("out of space");
232*41349Sbostic 
233*41349Sbostic 	/* Second pass: emit code. */
234*41349Sbostic 	regparse = exp;
235*41349Sbostic 	regnpar = 1;
236*41349Sbostic 	regcode = r->program;
237*41349Sbostic 	regc(MAGIC);
238*41349Sbostic 	if (reg(0, &flags) == NULL)
239*41349Sbostic 		return(NULL);
240*41349Sbostic 
241*41349Sbostic 	/* Dig out information for optimizations. */
242*41349Sbostic 	r->regstart = '\0';	/* Worst-case defaults. */
243*41349Sbostic 	r->reganch = 0;
244*41349Sbostic 	r->regmust = NULL;
245*41349Sbostic 	r->regmlen = 0;
246*41349Sbostic 	scan = r->program+1;			/* First BRANCH. */
247*41349Sbostic 	if (OP(regnext(scan)) == END) {		/* Only one top-level choice. */
248*41349Sbostic 		scan = OPERAND(scan);
249*41349Sbostic 
250*41349Sbostic 		/* Starting-point info. */
251*41349Sbostic 		if (OP(scan) == EXACTLY)
252*41349Sbostic 			r->regstart = *OPERAND(scan);
253*41349Sbostic 		else if (OP(scan) == BOL)
254*41349Sbostic 			r->reganch++;
255*41349Sbostic 
256*41349Sbostic 		/*
257*41349Sbostic 		 * If there's something expensive in the r.e., find the
258*41349Sbostic 		 * longest literal string that must appear and make it the
259*41349Sbostic 		 * regmust.  Resolve ties in favor of later strings, since
260*41349Sbostic 		 * the regstart check works with the beginning of the r.e.
261*41349Sbostic 		 * and avoiding duplication strengthens checking.  Not a
262*41349Sbostic 		 * strong reason, but sufficient in the absence of others.
263*41349Sbostic 		 */
264*41349Sbostic 		if (flags&SPSTART) {
265*41349Sbostic 			longest = NULL;
266*41349Sbostic 			len = 0;
267*41349Sbostic 			for (; scan != NULL; scan = regnext(scan))
268*41349Sbostic 				if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
269*41349Sbostic 					longest = OPERAND(scan);
270*41349Sbostic 					len = strlen(OPERAND(scan));
271*41349Sbostic 				}
272*41349Sbostic 			r->regmust = longest;
273*41349Sbostic 			r->regmlen = len;
274*41349Sbostic 		}
275*41349Sbostic 	}
276*41349Sbostic 
277*41349Sbostic 	return(r);
278*41349Sbostic }
279*41349Sbostic 
280*41349Sbostic /*
281*41349Sbostic  - reg - regular expression, i.e. main body or parenthesized thing
282*41349Sbostic  *
283*41349Sbostic  * Caller must absorb opening parenthesis.
284*41349Sbostic  *
285*41349Sbostic  * Combining parenthesis handling with the base level of regular expression
286*41349Sbostic  * is a trifle forced, but the need to tie the tails of the branches to what
287*41349Sbostic  * follows makes it hard to avoid.
288*41349Sbostic  */
289*41349Sbostic static char *
290*41349Sbostic reg(paren, flagp)
291*41349Sbostic int paren;			/* Parenthesized? */
292*41349Sbostic int *flagp;
293*41349Sbostic {
294*41349Sbostic 	register char *ret;
295*41349Sbostic 	register char *br;
296*41349Sbostic 	register char *ender;
297*41349Sbostic 	register int parno;
298*41349Sbostic 	int flags;
299*41349Sbostic 
300*41349Sbostic 	*flagp = HASWIDTH;	/* Tentatively. */
301*41349Sbostic 
302*41349Sbostic 	/* Make an OPEN node, if parenthesized. */
303*41349Sbostic 	if (paren) {
304*41349Sbostic 		if (regnpar >= NSUBEXP)
305*41349Sbostic 			FAIL("too many ()");
306*41349Sbostic 		parno = regnpar;
307*41349Sbostic 		regnpar++;
308*41349Sbostic 		ret = regnode(OPEN+parno);
309*41349Sbostic 	} else
310*41349Sbostic 		ret = NULL;
311*41349Sbostic 
312*41349Sbostic 	/* Pick up the branches, linking them together. */
313*41349Sbostic 	br = regbranch(&flags);
314*41349Sbostic 	if (br == NULL)
315*41349Sbostic 		return(NULL);
316*41349Sbostic 	if (ret != NULL)
317*41349Sbostic 		regtail(ret, br);	/* OPEN -> first. */
318*41349Sbostic 	else
319*41349Sbostic 		ret = br;
320*41349Sbostic 	if (!(flags&HASWIDTH))
321*41349Sbostic 		*flagp &= ~HASWIDTH;
322*41349Sbostic 	*flagp |= flags&SPSTART;
323*41349Sbostic 	while (*regparse == '|' || *regparse == '\n') {
324*41349Sbostic 		regparse++;
325*41349Sbostic 		br = regbranch(&flags);
326*41349Sbostic 		if (br == NULL)
327*41349Sbostic 			return(NULL);
328*41349Sbostic 		regtail(ret, br);	/* BRANCH -> BRANCH. */
329*41349Sbostic 		if (!(flags&HASWIDTH))
330*41349Sbostic 			*flagp &= ~HASWIDTH;
331*41349Sbostic 		*flagp |= flags&SPSTART;
332*41349Sbostic 	}
333*41349Sbostic 
334*41349Sbostic 	/* Make a closing node, and hook it on the end. */
335*41349Sbostic 	ender = regnode((paren) ? CLOSE+parno : END);
336*41349Sbostic 	regtail(ret, ender);
337*41349Sbostic 
338*41349Sbostic 	/* Hook the tails of the branches to the closing node. */
339*41349Sbostic 	for (br = ret; br != NULL; br = regnext(br))
340*41349Sbostic 		regoptail(br, ender);
341*41349Sbostic 
342*41349Sbostic 	/* Check for proper termination. */
343*41349Sbostic 	if (paren && *regparse++ != ')') {
344*41349Sbostic 		FAIL("unmatched ()");
345*41349Sbostic 	} else if (!paren && *regparse != '\0') {
346*41349Sbostic 		if (*regparse == ')') {
347*41349Sbostic 			FAIL("unmatched ()");
348*41349Sbostic 		} else
349*41349Sbostic 			FAIL("junk on end");	/* "Can't happen". */
350*41349Sbostic 		/* NOTREACHED */
351*41349Sbostic 	}
352*41349Sbostic 
353*41349Sbostic 	return(ret);
354*41349Sbostic }
355*41349Sbostic 
356*41349Sbostic /*
357*41349Sbostic  - regbranch - one alternative of an | operator
358*41349Sbostic  *
359*41349Sbostic  * Implements the concatenation operator.
360*41349Sbostic  */
361*41349Sbostic static char *
362*41349Sbostic regbranch(flagp)
363*41349Sbostic int *flagp;
364*41349Sbostic {
365*41349Sbostic 	register char *ret;
366*41349Sbostic 	register char *chain;
367*41349Sbostic 	register char *latest;
368*41349Sbostic 	int flags;
369*41349Sbostic 
370*41349Sbostic 	*flagp = WORST;		/* Tentatively. */
371*41349Sbostic 
372*41349Sbostic 	ret = regnode(BRANCH);
373*41349Sbostic 	chain = NULL;
374*41349Sbostic 	while (*regparse != '\0' && *regparse != ')' &&
375*41349Sbostic 	       *regparse != '\n' && *regparse != '|') {
376*41349Sbostic 		latest = regpiece(&flags);
377*41349Sbostic 		if (latest == NULL)
378*41349Sbostic 			return(NULL);
379*41349Sbostic 		*flagp |= flags&HASWIDTH;
380*41349Sbostic 		if (chain == NULL)	/* First piece. */
381*41349Sbostic 			*flagp |= flags&SPSTART;
382*41349Sbostic 		else
383*41349Sbostic 			regtail(chain, latest);
384*41349Sbostic 		chain = latest;
385*41349Sbostic 	}
386*41349Sbostic 	if (chain == NULL)	/* Loop ran zero times. */
387*41349Sbostic 		(void) regnode(NOTHING);
388*41349Sbostic 
389*41349Sbostic 	return(ret);
390*41349Sbostic }
391*41349Sbostic 
392*41349Sbostic /*
393*41349Sbostic  - regpiece - something followed by possible [*+?]
394*41349Sbostic  *
395*41349Sbostic  * Note that the branching code sequences used for ? and the general cases
396*41349Sbostic  * of * and + are somewhat optimized:  they use the same NOTHING node as
397*41349Sbostic  * both the endmarker for their branch list and the body of the last branch.
398*41349Sbostic  * It might seem that this node could be dispensed with entirely, but the
399*41349Sbostic  * endmarker role is not redundant.
400*41349Sbostic  */
401*41349Sbostic static char *
402*41349Sbostic regpiece(flagp)
403*41349Sbostic int *flagp;
404*41349Sbostic {
405*41349Sbostic 	register char *ret;
406*41349Sbostic 	register char op;
407*41349Sbostic 	register char *next;
408*41349Sbostic 	int flags;
409*41349Sbostic 
410*41349Sbostic 	ret = regatom(&flags);
411*41349Sbostic 	if (ret == NULL)
412*41349Sbostic 		return(NULL);
413*41349Sbostic 
414*41349Sbostic 	op = *regparse;
415*41349Sbostic 	if (!ISMULT(op)) {
416*41349Sbostic 		*flagp = flags;
417*41349Sbostic 		return(ret);
418*41349Sbostic 	}
419*41349Sbostic 
420*41349Sbostic 	if (!(flags&HASWIDTH) && op != '?')
421*41349Sbostic 		FAIL("*+ operand could be empty");
422*41349Sbostic 	*flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
423*41349Sbostic 
424*41349Sbostic 	if (op == '*' && (flags&SIMPLE))
425*41349Sbostic 		reginsert(STAR, ret);
426*41349Sbostic 	else if (op == '*') {
427*41349Sbostic 		/* Emit x* as (x&|), where & means "self". */
428*41349Sbostic 		reginsert(BRANCH, ret);			/* Either x */
429*41349Sbostic 		regoptail(ret, regnode(BACK));		/* and loop */
430*41349Sbostic 		regoptail(ret, ret);			/* back */
431*41349Sbostic 		regtail(ret, regnode(BRANCH));		/* or */
432*41349Sbostic 		regtail(ret, regnode(NOTHING));		/* null. */
433*41349Sbostic 	} else if (op == '+' && (flags&SIMPLE))
434*41349Sbostic 		reginsert(PLUS, ret);
435*41349Sbostic 	else if (op == '+') {
436*41349Sbostic 		/* Emit x+ as x(&|), where & means "self". */
437*41349Sbostic 		next = regnode(BRANCH);			/* Either */
438*41349Sbostic 		regtail(ret, next);
439*41349Sbostic 		regtail(regnode(BACK), ret);		/* loop back */
440*41349Sbostic 		regtail(next, regnode(BRANCH));		/* or */
441*41349Sbostic 		regtail(ret, regnode(NOTHING));		/* null. */
442*41349Sbostic 	} else if (op == '?') {
443*41349Sbostic 		/* Emit x? as (x|) */
444*41349Sbostic 		reginsert(BRANCH, ret);			/* Either x */
445*41349Sbostic 		regtail(ret, regnode(BRANCH));		/* or */
446*41349Sbostic 		next = regnode(NOTHING);		/* null. */
447*41349Sbostic 		regtail(ret, next);
448*41349Sbostic 		regoptail(ret, next);
449*41349Sbostic 	}
450*41349Sbostic 	regparse++;
451*41349Sbostic 	if (ISMULT(*regparse))
452*41349Sbostic 		FAIL("nested *?+");
453*41349Sbostic 
454*41349Sbostic 	return(ret);
455*41349Sbostic }
456*41349Sbostic 
457*41349Sbostic /*
458*41349Sbostic  - regatom - the lowest level
459*41349Sbostic  *
460*41349Sbostic  * Optimization:  gobbles an entire sequence of ordinary characters so that
461*41349Sbostic  * it can turn them into a single node, which is smaller to store and
462*41349Sbostic  * faster to run.  Backslashed characters are exceptions, each becoming a
463*41349Sbostic  * separate node; the code is simpler that way and it's not worth fixing.
464*41349Sbostic  */
465*41349Sbostic static char *
466*41349Sbostic regatom(flagp)
467*41349Sbostic int *flagp;
468*41349Sbostic {
469*41349Sbostic 	register char *ret;
470*41349Sbostic 	int flags;
471*41349Sbostic 
472*41349Sbostic 	*flagp = WORST;		/* Tentatively. */
473*41349Sbostic 
474*41349Sbostic 	switch (*regparse++) {
475*41349Sbostic 	/* FIXME: these chars only have meaning at beg/end of pat? */
476*41349Sbostic 	case '^':
477*41349Sbostic 		ret = regnode(BOL);
478*41349Sbostic 		break;
479*41349Sbostic 	case '$':
480*41349Sbostic 		ret = regnode(EOL);
481*41349Sbostic 		break;
482*41349Sbostic 	case '.':
483*41349Sbostic 		ret = regnode(ANY);
484*41349Sbostic 		*flagp |= HASWIDTH|SIMPLE;
485*41349Sbostic 		break;
486*41349Sbostic 	case '[': {
487*41349Sbostic 			register int class;
488*41349Sbostic 			register int classend;
489*41349Sbostic 
490*41349Sbostic 			if (*regparse == '^') {	/* Complement of range. */
491*41349Sbostic 				ret = regnode(ANYBUT);
492*41349Sbostic 				regparse++;
493*41349Sbostic 			} else
494*41349Sbostic 				ret = regnode(ANYOF);
495*41349Sbostic 			if (*regparse == ']' || *regparse == '-')
496*41349Sbostic 				regc(*regparse++);
497*41349Sbostic 			while (*regparse != '\0' && *regparse != ']') {
498*41349Sbostic 				if (*regparse == '-') {
499*41349Sbostic 					regparse++;
500*41349Sbostic 					if (*regparse == ']' || *regparse == '\0')
501*41349Sbostic 						regc('-');
502*41349Sbostic 					else {
503*41349Sbostic 						class = UCHARAT(regparse-2)+1;
504*41349Sbostic 						classend = UCHARAT(regparse);
505*41349Sbostic 						if (class > classend+1)
506*41349Sbostic 							FAIL("invalid [] range");
507*41349Sbostic 						for (; class <= classend; class++)
508*41349Sbostic 							regc(class);
509*41349Sbostic 						regparse++;
510*41349Sbostic 					}
511*41349Sbostic 				} else
512*41349Sbostic 					regc(*regparse++);
513*41349Sbostic 			}
514*41349Sbostic 			regc('\0');
515*41349Sbostic 			if (*regparse != ']')
516*41349Sbostic 				FAIL("unmatched []");
517*41349Sbostic 			regparse++;
518*41349Sbostic 			*flagp |= HASWIDTH|SIMPLE;
519*41349Sbostic 		}
520*41349Sbostic 		break;
521*41349Sbostic 	case '(':
522*41349Sbostic 		ret = reg(1, &flags);
523*41349Sbostic 		if (ret == NULL)
524*41349Sbostic 			return(NULL);
525*41349Sbostic 		*flagp |= flags&(HASWIDTH|SPSTART);
526*41349Sbostic 		break;
527*41349Sbostic 	case '\0':
528*41349Sbostic 	case '|':
529*41349Sbostic 	case '\n':
530*41349Sbostic 	case ')':
531*41349Sbostic 		FAIL("internal urp");	/* Supposed to be caught earlier. */
532*41349Sbostic 		break;
533*41349Sbostic 	case '?':
534*41349Sbostic 	case '+':
535*41349Sbostic 	case '*':
536*41349Sbostic 		FAIL("?+* follows nothing");
537*41349Sbostic 		break;
538*41349Sbostic 	case '\\':
539*41349Sbostic 		switch (*regparse++) {
540*41349Sbostic 		case '\0':
541*41349Sbostic 			FAIL("trailing \\");
542*41349Sbostic 			break;
543*41349Sbostic 		case '<':
544*41349Sbostic 			ret = regnode(WORDA);
545*41349Sbostic 			break;
546*41349Sbostic 		case '>':
547*41349Sbostic 			ret = regnode(WORDZ);
548*41349Sbostic 			break;
549*41349Sbostic 		/* FIXME: Someday handle \1, \2, ... */
550*41349Sbostic 		default:
551*41349Sbostic 			/* Handle general quoted chars in exact-match routine */
552*41349Sbostic 			goto de_fault;
553*41349Sbostic 		}
554*41349Sbostic 		break;
555*41349Sbostic 	de_fault:
556*41349Sbostic 	default:
557*41349Sbostic 		/*
558*41349Sbostic 		 * Encode a string of characters to be matched exactly.
559*41349Sbostic 		 *
560*41349Sbostic 		 * This is a bit tricky due to quoted chars and due to
561*41349Sbostic 		 * '*', '+', and '?' taking the SINGLE char previous
562*41349Sbostic 		 * as their operand.
563*41349Sbostic 		 *
564*41349Sbostic 		 * On entry, the char at regparse[-1] is going to go
565*41349Sbostic 		 * into the string, no matter what it is.  (It could be
566*41349Sbostic 		 * following a \ if we are entered from the '\' case.)
567*41349Sbostic 		 *
568*41349Sbostic 		 * Basic idea is to pick up a good char in  ch  and
569*41349Sbostic 		 * examine the next char.  If it's *+? then we twiddle.
570*41349Sbostic 		 * If it's \ then we frozzle.  If it's other magic char
571*41349Sbostic 		 * we push  ch  and terminate the string.  If none of the
572*41349Sbostic 		 * above, we push  ch  on the string and go around again.
573*41349Sbostic 		 *
574*41349Sbostic 		 *  regprev  is used to remember where "the current char"
575*41349Sbostic 		 * starts in the string, if due to a *+? we need to back
576*41349Sbostic 		 * up and put the current char in a separate, 1-char, string.
577*41349Sbostic 		 * When  regprev  is NULL,  ch  is the only char in the
578*41349Sbostic 		 * string; this is used in *+? handling, and in setting
579*41349Sbostic 		 * flags |= SIMPLE at the end.
580*41349Sbostic 		 */
581*41349Sbostic 		{
582*41349Sbostic 			char *regprev;
583*41349Sbostic 			register char ch;
584*41349Sbostic 
585*41349Sbostic 			regparse--;			/* Look at cur char */
586*41349Sbostic 			ret = regnode(EXACTLY);
587*41349Sbostic 			for ( regprev = 0 ; ; ) {
588*41349Sbostic 				ch = *regparse++;	/* Get current char */
589*41349Sbostic 				switch (*regparse) {	/* look at next one */
590*41349Sbostic 
591*41349Sbostic 				default:
592*41349Sbostic 					regc(ch);	/* Add cur to string */
593*41349Sbostic 					break;
594*41349Sbostic 
595*41349Sbostic 				case '.': case '[': case '(':
596*41349Sbostic 				case ')': case '|': case '\n':
597*41349Sbostic 				case '$': case '^':
598*41349Sbostic 				case '\0':
599*41349Sbostic 				/* FIXME, $ and ^ should not always be magic */
600*41349Sbostic 				magic:
601*41349Sbostic 					regc(ch);	/* dump cur char */
602*41349Sbostic 					goto done;	/* and we are done */
603*41349Sbostic 
604*41349Sbostic 				case '?': case '+': case '*':
605*41349Sbostic 					if (!regprev) 	/* If just ch in str, */
606*41349Sbostic 						goto magic;	/* use it */
607*41349Sbostic 					/* End mult-char string one early */
608*41349Sbostic 					regparse = regprev; /* Back up parse */
609*41349Sbostic 					goto done;
610*41349Sbostic 
611*41349Sbostic 				case '\\':
612*41349Sbostic 					regc(ch);	/* Cur char OK */
613*41349Sbostic 					switch (regparse[1]){ /* Look after \ */
614*41349Sbostic 					case '\0':
615*41349Sbostic 					case '<':
616*41349Sbostic 					case '>':
617*41349Sbostic 					/* FIXME: Someday handle \1, \2, ... */
618*41349Sbostic 						goto done; /* Not quoted */
619*41349Sbostic 					default:
620*41349Sbostic 						/* Backup point is \, scan							 * point is after it. */
621*41349Sbostic 						regprev = regparse;
622*41349Sbostic 						regparse++;
623*41349Sbostic 						continue;	/* NOT break; */
624*41349Sbostic 					}
625*41349Sbostic 				}
626*41349Sbostic 				regprev = regparse;	/* Set backup point */
627*41349Sbostic 			}
628*41349Sbostic 		done:
629*41349Sbostic 			regc('\0');
630*41349Sbostic 			*flagp |= HASWIDTH;
631*41349Sbostic 			if (!regprev)		/* One char? */
632*41349Sbostic 				*flagp |= SIMPLE;
633*41349Sbostic 		}
634*41349Sbostic 		break;
635*41349Sbostic 	}
636*41349Sbostic 
637*41349Sbostic 	return(ret);
638*41349Sbostic }
639*41349Sbostic 
640*41349Sbostic /*
641*41349Sbostic  - regnode - emit a node
642*41349Sbostic  */
643*41349Sbostic static char *			/* Location. */
644*41349Sbostic regnode(op)
645*41349Sbostic char op;
646*41349Sbostic {
647*41349Sbostic 	register char *ret;
648*41349Sbostic 	register char *ptr;
649*41349Sbostic 
650*41349Sbostic 	ret = regcode;
651*41349Sbostic 	if (ret == &regdummy) {
652*41349Sbostic 		regsize += 3;
653*41349Sbostic 		return(ret);
654*41349Sbostic 	}
655*41349Sbostic 
656*41349Sbostic 	ptr = ret;
657*41349Sbostic 	*ptr++ = op;
658*41349Sbostic 	*ptr++ = '\0';		/* Null "next" pointer. */
659*41349Sbostic 	*ptr++ = '\0';
660*41349Sbostic 	regcode = ptr;
661*41349Sbostic 
662*41349Sbostic 	return(ret);
663*41349Sbostic }
664*41349Sbostic 
665*41349Sbostic /*
666*41349Sbostic  - regc - emit (if appropriate) a byte of code
667*41349Sbostic  */
668*41349Sbostic static void
669*41349Sbostic regc(b)
670*41349Sbostic char b;
671*41349Sbostic {
672*41349Sbostic 	if (regcode != &regdummy)
673*41349Sbostic 		*regcode++ = b;
674*41349Sbostic 	else
675*41349Sbostic 		regsize++;
676*41349Sbostic }
677*41349Sbostic 
678*41349Sbostic /*
679*41349Sbostic  - reginsert - insert an operator in front of already-emitted operand
680*41349Sbostic  *
681*41349Sbostic  * Means relocating the operand.
682*41349Sbostic  */
683*41349Sbostic static void
684*41349Sbostic reginsert(op, opnd)
685*41349Sbostic char op;
686*41349Sbostic char *opnd;
687*41349Sbostic {
688*41349Sbostic 	register char *src;
689*41349Sbostic 	register char *dst;
690*41349Sbostic 	register char *place;
691*41349Sbostic 
692*41349Sbostic 	if (regcode == &regdummy) {
693*41349Sbostic 		regsize += 3;
694*41349Sbostic 		return;
695*41349Sbostic 	}
696*41349Sbostic 
697*41349Sbostic 	src = regcode;
698*41349Sbostic 	regcode += 3;
699*41349Sbostic 	dst = regcode;
700*41349Sbostic 	while (src > opnd)
701*41349Sbostic 		*--dst = *--src;
702*41349Sbostic 
703*41349Sbostic 	place = opnd;		/* Op node, where operand used to be. */
704*41349Sbostic 	*place++ = op;
705*41349Sbostic 	*place++ = '\0';
706*41349Sbostic 	*place++ = '\0';
707*41349Sbostic }
708*41349Sbostic 
709*41349Sbostic /*
710*41349Sbostic  - regtail - set the next-pointer at the end of a node chain
711*41349Sbostic  */
712*41349Sbostic static void
713*41349Sbostic regtail(p, val)
714*41349Sbostic char *p;
715*41349Sbostic char *val;
716*41349Sbostic {
717*41349Sbostic 	register char *scan;
718*41349Sbostic 	register char *temp;
719*41349Sbostic 	register int offset;
720*41349Sbostic 
721*41349Sbostic 	if (p == &regdummy)
722*41349Sbostic 		return;
723*41349Sbostic 
724*41349Sbostic 	/* Find last node. */
725*41349Sbostic 	scan = p;
726*41349Sbostic 	for (;;) {
727*41349Sbostic 		temp = regnext(scan);
728*41349Sbostic 		if (temp == NULL)
729*41349Sbostic 			break;
730*41349Sbostic 		scan = temp;
731*41349Sbostic 	}
732*41349Sbostic 
733*41349Sbostic 	if (OP(scan) == BACK)
734*41349Sbostic 		offset = scan - val;
735*41349Sbostic 	else
736*41349Sbostic 		offset = val - scan;
737*41349Sbostic 	*(scan+1) = (offset>>8)&0377;
738*41349Sbostic 	*(scan+2) = offset&0377;
739*41349Sbostic }
740*41349Sbostic 
741*41349Sbostic /*
742*41349Sbostic  - regoptail - regtail on operand of first argument; nop if operandless
743*41349Sbostic  */
744*41349Sbostic static void
745*41349Sbostic regoptail(p, val)
746*41349Sbostic char *p;
747*41349Sbostic char *val;
748*41349Sbostic {
749*41349Sbostic 	/* "Operandless" and "op != BRANCH" are synonymous in practice. */
750*41349Sbostic 	if (p == NULL || p == &regdummy || OP(p) != BRANCH)
751*41349Sbostic 		return;
752*41349Sbostic 	regtail(OPERAND(p), val);
753*41349Sbostic }
754*41349Sbostic 
755*41349Sbostic /*
756*41349Sbostic  * regexec and friends
757*41349Sbostic  */
758*41349Sbostic 
759*41349Sbostic /*
760*41349Sbostic  * Global work variables for regexec().
761*41349Sbostic  */
762*41349Sbostic static char *reginput;		/* String-input pointer. */
763*41349Sbostic static char *regbol;		/* Beginning of input, for ^ check. */
764*41349Sbostic static char **regstartp;	/* Pointer to startp array. */
765*41349Sbostic static char **regendp;		/* Ditto for endp. */
766*41349Sbostic 
767*41349Sbostic /*
768*41349Sbostic  * Forwards.
769*41349Sbostic  */
770*41349Sbostic STATIC int regtry();
771*41349Sbostic STATIC int regmatch();
772*41349Sbostic STATIC int regrepeat();
773*41349Sbostic 
774*41349Sbostic #ifdef DEBUG
775*41349Sbostic int regnarrate = 0;
776*41349Sbostic void regdump();
777*41349Sbostic STATIC char *regprop();
778*41349Sbostic #endif
779*41349Sbostic 
780*41349Sbostic /*
781*41349Sbostic  - regexec - match a regexp against a string
782*41349Sbostic  */
783*41349Sbostic int
784*41349Sbostic regexec(prog, string)
785*41349Sbostic register regexp *prog;
786*41349Sbostic register char *string;
787*41349Sbostic {
788*41349Sbostic 	register char *s;
789*41349Sbostic 	extern char *strchr();
790*41349Sbostic 
791*41349Sbostic 	/* Be paranoid... */
792*41349Sbostic 	if (prog == NULL || string == NULL) {
793*41349Sbostic 		regerror("NULL parameter");
794*41349Sbostic 		return(0);
795*41349Sbostic 	}
796*41349Sbostic 
797*41349Sbostic 	/* Check validity of program. */
798*41349Sbostic 	if (UCHARAT(prog->program) != MAGIC) {
799*41349Sbostic 		regerror("corrupted program");
800*41349Sbostic 		return(0);
801*41349Sbostic 	}
802*41349Sbostic 
803*41349Sbostic 	/* If there is a "must appear" string, look for it. */
804*41349Sbostic 	if (prog->regmust != NULL) {
805*41349Sbostic 		s = string;
806*41349Sbostic 		while ((s = strchr(s, prog->regmust[0])) != NULL) {
807*41349Sbostic 			if (strncmp(s, prog->regmust, prog->regmlen) == 0)
808*41349Sbostic 				break;	/* Found it. */
809*41349Sbostic 			s++;
810*41349Sbostic 		}
811*41349Sbostic 		if (s == NULL)	/* Not present. */
812*41349Sbostic 			return(0);
813*41349Sbostic 	}
814*41349Sbostic 
815*41349Sbostic 	/* Mark beginning of line for ^ . */
816*41349Sbostic 	regbol = string;
817*41349Sbostic 
818*41349Sbostic 	/* Simplest case:  anchored match need be tried only once. */
819*41349Sbostic 	if (prog->reganch)
820*41349Sbostic 		return(regtry(prog, string));
821*41349Sbostic 
822*41349Sbostic 	/* Messy cases:  unanchored match. */
823*41349Sbostic 	s = string;
824*41349Sbostic 	if (prog->regstart != '\0')
825*41349Sbostic 		/* We know what char it must start with. */
826*41349Sbostic 		while ((s = strchr(s, prog->regstart)) != NULL) {
827*41349Sbostic 			if (regtry(prog, s))
828*41349Sbostic 				return(1);
829*41349Sbostic 			s++;
830*41349Sbostic 		}
831*41349Sbostic 	else
832*41349Sbostic 		/* We don't -- general case. */
833*41349Sbostic 		do {
834*41349Sbostic 			if (regtry(prog, s))
835*41349Sbostic 				return(1);
836*41349Sbostic 		} while (*s++ != '\0');
837*41349Sbostic 
838*41349Sbostic 	/* Failure. */
839*41349Sbostic 	return(0);
840*41349Sbostic }
841*41349Sbostic 
842*41349Sbostic /*
843*41349Sbostic  - regtry - try match at specific point
844*41349Sbostic  */
845*41349Sbostic static int			/* 0 failure, 1 success */
846*41349Sbostic regtry(prog, string)
847*41349Sbostic regexp *prog;
848*41349Sbostic char *string;
849*41349Sbostic {
850*41349Sbostic 	register int i;
851*41349Sbostic 	register char **sp;
852*41349Sbostic 	register char **ep;
853*41349Sbostic 
854*41349Sbostic 	reginput = string;
855*41349Sbostic 	regstartp = prog->startp;
856*41349Sbostic 	regendp = prog->endp;
857*41349Sbostic 
858*41349Sbostic 	sp = prog->startp;
859*41349Sbostic 	ep = prog->endp;
860*41349Sbostic 	for (i = NSUBEXP; i > 0; i--) {
861*41349Sbostic 		*sp++ = NULL;
862*41349Sbostic 		*ep++ = NULL;
863*41349Sbostic 	}
864*41349Sbostic 	if (regmatch(prog->program + 1)) {
865*41349Sbostic 		prog->startp[0] = string;
866*41349Sbostic 		prog->endp[0] = reginput;
867*41349Sbostic 		return(1);
868*41349Sbostic 	} else
869*41349Sbostic 		return(0);
870*41349Sbostic }
871*41349Sbostic 
872*41349Sbostic /*
873*41349Sbostic  - regmatch - main matching routine
874*41349Sbostic  *
875*41349Sbostic  * Conceptually the strategy is simple:  check to see whether the current
876*41349Sbostic  * node matches, call self recursively to see whether the rest matches,
877*41349Sbostic  * and then act accordingly.  In practice we make some effort to avoid
878*41349Sbostic  * recursion, in particular by going through "ordinary" nodes (that don't
879*41349Sbostic  * need to know whether the rest of the match failed) by a loop instead of
880*41349Sbostic  * by recursion.
881*41349Sbostic  */
882*41349Sbostic static int			/* 0 failure, 1 success */
883*41349Sbostic regmatch(prog)
884*41349Sbostic char *prog;
885*41349Sbostic {
886*41349Sbostic 	register char *scan;	/* Current node. */
887*41349Sbostic 	char *next;		/* Next node. */
888*41349Sbostic 	extern char *strchr();
889*41349Sbostic 
890*41349Sbostic 	scan = prog;
891*41349Sbostic #ifdef DEBUG
892*41349Sbostic 	if (scan != NULL && regnarrate)
893*41349Sbostic 		fprintf(stderr, "%s(\n", regprop(scan));
894*41349Sbostic #endif
895*41349Sbostic 	while (scan != NULL) {
896*41349Sbostic #ifdef DEBUG
897*41349Sbostic 		if (regnarrate)
898*41349Sbostic 			fprintf(stderr, "%s...\n", regprop(scan));
899*41349Sbostic #endif
900*41349Sbostic 		next = regnext(scan);
901*41349Sbostic 
902*41349Sbostic 		switch (OP(scan)) {
903*41349Sbostic 		case BOL:
904*41349Sbostic 			if (reginput != regbol)
905*41349Sbostic 				return(0);
906*41349Sbostic 			break;
907*41349Sbostic 		case EOL:
908*41349Sbostic 			if (*reginput != '\0')
909*41349Sbostic 				return(0);
910*41349Sbostic 			break;
911*41349Sbostic 		case WORDA:
912*41349Sbostic 			/* Must be looking at a letter, digit, or _ */
913*41349Sbostic 			if ((!isalnum(*reginput)) && *reginput != '_')
914*41349Sbostic 				return(0);
915*41349Sbostic 			/* Prev must be BOL or nonword */
916*41349Sbostic 			if (reginput > regbol &&
917*41349Sbostic 			    (isalnum(reginput[-1]) || reginput[-1] == '_'))
918*41349Sbostic 				return(0);
919*41349Sbostic 			break;
920*41349Sbostic 		case WORDZ:
921*41349Sbostic 			/* Must be looking at non letter, digit, or _ */
922*41349Sbostic 			if (isalnum(*reginput) || *reginput == '_')
923*41349Sbostic 				return(0);
924*41349Sbostic 			/* We don't care what the previous char was */
925*41349Sbostic 			break;
926*41349Sbostic 		case ANY:
927*41349Sbostic 			if (*reginput == '\0')
928*41349Sbostic 				return(0);
929*41349Sbostic 			reginput++;
930*41349Sbostic 			break;
931*41349Sbostic 		case EXACTLY: {
932*41349Sbostic 				register int len;
933*41349Sbostic 				register char *opnd;
934*41349Sbostic 
935*41349Sbostic 				opnd = OPERAND(scan);
936*41349Sbostic 				/* Inline the first character, for speed. */
937*41349Sbostic 				if (*opnd != *reginput)
938*41349Sbostic 					return(0);
939*41349Sbostic 				len = strlen(opnd);
940*41349Sbostic 				if (len > 1 && strncmp(opnd, reginput, len) != 0)
941*41349Sbostic 					return(0);
942*41349Sbostic 				reginput += len;
943*41349Sbostic 			}
944*41349Sbostic 			break;
945*41349Sbostic 		case ANYOF:
946*41349Sbostic  			if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
947*41349Sbostic 				return(0);
948*41349Sbostic 			reginput++;
949*41349Sbostic 			break;
950*41349Sbostic 		case ANYBUT:
951*41349Sbostic  			if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
952*41349Sbostic 				return(0);
953*41349Sbostic 			reginput++;
954*41349Sbostic 			break;
955*41349Sbostic 		case NOTHING:
956*41349Sbostic 			break;
957*41349Sbostic 		case BACK:
958*41349Sbostic 			break;
959*41349Sbostic 		case OPEN+1:
960*41349Sbostic 		case OPEN+2:
961*41349Sbostic 		case OPEN+3:
962*41349Sbostic 		case OPEN+4:
963*41349Sbostic 		case OPEN+5:
964*41349Sbostic 		case OPEN+6:
965*41349Sbostic 		case OPEN+7:
966*41349Sbostic 		case OPEN+8:
967*41349Sbostic 		case OPEN+9: {
968*41349Sbostic 				register int no;
969*41349Sbostic 				register char *save;
970*41349Sbostic 
971*41349Sbostic 				no = OP(scan) - OPEN;
972*41349Sbostic 				save = reginput;
973*41349Sbostic 
974*41349Sbostic 				if (regmatch(next)) {
975*41349Sbostic 					/*
976*41349Sbostic 					 * Don't set startp if some later
977*41349Sbostic 					 * invocation of the same parentheses
978*41349Sbostic 					 * already has.
979*41349Sbostic 					 */
980*41349Sbostic 					if (regstartp[no] == NULL)
981*41349Sbostic 						regstartp[no] = save;
982*41349Sbostic 					return(1);
983*41349Sbostic 				} else
984*41349Sbostic 					return(0);
985*41349Sbostic 			}
986*41349Sbostic 			break;
987*41349Sbostic 		case CLOSE+1:
988*41349Sbostic 		case CLOSE+2:
989*41349Sbostic 		case CLOSE+3:
990*41349Sbostic 		case CLOSE+4:
991*41349Sbostic 		case CLOSE+5:
992*41349Sbostic 		case CLOSE+6:
993*41349Sbostic 		case CLOSE+7:
994*41349Sbostic 		case CLOSE+8:
995*41349Sbostic 		case CLOSE+9: {
996*41349Sbostic 				register int no;
997*41349Sbostic 				register char *save;
998*41349Sbostic 
999*41349Sbostic 				no = OP(scan) - CLOSE;
1000*41349Sbostic 				save = reginput;
1001*41349Sbostic 
1002*41349Sbostic 				if (regmatch(next)) {
1003*41349Sbostic 					/*
1004*41349Sbostic 					 * Don't set endp if some later
1005*41349Sbostic 					 * invocation of the same parentheses
1006*41349Sbostic 					 * already has.
1007*41349Sbostic 					 */
1008*41349Sbostic 					if (regendp[no] == NULL)
1009*41349Sbostic 						regendp[no] = save;
1010*41349Sbostic 					return(1);
1011*41349Sbostic 				} else
1012*41349Sbostic 					return(0);
1013*41349Sbostic 			}
1014*41349Sbostic 			break;
1015*41349Sbostic 		case BRANCH: {
1016*41349Sbostic 				register char *save;
1017*41349Sbostic 
1018*41349Sbostic 				if (OP(next) != BRANCH)		/* No choice. */
1019*41349Sbostic 					next = OPERAND(scan);	/* Avoid recursion. */
1020*41349Sbostic 				else {
1021*41349Sbostic 					do {
1022*41349Sbostic 						save = reginput;
1023*41349Sbostic 						if (regmatch(OPERAND(scan)))
1024*41349Sbostic 							return(1);
1025*41349Sbostic 						reginput = save;
1026*41349Sbostic 						scan = regnext(scan);
1027*41349Sbostic 					} while (scan != NULL && OP(scan) == BRANCH);
1028*41349Sbostic 					return(0);
1029*41349Sbostic 					/* NOTREACHED */
1030*41349Sbostic 				}
1031*41349Sbostic 			}
1032*41349Sbostic 			break;
1033*41349Sbostic 		case STAR:
1034*41349Sbostic 		case PLUS: {
1035*41349Sbostic 				register char nextch;
1036*41349Sbostic 				register int no;
1037*41349Sbostic 				register char *save;
1038*41349Sbostic 				register int min;
1039*41349Sbostic 
1040*41349Sbostic 				/*
1041*41349Sbostic 				 * Lookahead to avoid useless match attempts
1042*41349Sbostic 				 * when we know what character comes next.
1043*41349Sbostic 				 */
1044*41349Sbostic 				nextch = '\0';
1045*41349Sbostic 				if (OP(next) == EXACTLY)
1046*41349Sbostic 					nextch = *OPERAND(next);
1047*41349Sbostic 				min = (OP(scan) == STAR) ? 0 : 1;
1048*41349Sbostic 				save = reginput;
1049*41349Sbostic 				no = regrepeat(OPERAND(scan));
1050*41349Sbostic 				while (no >= min) {
1051*41349Sbostic 					/* If it could work, try it. */
1052*41349Sbostic 					if (nextch == '\0' || *reginput == nextch)
1053*41349Sbostic 						if (regmatch(next))
1054*41349Sbostic 							return(1);
1055*41349Sbostic 					/* Couldn't or didn't -- back up. */
1056*41349Sbostic 					no--;
1057*41349Sbostic 					reginput = save + no;
1058*41349Sbostic 				}
1059*41349Sbostic 				return(0);
1060*41349Sbostic 			}
1061*41349Sbostic 			break;
1062*41349Sbostic 		case END:
1063*41349Sbostic 			return(1);	/* Success! */
1064*41349Sbostic 			break;
1065*41349Sbostic 		default:
1066*41349Sbostic 			regerror("memory corruption");
1067*41349Sbostic 			return(0);
1068*41349Sbostic 			break;
1069*41349Sbostic 		}
1070*41349Sbostic 
1071*41349Sbostic 		scan = next;
1072*41349Sbostic 	}
1073*41349Sbostic 
1074*41349Sbostic 	/*
1075*41349Sbostic 	 * We get here only if there's trouble -- normally "case END" is
1076*41349Sbostic 	 * the terminating point.
1077*41349Sbostic 	 */
1078*41349Sbostic 	regerror("corrupted pointers");
1079*41349Sbostic 	return(0);
1080*41349Sbostic }
1081*41349Sbostic 
1082*41349Sbostic /*
1083*41349Sbostic  - regrepeat - repeatedly match something simple, report how many
1084*41349Sbostic  */
1085*41349Sbostic static int
1086*41349Sbostic regrepeat(p)
1087*41349Sbostic char *p;
1088*41349Sbostic {
1089*41349Sbostic 	register int count = 0;
1090*41349Sbostic 	register char *scan;
1091*41349Sbostic 	register char *opnd;
1092*41349Sbostic 
1093*41349Sbostic 	scan = reginput;
1094*41349Sbostic 	opnd = OPERAND(p);
1095*41349Sbostic 	switch (OP(p)) {
1096*41349Sbostic 	case ANY:
1097*41349Sbostic 		count = strlen(scan);
1098*41349Sbostic 		scan += count;
1099*41349Sbostic 		break;
1100*41349Sbostic 	case EXACTLY:
1101*41349Sbostic 		while (*opnd == *scan) {
1102*41349Sbostic 			count++;
1103*41349Sbostic 			scan++;
1104*41349Sbostic 		}
1105*41349Sbostic 		break;
1106*41349Sbostic 	case ANYOF:
1107*41349Sbostic 		while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
1108*41349Sbostic 			count++;
1109*41349Sbostic 			scan++;
1110*41349Sbostic 		}
1111*41349Sbostic 		break;
1112*41349Sbostic 	case ANYBUT:
1113*41349Sbostic 		while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
1114*41349Sbostic 			count++;
1115*41349Sbostic 			scan++;
1116*41349Sbostic 		}
1117*41349Sbostic 		break;
1118*41349Sbostic 	default:		/* Oh dear.  Called inappropriately. */
1119*41349Sbostic 		regerror("internal foulup");
1120*41349Sbostic 		count = 0;	/* Best compromise. */
1121*41349Sbostic 		break;
1122*41349Sbostic 	}
1123*41349Sbostic 	reginput = scan;
1124*41349Sbostic 
1125*41349Sbostic 	return(count);
1126*41349Sbostic }
1127*41349Sbostic 
1128*41349Sbostic /*
1129*41349Sbostic  - regnext - dig the "next" pointer out of a node
1130*41349Sbostic  */
1131*41349Sbostic static char *
1132*41349Sbostic regnext(p)
1133*41349Sbostic register char *p;
1134*41349Sbostic {
1135*41349Sbostic 	register int offset;
1136*41349Sbostic 
1137*41349Sbostic 	if (p == &regdummy)
1138*41349Sbostic 		return(NULL);
1139*41349Sbostic 
1140*41349Sbostic 	offset = NEXT(p);
1141*41349Sbostic 	if (offset == 0)
1142*41349Sbostic 		return(NULL);
1143*41349Sbostic 
1144*41349Sbostic 	if (OP(p) == BACK)
1145*41349Sbostic 		return(p-offset);
1146*41349Sbostic 	else
1147*41349Sbostic 		return(p+offset);
1148*41349Sbostic }
1149*41349Sbostic 
1150*41349Sbostic #ifdef DEBUG
1151*41349Sbostic 
1152*41349Sbostic STATIC char *regprop();
1153*41349Sbostic 
1154*41349Sbostic /*
1155*41349Sbostic  - regdump - dump a regexp onto stdout in vaguely comprehensible form
1156*41349Sbostic  */
1157*41349Sbostic void
1158*41349Sbostic regdump(r)
1159*41349Sbostic regexp *r;
1160*41349Sbostic {
1161*41349Sbostic 	register char *s;
1162*41349Sbostic 	register char op = EXACTLY;	/* Arbitrary non-END op. */
1163*41349Sbostic 	register char *next;
1164*41349Sbostic 	extern char *strchr();
1165*41349Sbostic 
1166*41349Sbostic 
1167*41349Sbostic 	s = r->program + 1;
1168*41349Sbostic 	while (op != END) {	/* While that wasn't END last time... */
1169*41349Sbostic 		op = OP(s);
1170*41349Sbostic 		printf("%2d%s", s-r->program, regprop(s));	/* Where, what. */
1171*41349Sbostic 		next = regnext(s);
1172*41349Sbostic 		if (next == NULL)		/* Next ptr. */
1173*41349Sbostic 			printf("(0)");
1174*41349Sbostic 		else
1175*41349Sbostic 			printf("(%d)", (s-r->program)+(next-s));
1176*41349Sbostic 		s += 3;
1177*41349Sbostic 		if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
1178*41349Sbostic 			/* Literal string, where present. */
1179*41349Sbostic 			while (*s != '\0') {
1180*41349Sbostic 				putchar(*s);
1181*41349Sbostic 				s++;
1182*41349Sbostic 			}
1183*41349Sbostic 			s++;
1184*41349Sbostic 		}
1185*41349Sbostic 		putchar('\n');
1186*41349Sbostic 	}
1187*41349Sbostic 
1188*41349Sbostic 	/* Header fields of interest. */
1189*41349Sbostic 	if (r->regstart != '\0')
1190*41349Sbostic 		printf("start `%c' ", r->regstart);
1191*41349Sbostic 	if (r->reganch)
1192*41349Sbostic 		printf("anchored ");
1193*41349Sbostic 	if (r->regmust != NULL)
1194*41349Sbostic 		printf("must have \"%s\"", r->regmust);
1195*41349Sbostic 	printf("\n");
1196*41349Sbostic }
1197*41349Sbostic 
1198*41349Sbostic /*
1199*41349Sbostic  - regprop - printable representation of opcode
1200*41349Sbostic  */
1201*41349Sbostic static char *
1202*41349Sbostic regprop(op)
1203*41349Sbostic char *op;
1204*41349Sbostic {
1205*41349Sbostic 	register char *p;
1206*41349Sbostic 	static char buf[50];
1207*41349Sbostic 
1208*41349Sbostic 	(void) strcpy(buf, ":");
1209*41349Sbostic 
1210*41349Sbostic 	switch (OP(op)) {
1211*41349Sbostic 	case BOL:
1212*41349Sbostic 		p = "BOL";
1213*41349Sbostic 		break;
1214*41349Sbostic 	case EOL:
1215*41349Sbostic 		p = "EOL";
1216*41349Sbostic 		break;
1217*41349Sbostic 	case ANY:
1218*41349Sbostic 		p = "ANY";
1219*41349Sbostic 		break;
1220*41349Sbostic 	case ANYOF:
1221*41349Sbostic 		p = "ANYOF";
1222*41349Sbostic 		break;
1223*41349Sbostic 	case ANYBUT:
1224*41349Sbostic 		p = "ANYBUT";
1225*41349Sbostic 		break;
1226*41349Sbostic 	case BRANCH:
1227*41349Sbostic 		p = "BRANCH";
1228*41349Sbostic 		break;
1229*41349Sbostic 	case EXACTLY:
1230*41349Sbostic 		p = "EXACTLY";
1231*41349Sbostic 		break;
1232*41349Sbostic 	case NOTHING:
1233*41349Sbostic 		p = "NOTHING";
1234*41349Sbostic 		break;
1235*41349Sbostic 	case BACK:
1236*41349Sbostic 		p = "BACK";
1237*41349Sbostic 		break;
1238*41349Sbostic 	case END:
1239*41349Sbostic 		p = "END";
1240*41349Sbostic 		break;
1241*41349Sbostic 	case OPEN+1:
1242*41349Sbostic 	case OPEN+2:
1243*41349Sbostic 	case OPEN+3:
1244*41349Sbostic 	case OPEN+4:
1245*41349Sbostic 	case OPEN+5:
1246*41349Sbostic 	case OPEN+6:
1247*41349Sbostic 	case OPEN+7:
1248*41349Sbostic 	case OPEN+8:
1249*41349Sbostic 	case OPEN+9:
1250*41349Sbostic 		sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1251*41349Sbostic 		p = NULL;
1252*41349Sbostic 		break;
1253*41349Sbostic 	case CLOSE+1:
1254*41349Sbostic 	case CLOSE+2:
1255*41349Sbostic 	case CLOSE+3:
1256*41349Sbostic 	case CLOSE+4:
1257*41349Sbostic 	case CLOSE+5:
1258*41349Sbostic 	case CLOSE+6:
1259*41349Sbostic 	case CLOSE+7:
1260*41349Sbostic 	case CLOSE+8:
1261*41349Sbostic 	case CLOSE+9:
1262*41349Sbostic 		sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1263*41349Sbostic 		p = NULL;
1264*41349Sbostic 		break;
1265*41349Sbostic 	case STAR:
1266*41349Sbostic 		p = "STAR";
1267*41349Sbostic 		break;
1268*41349Sbostic 	case PLUS:
1269*41349Sbostic 		p = "PLUS";
1270*41349Sbostic 		break;
1271*41349Sbostic 	case WORDA:
1272*41349Sbostic 		p = "WORDA";
1273*41349Sbostic 		break;
1274*41349Sbostic 	case WORDZ:
1275*41349Sbostic 		p = "WORDZ";
1276*41349Sbostic 		break;
1277*41349Sbostic 	default:
1278*41349Sbostic 		regerror("corrupted opcode");
1279*41349Sbostic 		break;
1280*41349Sbostic 	}
1281*41349Sbostic 	if (p != NULL)
1282*41349Sbostic 		(void) strcat(buf, p);
1283*41349Sbostic 	return(buf);
1284*41349Sbostic }
1285*41349Sbostic #endif
1286*41349Sbostic 
1287*41349Sbostic /*
1288*41349Sbostic  * The following is provided for those people who do not have strcspn() in
1289*41349Sbostic  * their C libraries.  They should get off their butts and do something
1290*41349Sbostic  * about it; at least one public-domain implementation of those (highly
1291*41349Sbostic  * useful) string routines has been published on Usenet.
1292*41349Sbostic  */
1293*41349Sbostic #ifdef STRCSPN
1294*41349Sbostic /*
1295*41349Sbostic  * strcspn - find length of initial segment of s1 consisting entirely
1296*41349Sbostic  * of characters not from s2
1297*41349Sbostic  */
1298*41349Sbostic 
1299*41349Sbostic static int
1300*41349Sbostic strcspn(s1, s2)
1301*41349Sbostic char *s1;
1302*41349Sbostic char *s2;
1303*41349Sbostic {
1304*41349Sbostic 	register char *scan1;
1305*41349Sbostic 	register char *scan2;
1306*41349Sbostic 	register int count;
1307*41349Sbostic 
1308*41349Sbostic 	count = 0;
1309*41349Sbostic 	for (scan1 = s1; *scan1 != '\0'; scan1++) {
1310*41349Sbostic 		for (scan2 = s2; *scan2 != '\0';)	/* ++ moved down. */
1311*41349Sbostic 			if (*scan1 == *scan2++)
1312*41349Sbostic 				return(count);
1313*41349Sbostic 		count++;
1314*41349Sbostic 	}
1315*41349Sbostic 	return(count);
1316*41349Sbostic }
1317*41349Sbostic #endif
1318