xref: /plan9/sys/src/ape/cmd/pax/regexp.c (revision 9a747e4fd48b9f4522c70c07e8f882a15030f964)
1 /* $Source: /u/mark/src/pax/RCS/regexp.c,v $
2  *
3  * $Revision: 1.2 $
4  *
5  * regexp.c - regular expression matching
6  *
7  * DESCRIPTION
8  *
9  *	Underneath the reformatting and comment blocks which were added to
10  *	make it consistent with the rest of the code, you will find a
11  *	modified version of Henry Specer's regular expression library.
12  *	Henry's functions were modified to provide the minimal regular
13  *	expression matching, as required by P1003.  Henry's code was
14  *	copyrighted, and copy of the copyright message and restrictions
15  *	are provided, verbatim, below:
16  *
17  *	Copyright (c) 1986 by University of Toronto.
18  *	Written by Henry Spencer.  Not derived from licensed software.
19  *
20  *	Permission is granted to anyone to use this software for any
21  *	purpose on any computer system, and to redistribute it freely,
22  *	subject to the following restrictions:
23  *
24  *	1. The author is not responsible for the consequences of use of
25  *         this software, no matter how awful, even if they arise
26  *	   from defects in it.
27  *
28  *	2. The origin of this software must not be misrepresented, either
29  *	   by explicit claim or by omission.
30  *
31  *	3. Altered versions must be plainly marked as such, and must not
32  *	   be misrepresented as being the original software.
33  *
34  * 	Beware that some of this code is subtly aware of the way operator
35  * 	precedence is structured in regular expressions.  Serious changes in
36  * 	regular-expression syntax might require a total rethink.
37  *
38  * AUTHORS
39  *
40  *     Mark H. Colburn, NAPS International (mark@jhereg.mn.org)
41  *     Henry Spencer, University of Torronto (henry@utzoo.edu)
42  *
43  * Sponsored by The USENIX Association for public distribution.
44  *
45  * $Log:	regexp.c,v $
46  * Revision 1.2  89/02/12  10:05:39  mark
47  * 1.2 release fixes
48  *
49  * Revision 1.1  88/12/23  18:02:32  mark
50  * Initial revision
51  *
52  */
53 
54 /* Headers */
55 
56 #include "pax.h"
57 
58 #ifndef lint
59 static char    *Ident = "$Id: regexp.c,v 1.2 89/02/12 10:05:39 mark Exp $";
60 #endif
61 
62 
63 /*
64  * The "internal use only" fields in regexp.h are present to pass info from
65  * compile to execute that permits the execute phase to run lots faster on
66  * simple cases.  They are:
67  *
68  * regstart	char that must begin a match; '\0' if none obvious
69  * reganch	is the match anchored (at beginning-of-line only)?
70  * regmust	string (pointer into program) that match must include, or NULL
71  * regmlen	length of regmust string
72  *
73  * Regstart and reganch permit very fast decisions on suitable starting points
74  * for a match, cutting down the work a lot.  Regmust permits fast rejection
75  * of lines that cannot possibly match.  The regmust tests are costly enough
76  * that regcomp() supplies a regmust only if the r.e. contains something
77  * potentially expensive (at present, the only such thing detected is * or +
78  * at the start of the r.e., which can involve a lot of backup).  Regmlen is
79  * supplied because the test in regexec() needs it and regcomp() is computing
80  * it anyway.
81  */
82 
83 /*
84  * Structure for regexp "program".  This is essentially a linear encoding
85  * of a nondeterministic finite-state machine (aka syntax charts or
86  * "railroad normal form" in parsing technology).  Each node is an opcode
87  * plus a "nxt" pointer, possibly plus an operand.  "Nxt" pointers of
88  * all nodes except BRANCH implement concatenation; a "nxt" pointer with
89  * a BRANCH on both ends of it is connecting two alternatives.  (Here we
90  * have one of the subtle syntax dependencies:  an individual BRANCH (as
91  * opposed to a collection of them) is never concatenated with anything
92  * because of operator precedence.)  The operand of some types of node is
93  * a literal string; for others, it is a node leading into a sub-FSM.  In
94  * particular, the operand of a BRANCH node is the first node of the branch.
95  * (NB this is *not* a tree structure:  the tail of the branch connects
96  * to the thing following the set of BRANCHes.)  The opcodes are:
97  */
98 
99 /* definition	number	opnd?	meaning */
100 #define	END	0		/* no	End of program. */
101 #define	BOL	1		/* no	Match "" at beginning of line. */
102 #define	EOL	2		/* no	Match "" at end of line. */
103 #define	ANY	3		/* no	Match any one character. */
104 #define	ANYOF	4		/* str	Match any character in this string. */
105 #define	ANYBUT	5		/* str	Match any character not in this
106 				 * string. */
107 #define	BRANCH	6		/* node	Match this alternative, or the
108 				 * nxt... */
109 #define	BACK	7		/* no	Match "", "nxt" ptr points backward. */
110 #define	EXACTLY	8		/* str	Match this string. */
111 #define	NOTHING	9		/* no	Match empty string. */
112 #define	STAR	10		/* node	Match this (simple) thing 0 or more
113 				 * times. */
114 #define	OPEN	20		/* no	Mark this point in input as start of
115 				 * #n. */
116  /* OPEN+1 is number 1, etc. */
117 #define	CLOSE	30		/* no	Analogous to OPEN. */
118 
119 /*
120  * Opcode notes:
121  *
122  * BRANCH	The set of branches constituting a single choice are hooked
123  *		together with their "nxt" pointers, since precedence prevents
124  *		anything being concatenated to any individual branch.  The
125  *		"nxt" pointer of the last BRANCH in a choice points to the
126  *		thing following the whole choice.  This is also where the
127  *		final "nxt" pointer of each individual branch points; each
128  *		branch starts with the operand node of a BRANCH node.
129  *
130  * BACK		Normal "nxt" pointers all implicitly point forward; BACK
131  *		exists to make loop structures possible.
132  *
133  * STAR		complex '*', are implemented as circular BRANCH structures
134  *		using BACK.  Simple cases (one character per match) are
135  *		implemented with STAR for speed and to minimize recursive
136  *		plunges.
137  *
138  * OPEN,CLOSE	...are numbered at compile time.
139  */
140 
141 /*
142  * A node is one char of opcode followed by two chars of "nxt" pointer.
143  * "Nxt" pointers are stored as two 8-bit pieces, high order first.  The
144  * value is a positive offset from the opcode of the node containing it.
145  * An operand, if any, simply follows the node.  (Note that much of the
146  * code generation knows about this implicit relationship.)
147  *
148  * Using two bytes for the "nxt" pointer is vast overkill for most things,
149  * but allows patterns to get big without disasters.
150  */
151 #define	OP(p)	(*(p))
152 #define	NEXT(p)	(((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
153 #define	OPERAND(p)	((p) + 3)
154 
155 /*
156  * Utility definitions.
157  */
158 
159 #define	FAIL(m)	{ regerror(m); return(NULL); }
160 #define	ISMULT(c)	((c) == '*')
161 #define	META	"^$.[()|*\\"
162 #ifndef CHARBITS
163 #define	UCHARAT(p)	((int)*(unsigned char *)(p))
164 #else
165 #define	UCHARAT(p)	((int)*(p)&CHARBITS)
166 #endif
167 
168 /*
169  * Flags to be passed up and down.
170  */
171 #define	HASWIDTH	01	/* Known never to match null string. */
172 #define	SIMPLE		02	/* Simple enough to be STAR operand. */
173 #define	SPSTART		04	/* Starts with * */
174 #define	WORST		0	/* Worst case. */
175 
176 /*
177  * Global work variables for regcomp().
178  */
179 static char    *regparse;	/* Input-scan pointer. */
180 static int      regnpar;	/* () count. */
181 static char     regdummy;
182 static char    *regcode;	/* Code-emit pointer; &regdummy = don't. */
183 static long     regsize;	/* Code size. */
184 
185 /*
186  * Forward declarations for regcomp()'s friends.
187  */
188 #ifndef STATIC
189 #define	STATIC	static
190 #endif
191 STATIC char    *reg();
192 STATIC char    *regbranch();
193 STATIC char    *regpiece();
194 STATIC char    *regatom();
195 STATIC char    *regnode();
196 STATIC char    *regnext();
197 STATIC void     regc();
198 STATIC void     reginsert();
199 STATIC void     regtail();
200 STATIC void     regoptail();
201 #ifdef STRCSPN
202 STATIC int      strcspn();
203 #endif
204 
205 /*
206  - regcomp - compile a regular expression into internal code
207  *
208  * We can't allocate space until we know how big the compiled form will be,
209  * but we can't compile it (and thus know how big it is) until we've got a
210  * place to put the code.  So we cheat:  we compile it twice, once with code
211  * generation turned off and size counting turned on, and once "for real".
212  * This also means that we don't allocate space until we are sure that the
213  * thing really will compile successfully, and we never have to move the
214  * code and thus invalidate pointers into it.  (Note that it has to be in
215  * one piece because free() must be able to free it all.)
216  *
217  * Beware that the optimization-preparation code in here knows about some
218  * of the structure of the compiled regexp.
219  */
regcomp(exp)220 regexp *regcomp(exp)
221 char           *exp;
222 {
223     register regexp *r;
224     register char  *scan;
225     register char  *longest;
226     register int    len;
227     int             flags;
228     extern char    *malloc();
229 
230     if (exp == (char *)NULL)
231 	FAIL("NULL argument");
232 
233     /* First pass: determine size, legality. */
234     regparse = exp;
235     regnpar = 1;
236     regsize = 0L;
237     regcode = &regdummy;
238     regc(MAGIC);
239     if (reg(0, &flags) == (char *)NULL)
240 	return ((regexp *)NULL);
241 
242     /* Small enough for pointer-storage convention? */
243     if (regsize >= 32767L)	/* Probably could be 65535L. */
244 	FAIL("regexp too big");
245 
246     /* Allocate space. */
247     r = (regexp *) malloc(sizeof(regexp) + (unsigned) regsize);
248     if (r == (regexp *) NULL)
249 	FAIL("out of space");
250 
251     /* Second pass: emit code. */
252     regparse = exp;
253     regnpar = 1;
254     regcode = r->program;
255     regc(MAGIC);
256     if (reg(0, &flags) == NULL)
257 	return ((regexp *) NULL);
258 
259     /* Dig out information for optimizations. */
260     r->regstart = '\0';		/* Worst-case defaults. */
261     r->reganch = 0;
262     r->regmust = NULL;
263     r->regmlen = 0;
264     scan = r->program + 1;	/* First BRANCH. */
265     if (OP(regnext(scan)) == END) {	/* Only one top-level choice. */
266 	scan = OPERAND(scan);
267 
268 	/* Starting-point info. */
269 	if (OP(scan) == EXACTLY)
270 	    r->regstart = *OPERAND(scan);
271 	else if (OP(scan) == BOL)
272 	    r->reganch++;
273 
274 	/*
275 	 * If there's something expensive in the r.e., find the longest
276 	 * literal string that must appear and make it the regmust.  Resolve
277 	 * ties in favor of later strings, since the regstart check works
278 	 * with the beginning of the r.e. and avoiding duplication
279 	 * strengthens checking.  Not a strong reason, but sufficient in the
280 	 * absence of others.
281 	 */
282 	if (flags & SPSTART) {
283 	    longest = NULL;
284 	    len = 0;
285 	    for (; scan != NULL; scan = regnext(scan))
286 		if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
287 		    longest = OPERAND(scan);
288 		    len = strlen(OPERAND(scan));
289 		}
290 	    r->regmust = longest;
291 	    r->regmlen = len;
292 	}
293     }
294     return (r);
295 }
296 
297 /*
298  - reg - regular expression, i.e. main body or parenthesized thing
299  *
300  * Caller must absorb opening parenthesis.
301  *
302  * Combining parenthesis handling with the base level of regular expression
303  * is a trifle forced, but the need to tie the tails of the branches to what
304  * follows makes it hard to avoid.
305  */
reg(paren,flagp)306 static char *reg(paren, flagp)
307 int             paren;		/* Parenthesized? */
308 int            *flagp;
309 {
310     register char  *ret;
311     register char  *br;
312     register char  *ender;
313     register int    parno;
314     int             flags;
315 
316     *flagp = HASWIDTH;		/* Tentatively. */
317 
318     /* Make an OPEN node, if parenthesized. */
319     if (paren) {
320 	if (regnpar >= NSUBEXP)
321 	    FAIL("too many ()");
322 	parno = regnpar;
323 	regnpar++;
324 	ret = regnode(OPEN + parno);
325     } else
326 	ret = (char *)NULL;
327 
328     /* Pick up the branches, linking them together. */
329     br = regbranch(&flags);
330     if (br == (char *)NULL)
331 	return ((char *)NULL);
332     if (ret != (char *)NULL)
333 	regtail(ret, br);	/* OPEN -> first. */
334     else
335 	ret = br;
336     if (!(flags & HASWIDTH))
337 	*flagp &= ~HASWIDTH;
338     *flagp |= flags & SPSTART;
339     while (*regparse == '|') {
340 	regparse++;
341 	br = regbranch(&flags);
342 	if (br == (char *)NULL)
343 	    return ((char *)NULL);
344 	regtail(ret, br);	/* BRANCH -> BRANCH. */
345 	if (!(flags & HASWIDTH))
346 	    *flagp &= ~HASWIDTH;
347 	*flagp |= flags & SPSTART;
348     }
349 
350     /* Make a closing node, and hook it on the end. */
351     ender = regnode((paren) ? CLOSE + parno : END);
352     regtail(ret, ender);
353 
354     /* Hook the tails of the branches to the closing node. */
355     for (br = ret; br != (char *)NULL; br = regnext(br))
356 	regoptail(br, ender);
357 
358     /* Check for proper termination. */
359     if (paren && *regparse++ != ')') {
360 	FAIL("unmatched ()");
361     } else if (!paren && *regparse != '\0') {
362 	if (*regparse == ')') {
363 	    FAIL("unmatched ()");
364 	} else
365 	    FAIL("junk on end");/* "Can't happen". */
366 	/* NOTREACHED */
367     }
368     return (ret);
369 }
370 
371 /*
372  - regbranch - one alternative of an | operator
373  *
374  * Implements the concatenation operator.
375  */
regbranch(flagp)376 static char  *regbranch(flagp)
377 int            *flagp;
378 {
379     register char  *ret;
380     register char  *chain;
381     register char  *latest;
382     int             flags;
383 
384     *flagp = WORST;		/* Tentatively. */
385 
386     ret = regnode(BRANCH);
387     chain = (char *)NULL;
388     while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
389 	latest = regpiece(&flags);
390 	if (latest == (char *)NULL)
391 	    return ((char *)NULL);
392 	*flagp |= flags & HASWIDTH;
393 	if (chain == (char *)NULL)	/* First piece. */
394 	    *flagp |= flags & SPSTART;
395 	else
396 	    regtail(chain, latest);
397 	chain = latest;
398     }
399     if (chain == (char *)NULL)		/* Loop ran zero times. */
400 	regnode(NOTHING);
401 
402     return (ret);
403 }
404 
405 /*
406  - regpiece - something followed by possible [*]
407  *
408  * Note that the branching code sequence used for * is somewhat optimized:
409  * they use the same NOTHING node as both the endmarker for their branch
410  * list and the body of the last branch.  It might seem that this node could
411  * be dispensed with entirely, but the endmarker role is not redundant.
412  */
regpiece(flagp)413 static char *regpiece(flagp)
414 int            *flagp;
415 {
416     register char  *ret;
417     register char   op;
418     register char  *nxt;
419     int             flags;
420 
421     ret = regatom(&flags);
422     if (ret == (char *)NULL)
423 	return ((char *)NULL);
424 
425     op = *regparse;
426     if (!ISMULT(op)) {
427 	*flagp = flags;
428 	return (ret);
429     }
430     if (!(flags & HASWIDTH))
431 	FAIL("* operand could be empty");
432     *flagp = (WORST | SPSTART);
433 
434     if (op == '*' && (flags & SIMPLE))
435 	reginsert(STAR, ret);
436     else if (op == '*') {
437 	/* Emit x* as (x&|), where & means "self". */
438 	reginsert(BRANCH, ret);	/* Either x */
439 	regoptail(ret, regnode(BACK));	/* and loop */
440 	regoptail(ret, ret);	/* back */
441 	regtail(ret, regnode(BRANCH));	/* or */
442 	regtail(ret, regnode(NOTHING));	/* null. */
443     }
444     regparse++;
445     if (ISMULT(*regparse))
446 	FAIL("nested *");
447 
448     return (ret);
449 }
450 
451 /*
452  - regatom - the lowest level
453  *
454  * Optimization:  gobbles an entire sequence of ordinary characters so that
455  * it can turn them into a single node, which is smaller to store and
456  * faster to run.  Backslashed characters are exceptions, each becoming a
457  * separate node; the code is simpler that way and it's not worth fixing.
458  */
regatom(flagp)459 static char *regatom(flagp)
460 int            *flagp;
461 {
462     register char  *ret;
463     int             flags;
464 
465     *flagp = WORST;		/* Tentatively. */
466 
467     switch (*regparse++) {
468     case '^':
469 	ret = regnode(BOL);
470 	break;
471     case '$':
472 	ret = regnode(EOL);
473 	break;
474     case '.':
475 	ret = regnode(ANY);
476 	*flagp |= HASWIDTH | SIMPLE;
477 	break;
478     case '[':{
479 	    register int    class;
480 	    register int    classend;
481 
482 	    if (*regparse == '^') {	/* Complement of range. */
483 		ret = regnode(ANYBUT);
484 		regparse++;
485 	    } else
486 		ret = regnode(ANYOF);
487 	    if (*regparse == ']' || *regparse == '-')
488 		regc(*regparse++);
489 	    while (*regparse != '\0' && *regparse != ']') {
490 		if (*regparse == '-') {
491 		    regparse++;
492 		    if (*regparse == ']' || *regparse == '\0')
493 			regc('-');
494 		    else {
495 			class = UCHARAT(regparse - 2) + 1;
496 			classend = UCHARAT(regparse);
497 			if (class > classend + 1)
498 			    FAIL("invalid [] range");
499 			for (; class <= classend; class++)
500 			    regc(class);
501 			regparse++;
502 		    }
503 		} else
504 		    regc(*regparse++);
505 	    }
506 	    regc('\0');
507 	    if (*regparse != ']')
508 		FAIL("unmatched []");
509 	    regparse++;
510 	    *flagp |= HASWIDTH | SIMPLE;
511 	}
512 	break;
513     case '(':
514 	ret = reg(1, &flags);
515 	if (ret == (char *)NULL)
516 	    return ((char *)NULL);
517 	*flagp |= flags & (HASWIDTH | SPSTART);
518 	break;
519     case '\0':
520     case '|':
521     case ')':
522 	FAIL("internal urp");	/* Supposed to be caught earlier. */
523 	break;
524     case '*':
525 	FAIL("* follows nothing");
526 	break;
527     case '\\':
528 	if (*regparse == '\0')
529 	    FAIL("trailing \\");
530 	ret = regnode(EXACTLY);
531 	regc(*regparse++);
532 	regc('\0');
533 	*flagp |= HASWIDTH | SIMPLE;
534 	break;
535     default:{
536 	    register int    len;
537 	    register char   ender;
538 
539 	    regparse--;
540 	    len = strcspn(regparse, META);
541 	    if (len <= 0)
542 		FAIL("internal disaster");
543 	    ender = *(regparse + len);
544 	    if (len > 1 && ISMULT(ender))
545 		len--;		/* Back off clear of * operand. */
546 	    *flagp |= HASWIDTH;
547 	    if (len == 1)
548 		*flagp |= SIMPLE;
549 	    ret = regnode(EXACTLY);
550 	    while (len > 0) {
551 		regc(*regparse++);
552 		len--;
553 	    }
554 	    regc('\0');
555 	}
556 	break;
557     }
558 
559     return (ret);
560 }
561 
562 /*
563  - regnode - emit a node
564  */
regnode(op)565 static char *regnode(op)
566 char            op;
567 {
568     register char  *ret;
569     register char  *ptr;
570 
571     ret = regcode;
572     if (ret == &regdummy) {
573 	regsize += 3;
574 	return (ret);
575     }
576     ptr = ret;
577     *ptr++ = op;
578     *ptr++ = '\0';		/* Null "nxt" pointer. */
579     *ptr++ = '\0';
580     regcode = ptr;
581 
582     return (ret);
583 }
584 
585 /*
586  - regc - emit (if appropriate) a byte of code
587  */
regc(b)588 static void regc(b)
589 char            b;
590 {
591     if (regcode != &regdummy)
592 	*regcode++ = b;
593     else
594 	regsize++;
595 }
596 
597 /*
598  - reginsert - insert an operator in front of already-emitted operand
599  *
600  * Means relocating the operand.
601  */
reginsert(op,opnd)602 static void reginsert(op, opnd)
603 char            op;
604 char           *opnd;
605 {
606     register char  *src;
607     register char  *dst;
608     register char  *place;
609 
610     if (regcode == &regdummy) {
611 	regsize += 3;
612 	return;
613     }
614     src = regcode;
615     regcode += 3;
616     dst = regcode;
617     while (src > opnd)
618 	*--dst = *--src;
619 
620     place = opnd;		/* Op node, where operand used to be. */
621     *place++ = op;
622     *place++ = '\0';
623     *place++ = '\0';
624 }
625 
626 /*
627  - regtail - set the next-pointer at the end of a node chain
628  */
regtail(p,val)629 static void regtail(p, val)
630 char           *p;
631 char           *val;
632 {
633     register char  *scan;
634     register char  *temp;
635     register int    offset;
636 
637     if (p == &regdummy)
638 	return;
639 
640     /* Find last node. */
641     scan = p;
642     for (;;) {
643 	temp = regnext(scan);
644 	if (temp == (char *)NULL)
645 	    break;
646 	scan = temp;
647     }
648 
649     if (OP(scan) == BACK)
650 	offset = scan - val;
651     else
652 	offset = val - scan;
653     *(scan + 1) = (offset >> 8) & 0377;
654     *(scan + 2) = offset & 0377;
655 }
656 
657 /*
658  - regoptail - regtail on operand of first argument; nop if operandless
659  */
regoptail(p,val)660 static void regoptail(p, val)
661 char           *p;
662 char           *val;
663 {
664     /* "Operandless" and "op != BRANCH" are synonymous in practice. */
665     if (p == (char *)NULL || p == &regdummy || OP(p) != BRANCH)
666 	return;
667     regtail(OPERAND(p), val);
668 }
669 
670 /*
671  * regexec and friends
672  */
673 
674 /*
675  * Global work variables for regexec().
676  */
677 static char    *reginput;	/* String-input pointer. */
678 static char    *regbol;		/* Beginning of input, for ^ check. */
679 static char   **regstartp;	/* Pointer to startp array. */
680 static char   **regendp;	/* Ditto for endp. */
681 
682 /*
683  * Forwards.
684  */
685 STATIC int      regtry();
686 STATIC int      regmatch();
687 STATIC int      regrepeat();
688 
689 #ifdef DEBUG
690 int             regnarrate = 0;
691 void            regdump();
692 STATIC char    *regprop();
693 #endif
694 
695 /*
696  - regexec - match a regexp against a string
697  */
regexec(prog,string)698 int regexec(prog, string)
699 register regexp *prog;
700 register char  *string;
701 {
702     register char  *s;
703 
704     /* Be paranoid... */
705     if (prog == (regexp *)NULL || string == (char *)NULL) {
706 	regerror("NULL parameter");
707 	return (0);
708     }
709     /* Check validity of program. */
710     if (UCHARAT(prog->program) != MAGIC) {
711 	regerror("corrupted program");
712 	return (0);
713     }
714     /* If there is a "must appear" string, look for it. */
715     if (prog->regmust != (char *)NULL) {
716 	s = string;
717 	while ((s = strchr(s, prog->regmust[0])) != (char *)NULL) {
718 	    if (strncmp(s, prog->regmust, prog->regmlen) == 0)
719 		break;		/* Found it. */
720 	    s++;
721 	}
722 	if (s == (char *)NULL)		/* Not present. */
723 	    return (0);
724     }
725     /* Mark beginning of line for ^ . */
726     regbol = string;
727 
728     /* Simplest case:  anchored match need be tried only once. */
729     if (prog->reganch)
730 	return (regtry(prog, string));
731 
732     /* Messy cases:  unanchored match. */
733     s = string;
734     if (prog->regstart != '\0')
735 	/* We know what char it must start with. */
736 	while ((s = strchr(s, prog->regstart)) != (char *)NULL) {
737 	    if (regtry(prog, s))
738 		return (1);
739 	    s++;
740 	}
741     else
742 	/* We don't -- general case. */
743 	do {
744 	    if (regtry(prog, s))
745 		return (1);
746 	} while (*s++ != '\0');
747 
748     /* Failure. */
749     return (0);
750 }
751 
752 /*
753  - regtry - try match at specific point
754  */
755 #ifdef __STDC__
756 
regtry(regexp * prog,char * string)757 static int regtry(regexp *prog, char *string)
758 
759 #else
760 
761 static int regtry(prog, string)
762 regexp         *prog;
763 char           *string;
764 
765 #endif
766 {
767     register int    i;
768     register char **sp;
769     register char **ep;
770 
771     reginput = string;
772     regstartp = prog->startp;
773     regendp = prog->endp;
774 
775     sp = prog->startp;
776     ep = prog->endp;
777     for (i = NSUBEXP; i > 0; i--) {
778 	*sp++ = (char *)NULL;
779 	*ep++ = (char *)NULL;
780     }
781     if (regmatch(prog->program + 1)) {
782 	prog->startp[0] = string;
783 	prog->endp[0] = reginput;
784 	return (1);
785     } else
786 	return (0);
787 }
788 
789 /*
790  - regmatch - main matching routine
791  *
792  * Conceptually the strategy is simple:  check to see whether the current
793  * node matches, call self recursively to see whether the rest matches,
794  * and then act accordingly.  In practice we make some effort to avoid
795  * recursion, in particular by going through "ordinary" nodes (that don't
796  * need to know whether the rest of the match failed) by a loop instead of
797  * by recursion.
798  */
799 #ifdef __STDC__
800 
regmatch(char * prog)801 static int regmatch(char *prog)
802 
803 #else
804 
805 static int regmatch(prog)
806 char           *prog;
807 
808 #endif
809 {
810     register char  *scan;	/* Current node. */
811     char           *nxt;	/* nxt node. */
812 
813     scan = prog;
814 #ifdef DEBUG
815     if (scan != (char *)NULL && regnarrate)
816 	fprintf(stderr, "%s(\n", regprop(scan));
817 #endif
818     while (scan != (char *)NULL) {
819 #ifdef DEBUG
820 	if (regnarrate)
821 	    fprintf(stderr, "%s...\n", regprop(scan));
822 #endif
823 	nxt = regnext(scan);
824 
825 	switch (OP(scan)) {
826 	case BOL:
827 	    if (reginput != regbol)
828 		return (0);
829 	    break;
830 	case EOL:
831 	    if (*reginput != '\0')
832 		return (0);
833 	    break;
834 	case ANY:
835 	    if (*reginput == '\0')
836 		return (0);
837 	    reginput++;
838 	    break;
839 	case EXACTLY:{
840 		register int    len;
841 		register char  *opnd;
842 
843 		opnd = OPERAND(scan);
844 		/* Inline the first character, for speed. */
845 		if (*opnd != *reginput)
846 		    return (0);
847 		len = strlen(opnd);
848 		if (len > 1 && strncmp(opnd, reginput, len) != 0)
849 		    return (0);
850 		reginput += len;
851 	    }
852 	    break;
853 	case ANYOF:
854 	    if (*reginput == '\0' ||
855 		 strchr(OPERAND(scan), *reginput) == (char *)NULL)
856 		return (0);
857 	    reginput++;
858 	    break;
859 	case ANYBUT:
860 	    if (*reginput == '\0' ||
861 		 strchr(OPERAND(scan), *reginput) != (char *)NULL)
862 		return (0);
863 	    reginput++;
864 	    break;
865 	case NOTHING:
866 	    break;
867 	case BACK:
868 	    break;
869 	case OPEN + 1:
870 	case OPEN + 2:
871 	case OPEN + 3:
872 	case OPEN + 4:
873 	case OPEN + 5:
874 	case OPEN + 6:
875 	case OPEN + 7:
876 	case OPEN + 8:
877 	case OPEN + 9:{
878 		register int    no;
879 		register char  *save;
880 
881 		no = OP(scan) - OPEN;
882 		save = reginput;
883 
884 		if (regmatch(nxt)) {
885 		    /*
886 		     * Don't set startp if some later invocation of the same
887 		     * parentheses already has.
888 		     */
889 		    if (regstartp[no] == (char *)NULL)
890 			regstartp[no] = save;
891 		    return (1);
892 		} else
893 		    return (0);
894 	    }
895 	    break;
896 	case CLOSE + 1:
897 	case CLOSE + 2:
898 	case CLOSE + 3:
899 	case CLOSE + 4:
900 	case CLOSE + 5:
901 	case CLOSE + 6:
902 	case CLOSE + 7:
903 	case CLOSE + 8:
904 	case CLOSE + 9:{
905 		register int    no;
906 		register char  *save;
907 
908 		no = OP(scan) - CLOSE;
909 		save = reginput;
910 
911 		if (regmatch(nxt)) {
912 		    /*
913 		     * Don't set endp if some later invocation of the same
914 		     * parentheses already has.
915 		     */
916 		    if (regendp[no] == (char *)NULL)
917 			regendp[no] = save;
918 		    return (1);
919 		} else
920 		    return (0);
921 	    }
922 	    break;
923 	case BRANCH:{
924 		register char  *save;
925 
926 		if (OP(nxt) != BRANCH)	/* No choice. */
927 		    nxt = OPERAND(scan);	/* Avoid recursion. */
928 		else {
929 		    do {
930 			save = reginput;
931 			if (regmatch(OPERAND(scan)))
932 			    return (1);
933 			reginput = save;
934 			scan = regnext(scan);
935 		    } while (scan != (char *)NULL && OP(scan) == BRANCH);
936 		    return (0);
937 		    /* NOTREACHED */
938 		}
939 	    }
940 	    break;
941 	case STAR:{
942 		register char   nextch;
943 		register int    no;
944 		register char  *save;
945 		register int    minimum;
946 
947 		/*
948 		 * Lookahead to avoid useless match attempts when we know
949 		 * what character comes next.
950 		 */
951 		nextch = '\0';
952 		if (OP(nxt) == EXACTLY)
953 		    nextch = *OPERAND(nxt);
954 		minimum = (OP(scan) == STAR) ? 0 : 1;
955 		save = reginput;
956 		no = regrepeat(OPERAND(scan));
957 		while (no >= minimum) {
958 		    /* If it could work, try it. */
959 		    if (nextch == '\0' || *reginput == nextch)
960 			if (regmatch(nxt))
961 			    return (1);
962 		    /* Couldn't or didn't -- back up. */
963 		    no--;
964 		    reginput = save + no;
965 		}
966 		return (0);
967 	    }
968 	    break;
969 	case END:
970 	    return (1);		/* Success! */
971 	    break;
972 	default:
973 	    regerror("memory corruption");
974 	    return (0);
975 	    break;
976 	}
977 
978 	scan = nxt;
979     }
980 
981     /*
982      * We get here only if there's trouble -- normally "case END" is the
983      * terminating point.
984      */
985     regerror("corrupted pointers");
986     return (0);
987 }
988 
989 /*
990  - regrepeat - repeatedly match something simple, report how many
991  */
992 #ifdef __STDC__
993 
regrepeat(char * p)994 static int regrepeat(char *p)
995 
996 #else
997 
998 static int regrepeat(p)
999 char           *p;
1000 
1001 #endif
1002 {
1003     register int    count = 0;
1004     register char  *scan;
1005     register char  *opnd;
1006 
1007     scan = reginput;
1008     opnd = OPERAND(p);
1009     switch (OP(p)) {
1010     case ANY:
1011 	count = strlen(scan);
1012 	scan += count;
1013 	break;
1014     case EXACTLY:
1015 	while (*opnd == *scan) {
1016 	    count++;
1017 	    scan++;
1018 	}
1019 	break;
1020     case ANYOF:
1021 	while (*scan != '\0' && strchr(opnd, *scan) != (char *)NULL) {
1022 	    count++;
1023 	    scan++;
1024 	}
1025 	break;
1026     case ANYBUT:
1027 	while (*scan != '\0' && strchr(opnd, *scan) == (char *)NULL) {
1028 	    count++;
1029 	    scan++;
1030 	}
1031 	break;
1032     default:			/* Oh dear.  Called inappropriately. */
1033 	regerror("internal foulup");
1034 	count = 0;		/* Best compromise. */
1035 	break;
1036     }
1037     reginput = scan;
1038 
1039     return (count);
1040 }
1041 
1042 
1043 /*
1044  - regnext - dig the "nxt" pointer out of a node
1045  */
1046 #ifdef __STDC__
1047 
regnext(register char * p)1048 static char *regnext(register char *p)
1049 
1050 #else
1051 
1052 static char *regnext(p)
1053 register char  *p;
1054 
1055 #endif
1056 {
1057     register int    offset;
1058 
1059     if (p == &regdummy)
1060 	return ((char *)NULL);
1061 
1062     offset = NEXT(p);
1063     if (offset == 0)
1064 	return ((char *)NULL);
1065 
1066     if (OP(p) == BACK)
1067 	return (p - offset);
1068     else
1069 	return (p + offset);
1070 }
1071 
1072 #ifdef DEBUG
1073 
1074 STATIC char    *regprop();
1075 
1076 /*
1077  - regdump - dump a regexp onto stdout in vaguely comprehensible form
1078  */
1079 #ifdef __STDC__
1080 
regdump(regexp * r)1081 void regdump(regexp *r)
1082 
1083 #else
1084 
1085 void regdump(r)
1086 regexp         *r;
1087 
1088 #endif
1089 {
1090     register char  *s;
1091     register char   op = EXACTLY;	/* Arbitrary non-END op. */
1092     register char  *nxt;
1093     extern char    *strchr();
1094 
1095 
1096     s = r->program + 1;
1097     while (op != END) {		/* While that wasn't END last time... */
1098 	op = OP(s);
1099 	printf("%2d%s", s - r->program, regprop(s));	/* Where, what. */
1100 	nxt = regnext(s);
1101 	if (nxt == (char *)NULL)	/* nxt ptr. */
1102 	    printf("(0)");
1103 	else
1104 	    printf("(%d)", (s - r->program) + (nxt - s));
1105 	s += 3;
1106 	if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
1107 	    /* Literal string, where present. */
1108 	    while (*s != '\0') {
1109 		putchar(*s);
1110 		s++;
1111 	    }
1112 	    s++;
1113 	}
1114 	putchar('\n');
1115     }
1116 
1117     /* Header fields of interest. */
1118     if (r->regstart != '\0')
1119 	printf("start `%c' ", r->regstart);
1120     if (r->reganch)
1121 	printf("anchored ");
1122     if (r->regmust != (char *)NULL)
1123 	printf("must have \"%s\"", r->regmust);
1124     printf("\n");
1125 }
1126 
1127 /*
1128  - regprop - printable representation of opcode
1129  */
1130 #ifdef __STDC__
1131 
regprop(char * op)1132 static char *regprop(char *op)
1133 
1134 #else
1135 
1136 static char *regprop(op)
1137 char           *op;
1138 
1139 #endif
1140 {
1141     register char  *p;
1142     static char     buf[50];
1143 
1144     strcpy(buf, ":");
1145 
1146     switch (OP(op)) {
1147     case BOL:
1148 	p = "BOL";
1149 	break;
1150     case EOL:
1151 	p = "EOL";
1152 	break;
1153     case ANY:
1154 	p = "ANY";
1155 	break;
1156     case ANYOF:
1157 	p = "ANYOF";
1158 	break;
1159     case ANYBUT:
1160 	p = "ANYBUT";
1161 	break;
1162     case BRANCH:
1163 	p = "BRANCH";
1164 	break;
1165     case EXACTLY:
1166 	p = "EXACTLY";
1167 	break;
1168     case NOTHING:
1169 	p = "NOTHING";
1170 	break;
1171     case BACK:
1172 	p = "BACK";
1173 	break;
1174     case END:
1175 	p = "END";
1176 	break;
1177     case OPEN + 1:
1178     case OPEN + 2:
1179     case OPEN + 3:
1180     case OPEN + 4:
1181     case OPEN + 5:
1182     case OPEN + 6:
1183     case OPEN + 7:
1184     case OPEN + 8:
1185     case OPEN + 9:
1186 	sprintf(buf + strlen(buf), "OPEN%d", OP(op) - OPEN);
1187 	p = (char *)NULL;
1188 	break;
1189     case CLOSE + 1:
1190     case CLOSE + 2:
1191     case CLOSE + 3:
1192     case CLOSE + 4:
1193     case CLOSE + 5:
1194     case CLOSE + 6:
1195     case CLOSE + 7:
1196     case CLOSE + 8:
1197     case CLOSE + 9:
1198 	sprintf(buf + strlen(buf), "CLOSE%d", OP(op) - CLOSE);
1199 	p = (char *)NULL;
1200 	break;
1201     case STAR:
1202 	p = "STAR";
1203 	break;
1204     default:
1205 	regerror("corrupted opcode");
1206 	break;
1207     }
1208     if (p != (char *)NULL)
1209 	strcat(buf, p);
1210     return (buf);
1211 }
1212 #endif
1213 
1214 /*
1215  * The following is provided for those people who do not have strcspn() in
1216  * their C libraries.  They should get off their butts and do something
1217  * about it; at least one public-domain implementation of those (highly
1218  * useful) string routines has been published on Usenet.
1219  */
1220 #ifdef STRCSPN
1221 /*
1222  * strcspn - find length of initial segment of s1 consisting entirely
1223  * of characters not from s2
1224  */
1225 
1226 #ifdef __STDC__
1227 
strcspn(char * s1,char * s2)1228 static int strcspn(char *s1, char *s2)
1229 
1230 #else
1231 
1232 static int strcspn(s1, s2)
1233 char           *s1;
1234 char           *s2;
1235 
1236 #endif
1237 {
1238     register char  *scan1;
1239     register char  *scan2;
1240     register int    count;
1241 
1242     count = 0;
1243     for (scan1 = s1; *scan1 != '\0'; scan1++) {
1244 	for (scan2 = s2; *scan2 != '\0';)	/* ++ moved down. */
1245 	    if (*scan1 == *scan2++)
1246 		return (count);
1247 	count++;
1248     }
1249     return (count);
1250 }
1251 #endif
1252 
1253 
1254 /*
1255  - regsub - perform substitutions after a regexp match
1256  */
1257 #ifdef __STDC__
1258 
regsub(regexp * prog,char * source,char * dest)1259 void regsub(regexp *prog, char *source, char *dest)
1260 
1261 #else
1262 
1263 void regsub(prog, source, dest)
1264 regexp         *prog;
1265 char           *source;
1266 char           *dest;
1267 
1268 #endif
1269 {
1270     register char  *src;
1271     register char  *dst;
1272     register char   c;
1273     register int    no;
1274     register int    len;
1275     extern char    *strncpy();
1276 
1277     if (prog == (regexp *)NULL ||
1278 	source == (char *)NULL || dest == (char *)NULL) {
1279 	regerror("NULL parm to regsub");
1280 	return;
1281     }
1282     if (UCHARAT(prog->program) != MAGIC) {
1283 	regerror("damaged regexp fed to regsub");
1284 	return;
1285     }
1286     src = source;
1287     dst = dest;
1288     while ((c = *src++) != '\0') {
1289 	if (c == '&')
1290 	    no = 0;
1291 	else if (c == '\\' && '0' <= *src && *src <= '9')
1292 	    no = *src++ - '0';
1293 	else
1294 	    no = -1;
1295 
1296 	if (no < 0) {		/* Ordinary character. */
1297 	    if (c == '\\' && (*src == '\\' || *src == '&'))
1298 		c = *src++;
1299 	    *dst++ = c;
1300 	} else if (prog->startp[no] != (char *)NULL &&
1301 		   prog->endp[no] != (char *)NULL) {
1302 	    len = prog->endp[no] - prog->startp[no];
1303 	    strncpy(dst, prog->startp[no], len);
1304 	    dst += len;
1305 	    if (len != 0 && *(dst - 1) == '\0') {	/* strncpy hit NUL. */
1306 		regerror("damaged match string");
1307 		return;
1308 	    }
1309 	}
1310     }
1311     *dst++ = '\0';
1312 }
1313 
1314 
1315 #ifdef __STDC__
1316 
regerror(char * s)1317 void regerror(char *s)
1318 
1319 #else
1320 
1321 void regerror(s)
1322 char           *s;
1323 
1324 #endif
1325 {
1326     fprintf(stderr, "regexp(3): %s", s);
1327     exit(1);
1328 }
1329