1*838f5788Ssimonb /* $NetBSD: regexp.c,v 1.4 2023/10/06 05:49:49 simonb Exp $ */
220006a0bStron
320006a0bStron /*
420006a0bStron * regcomp and regexec -- regsub and regerror are elsewhere
520006a0bStron *
620006a0bStron * Copyright (c) 1986 by University of Toronto.
720006a0bStron * Written by Henry Spencer. Not derived from licensed software.
820006a0bStron *
920006a0bStron * Permission is granted to anyone to use this software for any
1020006a0bStron * purpose on any computer system, and to redistribute it freely,
1120006a0bStron * subject to the following restrictions:
1220006a0bStron *
1320006a0bStron * 1. The author is not responsible for the consequences of use of
1420006a0bStron * this software, no matter how awful, even if they arise
1520006a0bStron * from defects in it.
1620006a0bStron *
1720006a0bStron * 2. The origin of this software must not be misrepresented, either
1820006a0bStron * by explicit claim or by omission.
1920006a0bStron *
2020006a0bStron * 3. Altered versions must be plainly marked as such, and must not
2120006a0bStron * be misrepresented as being the original software.
2220006a0bStron *
2320006a0bStron * Beware that some of this code is subtly aware of the way operator
2420006a0bStron * precedence is structured in regular expressions. Serious changes in
2520006a0bStron * regular-expression syntax might require a total rethink.
2620006a0bStron *
2720006a0bStron * *** NOTE: this code has been altered slightly for use in Tcl. ***
2820006a0bStron * Slightly modified by David MacKenzie to undo most of the changes for TCL.
2920006a0bStron * Added regexec2 with notbol parameter. -- 4/19/99 Mark Nudelman
3020006a0bStron */
3120006a0bStron
3220006a0bStron #include "less.h"
3320006a0bStron #if HAVE_STDIO_H
3420006a0bStron #include <stdio.h>
3520006a0bStron #endif
3620006a0bStron #if HAVE_STDLIB_H
3720006a0bStron #include <stdlib.h>
3820006a0bStron #endif
3920006a0bStron #if HAVE_STRING_H
4020006a0bStron #include <string.h>
4120006a0bStron #endif
4220006a0bStron #include "regexp.h"
4320006a0bStron
4420006a0bStron /*
4520006a0bStron * The "internal use only" fields in regexp.h are present to pass info from
4620006a0bStron * compile to execute that permits the execute phase to run lots faster on
4720006a0bStron * simple cases. They are:
4820006a0bStron *
4920006a0bStron * regstart char that must begin a match; '\0' if none obvious
5020006a0bStron * reganch is the match anchored (at beginning-of-line only)?
5120006a0bStron * regmust string (pointer into program) that match must include, or NULL
5220006a0bStron * regmlen length of regmust string
5320006a0bStron *
5420006a0bStron * Regstart and reganch permit very fast decisions on suitable starting points
5520006a0bStron * for a match, cutting down the work a lot. Regmust permits fast rejection
5620006a0bStron * of lines that cannot possibly match. The regmust tests are costly enough
5720006a0bStron * that regcomp() supplies a regmust only if the r.e. contains something
5820006a0bStron * potentially expensive (at present, the only such thing detected is * or +
5920006a0bStron * at the start of the r.e., which can involve a lot of backup). Regmlen is
6020006a0bStron * supplied because the test in regexec() needs it and regcomp() is
6120006a0bStron * computing it anyway.
6220006a0bStron */
6320006a0bStron
6420006a0bStron /*
6520006a0bStron * Structure for regexp "program". This is essentially a linear encoding
6620006a0bStron * of a nondeterministic finite-state machine (aka syntax charts or
6720006a0bStron * "railroad normal form" in parsing technology). Each node is an opcode
6820006a0bStron * plus a "next" pointer, possibly plus an operand. "Next" pointers of
6920006a0bStron * all nodes except BRANCH implement concatenation; a "next" pointer with
7020006a0bStron * a BRANCH on both ends of it is connecting two alternatives. (Here we
7120006a0bStron * have one of the subtle syntax dependencies: an individual BRANCH (as
7220006a0bStron * opposed to a collection of them) is never concatenated with anything
7320006a0bStron * because of operator precedence.) The operand of some types of node is
7420006a0bStron * a literal string; for others, it is a node leading into a sub-FSM. In
7520006a0bStron * particular, the operand of a BRANCH node is the first node of the branch.
7620006a0bStron * (NB this is *not* a tree structure: the tail of the branch connects
7720006a0bStron * to the thing following the set of BRANCHes.) The opcodes are:
7820006a0bStron */
7920006a0bStron
8020006a0bStron /* definition number opnd? meaning */
8120006a0bStron #undef EOL
8220006a0bStron #define END 0 /* no End of program. */
8320006a0bStron #define BOL 1 /* no Match "" at beginning of line. */
8420006a0bStron #define EOL 2 /* no Match "" at end of line. */
8520006a0bStron #define ANY 3 /* no Match any one character. */
8620006a0bStron #define ANYOF 4 /* str Match any character in this string. */
8720006a0bStron #define ANYBUT 5 /* str Match any character not in this string. */
8820006a0bStron #define BRANCH 6 /* node Match this alternative, or the next... */
8920006a0bStron #define BACK 7 /* no Match "", "next" ptr points backward. */
9020006a0bStron #define EXACTLY 8 /* str Match this string. */
9120006a0bStron #define NOTHING 9 /* no Match empty string. */
9220006a0bStron #define STAR 10 /* node Match this (simple) thing 0 or more times. */
9320006a0bStron #define PLUS 11 /* node Match this (simple) thing 1 or more times. */
9420006a0bStron #define OPEN 20 /* no Mark this point in input as start of #n. */
9520006a0bStron /* OPEN+1 is number 1, etc. */
9620006a0bStron #define CLOSE 30 /* no Analogous to OPEN. */
9720006a0bStron
9820006a0bStron /*
9920006a0bStron * Opcode notes:
10020006a0bStron *
10120006a0bStron * BRANCH The set of branches constituting a single choice are hooked
10220006a0bStron * together with their "next" pointers, since precedence prevents
10320006a0bStron * anything being concatenated to any individual branch. The
10420006a0bStron * "next" pointer of the last BRANCH in a choice points to the
10520006a0bStron * thing following the whole choice. This is also where the
10620006a0bStron * final "next" pointer of each individual branch points; each
10720006a0bStron * branch starts with the operand node of a BRANCH node.
10820006a0bStron *
10920006a0bStron * BACK Normal "next" pointers all implicitly point forward; BACK
11020006a0bStron * exists to make loop structures possible.
11120006a0bStron *
11220006a0bStron * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
11320006a0bStron * BRANCH structures using BACK. Simple cases (one character
11420006a0bStron * per match) are implemented with STAR and PLUS for speed
11520006a0bStron * and to minimize recursive plunges.
11620006a0bStron *
11720006a0bStron * OPEN,CLOSE ...are numbered at compile time.
11820006a0bStron */
11920006a0bStron
12020006a0bStron /*
12120006a0bStron * A node is one char of opcode followed by two chars of "next" pointer.
12220006a0bStron * "Next" pointers are stored as two 8-bit pieces, high order first. The
12320006a0bStron * value is a positive offset from the opcode of the node containing it.
12420006a0bStron * An operand, if any, simply follows the node. (Note that much of the
12520006a0bStron * code generation knows about this implicit relationship.)
12620006a0bStron *
12720006a0bStron * Using two bytes for the "next" pointer is vast overkill for most things,
12820006a0bStron * but allows patterns to get big without disasters.
12920006a0bStron */
13020006a0bStron #define OP(p) (*(p))
13120006a0bStron #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
13220006a0bStron #define OPERAND(p) ((p) + 3)
13320006a0bStron
13420006a0bStron /*
13520006a0bStron * See regmagic.h for one further detail of program structure.
13620006a0bStron */
13720006a0bStron
13820006a0bStron
13920006a0bStron /*
14020006a0bStron * Utility definitions.
14120006a0bStron */
14220006a0bStron #ifndef CHARBITS
14320006a0bStron #define UCHARAT(p) ((int)*(unsigned char *)(p))
14420006a0bStron #else
14520006a0bStron #define UCHARAT(p) ((int)*(p)&CHARBITS)
14620006a0bStron #endif
14720006a0bStron
14820006a0bStron #define FAIL(m) { regerror(m); return(NULL); }
14920006a0bStron #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
15020006a0bStron #define META "^$.[()|?+*\\"
15120006a0bStron
15220006a0bStron /*
15320006a0bStron * Flags to be passed up and down.
15420006a0bStron */
15520006a0bStron #define HASWIDTH 01 /* Known never to match null string. */
15620006a0bStron #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
15720006a0bStron #define SPSTART 04 /* Starts with * or +. */
15820006a0bStron #define WORST 0 /* Worst case. */
15920006a0bStron
16020006a0bStron /*
16120006a0bStron * Global work variables for regcomp().
16220006a0bStron */
16320006a0bStron static char *regparse; /* Input-scan pointer. */
16420006a0bStron static int regnpar; /* () count. */
16520006a0bStron static char regdummy;
16620006a0bStron static char *regcode; /* Code-emit pointer; ®dummy = don't. */
16720006a0bStron static long regsize; /* Code size. */
16820006a0bStron
16920006a0bStron /*
17020006a0bStron * The first byte of the regexp internal "program" is actually this magic
17120006a0bStron * number; the start node begins in the second byte.
17220006a0bStron */
17320006a0bStron #define MAGIC 0234
17420006a0bStron
17520006a0bStron
17620006a0bStron /*
17720006a0bStron * Forward declarations for regcomp()'s friends.
17820006a0bStron */
17920006a0bStron #ifndef STATIC
18020006a0bStron #define STATIC static
18120006a0bStron #endif
18220006a0bStron STATIC char *reg();
18320006a0bStron STATIC char *regbranch();
18420006a0bStron STATIC char *regpiece();
18520006a0bStron STATIC char *regatom();
18620006a0bStron STATIC char *regnode();
18720006a0bStron STATIC char *regnext();
18820006a0bStron STATIC void regc();
18920006a0bStron STATIC void reginsert();
19020006a0bStron STATIC void regtail();
19120006a0bStron STATIC void regoptail();
19220006a0bStron #ifdef STRCSPN
19320006a0bStron STATIC int strcspn();
19420006a0bStron #endif
19520006a0bStron
19620006a0bStron /*
19720006a0bStron - regcomp - compile a regular expression into internal code
19820006a0bStron *
19920006a0bStron * We can't allocate space until we know how big the compiled form will be,
20020006a0bStron * but we can't compile it (and thus know how big it is) until we've got a
20120006a0bStron * place to put the code. So we cheat: we compile it twice, once with code
20220006a0bStron * generation turned off and size counting turned on, and once "for real".
20320006a0bStron * This also means that we don't allocate space until we are sure that the
20420006a0bStron * thing really will compile successfully, and we never have to move the
20520006a0bStron * code and thus invalidate pointers into it. (Note that it has to be in
20620006a0bStron * one piece because free() must be able to free it all.)
20720006a0bStron *
20820006a0bStron * Beware that the optimization-preparation code in here knows about some
20920006a0bStron * of the structure of the compiled regexp.
21020006a0bStron */
21120006a0bStron regexp *
regcomp(exp)21220006a0bStron regcomp(exp)
21320006a0bStron char *exp;
21420006a0bStron {
21520006a0bStron register regexp *r;
21620006a0bStron register char *scan;
21720006a0bStron register char *longest;
21820006a0bStron register int len;
21920006a0bStron int flags;
22020006a0bStron
22120006a0bStron if (exp == NULL)
22220006a0bStron FAIL("NULL argument");
22320006a0bStron
22420006a0bStron /* First pass: determine size, legality. */
22520006a0bStron regparse = exp;
22620006a0bStron regnpar = 1;
22720006a0bStron regsize = 0L;
22820006a0bStron regcode = ®dummy;
22920006a0bStron regc(MAGIC);
23020006a0bStron if (reg(0, &flags) == NULL)
23120006a0bStron return(NULL);
23220006a0bStron
23320006a0bStron /* Small enough for pointer-storage convention? */
23420006a0bStron if (regsize >= 32767L) /* Probably could be 65535L. */
23520006a0bStron FAIL("regexp too big");
23620006a0bStron
23720006a0bStron /* Allocate space. */
23820006a0bStron r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize);
23920006a0bStron if (r == NULL)
24020006a0bStron FAIL("out of space");
24120006a0bStron
24220006a0bStron /* Second pass: emit code. */
24320006a0bStron regparse = exp;
24420006a0bStron regnpar = 1;
24520006a0bStron regcode = r->program;
24620006a0bStron regc(MAGIC);
24720006a0bStron if (reg(0, &flags) == NULL)
248*838f5788Ssimonb {
249*838f5788Ssimonb free(r);
25020006a0bStron return(NULL);
251*838f5788Ssimonb }
25220006a0bStron
25320006a0bStron /* Dig out information for optimizations. */
25420006a0bStron r->regstart = '\0'; /* Worst-case defaults. */
25520006a0bStron r->reganch = 0;
25620006a0bStron r->regmust = NULL;
25720006a0bStron r->regmlen = 0;
25820006a0bStron scan = r->program+1; /* First BRANCH. */
25920006a0bStron if (OP(regnext(scan)) == END) { /* Only one top-level choice. */
26020006a0bStron scan = OPERAND(scan);
26120006a0bStron
26220006a0bStron /* Starting-point info. */
26320006a0bStron if (OP(scan) == EXACTLY)
26420006a0bStron r->regstart = *OPERAND(scan);
26520006a0bStron else if (OP(scan) == BOL)
26620006a0bStron r->reganch++;
26720006a0bStron
26820006a0bStron /*
26920006a0bStron * If there's something expensive in the r.e., find the
27020006a0bStron * longest literal string that must appear and make it the
27120006a0bStron * regmust. Resolve ties in favor of later strings, since
27220006a0bStron * the regstart check works with the beginning of the r.e.
27320006a0bStron * and avoiding duplication strengthens checking. Not a
27420006a0bStron * strong reason, but sufficient in the absence of others.
27520006a0bStron */
27620006a0bStron if (flags&SPSTART) {
27720006a0bStron longest = NULL;
27820006a0bStron len = 0;
27920006a0bStron for (; scan != NULL; scan = regnext(scan))
28020006a0bStron if (OP(scan) == EXACTLY && ((int) strlen(OPERAND(scan))) >= len) {
28120006a0bStron longest = OPERAND(scan);
282*838f5788Ssimonb len = (int) strlen(OPERAND(scan));
28320006a0bStron }
28420006a0bStron r->regmust = longest;
28520006a0bStron r->regmlen = len;
28620006a0bStron }
28720006a0bStron }
28820006a0bStron
28920006a0bStron return(r);
29020006a0bStron }
29120006a0bStron
29220006a0bStron /*
29320006a0bStron - reg - regular expression, i.e. main body or parenthesized thing
29420006a0bStron *
29520006a0bStron * Caller must absorb opening parenthesis.
29620006a0bStron *
29720006a0bStron * Combining parenthesis handling with the base level of regular expression
29820006a0bStron * is a trifle forced, but the need to tie the tails of the branches to what
29920006a0bStron * follows makes it hard to avoid.
30020006a0bStron */
30120006a0bStron static char *
reg(paren,flagp)30220006a0bStron reg(paren, flagp)
30320006a0bStron int paren; /* Parenthesized? */
30420006a0bStron int *flagp;
30520006a0bStron {
30620006a0bStron register char *ret;
30720006a0bStron register char *br;
30820006a0bStron register char *ender;
30920006a0bStron register int parno = 0;
31020006a0bStron int flags;
31120006a0bStron
31220006a0bStron *flagp = HASWIDTH; /* Tentatively. */
31320006a0bStron
31420006a0bStron /* Make an OPEN node, if parenthesized. */
31520006a0bStron if (paren) {
31620006a0bStron if (regnpar >= NSUBEXP)
31720006a0bStron FAIL("too many ()");
31820006a0bStron parno = regnpar;
31920006a0bStron regnpar++;
32020006a0bStron ret = regnode(OPEN+parno);
32120006a0bStron } else
32220006a0bStron ret = NULL;
32320006a0bStron
32420006a0bStron /* Pick up the branches, linking them together. */
32520006a0bStron br = regbranch(&flags);
32620006a0bStron if (br == NULL)
32720006a0bStron return(NULL);
32820006a0bStron if (ret != NULL)
32920006a0bStron regtail(ret, br); /* OPEN -> first. */
33020006a0bStron else
33120006a0bStron ret = br;
33220006a0bStron if (!(flags&HASWIDTH))
33320006a0bStron *flagp &= ~HASWIDTH;
33420006a0bStron *flagp |= flags&SPSTART;
33520006a0bStron while (*regparse == '|') {
33620006a0bStron regparse++;
33720006a0bStron br = regbranch(&flags);
33820006a0bStron if (br == NULL)
33920006a0bStron return(NULL);
34020006a0bStron regtail(ret, br); /* BRANCH -> BRANCH. */
34120006a0bStron if (!(flags&HASWIDTH))
34220006a0bStron *flagp &= ~HASWIDTH;
34320006a0bStron *flagp |= flags&SPSTART;
34420006a0bStron }
34520006a0bStron
34620006a0bStron /* Make a closing node, and hook it on the end. */
34720006a0bStron ender = regnode((paren) ? CLOSE+parno : END);
34820006a0bStron regtail(ret, ender);
34920006a0bStron
35020006a0bStron /* Hook the tails of the branches to the closing node. */
35120006a0bStron for (br = ret; br != NULL; br = regnext(br))
35220006a0bStron regoptail(br, ender);
35320006a0bStron
35420006a0bStron /* Check for proper termination. */
35520006a0bStron if (paren && *regparse++ != ')') {
35620006a0bStron FAIL("unmatched ()");
35720006a0bStron } else if (!paren && *regparse != '\0') {
35820006a0bStron if (*regparse == ')') {
35920006a0bStron FAIL("unmatched ()");
36020006a0bStron } else
36120006a0bStron FAIL("junk on end"); /* "Can't happen". */
36220006a0bStron /* NOTREACHED */
36320006a0bStron }
36420006a0bStron
36520006a0bStron return(ret);
36620006a0bStron }
36720006a0bStron
36820006a0bStron /*
36920006a0bStron - regbranch - one alternative of an | operator
37020006a0bStron *
37120006a0bStron * Implements the concatenation operator.
37220006a0bStron */
37320006a0bStron static char *
regbranch(flagp)37420006a0bStron regbranch(flagp)
37520006a0bStron int *flagp;
37620006a0bStron {
37720006a0bStron register char *ret;
37820006a0bStron register char *chain;
37920006a0bStron register char *latest;
38020006a0bStron int flags;
38120006a0bStron
38220006a0bStron *flagp = WORST; /* Tentatively. */
38320006a0bStron
38420006a0bStron ret = regnode(BRANCH);
38520006a0bStron chain = NULL;
38620006a0bStron while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
38720006a0bStron latest = regpiece(&flags);
38820006a0bStron if (latest == NULL)
38920006a0bStron return(NULL);
39020006a0bStron *flagp |= flags&HASWIDTH;
39120006a0bStron if (chain == NULL) /* First piece. */
39220006a0bStron *flagp |= flags&SPSTART;
39320006a0bStron else
39420006a0bStron regtail(chain, latest);
39520006a0bStron chain = latest;
39620006a0bStron }
39720006a0bStron if (chain == NULL) /* Loop ran zero times. */
39820006a0bStron (void) regnode(NOTHING);
39920006a0bStron
40020006a0bStron return(ret);
40120006a0bStron }
40220006a0bStron
40320006a0bStron /*
40420006a0bStron - regpiece - something followed by possible [*+?]
40520006a0bStron *
40620006a0bStron * Note that the branching code sequences used for ? and the general cases
40720006a0bStron * of * and + are somewhat optimized: they use the same NOTHING node as
40820006a0bStron * both the endmarker for their branch list and the body of the last branch.
40920006a0bStron * It might seem that this node could be dispensed with entirely, but the
41020006a0bStron * endmarker role is not redundant.
41120006a0bStron */
41220006a0bStron static char *
regpiece(flagp)41320006a0bStron regpiece(flagp)
41420006a0bStron int *flagp;
41520006a0bStron {
41620006a0bStron register char *ret;
41720006a0bStron register char op;
41820006a0bStron register char *next;
41920006a0bStron int flags;
42020006a0bStron
42120006a0bStron ret = regatom(&flags);
42220006a0bStron if (ret == NULL)
42320006a0bStron return(NULL);
42420006a0bStron
42520006a0bStron op = *regparse;
42620006a0bStron if (!ISMULT(op)) {
42720006a0bStron *flagp = flags;
42820006a0bStron return(ret);
42920006a0bStron }
43020006a0bStron
43120006a0bStron if (!(flags&HASWIDTH) && op != '?')
43220006a0bStron FAIL("*+ operand could be empty");
43320006a0bStron *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
43420006a0bStron
43520006a0bStron if (op == '*' && (flags&SIMPLE))
43620006a0bStron reginsert(STAR, ret);
43720006a0bStron else if (op == '*') {
43820006a0bStron /* Emit x* as (x&|), where & means "self". */
43920006a0bStron reginsert(BRANCH, ret); /* Either x */
44020006a0bStron regoptail(ret, regnode(BACK)); /* and loop */
44120006a0bStron regoptail(ret, ret); /* back */
44220006a0bStron regtail(ret, regnode(BRANCH)); /* or */
44320006a0bStron regtail(ret, regnode(NOTHING)); /* null. */
44420006a0bStron } else if (op == '+' && (flags&SIMPLE))
44520006a0bStron reginsert(PLUS, ret);
44620006a0bStron else if (op == '+') {
44720006a0bStron /* Emit x+ as x(&|), where & means "self". */
44820006a0bStron next = regnode(BRANCH); /* Either */
44920006a0bStron regtail(ret, next);
45020006a0bStron regtail(regnode(BACK), ret); /* loop back */
45120006a0bStron regtail(next, regnode(BRANCH)); /* or */
45220006a0bStron regtail(ret, regnode(NOTHING)); /* null. */
45320006a0bStron } else if (op == '?') {
45420006a0bStron /* Emit x? as (x|) */
45520006a0bStron reginsert(BRANCH, ret); /* Either x */
45620006a0bStron regtail(ret, regnode(BRANCH)); /* or */
45720006a0bStron next = regnode(NOTHING); /* null. */
45820006a0bStron regtail(ret, next);
45920006a0bStron regoptail(ret, next);
46020006a0bStron }
46120006a0bStron regparse++;
46220006a0bStron if (ISMULT(*regparse))
46320006a0bStron FAIL("nested *?+");
46420006a0bStron
46520006a0bStron return(ret);
46620006a0bStron }
46720006a0bStron
46820006a0bStron /*
46920006a0bStron - regatom - the lowest level
47020006a0bStron *
47120006a0bStron * Optimization: gobbles an entire sequence of ordinary characters so that
47220006a0bStron * it can turn them into a single node, which is smaller to store and
47320006a0bStron * faster to run. Backslashed characters are exceptions, each becoming a
47420006a0bStron * separate node; the code is simpler that way and it's not worth fixing.
47520006a0bStron */
47620006a0bStron static char *
regatom(flagp)47720006a0bStron regatom(flagp)
47820006a0bStron int *flagp;
47920006a0bStron {
48020006a0bStron register char *ret;
48120006a0bStron int flags;
48220006a0bStron
48320006a0bStron *flagp = WORST; /* Tentatively. */
48420006a0bStron
48520006a0bStron switch (*regparse++) {
48620006a0bStron case '^':
48720006a0bStron ret = regnode(BOL);
48820006a0bStron break;
48920006a0bStron case '$':
49020006a0bStron ret = regnode(EOL);
49120006a0bStron break;
49220006a0bStron case '.':
49320006a0bStron ret = regnode(ANY);
49420006a0bStron *flagp |= HASWIDTH|SIMPLE;
49520006a0bStron break;
49620006a0bStron case '[': {
49720006a0bStron register int clss;
49820006a0bStron register int classend;
49920006a0bStron
50020006a0bStron if (*regparse == '^') { /* Complement of range. */
50120006a0bStron ret = regnode(ANYBUT);
50220006a0bStron regparse++;
50320006a0bStron } else
50420006a0bStron ret = regnode(ANYOF);
50520006a0bStron if (*regparse == ']' || *regparse == '-')
50620006a0bStron regc(*regparse++);
50720006a0bStron while (*regparse != '\0' && *regparse != ']') {
50820006a0bStron if (*regparse == '-') {
50920006a0bStron regparse++;
51020006a0bStron if (*regparse == ']' || *regparse == '\0')
51120006a0bStron regc('-');
51220006a0bStron else {
51320006a0bStron clss = UCHARAT(regparse-2)+1;
51420006a0bStron classend = UCHARAT(regparse);
51520006a0bStron if (clss > classend+1)
51620006a0bStron FAIL("invalid [] range");
51720006a0bStron for (; clss <= classend; clss++)
51820006a0bStron regc(clss);
51920006a0bStron regparse++;
52020006a0bStron }
52120006a0bStron } else
52220006a0bStron regc(*regparse++);
52320006a0bStron }
52420006a0bStron regc('\0');
52520006a0bStron if (*regparse != ']')
52620006a0bStron FAIL("unmatched []");
52720006a0bStron regparse++;
52820006a0bStron *flagp |= HASWIDTH|SIMPLE;
52920006a0bStron }
53020006a0bStron break;
53120006a0bStron case '(':
53220006a0bStron ret = reg(1, &flags);
53320006a0bStron if (ret == NULL)
53420006a0bStron return(NULL);
53520006a0bStron *flagp |= flags&(HASWIDTH|SPSTART);
53620006a0bStron break;
53720006a0bStron case '\0':
53820006a0bStron case '|':
53920006a0bStron case ')':
54020006a0bStron FAIL("internal urp"); /* Supposed to be caught earlier. */
54120006a0bStron /* NOTREACHED */
54220006a0bStron break;
54320006a0bStron case '?':
54420006a0bStron case '+':
54520006a0bStron case '*':
54620006a0bStron FAIL("?+* follows nothing");
54720006a0bStron /* NOTREACHED */
54820006a0bStron break;
54920006a0bStron case '\\':
55020006a0bStron if (*regparse == '\0')
55120006a0bStron FAIL("trailing \\");
55220006a0bStron ret = regnode(EXACTLY);
55320006a0bStron regc(*regparse++);
55420006a0bStron regc('\0');
55520006a0bStron *flagp |= HASWIDTH|SIMPLE;
55620006a0bStron break;
55720006a0bStron default: {
55820006a0bStron register int len;
55920006a0bStron register char ender;
56020006a0bStron
56120006a0bStron regparse--;
562*838f5788Ssimonb len = (int) strcspn(regparse, META);
56320006a0bStron if (len <= 0)
56420006a0bStron FAIL("internal disaster");
56520006a0bStron ender = *(regparse+len);
56620006a0bStron if (len > 1 && ISMULT(ender))
56720006a0bStron len--; /* Back off clear of ?+* operand. */
56820006a0bStron *flagp |= HASWIDTH;
56920006a0bStron if (len == 1)
57020006a0bStron *flagp |= SIMPLE;
57120006a0bStron ret = regnode(EXACTLY);
57220006a0bStron while (len > 0) {
57320006a0bStron regc(*regparse++);
57420006a0bStron len--;
57520006a0bStron }
57620006a0bStron regc('\0');
57720006a0bStron }
57820006a0bStron break;
57920006a0bStron }
58020006a0bStron
58120006a0bStron return(ret);
58220006a0bStron }
58320006a0bStron
58420006a0bStron /*
58520006a0bStron - regnode - emit a node
58620006a0bStron */
58720006a0bStron static char * /* Location. */
regnode(op)58820006a0bStron regnode(op)
58920006a0bStron char op;
59020006a0bStron {
59120006a0bStron register char *ret;
59220006a0bStron register char *ptr;
59320006a0bStron
59420006a0bStron ret = regcode;
59520006a0bStron if (ret == ®dummy) {
59620006a0bStron regsize += 3;
59720006a0bStron return(ret);
59820006a0bStron }
59920006a0bStron
60020006a0bStron ptr = ret;
60120006a0bStron *ptr++ = op;
60220006a0bStron *ptr++ = '\0'; /* Null "next" pointer. */
60320006a0bStron *ptr++ = '\0';
60420006a0bStron regcode = ptr;
60520006a0bStron
60620006a0bStron return(ret);
60720006a0bStron }
60820006a0bStron
60920006a0bStron /*
61020006a0bStron - regc - emit (if appropriate) a byte of code
61120006a0bStron */
61220006a0bStron static void
regc(b)61320006a0bStron regc(b)
61420006a0bStron char b;
61520006a0bStron {
61620006a0bStron if (regcode != ®dummy)
61720006a0bStron *regcode++ = b;
61820006a0bStron else
61920006a0bStron regsize++;
62020006a0bStron }
62120006a0bStron
62220006a0bStron /*
62320006a0bStron - reginsert - insert an operator in front of already-emitted operand
62420006a0bStron *
62520006a0bStron * Means relocating the operand.
62620006a0bStron */
62720006a0bStron static void
reginsert(op,opnd)62820006a0bStron reginsert(op, opnd)
62920006a0bStron char op;
63020006a0bStron char *opnd;
63120006a0bStron {
63220006a0bStron register char *src;
63320006a0bStron register char *dst;
63420006a0bStron register char *place;
63520006a0bStron
63620006a0bStron if (regcode == ®dummy) {
63720006a0bStron regsize += 3;
63820006a0bStron return;
63920006a0bStron }
64020006a0bStron
64120006a0bStron src = regcode;
64220006a0bStron regcode += 3;
64320006a0bStron dst = regcode;
64420006a0bStron while (src > opnd)
64520006a0bStron *--dst = *--src;
64620006a0bStron
64720006a0bStron place = opnd; /* Op node, where operand used to be. */
64820006a0bStron *place++ = op;
64920006a0bStron *place++ = '\0';
65020006a0bStron *place++ = '\0';
65120006a0bStron }
65220006a0bStron
65320006a0bStron /*
65420006a0bStron - regtail - set the next-pointer at the end of a node chain
65520006a0bStron */
65620006a0bStron static void
regtail(p,val)65720006a0bStron regtail(p, val)
65820006a0bStron char *p;
65920006a0bStron char *val;
66020006a0bStron {
66120006a0bStron register char *scan;
66220006a0bStron register char *temp;
66320006a0bStron register int offset;
66420006a0bStron
66520006a0bStron if (p == ®dummy)
66620006a0bStron return;
66720006a0bStron
66820006a0bStron /* Find last node. */
66920006a0bStron scan = p;
67020006a0bStron for (;;) {
67120006a0bStron temp = regnext(scan);
67220006a0bStron if (temp == NULL)
67320006a0bStron break;
67420006a0bStron scan = temp;
67520006a0bStron }
67620006a0bStron
67720006a0bStron if (OP(scan) == BACK)
678*838f5788Ssimonb offset = (int) (scan - val);
67920006a0bStron else
680*838f5788Ssimonb offset = (int) (val - scan);
68120006a0bStron *(scan+1) = (offset>>8)&0377;
68220006a0bStron *(scan+2) = offset&0377;
68320006a0bStron }
68420006a0bStron
68520006a0bStron /*
68620006a0bStron - regoptail - regtail on operand of first argument; nop if operandless
68720006a0bStron */
68820006a0bStron static void
regoptail(p,val)68920006a0bStron regoptail(p, val)
69020006a0bStron char *p;
69120006a0bStron char *val;
69220006a0bStron {
69320006a0bStron /* "Operandless" and "op != BRANCH" are synonymous in practice. */
69420006a0bStron if (p == NULL || p == ®dummy || OP(p) != BRANCH)
69520006a0bStron return;
69620006a0bStron regtail(OPERAND(p), val);
69720006a0bStron }
69820006a0bStron
69920006a0bStron /*
70020006a0bStron * regexec and friends
70120006a0bStron */
70220006a0bStron
70320006a0bStron /*
70420006a0bStron * Global work variables for regexec().
70520006a0bStron */
70620006a0bStron static char *reginput; /* String-input pointer. */
70720006a0bStron static char *regbol; /* Beginning of input, for ^ check. */
70820006a0bStron static char **regstartp; /* Pointer to startp array. */
70920006a0bStron static char **regendp; /* Ditto for endp. */
71020006a0bStron
71120006a0bStron /*
71220006a0bStron * Forwards.
71320006a0bStron */
71420006a0bStron STATIC int regtry();
71520006a0bStron STATIC int regmatch();
71620006a0bStron STATIC int regrepeat();
71720006a0bStron
71820006a0bStron #ifdef DEBUG
71920006a0bStron int regnarrate = 0;
72020006a0bStron void regdump();
72120006a0bStron STATIC char *regprop();
72220006a0bStron #endif
72320006a0bStron
72420006a0bStron /*
72520006a0bStron - regexec - match a regexp against a string
72620006a0bStron */
72720006a0bStron int
regexec2(prog,string,notbol)72820006a0bStron regexec2(prog, string, notbol)
72920006a0bStron register regexp *prog;
73020006a0bStron register char *string;
73120006a0bStron int notbol;
73220006a0bStron {
73320006a0bStron register char *s;
73420006a0bStron
73520006a0bStron /* Be paranoid... */
73620006a0bStron if (prog == NULL || string == NULL) {
73720006a0bStron regerror("NULL parameter");
73820006a0bStron return(0);
73920006a0bStron }
74020006a0bStron
74120006a0bStron /* Check validity of program. */
74220006a0bStron if (UCHARAT(prog->program) != MAGIC) {
74320006a0bStron regerror("corrupted program");
74420006a0bStron return(0);
74520006a0bStron }
74620006a0bStron
74720006a0bStron /* If there is a "must appear" string, look for it. */
74820006a0bStron if (prog->regmust != NULL) {
74920006a0bStron s = string;
75020006a0bStron while ((s = strchr(s, prog->regmust[0])) != NULL) {
75120006a0bStron if (strncmp(s, prog->regmust, prog->regmlen) == 0)
75220006a0bStron break; /* Found it. */
75320006a0bStron s++;
75420006a0bStron }
75520006a0bStron if (s == NULL) /* Not present. */
75620006a0bStron return(0);
75720006a0bStron }
75820006a0bStron
75920006a0bStron /* Mark beginning of line for ^ . */
76020006a0bStron if (notbol)
76120006a0bStron regbol = NULL;
76220006a0bStron else
76320006a0bStron regbol = string;
76420006a0bStron
76520006a0bStron /* Simplest case: anchored match need be tried only once. */
76620006a0bStron if (prog->reganch)
76720006a0bStron return(regtry(prog, string));
76820006a0bStron
76920006a0bStron /* Messy cases: unanchored match. */
77020006a0bStron s = string;
77120006a0bStron if (prog->regstart != '\0')
77220006a0bStron /* We know what char it must start with. */
77320006a0bStron while ((s = strchr(s, prog->regstart)) != NULL) {
77420006a0bStron if (regtry(prog, s))
77520006a0bStron return(1);
77620006a0bStron s++;
77720006a0bStron }
77820006a0bStron else
77920006a0bStron /* We don't -- general case. */
78020006a0bStron do {
78120006a0bStron if (regtry(prog, s))
78220006a0bStron return(1);
78320006a0bStron } while (*s++ != '\0');
78420006a0bStron
78520006a0bStron /* Failure. */
78620006a0bStron return(0);
78720006a0bStron }
78820006a0bStron
78920006a0bStron int
regexec(prog,string)79020006a0bStron regexec(prog, string)
79120006a0bStron register regexp *prog;
79220006a0bStron register char *string;
79320006a0bStron {
79420006a0bStron return regexec2(prog, string, 0);
79520006a0bStron }
79620006a0bStron
79720006a0bStron /*
79820006a0bStron - regtry - try match at specific point
79920006a0bStron */
80020006a0bStron static int /* 0 failure, 1 success */
regtry(prog,string)80120006a0bStron regtry(prog, string)
80220006a0bStron regexp *prog;
80320006a0bStron char *string;
80420006a0bStron {
80520006a0bStron register int i;
80620006a0bStron register char **sp;
80720006a0bStron register char **ep;
80820006a0bStron
80920006a0bStron reginput = string;
81020006a0bStron regstartp = prog->startp;
81120006a0bStron regendp = prog->endp;
81220006a0bStron
81320006a0bStron sp = prog->startp;
81420006a0bStron ep = prog->endp;
81520006a0bStron for (i = NSUBEXP; i > 0; i--) {
81620006a0bStron *sp++ = NULL;
81720006a0bStron *ep++ = NULL;
81820006a0bStron }
81920006a0bStron if (regmatch(prog->program + 1)) {
82020006a0bStron prog->startp[0] = string;
82120006a0bStron prog->endp[0] = reginput;
82220006a0bStron return(1);
82320006a0bStron } else
82420006a0bStron return(0);
82520006a0bStron }
82620006a0bStron
82720006a0bStron /*
82820006a0bStron - regmatch - main matching routine
82920006a0bStron *
83020006a0bStron * Conceptually the strategy is simple: check to see whether the current
83120006a0bStron * node matches, call self recursively to see whether the rest matches,
83220006a0bStron * and then act accordingly. In practice we make some effort to avoid
83320006a0bStron * recursion, in particular by going through "ordinary" nodes (that don't
83420006a0bStron * need to know whether the rest of the match failed) by a loop instead of
83520006a0bStron * by recursion.
83620006a0bStron */
83720006a0bStron static int /* 0 failure, 1 success */
regmatch(prog)83820006a0bStron regmatch(prog)
83920006a0bStron char *prog;
84020006a0bStron {
84120006a0bStron register char *scan; /* Current node. */
84220006a0bStron char *next; /* Next node. */
84320006a0bStron
84420006a0bStron scan = prog;
84520006a0bStron #ifdef DEBUG
84620006a0bStron if (scan != NULL && regnarrate)
84720006a0bStron fprintf(stderr, "%s(\n", regprop(scan));
84820006a0bStron #endif
84920006a0bStron while (scan != NULL) {
85020006a0bStron #ifdef DEBUG
85120006a0bStron if (regnarrate)
85220006a0bStron fprintf(stderr, "%s...\n", regprop(scan));
85320006a0bStron #endif
85420006a0bStron next = regnext(scan);
85520006a0bStron
85620006a0bStron switch (OP(scan)) {
85720006a0bStron case BOL:
85820006a0bStron if (reginput != regbol)
85920006a0bStron return(0);
86020006a0bStron break;
86120006a0bStron case EOL:
86220006a0bStron if (*reginput != '\0')
86320006a0bStron return(0);
86420006a0bStron break;
86520006a0bStron case ANY:
86620006a0bStron if (*reginput == '\0')
86720006a0bStron return(0);
86820006a0bStron reginput++;
86920006a0bStron break;
87020006a0bStron case EXACTLY: {
87120006a0bStron register int len;
87220006a0bStron register char *opnd;
87320006a0bStron
87420006a0bStron opnd = OPERAND(scan);
87520006a0bStron /* Inline the first character, for speed. */
87620006a0bStron if (*opnd != *reginput)
87720006a0bStron return(0);
878*838f5788Ssimonb len = (int) strlen(opnd);
87920006a0bStron if (len > 1 && strncmp(opnd, reginput, len) != 0)
88020006a0bStron return(0);
88120006a0bStron reginput += len;
88220006a0bStron }
88320006a0bStron break;
88420006a0bStron case ANYOF:
88520006a0bStron if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
88620006a0bStron return(0);
88720006a0bStron reginput++;
88820006a0bStron break;
88920006a0bStron case ANYBUT:
89020006a0bStron if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
89120006a0bStron return(0);
89220006a0bStron reginput++;
89320006a0bStron break;
89420006a0bStron case NOTHING:
89520006a0bStron break;
89620006a0bStron case BACK:
89720006a0bStron break;
89820006a0bStron case OPEN+1:
89920006a0bStron case OPEN+2:
90020006a0bStron case OPEN+3:
90120006a0bStron case OPEN+4:
90220006a0bStron case OPEN+5:
90320006a0bStron case OPEN+6:
90420006a0bStron case OPEN+7:
90520006a0bStron case OPEN+8:
90620006a0bStron case OPEN+9: {
90720006a0bStron register int no;
90820006a0bStron register char *save;
90920006a0bStron
91020006a0bStron no = OP(scan) - OPEN;
91120006a0bStron save = reginput;
91220006a0bStron
91320006a0bStron if (regmatch(next)) {
91420006a0bStron /*
91520006a0bStron * Don't set startp if some later
91620006a0bStron * invocation of the same parentheses
91720006a0bStron * already has.
91820006a0bStron */
91920006a0bStron if (regstartp[no] == NULL)
92020006a0bStron regstartp[no] = save;
92120006a0bStron return(1);
92220006a0bStron } else
92320006a0bStron return(0);
92420006a0bStron }
92520006a0bStron /* NOTREACHED */
92620006a0bStron break;
92720006a0bStron case CLOSE+1:
92820006a0bStron case CLOSE+2:
92920006a0bStron case CLOSE+3:
93020006a0bStron case CLOSE+4:
93120006a0bStron case CLOSE+5:
93220006a0bStron case CLOSE+6:
93320006a0bStron case CLOSE+7:
93420006a0bStron case CLOSE+8:
93520006a0bStron case CLOSE+9: {
93620006a0bStron register int no;
93720006a0bStron register char *save;
93820006a0bStron
93920006a0bStron no = OP(scan) - CLOSE;
94020006a0bStron save = reginput;
94120006a0bStron
94220006a0bStron if (regmatch(next)) {
94320006a0bStron /*
94420006a0bStron * Don't set endp if some later
94520006a0bStron * invocation of the same parentheses
94620006a0bStron * already has.
94720006a0bStron */
94820006a0bStron if (regendp[no] == NULL)
94920006a0bStron regendp[no] = save;
95020006a0bStron return(1);
95120006a0bStron } else
95220006a0bStron return(0);
95320006a0bStron }
95420006a0bStron /* NOTREACHED */
95520006a0bStron break;
95620006a0bStron case BRANCH: {
95720006a0bStron register char *save;
95820006a0bStron
95920006a0bStron if (OP(next) != BRANCH) /* No choice. */
96020006a0bStron next = OPERAND(scan); /* Avoid recursion. */
96120006a0bStron else {
96220006a0bStron do {
96320006a0bStron save = reginput;
96420006a0bStron if (regmatch(OPERAND(scan)))
96520006a0bStron return(1);
96620006a0bStron reginput = save;
96720006a0bStron scan = regnext(scan);
96820006a0bStron } while (scan != NULL && OP(scan) == BRANCH);
96920006a0bStron return(0);
97020006a0bStron /* NOTREACHED */
97120006a0bStron }
97220006a0bStron }
97320006a0bStron /* NOTREACHED */
97420006a0bStron break;
97520006a0bStron case STAR:
97620006a0bStron case PLUS: {
97720006a0bStron register char nextch;
97820006a0bStron register int no;
97920006a0bStron register char *save;
98020006a0bStron register int min;
98120006a0bStron
98220006a0bStron /*
98320006a0bStron * Lookahead to avoid useless match attempts
98420006a0bStron * when we know what character comes next.
98520006a0bStron */
98620006a0bStron nextch = '\0';
98720006a0bStron if (OP(next) == EXACTLY)
98820006a0bStron nextch = *OPERAND(next);
98920006a0bStron min = (OP(scan) == STAR) ? 0 : 1;
99020006a0bStron save = reginput;
99120006a0bStron no = regrepeat(OPERAND(scan));
99220006a0bStron while (no >= min) {
99320006a0bStron /* If it could work, try it. */
99420006a0bStron if (nextch == '\0' || *reginput == nextch)
99520006a0bStron if (regmatch(next))
99620006a0bStron return(1);
99720006a0bStron /* Couldn't or didn't -- back up. */
99820006a0bStron no--;
99920006a0bStron reginput = save + no;
100020006a0bStron }
100120006a0bStron return(0);
100220006a0bStron }
100320006a0bStron /* NOTREACHED */
100420006a0bStron break;
100520006a0bStron case END:
100620006a0bStron return(1); /* Success! */
100720006a0bStron /* NOTREACHED */
100820006a0bStron break;
100920006a0bStron default:
101020006a0bStron regerror("memory corruption");
101120006a0bStron return(0);
101220006a0bStron /* NOTREACHED */
101320006a0bStron break;
101420006a0bStron }
101520006a0bStron
101620006a0bStron scan = next;
101720006a0bStron }
101820006a0bStron
101920006a0bStron /*
102020006a0bStron * We get here only if there's trouble -- normally "case END" is
102120006a0bStron * the terminating point.
102220006a0bStron */
102320006a0bStron regerror("corrupted pointers");
102420006a0bStron return(0);
102520006a0bStron }
102620006a0bStron
102720006a0bStron /*
102820006a0bStron - regrepeat - repeatedly match something simple, report how many
102920006a0bStron */
103020006a0bStron static int
regrepeat(p)103120006a0bStron regrepeat(p)
103220006a0bStron char *p;
103320006a0bStron {
103420006a0bStron register int count = 0;
103520006a0bStron register char *scan;
103620006a0bStron register char *opnd;
103720006a0bStron
103820006a0bStron scan = reginput;
103920006a0bStron opnd = OPERAND(p);
104020006a0bStron switch (OP(p)) {
104120006a0bStron case ANY:
1042*838f5788Ssimonb count = (int) strlen(scan);
104320006a0bStron scan += count;
104420006a0bStron break;
104520006a0bStron case EXACTLY:
104620006a0bStron while (*opnd == *scan) {
104720006a0bStron count++;
104820006a0bStron scan++;
104920006a0bStron }
105020006a0bStron break;
105120006a0bStron case ANYOF:
105220006a0bStron while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
105320006a0bStron count++;
105420006a0bStron scan++;
105520006a0bStron }
105620006a0bStron break;
105720006a0bStron case ANYBUT:
105820006a0bStron while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
105920006a0bStron count++;
106020006a0bStron scan++;
106120006a0bStron }
106220006a0bStron break;
106320006a0bStron default: /* Oh dear. Called inappropriately. */
106420006a0bStron regerror("internal foulup");
106520006a0bStron count = 0; /* Best compromise. */
106620006a0bStron break;
106720006a0bStron }
106820006a0bStron reginput = scan;
106920006a0bStron
107020006a0bStron return(count);
107120006a0bStron }
107220006a0bStron
107320006a0bStron /*
107420006a0bStron - regnext - dig the "next" pointer out of a node
107520006a0bStron */
107620006a0bStron static char *
regnext(p)107720006a0bStron regnext(p)
107820006a0bStron register char *p;
107920006a0bStron {
108020006a0bStron register int offset;
108120006a0bStron
108220006a0bStron if (p == ®dummy)
108320006a0bStron return(NULL);
108420006a0bStron
108520006a0bStron offset = NEXT(p);
108620006a0bStron if (offset == 0)
108720006a0bStron return(NULL);
108820006a0bStron
108920006a0bStron if (OP(p) == BACK)
109020006a0bStron return(p-offset);
109120006a0bStron else
109220006a0bStron return(p+offset);
109320006a0bStron }
109420006a0bStron
109520006a0bStron #ifdef DEBUG
109620006a0bStron
109720006a0bStron STATIC char *regprop();
109820006a0bStron
109920006a0bStron /*
110020006a0bStron - regdump - dump a regexp onto stdout in vaguely comprehensible form
110120006a0bStron */
110220006a0bStron void
regdump(r)110320006a0bStron regdump(r)
110420006a0bStron regexp *r;
110520006a0bStron {
110620006a0bStron register char *s;
110720006a0bStron register char op = EXACTLY; /* Arbitrary non-END op. */
110820006a0bStron register char *next;
110920006a0bStron
111020006a0bStron
111120006a0bStron s = r->program + 1;
111220006a0bStron while (op != END) { /* While that wasn't END last time... */
111320006a0bStron op = OP(s);
111420006a0bStron printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */
111520006a0bStron next = regnext(s);
111620006a0bStron if (next == NULL) /* Next ptr. */
111720006a0bStron printf("(0)");
111820006a0bStron else
111920006a0bStron printf("(%d)", (s-r->program)+(next-s));
112020006a0bStron s += 3;
112120006a0bStron if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
112220006a0bStron /* Literal string, where present. */
112320006a0bStron while (*s != '\0') {
112420006a0bStron putchar(*s);
112520006a0bStron s++;
112620006a0bStron }
112720006a0bStron s++;
112820006a0bStron }
112920006a0bStron putchar('\n');
113020006a0bStron }
113120006a0bStron
113220006a0bStron /* Header fields of interest. */
113320006a0bStron if (r->regstart != '\0')
113420006a0bStron printf("start `%c' ", r->regstart);
113520006a0bStron if (r->reganch)
113620006a0bStron printf("anchored ");
113720006a0bStron if (r->regmust != NULL)
113820006a0bStron printf("must have \"%s\"", r->regmust);
113920006a0bStron printf("\n");
114020006a0bStron }
114120006a0bStron
114220006a0bStron /*
114320006a0bStron - regprop - printable representation of opcode
114420006a0bStron */
114520006a0bStron static char *
regprop(op)114620006a0bStron regprop(op)
114720006a0bStron char *op;
114820006a0bStron {
114920006a0bStron register char *p;
115020006a0bStron static char buf[50];
115120006a0bStron
115220006a0bStron (void) strcpy(buf, ":");
115320006a0bStron
115420006a0bStron switch (OP(op)) {
115520006a0bStron case BOL:
115620006a0bStron p = "BOL";
115720006a0bStron break;
115820006a0bStron case EOL:
115920006a0bStron p = "EOL";
116020006a0bStron break;
116120006a0bStron case ANY:
116220006a0bStron p = "ANY";
116320006a0bStron break;
116420006a0bStron case ANYOF:
116520006a0bStron p = "ANYOF";
116620006a0bStron break;
116720006a0bStron case ANYBUT:
116820006a0bStron p = "ANYBUT";
116920006a0bStron break;
117020006a0bStron case BRANCH:
117120006a0bStron p = "BRANCH";
117220006a0bStron break;
117320006a0bStron case EXACTLY:
117420006a0bStron p = "EXACTLY";
117520006a0bStron break;
117620006a0bStron case NOTHING:
117720006a0bStron p = "NOTHING";
117820006a0bStron break;
117920006a0bStron case BACK:
118020006a0bStron p = "BACK";
118120006a0bStron break;
118220006a0bStron case END:
118320006a0bStron p = "END";
118420006a0bStron break;
118520006a0bStron case OPEN+1:
118620006a0bStron case OPEN+2:
118720006a0bStron case OPEN+3:
118820006a0bStron case OPEN+4:
118920006a0bStron case OPEN+5:
119020006a0bStron case OPEN+6:
119120006a0bStron case OPEN+7:
119220006a0bStron case OPEN+8:
119320006a0bStron case OPEN+9:
119420006a0bStron sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
119520006a0bStron p = NULL;
119620006a0bStron break;
119720006a0bStron case CLOSE+1:
119820006a0bStron case CLOSE+2:
119920006a0bStron case CLOSE+3:
120020006a0bStron case CLOSE+4:
120120006a0bStron case CLOSE+5:
120220006a0bStron case CLOSE+6:
120320006a0bStron case CLOSE+7:
120420006a0bStron case CLOSE+8:
120520006a0bStron case CLOSE+9:
120620006a0bStron sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
120720006a0bStron p = NULL;
120820006a0bStron break;
120920006a0bStron case STAR:
121020006a0bStron p = "STAR";
121120006a0bStron break;
121220006a0bStron case PLUS:
121320006a0bStron p = "PLUS";
121420006a0bStron break;
121520006a0bStron default:
121620006a0bStron regerror("corrupted opcode");
121720006a0bStron break;
121820006a0bStron }
121920006a0bStron if (p != NULL)
122020006a0bStron (void) strcat(buf, p);
122120006a0bStron return(buf);
122220006a0bStron }
122320006a0bStron #endif
122420006a0bStron
122520006a0bStron /*
122620006a0bStron * The following is provided for those people who do not have strcspn() in
122720006a0bStron * their C libraries. They should get off their butts and do something
122820006a0bStron * about it; at least one public-domain implementation of those (highly
122920006a0bStron * useful) string routines has been published on Usenet.
123020006a0bStron */
123120006a0bStron #ifdef STRCSPN
123220006a0bStron /*
123320006a0bStron * strcspn - find length of initial segment of s1 consisting entirely
123420006a0bStron * of characters not from s2
123520006a0bStron */
123620006a0bStron
123720006a0bStron static int
strcspn(s1,s2)123820006a0bStron strcspn(s1, s2)
123920006a0bStron char *s1;
124020006a0bStron char *s2;
124120006a0bStron {
124220006a0bStron register char *scan1;
124320006a0bStron register char *scan2;
124420006a0bStron register int count;
124520006a0bStron
124620006a0bStron count = 0;
124720006a0bStron for (scan1 = s1; *scan1 != '\0'; scan1++) {
124820006a0bStron for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */
124920006a0bStron if (*scan1 == *scan2++)
125020006a0bStron return(count);
125120006a0bStron count++;
125220006a0bStron }
125320006a0bStron return(count);
125420006a0bStron }
125520006a0bStron #endif
1256