141349Sbostic /*
241349Sbostic * regcomp and regexec -- regsub and regerror are elsewhere
341349Sbostic *
441349Sbostic * Copyright (c) 1986 by University of Toronto.
541349Sbostic * Written by Henry Spencer. Not derived from licensed software.
641349Sbostic *
741349Sbostic * Permission is granted to anyone to use this software for any
841349Sbostic * purpose on any computer system, and to redistribute it freely,
941349Sbostic * subject to the following restrictions:
1041349Sbostic *
1141349Sbostic * 1. The author is not responsible for the consequences of use of
1241349Sbostic * this software, no matter how awful, even if they arise
1341349Sbostic * from defects in it.
1441349Sbostic *
1541349Sbostic * 2. The origin of this software must not be misrepresented, either
1641349Sbostic * by explicit claim or by omission.
1741349Sbostic *
1841349Sbostic * 3. Altered versions must be plainly marked as such, and must not
1941349Sbostic * be misrepresented as being the original software.
2041349Sbostic *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
2141349Sbostic *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
2241349Sbostic *** to assist in implementing egrep.
2341349Sbostic *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
2441349Sbostic *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
2541349Sbostic *** as in BSD grep and ex.
2641349Sbostic *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
2741349Sbostic *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
2841349Sbostic *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods,
2941349Sbostic *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
3041349Sbostic *
3141349Sbostic * Beware that some of this code is subtly aware of the way operator
3241349Sbostic * precedence is structured in regular expressions. Serious changes in
3341349Sbostic * regular-expression syntax might require a total rethink.
3441349Sbostic */
3546619Sbostic #include <regexp.h>
3641349Sbostic #include <stdio.h>
3741349Sbostic #include <ctype.h>
3846619Sbostic #include <stdlib.h>
3946619Sbostic #include <string.h>
4041349Sbostic #include "regmagic.h"
4141349Sbostic
4241349Sbostic /*
4341349Sbostic * The "internal use only" fields in regexp.h are present to pass info from
4441349Sbostic * compile to execute that permits the execute phase to run lots faster on
4541349Sbostic * simple cases. They are:
4641349Sbostic *
4741349Sbostic * regstart char that must begin a match; '\0' if none obvious
4841349Sbostic * reganch is the match anchored (at beginning-of-line only)?
4941349Sbostic * regmust string (pointer into program) that match must include, or NULL
5041349Sbostic * regmlen length of regmust string
5141349Sbostic *
5241349Sbostic * Regstart and reganch permit very fast decisions on suitable starting points
5341349Sbostic * for a match, cutting down the work a lot. Regmust permits fast rejection
5441349Sbostic * of lines that cannot possibly match. The regmust tests are costly enough
5541349Sbostic * that regcomp() supplies a regmust only if the r.e. contains something
5641349Sbostic * potentially expensive (at present, the only such thing detected is * or +
5741349Sbostic * at the start of the r.e., which can involve a lot of backup). Regmlen is
5841349Sbostic * supplied because the test in regexec() needs it and regcomp() is computing
5941349Sbostic * it anyway.
6041349Sbostic */
6141349Sbostic
6241349Sbostic /*
6341349Sbostic * Structure for regexp "program". This is essentially a linear encoding
6441349Sbostic * of a nondeterministic finite-state machine (aka syntax charts or
6541349Sbostic * "railroad normal form" in parsing technology). Each node is an opcode
6641349Sbostic * plus a "next" pointer, possibly plus an operand. "Next" pointers of
6741349Sbostic * all nodes except BRANCH implement concatenation; a "next" pointer with
6841349Sbostic * a BRANCH on both ends of it is connecting two alternatives. (Here we
6941349Sbostic * have one of the subtle syntax dependencies: an individual BRANCH (as
7041349Sbostic * opposed to a collection of them) is never concatenated with anything
7141349Sbostic * because of operator precedence.) The operand of some types of node is
7241349Sbostic * a literal string; for others, it is a node leading into a sub-FSM. In
7341349Sbostic * particular, the operand of a BRANCH node is the first node of the branch.
7441349Sbostic * (NB this is *not* a tree structure: the tail of the branch connects
7541349Sbostic * to the thing following the set of BRANCHes.) The opcodes are:
7641349Sbostic */
7741349Sbostic
7841349Sbostic /* definition number opnd? meaning */
7941349Sbostic #define END 0 /* no End of program. */
8041349Sbostic #define BOL 1 /* no Match "" at beginning of line. */
8141349Sbostic #define EOL 2 /* no Match "" at end of line. */
8241349Sbostic #define ANY 3 /* no Match any one character. */
8341349Sbostic #define ANYOF 4 /* str Match any character in this string. */
8441349Sbostic #define ANYBUT 5 /* str Match any character not in this string. */
8541349Sbostic #define BRANCH 6 /* node Match this alternative, or the next... */
8641349Sbostic #define BACK 7 /* no Match "", "next" ptr points backward. */
8741349Sbostic #define EXACTLY 8 /* str Match this string. */
8841349Sbostic #define NOTHING 9 /* no Match empty string. */
8941349Sbostic #define STAR 10 /* node Match this (simple) thing 0 or more times. */
9041349Sbostic #define PLUS 11 /* node Match this (simple) thing 1 or more times. */
9141349Sbostic #define WORDA 12 /* no Match "" at wordchar, where prev is nonword */
9241349Sbostic #define WORDZ 13 /* no Match "" at nonwordchar, where prev is word */
9341349Sbostic #define OPEN 20 /* no Mark this point in input as start of #n. */
9441349Sbostic /* OPEN+1 is number 1, etc. */
9541349Sbostic #define CLOSE 30 /* no Analogous to OPEN. */
9641349Sbostic
9741349Sbostic /*
9841349Sbostic * Opcode notes:
9941349Sbostic *
10041349Sbostic * BRANCH The set of branches constituting a single choice are hooked
10141349Sbostic * together with their "next" pointers, since precedence prevents
10241349Sbostic * anything being concatenated to any individual branch. The
10341349Sbostic * "next" pointer of the last BRANCH in a choice points to the
10441349Sbostic * thing following the whole choice. This is also where the
10541349Sbostic * final "next" pointer of each individual branch points; each
10641349Sbostic * branch starts with the operand node of a BRANCH node.
10741349Sbostic *
10841349Sbostic * BACK Normal "next" pointers all implicitly point forward; BACK
10941349Sbostic * exists to make loop structures possible.
11041349Sbostic *
11141349Sbostic * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
11241349Sbostic * BRANCH structures using BACK. Simple cases (one character
11341349Sbostic * per match) are implemented with STAR and PLUS for speed
11441349Sbostic * and to minimize recursive plunges.
11541349Sbostic *
11641349Sbostic * OPEN,CLOSE ...are numbered at compile time.
11741349Sbostic */
11841349Sbostic
11941349Sbostic /*
12041349Sbostic * A node is one char of opcode followed by two chars of "next" pointer.
12141349Sbostic * "Next" pointers are stored as two 8-bit pieces, high order first. The
12241349Sbostic * value is a positive offset from the opcode of the node containing it.
12341349Sbostic * An operand, if any, simply follows the node. (Note that much of the
12441349Sbostic * code generation knows about this implicit relationship.)
12541349Sbostic *
12641349Sbostic * Using two bytes for the "next" pointer is vast overkill for most things,
12741349Sbostic * but allows patterns to get big without disasters.
12841349Sbostic */
12941349Sbostic #define OP(p) (*(p))
13041349Sbostic #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
13141349Sbostic #define OPERAND(p) ((p) + 3)
13241349Sbostic
13341349Sbostic /*
13441349Sbostic * See regmagic.h for one further detail of program structure.
13541349Sbostic */
13641349Sbostic
13741349Sbostic
13841349Sbostic /*
13941349Sbostic * Utility definitions.
14041349Sbostic */
14141349Sbostic #ifndef CHARBITS
14241349Sbostic #define UCHARAT(p) ((int)*(unsigned char *)(p))
14341349Sbostic #else
14441349Sbostic #define UCHARAT(p) ((int)*(p)&CHARBITS)
14541349Sbostic #endif
14641349Sbostic
14741349Sbostic #define FAIL(m) { regerror(m); return(NULL); }
14841349Sbostic #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
14941349Sbostic
15041349Sbostic /*
15141349Sbostic * Flags to be passed up and down.
15241349Sbostic */
15341349Sbostic #define HASWIDTH 01 /* Known never to match null string. */
15441349Sbostic #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
15541349Sbostic #define SPSTART 04 /* Starts with * or +. */
15641349Sbostic #define WORST 0 /* Worst case. */
15741349Sbostic
15841349Sbostic /*
15941349Sbostic * Global work variables for regcomp().
16041349Sbostic */
16141349Sbostic static char *regparse; /* Input-scan pointer. */
16241349Sbostic static int regnpar; /* () count. */
16341349Sbostic static char regdummy;
16441349Sbostic static char *regcode; /* Code-emit pointer; ®dummy = don't. */
16541349Sbostic static long regsize; /* Code size. */
16641349Sbostic
16741349Sbostic /*
16841349Sbostic * Forward declarations for regcomp()'s friends.
16941349Sbostic */
17041349Sbostic #ifndef STATIC
17141349Sbostic #define STATIC static
17241349Sbostic #endif
17341349Sbostic STATIC char *reg();
17441349Sbostic STATIC char *regbranch();
17541349Sbostic STATIC char *regpiece();
17641349Sbostic STATIC char *regatom();
17741349Sbostic STATIC char *regnode();
17841349Sbostic STATIC char *regnext();
17941349Sbostic STATIC void regc();
18041349Sbostic STATIC void reginsert();
18141349Sbostic STATIC void regtail();
18241349Sbostic STATIC void regoptail();
18341349Sbostic #ifdef STRCSPN
18441349Sbostic STATIC int strcspn();
18541349Sbostic #endif
18641349Sbostic
18741349Sbostic /*
18841349Sbostic - regcomp - compile a regular expression into internal code
18941349Sbostic *
19041349Sbostic * We can't allocate space until we know how big the compiled form will be,
19141349Sbostic * but we can't compile it (and thus know how big it is) until we've got a
19241349Sbostic * place to put the code. So we cheat: we compile it twice, once with code
19341349Sbostic * generation turned off and size counting turned on, and once "for real".
19441349Sbostic * This also means that we don't allocate space until we are sure that the
19541349Sbostic * thing really will compile successfully, and we never have to move the
19641349Sbostic * code and thus invalidate pointers into it. (Note that it has to be in
19741349Sbostic * one piece because free() must be able to free it all.)
19841349Sbostic *
19941349Sbostic * Beware that the optimization-preparation code in here knows about some
20041349Sbostic * of the structure of the compiled regexp.
20141349Sbostic */
20241349Sbostic regexp *
regcomp(exp)20341349Sbostic regcomp(exp)
20446619Sbostic const char *exp;
20541349Sbostic {
20641349Sbostic register regexp *r;
20741349Sbostic register char *scan;
20841349Sbostic register char *longest;
20941349Sbostic register int len;
21041349Sbostic int flags;
21141349Sbostic
21241349Sbostic if (exp == NULL)
21341349Sbostic FAIL("NULL argument");
21441349Sbostic
21541349Sbostic /* First pass: determine size, legality. */
216*49375Smarc #ifdef notdef
21741349Sbostic if (exp[0] == '.' && exp[1] == '*') exp += 2; /* aid grep */
218*49375Smarc #endif
21946619Sbostic regparse = (char *)exp;
22041349Sbostic regnpar = 1;
22141349Sbostic regsize = 0L;
22241349Sbostic regcode = ®dummy;
22341349Sbostic regc(MAGIC);
22441349Sbostic if (reg(0, &flags) == NULL)
22541349Sbostic return(NULL);
22641349Sbostic
22741349Sbostic /* Small enough for pointer-storage convention? */
22841349Sbostic if (regsize >= 32767L) /* Probably could be 65535L. */
22941349Sbostic FAIL("regexp too big");
23041349Sbostic
23141349Sbostic /* Allocate space. */
23241349Sbostic r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize);
23341349Sbostic if (r == NULL)
23441349Sbostic FAIL("out of space");
23541349Sbostic
23641349Sbostic /* Second pass: emit code. */
23746619Sbostic regparse = (char *)exp;
23841349Sbostic regnpar = 1;
23941349Sbostic regcode = r->program;
24041349Sbostic regc(MAGIC);
24141349Sbostic if (reg(0, &flags) == NULL)
24241349Sbostic return(NULL);
24341349Sbostic
24441349Sbostic /* Dig out information for optimizations. */
24541349Sbostic r->regstart = '\0'; /* Worst-case defaults. */
24641349Sbostic r->reganch = 0;
24741349Sbostic r->regmust = NULL;
24841349Sbostic r->regmlen = 0;
24941349Sbostic scan = r->program+1; /* First BRANCH. */
25041349Sbostic if (OP(regnext(scan)) == END) { /* Only one top-level choice. */
25141349Sbostic scan = OPERAND(scan);
25241349Sbostic
25341349Sbostic /* Starting-point info. */
25441349Sbostic if (OP(scan) == EXACTLY)
25541349Sbostic r->regstart = *OPERAND(scan);
25641349Sbostic else if (OP(scan) == BOL)
25741349Sbostic r->reganch++;
25841349Sbostic
25941349Sbostic /*
26041349Sbostic * If there's something expensive in the r.e., find the
26141349Sbostic * longest literal string that must appear and make it the
26241349Sbostic * regmust. Resolve ties in favor of later strings, since
26341349Sbostic * the regstart check works with the beginning of the r.e.
26441349Sbostic * and avoiding duplication strengthens checking. Not a
26541349Sbostic * strong reason, but sufficient in the absence of others.
26641349Sbostic */
26741349Sbostic if (flags&SPSTART) {
26841349Sbostic longest = NULL;
26941349Sbostic len = 0;
27041349Sbostic for (; scan != NULL; scan = regnext(scan))
27141349Sbostic if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
27241349Sbostic longest = OPERAND(scan);
27341349Sbostic len = strlen(OPERAND(scan));
27441349Sbostic }
27541349Sbostic r->regmust = longest;
27641349Sbostic r->regmlen = len;
27741349Sbostic }
27841349Sbostic }
27941349Sbostic
28041349Sbostic return(r);
28141349Sbostic }
28241349Sbostic
28341349Sbostic /*
28441349Sbostic - reg - regular expression, i.e. main body or parenthesized thing
28541349Sbostic *
28641349Sbostic * Caller must absorb opening parenthesis.
28741349Sbostic *
28841349Sbostic * Combining parenthesis handling with the base level of regular expression
28941349Sbostic * is a trifle forced, but the need to tie the tails of the branches to what
29041349Sbostic * follows makes it hard to avoid.
29141349Sbostic */
29241349Sbostic static char *
reg(paren,flagp)29341349Sbostic reg(paren, flagp)
29441349Sbostic int paren; /* Parenthesized? */
29541349Sbostic int *flagp;
29641349Sbostic {
29741349Sbostic register char *ret;
29841349Sbostic register char *br;
29941349Sbostic register char *ender;
30041349Sbostic register int parno;
30141349Sbostic int flags;
30241349Sbostic
30341349Sbostic *flagp = HASWIDTH; /* Tentatively. */
30441349Sbostic
30541349Sbostic /* Make an OPEN node, if parenthesized. */
30641349Sbostic if (paren) {
30741349Sbostic if (regnpar >= NSUBEXP)
30841349Sbostic FAIL("too many ()");
30941349Sbostic parno = regnpar;
31041349Sbostic regnpar++;
31141349Sbostic ret = regnode(OPEN+parno);
31241349Sbostic } else
31341349Sbostic ret = NULL;
31441349Sbostic
31541349Sbostic /* Pick up the branches, linking them together. */
31641349Sbostic br = regbranch(&flags);
31741349Sbostic if (br == NULL)
31841349Sbostic return(NULL);
31941349Sbostic if (ret != NULL)
32041349Sbostic regtail(ret, br); /* OPEN -> first. */
32141349Sbostic else
32241349Sbostic ret = br;
32341349Sbostic if (!(flags&HASWIDTH))
32441349Sbostic *flagp &= ~HASWIDTH;
32541349Sbostic *flagp |= flags&SPSTART;
32641349Sbostic while (*regparse == '|' || *regparse == '\n') {
32741349Sbostic regparse++;
32841349Sbostic br = regbranch(&flags);
32941349Sbostic if (br == NULL)
33041349Sbostic return(NULL);
33141349Sbostic regtail(ret, br); /* BRANCH -> BRANCH. */
33241349Sbostic if (!(flags&HASWIDTH))
33341349Sbostic *flagp &= ~HASWIDTH;
33441349Sbostic *flagp |= flags&SPSTART;
33541349Sbostic }
33641349Sbostic
33741349Sbostic /* Make a closing node, and hook it on the end. */
33841349Sbostic ender = regnode((paren) ? CLOSE+parno : END);
33941349Sbostic regtail(ret, ender);
34041349Sbostic
34141349Sbostic /* Hook the tails of the branches to the closing node. */
34241349Sbostic for (br = ret; br != NULL; br = regnext(br))
34341349Sbostic regoptail(br, ender);
34441349Sbostic
34541349Sbostic /* Check for proper termination. */
34641349Sbostic if (paren && *regparse++ != ')') {
34741349Sbostic FAIL("unmatched ()");
34841349Sbostic } else if (!paren && *regparse != '\0') {
34941349Sbostic if (*regparse == ')') {
35041349Sbostic FAIL("unmatched ()");
35141349Sbostic } else
35241349Sbostic FAIL("junk on end"); /* "Can't happen". */
35341349Sbostic /* NOTREACHED */
35441349Sbostic }
35541349Sbostic
35641349Sbostic return(ret);
35741349Sbostic }
35841349Sbostic
35941349Sbostic /*
36041349Sbostic - regbranch - one alternative of an | operator
36141349Sbostic *
36241349Sbostic * Implements the concatenation operator.
36341349Sbostic */
36441349Sbostic static char *
regbranch(flagp)36541349Sbostic regbranch(flagp)
36641349Sbostic int *flagp;
36741349Sbostic {
36841349Sbostic register char *ret;
36941349Sbostic register char *chain;
37041349Sbostic register char *latest;
37141349Sbostic int flags;
37241349Sbostic
37341349Sbostic *flagp = WORST; /* Tentatively. */
37441349Sbostic
37541349Sbostic ret = regnode(BRANCH);
37641349Sbostic chain = NULL;
37741349Sbostic while (*regparse != '\0' && *regparse != ')' &&
37841349Sbostic *regparse != '\n' && *regparse != '|') {
37941349Sbostic latest = regpiece(&flags);
38041349Sbostic if (latest == NULL)
38141349Sbostic return(NULL);
38241349Sbostic *flagp |= flags&HASWIDTH;
38341349Sbostic if (chain == NULL) /* First piece. */
38441349Sbostic *flagp |= flags&SPSTART;
38541349Sbostic else
38641349Sbostic regtail(chain, latest);
38741349Sbostic chain = latest;
38841349Sbostic }
38941349Sbostic if (chain == NULL) /* Loop ran zero times. */
39041349Sbostic (void) regnode(NOTHING);
39141349Sbostic
39241349Sbostic return(ret);
39341349Sbostic }
39441349Sbostic
39541349Sbostic /*
39641349Sbostic - regpiece - something followed by possible [*+?]
39741349Sbostic *
39841349Sbostic * Note that the branching code sequences used for ? and the general cases
39941349Sbostic * of * and + are somewhat optimized: they use the same NOTHING node as
40041349Sbostic * both the endmarker for their branch list and the body of the last branch.
40141349Sbostic * It might seem that this node could be dispensed with entirely, but the
40241349Sbostic * endmarker role is not redundant.
40341349Sbostic */
40441349Sbostic static char *
regpiece(flagp)40541349Sbostic regpiece(flagp)
40641349Sbostic int *flagp;
40741349Sbostic {
40841349Sbostic register char *ret;
40941349Sbostic register char op;
41041349Sbostic register char *next;
41141349Sbostic int flags;
41241349Sbostic
41341349Sbostic ret = regatom(&flags);
41441349Sbostic if (ret == NULL)
41541349Sbostic return(NULL);
41641349Sbostic
41741349Sbostic op = *regparse;
41841349Sbostic if (!ISMULT(op)) {
41941349Sbostic *flagp = flags;
42041349Sbostic return(ret);
42141349Sbostic }
42241349Sbostic
42341349Sbostic if (!(flags&HASWIDTH) && op != '?')
42441349Sbostic FAIL("*+ operand could be empty");
42541349Sbostic *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
42641349Sbostic
42741349Sbostic if (op == '*' && (flags&SIMPLE))
42841349Sbostic reginsert(STAR, ret);
42941349Sbostic else if (op == '*') {
43041349Sbostic /* Emit x* as (x&|), where & means "self". */
43141349Sbostic reginsert(BRANCH, ret); /* Either x */
43241349Sbostic regoptail(ret, regnode(BACK)); /* and loop */
43341349Sbostic regoptail(ret, ret); /* back */
43441349Sbostic regtail(ret, regnode(BRANCH)); /* or */
43541349Sbostic regtail(ret, regnode(NOTHING)); /* null. */
43641349Sbostic } else if (op == '+' && (flags&SIMPLE))
43741349Sbostic reginsert(PLUS, ret);
43841349Sbostic else if (op == '+') {
43941349Sbostic /* Emit x+ as x(&|), where & means "self". */
44041349Sbostic next = regnode(BRANCH); /* Either */
44141349Sbostic regtail(ret, next);
44241349Sbostic regtail(regnode(BACK), ret); /* loop back */
44341349Sbostic regtail(next, regnode(BRANCH)); /* or */
44441349Sbostic regtail(ret, regnode(NOTHING)); /* null. */
44541349Sbostic } else if (op == '?') {
44641349Sbostic /* Emit x? as (x|) */
44741349Sbostic reginsert(BRANCH, ret); /* Either x */
44841349Sbostic regtail(ret, regnode(BRANCH)); /* or */
44941349Sbostic next = regnode(NOTHING); /* null. */
45041349Sbostic regtail(ret, next);
45141349Sbostic regoptail(ret, next);
45241349Sbostic }
45341349Sbostic regparse++;
45441349Sbostic if (ISMULT(*regparse))
45541349Sbostic FAIL("nested *?+");
45641349Sbostic
45741349Sbostic return(ret);
45841349Sbostic }
45941349Sbostic
46041349Sbostic /*
46141349Sbostic - regatom - the lowest level
46241349Sbostic *
46341349Sbostic * Optimization: gobbles an entire sequence of ordinary characters so that
46441349Sbostic * it can turn them into a single node, which is smaller to store and
46541349Sbostic * faster to run. Backslashed characters are exceptions, each becoming a
46641349Sbostic * separate node; the code is simpler that way and it's not worth fixing.
46741349Sbostic */
46841349Sbostic static char *
regatom(flagp)46941349Sbostic regatom(flagp)
47041349Sbostic int *flagp;
47141349Sbostic {
47241349Sbostic register char *ret;
47341349Sbostic int flags;
47441349Sbostic
47541349Sbostic *flagp = WORST; /* Tentatively. */
47641349Sbostic
47741349Sbostic switch (*regparse++) {
47841349Sbostic /* FIXME: these chars only have meaning at beg/end of pat? */
47941349Sbostic case '^':
48041349Sbostic ret = regnode(BOL);
48141349Sbostic break;
48241349Sbostic case '$':
48341349Sbostic ret = regnode(EOL);
48441349Sbostic break;
48541349Sbostic case '.':
48641349Sbostic ret = regnode(ANY);
48741349Sbostic *flagp |= HASWIDTH|SIMPLE;
48841349Sbostic break;
48941349Sbostic case '[': {
49041349Sbostic register int class;
49141349Sbostic register int classend;
49241349Sbostic
49341349Sbostic if (*regparse == '^') { /* Complement of range. */
49441349Sbostic ret = regnode(ANYBUT);
49541349Sbostic regparse++;
49641349Sbostic } else
49741349Sbostic ret = regnode(ANYOF);
49841349Sbostic if (*regparse == ']' || *regparse == '-')
49941349Sbostic regc(*regparse++);
50041349Sbostic while (*regparse != '\0' && *regparse != ']') {
50141349Sbostic if (*regparse == '-') {
50241349Sbostic regparse++;
50341349Sbostic if (*regparse == ']' || *regparse == '\0')
50441349Sbostic regc('-');
50541349Sbostic else {
50641349Sbostic class = UCHARAT(regparse-2)+1;
50741349Sbostic classend = UCHARAT(regparse);
50841349Sbostic if (class > classend+1)
50941349Sbostic FAIL("invalid [] range");
51041349Sbostic for (; class <= classend; class++)
51141349Sbostic regc(class);
51241349Sbostic regparse++;
51341349Sbostic }
51441349Sbostic } else
51541349Sbostic regc(*regparse++);
51641349Sbostic }
51741349Sbostic regc('\0');
51841349Sbostic if (*regparse != ']')
51941349Sbostic FAIL("unmatched []");
52041349Sbostic regparse++;
52141349Sbostic *flagp |= HASWIDTH|SIMPLE;
52241349Sbostic }
52341349Sbostic break;
52441349Sbostic case '(':
52541349Sbostic ret = reg(1, &flags);
52641349Sbostic if (ret == NULL)
52741349Sbostic return(NULL);
52841349Sbostic *flagp |= flags&(HASWIDTH|SPSTART);
52941349Sbostic break;
53041349Sbostic case '\0':
53141349Sbostic case '|':
53241349Sbostic case '\n':
53341349Sbostic case ')':
53441349Sbostic FAIL("internal urp"); /* Supposed to be caught earlier. */
53541349Sbostic break;
53641349Sbostic case '?':
53741349Sbostic case '+':
53841349Sbostic case '*':
53941349Sbostic FAIL("?+* follows nothing");
54041349Sbostic break;
54141349Sbostic case '\\':
54241349Sbostic switch (*regparse++) {
54341349Sbostic case '\0':
54441349Sbostic FAIL("trailing \\");
54541349Sbostic break;
54641349Sbostic case '<':
54741349Sbostic ret = regnode(WORDA);
54841349Sbostic break;
54941349Sbostic case '>':
55041349Sbostic ret = regnode(WORDZ);
55141349Sbostic break;
55241349Sbostic /* FIXME: Someday handle \1, \2, ... */
55341349Sbostic default:
55441349Sbostic /* Handle general quoted chars in exact-match routine */
55541349Sbostic goto de_fault;
55641349Sbostic }
55741349Sbostic break;
55841349Sbostic de_fault:
55941349Sbostic default:
56041349Sbostic /*
56141349Sbostic * Encode a string of characters to be matched exactly.
56241349Sbostic *
56341349Sbostic * This is a bit tricky due to quoted chars and due to
56441349Sbostic * '*', '+', and '?' taking the SINGLE char previous
56541349Sbostic * as their operand.
56641349Sbostic *
56741349Sbostic * On entry, the char at regparse[-1] is going to go
56841349Sbostic * into the string, no matter what it is. (It could be
56941349Sbostic * following a \ if we are entered from the '\' case.)
57041349Sbostic *
57141349Sbostic * Basic idea is to pick up a good char in ch and
57241349Sbostic * examine the next char. If it's *+? then we twiddle.
57341349Sbostic * If it's \ then we frozzle. If it's other magic char
57441349Sbostic * we push ch and terminate the string. If none of the
57541349Sbostic * above, we push ch on the string and go around again.
57641349Sbostic *
57741349Sbostic * regprev is used to remember where "the current char"
57841349Sbostic * starts in the string, if due to a *+? we need to back
57941349Sbostic * up and put the current char in a separate, 1-char, string.
58041349Sbostic * When regprev is NULL, ch is the only char in the
58141349Sbostic * string; this is used in *+? handling, and in setting
58241349Sbostic * flags |= SIMPLE at the end.
58341349Sbostic */
58441349Sbostic {
58541349Sbostic char *regprev;
58641349Sbostic register char ch;
58741349Sbostic
58841349Sbostic regparse--; /* Look at cur char */
58941349Sbostic ret = regnode(EXACTLY);
59041349Sbostic for ( regprev = 0 ; ; ) {
59141349Sbostic ch = *regparse++; /* Get current char */
59241349Sbostic switch (*regparse) { /* look at next one */
59341349Sbostic
59441349Sbostic default:
59541349Sbostic regc(ch); /* Add cur to string */
59641349Sbostic break;
59741349Sbostic
59841349Sbostic case '.': case '[': case '(':
59941349Sbostic case ')': case '|': case '\n':
60041349Sbostic case '$': case '^':
60141349Sbostic case '\0':
60241349Sbostic /* FIXME, $ and ^ should not always be magic */
60341349Sbostic magic:
60441349Sbostic regc(ch); /* dump cur char */
60541349Sbostic goto done; /* and we are done */
60641349Sbostic
60741349Sbostic case '?': case '+': case '*':
60841349Sbostic if (!regprev) /* If just ch in str, */
60941349Sbostic goto magic; /* use it */
61041349Sbostic /* End mult-char string one early */
61141349Sbostic regparse = regprev; /* Back up parse */
61241349Sbostic goto done;
61341349Sbostic
61441349Sbostic case '\\':
61541349Sbostic regc(ch); /* Cur char OK */
61641349Sbostic switch (regparse[1]){ /* Look after \ */
61741349Sbostic case '\0':
61841349Sbostic case '<':
61941349Sbostic case '>':
62041349Sbostic /* FIXME: Someday handle \1, \2, ... */
62141349Sbostic goto done; /* Not quoted */
62241349Sbostic default:
62341349Sbostic /* Backup point is \, scan * point is after it. */
62441349Sbostic regprev = regparse;
62541349Sbostic regparse++;
62641349Sbostic continue; /* NOT break; */
62741349Sbostic }
62841349Sbostic }
62941349Sbostic regprev = regparse; /* Set backup point */
63041349Sbostic }
63141349Sbostic done:
63241349Sbostic regc('\0');
63341349Sbostic *flagp |= HASWIDTH;
63441349Sbostic if (!regprev) /* One char? */
63541349Sbostic *flagp |= SIMPLE;
63641349Sbostic }
63741349Sbostic break;
63841349Sbostic }
63941349Sbostic
64041349Sbostic return(ret);
64141349Sbostic }
64241349Sbostic
64341349Sbostic /*
64441349Sbostic - regnode - emit a node
64541349Sbostic */
64641349Sbostic static char * /* Location. */
regnode(op)64741349Sbostic regnode(op)
64841349Sbostic char op;
64941349Sbostic {
65041349Sbostic register char *ret;
65141349Sbostic register char *ptr;
65241349Sbostic
65341349Sbostic ret = regcode;
65441349Sbostic if (ret == ®dummy) {
65541349Sbostic regsize += 3;
65641349Sbostic return(ret);
65741349Sbostic }
65841349Sbostic
65941349Sbostic ptr = ret;
66041349Sbostic *ptr++ = op;
66141349Sbostic *ptr++ = '\0'; /* Null "next" pointer. */
66241349Sbostic *ptr++ = '\0';
66341349Sbostic regcode = ptr;
66441349Sbostic
66541349Sbostic return(ret);
66641349Sbostic }
66741349Sbostic
66841349Sbostic /*
66941349Sbostic - regc - emit (if appropriate) a byte of code
67041349Sbostic */
67141349Sbostic static void
regc(b)67241349Sbostic regc(b)
67341349Sbostic char b;
67441349Sbostic {
67541349Sbostic if (regcode != ®dummy)
67641349Sbostic *regcode++ = b;
67741349Sbostic else
67841349Sbostic regsize++;
67941349Sbostic }
68041349Sbostic
68141349Sbostic /*
68241349Sbostic - reginsert - insert an operator in front of already-emitted operand
68341349Sbostic *
68441349Sbostic * Means relocating the operand.
68541349Sbostic */
68641349Sbostic static void
reginsert(op,opnd)68741349Sbostic reginsert(op, opnd)
68841349Sbostic char op;
68941349Sbostic char *opnd;
69041349Sbostic {
69141349Sbostic register char *src;
69241349Sbostic register char *dst;
69341349Sbostic register char *place;
69441349Sbostic
69541349Sbostic if (regcode == ®dummy) {
69641349Sbostic regsize += 3;
69741349Sbostic return;
69841349Sbostic }
69941349Sbostic
70041349Sbostic src = regcode;
70141349Sbostic regcode += 3;
70241349Sbostic dst = regcode;
70341349Sbostic while (src > opnd)
70441349Sbostic *--dst = *--src;
70541349Sbostic
70641349Sbostic place = opnd; /* Op node, where operand used to be. */
70741349Sbostic *place++ = op;
70841349Sbostic *place++ = '\0';
70941349Sbostic *place++ = '\0';
71041349Sbostic }
71141349Sbostic
71241349Sbostic /*
71341349Sbostic - regtail - set the next-pointer at the end of a node chain
71441349Sbostic */
71541349Sbostic static void
regtail(p,val)71641349Sbostic regtail(p, val)
71741349Sbostic char *p;
71841349Sbostic char *val;
71941349Sbostic {
72041349Sbostic register char *scan;
72141349Sbostic register char *temp;
72241349Sbostic register int offset;
72341349Sbostic
72441349Sbostic if (p == ®dummy)
72541349Sbostic return;
72641349Sbostic
72741349Sbostic /* Find last node. */
72841349Sbostic scan = p;
72941349Sbostic for (;;) {
73041349Sbostic temp = regnext(scan);
73141349Sbostic if (temp == NULL)
73241349Sbostic break;
73341349Sbostic scan = temp;
73441349Sbostic }
73541349Sbostic
73641349Sbostic if (OP(scan) == BACK)
73741349Sbostic offset = scan - val;
73841349Sbostic else
73941349Sbostic offset = val - scan;
74041349Sbostic *(scan+1) = (offset>>8)&0377;
74141349Sbostic *(scan+2) = offset&0377;
74241349Sbostic }
74341349Sbostic
74441349Sbostic /*
74541349Sbostic - regoptail - regtail on operand of first argument; nop if operandless
74641349Sbostic */
74741349Sbostic static void
regoptail(p,val)74841349Sbostic regoptail(p, val)
74941349Sbostic char *p;
75041349Sbostic char *val;
75141349Sbostic {
75241349Sbostic /* "Operandless" and "op != BRANCH" are synonymous in practice. */
75341349Sbostic if (p == NULL || p == ®dummy || OP(p) != BRANCH)
75441349Sbostic return;
75541349Sbostic regtail(OPERAND(p), val);
75641349Sbostic }
75741349Sbostic
75841349Sbostic /*
75941349Sbostic * regexec and friends
76041349Sbostic */
76141349Sbostic
76241349Sbostic /*
76341349Sbostic * Global work variables for regexec().
76441349Sbostic */
76541349Sbostic static char *reginput; /* String-input pointer. */
76641349Sbostic static char *regbol; /* Beginning of input, for ^ check. */
76741349Sbostic static char **regstartp; /* Pointer to startp array. */
76841349Sbostic static char **regendp; /* Ditto for endp. */
76941349Sbostic
77041349Sbostic /*
77141349Sbostic * Forwards.
77241349Sbostic */
77341349Sbostic STATIC int regtry();
77441349Sbostic STATIC int regmatch();
77541349Sbostic STATIC int regrepeat();
77641349Sbostic
77741349Sbostic #ifdef DEBUG
77841349Sbostic int regnarrate = 0;
77941349Sbostic void regdump();
78041349Sbostic STATIC char *regprop();
78141349Sbostic #endif
78241349Sbostic
78341349Sbostic /*
78441349Sbostic - regexec - match a regexp against a string
78541349Sbostic */
78641349Sbostic int
regexec(prog,string)78741349Sbostic regexec(prog, string)
78846619Sbostic register const regexp *prog;
78946619Sbostic register const char *string;
79041349Sbostic {
79141349Sbostic register char *s;
79241349Sbostic extern char *strchr();
79341349Sbostic
79441349Sbostic /* Be paranoid... */
79541349Sbostic if (prog == NULL || string == NULL) {
79641349Sbostic regerror("NULL parameter");
79741349Sbostic return(0);
79841349Sbostic }
79941349Sbostic
80041349Sbostic /* Check validity of program. */
80141349Sbostic if (UCHARAT(prog->program) != MAGIC) {
80241349Sbostic regerror("corrupted program");
80341349Sbostic return(0);
80441349Sbostic }
80541349Sbostic
80641349Sbostic /* If there is a "must appear" string, look for it. */
80741349Sbostic if (prog->regmust != NULL) {
80846619Sbostic s = (char *)string;
80941349Sbostic while ((s = strchr(s, prog->regmust[0])) != NULL) {
81041349Sbostic if (strncmp(s, prog->regmust, prog->regmlen) == 0)
81141349Sbostic break; /* Found it. */
81241349Sbostic s++;
81341349Sbostic }
81441349Sbostic if (s == NULL) /* Not present. */
81541349Sbostic return(0);
81641349Sbostic }
81741349Sbostic
81841349Sbostic /* Mark beginning of line for ^ . */
81946619Sbostic regbol = (char *)string;
82041349Sbostic
82141349Sbostic /* Simplest case: anchored match need be tried only once. */
82241349Sbostic if (prog->reganch)
82341349Sbostic return(regtry(prog, string));
82441349Sbostic
82541349Sbostic /* Messy cases: unanchored match. */
82646619Sbostic s = (char *)string;
82741349Sbostic if (prog->regstart != '\0')
82841349Sbostic /* We know what char it must start with. */
82941349Sbostic while ((s = strchr(s, prog->regstart)) != NULL) {
83041349Sbostic if (regtry(prog, s))
83141349Sbostic return(1);
83241349Sbostic s++;
83341349Sbostic }
83441349Sbostic else
83541349Sbostic /* We don't -- general case. */
83641349Sbostic do {
83741349Sbostic if (regtry(prog, s))
83841349Sbostic return(1);
83941349Sbostic } while (*s++ != '\0');
84041349Sbostic
84141349Sbostic /* Failure. */
84241349Sbostic return(0);
84341349Sbostic }
84441349Sbostic
84541349Sbostic /*
84641349Sbostic - regtry - try match at specific point
84741349Sbostic */
84841349Sbostic static int /* 0 failure, 1 success */
regtry(prog,string)84941349Sbostic regtry(prog, string)
85041349Sbostic regexp *prog;
85141349Sbostic char *string;
85241349Sbostic {
85341349Sbostic register int i;
85441349Sbostic register char **sp;
85541349Sbostic register char **ep;
85641349Sbostic
85741349Sbostic reginput = string;
85841349Sbostic regstartp = prog->startp;
85941349Sbostic regendp = prog->endp;
86041349Sbostic
86141349Sbostic sp = prog->startp;
86241349Sbostic ep = prog->endp;
86341349Sbostic for (i = NSUBEXP; i > 0; i--) {
86441349Sbostic *sp++ = NULL;
86541349Sbostic *ep++ = NULL;
86641349Sbostic }
86741349Sbostic if (regmatch(prog->program + 1)) {
86841349Sbostic prog->startp[0] = string;
86941349Sbostic prog->endp[0] = reginput;
87041349Sbostic return(1);
87141349Sbostic } else
87241349Sbostic return(0);
87341349Sbostic }
87441349Sbostic
87541349Sbostic /*
87641349Sbostic - regmatch - main matching routine
87741349Sbostic *
87841349Sbostic * Conceptually the strategy is simple: check to see whether the current
87941349Sbostic * node matches, call self recursively to see whether the rest matches,
88041349Sbostic * and then act accordingly. In practice we make some effort to avoid
88141349Sbostic * recursion, in particular by going through "ordinary" nodes (that don't
88241349Sbostic * need to know whether the rest of the match failed) by a loop instead of
88341349Sbostic * by recursion.
88441349Sbostic */
88541349Sbostic static int /* 0 failure, 1 success */
regmatch(prog)88641349Sbostic regmatch(prog)
88741349Sbostic char *prog;
88841349Sbostic {
88941349Sbostic register char *scan; /* Current node. */
89041349Sbostic char *next; /* Next node. */
89141349Sbostic extern char *strchr();
89241349Sbostic
89341349Sbostic scan = prog;
89441349Sbostic #ifdef DEBUG
89541349Sbostic if (scan != NULL && regnarrate)
89641349Sbostic fprintf(stderr, "%s(\n", regprop(scan));
89741349Sbostic #endif
89841349Sbostic while (scan != NULL) {
89941349Sbostic #ifdef DEBUG
90041349Sbostic if (regnarrate)
90141349Sbostic fprintf(stderr, "%s...\n", regprop(scan));
90241349Sbostic #endif
90341349Sbostic next = regnext(scan);
90441349Sbostic
90541349Sbostic switch (OP(scan)) {
90641349Sbostic case BOL:
90741349Sbostic if (reginput != regbol)
90841349Sbostic return(0);
90941349Sbostic break;
91041349Sbostic case EOL:
91141349Sbostic if (*reginput != '\0')
91241349Sbostic return(0);
91341349Sbostic break;
91441349Sbostic case WORDA:
91541349Sbostic /* Must be looking at a letter, digit, or _ */
91641349Sbostic if ((!isalnum(*reginput)) && *reginput != '_')
91741349Sbostic return(0);
91841349Sbostic /* Prev must be BOL or nonword */
91941349Sbostic if (reginput > regbol &&
92041349Sbostic (isalnum(reginput[-1]) || reginput[-1] == '_'))
92141349Sbostic return(0);
92241349Sbostic break;
92341349Sbostic case WORDZ:
92441349Sbostic /* Must be looking at non letter, digit, or _ */
92541349Sbostic if (isalnum(*reginput) || *reginput == '_')
92641349Sbostic return(0);
92741349Sbostic /* We don't care what the previous char was */
92841349Sbostic break;
92941349Sbostic case ANY:
93041349Sbostic if (*reginput == '\0')
93141349Sbostic return(0);
93241349Sbostic reginput++;
93341349Sbostic break;
93441349Sbostic case EXACTLY: {
93541349Sbostic register int len;
93641349Sbostic register char *opnd;
93741349Sbostic
93841349Sbostic opnd = OPERAND(scan);
93941349Sbostic /* Inline the first character, for speed. */
94041349Sbostic if (*opnd != *reginput)
94141349Sbostic return(0);
94241349Sbostic len = strlen(opnd);
94341349Sbostic if (len > 1 && strncmp(opnd, reginput, len) != 0)
94441349Sbostic return(0);
94541349Sbostic reginput += len;
94641349Sbostic }
94741349Sbostic break;
94841349Sbostic case ANYOF:
94941349Sbostic if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
95041349Sbostic return(0);
95141349Sbostic reginput++;
95241349Sbostic break;
95341349Sbostic case ANYBUT:
95441349Sbostic if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
95541349Sbostic return(0);
95641349Sbostic reginput++;
95741349Sbostic break;
95841349Sbostic case NOTHING:
95941349Sbostic break;
96041349Sbostic case BACK:
96141349Sbostic break;
96241349Sbostic case OPEN+1:
96341349Sbostic case OPEN+2:
96441349Sbostic case OPEN+3:
96541349Sbostic case OPEN+4:
96641349Sbostic case OPEN+5:
96741349Sbostic case OPEN+6:
96841349Sbostic case OPEN+7:
96941349Sbostic case OPEN+8:
97041349Sbostic case OPEN+9: {
97141349Sbostic register int no;
97241349Sbostic register char *save;
97341349Sbostic
97441349Sbostic no = OP(scan) - OPEN;
97541349Sbostic save = reginput;
97641349Sbostic
97741349Sbostic if (regmatch(next)) {
97841349Sbostic /*
97941349Sbostic * Don't set startp if some later
98041349Sbostic * invocation of the same parentheses
98141349Sbostic * already has.
98241349Sbostic */
98341349Sbostic if (regstartp[no] == NULL)
98441349Sbostic regstartp[no] = save;
98541349Sbostic return(1);
98641349Sbostic } else
98741349Sbostic return(0);
98841349Sbostic }
98941349Sbostic break;
99041349Sbostic case CLOSE+1:
99141349Sbostic case CLOSE+2:
99241349Sbostic case CLOSE+3:
99341349Sbostic case CLOSE+4:
99441349Sbostic case CLOSE+5:
99541349Sbostic case CLOSE+6:
99641349Sbostic case CLOSE+7:
99741349Sbostic case CLOSE+8:
99841349Sbostic case CLOSE+9: {
99941349Sbostic register int no;
100041349Sbostic register char *save;
100141349Sbostic
100241349Sbostic no = OP(scan) - CLOSE;
100341349Sbostic save = reginput;
100441349Sbostic
100541349Sbostic if (regmatch(next)) {
100641349Sbostic /*
100741349Sbostic * Don't set endp if some later
100841349Sbostic * invocation of the same parentheses
100941349Sbostic * already has.
101041349Sbostic */
101141349Sbostic if (regendp[no] == NULL)
101241349Sbostic regendp[no] = save;
101341349Sbostic return(1);
101441349Sbostic } else
101541349Sbostic return(0);
101641349Sbostic }
101741349Sbostic break;
101841349Sbostic case BRANCH: {
101941349Sbostic register char *save;
102041349Sbostic
102141349Sbostic if (OP(next) != BRANCH) /* No choice. */
102241349Sbostic next = OPERAND(scan); /* Avoid recursion. */
102341349Sbostic else {
102441349Sbostic do {
102541349Sbostic save = reginput;
102641349Sbostic if (regmatch(OPERAND(scan)))
102741349Sbostic return(1);
102841349Sbostic reginput = save;
102941349Sbostic scan = regnext(scan);
103041349Sbostic } while (scan != NULL && OP(scan) == BRANCH);
103141349Sbostic return(0);
103241349Sbostic /* NOTREACHED */
103341349Sbostic }
103441349Sbostic }
103541349Sbostic break;
103641349Sbostic case STAR:
103741349Sbostic case PLUS: {
103841349Sbostic register char nextch;
103941349Sbostic register int no;
104041349Sbostic register char *save;
104141349Sbostic register int min;
104241349Sbostic
104341349Sbostic /*
104441349Sbostic * Lookahead to avoid useless match attempts
104541349Sbostic * when we know what character comes next.
104641349Sbostic */
104741349Sbostic nextch = '\0';
104841349Sbostic if (OP(next) == EXACTLY)
104941349Sbostic nextch = *OPERAND(next);
105041349Sbostic min = (OP(scan) == STAR) ? 0 : 1;
105141349Sbostic save = reginput;
105241349Sbostic no = regrepeat(OPERAND(scan));
105341349Sbostic while (no >= min) {
105441349Sbostic /* If it could work, try it. */
105541349Sbostic if (nextch == '\0' || *reginput == nextch)
105641349Sbostic if (regmatch(next))
105741349Sbostic return(1);
105841349Sbostic /* Couldn't or didn't -- back up. */
105941349Sbostic no--;
106041349Sbostic reginput = save + no;
106141349Sbostic }
106241349Sbostic return(0);
106341349Sbostic }
106441349Sbostic break;
106541349Sbostic case END:
106641349Sbostic return(1); /* Success! */
106741349Sbostic break;
106841349Sbostic default:
106941349Sbostic regerror("memory corruption");
107041349Sbostic return(0);
107141349Sbostic break;
107241349Sbostic }
107341349Sbostic
107441349Sbostic scan = next;
107541349Sbostic }
107641349Sbostic
107741349Sbostic /*
107841349Sbostic * We get here only if there's trouble -- normally "case END" is
107941349Sbostic * the terminating point.
108041349Sbostic */
108141349Sbostic regerror("corrupted pointers");
108241349Sbostic return(0);
108341349Sbostic }
108441349Sbostic
108541349Sbostic /*
108641349Sbostic - regrepeat - repeatedly match something simple, report how many
108741349Sbostic */
108841349Sbostic static int
regrepeat(p)108941349Sbostic regrepeat(p)
109041349Sbostic char *p;
109141349Sbostic {
109241349Sbostic register int count = 0;
109341349Sbostic register char *scan;
109441349Sbostic register char *opnd;
109541349Sbostic
109641349Sbostic scan = reginput;
109741349Sbostic opnd = OPERAND(p);
109841349Sbostic switch (OP(p)) {
109941349Sbostic case ANY:
110041349Sbostic count = strlen(scan);
110141349Sbostic scan += count;
110241349Sbostic break;
110341349Sbostic case EXACTLY:
110441349Sbostic while (*opnd == *scan) {
110541349Sbostic count++;
110641349Sbostic scan++;
110741349Sbostic }
110841349Sbostic break;
110941349Sbostic case ANYOF:
111041349Sbostic while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
111141349Sbostic count++;
111241349Sbostic scan++;
111341349Sbostic }
111441349Sbostic break;
111541349Sbostic case ANYBUT:
111641349Sbostic while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
111741349Sbostic count++;
111841349Sbostic scan++;
111941349Sbostic }
112041349Sbostic break;
112141349Sbostic default: /* Oh dear. Called inappropriately. */
112241349Sbostic regerror("internal foulup");
112341349Sbostic count = 0; /* Best compromise. */
112441349Sbostic break;
112541349Sbostic }
112641349Sbostic reginput = scan;
112741349Sbostic
112841349Sbostic return(count);
112941349Sbostic }
113041349Sbostic
113141349Sbostic /*
113241349Sbostic - regnext - dig the "next" pointer out of a node
113341349Sbostic */
113441349Sbostic static char *
regnext(p)113541349Sbostic regnext(p)
113641349Sbostic register char *p;
113741349Sbostic {
113841349Sbostic register int offset;
113941349Sbostic
114041349Sbostic if (p == ®dummy)
114141349Sbostic return(NULL);
114241349Sbostic
114341349Sbostic offset = NEXT(p);
114441349Sbostic if (offset == 0)
114541349Sbostic return(NULL);
114641349Sbostic
114741349Sbostic if (OP(p) == BACK)
114841349Sbostic return(p-offset);
114941349Sbostic else
115041349Sbostic return(p+offset);
115141349Sbostic }
115241349Sbostic
115341349Sbostic #ifdef DEBUG
115441349Sbostic
115541349Sbostic STATIC char *regprop();
115641349Sbostic
115741349Sbostic /*
115841349Sbostic - regdump - dump a regexp onto stdout in vaguely comprehensible form
115941349Sbostic */
116041349Sbostic void
regdump(r)116141349Sbostic regdump(r)
116241349Sbostic regexp *r;
116341349Sbostic {
116441349Sbostic register char *s;
116541349Sbostic register char op = EXACTLY; /* Arbitrary non-END op. */
116641349Sbostic register char *next;
116741349Sbostic extern char *strchr();
116841349Sbostic
116941349Sbostic
117041349Sbostic s = r->program + 1;
117141349Sbostic while (op != END) { /* While that wasn't END last time... */
117241349Sbostic op = OP(s);
117341349Sbostic printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */
117441349Sbostic next = regnext(s);
117541349Sbostic if (next == NULL) /* Next ptr. */
117641349Sbostic printf("(0)");
117741349Sbostic else
117841349Sbostic printf("(%d)", (s-r->program)+(next-s));
117941349Sbostic s += 3;
118041349Sbostic if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
118141349Sbostic /* Literal string, where present. */
118241349Sbostic while (*s != '\0') {
118341349Sbostic putchar(*s);
118441349Sbostic s++;
118541349Sbostic }
118641349Sbostic s++;
118741349Sbostic }
118841349Sbostic putchar('\n');
118941349Sbostic }
119041349Sbostic
119141349Sbostic /* Header fields of interest. */
119241349Sbostic if (r->regstart != '\0')
119341349Sbostic printf("start `%c' ", r->regstart);
119441349Sbostic if (r->reganch)
119541349Sbostic printf("anchored ");
119641349Sbostic if (r->regmust != NULL)
119741349Sbostic printf("must have \"%s\"", r->regmust);
119841349Sbostic printf("\n");
119941349Sbostic }
120041349Sbostic
120141349Sbostic /*
120241349Sbostic - regprop - printable representation of opcode
120341349Sbostic */
120441349Sbostic static char *
regprop(op)120541349Sbostic regprop(op)
120641349Sbostic char *op;
120741349Sbostic {
120841349Sbostic register char *p;
120941349Sbostic static char buf[50];
121041349Sbostic
121141349Sbostic (void) strcpy(buf, ":");
121241349Sbostic
121341349Sbostic switch (OP(op)) {
121441349Sbostic case BOL:
121541349Sbostic p = "BOL";
121641349Sbostic break;
121741349Sbostic case EOL:
121841349Sbostic p = "EOL";
121941349Sbostic break;
122041349Sbostic case ANY:
122141349Sbostic p = "ANY";
122241349Sbostic break;
122341349Sbostic case ANYOF:
122441349Sbostic p = "ANYOF";
122541349Sbostic break;
122641349Sbostic case ANYBUT:
122741349Sbostic p = "ANYBUT";
122841349Sbostic break;
122941349Sbostic case BRANCH:
123041349Sbostic p = "BRANCH";
123141349Sbostic break;
123241349Sbostic case EXACTLY:
123341349Sbostic p = "EXACTLY";
123441349Sbostic break;
123541349Sbostic case NOTHING:
123641349Sbostic p = "NOTHING";
123741349Sbostic break;
123841349Sbostic case BACK:
123941349Sbostic p = "BACK";
124041349Sbostic break;
124141349Sbostic case END:
124241349Sbostic p = "END";
124341349Sbostic break;
124441349Sbostic case OPEN+1:
124541349Sbostic case OPEN+2:
124641349Sbostic case OPEN+3:
124741349Sbostic case OPEN+4:
124841349Sbostic case OPEN+5:
124941349Sbostic case OPEN+6:
125041349Sbostic case OPEN+7:
125141349Sbostic case OPEN+8:
125241349Sbostic case OPEN+9:
125341349Sbostic sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
125441349Sbostic p = NULL;
125541349Sbostic break;
125641349Sbostic case CLOSE+1:
125741349Sbostic case CLOSE+2:
125841349Sbostic case CLOSE+3:
125941349Sbostic case CLOSE+4:
126041349Sbostic case CLOSE+5:
126141349Sbostic case CLOSE+6:
126241349Sbostic case CLOSE+7:
126341349Sbostic case CLOSE+8:
126441349Sbostic case CLOSE+9:
126541349Sbostic sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
126641349Sbostic p = NULL;
126741349Sbostic break;
126841349Sbostic case STAR:
126941349Sbostic p = "STAR";
127041349Sbostic break;
127141349Sbostic case PLUS:
127241349Sbostic p = "PLUS";
127341349Sbostic break;
127441349Sbostic case WORDA:
127541349Sbostic p = "WORDA";
127641349Sbostic break;
127741349Sbostic case WORDZ:
127841349Sbostic p = "WORDZ";
127941349Sbostic break;
128041349Sbostic default:
128141349Sbostic regerror("corrupted opcode");
128241349Sbostic break;
128341349Sbostic }
128441349Sbostic if (p != NULL)
128541349Sbostic (void) strcat(buf, p);
128641349Sbostic return(buf);
128741349Sbostic }
128841349Sbostic #endif
128941349Sbostic
129041349Sbostic /*
129141349Sbostic * The following is provided for those people who do not have strcspn() in
129241349Sbostic * their C libraries. They should get off their butts and do something
129341349Sbostic * about it; at least one public-domain implementation of those (highly
129441349Sbostic * useful) string routines has been published on Usenet.
129541349Sbostic */
129641349Sbostic #ifdef STRCSPN
129741349Sbostic /*
129841349Sbostic * strcspn - find length of initial segment of s1 consisting entirely
129941349Sbostic * of characters not from s2
130041349Sbostic */
130141349Sbostic
130241349Sbostic static int
strcspn(s1,s2)130341349Sbostic strcspn(s1, s2)
130441349Sbostic char *s1;
130541349Sbostic char *s2;
130641349Sbostic {
130741349Sbostic register char *scan1;
130841349Sbostic register char *scan2;
130941349Sbostic register int count;
131041349Sbostic
131141349Sbostic count = 0;
131241349Sbostic for (scan1 = s1; *scan1 != '\0'; scan1++) {
131341349Sbostic for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */
131441349Sbostic if (*scan1 == *scan2++)
131541349Sbostic return(count);
131641349Sbostic count++;
131741349Sbostic }
131841349Sbostic return(count);
131941349Sbostic }
132041349Sbostic #endif
1321