141349Sbostic /* 241349Sbostic * regcomp and regexec -- regsub and regerror are elsewhere 341349Sbostic * 441349Sbostic * Copyright (c) 1986 by University of Toronto. 541349Sbostic * Written by Henry Spencer. Not derived from licensed software. 641349Sbostic * 741349Sbostic * Permission is granted to anyone to use this software for any 841349Sbostic * purpose on any computer system, and to redistribute it freely, 941349Sbostic * subject to the following restrictions: 1041349Sbostic * 1141349Sbostic * 1. The author is not responsible for the consequences of use of 1241349Sbostic * this software, no matter how awful, even if they arise 1341349Sbostic * from defects in it. 1441349Sbostic * 1541349Sbostic * 2. The origin of this software must not be misrepresented, either 1641349Sbostic * by explicit claim or by omission. 1741349Sbostic * 1841349Sbostic * 3. Altered versions must be plainly marked as such, and must not 1941349Sbostic * be misrepresented as being the original software. 2041349Sbostic *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, 2141349Sbostic *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to | 2241349Sbostic *** to assist in implementing egrep. 2341349Sbostic *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, 2441349Sbostic *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching 2541349Sbostic *** as in BSD grep and ex. 2641349Sbostic *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, 2741349Sbostic *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \. 2841349Sbostic *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods, 2941349Sbostic *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy. 3041349Sbostic * 3141349Sbostic * Beware that some of this code is subtly aware of the way operator 3241349Sbostic * precedence is structured in regular expressions. Serious changes in 3341349Sbostic * regular-expression syntax might require a total rethink. 3441349Sbostic */ 35*46619Sbostic #include <regexp.h> 3641349Sbostic #include <stdio.h> 3741349Sbostic #include <ctype.h> 38*46619Sbostic #include <stdlib.h> 39*46619Sbostic #include <string.h> 4041349Sbostic #include "regmagic.h" 4141349Sbostic 4241349Sbostic /* 4341349Sbostic * The "internal use only" fields in regexp.h are present to pass info from 4441349Sbostic * compile to execute that permits the execute phase to run lots faster on 4541349Sbostic * simple cases. They are: 4641349Sbostic * 4741349Sbostic * regstart char that must begin a match; '\0' if none obvious 4841349Sbostic * reganch is the match anchored (at beginning-of-line only)? 4941349Sbostic * regmust string (pointer into program) that match must include, or NULL 5041349Sbostic * regmlen length of regmust string 5141349Sbostic * 5241349Sbostic * Regstart and reganch permit very fast decisions on suitable starting points 5341349Sbostic * for a match, cutting down the work a lot. Regmust permits fast rejection 5441349Sbostic * of lines that cannot possibly match. The regmust tests are costly enough 5541349Sbostic * that regcomp() supplies a regmust only if the r.e. contains something 5641349Sbostic * potentially expensive (at present, the only such thing detected is * or + 5741349Sbostic * at the start of the r.e., which can involve a lot of backup). Regmlen is 5841349Sbostic * supplied because the test in regexec() needs it and regcomp() is computing 5941349Sbostic * it anyway. 6041349Sbostic */ 6141349Sbostic 6241349Sbostic /* 6341349Sbostic * Structure for regexp "program". This is essentially a linear encoding 6441349Sbostic * of a nondeterministic finite-state machine (aka syntax charts or 6541349Sbostic * "railroad normal form" in parsing technology). Each node is an opcode 6641349Sbostic * plus a "next" pointer, possibly plus an operand. "Next" pointers of 6741349Sbostic * all nodes except BRANCH implement concatenation; a "next" pointer with 6841349Sbostic * a BRANCH on both ends of it is connecting two alternatives. (Here we 6941349Sbostic * have one of the subtle syntax dependencies: an individual BRANCH (as 7041349Sbostic * opposed to a collection of them) is never concatenated with anything 7141349Sbostic * because of operator precedence.) The operand of some types of node is 7241349Sbostic * a literal string; for others, it is a node leading into a sub-FSM. In 7341349Sbostic * particular, the operand of a BRANCH node is the first node of the branch. 7441349Sbostic * (NB this is *not* a tree structure: the tail of the branch connects 7541349Sbostic * to the thing following the set of BRANCHes.) The opcodes are: 7641349Sbostic */ 7741349Sbostic 7841349Sbostic /* definition number opnd? meaning */ 7941349Sbostic #define END 0 /* no End of program. */ 8041349Sbostic #define BOL 1 /* no Match "" at beginning of line. */ 8141349Sbostic #define EOL 2 /* no Match "" at end of line. */ 8241349Sbostic #define ANY 3 /* no Match any one character. */ 8341349Sbostic #define ANYOF 4 /* str Match any character in this string. */ 8441349Sbostic #define ANYBUT 5 /* str Match any character not in this string. */ 8541349Sbostic #define BRANCH 6 /* node Match this alternative, or the next... */ 8641349Sbostic #define BACK 7 /* no Match "", "next" ptr points backward. */ 8741349Sbostic #define EXACTLY 8 /* str Match this string. */ 8841349Sbostic #define NOTHING 9 /* no Match empty string. */ 8941349Sbostic #define STAR 10 /* node Match this (simple) thing 0 or more times. */ 9041349Sbostic #define PLUS 11 /* node Match this (simple) thing 1 or more times. */ 9141349Sbostic #define WORDA 12 /* no Match "" at wordchar, where prev is nonword */ 9241349Sbostic #define WORDZ 13 /* no Match "" at nonwordchar, where prev is word */ 9341349Sbostic #define OPEN 20 /* no Mark this point in input as start of #n. */ 9441349Sbostic /* OPEN+1 is number 1, etc. */ 9541349Sbostic #define CLOSE 30 /* no Analogous to OPEN. */ 9641349Sbostic 9741349Sbostic /* 9841349Sbostic * Opcode notes: 9941349Sbostic * 10041349Sbostic * BRANCH The set of branches constituting a single choice are hooked 10141349Sbostic * together with their "next" pointers, since precedence prevents 10241349Sbostic * anything being concatenated to any individual branch. The 10341349Sbostic * "next" pointer of the last BRANCH in a choice points to the 10441349Sbostic * thing following the whole choice. This is also where the 10541349Sbostic * final "next" pointer of each individual branch points; each 10641349Sbostic * branch starts with the operand node of a BRANCH node. 10741349Sbostic * 10841349Sbostic * BACK Normal "next" pointers all implicitly point forward; BACK 10941349Sbostic * exists to make loop structures possible. 11041349Sbostic * 11141349Sbostic * STAR,PLUS '?', and complex '*' and '+', are implemented as circular 11241349Sbostic * BRANCH structures using BACK. Simple cases (one character 11341349Sbostic * per match) are implemented with STAR and PLUS for speed 11441349Sbostic * and to minimize recursive plunges. 11541349Sbostic * 11641349Sbostic * OPEN,CLOSE ...are numbered at compile time. 11741349Sbostic */ 11841349Sbostic 11941349Sbostic /* 12041349Sbostic * A node is one char of opcode followed by two chars of "next" pointer. 12141349Sbostic * "Next" pointers are stored as two 8-bit pieces, high order first. The 12241349Sbostic * value is a positive offset from the opcode of the node containing it. 12341349Sbostic * An operand, if any, simply follows the node. (Note that much of the 12441349Sbostic * code generation knows about this implicit relationship.) 12541349Sbostic * 12641349Sbostic * Using two bytes for the "next" pointer is vast overkill for most things, 12741349Sbostic * but allows patterns to get big without disasters. 12841349Sbostic */ 12941349Sbostic #define OP(p) (*(p)) 13041349Sbostic #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377)) 13141349Sbostic #define OPERAND(p) ((p) + 3) 13241349Sbostic 13341349Sbostic /* 13441349Sbostic * See regmagic.h for one further detail of program structure. 13541349Sbostic */ 13641349Sbostic 13741349Sbostic 13841349Sbostic /* 13941349Sbostic * Utility definitions. 14041349Sbostic */ 14141349Sbostic #ifndef CHARBITS 14241349Sbostic #define UCHARAT(p) ((int)*(unsigned char *)(p)) 14341349Sbostic #else 14441349Sbostic #define UCHARAT(p) ((int)*(p)&CHARBITS) 14541349Sbostic #endif 14641349Sbostic 14741349Sbostic #define FAIL(m) { regerror(m); return(NULL); } 14841349Sbostic #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?') 14941349Sbostic 15041349Sbostic /* 15141349Sbostic * Flags to be passed up and down. 15241349Sbostic */ 15341349Sbostic #define HASWIDTH 01 /* Known never to match null string. */ 15441349Sbostic #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */ 15541349Sbostic #define SPSTART 04 /* Starts with * or +. */ 15641349Sbostic #define WORST 0 /* Worst case. */ 15741349Sbostic 15841349Sbostic /* 15941349Sbostic * Global work variables for regcomp(). 16041349Sbostic */ 16141349Sbostic static char *regparse; /* Input-scan pointer. */ 16241349Sbostic static int regnpar; /* () count. */ 16341349Sbostic static char regdummy; 16441349Sbostic static char *regcode; /* Code-emit pointer; ®dummy = don't. */ 16541349Sbostic static long regsize; /* Code size. */ 16641349Sbostic 16741349Sbostic /* 16841349Sbostic * Forward declarations for regcomp()'s friends. 16941349Sbostic */ 17041349Sbostic #ifndef STATIC 17141349Sbostic #define STATIC static 17241349Sbostic #endif 17341349Sbostic STATIC char *reg(); 17441349Sbostic STATIC char *regbranch(); 17541349Sbostic STATIC char *regpiece(); 17641349Sbostic STATIC char *regatom(); 17741349Sbostic STATIC char *regnode(); 17841349Sbostic STATIC char *regnext(); 17941349Sbostic STATIC void regc(); 18041349Sbostic STATIC void reginsert(); 18141349Sbostic STATIC void regtail(); 18241349Sbostic STATIC void regoptail(); 18341349Sbostic #ifdef STRCSPN 18441349Sbostic STATIC int strcspn(); 18541349Sbostic #endif 18641349Sbostic 18741349Sbostic /* 18841349Sbostic - regcomp - compile a regular expression into internal code 18941349Sbostic * 19041349Sbostic * We can't allocate space until we know how big the compiled form will be, 19141349Sbostic * but we can't compile it (and thus know how big it is) until we've got a 19241349Sbostic * place to put the code. So we cheat: we compile it twice, once with code 19341349Sbostic * generation turned off and size counting turned on, and once "for real". 19441349Sbostic * This also means that we don't allocate space until we are sure that the 19541349Sbostic * thing really will compile successfully, and we never have to move the 19641349Sbostic * code and thus invalidate pointers into it. (Note that it has to be in 19741349Sbostic * one piece because free() must be able to free it all.) 19841349Sbostic * 19941349Sbostic * Beware that the optimization-preparation code in here knows about some 20041349Sbostic * of the structure of the compiled regexp. 20141349Sbostic */ 20241349Sbostic regexp * 20341349Sbostic regcomp(exp) 204*46619Sbostic const char *exp; 20541349Sbostic { 20641349Sbostic register regexp *r; 20741349Sbostic register char *scan; 20841349Sbostic register char *longest; 20941349Sbostic register int len; 21041349Sbostic int flags; 21141349Sbostic 21241349Sbostic if (exp == NULL) 21341349Sbostic FAIL("NULL argument"); 21441349Sbostic 21541349Sbostic /* First pass: determine size, legality. */ 21641349Sbostic if (exp[0] == '.' && exp[1] == '*') exp += 2; /* aid grep */ 217*46619Sbostic regparse = (char *)exp; 21841349Sbostic regnpar = 1; 21941349Sbostic regsize = 0L; 22041349Sbostic regcode = ®dummy; 22141349Sbostic regc(MAGIC); 22241349Sbostic if (reg(0, &flags) == NULL) 22341349Sbostic return(NULL); 22441349Sbostic 22541349Sbostic /* Small enough for pointer-storage convention? */ 22641349Sbostic if (regsize >= 32767L) /* Probably could be 65535L. */ 22741349Sbostic FAIL("regexp too big"); 22841349Sbostic 22941349Sbostic /* Allocate space. */ 23041349Sbostic r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize); 23141349Sbostic if (r == NULL) 23241349Sbostic FAIL("out of space"); 23341349Sbostic 23441349Sbostic /* Second pass: emit code. */ 235*46619Sbostic regparse = (char *)exp; 23641349Sbostic regnpar = 1; 23741349Sbostic regcode = r->program; 23841349Sbostic regc(MAGIC); 23941349Sbostic if (reg(0, &flags) == NULL) 24041349Sbostic return(NULL); 24141349Sbostic 24241349Sbostic /* Dig out information for optimizations. */ 24341349Sbostic r->regstart = '\0'; /* Worst-case defaults. */ 24441349Sbostic r->reganch = 0; 24541349Sbostic r->regmust = NULL; 24641349Sbostic r->regmlen = 0; 24741349Sbostic scan = r->program+1; /* First BRANCH. */ 24841349Sbostic if (OP(regnext(scan)) == END) { /* Only one top-level choice. */ 24941349Sbostic scan = OPERAND(scan); 25041349Sbostic 25141349Sbostic /* Starting-point info. */ 25241349Sbostic if (OP(scan) == EXACTLY) 25341349Sbostic r->regstart = *OPERAND(scan); 25441349Sbostic else if (OP(scan) == BOL) 25541349Sbostic r->reganch++; 25641349Sbostic 25741349Sbostic /* 25841349Sbostic * If there's something expensive in the r.e., find the 25941349Sbostic * longest literal string that must appear and make it the 26041349Sbostic * regmust. Resolve ties in favor of later strings, since 26141349Sbostic * the regstart check works with the beginning of the r.e. 26241349Sbostic * and avoiding duplication strengthens checking. Not a 26341349Sbostic * strong reason, but sufficient in the absence of others. 26441349Sbostic */ 26541349Sbostic if (flags&SPSTART) { 26641349Sbostic longest = NULL; 26741349Sbostic len = 0; 26841349Sbostic for (; scan != NULL; scan = regnext(scan)) 26941349Sbostic if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) { 27041349Sbostic longest = OPERAND(scan); 27141349Sbostic len = strlen(OPERAND(scan)); 27241349Sbostic } 27341349Sbostic r->regmust = longest; 27441349Sbostic r->regmlen = len; 27541349Sbostic } 27641349Sbostic } 27741349Sbostic 27841349Sbostic return(r); 27941349Sbostic } 28041349Sbostic 28141349Sbostic /* 28241349Sbostic - reg - regular expression, i.e. main body or parenthesized thing 28341349Sbostic * 28441349Sbostic * Caller must absorb opening parenthesis. 28541349Sbostic * 28641349Sbostic * Combining parenthesis handling with the base level of regular expression 28741349Sbostic * is a trifle forced, but the need to tie the tails of the branches to what 28841349Sbostic * follows makes it hard to avoid. 28941349Sbostic */ 29041349Sbostic static char * 29141349Sbostic reg(paren, flagp) 29241349Sbostic int paren; /* Parenthesized? */ 29341349Sbostic int *flagp; 29441349Sbostic { 29541349Sbostic register char *ret; 29641349Sbostic register char *br; 29741349Sbostic register char *ender; 29841349Sbostic register int parno; 29941349Sbostic int flags; 30041349Sbostic 30141349Sbostic *flagp = HASWIDTH; /* Tentatively. */ 30241349Sbostic 30341349Sbostic /* Make an OPEN node, if parenthesized. */ 30441349Sbostic if (paren) { 30541349Sbostic if (regnpar >= NSUBEXP) 30641349Sbostic FAIL("too many ()"); 30741349Sbostic parno = regnpar; 30841349Sbostic regnpar++; 30941349Sbostic ret = regnode(OPEN+parno); 31041349Sbostic } else 31141349Sbostic ret = NULL; 31241349Sbostic 31341349Sbostic /* Pick up the branches, linking them together. */ 31441349Sbostic br = regbranch(&flags); 31541349Sbostic if (br == NULL) 31641349Sbostic return(NULL); 31741349Sbostic if (ret != NULL) 31841349Sbostic regtail(ret, br); /* OPEN -> first. */ 31941349Sbostic else 32041349Sbostic ret = br; 32141349Sbostic if (!(flags&HASWIDTH)) 32241349Sbostic *flagp &= ~HASWIDTH; 32341349Sbostic *flagp |= flags&SPSTART; 32441349Sbostic while (*regparse == '|' || *regparse == '\n') { 32541349Sbostic regparse++; 32641349Sbostic br = regbranch(&flags); 32741349Sbostic if (br == NULL) 32841349Sbostic return(NULL); 32941349Sbostic regtail(ret, br); /* BRANCH -> BRANCH. */ 33041349Sbostic if (!(flags&HASWIDTH)) 33141349Sbostic *flagp &= ~HASWIDTH; 33241349Sbostic *flagp |= flags&SPSTART; 33341349Sbostic } 33441349Sbostic 33541349Sbostic /* Make a closing node, and hook it on the end. */ 33641349Sbostic ender = regnode((paren) ? CLOSE+parno : END); 33741349Sbostic regtail(ret, ender); 33841349Sbostic 33941349Sbostic /* Hook the tails of the branches to the closing node. */ 34041349Sbostic for (br = ret; br != NULL; br = regnext(br)) 34141349Sbostic regoptail(br, ender); 34241349Sbostic 34341349Sbostic /* Check for proper termination. */ 34441349Sbostic if (paren && *regparse++ != ')') { 34541349Sbostic FAIL("unmatched ()"); 34641349Sbostic } else if (!paren && *regparse != '\0') { 34741349Sbostic if (*regparse == ')') { 34841349Sbostic FAIL("unmatched ()"); 34941349Sbostic } else 35041349Sbostic FAIL("junk on end"); /* "Can't happen". */ 35141349Sbostic /* NOTREACHED */ 35241349Sbostic } 35341349Sbostic 35441349Sbostic return(ret); 35541349Sbostic } 35641349Sbostic 35741349Sbostic /* 35841349Sbostic - regbranch - one alternative of an | operator 35941349Sbostic * 36041349Sbostic * Implements the concatenation operator. 36141349Sbostic */ 36241349Sbostic static char * 36341349Sbostic regbranch(flagp) 36441349Sbostic int *flagp; 36541349Sbostic { 36641349Sbostic register char *ret; 36741349Sbostic register char *chain; 36841349Sbostic register char *latest; 36941349Sbostic int flags; 37041349Sbostic 37141349Sbostic *flagp = WORST; /* Tentatively. */ 37241349Sbostic 37341349Sbostic ret = regnode(BRANCH); 37441349Sbostic chain = NULL; 37541349Sbostic while (*regparse != '\0' && *regparse != ')' && 37641349Sbostic *regparse != '\n' && *regparse != '|') { 37741349Sbostic latest = regpiece(&flags); 37841349Sbostic if (latest == NULL) 37941349Sbostic return(NULL); 38041349Sbostic *flagp |= flags&HASWIDTH; 38141349Sbostic if (chain == NULL) /* First piece. */ 38241349Sbostic *flagp |= flags&SPSTART; 38341349Sbostic else 38441349Sbostic regtail(chain, latest); 38541349Sbostic chain = latest; 38641349Sbostic } 38741349Sbostic if (chain == NULL) /* Loop ran zero times. */ 38841349Sbostic (void) regnode(NOTHING); 38941349Sbostic 39041349Sbostic return(ret); 39141349Sbostic } 39241349Sbostic 39341349Sbostic /* 39441349Sbostic - regpiece - something followed by possible [*+?] 39541349Sbostic * 39641349Sbostic * Note that the branching code sequences used for ? and the general cases 39741349Sbostic * of * and + are somewhat optimized: they use the same NOTHING node as 39841349Sbostic * both the endmarker for their branch list and the body of the last branch. 39941349Sbostic * It might seem that this node could be dispensed with entirely, but the 40041349Sbostic * endmarker role is not redundant. 40141349Sbostic */ 40241349Sbostic static char * 40341349Sbostic regpiece(flagp) 40441349Sbostic int *flagp; 40541349Sbostic { 40641349Sbostic register char *ret; 40741349Sbostic register char op; 40841349Sbostic register char *next; 40941349Sbostic int flags; 41041349Sbostic 41141349Sbostic ret = regatom(&flags); 41241349Sbostic if (ret == NULL) 41341349Sbostic return(NULL); 41441349Sbostic 41541349Sbostic op = *regparse; 41641349Sbostic if (!ISMULT(op)) { 41741349Sbostic *flagp = flags; 41841349Sbostic return(ret); 41941349Sbostic } 42041349Sbostic 42141349Sbostic if (!(flags&HASWIDTH) && op != '?') 42241349Sbostic FAIL("*+ operand could be empty"); 42341349Sbostic *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH); 42441349Sbostic 42541349Sbostic if (op == '*' && (flags&SIMPLE)) 42641349Sbostic reginsert(STAR, ret); 42741349Sbostic else if (op == '*') { 42841349Sbostic /* Emit x* as (x&|), where & means "self". */ 42941349Sbostic reginsert(BRANCH, ret); /* Either x */ 43041349Sbostic regoptail(ret, regnode(BACK)); /* and loop */ 43141349Sbostic regoptail(ret, ret); /* back */ 43241349Sbostic regtail(ret, regnode(BRANCH)); /* or */ 43341349Sbostic regtail(ret, regnode(NOTHING)); /* null. */ 43441349Sbostic } else if (op == '+' && (flags&SIMPLE)) 43541349Sbostic reginsert(PLUS, ret); 43641349Sbostic else if (op == '+') { 43741349Sbostic /* Emit x+ as x(&|), where & means "self". */ 43841349Sbostic next = regnode(BRANCH); /* Either */ 43941349Sbostic regtail(ret, next); 44041349Sbostic regtail(regnode(BACK), ret); /* loop back */ 44141349Sbostic regtail(next, regnode(BRANCH)); /* or */ 44241349Sbostic regtail(ret, regnode(NOTHING)); /* null. */ 44341349Sbostic } else if (op == '?') { 44441349Sbostic /* Emit x? as (x|) */ 44541349Sbostic reginsert(BRANCH, ret); /* Either x */ 44641349Sbostic regtail(ret, regnode(BRANCH)); /* or */ 44741349Sbostic next = regnode(NOTHING); /* null. */ 44841349Sbostic regtail(ret, next); 44941349Sbostic regoptail(ret, next); 45041349Sbostic } 45141349Sbostic regparse++; 45241349Sbostic if (ISMULT(*regparse)) 45341349Sbostic FAIL("nested *?+"); 45441349Sbostic 45541349Sbostic return(ret); 45641349Sbostic } 45741349Sbostic 45841349Sbostic /* 45941349Sbostic - regatom - the lowest level 46041349Sbostic * 46141349Sbostic * Optimization: gobbles an entire sequence of ordinary characters so that 46241349Sbostic * it can turn them into a single node, which is smaller to store and 46341349Sbostic * faster to run. Backslashed characters are exceptions, each becoming a 46441349Sbostic * separate node; the code is simpler that way and it's not worth fixing. 46541349Sbostic */ 46641349Sbostic static char * 46741349Sbostic regatom(flagp) 46841349Sbostic int *flagp; 46941349Sbostic { 47041349Sbostic register char *ret; 47141349Sbostic int flags; 47241349Sbostic 47341349Sbostic *flagp = WORST; /* Tentatively. */ 47441349Sbostic 47541349Sbostic switch (*regparse++) { 47641349Sbostic /* FIXME: these chars only have meaning at beg/end of pat? */ 47741349Sbostic case '^': 47841349Sbostic ret = regnode(BOL); 47941349Sbostic break; 48041349Sbostic case '$': 48141349Sbostic ret = regnode(EOL); 48241349Sbostic break; 48341349Sbostic case '.': 48441349Sbostic ret = regnode(ANY); 48541349Sbostic *flagp |= HASWIDTH|SIMPLE; 48641349Sbostic break; 48741349Sbostic case '[': { 48841349Sbostic register int class; 48941349Sbostic register int classend; 49041349Sbostic 49141349Sbostic if (*regparse == '^') { /* Complement of range. */ 49241349Sbostic ret = regnode(ANYBUT); 49341349Sbostic regparse++; 49441349Sbostic } else 49541349Sbostic ret = regnode(ANYOF); 49641349Sbostic if (*regparse == ']' || *regparse == '-') 49741349Sbostic regc(*regparse++); 49841349Sbostic while (*regparse != '\0' && *regparse != ']') { 49941349Sbostic if (*regparse == '-') { 50041349Sbostic regparse++; 50141349Sbostic if (*regparse == ']' || *regparse == '\0') 50241349Sbostic regc('-'); 50341349Sbostic else { 50441349Sbostic class = UCHARAT(regparse-2)+1; 50541349Sbostic classend = UCHARAT(regparse); 50641349Sbostic if (class > classend+1) 50741349Sbostic FAIL("invalid [] range"); 50841349Sbostic for (; class <= classend; class++) 50941349Sbostic regc(class); 51041349Sbostic regparse++; 51141349Sbostic } 51241349Sbostic } else 51341349Sbostic regc(*regparse++); 51441349Sbostic } 51541349Sbostic regc('\0'); 51641349Sbostic if (*regparse != ']') 51741349Sbostic FAIL("unmatched []"); 51841349Sbostic regparse++; 51941349Sbostic *flagp |= HASWIDTH|SIMPLE; 52041349Sbostic } 52141349Sbostic break; 52241349Sbostic case '(': 52341349Sbostic ret = reg(1, &flags); 52441349Sbostic if (ret == NULL) 52541349Sbostic return(NULL); 52641349Sbostic *flagp |= flags&(HASWIDTH|SPSTART); 52741349Sbostic break; 52841349Sbostic case '\0': 52941349Sbostic case '|': 53041349Sbostic case '\n': 53141349Sbostic case ')': 53241349Sbostic FAIL("internal urp"); /* Supposed to be caught earlier. */ 53341349Sbostic break; 53441349Sbostic case '?': 53541349Sbostic case '+': 53641349Sbostic case '*': 53741349Sbostic FAIL("?+* follows nothing"); 53841349Sbostic break; 53941349Sbostic case '\\': 54041349Sbostic switch (*regparse++) { 54141349Sbostic case '\0': 54241349Sbostic FAIL("trailing \\"); 54341349Sbostic break; 54441349Sbostic case '<': 54541349Sbostic ret = regnode(WORDA); 54641349Sbostic break; 54741349Sbostic case '>': 54841349Sbostic ret = regnode(WORDZ); 54941349Sbostic break; 55041349Sbostic /* FIXME: Someday handle \1, \2, ... */ 55141349Sbostic default: 55241349Sbostic /* Handle general quoted chars in exact-match routine */ 55341349Sbostic goto de_fault; 55441349Sbostic } 55541349Sbostic break; 55641349Sbostic de_fault: 55741349Sbostic default: 55841349Sbostic /* 55941349Sbostic * Encode a string of characters to be matched exactly. 56041349Sbostic * 56141349Sbostic * This is a bit tricky due to quoted chars and due to 56241349Sbostic * '*', '+', and '?' taking the SINGLE char previous 56341349Sbostic * as their operand. 56441349Sbostic * 56541349Sbostic * On entry, the char at regparse[-1] is going to go 56641349Sbostic * into the string, no matter what it is. (It could be 56741349Sbostic * following a \ if we are entered from the '\' case.) 56841349Sbostic * 56941349Sbostic * Basic idea is to pick up a good char in ch and 57041349Sbostic * examine the next char. If it's *+? then we twiddle. 57141349Sbostic * If it's \ then we frozzle. If it's other magic char 57241349Sbostic * we push ch and terminate the string. If none of the 57341349Sbostic * above, we push ch on the string and go around again. 57441349Sbostic * 57541349Sbostic * regprev is used to remember where "the current char" 57641349Sbostic * starts in the string, if due to a *+? we need to back 57741349Sbostic * up and put the current char in a separate, 1-char, string. 57841349Sbostic * When regprev is NULL, ch is the only char in the 57941349Sbostic * string; this is used in *+? handling, and in setting 58041349Sbostic * flags |= SIMPLE at the end. 58141349Sbostic */ 58241349Sbostic { 58341349Sbostic char *regprev; 58441349Sbostic register char ch; 58541349Sbostic 58641349Sbostic regparse--; /* Look at cur char */ 58741349Sbostic ret = regnode(EXACTLY); 58841349Sbostic for ( regprev = 0 ; ; ) { 58941349Sbostic ch = *regparse++; /* Get current char */ 59041349Sbostic switch (*regparse) { /* look at next one */ 59141349Sbostic 59241349Sbostic default: 59341349Sbostic regc(ch); /* Add cur to string */ 59441349Sbostic break; 59541349Sbostic 59641349Sbostic case '.': case '[': case '(': 59741349Sbostic case ')': case '|': case '\n': 59841349Sbostic case '$': case '^': 59941349Sbostic case '\0': 60041349Sbostic /* FIXME, $ and ^ should not always be magic */ 60141349Sbostic magic: 60241349Sbostic regc(ch); /* dump cur char */ 60341349Sbostic goto done; /* and we are done */ 60441349Sbostic 60541349Sbostic case '?': case '+': case '*': 60641349Sbostic if (!regprev) /* If just ch in str, */ 60741349Sbostic goto magic; /* use it */ 60841349Sbostic /* End mult-char string one early */ 60941349Sbostic regparse = regprev; /* Back up parse */ 61041349Sbostic goto done; 61141349Sbostic 61241349Sbostic case '\\': 61341349Sbostic regc(ch); /* Cur char OK */ 61441349Sbostic switch (regparse[1]){ /* Look after \ */ 61541349Sbostic case '\0': 61641349Sbostic case '<': 61741349Sbostic case '>': 61841349Sbostic /* FIXME: Someday handle \1, \2, ... */ 61941349Sbostic goto done; /* Not quoted */ 62041349Sbostic default: 62141349Sbostic /* Backup point is \, scan * point is after it. */ 62241349Sbostic regprev = regparse; 62341349Sbostic regparse++; 62441349Sbostic continue; /* NOT break; */ 62541349Sbostic } 62641349Sbostic } 62741349Sbostic regprev = regparse; /* Set backup point */ 62841349Sbostic } 62941349Sbostic done: 63041349Sbostic regc('\0'); 63141349Sbostic *flagp |= HASWIDTH; 63241349Sbostic if (!regprev) /* One char? */ 63341349Sbostic *flagp |= SIMPLE; 63441349Sbostic } 63541349Sbostic break; 63641349Sbostic } 63741349Sbostic 63841349Sbostic return(ret); 63941349Sbostic } 64041349Sbostic 64141349Sbostic /* 64241349Sbostic - regnode - emit a node 64341349Sbostic */ 64441349Sbostic static char * /* Location. */ 64541349Sbostic regnode(op) 64641349Sbostic char op; 64741349Sbostic { 64841349Sbostic register char *ret; 64941349Sbostic register char *ptr; 65041349Sbostic 65141349Sbostic ret = regcode; 65241349Sbostic if (ret == ®dummy) { 65341349Sbostic regsize += 3; 65441349Sbostic return(ret); 65541349Sbostic } 65641349Sbostic 65741349Sbostic ptr = ret; 65841349Sbostic *ptr++ = op; 65941349Sbostic *ptr++ = '\0'; /* Null "next" pointer. */ 66041349Sbostic *ptr++ = '\0'; 66141349Sbostic regcode = ptr; 66241349Sbostic 66341349Sbostic return(ret); 66441349Sbostic } 66541349Sbostic 66641349Sbostic /* 66741349Sbostic - regc - emit (if appropriate) a byte of code 66841349Sbostic */ 66941349Sbostic static void 67041349Sbostic regc(b) 67141349Sbostic char b; 67241349Sbostic { 67341349Sbostic if (regcode != ®dummy) 67441349Sbostic *regcode++ = b; 67541349Sbostic else 67641349Sbostic regsize++; 67741349Sbostic } 67841349Sbostic 67941349Sbostic /* 68041349Sbostic - reginsert - insert an operator in front of already-emitted operand 68141349Sbostic * 68241349Sbostic * Means relocating the operand. 68341349Sbostic */ 68441349Sbostic static void 68541349Sbostic reginsert(op, opnd) 68641349Sbostic char op; 68741349Sbostic char *opnd; 68841349Sbostic { 68941349Sbostic register char *src; 69041349Sbostic register char *dst; 69141349Sbostic register char *place; 69241349Sbostic 69341349Sbostic if (regcode == ®dummy) { 69441349Sbostic regsize += 3; 69541349Sbostic return; 69641349Sbostic } 69741349Sbostic 69841349Sbostic src = regcode; 69941349Sbostic regcode += 3; 70041349Sbostic dst = regcode; 70141349Sbostic while (src > opnd) 70241349Sbostic *--dst = *--src; 70341349Sbostic 70441349Sbostic place = opnd; /* Op node, where operand used to be. */ 70541349Sbostic *place++ = op; 70641349Sbostic *place++ = '\0'; 70741349Sbostic *place++ = '\0'; 70841349Sbostic } 70941349Sbostic 71041349Sbostic /* 71141349Sbostic - regtail - set the next-pointer at the end of a node chain 71241349Sbostic */ 71341349Sbostic static void 71441349Sbostic regtail(p, val) 71541349Sbostic char *p; 71641349Sbostic char *val; 71741349Sbostic { 71841349Sbostic register char *scan; 71941349Sbostic register char *temp; 72041349Sbostic register int offset; 72141349Sbostic 72241349Sbostic if (p == ®dummy) 72341349Sbostic return; 72441349Sbostic 72541349Sbostic /* Find last node. */ 72641349Sbostic scan = p; 72741349Sbostic for (;;) { 72841349Sbostic temp = regnext(scan); 72941349Sbostic if (temp == NULL) 73041349Sbostic break; 73141349Sbostic scan = temp; 73241349Sbostic } 73341349Sbostic 73441349Sbostic if (OP(scan) == BACK) 73541349Sbostic offset = scan - val; 73641349Sbostic else 73741349Sbostic offset = val - scan; 73841349Sbostic *(scan+1) = (offset>>8)&0377; 73941349Sbostic *(scan+2) = offset&0377; 74041349Sbostic } 74141349Sbostic 74241349Sbostic /* 74341349Sbostic - regoptail - regtail on operand of first argument; nop if operandless 74441349Sbostic */ 74541349Sbostic static void 74641349Sbostic regoptail(p, val) 74741349Sbostic char *p; 74841349Sbostic char *val; 74941349Sbostic { 75041349Sbostic /* "Operandless" and "op != BRANCH" are synonymous in practice. */ 75141349Sbostic if (p == NULL || p == ®dummy || OP(p) != BRANCH) 75241349Sbostic return; 75341349Sbostic regtail(OPERAND(p), val); 75441349Sbostic } 75541349Sbostic 75641349Sbostic /* 75741349Sbostic * regexec and friends 75841349Sbostic */ 75941349Sbostic 76041349Sbostic /* 76141349Sbostic * Global work variables for regexec(). 76241349Sbostic */ 76341349Sbostic static char *reginput; /* String-input pointer. */ 76441349Sbostic static char *regbol; /* Beginning of input, for ^ check. */ 76541349Sbostic static char **regstartp; /* Pointer to startp array. */ 76641349Sbostic static char **regendp; /* Ditto for endp. */ 76741349Sbostic 76841349Sbostic /* 76941349Sbostic * Forwards. 77041349Sbostic */ 77141349Sbostic STATIC int regtry(); 77241349Sbostic STATIC int regmatch(); 77341349Sbostic STATIC int regrepeat(); 77441349Sbostic 77541349Sbostic #ifdef DEBUG 77641349Sbostic int regnarrate = 0; 77741349Sbostic void regdump(); 77841349Sbostic STATIC char *regprop(); 77941349Sbostic #endif 78041349Sbostic 78141349Sbostic /* 78241349Sbostic - regexec - match a regexp against a string 78341349Sbostic */ 78441349Sbostic int 78541349Sbostic regexec(prog, string) 786*46619Sbostic register const regexp *prog; 787*46619Sbostic register const char *string; 78841349Sbostic { 78941349Sbostic register char *s; 79041349Sbostic extern char *strchr(); 79141349Sbostic 79241349Sbostic /* Be paranoid... */ 79341349Sbostic if (prog == NULL || string == NULL) { 79441349Sbostic regerror("NULL parameter"); 79541349Sbostic return(0); 79641349Sbostic } 79741349Sbostic 79841349Sbostic /* Check validity of program. */ 79941349Sbostic if (UCHARAT(prog->program) != MAGIC) { 80041349Sbostic regerror("corrupted program"); 80141349Sbostic return(0); 80241349Sbostic } 80341349Sbostic 80441349Sbostic /* If there is a "must appear" string, look for it. */ 80541349Sbostic if (prog->regmust != NULL) { 806*46619Sbostic s = (char *)string; 80741349Sbostic while ((s = strchr(s, prog->regmust[0])) != NULL) { 80841349Sbostic if (strncmp(s, prog->regmust, prog->regmlen) == 0) 80941349Sbostic break; /* Found it. */ 81041349Sbostic s++; 81141349Sbostic } 81241349Sbostic if (s == NULL) /* Not present. */ 81341349Sbostic return(0); 81441349Sbostic } 81541349Sbostic 81641349Sbostic /* Mark beginning of line for ^ . */ 817*46619Sbostic regbol = (char *)string; 81841349Sbostic 81941349Sbostic /* Simplest case: anchored match need be tried only once. */ 82041349Sbostic if (prog->reganch) 82141349Sbostic return(regtry(prog, string)); 82241349Sbostic 82341349Sbostic /* Messy cases: unanchored match. */ 824*46619Sbostic s = (char *)string; 82541349Sbostic if (prog->regstart != '\0') 82641349Sbostic /* We know what char it must start with. */ 82741349Sbostic while ((s = strchr(s, prog->regstart)) != NULL) { 82841349Sbostic if (regtry(prog, s)) 82941349Sbostic return(1); 83041349Sbostic s++; 83141349Sbostic } 83241349Sbostic else 83341349Sbostic /* We don't -- general case. */ 83441349Sbostic do { 83541349Sbostic if (regtry(prog, s)) 83641349Sbostic return(1); 83741349Sbostic } while (*s++ != '\0'); 83841349Sbostic 83941349Sbostic /* Failure. */ 84041349Sbostic return(0); 84141349Sbostic } 84241349Sbostic 84341349Sbostic /* 84441349Sbostic - regtry - try match at specific point 84541349Sbostic */ 84641349Sbostic static int /* 0 failure, 1 success */ 84741349Sbostic regtry(prog, string) 84841349Sbostic regexp *prog; 84941349Sbostic char *string; 85041349Sbostic { 85141349Sbostic register int i; 85241349Sbostic register char **sp; 85341349Sbostic register char **ep; 85441349Sbostic 85541349Sbostic reginput = string; 85641349Sbostic regstartp = prog->startp; 85741349Sbostic regendp = prog->endp; 85841349Sbostic 85941349Sbostic sp = prog->startp; 86041349Sbostic ep = prog->endp; 86141349Sbostic for (i = NSUBEXP; i > 0; i--) { 86241349Sbostic *sp++ = NULL; 86341349Sbostic *ep++ = NULL; 86441349Sbostic } 86541349Sbostic if (regmatch(prog->program + 1)) { 86641349Sbostic prog->startp[0] = string; 86741349Sbostic prog->endp[0] = reginput; 86841349Sbostic return(1); 86941349Sbostic } else 87041349Sbostic return(0); 87141349Sbostic } 87241349Sbostic 87341349Sbostic /* 87441349Sbostic - regmatch - main matching routine 87541349Sbostic * 87641349Sbostic * Conceptually the strategy is simple: check to see whether the current 87741349Sbostic * node matches, call self recursively to see whether the rest matches, 87841349Sbostic * and then act accordingly. In practice we make some effort to avoid 87941349Sbostic * recursion, in particular by going through "ordinary" nodes (that don't 88041349Sbostic * need to know whether the rest of the match failed) by a loop instead of 88141349Sbostic * by recursion. 88241349Sbostic */ 88341349Sbostic static int /* 0 failure, 1 success */ 88441349Sbostic regmatch(prog) 88541349Sbostic char *prog; 88641349Sbostic { 88741349Sbostic register char *scan; /* Current node. */ 88841349Sbostic char *next; /* Next node. */ 88941349Sbostic extern char *strchr(); 89041349Sbostic 89141349Sbostic scan = prog; 89241349Sbostic #ifdef DEBUG 89341349Sbostic if (scan != NULL && regnarrate) 89441349Sbostic fprintf(stderr, "%s(\n", regprop(scan)); 89541349Sbostic #endif 89641349Sbostic while (scan != NULL) { 89741349Sbostic #ifdef DEBUG 89841349Sbostic if (regnarrate) 89941349Sbostic fprintf(stderr, "%s...\n", regprop(scan)); 90041349Sbostic #endif 90141349Sbostic next = regnext(scan); 90241349Sbostic 90341349Sbostic switch (OP(scan)) { 90441349Sbostic case BOL: 90541349Sbostic if (reginput != regbol) 90641349Sbostic return(0); 90741349Sbostic break; 90841349Sbostic case EOL: 90941349Sbostic if (*reginput != '\0') 91041349Sbostic return(0); 91141349Sbostic break; 91241349Sbostic case WORDA: 91341349Sbostic /* Must be looking at a letter, digit, or _ */ 91441349Sbostic if ((!isalnum(*reginput)) && *reginput != '_') 91541349Sbostic return(0); 91641349Sbostic /* Prev must be BOL or nonword */ 91741349Sbostic if (reginput > regbol && 91841349Sbostic (isalnum(reginput[-1]) || reginput[-1] == '_')) 91941349Sbostic return(0); 92041349Sbostic break; 92141349Sbostic case WORDZ: 92241349Sbostic /* Must be looking at non letter, digit, or _ */ 92341349Sbostic if (isalnum(*reginput) || *reginput == '_') 92441349Sbostic return(0); 92541349Sbostic /* We don't care what the previous char was */ 92641349Sbostic break; 92741349Sbostic case ANY: 92841349Sbostic if (*reginput == '\0') 92941349Sbostic return(0); 93041349Sbostic reginput++; 93141349Sbostic break; 93241349Sbostic case EXACTLY: { 93341349Sbostic register int len; 93441349Sbostic register char *opnd; 93541349Sbostic 93641349Sbostic opnd = OPERAND(scan); 93741349Sbostic /* Inline the first character, for speed. */ 93841349Sbostic if (*opnd != *reginput) 93941349Sbostic return(0); 94041349Sbostic len = strlen(opnd); 94141349Sbostic if (len > 1 && strncmp(opnd, reginput, len) != 0) 94241349Sbostic return(0); 94341349Sbostic reginput += len; 94441349Sbostic } 94541349Sbostic break; 94641349Sbostic case ANYOF: 94741349Sbostic if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL) 94841349Sbostic return(0); 94941349Sbostic reginput++; 95041349Sbostic break; 95141349Sbostic case ANYBUT: 95241349Sbostic if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL) 95341349Sbostic return(0); 95441349Sbostic reginput++; 95541349Sbostic break; 95641349Sbostic case NOTHING: 95741349Sbostic break; 95841349Sbostic case BACK: 95941349Sbostic break; 96041349Sbostic case OPEN+1: 96141349Sbostic case OPEN+2: 96241349Sbostic case OPEN+3: 96341349Sbostic case OPEN+4: 96441349Sbostic case OPEN+5: 96541349Sbostic case OPEN+6: 96641349Sbostic case OPEN+7: 96741349Sbostic case OPEN+8: 96841349Sbostic case OPEN+9: { 96941349Sbostic register int no; 97041349Sbostic register char *save; 97141349Sbostic 97241349Sbostic no = OP(scan) - OPEN; 97341349Sbostic save = reginput; 97441349Sbostic 97541349Sbostic if (regmatch(next)) { 97641349Sbostic /* 97741349Sbostic * Don't set startp if some later 97841349Sbostic * invocation of the same parentheses 97941349Sbostic * already has. 98041349Sbostic */ 98141349Sbostic if (regstartp[no] == NULL) 98241349Sbostic regstartp[no] = save; 98341349Sbostic return(1); 98441349Sbostic } else 98541349Sbostic return(0); 98641349Sbostic } 98741349Sbostic break; 98841349Sbostic case CLOSE+1: 98941349Sbostic case CLOSE+2: 99041349Sbostic case CLOSE+3: 99141349Sbostic case CLOSE+4: 99241349Sbostic case CLOSE+5: 99341349Sbostic case CLOSE+6: 99441349Sbostic case CLOSE+7: 99541349Sbostic case CLOSE+8: 99641349Sbostic case CLOSE+9: { 99741349Sbostic register int no; 99841349Sbostic register char *save; 99941349Sbostic 100041349Sbostic no = OP(scan) - CLOSE; 100141349Sbostic save = reginput; 100241349Sbostic 100341349Sbostic if (regmatch(next)) { 100441349Sbostic /* 100541349Sbostic * Don't set endp if some later 100641349Sbostic * invocation of the same parentheses 100741349Sbostic * already has. 100841349Sbostic */ 100941349Sbostic if (regendp[no] == NULL) 101041349Sbostic regendp[no] = save; 101141349Sbostic return(1); 101241349Sbostic } else 101341349Sbostic return(0); 101441349Sbostic } 101541349Sbostic break; 101641349Sbostic case BRANCH: { 101741349Sbostic register char *save; 101841349Sbostic 101941349Sbostic if (OP(next) != BRANCH) /* No choice. */ 102041349Sbostic next = OPERAND(scan); /* Avoid recursion. */ 102141349Sbostic else { 102241349Sbostic do { 102341349Sbostic save = reginput; 102441349Sbostic if (regmatch(OPERAND(scan))) 102541349Sbostic return(1); 102641349Sbostic reginput = save; 102741349Sbostic scan = regnext(scan); 102841349Sbostic } while (scan != NULL && OP(scan) == BRANCH); 102941349Sbostic return(0); 103041349Sbostic /* NOTREACHED */ 103141349Sbostic } 103241349Sbostic } 103341349Sbostic break; 103441349Sbostic case STAR: 103541349Sbostic case PLUS: { 103641349Sbostic register char nextch; 103741349Sbostic register int no; 103841349Sbostic register char *save; 103941349Sbostic register int min; 104041349Sbostic 104141349Sbostic /* 104241349Sbostic * Lookahead to avoid useless match attempts 104341349Sbostic * when we know what character comes next. 104441349Sbostic */ 104541349Sbostic nextch = '\0'; 104641349Sbostic if (OP(next) == EXACTLY) 104741349Sbostic nextch = *OPERAND(next); 104841349Sbostic min = (OP(scan) == STAR) ? 0 : 1; 104941349Sbostic save = reginput; 105041349Sbostic no = regrepeat(OPERAND(scan)); 105141349Sbostic while (no >= min) { 105241349Sbostic /* If it could work, try it. */ 105341349Sbostic if (nextch == '\0' || *reginput == nextch) 105441349Sbostic if (regmatch(next)) 105541349Sbostic return(1); 105641349Sbostic /* Couldn't or didn't -- back up. */ 105741349Sbostic no--; 105841349Sbostic reginput = save + no; 105941349Sbostic } 106041349Sbostic return(0); 106141349Sbostic } 106241349Sbostic break; 106341349Sbostic case END: 106441349Sbostic return(1); /* Success! */ 106541349Sbostic break; 106641349Sbostic default: 106741349Sbostic regerror("memory corruption"); 106841349Sbostic return(0); 106941349Sbostic break; 107041349Sbostic } 107141349Sbostic 107241349Sbostic scan = next; 107341349Sbostic } 107441349Sbostic 107541349Sbostic /* 107641349Sbostic * We get here only if there's trouble -- normally "case END" is 107741349Sbostic * the terminating point. 107841349Sbostic */ 107941349Sbostic regerror("corrupted pointers"); 108041349Sbostic return(0); 108141349Sbostic } 108241349Sbostic 108341349Sbostic /* 108441349Sbostic - regrepeat - repeatedly match something simple, report how many 108541349Sbostic */ 108641349Sbostic static int 108741349Sbostic regrepeat(p) 108841349Sbostic char *p; 108941349Sbostic { 109041349Sbostic register int count = 0; 109141349Sbostic register char *scan; 109241349Sbostic register char *opnd; 109341349Sbostic 109441349Sbostic scan = reginput; 109541349Sbostic opnd = OPERAND(p); 109641349Sbostic switch (OP(p)) { 109741349Sbostic case ANY: 109841349Sbostic count = strlen(scan); 109941349Sbostic scan += count; 110041349Sbostic break; 110141349Sbostic case EXACTLY: 110241349Sbostic while (*opnd == *scan) { 110341349Sbostic count++; 110441349Sbostic scan++; 110541349Sbostic } 110641349Sbostic break; 110741349Sbostic case ANYOF: 110841349Sbostic while (*scan != '\0' && strchr(opnd, *scan) != NULL) { 110941349Sbostic count++; 111041349Sbostic scan++; 111141349Sbostic } 111241349Sbostic break; 111341349Sbostic case ANYBUT: 111441349Sbostic while (*scan != '\0' && strchr(opnd, *scan) == NULL) { 111541349Sbostic count++; 111641349Sbostic scan++; 111741349Sbostic } 111841349Sbostic break; 111941349Sbostic default: /* Oh dear. Called inappropriately. */ 112041349Sbostic regerror("internal foulup"); 112141349Sbostic count = 0; /* Best compromise. */ 112241349Sbostic break; 112341349Sbostic } 112441349Sbostic reginput = scan; 112541349Sbostic 112641349Sbostic return(count); 112741349Sbostic } 112841349Sbostic 112941349Sbostic /* 113041349Sbostic - regnext - dig the "next" pointer out of a node 113141349Sbostic */ 113241349Sbostic static char * 113341349Sbostic regnext(p) 113441349Sbostic register char *p; 113541349Sbostic { 113641349Sbostic register int offset; 113741349Sbostic 113841349Sbostic if (p == ®dummy) 113941349Sbostic return(NULL); 114041349Sbostic 114141349Sbostic offset = NEXT(p); 114241349Sbostic if (offset == 0) 114341349Sbostic return(NULL); 114441349Sbostic 114541349Sbostic if (OP(p) == BACK) 114641349Sbostic return(p-offset); 114741349Sbostic else 114841349Sbostic return(p+offset); 114941349Sbostic } 115041349Sbostic 115141349Sbostic #ifdef DEBUG 115241349Sbostic 115341349Sbostic STATIC char *regprop(); 115441349Sbostic 115541349Sbostic /* 115641349Sbostic - regdump - dump a regexp onto stdout in vaguely comprehensible form 115741349Sbostic */ 115841349Sbostic void 115941349Sbostic regdump(r) 116041349Sbostic regexp *r; 116141349Sbostic { 116241349Sbostic register char *s; 116341349Sbostic register char op = EXACTLY; /* Arbitrary non-END op. */ 116441349Sbostic register char *next; 116541349Sbostic extern char *strchr(); 116641349Sbostic 116741349Sbostic 116841349Sbostic s = r->program + 1; 116941349Sbostic while (op != END) { /* While that wasn't END last time... */ 117041349Sbostic op = OP(s); 117141349Sbostic printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */ 117241349Sbostic next = regnext(s); 117341349Sbostic if (next == NULL) /* Next ptr. */ 117441349Sbostic printf("(0)"); 117541349Sbostic else 117641349Sbostic printf("(%d)", (s-r->program)+(next-s)); 117741349Sbostic s += 3; 117841349Sbostic if (op == ANYOF || op == ANYBUT || op == EXACTLY) { 117941349Sbostic /* Literal string, where present. */ 118041349Sbostic while (*s != '\0') { 118141349Sbostic putchar(*s); 118241349Sbostic s++; 118341349Sbostic } 118441349Sbostic s++; 118541349Sbostic } 118641349Sbostic putchar('\n'); 118741349Sbostic } 118841349Sbostic 118941349Sbostic /* Header fields of interest. */ 119041349Sbostic if (r->regstart != '\0') 119141349Sbostic printf("start `%c' ", r->regstart); 119241349Sbostic if (r->reganch) 119341349Sbostic printf("anchored "); 119441349Sbostic if (r->regmust != NULL) 119541349Sbostic printf("must have \"%s\"", r->regmust); 119641349Sbostic printf("\n"); 119741349Sbostic } 119841349Sbostic 119941349Sbostic /* 120041349Sbostic - regprop - printable representation of opcode 120141349Sbostic */ 120241349Sbostic static char * 120341349Sbostic regprop(op) 120441349Sbostic char *op; 120541349Sbostic { 120641349Sbostic register char *p; 120741349Sbostic static char buf[50]; 120841349Sbostic 120941349Sbostic (void) strcpy(buf, ":"); 121041349Sbostic 121141349Sbostic switch (OP(op)) { 121241349Sbostic case BOL: 121341349Sbostic p = "BOL"; 121441349Sbostic break; 121541349Sbostic case EOL: 121641349Sbostic p = "EOL"; 121741349Sbostic break; 121841349Sbostic case ANY: 121941349Sbostic p = "ANY"; 122041349Sbostic break; 122141349Sbostic case ANYOF: 122241349Sbostic p = "ANYOF"; 122341349Sbostic break; 122441349Sbostic case ANYBUT: 122541349Sbostic p = "ANYBUT"; 122641349Sbostic break; 122741349Sbostic case BRANCH: 122841349Sbostic p = "BRANCH"; 122941349Sbostic break; 123041349Sbostic case EXACTLY: 123141349Sbostic p = "EXACTLY"; 123241349Sbostic break; 123341349Sbostic case NOTHING: 123441349Sbostic p = "NOTHING"; 123541349Sbostic break; 123641349Sbostic case BACK: 123741349Sbostic p = "BACK"; 123841349Sbostic break; 123941349Sbostic case END: 124041349Sbostic p = "END"; 124141349Sbostic break; 124241349Sbostic case OPEN+1: 124341349Sbostic case OPEN+2: 124441349Sbostic case OPEN+3: 124541349Sbostic case OPEN+4: 124641349Sbostic case OPEN+5: 124741349Sbostic case OPEN+6: 124841349Sbostic case OPEN+7: 124941349Sbostic case OPEN+8: 125041349Sbostic case OPEN+9: 125141349Sbostic sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN); 125241349Sbostic p = NULL; 125341349Sbostic break; 125441349Sbostic case CLOSE+1: 125541349Sbostic case CLOSE+2: 125641349Sbostic case CLOSE+3: 125741349Sbostic case CLOSE+4: 125841349Sbostic case CLOSE+5: 125941349Sbostic case CLOSE+6: 126041349Sbostic case CLOSE+7: 126141349Sbostic case CLOSE+8: 126241349Sbostic case CLOSE+9: 126341349Sbostic sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE); 126441349Sbostic p = NULL; 126541349Sbostic break; 126641349Sbostic case STAR: 126741349Sbostic p = "STAR"; 126841349Sbostic break; 126941349Sbostic case PLUS: 127041349Sbostic p = "PLUS"; 127141349Sbostic break; 127241349Sbostic case WORDA: 127341349Sbostic p = "WORDA"; 127441349Sbostic break; 127541349Sbostic case WORDZ: 127641349Sbostic p = "WORDZ"; 127741349Sbostic break; 127841349Sbostic default: 127941349Sbostic regerror("corrupted opcode"); 128041349Sbostic break; 128141349Sbostic } 128241349Sbostic if (p != NULL) 128341349Sbostic (void) strcat(buf, p); 128441349Sbostic return(buf); 128541349Sbostic } 128641349Sbostic #endif 128741349Sbostic 128841349Sbostic /* 128941349Sbostic * The following is provided for those people who do not have strcspn() in 129041349Sbostic * their C libraries. They should get off their butts and do something 129141349Sbostic * about it; at least one public-domain implementation of those (highly 129241349Sbostic * useful) string routines has been published on Usenet. 129341349Sbostic */ 129441349Sbostic #ifdef STRCSPN 129541349Sbostic /* 129641349Sbostic * strcspn - find length of initial segment of s1 consisting entirely 129741349Sbostic * of characters not from s2 129841349Sbostic */ 129941349Sbostic 130041349Sbostic static int 130141349Sbostic strcspn(s1, s2) 130241349Sbostic char *s1; 130341349Sbostic char *s2; 130441349Sbostic { 130541349Sbostic register char *scan1; 130641349Sbostic register char *scan2; 130741349Sbostic register int count; 130841349Sbostic 130941349Sbostic count = 0; 131041349Sbostic for (scan1 = s1; *scan1 != '\0'; scan1++) { 131141349Sbostic for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */ 131241349Sbostic if (*scan1 == *scan2++) 131341349Sbostic return(count); 131441349Sbostic count++; 131541349Sbostic } 131641349Sbostic return(count); 131741349Sbostic } 131841349Sbostic #endif 1319