141349Sbostic /* 241349Sbostic * regcomp and regexec -- regsub and regerror are elsewhere 341349Sbostic * 441349Sbostic * Copyright (c) 1986 by University of Toronto. 541349Sbostic * Written by Henry Spencer. Not derived from licensed software. 641349Sbostic * 741349Sbostic * Permission is granted to anyone to use this software for any 841349Sbostic * purpose on any computer system, and to redistribute it freely, 941349Sbostic * subject to the following restrictions: 1041349Sbostic * 1141349Sbostic * 1. The author is not responsible for the consequences of use of 1241349Sbostic * this software, no matter how awful, even if they arise 1341349Sbostic * from defects in it. 1441349Sbostic * 1541349Sbostic * 2. The origin of this software must not be misrepresented, either 1641349Sbostic * by explicit claim or by omission. 1741349Sbostic * 1841349Sbostic * 3. Altered versions must be plainly marked as such, and must not 1941349Sbostic * be misrepresented as being the original software. 2041349Sbostic *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, 2141349Sbostic *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to | 2241349Sbostic *** to assist in implementing egrep. 2341349Sbostic *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, 2441349Sbostic *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching 2541349Sbostic *** as in BSD grep and ex. 2641349Sbostic *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, 2741349Sbostic *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \. 2841349Sbostic *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods, 2941349Sbostic *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy. 3041349Sbostic * 3141349Sbostic * Beware that some of this code is subtly aware of the way operator 3241349Sbostic * precedence is structured in regular expressions. Serious changes in 3341349Sbostic * regular-expression syntax might require a total rethink. 3441349Sbostic */ 3546619Sbostic #include <regexp.h> 3641349Sbostic #include <stdio.h> 3741349Sbostic #include <ctype.h> 3846619Sbostic #include <stdlib.h> 3946619Sbostic #include <string.h> 4041349Sbostic #include "regmagic.h" 4141349Sbostic 4241349Sbostic /* 4341349Sbostic * The "internal use only" fields in regexp.h are present to pass info from 4441349Sbostic * compile to execute that permits the execute phase to run lots faster on 4541349Sbostic * simple cases. They are: 4641349Sbostic * 4741349Sbostic * regstart char that must begin a match; '\0' if none obvious 4841349Sbostic * reganch is the match anchored (at beginning-of-line only)? 4941349Sbostic * regmust string (pointer into program) that match must include, or NULL 5041349Sbostic * regmlen length of regmust string 5141349Sbostic * 5241349Sbostic * Regstart and reganch permit very fast decisions on suitable starting points 5341349Sbostic * for a match, cutting down the work a lot. Regmust permits fast rejection 5441349Sbostic * of lines that cannot possibly match. The regmust tests are costly enough 5541349Sbostic * that regcomp() supplies a regmust only if the r.e. contains something 5641349Sbostic * potentially expensive (at present, the only such thing detected is * or + 5741349Sbostic * at the start of the r.e., which can involve a lot of backup). Regmlen is 5841349Sbostic * supplied because the test in regexec() needs it and regcomp() is computing 5941349Sbostic * it anyway. 6041349Sbostic */ 6141349Sbostic 6241349Sbostic /* 6341349Sbostic * Structure for regexp "program". This is essentially a linear encoding 6441349Sbostic * of a nondeterministic finite-state machine (aka syntax charts or 6541349Sbostic * "railroad normal form" in parsing technology). Each node is an opcode 6641349Sbostic * plus a "next" pointer, possibly plus an operand. "Next" pointers of 6741349Sbostic * all nodes except BRANCH implement concatenation; a "next" pointer with 6841349Sbostic * a BRANCH on both ends of it is connecting two alternatives. (Here we 6941349Sbostic * have one of the subtle syntax dependencies: an individual BRANCH (as 7041349Sbostic * opposed to a collection of them) is never concatenated with anything 7141349Sbostic * because of operator precedence.) The operand of some types of node is 7241349Sbostic * a literal string; for others, it is a node leading into a sub-FSM. In 7341349Sbostic * particular, the operand of a BRANCH node is the first node of the branch. 7441349Sbostic * (NB this is *not* a tree structure: the tail of the branch connects 7541349Sbostic * to the thing following the set of BRANCHes.) The opcodes are: 7641349Sbostic */ 7741349Sbostic 7841349Sbostic /* definition number opnd? meaning */ 7941349Sbostic #define END 0 /* no End of program. */ 8041349Sbostic #define BOL 1 /* no Match "" at beginning of line. */ 8141349Sbostic #define EOL 2 /* no Match "" at end of line. */ 8241349Sbostic #define ANY 3 /* no Match any one character. */ 8341349Sbostic #define ANYOF 4 /* str Match any character in this string. */ 8441349Sbostic #define ANYBUT 5 /* str Match any character not in this string. */ 8541349Sbostic #define BRANCH 6 /* node Match this alternative, or the next... */ 8641349Sbostic #define BACK 7 /* no Match "", "next" ptr points backward. */ 8741349Sbostic #define EXACTLY 8 /* str Match this string. */ 8841349Sbostic #define NOTHING 9 /* no Match empty string. */ 8941349Sbostic #define STAR 10 /* node Match this (simple) thing 0 or more times. */ 9041349Sbostic #define PLUS 11 /* node Match this (simple) thing 1 or more times. */ 9141349Sbostic #define WORDA 12 /* no Match "" at wordchar, where prev is nonword */ 9241349Sbostic #define WORDZ 13 /* no Match "" at nonwordchar, where prev is word */ 9341349Sbostic #define OPEN 20 /* no Mark this point in input as start of #n. */ 9441349Sbostic /* OPEN+1 is number 1, etc. */ 9541349Sbostic #define CLOSE 30 /* no Analogous to OPEN. */ 9641349Sbostic 9741349Sbostic /* 9841349Sbostic * Opcode notes: 9941349Sbostic * 10041349Sbostic * BRANCH The set of branches constituting a single choice are hooked 10141349Sbostic * together with their "next" pointers, since precedence prevents 10241349Sbostic * anything being concatenated to any individual branch. The 10341349Sbostic * "next" pointer of the last BRANCH in a choice points to the 10441349Sbostic * thing following the whole choice. This is also where the 10541349Sbostic * final "next" pointer of each individual branch points; each 10641349Sbostic * branch starts with the operand node of a BRANCH node. 10741349Sbostic * 10841349Sbostic * BACK Normal "next" pointers all implicitly point forward; BACK 10941349Sbostic * exists to make loop structures possible. 11041349Sbostic * 11141349Sbostic * STAR,PLUS '?', and complex '*' and '+', are implemented as circular 11241349Sbostic * BRANCH structures using BACK. Simple cases (one character 11341349Sbostic * per match) are implemented with STAR and PLUS for speed 11441349Sbostic * and to minimize recursive plunges. 11541349Sbostic * 11641349Sbostic * OPEN,CLOSE ...are numbered at compile time. 11741349Sbostic */ 11841349Sbostic 11941349Sbostic /* 12041349Sbostic * A node is one char of opcode followed by two chars of "next" pointer. 12141349Sbostic * "Next" pointers are stored as two 8-bit pieces, high order first. The 12241349Sbostic * value is a positive offset from the opcode of the node containing it. 12341349Sbostic * An operand, if any, simply follows the node. (Note that much of the 12441349Sbostic * code generation knows about this implicit relationship.) 12541349Sbostic * 12641349Sbostic * Using two bytes for the "next" pointer is vast overkill for most things, 12741349Sbostic * but allows patterns to get big without disasters. 12841349Sbostic */ 12941349Sbostic #define OP(p) (*(p)) 13041349Sbostic #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377)) 13141349Sbostic #define OPERAND(p) ((p) + 3) 13241349Sbostic 13341349Sbostic /* 13441349Sbostic * See regmagic.h for one further detail of program structure. 13541349Sbostic */ 13641349Sbostic 13741349Sbostic 13841349Sbostic /* 13941349Sbostic * Utility definitions. 14041349Sbostic */ 14141349Sbostic #ifndef CHARBITS 14241349Sbostic #define UCHARAT(p) ((int)*(unsigned char *)(p)) 14341349Sbostic #else 14441349Sbostic #define UCHARAT(p) ((int)*(p)&CHARBITS) 14541349Sbostic #endif 14641349Sbostic 14741349Sbostic #define FAIL(m) { regerror(m); return(NULL); } 14841349Sbostic #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?') 14941349Sbostic 15041349Sbostic /* 15141349Sbostic * Flags to be passed up and down. 15241349Sbostic */ 15341349Sbostic #define HASWIDTH 01 /* Known never to match null string. */ 15441349Sbostic #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */ 15541349Sbostic #define SPSTART 04 /* Starts with * or +. */ 15641349Sbostic #define WORST 0 /* Worst case. */ 15741349Sbostic 15841349Sbostic /* 15941349Sbostic * Global work variables for regcomp(). 16041349Sbostic */ 16141349Sbostic static char *regparse; /* Input-scan pointer. */ 16241349Sbostic static int regnpar; /* () count. */ 16341349Sbostic static char regdummy; 16441349Sbostic static char *regcode; /* Code-emit pointer; ®dummy = don't. */ 16541349Sbostic static long regsize; /* Code size. */ 16641349Sbostic 16741349Sbostic /* 16841349Sbostic * Forward declarations for regcomp()'s friends. 16941349Sbostic */ 17041349Sbostic #ifndef STATIC 17141349Sbostic #define STATIC static 17241349Sbostic #endif 17341349Sbostic STATIC char *reg(); 17441349Sbostic STATIC char *regbranch(); 17541349Sbostic STATIC char *regpiece(); 17641349Sbostic STATIC char *regatom(); 17741349Sbostic STATIC char *regnode(); 17841349Sbostic STATIC char *regnext(); 17941349Sbostic STATIC void regc(); 18041349Sbostic STATIC void reginsert(); 18141349Sbostic STATIC void regtail(); 18241349Sbostic STATIC void regoptail(); 18341349Sbostic #ifdef STRCSPN 18441349Sbostic STATIC int strcspn(); 18541349Sbostic #endif 18641349Sbostic 18741349Sbostic /* 18841349Sbostic - regcomp - compile a regular expression into internal code 18941349Sbostic * 19041349Sbostic * We can't allocate space until we know how big the compiled form will be, 19141349Sbostic * but we can't compile it (and thus know how big it is) until we've got a 19241349Sbostic * place to put the code. So we cheat: we compile it twice, once with code 19341349Sbostic * generation turned off and size counting turned on, and once "for real". 19441349Sbostic * This also means that we don't allocate space until we are sure that the 19541349Sbostic * thing really will compile successfully, and we never have to move the 19641349Sbostic * code and thus invalidate pointers into it. (Note that it has to be in 19741349Sbostic * one piece because free() must be able to free it all.) 19841349Sbostic * 19941349Sbostic * Beware that the optimization-preparation code in here knows about some 20041349Sbostic * of the structure of the compiled regexp. 20141349Sbostic */ 20241349Sbostic regexp * 20341349Sbostic regcomp(exp) 20446619Sbostic const char *exp; 20541349Sbostic { 20641349Sbostic register regexp *r; 20741349Sbostic register char *scan; 20841349Sbostic register char *longest; 20941349Sbostic register int len; 21041349Sbostic int flags; 21141349Sbostic 21241349Sbostic if (exp == NULL) 21341349Sbostic FAIL("NULL argument"); 21441349Sbostic 21541349Sbostic /* First pass: determine size, legality. */ 216*49375Smarc #ifdef notdef 21741349Sbostic if (exp[0] == '.' && exp[1] == '*') exp += 2; /* aid grep */ 218*49375Smarc #endif 21946619Sbostic regparse = (char *)exp; 22041349Sbostic regnpar = 1; 22141349Sbostic regsize = 0L; 22241349Sbostic regcode = ®dummy; 22341349Sbostic regc(MAGIC); 22441349Sbostic if (reg(0, &flags) == NULL) 22541349Sbostic return(NULL); 22641349Sbostic 22741349Sbostic /* Small enough for pointer-storage convention? */ 22841349Sbostic if (regsize >= 32767L) /* Probably could be 65535L. */ 22941349Sbostic FAIL("regexp too big"); 23041349Sbostic 23141349Sbostic /* Allocate space. */ 23241349Sbostic r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize); 23341349Sbostic if (r == NULL) 23441349Sbostic FAIL("out of space"); 23541349Sbostic 23641349Sbostic /* Second pass: emit code. */ 23746619Sbostic regparse = (char *)exp; 23841349Sbostic regnpar = 1; 23941349Sbostic regcode = r->program; 24041349Sbostic regc(MAGIC); 24141349Sbostic if (reg(0, &flags) == NULL) 24241349Sbostic return(NULL); 24341349Sbostic 24441349Sbostic /* Dig out information for optimizations. */ 24541349Sbostic r->regstart = '\0'; /* Worst-case defaults. */ 24641349Sbostic r->reganch = 0; 24741349Sbostic r->regmust = NULL; 24841349Sbostic r->regmlen = 0; 24941349Sbostic scan = r->program+1; /* First BRANCH. */ 25041349Sbostic if (OP(regnext(scan)) == END) { /* Only one top-level choice. */ 25141349Sbostic scan = OPERAND(scan); 25241349Sbostic 25341349Sbostic /* Starting-point info. */ 25441349Sbostic if (OP(scan) == EXACTLY) 25541349Sbostic r->regstart = *OPERAND(scan); 25641349Sbostic else if (OP(scan) == BOL) 25741349Sbostic r->reganch++; 25841349Sbostic 25941349Sbostic /* 26041349Sbostic * If there's something expensive in the r.e., find the 26141349Sbostic * longest literal string that must appear and make it the 26241349Sbostic * regmust. Resolve ties in favor of later strings, since 26341349Sbostic * the regstart check works with the beginning of the r.e. 26441349Sbostic * and avoiding duplication strengthens checking. Not a 26541349Sbostic * strong reason, but sufficient in the absence of others. 26641349Sbostic */ 26741349Sbostic if (flags&SPSTART) { 26841349Sbostic longest = NULL; 26941349Sbostic len = 0; 27041349Sbostic for (; scan != NULL; scan = regnext(scan)) 27141349Sbostic if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) { 27241349Sbostic longest = OPERAND(scan); 27341349Sbostic len = strlen(OPERAND(scan)); 27441349Sbostic } 27541349Sbostic r->regmust = longest; 27641349Sbostic r->regmlen = len; 27741349Sbostic } 27841349Sbostic } 27941349Sbostic 28041349Sbostic return(r); 28141349Sbostic } 28241349Sbostic 28341349Sbostic /* 28441349Sbostic - reg - regular expression, i.e. main body or parenthesized thing 28541349Sbostic * 28641349Sbostic * Caller must absorb opening parenthesis. 28741349Sbostic * 28841349Sbostic * Combining parenthesis handling with the base level of regular expression 28941349Sbostic * is a trifle forced, but the need to tie the tails of the branches to what 29041349Sbostic * follows makes it hard to avoid. 29141349Sbostic */ 29241349Sbostic static char * 29341349Sbostic reg(paren, flagp) 29441349Sbostic int paren; /* Parenthesized? */ 29541349Sbostic int *flagp; 29641349Sbostic { 29741349Sbostic register char *ret; 29841349Sbostic register char *br; 29941349Sbostic register char *ender; 30041349Sbostic register int parno; 30141349Sbostic int flags; 30241349Sbostic 30341349Sbostic *flagp = HASWIDTH; /* Tentatively. */ 30441349Sbostic 30541349Sbostic /* Make an OPEN node, if parenthesized. */ 30641349Sbostic if (paren) { 30741349Sbostic if (regnpar >= NSUBEXP) 30841349Sbostic FAIL("too many ()"); 30941349Sbostic parno = regnpar; 31041349Sbostic regnpar++; 31141349Sbostic ret = regnode(OPEN+parno); 31241349Sbostic } else 31341349Sbostic ret = NULL; 31441349Sbostic 31541349Sbostic /* Pick up the branches, linking them together. */ 31641349Sbostic br = regbranch(&flags); 31741349Sbostic if (br == NULL) 31841349Sbostic return(NULL); 31941349Sbostic if (ret != NULL) 32041349Sbostic regtail(ret, br); /* OPEN -> first. */ 32141349Sbostic else 32241349Sbostic ret = br; 32341349Sbostic if (!(flags&HASWIDTH)) 32441349Sbostic *flagp &= ~HASWIDTH; 32541349Sbostic *flagp |= flags&SPSTART; 32641349Sbostic while (*regparse == '|' || *regparse == '\n') { 32741349Sbostic regparse++; 32841349Sbostic br = regbranch(&flags); 32941349Sbostic if (br == NULL) 33041349Sbostic return(NULL); 33141349Sbostic regtail(ret, br); /* BRANCH -> BRANCH. */ 33241349Sbostic if (!(flags&HASWIDTH)) 33341349Sbostic *flagp &= ~HASWIDTH; 33441349Sbostic *flagp |= flags&SPSTART; 33541349Sbostic } 33641349Sbostic 33741349Sbostic /* Make a closing node, and hook it on the end. */ 33841349Sbostic ender = regnode((paren) ? CLOSE+parno : END); 33941349Sbostic regtail(ret, ender); 34041349Sbostic 34141349Sbostic /* Hook the tails of the branches to the closing node. */ 34241349Sbostic for (br = ret; br != NULL; br = regnext(br)) 34341349Sbostic regoptail(br, ender); 34441349Sbostic 34541349Sbostic /* Check for proper termination. */ 34641349Sbostic if (paren && *regparse++ != ')') { 34741349Sbostic FAIL("unmatched ()"); 34841349Sbostic } else if (!paren && *regparse != '\0') { 34941349Sbostic if (*regparse == ')') { 35041349Sbostic FAIL("unmatched ()"); 35141349Sbostic } else 35241349Sbostic FAIL("junk on end"); /* "Can't happen". */ 35341349Sbostic /* NOTREACHED */ 35441349Sbostic } 35541349Sbostic 35641349Sbostic return(ret); 35741349Sbostic } 35841349Sbostic 35941349Sbostic /* 36041349Sbostic - regbranch - one alternative of an | operator 36141349Sbostic * 36241349Sbostic * Implements the concatenation operator. 36341349Sbostic */ 36441349Sbostic static char * 36541349Sbostic regbranch(flagp) 36641349Sbostic int *flagp; 36741349Sbostic { 36841349Sbostic register char *ret; 36941349Sbostic register char *chain; 37041349Sbostic register char *latest; 37141349Sbostic int flags; 37241349Sbostic 37341349Sbostic *flagp = WORST; /* Tentatively. */ 37441349Sbostic 37541349Sbostic ret = regnode(BRANCH); 37641349Sbostic chain = NULL; 37741349Sbostic while (*regparse != '\0' && *regparse != ')' && 37841349Sbostic *regparse != '\n' && *regparse != '|') { 37941349Sbostic latest = regpiece(&flags); 38041349Sbostic if (latest == NULL) 38141349Sbostic return(NULL); 38241349Sbostic *flagp |= flags&HASWIDTH; 38341349Sbostic if (chain == NULL) /* First piece. */ 38441349Sbostic *flagp |= flags&SPSTART; 38541349Sbostic else 38641349Sbostic regtail(chain, latest); 38741349Sbostic chain = latest; 38841349Sbostic } 38941349Sbostic if (chain == NULL) /* Loop ran zero times. */ 39041349Sbostic (void) regnode(NOTHING); 39141349Sbostic 39241349Sbostic return(ret); 39341349Sbostic } 39441349Sbostic 39541349Sbostic /* 39641349Sbostic - regpiece - something followed by possible [*+?] 39741349Sbostic * 39841349Sbostic * Note that the branching code sequences used for ? and the general cases 39941349Sbostic * of * and + are somewhat optimized: they use the same NOTHING node as 40041349Sbostic * both the endmarker for their branch list and the body of the last branch. 40141349Sbostic * It might seem that this node could be dispensed with entirely, but the 40241349Sbostic * endmarker role is not redundant. 40341349Sbostic */ 40441349Sbostic static char * 40541349Sbostic regpiece(flagp) 40641349Sbostic int *flagp; 40741349Sbostic { 40841349Sbostic register char *ret; 40941349Sbostic register char op; 41041349Sbostic register char *next; 41141349Sbostic int flags; 41241349Sbostic 41341349Sbostic ret = regatom(&flags); 41441349Sbostic if (ret == NULL) 41541349Sbostic return(NULL); 41641349Sbostic 41741349Sbostic op = *regparse; 41841349Sbostic if (!ISMULT(op)) { 41941349Sbostic *flagp = flags; 42041349Sbostic return(ret); 42141349Sbostic } 42241349Sbostic 42341349Sbostic if (!(flags&HASWIDTH) && op != '?') 42441349Sbostic FAIL("*+ operand could be empty"); 42541349Sbostic *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH); 42641349Sbostic 42741349Sbostic if (op == '*' && (flags&SIMPLE)) 42841349Sbostic reginsert(STAR, ret); 42941349Sbostic else if (op == '*') { 43041349Sbostic /* Emit x* as (x&|), where & means "self". */ 43141349Sbostic reginsert(BRANCH, ret); /* Either x */ 43241349Sbostic regoptail(ret, regnode(BACK)); /* and loop */ 43341349Sbostic regoptail(ret, ret); /* back */ 43441349Sbostic regtail(ret, regnode(BRANCH)); /* or */ 43541349Sbostic regtail(ret, regnode(NOTHING)); /* null. */ 43641349Sbostic } else if (op == '+' && (flags&SIMPLE)) 43741349Sbostic reginsert(PLUS, ret); 43841349Sbostic else if (op == '+') { 43941349Sbostic /* Emit x+ as x(&|), where & means "self". */ 44041349Sbostic next = regnode(BRANCH); /* Either */ 44141349Sbostic regtail(ret, next); 44241349Sbostic regtail(regnode(BACK), ret); /* loop back */ 44341349Sbostic regtail(next, regnode(BRANCH)); /* or */ 44441349Sbostic regtail(ret, regnode(NOTHING)); /* null. */ 44541349Sbostic } else if (op == '?') { 44641349Sbostic /* Emit x? as (x|) */ 44741349Sbostic reginsert(BRANCH, ret); /* Either x */ 44841349Sbostic regtail(ret, regnode(BRANCH)); /* or */ 44941349Sbostic next = regnode(NOTHING); /* null. */ 45041349Sbostic regtail(ret, next); 45141349Sbostic regoptail(ret, next); 45241349Sbostic } 45341349Sbostic regparse++; 45441349Sbostic if (ISMULT(*regparse)) 45541349Sbostic FAIL("nested *?+"); 45641349Sbostic 45741349Sbostic return(ret); 45841349Sbostic } 45941349Sbostic 46041349Sbostic /* 46141349Sbostic - regatom - the lowest level 46241349Sbostic * 46341349Sbostic * Optimization: gobbles an entire sequence of ordinary characters so that 46441349Sbostic * it can turn them into a single node, which is smaller to store and 46541349Sbostic * faster to run. Backslashed characters are exceptions, each becoming a 46641349Sbostic * separate node; the code is simpler that way and it's not worth fixing. 46741349Sbostic */ 46841349Sbostic static char * 46941349Sbostic regatom(flagp) 47041349Sbostic int *flagp; 47141349Sbostic { 47241349Sbostic register char *ret; 47341349Sbostic int flags; 47441349Sbostic 47541349Sbostic *flagp = WORST; /* Tentatively. */ 47641349Sbostic 47741349Sbostic switch (*regparse++) { 47841349Sbostic /* FIXME: these chars only have meaning at beg/end of pat? */ 47941349Sbostic case '^': 48041349Sbostic ret = regnode(BOL); 48141349Sbostic break; 48241349Sbostic case '$': 48341349Sbostic ret = regnode(EOL); 48441349Sbostic break; 48541349Sbostic case '.': 48641349Sbostic ret = regnode(ANY); 48741349Sbostic *flagp |= HASWIDTH|SIMPLE; 48841349Sbostic break; 48941349Sbostic case '[': { 49041349Sbostic register int class; 49141349Sbostic register int classend; 49241349Sbostic 49341349Sbostic if (*regparse == '^') { /* Complement of range. */ 49441349Sbostic ret = regnode(ANYBUT); 49541349Sbostic regparse++; 49641349Sbostic } else 49741349Sbostic ret = regnode(ANYOF); 49841349Sbostic if (*regparse == ']' || *regparse == '-') 49941349Sbostic regc(*regparse++); 50041349Sbostic while (*regparse != '\0' && *regparse != ']') { 50141349Sbostic if (*regparse == '-') { 50241349Sbostic regparse++; 50341349Sbostic if (*regparse == ']' || *regparse == '\0') 50441349Sbostic regc('-'); 50541349Sbostic else { 50641349Sbostic class = UCHARAT(regparse-2)+1; 50741349Sbostic classend = UCHARAT(regparse); 50841349Sbostic if (class > classend+1) 50941349Sbostic FAIL("invalid [] range"); 51041349Sbostic for (; class <= classend; class++) 51141349Sbostic regc(class); 51241349Sbostic regparse++; 51341349Sbostic } 51441349Sbostic } else 51541349Sbostic regc(*regparse++); 51641349Sbostic } 51741349Sbostic regc('\0'); 51841349Sbostic if (*regparse != ']') 51941349Sbostic FAIL("unmatched []"); 52041349Sbostic regparse++; 52141349Sbostic *flagp |= HASWIDTH|SIMPLE; 52241349Sbostic } 52341349Sbostic break; 52441349Sbostic case '(': 52541349Sbostic ret = reg(1, &flags); 52641349Sbostic if (ret == NULL) 52741349Sbostic return(NULL); 52841349Sbostic *flagp |= flags&(HASWIDTH|SPSTART); 52941349Sbostic break; 53041349Sbostic case '\0': 53141349Sbostic case '|': 53241349Sbostic case '\n': 53341349Sbostic case ')': 53441349Sbostic FAIL("internal urp"); /* Supposed to be caught earlier. */ 53541349Sbostic break; 53641349Sbostic case '?': 53741349Sbostic case '+': 53841349Sbostic case '*': 53941349Sbostic FAIL("?+* follows nothing"); 54041349Sbostic break; 54141349Sbostic case '\\': 54241349Sbostic switch (*regparse++) { 54341349Sbostic case '\0': 54441349Sbostic FAIL("trailing \\"); 54541349Sbostic break; 54641349Sbostic case '<': 54741349Sbostic ret = regnode(WORDA); 54841349Sbostic break; 54941349Sbostic case '>': 55041349Sbostic ret = regnode(WORDZ); 55141349Sbostic break; 55241349Sbostic /* FIXME: Someday handle \1, \2, ... */ 55341349Sbostic default: 55441349Sbostic /* Handle general quoted chars in exact-match routine */ 55541349Sbostic goto de_fault; 55641349Sbostic } 55741349Sbostic break; 55841349Sbostic de_fault: 55941349Sbostic default: 56041349Sbostic /* 56141349Sbostic * Encode a string of characters to be matched exactly. 56241349Sbostic * 56341349Sbostic * This is a bit tricky due to quoted chars and due to 56441349Sbostic * '*', '+', and '?' taking the SINGLE char previous 56541349Sbostic * as their operand. 56641349Sbostic * 56741349Sbostic * On entry, the char at regparse[-1] is going to go 56841349Sbostic * into the string, no matter what it is. (It could be 56941349Sbostic * following a \ if we are entered from the '\' case.) 57041349Sbostic * 57141349Sbostic * Basic idea is to pick up a good char in ch and 57241349Sbostic * examine the next char. If it's *+? then we twiddle. 57341349Sbostic * If it's \ then we frozzle. If it's other magic char 57441349Sbostic * we push ch and terminate the string. If none of the 57541349Sbostic * above, we push ch on the string and go around again. 57641349Sbostic * 57741349Sbostic * regprev is used to remember where "the current char" 57841349Sbostic * starts in the string, if due to a *+? we need to back 57941349Sbostic * up and put the current char in a separate, 1-char, string. 58041349Sbostic * When regprev is NULL, ch is the only char in the 58141349Sbostic * string; this is used in *+? handling, and in setting 58241349Sbostic * flags |= SIMPLE at the end. 58341349Sbostic */ 58441349Sbostic { 58541349Sbostic char *regprev; 58641349Sbostic register char ch; 58741349Sbostic 58841349Sbostic regparse--; /* Look at cur char */ 58941349Sbostic ret = regnode(EXACTLY); 59041349Sbostic for ( regprev = 0 ; ; ) { 59141349Sbostic ch = *regparse++; /* Get current char */ 59241349Sbostic switch (*regparse) { /* look at next one */ 59341349Sbostic 59441349Sbostic default: 59541349Sbostic regc(ch); /* Add cur to string */ 59641349Sbostic break; 59741349Sbostic 59841349Sbostic case '.': case '[': case '(': 59941349Sbostic case ')': case '|': case '\n': 60041349Sbostic case '$': case '^': 60141349Sbostic case '\0': 60241349Sbostic /* FIXME, $ and ^ should not always be magic */ 60341349Sbostic magic: 60441349Sbostic regc(ch); /* dump cur char */ 60541349Sbostic goto done; /* and we are done */ 60641349Sbostic 60741349Sbostic case '?': case '+': case '*': 60841349Sbostic if (!regprev) /* If just ch in str, */ 60941349Sbostic goto magic; /* use it */ 61041349Sbostic /* End mult-char string one early */ 61141349Sbostic regparse = regprev; /* Back up parse */ 61241349Sbostic goto done; 61341349Sbostic 61441349Sbostic case '\\': 61541349Sbostic regc(ch); /* Cur char OK */ 61641349Sbostic switch (regparse[1]){ /* Look after \ */ 61741349Sbostic case '\0': 61841349Sbostic case '<': 61941349Sbostic case '>': 62041349Sbostic /* FIXME: Someday handle \1, \2, ... */ 62141349Sbostic goto done; /* Not quoted */ 62241349Sbostic default: 62341349Sbostic /* Backup point is \, scan * point is after it. */ 62441349Sbostic regprev = regparse; 62541349Sbostic regparse++; 62641349Sbostic continue; /* NOT break; */ 62741349Sbostic } 62841349Sbostic } 62941349Sbostic regprev = regparse; /* Set backup point */ 63041349Sbostic } 63141349Sbostic done: 63241349Sbostic regc('\0'); 63341349Sbostic *flagp |= HASWIDTH; 63441349Sbostic if (!regprev) /* One char? */ 63541349Sbostic *flagp |= SIMPLE; 63641349Sbostic } 63741349Sbostic break; 63841349Sbostic } 63941349Sbostic 64041349Sbostic return(ret); 64141349Sbostic } 64241349Sbostic 64341349Sbostic /* 64441349Sbostic - regnode - emit a node 64541349Sbostic */ 64641349Sbostic static char * /* Location. */ 64741349Sbostic regnode(op) 64841349Sbostic char op; 64941349Sbostic { 65041349Sbostic register char *ret; 65141349Sbostic register char *ptr; 65241349Sbostic 65341349Sbostic ret = regcode; 65441349Sbostic if (ret == ®dummy) { 65541349Sbostic regsize += 3; 65641349Sbostic return(ret); 65741349Sbostic } 65841349Sbostic 65941349Sbostic ptr = ret; 66041349Sbostic *ptr++ = op; 66141349Sbostic *ptr++ = '\0'; /* Null "next" pointer. */ 66241349Sbostic *ptr++ = '\0'; 66341349Sbostic regcode = ptr; 66441349Sbostic 66541349Sbostic return(ret); 66641349Sbostic } 66741349Sbostic 66841349Sbostic /* 66941349Sbostic - regc - emit (if appropriate) a byte of code 67041349Sbostic */ 67141349Sbostic static void 67241349Sbostic regc(b) 67341349Sbostic char b; 67441349Sbostic { 67541349Sbostic if (regcode != ®dummy) 67641349Sbostic *regcode++ = b; 67741349Sbostic else 67841349Sbostic regsize++; 67941349Sbostic } 68041349Sbostic 68141349Sbostic /* 68241349Sbostic - reginsert - insert an operator in front of already-emitted operand 68341349Sbostic * 68441349Sbostic * Means relocating the operand. 68541349Sbostic */ 68641349Sbostic static void 68741349Sbostic reginsert(op, opnd) 68841349Sbostic char op; 68941349Sbostic char *opnd; 69041349Sbostic { 69141349Sbostic register char *src; 69241349Sbostic register char *dst; 69341349Sbostic register char *place; 69441349Sbostic 69541349Sbostic if (regcode == ®dummy) { 69641349Sbostic regsize += 3; 69741349Sbostic return; 69841349Sbostic } 69941349Sbostic 70041349Sbostic src = regcode; 70141349Sbostic regcode += 3; 70241349Sbostic dst = regcode; 70341349Sbostic while (src > opnd) 70441349Sbostic *--dst = *--src; 70541349Sbostic 70641349Sbostic place = opnd; /* Op node, where operand used to be. */ 70741349Sbostic *place++ = op; 70841349Sbostic *place++ = '\0'; 70941349Sbostic *place++ = '\0'; 71041349Sbostic } 71141349Sbostic 71241349Sbostic /* 71341349Sbostic - regtail - set the next-pointer at the end of a node chain 71441349Sbostic */ 71541349Sbostic static void 71641349Sbostic regtail(p, val) 71741349Sbostic char *p; 71841349Sbostic char *val; 71941349Sbostic { 72041349Sbostic register char *scan; 72141349Sbostic register char *temp; 72241349Sbostic register int offset; 72341349Sbostic 72441349Sbostic if (p == ®dummy) 72541349Sbostic return; 72641349Sbostic 72741349Sbostic /* Find last node. */ 72841349Sbostic scan = p; 72941349Sbostic for (;;) { 73041349Sbostic temp = regnext(scan); 73141349Sbostic if (temp == NULL) 73241349Sbostic break; 73341349Sbostic scan = temp; 73441349Sbostic } 73541349Sbostic 73641349Sbostic if (OP(scan) == BACK) 73741349Sbostic offset = scan - val; 73841349Sbostic else 73941349Sbostic offset = val - scan; 74041349Sbostic *(scan+1) = (offset>>8)&0377; 74141349Sbostic *(scan+2) = offset&0377; 74241349Sbostic } 74341349Sbostic 74441349Sbostic /* 74541349Sbostic - regoptail - regtail on operand of first argument; nop if operandless 74641349Sbostic */ 74741349Sbostic static void 74841349Sbostic regoptail(p, val) 74941349Sbostic char *p; 75041349Sbostic char *val; 75141349Sbostic { 75241349Sbostic /* "Operandless" and "op != BRANCH" are synonymous in practice. */ 75341349Sbostic if (p == NULL || p == ®dummy || OP(p) != BRANCH) 75441349Sbostic return; 75541349Sbostic regtail(OPERAND(p), val); 75641349Sbostic } 75741349Sbostic 75841349Sbostic /* 75941349Sbostic * regexec and friends 76041349Sbostic */ 76141349Sbostic 76241349Sbostic /* 76341349Sbostic * Global work variables for regexec(). 76441349Sbostic */ 76541349Sbostic static char *reginput; /* String-input pointer. */ 76641349Sbostic static char *regbol; /* Beginning of input, for ^ check. */ 76741349Sbostic static char **regstartp; /* Pointer to startp array. */ 76841349Sbostic static char **regendp; /* Ditto for endp. */ 76941349Sbostic 77041349Sbostic /* 77141349Sbostic * Forwards. 77241349Sbostic */ 77341349Sbostic STATIC int regtry(); 77441349Sbostic STATIC int regmatch(); 77541349Sbostic STATIC int regrepeat(); 77641349Sbostic 77741349Sbostic #ifdef DEBUG 77841349Sbostic int regnarrate = 0; 77941349Sbostic void regdump(); 78041349Sbostic STATIC char *regprop(); 78141349Sbostic #endif 78241349Sbostic 78341349Sbostic /* 78441349Sbostic - regexec - match a regexp against a string 78541349Sbostic */ 78641349Sbostic int 78741349Sbostic regexec(prog, string) 78846619Sbostic register const regexp *prog; 78946619Sbostic register const char *string; 79041349Sbostic { 79141349Sbostic register char *s; 79241349Sbostic extern char *strchr(); 79341349Sbostic 79441349Sbostic /* Be paranoid... */ 79541349Sbostic if (prog == NULL || string == NULL) { 79641349Sbostic regerror("NULL parameter"); 79741349Sbostic return(0); 79841349Sbostic } 79941349Sbostic 80041349Sbostic /* Check validity of program. */ 80141349Sbostic if (UCHARAT(prog->program) != MAGIC) { 80241349Sbostic regerror("corrupted program"); 80341349Sbostic return(0); 80441349Sbostic } 80541349Sbostic 80641349Sbostic /* If there is a "must appear" string, look for it. */ 80741349Sbostic if (prog->regmust != NULL) { 80846619Sbostic s = (char *)string; 80941349Sbostic while ((s = strchr(s, prog->regmust[0])) != NULL) { 81041349Sbostic if (strncmp(s, prog->regmust, prog->regmlen) == 0) 81141349Sbostic break; /* Found it. */ 81241349Sbostic s++; 81341349Sbostic } 81441349Sbostic if (s == NULL) /* Not present. */ 81541349Sbostic return(0); 81641349Sbostic } 81741349Sbostic 81841349Sbostic /* Mark beginning of line for ^ . */ 81946619Sbostic regbol = (char *)string; 82041349Sbostic 82141349Sbostic /* Simplest case: anchored match need be tried only once. */ 82241349Sbostic if (prog->reganch) 82341349Sbostic return(regtry(prog, string)); 82441349Sbostic 82541349Sbostic /* Messy cases: unanchored match. */ 82646619Sbostic s = (char *)string; 82741349Sbostic if (prog->regstart != '\0') 82841349Sbostic /* We know what char it must start with. */ 82941349Sbostic while ((s = strchr(s, prog->regstart)) != NULL) { 83041349Sbostic if (regtry(prog, s)) 83141349Sbostic return(1); 83241349Sbostic s++; 83341349Sbostic } 83441349Sbostic else 83541349Sbostic /* We don't -- general case. */ 83641349Sbostic do { 83741349Sbostic if (regtry(prog, s)) 83841349Sbostic return(1); 83941349Sbostic } while (*s++ != '\0'); 84041349Sbostic 84141349Sbostic /* Failure. */ 84241349Sbostic return(0); 84341349Sbostic } 84441349Sbostic 84541349Sbostic /* 84641349Sbostic - regtry - try match at specific point 84741349Sbostic */ 84841349Sbostic static int /* 0 failure, 1 success */ 84941349Sbostic regtry(prog, string) 85041349Sbostic regexp *prog; 85141349Sbostic char *string; 85241349Sbostic { 85341349Sbostic register int i; 85441349Sbostic register char **sp; 85541349Sbostic register char **ep; 85641349Sbostic 85741349Sbostic reginput = string; 85841349Sbostic regstartp = prog->startp; 85941349Sbostic regendp = prog->endp; 86041349Sbostic 86141349Sbostic sp = prog->startp; 86241349Sbostic ep = prog->endp; 86341349Sbostic for (i = NSUBEXP; i > 0; i--) { 86441349Sbostic *sp++ = NULL; 86541349Sbostic *ep++ = NULL; 86641349Sbostic } 86741349Sbostic if (regmatch(prog->program + 1)) { 86841349Sbostic prog->startp[0] = string; 86941349Sbostic prog->endp[0] = reginput; 87041349Sbostic return(1); 87141349Sbostic } else 87241349Sbostic return(0); 87341349Sbostic } 87441349Sbostic 87541349Sbostic /* 87641349Sbostic - regmatch - main matching routine 87741349Sbostic * 87841349Sbostic * Conceptually the strategy is simple: check to see whether the current 87941349Sbostic * node matches, call self recursively to see whether the rest matches, 88041349Sbostic * and then act accordingly. In practice we make some effort to avoid 88141349Sbostic * recursion, in particular by going through "ordinary" nodes (that don't 88241349Sbostic * need to know whether the rest of the match failed) by a loop instead of 88341349Sbostic * by recursion. 88441349Sbostic */ 88541349Sbostic static int /* 0 failure, 1 success */ 88641349Sbostic regmatch(prog) 88741349Sbostic char *prog; 88841349Sbostic { 88941349Sbostic register char *scan; /* Current node. */ 89041349Sbostic char *next; /* Next node. */ 89141349Sbostic extern char *strchr(); 89241349Sbostic 89341349Sbostic scan = prog; 89441349Sbostic #ifdef DEBUG 89541349Sbostic if (scan != NULL && regnarrate) 89641349Sbostic fprintf(stderr, "%s(\n", regprop(scan)); 89741349Sbostic #endif 89841349Sbostic while (scan != NULL) { 89941349Sbostic #ifdef DEBUG 90041349Sbostic if (regnarrate) 90141349Sbostic fprintf(stderr, "%s...\n", regprop(scan)); 90241349Sbostic #endif 90341349Sbostic next = regnext(scan); 90441349Sbostic 90541349Sbostic switch (OP(scan)) { 90641349Sbostic case BOL: 90741349Sbostic if (reginput != regbol) 90841349Sbostic return(0); 90941349Sbostic break; 91041349Sbostic case EOL: 91141349Sbostic if (*reginput != '\0') 91241349Sbostic return(0); 91341349Sbostic break; 91441349Sbostic case WORDA: 91541349Sbostic /* Must be looking at a letter, digit, or _ */ 91641349Sbostic if ((!isalnum(*reginput)) && *reginput != '_') 91741349Sbostic return(0); 91841349Sbostic /* Prev must be BOL or nonword */ 91941349Sbostic if (reginput > regbol && 92041349Sbostic (isalnum(reginput[-1]) || reginput[-1] == '_')) 92141349Sbostic return(0); 92241349Sbostic break; 92341349Sbostic case WORDZ: 92441349Sbostic /* Must be looking at non letter, digit, or _ */ 92541349Sbostic if (isalnum(*reginput) || *reginput == '_') 92641349Sbostic return(0); 92741349Sbostic /* We don't care what the previous char was */ 92841349Sbostic break; 92941349Sbostic case ANY: 93041349Sbostic if (*reginput == '\0') 93141349Sbostic return(0); 93241349Sbostic reginput++; 93341349Sbostic break; 93441349Sbostic case EXACTLY: { 93541349Sbostic register int len; 93641349Sbostic register char *opnd; 93741349Sbostic 93841349Sbostic opnd = OPERAND(scan); 93941349Sbostic /* Inline the first character, for speed. */ 94041349Sbostic if (*opnd != *reginput) 94141349Sbostic return(0); 94241349Sbostic len = strlen(opnd); 94341349Sbostic if (len > 1 && strncmp(opnd, reginput, len) != 0) 94441349Sbostic return(0); 94541349Sbostic reginput += len; 94641349Sbostic } 94741349Sbostic break; 94841349Sbostic case ANYOF: 94941349Sbostic if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL) 95041349Sbostic return(0); 95141349Sbostic reginput++; 95241349Sbostic break; 95341349Sbostic case ANYBUT: 95441349Sbostic if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL) 95541349Sbostic return(0); 95641349Sbostic reginput++; 95741349Sbostic break; 95841349Sbostic case NOTHING: 95941349Sbostic break; 96041349Sbostic case BACK: 96141349Sbostic break; 96241349Sbostic case OPEN+1: 96341349Sbostic case OPEN+2: 96441349Sbostic case OPEN+3: 96541349Sbostic case OPEN+4: 96641349Sbostic case OPEN+5: 96741349Sbostic case OPEN+6: 96841349Sbostic case OPEN+7: 96941349Sbostic case OPEN+8: 97041349Sbostic case OPEN+9: { 97141349Sbostic register int no; 97241349Sbostic register char *save; 97341349Sbostic 97441349Sbostic no = OP(scan) - OPEN; 97541349Sbostic save = reginput; 97641349Sbostic 97741349Sbostic if (regmatch(next)) { 97841349Sbostic /* 97941349Sbostic * Don't set startp if some later 98041349Sbostic * invocation of the same parentheses 98141349Sbostic * already has. 98241349Sbostic */ 98341349Sbostic if (regstartp[no] == NULL) 98441349Sbostic regstartp[no] = save; 98541349Sbostic return(1); 98641349Sbostic } else 98741349Sbostic return(0); 98841349Sbostic } 98941349Sbostic break; 99041349Sbostic case CLOSE+1: 99141349Sbostic case CLOSE+2: 99241349Sbostic case CLOSE+3: 99341349Sbostic case CLOSE+4: 99441349Sbostic case CLOSE+5: 99541349Sbostic case CLOSE+6: 99641349Sbostic case CLOSE+7: 99741349Sbostic case CLOSE+8: 99841349Sbostic case CLOSE+9: { 99941349Sbostic register int no; 100041349Sbostic register char *save; 100141349Sbostic 100241349Sbostic no = OP(scan) - CLOSE; 100341349Sbostic save = reginput; 100441349Sbostic 100541349Sbostic if (regmatch(next)) { 100641349Sbostic /* 100741349Sbostic * Don't set endp if some later 100841349Sbostic * invocation of the same parentheses 100941349Sbostic * already has. 101041349Sbostic */ 101141349Sbostic if (regendp[no] == NULL) 101241349Sbostic regendp[no] = save; 101341349Sbostic return(1); 101441349Sbostic } else 101541349Sbostic return(0); 101641349Sbostic } 101741349Sbostic break; 101841349Sbostic case BRANCH: { 101941349Sbostic register char *save; 102041349Sbostic 102141349Sbostic if (OP(next) != BRANCH) /* No choice. */ 102241349Sbostic next = OPERAND(scan); /* Avoid recursion. */ 102341349Sbostic else { 102441349Sbostic do { 102541349Sbostic save = reginput; 102641349Sbostic if (regmatch(OPERAND(scan))) 102741349Sbostic return(1); 102841349Sbostic reginput = save; 102941349Sbostic scan = regnext(scan); 103041349Sbostic } while (scan != NULL && OP(scan) == BRANCH); 103141349Sbostic return(0); 103241349Sbostic /* NOTREACHED */ 103341349Sbostic } 103441349Sbostic } 103541349Sbostic break; 103641349Sbostic case STAR: 103741349Sbostic case PLUS: { 103841349Sbostic register char nextch; 103941349Sbostic register int no; 104041349Sbostic register char *save; 104141349Sbostic register int min; 104241349Sbostic 104341349Sbostic /* 104441349Sbostic * Lookahead to avoid useless match attempts 104541349Sbostic * when we know what character comes next. 104641349Sbostic */ 104741349Sbostic nextch = '\0'; 104841349Sbostic if (OP(next) == EXACTLY) 104941349Sbostic nextch = *OPERAND(next); 105041349Sbostic min = (OP(scan) == STAR) ? 0 : 1; 105141349Sbostic save = reginput; 105241349Sbostic no = regrepeat(OPERAND(scan)); 105341349Sbostic while (no >= min) { 105441349Sbostic /* If it could work, try it. */ 105541349Sbostic if (nextch == '\0' || *reginput == nextch) 105641349Sbostic if (regmatch(next)) 105741349Sbostic return(1); 105841349Sbostic /* Couldn't or didn't -- back up. */ 105941349Sbostic no--; 106041349Sbostic reginput = save + no; 106141349Sbostic } 106241349Sbostic return(0); 106341349Sbostic } 106441349Sbostic break; 106541349Sbostic case END: 106641349Sbostic return(1); /* Success! */ 106741349Sbostic break; 106841349Sbostic default: 106941349Sbostic regerror("memory corruption"); 107041349Sbostic return(0); 107141349Sbostic break; 107241349Sbostic } 107341349Sbostic 107441349Sbostic scan = next; 107541349Sbostic } 107641349Sbostic 107741349Sbostic /* 107841349Sbostic * We get here only if there's trouble -- normally "case END" is 107941349Sbostic * the terminating point. 108041349Sbostic */ 108141349Sbostic regerror("corrupted pointers"); 108241349Sbostic return(0); 108341349Sbostic } 108441349Sbostic 108541349Sbostic /* 108641349Sbostic - regrepeat - repeatedly match something simple, report how many 108741349Sbostic */ 108841349Sbostic static int 108941349Sbostic regrepeat(p) 109041349Sbostic char *p; 109141349Sbostic { 109241349Sbostic register int count = 0; 109341349Sbostic register char *scan; 109441349Sbostic register char *opnd; 109541349Sbostic 109641349Sbostic scan = reginput; 109741349Sbostic opnd = OPERAND(p); 109841349Sbostic switch (OP(p)) { 109941349Sbostic case ANY: 110041349Sbostic count = strlen(scan); 110141349Sbostic scan += count; 110241349Sbostic break; 110341349Sbostic case EXACTLY: 110441349Sbostic while (*opnd == *scan) { 110541349Sbostic count++; 110641349Sbostic scan++; 110741349Sbostic } 110841349Sbostic break; 110941349Sbostic case ANYOF: 111041349Sbostic while (*scan != '\0' && strchr(opnd, *scan) != NULL) { 111141349Sbostic count++; 111241349Sbostic scan++; 111341349Sbostic } 111441349Sbostic break; 111541349Sbostic case ANYBUT: 111641349Sbostic while (*scan != '\0' && strchr(opnd, *scan) == NULL) { 111741349Sbostic count++; 111841349Sbostic scan++; 111941349Sbostic } 112041349Sbostic break; 112141349Sbostic default: /* Oh dear. Called inappropriately. */ 112241349Sbostic regerror("internal foulup"); 112341349Sbostic count = 0; /* Best compromise. */ 112441349Sbostic break; 112541349Sbostic } 112641349Sbostic reginput = scan; 112741349Sbostic 112841349Sbostic return(count); 112941349Sbostic } 113041349Sbostic 113141349Sbostic /* 113241349Sbostic - regnext - dig the "next" pointer out of a node 113341349Sbostic */ 113441349Sbostic static char * 113541349Sbostic regnext(p) 113641349Sbostic register char *p; 113741349Sbostic { 113841349Sbostic register int offset; 113941349Sbostic 114041349Sbostic if (p == ®dummy) 114141349Sbostic return(NULL); 114241349Sbostic 114341349Sbostic offset = NEXT(p); 114441349Sbostic if (offset == 0) 114541349Sbostic return(NULL); 114641349Sbostic 114741349Sbostic if (OP(p) == BACK) 114841349Sbostic return(p-offset); 114941349Sbostic else 115041349Sbostic return(p+offset); 115141349Sbostic } 115241349Sbostic 115341349Sbostic #ifdef DEBUG 115441349Sbostic 115541349Sbostic STATIC char *regprop(); 115641349Sbostic 115741349Sbostic /* 115841349Sbostic - regdump - dump a regexp onto stdout in vaguely comprehensible form 115941349Sbostic */ 116041349Sbostic void 116141349Sbostic regdump(r) 116241349Sbostic regexp *r; 116341349Sbostic { 116441349Sbostic register char *s; 116541349Sbostic register char op = EXACTLY; /* Arbitrary non-END op. */ 116641349Sbostic register char *next; 116741349Sbostic extern char *strchr(); 116841349Sbostic 116941349Sbostic 117041349Sbostic s = r->program + 1; 117141349Sbostic while (op != END) { /* While that wasn't END last time... */ 117241349Sbostic op = OP(s); 117341349Sbostic printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */ 117441349Sbostic next = regnext(s); 117541349Sbostic if (next == NULL) /* Next ptr. */ 117641349Sbostic printf("(0)"); 117741349Sbostic else 117841349Sbostic printf("(%d)", (s-r->program)+(next-s)); 117941349Sbostic s += 3; 118041349Sbostic if (op == ANYOF || op == ANYBUT || op == EXACTLY) { 118141349Sbostic /* Literal string, where present. */ 118241349Sbostic while (*s != '\0') { 118341349Sbostic putchar(*s); 118441349Sbostic s++; 118541349Sbostic } 118641349Sbostic s++; 118741349Sbostic } 118841349Sbostic putchar('\n'); 118941349Sbostic } 119041349Sbostic 119141349Sbostic /* Header fields of interest. */ 119241349Sbostic if (r->regstart != '\0') 119341349Sbostic printf("start `%c' ", r->regstart); 119441349Sbostic if (r->reganch) 119541349Sbostic printf("anchored "); 119641349Sbostic if (r->regmust != NULL) 119741349Sbostic printf("must have \"%s\"", r->regmust); 119841349Sbostic printf("\n"); 119941349Sbostic } 120041349Sbostic 120141349Sbostic /* 120241349Sbostic - regprop - printable representation of opcode 120341349Sbostic */ 120441349Sbostic static char * 120541349Sbostic regprop(op) 120641349Sbostic char *op; 120741349Sbostic { 120841349Sbostic register char *p; 120941349Sbostic static char buf[50]; 121041349Sbostic 121141349Sbostic (void) strcpy(buf, ":"); 121241349Sbostic 121341349Sbostic switch (OP(op)) { 121441349Sbostic case BOL: 121541349Sbostic p = "BOL"; 121641349Sbostic break; 121741349Sbostic case EOL: 121841349Sbostic p = "EOL"; 121941349Sbostic break; 122041349Sbostic case ANY: 122141349Sbostic p = "ANY"; 122241349Sbostic break; 122341349Sbostic case ANYOF: 122441349Sbostic p = "ANYOF"; 122541349Sbostic break; 122641349Sbostic case ANYBUT: 122741349Sbostic p = "ANYBUT"; 122841349Sbostic break; 122941349Sbostic case BRANCH: 123041349Sbostic p = "BRANCH"; 123141349Sbostic break; 123241349Sbostic case EXACTLY: 123341349Sbostic p = "EXACTLY"; 123441349Sbostic break; 123541349Sbostic case NOTHING: 123641349Sbostic p = "NOTHING"; 123741349Sbostic break; 123841349Sbostic case BACK: 123941349Sbostic p = "BACK"; 124041349Sbostic break; 124141349Sbostic case END: 124241349Sbostic p = "END"; 124341349Sbostic break; 124441349Sbostic case OPEN+1: 124541349Sbostic case OPEN+2: 124641349Sbostic case OPEN+3: 124741349Sbostic case OPEN+4: 124841349Sbostic case OPEN+5: 124941349Sbostic case OPEN+6: 125041349Sbostic case OPEN+7: 125141349Sbostic case OPEN+8: 125241349Sbostic case OPEN+9: 125341349Sbostic sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN); 125441349Sbostic p = NULL; 125541349Sbostic break; 125641349Sbostic case CLOSE+1: 125741349Sbostic case CLOSE+2: 125841349Sbostic case CLOSE+3: 125941349Sbostic case CLOSE+4: 126041349Sbostic case CLOSE+5: 126141349Sbostic case CLOSE+6: 126241349Sbostic case CLOSE+7: 126341349Sbostic case CLOSE+8: 126441349Sbostic case CLOSE+9: 126541349Sbostic sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE); 126641349Sbostic p = NULL; 126741349Sbostic break; 126841349Sbostic case STAR: 126941349Sbostic p = "STAR"; 127041349Sbostic break; 127141349Sbostic case PLUS: 127241349Sbostic p = "PLUS"; 127341349Sbostic break; 127441349Sbostic case WORDA: 127541349Sbostic p = "WORDA"; 127641349Sbostic break; 127741349Sbostic case WORDZ: 127841349Sbostic p = "WORDZ"; 127941349Sbostic break; 128041349Sbostic default: 128141349Sbostic regerror("corrupted opcode"); 128241349Sbostic break; 128341349Sbostic } 128441349Sbostic if (p != NULL) 128541349Sbostic (void) strcat(buf, p); 128641349Sbostic return(buf); 128741349Sbostic } 128841349Sbostic #endif 128941349Sbostic 129041349Sbostic /* 129141349Sbostic * The following is provided for those people who do not have strcspn() in 129241349Sbostic * their C libraries. They should get off their butts and do something 129341349Sbostic * about it; at least one public-domain implementation of those (highly 129441349Sbostic * useful) string routines has been published on Usenet. 129541349Sbostic */ 129641349Sbostic #ifdef STRCSPN 129741349Sbostic /* 129841349Sbostic * strcspn - find length of initial segment of s1 consisting entirely 129941349Sbostic * of characters not from s2 130041349Sbostic */ 130141349Sbostic 130241349Sbostic static int 130341349Sbostic strcspn(s1, s2) 130441349Sbostic char *s1; 130541349Sbostic char *s2; 130641349Sbostic { 130741349Sbostic register char *scan1; 130841349Sbostic register char *scan2; 130941349Sbostic register int count; 131041349Sbostic 131141349Sbostic count = 0; 131241349Sbostic for (scan1 = s1; *scan1 != '\0'; scan1++) { 131341349Sbostic for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */ 131441349Sbostic if (*scan1 == *scan2++) 131541349Sbostic return(count); 131641349Sbostic count++; 131741349Sbostic } 131841349Sbostic return(count); 131941349Sbostic } 132041349Sbostic #endif 1321