1*41349Sbostic /* 2*41349Sbostic * regcomp and regexec -- regsub and regerror are elsewhere 3*41349Sbostic * 4*41349Sbostic * Copyright (c) 1986 by University of Toronto. 5*41349Sbostic * Written by Henry Spencer. Not derived from licensed software. 6*41349Sbostic * 7*41349Sbostic * Permission is granted to anyone to use this software for any 8*41349Sbostic * purpose on any computer system, and to redistribute it freely, 9*41349Sbostic * subject to the following restrictions: 10*41349Sbostic * 11*41349Sbostic * 1. The author is not responsible for the consequences of use of 12*41349Sbostic * this software, no matter how awful, even if they arise 13*41349Sbostic * from defects in it. 14*41349Sbostic * 15*41349Sbostic * 2. The origin of this software must not be misrepresented, either 16*41349Sbostic * by explicit claim or by omission. 17*41349Sbostic * 18*41349Sbostic * 3. Altered versions must be plainly marked as such, and must not 19*41349Sbostic * be misrepresented as being the original software. 20*41349Sbostic *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, 21*41349Sbostic *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to | 22*41349Sbostic *** to assist in implementing egrep. 23*41349Sbostic *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, 24*41349Sbostic *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching 25*41349Sbostic *** as in BSD grep and ex. 26*41349Sbostic *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, 27*41349Sbostic *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \. 28*41349Sbostic *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods, 29*41349Sbostic *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy. 30*41349Sbostic * 31*41349Sbostic * Beware that some of this code is subtly aware of the way operator 32*41349Sbostic * precedence is structured in regular expressions. Serious changes in 33*41349Sbostic * regular-expression syntax might require a total rethink. 34*41349Sbostic */ 35*41349Sbostic #include <stdio.h> 36*41349Sbostic #include <ctype.h> 37*41349Sbostic #include <regexp.h> 38*41349Sbostic #include "regmagic.h" 39*41349Sbostic 40*41349Sbostic /* 41*41349Sbostic * The "internal use only" fields in regexp.h are present to pass info from 42*41349Sbostic * compile to execute that permits the execute phase to run lots faster on 43*41349Sbostic * simple cases. They are: 44*41349Sbostic * 45*41349Sbostic * regstart char that must begin a match; '\0' if none obvious 46*41349Sbostic * reganch is the match anchored (at beginning-of-line only)? 47*41349Sbostic * regmust string (pointer into program) that match must include, or NULL 48*41349Sbostic * regmlen length of regmust string 49*41349Sbostic * 50*41349Sbostic * Regstart and reganch permit very fast decisions on suitable starting points 51*41349Sbostic * for a match, cutting down the work a lot. Regmust permits fast rejection 52*41349Sbostic * of lines that cannot possibly match. The regmust tests are costly enough 53*41349Sbostic * that regcomp() supplies a regmust only if the r.e. contains something 54*41349Sbostic * potentially expensive (at present, the only such thing detected is * or + 55*41349Sbostic * at the start of the r.e., which can involve a lot of backup). Regmlen is 56*41349Sbostic * supplied because the test in regexec() needs it and regcomp() is computing 57*41349Sbostic * it anyway. 58*41349Sbostic */ 59*41349Sbostic 60*41349Sbostic /* 61*41349Sbostic * Structure for regexp "program". This is essentially a linear encoding 62*41349Sbostic * of a nondeterministic finite-state machine (aka syntax charts or 63*41349Sbostic * "railroad normal form" in parsing technology). Each node is an opcode 64*41349Sbostic * plus a "next" pointer, possibly plus an operand. "Next" pointers of 65*41349Sbostic * all nodes except BRANCH implement concatenation; a "next" pointer with 66*41349Sbostic * a BRANCH on both ends of it is connecting two alternatives. (Here we 67*41349Sbostic * have one of the subtle syntax dependencies: an individual BRANCH (as 68*41349Sbostic * opposed to a collection of them) is never concatenated with anything 69*41349Sbostic * because of operator precedence.) The operand of some types of node is 70*41349Sbostic * a literal string; for others, it is a node leading into a sub-FSM. In 71*41349Sbostic * particular, the operand of a BRANCH node is the first node of the branch. 72*41349Sbostic * (NB this is *not* a tree structure: the tail of the branch connects 73*41349Sbostic * to the thing following the set of BRANCHes.) The opcodes are: 74*41349Sbostic */ 75*41349Sbostic 76*41349Sbostic /* definition number opnd? meaning */ 77*41349Sbostic #define END 0 /* no End of program. */ 78*41349Sbostic #define BOL 1 /* no Match "" at beginning of line. */ 79*41349Sbostic #define EOL 2 /* no Match "" at end of line. */ 80*41349Sbostic #define ANY 3 /* no Match any one character. */ 81*41349Sbostic #define ANYOF 4 /* str Match any character in this string. */ 82*41349Sbostic #define ANYBUT 5 /* str Match any character not in this string. */ 83*41349Sbostic #define BRANCH 6 /* node Match this alternative, or the next... */ 84*41349Sbostic #define BACK 7 /* no Match "", "next" ptr points backward. */ 85*41349Sbostic #define EXACTLY 8 /* str Match this string. */ 86*41349Sbostic #define NOTHING 9 /* no Match empty string. */ 87*41349Sbostic #define STAR 10 /* node Match this (simple) thing 0 or more times. */ 88*41349Sbostic #define PLUS 11 /* node Match this (simple) thing 1 or more times. */ 89*41349Sbostic #define WORDA 12 /* no Match "" at wordchar, where prev is nonword */ 90*41349Sbostic #define WORDZ 13 /* no Match "" at nonwordchar, where prev is word */ 91*41349Sbostic #define OPEN 20 /* no Mark this point in input as start of #n. */ 92*41349Sbostic /* OPEN+1 is number 1, etc. */ 93*41349Sbostic #define CLOSE 30 /* no Analogous to OPEN. */ 94*41349Sbostic 95*41349Sbostic /* 96*41349Sbostic * Opcode notes: 97*41349Sbostic * 98*41349Sbostic * BRANCH The set of branches constituting a single choice are hooked 99*41349Sbostic * together with their "next" pointers, since precedence prevents 100*41349Sbostic * anything being concatenated to any individual branch. The 101*41349Sbostic * "next" pointer of the last BRANCH in a choice points to the 102*41349Sbostic * thing following the whole choice. This is also where the 103*41349Sbostic * final "next" pointer of each individual branch points; each 104*41349Sbostic * branch starts with the operand node of a BRANCH node. 105*41349Sbostic * 106*41349Sbostic * BACK Normal "next" pointers all implicitly point forward; BACK 107*41349Sbostic * exists to make loop structures possible. 108*41349Sbostic * 109*41349Sbostic * STAR,PLUS '?', and complex '*' and '+', are implemented as circular 110*41349Sbostic * BRANCH structures using BACK. Simple cases (one character 111*41349Sbostic * per match) are implemented with STAR and PLUS for speed 112*41349Sbostic * and to minimize recursive plunges. 113*41349Sbostic * 114*41349Sbostic * OPEN,CLOSE ...are numbered at compile time. 115*41349Sbostic */ 116*41349Sbostic 117*41349Sbostic /* 118*41349Sbostic * A node is one char of opcode followed by two chars of "next" pointer. 119*41349Sbostic * "Next" pointers are stored as two 8-bit pieces, high order first. The 120*41349Sbostic * value is a positive offset from the opcode of the node containing it. 121*41349Sbostic * An operand, if any, simply follows the node. (Note that much of the 122*41349Sbostic * code generation knows about this implicit relationship.) 123*41349Sbostic * 124*41349Sbostic * Using two bytes for the "next" pointer is vast overkill for most things, 125*41349Sbostic * but allows patterns to get big without disasters. 126*41349Sbostic */ 127*41349Sbostic #define OP(p) (*(p)) 128*41349Sbostic #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377)) 129*41349Sbostic #define OPERAND(p) ((p) + 3) 130*41349Sbostic 131*41349Sbostic /* 132*41349Sbostic * See regmagic.h for one further detail of program structure. 133*41349Sbostic */ 134*41349Sbostic 135*41349Sbostic 136*41349Sbostic /* 137*41349Sbostic * Utility definitions. 138*41349Sbostic */ 139*41349Sbostic #ifndef CHARBITS 140*41349Sbostic #define UCHARAT(p) ((int)*(unsigned char *)(p)) 141*41349Sbostic #else 142*41349Sbostic #define UCHARAT(p) ((int)*(p)&CHARBITS) 143*41349Sbostic #endif 144*41349Sbostic 145*41349Sbostic #define FAIL(m) { regerror(m); return(NULL); } 146*41349Sbostic #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?') 147*41349Sbostic 148*41349Sbostic /* 149*41349Sbostic * Flags to be passed up and down. 150*41349Sbostic */ 151*41349Sbostic #define HASWIDTH 01 /* Known never to match null string. */ 152*41349Sbostic #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */ 153*41349Sbostic #define SPSTART 04 /* Starts with * or +. */ 154*41349Sbostic #define WORST 0 /* Worst case. */ 155*41349Sbostic 156*41349Sbostic /* 157*41349Sbostic * Global work variables for regcomp(). 158*41349Sbostic */ 159*41349Sbostic static char *regparse; /* Input-scan pointer. */ 160*41349Sbostic static int regnpar; /* () count. */ 161*41349Sbostic static char regdummy; 162*41349Sbostic static char *regcode; /* Code-emit pointer; ®dummy = don't. */ 163*41349Sbostic static long regsize; /* Code size. */ 164*41349Sbostic 165*41349Sbostic /* 166*41349Sbostic * Forward declarations for regcomp()'s friends. 167*41349Sbostic */ 168*41349Sbostic #ifndef STATIC 169*41349Sbostic #define STATIC static 170*41349Sbostic #endif 171*41349Sbostic STATIC char *reg(); 172*41349Sbostic STATIC char *regbranch(); 173*41349Sbostic STATIC char *regpiece(); 174*41349Sbostic STATIC char *regatom(); 175*41349Sbostic STATIC char *regnode(); 176*41349Sbostic STATIC char *regnext(); 177*41349Sbostic STATIC void regc(); 178*41349Sbostic STATIC void reginsert(); 179*41349Sbostic STATIC void regtail(); 180*41349Sbostic STATIC void regoptail(); 181*41349Sbostic #ifdef STRCSPN 182*41349Sbostic STATIC int strcspn(); 183*41349Sbostic #endif 184*41349Sbostic 185*41349Sbostic /* 186*41349Sbostic - regcomp - compile a regular expression into internal code 187*41349Sbostic * 188*41349Sbostic * We can't allocate space until we know how big the compiled form will be, 189*41349Sbostic * but we can't compile it (and thus know how big it is) until we've got a 190*41349Sbostic * place to put the code. So we cheat: we compile it twice, once with code 191*41349Sbostic * generation turned off and size counting turned on, and once "for real". 192*41349Sbostic * This also means that we don't allocate space until we are sure that the 193*41349Sbostic * thing really will compile successfully, and we never have to move the 194*41349Sbostic * code and thus invalidate pointers into it. (Note that it has to be in 195*41349Sbostic * one piece because free() must be able to free it all.) 196*41349Sbostic * 197*41349Sbostic * Beware that the optimization-preparation code in here knows about some 198*41349Sbostic * of the structure of the compiled regexp. 199*41349Sbostic */ 200*41349Sbostic regexp * 201*41349Sbostic regcomp(exp) 202*41349Sbostic char *exp; 203*41349Sbostic { 204*41349Sbostic register regexp *r; 205*41349Sbostic register char *scan; 206*41349Sbostic register char *longest; 207*41349Sbostic register int len; 208*41349Sbostic int flags; 209*41349Sbostic extern char *malloc(); 210*41349Sbostic 211*41349Sbostic if (exp == NULL) 212*41349Sbostic FAIL("NULL argument"); 213*41349Sbostic 214*41349Sbostic /* First pass: determine size, legality. */ 215*41349Sbostic if (exp[0] == '.' && exp[1] == '*') exp += 2; /* aid grep */ 216*41349Sbostic regparse = exp; 217*41349Sbostic regnpar = 1; 218*41349Sbostic regsize = 0L; 219*41349Sbostic regcode = ®dummy; 220*41349Sbostic regc(MAGIC); 221*41349Sbostic if (reg(0, &flags) == NULL) 222*41349Sbostic return(NULL); 223*41349Sbostic 224*41349Sbostic /* Small enough for pointer-storage convention? */ 225*41349Sbostic if (regsize >= 32767L) /* Probably could be 65535L. */ 226*41349Sbostic FAIL("regexp too big"); 227*41349Sbostic 228*41349Sbostic /* Allocate space. */ 229*41349Sbostic r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize); 230*41349Sbostic if (r == NULL) 231*41349Sbostic FAIL("out of space"); 232*41349Sbostic 233*41349Sbostic /* Second pass: emit code. */ 234*41349Sbostic regparse = exp; 235*41349Sbostic regnpar = 1; 236*41349Sbostic regcode = r->program; 237*41349Sbostic regc(MAGIC); 238*41349Sbostic if (reg(0, &flags) == NULL) 239*41349Sbostic return(NULL); 240*41349Sbostic 241*41349Sbostic /* Dig out information for optimizations. */ 242*41349Sbostic r->regstart = '\0'; /* Worst-case defaults. */ 243*41349Sbostic r->reganch = 0; 244*41349Sbostic r->regmust = NULL; 245*41349Sbostic r->regmlen = 0; 246*41349Sbostic scan = r->program+1; /* First BRANCH. */ 247*41349Sbostic if (OP(regnext(scan)) == END) { /* Only one top-level choice. */ 248*41349Sbostic scan = OPERAND(scan); 249*41349Sbostic 250*41349Sbostic /* Starting-point info. */ 251*41349Sbostic if (OP(scan) == EXACTLY) 252*41349Sbostic r->regstart = *OPERAND(scan); 253*41349Sbostic else if (OP(scan) == BOL) 254*41349Sbostic r->reganch++; 255*41349Sbostic 256*41349Sbostic /* 257*41349Sbostic * If there's something expensive in the r.e., find the 258*41349Sbostic * longest literal string that must appear and make it the 259*41349Sbostic * regmust. Resolve ties in favor of later strings, since 260*41349Sbostic * the regstart check works with the beginning of the r.e. 261*41349Sbostic * and avoiding duplication strengthens checking. Not a 262*41349Sbostic * strong reason, but sufficient in the absence of others. 263*41349Sbostic */ 264*41349Sbostic if (flags&SPSTART) { 265*41349Sbostic longest = NULL; 266*41349Sbostic len = 0; 267*41349Sbostic for (; scan != NULL; scan = regnext(scan)) 268*41349Sbostic if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) { 269*41349Sbostic longest = OPERAND(scan); 270*41349Sbostic len = strlen(OPERAND(scan)); 271*41349Sbostic } 272*41349Sbostic r->regmust = longest; 273*41349Sbostic r->regmlen = len; 274*41349Sbostic } 275*41349Sbostic } 276*41349Sbostic 277*41349Sbostic return(r); 278*41349Sbostic } 279*41349Sbostic 280*41349Sbostic /* 281*41349Sbostic - reg - regular expression, i.e. main body or parenthesized thing 282*41349Sbostic * 283*41349Sbostic * Caller must absorb opening parenthesis. 284*41349Sbostic * 285*41349Sbostic * Combining parenthesis handling with the base level of regular expression 286*41349Sbostic * is a trifle forced, but the need to tie the tails of the branches to what 287*41349Sbostic * follows makes it hard to avoid. 288*41349Sbostic */ 289*41349Sbostic static char * 290*41349Sbostic reg(paren, flagp) 291*41349Sbostic int paren; /* Parenthesized? */ 292*41349Sbostic int *flagp; 293*41349Sbostic { 294*41349Sbostic register char *ret; 295*41349Sbostic register char *br; 296*41349Sbostic register char *ender; 297*41349Sbostic register int parno; 298*41349Sbostic int flags; 299*41349Sbostic 300*41349Sbostic *flagp = HASWIDTH; /* Tentatively. */ 301*41349Sbostic 302*41349Sbostic /* Make an OPEN node, if parenthesized. */ 303*41349Sbostic if (paren) { 304*41349Sbostic if (regnpar >= NSUBEXP) 305*41349Sbostic FAIL("too many ()"); 306*41349Sbostic parno = regnpar; 307*41349Sbostic regnpar++; 308*41349Sbostic ret = regnode(OPEN+parno); 309*41349Sbostic } else 310*41349Sbostic ret = NULL; 311*41349Sbostic 312*41349Sbostic /* Pick up the branches, linking them together. */ 313*41349Sbostic br = regbranch(&flags); 314*41349Sbostic if (br == NULL) 315*41349Sbostic return(NULL); 316*41349Sbostic if (ret != NULL) 317*41349Sbostic regtail(ret, br); /* OPEN -> first. */ 318*41349Sbostic else 319*41349Sbostic ret = br; 320*41349Sbostic if (!(flags&HASWIDTH)) 321*41349Sbostic *flagp &= ~HASWIDTH; 322*41349Sbostic *flagp |= flags&SPSTART; 323*41349Sbostic while (*regparse == '|' || *regparse == '\n') { 324*41349Sbostic regparse++; 325*41349Sbostic br = regbranch(&flags); 326*41349Sbostic if (br == NULL) 327*41349Sbostic return(NULL); 328*41349Sbostic regtail(ret, br); /* BRANCH -> BRANCH. */ 329*41349Sbostic if (!(flags&HASWIDTH)) 330*41349Sbostic *flagp &= ~HASWIDTH; 331*41349Sbostic *flagp |= flags&SPSTART; 332*41349Sbostic } 333*41349Sbostic 334*41349Sbostic /* Make a closing node, and hook it on the end. */ 335*41349Sbostic ender = regnode((paren) ? CLOSE+parno : END); 336*41349Sbostic regtail(ret, ender); 337*41349Sbostic 338*41349Sbostic /* Hook the tails of the branches to the closing node. */ 339*41349Sbostic for (br = ret; br != NULL; br = regnext(br)) 340*41349Sbostic regoptail(br, ender); 341*41349Sbostic 342*41349Sbostic /* Check for proper termination. */ 343*41349Sbostic if (paren && *regparse++ != ')') { 344*41349Sbostic FAIL("unmatched ()"); 345*41349Sbostic } else if (!paren && *regparse != '\0') { 346*41349Sbostic if (*regparse == ')') { 347*41349Sbostic FAIL("unmatched ()"); 348*41349Sbostic } else 349*41349Sbostic FAIL("junk on end"); /* "Can't happen". */ 350*41349Sbostic /* NOTREACHED */ 351*41349Sbostic } 352*41349Sbostic 353*41349Sbostic return(ret); 354*41349Sbostic } 355*41349Sbostic 356*41349Sbostic /* 357*41349Sbostic - regbranch - one alternative of an | operator 358*41349Sbostic * 359*41349Sbostic * Implements the concatenation operator. 360*41349Sbostic */ 361*41349Sbostic static char * 362*41349Sbostic regbranch(flagp) 363*41349Sbostic int *flagp; 364*41349Sbostic { 365*41349Sbostic register char *ret; 366*41349Sbostic register char *chain; 367*41349Sbostic register char *latest; 368*41349Sbostic int flags; 369*41349Sbostic 370*41349Sbostic *flagp = WORST; /* Tentatively. */ 371*41349Sbostic 372*41349Sbostic ret = regnode(BRANCH); 373*41349Sbostic chain = NULL; 374*41349Sbostic while (*regparse != '\0' && *regparse != ')' && 375*41349Sbostic *regparse != '\n' && *regparse != '|') { 376*41349Sbostic latest = regpiece(&flags); 377*41349Sbostic if (latest == NULL) 378*41349Sbostic return(NULL); 379*41349Sbostic *flagp |= flags&HASWIDTH; 380*41349Sbostic if (chain == NULL) /* First piece. */ 381*41349Sbostic *flagp |= flags&SPSTART; 382*41349Sbostic else 383*41349Sbostic regtail(chain, latest); 384*41349Sbostic chain = latest; 385*41349Sbostic } 386*41349Sbostic if (chain == NULL) /* Loop ran zero times. */ 387*41349Sbostic (void) regnode(NOTHING); 388*41349Sbostic 389*41349Sbostic return(ret); 390*41349Sbostic } 391*41349Sbostic 392*41349Sbostic /* 393*41349Sbostic - regpiece - something followed by possible [*+?] 394*41349Sbostic * 395*41349Sbostic * Note that the branching code sequences used for ? and the general cases 396*41349Sbostic * of * and + are somewhat optimized: they use the same NOTHING node as 397*41349Sbostic * both the endmarker for their branch list and the body of the last branch. 398*41349Sbostic * It might seem that this node could be dispensed with entirely, but the 399*41349Sbostic * endmarker role is not redundant. 400*41349Sbostic */ 401*41349Sbostic static char * 402*41349Sbostic regpiece(flagp) 403*41349Sbostic int *flagp; 404*41349Sbostic { 405*41349Sbostic register char *ret; 406*41349Sbostic register char op; 407*41349Sbostic register char *next; 408*41349Sbostic int flags; 409*41349Sbostic 410*41349Sbostic ret = regatom(&flags); 411*41349Sbostic if (ret == NULL) 412*41349Sbostic return(NULL); 413*41349Sbostic 414*41349Sbostic op = *regparse; 415*41349Sbostic if (!ISMULT(op)) { 416*41349Sbostic *flagp = flags; 417*41349Sbostic return(ret); 418*41349Sbostic } 419*41349Sbostic 420*41349Sbostic if (!(flags&HASWIDTH) && op != '?') 421*41349Sbostic FAIL("*+ operand could be empty"); 422*41349Sbostic *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH); 423*41349Sbostic 424*41349Sbostic if (op == '*' && (flags&SIMPLE)) 425*41349Sbostic reginsert(STAR, ret); 426*41349Sbostic else if (op == '*') { 427*41349Sbostic /* Emit x* as (x&|), where & means "self". */ 428*41349Sbostic reginsert(BRANCH, ret); /* Either x */ 429*41349Sbostic regoptail(ret, regnode(BACK)); /* and loop */ 430*41349Sbostic regoptail(ret, ret); /* back */ 431*41349Sbostic regtail(ret, regnode(BRANCH)); /* or */ 432*41349Sbostic regtail(ret, regnode(NOTHING)); /* null. */ 433*41349Sbostic } else if (op == '+' && (flags&SIMPLE)) 434*41349Sbostic reginsert(PLUS, ret); 435*41349Sbostic else if (op == '+') { 436*41349Sbostic /* Emit x+ as x(&|), where & means "self". */ 437*41349Sbostic next = regnode(BRANCH); /* Either */ 438*41349Sbostic regtail(ret, next); 439*41349Sbostic regtail(regnode(BACK), ret); /* loop back */ 440*41349Sbostic regtail(next, regnode(BRANCH)); /* or */ 441*41349Sbostic regtail(ret, regnode(NOTHING)); /* null. */ 442*41349Sbostic } else if (op == '?') { 443*41349Sbostic /* Emit x? as (x|) */ 444*41349Sbostic reginsert(BRANCH, ret); /* Either x */ 445*41349Sbostic regtail(ret, regnode(BRANCH)); /* or */ 446*41349Sbostic next = regnode(NOTHING); /* null. */ 447*41349Sbostic regtail(ret, next); 448*41349Sbostic regoptail(ret, next); 449*41349Sbostic } 450*41349Sbostic regparse++; 451*41349Sbostic if (ISMULT(*regparse)) 452*41349Sbostic FAIL("nested *?+"); 453*41349Sbostic 454*41349Sbostic return(ret); 455*41349Sbostic } 456*41349Sbostic 457*41349Sbostic /* 458*41349Sbostic - regatom - the lowest level 459*41349Sbostic * 460*41349Sbostic * Optimization: gobbles an entire sequence of ordinary characters so that 461*41349Sbostic * it can turn them into a single node, which is smaller to store and 462*41349Sbostic * faster to run. Backslashed characters are exceptions, each becoming a 463*41349Sbostic * separate node; the code is simpler that way and it's not worth fixing. 464*41349Sbostic */ 465*41349Sbostic static char * 466*41349Sbostic regatom(flagp) 467*41349Sbostic int *flagp; 468*41349Sbostic { 469*41349Sbostic register char *ret; 470*41349Sbostic int flags; 471*41349Sbostic 472*41349Sbostic *flagp = WORST; /* Tentatively. */ 473*41349Sbostic 474*41349Sbostic switch (*regparse++) { 475*41349Sbostic /* FIXME: these chars only have meaning at beg/end of pat? */ 476*41349Sbostic case '^': 477*41349Sbostic ret = regnode(BOL); 478*41349Sbostic break; 479*41349Sbostic case '$': 480*41349Sbostic ret = regnode(EOL); 481*41349Sbostic break; 482*41349Sbostic case '.': 483*41349Sbostic ret = regnode(ANY); 484*41349Sbostic *flagp |= HASWIDTH|SIMPLE; 485*41349Sbostic break; 486*41349Sbostic case '[': { 487*41349Sbostic register int class; 488*41349Sbostic register int classend; 489*41349Sbostic 490*41349Sbostic if (*regparse == '^') { /* Complement of range. */ 491*41349Sbostic ret = regnode(ANYBUT); 492*41349Sbostic regparse++; 493*41349Sbostic } else 494*41349Sbostic ret = regnode(ANYOF); 495*41349Sbostic if (*regparse == ']' || *regparse == '-') 496*41349Sbostic regc(*regparse++); 497*41349Sbostic while (*regparse != '\0' && *regparse != ']') { 498*41349Sbostic if (*regparse == '-') { 499*41349Sbostic regparse++; 500*41349Sbostic if (*regparse == ']' || *regparse == '\0') 501*41349Sbostic regc('-'); 502*41349Sbostic else { 503*41349Sbostic class = UCHARAT(regparse-2)+1; 504*41349Sbostic classend = UCHARAT(regparse); 505*41349Sbostic if (class > classend+1) 506*41349Sbostic FAIL("invalid [] range"); 507*41349Sbostic for (; class <= classend; class++) 508*41349Sbostic regc(class); 509*41349Sbostic regparse++; 510*41349Sbostic } 511*41349Sbostic } else 512*41349Sbostic regc(*regparse++); 513*41349Sbostic } 514*41349Sbostic regc('\0'); 515*41349Sbostic if (*regparse != ']') 516*41349Sbostic FAIL("unmatched []"); 517*41349Sbostic regparse++; 518*41349Sbostic *flagp |= HASWIDTH|SIMPLE; 519*41349Sbostic } 520*41349Sbostic break; 521*41349Sbostic case '(': 522*41349Sbostic ret = reg(1, &flags); 523*41349Sbostic if (ret == NULL) 524*41349Sbostic return(NULL); 525*41349Sbostic *flagp |= flags&(HASWIDTH|SPSTART); 526*41349Sbostic break; 527*41349Sbostic case '\0': 528*41349Sbostic case '|': 529*41349Sbostic case '\n': 530*41349Sbostic case ')': 531*41349Sbostic FAIL("internal urp"); /* Supposed to be caught earlier. */ 532*41349Sbostic break; 533*41349Sbostic case '?': 534*41349Sbostic case '+': 535*41349Sbostic case '*': 536*41349Sbostic FAIL("?+* follows nothing"); 537*41349Sbostic break; 538*41349Sbostic case '\\': 539*41349Sbostic switch (*regparse++) { 540*41349Sbostic case '\0': 541*41349Sbostic FAIL("trailing \\"); 542*41349Sbostic break; 543*41349Sbostic case '<': 544*41349Sbostic ret = regnode(WORDA); 545*41349Sbostic break; 546*41349Sbostic case '>': 547*41349Sbostic ret = regnode(WORDZ); 548*41349Sbostic break; 549*41349Sbostic /* FIXME: Someday handle \1, \2, ... */ 550*41349Sbostic default: 551*41349Sbostic /* Handle general quoted chars in exact-match routine */ 552*41349Sbostic goto de_fault; 553*41349Sbostic } 554*41349Sbostic break; 555*41349Sbostic de_fault: 556*41349Sbostic default: 557*41349Sbostic /* 558*41349Sbostic * Encode a string of characters to be matched exactly. 559*41349Sbostic * 560*41349Sbostic * This is a bit tricky due to quoted chars and due to 561*41349Sbostic * '*', '+', and '?' taking the SINGLE char previous 562*41349Sbostic * as their operand. 563*41349Sbostic * 564*41349Sbostic * On entry, the char at regparse[-1] is going to go 565*41349Sbostic * into the string, no matter what it is. (It could be 566*41349Sbostic * following a \ if we are entered from the '\' case.) 567*41349Sbostic * 568*41349Sbostic * Basic idea is to pick up a good char in ch and 569*41349Sbostic * examine the next char. If it's *+? then we twiddle. 570*41349Sbostic * If it's \ then we frozzle. If it's other magic char 571*41349Sbostic * we push ch and terminate the string. If none of the 572*41349Sbostic * above, we push ch on the string and go around again. 573*41349Sbostic * 574*41349Sbostic * regprev is used to remember where "the current char" 575*41349Sbostic * starts in the string, if due to a *+? we need to back 576*41349Sbostic * up and put the current char in a separate, 1-char, string. 577*41349Sbostic * When regprev is NULL, ch is the only char in the 578*41349Sbostic * string; this is used in *+? handling, and in setting 579*41349Sbostic * flags |= SIMPLE at the end. 580*41349Sbostic */ 581*41349Sbostic { 582*41349Sbostic char *regprev; 583*41349Sbostic register char ch; 584*41349Sbostic 585*41349Sbostic regparse--; /* Look at cur char */ 586*41349Sbostic ret = regnode(EXACTLY); 587*41349Sbostic for ( regprev = 0 ; ; ) { 588*41349Sbostic ch = *regparse++; /* Get current char */ 589*41349Sbostic switch (*regparse) { /* look at next one */ 590*41349Sbostic 591*41349Sbostic default: 592*41349Sbostic regc(ch); /* Add cur to string */ 593*41349Sbostic break; 594*41349Sbostic 595*41349Sbostic case '.': case '[': case '(': 596*41349Sbostic case ')': case '|': case '\n': 597*41349Sbostic case '$': case '^': 598*41349Sbostic case '\0': 599*41349Sbostic /* FIXME, $ and ^ should not always be magic */ 600*41349Sbostic magic: 601*41349Sbostic regc(ch); /* dump cur char */ 602*41349Sbostic goto done; /* and we are done */ 603*41349Sbostic 604*41349Sbostic case '?': case '+': case '*': 605*41349Sbostic if (!regprev) /* If just ch in str, */ 606*41349Sbostic goto magic; /* use it */ 607*41349Sbostic /* End mult-char string one early */ 608*41349Sbostic regparse = regprev; /* Back up parse */ 609*41349Sbostic goto done; 610*41349Sbostic 611*41349Sbostic case '\\': 612*41349Sbostic regc(ch); /* Cur char OK */ 613*41349Sbostic switch (regparse[1]){ /* Look after \ */ 614*41349Sbostic case '\0': 615*41349Sbostic case '<': 616*41349Sbostic case '>': 617*41349Sbostic /* FIXME: Someday handle \1, \2, ... */ 618*41349Sbostic goto done; /* Not quoted */ 619*41349Sbostic default: 620*41349Sbostic /* Backup point is \, scan * point is after it. */ 621*41349Sbostic regprev = regparse; 622*41349Sbostic regparse++; 623*41349Sbostic continue; /* NOT break; */ 624*41349Sbostic } 625*41349Sbostic } 626*41349Sbostic regprev = regparse; /* Set backup point */ 627*41349Sbostic } 628*41349Sbostic done: 629*41349Sbostic regc('\0'); 630*41349Sbostic *flagp |= HASWIDTH; 631*41349Sbostic if (!regprev) /* One char? */ 632*41349Sbostic *flagp |= SIMPLE; 633*41349Sbostic } 634*41349Sbostic break; 635*41349Sbostic } 636*41349Sbostic 637*41349Sbostic return(ret); 638*41349Sbostic } 639*41349Sbostic 640*41349Sbostic /* 641*41349Sbostic - regnode - emit a node 642*41349Sbostic */ 643*41349Sbostic static char * /* Location. */ 644*41349Sbostic regnode(op) 645*41349Sbostic char op; 646*41349Sbostic { 647*41349Sbostic register char *ret; 648*41349Sbostic register char *ptr; 649*41349Sbostic 650*41349Sbostic ret = regcode; 651*41349Sbostic if (ret == ®dummy) { 652*41349Sbostic regsize += 3; 653*41349Sbostic return(ret); 654*41349Sbostic } 655*41349Sbostic 656*41349Sbostic ptr = ret; 657*41349Sbostic *ptr++ = op; 658*41349Sbostic *ptr++ = '\0'; /* Null "next" pointer. */ 659*41349Sbostic *ptr++ = '\0'; 660*41349Sbostic regcode = ptr; 661*41349Sbostic 662*41349Sbostic return(ret); 663*41349Sbostic } 664*41349Sbostic 665*41349Sbostic /* 666*41349Sbostic - regc - emit (if appropriate) a byte of code 667*41349Sbostic */ 668*41349Sbostic static void 669*41349Sbostic regc(b) 670*41349Sbostic char b; 671*41349Sbostic { 672*41349Sbostic if (regcode != ®dummy) 673*41349Sbostic *regcode++ = b; 674*41349Sbostic else 675*41349Sbostic regsize++; 676*41349Sbostic } 677*41349Sbostic 678*41349Sbostic /* 679*41349Sbostic - reginsert - insert an operator in front of already-emitted operand 680*41349Sbostic * 681*41349Sbostic * Means relocating the operand. 682*41349Sbostic */ 683*41349Sbostic static void 684*41349Sbostic reginsert(op, opnd) 685*41349Sbostic char op; 686*41349Sbostic char *opnd; 687*41349Sbostic { 688*41349Sbostic register char *src; 689*41349Sbostic register char *dst; 690*41349Sbostic register char *place; 691*41349Sbostic 692*41349Sbostic if (regcode == ®dummy) { 693*41349Sbostic regsize += 3; 694*41349Sbostic return; 695*41349Sbostic } 696*41349Sbostic 697*41349Sbostic src = regcode; 698*41349Sbostic regcode += 3; 699*41349Sbostic dst = regcode; 700*41349Sbostic while (src > opnd) 701*41349Sbostic *--dst = *--src; 702*41349Sbostic 703*41349Sbostic place = opnd; /* Op node, where operand used to be. */ 704*41349Sbostic *place++ = op; 705*41349Sbostic *place++ = '\0'; 706*41349Sbostic *place++ = '\0'; 707*41349Sbostic } 708*41349Sbostic 709*41349Sbostic /* 710*41349Sbostic - regtail - set the next-pointer at the end of a node chain 711*41349Sbostic */ 712*41349Sbostic static void 713*41349Sbostic regtail(p, val) 714*41349Sbostic char *p; 715*41349Sbostic char *val; 716*41349Sbostic { 717*41349Sbostic register char *scan; 718*41349Sbostic register char *temp; 719*41349Sbostic register int offset; 720*41349Sbostic 721*41349Sbostic if (p == ®dummy) 722*41349Sbostic return; 723*41349Sbostic 724*41349Sbostic /* Find last node. */ 725*41349Sbostic scan = p; 726*41349Sbostic for (;;) { 727*41349Sbostic temp = regnext(scan); 728*41349Sbostic if (temp == NULL) 729*41349Sbostic break; 730*41349Sbostic scan = temp; 731*41349Sbostic } 732*41349Sbostic 733*41349Sbostic if (OP(scan) == BACK) 734*41349Sbostic offset = scan - val; 735*41349Sbostic else 736*41349Sbostic offset = val - scan; 737*41349Sbostic *(scan+1) = (offset>>8)&0377; 738*41349Sbostic *(scan+2) = offset&0377; 739*41349Sbostic } 740*41349Sbostic 741*41349Sbostic /* 742*41349Sbostic - regoptail - regtail on operand of first argument; nop if operandless 743*41349Sbostic */ 744*41349Sbostic static void 745*41349Sbostic regoptail(p, val) 746*41349Sbostic char *p; 747*41349Sbostic char *val; 748*41349Sbostic { 749*41349Sbostic /* "Operandless" and "op != BRANCH" are synonymous in practice. */ 750*41349Sbostic if (p == NULL || p == ®dummy || OP(p) != BRANCH) 751*41349Sbostic return; 752*41349Sbostic regtail(OPERAND(p), val); 753*41349Sbostic } 754*41349Sbostic 755*41349Sbostic /* 756*41349Sbostic * regexec and friends 757*41349Sbostic */ 758*41349Sbostic 759*41349Sbostic /* 760*41349Sbostic * Global work variables for regexec(). 761*41349Sbostic */ 762*41349Sbostic static char *reginput; /* String-input pointer. */ 763*41349Sbostic static char *regbol; /* Beginning of input, for ^ check. */ 764*41349Sbostic static char **regstartp; /* Pointer to startp array. */ 765*41349Sbostic static char **regendp; /* Ditto for endp. */ 766*41349Sbostic 767*41349Sbostic /* 768*41349Sbostic * Forwards. 769*41349Sbostic */ 770*41349Sbostic STATIC int regtry(); 771*41349Sbostic STATIC int regmatch(); 772*41349Sbostic STATIC int regrepeat(); 773*41349Sbostic 774*41349Sbostic #ifdef DEBUG 775*41349Sbostic int regnarrate = 0; 776*41349Sbostic void regdump(); 777*41349Sbostic STATIC char *regprop(); 778*41349Sbostic #endif 779*41349Sbostic 780*41349Sbostic /* 781*41349Sbostic - regexec - match a regexp against a string 782*41349Sbostic */ 783*41349Sbostic int 784*41349Sbostic regexec(prog, string) 785*41349Sbostic register regexp *prog; 786*41349Sbostic register char *string; 787*41349Sbostic { 788*41349Sbostic register char *s; 789*41349Sbostic extern char *strchr(); 790*41349Sbostic 791*41349Sbostic /* Be paranoid... */ 792*41349Sbostic if (prog == NULL || string == NULL) { 793*41349Sbostic regerror("NULL parameter"); 794*41349Sbostic return(0); 795*41349Sbostic } 796*41349Sbostic 797*41349Sbostic /* Check validity of program. */ 798*41349Sbostic if (UCHARAT(prog->program) != MAGIC) { 799*41349Sbostic regerror("corrupted program"); 800*41349Sbostic return(0); 801*41349Sbostic } 802*41349Sbostic 803*41349Sbostic /* If there is a "must appear" string, look for it. */ 804*41349Sbostic if (prog->regmust != NULL) { 805*41349Sbostic s = string; 806*41349Sbostic while ((s = strchr(s, prog->regmust[0])) != NULL) { 807*41349Sbostic if (strncmp(s, prog->regmust, prog->regmlen) == 0) 808*41349Sbostic break; /* Found it. */ 809*41349Sbostic s++; 810*41349Sbostic } 811*41349Sbostic if (s == NULL) /* Not present. */ 812*41349Sbostic return(0); 813*41349Sbostic } 814*41349Sbostic 815*41349Sbostic /* Mark beginning of line for ^ . */ 816*41349Sbostic regbol = string; 817*41349Sbostic 818*41349Sbostic /* Simplest case: anchored match need be tried only once. */ 819*41349Sbostic if (prog->reganch) 820*41349Sbostic return(regtry(prog, string)); 821*41349Sbostic 822*41349Sbostic /* Messy cases: unanchored match. */ 823*41349Sbostic s = string; 824*41349Sbostic if (prog->regstart != '\0') 825*41349Sbostic /* We know what char it must start with. */ 826*41349Sbostic while ((s = strchr(s, prog->regstart)) != NULL) { 827*41349Sbostic if (regtry(prog, s)) 828*41349Sbostic return(1); 829*41349Sbostic s++; 830*41349Sbostic } 831*41349Sbostic else 832*41349Sbostic /* We don't -- general case. */ 833*41349Sbostic do { 834*41349Sbostic if (regtry(prog, s)) 835*41349Sbostic return(1); 836*41349Sbostic } while (*s++ != '\0'); 837*41349Sbostic 838*41349Sbostic /* Failure. */ 839*41349Sbostic return(0); 840*41349Sbostic } 841*41349Sbostic 842*41349Sbostic /* 843*41349Sbostic - regtry - try match at specific point 844*41349Sbostic */ 845*41349Sbostic static int /* 0 failure, 1 success */ 846*41349Sbostic regtry(prog, string) 847*41349Sbostic regexp *prog; 848*41349Sbostic char *string; 849*41349Sbostic { 850*41349Sbostic register int i; 851*41349Sbostic register char **sp; 852*41349Sbostic register char **ep; 853*41349Sbostic 854*41349Sbostic reginput = string; 855*41349Sbostic regstartp = prog->startp; 856*41349Sbostic regendp = prog->endp; 857*41349Sbostic 858*41349Sbostic sp = prog->startp; 859*41349Sbostic ep = prog->endp; 860*41349Sbostic for (i = NSUBEXP; i > 0; i--) { 861*41349Sbostic *sp++ = NULL; 862*41349Sbostic *ep++ = NULL; 863*41349Sbostic } 864*41349Sbostic if (regmatch(prog->program + 1)) { 865*41349Sbostic prog->startp[0] = string; 866*41349Sbostic prog->endp[0] = reginput; 867*41349Sbostic return(1); 868*41349Sbostic } else 869*41349Sbostic return(0); 870*41349Sbostic } 871*41349Sbostic 872*41349Sbostic /* 873*41349Sbostic - regmatch - main matching routine 874*41349Sbostic * 875*41349Sbostic * Conceptually the strategy is simple: check to see whether the current 876*41349Sbostic * node matches, call self recursively to see whether the rest matches, 877*41349Sbostic * and then act accordingly. In practice we make some effort to avoid 878*41349Sbostic * recursion, in particular by going through "ordinary" nodes (that don't 879*41349Sbostic * need to know whether the rest of the match failed) by a loop instead of 880*41349Sbostic * by recursion. 881*41349Sbostic */ 882*41349Sbostic static int /* 0 failure, 1 success */ 883*41349Sbostic regmatch(prog) 884*41349Sbostic char *prog; 885*41349Sbostic { 886*41349Sbostic register char *scan; /* Current node. */ 887*41349Sbostic char *next; /* Next node. */ 888*41349Sbostic extern char *strchr(); 889*41349Sbostic 890*41349Sbostic scan = prog; 891*41349Sbostic #ifdef DEBUG 892*41349Sbostic if (scan != NULL && regnarrate) 893*41349Sbostic fprintf(stderr, "%s(\n", regprop(scan)); 894*41349Sbostic #endif 895*41349Sbostic while (scan != NULL) { 896*41349Sbostic #ifdef DEBUG 897*41349Sbostic if (regnarrate) 898*41349Sbostic fprintf(stderr, "%s...\n", regprop(scan)); 899*41349Sbostic #endif 900*41349Sbostic next = regnext(scan); 901*41349Sbostic 902*41349Sbostic switch (OP(scan)) { 903*41349Sbostic case BOL: 904*41349Sbostic if (reginput != regbol) 905*41349Sbostic return(0); 906*41349Sbostic break; 907*41349Sbostic case EOL: 908*41349Sbostic if (*reginput != '\0') 909*41349Sbostic return(0); 910*41349Sbostic break; 911*41349Sbostic case WORDA: 912*41349Sbostic /* Must be looking at a letter, digit, or _ */ 913*41349Sbostic if ((!isalnum(*reginput)) && *reginput != '_') 914*41349Sbostic return(0); 915*41349Sbostic /* Prev must be BOL or nonword */ 916*41349Sbostic if (reginput > regbol && 917*41349Sbostic (isalnum(reginput[-1]) || reginput[-1] == '_')) 918*41349Sbostic return(0); 919*41349Sbostic break; 920*41349Sbostic case WORDZ: 921*41349Sbostic /* Must be looking at non letter, digit, or _ */ 922*41349Sbostic if (isalnum(*reginput) || *reginput == '_') 923*41349Sbostic return(0); 924*41349Sbostic /* We don't care what the previous char was */ 925*41349Sbostic break; 926*41349Sbostic case ANY: 927*41349Sbostic if (*reginput == '\0') 928*41349Sbostic return(0); 929*41349Sbostic reginput++; 930*41349Sbostic break; 931*41349Sbostic case EXACTLY: { 932*41349Sbostic register int len; 933*41349Sbostic register char *opnd; 934*41349Sbostic 935*41349Sbostic opnd = OPERAND(scan); 936*41349Sbostic /* Inline the first character, for speed. */ 937*41349Sbostic if (*opnd != *reginput) 938*41349Sbostic return(0); 939*41349Sbostic len = strlen(opnd); 940*41349Sbostic if (len > 1 && strncmp(opnd, reginput, len) != 0) 941*41349Sbostic return(0); 942*41349Sbostic reginput += len; 943*41349Sbostic } 944*41349Sbostic break; 945*41349Sbostic case ANYOF: 946*41349Sbostic if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL) 947*41349Sbostic return(0); 948*41349Sbostic reginput++; 949*41349Sbostic break; 950*41349Sbostic case ANYBUT: 951*41349Sbostic if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL) 952*41349Sbostic return(0); 953*41349Sbostic reginput++; 954*41349Sbostic break; 955*41349Sbostic case NOTHING: 956*41349Sbostic break; 957*41349Sbostic case BACK: 958*41349Sbostic break; 959*41349Sbostic case OPEN+1: 960*41349Sbostic case OPEN+2: 961*41349Sbostic case OPEN+3: 962*41349Sbostic case OPEN+4: 963*41349Sbostic case OPEN+5: 964*41349Sbostic case OPEN+6: 965*41349Sbostic case OPEN+7: 966*41349Sbostic case OPEN+8: 967*41349Sbostic case OPEN+9: { 968*41349Sbostic register int no; 969*41349Sbostic register char *save; 970*41349Sbostic 971*41349Sbostic no = OP(scan) - OPEN; 972*41349Sbostic save = reginput; 973*41349Sbostic 974*41349Sbostic if (regmatch(next)) { 975*41349Sbostic /* 976*41349Sbostic * Don't set startp if some later 977*41349Sbostic * invocation of the same parentheses 978*41349Sbostic * already has. 979*41349Sbostic */ 980*41349Sbostic if (regstartp[no] == NULL) 981*41349Sbostic regstartp[no] = save; 982*41349Sbostic return(1); 983*41349Sbostic } else 984*41349Sbostic return(0); 985*41349Sbostic } 986*41349Sbostic break; 987*41349Sbostic case CLOSE+1: 988*41349Sbostic case CLOSE+2: 989*41349Sbostic case CLOSE+3: 990*41349Sbostic case CLOSE+4: 991*41349Sbostic case CLOSE+5: 992*41349Sbostic case CLOSE+6: 993*41349Sbostic case CLOSE+7: 994*41349Sbostic case CLOSE+8: 995*41349Sbostic case CLOSE+9: { 996*41349Sbostic register int no; 997*41349Sbostic register char *save; 998*41349Sbostic 999*41349Sbostic no = OP(scan) - CLOSE; 1000*41349Sbostic save = reginput; 1001*41349Sbostic 1002*41349Sbostic if (regmatch(next)) { 1003*41349Sbostic /* 1004*41349Sbostic * Don't set endp if some later 1005*41349Sbostic * invocation of the same parentheses 1006*41349Sbostic * already has. 1007*41349Sbostic */ 1008*41349Sbostic if (regendp[no] == NULL) 1009*41349Sbostic regendp[no] = save; 1010*41349Sbostic return(1); 1011*41349Sbostic } else 1012*41349Sbostic return(0); 1013*41349Sbostic } 1014*41349Sbostic break; 1015*41349Sbostic case BRANCH: { 1016*41349Sbostic register char *save; 1017*41349Sbostic 1018*41349Sbostic if (OP(next) != BRANCH) /* No choice. */ 1019*41349Sbostic next = OPERAND(scan); /* Avoid recursion. */ 1020*41349Sbostic else { 1021*41349Sbostic do { 1022*41349Sbostic save = reginput; 1023*41349Sbostic if (regmatch(OPERAND(scan))) 1024*41349Sbostic return(1); 1025*41349Sbostic reginput = save; 1026*41349Sbostic scan = regnext(scan); 1027*41349Sbostic } while (scan != NULL && OP(scan) == BRANCH); 1028*41349Sbostic return(0); 1029*41349Sbostic /* NOTREACHED */ 1030*41349Sbostic } 1031*41349Sbostic } 1032*41349Sbostic break; 1033*41349Sbostic case STAR: 1034*41349Sbostic case PLUS: { 1035*41349Sbostic register char nextch; 1036*41349Sbostic register int no; 1037*41349Sbostic register char *save; 1038*41349Sbostic register int min; 1039*41349Sbostic 1040*41349Sbostic /* 1041*41349Sbostic * Lookahead to avoid useless match attempts 1042*41349Sbostic * when we know what character comes next. 1043*41349Sbostic */ 1044*41349Sbostic nextch = '\0'; 1045*41349Sbostic if (OP(next) == EXACTLY) 1046*41349Sbostic nextch = *OPERAND(next); 1047*41349Sbostic min = (OP(scan) == STAR) ? 0 : 1; 1048*41349Sbostic save = reginput; 1049*41349Sbostic no = regrepeat(OPERAND(scan)); 1050*41349Sbostic while (no >= min) { 1051*41349Sbostic /* If it could work, try it. */ 1052*41349Sbostic if (nextch == '\0' || *reginput == nextch) 1053*41349Sbostic if (regmatch(next)) 1054*41349Sbostic return(1); 1055*41349Sbostic /* Couldn't or didn't -- back up. */ 1056*41349Sbostic no--; 1057*41349Sbostic reginput = save + no; 1058*41349Sbostic } 1059*41349Sbostic return(0); 1060*41349Sbostic } 1061*41349Sbostic break; 1062*41349Sbostic case END: 1063*41349Sbostic return(1); /* Success! */ 1064*41349Sbostic break; 1065*41349Sbostic default: 1066*41349Sbostic regerror("memory corruption"); 1067*41349Sbostic return(0); 1068*41349Sbostic break; 1069*41349Sbostic } 1070*41349Sbostic 1071*41349Sbostic scan = next; 1072*41349Sbostic } 1073*41349Sbostic 1074*41349Sbostic /* 1075*41349Sbostic * We get here only if there's trouble -- normally "case END" is 1076*41349Sbostic * the terminating point. 1077*41349Sbostic */ 1078*41349Sbostic regerror("corrupted pointers"); 1079*41349Sbostic return(0); 1080*41349Sbostic } 1081*41349Sbostic 1082*41349Sbostic /* 1083*41349Sbostic - regrepeat - repeatedly match something simple, report how many 1084*41349Sbostic */ 1085*41349Sbostic static int 1086*41349Sbostic regrepeat(p) 1087*41349Sbostic char *p; 1088*41349Sbostic { 1089*41349Sbostic register int count = 0; 1090*41349Sbostic register char *scan; 1091*41349Sbostic register char *opnd; 1092*41349Sbostic 1093*41349Sbostic scan = reginput; 1094*41349Sbostic opnd = OPERAND(p); 1095*41349Sbostic switch (OP(p)) { 1096*41349Sbostic case ANY: 1097*41349Sbostic count = strlen(scan); 1098*41349Sbostic scan += count; 1099*41349Sbostic break; 1100*41349Sbostic case EXACTLY: 1101*41349Sbostic while (*opnd == *scan) { 1102*41349Sbostic count++; 1103*41349Sbostic scan++; 1104*41349Sbostic } 1105*41349Sbostic break; 1106*41349Sbostic case ANYOF: 1107*41349Sbostic while (*scan != '\0' && strchr(opnd, *scan) != NULL) { 1108*41349Sbostic count++; 1109*41349Sbostic scan++; 1110*41349Sbostic } 1111*41349Sbostic break; 1112*41349Sbostic case ANYBUT: 1113*41349Sbostic while (*scan != '\0' && strchr(opnd, *scan) == NULL) { 1114*41349Sbostic count++; 1115*41349Sbostic scan++; 1116*41349Sbostic } 1117*41349Sbostic break; 1118*41349Sbostic default: /* Oh dear. Called inappropriately. */ 1119*41349Sbostic regerror("internal foulup"); 1120*41349Sbostic count = 0; /* Best compromise. */ 1121*41349Sbostic break; 1122*41349Sbostic } 1123*41349Sbostic reginput = scan; 1124*41349Sbostic 1125*41349Sbostic return(count); 1126*41349Sbostic } 1127*41349Sbostic 1128*41349Sbostic /* 1129*41349Sbostic - regnext - dig the "next" pointer out of a node 1130*41349Sbostic */ 1131*41349Sbostic static char * 1132*41349Sbostic regnext(p) 1133*41349Sbostic register char *p; 1134*41349Sbostic { 1135*41349Sbostic register int offset; 1136*41349Sbostic 1137*41349Sbostic if (p == ®dummy) 1138*41349Sbostic return(NULL); 1139*41349Sbostic 1140*41349Sbostic offset = NEXT(p); 1141*41349Sbostic if (offset == 0) 1142*41349Sbostic return(NULL); 1143*41349Sbostic 1144*41349Sbostic if (OP(p) == BACK) 1145*41349Sbostic return(p-offset); 1146*41349Sbostic else 1147*41349Sbostic return(p+offset); 1148*41349Sbostic } 1149*41349Sbostic 1150*41349Sbostic #ifdef DEBUG 1151*41349Sbostic 1152*41349Sbostic STATIC char *regprop(); 1153*41349Sbostic 1154*41349Sbostic /* 1155*41349Sbostic - regdump - dump a regexp onto stdout in vaguely comprehensible form 1156*41349Sbostic */ 1157*41349Sbostic void 1158*41349Sbostic regdump(r) 1159*41349Sbostic regexp *r; 1160*41349Sbostic { 1161*41349Sbostic register char *s; 1162*41349Sbostic register char op = EXACTLY; /* Arbitrary non-END op. */ 1163*41349Sbostic register char *next; 1164*41349Sbostic extern char *strchr(); 1165*41349Sbostic 1166*41349Sbostic 1167*41349Sbostic s = r->program + 1; 1168*41349Sbostic while (op != END) { /* While that wasn't END last time... */ 1169*41349Sbostic op = OP(s); 1170*41349Sbostic printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */ 1171*41349Sbostic next = regnext(s); 1172*41349Sbostic if (next == NULL) /* Next ptr. */ 1173*41349Sbostic printf("(0)"); 1174*41349Sbostic else 1175*41349Sbostic printf("(%d)", (s-r->program)+(next-s)); 1176*41349Sbostic s += 3; 1177*41349Sbostic if (op == ANYOF || op == ANYBUT || op == EXACTLY) { 1178*41349Sbostic /* Literal string, where present. */ 1179*41349Sbostic while (*s != '\0') { 1180*41349Sbostic putchar(*s); 1181*41349Sbostic s++; 1182*41349Sbostic } 1183*41349Sbostic s++; 1184*41349Sbostic } 1185*41349Sbostic putchar('\n'); 1186*41349Sbostic } 1187*41349Sbostic 1188*41349Sbostic /* Header fields of interest. */ 1189*41349Sbostic if (r->regstart != '\0') 1190*41349Sbostic printf("start `%c' ", r->regstart); 1191*41349Sbostic if (r->reganch) 1192*41349Sbostic printf("anchored "); 1193*41349Sbostic if (r->regmust != NULL) 1194*41349Sbostic printf("must have \"%s\"", r->regmust); 1195*41349Sbostic printf("\n"); 1196*41349Sbostic } 1197*41349Sbostic 1198*41349Sbostic /* 1199*41349Sbostic - regprop - printable representation of opcode 1200*41349Sbostic */ 1201*41349Sbostic static char * 1202*41349Sbostic regprop(op) 1203*41349Sbostic char *op; 1204*41349Sbostic { 1205*41349Sbostic register char *p; 1206*41349Sbostic static char buf[50]; 1207*41349Sbostic 1208*41349Sbostic (void) strcpy(buf, ":"); 1209*41349Sbostic 1210*41349Sbostic switch (OP(op)) { 1211*41349Sbostic case BOL: 1212*41349Sbostic p = "BOL"; 1213*41349Sbostic break; 1214*41349Sbostic case EOL: 1215*41349Sbostic p = "EOL"; 1216*41349Sbostic break; 1217*41349Sbostic case ANY: 1218*41349Sbostic p = "ANY"; 1219*41349Sbostic break; 1220*41349Sbostic case ANYOF: 1221*41349Sbostic p = "ANYOF"; 1222*41349Sbostic break; 1223*41349Sbostic case ANYBUT: 1224*41349Sbostic p = "ANYBUT"; 1225*41349Sbostic break; 1226*41349Sbostic case BRANCH: 1227*41349Sbostic p = "BRANCH"; 1228*41349Sbostic break; 1229*41349Sbostic case EXACTLY: 1230*41349Sbostic p = "EXACTLY"; 1231*41349Sbostic break; 1232*41349Sbostic case NOTHING: 1233*41349Sbostic p = "NOTHING"; 1234*41349Sbostic break; 1235*41349Sbostic case BACK: 1236*41349Sbostic p = "BACK"; 1237*41349Sbostic break; 1238*41349Sbostic case END: 1239*41349Sbostic p = "END"; 1240*41349Sbostic break; 1241*41349Sbostic case OPEN+1: 1242*41349Sbostic case OPEN+2: 1243*41349Sbostic case OPEN+3: 1244*41349Sbostic case OPEN+4: 1245*41349Sbostic case OPEN+5: 1246*41349Sbostic case OPEN+6: 1247*41349Sbostic case OPEN+7: 1248*41349Sbostic case OPEN+8: 1249*41349Sbostic case OPEN+9: 1250*41349Sbostic sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN); 1251*41349Sbostic p = NULL; 1252*41349Sbostic break; 1253*41349Sbostic case CLOSE+1: 1254*41349Sbostic case CLOSE+2: 1255*41349Sbostic case CLOSE+3: 1256*41349Sbostic case CLOSE+4: 1257*41349Sbostic case CLOSE+5: 1258*41349Sbostic case CLOSE+6: 1259*41349Sbostic case CLOSE+7: 1260*41349Sbostic case CLOSE+8: 1261*41349Sbostic case CLOSE+9: 1262*41349Sbostic sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE); 1263*41349Sbostic p = NULL; 1264*41349Sbostic break; 1265*41349Sbostic case STAR: 1266*41349Sbostic p = "STAR"; 1267*41349Sbostic break; 1268*41349Sbostic case PLUS: 1269*41349Sbostic p = "PLUS"; 1270*41349Sbostic break; 1271*41349Sbostic case WORDA: 1272*41349Sbostic p = "WORDA"; 1273*41349Sbostic break; 1274*41349Sbostic case WORDZ: 1275*41349Sbostic p = "WORDZ"; 1276*41349Sbostic break; 1277*41349Sbostic default: 1278*41349Sbostic regerror("corrupted opcode"); 1279*41349Sbostic break; 1280*41349Sbostic } 1281*41349Sbostic if (p != NULL) 1282*41349Sbostic (void) strcat(buf, p); 1283*41349Sbostic return(buf); 1284*41349Sbostic } 1285*41349Sbostic #endif 1286*41349Sbostic 1287*41349Sbostic /* 1288*41349Sbostic * The following is provided for those people who do not have strcspn() in 1289*41349Sbostic * their C libraries. They should get off their butts and do something 1290*41349Sbostic * about it; at least one public-domain implementation of those (highly 1291*41349Sbostic * useful) string routines has been published on Usenet. 1292*41349Sbostic */ 1293*41349Sbostic #ifdef STRCSPN 1294*41349Sbostic /* 1295*41349Sbostic * strcspn - find length of initial segment of s1 consisting entirely 1296*41349Sbostic * of characters not from s2 1297*41349Sbostic */ 1298*41349Sbostic 1299*41349Sbostic static int 1300*41349Sbostic strcspn(s1, s2) 1301*41349Sbostic char *s1; 1302*41349Sbostic char *s2; 1303*41349Sbostic { 1304*41349Sbostic register char *scan1; 1305*41349Sbostic register char *scan2; 1306*41349Sbostic register int count; 1307*41349Sbostic 1308*41349Sbostic count = 0; 1309*41349Sbostic for (scan1 = s1; *scan1 != '\0'; scan1++) { 1310*41349Sbostic for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */ 1311*41349Sbostic if (*scan1 == *scan2++) 1312*41349Sbostic return(count); 1313*41349Sbostic count++; 1314*41349Sbostic } 1315*41349Sbostic return(count); 1316*41349Sbostic } 1317*41349Sbostic #endif 1318