1 /* 2 * regcomp and regexec -- regsub and regerror are elsewhere 3 * 4 * Copyright (c) 1986 by University of Toronto. 5 * Written by Henry Spencer. Not derived from licensed software. 6 * 7 * Permission is granted to anyone to use this software for any 8 * purpose on any computer system, and to redistribute it freely, 9 * subject to the following restrictions: 10 * 11 * 1. The author is not responsible for the consequences of use of 12 * this software, no matter how awful, even if they arise 13 * from defects in it. 14 * 15 * 2. The origin of this software must not be misrepresented, either 16 * by explicit claim or by omission. 17 * 18 * 3. Altered versions must be plainly marked as such, and must not 19 * be misrepresented as being the original software. 20 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, 21 *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to | 22 *** to assist in implementing egrep. 23 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, 24 *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching 25 *** as in BSD grep and ex. 26 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, 27 *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \. 28 *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods, 29 *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy. 30 * 31 * Beware that some of this code is subtly aware of the way operator 32 * precedence is structured in regular expressions. Serious changes in 33 * regular-expression syntax might require a total rethink. 34 */ 35 36 #ifndef lint 37 static char *rcsid = "$Id: regexp.c,v 1.4 1995/06/05 19:42:32 pk Exp $"; 38 #endif /* not lint */ 39 40 #include <regexp.h> 41 #include <stdio.h> 42 #include <ctype.h> 43 #include <stdlib.h> 44 #include <string.h> 45 #include "regmagic.h" 46 47 /* 48 * The "internal use only" fields in regexp.h are present to pass info from 49 * compile to execute that permits the execute phase to run lots faster on 50 * simple cases. They are: 51 * 52 * regstart char that must begin a match; '\0' if none obvious 53 * reganch is the match anchored (at beginning-of-line only)? 54 * regmust string (pointer into program) that match must include, or NULL 55 * regmlen length of regmust string 56 * 57 * Regstart and reganch permit very fast decisions on suitable starting points 58 * for a match, cutting down the work a lot. Regmust permits fast rejection 59 * of lines that cannot possibly match. The regmust tests are costly enough 60 * that regcomp() supplies a regmust only if the r.e. contains something 61 * potentially expensive (at present, the only such thing detected is * or + 62 * at the start of the r.e., which can involve a lot of backup). Regmlen is 63 * supplied because the test in regexec() needs it and regcomp() is computing 64 * it anyway. 65 */ 66 67 /* 68 * Structure for regexp "program". This is essentially a linear encoding 69 * of a nondeterministic finite-state machine (aka syntax charts or 70 * "railroad normal form" in parsing technology). Each node is an opcode 71 * plus a "next" pointer, possibly plus an operand. "Next" pointers of 72 * all nodes except BRANCH implement concatenation; a "next" pointer with 73 * a BRANCH on both ends of it is connecting two alternatives. (Here we 74 * have one of the subtle syntax dependencies: an individual BRANCH (as 75 * opposed to a collection of them) is never concatenated with anything 76 * because of operator precedence.) The operand of some types of node is 77 * a literal string; for others, it is a node leading into a sub-FSM. In 78 * particular, the operand of a BRANCH node is the first node of the branch. 79 * (NB this is *not* a tree structure: the tail of the branch connects 80 * to the thing following the set of BRANCHes.) The opcodes are: 81 */ 82 83 /* definition number opnd? meaning */ 84 #define END 0 /* no End of program. */ 85 #define BOL 1 /* no Match "" at beginning of line. */ 86 #define EOL 2 /* no Match "" at end of line. */ 87 #define ANY 3 /* no Match any one character. */ 88 #define ANYOF 4 /* str Match any character in this string. */ 89 #define ANYBUT 5 /* str Match any character not in this string. */ 90 #define BRANCH 6 /* node Match this alternative, or the next... */ 91 #define BACK 7 /* no Match "", "next" ptr points backward. */ 92 #define EXACTLY 8 /* str Match this string. */ 93 #define NOTHING 9 /* no Match empty string. */ 94 #define STAR 10 /* node Match this (simple) thing 0 or more times. */ 95 #define PLUS 11 /* node Match this (simple) thing 1 or more times. */ 96 #define WORDA 12 /* no Match "" at wordchar, where prev is nonword */ 97 #define WORDZ 13 /* no Match "" at nonwordchar, where prev is word */ 98 #define OPEN 20 /* no Mark this point in input as start of #n. */ 99 /* OPEN+1 is number 1, etc. */ 100 #define CLOSE 30 /* no Analogous to OPEN. */ 101 102 /* 103 * Opcode notes: 104 * 105 * BRANCH The set of branches constituting a single choice are hooked 106 * together with their "next" pointers, since precedence prevents 107 * anything being concatenated to any individual branch. The 108 * "next" pointer of the last BRANCH in a choice points to the 109 * thing following the whole choice. This is also where the 110 * final "next" pointer of each individual branch points; each 111 * branch starts with the operand node of a BRANCH node. 112 * 113 * BACK Normal "next" pointers all implicitly point forward; BACK 114 * exists to make loop structures possible. 115 * 116 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular 117 * BRANCH structures using BACK. Simple cases (one character 118 * per match) are implemented with STAR and PLUS for speed 119 * and to minimize recursive plunges. 120 * 121 * OPEN,CLOSE ...are numbered at compile time. 122 */ 123 124 /* 125 * A node is one char of opcode followed by two chars of "next" pointer. 126 * "Next" pointers are stored as two 8-bit pieces, high order first. The 127 * value is a positive offset from the opcode of the node containing it. 128 * An operand, if any, simply follows the node. (Note that much of the 129 * code generation knows about this implicit relationship.) 130 * 131 * Using two bytes for the "next" pointer is vast overkill for most things, 132 * but allows patterns to get big without disasters. 133 */ 134 #define OP(p) (*(p)) 135 #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377)) 136 #define OPERAND(p) ((p) + 3) 137 138 /* 139 * See regmagic.h for one further detail of program structure. 140 */ 141 142 143 /* 144 * Utility definitions. 145 */ 146 #ifndef CHARBITS 147 #define UCHARAT(p) ((int)*(unsigned char *)(p)) 148 #else 149 #define UCHARAT(p) ((int)*(p)&CHARBITS) 150 #endif 151 152 #define FAIL(m) { regerror(m); return(NULL); } 153 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?') 154 155 /* 156 * Flags to be passed up and down. 157 */ 158 #define HASWIDTH 01 /* Known never to match null string. */ 159 #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */ 160 #define SPSTART 04 /* Starts with * or +. */ 161 #define WORST 0 /* Worst case. */ 162 163 /* 164 * Global work variables for regcomp(). 165 */ 166 static char *regparse; /* Input-scan pointer. */ 167 static int regnpar; /* () count. */ 168 static char regdummy; 169 static char *regcode; /* Code-emit pointer; ®dummy = don't. */ 170 static long regsize; /* Code size. */ 171 172 /* 173 * Forward declarations for regcomp()'s friends. 174 */ 175 #ifndef STATIC 176 #define STATIC static 177 #endif 178 STATIC char *reg __P((int, int *)); 179 STATIC char *regbranch __P((int *)); 180 STATIC char *regpiece __P((int *)); 181 STATIC char *regatom __P((int *)); 182 STATIC char *regnode __P((char)); 183 STATIC char *regnext __P((char *)); 184 STATIC void regc __P((char)); 185 STATIC void reginsert __P((char, char *)); 186 STATIC void regtail __P((char *, char *)); 187 STATIC void regoptail __P((char *, char *)); 188 #ifdef STRCSPN 189 STATIC int strcspn __P((char *, char *)); 190 #endif 191 192 /* 193 - regcomp - compile a regular expression into internal code 194 * 195 * We can't allocate space until we know how big the compiled form will be, 196 * but we can't compile it (and thus know how big it is) until we've got a 197 * place to put the code. So we cheat: we compile it twice, once with code 198 * generation turned off and size counting turned on, and once "for real". 199 * This also means that we don't allocate space until we are sure that the 200 * thing really will compile successfully, and we never have to move the 201 * code and thus invalidate pointers into it. (Note that it has to be in 202 * one piece because free() must be able to free it all.) 203 * 204 * Beware that the optimization-preparation code in here knows about some 205 * of the structure of the compiled regexp. 206 */ 207 regexp * 208 regcomp(exp) 209 const char *exp; 210 { 211 register regexp *r; 212 register char *scan; 213 register char *longest; 214 register int len; 215 int flags; 216 217 if (exp == NULL) 218 FAIL("NULL argument"); 219 220 /* First pass: determine size, legality. */ 221 #ifdef notdef 222 if (exp[0] == '.' && exp[1] == '*') exp += 2; /* aid grep */ 223 #endif 224 regparse = (char *)exp; 225 regnpar = 1; 226 regsize = 0L; 227 regcode = ®dummy; 228 regc(MAGIC); 229 if (reg(0, &flags) == NULL) 230 return(NULL); 231 232 /* Small enough for pointer-storage convention? */ 233 if (regsize >= 32767L) /* Probably could be 65535L. */ 234 FAIL("regexp too big"); 235 236 /* Allocate space. */ 237 r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize); 238 if (r == NULL) 239 FAIL("out of space"); 240 241 /* Second pass: emit code. */ 242 regparse = (char *)exp; 243 regnpar = 1; 244 regcode = r->program; 245 regc(MAGIC); 246 if (reg(0, &flags) == NULL) 247 return(NULL); 248 249 /* Dig out information for optimizations. */ 250 r->regstart = '\0'; /* Worst-case defaults. */ 251 r->reganch = 0; 252 r->regmust = NULL; 253 r->regmlen = 0; 254 scan = r->program+1; /* First BRANCH. */ 255 if (OP(regnext(scan)) == END) { /* Only one top-level choice. */ 256 scan = OPERAND(scan); 257 258 /* Starting-point info. */ 259 if (OP(scan) == EXACTLY) 260 r->regstart = *OPERAND(scan); 261 else if (OP(scan) == BOL) 262 r->reganch++; 263 264 /* 265 * If there's something expensive in the r.e., find the 266 * longest literal string that must appear and make it the 267 * regmust. Resolve ties in favor of later strings, since 268 * the regstart check works with the beginning of the r.e. 269 * and avoiding duplication strengthens checking. Not a 270 * strong reason, but sufficient in the absence of others. 271 */ 272 if (flags&SPSTART) { 273 longest = NULL; 274 len = 0; 275 for (; scan != NULL; scan = regnext(scan)) 276 if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) { 277 longest = OPERAND(scan); 278 len = strlen(OPERAND(scan)); 279 } 280 r->regmust = longest; 281 r->regmlen = len; 282 } 283 } 284 285 return(r); 286 } 287 288 /* 289 - reg - regular expression, i.e. main body or parenthesized thing 290 * 291 * Caller must absorb opening parenthesis. 292 * 293 * Combining parenthesis handling with the base level of regular expression 294 * is a trifle forced, but the need to tie the tails of the branches to what 295 * follows makes it hard to avoid. 296 */ 297 static char * 298 reg(paren, flagp) 299 int paren; /* Parenthesized? */ 300 int *flagp; 301 { 302 register char *ret; 303 register char *br; 304 register char *ender; 305 register int parno = 0; 306 int flags; 307 308 *flagp = HASWIDTH; /* Tentatively. */ 309 310 /* Make an OPEN node, if parenthesized. */ 311 if (paren) { 312 if (regnpar >= NSUBEXP) 313 FAIL("too many ()"); 314 parno = regnpar; 315 regnpar++; 316 ret = regnode(OPEN+parno); 317 } else 318 ret = NULL; 319 320 /* Pick up the branches, linking them together. */ 321 br = regbranch(&flags); 322 if (br == NULL) 323 return(NULL); 324 if (ret != NULL) 325 regtail(ret, br); /* OPEN -> first. */ 326 else 327 ret = br; 328 if (!(flags&HASWIDTH)) 329 *flagp &= ~HASWIDTH; 330 *flagp |= flags&SPSTART; 331 while (*regparse == '|' || *regparse == '\n') { 332 regparse++; 333 br = regbranch(&flags); 334 if (br == NULL) 335 return(NULL); 336 regtail(ret, br); /* BRANCH -> BRANCH. */ 337 if (!(flags&HASWIDTH)) 338 *flagp &= ~HASWIDTH; 339 *flagp |= flags&SPSTART; 340 } 341 342 /* Make a closing node, and hook it on the end. */ 343 ender = regnode((paren) ? CLOSE+parno : END); 344 regtail(ret, ender); 345 346 /* Hook the tails of the branches to the closing node. */ 347 for (br = ret; br != NULL; br = regnext(br)) 348 regoptail(br, ender); 349 350 /* Check for proper termination. */ 351 if (paren && *regparse++ != ')') { 352 FAIL("unmatched ()"); 353 } else if (!paren && *regparse != '\0') { 354 if (*regparse == ')') { 355 FAIL("unmatched ()"); 356 } else 357 FAIL("junk on end"); /* "Can't happen". */ 358 /* NOTREACHED */ 359 } 360 361 return(ret); 362 } 363 364 /* 365 - regbranch - one alternative of an | operator 366 * 367 * Implements the concatenation operator. 368 */ 369 static char * 370 regbranch(flagp) 371 int *flagp; 372 { 373 register char *ret; 374 register char *chain; 375 register char *latest; 376 int flags; 377 378 *flagp = WORST; /* Tentatively. */ 379 380 ret = regnode(BRANCH); 381 chain = NULL; 382 while (*regparse != '\0' && *regparse != ')' && 383 *regparse != '\n' && *regparse != '|') { 384 latest = regpiece(&flags); 385 if (latest == NULL) 386 return(NULL); 387 *flagp |= flags&HASWIDTH; 388 if (chain == NULL) /* First piece. */ 389 *flagp |= flags&SPSTART; 390 else 391 regtail(chain, latest); 392 chain = latest; 393 } 394 if (chain == NULL) /* Loop ran zero times. */ 395 (void) regnode(NOTHING); 396 397 return(ret); 398 } 399 400 /* 401 - regpiece - something followed by possible [*+?] 402 * 403 * Note that the branching code sequences used for ? and the general cases 404 * of * and + are somewhat optimized: they use the same NOTHING node as 405 * both the endmarker for their branch list and the body of the last branch. 406 * It might seem that this node could be dispensed with entirely, but the 407 * endmarker role is not redundant. 408 */ 409 static char * 410 regpiece(flagp) 411 int *flagp; 412 { 413 register char *ret; 414 register char op; 415 register char *next; 416 int flags; 417 418 ret = regatom(&flags); 419 if (ret == NULL) 420 return(NULL); 421 422 op = *regparse; 423 if (!ISMULT(op)) { 424 *flagp = flags; 425 return(ret); 426 } 427 428 if (!(flags&HASWIDTH) && op != '?') 429 FAIL("*+ operand could be empty"); 430 *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH); 431 432 if (op == '*' && (flags&SIMPLE)) 433 reginsert(STAR, ret); 434 else if (op == '*') { 435 /* Emit x* as (x&|), where & means "self". */ 436 reginsert(BRANCH, ret); /* Either x */ 437 regoptail(ret, regnode(BACK)); /* and loop */ 438 regoptail(ret, ret); /* back */ 439 regtail(ret, regnode(BRANCH)); /* or */ 440 regtail(ret, regnode(NOTHING)); /* null. */ 441 } else if (op == '+' && (flags&SIMPLE)) 442 reginsert(PLUS, ret); 443 else if (op == '+') { 444 /* Emit x+ as x(&|), where & means "self". */ 445 next = regnode(BRANCH); /* Either */ 446 regtail(ret, next); 447 regtail(regnode(BACK), ret); /* loop back */ 448 regtail(next, regnode(BRANCH)); /* or */ 449 regtail(ret, regnode(NOTHING)); /* null. */ 450 } else if (op == '?') { 451 /* Emit x? as (x|) */ 452 reginsert(BRANCH, ret); /* Either x */ 453 regtail(ret, regnode(BRANCH)); /* or */ 454 next = regnode(NOTHING); /* null. */ 455 regtail(ret, next); 456 regoptail(ret, next); 457 } 458 regparse++; 459 if (ISMULT(*regparse)) 460 FAIL("nested *?+"); 461 462 return(ret); 463 } 464 465 /* 466 - regatom - the lowest level 467 * 468 * Optimization: gobbles an entire sequence of ordinary characters so that 469 * it can turn them into a single node, which is smaller to store and 470 * faster to run. Backslashed characters are exceptions, each becoming a 471 * separate node; the code is simpler that way and it's not worth fixing. 472 */ 473 static char * 474 regatom(flagp) 475 int *flagp; 476 { 477 register char *ret; 478 int flags; 479 480 *flagp = WORST; /* Tentatively. */ 481 482 switch (*regparse++) { 483 /* FIXME: these chars only have meaning at beg/end of pat? */ 484 case '^': 485 ret = regnode(BOL); 486 break; 487 case '$': 488 ret = regnode(EOL); 489 break; 490 case '.': 491 ret = regnode(ANY); 492 *flagp |= HASWIDTH|SIMPLE; 493 break; 494 case '[': { 495 register int class; 496 register int classend; 497 498 if (*regparse == '^') { /* Complement of range. */ 499 ret = regnode(ANYBUT); 500 regparse++; 501 } else 502 ret = regnode(ANYOF); 503 if (*regparse == ']' || *regparse == '-') 504 regc(*regparse++); 505 while (*regparse != '\0' && *regparse != ']') { 506 if (*regparse == '-') { 507 regparse++; 508 if (*regparse == ']' || *regparse == '\0') 509 regc('-'); 510 else { 511 class = UCHARAT(regparse-2)+1; 512 classend = UCHARAT(regparse); 513 if (class > classend+1) 514 FAIL("invalid [] range"); 515 for (; class <= classend; class++) 516 regc(class); 517 regparse++; 518 } 519 } else 520 regc(*regparse++); 521 } 522 regc('\0'); 523 if (*regparse != ']') 524 FAIL("unmatched []"); 525 regparse++; 526 *flagp |= HASWIDTH|SIMPLE; 527 } 528 break; 529 case '(': 530 ret = reg(1, &flags); 531 if (ret == NULL) 532 return(NULL); 533 *flagp |= flags&(HASWIDTH|SPSTART); 534 break; 535 case '\0': 536 case '|': 537 case '\n': 538 case ')': 539 FAIL("internal urp"); /* Supposed to be caught earlier. */ 540 break; 541 case '?': 542 case '+': 543 case '*': 544 FAIL("?+* follows nothing"); 545 break; 546 case '\\': 547 switch (*regparse++) { 548 case '\0': 549 FAIL("trailing \\"); 550 break; 551 case '<': 552 ret = regnode(WORDA); 553 break; 554 case '>': 555 ret = regnode(WORDZ); 556 break; 557 /* FIXME: Someday handle \1, \2, ... */ 558 default: 559 /* Handle general quoted chars in exact-match routine */ 560 goto de_fault; 561 } 562 break; 563 de_fault: 564 default: 565 /* 566 * Encode a string of characters to be matched exactly. 567 * 568 * This is a bit tricky due to quoted chars and due to 569 * '*', '+', and '?' taking the SINGLE char previous 570 * as their operand. 571 * 572 * On entry, the char at regparse[-1] is going to go 573 * into the string, no matter what it is. (It could be 574 * following a \ if we are entered from the '\' case.) 575 * 576 * Basic idea is to pick up a good char in ch and 577 * examine the next char. If it's *+? then we twiddle. 578 * If it's \ then we frozzle. If it's other magic char 579 * we push ch and terminate the string. If none of the 580 * above, we push ch on the string and go around again. 581 * 582 * regprev is used to remember where "the current char" 583 * starts in the string, if due to a *+? we need to back 584 * up and put the current char in a separate, 1-char, string. 585 * When regprev is NULL, ch is the only char in the 586 * string; this is used in *+? handling, and in setting 587 * flags |= SIMPLE at the end. 588 */ 589 { 590 char *regprev; 591 register char ch; 592 593 regparse--; /* Look at cur char */ 594 ret = regnode(EXACTLY); 595 for ( regprev = 0 ; ; ) { 596 ch = *regparse++; /* Get current char */ 597 switch (*regparse) { /* look at next one */ 598 599 default: 600 regc(ch); /* Add cur to string */ 601 break; 602 603 case '.': case '[': case '(': 604 case ')': case '|': case '\n': 605 case '$': case '^': 606 case '\0': 607 /* FIXME, $ and ^ should not always be magic */ 608 magic: 609 regc(ch); /* dump cur char */ 610 goto done; /* and we are done */ 611 612 case '?': case '+': case '*': 613 if (!regprev) /* If just ch in str, */ 614 goto magic; /* use it */ 615 /* End mult-char string one early */ 616 regparse = regprev; /* Back up parse */ 617 goto done; 618 619 case '\\': 620 regc(ch); /* Cur char OK */ 621 switch (regparse[1]){ /* Look after \ */ 622 case '\0': 623 case '<': 624 case '>': 625 /* FIXME: Someday handle \1, \2, ... */ 626 goto done; /* Not quoted */ 627 default: 628 /* Backup point is \, scan * point is after it. */ 629 regprev = regparse; 630 regparse++; 631 continue; /* NOT break; */ 632 } 633 } 634 regprev = regparse; /* Set backup point */ 635 } 636 done: 637 regc('\0'); 638 *flagp |= HASWIDTH; 639 if (!regprev) /* One char? */ 640 *flagp |= SIMPLE; 641 } 642 break; 643 } 644 645 return(ret); 646 } 647 648 /* 649 - regnode - emit a node 650 */ 651 static char * /* Location. */ 652 regnode(op) 653 char op; 654 { 655 register char *ret; 656 register char *ptr; 657 658 ret = regcode; 659 if (ret == ®dummy) { 660 regsize += 3; 661 return(ret); 662 } 663 664 ptr = ret; 665 *ptr++ = op; 666 *ptr++ = '\0'; /* Null "next" pointer. */ 667 *ptr++ = '\0'; 668 regcode = ptr; 669 670 return(ret); 671 } 672 673 /* 674 - regc - emit (if appropriate) a byte of code 675 */ 676 static void 677 regc(b) 678 char b; 679 { 680 if (regcode != ®dummy) 681 *regcode++ = b; 682 else 683 regsize++; 684 } 685 686 /* 687 - reginsert - insert an operator in front of already-emitted operand 688 * 689 * Means relocating the operand. 690 */ 691 static void 692 reginsert(op, opnd) 693 char op; 694 char *opnd; 695 { 696 register char *src; 697 register char *dst; 698 register char *place; 699 700 if (regcode == ®dummy) { 701 regsize += 3; 702 return; 703 } 704 705 src = regcode; 706 regcode += 3; 707 dst = regcode; 708 while (src > opnd) 709 *--dst = *--src; 710 711 place = opnd; /* Op node, where operand used to be. */ 712 *place++ = op; 713 *place++ = '\0'; 714 *place++ = '\0'; 715 } 716 717 /* 718 - regtail - set the next-pointer at the end of a node chain 719 */ 720 static void 721 regtail(p, val) 722 char *p; 723 char *val; 724 { 725 register char *scan; 726 register char *temp; 727 register int offset; 728 729 if (p == ®dummy) 730 return; 731 732 /* Find last node. */ 733 scan = p; 734 for (;;) { 735 temp = regnext(scan); 736 if (temp == NULL) 737 break; 738 scan = temp; 739 } 740 741 if (OP(scan) == BACK) 742 offset = scan - val; 743 else 744 offset = val - scan; 745 *(scan+1) = (offset>>8)&0377; 746 *(scan+2) = offset&0377; 747 } 748 749 /* 750 - regoptail - regtail on operand of first argument; nop if operandless 751 */ 752 static void 753 regoptail(p, val) 754 char *p; 755 char *val; 756 { 757 /* "Operandless" and "op != BRANCH" are synonymous in practice. */ 758 if (p == NULL || p == ®dummy || OP(p) != BRANCH) 759 return; 760 regtail(OPERAND(p), val); 761 } 762 763 /* 764 * regexec and friends 765 */ 766 767 /* 768 * Global work variables for regexec(). 769 */ 770 static char *reginput; /* String-input pointer. */ 771 static char *regbol; /* Beginning of input, for ^ check. */ 772 static char **regstartp; /* Pointer to startp array. */ 773 static char **regendp; /* Ditto for endp. */ 774 775 /* 776 * Forwards. 777 */ 778 STATIC int regtry __P((const regexp *, const char *)); 779 STATIC int regmatch __P((char *)); 780 STATIC int regrepeat __P((char *)); 781 782 #ifdef DEBUG 783 int regnarrate = 0; 784 void regdump __P((regexp *)); 785 STATIC char *regprop __P((char *)); 786 #endif 787 788 /* 789 - regexec - match a regexp against a string 790 */ 791 int 792 regexec(prog, string) 793 register const regexp *prog; 794 register const char *string; 795 { 796 register char *s; 797 798 /* Be paranoid... */ 799 if (prog == NULL || string == NULL) { 800 regerror("NULL parameter"); 801 return(0); 802 } 803 804 /* Check validity of program. */ 805 if (UCHARAT(prog->program) != MAGIC) { 806 regerror("corrupted program"); 807 return(0); 808 } 809 810 /* If there is a "must appear" string, look for it. */ 811 if (prog->regmust != NULL) { 812 s = (char *)string; 813 while ((s = strchr(s, prog->regmust[0])) != NULL) { 814 if (strncmp(s, prog->regmust, prog->regmlen) == 0) 815 break; /* Found it. */ 816 s++; 817 } 818 if (s == NULL) /* Not present. */ 819 return(0); 820 } 821 822 /* Mark beginning of line for ^ . */ 823 regbol = (char *)string; 824 825 /* Simplest case: anchored match need be tried only once. */ 826 if (prog->reganch) 827 return(regtry(prog, string)); 828 829 /* Messy cases: unanchored match. */ 830 s = (char *)string; 831 if (prog->regstart != '\0') 832 /* We know what char it must start with. */ 833 while ((s = strchr(s, prog->regstart)) != NULL) { 834 if (regtry(prog, s)) 835 return(1); 836 s++; 837 } 838 else 839 /* We don't -- general case. */ 840 do { 841 if (regtry(prog, s)) 842 return(1); 843 } while (*s++ != '\0'); 844 845 /* Failure. */ 846 return(0); 847 } 848 849 /* 850 - regtry - try match at specific point 851 */ 852 static int /* 0 failure, 1 success */ 853 regtry(prog, string) 854 const regexp *prog; 855 const char *string; 856 { 857 register int i; 858 register char **sp; 859 register char **ep; 860 861 reginput = (char *)string; 862 regstartp = prog->startp; 863 regendp = prog->endp; 864 865 sp = prog->startp; 866 ep = prog->endp; 867 for (i = NSUBEXP; i > 0; i--) { 868 *sp++ = NULL; 869 *ep++ = NULL; 870 } 871 if (regmatch(prog->program + 1)) { 872 ((regexp *)prog)->startp[0] = (char *)string; 873 ((regexp *)prog)->endp[0] = reginput; 874 return(1); 875 } else 876 return(0); 877 } 878 879 /* 880 - regmatch - main matching routine 881 * 882 * Conceptually the strategy is simple: check to see whether the current 883 * node matches, call self recursively to see whether the rest matches, 884 * and then act accordingly. In practice we make some effort to avoid 885 * recursion, in particular by going through "ordinary" nodes (that don't 886 * need to know whether the rest of the match failed) by a loop instead of 887 * by recursion. 888 */ 889 static int /* 0 failure, 1 success */ 890 regmatch(prog) 891 char *prog; 892 { 893 register char *scan; /* Current node. */ 894 char *next; /* Next node. */ 895 896 scan = prog; 897 #ifdef DEBUG 898 if (scan != NULL && regnarrate) 899 fprintf(stderr, "%s(\n", regprop(scan)); 900 #endif 901 while (scan != NULL) { 902 #ifdef DEBUG 903 if (regnarrate) 904 fprintf(stderr, "%s...\n", regprop(scan)); 905 #endif 906 next = regnext(scan); 907 908 switch (OP(scan)) { 909 case BOL: 910 if (reginput != regbol) 911 return(0); 912 break; 913 case EOL: 914 if (*reginput != '\0') 915 return(0); 916 break; 917 case WORDA: 918 /* Must be looking at a letter, digit, or _ */ 919 if ((!isalnum(*reginput)) && *reginput != '_') 920 return(0); 921 /* Prev must be BOL or nonword */ 922 if (reginput > regbol && 923 (isalnum(reginput[-1]) || reginput[-1] == '_')) 924 return(0); 925 break; 926 case WORDZ: 927 /* Must be looking at non letter, digit, or _ */ 928 if (isalnum(*reginput) || *reginput == '_') 929 return(0); 930 /* We don't care what the previous char was */ 931 break; 932 case ANY: 933 if (*reginput == '\0') 934 return(0); 935 reginput++; 936 break; 937 case EXACTLY: { 938 register int len; 939 register char *opnd; 940 941 opnd = OPERAND(scan); 942 /* Inline the first character, for speed. */ 943 if (*opnd != *reginput) 944 return(0); 945 len = strlen(opnd); 946 if (len > 1 && strncmp(opnd, reginput, len) != 0) 947 return(0); 948 reginput += len; 949 } 950 break; 951 case ANYOF: 952 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL) 953 return(0); 954 reginput++; 955 break; 956 case ANYBUT: 957 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL) 958 return(0); 959 reginput++; 960 break; 961 case NOTHING: 962 break; 963 case BACK: 964 break; 965 case OPEN+1: 966 case OPEN+2: 967 case OPEN+3: 968 case OPEN+4: 969 case OPEN+5: 970 case OPEN+6: 971 case OPEN+7: 972 case OPEN+8: 973 case OPEN+9: { 974 register int no; 975 register char *save; 976 977 no = OP(scan) - OPEN; 978 save = reginput; 979 980 if (regmatch(next)) { 981 /* 982 * Don't set startp if some later 983 * invocation of the same parentheses 984 * already has. 985 */ 986 if (regstartp[no] == NULL) 987 regstartp[no] = save; 988 return(1); 989 } else 990 return(0); 991 } 992 break; 993 case CLOSE+1: 994 case CLOSE+2: 995 case CLOSE+3: 996 case CLOSE+4: 997 case CLOSE+5: 998 case CLOSE+6: 999 case CLOSE+7: 1000 case CLOSE+8: 1001 case CLOSE+9: { 1002 register int no; 1003 register char *save; 1004 1005 no = OP(scan) - CLOSE; 1006 save = reginput; 1007 1008 if (regmatch(next)) { 1009 /* 1010 * Don't set endp if some later 1011 * invocation of the same parentheses 1012 * already has. 1013 */ 1014 if (regendp[no] == NULL) 1015 regendp[no] = save; 1016 return(1); 1017 } else 1018 return(0); 1019 } 1020 break; 1021 case BRANCH: { 1022 register char *save; 1023 1024 if (OP(next) != BRANCH) /* No choice. */ 1025 next = OPERAND(scan); /* Avoid recursion. */ 1026 else { 1027 do { 1028 save = reginput; 1029 if (regmatch(OPERAND(scan))) 1030 return(1); 1031 reginput = save; 1032 scan = regnext(scan); 1033 } while (scan != NULL && OP(scan) == BRANCH); 1034 return(0); 1035 /* NOTREACHED */ 1036 } 1037 } 1038 break; 1039 case STAR: 1040 case PLUS: { 1041 register char nextch; 1042 register int no; 1043 register char *save; 1044 register int min; 1045 1046 /* 1047 * Lookahead to avoid useless match attempts 1048 * when we know what character comes next. 1049 */ 1050 nextch = '\0'; 1051 if (OP(next) == EXACTLY) 1052 nextch = *OPERAND(next); 1053 min = (OP(scan) == STAR) ? 0 : 1; 1054 save = reginput; 1055 no = regrepeat(OPERAND(scan)); 1056 while (no >= min) { 1057 /* If it could work, try it. */ 1058 if (nextch == '\0' || *reginput == nextch) 1059 if (regmatch(next)) 1060 return(1); 1061 /* Couldn't or didn't -- back up. */ 1062 no--; 1063 reginput = save + no; 1064 } 1065 return(0); 1066 } 1067 break; 1068 case END: 1069 return(1); /* Success! */ 1070 break; 1071 default: 1072 regerror("memory corruption"); 1073 return(0); 1074 break; 1075 } 1076 1077 scan = next; 1078 } 1079 1080 /* 1081 * We get here only if there's trouble -- normally "case END" is 1082 * the terminating point. 1083 */ 1084 regerror("corrupted pointers"); 1085 return(0); 1086 } 1087 1088 /* 1089 - regrepeat - repeatedly match something simple, report how many 1090 */ 1091 static int 1092 regrepeat(p) 1093 char *p; 1094 { 1095 register int count = 0; 1096 register char *scan; 1097 register char *opnd; 1098 1099 scan = reginput; 1100 opnd = OPERAND(p); 1101 switch (OP(p)) { 1102 case ANY: 1103 count = strlen(scan); 1104 scan += count; 1105 break; 1106 case EXACTLY: 1107 while (*opnd == *scan) { 1108 count++; 1109 scan++; 1110 } 1111 break; 1112 case ANYOF: 1113 while (*scan != '\0' && strchr(opnd, *scan) != NULL) { 1114 count++; 1115 scan++; 1116 } 1117 break; 1118 case ANYBUT: 1119 while (*scan != '\0' && strchr(opnd, *scan) == NULL) { 1120 count++; 1121 scan++; 1122 } 1123 break; 1124 default: /* Oh dear. Called inappropriately. */ 1125 regerror("internal foulup"); 1126 count = 0; /* Best compromise. */ 1127 break; 1128 } 1129 reginput = scan; 1130 1131 return(count); 1132 } 1133 1134 /* 1135 - regnext - dig the "next" pointer out of a node 1136 */ 1137 static char * 1138 regnext(p) 1139 register char *p; 1140 { 1141 register int offset; 1142 1143 if (p == ®dummy) 1144 return(NULL); 1145 1146 offset = NEXT(p); 1147 if (offset == 0) 1148 return(NULL); 1149 1150 if (OP(p) == BACK) 1151 return(p-offset); 1152 else 1153 return(p+offset); 1154 } 1155 1156 #ifdef DEBUG 1157 1158 /* 1159 - regdump - dump a regexp onto stdout in vaguely comprehensible form 1160 */ 1161 void 1162 regdump(r) 1163 regexp *r; 1164 { 1165 register char *s; 1166 register char op = EXACTLY; /* Arbitrary non-END op. */ 1167 register char *next; 1168 extern char *strchr(); 1169 1170 1171 s = r->program + 1; 1172 while (op != END) { /* While that wasn't END last time... */ 1173 op = OP(s); 1174 printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */ 1175 next = regnext(s); 1176 if (next == NULL) /* Next ptr. */ 1177 printf("(0)"); 1178 else 1179 printf("(%d)", (s-r->program)+(next-s)); 1180 s += 3; 1181 if (op == ANYOF || op == ANYBUT || op == EXACTLY) { 1182 /* Literal string, where present. */ 1183 while (*s != '\0') { 1184 putchar(*s); 1185 s++; 1186 } 1187 s++; 1188 } 1189 putchar('\n'); 1190 } 1191 1192 /* Header fields of interest. */ 1193 if (r->regstart != '\0') 1194 printf("start `%c' ", r->regstart); 1195 if (r->reganch) 1196 printf("anchored "); 1197 if (r->regmust != NULL) 1198 printf("must have \"%s\"", r->regmust); 1199 printf("\n"); 1200 } 1201 1202 /* 1203 - regprop - printable representation of opcode 1204 */ 1205 static char * 1206 regprop(op) 1207 char *op; 1208 { 1209 register char *p; 1210 static char buf[50]; 1211 1212 (void) strcpy(buf, ":"); 1213 1214 switch (OP(op)) { 1215 case BOL: 1216 p = "BOL"; 1217 break; 1218 case EOL: 1219 p = "EOL"; 1220 break; 1221 case ANY: 1222 p = "ANY"; 1223 break; 1224 case ANYOF: 1225 p = "ANYOF"; 1226 break; 1227 case ANYBUT: 1228 p = "ANYBUT"; 1229 break; 1230 case BRANCH: 1231 p = "BRANCH"; 1232 break; 1233 case EXACTLY: 1234 p = "EXACTLY"; 1235 break; 1236 case NOTHING: 1237 p = "NOTHING"; 1238 break; 1239 case BACK: 1240 p = "BACK"; 1241 break; 1242 case END: 1243 p = "END"; 1244 break; 1245 case OPEN+1: 1246 case OPEN+2: 1247 case OPEN+3: 1248 case OPEN+4: 1249 case OPEN+5: 1250 case OPEN+6: 1251 case OPEN+7: 1252 case OPEN+8: 1253 case OPEN+9: 1254 sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN); 1255 p = NULL; 1256 break; 1257 case CLOSE+1: 1258 case CLOSE+2: 1259 case CLOSE+3: 1260 case CLOSE+4: 1261 case CLOSE+5: 1262 case CLOSE+6: 1263 case CLOSE+7: 1264 case CLOSE+8: 1265 case CLOSE+9: 1266 sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE); 1267 p = NULL; 1268 break; 1269 case STAR: 1270 p = "STAR"; 1271 break; 1272 case PLUS: 1273 p = "PLUS"; 1274 break; 1275 case WORDA: 1276 p = "WORDA"; 1277 break; 1278 case WORDZ: 1279 p = "WORDZ"; 1280 break; 1281 default: 1282 regerror("corrupted opcode"); 1283 break; 1284 } 1285 if (p != NULL) 1286 (void) strcat(buf, p); 1287 return(buf); 1288 } 1289 #endif 1290 1291 /* 1292 * The following is provided for those people who do not have strcspn() in 1293 * their C libraries. They should get off their butts and do something 1294 * about it; at least one public-domain implementation of those (highly 1295 * useful) string routines has been published on Usenet. 1296 */ 1297 #ifdef STRCSPN 1298 /* 1299 * strcspn - find length of initial segment of s1 consisting entirely 1300 * of characters not from s2 1301 */ 1302 1303 static int 1304 strcspn(s1, s2) 1305 char *s1; 1306 char *s2; 1307 { 1308 register char *scan1; 1309 register char *scan2; 1310 register int count; 1311 1312 count = 0; 1313 for (scan1 = s1; *scan1 != '\0'; scan1++) { 1314 for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */ 1315 if (*scan1 == *scan2++) 1316 return(count); 1317 count++; 1318 } 1319 return(count); 1320 } 1321 #endif 1322