1 /* $Source: /u/mark/src/pax/RCS/regexp.c,v $ 2 * 3 * $Revision: 1.2 $ 4 * 5 * regexp.c - regular expression matching 6 * 7 * DESCRIPTION 8 * 9 * Underneath the reformatting and comment blocks which were added to 10 * make it consistent with the rest of the code, you will find a 11 * modified version of Henry Specer's regular expression library. 12 * Henry's functions were modified to provide the minimal regular 13 * expression matching, as required by P1003. Henry's code was 14 * copyrighted, and copy of the copyright message and restrictions 15 * are provided, verbatim, below: 16 * 17 * Copyright (c) 1986 by University of Toronto. 18 * Written by Henry Spencer. Not derived from licensed software. 19 * 20 * Permission is granted to anyone to use this software for any 21 * purpose on any computer system, and to redistribute it freely, 22 * subject to the following restrictions: 23 * 24 * 1. The author is not responsible for the consequences of use of 25 * this software, no matter how awful, even if they arise 26 * from defects in it. 27 * 28 * 2. The origin of this software must not be misrepresented, either 29 * by explicit claim or by omission. 30 * 31 * 3. Altered versions must be plainly marked as such, and must not 32 * be misrepresented as being the original software. 33 * 34 * Beware that some of this code is subtly aware of the way operator 35 * precedence is structured in regular expressions. Serious changes in 36 * regular-expression syntax might require a total rethink. 37 * 38 * AUTHORS 39 * 40 * Mark H. Colburn, NAPS International (mark@jhereg.mn.org) 41 * Henry Spencer, University of Torronto (henry@utzoo.edu) 42 * 43 * Sponsored by The USENIX Association for public distribution. 44 * 45 * $Log: regexp.c,v $ 46 * Revision 1.2 89/02/12 10:05:39 mark 47 * 1.2 release fixes 48 * 49 * Revision 1.1 88/12/23 18:02:32 mark 50 * Initial revision 51 * 52 */ 53 54 /* Headers */ 55 56 #include "pax.h" 57 58 #ifndef lint 59 static char *Ident = "$Id: regexp.c,v 1.2 89/02/12 10:05:39 mark Exp $"; 60 #endif 61 62 63 /* 64 * The "internal use only" fields in regexp.h are present to pass info from 65 * compile to execute that permits the execute phase to run lots faster on 66 * simple cases. They are: 67 * 68 * regstart char that must begin a match; '\0' if none obvious 69 * reganch is the match anchored (at beginning-of-line only)? 70 * regmust string (pointer into program) that match must include, or NULL 71 * regmlen length of regmust string 72 * 73 * Regstart and reganch permit very fast decisions on suitable starting points 74 * for a match, cutting down the work a lot. Regmust permits fast rejection 75 * of lines that cannot possibly match. The regmust tests are costly enough 76 * that regcomp() supplies a regmust only if the r.e. contains something 77 * potentially expensive (at present, the only such thing detected is * or + 78 * at the start of the r.e., which can involve a lot of backup). Regmlen is 79 * supplied because the test in regexec() needs it and regcomp() is computing 80 * it anyway. 81 */ 82 83 /* 84 * Structure for regexp "program". This is essentially a linear encoding 85 * of a nondeterministic finite-state machine (aka syntax charts or 86 * "railroad normal form" in parsing technology). Each node is an opcode 87 * plus a "nxt" pointer, possibly plus an operand. "Nxt" pointers of 88 * all nodes except BRANCH implement concatenation; a "nxt" pointer with 89 * a BRANCH on both ends of it is connecting two alternatives. (Here we 90 * have one of the subtle syntax dependencies: an individual BRANCH (as 91 * opposed to a collection of them) is never concatenated with anything 92 * because of operator precedence.) The operand of some types of node is 93 * a literal string; for others, it is a node leading into a sub-FSM. In 94 * particular, the operand of a BRANCH node is the first node of the branch. 95 * (NB this is *not* a tree structure: the tail of the branch connects 96 * to the thing following the set of BRANCHes.) The opcodes are: 97 */ 98 99 /* definition number opnd? meaning */ 100 #define END 0 /* no End of program. */ 101 #define BOL 1 /* no Match "" at beginning of line. */ 102 #define EOL 2 /* no Match "" at end of line. */ 103 #define ANY 3 /* no Match any one character. */ 104 #define ANYOF 4 /* str Match any character in this string. */ 105 #define ANYBUT 5 /* str Match any character not in this 106 * string. */ 107 #define BRANCH 6 /* node Match this alternative, or the 108 * nxt... */ 109 #define BACK 7 /* no Match "", "nxt" ptr points backward. */ 110 #define EXACTLY 8 /* str Match this string. */ 111 #define NOTHING 9 /* no Match empty string. */ 112 #define STAR 10 /* node Match this (simple) thing 0 or more 113 * times. */ 114 #define OPEN 20 /* no Mark this point in input as start of 115 * #n. */ 116 /* OPEN+1 is number 1, etc. */ 117 #define CLOSE 30 /* no Analogous to OPEN. */ 118 119 /* 120 * Opcode notes: 121 * 122 * BRANCH The set of branches constituting a single choice are hooked 123 * together with their "nxt" pointers, since precedence prevents 124 * anything being concatenated to any individual branch. The 125 * "nxt" pointer of the last BRANCH in a choice points to the 126 * thing following the whole choice. This is also where the 127 * final "nxt" pointer of each individual branch points; each 128 * branch starts with the operand node of a BRANCH node. 129 * 130 * BACK Normal "nxt" pointers all implicitly point forward; BACK 131 * exists to make loop structures possible. 132 * 133 * STAR complex '*', are implemented as circular BRANCH structures 134 * using BACK. Simple cases (one character per match) are 135 * implemented with STAR for speed and to minimize recursive 136 * plunges. 137 * 138 * OPEN,CLOSE ...are numbered at compile time. 139 */ 140 141 /* 142 * A node is one char of opcode followed by two chars of "nxt" pointer. 143 * "Nxt" pointers are stored as two 8-bit pieces, high order first. The 144 * value is a positive offset from the opcode of the node containing it. 145 * An operand, if any, simply follows the node. (Note that much of the 146 * code generation knows about this implicit relationship.) 147 * 148 * Using two bytes for the "nxt" pointer is vast overkill for most things, 149 * but allows patterns to get big without disasters. 150 */ 151 #define OP(p) (*(p)) 152 #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377)) 153 #define OPERAND(p) ((p) + 3) 154 155 /* 156 * Utility definitions. 157 */ 158 159 #define FAIL(m) { regerror(m); return(NULL); } 160 #define ISMULT(c) ((c) == '*') 161 #define META "^$.[()|*\\" 162 #ifndef CHARBITS 163 #define UCHARAT(p) ((int)*(unsigned char *)(p)) 164 #else 165 #define UCHARAT(p) ((int)*(p)&CHARBITS) 166 #endif 167 168 /* 169 * Flags to be passed up and down. 170 */ 171 #define HASWIDTH 01 /* Known never to match null string. */ 172 #define SIMPLE 02 /* Simple enough to be STAR operand. */ 173 #define SPSTART 04 /* Starts with * */ 174 #define WORST 0 /* Worst case. */ 175 176 /* 177 * Global work variables for regcomp(). 178 */ 179 static char *regparse; /* Input-scan pointer. */ 180 static int regnpar; /* () count. */ 181 static char regdummy; 182 static char *regcode; /* Code-emit pointer; ®dummy = don't. */ 183 static long regsize; /* Code size. */ 184 185 /* 186 * Forward declarations for regcomp()'s friends. 187 */ 188 #ifndef STATIC 189 #define STATIC static 190 #endif 191 STATIC char *reg(); 192 STATIC char *regbranch(); 193 STATIC char *regpiece(); 194 STATIC char *regatom(); 195 STATIC char *regnode(); 196 STATIC char *regnext(); 197 STATIC void regc(); 198 STATIC void reginsert(); 199 STATIC void regtail(); 200 STATIC void regoptail(); 201 #ifdef STRCSPN 202 STATIC int strcspn(); 203 #endif 204 205 /* 206 - regcomp - compile a regular expression into internal code 207 * 208 * We can't allocate space until we know how big the compiled form will be, 209 * but we can't compile it (and thus know how big it is) until we've got a 210 * place to put the code. So we cheat: we compile it twice, once with code 211 * generation turned off and size counting turned on, and once "for real". 212 * This also means that we don't allocate space until we are sure that the 213 * thing really will compile successfully, and we never have to move the 214 * code and thus invalidate pointers into it. (Note that it has to be in 215 * one piece because free() must be able to free it all.) 216 * 217 * Beware that the optimization-preparation code in here knows about some 218 * of the structure of the compiled regexp. 219 */ 220 regexp *regcomp(exp) 221 char *exp; 222 { 223 register regexp *r; 224 register char *scan; 225 register char *longest; 226 register int len; 227 int flags; 228 extern char *malloc(); 229 230 if (exp == (char *)NULL) 231 FAIL("NULL argument"); 232 233 /* First pass: determine size, legality. */ 234 regparse = exp; 235 regnpar = 1; 236 regsize = 0L; 237 regcode = ®dummy; 238 regc(MAGIC); 239 if (reg(0, &flags) == (char *)NULL) 240 return ((regexp *)NULL); 241 242 /* Small enough for pointer-storage convention? */ 243 if (regsize >= 32767L) /* Probably could be 65535L. */ 244 FAIL("regexp too big"); 245 246 /* Allocate space. */ 247 r = (regexp *) malloc(sizeof(regexp) + (unsigned) regsize); 248 if (r == (regexp *) NULL) 249 FAIL("out of space"); 250 251 /* Second pass: emit code. */ 252 regparse = exp; 253 regnpar = 1; 254 regcode = r->program; 255 regc(MAGIC); 256 if (reg(0, &flags) == NULL) 257 return ((regexp *) NULL); 258 259 /* Dig out information for optimizations. */ 260 r->regstart = '\0'; /* Worst-case defaults. */ 261 r->reganch = 0; 262 r->regmust = NULL; 263 r->regmlen = 0; 264 scan = r->program + 1; /* First BRANCH. */ 265 if (OP(regnext(scan)) == END) { /* Only one top-level choice. */ 266 scan = OPERAND(scan); 267 268 /* Starting-point info. */ 269 if (OP(scan) == EXACTLY) 270 r->regstart = *OPERAND(scan); 271 else if (OP(scan) == BOL) 272 r->reganch++; 273 274 /* 275 * If there's something expensive in the r.e., find the longest 276 * literal string that must appear and make it the regmust. Resolve 277 * ties in favor of later strings, since the regstart check works 278 * with the beginning of the r.e. and avoiding duplication 279 * strengthens checking. Not a strong reason, but sufficient in the 280 * absence of others. 281 */ 282 if (flags & SPSTART) { 283 longest = NULL; 284 len = 0; 285 for (; scan != NULL; scan = regnext(scan)) 286 if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) { 287 longest = OPERAND(scan); 288 len = strlen(OPERAND(scan)); 289 } 290 r->regmust = longest; 291 r->regmlen = len; 292 } 293 } 294 return (r); 295 } 296 297 /* 298 - reg - regular expression, i.e. main body or parenthesized thing 299 * 300 * Caller must absorb opening parenthesis. 301 * 302 * Combining parenthesis handling with the base level of regular expression 303 * is a trifle forced, but the need to tie the tails of the branches to what 304 * follows makes it hard to avoid. 305 */ 306 static char *reg(paren, flagp) 307 int paren; /* Parenthesized? */ 308 int *flagp; 309 { 310 register char *ret; 311 register char *br; 312 register char *ender; 313 register int parno; 314 int flags; 315 316 *flagp = HASWIDTH; /* Tentatively. */ 317 318 /* Make an OPEN node, if parenthesized. */ 319 if (paren) { 320 if (regnpar >= NSUBEXP) 321 FAIL("too many ()"); 322 parno = regnpar; 323 regnpar++; 324 ret = regnode(OPEN + parno); 325 } else 326 ret = (char *)NULL; 327 328 /* Pick up the branches, linking them together. */ 329 br = regbranch(&flags); 330 if (br == (char *)NULL) 331 return ((char *)NULL); 332 if (ret != (char *)NULL) 333 regtail(ret, br); /* OPEN -> first. */ 334 else 335 ret = br; 336 if (!(flags & HASWIDTH)) 337 *flagp &= ~HASWIDTH; 338 *flagp |= flags & SPSTART; 339 while (*regparse == '|') { 340 regparse++; 341 br = regbranch(&flags); 342 if (br == (char *)NULL) 343 return ((char *)NULL); 344 regtail(ret, br); /* BRANCH -> BRANCH. */ 345 if (!(flags & HASWIDTH)) 346 *flagp &= ~HASWIDTH; 347 *flagp |= flags & SPSTART; 348 } 349 350 /* Make a closing node, and hook it on the end. */ 351 ender = regnode((paren) ? CLOSE + parno : END); 352 regtail(ret, ender); 353 354 /* Hook the tails of the branches to the closing node. */ 355 for (br = ret; br != (char *)NULL; br = regnext(br)) 356 regoptail(br, ender); 357 358 /* Check for proper termination. */ 359 if (paren && *regparse++ != ')') { 360 FAIL("unmatched ()"); 361 } else if (!paren && *regparse != '\0') { 362 if (*regparse == ')') { 363 FAIL("unmatched ()"); 364 } else 365 FAIL("junk on end");/* "Can't happen". */ 366 /* NOTREACHED */ 367 } 368 return (ret); 369 } 370 371 /* 372 - regbranch - one alternative of an | operator 373 * 374 * Implements the concatenation operator. 375 */ 376 static char *regbranch(flagp) 377 int *flagp; 378 { 379 register char *ret; 380 register char *chain; 381 register char *latest; 382 int flags; 383 384 *flagp = WORST; /* Tentatively. */ 385 386 ret = regnode(BRANCH); 387 chain = (char *)NULL; 388 while (*regparse != '\0' && *regparse != '|' && *regparse != ')') { 389 latest = regpiece(&flags); 390 if (latest == (char *)NULL) 391 return ((char *)NULL); 392 *flagp |= flags & HASWIDTH; 393 if (chain == (char *)NULL) /* First piece. */ 394 *flagp |= flags & SPSTART; 395 else 396 regtail(chain, latest); 397 chain = latest; 398 } 399 if (chain == (char *)NULL) /* Loop ran zero times. */ 400 regnode(NOTHING); 401 402 return (ret); 403 } 404 405 /* 406 - regpiece - something followed by possible [*] 407 * 408 * Note that the branching code sequence used for * is somewhat optimized: 409 * they use the same NOTHING node as both the endmarker for their branch 410 * list and the body of the last branch. It might seem that this node could 411 * be dispensed with entirely, but the endmarker role is not redundant. 412 */ 413 static char *regpiece(flagp) 414 int *flagp; 415 { 416 register char *ret; 417 register char op; 418 register char *nxt; 419 int flags; 420 421 ret = regatom(&flags); 422 if (ret == (char *)NULL) 423 return ((char *)NULL); 424 425 op = *regparse; 426 if (!ISMULT(op)) { 427 *flagp = flags; 428 return (ret); 429 } 430 if (!(flags & HASWIDTH)) 431 FAIL("* operand could be empty"); 432 *flagp = (WORST | SPSTART); 433 434 if (op == '*' && (flags & SIMPLE)) 435 reginsert(STAR, ret); 436 else if (op == '*') { 437 /* Emit x* as (x&|), where & means "self". */ 438 reginsert(BRANCH, ret); /* Either x */ 439 regoptail(ret, regnode(BACK)); /* and loop */ 440 regoptail(ret, ret); /* back */ 441 regtail(ret, regnode(BRANCH)); /* or */ 442 regtail(ret, regnode(NOTHING)); /* null. */ 443 } 444 regparse++; 445 if (ISMULT(*regparse)) 446 FAIL("nested *"); 447 448 return (ret); 449 } 450 451 /* 452 - regatom - the lowest level 453 * 454 * Optimization: gobbles an entire sequence of ordinary characters so that 455 * it can turn them into a single node, which is smaller to store and 456 * faster to run. Backslashed characters are exceptions, each becoming a 457 * separate node; the code is simpler that way and it's not worth fixing. 458 */ 459 static char *regatom(flagp) 460 int *flagp; 461 { 462 register char *ret; 463 int flags; 464 465 *flagp = WORST; /* Tentatively. */ 466 467 switch (*regparse++) { 468 case '^': 469 ret = regnode(BOL); 470 break; 471 case '$': 472 ret = regnode(EOL); 473 break; 474 case '.': 475 ret = regnode(ANY); 476 *flagp |= HASWIDTH | SIMPLE; 477 break; 478 case '[':{ 479 register int class; 480 register int classend; 481 482 if (*regparse == '^') { /* Complement of range. */ 483 ret = regnode(ANYBUT); 484 regparse++; 485 } else 486 ret = regnode(ANYOF); 487 if (*regparse == ']' || *regparse == '-') 488 regc(*regparse++); 489 while (*regparse != '\0' && *regparse != ']') { 490 if (*regparse == '-') { 491 regparse++; 492 if (*regparse == ']' || *regparse == '\0') 493 regc('-'); 494 else { 495 class = UCHARAT(regparse - 2) + 1; 496 classend = UCHARAT(regparse); 497 if (class > classend + 1) 498 FAIL("invalid [] range"); 499 for (; class <= classend; class++) 500 regc(class); 501 regparse++; 502 } 503 } else 504 regc(*regparse++); 505 } 506 regc('\0'); 507 if (*regparse != ']') 508 FAIL("unmatched []"); 509 regparse++; 510 *flagp |= HASWIDTH | SIMPLE; 511 } 512 break; 513 case '(': 514 ret = reg(1, &flags); 515 if (ret == (char *)NULL) 516 return ((char *)NULL); 517 *flagp |= flags & (HASWIDTH | SPSTART); 518 break; 519 case '\0': 520 case '|': 521 case ')': 522 FAIL("internal urp"); /* Supposed to be caught earlier. */ 523 break; 524 case '*': 525 FAIL("* follows nothing"); 526 break; 527 case '\\': 528 if (*regparse == '\0') 529 FAIL("trailing \\"); 530 ret = regnode(EXACTLY); 531 regc(*regparse++); 532 regc('\0'); 533 *flagp |= HASWIDTH | SIMPLE; 534 break; 535 default:{ 536 register int len; 537 register char ender; 538 539 regparse--; 540 len = strcspn(regparse, META); 541 if (len <= 0) 542 FAIL("internal disaster"); 543 ender = *(regparse + len); 544 if (len > 1 && ISMULT(ender)) 545 len--; /* Back off clear of * operand. */ 546 *flagp |= HASWIDTH; 547 if (len == 1) 548 *flagp |= SIMPLE; 549 ret = regnode(EXACTLY); 550 while (len > 0) { 551 regc(*regparse++); 552 len--; 553 } 554 regc('\0'); 555 } 556 break; 557 } 558 559 return (ret); 560 } 561 562 /* 563 - regnode - emit a node 564 */ 565 static char *regnode(op) 566 char op; 567 { 568 register char *ret; 569 register char *ptr; 570 571 ret = regcode; 572 if (ret == ®dummy) { 573 regsize += 3; 574 return (ret); 575 } 576 ptr = ret; 577 *ptr++ = op; 578 *ptr++ = '\0'; /* Null "nxt" pointer. */ 579 *ptr++ = '\0'; 580 regcode = ptr; 581 582 return (ret); 583 } 584 585 /* 586 - regc - emit (if appropriate) a byte of code 587 */ 588 static void regc(b) 589 char b; 590 { 591 if (regcode != ®dummy) 592 *regcode++ = b; 593 else 594 regsize++; 595 } 596 597 /* 598 - reginsert - insert an operator in front of already-emitted operand 599 * 600 * Means relocating the operand. 601 */ 602 static void reginsert(op, opnd) 603 char op; 604 char *opnd; 605 { 606 register char *src; 607 register char *dst; 608 register char *place; 609 610 if (regcode == ®dummy) { 611 regsize += 3; 612 return; 613 } 614 src = regcode; 615 regcode += 3; 616 dst = regcode; 617 while (src > opnd) 618 *--dst = *--src; 619 620 place = opnd; /* Op node, where operand used to be. */ 621 *place++ = op; 622 *place++ = '\0'; 623 *place++ = '\0'; 624 } 625 626 /* 627 - regtail - set the next-pointer at the end of a node chain 628 */ 629 static void regtail(p, val) 630 char *p; 631 char *val; 632 { 633 register char *scan; 634 register char *temp; 635 register int offset; 636 637 if (p == ®dummy) 638 return; 639 640 /* Find last node. */ 641 scan = p; 642 for (;;) { 643 temp = regnext(scan); 644 if (temp == (char *)NULL) 645 break; 646 scan = temp; 647 } 648 649 if (OP(scan) == BACK) 650 offset = scan - val; 651 else 652 offset = val - scan; 653 *(scan + 1) = (offset >> 8) & 0377; 654 *(scan + 2) = offset & 0377; 655 } 656 657 /* 658 - regoptail - regtail on operand of first argument; nop if operandless 659 */ 660 static void regoptail(p, val) 661 char *p; 662 char *val; 663 { 664 /* "Operandless" and "op != BRANCH" are synonymous in practice. */ 665 if (p == (char *)NULL || p == ®dummy || OP(p) != BRANCH) 666 return; 667 regtail(OPERAND(p), val); 668 } 669 670 /* 671 * regexec and friends 672 */ 673 674 /* 675 * Global work variables for regexec(). 676 */ 677 static char *reginput; /* String-input pointer. */ 678 static char *regbol; /* Beginning of input, for ^ check. */ 679 static char **regstartp; /* Pointer to startp array. */ 680 static char **regendp; /* Ditto for endp. */ 681 682 /* 683 * Forwards. 684 */ 685 STATIC int regtry(); 686 STATIC int regmatch(); 687 STATIC int regrepeat(); 688 689 #ifdef DEBUG 690 int regnarrate = 0; 691 void regdump(); 692 STATIC char *regprop(); 693 #endif 694 695 /* 696 - regexec - match a regexp against a string 697 */ 698 int regexec(prog, string) 699 register regexp *prog; 700 register char *string; 701 { 702 register char *s; 703 704 /* Be paranoid... */ 705 if (prog == (regexp *)NULL || string == (char *)NULL) { 706 regerror("NULL parameter"); 707 return (0); 708 } 709 /* Check validity of program. */ 710 if (UCHARAT(prog->program) != MAGIC) { 711 regerror("corrupted program"); 712 return (0); 713 } 714 /* If there is a "must appear" string, look for it. */ 715 if (prog->regmust != (char *)NULL) { 716 s = string; 717 while ((s = strchr(s, prog->regmust[0])) != (char *)NULL) { 718 if (strncmp(s, prog->regmust, prog->regmlen) == 0) 719 break; /* Found it. */ 720 s++; 721 } 722 if (s == (char *)NULL) /* Not present. */ 723 return (0); 724 } 725 /* Mark beginning of line for ^ . */ 726 regbol = string; 727 728 /* Simplest case: anchored match need be tried only once. */ 729 if (prog->reganch) 730 return (regtry(prog, string)); 731 732 /* Messy cases: unanchored match. */ 733 s = string; 734 if (prog->regstart != '\0') 735 /* We know what char it must start with. */ 736 while ((s = strchr(s, prog->regstart)) != (char *)NULL) { 737 if (regtry(prog, s)) 738 return (1); 739 s++; 740 } 741 else 742 /* We don't -- general case. */ 743 do { 744 if (regtry(prog, s)) 745 return (1); 746 } while (*s++ != '\0'); 747 748 /* Failure. */ 749 return (0); 750 } 751 752 /* 753 - regtry - try match at specific point 754 */ 755 #ifdef __STDC__ 756 757 static int regtry(regexp *prog, char *string) 758 759 #else 760 761 static int regtry(prog, string) 762 regexp *prog; 763 char *string; 764 765 #endif 766 { 767 register int i; 768 register char **sp; 769 register char **ep; 770 771 reginput = string; 772 regstartp = prog->startp; 773 regendp = prog->endp; 774 775 sp = prog->startp; 776 ep = prog->endp; 777 for (i = NSUBEXP; i > 0; i--) { 778 *sp++ = (char *)NULL; 779 *ep++ = (char *)NULL; 780 } 781 if (regmatch(prog->program + 1)) { 782 prog->startp[0] = string; 783 prog->endp[0] = reginput; 784 return (1); 785 } else 786 return (0); 787 } 788 789 /* 790 - regmatch - main matching routine 791 * 792 * Conceptually the strategy is simple: check to see whether the current 793 * node matches, call self recursively to see whether the rest matches, 794 * and then act accordingly. In practice we make some effort to avoid 795 * recursion, in particular by going through "ordinary" nodes (that don't 796 * need to know whether the rest of the match failed) by a loop instead of 797 * by recursion. 798 */ 799 #ifdef __STDC__ 800 801 static int regmatch(char *prog) 802 803 #else 804 805 static int regmatch(prog) 806 char *prog; 807 808 #endif 809 { 810 register char *scan; /* Current node. */ 811 char *nxt; /* nxt node. */ 812 813 scan = prog; 814 #ifdef DEBUG 815 if (scan != (char *)NULL && regnarrate) 816 fprintf(stderr, "%s(\n", regprop(scan)); 817 #endif 818 while (scan != (char *)NULL) { 819 #ifdef DEBUG 820 if (regnarrate) 821 fprintf(stderr, "%s...\n", regprop(scan)); 822 #endif 823 nxt = regnext(scan); 824 825 switch (OP(scan)) { 826 case BOL: 827 if (reginput != regbol) 828 return (0); 829 break; 830 case EOL: 831 if (*reginput != '\0') 832 return (0); 833 break; 834 case ANY: 835 if (*reginput == '\0') 836 return (0); 837 reginput++; 838 break; 839 case EXACTLY:{ 840 register int len; 841 register char *opnd; 842 843 opnd = OPERAND(scan); 844 /* Inline the first character, for speed. */ 845 if (*opnd != *reginput) 846 return (0); 847 len = strlen(opnd); 848 if (len > 1 && strncmp(opnd, reginput, len) != 0) 849 return (0); 850 reginput += len; 851 } 852 break; 853 case ANYOF: 854 if (*reginput == '\0' || 855 strchr(OPERAND(scan), *reginput) == (char *)NULL) 856 return (0); 857 reginput++; 858 break; 859 case ANYBUT: 860 if (*reginput == '\0' || 861 strchr(OPERAND(scan), *reginput) != (char *)NULL) 862 return (0); 863 reginput++; 864 break; 865 case NOTHING: 866 break; 867 case BACK: 868 break; 869 case OPEN + 1: 870 case OPEN + 2: 871 case OPEN + 3: 872 case OPEN + 4: 873 case OPEN + 5: 874 case OPEN + 6: 875 case OPEN + 7: 876 case OPEN + 8: 877 case OPEN + 9:{ 878 register int no; 879 register char *save; 880 881 no = OP(scan) - OPEN; 882 save = reginput; 883 884 if (regmatch(nxt)) { 885 /* 886 * Don't set startp if some later invocation of the same 887 * parentheses already has. 888 */ 889 if (regstartp[no] == (char *)NULL) 890 regstartp[no] = save; 891 return (1); 892 } else 893 return (0); 894 } 895 break; 896 case CLOSE + 1: 897 case CLOSE + 2: 898 case CLOSE + 3: 899 case CLOSE + 4: 900 case CLOSE + 5: 901 case CLOSE + 6: 902 case CLOSE + 7: 903 case CLOSE + 8: 904 case CLOSE + 9:{ 905 register int no; 906 register char *save; 907 908 no = OP(scan) - CLOSE; 909 save = reginput; 910 911 if (regmatch(nxt)) { 912 /* 913 * Don't set endp if some later invocation of the same 914 * parentheses already has. 915 */ 916 if (regendp[no] == (char *)NULL) 917 regendp[no] = save; 918 return (1); 919 } else 920 return (0); 921 } 922 break; 923 case BRANCH:{ 924 register char *save; 925 926 if (OP(nxt) != BRANCH) /* No choice. */ 927 nxt = OPERAND(scan); /* Avoid recursion. */ 928 else { 929 do { 930 save = reginput; 931 if (regmatch(OPERAND(scan))) 932 return (1); 933 reginput = save; 934 scan = regnext(scan); 935 } while (scan != (char *)NULL && OP(scan) == BRANCH); 936 return (0); 937 /* NOTREACHED */ 938 } 939 } 940 break; 941 case STAR:{ 942 register char nextch; 943 register int no; 944 register char *save; 945 register int minimum; 946 947 /* 948 * Lookahead to avoid useless match attempts when we know 949 * what character comes next. 950 */ 951 nextch = '\0'; 952 if (OP(nxt) == EXACTLY) 953 nextch = *OPERAND(nxt); 954 minimum = (OP(scan) == STAR) ? 0 : 1; 955 save = reginput; 956 no = regrepeat(OPERAND(scan)); 957 while (no >= minimum) { 958 /* If it could work, try it. */ 959 if (nextch == '\0' || *reginput == nextch) 960 if (regmatch(nxt)) 961 return (1); 962 /* Couldn't or didn't -- back up. */ 963 no--; 964 reginput = save + no; 965 } 966 return (0); 967 } 968 break; 969 case END: 970 return (1); /* Success! */ 971 break; 972 default: 973 regerror("memory corruption"); 974 return (0); 975 break; 976 } 977 978 scan = nxt; 979 } 980 981 /* 982 * We get here only if there's trouble -- normally "case END" is the 983 * terminating point. 984 */ 985 regerror("corrupted pointers"); 986 return (0); 987 } 988 989 /* 990 - regrepeat - repeatedly match something simple, report how many 991 */ 992 #ifdef __STDC__ 993 994 static int regrepeat(char *p) 995 996 #else 997 998 static int regrepeat(p) 999 char *p; 1000 1001 #endif 1002 { 1003 register int count = 0; 1004 register char *scan; 1005 register char *opnd; 1006 1007 scan = reginput; 1008 opnd = OPERAND(p); 1009 switch (OP(p)) { 1010 case ANY: 1011 count = strlen(scan); 1012 scan += count; 1013 break; 1014 case EXACTLY: 1015 while (*opnd == *scan) { 1016 count++; 1017 scan++; 1018 } 1019 break; 1020 case ANYOF: 1021 while (*scan != '\0' && strchr(opnd, *scan) != (char *)NULL) { 1022 count++; 1023 scan++; 1024 } 1025 break; 1026 case ANYBUT: 1027 while (*scan != '\0' && strchr(opnd, *scan) == (char *)NULL) { 1028 count++; 1029 scan++; 1030 } 1031 break; 1032 default: /* Oh dear. Called inappropriately. */ 1033 regerror("internal foulup"); 1034 count = 0; /* Best compromise. */ 1035 break; 1036 } 1037 reginput = scan; 1038 1039 return (count); 1040 } 1041 1042 1043 /* 1044 - regnext - dig the "nxt" pointer out of a node 1045 */ 1046 #ifdef __STDC__ 1047 1048 static char *regnext(register char *p) 1049 1050 #else 1051 1052 static char *regnext(p) 1053 register char *p; 1054 1055 #endif 1056 { 1057 register int offset; 1058 1059 if (p == ®dummy) 1060 return ((char *)NULL); 1061 1062 offset = NEXT(p); 1063 if (offset == 0) 1064 return ((char *)NULL); 1065 1066 if (OP(p) == BACK) 1067 return (p - offset); 1068 else 1069 return (p + offset); 1070 } 1071 1072 #ifdef DEBUG 1073 1074 STATIC char *regprop(); 1075 1076 /* 1077 - regdump - dump a regexp onto stdout in vaguely comprehensible form 1078 */ 1079 #ifdef __STDC__ 1080 1081 void regdump(regexp *r) 1082 1083 #else 1084 1085 void regdump(r) 1086 regexp *r; 1087 1088 #endif 1089 { 1090 register char *s; 1091 register char op = EXACTLY; /* Arbitrary non-END op. */ 1092 register char *nxt; 1093 extern char *strchr(); 1094 1095 1096 s = r->program + 1; 1097 while (op != END) { /* While that wasn't END last time... */ 1098 op = OP(s); 1099 printf("%2d%s", s - r->program, regprop(s)); /* Where, what. */ 1100 nxt = regnext(s); 1101 if (nxt == (char *)NULL) /* nxt ptr. */ 1102 printf("(0)"); 1103 else 1104 printf("(%d)", (s - r->program) + (nxt - s)); 1105 s += 3; 1106 if (op == ANYOF || op == ANYBUT || op == EXACTLY) { 1107 /* Literal string, where present. */ 1108 while (*s != '\0') { 1109 putchar(*s); 1110 s++; 1111 } 1112 s++; 1113 } 1114 putchar('\n'); 1115 } 1116 1117 /* Header fields of interest. */ 1118 if (r->regstart != '\0') 1119 printf("start `%c' ", r->regstart); 1120 if (r->reganch) 1121 printf("anchored "); 1122 if (r->regmust != (char *)NULL) 1123 printf("must have \"%s\"", r->regmust); 1124 printf("\n"); 1125 } 1126 1127 /* 1128 - regprop - printable representation of opcode 1129 */ 1130 #ifdef __STDC__ 1131 1132 static char *regprop(char *op) 1133 1134 #else 1135 1136 static char *regprop(op) 1137 char *op; 1138 1139 #endif 1140 { 1141 register char *p; 1142 static char buf[50]; 1143 1144 strcpy(buf, ":"); 1145 1146 switch (OP(op)) { 1147 case BOL: 1148 p = "BOL"; 1149 break; 1150 case EOL: 1151 p = "EOL"; 1152 break; 1153 case ANY: 1154 p = "ANY"; 1155 break; 1156 case ANYOF: 1157 p = "ANYOF"; 1158 break; 1159 case ANYBUT: 1160 p = "ANYBUT"; 1161 break; 1162 case BRANCH: 1163 p = "BRANCH"; 1164 break; 1165 case EXACTLY: 1166 p = "EXACTLY"; 1167 break; 1168 case NOTHING: 1169 p = "NOTHING"; 1170 break; 1171 case BACK: 1172 p = "BACK"; 1173 break; 1174 case END: 1175 p = "END"; 1176 break; 1177 case OPEN + 1: 1178 case OPEN + 2: 1179 case OPEN + 3: 1180 case OPEN + 4: 1181 case OPEN + 5: 1182 case OPEN + 6: 1183 case OPEN + 7: 1184 case OPEN + 8: 1185 case OPEN + 9: 1186 sprintf(buf + strlen(buf), "OPEN%d", OP(op) - OPEN); 1187 p = (char *)NULL; 1188 break; 1189 case CLOSE + 1: 1190 case CLOSE + 2: 1191 case CLOSE + 3: 1192 case CLOSE + 4: 1193 case CLOSE + 5: 1194 case CLOSE + 6: 1195 case CLOSE + 7: 1196 case CLOSE + 8: 1197 case CLOSE + 9: 1198 sprintf(buf + strlen(buf), "CLOSE%d", OP(op) - CLOSE); 1199 p = (char *)NULL; 1200 break; 1201 case STAR: 1202 p = "STAR"; 1203 break; 1204 default: 1205 regerror("corrupted opcode"); 1206 break; 1207 } 1208 if (p != (char *)NULL) 1209 strcat(buf, p); 1210 return (buf); 1211 } 1212 #endif 1213 1214 /* 1215 * The following is provided for those people who do not have strcspn() in 1216 * their C libraries. They should get off their butts and do something 1217 * about it; at least one public-domain implementation of those (highly 1218 * useful) string routines has been published on Usenet. 1219 */ 1220 #ifdef STRCSPN 1221 /* 1222 * strcspn - find length of initial segment of s1 consisting entirely 1223 * of characters not from s2 1224 */ 1225 1226 #ifdef __STDC__ 1227 1228 static int strcspn(char *s1, char *s2) 1229 1230 #else 1231 1232 static int strcspn(s1, s2) 1233 char *s1; 1234 char *s2; 1235 1236 #endif 1237 { 1238 register char *scan1; 1239 register char *scan2; 1240 register int count; 1241 1242 count = 0; 1243 for (scan1 = s1; *scan1 != '\0'; scan1++) { 1244 for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */ 1245 if (*scan1 == *scan2++) 1246 return (count); 1247 count++; 1248 } 1249 return (count); 1250 } 1251 #endif 1252 1253 1254 /* 1255 - regsub - perform substitutions after a regexp match 1256 */ 1257 #ifdef __STDC__ 1258 1259 void regsub(regexp *prog, char *source, char *dest) 1260 1261 #else 1262 1263 void regsub(prog, source, dest) 1264 regexp *prog; 1265 char *source; 1266 char *dest; 1267 1268 #endif 1269 { 1270 register char *src; 1271 register char *dst; 1272 register char c; 1273 register int no; 1274 register int len; 1275 extern char *strncpy(); 1276 1277 if (prog == (regexp *)NULL || 1278 source == (char *)NULL || dest == (char *)NULL) { 1279 regerror("NULL parm to regsub"); 1280 return; 1281 } 1282 if (UCHARAT(prog->program) != MAGIC) { 1283 regerror("damaged regexp fed to regsub"); 1284 return; 1285 } 1286 src = source; 1287 dst = dest; 1288 while ((c = *src++) != '\0') { 1289 if (c == '&') 1290 no = 0; 1291 else if (c == '\\' && '0' <= *src && *src <= '9') 1292 no = *src++ - '0'; 1293 else 1294 no = -1; 1295 1296 if (no < 0) { /* Ordinary character. */ 1297 if (c == '\\' && (*src == '\\' || *src == '&')) 1298 c = *src++; 1299 *dst++ = c; 1300 } else if (prog->startp[no] != (char *)NULL && 1301 prog->endp[no] != (char *)NULL) { 1302 len = prog->endp[no] - prog->startp[no]; 1303 strncpy(dst, prog->startp[no], len); 1304 dst += len; 1305 if (len != 0 && *(dst - 1) == '\0') { /* strncpy hit NUL. */ 1306 regerror("damaged match string"); 1307 return; 1308 } 1309 } 1310 } 1311 *dst++ = '\0'; 1312 } 1313 1314 1315 #ifdef __STDC__ 1316 1317 void regerror(char *s) 1318 1319 #else 1320 1321 void regerror(s) 1322 char *s; 1323 1324 #endif 1325 { 1326 fprintf(stderr, "regexp(3): %s", s); 1327 exit(1); 1328 } 1329