1 /* $NetBSD: regexp.c,v 1.3 2013/09/04 19:44:21 tron Exp $ */ 2 3 /* 4 * regcomp and regexec -- regsub and regerror are elsewhere 5 * 6 * Copyright (c) 1986 by University of Toronto. 7 * Written by Henry Spencer. Not derived from licensed software. 8 * 9 * Permission is granted to anyone to use this software for any 10 * purpose on any computer system, and to redistribute it freely, 11 * subject to the following restrictions: 12 * 13 * 1. The author is not responsible for the consequences of use of 14 * this software, no matter how awful, even if they arise 15 * from defects in it. 16 * 17 * 2. The origin of this software must not be misrepresented, either 18 * by explicit claim or by omission. 19 * 20 * 3. Altered versions must be plainly marked as such, and must not 21 * be misrepresented as being the original software. 22 * 23 * Beware that some of this code is subtly aware of the way operator 24 * precedence is structured in regular expressions. Serious changes in 25 * regular-expression syntax might require a total rethink. 26 * 27 * *** NOTE: this code has been altered slightly for use in Tcl. *** 28 * Slightly modified by David MacKenzie to undo most of the changes for TCL. 29 * Added regexec2 with notbol parameter. -- 4/19/99 Mark Nudelman 30 */ 31 32 #include "less.h" 33 #if HAVE_STDIO_H 34 #include <stdio.h> 35 #endif 36 #if HAVE_STDLIB_H 37 #include <stdlib.h> 38 #endif 39 #if HAVE_STRING_H 40 #include <string.h> 41 #endif 42 #include "regexp.h" 43 44 /* 45 * The "internal use only" fields in regexp.h are present to pass info from 46 * compile to execute that permits the execute phase to run lots faster on 47 * simple cases. They are: 48 * 49 * regstart char that must begin a match; '\0' if none obvious 50 * reganch is the match anchored (at beginning-of-line only)? 51 * regmust string (pointer into program) that match must include, or NULL 52 * regmlen length of regmust string 53 * 54 * Regstart and reganch permit very fast decisions on suitable starting points 55 * for a match, cutting down the work a lot. Regmust permits fast rejection 56 * of lines that cannot possibly match. The regmust tests are costly enough 57 * that regcomp() supplies a regmust only if the r.e. contains something 58 * potentially expensive (at present, the only such thing detected is * or + 59 * at the start of the r.e., which can involve a lot of backup). Regmlen is 60 * supplied because the test in regexec() needs it and regcomp() is 61 * computing it anyway. 62 */ 63 64 /* 65 * Structure for regexp "program". This is essentially a linear encoding 66 * of a nondeterministic finite-state machine (aka syntax charts or 67 * "railroad normal form" in parsing technology). Each node is an opcode 68 * plus a "next" pointer, possibly plus an operand. "Next" pointers of 69 * all nodes except BRANCH implement concatenation; a "next" pointer with 70 * a BRANCH on both ends of it is connecting two alternatives. (Here we 71 * have one of the subtle syntax dependencies: an individual BRANCH (as 72 * opposed to a collection of them) is never concatenated with anything 73 * because of operator precedence.) The operand of some types of node is 74 * a literal string; for others, it is a node leading into a sub-FSM. In 75 * particular, the operand of a BRANCH node is the first node of the branch. 76 * (NB this is *not* a tree structure: the tail of the branch connects 77 * to the thing following the set of BRANCHes.) The opcodes are: 78 */ 79 80 /* definition number opnd? meaning */ 81 #undef EOL 82 #define END 0 /* no End of program. */ 83 #define BOL 1 /* no Match "" at beginning of line. */ 84 #define EOL 2 /* no Match "" at end of line. */ 85 #define ANY 3 /* no Match any one character. */ 86 #define ANYOF 4 /* str Match any character in this string. */ 87 #define ANYBUT 5 /* str Match any character not in this string. */ 88 #define BRANCH 6 /* node Match this alternative, or the next... */ 89 #define BACK 7 /* no Match "", "next" ptr points backward. */ 90 #define EXACTLY 8 /* str Match this string. */ 91 #define NOTHING 9 /* no Match empty string. */ 92 #define STAR 10 /* node Match this (simple) thing 0 or more times. */ 93 #define PLUS 11 /* node Match this (simple) thing 1 or more times. */ 94 #define OPEN 20 /* no Mark this point in input as start of #n. */ 95 /* OPEN+1 is number 1, etc. */ 96 #define CLOSE 30 /* no Analogous to OPEN. */ 97 98 /* 99 * Opcode notes: 100 * 101 * BRANCH The set of branches constituting a single choice are hooked 102 * together with their "next" pointers, since precedence prevents 103 * anything being concatenated to any individual branch. The 104 * "next" pointer of the last BRANCH in a choice points to the 105 * thing following the whole choice. This is also where the 106 * final "next" pointer of each individual branch points; each 107 * branch starts with the operand node of a BRANCH node. 108 * 109 * BACK Normal "next" pointers all implicitly point forward; BACK 110 * exists to make loop structures possible. 111 * 112 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular 113 * BRANCH structures using BACK. Simple cases (one character 114 * per match) are implemented with STAR and PLUS for speed 115 * and to minimize recursive plunges. 116 * 117 * OPEN,CLOSE ...are numbered at compile time. 118 */ 119 120 /* 121 * A node is one char of opcode followed by two chars of "next" pointer. 122 * "Next" pointers are stored as two 8-bit pieces, high order first. The 123 * value is a positive offset from the opcode of the node containing it. 124 * An operand, if any, simply follows the node. (Note that much of the 125 * code generation knows about this implicit relationship.) 126 * 127 * Using two bytes for the "next" pointer is vast overkill for most things, 128 * but allows patterns to get big without disasters. 129 */ 130 #define OP(p) (*(p)) 131 #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377)) 132 #define OPERAND(p) ((p) + 3) 133 134 /* 135 * See regmagic.h for one further detail of program structure. 136 */ 137 138 139 /* 140 * Utility definitions. 141 */ 142 #ifndef CHARBITS 143 #define UCHARAT(p) ((int)*(unsigned char *)(p)) 144 #else 145 #define UCHARAT(p) ((int)*(p)&CHARBITS) 146 #endif 147 148 #define FAIL(m) { regerror(m); return(NULL); } 149 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?') 150 #define META "^$.[()|?+*\\" 151 152 /* 153 * Flags to be passed up and down. 154 */ 155 #define HASWIDTH 01 /* Known never to match null string. */ 156 #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */ 157 #define SPSTART 04 /* Starts with * or +. */ 158 #define WORST 0 /* Worst case. */ 159 160 /* 161 * Global work variables for regcomp(). 162 */ 163 static char *regparse; /* Input-scan pointer. */ 164 static int regnpar; /* () count. */ 165 static char regdummy; 166 static char *regcode; /* Code-emit pointer; ®dummy = don't. */ 167 static long regsize; /* Code size. */ 168 169 /* 170 * The first byte of the regexp internal "program" is actually this magic 171 * number; the start node begins in the second byte. 172 */ 173 #define MAGIC 0234 174 175 176 /* 177 * Forward declarations for regcomp()'s friends. 178 */ 179 #ifndef STATIC 180 #define STATIC static 181 #endif 182 STATIC char *reg(); 183 STATIC char *regbranch(); 184 STATIC char *regpiece(); 185 STATIC char *regatom(); 186 STATIC char *regnode(); 187 STATIC char *regnext(); 188 STATIC void regc(); 189 STATIC void reginsert(); 190 STATIC void regtail(); 191 STATIC void regoptail(); 192 #ifdef STRCSPN 193 STATIC int strcspn(); 194 #endif 195 196 /* 197 - regcomp - compile a regular expression into internal code 198 * 199 * We can't allocate space until we know how big the compiled form will be, 200 * but we can't compile it (and thus know how big it is) until we've got a 201 * place to put the code. So we cheat: we compile it twice, once with code 202 * generation turned off and size counting turned on, and once "for real". 203 * This also means that we don't allocate space until we are sure that the 204 * thing really will compile successfully, and we never have to move the 205 * code and thus invalidate pointers into it. (Note that it has to be in 206 * one piece because free() must be able to free it all.) 207 * 208 * Beware that the optimization-preparation code in here knows about some 209 * of the structure of the compiled regexp. 210 */ 211 regexp * 212 regcomp(exp) 213 char *exp; 214 { 215 register regexp *r; 216 register char *scan; 217 register char *longest; 218 register int len; 219 int flags; 220 221 if (exp == NULL) 222 FAIL("NULL argument"); 223 224 /* First pass: determine size, legality. */ 225 regparse = exp; 226 regnpar = 1; 227 regsize = 0L; 228 regcode = ®dummy; 229 regc(MAGIC); 230 if (reg(0, &flags) == NULL) 231 return(NULL); 232 233 /* Small enough for pointer-storage convention? */ 234 if (regsize >= 32767L) /* Probably could be 65535L. */ 235 FAIL("regexp too big"); 236 237 /* Allocate space. */ 238 r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize); 239 if (r == NULL) 240 FAIL("out of space"); 241 242 /* Second pass: emit code. */ 243 regparse = exp; 244 regnpar = 1; 245 regcode = r->program; 246 regc(MAGIC); 247 if (reg(0, &flags) == NULL) 248 return(NULL); 249 250 /* Dig out information for optimizations. */ 251 r->regstart = '\0'; /* Worst-case defaults. */ 252 r->reganch = 0; 253 r->regmust = NULL; 254 r->regmlen = 0; 255 scan = r->program+1; /* First BRANCH. */ 256 if (OP(regnext(scan)) == END) { /* Only one top-level choice. */ 257 scan = OPERAND(scan); 258 259 /* Starting-point info. */ 260 if (OP(scan) == EXACTLY) 261 r->regstart = *OPERAND(scan); 262 else if (OP(scan) == BOL) 263 r->reganch++; 264 265 /* 266 * If there's something expensive in the r.e., find the 267 * longest literal string that must appear and make it the 268 * regmust. Resolve ties in favor of later strings, since 269 * the regstart check works with the beginning of the r.e. 270 * and avoiding duplication strengthens checking. Not a 271 * strong reason, but sufficient in the absence of others. 272 */ 273 if (flags&SPSTART) { 274 longest = NULL; 275 len = 0; 276 for (; scan != NULL; scan = regnext(scan)) 277 if (OP(scan) == EXACTLY && ((int) strlen(OPERAND(scan))) >= len) { 278 longest = OPERAND(scan); 279 len = strlen(OPERAND(scan)); 280 } 281 r->regmust = longest; 282 r->regmlen = len; 283 } 284 } 285 286 return(r); 287 } 288 289 /* 290 - reg - regular expression, i.e. main body or parenthesized thing 291 * 292 * Caller must absorb opening parenthesis. 293 * 294 * Combining parenthesis handling with the base level of regular expression 295 * is a trifle forced, but the need to tie the tails of the branches to what 296 * follows makes it hard to avoid. 297 */ 298 static char * 299 reg(paren, flagp) 300 int paren; /* Parenthesized? */ 301 int *flagp; 302 { 303 register char *ret; 304 register char *br; 305 register char *ender; 306 register int parno = 0; 307 int flags; 308 309 *flagp = HASWIDTH; /* Tentatively. */ 310 311 /* Make an OPEN node, if parenthesized. */ 312 if (paren) { 313 if (regnpar >= NSUBEXP) 314 FAIL("too many ()"); 315 parno = regnpar; 316 regnpar++; 317 ret = regnode(OPEN+parno); 318 } else 319 ret = NULL; 320 321 /* Pick up the branches, linking them together. */ 322 br = regbranch(&flags); 323 if (br == NULL) 324 return(NULL); 325 if (ret != NULL) 326 regtail(ret, br); /* OPEN -> first. */ 327 else 328 ret = br; 329 if (!(flags&HASWIDTH)) 330 *flagp &= ~HASWIDTH; 331 *flagp |= flags&SPSTART; 332 while (*regparse == '|') { 333 regparse++; 334 br = regbranch(&flags); 335 if (br == NULL) 336 return(NULL); 337 regtail(ret, br); /* BRANCH -> BRANCH. */ 338 if (!(flags&HASWIDTH)) 339 *flagp &= ~HASWIDTH; 340 *flagp |= flags&SPSTART; 341 } 342 343 /* Make a closing node, and hook it on the end. */ 344 ender = regnode((paren) ? CLOSE+parno : END); 345 regtail(ret, ender); 346 347 /* Hook the tails of the branches to the closing node. */ 348 for (br = ret; br != NULL; br = regnext(br)) 349 regoptail(br, ender); 350 351 /* Check for proper termination. */ 352 if (paren && *regparse++ != ')') { 353 FAIL("unmatched ()"); 354 } else if (!paren && *regparse != '\0') { 355 if (*regparse == ')') { 356 FAIL("unmatched ()"); 357 } else 358 FAIL("junk on end"); /* "Can't happen". */ 359 /* NOTREACHED */ 360 } 361 362 return(ret); 363 } 364 365 /* 366 - regbranch - one alternative of an | operator 367 * 368 * Implements the concatenation operator. 369 */ 370 static char * 371 regbranch(flagp) 372 int *flagp; 373 { 374 register char *ret; 375 register char *chain; 376 register char *latest; 377 int flags; 378 379 *flagp = WORST; /* Tentatively. */ 380 381 ret = regnode(BRANCH); 382 chain = NULL; 383 while (*regparse != '\0' && *regparse != '|' && *regparse != ')') { 384 latest = regpiece(&flags); 385 if (latest == NULL) 386 return(NULL); 387 *flagp |= flags&HASWIDTH; 388 if (chain == NULL) /* First piece. */ 389 *flagp |= flags&SPSTART; 390 else 391 regtail(chain, latest); 392 chain = latest; 393 } 394 if (chain == NULL) /* Loop ran zero times. */ 395 (void) regnode(NOTHING); 396 397 return(ret); 398 } 399 400 /* 401 - regpiece - something followed by possible [*+?] 402 * 403 * Note that the branching code sequences used for ? and the general cases 404 * of * and + are somewhat optimized: they use the same NOTHING node as 405 * both the endmarker for their branch list and the body of the last branch. 406 * It might seem that this node could be dispensed with entirely, but the 407 * endmarker role is not redundant. 408 */ 409 static char * 410 regpiece(flagp) 411 int *flagp; 412 { 413 register char *ret; 414 register char op; 415 register char *next; 416 int flags; 417 418 ret = regatom(&flags); 419 if (ret == NULL) 420 return(NULL); 421 422 op = *regparse; 423 if (!ISMULT(op)) { 424 *flagp = flags; 425 return(ret); 426 } 427 428 if (!(flags&HASWIDTH) && op != '?') 429 FAIL("*+ operand could be empty"); 430 *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH); 431 432 if (op == '*' && (flags&SIMPLE)) 433 reginsert(STAR, ret); 434 else if (op == '*') { 435 /* Emit x* as (x&|), where & means "self". */ 436 reginsert(BRANCH, ret); /* Either x */ 437 regoptail(ret, regnode(BACK)); /* and loop */ 438 regoptail(ret, ret); /* back */ 439 regtail(ret, regnode(BRANCH)); /* or */ 440 regtail(ret, regnode(NOTHING)); /* null. */ 441 } else if (op == '+' && (flags&SIMPLE)) 442 reginsert(PLUS, ret); 443 else if (op == '+') { 444 /* Emit x+ as x(&|), where & means "self". */ 445 next = regnode(BRANCH); /* Either */ 446 regtail(ret, next); 447 regtail(regnode(BACK), ret); /* loop back */ 448 regtail(next, regnode(BRANCH)); /* or */ 449 regtail(ret, regnode(NOTHING)); /* null. */ 450 } else if (op == '?') { 451 /* Emit x? as (x|) */ 452 reginsert(BRANCH, ret); /* Either x */ 453 regtail(ret, regnode(BRANCH)); /* or */ 454 next = regnode(NOTHING); /* null. */ 455 regtail(ret, next); 456 regoptail(ret, next); 457 } 458 regparse++; 459 if (ISMULT(*regparse)) 460 FAIL("nested *?+"); 461 462 return(ret); 463 } 464 465 /* 466 - regatom - the lowest level 467 * 468 * Optimization: gobbles an entire sequence of ordinary characters so that 469 * it can turn them into a single node, which is smaller to store and 470 * faster to run. Backslashed characters are exceptions, each becoming a 471 * separate node; the code is simpler that way and it's not worth fixing. 472 */ 473 static char * 474 regatom(flagp) 475 int *flagp; 476 { 477 register char *ret; 478 int flags; 479 480 *flagp = WORST; /* Tentatively. */ 481 482 switch (*regparse++) { 483 case '^': 484 ret = regnode(BOL); 485 break; 486 case '$': 487 ret = regnode(EOL); 488 break; 489 case '.': 490 ret = regnode(ANY); 491 *flagp |= HASWIDTH|SIMPLE; 492 break; 493 case '[': { 494 register int clss; 495 register int classend; 496 497 if (*regparse == '^') { /* Complement of range. */ 498 ret = regnode(ANYBUT); 499 regparse++; 500 } else 501 ret = regnode(ANYOF); 502 if (*regparse == ']' || *regparse == '-') 503 regc(*regparse++); 504 while (*regparse != '\0' && *regparse != ']') { 505 if (*regparse == '-') { 506 regparse++; 507 if (*regparse == ']' || *regparse == '\0') 508 regc('-'); 509 else { 510 clss = UCHARAT(regparse-2)+1; 511 classend = UCHARAT(regparse); 512 if (clss > classend+1) 513 FAIL("invalid [] range"); 514 for (; clss <= classend; clss++) 515 regc(clss); 516 regparse++; 517 } 518 } else 519 regc(*regparse++); 520 } 521 regc('\0'); 522 if (*regparse != ']') 523 FAIL("unmatched []"); 524 regparse++; 525 *flagp |= HASWIDTH|SIMPLE; 526 } 527 break; 528 case '(': 529 ret = reg(1, &flags); 530 if (ret == NULL) 531 return(NULL); 532 *flagp |= flags&(HASWIDTH|SPSTART); 533 break; 534 case '\0': 535 case '|': 536 case ')': 537 FAIL("internal urp"); /* Supposed to be caught earlier. */ 538 /* NOTREACHED */ 539 break; 540 case '?': 541 case '+': 542 case '*': 543 FAIL("?+* follows nothing"); 544 /* NOTREACHED */ 545 break; 546 case '\\': 547 if (*regparse == '\0') 548 FAIL("trailing \\"); 549 ret = regnode(EXACTLY); 550 regc(*regparse++); 551 regc('\0'); 552 *flagp |= HASWIDTH|SIMPLE; 553 break; 554 default: { 555 register int len; 556 register char ender; 557 558 regparse--; 559 len = strcspn(regparse, META); 560 if (len <= 0) 561 FAIL("internal disaster"); 562 ender = *(regparse+len); 563 if (len > 1 && ISMULT(ender)) 564 len--; /* Back off clear of ?+* operand. */ 565 *flagp |= HASWIDTH; 566 if (len == 1) 567 *flagp |= SIMPLE; 568 ret = regnode(EXACTLY); 569 while (len > 0) { 570 regc(*regparse++); 571 len--; 572 } 573 regc('\0'); 574 } 575 break; 576 } 577 578 return(ret); 579 } 580 581 /* 582 - regnode - emit a node 583 */ 584 static char * /* Location. */ 585 regnode(op) 586 char op; 587 { 588 register char *ret; 589 register char *ptr; 590 591 ret = regcode; 592 if (ret == ®dummy) { 593 regsize += 3; 594 return(ret); 595 } 596 597 ptr = ret; 598 *ptr++ = op; 599 *ptr++ = '\0'; /* Null "next" pointer. */ 600 *ptr++ = '\0'; 601 regcode = ptr; 602 603 return(ret); 604 } 605 606 /* 607 - regc - emit (if appropriate) a byte of code 608 */ 609 static void 610 regc(b) 611 char b; 612 { 613 if (regcode != ®dummy) 614 *regcode++ = b; 615 else 616 regsize++; 617 } 618 619 /* 620 - reginsert - insert an operator in front of already-emitted operand 621 * 622 * Means relocating the operand. 623 */ 624 static void 625 reginsert(op, opnd) 626 char op; 627 char *opnd; 628 { 629 register char *src; 630 register char *dst; 631 register char *place; 632 633 if (regcode == ®dummy) { 634 regsize += 3; 635 return; 636 } 637 638 src = regcode; 639 regcode += 3; 640 dst = regcode; 641 while (src > opnd) 642 *--dst = *--src; 643 644 place = opnd; /* Op node, where operand used to be. */ 645 *place++ = op; 646 *place++ = '\0'; 647 *place++ = '\0'; 648 } 649 650 /* 651 - regtail - set the next-pointer at the end of a node chain 652 */ 653 static void 654 regtail(p, val) 655 char *p; 656 char *val; 657 { 658 register char *scan; 659 register char *temp; 660 register int offset; 661 662 if (p == ®dummy) 663 return; 664 665 /* Find last node. */ 666 scan = p; 667 for (;;) { 668 temp = regnext(scan); 669 if (temp == NULL) 670 break; 671 scan = temp; 672 } 673 674 if (OP(scan) == BACK) 675 offset = scan - val; 676 else 677 offset = val - scan; 678 *(scan+1) = (offset>>8)&0377; 679 *(scan+2) = offset&0377; 680 } 681 682 /* 683 - regoptail - regtail on operand of first argument; nop if operandless 684 */ 685 static void 686 regoptail(p, val) 687 char *p; 688 char *val; 689 { 690 /* "Operandless" and "op != BRANCH" are synonymous in practice. */ 691 if (p == NULL || p == ®dummy || OP(p) != BRANCH) 692 return; 693 regtail(OPERAND(p), val); 694 } 695 696 /* 697 * regexec and friends 698 */ 699 700 /* 701 * Global work variables for regexec(). 702 */ 703 static char *reginput; /* String-input pointer. */ 704 static char *regbol; /* Beginning of input, for ^ check. */ 705 static char **regstartp; /* Pointer to startp array. */ 706 static char **regendp; /* Ditto for endp. */ 707 708 /* 709 * Forwards. 710 */ 711 STATIC int regtry(); 712 STATIC int regmatch(); 713 STATIC int regrepeat(); 714 715 #ifdef DEBUG 716 int regnarrate = 0; 717 void regdump(); 718 STATIC char *regprop(); 719 #endif 720 721 /* 722 - regexec - match a regexp against a string 723 */ 724 int 725 regexec2(prog, string, notbol) 726 register regexp *prog; 727 register char *string; 728 int notbol; 729 { 730 register char *s; 731 732 /* Be paranoid... */ 733 if (prog == NULL || string == NULL) { 734 regerror("NULL parameter"); 735 return(0); 736 } 737 738 /* Check validity of program. */ 739 if (UCHARAT(prog->program) != MAGIC) { 740 regerror("corrupted program"); 741 return(0); 742 } 743 744 /* If there is a "must appear" string, look for it. */ 745 if (prog->regmust != NULL) { 746 s = string; 747 while ((s = strchr(s, prog->regmust[0])) != NULL) { 748 if (strncmp(s, prog->regmust, prog->regmlen) == 0) 749 break; /* Found it. */ 750 s++; 751 } 752 if (s == NULL) /* Not present. */ 753 return(0); 754 } 755 756 /* Mark beginning of line for ^ . */ 757 if (notbol) 758 regbol = NULL; 759 else 760 regbol = string; 761 762 /* Simplest case: anchored match need be tried only once. */ 763 if (prog->reganch) 764 return(regtry(prog, string)); 765 766 /* Messy cases: unanchored match. */ 767 s = string; 768 if (prog->regstart != '\0') 769 /* We know what char it must start with. */ 770 while ((s = strchr(s, prog->regstart)) != NULL) { 771 if (regtry(prog, s)) 772 return(1); 773 s++; 774 } 775 else 776 /* We don't -- general case. */ 777 do { 778 if (regtry(prog, s)) 779 return(1); 780 } while (*s++ != '\0'); 781 782 /* Failure. */ 783 return(0); 784 } 785 786 int 787 regexec(prog, string) 788 register regexp *prog; 789 register char *string; 790 { 791 return regexec2(prog, string, 0); 792 } 793 794 /* 795 - regtry - try match at specific point 796 */ 797 static int /* 0 failure, 1 success */ 798 regtry(prog, string) 799 regexp *prog; 800 char *string; 801 { 802 register int i; 803 register char **sp; 804 register char **ep; 805 806 reginput = string; 807 regstartp = prog->startp; 808 regendp = prog->endp; 809 810 sp = prog->startp; 811 ep = prog->endp; 812 for (i = NSUBEXP; i > 0; i--) { 813 *sp++ = NULL; 814 *ep++ = NULL; 815 } 816 if (regmatch(prog->program + 1)) { 817 prog->startp[0] = string; 818 prog->endp[0] = reginput; 819 return(1); 820 } else 821 return(0); 822 } 823 824 /* 825 - regmatch - main matching routine 826 * 827 * Conceptually the strategy is simple: check to see whether the current 828 * node matches, call self recursively to see whether the rest matches, 829 * and then act accordingly. In practice we make some effort to avoid 830 * recursion, in particular by going through "ordinary" nodes (that don't 831 * need to know whether the rest of the match failed) by a loop instead of 832 * by recursion. 833 */ 834 static int /* 0 failure, 1 success */ 835 regmatch(prog) 836 char *prog; 837 { 838 register char *scan; /* Current node. */ 839 char *next; /* Next node. */ 840 841 scan = prog; 842 #ifdef DEBUG 843 if (scan != NULL && regnarrate) 844 fprintf(stderr, "%s(\n", regprop(scan)); 845 #endif 846 while (scan != NULL) { 847 #ifdef DEBUG 848 if (regnarrate) 849 fprintf(stderr, "%s...\n", regprop(scan)); 850 #endif 851 next = regnext(scan); 852 853 switch (OP(scan)) { 854 case BOL: 855 if (reginput != regbol) 856 return(0); 857 break; 858 case EOL: 859 if (*reginput != '\0') 860 return(0); 861 break; 862 case ANY: 863 if (*reginput == '\0') 864 return(0); 865 reginput++; 866 break; 867 case EXACTLY: { 868 register int len; 869 register char *opnd; 870 871 opnd = OPERAND(scan); 872 /* Inline the first character, for speed. */ 873 if (*opnd != *reginput) 874 return(0); 875 len = strlen(opnd); 876 if (len > 1 && strncmp(opnd, reginput, len) != 0) 877 return(0); 878 reginput += len; 879 } 880 break; 881 case ANYOF: 882 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL) 883 return(0); 884 reginput++; 885 break; 886 case ANYBUT: 887 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL) 888 return(0); 889 reginput++; 890 break; 891 case NOTHING: 892 break; 893 case BACK: 894 break; 895 case OPEN+1: 896 case OPEN+2: 897 case OPEN+3: 898 case OPEN+4: 899 case OPEN+5: 900 case OPEN+6: 901 case OPEN+7: 902 case OPEN+8: 903 case OPEN+9: { 904 register int no; 905 register char *save; 906 907 no = OP(scan) - OPEN; 908 save = reginput; 909 910 if (regmatch(next)) { 911 /* 912 * Don't set startp if some later 913 * invocation of the same parentheses 914 * already has. 915 */ 916 if (regstartp[no] == NULL) 917 regstartp[no] = save; 918 return(1); 919 } else 920 return(0); 921 } 922 /* NOTREACHED */ 923 break; 924 case CLOSE+1: 925 case CLOSE+2: 926 case CLOSE+3: 927 case CLOSE+4: 928 case CLOSE+5: 929 case CLOSE+6: 930 case CLOSE+7: 931 case CLOSE+8: 932 case CLOSE+9: { 933 register int no; 934 register char *save; 935 936 no = OP(scan) - CLOSE; 937 save = reginput; 938 939 if (regmatch(next)) { 940 /* 941 * Don't set endp if some later 942 * invocation of the same parentheses 943 * already has. 944 */ 945 if (regendp[no] == NULL) 946 regendp[no] = save; 947 return(1); 948 } else 949 return(0); 950 } 951 /* NOTREACHED */ 952 break; 953 case BRANCH: { 954 register char *save; 955 956 if (OP(next) != BRANCH) /* No choice. */ 957 next = OPERAND(scan); /* Avoid recursion. */ 958 else { 959 do { 960 save = reginput; 961 if (regmatch(OPERAND(scan))) 962 return(1); 963 reginput = save; 964 scan = regnext(scan); 965 } while (scan != NULL && OP(scan) == BRANCH); 966 return(0); 967 /* NOTREACHED */ 968 } 969 } 970 /* NOTREACHED */ 971 break; 972 case STAR: 973 case PLUS: { 974 register char nextch; 975 register int no; 976 register char *save; 977 register int min; 978 979 /* 980 * Lookahead to avoid useless match attempts 981 * when we know what character comes next. 982 */ 983 nextch = '\0'; 984 if (OP(next) == EXACTLY) 985 nextch = *OPERAND(next); 986 min = (OP(scan) == STAR) ? 0 : 1; 987 save = reginput; 988 no = regrepeat(OPERAND(scan)); 989 while (no >= min) { 990 /* If it could work, try it. */ 991 if (nextch == '\0' || *reginput == nextch) 992 if (regmatch(next)) 993 return(1); 994 /* Couldn't or didn't -- back up. */ 995 no--; 996 reginput = save + no; 997 } 998 return(0); 999 } 1000 /* NOTREACHED */ 1001 break; 1002 case END: 1003 return(1); /* Success! */ 1004 /* NOTREACHED */ 1005 break; 1006 default: 1007 regerror("memory corruption"); 1008 return(0); 1009 /* NOTREACHED */ 1010 break; 1011 } 1012 1013 scan = next; 1014 } 1015 1016 /* 1017 * We get here only if there's trouble -- normally "case END" is 1018 * the terminating point. 1019 */ 1020 regerror("corrupted pointers"); 1021 return(0); 1022 } 1023 1024 /* 1025 - regrepeat - repeatedly match something simple, report how many 1026 */ 1027 static int 1028 regrepeat(p) 1029 char *p; 1030 { 1031 register int count = 0; 1032 register char *scan; 1033 register char *opnd; 1034 1035 scan = reginput; 1036 opnd = OPERAND(p); 1037 switch (OP(p)) { 1038 case ANY: 1039 count = strlen(scan); 1040 scan += count; 1041 break; 1042 case EXACTLY: 1043 while (*opnd == *scan) { 1044 count++; 1045 scan++; 1046 } 1047 break; 1048 case ANYOF: 1049 while (*scan != '\0' && strchr(opnd, *scan) != NULL) { 1050 count++; 1051 scan++; 1052 } 1053 break; 1054 case ANYBUT: 1055 while (*scan != '\0' && strchr(opnd, *scan) == NULL) { 1056 count++; 1057 scan++; 1058 } 1059 break; 1060 default: /* Oh dear. Called inappropriately. */ 1061 regerror("internal foulup"); 1062 count = 0; /* Best compromise. */ 1063 break; 1064 } 1065 reginput = scan; 1066 1067 return(count); 1068 } 1069 1070 /* 1071 - regnext - dig the "next" pointer out of a node 1072 */ 1073 static char * 1074 regnext(p) 1075 register char *p; 1076 { 1077 register int offset; 1078 1079 if (p == ®dummy) 1080 return(NULL); 1081 1082 offset = NEXT(p); 1083 if (offset == 0) 1084 return(NULL); 1085 1086 if (OP(p) == BACK) 1087 return(p-offset); 1088 else 1089 return(p+offset); 1090 } 1091 1092 #ifdef DEBUG 1093 1094 STATIC char *regprop(); 1095 1096 /* 1097 - regdump - dump a regexp onto stdout in vaguely comprehensible form 1098 */ 1099 void 1100 regdump(r) 1101 regexp *r; 1102 { 1103 register char *s; 1104 register char op = EXACTLY; /* Arbitrary non-END op. */ 1105 register char *next; 1106 1107 1108 s = r->program + 1; 1109 while (op != END) { /* While that wasn't END last time... */ 1110 op = OP(s); 1111 printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */ 1112 next = regnext(s); 1113 if (next == NULL) /* Next ptr. */ 1114 printf("(0)"); 1115 else 1116 printf("(%d)", (s-r->program)+(next-s)); 1117 s += 3; 1118 if (op == ANYOF || op == ANYBUT || op == EXACTLY) { 1119 /* Literal string, where present. */ 1120 while (*s != '\0') { 1121 putchar(*s); 1122 s++; 1123 } 1124 s++; 1125 } 1126 putchar('\n'); 1127 } 1128 1129 /* Header fields of interest. */ 1130 if (r->regstart != '\0') 1131 printf("start `%c' ", r->regstart); 1132 if (r->reganch) 1133 printf("anchored "); 1134 if (r->regmust != NULL) 1135 printf("must have \"%s\"", r->regmust); 1136 printf("\n"); 1137 } 1138 1139 /* 1140 - regprop - printable representation of opcode 1141 */ 1142 static char * 1143 regprop(op) 1144 char *op; 1145 { 1146 register char *p; 1147 static char buf[50]; 1148 1149 (void) strcpy(buf, ":"); 1150 1151 switch (OP(op)) { 1152 case BOL: 1153 p = "BOL"; 1154 break; 1155 case EOL: 1156 p = "EOL"; 1157 break; 1158 case ANY: 1159 p = "ANY"; 1160 break; 1161 case ANYOF: 1162 p = "ANYOF"; 1163 break; 1164 case ANYBUT: 1165 p = "ANYBUT"; 1166 break; 1167 case BRANCH: 1168 p = "BRANCH"; 1169 break; 1170 case EXACTLY: 1171 p = "EXACTLY"; 1172 break; 1173 case NOTHING: 1174 p = "NOTHING"; 1175 break; 1176 case BACK: 1177 p = "BACK"; 1178 break; 1179 case END: 1180 p = "END"; 1181 break; 1182 case OPEN+1: 1183 case OPEN+2: 1184 case OPEN+3: 1185 case OPEN+4: 1186 case OPEN+5: 1187 case OPEN+6: 1188 case OPEN+7: 1189 case OPEN+8: 1190 case OPEN+9: 1191 sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN); 1192 p = NULL; 1193 break; 1194 case CLOSE+1: 1195 case CLOSE+2: 1196 case CLOSE+3: 1197 case CLOSE+4: 1198 case CLOSE+5: 1199 case CLOSE+6: 1200 case CLOSE+7: 1201 case CLOSE+8: 1202 case CLOSE+9: 1203 sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE); 1204 p = NULL; 1205 break; 1206 case STAR: 1207 p = "STAR"; 1208 break; 1209 case PLUS: 1210 p = "PLUS"; 1211 break; 1212 default: 1213 regerror("corrupted opcode"); 1214 break; 1215 } 1216 if (p != NULL) 1217 (void) strcat(buf, p); 1218 return(buf); 1219 } 1220 #endif 1221 1222 /* 1223 * The following is provided for those people who do not have strcspn() in 1224 * their C libraries. They should get off their butts and do something 1225 * about it; at least one public-domain implementation of those (highly 1226 * useful) string routines has been published on Usenet. 1227 */ 1228 #ifdef STRCSPN 1229 /* 1230 * strcspn - find length of initial segment of s1 consisting entirely 1231 * of characters not from s2 1232 */ 1233 1234 static int 1235 strcspn(s1, s2) 1236 char *s1; 1237 char *s2; 1238 { 1239 register char *scan1; 1240 register char *scan2; 1241 register int count; 1242 1243 count = 0; 1244 for (scan1 = s1; *scan1 != '\0'; scan1++) { 1245 for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */ 1246 if (*scan1 == *scan2++) 1247 return(count); 1248 count++; 1249 } 1250 return(count); 1251 } 1252 #endif 1253