1 /* 2 * regcomp and regexec -- regsub and regerror are elsewhere 3 * 4 * Copyright (c) 1986 by University of Toronto. 5 * Written by Henry Spencer. Not derived from licensed software. 6 * 7 * Permission is granted to anyone to use this software for any 8 * purpose on any computer system, and to redistribute it freely, 9 * subject to the following restrictions: 10 * 11 * 1. The author is not responsible for the consequences of use of 12 * this software, no matter how awful, even if they arise 13 * from defects in it. 14 * 15 * 2. The origin of this software must not be misrepresented, either 16 * by explicit claim or by omission. 17 * 18 * 3. Altered versions must be plainly marked as such, and must not 19 * be misrepresented as being the original software. 20 * 21 * Beware that some of this code is subtly aware of the way operator 22 * precedence is structured in regular expressions. Serious changes in 23 * regular-expression syntax might require a total rethink. 24 */ 25 #include <stdio.h> 26 #ifdef UNIX 27 #ifdef USG 28 #include <string.h> 29 #else 30 #include <strings.h> 31 #endif 32 #else 33 #include <string.h> 34 #endif 35 #include "regexp.h" 36 #include "regmagic.h" 37 #include "proto.h" 38 39 /* 40 * The "internal use only" fields in regexp.h are present to pass info from 41 * compile to execute that permits the execute phase to run lots faster on 42 * simple cases. They are: 43 * 44 * regstart char that must begin a match; '\0' if none obvious 45 * reganch is the match anchored (at beginning-of-line only)? 46 * regmust string (pointer into program) that match must include, or NULL 47 * regmlen length of regmust string 48 * 49 * Regstart and reganch permit very fast decisions on suitable starting points 50 * for a match, cutting down the work a lot. Regmust permits fast rejection 51 * of lines that cannot possibly match. The regmust tests are costly enough 52 * that regcomp() supplies a regmust only if the r.e. contains something 53 * potentially expensive (at present, the only such thing detected is * or + 54 * at the start of the r.e., which can involve a lot of backup). Regmlen is 55 * supplied because the test in regexec() needs it and regcomp() is computing 56 * it anyway. 57 */ 58 59 /* 60 * Structure for regexp "program". This is essentially a linear encoding 61 * of a nondeterministic finite-state machine (aka syntax charts or 62 * "railroad normal form" in parsing technology). Each node is an opcode 63 * plus a "next" pointer, possibly plus an operand. "Next" pointers of 64 * all nodes except BRANCH implement concatenation; a "next" pointer with 65 * a BRANCH on both ends of it is connecting two alternatives. (Here we 66 * have one of the subtle syntax dependencies: an individual BRANCH (as 67 * opposed to a collection of them) is never concatenated with anything 68 * because of operator precedence.) The operand of some types of node is 69 * a literal string; for others, it is a node leading into a sub-FSM. In 70 * particular, the operand of a BRANCH node is the first node of the branch. 71 * (NB this is *not* a tree structure: the tail of the branch connects 72 * to the thing following the set of BRANCHes.) The opcodes are: 73 */ 74 75 /* definition number opnd? meaning */ 76 #define END 0 /* no End of program. */ 77 #define BOL 1 /* no Match "" at beginning of line. */ 78 #define EOL 2 /* no Match "" at end of line. */ 79 #define ANY 3 /* no Match any one character. */ 80 #define ANYOF 4 /* str Match any character in this string. */ 81 #define ANYBUT 5 /* str Match any character not in this string. */ 82 #define BRANCH 6 /* node Match this alternative, or the next... */ 83 #define BACK 7 /* no Match "", "next" ptr points backward. */ 84 #define EXACTLY 8 /* str Match this string. */ 85 #define NOTHING 9 /* no Match empty string. */ 86 #define STAR 10 /* node Match this (simple) thing 0 or more times. */ 87 #define PLUS 11 /* node Match this (simple) thing 1 or more times. */ 88 #define OPEN 20 /* no Mark this point in input as start of #n. */ 89 /* OPEN+1 is number 1, etc. */ 90 #define CLOSE 30 /* no Analogous to OPEN. */ 91 92 /* 93 * Opcode notes: 94 * 95 * BRANCH The set of branches constituting a single choice are hooked 96 * together with their "next" pointers, since precedence prevents 97 * anything being concatenated to any individual branch. The 98 * "next" pointer of the last BRANCH in a choice points to the 99 * thing following the whole choice. This is also where the 100 * final "next" pointer of each individual branch points; each 101 * branch starts with the operand node of a BRANCH node. 102 * 103 * BACK Normal "next" pointers all implicitly point forward; BACK 104 * exists to make loop structures possible. 105 * 106 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular 107 * BRANCH structures using BACK. Simple cases (one character 108 * per match) are implemented with STAR and PLUS for speed 109 * and to minimize recursive plunges. 110 * 111 * OPEN,CLOSE ...are numbered at compile time. 112 */ 113 114 /* 115 * A node is one char of opcode followed by two chars of "next" pointer. 116 * "Next" pointers are stored as two 8-bit pieces, high order first. The 117 * value is a positive offset from the opcode of the node containing it. 118 * An operand, if any, simply follows the node. (Note that much of the 119 * code generation knows about this implicit relationship.) 120 * 121 * Using two bytes for the "next" pointer is vast overkill for most things, 122 * but allows patterns to get big without disasters. 123 */ 124 #define OP(p) (*(p)) 125 #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377)) 126 #define OPERAND(p) ((p) + 3) 127 128 /* 129 * See regmagic.h for one further detail of program structure. 130 */ 131 132 133 /* 134 * Utility definitions. 135 */ 136 #ifndef CHARBITS 137 #define UCHARAT(p) ((int)*(unsigned char *)(p)) 138 #else 139 #define UCHARAT(p) ((int)*(p)&CHARBITS) 140 #endif 141 142 #define FAIL(m) { regerror(m); return(NULL); } 143 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?') 144 #define META "^$.[()|?+*\\" 145 146 /* 147 * Flags to be passed up and down. 148 */ 149 #define HASWIDTH 01 /* Known never to match null string. */ 150 #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */ 151 #define SPSTART 04 /* Starts with * or +. */ 152 #define WORST 0 /* Worst case. */ 153 #define STATIC 154 /* 155 * Global work variables for regcomp(). 156 */ 157 158 STATIC char *regparse; /* Input-scan pointer. */ 159 STATIC int regnpar; /* () count. */ 160 STATIC unsigned char regdummy; 161 STATIC unsigned char *regcode; /* Code-emit pointer; ®dummy = don't. */ 162 STATIC long regsize; /* Code size. */ 163 164 /* 165 * Forward declarations for regcomp()'s friends. 166 */ 167 STATIC unsigned char *reg(int paren, int *flagp ); 168 STATIC unsigned char *regbranch(int *flagp ); 169 STATIC unsigned char *regpiece(int *flagp ); 170 STATIC unsigned char *regatom(int *flagp ); 171 STATIC unsigned char *regnode(int op ); 172 STATIC void regc(int b ); 173 STATIC void reginsert(int op, unsigned char *opnd ); 174 STATIC void regtail(unsigned char *p, unsigned char *val ); 175 STATIC void regoptail(unsigned char *p, unsigned char *val ); 176 STATIC int regtry(regexp *prog, unsigned char *string ); 177 STATIC int regmatch(unsigned char *prog ); 178 STATIC int regrepeat(unsigned char *p ); 179 STATIC unsigned char *regnext(unsigned char *p ); 180 STATIC unsigned char *regprop(unsigned char *op ); 181 182 #ifdef STRCSPN 183 STATIC int strcspn(unsigned char *s1, unsigned char *s2 ); 184 #endif 185 186 /* 187 - regcomp - compile a regular expression into internal code 188 * 189 * We can't allocate space until we know how big the compiled form will be, 190 * but we can't compile it (and thus know how big it is) until we've got a 191 * place to put the code. So we cheat: we compile it twice, once with code 192 * generation turned off and size counting turned on, and once "for real". 193 * This also means that we don't allocate space until we are sure that the 194 * thing really will compile successfully, and we never have to move the 195 * code and thus invalidate pointers into it. (Note that it has to be in 196 * one piece because free() must be able to free it all.) 197 * 198 * Beware that the optimization-preparation code in here knows about some 199 * of the structure of the compiled regexp. 200 */ 201 regexp *regcomp(char *exp) { 202 register regexp *r; 203 register unsigned char *scan; 204 register unsigned char *longest; 205 register int len; 206 int flags; 207 208 if (exp == NULL) 209 FAIL("NULL argument"); 210 211 /* First pass: determine size, legality. */ 212 regparse = exp; 213 regnpar = 1; 214 regsize = 0L; 215 regcode = ®dummy; 216 regc(MAGIC); 217 if (reg(0, &flags) == NULL) 218 return(NULL); 219 220 /* Small enough for pointer-storage convention? */ 221 if (regsize >= 32767L) /* Probably could be 65535L. */ 222 FAIL("regexp too big"); 223 224 /* Allocate space. */ 225 r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize); 226 if (r == NULL) 227 FAIL("out of space"); 228 229 /* Second pass: emit code. */ 230 regparse = exp; 231 regnpar = 1; 232 regcode = r->program; 233 regc(MAGIC); 234 if (reg(0, &flags) == NULL) 235 return(NULL); 236 237 /* Dig out information for optimizations. */ 238 r->regstart = '\0'; /* Worst-case defaults. */ 239 r->reganch = 0; 240 r->regmust = NULL; 241 r->regmlen = 0; 242 scan = r->program+1; /* First BRANCH. */ 243 if (OP(regnext(scan)) == END) { /* Only one top-level choice. */ 244 scan = OPERAND(scan); 245 246 /* Starting-point info. */ 247 if (OP(scan) == EXACTLY) 248 r->regstart = *OPERAND(scan); 249 else if (OP(scan) == BOL) 250 r->reganch++; 251 252 /* 253 * If there's something expensive in the r.e., find the 254 * longest literal string that must appear and make it the 255 * regmust. Resolve ties in favor of later strings, since 256 * the regstart check works with the beginning of the r.e. 257 * and avoiding duplication strengthens checking. Not a 258 * strong reason, but sufficient in the absence of others. 259 */ 260 if (flags&SPSTART) { 261 longest = NULL; 262 len = 0; 263 for (; scan != NULL; scan = regnext(scan)) 264 if (OP(scan) == EXACTLY 265 && strlen((char *)OPERAND(scan)) >= len) { 266 longest = OPERAND(scan); 267 len = strlen((char *)OPERAND(scan)); 268 } 269 r->regmust = longest; 270 r->regmlen = len; 271 } 272 } 273 274 return(r); 275 } 276 277 /* 278 - reg - regular expression, i.e. main body or parenthesized thing 279 * 280 * Caller must absorb opening parenthesis. 281 * 282 * Combining parenthesis handling with the base level of regular expression 283 * is a trifle forced, but the need to tie the tails of the branches to what 284 * follows makes it hard to avoid. 285 */ 286 STATIC unsigned char *reg(int paren, int *flagp) { 287 /* Parenthesized? paren */ 288 register unsigned char *ret; 289 register unsigned char *br; 290 register unsigned char *ender; 291 register int parno; 292 int flags; 293 294 *flagp = HASWIDTH; /* Tentatively. */ 295 296 /* Make an OPEN node, if parenthesized. */ 297 if (paren) { 298 if (regnpar >= NSUBEXP) 299 FAIL("too many ()"); 300 parno = regnpar; 301 regnpar++; 302 ret = regnode(OPEN+parno); 303 } else 304 ret = NULL; 305 306 /* Pick up the branches, linking them together. */ 307 br = regbranch(&flags); 308 if (br == NULL) 309 return(NULL); 310 if (ret != NULL) 311 regtail(ret, br); /* OPEN -> first. */ 312 else 313 ret = br; 314 if (!(flags&HASWIDTH)) 315 *flagp &= ~HASWIDTH; 316 *flagp |= flags&SPSTART; 317 while (*regparse == '|') { 318 regparse++; 319 br = regbranch(&flags); 320 if (br == NULL) 321 return(NULL); 322 regtail(ret, br); /* BRANCH -> BRANCH. */ 323 if (!(flags&HASWIDTH)) 324 *flagp &= ~HASWIDTH; 325 *flagp |= flags&SPSTART; 326 } 327 328 /* Make a closing node, and hook it on the end. */ 329 ender = regnode((paren) ? CLOSE+parno : END); 330 regtail(ret, ender); 331 332 /* Hook the tails of the branches to the closing node. */ 333 for (br = ret; br != NULL; br = regnext(br)) 334 regoptail(br, ender); 335 336 /* Check for proper termination. */ 337 if (paren && *regparse++ != ')') { 338 FAIL("unmatched ()"); 339 } else if (!paren && *regparse != '\0') { 340 if (*regparse == ')') { 341 FAIL("unmatched ()"); 342 } else 343 FAIL("junk on end"); /* "Can't happen". */ 344 /* NOTREACHED */ 345 } 346 347 return(ret); 348 } 349 350 /* 351 - regbranch - one alternative of an | operator 352 * 353 * Implements the concatenation operator. 354 */ 355 STATIC unsigned char *regbranch(int *flagp) { 356 register unsigned char *ret; 357 register unsigned char *chain; 358 register unsigned char *latest; 359 int flags; 360 361 *flagp = WORST; /* Tentatively. */ 362 363 ret = regnode(BRANCH); 364 chain = NULL; 365 while (*regparse != '\0' && *regparse != '|' && *regparse != ')') { 366 latest = regpiece(&flags); 367 if (latest == NULL) 368 return(NULL); 369 *flagp |= flags&HASWIDTH; 370 if (chain == NULL) /* First piece. */ 371 *flagp |= flags&SPSTART; 372 else 373 regtail(chain, latest); 374 chain = latest; 375 } 376 if (chain == NULL) /* Loop ran zero times. */ 377 (void) regnode(NOTHING); 378 379 return(ret); 380 } 381 382 /* 383 - regpiece - something followed by possible [*+?] 384 * 385 * Note that the branching code sequences used for ? and the general cases 386 * of * and + are somewhat optimized: they use the same NOTHING node as 387 * both the endmarker for their branch list and the body of the last branch. 388 * It might seem that this node could be dispensed with entirely, but the 389 * endmarker role is not redundant. 390 */ 391 STATIC unsigned char *regpiece(int *flagp) { 392 register unsigned char *ret; 393 register unsigned char op; 394 register unsigned char *next; 395 int flags; 396 397 ret = regatom(&flags); 398 if (ret == NULL) 399 return(NULL); 400 401 op = *regparse; 402 if (!ISMULT(op)) { 403 *flagp = flags; 404 return(ret); 405 } 406 407 if (!(flags&HASWIDTH) && op != '?') 408 FAIL("*+ operand could be empty"); 409 *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH); 410 411 if (op == '*' && (flags&SIMPLE)) 412 reginsert(STAR, ret); 413 else if (op == '*') { 414 /* Emit x* as (x&|), where & means "self". */ 415 reginsert(BRANCH, ret); /* Either x */ 416 regoptail(ret, regnode(BACK)); /* and loop */ 417 regoptail(ret, ret); /* back */ 418 regtail(ret, regnode(BRANCH)); /* or */ 419 regtail(ret, regnode(NOTHING)); /* null. */ 420 } else if (op == '+' && (flags&SIMPLE)) 421 reginsert(PLUS, ret); 422 else if (op == '+') { 423 /* Emit x+ as x(&|), where & means "self". */ 424 next = regnode(BRANCH); /* Either */ 425 regtail(ret, next); 426 regtail(regnode(BACK), ret); /* loop back */ 427 regtail(next, regnode(BRANCH)); /* or */ 428 regtail(ret, regnode(NOTHING)); /* null. */ 429 } else if (op == '?') { 430 /* Emit x? as (x|) */ 431 reginsert(BRANCH, ret); /* Either x */ 432 regtail(ret, regnode(BRANCH)); /* or */ 433 next = regnode(NOTHING); /* null. */ 434 regtail(ret, next); 435 regoptail(ret, next); 436 } 437 regparse++; 438 if (ISMULT(*regparse)) 439 FAIL("nested *?+"); 440 441 return(ret); 442 } 443 444 /* 445 - regatom - the lowest level 446 * 447 * Optimization: gobbles an entire sequence of ordinary characters so that 448 * it can turn them into a single node, which is smaller to store and 449 * faster to run. Backslashed characters are exceptions, each becoming a 450 * separate node; the code is simpler that way and it's not worth fixing. 451 */ 452 STATIC unsigned char *regatom(int *flagp) { 453 register unsigned char *ret; 454 int flags; 455 456 *flagp = WORST; /* Tentatively. */ 457 458 switch (*regparse++) { 459 case '^': 460 ret = regnode(BOL); 461 break; 462 case '$': 463 ret = regnode(EOL); 464 break; 465 case '.': 466 ret = regnode(ANY); 467 *flagp |= HASWIDTH|SIMPLE; 468 break; 469 case '[': { 470 register int class; 471 register int classend; 472 473 if (*regparse == '^') { /* Complement of range. */ 474 ret = regnode(ANYBUT); 475 regparse++; 476 } else 477 ret = regnode(ANYOF); 478 if (*regparse == ']' || *regparse == '-') 479 regc(*regparse++); 480 while (*regparse != '\0' && *regparse != ']') { 481 if (*regparse == '-') { 482 regparse++; 483 if (*regparse == ']' || *regparse == '\0') 484 regc('-'); 485 else { 486 class = UCHARAT(regparse-2)+1; 487 classend = UCHARAT(regparse); 488 if (class > classend+1) 489 FAIL("invalid [] range"); 490 for (; class <= classend; class++) 491 regc(class); 492 regparse++; 493 } 494 } else 495 regc(*regparse++); 496 } 497 regc('\0'); 498 if (*regparse != ']') 499 FAIL("unmatched []"); 500 regparse++; 501 *flagp |= HASWIDTH|SIMPLE; 502 } 503 break; 504 case '(': 505 ret = reg(1, &flags); 506 if (ret == NULL) 507 return(NULL); 508 *flagp |= flags&(HASWIDTH|SPSTART); 509 break; 510 case '\0': 511 case '|': 512 case ')': 513 FAIL("internal urp"); /* Supposed to be caught earlier. */ 514 break; 515 case '?': 516 case '+': 517 case '*': 518 FAIL("?+* follows nothing"); 519 break; 520 case '\\': 521 if (*regparse == '\0') 522 FAIL("trailing \\"); 523 ret = regnode(EXACTLY); 524 regc(*regparse++); 525 regc('\0'); 526 *flagp |= HASWIDTH|SIMPLE; 527 break; 528 default: { 529 register int len; 530 register unsigned char ender; 531 532 regparse--; 533 #ifdef STRCSPN 534 len = strcspn(regparse, (unsigned char *)META); 535 #else 536 len = strcspn((char *)regparse, META); 537 #endif 538 if (len <= 0) 539 FAIL("internal disaster"); 540 ender = *(regparse+len); 541 if (len > 1 && ISMULT(ender)) 542 len--; /* Back off clear of ?+* operand. */ 543 *flagp |= HASWIDTH; 544 if (len == 1) 545 *flagp |= SIMPLE; 546 ret = regnode(EXACTLY); 547 while (len > 0) { 548 regc(*regparse++); 549 len--; 550 } 551 regc('\0'); 552 } 553 break; 554 } 555 556 return(ret); 557 } 558 559 /* 560 - regnode - emit a node 561 */ 562 STATIC unsigned char *regnode(int iop) { 563 /* Return value is location */ 564 register unsigned char *ret; 565 register unsigned char *ptr; 566 unsigned char op; 567 568 op = (unsigned char) iop; 569 ret = regcode; 570 if (ret == ®dummy) { 571 regsize += 3; 572 return(ret); 573 } 574 575 ptr = ret; 576 *ptr++ = op; 577 *ptr++ = '\0'; /* Null "next" pointer. */ 578 *ptr++ = '\0'; 579 regcode = ptr; 580 581 return(ret); 582 } 583 584 /* 585 - regc - emit (if appropriate) a byte of code 586 */ 587 STATIC void regc(int ib) { 588 unsigned char b; 589 590 b = (unsigned char) ib; 591 if (regcode != ®dummy) 592 *regcode++ = b; 593 else 594 regsize++; 595 } 596 597 /* 598 - reginsert - insert an operator in front of already-emitted operand 599 * 600 * Means relocating the operand. 601 */ 602 STATIC void reginsert(int iop, unsigned char *opnd) { 603 register unsigned char *src; 604 register unsigned char *dst; 605 register unsigned char *place; 606 unsigned char op; 607 608 op = (unsigned char) iop; 609 if (regcode == ®dummy) { 610 regsize += 3; 611 return; 612 } 613 614 src = regcode; 615 regcode += 3; 616 dst = regcode; 617 while (src > opnd) 618 *--dst = *--src; 619 620 place = opnd; /* Op node, where operand used to be. */ 621 *place++ = op; 622 *place++ = '\0'; 623 *place++ = '\0'; 624 } 625 626 /* 627 - regtail - set the next-pointer at the end of a node chain 628 */ 629 STATIC void regtail(unsigned char *p, unsigned char *val) { 630 register unsigned char *scan; 631 register unsigned char *temp; 632 register int offset; 633 634 if (p == ®dummy) 635 return; 636 637 /* Find last node. */ 638 scan = p; 639 for (;;) { 640 temp = regnext(scan); 641 if (temp == NULL) 642 break; 643 scan = temp; 644 } 645 646 if (OP(scan) == BACK) 647 offset = scan - val; 648 else 649 offset = val - scan; 650 *(scan+1) = (offset>>8)&0377; 651 *(scan+2) = offset&0377; 652 } 653 654 /* 655 - regoptail - regtail on operand of first argument; nop if operandless 656 */ 657 STATIC void regoptail(unsigned char *p, unsigned char *val) { 658 /* "Operandless" and "op != BRANCH" are synonymous in practice. */ 659 if (p == NULL || p == ®dummy || OP(p) != BRANCH) 660 return; 661 regtail(OPERAND(p), val); 662 } 663 664 /* 665 * regexec and friends 666 */ 667 668 /* 669 * Global work variables for regexec(). 670 */ 671 STATIC unsigned char *reginput; /* String-input pointer. */ 672 STATIC unsigned char *regbol; /* Beginning of input, for ^ check. */ 673 STATIC unsigned char **regstartp; /* Pointer to startp array. */ 674 STATIC unsigned char **regendp; /* Ditto for endp. */ 675 676 #ifdef DEBUG 677 int regnarrate = 0; 678 void regdump(void); 679 STATIC unsigned char *regprop(void); 680 #endif 681 682 /* 683 - regexec - match a regexp against a string 684 */ 685 int 686 regexec(register regexp *prog, register unsigned char *string) { 687 register unsigned char *s; 688 #ifndef STDLIB 689 extern char *strchr(); 690 #endif 691 692 /* Be paranoid... */ 693 if (prog == NULL || string == NULL) { 694 regerror("NULL parameter"); 695 return(0); 696 } 697 698 /* Check validity of program. */ 699 if (UCHARAT(prog->program) != MAGIC) { 700 regerror("corrupted program"); 701 return(0); 702 } 703 704 /* If there is a "must appear" string, look for it. */ 705 if (prog->regmust != NULL) { 706 s = string; 707 while ((s = (unsigned char *)strchr((char *)s,prog->regmust[0])) 708 != NULL) { 709 if (strncmp((char *)s, (char *)prog->regmust, 710 prog->regmlen) 711 == 0) 712 break; /* Found it. */ 713 s++; 714 } 715 if (s == NULL) /* Not present. */ 716 return(0); 717 } 718 719 /* Mark beginning of line for ^ . */ 720 regbol = string; 721 722 /* Simplest case: anchored match need be tried only once. */ 723 if (prog->reganch) 724 return(regtry(prog, string)); 725 726 /* Messy cases: unanchored match. */ 727 s = string; 728 if (prog->regstart != '\0') 729 /* We know what char it must start with. */ 730 while ((s = (unsigned char *)strchr((char *)s, prog->regstart)) 731 != NULL) { 732 if (regtry(prog, s)) 733 return(1); 734 s++; 735 } 736 else 737 /* We don't -- general case. */ 738 do { 739 if (regtry(prog, s)) 740 return(1); 741 } while (*s++ != '\0'); 742 743 /* Failure. */ 744 return(0); 745 } 746 747 /* 748 - regtry - try match at specific point 749 */ 750 STATIC int regtry(regexp *prog, unsigned char *string) { 751 /* Return value 0 failure, 1 success */ 752 register int i; 753 register unsigned char **sp; 754 register unsigned char **ep; 755 756 reginput = string; 757 regstartp = prog->startp; 758 regendp = prog->endp; 759 760 sp = prog->startp; 761 ep = prog->endp; 762 for (i = NSUBEXP; i > 0; i--) { 763 *sp++ = NULL; 764 *ep++ = NULL; 765 } 766 if (regmatch(prog->program + 1)) { 767 prog->startp[0] = string; 768 prog->endp[0] = reginput; 769 return(1); 770 } else 771 return(0); 772 } 773 774 /* 775 - regmatch - main matching routine 776 * 777 * Conceptually the strategy is simple: check to see whether the current 778 * node matches, call self recursively to see whether the rest matches, 779 * and then act accordingly. In practice we make some effort to avoid 780 * recursion, in particular by going through "ordinary" nodes (that don't 781 * need to know whether the rest of the match failed) by a loop instead of 782 * by recursion. 783 */ 784 STATIC int regmatch( unsigned char *prog) { 785 /* Return value 0 failure 1 success */ 786 register unsigned char *scan; /* Current node. */ 787 unsigned char *next; /* Next node. */ 788 #ifndef STDLIB 789 extern char *strchr(); 790 #endif 791 792 scan = prog; 793 #ifdef DEBUG 794 if (scan != NULL && regnarrate) 795 fprintf(stderr, "%s(\n", regprop(scan)); 796 #endif 797 while (scan != NULL) { 798 #ifdef DEBUG 799 if (regnarrate) 800 fprintf(stderr, "%s...\n", regprop(scan)); 801 #endif 802 next = regnext(scan); 803 804 switch (OP(scan)) { 805 case BOL: 806 if (reginput != regbol) 807 return(0); 808 break; 809 case EOL: 810 if (*reginput != '\0') 811 return(0); 812 break; 813 case ANY: 814 if (*reginput == '\0') 815 return(0); 816 reginput++; 817 break; 818 case EXACTLY: { 819 register int len; 820 register unsigned char *opnd; 821 822 opnd = OPERAND(scan); 823 /* Inline the first character, for speed. */ 824 if (*opnd != *reginput) 825 return(0); 826 len = strlen((char *)opnd); 827 if (len > 1 && strncmp((char *)opnd, 828 (char *)reginput, len) 829 != 0) 830 return(0); 831 reginput += len; 832 } 833 break; 834 case ANYOF: 835 if (*reginput == '\0' 836 || strchr((char *)OPERAND(scan), *reginput) == NULL) 837 return(0); 838 reginput++; 839 break; 840 case ANYBUT: 841 if (*reginput == '\0' 842 || strchr((char *)OPERAND(scan), *reginput) != NULL) 843 return(0); 844 reginput++; 845 break; 846 case NOTHING: 847 break; 848 case BACK: 849 break; 850 case OPEN+1: 851 case OPEN+2: 852 case OPEN+3: 853 case OPEN+4: 854 case OPEN+5: 855 case OPEN+6: 856 case OPEN+7: 857 case OPEN+8: 858 case OPEN+9: { 859 register int no; 860 register unsigned char *save; 861 862 no = OP(scan) - OPEN; 863 save = reginput; 864 865 if (regmatch(next)) { 866 /* 867 * Don't set startp if some later 868 * invocation of the same parentheses 869 * already has. 870 */ 871 if (regstartp[no] == NULL) 872 regstartp[no] = save; 873 return(1); 874 } else 875 return(0); 876 } 877 break; 878 case CLOSE+1: 879 case CLOSE+2: 880 case CLOSE+3: 881 case CLOSE+4: 882 case CLOSE+5: 883 case CLOSE+6: 884 case CLOSE+7: 885 case CLOSE+8: 886 case CLOSE+9: { 887 register int no; 888 register unsigned char *save; 889 890 no = OP(scan) - CLOSE; 891 save = reginput; 892 893 if (regmatch(next)) { 894 /* 895 * Don't set endp if some later 896 * invocation of the same parentheses 897 * already has. 898 */ 899 if (regendp[no] == NULL) 900 regendp[no] = save; 901 return(1); 902 } else 903 return(0); 904 } 905 break; 906 case BRANCH: { 907 register unsigned char *save; 908 909 if (OP(next) != BRANCH) /* No choice. */ 910 next = OPERAND(scan); /* Avoid recursion. */ 911 else { 912 do { 913 save = reginput; 914 if (regmatch(OPERAND(scan))) 915 return(1); 916 reginput = save; 917 scan = regnext(scan); 918 } while (scan != NULL && OP(scan) == BRANCH); 919 return(0); 920 /* NOTREACHED */ 921 } 922 } 923 break; 924 case STAR: 925 case PLUS: { 926 register unsigned char nextch; 927 register int no; 928 register unsigned char *save; 929 register int min; 930 931 /* 932 * Lookahead to avoid useless match attempts 933 * when we know what character comes next. 934 */ 935 nextch = '\0'; 936 if (OP(next) == EXACTLY) 937 nextch = *OPERAND(next); 938 min = (OP(scan) == STAR) ? 0 : 1; 939 save = reginput; 940 no = regrepeat(OPERAND(scan)); 941 while (no >= min) { 942 /* If it could work, try it. */ 943 if (nextch == '\0' || *reginput == nextch) 944 if (regmatch(next)) 945 return(1); 946 /* Couldn't or didn't -- back up. */ 947 no--; 948 reginput = save + no; 949 } 950 return(0); 951 } 952 break; 953 case END: 954 return(1); /* Success! */ 955 break; 956 default: 957 regerror("memory corruption"); 958 return(0); 959 break; 960 } 961 962 scan = next; 963 } 964 965 /* 966 * We get here only if there's trouble -- normally "case END" is 967 * the terminating point. 968 */ 969 regerror("corrupted pointers"); 970 return(0); 971 } 972 973 /* 974 - regrepeat - repeatedly match something simple, report how many 975 */ 976 STATIC int regrepeat(unsigned char *p) { 977 register int count = 0; 978 register unsigned char *scan; 979 register unsigned char *opnd; 980 981 scan = reginput; 982 opnd = OPERAND(p); 983 switch (OP(p)) { 984 case ANY: 985 count = strlen((char *)scan); 986 scan += count; 987 break; 988 case EXACTLY: 989 while (*opnd == *scan) { 990 count++; 991 scan++; 992 } 993 break; 994 case ANYOF: 995 while (*scan != '\0' && strchr((char *)opnd, *scan) != NULL) { 996 count++; 997 scan++; 998 } 999 break; 1000 case ANYBUT: 1001 while (*scan != '\0' && strchr((char *)opnd, *scan) == NULL) { 1002 count++; 1003 scan++; 1004 } 1005 break; 1006 default: /* Oh dear. Called inappropriately. */ 1007 regerror("internal foulup"); 1008 count = 0; /* Best compromise. */ 1009 break; 1010 } 1011 reginput = scan; 1012 1013 return(count); 1014 } 1015 1016 /* 1017 - regnext - dig the "next" pointer out of a node 1018 */ 1019 STATIC unsigned char *regnext(register unsigned char *p) { 1020 register int offset; 1021 1022 if (p == ®dummy) 1023 return(NULL); 1024 1025 offset = NEXT(p); 1026 if (offset == 0) 1027 return(NULL); 1028 1029 if (OP(p) == BACK) 1030 return(p-offset); 1031 else 1032 return(p+offset); 1033 } 1034 1035 #ifdef DEBUG 1036 1037 STATIC unsigned char *regprop(void); 1038 1039 /* 1040 - regdump - dump a regexp onto stdout in vaguely comprehensible form 1041 */ 1042 void regdump(regexp r) { 1043 register unsigned char *s; 1044 register unsigned char op = EXACTLY; /* Arbitrary non-END op. */ 1045 register unsigned char *next; 1046 extern char *strchr(); 1047 1048 1049 s = r->program + 1; 1050 while (op != END) { /* While that wasn't END last time... */ 1051 op = OP(s); 1052 printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */ 1053 next = regnext(s); 1054 if (next == NULL) /* Next ptr. */ 1055 printf("(0)"); 1056 else 1057 printf("(%d)", (s-r->program)+(next-s)); 1058 s += 3; 1059 if (op == ANYOF || op == ANYBUT || op == EXACTLY) { 1060 /* Literal string, where present. */ 1061 while (*s != '\0') { 1062 putchar(*s); 1063 s++; 1064 } 1065 s++; 1066 } 1067 putchar('\n'); 1068 } 1069 1070 /* Header fields of interest. */ 1071 if (r->regstart != '\0') 1072 printf("start `%c' ", r->regstart); 1073 if (r->reganch) 1074 printf("anchored "); 1075 if (r->regmust != NULL) 1076 printf("must have \"%s\"", r->regmust); 1077 printf("\n"); 1078 } 1079 1080 /* 1081 - regprop - printable representation of opcode 1082 */ 1083 STATIC unsigned char regprop_buf[50]; 1084 1085 STATIC unsigned char *regprop(unsigned char *op) { 1086 register unsigned char *p; 1087 1088 (void) strcpy(regprop_buf, ":"); 1089 1090 switch (OP(op)) { 1091 case BOL: 1092 p = "BOL"; 1093 break; 1094 case EOL: 1095 p = "EOL"; 1096 break; 1097 case ANY: 1098 p = "ANY"; 1099 break; 1100 case ANYOF: 1101 p = "ANYOF"; 1102 break; 1103 case ANYBUT: 1104 p = "ANYBUT"; 1105 break; 1106 case BRANCH: 1107 p = "BRANCH"; 1108 break; 1109 case EXACTLY: 1110 p = "EXACTLY"; 1111 break; 1112 case NOTHING: 1113 p = "NOTHING"; 1114 break; 1115 case BACK: 1116 p = "BACK"; 1117 break; 1118 case END: 1119 p = "END"; 1120 break; 1121 case OPEN+1: 1122 case OPEN+2: 1123 case OPEN+3: 1124 case OPEN+4: 1125 case OPEN+5: 1126 case OPEN+6: 1127 case OPEN+7: 1128 case OPEN+8: 1129 case OPEN+9: 1130 sprintf(regprop_buf+strlen(regprop_buf), "OPEN%d", OP(op)-OPEN); 1131 p = NULL; 1132 break; 1133 case CLOSE+1: 1134 case CLOSE+2: 1135 case CLOSE+3: 1136 case CLOSE+4: 1137 case CLOSE+5: 1138 case CLOSE+6: 1139 case CLOSE+7: 1140 case CLOSE+8: 1141 case CLOSE+9: 1142 sprintf(regprop_buf+strlen(regprop_buf), "CLOSE%d", OP(op)-CLOSE); 1143 p = NULL; 1144 break; 1145 case STAR: 1146 p = "STAR"; 1147 break; 1148 case PLUS: 1149 p = "PLUS"; 1150 break; 1151 default: 1152 regerror("corrupted opcode"); 1153 break; 1154 } 1155 if (p != NULL) 1156 (void) strcat(regprop_buf, p); 1157 return(regprop_buf); 1158 } 1159 #endif 1160 1161 /* 1162 * The following is provided for those people who do not have strcspn() in 1163 * their C libraries. They should get off their butts and do something 1164 * about it; at least one public-domain implementation of those (highly 1165 * useful) string routines has been published on Usenet. 1166 */ 1167 #ifdef STRCSPN 1168 /* 1169 * strcspn - find length of initial segment of s1 consisting entirely 1170 * of characters not from s2 1171 */ 1172 1173 STATIC int strcspn(unsigned char *s1, unsigned char *s2) { 1174 register unsigned char *scan1; 1175 register unsigned char *scan2; 1176 register int count; 1177 1178 count = 0; 1179 for (scan1 = s1; *scan1 != '\0'; scan1++) { 1180 for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */ 1181 if (*scan1 == *scan2++) 1182 return(count); 1183 count++; 1184 } 1185 return(count); 1186 } 1187 #endif 1188