1 /* 2 * regcomp and regexec -- regsub and regerror are elsewhere 3 * 4 * Copyright (c) 1986 by University of Toronto. 5 * Written by Henry Spencer. Not derived from licensed software. 6 * 7 * Permission is granted to anyone to use this software for any 8 * purpose on any computer system, and to redistribute it freely, 9 * subject to the following restrictions: 10 * 11 * 1. The author is not responsible for the consequences of use of 12 * this software, no matter how awful, even if they arise 13 * from defects in it. 14 * 15 * 2. The origin of this software must not be misrepresented, either 16 * by explicit claim or by omission. 17 * 18 * 3. Altered versions must be plainly marked as such, and must not 19 * be misrepresented as being the original software. 20 * 21 * Beware that some of this code is subtly aware of the way operator 22 * precedence is structured in regular expressions. Serious changes in 23 * regular-expression syntax might require a total rethink. 24 */ 25 #include <stdio.h> 26 #ifdef UNIX 27 #ifdef USG 28 #include <string.h> 29 #else 30 #include <strings.h> 31 #endif 32 #else 33 #include <string.h> 34 #endif 35 #include "regexp.h" 36 #include "regmagic.h" 37 #include "proto.h" 38 39 /* 40 * The "internal use only" fields in regexp.h are present to pass info from 41 * compile to execute that permits the execute phase to run lots faster on 42 * simple cases. They are: 43 * 44 * regstart char that must begin a match; '\0' if none obvious 45 * reganch is the match anchored (at beginning-of-line only)? 46 * regmust string (pointer into program) that match must include, or NULL 47 * regmlen length of regmust string 48 * 49 * Regstart and reganch permit very fast decisions on suitable starting points 50 * for a match, cutting down the work a lot. Regmust permits fast rejection 51 * of lines that cannot possibly match. The regmust tests are costly enough 52 * that regcomp() supplies a regmust only if the r.e. contains something 53 * potentially expensive (at present, the only such thing detected is * or + 54 * at the start of the r.e., which can involve a lot of backup). Regmlen is 55 * supplied because the test in regexec() needs it and regcomp() is computing 56 * it anyway. 57 */ 58 59 /* 60 * Structure for regexp "program". This is essentially a linear encoding 61 * of a nondeterministic finite-state machine (aka syntax charts or 62 * "railroad normal form" in parsing technology). Each node is an opcode 63 * plus a "next" pointer, possibly plus an operand. "Next" pointers of 64 * all nodes except BRANCH implement concatenation; a "next" pointer with 65 * a BRANCH on both ends of it is connecting two alternatives. (Here we 66 * have one of the subtle syntax dependencies: an individual BRANCH (as 67 * opposed to a collection of them) is never concatenated with anything 68 * because of operator precedence.) The operand of some types of node is 69 * a literal string; for others, it is a node leading into a sub-FSM. In 70 * particular, the operand of a BRANCH node is the first node of the branch. 71 * (NB this is *not* a tree structure: the tail of the branch connects 72 * to the thing following the set of BRANCHes.) The opcodes are: 73 */ 74 75 /* definition number opnd? meaning */ 76 #define END 0 /* no End of program. */ 77 #define BOL 1 /* no Match "" at beginning of line. */ 78 #define EOL 2 /* no Match "" at end of line. */ 79 #define ANY 3 /* no Match any one character. */ 80 #define ANYOF 4 /* str Match any character in this string. */ 81 #define ANYBUT 5 /* str Match any character not in this string. */ 82 #define BRANCH 6 /* node Match this alternative, or the next... */ 83 #define BACK 7 /* no Match "", "next" ptr points backward. */ 84 #define EXACTLY 8 /* str Match this string. */ 85 #define NOTHING 9 /* no Match empty string. */ 86 #define STAR 10 /* node Match this (simple) thing 0 or more times. */ 87 #define PLUS 11 /* node Match this (simple) thing 1 or more times. */ 88 #define OPEN 20 /* no Mark this point in input as start of #n. */ 89 /* OPEN+1 is number 1, etc. */ 90 #define CLOSE 30 /* no Analogous to OPEN. */ 91 92 /* 93 * Opcode notes: 94 * 95 * BRANCH The set of branches constituting a single choice are hooked 96 * together with their "next" pointers, since precedence prevents 97 * anything being concatenated to any individual branch. The 98 * "next" pointer of the last BRANCH in a choice points to the 99 * thing following the whole choice. This is also where the 100 * final "next" pointer of each individual branch points; each 101 * branch starts with the operand node of a BRANCH node. 102 * 103 * BACK Normal "next" pointers all implicitly point forward; BACK 104 * exists to make loop structures possible. 105 * 106 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular 107 * BRANCH structures using BACK. Simple cases (one character 108 * per match) are implemented with STAR and PLUS for speed 109 * and to minimize recursive plunges. 110 * 111 * OPEN,CLOSE ...are numbered at compile time. 112 */ 113 114 /* 115 * A node is one char of opcode followed by two chars of "next" pointer. 116 * "Next" pointers are stored as two 8-bit pieces, high order first. The 117 * value is a positive offset from the opcode of the node containing it. 118 * An operand, if any, simply follows the node. (Note that much of the 119 * code generation knows about this implicit relationship.) 120 * 121 * Using two bytes for the "next" pointer is vast overkill for most things, 122 * but allows patterns to get big without disasters. 123 */ 124 #define OP(p) (*(p)) 125 #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377)) 126 #define OPERAND(p) ((p) + 3) 127 128 /* 129 * See regmagic.h for one further detail of program structure. 130 */ 131 132 133 /* 134 * Utility definitions. 135 */ 136 #ifndef CHARBITS 137 #define UCHARAT(p) ((int)*(unsigned char *)(p)) 138 #else 139 #define UCHARAT(p) ((int)*(p)&CHARBITS) 140 #endif 141 142 #define FAIL(m) { regerror(m); return(NULL); } 143 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?') 144 #define META "^$.[()|?+*\\" 145 146 /* 147 * Flags to be passed up and down. 148 */ 149 #define HASWIDTH 01 /* Known never to match null string. */ 150 #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */ 151 #define SPSTART 04 /* Starts with * or +. */ 152 #define WORST 0 /* Worst case. */ 153 #define STATIC 154 /* 155 * Global work variables for regcomp(). 156 */ 157 158 STATIC char *regparse; /* Input-scan pointer. */ 159 STATIC int regnpar; /* () count. */ 160 STATIC unsigned char regdummy; 161 STATIC unsigned char *regcode; /* Code-emit pointer; ®dummy = don't. */ 162 STATIC long regsize; /* Code size. */ 163 164 /* 165 * Forward declarations for regcomp()'s friends. 166 */ 167 STATIC unsigned char *reg(int paren, int *flagp ); 168 STATIC unsigned char *regbranch(int *flagp ); 169 STATIC unsigned char *regpiece(int *flagp ); 170 STATIC unsigned char *regatom(int *flagp ); 171 STATIC unsigned char *regnode(int op ); 172 STATIC void regc(int b ); 173 STATIC void reginsert(int op, unsigned char *opnd ); 174 STATIC void regtail(unsigned char *p, unsigned char *val ); 175 STATIC void regoptail(unsigned char *p, unsigned char *val ); 176 STATIC int regtry(regexp *prog, unsigned char *string ); 177 STATIC int regmatch(unsigned char *prog ); 178 STATIC int regrepeat(unsigned char *p ); 179 STATIC unsigned char *regnext(unsigned char *p ); 180 STATIC unsigned char *regprop(unsigned char *op ); 181 182 #ifdef STRCSPN 183 STATIC int strcspn(unsigned char *s1, unsigned char *s2 ); 184 #endif 185 186 /* 187 - regcomp - compile a regular expression into internal code 188 * 189 * We can't allocate space until we know how big the compiled form will be, 190 * but we can't compile it (and thus know how big it is) until we've got a 191 * place to put the code. So we cheat: we compile it twice, once with code 192 * generation turned off and size counting turned on, and once "for real". 193 * This also means that we don't allocate space until we are sure that the 194 * thing really will compile successfully, and we never have to move the 195 * code and thus invalidate pointers into it. (Note that it has to be in 196 * one piece because free() must be able to free it all.) 197 * 198 * Beware that the optimization-preparation code in here knows about some 199 * of the structure of the compiled regexp. 200 */ 201 regexp *regcomp(char *exp) { 202 register regexp *r; 203 register unsigned char *scan; 204 register unsigned char *longest; 205 register int len; 206 int flags; 207 208 if (exp == NULL) 209 FAIL("NULL argument"); 210 211 /* First pass: determine size, legality. */ 212 regparse = exp; 213 regnpar = 1; 214 regsize = 0L; 215 regcode = ®dummy; 216 regc(MAGIC); 217 if (reg(0, &flags) == NULL) 218 return(NULL); 219 220 /* Small enough for pointer-storage convention? */ 221 if (regsize >= 32767L) /* Probably could be 65535L. */ 222 FAIL("regexp too big"); 223 224 /* Allocate space. */ 225 r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize); 226 if (r == NULL) 227 FAIL("out of space"); 228 229 /* Second pass: emit code. */ 230 regparse = exp; 231 regnpar = 1; 232 regcode = r->program; 233 regc(MAGIC); 234 if (reg(0, &flags) == NULL) { 235 free(r); 236 return(NULL); 237 } 238 239 /* Dig out information for optimizations. */ 240 r->regstart = '\0'; /* Worst-case defaults. */ 241 r->reganch = 0; 242 r->regmust = NULL; 243 r->regmlen = 0; 244 scan = r->program+1; /* First BRANCH. */ 245 if (OP(regnext(scan)) == END) { /* Only one top-level choice. */ 246 scan = OPERAND(scan); 247 248 /* Starting-point info. */ 249 if (OP(scan) == EXACTLY) 250 r->regstart = *OPERAND(scan); 251 else if (OP(scan) == BOL) 252 r->reganch++; 253 254 /* 255 * If there's something expensive in the r.e., find the 256 * longest literal string that must appear and make it the 257 * regmust. Resolve ties in favor of later strings, since 258 * the regstart check works with the beginning of the r.e. 259 * and avoiding duplication strengthens checking. Not a 260 * strong reason, but sufficient in the absence of others. 261 */ 262 if (flags&SPSTART) { 263 longest = NULL; 264 len = 0; 265 for (; scan != NULL; scan = regnext(scan)) 266 if (OP(scan) == EXACTLY 267 && strlen((char *)OPERAND(scan)) >= len) { 268 longest = OPERAND(scan); 269 len = strlen((char *)OPERAND(scan)); 270 } 271 r->regmust = longest; 272 r->regmlen = len; 273 } 274 } 275 276 return(r); 277 } 278 279 /* 280 - reg - regular expression, i.e. main body or parenthesized thing 281 * 282 * Caller must absorb opening parenthesis. 283 * 284 * Combining parenthesis handling with the base level of regular expression 285 * is a trifle forced, but the need to tie the tails of the branches to what 286 * follows makes it hard to avoid. 287 */ 288 STATIC unsigned char *reg(int paren, int *flagp) { 289 /* Parenthesized? paren */ 290 register unsigned char *ret; 291 register unsigned char *br; 292 register unsigned char *ender; 293 register int parno; 294 int flags; 295 296 *flagp = HASWIDTH; /* Tentatively. */ 297 298 /* Make an OPEN node, if parenthesized. */ 299 if (paren) { 300 if (regnpar >= NSUBEXP) 301 FAIL("too many ()"); 302 parno = regnpar; 303 regnpar++; 304 ret = regnode(OPEN+parno); 305 } else 306 ret = NULL; 307 308 /* Pick up the branches, linking them together. */ 309 br = regbranch(&flags); 310 if (br == NULL) 311 return(NULL); 312 if (ret != NULL) 313 regtail(ret, br); /* OPEN -> first. */ 314 else 315 ret = br; 316 if (!(flags&HASWIDTH)) 317 *flagp &= ~HASWIDTH; 318 *flagp |= flags&SPSTART; 319 while (*regparse == '|') { 320 regparse++; 321 br = regbranch(&flags); 322 if (br == NULL) 323 return(NULL); 324 regtail(ret, br); /* BRANCH -> BRANCH. */ 325 if (!(flags&HASWIDTH)) 326 *flagp &= ~HASWIDTH; 327 *flagp |= flags&SPSTART; 328 } 329 330 /* Make a closing node, and hook it on the end. */ 331 ender = regnode((paren) ? CLOSE+parno : END); 332 regtail(ret, ender); 333 334 /* Hook the tails of the branches to the closing node. */ 335 for (br = ret; br != NULL; br = regnext(br)) 336 regoptail(br, ender); 337 338 /* Check for proper termination. */ 339 if (paren && *regparse++ != ')') { 340 FAIL("unmatched ()"); 341 } else if (!paren && *regparse != '\0') { 342 if (*regparse == ')') { 343 FAIL("unmatched ()"); 344 } else 345 FAIL("junk on end"); /* "Can't happen". */ 346 /* NOTREACHED */ 347 } 348 349 return(ret); 350 } 351 352 /* 353 - regbranch - one alternative of an | operator 354 * 355 * Implements the concatenation operator. 356 */ 357 STATIC unsigned char *regbranch(int *flagp) { 358 register unsigned char *ret; 359 register unsigned char *chain; 360 register unsigned char *latest; 361 int flags; 362 363 *flagp = WORST; /* Tentatively. */ 364 365 ret = regnode(BRANCH); 366 chain = NULL; 367 while (*regparse != '\0' && *regparse != '|' && *regparse != ')') { 368 latest = regpiece(&flags); 369 if (latest == NULL) 370 return(NULL); 371 *flagp |= flags&HASWIDTH; 372 if (chain == NULL) /* First piece. */ 373 *flagp |= flags&SPSTART; 374 else 375 regtail(chain, latest); 376 chain = latest; 377 } 378 if (chain == NULL) /* Loop ran zero times. */ 379 (void) regnode(NOTHING); 380 381 return(ret); 382 } 383 384 /* 385 - regpiece - something followed by possible [*+?] 386 * 387 * Note that the branching code sequences used for ? and the general cases 388 * of * and + are somewhat optimized: they use the same NOTHING node as 389 * both the endmarker for their branch list and the body of the last branch. 390 * It might seem that this node could be dispensed with entirely, but the 391 * endmarker role is not redundant. 392 */ 393 STATIC unsigned char *regpiece(int *flagp) { 394 register unsigned char *ret; 395 register unsigned char op; 396 register unsigned char *next; 397 int flags; 398 399 ret = regatom(&flags); 400 if (ret == NULL) 401 return(NULL); 402 403 op = *regparse; 404 if (!ISMULT(op)) { 405 *flagp = flags; 406 return(ret); 407 } 408 409 if (!(flags&HASWIDTH) && op != '?') 410 FAIL("*+ operand could be empty"); 411 *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH); 412 413 if (op == '*' && (flags&SIMPLE)) 414 reginsert(STAR, ret); 415 else if (op == '*') { 416 /* Emit x* as (x&|), where & means "self". */ 417 reginsert(BRANCH, ret); /* Either x */ 418 regoptail(ret, regnode(BACK)); /* and loop */ 419 regoptail(ret, ret); /* back */ 420 regtail(ret, regnode(BRANCH)); /* or */ 421 regtail(ret, regnode(NOTHING)); /* null. */ 422 } else if (op == '+' && (flags&SIMPLE)) 423 reginsert(PLUS, ret); 424 else if (op == '+') { 425 /* Emit x+ as x(&|), where & means "self". */ 426 next = regnode(BRANCH); /* Either */ 427 regtail(ret, next); 428 regtail(regnode(BACK), ret); /* loop back */ 429 regtail(next, regnode(BRANCH)); /* or */ 430 regtail(ret, regnode(NOTHING)); /* null. */ 431 } else if (op == '?') { 432 /* Emit x? as (x|) */ 433 reginsert(BRANCH, ret); /* Either x */ 434 regtail(ret, regnode(BRANCH)); /* or */ 435 next = regnode(NOTHING); /* null. */ 436 regtail(ret, next); 437 regoptail(ret, next); 438 } 439 regparse++; 440 if (ISMULT(*regparse)) 441 FAIL("nested *?+"); 442 443 return(ret); 444 } 445 446 /* 447 - regatom - the lowest level 448 * 449 * Optimization: gobbles an entire sequence of ordinary characters so that 450 * it can turn them into a single node, which is smaller to store and 451 * faster to run. Backslashed characters are exceptions, each becoming a 452 * separate node; the code is simpler that way and it's not worth fixing. 453 */ 454 STATIC unsigned char *regatom(int *flagp) { 455 register unsigned char *ret; 456 int flags; 457 458 *flagp = WORST; /* Tentatively. */ 459 460 switch (*regparse++) { 461 case '^': 462 ret = regnode(BOL); 463 break; 464 case '$': 465 ret = regnode(EOL); 466 break; 467 case '.': 468 ret = regnode(ANY); 469 *flagp |= HASWIDTH|SIMPLE; 470 break; 471 case '[': { 472 register int class; 473 register int classend; 474 475 if (*regparse == '^') { /* Complement of range. */ 476 ret = regnode(ANYBUT); 477 regparse++; 478 } else 479 ret = regnode(ANYOF); 480 if (*regparse == ']' || *regparse == '-') 481 regc(*regparse++); 482 while (*regparse != '\0' && *regparse != ']') { 483 if (*regparse == '-') { 484 regparse++; 485 if (*regparse == ']' || *regparse == '\0') 486 regc('-'); 487 else { 488 class = UCHARAT(regparse-2)+1; 489 classend = UCHARAT(regparse); 490 if (class > classend+1) 491 FAIL("invalid [] range"); 492 for (; class <= classend; class++) 493 regc(class); 494 regparse++; 495 } 496 } else 497 regc(*regparse++); 498 } 499 regc('\0'); 500 if (*regparse != ']') 501 FAIL("unmatched []"); 502 regparse++; 503 *flagp |= HASWIDTH|SIMPLE; 504 } 505 break; 506 case '(': 507 ret = reg(1, &flags); 508 if (ret == NULL) 509 return(NULL); 510 *flagp |= flags&(HASWIDTH|SPSTART); 511 break; 512 case '\0': 513 case '|': 514 case ')': 515 FAIL("internal urp"); /* Supposed to be caught earlier. */ 516 break; 517 case '?': 518 case '+': 519 case '*': 520 FAIL("?+* follows nothing"); 521 break; 522 case '\\': 523 if (*regparse == '\0') 524 FAIL("trailing \\"); 525 ret = regnode(EXACTLY); 526 regc(*regparse++); 527 regc('\0'); 528 *flagp |= HASWIDTH|SIMPLE; 529 break; 530 default: { 531 register int len; 532 register unsigned char ender; 533 534 regparse--; 535 #ifdef STRCSPN 536 len = strcspn(regparse, (unsigned char *)META); 537 #else 538 len = strcspn((char *)regparse, META); 539 #endif 540 if (len <= 0) 541 FAIL("internal disaster"); 542 ender = *(regparse+len); 543 if (len > 1 && ISMULT(ender)) 544 len--; /* Back off clear of ?+* operand. */ 545 *flagp |= HASWIDTH; 546 if (len == 1) 547 *flagp |= SIMPLE; 548 ret = regnode(EXACTLY); 549 while (len > 0) { 550 regc(*regparse++); 551 len--; 552 } 553 regc('\0'); 554 } 555 break; 556 } 557 558 return(ret); 559 } 560 561 /* 562 - regnode - emit a node 563 */ 564 STATIC unsigned char *regnode(int iop) { 565 /* Return value is location */ 566 register unsigned char *ret; 567 register unsigned char *ptr; 568 unsigned char op; 569 570 op = (unsigned char) iop; 571 ret = regcode; 572 if (ret == ®dummy) { 573 regsize += 3; 574 return(ret); 575 } 576 577 ptr = ret; 578 *ptr++ = op; 579 *ptr++ = '\0'; /* Null "next" pointer. */ 580 *ptr++ = '\0'; 581 regcode = ptr; 582 583 return(ret); 584 } 585 586 /* 587 - regc - emit (if appropriate) a byte of code 588 */ 589 STATIC void regc(int ib) { 590 unsigned char b; 591 592 b = (unsigned char) ib; 593 if (regcode != ®dummy) 594 *regcode++ = b; 595 else 596 regsize++; 597 } 598 599 /* 600 - reginsert - insert an operator in front of already-emitted operand 601 * 602 * Means relocating the operand. 603 */ 604 STATIC void reginsert(int iop, unsigned char *opnd) { 605 register unsigned char *src; 606 register unsigned char *dst; 607 register unsigned char *place; 608 unsigned char op; 609 610 op = (unsigned char) iop; 611 if (regcode == ®dummy) { 612 regsize += 3; 613 return; 614 } 615 616 src = regcode; 617 regcode += 3; 618 dst = regcode; 619 while (src > opnd) 620 *--dst = *--src; 621 622 place = opnd; /* Op node, where operand used to be. */ 623 *place++ = op; 624 *place++ = '\0'; 625 *place++ = '\0'; 626 } 627 628 /* 629 - regtail - set the next-pointer at the end of a node chain 630 */ 631 STATIC void regtail(unsigned char *p, unsigned char *val) { 632 register unsigned char *scan; 633 register unsigned char *temp; 634 register int offset; 635 636 if (p == ®dummy) 637 return; 638 639 /* Find last node. */ 640 scan = p; 641 for (;;) { 642 temp = regnext(scan); 643 if (temp == NULL) 644 break; 645 scan = temp; 646 } 647 648 if (OP(scan) == BACK) 649 offset = scan - val; 650 else 651 offset = val - scan; 652 *(scan+1) = (offset>>8)&0377; 653 *(scan+2) = offset&0377; 654 } 655 656 /* 657 - regoptail - regtail on operand of first argument; nop if operandless 658 */ 659 STATIC void regoptail(unsigned char *p, unsigned char *val) { 660 /* "Operandless" and "op != BRANCH" are synonymous in practice. */ 661 if (p == NULL || p == ®dummy || OP(p) != BRANCH) 662 return; 663 regtail(OPERAND(p), val); 664 } 665 666 /* 667 * regexec and friends 668 */ 669 670 /* 671 * Global work variables for regexec(). 672 */ 673 STATIC unsigned char *reginput; /* String-input pointer. */ 674 STATIC unsigned char *regbol; /* Beginning of input, for ^ check. */ 675 STATIC unsigned char **regstartp; /* Pointer to startp array. */ 676 STATIC unsigned char **regendp; /* Ditto for endp. */ 677 678 #ifdef DEBUG 679 int regnarrate = 0; 680 void regdump(void); 681 STATIC unsigned char *regprop(void); 682 #endif 683 684 /* 685 - regexec - match a regexp against a string 686 */ 687 int 688 regexec(register regexp *prog, register unsigned char *string) { 689 register unsigned char *s; 690 #ifndef STDLIB 691 extern char *strchr(); 692 #endif 693 694 /* Be paranoid... */ 695 if (prog == NULL || string == NULL) { 696 regerror("NULL parameter"); 697 return(0); 698 } 699 700 /* Check validity of program. */ 701 if (UCHARAT(prog->program) != MAGIC) { 702 regerror("corrupted program"); 703 return(0); 704 } 705 706 /* If there is a "must appear" string, look for it. */ 707 if (prog->regmust != NULL) { 708 s = string; 709 while ((s = (unsigned char *)strchr((char *)s,prog->regmust[0])) 710 != NULL) { 711 if (strncmp((char *)s, (char *)prog->regmust, 712 prog->regmlen) 713 == 0) 714 break; /* Found it. */ 715 s++; 716 } 717 if (s == NULL) /* Not present. */ 718 return(0); 719 } 720 721 /* Mark beginning of line for ^ . */ 722 regbol = string; 723 724 /* Simplest case: anchored match need be tried only once. */ 725 if (prog->reganch) 726 return(regtry(prog, string)); 727 728 /* Messy cases: unanchored match. */ 729 s = string; 730 if (prog->regstart != '\0') 731 /* We know what char it must start with. */ 732 while ((s = (unsigned char *)strchr((char *)s, prog->regstart)) 733 != NULL) { 734 if (regtry(prog, s)) 735 return(1); 736 s++; 737 } 738 else 739 /* We don't -- general case. */ 740 do { 741 if (regtry(prog, s)) 742 return(1); 743 } while (*s++ != '\0'); 744 745 /* Failure. */ 746 return(0); 747 } 748 749 /* 750 - regtry - try match at specific point 751 */ 752 STATIC int regtry(regexp *prog, unsigned char *string) { 753 /* Return value 0 failure, 1 success */ 754 register int i; 755 register unsigned char **sp; 756 register unsigned char **ep; 757 758 reginput = string; 759 regstartp = prog->startp; 760 regendp = prog->endp; 761 762 sp = prog->startp; 763 ep = prog->endp; 764 for (i = NSUBEXP; i > 0; i--) { 765 *sp++ = NULL; 766 *ep++ = NULL; 767 } 768 if (regmatch(prog->program + 1)) { 769 prog->startp[0] = string; 770 prog->endp[0] = reginput; 771 return(1); 772 } else 773 return(0); 774 } 775 776 /* 777 - regmatch - main matching routine 778 * 779 * Conceptually the strategy is simple: check to see whether the current 780 * node matches, call self recursively to see whether the rest matches, 781 * and then act accordingly. In practice we make some effort to avoid 782 * recursion, in particular by going through "ordinary" nodes (that don't 783 * need to know whether the rest of the match failed) by a loop instead of 784 * by recursion. 785 */ 786 STATIC int regmatch( unsigned char *prog) { 787 /* Return value 0 failure 1 success */ 788 register unsigned char *scan; /* Current node. */ 789 unsigned char *next; /* Next node. */ 790 #ifndef STDLIB 791 extern char *strchr(); 792 #endif 793 794 scan = prog; 795 #ifdef DEBUG 796 if (scan != NULL && regnarrate) 797 fprintf(stderr, "%s(\n", regprop(scan)); 798 #endif 799 while (scan != NULL) { 800 #ifdef DEBUG 801 if (regnarrate) 802 fprintf(stderr, "%s...\n", regprop(scan)); 803 #endif 804 next = regnext(scan); 805 806 switch (OP(scan)) { 807 case BOL: 808 if (reginput != regbol) 809 return(0); 810 break; 811 case EOL: 812 if (*reginput != '\0') 813 return(0); 814 break; 815 case ANY: 816 if (*reginput == '\0') 817 return(0); 818 reginput++; 819 break; 820 case EXACTLY: { 821 register int len; 822 register unsigned char *opnd; 823 824 opnd = OPERAND(scan); 825 /* Inline the first character, for speed. */ 826 if (*opnd != *reginput) 827 return(0); 828 len = strlen((char *)opnd); 829 if (len > 1 && strncmp((char *)opnd, 830 (char *)reginput, len) 831 != 0) 832 return(0); 833 reginput += len; 834 } 835 break; 836 case ANYOF: 837 if (*reginput == '\0' 838 || strchr((char *)OPERAND(scan), *reginput) == NULL) 839 return(0); 840 reginput++; 841 break; 842 case ANYBUT: 843 if (*reginput == '\0' 844 || strchr((char *)OPERAND(scan), *reginput) != NULL) 845 return(0); 846 reginput++; 847 break; 848 case NOTHING: 849 break; 850 case BACK: 851 break; 852 case OPEN+1: 853 case OPEN+2: 854 case OPEN+3: 855 case OPEN+4: 856 case OPEN+5: 857 case OPEN+6: 858 case OPEN+7: 859 case OPEN+8: 860 case OPEN+9: { 861 register int no; 862 register unsigned char *save; 863 864 no = OP(scan) - OPEN; 865 save = reginput; 866 867 if (regmatch(next)) { 868 /* 869 * Don't set startp if some later 870 * invocation of the same parentheses 871 * already has. 872 */ 873 if (regstartp[no] == NULL) 874 regstartp[no] = save; 875 return(1); 876 } else 877 return(0); 878 } 879 break; 880 case CLOSE+1: 881 case CLOSE+2: 882 case CLOSE+3: 883 case CLOSE+4: 884 case CLOSE+5: 885 case CLOSE+6: 886 case CLOSE+7: 887 case CLOSE+8: 888 case CLOSE+9: { 889 register int no; 890 register unsigned char *save; 891 892 no = OP(scan) - CLOSE; 893 save = reginput; 894 895 if (regmatch(next)) { 896 /* 897 * Don't set endp if some later 898 * invocation of the same parentheses 899 * already has. 900 */ 901 if (regendp[no] == NULL) 902 regendp[no] = save; 903 return(1); 904 } else 905 return(0); 906 } 907 break; 908 case BRANCH: { 909 register unsigned char *save; 910 911 if (OP(next) != BRANCH) /* No choice. */ 912 next = OPERAND(scan); /* Avoid recursion. */ 913 else { 914 do { 915 save = reginput; 916 if (regmatch(OPERAND(scan))) 917 return(1); 918 reginput = save; 919 scan = regnext(scan); 920 } while (scan != NULL && OP(scan) == BRANCH); 921 return(0); 922 /* NOTREACHED */ 923 } 924 } 925 break; 926 case STAR: 927 case PLUS: { 928 register unsigned char nextch; 929 register int no; 930 register unsigned char *save; 931 register int min; 932 933 /* 934 * Lookahead to avoid useless match attempts 935 * when we know what character comes next. 936 */ 937 nextch = '\0'; 938 if (OP(next) == EXACTLY) 939 nextch = *OPERAND(next); 940 min = (OP(scan) == STAR) ? 0 : 1; 941 save = reginput; 942 no = regrepeat(OPERAND(scan)); 943 while (no >= min) { 944 /* If it could work, try it. */ 945 if (nextch == '\0' || *reginput == nextch) 946 if (regmatch(next)) 947 return(1); 948 /* Couldn't or didn't -- back up. */ 949 no--; 950 reginput = save + no; 951 } 952 return(0); 953 } 954 break; 955 case END: 956 return(1); /* Success! */ 957 break; 958 default: 959 regerror("memory corruption"); 960 return(0); 961 break; 962 } 963 964 scan = next; 965 } 966 967 /* 968 * We get here only if there's trouble -- normally "case END" is 969 * the terminating point. 970 */ 971 regerror("corrupted pointers"); 972 return(0); 973 } 974 975 /* 976 - regrepeat - repeatedly match something simple, report how many 977 */ 978 STATIC int regrepeat(unsigned char *p) { 979 register int count = 0; 980 register unsigned char *scan; 981 register unsigned char *opnd; 982 983 scan = reginput; 984 opnd = OPERAND(p); 985 switch (OP(p)) { 986 case ANY: 987 count = strlen((char *)scan); 988 scan += count; 989 break; 990 case EXACTLY: 991 while (*opnd == *scan) { 992 count++; 993 scan++; 994 } 995 break; 996 case ANYOF: 997 while (*scan != '\0' && strchr((char *)opnd, *scan) != NULL) { 998 count++; 999 scan++; 1000 } 1001 break; 1002 case ANYBUT: 1003 while (*scan != '\0' && strchr((char *)opnd, *scan) == NULL) { 1004 count++; 1005 scan++; 1006 } 1007 break; 1008 default: /* Oh dear. Called inappropriately. */ 1009 regerror("internal foulup"); 1010 count = 0; /* Best compromise. */ 1011 break; 1012 } 1013 reginput = scan; 1014 1015 return(count); 1016 } 1017 1018 /* 1019 - regnext - dig the "next" pointer out of a node 1020 */ 1021 STATIC unsigned char *regnext(register unsigned char *p) { 1022 register int offset; 1023 1024 if (p == ®dummy) 1025 return(NULL); 1026 1027 offset = NEXT(p); 1028 if (offset == 0) 1029 return(NULL); 1030 1031 if (OP(p) == BACK) 1032 return(p-offset); 1033 else 1034 return(p+offset); 1035 } 1036 1037 #ifdef DEBUG 1038 1039 STATIC unsigned char *regprop(void); 1040 1041 /* 1042 - regdump - dump a regexp onto stdout in vaguely comprehensible form 1043 */ 1044 void regdump(regexp r) { 1045 register unsigned char *s; 1046 register unsigned char op = EXACTLY; /* Arbitrary non-END op. */ 1047 register unsigned char *next; 1048 extern char *strchr(); 1049 1050 1051 s = r->program + 1; 1052 while (op != END) { /* While that wasn't END last time... */ 1053 op = OP(s); 1054 printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */ 1055 next = regnext(s); 1056 if (next == NULL) /* Next ptr. */ 1057 printf("(0)"); 1058 else 1059 printf("(%d)", (s-r->program)+(next-s)); 1060 s += 3; 1061 if (op == ANYOF || op == ANYBUT || op == EXACTLY) { 1062 /* Literal string, where present. */ 1063 while (*s != '\0') { 1064 putchar(*s); 1065 s++; 1066 } 1067 s++; 1068 } 1069 putchar('\n'); 1070 } 1071 1072 /* Header fields of interest. */ 1073 if (r->regstart != '\0') 1074 printf("start `%c' ", r->regstart); 1075 if (r->reganch) 1076 printf("anchored "); 1077 if (r->regmust != NULL) 1078 printf("must have \"%s\"", r->regmust); 1079 printf("\n"); 1080 } 1081 1082 /* 1083 - regprop - printable representation of opcode 1084 */ 1085 STATIC unsigned char regprop_buf[50]; 1086 1087 STATIC unsigned char *regprop(unsigned char *op) { 1088 register unsigned char *p; 1089 1090 (void) strcpy(regprop_buf, ":"); 1091 1092 switch (OP(op)) { 1093 case BOL: 1094 p = "BOL"; 1095 break; 1096 case EOL: 1097 p = "EOL"; 1098 break; 1099 case ANY: 1100 p = "ANY"; 1101 break; 1102 case ANYOF: 1103 p = "ANYOF"; 1104 break; 1105 case ANYBUT: 1106 p = "ANYBUT"; 1107 break; 1108 case BRANCH: 1109 p = "BRANCH"; 1110 break; 1111 case EXACTLY: 1112 p = "EXACTLY"; 1113 break; 1114 case NOTHING: 1115 p = "NOTHING"; 1116 break; 1117 case BACK: 1118 p = "BACK"; 1119 break; 1120 case END: 1121 p = "END"; 1122 break; 1123 case OPEN+1: 1124 case OPEN+2: 1125 case OPEN+3: 1126 case OPEN+4: 1127 case OPEN+5: 1128 case OPEN+6: 1129 case OPEN+7: 1130 case OPEN+8: 1131 case OPEN+9: 1132 sprintf(regprop_buf+strlen(regprop_buf), "OPEN%d", OP(op)-OPEN); 1133 p = NULL; 1134 break; 1135 case CLOSE+1: 1136 case CLOSE+2: 1137 case CLOSE+3: 1138 case CLOSE+4: 1139 case CLOSE+5: 1140 case CLOSE+6: 1141 case CLOSE+7: 1142 case CLOSE+8: 1143 case CLOSE+9: 1144 sprintf(regprop_buf+strlen(regprop_buf), "CLOSE%d", OP(op)-CLOSE); 1145 p = NULL; 1146 break; 1147 case STAR: 1148 p = "STAR"; 1149 break; 1150 case PLUS: 1151 p = "PLUS"; 1152 break; 1153 default: 1154 regerror("corrupted opcode"); 1155 break; 1156 } 1157 if (p != NULL) 1158 (void) strcat(regprop_buf, p); 1159 return(regprop_buf); 1160 } 1161 #endif 1162 1163 /* 1164 * The following is provided for those people who do not have strcspn() in 1165 * their C libraries. They should get off their butts and do something 1166 * about it; at least one public-domain implementation of those (highly 1167 * useful) string routines has been published on Usenet. 1168 */ 1169 #ifdef STRCSPN 1170 /* 1171 * strcspn - find length of initial segment of s1 consisting entirely 1172 * of characters not from s2 1173 */ 1174 1175 STATIC int strcspn(unsigned char *s1, unsigned char *s2) { 1176 register unsigned char *scan1; 1177 register unsigned char *scan2; 1178 register int count; 1179 1180 count = 0; 1181 for (scan1 = s1; *scan1 != '\0'; scan1++) { 1182 for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */ 1183 if (*scan1 == *scan2++) 1184 return(count); 1185 count++; 1186 } 1187 return(count); 1188 } 1189 #endif 1190