1 /* 2 * regcomp and regexec -- regsub and regerror are elsewhere 3 * 4 * Copyright (c) 1986 by University of Toronto. 5 * Written by Henry Spencer. Not derived from licensed software. 6 * 7 * Permission is granted to anyone to use this software for any 8 * purpose on any computer system, and to redistribute it freely, 9 * subject to the following restrictions: 10 * 11 * 1. The author is not responsible for the consequences of use of 12 * this software, no matter how awful, even if they arise 13 * from defects in it. 14 * 15 * 2. The origin of this software must not be misrepresented, either 16 * by explicit claim or by omission. 17 * 18 * 3. Altered versions must be plainly marked as such, and must not 19 * be misrepresented as being the original software. 20 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, 21 *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to | 22 *** to assist in implementing egrep. 23 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, 24 *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching 25 *** as in BSD grep and ex. 26 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, 27 *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \. 28 *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods, 29 *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy. 30 * 31 * Beware that some of this code is subtly aware of the way operator 32 * precedence is structured in regular expressions. Serious changes in 33 * regular-expression syntax might require a total rethink. 34 */ 35 36 #ifndef lint 37 static char *rcsid = "$Id: regexp.c,v 1.3 1993/08/26 00:45:35 jtc Exp $"; 38 #endif /* not lint */ 39 40 #include <regexp.h> 41 #include <stdio.h> 42 #include <ctype.h> 43 #include <stdlib.h> 44 #include <string.h> 45 #include "regmagic.h" 46 47 /* 48 * The "internal use only" fields in regexp.h are present to pass info from 49 * compile to execute that permits the execute phase to run lots faster on 50 * simple cases. They are: 51 * 52 * regstart char that must begin a match; '\0' if none obvious 53 * reganch is the match anchored (at beginning-of-line only)? 54 * regmust string (pointer into program) that match must include, or NULL 55 * regmlen length of regmust string 56 * 57 * Regstart and reganch permit very fast decisions on suitable starting points 58 * for a match, cutting down the work a lot. Regmust permits fast rejection 59 * of lines that cannot possibly match. The regmust tests are costly enough 60 * that regcomp() supplies a regmust only if the r.e. contains something 61 * potentially expensive (at present, the only such thing detected is * or + 62 * at the start of the r.e., which can involve a lot of backup). Regmlen is 63 * supplied because the test in regexec() needs it and regcomp() is computing 64 * it anyway. 65 */ 66 67 /* 68 * Structure for regexp "program". This is essentially a linear encoding 69 * of a nondeterministic finite-state machine (aka syntax charts or 70 * "railroad normal form" in parsing technology). Each node is an opcode 71 * plus a "next" pointer, possibly plus an operand. "Next" pointers of 72 * all nodes except BRANCH implement concatenation; a "next" pointer with 73 * a BRANCH on both ends of it is connecting two alternatives. (Here we 74 * have one of the subtle syntax dependencies: an individual BRANCH (as 75 * opposed to a collection of them) is never concatenated with anything 76 * because of operator precedence.) The operand of some types of node is 77 * a literal string; for others, it is a node leading into a sub-FSM. In 78 * particular, the operand of a BRANCH node is the first node of the branch. 79 * (NB this is *not* a tree structure: the tail of the branch connects 80 * to the thing following the set of BRANCHes.) The opcodes are: 81 */ 82 83 /* definition number opnd? meaning */ 84 #define END 0 /* no End of program. */ 85 #define BOL 1 /* no Match "" at beginning of line. */ 86 #define EOL 2 /* no Match "" at end of line. */ 87 #define ANY 3 /* no Match any one character. */ 88 #define ANYOF 4 /* str Match any character in this string. */ 89 #define ANYBUT 5 /* str Match any character not in this string. */ 90 #define BRANCH 6 /* node Match this alternative, or the next... */ 91 #define BACK 7 /* no Match "", "next" ptr points backward. */ 92 #define EXACTLY 8 /* str Match this string. */ 93 #define NOTHING 9 /* no Match empty string. */ 94 #define STAR 10 /* node Match this (simple) thing 0 or more times. */ 95 #define PLUS 11 /* node Match this (simple) thing 1 or more times. */ 96 #define WORDA 12 /* no Match "" at wordchar, where prev is nonword */ 97 #define WORDZ 13 /* no Match "" at nonwordchar, where prev is word */ 98 #define OPEN 20 /* no Mark this point in input as start of #n. */ 99 /* OPEN+1 is number 1, etc. */ 100 #define CLOSE 30 /* no Analogous to OPEN. */ 101 102 /* 103 * Opcode notes: 104 * 105 * BRANCH The set of branches constituting a single choice are hooked 106 * together with their "next" pointers, since precedence prevents 107 * anything being concatenated to any individual branch. The 108 * "next" pointer of the last BRANCH in a choice points to the 109 * thing following the whole choice. This is also where the 110 * final "next" pointer of each individual branch points; each 111 * branch starts with the operand node of a BRANCH node. 112 * 113 * BACK Normal "next" pointers all implicitly point forward; BACK 114 * exists to make loop structures possible. 115 * 116 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular 117 * BRANCH structures using BACK. Simple cases (one character 118 * per match) are implemented with STAR and PLUS for speed 119 * and to minimize recursive plunges. 120 * 121 * OPEN,CLOSE ...are numbered at compile time. 122 */ 123 124 /* 125 * A node is one char of opcode followed by two chars of "next" pointer. 126 * "Next" pointers are stored as two 8-bit pieces, high order first. The 127 * value is a positive offset from the opcode of the node containing it. 128 * An operand, if any, simply follows the node. (Note that much of the 129 * code generation knows about this implicit relationship.) 130 * 131 * Using two bytes for the "next" pointer is vast overkill for most things, 132 * but allows patterns to get big without disasters. 133 */ 134 #define OP(p) (*(p)) 135 #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377)) 136 #define OPERAND(p) ((p) + 3) 137 138 /* 139 * See regmagic.h for one further detail of program structure. 140 */ 141 142 143 /* 144 * Utility definitions. 145 */ 146 #ifndef CHARBITS 147 #define UCHARAT(p) ((int)*(unsigned char *)(p)) 148 #else 149 #define UCHARAT(p) ((int)*(p)&CHARBITS) 150 #endif 151 152 #define FAIL(m) { regerror(m); return(NULL); } 153 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?') 154 155 /* 156 * Flags to be passed up and down. 157 */ 158 #define HASWIDTH 01 /* Known never to match null string. */ 159 #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */ 160 #define SPSTART 04 /* Starts with * or +. */ 161 #define WORST 0 /* Worst case. */ 162 163 /* 164 * Global work variables for regcomp(). 165 */ 166 static char *regparse; /* Input-scan pointer. */ 167 static int regnpar; /* () count. */ 168 static char regdummy; 169 static char *regcode; /* Code-emit pointer; ®dummy = don't. */ 170 static long regsize; /* Code size. */ 171 172 /* 173 * Forward declarations for regcomp()'s friends. 174 */ 175 #ifndef STATIC 176 #define STATIC static 177 #endif 178 STATIC char *reg(); 179 STATIC char *regbranch(); 180 STATIC char *regpiece(); 181 STATIC char *regatom(); 182 STATIC char *regnode(); 183 STATIC char *regnext(); 184 STATIC void regc(); 185 STATIC void reginsert(); 186 STATIC void regtail(); 187 STATIC void regoptail(); 188 #ifdef STRCSPN 189 STATIC int strcspn(); 190 #endif 191 192 /* 193 - regcomp - compile a regular expression into internal code 194 * 195 * We can't allocate space until we know how big the compiled form will be, 196 * but we can't compile it (and thus know how big it is) until we've got a 197 * place to put the code. So we cheat: we compile it twice, once with code 198 * generation turned off and size counting turned on, and once "for real". 199 * This also means that we don't allocate space until we are sure that the 200 * thing really will compile successfully, and we never have to move the 201 * code and thus invalidate pointers into it. (Note that it has to be in 202 * one piece because free() must be able to free it all.) 203 * 204 * Beware that the optimization-preparation code in here knows about some 205 * of the structure of the compiled regexp. 206 */ 207 regexp * 208 regcomp(exp) 209 const char *exp; 210 { 211 register regexp *r; 212 register char *scan; 213 register char *longest; 214 register int len; 215 int flags; 216 217 if (exp == NULL) 218 FAIL("NULL argument"); 219 220 /* First pass: determine size, legality. */ 221 #ifdef notdef 222 if (exp[0] == '.' && exp[1] == '*') exp += 2; /* aid grep */ 223 #endif 224 regparse = (char *)exp; 225 regnpar = 1; 226 regsize = 0L; 227 regcode = ®dummy; 228 regc(MAGIC); 229 if (reg(0, &flags) == NULL) 230 return(NULL); 231 232 /* Small enough for pointer-storage convention? */ 233 if (regsize >= 32767L) /* Probably could be 65535L. */ 234 FAIL("regexp too big"); 235 236 /* Allocate space. */ 237 r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize); 238 if (r == NULL) 239 FAIL("out of space"); 240 241 /* Second pass: emit code. */ 242 regparse = (char *)exp; 243 regnpar = 1; 244 regcode = r->program; 245 regc(MAGIC); 246 if (reg(0, &flags) == NULL) 247 return(NULL); 248 249 /* Dig out information for optimizations. */ 250 r->regstart = '\0'; /* Worst-case defaults. */ 251 r->reganch = 0; 252 r->regmust = NULL; 253 r->regmlen = 0; 254 scan = r->program+1; /* First BRANCH. */ 255 if (OP(regnext(scan)) == END) { /* Only one top-level choice. */ 256 scan = OPERAND(scan); 257 258 /* Starting-point info. */ 259 if (OP(scan) == EXACTLY) 260 r->regstart = *OPERAND(scan); 261 else if (OP(scan) == BOL) 262 r->reganch++; 263 264 /* 265 * If there's something expensive in the r.e., find the 266 * longest literal string that must appear and make it the 267 * regmust. Resolve ties in favor of later strings, since 268 * the regstart check works with the beginning of the r.e. 269 * and avoiding duplication strengthens checking. Not a 270 * strong reason, but sufficient in the absence of others. 271 */ 272 if (flags&SPSTART) { 273 longest = NULL; 274 len = 0; 275 for (; scan != NULL; scan = regnext(scan)) 276 if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) { 277 longest = OPERAND(scan); 278 len = strlen(OPERAND(scan)); 279 } 280 r->regmust = longest; 281 r->regmlen = len; 282 } 283 } 284 285 return(r); 286 } 287 288 /* 289 - reg - regular expression, i.e. main body or parenthesized thing 290 * 291 * Caller must absorb opening parenthesis. 292 * 293 * Combining parenthesis handling with the base level of regular expression 294 * is a trifle forced, but the need to tie the tails of the branches to what 295 * follows makes it hard to avoid. 296 */ 297 static char * 298 reg(paren, flagp) 299 int paren; /* Parenthesized? */ 300 int *flagp; 301 { 302 register char *ret; 303 register char *br; 304 register char *ender; 305 register int parno; 306 int flags; 307 308 *flagp = HASWIDTH; /* Tentatively. */ 309 310 /* Make an OPEN node, if parenthesized. */ 311 if (paren) { 312 if (regnpar >= NSUBEXP) 313 FAIL("too many ()"); 314 parno = regnpar; 315 regnpar++; 316 ret = regnode(OPEN+parno); 317 } else 318 ret = NULL; 319 320 /* Pick up the branches, linking them together. */ 321 br = regbranch(&flags); 322 if (br == NULL) 323 return(NULL); 324 if (ret != NULL) 325 regtail(ret, br); /* OPEN -> first. */ 326 else 327 ret = br; 328 if (!(flags&HASWIDTH)) 329 *flagp &= ~HASWIDTH; 330 *flagp |= flags&SPSTART; 331 while (*regparse == '|' || *regparse == '\n') { 332 regparse++; 333 br = regbranch(&flags); 334 if (br == NULL) 335 return(NULL); 336 regtail(ret, br); /* BRANCH -> BRANCH. */ 337 if (!(flags&HASWIDTH)) 338 *flagp &= ~HASWIDTH; 339 *flagp |= flags&SPSTART; 340 } 341 342 /* Make a closing node, and hook it on the end. */ 343 ender = regnode((paren) ? CLOSE+parno : END); 344 regtail(ret, ender); 345 346 /* Hook the tails of the branches to the closing node. */ 347 for (br = ret; br != NULL; br = regnext(br)) 348 regoptail(br, ender); 349 350 /* Check for proper termination. */ 351 if (paren && *regparse++ != ')') { 352 FAIL("unmatched ()"); 353 } else if (!paren && *regparse != '\0') { 354 if (*regparse == ')') { 355 FAIL("unmatched ()"); 356 } else 357 FAIL("junk on end"); /* "Can't happen". */ 358 /* NOTREACHED */ 359 } 360 361 return(ret); 362 } 363 364 /* 365 - regbranch - one alternative of an | operator 366 * 367 * Implements the concatenation operator. 368 */ 369 static char * 370 regbranch(flagp) 371 int *flagp; 372 { 373 register char *ret; 374 register char *chain; 375 register char *latest; 376 int flags; 377 378 *flagp = WORST; /* Tentatively. */ 379 380 ret = regnode(BRANCH); 381 chain = NULL; 382 while (*regparse != '\0' && *regparse != ')' && 383 *regparse != '\n' && *regparse != '|') { 384 latest = regpiece(&flags); 385 if (latest == NULL) 386 return(NULL); 387 *flagp |= flags&HASWIDTH; 388 if (chain == NULL) /* First piece. */ 389 *flagp |= flags&SPSTART; 390 else 391 regtail(chain, latest); 392 chain = latest; 393 } 394 if (chain == NULL) /* Loop ran zero times. */ 395 (void) regnode(NOTHING); 396 397 return(ret); 398 } 399 400 /* 401 - regpiece - something followed by possible [*+?] 402 * 403 * Note that the branching code sequences used for ? and the general cases 404 * of * and + are somewhat optimized: they use the same NOTHING node as 405 * both the endmarker for their branch list and the body of the last branch. 406 * It might seem that this node could be dispensed with entirely, but the 407 * endmarker role is not redundant. 408 */ 409 static char * 410 regpiece(flagp) 411 int *flagp; 412 { 413 register char *ret; 414 register char op; 415 register char *next; 416 int flags; 417 418 ret = regatom(&flags); 419 if (ret == NULL) 420 return(NULL); 421 422 op = *regparse; 423 if (!ISMULT(op)) { 424 *flagp = flags; 425 return(ret); 426 } 427 428 if (!(flags&HASWIDTH) && op != '?') 429 FAIL("*+ operand could be empty"); 430 *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH); 431 432 if (op == '*' && (flags&SIMPLE)) 433 reginsert(STAR, ret); 434 else if (op == '*') { 435 /* Emit x* as (x&|), where & means "self". */ 436 reginsert(BRANCH, ret); /* Either x */ 437 regoptail(ret, regnode(BACK)); /* and loop */ 438 regoptail(ret, ret); /* back */ 439 regtail(ret, regnode(BRANCH)); /* or */ 440 regtail(ret, regnode(NOTHING)); /* null. */ 441 } else if (op == '+' && (flags&SIMPLE)) 442 reginsert(PLUS, ret); 443 else if (op == '+') { 444 /* Emit x+ as x(&|), where & means "self". */ 445 next = regnode(BRANCH); /* Either */ 446 regtail(ret, next); 447 regtail(regnode(BACK), ret); /* loop back */ 448 regtail(next, regnode(BRANCH)); /* or */ 449 regtail(ret, regnode(NOTHING)); /* null. */ 450 } else if (op == '?') { 451 /* Emit x? as (x|) */ 452 reginsert(BRANCH, ret); /* Either x */ 453 regtail(ret, regnode(BRANCH)); /* or */ 454 next = regnode(NOTHING); /* null. */ 455 regtail(ret, next); 456 regoptail(ret, next); 457 } 458 regparse++; 459 if (ISMULT(*regparse)) 460 FAIL("nested *?+"); 461 462 return(ret); 463 } 464 465 /* 466 - regatom - the lowest level 467 * 468 * Optimization: gobbles an entire sequence of ordinary characters so that 469 * it can turn them into a single node, which is smaller to store and 470 * faster to run. Backslashed characters are exceptions, each becoming a 471 * separate node; the code is simpler that way and it's not worth fixing. 472 */ 473 static char * 474 regatom(flagp) 475 int *flagp; 476 { 477 register char *ret; 478 int flags; 479 480 *flagp = WORST; /* Tentatively. */ 481 482 switch (*regparse++) { 483 /* FIXME: these chars only have meaning at beg/end of pat? */ 484 case '^': 485 ret = regnode(BOL); 486 break; 487 case '$': 488 ret = regnode(EOL); 489 break; 490 case '.': 491 ret = regnode(ANY); 492 *flagp |= HASWIDTH|SIMPLE; 493 break; 494 case '[': { 495 register int class; 496 register int classend; 497 498 if (*regparse == '^') { /* Complement of range. */ 499 ret = regnode(ANYBUT); 500 regparse++; 501 } else 502 ret = regnode(ANYOF); 503 if (*regparse == ']' || *regparse == '-') 504 regc(*regparse++); 505 while (*regparse != '\0' && *regparse != ']') { 506 if (*regparse == '-') { 507 regparse++; 508 if (*regparse == ']' || *regparse == '\0') 509 regc('-'); 510 else { 511 class = UCHARAT(regparse-2)+1; 512 classend = UCHARAT(regparse); 513 if (class > classend+1) 514 FAIL("invalid [] range"); 515 for (; class <= classend; class++) 516 regc(class); 517 regparse++; 518 } 519 } else 520 regc(*regparse++); 521 } 522 regc('\0'); 523 if (*regparse != ']') 524 FAIL("unmatched []"); 525 regparse++; 526 *flagp |= HASWIDTH|SIMPLE; 527 } 528 break; 529 case '(': 530 ret = reg(1, &flags); 531 if (ret == NULL) 532 return(NULL); 533 *flagp |= flags&(HASWIDTH|SPSTART); 534 break; 535 case '\0': 536 case '|': 537 case '\n': 538 case ')': 539 FAIL("internal urp"); /* Supposed to be caught earlier. */ 540 break; 541 case '?': 542 case '+': 543 case '*': 544 FAIL("?+* follows nothing"); 545 break; 546 case '\\': 547 switch (*regparse++) { 548 case '\0': 549 FAIL("trailing \\"); 550 break; 551 case '<': 552 ret = regnode(WORDA); 553 break; 554 case '>': 555 ret = regnode(WORDZ); 556 break; 557 /* FIXME: Someday handle \1, \2, ... */ 558 default: 559 /* Handle general quoted chars in exact-match routine */ 560 goto de_fault; 561 } 562 break; 563 de_fault: 564 default: 565 /* 566 * Encode a string of characters to be matched exactly. 567 * 568 * This is a bit tricky due to quoted chars and due to 569 * '*', '+', and '?' taking the SINGLE char previous 570 * as their operand. 571 * 572 * On entry, the char at regparse[-1] is going to go 573 * into the string, no matter what it is. (It could be 574 * following a \ if we are entered from the '\' case.) 575 * 576 * Basic idea is to pick up a good char in ch and 577 * examine the next char. If it's *+? then we twiddle. 578 * If it's \ then we frozzle. If it's other magic char 579 * we push ch and terminate the string. If none of the 580 * above, we push ch on the string and go around again. 581 * 582 * regprev is used to remember where "the current char" 583 * starts in the string, if due to a *+? we need to back 584 * up and put the current char in a separate, 1-char, string. 585 * When regprev is NULL, ch is the only char in the 586 * string; this is used in *+? handling, and in setting 587 * flags |= SIMPLE at the end. 588 */ 589 { 590 char *regprev; 591 register char ch; 592 593 regparse--; /* Look at cur char */ 594 ret = regnode(EXACTLY); 595 for ( regprev = 0 ; ; ) { 596 ch = *regparse++; /* Get current char */ 597 switch (*regparse) { /* look at next one */ 598 599 default: 600 regc(ch); /* Add cur to string */ 601 break; 602 603 case '.': case '[': case '(': 604 case ')': case '|': case '\n': 605 case '$': case '^': 606 case '\0': 607 /* FIXME, $ and ^ should not always be magic */ 608 magic: 609 regc(ch); /* dump cur char */ 610 goto done; /* and we are done */ 611 612 case '?': case '+': case '*': 613 if (!regprev) /* If just ch in str, */ 614 goto magic; /* use it */ 615 /* End mult-char string one early */ 616 regparse = regprev; /* Back up parse */ 617 goto done; 618 619 case '\\': 620 regc(ch); /* Cur char OK */ 621 switch (regparse[1]){ /* Look after \ */ 622 case '\0': 623 case '<': 624 case '>': 625 /* FIXME: Someday handle \1, \2, ... */ 626 goto done; /* Not quoted */ 627 default: 628 /* Backup point is \, scan * point is after it. */ 629 regprev = regparse; 630 regparse++; 631 continue; /* NOT break; */ 632 } 633 } 634 regprev = regparse; /* Set backup point */ 635 } 636 done: 637 regc('\0'); 638 *flagp |= HASWIDTH; 639 if (!regprev) /* One char? */ 640 *flagp |= SIMPLE; 641 } 642 break; 643 } 644 645 return(ret); 646 } 647 648 /* 649 - regnode - emit a node 650 */ 651 static char * /* Location. */ 652 regnode(op) 653 char op; 654 { 655 register char *ret; 656 register char *ptr; 657 658 ret = regcode; 659 if (ret == ®dummy) { 660 regsize += 3; 661 return(ret); 662 } 663 664 ptr = ret; 665 *ptr++ = op; 666 *ptr++ = '\0'; /* Null "next" pointer. */ 667 *ptr++ = '\0'; 668 regcode = ptr; 669 670 return(ret); 671 } 672 673 /* 674 - regc - emit (if appropriate) a byte of code 675 */ 676 static void 677 regc(b) 678 char b; 679 { 680 if (regcode != ®dummy) 681 *regcode++ = b; 682 else 683 regsize++; 684 } 685 686 /* 687 - reginsert - insert an operator in front of already-emitted operand 688 * 689 * Means relocating the operand. 690 */ 691 static void 692 reginsert(op, opnd) 693 char op; 694 char *opnd; 695 { 696 register char *src; 697 register char *dst; 698 register char *place; 699 700 if (regcode == ®dummy) { 701 regsize += 3; 702 return; 703 } 704 705 src = regcode; 706 regcode += 3; 707 dst = regcode; 708 while (src > opnd) 709 *--dst = *--src; 710 711 place = opnd; /* Op node, where operand used to be. */ 712 *place++ = op; 713 *place++ = '\0'; 714 *place++ = '\0'; 715 } 716 717 /* 718 - regtail - set the next-pointer at the end of a node chain 719 */ 720 static void 721 regtail(p, val) 722 char *p; 723 char *val; 724 { 725 register char *scan; 726 register char *temp; 727 register int offset; 728 729 if (p == ®dummy) 730 return; 731 732 /* Find last node. */ 733 scan = p; 734 for (;;) { 735 temp = regnext(scan); 736 if (temp == NULL) 737 break; 738 scan = temp; 739 } 740 741 if (OP(scan) == BACK) 742 offset = scan - val; 743 else 744 offset = val - scan; 745 *(scan+1) = (offset>>8)&0377; 746 *(scan+2) = offset&0377; 747 } 748 749 /* 750 - regoptail - regtail on operand of first argument; nop if operandless 751 */ 752 static void 753 regoptail(p, val) 754 char *p; 755 char *val; 756 { 757 /* "Operandless" and "op != BRANCH" are synonymous in practice. */ 758 if (p == NULL || p == ®dummy || OP(p) != BRANCH) 759 return; 760 regtail(OPERAND(p), val); 761 } 762 763 /* 764 * regexec and friends 765 */ 766 767 /* 768 * Global work variables for regexec(). 769 */ 770 static char *reginput; /* String-input pointer. */ 771 static char *regbol; /* Beginning of input, for ^ check. */ 772 static char **regstartp; /* Pointer to startp array. */ 773 static char **regendp; /* Ditto for endp. */ 774 775 /* 776 * Forwards. 777 */ 778 STATIC int regtry(); 779 STATIC int regmatch(); 780 STATIC int regrepeat(); 781 782 #ifdef DEBUG 783 int regnarrate = 0; 784 void regdump(); 785 STATIC char *regprop(); 786 #endif 787 788 /* 789 - regexec - match a regexp against a string 790 */ 791 int 792 regexec(prog, string) 793 register const regexp *prog; 794 register const char *string; 795 { 796 register char *s; 797 extern char *strchr(); 798 799 /* Be paranoid... */ 800 if (prog == NULL || string == NULL) { 801 regerror("NULL parameter"); 802 return(0); 803 } 804 805 /* Check validity of program. */ 806 if (UCHARAT(prog->program) != MAGIC) { 807 regerror("corrupted program"); 808 return(0); 809 } 810 811 /* If there is a "must appear" string, look for it. */ 812 if (prog->regmust != NULL) { 813 s = (char *)string; 814 while ((s = strchr(s, prog->regmust[0])) != NULL) { 815 if (strncmp(s, prog->regmust, prog->regmlen) == 0) 816 break; /* Found it. */ 817 s++; 818 } 819 if (s == NULL) /* Not present. */ 820 return(0); 821 } 822 823 /* Mark beginning of line for ^ . */ 824 regbol = (char *)string; 825 826 /* Simplest case: anchored match need be tried only once. */ 827 if (prog->reganch) 828 return(regtry(prog, string)); 829 830 /* Messy cases: unanchored match. */ 831 s = (char *)string; 832 if (prog->regstart != '\0') 833 /* We know what char it must start with. */ 834 while ((s = strchr(s, prog->regstart)) != NULL) { 835 if (regtry(prog, s)) 836 return(1); 837 s++; 838 } 839 else 840 /* We don't -- general case. */ 841 do { 842 if (regtry(prog, s)) 843 return(1); 844 } while (*s++ != '\0'); 845 846 /* Failure. */ 847 return(0); 848 } 849 850 /* 851 - regtry - try match at specific point 852 */ 853 static int /* 0 failure, 1 success */ 854 regtry(prog, string) 855 regexp *prog; 856 char *string; 857 { 858 register int i; 859 register char **sp; 860 register char **ep; 861 862 reginput = string; 863 regstartp = prog->startp; 864 regendp = prog->endp; 865 866 sp = prog->startp; 867 ep = prog->endp; 868 for (i = NSUBEXP; i > 0; i--) { 869 *sp++ = NULL; 870 *ep++ = NULL; 871 } 872 if (regmatch(prog->program + 1)) { 873 prog->startp[0] = string; 874 prog->endp[0] = reginput; 875 return(1); 876 } else 877 return(0); 878 } 879 880 /* 881 - regmatch - main matching routine 882 * 883 * Conceptually the strategy is simple: check to see whether the current 884 * node matches, call self recursively to see whether the rest matches, 885 * and then act accordingly. In practice we make some effort to avoid 886 * recursion, in particular by going through "ordinary" nodes (that don't 887 * need to know whether the rest of the match failed) by a loop instead of 888 * by recursion. 889 */ 890 static int /* 0 failure, 1 success */ 891 regmatch(prog) 892 char *prog; 893 { 894 register char *scan; /* Current node. */ 895 char *next; /* Next node. */ 896 extern char *strchr(); 897 898 scan = prog; 899 #ifdef DEBUG 900 if (scan != NULL && regnarrate) 901 fprintf(stderr, "%s(\n", regprop(scan)); 902 #endif 903 while (scan != NULL) { 904 #ifdef DEBUG 905 if (regnarrate) 906 fprintf(stderr, "%s...\n", regprop(scan)); 907 #endif 908 next = regnext(scan); 909 910 switch (OP(scan)) { 911 case BOL: 912 if (reginput != regbol) 913 return(0); 914 break; 915 case EOL: 916 if (*reginput != '\0') 917 return(0); 918 break; 919 case WORDA: 920 /* Must be looking at a letter, digit, or _ */ 921 if ((!isalnum(*reginput)) && *reginput != '_') 922 return(0); 923 /* Prev must be BOL or nonword */ 924 if (reginput > regbol && 925 (isalnum(reginput[-1]) || reginput[-1] == '_')) 926 return(0); 927 break; 928 case WORDZ: 929 /* Must be looking at non letter, digit, or _ */ 930 if (isalnum(*reginput) || *reginput == '_') 931 return(0); 932 /* We don't care what the previous char was */ 933 break; 934 case ANY: 935 if (*reginput == '\0') 936 return(0); 937 reginput++; 938 break; 939 case EXACTLY: { 940 register int len; 941 register char *opnd; 942 943 opnd = OPERAND(scan); 944 /* Inline the first character, for speed. */ 945 if (*opnd != *reginput) 946 return(0); 947 len = strlen(opnd); 948 if (len > 1 && strncmp(opnd, reginput, len) != 0) 949 return(0); 950 reginput += len; 951 } 952 break; 953 case ANYOF: 954 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL) 955 return(0); 956 reginput++; 957 break; 958 case ANYBUT: 959 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL) 960 return(0); 961 reginput++; 962 break; 963 case NOTHING: 964 break; 965 case BACK: 966 break; 967 case OPEN+1: 968 case OPEN+2: 969 case OPEN+3: 970 case OPEN+4: 971 case OPEN+5: 972 case OPEN+6: 973 case OPEN+7: 974 case OPEN+8: 975 case OPEN+9: { 976 register int no; 977 register char *save; 978 979 no = OP(scan) - OPEN; 980 save = reginput; 981 982 if (regmatch(next)) { 983 /* 984 * Don't set startp if some later 985 * invocation of the same parentheses 986 * already has. 987 */ 988 if (regstartp[no] == NULL) 989 regstartp[no] = save; 990 return(1); 991 } else 992 return(0); 993 } 994 break; 995 case CLOSE+1: 996 case CLOSE+2: 997 case CLOSE+3: 998 case CLOSE+4: 999 case CLOSE+5: 1000 case CLOSE+6: 1001 case CLOSE+7: 1002 case CLOSE+8: 1003 case CLOSE+9: { 1004 register int no; 1005 register char *save; 1006 1007 no = OP(scan) - CLOSE; 1008 save = reginput; 1009 1010 if (regmatch(next)) { 1011 /* 1012 * Don't set endp if some later 1013 * invocation of the same parentheses 1014 * already has. 1015 */ 1016 if (regendp[no] == NULL) 1017 regendp[no] = save; 1018 return(1); 1019 } else 1020 return(0); 1021 } 1022 break; 1023 case BRANCH: { 1024 register char *save; 1025 1026 if (OP(next) != BRANCH) /* No choice. */ 1027 next = OPERAND(scan); /* Avoid recursion. */ 1028 else { 1029 do { 1030 save = reginput; 1031 if (regmatch(OPERAND(scan))) 1032 return(1); 1033 reginput = save; 1034 scan = regnext(scan); 1035 } while (scan != NULL && OP(scan) == BRANCH); 1036 return(0); 1037 /* NOTREACHED */ 1038 } 1039 } 1040 break; 1041 case STAR: 1042 case PLUS: { 1043 register char nextch; 1044 register int no; 1045 register char *save; 1046 register int min; 1047 1048 /* 1049 * Lookahead to avoid useless match attempts 1050 * when we know what character comes next. 1051 */ 1052 nextch = '\0'; 1053 if (OP(next) == EXACTLY) 1054 nextch = *OPERAND(next); 1055 min = (OP(scan) == STAR) ? 0 : 1; 1056 save = reginput; 1057 no = regrepeat(OPERAND(scan)); 1058 while (no >= min) { 1059 /* If it could work, try it. */ 1060 if (nextch == '\0' || *reginput == nextch) 1061 if (regmatch(next)) 1062 return(1); 1063 /* Couldn't or didn't -- back up. */ 1064 no--; 1065 reginput = save + no; 1066 } 1067 return(0); 1068 } 1069 break; 1070 case END: 1071 return(1); /* Success! */ 1072 break; 1073 default: 1074 regerror("memory corruption"); 1075 return(0); 1076 break; 1077 } 1078 1079 scan = next; 1080 } 1081 1082 /* 1083 * We get here only if there's trouble -- normally "case END" is 1084 * the terminating point. 1085 */ 1086 regerror("corrupted pointers"); 1087 return(0); 1088 } 1089 1090 /* 1091 - regrepeat - repeatedly match something simple, report how many 1092 */ 1093 static int 1094 regrepeat(p) 1095 char *p; 1096 { 1097 register int count = 0; 1098 register char *scan; 1099 register char *opnd; 1100 1101 scan = reginput; 1102 opnd = OPERAND(p); 1103 switch (OP(p)) { 1104 case ANY: 1105 count = strlen(scan); 1106 scan += count; 1107 break; 1108 case EXACTLY: 1109 while (*opnd == *scan) { 1110 count++; 1111 scan++; 1112 } 1113 break; 1114 case ANYOF: 1115 while (*scan != '\0' && strchr(opnd, *scan) != NULL) { 1116 count++; 1117 scan++; 1118 } 1119 break; 1120 case ANYBUT: 1121 while (*scan != '\0' && strchr(opnd, *scan) == NULL) { 1122 count++; 1123 scan++; 1124 } 1125 break; 1126 default: /* Oh dear. Called inappropriately. */ 1127 regerror("internal foulup"); 1128 count = 0; /* Best compromise. */ 1129 break; 1130 } 1131 reginput = scan; 1132 1133 return(count); 1134 } 1135 1136 /* 1137 - regnext - dig the "next" pointer out of a node 1138 */ 1139 static char * 1140 regnext(p) 1141 register char *p; 1142 { 1143 register int offset; 1144 1145 if (p == ®dummy) 1146 return(NULL); 1147 1148 offset = NEXT(p); 1149 if (offset == 0) 1150 return(NULL); 1151 1152 if (OP(p) == BACK) 1153 return(p-offset); 1154 else 1155 return(p+offset); 1156 } 1157 1158 #ifdef DEBUG 1159 1160 STATIC char *regprop(); 1161 1162 /* 1163 - regdump - dump a regexp onto stdout in vaguely comprehensible form 1164 */ 1165 void 1166 regdump(r) 1167 regexp *r; 1168 { 1169 register char *s; 1170 register char op = EXACTLY; /* Arbitrary non-END op. */ 1171 register char *next; 1172 extern char *strchr(); 1173 1174 1175 s = r->program + 1; 1176 while (op != END) { /* While that wasn't END last time... */ 1177 op = OP(s); 1178 printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */ 1179 next = regnext(s); 1180 if (next == NULL) /* Next ptr. */ 1181 printf("(0)"); 1182 else 1183 printf("(%d)", (s-r->program)+(next-s)); 1184 s += 3; 1185 if (op == ANYOF || op == ANYBUT || op == EXACTLY) { 1186 /* Literal string, where present. */ 1187 while (*s != '\0') { 1188 putchar(*s); 1189 s++; 1190 } 1191 s++; 1192 } 1193 putchar('\n'); 1194 } 1195 1196 /* Header fields of interest. */ 1197 if (r->regstart != '\0') 1198 printf("start `%c' ", r->regstart); 1199 if (r->reganch) 1200 printf("anchored "); 1201 if (r->regmust != NULL) 1202 printf("must have \"%s\"", r->regmust); 1203 printf("\n"); 1204 } 1205 1206 /* 1207 - regprop - printable representation of opcode 1208 */ 1209 static char * 1210 regprop(op) 1211 char *op; 1212 { 1213 register char *p; 1214 static char buf[50]; 1215 1216 (void) strcpy(buf, ":"); 1217 1218 switch (OP(op)) { 1219 case BOL: 1220 p = "BOL"; 1221 break; 1222 case EOL: 1223 p = "EOL"; 1224 break; 1225 case ANY: 1226 p = "ANY"; 1227 break; 1228 case ANYOF: 1229 p = "ANYOF"; 1230 break; 1231 case ANYBUT: 1232 p = "ANYBUT"; 1233 break; 1234 case BRANCH: 1235 p = "BRANCH"; 1236 break; 1237 case EXACTLY: 1238 p = "EXACTLY"; 1239 break; 1240 case NOTHING: 1241 p = "NOTHING"; 1242 break; 1243 case BACK: 1244 p = "BACK"; 1245 break; 1246 case END: 1247 p = "END"; 1248 break; 1249 case OPEN+1: 1250 case OPEN+2: 1251 case OPEN+3: 1252 case OPEN+4: 1253 case OPEN+5: 1254 case OPEN+6: 1255 case OPEN+7: 1256 case OPEN+8: 1257 case OPEN+9: 1258 sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN); 1259 p = NULL; 1260 break; 1261 case CLOSE+1: 1262 case CLOSE+2: 1263 case CLOSE+3: 1264 case CLOSE+4: 1265 case CLOSE+5: 1266 case CLOSE+6: 1267 case CLOSE+7: 1268 case CLOSE+8: 1269 case CLOSE+9: 1270 sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE); 1271 p = NULL; 1272 break; 1273 case STAR: 1274 p = "STAR"; 1275 break; 1276 case PLUS: 1277 p = "PLUS"; 1278 break; 1279 case WORDA: 1280 p = "WORDA"; 1281 break; 1282 case WORDZ: 1283 p = "WORDZ"; 1284 break; 1285 default: 1286 regerror("corrupted opcode"); 1287 break; 1288 } 1289 if (p != NULL) 1290 (void) strcat(buf, p); 1291 return(buf); 1292 } 1293 #endif 1294 1295 /* 1296 * The following is provided for those people who do not have strcspn() in 1297 * their C libraries. They should get off their butts and do something 1298 * about it; at least one public-domain implementation of those (highly 1299 * useful) string routines has been published on Usenet. 1300 */ 1301 #ifdef STRCSPN 1302 /* 1303 * strcspn - find length of initial segment of s1 consisting entirely 1304 * of characters not from s2 1305 */ 1306 1307 static int 1308 strcspn(s1, s2) 1309 char *s1; 1310 char *s2; 1311 { 1312 register char *scan1; 1313 register char *scan2; 1314 register int count; 1315 1316 count = 0; 1317 for (scan1 = s1; *scan1 != '\0'; scan1++) { 1318 for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */ 1319 if (*scan1 == *scan2++) 1320 return(count); 1321 count++; 1322 } 1323 return(count); 1324 } 1325 #endif 1326