1 /* 2 * regcomp and regexec -- regsub and regerror are elsewhere 3 * 4 * Copyright (c) 1986 by University of Toronto. 5 * Written by Henry Spencer. Not derived from licensed software. 6 * 7 * Permission is granted to anyone to use this software for any 8 * purpose on any computer system, and to redistribute it freely, 9 * subject to the following restrictions: 10 * 11 * 1. The author is not responsible for the consequences of use of 12 * this software, no matter how awful, even if they arise 13 * from defects in it. 14 * 15 * 2. The origin of this software must not be misrepresented, either 16 * by explicit claim or by omission. 17 * 18 * 3. Altered versions must be plainly marked as such, and must not 19 * be misrepresented as being the original software. 20 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, 21 *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to | 22 *** to assist in implementing egrep. 23 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, 24 *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching 25 *** as in BSD grep and ex. 26 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, 27 *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \. 28 *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods, 29 *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy. 30 * 31 * Beware that some of this code is subtly aware of the way operator 32 * precedence is structured in regular expressions. Serious changes in 33 * regular-expression syntax might require a total rethink. 34 */ 35 36 #include <sys/cdefs.h> 37 #ifndef lint 38 __RCSID("$NetBSD: regexp.c,v 1.7 1997/10/09 10:21:18 lukem Exp $"); 39 #endif /* not lint */ 40 41 #include <regexp.h> 42 #include <stdio.h> 43 #include <ctype.h> 44 #include <stdlib.h> 45 #include <string.h> 46 #include "regmagic.h" 47 48 /* 49 * The "internal use only" fields in regexp.h are present to pass info from 50 * compile to execute that permits the execute phase to run lots faster on 51 * simple cases. They are: 52 * 53 * regstart char that must begin a match; '\0' if none obvious 54 * reganch is the match anchored (at beginning-of-line only)? 55 * regmust string (pointer into program) that match must include, or NULL 56 * regmlen length of regmust string 57 * 58 * Regstart and reganch permit very fast decisions on suitable starting points 59 * for a match, cutting down the work a lot. Regmust permits fast rejection 60 * of lines that cannot possibly match. The regmust tests are costly enough 61 * that regcomp() supplies a regmust only if the r.e. contains something 62 * potentially expensive (at present, the only such thing detected is * or + 63 * at the start of the r.e., which can involve a lot of backup). Regmlen is 64 * supplied because the test in regexec() needs it and regcomp() is computing 65 * it anyway. 66 */ 67 68 /* 69 * Structure for regexp "program". This is essentially a linear encoding 70 * of a nondeterministic finite-state machine (aka syntax charts or 71 * "railroad normal form" in parsing technology). Each node is an opcode 72 * plus a "next" pointer, possibly plus an operand. "Next" pointers of 73 * all nodes except BRANCH implement concatenation; a "next" pointer with 74 * a BRANCH on both ends of it is connecting two alternatives. (Here we 75 * have one of the subtle syntax dependencies: an individual BRANCH (as 76 * opposed to a collection of them) is never concatenated with anything 77 * because of operator precedence.) The operand of some types of node is 78 * a literal string; for others, it is a node leading into a sub-FSM. In 79 * particular, the operand of a BRANCH node is the first node of the branch. 80 * (NB this is *not* a tree structure: the tail of the branch connects 81 * to the thing following the set of BRANCHes.) The opcodes are: 82 */ 83 84 /* definition number opnd? meaning */ 85 #define END 0 /* no End of program. */ 86 #define BOL 1 /* no Match "" at beginning of line. */ 87 #define EOL 2 /* no Match "" at end of line. */ 88 #define ANY 3 /* no Match any one character. */ 89 #define ANYOF 4 /* str Match any character in this string. */ 90 #define ANYBUT 5 /* str Match any character not in this string. */ 91 #define BRANCH 6 /* node Match this alternative, or the next... */ 92 #define BACK 7 /* no Match "", "next" ptr points backward. */ 93 #define EXACTLY 8 /* str Match this string. */ 94 #define NOTHING 9 /* no Match empty string. */ 95 #define STAR 10 /* node Match this (simple) thing 0 or more times. */ 96 #define PLUS 11 /* node Match this (simple) thing 1 or more times. */ 97 #define WORDA 12 /* no Match "" at wordchar, where prev is nonword */ 98 #define WORDZ 13 /* no Match "" at nonwordchar, where prev is word */ 99 #define OPEN 20 /* no Mark this point in input as start of #n. */ 100 /* OPEN+1 is number 1, etc. */ 101 #define CLOSE 30 /* no Analogous to OPEN. */ 102 103 /* 104 * Opcode notes: 105 * 106 * BRANCH The set of branches constituting a single choice are hooked 107 * together with their "next" pointers, since precedence prevents 108 * anything being concatenated to any individual branch. The 109 * "next" pointer of the last BRANCH in a choice points to the 110 * thing following the whole choice. This is also where the 111 * final "next" pointer of each individual branch points; each 112 * branch starts with the operand node of a BRANCH node. 113 * 114 * BACK Normal "next" pointers all implicitly point forward; BACK 115 * exists to make loop structures possible. 116 * 117 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular 118 * BRANCH structures using BACK. Simple cases (one character 119 * per match) are implemented with STAR and PLUS for speed 120 * and to minimize recursive plunges. 121 * 122 * OPEN,CLOSE ...are numbered at compile time. 123 */ 124 125 /* 126 * A node is one char of opcode followed by two chars of "next" pointer. 127 * "Next" pointers are stored as two 8-bit pieces, high order first. The 128 * value is a positive offset from the opcode of the node containing it. 129 * An operand, if any, simply follows the node. (Note that much of the 130 * code generation knows about this implicit relationship.) 131 * 132 * Using two bytes for the "next" pointer is vast overkill for most things, 133 * but allows patterns to get big without disasters. 134 */ 135 #define OP(p) (*(p)) 136 #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377)) 137 #define OPERAND(p) ((p) + 3) 138 139 /* 140 * See regmagic.h for one further detail of program structure. 141 */ 142 143 144 /* 145 * Utility definitions. 146 */ 147 #ifndef CHARBITS 148 #define UCHARAT(p) ((int)*(unsigned char *)(p)) 149 #else 150 #define UCHARAT(p) ((int)*(p)&CHARBITS) 151 #endif 152 153 #define FAIL(m) { regerror(m); return(NULL); } 154 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?') 155 156 /* 157 * Flags to be passed up and down. 158 */ 159 #define HASWIDTH 01 /* Known never to match null string. */ 160 #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */ 161 #define SPSTART 04 /* Starts with * or +. */ 162 #define WORST 0 /* Worst case. */ 163 164 /* 165 * Global work variables for regcomp(). 166 */ 167 static char *regparse; /* Input-scan pointer. */ 168 static int regnpar; /* () count. */ 169 static char regdummy; 170 static char *regcode; /* Code-emit pointer; ®dummy = don't. */ 171 static long regsize; /* Code size. */ 172 173 /* 174 * Forward declarations for regcomp()'s friends. 175 */ 176 #ifndef STATIC 177 #define STATIC static 178 #endif 179 STATIC char *reg __P((int, int *)); 180 STATIC char *regbranch __P((int *)); 181 STATIC char *regpiece __P((int *)); 182 STATIC char *regatom __P((int *)); 183 STATIC char *regnode __P((char)); 184 STATIC char *regnext __P((char *)); 185 STATIC void regc __P((char)); 186 STATIC void reginsert __P((char, char *)); 187 STATIC void regtail __P((char *, char *)); 188 STATIC void regoptail __P((char *, char *)); 189 #ifdef STRCSPN 190 STATIC int strcspn __P((char *, char *)); 191 #endif 192 193 /* 194 - regcomp - compile a regular expression into internal code 195 * 196 * We can't allocate space until we know how big the compiled form will be, 197 * but we can't compile it (and thus know how big it is) until we've got a 198 * place to put the code. So we cheat: we compile it twice, once with code 199 * generation turned off and size counting turned on, and once "for real". 200 * This also means that we don't allocate space until we are sure that the 201 * thing really will compile successfully, and we never have to move the 202 * code and thus invalidate pointers into it. (Note that it has to be in 203 * one piece because free() must be able to free it all.) 204 * 205 * Beware that the optimization-preparation code in here knows about some 206 * of the structure of the compiled regexp. 207 */ 208 regexp * 209 regcomp(exp) 210 const char *exp; 211 { 212 register regexp *r; 213 register char *scan; 214 register char *longest; 215 register int len; 216 int flags; 217 218 if (exp == NULL) 219 FAIL("NULL argument"); 220 221 /* First pass: determine size, legality. */ 222 #ifdef notdef 223 if (exp[0] == '.' && exp[1] == '*') exp += 2; /* aid grep */ 224 #endif 225 regparse = (char *)exp; 226 regnpar = 1; 227 regsize = 0L; 228 regcode = ®dummy; 229 regc(MAGIC); 230 if (reg(0, &flags) == NULL) 231 return(NULL); 232 233 /* Small enough for pointer-storage convention? */ 234 if (regsize >= 32767L) /* Probably could be 65535L. */ 235 FAIL("regexp too big"); 236 237 /* Allocate space. */ 238 r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize); 239 if (r == NULL) 240 FAIL("out of space"); 241 242 /* Second pass: emit code. */ 243 regparse = (char *)exp; 244 regnpar = 1; 245 regcode = r->program; 246 regc(MAGIC); 247 if (reg(0, &flags) == NULL) 248 return(NULL); 249 250 /* Dig out information for optimizations. */ 251 r->regstart = '\0'; /* Worst-case defaults. */ 252 r->reganch = 0; 253 r->regmust = NULL; 254 r->regmlen = 0; 255 scan = r->program+1; /* First BRANCH. */ 256 if (OP(regnext(scan)) == END) { /* Only one top-level choice. */ 257 scan = OPERAND(scan); 258 259 /* Starting-point info. */ 260 if (OP(scan) == EXACTLY) 261 r->regstart = *OPERAND(scan); 262 else if (OP(scan) == BOL) 263 r->reganch++; 264 265 /* 266 * If there's something expensive in the r.e., find the 267 * longest literal string that must appear and make it the 268 * regmust. Resolve ties in favor of later strings, since 269 * the regstart check works with the beginning of the r.e. 270 * and avoiding duplication strengthens checking. Not a 271 * strong reason, but sufficient in the absence of others. 272 */ 273 if (flags&SPSTART) { 274 longest = NULL; 275 len = 0; 276 for (; scan != NULL; scan = regnext(scan)) 277 if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) { 278 longest = OPERAND(scan); 279 len = strlen(OPERAND(scan)); 280 } 281 r->regmust = longest; 282 r->regmlen = len; 283 } 284 } 285 286 return(r); 287 } 288 289 /* 290 - reg - regular expression, i.e. main body or parenthesized thing 291 * 292 * Caller must absorb opening parenthesis. 293 * 294 * Combining parenthesis handling with the base level of regular expression 295 * is a trifle forced, but the need to tie the tails of the branches to what 296 * follows makes it hard to avoid. 297 */ 298 static char * 299 reg(paren, flagp) 300 int paren; /* Parenthesized? */ 301 int *flagp; 302 { 303 register char *ret; 304 register char *br; 305 register char *ender; 306 register int parno = 0; 307 int flags; 308 309 *flagp = HASWIDTH; /* Tentatively. */ 310 311 /* Make an OPEN node, if parenthesized. */ 312 if (paren) { 313 if (regnpar >= NSUBEXP) 314 FAIL("too many ()"); 315 parno = regnpar; 316 regnpar++; 317 ret = regnode(OPEN+parno); 318 } else 319 ret = NULL; 320 321 /* Pick up the branches, linking them together. */ 322 br = regbranch(&flags); 323 if (br == NULL) 324 return(NULL); 325 if (ret != NULL) 326 regtail(ret, br); /* OPEN -> first. */ 327 else 328 ret = br; 329 if (!(flags&HASWIDTH)) 330 *flagp &= ~HASWIDTH; 331 *flagp |= flags&SPSTART; 332 while (*regparse == '|' || *regparse == '\n') { 333 regparse++; 334 br = regbranch(&flags); 335 if (br == NULL) 336 return(NULL); 337 regtail(ret, br); /* BRANCH -> BRANCH. */ 338 if (!(flags&HASWIDTH)) 339 *flagp &= ~HASWIDTH; 340 *flagp |= flags&SPSTART; 341 } 342 343 /* Make a closing node, and hook it on the end. */ 344 ender = regnode((paren) ? CLOSE+parno : END); 345 regtail(ret, ender); 346 347 /* Hook the tails of the branches to the closing node. */ 348 for (br = ret; br != NULL; br = regnext(br)) 349 regoptail(br, ender); 350 351 /* Check for proper termination. */ 352 if (paren && *regparse++ != ')') { 353 FAIL("unmatched ()"); 354 } else if (!paren && *regparse != '\0') { 355 if (*regparse == ')') { 356 FAIL("unmatched ()"); 357 } else 358 FAIL("junk on end"); /* "Can't happen". */ 359 /* NOTREACHED */ 360 } 361 362 return(ret); 363 } 364 365 /* 366 - regbranch - one alternative of an | operator 367 * 368 * Implements the concatenation operator. 369 */ 370 static char * 371 regbranch(flagp) 372 int *flagp; 373 { 374 register char *ret; 375 register char *chain; 376 register char *latest; 377 int flags; 378 379 *flagp = WORST; /* Tentatively. */ 380 381 ret = regnode(BRANCH); 382 chain = NULL; 383 while (*regparse != '\0' && *regparse != ')' && 384 *regparse != '\n' && *regparse != '|') { 385 latest = regpiece(&flags); 386 if (latest == NULL) 387 return(NULL); 388 *flagp |= flags&HASWIDTH; 389 if (chain == NULL) /* First piece. */ 390 *flagp |= flags&SPSTART; 391 else 392 regtail(chain, latest); 393 chain = latest; 394 } 395 if (chain == NULL) /* Loop ran zero times. */ 396 (void) regnode(NOTHING); 397 398 return(ret); 399 } 400 401 /* 402 - regpiece - something followed by possible [*+?] 403 * 404 * Note that the branching code sequences used for ? and the general cases 405 * of * and + are somewhat optimized: they use the same NOTHING node as 406 * both the endmarker for their branch list and the body of the last branch. 407 * It might seem that this node could be dispensed with entirely, but the 408 * endmarker role is not redundant. 409 */ 410 static char * 411 regpiece(flagp) 412 int *flagp; 413 { 414 register char *ret; 415 register char op; 416 register char *next; 417 int flags; 418 419 ret = regatom(&flags); 420 if (ret == NULL) 421 return(NULL); 422 423 op = *regparse; 424 if (!ISMULT(op)) { 425 *flagp = flags; 426 return(ret); 427 } 428 429 if (!(flags&HASWIDTH) && op != '?') 430 FAIL("*+ operand could be empty"); 431 *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH); 432 433 if (op == '*' && (flags&SIMPLE)) 434 reginsert(STAR, ret); 435 else if (op == '*') { 436 /* Emit x* as (x&|), where & means "self". */ 437 reginsert(BRANCH, ret); /* Either x */ 438 regoptail(ret, regnode(BACK)); /* and loop */ 439 regoptail(ret, ret); /* back */ 440 regtail(ret, regnode(BRANCH)); /* or */ 441 regtail(ret, regnode(NOTHING)); /* null. */ 442 } else if (op == '+' && (flags&SIMPLE)) 443 reginsert(PLUS, ret); 444 else if (op == '+') { 445 /* Emit x+ as x(&|), where & means "self". */ 446 next = regnode(BRANCH); /* Either */ 447 regtail(ret, next); 448 regtail(regnode(BACK), ret); /* loop back */ 449 regtail(next, regnode(BRANCH)); /* or */ 450 regtail(ret, regnode(NOTHING)); /* null. */ 451 } else if (op == '?') { 452 /* Emit x? as (x|) */ 453 reginsert(BRANCH, ret); /* Either x */ 454 regtail(ret, regnode(BRANCH)); /* or */ 455 next = regnode(NOTHING); /* null. */ 456 regtail(ret, next); 457 regoptail(ret, next); 458 } 459 regparse++; 460 if (ISMULT(*regparse)) 461 FAIL("nested *?+"); 462 463 return(ret); 464 } 465 466 /* 467 - regatom - the lowest level 468 * 469 * Optimization: gobbles an entire sequence of ordinary characters so that 470 * it can turn them into a single node, which is smaller to store and 471 * faster to run. Backslashed characters are exceptions, each becoming a 472 * separate node; the code is simpler that way and it's not worth fixing. 473 */ 474 static char * 475 regatom(flagp) 476 int *flagp; 477 { 478 register char *ret; 479 int flags; 480 481 *flagp = WORST; /* Tentatively. */ 482 483 switch (*regparse++) { 484 /* FIXME: these chars only have meaning at beg/end of pat? */ 485 case '^': 486 ret = regnode(BOL); 487 break; 488 case '$': 489 ret = regnode(EOL); 490 break; 491 case '.': 492 ret = regnode(ANY); 493 *flagp |= HASWIDTH|SIMPLE; 494 break; 495 case '[': { 496 register int class; 497 register int classend; 498 499 if (*regparse == '^') { /* Complement of range. */ 500 ret = regnode(ANYBUT); 501 regparse++; 502 } else 503 ret = regnode(ANYOF); 504 if (*regparse == ']' || *regparse == '-') 505 regc(*regparse++); 506 while (*regparse != '\0' && *regparse != ']') { 507 if (*regparse == '-') { 508 regparse++; 509 if (*regparse == ']' || *regparse == '\0') 510 regc('-'); 511 else { 512 class = UCHARAT(regparse-2)+1; 513 classend = UCHARAT(regparse); 514 if (class > classend+1) 515 FAIL("invalid [] range"); 516 for (; class <= classend; class++) 517 regc(class); 518 regparse++; 519 } 520 } else 521 regc(*regparse++); 522 } 523 regc('\0'); 524 if (*regparse != ']') 525 FAIL("unmatched []"); 526 regparse++; 527 *flagp |= HASWIDTH|SIMPLE; 528 } 529 break; 530 case '(': 531 ret = reg(1, &flags); 532 if (ret == NULL) 533 return(NULL); 534 *flagp |= flags&(HASWIDTH|SPSTART); 535 break; 536 case '\0': 537 case '|': 538 case '\n': 539 case ')': 540 FAIL("internal urp"); /* Supposed to be caught earlier. */ 541 break; 542 case '?': 543 case '+': 544 case '*': 545 FAIL("?+* follows nothing"); 546 break; 547 case '\\': 548 switch (*regparse++) { 549 case '\0': 550 FAIL("trailing \\"); 551 break; 552 case '<': 553 ret = regnode(WORDA); 554 break; 555 case '>': 556 ret = regnode(WORDZ); 557 break; 558 /* FIXME: Someday handle \1, \2, ... */ 559 default: 560 /* Handle general quoted chars in exact-match routine */ 561 goto de_fault; 562 } 563 break; 564 de_fault: 565 default: 566 /* 567 * Encode a string of characters to be matched exactly. 568 * 569 * This is a bit tricky due to quoted chars and due to 570 * '*', '+', and '?' taking the SINGLE char previous 571 * as their operand. 572 * 573 * On entry, the char at regparse[-1] is going to go 574 * into the string, no matter what it is. (It could be 575 * following a \ if we are entered from the '\' case.) 576 * 577 * Basic idea is to pick up a good char in ch and 578 * examine the next char. If it's *+? then we twiddle. 579 * If it's \ then we frozzle. If it's other magic char 580 * we push ch and terminate the string. If none of the 581 * above, we push ch on the string and go around again. 582 * 583 * regprev is used to remember where "the current char" 584 * starts in the string, if due to a *+? we need to back 585 * up and put the current char in a separate, 1-char, string. 586 * When regprev is NULL, ch is the only char in the 587 * string; this is used in *+? handling, and in setting 588 * flags |= SIMPLE at the end. 589 */ 590 { 591 char *regprev; 592 register char ch; 593 594 regparse--; /* Look at cur char */ 595 ret = regnode(EXACTLY); 596 for ( regprev = 0 ; ; ) { 597 ch = *regparse++; /* Get current char */ 598 switch (*regparse) { /* look at next one */ 599 600 default: 601 regc(ch); /* Add cur to string */ 602 break; 603 604 case '.': case '[': case '(': 605 case ')': case '|': case '\n': 606 case '$': case '^': 607 case '\0': 608 /* FIXME, $ and ^ should not always be magic */ 609 magic: 610 regc(ch); /* dump cur char */ 611 goto done; /* and we are done */ 612 613 case '?': case '+': case '*': 614 if (!regprev) /* If just ch in str, */ 615 goto magic; /* use it */ 616 /* End mult-char string one early */ 617 regparse = regprev; /* Back up parse */ 618 goto done; 619 620 case '\\': 621 regc(ch); /* Cur char OK */ 622 switch (regparse[1]){ /* Look after \ */ 623 case '\0': 624 case '<': 625 case '>': 626 /* FIXME: Someday handle \1, \2, ... */ 627 goto done; /* Not quoted */ 628 default: 629 /* Backup point is \, scan * point is after it. */ 630 regprev = regparse; 631 regparse++; 632 continue; /* NOT break; */ 633 } 634 } 635 regprev = regparse; /* Set backup point */ 636 } 637 done: 638 regc('\0'); 639 *flagp |= HASWIDTH; 640 if (!regprev) /* One char? */ 641 *flagp |= SIMPLE; 642 } 643 break; 644 } 645 646 return(ret); 647 } 648 649 /* 650 - regnode - emit a node 651 */ 652 static char * /* Location. */ 653 regnode(op) 654 char op; 655 { 656 register char *ret; 657 register char *ptr; 658 659 ret = regcode; 660 if (ret == ®dummy) { 661 regsize += 3; 662 return(ret); 663 } 664 665 ptr = ret; 666 *ptr++ = op; 667 *ptr++ = '\0'; /* Null "next" pointer. */ 668 *ptr++ = '\0'; 669 regcode = ptr; 670 671 return(ret); 672 } 673 674 /* 675 - regc - emit (if appropriate) a byte of code 676 */ 677 static void 678 regc(b) 679 char b; 680 { 681 if (regcode != ®dummy) 682 *regcode++ = b; 683 else 684 regsize++; 685 } 686 687 /* 688 - reginsert - insert an operator in front of already-emitted operand 689 * 690 * Means relocating the operand. 691 */ 692 static void 693 reginsert(op, opnd) 694 char op; 695 char *opnd; 696 { 697 register char *src; 698 register char *dst; 699 register char *place; 700 701 if (regcode == ®dummy) { 702 regsize += 3; 703 return; 704 } 705 706 src = regcode; 707 regcode += 3; 708 dst = regcode; 709 while (src > opnd) 710 *--dst = *--src; 711 712 place = opnd; /* Op node, where operand used to be. */ 713 *place++ = op; 714 *place++ = '\0'; 715 *place++ = '\0'; 716 } 717 718 /* 719 - regtail - set the next-pointer at the end of a node chain 720 */ 721 static void 722 regtail(p, val) 723 char *p; 724 char *val; 725 { 726 register char *scan; 727 register char *temp; 728 register int offset; 729 730 if (p == ®dummy) 731 return; 732 733 /* Find last node. */ 734 scan = p; 735 for (;;) { 736 temp = regnext(scan); 737 if (temp == NULL) 738 break; 739 scan = temp; 740 } 741 742 if (OP(scan) == BACK) 743 offset = scan - val; 744 else 745 offset = val - scan; 746 *(scan+1) = (offset>>8)&0377; 747 *(scan+2) = offset&0377; 748 } 749 750 /* 751 - regoptail - regtail on operand of first argument; nop if operandless 752 */ 753 static void 754 regoptail(p, val) 755 char *p; 756 char *val; 757 { 758 /* "Operandless" and "op != BRANCH" are synonymous in practice. */ 759 if (p == NULL || p == ®dummy || OP(p) != BRANCH) 760 return; 761 regtail(OPERAND(p), val); 762 } 763 764 /* 765 * regexec and friends 766 */ 767 768 /* 769 * Global work variables for regexec(). 770 */ 771 static char *reginput; /* String-input pointer. */ 772 static char *regbol; /* Beginning of input, for ^ check. */ 773 static char **regstartp; /* Pointer to startp array. */ 774 static char **regendp; /* Ditto for endp. */ 775 776 /* 777 * Forwards. 778 */ 779 STATIC int regtry __P((const regexp *, const char *)); 780 STATIC int regmatch __P((char *)); 781 STATIC int regrepeat __P((char *)); 782 783 #ifdef DEBUG 784 int regnarrate = 0; 785 void regdump __P((regexp *)); 786 STATIC char *regprop __P((char *)); 787 #endif 788 789 /* 790 - regexec - match a regexp against a string 791 */ 792 int 793 regexec(prog, string) 794 register const regexp *prog; 795 register const char *string; 796 { 797 register char *s; 798 799 /* Be paranoid... */ 800 if (prog == NULL || string == NULL) { 801 regerror("NULL parameter"); 802 return(0); 803 } 804 805 /* Check validity of program. */ 806 if (UCHARAT(prog->program) != MAGIC) { 807 regerror("corrupted program"); 808 return(0); 809 } 810 811 /* If there is a "must appear" string, look for it. */ 812 if (prog->regmust != NULL) { 813 s = (char *)string; 814 while ((s = strchr(s, prog->regmust[0])) != NULL) { 815 if (strncmp(s, prog->regmust, prog->regmlen) == 0) 816 break; /* Found it. */ 817 s++; 818 } 819 if (s == NULL) /* Not present. */ 820 return(0); 821 } 822 823 /* Mark beginning of line for ^ . */ 824 regbol = (char *)string; 825 826 /* Simplest case: anchored match need be tried only once. */ 827 if (prog->reganch) 828 return(regtry(prog, string)); 829 830 /* Messy cases: unanchored match. */ 831 s = (char *)string; 832 if (prog->regstart != '\0') 833 /* We know what char it must start with. */ 834 while ((s = strchr(s, prog->regstart)) != NULL) { 835 if (regtry(prog, s)) 836 return(1); 837 s++; 838 } 839 else 840 /* We don't -- general case. */ 841 do { 842 if (regtry(prog, s)) 843 return(1); 844 } while (*s++ != '\0'); 845 846 /* Failure. */ 847 return(0); 848 } 849 850 /* 851 - regtry - try match at specific point 852 */ 853 static int /* 0 failure, 1 success */ 854 regtry(prog, string) 855 const regexp *prog; 856 const char *string; 857 { 858 register int i; 859 register char **sp; 860 register char **ep; 861 862 reginput = (char *)string; /* XXX */ 863 regstartp = (char **)prog->startp; /* XXX */ 864 regendp = (char **)prog->endp; /* XXX */ 865 866 sp = (char **)prog->startp; /* XXX */ 867 ep = (char **)prog->endp; /* XXX */ 868 for (i = NSUBEXP; i > 0; i--) { 869 *sp++ = NULL; 870 *ep++ = NULL; 871 } 872 if (regmatch((char *)prog->program + 1)) { /* XXX */ 873 ((regexp *)prog)->startp[0] = (char *)string; /* XXX */ 874 ((regexp *)prog)->endp[0] = reginput; /* XXX */ 875 return(1); 876 } else 877 return(0); 878 } 879 880 /* 881 - regmatch - main matching routine 882 * 883 * Conceptually the strategy is simple: check to see whether the current 884 * node matches, call self recursively to see whether the rest matches, 885 * and then act accordingly. In practice we make some effort to avoid 886 * recursion, in particular by going through "ordinary" nodes (that don't 887 * need to know whether the rest of the match failed) by a loop instead of 888 * by recursion. 889 */ 890 static int /* 0 failure, 1 success */ 891 regmatch(prog) 892 char *prog; 893 { 894 register char *scan; /* Current node. */ 895 char *next; /* Next node. */ 896 897 scan = prog; 898 #ifdef DEBUG 899 if (scan != NULL && regnarrate) 900 fprintf(stderr, "%s(\n", regprop(scan)); 901 #endif 902 while (scan != NULL) { 903 #ifdef DEBUG 904 if (regnarrate) 905 fprintf(stderr, "%s...\n", regprop(scan)); 906 #endif 907 next = regnext(scan); 908 909 switch (OP(scan)) { 910 case BOL: 911 if (reginput != regbol) 912 return(0); 913 break; 914 case EOL: 915 if (*reginput != '\0') 916 return(0); 917 break; 918 case WORDA: 919 /* Must be looking at a letter, digit, or _ */ 920 if ((!isalnum(*reginput)) && *reginput != '_') 921 return(0); 922 /* Prev must be BOL or nonword */ 923 if (reginput > regbol && 924 (isalnum(reginput[-1]) || reginput[-1] == '_')) 925 return(0); 926 break; 927 case WORDZ: 928 /* Must be looking at non letter, digit, or _ */ 929 if (isalnum(*reginput) || *reginput == '_') 930 return(0); 931 /* We don't care what the previous char was */ 932 break; 933 case ANY: 934 if (*reginput == '\0') 935 return(0); 936 reginput++; 937 break; 938 case EXACTLY: { 939 register int len; 940 register char *opnd; 941 942 opnd = OPERAND(scan); 943 /* Inline the first character, for speed. */ 944 if (*opnd != *reginput) 945 return(0); 946 len = strlen(opnd); 947 if (len > 1 && strncmp(opnd, reginput, len) != 0) 948 return(0); 949 reginput += len; 950 } 951 break; 952 case ANYOF: 953 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL) 954 return(0); 955 reginput++; 956 break; 957 case ANYBUT: 958 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL) 959 return(0); 960 reginput++; 961 break; 962 case NOTHING: 963 break; 964 case BACK: 965 break; 966 case OPEN+1: 967 case OPEN+2: 968 case OPEN+3: 969 case OPEN+4: 970 case OPEN+5: 971 case OPEN+6: 972 case OPEN+7: 973 case OPEN+8: 974 case OPEN+9: { 975 register int no; 976 register char *save; 977 978 no = OP(scan) - OPEN; 979 save = reginput; 980 981 if (regmatch(next)) { 982 /* 983 * Don't set startp if some later 984 * invocation of the same parentheses 985 * already has. 986 */ 987 if (regstartp[no] == NULL) 988 regstartp[no] = save; 989 return(1); 990 } else 991 return(0); 992 } 993 break; 994 case CLOSE+1: 995 case CLOSE+2: 996 case CLOSE+3: 997 case CLOSE+4: 998 case CLOSE+5: 999 case CLOSE+6: 1000 case CLOSE+7: 1001 case CLOSE+8: 1002 case CLOSE+9: { 1003 register int no; 1004 register char *save; 1005 1006 no = OP(scan) - CLOSE; 1007 save = reginput; 1008 1009 if (regmatch(next)) { 1010 /* 1011 * Don't set endp if some later 1012 * invocation of the same parentheses 1013 * already has. 1014 */ 1015 if (regendp[no] == NULL) 1016 regendp[no] = save; 1017 return(1); 1018 } else 1019 return(0); 1020 } 1021 break; 1022 case BRANCH: { 1023 register char *save; 1024 1025 if (OP(next) != BRANCH) /* No choice. */ 1026 next = OPERAND(scan); /* Avoid recursion. */ 1027 else { 1028 do { 1029 save = reginput; 1030 if (regmatch(OPERAND(scan))) 1031 return(1); 1032 reginput = save; 1033 scan = regnext(scan); 1034 } while (scan != NULL && OP(scan) == BRANCH); 1035 return(0); 1036 /* NOTREACHED */ 1037 } 1038 } 1039 break; 1040 case STAR: 1041 case PLUS: { 1042 register char nextch; 1043 register int no; 1044 register char *save; 1045 register int min; 1046 1047 /* 1048 * Lookahead to avoid useless match attempts 1049 * when we know what character comes next. 1050 */ 1051 nextch = '\0'; 1052 if (OP(next) == EXACTLY) 1053 nextch = *OPERAND(next); 1054 min = (OP(scan) == STAR) ? 0 : 1; 1055 save = reginput; 1056 no = regrepeat(OPERAND(scan)); 1057 while (no >= min) { 1058 /* If it could work, try it. */ 1059 if (nextch == '\0' || *reginput == nextch) 1060 if (regmatch(next)) 1061 return(1); 1062 /* Couldn't or didn't -- back up. */ 1063 no--; 1064 reginput = save + no; 1065 } 1066 return(0); 1067 } 1068 break; 1069 case END: 1070 return(1); /* Success! */ 1071 break; 1072 default: 1073 regerror("memory corruption"); 1074 return(0); 1075 break; 1076 } 1077 1078 scan = next; 1079 } 1080 1081 /* 1082 * We get here only if there's trouble -- normally "case END" is 1083 * the terminating point. 1084 */ 1085 regerror("corrupted pointers"); 1086 return(0); 1087 } 1088 1089 /* 1090 - regrepeat - repeatedly match something simple, report how many 1091 */ 1092 static int 1093 regrepeat(p) 1094 char *p; 1095 { 1096 register int count = 0; 1097 register char *scan; 1098 register char *opnd; 1099 1100 scan = reginput; 1101 opnd = OPERAND(p); 1102 switch (OP(p)) { 1103 case ANY: 1104 count = strlen(scan); 1105 scan += count; 1106 break; 1107 case EXACTLY: 1108 while (*opnd == *scan) { 1109 count++; 1110 scan++; 1111 } 1112 break; 1113 case ANYOF: 1114 while (*scan != '\0' && strchr(opnd, *scan) != NULL) { 1115 count++; 1116 scan++; 1117 } 1118 break; 1119 case ANYBUT: 1120 while (*scan != '\0' && strchr(opnd, *scan) == NULL) { 1121 count++; 1122 scan++; 1123 } 1124 break; 1125 default: /* Oh dear. Called inappropriately. */ 1126 regerror("internal foulup"); 1127 count = 0; /* Best compromise. */ 1128 break; 1129 } 1130 reginput = scan; 1131 1132 return(count); 1133 } 1134 1135 /* 1136 - regnext - dig the "next" pointer out of a node 1137 */ 1138 static char * 1139 regnext(p) 1140 register char *p; 1141 { 1142 register int offset; 1143 1144 if (p == ®dummy) 1145 return(NULL); 1146 1147 offset = NEXT(p); 1148 if (offset == 0) 1149 return(NULL); 1150 1151 if (OP(p) == BACK) 1152 return(p-offset); 1153 else 1154 return(p+offset); 1155 } 1156 1157 #ifdef DEBUG 1158 1159 /* 1160 - regdump - dump a regexp onto stdout in vaguely comprehensible form 1161 */ 1162 void 1163 regdump(r) 1164 regexp *r; 1165 { 1166 register char *s; 1167 register char op = EXACTLY; /* Arbitrary non-END op. */ 1168 register char *next; 1169 extern char *strchr(); 1170 1171 1172 s = r->program + 1; 1173 while (op != END) { /* While that wasn't END last time... */ 1174 op = OP(s); 1175 printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */ 1176 next = regnext(s); 1177 if (next == NULL) /* Next ptr. */ 1178 printf("(0)"); 1179 else 1180 printf("(%d)", (s-r->program)+(next-s)); 1181 s += 3; 1182 if (op == ANYOF || op == ANYBUT || op == EXACTLY) { 1183 /* Literal string, where present. */ 1184 while (*s != '\0') { 1185 putchar(*s); 1186 s++; 1187 } 1188 s++; 1189 } 1190 putchar('\n'); 1191 } 1192 1193 /* Header fields of interest. */ 1194 if (r->regstart != '\0') 1195 printf("start `%c' ", r->regstart); 1196 if (r->reganch) 1197 printf("anchored "); 1198 if (r->regmust != NULL) 1199 printf("must have \"%s\"", r->regmust); 1200 printf("\n"); 1201 } 1202 1203 /* 1204 - regprop - printable representation of opcode 1205 */ 1206 static char * 1207 regprop(op) 1208 char *op; 1209 { 1210 register char *p; 1211 static char buf[50]; 1212 1213 (void)strncpy(buf, ":", sizeof(buf) - 1); 1214 1215 switch (OP(op)) { 1216 case BOL: 1217 p = "BOL"; 1218 break; 1219 case EOL: 1220 p = "EOL"; 1221 break; 1222 case ANY: 1223 p = "ANY"; 1224 break; 1225 case ANYOF: 1226 p = "ANYOF"; 1227 break; 1228 case ANYBUT: 1229 p = "ANYBUT"; 1230 break; 1231 case BRANCH: 1232 p = "BRANCH"; 1233 break; 1234 case EXACTLY: 1235 p = "EXACTLY"; 1236 break; 1237 case NOTHING: 1238 p = "NOTHING"; 1239 break; 1240 case BACK: 1241 p = "BACK"; 1242 break; 1243 case END: 1244 p = "END"; 1245 break; 1246 case OPEN+1: 1247 case OPEN+2: 1248 case OPEN+3: 1249 case OPEN+4: 1250 case OPEN+5: 1251 case OPEN+6: 1252 case OPEN+7: 1253 case OPEN+8: 1254 case OPEN+9: 1255 (void)snprintf(buf+strlen(buf), sizeof(buf) - strlen(buf), 1256 "OPEN%d", OP(op)-OPEN); 1257 p = NULL; 1258 break; 1259 case CLOSE+1: 1260 case CLOSE+2: 1261 case CLOSE+3: 1262 case CLOSE+4: 1263 case CLOSE+5: 1264 case CLOSE+6: 1265 case CLOSE+7: 1266 case CLOSE+8: 1267 case CLOSE+9: 1268 (void)snprintf(buf+strlen(buf), sizeof(buf) - strlen(buf), 1269 "CLOSE%d", OP(op)-CLOSE); 1270 p = NULL; 1271 break; 1272 case STAR: 1273 p = "STAR"; 1274 break; 1275 case PLUS: 1276 p = "PLUS"; 1277 break; 1278 case WORDA: 1279 p = "WORDA"; 1280 break; 1281 case WORDZ: 1282 p = "WORDZ"; 1283 break; 1284 default: 1285 regerror("corrupted opcode"); 1286 break; 1287 } 1288 if (p != NULL) 1289 (void)strncat(buf, p, sizeof(buf) - strlen(buf) - 1); 1290 return(buf); 1291 } 1292 #endif 1293 1294 /* 1295 * The following is provided for those people who do not have strcspn() in 1296 * their C libraries. They should get off their butts and do something 1297 * about it; at least one public-domain implementation of those (highly 1298 * useful) string routines has been published on Usenet. 1299 */ 1300 #ifdef STRCSPN 1301 /* 1302 * strcspn - find length of initial segment of s1 consisting entirely 1303 * of characters not from s2 1304 */ 1305 1306 static int 1307 strcspn(s1, s2) 1308 char *s1; 1309 char *s2; 1310 { 1311 register char *scan1; 1312 register char *scan2; 1313 register int count; 1314 1315 count = 0; 1316 for (scan1 = s1; *scan1 != '\0'; scan1++) { 1317 for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */ 1318 if (*scan1 == *scan2++) 1319 return(count); 1320 count++; 1321 } 1322 return(count); 1323 } 1324 #endif 1325