1 /* regexec.c 2 */ 3 4 /* 5 * "One Ring to rule them all, One Ring to find them..." 6 */ 7 8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not 9 * confused with the original package (see point 3 below). Thanks, Henry! 10 */ 11 12 /* Additional note: this code is very heavily munged from Henry's version 13 * in places. In some spots I've traded clarity for efficiency, so don't 14 * blame Henry for some of the lack of readability. 15 */ 16 17 /* The names of the functions have been changed from regcomp and 18 * regexec to pregcomp and pregexec in order to avoid conflicts 19 * with the POSIX routines of the same names. 20 */ 21 22 /*SUPPRESS 112*/ 23 /* 24 * pregcomp and pregexec -- regsub and regerror are not used in perl 25 * 26 * Copyright (c) 1986 by University of Toronto. 27 * Written by Henry Spencer. Not derived from licensed software. 28 * 29 * Permission is granted to anyone to use this software for any 30 * purpose on any computer system, and to redistribute it freely, 31 * subject to the following restrictions: 32 * 33 * 1. The author is not responsible for the consequences of use of 34 * this software, no matter how awful, even if they arise 35 * from defects in it. 36 * 37 * 2. The origin of this software must not be misrepresented, either 38 * by explicit claim or by omission. 39 * 40 * 3. Altered versions must be plainly marked as such, and must not 41 * be misrepresented as being the original software. 42 * 43 **** Alterations to Henry's code are... 44 **** 45 **** Copyright (c) 1991-1994, Larry Wall 46 **** 47 **** You may distribute under the terms of either the GNU General Public 48 **** License or the Artistic License, as specified in the README file. 49 * 50 * Beware that some of this code is subtly aware of the way operator 51 * precedence is structured in regular expressions. Serious changes in 52 * regular-expression syntax might require a total rethink. 53 */ 54 #include "EXTERN.h" 55 #include "perl.h" 56 #include "regcomp.h" 57 58 #ifndef STATIC 59 #define STATIC static 60 #endif 61 62 #ifdef DEBUGGING 63 static I32 regnarrate = 0; 64 static char* regprogram = 0; 65 #endif 66 67 /* Current curly descriptor */ 68 typedef struct curcur CURCUR; 69 struct curcur { 70 int parenfloor; /* how far back to strip paren data */ 71 int cur; /* how many instances of scan we've matched */ 72 int min; /* the minimal number of scans to match */ 73 int max; /* the maximal number of scans to match */ 74 int minmod; /* whether to work our way up or down */ 75 char * scan; /* the thing to match */ 76 char * next; /* what has to match after it */ 77 char * lastloc; /* where we started matching this scan */ 78 CURCUR * oldcc; /* current curly before we started this one */ 79 }; 80 81 static CURCUR* regcc; 82 83 typedef I32 CHECKPOINT; 84 85 CHECKPOINT regcppush _((I32 parenfloor)); 86 char * regcppop _((void)); 87 88 CHECKPOINT 89 regcppush(parenfloor) 90 I32 parenfloor; 91 { 92 int retval = savestack_ix; 93 int i = (regsize - parenfloor) * 3; 94 int p; 95 96 SSCHECK(i + 5); 97 for (p = regsize; p > parenfloor; p--) { 98 SSPUSHPTR(regendp[p]); 99 SSPUSHPTR(regstartp[p]); 100 SSPUSHINT(p); 101 } 102 SSPUSHINT(regsize); 103 SSPUSHINT(*reglastparen); 104 SSPUSHPTR(reginput); 105 SSPUSHINT(i + 3); 106 SSPUSHINT(SAVEt_REGCONTEXT); 107 return retval; 108 } 109 110 char* 111 regcppop() 112 { 113 I32 i = SSPOPINT; 114 U32 paren = 0; 115 char *input; 116 char *tmps; 117 assert(i == SAVEt_REGCONTEXT); 118 i = SSPOPINT; 119 input = (char *) SSPOPPTR; 120 *reglastparen = SSPOPINT; 121 regsize = SSPOPINT; 122 for (i -= 3; i > 0; i -= 3) { 123 paren = (U32)SSPOPINT; 124 regstartp[paren] = (char *) SSPOPPTR; 125 tmps = (char*)SSPOPPTR; 126 if (paren <= *reglastparen) 127 regendp[paren] = tmps; 128 } 129 for (paren = *reglastparen + 1; paren <= regnpar; paren++) { 130 if (paren > regsize) 131 regstartp[paren] = Nullch; 132 regendp[paren] = Nullch; 133 } 134 return input; 135 } 136 137 #define regcpblow(cp) leave_scope(cp) 138 139 /* 140 * pregexec and friends 141 */ 142 143 /* 144 * Forwards. 145 */ 146 147 static I32 regmatch _((char *prog)); 148 static I32 regrepeat _((char *p, I32 max)); 149 static I32 regtry _((regexp *prog, char *startpos)); 150 151 /* 152 - pregexec - match a regexp against a string 153 */ 154 I32 155 pregexec(prog, stringarg, strend, strbeg, minend, screamer, safebase) 156 register regexp *prog; 157 char *stringarg; 158 register char *strend; /* pointer to null at end of string */ 159 char *strbeg; /* real beginning of string */ 160 I32 minend; /* end of match must be at least minend after stringarg */ 161 SV *screamer; 162 I32 safebase; /* no need to remember string in subbase */ 163 { 164 register char *s; 165 register I32 i; 166 register char *c; 167 register char *startpos = stringarg; 168 register I32 tmp; 169 I32 minlen = 0; /* must match at least this many chars */ 170 I32 dontbother = 0; /* how many characters not to try at end */ 171 CURCUR cc; 172 173 cc.cur = 0; 174 cc.oldcc = 0; 175 regcc = &cc; 176 177 #ifdef DEBUGGING 178 regnarrate = debug & 512; 179 regprogram = prog->program; 180 #endif 181 182 /* Be paranoid... */ 183 if (prog == NULL || startpos == NULL) { 184 croak("NULL regexp parameter"); 185 return 0; 186 } 187 188 if (startpos == strbeg) /* is ^ valid at stringarg? */ 189 regprev = '\n'; 190 else { 191 regprev = stringarg[-1]; 192 if (!multiline && regprev == '\n') 193 regprev = '\0'; /* force ^ to NOT match */ 194 } 195 regprecomp = prog->precomp; 196 regnpar = prog->nparens; 197 /* Check validity of program. */ 198 if (UCHARAT(prog->program) != MAGIC) { 199 FAIL("corrupted regexp program"); 200 } 201 202 if (prog->do_folding) { 203 i = strend - startpos; 204 New(1101,c,i+1,char); 205 Copy(startpos, c, i+1, char); 206 startpos = c; 207 strend = startpos + i; 208 for (s = startpos; s < strend; s++) 209 if (isUPPER(*s)) 210 *s = toLOWER(*s); 211 } 212 213 /* If there is a "must appear" string, look for it. */ 214 s = startpos; 215 if (prog->regmust != Nullsv && 216 (!(prog->reganch & ROPT_ANCH) 217 || (multiline && prog->regback >= 0)) ) 218 { 219 if (stringarg == strbeg && screamer) { 220 if (screamfirst[BmRARE(prog->regmust)] >= 0) 221 s = screaminstr(screamer,prog->regmust); 222 else 223 s = Nullch; 224 } 225 else 226 s = fbm_instr((unsigned char*)s, (unsigned char*)strend, 227 prog->regmust); 228 if (!s) { 229 ++BmUSEFUL(prog->regmust); /* hooray */ 230 goto phooey; /* not present */ 231 } 232 else if (prog->regback >= 0) { 233 s -= prog->regback; 234 if (s < startpos) 235 s = startpos; 236 minlen = prog->regback + SvCUR(prog->regmust); 237 } 238 else if (!prog->naughty && --BmUSEFUL(prog->regmust) < 0) { /* boo */ 239 SvREFCNT_dec(prog->regmust); 240 prog->regmust = Nullsv; /* disable regmust */ 241 s = startpos; 242 } 243 else { 244 s = startpos; 245 minlen = SvCUR(prog->regmust); 246 } 247 } 248 249 /* Mark beginning of line for ^ . */ 250 regbol = startpos; 251 252 /* Mark end of line for $ (and such) */ 253 regeol = strend; 254 255 /* see how far we have to get to not match where we matched before */ 256 regtill = startpos+minend; 257 258 /* Simplest case: anchored match need be tried only once. */ 259 /* [unless multiline is set] */ 260 if (prog->reganch & ROPT_ANCH) { 261 if (regtry(prog, startpos)) 262 goto got_it; 263 else if (multiline || (prog->reganch & ROPT_IMPLICIT)) { 264 if (minlen) 265 dontbother = minlen - 1; 266 strend -= dontbother; 267 /* for multiline we only have to try after newlines */ 268 if (s > startpos) 269 s--; 270 while (s < strend) { 271 if (*s++ == '\n') { 272 if (s < strend && regtry(prog, s)) 273 goto got_it; 274 } 275 } 276 } 277 goto phooey; 278 } 279 280 /* Messy cases: unanchored match. */ 281 if (prog->regstart) { 282 if (prog->reganch & ROPT_SKIP) { /* we have /x+whatever/ */ 283 /* it must be a one character string */ 284 i = SvPVX(prog->regstart)[0]; 285 while (s < strend) { 286 if (*s == i) { 287 if (regtry(prog, s)) 288 goto got_it; 289 s++; 290 while (s < strend && *s == i) 291 s++; 292 } 293 s++; 294 } 295 } 296 else if (SvPOK(prog->regstart) == 3) { 297 /* We know what string it must start with. */ 298 while ((s = fbm_instr((unsigned char*)s, 299 (unsigned char*)strend, prog->regstart)) != NULL) 300 { 301 if (regtry(prog, s)) 302 goto got_it; 303 s++; 304 } 305 } 306 else { 307 c = SvPVX(prog->regstart); 308 while ((s = ninstr(s, strend, c, c + SvCUR(prog->regstart))) != NULL) 309 { 310 if (regtry(prog, s)) 311 goto got_it; 312 s++; 313 } 314 } 315 goto phooey; 316 } 317 /*SUPPRESS 560*/ 318 if (c = prog->regstclass) { 319 I32 doevery = (prog->reganch & ROPT_SKIP) == 0; 320 321 if (minlen) 322 dontbother = minlen - 1; 323 strend -= dontbother; /* don't bother with what can't match */ 324 tmp = 1; 325 /* We know what class it must start with. */ 326 switch (OP(c)) { 327 case ANYOF: 328 c = OPERAND(c); 329 while (s < strend) { 330 i = UCHARAT(s); 331 if (!(c[i >> 3] & (1 << (i&7)))) { 332 if (tmp && regtry(prog, s)) 333 goto got_it; 334 else 335 tmp = doevery; 336 } 337 else 338 tmp = 1; 339 s++; 340 } 341 break; 342 case BOUND: 343 if (minlen) 344 dontbother++,strend--; 345 if (s != startpos) { 346 i = s[-1]; 347 tmp = isALNUM(i); 348 } 349 else 350 tmp = isALNUM(regprev); /* assume not alphanumeric */ 351 while (s < strend) { 352 i = *s; 353 if (tmp != isALNUM(i)) { 354 tmp = !tmp; 355 if (regtry(prog, s)) 356 goto got_it; 357 } 358 s++; 359 } 360 if ((minlen || tmp) && regtry(prog,s)) 361 goto got_it; 362 break; 363 case NBOUND: 364 if (minlen) 365 dontbother++,strend--; 366 if (s != startpos) { 367 i = s[-1]; 368 tmp = isALNUM(i); 369 } 370 else 371 tmp = isALNUM(regprev); /* assume not alphanumeric */ 372 while (s < strend) { 373 i = *s; 374 if (tmp != isALNUM(i)) 375 tmp = !tmp; 376 else if (regtry(prog, s)) 377 goto got_it; 378 s++; 379 } 380 if ((minlen || !tmp) && regtry(prog,s)) 381 goto got_it; 382 break; 383 case ALNUM: 384 while (s < strend) { 385 i = *s; 386 if (isALNUM(i)) { 387 if (tmp && regtry(prog, s)) 388 goto got_it; 389 else 390 tmp = doevery; 391 } 392 else 393 tmp = 1; 394 s++; 395 } 396 break; 397 case NALNUM: 398 while (s < strend) { 399 i = *s; 400 if (!isALNUM(i)) { 401 if (tmp && regtry(prog, s)) 402 goto got_it; 403 else 404 tmp = doevery; 405 } 406 else 407 tmp = 1; 408 s++; 409 } 410 break; 411 case SPACE: 412 while (s < strend) { 413 if (isSPACE(*s)) { 414 if (tmp && regtry(prog, s)) 415 goto got_it; 416 else 417 tmp = doevery; 418 } 419 else 420 tmp = 1; 421 s++; 422 } 423 break; 424 case NSPACE: 425 while (s < strend) { 426 if (!isSPACE(*s)) { 427 if (tmp && regtry(prog, s)) 428 goto got_it; 429 else 430 tmp = doevery; 431 } 432 else 433 tmp = 1; 434 s++; 435 } 436 break; 437 case DIGIT: 438 while (s < strend) { 439 if (isDIGIT(*s)) { 440 if (tmp && regtry(prog, s)) 441 goto got_it; 442 else 443 tmp = doevery; 444 } 445 else 446 tmp = 1; 447 s++; 448 } 449 break; 450 case NDIGIT: 451 while (s < strend) { 452 if (!isDIGIT(*s)) { 453 if (tmp && regtry(prog, s)) 454 goto got_it; 455 else 456 tmp = doevery; 457 } 458 else 459 tmp = 1; 460 s++; 461 } 462 break; 463 } 464 } 465 else { 466 if (minlen) 467 dontbother = minlen - 1; 468 strend -= dontbother; 469 /* We don't know much -- general case. */ 470 do { 471 if (regtry(prog, s)) 472 goto got_it; 473 } while (s++ < strend); 474 } 475 476 /* Failure. */ 477 goto phooey; 478 479 got_it: 480 strend += dontbother; /* uncheat */ 481 prog->subbeg = strbeg; 482 prog->subend = strend; 483 if ((!safebase && (prog->nparens || sawampersand)) || prog->do_folding) { 484 i = strend - startpos + (stringarg - strbeg); 485 if (safebase) { /* no need for $digit later */ 486 s = strbeg; 487 prog->subend = s+i; 488 } 489 else if (strbeg != prog->subbase) { 490 s = savepvn(strbeg,i); /* so $digit will work later */ 491 if (prog->subbase) 492 Safefree(prog->subbase); 493 prog->subbeg = prog->subbase = s; 494 prog->subend = s+i; 495 } 496 else { 497 prog->subbeg = s = prog->subbase; 498 prog->subend = s+i; 499 } 500 s += (stringarg - strbeg); 501 for (i = 0; i <= prog->nparens; i++) { 502 if (prog->endp[i]) { 503 prog->startp[i] = s + (prog->startp[i] - startpos); 504 prog->endp[i] = s + (prog->endp[i] - startpos); 505 } 506 } 507 if (prog->do_folding) 508 Safefree(startpos); 509 } 510 return 1; 511 512 phooey: 513 if (prog->do_folding) 514 Safefree(startpos); 515 return 0; 516 } 517 518 /* 519 - regtry - try match at specific point 520 */ 521 static I32 /* 0 failure, 1 success */ 522 regtry(prog, startpos) 523 regexp *prog; 524 char *startpos; 525 { 526 register I32 i; 527 register char **sp; 528 register char **ep; 529 530 reginput = startpos; 531 regstartp = prog->startp; 532 regendp = prog->endp; 533 reglastparen = &prog->lastparen; 534 prog->lastparen = 0; 535 regsize = 0; 536 537 sp = prog->startp; 538 ep = prog->endp; 539 if (prog->nparens) { 540 for (i = prog->nparens; i >= 0; i--) { 541 *sp++ = NULL; 542 *ep++ = NULL; 543 } 544 } 545 if (regmatch(prog->program + 1) && reginput >= regtill) { 546 prog->startp[0] = startpos; 547 prog->endp[0] = reginput; 548 return 1; 549 } 550 else 551 return 0; 552 } 553 554 /* 555 - regmatch - main matching routine 556 * 557 * Conceptually the strategy is simple: check to see whether the current 558 * node matches, call self recursively to see whether the rest matches, 559 * and then act accordingly. In practice we make some effort to avoid 560 * recursion, in particular by going through "ordinary" nodes (that don't 561 * need to know whether the rest of the match failed) by a loop instead of 562 * by recursion. 563 */ 564 /* [lwall] I've hoisted the register declarations to the outer block in order to 565 * maybe save a little bit of pushing and popping on the stack. It also takes 566 * advantage of machines that use a register save mask on subroutine entry. 567 */ 568 static I32 /* 0 failure, 1 success */ 569 regmatch(prog) 570 char *prog; 571 { 572 register char *scan; /* Current node. */ 573 char *next; /* Next node. */ 574 register I32 nextchar; 575 register I32 n; /* no or next */ 576 register I32 ln; /* len or last */ 577 register char *s; /* operand or save */ 578 register char *locinput = reginput; 579 int minmod = 0; 580 #ifdef DEBUGGING 581 static int regindent = 0; 582 regindent++; 583 #endif 584 585 nextchar = *locinput; 586 scan = prog; 587 while (scan != NULL) { 588 #ifdef DEBUGGING 589 #define sayYES goto yes 590 #define sayNO goto no 591 #define saySAME(x) if (x) goto yes; else goto no 592 if (regnarrate) { 593 fprintf(stderr, "%*s%2d%-8.8s\t<%.10s>\n", regindent*2, "", 594 scan - regprogram, regprop(scan), locinput); 595 } 596 #else 597 #define sayYES return 1 598 #define sayNO return 0 599 #define saySAME(x) return x 600 #endif 601 602 #ifdef REGALIGN 603 next = scan + NEXT(scan); 604 if (next == scan) 605 next = NULL; 606 #else 607 next = regnext(scan); 608 #endif 609 610 switch (OP(scan)) { 611 case BOL: 612 if (locinput == regbol 613 ? regprev == '\n' 614 : ((nextchar || locinput < regeol) && locinput[-1] == '\n') ) 615 { 616 /* regtill = regbol; */ 617 break; 618 } 619 sayNO; 620 case MBOL: 621 if (locinput == regbol 622 ? regprev == '\n' 623 : ((nextchar || locinput < regeol) && locinput[-1] == '\n') ) 624 { 625 break; 626 } 627 sayNO; 628 case SBOL: 629 if (locinput == regbol && regprev == '\n') 630 break; 631 sayNO; 632 case GBOL: 633 if (locinput == regbol) 634 break; 635 sayNO; 636 case EOL: 637 if (multiline) 638 goto meol; 639 else 640 goto seol; 641 case MEOL: 642 meol: 643 if ((nextchar || locinput < regeol) && nextchar != '\n') 644 sayNO; 645 break; 646 case SEOL: 647 seol: 648 if ((nextchar || locinput < regeol) && nextchar != '\n') 649 sayNO; 650 if (regeol - locinput > 1) 651 sayNO; 652 break; 653 case SANY: 654 if (!nextchar && locinput >= regeol) 655 sayNO; 656 nextchar = *++locinput; 657 break; 658 case ANY: 659 if (!nextchar && locinput >= regeol || nextchar == '\n') 660 sayNO; 661 nextchar = *++locinput; 662 break; 663 case EXACTLY: 664 s = OPERAND(scan); 665 ln = *s++; 666 /* Inline the first character, for speed. */ 667 if (*s != nextchar) 668 sayNO; 669 if (regeol - locinput < ln) 670 sayNO; 671 if (ln > 1 && bcmp(s, locinput, ln) != 0) 672 sayNO; 673 locinput += ln; 674 nextchar = *locinput; 675 break; 676 case ANYOF: 677 s = OPERAND(scan); 678 if (nextchar < 0) 679 nextchar = UCHARAT(locinput); 680 if (s[nextchar >> 3] & (1 << (nextchar&7))) 681 sayNO; 682 if (!nextchar && locinput >= regeol) 683 sayNO; 684 nextchar = *++locinput; 685 break; 686 case ALNUM: 687 if (!nextchar) 688 sayNO; 689 if (!isALNUM(nextchar)) 690 sayNO; 691 nextchar = *++locinput; 692 break; 693 case NALNUM: 694 if (!nextchar && locinput >= regeol) 695 sayNO; 696 if (isALNUM(nextchar)) 697 sayNO; 698 nextchar = *++locinput; 699 break; 700 case NBOUND: 701 case BOUND: 702 if (locinput == regbol) /* was last char in word? */ 703 ln = isALNUM(regprev); 704 else 705 ln = isALNUM(locinput[-1]); 706 n = isALNUM(nextchar); /* is next char in word? */ 707 if ((ln == n) == (OP(scan) == BOUND)) 708 sayNO; 709 break; 710 case SPACE: 711 if (!nextchar && locinput >= regeol) 712 sayNO; 713 if (!isSPACE(nextchar)) 714 sayNO; 715 nextchar = *++locinput; 716 break; 717 case NSPACE: 718 if (!nextchar) 719 sayNO; 720 if (isSPACE(nextchar)) 721 sayNO; 722 nextchar = *++locinput; 723 break; 724 case DIGIT: 725 if (!isDIGIT(nextchar)) 726 sayNO; 727 nextchar = *++locinput; 728 break; 729 case NDIGIT: 730 if (!nextchar && locinput >= regeol) 731 sayNO; 732 if (isDIGIT(nextchar)) 733 sayNO; 734 nextchar = *++locinput; 735 break; 736 case REF: 737 n = ARG1(scan); /* which paren pair */ 738 s = regstartp[n]; 739 if (!s) 740 sayNO; 741 if (!regendp[n]) 742 sayNO; 743 if (s == regendp[n]) 744 break; 745 /* Inline the first character, for speed. */ 746 if (*s != nextchar) 747 sayNO; 748 ln = regendp[n] - s; 749 if (locinput + ln > regeol) 750 sayNO; 751 if (ln > 1 && bcmp(s, locinput, ln) != 0) 752 sayNO; 753 locinput += ln; 754 nextchar = *locinput; 755 break; 756 757 case NOTHING: 758 break; 759 case BACK: 760 break; 761 case OPEN: 762 n = ARG1(scan); /* which paren pair */ 763 regstartp[n] = locinput; 764 if (n > regsize) 765 regsize = n; 766 break; 767 case CLOSE: 768 n = ARG1(scan); /* which paren pair */ 769 regendp[n] = locinput; 770 if (n > *reglastparen) 771 *reglastparen = n; 772 break; 773 case CURLYX: { 774 CURCUR cc; 775 CHECKPOINT cp = savestack_ix; 776 cc.oldcc = regcc; 777 regcc = &cc; 778 cc.parenfloor = *reglastparen; 779 cc.cur = -1; 780 cc.min = ARG1(scan); 781 cc.max = ARG2(scan); 782 cc.scan = NEXTOPER(scan) + 4; 783 cc.next = next; 784 cc.minmod = minmod; 785 cc.lastloc = 0; 786 reginput = locinput; 787 n = regmatch(PREVOPER(next)); /* start on the WHILEM */ 788 regcpblow(cp); 789 regcc = cc.oldcc; 790 saySAME(n); 791 } 792 /* NOT REACHED */ 793 case WHILEM: { 794 /* 795 * This is really hard to understand, because after we match 796 * what we're trying to match, we must make sure the rest of 797 * the RE is going to match for sure, and to do that we have 798 * to go back UP the parse tree by recursing ever deeper. And 799 * if it fails, we have to reset our parent's current state 800 * that we can try again after backing off. 801 */ 802 803 CURCUR* cc = regcc; 804 n = cc->cur + 1; /* how many we know we matched */ 805 reginput = locinput; 806 807 #ifdef DEBUGGING 808 if (regnarrate) 809 fprintf(stderr, "%*s %d %lx\n", regindent*2, "", 810 n, (long)cc); 811 #endif 812 813 /* If degenerate scan matches "", assume scan done. */ 814 815 if (locinput == cc->lastloc) { 816 regcc = cc->oldcc; 817 ln = regcc->cur; 818 if (regmatch(cc->next)) 819 sayYES; 820 regcc->cur = ln; 821 regcc = cc; 822 sayNO; 823 } 824 825 /* First just match a string of min scans. */ 826 827 if (n < cc->min) { 828 cc->cur = n; 829 cc->lastloc = locinput; 830 if (regmatch(cc->scan)) 831 sayYES; 832 cc->cur = n - 1; 833 sayNO; 834 } 835 836 /* Prefer next over scan for minimal matching. */ 837 838 if (cc->minmod) { 839 regcc = cc->oldcc; 840 ln = regcc->cur; 841 if (regmatch(cc->next)) 842 sayYES; /* All done. */ 843 regcc->cur = ln; 844 regcc = cc; 845 846 if (n >= cc->max) /* Maximum greed exceeded? */ 847 sayNO; 848 849 /* Try scanning more and see if it helps. */ 850 reginput = locinput; 851 cc->cur = n; 852 cc->lastloc = locinput; 853 if (regmatch(cc->scan)) 854 sayYES; 855 cc->cur = n - 1; 856 sayNO; 857 } 858 859 /* Prefer scan over next for maximal matching. */ 860 861 if (n < cc->max) { /* More greed allowed? */ 862 regcppush(cc->parenfloor); 863 cc->cur = n; 864 cc->lastloc = locinput; 865 if (regmatch(cc->scan)) 866 sayYES; 867 regcppop(); /* Restore some previous $<digit>s? */ 868 reginput = locinput; 869 } 870 871 /* Failed deeper matches of scan, so see if this one works. */ 872 regcc = cc->oldcc; 873 ln = regcc->cur; 874 if (regmatch(cc->next)) 875 sayYES; 876 regcc->cur = ln; 877 regcc = cc; 878 cc->cur = n - 1; 879 sayNO; 880 } 881 /* NOT REACHED */ 882 case BRANCH: { 883 if (OP(next) != BRANCH) /* No choice. */ 884 next = NEXTOPER(scan);/* Avoid recursion. */ 885 else { 886 int lastparen = *reglastparen; 887 do { 888 reginput = locinput; 889 if (regmatch(NEXTOPER(scan))) 890 sayYES; 891 for (n = *reglastparen; n > lastparen; n--) 892 regendp[n] = 0; 893 *reglastparen = n; 894 895 #ifdef REGALIGN 896 /*SUPPRESS 560*/ 897 if (n = NEXT(scan)) 898 scan += n; 899 else 900 scan = NULL; 901 #else 902 scan = regnext(scan); 903 #endif 904 } while (scan != NULL && OP(scan) == BRANCH); 905 sayNO; 906 /* NOTREACHED */ 907 } 908 } 909 break; 910 case MINMOD: 911 minmod = 1; 912 break; 913 case CURLY: 914 ln = ARG1(scan); /* min to match */ 915 n = ARG2(scan); /* max to match */ 916 scan = NEXTOPER(scan) + 4; 917 goto repeat; 918 case STAR: 919 ln = 0; 920 n = 32767; 921 scan = NEXTOPER(scan); 922 goto repeat; 923 case PLUS: 924 /* 925 * Lookahead to avoid useless match attempts 926 * when we know what character comes next. 927 */ 928 ln = 1; 929 n = 32767; 930 scan = NEXTOPER(scan); 931 repeat: 932 if (OP(next) == EXACTLY) 933 nextchar = *(OPERAND(next)+1); 934 else 935 nextchar = -1000; 936 reginput = locinput; 937 if (minmod) { 938 minmod = 0; 939 if (ln && regrepeat(scan, ln) < ln) 940 sayNO; 941 while (n >= ln || (n == 32767 && ln > 0)) { /* ln overflow ? */ 942 /* If it could work, try it. */ 943 if (nextchar == -1000 || *reginput == nextchar) 944 if (regmatch(next)) 945 sayYES; 946 /* Couldn't or didn't -- back up. */ 947 reginput = locinput + ln; 948 if (regrepeat(scan, 1)) { 949 ln++; 950 reginput = locinput + ln; 951 } 952 else 953 sayNO; 954 } 955 } 956 else { 957 n = regrepeat(scan, n); 958 if (ln < n && regkind[(U8)OP(next)] == EOL && 959 (!multiline || OP(next) == SEOL)) 960 ln = n; /* why back off? */ 961 while (n >= ln) { 962 /* If it could work, try it. */ 963 if (nextchar == -1000 || *reginput == nextchar) 964 if (regmatch(next)) 965 sayYES; 966 /* Couldn't or didn't -- back up. */ 967 n--; 968 reginput = locinput + n; 969 } 970 } 971 sayNO; 972 case SUCCEED: 973 case END: 974 reginput = locinput; /* put where regtry can find it */ 975 sayYES; /* Success! */ 976 case IFMATCH: 977 reginput = locinput; 978 scan = NEXTOPER(scan); 979 if (!regmatch(scan)) 980 sayNO; 981 break; 982 case UNLESSM: 983 reginput = locinput; 984 scan = NEXTOPER(scan); 985 if (regmatch(scan)) 986 sayNO; 987 break; 988 default: 989 fprintf(stderr, "%x %d\n",(unsigned)scan,scan[1]); 990 FAIL("regexp memory corruption"); 991 } 992 scan = next; 993 } 994 995 /* 996 * We get here only if there's trouble -- normally "case END" is 997 * the terminating point. 998 */ 999 FAIL("corrupted regexp pointers"); 1000 /*NOTREACHED*/ 1001 sayNO; 1002 1003 yes: 1004 #ifdef DEBUGGING 1005 regindent--; 1006 #endif 1007 return 1; 1008 1009 no: 1010 #ifdef DEBUGGING 1011 regindent--; 1012 #endif 1013 return 0; 1014 } 1015 1016 /* 1017 - regrepeat - repeatedly match something simple, report how many 1018 */ 1019 /* 1020 * [This routine now assumes that it will only match on things of length 1. 1021 * That was true before, but now we assume scan - reginput is the count, 1022 * rather than incrementing count on every character.] 1023 */ 1024 static I32 1025 regrepeat(p, max) 1026 char *p; 1027 I32 max; 1028 { 1029 register char *scan; 1030 register char *opnd; 1031 register I32 c; 1032 register char *loceol = regeol; 1033 1034 scan = reginput; 1035 if (max != 32767 && max < loceol - scan) 1036 loceol = scan + max; 1037 opnd = OPERAND(p); 1038 switch (OP(p)) { 1039 case ANY: 1040 while (scan < loceol && *scan != '\n') 1041 scan++; 1042 break; 1043 case SANY: 1044 scan = loceol; 1045 break; 1046 case EXACTLY: /* length of string is 1 */ 1047 opnd++; 1048 while (scan < loceol && *opnd == *scan) 1049 scan++; 1050 break; 1051 case ANYOF: 1052 c = UCHARAT(scan); 1053 while (scan < loceol && !(opnd[c >> 3] & (1 << (c & 7)))) { 1054 scan++; 1055 c = UCHARAT(scan); 1056 } 1057 break; 1058 case ALNUM: 1059 while (scan < loceol && isALNUM(*scan)) 1060 scan++; 1061 break; 1062 case NALNUM: 1063 while (scan < loceol && !isALNUM(*scan)) 1064 scan++; 1065 break; 1066 case SPACE: 1067 while (scan < loceol && isSPACE(*scan)) 1068 scan++; 1069 break; 1070 case NSPACE: 1071 while (scan < loceol && !isSPACE(*scan)) 1072 scan++; 1073 break; 1074 case DIGIT: 1075 while (scan < loceol && isDIGIT(*scan)) 1076 scan++; 1077 break; 1078 case NDIGIT: 1079 while (scan < loceol && !isDIGIT(*scan)) 1080 scan++; 1081 break; 1082 default: /* Called on something of 0 width. */ 1083 break; /* So match right here or not at all. */ 1084 } 1085 1086 c = scan - reginput; 1087 reginput = scan; 1088 1089 return(c); 1090 } 1091 1092 /* 1093 - regnext - dig the "next" pointer out of a node 1094 * 1095 * [Note, when REGALIGN is defined there are two places in regmatch() 1096 * that bypass this code for speed.] 1097 */ 1098 char * 1099 regnext(p) 1100 register char *p; 1101 { 1102 register I32 offset; 1103 1104 if (p == ®dummy) 1105 return(NULL); 1106 1107 offset = NEXT(p); 1108 if (offset == 0) 1109 return(NULL); 1110 1111 #ifdef REGALIGN 1112 return(p+offset); 1113 #else 1114 if (OP(p) == BACK) 1115 return(p-offset); 1116 else 1117 return(p+offset); 1118 #endif 1119 } 1120