1 /* $NetBSD: engine.c,v 1.12 1999/09/16 11:45:20 lukem Exp $ */ 2 3 /*- 4 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 5 * Copyright (c) 1992, 1993, 1994 6 * The Regents of the University of California. All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * Henry Spencer. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. All advertising materials mentioning features or use of this software 20 * must display the following acknowledgement: 21 * This product includes software developed by the University of 22 * California, Berkeley and its contributors. 23 * 4. Neither the name of the University nor the names of its contributors 24 * may be used to endorse or promote products derived from this software 25 * without specific prior written permission. 26 * 27 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 30 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 37 * SUCH DAMAGE. 38 * 39 * @(#)engine.c 8.5 (Berkeley) 3/20/94 40 */ 41 42 /* 43 * The matching engine and friends. This file is #included by regexec.c 44 * after suitable #defines of a variety of macros used herein, so that 45 * different state representations can be used without duplicating masses 46 * of code. 47 */ 48 49 #ifdef SNAMES 50 #define matcher smatcher 51 #define fast sfast 52 #define slow sslow 53 #define dissect sdissect 54 #define backref sbackref 55 #define step sstep 56 #define print sprint 57 #define at sat 58 #define match smat 59 #endif 60 #ifdef LNAMES 61 #define matcher lmatcher 62 #define fast lfast 63 #define slow lslow 64 #define dissect ldissect 65 #define backref lbackref 66 #define step lstep 67 #define print lprint 68 #define at lat 69 #define match lmat 70 #endif 71 72 /* another structure passed up and down to avoid zillions of parameters */ 73 struct match { 74 struct re_guts *g; 75 int eflags; 76 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ 77 char *offp; /* offsets work from here */ 78 char *beginp; /* start of string -- virtual NUL precedes */ 79 char *endp; /* end of string -- virtual NUL here */ 80 char *coldp; /* can be no match starting before here */ 81 char **lastpos; /* [nplus+1] */ 82 STATEVARS; 83 states st; /* current states */ 84 states fresh; /* states for a fresh start */ 85 states tmp; /* temporary */ 86 states empty; /* empty set of states */ 87 }; 88 89 /* ========= begin header generated by ./mkh ========= */ 90 #ifdef __cplusplus 91 extern "C" { 92 #endif 93 94 /* === engine.c === */ 95 static int matcher __P((struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], int eflags)); 96 static char *dissect __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst)); 97 static char *backref __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst, sopno lev)); 98 static char *fast __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst)); 99 static char *slow __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst)); 100 static states step __P((struct re_guts *g, sopno start, sopno stop, states bef, int ch, states aft)); 101 #define BOL (OUT+1) 102 #define EOL (BOL+1) 103 #define BOLEOL (BOL+2) 104 #define NOTHING (BOL+3) 105 #define BOW (BOL+4) 106 #define EOW (BOL+5) 107 #define CODEMAX (BOL+5) /* highest code used */ 108 #define NONCHAR(c) ((c) > CHAR_MAX) 109 #define NNONCHAR (CODEMAX-CHAR_MAX) 110 #ifdef REDEBUG 111 static void print __P((struct match *m, char *caption, states st, int ch, FILE *d)); 112 #endif 113 #ifdef REDEBUG 114 static void at __P((struct match *m, char *title, char *start, char *stop, sopno startst, sopno stopst)); 115 #endif 116 #ifdef REDEBUG 117 static char *pchar __P((int ch)); 118 #endif 119 120 #ifdef __cplusplus 121 } 122 #endif 123 /* ========= end header generated by ./mkh ========= */ 124 125 #ifdef REDEBUG 126 #define SP(t, s, c) print(m, t, s, c, stdout) 127 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) 128 #define NOTE(str) { if (m->eflags®_TRACE) printf("=%s\n", (str)); } 129 #else 130 #define SP(t, s, c) /* nothing */ 131 #define AT(t, p1, p2, s1, s2) /* nothing */ 132 #define NOTE(s) /* nothing */ 133 #endif 134 135 /* 136 - matcher - the actual matching engine 137 == static int matcher(struct re_guts *g, char *string, \ 138 == size_t nmatch, regmatch_t pmatch[], int eflags); 139 */ 140 static int /* 0 success, REG_NOMATCH failure */ 141 matcher(g, string, nmatch, pmatch, eflags) 142 struct re_guts *g; 143 char *string; 144 size_t nmatch; 145 regmatch_t pmatch[]; 146 int eflags; 147 { 148 char *endp; 149 int i; 150 struct match mv; 151 struct match *m = &mv; 152 char *dp; 153 const sopno gf = g->firststate+1; /* +1 for OEND */ 154 const sopno gl = g->laststate; 155 char *start; 156 char *stop; 157 158 _DIAGASSERT(g != NULL); 159 _DIAGASSERT(string != NULL); 160 /* pmatch checked below */ 161 162 /* simplify the situation where possible */ 163 if (g->cflags®_NOSUB) 164 nmatch = 0; 165 if (eflags®_STARTEND) { 166 _DIAGASSERT(pmatch != NULL); 167 start = string + (size_t)pmatch[0].rm_so; 168 stop = string + (size_t)pmatch[0].rm_eo; 169 } else { 170 start = string; 171 stop = start + strlen(start); 172 } 173 if (stop < start) 174 return(REG_INVARG); 175 176 /* prescreening; this does wonders for this rather slow code */ 177 if (g->must != NULL) { 178 for (dp = start; dp < stop; dp++) 179 if (*dp == g->must[0] && stop - dp >= g->mlen && 180 memcmp(dp, g->must, (size_t)g->mlen) == 0) 181 break; 182 if (dp == stop) /* we didn't find g->must */ 183 return(REG_NOMATCH); 184 } 185 186 /* match struct setup */ 187 m->g = g; 188 m->eflags = eflags; 189 m->pmatch = NULL; 190 m->lastpos = NULL; 191 m->offp = string; 192 m->beginp = start; 193 m->endp = stop; 194 STATESETUP(m, 4); 195 SETUP(m->st); 196 SETUP(m->fresh); 197 SETUP(m->tmp); 198 SETUP(m->empty); 199 CLEAR(m->empty); 200 201 /* this loop does only one repetition except for backrefs */ 202 for (;;) { 203 endp = fast(m, start, stop, gf, gl); 204 if (endp == NULL) { /* a miss */ 205 STATETEARDOWN(m); 206 return(REG_NOMATCH); 207 } 208 if (nmatch == 0 && !g->backrefs) 209 break; /* no further info needed */ 210 211 /* where? */ 212 assert(m->coldp != NULL); 213 for (;;) { 214 NOTE("finding start"); 215 endp = slow(m, m->coldp, stop, gf, gl); 216 if (endp != NULL) 217 break; 218 assert(m->coldp < m->endp); 219 m->coldp++; 220 } 221 if (nmatch == 1 && !g->backrefs) 222 break; /* no further info needed */ 223 224 /* oh my, he wants the subexpressions... */ 225 if (m->pmatch == NULL) 226 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) * 227 sizeof(regmatch_t)); 228 if (m->pmatch == NULL) { 229 STATETEARDOWN(m); 230 return(REG_ESPACE); 231 } 232 for (i = 1; i <= m->g->nsub; i++) 233 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = (regoff_t)-1; 234 if (!g->backrefs && !(m->eflags®_BACKR)) { 235 NOTE("dissecting"); 236 dp = dissect(m, m->coldp, endp, gf, gl); 237 } else { 238 if (g->nplus > 0 && m->lastpos == NULL) 239 m->lastpos = (char **)malloc((g->nplus+1) * 240 sizeof(char *)); 241 if (g->nplus > 0 && m->lastpos == NULL) { 242 free(m->pmatch); 243 STATETEARDOWN(m); 244 return(REG_ESPACE); 245 } 246 NOTE("backref dissect"); 247 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0); 248 } 249 if (dp != NULL) 250 break; 251 252 /* uh-oh... we couldn't find a subexpression-level match */ 253 assert(g->backrefs); /* must be back references doing it */ 254 assert(g->nplus == 0 || m->lastpos != NULL); 255 for (;;) { 256 if (dp != NULL || endp <= m->coldp) 257 break; /* defeat */ 258 NOTE("backoff"); 259 endp = slow(m, m->coldp, endp-1, gf, gl); 260 if (endp == NULL) 261 break; /* defeat */ 262 /* try it on a shorter possibility */ 263 #ifndef NDEBUG 264 for (i = 1; i <= m->g->nsub; i++) { 265 assert(m->pmatch[i].rm_so == (regoff_t)-1); 266 assert(m->pmatch[i].rm_eo == (regoff_t)-1); 267 } 268 #endif 269 NOTE("backoff dissect"); 270 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0); 271 } 272 assert(dp == NULL || dp == endp); 273 if (dp != NULL) /* found a shorter one */ 274 break; 275 276 /* despite initial appearances, there is no match here */ 277 NOTE("false alarm"); 278 start = m->coldp + 1; /* recycle starting later */ 279 assert(start <= stop); 280 } 281 282 /* fill in the details if requested */ 283 if (nmatch > 0) { 284 _DIAGASSERT(pmatch != NULL); 285 pmatch[0].rm_so = m->coldp - m->offp; 286 pmatch[0].rm_eo = endp - m->offp; 287 } 288 if (nmatch > 1) { 289 assert(m->pmatch != NULL); 290 for (i = 1; i < nmatch; i++) 291 if (i <= m->g->nsub) 292 pmatch[i] = m->pmatch[i]; 293 else { 294 pmatch[i].rm_so = (regoff_t)-1; 295 pmatch[i].rm_eo = (regoff_t)-1; 296 } 297 } 298 299 if (m->pmatch != NULL) 300 free(m->pmatch); 301 if (m->lastpos != NULL) 302 free(m->lastpos); 303 STATETEARDOWN(m); 304 return(0); 305 } 306 307 /* 308 - dissect - figure out what matched what, no back references 309 == static char *dissect(struct match *m, char *start, \ 310 == char *stop, sopno startst, sopno stopst); 311 */ 312 static char * /* == stop (success) always */ 313 dissect(m, start, stop, startst, stopst) 314 struct match *m; 315 char *start; 316 char *stop; 317 sopno startst; 318 sopno stopst; 319 { 320 int i; 321 sopno ss; /* start sop of current subRE */ 322 sopno es; /* end sop of current subRE */ 323 char *sp; /* start of string matched by it */ 324 char *stp; /* string matched by it cannot pass here */ 325 char *rest; /* start of rest of string */ 326 char *tail; /* string unmatched by rest of RE */ 327 sopno ssub; /* start sop of subsubRE */ 328 sopno esub; /* end sop of subsubRE */ 329 char *ssp; /* start of string matched by subsubRE */ 330 char *sep; /* end of string matched by subsubRE */ 331 char *oldssp; /* previous ssp */ 332 #ifndef NDEBUG 333 char *dp; 334 #endif 335 336 _DIAGASSERT(m != NULL); 337 _DIAGASSERT(start != NULL); 338 _DIAGASSERT(stop != NULL); 339 340 AT("diss", start, stop, startst, stopst); 341 sp = start; 342 for (ss = startst; ss < stopst; ss = es) { 343 /* identify end of subRE */ 344 es = ss; 345 switch (OP(m->g->strip[es])) { 346 case OPLUS_: 347 case OQUEST_: 348 es += OPND(m->g->strip[es]); 349 break; 350 case OCH_: 351 while (OP(m->g->strip[es]) != O_CH) 352 es += OPND(m->g->strip[es]); 353 break; 354 } 355 es++; 356 357 /* figure out what it matched */ 358 switch (OP(m->g->strip[ss])) { 359 case OEND: 360 assert(nope); 361 break; 362 case OCHAR: 363 sp++; 364 break; 365 case OBOL: 366 case OEOL: 367 case OBOW: 368 case OEOW: 369 break; 370 case OANY: 371 case OANYOF: 372 sp++; 373 break; 374 case OBACK_: 375 case O_BACK: 376 assert(nope); 377 break; 378 /* cases where length of match is hard to find */ 379 case OQUEST_: 380 stp = stop; 381 for (;;) { 382 /* how long could this one be? */ 383 rest = slow(m, sp, stp, ss, es); 384 assert(rest != NULL); /* it did match */ 385 /* could the rest match the rest? */ 386 tail = slow(m, rest, stop, es, stopst); 387 if (tail == stop) 388 break; /* yes! */ 389 /* no -- try a shorter match for this one */ 390 stp = rest - 1; 391 assert(stp >= sp); /* it did work */ 392 } 393 ssub = ss + 1; 394 esub = es - 1; 395 /* did innards match? */ 396 if (slow(m, sp, rest, ssub, esub) != NULL) { 397 #ifdef NDEBUG 398 (void) 399 #else 400 dp = 401 #endif 402 dissect(m, sp, rest, ssub, esub); 403 assert(dp == rest); 404 } else /* no */ 405 assert(sp == rest); 406 sp = rest; 407 break; 408 case OPLUS_: 409 stp = stop; 410 for (;;) { 411 /* how long could this one be? */ 412 rest = slow(m, sp, stp, ss, es); 413 assert(rest != NULL); /* it did match */ 414 /* could the rest match the rest? */ 415 tail = slow(m, rest, stop, es, stopst); 416 if (tail == stop) 417 break; /* yes! */ 418 /* no -- try a shorter match for this one */ 419 stp = rest - 1; 420 assert(stp >= sp); /* it did work */ 421 } 422 ssub = ss + 1; 423 esub = es - 1; 424 ssp = sp; 425 oldssp = ssp; 426 for (;;) { /* find last match of innards */ 427 sep = slow(m, ssp, rest, ssub, esub); 428 if (sep == NULL || sep == ssp) 429 break; /* failed or matched null */ 430 oldssp = ssp; /* on to next try */ 431 ssp = sep; 432 } 433 if (sep == NULL) { 434 /* last successful match */ 435 sep = ssp; 436 ssp = oldssp; 437 } 438 assert(sep == rest); /* must exhaust substring */ 439 assert(slow(m, ssp, sep, ssub, esub) == rest); 440 #ifdef NDEBUG 441 (void) 442 #else 443 dp = 444 #endif 445 dissect(m, ssp, sep, ssub, esub); 446 assert(dp == sep); 447 sp = rest; 448 break; 449 case OCH_: 450 stp = stop; 451 for (;;) { 452 /* how long could this one be? */ 453 rest = slow(m, sp, stp, ss, es); 454 assert(rest != NULL); /* it did match */ 455 /* could the rest match the rest? */ 456 tail = slow(m, rest, stop, es, stopst); 457 if (tail == stop) 458 break; /* yes! */ 459 /* no -- try a shorter match for this one */ 460 stp = rest - 1; 461 assert(stp >= sp); /* it did work */ 462 } 463 ssub = ss + 1; 464 esub = ss + OPND(m->g->strip[ss]) - 1; 465 assert(OP(m->g->strip[esub]) == OOR1); 466 for (;;) { /* find first matching branch */ 467 if (slow(m, sp, rest, ssub, esub) == rest) 468 break; /* it matched all of it */ 469 /* that one missed, try next one */ 470 assert(OP(m->g->strip[esub]) == OOR1); 471 esub++; 472 assert(OP(m->g->strip[esub]) == OOR2); 473 ssub = esub + 1; 474 esub += OPND(m->g->strip[esub]); 475 if (OP(m->g->strip[esub]) == OOR2) 476 esub--; 477 else 478 assert(OP(m->g->strip[esub]) == O_CH); 479 } 480 #ifdef NDEBUG 481 (void) 482 #else 483 dp = 484 #endif 485 dissect(m, sp, rest, ssub, esub); 486 assert(dp == rest); 487 sp = rest; 488 break; 489 case O_PLUS: 490 case O_QUEST: 491 case OOR1: 492 case OOR2: 493 case O_CH: 494 assert(nope); 495 break; 496 case OLPAREN: 497 i = OPND(m->g->strip[ss]); 498 assert(0 < i && i <= m->g->nsub); 499 m->pmatch[i].rm_so = sp - m->offp; 500 break; 501 case ORPAREN: 502 i = OPND(m->g->strip[ss]); 503 assert(0 < i && i <= m->g->nsub); 504 m->pmatch[i].rm_eo = sp - m->offp; 505 break; 506 default: /* uh oh */ 507 assert(nope); 508 break; 509 } 510 } 511 512 assert(sp == stop); 513 return(sp); 514 } 515 516 /* 517 - backref - figure out what matched what, figuring in back references 518 == static char *backref(struct match *m, char *start, \ 519 == char *stop, sopno startst, sopno stopst, sopno lev); 520 */ 521 static char * /* == stop (success) or NULL (failure) */ 522 backref(m, start, stop, startst, stopst, lev) 523 struct match *m; 524 char *start; 525 char *stop; 526 sopno startst; 527 sopno stopst; 528 sopno lev; /* PLUS nesting level */ 529 { 530 int i; 531 sopno ss; /* start sop of current subRE */ 532 char *sp; /* start of string matched by it */ 533 sopno ssub; /* start sop of subsubRE */ 534 sopno esub; /* end sop of subsubRE */ 535 char *ssp; /* start of string matched by subsubRE */ 536 char *dp; 537 size_t len; 538 int hard; 539 sop s; 540 regoff_t offsave; 541 cset *cs; 542 543 _DIAGASSERT(m != NULL); 544 _DIAGASSERT(start != NULL); 545 _DIAGASSERT(stop != NULL); 546 547 AT("back", start, stop, startst, stopst); 548 sp = start; 549 550 /* get as far as we can with easy stuff */ 551 hard = 0; 552 for (ss = startst; !hard && ss < stopst; ss++) 553 switch (OP(s = m->g->strip[ss])) { 554 case OCHAR: 555 if (sp == stop || *sp++ != (char)OPND(s)) 556 return(NULL); 557 break; 558 case OANY: 559 if (sp == stop) 560 return(NULL); 561 sp++; 562 break; 563 case OANYOF: 564 cs = &m->g->sets[OPND(s)]; 565 if (sp == stop || !CHIN(cs, *sp++)) 566 return(NULL); 567 break; 568 case OBOL: 569 if ( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 570 (sp < m->endp && *(sp-1) == '\n' && 571 (m->g->cflags®_NEWLINE)) ) 572 { /* yes */ } 573 else 574 return(NULL); 575 break; 576 case OEOL: 577 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) || 578 (sp < m->endp && *sp == '\n' && 579 (m->g->cflags®_NEWLINE)) ) 580 { /* yes */ } 581 else 582 return(NULL); 583 break; 584 case OBOW: 585 if (( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 586 (sp < m->endp && *(sp-1) == '\n' && 587 (m->g->cflags®_NEWLINE)) || 588 (sp > m->beginp && 589 !ISWORD(*(sp-1))) ) && 590 (sp < m->endp && ISWORD(*sp)) ) 591 { /* yes */ } 592 else 593 return(NULL); 594 break; 595 case OEOW: 596 if (( (sp == m->endp && !(m->eflags®_NOTEOL)) || 597 (sp < m->endp && *sp == '\n' && 598 (m->g->cflags®_NEWLINE)) || 599 (sp < m->endp && !ISWORD(*sp)) ) && 600 (sp > m->beginp && ISWORD(*(sp-1))) ) 601 { /* yes */ } 602 else 603 return(NULL); 604 break; 605 case O_QUEST: 606 break; 607 case OOR1: /* matches null but needs to skip */ 608 ss++; 609 s = m->g->strip[ss]; 610 do { 611 assert(OP(s) == OOR2); 612 ss += OPND(s); 613 } while (OP(s = m->g->strip[ss]) != O_CH); 614 /* note that the ss++ gets us past the O_CH */ 615 break; 616 default: /* have to make a choice */ 617 hard = 1; 618 break; 619 } 620 if (!hard) { /* that was it! */ 621 if (sp != stop) 622 return(NULL); 623 return(sp); 624 } 625 ss--; /* adjust for the for's final increment */ 626 627 /* the hard stuff */ 628 AT("hard", sp, stop, ss, stopst); 629 s = m->g->strip[ss]; 630 switch (OP(s)) { 631 case OBACK_: /* the vilest depths */ 632 i = OPND(s); 633 assert(0 < i && i <= m->g->nsub); 634 if (m->pmatch[i].rm_eo == (regoff_t)-1) 635 return(NULL); 636 assert(m->pmatch[i].rm_so != (regoff_t)-1); 637 len = (size_t)(m->pmatch[i].rm_eo - m->pmatch[i].rm_so); 638 assert(stop - m->beginp >= len); 639 if (sp > stop - len) 640 return(NULL); /* not enough left to match */ 641 ssp = m->offp + (size_t)m->pmatch[i].rm_so; 642 if (memcmp(sp, ssp, len) != 0) 643 return(NULL); 644 while (m->g->strip[ss] != SOP(O_BACK, i)) 645 ss++; 646 return(backref(m, sp+len, stop, ss+1, stopst, lev)); 647 648 case OQUEST_: /* to null or not */ 649 dp = backref(m, sp, stop, ss+1, stopst, lev); 650 if (dp != NULL) 651 return(dp); /* not */ 652 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev)); 653 654 case OPLUS_: 655 assert(m->lastpos != NULL); 656 assert(lev+1 <= m->g->nplus); 657 m->lastpos[lev+1] = sp; 658 return(backref(m, sp, stop, ss+1, stopst, lev+1)); 659 660 case O_PLUS: 661 if (sp == m->lastpos[lev]) /* last pass matched null */ 662 return(backref(m, sp, stop, ss+1, stopst, lev-1)); 663 /* try another pass */ 664 m->lastpos[lev] = sp; 665 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev); 666 if (dp == NULL) 667 dp = backref(m, sp, stop, ss+1, stopst, lev-1); 668 return(dp); 669 670 case OCH_: /* find the right one, if any */ 671 ssub = ss + 1; 672 esub = ss + OPND(s) - 1; 673 assert(OP(m->g->strip[esub]) == OOR1); 674 for (;;) { /* find first matching branch */ 675 dp = backref(m, sp, stop, ssub, esub, lev); 676 if (dp != NULL) 677 return(dp); 678 /* that one missed, try next one */ 679 if (OP(m->g->strip[esub]) == O_CH) 680 return(NULL); /* there is none */ 681 esub++; 682 assert(OP(m->g->strip[esub]) == OOR2); 683 ssub = esub + 1; 684 esub += OPND(m->g->strip[esub]); 685 if (OP(m->g->strip[esub]) == OOR2) 686 esub--; 687 else 688 assert(OP(m->g->strip[esub]) == O_CH); 689 } 690 691 case OLPAREN: /* must undo assignment if rest fails */ 692 i = OPND(s); 693 assert(0 < i && i <= m->g->nsub); 694 offsave = m->pmatch[i].rm_so; 695 m->pmatch[i].rm_so = sp - m->offp; 696 dp = backref(m, sp, stop, ss+1, stopst, lev); 697 if (dp != NULL) 698 return(dp); 699 m->pmatch[i].rm_so = offsave; 700 return(NULL); 701 702 case ORPAREN: /* must undo assignment if rest fails */ 703 i = OPND(s); 704 assert(0 < i && i <= m->g->nsub); 705 offsave = m->pmatch[i].rm_eo; 706 m->pmatch[i].rm_eo = sp - m->offp; 707 dp = backref(m, sp, stop, ss+1, stopst, lev); 708 if (dp != NULL) 709 return(dp); 710 m->pmatch[i].rm_eo = offsave; 711 return(NULL); 712 713 default: /* uh oh */ 714 assert(nope); 715 break; 716 } 717 718 /* "can't happen" */ 719 assert(nope); 720 /* NOTREACHED */ 721 return NULL; 722 } 723 724 /* 725 - fast - step through the string at top speed 726 == static char *fast(struct match *m, char *start, \ 727 == char *stop, sopno startst, sopno stopst); 728 */ 729 static char * /* where tentative match ended, or NULL */ 730 fast(m, start, stop, startst, stopst) 731 struct match *m; 732 char *start; 733 char *stop; 734 sopno startst; 735 sopno stopst; 736 { 737 states st = m->st; 738 states fresh = m->fresh; 739 states tmp = m->tmp; 740 char *p = start; 741 int c = (start == m->beginp) ? OUT : *(start-1); 742 int lastc; /* previous c */ 743 int flagch; 744 int i; 745 char *coldp; /* last p after which no match was underway */ 746 747 _DIAGASSERT(m != NULL); 748 _DIAGASSERT(start != NULL); 749 _DIAGASSERT(stop != NULL); 750 751 CLEAR(st); 752 SET1(st, startst); 753 st = step(m->g, startst, stopst, st, NOTHING, st); 754 ASSIGN(fresh, st); 755 SP("start", st, *p); 756 coldp = NULL; 757 for (;;) { 758 /* next character */ 759 lastc = c; 760 c = (p == m->endp) ? OUT : *p; 761 if (EQ(st, fresh)) 762 coldp = p; 763 764 /* is there an EOL and/or BOL between lastc and c? */ 765 flagch = '\0'; 766 i = 0; 767 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 768 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 769 flagch = BOL; 770 i = m->g->nbol; 771 } 772 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 773 (c == OUT && !(m->eflags®_NOTEOL)) ) { 774 flagch = (flagch == BOL) ? BOLEOL : EOL; 775 i += m->g->neol; 776 } 777 if (i != 0) { 778 for (; i > 0; i--) 779 st = step(m->g, startst, stopst, st, flagch, st); 780 SP("boleol", st, c); 781 } 782 783 /* how about a word boundary? */ 784 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 785 (c != OUT && ISWORD(c)) ) { 786 flagch = BOW; 787 } 788 if ( (lastc != OUT && ISWORD(lastc)) && 789 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 790 flagch = EOW; 791 } 792 if (flagch == BOW || flagch == EOW) { 793 st = step(m->g, startst, stopst, st, flagch, st); 794 SP("boweow", st, c); 795 } 796 797 /* are we done? */ 798 if (ISSET(st, stopst) || p == stop) 799 break; /* NOTE BREAK OUT */ 800 801 /* no, we must deal with this character */ 802 ASSIGN(tmp, st); 803 ASSIGN(st, fresh); 804 assert(c != OUT); 805 st = step(m->g, startst, stopst, tmp, c, st); 806 SP("aft", st, c); 807 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 808 p++; 809 } 810 811 assert(coldp != NULL); 812 m->coldp = coldp; 813 if (ISSET(st, stopst)) 814 return(p+1); 815 else 816 return(NULL); 817 } 818 819 /* 820 - slow - step through the string more deliberately 821 == static char *slow(struct match *m, char *start, \ 822 == char *stop, sopno startst, sopno stopst); 823 */ 824 static char * /* where it ended */ 825 slow(m, start, stop, startst, stopst) 826 struct match *m; 827 char *start; 828 char *stop; 829 sopno startst; 830 sopno stopst; 831 { 832 states st = m->st; 833 states empty = m->empty; 834 states tmp = m->tmp; 835 char *p = start; 836 int c = (start == m->beginp) ? OUT : *(start-1); 837 int lastc; /* previous c */ 838 int flagch; 839 int i; 840 char *matchp; /* last p at which a match ended */ 841 842 _DIAGASSERT(m != NULL); 843 _DIAGASSERT(start != NULL); 844 _DIAGASSERT(stop != NULL); 845 846 AT("slow", start, stop, startst, stopst); 847 CLEAR(st); 848 SET1(st, startst); 849 SP("sstart", st, *p); 850 st = step(m->g, startst, stopst, st, NOTHING, st); 851 matchp = NULL; 852 for (;;) { 853 /* next character */ 854 lastc = c; 855 c = (p == m->endp) ? OUT : *p; 856 857 /* is there an EOL and/or BOL between lastc and c? */ 858 flagch = '\0'; 859 i = 0; 860 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 861 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 862 flagch = BOL; 863 i = m->g->nbol; 864 } 865 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 866 (c == OUT && !(m->eflags®_NOTEOL)) ) { 867 flagch = (flagch == BOL) ? BOLEOL : EOL; 868 i += m->g->neol; 869 } 870 if (i != 0) { 871 for (; i > 0; i--) 872 st = step(m->g, startst, stopst, st, flagch, st); 873 SP("sboleol", st, c); 874 } 875 876 /* how about a word boundary? */ 877 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 878 (c != OUT && ISWORD(c)) ) { 879 flagch = BOW; 880 } 881 if ( (lastc != OUT && ISWORD(lastc)) && 882 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 883 flagch = EOW; 884 } 885 if (flagch == BOW || flagch == EOW) { 886 st = step(m->g, startst, stopst, st, flagch, st); 887 SP("sboweow", st, c); 888 } 889 890 /* are we done? */ 891 if (ISSET(st, stopst)) 892 matchp = p; 893 if (EQ(st, empty) || p == stop) 894 break; /* NOTE BREAK OUT */ 895 896 /* no, we must deal with this character */ 897 ASSIGN(tmp, st); 898 ASSIGN(st, empty); 899 assert(c != OUT); 900 st = step(m->g, startst, stopst, tmp, c, st); 901 SP("saft", st, c); 902 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 903 p++; 904 } 905 906 return(matchp); 907 } 908 909 910 /* 911 - step - map set of states reachable before char to set reachable after 912 == static states step(struct re_guts *g, sopno start, sopno stop, \ 913 == states bef, int ch, states aft); 914 == #define BOL (OUT+1) 915 == #define EOL (BOL+1) 916 == #define BOLEOL (BOL+2) 917 == #define NOTHING (BOL+3) 918 == #define BOW (BOL+4) 919 == #define EOW (BOL+5) 920 == #define CODEMAX (BOL+5) // highest code used 921 == #define NONCHAR(c) ((c) > CHAR_MAX) 922 == #define NNONCHAR (CODEMAX-CHAR_MAX) 923 */ 924 static states 925 step(g, start, stop, bef, ch, aft) 926 struct re_guts *g; 927 sopno start; /* start state within strip */ 928 sopno stop; /* state after stop state within strip */ 929 states bef; /* states reachable before */ 930 int ch; /* character or NONCHAR code */ 931 states aft; /* states already known reachable after */ 932 { 933 cset *cs; 934 sop s; 935 sopno pc; 936 onestate here; /* note, macros know this name */ 937 sopno look; 938 int i; 939 940 _DIAGASSERT(g != NULL); 941 942 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { 943 s = g->strip[pc]; 944 switch (OP(s)) { 945 case OEND: 946 assert(pc == stop-1); 947 break; 948 case OCHAR: 949 /* only characters can match */ 950 assert(!NONCHAR(ch) || ch != (char)OPND(s)); 951 if (ch == (char)OPND(s)) 952 FWD(aft, bef, 1); 953 break; 954 case OBOL: 955 if (ch == BOL || ch == BOLEOL) 956 FWD(aft, bef, 1); 957 break; 958 case OEOL: 959 if (ch == EOL || ch == BOLEOL) 960 FWD(aft, bef, 1); 961 break; 962 case OBOW: 963 if (ch == BOW) 964 FWD(aft, bef, 1); 965 break; 966 case OEOW: 967 if (ch == EOW) 968 FWD(aft, bef, 1); 969 break; 970 case OANY: 971 if (!NONCHAR(ch)) 972 FWD(aft, bef, 1); 973 break; 974 case OANYOF: 975 cs = &g->sets[OPND(s)]; 976 if (!NONCHAR(ch) && CHIN(cs, ch)) 977 FWD(aft, bef, 1); 978 break; 979 case OBACK_: /* ignored here */ 980 case O_BACK: 981 FWD(aft, aft, 1); 982 break; 983 case OPLUS_: /* forward, this is just an empty */ 984 FWD(aft, aft, 1); 985 break; 986 case O_PLUS: /* both forward and back */ 987 FWD(aft, aft, 1); 988 i = ISSETBACK(aft, OPND(s)); 989 BACK(aft, aft, OPND(s)); 990 if (!i && ISSETBACK(aft, OPND(s))) { 991 /* oho, must reconsider loop body */ 992 pc -= OPND(s) + 1; 993 INIT(here, pc); 994 } 995 break; 996 case OQUEST_: /* two branches, both forward */ 997 FWD(aft, aft, 1); 998 FWD(aft, aft, OPND(s)); 999 break; 1000 case O_QUEST: /* just an empty */ 1001 FWD(aft, aft, 1); 1002 break; 1003 case OLPAREN: /* not significant here */ 1004 case ORPAREN: 1005 FWD(aft, aft, 1); 1006 break; 1007 case OCH_: /* mark the first two branches */ 1008 FWD(aft, aft, 1); 1009 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 1010 FWD(aft, aft, OPND(s)); 1011 break; 1012 case OOR1: /* done a branch, find the O_CH */ 1013 if (ISSTATEIN(aft, here)) { 1014 for (look = 1; 1015 OP(s = g->strip[pc+look]) != O_CH; 1016 look += OPND(s)) 1017 assert(OP(s) == OOR2); 1018 FWD(aft, aft, look); 1019 } 1020 break; 1021 case OOR2: /* propagate OCH_'s marking */ 1022 FWD(aft, aft, 1); 1023 if (OP(g->strip[pc+OPND(s)]) != O_CH) { 1024 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 1025 FWD(aft, aft, OPND(s)); 1026 } 1027 break; 1028 case O_CH: /* just empty */ 1029 FWD(aft, aft, 1); 1030 break; 1031 default: /* ooooops... */ 1032 assert(nope); 1033 break; 1034 } 1035 } 1036 1037 return(aft); 1038 } 1039 1040 #ifdef REDEBUG 1041 /* 1042 - print - print a set of states 1043 == #ifdef REDEBUG 1044 == static void print(struct match *m, char *caption, states st, \ 1045 == int ch, FILE *d); 1046 == #endif 1047 */ 1048 static void 1049 print(m, caption, st, ch, d) 1050 struct match *m; 1051 char *caption; 1052 states st; 1053 int ch; 1054 FILE *d; 1055 { 1056 struct re_guts *g = m->g; 1057 int i; 1058 int first = 1; 1059 1060 _DIAGASSERT(m != NULL); 1061 _DIAGASSERT(caption != NULL); 1062 1063 if (!(m->eflags®_TRACE)) 1064 return; 1065 1066 _DIAGASSERT(d != NULL); 1067 1068 fprintf(d, "%s", caption); 1069 if (ch != '\0') 1070 fprintf(d, " %s", pchar(ch)); 1071 for (i = 0; i < g->nstates; i++) 1072 if (ISSET(st, i)) { 1073 fprintf(d, "%s%d", (first) ? "\t" : ", ", i); 1074 first = 0; 1075 } 1076 fprintf(d, "\n"); 1077 } 1078 1079 /* 1080 - at - print current situation 1081 == #ifdef REDEBUG 1082 == static void at(struct match *m, char *title, char *start, char *stop, \ 1083 == sopno startst, sopno stopst); 1084 == #endif 1085 */ 1086 static void 1087 at(m, title, start, stop, startst, stopst) 1088 struct match *m; 1089 char *title; 1090 char *start; 1091 char *stop; 1092 sopno startst; 1093 sopno stopst; 1094 { 1095 1096 _DIAGASSERT(m != NULL); 1097 _DIAGASSERT(title != NULL); 1098 _DIAGASSERT(start != NULL); 1099 _DIAGASSERT(stop != NULL); 1100 1101 if (!(m->eflags®_TRACE)) 1102 return; 1103 1104 printf("%s %s-", title, pchar(*start)); 1105 printf("%s ", pchar(*stop)); 1106 printf("%ld-%ld\n", (long)startst, (long)stopst); 1107 } 1108 1109 #ifndef PCHARDONE 1110 #define PCHARDONE /* never again */ 1111 /* 1112 - pchar - make a character printable 1113 == #ifdef REDEBUG 1114 == static char *pchar(int ch); 1115 == #endif 1116 * 1117 * Is this identical to regchar() over in debug.c? Well, yes. But a 1118 * duplicate here avoids having a debugging-capable regexec.o tied to 1119 * a matching debug.o, and this is convenient. It all disappears in 1120 * the non-debug compilation anyway, so it doesn't matter much. 1121 */ 1122 static char * /* -> representation */ 1123 pchar(ch) 1124 int ch; 1125 { 1126 static char pbuf[10]; 1127 1128 if (isprint(ch) || ch == ' ') 1129 (void)snprintf(pbuf, sizeof pbuf, "%c", ch); 1130 else 1131 (void)snprintf(pbuf, sizeof pbuf, "\\%o", ch); 1132 return(pbuf); 1133 } 1134 #endif 1135 #endif 1136 1137 #undef matcher 1138 #undef fast 1139 #undef slow 1140 #undef dissect 1141 #undef backref 1142 #undef step 1143 #undef print 1144 #undef at 1145 #undef match 1146