1 /* $NetBSD: engine.c,v 1.11 1999/01/20 12:58:21 christos Exp $ */ 2 3 /*- 4 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 5 * Copyright (c) 1992, 1993, 1994 6 * The Regents of the University of California. All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * Henry Spencer. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. All advertising materials mentioning features or use of this software 20 * must display the following acknowledgement: 21 * This product includes software developed by the University of 22 * California, Berkeley and its contributors. 23 * 4. Neither the name of the University nor the names of its contributors 24 * may be used to endorse or promote products derived from this software 25 * without specific prior written permission. 26 * 27 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 30 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 37 * SUCH DAMAGE. 38 * 39 * @(#)engine.c 8.5 (Berkeley) 3/20/94 40 */ 41 42 /* 43 * The matching engine and friends. This file is #included by regexec.c 44 * after suitable #defines of a variety of macros used herein, so that 45 * different state representations can be used without duplicating masses 46 * of code. 47 */ 48 49 #ifdef SNAMES 50 #define matcher smatcher 51 #define fast sfast 52 #define slow sslow 53 #define dissect sdissect 54 #define backref sbackref 55 #define step sstep 56 #define print sprint 57 #define at sat 58 #define match smat 59 #endif 60 #ifdef LNAMES 61 #define matcher lmatcher 62 #define fast lfast 63 #define slow lslow 64 #define dissect ldissect 65 #define backref lbackref 66 #define step lstep 67 #define print lprint 68 #define at lat 69 #define match lmat 70 #endif 71 72 /* another structure passed up and down to avoid zillions of parameters */ 73 struct match { 74 struct re_guts *g; 75 int eflags; 76 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ 77 char *offp; /* offsets work from here */ 78 char *beginp; /* start of string -- virtual NUL precedes */ 79 char *endp; /* end of string -- virtual NUL here */ 80 char *coldp; /* can be no match starting before here */ 81 char **lastpos; /* [nplus+1] */ 82 STATEVARS; 83 states st; /* current states */ 84 states fresh; /* states for a fresh start */ 85 states tmp; /* temporary */ 86 states empty; /* empty set of states */ 87 }; 88 89 /* ========= begin header generated by ./mkh ========= */ 90 #ifdef __cplusplus 91 extern "C" { 92 #endif 93 94 /* === engine.c === */ 95 static int matcher __P((struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], int eflags)); 96 static char *dissect __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst)); 97 static char *backref __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst, sopno lev)); 98 static char *fast __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst)); 99 static char *slow __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst)); 100 static states step __P((struct re_guts *g, sopno start, sopno stop, states bef, int ch, states aft)); 101 #define BOL (OUT+1) 102 #define EOL (BOL+1) 103 #define BOLEOL (BOL+2) 104 #define NOTHING (BOL+3) 105 #define BOW (BOL+4) 106 #define EOW (BOL+5) 107 #define CODEMAX (BOL+5) /* highest code used */ 108 #define NONCHAR(c) ((c) > CHAR_MAX) 109 #define NNONCHAR (CODEMAX-CHAR_MAX) 110 #ifdef REDEBUG 111 static void print __P((struct match *m, char *caption, states st, int ch, FILE *d)); 112 #endif 113 #ifdef REDEBUG 114 static void at __P((struct match *m, char *title, char *start, char *stop, sopno startst, sopno stopst)); 115 #endif 116 #ifdef REDEBUG 117 static char *pchar __P((int ch)); 118 #endif 119 120 #ifdef __cplusplus 121 } 122 #endif 123 /* ========= end header generated by ./mkh ========= */ 124 125 #ifdef REDEBUG 126 #define SP(t, s, c) print(m, t, s, c, stdout) 127 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) 128 #define NOTE(str) { if (m->eflags®_TRACE) printf("=%s\n", (str)); } 129 #else 130 #define SP(t, s, c) /* nothing */ 131 #define AT(t, p1, p2, s1, s2) /* nothing */ 132 #define NOTE(s) /* nothing */ 133 #endif 134 135 /* 136 - matcher - the actual matching engine 137 == static int matcher(struct re_guts *g, char *string, \ 138 == size_t nmatch, regmatch_t pmatch[], int eflags); 139 */ 140 static int /* 0 success, REG_NOMATCH failure */ 141 matcher(g, string, nmatch, pmatch, eflags) 142 struct re_guts *g; 143 char *string; 144 size_t nmatch; 145 regmatch_t pmatch[]; 146 int eflags; 147 { 148 char *endp; 149 int i; 150 struct match mv; 151 struct match *m = &mv; 152 char *dp; 153 const sopno gf = g->firststate+1; /* +1 for OEND */ 154 const sopno gl = g->laststate; 155 char *start; 156 char *stop; 157 158 /* simplify the situation where possible */ 159 if (g->cflags®_NOSUB) 160 nmatch = 0; 161 if (eflags®_STARTEND) { 162 start = string + (size_t)pmatch[0].rm_so; 163 stop = string + (size_t)pmatch[0].rm_eo; 164 } else { 165 start = string; 166 stop = start + strlen(start); 167 } 168 if (stop < start) 169 return(REG_INVARG); 170 171 /* prescreening; this does wonders for this rather slow code */ 172 if (g->must != NULL) { 173 for (dp = start; dp < stop; dp++) 174 if (*dp == g->must[0] && stop - dp >= g->mlen && 175 memcmp(dp, g->must, (size_t)g->mlen) == 0) 176 break; 177 if (dp == stop) /* we didn't find g->must */ 178 return(REG_NOMATCH); 179 } 180 181 /* match struct setup */ 182 m->g = g; 183 m->eflags = eflags; 184 m->pmatch = NULL; 185 m->lastpos = NULL; 186 m->offp = string; 187 m->beginp = start; 188 m->endp = stop; 189 STATESETUP(m, 4); 190 SETUP(m->st); 191 SETUP(m->fresh); 192 SETUP(m->tmp); 193 SETUP(m->empty); 194 CLEAR(m->empty); 195 196 /* this loop does only one repetition except for backrefs */ 197 for (;;) { 198 endp = fast(m, start, stop, gf, gl); 199 if (endp == NULL) { /* a miss */ 200 STATETEARDOWN(m); 201 return(REG_NOMATCH); 202 } 203 if (nmatch == 0 && !g->backrefs) 204 break; /* no further info needed */ 205 206 /* where? */ 207 assert(m->coldp != NULL); 208 for (;;) { 209 NOTE("finding start"); 210 endp = slow(m, m->coldp, stop, gf, gl); 211 if (endp != NULL) 212 break; 213 assert(m->coldp < m->endp); 214 m->coldp++; 215 } 216 if (nmatch == 1 && !g->backrefs) 217 break; /* no further info needed */ 218 219 /* oh my, he wants the subexpressions... */ 220 if (m->pmatch == NULL) 221 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) * 222 sizeof(regmatch_t)); 223 if (m->pmatch == NULL) { 224 STATETEARDOWN(m); 225 return(REG_ESPACE); 226 } 227 for (i = 1; i <= m->g->nsub; i++) 228 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = (regoff_t)-1; 229 if (!g->backrefs && !(m->eflags®_BACKR)) { 230 NOTE("dissecting"); 231 dp = dissect(m, m->coldp, endp, gf, gl); 232 } else { 233 if (g->nplus > 0 && m->lastpos == NULL) 234 m->lastpos = (char **)malloc((g->nplus+1) * 235 sizeof(char *)); 236 if (g->nplus > 0 && m->lastpos == NULL) { 237 free(m->pmatch); 238 STATETEARDOWN(m); 239 return(REG_ESPACE); 240 } 241 NOTE("backref dissect"); 242 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0); 243 } 244 if (dp != NULL) 245 break; 246 247 /* uh-oh... we couldn't find a subexpression-level match */ 248 assert(g->backrefs); /* must be back references doing it */ 249 assert(g->nplus == 0 || m->lastpos != NULL); 250 for (;;) { 251 if (dp != NULL || endp <= m->coldp) 252 break; /* defeat */ 253 NOTE("backoff"); 254 endp = slow(m, m->coldp, endp-1, gf, gl); 255 if (endp == NULL) 256 break; /* defeat */ 257 /* try it on a shorter possibility */ 258 #ifndef NDEBUG 259 for (i = 1; i <= m->g->nsub; i++) { 260 assert(m->pmatch[i].rm_so == (regoff_t)-1); 261 assert(m->pmatch[i].rm_eo == (regoff_t)-1); 262 } 263 #endif 264 NOTE("backoff dissect"); 265 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0); 266 } 267 assert(dp == NULL || dp == endp); 268 if (dp != NULL) /* found a shorter one */ 269 break; 270 271 /* despite initial appearances, there is no match here */ 272 NOTE("false alarm"); 273 start = m->coldp + 1; /* recycle starting later */ 274 assert(start <= stop); 275 } 276 277 /* fill in the details if requested */ 278 if (nmatch > 0) { 279 pmatch[0].rm_so = m->coldp - m->offp; 280 pmatch[0].rm_eo = endp - m->offp; 281 } 282 if (nmatch > 1) { 283 assert(m->pmatch != NULL); 284 for (i = 1; i < nmatch; i++) 285 if (i <= m->g->nsub) 286 pmatch[i] = m->pmatch[i]; 287 else { 288 pmatch[i].rm_so = (regoff_t)-1; 289 pmatch[i].rm_eo = (regoff_t)-1; 290 } 291 } 292 293 if (m->pmatch != NULL) 294 free(m->pmatch); 295 if (m->lastpos != NULL) 296 free(m->lastpos); 297 STATETEARDOWN(m); 298 return(0); 299 } 300 301 /* 302 - dissect - figure out what matched what, no back references 303 == static char *dissect(struct match *m, char *start, \ 304 == char *stop, sopno startst, sopno stopst); 305 */ 306 static char * /* == stop (success) always */ 307 dissect(m, start, stop, startst, stopst) 308 struct match *m; 309 char *start; 310 char *stop; 311 sopno startst; 312 sopno stopst; 313 { 314 int i; 315 sopno ss; /* start sop of current subRE */ 316 sopno es; /* end sop of current subRE */ 317 char *sp; /* start of string matched by it */ 318 char *stp; /* string matched by it cannot pass here */ 319 char *rest; /* start of rest of string */ 320 char *tail; /* string unmatched by rest of RE */ 321 sopno ssub; /* start sop of subsubRE */ 322 sopno esub; /* end sop of subsubRE */ 323 char *ssp; /* start of string matched by subsubRE */ 324 char *sep; /* end of string matched by subsubRE */ 325 char *oldssp; /* previous ssp */ 326 #ifndef NDEBUG 327 char *dp; 328 #endif 329 330 AT("diss", start, stop, startst, stopst); 331 sp = start; 332 for (ss = startst; ss < stopst; ss = es) { 333 /* identify end of subRE */ 334 es = ss; 335 switch (OP(m->g->strip[es])) { 336 case OPLUS_: 337 case OQUEST_: 338 es += OPND(m->g->strip[es]); 339 break; 340 case OCH_: 341 while (OP(m->g->strip[es]) != O_CH) 342 es += OPND(m->g->strip[es]); 343 break; 344 } 345 es++; 346 347 /* figure out what it matched */ 348 switch (OP(m->g->strip[ss])) { 349 case OEND: 350 assert(nope); 351 break; 352 case OCHAR: 353 sp++; 354 break; 355 case OBOL: 356 case OEOL: 357 case OBOW: 358 case OEOW: 359 break; 360 case OANY: 361 case OANYOF: 362 sp++; 363 break; 364 case OBACK_: 365 case O_BACK: 366 assert(nope); 367 break; 368 /* cases where length of match is hard to find */ 369 case OQUEST_: 370 stp = stop; 371 for (;;) { 372 /* how long could this one be? */ 373 rest = slow(m, sp, stp, ss, es); 374 assert(rest != NULL); /* it did match */ 375 /* could the rest match the rest? */ 376 tail = slow(m, rest, stop, es, stopst); 377 if (tail == stop) 378 break; /* yes! */ 379 /* no -- try a shorter match for this one */ 380 stp = rest - 1; 381 assert(stp >= sp); /* it did work */ 382 } 383 ssub = ss + 1; 384 esub = es - 1; 385 /* did innards match? */ 386 if (slow(m, sp, rest, ssub, esub) != NULL) { 387 #ifdef NDEBUG 388 (void) 389 #else 390 dp = 391 #endif 392 dissect(m, sp, rest, ssub, esub); 393 assert(dp == rest); 394 } else /* no */ 395 assert(sp == rest); 396 sp = rest; 397 break; 398 case OPLUS_: 399 stp = stop; 400 for (;;) { 401 /* how long could this one be? */ 402 rest = slow(m, sp, stp, ss, es); 403 assert(rest != NULL); /* it did match */ 404 /* could the rest match the rest? */ 405 tail = slow(m, rest, stop, es, stopst); 406 if (tail == stop) 407 break; /* yes! */ 408 /* no -- try a shorter match for this one */ 409 stp = rest - 1; 410 assert(stp >= sp); /* it did work */ 411 } 412 ssub = ss + 1; 413 esub = es - 1; 414 ssp = sp; 415 oldssp = ssp; 416 for (;;) { /* find last match of innards */ 417 sep = slow(m, ssp, rest, ssub, esub); 418 if (sep == NULL || sep == ssp) 419 break; /* failed or matched null */ 420 oldssp = ssp; /* on to next try */ 421 ssp = sep; 422 } 423 if (sep == NULL) { 424 /* last successful match */ 425 sep = ssp; 426 ssp = oldssp; 427 } 428 assert(sep == rest); /* must exhaust substring */ 429 assert(slow(m, ssp, sep, ssub, esub) == rest); 430 #ifdef NDEBUG 431 (void) 432 #else 433 dp = 434 #endif 435 dissect(m, ssp, sep, ssub, esub); 436 assert(dp == sep); 437 sp = rest; 438 break; 439 case OCH_: 440 stp = stop; 441 for (;;) { 442 /* how long could this one be? */ 443 rest = slow(m, sp, stp, ss, es); 444 assert(rest != NULL); /* it did match */ 445 /* could the rest match the rest? */ 446 tail = slow(m, rest, stop, es, stopst); 447 if (tail == stop) 448 break; /* yes! */ 449 /* no -- try a shorter match for this one */ 450 stp = rest - 1; 451 assert(stp >= sp); /* it did work */ 452 } 453 ssub = ss + 1; 454 esub = ss + OPND(m->g->strip[ss]) - 1; 455 assert(OP(m->g->strip[esub]) == OOR1); 456 for (;;) { /* find first matching branch */ 457 if (slow(m, sp, rest, ssub, esub) == rest) 458 break; /* it matched all of it */ 459 /* that one missed, try next one */ 460 assert(OP(m->g->strip[esub]) == OOR1); 461 esub++; 462 assert(OP(m->g->strip[esub]) == OOR2); 463 ssub = esub + 1; 464 esub += OPND(m->g->strip[esub]); 465 if (OP(m->g->strip[esub]) == OOR2) 466 esub--; 467 else 468 assert(OP(m->g->strip[esub]) == O_CH); 469 } 470 #ifdef NDEBUG 471 (void) 472 #else 473 dp = 474 #endif 475 dissect(m, sp, rest, ssub, esub); 476 assert(dp == rest); 477 sp = rest; 478 break; 479 case O_PLUS: 480 case O_QUEST: 481 case OOR1: 482 case OOR2: 483 case O_CH: 484 assert(nope); 485 break; 486 case OLPAREN: 487 i = OPND(m->g->strip[ss]); 488 assert(0 < i && i <= m->g->nsub); 489 m->pmatch[i].rm_so = sp - m->offp; 490 break; 491 case ORPAREN: 492 i = OPND(m->g->strip[ss]); 493 assert(0 < i && i <= m->g->nsub); 494 m->pmatch[i].rm_eo = sp - m->offp; 495 break; 496 default: /* uh oh */ 497 assert(nope); 498 break; 499 } 500 } 501 502 assert(sp == stop); 503 return(sp); 504 } 505 506 /* 507 - backref - figure out what matched what, figuring in back references 508 == static char *backref(struct match *m, char *start, \ 509 == char *stop, sopno startst, sopno stopst, sopno lev); 510 */ 511 static char * /* == stop (success) or NULL (failure) */ 512 backref(m, start, stop, startst, stopst, lev) 513 struct match *m; 514 char *start; 515 char *stop; 516 sopno startst; 517 sopno stopst; 518 sopno lev; /* PLUS nesting level */ 519 { 520 int i; 521 sopno ss; /* start sop of current subRE */ 522 char *sp; /* start of string matched by it */ 523 sopno ssub; /* start sop of subsubRE */ 524 sopno esub; /* end sop of subsubRE */ 525 char *ssp; /* start of string matched by subsubRE */ 526 char *dp; 527 size_t len; 528 int hard; 529 sop s; 530 regoff_t offsave; 531 cset *cs; 532 533 AT("back", start, stop, startst, stopst); 534 sp = start; 535 536 /* get as far as we can with easy stuff */ 537 hard = 0; 538 for (ss = startst; !hard && ss < stopst; ss++) 539 switch (OP(s = m->g->strip[ss])) { 540 case OCHAR: 541 if (sp == stop || *sp++ != (char)OPND(s)) 542 return(NULL); 543 break; 544 case OANY: 545 if (sp == stop) 546 return(NULL); 547 sp++; 548 break; 549 case OANYOF: 550 cs = &m->g->sets[OPND(s)]; 551 if (sp == stop || !CHIN(cs, *sp++)) 552 return(NULL); 553 break; 554 case OBOL: 555 if ( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 556 (sp < m->endp && *(sp-1) == '\n' && 557 (m->g->cflags®_NEWLINE)) ) 558 { /* yes */ } 559 else 560 return(NULL); 561 break; 562 case OEOL: 563 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) || 564 (sp < m->endp && *sp == '\n' && 565 (m->g->cflags®_NEWLINE)) ) 566 { /* yes */ } 567 else 568 return(NULL); 569 break; 570 case OBOW: 571 if (( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 572 (sp < m->endp && *(sp-1) == '\n' && 573 (m->g->cflags®_NEWLINE)) || 574 (sp > m->beginp && 575 !ISWORD(*(sp-1))) ) && 576 (sp < m->endp && ISWORD(*sp)) ) 577 { /* yes */ } 578 else 579 return(NULL); 580 break; 581 case OEOW: 582 if (( (sp == m->endp && !(m->eflags®_NOTEOL)) || 583 (sp < m->endp && *sp == '\n' && 584 (m->g->cflags®_NEWLINE)) || 585 (sp < m->endp && !ISWORD(*sp)) ) && 586 (sp > m->beginp && ISWORD(*(sp-1))) ) 587 { /* yes */ } 588 else 589 return(NULL); 590 break; 591 case O_QUEST: 592 break; 593 case OOR1: /* matches null but needs to skip */ 594 ss++; 595 s = m->g->strip[ss]; 596 do { 597 assert(OP(s) == OOR2); 598 ss += OPND(s); 599 } while (OP(s = m->g->strip[ss]) != O_CH); 600 /* note that the ss++ gets us past the O_CH */ 601 break; 602 default: /* have to make a choice */ 603 hard = 1; 604 break; 605 } 606 if (!hard) { /* that was it! */ 607 if (sp != stop) 608 return(NULL); 609 return(sp); 610 } 611 ss--; /* adjust for the for's final increment */ 612 613 /* the hard stuff */ 614 AT("hard", sp, stop, ss, stopst); 615 s = m->g->strip[ss]; 616 switch (OP(s)) { 617 case OBACK_: /* the vilest depths */ 618 i = OPND(s); 619 assert(0 < i && i <= m->g->nsub); 620 if (m->pmatch[i].rm_eo == (regoff_t)-1) 621 return(NULL); 622 assert(m->pmatch[i].rm_so != (regoff_t)-1); 623 len = (size_t)(m->pmatch[i].rm_eo - m->pmatch[i].rm_so); 624 assert(stop - m->beginp >= len); 625 if (sp > stop - len) 626 return(NULL); /* not enough left to match */ 627 ssp = m->offp + (size_t)m->pmatch[i].rm_so; 628 if (memcmp(sp, ssp, len) != 0) 629 return(NULL); 630 while (m->g->strip[ss] != SOP(O_BACK, i)) 631 ss++; 632 return(backref(m, sp+len, stop, ss+1, stopst, lev)); 633 634 case OQUEST_: /* to null or not */ 635 dp = backref(m, sp, stop, ss+1, stopst, lev); 636 if (dp != NULL) 637 return(dp); /* not */ 638 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev)); 639 640 case OPLUS_: 641 assert(m->lastpos != NULL); 642 assert(lev+1 <= m->g->nplus); 643 m->lastpos[lev+1] = sp; 644 return(backref(m, sp, stop, ss+1, stopst, lev+1)); 645 646 case O_PLUS: 647 if (sp == m->lastpos[lev]) /* last pass matched null */ 648 return(backref(m, sp, stop, ss+1, stopst, lev-1)); 649 /* try another pass */ 650 m->lastpos[lev] = sp; 651 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev); 652 if (dp == NULL) 653 dp = backref(m, sp, stop, ss+1, stopst, lev-1); 654 return(dp); 655 656 case OCH_: /* find the right one, if any */ 657 ssub = ss + 1; 658 esub = ss + OPND(s) - 1; 659 assert(OP(m->g->strip[esub]) == OOR1); 660 for (;;) { /* find first matching branch */ 661 dp = backref(m, sp, stop, ssub, esub, lev); 662 if (dp != NULL) 663 return(dp); 664 /* that one missed, try next one */ 665 if (OP(m->g->strip[esub]) == O_CH) 666 return(NULL); /* there is none */ 667 esub++; 668 assert(OP(m->g->strip[esub]) == OOR2); 669 ssub = esub + 1; 670 esub += OPND(m->g->strip[esub]); 671 if (OP(m->g->strip[esub]) == OOR2) 672 esub--; 673 else 674 assert(OP(m->g->strip[esub]) == O_CH); 675 } 676 677 case OLPAREN: /* must undo assignment if rest fails */ 678 i = OPND(s); 679 assert(0 < i && i <= m->g->nsub); 680 offsave = m->pmatch[i].rm_so; 681 m->pmatch[i].rm_so = sp - m->offp; 682 dp = backref(m, sp, stop, ss+1, stopst, lev); 683 if (dp != NULL) 684 return(dp); 685 m->pmatch[i].rm_so = offsave; 686 return(NULL); 687 688 case ORPAREN: /* must undo assignment if rest fails */ 689 i = OPND(s); 690 assert(0 < i && i <= m->g->nsub); 691 offsave = m->pmatch[i].rm_eo; 692 m->pmatch[i].rm_eo = sp - m->offp; 693 dp = backref(m, sp, stop, ss+1, stopst, lev); 694 if (dp != NULL) 695 return(dp); 696 m->pmatch[i].rm_eo = offsave; 697 return(NULL); 698 699 default: /* uh oh */ 700 assert(nope); 701 break; 702 } 703 704 /* "can't happen" */ 705 assert(nope); 706 /* NOTREACHED */ 707 return NULL; 708 } 709 710 /* 711 - fast - step through the string at top speed 712 == static char *fast(struct match *m, char *start, \ 713 == char *stop, sopno startst, sopno stopst); 714 */ 715 static char * /* where tentative match ended, or NULL */ 716 fast(m, start, stop, startst, stopst) 717 struct match *m; 718 char *start; 719 char *stop; 720 sopno startst; 721 sopno stopst; 722 { 723 states st = m->st; 724 states fresh = m->fresh; 725 states tmp = m->tmp; 726 char *p = start; 727 int c = (start == m->beginp) ? OUT : *(start-1); 728 int lastc; /* previous c */ 729 int flagch; 730 int i; 731 char *coldp; /* last p after which no match was underway */ 732 733 CLEAR(st); 734 SET1(st, startst); 735 st = step(m->g, startst, stopst, st, NOTHING, st); 736 ASSIGN(fresh, st); 737 SP("start", st, *p); 738 coldp = NULL; 739 for (;;) { 740 /* next character */ 741 lastc = c; 742 c = (p == m->endp) ? OUT : *p; 743 if (EQ(st, fresh)) 744 coldp = p; 745 746 /* is there an EOL and/or BOL between lastc and c? */ 747 flagch = '\0'; 748 i = 0; 749 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 750 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 751 flagch = BOL; 752 i = m->g->nbol; 753 } 754 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 755 (c == OUT && !(m->eflags®_NOTEOL)) ) { 756 flagch = (flagch == BOL) ? BOLEOL : EOL; 757 i += m->g->neol; 758 } 759 if (i != 0) { 760 for (; i > 0; i--) 761 st = step(m->g, startst, stopst, st, flagch, st); 762 SP("boleol", st, c); 763 } 764 765 /* how about a word boundary? */ 766 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 767 (c != OUT && ISWORD(c)) ) { 768 flagch = BOW; 769 } 770 if ( (lastc != OUT && ISWORD(lastc)) && 771 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 772 flagch = EOW; 773 } 774 if (flagch == BOW || flagch == EOW) { 775 st = step(m->g, startst, stopst, st, flagch, st); 776 SP("boweow", st, c); 777 } 778 779 /* are we done? */ 780 if (ISSET(st, stopst) || p == stop) 781 break; /* NOTE BREAK OUT */ 782 783 /* no, we must deal with this character */ 784 ASSIGN(tmp, st); 785 ASSIGN(st, fresh); 786 assert(c != OUT); 787 st = step(m->g, startst, stopst, tmp, c, st); 788 SP("aft", st, c); 789 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 790 p++; 791 } 792 793 assert(coldp != NULL); 794 m->coldp = coldp; 795 if (ISSET(st, stopst)) 796 return(p+1); 797 else 798 return(NULL); 799 } 800 801 /* 802 - slow - step through the string more deliberately 803 == static char *slow(struct match *m, char *start, \ 804 == char *stop, sopno startst, sopno stopst); 805 */ 806 static char * /* where it ended */ 807 slow(m, start, stop, startst, stopst) 808 struct match *m; 809 char *start; 810 char *stop; 811 sopno startst; 812 sopno stopst; 813 { 814 states st = m->st; 815 states empty = m->empty; 816 states tmp = m->tmp; 817 char *p = start; 818 int c = (start == m->beginp) ? OUT : *(start-1); 819 int lastc; /* previous c */ 820 int flagch; 821 int i; 822 char *matchp; /* last p at which a match ended */ 823 824 AT("slow", start, stop, startst, stopst); 825 CLEAR(st); 826 SET1(st, startst); 827 SP("sstart", st, *p); 828 st = step(m->g, startst, stopst, st, NOTHING, st); 829 matchp = NULL; 830 for (;;) { 831 /* next character */ 832 lastc = c; 833 c = (p == m->endp) ? OUT : *p; 834 835 /* is there an EOL and/or BOL between lastc and c? */ 836 flagch = '\0'; 837 i = 0; 838 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 839 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 840 flagch = BOL; 841 i = m->g->nbol; 842 } 843 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 844 (c == OUT && !(m->eflags®_NOTEOL)) ) { 845 flagch = (flagch == BOL) ? BOLEOL : EOL; 846 i += m->g->neol; 847 } 848 if (i != 0) { 849 for (; i > 0; i--) 850 st = step(m->g, startst, stopst, st, flagch, st); 851 SP("sboleol", st, c); 852 } 853 854 /* how about a word boundary? */ 855 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 856 (c != OUT && ISWORD(c)) ) { 857 flagch = BOW; 858 } 859 if ( (lastc != OUT && ISWORD(lastc)) && 860 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 861 flagch = EOW; 862 } 863 if (flagch == BOW || flagch == EOW) { 864 st = step(m->g, startst, stopst, st, flagch, st); 865 SP("sboweow", st, c); 866 } 867 868 /* are we done? */ 869 if (ISSET(st, stopst)) 870 matchp = p; 871 if (EQ(st, empty) || p == stop) 872 break; /* NOTE BREAK OUT */ 873 874 /* no, we must deal with this character */ 875 ASSIGN(tmp, st); 876 ASSIGN(st, empty); 877 assert(c != OUT); 878 st = step(m->g, startst, stopst, tmp, c, st); 879 SP("saft", st, c); 880 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 881 p++; 882 } 883 884 return(matchp); 885 } 886 887 888 /* 889 - step - map set of states reachable before char to set reachable after 890 == static states step(struct re_guts *g, sopno start, sopno stop, \ 891 == states bef, int ch, states aft); 892 == #define BOL (OUT+1) 893 == #define EOL (BOL+1) 894 == #define BOLEOL (BOL+2) 895 == #define NOTHING (BOL+3) 896 == #define BOW (BOL+4) 897 == #define EOW (BOL+5) 898 == #define CODEMAX (BOL+5) // highest code used 899 == #define NONCHAR(c) ((c) > CHAR_MAX) 900 == #define NNONCHAR (CODEMAX-CHAR_MAX) 901 */ 902 static states 903 step(g, start, stop, bef, ch, aft) 904 struct re_guts *g; 905 sopno start; /* start state within strip */ 906 sopno stop; /* state after stop state within strip */ 907 states bef; /* states reachable before */ 908 int ch; /* character or NONCHAR code */ 909 states aft; /* states already known reachable after */ 910 { 911 cset *cs; 912 sop s; 913 sopno pc; 914 onestate here; /* note, macros know this name */ 915 sopno look; 916 int i; 917 918 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { 919 s = g->strip[pc]; 920 switch (OP(s)) { 921 case OEND: 922 assert(pc == stop-1); 923 break; 924 case OCHAR: 925 /* only characters can match */ 926 assert(!NONCHAR(ch) || ch != (char)OPND(s)); 927 if (ch == (char)OPND(s)) 928 FWD(aft, bef, 1); 929 break; 930 case OBOL: 931 if (ch == BOL || ch == BOLEOL) 932 FWD(aft, bef, 1); 933 break; 934 case OEOL: 935 if (ch == EOL || ch == BOLEOL) 936 FWD(aft, bef, 1); 937 break; 938 case OBOW: 939 if (ch == BOW) 940 FWD(aft, bef, 1); 941 break; 942 case OEOW: 943 if (ch == EOW) 944 FWD(aft, bef, 1); 945 break; 946 case OANY: 947 if (!NONCHAR(ch)) 948 FWD(aft, bef, 1); 949 break; 950 case OANYOF: 951 cs = &g->sets[OPND(s)]; 952 if (!NONCHAR(ch) && CHIN(cs, ch)) 953 FWD(aft, bef, 1); 954 break; 955 case OBACK_: /* ignored here */ 956 case O_BACK: 957 FWD(aft, aft, 1); 958 break; 959 case OPLUS_: /* forward, this is just an empty */ 960 FWD(aft, aft, 1); 961 break; 962 case O_PLUS: /* both forward and back */ 963 FWD(aft, aft, 1); 964 i = ISSETBACK(aft, OPND(s)); 965 BACK(aft, aft, OPND(s)); 966 if (!i && ISSETBACK(aft, OPND(s))) { 967 /* oho, must reconsider loop body */ 968 pc -= OPND(s) + 1; 969 INIT(here, pc); 970 } 971 break; 972 case OQUEST_: /* two branches, both forward */ 973 FWD(aft, aft, 1); 974 FWD(aft, aft, OPND(s)); 975 break; 976 case O_QUEST: /* just an empty */ 977 FWD(aft, aft, 1); 978 break; 979 case OLPAREN: /* not significant here */ 980 case ORPAREN: 981 FWD(aft, aft, 1); 982 break; 983 case OCH_: /* mark the first two branches */ 984 FWD(aft, aft, 1); 985 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 986 FWD(aft, aft, OPND(s)); 987 break; 988 case OOR1: /* done a branch, find the O_CH */ 989 if (ISSTATEIN(aft, here)) { 990 for (look = 1; 991 OP(s = g->strip[pc+look]) != O_CH; 992 look += OPND(s)) 993 assert(OP(s) == OOR2); 994 FWD(aft, aft, look); 995 } 996 break; 997 case OOR2: /* propagate OCH_'s marking */ 998 FWD(aft, aft, 1); 999 if (OP(g->strip[pc+OPND(s)]) != O_CH) { 1000 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 1001 FWD(aft, aft, OPND(s)); 1002 } 1003 break; 1004 case O_CH: /* just empty */ 1005 FWD(aft, aft, 1); 1006 break; 1007 default: /* ooooops... */ 1008 assert(nope); 1009 break; 1010 } 1011 } 1012 1013 return(aft); 1014 } 1015 1016 #ifdef REDEBUG 1017 /* 1018 - print - print a set of states 1019 == #ifdef REDEBUG 1020 == static void print(struct match *m, char *caption, states st, \ 1021 == int ch, FILE *d); 1022 == #endif 1023 */ 1024 static void 1025 print(m, caption, st, ch, d) 1026 struct match *m; 1027 char *caption; 1028 states st; 1029 int ch; 1030 FILE *d; 1031 { 1032 struct re_guts *g = m->g; 1033 int i; 1034 int first = 1; 1035 1036 if (!(m->eflags®_TRACE)) 1037 return; 1038 1039 fprintf(d, "%s", caption); 1040 if (ch != '\0') 1041 fprintf(d, " %s", pchar(ch)); 1042 for (i = 0; i < g->nstates; i++) 1043 if (ISSET(st, i)) { 1044 fprintf(d, "%s%d", (first) ? "\t" : ", ", i); 1045 first = 0; 1046 } 1047 fprintf(d, "\n"); 1048 } 1049 1050 /* 1051 - at - print current situation 1052 == #ifdef REDEBUG 1053 == static void at(struct match *m, char *title, char *start, char *stop, \ 1054 == sopno startst, sopno stopst); 1055 == #endif 1056 */ 1057 static void 1058 at(m, title, start, stop, startst, stopst) 1059 struct match *m; 1060 char *title; 1061 char *start; 1062 char *stop; 1063 sopno startst; 1064 sopno stopst; 1065 { 1066 if (!(m->eflags®_TRACE)) 1067 return; 1068 1069 printf("%s %s-", title, pchar(*start)); 1070 printf("%s ", pchar(*stop)); 1071 printf("%ld-%ld\n", (long)startst, (long)stopst); 1072 } 1073 1074 #ifndef PCHARDONE 1075 #define PCHARDONE /* never again */ 1076 /* 1077 - pchar - make a character printable 1078 == #ifdef REDEBUG 1079 == static char *pchar(int ch); 1080 == #endif 1081 * 1082 * Is this identical to regchar() over in debug.c? Well, yes. But a 1083 * duplicate here avoids having a debugging-capable regexec.o tied to 1084 * a matching debug.o, and this is convenient. It all disappears in 1085 * the non-debug compilation anyway, so it doesn't matter much. 1086 */ 1087 static char * /* -> representation */ 1088 pchar(ch) 1089 int ch; 1090 { 1091 static char pbuf[10]; 1092 1093 if (isprint(ch) || ch == ' ') 1094 (void)snprintf(pbuf, sizeof pbuf, "%c", ch); 1095 else 1096 (void)snprintf(pbuf, sizeof pbuf, "\\%o", ch); 1097 return(pbuf); 1098 } 1099 #endif 1100 #endif 1101 1102 #undef matcher 1103 #undef fast 1104 #undef slow 1105 #undef dissect 1106 #undef backref 1107 #undef step 1108 #undef print 1109 #undef at 1110 #undef match 1111