1 /* $NetBSD: engine.c,v 1.13 2001/12/17 16:32:49 christos Exp $ */ 2 3 /*- 4 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 5 * Copyright (c) 1992, 1993, 1994 6 * The Regents of the University of California. All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * Henry Spencer. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. All advertising materials mentioning features or use of this software 20 * must display the following acknowledgement: 21 * This product includes software developed by the University of 22 * California, Berkeley and its contributors. 23 * 4. Neither the name of the University nor the names of its contributors 24 * may be used to endorse or promote products derived from this software 25 * without specific prior written permission. 26 * 27 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 30 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 37 * SUCH DAMAGE. 38 * 39 * @(#)engine.c 8.5 (Berkeley) 3/20/94 40 */ 41 42 /* 43 * The matching engine and friends. This file is #included by regexec.c 44 * after suitable #defines of a variety of macros used herein, so that 45 * different state representations can be used without duplicating masses 46 * of code. 47 */ 48 49 #ifdef SNAMES 50 #define matcher smatcher 51 #define fast sfast 52 #define slow sslow 53 #define dissect sdissect 54 #define backref sbackref 55 #define step sstep 56 #define print sprint 57 #define at sat 58 #define match smat 59 #endif 60 #ifdef LNAMES 61 #define matcher lmatcher 62 #define fast lfast 63 #define slow lslow 64 #define dissect ldissect 65 #define backref lbackref 66 #define step lstep 67 #define print lprint 68 #define at lat 69 #define match lmat 70 #endif 71 72 /* another structure passed up and down to avoid zillions of parameters */ 73 struct match { 74 struct re_guts *g; 75 int eflags; 76 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ 77 char *offp; /* offsets work from here */ 78 char *beginp; /* start of string -- virtual NUL precedes */ 79 char *endp; /* end of string -- virtual NUL here */ 80 char *coldp; /* can be no match starting before here */ 81 char **lastpos; /* [nplus+1] */ 82 STATEVARS; 83 states st; /* current states */ 84 states fresh; /* states for a fresh start */ 85 states tmp; /* temporary */ 86 states empty; /* empty set of states */ 87 }; 88 89 /* ========= begin header generated by ./mkh ========= */ 90 #ifdef __cplusplus 91 extern "C" { 92 #endif 93 94 /* === engine.c === */ 95 static int matcher __P((struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], int eflags)); 96 static char *dissect __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst)); 97 static char *backref __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst, sopno lev)); 98 static char *fast __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst)); 99 static char *slow __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst)); 100 static states step __P((struct re_guts *g, sopno start, sopno stop, states bef, int ch, states aft)); 101 #define BOL (OUT+1) 102 #define EOL (BOL+1) 103 #define BOLEOL (BOL+2) 104 #define NOTHING (BOL+3) 105 #define BOW (BOL+4) 106 #define EOW (BOL+5) 107 #define CODEMAX (BOL+5) /* highest code used */ 108 #define NONCHAR(c) ((c) > CHAR_MAX) 109 #define NNONCHAR (CODEMAX-CHAR_MAX) 110 #ifdef REDEBUG 111 static void print __P((struct match *m, char *caption, states st, int ch, FILE *d)); 112 #endif 113 #ifdef REDEBUG 114 static void at __P((struct match *m, char *title, char *start, char *stop, sopno startst, sopno stopst)); 115 #endif 116 #ifdef REDEBUG 117 static char *pchar __P((int ch)); 118 #endif 119 120 #ifdef __cplusplus 121 } 122 #endif 123 /* ========= end header generated by ./mkh ========= */ 124 125 #ifdef REDEBUG 126 #define SP(t, s, c) print(m, t, s, c, stdout) 127 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) 128 #define NOTE(str) { if (m->eflags®_TRACE) printf("=%s\n", (str)); } 129 #else 130 #define SP(t, s, c) /* nothing */ 131 #define AT(t, p1, p2, s1, s2) /* nothing */ 132 #define NOTE(s) /* nothing */ 133 #endif 134 135 /* 136 - matcher - the actual matching engine 137 == static int matcher(struct re_guts *g, char *string, \ 138 == size_t nmatch, regmatch_t pmatch[], int eflags); 139 */ 140 static int /* 0 success, REG_NOMATCH failure */ 141 matcher(g, string, nmatch, pmatch, eflags) 142 struct re_guts *g; 143 char *string; 144 size_t nmatch; 145 regmatch_t pmatch[]; 146 int eflags; 147 { 148 char *endp; 149 int i; 150 struct match mv; 151 struct match *m = &mv; 152 char *dp; 153 const sopno gf = g->firststate+1; /* +1 for OEND */ 154 const sopno gl = g->laststate; 155 char *start; 156 char *stop; 157 int error = 0; 158 159 _DIAGASSERT(g != NULL); 160 _DIAGASSERT(string != NULL); 161 /* pmatch checked below */ 162 163 /* simplify the situation where possible */ 164 if (g->cflags®_NOSUB) 165 nmatch = 0; 166 if (eflags®_STARTEND) { 167 _DIAGASSERT(pmatch != NULL); 168 start = string + (size_t)pmatch[0].rm_so; 169 stop = string + (size_t)pmatch[0].rm_eo; 170 } else { 171 start = string; 172 stop = start + strlen(start); 173 } 174 if (stop < start) 175 return(REG_INVARG); 176 177 /* prescreening; this does wonders for this rather slow code */ 178 if (g->must != NULL) { 179 for (dp = start; dp < stop; dp++) 180 if (*dp == g->must[0] && stop - dp >= g->mlen && 181 memcmp(dp, g->must, (size_t)g->mlen) == 0) 182 break; 183 if (dp == stop) /* we didn't find g->must */ 184 return(REG_NOMATCH); 185 } 186 187 /* match struct setup */ 188 m->g = g; 189 m->eflags = eflags; 190 m->pmatch = NULL; 191 m->lastpos = NULL; 192 m->offp = string; 193 m->beginp = start; 194 m->endp = stop; 195 STATESETUP(m, 4); 196 SETUP(m->st); 197 SETUP(m->fresh); 198 SETUP(m->tmp); 199 SETUP(m->empty); 200 CLEAR(m->empty); 201 202 /* this loop does only one repetition except for backrefs */ 203 for (;;) { 204 endp = fast(m, start, stop, gf, gl); 205 if (endp == NULL) { /* a miss */ 206 error = REG_NOMATCH; 207 goto done; 208 } 209 if (nmatch == 0 && !g->backrefs) 210 break; /* no further info needed */ 211 212 /* where? */ 213 assert(m->coldp != NULL); 214 for (;;) { 215 NOTE("finding start"); 216 endp = slow(m, m->coldp, stop, gf, gl); 217 if (endp != NULL) 218 break; 219 assert(m->coldp < m->endp); 220 m->coldp++; 221 } 222 if (nmatch == 1 && !g->backrefs) 223 break; /* no further info needed */ 224 225 /* oh my, he wants the subexpressions... */ 226 if (m->pmatch == NULL) 227 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) * 228 sizeof(regmatch_t)); 229 if (m->pmatch == NULL) { 230 error = REG_ESPACE; 231 goto done; 232 } 233 for (i = 1; i <= m->g->nsub; i++) 234 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = (regoff_t)-1; 235 if (!g->backrefs && !(m->eflags®_BACKR)) { 236 NOTE("dissecting"); 237 dp = dissect(m, m->coldp, endp, gf, gl); 238 } else { 239 if (g->nplus > 0 && m->lastpos == NULL) 240 m->lastpos = (char **)malloc((g->nplus+1) * 241 sizeof(char *)); 242 if (g->nplus > 0 && m->lastpos == NULL) { 243 error = REG_ESPACE; 244 goto done; 245 } 246 NOTE("backref dissect"); 247 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0); 248 } 249 if (dp != NULL) 250 break; 251 252 /* uh-oh... we couldn't find a subexpression-level match */ 253 assert(g->backrefs); /* must be back references doing it */ 254 assert(g->nplus == 0 || m->lastpos != NULL); 255 for (;;) { 256 if (dp != NULL || endp <= m->coldp) 257 break; /* defeat */ 258 NOTE("backoff"); 259 endp = slow(m, m->coldp, endp-1, gf, gl); 260 if (endp == NULL) 261 break; /* defeat */ 262 /* try it on a shorter possibility */ 263 #ifndef NDEBUG 264 for (i = 1; i <= m->g->nsub; i++) { 265 assert(m->pmatch[i].rm_so == (regoff_t)-1); 266 assert(m->pmatch[i].rm_eo == (regoff_t)-1); 267 } 268 #endif 269 NOTE("backoff dissect"); 270 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0); 271 } 272 assert(dp == NULL || dp == endp); 273 if (dp != NULL) /* found a shorter one */ 274 break; 275 276 /* despite initial appearances, there is no match here */ 277 NOTE("false alarm"); 278 start = m->coldp + 1; /* recycle starting later */ 279 assert(start <= stop); 280 } 281 282 /* fill in the details if requested */ 283 if (nmatch > 0) { 284 _DIAGASSERT(pmatch != NULL); 285 pmatch[0].rm_so = m->coldp - m->offp; 286 pmatch[0].rm_eo = endp - m->offp; 287 } 288 if (nmatch > 1) { 289 assert(m->pmatch != NULL); 290 for (i = 1; i < nmatch; i++) 291 if (i <= m->g->nsub) 292 pmatch[i] = m->pmatch[i]; 293 else { 294 pmatch[i].rm_so = (regoff_t)-1; 295 pmatch[i].rm_eo = (regoff_t)-1; 296 } 297 } 298 299 done: 300 if (m->pmatch != NULL) { 301 free(m->pmatch); 302 m->pmatch = NULL; 303 } 304 if (m->lastpos != NULL) { 305 free(m->lastpos); 306 m->lastpos = NULL; 307 } 308 STATETEARDOWN(m); 309 return error; 310 } 311 312 /* 313 - dissect - figure out what matched what, no back references 314 == static char *dissect(struct match *m, char *start, \ 315 == char *stop, sopno startst, sopno stopst); 316 */ 317 static char * /* == stop (success) always */ 318 dissect(m, start, stop, startst, stopst) 319 struct match *m; 320 char *start; 321 char *stop; 322 sopno startst; 323 sopno stopst; 324 { 325 int i; 326 sopno ss; /* start sop of current subRE */ 327 sopno es; /* end sop of current subRE */ 328 char *sp; /* start of string matched by it */ 329 char *stp; /* string matched by it cannot pass here */ 330 char *rest; /* start of rest of string */ 331 char *tail; /* string unmatched by rest of RE */ 332 sopno ssub; /* start sop of subsubRE */ 333 sopno esub; /* end sop of subsubRE */ 334 char *ssp; /* start of string matched by subsubRE */ 335 char *sep; /* end of string matched by subsubRE */ 336 char *oldssp; /* previous ssp */ 337 #ifndef NDEBUG 338 char *dp; 339 #endif 340 341 _DIAGASSERT(m != NULL); 342 _DIAGASSERT(start != NULL); 343 _DIAGASSERT(stop != NULL); 344 345 AT("diss", start, stop, startst, stopst); 346 sp = start; 347 for (ss = startst; ss < stopst; ss = es) { 348 /* identify end of subRE */ 349 es = ss; 350 switch (OP(m->g->strip[es])) { 351 case OPLUS_: 352 case OQUEST_: 353 es += OPND(m->g->strip[es]); 354 break; 355 case OCH_: 356 while (OP(m->g->strip[es]) != O_CH) 357 es += OPND(m->g->strip[es]); 358 break; 359 } 360 es++; 361 362 /* figure out what it matched */ 363 switch (OP(m->g->strip[ss])) { 364 case OEND: 365 assert(nope); 366 break; 367 case OCHAR: 368 sp++; 369 break; 370 case OBOL: 371 case OEOL: 372 case OBOW: 373 case OEOW: 374 break; 375 case OANY: 376 case OANYOF: 377 sp++; 378 break; 379 case OBACK_: 380 case O_BACK: 381 assert(nope); 382 break; 383 /* cases where length of match is hard to find */ 384 case OQUEST_: 385 stp = stop; 386 for (;;) { 387 /* how long could this one be? */ 388 rest = slow(m, sp, stp, ss, es); 389 assert(rest != NULL); /* it did match */ 390 /* could the rest match the rest? */ 391 tail = slow(m, rest, stop, es, stopst); 392 if (tail == stop) 393 break; /* yes! */ 394 /* no -- try a shorter match for this one */ 395 stp = rest - 1; 396 assert(stp >= sp); /* it did work */ 397 } 398 ssub = ss + 1; 399 esub = es - 1; 400 /* did innards match? */ 401 if (slow(m, sp, rest, ssub, esub) != NULL) { 402 #ifdef NDEBUG 403 (void) 404 #else 405 dp = 406 #endif 407 dissect(m, sp, rest, ssub, esub); 408 assert(dp == rest); 409 } else /* no */ 410 assert(sp == rest); 411 sp = rest; 412 break; 413 case OPLUS_: 414 stp = stop; 415 for (;;) { 416 /* how long could this one be? */ 417 rest = slow(m, sp, stp, ss, es); 418 assert(rest != NULL); /* it did match */ 419 /* could the rest match the rest? */ 420 tail = slow(m, rest, stop, es, stopst); 421 if (tail == stop) 422 break; /* yes! */ 423 /* no -- try a shorter match for this one */ 424 stp = rest - 1; 425 assert(stp >= sp); /* it did work */ 426 } 427 ssub = ss + 1; 428 esub = es - 1; 429 ssp = sp; 430 oldssp = ssp; 431 for (;;) { /* find last match of innards */ 432 sep = slow(m, ssp, rest, ssub, esub); 433 if (sep == NULL || sep == ssp) 434 break; /* failed or matched null */ 435 oldssp = ssp; /* on to next try */ 436 ssp = sep; 437 } 438 if (sep == NULL) { 439 /* last successful match */ 440 sep = ssp; 441 ssp = oldssp; 442 } 443 assert(sep == rest); /* must exhaust substring */ 444 assert(slow(m, ssp, sep, ssub, esub) == rest); 445 #ifdef NDEBUG 446 (void) 447 #else 448 dp = 449 #endif 450 dissect(m, ssp, sep, ssub, esub); 451 assert(dp == sep); 452 sp = rest; 453 break; 454 case OCH_: 455 stp = stop; 456 for (;;) { 457 /* how long could this one be? */ 458 rest = slow(m, sp, stp, ss, es); 459 assert(rest != NULL); /* it did match */ 460 /* could the rest match the rest? */ 461 tail = slow(m, rest, stop, es, stopst); 462 if (tail == stop) 463 break; /* yes! */ 464 /* no -- try a shorter match for this one */ 465 stp = rest - 1; 466 assert(stp >= sp); /* it did work */ 467 } 468 ssub = ss + 1; 469 esub = ss + OPND(m->g->strip[ss]) - 1; 470 assert(OP(m->g->strip[esub]) == OOR1); 471 for (;;) { /* find first matching branch */ 472 if (slow(m, sp, rest, ssub, esub) == rest) 473 break; /* it matched all of it */ 474 /* that one missed, try next one */ 475 assert(OP(m->g->strip[esub]) == OOR1); 476 esub++; 477 assert(OP(m->g->strip[esub]) == OOR2); 478 ssub = esub + 1; 479 esub += OPND(m->g->strip[esub]); 480 if (OP(m->g->strip[esub]) == OOR2) 481 esub--; 482 else 483 assert(OP(m->g->strip[esub]) == O_CH); 484 } 485 #ifdef NDEBUG 486 (void) 487 #else 488 dp = 489 #endif 490 dissect(m, sp, rest, ssub, esub); 491 assert(dp == rest); 492 sp = rest; 493 break; 494 case O_PLUS: 495 case O_QUEST: 496 case OOR1: 497 case OOR2: 498 case O_CH: 499 assert(nope); 500 break; 501 case OLPAREN: 502 i = OPND(m->g->strip[ss]); 503 assert(0 < i && i <= m->g->nsub); 504 m->pmatch[i].rm_so = sp - m->offp; 505 break; 506 case ORPAREN: 507 i = OPND(m->g->strip[ss]); 508 assert(0 < i && i <= m->g->nsub); 509 m->pmatch[i].rm_eo = sp - m->offp; 510 break; 511 default: /* uh oh */ 512 assert(nope); 513 break; 514 } 515 } 516 517 assert(sp == stop); 518 return(sp); 519 } 520 521 /* 522 - backref - figure out what matched what, figuring in back references 523 == static char *backref(struct match *m, char *start, \ 524 == char *stop, sopno startst, sopno stopst, sopno lev); 525 */ 526 static char * /* == stop (success) or NULL (failure) */ 527 backref(m, start, stop, startst, stopst, lev) 528 struct match *m; 529 char *start; 530 char *stop; 531 sopno startst; 532 sopno stopst; 533 sopno lev; /* PLUS nesting level */ 534 { 535 int i; 536 sopno ss; /* start sop of current subRE */ 537 char *sp; /* start of string matched by it */ 538 sopno ssub; /* start sop of subsubRE */ 539 sopno esub; /* end sop of subsubRE */ 540 char *ssp; /* start of string matched by subsubRE */ 541 char *dp; 542 size_t len; 543 int hard; 544 sop s; 545 regoff_t offsave; 546 cset *cs; 547 548 _DIAGASSERT(m != NULL); 549 _DIAGASSERT(start != NULL); 550 _DIAGASSERT(stop != NULL); 551 552 AT("back", start, stop, startst, stopst); 553 sp = start; 554 555 /* get as far as we can with easy stuff */ 556 hard = 0; 557 for (ss = startst; !hard && ss < stopst; ss++) 558 switch (OP(s = m->g->strip[ss])) { 559 case OCHAR: 560 if (sp == stop || *sp++ != (char)OPND(s)) 561 return(NULL); 562 break; 563 case OANY: 564 if (sp == stop) 565 return(NULL); 566 sp++; 567 break; 568 case OANYOF: 569 cs = &m->g->sets[OPND(s)]; 570 if (sp == stop || !CHIN(cs, *sp++)) 571 return(NULL); 572 break; 573 case OBOL: 574 if ( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 575 (sp < m->endp && *(sp-1) == '\n' && 576 (m->g->cflags®_NEWLINE)) ) 577 { /* yes */ } 578 else 579 return(NULL); 580 break; 581 case OEOL: 582 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) || 583 (sp < m->endp && *sp == '\n' && 584 (m->g->cflags®_NEWLINE)) ) 585 { /* yes */ } 586 else 587 return(NULL); 588 break; 589 case OBOW: 590 if (( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 591 (sp < m->endp && *(sp-1) == '\n' && 592 (m->g->cflags®_NEWLINE)) || 593 (sp > m->beginp && 594 !ISWORD(*(sp-1))) ) && 595 (sp < m->endp && ISWORD(*sp)) ) 596 { /* yes */ } 597 else 598 return(NULL); 599 break; 600 case OEOW: 601 if (( (sp == m->endp && !(m->eflags®_NOTEOL)) || 602 (sp < m->endp && *sp == '\n' && 603 (m->g->cflags®_NEWLINE)) || 604 (sp < m->endp && !ISWORD(*sp)) ) && 605 (sp > m->beginp && ISWORD(*(sp-1))) ) 606 { /* yes */ } 607 else 608 return(NULL); 609 break; 610 case O_QUEST: 611 break; 612 case OOR1: /* matches null but needs to skip */ 613 ss++; 614 s = m->g->strip[ss]; 615 do { 616 assert(OP(s) == OOR2); 617 ss += OPND(s); 618 } while (OP(s = m->g->strip[ss]) != O_CH); 619 /* note that the ss++ gets us past the O_CH */ 620 break; 621 default: /* have to make a choice */ 622 hard = 1; 623 break; 624 } 625 if (!hard) { /* that was it! */ 626 if (sp != stop) 627 return(NULL); 628 return(sp); 629 } 630 ss--; /* adjust for the for's final increment */ 631 632 /* the hard stuff */ 633 AT("hard", sp, stop, ss, stopst); 634 s = m->g->strip[ss]; 635 switch (OP(s)) { 636 case OBACK_: /* the vilest depths */ 637 i = OPND(s); 638 assert(0 < i && i <= m->g->nsub); 639 if (m->pmatch[i].rm_eo == (regoff_t)-1) 640 return(NULL); 641 assert(m->pmatch[i].rm_so != (regoff_t)-1); 642 len = (size_t)(m->pmatch[i].rm_eo - m->pmatch[i].rm_so); 643 assert(stop - m->beginp >= len); 644 if (sp > stop - len) 645 return(NULL); /* not enough left to match */ 646 ssp = m->offp + (size_t)m->pmatch[i].rm_so; 647 if (memcmp(sp, ssp, len) != 0) 648 return(NULL); 649 while (m->g->strip[ss] != SOP(O_BACK, i)) 650 ss++; 651 return(backref(m, sp+len, stop, ss+1, stopst, lev)); 652 653 case OQUEST_: /* to null or not */ 654 dp = backref(m, sp, stop, ss+1, stopst, lev); 655 if (dp != NULL) 656 return(dp); /* not */ 657 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev)); 658 659 case OPLUS_: 660 assert(m->lastpos != NULL); 661 assert(lev+1 <= m->g->nplus); 662 m->lastpos[lev+1] = sp; 663 return(backref(m, sp, stop, ss+1, stopst, lev+1)); 664 665 case O_PLUS: 666 if (sp == m->lastpos[lev]) /* last pass matched null */ 667 return(backref(m, sp, stop, ss+1, stopst, lev-1)); 668 /* try another pass */ 669 m->lastpos[lev] = sp; 670 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev); 671 if (dp == NULL) 672 dp = backref(m, sp, stop, ss+1, stopst, lev-1); 673 return(dp); 674 675 case OCH_: /* find the right one, if any */ 676 ssub = ss + 1; 677 esub = ss + OPND(s) - 1; 678 assert(OP(m->g->strip[esub]) == OOR1); 679 for (;;) { /* find first matching branch */ 680 dp = backref(m, sp, stop, ssub, esub, lev); 681 if (dp != NULL) 682 return(dp); 683 /* that one missed, try next one */ 684 if (OP(m->g->strip[esub]) == O_CH) 685 return(NULL); /* there is none */ 686 esub++; 687 assert(OP(m->g->strip[esub]) == OOR2); 688 ssub = esub + 1; 689 esub += OPND(m->g->strip[esub]); 690 if (OP(m->g->strip[esub]) == OOR2) 691 esub--; 692 else 693 assert(OP(m->g->strip[esub]) == O_CH); 694 } 695 696 case OLPAREN: /* must undo assignment if rest fails */ 697 i = OPND(s); 698 assert(0 < i && i <= m->g->nsub); 699 offsave = m->pmatch[i].rm_so; 700 m->pmatch[i].rm_so = sp - m->offp; 701 dp = backref(m, sp, stop, ss+1, stopst, lev); 702 if (dp != NULL) 703 return(dp); 704 m->pmatch[i].rm_so = offsave; 705 return(NULL); 706 707 case ORPAREN: /* must undo assignment if rest fails */ 708 i = OPND(s); 709 assert(0 < i && i <= m->g->nsub); 710 offsave = m->pmatch[i].rm_eo; 711 m->pmatch[i].rm_eo = sp - m->offp; 712 dp = backref(m, sp, stop, ss+1, stopst, lev); 713 if (dp != NULL) 714 return(dp); 715 m->pmatch[i].rm_eo = offsave; 716 return(NULL); 717 718 default: /* uh oh */ 719 assert(nope); 720 break; 721 } 722 723 /* "can't happen" */ 724 assert(nope); 725 /* NOTREACHED */ 726 return NULL; 727 } 728 729 /* 730 - fast - step through the string at top speed 731 == static char *fast(struct match *m, char *start, \ 732 == char *stop, sopno startst, sopno stopst); 733 */ 734 static char * /* where tentative match ended, or NULL */ 735 fast(m, start, stop, startst, stopst) 736 struct match *m; 737 char *start; 738 char *stop; 739 sopno startst; 740 sopno stopst; 741 { 742 states st = m->st; 743 states fresh = m->fresh; 744 states tmp = m->tmp; 745 char *p = start; 746 int c = (start == m->beginp) ? OUT : *(start-1); 747 int lastc; /* previous c */ 748 int flagch; 749 int i; 750 char *coldp; /* last p after which no match was underway */ 751 752 _DIAGASSERT(m != NULL); 753 _DIAGASSERT(start != NULL); 754 _DIAGASSERT(stop != NULL); 755 756 CLEAR(st); 757 SET1(st, startst); 758 st = step(m->g, startst, stopst, st, NOTHING, st); 759 ASSIGN(fresh, st); 760 SP("start", st, *p); 761 coldp = NULL; 762 for (;;) { 763 /* next character */ 764 lastc = c; 765 c = (p == m->endp) ? OUT : *p; 766 if (EQ(st, fresh)) 767 coldp = p; 768 769 /* is there an EOL and/or BOL between lastc and c? */ 770 flagch = '\0'; 771 i = 0; 772 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 773 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 774 flagch = BOL; 775 i = m->g->nbol; 776 } 777 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 778 (c == OUT && !(m->eflags®_NOTEOL)) ) { 779 flagch = (flagch == BOL) ? BOLEOL : EOL; 780 i += m->g->neol; 781 } 782 if (i != 0) { 783 for (; i > 0; i--) 784 st = step(m->g, startst, stopst, st, flagch, st); 785 SP("boleol", st, c); 786 } 787 788 /* how about a word boundary? */ 789 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 790 (c != OUT && ISWORD(c)) ) { 791 flagch = BOW; 792 } 793 if ( (lastc != OUT && ISWORD(lastc)) && 794 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 795 flagch = EOW; 796 } 797 if (flagch == BOW || flagch == EOW) { 798 st = step(m->g, startst, stopst, st, flagch, st); 799 SP("boweow", st, c); 800 } 801 802 /* are we done? */ 803 if (ISSET(st, stopst) || p == stop) 804 break; /* NOTE BREAK OUT */ 805 806 /* no, we must deal with this character */ 807 ASSIGN(tmp, st); 808 ASSIGN(st, fresh); 809 assert(c != OUT); 810 st = step(m->g, startst, stopst, tmp, c, st); 811 SP("aft", st, c); 812 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 813 p++; 814 } 815 816 assert(coldp != NULL); 817 m->coldp = coldp; 818 if (ISSET(st, stopst)) 819 return(p+1); 820 else 821 return(NULL); 822 } 823 824 /* 825 - slow - step through the string more deliberately 826 == static char *slow(struct match *m, char *start, \ 827 == char *stop, sopno startst, sopno stopst); 828 */ 829 static char * /* where it ended */ 830 slow(m, start, stop, startst, stopst) 831 struct match *m; 832 char *start; 833 char *stop; 834 sopno startst; 835 sopno stopst; 836 { 837 states st = m->st; 838 states empty = m->empty; 839 states tmp = m->tmp; 840 char *p = start; 841 int c = (start == m->beginp) ? OUT : *(start-1); 842 int lastc; /* previous c */ 843 int flagch; 844 int i; 845 char *matchp; /* last p at which a match ended */ 846 847 _DIAGASSERT(m != NULL); 848 _DIAGASSERT(start != NULL); 849 _DIAGASSERT(stop != NULL); 850 851 AT("slow", start, stop, startst, stopst); 852 CLEAR(st); 853 SET1(st, startst); 854 SP("sstart", st, *p); 855 st = step(m->g, startst, stopst, st, NOTHING, st); 856 matchp = NULL; 857 for (;;) { 858 /* next character */ 859 lastc = c; 860 c = (p == m->endp) ? OUT : *p; 861 862 /* is there an EOL and/or BOL between lastc and c? */ 863 flagch = '\0'; 864 i = 0; 865 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 866 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 867 flagch = BOL; 868 i = m->g->nbol; 869 } 870 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 871 (c == OUT && !(m->eflags®_NOTEOL)) ) { 872 flagch = (flagch == BOL) ? BOLEOL : EOL; 873 i += m->g->neol; 874 } 875 if (i != 0) { 876 for (; i > 0; i--) 877 st = step(m->g, startst, stopst, st, flagch, st); 878 SP("sboleol", st, c); 879 } 880 881 /* how about a word boundary? */ 882 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 883 (c != OUT && ISWORD(c)) ) { 884 flagch = BOW; 885 } 886 if ( (lastc != OUT && ISWORD(lastc)) && 887 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 888 flagch = EOW; 889 } 890 if (flagch == BOW || flagch == EOW) { 891 st = step(m->g, startst, stopst, st, flagch, st); 892 SP("sboweow", st, c); 893 } 894 895 /* are we done? */ 896 if (ISSET(st, stopst)) 897 matchp = p; 898 if (EQ(st, empty) || p == stop) 899 break; /* NOTE BREAK OUT */ 900 901 /* no, we must deal with this character */ 902 ASSIGN(tmp, st); 903 ASSIGN(st, empty); 904 assert(c != OUT); 905 st = step(m->g, startst, stopst, tmp, c, st); 906 SP("saft", st, c); 907 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 908 p++; 909 } 910 911 return(matchp); 912 } 913 914 915 /* 916 - step - map set of states reachable before char to set reachable after 917 == static states step(struct re_guts *g, sopno start, sopno stop, \ 918 == states bef, int ch, states aft); 919 == #define BOL (OUT+1) 920 == #define EOL (BOL+1) 921 == #define BOLEOL (BOL+2) 922 == #define NOTHING (BOL+3) 923 == #define BOW (BOL+4) 924 == #define EOW (BOL+5) 925 == #define CODEMAX (BOL+5) // highest code used 926 == #define NONCHAR(c) ((c) > CHAR_MAX) 927 == #define NNONCHAR (CODEMAX-CHAR_MAX) 928 */ 929 static states 930 step(g, start, stop, bef, ch, aft) 931 struct re_guts *g; 932 sopno start; /* start state within strip */ 933 sopno stop; /* state after stop state within strip */ 934 states bef; /* states reachable before */ 935 int ch; /* character or NONCHAR code */ 936 states aft; /* states already known reachable after */ 937 { 938 cset *cs; 939 sop s; 940 sopno pc; 941 onestate here; /* note, macros know this name */ 942 sopno look; 943 int i; 944 945 _DIAGASSERT(g != NULL); 946 947 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { 948 s = g->strip[pc]; 949 switch (OP(s)) { 950 case OEND: 951 assert(pc == stop-1); 952 break; 953 case OCHAR: 954 /* only characters can match */ 955 assert(!NONCHAR(ch) || ch != (char)OPND(s)); 956 if (ch == (char)OPND(s)) 957 FWD(aft, bef, 1); 958 break; 959 case OBOL: 960 if (ch == BOL || ch == BOLEOL) 961 FWD(aft, bef, 1); 962 break; 963 case OEOL: 964 if (ch == EOL || ch == BOLEOL) 965 FWD(aft, bef, 1); 966 break; 967 case OBOW: 968 if (ch == BOW) 969 FWD(aft, bef, 1); 970 break; 971 case OEOW: 972 if (ch == EOW) 973 FWD(aft, bef, 1); 974 break; 975 case OANY: 976 if (!NONCHAR(ch)) 977 FWD(aft, bef, 1); 978 break; 979 case OANYOF: 980 cs = &g->sets[OPND(s)]; 981 if (!NONCHAR(ch) && CHIN(cs, ch)) 982 FWD(aft, bef, 1); 983 break; 984 case OBACK_: /* ignored here */ 985 case O_BACK: 986 FWD(aft, aft, 1); 987 break; 988 case OPLUS_: /* forward, this is just an empty */ 989 FWD(aft, aft, 1); 990 break; 991 case O_PLUS: /* both forward and back */ 992 FWD(aft, aft, 1); 993 i = ISSETBACK(aft, OPND(s)); 994 BACK(aft, aft, OPND(s)); 995 if (!i && ISSETBACK(aft, OPND(s))) { 996 /* oho, must reconsider loop body */ 997 pc -= OPND(s) + 1; 998 INIT(here, pc); 999 } 1000 break; 1001 case OQUEST_: /* two branches, both forward */ 1002 FWD(aft, aft, 1); 1003 FWD(aft, aft, OPND(s)); 1004 break; 1005 case O_QUEST: /* just an empty */ 1006 FWD(aft, aft, 1); 1007 break; 1008 case OLPAREN: /* not significant here */ 1009 case ORPAREN: 1010 FWD(aft, aft, 1); 1011 break; 1012 case OCH_: /* mark the first two branches */ 1013 FWD(aft, aft, 1); 1014 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 1015 FWD(aft, aft, OPND(s)); 1016 break; 1017 case OOR1: /* done a branch, find the O_CH */ 1018 if (ISSTATEIN(aft, here)) { 1019 for (look = 1; 1020 OP(s = g->strip[pc+look]) != O_CH; 1021 look += OPND(s)) 1022 assert(OP(s) == OOR2); 1023 FWD(aft, aft, look); 1024 } 1025 break; 1026 case OOR2: /* propagate OCH_'s marking */ 1027 FWD(aft, aft, 1); 1028 if (OP(g->strip[pc+OPND(s)]) != O_CH) { 1029 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 1030 FWD(aft, aft, OPND(s)); 1031 } 1032 break; 1033 case O_CH: /* just empty */ 1034 FWD(aft, aft, 1); 1035 break; 1036 default: /* ooooops... */ 1037 assert(nope); 1038 break; 1039 } 1040 } 1041 1042 return(aft); 1043 } 1044 1045 #ifdef REDEBUG 1046 /* 1047 - print - print a set of states 1048 == #ifdef REDEBUG 1049 == static void print(struct match *m, char *caption, states st, \ 1050 == int ch, FILE *d); 1051 == #endif 1052 */ 1053 static void 1054 print(m, caption, st, ch, d) 1055 struct match *m; 1056 char *caption; 1057 states st; 1058 int ch; 1059 FILE *d; 1060 { 1061 struct re_guts *g = m->g; 1062 int i; 1063 int first = 1; 1064 1065 _DIAGASSERT(m != NULL); 1066 _DIAGASSERT(caption != NULL); 1067 1068 if (!(m->eflags®_TRACE)) 1069 return; 1070 1071 _DIAGASSERT(d != NULL); 1072 1073 fprintf(d, "%s", caption); 1074 if (ch != '\0') 1075 fprintf(d, " %s", pchar(ch)); 1076 for (i = 0; i < g->nstates; i++) 1077 if (ISSET(st, i)) { 1078 fprintf(d, "%s%d", (first) ? "\t" : ", ", i); 1079 first = 0; 1080 } 1081 fprintf(d, "\n"); 1082 } 1083 1084 /* 1085 - at - print current situation 1086 == #ifdef REDEBUG 1087 == static void at(struct match *m, char *title, char *start, char *stop, \ 1088 == sopno startst, sopno stopst); 1089 == #endif 1090 */ 1091 static void 1092 at(m, title, start, stop, startst, stopst) 1093 struct match *m; 1094 char *title; 1095 char *start; 1096 char *stop; 1097 sopno startst; 1098 sopno stopst; 1099 { 1100 1101 _DIAGASSERT(m != NULL); 1102 _DIAGASSERT(title != NULL); 1103 _DIAGASSERT(start != NULL); 1104 _DIAGASSERT(stop != NULL); 1105 1106 if (!(m->eflags®_TRACE)) 1107 return; 1108 1109 printf("%s %s-", title, pchar(*start)); 1110 printf("%s ", pchar(*stop)); 1111 printf("%ld-%ld\n", (long)startst, (long)stopst); 1112 } 1113 1114 #ifndef PCHARDONE 1115 #define PCHARDONE /* never again */ 1116 /* 1117 - pchar - make a character printable 1118 == #ifdef REDEBUG 1119 == static char *pchar(int ch); 1120 == #endif 1121 * 1122 * Is this identical to regchar() over in debug.c? Well, yes. But a 1123 * duplicate here avoids having a debugging-capable regexec.o tied to 1124 * a matching debug.o, and this is convenient. It all disappears in 1125 * the non-debug compilation anyway, so it doesn't matter much. 1126 */ 1127 static char * /* -> representation */ 1128 pchar(ch) 1129 int ch; 1130 { 1131 static char pbuf[10]; 1132 1133 if (isprint(ch) || ch == ' ') 1134 (void)snprintf(pbuf, sizeof pbuf, "%c", ch); 1135 else 1136 (void)snprintf(pbuf, sizeof pbuf, "\\%o", ch); 1137 return(pbuf); 1138 } 1139 #endif 1140 #endif 1141 1142 #undef matcher 1143 #undef fast 1144 #undef slow 1145 #undef dissect 1146 #undef backref 1147 #undef step 1148 #undef print 1149 #undef at 1150 #undef match 1151