1 /* $NetBSD: engine.c,v 1.4 2024/11/10 10:54:29 mlelstv Exp $ */ 2 /*- 3 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 4 * Copyright (c) 1992, 1993, 1994 5 * The Regents of the University of California. All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * Henry Spencer of the University of Toronto. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the University of 21 * California, Berkeley and its contributors. 22 * 4. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 * 38 * @(#)engine.c 8.4 (Berkeley) 3/19/94 39 */ 40 41 /* 42 * The matching engine and friends. This file is #included by regexec.c 43 * after suitable #defines of a variety of macros used herein, so that 44 * different state representations can be used without duplicating masses 45 * of code. 46 */ 47 48 #ifdef SNAMES 49 #define matcher smatcher 50 #define fast sfast 51 #define slow sslow 52 #define dissect sdissect 53 #define backref sbackref 54 #define step sstep 55 #define print sprint 56 #define at sat 57 #define match smat 58 #endif 59 #ifdef LNAMES 60 #define matcher lmatcher 61 #define fast lfast 62 #define slow lslow 63 #define dissect ldissect 64 #define backref lbackref 65 #define step lstep 66 #define print lprint 67 #define at lat 68 #define match lmat 69 #endif 70 71 /* another structure passed up and down to avoid zillions of parameters */ 72 struct match { 73 struct re_guts *g; 74 int eflags; 75 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ 76 RCHAR_T *offp; /* offsets work from here */ 77 RCHAR_T *beginp; /* start of string -- virtual NUL precedes */ 78 RCHAR_T *endp; /* end of string -- virtual NUL here */ 79 RCHAR_T *coldp; /* can be no match starting before here */ 80 RCHAR_T **lastpos; /* [nplus+1] */ 81 STATEVARS; 82 states st; /* current states */ 83 states fresh; /* states for a fresh start */ 84 states tmp; /* temporary */ 85 states empty; /* empty set of states */ 86 }; 87 88 /* ========= begin header generated by ./mkh ========= */ 89 #ifdef __cplusplus 90 extern "C" { 91 #endif 92 93 /* === engine.c === */ 94 static int matcher __P((struct re_guts *g, RCHAR_T *string, size_t nmatch, regmatch_t pmatch[], int eflags)); 95 static RCHAR_T *dissect __P((struct match *m, RCHAR_T *start, RCHAR_T *stop, sopno startst, sopno stopst)); 96 static RCHAR_T *backref __P((struct match *m, RCHAR_T *start, RCHAR_T *stop, sopno startst, sopno stopst, sopno lev)); 97 static RCHAR_T *fast __P((struct match *m, RCHAR_T *start, RCHAR_T *stop, sopno startst, sopno stopst)); 98 static RCHAR_T *slow __P((struct match *m, RCHAR_T *start, RCHAR_T *stop, sopno startst, sopno stopst)); 99 static states step __P((struct re_guts *g, sopno start, sopno stop, states bef, int flag, RCHAR_T ch, states aft)); 100 #define BOL (1) 101 #define EOL (BOL+1) 102 #define BOLEOL (BOL+2) 103 #define NOTHING (BOL+3) 104 #define BOW (BOL+4) 105 #define EOW (BOL+5) 106 #ifdef REDEBUG 107 static void print __P((struct match *m, char *caption, states st, int ch, FILE *d)); 108 #endif 109 #ifdef REDEBUG 110 static void at __P((struct match *m, char *title, char *start, char *stop, sopno startst, sopno stopst)); 111 #endif 112 #ifdef REDEBUG 113 static char *pchar __P((int ch)); 114 #endif 115 116 #ifdef __cplusplus 117 } 118 #endif 119 /* ========= end header generated by ./mkh ========= */ 120 121 #ifdef REDEBUG 122 #define SP(t, s, c) print(m, t, s, c, stdout) 123 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) 124 #define NOTE(str) { if (m->eflags®_TRACE) printf("=%s\n", (str)); } 125 #else 126 #define SP(t, s, c) /* nothing */ 127 #define AT(t, p1, p2, s1, s2) /* nothing */ 128 #define NOTE(s) /* nothing */ 129 #endif 130 131 /* 132 - matcher - the actual matching engine 133 == static int matcher(struct re_guts *g, RCHAR_T *string, \ 134 == size_t nmatch, regmatch_t pmatch[], int eflags); 135 */ 136 static int /* 0 success, REG_NOMATCH failure */ 137 matcher(g, string, nmatch, pmatch, eflags) 138 struct re_guts *g; 139 RCHAR_T *string; 140 size_t nmatch; 141 regmatch_t pmatch[]; 142 int eflags; 143 { 144 RCHAR_T *endp; 145 size_t i; 146 struct match mv; 147 struct match *m = &mv; 148 RCHAR_T *dp; 149 const sopno gf = g->firststate+1; /* +1 for OEND */ 150 const sopno gl = g->laststate; 151 RCHAR_T *start; 152 RCHAR_T *stop; 153 RCHAR_T empty[] = { REOF }; 154 155 /* Input can be a NULL pointer, treat like an empty line. */ 156 if (string == NULL) 157 string = empty; 158 159 /* simplify the situation where possible */ 160 if (g->cflags®_NOSUB) 161 nmatch = 0; 162 if (eflags®_STARTEND) { 163 start = string + pmatch[0].rm_so; 164 stop = string + pmatch[0].rm_eo; 165 } else { 166 start = string; 167 stop = start + STRLEN(start); 168 } 169 if (stop < start) 170 return(REG_INVARG); 171 172 /* prescreening; this does wonders for this rather slow code */ 173 if (g->must != NULL) { 174 for (dp = start; dp < stop; dp++) 175 if (*dp == g->must[0] && (size_t)(stop - dp) >= g->mlen && 176 MEMCMP(dp, g->must, g->mlen) == 0) 177 break; 178 if (dp == stop) /* we didn't find g->must */ 179 return(REG_NOMATCH); 180 } 181 182 /* match struct setup */ 183 m->g = g; 184 m->eflags = eflags; 185 m->pmatch = NULL; 186 m->lastpos = NULL; 187 m->offp = string; 188 m->beginp = start; 189 m->endp = stop; 190 STATESETUP(m, 4); 191 SETUP(m->st); 192 SETUP(m->fresh); 193 SETUP(m->tmp); 194 SETUP(m->empty); 195 CLEAR(m->empty); 196 197 /* this loop does only one repetition except for backrefs */ 198 for (;;) { 199 endp = fast(m, start, stop, gf, gl); 200 if (endp == NULL) { /* a miss */ 201 STATETEARDOWN(m); 202 return(REG_NOMATCH); 203 } 204 if (nmatch == 0 && !g->backrefs) 205 break; /* no further info needed */ 206 207 /* where? */ 208 assert(m->coldp != NULL); 209 for (;;) { 210 NOTE("finding start"); 211 endp = slow(m, m->coldp, stop, gf, gl); 212 if (endp != NULL) 213 break; 214 assert(m->coldp < m->endp); 215 m->coldp++; 216 } 217 if (nmatch == 1 && !g->backrefs) 218 break; /* no further info needed */ 219 220 /* oh my, he wants the subexpressions... */ 221 if (m->pmatch == NULL) 222 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) * 223 sizeof(regmatch_t)); 224 if (m->pmatch == NULL) { 225 STATETEARDOWN(m); 226 return(REG_ESPACE); 227 } 228 for (i = 1; i <= m->g->nsub; i++) 229 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; 230 if (!g->backrefs && !(m->eflags®_BACKR)) { 231 NOTE("dissecting"); 232 dp = dissect(m, m->coldp, endp, gf, gl); 233 } else { 234 if (g->nplus > 0 && m->lastpos == NULL) 235 m->lastpos = (RCHAR_T **)malloc((g->nplus+1) * 236 sizeof(RCHAR_T *)); 237 if (g->nplus > 0 && m->lastpos == NULL) { 238 free(m->pmatch); 239 STATETEARDOWN(m); 240 return(REG_ESPACE); 241 } 242 NOTE("backref dissect"); 243 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0); 244 } 245 if (dp != NULL) 246 break; 247 248 /* uh-oh... we couldn't find a subexpression-level match */ 249 assert(g->backrefs); /* must be back references doing it */ 250 assert(g->nplus == 0 || m->lastpos != NULL); 251 for (;;) { 252 if (dp != NULL || endp <= m->coldp) 253 break; /* defeat */ 254 NOTE("backoff"); 255 endp = slow(m, m->coldp, endp-1, gf, gl); 256 if (endp == NULL) 257 break; /* defeat */ 258 /* try it on a shorter possibility */ 259 #ifndef NDEBUG 260 for (i = 1; i <= m->g->nsub; i++) { 261 assert(m->pmatch[i].rm_so == -1); 262 assert(m->pmatch[i].rm_eo == -1); 263 } 264 #endif 265 NOTE("backoff dissect"); 266 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0); 267 } 268 assert(dp == NULL || dp == endp); 269 if (dp != NULL) /* found a shorter one */ 270 break; 271 272 /* despite initial appearances, there is no match here */ 273 NOTE("false alarm"); 274 start = m->coldp + 1; /* recycle starting later */ 275 assert(start <= stop); 276 } 277 278 /* fill in the details if requested */ 279 if (nmatch > 0) { 280 pmatch[0].rm_so = m->coldp - m->offp; 281 pmatch[0].rm_eo = endp - m->offp; 282 } 283 if (nmatch > 1) { 284 assert(m->pmatch != NULL); 285 for (i = 1; i < nmatch; i++) 286 if (i <= m->g->nsub) 287 pmatch[i] = m->pmatch[i]; 288 else { 289 pmatch[i].rm_so = -1; 290 pmatch[i].rm_eo = -1; 291 } 292 } 293 294 if (m->pmatch != NULL) 295 free((char *)m->pmatch); 296 if (m->lastpos != NULL) 297 free((char *)m->lastpos); 298 STATETEARDOWN(m); 299 return(0); 300 } 301 302 /* 303 - dissect - figure out what matched what, no back references 304 == static RCHAR_T *dissect(struct match *m, RCHAR_T *start, \ 305 == RCHAR_T *stop, sopno startst, sopno stopst); 306 */ 307 static RCHAR_T * /* == stop (success) always */ 308 dissect(m, start, stop, startst, stopst) 309 struct match *m; 310 RCHAR_T *start; 311 RCHAR_T *stop; 312 sopno startst; 313 sopno stopst; 314 { 315 int i; 316 sopno ss; /* start sop of current subRE */ 317 sopno es; /* end sop of current subRE */ 318 RCHAR_T *sp; /* start of string matched by it */ 319 RCHAR_T *stp; /* string matched by it cannot pass here */ 320 RCHAR_T *rest; /* start of rest of string */ 321 RCHAR_T *tail; /* string unmatched by rest of RE */ 322 sopno ssub; /* start sop of subsubRE */ 323 sopno esub; /* end sop of subsubRE */ 324 RCHAR_T *ssp; /* start of string matched by subsubRE */ 325 RCHAR_T *sep; /* end of string matched by subsubRE */ 326 RCHAR_T *oldssp; /* previous ssp */ 327 RCHAR_T *dp; 328 329 AT("diss", start, stop, startst, stopst); 330 sp = start; 331 for (ss = startst; ss < stopst; ss = es) { 332 /* identify end of subRE */ 333 es = ss; 334 switch (m->g->strip[es]) { 335 case OPLUS_: 336 case OQUEST_: 337 es += m->g->stripdata[es]; 338 break; 339 case OCH_: 340 while (m->g->strip[es] != O_CH) 341 es += m->g->stripdata[es]; 342 break; 343 } 344 es++; 345 346 /* figure out what it matched */ 347 switch (m->g->strip[ss]) { 348 case OEND: 349 assert(nope); 350 break; 351 case OCHAR: 352 sp++; 353 break; 354 case OBOL: 355 case OEOL: 356 case OBOW: 357 case OEOW: 358 break; 359 case OANY: 360 case OANYOF: 361 sp++; 362 break; 363 case OBACK_: 364 case O_BACK: 365 assert(nope); 366 break; 367 /* cases where length of match is hard to find */ 368 case OQUEST_: 369 stp = stop; 370 for (;;) { 371 /* how long could this one be? */ 372 rest = slow(m, sp, stp, ss, es); 373 assert(rest != NULL); /* it did match */ 374 /* could the rest match the rest? */ 375 tail = slow(m, rest, stop, es, stopst); 376 if (tail == stop) 377 break; /* yes! */ 378 /* no -- try a shorter match for this one */ 379 stp = rest - 1; 380 assert(stp >= sp); /* it did work */ 381 } 382 ssub = ss + 1; 383 esub = es - 1; 384 /* did innards match? */ 385 if (slow(m, sp, rest, ssub, esub) != NULL) { 386 dp = dissect(m, sp, rest, ssub, esub); 387 __USE(dp); 388 assert(dp == rest); 389 } else /* no */ 390 assert(sp == rest); 391 sp = rest; 392 break; 393 case OPLUS_: 394 stp = stop; 395 for (;;) { 396 /* how long could this one be? */ 397 rest = slow(m, sp, stp, ss, es); 398 assert(rest != NULL); /* it did match */ 399 /* could the rest match the rest? */ 400 tail = slow(m, rest, stop, es, stopst); 401 if (tail == stop) 402 break; /* yes! */ 403 /* no -- try a shorter match for this one */ 404 stp = rest - 1; 405 assert(stp >= sp); /* it did work */ 406 } 407 ssub = ss + 1; 408 esub = es - 1; 409 ssp = sp; 410 oldssp = ssp; 411 for (;;) { /* find last match of innards */ 412 sep = slow(m, ssp, rest, ssub, esub); 413 if (sep == NULL || sep == ssp) 414 break; /* failed or matched null */ 415 oldssp = ssp; /* on to next try */ 416 ssp = sep; 417 } 418 if (sep == NULL) { 419 /* last successful match */ 420 sep = ssp; 421 ssp = oldssp; 422 } 423 assert(sep == rest); /* must exhaust substring */ 424 assert(slow(m, ssp, sep, ssub, esub) == rest); 425 dp = dissect(m, ssp, sep, ssub, esub); 426 assert(dp == sep); 427 sp = rest; 428 break; 429 case OCH_: 430 stp = stop; 431 for (;;) { 432 /* how long could this one be? */ 433 rest = slow(m, sp, stp, ss, es); 434 assert(rest != NULL); /* it did match */ 435 /* could the rest match the rest? */ 436 tail = slow(m, rest, stop, es, stopst); 437 if (tail == stop) 438 break; /* yes! */ 439 /* no -- try a shorter match for this one */ 440 stp = rest - 1; 441 assert(stp >= sp); /* it did work */ 442 } 443 ssub = ss + 1; 444 esub = ss + m->g->stripdata[ss] - 1; 445 assert(m->g->strip[esub] == OOR1); 446 for (;;) { /* find first matching branch */ 447 if (slow(m, sp, rest, ssub, esub) == rest) 448 break; /* it matched all of it */ 449 /* that one missed, try next one */ 450 assert(m->g->strip[esub] == OOR1); 451 esub++; 452 assert(m->g->strip[esub] == OOR2); 453 ssub = esub + 1; 454 esub += m->g->stripdata[esub]; 455 if (m->g->strip[esub] == OOR2) 456 esub--; 457 else 458 assert(m->g->strip[esub] == O_CH); 459 } 460 dp = dissect(m, sp, rest, ssub, esub); 461 assert(dp == rest); 462 sp = rest; 463 break; 464 case O_PLUS: 465 case O_QUEST: 466 case OOR1: 467 case OOR2: 468 case O_CH: 469 assert(nope); 470 break; 471 case OLPAREN: 472 i = m->g->stripdata[ss]; 473 assert(0 < i && i <= m->g->nsub); 474 m->pmatch[i].rm_so = sp - m->offp; 475 break; 476 case ORPAREN: 477 i = m->g->stripdata[ss]; 478 assert(0 < i && i <= m->g->nsub); 479 m->pmatch[i].rm_eo = sp - m->offp; 480 break; 481 default: /* uh oh */ 482 assert(nope); 483 break; 484 } 485 } 486 487 assert(sp == stop); 488 return(sp); 489 } 490 491 /* 492 - backref - figure out what matched what, figuring in back references 493 == static RCHAR_T *backref(struct match *m, RCHAR_T *start, \ 494 == RCHAR_T *stop, sopno startst, sopno stopst, sopno lev); 495 */ 496 static RCHAR_T * /* == stop (success) or NULL (failure) */ 497 backref(m, start, stop, startst, stopst, lev) 498 struct match *m; 499 RCHAR_T *start; 500 RCHAR_T *stop; 501 sopno startst; 502 sopno stopst; 503 sopno lev; /* PLUS nesting level */ 504 { 505 int i; 506 sopno ss; /* start sop of current subRE */ 507 RCHAR_T *sp; /* start of string matched by it */ 508 sopno ssub; /* start sop of subsubRE */ 509 sopno esub; /* end sop of subsubRE */ 510 RCHAR_T *ssp; /* start of string matched by subsubRE */ 511 RCHAR_T *dp; 512 size_t len; 513 int hard; 514 sop s; 515 RCHAR_T d; 516 regoff_t offsave; 517 cset *cs; 518 519 AT("back", start, stop, startst, stopst); 520 sp = start; 521 522 /* get as far as we can with easy stuff */ 523 hard = 0; 524 for (ss = startst; !hard && ss < stopst; ss++) { 525 s = m->g->strip[ss]; 526 d = m->g->stripdata[ss]; 527 switch (s) { 528 case OCHAR: 529 if (sp == stop || *sp++ != d) 530 return(NULL); 531 break; 532 case OANY: 533 if (sp == stop) 534 return(NULL); 535 sp++; 536 break; 537 case OANYOF: 538 cs = &m->g->sets[d]; 539 if (sp == stop || !CHIN(cs, *sp++)) 540 return(NULL); 541 break; 542 case OBOL: 543 if ( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 544 (sp < m->endp && *(sp-1) == '\n' && 545 (m->g->cflags®_NEWLINE)) ) 546 { /* yes */ } 547 else 548 return(NULL); 549 break; 550 case OEOL: 551 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) || 552 (sp < m->endp && *sp == '\n' && 553 (m->g->cflags®_NEWLINE)) ) 554 { /* yes */ } 555 else 556 return(NULL); 557 break; 558 case OBOW: 559 if (( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 560 (sp < m->endp && *(sp-1) == '\n' && 561 (m->g->cflags®_NEWLINE)) || 562 (sp > m->beginp && 563 !ISWORD(*(sp-1))) ) && 564 (sp < m->endp && ISWORD(*sp)) ) 565 { /* yes */ } 566 else 567 return(NULL); 568 break; 569 case OEOW: 570 if (( (sp == m->endp && !(m->eflags®_NOTEOL)) || 571 (sp < m->endp && *sp == '\n' && 572 (m->g->cflags®_NEWLINE)) || 573 (sp < m->endp && !ISWORD(*sp)) ) && 574 (sp > m->beginp && ISWORD(*(sp-1))) ) 575 { /* yes */ } 576 else 577 return(NULL); 578 break; 579 case O_QUEST: 580 break; 581 case OOR1: /* matches null but needs to skip */ 582 ss++; 583 s = m->g->strip[ss]; 584 d = m->g->stripdata[ss]; 585 do { 586 assert(s == OOR2); 587 ss += d; 588 s = m->g->strip[ss]; 589 d = m->g->stripdata[ss]; 590 } while (s != O_CH); 591 /* note that the ss++ gets us past the O_CH */ 592 break; 593 default: /* have to make a choice */ 594 hard = 1; 595 break; 596 } 597 } 598 if (!hard) { /* that was it! */ 599 if (sp != stop) 600 return(NULL); 601 return(sp); 602 } 603 ss--; /* adjust for the for's final increment */ 604 605 /* the hard stuff */ 606 AT("hard", sp, stop, ss, stopst); 607 s = m->g->strip[ss]; 608 d = m->g->stripdata[ss]; 609 switch (s) { 610 case OBACK_: /* the vilest depths */ 611 i = d; 612 assert(0 < i && i <= m->g->nsub); 613 if (m->pmatch[i].rm_eo == -1) 614 return(NULL); 615 assert(m->pmatch[i].rm_so != -1); 616 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; 617 assert(stop - m->beginp >= len); 618 if (sp > stop - len) 619 return(NULL); /* not enough left to match */ 620 ssp = m->offp + m->pmatch[i].rm_so; 621 if (memcmp(sp, ssp, len) != 0) 622 return(NULL); 623 while (m->g->strip[ss] != O_BACK || m->g->stripdata[ss] != i) 624 ss++; 625 return(backref(m, sp+len, stop, ss+1, stopst, lev)); 626 break; 627 case OQUEST_: /* to null or not */ 628 dp = backref(m, sp, stop, ss+1, stopst, lev); 629 if (dp != NULL) 630 return(dp); /* not */ 631 return(backref(m, sp, stop, ss+d+1, stopst, lev)); 632 break; 633 case OPLUS_: 634 assert(m->lastpos != NULL); 635 assert(lev+1 <= m->g->nplus); 636 m->lastpos[lev+1] = sp; 637 return(backref(m, sp, stop, ss+1, stopst, lev+1)); 638 break; 639 case O_PLUS: 640 if (sp == m->lastpos[lev]) /* last pass matched null */ 641 return(backref(m, sp, stop, ss+1, stopst, lev-1)); 642 /* try another pass */ 643 m->lastpos[lev] = sp; 644 dp = backref(m, sp, stop, ss-d+1, stopst, lev); 645 if (dp == NULL) 646 return(backref(m, sp, stop, ss+1, stopst, lev-1)); 647 else 648 return(dp); 649 break; 650 case OCH_: /* find the right one, if any */ 651 ssub = ss + 1; 652 esub = ss + d - 1; 653 assert(m->g->strip[esub] == OOR1); 654 for (;;) { /* find first matching branch */ 655 dp = backref(m, sp, stop, ssub, esub, lev); 656 if (dp != NULL) 657 return(dp); 658 /* that one missed, try next one */ 659 if (m->g->strip[esub] == O_CH) 660 return(NULL); /* there is none */ 661 esub++; 662 assert(m->g->strip[esub] == OOR2); 663 ssub = esub + 1; 664 esub += m->g->stripdata[esub]; 665 if (m->g->strip[esub] == OOR2) 666 esub--; 667 else 668 assert(m->g->strip[esub] == O_CH); 669 } 670 break; 671 case OLPAREN: /* must undo assignment if rest fails */ 672 i = d; 673 assert(0 < i && i <= m->g->nsub); 674 offsave = m->pmatch[i].rm_so; 675 m->pmatch[i].rm_so = sp - m->offp; 676 dp = backref(m, sp, stop, ss+1, stopst, lev); 677 if (dp != NULL) 678 return(dp); 679 m->pmatch[i].rm_so = offsave; 680 return(NULL); 681 break; 682 case ORPAREN: /* must undo assignment if rest fails */ 683 i = d; 684 assert(0 < i && i <= m->g->nsub); 685 offsave = m->pmatch[i].rm_eo; 686 m->pmatch[i].rm_eo = sp - m->offp; 687 dp = backref(m, sp, stop, ss+1, stopst, lev); 688 if (dp != NULL) 689 return(dp); 690 m->pmatch[i].rm_eo = offsave; 691 return(NULL); 692 break; 693 default: /* uh oh */ 694 assert(nope); 695 break; 696 } 697 698 /* "can't happen" */ 699 assert(nope); 700 /* NOTREACHED */ 701 return NULL; 702 } 703 704 /* 705 - fast - step through the string at top speed 706 == static RCHAR_T *fast(struct match *m, RCHAR_T *start, \ 707 == RCHAR_T *stop, sopno startst, sopno stopst); 708 */ 709 static RCHAR_T * /* where tentative match ended, or NULL */ 710 fast(m, start, stop, startst, stopst) 711 struct match *m; 712 RCHAR_T *start; 713 RCHAR_T *stop; 714 sopno startst; 715 sopno stopst; 716 { 717 states st = m->st; 718 states fresh = m->fresh; 719 states tmp = m->tmp; 720 RCHAR_T *p = start; 721 RCHAR_T c = (start == m->beginp) ? OUT : *(start-1); 722 RCHAR_T lastc; /* previous c */ 723 int flag; 724 int i; 725 RCHAR_T *coldp; /* last p after which no match was underway */ 726 727 CLEAR(st); 728 SET1(st, startst); 729 st = step(m->g, startst, stopst, st, NOTHING, OUT, st); 730 ASSIGN(fresh, st); 731 SP("start", st, *p); 732 coldp = NULL; 733 for (;;) { 734 /* next character */ 735 lastc = c; 736 c = (p == m->endp) ? OUT : *p; 737 if (EQ(st, fresh)) 738 coldp = p; 739 740 /* is there an EOL and/or BOL between lastc and c? */ 741 flag = 0; 742 i = 0; 743 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 744 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 745 flag = BOL; 746 i = m->g->nbol; 747 } 748 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 749 (c == OUT && !(m->eflags®_NOTEOL)) ) { 750 flag = (flag == BOL) ? BOLEOL : EOL; 751 i += m->g->neol; 752 } 753 if (i != 0) { 754 for (; i > 0; i--) 755 st = step(m->g, startst, stopst, st, flag, OUT, st); 756 SP("boleol", st, c); 757 } 758 759 /* how about a word boundary? */ 760 if ( (flag == BOL || (lastc != OUT && !ISWORD(lastc))) && 761 (c != OUT && ISWORD(c)) ) { 762 flag = BOW; 763 } 764 if ( (lastc != OUT && ISWORD(lastc)) && 765 (flag == EOL || (c != OUT && !ISWORD(c))) ) { 766 flag = EOW; 767 } 768 if (flag == BOW || flag == EOW) { 769 st = step(m->g, startst, stopst, st, flag, OUT, st); 770 SP("boweow", st, c); 771 } 772 773 /* are we done? */ 774 if (ISSET(st, stopst) || p == stop) 775 break; /* NOTE BREAK OUT */ 776 777 /* no, we must deal with this character */ 778 ASSIGN(tmp, st); 779 ASSIGN(st, fresh); 780 assert(c != OUT); 781 st = step(m->g, startst, stopst, tmp, 0, c, st); 782 SP("aft", st, c); 783 assert(EQ(step(m->g, startst, stopst, st, NOTHING, OUT, st), st)); 784 p++; 785 } 786 787 assert(coldp != NULL); 788 m->coldp = coldp; 789 if (ISSET(st, stopst)) 790 return(p+1); 791 else 792 return(NULL); 793 } 794 795 /* 796 - slow - step through the string more deliberately 797 == static RCHAR_T *slow(struct match *m, RCHAR_T *start, \ 798 == RCHAR_T *stop, sopno startst, sopno stopst); 799 */ 800 static RCHAR_T * /* where it ended */ 801 slow(m, start, stop, startst, stopst) 802 struct match *m; 803 RCHAR_T *start; 804 RCHAR_T *stop; 805 sopno startst; 806 sopno stopst; 807 { 808 states st = m->st; 809 states empty = m->empty; 810 states tmp = m->tmp; 811 RCHAR_T *p = start; 812 RCHAR_T c = (start == m->beginp) ? OUT : *(start-1); 813 RCHAR_T lastc; /* previous c */ 814 int flag; 815 int i; 816 RCHAR_T *matchp; /* last p at which a match ended */ 817 818 AT("slow", start, stop, startst, stopst); 819 CLEAR(st); 820 SET1(st, startst); 821 SP("sstart", st, *p); 822 st = step(m->g, startst, stopst, st, NOTHING, OUT, st); 823 matchp = NULL; 824 for (;;) { 825 /* next character */ 826 lastc = c; 827 c = (p == m->endp) ? OUT : *p; 828 829 /* is there an EOL and/or BOL between lastc and c? */ 830 flag = 0; 831 i = 0; 832 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 833 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 834 flag = BOL; 835 i = m->g->nbol; 836 } 837 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 838 (c == OUT && !(m->eflags®_NOTEOL)) ) { 839 flag = (flag == BOL) ? BOLEOL : EOL; 840 i += m->g->neol; 841 } 842 if (i != 0) { 843 for (; i > 0; i--) 844 st = step(m->g, startst, stopst, st, flag, OUT, st); 845 SP("sboleol", st, c); 846 } 847 848 /* how about a word boundary? */ 849 if ( (flag == BOL || (lastc != OUT && !ISWORD(lastc))) && 850 (c != OUT && ISWORD(c)) ) { 851 flag = BOW; 852 } 853 if ( (lastc != OUT && ISWORD(lastc)) && 854 (flag == EOL || (c != OUT && !ISWORD(c))) ) { 855 flag = EOW; 856 } 857 if (flag == BOW || flag == EOW) { 858 st = step(m->g, startst, stopst, st, flag, OUT, st); 859 SP("sboweow", st, c); 860 } 861 862 /* are we done? */ 863 if (ISSET(st, stopst)) 864 matchp = p; 865 if (EQ(st, empty) || p == stop) 866 break; /* NOTE BREAK OUT */ 867 868 /* no, we must deal with this character */ 869 ASSIGN(tmp, st); 870 ASSIGN(st, empty); 871 assert(c != OUT); 872 st = step(m->g, startst, stopst, tmp, 0, c, st); 873 SP("saft", st, c); 874 assert(EQ(step(m->g, startst, stopst, st, NOTHING, OUT, st), st)); 875 p++; 876 } 877 878 return(matchp); 879 } 880 881 882 /* 883 - step - map set of states reachable before char to set reachable after 884 == static states step(struct re_guts *g, sopno start, sopno stop, \ 885 == states bef, int flag, RCHAR_T ch, states aft); 886 == #define BOL (1) 887 == #define EOL (BOL+1) 888 == #define BOLEOL (BOL+2) 889 == #define NOTHING (BOL+3) 890 == #define BOW (BOL+4) 891 == #define EOW (BOL+5) 892 */ 893 static states 894 step(g, start, stop, bef, flag, ch, aft) 895 struct re_guts *g; 896 sopno start; /* start state within strip */ 897 sopno stop; /* state after stop state within strip */ 898 states bef; /* states reachable before */ 899 int flag; /* NONCHAR flag */ 900 RCHAR_T ch; /* character code */ 901 states aft; /* states already known reachable after */ 902 { 903 cset *cs; 904 sop s; 905 RCHAR_T d; 906 sopno pc; 907 onestate here; /* note, macros know this name */ 908 sopno look; 909 int i; 910 911 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { 912 s = g->strip[pc]; 913 d = g->stripdata[pc]; 914 switch (s) { 915 case OEND: 916 assert(pc == stop-1); 917 break; 918 case OCHAR: 919 /* only characters can match */ 920 assert(!flag || ch != d); 921 if (ch == d) 922 FWD(aft, bef, 1); 923 break; 924 case OBOL: 925 if (flag == BOL || flag == BOLEOL) 926 FWD(aft, bef, 1); 927 break; 928 case OEOL: 929 if (flag == EOL || flag == BOLEOL) 930 FWD(aft, bef, 1); 931 break; 932 case OBOW: 933 if (flag == BOW) 934 FWD(aft, bef, 1); 935 break; 936 case OEOW: 937 if (flag == EOW) 938 FWD(aft, bef, 1); 939 break; 940 case OANY: 941 if (!flag) 942 FWD(aft, bef, 1); 943 break; 944 case OANYOF: 945 cs = &g->sets[d]; 946 if (!flag && CHIN(cs, ch)) 947 FWD(aft, bef, 1); 948 break; 949 case OBACK_: /* ignored here */ 950 case O_BACK: 951 FWD(aft, aft, 1); 952 break; 953 case OPLUS_: /* forward, this is just an empty */ 954 FWD(aft, aft, 1); 955 break; 956 case O_PLUS: /* both forward and back */ 957 FWD(aft, aft, 1); 958 i = ISSETBACK(aft, d); 959 BACK(aft, aft, d); 960 if (!i && ISSETBACK(aft, d)) { 961 /* oho, must reconsider loop body */ 962 pc -= d + 1; 963 INIT(here, pc); 964 } 965 break; 966 case OQUEST_: /* two branches, both forward */ 967 FWD(aft, aft, 1); 968 FWD(aft, aft, d); 969 break; 970 case O_QUEST: /* just an empty */ 971 FWD(aft, aft, 1); 972 break; 973 case OLPAREN: /* not significant here */ 974 case ORPAREN: 975 FWD(aft, aft, 1); 976 break; 977 case OCH_: /* mark the first two branches */ 978 FWD(aft, aft, 1); 979 assert(OP(g->strip[pc+d]) == OOR2); 980 FWD(aft, aft, d); 981 break; 982 case OOR1: /* done a branch, find the O_CH */ 983 if (ISSTATEIN(aft, here)) { 984 for (look = 1; /**/; look += d) { 985 s = g->strip[pc+look]; 986 d = g->stripdata[pc+look]; 987 if (s == O_CH) 988 break; 989 assert(s == OOR2); 990 } 991 FWD(aft, aft, look); 992 } 993 break; 994 case OOR2: /* propagate OCH_'s marking */ 995 FWD(aft, aft, 1); 996 if (g->strip[pc+d] != O_CH) { 997 assert(g->strip[pc+d] == OOR2); 998 FWD(aft, aft, d); 999 } 1000 break; 1001 case O_CH: /* just empty */ 1002 FWD(aft, aft, 1); 1003 break; 1004 default: /* ooooops... */ 1005 assert(nope); 1006 break; 1007 } 1008 } 1009 1010 return(aft); 1011 } 1012 1013 #ifdef REDEBUG 1014 /* 1015 - print - print a set of states 1016 == #ifdef REDEBUG 1017 == static void print(struct match *m, char *caption, states st, \ 1018 == int ch, FILE *d); 1019 == #endif 1020 */ 1021 static void 1022 print(m, caption, st, ch, d) 1023 struct match *m; 1024 char *caption; 1025 states st; 1026 int ch; 1027 FILE *d; 1028 { 1029 struct re_guts *g = m->g; 1030 int i; 1031 int first = 1; 1032 1033 if (!(m->eflags®_TRACE)) 1034 return; 1035 1036 fprintf(d, "%s", caption); 1037 if (ch != '\0') 1038 fprintf(d, " %s", pchar(ch)); 1039 for (i = 0; i < g->nstates; i++) 1040 if (ISSET(st, i)) { 1041 fprintf(d, "%s%d", (first) ? "\t" : ", ", i); 1042 first = 0; 1043 } 1044 fprintf(d, "\n"); 1045 } 1046 1047 /* 1048 - at - print current situation 1049 == #ifdef REDEBUG 1050 == static void at(struct match *m, char *title, char *start, char *stop, \ 1051 == sopno startst, sopno stopst); 1052 == #endif 1053 */ 1054 static void 1055 at(m, title, start, stop, startst, stopst) 1056 struct match *m; 1057 char *title; 1058 char *start; 1059 char *stop; 1060 sopno startst; 1061 sopno stopst; 1062 { 1063 if (!(m->eflags®_TRACE)) 1064 return; 1065 1066 printf("%s %s-", title, pchar(*start)); 1067 printf("%s ", pchar(*stop)); 1068 printf("%ld-%ld\n", (long)startst, (long)stopst); 1069 } 1070 1071 #ifndef PCHARDONE 1072 #define PCHARDONE /* never again */ 1073 /* 1074 - pchar - make a character printable 1075 == #ifdef REDEBUG 1076 == static char *pchar(int ch); 1077 == #endif 1078 * 1079 * Is this identical to regchar() over in debug.c? Well, yes. But a 1080 * duplicate here avoids having a debugging-capable regexec.o tied to 1081 * a matching debug.o, and this is convenient. It all disappears in 1082 * the non-debug compilation anyway, so it doesn't matter much. 1083 */ 1084 static char * /* -> representation */ 1085 pchar(ch) 1086 int ch; 1087 { 1088 static char pbuf[10]; 1089 1090 if (isprint(ch) || ch == ' ') 1091 sprintf(pbuf, "%c", ch); 1092 else 1093 sprintf(pbuf, "\\%o", ch); 1094 return(pbuf); 1095 } 1096 #endif 1097 #endif 1098 1099 #undef matcher 1100 #undef fast 1101 #undef slow 1102 #undef dissect 1103 #undef backref 1104 #undef step 1105 #undef print 1106 #undef at 1107 #undef match 1108