1 /* $NetBSD: engine.c,v 1.27 2021/02/24 18:13:21 christos Exp $ */ 2 3 /*- 4 * SPDX-License-Identifier: BSD-3-Clause 5 * 6 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 7 * Copyright (c) 1992, 1993, 1994 8 * The Regents of the University of California. All rights reserved. 9 * 10 * This code is derived from software contributed to Berkeley by 11 * Henry Spencer. 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 3. Neither the name of the University nor the names of its contributors 22 * may be used to endorse or promote products derived from this software 23 * without specific prior written permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 * SUCH DAMAGE. 36 * 37 * @(#)engine.c 8.5 (Berkeley) 3/20/94 38 */ 39 40 #include <sys/cdefs.h> 41 #ifdef __FBSDID 42 __FBSDID("$FreeBSD: head/lib/libc/regex/engine.c 368358 2020-12-05 03:16:05Z kevans $"); 43 #endif 44 __RCSID("$NetBSD: engine.c,v 1.27 2021/02/24 18:13:21 christos Exp $"); 45 46 #include <stdbool.h> 47 48 /* 49 * The matching engine and friends. This file is #included by regexec.c 50 * after suitable #defines of a variety of macros used herein, so that 51 * different state representations can be used without duplicating masses 52 * of code. 53 */ 54 55 #ifdef SNAMES 56 #define stepback sstepback 57 #define matcher smatcher 58 #define walk swalk 59 #define dissect sdissect 60 #define backref sbackref 61 #define step sstep 62 #define print sprint 63 #define at sat 64 #define match smat 65 #endif 66 #ifdef LNAMES 67 #define stepback lstepback 68 #define matcher lmatcher 69 #define walk lwalk 70 #define dissect ldissect 71 #define backref lbackref 72 #define step lstep 73 #define print lprint 74 #define at lat 75 #define match lmat 76 #endif 77 #ifdef MNAMES 78 #define stepback mstepback 79 #define matcher mmatcher 80 #define walk mwalk 81 #define dissect mdissect 82 #define backref mbackref 83 #define step mstep 84 #define print mprint 85 #define at mat 86 #define match mmat 87 #endif 88 89 /* another structure passed up and down to avoid zillions of parameters */ 90 struct match { 91 struct re_guts *g; 92 int eflags; 93 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ 94 const char *offp; /* offsets work from here */ 95 const char *beginp; /* start of string -- virtual NUL precedes */ 96 const char *endp; /* end of string -- virtual NUL here */ 97 const char *coldp; /* can be no match starting before here */ 98 const char **lastpos; /* [nplus+1] */ 99 STATEVARS; 100 states st; /* current states */ 101 states fresh; /* states for a fresh start */ 102 states tmp; /* temporary */ 103 states empty; /* empty set of states */ 104 mbstate_t mbs; /* multibyte conversion state */ 105 }; 106 107 /* ========= begin header generated by ./mkh ========= */ 108 #ifdef __cplusplus 109 extern "C" { 110 #endif 111 112 /* === engine.c === */ 113 static int matcher(struct re_guts *g, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags); 114 static const char *dissect(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst); 115 static const char *backref(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, sopno lev, int); 116 static const char *walk(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, bool fast); 117 static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_t ch, states aft, int sflags); 118 #define MAX_RECURSION 100 119 #define BOL (OUT-1) 120 #define EOL (BOL-1) 121 #define BOLEOL (BOL-2) 122 #define NOTHING (BOL-3) 123 #define BOW (BOL-4) 124 #define EOW (BOL-5) 125 #define BADCHAR (BOL-6) 126 #define NWBND (BOL-7) 127 #define NONCHAR(c) ((c) <= OUT) 128 /* sflags */ 129 #define SBOS 0x0001 130 #define SEOS 0x0002 131 132 #ifdef REDEBUG 133 static void print(struct match *m, const char *caption, states st, int ch, FILE *d); 134 #endif 135 #ifdef REDEBUG 136 static void at(struct match *m, const char *title, const char *start, const char *stop, sopno startst, sopno stopst); 137 #endif 138 #ifdef REDEBUG 139 static const char *pchar(int ch); 140 #endif 141 142 #ifdef __cplusplus 143 } 144 #endif 145 /* ========= end header generated by ./mkh ========= */ 146 147 #ifdef REDEBUG 148 #define SP(t, s, c) print(m, t, s, c, stdout) 149 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) 150 #define NOTE(str) { if (m->eflags®_TRACE) printf("=%s\n", (str)); } 151 #else 152 #define SP(t, s, c) /* nothing */ 153 #define AT(t, p1, p2, s1, s2) /* nothing */ 154 #define NOTE(s) /* nothing */ 155 #endif 156 157 /* 158 * Given a multibyte string pointed to by start, step back nchar characters 159 * from current position pointed to by cur. 160 */ 161 static const char * 162 stepback(const char *start, const char *cur, int nchar) 163 { 164 const char *ret; 165 size_t wc, mbc; 166 mbstate_t mbs; 167 size_t clen; 168 169 if (MB_CUR_MAX == 1) 170 return ((cur - nchar) > start ? cur - nchar : NULL); 171 172 ret = cur; 173 for (wc = nchar; wc > 0; wc--) { 174 for (mbc = 1; mbc <= MB_CUR_MAX; mbc++) { 175 if ((ret - mbc) < start) 176 return (NULL); 177 memset(&mbs, 0, sizeof(mbs)); 178 clen = mbrtowc(NULL, ret - mbc, mbc, &mbs); 179 if (clen != (size_t)-1 && clen != (size_t)-2) 180 break; 181 } 182 if (mbc > MB_CUR_MAX) 183 return (NULL); 184 ret -= mbc; 185 } 186 187 return (ret); 188 } 189 190 /* 191 - matcher - the actual matching engine 192 == static int matcher(struct re_guts *g, const char *string, \ 193 == size_t nmatch, regmatch_t pmatch[], int eflags); 194 */ 195 static int /* 0 success, REG_NOMATCH failure */ 196 matcher(struct re_guts *g, 197 const char *string, 198 size_t nmatch, 199 regmatch_t pmatch[], 200 int eflags) 201 { 202 const char *endp; 203 size_t i; 204 struct match mv; 205 struct match *m = &mv; 206 const char *dp = NULL; 207 const sopno gf = g->firststate+1; /* +1 for OEND */ 208 const sopno gl = g->laststate; 209 const char *start; 210 const char *stop; 211 /* Boyer-Moore algorithms variables */ 212 const char *pp; 213 size_t cj, mj; 214 const char *mustfirst; 215 const char *mustlast; 216 size_t *matchjump; 217 size_t *charjump; 218 int error = 0; 219 220 _DIAGASSERT(g != NULL); 221 _DIAGASSERT(string != NULL); 222 /* pmatch checked below */ 223 224 /* simplify the situation where possible */ 225 if (g->cflags®_NOSUB) 226 nmatch = 0; 227 if (eflags®_STARTEND) { 228 _DIAGASSERT(pmatch != NULL); 229 start = string + (size_t)pmatch[0].rm_so; 230 stop = string + (size_t)pmatch[0].rm_eo; 231 } else { 232 start = string; 233 stop = start + strlen(start); 234 } 235 if (stop < start) 236 return(REG_INVARG); 237 238 /* prescreening; this does wonders for this rather slow code */ 239 if (g->must != NULL) { 240 if (g->charjump != NULL && g->matchjump != NULL) { 241 mustfirst = g->must; 242 mustlast = g->must + g->mlen - 1; 243 charjump = g->charjump; 244 matchjump = g->matchjump; 245 pp = mustlast; 246 for (dp = start+g->mlen-1; dp < stop;) { 247 /* Fast skip non-matches */ 248 while (dp < stop && charjump[(int)*dp]) 249 dp += charjump[(int)*dp]; 250 251 if (dp >= stop) 252 break; 253 254 /* Greedy matcher */ 255 /* We depend on not being used for 256 * for strings of length 1 257 */ 258 while (*--dp == *--pp && pp != mustfirst); 259 260 if (*dp == *pp) 261 break; 262 263 /* Jump to next possible match */ 264 mj = matchjump[pp - mustfirst]; 265 cj = charjump[(int)*dp]; 266 dp += (cj < mj ? mj : cj); 267 pp = mustlast; 268 } 269 if (pp != mustfirst) 270 return(REG_NOMATCH); 271 } else { 272 for (dp = start; dp < stop; dp++) 273 if (*dp == g->must[0] && 274 (size_t)(stop - dp) >= g->mlen && 275 memcmp(dp, g->must, (size_t)g->mlen) == 0) 276 break; 277 if (dp == stop) /* we didn't find g->must */ 278 return(REG_NOMATCH); 279 } 280 } 281 282 /* match struct setup */ 283 m->g = g; 284 m->eflags = eflags; 285 m->pmatch = NULL; 286 m->lastpos = NULL; 287 m->offp = string; 288 m->beginp = start; 289 m->endp = stop; 290 STATESETUP(m, 4); 291 SETUP(m->st); 292 SETUP(m->fresh); 293 SETUP(m->tmp); 294 SETUP(m->empty); 295 CLEAR(m->empty); 296 ZAPSTATE(&m->mbs); 297 298 /* Adjust start according to moffset, to speed things up */ 299 if (dp != NULL && g->moffset > -1) { 300 const char *nstart; 301 302 nstart = stepback(start, dp, g->moffset); 303 if (nstart != NULL) 304 start = nstart; 305 } 306 307 SP("mloop", m->st, *start); 308 309 /* this loop does only one repetition except for backrefs */ 310 for (;;) { 311 endp = walk(m, start, stop, gf, gl, true); 312 if (endp == NULL) { /* a miss */ 313 error = REG_NOMATCH; 314 goto done; 315 } 316 if (nmatch == 0 && !g->backrefs) 317 break; /* no further info needed */ 318 319 /* where? */ 320 assert(m->coldp != NULL); 321 for (;;) { 322 NOTE("finding start"); 323 endp = walk(m, m->coldp, stop, gf, gl, false); 324 if (endp != NULL) 325 break; 326 assert(m->coldp < m->endp); 327 m->coldp += XMBRTOWC(NULL, m->coldp, 328 (size_t)(m->endp - m->coldp), &m->mbs, 0); 329 } 330 if (nmatch == 1 && !g->backrefs) 331 break; /* no further info needed */ 332 333 /* oh my, he wants the subexpressions... */ 334 if (m->pmatch == NULL) 335 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) * 336 sizeof(regmatch_t)); 337 if (m->pmatch == NULL) { 338 error = REG_ESPACE; 339 goto done; 340 } 341 for (i = 1; i <= m->g->nsub; i++) 342 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; 343 if (!g->backrefs && !(m->eflags®_BACKR)) { 344 NOTE("dissecting"); 345 dp = dissect(m, m->coldp, endp, gf, gl); 346 } else { 347 if (g->nplus > 0 && m->lastpos == NULL) 348 m->lastpos = malloc((g->nplus+1) * 349 sizeof(const char *)); 350 if (g->nplus > 0 && m->lastpos == NULL) { 351 error = REG_ESPACE; 352 goto done; 353 } 354 NOTE("backref dissect"); 355 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 356 } 357 if (dp != NULL) 358 break; 359 360 /* uh-oh... we couldn't find a subexpression-level match */ 361 assert(g->backrefs); /* must be back references doing it */ 362 assert(g->nplus == 0 || m->lastpos != NULL); 363 for (;;) { 364 if (dp != NULL || endp <= m->coldp) 365 break; /* defeat */ 366 NOTE("backoff"); 367 endp = walk(m, m->coldp, endp-1, gf, gl, false); 368 if (endp == NULL) 369 break; /* defeat */ 370 /* try it on a shorter possibility */ 371 #ifndef NDEBUG 372 for (i = 1; i <= m->g->nsub; i++) { 373 assert(m->pmatch[i].rm_so == (regoff_t)-1); 374 assert(m->pmatch[i].rm_eo == (regoff_t)-1); 375 } 376 #endif 377 NOTE("backoff dissect"); 378 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 379 } 380 assert(dp == NULL || dp == endp); 381 if (dp != NULL) /* found a shorter one */ 382 break; 383 384 /* despite initial appearances, there is no match here */ 385 NOTE("false alarm"); 386 /* recycle starting later */ 387 start = m->coldp + XMBRTOWC(NULL, m->coldp, 388 (size_t)(stop - m->coldp), &m->mbs, 0); 389 assert(start <= stop); 390 } 391 392 /* fill in the details if requested */ 393 if (nmatch > 0) { 394 _DIAGASSERT(pmatch != NULL); 395 pmatch[0].rm_so = m->coldp - m->offp; 396 pmatch[0].rm_eo = endp - m->offp; 397 } 398 if (nmatch > 1) { 399 assert(m->pmatch != NULL); 400 for (i = 1; i < nmatch; i++) 401 if (i <= m->g->nsub) 402 pmatch[i] = m->pmatch[i]; 403 else { 404 pmatch[i].rm_so = (regoff_t)-1; 405 pmatch[i].rm_eo = (regoff_t)-1; 406 } 407 } 408 409 done: 410 if (m->pmatch != NULL) { 411 free(m->pmatch); 412 m->pmatch = NULL; 413 } 414 if (m->lastpos != NULL) { 415 free(__UNCONST(m->lastpos)); 416 m->lastpos = NULL; 417 } 418 STATETEARDOWN(m); 419 return error; 420 } 421 422 /* 423 - dissect - figure out what matched what, no back references 424 == static const char *dissect(struct match *m, const char *start, \ 425 == const char *stop, sopno startst, sopno stopst); 426 */ 427 static const char * /* == stop (success) always */ 428 dissect( 429 struct match *m, 430 const char *start, 431 const char *stop, 432 sopno startst, 433 sopno stopst) 434 { 435 int i; 436 sopno ss; /* start sop of current subRE */ 437 sopno es; /* end sop of current subRE */ 438 const char *sp; /* start of string matched by it */ 439 const char *stp; /* string matched by it cannot pass here */ 440 const char *rest; /* start of rest of string */ 441 const char *tail; /* string unmatched by rest of RE */ 442 sopno ssub; /* start sop of subsubRE */ 443 sopno esub; /* end sop of subsubRE */ 444 const char *ssp; /* start of string matched by subsubRE */ 445 const char *sep; /* end of string matched by subsubRE */ 446 const char *oldssp; /* previous ssp */ 447 const char *dp __unused; 448 449 _DIAGASSERT(m != NULL); 450 _DIAGASSERT(start != NULL); 451 _DIAGASSERT(stop != NULL); 452 453 AT("diss", start, stop, startst, stopst); 454 sp = start; 455 for (ss = startst; ss < stopst; ss = es) { 456 /* identify end of subRE */ 457 es = ss; 458 switch (OP(m->g->strip[es])) { 459 case OPLUS_: 460 case OQUEST_: 461 es += OPND(m->g->strip[es]); 462 break; 463 case OCH_: 464 while (OP(m->g->strip[es]) != O_CH) 465 es += OPND(m->g->strip[es]); 466 break; 467 } 468 es++; 469 470 /* figure out what it matched */ 471 switch (OP(m->g->strip[ss])) { 472 case OEND: 473 assert(nope); 474 break; 475 case OCHAR: 476 sp += XMBRTOWC(NULL, sp, (size_t)(stop - start), 477 &m->mbs, 0); 478 break; 479 case OBOL: 480 case OEOL: 481 case OBOW: 482 case OEOW: 483 case OBOS: 484 case OEOS: 485 case OWBND: 486 case ONWBND: 487 break; 488 case OANY: 489 case OANYOF: 490 sp += XMBRTOWC(NULL, sp, (size_t)(stop - start), 491 &m->mbs, 0); 492 break; 493 case OBACK_: 494 case O_BACK: 495 assert(nope); 496 break; 497 /* cases where length of match is hard to find */ 498 case OQUEST_: 499 stp = stop; 500 for (;;) { 501 /* how long could this one be? */ 502 rest = walk(m, sp, stp, ss, es, false); 503 assert(rest != NULL); /* it did match */ 504 /* could the rest match the rest? */ 505 tail = walk(m, rest, stop, es, stopst, false); 506 if (tail == stop) 507 break; /* yes! */ 508 /* no -- try a shorter match for this one */ 509 stp = rest - 1; 510 assert(stp >= sp); /* it did work */ 511 } 512 ssub = ss + 1; 513 esub = es - 1; 514 /* did innards match? */ 515 if (walk(m, sp, rest, ssub, esub, false) != NULL) { 516 dp = dissect(m, sp, rest, ssub, esub); 517 assert(dp == rest); 518 } else /* no */ 519 assert(sp == rest); 520 sp = rest; 521 break; 522 case OPLUS_: 523 stp = stop; 524 for (;;) { 525 /* how long could this one be? */ 526 rest = walk(m, sp, stp, ss, es, false); 527 assert(rest != NULL); /* it did match */ 528 /* could the rest match the rest? */ 529 tail = walk(m, rest, stop, es, stopst, false); 530 if (tail == stop) 531 break; /* yes! */ 532 /* no -- try a shorter match for this one */ 533 stp = rest - 1; 534 assert(stp >= sp); /* it did work */ 535 } 536 ssub = ss + 1; 537 esub = es - 1; 538 ssp = sp; 539 oldssp = ssp; 540 for (;;) { /* find last match of innards */ 541 sep = walk(m, ssp, rest, ssub, esub, false); 542 if (sep == NULL || sep == ssp) 543 break; /* failed or matched null */ 544 oldssp = ssp; /* on to next try */ 545 ssp = sep; 546 } 547 if (sep == NULL) { 548 /* last successful match */ 549 sep = ssp; 550 ssp = oldssp; 551 } 552 assert(sep == rest); /* must exhaust substring */ 553 assert(walk(m, ssp, sep, ssub, esub, false) == rest); 554 dp = dissect(m, ssp, sep, ssub, esub); 555 assert(dp == sep); 556 sp = rest; 557 break; 558 case OCH_: 559 stp = stop; 560 for (;;) { 561 /* how long could this one be? */ 562 rest = walk(m, sp, stp, ss, es, false); 563 assert(rest != NULL); /* it did match */ 564 /* could the rest match the rest? */ 565 tail = walk(m, rest, stop, es, stopst, false); 566 if (tail == stop) 567 break; /* yes! */ 568 /* no -- try a shorter match for this one */ 569 stp = rest - 1; 570 assert(stp >= sp); /* it did work */ 571 } 572 ssub = ss + 1; 573 esub = ss + OPND(m->g->strip[ss]) - 1; 574 assert(OP(m->g->strip[esub]) == OOR1); 575 for (;;) { /* find first matching branch */ 576 if (walk(m, sp, rest, ssub, esub, false) == rest) 577 break; /* it matched all of it */ 578 /* that one missed, try next one */ 579 assert(OP(m->g->strip[esub]) == OOR1); 580 esub++; 581 assert(OP(m->g->strip[esub]) == OOR2); 582 ssub = esub + 1; 583 esub += OPND(m->g->strip[esub]); 584 if (OP(m->g->strip[esub]) == OOR2) 585 esub--; 586 else 587 assert(OP(m->g->strip[esub]) == O_CH); 588 } 589 dp = dissect(m, sp, rest, ssub, esub); 590 assert(dp == rest); 591 sp = rest; 592 break; 593 case O_PLUS: 594 case O_QUEST: 595 case OOR1: 596 case OOR2: 597 case O_CH: 598 assert(nope); 599 break; 600 case OLPAREN: 601 i = OPND(m->g->strip[ss]); 602 assert(0 < i && i <= m->g->nsub); 603 m->pmatch[i].rm_so = sp - m->offp; 604 break; 605 case ORPAREN: 606 i = OPND(m->g->strip[ss]); 607 assert(0 < i && i <= m->g->nsub); 608 m->pmatch[i].rm_eo = sp - m->offp; 609 break; 610 default: /* uh oh */ 611 assert(nope); 612 break; 613 } 614 } 615 616 assert(sp == stop); 617 return(sp); 618 } 619 620 #define ISBOW(m, sp) \ 621 (sp < m->endp && ISWORD(*sp) && \ 622 ((sp == m->beginp && !(m->eflags®_NOTBOL)) || \ 623 (sp > m->offp && !ISWORD(*(sp-1))))) 624 #define ISEOW(m, sp) \ 625 (((sp == m->endp && !(m->eflags®_NOTEOL)) || \ 626 (sp < m->endp && *sp == '\n' && \ 627 (m->g->cflags®_NEWLINE)) || \ 628 (sp < m->endp && !ISWORD(*sp)) ) && \ 629 (sp > m->beginp && ISWORD(*(sp-1)))) \ 630 631 /* 632 - backref - figure out what matched what, figuring in back references 633 == static const char *backref(struct match *m, const char *start, \ 634 == const char *stop, sopno startst, sopno stopst, sopno lev); 635 */ 636 static const char * /* == stop (success) or NULL (failure) */ 637 backref( 638 struct match *m, 639 const char *start, 640 const char *stop, 641 sopno startst, 642 sopno stopst, 643 sopno lev, /* PLUS nesting level */ 644 int rec) 645 { 646 int i; 647 sopno ss; /* start sop of current subRE */ 648 const char *sp; /* start of string matched by it */ 649 sopno ssub; /* start sop of subsubRE */ 650 sopno esub; /* end sop of subsubRE */ 651 const char *ssp; /* start of string matched by subsubRE */ 652 const char *dp; 653 size_t len; 654 int hard; 655 sop s; 656 regoff_t offsave; 657 cset *cs; 658 wint_t wc; 659 660 _DIAGASSERT(m != NULL); 661 _DIAGASSERT(start != NULL); 662 _DIAGASSERT(stop != NULL); 663 664 AT("back", start, stop, startst, stopst); 665 sp = start; 666 667 /* get as far as we can with easy stuff */ 668 hard = 0; 669 for (ss = startst; !hard && ss < stopst; ss++) 670 switch (OP(s = m->g->strip[ss])) { 671 case OCHAR: 672 if (sp == stop) 673 return(NULL); 674 sp += XMBRTOWC(&wc, sp, (size_t)(stop - sp), 675 &m->mbs, BADCHAR); 676 if (wc != (wint_t)OPND(s)) 677 return(NULL); 678 break; 679 case OANY: 680 if (sp == stop) 681 return(NULL); 682 sp += XMBRTOWC(&wc, sp, (size_t)(stop - sp), 683 &m->mbs, BADCHAR); 684 if (wc == BADCHAR) 685 return (NULL); 686 break; 687 case OANYOF: 688 if (sp == stop) 689 return (NULL); 690 cs = &m->g->sets[OPND(s)]; 691 sp += XMBRTOWC(&wc, sp, (size_t)(stop - sp), 692 &m->mbs, BADCHAR); 693 if (wc == BADCHAR || !CHIN(cs, wc)) 694 return(NULL); 695 break; 696 case OBOS: 697 if (sp == m->beginp && (m->eflags & REG_NOTBOL) == 0) 698 { /* yes */ } 699 else 700 return(NULL); 701 break; 702 case OEOS: 703 if (sp == m->endp && (m->eflags & REG_NOTEOL) == 0) 704 { /* yes */ } 705 else 706 return(NULL); 707 break; 708 case OBOL: 709 if ((sp == m->beginp && !(m->eflags®_NOTBOL)) || 710 (sp > m->offp && sp < m->endp && 711 *(sp-1) == '\n' && (m->g->cflags®_NEWLINE))) 712 { /* yes */ } 713 else 714 return(NULL); 715 break; 716 case OEOL: 717 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) || 718 (sp < m->endp && *sp == '\n' && 719 (m->g->cflags®_NEWLINE)) ) 720 { /* yes */ } 721 else 722 return(NULL); 723 break; 724 case OWBND: 725 if (ISBOW(m, sp) || ISEOW(m, sp)) 726 { /* yes */ } 727 else 728 return(NULL); 729 break; 730 case ONWBND: 731 if (((sp == m->beginp) && !ISWORD(*sp)) || 732 (sp == m->endp && !ISWORD(*(sp - 1)))) 733 { /* yes, beginning/end of subject */ } 734 else if (ISWORD(*(sp - 1)) == ISWORD(*sp)) 735 { /* yes, beginning/end of subject */ } 736 else 737 return(NULL); 738 break; 739 case OBOW: 740 if (ISBOW(m, sp)) 741 { /* yes */ } 742 else 743 return(NULL); 744 break; 745 case OEOW: 746 if (ISEOW(m, sp)) 747 { /* yes */ } 748 else 749 return(NULL); 750 break; 751 case O_QUEST: 752 break; 753 case OOR1: /* matches null but needs to skip */ 754 ss++; 755 s = m->g->strip[ss]; 756 do { 757 assert(OP(s) == OOR2); 758 ss += OPND(s); 759 } while (OP(s = m->g->strip[ss]) != O_CH); 760 /* note that the ss++ gets us past the O_CH */ 761 break; 762 default: /* have to make a choice */ 763 hard = 1; 764 break; 765 } 766 if (!hard) { /* that was it! */ 767 if (sp != stop) 768 return(NULL); 769 return(sp); 770 } 771 ss--; /* adjust for the for's final increment */ 772 773 /* the hard stuff */ 774 AT("hard", sp, stop, ss, stopst); 775 s = m->g->strip[ss]; 776 switch (OP(s)) { 777 case OBACK_: /* the vilest depths */ 778 i = OPND(s); 779 assert(0 < i && i <= m->g->nsub); 780 if (m->pmatch[i].rm_eo == -1) 781 return(NULL); 782 assert(m->pmatch[i].rm_so != -1); 783 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; 784 if (len == 0 && rec++ > MAX_RECURSION) 785 return(NULL); 786 assert(stop - m->beginp >= len); 787 if (sp > stop - len) 788 return(NULL); /* not enough left to match */ 789 ssp = m->offp + m->pmatch[i].rm_so; 790 if (memcmp(sp, ssp, len) != 0) 791 return(NULL); 792 while (m->g->strip[ss] != SOP(O_BACK, i)) 793 ss++; 794 return(backref(m, sp+len, stop, ss+1, stopst, lev, rec)); 795 case OQUEST_: /* to null or not */ 796 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 797 if (dp != NULL) 798 return(dp); /* not */ 799 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec)); 800 case OPLUS_: 801 assert(m->lastpos != NULL); 802 assert(lev+1 <= m->g->nplus); 803 m->lastpos[lev+1] = sp; 804 return(backref(m, sp, stop, ss+1, stopst, lev+1, rec)); 805 case O_PLUS: 806 if (sp == m->lastpos[lev]) /* last pass matched null */ 807 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 808 /* try another pass */ 809 m->lastpos[lev] = sp; 810 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec); 811 if (dp == NULL) 812 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 813 else 814 return(dp); 815 case OCH_: /* find the right one, if any */ 816 ssub = ss + 1; 817 esub = ss + OPND(s) - 1; 818 assert(OP(m->g->strip[esub]) == OOR1); 819 for (;;) { /* find first matching branch */ 820 dp = backref(m, sp, stop, ssub, esub, lev, rec); 821 if (dp != NULL) 822 return(dp); 823 /* that one missed, try next one */ 824 if (OP(m->g->strip[esub]) == O_CH) 825 return(NULL); /* there is none */ 826 esub++; 827 assert(OP(m->g->strip[esub]) == OOR2); 828 ssub = esub + 1; 829 esub += OPND(m->g->strip[esub]); 830 if (OP(m->g->strip[esub]) == OOR2) 831 esub--; 832 else 833 assert(OP(m->g->strip[esub]) == O_CH); 834 } 835 /* NOTREACHED */ 836 break; 837 case OLPAREN: /* must undo assignment if rest fails */ 838 i = OPND(s); 839 assert(0 < i && i <= m->g->nsub); 840 offsave = m->pmatch[i].rm_so; 841 m->pmatch[i].rm_so = sp - m->offp; 842 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 843 if (dp != NULL) 844 return(dp); 845 m->pmatch[i].rm_so = offsave; 846 return(NULL); 847 case ORPAREN: /* must undo assignment if rest fails */ 848 i = OPND(s); 849 assert(0 < i && i <= m->g->nsub); 850 offsave = m->pmatch[i].rm_eo; 851 m->pmatch[i].rm_eo = sp - m->offp; 852 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 853 if (dp != NULL) 854 return(dp); 855 m->pmatch[i].rm_eo = offsave; 856 return(NULL); 857 default: /* uh oh */ 858 assert(nope); 859 break; 860 } 861 862 /* "can't happen" */ 863 assert(nope); 864 /* NOTREACHED */ 865 return "shut up gcc"; 866 } 867 868 /* 869 - walk - step through the string either quickly or slowly 870 == static const char *walk(struct match *m, const char *start, \ 871 == const char *stop, sopno startst, sopno stopst, bool fast); 872 */ 873 static const char * /* where it ended, or NULL */ 874 walk(struct match *m, const char *start, const char *stop, sopno startst, 875 sopno stopst, bool fast) 876 { 877 states st = m->st; 878 states fresh = m->fresh; 879 states empty = m->empty; 880 states tmp = m->tmp; 881 const char *p = start; 882 wint_t c; 883 wint_t lastc; /* previous c */ 884 wint_t flagch; 885 int sflags; 886 const char *matchp; /* last p at which a match ended */ 887 size_t i, clen; 888 889 _DIAGASSERT(m != NULL); 890 _DIAGASSERT(start != NULL); 891 _DIAGASSERT(stop != NULL); 892 893 sflags = 0; 894 AT("walk", start, stop, startst, stopst); 895 CLEAR(st); 896 SET1(st, startst); 897 SP("sstart", st, *p); 898 st = step(m->g, startst, stopst, st, NOTHING, st, sflags); 899 if (fast) 900 ASSIGN(fresh, st); 901 matchp = NULL; 902 if (start == m->offp || (start == m->beginp && !(m->eflags®_NOTBOL))) 903 c = OUT; 904 else { 905 /* 906 * XXX Wrong if the previous character was multi-byte. 907 * Newline never is (in encodings supported by FreeBSD), 908 * so this only breaks the ISWORD tests below. 909 */ 910 c = (uch)*(start - 1); 911 } 912 for (;;) { 913 /* next character */ 914 lastc = c; 915 sflags = 0; 916 if (p == m->endp) { 917 c = OUT; 918 clen = 0; 919 } else 920 clen = XMBRTOWC(&c, p, (size_t)(m->endp - p), 921 &m->mbs, BADCHAR); 922 923 if (fast && EQ(st, fresh)) 924 matchp = p; 925 926 /* is there an EOL and/or BOL between lastc and c? */ 927 flagch = '\0'; 928 i = 0; 929 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 930 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 931 flagch = BOL; 932 i = m->g->nbol; 933 } 934 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 935 (c == OUT && !(m->eflags®_NOTEOL)) ) { 936 flagch = (flagch == BOL) ? BOLEOL : EOL; 937 i += m->g->neol; 938 } 939 if (lastc == OUT && (m->eflags & REG_NOTBOL) == 0) { 940 sflags |= SBOS; 941 /* Step one more for BOS. */ 942 i++; 943 } 944 if (c == OUT && (m->eflags & REG_NOTEOL) == 0) { 945 sflags |= SEOS; 946 /* Step one more for EOS. */ 947 i++; 948 } 949 if (i != 0) { 950 for (; i > 0; i--) 951 st = step(m->g, startst, stopst, st, flagch, st, 952 sflags); 953 SP("sboleol", st, c); 954 } 955 956 /* how about a word boundary? */ 957 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 958 (c != OUT && ISWORD(c)) ) { 959 flagch = BOW; 960 } 961 if ( (lastc != OUT && ISWORD(lastc)) && 962 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 963 flagch = EOW; 964 } 965 if (flagch == BOW || flagch == EOW) { 966 st = step(m->g, startst, stopst, st, flagch, st, sflags); 967 SP("sboweow", st, c); 968 } 969 if (lastc != OUT && c != OUT && 970 ISWORD(lastc) == ISWORD(c)) { 971 flagch = NWBND; 972 } else if ((lastc == OUT && !ISWORD(c)) || 973 (c == OUT && !ISWORD(lastc))) { 974 flagch = NWBND; 975 } 976 if (flagch == NWBND) { 977 st = step(m->g, startst, stopst, st, flagch, st, sflags); 978 SP("snwbnd", st, c); 979 } 980 981 /* are we done? */ 982 if (ISSET(st, stopst)) { 983 if (fast) 984 break; 985 else 986 matchp = p; 987 } 988 if (EQ(st, empty) || p == stop || clen > (size_t)(stop - p)) 989 break; /* NOTE BREAK OUT */ 990 991 /* no, we must deal with this character */ 992 ASSIGN(tmp, st); 993 if (fast) 994 ASSIGN(st, fresh); 995 else 996 ASSIGN(st, empty); 997 assert(c != OUT); 998 st = step(m->g, startst, stopst, tmp, c, st, sflags); 999 SP("saft", st, c); 1000 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st, sflags), 1001 st)); 1002 p += clen; 1003 } 1004 1005 if (fast) { 1006 assert(matchp != NULL); 1007 m->coldp = matchp; 1008 if (ISSET(st, stopst)) 1009 return (p + XMBRTOWC(NULL, p, (size_t)(stop - p), 1010 &m->mbs, 0)); 1011 else 1012 return (NULL); 1013 } else 1014 return (matchp); 1015 } 1016 1017 /* 1018 - step - map set of states reachable before char to set reachable after 1019 == static states step(struct re_guts *g, sopno start, sopno stop, \ 1020 == states bef, int ch, states aft); 1021 == #define BOL (OUT-1) 1022 == #define EOL (BOL-1) 1023 == #define BOLEOL (BOL-2) 1024 == #define NOTHING (BOL-3) 1025 == #define BOW (BOL-4) 1026 == #define EOW (BOL-5) 1027 == #define BADCHAR (BOL-6) 1028 == #define NONCHAR(c) ((c) <= OUT) 1029 */ 1030 static states 1031 step(struct re_guts *g, 1032 sopno start, /* start state within strip */ 1033 sopno stop, /* state after stop state within strip */ 1034 states bef, /* states reachable before */ 1035 wint_t ch, /* character or NONCHAR code */ 1036 states aft, /* states already known reachable after */ 1037 int sflags) /* state flags */ 1038 { 1039 cset *cs; 1040 sop s; 1041 sopno pc; 1042 onestate here; /* note, macros know this name */ 1043 sopno look; 1044 int i; 1045 1046 _DIAGASSERT(g != NULL); 1047 1048 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { 1049 s = g->strip[pc]; 1050 switch (OP(s)) { 1051 case OEND: 1052 assert(pc == stop-1); 1053 break; 1054 case OCHAR: 1055 /* only characters can match */ 1056 assert(!NONCHAR(ch) || ch != OPND(s)); 1057 if (ch == (wint_t)OPND(s)) 1058 FWD(aft, bef, 1); 1059 break; 1060 case OBOS: 1061 if ((ch == BOL || ch == BOLEOL) && (sflags & SBOS) != 0) 1062 FWD(aft, bef, 1); 1063 break; 1064 case OEOS: 1065 if ((ch == EOL || ch == BOLEOL) && (sflags & SEOS) != 0) 1066 FWD(aft, bef, 1); 1067 break; 1068 case OBOL: 1069 if (ch == BOL || ch == BOLEOL) 1070 FWD(aft, bef, 1); 1071 break; 1072 case OEOL: 1073 if (ch == EOL || ch == BOLEOL) 1074 FWD(aft, bef, 1); 1075 break; 1076 case OBOW: 1077 if (ch == BOW) 1078 FWD(aft, bef, 1); 1079 break; 1080 case OEOW: 1081 if (ch == EOW) 1082 FWD(aft, bef, 1); 1083 break; 1084 case OWBND: 1085 if (ch == BOW || ch == EOW) 1086 FWD(aft, bef, 1); 1087 break; 1088 case ONWBND: 1089 if (ch == NWBND) 1090 FWD(aft, aft, 1); 1091 break; 1092 case OANY: 1093 if (!NONCHAR(ch)) 1094 FWD(aft, bef, 1); 1095 break; 1096 case OANYOF: 1097 cs = &g->sets[OPND(s)]; 1098 if (!NONCHAR(ch) && CHIN(cs, ch)) 1099 FWD(aft, bef, 1); 1100 break; 1101 case OBACK_: /* ignored here */ 1102 case O_BACK: 1103 FWD(aft, aft, 1); 1104 break; 1105 case OPLUS_: /* forward, this is just an empty */ 1106 FWD(aft, aft, 1); 1107 break; 1108 case O_PLUS: /* both forward and back */ 1109 FWD(aft, aft, 1); 1110 i = ISSETBACK(aft, OPND(s)); 1111 BACK(aft, aft, OPND(s)); 1112 if (!i && ISSETBACK(aft, OPND(s))) { 1113 /* oho, must reconsider loop body */ 1114 pc -= OPND(s) + 1; 1115 INIT(here, pc); 1116 } 1117 break; 1118 case OQUEST_: /* two branches, both forward */ 1119 FWD(aft, aft, 1); 1120 FWD(aft, aft, OPND(s)); 1121 break; 1122 case O_QUEST: /* just an empty */ 1123 FWD(aft, aft, 1); 1124 break; 1125 case OLPAREN: /* not significant here */ 1126 case ORPAREN: 1127 FWD(aft, aft, 1); 1128 break; 1129 case OCH_: /* mark the first two branches */ 1130 FWD(aft, aft, 1); 1131 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 1132 FWD(aft, aft, OPND(s)); 1133 break; 1134 case OOR1: /* done a branch, find the O_CH */ 1135 if (ISSTATEIN(aft, here)) { 1136 for (look = 1; 1137 OP(s = g->strip[pc+look]) != O_CH; 1138 look += OPND(s)) 1139 assert(OP(s) == OOR2); 1140 FWD(aft, aft, look + 1); 1141 } 1142 break; 1143 case OOR2: /* propagate OCH_'s marking */ 1144 FWD(aft, aft, 1); 1145 if (OP(g->strip[pc+OPND(s)]) != O_CH) { 1146 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 1147 FWD(aft, aft, OPND(s)); 1148 } 1149 break; 1150 case O_CH: /* just empty */ 1151 FWD(aft, aft, 1); 1152 break; 1153 default: /* ooooops... */ 1154 assert(nope); 1155 break; 1156 } 1157 } 1158 1159 return(aft); 1160 } 1161 1162 #ifdef REDEBUG 1163 /* 1164 - print - print a set of states 1165 == #ifdef REDEBUG 1166 == static void print(struct match *m, const char *caption, states st, \ 1167 == int ch, FILE *d); 1168 == #endif 1169 */ 1170 static void 1171 print(struct match *m, 1172 const char *caption, 1173 states st, 1174 int ch, 1175 FILE *d) 1176 { 1177 struct re_guts *g = m->g; 1178 sopno i; 1179 int first = 1; 1180 1181 _DIAGASSERT(m != NULL); 1182 _DIAGASSERT(caption != NULL); 1183 1184 if (!(m->eflags®_TRACE)) 1185 return; 1186 1187 _DIAGASSERT(d != NULL); 1188 1189 fprintf(d, "%s", caption); 1190 if (ch != '\0') 1191 fprintf(d, " %s", pchar(ch)); 1192 for (i = 0; i < g->nstates; i++) 1193 if (ISSET(st, i)) { 1194 fprintf(d, "%s%lu", (first) ? "\t" : ", ", i); 1195 first = 0; 1196 } 1197 fprintf(d, "\n"); 1198 } 1199 1200 /* 1201 - at - print current situation 1202 == #ifdef REDEBUG 1203 == static void at(struct match *m, const char *title, const char *start, \ 1204 == const char *stop, sopno startst, sopno stopst); 1205 == #endif 1206 */ 1207 static void 1208 at( struct match *m, 1209 const char *title, 1210 const char *start, 1211 const char *stop, 1212 sopno startst, 1213 sopno stopst) 1214 { 1215 1216 _DIAGASSERT(m != NULL); 1217 _DIAGASSERT(title != NULL); 1218 _DIAGASSERT(start != NULL); 1219 _DIAGASSERT(stop != NULL); 1220 1221 if (!(m->eflags®_TRACE)) 1222 return; 1223 1224 printf("%s %s-", title, pchar(*start)); 1225 printf("%s ", pchar(*stop)); 1226 printf("%ld-%ld\n", (long)startst, (long)stopst); 1227 } 1228 1229 #ifndef PCHARDONE 1230 #define PCHARDONE /* never again */ 1231 /* 1232 - pchar - make a character printable 1233 == #ifdef REDEBUG 1234 == static const char *pchar(int ch); 1235 == #endif 1236 * 1237 * Is this identical to regchar() over in debug.c? Well, yes. But a 1238 * duplicate here avoids having a debugging-capable regexec.o tied to 1239 * a matching debug.o, and this is convenient. It all disappears in 1240 * the non-debug compilation anyway, so it doesn't matter much. 1241 */ 1242 static const char * /* -> representation */ 1243 pchar(int ch) 1244 { 1245 static char pbuf[10]; 1246 1247 if (isprint((uch)ch) || ch == ' ') 1248 snprintf(pbuf, sizeof(pbuf), "%c", ch); 1249 else 1250 snprintf(pbuf, sizeof(pbuf), "\\%o", ch); 1251 return(pbuf); 1252 } 1253 #endif 1254 #endif 1255 1256 #undef stepback 1257 #undef matcher 1258 #undef walk 1259 #undef dissect 1260 #undef backref 1261 #undef step 1262 #undef print 1263 #undef at 1264 #undef match 1265