1 /* $NetBSD: engine.c,v 1.7 2011/11/19 17:45:11 tnozaki Exp $ */
2
3 /*-
4 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
5 * Copyright (c) 1992, 1993, 1994
6 * The Regents of the University of California. All rights reserved.
7 *
8 * This code is derived from software contributed to Berkeley by
9 * Henry Spencer of the University of Toronto.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 * 3. Neither the name of the University nor the names of its contributors
20 * may be used to endorse or promote products derived from this software
21 * without specific prior written permission.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * SUCH DAMAGE.
34 *
35 * @(#)engine.c 8.4 (Berkeley) 3/19/94
36 */
37
38 /*
39 * The matching engine and friends. This file is #included by regexec.c
40 * after suitable #defines of a variety of macros used herein, so that
41 * different state representations can be used without duplicating masses
42 * of code.
43 */
44
45 #ifdef SNAMES
46 #define matcher smatcher
47 #define fast sfast
48 #define slow sslow
49 #define dissect sdissect
50 #define backref sbackref
51 #define step sstep
52 #define print sprint
53 #define at sat
54 #define match smat
55 #endif
56 #ifdef LNAMES
57 #define matcher lmatcher
58 #define fast lfast
59 #define slow lslow
60 #define dissect ldissect
61 #define backref lbackref
62 #define step lstep
63 #define print lprint
64 #define at lat
65 #define match lmat
66 #endif
67
68 /* another structure passed up and down to avoid zillions of parameters */
69 struct match {
70 struct re_guts *g;
71 int eflags;
72 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */
73 const RCHAR_T *offp; /* offsets work from here */
74 const RCHAR_T *beginp; /* start of string -- virtual NUL precedes */
75 const RCHAR_T *endp; /* end of string -- virtual NUL here */
76 const RCHAR_T *coldp; /* can be no match starting before here */
77 const RCHAR_T **lastpos; /* [nplus+1] */
78 STATEVARS;
79 states st; /* current states */
80 states fresh; /* states for a fresh start */
81 states tmp; /* temporary */
82 states empty; /* empty set of states */
83 };
84
85 /* ========= begin header generated by ./mkh ========= */
86 #ifdef __cplusplus
87 extern "C" {
88 #endif
89
90 /* === engine.c === */
91 static int matcher(struct re_guts *g, const RCHAR_T *string, size_t nmatch, regmatch_t pmatch[], int eflags);
92 static const RCHAR_T *dissect(struct match *m, const RCHAR_T *start, const RCHAR_T *stop, sopno startst, sopno stopst);
93 static const RCHAR_T *backref(struct match *m, const RCHAR_T *start, const RCHAR_T *stop, sopno startst, sopno stopst, sopno lev);
94 static const RCHAR_T *fast(struct match *m, const RCHAR_T *start, const RCHAR_T *stop, sopno startst, sopno stopst);
95 static const RCHAR_T *slow(struct match *m, const RCHAR_T *start, const RCHAR_T *stop, sopno startst, sopno stopst);
96 static states step(struct re_guts *g, sopno start, sopno stop, states bef, int flag, RCHAR_T ch, states aft);
97 #define BOL (1)
98 #define EOL (BOL+1)
99 #define BOLEOL (BOL+2)
100 #define NOTHING (BOL+3)
101 #define BOW (BOL+4)
102 #define EOW (BOL+5)
103 #ifdef REDEBUG
104 static void print(struct match *m, char *caption, states st, int ch, FILE *d);
105 #endif
106 #ifdef REDEBUG
107 static void at(struct match *m, char *title, char *start, char *stop, sopno startst, sopno stopst);
108 #endif
109 #ifdef REDEBUG
110 static char *pchar(int ch);
111 #endif
112
113 #ifdef __cplusplus
114 }
115 #endif
116 /* ========= end header generated by ./mkh ========= */
117
118 #ifdef REDEBUG
119 #define SP(t, s, c) print(m, t, s, c, stdout)
120 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2)
121 #define NOTE(str) { if (m->eflags®_TRACE) printf("=%s\n", (str)); }
122 #else
123 #define SP(t, s, c) /* nothing */
124 #define AT(t, p1, p2, s1, s2) /* nothing */
125 #define NOTE(s) /* nothing */
126 #endif
127
128 /*
129 - matcher - the actual matching engine
130 */
131 static int /* 0 success, REG_NOMATCH failure */
matcher(struct re_guts * g,const RCHAR_T * string,size_t nmatch,regmatch_t pmatch[],int eflags)132 matcher(struct re_guts *g, const RCHAR_T *string, size_t nmatch,
133 regmatch_t pmatch[], int eflags)
134 {
135 const RCHAR_T *endp;
136 size_t i;
137 struct match mv;
138 struct match *m = &mv;
139 const RCHAR_T *dp;
140 const sopno gf = g->firststate+1; /* +1 for OEND */
141 const sopno gl = g->laststate;
142 const RCHAR_T *start;
143 const RCHAR_T *stop;
144
145 /* simplify the situation where possible */
146 if (g->cflags®_NOSUB)
147 nmatch = 0;
148 if (eflags®_STARTEND) {
149 start = string + pmatch[0].rm_so;
150 stop = string + pmatch[0].rm_eo;
151 } else {
152 start = string;
153 stop = start + STRLEN(start);
154 }
155 if (stop < start)
156 return(REG_INVARG);
157
158 /* prescreening; this does wonders for this rather slow code */
159 if (g->must != NULL) {
160 for (dp = start; dp < stop; dp++)
161 if (*dp == g->must[0] && (size_t)(stop - dp) >= g->mlen &&
162 MEMCMP(dp, g->must, g->mlen) == 0)
163 break;
164 if (dp == stop) /* we didn't find g->must */
165 return(REG_NOMATCH);
166 }
167
168 /* match struct setup */
169 m->g = g;
170 m->eflags = eflags;
171 m->pmatch = NULL;
172 m->lastpos = NULL;
173 m->offp = string;
174 m->beginp = start;
175 m->endp = stop;
176 STATESETUP(m, 4);
177 SETUP(m->st);
178 SETUP(m->fresh);
179 SETUP(m->tmp);
180 SETUP(m->empty);
181 CLEAR(m->empty);
182
183 /* this loop does only one repetition except for backrefs */
184 for (;;) {
185 endp = fast(m, start, stop, gf, gl);
186 if (endp == NULL) { /* a miss */
187 STATETEARDOWN(m);
188 return(REG_NOMATCH);
189 }
190 if (nmatch == 0 && !g->backrefs)
191 break; /* no further info needed */
192
193 /* where? */
194 assert(m->coldp != NULL);
195 for (;;) {
196 NOTE("finding start");
197 endp = slow(m, m->coldp, stop, gf, gl);
198 if (endp != NULL)
199 break;
200 assert(m->coldp < m->endp);
201 m->coldp++;
202 }
203 if (nmatch == 1 && !g->backrefs)
204 break; /* no further info needed */
205
206 /* oh my, he wants the subexpressions... */
207 if (m->pmatch == NULL)
208 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) *
209 sizeof(regmatch_t));
210 if (m->pmatch == NULL) {
211 STATETEARDOWN(m);
212 return(REG_ESPACE);
213 }
214 for (i = 1; i <= m->g->nsub; i++)
215 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1;
216 if (!g->backrefs && !(m->eflags®_BACKR)) {
217 NOTE("dissecting");
218 dp = dissect(m, m->coldp, endp, gf, gl);
219 } else {
220 if (g->nplus > 0 && m->lastpos == NULL)
221 m->lastpos = (const RCHAR_T **)malloc((g->nplus+1) *
222 sizeof(const RCHAR_T *));
223 if (g->nplus > 0 && m->lastpos == NULL) {
224 free(m->pmatch);
225 STATETEARDOWN(m);
226 return(REG_ESPACE);
227 }
228 NOTE("backref dissect");
229 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
230 }
231 if (dp != NULL)
232 break;
233
234 /* uh-oh... we couldn't find a subexpression-level match */
235 assert(g->backrefs); /* must be back references doing it */
236 assert(g->nplus == 0 || m->lastpos != NULL);
237 for (;;) {
238 if (dp != NULL || endp <= m->coldp)
239 break; /* defeat */
240 NOTE("backoff");
241 endp = slow(m, m->coldp, endp-1, gf, gl);
242 if (endp == NULL)
243 break; /* defeat */
244 /* try it on a shorter possibility */
245 #ifndef NDEBUG
246 for (i = 1; i <= m->g->nsub; i++) {
247 assert(m->pmatch[i].rm_so == -1);
248 assert(m->pmatch[i].rm_eo == -1);
249 }
250 #endif
251 NOTE("backoff dissect");
252 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
253 }
254 assert(dp == NULL || dp == endp);
255 if (dp != NULL) /* found a shorter one */
256 break;
257
258 /* despite initial appearances, there is no match here */
259 NOTE("false alarm");
260 start = m->coldp + 1; /* recycle starting later */
261 assert(start <= stop);
262 }
263
264 /* fill in the details if requested */
265 if (nmatch > 0) {
266 pmatch[0].rm_so = m->coldp - m->offp;
267 pmatch[0].rm_eo = endp - m->offp;
268 }
269 if (nmatch > 1) {
270 assert(m->pmatch != NULL);
271 for (i = 1; i < nmatch; i++)
272 if (i <= m->g->nsub)
273 pmatch[i] = m->pmatch[i];
274 else {
275 pmatch[i].rm_so = -1;
276 pmatch[i].rm_eo = -1;
277 }
278 }
279
280 if (m->pmatch != NULL)
281 free((char *)m->pmatch);
282 if (m->lastpos != NULL)
283 free((char *)m->lastpos);
284 STATETEARDOWN(m);
285 return(0);
286 }
287
288 /*
289 - dissect - figure out what matched what, no back references
290 */
291 static const RCHAR_T * /* == stop (success) always */
dissect(struct match * m,const RCHAR_T * start,const RCHAR_T * stop,sopno startst,sopno stopst)292 dissect(struct match *m, const RCHAR_T *start, const RCHAR_T *stop,
293 sopno startst, sopno stopst)
294 {
295 int i;
296 sopno ss; /* start sop of current subRE */
297 sopno es; /* end sop of current subRE */
298 const RCHAR_T *sp; /* start of string matched by it */
299 const RCHAR_T *stp; /* string matched by it cannot pass here */
300 const RCHAR_T *rest; /* start of rest of string */
301 const RCHAR_T *tail; /* string unmatched by rest of RE */
302 sopno ssub; /* start sop of subsubRE */
303 sopno esub; /* end sop of subsubRE */
304 const RCHAR_T *ssp; /* start of string matched by subsubRE */
305 const RCHAR_T *sep; /* end of string matched by subsubRE */
306 const RCHAR_T *oldssp; /* previous ssp */
307 const RCHAR_T *dp;
308
309 AT("diss", start, stop, startst, stopst);
310 sp = start;
311 for (ss = startst; ss < stopst; ss = es) {
312 /* identify end of subRE */
313 es = ss;
314 switch (m->g->strip[es]) {
315 case OPLUS_:
316 case OQUEST_:
317 es += m->g->stripdata[es];
318 break;
319 case OCH_:
320 while (m->g->strip[es] != O_CH)
321 es += m->g->stripdata[es];
322 break;
323 }
324 es++;
325
326 /* figure out what it matched */
327 switch (m->g->strip[ss]) {
328 case OEND:
329 assert(nope);
330 break;
331 case OCHAR:
332 sp++;
333 break;
334 case OBOL:
335 case OEOL:
336 case OBOW:
337 case OEOW:
338 break;
339 case OANY:
340 case OANYOF:
341 sp++;
342 break;
343 case OBACK_:
344 case O_BACK:
345 assert(nope);
346 break;
347 /* cases where length of match is hard to find */
348 case OQUEST_:
349 stp = stop;
350 for (;;) {
351 /* how long could this one be? */
352 rest = slow(m, sp, stp, ss, es);
353 assert(rest != NULL); /* it did match */
354 /* could the rest match the rest? */
355 tail = slow(m, rest, stop, es, stopst);
356 if (tail == stop)
357 break; /* yes! */
358 /* no -- try a shorter match for this one */
359 stp = rest - 1;
360 assert(stp >= sp); /* it did work */
361 }
362 ssub = ss + 1;
363 esub = es - 1;
364 /* did innards match? */
365 if (slow(m, sp, rest, ssub, esub) != NULL) {
366 dp = dissect(m, sp, rest, ssub, esub);
367 assert(dp == rest);
368 } else /* no */
369 assert(sp == rest);
370 sp = rest;
371 break;
372 case OPLUS_:
373 stp = stop;
374 for (;;) {
375 /* how long could this one be? */
376 rest = slow(m, sp, stp, ss, es);
377 assert(rest != NULL); /* it did match */
378 /* could the rest match the rest? */
379 tail = slow(m, rest, stop, es, stopst);
380 if (tail == stop)
381 break; /* yes! */
382 /* no -- try a shorter match for this one */
383 stp = rest - 1;
384 assert(stp >= sp); /* it did work */
385 }
386 ssub = ss + 1;
387 esub = es - 1;
388 ssp = sp;
389 oldssp = ssp;
390 for (;;) { /* find last match of innards */
391 sep = slow(m, ssp, rest, ssub, esub);
392 if (sep == NULL || sep == ssp)
393 break; /* failed or matched null */
394 oldssp = ssp; /* on to next try */
395 ssp = sep;
396 }
397 if (sep == NULL) {
398 /* last successful match */
399 sep = ssp;
400 ssp = oldssp;
401 }
402 assert(sep == rest); /* must exhaust substring */
403 assert(slow(m, ssp, sep, ssub, esub) == rest);
404 dp = dissect(m, ssp, sep, ssub, esub);
405 assert(dp == sep);
406 sp = rest;
407 break;
408 case OCH_:
409 stp = stop;
410 for (;;) {
411 /* how long could this one be? */
412 rest = slow(m, sp, stp, ss, es);
413 assert(rest != NULL); /* it did match */
414 /* could the rest match the rest? */
415 tail = slow(m, rest, stop, es, stopst);
416 if (tail == stop)
417 break; /* yes! */
418 /* no -- try a shorter match for this one */
419 stp = rest - 1;
420 assert(stp >= sp); /* it did work */
421 }
422 ssub = ss + 1;
423 esub = ss + m->g->stripdata[ss] - 1;
424 assert(m->g->strip[esub] == OOR1);
425 for (;;) { /* find first matching branch */
426 if (slow(m, sp, rest, ssub, esub) == rest)
427 break; /* it matched all of it */
428 /* that one missed, try next one */
429 assert(m->g->strip[esub] == OOR1);
430 esub++;
431 assert(m->g->strip[esub] == OOR2);
432 ssub = esub + 1;
433 esub += m->g->stripdata[esub];
434 if (m->g->strip[esub] == OOR2)
435 esub--;
436 else
437 assert(m->g->strip[esub] == O_CH);
438 }
439 dp = dissect(m, sp, rest, ssub, esub);
440 assert(dp == rest);
441 sp = rest;
442 break;
443 case O_PLUS:
444 case O_QUEST:
445 case OOR1:
446 case OOR2:
447 case O_CH:
448 assert(nope);
449 break;
450 case OLPAREN:
451 i = m->g->stripdata[ss];
452 assert(0 < i && i <= m->g->nsub);
453 m->pmatch[i].rm_so = sp - m->offp;
454 break;
455 case ORPAREN:
456 i = m->g->stripdata[ss];
457 assert(0 < i && i <= m->g->nsub);
458 m->pmatch[i].rm_eo = sp - m->offp;
459 break;
460 default: /* uh oh */
461 assert(nope);
462 break;
463 }
464 }
465
466 assert(sp == stop);
467 return(sp);
468 }
469
470 /*
471 - backref - figure out what matched what, figuring in back references
472 */
473 static const RCHAR_T * /* == stop (success) or NULL (failure) */
backref(struct match * m,const RCHAR_T * start,const RCHAR_T * stop,sopno startst,sopno stopst,sopno lev)474 backref(struct match *m, const RCHAR_T *start, const RCHAR_T *stop,
475 sopno startst, sopno stopst, sopno lev) /* PLUS nesting level */
476 {
477 int i;
478 sopno ss; /* start sop of current subRE */
479 const RCHAR_T *sp; /* start of string matched by it */
480 sopno ssub; /* start sop of subsubRE */
481 sopno esub; /* end sop of subsubRE */
482 const RCHAR_T *ssp; /* start of string matched by subsubRE */
483 const RCHAR_T *dp;
484 size_t len;
485 int hard;
486 sop s;
487 RCHAR_T d;
488 regoff_t offsave;
489 cset *cs;
490
491 AT("back", start, stop, startst, stopst);
492 sp = start;
493
494 /* get as far as we can with easy stuff */
495 hard = 0;
496 for (ss = startst; !hard && ss < stopst; ss++) {
497 s = m->g->strip[ss];
498 d = m->g->stripdata[ss];
499 switch (s) {
500 case OCHAR:
501 if (sp == stop || *sp++ != d)
502 return(NULL);
503 break;
504 case OANY:
505 if (sp == stop)
506 return(NULL);
507 sp++;
508 break;
509 case OANYOF:
510 cs = &m->g->sets[d];
511 if (sp == stop || !CHIN(cs, *sp++))
512 return(NULL);
513 break;
514 case OBOL:
515 if ( (sp == m->beginp && !(m->eflags®_NOTBOL)) ||
516 (sp < m->endp && *(sp-1) == '\n' &&
517 (m->g->cflags®_NEWLINE)) )
518 { /* yes */ }
519 else
520 return(NULL);
521 break;
522 case OEOL:
523 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) ||
524 (sp < m->endp && *sp == '\n' &&
525 (m->g->cflags®_NEWLINE)) )
526 { /* yes */ }
527 else
528 return(NULL);
529 break;
530 case OBOW:
531 if (( (sp == m->beginp && !(m->eflags®_NOTBOL)) ||
532 (sp < m->endp && *(sp-1) == '\n' &&
533 (m->g->cflags®_NEWLINE)) ||
534 (sp > m->beginp &&
535 !ISWORD(*(sp-1))) ) &&
536 (sp < m->endp && ISWORD(*sp)) )
537 { /* yes */ }
538 else
539 return(NULL);
540 break;
541 case OEOW:
542 if (( (sp == m->endp && !(m->eflags®_NOTEOL)) ||
543 (sp < m->endp && *sp == '\n' &&
544 (m->g->cflags®_NEWLINE)) ||
545 (sp < m->endp && !ISWORD(*sp)) ) &&
546 (sp > m->beginp && ISWORD(*(sp-1))) )
547 { /* yes */ }
548 else
549 return(NULL);
550 break;
551 case O_QUEST:
552 break;
553 case OOR1: /* matches null but needs to skip */
554 ss++;
555 s = m->g->strip[ss];
556 d = m->g->stripdata[ss];
557 do {
558 assert(s == OOR2);
559 ss += d;
560 s = m->g->strip[ss];
561 d = m->g->stripdata[ss];
562 } while (s != O_CH);
563 /* note that the ss++ gets us past the O_CH */
564 break;
565 default: /* have to make a choice */
566 hard = 1;
567 break;
568 }
569 }
570 if (!hard) { /* that was it! */
571 if (sp != stop)
572 return(NULL);
573 return(sp);
574 }
575 ss--; /* adjust for the for's final increment */
576
577 /* the hard stuff */
578 AT("hard", sp, stop, ss, stopst);
579 s = m->g->strip[ss];
580 d = m->g->stripdata[ss];
581 switch (s) {
582 case OBACK_: /* the vilest depths */
583 i = d;
584 assert(0 < i && i <= m->g->nsub);
585 if (m->pmatch[i].rm_eo == -1)
586 return(NULL);
587 assert(m->pmatch[i].rm_so != -1);
588 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so;
589 assert(stop - m->beginp >= len);
590 if (sp > stop - len)
591 return(NULL); /* not enough left to match */
592 ssp = m->offp + m->pmatch[i].rm_so;
593 if (memcmp(sp, ssp, len) != 0)
594 return(NULL);
595 while (m->g->strip[ss] != O_BACK || m->g->stripdata[ss] != i)
596 ss++;
597 return(backref(m, sp+len, stop, ss+1, stopst, lev));
598 break;
599 case OQUEST_: /* to null or not */
600 dp = backref(m, sp, stop, ss+1, stopst, lev);
601 if (dp != NULL)
602 return(dp); /* not */
603 return(backref(m, sp, stop, ss+d+1, stopst, lev));
604 break;
605 case OPLUS_:
606 assert(m->lastpos != NULL);
607 assert(lev+1 <= m->g->nplus);
608 m->lastpos[lev+1] = sp;
609 return(backref(m, sp, stop, ss+1, stopst, lev+1));
610 break;
611 case O_PLUS:
612 if (sp == m->lastpos[lev]) /* last pass matched null */
613 return(backref(m, sp, stop, ss+1, stopst, lev-1));
614 /* try another pass */
615 m->lastpos[lev] = sp;
616 dp = backref(m, sp, stop, ss-d+1, stopst, lev);
617 if (dp == NULL)
618 return(backref(m, sp, stop, ss+1, stopst, lev-1));
619 else
620 return(dp);
621 break;
622 case OCH_: /* find the right one, if any */
623 ssub = ss + 1;
624 esub = ss + d - 1;
625 assert(m->g->strip[esub] == OOR1);
626 for (;;) { /* find first matching branch */
627 dp = backref(m, sp, stop, ssub, esub, lev);
628 if (dp != NULL)
629 return(dp);
630 /* that one missed, try next one */
631 if (m->g->strip[esub] == O_CH)
632 return(NULL); /* there is none */
633 esub++;
634 assert(m->g->strip[esub] == OOR2);
635 ssub = esub + 1;
636 esub += m->g->stripdata[esub];
637 if (m->g->strip[esub] == OOR2)
638 esub--;
639 else
640 assert(m->g->strip[esub] == O_CH);
641 }
642 break;
643 case OLPAREN: /* must undo assignment if rest fails */
644 i = d;
645 assert(0 < i && i <= m->g->nsub);
646 offsave = m->pmatch[i].rm_so;
647 m->pmatch[i].rm_so = sp - m->offp;
648 dp = backref(m, sp, stop, ss+1, stopst, lev);
649 if (dp != NULL)
650 return(dp);
651 m->pmatch[i].rm_so = offsave;
652 return(NULL);
653 break;
654 case ORPAREN: /* must undo assignment if rest fails */
655 i = d;
656 assert(0 < i && i <= m->g->nsub);
657 offsave = m->pmatch[i].rm_eo;
658 m->pmatch[i].rm_eo = sp - m->offp;
659 dp = backref(m, sp, stop, ss+1, stopst, lev);
660 if (dp != NULL)
661 return(dp);
662 m->pmatch[i].rm_eo = offsave;
663 return(NULL);
664 break;
665 default: /* uh oh */
666 assert(nope);
667 break;
668 }
669
670 /* "can't happen" */
671 assert(nope);
672 /* NOTREACHED */
673 return NULL;
674 }
675
676 /*
677 - fast - step through the string at top speed
678 */
679 static const RCHAR_T * /* where tentative match ended, or NULL */
fast(struct match * m,const RCHAR_T * start,const RCHAR_T * stop,sopno startst,sopno stopst)680 fast(struct match *m, const RCHAR_T *start, const RCHAR_T *stop, sopno startst,
681 sopno stopst)
682 {
683 states st = m->st;
684 states fresh = m->fresh;
685 states tmp = m->tmp;
686 const RCHAR_T *p = start;
687 RCHAR_T c = (start == m->beginp) ? OUT : *(start-1);
688 RCHAR_T lastc; /* previous c */
689 int flag;
690 int i;
691 const RCHAR_T *coldp; /* last p after which no match was underway */
692
693 CLEAR(st);
694 SET1(st, startst);
695 st = step(m->g, startst, stopst, st, NOTHING, OUT, st);
696 ASSIGN(fresh, st);
697 SP("start", st, *p);
698 coldp = NULL;
699 for (;;) {
700 /* next character */
701 lastc = c;
702 c = (p == m->endp) ? OUT : *p;
703 if (EQ(st, fresh))
704 coldp = p;
705
706 /* is there an EOL and/or BOL between lastc and c? */
707 flag = 0;
708 i = 0;
709 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) ||
710 (lastc == OUT && !(m->eflags®_NOTBOL)) ) {
711 flag = BOL;
712 i = m->g->nbol;
713 }
714 if ( (c == '\n' && m->g->cflags®_NEWLINE) ||
715 (c == OUT && !(m->eflags®_NOTEOL)) ) {
716 flag = (flag == BOL) ? BOLEOL : EOL;
717 i += m->g->neol;
718 }
719 if (i != 0) {
720 for (; i > 0; i--)
721 st = step(m->g, startst, stopst, st, flag, OUT, st);
722 SP("boleol", st, c);
723 }
724
725 /* how about a word boundary? */
726 if ( (flag == BOL || (lastc != OUT && !ISWORD(lastc))) &&
727 (c != OUT && ISWORD(c)) ) {
728 flag = BOW;
729 }
730 if ( (lastc != OUT && ISWORD(lastc)) &&
731 (flag == EOL || (c != OUT && !ISWORD(c))) ) {
732 flag = EOW;
733 }
734 if (flag == BOW || flag == EOW) {
735 st = step(m->g, startst, stopst, st, flag, OUT, st);
736 SP("boweow", st, c);
737 }
738
739 /* are we done? */
740 if (ISSET(st, stopst) || p == stop)
741 break; /* NOTE BREAK OUT */
742
743 /* no, we must deal with this character */
744 ASSIGN(tmp, st);
745 ASSIGN(st, fresh);
746 assert(c != OUT);
747 st = step(m->g, startst, stopst, tmp, 0, c, st);
748 SP("aft", st, c);
749 assert(EQ(step(m->g, startst, stopst, st, NOTHING, OUT, st), st));
750 p++;
751 }
752
753 assert(coldp != NULL);
754 m->coldp = coldp;
755 if (ISSET(st, stopst))
756 return(p+1);
757 else
758 return(NULL);
759 }
760
761 /*
762 - slow - step through the string more deliberately
763 */
764 static const RCHAR_T * /* where it ended */
slow(struct match * m,const RCHAR_T * start,const RCHAR_T * stop,sopno startst,sopno stopst)765 slow(struct match *m, const RCHAR_T *start, const RCHAR_T *stop, sopno startst,
766 sopno stopst)
767 {
768 states st = m->st;
769 states empty = m->empty;
770 states tmp = m->tmp;
771 const RCHAR_T *p = start;
772 RCHAR_T c = (start == m->beginp) ? OUT : *(start-1);
773 RCHAR_T lastc; /* previous c */
774 int flag;
775 int i;
776 const RCHAR_T *matchp; /* last p at which a match ended */
777
778 AT("slow", start, stop, startst, stopst);
779 CLEAR(st);
780 SET1(st, startst);
781 SP("sstart", st, *p);
782 st = step(m->g, startst, stopst, st, NOTHING, OUT, st);
783 matchp = NULL;
784 for (;;) {
785 /* next character */
786 lastc = c;
787 c = (p == m->endp) ? OUT : *p;
788
789 /* is there an EOL and/or BOL between lastc and c? */
790 flag = 0;
791 i = 0;
792 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) ||
793 (lastc == OUT && !(m->eflags®_NOTBOL)) ) {
794 flag = BOL;
795 i = m->g->nbol;
796 }
797 if ( (c == '\n' && m->g->cflags®_NEWLINE) ||
798 (c == OUT && !(m->eflags®_NOTEOL)) ) {
799 flag = (flag == BOL) ? BOLEOL : EOL;
800 i += m->g->neol;
801 }
802 if (i != 0) {
803 for (; i > 0; i--)
804 st = step(m->g, startst, stopst, st, flag, OUT, st);
805 SP("sboleol", st, c);
806 }
807
808 /* how about a word boundary? */
809 if ( (flag == BOL || (lastc != OUT && !ISWORD(lastc))) &&
810 (c != OUT && ISWORD(c)) ) {
811 flag = BOW;
812 }
813 if ( (lastc != OUT && ISWORD(lastc)) &&
814 (flag == EOL || (c != OUT && !ISWORD(c))) ) {
815 flag = EOW;
816 }
817 if (flag == BOW || flag == EOW) {
818 st = step(m->g, startst, stopst, st, flag, OUT, st);
819 SP("sboweow", st, c);
820 }
821
822 /* are we done? */
823 if (ISSET(st, stopst))
824 matchp = p;
825 if (EQ(st, empty) || p == stop)
826 break; /* NOTE BREAK OUT */
827
828 /* no, we must deal with this character */
829 ASSIGN(tmp, st);
830 ASSIGN(st, empty);
831 assert(c != OUT);
832 st = step(m->g, startst, stopst, tmp, 0, c, st);
833 SP("saft", st, c);
834 assert(EQ(step(m->g, startst, stopst, st, NOTHING, OUT, st), st));
835 p++;
836 }
837
838 return(matchp);
839 }
840
841
842 /*
843 - step - map set of states reachable before char to set reachable after
844 */
845 static states
step(struct re_guts * g,sopno start,sopno stop,states bef,int flag,RCHAR_T ch,states aft)846 step(struct re_guts *g,
847 sopno start, /* start state within strip */
848 sopno stop, /* state after stop state within strip */
849 states bef, /* states reachable before */
850 int flag, /* NONCHAR flag */
851 RCHAR_T ch, /* character code */
852 states aft) /* states already known reachable after */
853 {
854 cset *cs;
855 sop s;
856 RCHAR_T d;
857 sopno pc;
858 onestate here; /* note, macros know this name */
859 sopno look;
860 int i;
861
862 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
863 s = g->strip[pc];
864 d = g->stripdata[pc];
865 switch (s) {
866 case OEND:
867 assert(pc == stop-1);
868 break;
869 case OCHAR:
870 /* only characters can match */
871 assert(!flag || ch != d);
872 if (ch == d)
873 FWD(aft, bef, 1);
874 break;
875 case OBOL:
876 if (flag == BOL || flag == BOLEOL)
877 FWD(aft, bef, 1);
878 break;
879 case OEOL:
880 if (flag == EOL || flag == BOLEOL)
881 FWD(aft, bef, 1);
882 break;
883 case OBOW:
884 if (flag == BOW)
885 FWD(aft, bef, 1);
886 break;
887 case OEOW:
888 if (flag == EOW)
889 FWD(aft, bef, 1);
890 break;
891 case OANY:
892 if (!flag)
893 FWD(aft, bef, 1);
894 break;
895 case OANYOF:
896 cs = &g->sets[d];
897 if (!flag && CHIN(cs, ch))
898 FWD(aft, bef, 1);
899 break;
900 case OBACK_: /* ignored here */
901 case O_BACK:
902 FWD(aft, aft, 1);
903 break;
904 case OPLUS_: /* forward, this is just an empty */
905 FWD(aft, aft, 1);
906 break;
907 case O_PLUS: /* both forward and back */
908 FWD(aft, aft, 1);
909 i = ISSETBACK(aft, d);
910 BACK(aft, aft, d);
911 if (!i && ISSETBACK(aft, d)) {
912 /* oho, must reconsider loop body */
913 pc -= d + 1;
914 INIT(here, pc);
915 }
916 break;
917 case OQUEST_: /* two branches, both forward */
918 FWD(aft, aft, 1);
919 FWD(aft, aft, d);
920 break;
921 case O_QUEST: /* just an empty */
922 FWD(aft, aft, 1);
923 break;
924 case OLPAREN: /* not significant here */
925 case ORPAREN:
926 FWD(aft, aft, 1);
927 break;
928 case OCH_: /* mark the first two branches */
929 FWD(aft, aft, 1);
930 assert(OP(g->strip[pc+d]) == OOR2);
931 FWD(aft, aft, d);
932 break;
933 case OOR1: /* done a branch, find the O_CH */
934 if (ISSTATEIN(aft, here)) {
935 for (look = 1; /**/; look += d) {
936 s = g->strip[pc+look];
937 d = g->stripdata[pc+look];
938 if (s == O_CH)
939 break;
940 assert(s == OOR2);
941 }
942 FWD(aft, aft, look);
943 }
944 break;
945 case OOR2: /* propagate OCH_'s marking */
946 FWD(aft, aft, 1);
947 if (g->strip[pc+d] != O_CH) {
948 assert(g->strip[pc+d] == OOR2);
949 FWD(aft, aft, d);
950 }
951 break;
952 case O_CH: /* just empty */
953 FWD(aft, aft, 1);
954 break;
955 default: /* ooooops... */
956 assert(nope);
957 break;
958 }
959 }
960
961 return(aft);
962 }
963
964 #ifdef REDEBUG
965 /*
966 - print - print a set of states
967 */
968 static void
print(struct match * m,char * caption,states st,int ch,FILE * d)969 print(struct match *m, char *caption, states st, int ch, FILE *d)
970 {
971 struct re_guts *g = m->g;
972 int i;
973 int first = 1;
974
975 if (!(m->eflags®_TRACE))
976 return;
977
978 fprintf(d, "%s", caption);
979 if (ch != '\0')
980 fprintf(d, " %s", pchar(ch));
981 for (i = 0; i < g->nstates; i++)
982 if (ISSET(st, i)) {
983 fprintf(d, "%s%d", (first) ? "\t" : ", ", i);
984 first = 0;
985 }
986 fprintf(d, "\n");
987 }
988
989 /*
990 - at - print current situation
991 */
992 static void
at(struct match * m,char * title,char * start,char * stop,sopno startst,sopno stopst)993 at(struct match *m, char *title, char *start, char *stop, sopno startst,
994 sopno stopst)
995 {
996 if (!(m->eflags®_TRACE))
997 return;
998
999 printf("%s %s-", title, pchar(*start));
1000 printf("%s ", pchar(*stop));
1001 printf("%ld-%ld\n", (long)startst, (long)stopst);
1002 }
1003
1004 #ifndef PCHARDONE
1005 #define PCHARDONE /* never again */
1006 /*
1007 - pchar - make a character printable
1008 *
1009 * Is this identical to regchar() over in debug.c? Well, yes. But a
1010 * duplicate here avoids having a debugging-capable regexec.o tied to
1011 * a matching debug.o, and this is convenient. It all disappears in
1012 * the non-debug compilation anyway, so it doesn't matter much.
1013 */
1014 static char * /* -> representation */
pchar(int ch)1015 pchar(int ch)
1016 {
1017 static char pbuf[10];
1018
1019 if (isprint(ch) || ch == ' ')
1020 snprintf(pbuf, sizeof(pbuf), "%c", ch);
1021 else
1022 snprintf(pbuf, sizeof(pbuf), "\\%o", ch);
1023 return(pbuf);
1024 }
1025 #endif
1026 #endif
1027
1028 #undef matcher
1029 #undef fast
1030 #undef slow
1031 #undef dissect
1032 #undef backref
1033 #undef step
1034 #undef print
1035 #undef at
1036 #undef match
1037