xref: /csrg-svn/lib/libc/regex/engine.c (revision 56355)
1 /*-
2  * Copyright (c) 1992 Henry Spencer.
3  * Copyright (c) 1992 The Regents of the University of California.
4  * All rights reserved.
5  *
6  * This code is derived from software contributed to Berkeley by
7  * Henry Spencer of the University of Toronto.
8  *
9  * %sccs.include.redist.c%
10  *
11  *	@(#)engine.c	5.3 (Berkeley) 09/30/92
12  */
13 
14 /*
15  * The matching engine and friends.  This file is #included by regexec.c
16  * after suitable #defines of a variety of macros used herein, so that
17  * different state representations can be used without duplicating masses
18  * of code.
19  */
20 
21 #ifdef SNAMES
22 #define	matcher	smatcher
23 #define	fast	sfast
24 #define	slow	sslow
25 #define	dissect	sdissect
26 #define	backref	sbackref
27 #define	expand	sexpand
28 #define	step	sstep
29 #define	print	sprint
30 #define	at	sat
31 #define	match	smat
32 #endif
33 #ifdef LNAMES
34 #define	matcher	lmatcher
35 #define	fast	lfast
36 #define	slow	lslow
37 #define	dissect	ldissect
38 #define	backref	lbackref
39 #define	expand	lexpand
40 #define	step	lstep
41 #define	print	lprint
42 #define	at	lat
43 #define	match	lmat
44 #endif
45 
46 /* another structure passed up and down to avoid zillions of parameters */
47 struct match {
48 	struct re_guts *g;
49 	int eflags;
50 	regmatch_t *pmatch;	/* [nsub+1] (0 element unused) */
51 	uchar *offp;		/* offsets work from here */
52 	uchar *beginp;		/* start of string -- virtual NUL precedes */
53 	uchar *endp;		/* end of string -- virtual NUL here */
54 	uchar *coldp;		/* can be no match starting before here */
55 	uchar **lastpos;	/* [nplus+1] */
56 	STATEVARS;
57 	states st;		/* current states */
58 	states fresh;		/* states for a fresh start */
59 	states tmp;		/* temporary */
60 	states empty;		/* empty set of states */
61 };
62 
63 #include "engine.ih"
64 
65 #ifdef REDEBUG
66 #define	SP(t, s, c)	print(m, t, s, c, stdout)
67 #define	AT(t, p1, p2, s1, s2)	at(m, t, p1, p2, s1, s2)
68 #define	NOTE(str)	{ if (m->eflags&REG_TRACE) printf("=%s\n", (str)); }
69 #else
70 #define	SP(t, s, c)	/* nothing */
71 #define	AT(t, p1, p2, s1, s2)	/* nothing */
72 #define	NOTE(s)	/* nothing */
73 #endif
74 
75 /*
76  - matcher - the actual matching engine
77  == static int matcher(register struct re_guts *g, uchar *string, \
78  ==	size_t nmatch, regmatch_t pmatch[], int eflags);
79  */
80 static int			/* 0 success, REG_NOMATCH failure */
81 matcher(g, string, nmatch, pmatch, eflags)
82 register struct re_guts *g;
83 uchar *string;
84 size_t nmatch;
85 regmatch_t pmatch[];
86 int eflags;
87 {
88 	register uchar *endp;
89 	register int i;
90 	struct match mv;
91 	register struct match *m = &mv;
92 	register uchar *dp;
93 	const register sopno gf = g->firststate+1;	/* +1 for OEND */
94 	const register sopno gl = g->laststate;
95 	uchar *start;
96 	uchar *stop;
97 
98 	/* simplify the situation where possible */
99 	if (g->cflags&REG_NOSUB)
100 		nmatch = 0;
101 	if (eflags&REG_STARTEND) {
102 		start = string + pmatch[0].rm_so;
103 		stop = string + pmatch[0].rm_eo;
104 	} else {
105 		start = string;
106 		stop = start + strlen((char *)start);
107 	}
108 
109 	/* prescreening; this does wonders for this rather slow code */
110 	if (g->must != NULL) {
111 		for (dp = start; dp < stop; dp++)
112 			if (*dp == (uchar)g->must[0] && stop - dp >= g->mlen &&
113 			strncmp((char *)dp, g->must, (size_t)g->mlen) == 0)
114 				break;
115 		if (dp == stop)		/* we didn't find g->must */
116 			return(REG_NOMATCH);
117 	}
118 
119 	/* match struct setup */
120 	m->g = g;
121 	m->eflags = eflags;
122 	m->pmatch = NULL;
123 	m->lastpos = NULL;
124 	m->offp = string;
125 	m->beginp = start;
126 	m->endp = stop;
127 	STATESETUP(m, 4);
128 	SETUP(m->st);
129 	SETUP(m->fresh);
130 	SETUP(m->tmp);
131 	SETUP(m->empty);
132 	CLEAR(m->empty);
133 
134 	/* this loop does only one repetition except for backrefs */
135 	for (;;) {
136 		endp = fast(m, start, stop, gf, gl);
137 		if (endp == NULL) {		/* a miss */
138 			STATETEARDOWN(m);
139 			return(REG_NOMATCH);
140 		}
141 		if (nmatch == 0 && !g->backrefs)
142 			break;		/* no further info needed */
143 
144 		/* where? */
145 		assert(m->coldp != NULL);
146 		for (;;) {
147 			NOTE("finding start");
148 			endp = slow(m, m->coldp, stop, gf, gl);
149 			if (endp != NULL)
150 				break;
151 			assert(m->coldp < m->endp);
152 			m->coldp++;
153 		}
154 		if (nmatch == 1 && !g->backrefs)
155 			break;		/* no further info needed */
156 
157 		/* oh my, he wants the subexpressions... */
158 		if (m->pmatch == NULL)
159 			m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) *
160 							sizeof(regmatch_t));
161 		if (m->pmatch == NULL) {
162 			STATETEARDOWN(m);
163 			return(REG_ESPACE);
164 		}
165 		for (i = 1; i <= m->g->nsub; i++)
166 			m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1;
167 		if (!g->backrefs && !(m->eflags&REG_BACKR)) {
168 			NOTE("dissecting");
169 			dp = dissect(m, m->coldp, endp, gf, gl);
170 		} else {
171 			if (g->nplus > 0 && m->lastpos == NULL)
172 				m->lastpos = (uchar **)malloc((g->nplus+1) *
173 							sizeof(uchar *));
174 			if (g->nplus > 0 && m->lastpos == NULL) {
175 				free(m->pmatch);
176 				STATETEARDOWN(m);
177 				return(REG_ESPACE);
178 			}
179 			NOTE("backref dissect");
180 			dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
181 		}
182 		if (dp != NULL)
183 			break;
184 
185 		/* uh-oh... we couldn't find a subexpression-level match */
186 		assert(g->backrefs);	/* must be back references doing it */
187 		assert(g->nplus == 0 || m->lastpos != NULL);
188 		for (;;) {
189 			if (dp != NULL || endp <= m->coldp)
190 				break;		/* defeat */
191 			NOTE("backoff");
192 			endp = slow(m, m->coldp, endp-1, gf, gl);
193 			if (endp == NULL)
194 				break;		/* defeat */
195 			/* try it on a shorter possibility */
196 #ifndef NDEBUG
197 			for (i = 1; i <= m->g->nsub; i++) {
198 				assert(m->pmatch[i].rm_so == -1);
199 				assert(m->pmatch[i].rm_eo == -1);
200 			}
201 #endif
202 			NOTE("backoff dissect");
203 			dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
204 		}
205 		assert(dp == NULL || dp == endp);
206 		if (dp != NULL)		/* found a shorter one */
207 			break;
208 
209 		/* despite initial appearances, there is no match here */
210 		NOTE("false alarm");
211 		start = m->coldp + 1;	/* recycle starting later */
212 		assert(start <= stop);
213 	}
214 
215 	/* fill in the details if requested */
216 	if (nmatch > 0) {
217 		pmatch[0].rm_so = m->coldp - m->offp;
218 		pmatch[0].rm_eo = endp - m->offp;
219 	}
220 	if (nmatch > 1) {
221 		assert(m->pmatch != NULL);
222 		for (i = 1; i < nmatch; i++)
223 			if (i <= m->g->nsub)
224 				pmatch[i] = m->pmatch[i];
225 			else {
226 				pmatch[i].rm_so = -1;
227 				pmatch[i].rm_eo = -1;
228 			}
229 	}
230 
231 	if (m->pmatch != NULL)
232 		free((char *)m->pmatch);
233 	if (m->lastpos != NULL)
234 		free((char *)m->lastpos);
235 	STATETEARDOWN(m);
236 	return(0);
237 }
238 
239 /*
240  - dissect - figure out what matched what, no back references
241  == static uchar *dissect(register struct match *m, uchar *start, \
242  ==	uchar *stop, sopno startst, sopno stopst);
243  */
244 static uchar *			/* == stop (success) always */
245 dissect(m, start, stop, startst, stopst)
246 register struct match *m;
247 uchar *start;
248 uchar *stop;
249 sopno startst;
250 sopno stopst;
251 {
252 	register int i;
253 	register sopno ss;	/* start sop of current subRE */
254 	register sopno es;	/* end sop of current subRE */
255 	register uchar *sp;	/* start of string matched by it */
256 	register uchar *stp;	/* string matched by it cannot pass here */
257 	register uchar *rest;	/* start of rest of string */
258 	register uchar *tail;	/* string unmatched by rest of RE */
259 	register sopno ssub;	/* start sop of subsubRE */
260 	register sopno esub;	/* end sop of subsubRE */
261 	register uchar *ssp;	/* start of string matched by subsubRE */
262 	register uchar *sep;	/* end of string matched by subsubRE */
263 	register uchar *oldssp;	/* previous ssp */
264 	register uchar *dp;
265 
266 	AT("diss", start, stop, startst, stopst);
267 	sp = start;
268 	for (ss = startst; ss < stopst; ss = es) {
269 		/* identify end of subRE */
270 		es = ss;
271 		switch (OP(m->g->strip[es])) {
272 		case OPLUS_:
273 		case OQUEST_:
274 			es += OPND(m->g->strip[es]);
275 			break;
276 		case OCH_:
277 			while (OP(m->g->strip[es]) != O_CH)
278 				es += OPND(m->g->strip[es]);
279 			break;
280 		}
281 		es++;
282 
283 		/* figure out what it matched */
284 		switch (OP(m->g->strip[ss])) {
285 		case OEND:
286 			assert(0);
287 			break;
288 		case OCHAR:
289 			sp++;
290 			break;
291 		case OBOL:
292 		case OEOL:
293 			break;
294 		case OANY:
295 		case OANYOF:
296 			sp++;
297 			break;
298 		case OBACK_:
299 		case O_BACK:
300 			assert(0);
301 			break;
302 		/* cases where length of match is hard to find */
303 		case OQUEST_:
304 			stp = stop;
305 			for (;;) {
306 				/* how long could this one be? */
307 				rest = slow(m, sp, stp, ss, es);
308 				assert(rest != NULL);	/* it did match */
309 				/* could the rest match the rest? */
310 				tail = slow(m, rest, stop, es, stopst);
311 				if (tail == stop)
312 					break;		/* yes! */
313 				/* no -- try a shorter match for this one */
314 				stp = rest - 1;
315 				assert(stp >= sp);	/* it did work */
316 			}
317 			ssub = ss + 1;
318 			esub = es - 1;
319 			/* did innards match? */
320 			if (slow(m, sp, rest, ssub, esub) != NULL) {
321 				dp = dissect(m, sp, rest, ssub, esub);
322 				assert(dp == rest);
323 			} else		/* no */
324 				assert(sp == rest);
325 			sp = rest;
326 			break;
327 		case OPLUS_:
328 			stp = stop;
329 			for (;;) {
330 				/* how long could this one be? */
331 				rest = slow(m, sp, stp, ss, es);
332 				assert(rest != NULL);	/* it did match */
333 				/* could the rest match the rest? */
334 				tail = slow(m, rest, stop, es, stopst);
335 				if (tail == stop)
336 					break;		/* yes! */
337 				/* no -- try a shorter match for this one */
338 				stp = rest - 1;
339 				assert(stp >= sp);	/* it did work */
340 			}
341 			ssub = ss + 1;
342 			esub = es - 1;
343 			ssp = sp;
344 			oldssp = ssp;
345 			for (;;) {	/* find last match of innards */
346 				sep = slow(m, ssp, rest, ssub, esub);
347 				if (sep == NULL || sep == ssp)
348 					break;	/* failed or matched null */
349 				oldssp = ssp;	/* on to next try */
350 				ssp = sep;
351 			}
352 			if (sep == NULL) {
353 				/* last successful match */
354 				sep = ssp;
355 				ssp = oldssp;
356 			}
357 			assert(sep == rest);	/* must exhaust substring */
358 			assert(slow(m, ssp, sep, ssub, esub) == rest);
359 			dp = dissect(m, ssp, sep, ssub, esub);
360 			assert(dp == sep);
361 			sp = rest;
362 			break;
363 		case OCH_:
364 			stp = stop;
365 			for (;;) {
366 				/* how long could this one be? */
367 				rest = slow(m, sp, stp, ss, es);
368 				assert(rest != NULL);	/* it did match */
369 				/* could the rest match the rest? */
370 				tail = slow(m, rest, stop, es, stopst);
371 				if (tail == stop)
372 					break;		/* yes! */
373 				/* no -- try a shorter match for this one */
374 				stp = rest - 1;
375 				assert(stp >= sp);	/* it did work */
376 			}
377 			ssub = ss + 1;
378 			esub = ss + OPND(m->g->strip[ss]) - 1;
379 			assert(OP(m->g->strip[esub]) == OOR1);
380 			for (;;) {	/* find first matching branch */
381 				if (slow(m, sp, rest, ssub, esub) == rest)
382 					break;	/* it matched all of it */
383 				/* that one missed, try next one */
384 				assert(OP(m->g->strip[esub]) == OOR1);
385 				esub++;
386 				assert(OP(m->g->strip[esub]) == OOR2);
387 				ssub = esub + 1;
388 				esub += OPND(m->g->strip[esub]);
389 				if (OP(m->g->strip[esub]) == OOR2)
390 					esub--;
391 				else
392 					assert(OP(m->g->strip[esub]) == O_CH);
393 			}
394 			dp = dissect(m, sp, rest, ssub, esub);
395 			assert(dp == rest);
396 			sp = rest;
397 			break;
398 		case O_PLUS:
399 		case O_QUEST:
400 		case OOR1:
401 		case OOR2:
402 		case O_CH:
403 			assert(0);
404 			break;
405 		case OLPAREN:
406 			i = OPND(m->g->strip[ss]);
407 			m->pmatch[i].rm_so = sp - m->offp;
408 			break;
409 		case ORPAREN:
410 			i = OPND(m->g->strip[ss]);
411 			m->pmatch[i].rm_eo = sp - m->offp;
412 			break;
413 		default:		/* uh oh */
414 			assert(0);
415 			break;
416 		}
417 	}
418 
419 	assert(sp == stop);
420 	return(sp);
421 }
422 
423 /*
424  - backref - figure out what matched what, figuring in back references
425  == static uchar *backref(register struct match *m, uchar *start, \
426  ==	uchar *stop, sopno startst, sopno stopst, sopno lev);
427  */
428 static uchar *			/* == stop (success) or NULL (failure) */
429 backref(m, start, stop, startst, stopst, lev)
430 register struct match *m;
431 uchar *start;
432 uchar *stop;
433 sopno startst;
434 sopno stopst;
435 sopno lev;			/* PLUS nesting level */
436 {
437 	register int i;
438 	register sopno ss;	/* start sop of current subRE */
439 	register uchar *sp;	/* start of string matched by it */
440 	register sopno ssub;	/* start sop of subsubRE */
441 	register sopno esub;	/* end sop of subsubRE */
442 	register uchar *ssp;	/* start of string matched by subsubRE */
443 	register uchar *dp;
444 	register size_t len;
445 	register int hard;
446 	register sop s;
447 	register regoff_t offsave;
448 	register cset *cs;
449 
450 	AT("back", start, stop, startst, stopst);
451 	sp = start;
452 
453 	/* get as far as we can with easy stuff */
454 	hard = 0;
455 	for (ss = startst; !hard && ss < stopst; ss++)
456 		switch (OP(s = m->g->strip[ss])) {
457 		case OCHAR:
458 			if (sp == stop || *sp++ != OPND(s))
459 				return(NULL);
460 			break;
461 		case OANY:
462 			if (sp == stop)
463 				return(NULL);
464 			sp++;
465 			break;
466 		case OANYOF:
467 			cs = &m->g->sets[OPND(s)];
468 			if (sp == stop || !CHIN(cs, *sp++))
469 				return(NULL);
470 			break;
471 		case OBOL:
472 			if ( (sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
473 					(sp < m->endp && *(sp-1) == '\n' &&
474 						(m->g->cflags&REG_NEWLINE)) )
475 				{ /* yes */ }
476 			else
477 				return(NULL);
478 			break;
479 		case OEOL:
480 			if ( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
481 					(sp < m->endp && *sp == '\n' &&
482 						(m->g->cflags&REG_NEWLINE)) )
483 				{ /* yes */ }
484 			else
485 				return(NULL);
486 			break;
487 		case O_QUEST:
488 			break;
489 		case OOR1:	/* matches null but needs to skip */
490 			ss++;
491 			s = m->g->strip[ss];
492 			do {
493 				assert(OP(s) == OOR2);
494 				ss += OPND(s);
495 			} while (OP(s = m->g->strip[ss]) != O_CH);
496 			/* note that the ss++ gets us past the O_CH */
497 			break;
498 		default:	/* have to make a choice */
499 			hard = 1;
500 			break;
501 		}
502 	if (!hard) {		/* that was it! */
503 		if (sp != stop)
504 			return(NULL);
505 		return(sp);
506 	}
507 	ss--;			/* adjust for the for's final increment */
508 
509 	/* the hard stuff */
510 	AT("hard", sp, stop, ss, stopst);
511 	s = m->g->strip[ss];
512 	switch (OP(s)) {
513 	case OBACK_:		/* the vilest depths */
514 		i = OPND(s);
515 		if (m->pmatch[i].rm_eo == -1)
516 			return(NULL);
517 		assert(m->pmatch[i].rm_so != -1);
518 		len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so;
519 		assert(stop - m->beginp >= len);
520 		if (sp > stop - len)
521 			return(NULL);	/* not enough left to match */
522 		ssp = m->offp + m->pmatch[i].rm_so;
523 		if (strncmp((char *)sp, (char *)ssp, len) != 0)
524 			return(NULL);
525 		while (m->g->strip[ss] != SOP(O_BACK, i))
526 			ss++;
527 		return(backref(m, sp+len, stop, ss+1, stopst, lev));
528 		break;
529 	case OQUEST_:		/* to null or not */
530 		dp = backref(m, sp, stop, ss+1, stopst, lev);
531 		if (dp != NULL)
532 			return(dp);	/* not */
533 		return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev));
534 		break;
535 	case OPLUS_:
536 		assert(m->lastpos != NULL);
537 		assert(lev+1 <= m->g->nplus);
538 		m->lastpos[lev+1] = sp;
539 		return(backref(m, sp, stop, ss+1, stopst, lev+1));
540 		break;
541 	case O_PLUS:
542 		if (sp == m->lastpos[lev])	/* last pass matched null */
543 			return(backref(m, sp, stop, ss+1, stopst, lev-1));
544 		/* try another pass */
545 		m->lastpos[lev] = sp;
546 		dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev);
547 		if (dp == NULL)
548 			return(backref(m, sp, stop, ss+1, stopst, lev-1));
549 		else
550 			return(dp);
551 		break;
552 	case OCH_:		/* find the right one, if any */
553 		ssub = ss + 1;
554 		esub = ss + OPND(s) - 1;
555 		assert(OP(m->g->strip[esub]) == OOR1);
556 		for (;;) {	/* find first matching branch */
557 			dp = backref(m, sp, stop, ssub, esub, lev);
558 			if (dp != NULL)
559 				return(dp);
560 			/* that one missed, try next one */
561 			if (OP(m->g->strip[esub]) == O_CH)
562 				return(NULL);	/* there is none */
563 			esub++;
564 			assert(OP(m->g->strip[esub]) == OOR2);
565 			ssub = esub + 1;
566 			esub += OPND(m->g->strip[esub]);
567 			if (OP(m->g->strip[esub]) == OOR2)
568 				esub--;
569 			else
570 				assert(OP(m->g->strip[esub]) == O_CH);
571 		}
572 		break;
573 	case OLPAREN:		/* must undo assignment if rest fails */
574 		i = OPND(s);
575 		offsave = m->pmatch[i].rm_so;
576 		m->pmatch[i].rm_so = sp - m->offp;
577 		dp = backref(m, sp, stop, ss+1, stopst, lev);
578 		if (dp != NULL)
579 			return(dp);
580 		m->pmatch[i].rm_so = offsave;
581 		return(NULL);
582 		break;
583 	case ORPAREN:		/* must undo assignment if rest fails */
584 		i = OPND(s);
585 		offsave = m->pmatch[i].rm_eo;
586 		m->pmatch[i].rm_eo = sp - m->offp;
587 		dp = backref(m, sp, stop, ss+1, stopst, lev);
588 		if (dp != NULL)
589 			return(dp);
590 		m->pmatch[i].rm_eo = offsave;
591 		return(NULL);
592 		break;
593 	default:		/* uh oh */
594 		assert(0);
595 		break;
596 	}
597 
598 	/* "can't happen" */
599 	assert(0);
600 	/* NOTREACHED */
601 }
602 
603 /*
604  - fast - step through the string at top speed
605  == static uchar *fast(register struct match *m, uchar *start, \
606  ==	uchar *stop, sopno startst, sopno stopst);
607  */
608 static uchar *			/* where tentative match ended, or NULL */
609 fast(m, start, stop, startst, stopst)
610 register struct match *m;
611 uchar *start;
612 uchar *stop;
613 sopno startst;
614 sopno stopst;
615 {
616 	register states st = m->st;
617 	register states fresh = m->fresh;
618 	register states tmp = m->tmp;
619 	register uchar *p = start;
620 	register uchar c = (start == m->beginp) ? '\0' : *(start-1);
621 	register uchar lastc;	/* previous c */
622 	register int atbol;
623 	register int ateol;
624 	register uchar *coldp;	/* last p after which no match was underway */
625 
626 	CLEAR(st);
627 	SET1(st, startst);
628 	st = expand(m->g, startst, stopst, st, 0, 0);
629 	ASSIGN(fresh, st);
630 	SP("start", st, *p);
631 	coldp = NULL;
632 	for (;;) {
633 		/* next character */
634 		lastc = c;
635 		c = (p == m->endp) ? '\0' : *p;
636 		if (EQ(st, fresh))
637 			coldp = p;
638 
639 		/* is there an EOL and/or BOL between lastc and c? */
640 		atbol = ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
641 				(lastc == '\0' && !(m->eflags&REG_NOTBOL)) );
642 		ateol = ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
643 				(c == '\0' && !(m->eflags&REG_NOTEOL)) );
644 		if (atbol || ateol) {
645 			st = expand(m->g, startst, stopst, st, atbol, ateol);
646 			SP("bef", st, c);
647 		}
648 
649 		/* are we done? */
650 		if (ISSET(st, stopst) || p == stop)
651 			break;		/* NOTE BREAK OUT */
652 
653 		/* no, we must deal with this character */
654 		ASSIGN(tmp, st);
655 		ASSIGN(st, fresh);
656 		st = step(m->g, startst, stopst, tmp, c, st);
657 		SP("aft", st, c);
658 		assert(EQ(expand(m->g, startst, stopst, st, 0, 0), st));
659 		p++;
660 	}
661 
662 	assert(coldp != NULL);
663 	m->coldp = coldp;
664 	if (ISSET(st, stopst))
665 		return(p+1);
666 	else
667 		return(NULL);
668 }
669 
670 /*
671  - slow - step through the string more deliberately
672  == static uchar *slow(register struct match *m, uchar *start, \
673  ==	uchar *stop, sopno startst, sopno stopst);
674  */
675 static uchar *			/* where it ended */
676 slow(m, start, stop, startst, stopst)
677 register struct match *m;
678 uchar *start;
679 uchar *stop;
680 sopno startst;
681 sopno stopst;
682 {
683 	register states st = m->st;
684 	register states empty = m->empty;
685 	register states tmp = m->tmp;
686 	register uchar *p = start;
687 	register uchar c = (start == m->beginp) ? '\0' : *(start-1);
688 	register uchar lastc;	/* previous c */
689 	register int atbol;
690 	register int ateol;
691 	register uchar *matchp;	/* last p at which a match ended */
692 
693 	AT("slow", start, stop, startst, stopst);
694 	CLEAR(st);
695 	SET1(st, startst);
696 	SP("sstart", st, *p);
697 	matchp = NULL;
698 	for (;;) {
699 		/* next character */
700 		lastc = c;
701 		c = (p == m->endp) ? '\0' : *p;
702 
703 		/* is there an EOL and/or BOL between lastc and c? */
704 		atbol = ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
705 				(lastc == '\0' && !(m->eflags&REG_NOTBOL)) );
706 		ateol = ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
707 				(c == '\0' && !(m->eflags&REG_NOTEOL)) );
708 
709 		/* do we need an expansion before looking at the char? */
710 		if (p == start || atbol || ateol) {
711 			st = expand(m->g, startst, stopst, st, atbol, ateol);
712 			SP("sbef", st, c);
713 		}
714 
715 		/* are we done? */
716 		if (ISSET(st, stopst))
717 			matchp = p;
718 		if (EQ(st, empty) || p == stop)
719 			break;		/* NOTE BREAK OUT */
720 
721 		/* no, we must deal with this character */
722 		ASSIGN(tmp, st);
723 		ASSIGN(st, empty);
724 		st = step(m->g, startst, stopst, tmp, c, st);
725 		SP("saft", st, c);
726 		assert(EQ(expand(m->g, startst, stopst, st, 0, 0), st));
727 		p++;
728 	}
729 
730 	return(matchp);
731 }
732 
733 
734 /*
735  - expand - return set of states reachable from an initial set
736  == static states expand(register struct re_guts *g, int start, \
737  ==	int stop, register states st, int atbol, int ateol);
738  */
739 static states
740 expand(g, start, stop, st, atbol, ateol)
741 register struct re_guts *g;
742 int start;			/* start state within strip */
743 int stop;			/* state after stop state within strip */
744 register states st;
745 int atbol;			/* at start or just after \n? (for BOL) */
746 int ateol;			/* just before \n or \0? (for EOL) */
747 {
748 	register sop s;
749 	register sopno pc;
750 	register onestate here;		/* note, macros know this name */
751 	register sopno look;
752 	register int i;
753 
754 	for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
755 		s = g->strip[pc];
756 		switch (OP(s)) {
757 		case OEND:
758 			assert(pc == stop-1);
759 			break;
760 		case OCHAR:		/* can't get past this */
761 			break;
762 		case OBOL:
763 			if (atbol)
764 				FWD(st, st, 1);
765 			break;
766 		case OEOL:
767 			if (ateol)
768 				FWD(st, st, 1);
769 			break;
770 		case OANY:		/* can't get past this either */
771 			break;
772 		case OANYOF:		/* or this */
773 			break;
774 		case OBACK_:		/* not significant here */
775 		case O_BACK:
776 			FWD(st, st, 1);
777 			break;
778 		case OPLUS_:		/* forward, this is just an empty */
779 			FWD(st, st, 1);
780 			break;
781 		case O_PLUS:		/* both forward and back */
782 			FWD(st, st, 1);
783 			i = ISSETBACK(st, OPND(s));
784 			BACK(st, st, OPND(s));
785 			if (!i && ISSETBACK(st, OPND(s))) {
786 				/* oho, must reconsider loop body */
787 				pc -= OPND(s) + 1;
788 				INIT(here, pc);
789 			}
790 			break;
791 		case OQUEST_:		/* two branches, both forward */
792 			FWD(st, st, 1);
793 			FWD(st, st, OPND(s));
794 			break;
795 		case O_QUEST:		/* just an empty */
796 			FWD(st, st, 1);
797 			break;
798 		case OLPAREN:		/* not significant here */
799 		case ORPAREN:
800 			FWD(st, st, 1);
801 			break;
802 		case OCH_:		/* mark the first two branches */
803 			FWD(st, st, 1);
804 			assert(OP(g->strip[pc+OPND(s)]) == OOR2);
805 			FWD(st, st, OPND(s));
806 			break;
807 		case OOR1:		/* done a branch, find the O_CH */
808 			if (ISSTATEIN(st, here)) {
809 				for (look = 1;
810 						OP(s = g->strip[pc+look]) != O_CH;
811 						look += OPND(s))
812 					assert(OP(s) == OOR2);
813 				FWD(st, st, look);
814 			}
815 			break;
816 		case OOR2:		/* propagate OCH_'s marking onward */
817 			FWD(st, st, 1);
818 			if (OP(g->strip[pc+OPND(s)]) != O_CH) {
819 				assert(OP(g->strip[pc+OPND(s)]) == OOR2);
820 				FWD(st, st, OPND(s));
821 			}
822 			break;
823 		case O_CH:		/* just an empty */
824 			FWD(st, st, 1);
825 			break;
826 		default:		/* ooooops... */
827 			assert(0);
828 			break;
829 		}
830 	}
831 
832 	return(st);
833 }
834 
835 /*
836  - step - map set of states reachable before char to set reachable after
837  == static states step(register struct re_guts *g, int start, int stop, \
838  ==	register states bef, uchar ch, register states aft);
839  */
840 static states
841 step(g, start, stop, bef, ch, aft)
842 register struct re_guts *g;
843 int start;			/* start state within strip */
844 int stop;			/* state after stop state within strip */
845 register states bef;		/* states reachable before */
846 uchar ch;
847 register states aft;		/* states already known reachable after */
848 {
849 	register cset *cs;
850 	register sop s;
851 	register sopno pc;
852 	register onestate here;		/* note, macros know this name */
853 	register sopno look;
854 	register int i;
855 
856 	for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
857 		s = g->strip[pc];
858 		switch (OP(s)) {
859 		case OEND:
860 			assert(pc == stop-1);
861 			break;
862 		case OCHAR:
863 			if (ch == OPND(s))
864 				FWD(aft, bef, 1);
865 			break;
866 		case OBOL:	/* these are looked after by expand() */
867 		case OEOL:
868 			break;
869 		case OANY:
870 			FWD(aft, bef, 1);
871 			break;
872 		case OANYOF:
873 			cs = &g->sets[OPND(s)];
874 			if (CHIN(cs, ch))
875 				FWD(aft, bef, 1);
876 			break;
877 		case OBACK_:		/* ignored here */
878 		case O_BACK:
879 			FWD(aft, aft, 1);
880 			break;
881 		case OPLUS_:		/* forward, this is just an empty */
882 			FWD(aft, aft, 1);
883 			break;
884 		case O_PLUS:		/* both forward and back */
885 			FWD(aft, aft, 1);
886 			i = ISSETBACK(aft, OPND(s));
887 			BACK(aft, aft, OPND(s));
888 			if (!i && ISSETBACK(aft, OPND(s))) {
889 				/* oho, must reconsider loop body */
890 				pc -= OPND(s) + 1;
891 				INIT(here, pc);
892 			}
893 			break;
894 		case OQUEST_:		/* two branches, both forward */
895 			FWD(aft, aft, 1);
896 			FWD(aft, aft, OPND(s));
897 			break;
898 		case O_QUEST:		/* just an empty */
899 			FWD(aft, aft, 1);
900 			break;
901 		case OLPAREN:		/* not significant here */
902 		case ORPAREN:
903 			FWD(aft, aft, 1);
904 			break;
905 		case OCH_:		/* mark the first two branches */
906 			FWD(aft, aft, 1);
907 			assert(OP(g->strip[pc+OPND(s)]) == OOR2);
908 			FWD(aft, aft, OPND(s));
909 			break;
910 		case OOR1:		/* done a branch, find the O_CH */
911 			if (ISSTATEIN(aft, here)) {
912 				for (look = 1;
913 						OP(s = g->strip[pc+look]) != O_CH;
914 						look += OPND(s))
915 					assert(OP(s) == OOR2);
916 				FWD(aft, aft, look);
917 			}
918 			break;
919 		case OOR2:		/* propagate OCH_'s marking */
920 			FWD(aft, aft, 1);
921 			if (OP(g->strip[pc+OPND(s)]) != O_CH) {
922 				assert(OP(g->strip[pc+OPND(s)]) == OOR2);
923 				FWD(aft, aft, OPND(s));
924 			}
925 			break;
926 		case O_CH:		/* just empty */
927 			FWD(aft, aft, 1);
928 			break;
929 		default:		/* ooooops... */
930 			assert(0);
931 			break;
932 		}
933 	}
934 
935 	return(aft);
936 }
937 
938 #ifdef REDEBUG
939 /*
940  - print - print a set of states
941  == #ifdef REDEBUG
942  == static void print(struct match *m, char *caption, states st, \
943  ==	uchar ch, FILE *d);
944  == #endif
945  */
946 static void
947 print(m, caption, st, ch, d)
948 struct match *m;
949 char *caption;
950 states st;
951 uchar ch;
952 FILE *d;
953 {
954 	register struct re_guts *g = m->g;
955 	register int i;
956 	register int first = 1;
957 
958 	if (!(m->eflags&REG_TRACE))
959 		return;
960 
961 	fprintf(d, "%s", caption);
962 	if (ch != '\0')
963 		fprintf(d, " %s", pchar(ch));
964 	for (i = 0; i < g->nstates; i++)
965 		if (ISSET(st, i)) {
966 			fprintf(d, "%s%d", (first) ? "\t" : ", ", i);
967 			first = 0;
968 		}
969 	fprintf(d, "\n");
970 }
971 
972 /*
973  - at - print current situation
974  == #ifdef REDEBUG
975  == static void at(struct match *m, char *title, uchar *start, uchar *stop, \
976  ==						sopno startst, stopno stopst);
977  == #endif
978  */
979 static void
980 at(m, title, start, stop, startst, stopst)
981 struct match *m;
982 char *title;
983 uchar *start;
984 uchar *stop;
985 sopno startst;
986 sopno stopst;
987 {
988 	if (!(m->eflags&REG_TRACE))
989 		return;
990 
991 	printf("%s %s-", title, pchar(*start));
992 	printf("%s ", pchar(*stop));
993 	printf("%ld-%ld\n", (long)startst, (long)stopst);
994 }
995 
996 #ifndef PCHARDONE
997 #define	PCHARDONE	/* never again */
998 /*
999  - pchar - make a character printable
1000  == #ifdef REDEBUG
1001  == static char *pchar(int ch);
1002  == #endif
1003  *
1004  * Is this identical to regchar() over in debug.c?  Well, yes.  But a
1005  * duplicate here avoids having a debugging-capable regexec.o tied to
1006  * a matching debug.o, and this is convenient.  It all disappears in
1007  * the non-debug compilation anyway, so it doesn't matter much.
1008  */
1009 static char *			/* -> representation */
1010 pchar(ch)
1011 int ch;
1012 {
1013 	static char pbuf[10];
1014 
1015 	if (isprint(ch) || ch == ' ')
1016 		sprintf(pbuf, "%c", ch);
1017 	else
1018 		sprintf(pbuf, "\\%o", ch);
1019 	return(pbuf);
1020 }
1021 #endif
1022 #endif
1023 
1024 #undef	matcher
1025 #undef	fast
1026 #undef	slow
1027 #undef	dissect
1028 #undef	backref
1029 #undef	expand
1030 #undef	step
1031 #undef	print
1032 #undef	at
1033 #undef	match
1034