xref: /netbsd-src/lib/libc/regex/engine.c (revision 2b21efb927cf53c1af248d9507a1cda795ca2aa2)
1 /* $NetBSD: engine.c,v 1.26 2021/02/24 09:10:12 wiz Exp $ */
2 
3 /*-
4  * SPDX-License-Identifier: BSD-3-Clause
5  *
6  * Copyright (c) 1992, 1993, 1994 Henry Spencer.
7  * Copyright (c) 1992, 1993, 1994
8  *	The Regents of the University of California.  All rights reserved.
9  *
10  * This code is derived from software contributed to Berkeley by
11  * Henry Spencer.
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions
15  * are met:
16  * 1. Redistributions of source code must retain the above copyright
17  *    notice, this list of conditions and the following disclaimer.
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in the
20  *    documentation and/or other materials provided with the distribution.
21  * 3. Neither the name of the University nor the names of its contributors
22  *    may be used to endorse or promote products derived from this software
23  *    without specific prior written permission.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35  * SUCH DAMAGE.
36  *
37  *	@(#)engine.c	8.5 (Berkeley) 3/20/94
38  */
39 
40 #include <sys/cdefs.h>
41 #ifdef __FBSDID
42 __FBSDID("$FreeBSD: head/lib/libc/regex/engine.c 368358 2020-12-05 03:16:05Z kevans $");
43 #endif
44 __RCSID("$NetBSD: engine.c,v 1.26 2021/02/24 09:10:12 wiz Exp $");
45 
46 #include <stdbool.h>
47 
48 /*
49  * The matching engine and friends.  This file is #included by regexec.c
50  * after suitable #defines of a variety of macros used herein, so that
51  * different state representations can be used without duplicating masses
52  * of code.
53  */
54 
55 #ifdef SNAMES
56 #define	stepback sstepback
57 #define	matcher	smatcher
58 #define	walk	swalk
59 #define	dissect	sdissect
60 #define	backref	sbackref
61 #define	step	sstep
62 #define	print	sprint
63 #define	at	sat
64 #define	match	smat
65 #endif
66 #ifdef LNAMES
67 #define	stepback lstepback
68 #define	matcher	lmatcher
69 #define	walk	lwalk
70 #define	dissect	ldissect
71 #define	backref	lbackref
72 #define	step	lstep
73 #define	print	lprint
74 #define	at	lat
75 #define	match	lmat
76 #endif
77 #ifdef MNAMES
78 #define	stepback mstepback
79 #define	matcher	mmatcher
80 #define	walk	mwalk
81 #define	dissect	mdissect
82 #define	backref	mbackref
83 #define	step	mstep
84 #define	print	mprint
85 #define	at	mat
86 #define	match	mmat
87 #endif
88 
89 /* another structure passed up and down to avoid zillions of parameters */
90 struct match {
91 	struct re_guts *g;
92 	int eflags;
93 	regmatch_t *pmatch;	/* [nsub+1] (0 element unused) */
94 	const char *offp;	/* offsets work from here */
95 	const char *beginp;	/* start of string -- virtual NUL precedes */
96 	const char *endp;	/* end of string -- virtual NUL here */
97 	const char *coldp;	/* can be no match starting before here */
98 	const char **lastpos;	/* [nplus+1] */
99 	STATEVARS;
100 	states st;		/* current states */
101 	states fresh;		/* states for a fresh start */
102 	states tmp;		/* temporary */
103 	states empty;		/* empty set of states */
104 	mbstate_t mbs;		/* multibyte conversion state */
105 };
106 
107 /* ========= begin header generated by ./mkh ========= */
108 #ifdef __cplusplus
109 extern "C" {
110 #endif
111 
112 /* === engine.c === */
113 static int matcher(struct re_guts *g, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags);
114 static const char *dissect(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst);
115 static const char *backref(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, sopno lev, int);
116 static const char *walk(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, bool fast);
117 static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_t ch, states aft, int sflags);
118 #define MAX_RECURSION	100
119 #define	BOL	(OUT-1)
120 #define	EOL	(BOL-1)
121 #define	BOLEOL	(BOL-2)
122 #define	NOTHING	(BOL-3)
123 #define	BOW	(BOL-4)
124 #define	EOW	(BOL-5)
125 #define	BADCHAR	(BOL-6)
126 #define	NWBND	(BOL-7)
127 #define	NONCHAR(c)	((c) <= OUT)
128 /* sflags */
129 #define	SBOS	0x0001
130 #define	SEOS	0x0002
131 
132 #ifdef REDEBUG
133 static void print(struct match *m, const char *caption, states st, int ch, FILE *d);
134 #endif
135 #ifdef REDEBUG
136 static void at(struct match *m, const char *title, const char *start, const char *stop, sopno startst, sopno stopst);
137 #endif
138 #ifdef REDEBUG
139 static const char *pchar(int ch);
140 #endif
141 
142 #ifdef __cplusplus
143 }
144 #endif
145 /* ========= end header generated by ./mkh ========= */
146 
147 #ifdef REDEBUG
148 #define	SP(t, s, c)	print(m, t, s, c, stdout)
149 #define	AT(t, p1, p2, s1, s2)	at(m, t, p1, p2, s1, s2)
150 #define	NOTE(str)	{ if (m->eflags&REG_TRACE) printf("=%s\n", (str)); }
151 #else
152 #define	SP(t, s, c)	/* nothing */
153 #define	AT(t, p1, p2, s1, s2)	/* nothing */
154 #define	NOTE(s)	/* nothing */
155 #endif
156 
157 /*
158  * Given a multibyte string pointed to by start, step back nchar characters
159  * from current position pointed to by cur.
160  */
161 static const char *
162 stepback(const char *start, const char *cur, int nchar)
163 {
164 	const char *ret;
165 	size_t wc, mbc;
166 	mbstate_t mbs;
167 	size_t clen;
168 
169 	if (MB_CUR_MAX == 1)
170 		return ((cur - nchar) > start ? cur - nchar : NULL);
171 
172 	ret = cur;
173 	for (wc = nchar; wc > 0; wc--) {
174 		for (mbc = 1; mbc <= MB_CUR_MAX; mbc++) {
175 			if ((ret - mbc) < start)
176 				return (NULL);
177 			memset(&mbs, 0, sizeof(mbs));
178 			clen = mbrtowc(NULL, ret - mbc, mbc, &mbs);
179 			if (clen != (size_t)-1 && clen != (size_t)-2)
180 				break;
181 		}
182 		if (mbc > MB_CUR_MAX)
183 			return (NULL);
184 		ret -= mbc;
185 	}
186 
187 	return (ret);
188 }
189 
190 /*
191  - matcher - the actual matching engine
192  == static int matcher(struct re_guts *g, const char *string, \
193  ==	size_t nmatch, regmatch_t pmatch[], int eflags);
194  */
195 static int			/* 0 success, REG_NOMATCH failure */
196 matcher(struct re_guts *g,
197 	const char *string,
198 	size_t nmatch,
199 	regmatch_t pmatch[],
200 	int eflags)
201 {
202 	const char *endp;
203 	size_t i;
204 	struct match mv;
205 	struct match *m = &mv;
206 	const char *dp = NULL;
207 	const sopno gf = g->firststate+1;	/* +1 for OEND */
208 	const sopno gl = g->laststate;
209 	const char *start;
210 	const char *stop;
211 	/* Boyer-Moore algorithms variables */
212 	const char *pp;
213 	int cj, mj;
214 	const char *mustfirst;
215 	const char *mustlast;
216 	size_t *matchjump;
217 	size_t *charjump;
218 	int error = 0;
219 
220 	_DIAGASSERT(g != NULL);
221 	_DIAGASSERT(string != NULL);
222 	/* pmatch checked below */
223 
224 	/* simplify the situation where possible */
225 	if (g->cflags&REG_NOSUB)
226 		nmatch = 0;
227 	if (eflags&REG_STARTEND) {
228 		_DIAGASSERT(pmatch != NULL);
229 		start = string + (size_t)pmatch[0].rm_so;
230 		stop = string + (size_t)pmatch[0].rm_eo;
231 	} else {
232 		start = string;
233 		stop = start + strlen(start);
234 	}
235 	if (stop < start)
236 		return(REG_INVARG);
237 
238 	/* prescreening; this does wonders for this rather slow code */
239 	if (g->must != NULL) {
240 		if (g->charjump != NULL && g->matchjump != NULL) {
241 			mustfirst = g->must;
242 			mustlast = g->must + g->mlen - 1;
243 			charjump = g->charjump;
244 			matchjump = g->matchjump;
245 			pp = mustlast;
246 			for (dp = start+g->mlen-1; dp < stop;) {
247 				/* Fast skip non-matches */
248 				while (dp < stop && charjump[(int)*dp])
249 					dp += charjump[(int)*dp];
250 
251 				if (dp >= stop)
252 					break;
253 
254 				/* Greedy matcher */
255 				/* We depend on not being used for
256 				 * for strings of length 1
257 				 */
258 				while (*--dp == *--pp && pp != mustfirst);
259 
260 				if (*dp == *pp)
261 					break;
262 
263 				/* Jump to next possible match */
264 				mj = matchjump[pp - mustfirst];
265 				cj = charjump[(int)*dp];
266 				dp += (cj < mj ? mj : cj);
267 				pp = mustlast;
268 			}
269 			if (pp != mustfirst)
270 				return(REG_NOMATCH);
271 		} else {
272 			for (dp = start; dp < stop; dp++)
273 				if (*dp == g->must[0] &&
274 				    (size_t)(stop - dp) >= g->mlen &&
275 				    memcmp(dp, g->must, (size_t)g->mlen) == 0)
276 					break;
277 			if (dp == stop)		/* we didn't find g->must */
278 				return(REG_NOMATCH);
279 		}
280 	}
281 
282 	/* match struct setup */
283 	m->g = g;
284 	m->eflags = eflags;
285 	m->pmatch = NULL;
286 	m->lastpos = NULL;
287 	m->offp = string;
288 	m->beginp = start;
289 	m->endp = stop;
290 	STATESETUP(m, 4);
291 	SETUP(m->st);
292 	SETUP(m->fresh);
293 	SETUP(m->tmp);
294 	SETUP(m->empty);
295 	CLEAR(m->empty);
296 	ZAPSTATE(&m->mbs);
297 
298 	/* Adjust start according to moffset, to speed things up */
299 	if (dp != NULL && g->moffset > -1) {
300 		const char *nstart;
301 
302 		nstart = stepback(start, dp, g->moffset);
303 		if (nstart != NULL)
304 			start = nstart;
305 	}
306 
307 	SP("mloop", m->st, *start);
308 
309 	/* this loop does only one repetition except for backrefs */
310 	for (;;) {
311 		endp = walk(m, start, stop, gf, gl, true);
312 		if (endp == NULL) {		/* a miss */
313 			error = REG_NOMATCH;
314 			goto done;
315 		}
316 		if (nmatch == 0 && !g->backrefs)
317 			break;		/* no further info needed */
318 
319 		/* where? */
320 		assert(m->coldp != NULL);
321 		for (;;) {
322 			NOTE("finding start");
323 			endp = walk(m, m->coldp, stop, gf, gl, false);
324 			if (endp != NULL)
325 				break;
326 			assert(m->coldp < m->endp);
327 			m->coldp += XMBRTOWC(NULL, m->coldp,
328 			    m->endp - m->coldp, &m->mbs, 0);
329 		}
330 		if (nmatch == 1 && !g->backrefs)
331 			break;		/* no further info needed */
332 
333 		/* oh my, he wants the subexpressions... */
334 		if (m->pmatch == NULL)
335 			m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) *
336 							sizeof(regmatch_t));
337 		if (m->pmatch == NULL) {
338 			error = REG_ESPACE;
339 			goto done;
340 		}
341 		for (i = 1; i <= m->g->nsub; i++)
342 			m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1;
343 		if (!g->backrefs && !(m->eflags&REG_BACKR)) {
344 			NOTE("dissecting");
345 			dp = dissect(m, m->coldp, endp, gf, gl);
346 		} else {
347 			if (g->nplus > 0 && m->lastpos == NULL)
348 				m->lastpos = malloc((g->nplus+1) *
349 						sizeof(const char *));
350 			if (g->nplus > 0 && m->lastpos == NULL) {
351 				error = REG_ESPACE;
352 				goto done;
353 			}
354 			NOTE("backref dissect");
355 			dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0);
356 		}
357 		if (dp != NULL)
358 			break;
359 
360 		/* uh-oh... we couldn't find a subexpression-level match */
361 		assert(g->backrefs);	/* must be back references doing it */
362 		assert(g->nplus == 0 || m->lastpos != NULL);
363 		for (;;) {
364 			if (dp != NULL || endp <= m->coldp)
365 				break;		/* defeat */
366 			NOTE("backoff");
367 			endp = walk(m, m->coldp, endp-1, gf, gl, false);
368 			if (endp == NULL)
369 				break;		/* defeat */
370 			/* try it on a shorter possibility */
371 #ifndef NDEBUG
372 			for (i = 1; i <= m->g->nsub; i++) {
373 				assert(m->pmatch[i].rm_so == (regoff_t)-1);
374 				assert(m->pmatch[i].rm_eo == (regoff_t)-1);
375 			}
376 #endif
377 			NOTE("backoff dissect");
378 			dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0);
379 		}
380 		assert(dp == NULL || dp == endp);
381 		if (dp != NULL)		/* found a shorter one */
382 			break;
383 
384 		/* despite initial appearances, there is no match here */
385 		NOTE("false alarm");
386 		/* recycle starting later */
387 		start = m->coldp + XMBRTOWC(NULL, m->coldp,
388 		    stop - m->coldp, &m->mbs, 0);
389 		assert(start <= stop);
390 	}
391 
392 	/* fill in the details if requested */
393 	if (nmatch > 0) {
394 		_DIAGASSERT(pmatch != NULL);
395 		pmatch[0].rm_so = m->coldp - m->offp;
396 		pmatch[0].rm_eo = endp - m->offp;
397 	}
398 	if (nmatch > 1) {
399 		assert(m->pmatch != NULL);
400 		for (i = 1; i < nmatch; i++)
401 			if (i <= m->g->nsub)
402 				pmatch[i] = m->pmatch[i];
403 			else {
404 				pmatch[i].rm_so = (regoff_t)-1;
405 				pmatch[i].rm_eo = (regoff_t)-1;
406 			}
407 	}
408 
409 done:
410 	if (m->pmatch != NULL) {
411 		free(m->pmatch);
412 		m->pmatch = NULL;
413 	}
414 	if (m->lastpos != NULL) {
415 		free((char *)m->lastpos);
416 		m->lastpos = NULL;
417 	}
418 	STATETEARDOWN(m);
419 	return error;
420 }
421 
422 /*
423  - dissect - figure out what matched what, no back references
424  == static const char *dissect(struct match *m, const char *start, \
425  ==	const char *stop, sopno startst, sopno stopst);
426  */
427 static const char *		/* == stop (success) always */
428 dissect(
429 	struct match *m,
430 	const char *start,
431 	const char *stop,
432 	sopno startst,
433 	sopno stopst)
434 {
435 	int i;
436 	sopno ss;		/* start sop of current subRE */
437 	sopno es;		/* end sop of current subRE */
438 	const char *sp;		/* start of string matched by it */
439 	const char *stp;	/* string matched by it cannot pass here */
440 	const char *rest;	/* start of rest of string */
441 	const char *tail;	/* string unmatched by rest of RE */
442 	sopno ssub;		/* start sop of subsubRE */
443 	sopno esub;		/* end sop of subsubRE */
444 	const char *ssp;	/* start of string matched by subsubRE */
445 	const char *sep;	/* end of string matched by subsubRE */
446 	const char *oldssp;	/* previous ssp */
447 	const char *dp __unused;
448 
449 	_DIAGASSERT(m != NULL);
450 	_DIAGASSERT(start != NULL);
451 	_DIAGASSERT(stop != NULL);
452 
453 	AT("diss", start, stop, startst, stopst);
454 	sp = start;
455 	for (ss = startst; ss < stopst; ss = es) {
456 		/* identify end of subRE */
457 		es = ss;
458 		switch (OP(m->g->strip[es])) {
459 		case OPLUS_:
460 		case OQUEST_:
461 			es += OPND(m->g->strip[es]);
462 			break;
463 		case OCH_:
464 			while (OP(m->g->strip[es]) != (sop)O_CH)
465 				es += OPND(m->g->strip[es]);
466 			break;
467 		}
468 		es++;
469 
470 		/* figure out what it matched */
471 		switch (OP(m->g->strip[ss])) {
472 		case OEND:
473 			assert(nope);
474 			break;
475 		case OCHAR:
476 			sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0);
477 			break;
478 		case OBOL:
479 		case OEOL:
480 		case OBOW:
481 		case OEOW:
482 		case OBOS:
483 		case OEOS:
484 		case OWBND:
485 		case ONWBND:
486 			break;
487 		case OANY:
488 		case OANYOF:
489 			sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0);
490 			break;
491 		case OBACK_:
492 		case O_BACK:
493 			assert(nope);
494 			break;
495 		/* cases where length of match is hard to find */
496 		case OQUEST_:
497 			stp = stop;
498 			for (;;) {
499 				/* how long could this one be? */
500 				rest = walk(m, sp, stp, ss, es, false);
501 				assert(rest != NULL);	/* it did match */
502 				/* could the rest match the rest? */
503 				tail = walk(m, rest, stop, es, stopst, false);
504 				if (tail == stop)
505 					break;		/* yes! */
506 				/* no -- try a shorter match for this one */
507 				stp = rest - 1;
508 				assert(stp >= sp);	/* it did work */
509 			}
510 			ssub = ss + 1;
511 			esub = es - 1;
512 			/* did innards match? */
513 			if (walk(m, sp, rest, ssub, esub, false) != NULL) {
514 				dp = dissect(m, sp, rest, ssub, esub);
515 				assert(dp == rest);
516 			} else		/* no */
517 				assert(sp == rest);
518 			sp = rest;
519 			break;
520 		case OPLUS_:
521 			stp = stop;
522 			for (;;) {
523 				/* how long could this one be? */
524 				rest = walk(m, sp, stp, ss, es, false);
525 				assert(rest != NULL);	/* it did match */
526 				/* could the rest match the rest? */
527 				tail = walk(m, rest, stop, es, stopst, false);
528 				if (tail == stop)
529 					break;		/* yes! */
530 				/* no -- try a shorter match for this one */
531 				stp = rest - 1;
532 				assert(stp >= sp);	/* it did work */
533 			}
534 			ssub = ss + 1;
535 			esub = es - 1;
536 			ssp = sp;
537 			oldssp = ssp;
538 			for (;;) {	/* find last match of innards */
539 				sep = walk(m, ssp, rest, ssub, esub, false);
540 				if (sep == NULL || sep == ssp)
541 					break;	/* failed or matched null */
542 				oldssp = ssp;	/* on to next try */
543 				ssp = sep;
544 			}
545 			if (sep == NULL) {
546 				/* last successful match */
547 				sep = ssp;
548 				ssp = oldssp;
549 			}
550 			assert(sep == rest);	/* must exhaust substring */
551 			assert(walk(m, ssp, sep, ssub, esub, false) == rest);
552 			dp = dissect(m, ssp, sep, ssub, esub);
553 			assert(dp == sep);
554 			sp = rest;
555 			break;
556 		case OCH_:
557 			stp = stop;
558 			for (;;) {
559 				/* how long could this one be? */
560 				rest = walk(m, sp, stp, ss, es, false);
561 				assert(rest != NULL);	/* it did match */
562 				/* could the rest match the rest? */
563 				tail = walk(m, rest, stop, es, stopst, false);
564 				if (tail == stop)
565 					break;		/* yes! */
566 				/* no -- try a shorter match for this one */
567 				stp = rest - 1;
568 				assert(stp >= sp);	/* it did work */
569 			}
570 			ssub = ss + 1;
571 			esub = ss + OPND(m->g->strip[ss]) - 1;
572 			assert(OP(m->g->strip[esub]) == OOR1);
573 			for (;;) {	/* find first matching branch */
574 				if (walk(m, sp, rest, ssub, esub, false) == rest)
575 					break;	/* it matched all of it */
576 				/* that one missed, try next one */
577 				assert(OP(m->g->strip[esub]) == OOR1);
578 				esub++;
579 				assert(OP(m->g->strip[esub]) == OOR2);
580 				ssub = esub + 1;
581 				esub += OPND(m->g->strip[esub]);
582 				if (OP(m->g->strip[esub]) == (sop)OOR2)
583 					esub--;
584 				else
585 					assert(OP(m->g->strip[esub]) == O_CH);
586 			}
587 			dp = dissect(m, sp, rest, ssub, esub);
588 			assert(dp == rest);
589 			sp = rest;
590 			break;
591 		case O_PLUS:
592 		case O_QUEST:
593 		case OOR1:
594 		case OOR2:
595 		case O_CH:
596 			assert(nope);
597 			break;
598 		case OLPAREN:
599 			i = OPND(m->g->strip[ss]);
600 			assert(0 < i && i <= m->g->nsub);
601 			m->pmatch[i].rm_so = sp - m->offp;
602 			break;
603 		case ORPAREN:
604 			i = OPND(m->g->strip[ss]);
605 			assert(0 < i && i <= m->g->nsub);
606 			m->pmatch[i].rm_eo = sp - m->offp;
607 			break;
608 		default:		/* uh oh */
609 			assert(nope);
610 			break;
611 		}
612 	}
613 
614 	assert(sp == stop);
615 	return(sp);
616 }
617 
618 #define	ISBOW(m, sp)					\
619     (sp < m->endp && ISWORD(*sp) &&			\
620     ((sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||	\
621     (sp > m->offp && !ISWORD(*(sp-1)))))
622 #define	ISEOW(m, sp)					\
623     (((sp == m->endp && !(m->eflags&REG_NOTEOL)) ||	\
624     (sp < m->endp && *sp == '\n' &&			\
625     (m->g->cflags&REG_NEWLINE)) ||			\
626     (sp < m->endp && !ISWORD(*sp)) ) &&			\
627     (sp > m->beginp && ISWORD(*(sp-1))))		\
628 
629 /*
630  - backref - figure out what matched what, figuring in back references
631  == static const char *backref(struct match *m, const char *start, \
632  ==	const char *stop, sopno startst, sopno stopst, sopno lev);
633  */
634 static const char *		/* == stop (success) or NULL (failure) */
635 backref(
636 	struct match *m,
637 	const char *start,
638 	const char *stop,
639 	sopno startst,
640 	sopno stopst,
641 	sopno lev,		/* PLUS nesting level */
642 	int rec)
643 {
644 	int i;
645 	sopno ss;		/* start sop of current subRE */
646 	const char *sp;		/* start of string matched by it */
647 	sopno ssub;		/* start sop of subsubRE */
648 	sopno esub;		/* end sop of subsubRE */
649 	const char *ssp;	/* start of string matched by subsubRE */
650 	const char *dp;
651 	size_t len;
652 	int hard;
653 	sop s;
654 	regoff_t offsave;
655 	cset *cs;
656 	wint_t wc;
657 
658 	_DIAGASSERT(m != NULL);
659 	_DIAGASSERT(start != NULL);
660 	_DIAGASSERT(stop != NULL);
661 
662 	AT("back", start, stop, startst, stopst);
663 	sp = start;
664 
665 	/* get as far as we can with easy stuff */
666 	hard = 0;
667 	for (ss = startst; !hard && ss < stopst; ss++)
668 		switch (OP(s = m->g->strip[ss])) {
669 		case OCHAR:
670 			if (sp == stop)
671 				return(NULL);
672 			sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR);
673 			if (wc != (wint_t)OPND(s))
674 				return(NULL);
675 			break;
676 		case OANY:
677 			if (sp == stop)
678 				return(NULL);
679 			sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR);
680 			if (wc == BADCHAR)
681 				return (NULL);
682 			break;
683 		case OANYOF:
684 			if (sp == stop)
685 				return (NULL);
686 			cs = &m->g->sets[OPND(s)];
687 			sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR);
688 			if (wc == BADCHAR || !CHIN(cs, wc))
689 				return(NULL);
690 			break;
691 		case OBOS:
692 			if (sp == m->beginp && (m->eflags & REG_NOTBOL) == 0)
693 				{ /* yes */ }
694 			else
695 				return(NULL);
696 			break;
697 		case OEOS:
698 			if (sp == m->endp && (m->eflags & REG_NOTEOL) == 0)
699 				{ /* yes */ }
700 			else
701 				return(NULL);
702 			break;
703 		case OBOL:
704 			if ((sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
705 			    (sp > m->offp && sp < m->endp &&
706 			    *(sp-1) == '\n' && (m->g->cflags&REG_NEWLINE)))
707 				{ /* yes */ }
708 			else
709 				return(NULL);
710 			break;
711 		case OEOL:
712 			if ( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
713 					(sp < m->endp && *sp == '\n' &&
714 						(m->g->cflags&REG_NEWLINE)) )
715 				{ /* yes */ }
716 			else
717 				return(NULL);
718 			break;
719 		case OWBND:
720 			if (ISBOW(m, sp) || ISEOW(m, sp))
721 				{ /* yes */ }
722 			else
723 				return(NULL);
724 			break;
725 		case ONWBND:
726 			if (((sp == m->beginp) && !ISWORD(*sp)) ||
727 			    (sp == m->endp && !ISWORD(*(sp - 1))))
728 				{ /* yes, beginning/end of subject */ }
729 			else if (ISWORD(*(sp - 1)) == ISWORD(*sp))
730 				{ /* yes, beginning/end of subject */ }
731 			else
732 				return(NULL);
733 			break;
734 		case OBOW:
735 			if (ISBOW(m, sp))
736 				{ /* yes */ }
737 			else
738 				return(NULL);
739 			break;
740 		case OEOW:
741 			if (ISEOW(m, sp))
742 				{ /* yes */ }
743 			else
744 				return(NULL);
745 			break;
746 		case O_QUEST:
747 			break;
748 		case OOR1:	/* matches null but needs to skip */
749 			ss++;
750 			s = m->g->strip[ss];
751 			do {
752 				assert(OP(s) == OOR2);
753 				ss += OPND(s);
754 			} while (OP(s = m->g->strip[ss]) != (sop)O_CH);
755 			/* note that the ss++ gets us past the O_CH */
756 			break;
757 		default:	/* have to make a choice */
758 			hard = 1;
759 			break;
760 		}
761 	if (!hard) {		/* that was it! */
762 		if (sp != stop)
763 			return(NULL);
764 		return(sp);
765 	}
766 	ss--;			/* adjust for the for's final increment */
767 
768 	/* the hard stuff */
769 	AT("hard", sp, stop, ss, stopst);
770 	s = m->g->strip[ss];
771 	switch (OP(s)) {
772 	case OBACK_:		/* the vilest depths */
773 		i = OPND(s);
774 		assert(0 < i && i <= m->g->nsub);
775 		if (m->pmatch[i].rm_eo == -1)
776 			return(NULL);
777 		assert(m->pmatch[i].rm_so != -1);
778 		len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so;
779 		if (len == 0 && rec++ > MAX_RECURSION)
780 			return(NULL);
781 		assert(stop - m->beginp >= len);
782 		if (sp > stop - len)
783 			return(NULL);	/* not enough left to match */
784 		ssp = m->offp + m->pmatch[i].rm_so;
785 		if (memcmp(sp, ssp, len) != 0)
786 			return(NULL);
787 		while (m->g->strip[ss] != (sop)SOP(O_BACK, i))
788 			ss++;
789 		return(backref(m, sp+len, stop, ss+1, stopst, lev, rec));
790 	case OQUEST_:		/* to null or not */
791 		dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
792 		if (dp != NULL)
793 			return(dp);	/* not */
794 		return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec));
795 	case OPLUS_:
796 		assert(m->lastpos != NULL);
797 		assert(lev+1 <= m->g->nplus);
798 		m->lastpos[lev+1] = sp;
799 		return(backref(m, sp, stop, ss+1, stopst, lev+1, rec));
800 	case O_PLUS:
801 		if (sp == m->lastpos[lev])	/* last pass matched null */
802 			return(backref(m, sp, stop, ss+1, stopst, lev-1, rec));
803 		/* try another pass */
804 		m->lastpos[lev] = sp;
805 		dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec);
806 		if (dp == NULL)
807 			return(backref(m, sp, stop, ss+1, stopst, lev-1, rec));
808 		else
809 			return(dp);
810 	case OCH_:		/* find the right one, if any */
811 		ssub = ss + 1;
812 		esub = ss + OPND(s) - 1;
813 		assert(OP(m->g->strip[esub]) == OOR1);
814 		for (;;) {	/* find first matching branch */
815 			dp = backref(m, sp, stop, ssub, esub, lev, rec);
816 			if (dp != NULL)
817 				return(dp);
818 			/* that one missed, try next one */
819 			if (OP(m->g->strip[esub]) == (sop)O_CH)
820 				return(NULL);	/* there is none */
821 			esub++;
822 			assert(OP(m->g->strip[esub]) == (sop)OOR2);
823 			ssub = esub + 1;
824 			esub += OPND(m->g->strip[esub]);
825 			if (OP(m->g->strip[esub]) == (sop)OOR2)
826 				esub--;
827 			else
828 				assert(OP(m->g->strip[esub]) == O_CH);
829 		}
830 		/* NOTREACHED */
831 		break;
832 	case OLPAREN:		/* must undo assignment if rest fails */
833 		i = OPND(s);
834 		assert(0 < i && i <= m->g->nsub);
835 		offsave = m->pmatch[i].rm_so;
836 		m->pmatch[i].rm_so = sp - m->offp;
837 		dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
838 		if (dp != NULL)
839 			return(dp);
840 		m->pmatch[i].rm_so = offsave;
841 		return(NULL);
842 	case ORPAREN:		/* must undo assignment if rest fails */
843 		i = OPND(s);
844 		assert(0 < i && i <= m->g->nsub);
845 		offsave = m->pmatch[i].rm_eo;
846 		m->pmatch[i].rm_eo = sp - m->offp;
847 		dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
848 		if (dp != NULL)
849 			return(dp);
850 		m->pmatch[i].rm_eo = offsave;
851 		return(NULL);
852 	default:		/* uh oh */
853 		assert(nope);
854 		break;
855 	}
856 
857 	/* "can't happen" */
858 	assert(nope);
859 	/* NOTREACHED */
860 	return "shut up gcc";
861 }
862 
863 /*
864  - walk - step through the string either quickly or slowly
865  == static const char *walk(struct match *m, const char *start, \
866  ==	const char *stop, sopno startst, sopno stopst, bool fast);
867  */
868 static const char * /* where it ended, or NULL */
869 walk(struct match *m, const char *start, const char *stop, sopno startst,
870 	sopno stopst, bool fast)
871 {
872 	states st = m->st;
873 	states fresh = m->fresh;
874 	states empty = m->empty;
875 	states tmp = m->tmp;
876 	const char *p = start;
877 	wint_t c;
878 	wint_t lastc;		/* previous c */
879 	wint_t flagch;
880 	int i, sflags;
881 	const char *matchp;	/* last p at which a match ended */
882 	size_t clen;
883 
884 	_DIAGASSERT(m != NULL);
885 	_DIAGASSERT(start != NULL);
886 	_DIAGASSERT(stop != NULL);
887 
888 	sflags = 0;
889 	AT("walk", start, stop, startst, stopst);
890 	CLEAR(st);
891 	SET1(st, startst);
892 	SP("sstart", st, *p);
893 	st = step(m->g, startst, stopst, st, NOTHING, st, sflags);
894 	if (fast)
895 		ASSIGN(fresh, st);
896 	matchp = NULL;
897 	if (start == m->offp || (start == m->beginp && !(m->eflags&REG_NOTBOL)))
898 		c = OUT;
899 	else {
900 		/*
901 		 * XXX Wrong if the previous character was multi-byte.
902 		 * Newline never is (in encodings supported by FreeBSD),
903 		 * so this only breaks the ISWORD tests below.
904 		 */
905 		c = (uch)*(start - 1);
906 	}
907 	for (;;) {
908 		/* next character */
909 		lastc = c;
910 		sflags = 0;
911 		if (p == m->endp) {
912 			c = OUT;
913 			clen = 0;
914 		} else
915 			clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR);
916 
917 		if (fast && EQ(st, fresh))
918 			matchp = p;
919 
920 		/* is there an EOL and/or BOL between lastc and c? */
921 		flagch = '\0';
922 		i = 0;
923 		if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
924 				(lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
925 			flagch = BOL;
926 			i = m->g->nbol;
927 		}
928 		if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
929 				(c == OUT && !(m->eflags&REG_NOTEOL)) ) {
930 			flagch = (flagch == BOL) ? BOLEOL : EOL;
931 			i += m->g->neol;
932 		}
933 		if (lastc == OUT && (m->eflags & REG_NOTBOL) == 0) {
934 			sflags |= SBOS;
935 			/* Step one more for BOS. */
936 			i++;
937 		}
938 		if (c == OUT && (m->eflags & REG_NOTEOL) == 0) {
939 			sflags |= SEOS;
940 			/* Step one more for EOS. */
941 			i++;
942 		}
943 		if (i != 0) {
944 			for (; i > 0; i--)
945 				st = step(m->g, startst, stopst, st, flagch, st,
946 				    sflags);
947 			SP("sboleol", st, c);
948 		}
949 
950 		/* how about a word boundary? */
951 		if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
952 					(c != OUT && ISWORD(c)) ) {
953 			flagch = BOW;
954 		}
955 		if ( (lastc != OUT && ISWORD(lastc)) &&
956 				(flagch == EOL || (c != OUT && !ISWORD(c))) ) {
957 			flagch = EOW;
958 		}
959 		if (flagch == BOW || flagch == EOW) {
960 			st = step(m->g, startst, stopst, st, flagch, st, sflags);
961 			SP("sboweow", st, c);
962 		}
963 		if (lastc != OUT && c != OUT &&
964 		    ISWORD(lastc) == ISWORD(c)) {
965 			flagch = NWBND;
966 		} else if ((lastc == OUT && !ISWORD(c)) ||
967 		    (c == OUT && !ISWORD(lastc))) {
968 			flagch = NWBND;
969 		}
970 		if (flagch == NWBND) {
971 			st = step(m->g, startst, stopst, st, flagch, st, sflags);
972 			SP("snwbnd", st, c);
973 		}
974 
975 		/* are we done? */
976 		if (ISSET(st, stopst)) {
977 			if (fast)
978 				break;
979 			else
980 				matchp = p;
981 		}
982 		if (EQ(st, empty) || p == stop || clen > (size_t)(stop - p))
983 			break;		/* NOTE BREAK OUT */
984 
985 		/* no, we must deal with this character */
986 		ASSIGN(tmp, st);
987 		if (fast)
988 			ASSIGN(st, fresh);
989 		else
990 			ASSIGN(st, empty);
991 		assert(c != OUT);
992 		st = step(m->g, startst, stopst, tmp, c, st, sflags);
993 		SP("saft", st, c);
994 		assert(EQ(step(m->g, startst, stopst, st, NOTHING, st, sflags),
995 		    st));
996 		p += clen;
997 	}
998 
999 	if (fast) {
1000 		assert(matchp != NULL);
1001 		m->coldp = matchp;
1002 		if (ISSET(st, stopst))
1003 			return (p + XMBRTOWC(NULL, p, stop - p, &m->mbs, 0));
1004 		else
1005 			return (NULL);
1006 	} else
1007 		return (matchp);
1008 }
1009 
1010 /*
1011  - step - map set of states reachable before char to set reachable after
1012  == static states step(struct re_guts *g, sopno start, sopno stop, \
1013  ==	states bef, int ch, states aft);
1014  == #define	BOL	(OUT-1)
1015  == #define	EOL	(BOL-1)
1016  == #define	BOLEOL	(BOL-2)
1017  == #define	NOTHING	(BOL-3)
1018  == #define	BOW	(BOL-4)
1019  == #define	EOW	(BOL-5)
1020  == #define	BADCHAR	(BOL-6)
1021  == #define	NONCHAR(c)	((c) <= OUT)
1022  */
1023 static states
1024 step(struct re_guts *g,
1025 	sopno start,		/* start state within strip */
1026 	sopno stop,		/* state after stop state within strip */
1027 	states bef,		/* states reachable before */
1028 	wint_t ch,		/* character or NONCHAR code */
1029 	states aft,		/* states already known reachable after */
1030 	int sflags)		/* state flags */
1031 {
1032 	cset *cs;
1033 	sop s;
1034 	sopno pc;
1035 	onestate here;		/* note, macros know this name */
1036 	sopno look;
1037 	int i;
1038 
1039 	_DIAGASSERT(g != NULL);
1040 
1041 	for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
1042 		s = g->strip[pc];
1043 		switch (OP(s)) {
1044 		case OEND:
1045 			assert(pc == stop-1);
1046 			break;
1047 		case OCHAR:
1048 			/* only characters can match */
1049 			assert(!NONCHAR(ch) || ch != OPND(s));
1050 			if (ch == (wint_t)OPND(s))
1051 				FWD(aft, bef, 1);
1052 			break;
1053 		case OBOS:
1054 			if ((ch == BOL || ch == BOLEOL) && (sflags & SBOS) != 0)
1055 				FWD(aft, bef, 1);
1056 			break;
1057 		case OEOS:
1058 			if ((ch == EOL || ch == BOLEOL) && (sflags & SEOS) != 0)
1059 				FWD(aft, bef, 1);
1060 			break;
1061 		case OBOL:
1062 			if (ch == BOL || ch == BOLEOL)
1063 				FWD(aft, bef, 1);
1064 			break;
1065 		case OEOL:
1066 			if (ch == EOL || ch == BOLEOL)
1067 				FWD(aft, bef, 1);
1068 			break;
1069 		case OBOW:
1070 			if (ch == BOW)
1071 				FWD(aft, bef, 1);
1072 			break;
1073 		case OEOW:
1074 			if (ch == EOW)
1075 				FWD(aft, bef, 1);
1076 			break;
1077 		case OWBND:
1078 			if (ch == BOW || ch == EOW)
1079 				FWD(aft, bef, 1);
1080 			break;
1081 		case ONWBND:
1082 			if (ch == NWBND)
1083 				FWD(aft, aft, 1);
1084 			break;
1085 		case OANY:
1086 			if (!NONCHAR(ch))
1087 				FWD(aft, bef, 1);
1088 			break;
1089 		case OANYOF:
1090 			cs = &g->sets[OPND(s)];
1091 			if (!NONCHAR(ch) && CHIN(cs, ch))
1092 				FWD(aft, bef, 1);
1093 			break;
1094 		case OBACK_:		/* ignored here */
1095 		case O_BACK:
1096 			FWD(aft, aft, 1);
1097 			break;
1098 		case OPLUS_:		/* forward, this is just an empty */
1099 			FWD(aft, aft, 1);
1100 			break;
1101 		case O_PLUS:		/* both forward and back */
1102 			FWD(aft, aft, 1);
1103 			i = ISSETBACK(aft, OPND(s));
1104 			BACK(aft, aft, OPND(s));
1105 			if (!i && ISSETBACK(aft, OPND(s))) {
1106 				/* oho, must reconsider loop body */
1107 				pc -= OPND(s) + 1;
1108 				INIT(here, pc);
1109 			}
1110 			break;
1111 		case OQUEST_:		/* two branches, both forward */
1112 			FWD(aft, aft, 1);
1113 			FWD(aft, aft, OPND(s));
1114 			break;
1115 		case O_QUEST:		/* just an empty */
1116 			FWD(aft, aft, 1);
1117 			break;
1118 		case OLPAREN:		/* not significant here */
1119 		case ORPAREN:
1120 			FWD(aft, aft, 1);
1121 			break;
1122 		case OCH_:		/* mark the first two branches */
1123 			FWD(aft, aft, 1);
1124 			assert(OP(g->strip[pc+OPND(s)]) == (sop)OOR2);
1125 			FWD(aft, aft, OPND(s));
1126 			break;
1127 		case OOR1:		/* done a branch, find the O_CH */
1128 			if (ISSTATEIN(aft, here)) {
1129 				for (look = 1;
1130 				    OP(s = g->strip[pc+look]) != (sop)O_CH;
1131 				    look += OPND(s))
1132 					assert(OP(s) == (sop)OOR2);
1133 				FWD(aft, aft, look + 1);
1134 			}
1135 			break;
1136 		case OOR2:		/* propagate OCH_'s marking */
1137 			FWD(aft, aft, 1);
1138 			if (OP(g->strip[pc+OPND(s)]) != (sop)O_CH) {
1139 				assert(OP(g->strip[pc+OPND(s)]) == (sop)OOR2);
1140 				FWD(aft, aft, OPND(s));
1141 			}
1142 			break;
1143 		case O_CH:		/* just empty */
1144 			FWD(aft, aft, 1);
1145 			break;
1146 		default:		/* ooooops... */
1147 			assert(nope);
1148 			break;
1149 		}
1150 	}
1151 
1152 	return(aft);
1153 }
1154 
1155 #ifdef REDEBUG
1156 /*
1157  - print - print a set of states
1158  == #ifdef REDEBUG
1159  == static void print(struct match *m, const char *caption, states st, \
1160  ==	int ch, FILE *d);
1161  == #endif
1162  */
1163 static void
1164 print(struct match *m,
1165 	const char *caption,
1166 	states st,
1167 	int ch,
1168 	FILE *d)
1169 {
1170 	struct re_guts *g = m->g;
1171 	sopno i;
1172 	int first = 1;
1173 
1174 	_DIAGASSERT(m != NULL);
1175 	_DIAGASSERT(caption != NULL);
1176 
1177 	if (!(m->eflags&REG_TRACE))
1178 		return;
1179 
1180 	_DIAGASSERT(d != NULL);
1181 
1182 	fprintf(d, "%s", caption);
1183 	if (ch != '\0')
1184 		fprintf(d, " %s", pchar(ch));
1185 	for (i = 0; i < g->nstates; i++)
1186 		if (ISSET(st, i)) {
1187 			fprintf(d, "%s%lu", (first) ? "\t" : ", ", i);
1188 			first = 0;
1189 		}
1190 	fprintf(d, "\n");
1191 }
1192 
1193 /*
1194  - at - print current situation
1195  == #ifdef REDEBUG
1196  == static void at(struct match *m, const char *title, const char *start, \
1197  ==			 const char *stop, sopno startst, sopno stopst);
1198  == #endif
1199  */
1200 static void
1201 at(	struct match *m,
1202 	const char *title,
1203 	const char *start,
1204 	const char *stop,
1205 	sopno startst,
1206 	sopno stopst)
1207 {
1208 
1209 	_DIAGASSERT(m != NULL);
1210 	_DIAGASSERT(title != NULL);
1211 	_DIAGASSERT(start != NULL);
1212 	_DIAGASSERT(stop != NULL);
1213 
1214 	if (!(m->eflags&REG_TRACE))
1215 		return;
1216 
1217 	printf("%s %s-", title, pchar(*start));
1218 	printf("%s ", pchar(*stop));
1219 	printf("%ld-%ld\n", (long)startst, (long)stopst);
1220 }
1221 
1222 #ifndef PCHARDONE
1223 #define	PCHARDONE	/* never again */
1224 /*
1225  - pchar - make a character printable
1226  == #ifdef REDEBUG
1227  == static const char *pchar(int ch);
1228  == #endif
1229  *
1230  * Is this identical to regchar() over in debug.c?  Well, yes.  But a
1231  * duplicate here avoids having a debugging-capable regexec.o tied to
1232  * a matching debug.o, and this is convenient.  It all disappears in
1233  * the non-debug compilation anyway, so it doesn't matter much.
1234  */
1235 static const char *		/* -> representation */
1236 pchar(int ch)
1237 {
1238 	static char pbuf[10];
1239 
1240 	if (isprint((uch)ch) || ch == ' ')
1241 		snprintf(pbuf, sizeof(pbuf), "%c", ch);
1242 	else
1243 		snprintf(pbuf, sizeof(pbuf), "\\%o", ch);
1244 	return(pbuf);
1245 }
1246 #endif
1247 #endif
1248 
1249 #undef	stepback
1250 #undef	matcher
1251 #undef	walk
1252 #undef	dissect
1253 #undef	backref
1254 #undef	step
1255 #undef	print
1256 #undef	at
1257 #undef	match
1258