xref: /netbsd-src/lib/libc/regex/engine.c (revision 2cf99de68085fe08ba68bc84365a68443d1990b3)
1 /* $NetBSD: engine.c,v 1.27 2021/02/24 18:13:21 christos Exp $ */
2 
3 /*-
4  * SPDX-License-Identifier: BSD-3-Clause
5  *
6  * Copyright (c) 1992, 1993, 1994 Henry Spencer.
7  * Copyright (c) 1992, 1993, 1994
8  *	The Regents of the University of California.  All rights reserved.
9  *
10  * This code is derived from software contributed to Berkeley by
11  * Henry Spencer.
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions
15  * are met:
16  * 1. Redistributions of source code must retain the above copyright
17  *    notice, this list of conditions and the following disclaimer.
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in the
20  *    documentation and/or other materials provided with the distribution.
21  * 3. Neither the name of the University nor the names of its contributors
22  *    may be used to endorse or promote products derived from this software
23  *    without specific prior written permission.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35  * SUCH DAMAGE.
36  *
37  *	@(#)engine.c	8.5 (Berkeley) 3/20/94
38  */
39 
40 #include <sys/cdefs.h>
41 #ifdef __FBSDID
42 __FBSDID("$FreeBSD: head/lib/libc/regex/engine.c 368358 2020-12-05 03:16:05Z kevans $");
43 #endif
44 __RCSID("$NetBSD: engine.c,v 1.27 2021/02/24 18:13:21 christos Exp $");
45 
46 #include <stdbool.h>
47 
48 /*
49  * The matching engine and friends.  This file is #included by regexec.c
50  * after suitable #defines of a variety of macros used herein, so that
51  * different state representations can be used without duplicating masses
52  * of code.
53  */
54 
55 #ifdef SNAMES
56 #define	stepback sstepback
57 #define	matcher	smatcher
58 #define	walk	swalk
59 #define	dissect	sdissect
60 #define	backref	sbackref
61 #define	step	sstep
62 #define	print	sprint
63 #define	at	sat
64 #define	match	smat
65 #endif
66 #ifdef LNAMES
67 #define	stepback lstepback
68 #define	matcher	lmatcher
69 #define	walk	lwalk
70 #define	dissect	ldissect
71 #define	backref	lbackref
72 #define	step	lstep
73 #define	print	lprint
74 #define	at	lat
75 #define	match	lmat
76 #endif
77 #ifdef MNAMES
78 #define	stepback mstepback
79 #define	matcher	mmatcher
80 #define	walk	mwalk
81 #define	dissect	mdissect
82 #define	backref	mbackref
83 #define	step	mstep
84 #define	print	mprint
85 #define	at	mat
86 #define	match	mmat
87 #endif
88 
89 /* another structure passed up and down to avoid zillions of parameters */
90 struct match {
91 	struct re_guts *g;
92 	int eflags;
93 	regmatch_t *pmatch;	/* [nsub+1] (0 element unused) */
94 	const char *offp;	/* offsets work from here */
95 	const char *beginp;	/* start of string -- virtual NUL precedes */
96 	const char *endp;	/* end of string -- virtual NUL here */
97 	const char *coldp;	/* can be no match starting before here */
98 	const char **lastpos;	/* [nplus+1] */
99 	STATEVARS;
100 	states st;		/* current states */
101 	states fresh;		/* states for a fresh start */
102 	states tmp;		/* temporary */
103 	states empty;		/* empty set of states */
104 	mbstate_t mbs;		/* multibyte conversion state */
105 };
106 
107 /* ========= begin header generated by ./mkh ========= */
108 #ifdef __cplusplus
109 extern "C" {
110 #endif
111 
112 /* === engine.c === */
113 static int matcher(struct re_guts *g, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags);
114 static const char *dissect(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst);
115 static const char *backref(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, sopno lev, int);
116 static const char *walk(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, bool fast);
117 static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_t ch, states aft, int sflags);
118 #define MAX_RECURSION	100
119 #define	BOL	(OUT-1)
120 #define	EOL	(BOL-1)
121 #define	BOLEOL	(BOL-2)
122 #define	NOTHING	(BOL-3)
123 #define	BOW	(BOL-4)
124 #define	EOW	(BOL-5)
125 #define	BADCHAR	(BOL-6)
126 #define	NWBND	(BOL-7)
127 #define	NONCHAR(c)	((c) <= OUT)
128 /* sflags */
129 #define	SBOS	0x0001
130 #define	SEOS	0x0002
131 
132 #ifdef REDEBUG
133 static void print(struct match *m, const char *caption, states st, int ch, FILE *d);
134 #endif
135 #ifdef REDEBUG
136 static void at(struct match *m, const char *title, const char *start, const char *stop, sopno startst, sopno stopst);
137 #endif
138 #ifdef REDEBUG
139 static const char *pchar(int ch);
140 #endif
141 
142 #ifdef __cplusplus
143 }
144 #endif
145 /* ========= end header generated by ./mkh ========= */
146 
147 #ifdef REDEBUG
148 #define	SP(t, s, c)	print(m, t, s, c, stdout)
149 #define	AT(t, p1, p2, s1, s2)	at(m, t, p1, p2, s1, s2)
150 #define	NOTE(str)	{ if (m->eflags&REG_TRACE) printf("=%s\n", (str)); }
151 #else
152 #define	SP(t, s, c)	/* nothing */
153 #define	AT(t, p1, p2, s1, s2)	/* nothing */
154 #define	NOTE(s)	/* nothing */
155 #endif
156 
157 /*
158  * Given a multibyte string pointed to by start, step back nchar characters
159  * from current position pointed to by cur.
160  */
161 static const char *
162 stepback(const char *start, const char *cur, int nchar)
163 {
164 	const char *ret;
165 	size_t wc, mbc;
166 	mbstate_t mbs;
167 	size_t clen;
168 
169 	if (MB_CUR_MAX == 1)
170 		return ((cur - nchar) > start ? cur - nchar : NULL);
171 
172 	ret = cur;
173 	for (wc = nchar; wc > 0; wc--) {
174 		for (mbc = 1; mbc <= MB_CUR_MAX; mbc++) {
175 			if ((ret - mbc) < start)
176 				return (NULL);
177 			memset(&mbs, 0, sizeof(mbs));
178 			clen = mbrtowc(NULL, ret - mbc, mbc, &mbs);
179 			if (clen != (size_t)-1 && clen != (size_t)-2)
180 				break;
181 		}
182 		if (mbc > MB_CUR_MAX)
183 			return (NULL);
184 		ret -= mbc;
185 	}
186 
187 	return (ret);
188 }
189 
190 /*
191  - matcher - the actual matching engine
192  == static int matcher(struct re_guts *g, const char *string, \
193  ==	size_t nmatch, regmatch_t pmatch[], int eflags);
194  */
195 static int			/* 0 success, REG_NOMATCH failure */
196 matcher(struct re_guts *g,
197 	const char *string,
198 	size_t nmatch,
199 	regmatch_t pmatch[],
200 	int eflags)
201 {
202 	const char *endp;
203 	size_t i;
204 	struct match mv;
205 	struct match *m = &mv;
206 	const char *dp = NULL;
207 	const sopno gf = g->firststate+1;	/* +1 for OEND */
208 	const sopno gl = g->laststate;
209 	const char *start;
210 	const char *stop;
211 	/* Boyer-Moore algorithms variables */
212 	const char *pp;
213 	size_t cj, mj;
214 	const char *mustfirst;
215 	const char *mustlast;
216 	size_t *matchjump;
217 	size_t *charjump;
218 	int error = 0;
219 
220 	_DIAGASSERT(g != NULL);
221 	_DIAGASSERT(string != NULL);
222 	/* pmatch checked below */
223 
224 	/* simplify the situation where possible */
225 	if (g->cflags&REG_NOSUB)
226 		nmatch = 0;
227 	if (eflags&REG_STARTEND) {
228 		_DIAGASSERT(pmatch != NULL);
229 		start = string + (size_t)pmatch[0].rm_so;
230 		stop = string + (size_t)pmatch[0].rm_eo;
231 	} else {
232 		start = string;
233 		stop = start + strlen(start);
234 	}
235 	if (stop < start)
236 		return(REG_INVARG);
237 
238 	/* prescreening; this does wonders for this rather slow code */
239 	if (g->must != NULL) {
240 		if (g->charjump != NULL && g->matchjump != NULL) {
241 			mustfirst = g->must;
242 			mustlast = g->must + g->mlen - 1;
243 			charjump = g->charjump;
244 			matchjump = g->matchjump;
245 			pp = mustlast;
246 			for (dp = start+g->mlen-1; dp < stop;) {
247 				/* Fast skip non-matches */
248 				while (dp < stop && charjump[(int)*dp])
249 					dp += charjump[(int)*dp];
250 
251 				if (dp >= stop)
252 					break;
253 
254 				/* Greedy matcher */
255 				/* We depend on not being used for
256 				 * for strings of length 1
257 				 */
258 				while (*--dp == *--pp && pp != mustfirst);
259 
260 				if (*dp == *pp)
261 					break;
262 
263 				/* Jump to next possible match */
264 				mj = matchjump[pp - mustfirst];
265 				cj = charjump[(int)*dp];
266 				dp += (cj < mj ? mj : cj);
267 				pp = mustlast;
268 			}
269 			if (pp != mustfirst)
270 				return(REG_NOMATCH);
271 		} else {
272 			for (dp = start; dp < stop; dp++)
273 				if (*dp == g->must[0] &&
274 				    (size_t)(stop - dp) >= g->mlen &&
275 				    memcmp(dp, g->must, (size_t)g->mlen) == 0)
276 					break;
277 			if (dp == stop)		/* we didn't find g->must */
278 				return(REG_NOMATCH);
279 		}
280 	}
281 
282 	/* match struct setup */
283 	m->g = g;
284 	m->eflags = eflags;
285 	m->pmatch = NULL;
286 	m->lastpos = NULL;
287 	m->offp = string;
288 	m->beginp = start;
289 	m->endp = stop;
290 	STATESETUP(m, 4);
291 	SETUP(m->st);
292 	SETUP(m->fresh);
293 	SETUP(m->tmp);
294 	SETUP(m->empty);
295 	CLEAR(m->empty);
296 	ZAPSTATE(&m->mbs);
297 
298 	/* Adjust start according to moffset, to speed things up */
299 	if (dp != NULL && g->moffset > -1) {
300 		const char *nstart;
301 
302 		nstart = stepback(start, dp, g->moffset);
303 		if (nstart != NULL)
304 			start = nstart;
305 	}
306 
307 	SP("mloop", m->st, *start);
308 
309 	/* this loop does only one repetition except for backrefs */
310 	for (;;) {
311 		endp = walk(m, start, stop, gf, gl, true);
312 		if (endp == NULL) {		/* a miss */
313 			error = REG_NOMATCH;
314 			goto done;
315 		}
316 		if (nmatch == 0 && !g->backrefs)
317 			break;		/* no further info needed */
318 
319 		/* where? */
320 		assert(m->coldp != NULL);
321 		for (;;) {
322 			NOTE("finding start");
323 			endp = walk(m, m->coldp, stop, gf, gl, false);
324 			if (endp != NULL)
325 				break;
326 			assert(m->coldp < m->endp);
327 			m->coldp += XMBRTOWC(NULL, m->coldp,
328 			    (size_t)(m->endp - m->coldp), &m->mbs, 0);
329 		}
330 		if (nmatch == 1 && !g->backrefs)
331 			break;		/* no further info needed */
332 
333 		/* oh my, he wants the subexpressions... */
334 		if (m->pmatch == NULL)
335 			m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) *
336 							sizeof(regmatch_t));
337 		if (m->pmatch == NULL) {
338 			error = REG_ESPACE;
339 			goto done;
340 		}
341 		for (i = 1; i <= m->g->nsub; i++)
342 			m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1;
343 		if (!g->backrefs && !(m->eflags&REG_BACKR)) {
344 			NOTE("dissecting");
345 			dp = dissect(m, m->coldp, endp, gf, gl);
346 		} else {
347 			if (g->nplus > 0 && m->lastpos == NULL)
348 				m->lastpos = malloc((g->nplus+1) *
349 						sizeof(const char *));
350 			if (g->nplus > 0 && m->lastpos == NULL) {
351 				error = REG_ESPACE;
352 				goto done;
353 			}
354 			NOTE("backref dissect");
355 			dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0);
356 		}
357 		if (dp != NULL)
358 			break;
359 
360 		/* uh-oh... we couldn't find a subexpression-level match */
361 		assert(g->backrefs);	/* must be back references doing it */
362 		assert(g->nplus == 0 || m->lastpos != NULL);
363 		for (;;) {
364 			if (dp != NULL || endp <= m->coldp)
365 				break;		/* defeat */
366 			NOTE("backoff");
367 			endp = walk(m, m->coldp, endp-1, gf, gl, false);
368 			if (endp == NULL)
369 				break;		/* defeat */
370 			/* try it on a shorter possibility */
371 #ifndef NDEBUG
372 			for (i = 1; i <= m->g->nsub; i++) {
373 				assert(m->pmatch[i].rm_so == (regoff_t)-1);
374 				assert(m->pmatch[i].rm_eo == (regoff_t)-1);
375 			}
376 #endif
377 			NOTE("backoff dissect");
378 			dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0);
379 		}
380 		assert(dp == NULL || dp == endp);
381 		if (dp != NULL)		/* found a shorter one */
382 			break;
383 
384 		/* despite initial appearances, there is no match here */
385 		NOTE("false alarm");
386 		/* recycle starting later */
387 		start = m->coldp + XMBRTOWC(NULL, m->coldp,
388 		    (size_t)(stop - m->coldp), &m->mbs, 0);
389 		assert(start <= stop);
390 	}
391 
392 	/* fill in the details if requested */
393 	if (nmatch > 0) {
394 		_DIAGASSERT(pmatch != NULL);
395 		pmatch[0].rm_so = m->coldp - m->offp;
396 		pmatch[0].rm_eo = endp - m->offp;
397 	}
398 	if (nmatch > 1) {
399 		assert(m->pmatch != NULL);
400 		for (i = 1; i < nmatch; i++)
401 			if (i <= m->g->nsub)
402 				pmatch[i] = m->pmatch[i];
403 			else {
404 				pmatch[i].rm_so = (regoff_t)-1;
405 				pmatch[i].rm_eo = (regoff_t)-1;
406 			}
407 	}
408 
409 done:
410 	if (m->pmatch != NULL) {
411 		free(m->pmatch);
412 		m->pmatch = NULL;
413 	}
414 	if (m->lastpos != NULL) {
415 		free(__UNCONST(m->lastpos));
416 		m->lastpos = NULL;
417 	}
418 	STATETEARDOWN(m);
419 	return error;
420 }
421 
422 /*
423  - dissect - figure out what matched what, no back references
424  == static const char *dissect(struct match *m, const char *start, \
425  ==	const char *stop, sopno startst, sopno stopst);
426  */
427 static const char *		/* == stop (success) always */
428 dissect(
429 	struct match *m,
430 	const char *start,
431 	const char *stop,
432 	sopno startst,
433 	sopno stopst)
434 {
435 	int i;
436 	sopno ss;		/* start sop of current subRE */
437 	sopno es;		/* end sop of current subRE */
438 	const char *sp;		/* start of string matched by it */
439 	const char *stp;	/* string matched by it cannot pass here */
440 	const char *rest;	/* start of rest of string */
441 	const char *tail;	/* string unmatched by rest of RE */
442 	sopno ssub;		/* start sop of subsubRE */
443 	sopno esub;		/* end sop of subsubRE */
444 	const char *ssp;	/* start of string matched by subsubRE */
445 	const char *sep;	/* end of string matched by subsubRE */
446 	const char *oldssp;	/* previous ssp */
447 	const char *dp __unused;
448 
449 	_DIAGASSERT(m != NULL);
450 	_DIAGASSERT(start != NULL);
451 	_DIAGASSERT(stop != NULL);
452 
453 	AT("diss", start, stop, startst, stopst);
454 	sp = start;
455 	for (ss = startst; ss < stopst; ss = es) {
456 		/* identify end of subRE */
457 		es = ss;
458 		switch (OP(m->g->strip[es])) {
459 		case OPLUS_:
460 		case OQUEST_:
461 			es += OPND(m->g->strip[es]);
462 			break;
463 		case OCH_:
464 			while (OP(m->g->strip[es]) != O_CH)
465 				es += OPND(m->g->strip[es]);
466 			break;
467 		}
468 		es++;
469 
470 		/* figure out what it matched */
471 		switch (OP(m->g->strip[ss])) {
472 		case OEND:
473 			assert(nope);
474 			break;
475 		case OCHAR:
476 			sp += XMBRTOWC(NULL, sp, (size_t)(stop - start),
477 			    &m->mbs, 0);
478 			break;
479 		case OBOL:
480 		case OEOL:
481 		case OBOW:
482 		case OEOW:
483 		case OBOS:
484 		case OEOS:
485 		case OWBND:
486 		case ONWBND:
487 			break;
488 		case OANY:
489 		case OANYOF:
490 			sp += XMBRTOWC(NULL, sp, (size_t)(stop - start),
491 			    &m->mbs, 0);
492 			break;
493 		case OBACK_:
494 		case O_BACK:
495 			assert(nope);
496 			break;
497 		/* cases where length of match is hard to find */
498 		case OQUEST_:
499 			stp = stop;
500 			for (;;) {
501 				/* how long could this one be? */
502 				rest = walk(m, sp, stp, ss, es, false);
503 				assert(rest != NULL);	/* it did match */
504 				/* could the rest match the rest? */
505 				tail = walk(m, rest, stop, es, stopst, false);
506 				if (tail == stop)
507 					break;		/* yes! */
508 				/* no -- try a shorter match for this one */
509 				stp = rest - 1;
510 				assert(stp >= sp);	/* it did work */
511 			}
512 			ssub = ss + 1;
513 			esub = es - 1;
514 			/* did innards match? */
515 			if (walk(m, sp, rest, ssub, esub, false) != NULL) {
516 				dp = dissect(m, sp, rest, ssub, esub);
517 				assert(dp == rest);
518 			} else		/* no */
519 				assert(sp == rest);
520 			sp = rest;
521 			break;
522 		case OPLUS_:
523 			stp = stop;
524 			for (;;) {
525 				/* how long could this one be? */
526 				rest = walk(m, sp, stp, ss, es, false);
527 				assert(rest != NULL);	/* it did match */
528 				/* could the rest match the rest? */
529 				tail = walk(m, rest, stop, es, stopst, false);
530 				if (tail == stop)
531 					break;		/* yes! */
532 				/* no -- try a shorter match for this one */
533 				stp = rest - 1;
534 				assert(stp >= sp);	/* it did work */
535 			}
536 			ssub = ss + 1;
537 			esub = es - 1;
538 			ssp = sp;
539 			oldssp = ssp;
540 			for (;;) {	/* find last match of innards */
541 				sep = walk(m, ssp, rest, ssub, esub, false);
542 				if (sep == NULL || sep == ssp)
543 					break;	/* failed or matched null */
544 				oldssp = ssp;	/* on to next try */
545 				ssp = sep;
546 			}
547 			if (sep == NULL) {
548 				/* last successful match */
549 				sep = ssp;
550 				ssp = oldssp;
551 			}
552 			assert(sep == rest);	/* must exhaust substring */
553 			assert(walk(m, ssp, sep, ssub, esub, false) == rest);
554 			dp = dissect(m, ssp, sep, ssub, esub);
555 			assert(dp == sep);
556 			sp = rest;
557 			break;
558 		case OCH_:
559 			stp = stop;
560 			for (;;) {
561 				/* how long could this one be? */
562 				rest = walk(m, sp, stp, ss, es, false);
563 				assert(rest != NULL);	/* it did match */
564 				/* could the rest match the rest? */
565 				tail = walk(m, rest, stop, es, stopst, false);
566 				if (tail == stop)
567 					break;		/* yes! */
568 				/* no -- try a shorter match for this one */
569 				stp = rest - 1;
570 				assert(stp >= sp);	/* it did work */
571 			}
572 			ssub = ss + 1;
573 			esub = ss + OPND(m->g->strip[ss]) - 1;
574 			assert(OP(m->g->strip[esub]) == OOR1);
575 			for (;;) {	/* find first matching branch */
576 				if (walk(m, sp, rest, ssub, esub, false) == rest)
577 					break;	/* it matched all of it */
578 				/* that one missed, try next one */
579 				assert(OP(m->g->strip[esub]) == OOR1);
580 				esub++;
581 				assert(OP(m->g->strip[esub]) == OOR2);
582 				ssub = esub + 1;
583 				esub += OPND(m->g->strip[esub]);
584 				if (OP(m->g->strip[esub]) == OOR2)
585 					esub--;
586 				else
587 					assert(OP(m->g->strip[esub]) == O_CH);
588 			}
589 			dp = dissect(m, sp, rest, ssub, esub);
590 			assert(dp == rest);
591 			sp = rest;
592 			break;
593 		case O_PLUS:
594 		case O_QUEST:
595 		case OOR1:
596 		case OOR2:
597 		case O_CH:
598 			assert(nope);
599 			break;
600 		case OLPAREN:
601 			i = OPND(m->g->strip[ss]);
602 			assert(0 < i && i <= m->g->nsub);
603 			m->pmatch[i].rm_so = sp - m->offp;
604 			break;
605 		case ORPAREN:
606 			i = OPND(m->g->strip[ss]);
607 			assert(0 < i && i <= m->g->nsub);
608 			m->pmatch[i].rm_eo = sp - m->offp;
609 			break;
610 		default:		/* uh oh */
611 			assert(nope);
612 			break;
613 		}
614 	}
615 
616 	assert(sp == stop);
617 	return(sp);
618 }
619 
620 #define	ISBOW(m, sp)					\
621     (sp < m->endp && ISWORD(*sp) &&			\
622     ((sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||	\
623     (sp > m->offp && !ISWORD(*(sp-1)))))
624 #define	ISEOW(m, sp)					\
625     (((sp == m->endp && !(m->eflags&REG_NOTEOL)) ||	\
626     (sp < m->endp && *sp == '\n' &&			\
627     (m->g->cflags&REG_NEWLINE)) ||			\
628     (sp < m->endp && !ISWORD(*sp)) ) &&			\
629     (sp > m->beginp && ISWORD(*(sp-1))))		\
630 
631 /*
632  - backref - figure out what matched what, figuring in back references
633  == static const char *backref(struct match *m, const char *start, \
634  ==	const char *stop, sopno startst, sopno stopst, sopno lev);
635  */
636 static const char *		/* == stop (success) or NULL (failure) */
637 backref(
638 	struct match *m,
639 	const char *start,
640 	const char *stop,
641 	sopno startst,
642 	sopno stopst,
643 	sopno lev,		/* PLUS nesting level */
644 	int rec)
645 {
646 	int i;
647 	sopno ss;		/* start sop of current subRE */
648 	const char *sp;		/* start of string matched by it */
649 	sopno ssub;		/* start sop of subsubRE */
650 	sopno esub;		/* end sop of subsubRE */
651 	const char *ssp;	/* start of string matched by subsubRE */
652 	const char *dp;
653 	size_t len;
654 	int hard;
655 	sop s;
656 	regoff_t offsave;
657 	cset *cs;
658 	wint_t wc;
659 
660 	_DIAGASSERT(m != NULL);
661 	_DIAGASSERT(start != NULL);
662 	_DIAGASSERT(stop != NULL);
663 
664 	AT("back", start, stop, startst, stopst);
665 	sp = start;
666 
667 	/* get as far as we can with easy stuff */
668 	hard = 0;
669 	for (ss = startst; !hard && ss < stopst; ss++)
670 		switch (OP(s = m->g->strip[ss])) {
671 		case OCHAR:
672 			if (sp == stop)
673 				return(NULL);
674 			sp += XMBRTOWC(&wc, sp, (size_t)(stop - sp),
675 			    &m->mbs, BADCHAR);
676 			if (wc != (wint_t)OPND(s))
677 				return(NULL);
678 			break;
679 		case OANY:
680 			if (sp == stop)
681 				return(NULL);
682 			sp += XMBRTOWC(&wc, sp, (size_t)(stop - sp),
683 			    &m->mbs, BADCHAR);
684 			if (wc == BADCHAR)
685 				return (NULL);
686 			break;
687 		case OANYOF:
688 			if (sp == stop)
689 				return (NULL);
690 			cs = &m->g->sets[OPND(s)];
691 			sp += XMBRTOWC(&wc, sp, (size_t)(stop - sp),
692 			    &m->mbs, BADCHAR);
693 			if (wc == BADCHAR || !CHIN(cs, wc))
694 				return(NULL);
695 			break;
696 		case OBOS:
697 			if (sp == m->beginp && (m->eflags & REG_NOTBOL) == 0)
698 				{ /* yes */ }
699 			else
700 				return(NULL);
701 			break;
702 		case OEOS:
703 			if (sp == m->endp && (m->eflags & REG_NOTEOL) == 0)
704 				{ /* yes */ }
705 			else
706 				return(NULL);
707 			break;
708 		case OBOL:
709 			if ((sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
710 			    (sp > m->offp && sp < m->endp &&
711 			    *(sp-1) == '\n' && (m->g->cflags&REG_NEWLINE)))
712 				{ /* yes */ }
713 			else
714 				return(NULL);
715 			break;
716 		case OEOL:
717 			if ( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
718 					(sp < m->endp && *sp == '\n' &&
719 						(m->g->cflags&REG_NEWLINE)) )
720 				{ /* yes */ }
721 			else
722 				return(NULL);
723 			break;
724 		case OWBND:
725 			if (ISBOW(m, sp) || ISEOW(m, sp))
726 				{ /* yes */ }
727 			else
728 				return(NULL);
729 			break;
730 		case ONWBND:
731 			if (((sp == m->beginp) && !ISWORD(*sp)) ||
732 			    (sp == m->endp && !ISWORD(*(sp - 1))))
733 				{ /* yes, beginning/end of subject */ }
734 			else if (ISWORD(*(sp - 1)) == ISWORD(*sp))
735 				{ /* yes, beginning/end of subject */ }
736 			else
737 				return(NULL);
738 			break;
739 		case OBOW:
740 			if (ISBOW(m, sp))
741 				{ /* yes */ }
742 			else
743 				return(NULL);
744 			break;
745 		case OEOW:
746 			if (ISEOW(m, sp))
747 				{ /* yes */ }
748 			else
749 				return(NULL);
750 			break;
751 		case O_QUEST:
752 			break;
753 		case OOR1:	/* matches null but needs to skip */
754 			ss++;
755 			s = m->g->strip[ss];
756 			do {
757 				assert(OP(s) == OOR2);
758 				ss += OPND(s);
759 			} while (OP(s = m->g->strip[ss]) != O_CH);
760 			/* note that the ss++ gets us past the O_CH */
761 			break;
762 		default:	/* have to make a choice */
763 			hard = 1;
764 			break;
765 		}
766 	if (!hard) {		/* that was it! */
767 		if (sp != stop)
768 			return(NULL);
769 		return(sp);
770 	}
771 	ss--;			/* adjust for the for's final increment */
772 
773 	/* the hard stuff */
774 	AT("hard", sp, stop, ss, stopst);
775 	s = m->g->strip[ss];
776 	switch (OP(s)) {
777 	case OBACK_:		/* the vilest depths */
778 		i = OPND(s);
779 		assert(0 < i && i <= m->g->nsub);
780 		if (m->pmatch[i].rm_eo == -1)
781 			return(NULL);
782 		assert(m->pmatch[i].rm_so != -1);
783 		len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so;
784 		if (len == 0 && rec++ > MAX_RECURSION)
785 			return(NULL);
786 		assert(stop - m->beginp >= len);
787 		if (sp > stop - len)
788 			return(NULL);	/* not enough left to match */
789 		ssp = m->offp + m->pmatch[i].rm_so;
790 		if (memcmp(sp, ssp, len) != 0)
791 			return(NULL);
792 		while (m->g->strip[ss] != SOP(O_BACK, i))
793 			ss++;
794 		return(backref(m, sp+len, stop, ss+1, stopst, lev, rec));
795 	case OQUEST_:		/* to null or not */
796 		dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
797 		if (dp != NULL)
798 			return(dp);	/* not */
799 		return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec));
800 	case OPLUS_:
801 		assert(m->lastpos != NULL);
802 		assert(lev+1 <= m->g->nplus);
803 		m->lastpos[lev+1] = sp;
804 		return(backref(m, sp, stop, ss+1, stopst, lev+1, rec));
805 	case O_PLUS:
806 		if (sp == m->lastpos[lev])	/* last pass matched null */
807 			return(backref(m, sp, stop, ss+1, stopst, lev-1, rec));
808 		/* try another pass */
809 		m->lastpos[lev] = sp;
810 		dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec);
811 		if (dp == NULL)
812 			return(backref(m, sp, stop, ss+1, stopst, lev-1, rec));
813 		else
814 			return(dp);
815 	case OCH_:		/* find the right one, if any */
816 		ssub = ss + 1;
817 		esub = ss + OPND(s) - 1;
818 		assert(OP(m->g->strip[esub]) == OOR1);
819 		for (;;) {	/* find first matching branch */
820 			dp = backref(m, sp, stop, ssub, esub, lev, rec);
821 			if (dp != NULL)
822 				return(dp);
823 			/* that one missed, try next one */
824 			if (OP(m->g->strip[esub]) == O_CH)
825 				return(NULL);	/* there is none */
826 			esub++;
827 			assert(OP(m->g->strip[esub]) == OOR2);
828 			ssub = esub + 1;
829 			esub += OPND(m->g->strip[esub]);
830 			if (OP(m->g->strip[esub]) == OOR2)
831 				esub--;
832 			else
833 				assert(OP(m->g->strip[esub]) == O_CH);
834 		}
835 		/* NOTREACHED */
836 		break;
837 	case OLPAREN:		/* must undo assignment if rest fails */
838 		i = OPND(s);
839 		assert(0 < i && i <= m->g->nsub);
840 		offsave = m->pmatch[i].rm_so;
841 		m->pmatch[i].rm_so = sp - m->offp;
842 		dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
843 		if (dp != NULL)
844 			return(dp);
845 		m->pmatch[i].rm_so = offsave;
846 		return(NULL);
847 	case ORPAREN:		/* must undo assignment if rest fails */
848 		i = OPND(s);
849 		assert(0 < i && i <= m->g->nsub);
850 		offsave = m->pmatch[i].rm_eo;
851 		m->pmatch[i].rm_eo = sp - m->offp;
852 		dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
853 		if (dp != NULL)
854 			return(dp);
855 		m->pmatch[i].rm_eo = offsave;
856 		return(NULL);
857 	default:		/* uh oh */
858 		assert(nope);
859 		break;
860 	}
861 
862 	/* "can't happen" */
863 	assert(nope);
864 	/* NOTREACHED */
865 	return "shut up gcc";
866 }
867 
868 /*
869  - walk - step through the string either quickly or slowly
870  == static const char *walk(struct match *m, const char *start, \
871  ==	const char *stop, sopno startst, sopno stopst, bool fast);
872  */
873 static const char * /* where it ended, or NULL */
874 walk(struct match *m, const char *start, const char *stop, sopno startst,
875 	sopno stopst, bool fast)
876 {
877 	states st = m->st;
878 	states fresh = m->fresh;
879 	states empty = m->empty;
880 	states tmp = m->tmp;
881 	const char *p = start;
882 	wint_t c;
883 	wint_t lastc;		/* previous c */
884 	wint_t flagch;
885 	int sflags;
886 	const char *matchp;	/* last p at which a match ended */
887 	size_t i, clen;
888 
889 	_DIAGASSERT(m != NULL);
890 	_DIAGASSERT(start != NULL);
891 	_DIAGASSERT(stop != NULL);
892 
893 	sflags = 0;
894 	AT("walk", start, stop, startst, stopst);
895 	CLEAR(st);
896 	SET1(st, startst);
897 	SP("sstart", st, *p);
898 	st = step(m->g, startst, stopst, st, NOTHING, st, sflags);
899 	if (fast)
900 		ASSIGN(fresh, st);
901 	matchp = NULL;
902 	if (start == m->offp || (start == m->beginp && !(m->eflags&REG_NOTBOL)))
903 		c = OUT;
904 	else {
905 		/*
906 		 * XXX Wrong if the previous character was multi-byte.
907 		 * Newline never is (in encodings supported by FreeBSD),
908 		 * so this only breaks the ISWORD tests below.
909 		 */
910 		c = (uch)*(start - 1);
911 	}
912 	for (;;) {
913 		/* next character */
914 		lastc = c;
915 		sflags = 0;
916 		if (p == m->endp) {
917 			c = OUT;
918 			clen = 0;
919 		} else
920 			clen = XMBRTOWC(&c, p, (size_t)(m->endp - p),
921 			    &m->mbs, BADCHAR);
922 
923 		if (fast && EQ(st, fresh))
924 			matchp = p;
925 
926 		/* is there an EOL and/or BOL between lastc and c? */
927 		flagch = '\0';
928 		i = 0;
929 		if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
930 				(lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
931 			flagch = BOL;
932 			i = m->g->nbol;
933 		}
934 		if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
935 				(c == OUT && !(m->eflags&REG_NOTEOL)) ) {
936 			flagch = (flagch == BOL) ? BOLEOL : EOL;
937 			i += m->g->neol;
938 		}
939 		if (lastc == OUT && (m->eflags & REG_NOTBOL) == 0) {
940 			sflags |= SBOS;
941 			/* Step one more for BOS. */
942 			i++;
943 		}
944 		if (c == OUT && (m->eflags & REG_NOTEOL) == 0) {
945 			sflags |= SEOS;
946 			/* Step one more for EOS. */
947 			i++;
948 		}
949 		if (i != 0) {
950 			for (; i > 0; i--)
951 				st = step(m->g, startst, stopst, st, flagch, st,
952 				    sflags);
953 			SP("sboleol", st, c);
954 		}
955 
956 		/* how about a word boundary? */
957 		if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
958 					(c != OUT && ISWORD(c)) ) {
959 			flagch = BOW;
960 		}
961 		if ( (lastc != OUT && ISWORD(lastc)) &&
962 				(flagch == EOL || (c != OUT && !ISWORD(c))) ) {
963 			flagch = EOW;
964 		}
965 		if (flagch == BOW || flagch == EOW) {
966 			st = step(m->g, startst, stopst, st, flagch, st, sflags);
967 			SP("sboweow", st, c);
968 		}
969 		if (lastc != OUT && c != OUT &&
970 		    ISWORD(lastc) == ISWORD(c)) {
971 			flagch = NWBND;
972 		} else if ((lastc == OUT && !ISWORD(c)) ||
973 		    (c == OUT && !ISWORD(lastc))) {
974 			flagch = NWBND;
975 		}
976 		if (flagch == NWBND) {
977 			st = step(m->g, startst, stopst, st, flagch, st, sflags);
978 			SP("snwbnd", st, c);
979 		}
980 
981 		/* are we done? */
982 		if (ISSET(st, stopst)) {
983 			if (fast)
984 				break;
985 			else
986 				matchp = p;
987 		}
988 		if (EQ(st, empty) || p == stop || clen > (size_t)(stop - p))
989 			break;		/* NOTE BREAK OUT */
990 
991 		/* no, we must deal with this character */
992 		ASSIGN(tmp, st);
993 		if (fast)
994 			ASSIGN(st, fresh);
995 		else
996 			ASSIGN(st, empty);
997 		assert(c != OUT);
998 		st = step(m->g, startst, stopst, tmp, c, st, sflags);
999 		SP("saft", st, c);
1000 		assert(EQ(step(m->g, startst, stopst, st, NOTHING, st, sflags),
1001 		    st));
1002 		p += clen;
1003 	}
1004 
1005 	if (fast) {
1006 		assert(matchp != NULL);
1007 		m->coldp = matchp;
1008 		if (ISSET(st, stopst))
1009 			return (p + XMBRTOWC(NULL, p, (size_t)(stop - p),
1010 			    &m->mbs, 0));
1011 		else
1012 			return (NULL);
1013 	} else
1014 		return (matchp);
1015 }
1016 
1017 /*
1018  - step - map set of states reachable before char to set reachable after
1019  == static states step(struct re_guts *g, sopno start, sopno stop, \
1020  ==	states bef, int ch, states aft);
1021  == #define	BOL	(OUT-1)
1022  == #define	EOL	(BOL-1)
1023  == #define	BOLEOL	(BOL-2)
1024  == #define	NOTHING	(BOL-3)
1025  == #define	BOW	(BOL-4)
1026  == #define	EOW	(BOL-5)
1027  == #define	BADCHAR	(BOL-6)
1028  == #define	NONCHAR(c)	((c) <= OUT)
1029  */
1030 static states
1031 step(struct re_guts *g,
1032 	sopno start,		/* start state within strip */
1033 	sopno stop,		/* state after stop state within strip */
1034 	states bef,		/* states reachable before */
1035 	wint_t ch,		/* character or NONCHAR code */
1036 	states aft,		/* states already known reachable after */
1037 	int sflags)		/* state flags */
1038 {
1039 	cset *cs;
1040 	sop s;
1041 	sopno pc;
1042 	onestate here;		/* note, macros know this name */
1043 	sopno look;
1044 	int i;
1045 
1046 	_DIAGASSERT(g != NULL);
1047 
1048 	for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
1049 		s = g->strip[pc];
1050 		switch (OP(s)) {
1051 		case OEND:
1052 			assert(pc == stop-1);
1053 			break;
1054 		case OCHAR:
1055 			/* only characters can match */
1056 			assert(!NONCHAR(ch) || ch != OPND(s));
1057 			if (ch == (wint_t)OPND(s))
1058 				FWD(aft, bef, 1);
1059 			break;
1060 		case OBOS:
1061 			if ((ch == BOL || ch == BOLEOL) && (sflags & SBOS) != 0)
1062 				FWD(aft, bef, 1);
1063 			break;
1064 		case OEOS:
1065 			if ((ch == EOL || ch == BOLEOL) && (sflags & SEOS) != 0)
1066 				FWD(aft, bef, 1);
1067 			break;
1068 		case OBOL:
1069 			if (ch == BOL || ch == BOLEOL)
1070 				FWD(aft, bef, 1);
1071 			break;
1072 		case OEOL:
1073 			if (ch == EOL || ch == BOLEOL)
1074 				FWD(aft, bef, 1);
1075 			break;
1076 		case OBOW:
1077 			if (ch == BOW)
1078 				FWD(aft, bef, 1);
1079 			break;
1080 		case OEOW:
1081 			if (ch == EOW)
1082 				FWD(aft, bef, 1);
1083 			break;
1084 		case OWBND:
1085 			if (ch == BOW || ch == EOW)
1086 				FWD(aft, bef, 1);
1087 			break;
1088 		case ONWBND:
1089 			if (ch == NWBND)
1090 				FWD(aft, aft, 1);
1091 			break;
1092 		case OANY:
1093 			if (!NONCHAR(ch))
1094 				FWD(aft, bef, 1);
1095 			break;
1096 		case OANYOF:
1097 			cs = &g->sets[OPND(s)];
1098 			if (!NONCHAR(ch) && CHIN(cs, ch))
1099 				FWD(aft, bef, 1);
1100 			break;
1101 		case OBACK_:		/* ignored here */
1102 		case O_BACK:
1103 			FWD(aft, aft, 1);
1104 			break;
1105 		case OPLUS_:		/* forward, this is just an empty */
1106 			FWD(aft, aft, 1);
1107 			break;
1108 		case O_PLUS:		/* both forward and back */
1109 			FWD(aft, aft, 1);
1110 			i = ISSETBACK(aft, OPND(s));
1111 			BACK(aft, aft, OPND(s));
1112 			if (!i && ISSETBACK(aft, OPND(s))) {
1113 				/* oho, must reconsider loop body */
1114 				pc -= OPND(s) + 1;
1115 				INIT(here, pc);
1116 			}
1117 			break;
1118 		case OQUEST_:		/* two branches, both forward */
1119 			FWD(aft, aft, 1);
1120 			FWD(aft, aft, OPND(s));
1121 			break;
1122 		case O_QUEST:		/* just an empty */
1123 			FWD(aft, aft, 1);
1124 			break;
1125 		case OLPAREN:		/* not significant here */
1126 		case ORPAREN:
1127 			FWD(aft, aft, 1);
1128 			break;
1129 		case OCH_:		/* mark the first two branches */
1130 			FWD(aft, aft, 1);
1131 			assert(OP(g->strip[pc+OPND(s)]) == OOR2);
1132 			FWD(aft, aft, OPND(s));
1133 			break;
1134 		case OOR1:		/* done a branch, find the O_CH */
1135 			if (ISSTATEIN(aft, here)) {
1136 				for (look = 1;
1137 				    OP(s = g->strip[pc+look]) != O_CH;
1138 				    look += OPND(s))
1139 					assert(OP(s) == OOR2);
1140 				FWD(aft, aft, look + 1);
1141 			}
1142 			break;
1143 		case OOR2:		/* propagate OCH_'s marking */
1144 			FWD(aft, aft, 1);
1145 			if (OP(g->strip[pc+OPND(s)]) != O_CH) {
1146 				assert(OP(g->strip[pc+OPND(s)]) == OOR2);
1147 				FWD(aft, aft, OPND(s));
1148 			}
1149 			break;
1150 		case O_CH:		/* just empty */
1151 			FWD(aft, aft, 1);
1152 			break;
1153 		default:		/* ooooops... */
1154 			assert(nope);
1155 			break;
1156 		}
1157 	}
1158 
1159 	return(aft);
1160 }
1161 
1162 #ifdef REDEBUG
1163 /*
1164  - print - print a set of states
1165  == #ifdef REDEBUG
1166  == static void print(struct match *m, const char *caption, states st, \
1167  ==	int ch, FILE *d);
1168  == #endif
1169  */
1170 static void
1171 print(struct match *m,
1172 	const char *caption,
1173 	states st,
1174 	int ch,
1175 	FILE *d)
1176 {
1177 	struct re_guts *g = m->g;
1178 	sopno i;
1179 	int first = 1;
1180 
1181 	_DIAGASSERT(m != NULL);
1182 	_DIAGASSERT(caption != NULL);
1183 
1184 	if (!(m->eflags&REG_TRACE))
1185 		return;
1186 
1187 	_DIAGASSERT(d != NULL);
1188 
1189 	fprintf(d, "%s", caption);
1190 	if (ch != '\0')
1191 		fprintf(d, " %s", pchar(ch));
1192 	for (i = 0; i < g->nstates; i++)
1193 		if (ISSET(st, i)) {
1194 			fprintf(d, "%s%lu", (first) ? "\t" : ", ", i);
1195 			first = 0;
1196 		}
1197 	fprintf(d, "\n");
1198 }
1199 
1200 /*
1201  - at - print current situation
1202  == #ifdef REDEBUG
1203  == static void at(struct match *m, const char *title, const char *start, \
1204  ==			 const char *stop, sopno startst, sopno stopst);
1205  == #endif
1206  */
1207 static void
1208 at(	struct match *m,
1209 	const char *title,
1210 	const char *start,
1211 	const char *stop,
1212 	sopno startst,
1213 	sopno stopst)
1214 {
1215 
1216 	_DIAGASSERT(m != NULL);
1217 	_DIAGASSERT(title != NULL);
1218 	_DIAGASSERT(start != NULL);
1219 	_DIAGASSERT(stop != NULL);
1220 
1221 	if (!(m->eflags&REG_TRACE))
1222 		return;
1223 
1224 	printf("%s %s-", title, pchar(*start));
1225 	printf("%s ", pchar(*stop));
1226 	printf("%ld-%ld\n", (long)startst, (long)stopst);
1227 }
1228 
1229 #ifndef PCHARDONE
1230 #define	PCHARDONE	/* never again */
1231 /*
1232  - pchar - make a character printable
1233  == #ifdef REDEBUG
1234  == static const char *pchar(int ch);
1235  == #endif
1236  *
1237  * Is this identical to regchar() over in debug.c?  Well, yes.  But a
1238  * duplicate here avoids having a debugging-capable regexec.o tied to
1239  * a matching debug.o, and this is convenient.  It all disappears in
1240  * the non-debug compilation anyway, so it doesn't matter much.
1241  */
1242 static const char *		/* -> representation */
1243 pchar(int ch)
1244 {
1245 	static char pbuf[10];
1246 
1247 	if (isprint((uch)ch) || ch == ' ')
1248 		snprintf(pbuf, sizeof(pbuf), "%c", ch);
1249 	else
1250 		snprintf(pbuf, sizeof(pbuf), "\\%o", ch);
1251 	return(pbuf);
1252 }
1253 #endif
1254 #endif
1255 
1256 #undef	stepback
1257 #undef	matcher
1258 #undef	walk
1259 #undef	dissect
1260 #undef	backref
1261 #undef	step
1262 #undef	print
1263 #undef	at
1264 #undef	match
1265