xref: /openbsd-src/gnu/usr.bin/perl/regexec.c (revision a4afd6dad3fba28f80e70208181c06c482259988)
1 /*    regexec.c
2  */
3 
4 /*
5  * "One Ring to rule them all, One Ring to find them..."
6  */
7 
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9  * confused with the original package (see point 3 below).  Thanks, Henry!
10  */
11 
12 /* Additional note: this code is very heavily munged from Henry's version
13  * in places.  In some spots I've traded clarity for efficiency, so don't
14  * blame Henry for some of the lack of readability.
15  */
16 
17 /* The names of the functions have been changed from regcomp and
18  * regexec to  pregcomp and pregexec in order to avoid conflicts
19  * with the POSIX routines of the same names.
20 */
21 
22 /*SUPPRESS 112*/
23 /*
24  * pregcomp and pregexec -- regsub and regerror are not used in perl
25  *
26  *	Copyright (c) 1986 by University of Toronto.
27  *	Written by Henry Spencer.  Not derived from licensed software.
28  *
29  *	Permission is granted to anyone to use this software for any
30  *	purpose on any computer system, and to redistribute it freely,
31  *	subject to the following restrictions:
32  *
33  *	1. The author is not responsible for the consequences of use of
34  *		this software, no matter how awful, even if they arise
35  *		from defects in it.
36  *
37  *	2. The origin of this software must not be misrepresented, either
38  *		by explicit claim or by omission.
39  *
40  *	3. Altered versions must be plainly marked as such, and must not
41  *		be misrepresented as being the original software.
42  *
43  ****    Alterations to Henry's code are...
44  ****
45  ****    Copyright (c) 1991-1994, Larry Wall
46  ****
47  ****    You may distribute under the terms of either the GNU General Public
48  ****    License or the Artistic License, as specified in the README file.
49  *
50  * Beware that some of this code is subtly aware of the way operator
51  * precedence is structured in regular expressions.  Serious changes in
52  * regular-expression syntax might require a total rethink.
53  */
54 #include "EXTERN.h"
55 #include "perl.h"
56 #include "regcomp.h"
57 
58 #ifndef STATIC
59 #define	STATIC	static
60 #endif
61 
62 #ifdef DEBUGGING
63 static I32 regnarrate = 0;
64 static char* regprogram = 0;
65 #endif
66 
67 /* Current curly descriptor */
68 typedef struct curcur CURCUR;
69 struct curcur {
70     int		parenfloor;	/* how far back to strip paren data */
71     int		cur;		/* how many instances of scan we've matched */
72     int		min;		/* the minimal number of scans to match */
73     int		max;		/* the maximal number of scans to match */
74     int		minmod;		/* whether to work our way up or down */
75     char *	scan;		/* the thing to match */
76     char *	next;		/* what has to match after it */
77     char *	lastloc;	/* where we started matching this scan */
78     CURCUR *	oldcc;		/* current curly before we started this one */
79 };
80 
81 static CURCUR* regcc;
82 
83 typedef I32 CHECKPOINT;
84 
85 CHECKPOINT regcppush _((I32 parenfloor));
86 char * regcppop _((void));
87 
88 CHECKPOINT
89 regcppush(parenfloor)
90 I32 parenfloor;
91 {
92     int retval = savestack_ix;
93     int i = (regsize - parenfloor) * 3;
94     int p;
95 
96     SSCHECK(i + 5);
97     for (p = regsize; p > parenfloor; p--) {
98 	SSPUSHPTR(regendp[p]);
99 	SSPUSHPTR(regstartp[p]);
100 	SSPUSHINT(p);
101     }
102     SSPUSHINT(regsize);
103     SSPUSHINT(*reglastparen);
104     SSPUSHPTR(reginput);
105     SSPUSHINT(i + 3);
106     SSPUSHINT(SAVEt_REGCONTEXT);
107     return retval;
108 }
109 
110 char*
111 regcppop()
112 {
113     I32 i = SSPOPINT;
114     U32 paren = 0;
115     char *input;
116     char *tmps;
117     assert(i == SAVEt_REGCONTEXT);
118     i = SSPOPINT;
119     input = (char *) SSPOPPTR;
120     *reglastparen = SSPOPINT;
121     regsize = SSPOPINT;
122     for (i -= 3; i > 0; i -= 3) {
123 	paren = (U32)SSPOPINT;
124 	regstartp[paren] = (char *) SSPOPPTR;
125 	tmps = (char*)SSPOPPTR;
126 	if (paren <= *reglastparen)
127 	    regendp[paren] = tmps;
128     }
129     for (paren = *reglastparen + 1; paren <= regnpar; paren++) {
130 	if (paren > regsize)
131 	    regstartp[paren] = Nullch;
132 	regendp[paren] = Nullch;
133     }
134     return input;
135 }
136 
137 #define regcpblow(cp) leave_scope(cp)
138 
139 /*
140  * pregexec and friends
141  */
142 
143 /*
144  * Forwards.
145  */
146 
147 static I32 regmatch _((char *prog));
148 static I32 regrepeat _((char *p, I32 max));
149 static I32 regtry _((regexp *prog, char *startpos));
150 
151 /*
152  - pregexec - match a regexp against a string
153  */
154 I32
155 pregexec(prog, stringarg, strend, strbeg, minend, screamer, safebase)
156 register regexp *prog;
157 char *stringarg;
158 register char *strend;	/* pointer to null at end of string */
159 char *strbeg;	/* real beginning of string */
160 I32 minend;	/* end of match must be at least minend after stringarg */
161 SV *screamer;
162 I32 safebase;	/* no need to remember string in subbase */
163 {
164     register char *s;
165     register I32 i;
166     register char *c;
167     register char *startpos = stringarg;
168     register I32 tmp;
169     I32 minlen = 0;		/* must match at least this many chars */
170     I32 dontbother = 0;	/* how many characters not to try at end */
171     CURCUR cc;
172 
173     cc.cur = 0;
174     cc.oldcc = 0;
175     regcc = &cc;
176 
177 #ifdef DEBUGGING
178     regnarrate = debug & 512;
179     regprogram = prog->program;
180 #endif
181 
182     /* Be paranoid... */
183     if (prog == NULL || startpos == NULL) {
184 	croak("NULL regexp parameter");
185 	return 0;
186     }
187 
188     if (startpos == strbeg)	/* is ^ valid at stringarg? */
189 	regprev = '\n';
190     else {
191 	regprev = stringarg[-1];
192 	if (!multiline && regprev == '\n')
193 	    regprev = '\0';		/* force ^ to NOT match */
194     }
195     regprecomp = prog->precomp;
196     regnpar = prog->nparens;
197     /* Check validity of program. */
198     if (UCHARAT(prog->program) != MAGIC) {
199 	FAIL("corrupted regexp program");
200     }
201 
202     if (prog->do_folding) {
203 	i = strend - startpos;
204 	New(1101,c,i+1,char);
205 	Copy(startpos, c, i+1, char);
206 	startpos = c;
207 	strend = startpos + i;
208 	for (s = startpos; s < strend; s++)
209 	    if (isUPPER(*s))
210 		*s = toLOWER(*s);
211     }
212 
213     /* If there is a "must appear" string, look for it. */
214     s = startpos;
215     if (prog->regmust != Nullsv &&
216 	(!(prog->reganch & ROPT_ANCH)
217 	 || (multiline && prog->regback >= 0)) )
218     {
219 	if (stringarg == strbeg && screamer) {
220 	    if (screamfirst[BmRARE(prog->regmust)] >= 0)
221 		    s = screaminstr(screamer,prog->regmust);
222 	    else
223 		    s = Nullch;
224 	}
225 	else
226 	    s = fbm_instr((unsigned char*)s, (unsigned char*)strend,
227 		prog->regmust);
228 	if (!s) {
229 	    ++BmUSEFUL(prog->regmust);	/* hooray */
230 	    goto phooey;	/* not present */
231 	}
232 	else if (prog->regback >= 0) {
233 	    s -= prog->regback;
234 	    if (s < startpos)
235 		s = startpos;
236 	    minlen = prog->regback + SvCUR(prog->regmust);
237 	}
238 	else if (!prog->naughty && --BmUSEFUL(prog->regmust) < 0) { /* boo */
239 	    SvREFCNT_dec(prog->regmust);
240 	    prog->regmust = Nullsv;	/* disable regmust */
241 	    s = startpos;
242 	}
243 	else {
244 	    s = startpos;
245 	    minlen = SvCUR(prog->regmust);
246 	}
247     }
248 
249     /* Mark beginning of line for ^ . */
250     regbol = startpos;
251 
252     /* Mark end of line for $ (and such) */
253     regeol = strend;
254 
255     /* see how far we have to get to not match where we matched before */
256     regtill = startpos+minend;
257 
258     /* Simplest case:  anchored match need be tried only once. */
259     /*  [unless multiline is set] */
260     if (prog->reganch & ROPT_ANCH) {
261 	if (regtry(prog, startpos))
262 	    goto got_it;
263 	else if (multiline || (prog->reganch & ROPT_IMPLICIT)) {
264 	    if (minlen)
265 		dontbother = minlen - 1;
266 	    strend -= dontbother;
267 	    /* for multiline we only have to try after newlines */
268 	    if (s > startpos)
269 		s--;
270 	    while (s < strend) {
271 		if (*s++ == '\n') {
272 		    if (s < strend && regtry(prog, s))
273 			goto got_it;
274 		}
275 	    }
276 	}
277 	goto phooey;
278     }
279 
280     /* Messy cases:  unanchored match. */
281     if (prog->regstart) {
282 	if (prog->reganch & ROPT_SKIP) {  /* we have /x+whatever/ */
283 	    /* it must be a one character string */
284 	    i = SvPVX(prog->regstart)[0];
285 	    while (s < strend) {
286 		if (*s == i) {
287 		    if (regtry(prog, s))
288 			goto got_it;
289 		    s++;
290 		    while (s < strend && *s == i)
291 			s++;
292 		}
293 		s++;
294 	    }
295 	}
296 	else if (SvPOK(prog->regstart) == 3) {
297 	    /* We know what string it must start with. */
298 	    while ((s = fbm_instr((unsigned char*)s,
299 	      (unsigned char*)strend, prog->regstart)) != NULL)
300 	    {
301 		if (regtry(prog, s))
302 		    goto got_it;
303 		s++;
304 	    }
305 	}
306 	else {
307 	    c = SvPVX(prog->regstart);
308 	    while ((s = ninstr(s, strend, c, c + SvCUR(prog->regstart))) != NULL)
309 	    {
310 		if (regtry(prog, s))
311 		    goto got_it;
312 		s++;
313 	    }
314 	}
315 	goto phooey;
316     }
317     /*SUPPRESS 560*/
318     if (c = prog->regstclass) {
319 	I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
320 
321 	if (minlen)
322 	    dontbother = minlen - 1;
323 	strend -= dontbother;	/* don't bother with what can't match */
324 	tmp = 1;
325 	/* We know what class it must start with. */
326 	switch (OP(c)) {
327 	case ANYOF:
328 	    c = OPERAND(c);
329 	    while (s < strend) {
330 		i = UCHARAT(s);
331 		if (!(c[i >> 3] & (1 << (i&7)))) {
332 		    if (tmp && regtry(prog, s))
333 			goto got_it;
334 		    else
335 			tmp = doevery;
336 		}
337 		else
338 		    tmp = 1;
339 		s++;
340 	    }
341 	    break;
342 	case BOUND:
343 	    if (minlen)
344 		dontbother++,strend--;
345 	    if (s != startpos) {
346 		i = s[-1];
347 		tmp = isALNUM(i);
348 	    }
349 	    else
350 		tmp = isALNUM(regprev);	/* assume not alphanumeric */
351 	    while (s < strend) {
352 		i = *s;
353 		if (tmp != isALNUM(i)) {
354 		    tmp = !tmp;
355 		    if (regtry(prog, s))
356 			goto got_it;
357 		}
358 		s++;
359 	    }
360 	    if ((minlen || tmp) && regtry(prog,s))
361 		goto got_it;
362 	    break;
363 	case NBOUND:
364 	    if (minlen)
365 		dontbother++,strend--;
366 	    if (s != startpos) {
367 		i = s[-1];
368 		tmp = isALNUM(i);
369 	    }
370 	    else
371 		tmp = isALNUM(regprev);	/* assume not alphanumeric */
372 	    while (s < strend) {
373 		i = *s;
374 		if (tmp != isALNUM(i))
375 		    tmp = !tmp;
376 		else if (regtry(prog, s))
377 		    goto got_it;
378 		s++;
379 	    }
380 	    if ((minlen || !tmp) && regtry(prog,s))
381 		goto got_it;
382 	    break;
383 	case ALNUM:
384 	    while (s < strend) {
385 		i = *s;
386 		if (isALNUM(i)) {
387 		    if (tmp && regtry(prog, s))
388 			goto got_it;
389 		    else
390 			tmp = doevery;
391 		}
392 		else
393 		    tmp = 1;
394 		s++;
395 	    }
396 	    break;
397 	case NALNUM:
398 	    while (s < strend) {
399 		i = *s;
400 		if (!isALNUM(i)) {
401 		    if (tmp && regtry(prog, s))
402 			goto got_it;
403 		    else
404 			tmp = doevery;
405 		}
406 		else
407 		    tmp = 1;
408 		s++;
409 	    }
410 	    break;
411 	case SPACE:
412 	    while (s < strend) {
413 		if (isSPACE(*s)) {
414 		    if (tmp && regtry(prog, s))
415 			goto got_it;
416 		    else
417 			tmp = doevery;
418 		}
419 		else
420 		    tmp = 1;
421 		s++;
422 	    }
423 	    break;
424 	case NSPACE:
425 	    while (s < strend) {
426 		if (!isSPACE(*s)) {
427 		    if (tmp && regtry(prog, s))
428 			goto got_it;
429 		    else
430 			tmp = doevery;
431 		}
432 		else
433 		    tmp = 1;
434 		s++;
435 	    }
436 	    break;
437 	case DIGIT:
438 	    while (s < strend) {
439 		if (isDIGIT(*s)) {
440 		    if (tmp && regtry(prog, s))
441 			goto got_it;
442 		    else
443 			tmp = doevery;
444 		}
445 		else
446 		    tmp = 1;
447 		s++;
448 	    }
449 	    break;
450 	case NDIGIT:
451 	    while (s < strend) {
452 		if (!isDIGIT(*s)) {
453 		    if (tmp && regtry(prog, s))
454 			goto got_it;
455 		    else
456 			tmp = doevery;
457 		}
458 		else
459 		    tmp = 1;
460 		s++;
461 	    }
462 	    break;
463 	}
464     }
465     else {
466 	if (minlen)
467 	    dontbother = minlen - 1;
468 	strend -= dontbother;
469 	/* We don't know much -- general case. */
470 	do {
471 	    if (regtry(prog, s))
472 		goto got_it;
473 	} while (s++ < strend);
474     }
475 
476     /* Failure. */
477     goto phooey;
478 
479 got_it:
480     strend += dontbother;	/* uncheat */
481     prog->subbeg = strbeg;
482     prog->subend = strend;
483     if ((!safebase && (prog->nparens || sawampersand)) || prog->do_folding) {
484 	i = strend - startpos + (stringarg - strbeg);
485 	if (safebase) {			/* no need for $digit later */
486 	    s = strbeg;
487 	    prog->subend = s+i;
488 	}
489 	else if (strbeg != prog->subbase) {
490 	    s = savepvn(strbeg,i);	/* so $digit will work later */
491 	    if (prog->subbase)
492 		Safefree(prog->subbase);
493 	    prog->subbeg = prog->subbase = s;
494 	    prog->subend = s+i;
495 	}
496 	else {
497 	    prog->subbeg = s = prog->subbase;
498 	    prog->subend = s+i;
499 	}
500 	s += (stringarg - strbeg);
501 	for (i = 0; i <= prog->nparens; i++) {
502 	    if (prog->endp[i]) {
503 		prog->startp[i] = s + (prog->startp[i] - startpos);
504 		prog->endp[i] = s + (prog->endp[i] - startpos);
505 	    }
506 	}
507 	if (prog->do_folding)
508 	    Safefree(startpos);
509     }
510     return 1;
511 
512 phooey:
513     if (prog->do_folding)
514 	Safefree(startpos);
515     return 0;
516 }
517 
518 /*
519  - regtry - try match at specific point
520  */
521 static I32			/* 0 failure, 1 success */
522 regtry(prog, startpos)
523 regexp *prog;
524 char *startpos;
525 {
526     register I32 i;
527     register char **sp;
528     register char **ep;
529 
530     reginput = startpos;
531     regstartp = prog->startp;
532     regendp = prog->endp;
533     reglastparen = &prog->lastparen;
534     prog->lastparen = 0;
535     regsize = 0;
536 
537     sp = prog->startp;
538     ep = prog->endp;
539     if (prog->nparens) {
540 	for (i = prog->nparens; i >= 0; i--) {
541 	    *sp++ = NULL;
542 	    *ep++ = NULL;
543 	}
544     }
545     if (regmatch(prog->program + 1) && reginput >= regtill) {
546 	prog->startp[0] = startpos;
547 	prog->endp[0] = reginput;
548 	return 1;
549     }
550     else
551 	return 0;
552 }
553 
554 /*
555  - regmatch - main matching routine
556  *
557  * Conceptually the strategy is simple:  check to see whether the current
558  * node matches, call self recursively to see whether the rest matches,
559  * and then act accordingly.  In practice we make some effort to avoid
560  * recursion, in particular by going through "ordinary" nodes (that don't
561  * need to know whether the rest of the match failed) by a loop instead of
562  * by recursion.
563  */
564 /* [lwall] I've hoisted the register declarations to the outer block in order to
565  * maybe save a little bit of pushing and popping on the stack.  It also takes
566  * advantage of machines that use a register save mask on subroutine entry.
567  */
568 static I32			/* 0 failure, 1 success */
569 regmatch(prog)
570 char *prog;
571 {
572     register char *scan;	/* Current node. */
573     char *next;			/* Next node. */
574     register I32 nextchar;
575     register I32 n;		/* no or next */
576     register I32 ln;		/* len or last */
577     register char *s;		/* operand or save */
578     register char *locinput = reginput;
579     int minmod = 0;
580 #ifdef DEBUGGING
581     static int regindent = 0;
582     regindent++;
583 #endif
584 
585     nextchar = *locinput;
586     scan = prog;
587     while (scan != NULL) {
588 #ifdef DEBUGGING
589 #define sayYES goto yes
590 #define sayNO goto no
591 #define saySAME(x) if (x) goto yes; else goto no
592 	if (regnarrate) {
593 	    fprintf(stderr, "%*s%2d%-8.8s\t<%.10s>\n", regindent*2, "",
594 		scan - regprogram, regprop(scan), locinput);
595 	}
596 #else
597 #define sayYES return 1
598 #define sayNO return 0
599 #define saySAME(x) return x
600 #endif
601 
602 #ifdef REGALIGN
603 	next = scan + NEXT(scan);
604 	if (next == scan)
605 	    next = NULL;
606 #else
607 	next = regnext(scan);
608 #endif
609 
610 	switch (OP(scan)) {
611 	case BOL:
612 	    if (locinput == regbol
613 		? regprev == '\n'
614 		: ((nextchar || locinput < regeol) && locinput[-1] == '\n') )
615 	    {
616 		/* regtill = regbol; */
617 		break;
618 	    }
619 	    sayNO;
620 	case MBOL:
621 	    if (locinput == regbol
622 		? regprev == '\n'
623 		: ((nextchar || locinput < regeol) && locinput[-1] == '\n') )
624 	    {
625 		break;
626 	    }
627 	    sayNO;
628 	case SBOL:
629 	    if (locinput == regbol && regprev == '\n')
630 		break;
631 	    sayNO;
632 	case GBOL:
633 	    if (locinput == regbol)
634 		break;
635 	    sayNO;
636 	case EOL:
637 	    if (multiline)
638 		goto meol;
639 	    else
640 		goto seol;
641 	case MEOL:
642 	  meol:
643 	    if ((nextchar || locinput < regeol) && nextchar != '\n')
644 		sayNO;
645 	    break;
646 	case SEOL:
647 	  seol:
648 	    if ((nextchar || locinput < regeol) && nextchar != '\n')
649 		sayNO;
650 	    if (regeol - locinput > 1)
651 		sayNO;
652 	    break;
653 	case SANY:
654 	    if (!nextchar && locinput >= regeol)
655 		sayNO;
656 	    nextchar = *++locinput;
657 	    break;
658 	case ANY:
659 	    if (!nextchar && locinput >= regeol || nextchar == '\n')
660 		sayNO;
661 	    nextchar = *++locinput;
662 	    break;
663 	case EXACTLY:
664 	    s = OPERAND(scan);
665 	    ln = *s++;
666 	    /* Inline the first character, for speed. */
667 	    if (*s != nextchar)
668 		sayNO;
669 	    if (regeol - locinput < ln)
670 		sayNO;
671 	    if (ln > 1 && bcmp(s, locinput, ln) != 0)
672 		sayNO;
673 	    locinput += ln;
674 	    nextchar = *locinput;
675 	    break;
676 	case ANYOF:
677 	    s = OPERAND(scan);
678 	    if (nextchar < 0)
679 		nextchar = UCHARAT(locinput);
680 	    if (s[nextchar >> 3] & (1 << (nextchar&7)))
681 		sayNO;
682 	    if (!nextchar && locinput >= regeol)
683 		sayNO;
684 	    nextchar = *++locinput;
685 	    break;
686 	case ALNUM:
687 	    if (!nextchar)
688 		sayNO;
689 	    if (!isALNUM(nextchar))
690 		sayNO;
691 	    nextchar = *++locinput;
692 	    break;
693 	case NALNUM:
694 	    if (!nextchar && locinput >= regeol)
695 		sayNO;
696 	    if (isALNUM(nextchar))
697 		sayNO;
698 	    nextchar = *++locinput;
699 	    break;
700 	case NBOUND:
701 	case BOUND:
702 	    if (locinput == regbol)	/* was last char in word? */
703 		ln = isALNUM(regprev);
704 	    else
705 		ln = isALNUM(locinput[-1]);
706 	    n = isALNUM(nextchar); /* is next char in word? */
707 	    if ((ln == n) == (OP(scan) == BOUND))
708 		sayNO;
709 	    break;
710 	case SPACE:
711 	    if (!nextchar && locinput >= regeol)
712 		sayNO;
713 	    if (!isSPACE(nextchar))
714 		sayNO;
715 	    nextchar = *++locinput;
716 	    break;
717 	case NSPACE:
718 	    if (!nextchar)
719 		sayNO;
720 	    if (isSPACE(nextchar))
721 		sayNO;
722 	    nextchar = *++locinput;
723 	    break;
724 	case DIGIT:
725 	    if (!isDIGIT(nextchar))
726 		sayNO;
727 	    nextchar = *++locinput;
728 	    break;
729 	case NDIGIT:
730 	    if (!nextchar && locinput >= regeol)
731 		sayNO;
732 	    if (isDIGIT(nextchar))
733 		sayNO;
734 	    nextchar = *++locinput;
735 	    break;
736 	case REF:
737 	    n = ARG1(scan);  /* which paren pair */
738 	    s = regstartp[n];
739 	    if (!s)
740 		sayNO;
741 	    if (!regendp[n])
742 		sayNO;
743 	    if (s == regendp[n])
744 		break;
745 	    /* Inline the first character, for speed. */
746 	    if (*s != nextchar)
747 		sayNO;
748 	    ln = regendp[n] - s;
749 	    if (locinput + ln > regeol)
750 		sayNO;
751 	    if (ln > 1 && bcmp(s, locinput, ln) != 0)
752 		sayNO;
753 	    locinput += ln;
754 	    nextchar = *locinput;
755 	    break;
756 
757 	case NOTHING:
758 	    break;
759 	case BACK:
760 	    break;
761 	case OPEN:
762 	    n = ARG1(scan);  /* which paren pair */
763 	    regstartp[n] = locinput;
764 	    if (n > regsize)
765 		regsize = n;
766 	    break;
767 	case CLOSE:
768 	    n = ARG1(scan);  /* which paren pair */
769 	    regendp[n] = locinput;
770 	    if (n > *reglastparen)
771 		*reglastparen = n;
772 	    break;
773 	case CURLYX: {
774 		CURCUR cc;
775 		CHECKPOINT cp = savestack_ix;
776 		cc.oldcc = regcc;
777 		regcc = &cc;
778 		cc.parenfloor = *reglastparen;
779 		cc.cur = -1;
780 		cc.min = ARG1(scan);
781 		cc.max  = ARG2(scan);
782 		cc.scan = NEXTOPER(scan) + 4;
783 		cc.next = next;
784 		cc.minmod = minmod;
785 		cc.lastloc = 0;
786 		reginput = locinput;
787 		n = regmatch(PREVOPER(next));	/* start on the WHILEM */
788 		regcpblow(cp);
789 		regcc = cc.oldcc;
790 		saySAME(n);
791 	    }
792 	    /* NOT REACHED */
793 	case WHILEM: {
794 		/*
795 		 * This is really hard to understand, because after we match
796 		 * what we're trying to match, we must make sure the rest of
797 		 * the RE is going to match for sure, and to do that we have
798 		 * to go back UP the parse tree by recursing ever deeper.  And
799 		 * if it fails, we have to reset our parent's current state
800 		 * that we can try again after backing off.
801 		 */
802 
803 		CURCUR* cc = regcc;
804 		n = cc->cur + 1;	/* how many we know we matched */
805 		reginput = locinput;
806 
807 #ifdef DEBUGGING
808 		if (regnarrate)
809 		    fprintf(stderr, "%*s  %d  %lx\n", regindent*2, "",
810 			n, (long)cc);
811 #endif
812 
813 		/* If degenerate scan matches "", assume scan done. */
814 
815 		if (locinput == cc->lastloc) {
816 		    regcc = cc->oldcc;
817 		    ln = regcc->cur;
818 		    if (regmatch(cc->next))
819 			sayYES;
820 		    regcc->cur = ln;
821 		    regcc = cc;
822 		    sayNO;
823 		}
824 
825 		/* First just match a string of min scans. */
826 
827 		if (n < cc->min) {
828 		    cc->cur = n;
829 		    cc->lastloc = locinput;
830 		    if (regmatch(cc->scan))
831 			sayYES;
832 		    cc->cur = n - 1;
833 		    sayNO;
834 		}
835 
836 		/* Prefer next over scan for minimal matching. */
837 
838 		if (cc->minmod) {
839 		    regcc = cc->oldcc;
840 		    ln = regcc->cur;
841 		    if (regmatch(cc->next))
842 			sayYES;	/* All done. */
843 		    regcc->cur = ln;
844 		    regcc = cc;
845 
846 		    if (n >= cc->max)	/* Maximum greed exceeded? */
847 			sayNO;
848 
849 		    /* Try scanning more and see if it helps. */
850 		    reginput = locinput;
851 		    cc->cur = n;
852 		    cc->lastloc = locinput;
853 		    if (regmatch(cc->scan))
854 			sayYES;
855 		    cc->cur = n - 1;
856 		    sayNO;
857 		}
858 
859 		/* Prefer scan over next for maximal matching. */
860 
861 		if (n < cc->max) {	/* More greed allowed? */
862 		    regcppush(cc->parenfloor);
863 		    cc->cur = n;
864 		    cc->lastloc = locinput;
865 		    if (regmatch(cc->scan))
866 			sayYES;
867 		    regcppop();		/* Restore some previous $<digit>s? */
868 		    reginput = locinput;
869 		}
870 
871 		/* Failed deeper matches of scan, so see if this one works. */
872 		regcc = cc->oldcc;
873 		ln = regcc->cur;
874 		if (regmatch(cc->next))
875 		    sayYES;
876 		regcc->cur = ln;
877 		regcc = cc;
878 		cc->cur = n - 1;
879 		sayNO;
880 	    }
881 	    /* NOT REACHED */
882 	case BRANCH: {
883 		if (OP(next) != BRANCH)	  /* No choice. */
884 		    next = NEXTOPER(scan);/* Avoid recursion. */
885 		else {
886 		    int lastparen = *reglastparen;
887 		    do {
888 			reginput = locinput;
889 			if (regmatch(NEXTOPER(scan)))
890 			    sayYES;
891 			for (n = *reglastparen; n > lastparen; n--)
892 			    regendp[n] = 0;
893 			*reglastparen = n;
894 
895 #ifdef REGALIGN
896 			/*SUPPRESS 560*/
897 			if (n = NEXT(scan))
898 			    scan += n;
899 			else
900 			    scan = NULL;
901 #else
902 			scan = regnext(scan);
903 #endif
904 		    } while (scan != NULL && OP(scan) == BRANCH);
905 		    sayNO;
906 		    /* NOTREACHED */
907 		}
908 	    }
909 	    break;
910 	case MINMOD:
911 	    minmod = 1;
912 	    break;
913 	case CURLY:
914 	    ln = ARG1(scan);  /* min to match */
915 	    n  = ARG2(scan);  /* max to match */
916 	    scan = NEXTOPER(scan) + 4;
917 	    goto repeat;
918 	case STAR:
919 	    ln = 0;
920 	    n = 32767;
921 	    scan = NEXTOPER(scan);
922 	    goto repeat;
923 	case PLUS:
924 	    /*
925 	    * Lookahead to avoid useless match attempts
926 	    * when we know what character comes next.
927 	    */
928 	    ln = 1;
929 	    n = 32767;
930 	    scan = NEXTOPER(scan);
931 	  repeat:
932 	    if (OP(next) == EXACTLY)
933 		nextchar = *(OPERAND(next)+1);
934 	    else
935 		nextchar = -1000;
936 	    reginput = locinput;
937 	    if (minmod) {
938 		minmod = 0;
939 		if (ln && regrepeat(scan, ln) < ln)
940 		    sayNO;
941 		while (n >= ln || (n == 32767 && ln > 0)) { /* ln overflow ? */
942 		    /* If it could work, try it. */
943 		    if (nextchar == -1000 || *reginput == nextchar)
944 			if (regmatch(next))
945 			    sayYES;
946 		    /* Couldn't or didn't -- back up. */
947 		    reginput = locinput + ln;
948 		    if (regrepeat(scan, 1)) {
949 			ln++;
950 			reginput = locinput + ln;
951 		    }
952 		    else
953 			sayNO;
954 		}
955 	    }
956 	    else {
957 		n = regrepeat(scan, n);
958 		if (ln < n && regkind[(U8)OP(next)] == EOL &&
959 		    (!multiline || OP(next) == SEOL))
960 		    ln = n;			/* why back off? */
961 		while (n >= ln) {
962 		    /* If it could work, try it. */
963 		    if (nextchar == -1000 || *reginput == nextchar)
964 			if (regmatch(next))
965 			    sayYES;
966 		    /* Couldn't or didn't -- back up. */
967 		    n--;
968 		    reginput = locinput + n;
969 		}
970 	    }
971 	    sayNO;
972 	case SUCCEED:
973 	case END:
974 	    reginput = locinput;	/* put where regtry can find it */
975 	    sayYES;			/* Success! */
976 	case IFMATCH:
977 	    reginput = locinput;
978 	    scan = NEXTOPER(scan);
979 	    if (!regmatch(scan))
980 		sayNO;
981 	    break;
982 	case UNLESSM:
983 	    reginput = locinput;
984 	    scan = NEXTOPER(scan);
985 	    if (regmatch(scan))
986 		sayNO;
987 	    break;
988 	default:
989 	    fprintf(stderr, "%x %d\n",(unsigned)scan,scan[1]);
990 	    FAIL("regexp memory corruption");
991 	}
992 	scan = next;
993     }
994 
995     /*
996     * We get here only if there's trouble -- normally "case END" is
997     * the terminating point.
998     */
999     FAIL("corrupted regexp pointers");
1000     /*NOTREACHED*/
1001     sayNO;
1002 
1003 yes:
1004 #ifdef DEBUGGING
1005     regindent--;
1006 #endif
1007     return 1;
1008 
1009 no:
1010 #ifdef DEBUGGING
1011     regindent--;
1012 #endif
1013     return 0;
1014 }
1015 
1016 /*
1017  - regrepeat - repeatedly match something simple, report how many
1018  */
1019 /*
1020  * [This routine now assumes that it will only match on things of length 1.
1021  * That was true before, but now we assume scan - reginput is the count,
1022  * rather than incrementing count on every character.]
1023  */
1024 static I32
1025 regrepeat(p, max)
1026 char *p;
1027 I32 max;
1028 {
1029     register char *scan;
1030     register char *opnd;
1031     register I32 c;
1032     register char *loceol = regeol;
1033 
1034     scan = reginput;
1035     if (max != 32767 && max < loceol - scan)
1036       loceol = scan + max;
1037     opnd = OPERAND(p);
1038     switch (OP(p)) {
1039     case ANY:
1040 	while (scan < loceol && *scan != '\n')
1041 	    scan++;
1042 	break;
1043     case SANY:
1044 	scan = loceol;
1045 	break;
1046     case EXACTLY:		/* length of string is 1 */
1047 	opnd++;
1048 	while (scan < loceol && *opnd == *scan)
1049 	    scan++;
1050 	break;
1051     case ANYOF:
1052 	c = UCHARAT(scan);
1053 	while (scan < loceol && !(opnd[c >> 3] & (1 << (c & 7)))) {
1054 	    scan++;
1055 	    c = UCHARAT(scan);
1056 	}
1057 	break;
1058     case ALNUM:
1059 	while (scan < loceol && isALNUM(*scan))
1060 	    scan++;
1061 	break;
1062     case NALNUM:
1063 	while (scan < loceol && !isALNUM(*scan))
1064 	    scan++;
1065 	break;
1066     case SPACE:
1067 	while (scan < loceol && isSPACE(*scan))
1068 	    scan++;
1069 	break;
1070     case NSPACE:
1071 	while (scan < loceol && !isSPACE(*scan))
1072 	    scan++;
1073 	break;
1074     case DIGIT:
1075 	while (scan < loceol && isDIGIT(*scan))
1076 	    scan++;
1077 	break;
1078     case NDIGIT:
1079 	while (scan < loceol && !isDIGIT(*scan))
1080 	    scan++;
1081 	break;
1082     default:		/* Called on something of 0 width. */
1083 	break;		/* So match right here or not at all. */
1084     }
1085 
1086     c = scan - reginput;
1087     reginput = scan;
1088 
1089     return(c);
1090 }
1091 
1092 /*
1093  - regnext - dig the "next" pointer out of a node
1094  *
1095  * [Note, when REGALIGN is defined there are two places in regmatch()
1096  * that bypass this code for speed.]
1097  */
1098 char *
1099 regnext(p)
1100 register char *p;
1101 {
1102     register I32 offset;
1103 
1104     if (p == &regdummy)
1105 	return(NULL);
1106 
1107     offset = NEXT(p);
1108     if (offset == 0)
1109 	return(NULL);
1110 
1111 #ifdef REGALIGN
1112     return(p+offset);
1113 #else
1114     if (OP(p) == BACK)
1115 	return(p-offset);
1116     else
1117 	return(p+offset);
1118 #endif
1119 }
1120