xref: /csrg-svn/lib/libcompat/4.3/regex.c (revision 46488)
121356Sdist /*
221356Sdist  * Copyright (c) 1980 Regents of the University of California.
321356Sdist  * All rights reserved.  The Berkeley software License Agreement
421356Sdist  * specifies the terms and conditions for redistribution.
521356Sdist  */
621356Sdist 
726580Sdonn #if defined(LIBC_SCCS) && !defined(lint)
8*46488Sbostic static char sccsid[] = "@(#)regex.c	5.3 (Berkeley) 02/21/91";
926580Sdonn #endif LIBC_SCCS and not lint
1021356Sdist 
111977Swnj #
121977Swnj 
131977Swnj /*
141977Swnj  * routines to do regular expression matching
151977Swnj  *
161977Swnj  * Entry points:
171977Swnj  *
181977Swnj  *	re_comp(s)
191977Swnj  *		char *s;
201977Swnj  *	 ... returns 0 if the string s was compiled successfully,
211977Swnj  *		     a pointer to an error message otherwise.
221977Swnj  *	     If passed 0 or a null string returns without changing
231977Swnj  *           the currently compiled re (see note 11 below).
241977Swnj  *
251977Swnj  *	re_exec(s)
261977Swnj  *		char *s;
271977Swnj  *	 ... returns 1 if the string s matches the last compiled regular
281977Swnj  *		       expression,
291977Swnj  *		     0 if the string s failed to match the last compiled
301977Swnj  *		       regular expression, and
311977Swnj  *		    -1 if the compiled regular expression was invalid
321977Swnj  *		       (indicating an internal error).
331977Swnj  *
341977Swnj  * The strings passed to both re_comp and re_exec may have trailing or
351977Swnj  * embedded newline characters; they are terminated by nulls.
361977Swnj  *
371977Swnj  * The identity of the author of these routines is lost in antiquity;
381977Swnj  * this is essentially the same as the re code in the original V6 ed.
391977Swnj  *
401977Swnj  * The regular expressions recognized are described below. This description
411977Swnj  * is essentially the same as that for ed.
421977Swnj  *
431977Swnj  *	A regular expression specifies a set of strings of characters.
441977Swnj  *	A member of this set of strings is said to be matched by
451977Swnj  *	the regular expression.  In the following specification for
461977Swnj  *	regular expressions the word `character' means any character but NUL.
471977Swnj  *
481977Swnj  *	1.  Any character except a special character matches itself.
491977Swnj  *	    Special characters are the regular expression delimiter plus
501977Swnj  *	    \ [ . and sometimes ^ * $.
511977Swnj  *	2.  A . matches any character.
521977Swnj  *	3.  A \ followed by any character except a digit or ( )
531977Swnj  *	    matches that character.
541977Swnj  *	4.  A nonempty string s bracketed [s] (or [^s]) matches any
551977Swnj  *	    character in (or not in) s. In s, \ has no special meaning,
561977Swnj  *	    and ] may only appear as the first letter. A substring
571977Swnj  *	    a-b, with a and b in ascending ASCII order, stands for
581977Swnj  *	    the inclusive range of ASCII characters.
591977Swnj  *	5.  A regular expression of form 1-4 followed by * matches a
601977Swnj  *	    sequence of 0 or more matches of the regular expression.
611977Swnj  *	6.  A regular expression, x, of form 1-8, bracketed \(x\)
621977Swnj  *	    matches what x matches.
631977Swnj  *	7.  A \ followed by a digit n matches a copy of the string that the
641977Swnj  *	    bracketed regular expression beginning with the nth \( matched.
651977Swnj  *	8.  A regular expression of form 1-8, x, followed by a regular
661977Swnj  *	    expression of form 1-7, y matches a match for x followed by
671977Swnj  *	    a match for y, with the x match being as long as possible
681977Swnj  *	    while still permitting a y match.
691977Swnj  *	9.  A regular expression of form 1-8 preceded by ^ (or followed
701977Swnj  *	    by $), is constrained to matches that begin at the left
711977Swnj  *	    (or end at the right) end of a line.
721977Swnj  *	10. A regular expression of form 1-9 picks out the longest among
731977Swnj  *	    the leftmost matches in a line.
741977Swnj  *	11. An empty regular expression stands for a copy of the last
751977Swnj  *	    regular expression encountered.
761977Swnj  */
771977Swnj 
781977Swnj /*
791977Swnj  * constants for re's
801977Swnj  */
811977Swnj #define	CBRA	1
821977Swnj #define	CCHR	2
831977Swnj #define	CDOT	4
841977Swnj #define	CCL	6
851977Swnj #define	NCCL	8
861977Swnj #define	CDOL	10
871977Swnj #define	CEOF	11
881977Swnj #define	CKET	12
891977Swnj #define	CBACK	18
901977Swnj 
911977Swnj #define	CSTAR	01
921977Swnj 
931977Swnj #define	ESIZE	512
941977Swnj #define	NBRA	9
951977Swnj 
961977Swnj static	char	expbuf[ESIZE], *braslist[NBRA], *braelist[NBRA];
971977Swnj static	char	circf;
981977Swnj 
991977Swnj /*
1001977Swnj  * compile the regular expression argument into a dfa
1011977Swnj  */
1021977Swnj char *
1031977Swnj re_comp(sp)
1041977Swnj 	register char	*sp;
1051977Swnj {
1061977Swnj 	register int	c;
1071977Swnj 	register char	*ep = expbuf;
1081977Swnj 	int	cclcnt, numbra = 0;
1091977Swnj 	char	*lastep = 0;
1101977Swnj 	char	bracket[NBRA];
1111977Swnj 	char	*bracketp = &bracket[0];
1121977Swnj 	static	char	*retoolong = "Regular expression too long";
1131977Swnj 
1141977Swnj #define	comerr(msg) {expbuf[0] = 0; numbra = 0; return(msg); }
1151977Swnj 
1161977Swnj 	if (sp == 0 || *sp == '\0') {
1171977Swnj 		if (*ep == 0)
1181977Swnj 			return("No previous regular expression");
1191977Swnj 		return(0);
1201977Swnj 	}
1211977Swnj 	if (*sp == '^') {
1221977Swnj 		circf = 1;
1231977Swnj 		sp++;
1241977Swnj 	}
1251977Swnj 	else
1261977Swnj 		circf = 0;
1271977Swnj 	for (;;) {
1281977Swnj 		if (ep >= &expbuf[ESIZE])
1291977Swnj 			comerr(retoolong);
1301977Swnj 		if ((c = *sp++) == '\0') {
1311977Swnj 			if (bracketp != bracket)
1321977Swnj 				comerr("unmatched \\(");
1331977Swnj 			*ep++ = CEOF;
1341977Swnj 			*ep++ = 0;
1351977Swnj 			return(0);
1361977Swnj 		}
1371977Swnj 		if (c != '*')
1381977Swnj 			lastep = ep;
1391977Swnj 		switch (c) {
1401977Swnj 
1411977Swnj 		case '.':
1421977Swnj 			*ep++ = CDOT;
1431977Swnj 			continue;
1441977Swnj 
1451977Swnj 		case '*':
1461977Swnj 			if (lastep == 0 || *lastep == CBRA || *lastep == CKET)
1471977Swnj 				goto defchar;
1481977Swnj 			*lastep |= CSTAR;
1491977Swnj 			continue;
1501977Swnj 
1511977Swnj 		case '$':
1521977Swnj 			if (*sp != '\0')
1531977Swnj 				goto defchar;
1541977Swnj 			*ep++ = CDOL;
1551977Swnj 			continue;
1561977Swnj 
1571977Swnj 		case '[':
1581977Swnj 			*ep++ = CCL;
1591977Swnj 			*ep++ = 0;
1601977Swnj 			cclcnt = 1;
1611977Swnj 			if ((c = *sp++) == '^') {
1621977Swnj 				c = *sp++;
1631977Swnj 				ep[-2] = NCCL;
1641977Swnj 			}
1651977Swnj 			do {
1661977Swnj 				if (c == '\0')
1671977Swnj 					comerr("missing ]");
1681977Swnj 				if (c == '-' && ep [-1] != 0) {
1691977Swnj 					if ((c = *sp++) == ']') {
1701977Swnj 						*ep++ = '-';
1711977Swnj 						cclcnt++;
1721977Swnj 						break;
1731977Swnj 					}
1741977Swnj 					while (ep[-1] < c) {
1751977Swnj 						*ep = ep[-1] + 1;
1761977Swnj 						ep++;
1771977Swnj 						cclcnt++;
1781977Swnj 						if (ep >= &expbuf[ESIZE])
1791977Swnj 							comerr(retoolong);
1801977Swnj 					}
1811977Swnj 				}
1821977Swnj 				*ep++ = c;
1831977Swnj 				cclcnt++;
1841977Swnj 				if (ep >= &expbuf[ESIZE])
1851977Swnj 					comerr(retoolong);
1861977Swnj 			} while ((c = *sp++) != ']');
1871977Swnj 			lastep[1] = cclcnt;
1881977Swnj 			continue;
1891977Swnj 
1901977Swnj 		case '\\':
1911977Swnj 			if ((c = *sp++) == '(') {
1921977Swnj 				if (numbra >= NBRA)
1931977Swnj 					comerr("too many \\(\\) pairs");
1941977Swnj 				*bracketp++ = numbra;
1951977Swnj 				*ep++ = CBRA;
1961977Swnj 				*ep++ = numbra++;
1971977Swnj 				continue;
1981977Swnj 			}
1991977Swnj 			if (c == ')') {
2001977Swnj 				if (bracketp <= bracket)
2011977Swnj 					comerr("unmatched \\)");
2021977Swnj 				*ep++ = CKET;
2031977Swnj 				*ep++ = *--bracketp;
2041977Swnj 				continue;
2051977Swnj 			}
2061977Swnj 			if (c >= '1' && c < ('1' + NBRA)) {
2071977Swnj 				*ep++ = CBACK;
2081977Swnj 				*ep++ = c - '1';
2091977Swnj 				continue;
2101977Swnj 			}
2111977Swnj 			*ep++ = CCHR;
2121977Swnj 			*ep++ = c;
2131977Swnj 			continue;
2141977Swnj 
2151977Swnj 		defchar:
2161977Swnj 		default:
2171977Swnj 			*ep++ = CCHR;
2181977Swnj 			*ep++ = c;
2191977Swnj 		}
2201977Swnj 	}
2211977Swnj }
2221977Swnj 
2231977Swnj /*
2241977Swnj  * match the argument string against the compiled re
2251977Swnj  */
2261977Swnj int
2271977Swnj re_exec(p1)
2281977Swnj 	register char	*p1;
2291977Swnj {
2301977Swnj 	register char	*p2 = expbuf;
2311977Swnj 	register int	c;
2321977Swnj 	int	rv;
2331977Swnj 
2341977Swnj 	for (c = 0; c < NBRA; c++) {
2351977Swnj 		braslist[c] = 0;
2361977Swnj 		braelist[c] = 0;
2371977Swnj 	}
2381977Swnj 	if (circf)
2391977Swnj 		return((advance(p1, p2)));
2401977Swnj 	/*
2411977Swnj 	 * fast check for first character
2421977Swnj 	 */
2431977Swnj 	if (*p2 == CCHR) {
2441977Swnj 		c = p2[1];
2451977Swnj 		do {
2461977Swnj 			if (*p1 != c)
2471977Swnj 				continue;
2481977Swnj 			if (rv = advance(p1, p2))
2491977Swnj 				return(rv);
2501977Swnj 		} while (*p1++);
2511977Swnj 		return(0);
2521977Swnj 	}
2531977Swnj 	/*
2541977Swnj 	 * regular algorithm
2551977Swnj 	 */
2561977Swnj 	do
2571977Swnj 		if (rv = advance(p1, p2))
2581977Swnj 			return(rv);
2591977Swnj 	while (*p1++);
2601977Swnj 	return(0);
2611977Swnj }
2621977Swnj 
2631977Swnj /*
2641977Swnj  * try to match the next thing in the dfa
2651977Swnj  */
2661977Swnj static	int
2671977Swnj advance(lp, ep)
2681977Swnj 	register char	*lp, *ep;
2691977Swnj {
2701977Swnj 	register char	*curlp;
2711977Swnj 	int	ct, i;
2721977Swnj 	int	rv;
2731977Swnj 
2741977Swnj 	for (;;)
2751977Swnj 		switch (*ep++) {
2761977Swnj 
2771977Swnj 		case CCHR:
2781977Swnj 			if (*ep++ == *lp++)
2791977Swnj 				continue;
2801977Swnj 			return(0);
2811977Swnj 
2821977Swnj 		case CDOT:
2831977Swnj 			if (*lp++)
2841977Swnj 				continue;
2851977Swnj 			return(0);
2861977Swnj 
2871977Swnj 		case CDOL:
2881977Swnj 			if (*lp == '\0')
2891977Swnj 				continue;
2901977Swnj 			return(0);
2911977Swnj 
2921977Swnj 		case CEOF:
2931977Swnj 			return(1);
2941977Swnj 
2951977Swnj 		case CCL:
2961977Swnj 			if (cclass(ep, *lp++, 1)) {
2971977Swnj 				ep += *ep;
2981977Swnj 				continue;
2991977Swnj 			}
3001977Swnj 			return(0);
3011977Swnj 
3021977Swnj 		case NCCL:
3031977Swnj 			if (cclass(ep, *lp++, 0)) {
3041977Swnj 				ep += *ep;
3051977Swnj 				continue;
3061977Swnj 			}
3071977Swnj 			return(0);
3081977Swnj 
3091977Swnj 		case CBRA:
3101977Swnj 			braslist[*ep++] = lp;
3111977Swnj 			continue;
3121977Swnj 
3131977Swnj 		case CKET:
3141977Swnj 			braelist[*ep++] = lp;
3151977Swnj 			continue;
3161977Swnj 
3171977Swnj 		case CBACK:
3181977Swnj 			if (braelist[i = *ep++] == 0)
3191977Swnj 				return(-1);
3201977Swnj 			if (backref(i, lp)) {
3211977Swnj 				lp += braelist[i] - braslist[i];
3221977Swnj 				continue;
3231977Swnj 			}
3241977Swnj 			return(0);
3251977Swnj 
3261977Swnj 		case CBACK|CSTAR:
3271977Swnj 			if (braelist[i = *ep++] == 0)
3281977Swnj 				return(-1);
3291977Swnj 			curlp = lp;
3301977Swnj 			ct = braelist[i] - braslist[i];
3311977Swnj 			while (backref(i, lp))
3321977Swnj 				lp += ct;
3331977Swnj 			while (lp >= curlp) {
3341977Swnj 				if (rv = advance(lp, ep))
3351977Swnj 					return(rv);
3361977Swnj 				lp -= ct;
3371977Swnj 			}
3381977Swnj 			continue;
3391977Swnj 
3401977Swnj 		case CDOT|CSTAR:
3411977Swnj 			curlp = lp;
3421977Swnj 			while (*lp++)
3431977Swnj 				;
3441977Swnj 			goto star;
3451977Swnj 
3461977Swnj 		case CCHR|CSTAR:
3471977Swnj 			curlp = lp;
3481977Swnj 			while (*lp++ == *ep)
3491977Swnj 				;
3501977Swnj 			ep++;
3511977Swnj 			goto star;
3521977Swnj 
3531977Swnj 		case CCL|CSTAR:
3541977Swnj 		case NCCL|CSTAR:
3551977Swnj 			curlp = lp;
3561977Swnj 			while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR)))
3571977Swnj 				;
3581977Swnj 			ep += *ep;
3591977Swnj 			goto star;
3601977Swnj 
3611977Swnj 		star:
3621977Swnj 			do {
3631977Swnj 				lp--;
3641977Swnj 				if (rv = advance(lp, ep))
3651977Swnj 					return(rv);
3661977Swnj 			} while (lp > curlp);
3671977Swnj 			return(0);
3681977Swnj 
3691977Swnj 		default:
3701977Swnj 			return(-1);
3711977Swnj 		}
3721977Swnj }
3731977Swnj 
374*46488Sbostic static
3751977Swnj backref(i, lp)
3761977Swnj 	register int	i;
3771977Swnj 	register char	*lp;
3781977Swnj {
3791977Swnj 	register char	*bp;
3801977Swnj 
3811977Swnj 	bp = braslist[i];
3821977Swnj 	while (*bp++ == *lp++)
3831977Swnj 		if (bp >= braelist[i])
3841977Swnj 			return(1);
3851977Swnj 	return(0);
3861977Swnj }
3871977Swnj 
388*46488Sbostic static int
3891977Swnj cclass(set, c, af)
3901977Swnj 	register char	*set, c;
3911977Swnj 	int	af;
3921977Swnj {
3931977Swnj 	register int	n;
3941977Swnj 
3951977Swnj 	if (c == 0)
3961977Swnj 		return(0);
3971977Swnj 	n = *set++;
3981977Swnj 	while (--n)
3991977Swnj 		if (*set++ == c)
4001977Swnj 			return(af);
4011977Swnj 	return(! af);
4021977Swnj }
403