xref: /csrg-svn/lib/libcompat/4.3/regex.c (revision 61241)
148399Sbostic /*-
2*61241Sbostic  * Copyright (c) 1980, 1993
3*61241Sbostic  *	The Regents of the University of California.  All rights reserved.
448399Sbostic  *
548399Sbostic  * %sccs.include.proprietary.c%
621356Sdist  */
721356Sdist 
826580Sdonn #if defined(LIBC_SCCS) && !defined(lint)
9*61241Sbostic static char sccsid[] = "@(#)regex.c	8.1 (Berkeley) 06/04/93";
1048399Sbostic #endif /* LIBC_SCCS and not lint */
1121356Sdist 
1246597Sdonn #include <unistd.h>
131977Swnj 
1446597Sdonn static int advance();
1546597Sdonn static int backref();
1646597Sdonn static int cclass();
1746597Sdonn 
181977Swnj /*
191977Swnj  * routines to do regular expression matching
201977Swnj  *
211977Swnj  * Entry points:
221977Swnj  *
231977Swnj  *	re_comp(s)
241977Swnj  *		char *s;
251977Swnj  *	 ... returns 0 if the string s was compiled successfully,
261977Swnj  *		     a pointer to an error message otherwise.
271977Swnj  *	     If passed 0 or a null string returns without changing
281977Swnj  *           the currently compiled re (see note 11 below).
291977Swnj  *
301977Swnj  *	re_exec(s)
311977Swnj  *		char *s;
321977Swnj  *	 ... returns 1 if the string s matches the last compiled regular
331977Swnj  *		       expression,
341977Swnj  *		     0 if the string s failed to match the last compiled
351977Swnj  *		       regular expression, and
361977Swnj  *		    -1 if the compiled regular expression was invalid
371977Swnj  *		       (indicating an internal error).
381977Swnj  *
391977Swnj  * The strings passed to both re_comp and re_exec may have trailing or
401977Swnj  * embedded newline characters; they are terminated by nulls.
411977Swnj  *
421977Swnj  * The identity of the author of these routines is lost in antiquity;
431977Swnj  * this is essentially the same as the re code in the original V6 ed.
441977Swnj  *
451977Swnj  * The regular expressions recognized are described below. This description
461977Swnj  * is essentially the same as that for ed.
471977Swnj  *
481977Swnj  *	A regular expression specifies a set of strings of characters.
491977Swnj  *	A member of this set of strings is said to be matched by
501977Swnj  *	the regular expression.  In the following specification for
511977Swnj  *	regular expressions the word `character' means any character but NUL.
521977Swnj  *
531977Swnj  *	1.  Any character except a special character matches itself.
541977Swnj  *	    Special characters are the regular expression delimiter plus
551977Swnj  *	    \ [ . and sometimes ^ * $.
561977Swnj  *	2.  A . matches any character.
571977Swnj  *	3.  A \ followed by any character except a digit or ( )
581977Swnj  *	    matches that character.
591977Swnj  *	4.  A nonempty string s bracketed [s] (or [^s]) matches any
601977Swnj  *	    character in (or not in) s. In s, \ has no special meaning,
611977Swnj  *	    and ] may only appear as the first letter. A substring
621977Swnj  *	    a-b, with a and b in ascending ASCII order, stands for
631977Swnj  *	    the inclusive range of ASCII characters.
641977Swnj  *	5.  A regular expression of form 1-4 followed by * matches a
651977Swnj  *	    sequence of 0 or more matches of the regular expression.
661977Swnj  *	6.  A regular expression, x, of form 1-8, bracketed \(x\)
671977Swnj  *	    matches what x matches.
681977Swnj  *	7.  A \ followed by a digit n matches a copy of the string that the
691977Swnj  *	    bracketed regular expression beginning with the nth \( matched.
701977Swnj  *	8.  A regular expression of form 1-8, x, followed by a regular
711977Swnj  *	    expression of form 1-7, y matches a match for x followed by
721977Swnj  *	    a match for y, with the x match being as long as possible
731977Swnj  *	    while still permitting a y match.
741977Swnj  *	9.  A regular expression of form 1-8 preceded by ^ (or followed
751977Swnj  *	    by $), is constrained to matches that begin at the left
761977Swnj  *	    (or end at the right) end of a line.
771977Swnj  *	10. A regular expression of form 1-9 picks out the longest among
781977Swnj  *	    the leftmost matches in a line.
791977Swnj  *	11. An empty regular expression stands for a copy of the last
801977Swnj  *	    regular expression encountered.
811977Swnj  */
821977Swnj 
831977Swnj /*
841977Swnj  * constants for re's
851977Swnj  */
861977Swnj #define	CBRA	1
871977Swnj #define	CCHR	2
881977Swnj #define	CDOT	4
891977Swnj #define	CCL	6
901977Swnj #define	NCCL	8
911977Swnj #define	CDOL	10
921977Swnj #define	CEOF	11
931977Swnj #define	CKET	12
941977Swnj #define	CBACK	18
951977Swnj 
961977Swnj #define	CSTAR	01
971977Swnj 
981977Swnj #define	ESIZE	512
991977Swnj #define	NBRA	9
1001977Swnj 
1011977Swnj static	char	expbuf[ESIZE], *braslist[NBRA], *braelist[NBRA];
1021977Swnj static	char	circf;
1031977Swnj 
1041977Swnj /*
1051977Swnj  * compile the regular expression argument into a dfa
1061977Swnj  */
1071977Swnj char *
re_comp(sp)1081977Swnj re_comp(sp)
10946597Sdonn 	register const char	*sp;
1101977Swnj {
1111977Swnj 	register int	c;
1121977Swnj 	register char	*ep = expbuf;
1131977Swnj 	int	cclcnt, numbra = 0;
1141977Swnj 	char	*lastep = 0;
1151977Swnj 	char	bracket[NBRA];
1161977Swnj 	char	*bracketp = &bracket[0];
1171977Swnj 	static	char	*retoolong = "Regular expression too long";
1181977Swnj 
1191977Swnj #define	comerr(msg) {expbuf[0] = 0; numbra = 0; return(msg); }
1201977Swnj 
1211977Swnj 	if (sp == 0 || *sp == '\0') {
1221977Swnj 		if (*ep == 0)
1231977Swnj 			return("No previous regular expression");
1241977Swnj 		return(0);
1251977Swnj 	}
1261977Swnj 	if (*sp == '^') {
1271977Swnj 		circf = 1;
1281977Swnj 		sp++;
1291977Swnj 	}
1301977Swnj 	else
1311977Swnj 		circf = 0;
1321977Swnj 	for (;;) {
1331977Swnj 		if (ep >= &expbuf[ESIZE])
1341977Swnj 			comerr(retoolong);
1351977Swnj 		if ((c = *sp++) == '\0') {
1361977Swnj 			if (bracketp != bracket)
1371977Swnj 				comerr("unmatched \\(");
1381977Swnj 			*ep++ = CEOF;
1391977Swnj 			*ep++ = 0;
1401977Swnj 			return(0);
1411977Swnj 		}
1421977Swnj 		if (c != '*')
1431977Swnj 			lastep = ep;
1441977Swnj 		switch (c) {
1451977Swnj 
1461977Swnj 		case '.':
1471977Swnj 			*ep++ = CDOT;
1481977Swnj 			continue;
1491977Swnj 
1501977Swnj 		case '*':
1511977Swnj 			if (lastep == 0 || *lastep == CBRA || *lastep == CKET)
1521977Swnj 				goto defchar;
1531977Swnj 			*lastep |= CSTAR;
1541977Swnj 			continue;
1551977Swnj 
1561977Swnj 		case '$':
1571977Swnj 			if (*sp != '\0')
1581977Swnj 				goto defchar;
1591977Swnj 			*ep++ = CDOL;
1601977Swnj 			continue;
1611977Swnj 
1621977Swnj 		case '[':
1631977Swnj 			*ep++ = CCL;
1641977Swnj 			*ep++ = 0;
1651977Swnj 			cclcnt = 1;
1661977Swnj 			if ((c = *sp++) == '^') {
1671977Swnj 				c = *sp++;
1681977Swnj 				ep[-2] = NCCL;
1691977Swnj 			}
1701977Swnj 			do {
1711977Swnj 				if (c == '\0')
1721977Swnj 					comerr("missing ]");
1731977Swnj 				if (c == '-' && ep [-1] != 0) {
1741977Swnj 					if ((c = *sp++) == ']') {
1751977Swnj 						*ep++ = '-';
1761977Swnj 						cclcnt++;
1771977Swnj 						break;
1781977Swnj 					}
1791977Swnj 					while (ep[-1] < c) {
1801977Swnj 						*ep = ep[-1] + 1;
1811977Swnj 						ep++;
1821977Swnj 						cclcnt++;
1831977Swnj 						if (ep >= &expbuf[ESIZE])
1841977Swnj 							comerr(retoolong);
1851977Swnj 					}
1861977Swnj 				}
1871977Swnj 				*ep++ = c;
1881977Swnj 				cclcnt++;
1891977Swnj 				if (ep >= &expbuf[ESIZE])
1901977Swnj 					comerr(retoolong);
1911977Swnj 			} while ((c = *sp++) != ']');
1921977Swnj 			lastep[1] = cclcnt;
1931977Swnj 			continue;
1941977Swnj 
1951977Swnj 		case '\\':
1961977Swnj 			if ((c = *sp++) == '(') {
1971977Swnj 				if (numbra >= NBRA)
1981977Swnj 					comerr("too many \\(\\) pairs");
1991977Swnj 				*bracketp++ = numbra;
2001977Swnj 				*ep++ = CBRA;
2011977Swnj 				*ep++ = numbra++;
2021977Swnj 				continue;
2031977Swnj 			}
2041977Swnj 			if (c == ')') {
2051977Swnj 				if (bracketp <= bracket)
2061977Swnj 					comerr("unmatched \\)");
2071977Swnj 				*ep++ = CKET;
2081977Swnj 				*ep++ = *--bracketp;
2091977Swnj 				continue;
2101977Swnj 			}
2111977Swnj 			if (c >= '1' && c < ('1' + NBRA)) {
2121977Swnj 				*ep++ = CBACK;
2131977Swnj 				*ep++ = c - '1';
2141977Swnj 				continue;
2151977Swnj 			}
2161977Swnj 			*ep++ = CCHR;
2171977Swnj 			*ep++ = c;
2181977Swnj 			continue;
2191977Swnj 
2201977Swnj 		defchar:
2211977Swnj 		default:
2221977Swnj 			*ep++ = CCHR;
2231977Swnj 			*ep++ = c;
2241977Swnj 		}
2251977Swnj 	}
2261977Swnj }
2271977Swnj 
2281977Swnj /*
2291977Swnj  * match the argument string against the compiled re
2301977Swnj  */
2311977Swnj int
re_exec(p1)2321977Swnj re_exec(p1)
23346597Sdonn 	register const char	*p1;
2341977Swnj {
2351977Swnj 	register char	*p2 = expbuf;
2361977Swnj 	register int	c;
2371977Swnj 	int	rv;
2381977Swnj 
2391977Swnj 	for (c = 0; c < NBRA; c++) {
2401977Swnj 		braslist[c] = 0;
2411977Swnj 		braelist[c] = 0;
2421977Swnj 	}
2431977Swnj 	if (circf)
2441977Swnj 		return((advance(p1, p2)));
2451977Swnj 	/*
2461977Swnj 	 * fast check for first character
2471977Swnj 	 */
2481977Swnj 	if (*p2 == CCHR) {
2491977Swnj 		c = p2[1];
2501977Swnj 		do {
2511977Swnj 			if (*p1 != c)
2521977Swnj 				continue;
2531977Swnj 			if (rv = advance(p1, p2))
2541977Swnj 				return(rv);
2551977Swnj 		} while (*p1++);
2561977Swnj 		return(0);
2571977Swnj 	}
2581977Swnj 	/*
2591977Swnj 	 * regular algorithm
2601977Swnj 	 */
2611977Swnj 	do
2621977Swnj 		if (rv = advance(p1, p2))
2631977Swnj 			return(rv);
2641977Swnj 	while (*p1++);
2651977Swnj 	return(0);
2661977Swnj }
2671977Swnj 
2681977Swnj /*
2691977Swnj  * try to match the next thing in the dfa
2701977Swnj  */
2711977Swnj static	int
advance(lp,ep)2721977Swnj advance(lp, ep)
2731977Swnj 	register char	*lp, *ep;
2741977Swnj {
2751977Swnj 	register char	*curlp;
2761977Swnj 	int	ct, i;
2771977Swnj 	int	rv;
2781977Swnj 
2791977Swnj 	for (;;)
2801977Swnj 		switch (*ep++) {
2811977Swnj 
2821977Swnj 		case CCHR:
2831977Swnj 			if (*ep++ == *lp++)
2841977Swnj 				continue;
2851977Swnj 			return(0);
2861977Swnj 
2871977Swnj 		case CDOT:
2881977Swnj 			if (*lp++)
2891977Swnj 				continue;
2901977Swnj 			return(0);
2911977Swnj 
2921977Swnj 		case CDOL:
2931977Swnj 			if (*lp == '\0')
2941977Swnj 				continue;
2951977Swnj 			return(0);
2961977Swnj 
2971977Swnj 		case CEOF:
2981977Swnj 			return(1);
2991977Swnj 
3001977Swnj 		case CCL:
3011977Swnj 			if (cclass(ep, *lp++, 1)) {
3021977Swnj 				ep += *ep;
3031977Swnj 				continue;
3041977Swnj 			}
3051977Swnj 			return(0);
3061977Swnj 
3071977Swnj 		case NCCL:
3081977Swnj 			if (cclass(ep, *lp++, 0)) {
3091977Swnj 				ep += *ep;
3101977Swnj 				continue;
3111977Swnj 			}
3121977Swnj 			return(0);
3131977Swnj 
3141977Swnj 		case CBRA:
3151977Swnj 			braslist[*ep++] = lp;
3161977Swnj 			continue;
3171977Swnj 
3181977Swnj 		case CKET:
3191977Swnj 			braelist[*ep++] = lp;
3201977Swnj 			continue;
3211977Swnj 
3221977Swnj 		case CBACK:
3231977Swnj 			if (braelist[i = *ep++] == 0)
3241977Swnj 				return(-1);
3251977Swnj 			if (backref(i, lp)) {
3261977Swnj 				lp += braelist[i] - braslist[i];
3271977Swnj 				continue;
3281977Swnj 			}
3291977Swnj 			return(0);
3301977Swnj 
3311977Swnj 		case CBACK|CSTAR:
3321977Swnj 			if (braelist[i = *ep++] == 0)
3331977Swnj 				return(-1);
3341977Swnj 			curlp = lp;
3351977Swnj 			ct = braelist[i] - braslist[i];
3361977Swnj 			while (backref(i, lp))
3371977Swnj 				lp += ct;
3381977Swnj 			while (lp >= curlp) {
3391977Swnj 				if (rv = advance(lp, ep))
3401977Swnj 					return(rv);
3411977Swnj 				lp -= ct;
3421977Swnj 			}
3431977Swnj 			continue;
3441977Swnj 
3451977Swnj 		case CDOT|CSTAR:
3461977Swnj 			curlp = lp;
3471977Swnj 			while (*lp++)
3481977Swnj 				;
3491977Swnj 			goto star;
3501977Swnj 
3511977Swnj 		case CCHR|CSTAR:
3521977Swnj 			curlp = lp;
3531977Swnj 			while (*lp++ == *ep)
3541977Swnj 				;
3551977Swnj 			ep++;
3561977Swnj 			goto star;
3571977Swnj 
3581977Swnj 		case CCL|CSTAR:
3591977Swnj 		case NCCL|CSTAR:
3601977Swnj 			curlp = lp;
3611977Swnj 			while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR)))
3621977Swnj 				;
3631977Swnj 			ep += *ep;
3641977Swnj 			goto star;
3651977Swnj 
3661977Swnj 		star:
3671977Swnj 			do {
3681977Swnj 				lp--;
3691977Swnj 				if (rv = advance(lp, ep))
3701977Swnj 					return(rv);
3711977Swnj 			} while (lp > curlp);
3721977Swnj 			return(0);
3731977Swnj 
3741977Swnj 		default:
3751977Swnj 			return(-1);
3761977Swnj 		}
3771977Swnj }
3781977Swnj 
37946488Sbostic static
backref(i,lp)3801977Swnj backref(i, lp)
3811977Swnj 	register int	i;
3821977Swnj 	register char	*lp;
3831977Swnj {
3841977Swnj 	register char	*bp;
3851977Swnj 
3861977Swnj 	bp = braslist[i];
3871977Swnj 	while (*bp++ == *lp++)
3881977Swnj 		if (bp >= braelist[i])
3891977Swnj 			return(1);
3901977Swnj 	return(0);
3911977Swnj }
3921977Swnj 
39346488Sbostic static int
cclass(set,c,af)3941977Swnj cclass(set, c, af)
3951977Swnj 	register char	*set, c;
3961977Swnj 	int	af;
3971977Swnj {
3981977Swnj 	register int	n;
3991977Swnj 
4001977Swnj 	if (c == 0)
4011977Swnj 		return(0);
4021977Swnj 	n = *set++;
4031977Swnj 	while (--n)
4041977Swnj 		if (*set++ == c)
4051977Swnj 			return(af);
4061977Swnj 	return(! af);
4071977Swnj }
408