xref: /csrg-svn/lib/libcompat/4.3/regex.c (revision 46597)
121356Sdist /*
221356Sdist  * Copyright (c) 1980 Regents of the University of California.
321356Sdist  * All rights reserved.  The Berkeley software License Agreement
421356Sdist  * specifies the terms and conditions for redistribution.
521356Sdist  */
621356Sdist 
726580Sdonn #if defined(LIBC_SCCS) && !defined(lint)
8*46597Sdonn static char sccsid[] = "@(#)regex.c	5.4 (Berkeley) 02/23/91";
926580Sdonn #endif LIBC_SCCS and not lint
1021356Sdist 
11*46597Sdonn #include <unistd.h>
121977Swnj 
13*46597Sdonn static int advance();
14*46597Sdonn static int backref();
15*46597Sdonn static int cclass();
16*46597Sdonn 
171977Swnj /*
181977Swnj  * routines to do regular expression matching
191977Swnj  *
201977Swnj  * Entry points:
211977Swnj  *
221977Swnj  *	re_comp(s)
231977Swnj  *		char *s;
241977Swnj  *	 ... returns 0 if the string s was compiled successfully,
251977Swnj  *		     a pointer to an error message otherwise.
261977Swnj  *	     If passed 0 or a null string returns without changing
271977Swnj  *           the currently compiled re (see note 11 below).
281977Swnj  *
291977Swnj  *	re_exec(s)
301977Swnj  *		char *s;
311977Swnj  *	 ... returns 1 if the string s matches the last compiled regular
321977Swnj  *		       expression,
331977Swnj  *		     0 if the string s failed to match the last compiled
341977Swnj  *		       regular expression, and
351977Swnj  *		    -1 if the compiled regular expression was invalid
361977Swnj  *		       (indicating an internal error).
371977Swnj  *
381977Swnj  * The strings passed to both re_comp and re_exec may have trailing or
391977Swnj  * embedded newline characters; they are terminated by nulls.
401977Swnj  *
411977Swnj  * The identity of the author of these routines is lost in antiquity;
421977Swnj  * this is essentially the same as the re code in the original V6 ed.
431977Swnj  *
441977Swnj  * The regular expressions recognized are described below. This description
451977Swnj  * is essentially the same as that for ed.
461977Swnj  *
471977Swnj  *	A regular expression specifies a set of strings of characters.
481977Swnj  *	A member of this set of strings is said to be matched by
491977Swnj  *	the regular expression.  In the following specification for
501977Swnj  *	regular expressions the word `character' means any character but NUL.
511977Swnj  *
521977Swnj  *	1.  Any character except a special character matches itself.
531977Swnj  *	    Special characters are the regular expression delimiter plus
541977Swnj  *	    \ [ . and sometimes ^ * $.
551977Swnj  *	2.  A . matches any character.
561977Swnj  *	3.  A \ followed by any character except a digit or ( )
571977Swnj  *	    matches that character.
581977Swnj  *	4.  A nonempty string s bracketed [s] (or [^s]) matches any
591977Swnj  *	    character in (or not in) s. In s, \ has no special meaning,
601977Swnj  *	    and ] may only appear as the first letter. A substring
611977Swnj  *	    a-b, with a and b in ascending ASCII order, stands for
621977Swnj  *	    the inclusive range of ASCII characters.
631977Swnj  *	5.  A regular expression of form 1-4 followed by * matches a
641977Swnj  *	    sequence of 0 or more matches of the regular expression.
651977Swnj  *	6.  A regular expression, x, of form 1-8, bracketed \(x\)
661977Swnj  *	    matches what x matches.
671977Swnj  *	7.  A \ followed by a digit n matches a copy of the string that the
681977Swnj  *	    bracketed regular expression beginning with the nth \( matched.
691977Swnj  *	8.  A regular expression of form 1-8, x, followed by a regular
701977Swnj  *	    expression of form 1-7, y matches a match for x followed by
711977Swnj  *	    a match for y, with the x match being as long as possible
721977Swnj  *	    while still permitting a y match.
731977Swnj  *	9.  A regular expression of form 1-8 preceded by ^ (or followed
741977Swnj  *	    by $), is constrained to matches that begin at the left
751977Swnj  *	    (or end at the right) end of a line.
761977Swnj  *	10. A regular expression of form 1-9 picks out the longest among
771977Swnj  *	    the leftmost matches in a line.
781977Swnj  *	11. An empty regular expression stands for a copy of the last
791977Swnj  *	    regular expression encountered.
801977Swnj  */
811977Swnj 
821977Swnj /*
831977Swnj  * constants for re's
841977Swnj  */
851977Swnj #define	CBRA	1
861977Swnj #define	CCHR	2
871977Swnj #define	CDOT	4
881977Swnj #define	CCL	6
891977Swnj #define	NCCL	8
901977Swnj #define	CDOL	10
911977Swnj #define	CEOF	11
921977Swnj #define	CKET	12
931977Swnj #define	CBACK	18
941977Swnj 
951977Swnj #define	CSTAR	01
961977Swnj 
971977Swnj #define	ESIZE	512
981977Swnj #define	NBRA	9
991977Swnj 
1001977Swnj static	char	expbuf[ESIZE], *braslist[NBRA], *braelist[NBRA];
1011977Swnj static	char	circf;
1021977Swnj 
1031977Swnj /*
1041977Swnj  * compile the regular expression argument into a dfa
1051977Swnj  */
1061977Swnj char *
1071977Swnj re_comp(sp)
108*46597Sdonn 	register const char	*sp;
1091977Swnj {
1101977Swnj 	register int	c;
1111977Swnj 	register char	*ep = expbuf;
1121977Swnj 	int	cclcnt, numbra = 0;
1131977Swnj 	char	*lastep = 0;
1141977Swnj 	char	bracket[NBRA];
1151977Swnj 	char	*bracketp = &bracket[0];
1161977Swnj 	static	char	*retoolong = "Regular expression too long";
1171977Swnj 
1181977Swnj #define	comerr(msg) {expbuf[0] = 0; numbra = 0; return(msg); }
1191977Swnj 
1201977Swnj 	if (sp == 0 || *sp == '\0') {
1211977Swnj 		if (*ep == 0)
1221977Swnj 			return("No previous regular expression");
1231977Swnj 		return(0);
1241977Swnj 	}
1251977Swnj 	if (*sp == '^') {
1261977Swnj 		circf = 1;
1271977Swnj 		sp++;
1281977Swnj 	}
1291977Swnj 	else
1301977Swnj 		circf = 0;
1311977Swnj 	for (;;) {
1321977Swnj 		if (ep >= &expbuf[ESIZE])
1331977Swnj 			comerr(retoolong);
1341977Swnj 		if ((c = *sp++) == '\0') {
1351977Swnj 			if (bracketp != bracket)
1361977Swnj 				comerr("unmatched \\(");
1371977Swnj 			*ep++ = CEOF;
1381977Swnj 			*ep++ = 0;
1391977Swnj 			return(0);
1401977Swnj 		}
1411977Swnj 		if (c != '*')
1421977Swnj 			lastep = ep;
1431977Swnj 		switch (c) {
1441977Swnj 
1451977Swnj 		case '.':
1461977Swnj 			*ep++ = CDOT;
1471977Swnj 			continue;
1481977Swnj 
1491977Swnj 		case '*':
1501977Swnj 			if (lastep == 0 || *lastep == CBRA || *lastep == CKET)
1511977Swnj 				goto defchar;
1521977Swnj 			*lastep |= CSTAR;
1531977Swnj 			continue;
1541977Swnj 
1551977Swnj 		case '$':
1561977Swnj 			if (*sp != '\0')
1571977Swnj 				goto defchar;
1581977Swnj 			*ep++ = CDOL;
1591977Swnj 			continue;
1601977Swnj 
1611977Swnj 		case '[':
1621977Swnj 			*ep++ = CCL;
1631977Swnj 			*ep++ = 0;
1641977Swnj 			cclcnt = 1;
1651977Swnj 			if ((c = *sp++) == '^') {
1661977Swnj 				c = *sp++;
1671977Swnj 				ep[-2] = NCCL;
1681977Swnj 			}
1691977Swnj 			do {
1701977Swnj 				if (c == '\0')
1711977Swnj 					comerr("missing ]");
1721977Swnj 				if (c == '-' && ep [-1] != 0) {
1731977Swnj 					if ((c = *sp++) == ']') {
1741977Swnj 						*ep++ = '-';
1751977Swnj 						cclcnt++;
1761977Swnj 						break;
1771977Swnj 					}
1781977Swnj 					while (ep[-1] < c) {
1791977Swnj 						*ep = ep[-1] + 1;
1801977Swnj 						ep++;
1811977Swnj 						cclcnt++;
1821977Swnj 						if (ep >= &expbuf[ESIZE])
1831977Swnj 							comerr(retoolong);
1841977Swnj 					}
1851977Swnj 				}
1861977Swnj 				*ep++ = c;
1871977Swnj 				cclcnt++;
1881977Swnj 				if (ep >= &expbuf[ESIZE])
1891977Swnj 					comerr(retoolong);
1901977Swnj 			} while ((c = *sp++) != ']');
1911977Swnj 			lastep[1] = cclcnt;
1921977Swnj 			continue;
1931977Swnj 
1941977Swnj 		case '\\':
1951977Swnj 			if ((c = *sp++) == '(') {
1961977Swnj 				if (numbra >= NBRA)
1971977Swnj 					comerr("too many \\(\\) pairs");
1981977Swnj 				*bracketp++ = numbra;
1991977Swnj 				*ep++ = CBRA;
2001977Swnj 				*ep++ = numbra++;
2011977Swnj 				continue;
2021977Swnj 			}
2031977Swnj 			if (c == ')') {
2041977Swnj 				if (bracketp <= bracket)
2051977Swnj 					comerr("unmatched \\)");
2061977Swnj 				*ep++ = CKET;
2071977Swnj 				*ep++ = *--bracketp;
2081977Swnj 				continue;
2091977Swnj 			}
2101977Swnj 			if (c >= '1' && c < ('1' + NBRA)) {
2111977Swnj 				*ep++ = CBACK;
2121977Swnj 				*ep++ = c - '1';
2131977Swnj 				continue;
2141977Swnj 			}
2151977Swnj 			*ep++ = CCHR;
2161977Swnj 			*ep++ = c;
2171977Swnj 			continue;
2181977Swnj 
2191977Swnj 		defchar:
2201977Swnj 		default:
2211977Swnj 			*ep++ = CCHR;
2221977Swnj 			*ep++ = c;
2231977Swnj 		}
2241977Swnj 	}
2251977Swnj }
2261977Swnj 
2271977Swnj /*
2281977Swnj  * match the argument string against the compiled re
2291977Swnj  */
2301977Swnj int
2311977Swnj re_exec(p1)
232*46597Sdonn 	register const char	*p1;
2331977Swnj {
2341977Swnj 	register char	*p2 = expbuf;
2351977Swnj 	register int	c;
2361977Swnj 	int	rv;
2371977Swnj 
2381977Swnj 	for (c = 0; c < NBRA; c++) {
2391977Swnj 		braslist[c] = 0;
2401977Swnj 		braelist[c] = 0;
2411977Swnj 	}
2421977Swnj 	if (circf)
2431977Swnj 		return((advance(p1, p2)));
2441977Swnj 	/*
2451977Swnj 	 * fast check for first character
2461977Swnj 	 */
2471977Swnj 	if (*p2 == CCHR) {
2481977Swnj 		c = p2[1];
2491977Swnj 		do {
2501977Swnj 			if (*p1 != c)
2511977Swnj 				continue;
2521977Swnj 			if (rv = advance(p1, p2))
2531977Swnj 				return(rv);
2541977Swnj 		} while (*p1++);
2551977Swnj 		return(0);
2561977Swnj 	}
2571977Swnj 	/*
2581977Swnj 	 * regular algorithm
2591977Swnj 	 */
2601977Swnj 	do
2611977Swnj 		if (rv = advance(p1, p2))
2621977Swnj 			return(rv);
2631977Swnj 	while (*p1++);
2641977Swnj 	return(0);
2651977Swnj }
2661977Swnj 
2671977Swnj /*
2681977Swnj  * try to match the next thing in the dfa
2691977Swnj  */
2701977Swnj static	int
2711977Swnj advance(lp, ep)
2721977Swnj 	register char	*lp, *ep;
2731977Swnj {
2741977Swnj 	register char	*curlp;
2751977Swnj 	int	ct, i;
2761977Swnj 	int	rv;
2771977Swnj 
2781977Swnj 	for (;;)
2791977Swnj 		switch (*ep++) {
2801977Swnj 
2811977Swnj 		case CCHR:
2821977Swnj 			if (*ep++ == *lp++)
2831977Swnj 				continue;
2841977Swnj 			return(0);
2851977Swnj 
2861977Swnj 		case CDOT:
2871977Swnj 			if (*lp++)
2881977Swnj 				continue;
2891977Swnj 			return(0);
2901977Swnj 
2911977Swnj 		case CDOL:
2921977Swnj 			if (*lp == '\0')
2931977Swnj 				continue;
2941977Swnj 			return(0);
2951977Swnj 
2961977Swnj 		case CEOF:
2971977Swnj 			return(1);
2981977Swnj 
2991977Swnj 		case CCL:
3001977Swnj 			if (cclass(ep, *lp++, 1)) {
3011977Swnj 				ep += *ep;
3021977Swnj 				continue;
3031977Swnj 			}
3041977Swnj 			return(0);
3051977Swnj 
3061977Swnj 		case NCCL:
3071977Swnj 			if (cclass(ep, *lp++, 0)) {
3081977Swnj 				ep += *ep;
3091977Swnj 				continue;
3101977Swnj 			}
3111977Swnj 			return(0);
3121977Swnj 
3131977Swnj 		case CBRA:
3141977Swnj 			braslist[*ep++] = lp;
3151977Swnj 			continue;
3161977Swnj 
3171977Swnj 		case CKET:
3181977Swnj 			braelist[*ep++] = lp;
3191977Swnj 			continue;
3201977Swnj 
3211977Swnj 		case CBACK:
3221977Swnj 			if (braelist[i = *ep++] == 0)
3231977Swnj 				return(-1);
3241977Swnj 			if (backref(i, lp)) {
3251977Swnj 				lp += braelist[i] - braslist[i];
3261977Swnj 				continue;
3271977Swnj 			}
3281977Swnj 			return(0);
3291977Swnj 
3301977Swnj 		case CBACK|CSTAR:
3311977Swnj 			if (braelist[i = *ep++] == 0)
3321977Swnj 				return(-1);
3331977Swnj 			curlp = lp;
3341977Swnj 			ct = braelist[i] - braslist[i];
3351977Swnj 			while (backref(i, lp))
3361977Swnj 				lp += ct;
3371977Swnj 			while (lp >= curlp) {
3381977Swnj 				if (rv = advance(lp, ep))
3391977Swnj 					return(rv);
3401977Swnj 				lp -= ct;
3411977Swnj 			}
3421977Swnj 			continue;
3431977Swnj 
3441977Swnj 		case CDOT|CSTAR:
3451977Swnj 			curlp = lp;
3461977Swnj 			while (*lp++)
3471977Swnj 				;
3481977Swnj 			goto star;
3491977Swnj 
3501977Swnj 		case CCHR|CSTAR:
3511977Swnj 			curlp = lp;
3521977Swnj 			while (*lp++ == *ep)
3531977Swnj 				;
3541977Swnj 			ep++;
3551977Swnj 			goto star;
3561977Swnj 
3571977Swnj 		case CCL|CSTAR:
3581977Swnj 		case NCCL|CSTAR:
3591977Swnj 			curlp = lp;
3601977Swnj 			while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR)))
3611977Swnj 				;
3621977Swnj 			ep += *ep;
3631977Swnj 			goto star;
3641977Swnj 
3651977Swnj 		star:
3661977Swnj 			do {
3671977Swnj 				lp--;
3681977Swnj 				if (rv = advance(lp, ep))
3691977Swnj 					return(rv);
3701977Swnj 			} while (lp > curlp);
3711977Swnj 			return(0);
3721977Swnj 
3731977Swnj 		default:
3741977Swnj 			return(-1);
3751977Swnj 		}
3761977Swnj }
3771977Swnj 
37846488Sbostic static
3791977Swnj backref(i, lp)
3801977Swnj 	register int	i;
3811977Swnj 	register char	*lp;
3821977Swnj {
3831977Swnj 	register char	*bp;
3841977Swnj 
3851977Swnj 	bp = braslist[i];
3861977Swnj 	while (*bp++ == *lp++)
3871977Swnj 		if (bp >= braelist[i])
3881977Swnj 			return(1);
3891977Swnj 	return(0);
3901977Swnj }
3911977Swnj 
39246488Sbostic static int
3931977Swnj cclass(set, c, af)
3941977Swnj 	register char	*set, c;
3951977Swnj 	int	af;
3961977Swnj {
3971977Swnj 	register int	n;
3981977Swnj 
3991977Swnj 	if (c == 0)
4001977Swnj 		return(0);
4011977Swnj 	n = *set++;
4021977Swnj 	while (--n)
4031977Swnj 		if (*set++ == c)
4041977Swnj 			return(af);
4051977Swnj 	return(! af);
4061977Swnj }
407