xref: /csrg-svn/lib/libcompat/4.3/regex.c (revision 1977)
1*1977Swnj /* @(#)regex.c	4.1 (Berkeley) 12/21/80 */
2*1977Swnj #
3*1977Swnj 
4*1977Swnj /*
5*1977Swnj  * routines to do regular expression matching
6*1977Swnj  *
7*1977Swnj  * Entry points:
8*1977Swnj  *
9*1977Swnj  *	re_comp(s)
10*1977Swnj  *		char *s;
11*1977Swnj  *	 ... returns 0 if the string s was compiled successfully,
12*1977Swnj  *		     a pointer to an error message otherwise.
13*1977Swnj  *	     If passed 0 or a null string returns without changing
14*1977Swnj  *           the currently compiled re (see note 11 below).
15*1977Swnj  *
16*1977Swnj  *	re_exec(s)
17*1977Swnj  *		char *s;
18*1977Swnj  *	 ... returns 1 if the string s matches the last compiled regular
19*1977Swnj  *		       expression,
20*1977Swnj  *		     0 if the string s failed to match the last compiled
21*1977Swnj  *		       regular expression, and
22*1977Swnj  *		    -1 if the compiled regular expression was invalid
23*1977Swnj  *		       (indicating an internal error).
24*1977Swnj  *
25*1977Swnj  * The strings passed to both re_comp and re_exec may have trailing or
26*1977Swnj  * embedded newline characters; they are terminated by nulls.
27*1977Swnj  *
28*1977Swnj  * The identity of the author of these routines is lost in antiquity;
29*1977Swnj  * this is essentially the same as the re code in the original V6 ed.
30*1977Swnj  *
31*1977Swnj  * The regular expressions recognized are described below. This description
32*1977Swnj  * is essentially the same as that for ed.
33*1977Swnj  *
34*1977Swnj  *	A regular expression specifies a set of strings of characters.
35*1977Swnj  *	A member of this set of strings is said to be matched by
36*1977Swnj  *	the regular expression.  In the following specification for
37*1977Swnj  *	regular expressions the word `character' means any character but NUL.
38*1977Swnj  *
39*1977Swnj  *	1.  Any character except a special character matches itself.
40*1977Swnj  *	    Special characters are the regular expression delimiter plus
41*1977Swnj  *	    \ [ . and sometimes ^ * $.
42*1977Swnj  *	2.  A . matches any character.
43*1977Swnj  *	3.  A \ followed by any character except a digit or ( )
44*1977Swnj  *	    matches that character.
45*1977Swnj  *	4.  A nonempty string s bracketed [s] (or [^s]) matches any
46*1977Swnj  *	    character in (or not in) s. In s, \ has no special meaning,
47*1977Swnj  *	    and ] may only appear as the first letter. A substring
48*1977Swnj  *	    a-b, with a and b in ascending ASCII order, stands for
49*1977Swnj  *	    the inclusive range of ASCII characters.
50*1977Swnj  *	5.  A regular expression of form 1-4 followed by * matches a
51*1977Swnj  *	    sequence of 0 or more matches of the regular expression.
52*1977Swnj  *	6.  A regular expression, x, of form 1-8, bracketed \(x\)
53*1977Swnj  *	    matches what x matches.
54*1977Swnj  *	7.  A \ followed by a digit n matches a copy of the string that the
55*1977Swnj  *	    bracketed regular expression beginning with the nth \( matched.
56*1977Swnj  *	8.  A regular expression of form 1-8, x, followed by a regular
57*1977Swnj  *	    expression of form 1-7, y matches a match for x followed by
58*1977Swnj  *	    a match for y, with the x match being as long as possible
59*1977Swnj  *	    while still permitting a y match.
60*1977Swnj  *	9.  A regular expression of form 1-8 preceded by ^ (or followed
61*1977Swnj  *	    by $), is constrained to matches that begin at the left
62*1977Swnj  *	    (or end at the right) end of a line.
63*1977Swnj  *	10. A regular expression of form 1-9 picks out the longest among
64*1977Swnj  *	    the leftmost matches in a line.
65*1977Swnj  *	11. An empty regular expression stands for a copy of the last
66*1977Swnj  *	    regular expression encountered.
67*1977Swnj  */
68*1977Swnj 
69*1977Swnj /*
70*1977Swnj  * constants for re's
71*1977Swnj  */
72*1977Swnj #define	CBRA	1
73*1977Swnj #define	CCHR	2
74*1977Swnj #define	CDOT	4
75*1977Swnj #define	CCL	6
76*1977Swnj #define	NCCL	8
77*1977Swnj #define	CDOL	10
78*1977Swnj #define	CEOF	11
79*1977Swnj #define	CKET	12
80*1977Swnj #define	CBACK	18
81*1977Swnj 
82*1977Swnj #define	CSTAR	01
83*1977Swnj 
84*1977Swnj #define	ESIZE	512
85*1977Swnj #define	NBRA	9
86*1977Swnj 
87*1977Swnj static	char	expbuf[ESIZE], *braslist[NBRA], *braelist[NBRA];
88*1977Swnj static	char	circf;
89*1977Swnj 
90*1977Swnj /*
91*1977Swnj  * compile the regular expression argument into a dfa
92*1977Swnj  */
93*1977Swnj char *
94*1977Swnj re_comp(sp)
95*1977Swnj 	register char	*sp;
96*1977Swnj {
97*1977Swnj 	register int	c;
98*1977Swnj 	register char	*ep = expbuf;
99*1977Swnj 	int	cclcnt, numbra = 0;
100*1977Swnj 	char	*lastep = 0;
101*1977Swnj 	char	bracket[NBRA];
102*1977Swnj 	char	*bracketp = &bracket[0];
103*1977Swnj 	static	char	*retoolong = "Regular expression too long";
104*1977Swnj 
105*1977Swnj #define	comerr(msg) {expbuf[0] = 0; numbra = 0; return(msg); }
106*1977Swnj 
107*1977Swnj 	if (sp == 0 || *sp == '\0') {
108*1977Swnj 		if (*ep == 0)
109*1977Swnj 			return("No previous regular expression");
110*1977Swnj 		return(0);
111*1977Swnj 	}
112*1977Swnj 	if (*sp == '^') {
113*1977Swnj 		circf = 1;
114*1977Swnj 		sp++;
115*1977Swnj 	}
116*1977Swnj 	else
117*1977Swnj 		circf = 0;
118*1977Swnj 	for (;;) {
119*1977Swnj 		if (ep >= &expbuf[ESIZE])
120*1977Swnj 			comerr(retoolong);
121*1977Swnj 		if ((c = *sp++) == '\0') {
122*1977Swnj 			if (bracketp != bracket)
123*1977Swnj 				comerr("unmatched \\(");
124*1977Swnj 			*ep++ = CEOF;
125*1977Swnj 			*ep++ = 0;
126*1977Swnj 			return(0);
127*1977Swnj 		}
128*1977Swnj 		if (c != '*')
129*1977Swnj 			lastep = ep;
130*1977Swnj 		switch (c) {
131*1977Swnj 
132*1977Swnj 		case '.':
133*1977Swnj 			*ep++ = CDOT;
134*1977Swnj 			continue;
135*1977Swnj 
136*1977Swnj 		case '*':
137*1977Swnj 			if (lastep == 0 || *lastep == CBRA || *lastep == CKET)
138*1977Swnj 				goto defchar;
139*1977Swnj 			*lastep |= CSTAR;
140*1977Swnj 			continue;
141*1977Swnj 
142*1977Swnj 		case '$':
143*1977Swnj 			if (*sp != '\0')
144*1977Swnj 				goto defchar;
145*1977Swnj 			*ep++ = CDOL;
146*1977Swnj 			continue;
147*1977Swnj 
148*1977Swnj 		case '[':
149*1977Swnj 			*ep++ = CCL;
150*1977Swnj 			*ep++ = 0;
151*1977Swnj 			cclcnt = 1;
152*1977Swnj 			if ((c = *sp++) == '^') {
153*1977Swnj 				c = *sp++;
154*1977Swnj 				ep[-2] = NCCL;
155*1977Swnj 			}
156*1977Swnj 			do {
157*1977Swnj 				if (c == '\0')
158*1977Swnj 					comerr("missing ]");
159*1977Swnj 				if (c == '-' && ep [-1] != 0) {
160*1977Swnj 					if ((c = *sp++) == ']') {
161*1977Swnj 						*ep++ = '-';
162*1977Swnj 						cclcnt++;
163*1977Swnj 						break;
164*1977Swnj 					}
165*1977Swnj 					while (ep[-1] < c) {
166*1977Swnj 						*ep = ep[-1] + 1;
167*1977Swnj 						ep++;
168*1977Swnj 						cclcnt++;
169*1977Swnj 						if (ep >= &expbuf[ESIZE])
170*1977Swnj 							comerr(retoolong);
171*1977Swnj 					}
172*1977Swnj 				}
173*1977Swnj 				*ep++ = c;
174*1977Swnj 				cclcnt++;
175*1977Swnj 				if (ep >= &expbuf[ESIZE])
176*1977Swnj 					comerr(retoolong);
177*1977Swnj 			} while ((c = *sp++) != ']');
178*1977Swnj 			lastep[1] = cclcnt;
179*1977Swnj 			continue;
180*1977Swnj 
181*1977Swnj 		case '\\':
182*1977Swnj 			if ((c = *sp++) == '(') {
183*1977Swnj 				if (numbra >= NBRA)
184*1977Swnj 					comerr("too many \\(\\) pairs");
185*1977Swnj 				*bracketp++ = numbra;
186*1977Swnj 				*ep++ = CBRA;
187*1977Swnj 				*ep++ = numbra++;
188*1977Swnj 				continue;
189*1977Swnj 			}
190*1977Swnj 			if (c == ')') {
191*1977Swnj 				if (bracketp <= bracket)
192*1977Swnj 					comerr("unmatched \\)");
193*1977Swnj 				*ep++ = CKET;
194*1977Swnj 				*ep++ = *--bracketp;
195*1977Swnj 				continue;
196*1977Swnj 			}
197*1977Swnj 			if (c >= '1' && c < ('1' + NBRA)) {
198*1977Swnj 				*ep++ = CBACK;
199*1977Swnj 				*ep++ = c - '1';
200*1977Swnj 				continue;
201*1977Swnj 			}
202*1977Swnj 			*ep++ = CCHR;
203*1977Swnj 			*ep++ = c;
204*1977Swnj 			continue;
205*1977Swnj 
206*1977Swnj 		defchar:
207*1977Swnj 		default:
208*1977Swnj 			*ep++ = CCHR;
209*1977Swnj 			*ep++ = c;
210*1977Swnj 		}
211*1977Swnj 	}
212*1977Swnj }
213*1977Swnj 
214*1977Swnj /*
215*1977Swnj  * match the argument string against the compiled re
216*1977Swnj  */
217*1977Swnj int
218*1977Swnj re_exec(p1)
219*1977Swnj 	register char	*p1;
220*1977Swnj {
221*1977Swnj 	register char	*p2 = expbuf;
222*1977Swnj 	register int	c;
223*1977Swnj 	int	rv;
224*1977Swnj 
225*1977Swnj 	for (c = 0; c < NBRA; c++) {
226*1977Swnj 		braslist[c] = 0;
227*1977Swnj 		braelist[c] = 0;
228*1977Swnj 	}
229*1977Swnj 	if (circf)
230*1977Swnj 		return((advance(p1, p2)));
231*1977Swnj 	/*
232*1977Swnj 	 * fast check for first character
233*1977Swnj 	 */
234*1977Swnj 	if (*p2 == CCHR) {
235*1977Swnj 		c = p2[1];
236*1977Swnj 		do {
237*1977Swnj 			if (*p1 != c)
238*1977Swnj 				continue;
239*1977Swnj 			if (rv = advance(p1, p2))
240*1977Swnj 				return(rv);
241*1977Swnj 		} while (*p1++);
242*1977Swnj 		return(0);
243*1977Swnj 	}
244*1977Swnj 	/*
245*1977Swnj 	 * regular algorithm
246*1977Swnj 	 */
247*1977Swnj 	do
248*1977Swnj 		if (rv = advance(p1, p2))
249*1977Swnj 			return(rv);
250*1977Swnj 	while (*p1++);
251*1977Swnj 	return(0);
252*1977Swnj }
253*1977Swnj 
254*1977Swnj /*
255*1977Swnj  * try to match the next thing in the dfa
256*1977Swnj  */
257*1977Swnj static	int
258*1977Swnj advance(lp, ep)
259*1977Swnj 	register char	*lp, *ep;
260*1977Swnj {
261*1977Swnj 	register char	*curlp;
262*1977Swnj 	int	ct, i;
263*1977Swnj 	int	rv;
264*1977Swnj 
265*1977Swnj 	for (;;)
266*1977Swnj 		switch (*ep++) {
267*1977Swnj 
268*1977Swnj 		case CCHR:
269*1977Swnj 			if (*ep++ == *lp++)
270*1977Swnj 				continue;
271*1977Swnj 			return(0);
272*1977Swnj 
273*1977Swnj 		case CDOT:
274*1977Swnj 			if (*lp++)
275*1977Swnj 				continue;
276*1977Swnj 			return(0);
277*1977Swnj 
278*1977Swnj 		case CDOL:
279*1977Swnj 			if (*lp == '\0')
280*1977Swnj 				continue;
281*1977Swnj 			return(0);
282*1977Swnj 
283*1977Swnj 		case CEOF:
284*1977Swnj 			return(1);
285*1977Swnj 
286*1977Swnj 		case CCL:
287*1977Swnj 			if (cclass(ep, *lp++, 1)) {
288*1977Swnj 				ep += *ep;
289*1977Swnj 				continue;
290*1977Swnj 			}
291*1977Swnj 			return(0);
292*1977Swnj 
293*1977Swnj 		case NCCL:
294*1977Swnj 			if (cclass(ep, *lp++, 0)) {
295*1977Swnj 				ep += *ep;
296*1977Swnj 				continue;
297*1977Swnj 			}
298*1977Swnj 			return(0);
299*1977Swnj 
300*1977Swnj 		case CBRA:
301*1977Swnj 			braslist[*ep++] = lp;
302*1977Swnj 			continue;
303*1977Swnj 
304*1977Swnj 		case CKET:
305*1977Swnj 			braelist[*ep++] = lp;
306*1977Swnj 			continue;
307*1977Swnj 
308*1977Swnj 		case CBACK:
309*1977Swnj 			if (braelist[i = *ep++] == 0)
310*1977Swnj 				return(-1);
311*1977Swnj 			if (backref(i, lp)) {
312*1977Swnj 				lp += braelist[i] - braslist[i];
313*1977Swnj 				continue;
314*1977Swnj 			}
315*1977Swnj 			return(0);
316*1977Swnj 
317*1977Swnj 		case CBACK|CSTAR:
318*1977Swnj 			if (braelist[i = *ep++] == 0)
319*1977Swnj 				return(-1);
320*1977Swnj 			curlp = lp;
321*1977Swnj 			ct = braelist[i] - braslist[i];
322*1977Swnj 			while (backref(i, lp))
323*1977Swnj 				lp += ct;
324*1977Swnj 			while (lp >= curlp) {
325*1977Swnj 				if (rv = advance(lp, ep))
326*1977Swnj 					return(rv);
327*1977Swnj 				lp -= ct;
328*1977Swnj 			}
329*1977Swnj 			continue;
330*1977Swnj 
331*1977Swnj 		case CDOT|CSTAR:
332*1977Swnj 			curlp = lp;
333*1977Swnj 			while (*lp++)
334*1977Swnj 				;
335*1977Swnj 			goto star;
336*1977Swnj 
337*1977Swnj 		case CCHR|CSTAR:
338*1977Swnj 			curlp = lp;
339*1977Swnj 			while (*lp++ == *ep)
340*1977Swnj 				;
341*1977Swnj 			ep++;
342*1977Swnj 			goto star;
343*1977Swnj 
344*1977Swnj 		case CCL|CSTAR:
345*1977Swnj 		case NCCL|CSTAR:
346*1977Swnj 			curlp = lp;
347*1977Swnj 			while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR)))
348*1977Swnj 				;
349*1977Swnj 			ep += *ep;
350*1977Swnj 			goto star;
351*1977Swnj 
352*1977Swnj 		star:
353*1977Swnj 			do {
354*1977Swnj 				lp--;
355*1977Swnj 				if (rv = advance(lp, ep))
356*1977Swnj 					return(rv);
357*1977Swnj 			} while (lp > curlp);
358*1977Swnj 			return(0);
359*1977Swnj 
360*1977Swnj 		default:
361*1977Swnj 			return(-1);
362*1977Swnj 		}
363*1977Swnj }
364*1977Swnj 
365*1977Swnj backref(i, lp)
366*1977Swnj 	register int	i;
367*1977Swnj 	register char	*lp;
368*1977Swnj {
369*1977Swnj 	register char	*bp;
370*1977Swnj 
371*1977Swnj 	bp = braslist[i];
372*1977Swnj 	while (*bp++ == *lp++)
373*1977Swnj 		if (bp >= braelist[i])
374*1977Swnj 			return(1);
375*1977Swnj 	return(0);
376*1977Swnj }
377*1977Swnj 
378*1977Swnj int
379*1977Swnj cclass(set, c, af)
380*1977Swnj 	register char	*set, c;
381*1977Swnj 	int	af;
382*1977Swnj {
383*1977Swnj 	register int	n;
384*1977Swnj 
385*1977Swnj 	if (c == 0)
386*1977Swnj 		return(0);
387*1977Swnj 	n = *set++;
388*1977Swnj 	while (--n)
389*1977Swnj 		if (*set++ == c)
390*1977Swnj 			return(af);
391*1977Swnj 	return(! af);
392*1977Swnj }
393