xref: /csrg-svn/games/quiz/rxp.c (revision 51611)
1*51611Sbostic /*-
2*51611Sbostic  * Copyright (c) 1991 The Regents of the University of California.
3*51611Sbostic  * All rights reserved.
4*51611Sbostic  *
5*51611Sbostic  * This code is derived from software contributed to Berkeley by
6*51611Sbostic  * Jim R. Oldroyd at The Instruction Set.
7*51611Sbostic  *
8*51611Sbostic  * %sccs.include.redist.c%
9*51611Sbostic  */
10*51611Sbostic 
11*51611Sbostic #ifndef lint
12*51611Sbostic static char sccsid[] = "@(#)rxp.c	5.1 (Berkeley) 11/10/91";
13*51611Sbostic #endif /* not lint */
14*51611Sbostic 
15*51611Sbostic /*
16*51611Sbostic  * regular expression parser
17*51611Sbostic  *
18*51611Sbostic  * external functions and return values are:
19*51611Sbostic  * rxp_compile(s)
20*51611Sbostic  *	TRUE	success
21*51611Sbostic  *	FALSE	parse failure; error message will be in char rxperr[]
22*51611Sbostic  * metas are:
23*51611Sbostic  *	{...}	optional pattern, equialent to [...|]
24*51611Sbostic  *	|	alternate pattern
25*51611Sbostic  *	[...]	pattern delimiters
26*51611Sbostic  *
27*51611Sbostic  * rxp_match(s)
28*51611Sbostic  *	TRUE	string s matches compiled pattern
29*51611Sbostic  *	FALSE	match failure or regexp error
30*51611Sbostic  *
31*51611Sbostic  * rxp_expand()
32*51611Sbostic  *	char *	reverse-engineered regular expression string
33*51611Sbostic  *	NULL	regexp error
34*51611Sbostic  */
35*51611Sbostic 
36*51611Sbostic #include <stdio.h>
37*51611Sbostic #include <ctype.h>
38*51611Sbostic #include "quiz.h"
39*51611Sbostic 					/* regexp tokens,	arg */
40*51611Sbostic #define	LIT	(-1)			/* literal character,	char */
41*51611Sbostic #define	SOT	(-2)			/* start text anchor,	- */
42*51611Sbostic #define	EOT	(-3)			/* end text anchor,	- */
43*51611Sbostic #define	GRP_S	(-4)			/* start alternate grp,	ptr_to_end */
44*51611Sbostic #define	GRP_E	(-5)			/* end group,		- */
45*51611Sbostic #define	ALT_S	(-6)			/* alternate starts,	ptr_to_next */
46*51611Sbostic #define	ALT_E	(-7)			/* alternate ends,	- */
47*51611Sbostic #define	END	(-8)			/* end of regexp,	- */
48*51611Sbostic 
49*51611Sbostic typedef short Rxp_t;			/* type for regexp tokens */
50*51611Sbostic 
51*51611Sbostic static Rxp_t rxpbuf[RXP_LINE_SZ];	/* compiled regular expression buffer */
52*51611Sbostic char rxperr[128];			/* parser error message */
53*51611Sbostic 
54*51611Sbostic int	 rxp__compile __P((char *, int));
55*51611Sbostic char	*rxp__expand __P((int));
56*51611Sbostic int	 rxp__match __P((char *, int, Rxp_t *, Rxp_t *, char *));
57*51611Sbostic 
58*51611Sbostic int
59*51611Sbostic rxp_compile(s)
60*51611Sbostic 	register char *	s;
61*51611Sbostic {
62*51611Sbostic 	return (rxp__compile(s, TRUE));
63*51611Sbostic }
64*51611Sbostic 
65*51611Sbostic static int
66*51611Sbostic rxp__compile(s, first)
67*51611Sbostic 	register char *s;
68*51611Sbostic 	int first;
69*51611Sbostic {
70*51611Sbostic 	static Rxp_t *rp;
71*51611Sbostic 	static char *sp;
72*51611Sbostic 	Rxp_t *grp_ptr;
73*51611Sbostic 	Rxp_t *alt_ptr;
74*51611Sbostic 	int esc, err;
75*51611Sbostic 
76*51611Sbostic 	esc = 0;
77*51611Sbostic 	if (first) {
78*51611Sbostic 		rp = rxpbuf;
79*51611Sbostic 		sp = s;
80*51611Sbostic 		*rp++ = SOT;	/* auto-anchor: pat is really ^pat$ */
81*51611Sbostic 		*rp++ = GRP_S;	/* auto-group: ^pat$ is really ^[pat]$ */
82*51611Sbostic 		*rp++ = 0;
83*51611Sbostic 	}
84*51611Sbostic 	*rp++ = ALT_S;
85*51611Sbostic 	alt_ptr = rp;
86*51611Sbostic 	*rp++ = 0;
87*51611Sbostic 	for (; *sp; ++sp) {
88*51611Sbostic 		if (rp - rxpbuf >= RXP_LINE_SZ - 4) {
89*51611Sbostic 			(void)snprintf(rxperr, sizeof(rxperr),
90*51611Sbostic 			    "regular expression too long %s", s);
91*51611Sbostic 			return (FALSE);
92*51611Sbostic 		}
93*51611Sbostic 		if (*sp == ':' && !esc)
94*51611Sbostic 			break;
95*51611Sbostic 		if (esc) {
96*51611Sbostic 			*rp++ = LIT;
97*51611Sbostic 			*rp++ = *sp;
98*51611Sbostic 			esc = 0;
99*51611Sbostic 		}
100*51611Sbostic 		else switch (*sp) {
101*51611Sbostic 		case '\\':
102*51611Sbostic 			esc = 1;
103*51611Sbostic 			break;
104*51611Sbostic 		case '{':
105*51611Sbostic 		case '[':
106*51611Sbostic 			*rp++ = GRP_S;
107*51611Sbostic 			grp_ptr = rp;
108*51611Sbostic 			*rp++ = 0;
109*51611Sbostic 			sp++;
110*51611Sbostic 			if ((err = rxp__compile(s, FALSE)) != TRUE)
111*51611Sbostic 				return (err);
112*51611Sbostic 			*rp++ = GRP_E;
113*51611Sbostic 			*grp_ptr = rp - rxpbuf;
114*51611Sbostic 			break;
115*51611Sbostic 		case '}':
116*51611Sbostic 		case ']':
117*51611Sbostic 		case '|':
118*51611Sbostic 			*rp++ = ALT_E;
119*51611Sbostic 			*alt_ptr = rp - rxpbuf;
120*51611Sbostic 			if (*sp != ']') {
121*51611Sbostic 				*rp++ = ALT_S;
122*51611Sbostic 				alt_ptr = rp;
123*51611Sbostic 				*rp++ = 0;
124*51611Sbostic 			}
125*51611Sbostic 			if (*sp != '|') {
126*51611Sbostic 				if (*sp != ']') {
127*51611Sbostic 					*rp++ = ALT_E;
128*51611Sbostic 					*alt_ptr = rp - rxpbuf;
129*51611Sbostic 				}
130*51611Sbostic 				if (first) {
131*51611Sbostic 					(void)snprintf(rxperr, sizeof(rxperr),
132*51611Sbostic 					    "unmatched alternator in regexp %s",
133*51611Sbostic 					     s);
134*51611Sbostic 					return (FALSE);
135*51611Sbostic 				}
136*51611Sbostic 				return (TRUE);
137*51611Sbostic 			}
138*51611Sbostic 			break;
139*51611Sbostic 		default:
140*51611Sbostic 			*rp++ = LIT;
141*51611Sbostic 			*rp++ = *sp;
142*51611Sbostic 			esc = 0;
143*51611Sbostic 			break;
144*51611Sbostic 		}
145*51611Sbostic 	}
146*51611Sbostic 	if (!first) {
147*51611Sbostic 		(void)snprintf(rxperr, sizeof(rxperr),
148*51611Sbostic 		    "unmatched alternator in regexp %s", s);
149*51611Sbostic 		return (FALSE);
150*51611Sbostic 	}
151*51611Sbostic 	*rp++ = ALT_E;
152*51611Sbostic 	*alt_ptr = rp - rxpbuf;
153*51611Sbostic 	*rp++ = GRP_E;
154*51611Sbostic 	*(rxpbuf + 2) = rp - rxpbuf;
155*51611Sbostic 	*rp++ = EOT;
156*51611Sbostic 	*rp = END;
157*51611Sbostic 	return (TRUE);
158*51611Sbostic }
159*51611Sbostic 
160*51611Sbostic /*
161*51611Sbostic  * match string against compiled regular expression
162*51611Sbostic  */
163*51611Sbostic int
164*51611Sbostic rxp_match(s)
165*51611Sbostic 	register char *	s;
166*51611Sbostic {
167*51611Sbostic 	return (rxp__match(s, TRUE, NULL, NULL, NULL));
168*51611Sbostic }
169*51611Sbostic 
170*51611Sbostic static int
171*51611Sbostic rxp__match(s, first, j_succ, j_fail, sp_fail)
172*51611Sbostic 	char *s;
173*51611Sbostic 	int first;
174*51611Sbostic 	Rxp_t *j_succ;		/* jump here on successful alt match */
175*51611Sbostic 	Rxp_t *j_fail;		/* jump here on failed match */
176*51611Sbostic 	char *sp_fail;		/* reset sp to here on failed match */
177*51611Sbostic {
178*51611Sbostic 	static Rxp_t *rp;
179*51611Sbostic 	static char *sp;
180*51611Sbostic 	register int ch;
181*51611Sbostic 	Rxp_t *grp_end;
182*51611Sbostic 	int err;
183*51611Sbostic 
184*51611Sbostic 	if (first) {
185*51611Sbostic 		rp = rxpbuf;
186*51611Sbostic 		sp = s;
187*51611Sbostic 	}
188*51611Sbostic 	while (rp < rxpbuf + RXP_LINE_SZ && *rp != END)
189*51611Sbostic 		switch(*rp) {
190*51611Sbostic 		case LIT:
191*51611Sbostic 			rp++;
192*51611Sbostic 			ch = isascii(*rp) && isupper(*rp) ? tolower(*rp) : *rp;
193*51611Sbostic 			if (ch != *sp++) {
194*51611Sbostic 				rp = j_fail;
195*51611Sbostic 				sp = sp_fail;
196*51611Sbostic 				return (TRUE);
197*51611Sbostic 			}
198*51611Sbostic 			rp++;
199*51611Sbostic 			break;
200*51611Sbostic 		case SOT:
201*51611Sbostic 			if (sp != s)
202*51611Sbostic 				return (FALSE);
203*51611Sbostic 			rp++;
204*51611Sbostic 			break;
205*51611Sbostic 		case EOT:
206*51611Sbostic 			if (*sp != 0)
207*51611Sbostic 				return (FALSE);
208*51611Sbostic 			rp++;
209*51611Sbostic 			break;
210*51611Sbostic 		case GRP_S:
211*51611Sbostic 			rp++;
212*51611Sbostic 			grp_end = rxpbuf + *rp++;
213*51611Sbostic 			break;
214*51611Sbostic 		case ALT_S:
215*51611Sbostic 			rp++;
216*51611Sbostic 			if ((err = rxp__match(sp,
217*51611Sbostic 			    FALSE, grp_end, rxpbuf + *rp++, sp)) != TRUE)
218*51611Sbostic 				return (err);
219*51611Sbostic 			break;
220*51611Sbostic 		case ALT_E:
221*51611Sbostic 			rp = j_succ;
222*51611Sbostic 			return (TRUE);
223*51611Sbostic 		case GRP_E:
224*51611Sbostic 		default:
225*51611Sbostic 			return (FALSE);
226*51611Sbostic 		}
227*51611Sbostic 	return (*rp != END ? FALSE : TRUE);
228*51611Sbostic }
229*51611Sbostic 
230*51611Sbostic /*
231*51611Sbostic  * Reverse engineer the regular expression, by picking first of all alternates.
232*51611Sbostic  */
233*51611Sbostic char *
234*51611Sbostic rxp_expand()
235*51611Sbostic {
236*51611Sbostic 	return (rxp__expand(TRUE));
237*51611Sbostic }
238*51611Sbostic 
239*51611Sbostic static char *
240*51611Sbostic rxp__expand(first)
241*51611Sbostic 	int first;
242*51611Sbostic {
243*51611Sbostic 	static char buf[RXP_LINE_SZ/2];
244*51611Sbostic 	static Rxp_t *rp;
245*51611Sbostic 	static char *bp;
246*51611Sbostic 	Rxp_t *grp_ptr;
247*51611Sbostic 	char *err;
248*51611Sbostic 
249*51611Sbostic 	if (first) {
250*51611Sbostic 		rp = rxpbuf;
251*51611Sbostic 		bp = buf;
252*51611Sbostic 	}
253*51611Sbostic 	while (rp < rxpbuf + RXP_LINE_SZ && *rp != END)
254*51611Sbostic 		switch(*rp) {
255*51611Sbostic 		case LIT:
256*51611Sbostic 			rp++;
257*51611Sbostic 			*bp++ = *rp++;
258*51611Sbostic 			break;
259*51611Sbostic 		case GRP_S:
260*51611Sbostic 			rp++;
261*51611Sbostic 			grp_ptr = rxpbuf + *rp;
262*51611Sbostic 			rp++;
263*51611Sbostic 			if ((err = rxp__expand(FALSE)) == NULL)
264*51611Sbostic 				return (err);
265*51611Sbostic 			rp = grp_ptr;
266*51611Sbostic 			break;
267*51611Sbostic 		case ALT_E:
268*51611Sbostic 			return (buf);
269*51611Sbostic 		case ALT_S:
270*51611Sbostic 			rp++;
271*51611Sbostic 			/* FALLTHROUGH */
272*51611Sbostic 		case SOT:
273*51611Sbostic 		case EOT:
274*51611Sbostic 		case GRP_E:
275*51611Sbostic 			rp++;
276*51611Sbostic 			break;
277*51611Sbostic 		default:
278*51611Sbostic 			return (NULL);
279*51611Sbostic 		}
280*51611Sbostic 	if (first) {
281*51611Sbostic 		if (*rp != END)
282*51611Sbostic 			return (NULL);
283*51611Sbostic 		*bp = '\0';
284*51611Sbostic 	}
285*51611Sbostic 	return (buf);
286*51611Sbostic }
287