xref: /csrg-svn/games/quiz/rxp.c (revision 60836)
151611Sbostic /*-
2*60836Sbostic  * Copyright (c) 1991, 1993
3*60836Sbostic  *	The Regents of the University of California.  All rights reserved.
451611Sbostic  *
551611Sbostic  * This code is derived from software contributed to Berkeley by
652145Sbostic  * Jim R. Oldroyd at The Instruction Set and Keith Gabryelski at
752145Sbostic  * Commodore Business Machines.
851611Sbostic  *
951611Sbostic  * %sccs.include.redist.c%
1051611Sbostic  */
1151611Sbostic 
1251611Sbostic #ifndef lint
13*60836Sbostic static char sccsid[] = "@(#)rxp.c	8.1 (Berkeley) 05/31/93";
1451611Sbostic #endif /* not lint */
1551611Sbostic 
1651611Sbostic /*
1751611Sbostic  * regular expression parser
1851611Sbostic  *
1951611Sbostic  * external functions and return values are:
2051611Sbostic  * rxp_compile(s)
2151611Sbostic  *	TRUE	success
2251611Sbostic  *	FALSE	parse failure; error message will be in char rxperr[]
2351611Sbostic  * metas are:
2451611Sbostic  *	{...}	optional pattern, equialent to [...|]
2551611Sbostic  *	|	alternate pattern
2651611Sbostic  *	[...]	pattern delimiters
2751611Sbostic  *
2851611Sbostic  * rxp_match(s)
2951611Sbostic  *	TRUE	string s matches compiled pattern
3051611Sbostic  *	FALSE	match failure or regexp error
3151611Sbostic  *
3251611Sbostic  * rxp_expand()
3351611Sbostic  *	char *	reverse-engineered regular expression string
3451611Sbostic  *	NULL	regexp error
3551611Sbostic  */
3651611Sbostic 
3751611Sbostic #include <stdio.h>
3851611Sbostic #include <ctype.h>
3951611Sbostic #include "quiz.h"
4051611Sbostic 					/* regexp tokens,	arg */
4151611Sbostic #define	LIT	(-1)			/* literal character,	char */
4251611Sbostic #define	SOT	(-2)			/* start text anchor,	- */
4351611Sbostic #define	EOT	(-3)			/* end text anchor,	- */
4451611Sbostic #define	GRP_S	(-4)			/* start alternate grp,	ptr_to_end */
4551611Sbostic #define	GRP_E	(-5)			/* end group,		- */
4651611Sbostic #define	ALT_S	(-6)			/* alternate starts,	ptr_to_next */
4751611Sbostic #define	ALT_E	(-7)			/* alternate ends,	- */
4851611Sbostic #define	END	(-8)			/* end of regexp,	- */
4951611Sbostic 
5051611Sbostic typedef short Rxp_t;			/* type for regexp tokens */
5151611Sbostic 
5251611Sbostic static Rxp_t rxpbuf[RXP_LINE_SZ];	/* compiled regular expression buffer */
5351611Sbostic char rxperr[128];			/* parser error message */
5451611Sbostic 
5557829Storek static int	 rxp__compile __P((char *, int));
5657829Storek static char	*rxp__expand __P((int));
5757829Storek static int	 rxp__match __P((char *, int, Rxp_t *, Rxp_t *, char *));
5851611Sbostic 
5951611Sbostic int
rxp_compile(s)6051611Sbostic rxp_compile(s)
6151611Sbostic 	register char *	s;
6251611Sbostic {
6351611Sbostic 	return (rxp__compile(s, TRUE));
6451611Sbostic }
6551611Sbostic 
6651611Sbostic static int
rxp__compile(s,first)6751611Sbostic rxp__compile(s, first)
6851611Sbostic 	register char *s;
6951611Sbostic 	int first;
7051611Sbostic {
7151611Sbostic 	static Rxp_t *rp;
7251611Sbostic 	static char *sp;
7351611Sbostic 	Rxp_t *grp_ptr;
7451611Sbostic 	Rxp_t *alt_ptr;
7551611Sbostic 	int esc, err;
7651611Sbostic 
7751611Sbostic 	esc = 0;
7851611Sbostic 	if (first) {
7951611Sbostic 		rp = rxpbuf;
8051611Sbostic 		sp = s;
8151611Sbostic 		*rp++ = SOT;	/* auto-anchor: pat is really ^pat$ */
8251611Sbostic 		*rp++ = GRP_S;	/* auto-group: ^pat$ is really ^[pat]$ */
8351611Sbostic 		*rp++ = 0;
8451611Sbostic 	}
8551611Sbostic 	*rp++ = ALT_S;
8651611Sbostic 	alt_ptr = rp;
8751611Sbostic 	*rp++ = 0;
8851611Sbostic 	for (; *sp; ++sp) {
8951611Sbostic 		if (rp - rxpbuf >= RXP_LINE_SZ - 4) {
9051611Sbostic 			(void)snprintf(rxperr, sizeof(rxperr),
9151611Sbostic 			    "regular expression too long %s", s);
9251611Sbostic 			return (FALSE);
9351611Sbostic 		}
9451611Sbostic 		if (*sp == ':' && !esc)
9551611Sbostic 			break;
9651611Sbostic 		if (esc) {
9751611Sbostic 			*rp++ = LIT;
9851611Sbostic 			*rp++ = *sp;
9951611Sbostic 			esc = 0;
10051611Sbostic 		}
10151611Sbostic 		else switch (*sp) {
10251611Sbostic 		case '\\':
10351611Sbostic 			esc = 1;
10451611Sbostic 			break;
10551611Sbostic 		case '{':
10651611Sbostic 		case '[':
10751611Sbostic 			*rp++ = GRP_S;
10851611Sbostic 			grp_ptr = rp;
10951611Sbostic 			*rp++ = 0;
11051611Sbostic 			sp++;
11151611Sbostic 			if ((err = rxp__compile(s, FALSE)) != TRUE)
11251611Sbostic 				return (err);
11351611Sbostic 			*rp++ = GRP_E;
11451611Sbostic 			*grp_ptr = rp - rxpbuf;
11551611Sbostic 			break;
11651611Sbostic 		case '}':
11751611Sbostic 		case ']':
11851611Sbostic 		case '|':
11951611Sbostic 			*rp++ = ALT_E;
12051611Sbostic 			*alt_ptr = rp - rxpbuf;
12151611Sbostic 			if (*sp != ']') {
12251611Sbostic 				*rp++ = ALT_S;
12351611Sbostic 				alt_ptr = rp;
12451611Sbostic 				*rp++ = 0;
12551611Sbostic 			}
12651611Sbostic 			if (*sp != '|') {
12751611Sbostic 				if (*sp != ']') {
12851611Sbostic 					*rp++ = ALT_E;
12951611Sbostic 					*alt_ptr = rp - rxpbuf;
13051611Sbostic 				}
13151611Sbostic 				if (first) {
13251611Sbostic 					(void)snprintf(rxperr, sizeof(rxperr),
13351611Sbostic 					    "unmatched alternator in regexp %s",
13451611Sbostic 					     s);
13551611Sbostic 					return (FALSE);
13651611Sbostic 				}
13751611Sbostic 				return (TRUE);
13851611Sbostic 			}
13951611Sbostic 			break;
14051611Sbostic 		default:
14151611Sbostic 			*rp++ = LIT;
14251611Sbostic 			*rp++ = *sp;
14351611Sbostic 			esc = 0;
14451611Sbostic 			break;
14551611Sbostic 		}
14651611Sbostic 	}
14751611Sbostic 	if (!first) {
14851611Sbostic 		(void)snprintf(rxperr, sizeof(rxperr),
14951611Sbostic 		    "unmatched alternator in regexp %s", s);
15051611Sbostic 		return (FALSE);
15151611Sbostic 	}
15251611Sbostic 	*rp++ = ALT_E;
15351611Sbostic 	*alt_ptr = rp - rxpbuf;
15451611Sbostic 	*rp++ = GRP_E;
15551611Sbostic 	*(rxpbuf + 2) = rp - rxpbuf;
15651611Sbostic 	*rp++ = EOT;
15751611Sbostic 	*rp = END;
15851611Sbostic 	return (TRUE);
15951611Sbostic }
16051611Sbostic 
16151611Sbostic /*
16251611Sbostic  * match string against compiled regular expression
16351611Sbostic  */
16451611Sbostic int
rxp_match(s)16551611Sbostic rxp_match(s)
16651611Sbostic 	register char *	s;
16751611Sbostic {
16851611Sbostic 	return (rxp__match(s, TRUE, NULL, NULL, NULL));
16951611Sbostic }
17051611Sbostic 
17151611Sbostic static int
rxp__match(s,first,j_succ,j_fail,sp_fail)17251611Sbostic rxp__match(s, first, j_succ, j_fail, sp_fail)
17351611Sbostic 	char *s;
17451611Sbostic 	int first;
17551611Sbostic 	Rxp_t *j_succ;		/* jump here on successful alt match */
17651611Sbostic 	Rxp_t *j_fail;		/* jump here on failed match */
17751611Sbostic 	char *sp_fail;		/* reset sp to here on failed match */
17851611Sbostic {
17951611Sbostic 	static Rxp_t *rp;
18051611Sbostic 	static char *sp;
18151611Sbostic 	register int ch;
18251611Sbostic 	Rxp_t *grp_end;
18351611Sbostic 	int err;
18451611Sbostic 
18551611Sbostic 	if (first) {
18651611Sbostic 		rp = rxpbuf;
18751611Sbostic 		sp = s;
18851611Sbostic 	}
18951611Sbostic 	while (rp < rxpbuf + RXP_LINE_SZ && *rp != END)
19051611Sbostic 		switch(*rp) {
19151611Sbostic 		case LIT:
19251611Sbostic 			rp++;
19351611Sbostic 			ch = isascii(*rp) && isupper(*rp) ? tolower(*rp) : *rp;
19451611Sbostic 			if (ch != *sp++) {
19551611Sbostic 				rp = j_fail;
19651611Sbostic 				sp = sp_fail;
19751611Sbostic 				return (TRUE);
19851611Sbostic 			}
19951611Sbostic 			rp++;
20051611Sbostic 			break;
20151611Sbostic 		case SOT:
20251611Sbostic 			if (sp != s)
20351611Sbostic 				return (FALSE);
20451611Sbostic 			rp++;
20551611Sbostic 			break;
20651611Sbostic 		case EOT:
20751611Sbostic 			if (*sp != 0)
20851611Sbostic 				return (FALSE);
20951611Sbostic 			rp++;
21051611Sbostic 			break;
21151611Sbostic 		case GRP_S:
21251611Sbostic 			rp++;
21351611Sbostic 			grp_end = rxpbuf + *rp++;
21451611Sbostic 			break;
21551611Sbostic 		case ALT_S:
21651611Sbostic 			rp++;
21751611Sbostic 			if ((err = rxp__match(sp,
21851611Sbostic 			    FALSE, grp_end, rxpbuf + *rp++, sp)) != TRUE)
21951611Sbostic 				return (err);
22051611Sbostic 			break;
22151611Sbostic 		case ALT_E:
22251611Sbostic 			rp = j_succ;
22351611Sbostic 			return (TRUE);
22451611Sbostic 		case GRP_E:
22551611Sbostic 		default:
22651611Sbostic 			return (FALSE);
22751611Sbostic 		}
22851611Sbostic 	return (*rp != END ? FALSE : TRUE);
22951611Sbostic }
23051611Sbostic 
23151611Sbostic /*
23251611Sbostic  * Reverse engineer the regular expression, by picking first of all alternates.
23351611Sbostic  */
23451611Sbostic char *
rxp_expand()23551611Sbostic rxp_expand()
23651611Sbostic {
23751611Sbostic 	return (rxp__expand(TRUE));
23851611Sbostic }
23951611Sbostic 
24051611Sbostic static char *
rxp__expand(first)24151611Sbostic rxp__expand(first)
24251611Sbostic 	int first;
24351611Sbostic {
24451611Sbostic 	static char buf[RXP_LINE_SZ/2];
24551611Sbostic 	static Rxp_t *rp;
24651611Sbostic 	static char *bp;
24751611Sbostic 	Rxp_t *grp_ptr;
24851611Sbostic 	char *err;
24951611Sbostic 
25051611Sbostic 	if (first) {
25151611Sbostic 		rp = rxpbuf;
25251611Sbostic 		bp = buf;
25351611Sbostic 	}
25451611Sbostic 	while (rp < rxpbuf + RXP_LINE_SZ && *rp != END)
25551611Sbostic 		switch(*rp) {
25651611Sbostic 		case LIT:
25751611Sbostic 			rp++;
25851611Sbostic 			*bp++ = *rp++;
25951611Sbostic 			break;
26051611Sbostic 		case GRP_S:
26151611Sbostic 			rp++;
26251611Sbostic 			grp_ptr = rxpbuf + *rp;
26351611Sbostic 			rp++;
26451611Sbostic 			if ((err = rxp__expand(FALSE)) == NULL)
26551611Sbostic 				return (err);
26651611Sbostic 			rp = grp_ptr;
26751611Sbostic 			break;
26851611Sbostic 		case ALT_E:
26951611Sbostic 			return (buf);
27051611Sbostic 		case ALT_S:
27151611Sbostic 			rp++;
27251611Sbostic 			/* FALLTHROUGH */
27351611Sbostic 		case SOT:
27451611Sbostic 		case EOT:
27551611Sbostic 		case GRP_E:
27651611Sbostic 			rp++;
27751611Sbostic 			break;
27851611Sbostic 		default:
27951611Sbostic 			return (NULL);
28051611Sbostic 		}
28151611Sbostic 	if (first) {
28251611Sbostic 		if (*rp != END)
28351611Sbostic 			return (NULL);
28451611Sbostic 		*bp = '\0';
28551611Sbostic 	}
28651611Sbostic 	return (buf);
28751611Sbostic }
288