xref: /netbsd-src/games/quiz/rxp.c (revision 369abe069b9a5e1b28efe903ba0c77aa728c498e)
1*369abe06Sandvar /*	$NetBSD: rxp.c,v 1.14 2021/11/07 20:31:09 andvar Exp $	*/
2ee0f9d10Scgd 
35bfb98b1Scgd /*-
4a432dc58Scgd  * Copyright (c) 1991, 1993
5a432dc58Scgd  *	The Regents of the University of California.  All rights reserved.
65bfb98b1Scgd  *
75bfb98b1Scgd  * This code is derived from software contributed to Berkeley by
8a432dc58Scgd  * Jim R. Oldroyd at The Instruction Set and Keith Gabryelski at
9a432dc58Scgd  * Commodore Business Machines.
105bfb98b1Scgd  *
115bfb98b1Scgd  * Redistribution and use in source and binary forms, with or without
125bfb98b1Scgd  * modification, are permitted provided that the following conditions
135bfb98b1Scgd  * are met:
145bfb98b1Scgd  * 1. Redistributions of source code must retain the above copyright
155bfb98b1Scgd  *    notice, this list of conditions and the following disclaimer.
165bfb98b1Scgd  * 2. Redistributions in binary form must reproduce the above copyright
175bfb98b1Scgd  *    notice, this list of conditions and the following disclaimer in the
185bfb98b1Scgd  *    documentation and/or other materials provided with the distribution.
19e5aeb4eaSagc  * 3. Neither the name of the University nor the names of its contributors
205bfb98b1Scgd  *    may be used to endorse or promote products derived from this software
215bfb98b1Scgd  *    without specific prior written permission.
225bfb98b1Scgd  *
235bfb98b1Scgd  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
245bfb98b1Scgd  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
255bfb98b1Scgd  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
265bfb98b1Scgd  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
275bfb98b1Scgd  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
285bfb98b1Scgd  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
295bfb98b1Scgd  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
305bfb98b1Scgd  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
315bfb98b1Scgd  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
325bfb98b1Scgd  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
335bfb98b1Scgd  * SUCH DAMAGE.
345bfb98b1Scgd  */
355bfb98b1Scgd 
36d8297446Slukem #include <sys/cdefs.h>
375bfb98b1Scgd #ifndef lint
38ee0f9d10Scgd #if 0
39a432dc58Scgd static char sccsid[] = "@(#)rxp.c	8.1 (Berkeley) 5/31/93";
40ee0f9d10Scgd #else
41*369abe06Sandvar __RCSID("$NetBSD: rxp.c,v 1.14 2021/11/07 20:31:09 andvar Exp $");
42ee0f9d10Scgd #endif
435bfb98b1Scgd #endif /* not lint */
445bfb98b1Scgd 
455bfb98b1Scgd /*
465bfb98b1Scgd  * regular expression parser
475bfb98b1Scgd  *
485bfb98b1Scgd  * external functions and return values are:
495bfb98b1Scgd  * rxp_compile(s)
505bfb98b1Scgd  *	TRUE	success
515bfb98b1Scgd  *	FALSE	parse failure; error message will be in char rxperr[]
525bfb98b1Scgd  * metas are:
53*369abe06Sandvar  *	{...}	optional pattern, equivalent to [...|]
545bfb98b1Scgd  *	|	alternate pattern
555bfb98b1Scgd  *	[...]	pattern delimiters
565bfb98b1Scgd  *
575bfb98b1Scgd  * rxp_match(s)
585bfb98b1Scgd  *	TRUE	string s matches compiled pattern
595bfb98b1Scgd  *	FALSE	match failure or regexp error
605bfb98b1Scgd  *
615bfb98b1Scgd  * rxp_expand()
625bfb98b1Scgd  *	char *	reverse-engineered regular expression string
635bfb98b1Scgd  *	NULL	regexp error
645bfb98b1Scgd  */
655bfb98b1Scgd 
665bfb98b1Scgd #include <stdio.h>
6733f953abSthorpej #include <stdlib.h>
685bfb98b1Scgd #include <ctype.h>
695bfb98b1Scgd #include "quiz.h"
705bfb98b1Scgd 					/* regexp tokens,	arg */
715bfb98b1Scgd #define	LIT	(-1)			/* literal character,	char */
725bfb98b1Scgd #define	SOT	(-2)			/* start text anchor,	- */
735bfb98b1Scgd #define	EOT	(-3)			/* end text anchor,	- */
745bfb98b1Scgd #define	GRP_S	(-4)			/* start alternate grp,	ptr_to_end */
755bfb98b1Scgd #define	GRP_E	(-5)			/* end group,		- */
765bfb98b1Scgd #define	ALT_S	(-6)			/* alternate starts,	ptr_to_next */
775bfb98b1Scgd #define	ALT_E	(-7)			/* alternate ends,	- */
785bfb98b1Scgd #define	END	(-8)			/* end of regexp,	- */
795bfb98b1Scgd 
805bfb98b1Scgd typedef short Rxp_t;			/* type for regexp tokens */
815bfb98b1Scgd 
825bfb98b1Scgd static Rxp_t rxpbuf[RXP_LINE_SZ];	/* compiled regular expression buffer */
835bfb98b1Scgd char rxperr[128];			/* parser error message */
845bfb98b1Scgd 
85cb5fd834Sjsm static int	 rxp__compile(const char *, int);
86cb5fd834Sjsm static char	*rxp__expand(int);
87cb5fd834Sjsm static int	 rxp__match(const char *, int, Rxp_t *, Rxp_t *, const char *);
885bfb98b1Scgd 
895bfb98b1Scgd int
rxp_compile(const char * s)9004ed5236Sdholland rxp_compile(const char *s)
915bfb98b1Scgd {
925bfb98b1Scgd 	return (rxp__compile(s, TRUE));
935bfb98b1Scgd }
945bfb98b1Scgd 
955bfb98b1Scgd static int
rxp__compile(const char * s,int first)9604ed5236Sdholland rxp__compile(const char *s, int first)
975bfb98b1Scgd {
985bfb98b1Scgd 	static Rxp_t *rp;
99092d3130Sjsm 	static const char *sp;
1005bfb98b1Scgd 	Rxp_t *grp_ptr;
1015bfb98b1Scgd 	Rxp_t *alt_ptr;
1025bfb98b1Scgd 	int esc, err;
1035bfb98b1Scgd 
1045bfb98b1Scgd 	esc = 0;
1055bfb98b1Scgd 	if (first) {
1065bfb98b1Scgd 		rp = rxpbuf;
1075bfb98b1Scgd 		sp = s;
1085bfb98b1Scgd 		*rp++ = SOT;	/* auto-anchor: pat is really ^pat$ */
1095bfb98b1Scgd 		*rp++ = GRP_S;	/* auto-group: ^pat$ is really ^[pat]$ */
1105bfb98b1Scgd 		*rp++ = 0;
1115bfb98b1Scgd 	}
1125bfb98b1Scgd 	*rp++ = ALT_S;
1135bfb98b1Scgd 	alt_ptr = rp;
1145bfb98b1Scgd 	*rp++ = 0;
1155bfb98b1Scgd 	for (; *sp; ++sp) {
1165bfb98b1Scgd 		if (rp - rxpbuf >= RXP_LINE_SZ - 4) {
1175bfb98b1Scgd 			(void)snprintf(rxperr, sizeof(rxperr),
1185bfb98b1Scgd 			    "regular expression too long %s", s);
1195bfb98b1Scgd 			return (FALSE);
1205bfb98b1Scgd 		}
1215bfb98b1Scgd 		if (*sp == ':' && !esc)
1225bfb98b1Scgd 			break;
1235bfb98b1Scgd 		if (esc) {
1245bfb98b1Scgd 			*rp++ = LIT;
1255bfb98b1Scgd 			*rp++ = *sp;
1265bfb98b1Scgd 			esc = 0;
1275bfb98b1Scgd 		}
1285bfb98b1Scgd 		else switch (*sp) {
1295bfb98b1Scgd 		case '\\':
1305bfb98b1Scgd 			esc = 1;
1315bfb98b1Scgd 			break;
1325bfb98b1Scgd 		case '{':
1335bfb98b1Scgd 		case '[':
1345bfb98b1Scgd 			*rp++ = GRP_S;
1355bfb98b1Scgd 			grp_ptr = rp;
1365bfb98b1Scgd 			*rp++ = 0;
1375bfb98b1Scgd 			sp++;
1385bfb98b1Scgd 			if ((err = rxp__compile(s, FALSE)) != TRUE)
1395bfb98b1Scgd 				return (err);
1405bfb98b1Scgd 			*rp++ = GRP_E;
1415bfb98b1Scgd 			*grp_ptr = rp - rxpbuf;
1425bfb98b1Scgd 			break;
1435bfb98b1Scgd 		case '}':
1445bfb98b1Scgd 		case ']':
1455bfb98b1Scgd 		case '|':
1465bfb98b1Scgd 			*rp++ = ALT_E;
1475bfb98b1Scgd 			*alt_ptr = rp - rxpbuf;
1485bfb98b1Scgd 			if (*sp != ']') {
1495bfb98b1Scgd 				*rp++ = ALT_S;
1505bfb98b1Scgd 				alt_ptr = rp;
1515bfb98b1Scgd 				*rp++ = 0;
1525bfb98b1Scgd 			}
1535bfb98b1Scgd 			if (*sp != '|') {
1545bfb98b1Scgd 				if (*sp != ']') {
1555bfb98b1Scgd 					*rp++ = ALT_E;
1565bfb98b1Scgd 					*alt_ptr = rp - rxpbuf;
1575bfb98b1Scgd 				}
1585bfb98b1Scgd 				if (first) {
1595bfb98b1Scgd 					(void)snprintf(rxperr, sizeof(rxperr),
1605bfb98b1Scgd 					    "unmatched alternator in regexp %s",
1615bfb98b1Scgd 					     s);
1625bfb98b1Scgd 					return (FALSE);
1635bfb98b1Scgd 				}
1645bfb98b1Scgd 				return (TRUE);
1655bfb98b1Scgd 			}
1665bfb98b1Scgd 			break;
1675bfb98b1Scgd 		default:
1685bfb98b1Scgd 			*rp++ = LIT;
1695bfb98b1Scgd 			*rp++ = *sp;
1705bfb98b1Scgd 			esc = 0;
1715bfb98b1Scgd 			break;
1725bfb98b1Scgd 		}
1735bfb98b1Scgd 	}
1745bfb98b1Scgd 	if (!first) {
1755bfb98b1Scgd 		(void)snprintf(rxperr, sizeof(rxperr),
1765bfb98b1Scgd 		    "unmatched alternator in regexp %s", s);
1775bfb98b1Scgd 		return (FALSE);
1785bfb98b1Scgd 	}
1795bfb98b1Scgd 	*rp++ = ALT_E;
1805bfb98b1Scgd 	*alt_ptr = rp - rxpbuf;
1815bfb98b1Scgd 	*rp++ = GRP_E;
1825bfb98b1Scgd 	*(rxpbuf + 2) = rp - rxpbuf;
1835bfb98b1Scgd 	*rp++ = EOT;
1845bfb98b1Scgd 	*rp = END;
1855bfb98b1Scgd 	return (TRUE);
1865bfb98b1Scgd }
1875bfb98b1Scgd 
1885bfb98b1Scgd /*
1895bfb98b1Scgd  * match string against compiled regular expression
1905bfb98b1Scgd  */
1915bfb98b1Scgd int
rxp_match(const char * s)19204ed5236Sdholland rxp_match(const char *s)
1935bfb98b1Scgd {
1945bfb98b1Scgd 	return (rxp__match(s, TRUE, NULL, NULL, NULL));
1955bfb98b1Scgd }
1965bfb98b1Scgd 
1975bfb98b1Scgd static int
rxp__match(const char * s,int first,Rxp_t * j_succ,Rxp_t * j_fail,const char * sp_fail)19804ed5236Sdholland rxp__match(const char *s,
19904ed5236Sdholland 	   int first,
20004ed5236Sdholland 	   Rxp_t *j_succ,		/* jump here on successful alt match */
20104ed5236Sdholland 	   Rxp_t *j_fail,		/* jump here on failed match */
20204ed5236Sdholland 	   const char *sp_fail)		/* reset sp to here on failed match */
2035bfb98b1Scgd {
2045bfb98b1Scgd 	static Rxp_t *rp;
205092d3130Sjsm 	static const char *sp;
206d8297446Slukem 	int ch;
207d8297446Slukem 	Rxp_t *grp_end = NULL;
2085bfb98b1Scgd 
2095bfb98b1Scgd 	if (first) {
2105bfb98b1Scgd 		rp = rxpbuf;
2115bfb98b1Scgd 		sp = s;
2125bfb98b1Scgd 	}
2135bfb98b1Scgd 	while (rp < rxpbuf + RXP_LINE_SZ && *rp != END)
2145bfb98b1Scgd 		switch(*rp) {
2155bfb98b1Scgd 		case LIT:
2165bfb98b1Scgd 			rp++;
2175bfb98b1Scgd 			ch = isascii(*rp) && isupper(*rp) ? tolower(*rp) : *rp;
2185bfb98b1Scgd 			if (ch != *sp++) {
2195bfb98b1Scgd 				rp = j_fail;
2205bfb98b1Scgd 				sp = sp_fail;
221a7d4c586Sdbj 				return (FALSE);
2225bfb98b1Scgd 			}
2235bfb98b1Scgd 			rp++;
2245bfb98b1Scgd 			break;
2255bfb98b1Scgd 		case SOT:
2265bfb98b1Scgd 			if (sp != s)
2275bfb98b1Scgd 				return (FALSE);
2285bfb98b1Scgd 			rp++;
2295bfb98b1Scgd 			break;
2305bfb98b1Scgd 		case EOT:
2315bfb98b1Scgd 			if (*sp != 0)
2325bfb98b1Scgd 				return (FALSE);
2335bfb98b1Scgd 			rp++;
2345bfb98b1Scgd 			break;
2355bfb98b1Scgd 		case GRP_S:
2365bfb98b1Scgd 			rp++;
2375bfb98b1Scgd 			grp_end = rxpbuf + *rp++;
2385bfb98b1Scgd 			break;
2395bfb98b1Scgd 		case ALT_S:
2405bfb98b1Scgd 			rp++;
241a7d4c586Sdbj 			rxp__match(sp, FALSE, grp_end, rxpbuf + *rp++, sp);
2425bfb98b1Scgd 			break;
2435bfb98b1Scgd 		case ALT_E:
2445bfb98b1Scgd 			rp = j_succ;
2455bfb98b1Scgd 			return (TRUE);
2465bfb98b1Scgd 		case GRP_E:
247a7d4c586Sdbj 			rp = j_fail;
248a7d4c586Sdbj 			sp = sp_fail;
2495bfb98b1Scgd 			return (FALSE);
250a7d4c586Sdbj 		default:
251a7d4c586Sdbj 			abort();
2525bfb98b1Scgd 		}
2535bfb98b1Scgd 	return (*rp != END ? FALSE : TRUE);
2545bfb98b1Scgd }
2555bfb98b1Scgd 
2565bfb98b1Scgd /*
2575bfb98b1Scgd  * Reverse engineer the regular expression, by picking first of all alternates.
2585bfb98b1Scgd  */
2595bfb98b1Scgd char *
rxp_expand(void)26004ed5236Sdholland rxp_expand(void)
2615bfb98b1Scgd {
2625bfb98b1Scgd 	return (rxp__expand(TRUE));
2635bfb98b1Scgd }
2645bfb98b1Scgd 
2655bfb98b1Scgd static char *
rxp__expand(int first)26604ed5236Sdholland rxp__expand(int first)
2675bfb98b1Scgd {
2685bfb98b1Scgd 	static char buf[RXP_LINE_SZ/2];
2695bfb98b1Scgd 	static Rxp_t *rp;
2705bfb98b1Scgd 	static char *bp;
2715bfb98b1Scgd 	Rxp_t *grp_ptr;
2725bfb98b1Scgd 	char *err;
2735bfb98b1Scgd 
2745bfb98b1Scgd 	if (first) {
2755bfb98b1Scgd 		rp = rxpbuf;
2765bfb98b1Scgd 		bp = buf;
2775bfb98b1Scgd 	}
2785bfb98b1Scgd 	while (rp < rxpbuf + RXP_LINE_SZ && *rp != END)
2795bfb98b1Scgd 		switch(*rp) {
2805bfb98b1Scgd 		case LIT:
2815bfb98b1Scgd 			rp++;
2825bfb98b1Scgd 			*bp++ = *rp++;
2835bfb98b1Scgd 			break;
2845bfb98b1Scgd 		case GRP_S:
2855bfb98b1Scgd 			rp++;
2865bfb98b1Scgd 			grp_ptr = rxpbuf + *rp;
2875bfb98b1Scgd 			rp++;
2885bfb98b1Scgd 			if ((err = rxp__expand(FALSE)) == NULL)
2895bfb98b1Scgd 				return (err);
2905bfb98b1Scgd 			rp = grp_ptr;
2915bfb98b1Scgd 			break;
2925bfb98b1Scgd 		case ALT_E:
2935bfb98b1Scgd 			return (buf);
2945bfb98b1Scgd 		case ALT_S:
2955bfb98b1Scgd 			rp++;
2965bfb98b1Scgd 			/* FALLTHROUGH */
2975bfb98b1Scgd 		case SOT:
2985bfb98b1Scgd 		case EOT:
2995bfb98b1Scgd 		case GRP_E:
3005bfb98b1Scgd 			rp++;
3015bfb98b1Scgd 			break;
3025bfb98b1Scgd 		default:
3035bfb98b1Scgd 			return (NULL);
3045bfb98b1Scgd 		}
3055bfb98b1Scgd 	if (first) {
3065bfb98b1Scgd 		if (*rp != END)
3075bfb98b1Scgd 			return (NULL);
3085bfb98b1Scgd 		*bp = '\0';
3095bfb98b1Scgd 	}
3105bfb98b1Scgd 	return (buf);
3115bfb98b1Scgd }
312