1*369abe06Sandvar /* $NetBSD: rxp.c,v 1.14 2021/11/07 20:31:09 andvar Exp $ */
2ee0f9d10Scgd
35bfb98b1Scgd /*-
4a432dc58Scgd * Copyright (c) 1991, 1993
5a432dc58Scgd * The Regents of the University of California. All rights reserved.
65bfb98b1Scgd *
75bfb98b1Scgd * This code is derived from software contributed to Berkeley by
8a432dc58Scgd * Jim R. Oldroyd at The Instruction Set and Keith Gabryelski at
9a432dc58Scgd * Commodore Business Machines.
105bfb98b1Scgd *
115bfb98b1Scgd * Redistribution and use in source and binary forms, with or without
125bfb98b1Scgd * modification, are permitted provided that the following conditions
135bfb98b1Scgd * are met:
145bfb98b1Scgd * 1. Redistributions of source code must retain the above copyright
155bfb98b1Scgd * notice, this list of conditions and the following disclaimer.
165bfb98b1Scgd * 2. Redistributions in binary form must reproduce the above copyright
175bfb98b1Scgd * notice, this list of conditions and the following disclaimer in the
185bfb98b1Scgd * documentation and/or other materials provided with the distribution.
19e5aeb4eaSagc * 3. Neither the name of the University nor the names of its contributors
205bfb98b1Scgd * may be used to endorse or promote products derived from this software
215bfb98b1Scgd * without specific prior written permission.
225bfb98b1Scgd *
235bfb98b1Scgd * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
245bfb98b1Scgd * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
255bfb98b1Scgd * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
265bfb98b1Scgd * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
275bfb98b1Scgd * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
285bfb98b1Scgd * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
295bfb98b1Scgd * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
305bfb98b1Scgd * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
315bfb98b1Scgd * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
325bfb98b1Scgd * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
335bfb98b1Scgd * SUCH DAMAGE.
345bfb98b1Scgd */
355bfb98b1Scgd
36d8297446Slukem #include <sys/cdefs.h>
375bfb98b1Scgd #ifndef lint
38ee0f9d10Scgd #if 0
39a432dc58Scgd static char sccsid[] = "@(#)rxp.c 8.1 (Berkeley) 5/31/93";
40ee0f9d10Scgd #else
41*369abe06Sandvar __RCSID("$NetBSD: rxp.c,v 1.14 2021/11/07 20:31:09 andvar Exp $");
42ee0f9d10Scgd #endif
435bfb98b1Scgd #endif /* not lint */
445bfb98b1Scgd
455bfb98b1Scgd /*
465bfb98b1Scgd * regular expression parser
475bfb98b1Scgd *
485bfb98b1Scgd * external functions and return values are:
495bfb98b1Scgd * rxp_compile(s)
505bfb98b1Scgd * TRUE success
515bfb98b1Scgd * FALSE parse failure; error message will be in char rxperr[]
525bfb98b1Scgd * metas are:
53*369abe06Sandvar * {...} optional pattern, equivalent to [...|]
545bfb98b1Scgd * | alternate pattern
555bfb98b1Scgd * [...] pattern delimiters
565bfb98b1Scgd *
575bfb98b1Scgd * rxp_match(s)
585bfb98b1Scgd * TRUE string s matches compiled pattern
595bfb98b1Scgd * FALSE match failure or regexp error
605bfb98b1Scgd *
615bfb98b1Scgd * rxp_expand()
625bfb98b1Scgd * char * reverse-engineered regular expression string
635bfb98b1Scgd * NULL regexp error
645bfb98b1Scgd */
655bfb98b1Scgd
665bfb98b1Scgd #include <stdio.h>
6733f953abSthorpej #include <stdlib.h>
685bfb98b1Scgd #include <ctype.h>
695bfb98b1Scgd #include "quiz.h"
705bfb98b1Scgd /* regexp tokens, arg */
715bfb98b1Scgd #define LIT (-1) /* literal character, char */
725bfb98b1Scgd #define SOT (-2) /* start text anchor, - */
735bfb98b1Scgd #define EOT (-3) /* end text anchor, - */
745bfb98b1Scgd #define GRP_S (-4) /* start alternate grp, ptr_to_end */
755bfb98b1Scgd #define GRP_E (-5) /* end group, - */
765bfb98b1Scgd #define ALT_S (-6) /* alternate starts, ptr_to_next */
775bfb98b1Scgd #define ALT_E (-7) /* alternate ends, - */
785bfb98b1Scgd #define END (-8) /* end of regexp, - */
795bfb98b1Scgd
805bfb98b1Scgd typedef short Rxp_t; /* type for regexp tokens */
815bfb98b1Scgd
825bfb98b1Scgd static Rxp_t rxpbuf[RXP_LINE_SZ]; /* compiled regular expression buffer */
835bfb98b1Scgd char rxperr[128]; /* parser error message */
845bfb98b1Scgd
85cb5fd834Sjsm static int rxp__compile(const char *, int);
86cb5fd834Sjsm static char *rxp__expand(int);
87cb5fd834Sjsm static int rxp__match(const char *, int, Rxp_t *, Rxp_t *, const char *);
885bfb98b1Scgd
895bfb98b1Scgd int
rxp_compile(const char * s)9004ed5236Sdholland rxp_compile(const char *s)
915bfb98b1Scgd {
925bfb98b1Scgd return (rxp__compile(s, TRUE));
935bfb98b1Scgd }
945bfb98b1Scgd
955bfb98b1Scgd static int
rxp__compile(const char * s,int first)9604ed5236Sdholland rxp__compile(const char *s, int first)
975bfb98b1Scgd {
985bfb98b1Scgd static Rxp_t *rp;
99092d3130Sjsm static const char *sp;
1005bfb98b1Scgd Rxp_t *grp_ptr;
1015bfb98b1Scgd Rxp_t *alt_ptr;
1025bfb98b1Scgd int esc, err;
1035bfb98b1Scgd
1045bfb98b1Scgd esc = 0;
1055bfb98b1Scgd if (first) {
1065bfb98b1Scgd rp = rxpbuf;
1075bfb98b1Scgd sp = s;
1085bfb98b1Scgd *rp++ = SOT; /* auto-anchor: pat is really ^pat$ */
1095bfb98b1Scgd *rp++ = GRP_S; /* auto-group: ^pat$ is really ^[pat]$ */
1105bfb98b1Scgd *rp++ = 0;
1115bfb98b1Scgd }
1125bfb98b1Scgd *rp++ = ALT_S;
1135bfb98b1Scgd alt_ptr = rp;
1145bfb98b1Scgd *rp++ = 0;
1155bfb98b1Scgd for (; *sp; ++sp) {
1165bfb98b1Scgd if (rp - rxpbuf >= RXP_LINE_SZ - 4) {
1175bfb98b1Scgd (void)snprintf(rxperr, sizeof(rxperr),
1185bfb98b1Scgd "regular expression too long %s", s);
1195bfb98b1Scgd return (FALSE);
1205bfb98b1Scgd }
1215bfb98b1Scgd if (*sp == ':' && !esc)
1225bfb98b1Scgd break;
1235bfb98b1Scgd if (esc) {
1245bfb98b1Scgd *rp++ = LIT;
1255bfb98b1Scgd *rp++ = *sp;
1265bfb98b1Scgd esc = 0;
1275bfb98b1Scgd }
1285bfb98b1Scgd else switch (*sp) {
1295bfb98b1Scgd case '\\':
1305bfb98b1Scgd esc = 1;
1315bfb98b1Scgd break;
1325bfb98b1Scgd case '{':
1335bfb98b1Scgd case '[':
1345bfb98b1Scgd *rp++ = GRP_S;
1355bfb98b1Scgd grp_ptr = rp;
1365bfb98b1Scgd *rp++ = 0;
1375bfb98b1Scgd sp++;
1385bfb98b1Scgd if ((err = rxp__compile(s, FALSE)) != TRUE)
1395bfb98b1Scgd return (err);
1405bfb98b1Scgd *rp++ = GRP_E;
1415bfb98b1Scgd *grp_ptr = rp - rxpbuf;
1425bfb98b1Scgd break;
1435bfb98b1Scgd case '}':
1445bfb98b1Scgd case ']':
1455bfb98b1Scgd case '|':
1465bfb98b1Scgd *rp++ = ALT_E;
1475bfb98b1Scgd *alt_ptr = rp - rxpbuf;
1485bfb98b1Scgd if (*sp != ']') {
1495bfb98b1Scgd *rp++ = ALT_S;
1505bfb98b1Scgd alt_ptr = rp;
1515bfb98b1Scgd *rp++ = 0;
1525bfb98b1Scgd }
1535bfb98b1Scgd if (*sp != '|') {
1545bfb98b1Scgd if (*sp != ']') {
1555bfb98b1Scgd *rp++ = ALT_E;
1565bfb98b1Scgd *alt_ptr = rp - rxpbuf;
1575bfb98b1Scgd }
1585bfb98b1Scgd if (first) {
1595bfb98b1Scgd (void)snprintf(rxperr, sizeof(rxperr),
1605bfb98b1Scgd "unmatched alternator in regexp %s",
1615bfb98b1Scgd s);
1625bfb98b1Scgd return (FALSE);
1635bfb98b1Scgd }
1645bfb98b1Scgd return (TRUE);
1655bfb98b1Scgd }
1665bfb98b1Scgd break;
1675bfb98b1Scgd default:
1685bfb98b1Scgd *rp++ = LIT;
1695bfb98b1Scgd *rp++ = *sp;
1705bfb98b1Scgd esc = 0;
1715bfb98b1Scgd break;
1725bfb98b1Scgd }
1735bfb98b1Scgd }
1745bfb98b1Scgd if (!first) {
1755bfb98b1Scgd (void)snprintf(rxperr, sizeof(rxperr),
1765bfb98b1Scgd "unmatched alternator in regexp %s", s);
1775bfb98b1Scgd return (FALSE);
1785bfb98b1Scgd }
1795bfb98b1Scgd *rp++ = ALT_E;
1805bfb98b1Scgd *alt_ptr = rp - rxpbuf;
1815bfb98b1Scgd *rp++ = GRP_E;
1825bfb98b1Scgd *(rxpbuf + 2) = rp - rxpbuf;
1835bfb98b1Scgd *rp++ = EOT;
1845bfb98b1Scgd *rp = END;
1855bfb98b1Scgd return (TRUE);
1865bfb98b1Scgd }
1875bfb98b1Scgd
1885bfb98b1Scgd /*
1895bfb98b1Scgd * match string against compiled regular expression
1905bfb98b1Scgd */
1915bfb98b1Scgd int
rxp_match(const char * s)19204ed5236Sdholland rxp_match(const char *s)
1935bfb98b1Scgd {
1945bfb98b1Scgd return (rxp__match(s, TRUE, NULL, NULL, NULL));
1955bfb98b1Scgd }
1965bfb98b1Scgd
1975bfb98b1Scgd static int
rxp__match(const char * s,int first,Rxp_t * j_succ,Rxp_t * j_fail,const char * sp_fail)19804ed5236Sdholland rxp__match(const char *s,
19904ed5236Sdholland int first,
20004ed5236Sdholland Rxp_t *j_succ, /* jump here on successful alt match */
20104ed5236Sdholland Rxp_t *j_fail, /* jump here on failed match */
20204ed5236Sdholland const char *sp_fail) /* reset sp to here on failed match */
2035bfb98b1Scgd {
2045bfb98b1Scgd static Rxp_t *rp;
205092d3130Sjsm static const char *sp;
206d8297446Slukem int ch;
207d8297446Slukem Rxp_t *grp_end = NULL;
2085bfb98b1Scgd
2095bfb98b1Scgd if (first) {
2105bfb98b1Scgd rp = rxpbuf;
2115bfb98b1Scgd sp = s;
2125bfb98b1Scgd }
2135bfb98b1Scgd while (rp < rxpbuf + RXP_LINE_SZ && *rp != END)
2145bfb98b1Scgd switch(*rp) {
2155bfb98b1Scgd case LIT:
2165bfb98b1Scgd rp++;
2175bfb98b1Scgd ch = isascii(*rp) && isupper(*rp) ? tolower(*rp) : *rp;
2185bfb98b1Scgd if (ch != *sp++) {
2195bfb98b1Scgd rp = j_fail;
2205bfb98b1Scgd sp = sp_fail;
221a7d4c586Sdbj return (FALSE);
2225bfb98b1Scgd }
2235bfb98b1Scgd rp++;
2245bfb98b1Scgd break;
2255bfb98b1Scgd case SOT:
2265bfb98b1Scgd if (sp != s)
2275bfb98b1Scgd return (FALSE);
2285bfb98b1Scgd rp++;
2295bfb98b1Scgd break;
2305bfb98b1Scgd case EOT:
2315bfb98b1Scgd if (*sp != 0)
2325bfb98b1Scgd return (FALSE);
2335bfb98b1Scgd rp++;
2345bfb98b1Scgd break;
2355bfb98b1Scgd case GRP_S:
2365bfb98b1Scgd rp++;
2375bfb98b1Scgd grp_end = rxpbuf + *rp++;
2385bfb98b1Scgd break;
2395bfb98b1Scgd case ALT_S:
2405bfb98b1Scgd rp++;
241a7d4c586Sdbj rxp__match(sp, FALSE, grp_end, rxpbuf + *rp++, sp);
2425bfb98b1Scgd break;
2435bfb98b1Scgd case ALT_E:
2445bfb98b1Scgd rp = j_succ;
2455bfb98b1Scgd return (TRUE);
2465bfb98b1Scgd case GRP_E:
247a7d4c586Sdbj rp = j_fail;
248a7d4c586Sdbj sp = sp_fail;
2495bfb98b1Scgd return (FALSE);
250a7d4c586Sdbj default:
251a7d4c586Sdbj abort();
2525bfb98b1Scgd }
2535bfb98b1Scgd return (*rp != END ? FALSE : TRUE);
2545bfb98b1Scgd }
2555bfb98b1Scgd
2565bfb98b1Scgd /*
2575bfb98b1Scgd * Reverse engineer the regular expression, by picking first of all alternates.
2585bfb98b1Scgd */
2595bfb98b1Scgd char *
rxp_expand(void)26004ed5236Sdholland rxp_expand(void)
2615bfb98b1Scgd {
2625bfb98b1Scgd return (rxp__expand(TRUE));
2635bfb98b1Scgd }
2645bfb98b1Scgd
2655bfb98b1Scgd static char *
rxp__expand(int first)26604ed5236Sdholland rxp__expand(int first)
2675bfb98b1Scgd {
2685bfb98b1Scgd static char buf[RXP_LINE_SZ/2];
2695bfb98b1Scgd static Rxp_t *rp;
2705bfb98b1Scgd static char *bp;
2715bfb98b1Scgd Rxp_t *grp_ptr;
2725bfb98b1Scgd char *err;
2735bfb98b1Scgd
2745bfb98b1Scgd if (first) {
2755bfb98b1Scgd rp = rxpbuf;
2765bfb98b1Scgd bp = buf;
2775bfb98b1Scgd }
2785bfb98b1Scgd while (rp < rxpbuf + RXP_LINE_SZ && *rp != END)
2795bfb98b1Scgd switch(*rp) {
2805bfb98b1Scgd case LIT:
2815bfb98b1Scgd rp++;
2825bfb98b1Scgd *bp++ = *rp++;
2835bfb98b1Scgd break;
2845bfb98b1Scgd case GRP_S:
2855bfb98b1Scgd rp++;
2865bfb98b1Scgd grp_ptr = rxpbuf + *rp;
2875bfb98b1Scgd rp++;
2885bfb98b1Scgd if ((err = rxp__expand(FALSE)) == NULL)
2895bfb98b1Scgd return (err);
2905bfb98b1Scgd rp = grp_ptr;
2915bfb98b1Scgd break;
2925bfb98b1Scgd case ALT_E:
2935bfb98b1Scgd return (buf);
2945bfb98b1Scgd case ALT_S:
2955bfb98b1Scgd rp++;
2965bfb98b1Scgd /* FALLTHROUGH */
2975bfb98b1Scgd case SOT:
2985bfb98b1Scgd case EOT:
2995bfb98b1Scgd case GRP_E:
3005bfb98b1Scgd rp++;
3015bfb98b1Scgd break;
3025bfb98b1Scgd default:
3035bfb98b1Scgd return (NULL);
3045bfb98b1Scgd }
3055bfb98b1Scgd if (first) {
3065bfb98b1Scgd if (*rp != END)
3075bfb98b1Scgd return (NULL);
3085bfb98b1Scgd *bp = '\0';
3095bfb98b1Scgd }
3105bfb98b1Scgd return (buf);
3115bfb98b1Scgd }
312