1*51611Sbostic /*- 2*51611Sbostic * Copyright (c) 1991 The Regents of the University of California. 3*51611Sbostic * All rights reserved. 4*51611Sbostic * 5*51611Sbostic * This code is derived from software contributed to Berkeley by 6*51611Sbostic * Jim R. Oldroyd at The Instruction Set. 7*51611Sbostic * 8*51611Sbostic * %sccs.include.redist.c% 9*51611Sbostic */ 10*51611Sbostic 11*51611Sbostic #ifndef lint 12*51611Sbostic static char sccsid[] = "@(#)rxp.c 5.1 (Berkeley) 11/10/91"; 13*51611Sbostic #endif /* not lint */ 14*51611Sbostic 15*51611Sbostic /* 16*51611Sbostic * regular expression parser 17*51611Sbostic * 18*51611Sbostic * external functions and return values are: 19*51611Sbostic * rxp_compile(s) 20*51611Sbostic * TRUE success 21*51611Sbostic * FALSE parse failure; error message will be in char rxperr[] 22*51611Sbostic * metas are: 23*51611Sbostic * {...} optional pattern, equialent to [...|] 24*51611Sbostic * | alternate pattern 25*51611Sbostic * [...] pattern delimiters 26*51611Sbostic * 27*51611Sbostic * rxp_match(s) 28*51611Sbostic * TRUE string s matches compiled pattern 29*51611Sbostic * FALSE match failure or regexp error 30*51611Sbostic * 31*51611Sbostic * rxp_expand() 32*51611Sbostic * char * reverse-engineered regular expression string 33*51611Sbostic * NULL regexp error 34*51611Sbostic */ 35*51611Sbostic 36*51611Sbostic #include <stdio.h> 37*51611Sbostic #include <ctype.h> 38*51611Sbostic #include "quiz.h" 39*51611Sbostic /* regexp tokens, arg */ 40*51611Sbostic #define LIT (-1) /* literal character, char */ 41*51611Sbostic #define SOT (-2) /* start text anchor, - */ 42*51611Sbostic #define EOT (-3) /* end text anchor, - */ 43*51611Sbostic #define GRP_S (-4) /* start alternate grp, ptr_to_end */ 44*51611Sbostic #define GRP_E (-5) /* end group, - */ 45*51611Sbostic #define ALT_S (-6) /* alternate starts, ptr_to_next */ 46*51611Sbostic #define ALT_E (-7) /* alternate ends, - */ 47*51611Sbostic #define END (-8) /* end of regexp, - */ 48*51611Sbostic 49*51611Sbostic typedef short Rxp_t; /* type for regexp tokens */ 50*51611Sbostic 51*51611Sbostic static Rxp_t rxpbuf[RXP_LINE_SZ]; /* compiled regular expression buffer */ 52*51611Sbostic char rxperr[128]; /* parser error message */ 53*51611Sbostic 54*51611Sbostic int rxp__compile __P((char *, int)); 55*51611Sbostic char *rxp__expand __P((int)); 56*51611Sbostic int rxp__match __P((char *, int, Rxp_t *, Rxp_t *, char *)); 57*51611Sbostic 58*51611Sbostic int 59*51611Sbostic rxp_compile(s) 60*51611Sbostic register char * s; 61*51611Sbostic { 62*51611Sbostic return (rxp__compile(s, TRUE)); 63*51611Sbostic } 64*51611Sbostic 65*51611Sbostic static int 66*51611Sbostic rxp__compile(s, first) 67*51611Sbostic register char *s; 68*51611Sbostic int first; 69*51611Sbostic { 70*51611Sbostic static Rxp_t *rp; 71*51611Sbostic static char *sp; 72*51611Sbostic Rxp_t *grp_ptr; 73*51611Sbostic Rxp_t *alt_ptr; 74*51611Sbostic int esc, err; 75*51611Sbostic 76*51611Sbostic esc = 0; 77*51611Sbostic if (first) { 78*51611Sbostic rp = rxpbuf; 79*51611Sbostic sp = s; 80*51611Sbostic *rp++ = SOT; /* auto-anchor: pat is really ^pat$ */ 81*51611Sbostic *rp++ = GRP_S; /* auto-group: ^pat$ is really ^[pat]$ */ 82*51611Sbostic *rp++ = 0; 83*51611Sbostic } 84*51611Sbostic *rp++ = ALT_S; 85*51611Sbostic alt_ptr = rp; 86*51611Sbostic *rp++ = 0; 87*51611Sbostic for (; *sp; ++sp) { 88*51611Sbostic if (rp - rxpbuf >= RXP_LINE_SZ - 4) { 89*51611Sbostic (void)snprintf(rxperr, sizeof(rxperr), 90*51611Sbostic "regular expression too long %s", s); 91*51611Sbostic return (FALSE); 92*51611Sbostic } 93*51611Sbostic if (*sp == ':' && !esc) 94*51611Sbostic break; 95*51611Sbostic if (esc) { 96*51611Sbostic *rp++ = LIT; 97*51611Sbostic *rp++ = *sp; 98*51611Sbostic esc = 0; 99*51611Sbostic } 100*51611Sbostic else switch (*sp) { 101*51611Sbostic case '\\': 102*51611Sbostic esc = 1; 103*51611Sbostic break; 104*51611Sbostic case '{': 105*51611Sbostic case '[': 106*51611Sbostic *rp++ = GRP_S; 107*51611Sbostic grp_ptr = rp; 108*51611Sbostic *rp++ = 0; 109*51611Sbostic sp++; 110*51611Sbostic if ((err = rxp__compile(s, FALSE)) != TRUE) 111*51611Sbostic return (err); 112*51611Sbostic *rp++ = GRP_E; 113*51611Sbostic *grp_ptr = rp - rxpbuf; 114*51611Sbostic break; 115*51611Sbostic case '}': 116*51611Sbostic case ']': 117*51611Sbostic case '|': 118*51611Sbostic *rp++ = ALT_E; 119*51611Sbostic *alt_ptr = rp - rxpbuf; 120*51611Sbostic if (*sp != ']') { 121*51611Sbostic *rp++ = ALT_S; 122*51611Sbostic alt_ptr = rp; 123*51611Sbostic *rp++ = 0; 124*51611Sbostic } 125*51611Sbostic if (*sp != '|') { 126*51611Sbostic if (*sp != ']') { 127*51611Sbostic *rp++ = ALT_E; 128*51611Sbostic *alt_ptr = rp - rxpbuf; 129*51611Sbostic } 130*51611Sbostic if (first) { 131*51611Sbostic (void)snprintf(rxperr, sizeof(rxperr), 132*51611Sbostic "unmatched alternator in regexp %s", 133*51611Sbostic s); 134*51611Sbostic return (FALSE); 135*51611Sbostic } 136*51611Sbostic return (TRUE); 137*51611Sbostic } 138*51611Sbostic break; 139*51611Sbostic default: 140*51611Sbostic *rp++ = LIT; 141*51611Sbostic *rp++ = *sp; 142*51611Sbostic esc = 0; 143*51611Sbostic break; 144*51611Sbostic } 145*51611Sbostic } 146*51611Sbostic if (!first) { 147*51611Sbostic (void)snprintf(rxperr, sizeof(rxperr), 148*51611Sbostic "unmatched alternator in regexp %s", s); 149*51611Sbostic return (FALSE); 150*51611Sbostic } 151*51611Sbostic *rp++ = ALT_E; 152*51611Sbostic *alt_ptr = rp - rxpbuf; 153*51611Sbostic *rp++ = GRP_E; 154*51611Sbostic *(rxpbuf + 2) = rp - rxpbuf; 155*51611Sbostic *rp++ = EOT; 156*51611Sbostic *rp = END; 157*51611Sbostic return (TRUE); 158*51611Sbostic } 159*51611Sbostic 160*51611Sbostic /* 161*51611Sbostic * match string against compiled regular expression 162*51611Sbostic */ 163*51611Sbostic int 164*51611Sbostic rxp_match(s) 165*51611Sbostic register char * s; 166*51611Sbostic { 167*51611Sbostic return (rxp__match(s, TRUE, NULL, NULL, NULL)); 168*51611Sbostic } 169*51611Sbostic 170*51611Sbostic static int 171*51611Sbostic rxp__match(s, first, j_succ, j_fail, sp_fail) 172*51611Sbostic char *s; 173*51611Sbostic int first; 174*51611Sbostic Rxp_t *j_succ; /* jump here on successful alt match */ 175*51611Sbostic Rxp_t *j_fail; /* jump here on failed match */ 176*51611Sbostic char *sp_fail; /* reset sp to here on failed match */ 177*51611Sbostic { 178*51611Sbostic static Rxp_t *rp; 179*51611Sbostic static char *sp; 180*51611Sbostic register int ch; 181*51611Sbostic Rxp_t *grp_end; 182*51611Sbostic int err; 183*51611Sbostic 184*51611Sbostic if (first) { 185*51611Sbostic rp = rxpbuf; 186*51611Sbostic sp = s; 187*51611Sbostic } 188*51611Sbostic while (rp < rxpbuf + RXP_LINE_SZ && *rp != END) 189*51611Sbostic switch(*rp) { 190*51611Sbostic case LIT: 191*51611Sbostic rp++; 192*51611Sbostic ch = isascii(*rp) && isupper(*rp) ? tolower(*rp) : *rp; 193*51611Sbostic if (ch != *sp++) { 194*51611Sbostic rp = j_fail; 195*51611Sbostic sp = sp_fail; 196*51611Sbostic return (TRUE); 197*51611Sbostic } 198*51611Sbostic rp++; 199*51611Sbostic break; 200*51611Sbostic case SOT: 201*51611Sbostic if (sp != s) 202*51611Sbostic return (FALSE); 203*51611Sbostic rp++; 204*51611Sbostic break; 205*51611Sbostic case EOT: 206*51611Sbostic if (*sp != 0) 207*51611Sbostic return (FALSE); 208*51611Sbostic rp++; 209*51611Sbostic break; 210*51611Sbostic case GRP_S: 211*51611Sbostic rp++; 212*51611Sbostic grp_end = rxpbuf + *rp++; 213*51611Sbostic break; 214*51611Sbostic case ALT_S: 215*51611Sbostic rp++; 216*51611Sbostic if ((err = rxp__match(sp, 217*51611Sbostic FALSE, grp_end, rxpbuf + *rp++, sp)) != TRUE) 218*51611Sbostic return (err); 219*51611Sbostic break; 220*51611Sbostic case ALT_E: 221*51611Sbostic rp = j_succ; 222*51611Sbostic return (TRUE); 223*51611Sbostic case GRP_E: 224*51611Sbostic default: 225*51611Sbostic return (FALSE); 226*51611Sbostic } 227*51611Sbostic return (*rp != END ? FALSE : TRUE); 228*51611Sbostic } 229*51611Sbostic 230*51611Sbostic /* 231*51611Sbostic * Reverse engineer the regular expression, by picking first of all alternates. 232*51611Sbostic */ 233*51611Sbostic char * 234*51611Sbostic rxp_expand() 235*51611Sbostic { 236*51611Sbostic return (rxp__expand(TRUE)); 237*51611Sbostic } 238*51611Sbostic 239*51611Sbostic static char * 240*51611Sbostic rxp__expand(first) 241*51611Sbostic int first; 242*51611Sbostic { 243*51611Sbostic static char buf[RXP_LINE_SZ/2]; 244*51611Sbostic static Rxp_t *rp; 245*51611Sbostic static char *bp; 246*51611Sbostic Rxp_t *grp_ptr; 247*51611Sbostic char *err; 248*51611Sbostic 249*51611Sbostic if (first) { 250*51611Sbostic rp = rxpbuf; 251*51611Sbostic bp = buf; 252*51611Sbostic } 253*51611Sbostic while (rp < rxpbuf + RXP_LINE_SZ && *rp != END) 254*51611Sbostic switch(*rp) { 255*51611Sbostic case LIT: 256*51611Sbostic rp++; 257*51611Sbostic *bp++ = *rp++; 258*51611Sbostic break; 259*51611Sbostic case GRP_S: 260*51611Sbostic rp++; 261*51611Sbostic grp_ptr = rxpbuf + *rp; 262*51611Sbostic rp++; 263*51611Sbostic if ((err = rxp__expand(FALSE)) == NULL) 264*51611Sbostic return (err); 265*51611Sbostic rp = grp_ptr; 266*51611Sbostic break; 267*51611Sbostic case ALT_E: 268*51611Sbostic return (buf); 269*51611Sbostic case ALT_S: 270*51611Sbostic rp++; 271*51611Sbostic /* FALLTHROUGH */ 272*51611Sbostic case SOT: 273*51611Sbostic case EOT: 274*51611Sbostic case GRP_E: 275*51611Sbostic rp++; 276*51611Sbostic break; 277*51611Sbostic default: 278*51611Sbostic return (NULL); 279*51611Sbostic } 280*51611Sbostic if (first) { 281*51611Sbostic if (*rp != END) 282*51611Sbostic return (NULL); 283*51611Sbostic *bp = '\0'; 284*51611Sbostic } 285*51611Sbostic return (buf); 286*51611Sbostic } 287