122063Sdist /*
2*62423Sbostic * Copyright (c) 1980, 1993
3*62423Sbostic * The Regents of the University of California. All rights reserved.
436167Sbostic *
556144Selan *
656260Selan * %sccs.include.redist.c%
722063Sdist */
88659Smckusick
922063Sdist #ifndef lint
1056272Selan static char copyright[] =
11*62423Sbostic "@(#) Copyright (c) 1980, 1993\n\
12*62423Sbostic The Regents of the University of California. All rights reserved.\n";
1336167Sbostic #endif /* not lint */
1422063Sdist
1556260Selan #ifndef lint
16*62423Sbostic static char sccsid[] = "@(#)regexp.c 8.1 (Berkeley) 06/06/93";
1756260Selan #endif /* not lint */
1856260Selan
198659Smckusick #include <ctype.h>
2056144Selan #include <stdlib.h>
2156144Selan #include <string.h>
2256144Selan #include "extern.h"
238659Smckusick
248659Smckusick #define FALSE 0
2556144Selan #define TRUE !(FALSE)
268659Smckusick #define NIL 0
278659Smckusick
2856144Selan static void expconv __P((void));
298659Smckusick
3056144Selan boolean _escaped; /* true if we are currently _escaped */
3156144Selan char *_start; /* start of string */
3256144Selan boolean l_onecase; /* true if upper and lower equivalent */
3356144Selan
348659Smckusick #define makelower(c) (isupper((c)) ? tolower((c)) : (c))
358659Smckusick
368659Smckusick /* STRNCMP - like strncmp except that we convert the
378659Smckusick * first string to lower case before comparing
388659Smckusick * if l_onecase is set.
398659Smckusick */
408659Smckusick
4156144Selan int
STRNCMP(s1,s2,len)428659Smckusick STRNCMP(s1, s2, len)
438659Smckusick register char *s1,*s2;
448659Smckusick register int len;
458659Smckusick {
468659Smckusick if (l_onecase) {
478659Smckusick do
488659Smckusick if (*s2 - makelower(*s1))
498659Smckusick return (*s2 - makelower(*s1));
508659Smckusick else {
518659Smckusick s2++;
528659Smckusick s1++;
538659Smckusick }
548659Smckusick while (--len);
558659Smckusick } else {
568659Smckusick do
578659Smckusick if (*s2 - *s1)
588659Smckusick return (*s2 - *s1);
598659Smckusick else {
608659Smckusick s2++;
618659Smckusick s1++;
628659Smckusick }
638659Smckusick while (--len);
648659Smckusick }
658659Smckusick return(0);
668659Smckusick }
678659Smckusick
688659Smckusick /* The following routine converts an irregular expression to
698659Smckusick * internal format.
708659Smckusick *
718659Smckusick * Either meta symbols (\a \d or \p) or character strings or
728659Smckusick * operations ( alternation or perenthesizing ) can be
738659Smckusick * specified. Each starts with a descriptor byte. The descriptor
748659Smckusick * byte has STR set for strings, META set for meta symbols
758659Smckusick * and OPER set for operations.
768659Smckusick * The descriptor byte can also have the OPT bit set if the object
778659Smckusick * defined is optional. Also ALT can be set to indicate an alternation.
788659Smckusick *
798659Smckusick * For metasymbols the byte following the descriptor byte identities
808659Smckusick * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For
818659Smckusick * strings the byte after the descriptor is a character count for
828659Smckusick * the string:
838659Smckusick *
848659Smckusick * meta symbols := descriptor
858659Smckusick * symbol
868659Smckusick *
878659Smckusick * strings := descriptor
888659Smckusick * character count
898659Smckusick * the string
908659Smckusick *
918659Smckusick * operatins := descriptor
928659Smckusick * symbol
938659Smckusick * character count
948659Smckusick */
958659Smckusick
968659Smckusick /*
978659Smckusick * handy macros for accessing parts of match blocks
988659Smckusick */
998659Smckusick #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */
1008659Smckusick #define MNEXT(A) (A+2) /* character following a metasymbol block */
1018659Smckusick
1028659Smckusick #define OSYM(A) (*(A+1)) /* symbol in an operation block */
1038659Smckusick #define OCNT(A) (*(A+2)) /* character count */
1048659Smckusick #define ONEXT(A) (A+3) /* next character after the operation */
1058659Smckusick #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */
1068659Smckusick
1078659Smckusick #define SCNT(A) (*(A+1)) /* byte count of a string */
1088659Smckusick #define SSTR(A) (A+2) /* address of the string */
1098659Smckusick #define SNEXT(A) (A+2+*(A+1)) /* character following the string */
1108659Smckusick
1118659Smckusick /*
1128659Smckusick * bit flags in the descriptor
1138659Smckusick */
1148659Smckusick #define OPT 1
1158659Smckusick #define STR 2
1168659Smckusick #define META 4
1178659Smckusick #define ALT 8
1188659Smckusick #define OPER 16
1198659Smckusick
12056144Selan static char *ccre; /* pointer to current position in converted exp*/
12156144Selan static char *ure; /* pointer current position in unconverted exp */
1228659Smckusick
1238659Smckusick char *
convexp(re)1248659Smckusick convexp(re)
1258659Smckusick char *re; /* unconverted irregular expression */
1268659Smckusick {
1278659Smckusick register char *cre; /* pointer to converted regular expression */
1288659Smckusick
1298659Smckusick /* allocate room for the converted expression */
1308659Smckusick if (re == NIL)
1318659Smckusick return (NIL);
1328659Smckusick if (*re == '\0')
1338659Smckusick return (NIL);
1348659Smckusick cre = malloc (4 * strlen(re) + 3);
1358659Smckusick ccre = cre;
1368659Smckusick ure = re;
1378659Smckusick
1388659Smckusick /* start the conversion with a \a */
1398659Smckusick *cre = META | OPT;
1408659Smckusick MSYM(cre) = 'a';
1418659Smckusick ccre = MNEXT(cre);
1428659Smckusick
1438659Smckusick /* start the conversion (its recursive) */
1448659Smckusick expconv ();
1458659Smckusick *ccre = 0;
1468659Smckusick return (cre);
1478659Smckusick }
1488659Smckusick
14956144Selan static void
expconv()1508659Smckusick expconv()
1518659Smckusick {
1528659Smckusick register char *cs; /* pointer to current symbol in converted exp */
1538659Smckusick register char c; /* character being processed */
1548659Smckusick register char *acs; /* pinter to last alternate */
1558659Smckusick register int temp;
1568659Smckusick
1578659Smckusick /* let the conversion begin */
1588659Smckusick acs = NIL;
15917500Sralph cs = NIL;
1608659Smckusick while (*ure != NIL) {
1618659Smckusick switch (c = *ure++) {
1628659Smckusick
1638659Smckusick case '\\':
1648659Smckusick switch (c = *ure++) {
1658659Smckusick
1668659Smckusick /* escaped characters are just characters */
1678659Smckusick default:
16817500Sralph if (cs == NIL || (*cs & STR) == 0) {
1698659Smckusick cs = ccre;
1708659Smckusick *cs = STR;
1718659Smckusick SCNT(cs) = 1;
1728659Smckusick ccre += 2;
1738659Smckusick } else
1748659Smckusick SCNT(cs)++;
1758659Smckusick *ccre++ = c;
1768659Smckusick break;
1778659Smckusick
1788659Smckusick /* normal(?) metacharacters */
1798659Smckusick case 'a':
1808659Smckusick case 'd':
1818659Smckusick case 'e':
1828659Smckusick case 'p':
1838659Smckusick if (acs != NIL && acs != cs) {
1848659Smckusick do {
1858659Smckusick temp = OCNT(acs);
1868659Smckusick OCNT(acs) = ccre - acs;
1878659Smckusick acs -= temp;
1888659Smckusick } while (temp != 0);
1898659Smckusick acs = NIL;
1908659Smckusick }
1918659Smckusick cs = ccre;
1928659Smckusick *cs = META;
1938659Smckusick MSYM(cs) = c;
1948659Smckusick ccre = MNEXT(cs);
1958659Smckusick break;
1968659Smckusick }
1978659Smckusick break;
1988659Smckusick
1998659Smckusick /* just put the symbol in */
2008659Smckusick case '^':
2018659Smckusick case '$':
2028659Smckusick if (acs != NIL && acs != cs) {
2038659Smckusick do {
2048659Smckusick temp = OCNT(acs);
2058659Smckusick OCNT(acs) = ccre - acs;
2068659Smckusick acs -= temp;
2078659Smckusick } while (temp != 0);
2088659Smckusick acs = NIL;
2098659Smckusick }
2108659Smckusick cs = ccre;
2118659Smckusick *cs = META;
2128659Smckusick MSYM(cs) = c;
2138659Smckusick ccre = MNEXT(cs);
2148659Smckusick break;
2158659Smckusick
2168659Smckusick /* mark the last match sequence as optional */
2178659Smckusick case '?':
21817500Sralph if (cs)
21917500Sralph *cs = *cs | OPT;
2208659Smckusick break;
2218659Smckusick
2228659Smckusick /* recurse and define a subexpression */
2238659Smckusick case '(':
2248659Smckusick if (acs != NIL && acs != cs) {
2258659Smckusick do {
2268659Smckusick temp = OCNT(acs);
2278659Smckusick OCNT(acs) = ccre - acs;
2288659Smckusick acs -= temp;
2298659Smckusick } while (temp != 0);
2308659Smckusick acs = NIL;
2318659Smckusick }
2328659Smckusick cs = ccre;
2338659Smckusick *cs = OPER;
2348659Smckusick OSYM(cs) = '(';
2358659Smckusick ccre = ONEXT(cs);
2368659Smckusick expconv ();
2378659Smckusick OCNT(cs) = ccre - cs; /* offset to next symbol */
2388659Smckusick break;
2398659Smckusick
24056144Selan /* reurn from a recursion */
2418659Smckusick case ')':
2428659Smckusick if (acs != NIL) {
2438659Smckusick do {
2448659Smckusick temp = OCNT(acs);
2458659Smckusick OCNT(acs) = ccre - acs;
2468659Smckusick acs -= temp;
2478659Smckusick } while (temp != 0);
2488659Smckusick acs = NIL;
2498659Smckusick }
2508659Smckusick cs = ccre;
2518659Smckusick *cs = META;
2528659Smckusick MSYM(cs) = c;
2538659Smckusick ccre = MNEXT(cs);
2548659Smckusick return;
2558659Smckusick
2568659Smckusick /* mark the last match sequence as having an alternate */
2578659Smckusick /* the third byte will contain an offset to jump over the */
2588659Smckusick /* alternate match in case the first did not fail */
2598659Smckusick case '|':
2608659Smckusick if (acs != NIL && acs != cs)
2618659Smckusick OCNT(ccre) = ccre - acs; /* make a back pointer */
2628659Smckusick else
2638659Smckusick OCNT(ccre) = 0;
2648659Smckusick *cs |= ALT;
2658659Smckusick cs = ccre;
2668659Smckusick *cs = OPER;
2678659Smckusick OSYM(cs) = '|';
2688659Smckusick ccre = ONEXT(cs);
2698659Smckusick acs = cs; /* remember that the pointer is to be filles */
2708659Smckusick break;
2718659Smckusick
2728659Smckusick /* if its not a metasymbol just build a scharacter string */
2738659Smckusick default:
27417500Sralph if (cs == NIL || (*cs & STR) == 0) {
2758659Smckusick cs = ccre;
2768659Smckusick *cs = STR;
2778659Smckusick SCNT(cs) = 1;
2788659Smckusick ccre = SSTR(cs);
2798659Smckusick } else
2808659Smckusick SCNT(cs)++;
2818659Smckusick *ccre++ = c;
2828659Smckusick break;
2838659Smckusick }
2848659Smckusick }
2858659Smckusick if (acs != NIL) {
2868659Smckusick do {
2878659Smckusick temp = OCNT(acs);
2888659Smckusick OCNT(acs) = ccre - acs;
2898659Smckusick acs -= temp;
2908659Smckusick } while (temp != 0);
2918659Smckusick acs = NIL;
2928659Smckusick }
2938659Smckusick return;
2948659Smckusick }
2958659Smckusick /* end of convertre */
2968659Smckusick
2978659Smckusick
2988659Smckusick /*
2998659Smckusick * The following routine recognises an irregular expresion
3008659Smckusick * with the following special characters:
3018659Smckusick *
3028659Smckusick * \? - means last match was optional
3038659Smckusick * \a - matches any number of characters
3048659Smckusick * \d - matches any number of spaces and tabs
3058659Smckusick * \p - matches any number of alphanumeric
3068659Smckusick * characters. The
3078659Smckusick * characters matched will be copied into
3088659Smckusick * the area pointed to by 'name'.
3098659Smckusick * \| - alternation
3108659Smckusick * \( \) - grouping used mostly for alternation and
3118659Smckusick * optionality
3128659Smckusick *
3138659Smckusick * The irregular expression must be translated to internal form
3148659Smckusick * prior to calling this routine
3158659Smckusick *
3168659Smckusick * The value returned is the pointer to the first non \a
3178659Smckusick * character matched.
3188659Smckusick */
3198659Smckusick
3208659Smckusick char *
expmatch(s,re,mstring)3218659Smckusick expmatch (s, re, mstring)
3228659Smckusick register char *s; /* string to check for a match in */
3238659Smckusick register char *re; /* a converted irregular expression */
3248659Smckusick register char *mstring; /* where to put whatever matches a \p */
3258659Smckusick {
3268659Smckusick register char *cs; /* the current symbol */
3278659Smckusick register char *ptr,*s1; /* temporary pointer */
3288659Smckusick boolean matched; /* a temporary boolean */
3298659Smckusick
3308659Smckusick /* initial conditions */
3318659Smckusick if (re == NIL)
3328659Smckusick return (NIL);
3338659Smckusick cs = re;
3348659Smckusick matched = FALSE;
3358659Smckusick
3368659Smckusick /* loop till expression string is exhausted (or at least pretty tired) */
3378659Smckusick while (*cs) {
3388659Smckusick switch (*cs & (OPER | STR | META)) {
3398659Smckusick
3408659Smckusick /* try to match a string */
3418659Smckusick case STR:
3428659Smckusick matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
3438659Smckusick if (matched) {
3448659Smckusick
3458659Smckusick /* hoorah it matches */
3468659Smckusick s += SCNT(cs);
3478659Smckusick cs = SNEXT(cs);
3488659Smckusick } else if (*cs & ALT) {
3498659Smckusick
3508659Smckusick /* alternation, skip to next expression */
3518659Smckusick cs = SNEXT(cs);
3528659Smckusick } else if (*cs & OPT) {
3538659Smckusick
3548659Smckusick /* the match is optional */
3558659Smckusick cs = SNEXT(cs);
3568659Smckusick matched = 1; /* indicate a successful match */
3578659Smckusick } else {
3588659Smckusick
3598659Smckusick /* no match, error return */
3608659Smckusick return (NIL);
3618659Smckusick }
3628659Smckusick break;
3638659Smckusick
3648659Smckusick /* an operator, do something fancy */
3658659Smckusick case OPER:
3668659Smckusick switch (OSYM(cs)) {
3678659Smckusick
3688659Smckusick /* this is an alternation */
3698659Smckusick case '|':
3708659Smckusick if (matched)
3718659Smckusick
3728659Smckusick /* last thing in the alternation was a match, skip ahead */
3738659Smckusick cs = OPTR(cs);
3748659Smckusick else
3758659Smckusick
3768659Smckusick /* no match, keep trying */
3778659Smckusick cs = ONEXT(cs);
3788659Smckusick break;
3798659Smckusick
3808659Smckusick /* this is a grouping, recurse */
3818659Smckusick case '(':
3828659Smckusick ptr = expmatch (s, ONEXT(cs), mstring);
3838659Smckusick if (ptr != NIL) {
3848659Smckusick
3858659Smckusick /* the subexpression matched */
3868659Smckusick matched = 1;
3878659Smckusick s = ptr;
3888659Smckusick } else if (*cs & ALT) {
3898659Smckusick
3908659Smckusick /* alternation, skip to next expression */
3918659Smckusick matched = 0;
3928659Smckusick } else if (*cs & OPT) {
3938659Smckusick
3948659Smckusick /* the match is optional */
3958659Smckusick matched = 1; /* indicate a successful match */
3968659Smckusick } else {
3978659Smckusick
3988659Smckusick /* no match, error return */
3998659Smckusick return (NIL);
4008659Smckusick }
4018659Smckusick cs = OPTR(cs);
4028659Smckusick break;
4038659Smckusick }
4048659Smckusick break;
4058659Smckusick
4068659Smckusick /* try to match a metasymbol */
4078659Smckusick case META:
4088659Smckusick switch (MSYM(cs)) {
4098659Smckusick
4108659Smckusick /* try to match anything and remember what was matched */
4118659Smckusick case 'p':
4128659Smckusick /*
4138659Smckusick * This is really the same as trying the match the
4148659Smckusick * remaining parts of the expression to any subset
4158659Smckusick * of the string.
4168659Smckusick */
4178659Smckusick s1 = s;
4188659Smckusick do {
4198659Smckusick ptr = expmatch (s1, MNEXT(cs), mstring);
4208659Smckusick if (ptr != NIL && s1 != s) {
4218659Smckusick
4228659Smckusick /* we have a match, remember the match */
4238659Smckusick strncpy (mstring, s, s1 - s);
4248659Smckusick mstring[s1 - s] = '\0';
4258659Smckusick return (ptr);
4268659Smckusick } else if (ptr != NIL && (*cs & OPT)) {
4278659Smckusick
4288659Smckusick /* it was aoptional so no match is ok */
4298659Smckusick return (ptr);
4308659Smckusick } else if (ptr != NIL) {
4318659Smckusick
4328659Smckusick /* not optional and we still matched */
4338659Smckusick return (NIL);
4348659Smckusick }
4358659Smckusick if (!isalnum(*s1) && *s1 != '_')
4368659Smckusick return (NIL);
4378659Smckusick if (*s1 == '\\')
4388659Smckusick _escaped = _escaped ? FALSE : TRUE;
4398659Smckusick else
4408659Smckusick _escaped = FALSE;
4418659Smckusick } while (*s1++);
4428659Smckusick return (NIL);
4438659Smckusick
4448659Smckusick /* try to match anything */
4458659Smckusick case 'a':
4468659Smckusick /*
4478659Smckusick * This is really the same as trying the match the
4488659Smckusick * remaining parts of the expression to any subset
4498659Smckusick * of the string.
4508659Smckusick */
4518659Smckusick s1 = s;
4528659Smckusick do {
4538659Smckusick ptr = expmatch (s1, MNEXT(cs), mstring);
4548659Smckusick if (ptr != NIL && s1 != s) {
4558659Smckusick
4568659Smckusick /* we have a match */
4578659Smckusick return (ptr);
4588659Smckusick } else if (ptr != NIL && (*cs & OPT)) {
4598659Smckusick
4608659Smckusick /* it was aoptional so no match is ok */
4618659Smckusick return (ptr);
4628659Smckusick } else if (ptr != NIL) {
4638659Smckusick
4648659Smckusick /* not optional and we still matched */
4658659Smckusick return (NIL);
4668659Smckusick }
4678659Smckusick if (*s1 == '\\')
4688659Smckusick _escaped = _escaped ? FALSE : TRUE;
4698659Smckusick else
4708659Smckusick _escaped = FALSE;
4718659Smckusick } while (*s1++);
4728659Smckusick return (NIL);
4738659Smckusick
4748659Smckusick /* fail if we are currently _escaped */
4758659Smckusick case 'e':
4768659Smckusick if (_escaped)
4778659Smckusick return(NIL);
4788659Smckusick cs = MNEXT(cs);
4798659Smckusick break;
4808659Smckusick
4818659Smckusick /* match any number of tabs and spaces */
4828659Smckusick case 'd':
4838659Smckusick ptr = s;
4848659Smckusick while (*s == ' ' || *s == '\t')
4858659Smckusick s++;
4868659Smckusick if (s != ptr || s == _start) {
4878659Smckusick
4888659Smckusick /* match, be happy */
4898659Smckusick matched = 1;
4908659Smckusick cs = MNEXT(cs);
4918659Smckusick } else if (*s == '\n' || *s == '\0') {
4928659Smckusick
4938659Smckusick /* match, be happy */
4948659Smckusick matched = 1;
4958659Smckusick cs = MNEXT(cs);
4968659Smckusick } else if (*cs & ALT) {
4978659Smckusick
4988659Smckusick /* try the next part */
4998659Smckusick matched = 0;
5008659Smckusick cs = MNEXT(cs);
5018659Smckusick } else if (*cs & OPT) {
5028659Smckusick
5038659Smckusick /* doesn't matter */
5048659Smckusick matched = 1;
5058659Smckusick cs = MNEXT(cs);
5068659Smckusick } else
5078659Smckusick
5088659Smckusick /* no match, error return */
5098659Smckusick return (NIL);
5108659Smckusick break;
5118659Smckusick
5128659Smckusick /* check for end of line */
5138659Smckusick case '$':
5148659Smckusick if (*s == '\0' || *s == '\n') {
5158659Smckusick
5168659Smckusick /* match, be happy */
5178659Smckusick s++;
5188659Smckusick matched = 1;
5198659Smckusick cs = MNEXT(cs);
5208659Smckusick } else if (*cs & ALT) {
5218659Smckusick
5228659Smckusick /* try the next part */
5238659Smckusick matched = 0;
5248659Smckusick cs = MNEXT(cs);
5258659Smckusick } else if (*cs & OPT) {
5268659Smckusick
5278659Smckusick /* doesn't matter */
5288659Smckusick matched = 1;
5298659Smckusick cs = MNEXT(cs);
5308659Smckusick } else
5318659Smckusick
5328659Smckusick /* no match, error return */
5338659Smckusick return (NIL);
5348659Smckusick break;
5358659Smckusick
5368659Smckusick /* check for start of line */
5378659Smckusick case '^':
5388659Smckusick if (s == _start) {
5398659Smckusick
5408659Smckusick /* match, be happy */
5418659Smckusick matched = 1;
5428659Smckusick cs = MNEXT(cs);
5438659Smckusick } else if (*cs & ALT) {
5448659Smckusick
5458659Smckusick /* try the next part */
5468659Smckusick matched = 0;
5478659Smckusick cs = MNEXT(cs);
5488659Smckusick } else if (*cs & OPT) {
5498659Smckusick
5508659Smckusick /* doesn't matter */
5518659Smckusick matched = 1;
5528659Smckusick cs = MNEXT(cs);
5538659Smckusick } else
5548659Smckusick
5558659Smckusick /* no match, error return */
5568659Smckusick return (NIL);
5578659Smckusick break;
5588659Smckusick
5598659Smckusick /* end of a subexpression, return success */
5608659Smckusick case ')':
5618659Smckusick return (s);
5628659Smckusick }
5638659Smckusick break;
5648659Smckusick }
5658659Smckusick }
5668659Smckusick return (s);
5678659Smckusick }
568