1*17500Sralph static char sccsid[] = "@(#)regexp.c 4.2 (Berkeley) 12/11/84"; 28659Smckusick 38659Smckusick #include <ctype.h> 48659Smckusick 58659Smckusick typedef int boolean; 68659Smckusick #define TRUE 1 78659Smckusick #define FALSE 0 88659Smckusick #define NIL 0 98659Smckusick 108659Smckusick boolean l_onecase; /* true if upper and lower equivalent */ 118659Smckusick 128659Smckusick #define makelower(c) (isupper((c)) ? tolower((c)) : (c)) 138659Smckusick 148659Smckusick /* STRNCMP - like strncmp except that we convert the 158659Smckusick * first string to lower case before comparing 168659Smckusick * if l_onecase is set. 178659Smckusick */ 188659Smckusick 198659Smckusick STRNCMP(s1, s2, len) 208659Smckusick register char *s1,*s2; 218659Smckusick register int len; 228659Smckusick { 238659Smckusick if (l_onecase) { 248659Smckusick do 258659Smckusick if (*s2 - makelower(*s1)) 268659Smckusick return (*s2 - makelower(*s1)); 278659Smckusick else { 288659Smckusick s2++; 298659Smckusick s1++; 308659Smckusick } 318659Smckusick while (--len); 328659Smckusick } else { 338659Smckusick do 348659Smckusick if (*s2 - *s1) 358659Smckusick return (*s2 - *s1); 368659Smckusick else { 378659Smckusick s2++; 388659Smckusick s1++; 398659Smckusick } 408659Smckusick while (--len); 418659Smckusick } 428659Smckusick return(0); 438659Smckusick } 448659Smckusick 458659Smckusick /* The following routine converts an irregular expression to 468659Smckusick * internal format. 478659Smckusick * 488659Smckusick * Either meta symbols (\a \d or \p) or character strings or 498659Smckusick * operations ( alternation or perenthesizing ) can be 508659Smckusick * specified. Each starts with a descriptor byte. The descriptor 518659Smckusick * byte has STR set for strings, META set for meta symbols 528659Smckusick * and OPER set for operations. 538659Smckusick * The descriptor byte can also have the OPT bit set if the object 548659Smckusick * defined is optional. Also ALT can be set to indicate an alternation. 558659Smckusick * 568659Smckusick * For metasymbols the byte following the descriptor byte identities 578659Smckusick * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For 588659Smckusick * strings the byte after the descriptor is a character count for 598659Smckusick * the string: 608659Smckusick * 618659Smckusick * meta symbols := descriptor 628659Smckusick * symbol 638659Smckusick * 648659Smckusick * strings := descriptor 658659Smckusick * character count 668659Smckusick * the string 678659Smckusick * 688659Smckusick * operatins := descriptor 698659Smckusick * symbol 708659Smckusick * character count 718659Smckusick */ 728659Smckusick 738659Smckusick /* 748659Smckusick * handy macros for accessing parts of match blocks 758659Smckusick */ 768659Smckusick #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */ 778659Smckusick #define MNEXT(A) (A+2) /* character following a metasymbol block */ 788659Smckusick 798659Smckusick #define OSYM(A) (*(A+1)) /* symbol in an operation block */ 808659Smckusick #define OCNT(A) (*(A+2)) /* character count */ 818659Smckusick #define ONEXT(A) (A+3) /* next character after the operation */ 828659Smckusick #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */ 838659Smckusick 848659Smckusick #define SCNT(A) (*(A+1)) /* byte count of a string */ 858659Smckusick #define SSTR(A) (A+2) /* address of the string */ 868659Smckusick #define SNEXT(A) (A+2+*(A+1)) /* character following the string */ 878659Smckusick 888659Smckusick /* 898659Smckusick * bit flags in the descriptor 908659Smckusick */ 918659Smckusick #define OPT 1 928659Smckusick #define STR 2 938659Smckusick #define META 4 948659Smckusick #define ALT 8 958659Smckusick #define OPER 16 968659Smckusick 978659Smckusick char *ure; /* pointer current position in unconverted exp */ 988659Smckusick char *ccre; /* pointer to current position in converted exp*/ 998659Smckusick char *malloc(); 1008659Smckusick 1018659Smckusick char * 1028659Smckusick convexp(re) 1038659Smckusick char *re; /* unconverted irregular expression */ 1048659Smckusick { 1058659Smckusick register char *cre; /* pointer to converted regular expression */ 1068659Smckusick 1078659Smckusick /* allocate room for the converted expression */ 1088659Smckusick if (re == NIL) 1098659Smckusick return (NIL); 1108659Smckusick if (*re == '\0') 1118659Smckusick return (NIL); 1128659Smckusick cre = malloc (4 * strlen(re) + 3); 1138659Smckusick ccre = cre; 1148659Smckusick ure = re; 1158659Smckusick 1168659Smckusick /* start the conversion with a \a */ 1178659Smckusick *cre = META | OPT; 1188659Smckusick MSYM(cre) = 'a'; 1198659Smckusick ccre = MNEXT(cre); 1208659Smckusick 1218659Smckusick /* start the conversion (its recursive) */ 1228659Smckusick expconv (); 1238659Smckusick *ccre = 0; 1248659Smckusick return (cre); 1258659Smckusick } 1268659Smckusick 1278659Smckusick expconv() 1288659Smckusick { 1298659Smckusick register char *cs; /* pointer to current symbol in converted exp */ 1308659Smckusick register char c; /* character being processed */ 1318659Smckusick register char *acs; /* pinter to last alternate */ 1328659Smckusick register int temp; 1338659Smckusick 1348659Smckusick /* let the conversion begin */ 1358659Smckusick acs = NIL; 136*17500Sralph cs = NIL; 1378659Smckusick while (*ure != NIL) { 1388659Smckusick switch (c = *ure++) { 1398659Smckusick 1408659Smckusick case '\\': 1418659Smckusick switch (c = *ure++) { 1428659Smckusick 1438659Smckusick /* escaped characters are just characters */ 1448659Smckusick default: 145*17500Sralph if (cs == NIL || (*cs & STR) == 0) { 1468659Smckusick cs = ccre; 1478659Smckusick *cs = STR; 1488659Smckusick SCNT(cs) = 1; 1498659Smckusick ccre += 2; 1508659Smckusick } else 1518659Smckusick SCNT(cs)++; 1528659Smckusick *ccre++ = c; 1538659Smckusick break; 1548659Smckusick 1558659Smckusick /* normal(?) metacharacters */ 1568659Smckusick case 'a': 1578659Smckusick case 'd': 1588659Smckusick case 'e': 1598659Smckusick case 'p': 1608659Smckusick if (acs != NIL && acs != cs) { 1618659Smckusick do { 1628659Smckusick temp = OCNT(acs); 1638659Smckusick OCNT(acs) = ccre - acs; 1648659Smckusick acs -= temp; 1658659Smckusick } while (temp != 0); 1668659Smckusick acs = NIL; 1678659Smckusick } 1688659Smckusick cs = ccre; 1698659Smckusick *cs = META; 1708659Smckusick MSYM(cs) = c; 1718659Smckusick ccre = MNEXT(cs); 1728659Smckusick break; 1738659Smckusick } 1748659Smckusick break; 1758659Smckusick 1768659Smckusick /* just put the symbol in */ 1778659Smckusick case '^': 1788659Smckusick case '$': 1798659Smckusick if (acs != NIL && acs != cs) { 1808659Smckusick do { 1818659Smckusick temp = OCNT(acs); 1828659Smckusick OCNT(acs) = ccre - acs; 1838659Smckusick acs -= temp; 1848659Smckusick } while (temp != 0); 1858659Smckusick acs = NIL; 1868659Smckusick } 1878659Smckusick cs = ccre; 1888659Smckusick *cs = META; 1898659Smckusick MSYM(cs) = c; 1908659Smckusick ccre = MNEXT(cs); 1918659Smckusick break; 1928659Smckusick 1938659Smckusick /* mark the last match sequence as optional */ 1948659Smckusick case '?': 195*17500Sralph if (cs) 196*17500Sralph *cs = *cs | OPT; 1978659Smckusick break; 1988659Smckusick 1998659Smckusick /* recurse and define a subexpression */ 2008659Smckusick case '(': 2018659Smckusick if (acs != NIL && acs != cs) { 2028659Smckusick do { 2038659Smckusick temp = OCNT(acs); 2048659Smckusick OCNT(acs) = ccre - acs; 2058659Smckusick acs -= temp; 2068659Smckusick } while (temp != 0); 2078659Smckusick acs = NIL; 2088659Smckusick } 2098659Smckusick cs = ccre; 2108659Smckusick *cs = OPER; 2118659Smckusick OSYM(cs) = '('; 2128659Smckusick ccre = ONEXT(cs); 2138659Smckusick expconv (); 2148659Smckusick OCNT(cs) = ccre - cs; /* offset to next symbol */ 2158659Smckusick break; 2168659Smckusick 2178659Smckusick /* return from a recursion */ 2188659Smckusick case ')': 2198659Smckusick if (acs != NIL) { 2208659Smckusick do { 2218659Smckusick temp = OCNT(acs); 2228659Smckusick OCNT(acs) = ccre - acs; 2238659Smckusick acs -= temp; 2248659Smckusick } while (temp != 0); 2258659Smckusick acs = NIL; 2268659Smckusick } 2278659Smckusick cs = ccre; 2288659Smckusick *cs = META; 2298659Smckusick MSYM(cs) = c; 2308659Smckusick ccre = MNEXT(cs); 2318659Smckusick return; 2328659Smckusick 2338659Smckusick /* mark the last match sequence as having an alternate */ 2348659Smckusick /* the third byte will contain an offset to jump over the */ 2358659Smckusick /* alternate match in case the first did not fail */ 2368659Smckusick case '|': 2378659Smckusick if (acs != NIL && acs != cs) 2388659Smckusick OCNT(ccre) = ccre - acs; /* make a back pointer */ 2398659Smckusick else 2408659Smckusick OCNT(ccre) = 0; 2418659Smckusick *cs |= ALT; 2428659Smckusick cs = ccre; 2438659Smckusick *cs = OPER; 2448659Smckusick OSYM(cs) = '|'; 2458659Smckusick ccre = ONEXT(cs); 2468659Smckusick acs = cs; /* remember that the pointer is to be filles */ 2478659Smckusick break; 2488659Smckusick 2498659Smckusick /* if its not a metasymbol just build a scharacter string */ 2508659Smckusick default: 251*17500Sralph if (cs == NIL || (*cs & STR) == 0) { 2528659Smckusick cs = ccre; 2538659Smckusick *cs = STR; 2548659Smckusick SCNT(cs) = 1; 2558659Smckusick ccre = SSTR(cs); 2568659Smckusick } else 2578659Smckusick SCNT(cs)++; 2588659Smckusick *ccre++ = c; 2598659Smckusick break; 2608659Smckusick } 2618659Smckusick } 2628659Smckusick if (acs != NIL) { 2638659Smckusick do { 2648659Smckusick temp = OCNT(acs); 2658659Smckusick OCNT(acs) = ccre - acs; 2668659Smckusick acs -= temp; 2678659Smckusick } while (temp != 0); 2688659Smckusick acs = NIL; 2698659Smckusick } 2708659Smckusick return; 2718659Smckusick } 2728659Smckusick /* end of convertre */ 2738659Smckusick 2748659Smckusick 2758659Smckusick /* 2768659Smckusick * The following routine recognises an irregular expresion 2778659Smckusick * with the following special characters: 2788659Smckusick * 2798659Smckusick * \? - means last match was optional 2808659Smckusick * \a - matches any number of characters 2818659Smckusick * \d - matches any number of spaces and tabs 2828659Smckusick * \p - matches any number of alphanumeric 2838659Smckusick * characters. The 2848659Smckusick * characters matched will be copied into 2858659Smckusick * the area pointed to by 'name'. 2868659Smckusick * \| - alternation 2878659Smckusick * \( \) - grouping used mostly for alternation and 2888659Smckusick * optionality 2898659Smckusick * 2908659Smckusick * The irregular expression must be translated to internal form 2918659Smckusick * prior to calling this routine 2928659Smckusick * 2938659Smckusick * The value returned is the pointer to the first non \a 2948659Smckusick * character matched. 2958659Smckusick */ 2968659Smckusick 2978659Smckusick boolean _escaped; /* true if we are currently _escaped */ 2988659Smckusick char *_start; /* start of string */ 2998659Smckusick 3008659Smckusick char * 3018659Smckusick expmatch (s, re, mstring) 3028659Smckusick register char *s; /* string to check for a match in */ 3038659Smckusick register char *re; /* a converted irregular expression */ 3048659Smckusick register char *mstring; /* where to put whatever matches a \p */ 3058659Smckusick { 3068659Smckusick register char *cs; /* the current symbol */ 3078659Smckusick register char *ptr,*s1; /* temporary pointer */ 3088659Smckusick boolean matched; /* a temporary boolean */ 3098659Smckusick 3108659Smckusick /* initial conditions */ 3118659Smckusick if (re == NIL) 3128659Smckusick return (NIL); 3138659Smckusick cs = re; 3148659Smckusick matched = FALSE; 3158659Smckusick 3168659Smckusick /* loop till expression string is exhausted (or at least pretty tired) */ 3178659Smckusick while (*cs) { 3188659Smckusick switch (*cs & (OPER | STR | META)) { 3198659Smckusick 3208659Smckusick /* try to match a string */ 3218659Smckusick case STR: 3228659Smckusick matched = !STRNCMP (s, SSTR(cs), SCNT(cs)); 3238659Smckusick if (matched) { 3248659Smckusick 3258659Smckusick /* hoorah it matches */ 3268659Smckusick s += SCNT(cs); 3278659Smckusick cs = SNEXT(cs); 3288659Smckusick } else if (*cs & ALT) { 3298659Smckusick 3308659Smckusick /* alternation, skip to next expression */ 3318659Smckusick cs = SNEXT(cs); 3328659Smckusick } else if (*cs & OPT) { 3338659Smckusick 3348659Smckusick /* the match is optional */ 3358659Smckusick cs = SNEXT(cs); 3368659Smckusick matched = 1; /* indicate a successful match */ 3378659Smckusick } else { 3388659Smckusick 3398659Smckusick /* no match, error return */ 3408659Smckusick return (NIL); 3418659Smckusick } 3428659Smckusick break; 3438659Smckusick 3448659Smckusick /* an operator, do something fancy */ 3458659Smckusick case OPER: 3468659Smckusick switch (OSYM(cs)) { 3478659Smckusick 3488659Smckusick /* this is an alternation */ 3498659Smckusick case '|': 3508659Smckusick if (matched) 3518659Smckusick 3528659Smckusick /* last thing in the alternation was a match, skip ahead */ 3538659Smckusick cs = OPTR(cs); 3548659Smckusick else 3558659Smckusick 3568659Smckusick /* no match, keep trying */ 3578659Smckusick cs = ONEXT(cs); 3588659Smckusick break; 3598659Smckusick 3608659Smckusick /* this is a grouping, recurse */ 3618659Smckusick case '(': 3628659Smckusick ptr = expmatch (s, ONEXT(cs), mstring); 3638659Smckusick if (ptr != NIL) { 3648659Smckusick 3658659Smckusick /* the subexpression matched */ 3668659Smckusick matched = 1; 3678659Smckusick s = ptr; 3688659Smckusick } else if (*cs & ALT) { 3698659Smckusick 3708659Smckusick /* alternation, skip to next expression */ 3718659Smckusick matched = 0; 3728659Smckusick } else if (*cs & OPT) { 3738659Smckusick 3748659Smckusick /* the match is optional */ 3758659Smckusick matched = 1; /* indicate a successful match */ 3768659Smckusick } else { 3778659Smckusick 3788659Smckusick /* no match, error return */ 3798659Smckusick return (NIL); 3808659Smckusick } 3818659Smckusick cs = OPTR(cs); 3828659Smckusick break; 3838659Smckusick } 3848659Smckusick break; 3858659Smckusick 3868659Smckusick /* try to match a metasymbol */ 3878659Smckusick case META: 3888659Smckusick switch (MSYM(cs)) { 3898659Smckusick 3908659Smckusick /* try to match anything and remember what was matched */ 3918659Smckusick case 'p': 3928659Smckusick /* 3938659Smckusick * This is really the same as trying the match the 3948659Smckusick * remaining parts of the expression to any subset 3958659Smckusick * of the string. 3968659Smckusick */ 3978659Smckusick s1 = s; 3988659Smckusick do { 3998659Smckusick ptr = expmatch (s1, MNEXT(cs), mstring); 4008659Smckusick if (ptr != NIL && s1 != s) { 4018659Smckusick 4028659Smckusick /* we have a match, remember the match */ 4038659Smckusick strncpy (mstring, s, s1 - s); 4048659Smckusick mstring[s1 - s] = '\0'; 4058659Smckusick return (ptr); 4068659Smckusick } else if (ptr != NIL && (*cs & OPT)) { 4078659Smckusick 4088659Smckusick /* it was aoptional so no match is ok */ 4098659Smckusick return (ptr); 4108659Smckusick } else if (ptr != NIL) { 4118659Smckusick 4128659Smckusick /* not optional and we still matched */ 4138659Smckusick return (NIL); 4148659Smckusick } 4158659Smckusick if (!isalnum(*s1) && *s1 != '_') 4168659Smckusick return (NIL); 4178659Smckusick if (*s1 == '\\') 4188659Smckusick _escaped = _escaped ? FALSE : TRUE; 4198659Smckusick else 4208659Smckusick _escaped = FALSE; 4218659Smckusick } while (*s1++); 4228659Smckusick return (NIL); 4238659Smckusick 4248659Smckusick /* try to match anything */ 4258659Smckusick case 'a': 4268659Smckusick /* 4278659Smckusick * This is really the same as trying the match the 4288659Smckusick * remaining parts of the expression to any subset 4298659Smckusick * of the string. 4308659Smckusick */ 4318659Smckusick s1 = s; 4328659Smckusick do { 4338659Smckusick ptr = expmatch (s1, MNEXT(cs), mstring); 4348659Smckusick if (ptr != NIL && s1 != s) { 4358659Smckusick 4368659Smckusick /* we have a match */ 4378659Smckusick return (ptr); 4388659Smckusick } else if (ptr != NIL && (*cs & OPT)) { 4398659Smckusick 4408659Smckusick /* it was aoptional so no match is ok */ 4418659Smckusick return (ptr); 4428659Smckusick } else if (ptr != NIL) { 4438659Smckusick 4448659Smckusick /* not optional and we still matched */ 4458659Smckusick return (NIL); 4468659Smckusick } 4478659Smckusick if (*s1 == '\\') 4488659Smckusick _escaped = _escaped ? FALSE : TRUE; 4498659Smckusick else 4508659Smckusick _escaped = FALSE; 4518659Smckusick } while (*s1++); 4528659Smckusick return (NIL); 4538659Smckusick 4548659Smckusick /* fail if we are currently _escaped */ 4558659Smckusick case 'e': 4568659Smckusick if (_escaped) 4578659Smckusick return(NIL); 4588659Smckusick cs = MNEXT(cs); 4598659Smckusick break; 4608659Smckusick 4618659Smckusick /* match any number of tabs and spaces */ 4628659Smckusick case 'd': 4638659Smckusick ptr = s; 4648659Smckusick while (*s == ' ' || *s == '\t') 4658659Smckusick s++; 4668659Smckusick if (s != ptr || s == _start) { 4678659Smckusick 4688659Smckusick /* match, be happy */ 4698659Smckusick matched = 1; 4708659Smckusick cs = MNEXT(cs); 4718659Smckusick } else if (*s == '\n' || *s == '\0') { 4728659Smckusick 4738659Smckusick /* match, be happy */ 4748659Smckusick matched = 1; 4758659Smckusick cs = MNEXT(cs); 4768659Smckusick } else if (*cs & ALT) { 4778659Smckusick 4788659Smckusick /* try the next part */ 4798659Smckusick matched = 0; 4808659Smckusick cs = MNEXT(cs); 4818659Smckusick } else if (*cs & OPT) { 4828659Smckusick 4838659Smckusick /* doesn't matter */ 4848659Smckusick matched = 1; 4858659Smckusick cs = MNEXT(cs); 4868659Smckusick } else 4878659Smckusick 4888659Smckusick /* no match, error return */ 4898659Smckusick return (NIL); 4908659Smckusick break; 4918659Smckusick 4928659Smckusick /* check for end of line */ 4938659Smckusick case '$': 4948659Smckusick if (*s == '\0' || *s == '\n') { 4958659Smckusick 4968659Smckusick /* match, be happy */ 4978659Smckusick s++; 4988659Smckusick matched = 1; 4998659Smckusick cs = MNEXT(cs); 5008659Smckusick } else if (*cs & ALT) { 5018659Smckusick 5028659Smckusick /* try the next part */ 5038659Smckusick matched = 0; 5048659Smckusick cs = MNEXT(cs); 5058659Smckusick } else if (*cs & OPT) { 5068659Smckusick 5078659Smckusick /* doesn't matter */ 5088659Smckusick matched = 1; 5098659Smckusick cs = MNEXT(cs); 5108659Smckusick } else 5118659Smckusick 5128659Smckusick /* no match, error return */ 5138659Smckusick return (NIL); 5148659Smckusick break; 5158659Smckusick 5168659Smckusick /* check for start of line */ 5178659Smckusick case '^': 5188659Smckusick if (s == _start) { 5198659Smckusick 5208659Smckusick /* match, be happy */ 5218659Smckusick matched = 1; 5228659Smckusick cs = MNEXT(cs); 5238659Smckusick } else if (*cs & ALT) { 5248659Smckusick 5258659Smckusick /* try the next part */ 5268659Smckusick matched = 0; 5278659Smckusick cs = MNEXT(cs); 5288659Smckusick } else if (*cs & OPT) { 5298659Smckusick 5308659Smckusick /* doesn't matter */ 5318659Smckusick matched = 1; 5328659Smckusick cs = MNEXT(cs); 5338659Smckusick } else 5348659Smckusick 5358659Smckusick /* no match, error return */ 5368659Smckusick return (NIL); 5378659Smckusick break; 5388659Smckusick 5398659Smckusick /* end of a subexpression, return success */ 5408659Smckusick case ')': 5418659Smckusick return (s); 5428659Smckusick } 5438659Smckusick break; 5448659Smckusick } 5458659Smckusick } 5468659Smckusick return (s); 5478659Smckusick } 548