1*22063Sdist /* 2*22063Sdist * Copyright (c) 1980 Regents of the University of California. 3*22063Sdist * All rights reserved. The Berkeley software License Agreement 4*22063Sdist * specifies the terms and conditions for redistribution. 5*22063Sdist */ 68659Smckusick 7*22063Sdist #ifndef lint 8*22063Sdist static char sccsid[] = "@(#)regexp.c 5.1 (Berkeley) 06/05/85"; 9*22063Sdist #endif not lint 10*22063Sdist 118659Smckusick #include <ctype.h> 128659Smckusick 138659Smckusick typedef int boolean; 148659Smckusick #define TRUE 1 158659Smckusick #define FALSE 0 168659Smckusick #define NIL 0 178659Smckusick 188659Smckusick boolean l_onecase; /* true if upper and lower equivalent */ 198659Smckusick 208659Smckusick #define makelower(c) (isupper((c)) ? tolower((c)) : (c)) 218659Smckusick 228659Smckusick /* STRNCMP - like strncmp except that we convert the 238659Smckusick * first string to lower case before comparing 248659Smckusick * if l_onecase is set. 258659Smckusick */ 268659Smckusick 278659Smckusick STRNCMP(s1, s2, len) 288659Smckusick register char *s1,*s2; 298659Smckusick register int len; 308659Smckusick { 318659Smckusick if (l_onecase) { 328659Smckusick do 338659Smckusick if (*s2 - makelower(*s1)) 348659Smckusick return (*s2 - makelower(*s1)); 358659Smckusick else { 368659Smckusick s2++; 378659Smckusick s1++; 388659Smckusick } 398659Smckusick while (--len); 408659Smckusick } else { 418659Smckusick do 428659Smckusick if (*s2 - *s1) 438659Smckusick return (*s2 - *s1); 448659Smckusick else { 458659Smckusick s2++; 468659Smckusick s1++; 478659Smckusick } 488659Smckusick while (--len); 498659Smckusick } 508659Smckusick return(0); 518659Smckusick } 528659Smckusick 538659Smckusick /* The following routine converts an irregular expression to 548659Smckusick * internal format. 558659Smckusick * 568659Smckusick * Either meta symbols (\a \d or \p) or character strings or 578659Smckusick * operations ( alternation or perenthesizing ) can be 588659Smckusick * specified. Each starts with a descriptor byte. The descriptor 598659Smckusick * byte has STR set for strings, META set for meta symbols 608659Smckusick * and OPER set for operations. 618659Smckusick * The descriptor byte can also have the OPT bit set if the object 628659Smckusick * defined is optional. Also ALT can be set to indicate an alternation. 638659Smckusick * 648659Smckusick * For metasymbols the byte following the descriptor byte identities 658659Smckusick * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For 668659Smckusick * strings the byte after the descriptor is a character count for 678659Smckusick * the string: 688659Smckusick * 698659Smckusick * meta symbols := descriptor 708659Smckusick * symbol 718659Smckusick * 728659Smckusick * strings := descriptor 738659Smckusick * character count 748659Smckusick * the string 758659Smckusick * 768659Smckusick * operatins := descriptor 778659Smckusick * symbol 788659Smckusick * character count 798659Smckusick */ 808659Smckusick 818659Smckusick /* 828659Smckusick * handy macros for accessing parts of match blocks 838659Smckusick */ 848659Smckusick #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */ 858659Smckusick #define MNEXT(A) (A+2) /* character following a metasymbol block */ 868659Smckusick 878659Smckusick #define OSYM(A) (*(A+1)) /* symbol in an operation block */ 888659Smckusick #define OCNT(A) (*(A+2)) /* character count */ 898659Smckusick #define ONEXT(A) (A+3) /* next character after the operation */ 908659Smckusick #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */ 918659Smckusick 928659Smckusick #define SCNT(A) (*(A+1)) /* byte count of a string */ 938659Smckusick #define SSTR(A) (A+2) /* address of the string */ 948659Smckusick #define SNEXT(A) (A+2+*(A+1)) /* character following the string */ 958659Smckusick 968659Smckusick /* 978659Smckusick * bit flags in the descriptor 988659Smckusick */ 998659Smckusick #define OPT 1 1008659Smckusick #define STR 2 1018659Smckusick #define META 4 1028659Smckusick #define ALT 8 1038659Smckusick #define OPER 16 1048659Smckusick 1058659Smckusick char *ure; /* pointer current position in unconverted exp */ 1068659Smckusick char *ccre; /* pointer to current position in converted exp*/ 1078659Smckusick char *malloc(); 1088659Smckusick 1098659Smckusick char * 1108659Smckusick convexp(re) 1118659Smckusick char *re; /* unconverted irregular expression */ 1128659Smckusick { 1138659Smckusick register char *cre; /* pointer to converted regular expression */ 1148659Smckusick 1158659Smckusick /* allocate room for the converted expression */ 1168659Smckusick if (re == NIL) 1178659Smckusick return (NIL); 1188659Smckusick if (*re == '\0') 1198659Smckusick return (NIL); 1208659Smckusick cre = malloc (4 * strlen(re) + 3); 1218659Smckusick ccre = cre; 1228659Smckusick ure = re; 1238659Smckusick 1248659Smckusick /* start the conversion with a \a */ 1258659Smckusick *cre = META | OPT; 1268659Smckusick MSYM(cre) = 'a'; 1278659Smckusick ccre = MNEXT(cre); 1288659Smckusick 1298659Smckusick /* start the conversion (its recursive) */ 1308659Smckusick expconv (); 1318659Smckusick *ccre = 0; 1328659Smckusick return (cre); 1338659Smckusick } 1348659Smckusick 1358659Smckusick expconv() 1368659Smckusick { 1378659Smckusick register char *cs; /* pointer to current symbol in converted exp */ 1388659Smckusick register char c; /* character being processed */ 1398659Smckusick register char *acs; /* pinter to last alternate */ 1408659Smckusick register int temp; 1418659Smckusick 1428659Smckusick /* let the conversion begin */ 1438659Smckusick acs = NIL; 14417500Sralph cs = NIL; 1458659Smckusick while (*ure != NIL) { 1468659Smckusick switch (c = *ure++) { 1478659Smckusick 1488659Smckusick case '\\': 1498659Smckusick switch (c = *ure++) { 1508659Smckusick 1518659Smckusick /* escaped characters are just characters */ 1528659Smckusick default: 15317500Sralph if (cs == NIL || (*cs & STR) == 0) { 1548659Smckusick cs = ccre; 1558659Smckusick *cs = STR; 1568659Smckusick SCNT(cs) = 1; 1578659Smckusick ccre += 2; 1588659Smckusick } else 1598659Smckusick SCNT(cs)++; 1608659Smckusick *ccre++ = c; 1618659Smckusick break; 1628659Smckusick 1638659Smckusick /* normal(?) metacharacters */ 1648659Smckusick case 'a': 1658659Smckusick case 'd': 1668659Smckusick case 'e': 1678659Smckusick case 'p': 1688659Smckusick if (acs != NIL && acs != cs) { 1698659Smckusick do { 1708659Smckusick temp = OCNT(acs); 1718659Smckusick OCNT(acs) = ccre - acs; 1728659Smckusick acs -= temp; 1738659Smckusick } while (temp != 0); 1748659Smckusick acs = NIL; 1758659Smckusick } 1768659Smckusick cs = ccre; 1778659Smckusick *cs = META; 1788659Smckusick MSYM(cs) = c; 1798659Smckusick ccre = MNEXT(cs); 1808659Smckusick break; 1818659Smckusick } 1828659Smckusick break; 1838659Smckusick 1848659Smckusick /* just put the symbol in */ 1858659Smckusick case '^': 1868659Smckusick case '$': 1878659Smckusick if (acs != NIL && acs != cs) { 1888659Smckusick do { 1898659Smckusick temp = OCNT(acs); 1908659Smckusick OCNT(acs) = ccre - acs; 1918659Smckusick acs -= temp; 1928659Smckusick } while (temp != 0); 1938659Smckusick acs = NIL; 1948659Smckusick } 1958659Smckusick cs = ccre; 1968659Smckusick *cs = META; 1978659Smckusick MSYM(cs) = c; 1988659Smckusick ccre = MNEXT(cs); 1998659Smckusick break; 2008659Smckusick 2018659Smckusick /* mark the last match sequence as optional */ 2028659Smckusick case '?': 20317500Sralph if (cs) 20417500Sralph *cs = *cs | OPT; 2058659Smckusick break; 2068659Smckusick 2078659Smckusick /* recurse and define a subexpression */ 2088659Smckusick case '(': 2098659Smckusick if (acs != NIL && acs != cs) { 2108659Smckusick do { 2118659Smckusick temp = OCNT(acs); 2128659Smckusick OCNT(acs) = ccre - acs; 2138659Smckusick acs -= temp; 2148659Smckusick } while (temp != 0); 2158659Smckusick acs = NIL; 2168659Smckusick } 2178659Smckusick cs = ccre; 2188659Smckusick *cs = OPER; 2198659Smckusick OSYM(cs) = '('; 2208659Smckusick ccre = ONEXT(cs); 2218659Smckusick expconv (); 2228659Smckusick OCNT(cs) = ccre - cs; /* offset to next symbol */ 2238659Smckusick break; 2248659Smckusick 2258659Smckusick /* return from a recursion */ 2268659Smckusick case ')': 2278659Smckusick if (acs != NIL) { 2288659Smckusick do { 2298659Smckusick temp = OCNT(acs); 2308659Smckusick OCNT(acs) = ccre - acs; 2318659Smckusick acs -= temp; 2328659Smckusick } while (temp != 0); 2338659Smckusick acs = NIL; 2348659Smckusick } 2358659Smckusick cs = ccre; 2368659Smckusick *cs = META; 2378659Smckusick MSYM(cs) = c; 2388659Smckusick ccre = MNEXT(cs); 2398659Smckusick return; 2408659Smckusick 2418659Smckusick /* mark the last match sequence as having an alternate */ 2428659Smckusick /* the third byte will contain an offset to jump over the */ 2438659Smckusick /* alternate match in case the first did not fail */ 2448659Smckusick case '|': 2458659Smckusick if (acs != NIL && acs != cs) 2468659Smckusick OCNT(ccre) = ccre - acs; /* make a back pointer */ 2478659Smckusick else 2488659Smckusick OCNT(ccre) = 0; 2498659Smckusick *cs |= ALT; 2508659Smckusick cs = ccre; 2518659Smckusick *cs = OPER; 2528659Smckusick OSYM(cs) = '|'; 2538659Smckusick ccre = ONEXT(cs); 2548659Smckusick acs = cs; /* remember that the pointer is to be filles */ 2558659Smckusick break; 2568659Smckusick 2578659Smckusick /* if its not a metasymbol just build a scharacter string */ 2588659Smckusick default: 25917500Sralph if (cs == NIL || (*cs & STR) == 0) { 2608659Smckusick cs = ccre; 2618659Smckusick *cs = STR; 2628659Smckusick SCNT(cs) = 1; 2638659Smckusick ccre = SSTR(cs); 2648659Smckusick } else 2658659Smckusick SCNT(cs)++; 2668659Smckusick *ccre++ = c; 2678659Smckusick break; 2688659Smckusick } 2698659Smckusick } 2708659Smckusick if (acs != NIL) { 2718659Smckusick do { 2728659Smckusick temp = OCNT(acs); 2738659Smckusick OCNT(acs) = ccre - acs; 2748659Smckusick acs -= temp; 2758659Smckusick } while (temp != 0); 2768659Smckusick acs = NIL; 2778659Smckusick } 2788659Smckusick return; 2798659Smckusick } 2808659Smckusick /* end of convertre */ 2818659Smckusick 2828659Smckusick 2838659Smckusick /* 2848659Smckusick * The following routine recognises an irregular expresion 2858659Smckusick * with the following special characters: 2868659Smckusick * 2878659Smckusick * \? - means last match was optional 2888659Smckusick * \a - matches any number of characters 2898659Smckusick * \d - matches any number of spaces and tabs 2908659Smckusick * \p - matches any number of alphanumeric 2918659Smckusick * characters. The 2928659Smckusick * characters matched will be copied into 2938659Smckusick * the area pointed to by 'name'. 2948659Smckusick * \| - alternation 2958659Smckusick * \( \) - grouping used mostly for alternation and 2968659Smckusick * optionality 2978659Smckusick * 2988659Smckusick * The irregular expression must be translated to internal form 2998659Smckusick * prior to calling this routine 3008659Smckusick * 3018659Smckusick * The value returned is the pointer to the first non \a 3028659Smckusick * character matched. 3038659Smckusick */ 3048659Smckusick 3058659Smckusick boolean _escaped; /* true if we are currently _escaped */ 3068659Smckusick char *_start; /* start of string */ 3078659Smckusick 3088659Smckusick char * 3098659Smckusick expmatch (s, re, mstring) 3108659Smckusick register char *s; /* string to check for a match in */ 3118659Smckusick register char *re; /* a converted irregular expression */ 3128659Smckusick register char *mstring; /* where to put whatever matches a \p */ 3138659Smckusick { 3148659Smckusick register char *cs; /* the current symbol */ 3158659Smckusick register char *ptr,*s1; /* temporary pointer */ 3168659Smckusick boolean matched; /* a temporary boolean */ 3178659Smckusick 3188659Smckusick /* initial conditions */ 3198659Smckusick if (re == NIL) 3208659Smckusick return (NIL); 3218659Smckusick cs = re; 3228659Smckusick matched = FALSE; 3238659Smckusick 3248659Smckusick /* loop till expression string is exhausted (or at least pretty tired) */ 3258659Smckusick while (*cs) { 3268659Smckusick switch (*cs & (OPER | STR | META)) { 3278659Smckusick 3288659Smckusick /* try to match a string */ 3298659Smckusick case STR: 3308659Smckusick matched = !STRNCMP (s, SSTR(cs), SCNT(cs)); 3318659Smckusick if (matched) { 3328659Smckusick 3338659Smckusick /* hoorah it matches */ 3348659Smckusick s += SCNT(cs); 3358659Smckusick cs = SNEXT(cs); 3368659Smckusick } else if (*cs & ALT) { 3378659Smckusick 3388659Smckusick /* alternation, skip to next expression */ 3398659Smckusick cs = SNEXT(cs); 3408659Smckusick } else if (*cs & OPT) { 3418659Smckusick 3428659Smckusick /* the match is optional */ 3438659Smckusick cs = SNEXT(cs); 3448659Smckusick matched = 1; /* indicate a successful match */ 3458659Smckusick } else { 3468659Smckusick 3478659Smckusick /* no match, error return */ 3488659Smckusick return (NIL); 3498659Smckusick } 3508659Smckusick break; 3518659Smckusick 3528659Smckusick /* an operator, do something fancy */ 3538659Smckusick case OPER: 3548659Smckusick switch (OSYM(cs)) { 3558659Smckusick 3568659Smckusick /* this is an alternation */ 3578659Smckusick case '|': 3588659Smckusick if (matched) 3598659Smckusick 3608659Smckusick /* last thing in the alternation was a match, skip ahead */ 3618659Smckusick cs = OPTR(cs); 3628659Smckusick else 3638659Smckusick 3648659Smckusick /* no match, keep trying */ 3658659Smckusick cs = ONEXT(cs); 3668659Smckusick break; 3678659Smckusick 3688659Smckusick /* this is a grouping, recurse */ 3698659Smckusick case '(': 3708659Smckusick ptr = expmatch (s, ONEXT(cs), mstring); 3718659Smckusick if (ptr != NIL) { 3728659Smckusick 3738659Smckusick /* the subexpression matched */ 3748659Smckusick matched = 1; 3758659Smckusick s = ptr; 3768659Smckusick } else if (*cs & ALT) { 3778659Smckusick 3788659Smckusick /* alternation, skip to next expression */ 3798659Smckusick matched = 0; 3808659Smckusick } else if (*cs & OPT) { 3818659Smckusick 3828659Smckusick /* the match is optional */ 3838659Smckusick matched = 1; /* indicate a successful match */ 3848659Smckusick } else { 3858659Smckusick 3868659Smckusick /* no match, error return */ 3878659Smckusick return (NIL); 3888659Smckusick } 3898659Smckusick cs = OPTR(cs); 3908659Smckusick break; 3918659Smckusick } 3928659Smckusick break; 3938659Smckusick 3948659Smckusick /* try to match a metasymbol */ 3958659Smckusick case META: 3968659Smckusick switch (MSYM(cs)) { 3978659Smckusick 3988659Smckusick /* try to match anything and remember what was matched */ 3998659Smckusick case 'p': 4008659Smckusick /* 4018659Smckusick * This is really the same as trying the match the 4028659Smckusick * remaining parts of the expression to any subset 4038659Smckusick * of the string. 4048659Smckusick */ 4058659Smckusick s1 = s; 4068659Smckusick do { 4078659Smckusick ptr = expmatch (s1, MNEXT(cs), mstring); 4088659Smckusick if (ptr != NIL && s1 != s) { 4098659Smckusick 4108659Smckusick /* we have a match, remember the match */ 4118659Smckusick strncpy (mstring, s, s1 - s); 4128659Smckusick mstring[s1 - s] = '\0'; 4138659Smckusick return (ptr); 4148659Smckusick } else if (ptr != NIL && (*cs & OPT)) { 4158659Smckusick 4168659Smckusick /* it was aoptional so no match is ok */ 4178659Smckusick return (ptr); 4188659Smckusick } else if (ptr != NIL) { 4198659Smckusick 4208659Smckusick /* not optional and we still matched */ 4218659Smckusick return (NIL); 4228659Smckusick } 4238659Smckusick if (!isalnum(*s1) && *s1 != '_') 4248659Smckusick return (NIL); 4258659Smckusick if (*s1 == '\\') 4268659Smckusick _escaped = _escaped ? FALSE : TRUE; 4278659Smckusick else 4288659Smckusick _escaped = FALSE; 4298659Smckusick } while (*s1++); 4308659Smckusick return (NIL); 4318659Smckusick 4328659Smckusick /* try to match anything */ 4338659Smckusick case 'a': 4348659Smckusick /* 4358659Smckusick * This is really the same as trying the match the 4368659Smckusick * remaining parts of the expression to any subset 4378659Smckusick * of the string. 4388659Smckusick */ 4398659Smckusick s1 = s; 4408659Smckusick do { 4418659Smckusick ptr = expmatch (s1, MNEXT(cs), mstring); 4428659Smckusick if (ptr != NIL && s1 != s) { 4438659Smckusick 4448659Smckusick /* we have a match */ 4458659Smckusick return (ptr); 4468659Smckusick } else if (ptr != NIL && (*cs & OPT)) { 4478659Smckusick 4488659Smckusick /* it was aoptional so no match is ok */ 4498659Smckusick return (ptr); 4508659Smckusick } else if (ptr != NIL) { 4518659Smckusick 4528659Smckusick /* not optional and we still matched */ 4538659Smckusick return (NIL); 4548659Smckusick } 4558659Smckusick if (*s1 == '\\') 4568659Smckusick _escaped = _escaped ? FALSE : TRUE; 4578659Smckusick else 4588659Smckusick _escaped = FALSE; 4598659Smckusick } while (*s1++); 4608659Smckusick return (NIL); 4618659Smckusick 4628659Smckusick /* fail if we are currently _escaped */ 4638659Smckusick case 'e': 4648659Smckusick if (_escaped) 4658659Smckusick return(NIL); 4668659Smckusick cs = MNEXT(cs); 4678659Smckusick break; 4688659Smckusick 4698659Smckusick /* match any number of tabs and spaces */ 4708659Smckusick case 'd': 4718659Smckusick ptr = s; 4728659Smckusick while (*s == ' ' || *s == '\t') 4738659Smckusick s++; 4748659Smckusick if (s != ptr || s == _start) { 4758659Smckusick 4768659Smckusick /* match, be happy */ 4778659Smckusick matched = 1; 4788659Smckusick cs = MNEXT(cs); 4798659Smckusick } else if (*s == '\n' || *s == '\0') { 4808659Smckusick 4818659Smckusick /* match, be happy */ 4828659Smckusick matched = 1; 4838659Smckusick cs = MNEXT(cs); 4848659Smckusick } else if (*cs & ALT) { 4858659Smckusick 4868659Smckusick /* try the next part */ 4878659Smckusick matched = 0; 4888659Smckusick cs = MNEXT(cs); 4898659Smckusick } else if (*cs & OPT) { 4908659Smckusick 4918659Smckusick /* doesn't matter */ 4928659Smckusick matched = 1; 4938659Smckusick cs = MNEXT(cs); 4948659Smckusick } else 4958659Smckusick 4968659Smckusick /* no match, error return */ 4978659Smckusick return (NIL); 4988659Smckusick break; 4998659Smckusick 5008659Smckusick /* check for end of line */ 5018659Smckusick case '$': 5028659Smckusick if (*s == '\0' || *s == '\n') { 5038659Smckusick 5048659Smckusick /* match, be happy */ 5058659Smckusick s++; 5068659Smckusick matched = 1; 5078659Smckusick cs = MNEXT(cs); 5088659Smckusick } else if (*cs & ALT) { 5098659Smckusick 5108659Smckusick /* try the next part */ 5118659Smckusick matched = 0; 5128659Smckusick cs = MNEXT(cs); 5138659Smckusick } else if (*cs & OPT) { 5148659Smckusick 5158659Smckusick /* doesn't matter */ 5168659Smckusick matched = 1; 5178659Smckusick cs = MNEXT(cs); 5188659Smckusick } else 5198659Smckusick 5208659Smckusick /* no match, error return */ 5218659Smckusick return (NIL); 5228659Smckusick break; 5238659Smckusick 5248659Smckusick /* check for start of line */ 5258659Smckusick case '^': 5268659Smckusick if (s == _start) { 5278659Smckusick 5288659Smckusick /* match, be happy */ 5298659Smckusick matched = 1; 5308659Smckusick cs = MNEXT(cs); 5318659Smckusick } else if (*cs & ALT) { 5328659Smckusick 5338659Smckusick /* try the next part */ 5348659Smckusick matched = 0; 5358659Smckusick cs = MNEXT(cs); 5368659Smckusick } else if (*cs & OPT) { 5378659Smckusick 5388659Smckusick /* doesn't matter */ 5398659Smckusick matched = 1; 5408659Smckusick cs = MNEXT(cs); 5418659Smckusick } else 5428659Smckusick 5438659Smckusick /* no match, error return */ 5448659Smckusick return (NIL); 5458659Smckusick break; 5468659Smckusick 5478659Smckusick /* end of a subexpression, return success */ 5488659Smckusick case ')': 5498659Smckusick return (s); 5508659Smckusick } 5518659Smckusick break; 5528659Smckusick } 5538659Smckusick } 5548659Smckusick return (s); 5558659Smckusick } 556