122063Sdist /* 236167Sbostic * Copyright (c) 1980 The Regents of the University of California. 336167Sbostic * All rights reserved. 436167Sbostic * 556144Selan * 6*56260Selan * %sccs.include.redist.c% 722063Sdist */ 88659Smckusick 922063Sdist #ifndef lint 10*56260Selan char copyright[] = 11*56260Selan "@(#) Copyright (c) 1980 The Regents of the University of California.\n\ 12*56260Selan All rights reserved.\n"; 1336167Sbostic #endif /* not lint */ 1422063Sdist 15*56260Selan #ifndef lint 16*56260Selan static char sccsid[] = "@(#)regexp.c 5.5 (Berkeley) 09/15/92"; 17*56260Selan #endif /* not lint */ 18*56260Selan 198659Smckusick #include <ctype.h> 2056144Selan #include <stdlib.h> 2156144Selan #include <string.h> 2256144Selan #include "extern.h" 238659Smckusick 248659Smckusick #define FALSE 0 2556144Selan #define TRUE !(FALSE) 268659Smckusick #define NIL 0 278659Smckusick 2856144Selan static void expconv __P((void)); 298659Smckusick 3056144Selan boolean _escaped; /* true if we are currently _escaped */ 3156144Selan char *_start; /* start of string */ 3256144Selan boolean l_onecase; /* true if upper and lower equivalent */ 3356144Selan 348659Smckusick #define makelower(c) (isupper((c)) ? tolower((c)) : (c)) 358659Smckusick 368659Smckusick /* STRNCMP - like strncmp except that we convert the 378659Smckusick * first string to lower case before comparing 388659Smckusick * if l_onecase is set. 398659Smckusick */ 408659Smckusick 4156144Selan int 428659Smckusick STRNCMP(s1, s2, len) 438659Smckusick register char *s1,*s2; 448659Smckusick register int len; 458659Smckusick { 468659Smckusick if (l_onecase) { 478659Smckusick do 488659Smckusick if (*s2 - makelower(*s1)) 498659Smckusick return (*s2 - makelower(*s1)); 508659Smckusick else { 518659Smckusick s2++; 528659Smckusick s1++; 538659Smckusick } 548659Smckusick while (--len); 558659Smckusick } else { 568659Smckusick do 578659Smckusick if (*s2 - *s1) 588659Smckusick return (*s2 - *s1); 598659Smckusick else { 608659Smckusick s2++; 618659Smckusick s1++; 628659Smckusick } 638659Smckusick while (--len); 648659Smckusick } 658659Smckusick return(0); 668659Smckusick } 678659Smckusick 688659Smckusick /* The following routine converts an irregular expression to 698659Smckusick * internal format. 708659Smckusick * 718659Smckusick * Either meta symbols (\a \d or \p) or character strings or 728659Smckusick * operations ( alternation or perenthesizing ) can be 738659Smckusick * specified. Each starts with a descriptor byte. The descriptor 748659Smckusick * byte has STR set for strings, META set for meta symbols 758659Smckusick * and OPER set for operations. 768659Smckusick * The descriptor byte can also have the OPT bit set if the object 778659Smckusick * defined is optional. Also ALT can be set to indicate an alternation. 788659Smckusick * 798659Smckusick * For metasymbols the byte following the descriptor byte identities 808659Smckusick * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For 818659Smckusick * strings the byte after the descriptor is a character count for 828659Smckusick * the string: 838659Smckusick * 848659Smckusick * meta symbols := descriptor 858659Smckusick * symbol 868659Smckusick * 878659Smckusick * strings := descriptor 888659Smckusick * character count 898659Smckusick * the string 908659Smckusick * 918659Smckusick * operatins := descriptor 928659Smckusick * symbol 938659Smckusick * character count 948659Smckusick */ 958659Smckusick 968659Smckusick /* 978659Smckusick * handy macros for accessing parts of match blocks 988659Smckusick */ 998659Smckusick #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */ 1008659Smckusick #define MNEXT(A) (A+2) /* character following a metasymbol block */ 1018659Smckusick 1028659Smckusick #define OSYM(A) (*(A+1)) /* symbol in an operation block */ 1038659Smckusick #define OCNT(A) (*(A+2)) /* character count */ 1048659Smckusick #define ONEXT(A) (A+3) /* next character after the operation */ 1058659Smckusick #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */ 1068659Smckusick 1078659Smckusick #define SCNT(A) (*(A+1)) /* byte count of a string */ 1088659Smckusick #define SSTR(A) (A+2) /* address of the string */ 1098659Smckusick #define SNEXT(A) (A+2+*(A+1)) /* character following the string */ 1108659Smckusick 1118659Smckusick /* 1128659Smckusick * bit flags in the descriptor 1138659Smckusick */ 1148659Smckusick #define OPT 1 1158659Smckusick #define STR 2 1168659Smckusick #define META 4 1178659Smckusick #define ALT 8 1188659Smckusick #define OPER 16 1198659Smckusick 12056144Selan static char *ccre; /* pointer to current position in converted exp*/ 12156144Selan static char *ure; /* pointer current position in unconverted exp */ 1228659Smckusick 1238659Smckusick char * 1248659Smckusick convexp(re) 1258659Smckusick char *re; /* unconverted irregular expression */ 1268659Smckusick { 1278659Smckusick register char *cre; /* pointer to converted regular expression */ 1288659Smckusick 1298659Smckusick /* allocate room for the converted expression */ 1308659Smckusick if (re == NIL) 1318659Smckusick return (NIL); 1328659Smckusick if (*re == '\0') 1338659Smckusick return (NIL); 1348659Smckusick cre = malloc (4 * strlen(re) + 3); 1358659Smckusick ccre = cre; 1368659Smckusick ure = re; 1378659Smckusick 1388659Smckusick /* start the conversion with a \a */ 1398659Smckusick *cre = META | OPT; 1408659Smckusick MSYM(cre) = 'a'; 1418659Smckusick ccre = MNEXT(cre); 1428659Smckusick 1438659Smckusick /* start the conversion (its recursive) */ 1448659Smckusick expconv (); 1458659Smckusick *ccre = 0; 1468659Smckusick return (cre); 1478659Smckusick } 1488659Smckusick 14956144Selan static void 1508659Smckusick expconv() 1518659Smckusick { 1528659Smckusick register char *cs; /* pointer to current symbol in converted exp */ 1538659Smckusick register char c; /* character being processed */ 1548659Smckusick register char *acs; /* pinter to last alternate */ 1558659Smckusick register int temp; 1568659Smckusick 1578659Smckusick /* let the conversion begin */ 1588659Smckusick acs = NIL; 15917500Sralph cs = NIL; 1608659Smckusick while (*ure != NIL) { 1618659Smckusick switch (c = *ure++) { 1628659Smckusick 1638659Smckusick case '\\': 1648659Smckusick switch (c = *ure++) { 1658659Smckusick 1668659Smckusick /* escaped characters are just characters */ 1678659Smckusick default: 16817500Sralph if (cs == NIL || (*cs & STR) == 0) { 1698659Smckusick cs = ccre; 1708659Smckusick *cs = STR; 1718659Smckusick SCNT(cs) = 1; 1728659Smckusick ccre += 2; 1738659Smckusick } else 1748659Smckusick SCNT(cs)++; 1758659Smckusick *ccre++ = c; 1768659Smckusick break; 1778659Smckusick 1788659Smckusick /* normal(?) metacharacters */ 1798659Smckusick case 'a': 1808659Smckusick case 'd': 1818659Smckusick case 'e': 1828659Smckusick case 'p': 1838659Smckusick if (acs != NIL && acs != cs) { 1848659Smckusick do { 1858659Smckusick temp = OCNT(acs); 1868659Smckusick OCNT(acs) = ccre - acs; 1878659Smckusick acs -= temp; 1888659Smckusick } while (temp != 0); 1898659Smckusick acs = NIL; 1908659Smckusick } 1918659Smckusick cs = ccre; 1928659Smckusick *cs = META; 1938659Smckusick MSYM(cs) = c; 1948659Smckusick ccre = MNEXT(cs); 1958659Smckusick break; 1968659Smckusick } 1978659Smckusick break; 1988659Smckusick 1998659Smckusick /* just put the symbol in */ 2008659Smckusick case '^': 2018659Smckusick case '$': 2028659Smckusick if (acs != NIL && acs != cs) { 2038659Smckusick do { 2048659Smckusick temp = OCNT(acs); 2058659Smckusick OCNT(acs) = ccre - acs; 2068659Smckusick acs -= temp; 2078659Smckusick } while (temp != 0); 2088659Smckusick acs = NIL; 2098659Smckusick } 2108659Smckusick cs = ccre; 2118659Smckusick *cs = META; 2128659Smckusick MSYM(cs) = c; 2138659Smckusick ccre = MNEXT(cs); 2148659Smckusick break; 2158659Smckusick 2168659Smckusick /* mark the last match sequence as optional */ 2178659Smckusick case '?': 21817500Sralph if (cs) 21917500Sralph *cs = *cs | OPT; 2208659Smckusick break; 2218659Smckusick 2228659Smckusick /* recurse and define a subexpression */ 2238659Smckusick case '(': 2248659Smckusick if (acs != NIL && acs != cs) { 2258659Smckusick do { 2268659Smckusick temp = OCNT(acs); 2278659Smckusick OCNT(acs) = ccre - acs; 2288659Smckusick acs -= temp; 2298659Smckusick } while (temp != 0); 2308659Smckusick acs = NIL; 2318659Smckusick } 2328659Smckusick cs = ccre; 2338659Smckusick *cs = OPER; 2348659Smckusick OSYM(cs) = '('; 2358659Smckusick ccre = ONEXT(cs); 2368659Smckusick expconv (); 2378659Smckusick OCNT(cs) = ccre - cs; /* offset to next symbol */ 2388659Smckusick break; 2398659Smckusick 24056144Selan /* reurn from a recursion */ 2418659Smckusick case ')': 2428659Smckusick if (acs != NIL) { 2438659Smckusick do { 2448659Smckusick temp = OCNT(acs); 2458659Smckusick OCNT(acs) = ccre - acs; 2468659Smckusick acs -= temp; 2478659Smckusick } while (temp != 0); 2488659Smckusick acs = NIL; 2498659Smckusick } 2508659Smckusick cs = ccre; 2518659Smckusick *cs = META; 2528659Smckusick MSYM(cs) = c; 2538659Smckusick ccre = MNEXT(cs); 2548659Smckusick return; 2558659Smckusick 2568659Smckusick /* mark the last match sequence as having an alternate */ 2578659Smckusick /* the third byte will contain an offset to jump over the */ 2588659Smckusick /* alternate match in case the first did not fail */ 2598659Smckusick case '|': 2608659Smckusick if (acs != NIL && acs != cs) 2618659Smckusick OCNT(ccre) = ccre - acs; /* make a back pointer */ 2628659Smckusick else 2638659Smckusick OCNT(ccre) = 0; 2648659Smckusick *cs |= ALT; 2658659Smckusick cs = ccre; 2668659Smckusick *cs = OPER; 2678659Smckusick OSYM(cs) = '|'; 2688659Smckusick ccre = ONEXT(cs); 2698659Smckusick acs = cs; /* remember that the pointer is to be filles */ 2708659Smckusick break; 2718659Smckusick 2728659Smckusick /* if its not a metasymbol just build a scharacter string */ 2738659Smckusick default: 27417500Sralph if (cs == NIL || (*cs & STR) == 0) { 2758659Smckusick cs = ccre; 2768659Smckusick *cs = STR; 2778659Smckusick SCNT(cs) = 1; 2788659Smckusick ccre = SSTR(cs); 2798659Smckusick } else 2808659Smckusick SCNT(cs)++; 2818659Smckusick *ccre++ = c; 2828659Smckusick break; 2838659Smckusick } 2848659Smckusick } 2858659Smckusick if (acs != NIL) { 2868659Smckusick do { 2878659Smckusick temp = OCNT(acs); 2888659Smckusick OCNT(acs) = ccre - acs; 2898659Smckusick acs -= temp; 2908659Smckusick } while (temp != 0); 2918659Smckusick acs = NIL; 2928659Smckusick } 2938659Smckusick return; 2948659Smckusick } 2958659Smckusick /* end of convertre */ 2968659Smckusick 2978659Smckusick 2988659Smckusick /* 2998659Smckusick * The following routine recognises an irregular expresion 3008659Smckusick * with the following special characters: 3018659Smckusick * 3028659Smckusick * \? - means last match was optional 3038659Smckusick * \a - matches any number of characters 3048659Smckusick * \d - matches any number of spaces and tabs 3058659Smckusick * \p - matches any number of alphanumeric 3068659Smckusick * characters. The 3078659Smckusick * characters matched will be copied into 3088659Smckusick * the area pointed to by 'name'. 3098659Smckusick * \| - alternation 3108659Smckusick * \( \) - grouping used mostly for alternation and 3118659Smckusick * optionality 3128659Smckusick * 3138659Smckusick * The irregular expression must be translated to internal form 3148659Smckusick * prior to calling this routine 3158659Smckusick * 3168659Smckusick * The value returned is the pointer to the first non \a 3178659Smckusick * character matched. 3188659Smckusick */ 3198659Smckusick 3208659Smckusick char * 3218659Smckusick expmatch (s, re, mstring) 3228659Smckusick register char *s; /* string to check for a match in */ 3238659Smckusick register char *re; /* a converted irregular expression */ 3248659Smckusick register char *mstring; /* where to put whatever matches a \p */ 3258659Smckusick { 3268659Smckusick register char *cs; /* the current symbol */ 3278659Smckusick register char *ptr,*s1; /* temporary pointer */ 3288659Smckusick boolean matched; /* a temporary boolean */ 3298659Smckusick 3308659Smckusick /* initial conditions */ 3318659Smckusick if (re == NIL) 3328659Smckusick return (NIL); 3338659Smckusick cs = re; 3348659Smckusick matched = FALSE; 3358659Smckusick 3368659Smckusick /* loop till expression string is exhausted (or at least pretty tired) */ 3378659Smckusick while (*cs) { 3388659Smckusick switch (*cs & (OPER | STR | META)) { 3398659Smckusick 3408659Smckusick /* try to match a string */ 3418659Smckusick case STR: 3428659Smckusick matched = !STRNCMP (s, SSTR(cs), SCNT(cs)); 3438659Smckusick if (matched) { 3448659Smckusick 3458659Smckusick /* hoorah it matches */ 3468659Smckusick s += SCNT(cs); 3478659Smckusick cs = SNEXT(cs); 3488659Smckusick } else if (*cs & ALT) { 3498659Smckusick 3508659Smckusick /* alternation, skip to next expression */ 3518659Smckusick cs = SNEXT(cs); 3528659Smckusick } else if (*cs & OPT) { 3538659Smckusick 3548659Smckusick /* the match is optional */ 3558659Smckusick cs = SNEXT(cs); 3568659Smckusick matched = 1; /* indicate a successful match */ 3578659Smckusick } else { 3588659Smckusick 3598659Smckusick /* no match, error return */ 3608659Smckusick return (NIL); 3618659Smckusick } 3628659Smckusick break; 3638659Smckusick 3648659Smckusick /* an operator, do something fancy */ 3658659Smckusick case OPER: 3668659Smckusick switch (OSYM(cs)) { 3678659Smckusick 3688659Smckusick /* this is an alternation */ 3698659Smckusick case '|': 3708659Smckusick if (matched) 3718659Smckusick 3728659Smckusick /* last thing in the alternation was a match, skip ahead */ 3738659Smckusick cs = OPTR(cs); 3748659Smckusick else 3758659Smckusick 3768659Smckusick /* no match, keep trying */ 3778659Smckusick cs = ONEXT(cs); 3788659Smckusick break; 3798659Smckusick 3808659Smckusick /* this is a grouping, recurse */ 3818659Smckusick case '(': 3828659Smckusick ptr = expmatch (s, ONEXT(cs), mstring); 3838659Smckusick if (ptr != NIL) { 3848659Smckusick 3858659Smckusick /* the subexpression matched */ 3868659Smckusick matched = 1; 3878659Smckusick s = ptr; 3888659Smckusick } else if (*cs & ALT) { 3898659Smckusick 3908659Smckusick /* alternation, skip to next expression */ 3918659Smckusick matched = 0; 3928659Smckusick } else if (*cs & OPT) { 3938659Smckusick 3948659Smckusick /* the match is optional */ 3958659Smckusick matched = 1; /* indicate a successful match */ 3968659Smckusick } else { 3978659Smckusick 3988659Smckusick /* no match, error return */ 3998659Smckusick return (NIL); 4008659Smckusick } 4018659Smckusick cs = OPTR(cs); 4028659Smckusick break; 4038659Smckusick } 4048659Smckusick break; 4058659Smckusick 4068659Smckusick /* try to match a metasymbol */ 4078659Smckusick case META: 4088659Smckusick switch (MSYM(cs)) { 4098659Smckusick 4108659Smckusick /* try to match anything and remember what was matched */ 4118659Smckusick case 'p': 4128659Smckusick /* 4138659Smckusick * This is really the same as trying the match the 4148659Smckusick * remaining parts of the expression to any subset 4158659Smckusick * of the string. 4168659Smckusick */ 4178659Smckusick s1 = s; 4188659Smckusick do { 4198659Smckusick ptr = expmatch (s1, MNEXT(cs), mstring); 4208659Smckusick if (ptr != NIL && s1 != s) { 4218659Smckusick 4228659Smckusick /* we have a match, remember the match */ 4238659Smckusick strncpy (mstring, s, s1 - s); 4248659Smckusick mstring[s1 - s] = '\0'; 4258659Smckusick return (ptr); 4268659Smckusick } else if (ptr != NIL && (*cs & OPT)) { 4278659Smckusick 4288659Smckusick /* it was aoptional so no match is ok */ 4298659Smckusick return (ptr); 4308659Smckusick } else if (ptr != NIL) { 4318659Smckusick 4328659Smckusick /* not optional and we still matched */ 4338659Smckusick return (NIL); 4348659Smckusick } 4358659Smckusick if (!isalnum(*s1) && *s1 != '_') 4368659Smckusick return (NIL); 4378659Smckusick if (*s1 == '\\') 4388659Smckusick _escaped = _escaped ? FALSE : TRUE; 4398659Smckusick else 4408659Smckusick _escaped = FALSE; 4418659Smckusick } while (*s1++); 4428659Smckusick return (NIL); 4438659Smckusick 4448659Smckusick /* try to match anything */ 4458659Smckusick case 'a': 4468659Smckusick /* 4478659Smckusick * This is really the same as trying the match the 4488659Smckusick * remaining parts of the expression to any subset 4498659Smckusick * of the string. 4508659Smckusick */ 4518659Smckusick s1 = s; 4528659Smckusick do { 4538659Smckusick ptr = expmatch (s1, MNEXT(cs), mstring); 4548659Smckusick if (ptr != NIL && s1 != s) { 4558659Smckusick 4568659Smckusick /* we have a match */ 4578659Smckusick return (ptr); 4588659Smckusick } else if (ptr != NIL && (*cs & OPT)) { 4598659Smckusick 4608659Smckusick /* it was aoptional so no match is ok */ 4618659Smckusick return (ptr); 4628659Smckusick } else if (ptr != NIL) { 4638659Smckusick 4648659Smckusick /* not optional and we still matched */ 4658659Smckusick return (NIL); 4668659Smckusick } 4678659Smckusick if (*s1 == '\\') 4688659Smckusick _escaped = _escaped ? FALSE : TRUE; 4698659Smckusick else 4708659Smckusick _escaped = FALSE; 4718659Smckusick } while (*s1++); 4728659Smckusick return (NIL); 4738659Smckusick 4748659Smckusick /* fail if we are currently _escaped */ 4758659Smckusick case 'e': 4768659Smckusick if (_escaped) 4778659Smckusick return(NIL); 4788659Smckusick cs = MNEXT(cs); 4798659Smckusick break; 4808659Smckusick 4818659Smckusick /* match any number of tabs and spaces */ 4828659Smckusick case 'd': 4838659Smckusick ptr = s; 4848659Smckusick while (*s == ' ' || *s == '\t') 4858659Smckusick s++; 4868659Smckusick if (s != ptr || s == _start) { 4878659Smckusick 4888659Smckusick /* match, be happy */ 4898659Smckusick matched = 1; 4908659Smckusick cs = MNEXT(cs); 4918659Smckusick } else if (*s == '\n' || *s == '\0') { 4928659Smckusick 4938659Smckusick /* match, be happy */ 4948659Smckusick matched = 1; 4958659Smckusick cs = MNEXT(cs); 4968659Smckusick } else if (*cs & ALT) { 4978659Smckusick 4988659Smckusick /* try the next part */ 4998659Smckusick matched = 0; 5008659Smckusick cs = MNEXT(cs); 5018659Smckusick } else if (*cs & OPT) { 5028659Smckusick 5038659Smckusick /* doesn't matter */ 5048659Smckusick matched = 1; 5058659Smckusick cs = MNEXT(cs); 5068659Smckusick } else 5078659Smckusick 5088659Smckusick /* no match, error return */ 5098659Smckusick return (NIL); 5108659Smckusick break; 5118659Smckusick 5128659Smckusick /* check for end of line */ 5138659Smckusick case '$': 5148659Smckusick if (*s == '\0' || *s == '\n') { 5158659Smckusick 5168659Smckusick /* match, be happy */ 5178659Smckusick s++; 5188659Smckusick matched = 1; 5198659Smckusick cs = MNEXT(cs); 5208659Smckusick } else if (*cs & ALT) { 5218659Smckusick 5228659Smckusick /* try the next part */ 5238659Smckusick matched = 0; 5248659Smckusick cs = MNEXT(cs); 5258659Smckusick } else if (*cs & OPT) { 5268659Smckusick 5278659Smckusick /* doesn't matter */ 5288659Smckusick matched = 1; 5298659Smckusick cs = MNEXT(cs); 5308659Smckusick } else 5318659Smckusick 5328659Smckusick /* no match, error return */ 5338659Smckusick return (NIL); 5348659Smckusick break; 5358659Smckusick 5368659Smckusick /* check for start of line */ 5378659Smckusick case '^': 5388659Smckusick if (s == _start) { 5398659Smckusick 5408659Smckusick /* match, be happy */ 5418659Smckusick matched = 1; 5428659Smckusick cs = MNEXT(cs); 5438659Smckusick } else if (*cs & ALT) { 5448659Smckusick 5458659Smckusick /* try the next part */ 5468659Smckusick matched = 0; 5478659Smckusick cs = MNEXT(cs); 5488659Smckusick } else if (*cs & OPT) { 5498659Smckusick 5508659Smckusick /* doesn't matter */ 5518659Smckusick matched = 1; 5528659Smckusick cs = MNEXT(cs); 5538659Smckusick } else 5548659Smckusick 5558659Smckusick /* no match, error return */ 5568659Smckusick return (NIL); 5578659Smckusick break; 5588659Smckusick 5598659Smckusick /* end of a subexpression, return success */ 5608659Smckusick case ')': 5618659Smckusick return (s); 5628659Smckusick } 5638659Smckusick break; 5648659Smckusick } 5658659Smckusick } 5668659Smckusick return (s); 5678659Smckusick } 568