122063Sdist /* 2*36167Sbostic * Copyright (c) 1980 The Regents of the University of California. 3*36167Sbostic * All rights reserved. 4*36167Sbostic * 5*36167Sbostic * Redistribution and use in source and binary forms are permitted 6*36167Sbostic * provided that the above copyright notice and this paragraph are 7*36167Sbostic * duplicated in all such forms and that any documentation, 8*36167Sbostic * advertising materials, and other materials related to such 9*36167Sbostic * distribution and use acknowledge that the software was developed 10*36167Sbostic * by the University of California, Berkeley. The name of the 11*36167Sbostic * University may not be used to endorse or promote products derived 12*36167Sbostic * from this software without specific prior written permission. 13*36167Sbostic * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR 14*36167Sbostic * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED 15*36167Sbostic * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. 1622063Sdist */ 178659Smckusick 1822063Sdist #ifndef lint 19*36167Sbostic static char sccsid[] = "@(#)regexp.c 5.2 (Berkeley) 10/25/88"; 20*36167Sbostic #endif /* not lint */ 2122063Sdist 228659Smckusick #include <ctype.h> 238659Smckusick 248659Smckusick typedef int boolean; 258659Smckusick #define TRUE 1 268659Smckusick #define FALSE 0 278659Smckusick #define NIL 0 288659Smckusick 298659Smckusick boolean l_onecase; /* true if upper and lower equivalent */ 308659Smckusick 318659Smckusick #define makelower(c) (isupper((c)) ? tolower((c)) : (c)) 328659Smckusick 338659Smckusick /* STRNCMP - like strncmp except that we convert the 348659Smckusick * first string to lower case before comparing 358659Smckusick * if l_onecase is set. 368659Smckusick */ 378659Smckusick 388659Smckusick STRNCMP(s1, s2, len) 398659Smckusick register char *s1,*s2; 408659Smckusick register int len; 418659Smckusick { 428659Smckusick if (l_onecase) { 438659Smckusick do 448659Smckusick if (*s2 - makelower(*s1)) 458659Smckusick return (*s2 - makelower(*s1)); 468659Smckusick else { 478659Smckusick s2++; 488659Smckusick s1++; 498659Smckusick } 508659Smckusick while (--len); 518659Smckusick } else { 528659Smckusick do 538659Smckusick if (*s2 - *s1) 548659Smckusick return (*s2 - *s1); 558659Smckusick else { 568659Smckusick s2++; 578659Smckusick s1++; 588659Smckusick } 598659Smckusick while (--len); 608659Smckusick } 618659Smckusick return(0); 628659Smckusick } 638659Smckusick 648659Smckusick /* The following routine converts an irregular expression to 658659Smckusick * internal format. 668659Smckusick * 678659Smckusick * Either meta symbols (\a \d or \p) or character strings or 688659Smckusick * operations ( alternation or perenthesizing ) can be 698659Smckusick * specified. Each starts with a descriptor byte. The descriptor 708659Smckusick * byte has STR set for strings, META set for meta symbols 718659Smckusick * and OPER set for operations. 728659Smckusick * The descriptor byte can also have the OPT bit set if the object 738659Smckusick * defined is optional. Also ALT can be set to indicate an alternation. 748659Smckusick * 758659Smckusick * For metasymbols the byte following the descriptor byte identities 768659Smckusick * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For 778659Smckusick * strings the byte after the descriptor is a character count for 788659Smckusick * the string: 798659Smckusick * 808659Smckusick * meta symbols := descriptor 818659Smckusick * symbol 828659Smckusick * 838659Smckusick * strings := descriptor 848659Smckusick * character count 858659Smckusick * the string 868659Smckusick * 878659Smckusick * operatins := descriptor 888659Smckusick * symbol 898659Smckusick * character count 908659Smckusick */ 918659Smckusick 928659Smckusick /* 938659Smckusick * handy macros for accessing parts of match blocks 948659Smckusick */ 958659Smckusick #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */ 968659Smckusick #define MNEXT(A) (A+2) /* character following a metasymbol block */ 978659Smckusick 988659Smckusick #define OSYM(A) (*(A+1)) /* symbol in an operation block */ 998659Smckusick #define OCNT(A) (*(A+2)) /* character count */ 1008659Smckusick #define ONEXT(A) (A+3) /* next character after the operation */ 1018659Smckusick #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */ 1028659Smckusick 1038659Smckusick #define SCNT(A) (*(A+1)) /* byte count of a string */ 1048659Smckusick #define SSTR(A) (A+2) /* address of the string */ 1058659Smckusick #define SNEXT(A) (A+2+*(A+1)) /* character following the string */ 1068659Smckusick 1078659Smckusick /* 1088659Smckusick * bit flags in the descriptor 1098659Smckusick */ 1108659Smckusick #define OPT 1 1118659Smckusick #define STR 2 1128659Smckusick #define META 4 1138659Smckusick #define ALT 8 1148659Smckusick #define OPER 16 1158659Smckusick 1168659Smckusick char *ure; /* pointer current position in unconverted exp */ 1178659Smckusick char *ccre; /* pointer to current position in converted exp*/ 1188659Smckusick char *malloc(); 1198659Smckusick 1208659Smckusick char * 1218659Smckusick convexp(re) 1228659Smckusick char *re; /* unconverted irregular expression */ 1238659Smckusick { 1248659Smckusick register char *cre; /* pointer to converted regular expression */ 1258659Smckusick 1268659Smckusick /* allocate room for the converted expression */ 1278659Smckusick if (re == NIL) 1288659Smckusick return (NIL); 1298659Smckusick if (*re == '\0') 1308659Smckusick return (NIL); 1318659Smckusick cre = malloc (4 * strlen(re) + 3); 1328659Smckusick ccre = cre; 1338659Smckusick ure = re; 1348659Smckusick 1358659Smckusick /* start the conversion with a \a */ 1368659Smckusick *cre = META | OPT; 1378659Smckusick MSYM(cre) = 'a'; 1388659Smckusick ccre = MNEXT(cre); 1398659Smckusick 1408659Smckusick /* start the conversion (its recursive) */ 1418659Smckusick expconv (); 1428659Smckusick *ccre = 0; 1438659Smckusick return (cre); 1448659Smckusick } 1458659Smckusick 1468659Smckusick expconv() 1478659Smckusick { 1488659Smckusick register char *cs; /* pointer to current symbol in converted exp */ 1498659Smckusick register char c; /* character being processed */ 1508659Smckusick register char *acs; /* pinter to last alternate */ 1518659Smckusick register int temp; 1528659Smckusick 1538659Smckusick /* let the conversion begin */ 1548659Smckusick acs = NIL; 15517500Sralph cs = NIL; 1568659Smckusick while (*ure != NIL) { 1578659Smckusick switch (c = *ure++) { 1588659Smckusick 1598659Smckusick case '\\': 1608659Smckusick switch (c = *ure++) { 1618659Smckusick 1628659Smckusick /* escaped characters are just characters */ 1638659Smckusick default: 16417500Sralph if (cs == NIL || (*cs & STR) == 0) { 1658659Smckusick cs = ccre; 1668659Smckusick *cs = STR; 1678659Smckusick SCNT(cs) = 1; 1688659Smckusick ccre += 2; 1698659Smckusick } else 1708659Smckusick SCNT(cs)++; 1718659Smckusick *ccre++ = c; 1728659Smckusick break; 1738659Smckusick 1748659Smckusick /* normal(?) metacharacters */ 1758659Smckusick case 'a': 1768659Smckusick case 'd': 1778659Smckusick case 'e': 1788659Smckusick case 'p': 1798659Smckusick if (acs != NIL && acs != cs) { 1808659Smckusick do { 1818659Smckusick temp = OCNT(acs); 1828659Smckusick OCNT(acs) = ccre - acs; 1838659Smckusick acs -= temp; 1848659Smckusick } while (temp != 0); 1858659Smckusick acs = NIL; 1868659Smckusick } 1878659Smckusick cs = ccre; 1888659Smckusick *cs = META; 1898659Smckusick MSYM(cs) = c; 1908659Smckusick ccre = MNEXT(cs); 1918659Smckusick break; 1928659Smckusick } 1938659Smckusick break; 1948659Smckusick 1958659Smckusick /* just put the symbol in */ 1968659Smckusick case '^': 1978659Smckusick case '$': 1988659Smckusick if (acs != NIL && acs != cs) { 1998659Smckusick do { 2008659Smckusick temp = OCNT(acs); 2018659Smckusick OCNT(acs) = ccre - acs; 2028659Smckusick acs -= temp; 2038659Smckusick } while (temp != 0); 2048659Smckusick acs = NIL; 2058659Smckusick } 2068659Smckusick cs = ccre; 2078659Smckusick *cs = META; 2088659Smckusick MSYM(cs) = c; 2098659Smckusick ccre = MNEXT(cs); 2108659Smckusick break; 2118659Smckusick 2128659Smckusick /* mark the last match sequence as optional */ 2138659Smckusick case '?': 21417500Sralph if (cs) 21517500Sralph *cs = *cs | OPT; 2168659Smckusick break; 2178659Smckusick 2188659Smckusick /* recurse and define a subexpression */ 2198659Smckusick case '(': 2208659Smckusick if (acs != NIL && acs != cs) { 2218659Smckusick do { 2228659Smckusick temp = OCNT(acs); 2238659Smckusick OCNT(acs) = ccre - acs; 2248659Smckusick acs -= temp; 2258659Smckusick } while (temp != 0); 2268659Smckusick acs = NIL; 2278659Smckusick } 2288659Smckusick cs = ccre; 2298659Smckusick *cs = OPER; 2308659Smckusick OSYM(cs) = '('; 2318659Smckusick ccre = ONEXT(cs); 2328659Smckusick expconv (); 2338659Smckusick OCNT(cs) = ccre - cs; /* offset to next symbol */ 2348659Smckusick break; 2358659Smckusick 2368659Smckusick /* return from a recursion */ 2378659Smckusick case ')': 2388659Smckusick if (acs != NIL) { 2398659Smckusick do { 2408659Smckusick temp = OCNT(acs); 2418659Smckusick OCNT(acs) = ccre - acs; 2428659Smckusick acs -= temp; 2438659Smckusick } while (temp != 0); 2448659Smckusick acs = NIL; 2458659Smckusick } 2468659Smckusick cs = ccre; 2478659Smckusick *cs = META; 2488659Smckusick MSYM(cs) = c; 2498659Smckusick ccre = MNEXT(cs); 2508659Smckusick return; 2518659Smckusick 2528659Smckusick /* mark the last match sequence as having an alternate */ 2538659Smckusick /* the third byte will contain an offset to jump over the */ 2548659Smckusick /* alternate match in case the first did not fail */ 2558659Smckusick case '|': 2568659Smckusick if (acs != NIL && acs != cs) 2578659Smckusick OCNT(ccre) = ccre - acs; /* make a back pointer */ 2588659Smckusick else 2598659Smckusick OCNT(ccre) = 0; 2608659Smckusick *cs |= ALT; 2618659Smckusick cs = ccre; 2628659Smckusick *cs = OPER; 2638659Smckusick OSYM(cs) = '|'; 2648659Smckusick ccre = ONEXT(cs); 2658659Smckusick acs = cs; /* remember that the pointer is to be filles */ 2668659Smckusick break; 2678659Smckusick 2688659Smckusick /* if its not a metasymbol just build a scharacter string */ 2698659Smckusick default: 27017500Sralph if (cs == NIL || (*cs & STR) == 0) { 2718659Smckusick cs = ccre; 2728659Smckusick *cs = STR; 2738659Smckusick SCNT(cs) = 1; 2748659Smckusick ccre = SSTR(cs); 2758659Smckusick } else 2768659Smckusick SCNT(cs)++; 2778659Smckusick *ccre++ = c; 2788659Smckusick break; 2798659Smckusick } 2808659Smckusick } 2818659Smckusick if (acs != NIL) { 2828659Smckusick do { 2838659Smckusick temp = OCNT(acs); 2848659Smckusick OCNT(acs) = ccre - acs; 2858659Smckusick acs -= temp; 2868659Smckusick } while (temp != 0); 2878659Smckusick acs = NIL; 2888659Smckusick } 2898659Smckusick return; 2908659Smckusick } 2918659Smckusick /* end of convertre */ 2928659Smckusick 2938659Smckusick 2948659Smckusick /* 2958659Smckusick * The following routine recognises an irregular expresion 2968659Smckusick * with the following special characters: 2978659Smckusick * 2988659Smckusick * \? - means last match was optional 2998659Smckusick * \a - matches any number of characters 3008659Smckusick * \d - matches any number of spaces and tabs 3018659Smckusick * \p - matches any number of alphanumeric 3028659Smckusick * characters. The 3038659Smckusick * characters matched will be copied into 3048659Smckusick * the area pointed to by 'name'. 3058659Smckusick * \| - alternation 3068659Smckusick * \( \) - grouping used mostly for alternation and 3078659Smckusick * optionality 3088659Smckusick * 3098659Smckusick * The irregular expression must be translated to internal form 3108659Smckusick * prior to calling this routine 3118659Smckusick * 3128659Smckusick * The value returned is the pointer to the first non \a 3138659Smckusick * character matched. 3148659Smckusick */ 3158659Smckusick 3168659Smckusick boolean _escaped; /* true if we are currently _escaped */ 3178659Smckusick char *_start; /* start of string */ 3188659Smckusick 3198659Smckusick char * 3208659Smckusick expmatch (s, re, mstring) 3218659Smckusick register char *s; /* string to check for a match in */ 3228659Smckusick register char *re; /* a converted irregular expression */ 3238659Smckusick register char *mstring; /* where to put whatever matches a \p */ 3248659Smckusick { 3258659Smckusick register char *cs; /* the current symbol */ 3268659Smckusick register char *ptr,*s1; /* temporary pointer */ 3278659Smckusick boolean matched; /* a temporary boolean */ 3288659Smckusick 3298659Smckusick /* initial conditions */ 3308659Smckusick if (re == NIL) 3318659Smckusick return (NIL); 3328659Smckusick cs = re; 3338659Smckusick matched = FALSE; 3348659Smckusick 3358659Smckusick /* loop till expression string is exhausted (or at least pretty tired) */ 3368659Smckusick while (*cs) { 3378659Smckusick switch (*cs & (OPER | STR | META)) { 3388659Smckusick 3398659Smckusick /* try to match a string */ 3408659Smckusick case STR: 3418659Smckusick matched = !STRNCMP (s, SSTR(cs), SCNT(cs)); 3428659Smckusick if (matched) { 3438659Smckusick 3448659Smckusick /* hoorah it matches */ 3458659Smckusick s += SCNT(cs); 3468659Smckusick cs = SNEXT(cs); 3478659Smckusick } else if (*cs & ALT) { 3488659Smckusick 3498659Smckusick /* alternation, skip to next expression */ 3508659Smckusick cs = SNEXT(cs); 3518659Smckusick } else if (*cs & OPT) { 3528659Smckusick 3538659Smckusick /* the match is optional */ 3548659Smckusick cs = SNEXT(cs); 3558659Smckusick matched = 1; /* indicate a successful match */ 3568659Smckusick } else { 3578659Smckusick 3588659Smckusick /* no match, error return */ 3598659Smckusick return (NIL); 3608659Smckusick } 3618659Smckusick break; 3628659Smckusick 3638659Smckusick /* an operator, do something fancy */ 3648659Smckusick case OPER: 3658659Smckusick switch (OSYM(cs)) { 3668659Smckusick 3678659Smckusick /* this is an alternation */ 3688659Smckusick case '|': 3698659Smckusick if (matched) 3708659Smckusick 3718659Smckusick /* last thing in the alternation was a match, skip ahead */ 3728659Smckusick cs = OPTR(cs); 3738659Smckusick else 3748659Smckusick 3758659Smckusick /* no match, keep trying */ 3768659Smckusick cs = ONEXT(cs); 3778659Smckusick break; 3788659Smckusick 3798659Smckusick /* this is a grouping, recurse */ 3808659Smckusick case '(': 3818659Smckusick ptr = expmatch (s, ONEXT(cs), mstring); 3828659Smckusick if (ptr != NIL) { 3838659Smckusick 3848659Smckusick /* the subexpression matched */ 3858659Smckusick matched = 1; 3868659Smckusick s = ptr; 3878659Smckusick } else if (*cs & ALT) { 3888659Smckusick 3898659Smckusick /* alternation, skip to next expression */ 3908659Smckusick matched = 0; 3918659Smckusick } else if (*cs & OPT) { 3928659Smckusick 3938659Smckusick /* the match is optional */ 3948659Smckusick matched = 1; /* indicate a successful match */ 3958659Smckusick } else { 3968659Smckusick 3978659Smckusick /* no match, error return */ 3988659Smckusick return (NIL); 3998659Smckusick } 4008659Smckusick cs = OPTR(cs); 4018659Smckusick break; 4028659Smckusick } 4038659Smckusick break; 4048659Smckusick 4058659Smckusick /* try to match a metasymbol */ 4068659Smckusick case META: 4078659Smckusick switch (MSYM(cs)) { 4088659Smckusick 4098659Smckusick /* try to match anything and remember what was matched */ 4108659Smckusick case 'p': 4118659Smckusick /* 4128659Smckusick * This is really the same as trying the match the 4138659Smckusick * remaining parts of the expression to any subset 4148659Smckusick * of the string. 4158659Smckusick */ 4168659Smckusick s1 = s; 4178659Smckusick do { 4188659Smckusick ptr = expmatch (s1, MNEXT(cs), mstring); 4198659Smckusick if (ptr != NIL && s1 != s) { 4208659Smckusick 4218659Smckusick /* we have a match, remember the match */ 4228659Smckusick strncpy (mstring, s, s1 - s); 4238659Smckusick mstring[s1 - s] = '\0'; 4248659Smckusick return (ptr); 4258659Smckusick } else if (ptr != NIL && (*cs & OPT)) { 4268659Smckusick 4278659Smckusick /* it was aoptional so no match is ok */ 4288659Smckusick return (ptr); 4298659Smckusick } else if (ptr != NIL) { 4308659Smckusick 4318659Smckusick /* not optional and we still matched */ 4328659Smckusick return (NIL); 4338659Smckusick } 4348659Smckusick if (!isalnum(*s1) && *s1 != '_') 4358659Smckusick return (NIL); 4368659Smckusick if (*s1 == '\\') 4378659Smckusick _escaped = _escaped ? FALSE : TRUE; 4388659Smckusick else 4398659Smckusick _escaped = FALSE; 4408659Smckusick } while (*s1++); 4418659Smckusick return (NIL); 4428659Smckusick 4438659Smckusick /* try to match anything */ 4448659Smckusick case 'a': 4458659Smckusick /* 4468659Smckusick * This is really the same as trying the match the 4478659Smckusick * remaining parts of the expression to any subset 4488659Smckusick * of the string. 4498659Smckusick */ 4508659Smckusick s1 = s; 4518659Smckusick do { 4528659Smckusick ptr = expmatch (s1, MNEXT(cs), mstring); 4538659Smckusick if (ptr != NIL && s1 != s) { 4548659Smckusick 4558659Smckusick /* we have a match */ 4568659Smckusick return (ptr); 4578659Smckusick } else if (ptr != NIL && (*cs & OPT)) { 4588659Smckusick 4598659Smckusick /* it was aoptional so no match is ok */ 4608659Smckusick return (ptr); 4618659Smckusick } else if (ptr != NIL) { 4628659Smckusick 4638659Smckusick /* not optional and we still matched */ 4648659Smckusick return (NIL); 4658659Smckusick } 4668659Smckusick if (*s1 == '\\') 4678659Smckusick _escaped = _escaped ? FALSE : TRUE; 4688659Smckusick else 4698659Smckusick _escaped = FALSE; 4708659Smckusick } while (*s1++); 4718659Smckusick return (NIL); 4728659Smckusick 4738659Smckusick /* fail if we are currently _escaped */ 4748659Smckusick case 'e': 4758659Smckusick if (_escaped) 4768659Smckusick return(NIL); 4778659Smckusick cs = MNEXT(cs); 4788659Smckusick break; 4798659Smckusick 4808659Smckusick /* match any number of tabs and spaces */ 4818659Smckusick case 'd': 4828659Smckusick ptr = s; 4838659Smckusick while (*s == ' ' || *s == '\t') 4848659Smckusick s++; 4858659Smckusick if (s != ptr || s == _start) { 4868659Smckusick 4878659Smckusick /* match, be happy */ 4888659Smckusick matched = 1; 4898659Smckusick cs = MNEXT(cs); 4908659Smckusick } else if (*s == '\n' || *s == '\0') { 4918659Smckusick 4928659Smckusick /* match, be happy */ 4938659Smckusick matched = 1; 4948659Smckusick cs = MNEXT(cs); 4958659Smckusick } else if (*cs & ALT) { 4968659Smckusick 4978659Smckusick /* try the next part */ 4988659Smckusick matched = 0; 4998659Smckusick cs = MNEXT(cs); 5008659Smckusick } else if (*cs & OPT) { 5018659Smckusick 5028659Smckusick /* doesn't matter */ 5038659Smckusick matched = 1; 5048659Smckusick cs = MNEXT(cs); 5058659Smckusick } else 5068659Smckusick 5078659Smckusick /* no match, error return */ 5088659Smckusick return (NIL); 5098659Smckusick break; 5108659Smckusick 5118659Smckusick /* check for end of line */ 5128659Smckusick case '$': 5138659Smckusick if (*s == '\0' || *s == '\n') { 5148659Smckusick 5158659Smckusick /* match, be happy */ 5168659Smckusick s++; 5178659Smckusick matched = 1; 5188659Smckusick cs = MNEXT(cs); 5198659Smckusick } else if (*cs & ALT) { 5208659Smckusick 5218659Smckusick /* try the next part */ 5228659Smckusick matched = 0; 5238659Smckusick cs = MNEXT(cs); 5248659Smckusick } else if (*cs & OPT) { 5258659Smckusick 5268659Smckusick /* doesn't matter */ 5278659Smckusick matched = 1; 5288659Smckusick cs = MNEXT(cs); 5298659Smckusick } else 5308659Smckusick 5318659Smckusick /* no match, error return */ 5328659Smckusick return (NIL); 5338659Smckusick break; 5348659Smckusick 5358659Smckusick /* check for start of line */ 5368659Smckusick case '^': 5378659Smckusick if (s == _start) { 5388659Smckusick 5398659Smckusick /* match, be happy */ 5408659Smckusick matched = 1; 5418659Smckusick cs = MNEXT(cs); 5428659Smckusick } else if (*cs & ALT) { 5438659Smckusick 5448659Smckusick /* try the next part */ 5458659Smckusick matched = 0; 5468659Smckusick cs = MNEXT(cs); 5478659Smckusick } else if (*cs & OPT) { 5488659Smckusick 5498659Smckusick /* doesn't matter */ 5508659Smckusick matched = 1; 5518659Smckusick cs = MNEXT(cs); 5528659Smckusick } else 5538659Smckusick 5548659Smckusick /* no match, error return */ 5558659Smckusick return (NIL); 5568659Smckusick break; 5578659Smckusick 5588659Smckusick /* end of a subexpression, return success */ 5598659Smckusick case ')': 5608659Smckusick return (s); 5618659Smckusick } 5628659Smckusick break; 5638659Smckusick } 5648659Smckusick } 5658659Smckusick return (s); 5668659Smckusick } 567