1*21356Sdist /* 2*21356Sdist * Copyright (c) 1980 Regents of the University of California. 3*21356Sdist * All rights reserved. The Berkeley software License Agreement 4*21356Sdist * specifies the terms and conditions for redistribution. 5*21356Sdist */ 6*21356Sdist 7*21356Sdist #ifndef lint 8*21356Sdist static char sccsid[] = "@(#)regex.c 5.1 (Berkeley) 05/30/85"; 9*21356Sdist #endif not lint 10*21356Sdist 111977Swnj # 121977Swnj 131977Swnj /* 141977Swnj * routines to do regular expression matching 151977Swnj * 161977Swnj * Entry points: 171977Swnj * 181977Swnj * re_comp(s) 191977Swnj * char *s; 201977Swnj * ... returns 0 if the string s was compiled successfully, 211977Swnj * a pointer to an error message otherwise. 221977Swnj * If passed 0 or a null string returns without changing 231977Swnj * the currently compiled re (see note 11 below). 241977Swnj * 251977Swnj * re_exec(s) 261977Swnj * char *s; 271977Swnj * ... returns 1 if the string s matches the last compiled regular 281977Swnj * expression, 291977Swnj * 0 if the string s failed to match the last compiled 301977Swnj * regular expression, and 311977Swnj * -1 if the compiled regular expression was invalid 321977Swnj * (indicating an internal error). 331977Swnj * 341977Swnj * The strings passed to both re_comp and re_exec may have trailing or 351977Swnj * embedded newline characters; they are terminated by nulls. 361977Swnj * 371977Swnj * The identity of the author of these routines is lost in antiquity; 381977Swnj * this is essentially the same as the re code in the original V6 ed. 391977Swnj * 401977Swnj * The regular expressions recognized are described below. This description 411977Swnj * is essentially the same as that for ed. 421977Swnj * 431977Swnj * A regular expression specifies a set of strings of characters. 441977Swnj * A member of this set of strings is said to be matched by 451977Swnj * the regular expression. In the following specification for 461977Swnj * regular expressions the word `character' means any character but NUL. 471977Swnj * 481977Swnj * 1. Any character except a special character matches itself. 491977Swnj * Special characters are the regular expression delimiter plus 501977Swnj * \ [ . and sometimes ^ * $. 511977Swnj * 2. A . matches any character. 521977Swnj * 3. A \ followed by any character except a digit or ( ) 531977Swnj * matches that character. 541977Swnj * 4. A nonempty string s bracketed [s] (or [^s]) matches any 551977Swnj * character in (or not in) s. In s, \ has no special meaning, 561977Swnj * and ] may only appear as the first letter. A substring 571977Swnj * a-b, with a and b in ascending ASCII order, stands for 581977Swnj * the inclusive range of ASCII characters. 591977Swnj * 5. A regular expression of form 1-4 followed by * matches a 601977Swnj * sequence of 0 or more matches of the regular expression. 611977Swnj * 6. A regular expression, x, of form 1-8, bracketed \(x\) 621977Swnj * matches what x matches. 631977Swnj * 7. A \ followed by a digit n matches a copy of the string that the 641977Swnj * bracketed regular expression beginning with the nth \( matched. 651977Swnj * 8. A regular expression of form 1-8, x, followed by a regular 661977Swnj * expression of form 1-7, y matches a match for x followed by 671977Swnj * a match for y, with the x match being as long as possible 681977Swnj * while still permitting a y match. 691977Swnj * 9. A regular expression of form 1-8 preceded by ^ (or followed 701977Swnj * by $), is constrained to matches that begin at the left 711977Swnj * (or end at the right) end of a line. 721977Swnj * 10. A regular expression of form 1-9 picks out the longest among 731977Swnj * the leftmost matches in a line. 741977Swnj * 11. An empty regular expression stands for a copy of the last 751977Swnj * regular expression encountered. 761977Swnj */ 771977Swnj 781977Swnj /* 791977Swnj * constants for re's 801977Swnj */ 811977Swnj #define CBRA 1 821977Swnj #define CCHR 2 831977Swnj #define CDOT 4 841977Swnj #define CCL 6 851977Swnj #define NCCL 8 861977Swnj #define CDOL 10 871977Swnj #define CEOF 11 881977Swnj #define CKET 12 891977Swnj #define CBACK 18 901977Swnj 911977Swnj #define CSTAR 01 921977Swnj 931977Swnj #define ESIZE 512 941977Swnj #define NBRA 9 951977Swnj 961977Swnj static char expbuf[ESIZE], *braslist[NBRA], *braelist[NBRA]; 971977Swnj static char circf; 981977Swnj 991977Swnj /* 1001977Swnj * compile the regular expression argument into a dfa 1011977Swnj */ 1021977Swnj char * 1031977Swnj re_comp(sp) 1041977Swnj register char *sp; 1051977Swnj { 1061977Swnj register int c; 1071977Swnj register char *ep = expbuf; 1081977Swnj int cclcnt, numbra = 0; 1091977Swnj char *lastep = 0; 1101977Swnj char bracket[NBRA]; 1111977Swnj char *bracketp = &bracket[0]; 1121977Swnj static char *retoolong = "Regular expression too long"; 1131977Swnj 1141977Swnj #define comerr(msg) {expbuf[0] = 0; numbra = 0; return(msg); } 1151977Swnj 1161977Swnj if (sp == 0 || *sp == '\0') { 1171977Swnj if (*ep == 0) 1181977Swnj return("No previous regular expression"); 1191977Swnj return(0); 1201977Swnj } 1211977Swnj if (*sp == '^') { 1221977Swnj circf = 1; 1231977Swnj sp++; 1241977Swnj } 1251977Swnj else 1261977Swnj circf = 0; 1271977Swnj for (;;) { 1281977Swnj if (ep >= &expbuf[ESIZE]) 1291977Swnj comerr(retoolong); 1301977Swnj if ((c = *sp++) == '\0') { 1311977Swnj if (bracketp != bracket) 1321977Swnj comerr("unmatched \\("); 1331977Swnj *ep++ = CEOF; 1341977Swnj *ep++ = 0; 1351977Swnj return(0); 1361977Swnj } 1371977Swnj if (c != '*') 1381977Swnj lastep = ep; 1391977Swnj switch (c) { 1401977Swnj 1411977Swnj case '.': 1421977Swnj *ep++ = CDOT; 1431977Swnj continue; 1441977Swnj 1451977Swnj case '*': 1461977Swnj if (lastep == 0 || *lastep == CBRA || *lastep == CKET) 1471977Swnj goto defchar; 1481977Swnj *lastep |= CSTAR; 1491977Swnj continue; 1501977Swnj 1511977Swnj case '$': 1521977Swnj if (*sp != '\0') 1531977Swnj goto defchar; 1541977Swnj *ep++ = CDOL; 1551977Swnj continue; 1561977Swnj 1571977Swnj case '[': 1581977Swnj *ep++ = CCL; 1591977Swnj *ep++ = 0; 1601977Swnj cclcnt = 1; 1611977Swnj if ((c = *sp++) == '^') { 1621977Swnj c = *sp++; 1631977Swnj ep[-2] = NCCL; 1641977Swnj } 1651977Swnj do { 1661977Swnj if (c == '\0') 1671977Swnj comerr("missing ]"); 1681977Swnj if (c == '-' && ep [-1] != 0) { 1691977Swnj if ((c = *sp++) == ']') { 1701977Swnj *ep++ = '-'; 1711977Swnj cclcnt++; 1721977Swnj break; 1731977Swnj } 1741977Swnj while (ep[-1] < c) { 1751977Swnj *ep = ep[-1] + 1; 1761977Swnj ep++; 1771977Swnj cclcnt++; 1781977Swnj if (ep >= &expbuf[ESIZE]) 1791977Swnj comerr(retoolong); 1801977Swnj } 1811977Swnj } 1821977Swnj *ep++ = c; 1831977Swnj cclcnt++; 1841977Swnj if (ep >= &expbuf[ESIZE]) 1851977Swnj comerr(retoolong); 1861977Swnj } while ((c = *sp++) != ']'); 1871977Swnj lastep[1] = cclcnt; 1881977Swnj continue; 1891977Swnj 1901977Swnj case '\\': 1911977Swnj if ((c = *sp++) == '(') { 1921977Swnj if (numbra >= NBRA) 1931977Swnj comerr("too many \\(\\) pairs"); 1941977Swnj *bracketp++ = numbra; 1951977Swnj *ep++ = CBRA; 1961977Swnj *ep++ = numbra++; 1971977Swnj continue; 1981977Swnj } 1991977Swnj if (c == ')') { 2001977Swnj if (bracketp <= bracket) 2011977Swnj comerr("unmatched \\)"); 2021977Swnj *ep++ = CKET; 2031977Swnj *ep++ = *--bracketp; 2041977Swnj continue; 2051977Swnj } 2061977Swnj if (c >= '1' && c < ('1' + NBRA)) { 2071977Swnj *ep++ = CBACK; 2081977Swnj *ep++ = c - '1'; 2091977Swnj continue; 2101977Swnj } 2111977Swnj *ep++ = CCHR; 2121977Swnj *ep++ = c; 2131977Swnj continue; 2141977Swnj 2151977Swnj defchar: 2161977Swnj default: 2171977Swnj *ep++ = CCHR; 2181977Swnj *ep++ = c; 2191977Swnj } 2201977Swnj } 2211977Swnj } 2221977Swnj 2231977Swnj /* 2241977Swnj * match the argument string against the compiled re 2251977Swnj */ 2261977Swnj int 2271977Swnj re_exec(p1) 2281977Swnj register char *p1; 2291977Swnj { 2301977Swnj register char *p2 = expbuf; 2311977Swnj register int c; 2321977Swnj int rv; 2331977Swnj 2341977Swnj for (c = 0; c < NBRA; c++) { 2351977Swnj braslist[c] = 0; 2361977Swnj braelist[c] = 0; 2371977Swnj } 2381977Swnj if (circf) 2391977Swnj return((advance(p1, p2))); 2401977Swnj /* 2411977Swnj * fast check for first character 2421977Swnj */ 2431977Swnj if (*p2 == CCHR) { 2441977Swnj c = p2[1]; 2451977Swnj do { 2461977Swnj if (*p1 != c) 2471977Swnj continue; 2481977Swnj if (rv = advance(p1, p2)) 2491977Swnj return(rv); 2501977Swnj } while (*p1++); 2511977Swnj return(0); 2521977Swnj } 2531977Swnj /* 2541977Swnj * regular algorithm 2551977Swnj */ 2561977Swnj do 2571977Swnj if (rv = advance(p1, p2)) 2581977Swnj return(rv); 2591977Swnj while (*p1++); 2601977Swnj return(0); 2611977Swnj } 2621977Swnj 2631977Swnj /* 2641977Swnj * try to match the next thing in the dfa 2651977Swnj */ 2661977Swnj static int 2671977Swnj advance(lp, ep) 2681977Swnj register char *lp, *ep; 2691977Swnj { 2701977Swnj register char *curlp; 2711977Swnj int ct, i; 2721977Swnj int rv; 2731977Swnj 2741977Swnj for (;;) 2751977Swnj switch (*ep++) { 2761977Swnj 2771977Swnj case CCHR: 2781977Swnj if (*ep++ == *lp++) 2791977Swnj continue; 2801977Swnj return(0); 2811977Swnj 2821977Swnj case CDOT: 2831977Swnj if (*lp++) 2841977Swnj continue; 2851977Swnj return(0); 2861977Swnj 2871977Swnj case CDOL: 2881977Swnj if (*lp == '\0') 2891977Swnj continue; 2901977Swnj return(0); 2911977Swnj 2921977Swnj case CEOF: 2931977Swnj return(1); 2941977Swnj 2951977Swnj case CCL: 2961977Swnj if (cclass(ep, *lp++, 1)) { 2971977Swnj ep += *ep; 2981977Swnj continue; 2991977Swnj } 3001977Swnj return(0); 3011977Swnj 3021977Swnj case NCCL: 3031977Swnj if (cclass(ep, *lp++, 0)) { 3041977Swnj ep += *ep; 3051977Swnj continue; 3061977Swnj } 3071977Swnj return(0); 3081977Swnj 3091977Swnj case CBRA: 3101977Swnj braslist[*ep++] = lp; 3111977Swnj continue; 3121977Swnj 3131977Swnj case CKET: 3141977Swnj braelist[*ep++] = lp; 3151977Swnj continue; 3161977Swnj 3171977Swnj case CBACK: 3181977Swnj if (braelist[i = *ep++] == 0) 3191977Swnj return(-1); 3201977Swnj if (backref(i, lp)) { 3211977Swnj lp += braelist[i] - braslist[i]; 3221977Swnj continue; 3231977Swnj } 3241977Swnj return(0); 3251977Swnj 3261977Swnj case CBACK|CSTAR: 3271977Swnj if (braelist[i = *ep++] == 0) 3281977Swnj return(-1); 3291977Swnj curlp = lp; 3301977Swnj ct = braelist[i] - braslist[i]; 3311977Swnj while (backref(i, lp)) 3321977Swnj lp += ct; 3331977Swnj while (lp >= curlp) { 3341977Swnj if (rv = advance(lp, ep)) 3351977Swnj return(rv); 3361977Swnj lp -= ct; 3371977Swnj } 3381977Swnj continue; 3391977Swnj 3401977Swnj case CDOT|CSTAR: 3411977Swnj curlp = lp; 3421977Swnj while (*lp++) 3431977Swnj ; 3441977Swnj goto star; 3451977Swnj 3461977Swnj case CCHR|CSTAR: 3471977Swnj curlp = lp; 3481977Swnj while (*lp++ == *ep) 3491977Swnj ; 3501977Swnj ep++; 3511977Swnj goto star; 3521977Swnj 3531977Swnj case CCL|CSTAR: 3541977Swnj case NCCL|CSTAR: 3551977Swnj curlp = lp; 3561977Swnj while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR))) 3571977Swnj ; 3581977Swnj ep += *ep; 3591977Swnj goto star; 3601977Swnj 3611977Swnj star: 3621977Swnj do { 3631977Swnj lp--; 3641977Swnj if (rv = advance(lp, ep)) 3651977Swnj return(rv); 3661977Swnj } while (lp > curlp); 3671977Swnj return(0); 3681977Swnj 3691977Swnj default: 3701977Swnj return(-1); 3711977Swnj } 3721977Swnj } 3731977Swnj 3741977Swnj backref(i, lp) 3751977Swnj register int i; 3761977Swnj register char *lp; 3771977Swnj { 3781977Swnj register char *bp; 3791977Swnj 3801977Swnj bp = braslist[i]; 3811977Swnj while (*bp++ == *lp++) 3821977Swnj if (bp >= braelist[i]) 3831977Swnj return(1); 3841977Swnj return(0); 3851977Swnj } 3861977Swnj 3871977Swnj int 3881977Swnj cclass(set, c, af) 3891977Swnj register char *set, c; 3901977Swnj int af; 3911977Swnj { 3921977Swnj register int n; 3931977Swnj 3941977Swnj if (c == 0) 3951977Swnj return(0); 3961977Swnj n = *set++; 3971977Swnj while (--n) 3981977Swnj if (*set++ == c) 3991977Swnj return(af); 4001977Swnj return(! af); 4011977Swnj } 402