1*48399Sbostic /*- 2*48399Sbostic * Copyright (c) 1980 The Regents of the University of California. 3*48399Sbostic * All rights reserved. 4*48399Sbostic * 5*48399Sbostic * %sccs.include.proprietary.c% 621356Sdist */ 721356Sdist 826580Sdonn #if defined(LIBC_SCCS) && !defined(lint) 9*48399Sbostic static char sccsid[] = "@(#)regex.c 5.5 (Berkeley) 04/20/91"; 10*48399Sbostic #endif /* LIBC_SCCS and not lint */ 1121356Sdist 1246597Sdonn #include <unistd.h> 131977Swnj 1446597Sdonn static int advance(); 1546597Sdonn static int backref(); 1646597Sdonn static int cclass(); 1746597Sdonn 181977Swnj /* 191977Swnj * routines to do regular expression matching 201977Swnj * 211977Swnj * Entry points: 221977Swnj * 231977Swnj * re_comp(s) 241977Swnj * char *s; 251977Swnj * ... returns 0 if the string s was compiled successfully, 261977Swnj * a pointer to an error message otherwise. 271977Swnj * If passed 0 or a null string returns without changing 281977Swnj * the currently compiled re (see note 11 below). 291977Swnj * 301977Swnj * re_exec(s) 311977Swnj * char *s; 321977Swnj * ... returns 1 if the string s matches the last compiled regular 331977Swnj * expression, 341977Swnj * 0 if the string s failed to match the last compiled 351977Swnj * regular expression, and 361977Swnj * -1 if the compiled regular expression was invalid 371977Swnj * (indicating an internal error). 381977Swnj * 391977Swnj * The strings passed to both re_comp and re_exec may have trailing or 401977Swnj * embedded newline characters; they are terminated by nulls. 411977Swnj * 421977Swnj * The identity of the author of these routines is lost in antiquity; 431977Swnj * this is essentially the same as the re code in the original V6 ed. 441977Swnj * 451977Swnj * The regular expressions recognized are described below. This description 461977Swnj * is essentially the same as that for ed. 471977Swnj * 481977Swnj * A regular expression specifies a set of strings of characters. 491977Swnj * A member of this set of strings is said to be matched by 501977Swnj * the regular expression. In the following specification for 511977Swnj * regular expressions the word `character' means any character but NUL. 521977Swnj * 531977Swnj * 1. Any character except a special character matches itself. 541977Swnj * Special characters are the regular expression delimiter plus 551977Swnj * \ [ . and sometimes ^ * $. 561977Swnj * 2. A . matches any character. 571977Swnj * 3. A \ followed by any character except a digit or ( ) 581977Swnj * matches that character. 591977Swnj * 4. A nonempty string s bracketed [s] (or [^s]) matches any 601977Swnj * character in (or not in) s. In s, \ has no special meaning, 611977Swnj * and ] may only appear as the first letter. A substring 621977Swnj * a-b, with a and b in ascending ASCII order, stands for 631977Swnj * the inclusive range of ASCII characters. 641977Swnj * 5. A regular expression of form 1-4 followed by * matches a 651977Swnj * sequence of 0 or more matches of the regular expression. 661977Swnj * 6. A regular expression, x, of form 1-8, bracketed \(x\) 671977Swnj * matches what x matches. 681977Swnj * 7. A \ followed by a digit n matches a copy of the string that the 691977Swnj * bracketed regular expression beginning with the nth \( matched. 701977Swnj * 8. A regular expression of form 1-8, x, followed by a regular 711977Swnj * expression of form 1-7, y matches a match for x followed by 721977Swnj * a match for y, with the x match being as long as possible 731977Swnj * while still permitting a y match. 741977Swnj * 9. A regular expression of form 1-8 preceded by ^ (or followed 751977Swnj * by $), is constrained to matches that begin at the left 761977Swnj * (or end at the right) end of a line. 771977Swnj * 10. A regular expression of form 1-9 picks out the longest among 781977Swnj * the leftmost matches in a line. 791977Swnj * 11. An empty regular expression stands for a copy of the last 801977Swnj * regular expression encountered. 811977Swnj */ 821977Swnj 831977Swnj /* 841977Swnj * constants for re's 851977Swnj */ 861977Swnj #define CBRA 1 871977Swnj #define CCHR 2 881977Swnj #define CDOT 4 891977Swnj #define CCL 6 901977Swnj #define NCCL 8 911977Swnj #define CDOL 10 921977Swnj #define CEOF 11 931977Swnj #define CKET 12 941977Swnj #define CBACK 18 951977Swnj 961977Swnj #define CSTAR 01 971977Swnj 981977Swnj #define ESIZE 512 991977Swnj #define NBRA 9 1001977Swnj 1011977Swnj static char expbuf[ESIZE], *braslist[NBRA], *braelist[NBRA]; 1021977Swnj static char circf; 1031977Swnj 1041977Swnj /* 1051977Swnj * compile the regular expression argument into a dfa 1061977Swnj */ 1071977Swnj char * 1081977Swnj re_comp(sp) 10946597Sdonn register const char *sp; 1101977Swnj { 1111977Swnj register int c; 1121977Swnj register char *ep = expbuf; 1131977Swnj int cclcnt, numbra = 0; 1141977Swnj char *lastep = 0; 1151977Swnj char bracket[NBRA]; 1161977Swnj char *bracketp = &bracket[0]; 1171977Swnj static char *retoolong = "Regular expression too long"; 1181977Swnj 1191977Swnj #define comerr(msg) {expbuf[0] = 0; numbra = 0; return(msg); } 1201977Swnj 1211977Swnj if (sp == 0 || *sp == '\0') { 1221977Swnj if (*ep == 0) 1231977Swnj return("No previous regular expression"); 1241977Swnj return(0); 1251977Swnj } 1261977Swnj if (*sp == '^') { 1271977Swnj circf = 1; 1281977Swnj sp++; 1291977Swnj } 1301977Swnj else 1311977Swnj circf = 0; 1321977Swnj for (;;) { 1331977Swnj if (ep >= &expbuf[ESIZE]) 1341977Swnj comerr(retoolong); 1351977Swnj if ((c = *sp++) == '\0') { 1361977Swnj if (bracketp != bracket) 1371977Swnj comerr("unmatched \\("); 1381977Swnj *ep++ = CEOF; 1391977Swnj *ep++ = 0; 1401977Swnj return(0); 1411977Swnj } 1421977Swnj if (c != '*') 1431977Swnj lastep = ep; 1441977Swnj switch (c) { 1451977Swnj 1461977Swnj case '.': 1471977Swnj *ep++ = CDOT; 1481977Swnj continue; 1491977Swnj 1501977Swnj case '*': 1511977Swnj if (lastep == 0 || *lastep == CBRA || *lastep == CKET) 1521977Swnj goto defchar; 1531977Swnj *lastep |= CSTAR; 1541977Swnj continue; 1551977Swnj 1561977Swnj case '$': 1571977Swnj if (*sp != '\0') 1581977Swnj goto defchar; 1591977Swnj *ep++ = CDOL; 1601977Swnj continue; 1611977Swnj 1621977Swnj case '[': 1631977Swnj *ep++ = CCL; 1641977Swnj *ep++ = 0; 1651977Swnj cclcnt = 1; 1661977Swnj if ((c = *sp++) == '^') { 1671977Swnj c = *sp++; 1681977Swnj ep[-2] = NCCL; 1691977Swnj } 1701977Swnj do { 1711977Swnj if (c == '\0') 1721977Swnj comerr("missing ]"); 1731977Swnj if (c == '-' && ep [-1] != 0) { 1741977Swnj if ((c = *sp++) == ']') { 1751977Swnj *ep++ = '-'; 1761977Swnj cclcnt++; 1771977Swnj break; 1781977Swnj } 1791977Swnj while (ep[-1] < c) { 1801977Swnj *ep = ep[-1] + 1; 1811977Swnj ep++; 1821977Swnj cclcnt++; 1831977Swnj if (ep >= &expbuf[ESIZE]) 1841977Swnj comerr(retoolong); 1851977Swnj } 1861977Swnj } 1871977Swnj *ep++ = c; 1881977Swnj cclcnt++; 1891977Swnj if (ep >= &expbuf[ESIZE]) 1901977Swnj comerr(retoolong); 1911977Swnj } while ((c = *sp++) != ']'); 1921977Swnj lastep[1] = cclcnt; 1931977Swnj continue; 1941977Swnj 1951977Swnj case '\\': 1961977Swnj if ((c = *sp++) == '(') { 1971977Swnj if (numbra >= NBRA) 1981977Swnj comerr("too many \\(\\) pairs"); 1991977Swnj *bracketp++ = numbra; 2001977Swnj *ep++ = CBRA; 2011977Swnj *ep++ = numbra++; 2021977Swnj continue; 2031977Swnj } 2041977Swnj if (c == ')') { 2051977Swnj if (bracketp <= bracket) 2061977Swnj comerr("unmatched \\)"); 2071977Swnj *ep++ = CKET; 2081977Swnj *ep++ = *--bracketp; 2091977Swnj continue; 2101977Swnj } 2111977Swnj if (c >= '1' && c < ('1' + NBRA)) { 2121977Swnj *ep++ = CBACK; 2131977Swnj *ep++ = c - '1'; 2141977Swnj continue; 2151977Swnj } 2161977Swnj *ep++ = CCHR; 2171977Swnj *ep++ = c; 2181977Swnj continue; 2191977Swnj 2201977Swnj defchar: 2211977Swnj default: 2221977Swnj *ep++ = CCHR; 2231977Swnj *ep++ = c; 2241977Swnj } 2251977Swnj } 2261977Swnj } 2271977Swnj 2281977Swnj /* 2291977Swnj * match the argument string against the compiled re 2301977Swnj */ 2311977Swnj int 2321977Swnj re_exec(p1) 23346597Sdonn register const char *p1; 2341977Swnj { 2351977Swnj register char *p2 = expbuf; 2361977Swnj register int c; 2371977Swnj int rv; 2381977Swnj 2391977Swnj for (c = 0; c < NBRA; c++) { 2401977Swnj braslist[c] = 0; 2411977Swnj braelist[c] = 0; 2421977Swnj } 2431977Swnj if (circf) 2441977Swnj return((advance(p1, p2))); 2451977Swnj /* 2461977Swnj * fast check for first character 2471977Swnj */ 2481977Swnj if (*p2 == CCHR) { 2491977Swnj c = p2[1]; 2501977Swnj do { 2511977Swnj if (*p1 != c) 2521977Swnj continue; 2531977Swnj if (rv = advance(p1, p2)) 2541977Swnj return(rv); 2551977Swnj } while (*p1++); 2561977Swnj return(0); 2571977Swnj } 2581977Swnj /* 2591977Swnj * regular algorithm 2601977Swnj */ 2611977Swnj do 2621977Swnj if (rv = advance(p1, p2)) 2631977Swnj return(rv); 2641977Swnj while (*p1++); 2651977Swnj return(0); 2661977Swnj } 2671977Swnj 2681977Swnj /* 2691977Swnj * try to match the next thing in the dfa 2701977Swnj */ 2711977Swnj static int 2721977Swnj advance(lp, ep) 2731977Swnj register char *lp, *ep; 2741977Swnj { 2751977Swnj register char *curlp; 2761977Swnj int ct, i; 2771977Swnj int rv; 2781977Swnj 2791977Swnj for (;;) 2801977Swnj switch (*ep++) { 2811977Swnj 2821977Swnj case CCHR: 2831977Swnj if (*ep++ == *lp++) 2841977Swnj continue; 2851977Swnj return(0); 2861977Swnj 2871977Swnj case CDOT: 2881977Swnj if (*lp++) 2891977Swnj continue; 2901977Swnj return(0); 2911977Swnj 2921977Swnj case CDOL: 2931977Swnj if (*lp == '\0') 2941977Swnj continue; 2951977Swnj return(0); 2961977Swnj 2971977Swnj case CEOF: 2981977Swnj return(1); 2991977Swnj 3001977Swnj case CCL: 3011977Swnj if (cclass(ep, *lp++, 1)) { 3021977Swnj ep += *ep; 3031977Swnj continue; 3041977Swnj } 3051977Swnj return(0); 3061977Swnj 3071977Swnj case NCCL: 3081977Swnj if (cclass(ep, *lp++, 0)) { 3091977Swnj ep += *ep; 3101977Swnj continue; 3111977Swnj } 3121977Swnj return(0); 3131977Swnj 3141977Swnj case CBRA: 3151977Swnj braslist[*ep++] = lp; 3161977Swnj continue; 3171977Swnj 3181977Swnj case CKET: 3191977Swnj braelist[*ep++] = lp; 3201977Swnj continue; 3211977Swnj 3221977Swnj case CBACK: 3231977Swnj if (braelist[i = *ep++] == 0) 3241977Swnj return(-1); 3251977Swnj if (backref(i, lp)) { 3261977Swnj lp += braelist[i] - braslist[i]; 3271977Swnj continue; 3281977Swnj } 3291977Swnj return(0); 3301977Swnj 3311977Swnj case CBACK|CSTAR: 3321977Swnj if (braelist[i = *ep++] == 0) 3331977Swnj return(-1); 3341977Swnj curlp = lp; 3351977Swnj ct = braelist[i] - braslist[i]; 3361977Swnj while (backref(i, lp)) 3371977Swnj lp += ct; 3381977Swnj while (lp >= curlp) { 3391977Swnj if (rv = advance(lp, ep)) 3401977Swnj return(rv); 3411977Swnj lp -= ct; 3421977Swnj } 3431977Swnj continue; 3441977Swnj 3451977Swnj case CDOT|CSTAR: 3461977Swnj curlp = lp; 3471977Swnj while (*lp++) 3481977Swnj ; 3491977Swnj goto star; 3501977Swnj 3511977Swnj case CCHR|CSTAR: 3521977Swnj curlp = lp; 3531977Swnj while (*lp++ == *ep) 3541977Swnj ; 3551977Swnj ep++; 3561977Swnj goto star; 3571977Swnj 3581977Swnj case CCL|CSTAR: 3591977Swnj case NCCL|CSTAR: 3601977Swnj curlp = lp; 3611977Swnj while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR))) 3621977Swnj ; 3631977Swnj ep += *ep; 3641977Swnj goto star; 3651977Swnj 3661977Swnj star: 3671977Swnj do { 3681977Swnj lp--; 3691977Swnj if (rv = advance(lp, ep)) 3701977Swnj return(rv); 3711977Swnj } while (lp > curlp); 3721977Swnj return(0); 3731977Swnj 3741977Swnj default: 3751977Swnj return(-1); 3761977Swnj } 3771977Swnj } 3781977Swnj 37946488Sbostic static 3801977Swnj backref(i, lp) 3811977Swnj register int i; 3821977Swnj register char *lp; 3831977Swnj { 3841977Swnj register char *bp; 3851977Swnj 3861977Swnj bp = braslist[i]; 3871977Swnj while (*bp++ == *lp++) 3881977Swnj if (bp >= braelist[i]) 3891977Swnj return(1); 3901977Swnj return(0); 3911977Swnj } 3921977Swnj 39346488Sbostic static int 3941977Swnj cclass(set, c, af) 3951977Swnj register char *set, c; 3961977Swnj int af; 3971977Swnj { 3981977Swnj register int n; 3991977Swnj 4001977Swnj if (c == 0) 4011977Swnj return(0); 4021977Swnj n = *set++; 4031977Swnj while (--n) 4041977Swnj if (*set++ == c) 4051977Swnj return(af); 4061977Swnj return(! af); 4071977Swnj } 408