121356Sdist /* 221356Sdist * Copyright (c) 1980 Regents of the University of California. 321356Sdist * All rights reserved. The Berkeley software License Agreement 421356Sdist * specifies the terms and conditions for redistribution. 521356Sdist */ 621356Sdist 726580Sdonn #if defined(LIBC_SCCS) && !defined(lint) 8*46597Sdonn static char sccsid[] = "@(#)regex.c 5.4 (Berkeley) 02/23/91"; 926580Sdonn #endif LIBC_SCCS and not lint 1021356Sdist 11*46597Sdonn #include <unistd.h> 121977Swnj 13*46597Sdonn static int advance(); 14*46597Sdonn static int backref(); 15*46597Sdonn static int cclass(); 16*46597Sdonn 171977Swnj /* 181977Swnj * routines to do regular expression matching 191977Swnj * 201977Swnj * Entry points: 211977Swnj * 221977Swnj * re_comp(s) 231977Swnj * char *s; 241977Swnj * ... returns 0 if the string s was compiled successfully, 251977Swnj * a pointer to an error message otherwise. 261977Swnj * If passed 0 or a null string returns without changing 271977Swnj * the currently compiled re (see note 11 below). 281977Swnj * 291977Swnj * re_exec(s) 301977Swnj * char *s; 311977Swnj * ... returns 1 if the string s matches the last compiled regular 321977Swnj * expression, 331977Swnj * 0 if the string s failed to match the last compiled 341977Swnj * regular expression, and 351977Swnj * -1 if the compiled regular expression was invalid 361977Swnj * (indicating an internal error). 371977Swnj * 381977Swnj * The strings passed to both re_comp and re_exec may have trailing or 391977Swnj * embedded newline characters; they are terminated by nulls. 401977Swnj * 411977Swnj * The identity of the author of these routines is lost in antiquity; 421977Swnj * this is essentially the same as the re code in the original V6 ed. 431977Swnj * 441977Swnj * The regular expressions recognized are described below. This description 451977Swnj * is essentially the same as that for ed. 461977Swnj * 471977Swnj * A regular expression specifies a set of strings of characters. 481977Swnj * A member of this set of strings is said to be matched by 491977Swnj * the regular expression. In the following specification for 501977Swnj * regular expressions the word `character' means any character but NUL. 511977Swnj * 521977Swnj * 1. Any character except a special character matches itself. 531977Swnj * Special characters are the regular expression delimiter plus 541977Swnj * \ [ . and sometimes ^ * $. 551977Swnj * 2. A . matches any character. 561977Swnj * 3. A \ followed by any character except a digit or ( ) 571977Swnj * matches that character. 581977Swnj * 4. A nonempty string s bracketed [s] (or [^s]) matches any 591977Swnj * character in (or not in) s. In s, \ has no special meaning, 601977Swnj * and ] may only appear as the first letter. A substring 611977Swnj * a-b, with a and b in ascending ASCII order, stands for 621977Swnj * the inclusive range of ASCII characters. 631977Swnj * 5. A regular expression of form 1-4 followed by * matches a 641977Swnj * sequence of 0 or more matches of the regular expression. 651977Swnj * 6. A regular expression, x, of form 1-8, bracketed \(x\) 661977Swnj * matches what x matches. 671977Swnj * 7. A \ followed by a digit n matches a copy of the string that the 681977Swnj * bracketed regular expression beginning with the nth \( matched. 691977Swnj * 8. A regular expression of form 1-8, x, followed by a regular 701977Swnj * expression of form 1-7, y matches a match for x followed by 711977Swnj * a match for y, with the x match being as long as possible 721977Swnj * while still permitting a y match. 731977Swnj * 9. A regular expression of form 1-8 preceded by ^ (or followed 741977Swnj * by $), is constrained to matches that begin at the left 751977Swnj * (or end at the right) end of a line. 761977Swnj * 10. A regular expression of form 1-9 picks out the longest among 771977Swnj * the leftmost matches in a line. 781977Swnj * 11. An empty regular expression stands for a copy of the last 791977Swnj * regular expression encountered. 801977Swnj */ 811977Swnj 821977Swnj /* 831977Swnj * constants for re's 841977Swnj */ 851977Swnj #define CBRA 1 861977Swnj #define CCHR 2 871977Swnj #define CDOT 4 881977Swnj #define CCL 6 891977Swnj #define NCCL 8 901977Swnj #define CDOL 10 911977Swnj #define CEOF 11 921977Swnj #define CKET 12 931977Swnj #define CBACK 18 941977Swnj 951977Swnj #define CSTAR 01 961977Swnj 971977Swnj #define ESIZE 512 981977Swnj #define NBRA 9 991977Swnj 1001977Swnj static char expbuf[ESIZE], *braslist[NBRA], *braelist[NBRA]; 1011977Swnj static char circf; 1021977Swnj 1031977Swnj /* 1041977Swnj * compile the regular expression argument into a dfa 1051977Swnj */ 1061977Swnj char * 1071977Swnj re_comp(sp) 108*46597Sdonn register const char *sp; 1091977Swnj { 1101977Swnj register int c; 1111977Swnj register char *ep = expbuf; 1121977Swnj int cclcnt, numbra = 0; 1131977Swnj char *lastep = 0; 1141977Swnj char bracket[NBRA]; 1151977Swnj char *bracketp = &bracket[0]; 1161977Swnj static char *retoolong = "Regular expression too long"; 1171977Swnj 1181977Swnj #define comerr(msg) {expbuf[0] = 0; numbra = 0; return(msg); } 1191977Swnj 1201977Swnj if (sp == 0 || *sp == '\0') { 1211977Swnj if (*ep == 0) 1221977Swnj return("No previous regular expression"); 1231977Swnj return(0); 1241977Swnj } 1251977Swnj if (*sp == '^') { 1261977Swnj circf = 1; 1271977Swnj sp++; 1281977Swnj } 1291977Swnj else 1301977Swnj circf = 0; 1311977Swnj for (;;) { 1321977Swnj if (ep >= &expbuf[ESIZE]) 1331977Swnj comerr(retoolong); 1341977Swnj if ((c = *sp++) == '\0') { 1351977Swnj if (bracketp != bracket) 1361977Swnj comerr("unmatched \\("); 1371977Swnj *ep++ = CEOF; 1381977Swnj *ep++ = 0; 1391977Swnj return(0); 1401977Swnj } 1411977Swnj if (c != '*') 1421977Swnj lastep = ep; 1431977Swnj switch (c) { 1441977Swnj 1451977Swnj case '.': 1461977Swnj *ep++ = CDOT; 1471977Swnj continue; 1481977Swnj 1491977Swnj case '*': 1501977Swnj if (lastep == 0 || *lastep == CBRA || *lastep == CKET) 1511977Swnj goto defchar; 1521977Swnj *lastep |= CSTAR; 1531977Swnj continue; 1541977Swnj 1551977Swnj case '$': 1561977Swnj if (*sp != '\0') 1571977Swnj goto defchar; 1581977Swnj *ep++ = CDOL; 1591977Swnj continue; 1601977Swnj 1611977Swnj case '[': 1621977Swnj *ep++ = CCL; 1631977Swnj *ep++ = 0; 1641977Swnj cclcnt = 1; 1651977Swnj if ((c = *sp++) == '^') { 1661977Swnj c = *sp++; 1671977Swnj ep[-2] = NCCL; 1681977Swnj } 1691977Swnj do { 1701977Swnj if (c == '\0') 1711977Swnj comerr("missing ]"); 1721977Swnj if (c == '-' && ep [-1] != 0) { 1731977Swnj if ((c = *sp++) == ']') { 1741977Swnj *ep++ = '-'; 1751977Swnj cclcnt++; 1761977Swnj break; 1771977Swnj } 1781977Swnj while (ep[-1] < c) { 1791977Swnj *ep = ep[-1] + 1; 1801977Swnj ep++; 1811977Swnj cclcnt++; 1821977Swnj if (ep >= &expbuf[ESIZE]) 1831977Swnj comerr(retoolong); 1841977Swnj } 1851977Swnj } 1861977Swnj *ep++ = c; 1871977Swnj cclcnt++; 1881977Swnj if (ep >= &expbuf[ESIZE]) 1891977Swnj comerr(retoolong); 1901977Swnj } while ((c = *sp++) != ']'); 1911977Swnj lastep[1] = cclcnt; 1921977Swnj continue; 1931977Swnj 1941977Swnj case '\\': 1951977Swnj if ((c = *sp++) == '(') { 1961977Swnj if (numbra >= NBRA) 1971977Swnj comerr("too many \\(\\) pairs"); 1981977Swnj *bracketp++ = numbra; 1991977Swnj *ep++ = CBRA; 2001977Swnj *ep++ = numbra++; 2011977Swnj continue; 2021977Swnj } 2031977Swnj if (c == ')') { 2041977Swnj if (bracketp <= bracket) 2051977Swnj comerr("unmatched \\)"); 2061977Swnj *ep++ = CKET; 2071977Swnj *ep++ = *--bracketp; 2081977Swnj continue; 2091977Swnj } 2101977Swnj if (c >= '1' && c < ('1' + NBRA)) { 2111977Swnj *ep++ = CBACK; 2121977Swnj *ep++ = c - '1'; 2131977Swnj continue; 2141977Swnj } 2151977Swnj *ep++ = CCHR; 2161977Swnj *ep++ = c; 2171977Swnj continue; 2181977Swnj 2191977Swnj defchar: 2201977Swnj default: 2211977Swnj *ep++ = CCHR; 2221977Swnj *ep++ = c; 2231977Swnj } 2241977Swnj } 2251977Swnj } 2261977Swnj 2271977Swnj /* 2281977Swnj * match the argument string against the compiled re 2291977Swnj */ 2301977Swnj int 2311977Swnj re_exec(p1) 232*46597Sdonn register const char *p1; 2331977Swnj { 2341977Swnj register char *p2 = expbuf; 2351977Swnj register int c; 2361977Swnj int rv; 2371977Swnj 2381977Swnj for (c = 0; c < NBRA; c++) { 2391977Swnj braslist[c] = 0; 2401977Swnj braelist[c] = 0; 2411977Swnj } 2421977Swnj if (circf) 2431977Swnj return((advance(p1, p2))); 2441977Swnj /* 2451977Swnj * fast check for first character 2461977Swnj */ 2471977Swnj if (*p2 == CCHR) { 2481977Swnj c = p2[1]; 2491977Swnj do { 2501977Swnj if (*p1 != c) 2511977Swnj continue; 2521977Swnj if (rv = advance(p1, p2)) 2531977Swnj return(rv); 2541977Swnj } while (*p1++); 2551977Swnj return(0); 2561977Swnj } 2571977Swnj /* 2581977Swnj * regular algorithm 2591977Swnj */ 2601977Swnj do 2611977Swnj if (rv = advance(p1, p2)) 2621977Swnj return(rv); 2631977Swnj while (*p1++); 2641977Swnj return(0); 2651977Swnj } 2661977Swnj 2671977Swnj /* 2681977Swnj * try to match the next thing in the dfa 2691977Swnj */ 2701977Swnj static int 2711977Swnj advance(lp, ep) 2721977Swnj register char *lp, *ep; 2731977Swnj { 2741977Swnj register char *curlp; 2751977Swnj int ct, i; 2761977Swnj int rv; 2771977Swnj 2781977Swnj for (;;) 2791977Swnj switch (*ep++) { 2801977Swnj 2811977Swnj case CCHR: 2821977Swnj if (*ep++ == *lp++) 2831977Swnj continue; 2841977Swnj return(0); 2851977Swnj 2861977Swnj case CDOT: 2871977Swnj if (*lp++) 2881977Swnj continue; 2891977Swnj return(0); 2901977Swnj 2911977Swnj case CDOL: 2921977Swnj if (*lp == '\0') 2931977Swnj continue; 2941977Swnj return(0); 2951977Swnj 2961977Swnj case CEOF: 2971977Swnj return(1); 2981977Swnj 2991977Swnj case CCL: 3001977Swnj if (cclass(ep, *lp++, 1)) { 3011977Swnj ep += *ep; 3021977Swnj continue; 3031977Swnj } 3041977Swnj return(0); 3051977Swnj 3061977Swnj case NCCL: 3071977Swnj if (cclass(ep, *lp++, 0)) { 3081977Swnj ep += *ep; 3091977Swnj continue; 3101977Swnj } 3111977Swnj return(0); 3121977Swnj 3131977Swnj case CBRA: 3141977Swnj braslist[*ep++] = lp; 3151977Swnj continue; 3161977Swnj 3171977Swnj case CKET: 3181977Swnj braelist[*ep++] = lp; 3191977Swnj continue; 3201977Swnj 3211977Swnj case CBACK: 3221977Swnj if (braelist[i = *ep++] == 0) 3231977Swnj return(-1); 3241977Swnj if (backref(i, lp)) { 3251977Swnj lp += braelist[i] - braslist[i]; 3261977Swnj continue; 3271977Swnj } 3281977Swnj return(0); 3291977Swnj 3301977Swnj case CBACK|CSTAR: 3311977Swnj if (braelist[i = *ep++] == 0) 3321977Swnj return(-1); 3331977Swnj curlp = lp; 3341977Swnj ct = braelist[i] - braslist[i]; 3351977Swnj while (backref(i, lp)) 3361977Swnj lp += ct; 3371977Swnj while (lp >= curlp) { 3381977Swnj if (rv = advance(lp, ep)) 3391977Swnj return(rv); 3401977Swnj lp -= ct; 3411977Swnj } 3421977Swnj continue; 3431977Swnj 3441977Swnj case CDOT|CSTAR: 3451977Swnj curlp = lp; 3461977Swnj while (*lp++) 3471977Swnj ; 3481977Swnj goto star; 3491977Swnj 3501977Swnj case CCHR|CSTAR: 3511977Swnj curlp = lp; 3521977Swnj while (*lp++ == *ep) 3531977Swnj ; 3541977Swnj ep++; 3551977Swnj goto star; 3561977Swnj 3571977Swnj case CCL|CSTAR: 3581977Swnj case NCCL|CSTAR: 3591977Swnj curlp = lp; 3601977Swnj while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR))) 3611977Swnj ; 3621977Swnj ep += *ep; 3631977Swnj goto star; 3641977Swnj 3651977Swnj star: 3661977Swnj do { 3671977Swnj lp--; 3681977Swnj if (rv = advance(lp, ep)) 3691977Swnj return(rv); 3701977Swnj } while (lp > curlp); 3711977Swnj return(0); 3721977Swnj 3731977Swnj default: 3741977Swnj return(-1); 3751977Swnj } 3761977Swnj } 3771977Swnj 37846488Sbostic static 3791977Swnj backref(i, lp) 3801977Swnj register int i; 3811977Swnj register char *lp; 3821977Swnj { 3831977Swnj register char *bp; 3841977Swnj 3851977Swnj bp = braslist[i]; 3861977Swnj while (*bp++ == *lp++) 3871977Swnj if (bp >= braelist[i]) 3881977Swnj return(1); 3891977Swnj return(0); 3901977Swnj } 3911977Swnj 39246488Sbostic static int 3931977Swnj cclass(set, c, af) 3941977Swnj register char *set, c; 3951977Swnj int af; 3961977Swnj { 3971977Swnj register int n; 3981977Swnj 3991977Swnj if (c == 0) 4001977Swnj return(0); 4011977Swnj n = *set++; 4021977Swnj while (--n) 4031977Swnj if (*set++ == c) 4041977Swnj return(af); 4051977Swnj return(! af); 4061977Swnj } 407