148399Sbostic /*-
2*61241Sbostic * Copyright (c) 1980, 1993
3*61241Sbostic * The Regents of the University of California. All rights reserved.
448399Sbostic *
548399Sbostic * %sccs.include.proprietary.c%
621356Sdist */
721356Sdist
826580Sdonn #if defined(LIBC_SCCS) && !defined(lint)
9*61241Sbostic static char sccsid[] = "@(#)regex.c 8.1 (Berkeley) 06/04/93";
1048399Sbostic #endif /* LIBC_SCCS and not lint */
1121356Sdist
1246597Sdonn #include <unistd.h>
131977Swnj
1446597Sdonn static int advance();
1546597Sdonn static int backref();
1646597Sdonn static int cclass();
1746597Sdonn
181977Swnj /*
191977Swnj * routines to do regular expression matching
201977Swnj *
211977Swnj * Entry points:
221977Swnj *
231977Swnj * re_comp(s)
241977Swnj * char *s;
251977Swnj * ... returns 0 if the string s was compiled successfully,
261977Swnj * a pointer to an error message otherwise.
271977Swnj * If passed 0 or a null string returns without changing
281977Swnj * the currently compiled re (see note 11 below).
291977Swnj *
301977Swnj * re_exec(s)
311977Swnj * char *s;
321977Swnj * ... returns 1 if the string s matches the last compiled regular
331977Swnj * expression,
341977Swnj * 0 if the string s failed to match the last compiled
351977Swnj * regular expression, and
361977Swnj * -1 if the compiled regular expression was invalid
371977Swnj * (indicating an internal error).
381977Swnj *
391977Swnj * The strings passed to both re_comp and re_exec may have trailing or
401977Swnj * embedded newline characters; they are terminated by nulls.
411977Swnj *
421977Swnj * The identity of the author of these routines is lost in antiquity;
431977Swnj * this is essentially the same as the re code in the original V6 ed.
441977Swnj *
451977Swnj * The regular expressions recognized are described below. This description
461977Swnj * is essentially the same as that for ed.
471977Swnj *
481977Swnj * A regular expression specifies a set of strings of characters.
491977Swnj * A member of this set of strings is said to be matched by
501977Swnj * the regular expression. In the following specification for
511977Swnj * regular expressions the word `character' means any character but NUL.
521977Swnj *
531977Swnj * 1. Any character except a special character matches itself.
541977Swnj * Special characters are the regular expression delimiter plus
551977Swnj * \ [ . and sometimes ^ * $.
561977Swnj * 2. A . matches any character.
571977Swnj * 3. A \ followed by any character except a digit or ( )
581977Swnj * matches that character.
591977Swnj * 4. A nonempty string s bracketed [s] (or [^s]) matches any
601977Swnj * character in (or not in) s. In s, \ has no special meaning,
611977Swnj * and ] may only appear as the first letter. A substring
621977Swnj * a-b, with a and b in ascending ASCII order, stands for
631977Swnj * the inclusive range of ASCII characters.
641977Swnj * 5. A regular expression of form 1-4 followed by * matches a
651977Swnj * sequence of 0 or more matches of the regular expression.
661977Swnj * 6. A regular expression, x, of form 1-8, bracketed \(x\)
671977Swnj * matches what x matches.
681977Swnj * 7. A \ followed by a digit n matches a copy of the string that the
691977Swnj * bracketed regular expression beginning with the nth \( matched.
701977Swnj * 8. A regular expression of form 1-8, x, followed by a regular
711977Swnj * expression of form 1-7, y matches a match for x followed by
721977Swnj * a match for y, with the x match being as long as possible
731977Swnj * while still permitting a y match.
741977Swnj * 9. A regular expression of form 1-8 preceded by ^ (or followed
751977Swnj * by $), is constrained to matches that begin at the left
761977Swnj * (or end at the right) end of a line.
771977Swnj * 10. A regular expression of form 1-9 picks out the longest among
781977Swnj * the leftmost matches in a line.
791977Swnj * 11. An empty regular expression stands for a copy of the last
801977Swnj * regular expression encountered.
811977Swnj */
821977Swnj
831977Swnj /*
841977Swnj * constants for re's
851977Swnj */
861977Swnj #define CBRA 1
871977Swnj #define CCHR 2
881977Swnj #define CDOT 4
891977Swnj #define CCL 6
901977Swnj #define NCCL 8
911977Swnj #define CDOL 10
921977Swnj #define CEOF 11
931977Swnj #define CKET 12
941977Swnj #define CBACK 18
951977Swnj
961977Swnj #define CSTAR 01
971977Swnj
981977Swnj #define ESIZE 512
991977Swnj #define NBRA 9
1001977Swnj
1011977Swnj static char expbuf[ESIZE], *braslist[NBRA], *braelist[NBRA];
1021977Swnj static char circf;
1031977Swnj
1041977Swnj /*
1051977Swnj * compile the regular expression argument into a dfa
1061977Swnj */
1071977Swnj char *
re_comp(sp)1081977Swnj re_comp(sp)
10946597Sdonn register const char *sp;
1101977Swnj {
1111977Swnj register int c;
1121977Swnj register char *ep = expbuf;
1131977Swnj int cclcnt, numbra = 0;
1141977Swnj char *lastep = 0;
1151977Swnj char bracket[NBRA];
1161977Swnj char *bracketp = &bracket[0];
1171977Swnj static char *retoolong = "Regular expression too long";
1181977Swnj
1191977Swnj #define comerr(msg) {expbuf[0] = 0; numbra = 0; return(msg); }
1201977Swnj
1211977Swnj if (sp == 0 || *sp == '\0') {
1221977Swnj if (*ep == 0)
1231977Swnj return("No previous regular expression");
1241977Swnj return(0);
1251977Swnj }
1261977Swnj if (*sp == '^') {
1271977Swnj circf = 1;
1281977Swnj sp++;
1291977Swnj }
1301977Swnj else
1311977Swnj circf = 0;
1321977Swnj for (;;) {
1331977Swnj if (ep >= &expbuf[ESIZE])
1341977Swnj comerr(retoolong);
1351977Swnj if ((c = *sp++) == '\0') {
1361977Swnj if (bracketp != bracket)
1371977Swnj comerr("unmatched \\(");
1381977Swnj *ep++ = CEOF;
1391977Swnj *ep++ = 0;
1401977Swnj return(0);
1411977Swnj }
1421977Swnj if (c != '*')
1431977Swnj lastep = ep;
1441977Swnj switch (c) {
1451977Swnj
1461977Swnj case '.':
1471977Swnj *ep++ = CDOT;
1481977Swnj continue;
1491977Swnj
1501977Swnj case '*':
1511977Swnj if (lastep == 0 || *lastep == CBRA || *lastep == CKET)
1521977Swnj goto defchar;
1531977Swnj *lastep |= CSTAR;
1541977Swnj continue;
1551977Swnj
1561977Swnj case '$':
1571977Swnj if (*sp != '\0')
1581977Swnj goto defchar;
1591977Swnj *ep++ = CDOL;
1601977Swnj continue;
1611977Swnj
1621977Swnj case '[':
1631977Swnj *ep++ = CCL;
1641977Swnj *ep++ = 0;
1651977Swnj cclcnt = 1;
1661977Swnj if ((c = *sp++) == '^') {
1671977Swnj c = *sp++;
1681977Swnj ep[-2] = NCCL;
1691977Swnj }
1701977Swnj do {
1711977Swnj if (c == '\0')
1721977Swnj comerr("missing ]");
1731977Swnj if (c == '-' && ep [-1] != 0) {
1741977Swnj if ((c = *sp++) == ']') {
1751977Swnj *ep++ = '-';
1761977Swnj cclcnt++;
1771977Swnj break;
1781977Swnj }
1791977Swnj while (ep[-1] < c) {
1801977Swnj *ep = ep[-1] + 1;
1811977Swnj ep++;
1821977Swnj cclcnt++;
1831977Swnj if (ep >= &expbuf[ESIZE])
1841977Swnj comerr(retoolong);
1851977Swnj }
1861977Swnj }
1871977Swnj *ep++ = c;
1881977Swnj cclcnt++;
1891977Swnj if (ep >= &expbuf[ESIZE])
1901977Swnj comerr(retoolong);
1911977Swnj } while ((c = *sp++) != ']');
1921977Swnj lastep[1] = cclcnt;
1931977Swnj continue;
1941977Swnj
1951977Swnj case '\\':
1961977Swnj if ((c = *sp++) == '(') {
1971977Swnj if (numbra >= NBRA)
1981977Swnj comerr("too many \\(\\) pairs");
1991977Swnj *bracketp++ = numbra;
2001977Swnj *ep++ = CBRA;
2011977Swnj *ep++ = numbra++;
2021977Swnj continue;
2031977Swnj }
2041977Swnj if (c == ')') {
2051977Swnj if (bracketp <= bracket)
2061977Swnj comerr("unmatched \\)");
2071977Swnj *ep++ = CKET;
2081977Swnj *ep++ = *--bracketp;
2091977Swnj continue;
2101977Swnj }
2111977Swnj if (c >= '1' && c < ('1' + NBRA)) {
2121977Swnj *ep++ = CBACK;
2131977Swnj *ep++ = c - '1';
2141977Swnj continue;
2151977Swnj }
2161977Swnj *ep++ = CCHR;
2171977Swnj *ep++ = c;
2181977Swnj continue;
2191977Swnj
2201977Swnj defchar:
2211977Swnj default:
2221977Swnj *ep++ = CCHR;
2231977Swnj *ep++ = c;
2241977Swnj }
2251977Swnj }
2261977Swnj }
2271977Swnj
2281977Swnj /*
2291977Swnj * match the argument string against the compiled re
2301977Swnj */
2311977Swnj int
re_exec(p1)2321977Swnj re_exec(p1)
23346597Sdonn register const char *p1;
2341977Swnj {
2351977Swnj register char *p2 = expbuf;
2361977Swnj register int c;
2371977Swnj int rv;
2381977Swnj
2391977Swnj for (c = 0; c < NBRA; c++) {
2401977Swnj braslist[c] = 0;
2411977Swnj braelist[c] = 0;
2421977Swnj }
2431977Swnj if (circf)
2441977Swnj return((advance(p1, p2)));
2451977Swnj /*
2461977Swnj * fast check for first character
2471977Swnj */
2481977Swnj if (*p2 == CCHR) {
2491977Swnj c = p2[1];
2501977Swnj do {
2511977Swnj if (*p1 != c)
2521977Swnj continue;
2531977Swnj if (rv = advance(p1, p2))
2541977Swnj return(rv);
2551977Swnj } while (*p1++);
2561977Swnj return(0);
2571977Swnj }
2581977Swnj /*
2591977Swnj * regular algorithm
2601977Swnj */
2611977Swnj do
2621977Swnj if (rv = advance(p1, p2))
2631977Swnj return(rv);
2641977Swnj while (*p1++);
2651977Swnj return(0);
2661977Swnj }
2671977Swnj
2681977Swnj /*
2691977Swnj * try to match the next thing in the dfa
2701977Swnj */
2711977Swnj static int
advance(lp,ep)2721977Swnj advance(lp, ep)
2731977Swnj register char *lp, *ep;
2741977Swnj {
2751977Swnj register char *curlp;
2761977Swnj int ct, i;
2771977Swnj int rv;
2781977Swnj
2791977Swnj for (;;)
2801977Swnj switch (*ep++) {
2811977Swnj
2821977Swnj case CCHR:
2831977Swnj if (*ep++ == *lp++)
2841977Swnj continue;
2851977Swnj return(0);
2861977Swnj
2871977Swnj case CDOT:
2881977Swnj if (*lp++)
2891977Swnj continue;
2901977Swnj return(0);
2911977Swnj
2921977Swnj case CDOL:
2931977Swnj if (*lp == '\0')
2941977Swnj continue;
2951977Swnj return(0);
2961977Swnj
2971977Swnj case CEOF:
2981977Swnj return(1);
2991977Swnj
3001977Swnj case CCL:
3011977Swnj if (cclass(ep, *lp++, 1)) {
3021977Swnj ep += *ep;
3031977Swnj continue;
3041977Swnj }
3051977Swnj return(0);
3061977Swnj
3071977Swnj case NCCL:
3081977Swnj if (cclass(ep, *lp++, 0)) {
3091977Swnj ep += *ep;
3101977Swnj continue;
3111977Swnj }
3121977Swnj return(0);
3131977Swnj
3141977Swnj case CBRA:
3151977Swnj braslist[*ep++] = lp;
3161977Swnj continue;
3171977Swnj
3181977Swnj case CKET:
3191977Swnj braelist[*ep++] = lp;
3201977Swnj continue;
3211977Swnj
3221977Swnj case CBACK:
3231977Swnj if (braelist[i = *ep++] == 0)
3241977Swnj return(-1);
3251977Swnj if (backref(i, lp)) {
3261977Swnj lp += braelist[i] - braslist[i];
3271977Swnj continue;
3281977Swnj }
3291977Swnj return(0);
3301977Swnj
3311977Swnj case CBACK|CSTAR:
3321977Swnj if (braelist[i = *ep++] == 0)
3331977Swnj return(-1);
3341977Swnj curlp = lp;
3351977Swnj ct = braelist[i] - braslist[i];
3361977Swnj while (backref(i, lp))
3371977Swnj lp += ct;
3381977Swnj while (lp >= curlp) {
3391977Swnj if (rv = advance(lp, ep))
3401977Swnj return(rv);
3411977Swnj lp -= ct;
3421977Swnj }
3431977Swnj continue;
3441977Swnj
3451977Swnj case CDOT|CSTAR:
3461977Swnj curlp = lp;
3471977Swnj while (*lp++)
3481977Swnj ;
3491977Swnj goto star;
3501977Swnj
3511977Swnj case CCHR|CSTAR:
3521977Swnj curlp = lp;
3531977Swnj while (*lp++ == *ep)
3541977Swnj ;
3551977Swnj ep++;
3561977Swnj goto star;
3571977Swnj
3581977Swnj case CCL|CSTAR:
3591977Swnj case NCCL|CSTAR:
3601977Swnj curlp = lp;
3611977Swnj while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR)))
3621977Swnj ;
3631977Swnj ep += *ep;
3641977Swnj goto star;
3651977Swnj
3661977Swnj star:
3671977Swnj do {
3681977Swnj lp--;
3691977Swnj if (rv = advance(lp, ep))
3701977Swnj return(rv);
3711977Swnj } while (lp > curlp);
3721977Swnj return(0);
3731977Swnj
3741977Swnj default:
3751977Swnj return(-1);
3761977Swnj }
3771977Swnj }
3781977Swnj
37946488Sbostic static
backref(i,lp)3801977Swnj backref(i, lp)
3811977Swnj register int i;
3821977Swnj register char *lp;
3831977Swnj {
3841977Swnj register char *bp;
3851977Swnj
3861977Swnj bp = braslist[i];
3871977Swnj while (*bp++ == *lp++)
3881977Swnj if (bp >= braelist[i])
3891977Swnj return(1);
3901977Swnj return(0);
3911977Swnj }
3921977Swnj
39346488Sbostic static int
cclass(set,c,af)3941977Swnj cclass(set, c, af)
3951977Swnj register char *set, c;
3961977Swnj int af;
3971977Swnj {
3981977Swnj register int n;
3991977Swnj
4001977Swnj if (c == 0)
4011977Swnj return(0);
4021977Swnj n = *set++;
4031977Swnj while (--n)
4041977Swnj if (*set++ == c)
4051977Swnj return(af);
4061977Swnj return(! af);
4071977Swnj }
408