10Sstevel@tonic-gate /*
20Sstevel@tonic-gate * CDDL HEADER START
30Sstevel@tonic-gate *
40Sstevel@tonic-gate * The contents of this file are subject to the terms of the
50Sstevel@tonic-gate * Common Development and Distribution License, Version 1.0 only
60Sstevel@tonic-gate * (the "License"). You may not use this file except in compliance
70Sstevel@tonic-gate * with the License.
80Sstevel@tonic-gate *
90Sstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
100Sstevel@tonic-gate * or http://www.opensolaris.org/os/licensing.
110Sstevel@tonic-gate * See the License for the specific language governing permissions
120Sstevel@tonic-gate * and limitations under the License.
130Sstevel@tonic-gate *
140Sstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each
150Sstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
160Sstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the
170Sstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying
180Sstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner]
190Sstevel@tonic-gate *
200Sstevel@tonic-gate * CDDL HEADER END
210Sstevel@tonic-gate */
220Sstevel@tonic-gate /*
230Sstevel@tonic-gate * Copyright 1987 Sun Microsystems, Inc. All rights reserved.
240Sstevel@tonic-gate * Use is subject to license terms.
250Sstevel@tonic-gate */
260Sstevel@tonic-gate
270Sstevel@tonic-gate #pragma ident "%Z%%M% %I% %E% SMI"
280Sstevel@tonic-gate
290Sstevel@tonic-gate /*
300Sstevel@tonic-gate * routines to do regular expression matching
310Sstevel@tonic-gate *
320Sstevel@tonic-gate * Entry points:
330Sstevel@tonic-gate *
340Sstevel@tonic-gate * re_comp(s)
350Sstevel@tonic-gate * char *s;
360Sstevel@tonic-gate * ... returns 0 if the string s was compiled successfully,
370Sstevel@tonic-gate * a pointer to an error message otherwise.
380Sstevel@tonic-gate * If passed 0 or a null string returns without changing
390Sstevel@tonic-gate * the currently compiled re (see note 11 below).
400Sstevel@tonic-gate *
410Sstevel@tonic-gate * re_exec(s)
420Sstevel@tonic-gate * char *s;
430Sstevel@tonic-gate * ... returns 1 if the string s matches the last compiled regular
440Sstevel@tonic-gate * expression,
450Sstevel@tonic-gate * 0 if the string s failed to match the last compiled
460Sstevel@tonic-gate * regular expression, and
470Sstevel@tonic-gate * -1 if the compiled regular expression was invalid
480Sstevel@tonic-gate * (indicating an internal error).
490Sstevel@tonic-gate *
500Sstevel@tonic-gate * The strings passed to both re_comp and re_exec may have trailing or
510Sstevel@tonic-gate * embedded newline characters; they are terminated by nulls.
520Sstevel@tonic-gate *
530Sstevel@tonic-gate * The identity of the author of these routines is lost in antiquity;
540Sstevel@tonic-gate * this is essentially the same as the re code in the original V6 ed.
550Sstevel@tonic-gate *
560Sstevel@tonic-gate * The regular expressions recognized are described below. This description
570Sstevel@tonic-gate * is essentially the same as that for ed.
580Sstevel@tonic-gate *
590Sstevel@tonic-gate * A regular expression specifies a set of strings of characters.
600Sstevel@tonic-gate * A member of this set of strings is said to be matched by
610Sstevel@tonic-gate * the regular expression. In the following specification for
620Sstevel@tonic-gate * regular expressions the word `character' means any character but NUL.
630Sstevel@tonic-gate *
640Sstevel@tonic-gate * 1. Any character except a special character matches itself.
650Sstevel@tonic-gate * Special characters are the regular expression delimiter plus
660Sstevel@tonic-gate * \ [ . and sometimes ^ * $.
670Sstevel@tonic-gate * 2. A . matches any character.
680Sstevel@tonic-gate * 3. A \ followed by any character except a digit or ( )
690Sstevel@tonic-gate * matches that character.
700Sstevel@tonic-gate * 4. A nonempty string s bracketed [s] (or [^s]) matches any
710Sstevel@tonic-gate * character in (or not in) s. In s, \ has no special meaning,
720Sstevel@tonic-gate * and ] may only appear as the first letter. A substring
730Sstevel@tonic-gate * a-b, with a and b in ascending ASCII order, stands for
740Sstevel@tonic-gate * the inclusive range of ASCII characters.
750Sstevel@tonic-gate * 5. A regular expression of form 1-4 followed by * matches a
760Sstevel@tonic-gate * sequence of 0 or more matches of the regular expression.
770Sstevel@tonic-gate * 6. A regular expression, x, of form 1-8, bracketed \(x\)
780Sstevel@tonic-gate * matches what x matches.
790Sstevel@tonic-gate * 7. A \ followed by a digit n matches a copy of the string that the
800Sstevel@tonic-gate * bracketed regular expression beginning with the nth \( matched.
810Sstevel@tonic-gate * 8. A regular expression of form 1-8, x, followed by a regular
820Sstevel@tonic-gate * expression of form 1-7, y matches a match for x followed by
830Sstevel@tonic-gate * a match for y, with the x match being as long as possible
840Sstevel@tonic-gate * while still permitting a y match.
850Sstevel@tonic-gate * 9. A regular expression of form 1-8 preceded by ^ (or followed
860Sstevel@tonic-gate * by $), is constrained to matches that begin at the left
870Sstevel@tonic-gate * (or end at the right) end of a line.
880Sstevel@tonic-gate * 10. A regular expression of form 1-9 picks out the longest among
890Sstevel@tonic-gate * the leftmost matches in a line.
900Sstevel@tonic-gate * 11. An empty regular expression stands for a copy of the last
910Sstevel@tonic-gate * regular expression encountered.
920Sstevel@tonic-gate */
930Sstevel@tonic-gate
940Sstevel@tonic-gate /*
950Sstevel@tonic-gate * constants for re's
960Sstevel@tonic-gate */
970Sstevel@tonic-gate #define CBRA 1
980Sstevel@tonic-gate #define CCHR 2
990Sstevel@tonic-gate #define CDOT 4
1000Sstevel@tonic-gate #define CCL 6
1010Sstevel@tonic-gate #define NCCL 8
1020Sstevel@tonic-gate #define CDOL 10
1030Sstevel@tonic-gate #define CEOF 11
1040Sstevel@tonic-gate #define CKET 12
1050Sstevel@tonic-gate #define CBACK 18
1060Sstevel@tonic-gate
1070Sstevel@tonic-gate #define CSTAR 01
1080Sstevel@tonic-gate
1090Sstevel@tonic-gate #define ESIZE 512
1100Sstevel@tonic-gate #define NBRA 9
1110Sstevel@tonic-gate
1120Sstevel@tonic-gate static struct re_globals {
1130Sstevel@tonic-gate char _expbuf[ESIZE];
1140Sstevel@tonic-gate char *_braslist[NBRA], *_braelist[NBRA];
1150Sstevel@tonic-gate char _circf;
1160Sstevel@tonic-gate } *re_globals;
1170Sstevel@tonic-gate #define expbuf (_re->_expbuf)
1180Sstevel@tonic-gate #define braslist (_re->_braslist)
1190Sstevel@tonic-gate #define braelist (_re->_braelist)
1200Sstevel@tonic-gate #define circf (_re->_circf)
1210Sstevel@tonic-gate
122*722Smuffin static int advance(char *, char *);
123*722Smuffin static int backref(int, char *);
124*722Smuffin static int cclass(char *, char, int);
125*722Smuffin
1260Sstevel@tonic-gate /*
1270Sstevel@tonic-gate * compile the regular expression argument into a dfa
1280Sstevel@tonic-gate */
1290Sstevel@tonic-gate char *
re_comp(char * sp)130*722Smuffin re_comp(char *sp)
1310Sstevel@tonic-gate {
132*722Smuffin int c;
133*722Smuffin struct re_globals *_re = re_globals;
134*722Smuffin char *ep;
1350Sstevel@tonic-gate int cclcnt, numbra = 0;
1360Sstevel@tonic-gate char *lastep = 0;
1370Sstevel@tonic-gate char bracket[NBRA];
1380Sstevel@tonic-gate char *bracketp = &bracket[0];
1390Sstevel@tonic-gate char *retoolong = "Regular expression too long";
1400Sstevel@tonic-gate
1410Sstevel@tonic-gate if (_re == 0) {
1420Sstevel@tonic-gate _re = (struct re_globals *)calloc(1, sizeof (*_re));
1430Sstevel@tonic-gate if (_re == 0)
1440Sstevel@tonic-gate return ("Out of memory");
1450Sstevel@tonic-gate re_globals = _re;
1460Sstevel@tonic-gate }
1470Sstevel@tonic-gate ep = expbuf;
1480Sstevel@tonic-gate
1490Sstevel@tonic-gate #define comerr(msg) {expbuf[0] = 0; numbra = 0; return(msg); }
1500Sstevel@tonic-gate
1510Sstevel@tonic-gate if (sp == 0 || *sp == '\0') {
1520Sstevel@tonic-gate if (*ep == 0)
1530Sstevel@tonic-gate return("No previous regular expression");
154*722Smuffin return (0);
1550Sstevel@tonic-gate }
1560Sstevel@tonic-gate if (*sp == '^') {
1570Sstevel@tonic-gate circf = 1;
1580Sstevel@tonic-gate sp++;
1590Sstevel@tonic-gate }
1600Sstevel@tonic-gate else
1610Sstevel@tonic-gate circf = 0;
1620Sstevel@tonic-gate for (;;) {
1630Sstevel@tonic-gate if (ep >= &expbuf[ESIZE])
1640Sstevel@tonic-gate comerr(retoolong);
1650Sstevel@tonic-gate if ((c = *sp++) == '\0') {
1660Sstevel@tonic-gate if (bracketp != bracket)
1670Sstevel@tonic-gate comerr("unmatched \\(");
1680Sstevel@tonic-gate *ep++ = CEOF;
1690Sstevel@tonic-gate *ep++ = 0;
170*722Smuffin return (0);
1710Sstevel@tonic-gate }
1720Sstevel@tonic-gate if (c != '*')
1730Sstevel@tonic-gate lastep = ep;
1740Sstevel@tonic-gate switch (c) {
1750Sstevel@tonic-gate
1760Sstevel@tonic-gate case '.':
1770Sstevel@tonic-gate *ep++ = CDOT;
1780Sstevel@tonic-gate continue;
1790Sstevel@tonic-gate
1800Sstevel@tonic-gate case '*':
1810Sstevel@tonic-gate if (lastep == 0 || *lastep == CBRA || *lastep == CKET)
1820Sstevel@tonic-gate goto defchar;
1830Sstevel@tonic-gate *lastep |= CSTAR;
1840Sstevel@tonic-gate continue;
1850Sstevel@tonic-gate
1860Sstevel@tonic-gate case '$':
1870Sstevel@tonic-gate if (*sp != '\0')
1880Sstevel@tonic-gate goto defchar;
1890Sstevel@tonic-gate *ep++ = CDOL;
1900Sstevel@tonic-gate continue;
1910Sstevel@tonic-gate
1920Sstevel@tonic-gate case '[':
1930Sstevel@tonic-gate *ep++ = CCL;
1940Sstevel@tonic-gate *ep++ = 0;
1950Sstevel@tonic-gate cclcnt = 1;
1960Sstevel@tonic-gate if ((c = *sp++) == '^') {
1970Sstevel@tonic-gate c = *sp++;
1980Sstevel@tonic-gate ep[-2] = NCCL;
1990Sstevel@tonic-gate }
2000Sstevel@tonic-gate do {
2010Sstevel@tonic-gate if (c == '\0')
2020Sstevel@tonic-gate comerr("missing ]");
2030Sstevel@tonic-gate if (c == '-' && ep [-1] != 0) {
2040Sstevel@tonic-gate if ((c = *sp++) == ']') {
2050Sstevel@tonic-gate *ep++ = '-';
2060Sstevel@tonic-gate cclcnt++;
2070Sstevel@tonic-gate break;
2080Sstevel@tonic-gate }
2090Sstevel@tonic-gate while (ep[-1] < c) {
2100Sstevel@tonic-gate *ep = ep[-1] + 1;
2110Sstevel@tonic-gate ep++;
2120Sstevel@tonic-gate cclcnt++;
2130Sstevel@tonic-gate if (ep >= &expbuf[ESIZE])
2140Sstevel@tonic-gate comerr(retoolong);
2150Sstevel@tonic-gate }
2160Sstevel@tonic-gate }
2170Sstevel@tonic-gate *ep++ = c;
2180Sstevel@tonic-gate cclcnt++;
2190Sstevel@tonic-gate if (ep >= &expbuf[ESIZE])
2200Sstevel@tonic-gate comerr(retoolong);
2210Sstevel@tonic-gate } while ((c = *sp++) != ']');
2220Sstevel@tonic-gate lastep[1] = cclcnt;
2230Sstevel@tonic-gate continue;
2240Sstevel@tonic-gate
2250Sstevel@tonic-gate case '\\':
2260Sstevel@tonic-gate if ((c = *sp++) == '(') {
2270Sstevel@tonic-gate if (numbra >= NBRA)
2280Sstevel@tonic-gate comerr("too many \\(\\) pairs");
2290Sstevel@tonic-gate *bracketp++ = numbra;
2300Sstevel@tonic-gate *ep++ = CBRA;
2310Sstevel@tonic-gate *ep++ = numbra++;
2320Sstevel@tonic-gate continue;
2330Sstevel@tonic-gate }
2340Sstevel@tonic-gate if (c == ')') {
2350Sstevel@tonic-gate if (bracketp <= bracket)
2360Sstevel@tonic-gate comerr("unmatched \\)");
2370Sstevel@tonic-gate *ep++ = CKET;
2380Sstevel@tonic-gate *ep++ = *--bracketp;
2390Sstevel@tonic-gate continue;
2400Sstevel@tonic-gate }
2410Sstevel@tonic-gate if (c >= '1' && c < ('1' + NBRA)) {
2420Sstevel@tonic-gate *ep++ = CBACK;
2430Sstevel@tonic-gate *ep++ = c - '1';
2440Sstevel@tonic-gate continue;
2450Sstevel@tonic-gate }
2460Sstevel@tonic-gate *ep++ = CCHR;
2470Sstevel@tonic-gate *ep++ = c;
2480Sstevel@tonic-gate continue;
2490Sstevel@tonic-gate
2500Sstevel@tonic-gate defchar:
2510Sstevel@tonic-gate default:
2520Sstevel@tonic-gate *ep++ = CCHR;
2530Sstevel@tonic-gate *ep++ = c;
2540Sstevel@tonic-gate }
2550Sstevel@tonic-gate }
2560Sstevel@tonic-gate }
2570Sstevel@tonic-gate
2580Sstevel@tonic-gate /*
2590Sstevel@tonic-gate * match the argument string against the compiled re
2600Sstevel@tonic-gate */
2610Sstevel@tonic-gate int
re_exec(char * p1)262*722Smuffin re_exec(char *p1)
2630Sstevel@tonic-gate {
264*722Smuffin struct re_globals *_re = re_globals;
265*722Smuffin char *p2;
266*722Smuffin int c;
2670Sstevel@tonic-gate int rv;
2680Sstevel@tonic-gate
2690Sstevel@tonic-gate if (_re == 0)
2700Sstevel@tonic-gate return (0);
2710Sstevel@tonic-gate p2 = expbuf;
2720Sstevel@tonic-gate for (c = 0; c < NBRA; c++) {
2730Sstevel@tonic-gate braslist[c] = 0;
2740Sstevel@tonic-gate braelist[c] = 0;
2750Sstevel@tonic-gate }
2760Sstevel@tonic-gate if (circf)
2770Sstevel@tonic-gate return((advance(p1, p2)));
2780Sstevel@tonic-gate /*
2790Sstevel@tonic-gate * fast check for first character
2800Sstevel@tonic-gate */
2810Sstevel@tonic-gate if (*p2 == CCHR) {
2820Sstevel@tonic-gate c = p2[1];
2830Sstevel@tonic-gate do {
2840Sstevel@tonic-gate if (*p1 != c)
2850Sstevel@tonic-gate continue;
2860Sstevel@tonic-gate if (rv = advance(p1, p2))
2870Sstevel@tonic-gate return(rv);
2880Sstevel@tonic-gate } while (*p1++);
2890Sstevel@tonic-gate return(0);
2900Sstevel@tonic-gate }
2910Sstevel@tonic-gate /*
2920Sstevel@tonic-gate * regular algorithm
2930Sstevel@tonic-gate */
2940Sstevel@tonic-gate do
2950Sstevel@tonic-gate if (rv = advance(p1, p2))
2960Sstevel@tonic-gate return(rv);
2970Sstevel@tonic-gate while (*p1++);
2980Sstevel@tonic-gate return(0);
2990Sstevel@tonic-gate }
3000Sstevel@tonic-gate
3010Sstevel@tonic-gate /*
3020Sstevel@tonic-gate * try to match the next thing in the dfa
3030Sstevel@tonic-gate */
3040Sstevel@tonic-gate static int
advance(char * lp,char * ep)305*722Smuffin advance(char *lp, char *ep)
3060Sstevel@tonic-gate {
307*722Smuffin char *curlp;
3080Sstevel@tonic-gate int ct, i;
3090Sstevel@tonic-gate int rv;
310*722Smuffin struct re_globals *_re = re_globals;
3110Sstevel@tonic-gate
3120Sstevel@tonic-gate for (;;)
3130Sstevel@tonic-gate switch (*ep++) {
3140Sstevel@tonic-gate
3150Sstevel@tonic-gate case CCHR:
3160Sstevel@tonic-gate if (*ep++ == *lp++)
3170Sstevel@tonic-gate continue;
3180Sstevel@tonic-gate return(0);
3190Sstevel@tonic-gate
3200Sstevel@tonic-gate case CDOT:
3210Sstevel@tonic-gate if (*lp++)
3220Sstevel@tonic-gate continue;
3230Sstevel@tonic-gate return(0);
3240Sstevel@tonic-gate
3250Sstevel@tonic-gate case CDOL:
3260Sstevel@tonic-gate if (*lp == '\0')
3270Sstevel@tonic-gate continue;
3280Sstevel@tonic-gate return(0);
3290Sstevel@tonic-gate
3300Sstevel@tonic-gate case CEOF:
3310Sstevel@tonic-gate return(1);
3320Sstevel@tonic-gate
3330Sstevel@tonic-gate case CCL:
3340Sstevel@tonic-gate if (cclass(ep, *lp++, 1)) {
3350Sstevel@tonic-gate ep += *ep;
3360Sstevel@tonic-gate continue;
3370Sstevel@tonic-gate }
3380Sstevel@tonic-gate return(0);
3390Sstevel@tonic-gate
3400Sstevel@tonic-gate case NCCL:
3410Sstevel@tonic-gate if (cclass(ep, *lp++, 0)) {
3420Sstevel@tonic-gate ep += *ep;
3430Sstevel@tonic-gate continue;
3440Sstevel@tonic-gate }
3450Sstevel@tonic-gate return(0);
3460Sstevel@tonic-gate
3470Sstevel@tonic-gate case CBRA:
3480Sstevel@tonic-gate braslist[*ep++] = lp;
3490Sstevel@tonic-gate continue;
3500Sstevel@tonic-gate
3510Sstevel@tonic-gate case CKET:
3520Sstevel@tonic-gate braelist[*ep++] = lp;
3530Sstevel@tonic-gate continue;
3540Sstevel@tonic-gate
3550Sstevel@tonic-gate case CBACK:
3560Sstevel@tonic-gate if (braelist[i = *ep++] == 0)
3570Sstevel@tonic-gate return(-1);
3580Sstevel@tonic-gate if (backref(i, lp)) {
3590Sstevel@tonic-gate lp += braelist[i] - braslist[i];
3600Sstevel@tonic-gate continue;
3610Sstevel@tonic-gate }
3620Sstevel@tonic-gate return(0);
3630Sstevel@tonic-gate
3640Sstevel@tonic-gate case CBACK|CSTAR:
3650Sstevel@tonic-gate if (braelist[i = *ep++] == 0)
3660Sstevel@tonic-gate return(-1);
3670Sstevel@tonic-gate curlp = lp;
3680Sstevel@tonic-gate ct = braelist[i] - braslist[i];
3690Sstevel@tonic-gate while (backref(i, lp))
3700Sstevel@tonic-gate lp += ct;
3710Sstevel@tonic-gate while (lp >= curlp) {
3720Sstevel@tonic-gate if (rv = advance(lp, ep))
3730Sstevel@tonic-gate return(rv);
3740Sstevel@tonic-gate lp -= ct;
3750Sstevel@tonic-gate }
3760Sstevel@tonic-gate continue;
3770Sstevel@tonic-gate
3780Sstevel@tonic-gate case CDOT|CSTAR:
3790Sstevel@tonic-gate curlp = lp;
3800Sstevel@tonic-gate while (*lp++)
3810Sstevel@tonic-gate ;
3820Sstevel@tonic-gate goto star;
3830Sstevel@tonic-gate
3840Sstevel@tonic-gate case CCHR|CSTAR:
3850Sstevel@tonic-gate curlp = lp;
3860Sstevel@tonic-gate while (*lp++ == *ep)
3870Sstevel@tonic-gate ;
3880Sstevel@tonic-gate ep++;
3890Sstevel@tonic-gate goto star;
3900Sstevel@tonic-gate
3910Sstevel@tonic-gate case CCL|CSTAR:
3920Sstevel@tonic-gate case NCCL|CSTAR:
3930Sstevel@tonic-gate curlp = lp;
3940Sstevel@tonic-gate while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR)))
3950Sstevel@tonic-gate ;
3960Sstevel@tonic-gate ep += *ep;
3970Sstevel@tonic-gate goto star;
3980Sstevel@tonic-gate
3990Sstevel@tonic-gate star:
4000Sstevel@tonic-gate do {
4010Sstevel@tonic-gate lp--;
4020Sstevel@tonic-gate if (rv = advance(lp, ep))
4030Sstevel@tonic-gate return(rv);
4040Sstevel@tonic-gate } while (lp > curlp);
4050Sstevel@tonic-gate return(0);
4060Sstevel@tonic-gate
4070Sstevel@tonic-gate default:
4080Sstevel@tonic-gate return(-1);
4090Sstevel@tonic-gate }
4100Sstevel@tonic-gate }
4110Sstevel@tonic-gate
412*722Smuffin static int
backref(int i,char * lp)413*722Smuffin backref(int i, char *lp)
4140Sstevel@tonic-gate {
415*722Smuffin char *bp;
416*722Smuffin struct re_globals *_re = re_globals;
4170Sstevel@tonic-gate
4180Sstevel@tonic-gate bp = braslist[i];
4190Sstevel@tonic-gate while (*bp++ == *lp++)
4200Sstevel@tonic-gate if (bp >= braelist[i])
421*722Smuffin return (1);
422*722Smuffin return (0);
4230Sstevel@tonic-gate }
4240Sstevel@tonic-gate
4250Sstevel@tonic-gate static int
cclass(char * set,char c,int af)426*722Smuffin cclass(char *set, char c, int af)
4270Sstevel@tonic-gate {
428*722Smuffin int n;
4290Sstevel@tonic-gate
4300Sstevel@tonic-gate if (c == 0)
4310Sstevel@tonic-gate return(0);
4320Sstevel@tonic-gate n = *set++;
4330Sstevel@tonic-gate while (--n)
4340Sstevel@tonic-gate if (*set++ == c)
435*722Smuffin return (af);
436*722Smuffin return (!af);
4370Sstevel@tonic-gate }
438