1 /*-
2 * Copyright (c) 1980, 1993
3 * The Regents of the University of California. All rights reserved.
4 *
5 * %sccs.include.proprietary.c%
6 */
7
8 #if defined(LIBC_SCCS) && !defined(lint)
9 static char sccsid[] = "@(#)regex.c 8.1 (Berkeley) 06/04/93";
10 #endif /* LIBC_SCCS and not lint */
11
12 #include <unistd.h>
13
14 static int advance();
15 static int backref();
16 static int cclass();
17
18 /*
19 * routines to do regular expression matching
20 *
21 * Entry points:
22 *
23 * re_comp(s)
24 * char *s;
25 * ... returns 0 if the string s was compiled successfully,
26 * a pointer to an error message otherwise.
27 * If passed 0 or a null string returns without changing
28 * the currently compiled re (see note 11 below).
29 *
30 * re_exec(s)
31 * char *s;
32 * ... returns 1 if the string s matches the last compiled regular
33 * expression,
34 * 0 if the string s failed to match the last compiled
35 * regular expression, and
36 * -1 if the compiled regular expression was invalid
37 * (indicating an internal error).
38 *
39 * The strings passed to both re_comp and re_exec may have trailing or
40 * embedded newline characters; they are terminated by nulls.
41 *
42 * The identity of the author of these routines is lost in antiquity;
43 * this is essentially the same as the re code in the original V6 ed.
44 *
45 * The regular expressions recognized are described below. This description
46 * is essentially the same as that for ed.
47 *
48 * A regular expression specifies a set of strings of characters.
49 * A member of this set of strings is said to be matched by
50 * the regular expression. In the following specification for
51 * regular expressions the word `character' means any character but NUL.
52 *
53 * 1. Any character except a special character matches itself.
54 * Special characters are the regular expression delimiter plus
55 * \ [ . and sometimes ^ * $.
56 * 2. A . matches any character.
57 * 3. A \ followed by any character except a digit or ( )
58 * matches that character.
59 * 4. A nonempty string s bracketed [s] (or [^s]) matches any
60 * character in (or not in) s. In s, \ has no special meaning,
61 * and ] may only appear as the first letter. A substring
62 * a-b, with a and b in ascending ASCII order, stands for
63 * the inclusive range of ASCII characters.
64 * 5. A regular expression of form 1-4 followed by * matches a
65 * sequence of 0 or more matches of the regular expression.
66 * 6. A regular expression, x, of form 1-8, bracketed \(x\)
67 * matches what x matches.
68 * 7. A \ followed by a digit n matches a copy of the string that the
69 * bracketed regular expression beginning with the nth \( matched.
70 * 8. A regular expression of form 1-8, x, followed by a regular
71 * expression of form 1-7, y matches a match for x followed by
72 * a match for y, with the x match being as long as possible
73 * while still permitting a y match.
74 * 9. A regular expression of form 1-8 preceded by ^ (or followed
75 * by $), is constrained to matches that begin at the left
76 * (or end at the right) end of a line.
77 * 10. A regular expression of form 1-9 picks out the longest among
78 * the leftmost matches in a line.
79 * 11. An empty regular expression stands for a copy of the last
80 * regular expression encountered.
81 */
82
83 /*
84 * constants for re's
85 */
86 #define CBRA 1
87 #define CCHR 2
88 #define CDOT 4
89 #define CCL 6
90 #define NCCL 8
91 #define CDOL 10
92 #define CEOF 11
93 #define CKET 12
94 #define CBACK 18
95
96 #define CSTAR 01
97
98 #define ESIZE 512
99 #define NBRA 9
100
101 static char expbuf[ESIZE], *braslist[NBRA], *braelist[NBRA];
102 static char circf;
103
104 /*
105 * compile the regular expression argument into a dfa
106 */
107 char *
re_comp(sp)108 re_comp(sp)
109 register const char *sp;
110 {
111 register int c;
112 register char *ep = expbuf;
113 int cclcnt, numbra = 0;
114 char *lastep = 0;
115 char bracket[NBRA];
116 char *bracketp = &bracket[0];
117 static char *retoolong = "Regular expression too long";
118
119 #define comerr(msg) {expbuf[0] = 0; numbra = 0; return(msg); }
120
121 if (sp == 0 || *sp == '\0') {
122 if (*ep == 0)
123 return("No previous regular expression");
124 return(0);
125 }
126 if (*sp == '^') {
127 circf = 1;
128 sp++;
129 }
130 else
131 circf = 0;
132 for (;;) {
133 if (ep >= &expbuf[ESIZE])
134 comerr(retoolong);
135 if ((c = *sp++) == '\0') {
136 if (bracketp != bracket)
137 comerr("unmatched \\(");
138 *ep++ = CEOF;
139 *ep++ = 0;
140 return(0);
141 }
142 if (c != '*')
143 lastep = ep;
144 switch (c) {
145
146 case '.':
147 *ep++ = CDOT;
148 continue;
149
150 case '*':
151 if (lastep == 0 || *lastep == CBRA || *lastep == CKET)
152 goto defchar;
153 *lastep |= CSTAR;
154 continue;
155
156 case '$':
157 if (*sp != '\0')
158 goto defchar;
159 *ep++ = CDOL;
160 continue;
161
162 case '[':
163 *ep++ = CCL;
164 *ep++ = 0;
165 cclcnt = 1;
166 if ((c = *sp++) == '^') {
167 c = *sp++;
168 ep[-2] = NCCL;
169 }
170 do {
171 if (c == '\0')
172 comerr("missing ]");
173 if (c == '-' && ep [-1] != 0) {
174 if ((c = *sp++) == ']') {
175 *ep++ = '-';
176 cclcnt++;
177 break;
178 }
179 while (ep[-1] < c) {
180 *ep = ep[-1] + 1;
181 ep++;
182 cclcnt++;
183 if (ep >= &expbuf[ESIZE])
184 comerr(retoolong);
185 }
186 }
187 *ep++ = c;
188 cclcnt++;
189 if (ep >= &expbuf[ESIZE])
190 comerr(retoolong);
191 } while ((c = *sp++) != ']');
192 lastep[1] = cclcnt;
193 continue;
194
195 case '\\':
196 if ((c = *sp++) == '(') {
197 if (numbra >= NBRA)
198 comerr("too many \\(\\) pairs");
199 *bracketp++ = numbra;
200 *ep++ = CBRA;
201 *ep++ = numbra++;
202 continue;
203 }
204 if (c == ')') {
205 if (bracketp <= bracket)
206 comerr("unmatched \\)");
207 *ep++ = CKET;
208 *ep++ = *--bracketp;
209 continue;
210 }
211 if (c >= '1' && c < ('1' + NBRA)) {
212 *ep++ = CBACK;
213 *ep++ = c - '1';
214 continue;
215 }
216 *ep++ = CCHR;
217 *ep++ = c;
218 continue;
219
220 defchar:
221 default:
222 *ep++ = CCHR;
223 *ep++ = c;
224 }
225 }
226 }
227
228 /*
229 * match the argument string against the compiled re
230 */
231 int
re_exec(p1)232 re_exec(p1)
233 register const char *p1;
234 {
235 register char *p2 = expbuf;
236 register int c;
237 int rv;
238
239 for (c = 0; c < NBRA; c++) {
240 braslist[c] = 0;
241 braelist[c] = 0;
242 }
243 if (circf)
244 return((advance(p1, p2)));
245 /*
246 * fast check for first character
247 */
248 if (*p2 == CCHR) {
249 c = p2[1];
250 do {
251 if (*p1 != c)
252 continue;
253 if (rv = advance(p1, p2))
254 return(rv);
255 } while (*p1++);
256 return(0);
257 }
258 /*
259 * regular algorithm
260 */
261 do
262 if (rv = advance(p1, p2))
263 return(rv);
264 while (*p1++);
265 return(0);
266 }
267
268 /*
269 * try to match the next thing in the dfa
270 */
271 static int
advance(lp,ep)272 advance(lp, ep)
273 register char *lp, *ep;
274 {
275 register char *curlp;
276 int ct, i;
277 int rv;
278
279 for (;;)
280 switch (*ep++) {
281
282 case CCHR:
283 if (*ep++ == *lp++)
284 continue;
285 return(0);
286
287 case CDOT:
288 if (*lp++)
289 continue;
290 return(0);
291
292 case CDOL:
293 if (*lp == '\0')
294 continue;
295 return(0);
296
297 case CEOF:
298 return(1);
299
300 case CCL:
301 if (cclass(ep, *lp++, 1)) {
302 ep += *ep;
303 continue;
304 }
305 return(0);
306
307 case NCCL:
308 if (cclass(ep, *lp++, 0)) {
309 ep += *ep;
310 continue;
311 }
312 return(0);
313
314 case CBRA:
315 braslist[*ep++] = lp;
316 continue;
317
318 case CKET:
319 braelist[*ep++] = lp;
320 continue;
321
322 case CBACK:
323 if (braelist[i = *ep++] == 0)
324 return(-1);
325 if (backref(i, lp)) {
326 lp += braelist[i] - braslist[i];
327 continue;
328 }
329 return(0);
330
331 case CBACK|CSTAR:
332 if (braelist[i = *ep++] == 0)
333 return(-1);
334 curlp = lp;
335 ct = braelist[i] - braslist[i];
336 while (backref(i, lp))
337 lp += ct;
338 while (lp >= curlp) {
339 if (rv = advance(lp, ep))
340 return(rv);
341 lp -= ct;
342 }
343 continue;
344
345 case CDOT|CSTAR:
346 curlp = lp;
347 while (*lp++)
348 ;
349 goto star;
350
351 case CCHR|CSTAR:
352 curlp = lp;
353 while (*lp++ == *ep)
354 ;
355 ep++;
356 goto star;
357
358 case CCL|CSTAR:
359 case NCCL|CSTAR:
360 curlp = lp;
361 while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR)))
362 ;
363 ep += *ep;
364 goto star;
365
366 star:
367 do {
368 lp--;
369 if (rv = advance(lp, ep))
370 return(rv);
371 } while (lp > curlp);
372 return(0);
373
374 default:
375 return(-1);
376 }
377 }
378
379 static
backref(i,lp)380 backref(i, lp)
381 register int i;
382 register char *lp;
383 {
384 register char *bp;
385
386 bp = braslist[i];
387 while (*bp++ == *lp++)
388 if (bp >= braelist[i])
389 return(1);
390 return(0);
391 }
392
393 static int
cclass(set,c,af)394 cclass(set, c, af)
395 register char *set, c;
396 int af;
397 {
398 register int n;
399
400 if (c == 0)
401 return(0);
402 n = *set++;
403 while (--n)
404 if (*set++ == c)
405 return(af);
406 return(! af);
407 }
408