xref: /plan9-contrib/sys/src/cmd/awk/re.c (revision 219b2ee8daee37f4aad58d63f21287faa8e4ffdc)
1 /*
2 Copyright (c) 1989 AT&T
3 	All Rights Reserved
4 
5 THIS IS UNPUBLISHED PROPRIETARY SOURCE CODE OF AT&T.
6 
7 The copyright notice above does not evidence any
8 actual or intended publication of such source code.
9 */
10 
11 #define DEBUG
12 #include <stdio.h>
13 #include <ctype.h>
14 #include <setjmp.h>
15 #include <math.h>
16 #include <string.h>
17 #include <stdlib.h>
18 #include <time.h>
19 #include "awk.h"
20 #include "y.tab.h"
21 #include "regexp.h"
22 
23 	/* This file provides the interface between the main body of
24 	 * awk and the pattern matching package.  It preprocesses
25 	 * patterns prior to compilation to provide awk-like semantics
26 	 * to character sequences not supported by the pattern package.
27 	 * The following conversions are performed:
28 	 *
29 	 *	"()"		->	"[]"
30 	 *	"[-"		->	"[\-"
31 	 *	"[^-"		->	"[^\-"
32 	 *	"-]"		->	"\-]"
33 	 *	"[]"		->	"[]*"
34 	 *	"\xdddd"	->	"\z" where 'z' is the UTF sequence
35 	 *					for the hex value
36 	 *	"\ddd"		->	"\o" where 'o' is a char octal value
37 	 *	"\b"		->	"\B"	where 'B' is backspace
38 	 *	"\t"		->	"\T"	where 'T' is tab
39 	 *	"\f"		->	"\F"	where 'F' is form feed
40 	 *	"\n"		->	"\N"	where 'N' is newline
41 	 *	"\r"		->	"\r"	where 'C' is cr
42 	 */
43 
44 #define	MAXRE	512
45 
46 static uchar	re[MAXRE];	/* copy buffer */
47 
48 uchar	*patbeg;
49 int	patlen;			/* number of chars in pattern */
50 
51 #define	NPATS	20		/* number of slots in pattern cache */
52 
53 static struct pat_list		/* dynamic pattern cache */
54 {
55 	char	*re;
56 	int	use;
57 	Reprog	*program;
58 } pattern[NPATS];
59 
60 static int npats;		/* cache fill level */
61 
62 	/* Compile a pattern */
63 void
64 *compre(uchar *pat)
65 {
66 	int i, j, inclass;
67 	uchar c, *p, val, *s;
68 	Reprog *program;
69 
70 	if (!compile_time) {	/* search cache for dynamic pattern */
71 		for (i = 0; i < npats; i++)
72 			if (!strcmp(pat, pattern[i].re)) {
73 				pattern[i].use++;
74 				return((void *) pattern[i].program);
75 			}
76 	}
77 		/* Preprocess Pattern for compilation */
78 	p = re;
79 	s = pat;
80 	inclass = 0;
81 	while (c = *s++) {
82 		if (c == '\\') {
83 			quoted(&s, &p, re+MAXRE);
84 			continue;
85 		}
86 		else if (!inclass && c == '(' && *s == ')') {
87 			if (p < re+MAXRE-2) {	/* '()' -> '[]*' */
88 				*p++ = '[';
89 				*p++ = ']';
90 				c = '*';
91 				s++;
92 			}
93 			else overflow();
94 		}
95 		else if (c == '['){			/* '[-' -> '[\-' */
96 			inclass = 1;
97 			if (*s == '-') {
98 				if (p < re+MAXRE-2) {
99 					*p++ = '[';
100 					*p++ = '\\';
101 					c = *s++;
102 				}
103 				else overflow();
104 			}				/* '[^-' -> '[^\-'*/
105 			else if (*s == '^' && s[1] == '-'){
106 				if (p < re+MAXRE-3) {
107 					*p++ = '[';
108 					*p++ = *s++;
109 					*p++ = '\\';
110 					c = *s++;
111 				}
112 				else overflow();
113 			}
114 			else if (*s == '['){		/* skip '[[' */
115 				if (p < re+MAXRE-1)
116 					*p++ = c;
117 				else overflow();
118 				c = *s++;
119 			}
120 			else if (*s == '^' && s[1] == '[') {	/* skip '[^['*/
121 				if (p < re+MAXRE-2) {
122 					*p++ = c;
123 					*p++ = *s++;
124 					c = *s++;
125 				}
126 				else overflow();
127 			}
128 			else if (*s == ']') {		/* '[]' -> '[]*' */
129 				if (p < re+MAXRE-2) {
130 					*p++ = c;
131 					*p++ = *s++;
132 					c = '*';
133 					inclass = 0;
134 				}
135 				else overflow();
136 			}
137 		}
138 		else if (c == '-' && *s == ']') {	/* '-]' -> '\-]' */
139 			if (p < re+MAXRE-1)
140 				*p++ = '\\';
141 			else overflow();
142 		}
143 		else if (c == ']')
144 			inclass = 0;
145 		if (p < re+MAXRE-1)
146 			*p++ = c;
147 		else overflow();
148 	}
149 	*p = 0;
150 	program = regcomp(re);		/* compile pattern */
151 	if (!compile_time) {
152 		if (npats < NPATS)	/* Room in cache */
153 			i = npats++;
154 		else {			/* Throw out least used */
155 			int use = pattern[0].use;
156 			i = 0;
157 			for (j = 1; j < NPATS; j++) {
158 				if (pattern[j].use < use) {
159 					use = pattern[j].use;
160 					i = j;
161 				}
162 			}
163 			xfree(pattern[i].program);
164 			xfree(pattern[i].re);
165 		}
166 		pattern[i].re = tostring(pat);
167 		pattern[i].program = program;
168 		pattern[i++].use = 1;
169 	}
170 	return((void *) program);
171 }
172 
173 	/* T/F match indication - matched string not exported */
174 int
175 match(void *p, uchar *s, uchar *start)
176 {
177 	return regexec((Reprog *) p, (char *) s, 0, 0);
178 }
179 
180 	/* match and delimit the matched string */
181 int
182 pmatch(void *p, uchar *s, uchar *start)
183 {
184 	Resub m;
185 
186 	m.s.sp = start;
187 	m.e.ep = 0;
188 	if (regexec((Reprog *) p, (char *) s, &m, 1)) {
189 		patbeg = m.s.sp;
190 		patlen = m.e.ep-m.s.sp;
191 		return 1;
192 	}
193 	patlen = -1;
194 	patbeg = start;
195 	return 0;
196 }
197 
198 	/* perform a non-empty match */
199 int
200 nematch(void *p, uchar *s, uchar *start)
201 {
202 	if (pmatch(p, s, start) == 1 && patlen > 0)
203 		return 1;
204 	patlen = -1;
205 	patbeg = start;
206 	return 0;
207 }
208 /* in the parsing of regular expressions, metacharacters like . have */
209 /* to be seen literally;  \056 is not a metacharacter. */
210 
211 hexstr(char **pp)	/* find and eval hex string at pp, return new p */
212 {
213 	char c;
214 	int n = 0;
215 	int i;
216 
217 	for (i = 0, c = (*pp)[i]; i < 4 && isxdigit(c); i++, c = (*pp)[i]) {
218 		if (isdigit(c))
219 			n = 16 * n + c - '0';
220 		else if ('a' <= c && c <= 'f')
221 			n = 16 * n + c - 'a' + 10;
222 		else if ('A' <= c && c <= 'F')
223 			n = 16 * n + c - 'A' + 10;
224 	}
225 	*pp += i;
226 	return n;
227 }
228 
229 	/* look for awk-specific escape sequences */
230 
231 #define isoctdigit(c) ((c) >= '0' && (c) <= '8') /* multiple use of arg */
232 
233 void
234 quoted(uchar **s, uchar **to, uchar *end)	/* handle escaped sequence */
235 {
236 	char *p = *s;
237 	char *t = *to;
238 	wchar_t c;
239 
240 	switch(c = *p++) {
241 	case 't':
242 		c = '\t';
243 		break;
244 	case 'n':
245 		c = '\n';
246 		break;
247 	case 'f':
248 		c = '\f';
249 		break;
250 	case 'r':
251 		c = '\r';
252 		break;
253 	case 'b':
254 		c = '\b';
255 		break;
256 	default:
257 		if (t < end-1)		/* all else must be escaped */
258 			*t++ = '\\';
259 		if (c == 'x') {		/* hexadecimal goo follows */
260 			c = hexstr(&p);
261 			if (t < end-MB_CUR_MAX)
262 				t += wctomb(t, c);
263 			else overflow();
264 			*to = t;
265 			*s = p;
266 			return;
267 		} else if (isoctdigit(c)) {	/* \d \dd \ddd */
268 			c -= '0';
269 			if (isoctdigit(*p)) {
270 				c = 8 * c + *p++ - '0';
271 				if (isoctdigit(*p))
272 					c = 8 * c + *p++ - '0';
273 			}
274 		}
275 		break;
276 	}
277 	if (t < end-1)
278 		*t++ = c;
279 	*s = p;
280 	*to = t;
281 }
282 	/* count rune positions */
283 int
284 countposn(uchar *s, int n)
285 {
286 	int i;
287 	uchar *end;
288 
289 	for (i = 0, end = s+n; *s && s < end; i++)
290 		s += mblen(s, n);
291 	return(i);
292 }
293 
294 	/* pattern package error handler */
295 
296 void
297 regerror(char *s)
298 {
299 	ERROR "%s", s FATAL;
300 }
301 
302 void
303 overflow(void)
304 {
305 	ERROR "%s", "regular expression too big" FATAL;
306 }
307