xref: /plan9/sys/src/cmd/awk/re.c (revision 59cc4ca53493a3c6d2349fe2b7f7c40f7dce7294)
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4 
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14 
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24 
25 
26 #define DEBUG
27 #include <stdio.h>
28 #include <ctype.h>
29 #include <setjmp.h>
30 #include <math.h>
31 #include <string.h>
32 #include <stdlib.h>
33 #include <time.h>
34 #include "awk.h"
35 #include "y.tab.h"
36 #include "regexp.h"
37 
38 	/* This file provides the interface between the main body of
39 	 * awk and the pattern matching package.  It preprocesses
40 	 * patterns prior to compilation to provide awk-like semantics
41 	 * to character sequences not supported by the pattern package.
42 	 * The following conversions are performed:
43 	 *
44 	 *	"()"		->	"[]"
45 	 *	"[-"		->	"[\-"
46 	 *	"[^-"		->	"[^\-"
47 	 *	"-]"		->	"\-]"
48 	 *	"[]"		->	"[]*"
49 	 *	"\xdddd"	->	"\z" where 'z' is the UTF sequence
50 	 *					for the hex value
51 	 *	"\ddd"		->	"\o" where 'o' is a char octal value
52 	 *	"\b"		->	"\B"	where 'B' is backspace
53 	 *	"\t"		->	"\T"	where 'T' is tab
54 	 *	"\f"		->	"\F"	where 'F' is form feed
55 	 *	"\n"		->	"\N"	where 'N' is newline
56 	 *	"\r"		->	"\r"	where 'C' is cr
57 	 */
58 
59 #define	MAXRE	512
60 
61 static char	re[MAXRE];	/* copy buffer */
62 
63 char	*patbeg;
64 int	patlen;			/* number of chars in pattern */
65 
66 #define	NPATS	20		/* number of slots in pattern cache */
67 
68 static struct pat_list		/* dynamic pattern cache */
69 {
70 	char	*re;
71 	int	use;
72 	Reprog	*program;
73 } pattern[NPATS];
74 
75 static int npats;		/* cache fill level */
76 
77 	/* Compile a pattern */
78 void
compre(char * pat)79 *compre(char *pat)
80 {
81 	int i, j, inclass;
82 	char c, *p, *s;
83 	Reprog *program;
84 
85 	if (!compile_time) {	/* search cache for dynamic pattern */
86 		for (i = 0; i < npats; i++)
87 			if (!strcmp(pat, pattern[i].re)) {
88 				pattern[i].use++;
89 				return((void *) pattern[i].program);
90 			}
91 	}
92 		/* Preprocess Pattern for compilation */
93 	p = re;
94 	s = pat;
95 	inclass = 0;
96 	while (c = *s++) {
97 		if (c == '\\') {
98 			quoted(&s, &p, re+MAXRE);
99 			continue;
100 		}
101 		else if (!inclass && c == '(' && *s == ')') {
102 			if (p < re+MAXRE-2) {	/* '()' -> '[]*' */
103 				*p++ = '[';
104 				*p++ = ']';
105 				c = '*';
106 				s++;
107 			}
108 			else overflow();
109 		}
110 		else if (c == '['){			/* '[-' -> '[\-' */
111 			inclass = 1;
112 			if (*s == '-') {
113 				if (p < re+MAXRE-2) {
114 					*p++ = '[';
115 					*p++ = '\\';
116 					c = *s++;
117 				}
118 				else overflow();
119 			}				/* '[^-' -> '[^\-'*/
120 			else if (*s == '^' && s[1] == '-'){
121 				if (p < re+MAXRE-3) {
122 					*p++ = '[';
123 					*p++ = *s++;
124 					*p++ = '\\';
125 					c = *s++;
126 				}
127 				else overflow();
128 			}
129 			else if (*s == '['){		/* skip '[[' */
130 				if (p < re+MAXRE-1)
131 					*p++ = c;
132 				else overflow();
133 				c = *s++;
134 			}
135 			else if (*s == '^' && s[1] == '[') {	/* skip '[^['*/
136 				if (p < re+MAXRE-2) {
137 					*p++ = c;
138 					*p++ = *s++;
139 					c = *s++;
140 				}
141 				else overflow();
142 			}
143 			else if (*s == ']') {		/* '[]' -> '[]*' */
144 				if (p < re+MAXRE-2) {
145 					*p++ = c;
146 					*p++ = *s++;
147 					c = '*';
148 					inclass = 0;
149 				}
150 				else overflow();
151 			}
152 		}
153 		else if (c == '-' && *s == ']') {	/* '-]' -> '\-]' */
154 			if (p < re+MAXRE-1)
155 				*p++ = '\\';
156 			else overflow();
157 		}
158 		else if (c == ']')
159 			inclass = 0;
160 		if (p < re+MAXRE-1)
161 			*p++ = c;
162 		else overflow();
163 	}
164 	*p = 0;
165 	program = regcomp(re);		/* compile pattern */
166 	if (!compile_time) {
167 		if (npats < NPATS)	/* Room in cache */
168 			i = npats++;
169 		else {			/* Throw out least used */
170 			int use = pattern[0].use;
171 			i = 0;
172 			for (j = 1; j < NPATS; j++) {
173 				if (pattern[j].use < use) {
174 					use = pattern[j].use;
175 					i = j;
176 				}
177 			}
178 			xfree(pattern[i].program);
179 			xfree(pattern[i].re);
180 		}
181 		pattern[i].re = tostring(pat);
182 		pattern[i].program = program;
183 		pattern[i].use = 1;
184 	}
185 	return((void *) program);
186 }
187 
188 	/* T/F match indication - matched string not exported */
189 int
match(void * p,char * s,char *)190 match(void *p, char *s, char *)
191 {
192 	return regexec((Reprog *) p, (char *) s, 0, 0);
193 }
194 
195 	/* match and delimit the matched string */
196 int
pmatch(void * p,char * s,char * start)197 pmatch(void *p, char *s, char *start)
198 {
199 	Resub m;
200 
201 	m.s.sp = start;
202 	m.e.ep = 0;
203 	if (regexec((Reprog *) p, (char *) s, &m, 1)) {
204 		patbeg = m.s.sp;
205 		patlen = m.e.ep-m.s.sp;
206 		return 1;
207 	}
208 	patlen = -1;
209 	patbeg = start;
210 	return 0;
211 }
212 
213 	/* perform a non-empty match */
214 int
nematch(void * p,char * s,char * start)215 nematch(void *p, char *s, char *start)
216 {
217 	if (pmatch(p, s, start) == 1 && patlen > 0)
218 		return 1;
219 	patlen = -1;
220 	patbeg = start;
221 	return 0;
222 }
223 /* in the parsing of regular expressions, metacharacters like . have */
224 /* to be seen literally;  \056 is not a metacharacter. */
225 
hexstr(char ** pp)226 hexstr(char **pp)	/* find and eval hex string at pp, return new p */
227 {
228 	char c;
229 	int n = 0;
230 	int i;
231 
232 	for (i = 0, c = (*pp)[i]; i < 4 && isxdigit(c); i++, c = (*pp)[i]) {
233 		if (isdigit(c))
234 			n = 16 * n + c - '0';
235 		else if ('a' <= c && c <= 'f')
236 			n = 16 * n + c - 'a' + 10;
237 		else if ('A' <= c && c <= 'F')
238 			n = 16 * n + c - 'A' + 10;
239 	}
240 	*pp += i;
241 	return n;
242 }
243 
244 	/* look for awk-specific escape sequences */
245 
246 #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */
247 
248 void
quoted(char ** s,char ** to,char * end)249 quoted(char **s, char **to, char *end)	/* handle escaped sequence */
250 {
251 	char *p = *s;
252 	char *t = *to;
253 	wchar_t c;
254 
255 	switch(c = *p++) {
256 	case 't':
257 		c = '\t';
258 		break;
259 	case 'n':
260 		c = '\n';
261 		break;
262 	case 'f':
263 		c = '\f';
264 		break;
265 	case 'r':
266 		c = '\r';
267 		break;
268 	case 'b':
269 		c = '\b';
270 		break;
271 	default:
272 		if (t < end-1)		/* all else must be escaped */
273 			*t++ = '\\';
274 		if (c == 'x') {		/* hexadecimal goo follows */
275 			c = hexstr(&p);
276 			if (t < end-MB_CUR_MAX)
277 				t += wctomb(t, c);
278 			else overflow();
279 			*to = t;
280 			*s = p;
281 			return;
282 		} else if (isoctdigit(c)) {	/* \d \dd \ddd */
283 			c -= '0';
284 			if (isoctdigit(*p)) {
285 				c = 8 * c + *p++ - '0';
286 				if (isoctdigit(*p))
287 					c = 8 * c + *p++ - '0';
288 			}
289 		}
290 		break;
291 	}
292 	if (t < end-1)
293 		*t++ = c;
294 	*s = p;
295 	*to = t;
296 }
297 	/* count rune positions */
298 int
countposn(char * s,int n)299 countposn(char *s, int n)
300 {
301 	int i, j;
302 	char *end;
303 
304 	for (i = 0, end = s+n; *s && s < end; i++){
305 		j = mblen(s, n);
306 		if(j <= 0)
307 			j = 1;
308 		s += j;
309 	}
310 	return(i);
311 }
312 
313 	/* pattern package error handler */
314 
315 void
regerror(char * s)316 regerror(char *s)
317 {
318 	FATAL("%s", s);
319 }
320 
321 void
overflow(void)322 overflow(void)
323 {
324 	FATAL("%s", "regular expression too big");
325 }
326