1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24
25
26 #define DEBUG
27 #include <stdio.h>
28 #include <ctype.h>
29 #include <setjmp.h>
30 #include <math.h>
31 #include <string.h>
32 #include <stdlib.h>
33 #include <time.h>
34 #include "awk.h"
35 #include "y.tab.h"
36 #include "regexp.h"
37
38 /* This file provides the interface between the main body of
39 * awk and the pattern matching package. It preprocesses
40 * patterns prior to compilation to provide awk-like semantics
41 * to character sequences not supported by the pattern package.
42 * The following conversions are performed:
43 *
44 * "()" -> "[]"
45 * "[-" -> "[\-"
46 * "[^-" -> "[^\-"
47 * "-]" -> "\-]"
48 * "[]" -> "[]*"
49 * "\xdddd" -> "\z" where 'z' is the UTF sequence
50 * for the hex value
51 * "\ddd" -> "\o" where 'o' is a char octal value
52 * "\b" -> "\B" where 'B' is backspace
53 * "\t" -> "\T" where 'T' is tab
54 * "\f" -> "\F" where 'F' is form feed
55 * "\n" -> "\N" where 'N' is newline
56 * "\r" -> "\r" where 'C' is cr
57 */
58
59 #define MAXRE 512
60
61 static char re[MAXRE]; /* copy buffer */
62
63 char *patbeg;
64 int patlen; /* number of chars in pattern */
65
66 #define NPATS 20 /* number of slots in pattern cache */
67
68 static struct pat_list /* dynamic pattern cache */
69 {
70 char *re;
71 int use;
72 Reprog *program;
73 } pattern[NPATS];
74
75 static int npats; /* cache fill level */
76
77 /* Compile a pattern */
78 void
compre(char * pat)79 *compre(char *pat)
80 {
81 int i, j, inclass;
82 char c, *p, *s;
83 Reprog *program;
84
85 if (!compile_time) { /* search cache for dynamic pattern */
86 for (i = 0; i < npats; i++)
87 if (!strcmp(pat, pattern[i].re)) {
88 pattern[i].use++;
89 return((void *) pattern[i].program);
90 }
91 }
92 /* Preprocess Pattern for compilation */
93 p = re;
94 s = pat;
95 inclass = 0;
96 while (c = *s++) {
97 if (c == '\\') {
98 quoted(&s, &p, re+MAXRE);
99 continue;
100 }
101 else if (!inclass && c == '(' && *s == ')') {
102 if (p < re+MAXRE-2) { /* '()' -> '[]*' */
103 *p++ = '[';
104 *p++ = ']';
105 c = '*';
106 s++;
107 }
108 else overflow();
109 }
110 else if (c == '['){ /* '[-' -> '[\-' */
111 inclass = 1;
112 if (*s == '-') {
113 if (p < re+MAXRE-2) {
114 *p++ = '[';
115 *p++ = '\\';
116 c = *s++;
117 }
118 else overflow();
119 } /* '[^-' -> '[^\-'*/
120 else if (*s == '^' && s[1] == '-'){
121 if (p < re+MAXRE-3) {
122 *p++ = '[';
123 *p++ = *s++;
124 *p++ = '\\';
125 c = *s++;
126 }
127 else overflow();
128 }
129 else if (*s == '['){ /* skip '[[' */
130 if (p < re+MAXRE-1)
131 *p++ = c;
132 else overflow();
133 c = *s++;
134 }
135 else if (*s == '^' && s[1] == '[') { /* skip '[^['*/
136 if (p < re+MAXRE-2) {
137 *p++ = c;
138 *p++ = *s++;
139 c = *s++;
140 }
141 else overflow();
142 }
143 else if (*s == ']') { /* '[]' -> '[]*' */
144 if (p < re+MAXRE-2) {
145 *p++ = c;
146 *p++ = *s++;
147 c = '*';
148 inclass = 0;
149 }
150 else overflow();
151 }
152 }
153 else if (c == '-' && *s == ']') { /* '-]' -> '\-]' */
154 if (p < re+MAXRE-1)
155 *p++ = '\\';
156 else overflow();
157 }
158 else if (c == ']')
159 inclass = 0;
160 if (p < re+MAXRE-1)
161 *p++ = c;
162 else overflow();
163 }
164 *p = 0;
165 program = regcomp(re); /* compile pattern */
166 if (!compile_time) {
167 if (npats < NPATS) /* Room in cache */
168 i = npats++;
169 else { /* Throw out least used */
170 int use = pattern[0].use;
171 i = 0;
172 for (j = 1; j < NPATS; j++) {
173 if (pattern[j].use < use) {
174 use = pattern[j].use;
175 i = j;
176 }
177 }
178 xfree(pattern[i].program);
179 xfree(pattern[i].re);
180 }
181 pattern[i].re = tostring(pat);
182 pattern[i].program = program;
183 pattern[i].use = 1;
184 }
185 return((void *) program);
186 }
187
188 /* T/F match indication - matched string not exported */
189 int
match(void * p,char * s,char *)190 match(void *p, char *s, char *)
191 {
192 return regexec((Reprog *) p, (char *) s, 0, 0);
193 }
194
195 /* match and delimit the matched string */
196 int
pmatch(void * p,char * s,char * start)197 pmatch(void *p, char *s, char *start)
198 {
199 Resub m;
200
201 m.s.sp = start;
202 m.e.ep = 0;
203 if (regexec((Reprog *) p, (char *) s, &m, 1)) {
204 patbeg = m.s.sp;
205 patlen = m.e.ep-m.s.sp;
206 return 1;
207 }
208 patlen = -1;
209 patbeg = start;
210 return 0;
211 }
212
213 /* perform a non-empty match */
214 int
nematch(void * p,char * s,char * start)215 nematch(void *p, char *s, char *start)
216 {
217 if (pmatch(p, s, start) == 1 && patlen > 0)
218 return 1;
219 patlen = -1;
220 patbeg = start;
221 return 0;
222 }
223 /* in the parsing of regular expressions, metacharacters like . have */
224 /* to be seen literally; \056 is not a metacharacter. */
225
hexstr(char ** pp)226 hexstr(char **pp) /* find and eval hex string at pp, return new p */
227 {
228 char c;
229 int n = 0;
230 int i;
231
232 for (i = 0, c = (*pp)[i]; i < 4 && isxdigit(c); i++, c = (*pp)[i]) {
233 if (isdigit(c))
234 n = 16 * n + c - '0';
235 else if ('a' <= c && c <= 'f')
236 n = 16 * n + c - 'a' + 10;
237 else if ('A' <= c && c <= 'F')
238 n = 16 * n + c - 'A' + 10;
239 }
240 *pp += i;
241 return n;
242 }
243
244 /* look for awk-specific escape sequences */
245
246 #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */
247
248 void
quoted(char ** s,char ** to,char * end)249 quoted(char **s, char **to, char *end) /* handle escaped sequence */
250 {
251 char *p = *s;
252 char *t = *to;
253 wchar_t c;
254
255 switch(c = *p++) {
256 case 't':
257 c = '\t';
258 break;
259 case 'n':
260 c = '\n';
261 break;
262 case 'f':
263 c = '\f';
264 break;
265 case 'r':
266 c = '\r';
267 break;
268 case 'b':
269 c = '\b';
270 break;
271 default:
272 if (t < end-1) /* all else must be escaped */
273 *t++ = '\\';
274 if (c == 'x') { /* hexadecimal goo follows */
275 c = hexstr(&p);
276 if (t < end-MB_CUR_MAX)
277 t += wctomb(t, c);
278 else overflow();
279 *to = t;
280 *s = p;
281 return;
282 } else if (isoctdigit(c)) { /* \d \dd \ddd */
283 c -= '0';
284 if (isoctdigit(*p)) {
285 c = 8 * c + *p++ - '0';
286 if (isoctdigit(*p))
287 c = 8 * c + *p++ - '0';
288 }
289 }
290 break;
291 }
292 if (t < end-1)
293 *t++ = c;
294 *s = p;
295 *to = t;
296 }
297 /* count rune positions */
298 int
countposn(char * s,int n)299 countposn(char *s, int n)
300 {
301 int i, j;
302 char *end;
303
304 for (i = 0, end = s+n; *s && s < end; i++){
305 j = mblen(s, n);
306 if(j <= 0)
307 j = 1;
308 s += j;
309 }
310 return(i);
311 }
312
313 /* pattern package error handler */
314
315 void
regerror(char * s)316 regerror(char *s)
317 {
318 FATAL("%s", s);
319 }
320
321 void
overflow(void)322 overflow(void)
323 {
324 FATAL("%s", "regular expression too big");
325 }
326