1 /* 2 Copyright (c) 1989 AT&T 3 All Rights Reserved 4 5 THIS IS UNPUBLISHED PROPRIETARY SOURCE CODE OF AT&T. 6 7 The copyright notice above does not evidence any 8 actual or intended publication of such source code. 9 */ 10 11 #define DEBUG 12 #include <stdio.h> 13 #include <ctype.h> 14 #include <setjmp.h> 15 #include <math.h> 16 #include <string.h> 17 #include <stdlib.h> 18 #include <time.h> 19 #include "awk.h" 20 #include "y.tab.h" 21 #include "regexp.h" 22 23 /* This file provides the interface between the main body of 24 * awk and the pattern matching package. It preprocesses 25 * patterns prior to compilation to provide awk-like semantics 26 * to character sequences not supported by the pattern package. 27 * The following conversions are performed: 28 * 29 * "()" -> "[]" 30 * "[-" -> "[\-" 31 * "[^-" -> "[^\-" 32 * "-]" -> "\-]" 33 * "[]" -> "[]*" 34 * "\xdddd" -> "\z" where 'z' is the UTF sequence 35 * for the hex value 36 * "\ddd" -> "\o" where 'o' is a char octal value 37 * "\b" -> "\B" where 'B' is backspace 38 * "\t" -> "\T" where 'T' is tab 39 * "\f" -> "\F" where 'F' is form feed 40 * "\n" -> "\N" where 'N' is newline 41 * "\r" -> "\r" where 'C' is cr 42 */ 43 44 #define MAXRE 512 45 46 static uchar re[MAXRE]; /* copy buffer */ 47 48 uchar *patbeg; 49 int patlen; /* number of chars in pattern */ 50 51 #define NPATS 20 /* number of slots in pattern cache */ 52 53 static struct pat_list /* dynamic pattern cache */ 54 { 55 char *re; 56 int use; 57 Reprog *program; 58 } pattern[NPATS]; 59 60 static int npats; /* cache fill level */ 61 62 /* Compile a pattern */ 63 void 64 *compre(uchar *pat) 65 { 66 int i, j, inclass; 67 uchar c, *p, val, *s; 68 Reprog *program; 69 70 if (!compile_time) { /* search cache for dynamic pattern */ 71 for (i = 0; i < npats; i++) 72 if (!strcmp(pat, pattern[i].re)) { 73 pattern[i].use++; 74 return((void *) pattern[i].program); 75 } 76 } 77 /* Preprocess Pattern for compilation */ 78 p = re; 79 s = pat; 80 inclass = 0; 81 while (c = *s++) { 82 if (c == '\\') { 83 quoted(&s, &p, re+MAXRE); 84 continue; 85 } 86 else if (!inclass && c == '(' && *s == ')') { 87 if (p < re+MAXRE-2) { /* '()' -> '[]*' */ 88 *p++ = '['; 89 *p++ = ']'; 90 c = '*'; 91 s++; 92 } 93 else overflow(); 94 } 95 else if (c == '['){ /* '[-' -> '[\-' */ 96 inclass = 1; 97 if (*s == '-') { 98 if (p < re+MAXRE-2) { 99 *p++ = '['; 100 *p++ = '\\'; 101 c = *s++; 102 } 103 else overflow(); 104 } /* '[^-' -> '[^\-'*/ 105 else if (*s == '^' && s[1] == '-'){ 106 if (p < re+MAXRE-3) { 107 *p++ = '['; 108 *p++ = *s++; 109 *p++ = '\\'; 110 c = *s++; 111 } 112 else overflow(); 113 } 114 else if (*s == '['){ /* skip '[[' */ 115 if (p < re+MAXRE-1) 116 *p++ = c; 117 else overflow(); 118 c = *s++; 119 } 120 else if (*s == '^' && s[1] == '[') { /* skip '[^['*/ 121 if (p < re+MAXRE-2) { 122 *p++ = c; 123 *p++ = *s++; 124 c = *s++; 125 } 126 else overflow(); 127 } 128 else if (*s == ']') { /* '[]' -> '[]*' */ 129 if (p < re+MAXRE-2) { 130 *p++ = c; 131 *p++ = *s++; 132 c = '*'; 133 inclass = 0; 134 } 135 else overflow(); 136 } 137 } 138 else if (c == '-' && *s == ']') { /* '-]' -> '\-]' */ 139 if (p < re+MAXRE-1) 140 *p++ = '\\'; 141 else overflow(); 142 } 143 else if (c == ']') 144 inclass = 0; 145 if (p < re+MAXRE-1) 146 *p++ = c; 147 else overflow(); 148 } 149 *p = 0; 150 program = regcomp(re); /* compile pattern */ 151 if (!compile_time) { 152 if (npats < NPATS) /* Room in cache */ 153 i = npats++; 154 else { /* Throw out least used */ 155 int use = pattern[0].use; 156 i = 0; 157 for (j = 1; j < NPATS; j++) { 158 if (pattern[j].use < use) { 159 use = pattern[j].use; 160 i = j; 161 } 162 } 163 xfree(pattern[i].program); 164 xfree(pattern[i].re); 165 } 166 pattern[i].re = tostring(pat); 167 pattern[i].program = program; 168 pattern[i++].use = 1; 169 } 170 return((void *) program); 171 } 172 173 /* T/F match indication - matched string not exported */ 174 int 175 match(void *p, uchar *s, uchar *start) 176 { 177 return regexec((Reprog *) p, (char *) s, 0, 0); 178 } 179 180 /* match and delimit the matched string */ 181 int 182 pmatch(void *p, uchar *s, uchar *start) 183 { 184 Resub m; 185 186 m.s.sp = start; 187 m.e.ep = 0; 188 if (regexec((Reprog *) p, (char *) s, &m, 1)) { 189 patbeg = m.s.sp; 190 patlen = m.e.ep-m.s.sp; 191 return 1; 192 } 193 patlen = -1; 194 patbeg = start; 195 return 0; 196 } 197 198 /* perform a non-empty match */ 199 int 200 nematch(void *p, uchar *s, uchar *start) 201 { 202 if (pmatch(p, s, start) == 1 && patlen > 0) 203 return 1; 204 patlen = -1; 205 patbeg = start; 206 return 0; 207 } 208 /* in the parsing of regular expressions, metacharacters like . have */ 209 /* to be seen literally; \056 is not a metacharacter. */ 210 211 hexstr(char **pp) /* find and eval hex string at pp, return new p */ 212 { 213 char c; 214 int n = 0; 215 int i; 216 217 for (i = 0, c = (*pp)[i]; i < 4 && isxdigit(c); i++, c = (*pp)[i]) { 218 if (isdigit(c)) 219 n = 16 * n + c - '0'; 220 else if ('a' <= c && c <= 'f') 221 n = 16 * n + c - 'a' + 10; 222 else if ('A' <= c && c <= 'F') 223 n = 16 * n + c - 'A' + 10; 224 } 225 *pp += i; 226 return n; 227 } 228 229 /* look for awk-specific escape sequences */ 230 231 #define isoctdigit(c) ((c) >= '0' && (c) <= '8') /* multiple use of arg */ 232 233 void 234 quoted(uchar **s, uchar **to, uchar *end) /* handle escaped sequence */ 235 { 236 char *p = *s; 237 char *t = *to; 238 wchar_t c; 239 240 switch(c = *p++) { 241 case 't': 242 c = '\t'; 243 break; 244 case 'n': 245 c = '\n'; 246 break; 247 case 'f': 248 c = '\f'; 249 break; 250 case 'r': 251 c = '\r'; 252 break; 253 case 'b': 254 c = '\b'; 255 break; 256 default: 257 if (t < end-1) /* all else must be escaped */ 258 *t++ = '\\'; 259 if (c == 'x') { /* hexadecimal goo follows */ 260 c = hexstr(&p); 261 if (t < end-MB_CUR_MAX) 262 t += wctomb(t, c); 263 else overflow(); 264 *to = t; 265 *s = p; 266 return; 267 } else if (isoctdigit(c)) { /* \d \dd \ddd */ 268 c -= '0'; 269 if (isoctdigit(*p)) { 270 c = 8 * c + *p++ - '0'; 271 if (isoctdigit(*p)) 272 c = 8 * c + *p++ - '0'; 273 } 274 } 275 break; 276 } 277 if (t < end-1) 278 *t++ = c; 279 *s = p; 280 *to = t; 281 } 282 /* count rune positions */ 283 int 284 countposn(uchar *s, int n) 285 { 286 int i; 287 uchar *end; 288 289 for (i = 0, end = s+n; *s && s < end; i++) 290 s += mblen(s, n); 291 return(i); 292 } 293 294 /* pattern package error handler */ 295 296 void 297 regerror(char *s) 298 { 299 ERROR "%s", s FATAL; 300 } 301 302 void 303 overflow(void) 304 { 305 ERROR "%s", "regular expression too big" FATAL; 306 } 307