17dd7cddfSDavid du Colombier /****************************************************************
27dd7cddfSDavid du Colombier Copyright (C) Lucent Technologies 1997
33e12c5d1SDavid du Colombier All Rights Reserved
43e12c5d1SDavid du Colombier
57dd7cddfSDavid du Colombier Permission to use, copy, modify, and distribute this software and
67dd7cddfSDavid du Colombier its documentation for any purpose and without fee is hereby
77dd7cddfSDavid du Colombier granted, provided that the above copyright notice appear in all
87dd7cddfSDavid du Colombier copies and that both that the copyright notice and this
97dd7cddfSDavid du Colombier permission notice and warranty disclaimer appear in supporting
107dd7cddfSDavid du Colombier documentation, and that the name Lucent Technologies or any of
117dd7cddfSDavid du Colombier its entities not be used in advertising or publicity pertaining
127dd7cddfSDavid du Colombier to distribution of the software without specific, written prior
137dd7cddfSDavid du Colombier permission.
143e12c5d1SDavid du Colombier
157dd7cddfSDavid du Colombier LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
167dd7cddfSDavid du Colombier INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
177dd7cddfSDavid du Colombier IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
187dd7cddfSDavid du Colombier SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
197dd7cddfSDavid du Colombier WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
207dd7cddfSDavid du Colombier IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
217dd7cddfSDavid du Colombier ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
227dd7cddfSDavid du Colombier THIS SOFTWARE.
237dd7cddfSDavid du Colombier ****************************************************************/
247dd7cddfSDavid du Colombier
253e12c5d1SDavid du Colombier
263e12c5d1SDavid du Colombier #define DEBUG
273e12c5d1SDavid du Colombier #include <stdio.h>
283e12c5d1SDavid du Colombier #include <ctype.h>
293e12c5d1SDavid du Colombier #include <setjmp.h>
303e12c5d1SDavid du Colombier #include <math.h>
313e12c5d1SDavid du Colombier #include <string.h>
323e12c5d1SDavid du Colombier #include <stdlib.h>
333e12c5d1SDavid du Colombier #include <time.h>
343e12c5d1SDavid du Colombier #include "awk.h"
353e12c5d1SDavid du Colombier #include "y.tab.h"
363e12c5d1SDavid du Colombier #include "regexp.h"
373e12c5d1SDavid du Colombier
383e12c5d1SDavid du Colombier /* This file provides the interface between the main body of
393e12c5d1SDavid du Colombier * awk and the pattern matching package. It preprocesses
403e12c5d1SDavid du Colombier * patterns prior to compilation to provide awk-like semantics
413e12c5d1SDavid du Colombier * to character sequences not supported by the pattern package.
423e12c5d1SDavid du Colombier * The following conversions are performed:
433e12c5d1SDavid du Colombier *
443e12c5d1SDavid du Colombier * "()" -> "[]"
453e12c5d1SDavid du Colombier * "[-" -> "[\-"
463e12c5d1SDavid du Colombier * "[^-" -> "[^\-"
473e12c5d1SDavid du Colombier * "-]" -> "\-]"
483e12c5d1SDavid du Colombier * "[]" -> "[]*"
493e12c5d1SDavid du Colombier * "\xdddd" -> "\z" where 'z' is the UTF sequence
503e12c5d1SDavid du Colombier * for the hex value
513e12c5d1SDavid du Colombier * "\ddd" -> "\o" where 'o' is a char octal value
523e12c5d1SDavid du Colombier * "\b" -> "\B" where 'B' is backspace
533e12c5d1SDavid du Colombier * "\t" -> "\T" where 'T' is tab
543e12c5d1SDavid du Colombier * "\f" -> "\F" where 'F' is form feed
553e12c5d1SDavid du Colombier * "\n" -> "\N" where 'N' is newline
563e12c5d1SDavid du Colombier * "\r" -> "\r" where 'C' is cr
573e12c5d1SDavid du Colombier */
583e12c5d1SDavid du Colombier
593e12c5d1SDavid du Colombier #define MAXRE 512
603e12c5d1SDavid du Colombier
617dd7cddfSDavid du Colombier static char re[MAXRE]; /* copy buffer */
623e12c5d1SDavid du Colombier
637dd7cddfSDavid du Colombier char *patbeg;
643e12c5d1SDavid du Colombier int patlen; /* number of chars in pattern */
653e12c5d1SDavid du Colombier
663e12c5d1SDavid du Colombier #define NPATS 20 /* number of slots in pattern cache */
673e12c5d1SDavid du Colombier
683e12c5d1SDavid du Colombier static struct pat_list /* dynamic pattern cache */
693e12c5d1SDavid du Colombier {
703e12c5d1SDavid du Colombier char *re;
713e12c5d1SDavid du Colombier int use;
723e12c5d1SDavid du Colombier Reprog *program;
733e12c5d1SDavid du Colombier } pattern[NPATS];
743e12c5d1SDavid du Colombier
753e12c5d1SDavid du Colombier static int npats; /* cache fill level */
763e12c5d1SDavid du Colombier
773e12c5d1SDavid du Colombier /* Compile a pattern */
783e12c5d1SDavid du Colombier void
compre(char * pat)797dd7cddfSDavid du Colombier *compre(char *pat)
803e12c5d1SDavid du Colombier {
813e12c5d1SDavid du Colombier int i, j, inclass;
827dd7cddfSDavid du Colombier char c, *p, *s;
833e12c5d1SDavid du Colombier Reprog *program;
843e12c5d1SDavid du Colombier
853e12c5d1SDavid du Colombier if (!compile_time) { /* search cache for dynamic pattern */
863e12c5d1SDavid du Colombier for (i = 0; i < npats; i++)
873e12c5d1SDavid du Colombier if (!strcmp(pat, pattern[i].re)) {
883e12c5d1SDavid du Colombier pattern[i].use++;
893e12c5d1SDavid du Colombier return((void *) pattern[i].program);
903e12c5d1SDavid du Colombier }
913e12c5d1SDavid du Colombier }
923e12c5d1SDavid du Colombier /* Preprocess Pattern for compilation */
933e12c5d1SDavid du Colombier p = re;
943e12c5d1SDavid du Colombier s = pat;
953e12c5d1SDavid du Colombier inclass = 0;
963e12c5d1SDavid du Colombier while (c = *s++) {
973e12c5d1SDavid du Colombier if (c == '\\') {
983e12c5d1SDavid du Colombier quoted(&s, &p, re+MAXRE);
993e12c5d1SDavid du Colombier continue;
1003e12c5d1SDavid du Colombier }
1013e12c5d1SDavid du Colombier else if (!inclass && c == '(' && *s == ')') {
1023e12c5d1SDavid du Colombier if (p < re+MAXRE-2) { /* '()' -> '[]*' */
1033e12c5d1SDavid du Colombier *p++ = '[';
1043e12c5d1SDavid du Colombier *p++ = ']';
1053e12c5d1SDavid du Colombier c = '*';
1063e12c5d1SDavid du Colombier s++;
1073e12c5d1SDavid du Colombier }
1083e12c5d1SDavid du Colombier else overflow();
1093e12c5d1SDavid du Colombier }
1103e12c5d1SDavid du Colombier else if (c == '['){ /* '[-' -> '[\-' */
1113e12c5d1SDavid du Colombier inclass = 1;
1123e12c5d1SDavid du Colombier if (*s == '-') {
1133e12c5d1SDavid du Colombier if (p < re+MAXRE-2) {
1143e12c5d1SDavid du Colombier *p++ = '[';
1153e12c5d1SDavid du Colombier *p++ = '\\';
1163e12c5d1SDavid du Colombier c = *s++;
1173e12c5d1SDavid du Colombier }
1183e12c5d1SDavid du Colombier else overflow();
1193e12c5d1SDavid du Colombier } /* '[^-' -> '[^\-'*/
1203e12c5d1SDavid du Colombier else if (*s == '^' && s[1] == '-'){
1213e12c5d1SDavid du Colombier if (p < re+MAXRE-3) {
1223e12c5d1SDavid du Colombier *p++ = '[';
1233e12c5d1SDavid du Colombier *p++ = *s++;
1243e12c5d1SDavid du Colombier *p++ = '\\';
1253e12c5d1SDavid du Colombier c = *s++;
1263e12c5d1SDavid du Colombier }
1273e12c5d1SDavid du Colombier else overflow();
1283e12c5d1SDavid du Colombier }
1293e12c5d1SDavid du Colombier else if (*s == '['){ /* skip '[[' */
1303e12c5d1SDavid du Colombier if (p < re+MAXRE-1)
1313e12c5d1SDavid du Colombier *p++ = c;
1323e12c5d1SDavid du Colombier else overflow();
1333e12c5d1SDavid du Colombier c = *s++;
1343e12c5d1SDavid du Colombier }
1353e12c5d1SDavid du Colombier else if (*s == '^' && s[1] == '[') { /* skip '[^['*/
1363e12c5d1SDavid du Colombier if (p < re+MAXRE-2) {
1373e12c5d1SDavid du Colombier *p++ = c;
1383e12c5d1SDavid du Colombier *p++ = *s++;
1393e12c5d1SDavid du Colombier c = *s++;
1403e12c5d1SDavid du Colombier }
1413e12c5d1SDavid du Colombier else overflow();
1423e12c5d1SDavid du Colombier }
1433e12c5d1SDavid du Colombier else if (*s == ']') { /* '[]' -> '[]*' */
1443e12c5d1SDavid du Colombier if (p < re+MAXRE-2) {
1453e12c5d1SDavid du Colombier *p++ = c;
1463e12c5d1SDavid du Colombier *p++ = *s++;
1473e12c5d1SDavid du Colombier c = '*';
1483e12c5d1SDavid du Colombier inclass = 0;
1493e12c5d1SDavid du Colombier }
1503e12c5d1SDavid du Colombier else overflow();
1513e12c5d1SDavid du Colombier }
1523e12c5d1SDavid du Colombier }
1533e12c5d1SDavid du Colombier else if (c == '-' && *s == ']') { /* '-]' -> '\-]' */
1543e12c5d1SDavid du Colombier if (p < re+MAXRE-1)
1553e12c5d1SDavid du Colombier *p++ = '\\';
1563e12c5d1SDavid du Colombier else overflow();
1573e12c5d1SDavid du Colombier }
1583e12c5d1SDavid du Colombier else if (c == ']')
1593e12c5d1SDavid du Colombier inclass = 0;
1603e12c5d1SDavid du Colombier if (p < re+MAXRE-1)
1613e12c5d1SDavid du Colombier *p++ = c;
1623e12c5d1SDavid du Colombier else overflow();
1633e12c5d1SDavid du Colombier }
1643e12c5d1SDavid du Colombier *p = 0;
1653e12c5d1SDavid du Colombier program = regcomp(re); /* compile pattern */
1663e12c5d1SDavid du Colombier if (!compile_time) {
1673e12c5d1SDavid du Colombier if (npats < NPATS) /* Room in cache */
1683e12c5d1SDavid du Colombier i = npats++;
1693e12c5d1SDavid du Colombier else { /* Throw out least used */
1703e12c5d1SDavid du Colombier int use = pattern[0].use;
1713e12c5d1SDavid du Colombier i = 0;
1723e12c5d1SDavid du Colombier for (j = 1; j < NPATS; j++) {
1733e12c5d1SDavid du Colombier if (pattern[j].use < use) {
1743e12c5d1SDavid du Colombier use = pattern[j].use;
1753e12c5d1SDavid du Colombier i = j;
1763e12c5d1SDavid du Colombier }
1773e12c5d1SDavid du Colombier }
1783e12c5d1SDavid du Colombier xfree(pattern[i].program);
1793e12c5d1SDavid du Colombier xfree(pattern[i].re);
1803e12c5d1SDavid du Colombier }
1813e12c5d1SDavid du Colombier pattern[i].re = tostring(pat);
1823e12c5d1SDavid du Colombier pattern[i].program = program;
1837dd7cddfSDavid du Colombier pattern[i].use = 1;
1843e12c5d1SDavid du Colombier }
1853e12c5d1SDavid du Colombier return((void *) program);
1863e12c5d1SDavid du Colombier }
1873e12c5d1SDavid du Colombier
1883e12c5d1SDavid du Colombier /* T/F match indication - matched string not exported */
1893e12c5d1SDavid du Colombier int
match(void * p,char * s,char *)1907dd7cddfSDavid du Colombier match(void *p, char *s, char *)
1913e12c5d1SDavid du Colombier {
1923e12c5d1SDavid du Colombier return regexec((Reprog *) p, (char *) s, 0, 0);
1933e12c5d1SDavid du Colombier }
1943e12c5d1SDavid du Colombier
1953e12c5d1SDavid du Colombier /* match and delimit the matched string */
1963e12c5d1SDavid du Colombier int
pmatch(void * p,char * s,char * start)1977dd7cddfSDavid du Colombier pmatch(void *p, char *s, char *start)
1983e12c5d1SDavid du Colombier {
1993e12c5d1SDavid du Colombier Resub m;
2003e12c5d1SDavid du Colombier
2013e12c5d1SDavid du Colombier m.s.sp = start;
2023e12c5d1SDavid du Colombier m.e.ep = 0;
2033e12c5d1SDavid du Colombier if (regexec((Reprog *) p, (char *) s, &m, 1)) {
2043e12c5d1SDavid du Colombier patbeg = m.s.sp;
2053e12c5d1SDavid du Colombier patlen = m.e.ep-m.s.sp;
2063e12c5d1SDavid du Colombier return 1;
2073e12c5d1SDavid du Colombier }
2083e12c5d1SDavid du Colombier patlen = -1;
2093e12c5d1SDavid du Colombier patbeg = start;
2103e12c5d1SDavid du Colombier return 0;
2113e12c5d1SDavid du Colombier }
2123e12c5d1SDavid du Colombier
2133e12c5d1SDavid du Colombier /* perform a non-empty match */
2143e12c5d1SDavid du Colombier int
nematch(void * p,char * s,char * start)2157dd7cddfSDavid du Colombier nematch(void *p, char *s, char *start)
2163e12c5d1SDavid du Colombier {
2173e12c5d1SDavid du Colombier if (pmatch(p, s, start) == 1 && patlen > 0)
2183e12c5d1SDavid du Colombier return 1;
2193e12c5d1SDavid du Colombier patlen = -1;
2203e12c5d1SDavid du Colombier patbeg = start;
2213e12c5d1SDavid du Colombier return 0;
2223e12c5d1SDavid du Colombier }
2233e12c5d1SDavid du Colombier /* in the parsing of regular expressions, metacharacters like . have */
2243e12c5d1SDavid du Colombier /* to be seen literally; \056 is not a metacharacter. */
2253e12c5d1SDavid du Colombier
hexstr(char ** pp)2263e12c5d1SDavid du Colombier hexstr(char **pp) /* find and eval hex string at pp, return new p */
2273e12c5d1SDavid du Colombier {
2283e12c5d1SDavid du Colombier char c;
2293e12c5d1SDavid du Colombier int n = 0;
2303e12c5d1SDavid du Colombier int i;
2313e12c5d1SDavid du Colombier
2323e12c5d1SDavid du Colombier for (i = 0, c = (*pp)[i]; i < 4 && isxdigit(c); i++, c = (*pp)[i]) {
2333e12c5d1SDavid du Colombier if (isdigit(c))
2343e12c5d1SDavid du Colombier n = 16 * n + c - '0';
2353e12c5d1SDavid du Colombier else if ('a' <= c && c <= 'f')
2363e12c5d1SDavid du Colombier n = 16 * n + c - 'a' + 10;
2373e12c5d1SDavid du Colombier else if ('A' <= c && c <= 'F')
2383e12c5d1SDavid du Colombier n = 16 * n + c - 'A' + 10;
2393e12c5d1SDavid du Colombier }
2403e12c5d1SDavid du Colombier *pp += i;
2413e12c5d1SDavid du Colombier return n;
2423e12c5d1SDavid du Colombier }
2433e12c5d1SDavid du Colombier
2443e12c5d1SDavid du Colombier /* look for awk-specific escape sequences */
2453e12c5d1SDavid du Colombier
2467dd7cddfSDavid du Colombier #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */
2473e12c5d1SDavid du Colombier
2483e12c5d1SDavid du Colombier void
quoted(char ** s,char ** to,char * end)2497dd7cddfSDavid du Colombier quoted(char **s, char **to, char *end) /* handle escaped sequence */
2503e12c5d1SDavid du Colombier {
2513e12c5d1SDavid du Colombier char *p = *s;
2523e12c5d1SDavid du Colombier char *t = *to;
2533e12c5d1SDavid du Colombier wchar_t c;
2543e12c5d1SDavid du Colombier
2553e12c5d1SDavid du Colombier switch(c = *p++) {
2563e12c5d1SDavid du Colombier case 't':
2573e12c5d1SDavid du Colombier c = '\t';
2583e12c5d1SDavid du Colombier break;
2593e12c5d1SDavid du Colombier case 'n':
2603e12c5d1SDavid du Colombier c = '\n';
2613e12c5d1SDavid du Colombier break;
2623e12c5d1SDavid du Colombier case 'f':
2633e12c5d1SDavid du Colombier c = '\f';
2643e12c5d1SDavid du Colombier break;
2653e12c5d1SDavid du Colombier case 'r':
2663e12c5d1SDavid du Colombier c = '\r';
2673e12c5d1SDavid du Colombier break;
2683e12c5d1SDavid du Colombier case 'b':
2693e12c5d1SDavid du Colombier c = '\b';
2703e12c5d1SDavid du Colombier break;
2713e12c5d1SDavid du Colombier default:
2723e12c5d1SDavid du Colombier if (t < end-1) /* all else must be escaped */
2733e12c5d1SDavid du Colombier *t++ = '\\';
2743e12c5d1SDavid du Colombier if (c == 'x') { /* hexadecimal goo follows */
2753e12c5d1SDavid du Colombier c = hexstr(&p);
2763e12c5d1SDavid du Colombier if (t < end-MB_CUR_MAX)
2773e12c5d1SDavid du Colombier t += wctomb(t, c);
2783e12c5d1SDavid du Colombier else overflow();
2793e12c5d1SDavid du Colombier *to = t;
2803e12c5d1SDavid du Colombier *s = p;
2813e12c5d1SDavid du Colombier return;
2823e12c5d1SDavid du Colombier } else if (isoctdigit(c)) { /* \d \dd \ddd */
2833e12c5d1SDavid du Colombier c -= '0';
2843e12c5d1SDavid du Colombier if (isoctdigit(*p)) {
2853e12c5d1SDavid du Colombier c = 8 * c + *p++ - '0';
2863e12c5d1SDavid du Colombier if (isoctdigit(*p))
2873e12c5d1SDavid du Colombier c = 8 * c + *p++ - '0';
2883e12c5d1SDavid du Colombier }
2893e12c5d1SDavid du Colombier }
2903e12c5d1SDavid du Colombier break;
2913e12c5d1SDavid du Colombier }
2923e12c5d1SDavid du Colombier if (t < end-1)
2933e12c5d1SDavid du Colombier *t++ = c;
2943e12c5d1SDavid du Colombier *s = p;
2953e12c5d1SDavid du Colombier *to = t;
2963e12c5d1SDavid du Colombier }
2973e12c5d1SDavid du Colombier /* count rune positions */
2983e12c5d1SDavid du Colombier int
countposn(char * s,int n)2997dd7cddfSDavid du Colombier countposn(char *s, int n)
3003e12c5d1SDavid du Colombier {
301*59cc4ca5SDavid du Colombier int i, j;
3027dd7cddfSDavid du Colombier char *end;
3033e12c5d1SDavid du Colombier
304*59cc4ca5SDavid du Colombier for (i = 0, end = s+n; *s && s < end; i++){
305*59cc4ca5SDavid du Colombier j = mblen(s, n);
306*59cc4ca5SDavid du Colombier if(j <= 0)
307*59cc4ca5SDavid du Colombier j = 1;
308*59cc4ca5SDavid du Colombier s += j;
309*59cc4ca5SDavid du Colombier }
3103e12c5d1SDavid du Colombier return(i);
3113e12c5d1SDavid du Colombier }
3123e12c5d1SDavid du Colombier
3133e12c5d1SDavid du Colombier /* pattern package error handler */
3143e12c5d1SDavid du Colombier
3153e12c5d1SDavid du Colombier void
regerror(char * s)3163e12c5d1SDavid du Colombier regerror(char *s)
3173e12c5d1SDavid du Colombier {
3187dd7cddfSDavid du Colombier FATAL("%s", s);
3193e12c5d1SDavid du Colombier }
3203e12c5d1SDavid du Colombier
3213e12c5d1SDavid du Colombier void
overflow(void)3223e12c5d1SDavid du Colombier overflow(void)
3233e12c5d1SDavid du Colombier {
3247dd7cddfSDavid du Colombier FATAL("%s", "regular expression too big");
3253e12c5d1SDavid du Colombier }
326