xref: /netbsd-src/usr.bin/vgrind/regexp.c (revision ba65fde2d7fefa7d39838fa5fa855e62bd606b5e)
1 /*	$NetBSD: regexp.c,v 1.12 2012/03/20 20:34:59 matt Exp $	*/
2 
3 /*
4  * Copyright (c) 1980, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32 
33 #include <sys/cdefs.h>
34 #ifndef lint
35 __COPYRIGHT("@(#) Copyright (c) 1980, 1993\
36  The Regents of the University of California.  All rights reserved.");
37 #endif /* not lint */
38 
39 #ifndef lint
40 #if 0
41 static char sccsid[] = "@(#)regexp.c	8.1 (Berkeley) 6/6/93";
42 #endif
43 __RCSID("$NetBSD: regexp.c,v 1.12 2012/03/20 20:34:59 matt Exp $");
44 #endif /* not lint */
45 
46 #include <assert.h>
47 #include <ctype.h>
48 #include <stdlib.h>
49 #include <string.h>
50 #include "extern.h"
51 
52 #define FALSE	0
53 #define TRUE	!(FALSE)
54 #define NIL	0
55 
56 static void	expconv __P((void));
57 
58 boolean	 x_escaped;	/* true if we are currently x_escaped */
59 char	*x_start;	/* start of string */
60 boolean	 l_onecase;	/* true if upper and lower equivalent */
61 
62 #define makelower(c) (isupper((unsigned char)(c)) ? tolower((unsigned char)(c)) : (c))
63 
64 /*  STRNCMP -	like strncmp except that we convert the
65  *	 	first string to lower case before comparing
66  *		if l_onecase is set.
67  */
68 
69 int
70 STRNCMP(char *s1, char *s2, int len)
71 {
72 	if (l_onecase) {
73 	    do
74 		if (*s2 - makelower(*s1))
75 			return (*s2 - makelower(*s1));
76 		else {
77 			s2++;
78 			s1++;
79 		}
80 	    while (--len);
81 	} else {
82 	    do
83 		if (*s2 - *s1)
84 			return (*s2 - *s1);
85 		else {
86 			s2++;
87 			s1++;
88 		}
89 	    while (--len);
90 	}
91 	return(0);
92 }
93 
94 /*	The following routine converts an irregular expression to
95  *	internal format.
96  *
97  *	Either meta symbols (\a \d or \p) or character strings or
98  *	operations ( alternation or perenthesizing ) can be
99  *	specified.  Each starts with a descriptor byte.  The descriptor
100  *	byte has STR set for strings, META set for meta symbols
101  *	and OPER set for operations.
102  *	The descriptor byte can also have the OPT bit set if the object
103  *	defined is optional.  Also ALT can be set to indicate an alternation.
104  *
105  *	For metasymbols the byte following the descriptor byte identities
106  *	the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '(').  For
107  *	strings the byte after the descriptor is a character count for
108  *	the string:
109  *
110  *		meta symbols := descriptor
111  *				symbol
112  *
113  *		strings :=	descriptor
114  *				character count
115  *				the string
116  *
117  *		operatins :=	descriptor
118  *				symbol
119  *				character count
120  */
121 
122 /*
123  *  handy macros for accessing parts of match blocks
124  */
125 #define MSYM(A) (*(A+1))	/* symbol in a meta symbol block */
126 #define MNEXT(A) (A+2)		/* character following a metasymbol block */
127 
128 #define OSYM(A) (*(A+1))	/* symbol in an operation block */
129 #define OCNT(A) (*(A+2))	/* character count */
130 #define ONEXT(A) (A+3)		/* next character after the operation */
131 #define OPTR(A) (A+*(A+2))	/* place pointed to by the operator */
132 
133 #define SCNT(A) (*(A+1))	/* byte count of a string */
134 #define SSTR(A) (A+2)		/* address of the string */
135 #define SNEXT(A) (A+2+*(A+1))	/* character following the string */
136 
137 /*
138  *  bit flags in the descriptor
139  */
140 #define OPT 1
141 #define STR 2
142 #define META 4
143 #define ALT 8
144 #define OPER 16
145 
146 static char *ccre;	/* pointer to current position in converted exp*/
147 static char *ure;	/* pointer current position in unconverted exp */
148 
149 char *
150 convexp(char *re)	/* unconverted irregular expression */
151 {
152     char *cre;		/* pointer to converted regular expression */
153 
154     /* allocate room for the converted expression */
155     if (re == NIL)
156 	return (NIL);
157     if (*re == '\0')
158 	return (NIL);
159     cre = malloc(4 * strlen(re) + 3);
160     ccre = cre;
161     ure = re;
162 
163     /* start the conversion with a \a */
164     *cre = META | OPT;
165     MSYM(cre) = 'a';
166     ccre = MNEXT(cre);
167 
168     /* start the conversion (its recursive) */
169     expconv();
170     *ccre = 0;
171     return (cre);
172 }
173 
174 static void
175 expconv(void)
176 {
177     char *cs;		/* pointer to current symbol in converted exp */
178     char c;		/* character being processed */
179     char *acs;		/* pinter to last alternate */
180     int temp;
181 
182     /* let the conversion begin */
183     acs = NIL;
184     cs = NIL;
185     while (*ure != NIL) {
186 	switch (c = *ure++) {
187 
188 	case '\\':
189 	    switch (c = *ure++) {
190 
191 	    /* escaped characters are just characters */
192 	    default:
193 		if (cs == NIL || (*cs & STR) == 0) {
194 		    cs = ccre;
195 		    *cs = STR;
196 		    SCNT(cs) = 1;
197 		    ccre += 2;
198 		} else
199 		    SCNT(cs)++;
200 		*ccre++ = c;
201 		break;
202 
203 	    /* normal(?) metacharacters */
204 	    case 'a':
205 	    case 'd':
206 	    case 'e':
207 	    case 'p':
208 		if (acs != NIL && acs != cs) {
209 		    do {
210 			temp = OCNT(acs);
211 			OCNT(acs) = ccre - acs;
212 			acs -= temp;
213 		    } while (temp != 0);
214 		    acs = NIL;
215 		}
216 		cs = ccre;
217 		*cs = META;
218 		MSYM(cs) = c;
219 		ccre = MNEXT(cs);
220 		break;
221 	    }
222 	    break;
223 
224 	/* just put the symbol in */
225 	case '^':
226 	case '$':
227 	    if (acs != NIL && acs != cs) {
228 		do {
229 		    temp = OCNT(acs);
230 		    OCNT(acs) = ccre - acs;
231 		    acs -= temp;
232 		} while (temp != 0);
233 		acs = NIL;
234 	    }
235 	    cs = ccre;
236 	    *cs = META;
237 	    MSYM(cs) = c;
238 	    ccre = MNEXT(cs);
239 	    break;
240 
241 	/* mark the last match sequence as optional */
242 	case '?':
243 	    if (cs)
244 	    	*cs = *cs | OPT;
245 	    break;
246 
247 	/* recurse and define a subexpression */
248 	case '(':
249 	    if (acs != NIL && acs != cs) {
250 		do {
251 		    temp = OCNT(acs);
252 		    OCNT(acs) = ccre - acs;
253 		    acs -= temp;
254 		} while (temp != 0);
255 		acs = NIL;
256 	    }
257 	    cs = ccre;
258 	    *cs = OPER;
259 	    OSYM(cs) = '(';
260 	    ccre = ONEXT(cs);
261 	    expconv();
262 	    OCNT(cs) = ccre - cs;		/* offset to next symbol */
263 	    break;
264 
265 	/* reurn from a recursion */
266 	case ')':
267 	    if (acs != NIL) {
268 		do {
269 		    temp = OCNT(acs);
270 		    OCNT(acs) = ccre - acs;
271 		    acs -= temp;
272 		} while (temp != 0);
273 		acs = NIL;
274 	    }
275 	    cs = ccre;
276 	    *cs = META;
277 	    MSYM(cs) = c;
278 	    ccre = MNEXT(cs);
279 	    return;
280 
281 	/* mark the last match sequence as having an alternate */
282 	/* the third byte will contain an offset to jump over the */
283 	/* alternate match in case the first did not fail */
284 	case '|':
285 	    if (acs != NIL && acs != cs)
286 		OCNT(ccre) = ccre - acs;	/* make a back pointer */
287 	    else
288 		OCNT(ccre) = 0;
289 	    assert(cs != NULL);
290 	    *cs |= ALT;
291 	    cs = ccre;
292 	    *cs = OPER;
293 	    OSYM(cs) = '|';
294 	    ccre = ONEXT(cs);
295 	    acs = cs;	/* remember that the pointer is to be filles */
296 	    break;
297 
298 	/* if its not a metasymbol just build a scharacter string */
299 	default:
300 	    if (cs == NIL || (*cs & STR) == 0) {
301 		cs = ccre;
302 		*cs = STR;
303 		SCNT(cs) = 1;
304 		ccre = SSTR(cs);
305 	    } else
306 		SCNT(cs)++;
307 	    *ccre++ = c;
308 	    break;
309 	}
310     }
311     if (acs != NIL) {
312 	do {
313 	    temp = OCNT(acs);
314 	    OCNT(acs) = ccre - acs;
315 	    acs -= temp;
316 	} while (temp != 0);
317 	acs = NIL;
318     }
319     return;
320 }
321 /* end of convertre */
322 
323 
324 /*
325  *	The following routine recognises an irregular expresion
326  *	with the following special characters:
327  *
328  *		\?	-	means last match was optional
329  *		\a	-	matches any number of characters
330  *		\d	-	matches any number of spaces and tabs
331  *		\p	-	matches any number of alphanumeric
332  *				characters. The
333  *				characters matched will be copied into
334  *				the area pointed to by 'name'.
335  *		\|	-	alternation
336  *		\( \)	-	grouping used mostly for alternation and
337  *				optionality
338  *
339  *	The irregular expression must be translated to internal form
340  *	prior to calling this routine
341  *
342  *	The value returned is the pointer to the first non \a
343  *	character matched.
344  */
345 
346 char *
347 expmatch(
348     char *s,		/* string to check for a match in */
349     char *re,		/* a converted irregular expression */
350     char *mstring)	/* where to put whatever matches a \p */
351 {
352     char *cs;		/* the current symbol */
353     char *ptr,*s1;	/* temporary pointer */
354     boolean matched;	/* a temporary boolean */
355 
356     /* initial conditions */
357     if (re == NIL)
358 	return (NIL);
359     cs = re;
360     matched = FALSE;
361 
362     /* loop till expression string is exhausted (or at least pretty tired) */
363     while (*cs) {
364 	switch (*cs & (OPER | STR | META)) {
365 
366 	/* try to match a string */
367 	case STR:
368 	    matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
369 	    if (matched) {
370 
371 		/* hoorah it matches */
372 		s += SCNT(cs);
373 		cs = SNEXT(cs);
374 	    } else if (*cs & ALT) {
375 
376 		/* alternation, skip to next expression */
377 		cs = SNEXT(cs);
378 	    } else if (*cs & OPT) {
379 
380 		/* the match is optional */
381 		cs = SNEXT(cs);
382 		matched = 1;		/* indicate a successful match */
383 	    } else {
384 
385 		/* no match, error return */
386 		return (NIL);
387 	    }
388 	    break;
389 
390 	/* an operator, do something fancy */
391 	case OPER:
392 	    switch (OSYM(cs)) {
393 
394 	    /* this is an alternation */
395 	    case '|':
396 		if (matched)
397 
398 		    /* last thing in the alternation was a match, skip ahead */
399 		    cs = OPTR(cs);
400 		else
401 
402 		    /* no match, keep trying */
403 		    cs = ONEXT(cs);
404 		break;
405 
406 	    /* this is a grouping, recurse */
407 	    case '(':
408 		ptr = expmatch(s, ONEXT(cs), mstring);
409 		if (ptr != NIL) {
410 
411 		    /* the subexpression matched */
412 		    matched = 1;
413 		    s = ptr;
414 		} else if (*cs & ALT) {
415 
416 		    /* alternation, skip to next expression */
417 		    matched = 0;
418 		} else if (*cs & OPT) {
419 
420 		    /* the match is optional */
421 		    matched = 1;	/* indicate a successful match */
422 		} else {
423 
424 		    /* no match, error return */
425 		    return (NIL);
426 		}
427 		cs = OPTR(cs);
428 		break;
429 	    }
430 	    break;
431 
432 	/* try to match a metasymbol */
433 	case META:
434 	    switch (MSYM(cs)) {
435 
436 	    /* try to match anything and remember what was matched */
437 	    case 'p':
438 		/*
439 		 *  This is really the same as trying the match the
440 		 *  remaining parts of the expression to any subset
441 		 *  of the string.
442 		 */
443 		s1 = s;
444 		do {
445 		    ptr = expmatch(s1, MNEXT(cs), mstring);
446 		    if (ptr != NIL && s1 != s) {
447 
448 			/* we have a match, remember the match */
449 			strncpy(mstring, s, s1 - s);
450 			mstring[s1 - s] = '\0';
451 			return (ptr);
452 		    } else if (ptr != NIL && (*cs & OPT)) {
453 
454 			/* it was aoptional so no match is ok */
455 			return (ptr);
456 		    } else if (ptr != NIL) {
457 
458 			/* not optional and we still matched */
459 			return (NIL);
460 		    }
461 		    if (!isalnum((unsigned char)*s1) && *s1 != '_')
462 			return (NIL);
463 		    if (*s1 == '\\')
464 			x_escaped = x_escaped ? FALSE : TRUE;
465 		    else
466 			x_escaped = FALSE;
467 		} while (*s1++);
468 		return (NIL);
469 
470 	    /* try to match anything */
471 	    case 'a':
472 		/*
473 		 *  This is really the same as trying the match the
474 		 *  remaining parts of the expression to any subset
475 		 *  of the string.
476 		 */
477 		s1 = s;
478 		do {
479 		    ptr = expmatch(s1, MNEXT(cs), mstring);
480 		    if (ptr != NIL && s1 != s) {
481 
482 			/* we have a match */
483 			return (ptr);
484 		    } else if (ptr != NIL && (*cs & OPT)) {
485 
486 			/* it was aoptional so no match is ok */
487 			return (ptr);
488 		    } else if (ptr != NIL) {
489 
490 			/* not optional and we still matched */
491 			return (NIL);
492 		    }
493 		    if (*s1 == '\\')
494 			x_escaped = x_escaped ? FALSE : TRUE;
495 		    else
496 			x_escaped = FALSE;
497 		} while (*s1++);
498 		return (NIL);
499 
500 	    /* fail if we are currently x_escaped */
501 	    case 'e':
502 		if (x_escaped)
503 		    return(NIL);
504 		cs = MNEXT(cs);
505 		break;
506 
507 	    /* match any number of tabs and spaces */
508 	    case 'd':
509 		ptr = s;
510 		while (*s == ' ' || *s == '\t')
511 		    s++;
512 		if (s != ptr || s == x_start) {
513 
514 		    /* match, be happy */
515 		    matched = 1;
516 		    cs = MNEXT(cs);
517 		} else if (*s == '\n' || *s == '\0') {
518 
519 		    /* match, be happy */
520 		    matched = 1;
521 		    cs = MNEXT(cs);
522 		} else if (*cs & ALT) {
523 
524 		    /* try the next part */
525 		    matched = 0;
526 		    cs = MNEXT(cs);
527 		} else if (*cs & OPT) {
528 
529 		    /* doesn't matter */
530 		    matched = 1;
531 		    cs = MNEXT(cs);
532 		} else
533 
534 		    /* no match, error return */
535 		    return (NIL);
536 		break;
537 
538 	    /* check for end of line */
539 	    case '$':
540 		if (*s == '\0' || *s == '\n') {
541 
542 		    /* match, be happy */
543 		    s++;
544 		    matched = 1;
545 		    cs = MNEXT(cs);
546 		} else if (*cs & ALT) {
547 
548 		    /* try the next part */
549 		    matched = 0;
550 		    cs = MNEXT(cs);
551 		} else if (*cs & OPT) {
552 
553 		    /* doesn't matter */
554 		    matched = 1;
555 		    cs = MNEXT(cs);
556 		} else
557 
558 		    /* no match, error return */
559 		    return (NIL);
560 		break;
561 
562 	    /* check for start of line */
563 	    case '^':
564 		if (s == x_start) {
565 
566 		    /* match, be happy */
567 		    matched = 1;
568 		    cs = MNEXT(cs);
569 		} else if (*cs & ALT) {
570 
571 		    /* try the next part */
572 		    matched = 0;
573 		    cs = MNEXT(cs);
574 		} else if (*cs & OPT) {
575 
576 		    /* doesn't matter */
577 		    matched = 1;
578 		    cs = MNEXT(cs);
579 		} else
580 
581 		    /* no match, error return */
582 		    return (NIL);
583 		break;
584 
585 	    /* end of a subexpression, return success */
586 	    case ')':
587 		return (s);
588 	    }
589 	    break;
590 	}
591     }
592     return (s);
593 }
594