xref: /netbsd-src/usr.bin/vgrind/regexp.c (revision 4e00368f12e7278a94903a082dfe31dfebb70415)
1 /*	$NetBSD: regexp.c,v 1.13 2014/07/15 13:17:15 christos Exp $	*/
2 
3 /*
4  * Copyright (c) 1980, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32 
33 #if HAVE_NBTOOL_CONFIG_H
34 #include "nbtool_config.h"
35 #endif
36 
37 #include <sys/cdefs.h>
38 #ifndef lint
39 __COPYRIGHT("@(#) Copyright (c) 1980, 1993\
40  The Regents of the University of California.  All rights reserved.");
41 #endif /* not lint */
42 
43 #ifndef lint
44 #if 0
45 static char sccsid[] = "@(#)regexp.c	8.1 (Berkeley) 6/6/93";
46 #endif
47 __RCSID("$NetBSD: regexp.c,v 1.13 2014/07/15 13:17:15 christos Exp $");
48 #endif /* not lint */
49 
50 #include <assert.h>
51 #include <ctype.h>
52 #include <stdlib.h>
53 #include <stdbool.h>
54 #include <string.h>
55 #include "extern.h"
56 
57 static void	expconv (void);
58 
59 bool	 x_escaped;	/* true if we are currently x_escaped */
60 char	*x_start;	/* start of string */
61 bool	 l_onecase;	/* true if upper and lower equivalent */
62 
63 #define makelower(c) (isupper((unsigned char)(c)) ? tolower((unsigned char)(c)) : (c))
64 
65 static char
66 tocc(ptrdiff_t x) {
67 	if (x & ~0xff)
68 		abort();
69 	return (char)x;
70 }
71 
72 /*  STRNCMP -	like strncmp except that we convert the
73  *	 	first string to lower case before comparing
74  *		if l_onecase is set.
75  */
76 
77 int
78 STRNCMP(char *s1, char *s2, int len)
79 {
80 	if (l_onecase) {
81 	    do
82 		if (*s2 - makelower(*s1))
83 			return *s2 - makelower(*s1);
84 		else {
85 			s2++;
86 			s1++;
87 		}
88 	    while (--len);
89 	} else {
90 	    do
91 		if (*s2 - *s1)
92 			return *s2 - *s1;
93 		else {
94 			s2++;
95 			s1++;
96 		}
97 	    while (--len);
98 	}
99 	return(0);
100 }
101 
102 /*	The following routine converts an irregular expression to
103  *	internal format.
104  *
105  *	Either meta symbols (\a \d or \p) or character strings or
106  *	operations ( alternation or perenthesizing ) can be
107  *	specified.  Each starts with a descriptor byte.  The descriptor
108  *	byte has STR set for strings, META set for meta symbols
109  *	and OPER set for operations.
110  *	The descriptor byte can also have the OPT bit set if the object
111  *	defined is optional.  Also ALT can be set to indicate an alternation.
112  *
113  *	For metasymbols the byte following the descriptor byte identities
114  *	the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '(').  For
115  *	strings the byte after the descriptor is a character count for
116  *	the string:
117  *
118  *		meta symbols := descriptor
119  *				symbol
120  *
121  *		strings :=	descriptor
122  *				character count
123  *				the string
124  *
125  *		operatins :=	descriptor
126  *				symbol
127  *				character count
128  */
129 
130 /*
131  *  handy macros for accessing parts of match blocks
132  */
133 #define MSYM(A) (*(A+1))	/* symbol in a meta symbol block */
134 #define MNEXT(A) (A+2)		/* character following a metasymbol block */
135 
136 #define OSYM(A) (*(A+1))	/* symbol in an operation block */
137 #define OCNT(A) (*(A+2))	/* character count */
138 #define ONEXT(A) (A+3)		/* next character after the operation */
139 #define OPTR(A) (A+*(A+2))	/* place pointed to by the operator */
140 
141 #define SCNT(A) (*(A+1))	/* byte count of a string */
142 #define SSTR(A) (A+2)		/* address of the string */
143 #define SNEXT(A) (A+2+*(A+1))	/* character following the string */
144 
145 /*
146  *  bit flags in the descriptor
147  */
148 #define OPT 1
149 #define STR 2
150 #define META 4
151 #define ALT 8
152 #define OPER 16
153 
154 static char *ccre;	/* pointer to current position in converted exp*/
155 static char *ure;	/* pointer current position in unconverted exp */
156 
157 char *
158 convexp(char *re)	/* unconverted irregular expression */
159 {
160     char *cre;		/* pointer to converted regular expression */
161 
162     /* allocate room for the converted expression */
163     if (re == NULL)
164 	return NULL;
165     if (*re == '\0')
166 	return NULL;
167     cre = malloc(4 * strlen(re) + 3);
168     ccre = cre;
169     ure = re;
170 
171     /* start the conversion with a \a */
172     *cre = META | OPT;
173     MSYM(cre) = 'a';
174     ccre = MNEXT(cre);
175 
176     /* start the conversion (its recursive) */
177     expconv();
178     *ccre = 0;
179     return cre;
180 }
181 
182 static void
183 expconv(void)
184 {
185     char *cs;		/* pointer to current symbol in converted exp */
186     char c;		/* character being processed */
187     char *acs;		/* pinter to last alternate */
188     int temp;
189 
190     /* let the conversion begin */
191     acs = NULL;
192     cs = NULL;
193     while (*ure) {
194 	switch (c = *ure++) {
195 
196 	case '\\':
197 	    switch (c = *ure++) {
198 
199 	    /* escaped characters are just characters */
200 	    default:
201 		if (cs == NULL || (*cs & STR) == 0) {
202 		    cs = ccre;
203 		    *cs = STR;
204 		    SCNT(cs) = 1;
205 		    ccre += 2;
206 		} else
207 		    SCNT(cs)++;
208 		*ccre++ = c;
209 		break;
210 
211 	    /* normal(?) metacharacters */
212 	    case 'a':
213 	    case 'd':
214 	    case 'e':
215 	    case 'p':
216 		if (acs != NULL && acs != cs) {
217 		    do {
218 			temp = OCNT(acs);
219 			OCNT(acs) = tocc(ccre - acs);
220 			acs -= temp;
221 		    } while (temp != 0);
222 		    acs = NULL;
223 		}
224 		cs = ccre;
225 		*cs = META;
226 		MSYM(cs) = c;
227 		ccre = MNEXT(cs);
228 		break;
229 	    }
230 	    break;
231 
232 	/* just put the symbol in */
233 	case '^':
234 	case '$':
235 	    if (acs != NULL && acs != cs) {
236 		do {
237 		    temp = OCNT(acs);
238 		    OCNT(acs) = tocc(ccre - acs);
239 		    acs -= temp;
240 		} while (temp != 0);
241 		acs = NULL;
242 	    }
243 	    cs = ccre;
244 	    *cs = META;
245 	    MSYM(cs) = c;
246 	    ccre = MNEXT(cs);
247 	    break;
248 
249 	/* mark the last match sequence as optional */
250 	case '?':
251 	    if (cs)
252 	    	*cs = *cs | OPT;
253 	    break;
254 
255 	/* recurse and define a subexpression */
256 	case '(':
257 	    if (acs != NULL && acs != cs) {
258 		do {
259 		    temp = OCNT(acs);
260 		    OCNT(acs) = tocc(ccre - acs);
261 		    acs -= temp;
262 		} while (temp != 0);
263 		acs = NULL;
264 	    }
265 	    cs = ccre;
266 	    *cs = OPER;
267 	    OSYM(cs) = '(';
268 	    ccre = ONEXT(cs);
269 	    expconv();
270 	    OCNT(cs) = tocc(ccre - cs);		/* offset to next symbol */
271 	    break;
272 
273 	/* reurn from a recursion */
274 	case ')':
275 	    if (acs != NULL) {
276 		do {
277 		    temp = OCNT(acs);
278 		    OCNT(acs) = tocc(ccre - acs);
279 		    acs -= temp;
280 		} while (temp != 0);
281 		acs = NULL;
282 	    }
283 	    cs = ccre;
284 	    *cs = META;
285 	    MSYM(cs) = c;
286 	    ccre = MNEXT(cs);
287 	    return;
288 
289 	/* mark the last match sequence as having an alternate */
290 	/* the third byte will contain an offset to jump over the */
291 	/* alternate match in case the first did not fail */
292 	case '|':
293 	    if (acs != NULL && acs != cs)
294 		OCNT(ccre) = tocc(ccre - acs);	/* make a back pointer */
295 	    else
296 		OCNT(ccre) = 0;
297 	    assert(cs != NULL);
298 	    *cs |= ALT;
299 	    cs = ccre;
300 	    *cs = OPER;
301 	    OSYM(cs) = '|';
302 	    ccre = ONEXT(cs);
303 	    acs = cs;	/* remember that the pointer is to be filles */
304 	    break;
305 
306 	/* if its not a metasymbol just build a scharacter string */
307 	default:
308 	    if (cs == NULL || (*cs & STR) == 0) {
309 		cs = ccre;
310 		*cs = STR;
311 		SCNT(cs) = 1;
312 		ccre = SSTR(cs);
313 	    } else
314 		SCNT(cs)++;
315 	    *ccre++ = c;
316 	    break;
317 	}
318     }
319     if (acs != NULL) {
320 	do {
321 	    temp = OCNT(acs);
322 	    OCNT(acs) = tocc(ccre - acs);
323 	    acs -= temp;
324 	} while (temp != 0);
325 	acs = NULL;
326     }
327     return;
328 }
329 /* end of convertre */
330 
331 
332 /*
333  *	The following routine recognises an irregular expresion
334  *	with the following special characters:
335  *
336  *		\?	-	means last match was optional
337  *		\a	-	matches any number of characters
338  *		\d	-	matches any number of spaces and tabs
339  *		\p	-	matches any number of alphanumeric
340  *				characters. The
341  *				characters matched will be copied into
342  *				the area pointed to by 'name'.
343  *		\|	-	alternation
344  *		\( \)	-	grouping used mostly for alternation and
345  *				optionality
346  *
347  *	The irregular expression must be translated to internal form
348  *	prior to calling this routine
349  *
350  *	The value returned is the pointer to the first non \a
351  *	character matched.
352  */
353 
354 char *
355 expmatch(
356     char *s,		/* string to check for a match in */
357     char *re,		/* a converted irregular expression */
358     char *mstring)	/* where to put whatever matches a \p */
359 {
360     char *cs;		/* the current symbol */
361     char *ptr,*s1;	/* temporary pointer */
362     bool matched;	/* a temporary bool */
363 
364     /* initial conditions */
365     if (re == NULL)
366 	return NULL;
367     cs = re;
368     matched = false;
369 
370     /* loop till expression string is exhausted (or at least pretty tired) */
371     while (*cs) {
372 	switch (*cs & (OPER | STR | META)) {
373 
374 	/* try to match a string */
375 	case STR:
376 	    matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
377 	    if (matched) {
378 
379 		/* hoorah it matches */
380 		s += SCNT(cs);
381 		cs = SNEXT(cs);
382 	    } else if (*cs & ALT) {
383 
384 		/* alternation, skip to next expression */
385 		cs = SNEXT(cs);
386 	    } else if (*cs & OPT) {
387 
388 		/* the match is optional */
389 		cs = SNEXT(cs);
390 		matched = 1;		/* indicate a successful match */
391 	    } else {
392 
393 		/* no match, error return */
394 		return NULL;
395 	    }
396 	    break;
397 
398 	/* an operator, do something fancy */
399 	case OPER:
400 	    switch (OSYM(cs)) {
401 
402 	    /* this is an alternation */
403 	    case '|':
404 		if (matched)
405 
406 		    /* last thing in the alternation was a match, skip ahead */
407 		    cs = OPTR(cs);
408 		else
409 
410 		    /* no match, keep trying */
411 		    cs = ONEXT(cs);
412 		break;
413 
414 	    /* this is a grouping, recurse */
415 	    case '(':
416 		ptr = expmatch(s, ONEXT(cs), mstring);
417 		if (ptr != NULL) {
418 
419 		    /* the subexpression matched */
420 		    matched = 1;
421 		    s = ptr;
422 		} else if (*cs & ALT) {
423 
424 		    /* alternation, skip to next expression */
425 		    matched = 0;
426 		} else if (*cs & OPT) {
427 
428 		    /* the match is optional */
429 		    matched = 1;	/* indicate a successful match */
430 		} else {
431 
432 		    /* no match, error return */
433 		    return NULL;
434 		}
435 		cs = OPTR(cs);
436 		break;
437 	    }
438 	    break;
439 
440 	/* try to match a metasymbol */
441 	case META:
442 	    switch (MSYM(cs)) {
443 
444 	    /* try to match anything and remember what was matched */
445 	    case 'p':
446 		/*
447 		 *  This is really the same as trying the match the
448 		 *  remaining parts of the expression to any subset
449 		 *  of the string.
450 		 */
451 		s1 = s;
452 		do {
453 		    ptr = expmatch(s1, MNEXT(cs), mstring);
454 		    if (ptr != NULL && s1 != s) {
455 
456 			/* we have a match, remember the match */
457 			strncpy(mstring, s, (size_t)(s1 - s));
458 			mstring[s1 - s] = '\0';
459 			return ptr;
460 		    } else if (ptr != NULL && (*cs & OPT)) {
461 
462 			/* it was aoptional so no match is ok */
463 			return ptr;
464 		    } else if (ptr != NULL) {
465 
466 			/* not optional and we still matched */
467 			return NULL;
468 		    }
469 		    if (!isalnum((unsigned char)*s1) && *s1 != '_')
470 			return NULL;
471 		    if (*s1 == '\\')
472 			x_escaped = x_escaped ? false : true;
473 		    else
474 			x_escaped = false;
475 		} while (*s1++);
476 		return NULL;
477 
478 	    /* try to match anything */
479 	    case 'a':
480 		/*
481 		 *  This is really the same as trying the match the
482 		 *  remaining parts of the expression to any subset
483 		 *  of the string.
484 		 */
485 		s1 = s;
486 		do {
487 		    ptr = expmatch(s1, MNEXT(cs), mstring);
488 		    if (ptr != NULL && s1 != s) {
489 
490 			/* we have a match */
491 			return ptr;
492 		    } else if (ptr != NULL && (*cs & OPT)) {
493 
494 			/* it was aoptional so no match is ok */
495 			return ptr;
496 		    } else if (ptr != NULL) {
497 
498 			/* not optional and we still matched */
499 			return NULL;
500 		    }
501 		    if (*s1 == '\\')
502 			x_escaped = x_escaped ? false : true;
503 		    else
504 			x_escaped = false;
505 		} while (*s1++);
506 		return NULL;
507 
508 	    /* fail if we are currently x_escaped */
509 	    case 'e':
510 		if (x_escaped)
511 		    return NULL;
512 		cs = MNEXT(cs);
513 		break;
514 
515 	    /* match any number of tabs and spaces */
516 	    case 'd':
517 		ptr = s;
518 		while (*s == ' ' || *s == '\t')
519 		    s++;
520 		if (s != ptr || s == x_start) {
521 
522 		    /* match, be happy */
523 		    matched = 1;
524 		    cs = MNEXT(cs);
525 		} else if (*s == '\n' || *s == '\0') {
526 
527 		    /* match, be happy */
528 		    matched = 1;
529 		    cs = MNEXT(cs);
530 		} else if (*cs & ALT) {
531 
532 		    /* try the next part */
533 		    matched = 0;
534 		    cs = MNEXT(cs);
535 		} else if (*cs & OPT) {
536 
537 		    /* doesn't matter */
538 		    matched = 1;
539 		    cs = MNEXT(cs);
540 		} else
541 
542 		    /* no match, error return */
543 		    return NULL;
544 		break;
545 
546 	    /* check for end of line */
547 	    case '$':
548 		if (*s == '\0' || *s == '\n') {
549 
550 		    /* match, be happy */
551 		    s++;
552 		    matched = 1;
553 		    cs = MNEXT(cs);
554 		} else if (*cs & ALT) {
555 
556 		    /* try the next part */
557 		    matched = 0;
558 		    cs = MNEXT(cs);
559 		} else if (*cs & OPT) {
560 
561 		    /* doesn't matter */
562 		    matched = 1;
563 		    cs = MNEXT(cs);
564 		} else
565 
566 		    /* no match, error return */
567 		    return NULL;
568 		break;
569 
570 	    /* check for start of line */
571 	    case '^':
572 		if (s == x_start) {
573 
574 		    /* match, be happy */
575 		    matched = 1;
576 		    cs = MNEXT(cs);
577 		} else if (*cs & ALT) {
578 
579 		    /* try the next part */
580 		    matched = 0;
581 		    cs = MNEXT(cs);
582 		} else if (*cs & OPT) {
583 
584 		    /* doesn't matter */
585 		    matched = 1;
586 		    cs = MNEXT(cs);
587 		} else
588 
589 		    /* no match, error return */
590 		    return NULL;
591 		break;
592 
593 	    /* end of a subexpression, return success */
594 	    case ')':
595 		return s;
596 	    }
597 	    break;
598 	}
599     }
600     return s;
601 }
602