xref: /netbsd-src/usr.bin/vgrind/regexp.c (revision d9158b13b5dfe46201430699a3f7a235ecf28df3)
1 /*
2  * Copyright (c) 1980 The Regents of the University of California.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. All advertising materials mentioning features or use of this software
14  *    must display the following acknowledgement:
15  *	This product includes software developed by the University of
16  *	California, Berkeley and its contributors.
17  * 4. Neither the name of the University nor the names of its contributors
18  *    may be used to endorse or promote products derived from this software
19  *    without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  */
33 
34 #ifndef lint
35 /*static char sccsid[] = "from: @(#)regexp.c	5.3 (Berkeley) 6/1/90";*/
36 static char rcsid[] = "$Id: regexp.c,v 1.2 1993/08/01 18:03:29 mycroft Exp $";
37 #endif /* not lint */
38 
39 #include <ctype.h>
40 
41 typedef int	boolean;
42 #define TRUE	1
43 #define FALSE	0
44 #define NIL	0
45 
46 boolean l_onecase;	/* true if upper and lower equivalent */
47 
48 #define makelower(c) (isupper((c)) ? tolower((c)) : (c))
49 
50 /*  STRNCMP -	like strncmp except that we convert the
51  *	 	first string to lower case before comparing
52  *		if l_onecase is set.
53  */
54 
55 STRNCMP(s1, s2, len)
56 	register char *s1,*s2;
57 	register int len;
58 {
59 	if (l_onecase) {
60 	    do
61 		if (*s2 - makelower(*s1))
62 			return (*s2 - makelower(*s1));
63 		else {
64 			s2++;
65 			s1++;
66 		}
67 	    while (--len);
68 	} else {
69 	    do
70 		if (*s2 - *s1)
71 			return (*s2 - *s1);
72 		else {
73 			s2++;
74 			s1++;
75 		}
76 	    while (--len);
77 	}
78 	return(0);
79 }
80 
81 /*	The following routine converts an irregular expression to
82  *	internal format.
83  *
84  *	Either meta symbols (\a \d or \p) or character strings or
85  *	operations ( alternation or perenthesizing ) can be
86  *	specified.  Each starts with a descriptor byte.  The descriptor
87  *	byte has STR set for strings, META set for meta symbols
88  *	and OPER set for operations.
89  *	The descriptor byte can also have the OPT bit set if the object
90  *	defined is optional.  Also ALT can be set to indicate an alternation.
91  *
92  *	For metasymbols the byte following the descriptor byte identities
93  *	the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '(').  For
94  *	strings the byte after the descriptor is a character count for
95  *	the string:
96  *
97  *		meta symbols := descriptor
98  *				symbol
99  *
100  *		strings :=	descriptor
101  *				character count
102  *				the string
103  *
104  *		operatins :=	descriptor
105  *				symbol
106  *				character count
107  */
108 
109 /*
110  *  handy macros for accessing parts of match blocks
111  */
112 #define MSYM(A) (*(A+1))	/* symbol in a meta symbol block */
113 #define MNEXT(A) (A+2)		/* character following a metasymbol block */
114 
115 #define OSYM(A) (*(A+1))	/* symbol in an operation block */
116 #define OCNT(A) (*(A+2))	/* character count */
117 #define ONEXT(A) (A+3)		/* next character after the operation */
118 #define OPTR(A) (A+*(A+2))	/* place pointed to by the operator */
119 
120 #define SCNT(A) (*(A+1))	/* byte count of a string */
121 #define SSTR(A) (A+2)		/* address of the string */
122 #define SNEXT(A) (A+2+*(A+1))	/* character following the string */
123 
124 /*
125  *  bit flags in the descriptor
126  */
127 #define OPT 1
128 #define STR 2
129 #define META 4
130 #define ALT 8
131 #define OPER 16
132 
133 char *ure;		/* pointer current position in unconverted exp */
134 char *ccre;		/* pointer to current position in converted exp*/
135 char *malloc();
136 
137 char *
138 convexp(re)
139     char *re;		/* unconverted irregular expression */
140 {
141     register char *cre;		/* pointer to converted regular expression */
142 
143     /* allocate room for the converted expression */
144     if (re == NIL)
145 	return (NIL);
146     if (*re == '\0')
147 	return (NIL);
148     cre = malloc (4 * strlen(re) + 3);
149     ccre = cre;
150     ure = re;
151 
152     /* start the conversion with a \a */
153     *cre = META | OPT;
154     MSYM(cre) = 'a';
155     ccre = MNEXT(cre);
156 
157     /* start the conversion (its recursive) */
158     expconv ();
159     *ccre = 0;
160     return (cre);
161 }
162 
163 expconv()
164 {
165     register char *cs;		/* pointer to current symbol in converted exp */
166     register char c;		/* character being processed */
167     register char *acs;		/* pinter to last alternate */
168     register int temp;
169 
170     /* let the conversion begin */
171     acs = NIL;
172     cs = NIL;
173     while (*ure != NIL) {
174 	switch (c = *ure++) {
175 
176 	case '\\':
177 	    switch (c = *ure++) {
178 
179 	    /* escaped characters are just characters */
180 	    default:
181 		if (cs == NIL || (*cs & STR) == 0) {
182 		    cs = ccre;
183 		    *cs = STR;
184 		    SCNT(cs) = 1;
185 		    ccre += 2;
186 		} else
187 		    SCNT(cs)++;
188 		*ccre++ = c;
189 		break;
190 
191 	    /* normal(?) metacharacters */
192 	    case 'a':
193 	    case 'd':
194 	    case 'e':
195 	    case 'p':
196 		if (acs != NIL && acs != cs) {
197 		    do {
198 			temp = OCNT(acs);
199 			OCNT(acs) = ccre - acs;
200 			acs -= temp;
201 		    } while (temp != 0);
202 		    acs = NIL;
203 		}
204 		cs = ccre;
205 		*cs = META;
206 		MSYM(cs) = c;
207 		ccre = MNEXT(cs);
208 		break;
209 	    }
210 	    break;
211 
212 	/* just put the symbol in */
213 	case '^':
214 	case '$':
215 	    if (acs != NIL && acs != cs) {
216 		do {
217 		    temp = OCNT(acs);
218 		    OCNT(acs) = ccre - acs;
219 		    acs -= temp;
220 		} while (temp != 0);
221 		acs = NIL;
222 	    }
223 	    cs = ccre;
224 	    *cs = META;
225 	    MSYM(cs) = c;
226 	    ccre = MNEXT(cs);
227 	    break;
228 
229 	/* mark the last match sequence as optional */
230 	case '?':
231 	    if (cs)
232 	    	*cs = *cs | OPT;
233 	    break;
234 
235 	/* recurse and define a subexpression */
236 	case '(':
237 	    if (acs != NIL && acs != cs) {
238 		do {
239 		    temp = OCNT(acs);
240 		    OCNT(acs) = ccre - acs;
241 		    acs -= temp;
242 		} while (temp != 0);
243 		acs = NIL;
244 	    }
245 	    cs = ccre;
246 	    *cs = OPER;
247 	    OSYM(cs) = '(';
248 	    ccre = ONEXT(cs);
249 	    expconv ();
250 	    OCNT(cs) = ccre - cs;		/* offset to next symbol */
251 	    break;
252 
253 	/* return from a recursion */
254 	case ')':
255 	    if (acs != NIL) {
256 		do {
257 		    temp = OCNT(acs);
258 		    OCNT(acs) = ccre - acs;
259 		    acs -= temp;
260 		} while (temp != 0);
261 		acs = NIL;
262 	    }
263 	    cs = ccre;
264 	    *cs = META;
265 	    MSYM(cs) = c;
266 	    ccre = MNEXT(cs);
267 	    return;
268 
269 	/* mark the last match sequence as having an alternate */
270 	/* the third byte will contain an offset to jump over the */
271 	/* alternate match in case the first did not fail */
272 	case '|':
273 	    if (acs != NIL && acs != cs)
274 		OCNT(ccre) = ccre - acs;	/* make a back pointer */
275 	    else
276 		OCNT(ccre) = 0;
277 	    *cs |= ALT;
278 	    cs = ccre;
279 	    *cs = OPER;
280 	    OSYM(cs) = '|';
281 	    ccre = ONEXT(cs);
282 	    acs = cs;	/* remember that the pointer is to be filles */
283 	    break;
284 
285 	/* if its not a metasymbol just build a scharacter string */
286 	default:
287 	    if (cs == NIL || (*cs & STR) == 0) {
288 		cs = ccre;
289 		*cs = STR;
290 		SCNT(cs) = 1;
291 		ccre = SSTR(cs);
292 	    } else
293 		SCNT(cs)++;
294 	    *ccre++ = c;
295 	    break;
296 	}
297     }
298     if (acs != NIL) {
299 	do {
300 	    temp = OCNT(acs);
301 	    OCNT(acs) = ccre - acs;
302 	    acs -= temp;
303 	} while (temp != 0);
304 	acs = NIL;
305     }
306     return;
307 }
308 /* end of convertre */
309 
310 
311 /*
312  *	The following routine recognises an irregular expresion
313  *	with the following special characters:
314  *
315  *		\?	-	means last match was optional
316  *		\a	-	matches any number of characters
317  *		\d	-	matches any number of spaces and tabs
318  *		\p	-	matches any number of alphanumeric
319  *				characters. The
320  *				characters matched will be copied into
321  *				the area pointed to by 'name'.
322  *		\|	-	alternation
323  *		\( \)	-	grouping used mostly for alternation and
324  *				optionality
325  *
326  *	The irregular expression must be translated to internal form
327  *	prior to calling this routine
328  *
329  *	The value returned is the pointer to the first non \a
330  *	character matched.
331  */
332 
333 boolean _escaped;		/* true if we are currently _escaped */
334 char *_start;			/* start of string */
335 
336 char *
337 expmatch (s, re, mstring)
338     register char *s;		/* string to check for a match in */
339     register char *re;		/* a converted irregular expression */
340     register char *mstring;	/* where to put whatever matches a \p */
341 {
342     register char *cs;		/* the current symbol */
343     register char *ptr,*s1;	/* temporary pointer */
344     boolean matched;		/* a temporary boolean */
345 
346     /* initial conditions */
347     if (re == NIL)
348 	return (NIL);
349     cs = re;
350     matched = FALSE;
351 
352     /* loop till expression string is exhausted (or at least pretty tired) */
353     while (*cs) {
354 	switch (*cs & (OPER | STR | META)) {
355 
356 	/* try to match a string */
357 	case STR:
358 	    matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
359 	    if (matched) {
360 
361 		/* hoorah it matches */
362 		s += SCNT(cs);
363 		cs = SNEXT(cs);
364 	    } else if (*cs & ALT) {
365 
366 		/* alternation, skip to next expression */
367 		cs = SNEXT(cs);
368 	    } else if (*cs & OPT) {
369 
370 		/* the match is optional */
371 		cs = SNEXT(cs);
372 		matched = 1;		/* indicate a successful match */
373 	    } else {
374 
375 		/* no match, error return */
376 		return (NIL);
377 	    }
378 	    break;
379 
380 	/* an operator, do something fancy */
381 	case OPER:
382 	    switch (OSYM(cs)) {
383 
384 	    /* this is an alternation */
385 	    case '|':
386 		if (matched)
387 
388 		    /* last thing in the alternation was a match, skip ahead */
389 		    cs = OPTR(cs);
390 		else
391 
392 		    /* no match, keep trying */
393 		    cs = ONEXT(cs);
394 		break;
395 
396 	    /* this is a grouping, recurse */
397 	    case '(':
398 		ptr = expmatch (s, ONEXT(cs), mstring);
399 		if (ptr != NIL) {
400 
401 		    /* the subexpression matched */
402 		    matched = 1;
403 		    s = ptr;
404 		} else if (*cs & ALT) {
405 
406 		    /* alternation, skip to next expression */
407 		    matched = 0;
408 		} else if (*cs & OPT) {
409 
410 		    /* the match is optional */
411 		    matched = 1;	/* indicate a successful match */
412 		} else {
413 
414 		    /* no match, error return */
415 		    return (NIL);
416 		}
417 		cs = OPTR(cs);
418 		break;
419 	    }
420 	    break;
421 
422 	/* try to match a metasymbol */
423 	case META:
424 	    switch (MSYM(cs)) {
425 
426 	    /* try to match anything and remember what was matched */
427 	    case 'p':
428 		/*
429 		 *  This is really the same as trying the match the
430 		 *  remaining parts of the expression to any subset
431 		 *  of the string.
432 		 */
433 		s1 = s;
434 		do {
435 		    ptr = expmatch (s1, MNEXT(cs), mstring);
436 		    if (ptr != NIL && s1 != s) {
437 
438 			/* we have a match, remember the match */
439 			strncpy (mstring, s, s1 - s);
440 			mstring[s1 - s] = '\0';
441 			return (ptr);
442 		    } else if (ptr != NIL && (*cs & OPT)) {
443 
444 			/* it was aoptional so no match is ok */
445 			return (ptr);
446 		    } else if (ptr != NIL) {
447 
448 			/* not optional and we still matched */
449 			return (NIL);
450 		    }
451 		    if (!isalnum(*s1) && *s1 != '_')
452 			return (NIL);
453 		    if (*s1 == '\\')
454 			_escaped = _escaped ? FALSE : TRUE;
455 		    else
456 			_escaped = FALSE;
457 		} while (*s1++);
458 		return (NIL);
459 
460 	    /* try to match anything */
461 	    case 'a':
462 		/*
463 		 *  This is really the same as trying the match the
464 		 *  remaining parts of the expression to any subset
465 		 *  of the string.
466 		 */
467 		s1 = s;
468 		do {
469 		    ptr = expmatch (s1, MNEXT(cs), mstring);
470 		    if (ptr != NIL && s1 != s) {
471 
472 			/* we have a match */
473 			return (ptr);
474 		    } else if (ptr != NIL && (*cs & OPT)) {
475 
476 			/* it was aoptional so no match is ok */
477 			return (ptr);
478 		    } else if (ptr != NIL) {
479 
480 			/* not optional and we still matched */
481 			return (NIL);
482 		    }
483 		    if (*s1 == '\\')
484 			_escaped = _escaped ? FALSE : TRUE;
485 		    else
486 			_escaped = FALSE;
487 		} while (*s1++);
488 		return (NIL);
489 
490 	    /* fail if we are currently _escaped */
491 	    case 'e':
492 		if (_escaped)
493 		    return(NIL);
494 		cs = MNEXT(cs);
495 		break;
496 
497 	    /* match any number of tabs and spaces */
498 	    case 'd':
499 		ptr = s;
500 		while (*s == ' ' || *s == '\t')
501 		    s++;
502 		if (s != ptr || s == _start) {
503 
504 		    /* match, be happy */
505 		    matched = 1;
506 		    cs = MNEXT(cs);
507 		} else if (*s == '\n' || *s == '\0') {
508 
509 		    /* match, be happy */
510 		    matched = 1;
511 		    cs = MNEXT(cs);
512 		} else if (*cs & ALT) {
513 
514 		    /* try the next part */
515 		    matched = 0;
516 		    cs = MNEXT(cs);
517 		} else if (*cs & OPT) {
518 
519 		    /* doesn't matter */
520 		    matched = 1;
521 		    cs = MNEXT(cs);
522 		} else
523 
524 		    /* no match, error return */
525 		    return (NIL);
526 		break;
527 
528 	    /* check for end of line */
529 	    case '$':
530 		if (*s == '\0' || *s == '\n') {
531 
532 		    /* match, be happy */
533 		    s++;
534 		    matched = 1;
535 		    cs = MNEXT(cs);
536 		} else if (*cs & ALT) {
537 
538 		    /* try the next part */
539 		    matched = 0;
540 		    cs = MNEXT(cs);
541 		} else if (*cs & OPT) {
542 
543 		    /* doesn't matter */
544 		    matched = 1;
545 		    cs = MNEXT(cs);
546 		} else
547 
548 		    /* no match, error return */
549 		    return (NIL);
550 		break;
551 
552 	    /* check for start of line */
553 	    case '^':
554 		if (s == _start) {
555 
556 		    /* match, be happy */
557 		    matched = 1;
558 		    cs = MNEXT(cs);
559 		} else if (*cs & ALT) {
560 
561 		    /* try the next part */
562 		    matched = 0;
563 		    cs = MNEXT(cs);
564 		} else if (*cs & OPT) {
565 
566 		    /* doesn't matter */
567 		    matched = 1;
568 		    cs = MNEXT(cs);
569 		} else
570 
571 		    /* no match, error return */
572 		    return (NIL);
573 		break;
574 
575 	    /* end of a subexpression, return success */
576 	    case ')':
577 		return (s);
578 	    }
579 	    break;
580 	}
581     }
582     return (s);
583 }
584