xref: /netbsd-src/usr.bin/vgrind/regexp.c (revision 811e6386f8c5e4a3521c7003da29ec8673e344fa)
1 /*
2  * Copyright (c) 1980 The Regents of the University of California.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. All advertising materials mentioning features or use of this software
14  *    must display the following acknowledgement:
15  *	This product includes software developed by the University of
16  *	California, Berkeley and its contributors.
17  * 4. Neither the name of the University nor the names of its contributors
18  *    may be used to endorse or promote products derived from this software
19  *    without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  */
33 
34 #ifndef lint
35 static char sccsid[] = "@(#)regexp.c	5.3 (Berkeley) 6/1/90";
36 #endif /* not lint */
37 
38 #include <ctype.h>
39 
40 typedef int	boolean;
41 #define TRUE	1
42 #define FALSE	0
43 #define NIL	0
44 
45 boolean l_onecase;	/* true if upper and lower equivalent */
46 
47 #define makelower(c) (isupper((c)) ? tolower((c)) : (c))
48 
49 /*  STRNCMP -	like strncmp except that we convert the
50  *	 	first string to lower case before comparing
51  *		if l_onecase is set.
52  */
53 
54 STRNCMP(s1, s2, len)
55 	register char *s1,*s2;
56 	register int len;
57 {
58 	if (l_onecase) {
59 	    do
60 		if (*s2 - makelower(*s1))
61 			return (*s2 - makelower(*s1));
62 		else {
63 			s2++;
64 			s1++;
65 		}
66 	    while (--len);
67 	} else {
68 	    do
69 		if (*s2 - *s1)
70 			return (*s2 - *s1);
71 		else {
72 			s2++;
73 			s1++;
74 		}
75 	    while (--len);
76 	}
77 	return(0);
78 }
79 
80 /*	The following routine converts an irregular expression to
81  *	internal format.
82  *
83  *	Either meta symbols (\a \d or \p) or character strings or
84  *	operations ( alternation or perenthesizing ) can be
85  *	specified.  Each starts with a descriptor byte.  The descriptor
86  *	byte has STR set for strings, META set for meta symbols
87  *	and OPER set for operations.
88  *	The descriptor byte can also have the OPT bit set if the object
89  *	defined is optional.  Also ALT can be set to indicate an alternation.
90  *
91  *	For metasymbols the byte following the descriptor byte identities
92  *	the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '(').  For
93  *	strings the byte after the descriptor is a character count for
94  *	the string:
95  *
96  *		meta symbols := descriptor
97  *				symbol
98  *
99  *		strings :=	descriptor
100  *				character count
101  *				the string
102  *
103  *		operatins :=	descriptor
104  *				symbol
105  *				character count
106  */
107 
108 /*
109  *  handy macros for accessing parts of match blocks
110  */
111 #define MSYM(A) (*(A+1))	/* symbol in a meta symbol block */
112 #define MNEXT(A) (A+2)		/* character following a metasymbol block */
113 
114 #define OSYM(A) (*(A+1))	/* symbol in an operation block */
115 #define OCNT(A) (*(A+2))	/* character count */
116 #define ONEXT(A) (A+3)		/* next character after the operation */
117 #define OPTR(A) (A+*(A+2))	/* place pointed to by the operator */
118 
119 #define SCNT(A) (*(A+1))	/* byte count of a string */
120 #define SSTR(A) (A+2)		/* address of the string */
121 #define SNEXT(A) (A+2+*(A+1))	/* character following the string */
122 
123 /*
124  *  bit flags in the descriptor
125  */
126 #define OPT 1
127 #define STR 2
128 #define META 4
129 #define ALT 8
130 #define OPER 16
131 
132 char *ure;		/* pointer current position in unconverted exp */
133 char *ccre;		/* pointer to current position in converted exp*/
134 char *malloc();
135 
136 char *
137 convexp(re)
138     char *re;		/* unconverted irregular expression */
139 {
140     register char *cre;		/* pointer to converted regular expression */
141 
142     /* allocate room for the converted expression */
143     if (re == NIL)
144 	return (NIL);
145     if (*re == '\0')
146 	return (NIL);
147     cre = malloc (4 * strlen(re) + 3);
148     ccre = cre;
149     ure = re;
150 
151     /* start the conversion with a \a */
152     *cre = META | OPT;
153     MSYM(cre) = 'a';
154     ccre = MNEXT(cre);
155 
156     /* start the conversion (its recursive) */
157     expconv ();
158     *ccre = 0;
159     return (cre);
160 }
161 
162 expconv()
163 {
164     register char *cs;		/* pointer to current symbol in converted exp */
165     register char c;		/* character being processed */
166     register char *acs;		/* pinter to last alternate */
167     register int temp;
168 
169     /* let the conversion begin */
170     acs = NIL;
171     cs = NIL;
172     while (*ure != NIL) {
173 	switch (c = *ure++) {
174 
175 	case '\\':
176 	    switch (c = *ure++) {
177 
178 	    /* escaped characters are just characters */
179 	    default:
180 		if (cs == NIL || (*cs & STR) == 0) {
181 		    cs = ccre;
182 		    *cs = STR;
183 		    SCNT(cs) = 1;
184 		    ccre += 2;
185 		} else
186 		    SCNT(cs)++;
187 		*ccre++ = c;
188 		break;
189 
190 	    /* normal(?) metacharacters */
191 	    case 'a':
192 	    case 'd':
193 	    case 'e':
194 	    case 'p':
195 		if (acs != NIL && acs != cs) {
196 		    do {
197 			temp = OCNT(acs);
198 			OCNT(acs) = ccre - acs;
199 			acs -= temp;
200 		    } while (temp != 0);
201 		    acs = NIL;
202 		}
203 		cs = ccre;
204 		*cs = META;
205 		MSYM(cs) = c;
206 		ccre = MNEXT(cs);
207 		break;
208 	    }
209 	    break;
210 
211 	/* just put the symbol in */
212 	case '^':
213 	case '$':
214 	    if (acs != NIL && acs != cs) {
215 		do {
216 		    temp = OCNT(acs);
217 		    OCNT(acs) = ccre - acs;
218 		    acs -= temp;
219 		} while (temp != 0);
220 		acs = NIL;
221 	    }
222 	    cs = ccre;
223 	    *cs = META;
224 	    MSYM(cs) = c;
225 	    ccre = MNEXT(cs);
226 	    break;
227 
228 	/* mark the last match sequence as optional */
229 	case '?':
230 	    if (cs)
231 	    	*cs = *cs | OPT;
232 	    break;
233 
234 	/* recurse and define a subexpression */
235 	case '(':
236 	    if (acs != NIL && acs != cs) {
237 		do {
238 		    temp = OCNT(acs);
239 		    OCNT(acs) = ccre - acs;
240 		    acs -= temp;
241 		} while (temp != 0);
242 		acs = NIL;
243 	    }
244 	    cs = ccre;
245 	    *cs = OPER;
246 	    OSYM(cs) = '(';
247 	    ccre = ONEXT(cs);
248 	    expconv ();
249 	    OCNT(cs) = ccre - cs;		/* offset to next symbol */
250 	    break;
251 
252 	/* return from a recursion */
253 	case ')':
254 	    if (acs != NIL) {
255 		do {
256 		    temp = OCNT(acs);
257 		    OCNT(acs) = ccre - acs;
258 		    acs -= temp;
259 		} while (temp != 0);
260 		acs = NIL;
261 	    }
262 	    cs = ccre;
263 	    *cs = META;
264 	    MSYM(cs) = c;
265 	    ccre = MNEXT(cs);
266 	    return;
267 
268 	/* mark the last match sequence as having an alternate */
269 	/* the third byte will contain an offset to jump over the */
270 	/* alternate match in case the first did not fail */
271 	case '|':
272 	    if (acs != NIL && acs != cs)
273 		OCNT(ccre) = ccre - acs;	/* make a back pointer */
274 	    else
275 		OCNT(ccre) = 0;
276 	    *cs |= ALT;
277 	    cs = ccre;
278 	    *cs = OPER;
279 	    OSYM(cs) = '|';
280 	    ccre = ONEXT(cs);
281 	    acs = cs;	/* remember that the pointer is to be filles */
282 	    break;
283 
284 	/* if its not a metasymbol just build a scharacter string */
285 	default:
286 	    if (cs == NIL || (*cs & STR) == 0) {
287 		cs = ccre;
288 		*cs = STR;
289 		SCNT(cs) = 1;
290 		ccre = SSTR(cs);
291 	    } else
292 		SCNT(cs)++;
293 	    *ccre++ = c;
294 	    break;
295 	}
296     }
297     if (acs != NIL) {
298 	do {
299 	    temp = OCNT(acs);
300 	    OCNT(acs) = ccre - acs;
301 	    acs -= temp;
302 	} while (temp != 0);
303 	acs = NIL;
304     }
305     return;
306 }
307 /* end of convertre */
308 
309 
310 /*
311  *	The following routine recognises an irregular expresion
312  *	with the following special characters:
313  *
314  *		\?	-	means last match was optional
315  *		\a	-	matches any number of characters
316  *		\d	-	matches any number of spaces and tabs
317  *		\p	-	matches any number of alphanumeric
318  *				characters. The
319  *				characters matched will be copied into
320  *				the area pointed to by 'name'.
321  *		\|	-	alternation
322  *		\( \)	-	grouping used mostly for alternation and
323  *				optionality
324  *
325  *	The irregular expression must be translated to internal form
326  *	prior to calling this routine
327  *
328  *	The value returned is the pointer to the first non \a
329  *	character matched.
330  */
331 
332 boolean _escaped;		/* true if we are currently _escaped */
333 char *_start;			/* start of string */
334 
335 char *
336 expmatch (s, re, mstring)
337     register char *s;		/* string to check for a match in */
338     register char *re;		/* a converted irregular expression */
339     register char *mstring;	/* where to put whatever matches a \p */
340 {
341     register char *cs;		/* the current symbol */
342     register char *ptr,*s1;	/* temporary pointer */
343     boolean matched;		/* a temporary boolean */
344 
345     /* initial conditions */
346     if (re == NIL)
347 	return (NIL);
348     cs = re;
349     matched = FALSE;
350 
351     /* loop till expression string is exhausted (or at least pretty tired) */
352     while (*cs) {
353 	switch (*cs & (OPER | STR | META)) {
354 
355 	/* try to match a string */
356 	case STR:
357 	    matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
358 	    if (matched) {
359 
360 		/* hoorah it matches */
361 		s += SCNT(cs);
362 		cs = SNEXT(cs);
363 	    } else if (*cs & ALT) {
364 
365 		/* alternation, skip to next expression */
366 		cs = SNEXT(cs);
367 	    } else if (*cs & OPT) {
368 
369 		/* the match is optional */
370 		cs = SNEXT(cs);
371 		matched = 1;		/* indicate a successful match */
372 	    } else {
373 
374 		/* no match, error return */
375 		return (NIL);
376 	    }
377 	    break;
378 
379 	/* an operator, do something fancy */
380 	case OPER:
381 	    switch (OSYM(cs)) {
382 
383 	    /* this is an alternation */
384 	    case '|':
385 		if (matched)
386 
387 		    /* last thing in the alternation was a match, skip ahead */
388 		    cs = OPTR(cs);
389 		else
390 
391 		    /* no match, keep trying */
392 		    cs = ONEXT(cs);
393 		break;
394 
395 	    /* this is a grouping, recurse */
396 	    case '(':
397 		ptr = expmatch (s, ONEXT(cs), mstring);
398 		if (ptr != NIL) {
399 
400 		    /* the subexpression matched */
401 		    matched = 1;
402 		    s = ptr;
403 		} else if (*cs & ALT) {
404 
405 		    /* alternation, skip to next expression */
406 		    matched = 0;
407 		} else if (*cs & OPT) {
408 
409 		    /* the match is optional */
410 		    matched = 1;	/* indicate a successful match */
411 		} else {
412 
413 		    /* no match, error return */
414 		    return (NIL);
415 		}
416 		cs = OPTR(cs);
417 		break;
418 	    }
419 	    break;
420 
421 	/* try to match a metasymbol */
422 	case META:
423 	    switch (MSYM(cs)) {
424 
425 	    /* try to match anything and remember what was matched */
426 	    case 'p':
427 		/*
428 		 *  This is really the same as trying the match the
429 		 *  remaining parts of the expression to any subset
430 		 *  of the string.
431 		 */
432 		s1 = s;
433 		do {
434 		    ptr = expmatch (s1, MNEXT(cs), mstring);
435 		    if (ptr != NIL && s1 != s) {
436 
437 			/* we have a match, remember the match */
438 			strncpy (mstring, s, s1 - s);
439 			mstring[s1 - s] = '\0';
440 			return (ptr);
441 		    } else if (ptr != NIL && (*cs & OPT)) {
442 
443 			/* it was aoptional so no match is ok */
444 			return (ptr);
445 		    } else if (ptr != NIL) {
446 
447 			/* not optional and we still matched */
448 			return (NIL);
449 		    }
450 		    if (!isalnum(*s1) && *s1 != '_')
451 			return (NIL);
452 		    if (*s1 == '\\')
453 			_escaped = _escaped ? FALSE : TRUE;
454 		    else
455 			_escaped = FALSE;
456 		} while (*s1++);
457 		return (NIL);
458 
459 	    /* try to match anything */
460 	    case 'a':
461 		/*
462 		 *  This is really the same as trying the match the
463 		 *  remaining parts of the expression to any subset
464 		 *  of the string.
465 		 */
466 		s1 = s;
467 		do {
468 		    ptr = expmatch (s1, MNEXT(cs), mstring);
469 		    if (ptr != NIL && s1 != s) {
470 
471 			/* we have a match */
472 			return (ptr);
473 		    } else if (ptr != NIL && (*cs & OPT)) {
474 
475 			/* it was aoptional so no match is ok */
476 			return (ptr);
477 		    } else if (ptr != NIL) {
478 
479 			/* not optional and we still matched */
480 			return (NIL);
481 		    }
482 		    if (*s1 == '\\')
483 			_escaped = _escaped ? FALSE : TRUE;
484 		    else
485 			_escaped = FALSE;
486 		} while (*s1++);
487 		return (NIL);
488 
489 	    /* fail if we are currently _escaped */
490 	    case 'e':
491 		if (_escaped)
492 		    return(NIL);
493 		cs = MNEXT(cs);
494 		break;
495 
496 	    /* match any number of tabs and spaces */
497 	    case 'd':
498 		ptr = s;
499 		while (*s == ' ' || *s == '\t')
500 		    s++;
501 		if (s != ptr || s == _start) {
502 
503 		    /* match, be happy */
504 		    matched = 1;
505 		    cs = MNEXT(cs);
506 		} else if (*s == '\n' || *s == '\0') {
507 
508 		    /* match, be happy */
509 		    matched = 1;
510 		    cs = MNEXT(cs);
511 		} else if (*cs & ALT) {
512 
513 		    /* try the next part */
514 		    matched = 0;
515 		    cs = MNEXT(cs);
516 		} else if (*cs & OPT) {
517 
518 		    /* doesn't matter */
519 		    matched = 1;
520 		    cs = MNEXT(cs);
521 		} else
522 
523 		    /* no match, error return */
524 		    return (NIL);
525 		break;
526 
527 	    /* check for end of line */
528 	    case '$':
529 		if (*s == '\0' || *s == '\n') {
530 
531 		    /* match, be happy */
532 		    s++;
533 		    matched = 1;
534 		    cs = MNEXT(cs);
535 		} else if (*cs & ALT) {
536 
537 		    /* try the next part */
538 		    matched = 0;
539 		    cs = MNEXT(cs);
540 		} else if (*cs & OPT) {
541 
542 		    /* doesn't matter */
543 		    matched = 1;
544 		    cs = MNEXT(cs);
545 		} else
546 
547 		    /* no match, error return */
548 		    return (NIL);
549 		break;
550 
551 	    /* check for start of line */
552 	    case '^':
553 		if (s == _start) {
554 
555 		    /* match, be happy */
556 		    matched = 1;
557 		    cs = MNEXT(cs);
558 		} else if (*cs & ALT) {
559 
560 		    /* try the next part */
561 		    matched = 0;
562 		    cs = MNEXT(cs);
563 		} else if (*cs & OPT) {
564 
565 		    /* doesn't matter */
566 		    matched = 1;
567 		    cs = MNEXT(cs);
568 		} else
569 
570 		    /* no match, error return */
571 		    return (NIL);
572 		break;
573 
574 	    /* end of a subexpression, return success */
575 	    case ')':
576 		return (s);
577 	    }
578 	    break;
579 	}
580     }
581     return (s);
582 }
583