xref: /csrg-svn/usr.bin/vgrind/regexp.c (revision 22063)
1 /*
2  * Copyright (c) 1980 Regents of the University of California.
3  * All rights reserved.  The Berkeley software License Agreement
4  * specifies the terms and conditions for redistribution.
5  */
6 
7 #ifndef lint
8 static char sccsid[] = "@(#)regexp.c	5.1 (Berkeley) 06/05/85";
9 #endif not lint
10 
11 #include <ctype.h>
12 
13 typedef int	boolean;
14 #define TRUE	1
15 #define FALSE	0
16 #define NIL	0
17 
18 boolean l_onecase;	/* true if upper and lower equivalent */
19 
20 #define makelower(c) (isupper((c)) ? tolower((c)) : (c))
21 
22 /*  STRNCMP -	like strncmp except that we convert the
23  *	 	first string to lower case before comparing
24  *		if l_onecase is set.
25  */
26 
27 STRNCMP(s1, s2, len)
28 	register char *s1,*s2;
29 	register int len;
30 {
31 	if (l_onecase) {
32 	    do
33 		if (*s2 - makelower(*s1))
34 			return (*s2 - makelower(*s1));
35 		else {
36 			s2++;
37 			s1++;
38 		}
39 	    while (--len);
40 	} else {
41 	    do
42 		if (*s2 - *s1)
43 			return (*s2 - *s1);
44 		else {
45 			s2++;
46 			s1++;
47 		}
48 	    while (--len);
49 	}
50 	return(0);
51 }
52 
53 /*	The following routine converts an irregular expression to
54  *	internal format.
55  *
56  *	Either meta symbols (\a \d or \p) or character strings or
57  *	operations ( alternation or perenthesizing ) can be
58  *	specified.  Each starts with a descriptor byte.  The descriptor
59  *	byte has STR set for strings, META set for meta symbols
60  *	and OPER set for operations.
61  *	The descriptor byte can also have the OPT bit set if the object
62  *	defined is optional.  Also ALT can be set to indicate an alternation.
63  *
64  *	For metasymbols the byte following the descriptor byte identities
65  *	the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '(').  For
66  *	strings the byte after the descriptor is a character count for
67  *	the string:
68  *
69  *		meta symbols := descriptor
70  *				symbol
71  *
72  *		strings :=	descriptor
73  *				character count
74  *				the string
75  *
76  *		operatins :=	descriptor
77  *				symbol
78  *				character count
79  */
80 
81 /*
82  *  handy macros for accessing parts of match blocks
83  */
84 #define MSYM(A) (*(A+1))	/* symbol in a meta symbol block */
85 #define MNEXT(A) (A+2)		/* character following a metasymbol block */
86 
87 #define OSYM(A) (*(A+1))	/* symbol in an operation block */
88 #define OCNT(A) (*(A+2))	/* character count */
89 #define ONEXT(A) (A+3)		/* next character after the operation */
90 #define OPTR(A) (A+*(A+2))	/* place pointed to by the operator */
91 
92 #define SCNT(A) (*(A+1))	/* byte count of a string */
93 #define SSTR(A) (A+2)		/* address of the string */
94 #define SNEXT(A) (A+2+*(A+1))	/* character following the string */
95 
96 /*
97  *  bit flags in the descriptor
98  */
99 #define OPT 1
100 #define STR 2
101 #define META 4
102 #define ALT 8
103 #define OPER 16
104 
105 char *ure;		/* pointer current position in unconverted exp */
106 char *ccre;		/* pointer to current position in converted exp*/
107 char *malloc();
108 
109 char *
110 convexp(re)
111     char *re;		/* unconverted irregular expression */
112 {
113     register char *cre;		/* pointer to converted regular expression */
114 
115     /* allocate room for the converted expression */
116     if (re == NIL)
117 	return (NIL);
118     if (*re == '\0')
119 	return (NIL);
120     cre = malloc (4 * strlen(re) + 3);
121     ccre = cre;
122     ure = re;
123 
124     /* start the conversion with a \a */
125     *cre = META | OPT;
126     MSYM(cre) = 'a';
127     ccre = MNEXT(cre);
128 
129     /* start the conversion (its recursive) */
130     expconv ();
131     *ccre = 0;
132     return (cre);
133 }
134 
135 expconv()
136 {
137     register char *cs;		/* pointer to current symbol in converted exp */
138     register char c;		/* character being processed */
139     register char *acs;		/* pinter to last alternate */
140     register int temp;
141 
142     /* let the conversion begin */
143     acs = NIL;
144     cs = NIL;
145     while (*ure != NIL) {
146 	switch (c = *ure++) {
147 
148 	case '\\':
149 	    switch (c = *ure++) {
150 
151 	    /* escaped characters are just characters */
152 	    default:
153 		if (cs == NIL || (*cs & STR) == 0) {
154 		    cs = ccre;
155 		    *cs = STR;
156 		    SCNT(cs) = 1;
157 		    ccre += 2;
158 		} else
159 		    SCNT(cs)++;
160 		*ccre++ = c;
161 		break;
162 
163 	    /* normal(?) metacharacters */
164 	    case 'a':
165 	    case 'd':
166 	    case 'e':
167 	    case 'p':
168 		if (acs != NIL && acs != cs) {
169 		    do {
170 			temp = OCNT(acs);
171 			OCNT(acs) = ccre - acs;
172 			acs -= temp;
173 		    } while (temp != 0);
174 		    acs = NIL;
175 		}
176 		cs = ccre;
177 		*cs = META;
178 		MSYM(cs) = c;
179 		ccre = MNEXT(cs);
180 		break;
181 	    }
182 	    break;
183 
184 	/* just put the symbol in */
185 	case '^':
186 	case '$':
187 	    if (acs != NIL && acs != cs) {
188 		do {
189 		    temp = OCNT(acs);
190 		    OCNT(acs) = ccre - acs;
191 		    acs -= temp;
192 		} while (temp != 0);
193 		acs = NIL;
194 	    }
195 	    cs = ccre;
196 	    *cs = META;
197 	    MSYM(cs) = c;
198 	    ccre = MNEXT(cs);
199 	    break;
200 
201 	/* mark the last match sequence as optional */
202 	case '?':
203 	    if (cs)
204 	    	*cs = *cs | OPT;
205 	    break;
206 
207 	/* recurse and define a subexpression */
208 	case '(':
209 	    if (acs != NIL && acs != cs) {
210 		do {
211 		    temp = OCNT(acs);
212 		    OCNT(acs) = ccre - acs;
213 		    acs -= temp;
214 		} while (temp != 0);
215 		acs = NIL;
216 	    }
217 	    cs = ccre;
218 	    *cs = OPER;
219 	    OSYM(cs) = '(';
220 	    ccre = ONEXT(cs);
221 	    expconv ();
222 	    OCNT(cs) = ccre - cs;		/* offset to next symbol */
223 	    break;
224 
225 	/* return from a recursion */
226 	case ')':
227 	    if (acs != NIL) {
228 		do {
229 		    temp = OCNT(acs);
230 		    OCNT(acs) = ccre - acs;
231 		    acs -= temp;
232 		} while (temp != 0);
233 		acs = NIL;
234 	    }
235 	    cs = ccre;
236 	    *cs = META;
237 	    MSYM(cs) = c;
238 	    ccre = MNEXT(cs);
239 	    return;
240 
241 	/* mark the last match sequence as having an alternate */
242 	/* the third byte will contain an offset to jump over the */
243 	/* alternate match in case the first did not fail */
244 	case '|':
245 	    if (acs != NIL && acs != cs)
246 		OCNT(ccre) = ccre - acs;	/* make a back pointer */
247 	    else
248 		OCNT(ccre) = 0;
249 	    *cs |= ALT;
250 	    cs = ccre;
251 	    *cs = OPER;
252 	    OSYM(cs) = '|';
253 	    ccre = ONEXT(cs);
254 	    acs = cs;	/* remember that the pointer is to be filles */
255 	    break;
256 
257 	/* if its not a metasymbol just build a scharacter string */
258 	default:
259 	    if (cs == NIL || (*cs & STR) == 0) {
260 		cs = ccre;
261 		*cs = STR;
262 		SCNT(cs) = 1;
263 		ccre = SSTR(cs);
264 	    } else
265 		SCNT(cs)++;
266 	    *ccre++ = c;
267 	    break;
268 	}
269     }
270     if (acs != NIL) {
271 	do {
272 	    temp = OCNT(acs);
273 	    OCNT(acs) = ccre - acs;
274 	    acs -= temp;
275 	} while (temp != 0);
276 	acs = NIL;
277     }
278     return;
279 }
280 /* end of convertre */
281 
282 
283 /*
284  *	The following routine recognises an irregular expresion
285  *	with the following special characters:
286  *
287  *		\?	-	means last match was optional
288  *		\a	-	matches any number of characters
289  *		\d	-	matches any number of spaces and tabs
290  *		\p	-	matches any number of alphanumeric
291  *				characters. The
292  *				characters matched will be copied into
293  *				the area pointed to by 'name'.
294  *		\|	-	alternation
295  *		\( \)	-	grouping used mostly for alternation and
296  *				optionality
297  *
298  *	The irregular expression must be translated to internal form
299  *	prior to calling this routine
300  *
301  *	The value returned is the pointer to the first non \a
302  *	character matched.
303  */
304 
305 boolean _escaped;		/* true if we are currently _escaped */
306 char *_start;			/* start of string */
307 
308 char *
309 expmatch (s, re, mstring)
310     register char *s;		/* string to check for a match in */
311     register char *re;		/* a converted irregular expression */
312     register char *mstring;	/* where to put whatever matches a \p */
313 {
314     register char *cs;		/* the current symbol */
315     register char *ptr,*s1;	/* temporary pointer */
316     boolean matched;		/* a temporary boolean */
317 
318     /* initial conditions */
319     if (re == NIL)
320 	return (NIL);
321     cs = re;
322     matched = FALSE;
323 
324     /* loop till expression string is exhausted (or at least pretty tired) */
325     while (*cs) {
326 	switch (*cs & (OPER | STR | META)) {
327 
328 	/* try to match a string */
329 	case STR:
330 	    matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
331 	    if (matched) {
332 
333 		/* hoorah it matches */
334 		s += SCNT(cs);
335 		cs = SNEXT(cs);
336 	    } else if (*cs & ALT) {
337 
338 		/* alternation, skip to next expression */
339 		cs = SNEXT(cs);
340 	    } else if (*cs & OPT) {
341 
342 		/* the match is optional */
343 		cs = SNEXT(cs);
344 		matched = 1;		/* indicate a successful match */
345 	    } else {
346 
347 		/* no match, error return */
348 		return (NIL);
349 	    }
350 	    break;
351 
352 	/* an operator, do something fancy */
353 	case OPER:
354 	    switch (OSYM(cs)) {
355 
356 	    /* this is an alternation */
357 	    case '|':
358 		if (matched)
359 
360 		    /* last thing in the alternation was a match, skip ahead */
361 		    cs = OPTR(cs);
362 		else
363 
364 		    /* no match, keep trying */
365 		    cs = ONEXT(cs);
366 		break;
367 
368 	    /* this is a grouping, recurse */
369 	    case '(':
370 		ptr = expmatch (s, ONEXT(cs), mstring);
371 		if (ptr != NIL) {
372 
373 		    /* the subexpression matched */
374 		    matched = 1;
375 		    s = ptr;
376 		} else if (*cs & ALT) {
377 
378 		    /* alternation, skip to next expression */
379 		    matched = 0;
380 		} else if (*cs & OPT) {
381 
382 		    /* the match is optional */
383 		    matched = 1;	/* indicate a successful match */
384 		} else {
385 
386 		    /* no match, error return */
387 		    return (NIL);
388 		}
389 		cs = OPTR(cs);
390 		break;
391 	    }
392 	    break;
393 
394 	/* try to match a metasymbol */
395 	case META:
396 	    switch (MSYM(cs)) {
397 
398 	    /* try to match anything and remember what was matched */
399 	    case 'p':
400 		/*
401 		 *  This is really the same as trying the match the
402 		 *  remaining parts of the expression to any subset
403 		 *  of the string.
404 		 */
405 		s1 = s;
406 		do {
407 		    ptr = expmatch (s1, MNEXT(cs), mstring);
408 		    if (ptr != NIL && s1 != s) {
409 
410 			/* we have a match, remember the match */
411 			strncpy (mstring, s, s1 - s);
412 			mstring[s1 - s] = '\0';
413 			return (ptr);
414 		    } else if (ptr != NIL && (*cs & OPT)) {
415 
416 			/* it was aoptional so no match is ok */
417 			return (ptr);
418 		    } else if (ptr != NIL) {
419 
420 			/* not optional and we still matched */
421 			return (NIL);
422 		    }
423 		    if (!isalnum(*s1) && *s1 != '_')
424 			return (NIL);
425 		    if (*s1 == '\\')
426 			_escaped = _escaped ? FALSE : TRUE;
427 		    else
428 			_escaped = FALSE;
429 		} while (*s1++);
430 		return (NIL);
431 
432 	    /* try to match anything */
433 	    case 'a':
434 		/*
435 		 *  This is really the same as trying the match the
436 		 *  remaining parts of the expression to any subset
437 		 *  of the string.
438 		 */
439 		s1 = s;
440 		do {
441 		    ptr = expmatch (s1, MNEXT(cs), mstring);
442 		    if (ptr != NIL && s1 != s) {
443 
444 			/* we have a match */
445 			return (ptr);
446 		    } else if (ptr != NIL && (*cs & OPT)) {
447 
448 			/* it was aoptional so no match is ok */
449 			return (ptr);
450 		    } else if (ptr != NIL) {
451 
452 			/* not optional and we still matched */
453 			return (NIL);
454 		    }
455 		    if (*s1 == '\\')
456 			_escaped = _escaped ? FALSE : TRUE;
457 		    else
458 			_escaped = FALSE;
459 		} while (*s1++);
460 		return (NIL);
461 
462 	    /* fail if we are currently _escaped */
463 	    case 'e':
464 		if (_escaped)
465 		    return(NIL);
466 		cs = MNEXT(cs);
467 		break;
468 
469 	    /* match any number of tabs and spaces */
470 	    case 'd':
471 		ptr = s;
472 		while (*s == ' ' || *s == '\t')
473 		    s++;
474 		if (s != ptr || s == _start) {
475 
476 		    /* match, be happy */
477 		    matched = 1;
478 		    cs = MNEXT(cs);
479 		} else if (*s == '\n' || *s == '\0') {
480 
481 		    /* match, be happy */
482 		    matched = 1;
483 		    cs = MNEXT(cs);
484 		} else if (*cs & ALT) {
485 
486 		    /* try the next part */
487 		    matched = 0;
488 		    cs = MNEXT(cs);
489 		} else if (*cs & OPT) {
490 
491 		    /* doesn't matter */
492 		    matched = 1;
493 		    cs = MNEXT(cs);
494 		} else
495 
496 		    /* no match, error return */
497 		    return (NIL);
498 		break;
499 
500 	    /* check for end of line */
501 	    case '$':
502 		if (*s == '\0' || *s == '\n') {
503 
504 		    /* match, be happy */
505 		    s++;
506 		    matched = 1;
507 		    cs = MNEXT(cs);
508 		} else if (*cs & ALT) {
509 
510 		    /* try the next part */
511 		    matched = 0;
512 		    cs = MNEXT(cs);
513 		} else if (*cs & OPT) {
514 
515 		    /* doesn't matter */
516 		    matched = 1;
517 		    cs = MNEXT(cs);
518 		} else
519 
520 		    /* no match, error return */
521 		    return (NIL);
522 		break;
523 
524 	    /* check for start of line */
525 	    case '^':
526 		if (s == _start) {
527 
528 		    /* match, be happy */
529 		    matched = 1;
530 		    cs = MNEXT(cs);
531 		} else if (*cs & ALT) {
532 
533 		    /* try the next part */
534 		    matched = 0;
535 		    cs = MNEXT(cs);
536 		} else if (*cs & OPT) {
537 
538 		    /* doesn't matter */
539 		    matched = 1;
540 		    cs = MNEXT(cs);
541 		} else
542 
543 		    /* no match, error return */
544 		    return (NIL);
545 		break;
546 
547 	    /* end of a subexpression, return success */
548 	    case ')':
549 		return (s);
550 	    }
551 	    break;
552 	}
553     }
554     return (s);
555 }
556