xref: /csrg-svn/usr.bin/vgrind/regexp.c (revision 17500)
1 static char sccsid[] = "@(#)regexp.c	4.2	(Berkeley)	12/11/84";
2 
3 #include <ctype.h>
4 
5 typedef int	boolean;
6 #define TRUE	1
7 #define FALSE	0
8 #define NIL	0
9 
10 boolean l_onecase;	/* true if upper and lower equivalent */
11 
12 #define makelower(c) (isupper((c)) ? tolower((c)) : (c))
13 
14 /*  STRNCMP -	like strncmp except that we convert the
15  *	 	first string to lower case before comparing
16  *		if l_onecase is set.
17  */
18 
19 STRNCMP(s1, s2, len)
20 	register char *s1,*s2;
21 	register int len;
22 {
23 	if (l_onecase) {
24 	    do
25 		if (*s2 - makelower(*s1))
26 			return (*s2 - makelower(*s1));
27 		else {
28 			s2++;
29 			s1++;
30 		}
31 	    while (--len);
32 	} else {
33 	    do
34 		if (*s2 - *s1)
35 			return (*s2 - *s1);
36 		else {
37 			s2++;
38 			s1++;
39 		}
40 	    while (--len);
41 	}
42 	return(0);
43 }
44 
45 /*	The following routine converts an irregular expression to
46  *	internal format.
47  *
48  *	Either meta symbols (\a \d or \p) or character strings or
49  *	operations ( alternation or perenthesizing ) can be
50  *	specified.  Each starts with a descriptor byte.  The descriptor
51  *	byte has STR set for strings, META set for meta symbols
52  *	and OPER set for operations.
53  *	The descriptor byte can also have the OPT bit set if the object
54  *	defined is optional.  Also ALT can be set to indicate an alternation.
55  *
56  *	For metasymbols the byte following the descriptor byte identities
57  *	the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '(').  For
58  *	strings the byte after the descriptor is a character count for
59  *	the string:
60  *
61  *		meta symbols := descriptor
62  *				symbol
63  *
64  *		strings :=	descriptor
65  *				character count
66  *				the string
67  *
68  *		operatins :=	descriptor
69  *				symbol
70  *				character count
71  */
72 
73 /*
74  *  handy macros for accessing parts of match blocks
75  */
76 #define MSYM(A) (*(A+1))	/* symbol in a meta symbol block */
77 #define MNEXT(A) (A+2)		/* character following a metasymbol block */
78 
79 #define OSYM(A) (*(A+1))	/* symbol in an operation block */
80 #define OCNT(A) (*(A+2))	/* character count */
81 #define ONEXT(A) (A+3)		/* next character after the operation */
82 #define OPTR(A) (A+*(A+2))	/* place pointed to by the operator */
83 
84 #define SCNT(A) (*(A+1))	/* byte count of a string */
85 #define SSTR(A) (A+2)		/* address of the string */
86 #define SNEXT(A) (A+2+*(A+1))	/* character following the string */
87 
88 /*
89  *  bit flags in the descriptor
90  */
91 #define OPT 1
92 #define STR 2
93 #define META 4
94 #define ALT 8
95 #define OPER 16
96 
97 char *ure;		/* pointer current position in unconverted exp */
98 char *ccre;		/* pointer to current position in converted exp*/
99 char *malloc();
100 
101 char *
102 convexp(re)
103     char *re;		/* unconverted irregular expression */
104 {
105     register char *cre;		/* pointer to converted regular expression */
106 
107     /* allocate room for the converted expression */
108     if (re == NIL)
109 	return (NIL);
110     if (*re == '\0')
111 	return (NIL);
112     cre = malloc (4 * strlen(re) + 3);
113     ccre = cre;
114     ure = re;
115 
116     /* start the conversion with a \a */
117     *cre = META | OPT;
118     MSYM(cre) = 'a';
119     ccre = MNEXT(cre);
120 
121     /* start the conversion (its recursive) */
122     expconv ();
123     *ccre = 0;
124     return (cre);
125 }
126 
127 expconv()
128 {
129     register char *cs;		/* pointer to current symbol in converted exp */
130     register char c;		/* character being processed */
131     register char *acs;		/* pinter to last alternate */
132     register int temp;
133 
134     /* let the conversion begin */
135     acs = NIL;
136     cs = NIL;
137     while (*ure != NIL) {
138 	switch (c = *ure++) {
139 
140 	case '\\':
141 	    switch (c = *ure++) {
142 
143 	    /* escaped characters are just characters */
144 	    default:
145 		if (cs == NIL || (*cs & STR) == 0) {
146 		    cs = ccre;
147 		    *cs = STR;
148 		    SCNT(cs) = 1;
149 		    ccre += 2;
150 		} else
151 		    SCNT(cs)++;
152 		*ccre++ = c;
153 		break;
154 
155 	    /* normal(?) metacharacters */
156 	    case 'a':
157 	    case 'd':
158 	    case 'e':
159 	    case 'p':
160 		if (acs != NIL && acs != cs) {
161 		    do {
162 			temp = OCNT(acs);
163 			OCNT(acs) = ccre - acs;
164 			acs -= temp;
165 		    } while (temp != 0);
166 		    acs = NIL;
167 		}
168 		cs = ccre;
169 		*cs = META;
170 		MSYM(cs) = c;
171 		ccre = MNEXT(cs);
172 		break;
173 	    }
174 	    break;
175 
176 	/* just put the symbol in */
177 	case '^':
178 	case '$':
179 	    if (acs != NIL && acs != cs) {
180 		do {
181 		    temp = OCNT(acs);
182 		    OCNT(acs) = ccre - acs;
183 		    acs -= temp;
184 		} while (temp != 0);
185 		acs = NIL;
186 	    }
187 	    cs = ccre;
188 	    *cs = META;
189 	    MSYM(cs) = c;
190 	    ccre = MNEXT(cs);
191 	    break;
192 
193 	/* mark the last match sequence as optional */
194 	case '?':
195 	    if (cs)
196 	    	*cs = *cs | OPT;
197 	    break;
198 
199 	/* recurse and define a subexpression */
200 	case '(':
201 	    if (acs != NIL && acs != cs) {
202 		do {
203 		    temp = OCNT(acs);
204 		    OCNT(acs) = ccre - acs;
205 		    acs -= temp;
206 		} while (temp != 0);
207 		acs = NIL;
208 	    }
209 	    cs = ccre;
210 	    *cs = OPER;
211 	    OSYM(cs) = '(';
212 	    ccre = ONEXT(cs);
213 	    expconv ();
214 	    OCNT(cs) = ccre - cs;		/* offset to next symbol */
215 	    break;
216 
217 	/* return from a recursion */
218 	case ')':
219 	    if (acs != NIL) {
220 		do {
221 		    temp = OCNT(acs);
222 		    OCNT(acs) = ccre - acs;
223 		    acs -= temp;
224 		} while (temp != 0);
225 		acs = NIL;
226 	    }
227 	    cs = ccre;
228 	    *cs = META;
229 	    MSYM(cs) = c;
230 	    ccre = MNEXT(cs);
231 	    return;
232 
233 	/* mark the last match sequence as having an alternate */
234 	/* the third byte will contain an offset to jump over the */
235 	/* alternate match in case the first did not fail */
236 	case '|':
237 	    if (acs != NIL && acs != cs)
238 		OCNT(ccre) = ccre - acs;	/* make a back pointer */
239 	    else
240 		OCNT(ccre) = 0;
241 	    *cs |= ALT;
242 	    cs = ccre;
243 	    *cs = OPER;
244 	    OSYM(cs) = '|';
245 	    ccre = ONEXT(cs);
246 	    acs = cs;	/* remember that the pointer is to be filles */
247 	    break;
248 
249 	/* if its not a metasymbol just build a scharacter string */
250 	default:
251 	    if (cs == NIL || (*cs & STR) == 0) {
252 		cs = ccre;
253 		*cs = STR;
254 		SCNT(cs) = 1;
255 		ccre = SSTR(cs);
256 	    } else
257 		SCNT(cs)++;
258 	    *ccre++ = c;
259 	    break;
260 	}
261     }
262     if (acs != NIL) {
263 	do {
264 	    temp = OCNT(acs);
265 	    OCNT(acs) = ccre - acs;
266 	    acs -= temp;
267 	} while (temp != 0);
268 	acs = NIL;
269     }
270     return;
271 }
272 /* end of convertre */
273 
274 
275 /*
276  *	The following routine recognises an irregular expresion
277  *	with the following special characters:
278  *
279  *		\?	-	means last match was optional
280  *		\a	-	matches any number of characters
281  *		\d	-	matches any number of spaces and tabs
282  *		\p	-	matches any number of alphanumeric
283  *				characters. The
284  *				characters matched will be copied into
285  *				the area pointed to by 'name'.
286  *		\|	-	alternation
287  *		\( \)	-	grouping used mostly for alternation and
288  *				optionality
289  *
290  *	The irregular expression must be translated to internal form
291  *	prior to calling this routine
292  *
293  *	The value returned is the pointer to the first non \a
294  *	character matched.
295  */
296 
297 boolean _escaped;		/* true if we are currently _escaped */
298 char *_start;			/* start of string */
299 
300 char *
301 expmatch (s, re, mstring)
302     register char *s;		/* string to check for a match in */
303     register char *re;		/* a converted irregular expression */
304     register char *mstring;	/* where to put whatever matches a \p */
305 {
306     register char *cs;		/* the current symbol */
307     register char *ptr,*s1;	/* temporary pointer */
308     boolean matched;		/* a temporary boolean */
309 
310     /* initial conditions */
311     if (re == NIL)
312 	return (NIL);
313     cs = re;
314     matched = FALSE;
315 
316     /* loop till expression string is exhausted (or at least pretty tired) */
317     while (*cs) {
318 	switch (*cs & (OPER | STR | META)) {
319 
320 	/* try to match a string */
321 	case STR:
322 	    matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
323 	    if (matched) {
324 
325 		/* hoorah it matches */
326 		s += SCNT(cs);
327 		cs = SNEXT(cs);
328 	    } else if (*cs & ALT) {
329 
330 		/* alternation, skip to next expression */
331 		cs = SNEXT(cs);
332 	    } else if (*cs & OPT) {
333 
334 		/* the match is optional */
335 		cs = SNEXT(cs);
336 		matched = 1;		/* indicate a successful match */
337 	    } else {
338 
339 		/* no match, error return */
340 		return (NIL);
341 	    }
342 	    break;
343 
344 	/* an operator, do something fancy */
345 	case OPER:
346 	    switch (OSYM(cs)) {
347 
348 	    /* this is an alternation */
349 	    case '|':
350 		if (matched)
351 
352 		    /* last thing in the alternation was a match, skip ahead */
353 		    cs = OPTR(cs);
354 		else
355 
356 		    /* no match, keep trying */
357 		    cs = ONEXT(cs);
358 		break;
359 
360 	    /* this is a grouping, recurse */
361 	    case '(':
362 		ptr = expmatch (s, ONEXT(cs), mstring);
363 		if (ptr != NIL) {
364 
365 		    /* the subexpression matched */
366 		    matched = 1;
367 		    s = ptr;
368 		} else if (*cs & ALT) {
369 
370 		    /* alternation, skip to next expression */
371 		    matched = 0;
372 		} else if (*cs & OPT) {
373 
374 		    /* the match is optional */
375 		    matched = 1;	/* indicate a successful match */
376 		} else {
377 
378 		    /* no match, error return */
379 		    return (NIL);
380 		}
381 		cs = OPTR(cs);
382 		break;
383 	    }
384 	    break;
385 
386 	/* try to match a metasymbol */
387 	case META:
388 	    switch (MSYM(cs)) {
389 
390 	    /* try to match anything and remember what was matched */
391 	    case 'p':
392 		/*
393 		 *  This is really the same as trying the match the
394 		 *  remaining parts of the expression to any subset
395 		 *  of the string.
396 		 */
397 		s1 = s;
398 		do {
399 		    ptr = expmatch (s1, MNEXT(cs), mstring);
400 		    if (ptr != NIL && s1 != s) {
401 
402 			/* we have a match, remember the match */
403 			strncpy (mstring, s, s1 - s);
404 			mstring[s1 - s] = '\0';
405 			return (ptr);
406 		    } else if (ptr != NIL && (*cs & OPT)) {
407 
408 			/* it was aoptional so no match is ok */
409 			return (ptr);
410 		    } else if (ptr != NIL) {
411 
412 			/* not optional and we still matched */
413 			return (NIL);
414 		    }
415 		    if (!isalnum(*s1) && *s1 != '_')
416 			return (NIL);
417 		    if (*s1 == '\\')
418 			_escaped = _escaped ? FALSE : TRUE;
419 		    else
420 			_escaped = FALSE;
421 		} while (*s1++);
422 		return (NIL);
423 
424 	    /* try to match anything */
425 	    case 'a':
426 		/*
427 		 *  This is really the same as trying the match the
428 		 *  remaining parts of the expression to any subset
429 		 *  of the string.
430 		 */
431 		s1 = s;
432 		do {
433 		    ptr = expmatch (s1, MNEXT(cs), mstring);
434 		    if (ptr != NIL && s1 != s) {
435 
436 			/* we have a match */
437 			return (ptr);
438 		    } else if (ptr != NIL && (*cs & OPT)) {
439 
440 			/* it was aoptional so no match is ok */
441 			return (ptr);
442 		    } else if (ptr != NIL) {
443 
444 			/* not optional and we still matched */
445 			return (NIL);
446 		    }
447 		    if (*s1 == '\\')
448 			_escaped = _escaped ? FALSE : TRUE;
449 		    else
450 			_escaped = FALSE;
451 		} while (*s1++);
452 		return (NIL);
453 
454 	    /* fail if we are currently _escaped */
455 	    case 'e':
456 		if (_escaped)
457 		    return(NIL);
458 		cs = MNEXT(cs);
459 		break;
460 
461 	    /* match any number of tabs and spaces */
462 	    case 'd':
463 		ptr = s;
464 		while (*s == ' ' || *s == '\t')
465 		    s++;
466 		if (s != ptr || s == _start) {
467 
468 		    /* match, be happy */
469 		    matched = 1;
470 		    cs = MNEXT(cs);
471 		} else if (*s == '\n' || *s == '\0') {
472 
473 		    /* match, be happy */
474 		    matched = 1;
475 		    cs = MNEXT(cs);
476 		} else if (*cs & ALT) {
477 
478 		    /* try the next part */
479 		    matched = 0;
480 		    cs = MNEXT(cs);
481 		} else if (*cs & OPT) {
482 
483 		    /* doesn't matter */
484 		    matched = 1;
485 		    cs = MNEXT(cs);
486 		} else
487 
488 		    /* no match, error return */
489 		    return (NIL);
490 		break;
491 
492 	    /* check for end of line */
493 	    case '$':
494 		if (*s == '\0' || *s == '\n') {
495 
496 		    /* match, be happy */
497 		    s++;
498 		    matched = 1;
499 		    cs = MNEXT(cs);
500 		} else if (*cs & ALT) {
501 
502 		    /* try the next part */
503 		    matched = 0;
504 		    cs = MNEXT(cs);
505 		} else if (*cs & OPT) {
506 
507 		    /* doesn't matter */
508 		    matched = 1;
509 		    cs = MNEXT(cs);
510 		} else
511 
512 		    /* no match, error return */
513 		    return (NIL);
514 		break;
515 
516 	    /* check for start of line */
517 	    case '^':
518 		if (s == _start) {
519 
520 		    /* match, be happy */
521 		    matched = 1;
522 		    cs = MNEXT(cs);
523 		} else if (*cs & ALT) {
524 
525 		    /* try the next part */
526 		    matched = 0;
527 		    cs = MNEXT(cs);
528 		} else if (*cs & OPT) {
529 
530 		    /* doesn't matter */
531 		    matched = 1;
532 		    cs = MNEXT(cs);
533 		} else
534 
535 		    /* no match, error return */
536 		    return (NIL);
537 		break;
538 
539 	    /* end of a subexpression, return success */
540 	    case ')':
541 		return (s);
542 	    }
543 	    break;
544 	}
545     }
546     return (s);
547 }
548