1 /*
2 * Copyright (c) 1980, 1993
3 * The Regents of the University of California. All rights reserved.
4 *
5 *
6 * %sccs.include.redist.c%
7 */
8
9 #ifndef lint
10 static char copyright[] =
11 "@(#) Copyright (c) 1980, 1993\n\
12 The Regents of the University of California. All rights reserved.\n";
13 #endif /* not lint */
14
15 #ifndef lint
16 static char sccsid[] = "@(#)regexp.c 8.1 (Berkeley) 06/06/93";
17 #endif /* not lint */
18
19 #include <ctype.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #include "extern.h"
23
24 #define FALSE 0
25 #define TRUE !(FALSE)
26 #define NIL 0
27
28 static void expconv __P((void));
29
30 boolean _escaped; /* true if we are currently _escaped */
31 char *_start; /* start of string */
32 boolean l_onecase; /* true if upper and lower equivalent */
33
34 #define makelower(c) (isupper((c)) ? tolower((c)) : (c))
35
36 /* STRNCMP - like strncmp except that we convert the
37 * first string to lower case before comparing
38 * if l_onecase is set.
39 */
40
41 int
STRNCMP(s1,s2,len)42 STRNCMP(s1, s2, len)
43 register char *s1,*s2;
44 register int len;
45 {
46 if (l_onecase) {
47 do
48 if (*s2 - makelower(*s1))
49 return (*s2 - makelower(*s1));
50 else {
51 s2++;
52 s1++;
53 }
54 while (--len);
55 } else {
56 do
57 if (*s2 - *s1)
58 return (*s2 - *s1);
59 else {
60 s2++;
61 s1++;
62 }
63 while (--len);
64 }
65 return(0);
66 }
67
68 /* The following routine converts an irregular expression to
69 * internal format.
70 *
71 * Either meta symbols (\a \d or \p) or character strings or
72 * operations ( alternation or perenthesizing ) can be
73 * specified. Each starts with a descriptor byte. The descriptor
74 * byte has STR set for strings, META set for meta symbols
75 * and OPER set for operations.
76 * The descriptor byte can also have the OPT bit set if the object
77 * defined is optional. Also ALT can be set to indicate an alternation.
78 *
79 * For metasymbols the byte following the descriptor byte identities
80 * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For
81 * strings the byte after the descriptor is a character count for
82 * the string:
83 *
84 * meta symbols := descriptor
85 * symbol
86 *
87 * strings := descriptor
88 * character count
89 * the string
90 *
91 * operatins := descriptor
92 * symbol
93 * character count
94 */
95
96 /*
97 * handy macros for accessing parts of match blocks
98 */
99 #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */
100 #define MNEXT(A) (A+2) /* character following a metasymbol block */
101
102 #define OSYM(A) (*(A+1)) /* symbol in an operation block */
103 #define OCNT(A) (*(A+2)) /* character count */
104 #define ONEXT(A) (A+3) /* next character after the operation */
105 #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */
106
107 #define SCNT(A) (*(A+1)) /* byte count of a string */
108 #define SSTR(A) (A+2) /* address of the string */
109 #define SNEXT(A) (A+2+*(A+1)) /* character following the string */
110
111 /*
112 * bit flags in the descriptor
113 */
114 #define OPT 1
115 #define STR 2
116 #define META 4
117 #define ALT 8
118 #define OPER 16
119
120 static char *ccre; /* pointer to current position in converted exp*/
121 static char *ure; /* pointer current position in unconverted exp */
122
123 char *
convexp(re)124 convexp(re)
125 char *re; /* unconverted irregular expression */
126 {
127 register char *cre; /* pointer to converted regular expression */
128
129 /* allocate room for the converted expression */
130 if (re == NIL)
131 return (NIL);
132 if (*re == '\0')
133 return (NIL);
134 cre = malloc (4 * strlen(re) + 3);
135 ccre = cre;
136 ure = re;
137
138 /* start the conversion with a \a */
139 *cre = META | OPT;
140 MSYM(cre) = 'a';
141 ccre = MNEXT(cre);
142
143 /* start the conversion (its recursive) */
144 expconv ();
145 *ccre = 0;
146 return (cre);
147 }
148
149 static void
expconv()150 expconv()
151 {
152 register char *cs; /* pointer to current symbol in converted exp */
153 register char c; /* character being processed */
154 register char *acs; /* pinter to last alternate */
155 register int temp;
156
157 /* let the conversion begin */
158 acs = NIL;
159 cs = NIL;
160 while (*ure != NIL) {
161 switch (c = *ure++) {
162
163 case '\\':
164 switch (c = *ure++) {
165
166 /* escaped characters are just characters */
167 default:
168 if (cs == NIL || (*cs & STR) == 0) {
169 cs = ccre;
170 *cs = STR;
171 SCNT(cs) = 1;
172 ccre += 2;
173 } else
174 SCNT(cs)++;
175 *ccre++ = c;
176 break;
177
178 /* normal(?) metacharacters */
179 case 'a':
180 case 'd':
181 case 'e':
182 case 'p':
183 if (acs != NIL && acs != cs) {
184 do {
185 temp = OCNT(acs);
186 OCNT(acs) = ccre - acs;
187 acs -= temp;
188 } while (temp != 0);
189 acs = NIL;
190 }
191 cs = ccre;
192 *cs = META;
193 MSYM(cs) = c;
194 ccre = MNEXT(cs);
195 break;
196 }
197 break;
198
199 /* just put the symbol in */
200 case '^':
201 case '$':
202 if (acs != NIL && acs != cs) {
203 do {
204 temp = OCNT(acs);
205 OCNT(acs) = ccre - acs;
206 acs -= temp;
207 } while (temp != 0);
208 acs = NIL;
209 }
210 cs = ccre;
211 *cs = META;
212 MSYM(cs) = c;
213 ccre = MNEXT(cs);
214 break;
215
216 /* mark the last match sequence as optional */
217 case '?':
218 if (cs)
219 *cs = *cs | OPT;
220 break;
221
222 /* recurse and define a subexpression */
223 case '(':
224 if (acs != NIL && acs != cs) {
225 do {
226 temp = OCNT(acs);
227 OCNT(acs) = ccre - acs;
228 acs -= temp;
229 } while (temp != 0);
230 acs = NIL;
231 }
232 cs = ccre;
233 *cs = OPER;
234 OSYM(cs) = '(';
235 ccre = ONEXT(cs);
236 expconv ();
237 OCNT(cs) = ccre - cs; /* offset to next symbol */
238 break;
239
240 /* reurn from a recursion */
241 case ')':
242 if (acs != NIL) {
243 do {
244 temp = OCNT(acs);
245 OCNT(acs) = ccre - acs;
246 acs -= temp;
247 } while (temp != 0);
248 acs = NIL;
249 }
250 cs = ccre;
251 *cs = META;
252 MSYM(cs) = c;
253 ccre = MNEXT(cs);
254 return;
255
256 /* mark the last match sequence as having an alternate */
257 /* the third byte will contain an offset to jump over the */
258 /* alternate match in case the first did not fail */
259 case '|':
260 if (acs != NIL && acs != cs)
261 OCNT(ccre) = ccre - acs; /* make a back pointer */
262 else
263 OCNT(ccre) = 0;
264 *cs |= ALT;
265 cs = ccre;
266 *cs = OPER;
267 OSYM(cs) = '|';
268 ccre = ONEXT(cs);
269 acs = cs; /* remember that the pointer is to be filles */
270 break;
271
272 /* if its not a metasymbol just build a scharacter string */
273 default:
274 if (cs == NIL || (*cs & STR) == 0) {
275 cs = ccre;
276 *cs = STR;
277 SCNT(cs) = 1;
278 ccre = SSTR(cs);
279 } else
280 SCNT(cs)++;
281 *ccre++ = c;
282 break;
283 }
284 }
285 if (acs != NIL) {
286 do {
287 temp = OCNT(acs);
288 OCNT(acs) = ccre - acs;
289 acs -= temp;
290 } while (temp != 0);
291 acs = NIL;
292 }
293 return;
294 }
295 /* end of convertre */
296
297
298 /*
299 * The following routine recognises an irregular expresion
300 * with the following special characters:
301 *
302 * \? - means last match was optional
303 * \a - matches any number of characters
304 * \d - matches any number of spaces and tabs
305 * \p - matches any number of alphanumeric
306 * characters. The
307 * characters matched will be copied into
308 * the area pointed to by 'name'.
309 * \| - alternation
310 * \( \) - grouping used mostly for alternation and
311 * optionality
312 *
313 * The irregular expression must be translated to internal form
314 * prior to calling this routine
315 *
316 * The value returned is the pointer to the first non \a
317 * character matched.
318 */
319
320 char *
expmatch(s,re,mstring)321 expmatch (s, re, mstring)
322 register char *s; /* string to check for a match in */
323 register char *re; /* a converted irregular expression */
324 register char *mstring; /* where to put whatever matches a \p */
325 {
326 register char *cs; /* the current symbol */
327 register char *ptr,*s1; /* temporary pointer */
328 boolean matched; /* a temporary boolean */
329
330 /* initial conditions */
331 if (re == NIL)
332 return (NIL);
333 cs = re;
334 matched = FALSE;
335
336 /* loop till expression string is exhausted (or at least pretty tired) */
337 while (*cs) {
338 switch (*cs & (OPER | STR | META)) {
339
340 /* try to match a string */
341 case STR:
342 matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
343 if (matched) {
344
345 /* hoorah it matches */
346 s += SCNT(cs);
347 cs = SNEXT(cs);
348 } else if (*cs & ALT) {
349
350 /* alternation, skip to next expression */
351 cs = SNEXT(cs);
352 } else if (*cs & OPT) {
353
354 /* the match is optional */
355 cs = SNEXT(cs);
356 matched = 1; /* indicate a successful match */
357 } else {
358
359 /* no match, error return */
360 return (NIL);
361 }
362 break;
363
364 /* an operator, do something fancy */
365 case OPER:
366 switch (OSYM(cs)) {
367
368 /* this is an alternation */
369 case '|':
370 if (matched)
371
372 /* last thing in the alternation was a match, skip ahead */
373 cs = OPTR(cs);
374 else
375
376 /* no match, keep trying */
377 cs = ONEXT(cs);
378 break;
379
380 /* this is a grouping, recurse */
381 case '(':
382 ptr = expmatch (s, ONEXT(cs), mstring);
383 if (ptr != NIL) {
384
385 /* the subexpression matched */
386 matched = 1;
387 s = ptr;
388 } else if (*cs & ALT) {
389
390 /* alternation, skip to next expression */
391 matched = 0;
392 } else if (*cs & OPT) {
393
394 /* the match is optional */
395 matched = 1; /* indicate a successful match */
396 } else {
397
398 /* no match, error return */
399 return (NIL);
400 }
401 cs = OPTR(cs);
402 break;
403 }
404 break;
405
406 /* try to match a metasymbol */
407 case META:
408 switch (MSYM(cs)) {
409
410 /* try to match anything and remember what was matched */
411 case 'p':
412 /*
413 * This is really the same as trying the match the
414 * remaining parts of the expression to any subset
415 * of the string.
416 */
417 s1 = s;
418 do {
419 ptr = expmatch (s1, MNEXT(cs), mstring);
420 if (ptr != NIL && s1 != s) {
421
422 /* we have a match, remember the match */
423 strncpy (mstring, s, s1 - s);
424 mstring[s1 - s] = '\0';
425 return (ptr);
426 } else if (ptr != NIL && (*cs & OPT)) {
427
428 /* it was aoptional so no match is ok */
429 return (ptr);
430 } else if (ptr != NIL) {
431
432 /* not optional and we still matched */
433 return (NIL);
434 }
435 if (!isalnum(*s1) && *s1 != '_')
436 return (NIL);
437 if (*s1 == '\\')
438 _escaped = _escaped ? FALSE : TRUE;
439 else
440 _escaped = FALSE;
441 } while (*s1++);
442 return (NIL);
443
444 /* try to match anything */
445 case 'a':
446 /*
447 * This is really the same as trying the match the
448 * remaining parts of the expression to any subset
449 * of the string.
450 */
451 s1 = s;
452 do {
453 ptr = expmatch (s1, MNEXT(cs), mstring);
454 if (ptr != NIL && s1 != s) {
455
456 /* we have a match */
457 return (ptr);
458 } else if (ptr != NIL && (*cs & OPT)) {
459
460 /* it was aoptional so no match is ok */
461 return (ptr);
462 } else if (ptr != NIL) {
463
464 /* not optional and we still matched */
465 return (NIL);
466 }
467 if (*s1 == '\\')
468 _escaped = _escaped ? FALSE : TRUE;
469 else
470 _escaped = FALSE;
471 } while (*s1++);
472 return (NIL);
473
474 /* fail if we are currently _escaped */
475 case 'e':
476 if (_escaped)
477 return(NIL);
478 cs = MNEXT(cs);
479 break;
480
481 /* match any number of tabs and spaces */
482 case 'd':
483 ptr = s;
484 while (*s == ' ' || *s == '\t')
485 s++;
486 if (s != ptr || s == _start) {
487
488 /* match, be happy */
489 matched = 1;
490 cs = MNEXT(cs);
491 } else if (*s == '\n' || *s == '\0') {
492
493 /* match, be happy */
494 matched = 1;
495 cs = MNEXT(cs);
496 } else if (*cs & ALT) {
497
498 /* try the next part */
499 matched = 0;
500 cs = MNEXT(cs);
501 } else if (*cs & OPT) {
502
503 /* doesn't matter */
504 matched = 1;
505 cs = MNEXT(cs);
506 } else
507
508 /* no match, error return */
509 return (NIL);
510 break;
511
512 /* check for end of line */
513 case '$':
514 if (*s == '\0' || *s == '\n') {
515
516 /* match, be happy */
517 s++;
518 matched = 1;
519 cs = MNEXT(cs);
520 } else if (*cs & ALT) {
521
522 /* try the next part */
523 matched = 0;
524 cs = MNEXT(cs);
525 } else if (*cs & OPT) {
526
527 /* doesn't matter */
528 matched = 1;
529 cs = MNEXT(cs);
530 } else
531
532 /* no match, error return */
533 return (NIL);
534 break;
535
536 /* check for start of line */
537 case '^':
538 if (s == _start) {
539
540 /* match, be happy */
541 matched = 1;
542 cs = MNEXT(cs);
543 } else if (*cs & ALT) {
544
545 /* try the next part */
546 matched = 0;
547 cs = MNEXT(cs);
548 } else if (*cs & OPT) {
549
550 /* doesn't matter */
551 matched = 1;
552 cs = MNEXT(cs);
553 } else
554
555 /* no match, error return */
556 return (NIL);
557 break;
558
559 /* end of a subexpression, return success */
560 case ')':
561 return (s);
562 }
563 break;
564 }
565 }
566 return (s);
567 }
568