xref: /onnv-gate/usr/src/lib/libldap4/common/regex.c (revision 3857:21b9b714e4ab)
1 /*
2  *
3  * Portions Copyright 1998 Sun Microsystems, Inc.  All rights reserved.
4  * Use is subject to license terms.
5  *
6  */
7 
8 #pragma ident	"%Z%%M%	%I%	%E% SMI"
9 
10 #include "portable.h"
11 #include "stdio.h"
12 #include "stdlib.h"
13 
14 #if defined( MACOS ) || defined( DOS ) || defined( _WIN32 ) || defined( NEED_BSDREGEX )
15 #include "regex.h"
16 
17 /*
18  * regex - Regular expression pattern matching  and replacement
19  *
20  * By:  Ozan S. Yigit (oz)
21  *      Dept. of Computer Science
22  *      York University
23  *
24  * These routines are the PUBLIC DOMAIN equivalents of regex
25  * routines as found in 4.nBSD UN*X, with minor extensions.
26  *
27  * These routines are derived from various implementations found
28  * in software tools books, and Conroy's grep. They are NOT derived
29  * from licensed/restricted software.
30  * For more interesting/academic/complicated implementations,
31  * see Henry Spencer's regexp routines, or GNU Emacs pattern
32  * matching module.
33  *
34  * Modification history:
35  *
36  * $Log: regex.c,v $
37  * Revision 1.12  1996/04/25  16:20:59  mcs
38  * make re_exec() match "" with ".*" and similar patterns
39  * hopefully this change doesn't break anything else!
40  *
41  * Revision 1.11  1994/12/14  21:33:45  mcs
42  * use new NEED_BSDREGEX
43  * fix pmatch() prototype
44  *
45  * Revision 1.10  1994/12/12  18:16:39  mcs
46  * use on NetBSD
47  *
48  * Revision 1.9  1994/11/15  19:16:35  mcs
49  * add (CHAR) cast to make VisualC++ happy
50  *
51  * Revision 1.8  1994/11/08  21:14:32  mcs
52  * WIN32 changes
53  *
54  * Revision 1.7  1994/07/23  19:51:24  mcs
55  * use ANSI-style inline function parameters
56  *
57  * Revision 1.6  1993/10/18  01:52:32  tim
58  * include for VMS
59  *
60  * Revision 1.5  1993/09/28  21:37:54  mcs
61  * HP/UX needs the regex we include (not in its libc)
62  *
63  * Revision 1.4  1993/08/27  15:59:52  mcs
64  * use CHAR for deftab
65  *
66  * Revision 1.3  1993/08/27  15:49:47  mcs
67  * added missing 0 to octal constants
68  * use unsigned char for CHAR under DOS
69  *
70  * Revision 1.2  1993/08/27  14:57:48  mcs
71  * add proto. for pmatch
72  *
73  * Revision 1.1  1993/08/18  21:20:02  mcs
74  * Initial revision
75  *
76  * Revision 1.4  1991/10/17  03:56:42  oz
77  * miscellaneous changes, small cleanups etc.
78  *
79  * Revision 1.3  1989/04/01  14:18:09  oz
80  * Change all references to a dfa: this is actually an nfa.
81  *
82  * Revision 1.2  88/08/28  15:36:04  oz
83  * Use a complement bitmap to represent NCL.
84  * This removes the need to have seperate
85  * code in the pmatch case block - it is
86  * just CCL code now.
87  *
88  * Use the actual CCL code in the CLO
89  * section of pmatch. No need for a recursive
90  * pmatch call.
91  *
92  * Use a bitmap table to set char bits in an
93  * 8-bit chunk.
94  *
95  * Interfaces:
96  *      re_comp:        compile a regular expression into a NFA.
97  *
98  *			char *re_comp(s)
99  *			char *s;
100  *
101  *      re_exec:        execute the NFA to match a pattern.
102  *
103  *			int re_exec(s)
104  *			char *s;
105  *
106  *	re_modw		change re_exec's understanding of what a "word"
107  *			looks like (for \< and \>) by adding into the
108  *			hidden word-syntax table.
109  *
110  *			void re_modw(s)
111  *			char *s;
112  *
113  *      re_subs:	substitute the matched portions in a new string.
114  *
115  *			int re_subs(src, dst)
116  *			char *src;
117  *			char *dst;
118  *
119  *	re_fail:	failure routine for re_exec.
120  *
121  *			void re_fail(msg, op)
122  *			char *msg;
123  *			char op;
124  *
125  * Regular Expressions:
126  *
127  *      [1]     char    matches itself, unless it is a special
128  *                      character (metachar): . \ [ ] * + ^ $
129  *
130  *      [2]     .       matches any character.
131  *
132  *      [3]     \       matches the character following it, except
133  *			when followed by a left or right round bracket,
134  *			a digit 1 to 9 or a left or right angle bracket.
135  *			(see [7], [8] and [9])
136  *			It is used as an escape character for all
137  *			other meta-characters, and itself. When used
138  *			in a set ([4]), it is treated as an ordinary
139  *			character.
140  *
141  *      [4]     [set]   matches one of the characters in the set.
142  *                      If the first character in the set is "^",
143  *                      it matches a character NOT in the set, i.e.
144  *			complements the set. A shorthand S-E is
145  *			used to specify a set of characters S upto
146  *			E, inclusive. The special characters "]" and
147  *			"-" have no special meaning if they appear
148  *			as the first chars in the set.
149  *                      examples:        match:
150  *
151  *                              [a-z]    any lowercase alpha
152  *
153  *                              [^]-]    any char except ] and -
154  *
155  *                              [^A-Z]   any char except uppercase
156  *                                       alpha
157  *
158  *                              [a-zA-Z] any alpha
159  *
160  *      [5]     *       any regular expression form [1] to [4], followed by
161  *                      closure char (*) matches zero or more matches of
162  *                      that form.
163  *
164  *      [6]     +       same as [5], except it matches one or more.
165  *
166  *      [7]             a regular expression in the form [1] to [10], enclosed
167  *                      as \(form\) matches what form matches. The enclosure
168  *                      creates a set of tags, used for [8] and for
169  *                      pattern substution. The tagged forms are numbered
170  *			starting from 1.
171  *
172  *      [8]             a \ followed by a digit 1 to 9 matches whatever a
173  *                      previously tagged regular expression ([7]) matched.
174  *
175  *	[9]	\<	a regular expression starting with a \< construct
176  *		\>	and/or ending with a \> construct, restricts the
177  *			pattern matching to the beginning of a word, and/or
178  *			the end of a word. A word is defined to be a character
179  *			string beginning and/or ending with the characters
180  *			A-Z a-z 0-9 and _. It must also be preceded and/or
181  *			followed by any character outside those mentioned.
182  *
183  *      [10]            a composite regular expression xy where x and y
184  *                      are in the form [1] to [10] matches the longest
185  *                      match of x followed by a match for y.
186  *
187  *      [11]	^	a regular expression starting with a ^ character
188  *		$	and/or ending with a $ character, restricts the
189  *                      pattern matching to the beginning of the line,
190  *                      or the end of line. [anchors] Elsewhere in the
191  *			pattern, ^ and $ are treated as ordinary characters.
192  *
193  *
194  * Acknowledgements:
195  *
196  *	HCR's Hugh Redelmeier has been most helpful in various
197  *	stages of development. He convinced me to include BOW
198  *	and EOW constructs, originally invented by Rob Pike at
199  *	the University of Toronto.
200  *
201  * References:
202  *              Software tools			Kernighan & Plauger
203  *              Software tools in Pascal        Kernighan & Plauger
204  *              Grep [rsx-11 C dist]            David Conroy
205  *		ed - text editor		Un*x Programmer's Manual
206  *		Advanced editing on Un*x	B. W. Kernighan
207  *		RegExp routines			Henry Spencer
208  *
209  * Notes:
210  *
211  *	This implementation uses a bit-set representation for character
212  *	classes for speed and compactness. Each character is represented
213  *	by one bit in a 128-bit block. Thus, CCL always takes a
214  *	constant 16 bytes in the internal nfa, and re_exec does a single
215  *	bit comparison to locate the character in the set.
216  *
217  * Examples:
218  *
219  *	pattern:	foo*.*
220  *	compile:	CHR f CHR o CLO CHR o END CLO ANY END END
221  *	matches:	fo foo fooo foobar fobar foxx ...
222  *
223  *	pattern:	fo[ob]a[rz]
224  *	compile:	CHR f CHR o CCL bitset CHR a CCL bitset END
225  *	matches:	fobar fooar fobaz fooaz
226  *
227  *	pattern:	foo\\+
228  *	compile:	CHR f CHR o CHR o CHR \ CLO CHR \ END END
229  *	matches:	foo\ foo\\ foo\\\  ...
230  *
231  *	pattern:	\(foo\)[1-3]\1	(same as foo[1-3]foo)
232  *	compile:	BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END
233  *	matches:	foo1foo foo2foo foo3foo
234  *
235  *	pattern:	\(fo.*\)-\1
236  *	compile:	BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END
237  *	matches:	foo-foo fo-fo fob-fob foobar-foobar ...
238  */
239 
240 #define MAXNFA  1024
241 #define MAXTAG  10
242 
243 #define OKP     1
244 #define NOP     0
245 
246 #define CHR     1
247 #define ANY     2
248 #define CCL     3
249 #define BOL     4
250 #define EOL     5
251 #define BOT     6
252 #define EOT     7
253 #define BOW	8
254 #define EOW	9
255 #define REF     10
256 #define CLO     11
257 
258 #define END     0
259 
260 /*
261  * The following defines are not meant to be changeable.
262  * They are for readability only.
263  */
264 #define MAXCHR	128
265 #define CHRBIT	8
266 #define BITBLK	MAXCHR/CHRBIT
267 #define BLKIND	0170
268 #define BITIND	07
269 
270 #define ASCIIB	0177
271 
272 #if defined( DOS ) || defined( _WIN32 ) || defined(SUN)
273 typedef unsigned char CHAR;
274 #else /* DOS */
275 typedef /*unsigned*/ char CHAR;
276 #endif /* DOS */
277 
278 static int  tagstk[MAXTAG];             /* subpat tag stack..*/
279 static CHAR nfa[MAXNFA];		/* automaton..       */
280 static int  sta = NOP;               	/* status of lastpat */
281 
282 static CHAR bittab[BITBLK];		/* bit table for CCL */
283 					/* pre-set bits...   */
284 static CHAR bitarr[] = {1,2,4,8,16,32,64,128};
285 
286 #ifdef DEBUG
287 static void nfadump(CHAR *);
288 void symbolic(char *);
289 #endif
290 
291 static void
chset(CHAR c)292 chset(CHAR c)
293 {
294 	bittab[((c) & BLKIND) >> 3] |= bitarr[(c) & BITIND];
295 }
296 
297 #define badpat(x)	(*nfa = END, x)
298 #define store(x)	*mp++ = x
299 
300 char *
re_comp(char * pat)301 re_comp( char *pat )
302 {
303 	register char *p;               /* pattern pointer   */
304 	register CHAR *mp=nfa;          /* nfa pointer       */
305 	register CHAR *lp;              /* saved pointer..   */
306 	register CHAR *sp=nfa;          /* another one..     */
307 
308 	register int tagi = 0;          /* tag stack index   */
309 	register int tagc = 1;          /* actual tag count  */
310 
311 	register int n;
312 	register CHAR mask;		/* xor mask -CCL/NCL */
313 	int c1, c2;
314 
315 	if (!pat || !*pat)
316 		if (sta)
317 			return 0;
318 		else
319 			return badpat("No previous regular expression");
320 	sta = NOP;
321 
322 	for (p = pat; *p; p++) {
323 		lp = mp;
324 		switch(*p) {
325 
326 		case '.':               /* match any char..  */
327 			store(ANY);
328 			break;
329 
330 		case '^':               /* match beginning.. */
331 			if (p == pat)
332 				store(BOL);
333 			else {
334 				store(CHR);
335 				store(*p);
336 			}
337 			break;
338 
339 		case '$':               /* match endofline.. */
340 			if (!*(p+1))
341 				store(EOL);
342 			else {
343 				store(CHR);
344 				store(*p);
345 			}
346 			break;
347 
348 		case '[':               /* match char class..*/
349 			store(CCL);
350 
351 			if (*++p == '^') {
352 				mask = 0377;
353 				p++;
354 			}
355 			else
356 				mask = 0;
357 
358 			if (*p == '-')		/* real dash */
359 				chset(*p++);
360 			if (*p == ']')		/* real brac */
361 				chset(*p++);
362 			while (*p && *p != ']') {
363 				if (*p == '-' && *(p+1) && *(p+1) != ']') {
364 					p++;
365 					c1 = *(p-2) + 1;
366 					c2 = *p++;
367 					while (c1 <= c2)
368 						chset((CHAR)c1++);
369 				}
370 #ifdef EXTEND
371 				else if (*p == '\\' && *(p+1)) {
372 					p++;
373 					chset(*p++);
374 				}
375 #endif
376 				else
377 					chset(*p++);
378 			}
379 			if (!*p)
380 				return badpat("Missing ]");
381 
382 			for (n = 0; n < BITBLK; bittab[n++] = (char) 0)
383 				store(mask ^ bittab[n]);
384 
385 			break;
386 
387 		case '*':               /* match 0 or more.. */
388 		case '+':               /* match 1 or more.. */
389 			if (p == pat)
390 				return badpat("Empty closure");
391 			lp = sp;		/* previous opcode */
392 			if (*lp == CLO)		/* equivalence..   */
393 				break;
394 			switch(*lp) {
395 
396 			case BOL:
397 			case BOT:
398 			case EOT:
399 			case BOW:
400 			case EOW:
401 			case REF:
402 				return badpat("Illegal closure");
403 			default:
404 				break;
405 			}
406 
407 			if (*p == '+')
408 				for (sp = mp; lp < sp; lp++)
409 					store(*lp);
410 
411 			store(END);
412 			store(END);
413 			sp = mp;
414 			while (--mp > lp)
415 				*mp = mp[-1];
416 			store(CLO);
417 			mp = sp;
418 			break;
419 
420 		case '\\':              /* tags, backrefs .. */
421 			switch(*++p) {
422 
423 			case '(':
424 				if (tagc < MAXTAG) {
425 					tagstk[++tagi] = tagc;
426 					store(BOT);
427 					store(tagc++);
428 				}
429 				else
430 					return badpat("Too many \\(\\) pairs");
431 				break;
432 			case ')':
433 				if (*sp == BOT)
434 					return badpat("Null pattern inside \\(\\)");
435 				if (tagi > 0) {
436 					store(EOT);
437 					store(tagstk[tagi--]);
438 				}
439 				else
440 					return badpat("Unmatched \\)");
441 				break;
442 			case '<':
443 				store(BOW);
444 				break;
445 			case '>':
446 				if (*sp == BOW)
447 					return badpat("Null pattern inside \\<\\>");
448 				store(EOW);
449 				break;
450 			case '1':
451 			case '2':
452 			case '3':
453 			case '4':
454 			case '5':
455 			case '6':
456 			case '7':
457 			case '8':
458 			case '9':
459 				n = *p-'0';
460 				if (tagi > 0 && tagstk[tagi] == n)
461 					return badpat("Cyclical reference");
462 				if (tagc > n) {
463 					store(REF);
464 					store(n);
465 				}
466 				else
467 					return badpat("Undetermined reference");
468 				break;
469 #ifdef EXTEND
470 			case 'b':
471 				store(CHR);
472 				store('\b');
473 				break;
474 			case 'n':
475 				store(CHR);
476 				store('\n');
477 				break;
478 			case 'f':
479 				store(CHR);
480 				store('\f');
481 				break;
482 			case 'r':
483 				store(CHR);
484 				store('\r');
485 				break;
486 			case 't':
487 				store(CHR);
488 				store('\t');
489 				break;
490 #endif
491 			default:
492 				store(CHR);
493 				store(*p);
494 			}
495 			break;
496 
497 		default :               /* an ordinary char  */
498 			store(CHR);
499 			store(*p);
500 			break;
501 		}
502 		sp = lp;
503 	}
504 	if (tagi > 0)
505 		return badpat("Unmatched \\(");
506 	store(END);
507 	sta = OKP;
508 	return 0;
509 }
510 
511 
512 static char *bol;
513 char *bopat[MAXTAG];
514 char *eopat[MAXTAG];
515 #ifdef NEEDPROTOS
516 static char *pmatch( char *lp, CHAR *ap );
517 #else /* NEEDPROTOS */
518 static char *pmatch();
519 #endif /* NEEDPROTOS */
520 
521 /*
522  * re_exec:
523  * 	execute nfa to find a match.
524  *
525  *	special cases: (nfa[0])
526  *		BOL
527  *			Match only once, starting from the
528  *			beginning.
529  *		CHR
530  *			First locate the character without
531  *			calling pmatch, and if found, call
532  *			pmatch for the remaining string.
533  *		END
534  *			re_comp failed, poor luser did not
535  *			check for it. Fail fast.
536  *
537  *	If a match is found, bopat[0] and eopat[0] are set
538  *	to the beginning and the end of the matched fragment,
539  *	respectively.
540  *
541  */
542 
543 int
re_exec(char * lp)544 re_exec( char *lp )
545 {
546 	register char c;
547 	register char *ep = 0;
548 	register CHAR *ap = nfa;
549 
550 	bol = lp;
551 
552 	bopat[0] = 0;
553 	bopat[1] = 0;
554 	bopat[2] = 0;
555 	bopat[3] = 0;
556 	bopat[4] = 0;
557 	bopat[5] = 0;
558 	bopat[6] = 0;
559 	bopat[7] = 0;
560 	bopat[8] = 0;
561 	bopat[9] = 0;
562 
563 	switch(*ap) {
564 
565 	case BOL:			/* anchored: match from BOL only */
566 		ep = pmatch(lp,ap);
567 		break;
568 	case CHR:			/* ordinary char: locate it fast */
569 		c = *(ap+1);
570 		while (*lp && *lp != c)
571 			lp++;
572 		if (!*lp)		/* if EOS, fail, else fall thru. */
573 			return 0;
574 	default:			/* regular matching all the way. */
575 		do {
576 			if ((ep = pmatch(lp,ap)))
577 				break;
578 			lp++;
579 		} while (*lp);
580 
581 		break;
582 	case END:			/* munged automaton. fail always */
583 		return 0;
584 	}
585 	if (!ep)
586 		return 0;
587 
588 	bopat[0] = lp;
589 	eopat[0] = ep;
590 	return 1;
591 }
592 
593 /*
594  * pmatch: internal routine for the hard part
595  *
596  * 	This code is partly snarfed from an early grep written by
597  *	David Conroy. The backref and tag stuff, and various other
598  *	innovations are by oz.
599  *
600  *	special case optimizations: (nfa[n], nfa[n+1])
601  *		CLO ANY
602  *			We KNOW .* will match everything upto the
603  *			end of line. Thus, directly go to the end of
604  *			line, without recursive pmatch calls. As in
605  *			the other closure cases, the remaining pattern
606  *			must be matched by moving backwards on the
607  *			string recursively, to find a match for xy
608  *			(x is ".*" and y is the remaining pattern)
609  *			where the match satisfies the LONGEST match for
610  *			x followed by a match for y.
611  *		CLO CHR
612  *			We can again scan the string forward for the
613  *			single char and at the point of failure, we
614  *			execute the remaining nfa recursively, same as
615  *			above.
616  *
617  *	At the end of a successful match, bopat[n] and eopat[n]
618  *	are set to the beginning and end of subpatterns matched
619  *	by tagged expressions (n = 1 to 9).
620  *
621  */
622 
623 #ifndef re_fail
624 extern void re_fail();
625 #endif /* re_fail */
626 
627 /*
628  * character classification table for word boundary operators BOW
629  * and EOW. the reason for not using ctype macros is that we can
630  * let the user add into our own table. see re_modw. This table
631  * is not in the bitset form, since we may wish to extend it in the
632  * future for other character classifications.
633  *
634  *	TRUE for 0-9 A-Z a-z _
635  */
636 static char chrtyp[MAXCHR] = {
637 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
638 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
639 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
640 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
641 	0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
642 	1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
643 	0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
644 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
645 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
646 	1, 0, 0, 0, 0, 1, 0, 1, 1, 1,
647 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
648 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
649 	1, 1, 1, 0, 0, 0, 0, 0
650 	};
651 
652 #define inascii(x)	(0177&(x))
653 #define iswordc(x) 	chrtyp[inascii(x)]
654 #define isinset(x,y) 	((x)[((y)&BLKIND)>>3] & bitarr[(y)&BITIND])
655 
656 /*
657  * skip values for CLO XXX to skip past the closure
658  */
659 
660 #define ANYSKIP	2 	/* [CLO] ANY END ...	     */
661 #define CHRSKIP	3	/* [CLO] CHR chr END ...     */
662 #define CCLSKIP 18	/* [CLO] CCL 16bytes END ... */
663 
664 static char *
pmatch(char * lp,CHAR * ap)665 pmatch( char *lp, CHAR *ap)
666 {
667 	register int op, c, n;
668 	register char *e;		/* extra pointer for CLO */
669 	register char *bp;		/* beginning of subpat.. */
670 	register char *ep;		/* ending of subpat..	 */
671 	char *are;			/* to save the line ptr. */
672 
673 	while ((op = *ap++) != END)
674 		switch(op) {
675 
676 		case CHR:
677 			if (*lp++ != *ap++)
678 				return 0;
679 			break;
680 		case ANY:
681 			if (!*lp++)
682 				return 0;
683 			break;
684 		case CCL:
685 			c = *lp++;
686 			if (!isinset(ap,c))
687 				return 0;
688 			ap += BITBLK;
689 			break;
690 		case BOL:
691 			if (lp != bol)
692 				return 0;
693 			break;
694 		case EOL:
695 			if (*lp)
696 				return 0;
697 			break;
698 		case BOT:
699 			bopat[*ap++] = lp;
700 			break;
701 		case EOT:
702 			eopat[*ap++] = lp;
703 			break;
704  		case BOW:
705 			if (lp!=bol && iswordc(lp[-1]) || !iswordc(*lp))
706 				return 0;
707 			break;
708 		case EOW:
709 			if (lp==bol || !iswordc(lp[-1]) || iswordc(*lp))
710 				return 0;
711 			break;
712 		case REF:
713 			n = *ap++;
714 			bp = bopat[n];
715 			ep = eopat[n];
716 			while (bp < ep)
717 				if (*bp++ != *lp++)
718 					return 0;
719 			break;
720 		case CLO:
721 			are = lp;
722 			switch(*ap) {
723 
724 			case ANY:
725 				while (*lp)
726 					lp++;
727 				n = ANYSKIP;
728 				break;
729 			case CHR:
730 				c = *(ap+1);
731 				while (*lp && c == *lp)
732 					lp++;
733 				n = CHRSKIP;
734 				break;
735 			case CCL:
736 				while ((c = *lp) && isinset(ap+1,c))
737 					lp++;
738 				n = CCLSKIP;
739 				break;
740 			default:
741 				re_fail("closure: bad nfa.", *ap);
742 				return 0;
743 			}
744 
745 			ap += n;
746 
747 			while (lp >= are) {
748 				if (e = pmatch(lp, ap))
749 					return e;
750 				--lp;
751 			}
752 			return 0;
753 		default:
754 			re_fail("re_exec: bad nfa.", op);
755 			return 0;
756 		}
757 	return lp;
758 }
759 
760 /*
761  * re_modw:
762  *	add new characters into the word table to change re_exec's
763  *	understanding of what a word should look like. Note that we
764  *	only accept additions into the word definition.
765  *
766  *	If the string parameter is 0 or null string, the table is
767  *	reset back to the default containing A-Z a-z 0-9 _. [We use
768  *	the compact bitset representation for the default table]
769  */
770 
771 static CHAR deftab[16] = {
772 	0, 0, 0, 0, 0, 0, 0377, 003, 0376, 0377, 0377, 0207,
773 	0376, 0377, 0377, 007
774 };
775 
776 void
re_modw(char * s)777 re_modw( char *s )
778 {
779 	register int i;
780 
781 	if (!s || !*s) {
782 		for (i = 0; i < MAXCHR; i++)
783 			if (!isinset(deftab,i))
784 				iswordc(i) = 0;
785 	}
786 	else
787 		while(*s)
788 			iswordc(*s++) = 1;
789 }
790 
791 /*
792  * re_subs:
793  *	substitute the matched portions of the src in dst.
794  *
795  *	&	substitute the entire matched pattern.
796  *
797  *	\digit	substitute a subpattern, with the given	tag number.
798  *		Tags are numbered from 1 to 9. If the particular
799  *		tagged subpattern does not exist, null is substituted.
800  */
801 int
re_subs(char * src,char * dst)802 re_subs( char *src, char *dst)
803 {
804 	register char c;
805 	register int  pin;
806 	register char *bp;
807 	register char *ep;
808 
809 	if (!*src || !bopat[0])
810 		return 0;
811 
812 	while (c = *src++) {
813 		switch(c) {
814 
815 		case '&':
816 			pin = 0;
817 			break;
818 
819 		case '\\':
820 			c = *src++;
821 			if (c >= '0' && c <= '9') {
822 				pin = c - '0';
823 				break;
824 			}
825 
826 		default:
827 			*dst++ = c;
828 			continue;
829 		}
830 
831 		if ((bp = bopat[pin]) && (ep = eopat[pin])) {
832 			while (*bp && bp < ep)
833 				*dst++ = *bp++;
834 			if (bp < ep)
835 				return 0;
836 		}
837 	}
838 	*dst = (char) 0;
839 	return 1;
840 }
841 
842 #ifdef DEBUG
843 /*
844  * symbolic - produce a symbolic dump of the nfa
845  */
846 void
symbolic(char * s)847 symbolic( char *s )
848 {
849 	(void) printf("pattern: %s\n", s);
850 	(void) printf("nfacode:\n");
851 	nfadump(nfa);
852 }
853 
854 static void
nfadump(CHAR * ap)855 nfadump( CHAR *ap)
856 {
857 	register int n;
858 
859 	while (*ap != END)
860 		switch(*ap++) {
861 		case CLO:
862 			(void) printf("CLOSURE");
863 			nfadump(ap);
864 			switch(*ap) {
865 			case CHR:
866 				n = CHRSKIP;
867 				break;
868 			case ANY:
869 				n = ANYSKIP;
870 				break;
871 			case CCL:
872 				n = CCLSKIP;
873 				break;
874 			}
875 			ap += n;
876 			break;
877 		case CHR:
878 			(void) printf("\tCHR %c\n",*ap++);
879 			break;
880 		case ANY:
881 			(void) printf("\tANY .\n");
882 			break;
883 		case BOL:
884 			(void) printf("\tBOL -\n");
885 			break;
886 		case EOL:
887 			(void) printf("\tEOL -\n");
888 			break;
889 		case BOT:
890 			(void) printf("BOT: %d\n",*ap++);
891 			break;
892 		case EOT:
893 			(void) printf("EOT: %d\n",*ap++);
894 			break;
895 		case BOW:
896 			(void) printf("BOW\n");
897 			break;
898 		case EOW:
899 			(void) printf("EOW\n");
900 			break;
901 		case REF:
902 			(void) printf("REF: %d\n",*ap++);
903 			break;
904 		case CCL:
905 			(void) printf("\tCCL [");
906 			for (n = 0; n < MAXCHR; n++)
907 				if (isinset(ap,(CHAR)n)) {
908 					if (n < ' ')
909 						(void) printf("^%c", n ^ 0x040);
910 					else
911 						(void) printf("%c", n);
912 				}
913 			(void) printf("]\n");
914 			ap += BITBLK;
915 			break;
916 		default:
917 			(void) printf("bad nfa. opcode %o\n", ap[-1]);
918 			exit(1);
919 			break;
920 		}
921 }
922 #endif
923 #endif /* MACOS or DOS or NEED_BSDREGEX */
924