xref: /onnv-gate/usr/src/lib/libldap4/common/regex.c (revision 0:68f95e015346)
1 /*
2  *
3  * Portions Copyright %G% Sun Microsystems, Inc. All Rights Reserved
4  *
5  */
6 
7 #pragma ident	"%Z%%M%	%I%	%E% SMI"
8 
9 #include "portable.h"
10 #include "stdio.h"
11 #include "stdlib.h"
12 
13 #if defined( MACOS ) || defined( DOS ) || defined( _WIN32 ) || defined( NEED_BSDREGEX )
14 #include "regex.h"
15 
16 /*
17  * regex - Regular expression pattern matching  and replacement
18  *
19  * By:  Ozan S. Yigit (oz)
20  *      Dept. of Computer Science
21  *      York University
22  *
23  * These routines are the PUBLIC DOMAIN equivalents of regex
24  * routines as found in 4.nBSD UN*X, with minor extensions.
25  *
26  * These routines are derived from various implementations found
27  * in software tools books, and Conroy's grep. They are NOT derived
28  * from licensed/restricted software.
29  * For more interesting/academic/complicated implementations,
30  * see Henry Spencer's regexp routines, or GNU Emacs pattern
31  * matching module.
32  *
33  * Modification history:
34  *
35  * $Log: regex.c,v $
36  * Revision 1.12  1996/04/25  16:20:59  mcs
37  * make re_exec() match "" with ".*" and similar patterns
38  * hopefully this change doesn't break anything else!
39  *
40  * Revision 1.11  1994/12/14  21:33:45  mcs
41  * use new NEED_BSDREGEX
42  * fix pmatch() prototype
43  *
44  * Revision 1.10  1994/12/12  18:16:39  mcs
45  * use on NetBSD
46  *
47  * Revision 1.9  1994/11/15  19:16:35  mcs
48  * add (CHAR) cast to make VisualC++ happy
49  *
50  * Revision 1.8  1994/11/08  21:14:32  mcs
51  * WIN32 changes
52  *
53  * Revision 1.7  1994/07/23  19:51:24  mcs
54  * use ANSI-style inline function parameters
55  *
56  * Revision 1.6  1993/10/18  01:52:32  tim
57  * include for VMS
58  *
59  * Revision 1.5  1993/09/28  21:37:54  mcs
60  * HP/UX needs the regex we include (not in its libc)
61  *
62  * Revision 1.4  1993/08/27  15:59:52  mcs
63  * use CHAR for deftab
64  *
65  * Revision 1.3  1993/08/27  15:49:47  mcs
66  * added missing 0 to octal constants
67  * use unsigned char for CHAR under DOS
68  *
69  * Revision 1.2  1993/08/27  14:57:48  mcs
70  * add proto. for pmatch
71  *
72  * Revision 1.1  1993/08/18  21:20:02  mcs
73  * Initial revision
74  *
75  * Revision 1.4  1991/10/17  03:56:42  oz
76  * miscellaneous changes, small cleanups etc.
77  *
78  * Revision 1.3  1989/04/01  14:18:09  oz
79  * Change all references to a dfa: this is actually an nfa.
80  *
81  * Revision 1.2  88/08/28  15:36:04  oz
82  * Use a complement bitmap to represent NCL.
83  * This removes the need to have seperate
84  * code in the pmatch case block - it is
85  * just CCL code now.
86  *
87  * Use the actual CCL code in the CLO
88  * section of pmatch. No need for a recursive
89  * pmatch call.
90  *
91  * Use a bitmap table to set char bits in an
92  * 8-bit chunk.
93  *
94  * Interfaces:
95  *      re_comp:        compile a regular expression into a NFA.
96  *
97  *			char *re_comp(s)
98  *			char *s;
99  *
100  *      re_exec:        execute the NFA to match a pattern.
101  *
102  *			int re_exec(s)
103  *			char *s;
104  *
105  *	re_modw		change re_exec's understanding of what a "word"
106  *			looks like (for \< and \>) by adding into the
107  *			hidden word-syntax table.
108  *
109  *			void re_modw(s)
110  *			char *s;
111  *
112  *      re_subs:	substitute the matched portions in a new string.
113  *
114  *			int re_subs(src, dst)
115  *			char *src;
116  *			char *dst;
117  *
118  *	re_fail:	failure routine for re_exec.
119  *
120  *			void re_fail(msg, op)
121  *			char *msg;
122  *			char op;
123  *
124  * Regular Expressions:
125  *
126  *      [1]     char    matches itself, unless it is a special
127  *                      character (metachar): . \ [ ] * + ^ $
128  *
129  *      [2]     .       matches any character.
130  *
131  *      [3]     \       matches the character following it, except
132  *			when followed by a left or right round bracket,
133  *			a digit 1 to 9 or a left or right angle bracket.
134  *			(see [7], [8] and [9])
135  *			It is used as an escape character for all
136  *			other meta-characters, and itself. When used
137  *			in a set ([4]), it is treated as an ordinary
138  *			character.
139  *
140  *      [4]     [set]   matches one of the characters in the set.
141  *                      If the first character in the set is "^",
142  *                      it matches a character NOT in the set, i.e.
143  *			complements the set. A shorthand S-E is
144  *			used to specify a set of characters S upto
145  *			E, inclusive. The special characters "]" and
146  *			"-" have no special meaning if they appear
147  *			as the first chars in the set.
148  *                      examples:        match:
149  *
150  *                              [a-z]    any lowercase alpha
151  *
152  *                              [^]-]    any char except ] and -
153  *
154  *                              [^A-Z]   any char except uppercase
155  *                                       alpha
156  *
157  *                              [a-zA-Z] any alpha
158  *
159  *      [5]     *       any regular expression form [1] to [4], followed by
160  *                      closure char (*) matches zero or more matches of
161  *                      that form.
162  *
163  *      [6]     +       same as [5], except it matches one or more.
164  *
165  *      [7]             a regular expression in the form [1] to [10], enclosed
166  *                      as \(form\) matches what form matches. The enclosure
167  *                      creates a set of tags, used for [8] and for
168  *                      pattern substution. The tagged forms are numbered
169  *			starting from 1.
170  *
171  *      [8]             a \ followed by a digit 1 to 9 matches whatever a
172  *                      previously tagged regular expression ([7]) matched.
173  *
174  *	[9]	\<	a regular expression starting with a \< construct
175  *		\>	and/or ending with a \> construct, restricts the
176  *			pattern matching to the beginning of a word, and/or
177  *			the end of a word. A word is defined to be a character
178  *			string beginning and/or ending with the characters
179  *			A-Z a-z 0-9 and _. It must also be preceded and/or
180  *			followed by any character outside those mentioned.
181  *
182  *      [10]            a composite regular expression xy where x and y
183  *                      are in the form [1] to [10] matches the longest
184  *                      match of x followed by a match for y.
185  *
186  *      [11]	^	a regular expression starting with a ^ character
187  *		$	and/or ending with a $ character, restricts the
188  *                      pattern matching to the beginning of the line,
189  *                      or the end of line. [anchors] Elsewhere in the
190  *			pattern, ^ and $ are treated as ordinary characters.
191  *
192  *
193  * Acknowledgements:
194  *
195  *	HCR's Hugh Redelmeier has been most helpful in various
196  *	stages of development. He convinced me to include BOW
197  *	and EOW constructs, originally invented by Rob Pike at
198  *	the University of Toronto.
199  *
200  * References:
201  *              Software tools			Kernighan & Plauger
202  *              Software tools in Pascal        Kernighan & Plauger
203  *              Grep [rsx-11 C dist]            David Conroy
204  *		ed - text editor		Un*x Programmer's Manual
205  *		Advanced editing on Un*x	B. W. Kernighan
206  *		RegExp routines			Henry Spencer
207  *
208  * Notes:
209  *
210  *	This implementation uses a bit-set representation for character
211  *	classes for speed and compactness. Each character is represented
212  *	by one bit in a 128-bit block. Thus, CCL always takes a
213  *	constant 16 bytes in the internal nfa, and re_exec does a single
214  *	bit comparison to locate the character in the set.
215  *
216  * Examples:
217  *
218  *	pattern:	foo*.*
219  *	compile:	CHR f CHR o CLO CHR o END CLO ANY END END
220  *	matches:	fo foo fooo foobar fobar foxx ...
221  *
222  *	pattern:	fo[ob]a[rz]
223  *	compile:	CHR f CHR o CCL bitset CHR a CCL bitset END
224  *	matches:	fobar fooar fobaz fooaz
225  *
226  *	pattern:	foo\\+
227  *	compile:	CHR f CHR o CHR o CHR \ CLO CHR \ END END
228  *	matches:	foo\ foo\\ foo\\\  ...
229  *
230  *	pattern:	\(foo\)[1-3]\1	(same as foo[1-3]foo)
231  *	compile:	BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END
232  *	matches:	foo1foo foo2foo foo3foo
233  *
234  *	pattern:	\(fo.*\)-\1
235  *	compile:	BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END
236  *	matches:	foo-foo fo-fo fob-fob foobar-foobar ...
237  */
238 
239 #define MAXNFA  1024
240 #define MAXTAG  10
241 
242 #define OKP     1
243 #define NOP     0
244 
245 #define CHR     1
246 #define ANY     2
247 #define CCL     3
248 #define BOL     4
249 #define EOL     5
250 #define BOT     6
251 #define EOT     7
252 #define BOW	8
253 #define EOW	9
254 #define REF     10
255 #define CLO     11
256 
257 #define END     0
258 
259 /*
260  * The following defines are not meant to be changeable.
261  * They are for readability only.
262  */
263 #define MAXCHR	128
264 #define CHRBIT	8
265 #define BITBLK	MAXCHR/CHRBIT
266 #define BLKIND	0170
267 #define BITIND	07
268 
269 #define ASCIIB	0177
270 
271 #if defined( DOS ) || defined( _WIN32 ) || defined(SUN)
272 typedef unsigned char CHAR;
273 #else /* DOS */
274 typedef /*unsigned*/ char CHAR;
275 #endif /* DOS */
276 
277 static int  tagstk[MAXTAG];             /* subpat tag stack..*/
278 static CHAR nfa[MAXNFA];		/* automaton..       */
279 static int  sta = NOP;               	/* status of lastpat */
280 
281 static CHAR bittab[BITBLK];		/* bit table for CCL */
282 					/* pre-set bits...   */
283 static CHAR bitarr[] = {1,2,4,8,16,32,64,128};
284 
285 #ifdef DEBUG
286 static void nfadump(CHAR *);
287 void symbolic(char *);
288 #endif
289 
290 static void
291 chset(CHAR c)
292 {
293 	bittab[((c) & BLKIND) >> 3] |= bitarr[(c) & BITIND];
294 }
295 
296 #define badpat(x)	(*nfa = END, x)
297 #define store(x)	*mp++ = x
298 
299 char *
300 re_comp( char *pat )
301 {
302 	register char *p;               /* pattern pointer   */
303 	register CHAR *mp=nfa;          /* nfa pointer       */
304 	register CHAR *lp;              /* saved pointer..   */
305 	register CHAR *sp=nfa;          /* another one..     */
306 
307 	register int tagi = 0;          /* tag stack index   */
308 	register int tagc = 1;          /* actual tag count  */
309 
310 	register int n;
311 	register CHAR mask;		/* xor mask -CCL/NCL */
312 	int c1, c2;
313 
314 	if (!pat || !*pat)
315 		if (sta)
316 			return 0;
317 		else
318 			return badpat("No previous regular expression");
319 	sta = NOP;
320 
321 	for (p = pat; *p; p++) {
322 		lp = mp;
323 		switch(*p) {
324 
325 		case '.':               /* match any char..  */
326 			store(ANY);
327 			break;
328 
329 		case '^':               /* match beginning.. */
330 			if (p == pat)
331 				store(BOL);
332 			else {
333 				store(CHR);
334 				store(*p);
335 			}
336 			break;
337 
338 		case '$':               /* match endofline.. */
339 			if (!*(p+1))
340 				store(EOL);
341 			else {
342 				store(CHR);
343 				store(*p);
344 			}
345 			break;
346 
347 		case '[':               /* match char class..*/
348 			store(CCL);
349 
350 			if (*++p == '^') {
351 				mask = 0377;
352 				p++;
353 			}
354 			else
355 				mask = 0;
356 
357 			if (*p == '-')		/* real dash */
358 				chset(*p++);
359 			if (*p == ']')		/* real brac */
360 				chset(*p++);
361 			while (*p && *p != ']') {
362 				if (*p == '-' && *(p+1) && *(p+1) != ']') {
363 					p++;
364 					c1 = *(p-2) + 1;
365 					c2 = *p++;
366 					while (c1 <= c2)
367 						chset((CHAR)c1++);
368 				}
369 #ifdef EXTEND
370 				else if (*p == '\\' && *(p+1)) {
371 					p++;
372 					chset(*p++);
373 				}
374 #endif
375 				else
376 					chset(*p++);
377 			}
378 			if (!*p)
379 				return badpat("Missing ]");
380 
381 			for (n = 0; n < BITBLK; bittab[n++] = (char) 0)
382 				store(mask ^ bittab[n]);
383 
384 			break;
385 
386 		case '*':               /* match 0 or more.. */
387 		case '+':               /* match 1 or more.. */
388 			if (p == pat)
389 				return badpat("Empty closure");
390 			lp = sp;		/* previous opcode */
391 			if (*lp == CLO)		/* equivalence..   */
392 				break;
393 			switch(*lp) {
394 
395 			case BOL:
396 			case BOT:
397 			case EOT:
398 			case BOW:
399 			case EOW:
400 			case REF:
401 				return badpat("Illegal closure");
402 			default:
403 				break;
404 			}
405 
406 			if (*p == '+')
407 				for (sp = mp; lp < sp; lp++)
408 					store(*lp);
409 
410 			store(END);
411 			store(END);
412 			sp = mp;
413 			while (--mp > lp)
414 				*mp = mp[-1];
415 			store(CLO);
416 			mp = sp;
417 			break;
418 
419 		case '\\':              /* tags, backrefs .. */
420 			switch(*++p) {
421 
422 			case '(':
423 				if (tagc < MAXTAG) {
424 					tagstk[++tagi] = tagc;
425 					store(BOT);
426 					store(tagc++);
427 				}
428 				else
429 					return badpat("Too many \\(\\) pairs");
430 				break;
431 			case ')':
432 				if (*sp == BOT)
433 					return badpat("Null pattern inside \\(\\)");
434 				if (tagi > 0) {
435 					store(EOT);
436 					store(tagstk[tagi--]);
437 				}
438 				else
439 					return badpat("Unmatched \\)");
440 				break;
441 			case '<':
442 				store(BOW);
443 				break;
444 			case '>':
445 				if (*sp == BOW)
446 					return badpat("Null pattern inside \\<\\>");
447 				store(EOW);
448 				break;
449 			case '1':
450 			case '2':
451 			case '3':
452 			case '4':
453 			case '5':
454 			case '6':
455 			case '7':
456 			case '8':
457 			case '9':
458 				n = *p-'0';
459 				if (tagi > 0 && tagstk[tagi] == n)
460 					return badpat("Cyclical reference");
461 				if (tagc > n) {
462 					store(REF);
463 					store(n);
464 				}
465 				else
466 					return badpat("Undetermined reference");
467 				break;
468 #ifdef EXTEND
469 			case 'b':
470 				store(CHR);
471 				store('\b');
472 				break;
473 			case 'n':
474 				store(CHR);
475 				store('\n');
476 				break;
477 			case 'f':
478 				store(CHR);
479 				store('\f');
480 				break;
481 			case 'r':
482 				store(CHR);
483 				store('\r');
484 				break;
485 			case 't':
486 				store(CHR);
487 				store('\t');
488 				break;
489 #endif
490 			default:
491 				store(CHR);
492 				store(*p);
493 			}
494 			break;
495 
496 		default :               /* an ordinary char  */
497 			store(CHR);
498 			store(*p);
499 			break;
500 		}
501 		sp = lp;
502 	}
503 	if (tagi > 0)
504 		return badpat("Unmatched \\(");
505 	store(END);
506 	sta = OKP;
507 	return 0;
508 }
509 
510 
511 static char *bol;
512 char *bopat[MAXTAG];
513 char *eopat[MAXTAG];
514 #ifdef NEEDPROTOS
515 static char *pmatch( char *lp, CHAR *ap );
516 #else /* NEEDPROTOS */
517 static char *pmatch();
518 #endif /* NEEDPROTOS */
519 
520 /*
521  * re_exec:
522  * 	execute nfa to find a match.
523  *
524  *	special cases: (nfa[0])
525  *		BOL
526  *			Match only once, starting from the
527  *			beginning.
528  *		CHR
529  *			First locate the character without
530  *			calling pmatch, and if found, call
531  *			pmatch for the remaining string.
532  *		END
533  *			re_comp failed, poor luser did not
534  *			check for it. Fail fast.
535  *
536  *	If a match is found, bopat[0] and eopat[0] are set
537  *	to the beginning and the end of the matched fragment,
538  *	respectively.
539  *
540  */
541 
542 int
543 re_exec( char *lp )
544 {
545 	register char c;
546 	register char *ep = 0;
547 	register CHAR *ap = nfa;
548 
549 	bol = lp;
550 
551 	bopat[0] = 0;
552 	bopat[1] = 0;
553 	bopat[2] = 0;
554 	bopat[3] = 0;
555 	bopat[4] = 0;
556 	bopat[5] = 0;
557 	bopat[6] = 0;
558 	bopat[7] = 0;
559 	bopat[8] = 0;
560 	bopat[9] = 0;
561 
562 	switch(*ap) {
563 
564 	case BOL:			/* anchored: match from BOL only */
565 		ep = pmatch(lp,ap);
566 		break;
567 	case CHR:			/* ordinary char: locate it fast */
568 		c = *(ap+1);
569 		while (*lp && *lp != c)
570 			lp++;
571 		if (!*lp)		/* if EOS, fail, else fall thru. */
572 			return 0;
573 	default:			/* regular matching all the way. */
574 		do {
575 			if ((ep = pmatch(lp,ap)))
576 				break;
577 			lp++;
578 		} while (*lp);
579 
580 		break;
581 	case END:			/* munged automaton. fail always */
582 		return 0;
583 	}
584 	if (!ep)
585 		return 0;
586 
587 	bopat[0] = lp;
588 	eopat[0] = ep;
589 	return 1;
590 }
591 
592 /*
593  * pmatch: internal routine for the hard part
594  *
595  * 	This code is partly snarfed from an early grep written by
596  *	David Conroy. The backref and tag stuff, and various other
597  *	innovations are by oz.
598  *
599  *	special case optimizations: (nfa[n], nfa[n+1])
600  *		CLO ANY
601  *			We KNOW .* will match everything upto the
602  *			end of line. Thus, directly go to the end of
603  *			line, without recursive pmatch calls. As in
604  *			the other closure cases, the remaining pattern
605  *			must be matched by moving backwards on the
606  *			string recursively, to find a match for xy
607  *			(x is ".*" and y is the remaining pattern)
608  *			where the match satisfies the LONGEST match for
609  *			x followed by a match for y.
610  *		CLO CHR
611  *			We can again scan the string forward for the
612  *			single char and at the point of failure, we
613  *			execute the remaining nfa recursively, same as
614  *			above.
615  *
616  *	At the end of a successful match, bopat[n] and eopat[n]
617  *	are set to the beginning and end of subpatterns matched
618  *	by tagged expressions (n = 1 to 9).
619  *
620  */
621 
622 #ifndef re_fail
623 extern void re_fail();
624 #endif /* re_fail */
625 
626 /*
627  * character classification table for word boundary operators BOW
628  * and EOW. the reason for not using ctype macros is that we can
629  * let the user add into our own table. see re_modw. This table
630  * is not in the bitset form, since we may wish to extend it in the
631  * future for other character classifications.
632  *
633  *	TRUE for 0-9 A-Z a-z _
634  */
635 static char chrtyp[MAXCHR] = {
636 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
637 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
638 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
639 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
640 	0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
641 	1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
642 	0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
643 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
644 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
645 	1, 0, 0, 0, 0, 1, 0, 1, 1, 1,
646 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
647 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
648 	1, 1, 1, 0, 0, 0, 0, 0
649 	};
650 
651 #define inascii(x)	(0177&(x))
652 #define iswordc(x) 	chrtyp[inascii(x)]
653 #define isinset(x,y) 	((x)[((y)&BLKIND)>>3] & bitarr[(y)&BITIND])
654 
655 /*
656  * skip values for CLO XXX to skip past the closure
657  */
658 
659 #define ANYSKIP	2 	/* [CLO] ANY END ...	     */
660 #define CHRSKIP	3	/* [CLO] CHR chr END ...     */
661 #define CCLSKIP 18	/* [CLO] CCL 16bytes END ... */
662 
663 static char *
664 pmatch( char *lp, CHAR *ap)
665 {
666 	register int op, c, n;
667 	register char *e;		/* extra pointer for CLO */
668 	register char *bp;		/* beginning of subpat.. */
669 	register char *ep;		/* ending of subpat..	 */
670 	char *are;			/* to save the line ptr. */
671 
672 	while ((op = *ap++) != END)
673 		switch(op) {
674 
675 		case CHR:
676 			if (*lp++ != *ap++)
677 				return 0;
678 			break;
679 		case ANY:
680 			if (!*lp++)
681 				return 0;
682 			break;
683 		case CCL:
684 			c = *lp++;
685 			if (!isinset(ap,c))
686 				return 0;
687 			ap += BITBLK;
688 			break;
689 		case BOL:
690 			if (lp != bol)
691 				return 0;
692 			break;
693 		case EOL:
694 			if (*lp)
695 				return 0;
696 			break;
697 		case BOT:
698 			bopat[*ap++] = lp;
699 			break;
700 		case EOT:
701 			eopat[*ap++] = lp;
702 			break;
703  		case BOW:
704 			if (lp!=bol && iswordc(lp[-1]) || !iswordc(*lp))
705 				return 0;
706 			break;
707 		case EOW:
708 			if (lp==bol || !iswordc(lp[-1]) || iswordc(*lp))
709 				return 0;
710 			break;
711 		case REF:
712 			n = *ap++;
713 			bp = bopat[n];
714 			ep = eopat[n];
715 			while (bp < ep)
716 				if (*bp++ != *lp++)
717 					return 0;
718 			break;
719 		case CLO:
720 			are = lp;
721 			switch(*ap) {
722 
723 			case ANY:
724 				while (*lp)
725 					lp++;
726 				n = ANYSKIP;
727 				break;
728 			case CHR:
729 				c = *(ap+1);
730 				while (*lp && c == *lp)
731 					lp++;
732 				n = CHRSKIP;
733 				break;
734 			case CCL:
735 				while ((c = *lp) && isinset(ap+1,c))
736 					lp++;
737 				n = CCLSKIP;
738 				break;
739 			default:
740 				re_fail("closure: bad nfa.", *ap);
741 				return 0;
742 			}
743 
744 			ap += n;
745 
746 			while (lp >= are) {
747 				if (e = pmatch(lp, ap))
748 					return e;
749 				--lp;
750 			}
751 			return 0;
752 		default:
753 			re_fail("re_exec: bad nfa.", op);
754 			return 0;
755 		}
756 	return lp;
757 }
758 
759 /*
760  * re_modw:
761  *	add new characters into the word table to change re_exec's
762  *	understanding of what a word should look like. Note that we
763  *	only accept additions into the word definition.
764  *
765  *	If the string parameter is 0 or null string, the table is
766  *	reset back to the default containing A-Z a-z 0-9 _. [We use
767  *	the compact bitset representation for the default table]
768  */
769 
770 static CHAR deftab[16] = {
771 	0, 0, 0, 0, 0, 0, 0377, 003, 0376, 0377, 0377, 0207,
772 	0376, 0377, 0377, 007
773 };
774 
775 void
776 re_modw( char *s )
777 {
778 	register int i;
779 
780 	if (!s || !*s) {
781 		for (i = 0; i < MAXCHR; i++)
782 			if (!isinset(deftab,i))
783 				iswordc(i) = 0;
784 	}
785 	else
786 		while(*s)
787 			iswordc(*s++) = 1;
788 }
789 
790 /*
791  * re_subs:
792  *	substitute the matched portions of the src in dst.
793  *
794  *	&	substitute the entire matched pattern.
795  *
796  *	\digit	substitute a subpattern, with the given	tag number.
797  *		Tags are numbered from 1 to 9. If the particular
798  *		tagged subpattern does not exist, null is substituted.
799  */
800 int
801 re_subs( char *src, char *dst)
802 {
803 	register char c;
804 	register int  pin;
805 	register char *bp;
806 	register char *ep;
807 
808 	if (!*src || !bopat[0])
809 		return 0;
810 
811 	while (c = *src++) {
812 		switch(c) {
813 
814 		case '&':
815 			pin = 0;
816 			break;
817 
818 		case '\\':
819 			c = *src++;
820 			if (c >= '0' && c <= '9') {
821 				pin = c - '0';
822 				break;
823 			}
824 
825 		default:
826 			*dst++ = c;
827 			continue;
828 		}
829 
830 		if ((bp = bopat[pin]) && (ep = eopat[pin])) {
831 			while (*bp && bp < ep)
832 				*dst++ = *bp++;
833 			if (bp < ep)
834 				return 0;
835 		}
836 	}
837 	*dst = (char) 0;
838 	return 1;
839 }
840 
841 #ifdef DEBUG
842 /*
843  * symbolic - produce a symbolic dump of the nfa
844  */
845 void
846 symbolic( char *s )
847 {
848 	(void) printf("pattern: %s\n", s);
849 	(void) printf("nfacode:\n");
850 	nfadump(nfa);
851 }
852 
853 static void
854 nfadump( CHAR *ap)
855 {
856 	register int n;
857 
858 	while (*ap != END)
859 		switch(*ap++) {
860 		case CLO:
861 			(void) printf("CLOSURE");
862 			nfadump(ap);
863 			switch(*ap) {
864 			case CHR:
865 				n = CHRSKIP;
866 				break;
867 			case ANY:
868 				n = ANYSKIP;
869 				break;
870 			case CCL:
871 				n = CCLSKIP;
872 				break;
873 			}
874 			ap += n;
875 			break;
876 		case CHR:
877 			(void) printf("\tCHR %c\n",*ap++);
878 			break;
879 		case ANY:
880 			(void) printf("\tANY .\n");
881 			break;
882 		case BOL:
883 			(void) printf("\tBOL -\n");
884 			break;
885 		case EOL:
886 			(void) printf("\tEOL -\n");
887 			break;
888 		case BOT:
889 			(void) printf("BOT: %d\n",*ap++);
890 			break;
891 		case EOT:
892 			(void) printf("EOT: %d\n",*ap++);
893 			break;
894 		case BOW:
895 			(void) printf("BOW\n");
896 			break;
897 		case EOW:
898 			(void) printf("EOW\n");
899 			break;
900 		case REF:
901 			(void) printf("REF: %d\n",*ap++);
902 			break;
903 		case CCL:
904 			(void) printf("\tCCL [");
905 			for (n = 0; n < MAXCHR; n++)
906 				if (isinset(ap,(CHAR)n)) {
907 					if (n < ' ')
908 						(void) printf("^%c", n ^ 0x040);
909 					else
910 						(void) printf("%c", n);
911 				}
912 			(void) printf("]\n");
913 			ap += BITBLK;
914 			break;
915 		default:
916 			(void) printf("bad nfa. opcode %o\n", ap[-1]);
917 			exit(1);
918 			break;
919 		}
920 }
921 #endif
922 #endif /* MACOS or DOS or NEED_BSDREGEX */
923