xref: /onnv-gate/usr/src/lib/libldap4/common/regex.c (revision 3857:21b9b714e4ab)
10Sstevel@tonic-gate /*
20Sstevel@tonic-gate  *
3*3857Sstevel  * Portions Copyright 1998 Sun Microsystems, Inc.  All rights reserved.
4*3857Sstevel  * Use is subject to license terms.
50Sstevel@tonic-gate  *
60Sstevel@tonic-gate  */
70Sstevel@tonic-gate 
80Sstevel@tonic-gate #pragma ident	"%Z%%M%	%I%	%E% SMI"
90Sstevel@tonic-gate 
100Sstevel@tonic-gate #include "portable.h"
110Sstevel@tonic-gate #include "stdio.h"
120Sstevel@tonic-gate #include "stdlib.h"
130Sstevel@tonic-gate 
140Sstevel@tonic-gate #if defined( MACOS ) || defined( DOS ) || defined( _WIN32 ) || defined( NEED_BSDREGEX )
150Sstevel@tonic-gate #include "regex.h"
160Sstevel@tonic-gate 
170Sstevel@tonic-gate /*
180Sstevel@tonic-gate  * regex - Regular expression pattern matching  and replacement
190Sstevel@tonic-gate  *
200Sstevel@tonic-gate  * By:  Ozan S. Yigit (oz)
210Sstevel@tonic-gate  *      Dept. of Computer Science
220Sstevel@tonic-gate  *      York University
230Sstevel@tonic-gate  *
240Sstevel@tonic-gate  * These routines are the PUBLIC DOMAIN equivalents of regex
250Sstevel@tonic-gate  * routines as found in 4.nBSD UN*X, with minor extensions.
260Sstevel@tonic-gate  *
270Sstevel@tonic-gate  * These routines are derived from various implementations found
280Sstevel@tonic-gate  * in software tools books, and Conroy's grep. They are NOT derived
290Sstevel@tonic-gate  * from licensed/restricted software.
300Sstevel@tonic-gate  * For more interesting/academic/complicated implementations,
310Sstevel@tonic-gate  * see Henry Spencer's regexp routines, or GNU Emacs pattern
320Sstevel@tonic-gate  * matching module.
330Sstevel@tonic-gate  *
340Sstevel@tonic-gate  * Modification history:
350Sstevel@tonic-gate  *
360Sstevel@tonic-gate  * $Log: regex.c,v $
370Sstevel@tonic-gate  * Revision 1.12  1996/04/25  16:20:59  mcs
380Sstevel@tonic-gate  * make re_exec() match "" with ".*" and similar patterns
390Sstevel@tonic-gate  * hopefully this change doesn't break anything else!
400Sstevel@tonic-gate  *
410Sstevel@tonic-gate  * Revision 1.11  1994/12/14  21:33:45  mcs
420Sstevel@tonic-gate  * use new NEED_BSDREGEX
430Sstevel@tonic-gate  * fix pmatch() prototype
440Sstevel@tonic-gate  *
450Sstevel@tonic-gate  * Revision 1.10  1994/12/12  18:16:39  mcs
460Sstevel@tonic-gate  * use on NetBSD
470Sstevel@tonic-gate  *
480Sstevel@tonic-gate  * Revision 1.9  1994/11/15  19:16:35  mcs
490Sstevel@tonic-gate  * add (CHAR) cast to make VisualC++ happy
500Sstevel@tonic-gate  *
510Sstevel@tonic-gate  * Revision 1.8  1994/11/08  21:14:32  mcs
520Sstevel@tonic-gate  * WIN32 changes
530Sstevel@tonic-gate  *
540Sstevel@tonic-gate  * Revision 1.7  1994/07/23  19:51:24  mcs
550Sstevel@tonic-gate  * use ANSI-style inline function parameters
560Sstevel@tonic-gate  *
570Sstevel@tonic-gate  * Revision 1.6  1993/10/18  01:52:32  tim
580Sstevel@tonic-gate  * include for VMS
590Sstevel@tonic-gate  *
600Sstevel@tonic-gate  * Revision 1.5  1993/09/28  21:37:54  mcs
610Sstevel@tonic-gate  * HP/UX needs the regex we include (not in its libc)
620Sstevel@tonic-gate  *
630Sstevel@tonic-gate  * Revision 1.4  1993/08/27  15:59:52  mcs
640Sstevel@tonic-gate  * use CHAR for deftab
650Sstevel@tonic-gate  *
660Sstevel@tonic-gate  * Revision 1.3  1993/08/27  15:49:47  mcs
670Sstevel@tonic-gate  * added missing 0 to octal constants
680Sstevel@tonic-gate  * use unsigned char for CHAR under DOS
690Sstevel@tonic-gate  *
700Sstevel@tonic-gate  * Revision 1.2  1993/08/27  14:57:48  mcs
710Sstevel@tonic-gate  * add proto. for pmatch
720Sstevel@tonic-gate  *
730Sstevel@tonic-gate  * Revision 1.1  1993/08/18  21:20:02  mcs
740Sstevel@tonic-gate  * Initial revision
750Sstevel@tonic-gate  *
760Sstevel@tonic-gate  * Revision 1.4  1991/10/17  03:56:42  oz
770Sstevel@tonic-gate  * miscellaneous changes, small cleanups etc.
780Sstevel@tonic-gate  *
790Sstevel@tonic-gate  * Revision 1.3  1989/04/01  14:18:09  oz
800Sstevel@tonic-gate  * Change all references to a dfa: this is actually an nfa.
810Sstevel@tonic-gate  *
820Sstevel@tonic-gate  * Revision 1.2  88/08/28  15:36:04  oz
830Sstevel@tonic-gate  * Use a complement bitmap to represent NCL.
840Sstevel@tonic-gate  * This removes the need to have seperate
850Sstevel@tonic-gate  * code in the pmatch case block - it is
860Sstevel@tonic-gate  * just CCL code now.
870Sstevel@tonic-gate  *
880Sstevel@tonic-gate  * Use the actual CCL code in the CLO
890Sstevel@tonic-gate  * section of pmatch. No need for a recursive
900Sstevel@tonic-gate  * pmatch call.
910Sstevel@tonic-gate  *
920Sstevel@tonic-gate  * Use a bitmap table to set char bits in an
930Sstevel@tonic-gate  * 8-bit chunk.
940Sstevel@tonic-gate  *
950Sstevel@tonic-gate  * Interfaces:
960Sstevel@tonic-gate  *      re_comp:        compile a regular expression into a NFA.
970Sstevel@tonic-gate  *
980Sstevel@tonic-gate  *			char *re_comp(s)
990Sstevel@tonic-gate  *			char *s;
1000Sstevel@tonic-gate  *
1010Sstevel@tonic-gate  *      re_exec:        execute the NFA to match a pattern.
1020Sstevel@tonic-gate  *
1030Sstevel@tonic-gate  *			int re_exec(s)
1040Sstevel@tonic-gate  *			char *s;
1050Sstevel@tonic-gate  *
1060Sstevel@tonic-gate  *	re_modw		change re_exec's understanding of what a "word"
1070Sstevel@tonic-gate  *			looks like (for \< and \>) by adding into the
1080Sstevel@tonic-gate  *			hidden word-syntax table.
1090Sstevel@tonic-gate  *
1100Sstevel@tonic-gate  *			void re_modw(s)
1110Sstevel@tonic-gate  *			char *s;
1120Sstevel@tonic-gate  *
1130Sstevel@tonic-gate  *      re_subs:	substitute the matched portions in a new string.
1140Sstevel@tonic-gate  *
1150Sstevel@tonic-gate  *			int re_subs(src, dst)
1160Sstevel@tonic-gate  *			char *src;
1170Sstevel@tonic-gate  *			char *dst;
1180Sstevel@tonic-gate  *
1190Sstevel@tonic-gate  *	re_fail:	failure routine for re_exec.
1200Sstevel@tonic-gate  *
1210Sstevel@tonic-gate  *			void re_fail(msg, op)
1220Sstevel@tonic-gate  *			char *msg;
1230Sstevel@tonic-gate  *			char op;
1240Sstevel@tonic-gate  *
1250Sstevel@tonic-gate  * Regular Expressions:
1260Sstevel@tonic-gate  *
1270Sstevel@tonic-gate  *      [1]     char    matches itself, unless it is a special
1280Sstevel@tonic-gate  *                      character (metachar): . \ [ ] * + ^ $
1290Sstevel@tonic-gate  *
1300Sstevel@tonic-gate  *      [2]     .       matches any character.
1310Sstevel@tonic-gate  *
1320Sstevel@tonic-gate  *      [3]     \       matches the character following it, except
1330Sstevel@tonic-gate  *			when followed by a left or right round bracket,
1340Sstevel@tonic-gate  *			a digit 1 to 9 or a left or right angle bracket.
1350Sstevel@tonic-gate  *			(see [7], [8] and [9])
1360Sstevel@tonic-gate  *			It is used as an escape character for all
1370Sstevel@tonic-gate  *			other meta-characters, and itself. When used
1380Sstevel@tonic-gate  *			in a set ([4]), it is treated as an ordinary
1390Sstevel@tonic-gate  *			character.
1400Sstevel@tonic-gate  *
1410Sstevel@tonic-gate  *      [4]     [set]   matches one of the characters in the set.
1420Sstevel@tonic-gate  *                      If the first character in the set is "^",
1430Sstevel@tonic-gate  *                      it matches a character NOT in the set, i.e.
1440Sstevel@tonic-gate  *			complements the set. A shorthand S-E is
1450Sstevel@tonic-gate  *			used to specify a set of characters S upto
1460Sstevel@tonic-gate  *			E, inclusive. The special characters "]" and
1470Sstevel@tonic-gate  *			"-" have no special meaning if they appear
1480Sstevel@tonic-gate  *			as the first chars in the set.
1490Sstevel@tonic-gate  *                      examples:        match:
1500Sstevel@tonic-gate  *
1510Sstevel@tonic-gate  *                              [a-z]    any lowercase alpha
1520Sstevel@tonic-gate  *
1530Sstevel@tonic-gate  *                              [^]-]    any char except ] and -
1540Sstevel@tonic-gate  *
1550Sstevel@tonic-gate  *                              [^A-Z]   any char except uppercase
1560Sstevel@tonic-gate  *                                       alpha
1570Sstevel@tonic-gate  *
1580Sstevel@tonic-gate  *                              [a-zA-Z] any alpha
1590Sstevel@tonic-gate  *
1600Sstevel@tonic-gate  *      [5]     *       any regular expression form [1] to [4], followed by
1610Sstevel@tonic-gate  *                      closure char (*) matches zero or more matches of
1620Sstevel@tonic-gate  *                      that form.
1630Sstevel@tonic-gate  *
1640Sstevel@tonic-gate  *      [6]     +       same as [5], except it matches one or more.
1650Sstevel@tonic-gate  *
1660Sstevel@tonic-gate  *      [7]             a regular expression in the form [1] to [10], enclosed
1670Sstevel@tonic-gate  *                      as \(form\) matches what form matches. The enclosure
1680Sstevel@tonic-gate  *                      creates a set of tags, used for [8] and for
1690Sstevel@tonic-gate  *                      pattern substution. The tagged forms are numbered
1700Sstevel@tonic-gate  *			starting from 1.
1710Sstevel@tonic-gate  *
1720Sstevel@tonic-gate  *      [8]             a \ followed by a digit 1 to 9 matches whatever a
1730Sstevel@tonic-gate  *                      previously tagged regular expression ([7]) matched.
1740Sstevel@tonic-gate  *
1750Sstevel@tonic-gate  *	[9]	\<	a regular expression starting with a \< construct
1760Sstevel@tonic-gate  *		\>	and/or ending with a \> construct, restricts the
1770Sstevel@tonic-gate  *			pattern matching to the beginning of a word, and/or
1780Sstevel@tonic-gate  *			the end of a word. A word is defined to be a character
1790Sstevel@tonic-gate  *			string beginning and/or ending with the characters
1800Sstevel@tonic-gate  *			A-Z a-z 0-9 and _. It must also be preceded and/or
1810Sstevel@tonic-gate  *			followed by any character outside those mentioned.
1820Sstevel@tonic-gate  *
1830Sstevel@tonic-gate  *      [10]            a composite regular expression xy where x and y
1840Sstevel@tonic-gate  *                      are in the form [1] to [10] matches the longest
1850Sstevel@tonic-gate  *                      match of x followed by a match for y.
1860Sstevel@tonic-gate  *
1870Sstevel@tonic-gate  *      [11]	^	a regular expression starting with a ^ character
1880Sstevel@tonic-gate  *		$	and/or ending with a $ character, restricts the
1890Sstevel@tonic-gate  *                      pattern matching to the beginning of the line,
1900Sstevel@tonic-gate  *                      or the end of line. [anchors] Elsewhere in the
1910Sstevel@tonic-gate  *			pattern, ^ and $ are treated as ordinary characters.
1920Sstevel@tonic-gate  *
1930Sstevel@tonic-gate  *
1940Sstevel@tonic-gate  * Acknowledgements:
1950Sstevel@tonic-gate  *
1960Sstevel@tonic-gate  *	HCR's Hugh Redelmeier has been most helpful in various
1970Sstevel@tonic-gate  *	stages of development. He convinced me to include BOW
1980Sstevel@tonic-gate  *	and EOW constructs, originally invented by Rob Pike at
1990Sstevel@tonic-gate  *	the University of Toronto.
2000Sstevel@tonic-gate  *
2010Sstevel@tonic-gate  * References:
2020Sstevel@tonic-gate  *              Software tools			Kernighan & Plauger
2030Sstevel@tonic-gate  *              Software tools in Pascal        Kernighan & Plauger
2040Sstevel@tonic-gate  *              Grep [rsx-11 C dist]            David Conroy
2050Sstevel@tonic-gate  *		ed - text editor		Un*x Programmer's Manual
2060Sstevel@tonic-gate  *		Advanced editing on Un*x	B. W. Kernighan
2070Sstevel@tonic-gate  *		RegExp routines			Henry Spencer
2080Sstevel@tonic-gate  *
2090Sstevel@tonic-gate  * Notes:
2100Sstevel@tonic-gate  *
2110Sstevel@tonic-gate  *	This implementation uses a bit-set representation for character
2120Sstevel@tonic-gate  *	classes for speed and compactness. Each character is represented
2130Sstevel@tonic-gate  *	by one bit in a 128-bit block. Thus, CCL always takes a
2140Sstevel@tonic-gate  *	constant 16 bytes in the internal nfa, and re_exec does a single
2150Sstevel@tonic-gate  *	bit comparison to locate the character in the set.
2160Sstevel@tonic-gate  *
2170Sstevel@tonic-gate  * Examples:
2180Sstevel@tonic-gate  *
2190Sstevel@tonic-gate  *	pattern:	foo*.*
2200Sstevel@tonic-gate  *	compile:	CHR f CHR o CLO CHR o END CLO ANY END END
2210Sstevel@tonic-gate  *	matches:	fo foo fooo foobar fobar foxx ...
2220Sstevel@tonic-gate  *
2230Sstevel@tonic-gate  *	pattern:	fo[ob]a[rz]
2240Sstevel@tonic-gate  *	compile:	CHR f CHR o CCL bitset CHR a CCL bitset END
2250Sstevel@tonic-gate  *	matches:	fobar fooar fobaz fooaz
2260Sstevel@tonic-gate  *
2270Sstevel@tonic-gate  *	pattern:	foo\\+
2280Sstevel@tonic-gate  *	compile:	CHR f CHR o CHR o CHR \ CLO CHR \ END END
2290Sstevel@tonic-gate  *	matches:	foo\ foo\\ foo\\\  ...
2300Sstevel@tonic-gate  *
2310Sstevel@tonic-gate  *	pattern:	\(foo\)[1-3]\1	(same as foo[1-3]foo)
2320Sstevel@tonic-gate  *	compile:	BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END
2330Sstevel@tonic-gate  *	matches:	foo1foo foo2foo foo3foo
2340Sstevel@tonic-gate  *
2350Sstevel@tonic-gate  *	pattern:	\(fo.*\)-\1
2360Sstevel@tonic-gate  *	compile:	BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END
2370Sstevel@tonic-gate  *	matches:	foo-foo fo-fo fob-fob foobar-foobar ...
2380Sstevel@tonic-gate  */
2390Sstevel@tonic-gate 
2400Sstevel@tonic-gate #define MAXNFA  1024
2410Sstevel@tonic-gate #define MAXTAG  10
2420Sstevel@tonic-gate 
2430Sstevel@tonic-gate #define OKP     1
2440Sstevel@tonic-gate #define NOP     0
2450Sstevel@tonic-gate 
2460Sstevel@tonic-gate #define CHR     1
2470Sstevel@tonic-gate #define ANY     2
2480Sstevel@tonic-gate #define CCL     3
2490Sstevel@tonic-gate #define BOL     4
2500Sstevel@tonic-gate #define EOL     5
2510Sstevel@tonic-gate #define BOT     6
2520Sstevel@tonic-gate #define EOT     7
2530Sstevel@tonic-gate #define BOW	8
2540Sstevel@tonic-gate #define EOW	9
2550Sstevel@tonic-gate #define REF     10
2560Sstevel@tonic-gate #define CLO     11
2570Sstevel@tonic-gate 
2580Sstevel@tonic-gate #define END     0
2590Sstevel@tonic-gate 
2600Sstevel@tonic-gate /*
2610Sstevel@tonic-gate  * The following defines are not meant to be changeable.
2620Sstevel@tonic-gate  * They are for readability only.
2630Sstevel@tonic-gate  */
2640Sstevel@tonic-gate #define MAXCHR	128
2650Sstevel@tonic-gate #define CHRBIT	8
2660Sstevel@tonic-gate #define BITBLK	MAXCHR/CHRBIT
2670Sstevel@tonic-gate #define BLKIND	0170
2680Sstevel@tonic-gate #define BITIND	07
2690Sstevel@tonic-gate 
2700Sstevel@tonic-gate #define ASCIIB	0177
2710Sstevel@tonic-gate 
2720Sstevel@tonic-gate #if defined( DOS ) || defined( _WIN32 ) || defined(SUN)
2730Sstevel@tonic-gate typedef unsigned char CHAR;
2740Sstevel@tonic-gate #else /* DOS */
2750Sstevel@tonic-gate typedef /*unsigned*/ char CHAR;
2760Sstevel@tonic-gate #endif /* DOS */
2770Sstevel@tonic-gate 
2780Sstevel@tonic-gate static int  tagstk[MAXTAG];             /* subpat tag stack..*/
2790Sstevel@tonic-gate static CHAR nfa[MAXNFA];		/* automaton..       */
2800Sstevel@tonic-gate static int  sta = NOP;               	/* status of lastpat */
2810Sstevel@tonic-gate 
2820Sstevel@tonic-gate static CHAR bittab[BITBLK];		/* bit table for CCL */
2830Sstevel@tonic-gate 					/* pre-set bits...   */
2840Sstevel@tonic-gate static CHAR bitarr[] = {1,2,4,8,16,32,64,128};
2850Sstevel@tonic-gate 
2860Sstevel@tonic-gate #ifdef DEBUG
2870Sstevel@tonic-gate static void nfadump(CHAR *);
2880Sstevel@tonic-gate void symbolic(char *);
2890Sstevel@tonic-gate #endif
2900Sstevel@tonic-gate 
2910Sstevel@tonic-gate static void
chset(CHAR c)2920Sstevel@tonic-gate chset(CHAR c)
2930Sstevel@tonic-gate {
2940Sstevel@tonic-gate 	bittab[((c) & BLKIND) >> 3] |= bitarr[(c) & BITIND];
2950Sstevel@tonic-gate }
2960Sstevel@tonic-gate 
2970Sstevel@tonic-gate #define badpat(x)	(*nfa = END, x)
2980Sstevel@tonic-gate #define store(x)	*mp++ = x
2990Sstevel@tonic-gate 
3000Sstevel@tonic-gate char *
re_comp(char * pat)3010Sstevel@tonic-gate re_comp( char *pat )
3020Sstevel@tonic-gate {
3030Sstevel@tonic-gate 	register char *p;               /* pattern pointer   */
3040Sstevel@tonic-gate 	register CHAR *mp=nfa;          /* nfa pointer       */
3050Sstevel@tonic-gate 	register CHAR *lp;              /* saved pointer..   */
3060Sstevel@tonic-gate 	register CHAR *sp=nfa;          /* another one..     */
3070Sstevel@tonic-gate 
3080Sstevel@tonic-gate 	register int tagi = 0;          /* tag stack index   */
3090Sstevel@tonic-gate 	register int tagc = 1;          /* actual tag count  */
3100Sstevel@tonic-gate 
3110Sstevel@tonic-gate 	register int n;
3120Sstevel@tonic-gate 	register CHAR mask;		/* xor mask -CCL/NCL */
3130Sstevel@tonic-gate 	int c1, c2;
3140Sstevel@tonic-gate 
3150Sstevel@tonic-gate 	if (!pat || !*pat)
3160Sstevel@tonic-gate 		if (sta)
3170Sstevel@tonic-gate 			return 0;
3180Sstevel@tonic-gate 		else
3190Sstevel@tonic-gate 			return badpat("No previous regular expression");
3200Sstevel@tonic-gate 	sta = NOP;
3210Sstevel@tonic-gate 
3220Sstevel@tonic-gate 	for (p = pat; *p; p++) {
3230Sstevel@tonic-gate 		lp = mp;
3240Sstevel@tonic-gate 		switch(*p) {
3250Sstevel@tonic-gate 
3260Sstevel@tonic-gate 		case '.':               /* match any char..  */
3270Sstevel@tonic-gate 			store(ANY);
3280Sstevel@tonic-gate 			break;
3290Sstevel@tonic-gate 
3300Sstevel@tonic-gate 		case '^':               /* match beginning.. */
3310Sstevel@tonic-gate 			if (p == pat)
3320Sstevel@tonic-gate 				store(BOL);
3330Sstevel@tonic-gate 			else {
3340Sstevel@tonic-gate 				store(CHR);
3350Sstevel@tonic-gate 				store(*p);
3360Sstevel@tonic-gate 			}
3370Sstevel@tonic-gate 			break;
3380Sstevel@tonic-gate 
3390Sstevel@tonic-gate 		case '$':               /* match endofline.. */
3400Sstevel@tonic-gate 			if (!*(p+1))
3410Sstevel@tonic-gate 				store(EOL);
3420Sstevel@tonic-gate 			else {
3430Sstevel@tonic-gate 				store(CHR);
3440Sstevel@tonic-gate 				store(*p);
3450Sstevel@tonic-gate 			}
3460Sstevel@tonic-gate 			break;
3470Sstevel@tonic-gate 
3480Sstevel@tonic-gate 		case '[':               /* match char class..*/
3490Sstevel@tonic-gate 			store(CCL);
3500Sstevel@tonic-gate 
3510Sstevel@tonic-gate 			if (*++p == '^') {
3520Sstevel@tonic-gate 				mask = 0377;
3530Sstevel@tonic-gate 				p++;
3540Sstevel@tonic-gate 			}
3550Sstevel@tonic-gate 			else
3560Sstevel@tonic-gate 				mask = 0;
3570Sstevel@tonic-gate 
3580Sstevel@tonic-gate 			if (*p == '-')		/* real dash */
3590Sstevel@tonic-gate 				chset(*p++);
3600Sstevel@tonic-gate 			if (*p == ']')		/* real brac */
3610Sstevel@tonic-gate 				chset(*p++);
3620Sstevel@tonic-gate 			while (*p && *p != ']') {
3630Sstevel@tonic-gate 				if (*p == '-' && *(p+1) && *(p+1) != ']') {
3640Sstevel@tonic-gate 					p++;
3650Sstevel@tonic-gate 					c1 = *(p-2) + 1;
3660Sstevel@tonic-gate 					c2 = *p++;
3670Sstevel@tonic-gate 					while (c1 <= c2)
3680Sstevel@tonic-gate 						chset((CHAR)c1++);
3690Sstevel@tonic-gate 				}
3700Sstevel@tonic-gate #ifdef EXTEND
3710Sstevel@tonic-gate 				else if (*p == '\\' && *(p+1)) {
3720Sstevel@tonic-gate 					p++;
3730Sstevel@tonic-gate 					chset(*p++);
3740Sstevel@tonic-gate 				}
3750Sstevel@tonic-gate #endif
3760Sstevel@tonic-gate 				else
3770Sstevel@tonic-gate 					chset(*p++);
3780Sstevel@tonic-gate 			}
3790Sstevel@tonic-gate 			if (!*p)
3800Sstevel@tonic-gate 				return badpat("Missing ]");
3810Sstevel@tonic-gate 
3820Sstevel@tonic-gate 			for (n = 0; n < BITBLK; bittab[n++] = (char) 0)
3830Sstevel@tonic-gate 				store(mask ^ bittab[n]);
3840Sstevel@tonic-gate 
3850Sstevel@tonic-gate 			break;
3860Sstevel@tonic-gate 
3870Sstevel@tonic-gate 		case '*':               /* match 0 or more.. */
3880Sstevel@tonic-gate 		case '+':               /* match 1 or more.. */
3890Sstevel@tonic-gate 			if (p == pat)
3900Sstevel@tonic-gate 				return badpat("Empty closure");
3910Sstevel@tonic-gate 			lp = sp;		/* previous opcode */
3920Sstevel@tonic-gate 			if (*lp == CLO)		/* equivalence..   */
3930Sstevel@tonic-gate 				break;
3940Sstevel@tonic-gate 			switch(*lp) {
3950Sstevel@tonic-gate 
3960Sstevel@tonic-gate 			case BOL:
3970Sstevel@tonic-gate 			case BOT:
3980Sstevel@tonic-gate 			case EOT:
3990Sstevel@tonic-gate 			case BOW:
4000Sstevel@tonic-gate 			case EOW:
4010Sstevel@tonic-gate 			case REF:
4020Sstevel@tonic-gate 				return badpat("Illegal closure");
4030Sstevel@tonic-gate 			default:
4040Sstevel@tonic-gate 				break;
4050Sstevel@tonic-gate 			}
4060Sstevel@tonic-gate 
4070Sstevel@tonic-gate 			if (*p == '+')
4080Sstevel@tonic-gate 				for (sp = mp; lp < sp; lp++)
4090Sstevel@tonic-gate 					store(*lp);
4100Sstevel@tonic-gate 
4110Sstevel@tonic-gate 			store(END);
4120Sstevel@tonic-gate 			store(END);
4130Sstevel@tonic-gate 			sp = mp;
4140Sstevel@tonic-gate 			while (--mp > lp)
4150Sstevel@tonic-gate 				*mp = mp[-1];
4160Sstevel@tonic-gate 			store(CLO);
4170Sstevel@tonic-gate 			mp = sp;
4180Sstevel@tonic-gate 			break;
4190Sstevel@tonic-gate 
4200Sstevel@tonic-gate 		case '\\':              /* tags, backrefs .. */
4210Sstevel@tonic-gate 			switch(*++p) {
4220Sstevel@tonic-gate 
4230Sstevel@tonic-gate 			case '(':
4240Sstevel@tonic-gate 				if (tagc < MAXTAG) {
4250Sstevel@tonic-gate 					tagstk[++tagi] = tagc;
4260Sstevel@tonic-gate 					store(BOT);
4270Sstevel@tonic-gate 					store(tagc++);
4280Sstevel@tonic-gate 				}
4290Sstevel@tonic-gate 				else
4300Sstevel@tonic-gate 					return badpat("Too many \\(\\) pairs");
4310Sstevel@tonic-gate 				break;
4320Sstevel@tonic-gate 			case ')':
4330Sstevel@tonic-gate 				if (*sp == BOT)
4340Sstevel@tonic-gate 					return badpat("Null pattern inside \\(\\)");
4350Sstevel@tonic-gate 				if (tagi > 0) {
4360Sstevel@tonic-gate 					store(EOT);
4370Sstevel@tonic-gate 					store(tagstk[tagi--]);
4380Sstevel@tonic-gate 				}
4390Sstevel@tonic-gate 				else
4400Sstevel@tonic-gate 					return badpat("Unmatched \\)");
4410Sstevel@tonic-gate 				break;
4420Sstevel@tonic-gate 			case '<':
4430Sstevel@tonic-gate 				store(BOW);
4440Sstevel@tonic-gate 				break;
4450Sstevel@tonic-gate 			case '>':
4460Sstevel@tonic-gate 				if (*sp == BOW)
4470Sstevel@tonic-gate 					return badpat("Null pattern inside \\<\\>");
4480Sstevel@tonic-gate 				store(EOW);
4490Sstevel@tonic-gate 				break;
4500Sstevel@tonic-gate 			case '1':
4510Sstevel@tonic-gate 			case '2':
4520Sstevel@tonic-gate 			case '3':
4530Sstevel@tonic-gate 			case '4':
4540Sstevel@tonic-gate 			case '5':
4550Sstevel@tonic-gate 			case '6':
4560Sstevel@tonic-gate 			case '7':
4570Sstevel@tonic-gate 			case '8':
4580Sstevel@tonic-gate 			case '9':
4590Sstevel@tonic-gate 				n = *p-'0';
4600Sstevel@tonic-gate 				if (tagi > 0 && tagstk[tagi] == n)
4610Sstevel@tonic-gate 					return badpat("Cyclical reference");
4620Sstevel@tonic-gate 				if (tagc > n) {
4630Sstevel@tonic-gate 					store(REF);
4640Sstevel@tonic-gate 					store(n);
4650Sstevel@tonic-gate 				}
4660Sstevel@tonic-gate 				else
4670Sstevel@tonic-gate 					return badpat("Undetermined reference");
4680Sstevel@tonic-gate 				break;
4690Sstevel@tonic-gate #ifdef EXTEND
4700Sstevel@tonic-gate 			case 'b':
4710Sstevel@tonic-gate 				store(CHR);
4720Sstevel@tonic-gate 				store('\b');
4730Sstevel@tonic-gate 				break;
4740Sstevel@tonic-gate 			case 'n':
4750Sstevel@tonic-gate 				store(CHR);
4760Sstevel@tonic-gate 				store('\n');
4770Sstevel@tonic-gate 				break;
4780Sstevel@tonic-gate 			case 'f':
4790Sstevel@tonic-gate 				store(CHR);
4800Sstevel@tonic-gate 				store('\f');
4810Sstevel@tonic-gate 				break;
4820Sstevel@tonic-gate 			case 'r':
4830Sstevel@tonic-gate 				store(CHR);
4840Sstevel@tonic-gate 				store('\r');
4850Sstevel@tonic-gate 				break;
4860Sstevel@tonic-gate 			case 't':
4870Sstevel@tonic-gate 				store(CHR);
4880Sstevel@tonic-gate 				store('\t');
4890Sstevel@tonic-gate 				break;
4900Sstevel@tonic-gate #endif
4910Sstevel@tonic-gate 			default:
4920Sstevel@tonic-gate 				store(CHR);
4930Sstevel@tonic-gate 				store(*p);
4940Sstevel@tonic-gate 			}
4950Sstevel@tonic-gate 			break;
4960Sstevel@tonic-gate 
4970Sstevel@tonic-gate 		default :               /* an ordinary char  */
4980Sstevel@tonic-gate 			store(CHR);
4990Sstevel@tonic-gate 			store(*p);
5000Sstevel@tonic-gate 			break;
5010Sstevel@tonic-gate 		}
5020Sstevel@tonic-gate 		sp = lp;
5030Sstevel@tonic-gate 	}
5040Sstevel@tonic-gate 	if (tagi > 0)
5050Sstevel@tonic-gate 		return badpat("Unmatched \\(");
5060Sstevel@tonic-gate 	store(END);
5070Sstevel@tonic-gate 	sta = OKP;
5080Sstevel@tonic-gate 	return 0;
5090Sstevel@tonic-gate }
5100Sstevel@tonic-gate 
5110Sstevel@tonic-gate 
5120Sstevel@tonic-gate static char *bol;
5130Sstevel@tonic-gate char *bopat[MAXTAG];
5140Sstevel@tonic-gate char *eopat[MAXTAG];
5150Sstevel@tonic-gate #ifdef NEEDPROTOS
5160Sstevel@tonic-gate static char *pmatch( char *lp, CHAR *ap );
5170Sstevel@tonic-gate #else /* NEEDPROTOS */
5180Sstevel@tonic-gate static char *pmatch();
5190Sstevel@tonic-gate #endif /* NEEDPROTOS */
5200Sstevel@tonic-gate 
5210Sstevel@tonic-gate /*
5220Sstevel@tonic-gate  * re_exec:
5230Sstevel@tonic-gate  * 	execute nfa to find a match.
5240Sstevel@tonic-gate  *
5250Sstevel@tonic-gate  *	special cases: (nfa[0])
5260Sstevel@tonic-gate  *		BOL
5270Sstevel@tonic-gate  *			Match only once, starting from the
5280Sstevel@tonic-gate  *			beginning.
5290Sstevel@tonic-gate  *		CHR
5300Sstevel@tonic-gate  *			First locate the character without
5310Sstevel@tonic-gate  *			calling pmatch, and if found, call
5320Sstevel@tonic-gate  *			pmatch for the remaining string.
5330Sstevel@tonic-gate  *		END
5340Sstevel@tonic-gate  *			re_comp failed, poor luser did not
5350Sstevel@tonic-gate  *			check for it. Fail fast.
5360Sstevel@tonic-gate  *
5370Sstevel@tonic-gate  *	If a match is found, bopat[0] and eopat[0] are set
5380Sstevel@tonic-gate  *	to the beginning and the end of the matched fragment,
5390Sstevel@tonic-gate  *	respectively.
5400Sstevel@tonic-gate  *
5410Sstevel@tonic-gate  */
5420Sstevel@tonic-gate 
5430Sstevel@tonic-gate int
re_exec(char * lp)5440Sstevel@tonic-gate re_exec( char *lp )
5450Sstevel@tonic-gate {
5460Sstevel@tonic-gate 	register char c;
5470Sstevel@tonic-gate 	register char *ep = 0;
5480Sstevel@tonic-gate 	register CHAR *ap = nfa;
5490Sstevel@tonic-gate 
5500Sstevel@tonic-gate 	bol = lp;
5510Sstevel@tonic-gate 
5520Sstevel@tonic-gate 	bopat[0] = 0;
5530Sstevel@tonic-gate 	bopat[1] = 0;
5540Sstevel@tonic-gate 	bopat[2] = 0;
5550Sstevel@tonic-gate 	bopat[3] = 0;
5560Sstevel@tonic-gate 	bopat[4] = 0;
5570Sstevel@tonic-gate 	bopat[5] = 0;
5580Sstevel@tonic-gate 	bopat[6] = 0;
5590Sstevel@tonic-gate 	bopat[7] = 0;
5600Sstevel@tonic-gate 	bopat[8] = 0;
5610Sstevel@tonic-gate 	bopat[9] = 0;
5620Sstevel@tonic-gate 
5630Sstevel@tonic-gate 	switch(*ap) {
5640Sstevel@tonic-gate 
5650Sstevel@tonic-gate 	case BOL:			/* anchored: match from BOL only */
5660Sstevel@tonic-gate 		ep = pmatch(lp,ap);
5670Sstevel@tonic-gate 		break;
5680Sstevel@tonic-gate 	case CHR:			/* ordinary char: locate it fast */
5690Sstevel@tonic-gate 		c = *(ap+1);
5700Sstevel@tonic-gate 		while (*lp && *lp != c)
5710Sstevel@tonic-gate 			lp++;
5720Sstevel@tonic-gate 		if (!*lp)		/* if EOS, fail, else fall thru. */
5730Sstevel@tonic-gate 			return 0;
5740Sstevel@tonic-gate 	default:			/* regular matching all the way. */
5750Sstevel@tonic-gate 		do {
5760Sstevel@tonic-gate 			if ((ep = pmatch(lp,ap)))
5770Sstevel@tonic-gate 				break;
5780Sstevel@tonic-gate 			lp++;
5790Sstevel@tonic-gate 		} while (*lp);
5800Sstevel@tonic-gate 
5810Sstevel@tonic-gate 		break;
5820Sstevel@tonic-gate 	case END:			/* munged automaton. fail always */
5830Sstevel@tonic-gate 		return 0;
5840Sstevel@tonic-gate 	}
5850Sstevel@tonic-gate 	if (!ep)
5860Sstevel@tonic-gate 		return 0;
5870Sstevel@tonic-gate 
5880Sstevel@tonic-gate 	bopat[0] = lp;
5890Sstevel@tonic-gate 	eopat[0] = ep;
5900Sstevel@tonic-gate 	return 1;
5910Sstevel@tonic-gate }
5920Sstevel@tonic-gate 
5930Sstevel@tonic-gate /*
5940Sstevel@tonic-gate  * pmatch: internal routine for the hard part
5950Sstevel@tonic-gate  *
5960Sstevel@tonic-gate  * 	This code is partly snarfed from an early grep written by
5970Sstevel@tonic-gate  *	David Conroy. The backref and tag stuff, and various other
5980Sstevel@tonic-gate  *	innovations are by oz.
5990Sstevel@tonic-gate  *
6000Sstevel@tonic-gate  *	special case optimizations: (nfa[n], nfa[n+1])
6010Sstevel@tonic-gate  *		CLO ANY
6020Sstevel@tonic-gate  *			We KNOW .* will match everything upto the
6030Sstevel@tonic-gate  *			end of line. Thus, directly go to the end of
6040Sstevel@tonic-gate  *			line, without recursive pmatch calls. As in
6050Sstevel@tonic-gate  *			the other closure cases, the remaining pattern
6060Sstevel@tonic-gate  *			must be matched by moving backwards on the
6070Sstevel@tonic-gate  *			string recursively, to find a match for xy
6080Sstevel@tonic-gate  *			(x is ".*" and y is the remaining pattern)
6090Sstevel@tonic-gate  *			where the match satisfies the LONGEST match for
6100Sstevel@tonic-gate  *			x followed by a match for y.
6110Sstevel@tonic-gate  *		CLO CHR
6120Sstevel@tonic-gate  *			We can again scan the string forward for the
6130Sstevel@tonic-gate  *			single char and at the point of failure, we
6140Sstevel@tonic-gate  *			execute the remaining nfa recursively, same as
6150Sstevel@tonic-gate  *			above.
6160Sstevel@tonic-gate  *
6170Sstevel@tonic-gate  *	At the end of a successful match, bopat[n] and eopat[n]
6180Sstevel@tonic-gate  *	are set to the beginning and end of subpatterns matched
6190Sstevel@tonic-gate  *	by tagged expressions (n = 1 to 9).
6200Sstevel@tonic-gate  *
6210Sstevel@tonic-gate  */
6220Sstevel@tonic-gate 
6230Sstevel@tonic-gate #ifndef re_fail
6240Sstevel@tonic-gate extern void re_fail();
6250Sstevel@tonic-gate #endif /* re_fail */
6260Sstevel@tonic-gate 
6270Sstevel@tonic-gate /*
6280Sstevel@tonic-gate  * character classification table for word boundary operators BOW
6290Sstevel@tonic-gate  * and EOW. the reason for not using ctype macros is that we can
6300Sstevel@tonic-gate  * let the user add into our own table. see re_modw. This table
6310Sstevel@tonic-gate  * is not in the bitset form, since we may wish to extend it in the
6320Sstevel@tonic-gate  * future for other character classifications.
6330Sstevel@tonic-gate  *
6340Sstevel@tonic-gate  *	TRUE for 0-9 A-Z a-z _
6350Sstevel@tonic-gate  */
6360Sstevel@tonic-gate static char chrtyp[MAXCHR] = {
6370Sstevel@tonic-gate 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6380Sstevel@tonic-gate 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6390Sstevel@tonic-gate 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6400Sstevel@tonic-gate 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6410Sstevel@tonic-gate 	0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
6420Sstevel@tonic-gate 	1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
6430Sstevel@tonic-gate 	0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
6440Sstevel@tonic-gate 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6450Sstevel@tonic-gate 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6460Sstevel@tonic-gate 	1, 0, 0, 0, 0, 1, 0, 1, 1, 1,
6470Sstevel@tonic-gate 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6480Sstevel@tonic-gate 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6490Sstevel@tonic-gate 	1, 1, 1, 0, 0, 0, 0, 0
6500Sstevel@tonic-gate 	};
6510Sstevel@tonic-gate 
6520Sstevel@tonic-gate #define inascii(x)	(0177&(x))
6530Sstevel@tonic-gate #define iswordc(x) 	chrtyp[inascii(x)]
6540Sstevel@tonic-gate #define isinset(x,y) 	((x)[((y)&BLKIND)>>3] & bitarr[(y)&BITIND])
6550Sstevel@tonic-gate 
6560Sstevel@tonic-gate /*
6570Sstevel@tonic-gate  * skip values for CLO XXX to skip past the closure
6580Sstevel@tonic-gate  */
6590Sstevel@tonic-gate 
6600Sstevel@tonic-gate #define ANYSKIP	2 	/* [CLO] ANY END ...	     */
6610Sstevel@tonic-gate #define CHRSKIP	3	/* [CLO] CHR chr END ...     */
6620Sstevel@tonic-gate #define CCLSKIP 18	/* [CLO] CCL 16bytes END ... */
6630Sstevel@tonic-gate 
6640Sstevel@tonic-gate static char *
pmatch(char * lp,CHAR * ap)6650Sstevel@tonic-gate pmatch( char *lp, CHAR *ap)
6660Sstevel@tonic-gate {
6670Sstevel@tonic-gate 	register int op, c, n;
6680Sstevel@tonic-gate 	register char *e;		/* extra pointer for CLO */
6690Sstevel@tonic-gate 	register char *bp;		/* beginning of subpat.. */
6700Sstevel@tonic-gate 	register char *ep;		/* ending of subpat..	 */
6710Sstevel@tonic-gate 	char *are;			/* to save the line ptr. */
6720Sstevel@tonic-gate 
6730Sstevel@tonic-gate 	while ((op = *ap++) != END)
6740Sstevel@tonic-gate 		switch(op) {
6750Sstevel@tonic-gate 
6760Sstevel@tonic-gate 		case CHR:
6770Sstevel@tonic-gate 			if (*lp++ != *ap++)
6780Sstevel@tonic-gate 				return 0;
6790Sstevel@tonic-gate 			break;
6800Sstevel@tonic-gate 		case ANY:
6810Sstevel@tonic-gate 			if (!*lp++)
6820Sstevel@tonic-gate 				return 0;
6830Sstevel@tonic-gate 			break;
6840Sstevel@tonic-gate 		case CCL:
6850Sstevel@tonic-gate 			c = *lp++;
6860Sstevel@tonic-gate 			if (!isinset(ap,c))
6870Sstevel@tonic-gate 				return 0;
6880Sstevel@tonic-gate 			ap += BITBLK;
6890Sstevel@tonic-gate 			break;
6900Sstevel@tonic-gate 		case BOL:
6910Sstevel@tonic-gate 			if (lp != bol)
6920Sstevel@tonic-gate 				return 0;
6930Sstevel@tonic-gate 			break;
6940Sstevel@tonic-gate 		case EOL:
6950Sstevel@tonic-gate 			if (*lp)
6960Sstevel@tonic-gate 				return 0;
6970Sstevel@tonic-gate 			break;
6980Sstevel@tonic-gate 		case BOT:
6990Sstevel@tonic-gate 			bopat[*ap++] = lp;
7000Sstevel@tonic-gate 			break;
7010Sstevel@tonic-gate 		case EOT:
7020Sstevel@tonic-gate 			eopat[*ap++] = lp;
7030Sstevel@tonic-gate 			break;
7040Sstevel@tonic-gate  		case BOW:
7050Sstevel@tonic-gate 			if (lp!=bol && iswordc(lp[-1]) || !iswordc(*lp))
7060Sstevel@tonic-gate 				return 0;
7070Sstevel@tonic-gate 			break;
7080Sstevel@tonic-gate 		case EOW:
7090Sstevel@tonic-gate 			if (lp==bol || !iswordc(lp[-1]) || iswordc(*lp))
7100Sstevel@tonic-gate 				return 0;
7110Sstevel@tonic-gate 			break;
7120Sstevel@tonic-gate 		case REF:
7130Sstevel@tonic-gate 			n = *ap++;
7140Sstevel@tonic-gate 			bp = bopat[n];
7150Sstevel@tonic-gate 			ep = eopat[n];
7160Sstevel@tonic-gate 			while (bp < ep)
7170Sstevel@tonic-gate 				if (*bp++ != *lp++)
7180Sstevel@tonic-gate 					return 0;
7190Sstevel@tonic-gate 			break;
7200Sstevel@tonic-gate 		case CLO:
7210Sstevel@tonic-gate 			are = lp;
7220Sstevel@tonic-gate 			switch(*ap) {
7230Sstevel@tonic-gate 
7240Sstevel@tonic-gate 			case ANY:
7250Sstevel@tonic-gate 				while (*lp)
7260Sstevel@tonic-gate 					lp++;
7270Sstevel@tonic-gate 				n = ANYSKIP;
7280Sstevel@tonic-gate 				break;
7290Sstevel@tonic-gate 			case CHR:
7300Sstevel@tonic-gate 				c = *(ap+1);
7310Sstevel@tonic-gate 				while (*lp && c == *lp)
7320Sstevel@tonic-gate 					lp++;
7330Sstevel@tonic-gate 				n = CHRSKIP;
7340Sstevel@tonic-gate 				break;
7350Sstevel@tonic-gate 			case CCL:
7360Sstevel@tonic-gate 				while ((c = *lp) && isinset(ap+1,c))
7370Sstevel@tonic-gate 					lp++;
7380Sstevel@tonic-gate 				n = CCLSKIP;
7390Sstevel@tonic-gate 				break;
7400Sstevel@tonic-gate 			default:
7410Sstevel@tonic-gate 				re_fail("closure: bad nfa.", *ap);
7420Sstevel@tonic-gate 				return 0;
7430Sstevel@tonic-gate 			}
7440Sstevel@tonic-gate 
7450Sstevel@tonic-gate 			ap += n;
7460Sstevel@tonic-gate 
7470Sstevel@tonic-gate 			while (lp >= are) {
7480Sstevel@tonic-gate 				if (e = pmatch(lp, ap))
7490Sstevel@tonic-gate 					return e;
7500Sstevel@tonic-gate 				--lp;
7510Sstevel@tonic-gate 			}
7520Sstevel@tonic-gate 			return 0;
7530Sstevel@tonic-gate 		default:
7540Sstevel@tonic-gate 			re_fail("re_exec: bad nfa.", op);
7550Sstevel@tonic-gate 			return 0;
7560Sstevel@tonic-gate 		}
7570Sstevel@tonic-gate 	return lp;
7580Sstevel@tonic-gate }
7590Sstevel@tonic-gate 
7600Sstevel@tonic-gate /*
7610Sstevel@tonic-gate  * re_modw:
7620Sstevel@tonic-gate  *	add new characters into the word table to change re_exec's
7630Sstevel@tonic-gate  *	understanding of what a word should look like. Note that we
7640Sstevel@tonic-gate  *	only accept additions into the word definition.
7650Sstevel@tonic-gate  *
7660Sstevel@tonic-gate  *	If the string parameter is 0 or null string, the table is
7670Sstevel@tonic-gate  *	reset back to the default containing A-Z a-z 0-9 _. [We use
7680Sstevel@tonic-gate  *	the compact bitset representation for the default table]
7690Sstevel@tonic-gate  */
7700Sstevel@tonic-gate 
7710Sstevel@tonic-gate static CHAR deftab[16] = {
7720Sstevel@tonic-gate 	0, 0, 0, 0, 0, 0, 0377, 003, 0376, 0377, 0377, 0207,
7730Sstevel@tonic-gate 	0376, 0377, 0377, 007
7740Sstevel@tonic-gate };
7750Sstevel@tonic-gate 
7760Sstevel@tonic-gate void
re_modw(char * s)7770Sstevel@tonic-gate re_modw( char *s )
7780Sstevel@tonic-gate {
7790Sstevel@tonic-gate 	register int i;
7800Sstevel@tonic-gate 
7810Sstevel@tonic-gate 	if (!s || !*s) {
7820Sstevel@tonic-gate 		for (i = 0; i < MAXCHR; i++)
7830Sstevel@tonic-gate 			if (!isinset(deftab,i))
7840Sstevel@tonic-gate 				iswordc(i) = 0;
7850Sstevel@tonic-gate 	}
7860Sstevel@tonic-gate 	else
7870Sstevel@tonic-gate 		while(*s)
7880Sstevel@tonic-gate 			iswordc(*s++) = 1;
7890Sstevel@tonic-gate }
7900Sstevel@tonic-gate 
7910Sstevel@tonic-gate /*
7920Sstevel@tonic-gate  * re_subs:
7930Sstevel@tonic-gate  *	substitute the matched portions of the src in dst.
7940Sstevel@tonic-gate  *
7950Sstevel@tonic-gate  *	&	substitute the entire matched pattern.
7960Sstevel@tonic-gate  *
7970Sstevel@tonic-gate  *	\digit	substitute a subpattern, with the given	tag number.
7980Sstevel@tonic-gate  *		Tags are numbered from 1 to 9. If the particular
7990Sstevel@tonic-gate  *		tagged subpattern does not exist, null is substituted.
8000Sstevel@tonic-gate  */
8010Sstevel@tonic-gate int
re_subs(char * src,char * dst)8020Sstevel@tonic-gate re_subs( char *src, char *dst)
8030Sstevel@tonic-gate {
8040Sstevel@tonic-gate 	register char c;
8050Sstevel@tonic-gate 	register int  pin;
8060Sstevel@tonic-gate 	register char *bp;
8070Sstevel@tonic-gate 	register char *ep;
8080Sstevel@tonic-gate 
8090Sstevel@tonic-gate 	if (!*src || !bopat[0])
8100Sstevel@tonic-gate 		return 0;
8110Sstevel@tonic-gate 
8120Sstevel@tonic-gate 	while (c = *src++) {
8130Sstevel@tonic-gate 		switch(c) {
8140Sstevel@tonic-gate 
8150Sstevel@tonic-gate 		case '&':
8160Sstevel@tonic-gate 			pin = 0;
8170Sstevel@tonic-gate 			break;
8180Sstevel@tonic-gate 
8190Sstevel@tonic-gate 		case '\\':
8200Sstevel@tonic-gate 			c = *src++;
8210Sstevel@tonic-gate 			if (c >= '0' && c <= '9') {
8220Sstevel@tonic-gate 				pin = c - '0';
8230Sstevel@tonic-gate 				break;
8240Sstevel@tonic-gate 			}
8250Sstevel@tonic-gate 
8260Sstevel@tonic-gate 		default:
8270Sstevel@tonic-gate 			*dst++ = c;
8280Sstevel@tonic-gate 			continue;
8290Sstevel@tonic-gate 		}
8300Sstevel@tonic-gate 
8310Sstevel@tonic-gate 		if ((bp = bopat[pin]) && (ep = eopat[pin])) {
8320Sstevel@tonic-gate 			while (*bp && bp < ep)
8330Sstevel@tonic-gate 				*dst++ = *bp++;
8340Sstevel@tonic-gate 			if (bp < ep)
8350Sstevel@tonic-gate 				return 0;
8360Sstevel@tonic-gate 		}
8370Sstevel@tonic-gate 	}
8380Sstevel@tonic-gate 	*dst = (char) 0;
8390Sstevel@tonic-gate 	return 1;
8400Sstevel@tonic-gate }
8410Sstevel@tonic-gate 
8420Sstevel@tonic-gate #ifdef DEBUG
8430Sstevel@tonic-gate /*
8440Sstevel@tonic-gate  * symbolic - produce a symbolic dump of the nfa
8450Sstevel@tonic-gate  */
8460Sstevel@tonic-gate void
symbolic(char * s)8470Sstevel@tonic-gate symbolic( char *s )
8480Sstevel@tonic-gate {
8490Sstevel@tonic-gate 	(void) printf("pattern: %s\n", s);
8500Sstevel@tonic-gate 	(void) printf("nfacode:\n");
8510Sstevel@tonic-gate 	nfadump(nfa);
8520Sstevel@tonic-gate }
8530Sstevel@tonic-gate 
8540Sstevel@tonic-gate static void
nfadump(CHAR * ap)8550Sstevel@tonic-gate nfadump( CHAR *ap)
8560Sstevel@tonic-gate {
8570Sstevel@tonic-gate 	register int n;
8580Sstevel@tonic-gate 
8590Sstevel@tonic-gate 	while (*ap != END)
8600Sstevel@tonic-gate 		switch(*ap++) {
8610Sstevel@tonic-gate 		case CLO:
8620Sstevel@tonic-gate 			(void) printf("CLOSURE");
8630Sstevel@tonic-gate 			nfadump(ap);
8640Sstevel@tonic-gate 			switch(*ap) {
8650Sstevel@tonic-gate 			case CHR:
8660Sstevel@tonic-gate 				n = CHRSKIP;
8670Sstevel@tonic-gate 				break;
8680Sstevel@tonic-gate 			case ANY:
8690Sstevel@tonic-gate 				n = ANYSKIP;
8700Sstevel@tonic-gate 				break;
8710Sstevel@tonic-gate 			case CCL:
8720Sstevel@tonic-gate 				n = CCLSKIP;
8730Sstevel@tonic-gate 				break;
8740Sstevel@tonic-gate 			}
8750Sstevel@tonic-gate 			ap += n;
8760Sstevel@tonic-gate 			break;
8770Sstevel@tonic-gate 		case CHR:
8780Sstevel@tonic-gate 			(void) printf("\tCHR %c\n",*ap++);
8790Sstevel@tonic-gate 			break;
8800Sstevel@tonic-gate 		case ANY:
8810Sstevel@tonic-gate 			(void) printf("\tANY .\n");
8820Sstevel@tonic-gate 			break;
8830Sstevel@tonic-gate 		case BOL:
8840Sstevel@tonic-gate 			(void) printf("\tBOL -\n");
8850Sstevel@tonic-gate 			break;
8860Sstevel@tonic-gate 		case EOL:
8870Sstevel@tonic-gate 			(void) printf("\tEOL -\n");
8880Sstevel@tonic-gate 			break;
8890Sstevel@tonic-gate 		case BOT:
8900Sstevel@tonic-gate 			(void) printf("BOT: %d\n",*ap++);
8910Sstevel@tonic-gate 			break;
8920Sstevel@tonic-gate 		case EOT:
8930Sstevel@tonic-gate 			(void) printf("EOT: %d\n",*ap++);
8940Sstevel@tonic-gate 			break;
8950Sstevel@tonic-gate 		case BOW:
8960Sstevel@tonic-gate 			(void) printf("BOW\n");
8970Sstevel@tonic-gate 			break;
8980Sstevel@tonic-gate 		case EOW:
8990Sstevel@tonic-gate 			(void) printf("EOW\n");
9000Sstevel@tonic-gate 			break;
9010Sstevel@tonic-gate 		case REF:
9020Sstevel@tonic-gate 			(void) printf("REF: %d\n",*ap++);
9030Sstevel@tonic-gate 			break;
9040Sstevel@tonic-gate 		case CCL:
9050Sstevel@tonic-gate 			(void) printf("\tCCL [");
9060Sstevel@tonic-gate 			for (n = 0; n < MAXCHR; n++)
9070Sstevel@tonic-gate 				if (isinset(ap,(CHAR)n)) {
9080Sstevel@tonic-gate 					if (n < ' ')
9090Sstevel@tonic-gate 						(void) printf("^%c", n ^ 0x040);
9100Sstevel@tonic-gate 					else
9110Sstevel@tonic-gate 						(void) printf("%c", n);
9120Sstevel@tonic-gate 				}
9130Sstevel@tonic-gate 			(void) printf("]\n");
9140Sstevel@tonic-gate 			ap += BITBLK;
9150Sstevel@tonic-gate 			break;
9160Sstevel@tonic-gate 		default:
9170Sstevel@tonic-gate 			(void) printf("bad nfa. opcode %o\n", ap[-1]);
9180Sstevel@tonic-gate 			exit(1);
9190Sstevel@tonic-gate 			break;
9200Sstevel@tonic-gate 		}
9210Sstevel@tonic-gate }
9220Sstevel@tonic-gate #endif
9230Sstevel@tonic-gate #endif /* MACOS or DOS or NEED_BSDREGEX */
924