xref: /csrg-svn/usr.bin/grep/egrep/egrep.c (revision 32362)
132332Sbostic #ifndef lint
2*32362Sbostic static char sccsid[] = "@(#)egrep.c	5.5 (Berkeley) 10/07/87";
332332Sbostic #endif not lint
432332Sbostic 
532332Sbostic /*
632332Sbostic      Hybrid Boyer/Moore/Gosper-assisted 'grep/egrep/fgrep' search, with delta0
732332Sbostic      table as in original paper (CACM, October, 1977).  No delta1 or delta2.
832332Sbostic      According to experiment (Horspool, Soft. Prac. Exp., 1982), delta2 is of
932332Sbostic      minimal practical value.  However, to improve for worst case input,
1032332Sbostic      integrating the improved Galil strategies (Apostolico/Giancarlo, SIAM. J.
1132332Sbostic      Comput., Feb. 1986) deserves consideration.
1232332Sbostic 
1332332Sbostic      Method: 	extract longest metacharacter-free string from expression.
1432332Sbostic 		this is done using a side-effect from henry spencer's regcomp().
1532332Sbostic 		use boyer-moore to match such, then pass submatching lines
1632332Sbostic 		to either regexp() or standard 'egrep', depending on certain
1732332Sbostic 		criteria within execstrategy() below.  [this tradeoff is due
1832332Sbostic 		to the general slowness of the regexp() nondeterministic
1932332Sbostic 		machine on complex expressions, as well as the startup time
2032332Sbostic 		of standard 'egrep' on short files.]  alternatively, one may
2132332Sbostic 		change the vendor-supplied 'egrep' automaton to include
2232332Sbostic 		boyer-moore directly.  see accompanying writeup for discussion
2332332Sbostic 		of kanji expression treatment.
2432332Sbostic 
2532332Sbostic 		late addition:  apply trickbag for fast match of simple
2632332Sbostic 		alternations (sublinear, in common low-cardinality cases).
2732332Sbostic 		trap fgrep into this lair.
2832332Sbostic 
2932332Sbostic 		gnu additions:  -f, newline as |, \< and \> [in regexec()], more
3032332Sbostic 				comments.  inspire better dfa exec() strategy.
3132332Sbostic 				serious testing and help with special cases.
3232332Sbostic 
3332332Sbostic      Algorithm amalgam summary:
3432332Sbostic 
3532332Sbostic 		dfa e?grep 		(aho/thompson)
3632332Sbostic 		ndfa regexp() 		(spencer/aho)
3732332Sbostic 		bmg			(boyer/moore/gosper)
3832332Sbostic 		"superimposed" bmg   	(jaw)
3932332Sbostic 		fgrep			(aho/corrasick)
4032332Sbostic 
4132332Sbostic 		sorry, but the knuth/morris/pratt machine, horspool's
4232332Sbostic 		"frequentist" code, and the rabin/karp matcher, however cute,
4332332Sbostic 		just don't cut it for this production.
4432332Sbostic 
4532332Sbostic      James A. Woods				Copyright (c) 1986
4632332Sbostic      NASA Ames Research Center
4732332Sbostic */
4832332Sbostic #include <stdio.h>
4932332Sbostic #include <ctype.h>
5032332Sbostic #include <sys/types.h>
5132332Sbostic #include <sys/stat.h>
5232337Sbostic #include <sys/file.h>
5332332Sbostic #include <regexp.h>		/* must be henry spencer's version */
5432332Sbostic 
5532332Sbostic #define	MIN(A, B)	((A) > (B) ? (B) : (A))
5632332Sbostic 
5732332Sbostic #ifdef	SLOWSYS
5832332Sbostic #define read	xread
5932332Sbostic #endif
6032332Sbostic /*
6132332Sbostic  * define existing [ef]?grep program locations below for use by execvp().
6232332Sbostic  * [execlp() would be used were it not for the possibility of
6332332Sbostic  * installation-dependent recursion.]
6432332Sbostic  */
6532332Sbostic #ifndef EGREPSTD
6632332Sbostic #define	EGREPSTD	"/usr/bin/egrep"
6732332Sbostic #endif
6832332Sbostic #ifndef GREPSTD
6932332Sbostic #define	GREPSTD		"/bin/grep"
7032332Sbostic #endif
7132332Sbostic #ifndef FGREPSTD
7232332Sbostic #define	FGREPSTD	"/usr/bin/fgrep"
7332332Sbostic #endif
7432332Sbostic 
7532332Sbostic #define BUFSIZE	8192		/* make higher for cray */
7632332Sbostic #define PATSIZE 6000
7732332Sbostic #define LARGE 	BUFSIZE + PATSIZE
7832332Sbostic 
7932332Sbostic #define NALT	7		/* tied to scanf() size in alternate() */
8032332Sbostic #define NMUSH	6		/* loosely relates to expected alt length */
8132332Sbostic 
8232332Sbostic #define	FIRSTFEW	33 	/* Always do FIRSTFEW matches with regexec() */
8332332Sbostic #define	PUNTPERCENT	10	/* After FIRSTFEW, if PUNTPERCENT of the input
8432332Sbostic 				 * was processed by regexp(), exec std egrep. */
8532332Sbostic #define NL	'\n'
8632332Sbostic #define	EOS	'\0'
8732332Sbostic #define	NONASCII	0200	/* Bit mask for Kanji non-ascii chars */
8832332Sbostic #define META	"\n^$.[]()?+*|\\"	/* egrep meta-characters */
8932332Sbostic #define SS2	'\216'		/* EUC Katakana (or Chinese2) prefix */
9032332Sbostic #define SS3	'\217'		/* EUC Kanji2 (or Chinese3) prefix */
9132332Sbostic 
9232332Sbostic extern char *optarg;
9332332Sbostic extern int optind;
9432332Sbostic char *progname;
9532332Sbostic 
9632332Sbostic int cflag, iflag, eflag, fflag, lflag, nflag;	/* SVID flags */
9732332Sbostic int sflag, hflag;		/* v7, v8, bsd */
9832332Sbostic 
9932332Sbostic int firstflag;			/* Stop at first match */
10032332Sbostic int grepflag;			/* Called as "grep" */
10132332Sbostic int fgrepflag;			/* Called as "fgrep" */
10232332Sbostic int altflag;			/* Simple alternation in pattern */
10332332Sbostic int boyonly;			/* No regexp needed -- all simple */
10432332Sbostic int flushflag;
10532332Sbostic int grepold, egrepold, fgrepold;
10632332Sbostic 
10732332Sbostic int nalt;			/* Number of alternatives */
10832332Sbostic int nsuccess;			/* 1 for match, 2 for error */
10932332Sbostic int altmin;			/* Minimum length of all the alternate
11032332Sbostic 				 * strings */
11132332Sbostic int firstfile;			/* argv index of first file argument */
11232332Sbostic int patind;			/* argv index of pattern */
11332332Sbostic long nmatch;			/* Number of matches in this file */
11432332Sbostic long incount, counted;		/* Amount of input consumed */
11532332Sbostic long rxcount;			/* Bytes of input processed by regexec() */
11632332Sbostic int boyfound;			/* accumulated partial matches (tripped by
11732332Sbostic 				 * FIRSTFEW) */
11832332Sbostic int prevmatch;			/* next three lines aid fast -n */
11932332Sbostic long nline, prevnline;
12032332Sbostic char *prevloc;
12132332Sbostic 
12232332Sbostic regexp *rspencer;
12332332Sbostic char *pattern;
12432332Sbostic char *patboy;			/* Pattern for simple Boyer-Moore */
12532332Sbostic char *patfile;			/* Filename containing pattern(s) */
12632332Sbostic 
12732332Sbostic int delta0[256];		/* Boyer-Moore algorithm core */
12832332Sbostic char cmap[256];			/* Usually 0-255, but if -i, maps upper to
12932332Sbostic 				 * lower case */
13032332Sbostic char str[BUFSIZE + 2];
13132332Sbostic int nleftover;
13232332Sbostic char linetemp[BUFSIZE];
13332339Sbostic char *altpat[NALT];		/* alternation component storage */
13432332Sbostic int altlen[NALT];
13532332Sbostic short altset[NMUSH + 1][256];
13632332Sbostic char preamble[200];		/* match prefix (filename, line no.) */
13732332Sbostic 
13832332Sbostic int fd;
13932332Sbostic char *
14032332Sbostic strchr(), *strrchr(), *strcpy(), *strncpy(), *strpbrk(), *sprintf(), *malloc();
14132332Sbostic char *
14232332Sbostic grepxlat(), *fold(), *pfile(), *alternate(), *isolate();
14332332Sbostic char **args;
14432332Sbostic 
14532332Sbostic main(argc, argv)
14632332Sbostic 	int argc;
14732332Sbostic 	char *argv[];
14832332Sbostic {
14932332Sbostic 	int c;
15032332Sbostic 	int errflag = 0;
15132332Sbostic 
15232332Sbostic 	args = argv;
15332332Sbostic 
15432332Sbostic 	if ((progname = strrchr(argv[0], '/')) != 0)
15532332Sbostic 		progname++;
15632332Sbostic 	else
15732332Sbostic 		progname = argv[0];
15832332Sbostic 	if (strcmp(progname, "grep") == 0)
15932332Sbostic 		grepflag++;
16032332Sbostic 	if (strcmp(progname, "fgrep") == 0)
16132332Sbostic 		fgrepflag++;
16232332Sbostic 
16332332Sbostic 	while ((c = getopt(argc, argv, "bchie:f:lnsvwxy1")) != EOF) {
16432332Sbostic 		switch (c) {
16532332Sbostic 
16632332Sbostic 		case 'f':
16732332Sbostic 			fflag++;
16832332Sbostic 			patfile = optarg;
16932332Sbostic 			continue;
17032332Sbostic 		case 'b':
17132332Sbostic 		case 'v':
17232332Sbostic 			egrepold++;	/* boyer-moore of little help here */
17332332Sbostic 			continue;
17432332Sbostic 		case 'c':
17532332Sbostic 			cflag++;
17632332Sbostic 			continue;
17732332Sbostic 		case 'e':
17832332Sbostic 			eflag++;
17932332Sbostic 			pattern = optarg;
18032332Sbostic 			continue;
18132332Sbostic 		case 'h':
18232332Sbostic 			hflag++;
18332332Sbostic 			continue;
18432332Sbostic 		case '1':	/* Stop at very first match */
18532332Sbostic 			firstflag++;	/* spead freaks only */
18632332Sbostic 			continue;
18732332Sbostic 		case 'i':
18832332Sbostic 			iflag++;
18932332Sbostic 			continue;
19032332Sbostic 		case 'l':
19132332Sbostic 			lflag++;
19232332Sbostic 			continue;
19332332Sbostic 		case 'n':
19432332Sbostic 			nflag++;
19532332Sbostic 			continue;
19632332Sbostic 		case 's':
19732332Sbostic 			sflag++;
19832332Sbostic 			continue;
19932332Sbostic 		case 'w':
20032332Sbostic 		case 'y':
20132332Sbostic 			if (!grepflag)
20232332Sbostic 				errflag++;
20332332Sbostic 			grepold++;
20432332Sbostic 			continue;
20532332Sbostic 		case 'x':	/* needs more work, like -b above */
20632332Sbostic 			if (!fgrepflag)
20732332Sbostic 				errflag++;
20832332Sbostic 			fgrepold++;
20932332Sbostic 			continue;
21032332Sbostic 		case '?':
21132332Sbostic 			errflag++;
21232332Sbostic 		}
21332332Sbostic 	}
21432332Sbostic 	if (errflag || ((argc <= optind) && !fflag && !eflag)) {
21532332Sbostic 		if (grepflag)
21632332Sbostic oops("usage: grep [-bcihlnsvwy] [-e] pattern [file ...]");
21732332Sbostic 		else if (fgrepflag)
21832332Sbostic oops("usage: fgrep [-bcilnv] {-f patfile | [-e] strings} [file ...]");
21932332Sbostic 		else		/* encourage SVID options, though we provide
22032332Sbostic 				 * others */
22132332Sbostic oops("usage: egrep [-bcilnv] {-f patfile | [-e] pattern} [file ...]");
22232332Sbostic 	}
22332332Sbostic 	patind = optind;
22432332Sbostic 	if (fflag)
22532332Sbostic 		pattern = pfile(patfile), patind = 0;
22632332Sbostic 	else if (!eflag)
22732332Sbostic 		pattern = argv[optind++];
22832332Sbostic 
22932332Sbostic 	if ((argc - optind) <= 1)	/* Filename invisible given < 2 files */
23032332Sbostic 		hflag++;
23132332Sbostic 	if (pattern[0] == EOS)
23232332Sbostic 		kernighan(argv);	/* same as it ever was */
23332332Sbostic 	/*
23432332Sbostic 	 * 'grep/egrep' merger -- "old" grep is called to handle: tagged
23532332Sbostic 	 * exprs \( \), word matches \< and \>, -w and -y options, char
23632332Sbostic 	 * classes with '-' at end (egrep bug?), and patterns beginning with
23732332Sbostic 	 * an asterisk (don't ask why). otherwise, characters meaningful to
23832332Sbostic 	 * 'egrep' but not to 'grep' are escaped; the entire expr is then
23932332Sbostic 	 * passed to 'egrep'.
24032332Sbostic 	 */
24132332Sbostic 	if (grepflag && !grepold) {
24232332Sbostic 		if (strindex(pattern, "\\(") >= 0 ||
24332332Sbostic 		    strindex(pattern, "\\<") >= 0 ||
24432332Sbostic 		    strindex(pattern, "\\>") >= 0 ||
24532332Sbostic 		    strindex(pattern, "-]") >= 0 ||
24632332Sbostic 		    pattern[0] == '*')	/* grep bug */
24732332Sbostic 			grepold++;
24832332Sbostic 		else
24932332Sbostic 			pattern = grepxlat(pattern);
25032332Sbostic 	}
25132332Sbostic 	if (grepold || egrepold || fgrepold)
25232332Sbostic 		kernighan(argv);
25332332Sbostic 
25432332Sbostic 	if (iflag)
25532332Sbostic 		strcpy(pattern, fold(pattern));
25632332Sbostic 	/*
25732332Sbostic 	 * If the pattern is a plain string, just run boyer-moore. If it
25832332Sbostic 	 * consists of meta-free alternatives, run "superimposed" bmg.
25932332Sbostic 	 * Otherwise, find best string, and compile pattern for regexec().
26032332Sbostic 	 */
26132332Sbostic 	if (strpbrk(pattern, META) == NULL) {	/* do boyer-moore only */
26232332Sbostic 		boyonly++;
26332332Sbostic 		patboy = pattern;
26432332Sbostic 	} else {
26532332Sbostic 		if ((patboy = alternate(pattern)) != NULL)
26632332Sbostic 			boyonly++;
26732332Sbostic 		else {
26832332Sbostic 			if ((patboy = isolate(pattern)) == NULL)
26932332Sbostic 				kernighan(argv);	/* expr too involved */
27032332Sbostic #ifndef NOKANJI
27132332Sbostic 			for (c = 0; pattern[c] != EOS; c++)
27232332Sbostic 				if (pattern[c] & NONASCII)	/* kanji + meta */
27332332Sbostic 					kernighan(argv);
27432332Sbostic #endif
27532332Sbostic 			if ((rspencer = regcomp(pattern)) == NULL)
27632332Sbostic 				oops("regcomp failure");
27732332Sbostic 		}
27832332Sbostic 	}
27932332Sbostic 	gosper(patboy);		/* "pre-conditioning is wonderful"
28032332Sbostic 				 * -- v. strassen */
28132332Sbostic 
28232332Sbostic 	if ((firstfile = optind) >= argc) {
28332332Sbostic 		/* Grep standard input */
28432332Sbostic 		if (lflag)	/* We don't know its name! */
28532332Sbostic 			exit(1);
28632332Sbostic 		egsecute((char *) NULL);
28732332Sbostic 	} else {
28832332Sbostic 		while (optind < argc) {
28932332Sbostic 			egsecute(argv[optind]);
29032332Sbostic 			optind++;
29132332Sbostic 			if (firstflag && (nsuccess == 1))
29232332Sbostic 				break;
29332332Sbostic 		}
29432332Sbostic 	}
29532332Sbostic 	exit((nsuccess == 2) ? 2 : (nsuccess == 0));
29632332Sbostic }
29732332Sbostic 
29832332Sbostic char *
29932332Sbostic pfile(pfname)			/* absorb expression from file */
30032332Sbostic 	char *pfname;
30132332Sbostic {
30232337Sbostic 	int fd;
30332332Sbostic 	struct stat patstat;
30432332Sbostic 	static char *pat;
30532332Sbostic 
30632337Sbostic 	if ((fd = open(pfname, O_RDONLY, 0)) < 0)
30732332Sbostic 		oops("can't read pattern file");
30832337Sbostic 	if (fstat(fd, &patstat) != 0)
30932332Sbostic 		oops("can't stat pattern file");
31032332Sbostic 	if (patstat.st_size > PATSIZE) {
31132332Sbostic 		if (fgrepflag) {	/* defer to unix version */
31232332Sbostic 			fgrepold++;
31332332Sbostic 			return "dummy";
31432332Sbostic 		} else
31532332Sbostic 			oops("pattern file too big");
31632332Sbostic 	}
31732332Sbostic 	if ((pat = malloc((unsigned) patstat.st_size + 1)) == NULL)
31832332Sbostic 		oops("out of memory to read pattern file");
31932337Sbostic 	if (patstat.st_size != read(fd, pat, (int)patstat.st_size))
32032332Sbostic 		oops("error reading pattern file");
32132337Sbostic 	(void) close(fd);
32232332Sbostic 
32332332Sbostic 	pat[patstat.st_size] = EOS;
32432332Sbostic 	if (pat[patstat.st_size - 1] == NL)	/* NOP for egrep; helps grep */
32532332Sbostic 		pat[patstat.st_size - 1] = EOS;
32632332Sbostic 
32732332Sbostic 	if (nlcount(pat, &pat[patstat.st_size]) > NALT) {
32832332Sbostic 		if (fgrepflag)
32932332Sbostic 			fgrepold++;	/* "what's it all about, alfie?" */
33032332Sbostic 		else
33132332Sbostic 			egrepold++;
33232332Sbostic 	}
33332332Sbostic 	return (pat);
33432332Sbostic }
33532332Sbostic 
33632332Sbostic egsecute(file)
33732332Sbostic 	char *file;
33832332Sbostic {
33932332Sbostic 	if (file == NULL)
34032332Sbostic 		fd = 0;
34132339Sbostic 	else if ((fd = open(file, O_RDONLY, 0)) <= 0) {
34232332Sbostic 		fprintf(stderr, "%s: can't open %s\n", progname, file);
34332332Sbostic 		nsuccess = 2;
34432332Sbostic 		return;
34532332Sbostic 	}
34632332Sbostic 	chimaera(file, patboy);
34732332Sbostic 
34832332Sbostic 	if (!boyonly && !flushflag && file != NULL)
34932332Sbostic 		flushmatches();
35032332Sbostic 	if (file != NULL)
35132332Sbostic 		close(fd);
35232332Sbostic }
35332332Sbostic 
35432332Sbostic chimaera(file, pat)		/* "reach out and boyer-moore search someone" */
35532332Sbostic 	char *file, *pat;	/* -- soon-to-be-popular bumper sticker */
35632332Sbostic {
35732332Sbostic 	register char *k, *strend, *s;
35832332Sbostic 	register int j, count;
35932332Sbostic 	register int *deltazero = delta0;
36032332Sbostic 	int patlen = altmin;
36132332Sbostic 	char *t;
36232332Sbostic 	char *gotamatch(), *kanji(), *linesave(), *submatch();
36332332Sbostic 
36432332Sbostic 	nleftover = boyfound = flushflag = 0;
36532332Sbostic 	nline = 1L;
36632332Sbostic 	prevmatch = 0;
36732332Sbostic 	nmatch = counted = rxcount = 0L;
36832332Sbostic 
36932332Sbostic 	while ((count = read(fd, str + nleftover, BUFSIZE - nleftover)) > 0) {
37032332Sbostic 
37132332Sbostic 		counted += count;
37232332Sbostic 		strend = linesave(str, count);
37332332Sbostic 
37432332Sbostic 		for (k = str + patlen - 1; k < strend;) {
37532332Sbostic 			/*
37632332Sbostic 			 * for a large class of patterns, upwards of 80% of
37732332Sbostic 			 * match time is spent on the next line.  we beat
37832332Sbostic 			 * existing microcode (vax 'matchc') this way.
37932332Sbostic 			 */
38032332Sbostic 			while ((k += deltazero[*(unsigned char *) k]) < strend);
38132332Sbostic 			if (k < (str + LARGE))
38232332Sbostic 				break;
38332332Sbostic 			k -= LARGE;
38432332Sbostic 
38532332Sbostic 			if (altflag) {
38632332Sbostic 				/*
38732332Sbostic 				 * Parallel Boyer-Moore.  Check whether each
38832332Sbostic 				 * of the previous <altmin> chars COULD be
38932332Sbostic 				 * from one of the alternative strings.
39032332Sbostic 				 */
39132332Sbostic 				s = k - 1;
39232332Sbostic 				j = altmin;
39332332Sbostic 				while (altset[--j][(unsigned char)
39432332Sbostic 					     cmap[*(unsigned char *) s--]]);
39532332Sbostic 				/*
39632332Sbostic 				 * quick test fails. in this life, compare
39732332Sbostic 				 * 'em all.  but, a "reverse trie" would
39832332Sbostic 				 * attenuate worst case (linear w/delta2?).
39932332Sbostic 				 */
40032332Sbostic 				if (--j < 0) {
40132332Sbostic 					count = nalt - 1;
40232332Sbostic 					do {
40332332Sbostic 						s = k;
40432332Sbostic 						j = altlen[count];
40532332Sbostic 						t = altpat[count];
40632332Sbostic 
40732332Sbostic 						while
40832332Sbostic 							(cmap[*(unsigned char *) s--]
40932332Sbostic 							 == t[--j]);
41032332Sbostic 						if (j < 0)
41132332Sbostic 							break;
41232332Sbostic 					}
41332332Sbostic 					while (count--);
41432332Sbostic 				}
41532332Sbostic 			} else {
41632332Sbostic 				/* One string -- check it */
41732332Sbostic 				j = patlen - 1;
41832332Sbostic 				s = k - 1;
41932332Sbostic 				while (cmap[*(unsigned char *) s--] == pat[--j]);
42032332Sbostic 			}
42132332Sbostic 			/*
42232332Sbostic 			 * delta-less shortcut for literati. short shrift for
42332332Sbostic 			 * genetic engineers?
42432332Sbostic 			 */
42532332Sbostic 			if (j >= 0) {
42632332Sbostic 				k++;	/* no match; restart next char */
42732332Sbostic 				continue;
42832332Sbostic 			}
42932332Sbostic 			k = submatch(file, pat, str, strend, k, count);
43032332Sbostic 			if (k == NULL)
43132332Sbostic 				return;
43232332Sbostic 		}
43332332Sbostic 		if (nflag) {
43432332Sbostic 			if (prevmatch)
43532332Sbostic 				nline = prevnline + nlcount(prevloc, k);
43632332Sbostic 			else
43732332Sbostic 				nline = nline + nlcount(str, k);
43832332Sbostic 			prevmatch = 0;
43932332Sbostic 		}
44032332Sbostic 		strncpy(str, linetemp, nleftover);
44132332Sbostic 	}
44232332Sbostic 	if (cflag) {
44332332Sbostic 		/* Bug from old grep: -c overrides -h.  We fix the bug. */
44432332Sbostic 		if (!hflag)
44532332Sbostic 			printf("%s:", file);
44632332Sbostic 		printf("%ld\n", nmatch);
44732332Sbostic 	}
44832332Sbostic }
44932332Sbostic 
45032332Sbostic char *
45132332Sbostic linesave(str, count)		/* accumulate partial line at end of buffer */
45232332Sbostic 	char str[];
45332332Sbostic 	register int count;
45432332Sbostic {
45532332Sbostic 	register int j;
45632332Sbostic 
45732332Sbostic 	count += nleftover;
45832332Sbostic 	if (count != BUFSIZE && fd != 0)
45932332Sbostic 		str[count++] = NL;	/* insurance for broken last line */
46032332Sbostic 	str[count] = EOS;
46132332Sbostic 	for (j = count - 1; str[j] != NL && j >= 0;)
46232332Sbostic 		j--;
46332332Sbostic 	/*
46432332Sbostic 	 * break up these lines: long line (> BUFSIZE), last line of file, or
46532332Sbostic 	 * short return from read(), as from tee(1) input
46632332Sbostic 	 */
46732332Sbostic 	if (j < 0 && (count == (BUFSIZE - nleftover))) {
46832332Sbostic 		str[count++] = NL;
46932332Sbostic 		str[count] = EOS;
47032332Sbostic 		linetemp[0] = EOS;
47132332Sbostic 		nleftover = 0;
47232332Sbostic 		return (str + count);
47332332Sbostic 	} else {
47432332Sbostic 		nleftover = count - j - 1;
47532332Sbostic 		strncpy(linetemp, str + j + 1, nleftover);
47632332Sbostic 		return (str + j);
47732332Sbostic 	}
47832332Sbostic }
47932332Sbostic 
48032332Sbostic /*
48132332Sbostic  * Process partial match. First check for mis-aligned Kanji, then match line
48232332Sbostic  * against full compiled r.e. if statistics do not warrant handing off to
48332332Sbostic  * standard egrep.
48432332Sbostic  */
48532332Sbostic char *
48632332Sbostic submatch(file, pat, str, strend, k, altindex)
48732332Sbostic 	char file[], pat[], str[];
48832332Sbostic 	register char *strend, *k;
48932332Sbostic 	int altindex;
49032332Sbostic {
49132332Sbostic 	register char *s;
49232332Sbostic 	char *t, c;
49332332Sbostic 
49432332Sbostic 	t = k;
49532332Sbostic 	s = ((altflag) ? k - altlen[altindex] + 1 : k - altmin + 1);
49632332Sbostic #ifndef NOKANJI
49732332Sbostic 	c = ((altflag) ? altpat[altindex][0] : pat[0]);
49832332Sbostic 	if (c & NONASCII)
49932332Sbostic 		if ((s = kanji(str, s, k)) == NULL)
50032332Sbostic 			return (++k);	/* reject false kanji */
50132332Sbostic #endif
50232332Sbostic 	do;
50332332Sbostic 	while (*s != NL && --s >= str);
50432332Sbostic 	k = s + 1;		/* now at line start */
50532332Sbostic 
50632332Sbostic 	if (boyonly)
50732332Sbostic 		return (gotamatch(file, k));
50832332Sbostic 
50932332Sbostic 	incount = counted - (strend - k);
51032332Sbostic 	if (boyfound++ == FIRSTFEW)
51132332Sbostic 		execstrategy(file);
51232332Sbostic 
51332332Sbostic 	s = t;
51432332Sbostic 	do
51532332Sbostic 		rxcount++;
51632332Sbostic 	while (*s++ != NL);
51732332Sbostic 	*--s = EOS;
51832332Sbostic 	/*
51932332Sbostic 	 * "quick henry -- the flit" (after theodor geisel)
52032332Sbostic 	 */
52132332Sbostic 	if (regexec(rspencer, ((iflag) ? fold(k) : k)) == 1) {
52232332Sbostic 		*s = NL;
52332332Sbostic 		if (gotamatch(file, k) == NULL)
52432332Sbostic 			return (NULL);
52532332Sbostic 	}
52632332Sbostic 	*s = NL;
52732332Sbostic 	return (s + 1);
52832332Sbostic }
52932332Sbostic 
53032338Sbostic #ifndef NOKANJI
53132332Sbostic /*
53232332Sbostic  * EUC code disambiguation -- scan backwards to first 7-bit code, while
53332332Sbostic  * counting intervening 8-bit codes.  If odd, reject unaligned Kanji pattern.
53432332Sbostic  * SS2/3 checks are for intermixed Japanase Katakana or Kanji2.
53532332Sbostic  */
53632332Sbostic char *
53732332Sbostic kanji(str, s, k)
53832332Sbostic 	register char *str, *s, *k;
53932332Sbostic {
54032332Sbostic 	register int j = 0;
54132332Sbostic 
54232332Sbostic 	for (s--; s >= str; s--) {
54332332Sbostic 		if (*s == SS2 || *s == SS3 || (*s & NONASCII) == 0)
54432332Sbostic 			break;
54532332Sbostic 		j++;
54632332Sbostic 	}
54732332Sbostic #ifndef CHINESE
54832332Sbostic 	if (*s == SS2)
54932332Sbostic 		j -= 1;
55032332Sbostic #endif  CHINESE
55132332Sbostic 	return ((j & 01) ? NULL : k);
55232332Sbostic }
55332338Sbostic #endif
55432332Sbostic 
55532332Sbostic /*
55632332Sbostic  * Compute "Boyer-Moore" delta table -- put skip distance in delta0[c]
55732332Sbostic  */
55832332Sbostic gosper(pattern)
55932332Sbostic 	char *pattern;		/* ... HAKMEM lives ... */
56032332Sbostic {
56132332Sbostic 	register int i, j;
56232332Sbostic 	unsigned char c;
56332332Sbostic 
56432332Sbostic 	/* Make one-string case look like simple alternatives case */
56532332Sbostic 	if (!altflag) {
56632332Sbostic 		nalt = 1;
56732332Sbostic 		altmin = altlen[0] = strlen(pattern);
56832339Sbostic 		altpat[0] = pattern;
56932332Sbostic 	}
57032332Sbostic 	/* For chars that aren't in any string, skip by string length. */
57132332Sbostic 	for (j = 0; j < 256; j++) {
57232332Sbostic 		delta0[j] = altmin;
57332332Sbostic 		cmap[j] = j;	/* Sneak in initialization of cmap */
57432332Sbostic 	}
57532332Sbostic 
57632332Sbostic 	/* For chars in a string, skip distance from char to end of string. */
57732332Sbostic 	/* (If char appears more than once, skip minimum distance.) */
57832332Sbostic 	for (i = 0; i < nalt; i++)
57932332Sbostic 		for (j = 0; j < altlen[i] - 1; j++) {
58032332Sbostic 			c = altpat[i][j];
58132332Sbostic 			delta0[c] = MIN(delta0[c], altlen[i] - j - 1);
58232332Sbostic 			if (iflag && islower((int) c))
58332332Sbostic 				delta0[toupper((int) c)] = delta0[c];
58432332Sbostic 		}
58532332Sbostic 
58632332Sbostic 	/* For last char of each string, fall out of search loop. */
58732332Sbostic 	for (i = 0; i < nalt; i++) {
58832332Sbostic 		c = altpat[i][altlen[i] - 1];
58932332Sbostic 		delta0[c] = LARGE;
59032332Sbostic 		if (iflag && islower((int) c))
59132332Sbostic 			delta0[toupper((int) c)] = LARGE;
59232332Sbostic 	}
59332332Sbostic 	if (iflag)
59432332Sbostic 		for (j = 'A'; j <= 'Z'; j++)
59532332Sbostic 			cmap[j] = tolower((int) j);
59632332Sbostic }
59732332Sbostic 
59832332Sbostic /*
59932332Sbostic  * Print, count, or stop on full match. Result is either the location for
60032332Sbostic  * continued search, or NULL to stop.
60132332Sbostic  */
60232332Sbostic char *
60332332Sbostic gotamatch(file, s)
60432332Sbostic 	register char *file, *s;
60532332Sbostic {
60632332Sbostic 	char *savematch();
60732332Sbostic 	int squirrel = 0;	/* nonzero to squirrel away FIRSTFEW matches */
60832332Sbostic 
60932332Sbostic 	nmatch++;
61032332Sbostic 	nsuccess = 1;
61132332Sbostic 	if (!boyonly && boyfound <= FIRSTFEW && file != NULL)
61232332Sbostic 		squirrel = 1;
61332332Sbostic 
61432332Sbostic 	if (sflag)
61532332Sbostic 		return (NULL);	/* -s usurps all flags (unlike some versions) */
61632332Sbostic 	if (cflag) {		/* -c overrides -l, we guess */
61732332Sbostic 		do;
61832332Sbostic 		while (*s++ != NL);
61932332Sbostic 	} else if (lflag) {
62032332Sbostic 		puts(file);
62132332Sbostic 		return (NULL);
62232332Sbostic 	} else {
62332332Sbostic 		if (!hflag)
62432332Sbostic 			if (!squirrel)
62532332Sbostic 				printf("%s:", file);
62632332Sbostic 			else
62732332Sbostic 				sprintf(preamble, "%s:", file);
62832332Sbostic 		if (nflag) {
62932332Sbostic 			if (prevmatch)
63032332Sbostic 				prevnline = prevnline + nlcount(prevloc, s);
63132332Sbostic 			else
63232332Sbostic 				prevnline = nline + nlcount(str, s);
63332332Sbostic 			prevmatch = 1;
63432332Sbostic 
63532332Sbostic 			if (!squirrel)
63632332Sbostic 				printf("%ld:", prevnline);
63732332Sbostic 			else
63832332Sbostic 				sprintf(preamble + strlen(preamble),
63932332Sbostic 					"%ld:", prevnline);
64032332Sbostic 		}
64132332Sbostic 		if (!squirrel) {
64232332Sbostic 			do
64332332Sbostic 				putchar(*s);
64432332Sbostic 			while (*s++ != NL);
64532332Sbostic 		} else
64632332Sbostic 			s = savematch(s);
64732332Sbostic 
64832332Sbostic 		if (nflag)
64932332Sbostic 			prevloc = s - 1;
65032332Sbostic 	}
65132332Sbostic 	return ((firstflag && !cflag) ? NULL : s);
65232332Sbostic }
65332332Sbostic 
65432332Sbostic char *
65532332Sbostic fold(line)
65632332Sbostic 	char *line;
65732332Sbostic {
65832332Sbostic 	static char fline[BUFSIZE];
65932332Sbostic 	register char *s, *t = fline;
66032332Sbostic 
66132332Sbostic 	for (s = line; *s != EOS; s++)
66232332Sbostic 		*t++ = (isupper((int) *s) ? (char) tolower((int) *s) : *s);
66332332Sbostic 	*t = EOS;
66432332Sbostic 	return (fline);
66532332Sbostic }
66632332Sbostic 
66732332Sbostic strindex(s, t)			/* the easy way, as in K&P, p. 192 */
66832332Sbostic 	char *s, *t;
66932332Sbostic {
67032332Sbostic 	int i, n;
67132332Sbostic 
67232332Sbostic 	n = strlen(t);
67332332Sbostic 	for (i = 0; s[i] != '\0'; i++)
67432332Sbostic 		if (strncmp(s + i, t, n) == 0)
67532332Sbostic 			return (i);
67632332Sbostic 	return (-1);
67732332Sbostic }
67832332Sbostic 
67932332Sbostic char *
68032332Sbostic grepxlat(pattern)		/* grep pattern meta conversion */
68132332Sbostic 	char *pattern;
68232332Sbostic {
68332332Sbostic 	register char *p, *s;
68432332Sbostic 	static char newpat[BUFSIZE];
68532332Sbostic 
68632332Sbostic 	for (s = newpat, p = pattern; *p != EOS;) {
68732332Sbostic 		if (*p == '\\') {	/* skip escapes ... */
68832332Sbostic 			*s++ = *p++;
68932332Sbostic 			if (*p)
69032332Sbostic 				*s++ = *p++;
69132332Sbostic 		} else if (*p == '[') {	/* ... and char classes */
69232332Sbostic 			while (*p != EOS && *p != ']')
69332332Sbostic 				*s++ = *p++;
69432332Sbostic 		} else if (strchr("+?|()", *p) != NULL) {
69532332Sbostic 			*s++ = '\\';	/* insert protection */
69632332Sbostic 			*s++ = *p++;
69732332Sbostic 		} else
69832332Sbostic 			*s++ = *p++;
69932332Sbostic 	}
70032332Sbostic 	*s = EOS;
70132332Sbostic 	grepflag = ((patind) ? 0 : 1);
70232332Sbostic 	return (newpat);
70332332Sbostic }
70432332Sbostic 
70532332Sbostic /*
70632332Sbostic  * Test for simple alternation.  Result is NULL if it's not so simple, or is
70732332Sbostic  * a pointer to the first string if it is. Warning:  sscanf size is a
70832332Sbostic  * fixpoint, beyond which the speedup linearity starts to break down.  In the
70932332Sbostic  * wake of the elegant aho/corrasick "trie"-based fgrep, generalizing
71032332Sbostic  * altpat[] to arbitrary size is not useful.
71132332Sbostic  */
71232332Sbostic char *
71332332Sbostic alternate(regexpr)
71432332Sbostic 	char *regexpr;
71532332Sbostic {
71632339Sbostic 	register int i, j;
71732339Sbostic 	register char *start, *stop;
71832332Sbostic 	unsigned char c;
71932332Sbostic 
72032332Sbostic 	if (fgrepflag && strchr(regexpr, '|'))
72132339Sbostic 			return (NULL);
72232332Sbostic 
72332339Sbostic 	/*
72432339Sbostic 	 * break pattern up into altpat array; delimit on newline, bar,
72532339Sbostic 	 * or EOS.  We know we won't overflow, we've already checked the
72632339Sbostic 	 * number of patterns we're going to find against NALT.
72732339Sbostic 	 * Also, set length of pattern and find minimum pattern length.
72832339Sbostic 	 */
72932339Sbostic 	nalt = 0;
73032339Sbostic 	altmin = NMUSH;
73132339Sbostic 	for (start = stop = regexpr;; ++stop)
73232339Sbostic 		if (!*stop || *stop == '|' || *stop == NL) {
73332339Sbostic 			altlen[nalt] = j = stop - start;
73432339Sbostic 			if (j < altmin)
73532339Sbostic 				altmin = j;
73632339Sbostic 			if (!(altpat[nalt] = malloc((u_int)(j + 1))))
73732339Sbostic 				oops("out of memory");
73832339Sbostic 			bcopy(start, altpat[nalt], j);
73932339Sbostic 			altpat[nalt][j] = EOS;
740*32362Sbostic 			++nalt;
74132339Sbostic 			if (!*stop)
74232339Sbostic 				break;
743*32362Sbostic 			if (nalt == NALT)
74432339Sbostic 				return(NULL);
74532339Sbostic 			if (*stop == NL)
74632339Sbostic 				*stop = '|';
74732339Sbostic 			start = stop + 1;
74832339Sbostic 		}
74932332Sbostic 	if (!fgrepflag) {
75032332Sbostic 		if (strchr(regexpr, '|') == NULL || regexpr[0] == '|')
75132332Sbostic 			return (NULL);
75232332Sbostic 		if (strpbrk(regexpr, "^$.[]()?+*\\") != NULL
75332332Sbostic 		    || strindex(regexpr, "||") >= 0)
75432332Sbostic 			return (NULL);
75532332Sbostic 	}
75632332Sbostic 
75732332Sbostic 	if (nalt > 1) {		/* build superimposed "pre-match" sets per
75832332Sbostic 				 * char */
75932332Sbostic 		altflag++;
76032332Sbostic 		for (j = 0; j < nalt; j++)
76132332Sbostic 			for (i = 0; i < altmin; i++) {
76232332Sbostic 				c = altpat[j][altlen[j] - altmin + i];
76332332Sbostic 				altset[i + 1][c] = 1;	/* offset for sentinel */
76432332Sbostic 			}
76532332Sbostic 	}
76632332Sbostic 	return (altpat[0]);
76732332Sbostic }
76832332Sbostic 
76932332Sbostic /*
77032332Sbostic  * Grapple with the dfa (std egrep) vs. ndfa (regexp) tradeoff. Criteria to
77132332Sbostic  * determine whether to use dfa-based egrep:  We do FIRSTFEW matches with
77232332Sbostic  * regexec().  If Boyer-Moore up to now matched more than PUNTPERCENT
77332332Sbostic  * of the input, the r.e. is likely to be underspecified, so do old *grep,
77432332Sbostic  * which is faster on complex patterns than regexp().  At FIRSTFEW,
77532332Sbostic  * dump the saved matches collected by savematch(). They are saved
77632332Sbostic  * so that a "PUNT" can "rewind" to ignore them.  Stdin is problematic,
77732332Sbostic  * since it's hard to rewind.
77832332Sbostic  */
77932332Sbostic 
78032332Sbostic execstrategy(file)
78132332Sbostic 	char *file;
78232332Sbostic {
78332332Sbostic 	int pctmatch;
78432332Sbostic 
78532332Sbostic 	pctmatch = (100 * rxcount) / incount;
78632332Sbostic 	if (pctmatch > PUNTPERCENT && file != NULL)
78732332Sbostic 		kernighan(args);
78832332Sbostic 	if (file != NULL)
78932332Sbostic 		flushmatches();
79032332Sbostic }
79132332Sbostic 
79232332Sbostic nlcount(bstart, bstop)		/* flail interval to totalize newlines. */
79332332Sbostic 	char *bstart, *bstop;
79432332Sbostic {
79532332Sbostic 	register char *s = bstart;
79632332Sbostic 	register char *t = bstop;
79732332Sbostic 	register int count = 0;
79832332Sbostic 
79932332Sbostic 	do {			/* loop unroll for older architectures */
80032332Sbostic 		if (*t == NL)	/* ... ask ames!jaw for sample code */
80132332Sbostic 			count++;
80232332Sbostic 	} while (t-- > s);
80332332Sbostic 
80432332Sbostic 	return (count);
80532332Sbostic }
80632332Sbostic 
80732332Sbostic char *
80832332Sbostic isolate(regexpr)		/* isolate longest metacharacter-free string */
80932332Sbostic 	char *regexpr;
81032332Sbostic {
81132332Sbostic 	char *dummyexpr;
81232332Sbostic 
81332332Sbostic 	/*
81432332Sbostic 	 * We add (.)* because Henry's regcomp only figures regmust if it
81532332Sbostic 	 * sees a leading * pattern.  Foo!
81632332Sbostic 	 */
81732332Sbostic 	dummyexpr = malloc((unsigned) strlen(regexpr) + 5);
81832332Sbostic 	sprintf(dummyexpr, "(.)*%s", regexpr);
81932332Sbostic 	if ((rspencer = regcomp(dummyexpr)) == NULL)
82032332Sbostic 		kernighan(args);
82132332Sbostic 	return (rspencer->regmust);
82232332Sbostic }
82332332Sbostic 
82432332Sbostic char *matches[FIRSTFEW];
82532332Sbostic static int mcount = 0;
82632332Sbostic 
82732332Sbostic char *
82832332Sbostic savematch(s)			/* horde matches during statistics gathering */
82932332Sbostic 	register char *s;
83032332Sbostic {
83132332Sbostic 	char *p;
83232332Sbostic 	char *start = s;
83332332Sbostic 	int msize = 0;
83432332Sbostic 	int psize = strlen(preamble);
83532332Sbostic 
83632332Sbostic 	while (*s++ != NL)
83732332Sbostic 		msize++;
83832332Sbostic 	*--s = EOS;
83932332Sbostic 
84032332Sbostic 	p = malloc((unsigned) msize + 1 + psize);
84132332Sbostic 	strcpy(p, preamble);
84232332Sbostic 	strcpy(p + psize, start);
84332332Sbostic 	matches[mcount++] = p;
84432332Sbostic 
84532332Sbostic 	preamble[0] = 0;
84632332Sbostic 	*s = NL;
84732332Sbostic 	return (s);
84832332Sbostic }
84932332Sbostic 
85032332Sbostic flushmatches()
85132332Sbostic {
85232332Sbostic 	int n;
85332332Sbostic 
85432332Sbostic 	flushflag = 1;
85532332Sbostic 	for (n = 0; n < mcount; n++)
85632332Sbostic 		printf("%s\n", matches[n]);
85732332Sbostic 	mcount = 0;
85832332Sbostic }
85932332Sbostic 
86032332Sbostic oops(message)
86132332Sbostic 	char *message;
86232332Sbostic {
86332332Sbostic 	fprintf(stderr, "%s: %s\n", progname, message);
86432332Sbostic 	exit(2);
86532332Sbostic }
86632332Sbostic 
86732332Sbostic kernighan(args)			/* "let others do the hard part ..." */
86832332Sbostic 	char *args[];
86932332Sbostic {
87032332Sbostic 	/*
87132332Sbostic 	 * We may have already run grep on some of the files; remove them
87232332Sbostic 	 * from the arg list we pass on.  Note that we can't delete them
87332332Sbostic 	 * totally because the number of file names affects the output
87432332Sbostic 	 * (automatic -h).
87532332Sbostic 	 */
87632332Sbostic 	/* better would be fork/exec per punted file -- jaw */
87732332Sbostic 
87832332Sbostic 	while (firstfile && optind > firstfile)
87932332Sbostic 		args[firstfile++] = "/dev/null";
88032332Sbostic 	if (patind)
88132332Sbostic 		args[patind] = pattern;
88232339Sbostic 	(void) fflush(stdout);
88332332Sbostic 
88432332Sbostic 	if (grepflag)
88532332Sbostic 		execvp(GREPSTD, args), oops("can't exec old 'grep'");
88632332Sbostic 	else if (fgrepflag)
88732332Sbostic 		execvp(FGREPSTD, args), oops("can't exec old 'fgrep'");
88832332Sbostic 	else
88932332Sbostic 		execvp(EGREPSTD, args), oops("can't exec old 'egrep'");
89032332Sbostic }
891