xref: /csrg-svn/usr.bin/grep/egrep/egrep.c (revision 32337)
132332Sbostic #ifndef lint
2*32337Sbostic static char sccsid[] = "@(#)egrep.c	5.2 (Berkeley) 10/05/87";
332332Sbostic #endif not lint
432332Sbostic 
532332Sbostic /*
632332Sbostic      Hybrid Boyer/Moore/Gosper-assisted 'grep/egrep/fgrep' search, with delta0
732332Sbostic      table as in original paper (CACM, October, 1977).  No delta1 or delta2.
832332Sbostic      According to experiment (Horspool, Soft. Prac. Exp., 1982), delta2 is of
932332Sbostic      minimal practical value.  However, to improve for worst case input,
1032332Sbostic      integrating the improved Galil strategies (Apostolico/Giancarlo, SIAM. J.
1132332Sbostic      Comput., Feb. 1986) deserves consideration.
1232332Sbostic 
1332332Sbostic      Method: 	extract longest metacharacter-free string from expression.
1432332Sbostic 		this is done using a side-effect from henry spencer's regcomp().
1532332Sbostic 		use boyer-moore to match such, then pass submatching lines
1632332Sbostic 		to either regexp() or standard 'egrep', depending on certain
1732332Sbostic 		criteria within execstrategy() below.  [this tradeoff is due
1832332Sbostic 		to the general slowness of the regexp() nondeterministic
1932332Sbostic 		machine on complex expressions, as well as the startup time
2032332Sbostic 		of standard 'egrep' on short files.]  alternatively, one may
2132332Sbostic 		change the vendor-supplied 'egrep' automaton to include
2232332Sbostic 		boyer-moore directly.  see accompanying writeup for discussion
2332332Sbostic 		of kanji expression treatment.
2432332Sbostic 
2532332Sbostic 		late addition:  apply trickbag for fast match of simple
2632332Sbostic 		alternations (sublinear, in common low-cardinality cases).
2732332Sbostic 		trap fgrep into this lair.
2832332Sbostic 
2932332Sbostic 		gnu additions:  -f, newline as |, \< and \> [in regexec()], more
3032332Sbostic 				comments.  inspire better dfa exec() strategy.
3132332Sbostic 				serious testing and help with special cases.
3232332Sbostic 
3332332Sbostic      Algorithm amalgam summary:
3432332Sbostic 
3532332Sbostic 		dfa e?grep 		(aho/thompson)
3632332Sbostic 		ndfa regexp() 		(spencer/aho)
3732332Sbostic 		bmg			(boyer/moore/gosper)
3832332Sbostic 		"superimposed" bmg   	(jaw)
3932332Sbostic 		fgrep			(aho/corrasick)
4032332Sbostic 
4132332Sbostic 		sorry, but the knuth/morris/pratt machine, horspool's
4232332Sbostic 		"frequentist" code, and the rabin/karp matcher, however cute,
4332332Sbostic 		just don't cut it for this production.
4432332Sbostic 
4532332Sbostic      James A. Woods				Copyright (c) 1986
4632332Sbostic      NASA Ames Research Center
4732332Sbostic */
4832332Sbostic #include <stdio.h>
4932332Sbostic #include <ctype.h>
5032332Sbostic #include <sys/types.h>
5132332Sbostic #include <sys/stat.h>
52*32337Sbostic #include <sys/file.h>
5332332Sbostic #include <regexp.h>		/* must be henry spencer's version */
5432332Sbostic 
5532332Sbostic #define	MIN(A, B)	((A) > (B) ? (B) : (A))
5632332Sbostic 
5732332Sbostic #ifdef	SLOWSYS
5832332Sbostic #define read	xread
5932332Sbostic #endif
6032332Sbostic /*
6132332Sbostic  * define existing [ef]?grep program locations below for use by execvp().
6232332Sbostic  * [execlp() would be used were it not for the possibility of
6332332Sbostic  * installation-dependent recursion.]
6432332Sbostic  */
6532332Sbostic #ifndef EGREPSTD
6632332Sbostic #define	EGREPSTD	"/usr/bin/egrep"
6732332Sbostic #endif
6832332Sbostic #ifndef GREPSTD
6932332Sbostic #define	GREPSTD		"/bin/grep"
7032332Sbostic #endif
7132332Sbostic #ifndef FGREPSTD
7232332Sbostic #define	FGREPSTD	"/usr/bin/fgrep"
7332332Sbostic #endif
7432332Sbostic 
7532332Sbostic #define BUFSIZE	8192		/* make higher for cray */
7632332Sbostic #define PATSIZE 6000
7732332Sbostic #define LARGE 	BUFSIZE + PATSIZE
7832332Sbostic 
7932332Sbostic #define ALTSIZE	100		/* generous? */
8032332Sbostic #define NALT	7		/* tied to scanf() size in alternate() */
8132332Sbostic #define NMUSH	6		/* loosely relates to expected alt length */
8232332Sbostic 
8332332Sbostic #define	FIRSTFEW	33 	/* Always do FIRSTFEW matches with regexec() */
8432332Sbostic #define	PUNTPERCENT	10	/* After FIRSTFEW, if PUNTPERCENT of the input
8532332Sbostic 				 * was processed by regexp(), exec std egrep. */
8632332Sbostic #define NL	'\n'
8732332Sbostic #define	EOS	'\0'
8832332Sbostic #define	NONASCII	0200	/* Bit mask for Kanji non-ascii chars */
8932332Sbostic #define META	"\n^$.[]()?+*|\\"	/* egrep meta-characters */
9032332Sbostic #define SS2	'\216'		/* EUC Katakana (or Chinese2) prefix */
9132332Sbostic #define SS3	'\217'		/* EUC Kanji2 (or Chinese3) prefix */
9232332Sbostic 
9332332Sbostic extern char *optarg;
9432332Sbostic extern int optind;
9532332Sbostic char *progname;
9632332Sbostic 
9732332Sbostic int cflag, iflag, eflag, fflag, lflag, nflag;	/* SVID flags */
9832332Sbostic int sflag, hflag;		/* v7, v8, bsd */
9932332Sbostic 
10032332Sbostic int firstflag;			/* Stop at first match */
10132332Sbostic int grepflag;			/* Called as "grep" */
10232332Sbostic int fgrepflag;			/* Called as "fgrep" */
10332332Sbostic int altflag;			/* Simple alternation in pattern */
10432332Sbostic int boyonly;			/* No regexp needed -- all simple */
10532332Sbostic int flushflag;
10632332Sbostic int grepold, egrepold, fgrepold;
10732332Sbostic 
10832332Sbostic int nalt;			/* Number of alternatives */
10932332Sbostic int nsuccess;			/* 1 for match, 2 for error */
11032332Sbostic int altmin;			/* Minimum length of all the alternate
11132332Sbostic 				 * strings */
11232332Sbostic int firstfile;			/* argv index of first file argument */
11332332Sbostic int patind;			/* argv index of pattern */
11432332Sbostic long nmatch;			/* Number of matches in this file */
11532332Sbostic long incount, counted;		/* Amount of input consumed */
11632332Sbostic long rxcount;			/* Bytes of input processed by regexec() */
11732332Sbostic int boyfound;			/* accumulated partial matches (tripped by
11832332Sbostic 				 * FIRSTFEW) */
11932332Sbostic int prevmatch;			/* next three lines aid fast -n */
12032332Sbostic long nline, prevnline;
12132332Sbostic char *prevloc;
12232332Sbostic 
12332332Sbostic regexp *rspencer;
12432332Sbostic char *pattern;
12532332Sbostic char *patboy;			/* Pattern for simple Boyer-Moore */
12632332Sbostic char *patfile;			/* Filename containing pattern(s) */
12732332Sbostic 
12832332Sbostic int delta0[256];		/* Boyer-Moore algorithm core */
12932332Sbostic char cmap[256];			/* Usually 0-255, but if -i, maps upper to
13032332Sbostic 				 * lower case */
13132332Sbostic char str[BUFSIZE + 2];
13232332Sbostic int nleftover;
13332332Sbostic char linetemp[BUFSIZE];
13432332Sbostic char altpat[NALT][ALTSIZE];	/* alternation component storage */
13532332Sbostic int altlen[NALT];
13632332Sbostic short altset[NMUSH + 1][256];
13732332Sbostic char preamble[200];		/* match prefix (filename, line no.) */
13832332Sbostic 
13932332Sbostic int fd;
14032332Sbostic char *
14132332Sbostic strchr(), *strrchr(), *strcpy(), *strncpy(), *strpbrk(), *sprintf(), *malloc();
14232332Sbostic char *
14332332Sbostic grepxlat(), *fold(), *pfile(), *alternate(), *isolate();
14432332Sbostic char **args;
14532332Sbostic 
14632332Sbostic main(argc, argv)
14732332Sbostic 	int argc;
14832332Sbostic 	char *argv[];
14932332Sbostic {
15032332Sbostic 	int c;
15132332Sbostic 	int errflag = 0;
15232332Sbostic 
15332332Sbostic 	args = argv;
15432332Sbostic 
15532332Sbostic 	if ((progname = strrchr(argv[0], '/')) != 0)
15632332Sbostic 		progname++;
15732332Sbostic 	else
15832332Sbostic 		progname = argv[0];
15932332Sbostic 	if (strcmp(progname, "grep") == 0)
16032332Sbostic 		grepflag++;
16132332Sbostic 	if (strcmp(progname, "fgrep") == 0)
16232332Sbostic 		fgrepflag++;
16332332Sbostic 
16432332Sbostic 	while ((c = getopt(argc, argv, "bchie:f:lnsvwxy1")) != EOF) {
16532332Sbostic 		switch (c) {
16632332Sbostic 
16732332Sbostic 		case 'f':
16832332Sbostic 			fflag++;
16932332Sbostic 			patfile = optarg;
17032332Sbostic 			continue;
17132332Sbostic 		case 'b':
17232332Sbostic 		case 'v':
17332332Sbostic 			egrepold++;	/* boyer-moore of little help here */
17432332Sbostic 			continue;
17532332Sbostic 		case 'c':
17632332Sbostic 			cflag++;
17732332Sbostic 			continue;
17832332Sbostic 		case 'e':
17932332Sbostic 			eflag++;
18032332Sbostic 			pattern = optarg;
18132332Sbostic 			continue;
18232332Sbostic 		case 'h':
18332332Sbostic 			hflag++;
18432332Sbostic 			continue;
18532332Sbostic 		case '1':	/* Stop at very first match */
18632332Sbostic 			firstflag++;	/* spead freaks only */
18732332Sbostic 			continue;
18832332Sbostic 		case 'i':
18932332Sbostic 			iflag++;
19032332Sbostic 			continue;
19132332Sbostic 		case 'l':
19232332Sbostic 			lflag++;
19332332Sbostic 			continue;
19432332Sbostic 		case 'n':
19532332Sbostic 			nflag++;
19632332Sbostic 			continue;
19732332Sbostic 		case 's':
19832332Sbostic 			sflag++;
19932332Sbostic 			continue;
20032332Sbostic 		case 'w':
20132332Sbostic 		case 'y':
20232332Sbostic 			if (!grepflag)
20332332Sbostic 				errflag++;
20432332Sbostic 			grepold++;
20532332Sbostic 			continue;
20632332Sbostic 		case 'x':	/* needs more work, like -b above */
20732332Sbostic 			if (!fgrepflag)
20832332Sbostic 				errflag++;
20932332Sbostic 			fgrepold++;
21032332Sbostic 			continue;
21132332Sbostic 		case '?':
21232332Sbostic 			errflag++;
21332332Sbostic 		}
21432332Sbostic 	}
21532332Sbostic 	if (errflag || ((argc <= optind) && !fflag && !eflag)) {
21632332Sbostic 		if (grepflag)
21732332Sbostic oops("usage: grep [-bcihlnsvwy] [-e] pattern [file ...]");
21832332Sbostic 		else if (fgrepflag)
21932332Sbostic oops("usage: fgrep [-bcilnv] {-f patfile | [-e] strings} [file ...]");
22032332Sbostic 		else		/* encourage SVID options, though we provide
22132332Sbostic 				 * others */
22232332Sbostic oops("usage: egrep [-bcilnv] {-f patfile | [-e] pattern} [file ...]");
22332332Sbostic 	}
22432332Sbostic 	patind = optind;
22532332Sbostic 	if (fflag)
22632332Sbostic 		pattern = pfile(patfile), patind = 0;
22732332Sbostic 	else if (!eflag)
22832332Sbostic 		pattern = argv[optind++];
22932332Sbostic 
23032332Sbostic 	if ((argc - optind) <= 1)	/* Filename invisible given < 2 files */
23132332Sbostic 		hflag++;
23232332Sbostic 	if (pattern[0] == EOS)
23332332Sbostic 		kernighan(argv);	/* same as it ever was */
23432332Sbostic 	/*
23532332Sbostic 	 * 'grep/egrep' merger -- "old" grep is called to handle: tagged
23632332Sbostic 	 * exprs \( \), word matches \< and \>, -w and -y options, char
23732332Sbostic 	 * classes with '-' at end (egrep bug?), and patterns beginning with
23832332Sbostic 	 * an asterisk (don't ask why). otherwise, characters meaningful to
23932332Sbostic 	 * 'egrep' but not to 'grep' are escaped; the entire expr is then
24032332Sbostic 	 * passed to 'egrep'.
24132332Sbostic 	 */
24232332Sbostic 	if (grepflag && !grepold) {
24332332Sbostic 		if (strindex(pattern, "\\(") >= 0 ||
24432332Sbostic 		    strindex(pattern, "\\<") >= 0 ||
24532332Sbostic 		    strindex(pattern, "\\>") >= 0 ||
24632332Sbostic 		    strindex(pattern, "-]") >= 0 ||
24732332Sbostic 		    pattern[0] == '*')	/* grep bug */
24832332Sbostic 			grepold++;
24932332Sbostic 		else
25032332Sbostic 			pattern = grepxlat(pattern);
25132332Sbostic 	}
25232332Sbostic 	if (grepold || egrepold || fgrepold)
25332332Sbostic 		kernighan(argv);
25432332Sbostic 
25532332Sbostic 	if (iflag)
25632332Sbostic 		strcpy(pattern, fold(pattern));
25732332Sbostic 	/*
25832332Sbostic 	 * If the pattern is a plain string, just run boyer-moore. If it
25932332Sbostic 	 * consists of meta-free alternatives, run "superimposed" bmg.
26032332Sbostic 	 * Otherwise, find best string, and compile pattern for regexec().
26132332Sbostic 	 */
26232332Sbostic 	if (strpbrk(pattern, META) == NULL) {	/* do boyer-moore only */
26332332Sbostic 		boyonly++;
26432332Sbostic 		patboy = pattern;
26532332Sbostic 	} else {
26632332Sbostic 		if ((patboy = alternate(pattern)) != NULL)
26732332Sbostic 			boyonly++;
26832332Sbostic 		else {
26932332Sbostic 			if ((patboy = isolate(pattern)) == NULL)
27032332Sbostic 				kernighan(argv);	/* expr too involved */
27132332Sbostic #ifndef NOKANJI
27232332Sbostic 			for (c = 0; pattern[c] != EOS; c++)
27332332Sbostic 				if (pattern[c] & NONASCII)	/* kanji + meta */
27432332Sbostic 					kernighan(argv);
27532332Sbostic #endif
27632332Sbostic 			if ((rspencer = regcomp(pattern)) == NULL)
27732332Sbostic 				oops("regcomp failure");
27832332Sbostic 		}
27932332Sbostic 	}
28032332Sbostic 	gosper(patboy);		/* "pre-conditioning is wonderful"
28132332Sbostic 				 * -- v. strassen */
28232332Sbostic 
28332332Sbostic 	if ((firstfile = optind) >= argc) {
28432332Sbostic 		/* Grep standard input */
28532332Sbostic 		if (lflag)	/* We don't know its name! */
28632332Sbostic 			exit(1);
28732332Sbostic 		egsecute((char *) NULL);
28832332Sbostic 	} else {
28932332Sbostic 		while (optind < argc) {
29032332Sbostic 			egsecute(argv[optind]);
29132332Sbostic 			optind++;
29232332Sbostic 			if (firstflag && (nsuccess == 1))
29332332Sbostic 				break;
29432332Sbostic 		}
29532332Sbostic 	}
29632332Sbostic 	exit((nsuccess == 2) ? 2 : (nsuccess == 0));
29732332Sbostic }
29832332Sbostic 
29932332Sbostic char *
30032332Sbostic pfile(pfname)			/* absorb expression from file */
30132332Sbostic 	char *pfname;
30232332Sbostic {
303*32337Sbostic 	int fd;
30432332Sbostic 	struct stat patstat;
30532332Sbostic 	static char *pat;
30632332Sbostic 
307*32337Sbostic 	if ((fd = open(pfname, O_RDONLY, 0)) < 0)
30832332Sbostic 		oops("can't read pattern file");
309*32337Sbostic 	if (fstat(fd, &patstat) != 0)
31032332Sbostic 		oops("can't stat pattern file");
31132332Sbostic 	if (patstat.st_size > PATSIZE) {
31232332Sbostic 		if (fgrepflag) {	/* defer to unix version */
31332332Sbostic 			fgrepold++;
31432332Sbostic 			return "dummy";
31532332Sbostic 		} else
31632332Sbostic 			oops("pattern file too big");
31732332Sbostic 	}
31832332Sbostic 	if ((pat = malloc((unsigned) patstat.st_size + 1)) == NULL)
31932332Sbostic 		oops("out of memory to read pattern file");
320*32337Sbostic 	if (patstat.st_size != read(fd, pat, (int)patstat.st_size))
32132332Sbostic 		oops("error reading pattern file");
322*32337Sbostic 	(void) close(fd);
32332332Sbostic 
32432332Sbostic 	pat[patstat.st_size] = EOS;
32532332Sbostic 	if (pat[patstat.st_size - 1] == NL)	/* NOP for egrep; helps grep */
32632332Sbostic 		pat[patstat.st_size - 1] = EOS;
32732332Sbostic 
32832332Sbostic 	if (nlcount(pat, &pat[patstat.st_size]) > NALT) {
32932332Sbostic 		if (fgrepflag)
33032332Sbostic 			fgrepold++;	/* "what's it all about, alfie?" */
33132332Sbostic 		else
33232332Sbostic 			egrepold++;
33332332Sbostic 	}
33432332Sbostic 	return (pat);
33532332Sbostic }
33632332Sbostic 
33732332Sbostic egsecute(file)
33832332Sbostic 	char *file;
33932332Sbostic {
34032332Sbostic 	if (file == NULL)
34132332Sbostic 		fd = 0;
34232332Sbostic 	else if ((fd = open(file, 0)) <= 0) {
34332332Sbostic 		fprintf(stderr, "%s: can't open %s\n", progname, file);
34432332Sbostic 		nsuccess = 2;
34532332Sbostic 		return;
34632332Sbostic 	}
34732332Sbostic 	chimaera(file, patboy);
34832332Sbostic 
34932332Sbostic 	if (!boyonly && !flushflag && file != NULL)
35032332Sbostic 		flushmatches();
35132332Sbostic 	if (file != NULL)
35232332Sbostic 		close(fd);
35332332Sbostic }
35432332Sbostic 
35532332Sbostic chimaera(file, pat)		/* "reach out and boyer-moore search someone" */
35632332Sbostic 	char *file, *pat;	/* -- soon-to-be-popular bumper sticker */
35732332Sbostic {
35832332Sbostic 	register char *k, *strend, *s;
35932332Sbostic 	register int j, count;
36032332Sbostic 	register int *deltazero = delta0;
36132332Sbostic 	int patlen = altmin;
36232332Sbostic 	char *t;
36332332Sbostic 	char *gotamatch(), *kanji(), *linesave(), *submatch();
36432332Sbostic 
36532332Sbostic 	nleftover = boyfound = flushflag = 0;
36632332Sbostic 	nline = 1L;
36732332Sbostic 	prevmatch = 0;
36832332Sbostic 	nmatch = counted = rxcount = 0L;
36932332Sbostic 
37032332Sbostic 	while ((count = read(fd, str + nleftover, BUFSIZE - nleftover)) > 0) {
37132332Sbostic 
37232332Sbostic 		counted += count;
37332332Sbostic 		strend = linesave(str, count);
37432332Sbostic 
37532332Sbostic 		for (k = str + patlen - 1; k < strend;) {
37632332Sbostic 			/*
37732332Sbostic 			 * for a large class of patterns, upwards of 80% of
37832332Sbostic 			 * match time is spent on the next line.  we beat
37932332Sbostic 			 * existing microcode (vax 'matchc') this way.
38032332Sbostic 			 */
38132332Sbostic 			while ((k += deltazero[*(unsigned char *) k]) < strend);
38232332Sbostic 			if (k < (str + LARGE))
38332332Sbostic 				break;
38432332Sbostic 			k -= LARGE;
38532332Sbostic 
38632332Sbostic 			if (altflag) {
38732332Sbostic 				/*
38832332Sbostic 				 * Parallel Boyer-Moore.  Check whether each
38932332Sbostic 				 * of the previous <altmin> chars COULD be
39032332Sbostic 				 * from one of the alternative strings.
39132332Sbostic 				 */
39232332Sbostic 				s = k - 1;
39332332Sbostic 				j = altmin;
39432332Sbostic 				while (altset[--j][(unsigned char)
39532332Sbostic 					     cmap[*(unsigned char *) s--]]);
39632332Sbostic 				/*
39732332Sbostic 				 * quick test fails. in this life, compare
39832332Sbostic 				 * 'em all.  but, a "reverse trie" would
39932332Sbostic 				 * attenuate worst case (linear w/delta2?).
40032332Sbostic 				 */
40132332Sbostic 				if (--j < 0) {
40232332Sbostic 					count = nalt - 1;
40332332Sbostic 					do {
40432332Sbostic 						s = k;
40532332Sbostic 						j = altlen[count];
40632332Sbostic 						t = altpat[count];
40732332Sbostic 
40832332Sbostic 						while
40932332Sbostic 							(cmap[*(unsigned char *) s--]
41032332Sbostic 							 == t[--j]);
41132332Sbostic 						if (j < 0)
41232332Sbostic 							break;
41332332Sbostic 					}
41432332Sbostic 					while (count--);
41532332Sbostic 				}
41632332Sbostic 			} else {
41732332Sbostic 				/* One string -- check it */
41832332Sbostic 				j = patlen - 1;
41932332Sbostic 				s = k - 1;
42032332Sbostic 				while (cmap[*(unsigned char *) s--] == pat[--j]);
42132332Sbostic 			}
42232332Sbostic 			/*
42332332Sbostic 			 * delta-less shortcut for literati. short shrift for
42432332Sbostic 			 * genetic engineers?
42532332Sbostic 			 */
42632332Sbostic 			if (j >= 0) {
42732332Sbostic 				k++;	/* no match; restart next char */
42832332Sbostic 				continue;
42932332Sbostic 			}
43032332Sbostic 			k = submatch(file, pat, str, strend, k, count);
43132332Sbostic 			if (k == NULL)
43232332Sbostic 				return;
43332332Sbostic 		}
43432332Sbostic 		if (nflag) {
43532332Sbostic 			if (prevmatch)
43632332Sbostic 				nline = prevnline + nlcount(prevloc, k);
43732332Sbostic 			else
43832332Sbostic 				nline = nline + nlcount(str, k);
43932332Sbostic 			prevmatch = 0;
44032332Sbostic 		}
44132332Sbostic 		strncpy(str, linetemp, nleftover);
44232332Sbostic 	}
44332332Sbostic 	if (cflag) {
44432332Sbostic 		/* Bug from old grep: -c overrides -h.  We fix the bug. */
44532332Sbostic 		if (!hflag)
44632332Sbostic 			printf("%s:", file);
44732332Sbostic 		printf("%ld\n", nmatch);
44832332Sbostic 	}
44932332Sbostic }
45032332Sbostic 
45132332Sbostic char *
45232332Sbostic linesave(str, count)		/* accumulate partial line at end of buffer */
45332332Sbostic 	char str[];
45432332Sbostic 	register int count;
45532332Sbostic {
45632332Sbostic 	register int j;
45732332Sbostic 
45832332Sbostic 	count += nleftover;
45932332Sbostic 	if (count != BUFSIZE && fd != 0)
46032332Sbostic 		str[count++] = NL;	/* insurance for broken last line */
46132332Sbostic 	str[count] = EOS;
46232332Sbostic 	for (j = count - 1; str[j] != NL && j >= 0;)
46332332Sbostic 		j--;
46432332Sbostic 	/*
46532332Sbostic 	 * break up these lines: long line (> BUFSIZE), last line of file, or
46632332Sbostic 	 * short return from read(), as from tee(1) input
46732332Sbostic 	 */
46832332Sbostic 	if (j < 0 && (count == (BUFSIZE - nleftover))) {
46932332Sbostic 		str[count++] = NL;
47032332Sbostic 		str[count] = EOS;
47132332Sbostic 		linetemp[0] = EOS;
47232332Sbostic 		nleftover = 0;
47332332Sbostic 		return (str + count);
47432332Sbostic 	} else {
47532332Sbostic 		nleftover = count - j - 1;
47632332Sbostic 		strncpy(linetemp, str + j + 1, nleftover);
47732332Sbostic 		return (str + j);
47832332Sbostic 	}
47932332Sbostic }
48032332Sbostic 
48132332Sbostic /*
48232332Sbostic  * Process partial match. First check for mis-aligned Kanji, then match line
48332332Sbostic  * against full compiled r.e. if statistics do not warrant handing off to
48432332Sbostic  * standard egrep.
48532332Sbostic  */
48632332Sbostic char *
48732332Sbostic submatch(file, pat, str, strend, k, altindex)
48832332Sbostic 	char file[], pat[], str[];
48932332Sbostic 	register char *strend, *k;
49032332Sbostic 	int altindex;
49132332Sbostic {
49232332Sbostic 	register char *s;
49332332Sbostic 	char *t, c;
49432332Sbostic 
49532332Sbostic 	t = k;
49632332Sbostic 	s = ((altflag) ? k - altlen[altindex] + 1 : k - altmin + 1);
49732332Sbostic #ifndef NOKANJI
49832332Sbostic 	c = ((altflag) ? altpat[altindex][0] : pat[0]);
49932332Sbostic 	if (c & NONASCII)
50032332Sbostic 		if ((s = kanji(str, s, k)) == NULL)
50132332Sbostic 			return (++k);	/* reject false kanji */
50232332Sbostic #endif
50332332Sbostic 	do;
50432332Sbostic 	while (*s != NL && --s >= str);
50532332Sbostic 	k = s + 1;		/* now at line start */
50632332Sbostic 
50732332Sbostic 	if (boyonly)
50832332Sbostic 		return (gotamatch(file, k));
50932332Sbostic 
51032332Sbostic 	incount = counted - (strend - k);
51132332Sbostic 	if (boyfound++ == FIRSTFEW)
51232332Sbostic 		execstrategy(file);
51332332Sbostic 
51432332Sbostic 	s = t;
51532332Sbostic 	do
51632332Sbostic 		rxcount++;
51732332Sbostic 	while (*s++ != NL);
51832332Sbostic 	*--s = EOS;
51932332Sbostic 	/*
52032332Sbostic 	 * "quick henry -- the flit" (after theodor geisel)
52132332Sbostic 	 */
52232332Sbostic 	if (regexec(rspencer, ((iflag) ? fold(k) : k)) == 1) {
52332332Sbostic 		*s = NL;
52432332Sbostic 		if (gotamatch(file, k) == NULL)
52532332Sbostic 			return (NULL);
52632332Sbostic 	}
52732332Sbostic 	*s = NL;
52832332Sbostic 	return (s + 1);
52932332Sbostic }
53032332Sbostic 
53132332Sbostic /*
53232332Sbostic  * EUC code disambiguation -- scan backwards to first 7-bit code, while
53332332Sbostic  * counting intervening 8-bit codes.  If odd, reject unaligned Kanji pattern.
53432332Sbostic  * SS2/3 checks are for intermixed Japanase Katakana or Kanji2.
53532332Sbostic  */
53632332Sbostic char *
53732332Sbostic kanji(str, s, k)
53832332Sbostic 	register char *str, *s, *k;
53932332Sbostic {
54032332Sbostic 	register int j = 0;
54132332Sbostic 
54232332Sbostic 	for (s--; s >= str; s--) {
54332332Sbostic 		if (*s == SS2 || *s == SS3 || (*s & NONASCII) == 0)
54432332Sbostic 			break;
54532332Sbostic 		j++;
54632332Sbostic 	}
54732332Sbostic #ifndef CHINESE
54832332Sbostic 	if (*s == SS2)
54932332Sbostic 		j -= 1;
55032332Sbostic #endif  CHINESE
55132332Sbostic 	return ((j & 01) ? NULL : k);
55232332Sbostic }
55332332Sbostic 
55432332Sbostic /*
55532332Sbostic  * Compute "Boyer-Moore" delta table -- put skip distance in delta0[c]
55632332Sbostic  */
55732332Sbostic gosper(pattern)
55832332Sbostic 	char *pattern;		/* ... HAKMEM lives ... */
55932332Sbostic {
56032332Sbostic 	register int i, j;
56132332Sbostic 	unsigned char c;
56232332Sbostic 
56332332Sbostic 	/* Make one-string case look like simple alternatives case */
56432332Sbostic 	if (!altflag) {
56532332Sbostic 		nalt = 1;
56632332Sbostic 		altmin = altlen[0] = strlen(pattern);
56732332Sbostic 		strcpy(altpat[0], pattern);
56832332Sbostic 	}
56932332Sbostic 	/* For chars that aren't in any string, skip by string length. */
57032332Sbostic 	for (j = 0; j < 256; j++) {
57132332Sbostic 		delta0[j] = altmin;
57232332Sbostic 		cmap[j] = j;	/* Sneak in initialization of cmap */
57332332Sbostic 	}
57432332Sbostic 
57532332Sbostic 	/* For chars in a string, skip distance from char to end of string. */
57632332Sbostic 	/* (If char appears more than once, skip minimum distance.) */
57732332Sbostic 	for (i = 0; i < nalt; i++)
57832332Sbostic 		for (j = 0; j < altlen[i] - 1; j++) {
57932332Sbostic 			c = altpat[i][j];
58032332Sbostic 			delta0[c] = MIN(delta0[c], altlen[i] - j - 1);
58132332Sbostic 			if (iflag && islower((int) c))
58232332Sbostic 				delta0[toupper((int) c)] = delta0[c];
58332332Sbostic 		}
58432332Sbostic 
58532332Sbostic 	/* For last char of each string, fall out of search loop. */
58632332Sbostic 	for (i = 0; i < nalt; i++) {
58732332Sbostic 		c = altpat[i][altlen[i] - 1];
58832332Sbostic 		delta0[c] = LARGE;
58932332Sbostic 		if (iflag && islower((int) c))
59032332Sbostic 			delta0[toupper((int) c)] = LARGE;
59132332Sbostic 	}
59232332Sbostic 	if (iflag)
59332332Sbostic 		for (j = 'A'; j <= 'Z'; j++)
59432332Sbostic 			cmap[j] = tolower((int) j);
59532332Sbostic }
59632332Sbostic 
59732332Sbostic /*
59832332Sbostic  * Print, count, or stop on full match. Result is either the location for
59932332Sbostic  * continued search, or NULL to stop.
60032332Sbostic  */
60132332Sbostic char *
60232332Sbostic gotamatch(file, s)
60332332Sbostic 	register char *file, *s;
60432332Sbostic {
60532332Sbostic 	char *savematch();
60632332Sbostic 	int squirrel = 0;	/* nonzero to squirrel away FIRSTFEW matches */
60732332Sbostic 
60832332Sbostic 	nmatch++;
60932332Sbostic 	nsuccess = 1;
61032332Sbostic 	if (!boyonly && boyfound <= FIRSTFEW && file != NULL)
61132332Sbostic 		squirrel = 1;
61232332Sbostic 
61332332Sbostic 	if (sflag)
61432332Sbostic 		return (NULL);	/* -s usurps all flags (unlike some versions) */
61532332Sbostic 	if (cflag) {		/* -c overrides -l, we guess */
61632332Sbostic 		do;
61732332Sbostic 		while (*s++ != NL);
61832332Sbostic 	} else if (lflag) {
61932332Sbostic 		puts(file);
62032332Sbostic 		return (NULL);
62132332Sbostic 	} else {
62232332Sbostic 		if (!hflag)
62332332Sbostic 			if (!squirrel)
62432332Sbostic 				printf("%s:", file);
62532332Sbostic 			else
62632332Sbostic 				sprintf(preamble, "%s:", file);
62732332Sbostic 		if (nflag) {
62832332Sbostic 			if (prevmatch)
62932332Sbostic 				prevnline = prevnline + nlcount(prevloc, s);
63032332Sbostic 			else
63132332Sbostic 				prevnline = nline + nlcount(str, s);
63232332Sbostic 			prevmatch = 1;
63332332Sbostic 
63432332Sbostic 			if (!squirrel)
63532332Sbostic 				printf("%ld:", prevnline);
63632332Sbostic 			else
63732332Sbostic 				sprintf(preamble + strlen(preamble),
63832332Sbostic 					"%ld:", prevnline);
63932332Sbostic 		}
64032332Sbostic 		if (!squirrel) {
64132332Sbostic 			do
64232332Sbostic 				putchar(*s);
64332332Sbostic 			while (*s++ != NL);
64432332Sbostic 		} else
64532332Sbostic 			s = savematch(s);
64632332Sbostic 
64732332Sbostic 		if (nflag)
64832332Sbostic 			prevloc = s - 1;
64932332Sbostic 	}
65032332Sbostic 	return ((firstflag && !cflag) ? NULL : s);
65132332Sbostic }
65232332Sbostic 
65332332Sbostic char *
65432332Sbostic fold(line)
65532332Sbostic 	char *line;
65632332Sbostic {
65732332Sbostic 	static char fline[BUFSIZE];
65832332Sbostic 	register char *s, *t = fline;
65932332Sbostic 
66032332Sbostic 	for (s = line; *s != EOS; s++)
66132332Sbostic 		*t++ = (isupper((int) *s) ? (char) tolower((int) *s) : *s);
66232332Sbostic 	*t = EOS;
66332332Sbostic 	return (fline);
66432332Sbostic }
66532332Sbostic 
66632332Sbostic strindex(s, t)			/* the easy way, as in K&P, p. 192 */
66732332Sbostic 	char *s, *t;
66832332Sbostic {
66932332Sbostic 	int i, n;
67032332Sbostic 
67132332Sbostic 	n = strlen(t);
67232332Sbostic 	for (i = 0; s[i] != '\0'; i++)
67332332Sbostic 		if (strncmp(s + i, t, n) == 0)
67432332Sbostic 			return (i);
67532332Sbostic 	return (-1);
67632332Sbostic }
67732332Sbostic 
67832332Sbostic char *
67932332Sbostic grepxlat(pattern)		/* grep pattern meta conversion */
68032332Sbostic 	char *pattern;
68132332Sbostic {
68232332Sbostic 	register char *p, *s;
68332332Sbostic 	static char newpat[BUFSIZE];
68432332Sbostic 
68532332Sbostic 	for (s = newpat, p = pattern; *p != EOS;) {
68632332Sbostic 		if (*p == '\\') {	/* skip escapes ... */
68732332Sbostic 			*s++ = *p++;
68832332Sbostic 			if (*p)
68932332Sbostic 				*s++ = *p++;
69032332Sbostic 		} else if (*p == '[') {	/* ... and char classes */
69132332Sbostic 			while (*p != EOS && *p != ']')
69232332Sbostic 				*s++ = *p++;
69332332Sbostic 		} else if (strchr("+?|()", *p) != NULL) {
69432332Sbostic 			*s++ = '\\';	/* insert protection */
69532332Sbostic 			*s++ = *p++;
69632332Sbostic 		} else
69732332Sbostic 			*s++ = *p++;
69832332Sbostic 	}
69932332Sbostic 	*s = EOS;
70032332Sbostic 	grepflag = ((patind) ? 0 : 1);
70132332Sbostic 	return (newpat);
70232332Sbostic }
70332332Sbostic 
70432332Sbostic /*
70532332Sbostic  * Test for simple alternation.  Result is NULL if it's not so simple, or is
70632332Sbostic  * a pointer to the first string if it is. Warning:  sscanf size is a
70732332Sbostic  * fixpoint, beyond which the speedup linearity starts to break down.  In the
70832332Sbostic  * wake of the elegant aho/corrasick "trie"-based fgrep, generalizing
70932332Sbostic  * altpat[] to arbitrary size is not useful.
71032332Sbostic  */
71132332Sbostic char *
71232332Sbostic alternate(regexpr)
71332332Sbostic 	char *regexpr;
71432332Sbostic {
71532332Sbostic 	register i, j, min;
71632332Sbostic 	unsigned char c;
71732332Sbostic 	char oflow[100];
71832332Sbostic 
71932332Sbostic 	if (fgrepflag && strchr(regexpr, '|'))
72032332Sbostic 		return (NULL);
72132332Sbostic 
72232332Sbostic 	i = strlen(regexpr);
72332332Sbostic 	for (j = 0; j < i; j++)
72432332Sbostic 		if (regexpr[j] == NL)
72532332Sbostic 			regexpr[j] = '|';
72632332Sbostic 
72732332Sbostic 	if (!fgrepflag) {
72832332Sbostic 		if (strchr(regexpr, '|') == NULL || regexpr[0] == '|')
72932332Sbostic 			return (NULL);
73032332Sbostic 		if (strpbrk(regexpr, "^$.[]()?+*\\") != NULL
73132332Sbostic 		    || strindex(regexpr, "||") >= 0)
73232332Sbostic 			return (NULL);
73332332Sbostic 	}
73432332Sbostic 	oflow[0] = EOS;
73532332Sbostic 	nalt = sscanf(regexpr, "%[^|]|%[^|]|%[^|]|%[^|]|%[^|]|%[^|]|%[^|]|%[^|]",
73632332Sbostic 		      altpat[0], altpat[1], altpat[2], altpat[3], altpat[4], altpat[5], altpat[6], oflow);
73732332Sbostic 
73832332Sbostic 	if (nalt < 1 || oflow[0] != EOS)
73932332Sbostic 		return (NULL);
74032332Sbostic 
74132332Sbostic 	altmin = NMUSH;
74232332Sbostic 	for (j = 0; j < nalt; j++) {
74332332Sbostic 		min = altlen[j] = strlen(altpat[j]);
74432332Sbostic 		if (min > ALTSIZE)
74532332Sbostic 			return (NULL);
74632332Sbostic 		altmin = MIN(altmin, min);
74732332Sbostic 	}
74832332Sbostic 	if (nalt > 1) {		/* build superimposed "pre-match" sets per
74932332Sbostic 				 * char */
75032332Sbostic 		altflag++;
75132332Sbostic 		for (j = 0; j < nalt; j++)
75232332Sbostic 			for (i = 0; i < altmin; i++) {
75332332Sbostic 				c = altpat[j][altlen[j] - altmin + i];
75432332Sbostic 				altset[i + 1][c] = 1;	/* offset for sentinel */
75532332Sbostic 			}
75632332Sbostic 	}
75732332Sbostic 	return (altpat[0]);
75832332Sbostic }
75932332Sbostic 
76032332Sbostic /*
76132332Sbostic  * Grapple with the dfa (std egrep) vs. ndfa (regexp) tradeoff. Criteria to
76232332Sbostic  * determine whether to use dfa-based egrep:  We do FIRSTFEW matches with
76332332Sbostic  * regexec().  If Boyer-Moore up to now matched more than PUNTPERCENT
76432332Sbostic  * of the input, the r.e. is likely to be underspecified, so do old *grep,
76532332Sbostic  * which is faster on complex patterns than regexp().  At FIRSTFEW,
76632332Sbostic  * dump the saved matches collected by savematch(). They are saved
76732332Sbostic  * so that a "PUNT" can "rewind" to ignore them.  Stdin is problematic,
76832332Sbostic  * since it's hard to rewind.
76932332Sbostic  */
77032332Sbostic 
77132332Sbostic execstrategy(file)
77232332Sbostic 	char *file;
77332332Sbostic {
77432332Sbostic 	int pctmatch;
77532332Sbostic 
77632332Sbostic 	pctmatch = (100 * rxcount) / incount;
77732332Sbostic 	if (pctmatch > PUNTPERCENT && file != NULL)
77832332Sbostic 		kernighan(args);
77932332Sbostic 	if (file != NULL)
78032332Sbostic 		flushmatches();
78132332Sbostic }
78232332Sbostic 
78332332Sbostic nlcount(bstart, bstop)		/* flail interval to totalize newlines. */
78432332Sbostic 	char *bstart, *bstop;
78532332Sbostic {
78632332Sbostic 	register char *s = bstart;
78732332Sbostic 	register char *t = bstop;
78832332Sbostic 	register int count = 0;
78932332Sbostic 
79032332Sbostic 	do {			/* loop unroll for older architectures */
79132332Sbostic 		if (*t == NL)	/* ... ask ames!jaw for sample code */
79232332Sbostic 			count++;
79332332Sbostic 	} while (t-- > s);
79432332Sbostic 
79532332Sbostic 	return (count);
79632332Sbostic }
79732332Sbostic 
79832332Sbostic char *
79932332Sbostic isolate(regexpr)		/* isolate longest metacharacter-free string */
80032332Sbostic 	char *regexpr;
80132332Sbostic {
80232332Sbostic 	char *dummyexpr;
80332332Sbostic 
80432332Sbostic 	/*
80532332Sbostic 	 * We add (.)* because Henry's regcomp only figures regmust if it
80632332Sbostic 	 * sees a leading * pattern.  Foo!
80732332Sbostic 	 */
80832332Sbostic 	dummyexpr = malloc((unsigned) strlen(regexpr) + 5);
80932332Sbostic 	sprintf(dummyexpr, "(.)*%s", regexpr);
81032332Sbostic 	if ((rspencer = regcomp(dummyexpr)) == NULL)
81132332Sbostic 		kernighan(args);
81232332Sbostic 	return (rspencer->regmust);
81332332Sbostic }
81432332Sbostic 
81532332Sbostic char *matches[FIRSTFEW];
81632332Sbostic static int mcount = 0;
81732332Sbostic 
81832332Sbostic char *
81932332Sbostic savematch(s)			/* horde matches during statistics gathering */
82032332Sbostic 	register char *s;
82132332Sbostic {
82232332Sbostic 	char *p;
82332332Sbostic 	char *start = s;
82432332Sbostic 	int msize = 0;
82532332Sbostic 	int psize = strlen(preamble);
82632332Sbostic 
82732332Sbostic 	while (*s++ != NL)
82832332Sbostic 		msize++;
82932332Sbostic 	*--s = EOS;
83032332Sbostic 
83132332Sbostic 	p = malloc((unsigned) msize + 1 + psize);
83232332Sbostic 	strcpy(p, preamble);
83332332Sbostic 	strcpy(p + psize, start);
83432332Sbostic 	matches[mcount++] = p;
83532332Sbostic 
83632332Sbostic 	preamble[0] = 0;
83732332Sbostic 	*s = NL;
83832332Sbostic 	return (s);
83932332Sbostic }
84032332Sbostic 
84132332Sbostic flushmatches()
84232332Sbostic {
84332332Sbostic 	int n;
84432332Sbostic 
84532332Sbostic 	flushflag = 1;
84632332Sbostic 	for (n = 0; n < mcount; n++)
84732332Sbostic 		printf("%s\n", matches[n]);
84832332Sbostic 	mcount = 0;
84932332Sbostic }
85032332Sbostic 
85132332Sbostic oops(message)
85232332Sbostic 	char *message;
85332332Sbostic {
85432332Sbostic 	fprintf(stderr, "%s: %s\n", progname, message);
85532332Sbostic 	exit(2);
85632332Sbostic }
85732332Sbostic 
85832332Sbostic kernighan(args)			/* "let others do the hard part ..." */
85932332Sbostic 	char *args[];
86032332Sbostic {
86132332Sbostic 	/*
86232332Sbostic 	 * We may have already run grep on some of the files; remove them
86332332Sbostic 	 * from the arg list we pass on.  Note that we can't delete them
86432332Sbostic 	 * totally because the number of file names affects the output
86532332Sbostic 	 * (automatic -h).
86632332Sbostic 	 */
86732332Sbostic 	/* better would be fork/exec per punted file -- jaw */
86832332Sbostic 
86932332Sbostic 	while (firstfile && optind > firstfile)
87032332Sbostic 		args[firstfile++] = "/dev/null";
87132332Sbostic 	if (patind)
87232332Sbostic 		args[patind] = pattern;
87332332Sbostic 	fflush(stdout);
87432332Sbostic 
87532332Sbostic 	if (grepflag)
87632332Sbostic 		execvp(GREPSTD, args), oops("can't exec old 'grep'");
87732332Sbostic 	else if (fgrepflag)
87832332Sbostic 		execvp(FGREPSTD, args), oops("can't exec old 'fgrep'");
87932332Sbostic 	else
88032332Sbostic 		execvp(EGREPSTD, args), oops("can't exec old 'egrep'");
88132332Sbostic }
882