xref: /csrg-svn/usr.bin/grep/egrep/egrep.c (revision 32366)
132332Sbostic #ifndef lint
2*32366Sbostic static char sccsid[] = "@(#)egrep.c	5.8 (Berkeley) 10/08/87";
332332Sbostic #endif not lint
432332Sbostic 
532332Sbostic /*
632332Sbostic      Hybrid Boyer/Moore/Gosper-assisted 'grep/egrep/fgrep' search, with delta0
732332Sbostic      table as in original paper (CACM, October, 1977).  No delta1 or delta2.
832332Sbostic      According to experiment (Horspool, Soft. Prac. Exp., 1982), delta2 is of
932332Sbostic      minimal practical value.  However, to improve for worst case input,
1032332Sbostic      integrating the improved Galil strategies (Apostolico/Giancarlo, SIAM. J.
1132332Sbostic      Comput., Feb. 1986) deserves consideration.
1232332Sbostic 
1332332Sbostic      Method: 	extract longest metacharacter-free string from expression.
1432332Sbostic 		this is done using a side-effect from henry spencer's regcomp().
1532332Sbostic 		use boyer-moore to match such, then pass submatching lines
1632332Sbostic 		to either regexp() or standard 'egrep', depending on certain
1732332Sbostic 		criteria within execstrategy() below.  [this tradeoff is due
1832332Sbostic 		to the general slowness of the regexp() nondeterministic
1932332Sbostic 		machine on complex expressions, as well as the startup time
2032332Sbostic 		of standard 'egrep' on short files.]  alternatively, one may
2132332Sbostic 		change the vendor-supplied 'egrep' automaton to include
2232332Sbostic 		boyer-moore directly.  see accompanying writeup for discussion
2332332Sbostic 		of kanji expression treatment.
2432332Sbostic 
2532332Sbostic 		late addition:  apply trickbag for fast match of simple
2632332Sbostic 		alternations (sublinear, in common low-cardinality cases).
2732332Sbostic 		trap fgrep into this lair.
2832332Sbostic 
2932332Sbostic 		gnu additions:  -f, newline as |, \< and \> [in regexec()], more
3032332Sbostic 				comments.  inspire better dfa exec() strategy.
3132332Sbostic 				serious testing and help with special cases.
3232332Sbostic 
3332332Sbostic      Algorithm amalgam summary:
3432332Sbostic 
3532332Sbostic 		dfa e?grep 		(aho/thompson)
3632332Sbostic 		ndfa regexp() 		(spencer/aho)
3732332Sbostic 		bmg			(boyer/moore/gosper)
3832332Sbostic 		"superimposed" bmg   	(jaw)
3932332Sbostic 		fgrep			(aho/corrasick)
4032332Sbostic 
4132332Sbostic 		sorry, but the knuth/morris/pratt machine, horspool's
4232332Sbostic 		"frequentist" code, and the rabin/karp matcher, however cute,
4332332Sbostic 		just don't cut it for this production.
4432332Sbostic 
4532332Sbostic      James A. Woods				Copyright (c) 1986
4632332Sbostic      NASA Ames Research Center
4732332Sbostic */
4832332Sbostic #include <stdio.h>
4932332Sbostic #include <ctype.h>
5032332Sbostic #include <sys/types.h>
5132332Sbostic #include <sys/stat.h>
5232337Sbostic #include <sys/file.h>
5332332Sbostic #include <regexp.h>		/* must be henry spencer's version */
5432332Sbostic 
5532332Sbostic #define	MIN(A, B)	((A) > (B) ? (B) : (A))
5632332Sbostic 
5732332Sbostic #ifdef	SLOWSYS
5832332Sbostic #define read	xread
5932332Sbostic #endif
6032332Sbostic /*
6132332Sbostic  * define existing [ef]?grep program locations below for use by execvp().
6232332Sbostic  * [execlp() would be used were it not for the possibility of
6332332Sbostic  * installation-dependent recursion.]
6432332Sbostic  */
6532332Sbostic #ifndef EGREPSTD
6632332Sbostic #define	EGREPSTD	"/usr/bin/egrep"
6732332Sbostic #endif
6832332Sbostic #ifndef GREPSTD
6932332Sbostic #define	GREPSTD		"/bin/grep"
7032332Sbostic #endif
7132332Sbostic #ifndef FGREPSTD
7232332Sbostic #define	FGREPSTD	"/usr/bin/fgrep"
7332332Sbostic #endif
7432332Sbostic 
7532332Sbostic #define BUFSIZE	8192		/* make higher for cray */
7632332Sbostic #define PATSIZE 6000
7732332Sbostic #define LARGE 	BUFSIZE + PATSIZE
7832332Sbostic 
7932332Sbostic #define NALT	7		/* tied to scanf() size in alternate() */
8032332Sbostic #define NMUSH	6		/* loosely relates to expected alt length */
8132332Sbostic 
8232332Sbostic #define	FIRSTFEW	33 	/* Always do FIRSTFEW matches with regexec() */
8332332Sbostic #define	PUNTPERCENT	10	/* After FIRSTFEW, if PUNTPERCENT of the input
8432332Sbostic 				 * was processed by regexp(), exec std egrep. */
8532332Sbostic #define NL	'\n'
8632332Sbostic #define	EOS	'\0'
8732332Sbostic #define	NONASCII	0200	/* Bit mask for Kanji non-ascii chars */
8832332Sbostic #define META	"\n^$.[]()?+*|\\"	/* egrep meta-characters */
8932332Sbostic #define SS2	'\216'		/* EUC Katakana (or Chinese2) prefix */
9032332Sbostic #define SS3	'\217'		/* EUC Kanji2 (or Chinese3) prefix */
9132332Sbostic 
9232332Sbostic extern char *optarg;
9332332Sbostic extern int optind;
9432332Sbostic char *progname;
9532332Sbostic 
9632332Sbostic int cflag, iflag, eflag, fflag, lflag, nflag;	/* SVID flags */
9732332Sbostic int sflag, hflag;		/* v7, v8, bsd */
9832332Sbostic 
9932332Sbostic int firstflag;			/* Stop at first match */
10032332Sbostic int grepflag;			/* Called as "grep" */
10132332Sbostic int fgrepflag;			/* Called as "fgrep" */
10232332Sbostic int altflag;			/* Simple alternation in pattern */
10332332Sbostic int boyonly;			/* No regexp needed -- all simple */
10432332Sbostic int flushflag;
10532332Sbostic int grepold, egrepold, fgrepold;
10632332Sbostic 
10732332Sbostic int nalt;			/* Number of alternatives */
10832332Sbostic int nsuccess;			/* 1 for match, 2 for error */
10932332Sbostic int altmin;			/* Minimum length of all the alternate
11032332Sbostic 				 * strings */
11132332Sbostic int firstfile;			/* argv index of first file argument */
11232332Sbostic int patind;			/* argv index of pattern */
11332332Sbostic long nmatch;			/* Number of matches in this file */
11432332Sbostic long incount, counted;		/* Amount of input consumed */
11532332Sbostic long rxcount;			/* Bytes of input processed by regexec() */
11632332Sbostic int boyfound;			/* accumulated partial matches (tripped by
11732332Sbostic 				 * FIRSTFEW) */
11832332Sbostic int prevmatch;			/* next three lines aid fast -n */
11932332Sbostic long nline, prevnline;
12032332Sbostic char *prevloc;
12132332Sbostic 
12232332Sbostic regexp *rspencer;
12332332Sbostic char *pattern;
12432332Sbostic char *patboy;			/* Pattern for simple Boyer-Moore */
12532332Sbostic char *patfile;			/* Filename containing pattern(s) */
12632332Sbostic 
12732332Sbostic int delta0[256];		/* Boyer-Moore algorithm core */
12832332Sbostic char cmap[256];			/* Usually 0-255, but if -i, maps upper to
12932332Sbostic 				 * lower case */
13032332Sbostic char str[BUFSIZE + 2];
13132332Sbostic int nleftover;
13232332Sbostic char linetemp[BUFSIZE];
13332339Sbostic char *altpat[NALT];		/* alternation component storage */
13432332Sbostic int altlen[NALT];
13532332Sbostic short altset[NMUSH + 1][256];
13632332Sbostic char preamble[200];		/* match prefix (filename, line no.) */
13732332Sbostic 
13832332Sbostic int fd;
13932332Sbostic char *
14032332Sbostic strchr(), *strrchr(), *strcpy(), *strncpy(), *strpbrk(), *sprintf(), *malloc();
14132332Sbostic char *
14232332Sbostic grepxlat(), *fold(), *pfile(), *alternate(), *isolate();
14332332Sbostic char **args;
14432332Sbostic 
14532332Sbostic main(argc, argv)
14632332Sbostic 	int argc;
14732332Sbostic 	char *argv[];
14832332Sbostic {
14932365Sbostic 	int c, oflag;
15032332Sbostic 	int errflag = 0;
15132332Sbostic 
15232332Sbostic 	args = argv;
15332332Sbostic 
15432332Sbostic 	if ((progname = strrchr(argv[0], '/')) != 0)
15532332Sbostic 		progname++;
15632332Sbostic 	else
15732332Sbostic 		progname = argv[0];
15832332Sbostic 	if (strcmp(progname, "grep") == 0)
15932332Sbostic 		grepflag++;
16032365Sbostic 	else if (strcmp(progname, "fgrep") == 0)
16132332Sbostic 		fgrepflag++;
16232332Sbostic 
16332365Sbostic 	oflag = 0;
16432365Sbostic 	while ((c = getopt(argc, argv, "bchie:f:lnosvwxy1")) != EOF) {
16532332Sbostic 		switch (c) {
16632332Sbostic 
16732332Sbostic 		case 'f':
16832332Sbostic 			fflag++;
16932332Sbostic 			patfile = optarg;
17032332Sbostic 			continue;
17132332Sbostic 		case 'b':
17232332Sbostic 		case 'v':
17332332Sbostic 			egrepold++;	/* boyer-moore of little help here */
17432332Sbostic 			continue;
17532332Sbostic 		case 'c':
17632332Sbostic 			cflag++;
17732332Sbostic 			continue;
17832332Sbostic 		case 'e':
17932332Sbostic 			eflag++;
18032332Sbostic 			pattern = optarg;
18132332Sbostic 			continue;
18232332Sbostic 		case 'h':
18332332Sbostic 			hflag++;
18432332Sbostic 			continue;
18532365Sbostic 		case 'o':
18632365Sbostic 			oflag++;
18732365Sbostic 			continue;
18832332Sbostic 		case '1':	/* Stop at very first match */
18932332Sbostic 			firstflag++;	/* spead freaks only */
19032332Sbostic 			continue;
19132332Sbostic 		case 'i':
19232332Sbostic 			iflag++;
19332332Sbostic 			continue;
19432332Sbostic 		case 'l':
19532332Sbostic 			lflag++;
19632332Sbostic 			continue;
19732332Sbostic 		case 'n':
19832332Sbostic 			nflag++;
19932332Sbostic 			continue;
20032332Sbostic 		case 's':
20132332Sbostic 			sflag++;
20232332Sbostic 			continue;
20332332Sbostic 		case 'w':
20432332Sbostic 		case 'y':
20532332Sbostic 			if (!grepflag)
20632332Sbostic 				errflag++;
20732332Sbostic 			grepold++;
20832332Sbostic 			continue;
20932332Sbostic 		case 'x':	/* needs more work, like -b above */
21032332Sbostic 			if (!fgrepflag)
21132332Sbostic 				errflag++;
21232332Sbostic 			fgrepold++;
21332332Sbostic 			continue;
21432332Sbostic 		case '?':
21532332Sbostic 			errflag++;
21632332Sbostic 		}
21732332Sbostic 	}
21832332Sbostic 	if (errflag || ((argc <= optind) && !fflag && !eflag)) {
21932332Sbostic 		if (grepflag)
22032365Sbostic oops("usage: grep [-bchilnosvwy] [-e] pattern [file ...]");
22132332Sbostic 		else if (fgrepflag)
222*32366Sbostic oops("usage: fgrep [-bchilnosvx] {-f patfile | [-e] strings} [file ...]");
22332332Sbostic 		else		/* encourage SVID options, though we provide
22432332Sbostic 				 * others */
225*32366Sbostic oops("usage: egrep [-bchilnosv] {-f patfile | [-e] pattern} [file ...]");
22632332Sbostic 	}
22732332Sbostic 	patind = optind;
22832332Sbostic 	if (fflag)
22932332Sbostic 		pattern = pfile(patfile), patind = 0;
23032332Sbostic 	else if (!eflag)
23132332Sbostic 		pattern = argv[optind++];
23232332Sbostic 
23332365Sbostic 	if (!oflag && (argc - optind) <= 1)	/* Filename invisible given < 2 files */
23432332Sbostic 		hflag++;
23532332Sbostic 	if (pattern[0] == EOS)
23632332Sbostic 		kernighan(argv);	/* same as it ever was */
23732332Sbostic 	/*
23832332Sbostic 	 * 'grep/egrep' merger -- "old" grep is called to handle: tagged
23932332Sbostic 	 * exprs \( \), word matches \< and \>, -w and -y options, char
24032332Sbostic 	 * classes with '-' at end (egrep bug?), and patterns beginning with
24132332Sbostic 	 * an asterisk (don't ask why). otherwise, characters meaningful to
24232332Sbostic 	 * 'egrep' but not to 'grep' are escaped; the entire expr is then
24332332Sbostic 	 * passed to 'egrep'.
24432332Sbostic 	 */
24532332Sbostic 	if (grepflag && !grepold) {
24632332Sbostic 		if (strindex(pattern, "\\(") >= 0 ||
24732332Sbostic 		    strindex(pattern, "\\<") >= 0 ||
24832332Sbostic 		    strindex(pattern, "\\>") >= 0 ||
24932332Sbostic 		    strindex(pattern, "-]") >= 0 ||
25032332Sbostic 		    pattern[0] == '*')	/* grep bug */
25132332Sbostic 			grepold++;
25232332Sbostic 		else
25332332Sbostic 			pattern = grepxlat(pattern);
25432332Sbostic 	}
25532332Sbostic 	if (grepold || egrepold || fgrepold)
25632332Sbostic 		kernighan(argv);
25732332Sbostic 
25832332Sbostic 	if (iflag)
25932332Sbostic 		strcpy(pattern, fold(pattern));
26032332Sbostic 	/*
26132332Sbostic 	 * If the pattern is a plain string, just run boyer-moore. If it
26232332Sbostic 	 * consists of meta-free alternatives, run "superimposed" bmg.
26332332Sbostic 	 * Otherwise, find best string, and compile pattern for regexec().
26432332Sbostic 	 */
26532332Sbostic 	if (strpbrk(pattern, META) == NULL) {	/* do boyer-moore only */
26632332Sbostic 		boyonly++;
26732332Sbostic 		patboy = pattern;
26832332Sbostic 	} else {
26932332Sbostic 		if ((patboy = alternate(pattern)) != NULL)
27032332Sbostic 			boyonly++;
27132332Sbostic 		else {
27232332Sbostic 			if ((patboy = isolate(pattern)) == NULL)
27332332Sbostic 				kernighan(argv);	/* expr too involved */
27432332Sbostic #ifndef NOKANJI
27532332Sbostic 			for (c = 0; pattern[c] != EOS; c++)
27632332Sbostic 				if (pattern[c] & NONASCII)	/* kanji + meta */
27732332Sbostic 					kernighan(argv);
27832332Sbostic #endif
27932332Sbostic 			if ((rspencer = regcomp(pattern)) == NULL)
28032332Sbostic 				oops("regcomp failure");
28132332Sbostic 		}
28232332Sbostic 	}
28332332Sbostic 	gosper(patboy);		/* "pre-conditioning is wonderful"
28432332Sbostic 				 * -- v. strassen */
28532332Sbostic 
28632332Sbostic 	if ((firstfile = optind) >= argc) {
28732332Sbostic 		/* Grep standard input */
28832332Sbostic 		if (lflag)	/* We don't know its name! */
28932332Sbostic 			exit(1);
29032332Sbostic 		egsecute((char *) NULL);
29132332Sbostic 	} else {
29232332Sbostic 		while (optind < argc) {
29332332Sbostic 			egsecute(argv[optind]);
29432332Sbostic 			optind++;
29532332Sbostic 			if (firstflag && (nsuccess == 1))
29632332Sbostic 				break;
29732332Sbostic 		}
29832332Sbostic 	}
29932332Sbostic 	exit((nsuccess == 2) ? 2 : (nsuccess == 0));
30032332Sbostic }
30132332Sbostic 
30232332Sbostic char *
30332332Sbostic pfile(pfname)			/* absorb expression from file */
30432332Sbostic 	char *pfname;
30532332Sbostic {
30632337Sbostic 	int fd;
30732332Sbostic 	struct stat patstat;
30832332Sbostic 	static char *pat;
30932332Sbostic 
31032337Sbostic 	if ((fd = open(pfname, O_RDONLY, 0)) < 0)
31132332Sbostic 		oops("can't read pattern file");
31232337Sbostic 	if (fstat(fd, &patstat) != 0)
31332332Sbostic 		oops("can't stat pattern file");
31432332Sbostic 	if (patstat.st_size > PATSIZE) {
31532332Sbostic 		if (fgrepflag) {	/* defer to unix version */
31632332Sbostic 			fgrepold++;
31732332Sbostic 			return "dummy";
31832332Sbostic 		} else
31932332Sbostic 			oops("pattern file too big");
32032332Sbostic 	}
32132332Sbostic 	if ((pat = malloc((unsigned) patstat.st_size + 1)) == NULL)
32232332Sbostic 		oops("out of memory to read pattern file");
32332337Sbostic 	if (patstat.st_size != read(fd, pat, (int)patstat.st_size))
32432332Sbostic 		oops("error reading pattern file");
32532337Sbostic 	(void) close(fd);
32632332Sbostic 
32732332Sbostic 	pat[patstat.st_size] = EOS;
32832332Sbostic 	if (pat[patstat.st_size - 1] == NL)	/* NOP for egrep; helps grep */
32932332Sbostic 		pat[patstat.st_size - 1] = EOS;
33032332Sbostic 
33132332Sbostic 	if (nlcount(pat, &pat[patstat.st_size]) > NALT) {
33232332Sbostic 		if (fgrepflag)
33332332Sbostic 			fgrepold++;	/* "what's it all about, alfie?" */
33432332Sbostic 		else
33532332Sbostic 			egrepold++;
33632332Sbostic 	}
33732332Sbostic 	return (pat);
33832332Sbostic }
33932332Sbostic 
34032332Sbostic egsecute(file)
34132332Sbostic 	char *file;
34232332Sbostic {
34332332Sbostic 	if (file == NULL)
34432332Sbostic 		fd = 0;
34532339Sbostic 	else if ((fd = open(file, O_RDONLY, 0)) <= 0) {
34632332Sbostic 		fprintf(stderr, "%s: can't open %s\n", progname, file);
34732332Sbostic 		nsuccess = 2;
34832332Sbostic 		return;
34932332Sbostic 	}
35032332Sbostic 	chimaera(file, patboy);
35132332Sbostic 
35232332Sbostic 	if (!boyonly && !flushflag && file != NULL)
35332332Sbostic 		flushmatches();
35432332Sbostic 	if (file != NULL)
35532332Sbostic 		close(fd);
35632332Sbostic }
35732332Sbostic 
35832332Sbostic chimaera(file, pat)		/* "reach out and boyer-moore search someone" */
35932332Sbostic 	char *file, *pat;	/* -- soon-to-be-popular bumper sticker */
36032332Sbostic {
36132332Sbostic 	register char *k, *strend, *s;
36232332Sbostic 	register int j, count;
36332332Sbostic 	register int *deltazero = delta0;
36432332Sbostic 	int patlen = altmin;
36532332Sbostic 	char *t;
36632332Sbostic 	char *gotamatch(), *kanji(), *linesave(), *submatch();
36732332Sbostic 
36832332Sbostic 	nleftover = boyfound = flushflag = 0;
36932332Sbostic 	nline = 1L;
37032332Sbostic 	prevmatch = 0;
37132332Sbostic 	nmatch = counted = rxcount = 0L;
37232332Sbostic 
37332332Sbostic 	while ((count = read(fd, str + nleftover, BUFSIZE - nleftover)) > 0) {
37432332Sbostic 
37532332Sbostic 		counted += count;
37632332Sbostic 		strend = linesave(str, count);
37732332Sbostic 
37832332Sbostic 		for (k = str + patlen - 1; k < strend;) {
37932332Sbostic 			/*
38032332Sbostic 			 * for a large class of patterns, upwards of 80% of
38132332Sbostic 			 * match time is spent on the next line.  we beat
38232332Sbostic 			 * existing microcode (vax 'matchc') this way.
38332332Sbostic 			 */
38432332Sbostic 			while ((k += deltazero[*(unsigned char *) k]) < strend);
38532332Sbostic 			if (k < (str + LARGE))
38632332Sbostic 				break;
38732332Sbostic 			k -= LARGE;
38832332Sbostic 
38932332Sbostic 			if (altflag) {
39032332Sbostic 				/*
39132332Sbostic 				 * Parallel Boyer-Moore.  Check whether each
39232332Sbostic 				 * of the previous <altmin> chars COULD be
39332332Sbostic 				 * from one of the alternative strings.
39432332Sbostic 				 */
39532332Sbostic 				s = k - 1;
39632332Sbostic 				j = altmin;
39732332Sbostic 				while (altset[--j][(unsigned char)
39832332Sbostic 					     cmap[*(unsigned char *) s--]]);
39932332Sbostic 				/*
40032332Sbostic 				 * quick test fails. in this life, compare
40132332Sbostic 				 * 'em all.  but, a "reverse trie" would
40232332Sbostic 				 * attenuate worst case (linear w/delta2?).
40332332Sbostic 				 */
40432332Sbostic 				if (--j < 0) {
40532332Sbostic 					count = nalt - 1;
40632332Sbostic 					do {
40732332Sbostic 						s = k;
40832332Sbostic 						j = altlen[count];
40932332Sbostic 						t = altpat[count];
41032332Sbostic 
41132332Sbostic 						while
41232332Sbostic 							(cmap[*(unsigned char *) s--]
41332332Sbostic 							 == t[--j]);
41432332Sbostic 						if (j < 0)
41532332Sbostic 							break;
41632332Sbostic 					}
41732332Sbostic 					while (count--);
41832332Sbostic 				}
41932332Sbostic 			} else {
42032332Sbostic 				/* One string -- check it */
42132332Sbostic 				j = patlen - 1;
42232332Sbostic 				s = k - 1;
42332332Sbostic 				while (cmap[*(unsigned char *) s--] == pat[--j]);
42432332Sbostic 			}
42532332Sbostic 			/*
42632332Sbostic 			 * delta-less shortcut for literati. short shrift for
42732332Sbostic 			 * genetic engineers?
42832332Sbostic 			 */
42932332Sbostic 			if (j >= 0) {
43032332Sbostic 				k++;	/* no match; restart next char */
43132332Sbostic 				continue;
43232332Sbostic 			}
43332332Sbostic 			k = submatch(file, pat, str, strend, k, count);
43432332Sbostic 			if (k == NULL)
43532332Sbostic 				return;
43632332Sbostic 		}
43732332Sbostic 		if (nflag) {
43832332Sbostic 			if (prevmatch)
43932332Sbostic 				nline = prevnline + nlcount(prevloc, k);
44032332Sbostic 			else
44132332Sbostic 				nline = nline + nlcount(str, k);
44232332Sbostic 			prevmatch = 0;
44332332Sbostic 		}
44432332Sbostic 		strncpy(str, linetemp, nleftover);
44532332Sbostic 	}
44632332Sbostic 	if (cflag) {
44732332Sbostic 		/* Bug from old grep: -c overrides -h.  We fix the bug. */
44832332Sbostic 		if (!hflag)
44932332Sbostic 			printf("%s:", file);
45032332Sbostic 		printf("%ld\n", nmatch);
45132332Sbostic 	}
45232332Sbostic }
45332332Sbostic 
45432332Sbostic char *
45532332Sbostic linesave(str, count)		/* accumulate partial line at end of buffer */
45632332Sbostic 	char str[];
45732332Sbostic 	register int count;
45832332Sbostic {
45932332Sbostic 	register int j;
46032332Sbostic 
46132332Sbostic 	count += nleftover;
46232332Sbostic 	if (count != BUFSIZE && fd != 0)
46332332Sbostic 		str[count++] = NL;	/* insurance for broken last line */
46432332Sbostic 	str[count] = EOS;
46532332Sbostic 	for (j = count - 1; str[j] != NL && j >= 0;)
46632332Sbostic 		j--;
46732332Sbostic 	/*
46832332Sbostic 	 * break up these lines: long line (> BUFSIZE), last line of file, or
46932332Sbostic 	 * short return from read(), as from tee(1) input
47032332Sbostic 	 */
47132332Sbostic 	if (j < 0 && (count == (BUFSIZE - nleftover))) {
47232332Sbostic 		str[count++] = NL;
47332332Sbostic 		str[count] = EOS;
47432332Sbostic 		linetemp[0] = EOS;
47532332Sbostic 		nleftover = 0;
47632332Sbostic 		return (str + count);
47732332Sbostic 	} else {
47832332Sbostic 		nleftover = count - j - 1;
47932332Sbostic 		strncpy(linetemp, str + j + 1, nleftover);
48032332Sbostic 		return (str + j);
48132332Sbostic 	}
48232332Sbostic }
48332332Sbostic 
48432332Sbostic /*
48532332Sbostic  * Process partial match. First check for mis-aligned Kanji, then match line
48632332Sbostic  * against full compiled r.e. if statistics do not warrant handing off to
48732332Sbostic  * standard egrep.
48832332Sbostic  */
48932332Sbostic char *
49032332Sbostic submatch(file, pat, str, strend, k, altindex)
49132332Sbostic 	char file[], pat[], str[];
49232332Sbostic 	register char *strend, *k;
49332332Sbostic 	int altindex;
49432332Sbostic {
49532332Sbostic 	register char *s;
49632332Sbostic 	char *t, c;
49732332Sbostic 
49832332Sbostic 	t = k;
49932332Sbostic 	s = ((altflag) ? k - altlen[altindex] + 1 : k - altmin + 1);
50032332Sbostic #ifndef NOKANJI
50132332Sbostic 	c = ((altflag) ? altpat[altindex][0] : pat[0]);
50232332Sbostic 	if (c & NONASCII)
50332332Sbostic 		if ((s = kanji(str, s, k)) == NULL)
50432332Sbostic 			return (++k);	/* reject false kanji */
50532332Sbostic #endif
50632332Sbostic 	do;
50732332Sbostic 	while (*s != NL && --s >= str);
50832332Sbostic 	k = s + 1;		/* now at line start */
50932332Sbostic 
51032332Sbostic 	if (boyonly)
51132332Sbostic 		return (gotamatch(file, k));
51232332Sbostic 
51332332Sbostic 	incount = counted - (strend - k);
51432332Sbostic 	if (boyfound++ == FIRSTFEW)
51532332Sbostic 		execstrategy(file);
51632332Sbostic 
51732332Sbostic 	s = t;
51832332Sbostic 	do
51932332Sbostic 		rxcount++;
52032332Sbostic 	while (*s++ != NL);
52132332Sbostic 	*--s = EOS;
52232332Sbostic 	/*
52332332Sbostic 	 * "quick henry -- the flit" (after theodor geisel)
52432332Sbostic 	 */
52532332Sbostic 	if (regexec(rspencer, ((iflag) ? fold(k) : k)) == 1) {
52632332Sbostic 		*s = NL;
52732332Sbostic 		if (gotamatch(file, k) == NULL)
52832332Sbostic 			return (NULL);
52932332Sbostic 	}
53032332Sbostic 	*s = NL;
53132332Sbostic 	return (s + 1);
53232332Sbostic }
53332332Sbostic 
53432338Sbostic #ifndef NOKANJI
53532332Sbostic /*
53632332Sbostic  * EUC code disambiguation -- scan backwards to first 7-bit code, while
53732332Sbostic  * counting intervening 8-bit codes.  If odd, reject unaligned Kanji pattern.
53832332Sbostic  * SS2/3 checks are for intermixed Japanase Katakana or Kanji2.
53932332Sbostic  */
54032332Sbostic char *
54132332Sbostic kanji(str, s, k)
54232332Sbostic 	register char *str, *s, *k;
54332332Sbostic {
54432332Sbostic 	register int j = 0;
54532332Sbostic 
54632332Sbostic 	for (s--; s >= str; s--) {
54732332Sbostic 		if (*s == SS2 || *s == SS3 || (*s & NONASCII) == 0)
54832332Sbostic 			break;
54932332Sbostic 		j++;
55032332Sbostic 	}
55132332Sbostic #ifndef CHINESE
55232332Sbostic 	if (*s == SS2)
55332332Sbostic 		j -= 1;
55432332Sbostic #endif  CHINESE
55532332Sbostic 	return ((j & 01) ? NULL : k);
55632332Sbostic }
55732338Sbostic #endif
55832332Sbostic 
55932332Sbostic /*
56032332Sbostic  * Compute "Boyer-Moore" delta table -- put skip distance in delta0[c]
56132332Sbostic  */
56232332Sbostic gosper(pattern)
56332332Sbostic 	char *pattern;		/* ... HAKMEM lives ... */
56432332Sbostic {
56532332Sbostic 	register int i, j;
56632332Sbostic 	unsigned char c;
56732332Sbostic 
56832332Sbostic 	/* Make one-string case look like simple alternatives case */
56932332Sbostic 	if (!altflag) {
57032332Sbostic 		nalt = 1;
57132332Sbostic 		altmin = altlen[0] = strlen(pattern);
57232339Sbostic 		altpat[0] = pattern;
57332332Sbostic 	}
57432332Sbostic 	/* For chars that aren't in any string, skip by string length. */
57532332Sbostic 	for (j = 0; j < 256; j++) {
57632332Sbostic 		delta0[j] = altmin;
57732332Sbostic 		cmap[j] = j;	/* Sneak in initialization of cmap */
57832332Sbostic 	}
57932332Sbostic 
58032332Sbostic 	/* For chars in a string, skip distance from char to end of string. */
58132332Sbostic 	/* (If char appears more than once, skip minimum distance.) */
58232332Sbostic 	for (i = 0; i < nalt; i++)
58332332Sbostic 		for (j = 0; j < altlen[i] - 1; j++) {
58432332Sbostic 			c = altpat[i][j];
58532332Sbostic 			delta0[c] = MIN(delta0[c], altlen[i] - j - 1);
58632332Sbostic 			if (iflag && islower((int) c))
58732332Sbostic 				delta0[toupper((int) c)] = delta0[c];
58832332Sbostic 		}
58932332Sbostic 
59032332Sbostic 	/* For last char of each string, fall out of search loop. */
59132332Sbostic 	for (i = 0; i < nalt; i++) {
59232332Sbostic 		c = altpat[i][altlen[i] - 1];
59332332Sbostic 		delta0[c] = LARGE;
59432332Sbostic 		if (iflag && islower((int) c))
59532332Sbostic 			delta0[toupper((int) c)] = LARGE;
59632332Sbostic 	}
59732332Sbostic 	if (iflag)
59832332Sbostic 		for (j = 'A'; j <= 'Z'; j++)
59932332Sbostic 			cmap[j] = tolower((int) j);
60032332Sbostic }
60132332Sbostic 
60232332Sbostic /*
60332332Sbostic  * Print, count, or stop on full match. Result is either the location for
60432332Sbostic  * continued search, or NULL to stop.
60532332Sbostic  */
60632332Sbostic char *
60732332Sbostic gotamatch(file, s)
60832332Sbostic 	register char *file, *s;
60932332Sbostic {
61032332Sbostic 	char *savematch();
61132332Sbostic 	int squirrel = 0;	/* nonzero to squirrel away FIRSTFEW matches */
61232332Sbostic 
61332332Sbostic 	nmatch++;
61432332Sbostic 	nsuccess = 1;
61532332Sbostic 	if (!boyonly && boyfound <= FIRSTFEW && file != NULL)
61632332Sbostic 		squirrel = 1;
61732332Sbostic 
61832332Sbostic 	if (sflag)
61932332Sbostic 		return (NULL);	/* -s usurps all flags (unlike some versions) */
62032332Sbostic 	if (cflag) {		/* -c overrides -l, we guess */
62132332Sbostic 		do;
62232332Sbostic 		while (*s++ != NL);
62332332Sbostic 	} else if (lflag) {
62432332Sbostic 		puts(file);
62532332Sbostic 		return (NULL);
62632332Sbostic 	} else {
62732332Sbostic 		if (!hflag)
62832332Sbostic 			if (!squirrel)
62932332Sbostic 				printf("%s:", file);
63032332Sbostic 			else
63132332Sbostic 				sprintf(preamble, "%s:", file);
63232332Sbostic 		if (nflag) {
63332332Sbostic 			if (prevmatch)
63432332Sbostic 				prevnline = prevnline + nlcount(prevloc, s);
63532332Sbostic 			else
63632332Sbostic 				prevnline = nline + nlcount(str, s);
63732332Sbostic 			prevmatch = 1;
63832332Sbostic 
63932332Sbostic 			if (!squirrel)
64032332Sbostic 				printf("%ld:", prevnline);
64132332Sbostic 			else
64232332Sbostic 				sprintf(preamble + strlen(preamble),
64332332Sbostic 					"%ld:", prevnline);
64432332Sbostic 		}
64532332Sbostic 		if (!squirrel) {
64632332Sbostic 			do
64732332Sbostic 				putchar(*s);
64832332Sbostic 			while (*s++ != NL);
64932332Sbostic 		} else
65032332Sbostic 			s = savematch(s);
65132332Sbostic 
65232332Sbostic 		if (nflag)
65332332Sbostic 			prevloc = s - 1;
65432332Sbostic 	}
65532332Sbostic 	return ((firstflag && !cflag) ? NULL : s);
65632332Sbostic }
65732332Sbostic 
65832332Sbostic char *
65932332Sbostic fold(line)
66032332Sbostic 	char *line;
66132332Sbostic {
66232332Sbostic 	static char fline[BUFSIZE];
66332332Sbostic 	register char *s, *t = fline;
66432332Sbostic 
66532332Sbostic 	for (s = line; *s != EOS; s++)
66632332Sbostic 		*t++ = (isupper((int) *s) ? (char) tolower((int) *s) : *s);
66732332Sbostic 	*t = EOS;
66832332Sbostic 	return (fline);
66932332Sbostic }
67032332Sbostic 
67132332Sbostic strindex(s, t)			/* the easy way, as in K&P, p. 192 */
67232332Sbostic 	char *s, *t;
67332332Sbostic {
67432332Sbostic 	int i, n;
67532332Sbostic 
67632332Sbostic 	n = strlen(t);
67732332Sbostic 	for (i = 0; s[i] != '\0'; i++)
67832332Sbostic 		if (strncmp(s + i, t, n) == 0)
67932332Sbostic 			return (i);
68032332Sbostic 	return (-1);
68132332Sbostic }
68232332Sbostic 
68332332Sbostic char *
68432332Sbostic grepxlat(pattern)		/* grep pattern meta conversion */
68532332Sbostic 	char *pattern;
68632332Sbostic {
68732332Sbostic 	register char *p, *s;
68832332Sbostic 	static char newpat[BUFSIZE];
68932332Sbostic 
69032332Sbostic 	for (s = newpat, p = pattern; *p != EOS;) {
69132332Sbostic 		if (*p == '\\') {	/* skip escapes ... */
69232332Sbostic 			*s++ = *p++;
69332332Sbostic 			if (*p)
69432332Sbostic 				*s++ = *p++;
69532332Sbostic 		} else if (*p == '[') {	/* ... and char classes */
69632332Sbostic 			while (*p != EOS && *p != ']')
69732332Sbostic 				*s++ = *p++;
69832332Sbostic 		} else if (strchr("+?|()", *p) != NULL) {
69932332Sbostic 			*s++ = '\\';	/* insert protection */
70032332Sbostic 			*s++ = *p++;
70132332Sbostic 		} else
70232332Sbostic 			*s++ = *p++;
70332332Sbostic 	}
70432332Sbostic 	*s = EOS;
70532332Sbostic 	grepflag = ((patind) ? 0 : 1);
70632332Sbostic 	return (newpat);
70732332Sbostic }
70832332Sbostic 
70932332Sbostic /*
71032332Sbostic  * Test for simple alternation.  Result is NULL if it's not so simple, or is
71132332Sbostic  * a pointer to the first string if it is. Warning:  sscanf size is a
71232332Sbostic  * fixpoint, beyond which the speedup linearity starts to break down.  In the
71332332Sbostic  * wake of the elegant aho/corrasick "trie"-based fgrep, generalizing
71432332Sbostic  * altpat[] to arbitrary size is not useful.
71532332Sbostic  */
71632332Sbostic char *
71732332Sbostic alternate(regexpr)
71832332Sbostic 	char *regexpr;
71932332Sbostic {
72032339Sbostic 	register int i, j;
72132339Sbostic 	register char *start, *stop;
72232332Sbostic 	unsigned char c;
72332332Sbostic 
72432332Sbostic 	if (fgrepflag && strchr(regexpr, '|'))
72532339Sbostic 			return (NULL);
72632332Sbostic 
72732339Sbostic 	/*
72832339Sbostic 	 * break pattern up into altpat array; delimit on newline, bar,
72932339Sbostic 	 * or EOS.  We know we won't overflow, we've already checked the
73032339Sbostic 	 * number of patterns we're going to find against NALT.
73132339Sbostic 	 * Also, set length of pattern and find minimum pattern length.
73232339Sbostic 	 */
73332339Sbostic 	nalt = 0;
73432339Sbostic 	altmin = NMUSH;
73532339Sbostic 	for (start = stop = regexpr;; ++stop)
73632339Sbostic 		if (!*stop || *stop == '|' || *stop == NL) {
73732339Sbostic 			altlen[nalt] = j = stop - start;
73832339Sbostic 			if (j < altmin)
73932339Sbostic 				altmin = j;
74032339Sbostic 			if (!(altpat[nalt] = malloc((u_int)(j + 1))))
74132339Sbostic 				oops("out of memory");
74232339Sbostic 			bcopy(start, altpat[nalt], j);
74332339Sbostic 			altpat[nalt][j] = EOS;
74432362Sbostic 			++nalt;
74532339Sbostic 			if (!*stop)
74632339Sbostic 				break;
74732362Sbostic 			if (nalt == NALT)
74832339Sbostic 				return(NULL);
74932339Sbostic 			if (*stop == NL)
75032339Sbostic 				*stop = '|';
75132339Sbostic 			start = stop + 1;
75232339Sbostic 		}
75332332Sbostic 	if (!fgrepflag) {
75432332Sbostic 		if (strchr(regexpr, '|') == NULL || regexpr[0] == '|')
75532332Sbostic 			return (NULL);
75632332Sbostic 		if (strpbrk(regexpr, "^$.[]()?+*\\") != NULL
75732332Sbostic 		    || strindex(regexpr, "||") >= 0)
75832332Sbostic 			return (NULL);
75932332Sbostic 	}
76032332Sbostic 
76132332Sbostic 	if (nalt > 1) {		/* build superimposed "pre-match" sets per
76232332Sbostic 				 * char */
76332332Sbostic 		altflag++;
76432332Sbostic 		for (j = 0; j < nalt; j++)
76532332Sbostic 			for (i = 0; i < altmin; i++) {
76632332Sbostic 				c = altpat[j][altlen[j] - altmin + i];
76732332Sbostic 				altset[i + 1][c] = 1;	/* offset for sentinel */
76832332Sbostic 			}
76932332Sbostic 	}
77032332Sbostic 	return (altpat[0]);
77132332Sbostic }
77232332Sbostic 
77332332Sbostic /*
77432332Sbostic  * Grapple with the dfa (std egrep) vs. ndfa (regexp) tradeoff. Criteria to
77532332Sbostic  * determine whether to use dfa-based egrep:  We do FIRSTFEW matches with
77632332Sbostic  * regexec().  If Boyer-Moore up to now matched more than PUNTPERCENT
77732332Sbostic  * of the input, the r.e. is likely to be underspecified, so do old *grep,
77832332Sbostic  * which is faster on complex patterns than regexp().  At FIRSTFEW,
77932332Sbostic  * dump the saved matches collected by savematch(). They are saved
78032332Sbostic  * so that a "PUNT" can "rewind" to ignore them.  Stdin is problematic,
78132332Sbostic  * since it's hard to rewind.
78232332Sbostic  */
78332332Sbostic 
78432332Sbostic execstrategy(file)
78532332Sbostic 	char *file;
78632332Sbostic {
78732332Sbostic 	int pctmatch;
78832332Sbostic 
78932332Sbostic 	pctmatch = (100 * rxcount) / incount;
79032332Sbostic 	if (pctmatch > PUNTPERCENT && file != NULL)
79132332Sbostic 		kernighan(args);
79232332Sbostic 	if (file != NULL)
79332332Sbostic 		flushmatches();
79432332Sbostic }
79532332Sbostic 
79632332Sbostic nlcount(bstart, bstop)		/* flail interval to totalize newlines. */
79732332Sbostic 	char *bstart, *bstop;
79832332Sbostic {
79932332Sbostic 	register char *s = bstart;
80032332Sbostic 	register char *t = bstop;
80132332Sbostic 	register int count = 0;
80232332Sbostic 
80332332Sbostic 	do {			/* loop unroll for older architectures */
80432332Sbostic 		if (*t == NL)	/* ... ask ames!jaw for sample code */
80532332Sbostic 			count++;
80632332Sbostic 	} while (t-- > s);
80732332Sbostic 
80832332Sbostic 	return (count);
80932332Sbostic }
81032332Sbostic 
81132332Sbostic char *
81232332Sbostic isolate(regexpr)		/* isolate longest metacharacter-free string */
81332332Sbostic 	char *regexpr;
81432332Sbostic {
81532332Sbostic 	char *dummyexpr;
81632332Sbostic 
81732332Sbostic 	/*
81832332Sbostic 	 * We add (.)* because Henry's regcomp only figures regmust if it
81932332Sbostic 	 * sees a leading * pattern.  Foo!
82032332Sbostic 	 */
82132332Sbostic 	dummyexpr = malloc((unsigned) strlen(regexpr) + 5);
82232332Sbostic 	sprintf(dummyexpr, "(.)*%s", regexpr);
82332332Sbostic 	if ((rspencer = regcomp(dummyexpr)) == NULL)
82432332Sbostic 		kernighan(args);
82532332Sbostic 	return (rspencer->regmust);
82632332Sbostic }
82732332Sbostic 
82832332Sbostic char *matches[FIRSTFEW];
82932332Sbostic static int mcount = 0;
83032332Sbostic 
83132332Sbostic char *
83232332Sbostic savematch(s)			/* horde matches during statistics gathering */
83332332Sbostic 	register char *s;
83432332Sbostic {
83532332Sbostic 	char *p;
83632332Sbostic 	char *start = s;
83732332Sbostic 	int msize = 0;
83832332Sbostic 	int psize = strlen(preamble);
83932332Sbostic 
84032332Sbostic 	while (*s++ != NL)
84132332Sbostic 		msize++;
84232332Sbostic 	*--s = EOS;
84332332Sbostic 
84432332Sbostic 	p = malloc((unsigned) msize + 1 + psize);
84532332Sbostic 	strcpy(p, preamble);
84632332Sbostic 	strcpy(p + psize, start);
84732332Sbostic 	matches[mcount++] = p;
84832332Sbostic 
84932332Sbostic 	preamble[0] = 0;
85032332Sbostic 	*s = NL;
85132332Sbostic 	return (s);
85232332Sbostic }
85332332Sbostic 
85432332Sbostic flushmatches()
85532332Sbostic {
85632332Sbostic 	int n;
85732332Sbostic 
85832332Sbostic 	flushflag = 1;
85932332Sbostic 	for (n = 0; n < mcount; n++)
86032332Sbostic 		printf("%s\n", matches[n]);
86132332Sbostic 	mcount = 0;
86232332Sbostic }
86332332Sbostic 
86432332Sbostic oops(message)
86532332Sbostic 	char *message;
86632332Sbostic {
86732332Sbostic 	fprintf(stderr, "%s: %s\n", progname, message);
86832332Sbostic 	exit(2);
86932332Sbostic }
87032332Sbostic 
87132332Sbostic kernighan(args)			/* "let others do the hard part ..." */
87232332Sbostic 	char *args[];
87332332Sbostic {
87432332Sbostic 	/*
87532332Sbostic 	 * We may have already run grep on some of the files; remove them
87632332Sbostic 	 * from the arg list we pass on.  Note that we can't delete them
87732332Sbostic 	 * totally because the number of file names affects the output
87832332Sbostic 	 * (automatic -h).
87932332Sbostic 	 */
88032332Sbostic 	/* better would be fork/exec per punted file -- jaw */
88132332Sbostic 
88232332Sbostic 	while (firstfile && optind > firstfile)
88332332Sbostic 		args[firstfile++] = "/dev/null";
88432332Sbostic 	if (patind)
88532332Sbostic 		args[patind] = pattern;
88632339Sbostic 	(void) fflush(stdout);
88732332Sbostic 
88832332Sbostic 	if (grepflag)
88932332Sbostic 		execvp(GREPSTD, args), oops("can't exec old 'grep'");
89032332Sbostic 	else if (fgrepflag)
89132332Sbostic 		execvp(FGREPSTD, args), oops("can't exec old 'fgrep'");
89232332Sbostic 	else
89332332Sbostic 		execvp(EGREPSTD, args), oops("can't exec old 'egrep'");
89432332Sbostic }
895