xref: /csrg-svn/usr.bin/grep/egrep/egrep.c (revision 62020)
148263Sbostic /*-
2*62020Sbostic  * Copyright (c) 1991, 1993
3*62020Sbostic  *	The Regents of the University of California.  All rights reserved.
448263Sbostic  *
548263Sbostic  * %sccs.include.redist.c%
648263Sbostic  */
748263Sbostic 
832332Sbostic #ifndef lint
9*62020Sbostic static char copyright[] =
10*62020Sbostic "@(#) Copyright (c) 1991, 1993\n\
11*62020Sbostic 	The Regents of the University of California.  All rights reserved.\n";
1248263Sbostic #endif /* not lint */
1332332Sbostic 
1448263Sbostic #ifndef lint
15*62020Sbostic static char sccsid[] = "@(#)egrep.c	8.1 (Berkeley) 06/06/93";
1648263Sbostic #endif /* not lint */
1748263Sbostic 
1832332Sbostic /*
1932332Sbostic      Hybrid Boyer/Moore/Gosper-assisted 'grep/egrep/fgrep' search, with delta0
2032332Sbostic      table as in original paper (CACM, October, 1977).  No delta1 or delta2.
2132332Sbostic      According to experiment (Horspool, Soft. Prac. Exp., 1982), delta2 is of
2232332Sbostic      minimal practical value.  However, to improve for worst case input,
2332332Sbostic      integrating the improved Galil strategies (Apostolico/Giancarlo, SIAM. J.
2432332Sbostic      Comput., Feb. 1986) deserves consideration.
2532332Sbostic 
2632332Sbostic      Method: 	extract longest metacharacter-free string from expression.
2732332Sbostic 		this is done using a side-effect from henry spencer's regcomp().
2832332Sbostic 		use boyer-moore to match such, then pass submatching lines
2932332Sbostic 		to either regexp() or standard 'egrep', depending on certain
3032332Sbostic 		criteria within execstrategy() below.  [this tradeoff is due
3132332Sbostic 		to the general slowness of the regexp() nondeterministic
3232332Sbostic 		machine on complex expressions, as well as the startup time
3332332Sbostic 		of standard 'egrep' on short files.]  alternatively, one may
3432332Sbostic 		change the vendor-supplied 'egrep' automaton to include
3532332Sbostic 		boyer-moore directly.  see accompanying writeup for discussion
3632332Sbostic 		of kanji expression treatment.
3732332Sbostic 
3832332Sbostic 		late addition:  apply trickbag for fast match of simple
3932332Sbostic 		alternations (sublinear, in common low-cardinality cases).
4032332Sbostic 		trap fgrep into this lair.
4132332Sbostic 
4232332Sbostic 		gnu additions:  -f, newline as |, \< and \> [in regexec()], more
4332332Sbostic 				comments.  inspire better dfa exec() strategy.
4432332Sbostic 				serious testing and help with special cases.
4532332Sbostic 
4632332Sbostic      Algorithm amalgam summary:
4732332Sbostic 
4832332Sbostic 		dfa e?grep 		(aho/thompson)
4932332Sbostic 		ndfa regexp() 		(spencer/aho)
5032332Sbostic 		bmg			(boyer/moore/gosper)
5132332Sbostic 		"superimposed" bmg   	(jaw)
5232332Sbostic 		fgrep			(aho/corrasick)
5332332Sbostic 
5432332Sbostic 		sorry, but the knuth/morris/pratt machine, horspool's
5532332Sbostic 		"frequentist" code, and the rabin/karp matcher, however cute,
5632332Sbostic 		just don't cut it for this production.
5732332Sbostic 
5832332Sbostic      James A. Woods				Copyright (c) 1986
5932332Sbostic      NASA Ames Research Center
6032332Sbostic */
6132332Sbostic #include <sys/types.h>
6232332Sbostic #include <sys/stat.h>
6332337Sbostic #include <sys/file.h>
6432332Sbostic #include <regexp.h>		/* must be henry spencer's version */
6537914Sbostic #include <stdio.h>
6637914Sbostic #include <ctype.h>
6737914Sbostic #include "pathnames.h"
6832332Sbostic 
6932332Sbostic #define	MIN(A, B)	((A) > (B) ? (B) : (A))
7032332Sbostic 
7132332Sbostic #ifdef	SLOWSYS
7232332Sbostic #define read	xread
7332332Sbostic #endif
7432332Sbostic 
7532332Sbostic #define BUFSIZE	8192		/* make higher for cray */
7632332Sbostic #define PATSIZE 6000
7732332Sbostic #define LARGE 	BUFSIZE + PATSIZE
7832332Sbostic 
7932332Sbostic #define NALT	7		/* tied to scanf() size in alternate() */
8032332Sbostic #define NMUSH	6		/* loosely relates to expected alt length */
8132332Sbostic 
8232332Sbostic #define	FIRSTFEW	33 	/* Always do FIRSTFEW matches with regexec() */
8332332Sbostic #define	PUNTPERCENT	10	/* After FIRSTFEW, if PUNTPERCENT of the input
8432332Sbostic 				 * was processed by regexp(), exec std egrep. */
8532332Sbostic #define NL	'\n'
8632332Sbostic #define	EOS	'\0'
8732332Sbostic #define	NONASCII	0200	/* Bit mask for Kanji non-ascii chars */
8832332Sbostic #define META	"\n^$.[]()?+*|\\"	/* egrep meta-characters */
8932332Sbostic #define SS2	'\216'		/* EUC Katakana (or Chinese2) prefix */
9032332Sbostic #define SS3	'\217'		/* EUC Kanji2 (or Chinese3) prefix */
9132332Sbostic 
9232332Sbostic extern char *optarg;
9332332Sbostic extern int optind;
9432332Sbostic char *progname;
9532332Sbostic 
9632332Sbostic int cflag, iflag, eflag, fflag, lflag, nflag;	/* SVID flags */
9732332Sbostic int sflag, hflag;		/* v7, v8, bsd */
9832332Sbostic 
9932332Sbostic int firstflag;			/* Stop at first match */
10032332Sbostic int grepflag;			/* Called as "grep" */
10132332Sbostic int fgrepflag;			/* Called as "fgrep" */
10232332Sbostic int altflag;			/* Simple alternation in pattern */
10332332Sbostic int boyonly;			/* No regexp needed -- all simple */
10432332Sbostic int flushflag;
10532332Sbostic int grepold, egrepold, fgrepold;
10632332Sbostic 
10732332Sbostic int nalt;			/* Number of alternatives */
10832332Sbostic int nsuccess;			/* 1 for match, 2 for error */
10932332Sbostic int altmin;			/* Minimum length of all the alternate
11032332Sbostic 				 * strings */
11132332Sbostic int firstfile;			/* argv index of first file argument */
11232332Sbostic int patind;			/* argv index of pattern */
11332332Sbostic long nmatch;			/* Number of matches in this file */
11432332Sbostic long incount, counted;		/* Amount of input consumed */
11532332Sbostic long rxcount;			/* Bytes of input processed by regexec() */
11632332Sbostic int boyfound;			/* accumulated partial matches (tripped by
11732332Sbostic 				 * FIRSTFEW) */
11832332Sbostic int prevmatch;			/* next three lines aid fast -n */
11932332Sbostic long nline, prevnline;
12032332Sbostic char *prevloc;
12132332Sbostic 
12232332Sbostic regexp *rspencer;
12332332Sbostic char *pattern;
12432332Sbostic char *patboy;			/* Pattern for simple Boyer-Moore */
12532332Sbostic char *patfile;			/* Filename containing pattern(s) */
12632332Sbostic 
12732332Sbostic int delta0[256];		/* Boyer-Moore algorithm core */
12832332Sbostic char cmap[256];			/* Usually 0-255, but if -i, maps upper to
12932332Sbostic 				 * lower case */
13032332Sbostic char str[BUFSIZE + 2];
13132332Sbostic int nleftover;
13232332Sbostic char linetemp[BUFSIZE];
13332339Sbostic char *altpat[NALT];		/* alternation component storage */
13432332Sbostic int altlen[NALT];
13532332Sbostic short altset[NMUSH + 1][256];
13632332Sbostic char preamble[200];		/* match prefix (filename, line no.) */
13732332Sbostic 
13832332Sbostic int fd;
13932332Sbostic char *
14032502Sbostic strchr(), *strrchr(), *strcpy(), *strncpy(), *strpbrk(), *malloc();
14132332Sbostic char *
14232332Sbostic grepxlat(), *fold(), *pfile(), *alternate(), *isolate();
14335692Sbostic char *gotamatch(), *kanji(), *linesave(), *submatch();
14432332Sbostic char **args;
14532332Sbostic 
main(argc,argv)14632332Sbostic main(argc, argv)
14732332Sbostic 	int argc;
14832332Sbostic 	char *argv[];
14932332Sbostic {
15032365Sbostic 	int c, oflag;
15132332Sbostic 	int errflag = 0;
15232332Sbostic 
15332332Sbostic 	args = argv;
15432332Sbostic 
15532332Sbostic 	if ((progname = strrchr(argv[0], '/')) != 0)
15632332Sbostic 		progname++;
15732332Sbostic 	else
15832332Sbostic 		progname = argv[0];
15932332Sbostic 	if (strcmp(progname, "grep") == 0)
16032332Sbostic 		grepflag++;
16132365Sbostic 	else if (strcmp(progname, "fgrep") == 0)
16232332Sbostic 		fgrepflag++;
16332332Sbostic 
16432365Sbostic 	oflag = 0;
16532365Sbostic 	while ((c = getopt(argc, argv, "bchie:f:lnosvwxy1")) != EOF) {
16632332Sbostic 		switch (c) {
16732332Sbostic 
16832332Sbostic 		case 'f':
16932332Sbostic 			fflag++;
17032332Sbostic 			patfile = optarg;
17132332Sbostic 			continue;
17232332Sbostic 		case 'b':
17332332Sbostic 		case 'v':
17432332Sbostic 			egrepold++;	/* boyer-moore of little help here */
17532332Sbostic 			continue;
17632332Sbostic 		case 'c':
17732332Sbostic 			cflag++;
17832332Sbostic 			continue;
17932332Sbostic 		case 'e':
18032332Sbostic 			eflag++;
18132332Sbostic 			pattern = optarg;
18232332Sbostic 			continue;
18332332Sbostic 		case 'h':
18432332Sbostic 			hflag++;
18532332Sbostic 			continue;
18632365Sbostic 		case 'o':
18732365Sbostic 			oflag++;
18832365Sbostic 			continue;
18932332Sbostic 		case '1':	/* Stop at very first match */
19032332Sbostic 			firstflag++;	/* spead freaks only */
19132332Sbostic 			continue;
19232332Sbostic 		case 'i':
19332332Sbostic 			iflag++;
19432332Sbostic 			continue;
19532332Sbostic 		case 'l':
19632332Sbostic 			lflag++;
19732332Sbostic 			continue;
19832332Sbostic 		case 'n':
19932332Sbostic 			nflag++;
20032332Sbostic 			continue;
20132332Sbostic 		case 's':
20232332Sbostic 			sflag++;
20332332Sbostic 			continue;
20432332Sbostic 		case 'w':
20532332Sbostic 		case 'y':
20632332Sbostic 			if (!grepflag)
20732332Sbostic 				errflag++;
20832332Sbostic 			grepold++;
20932332Sbostic 			continue;
21032332Sbostic 		case 'x':	/* needs more work, like -b above */
21132332Sbostic 			if (!fgrepflag)
21232332Sbostic 				errflag++;
21332332Sbostic 			fgrepold++;
21432332Sbostic 			continue;
21532332Sbostic 		case '?':
21632332Sbostic 			errflag++;
21732332Sbostic 		}
21832332Sbostic 	}
21932332Sbostic 	if (errflag || ((argc <= optind) && !fflag && !eflag)) {
22032332Sbostic 		if (grepflag)
22132365Sbostic oops("usage: grep [-bchilnosvwy] [-e] pattern [file ...]");
22232332Sbostic 		else if (fgrepflag)
22332366Sbostic oops("usage: fgrep [-bchilnosvx] {-f patfile | [-e] strings} [file ...]");
22432332Sbostic 		else		/* encourage SVID options, though we provide
22532332Sbostic 				 * others */
22632366Sbostic oops("usage: egrep [-bchilnosv] {-f patfile | [-e] pattern} [file ...]");
22732332Sbostic 	}
22832332Sbostic 	if (fflag)
22934917Sbostic 		pattern = pfile(patfile);
23034917Sbostic 	else if (!eflag) {
23134917Sbostic 		patind = optind;
23232332Sbostic 		pattern = argv[optind++];
23334917Sbostic 	}
23432332Sbostic 
23532365Sbostic 	if (!oflag && (argc - optind) <= 1)	/* Filename invisible given < 2 files */
23632332Sbostic 		hflag++;
23732332Sbostic 	if (pattern[0] == EOS)
23832332Sbostic 		kernighan(argv);	/* same as it ever was */
23932332Sbostic 	/*
24032332Sbostic 	 * 'grep/egrep' merger -- "old" grep is called to handle: tagged
24132332Sbostic 	 * exprs \( \), word matches \< and \>, -w and -y options, char
24232332Sbostic 	 * classes with '-' at end (egrep bug?), and patterns beginning with
24332332Sbostic 	 * an asterisk (don't ask why). otherwise, characters meaningful to
24432332Sbostic 	 * 'egrep' but not to 'grep' are escaped; the entire expr is then
24532332Sbostic 	 * passed to 'egrep'.
24632332Sbostic 	 */
24732332Sbostic 	if (grepflag && !grepold) {
24832332Sbostic 		if (strindex(pattern, "\\(") >= 0 ||
24932332Sbostic 		    strindex(pattern, "\\<") >= 0 ||
25032332Sbostic 		    strindex(pattern, "\\>") >= 0 ||
25132332Sbostic 		    strindex(pattern, "-]") >= 0 ||
25232332Sbostic 		    pattern[0] == '*')	/* grep bug */
25332332Sbostic 			grepold++;
25432332Sbostic 		else
25532332Sbostic 			pattern = grepxlat(pattern);
25632332Sbostic 	}
25732332Sbostic 	if (grepold || egrepold || fgrepold)
25832332Sbostic 		kernighan(argv);
25932332Sbostic 
26032332Sbostic 	if (iflag)
26132332Sbostic 		strcpy(pattern, fold(pattern));
26232332Sbostic 	/*
26332332Sbostic 	 * If the pattern is a plain string, just run boyer-moore. If it
26432332Sbostic 	 * consists of meta-free alternatives, run "superimposed" bmg.
26532332Sbostic 	 * Otherwise, find best string, and compile pattern for regexec().
26632332Sbostic 	 */
26732332Sbostic 	if (strpbrk(pattern, META) == NULL) {	/* do boyer-moore only */
26832332Sbostic 		boyonly++;
26932332Sbostic 		patboy = pattern;
27032332Sbostic 	} else {
27132332Sbostic 		if ((patboy = alternate(pattern)) != NULL)
27232332Sbostic 			boyonly++;
27332332Sbostic 		else {
27432332Sbostic 			if ((patboy = isolate(pattern)) == NULL)
27532332Sbostic 				kernighan(argv);	/* expr too involved */
27632332Sbostic #ifndef NOKANJI
27732332Sbostic 			for (c = 0; pattern[c] != EOS; c++)
27832332Sbostic 				if (pattern[c] & NONASCII)	/* kanji + meta */
27932332Sbostic 					kernighan(argv);
28032332Sbostic #endif
28132332Sbostic 			if ((rspencer = regcomp(pattern)) == NULL)
28232332Sbostic 				oops("regcomp failure");
28332332Sbostic 		}
28432332Sbostic 	}
28532332Sbostic 	gosper(patboy);		/* "pre-conditioning is wonderful"
28632332Sbostic 				 * -- v. strassen */
28732332Sbostic 
28832332Sbostic 	if ((firstfile = optind) >= argc) {
28932332Sbostic 		/* Grep standard input */
29032332Sbostic 		if (lflag)	/* We don't know its name! */
29132332Sbostic 			exit(1);
29232332Sbostic 		egsecute((char *) NULL);
29332332Sbostic 	} else {
29432332Sbostic 		while (optind < argc) {
29532332Sbostic 			egsecute(argv[optind]);
29632332Sbostic 			optind++;
29732332Sbostic 			if (firstflag && (nsuccess == 1))
29832332Sbostic 				break;
29932332Sbostic 		}
30032332Sbostic 	}
30132332Sbostic 	exit((nsuccess == 2) ? 2 : (nsuccess == 0));
30232332Sbostic }
30332332Sbostic 
30432332Sbostic char *
pfile(pfname)30532332Sbostic pfile(pfname)			/* absorb expression from file */
30632332Sbostic 	char *pfname;
30732332Sbostic {
30832337Sbostic 	int fd;
30932332Sbostic 	struct stat patstat;
31032332Sbostic 	static char *pat;
31132332Sbostic 
31232337Sbostic 	if ((fd = open(pfname, O_RDONLY, 0)) < 0)
31332332Sbostic 		oops("can't read pattern file");
31432337Sbostic 	if (fstat(fd, &patstat) != 0)
31532332Sbostic 		oops("can't stat pattern file");
31632332Sbostic 	if (patstat.st_size > PATSIZE) {
31732332Sbostic 		if (fgrepflag) {	/* defer to unix version */
31832332Sbostic 			fgrepold++;
31932332Sbostic 			return "dummy";
32032332Sbostic 		} else
32132332Sbostic 			oops("pattern file too big");
32232332Sbostic 	}
32332332Sbostic 	if ((pat = malloc((unsigned) patstat.st_size + 1)) == NULL)
32432332Sbostic 		oops("out of memory to read pattern file");
32532337Sbostic 	if (patstat.st_size != read(fd, pat, (int)patstat.st_size))
32632332Sbostic 		oops("error reading pattern file");
32732337Sbostic 	(void) close(fd);
32832332Sbostic 
32932332Sbostic 	pat[patstat.st_size] = EOS;
33032332Sbostic 	if (pat[patstat.st_size - 1] == NL)	/* NOP for egrep; helps grep */
33132332Sbostic 		pat[patstat.st_size - 1] = EOS;
33232332Sbostic 
33332332Sbostic 	if (nlcount(pat, &pat[patstat.st_size]) > NALT) {
33432332Sbostic 		if (fgrepflag)
33532332Sbostic 			fgrepold++;	/* "what's it all about, alfie?" */
33632332Sbostic 		else
33732332Sbostic 			egrepold++;
33832332Sbostic 	}
33932332Sbostic 	return (pat);
34032332Sbostic }
34132332Sbostic 
egsecute(file)34232332Sbostic egsecute(file)
34332332Sbostic 	char *file;
34432332Sbostic {
34546717Sbostic 	extern int errno;
34646717Sbostic 
34732332Sbostic 	if (file == NULL)
34832332Sbostic 		fd = 0;
34932339Sbostic 	else if ((fd = open(file, O_RDONLY, 0)) <= 0) {
35046717Sbostic 		fprintf(stderr,
35146717Sbostic 		    "%s: %s: %s\n", progname, file, strerror(errno));
35232332Sbostic 		nsuccess = 2;
35332332Sbostic 		return;
35432332Sbostic 	}
35532332Sbostic 	chimaera(file, patboy);
35632332Sbostic 
35732332Sbostic 	if (!boyonly && !flushflag && file != NULL)
35832332Sbostic 		flushmatches();
35932332Sbostic 	if (file != NULL)
36032332Sbostic 		close(fd);
36132332Sbostic }
36232332Sbostic 
chimaera(file,pat)36332332Sbostic chimaera(file, pat)		/* "reach out and boyer-moore search someone" */
36432332Sbostic 	char *file, *pat;	/* -- soon-to-be-popular bumper sticker */
36532332Sbostic {
36632332Sbostic 	register char *k, *strend, *s;
36732332Sbostic 	register int j, count;
36832332Sbostic 	register int *deltazero = delta0;
36932332Sbostic 	int patlen = altmin;
37032332Sbostic 	char *t;
37132332Sbostic 
37232332Sbostic 	nleftover = boyfound = flushflag = 0;
37332332Sbostic 	nline = 1L;
37432332Sbostic 	prevmatch = 0;
37532332Sbostic 	nmatch = counted = rxcount = 0L;
37632332Sbostic 
37732332Sbostic 	while ((count = read(fd, str + nleftover, BUFSIZE - nleftover)) > 0) {
37832332Sbostic 
37932332Sbostic 		counted += count;
38032332Sbostic 		strend = linesave(str, count);
38132332Sbostic 
38232332Sbostic 		for (k = str + patlen - 1; k < strend;) {
38332332Sbostic 			/*
38432332Sbostic 			 * for a large class of patterns, upwards of 80% of
38532332Sbostic 			 * match time is spent on the next line.  we beat
38632332Sbostic 			 * existing microcode (vax 'matchc') this way.
38732332Sbostic 			 */
38832332Sbostic 			while ((k += deltazero[*(unsigned char *) k]) < strend);
38932332Sbostic 			if (k < (str + LARGE))
39032332Sbostic 				break;
39132332Sbostic 			k -= LARGE;
39232332Sbostic 
39332332Sbostic 			if (altflag) {
39432332Sbostic 				/*
39532332Sbostic 				 * Parallel Boyer-Moore.  Check whether each
39632332Sbostic 				 * of the previous <altmin> chars COULD be
39732332Sbostic 				 * from one of the alternative strings.
39832332Sbostic 				 */
39932332Sbostic 				s = k - 1;
40032332Sbostic 				j = altmin;
40132332Sbostic 				while (altset[--j][(unsigned char)
40232332Sbostic 					     cmap[*(unsigned char *) s--]]);
40332332Sbostic 				/*
40432332Sbostic 				 * quick test fails. in this life, compare
40532332Sbostic 				 * 'em all.  but, a "reverse trie" would
40632332Sbostic 				 * attenuate worst case (linear w/delta2?).
40732332Sbostic 				 */
40832332Sbostic 				if (--j < 0) {
40932332Sbostic 					count = nalt - 1;
41032332Sbostic 					do {
41132332Sbostic 						s = k;
41232332Sbostic 						j = altlen[count];
41332332Sbostic 						t = altpat[count];
41432332Sbostic 
41532332Sbostic 						while
41632332Sbostic 							(cmap[*(unsigned char *) s--]
41732332Sbostic 							 == t[--j]);
41832332Sbostic 						if (j < 0)
41932332Sbostic 							break;
42032332Sbostic 					}
42132332Sbostic 					while (count--);
42232332Sbostic 				}
42332332Sbostic 			} else {
42432332Sbostic 				/* One string -- check it */
42532332Sbostic 				j = patlen - 1;
42632332Sbostic 				s = k - 1;
42732332Sbostic 				while (cmap[*(unsigned char *) s--] == pat[--j]);
42832332Sbostic 			}
42932332Sbostic 			/*
43032332Sbostic 			 * delta-less shortcut for literati. short shrift for
43132332Sbostic 			 * genetic engineers?
43232332Sbostic 			 */
43332332Sbostic 			if (j >= 0) {
43432332Sbostic 				k++;	/* no match; restart next char */
43532332Sbostic 				continue;
43632332Sbostic 			}
43732332Sbostic 			k = submatch(file, pat, str, strend, k, count);
43832332Sbostic 			if (k == NULL)
43932332Sbostic 				return;
44032332Sbostic 		}
44132332Sbostic 		if (nflag) {
44232332Sbostic 			if (prevmatch)
44332332Sbostic 				nline = prevnline + nlcount(prevloc, k);
44432332Sbostic 			else
44532332Sbostic 				nline = nline + nlcount(str, k);
44632332Sbostic 			prevmatch = 0;
44732332Sbostic 		}
44832332Sbostic 		strncpy(str, linetemp, nleftover);
44932332Sbostic 	}
45032332Sbostic 	if (cflag) {
45132332Sbostic 		/* Bug from old grep: -c overrides -h.  We fix the bug. */
45232332Sbostic 		if (!hflag)
45332332Sbostic 			printf("%s:", file);
45432332Sbostic 		printf("%ld\n", nmatch);
45532332Sbostic 	}
45632332Sbostic }
45732332Sbostic 
45832332Sbostic char *
linesave(str,count)45932332Sbostic linesave(str, count)		/* accumulate partial line at end of buffer */
46032332Sbostic 	char str[];
46132332Sbostic 	register int count;
46232332Sbostic {
46332332Sbostic 	register int j;
46432332Sbostic 
46532332Sbostic 	count += nleftover;
46632332Sbostic 	if (count != BUFSIZE && fd != 0)
46732332Sbostic 		str[count++] = NL;	/* insurance for broken last line */
46832332Sbostic 	str[count] = EOS;
46932332Sbostic 	for (j = count - 1; str[j] != NL && j >= 0;)
47032332Sbostic 		j--;
47132332Sbostic 	/*
47232332Sbostic 	 * break up these lines: long line (> BUFSIZE), last line of file, or
47332332Sbostic 	 * short return from read(), as from tee(1) input
47432332Sbostic 	 */
47532332Sbostic 	if (j < 0 && (count == (BUFSIZE - nleftover))) {
47632332Sbostic 		str[count++] = NL;
47732332Sbostic 		str[count] = EOS;
47832332Sbostic 		linetemp[0] = EOS;
47932332Sbostic 		nleftover = 0;
48032332Sbostic 		return (str + count);
48132332Sbostic 	} else {
48232332Sbostic 		nleftover = count - j - 1;
48332332Sbostic 		strncpy(linetemp, str + j + 1, nleftover);
48432332Sbostic 		return (str + j);
48532332Sbostic 	}
48632332Sbostic }
48732332Sbostic 
48832332Sbostic /*
48932332Sbostic  * Process partial match. First check for mis-aligned Kanji, then match line
49032332Sbostic  * against full compiled r.e. if statistics do not warrant handing off to
49132332Sbostic  * standard egrep.
49232332Sbostic  */
49332332Sbostic char *
submatch(file,pat,str,strend,k,altindex)49432332Sbostic submatch(file, pat, str, strend, k, altindex)
49532332Sbostic 	char file[], pat[], str[];
49632332Sbostic 	register char *strend, *k;
49732332Sbostic 	int altindex;
49832332Sbostic {
49932332Sbostic 	register char *s;
50032332Sbostic 	char *t, c;
50132332Sbostic 
50232332Sbostic 	t = k;
50332332Sbostic 	s = ((altflag) ? k - altlen[altindex] + 1 : k - altmin + 1);
50432332Sbostic #ifndef NOKANJI
50532332Sbostic 	c = ((altflag) ? altpat[altindex][0] : pat[0]);
50632332Sbostic 	if (c & NONASCII)
50732332Sbostic 		if ((s = kanji(str, s, k)) == NULL)
50832332Sbostic 			return (++k);	/* reject false kanji */
50932332Sbostic #endif
51032332Sbostic 	do;
51132332Sbostic 	while (*s != NL && --s >= str);
51232332Sbostic 	k = s + 1;		/* now at line start */
51332332Sbostic 
51432332Sbostic 	if (boyonly)
51532332Sbostic 		return (gotamatch(file, k));
51632332Sbostic 
51732332Sbostic 	incount = counted - (strend - k);
51832332Sbostic 	if (boyfound++ == FIRSTFEW)
51932332Sbostic 		execstrategy(file);
52032332Sbostic 
52132332Sbostic 	s = t;
52232332Sbostic 	do
52332332Sbostic 		rxcount++;
52432332Sbostic 	while (*s++ != NL);
52532332Sbostic 	*--s = EOS;
52632332Sbostic 	/*
52732332Sbostic 	 * "quick henry -- the flit" (after theodor geisel)
52832332Sbostic 	 */
52932332Sbostic 	if (regexec(rspencer, ((iflag) ? fold(k) : k)) == 1) {
53032332Sbostic 		*s = NL;
53132332Sbostic 		if (gotamatch(file, k) == NULL)
53232332Sbostic 			return (NULL);
53332332Sbostic 	}
53432332Sbostic 	*s = NL;
53532332Sbostic 	return (s + 1);
53632332Sbostic }
53732332Sbostic 
53832338Sbostic #ifndef NOKANJI
53932332Sbostic /*
54032332Sbostic  * EUC code disambiguation -- scan backwards to first 7-bit code, while
54132332Sbostic  * counting intervening 8-bit codes.  If odd, reject unaligned Kanji pattern.
54232332Sbostic  * SS2/3 checks are for intermixed Japanase Katakana or Kanji2.
54332332Sbostic  */
54432332Sbostic char *
kanji(str,s,k)54532332Sbostic kanji(str, s, k)
54632332Sbostic 	register char *str, *s, *k;
54732332Sbostic {
54832332Sbostic 	register int j = 0;
54932332Sbostic 
55032332Sbostic 	for (s--; s >= str; s--) {
55132332Sbostic 		if (*s == SS2 || *s == SS3 || (*s & NONASCII) == 0)
55232332Sbostic 			break;
55332332Sbostic 		j++;
55432332Sbostic 	}
55532332Sbostic #ifndef CHINESE
55632332Sbostic 	if (*s == SS2)
55732332Sbostic 		j -= 1;
55832332Sbostic #endif  CHINESE
55932332Sbostic 	return ((j & 01) ? NULL : k);
56032332Sbostic }
56132338Sbostic #endif
56232332Sbostic 
56332332Sbostic /*
56432332Sbostic  * Compute "Boyer-Moore" delta table -- put skip distance in delta0[c]
56532332Sbostic  */
gosper(pattern)56632332Sbostic gosper(pattern)
56732332Sbostic 	char *pattern;		/* ... HAKMEM lives ... */
56832332Sbostic {
56932332Sbostic 	register int i, j;
57032332Sbostic 	unsigned char c;
57132332Sbostic 
57232332Sbostic 	/* Make one-string case look like simple alternatives case */
57332332Sbostic 	if (!altflag) {
57432332Sbostic 		nalt = 1;
57532332Sbostic 		altmin = altlen[0] = strlen(pattern);
57632339Sbostic 		altpat[0] = pattern;
57732332Sbostic 	}
57832332Sbostic 	/* For chars that aren't in any string, skip by string length. */
57932332Sbostic 	for (j = 0; j < 256; j++) {
58032332Sbostic 		delta0[j] = altmin;
58132332Sbostic 		cmap[j] = j;	/* Sneak in initialization of cmap */
58232332Sbostic 	}
58332332Sbostic 
58432332Sbostic 	/* For chars in a string, skip distance from char to end of string. */
58532332Sbostic 	/* (If char appears more than once, skip minimum distance.) */
58632332Sbostic 	for (i = 0; i < nalt; i++)
58732332Sbostic 		for (j = 0; j < altlen[i] - 1; j++) {
58832332Sbostic 			c = altpat[i][j];
58932332Sbostic 			delta0[c] = MIN(delta0[c], altlen[i] - j - 1);
59032332Sbostic 			if (iflag && islower((int) c))
59132332Sbostic 				delta0[toupper((int) c)] = delta0[c];
59232332Sbostic 		}
59332332Sbostic 
59432332Sbostic 	/* For last char of each string, fall out of search loop. */
59532332Sbostic 	for (i = 0; i < nalt; i++) {
59632332Sbostic 		c = altpat[i][altlen[i] - 1];
59732332Sbostic 		delta0[c] = LARGE;
59832332Sbostic 		if (iflag && islower((int) c))
59932332Sbostic 			delta0[toupper((int) c)] = LARGE;
60032332Sbostic 	}
60132332Sbostic 	if (iflag)
60232332Sbostic 		for (j = 'A'; j <= 'Z'; j++)
60332332Sbostic 			cmap[j] = tolower((int) j);
60432332Sbostic }
60532332Sbostic 
60632332Sbostic /*
60732332Sbostic  * Print, count, or stop on full match. Result is either the location for
60832332Sbostic  * continued search, or NULL to stop.
60932332Sbostic  */
61032332Sbostic char *
gotamatch(file,s)61132332Sbostic gotamatch(file, s)
61232332Sbostic 	register char *file, *s;
61332332Sbostic {
61432332Sbostic 	char *savematch();
61532332Sbostic 	int squirrel = 0;	/* nonzero to squirrel away FIRSTFEW matches */
61632332Sbostic 
61732332Sbostic 	nmatch++;
61832332Sbostic 	nsuccess = 1;
61932332Sbostic 	if (!boyonly && boyfound <= FIRSTFEW && file != NULL)
62032332Sbostic 		squirrel = 1;
62132332Sbostic 
62232332Sbostic 	if (sflag)
62332332Sbostic 		return (NULL);	/* -s usurps all flags (unlike some versions) */
62432332Sbostic 	if (cflag) {		/* -c overrides -l, we guess */
62532332Sbostic 		do;
62632332Sbostic 		while (*s++ != NL);
62732332Sbostic 	} else if (lflag) {
62832332Sbostic 		puts(file);
62932332Sbostic 		return (NULL);
63032332Sbostic 	} else {
63132332Sbostic 		if (!hflag)
63232332Sbostic 			if (!squirrel)
63332332Sbostic 				printf("%s:", file);
63432332Sbostic 			else
63532502Sbostic 				(void)sprintf(preamble, "%s:", file);
63632332Sbostic 		if (nflag) {
63732332Sbostic 			if (prevmatch)
63832332Sbostic 				prevnline = prevnline + nlcount(prevloc, s);
63932332Sbostic 			else
64032332Sbostic 				prevnline = nline + nlcount(str, s);
64132332Sbostic 			prevmatch = 1;
64232332Sbostic 
64332332Sbostic 			if (!squirrel)
64432332Sbostic 				printf("%ld:", prevnline);
64532332Sbostic 			else
64632502Sbostic 				(void)sprintf(preamble + strlen(preamble),
64732332Sbostic 					"%ld:", prevnline);
64832332Sbostic 		}
64932332Sbostic 		if (!squirrel) {
65032332Sbostic 			do
65132332Sbostic 				putchar(*s);
65232332Sbostic 			while (*s++ != NL);
65332332Sbostic 		} else
65432332Sbostic 			s = savematch(s);
65532332Sbostic 
65632332Sbostic 		if (nflag)
65732332Sbostic 			prevloc = s - 1;
65832332Sbostic 	}
65932332Sbostic 	return ((firstflag && !cflag) ? NULL : s);
66032332Sbostic }
66132332Sbostic 
66232332Sbostic char *
fold(line)66332332Sbostic fold(line)
66432332Sbostic 	char *line;
66532332Sbostic {
66632332Sbostic 	static char fline[BUFSIZE];
66732332Sbostic 	register char *s, *t = fline;
66832332Sbostic 
66932332Sbostic 	for (s = line; *s != EOS; s++)
67032332Sbostic 		*t++ = (isupper((int) *s) ? (char) tolower((int) *s) : *s);
67132332Sbostic 	*t = EOS;
67232332Sbostic 	return (fline);
67332332Sbostic }
67432332Sbostic 
strindex(s,t)67532332Sbostic strindex(s, t)			/* the easy way, as in K&P, p. 192 */
67632332Sbostic 	char *s, *t;
67732332Sbostic {
67832332Sbostic 	int i, n;
67932332Sbostic 
68032332Sbostic 	n = strlen(t);
68132332Sbostic 	for (i = 0; s[i] != '\0'; i++)
68232332Sbostic 		if (strncmp(s + i, t, n) == 0)
68332332Sbostic 			return (i);
68432332Sbostic 	return (-1);
68532332Sbostic }
68632332Sbostic 
68732332Sbostic char *
grepxlat(pattern)68832332Sbostic grepxlat(pattern)		/* grep pattern meta conversion */
68932332Sbostic 	char *pattern;
69032332Sbostic {
69132332Sbostic 	register char *p, *s;
69232332Sbostic 	static char newpat[BUFSIZE];
69332332Sbostic 
69432332Sbostic 	for (s = newpat, p = pattern; *p != EOS;) {
69532332Sbostic 		if (*p == '\\') {	/* skip escapes ... */
69632332Sbostic 			*s++ = *p++;
69732332Sbostic 			if (*p)
69832332Sbostic 				*s++ = *p++;
69932332Sbostic 		} else if (*p == '[') {	/* ... and char classes */
70032332Sbostic 			while (*p != EOS && *p != ']')
70132332Sbostic 				*s++ = *p++;
70232332Sbostic 		} else if (strchr("+?|()", *p) != NULL) {
70332332Sbostic 			*s++ = '\\';	/* insert protection */
70432332Sbostic 			*s++ = *p++;
70532332Sbostic 		} else
70632332Sbostic 			*s++ = *p++;
70732332Sbostic 	}
70832332Sbostic 	*s = EOS;
70932332Sbostic 	grepflag = ((patind) ? 0 : 1);
71032332Sbostic 	return (newpat);
71132332Sbostic }
71232332Sbostic 
71332332Sbostic /*
71432332Sbostic  * Test for simple alternation.  Result is NULL if it's not so simple, or is
71532332Sbostic  * a pointer to the first string if it is. Warning:  sscanf size is a
71632332Sbostic  * fixpoint, beyond which the speedup linearity starts to break down.  In the
71732332Sbostic  * wake of the elegant aho/corrasick "trie"-based fgrep, generalizing
71832332Sbostic  * altpat[] to arbitrary size is not useful.
71932332Sbostic  */
72032332Sbostic char *
alternate(regexpr)72132332Sbostic alternate(regexpr)
72232332Sbostic 	char *regexpr;
72332332Sbostic {
72432339Sbostic 	register int i, j;
72532339Sbostic 	register char *start, *stop;
72632332Sbostic 	unsigned char c;
72732332Sbostic 
72832332Sbostic 	if (fgrepflag && strchr(regexpr, '|'))
72932339Sbostic 			return (NULL);
73032332Sbostic 
73132339Sbostic 	/*
73232339Sbostic 	 * break pattern up into altpat array; delimit on newline, bar,
73332339Sbostic 	 * or EOS.  We know we won't overflow, we've already checked the
73432339Sbostic 	 * number of patterns we're going to find against NALT.
73532339Sbostic 	 * Also, set length of pattern and find minimum pattern length.
73632339Sbostic 	 */
73732339Sbostic 	nalt = 0;
73832339Sbostic 	altmin = NMUSH;
73932339Sbostic 	for (start = stop = regexpr;; ++stop)
74032339Sbostic 		if (!*stop || *stop == '|' || *stop == NL) {
74132339Sbostic 			altlen[nalt] = j = stop - start;
74232339Sbostic 			if (j < altmin)
74332339Sbostic 				altmin = j;
74432339Sbostic 			if (!(altpat[nalt] = malloc((u_int)(j + 1))))
74532339Sbostic 				oops("out of memory");
74632339Sbostic 			bcopy(start, altpat[nalt], j);
74732339Sbostic 			altpat[nalt][j] = EOS;
74832362Sbostic 			++nalt;
74932339Sbostic 			if (!*stop)
75032339Sbostic 				break;
75132362Sbostic 			if (nalt == NALT)
75232339Sbostic 				return(NULL);
75332339Sbostic 			if (*stop == NL)
75432339Sbostic 				*stop = '|';
75532339Sbostic 			start = stop + 1;
75632339Sbostic 		}
75732332Sbostic 	if (!fgrepflag) {
75832332Sbostic 		if (strchr(regexpr, '|') == NULL || regexpr[0] == '|')
75932332Sbostic 			return (NULL);
76032332Sbostic 		if (strpbrk(regexpr, "^$.[]()?+*\\") != NULL
76132332Sbostic 		    || strindex(regexpr, "||") >= 0)
76232332Sbostic 			return (NULL);
76332332Sbostic 	}
76432332Sbostic 
76532332Sbostic 	if (nalt > 1) {		/* build superimposed "pre-match" sets per
76632332Sbostic 				 * char */
76732332Sbostic 		altflag++;
76832332Sbostic 		for (j = 0; j < nalt; j++)
76932332Sbostic 			for (i = 0; i < altmin; i++) {
77032332Sbostic 				c = altpat[j][altlen[j] - altmin + i];
77132332Sbostic 				altset[i + 1][c] = 1;	/* offset for sentinel */
77232332Sbostic 			}
77332332Sbostic 	}
77432332Sbostic 	return (altpat[0]);
77532332Sbostic }
77632332Sbostic 
77732332Sbostic /*
77832332Sbostic  * Grapple with the dfa (std egrep) vs. ndfa (regexp) tradeoff. Criteria to
77932332Sbostic  * determine whether to use dfa-based egrep:  We do FIRSTFEW matches with
78032332Sbostic  * regexec().  If Boyer-Moore up to now matched more than PUNTPERCENT
78132332Sbostic  * of the input, the r.e. is likely to be underspecified, so do old *grep,
78232332Sbostic  * which is faster on complex patterns than regexp().  At FIRSTFEW,
78332332Sbostic  * dump the saved matches collected by savematch(). They are saved
78432332Sbostic  * so that a "PUNT" can "rewind" to ignore them.  Stdin is problematic,
78532332Sbostic  * since it's hard to rewind.
78632332Sbostic  */
78732332Sbostic 
execstrategy(file)78832332Sbostic execstrategy(file)
78932332Sbostic 	char *file;
79032332Sbostic {
79132332Sbostic 	int pctmatch;
79232332Sbostic 
79332332Sbostic 	pctmatch = (100 * rxcount) / incount;
79432332Sbostic 	if (pctmatch > PUNTPERCENT && file != NULL)
79532332Sbostic 		kernighan(args);
79632332Sbostic 	if (file != NULL)
79732332Sbostic 		flushmatches();
79832332Sbostic }
79932332Sbostic 
nlcount(bstart,bstop)80032332Sbostic nlcount(bstart, bstop)		/* flail interval to totalize newlines. */
80132332Sbostic 	char *bstart, *bstop;
80232332Sbostic {
80332332Sbostic 	register char *s = bstart;
80432332Sbostic 	register char *t = bstop;
80532332Sbostic 	register int count = 0;
80632332Sbostic 
80732332Sbostic 	do {			/* loop unroll for older architectures */
80832332Sbostic 		if (*t == NL)	/* ... ask ames!jaw for sample code */
80932332Sbostic 			count++;
81032332Sbostic 	} while (t-- > s);
81132332Sbostic 
81232332Sbostic 	return (count);
81332332Sbostic }
81432332Sbostic 
81532332Sbostic char *
isolate(regexpr)81632332Sbostic isolate(regexpr)		/* isolate longest metacharacter-free string */
81732332Sbostic 	char *regexpr;
81832332Sbostic {
81932332Sbostic 	char *dummyexpr;
82032332Sbostic 
82132332Sbostic 	/*
82232332Sbostic 	 * We add (.)* because Henry's regcomp only figures regmust if it
82332332Sbostic 	 * sees a leading * pattern.  Foo!
82432332Sbostic 	 */
82532332Sbostic 	dummyexpr = malloc((unsigned) strlen(regexpr) + 5);
82632502Sbostic 	(void)sprintf(dummyexpr, "(.)*%s", regexpr);
82732332Sbostic 	if ((rspencer = regcomp(dummyexpr)) == NULL)
82832332Sbostic 		kernighan(args);
82932332Sbostic 	return (rspencer->regmust);
83032332Sbostic }
83132332Sbostic 
83232332Sbostic char *matches[FIRSTFEW];
83332332Sbostic static int mcount = 0;
83432332Sbostic 
83532332Sbostic char *
savematch(s)83632332Sbostic savematch(s)			/* horde matches during statistics gathering */
83732332Sbostic 	register char *s;
83832332Sbostic {
83932332Sbostic 	char *p;
84032332Sbostic 	char *start = s;
84132332Sbostic 	int msize = 0;
84232332Sbostic 	int psize = strlen(preamble);
84332332Sbostic 
84432332Sbostic 	while (*s++ != NL)
84532332Sbostic 		msize++;
84632332Sbostic 	*--s = EOS;
84732332Sbostic 
84832332Sbostic 	p = malloc((unsigned) msize + 1 + psize);
84932332Sbostic 	strcpy(p, preamble);
85032332Sbostic 	strcpy(p + psize, start);
85132332Sbostic 	matches[mcount++] = p;
85232332Sbostic 
85332332Sbostic 	preamble[0] = 0;
85432332Sbostic 	*s = NL;
85532332Sbostic 	return (s);
85632332Sbostic }
85732332Sbostic 
flushmatches()85832332Sbostic flushmatches()
85932332Sbostic {
86032332Sbostic 	int n;
86132332Sbostic 
86232332Sbostic 	flushflag = 1;
86332332Sbostic 	for (n = 0; n < mcount; n++)
86432332Sbostic 		printf("%s\n", matches[n]);
86532332Sbostic 	mcount = 0;
86632332Sbostic }
86732332Sbostic 
oops(message)86832332Sbostic oops(message)
86932332Sbostic 	char *message;
87032332Sbostic {
87132332Sbostic 	fprintf(stderr, "%s: %s\n", progname, message);
87232332Sbostic 	exit(2);
87332332Sbostic }
87432332Sbostic 
kernighan(args)87532332Sbostic kernighan(args)			/* "let others do the hard part ..." */
87632332Sbostic 	char *args[];
87732332Sbostic {
87832332Sbostic 	/*
87932332Sbostic 	 * We may have already run grep on some of the files; remove them
88032332Sbostic 	 * from the arg list we pass on.  Note that we can't delete them
88132332Sbostic 	 * totally because the number of file names affects the output
88232332Sbostic 	 * (automatic -h).
88332332Sbostic 	 */
88432332Sbostic 	/* better would be fork/exec per punted file -- jaw */
88532332Sbostic 
88632332Sbostic 	while (firstfile && optind > firstfile)
88737914Sbostic 		args[firstfile++] = _PATH_DEVNULL;
88832332Sbostic 	if (patind)
88932332Sbostic 		args[patind] = pattern;
89032339Sbostic 	(void) fflush(stdout);
89132332Sbostic 
89232332Sbostic 	if (grepflag)
89337914Sbostic 		execvp(_PATH_GREPSTD, args), oops("can't exec old 'grep'");
89432332Sbostic 	else if (fgrepflag)
89537914Sbostic 		execvp(_PATH_FGREPSTD, args), oops("can't exec old 'fgrep'");
89632332Sbostic 	else
89737914Sbostic 		execvp(_PATH_EGREPSTD, args), oops("can't exec old 'egrep'");
89832332Sbostic }
899