xref: /csrg-svn/usr.bin/grep/egrep/egrep.c (revision 46717)
132332Sbostic #ifndef lint
2*46717Sbostic static char sccsid[] = "@(#)egrep.c	5.13 (Berkeley) 02/27/91";
332332Sbostic #endif not lint
432332Sbostic 
532332Sbostic /*
632332Sbostic      Hybrid Boyer/Moore/Gosper-assisted 'grep/egrep/fgrep' search, with delta0
732332Sbostic      table as in original paper (CACM, October, 1977).  No delta1 or delta2.
832332Sbostic      According to experiment (Horspool, Soft. Prac. Exp., 1982), delta2 is of
932332Sbostic      minimal practical value.  However, to improve for worst case input,
1032332Sbostic      integrating the improved Galil strategies (Apostolico/Giancarlo, SIAM. J.
1132332Sbostic      Comput., Feb. 1986) deserves consideration.
1232332Sbostic 
1332332Sbostic      Method: 	extract longest metacharacter-free string from expression.
1432332Sbostic 		this is done using a side-effect from henry spencer's regcomp().
1532332Sbostic 		use boyer-moore to match such, then pass submatching lines
1632332Sbostic 		to either regexp() or standard 'egrep', depending on certain
1732332Sbostic 		criteria within execstrategy() below.  [this tradeoff is due
1832332Sbostic 		to the general slowness of the regexp() nondeterministic
1932332Sbostic 		machine on complex expressions, as well as the startup time
2032332Sbostic 		of standard 'egrep' on short files.]  alternatively, one may
2132332Sbostic 		change the vendor-supplied 'egrep' automaton to include
2232332Sbostic 		boyer-moore directly.  see accompanying writeup for discussion
2332332Sbostic 		of kanji expression treatment.
2432332Sbostic 
2532332Sbostic 		late addition:  apply trickbag for fast match of simple
2632332Sbostic 		alternations (sublinear, in common low-cardinality cases).
2732332Sbostic 		trap fgrep into this lair.
2832332Sbostic 
2932332Sbostic 		gnu additions:  -f, newline as |, \< and \> [in regexec()], more
3032332Sbostic 				comments.  inspire better dfa exec() strategy.
3132332Sbostic 				serious testing and help with special cases.
3232332Sbostic 
3332332Sbostic      Algorithm amalgam summary:
3432332Sbostic 
3532332Sbostic 		dfa e?grep 		(aho/thompson)
3632332Sbostic 		ndfa regexp() 		(spencer/aho)
3732332Sbostic 		bmg			(boyer/moore/gosper)
3832332Sbostic 		"superimposed" bmg   	(jaw)
3932332Sbostic 		fgrep			(aho/corrasick)
4032332Sbostic 
4132332Sbostic 		sorry, but the knuth/morris/pratt machine, horspool's
4232332Sbostic 		"frequentist" code, and the rabin/karp matcher, however cute,
4332332Sbostic 		just don't cut it for this production.
4432332Sbostic 
4532332Sbostic      James A. Woods				Copyright (c) 1986
4632332Sbostic      NASA Ames Research Center
4732332Sbostic */
4832332Sbostic #include <sys/types.h>
4932332Sbostic #include <sys/stat.h>
5032337Sbostic #include <sys/file.h>
5132332Sbostic #include <regexp.h>		/* must be henry spencer's version */
5237914Sbostic #include <stdio.h>
5337914Sbostic #include <ctype.h>
5437914Sbostic #include "pathnames.h"
5532332Sbostic 
5632332Sbostic #define	MIN(A, B)	((A) > (B) ? (B) : (A))
5732332Sbostic 
5832332Sbostic #ifdef	SLOWSYS
5932332Sbostic #define read	xread
6032332Sbostic #endif
6132332Sbostic 
6232332Sbostic #define BUFSIZE	8192		/* make higher for cray */
6332332Sbostic #define PATSIZE 6000
6432332Sbostic #define LARGE 	BUFSIZE + PATSIZE
6532332Sbostic 
6632332Sbostic #define NALT	7		/* tied to scanf() size in alternate() */
6732332Sbostic #define NMUSH	6		/* loosely relates to expected alt length */
6832332Sbostic 
6932332Sbostic #define	FIRSTFEW	33 	/* Always do FIRSTFEW matches with regexec() */
7032332Sbostic #define	PUNTPERCENT	10	/* After FIRSTFEW, if PUNTPERCENT of the input
7132332Sbostic 				 * was processed by regexp(), exec std egrep. */
7232332Sbostic #define NL	'\n'
7332332Sbostic #define	EOS	'\0'
7432332Sbostic #define	NONASCII	0200	/* Bit mask for Kanji non-ascii chars */
7532332Sbostic #define META	"\n^$.[]()?+*|\\"	/* egrep meta-characters */
7632332Sbostic #define SS2	'\216'		/* EUC Katakana (or Chinese2) prefix */
7732332Sbostic #define SS3	'\217'		/* EUC Kanji2 (or Chinese3) prefix */
7832332Sbostic 
7932332Sbostic extern char *optarg;
8032332Sbostic extern int optind;
8132332Sbostic char *progname;
8232332Sbostic 
8332332Sbostic int cflag, iflag, eflag, fflag, lflag, nflag;	/* SVID flags */
8432332Sbostic int sflag, hflag;		/* v7, v8, bsd */
8532332Sbostic 
8632332Sbostic int firstflag;			/* Stop at first match */
8732332Sbostic int grepflag;			/* Called as "grep" */
8832332Sbostic int fgrepflag;			/* Called as "fgrep" */
8932332Sbostic int altflag;			/* Simple alternation in pattern */
9032332Sbostic int boyonly;			/* No regexp needed -- all simple */
9132332Sbostic int flushflag;
9232332Sbostic int grepold, egrepold, fgrepold;
9332332Sbostic 
9432332Sbostic int nalt;			/* Number of alternatives */
9532332Sbostic int nsuccess;			/* 1 for match, 2 for error */
9632332Sbostic int altmin;			/* Minimum length of all the alternate
9732332Sbostic 				 * strings */
9832332Sbostic int firstfile;			/* argv index of first file argument */
9932332Sbostic int patind;			/* argv index of pattern */
10032332Sbostic long nmatch;			/* Number of matches in this file */
10132332Sbostic long incount, counted;		/* Amount of input consumed */
10232332Sbostic long rxcount;			/* Bytes of input processed by regexec() */
10332332Sbostic int boyfound;			/* accumulated partial matches (tripped by
10432332Sbostic 				 * FIRSTFEW) */
10532332Sbostic int prevmatch;			/* next three lines aid fast -n */
10632332Sbostic long nline, prevnline;
10732332Sbostic char *prevloc;
10832332Sbostic 
10932332Sbostic regexp *rspencer;
11032332Sbostic char *pattern;
11132332Sbostic char *patboy;			/* Pattern for simple Boyer-Moore */
11232332Sbostic char *patfile;			/* Filename containing pattern(s) */
11332332Sbostic 
11432332Sbostic int delta0[256];		/* Boyer-Moore algorithm core */
11532332Sbostic char cmap[256];			/* Usually 0-255, but if -i, maps upper to
11632332Sbostic 				 * lower case */
11732332Sbostic char str[BUFSIZE + 2];
11832332Sbostic int nleftover;
11932332Sbostic char linetemp[BUFSIZE];
12032339Sbostic char *altpat[NALT];		/* alternation component storage */
12132332Sbostic int altlen[NALT];
12232332Sbostic short altset[NMUSH + 1][256];
12332332Sbostic char preamble[200];		/* match prefix (filename, line no.) */
12432332Sbostic 
12532332Sbostic int fd;
12632332Sbostic char *
12732502Sbostic strchr(), *strrchr(), *strcpy(), *strncpy(), *strpbrk(), *malloc();
12832332Sbostic char *
12932332Sbostic grepxlat(), *fold(), *pfile(), *alternate(), *isolate();
13035692Sbostic char *gotamatch(), *kanji(), *linesave(), *submatch();
13132332Sbostic char **args;
13232332Sbostic 
13332332Sbostic main(argc, argv)
13432332Sbostic 	int argc;
13532332Sbostic 	char *argv[];
13632332Sbostic {
13732365Sbostic 	int c, oflag;
13832332Sbostic 	int errflag = 0;
13932332Sbostic 
14032332Sbostic 	args = argv;
14132332Sbostic 
14232332Sbostic 	if ((progname = strrchr(argv[0], '/')) != 0)
14332332Sbostic 		progname++;
14432332Sbostic 	else
14532332Sbostic 		progname = argv[0];
14632332Sbostic 	if (strcmp(progname, "grep") == 0)
14732332Sbostic 		grepflag++;
14832365Sbostic 	else if (strcmp(progname, "fgrep") == 0)
14932332Sbostic 		fgrepflag++;
15032332Sbostic 
15132365Sbostic 	oflag = 0;
15232365Sbostic 	while ((c = getopt(argc, argv, "bchie:f:lnosvwxy1")) != EOF) {
15332332Sbostic 		switch (c) {
15432332Sbostic 
15532332Sbostic 		case 'f':
15632332Sbostic 			fflag++;
15732332Sbostic 			patfile = optarg;
15832332Sbostic 			continue;
15932332Sbostic 		case 'b':
16032332Sbostic 		case 'v':
16132332Sbostic 			egrepold++;	/* boyer-moore of little help here */
16232332Sbostic 			continue;
16332332Sbostic 		case 'c':
16432332Sbostic 			cflag++;
16532332Sbostic 			continue;
16632332Sbostic 		case 'e':
16732332Sbostic 			eflag++;
16832332Sbostic 			pattern = optarg;
16932332Sbostic 			continue;
17032332Sbostic 		case 'h':
17132332Sbostic 			hflag++;
17232332Sbostic 			continue;
17332365Sbostic 		case 'o':
17432365Sbostic 			oflag++;
17532365Sbostic 			continue;
17632332Sbostic 		case '1':	/* Stop at very first match */
17732332Sbostic 			firstflag++;	/* spead freaks only */
17832332Sbostic 			continue;
17932332Sbostic 		case 'i':
18032332Sbostic 			iflag++;
18132332Sbostic 			continue;
18232332Sbostic 		case 'l':
18332332Sbostic 			lflag++;
18432332Sbostic 			continue;
18532332Sbostic 		case 'n':
18632332Sbostic 			nflag++;
18732332Sbostic 			continue;
18832332Sbostic 		case 's':
18932332Sbostic 			sflag++;
19032332Sbostic 			continue;
19132332Sbostic 		case 'w':
19232332Sbostic 		case 'y':
19332332Sbostic 			if (!grepflag)
19432332Sbostic 				errflag++;
19532332Sbostic 			grepold++;
19632332Sbostic 			continue;
19732332Sbostic 		case 'x':	/* needs more work, like -b above */
19832332Sbostic 			if (!fgrepflag)
19932332Sbostic 				errflag++;
20032332Sbostic 			fgrepold++;
20132332Sbostic 			continue;
20232332Sbostic 		case '?':
20332332Sbostic 			errflag++;
20432332Sbostic 		}
20532332Sbostic 	}
20632332Sbostic 	if (errflag || ((argc <= optind) && !fflag && !eflag)) {
20732332Sbostic 		if (grepflag)
20832365Sbostic oops("usage: grep [-bchilnosvwy] [-e] pattern [file ...]");
20932332Sbostic 		else if (fgrepflag)
21032366Sbostic oops("usage: fgrep [-bchilnosvx] {-f patfile | [-e] strings} [file ...]");
21132332Sbostic 		else		/* encourage SVID options, though we provide
21232332Sbostic 				 * others */
21332366Sbostic oops("usage: egrep [-bchilnosv] {-f patfile | [-e] pattern} [file ...]");
21432332Sbostic 	}
21532332Sbostic 	if (fflag)
21634917Sbostic 		pattern = pfile(patfile);
21734917Sbostic 	else if (!eflag) {
21834917Sbostic 		patind = optind;
21932332Sbostic 		pattern = argv[optind++];
22034917Sbostic 	}
22132332Sbostic 
22232365Sbostic 	if (!oflag && (argc - optind) <= 1)	/* Filename invisible given < 2 files */
22332332Sbostic 		hflag++;
22432332Sbostic 	if (pattern[0] == EOS)
22532332Sbostic 		kernighan(argv);	/* same as it ever was */
22632332Sbostic 	/*
22732332Sbostic 	 * 'grep/egrep' merger -- "old" grep is called to handle: tagged
22832332Sbostic 	 * exprs \( \), word matches \< and \>, -w and -y options, char
22932332Sbostic 	 * classes with '-' at end (egrep bug?), and patterns beginning with
23032332Sbostic 	 * an asterisk (don't ask why). otherwise, characters meaningful to
23132332Sbostic 	 * 'egrep' but not to 'grep' are escaped; the entire expr is then
23232332Sbostic 	 * passed to 'egrep'.
23332332Sbostic 	 */
23432332Sbostic 	if (grepflag && !grepold) {
23532332Sbostic 		if (strindex(pattern, "\\(") >= 0 ||
23632332Sbostic 		    strindex(pattern, "\\<") >= 0 ||
23732332Sbostic 		    strindex(pattern, "\\>") >= 0 ||
23832332Sbostic 		    strindex(pattern, "-]") >= 0 ||
23932332Sbostic 		    pattern[0] == '*')	/* grep bug */
24032332Sbostic 			grepold++;
24132332Sbostic 		else
24232332Sbostic 			pattern = grepxlat(pattern);
24332332Sbostic 	}
24432332Sbostic 	if (grepold || egrepold || fgrepold)
24532332Sbostic 		kernighan(argv);
24632332Sbostic 
24732332Sbostic 	if (iflag)
24832332Sbostic 		strcpy(pattern, fold(pattern));
24932332Sbostic 	/*
25032332Sbostic 	 * If the pattern is a plain string, just run boyer-moore. If it
25132332Sbostic 	 * consists of meta-free alternatives, run "superimposed" bmg.
25232332Sbostic 	 * Otherwise, find best string, and compile pattern for regexec().
25332332Sbostic 	 */
25432332Sbostic 	if (strpbrk(pattern, META) == NULL) {	/* do boyer-moore only */
25532332Sbostic 		boyonly++;
25632332Sbostic 		patboy = pattern;
25732332Sbostic 	} else {
25832332Sbostic 		if ((patboy = alternate(pattern)) != NULL)
25932332Sbostic 			boyonly++;
26032332Sbostic 		else {
26132332Sbostic 			if ((patboy = isolate(pattern)) == NULL)
26232332Sbostic 				kernighan(argv);	/* expr too involved */
26332332Sbostic #ifndef NOKANJI
26432332Sbostic 			for (c = 0; pattern[c] != EOS; c++)
26532332Sbostic 				if (pattern[c] & NONASCII)	/* kanji + meta */
26632332Sbostic 					kernighan(argv);
26732332Sbostic #endif
26832332Sbostic 			if ((rspencer = regcomp(pattern)) == NULL)
26932332Sbostic 				oops("regcomp failure");
27032332Sbostic 		}
27132332Sbostic 	}
27232332Sbostic 	gosper(patboy);		/* "pre-conditioning is wonderful"
27332332Sbostic 				 * -- v. strassen */
27432332Sbostic 
27532332Sbostic 	if ((firstfile = optind) >= argc) {
27632332Sbostic 		/* Grep standard input */
27732332Sbostic 		if (lflag)	/* We don't know its name! */
27832332Sbostic 			exit(1);
27932332Sbostic 		egsecute((char *) NULL);
28032332Sbostic 	} else {
28132332Sbostic 		while (optind < argc) {
28232332Sbostic 			egsecute(argv[optind]);
28332332Sbostic 			optind++;
28432332Sbostic 			if (firstflag && (nsuccess == 1))
28532332Sbostic 				break;
28632332Sbostic 		}
28732332Sbostic 	}
28832332Sbostic 	exit((nsuccess == 2) ? 2 : (nsuccess == 0));
28932332Sbostic }
29032332Sbostic 
29132332Sbostic char *
29232332Sbostic pfile(pfname)			/* absorb expression from file */
29332332Sbostic 	char *pfname;
29432332Sbostic {
29532337Sbostic 	int fd;
29632332Sbostic 	struct stat patstat;
29732332Sbostic 	static char *pat;
29832332Sbostic 
29932337Sbostic 	if ((fd = open(pfname, O_RDONLY, 0)) < 0)
30032332Sbostic 		oops("can't read pattern file");
30132337Sbostic 	if (fstat(fd, &patstat) != 0)
30232332Sbostic 		oops("can't stat pattern file");
30332332Sbostic 	if (patstat.st_size > PATSIZE) {
30432332Sbostic 		if (fgrepflag) {	/* defer to unix version */
30532332Sbostic 			fgrepold++;
30632332Sbostic 			return "dummy";
30732332Sbostic 		} else
30832332Sbostic 			oops("pattern file too big");
30932332Sbostic 	}
31032332Sbostic 	if ((pat = malloc((unsigned) patstat.st_size + 1)) == NULL)
31132332Sbostic 		oops("out of memory to read pattern file");
31232337Sbostic 	if (patstat.st_size != read(fd, pat, (int)patstat.st_size))
31332332Sbostic 		oops("error reading pattern file");
31432337Sbostic 	(void) close(fd);
31532332Sbostic 
31632332Sbostic 	pat[patstat.st_size] = EOS;
31732332Sbostic 	if (pat[patstat.st_size - 1] == NL)	/* NOP for egrep; helps grep */
31832332Sbostic 		pat[patstat.st_size - 1] = EOS;
31932332Sbostic 
32032332Sbostic 	if (nlcount(pat, &pat[patstat.st_size]) > NALT) {
32132332Sbostic 		if (fgrepflag)
32232332Sbostic 			fgrepold++;	/* "what's it all about, alfie?" */
32332332Sbostic 		else
32432332Sbostic 			egrepold++;
32532332Sbostic 	}
32632332Sbostic 	return (pat);
32732332Sbostic }
32832332Sbostic 
32932332Sbostic egsecute(file)
33032332Sbostic 	char *file;
33132332Sbostic {
332*46717Sbostic 	extern int errno;
333*46717Sbostic 
33432332Sbostic 	if (file == NULL)
33532332Sbostic 		fd = 0;
33632339Sbostic 	else if ((fd = open(file, O_RDONLY, 0)) <= 0) {
337*46717Sbostic 		fprintf(stderr,
338*46717Sbostic 		    "%s: %s: %s\n", progname, file, strerror(errno));
33932332Sbostic 		nsuccess = 2;
34032332Sbostic 		return;
34132332Sbostic 	}
34232332Sbostic 	chimaera(file, patboy);
34332332Sbostic 
34432332Sbostic 	if (!boyonly && !flushflag && file != NULL)
34532332Sbostic 		flushmatches();
34632332Sbostic 	if (file != NULL)
34732332Sbostic 		close(fd);
34832332Sbostic }
34932332Sbostic 
35032332Sbostic chimaera(file, pat)		/* "reach out and boyer-moore search someone" */
35132332Sbostic 	char *file, *pat;	/* -- soon-to-be-popular bumper sticker */
35232332Sbostic {
35332332Sbostic 	register char *k, *strend, *s;
35432332Sbostic 	register int j, count;
35532332Sbostic 	register int *deltazero = delta0;
35632332Sbostic 	int patlen = altmin;
35732332Sbostic 	char *t;
35832332Sbostic 
35932332Sbostic 	nleftover = boyfound = flushflag = 0;
36032332Sbostic 	nline = 1L;
36132332Sbostic 	prevmatch = 0;
36232332Sbostic 	nmatch = counted = rxcount = 0L;
36332332Sbostic 
36432332Sbostic 	while ((count = read(fd, str + nleftover, BUFSIZE - nleftover)) > 0) {
36532332Sbostic 
36632332Sbostic 		counted += count;
36732332Sbostic 		strend = linesave(str, count);
36832332Sbostic 
36932332Sbostic 		for (k = str + patlen - 1; k < strend;) {
37032332Sbostic 			/*
37132332Sbostic 			 * for a large class of patterns, upwards of 80% of
37232332Sbostic 			 * match time is spent on the next line.  we beat
37332332Sbostic 			 * existing microcode (vax 'matchc') this way.
37432332Sbostic 			 */
37532332Sbostic 			while ((k += deltazero[*(unsigned char *) k]) < strend);
37632332Sbostic 			if (k < (str + LARGE))
37732332Sbostic 				break;
37832332Sbostic 			k -= LARGE;
37932332Sbostic 
38032332Sbostic 			if (altflag) {
38132332Sbostic 				/*
38232332Sbostic 				 * Parallel Boyer-Moore.  Check whether each
38332332Sbostic 				 * of the previous <altmin> chars COULD be
38432332Sbostic 				 * from one of the alternative strings.
38532332Sbostic 				 */
38632332Sbostic 				s = k - 1;
38732332Sbostic 				j = altmin;
38832332Sbostic 				while (altset[--j][(unsigned char)
38932332Sbostic 					     cmap[*(unsigned char *) s--]]);
39032332Sbostic 				/*
39132332Sbostic 				 * quick test fails. in this life, compare
39232332Sbostic 				 * 'em all.  but, a "reverse trie" would
39332332Sbostic 				 * attenuate worst case (linear w/delta2?).
39432332Sbostic 				 */
39532332Sbostic 				if (--j < 0) {
39632332Sbostic 					count = nalt - 1;
39732332Sbostic 					do {
39832332Sbostic 						s = k;
39932332Sbostic 						j = altlen[count];
40032332Sbostic 						t = altpat[count];
40132332Sbostic 
40232332Sbostic 						while
40332332Sbostic 							(cmap[*(unsigned char *) s--]
40432332Sbostic 							 == t[--j]);
40532332Sbostic 						if (j < 0)
40632332Sbostic 							break;
40732332Sbostic 					}
40832332Sbostic 					while (count--);
40932332Sbostic 				}
41032332Sbostic 			} else {
41132332Sbostic 				/* One string -- check it */
41232332Sbostic 				j = patlen - 1;
41332332Sbostic 				s = k - 1;
41432332Sbostic 				while (cmap[*(unsigned char *) s--] == pat[--j]);
41532332Sbostic 			}
41632332Sbostic 			/*
41732332Sbostic 			 * delta-less shortcut for literati. short shrift for
41832332Sbostic 			 * genetic engineers?
41932332Sbostic 			 */
42032332Sbostic 			if (j >= 0) {
42132332Sbostic 				k++;	/* no match; restart next char */
42232332Sbostic 				continue;
42332332Sbostic 			}
42432332Sbostic 			k = submatch(file, pat, str, strend, k, count);
42532332Sbostic 			if (k == NULL)
42632332Sbostic 				return;
42732332Sbostic 		}
42832332Sbostic 		if (nflag) {
42932332Sbostic 			if (prevmatch)
43032332Sbostic 				nline = prevnline + nlcount(prevloc, k);
43132332Sbostic 			else
43232332Sbostic 				nline = nline + nlcount(str, k);
43332332Sbostic 			prevmatch = 0;
43432332Sbostic 		}
43532332Sbostic 		strncpy(str, linetemp, nleftover);
43632332Sbostic 	}
43732332Sbostic 	if (cflag) {
43832332Sbostic 		/* Bug from old grep: -c overrides -h.  We fix the bug. */
43932332Sbostic 		if (!hflag)
44032332Sbostic 			printf("%s:", file);
44132332Sbostic 		printf("%ld\n", nmatch);
44232332Sbostic 	}
44332332Sbostic }
44432332Sbostic 
44532332Sbostic char *
44632332Sbostic linesave(str, count)		/* accumulate partial line at end of buffer */
44732332Sbostic 	char str[];
44832332Sbostic 	register int count;
44932332Sbostic {
45032332Sbostic 	register int j;
45132332Sbostic 
45232332Sbostic 	count += nleftover;
45332332Sbostic 	if (count != BUFSIZE && fd != 0)
45432332Sbostic 		str[count++] = NL;	/* insurance for broken last line */
45532332Sbostic 	str[count] = EOS;
45632332Sbostic 	for (j = count - 1; str[j] != NL && j >= 0;)
45732332Sbostic 		j--;
45832332Sbostic 	/*
45932332Sbostic 	 * break up these lines: long line (> BUFSIZE), last line of file, or
46032332Sbostic 	 * short return from read(), as from tee(1) input
46132332Sbostic 	 */
46232332Sbostic 	if (j < 0 && (count == (BUFSIZE - nleftover))) {
46332332Sbostic 		str[count++] = NL;
46432332Sbostic 		str[count] = EOS;
46532332Sbostic 		linetemp[0] = EOS;
46632332Sbostic 		nleftover = 0;
46732332Sbostic 		return (str + count);
46832332Sbostic 	} else {
46932332Sbostic 		nleftover = count - j - 1;
47032332Sbostic 		strncpy(linetemp, str + j + 1, nleftover);
47132332Sbostic 		return (str + j);
47232332Sbostic 	}
47332332Sbostic }
47432332Sbostic 
47532332Sbostic /*
47632332Sbostic  * Process partial match. First check for mis-aligned Kanji, then match line
47732332Sbostic  * against full compiled r.e. if statistics do not warrant handing off to
47832332Sbostic  * standard egrep.
47932332Sbostic  */
48032332Sbostic char *
48132332Sbostic submatch(file, pat, str, strend, k, altindex)
48232332Sbostic 	char file[], pat[], str[];
48332332Sbostic 	register char *strend, *k;
48432332Sbostic 	int altindex;
48532332Sbostic {
48632332Sbostic 	register char *s;
48732332Sbostic 	char *t, c;
48832332Sbostic 
48932332Sbostic 	t = k;
49032332Sbostic 	s = ((altflag) ? k - altlen[altindex] + 1 : k - altmin + 1);
49132332Sbostic #ifndef NOKANJI
49232332Sbostic 	c = ((altflag) ? altpat[altindex][0] : pat[0]);
49332332Sbostic 	if (c & NONASCII)
49432332Sbostic 		if ((s = kanji(str, s, k)) == NULL)
49532332Sbostic 			return (++k);	/* reject false kanji */
49632332Sbostic #endif
49732332Sbostic 	do;
49832332Sbostic 	while (*s != NL && --s >= str);
49932332Sbostic 	k = s + 1;		/* now at line start */
50032332Sbostic 
50132332Sbostic 	if (boyonly)
50232332Sbostic 		return (gotamatch(file, k));
50332332Sbostic 
50432332Sbostic 	incount = counted - (strend - k);
50532332Sbostic 	if (boyfound++ == FIRSTFEW)
50632332Sbostic 		execstrategy(file);
50732332Sbostic 
50832332Sbostic 	s = t;
50932332Sbostic 	do
51032332Sbostic 		rxcount++;
51132332Sbostic 	while (*s++ != NL);
51232332Sbostic 	*--s = EOS;
51332332Sbostic 	/*
51432332Sbostic 	 * "quick henry -- the flit" (after theodor geisel)
51532332Sbostic 	 */
51632332Sbostic 	if (regexec(rspencer, ((iflag) ? fold(k) : k)) == 1) {
51732332Sbostic 		*s = NL;
51832332Sbostic 		if (gotamatch(file, k) == NULL)
51932332Sbostic 			return (NULL);
52032332Sbostic 	}
52132332Sbostic 	*s = NL;
52232332Sbostic 	return (s + 1);
52332332Sbostic }
52432332Sbostic 
52532338Sbostic #ifndef NOKANJI
52632332Sbostic /*
52732332Sbostic  * EUC code disambiguation -- scan backwards to first 7-bit code, while
52832332Sbostic  * counting intervening 8-bit codes.  If odd, reject unaligned Kanji pattern.
52932332Sbostic  * SS2/3 checks are for intermixed Japanase Katakana or Kanji2.
53032332Sbostic  */
53132332Sbostic char *
53232332Sbostic kanji(str, s, k)
53332332Sbostic 	register char *str, *s, *k;
53432332Sbostic {
53532332Sbostic 	register int j = 0;
53632332Sbostic 
53732332Sbostic 	for (s--; s >= str; s--) {
53832332Sbostic 		if (*s == SS2 || *s == SS3 || (*s & NONASCII) == 0)
53932332Sbostic 			break;
54032332Sbostic 		j++;
54132332Sbostic 	}
54232332Sbostic #ifndef CHINESE
54332332Sbostic 	if (*s == SS2)
54432332Sbostic 		j -= 1;
54532332Sbostic #endif  CHINESE
54632332Sbostic 	return ((j & 01) ? NULL : k);
54732332Sbostic }
54832338Sbostic #endif
54932332Sbostic 
55032332Sbostic /*
55132332Sbostic  * Compute "Boyer-Moore" delta table -- put skip distance in delta0[c]
55232332Sbostic  */
55332332Sbostic gosper(pattern)
55432332Sbostic 	char *pattern;		/* ... HAKMEM lives ... */
55532332Sbostic {
55632332Sbostic 	register int i, j;
55732332Sbostic 	unsigned char c;
55832332Sbostic 
55932332Sbostic 	/* Make one-string case look like simple alternatives case */
56032332Sbostic 	if (!altflag) {
56132332Sbostic 		nalt = 1;
56232332Sbostic 		altmin = altlen[0] = strlen(pattern);
56332339Sbostic 		altpat[0] = pattern;
56432332Sbostic 	}
56532332Sbostic 	/* For chars that aren't in any string, skip by string length. */
56632332Sbostic 	for (j = 0; j < 256; j++) {
56732332Sbostic 		delta0[j] = altmin;
56832332Sbostic 		cmap[j] = j;	/* Sneak in initialization of cmap */
56932332Sbostic 	}
57032332Sbostic 
57132332Sbostic 	/* For chars in a string, skip distance from char to end of string. */
57232332Sbostic 	/* (If char appears more than once, skip minimum distance.) */
57332332Sbostic 	for (i = 0; i < nalt; i++)
57432332Sbostic 		for (j = 0; j < altlen[i] - 1; j++) {
57532332Sbostic 			c = altpat[i][j];
57632332Sbostic 			delta0[c] = MIN(delta0[c], altlen[i] - j - 1);
57732332Sbostic 			if (iflag && islower((int) c))
57832332Sbostic 				delta0[toupper((int) c)] = delta0[c];
57932332Sbostic 		}
58032332Sbostic 
58132332Sbostic 	/* For last char of each string, fall out of search loop. */
58232332Sbostic 	for (i = 0; i < nalt; i++) {
58332332Sbostic 		c = altpat[i][altlen[i] - 1];
58432332Sbostic 		delta0[c] = LARGE;
58532332Sbostic 		if (iflag && islower((int) c))
58632332Sbostic 			delta0[toupper((int) c)] = LARGE;
58732332Sbostic 	}
58832332Sbostic 	if (iflag)
58932332Sbostic 		for (j = 'A'; j <= 'Z'; j++)
59032332Sbostic 			cmap[j] = tolower((int) j);
59132332Sbostic }
59232332Sbostic 
59332332Sbostic /*
59432332Sbostic  * Print, count, or stop on full match. Result is either the location for
59532332Sbostic  * continued search, or NULL to stop.
59632332Sbostic  */
59732332Sbostic char *
59832332Sbostic gotamatch(file, s)
59932332Sbostic 	register char *file, *s;
60032332Sbostic {
60132332Sbostic 	char *savematch();
60232332Sbostic 	int squirrel = 0;	/* nonzero to squirrel away FIRSTFEW matches */
60332332Sbostic 
60432332Sbostic 	nmatch++;
60532332Sbostic 	nsuccess = 1;
60632332Sbostic 	if (!boyonly && boyfound <= FIRSTFEW && file != NULL)
60732332Sbostic 		squirrel = 1;
60832332Sbostic 
60932332Sbostic 	if (sflag)
61032332Sbostic 		return (NULL);	/* -s usurps all flags (unlike some versions) */
61132332Sbostic 	if (cflag) {		/* -c overrides -l, we guess */
61232332Sbostic 		do;
61332332Sbostic 		while (*s++ != NL);
61432332Sbostic 	} else if (lflag) {
61532332Sbostic 		puts(file);
61632332Sbostic 		return (NULL);
61732332Sbostic 	} else {
61832332Sbostic 		if (!hflag)
61932332Sbostic 			if (!squirrel)
62032332Sbostic 				printf("%s:", file);
62132332Sbostic 			else
62232502Sbostic 				(void)sprintf(preamble, "%s:", file);
62332332Sbostic 		if (nflag) {
62432332Sbostic 			if (prevmatch)
62532332Sbostic 				prevnline = prevnline + nlcount(prevloc, s);
62632332Sbostic 			else
62732332Sbostic 				prevnline = nline + nlcount(str, s);
62832332Sbostic 			prevmatch = 1;
62932332Sbostic 
63032332Sbostic 			if (!squirrel)
63132332Sbostic 				printf("%ld:", prevnline);
63232332Sbostic 			else
63332502Sbostic 				(void)sprintf(preamble + strlen(preamble),
63432332Sbostic 					"%ld:", prevnline);
63532332Sbostic 		}
63632332Sbostic 		if (!squirrel) {
63732332Sbostic 			do
63832332Sbostic 				putchar(*s);
63932332Sbostic 			while (*s++ != NL);
64032332Sbostic 		} else
64132332Sbostic 			s = savematch(s);
64232332Sbostic 
64332332Sbostic 		if (nflag)
64432332Sbostic 			prevloc = s - 1;
64532332Sbostic 	}
64632332Sbostic 	return ((firstflag && !cflag) ? NULL : s);
64732332Sbostic }
64832332Sbostic 
64932332Sbostic char *
65032332Sbostic fold(line)
65132332Sbostic 	char *line;
65232332Sbostic {
65332332Sbostic 	static char fline[BUFSIZE];
65432332Sbostic 	register char *s, *t = fline;
65532332Sbostic 
65632332Sbostic 	for (s = line; *s != EOS; s++)
65732332Sbostic 		*t++ = (isupper((int) *s) ? (char) tolower((int) *s) : *s);
65832332Sbostic 	*t = EOS;
65932332Sbostic 	return (fline);
66032332Sbostic }
66132332Sbostic 
66232332Sbostic strindex(s, t)			/* the easy way, as in K&P, p. 192 */
66332332Sbostic 	char *s, *t;
66432332Sbostic {
66532332Sbostic 	int i, n;
66632332Sbostic 
66732332Sbostic 	n = strlen(t);
66832332Sbostic 	for (i = 0; s[i] != '\0'; i++)
66932332Sbostic 		if (strncmp(s + i, t, n) == 0)
67032332Sbostic 			return (i);
67132332Sbostic 	return (-1);
67232332Sbostic }
67332332Sbostic 
67432332Sbostic char *
67532332Sbostic grepxlat(pattern)		/* grep pattern meta conversion */
67632332Sbostic 	char *pattern;
67732332Sbostic {
67832332Sbostic 	register char *p, *s;
67932332Sbostic 	static char newpat[BUFSIZE];
68032332Sbostic 
68132332Sbostic 	for (s = newpat, p = pattern; *p != EOS;) {
68232332Sbostic 		if (*p == '\\') {	/* skip escapes ... */
68332332Sbostic 			*s++ = *p++;
68432332Sbostic 			if (*p)
68532332Sbostic 				*s++ = *p++;
68632332Sbostic 		} else if (*p == '[') {	/* ... and char classes */
68732332Sbostic 			while (*p != EOS && *p != ']')
68832332Sbostic 				*s++ = *p++;
68932332Sbostic 		} else if (strchr("+?|()", *p) != NULL) {
69032332Sbostic 			*s++ = '\\';	/* insert protection */
69132332Sbostic 			*s++ = *p++;
69232332Sbostic 		} else
69332332Sbostic 			*s++ = *p++;
69432332Sbostic 	}
69532332Sbostic 	*s = EOS;
69632332Sbostic 	grepflag = ((patind) ? 0 : 1);
69732332Sbostic 	return (newpat);
69832332Sbostic }
69932332Sbostic 
70032332Sbostic /*
70132332Sbostic  * Test for simple alternation.  Result is NULL if it's not so simple, or is
70232332Sbostic  * a pointer to the first string if it is. Warning:  sscanf size is a
70332332Sbostic  * fixpoint, beyond which the speedup linearity starts to break down.  In the
70432332Sbostic  * wake of the elegant aho/corrasick "trie"-based fgrep, generalizing
70532332Sbostic  * altpat[] to arbitrary size is not useful.
70632332Sbostic  */
70732332Sbostic char *
70832332Sbostic alternate(regexpr)
70932332Sbostic 	char *regexpr;
71032332Sbostic {
71132339Sbostic 	register int i, j;
71232339Sbostic 	register char *start, *stop;
71332332Sbostic 	unsigned char c;
71432332Sbostic 
71532332Sbostic 	if (fgrepflag && strchr(regexpr, '|'))
71632339Sbostic 			return (NULL);
71732332Sbostic 
71832339Sbostic 	/*
71932339Sbostic 	 * break pattern up into altpat array; delimit on newline, bar,
72032339Sbostic 	 * or EOS.  We know we won't overflow, we've already checked the
72132339Sbostic 	 * number of patterns we're going to find against NALT.
72232339Sbostic 	 * Also, set length of pattern and find minimum pattern length.
72332339Sbostic 	 */
72432339Sbostic 	nalt = 0;
72532339Sbostic 	altmin = NMUSH;
72632339Sbostic 	for (start = stop = regexpr;; ++stop)
72732339Sbostic 		if (!*stop || *stop == '|' || *stop == NL) {
72832339Sbostic 			altlen[nalt] = j = stop - start;
72932339Sbostic 			if (j < altmin)
73032339Sbostic 				altmin = j;
73132339Sbostic 			if (!(altpat[nalt] = malloc((u_int)(j + 1))))
73232339Sbostic 				oops("out of memory");
73332339Sbostic 			bcopy(start, altpat[nalt], j);
73432339Sbostic 			altpat[nalt][j] = EOS;
73532362Sbostic 			++nalt;
73632339Sbostic 			if (!*stop)
73732339Sbostic 				break;
73832362Sbostic 			if (nalt == NALT)
73932339Sbostic 				return(NULL);
74032339Sbostic 			if (*stop == NL)
74132339Sbostic 				*stop = '|';
74232339Sbostic 			start = stop + 1;
74332339Sbostic 		}
74432332Sbostic 	if (!fgrepflag) {
74532332Sbostic 		if (strchr(regexpr, '|') == NULL || regexpr[0] == '|')
74632332Sbostic 			return (NULL);
74732332Sbostic 		if (strpbrk(regexpr, "^$.[]()?+*\\") != NULL
74832332Sbostic 		    || strindex(regexpr, "||") >= 0)
74932332Sbostic 			return (NULL);
75032332Sbostic 	}
75132332Sbostic 
75232332Sbostic 	if (nalt > 1) {		/* build superimposed "pre-match" sets per
75332332Sbostic 				 * char */
75432332Sbostic 		altflag++;
75532332Sbostic 		for (j = 0; j < nalt; j++)
75632332Sbostic 			for (i = 0; i < altmin; i++) {
75732332Sbostic 				c = altpat[j][altlen[j] - altmin + i];
75832332Sbostic 				altset[i + 1][c] = 1;	/* offset for sentinel */
75932332Sbostic 			}
76032332Sbostic 	}
76132332Sbostic 	return (altpat[0]);
76232332Sbostic }
76332332Sbostic 
76432332Sbostic /*
76532332Sbostic  * Grapple with the dfa (std egrep) vs. ndfa (regexp) tradeoff. Criteria to
76632332Sbostic  * determine whether to use dfa-based egrep:  We do FIRSTFEW matches with
76732332Sbostic  * regexec().  If Boyer-Moore up to now matched more than PUNTPERCENT
76832332Sbostic  * of the input, the r.e. is likely to be underspecified, so do old *grep,
76932332Sbostic  * which is faster on complex patterns than regexp().  At FIRSTFEW,
77032332Sbostic  * dump the saved matches collected by savematch(). They are saved
77132332Sbostic  * so that a "PUNT" can "rewind" to ignore them.  Stdin is problematic,
77232332Sbostic  * since it's hard to rewind.
77332332Sbostic  */
77432332Sbostic 
77532332Sbostic execstrategy(file)
77632332Sbostic 	char *file;
77732332Sbostic {
77832332Sbostic 	int pctmatch;
77932332Sbostic 
78032332Sbostic 	pctmatch = (100 * rxcount) / incount;
78132332Sbostic 	if (pctmatch > PUNTPERCENT && file != NULL)
78232332Sbostic 		kernighan(args);
78332332Sbostic 	if (file != NULL)
78432332Sbostic 		flushmatches();
78532332Sbostic }
78632332Sbostic 
78732332Sbostic nlcount(bstart, bstop)		/* flail interval to totalize newlines. */
78832332Sbostic 	char *bstart, *bstop;
78932332Sbostic {
79032332Sbostic 	register char *s = bstart;
79132332Sbostic 	register char *t = bstop;
79232332Sbostic 	register int count = 0;
79332332Sbostic 
79432332Sbostic 	do {			/* loop unroll for older architectures */
79532332Sbostic 		if (*t == NL)	/* ... ask ames!jaw for sample code */
79632332Sbostic 			count++;
79732332Sbostic 	} while (t-- > s);
79832332Sbostic 
79932332Sbostic 	return (count);
80032332Sbostic }
80132332Sbostic 
80232332Sbostic char *
80332332Sbostic isolate(regexpr)		/* isolate longest metacharacter-free string */
80432332Sbostic 	char *regexpr;
80532332Sbostic {
80632332Sbostic 	char *dummyexpr;
80732332Sbostic 
80832332Sbostic 	/*
80932332Sbostic 	 * We add (.)* because Henry's regcomp only figures regmust if it
81032332Sbostic 	 * sees a leading * pattern.  Foo!
81132332Sbostic 	 */
81232332Sbostic 	dummyexpr = malloc((unsigned) strlen(regexpr) + 5);
81332502Sbostic 	(void)sprintf(dummyexpr, "(.)*%s", regexpr);
81432332Sbostic 	if ((rspencer = regcomp(dummyexpr)) == NULL)
81532332Sbostic 		kernighan(args);
81632332Sbostic 	return (rspencer->regmust);
81732332Sbostic }
81832332Sbostic 
81932332Sbostic char *matches[FIRSTFEW];
82032332Sbostic static int mcount = 0;
82132332Sbostic 
82232332Sbostic char *
82332332Sbostic savematch(s)			/* horde matches during statistics gathering */
82432332Sbostic 	register char *s;
82532332Sbostic {
82632332Sbostic 	char *p;
82732332Sbostic 	char *start = s;
82832332Sbostic 	int msize = 0;
82932332Sbostic 	int psize = strlen(preamble);
83032332Sbostic 
83132332Sbostic 	while (*s++ != NL)
83232332Sbostic 		msize++;
83332332Sbostic 	*--s = EOS;
83432332Sbostic 
83532332Sbostic 	p = malloc((unsigned) msize + 1 + psize);
83632332Sbostic 	strcpy(p, preamble);
83732332Sbostic 	strcpy(p + psize, start);
83832332Sbostic 	matches[mcount++] = p;
83932332Sbostic 
84032332Sbostic 	preamble[0] = 0;
84132332Sbostic 	*s = NL;
84232332Sbostic 	return (s);
84332332Sbostic }
84432332Sbostic 
84532332Sbostic flushmatches()
84632332Sbostic {
84732332Sbostic 	int n;
84832332Sbostic 
84932332Sbostic 	flushflag = 1;
85032332Sbostic 	for (n = 0; n < mcount; n++)
85132332Sbostic 		printf("%s\n", matches[n]);
85232332Sbostic 	mcount = 0;
85332332Sbostic }
85432332Sbostic 
85532332Sbostic oops(message)
85632332Sbostic 	char *message;
85732332Sbostic {
85832332Sbostic 	fprintf(stderr, "%s: %s\n", progname, message);
85932332Sbostic 	exit(2);
86032332Sbostic }
86132332Sbostic 
86232332Sbostic kernighan(args)			/* "let others do the hard part ..." */
86332332Sbostic 	char *args[];
86432332Sbostic {
86532332Sbostic 	/*
86632332Sbostic 	 * We may have already run grep on some of the files; remove them
86732332Sbostic 	 * from the arg list we pass on.  Note that we can't delete them
86832332Sbostic 	 * totally because the number of file names affects the output
86932332Sbostic 	 * (automatic -h).
87032332Sbostic 	 */
87132332Sbostic 	/* better would be fork/exec per punted file -- jaw */
87232332Sbostic 
87332332Sbostic 	while (firstfile && optind > firstfile)
87437914Sbostic 		args[firstfile++] = _PATH_DEVNULL;
87532332Sbostic 	if (patind)
87632332Sbostic 		args[patind] = pattern;
87732339Sbostic 	(void) fflush(stdout);
87832332Sbostic 
87932332Sbostic 	if (grepflag)
88037914Sbostic 		execvp(_PATH_GREPSTD, args), oops("can't exec old 'grep'");
88132332Sbostic 	else if (fgrepflag)
88237914Sbostic 		execvp(_PATH_FGREPSTD, args), oops("can't exec old 'fgrep'");
88332332Sbostic 	else
88437914Sbostic 		execvp(_PATH_EGREPSTD, args), oops("can't exec old 'egrep'");
88532332Sbostic }
886