xref: /csrg-svn/usr.bin/grep/egrep/egrep.c (revision 32332)
1*32332Sbostic #ifndef lint
2*32332Sbostic static char sccsid[] = "@(#)egrep.c	5.1 (Berkeley) 10/03/87";
3*32332Sbostic #endif not lint
4*32332Sbostic 
5*32332Sbostic /*
6*32332Sbostic      Hybrid Boyer/Moore/Gosper-assisted 'grep/egrep/fgrep' search, with delta0
7*32332Sbostic      table as in original paper (CACM, October, 1977).  No delta1 or delta2.
8*32332Sbostic      According to experiment (Horspool, Soft. Prac. Exp., 1982), delta2 is of
9*32332Sbostic      minimal practical value.  However, to improve for worst case input,
10*32332Sbostic      integrating the improved Galil strategies (Apostolico/Giancarlo, SIAM. J.
11*32332Sbostic      Comput., Feb. 1986) deserves consideration.
12*32332Sbostic 
13*32332Sbostic      Method: 	extract longest metacharacter-free string from expression.
14*32332Sbostic 		this is done using a side-effect from henry spencer's regcomp().
15*32332Sbostic 		use boyer-moore to match such, then pass submatching lines
16*32332Sbostic 		to either regexp() or standard 'egrep', depending on certain
17*32332Sbostic 		criteria within execstrategy() below.  [this tradeoff is due
18*32332Sbostic 		to the general slowness of the regexp() nondeterministic
19*32332Sbostic 		machine on complex expressions, as well as the startup time
20*32332Sbostic 		of standard 'egrep' on short files.]  alternatively, one may
21*32332Sbostic 		change the vendor-supplied 'egrep' automaton to include
22*32332Sbostic 		boyer-moore directly.  see accompanying writeup for discussion
23*32332Sbostic 		of kanji expression treatment.
24*32332Sbostic 
25*32332Sbostic 		late addition:  apply trickbag for fast match of simple
26*32332Sbostic 		alternations (sublinear, in common low-cardinality cases).
27*32332Sbostic 		trap fgrep into this lair.
28*32332Sbostic 
29*32332Sbostic 		gnu additions:  -f, newline as |, \< and \> [in regexec()], more
30*32332Sbostic 				comments.  inspire better dfa exec() strategy.
31*32332Sbostic 				serious testing and help with special cases.
32*32332Sbostic 
33*32332Sbostic      Algorithm amalgam summary:
34*32332Sbostic 
35*32332Sbostic 		dfa e?grep 		(aho/thompson)
36*32332Sbostic 		ndfa regexp() 		(spencer/aho)
37*32332Sbostic 		bmg			(boyer/moore/gosper)
38*32332Sbostic 		"superimposed" bmg   	(jaw)
39*32332Sbostic 		fgrep			(aho/corrasick)
40*32332Sbostic 
41*32332Sbostic 		sorry, but the knuth/morris/pratt machine, horspool's
42*32332Sbostic 		"frequentist" code, and the rabin/karp matcher, however cute,
43*32332Sbostic 		just don't cut it for this production.
44*32332Sbostic 
45*32332Sbostic      James A. Woods				Copyright (c) 1986
46*32332Sbostic      NASA Ames Research Center
47*32332Sbostic */
48*32332Sbostic #include <stdio.h>
49*32332Sbostic #include <ctype.h>
50*32332Sbostic #include <sys/types.h>
51*32332Sbostic #include <sys/stat.h>
52*32332Sbostic #include <regexp.h>		/* must be henry spencer's version */
53*32332Sbostic 
54*32332Sbostic #define	MIN(A, B)	((A) > (B) ? (B) : (A))
55*32332Sbostic 
56*32332Sbostic #ifdef	SLOWSYS
57*32332Sbostic #define read	xread
58*32332Sbostic #endif
59*32332Sbostic /*
60*32332Sbostic  * define existing [ef]?grep program locations below for use by execvp().
61*32332Sbostic  * [execlp() would be used were it not for the possibility of
62*32332Sbostic  * installation-dependent recursion.]
63*32332Sbostic  */
64*32332Sbostic #ifndef EGREPSTD
65*32332Sbostic #define	EGREPSTD	"/usr/bin/egrep"
66*32332Sbostic #endif
67*32332Sbostic #ifndef GREPSTD
68*32332Sbostic #define	GREPSTD		"/bin/grep"
69*32332Sbostic #endif
70*32332Sbostic #ifndef FGREPSTD
71*32332Sbostic #define	FGREPSTD	"/usr/bin/fgrep"
72*32332Sbostic #endif
73*32332Sbostic 
74*32332Sbostic #define BUFSIZE	8192		/* make higher for cray */
75*32332Sbostic #define PATSIZE 6000
76*32332Sbostic #define LARGE 	BUFSIZE + PATSIZE
77*32332Sbostic 
78*32332Sbostic #define ALTSIZE	100		/* generous? */
79*32332Sbostic #define NALT	7		/* tied to scanf() size in alternate() */
80*32332Sbostic #define NMUSH	6		/* loosely relates to expected alt length */
81*32332Sbostic 
82*32332Sbostic #define	FIRSTFEW	33 	/* Always do FIRSTFEW matches with regexec() */
83*32332Sbostic #define	PUNTPERCENT	10	/* After FIRSTFEW, if PUNTPERCENT of the input
84*32332Sbostic 				 * was processed by regexp(), exec std egrep. */
85*32332Sbostic #define NL	'\n'
86*32332Sbostic #define	EOS	'\0'
87*32332Sbostic #define	NONASCII	0200	/* Bit mask for Kanji non-ascii chars */
88*32332Sbostic #define META	"\n^$.[]()?+*|\\"	/* egrep meta-characters */
89*32332Sbostic #define SS2	'\216'		/* EUC Katakana (or Chinese2) prefix */
90*32332Sbostic #define SS3	'\217'		/* EUC Kanji2 (or Chinese3) prefix */
91*32332Sbostic 
92*32332Sbostic extern char *optarg;
93*32332Sbostic extern int optind;
94*32332Sbostic char *progname;
95*32332Sbostic 
96*32332Sbostic int cflag, iflag, eflag, fflag, lflag, nflag;	/* SVID flags */
97*32332Sbostic int sflag, hflag;		/* v7, v8, bsd */
98*32332Sbostic 
99*32332Sbostic int firstflag;			/* Stop at first match */
100*32332Sbostic int grepflag;			/* Called as "grep" */
101*32332Sbostic int fgrepflag;			/* Called as "fgrep" */
102*32332Sbostic int altflag;			/* Simple alternation in pattern */
103*32332Sbostic int boyonly;			/* No regexp needed -- all simple */
104*32332Sbostic int flushflag;
105*32332Sbostic int grepold, egrepold, fgrepold;
106*32332Sbostic 
107*32332Sbostic int nalt;			/* Number of alternatives */
108*32332Sbostic int nsuccess;			/* 1 for match, 2 for error */
109*32332Sbostic int altmin;			/* Minimum length of all the alternate
110*32332Sbostic 				 * strings */
111*32332Sbostic int firstfile;			/* argv index of first file argument */
112*32332Sbostic int patind;			/* argv index of pattern */
113*32332Sbostic long nmatch;			/* Number of matches in this file */
114*32332Sbostic long incount, counted;		/* Amount of input consumed */
115*32332Sbostic long rxcount;			/* Bytes of input processed by regexec() */
116*32332Sbostic int boyfound;			/* accumulated partial matches (tripped by
117*32332Sbostic 				 * FIRSTFEW) */
118*32332Sbostic int prevmatch;			/* next three lines aid fast -n */
119*32332Sbostic long nline, prevnline;
120*32332Sbostic char *prevloc;
121*32332Sbostic 
122*32332Sbostic regexp *rspencer;
123*32332Sbostic char *pattern;
124*32332Sbostic char *patboy;			/* Pattern for simple Boyer-Moore */
125*32332Sbostic char *patfile;			/* Filename containing pattern(s) */
126*32332Sbostic 
127*32332Sbostic int delta0[256];		/* Boyer-Moore algorithm core */
128*32332Sbostic char cmap[256];			/* Usually 0-255, but if -i, maps upper to
129*32332Sbostic 				 * lower case */
130*32332Sbostic char str[BUFSIZE + 2];
131*32332Sbostic int nleftover;
132*32332Sbostic char linetemp[BUFSIZE];
133*32332Sbostic char altpat[NALT][ALTSIZE];	/* alternation component storage */
134*32332Sbostic int altlen[NALT];
135*32332Sbostic short altset[NMUSH + 1][256];
136*32332Sbostic char preamble[200];		/* match prefix (filename, line no.) */
137*32332Sbostic 
138*32332Sbostic int fd;
139*32332Sbostic char *
140*32332Sbostic strchr(), *strrchr(), *strcpy(), *strncpy(), *strpbrk(), *sprintf(), *malloc();
141*32332Sbostic char *
142*32332Sbostic grepxlat(), *fold(), *pfile(), *alternate(), *isolate();
143*32332Sbostic char **args;
144*32332Sbostic 
145*32332Sbostic main(argc, argv)
146*32332Sbostic 	int argc;
147*32332Sbostic 	char *argv[];
148*32332Sbostic {
149*32332Sbostic 	int c;
150*32332Sbostic 	int errflag = 0;
151*32332Sbostic 
152*32332Sbostic 	args = argv;
153*32332Sbostic 
154*32332Sbostic 	if ((progname = strrchr(argv[0], '/')) != 0)
155*32332Sbostic 		progname++;
156*32332Sbostic 	else
157*32332Sbostic 		progname = argv[0];
158*32332Sbostic 	if (strcmp(progname, "grep") == 0)
159*32332Sbostic 		grepflag++;
160*32332Sbostic 	if (strcmp(progname, "fgrep") == 0)
161*32332Sbostic 		fgrepflag++;
162*32332Sbostic 
163*32332Sbostic 	while ((c = getopt(argc, argv, "bchie:f:lnsvwxy1")) != EOF) {
164*32332Sbostic 		switch (c) {
165*32332Sbostic 
166*32332Sbostic 		case 'f':
167*32332Sbostic 			fflag++;
168*32332Sbostic 			patfile = optarg;
169*32332Sbostic 			continue;
170*32332Sbostic 		case 'b':
171*32332Sbostic 		case 'v':
172*32332Sbostic 			egrepold++;	/* boyer-moore of little help here */
173*32332Sbostic 			continue;
174*32332Sbostic 		case 'c':
175*32332Sbostic 			cflag++;
176*32332Sbostic 			continue;
177*32332Sbostic 		case 'e':
178*32332Sbostic 			eflag++;
179*32332Sbostic 			pattern = optarg;
180*32332Sbostic 			continue;
181*32332Sbostic 		case 'h':
182*32332Sbostic 			hflag++;
183*32332Sbostic 			continue;
184*32332Sbostic 		case '1':	/* Stop at very first match */
185*32332Sbostic 			firstflag++;	/* spead freaks only */
186*32332Sbostic 			continue;
187*32332Sbostic 		case 'i':
188*32332Sbostic 			iflag++;
189*32332Sbostic 			continue;
190*32332Sbostic 		case 'l':
191*32332Sbostic 			lflag++;
192*32332Sbostic 			continue;
193*32332Sbostic 		case 'n':
194*32332Sbostic 			nflag++;
195*32332Sbostic 			continue;
196*32332Sbostic 		case 's':
197*32332Sbostic 			sflag++;
198*32332Sbostic 			continue;
199*32332Sbostic 		case 'w':
200*32332Sbostic 		case 'y':
201*32332Sbostic 			if (!grepflag)
202*32332Sbostic 				errflag++;
203*32332Sbostic 			grepold++;
204*32332Sbostic 			continue;
205*32332Sbostic 		case 'x':	/* needs more work, like -b above */
206*32332Sbostic 			if (!fgrepflag)
207*32332Sbostic 				errflag++;
208*32332Sbostic 			fgrepold++;
209*32332Sbostic 			continue;
210*32332Sbostic 		case '?':
211*32332Sbostic 			errflag++;
212*32332Sbostic 		}
213*32332Sbostic 	}
214*32332Sbostic 	if (errflag || ((argc <= optind) && !fflag && !eflag)) {
215*32332Sbostic 		if (grepflag)
216*32332Sbostic oops("usage: grep [-bcihlnsvwy] [-e] pattern [file ...]");
217*32332Sbostic 		else if (fgrepflag)
218*32332Sbostic oops("usage: fgrep [-bcilnv] {-f patfile | [-e] strings} [file ...]");
219*32332Sbostic 		else		/* encourage SVID options, though we provide
220*32332Sbostic 				 * others */
221*32332Sbostic oops("usage: egrep [-bcilnv] {-f patfile | [-e] pattern} [file ...]");
222*32332Sbostic 	}
223*32332Sbostic 	patind = optind;
224*32332Sbostic 	if (fflag)
225*32332Sbostic 		pattern = pfile(patfile), patind = 0;
226*32332Sbostic 	else if (!eflag)
227*32332Sbostic 		pattern = argv[optind++];
228*32332Sbostic 
229*32332Sbostic 	if ((argc - optind) <= 1)	/* Filename invisible given < 2 files */
230*32332Sbostic 		hflag++;
231*32332Sbostic 	if (pattern[0] == EOS)
232*32332Sbostic 		kernighan(argv);	/* same as it ever was */
233*32332Sbostic 	/*
234*32332Sbostic 	 * 'grep/egrep' merger -- "old" grep is called to handle: tagged
235*32332Sbostic 	 * exprs \( \), word matches \< and \>, -w and -y options, char
236*32332Sbostic 	 * classes with '-' at end (egrep bug?), and patterns beginning with
237*32332Sbostic 	 * an asterisk (don't ask why). otherwise, characters meaningful to
238*32332Sbostic 	 * 'egrep' but not to 'grep' are escaped; the entire expr is then
239*32332Sbostic 	 * passed to 'egrep'.
240*32332Sbostic 	 */
241*32332Sbostic 	if (grepflag && !grepold) {
242*32332Sbostic 		if (strindex(pattern, "\\(") >= 0 ||
243*32332Sbostic 		    strindex(pattern, "\\<") >= 0 ||
244*32332Sbostic 		    strindex(pattern, "\\>") >= 0 ||
245*32332Sbostic 		    strindex(pattern, "-]") >= 0 ||
246*32332Sbostic 		    pattern[0] == '*')	/* grep bug */
247*32332Sbostic 			grepold++;
248*32332Sbostic 		else
249*32332Sbostic 			pattern = grepxlat(pattern);
250*32332Sbostic 	}
251*32332Sbostic 	if (grepold || egrepold || fgrepold)
252*32332Sbostic 		kernighan(argv);
253*32332Sbostic 
254*32332Sbostic 	if (iflag)
255*32332Sbostic 		strcpy(pattern, fold(pattern));
256*32332Sbostic 	/*
257*32332Sbostic 	 * If the pattern is a plain string, just run boyer-moore. If it
258*32332Sbostic 	 * consists of meta-free alternatives, run "superimposed" bmg.
259*32332Sbostic 	 * Otherwise, find best string, and compile pattern for regexec().
260*32332Sbostic 	 */
261*32332Sbostic 	if (strpbrk(pattern, META) == NULL) {	/* do boyer-moore only */
262*32332Sbostic 		boyonly++;
263*32332Sbostic 		patboy = pattern;
264*32332Sbostic 	} else {
265*32332Sbostic 		if ((patboy = alternate(pattern)) != NULL)
266*32332Sbostic 			boyonly++;
267*32332Sbostic 		else {
268*32332Sbostic 			if ((patboy = isolate(pattern)) == NULL)
269*32332Sbostic 				kernighan(argv);	/* expr too involved */
270*32332Sbostic #ifndef NOKANJI
271*32332Sbostic 			for (c = 0; pattern[c] != EOS; c++)
272*32332Sbostic 				if (pattern[c] & NONASCII)	/* kanji + meta */
273*32332Sbostic 					kernighan(argv);
274*32332Sbostic #endif
275*32332Sbostic 			if ((rspencer = regcomp(pattern)) == NULL)
276*32332Sbostic 				oops("regcomp failure");
277*32332Sbostic 		}
278*32332Sbostic 	}
279*32332Sbostic 	gosper(patboy);		/* "pre-conditioning is wonderful"
280*32332Sbostic 				 * -- v. strassen */
281*32332Sbostic 
282*32332Sbostic 	if ((firstfile = optind) >= argc) {
283*32332Sbostic 		/* Grep standard input */
284*32332Sbostic 		if (lflag)	/* We don't know its name! */
285*32332Sbostic 			exit(1);
286*32332Sbostic 		egsecute((char *) NULL);
287*32332Sbostic 	} else {
288*32332Sbostic 		while (optind < argc) {
289*32332Sbostic 			egsecute(argv[optind]);
290*32332Sbostic 			optind++;
291*32332Sbostic 			if (firstflag && (nsuccess == 1))
292*32332Sbostic 				break;
293*32332Sbostic 		}
294*32332Sbostic 	}
295*32332Sbostic 	exit((nsuccess == 2) ? 2 : (nsuccess == 0));
296*32332Sbostic }
297*32332Sbostic 
298*32332Sbostic char *
299*32332Sbostic pfile(pfname)			/* absorb expression from file */
300*32332Sbostic 	char *pfname;
301*32332Sbostic {
302*32332Sbostic 	FILE *pf;
303*32332Sbostic 	struct stat patstat;
304*32332Sbostic 	static char *pat;
305*32332Sbostic 
306*32332Sbostic 	if ((pf = fopen(pfname, "r")) == NULL)
307*32332Sbostic 		oops("can't read pattern file");
308*32332Sbostic 	if (fstat(fileno(pf), &patstat) != 0)
309*32332Sbostic 		oops("can't stat pattern file");
310*32332Sbostic 	if (patstat.st_size > PATSIZE) {
311*32332Sbostic 		if (fgrepflag) {	/* defer to unix version */
312*32332Sbostic 			fgrepold++;
313*32332Sbostic 			return "dummy";
314*32332Sbostic 		} else
315*32332Sbostic 			oops("pattern file too big");
316*32332Sbostic 	}
317*32332Sbostic 	if ((pat = malloc((unsigned) patstat.st_size + 1)) == NULL)
318*32332Sbostic 		oops("out of memory to read pattern file");
319*32332Sbostic 	if (patstat.st_size !=
320*32332Sbostic 	    fread(pat, sizeof(char), (int) patstat.st_size + 1, pf))
321*32332Sbostic 		oops("error reading pattern file");
322*32332Sbostic 	(void) fclose(pf);
323*32332Sbostic 
324*32332Sbostic 	pat[patstat.st_size] = EOS;
325*32332Sbostic 	if (pat[patstat.st_size - 1] == NL)	/* NOP for egrep; helps grep */
326*32332Sbostic 		pat[patstat.st_size - 1] = EOS;
327*32332Sbostic 
328*32332Sbostic 	if (nlcount(pat, &pat[patstat.st_size]) > NALT) {
329*32332Sbostic 		if (fgrepflag)
330*32332Sbostic 			fgrepold++;	/* "what's it all about, alfie?" */
331*32332Sbostic 		else
332*32332Sbostic 			egrepold++;
333*32332Sbostic 	}
334*32332Sbostic 	return (pat);
335*32332Sbostic }
336*32332Sbostic 
337*32332Sbostic egsecute(file)
338*32332Sbostic 	char *file;
339*32332Sbostic {
340*32332Sbostic 	if (file == NULL)
341*32332Sbostic 		fd = 0;
342*32332Sbostic 	else if ((fd = open(file, 0)) <= 0) {
343*32332Sbostic 		fprintf(stderr, "%s: can't open %s\n", progname, file);
344*32332Sbostic 		nsuccess = 2;
345*32332Sbostic 		return;
346*32332Sbostic 	}
347*32332Sbostic 	chimaera(file, patboy);
348*32332Sbostic 
349*32332Sbostic 	if (!boyonly && !flushflag && file != NULL)
350*32332Sbostic 		flushmatches();
351*32332Sbostic 	if (file != NULL)
352*32332Sbostic 		close(fd);
353*32332Sbostic }
354*32332Sbostic 
355*32332Sbostic chimaera(file, pat)		/* "reach out and boyer-moore search someone" */
356*32332Sbostic 	char *file, *pat;	/* -- soon-to-be-popular bumper sticker */
357*32332Sbostic {
358*32332Sbostic 	register char *k, *strend, *s;
359*32332Sbostic 	register int j, count;
360*32332Sbostic 	register int *deltazero = delta0;
361*32332Sbostic 	int patlen = altmin;
362*32332Sbostic 	char *t;
363*32332Sbostic 	char *gotamatch(), *kanji(), *linesave(), *submatch();
364*32332Sbostic 
365*32332Sbostic 	nleftover = boyfound = flushflag = 0;
366*32332Sbostic 	nline = 1L;
367*32332Sbostic 	prevmatch = 0;
368*32332Sbostic 	nmatch = counted = rxcount = 0L;
369*32332Sbostic 
370*32332Sbostic 	while ((count = read(fd, str + nleftover, BUFSIZE - nleftover)) > 0) {
371*32332Sbostic 
372*32332Sbostic 		counted += count;
373*32332Sbostic 		strend = linesave(str, count);
374*32332Sbostic 
375*32332Sbostic 		for (k = str + patlen - 1; k < strend;) {
376*32332Sbostic 			/*
377*32332Sbostic 			 * for a large class of patterns, upwards of 80% of
378*32332Sbostic 			 * match time is spent on the next line.  we beat
379*32332Sbostic 			 * existing microcode (vax 'matchc') this way.
380*32332Sbostic 			 */
381*32332Sbostic 			while ((k += deltazero[*(unsigned char *) k]) < strend);
382*32332Sbostic 			if (k < (str + LARGE))
383*32332Sbostic 				break;
384*32332Sbostic 			k -= LARGE;
385*32332Sbostic 
386*32332Sbostic 			if (altflag) {
387*32332Sbostic 				/*
388*32332Sbostic 				 * Parallel Boyer-Moore.  Check whether each
389*32332Sbostic 				 * of the previous <altmin> chars COULD be
390*32332Sbostic 				 * from one of the alternative strings.
391*32332Sbostic 				 */
392*32332Sbostic 				s = k - 1;
393*32332Sbostic 				j = altmin;
394*32332Sbostic 				while (altset[--j][(unsigned char)
395*32332Sbostic 					     cmap[*(unsigned char *) s--]]);
396*32332Sbostic 				/*
397*32332Sbostic 				 * quick test fails. in this life, compare
398*32332Sbostic 				 * 'em all.  but, a "reverse trie" would
399*32332Sbostic 				 * attenuate worst case (linear w/delta2?).
400*32332Sbostic 				 */
401*32332Sbostic 				if (--j < 0) {
402*32332Sbostic 					count = nalt - 1;
403*32332Sbostic 					do {
404*32332Sbostic 						s = k;
405*32332Sbostic 						j = altlen[count];
406*32332Sbostic 						t = altpat[count];
407*32332Sbostic 
408*32332Sbostic 						while
409*32332Sbostic 							(cmap[*(unsigned char *) s--]
410*32332Sbostic 							 == t[--j]);
411*32332Sbostic 						if (j < 0)
412*32332Sbostic 							break;
413*32332Sbostic 					}
414*32332Sbostic 					while (count--);
415*32332Sbostic 				}
416*32332Sbostic 			} else {
417*32332Sbostic 				/* One string -- check it */
418*32332Sbostic 				j = patlen - 1;
419*32332Sbostic 				s = k - 1;
420*32332Sbostic 				while (cmap[*(unsigned char *) s--] == pat[--j]);
421*32332Sbostic 			}
422*32332Sbostic 			/*
423*32332Sbostic 			 * delta-less shortcut for literati. short shrift for
424*32332Sbostic 			 * genetic engineers?
425*32332Sbostic 			 */
426*32332Sbostic 			if (j >= 0) {
427*32332Sbostic 				k++;	/* no match; restart next char */
428*32332Sbostic 				continue;
429*32332Sbostic 			}
430*32332Sbostic 			k = submatch(file, pat, str, strend, k, count);
431*32332Sbostic 			if (k == NULL)
432*32332Sbostic 				return;
433*32332Sbostic 		}
434*32332Sbostic 		if (nflag) {
435*32332Sbostic 			if (prevmatch)
436*32332Sbostic 				nline = prevnline + nlcount(prevloc, k);
437*32332Sbostic 			else
438*32332Sbostic 				nline = nline + nlcount(str, k);
439*32332Sbostic 			prevmatch = 0;
440*32332Sbostic 		}
441*32332Sbostic 		strncpy(str, linetemp, nleftover);
442*32332Sbostic 	}
443*32332Sbostic 	if (cflag) {
444*32332Sbostic 		/* Bug from old grep: -c overrides -h.  We fix the bug. */
445*32332Sbostic 		if (!hflag)
446*32332Sbostic 			printf("%s:", file);
447*32332Sbostic 		printf("%ld\n", nmatch);
448*32332Sbostic 	}
449*32332Sbostic }
450*32332Sbostic 
451*32332Sbostic char *
452*32332Sbostic linesave(str, count)		/* accumulate partial line at end of buffer */
453*32332Sbostic 	char str[];
454*32332Sbostic 	register int count;
455*32332Sbostic {
456*32332Sbostic 	register int j;
457*32332Sbostic 
458*32332Sbostic 	count += nleftover;
459*32332Sbostic 	if (count != BUFSIZE && fd != 0)
460*32332Sbostic 		str[count++] = NL;	/* insurance for broken last line */
461*32332Sbostic 	str[count] = EOS;
462*32332Sbostic 	for (j = count - 1; str[j] != NL && j >= 0;)
463*32332Sbostic 		j--;
464*32332Sbostic 	/*
465*32332Sbostic 	 * break up these lines: long line (> BUFSIZE), last line of file, or
466*32332Sbostic 	 * short return from read(), as from tee(1) input
467*32332Sbostic 	 */
468*32332Sbostic 	if (j < 0 && (count == (BUFSIZE - nleftover))) {
469*32332Sbostic 		str[count++] = NL;
470*32332Sbostic 		str[count] = EOS;
471*32332Sbostic 		linetemp[0] = EOS;
472*32332Sbostic 		nleftover = 0;
473*32332Sbostic 		return (str + count);
474*32332Sbostic 	} else {
475*32332Sbostic 		nleftover = count - j - 1;
476*32332Sbostic 		strncpy(linetemp, str + j + 1, nleftover);
477*32332Sbostic 		return (str + j);
478*32332Sbostic 	}
479*32332Sbostic }
480*32332Sbostic 
481*32332Sbostic /*
482*32332Sbostic  * Process partial match. First check for mis-aligned Kanji, then match line
483*32332Sbostic  * against full compiled r.e. if statistics do not warrant handing off to
484*32332Sbostic  * standard egrep.
485*32332Sbostic  */
486*32332Sbostic char *
487*32332Sbostic submatch(file, pat, str, strend, k, altindex)
488*32332Sbostic 	char file[], pat[], str[];
489*32332Sbostic 	register char *strend, *k;
490*32332Sbostic 	int altindex;
491*32332Sbostic {
492*32332Sbostic 	register char *s;
493*32332Sbostic 	char *t, c;
494*32332Sbostic 
495*32332Sbostic 	t = k;
496*32332Sbostic 	s = ((altflag) ? k - altlen[altindex] + 1 : k - altmin + 1);
497*32332Sbostic #ifndef NOKANJI
498*32332Sbostic 	c = ((altflag) ? altpat[altindex][0] : pat[0]);
499*32332Sbostic 	if (c & NONASCII)
500*32332Sbostic 		if ((s = kanji(str, s, k)) == NULL)
501*32332Sbostic 			return (++k);	/* reject false kanji */
502*32332Sbostic #endif
503*32332Sbostic 	do;
504*32332Sbostic 	while (*s != NL && --s >= str);
505*32332Sbostic 	k = s + 1;		/* now at line start */
506*32332Sbostic 
507*32332Sbostic 	if (boyonly)
508*32332Sbostic 		return (gotamatch(file, k));
509*32332Sbostic 
510*32332Sbostic 	incount = counted - (strend - k);
511*32332Sbostic 	if (boyfound++ == FIRSTFEW)
512*32332Sbostic 		execstrategy(file);
513*32332Sbostic 
514*32332Sbostic 	s = t;
515*32332Sbostic 	do
516*32332Sbostic 		rxcount++;
517*32332Sbostic 	while (*s++ != NL);
518*32332Sbostic 	*--s = EOS;
519*32332Sbostic 	/*
520*32332Sbostic 	 * "quick henry -- the flit" (after theodor geisel)
521*32332Sbostic 	 */
522*32332Sbostic 	if (regexec(rspencer, ((iflag) ? fold(k) : k)) == 1) {
523*32332Sbostic 		*s = NL;
524*32332Sbostic 		if (gotamatch(file, k) == NULL)
525*32332Sbostic 			return (NULL);
526*32332Sbostic 	}
527*32332Sbostic 	*s = NL;
528*32332Sbostic 	return (s + 1);
529*32332Sbostic }
530*32332Sbostic 
531*32332Sbostic /*
532*32332Sbostic  * EUC code disambiguation -- scan backwards to first 7-bit code, while
533*32332Sbostic  * counting intervening 8-bit codes.  If odd, reject unaligned Kanji pattern.
534*32332Sbostic  * SS2/3 checks are for intermixed Japanase Katakana or Kanji2.
535*32332Sbostic  */
536*32332Sbostic char *
537*32332Sbostic kanji(str, s, k)
538*32332Sbostic 	register char *str, *s, *k;
539*32332Sbostic {
540*32332Sbostic 	register int j = 0;
541*32332Sbostic 
542*32332Sbostic 	for (s--; s >= str; s--) {
543*32332Sbostic 		if (*s == SS2 || *s == SS3 || (*s & NONASCII) == 0)
544*32332Sbostic 			break;
545*32332Sbostic 		j++;
546*32332Sbostic 	}
547*32332Sbostic #ifndef CHINESE
548*32332Sbostic 	if (*s == SS2)
549*32332Sbostic 		j -= 1;
550*32332Sbostic #endif  CHINESE
551*32332Sbostic 	return ((j & 01) ? NULL : k);
552*32332Sbostic }
553*32332Sbostic 
554*32332Sbostic /*
555*32332Sbostic  * Compute "Boyer-Moore" delta table -- put skip distance in delta0[c]
556*32332Sbostic  */
557*32332Sbostic gosper(pattern)
558*32332Sbostic 	char *pattern;		/* ... HAKMEM lives ... */
559*32332Sbostic {
560*32332Sbostic 	register int i, j;
561*32332Sbostic 	unsigned char c;
562*32332Sbostic 
563*32332Sbostic 	/* Make one-string case look like simple alternatives case */
564*32332Sbostic 	if (!altflag) {
565*32332Sbostic 		nalt = 1;
566*32332Sbostic 		altmin = altlen[0] = strlen(pattern);
567*32332Sbostic 		strcpy(altpat[0], pattern);
568*32332Sbostic 	}
569*32332Sbostic 	/* For chars that aren't in any string, skip by string length. */
570*32332Sbostic 	for (j = 0; j < 256; j++) {
571*32332Sbostic 		delta0[j] = altmin;
572*32332Sbostic 		cmap[j] = j;	/* Sneak in initialization of cmap */
573*32332Sbostic 	}
574*32332Sbostic 
575*32332Sbostic 	/* For chars in a string, skip distance from char to end of string. */
576*32332Sbostic 	/* (If char appears more than once, skip minimum distance.) */
577*32332Sbostic 	for (i = 0; i < nalt; i++)
578*32332Sbostic 		for (j = 0; j < altlen[i] - 1; j++) {
579*32332Sbostic 			c = altpat[i][j];
580*32332Sbostic 			delta0[c] = MIN(delta0[c], altlen[i] - j - 1);
581*32332Sbostic 			if (iflag && islower((int) c))
582*32332Sbostic 				delta0[toupper((int) c)] = delta0[c];
583*32332Sbostic 		}
584*32332Sbostic 
585*32332Sbostic 	/* For last char of each string, fall out of search loop. */
586*32332Sbostic 	for (i = 0; i < nalt; i++) {
587*32332Sbostic 		c = altpat[i][altlen[i] - 1];
588*32332Sbostic 		delta0[c] = LARGE;
589*32332Sbostic 		if (iflag && islower((int) c))
590*32332Sbostic 			delta0[toupper((int) c)] = LARGE;
591*32332Sbostic 	}
592*32332Sbostic 	if (iflag)
593*32332Sbostic 		for (j = 'A'; j <= 'Z'; j++)
594*32332Sbostic 			cmap[j] = tolower((int) j);
595*32332Sbostic }
596*32332Sbostic 
597*32332Sbostic /*
598*32332Sbostic  * Print, count, or stop on full match. Result is either the location for
599*32332Sbostic  * continued search, or NULL to stop.
600*32332Sbostic  */
601*32332Sbostic char *
602*32332Sbostic gotamatch(file, s)
603*32332Sbostic 	register char *file, *s;
604*32332Sbostic {
605*32332Sbostic 	char *savematch();
606*32332Sbostic 	int squirrel = 0;	/* nonzero to squirrel away FIRSTFEW matches */
607*32332Sbostic 
608*32332Sbostic 	nmatch++;
609*32332Sbostic 	nsuccess = 1;
610*32332Sbostic 	if (!boyonly && boyfound <= FIRSTFEW && file != NULL)
611*32332Sbostic 		squirrel = 1;
612*32332Sbostic 
613*32332Sbostic 	if (sflag)
614*32332Sbostic 		return (NULL);	/* -s usurps all flags (unlike some versions) */
615*32332Sbostic 	if (cflag) {		/* -c overrides -l, we guess */
616*32332Sbostic 		do;
617*32332Sbostic 		while (*s++ != NL);
618*32332Sbostic 	} else if (lflag) {
619*32332Sbostic 		puts(file);
620*32332Sbostic 		return (NULL);
621*32332Sbostic 	} else {
622*32332Sbostic 		if (!hflag)
623*32332Sbostic 			if (!squirrel)
624*32332Sbostic 				printf("%s:", file);
625*32332Sbostic 			else
626*32332Sbostic 				sprintf(preamble, "%s:", file);
627*32332Sbostic 		if (nflag) {
628*32332Sbostic 			if (prevmatch)
629*32332Sbostic 				prevnline = prevnline + nlcount(prevloc, s);
630*32332Sbostic 			else
631*32332Sbostic 				prevnline = nline + nlcount(str, s);
632*32332Sbostic 			prevmatch = 1;
633*32332Sbostic 
634*32332Sbostic 			if (!squirrel)
635*32332Sbostic 				printf("%ld:", prevnline);
636*32332Sbostic 			else
637*32332Sbostic 				sprintf(preamble + strlen(preamble),
638*32332Sbostic 					"%ld:", prevnline);
639*32332Sbostic 		}
640*32332Sbostic 		if (!squirrel) {
641*32332Sbostic 			do
642*32332Sbostic 				putchar(*s);
643*32332Sbostic 			while (*s++ != NL);
644*32332Sbostic 		} else
645*32332Sbostic 			s = savematch(s);
646*32332Sbostic 
647*32332Sbostic 		if (nflag)
648*32332Sbostic 			prevloc = s - 1;
649*32332Sbostic 	}
650*32332Sbostic 	return ((firstflag && !cflag) ? NULL : s);
651*32332Sbostic }
652*32332Sbostic 
653*32332Sbostic char *
654*32332Sbostic fold(line)
655*32332Sbostic 	char *line;
656*32332Sbostic {
657*32332Sbostic 	static char fline[BUFSIZE];
658*32332Sbostic 	register char *s, *t = fline;
659*32332Sbostic 
660*32332Sbostic 	for (s = line; *s != EOS; s++)
661*32332Sbostic 		*t++ = (isupper((int) *s) ? (char) tolower((int) *s) : *s);
662*32332Sbostic 	*t = EOS;
663*32332Sbostic 	return (fline);
664*32332Sbostic }
665*32332Sbostic 
666*32332Sbostic strindex(s, t)			/* the easy way, as in K&P, p. 192 */
667*32332Sbostic 	char *s, *t;
668*32332Sbostic {
669*32332Sbostic 	int i, n;
670*32332Sbostic 
671*32332Sbostic 	n = strlen(t);
672*32332Sbostic 	for (i = 0; s[i] != '\0'; i++)
673*32332Sbostic 		if (strncmp(s + i, t, n) == 0)
674*32332Sbostic 			return (i);
675*32332Sbostic 	return (-1);
676*32332Sbostic }
677*32332Sbostic 
678*32332Sbostic char *
679*32332Sbostic grepxlat(pattern)		/* grep pattern meta conversion */
680*32332Sbostic 	char *pattern;
681*32332Sbostic {
682*32332Sbostic 	register char *p, *s;
683*32332Sbostic 	static char newpat[BUFSIZE];
684*32332Sbostic 
685*32332Sbostic 	for (s = newpat, p = pattern; *p != EOS;) {
686*32332Sbostic 		if (*p == '\\') {	/* skip escapes ... */
687*32332Sbostic 			*s++ = *p++;
688*32332Sbostic 			if (*p)
689*32332Sbostic 				*s++ = *p++;
690*32332Sbostic 		} else if (*p == '[') {	/* ... and char classes */
691*32332Sbostic 			while (*p != EOS && *p != ']')
692*32332Sbostic 				*s++ = *p++;
693*32332Sbostic 		} else if (strchr("+?|()", *p) != NULL) {
694*32332Sbostic 			*s++ = '\\';	/* insert protection */
695*32332Sbostic 			*s++ = *p++;
696*32332Sbostic 		} else
697*32332Sbostic 			*s++ = *p++;
698*32332Sbostic 	}
699*32332Sbostic 	*s = EOS;
700*32332Sbostic 	grepflag = ((patind) ? 0 : 1);
701*32332Sbostic 	return (newpat);
702*32332Sbostic }
703*32332Sbostic 
704*32332Sbostic /*
705*32332Sbostic  * Test for simple alternation.  Result is NULL if it's not so simple, or is
706*32332Sbostic  * a pointer to the first string if it is. Warning:  sscanf size is a
707*32332Sbostic  * fixpoint, beyond which the speedup linearity starts to break down.  In the
708*32332Sbostic  * wake of the elegant aho/corrasick "trie"-based fgrep, generalizing
709*32332Sbostic  * altpat[] to arbitrary size is not useful.
710*32332Sbostic  */
711*32332Sbostic char *
712*32332Sbostic alternate(regexpr)
713*32332Sbostic 	char *regexpr;
714*32332Sbostic {
715*32332Sbostic 	register i, j, min;
716*32332Sbostic 	unsigned char c;
717*32332Sbostic 	char oflow[100];
718*32332Sbostic 
719*32332Sbostic 	if (fgrepflag && strchr(regexpr, '|'))
720*32332Sbostic 		return (NULL);
721*32332Sbostic 
722*32332Sbostic 	i = strlen(regexpr);
723*32332Sbostic 	for (j = 0; j < i; j++)
724*32332Sbostic 		if (regexpr[j] == NL)
725*32332Sbostic 			regexpr[j] = '|';
726*32332Sbostic 
727*32332Sbostic 	if (!fgrepflag) {
728*32332Sbostic 		if (strchr(regexpr, '|') == NULL || regexpr[0] == '|')
729*32332Sbostic 			return (NULL);
730*32332Sbostic 		if (strpbrk(regexpr, "^$.[]()?+*\\") != NULL
731*32332Sbostic 		    || strindex(regexpr, "||") >= 0)
732*32332Sbostic 			return (NULL);
733*32332Sbostic 	}
734*32332Sbostic 	oflow[0] = EOS;
735*32332Sbostic 	nalt = sscanf(regexpr, "%[^|]|%[^|]|%[^|]|%[^|]|%[^|]|%[^|]|%[^|]|%[^|]",
736*32332Sbostic 		      altpat[0], altpat[1], altpat[2], altpat[3], altpat[4], altpat[5], altpat[6], oflow);
737*32332Sbostic 
738*32332Sbostic 	if (nalt < 1 || oflow[0] != EOS)
739*32332Sbostic 		return (NULL);
740*32332Sbostic 
741*32332Sbostic 	altmin = NMUSH;
742*32332Sbostic 	for (j = 0; j < nalt; j++) {
743*32332Sbostic 		min = altlen[j] = strlen(altpat[j]);
744*32332Sbostic 		if (min > ALTSIZE)
745*32332Sbostic 			return (NULL);
746*32332Sbostic 		altmin = MIN(altmin, min);
747*32332Sbostic 	}
748*32332Sbostic 	if (nalt > 1) {		/* build superimposed "pre-match" sets per
749*32332Sbostic 				 * char */
750*32332Sbostic 		altflag++;
751*32332Sbostic 		for (j = 0; j < nalt; j++)
752*32332Sbostic 			for (i = 0; i < altmin; i++) {
753*32332Sbostic 				c = altpat[j][altlen[j] - altmin + i];
754*32332Sbostic 				altset[i + 1][c] = 1;	/* offset for sentinel */
755*32332Sbostic 			}
756*32332Sbostic 	}
757*32332Sbostic 	return (altpat[0]);
758*32332Sbostic }
759*32332Sbostic 
760*32332Sbostic /*
761*32332Sbostic  * Grapple with the dfa (std egrep) vs. ndfa (regexp) tradeoff. Criteria to
762*32332Sbostic  * determine whether to use dfa-based egrep:  We do FIRSTFEW matches with
763*32332Sbostic  * regexec().  If Boyer-Moore up to now matched more than PUNTPERCENT
764*32332Sbostic  * of the input, the r.e. is likely to be underspecified, so do old *grep,
765*32332Sbostic  * which is faster on complex patterns than regexp().  At FIRSTFEW,
766*32332Sbostic  * dump the saved matches collected by savematch(). They are saved
767*32332Sbostic  * so that a "PUNT" can "rewind" to ignore them.  Stdin is problematic,
768*32332Sbostic  * since it's hard to rewind.
769*32332Sbostic  */
770*32332Sbostic 
771*32332Sbostic execstrategy(file)
772*32332Sbostic 	char *file;
773*32332Sbostic {
774*32332Sbostic 	int pctmatch;
775*32332Sbostic 
776*32332Sbostic 	pctmatch = (100 * rxcount) / incount;
777*32332Sbostic 	if (pctmatch > PUNTPERCENT && file != NULL)
778*32332Sbostic 		kernighan(args);
779*32332Sbostic 	if (file != NULL)
780*32332Sbostic 		flushmatches();
781*32332Sbostic }
782*32332Sbostic 
783*32332Sbostic nlcount(bstart, bstop)		/* flail interval to totalize newlines. */
784*32332Sbostic 	char *bstart, *bstop;
785*32332Sbostic {
786*32332Sbostic 	register char *s = bstart;
787*32332Sbostic 	register char *t = bstop;
788*32332Sbostic 	register int count = 0;
789*32332Sbostic 
790*32332Sbostic 	do {			/* loop unroll for older architectures */
791*32332Sbostic 		if (*t == NL)	/* ... ask ames!jaw for sample code */
792*32332Sbostic 			count++;
793*32332Sbostic 	} while (t-- > s);
794*32332Sbostic 
795*32332Sbostic 	return (count);
796*32332Sbostic }
797*32332Sbostic 
798*32332Sbostic char *
799*32332Sbostic isolate(regexpr)		/* isolate longest metacharacter-free string */
800*32332Sbostic 	char *regexpr;
801*32332Sbostic {
802*32332Sbostic 	char *dummyexpr;
803*32332Sbostic 
804*32332Sbostic 	/*
805*32332Sbostic 	 * We add (.)* because Henry's regcomp only figures regmust if it
806*32332Sbostic 	 * sees a leading * pattern.  Foo!
807*32332Sbostic 	 */
808*32332Sbostic 	dummyexpr = malloc((unsigned) strlen(regexpr) + 5);
809*32332Sbostic 	sprintf(dummyexpr, "(.)*%s", regexpr);
810*32332Sbostic 	if ((rspencer = regcomp(dummyexpr)) == NULL)
811*32332Sbostic 		kernighan(args);
812*32332Sbostic 	return (rspencer->regmust);
813*32332Sbostic }
814*32332Sbostic 
815*32332Sbostic char *matches[FIRSTFEW];
816*32332Sbostic static int mcount = 0;
817*32332Sbostic 
818*32332Sbostic char *
819*32332Sbostic savematch(s)			/* horde matches during statistics gathering */
820*32332Sbostic 	register char *s;
821*32332Sbostic {
822*32332Sbostic 	char *p;
823*32332Sbostic 	char *start = s;
824*32332Sbostic 	int msize = 0;
825*32332Sbostic 	int psize = strlen(preamble);
826*32332Sbostic 
827*32332Sbostic 	while (*s++ != NL)
828*32332Sbostic 		msize++;
829*32332Sbostic 	*--s = EOS;
830*32332Sbostic 
831*32332Sbostic 	p = malloc((unsigned) msize + 1 + psize);
832*32332Sbostic 	strcpy(p, preamble);
833*32332Sbostic 	strcpy(p + psize, start);
834*32332Sbostic 	matches[mcount++] = p;
835*32332Sbostic 
836*32332Sbostic 	preamble[0] = 0;
837*32332Sbostic 	*s = NL;
838*32332Sbostic 	return (s);
839*32332Sbostic }
840*32332Sbostic 
841*32332Sbostic flushmatches()
842*32332Sbostic {
843*32332Sbostic 	int n;
844*32332Sbostic 
845*32332Sbostic 	flushflag = 1;
846*32332Sbostic 	for (n = 0; n < mcount; n++)
847*32332Sbostic 		printf("%s\n", matches[n]);
848*32332Sbostic 	mcount = 0;
849*32332Sbostic }
850*32332Sbostic 
851*32332Sbostic oops(message)
852*32332Sbostic 	char *message;
853*32332Sbostic {
854*32332Sbostic 	fprintf(stderr, "%s: %s\n", progname, message);
855*32332Sbostic 	exit(2);
856*32332Sbostic }
857*32332Sbostic 
858*32332Sbostic kernighan(args)			/* "let others do the hard part ..." */
859*32332Sbostic 	char *args[];
860*32332Sbostic {
861*32332Sbostic 	/*
862*32332Sbostic 	 * We may have already run grep on some of the files; remove them
863*32332Sbostic 	 * from the arg list we pass on.  Note that we can't delete them
864*32332Sbostic 	 * totally because the number of file names affects the output
865*32332Sbostic 	 * (automatic -h).
866*32332Sbostic 	 */
867*32332Sbostic 	/* better would be fork/exec per punted file -- jaw */
868*32332Sbostic 
869*32332Sbostic 	while (firstfile && optind > firstfile)
870*32332Sbostic 		args[firstfile++] = "/dev/null";
871*32332Sbostic 	if (patind)
872*32332Sbostic 		args[patind] = pattern;
873*32332Sbostic 	fflush(stdout);
874*32332Sbostic 
875*32332Sbostic 	if (grepflag)
876*32332Sbostic 		execvp(GREPSTD, args), oops("can't exec old 'grep'");
877*32332Sbostic 	else if (fgrepflag)
878*32332Sbostic 		execvp(FGREPSTD, args), oops("can't exec old 'fgrep'");
879*32332Sbostic 	else
880*32332Sbostic 		execvp(EGREPSTD, args), oops("can't exec old 'egrep'");
881*32332Sbostic }
882