148263Sbostic /*-
2*62020Sbostic * Copyright (c) 1991, 1993
3*62020Sbostic * The Regents of the University of California. All rights reserved.
448263Sbostic *
548263Sbostic * %sccs.include.redist.c%
648263Sbostic */
748263Sbostic
832332Sbostic #ifndef lint
9*62020Sbostic static char copyright[] =
10*62020Sbostic "@(#) Copyright (c) 1991, 1993\n\
11*62020Sbostic The Regents of the University of California. All rights reserved.\n";
1248263Sbostic #endif /* not lint */
1332332Sbostic
1448263Sbostic #ifndef lint
15*62020Sbostic static char sccsid[] = "@(#)egrep.c 8.1 (Berkeley) 06/06/93";
1648263Sbostic #endif /* not lint */
1748263Sbostic
1832332Sbostic /*
1932332Sbostic Hybrid Boyer/Moore/Gosper-assisted 'grep/egrep/fgrep' search, with delta0
2032332Sbostic table as in original paper (CACM, October, 1977). No delta1 or delta2.
2132332Sbostic According to experiment (Horspool, Soft. Prac. Exp., 1982), delta2 is of
2232332Sbostic minimal practical value. However, to improve for worst case input,
2332332Sbostic integrating the improved Galil strategies (Apostolico/Giancarlo, SIAM. J.
2432332Sbostic Comput., Feb. 1986) deserves consideration.
2532332Sbostic
2632332Sbostic Method: extract longest metacharacter-free string from expression.
2732332Sbostic this is done using a side-effect from henry spencer's regcomp().
2832332Sbostic use boyer-moore to match such, then pass submatching lines
2932332Sbostic to either regexp() or standard 'egrep', depending on certain
3032332Sbostic criteria within execstrategy() below. [this tradeoff is due
3132332Sbostic to the general slowness of the regexp() nondeterministic
3232332Sbostic machine on complex expressions, as well as the startup time
3332332Sbostic of standard 'egrep' on short files.] alternatively, one may
3432332Sbostic change the vendor-supplied 'egrep' automaton to include
3532332Sbostic boyer-moore directly. see accompanying writeup for discussion
3632332Sbostic of kanji expression treatment.
3732332Sbostic
3832332Sbostic late addition: apply trickbag for fast match of simple
3932332Sbostic alternations (sublinear, in common low-cardinality cases).
4032332Sbostic trap fgrep into this lair.
4132332Sbostic
4232332Sbostic gnu additions: -f, newline as |, \< and \> [in regexec()], more
4332332Sbostic comments. inspire better dfa exec() strategy.
4432332Sbostic serious testing and help with special cases.
4532332Sbostic
4632332Sbostic Algorithm amalgam summary:
4732332Sbostic
4832332Sbostic dfa e?grep (aho/thompson)
4932332Sbostic ndfa regexp() (spencer/aho)
5032332Sbostic bmg (boyer/moore/gosper)
5132332Sbostic "superimposed" bmg (jaw)
5232332Sbostic fgrep (aho/corrasick)
5332332Sbostic
5432332Sbostic sorry, but the knuth/morris/pratt machine, horspool's
5532332Sbostic "frequentist" code, and the rabin/karp matcher, however cute,
5632332Sbostic just don't cut it for this production.
5732332Sbostic
5832332Sbostic James A. Woods Copyright (c) 1986
5932332Sbostic NASA Ames Research Center
6032332Sbostic */
6132332Sbostic #include <sys/types.h>
6232332Sbostic #include <sys/stat.h>
6332337Sbostic #include <sys/file.h>
6432332Sbostic #include <regexp.h> /* must be henry spencer's version */
6537914Sbostic #include <stdio.h>
6637914Sbostic #include <ctype.h>
6737914Sbostic #include "pathnames.h"
6832332Sbostic
6932332Sbostic #define MIN(A, B) ((A) > (B) ? (B) : (A))
7032332Sbostic
7132332Sbostic #ifdef SLOWSYS
7232332Sbostic #define read xread
7332332Sbostic #endif
7432332Sbostic
7532332Sbostic #define BUFSIZE 8192 /* make higher for cray */
7632332Sbostic #define PATSIZE 6000
7732332Sbostic #define LARGE BUFSIZE + PATSIZE
7832332Sbostic
7932332Sbostic #define NALT 7 /* tied to scanf() size in alternate() */
8032332Sbostic #define NMUSH 6 /* loosely relates to expected alt length */
8132332Sbostic
8232332Sbostic #define FIRSTFEW 33 /* Always do FIRSTFEW matches with regexec() */
8332332Sbostic #define PUNTPERCENT 10 /* After FIRSTFEW, if PUNTPERCENT of the input
8432332Sbostic * was processed by regexp(), exec std egrep. */
8532332Sbostic #define NL '\n'
8632332Sbostic #define EOS '\0'
8732332Sbostic #define NONASCII 0200 /* Bit mask for Kanji non-ascii chars */
8832332Sbostic #define META "\n^$.[]()?+*|\\" /* egrep meta-characters */
8932332Sbostic #define SS2 '\216' /* EUC Katakana (or Chinese2) prefix */
9032332Sbostic #define SS3 '\217' /* EUC Kanji2 (or Chinese3) prefix */
9132332Sbostic
9232332Sbostic extern char *optarg;
9332332Sbostic extern int optind;
9432332Sbostic char *progname;
9532332Sbostic
9632332Sbostic int cflag, iflag, eflag, fflag, lflag, nflag; /* SVID flags */
9732332Sbostic int sflag, hflag; /* v7, v8, bsd */
9832332Sbostic
9932332Sbostic int firstflag; /* Stop at first match */
10032332Sbostic int grepflag; /* Called as "grep" */
10132332Sbostic int fgrepflag; /* Called as "fgrep" */
10232332Sbostic int altflag; /* Simple alternation in pattern */
10332332Sbostic int boyonly; /* No regexp needed -- all simple */
10432332Sbostic int flushflag;
10532332Sbostic int grepold, egrepold, fgrepold;
10632332Sbostic
10732332Sbostic int nalt; /* Number of alternatives */
10832332Sbostic int nsuccess; /* 1 for match, 2 for error */
10932332Sbostic int altmin; /* Minimum length of all the alternate
11032332Sbostic * strings */
11132332Sbostic int firstfile; /* argv index of first file argument */
11232332Sbostic int patind; /* argv index of pattern */
11332332Sbostic long nmatch; /* Number of matches in this file */
11432332Sbostic long incount, counted; /* Amount of input consumed */
11532332Sbostic long rxcount; /* Bytes of input processed by regexec() */
11632332Sbostic int boyfound; /* accumulated partial matches (tripped by
11732332Sbostic * FIRSTFEW) */
11832332Sbostic int prevmatch; /* next three lines aid fast -n */
11932332Sbostic long nline, prevnline;
12032332Sbostic char *prevloc;
12132332Sbostic
12232332Sbostic regexp *rspencer;
12332332Sbostic char *pattern;
12432332Sbostic char *patboy; /* Pattern for simple Boyer-Moore */
12532332Sbostic char *patfile; /* Filename containing pattern(s) */
12632332Sbostic
12732332Sbostic int delta0[256]; /* Boyer-Moore algorithm core */
12832332Sbostic char cmap[256]; /* Usually 0-255, but if -i, maps upper to
12932332Sbostic * lower case */
13032332Sbostic char str[BUFSIZE + 2];
13132332Sbostic int nleftover;
13232332Sbostic char linetemp[BUFSIZE];
13332339Sbostic char *altpat[NALT]; /* alternation component storage */
13432332Sbostic int altlen[NALT];
13532332Sbostic short altset[NMUSH + 1][256];
13632332Sbostic char preamble[200]; /* match prefix (filename, line no.) */
13732332Sbostic
13832332Sbostic int fd;
13932332Sbostic char *
14032502Sbostic strchr(), *strrchr(), *strcpy(), *strncpy(), *strpbrk(), *malloc();
14132332Sbostic char *
14232332Sbostic grepxlat(), *fold(), *pfile(), *alternate(), *isolate();
14335692Sbostic char *gotamatch(), *kanji(), *linesave(), *submatch();
14432332Sbostic char **args;
14532332Sbostic
main(argc,argv)14632332Sbostic main(argc, argv)
14732332Sbostic int argc;
14832332Sbostic char *argv[];
14932332Sbostic {
15032365Sbostic int c, oflag;
15132332Sbostic int errflag = 0;
15232332Sbostic
15332332Sbostic args = argv;
15432332Sbostic
15532332Sbostic if ((progname = strrchr(argv[0], '/')) != 0)
15632332Sbostic progname++;
15732332Sbostic else
15832332Sbostic progname = argv[0];
15932332Sbostic if (strcmp(progname, "grep") == 0)
16032332Sbostic grepflag++;
16132365Sbostic else if (strcmp(progname, "fgrep") == 0)
16232332Sbostic fgrepflag++;
16332332Sbostic
16432365Sbostic oflag = 0;
16532365Sbostic while ((c = getopt(argc, argv, "bchie:f:lnosvwxy1")) != EOF) {
16632332Sbostic switch (c) {
16732332Sbostic
16832332Sbostic case 'f':
16932332Sbostic fflag++;
17032332Sbostic patfile = optarg;
17132332Sbostic continue;
17232332Sbostic case 'b':
17332332Sbostic case 'v':
17432332Sbostic egrepold++; /* boyer-moore of little help here */
17532332Sbostic continue;
17632332Sbostic case 'c':
17732332Sbostic cflag++;
17832332Sbostic continue;
17932332Sbostic case 'e':
18032332Sbostic eflag++;
18132332Sbostic pattern = optarg;
18232332Sbostic continue;
18332332Sbostic case 'h':
18432332Sbostic hflag++;
18532332Sbostic continue;
18632365Sbostic case 'o':
18732365Sbostic oflag++;
18832365Sbostic continue;
18932332Sbostic case '1': /* Stop at very first match */
19032332Sbostic firstflag++; /* spead freaks only */
19132332Sbostic continue;
19232332Sbostic case 'i':
19332332Sbostic iflag++;
19432332Sbostic continue;
19532332Sbostic case 'l':
19632332Sbostic lflag++;
19732332Sbostic continue;
19832332Sbostic case 'n':
19932332Sbostic nflag++;
20032332Sbostic continue;
20132332Sbostic case 's':
20232332Sbostic sflag++;
20332332Sbostic continue;
20432332Sbostic case 'w':
20532332Sbostic case 'y':
20632332Sbostic if (!grepflag)
20732332Sbostic errflag++;
20832332Sbostic grepold++;
20932332Sbostic continue;
21032332Sbostic case 'x': /* needs more work, like -b above */
21132332Sbostic if (!fgrepflag)
21232332Sbostic errflag++;
21332332Sbostic fgrepold++;
21432332Sbostic continue;
21532332Sbostic case '?':
21632332Sbostic errflag++;
21732332Sbostic }
21832332Sbostic }
21932332Sbostic if (errflag || ((argc <= optind) && !fflag && !eflag)) {
22032332Sbostic if (grepflag)
22132365Sbostic oops("usage: grep [-bchilnosvwy] [-e] pattern [file ...]");
22232332Sbostic else if (fgrepflag)
22332366Sbostic oops("usage: fgrep [-bchilnosvx] {-f patfile | [-e] strings} [file ...]");
22432332Sbostic else /* encourage SVID options, though we provide
22532332Sbostic * others */
22632366Sbostic oops("usage: egrep [-bchilnosv] {-f patfile | [-e] pattern} [file ...]");
22732332Sbostic }
22832332Sbostic if (fflag)
22934917Sbostic pattern = pfile(patfile);
23034917Sbostic else if (!eflag) {
23134917Sbostic patind = optind;
23232332Sbostic pattern = argv[optind++];
23334917Sbostic }
23432332Sbostic
23532365Sbostic if (!oflag && (argc - optind) <= 1) /* Filename invisible given < 2 files */
23632332Sbostic hflag++;
23732332Sbostic if (pattern[0] == EOS)
23832332Sbostic kernighan(argv); /* same as it ever was */
23932332Sbostic /*
24032332Sbostic * 'grep/egrep' merger -- "old" grep is called to handle: tagged
24132332Sbostic * exprs \( \), word matches \< and \>, -w and -y options, char
24232332Sbostic * classes with '-' at end (egrep bug?), and patterns beginning with
24332332Sbostic * an asterisk (don't ask why). otherwise, characters meaningful to
24432332Sbostic * 'egrep' but not to 'grep' are escaped; the entire expr is then
24532332Sbostic * passed to 'egrep'.
24632332Sbostic */
24732332Sbostic if (grepflag && !grepold) {
24832332Sbostic if (strindex(pattern, "\\(") >= 0 ||
24932332Sbostic strindex(pattern, "\\<") >= 0 ||
25032332Sbostic strindex(pattern, "\\>") >= 0 ||
25132332Sbostic strindex(pattern, "-]") >= 0 ||
25232332Sbostic pattern[0] == '*') /* grep bug */
25332332Sbostic grepold++;
25432332Sbostic else
25532332Sbostic pattern = grepxlat(pattern);
25632332Sbostic }
25732332Sbostic if (grepold || egrepold || fgrepold)
25832332Sbostic kernighan(argv);
25932332Sbostic
26032332Sbostic if (iflag)
26132332Sbostic strcpy(pattern, fold(pattern));
26232332Sbostic /*
26332332Sbostic * If the pattern is a plain string, just run boyer-moore. If it
26432332Sbostic * consists of meta-free alternatives, run "superimposed" bmg.
26532332Sbostic * Otherwise, find best string, and compile pattern for regexec().
26632332Sbostic */
26732332Sbostic if (strpbrk(pattern, META) == NULL) { /* do boyer-moore only */
26832332Sbostic boyonly++;
26932332Sbostic patboy = pattern;
27032332Sbostic } else {
27132332Sbostic if ((patboy = alternate(pattern)) != NULL)
27232332Sbostic boyonly++;
27332332Sbostic else {
27432332Sbostic if ((patboy = isolate(pattern)) == NULL)
27532332Sbostic kernighan(argv); /* expr too involved */
27632332Sbostic #ifndef NOKANJI
27732332Sbostic for (c = 0; pattern[c] != EOS; c++)
27832332Sbostic if (pattern[c] & NONASCII) /* kanji + meta */
27932332Sbostic kernighan(argv);
28032332Sbostic #endif
28132332Sbostic if ((rspencer = regcomp(pattern)) == NULL)
28232332Sbostic oops("regcomp failure");
28332332Sbostic }
28432332Sbostic }
28532332Sbostic gosper(patboy); /* "pre-conditioning is wonderful"
28632332Sbostic * -- v. strassen */
28732332Sbostic
28832332Sbostic if ((firstfile = optind) >= argc) {
28932332Sbostic /* Grep standard input */
29032332Sbostic if (lflag) /* We don't know its name! */
29132332Sbostic exit(1);
29232332Sbostic egsecute((char *) NULL);
29332332Sbostic } else {
29432332Sbostic while (optind < argc) {
29532332Sbostic egsecute(argv[optind]);
29632332Sbostic optind++;
29732332Sbostic if (firstflag && (nsuccess == 1))
29832332Sbostic break;
29932332Sbostic }
30032332Sbostic }
30132332Sbostic exit((nsuccess == 2) ? 2 : (nsuccess == 0));
30232332Sbostic }
30332332Sbostic
30432332Sbostic char *
pfile(pfname)30532332Sbostic pfile(pfname) /* absorb expression from file */
30632332Sbostic char *pfname;
30732332Sbostic {
30832337Sbostic int fd;
30932332Sbostic struct stat patstat;
31032332Sbostic static char *pat;
31132332Sbostic
31232337Sbostic if ((fd = open(pfname, O_RDONLY, 0)) < 0)
31332332Sbostic oops("can't read pattern file");
31432337Sbostic if (fstat(fd, &patstat) != 0)
31532332Sbostic oops("can't stat pattern file");
31632332Sbostic if (patstat.st_size > PATSIZE) {
31732332Sbostic if (fgrepflag) { /* defer to unix version */
31832332Sbostic fgrepold++;
31932332Sbostic return "dummy";
32032332Sbostic } else
32132332Sbostic oops("pattern file too big");
32232332Sbostic }
32332332Sbostic if ((pat = malloc((unsigned) patstat.st_size + 1)) == NULL)
32432332Sbostic oops("out of memory to read pattern file");
32532337Sbostic if (patstat.st_size != read(fd, pat, (int)patstat.st_size))
32632332Sbostic oops("error reading pattern file");
32732337Sbostic (void) close(fd);
32832332Sbostic
32932332Sbostic pat[patstat.st_size] = EOS;
33032332Sbostic if (pat[patstat.st_size - 1] == NL) /* NOP for egrep; helps grep */
33132332Sbostic pat[patstat.st_size - 1] = EOS;
33232332Sbostic
33332332Sbostic if (nlcount(pat, &pat[patstat.st_size]) > NALT) {
33432332Sbostic if (fgrepflag)
33532332Sbostic fgrepold++; /* "what's it all about, alfie?" */
33632332Sbostic else
33732332Sbostic egrepold++;
33832332Sbostic }
33932332Sbostic return (pat);
34032332Sbostic }
34132332Sbostic
egsecute(file)34232332Sbostic egsecute(file)
34332332Sbostic char *file;
34432332Sbostic {
34546717Sbostic extern int errno;
34646717Sbostic
34732332Sbostic if (file == NULL)
34832332Sbostic fd = 0;
34932339Sbostic else if ((fd = open(file, O_RDONLY, 0)) <= 0) {
35046717Sbostic fprintf(stderr,
35146717Sbostic "%s: %s: %s\n", progname, file, strerror(errno));
35232332Sbostic nsuccess = 2;
35332332Sbostic return;
35432332Sbostic }
35532332Sbostic chimaera(file, patboy);
35632332Sbostic
35732332Sbostic if (!boyonly && !flushflag && file != NULL)
35832332Sbostic flushmatches();
35932332Sbostic if (file != NULL)
36032332Sbostic close(fd);
36132332Sbostic }
36232332Sbostic
chimaera(file,pat)36332332Sbostic chimaera(file, pat) /* "reach out and boyer-moore search someone" */
36432332Sbostic char *file, *pat; /* -- soon-to-be-popular bumper sticker */
36532332Sbostic {
36632332Sbostic register char *k, *strend, *s;
36732332Sbostic register int j, count;
36832332Sbostic register int *deltazero = delta0;
36932332Sbostic int patlen = altmin;
37032332Sbostic char *t;
37132332Sbostic
37232332Sbostic nleftover = boyfound = flushflag = 0;
37332332Sbostic nline = 1L;
37432332Sbostic prevmatch = 0;
37532332Sbostic nmatch = counted = rxcount = 0L;
37632332Sbostic
37732332Sbostic while ((count = read(fd, str + nleftover, BUFSIZE - nleftover)) > 0) {
37832332Sbostic
37932332Sbostic counted += count;
38032332Sbostic strend = linesave(str, count);
38132332Sbostic
38232332Sbostic for (k = str + patlen - 1; k < strend;) {
38332332Sbostic /*
38432332Sbostic * for a large class of patterns, upwards of 80% of
38532332Sbostic * match time is spent on the next line. we beat
38632332Sbostic * existing microcode (vax 'matchc') this way.
38732332Sbostic */
38832332Sbostic while ((k += deltazero[*(unsigned char *) k]) < strend);
38932332Sbostic if (k < (str + LARGE))
39032332Sbostic break;
39132332Sbostic k -= LARGE;
39232332Sbostic
39332332Sbostic if (altflag) {
39432332Sbostic /*
39532332Sbostic * Parallel Boyer-Moore. Check whether each
39632332Sbostic * of the previous <altmin> chars COULD be
39732332Sbostic * from one of the alternative strings.
39832332Sbostic */
39932332Sbostic s = k - 1;
40032332Sbostic j = altmin;
40132332Sbostic while (altset[--j][(unsigned char)
40232332Sbostic cmap[*(unsigned char *) s--]]);
40332332Sbostic /*
40432332Sbostic * quick test fails. in this life, compare
40532332Sbostic * 'em all. but, a "reverse trie" would
40632332Sbostic * attenuate worst case (linear w/delta2?).
40732332Sbostic */
40832332Sbostic if (--j < 0) {
40932332Sbostic count = nalt - 1;
41032332Sbostic do {
41132332Sbostic s = k;
41232332Sbostic j = altlen[count];
41332332Sbostic t = altpat[count];
41432332Sbostic
41532332Sbostic while
41632332Sbostic (cmap[*(unsigned char *) s--]
41732332Sbostic == t[--j]);
41832332Sbostic if (j < 0)
41932332Sbostic break;
42032332Sbostic }
42132332Sbostic while (count--);
42232332Sbostic }
42332332Sbostic } else {
42432332Sbostic /* One string -- check it */
42532332Sbostic j = patlen - 1;
42632332Sbostic s = k - 1;
42732332Sbostic while (cmap[*(unsigned char *) s--] == pat[--j]);
42832332Sbostic }
42932332Sbostic /*
43032332Sbostic * delta-less shortcut for literati. short shrift for
43132332Sbostic * genetic engineers?
43232332Sbostic */
43332332Sbostic if (j >= 0) {
43432332Sbostic k++; /* no match; restart next char */
43532332Sbostic continue;
43632332Sbostic }
43732332Sbostic k = submatch(file, pat, str, strend, k, count);
43832332Sbostic if (k == NULL)
43932332Sbostic return;
44032332Sbostic }
44132332Sbostic if (nflag) {
44232332Sbostic if (prevmatch)
44332332Sbostic nline = prevnline + nlcount(prevloc, k);
44432332Sbostic else
44532332Sbostic nline = nline + nlcount(str, k);
44632332Sbostic prevmatch = 0;
44732332Sbostic }
44832332Sbostic strncpy(str, linetemp, nleftover);
44932332Sbostic }
45032332Sbostic if (cflag) {
45132332Sbostic /* Bug from old grep: -c overrides -h. We fix the bug. */
45232332Sbostic if (!hflag)
45332332Sbostic printf("%s:", file);
45432332Sbostic printf("%ld\n", nmatch);
45532332Sbostic }
45632332Sbostic }
45732332Sbostic
45832332Sbostic char *
linesave(str,count)45932332Sbostic linesave(str, count) /* accumulate partial line at end of buffer */
46032332Sbostic char str[];
46132332Sbostic register int count;
46232332Sbostic {
46332332Sbostic register int j;
46432332Sbostic
46532332Sbostic count += nleftover;
46632332Sbostic if (count != BUFSIZE && fd != 0)
46732332Sbostic str[count++] = NL; /* insurance for broken last line */
46832332Sbostic str[count] = EOS;
46932332Sbostic for (j = count - 1; str[j] != NL && j >= 0;)
47032332Sbostic j--;
47132332Sbostic /*
47232332Sbostic * break up these lines: long line (> BUFSIZE), last line of file, or
47332332Sbostic * short return from read(), as from tee(1) input
47432332Sbostic */
47532332Sbostic if (j < 0 && (count == (BUFSIZE - nleftover))) {
47632332Sbostic str[count++] = NL;
47732332Sbostic str[count] = EOS;
47832332Sbostic linetemp[0] = EOS;
47932332Sbostic nleftover = 0;
48032332Sbostic return (str + count);
48132332Sbostic } else {
48232332Sbostic nleftover = count - j - 1;
48332332Sbostic strncpy(linetemp, str + j + 1, nleftover);
48432332Sbostic return (str + j);
48532332Sbostic }
48632332Sbostic }
48732332Sbostic
48832332Sbostic /*
48932332Sbostic * Process partial match. First check for mis-aligned Kanji, then match line
49032332Sbostic * against full compiled r.e. if statistics do not warrant handing off to
49132332Sbostic * standard egrep.
49232332Sbostic */
49332332Sbostic char *
submatch(file,pat,str,strend,k,altindex)49432332Sbostic submatch(file, pat, str, strend, k, altindex)
49532332Sbostic char file[], pat[], str[];
49632332Sbostic register char *strend, *k;
49732332Sbostic int altindex;
49832332Sbostic {
49932332Sbostic register char *s;
50032332Sbostic char *t, c;
50132332Sbostic
50232332Sbostic t = k;
50332332Sbostic s = ((altflag) ? k - altlen[altindex] + 1 : k - altmin + 1);
50432332Sbostic #ifndef NOKANJI
50532332Sbostic c = ((altflag) ? altpat[altindex][0] : pat[0]);
50632332Sbostic if (c & NONASCII)
50732332Sbostic if ((s = kanji(str, s, k)) == NULL)
50832332Sbostic return (++k); /* reject false kanji */
50932332Sbostic #endif
51032332Sbostic do;
51132332Sbostic while (*s != NL && --s >= str);
51232332Sbostic k = s + 1; /* now at line start */
51332332Sbostic
51432332Sbostic if (boyonly)
51532332Sbostic return (gotamatch(file, k));
51632332Sbostic
51732332Sbostic incount = counted - (strend - k);
51832332Sbostic if (boyfound++ == FIRSTFEW)
51932332Sbostic execstrategy(file);
52032332Sbostic
52132332Sbostic s = t;
52232332Sbostic do
52332332Sbostic rxcount++;
52432332Sbostic while (*s++ != NL);
52532332Sbostic *--s = EOS;
52632332Sbostic /*
52732332Sbostic * "quick henry -- the flit" (after theodor geisel)
52832332Sbostic */
52932332Sbostic if (regexec(rspencer, ((iflag) ? fold(k) : k)) == 1) {
53032332Sbostic *s = NL;
53132332Sbostic if (gotamatch(file, k) == NULL)
53232332Sbostic return (NULL);
53332332Sbostic }
53432332Sbostic *s = NL;
53532332Sbostic return (s + 1);
53632332Sbostic }
53732332Sbostic
53832338Sbostic #ifndef NOKANJI
53932332Sbostic /*
54032332Sbostic * EUC code disambiguation -- scan backwards to first 7-bit code, while
54132332Sbostic * counting intervening 8-bit codes. If odd, reject unaligned Kanji pattern.
54232332Sbostic * SS2/3 checks are for intermixed Japanase Katakana or Kanji2.
54332332Sbostic */
54432332Sbostic char *
kanji(str,s,k)54532332Sbostic kanji(str, s, k)
54632332Sbostic register char *str, *s, *k;
54732332Sbostic {
54832332Sbostic register int j = 0;
54932332Sbostic
55032332Sbostic for (s--; s >= str; s--) {
55132332Sbostic if (*s == SS2 || *s == SS3 || (*s & NONASCII) == 0)
55232332Sbostic break;
55332332Sbostic j++;
55432332Sbostic }
55532332Sbostic #ifndef CHINESE
55632332Sbostic if (*s == SS2)
55732332Sbostic j -= 1;
55832332Sbostic #endif CHINESE
55932332Sbostic return ((j & 01) ? NULL : k);
56032332Sbostic }
56132338Sbostic #endif
56232332Sbostic
56332332Sbostic /*
56432332Sbostic * Compute "Boyer-Moore" delta table -- put skip distance in delta0[c]
56532332Sbostic */
gosper(pattern)56632332Sbostic gosper(pattern)
56732332Sbostic char *pattern; /* ... HAKMEM lives ... */
56832332Sbostic {
56932332Sbostic register int i, j;
57032332Sbostic unsigned char c;
57132332Sbostic
57232332Sbostic /* Make one-string case look like simple alternatives case */
57332332Sbostic if (!altflag) {
57432332Sbostic nalt = 1;
57532332Sbostic altmin = altlen[0] = strlen(pattern);
57632339Sbostic altpat[0] = pattern;
57732332Sbostic }
57832332Sbostic /* For chars that aren't in any string, skip by string length. */
57932332Sbostic for (j = 0; j < 256; j++) {
58032332Sbostic delta0[j] = altmin;
58132332Sbostic cmap[j] = j; /* Sneak in initialization of cmap */
58232332Sbostic }
58332332Sbostic
58432332Sbostic /* For chars in a string, skip distance from char to end of string. */
58532332Sbostic /* (If char appears more than once, skip minimum distance.) */
58632332Sbostic for (i = 0; i < nalt; i++)
58732332Sbostic for (j = 0; j < altlen[i] - 1; j++) {
58832332Sbostic c = altpat[i][j];
58932332Sbostic delta0[c] = MIN(delta0[c], altlen[i] - j - 1);
59032332Sbostic if (iflag && islower((int) c))
59132332Sbostic delta0[toupper((int) c)] = delta0[c];
59232332Sbostic }
59332332Sbostic
59432332Sbostic /* For last char of each string, fall out of search loop. */
59532332Sbostic for (i = 0; i < nalt; i++) {
59632332Sbostic c = altpat[i][altlen[i] - 1];
59732332Sbostic delta0[c] = LARGE;
59832332Sbostic if (iflag && islower((int) c))
59932332Sbostic delta0[toupper((int) c)] = LARGE;
60032332Sbostic }
60132332Sbostic if (iflag)
60232332Sbostic for (j = 'A'; j <= 'Z'; j++)
60332332Sbostic cmap[j] = tolower((int) j);
60432332Sbostic }
60532332Sbostic
60632332Sbostic /*
60732332Sbostic * Print, count, or stop on full match. Result is either the location for
60832332Sbostic * continued search, or NULL to stop.
60932332Sbostic */
61032332Sbostic char *
gotamatch(file,s)61132332Sbostic gotamatch(file, s)
61232332Sbostic register char *file, *s;
61332332Sbostic {
61432332Sbostic char *savematch();
61532332Sbostic int squirrel = 0; /* nonzero to squirrel away FIRSTFEW matches */
61632332Sbostic
61732332Sbostic nmatch++;
61832332Sbostic nsuccess = 1;
61932332Sbostic if (!boyonly && boyfound <= FIRSTFEW && file != NULL)
62032332Sbostic squirrel = 1;
62132332Sbostic
62232332Sbostic if (sflag)
62332332Sbostic return (NULL); /* -s usurps all flags (unlike some versions) */
62432332Sbostic if (cflag) { /* -c overrides -l, we guess */
62532332Sbostic do;
62632332Sbostic while (*s++ != NL);
62732332Sbostic } else if (lflag) {
62832332Sbostic puts(file);
62932332Sbostic return (NULL);
63032332Sbostic } else {
63132332Sbostic if (!hflag)
63232332Sbostic if (!squirrel)
63332332Sbostic printf("%s:", file);
63432332Sbostic else
63532502Sbostic (void)sprintf(preamble, "%s:", file);
63632332Sbostic if (nflag) {
63732332Sbostic if (prevmatch)
63832332Sbostic prevnline = prevnline + nlcount(prevloc, s);
63932332Sbostic else
64032332Sbostic prevnline = nline + nlcount(str, s);
64132332Sbostic prevmatch = 1;
64232332Sbostic
64332332Sbostic if (!squirrel)
64432332Sbostic printf("%ld:", prevnline);
64532332Sbostic else
64632502Sbostic (void)sprintf(preamble + strlen(preamble),
64732332Sbostic "%ld:", prevnline);
64832332Sbostic }
64932332Sbostic if (!squirrel) {
65032332Sbostic do
65132332Sbostic putchar(*s);
65232332Sbostic while (*s++ != NL);
65332332Sbostic } else
65432332Sbostic s = savematch(s);
65532332Sbostic
65632332Sbostic if (nflag)
65732332Sbostic prevloc = s - 1;
65832332Sbostic }
65932332Sbostic return ((firstflag && !cflag) ? NULL : s);
66032332Sbostic }
66132332Sbostic
66232332Sbostic char *
fold(line)66332332Sbostic fold(line)
66432332Sbostic char *line;
66532332Sbostic {
66632332Sbostic static char fline[BUFSIZE];
66732332Sbostic register char *s, *t = fline;
66832332Sbostic
66932332Sbostic for (s = line; *s != EOS; s++)
67032332Sbostic *t++ = (isupper((int) *s) ? (char) tolower((int) *s) : *s);
67132332Sbostic *t = EOS;
67232332Sbostic return (fline);
67332332Sbostic }
67432332Sbostic
strindex(s,t)67532332Sbostic strindex(s, t) /* the easy way, as in K&P, p. 192 */
67632332Sbostic char *s, *t;
67732332Sbostic {
67832332Sbostic int i, n;
67932332Sbostic
68032332Sbostic n = strlen(t);
68132332Sbostic for (i = 0; s[i] != '\0'; i++)
68232332Sbostic if (strncmp(s + i, t, n) == 0)
68332332Sbostic return (i);
68432332Sbostic return (-1);
68532332Sbostic }
68632332Sbostic
68732332Sbostic char *
grepxlat(pattern)68832332Sbostic grepxlat(pattern) /* grep pattern meta conversion */
68932332Sbostic char *pattern;
69032332Sbostic {
69132332Sbostic register char *p, *s;
69232332Sbostic static char newpat[BUFSIZE];
69332332Sbostic
69432332Sbostic for (s = newpat, p = pattern; *p != EOS;) {
69532332Sbostic if (*p == '\\') { /* skip escapes ... */
69632332Sbostic *s++ = *p++;
69732332Sbostic if (*p)
69832332Sbostic *s++ = *p++;
69932332Sbostic } else if (*p == '[') { /* ... and char classes */
70032332Sbostic while (*p != EOS && *p != ']')
70132332Sbostic *s++ = *p++;
70232332Sbostic } else if (strchr("+?|()", *p) != NULL) {
70332332Sbostic *s++ = '\\'; /* insert protection */
70432332Sbostic *s++ = *p++;
70532332Sbostic } else
70632332Sbostic *s++ = *p++;
70732332Sbostic }
70832332Sbostic *s = EOS;
70932332Sbostic grepflag = ((patind) ? 0 : 1);
71032332Sbostic return (newpat);
71132332Sbostic }
71232332Sbostic
71332332Sbostic /*
71432332Sbostic * Test for simple alternation. Result is NULL if it's not so simple, or is
71532332Sbostic * a pointer to the first string if it is. Warning: sscanf size is a
71632332Sbostic * fixpoint, beyond which the speedup linearity starts to break down. In the
71732332Sbostic * wake of the elegant aho/corrasick "trie"-based fgrep, generalizing
71832332Sbostic * altpat[] to arbitrary size is not useful.
71932332Sbostic */
72032332Sbostic char *
alternate(regexpr)72132332Sbostic alternate(regexpr)
72232332Sbostic char *regexpr;
72332332Sbostic {
72432339Sbostic register int i, j;
72532339Sbostic register char *start, *stop;
72632332Sbostic unsigned char c;
72732332Sbostic
72832332Sbostic if (fgrepflag && strchr(regexpr, '|'))
72932339Sbostic return (NULL);
73032332Sbostic
73132339Sbostic /*
73232339Sbostic * break pattern up into altpat array; delimit on newline, bar,
73332339Sbostic * or EOS. We know we won't overflow, we've already checked the
73432339Sbostic * number of patterns we're going to find against NALT.
73532339Sbostic * Also, set length of pattern and find minimum pattern length.
73632339Sbostic */
73732339Sbostic nalt = 0;
73832339Sbostic altmin = NMUSH;
73932339Sbostic for (start = stop = regexpr;; ++stop)
74032339Sbostic if (!*stop || *stop == '|' || *stop == NL) {
74132339Sbostic altlen[nalt] = j = stop - start;
74232339Sbostic if (j < altmin)
74332339Sbostic altmin = j;
74432339Sbostic if (!(altpat[nalt] = malloc((u_int)(j + 1))))
74532339Sbostic oops("out of memory");
74632339Sbostic bcopy(start, altpat[nalt], j);
74732339Sbostic altpat[nalt][j] = EOS;
74832362Sbostic ++nalt;
74932339Sbostic if (!*stop)
75032339Sbostic break;
75132362Sbostic if (nalt == NALT)
75232339Sbostic return(NULL);
75332339Sbostic if (*stop == NL)
75432339Sbostic *stop = '|';
75532339Sbostic start = stop + 1;
75632339Sbostic }
75732332Sbostic if (!fgrepflag) {
75832332Sbostic if (strchr(regexpr, '|') == NULL || regexpr[0] == '|')
75932332Sbostic return (NULL);
76032332Sbostic if (strpbrk(regexpr, "^$.[]()?+*\\") != NULL
76132332Sbostic || strindex(regexpr, "||") >= 0)
76232332Sbostic return (NULL);
76332332Sbostic }
76432332Sbostic
76532332Sbostic if (nalt > 1) { /* build superimposed "pre-match" sets per
76632332Sbostic * char */
76732332Sbostic altflag++;
76832332Sbostic for (j = 0; j < nalt; j++)
76932332Sbostic for (i = 0; i < altmin; i++) {
77032332Sbostic c = altpat[j][altlen[j] - altmin + i];
77132332Sbostic altset[i + 1][c] = 1; /* offset for sentinel */
77232332Sbostic }
77332332Sbostic }
77432332Sbostic return (altpat[0]);
77532332Sbostic }
77632332Sbostic
77732332Sbostic /*
77832332Sbostic * Grapple with the dfa (std egrep) vs. ndfa (regexp) tradeoff. Criteria to
77932332Sbostic * determine whether to use dfa-based egrep: We do FIRSTFEW matches with
78032332Sbostic * regexec(). If Boyer-Moore up to now matched more than PUNTPERCENT
78132332Sbostic * of the input, the r.e. is likely to be underspecified, so do old *grep,
78232332Sbostic * which is faster on complex patterns than regexp(). At FIRSTFEW,
78332332Sbostic * dump the saved matches collected by savematch(). They are saved
78432332Sbostic * so that a "PUNT" can "rewind" to ignore them. Stdin is problematic,
78532332Sbostic * since it's hard to rewind.
78632332Sbostic */
78732332Sbostic
execstrategy(file)78832332Sbostic execstrategy(file)
78932332Sbostic char *file;
79032332Sbostic {
79132332Sbostic int pctmatch;
79232332Sbostic
79332332Sbostic pctmatch = (100 * rxcount) / incount;
79432332Sbostic if (pctmatch > PUNTPERCENT && file != NULL)
79532332Sbostic kernighan(args);
79632332Sbostic if (file != NULL)
79732332Sbostic flushmatches();
79832332Sbostic }
79932332Sbostic
nlcount(bstart,bstop)80032332Sbostic nlcount(bstart, bstop) /* flail interval to totalize newlines. */
80132332Sbostic char *bstart, *bstop;
80232332Sbostic {
80332332Sbostic register char *s = bstart;
80432332Sbostic register char *t = bstop;
80532332Sbostic register int count = 0;
80632332Sbostic
80732332Sbostic do { /* loop unroll for older architectures */
80832332Sbostic if (*t == NL) /* ... ask ames!jaw for sample code */
80932332Sbostic count++;
81032332Sbostic } while (t-- > s);
81132332Sbostic
81232332Sbostic return (count);
81332332Sbostic }
81432332Sbostic
81532332Sbostic char *
isolate(regexpr)81632332Sbostic isolate(regexpr) /* isolate longest metacharacter-free string */
81732332Sbostic char *regexpr;
81832332Sbostic {
81932332Sbostic char *dummyexpr;
82032332Sbostic
82132332Sbostic /*
82232332Sbostic * We add (.)* because Henry's regcomp only figures regmust if it
82332332Sbostic * sees a leading * pattern. Foo!
82432332Sbostic */
82532332Sbostic dummyexpr = malloc((unsigned) strlen(regexpr) + 5);
82632502Sbostic (void)sprintf(dummyexpr, "(.)*%s", regexpr);
82732332Sbostic if ((rspencer = regcomp(dummyexpr)) == NULL)
82832332Sbostic kernighan(args);
82932332Sbostic return (rspencer->regmust);
83032332Sbostic }
83132332Sbostic
83232332Sbostic char *matches[FIRSTFEW];
83332332Sbostic static int mcount = 0;
83432332Sbostic
83532332Sbostic char *
savematch(s)83632332Sbostic savematch(s) /* horde matches during statistics gathering */
83732332Sbostic register char *s;
83832332Sbostic {
83932332Sbostic char *p;
84032332Sbostic char *start = s;
84132332Sbostic int msize = 0;
84232332Sbostic int psize = strlen(preamble);
84332332Sbostic
84432332Sbostic while (*s++ != NL)
84532332Sbostic msize++;
84632332Sbostic *--s = EOS;
84732332Sbostic
84832332Sbostic p = malloc((unsigned) msize + 1 + psize);
84932332Sbostic strcpy(p, preamble);
85032332Sbostic strcpy(p + psize, start);
85132332Sbostic matches[mcount++] = p;
85232332Sbostic
85332332Sbostic preamble[0] = 0;
85432332Sbostic *s = NL;
85532332Sbostic return (s);
85632332Sbostic }
85732332Sbostic
flushmatches()85832332Sbostic flushmatches()
85932332Sbostic {
86032332Sbostic int n;
86132332Sbostic
86232332Sbostic flushflag = 1;
86332332Sbostic for (n = 0; n < mcount; n++)
86432332Sbostic printf("%s\n", matches[n]);
86532332Sbostic mcount = 0;
86632332Sbostic }
86732332Sbostic
oops(message)86832332Sbostic oops(message)
86932332Sbostic char *message;
87032332Sbostic {
87132332Sbostic fprintf(stderr, "%s: %s\n", progname, message);
87232332Sbostic exit(2);
87332332Sbostic }
87432332Sbostic
kernighan(args)87532332Sbostic kernighan(args) /* "let others do the hard part ..." */
87632332Sbostic char *args[];
87732332Sbostic {
87832332Sbostic /*
87932332Sbostic * We may have already run grep on some of the files; remove them
88032332Sbostic * from the arg list we pass on. Note that we can't delete them
88132332Sbostic * totally because the number of file names affects the output
88232332Sbostic * (automatic -h).
88332332Sbostic */
88432332Sbostic /* better would be fork/exec per punted file -- jaw */
88532332Sbostic
88632332Sbostic while (firstfile && optind > firstfile)
88737914Sbostic args[firstfile++] = _PATH_DEVNULL;
88832332Sbostic if (patind)
88932332Sbostic args[patind] = pattern;
89032339Sbostic (void) fflush(stdout);
89132332Sbostic
89232332Sbostic if (grepflag)
89337914Sbostic execvp(_PATH_GREPSTD, args), oops("can't exec old 'grep'");
89432332Sbostic else if (fgrepflag)
89537914Sbostic execvp(_PATH_FGREPSTD, args), oops("can't exec old 'fgrep'");
89632332Sbostic else
89737914Sbostic execvp(_PATH_EGREPSTD, args), oops("can't exec old 'egrep'");
89832332Sbostic }
899