132332Sbostic #ifndef lint 2*35692Sbostic static char sccsid[] = "@(#)egrep.c 5.11 (Berkeley) 09/22/88"; 332332Sbostic #endif not lint 432332Sbostic 532332Sbostic /* 632332Sbostic Hybrid Boyer/Moore/Gosper-assisted 'grep/egrep/fgrep' search, with delta0 732332Sbostic table as in original paper (CACM, October, 1977). No delta1 or delta2. 832332Sbostic According to experiment (Horspool, Soft. Prac. Exp., 1982), delta2 is of 932332Sbostic minimal practical value. However, to improve for worst case input, 1032332Sbostic integrating the improved Galil strategies (Apostolico/Giancarlo, SIAM. J. 1132332Sbostic Comput., Feb. 1986) deserves consideration. 1232332Sbostic 1332332Sbostic Method: extract longest metacharacter-free string from expression. 1432332Sbostic this is done using a side-effect from henry spencer's regcomp(). 1532332Sbostic use boyer-moore to match such, then pass submatching lines 1632332Sbostic to either regexp() or standard 'egrep', depending on certain 1732332Sbostic criteria within execstrategy() below. [this tradeoff is due 1832332Sbostic to the general slowness of the regexp() nondeterministic 1932332Sbostic machine on complex expressions, as well as the startup time 2032332Sbostic of standard 'egrep' on short files.] alternatively, one may 2132332Sbostic change the vendor-supplied 'egrep' automaton to include 2232332Sbostic boyer-moore directly. see accompanying writeup for discussion 2332332Sbostic of kanji expression treatment. 2432332Sbostic 2532332Sbostic late addition: apply trickbag for fast match of simple 2632332Sbostic alternations (sublinear, in common low-cardinality cases). 2732332Sbostic trap fgrep into this lair. 2832332Sbostic 2932332Sbostic gnu additions: -f, newline as |, \< and \> [in regexec()], more 3032332Sbostic comments. inspire better dfa exec() strategy. 3132332Sbostic serious testing and help with special cases. 3232332Sbostic 3332332Sbostic Algorithm amalgam summary: 3432332Sbostic 3532332Sbostic dfa e?grep (aho/thompson) 3632332Sbostic ndfa regexp() (spencer/aho) 3732332Sbostic bmg (boyer/moore/gosper) 3832332Sbostic "superimposed" bmg (jaw) 3932332Sbostic fgrep (aho/corrasick) 4032332Sbostic 4132332Sbostic sorry, but the knuth/morris/pratt machine, horspool's 4232332Sbostic "frequentist" code, and the rabin/karp matcher, however cute, 4332332Sbostic just don't cut it for this production. 4432332Sbostic 4532332Sbostic James A. Woods Copyright (c) 1986 4632332Sbostic NASA Ames Research Center 4732332Sbostic */ 4832332Sbostic #include <stdio.h> 4932332Sbostic #include <ctype.h> 5032332Sbostic #include <sys/types.h> 5132332Sbostic #include <sys/stat.h> 5232337Sbostic #include <sys/file.h> 5332332Sbostic #include <regexp.h> /* must be henry spencer's version */ 5432332Sbostic 5532332Sbostic #define MIN(A, B) ((A) > (B) ? (B) : (A)) 5632332Sbostic 5732332Sbostic #ifdef SLOWSYS 5832332Sbostic #define read xread 5932332Sbostic #endif 6032332Sbostic /* 6132332Sbostic * define existing [ef]?grep program locations below for use by execvp(). 6232332Sbostic * [execlp() would be used were it not for the possibility of 6332332Sbostic * installation-dependent recursion.] 6432332Sbostic */ 6532332Sbostic #ifndef EGREPSTD 6632332Sbostic #define EGREPSTD "/usr/bin/egrep" 6732332Sbostic #endif 6832332Sbostic #ifndef GREPSTD 6932332Sbostic #define GREPSTD "/bin/grep" 7032332Sbostic #endif 7132332Sbostic #ifndef FGREPSTD 7232332Sbostic #define FGREPSTD "/usr/bin/fgrep" 7332332Sbostic #endif 7432332Sbostic 7532332Sbostic #define BUFSIZE 8192 /* make higher for cray */ 7632332Sbostic #define PATSIZE 6000 7732332Sbostic #define LARGE BUFSIZE + PATSIZE 7832332Sbostic 7932332Sbostic #define NALT 7 /* tied to scanf() size in alternate() */ 8032332Sbostic #define NMUSH 6 /* loosely relates to expected alt length */ 8132332Sbostic 8232332Sbostic #define FIRSTFEW 33 /* Always do FIRSTFEW matches with regexec() */ 8332332Sbostic #define PUNTPERCENT 10 /* After FIRSTFEW, if PUNTPERCENT of the input 8432332Sbostic * was processed by regexp(), exec std egrep. */ 8532332Sbostic #define NL '\n' 8632332Sbostic #define EOS '\0' 8732332Sbostic #define NONASCII 0200 /* Bit mask for Kanji non-ascii chars */ 8832332Sbostic #define META "\n^$.[]()?+*|\\" /* egrep meta-characters */ 8932332Sbostic #define SS2 '\216' /* EUC Katakana (or Chinese2) prefix */ 9032332Sbostic #define SS3 '\217' /* EUC Kanji2 (or Chinese3) prefix */ 9132332Sbostic 9232332Sbostic extern char *optarg; 9332332Sbostic extern int optind; 9432332Sbostic char *progname; 9532332Sbostic 9632332Sbostic int cflag, iflag, eflag, fflag, lflag, nflag; /* SVID flags */ 9732332Sbostic int sflag, hflag; /* v7, v8, bsd */ 9832332Sbostic 9932332Sbostic int firstflag; /* Stop at first match */ 10032332Sbostic int grepflag; /* Called as "grep" */ 10132332Sbostic int fgrepflag; /* Called as "fgrep" */ 10232332Sbostic int altflag; /* Simple alternation in pattern */ 10332332Sbostic int boyonly; /* No regexp needed -- all simple */ 10432332Sbostic int flushflag; 10532332Sbostic int grepold, egrepold, fgrepold; 10632332Sbostic 10732332Sbostic int nalt; /* Number of alternatives */ 10832332Sbostic int nsuccess; /* 1 for match, 2 for error */ 10932332Sbostic int altmin; /* Minimum length of all the alternate 11032332Sbostic * strings */ 11132332Sbostic int firstfile; /* argv index of first file argument */ 11232332Sbostic int patind; /* argv index of pattern */ 11332332Sbostic long nmatch; /* Number of matches in this file */ 11432332Sbostic long incount, counted; /* Amount of input consumed */ 11532332Sbostic long rxcount; /* Bytes of input processed by regexec() */ 11632332Sbostic int boyfound; /* accumulated partial matches (tripped by 11732332Sbostic * FIRSTFEW) */ 11832332Sbostic int prevmatch; /* next three lines aid fast -n */ 11932332Sbostic long nline, prevnline; 12032332Sbostic char *prevloc; 12132332Sbostic 12232332Sbostic regexp *rspencer; 12332332Sbostic char *pattern; 12432332Sbostic char *patboy; /* Pattern for simple Boyer-Moore */ 12532332Sbostic char *patfile; /* Filename containing pattern(s) */ 12632332Sbostic 12732332Sbostic int delta0[256]; /* Boyer-Moore algorithm core */ 12832332Sbostic char cmap[256]; /* Usually 0-255, but if -i, maps upper to 12932332Sbostic * lower case */ 13032332Sbostic char str[BUFSIZE + 2]; 13132332Sbostic int nleftover; 13232332Sbostic char linetemp[BUFSIZE]; 13332339Sbostic char *altpat[NALT]; /* alternation component storage */ 13432332Sbostic int altlen[NALT]; 13532332Sbostic short altset[NMUSH + 1][256]; 13632332Sbostic char preamble[200]; /* match prefix (filename, line no.) */ 13732332Sbostic 13832332Sbostic int fd; 13932332Sbostic char * 14032502Sbostic strchr(), *strrchr(), *strcpy(), *strncpy(), *strpbrk(), *malloc(); 14132332Sbostic char * 14232332Sbostic grepxlat(), *fold(), *pfile(), *alternate(), *isolate(); 143*35692Sbostic char *gotamatch(), *kanji(), *linesave(), *submatch(); 14432332Sbostic char **args; 14532332Sbostic 14632332Sbostic main(argc, argv) 14732332Sbostic int argc; 14832332Sbostic char *argv[]; 14932332Sbostic { 15032365Sbostic int c, oflag; 15132332Sbostic int errflag = 0; 15232332Sbostic 15332332Sbostic args = argv; 15432332Sbostic 15532332Sbostic if ((progname = strrchr(argv[0], '/')) != 0) 15632332Sbostic progname++; 15732332Sbostic else 15832332Sbostic progname = argv[0]; 15932332Sbostic if (strcmp(progname, "grep") == 0) 16032332Sbostic grepflag++; 16132365Sbostic else if (strcmp(progname, "fgrep") == 0) 16232332Sbostic fgrepflag++; 16332332Sbostic 16432365Sbostic oflag = 0; 16532365Sbostic while ((c = getopt(argc, argv, "bchie:f:lnosvwxy1")) != EOF) { 16632332Sbostic switch (c) { 16732332Sbostic 16832332Sbostic case 'f': 16932332Sbostic fflag++; 17032332Sbostic patfile = optarg; 17132332Sbostic continue; 17232332Sbostic case 'b': 17332332Sbostic case 'v': 17432332Sbostic egrepold++; /* boyer-moore of little help here */ 17532332Sbostic continue; 17632332Sbostic case 'c': 17732332Sbostic cflag++; 17832332Sbostic continue; 17932332Sbostic case 'e': 18032332Sbostic eflag++; 18132332Sbostic pattern = optarg; 18232332Sbostic continue; 18332332Sbostic case 'h': 18432332Sbostic hflag++; 18532332Sbostic continue; 18632365Sbostic case 'o': 18732365Sbostic oflag++; 18832365Sbostic continue; 18932332Sbostic case '1': /* Stop at very first match */ 19032332Sbostic firstflag++; /* spead freaks only */ 19132332Sbostic continue; 19232332Sbostic case 'i': 19332332Sbostic iflag++; 19432332Sbostic continue; 19532332Sbostic case 'l': 19632332Sbostic lflag++; 19732332Sbostic continue; 19832332Sbostic case 'n': 19932332Sbostic nflag++; 20032332Sbostic continue; 20132332Sbostic case 's': 20232332Sbostic sflag++; 20332332Sbostic continue; 20432332Sbostic case 'w': 20532332Sbostic case 'y': 20632332Sbostic if (!grepflag) 20732332Sbostic errflag++; 20832332Sbostic grepold++; 20932332Sbostic continue; 21032332Sbostic case 'x': /* needs more work, like -b above */ 21132332Sbostic if (!fgrepflag) 21232332Sbostic errflag++; 21332332Sbostic fgrepold++; 21432332Sbostic continue; 21532332Sbostic case '?': 21632332Sbostic errflag++; 21732332Sbostic } 21832332Sbostic } 21932332Sbostic if (errflag || ((argc <= optind) && !fflag && !eflag)) { 22032332Sbostic if (grepflag) 22132365Sbostic oops("usage: grep [-bchilnosvwy] [-e] pattern [file ...]"); 22232332Sbostic else if (fgrepflag) 22332366Sbostic oops("usage: fgrep [-bchilnosvx] {-f patfile | [-e] strings} [file ...]"); 22432332Sbostic else /* encourage SVID options, though we provide 22532332Sbostic * others */ 22632366Sbostic oops("usage: egrep [-bchilnosv] {-f patfile | [-e] pattern} [file ...]"); 22732332Sbostic } 22832332Sbostic if (fflag) 22934917Sbostic pattern = pfile(patfile); 23034917Sbostic else if (!eflag) { 23134917Sbostic patind = optind; 23232332Sbostic pattern = argv[optind++]; 23334917Sbostic } 23432332Sbostic 23532365Sbostic if (!oflag && (argc - optind) <= 1) /* Filename invisible given < 2 files */ 23632332Sbostic hflag++; 23732332Sbostic if (pattern[0] == EOS) 23832332Sbostic kernighan(argv); /* same as it ever was */ 23932332Sbostic /* 24032332Sbostic * 'grep/egrep' merger -- "old" grep is called to handle: tagged 24132332Sbostic * exprs \( \), word matches \< and \>, -w and -y options, char 24232332Sbostic * classes with '-' at end (egrep bug?), and patterns beginning with 24332332Sbostic * an asterisk (don't ask why). otherwise, characters meaningful to 24432332Sbostic * 'egrep' but not to 'grep' are escaped; the entire expr is then 24532332Sbostic * passed to 'egrep'. 24632332Sbostic */ 24732332Sbostic if (grepflag && !grepold) { 24832332Sbostic if (strindex(pattern, "\\(") >= 0 || 24932332Sbostic strindex(pattern, "\\<") >= 0 || 25032332Sbostic strindex(pattern, "\\>") >= 0 || 25132332Sbostic strindex(pattern, "-]") >= 0 || 25232332Sbostic pattern[0] == '*') /* grep bug */ 25332332Sbostic grepold++; 25432332Sbostic else 25532332Sbostic pattern = grepxlat(pattern); 25632332Sbostic } 25732332Sbostic if (grepold || egrepold || fgrepold) 25832332Sbostic kernighan(argv); 25932332Sbostic 26032332Sbostic if (iflag) 26132332Sbostic strcpy(pattern, fold(pattern)); 26232332Sbostic /* 26332332Sbostic * If the pattern is a plain string, just run boyer-moore. If it 26432332Sbostic * consists of meta-free alternatives, run "superimposed" bmg. 26532332Sbostic * Otherwise, find best string, and compile pattern for regexec(). 26632332Sbostic */ 26732332Sbostic if (strpbrk(pattern, META) == NULL) { /* do boyer-moore only */ 26832332Sbostic boyonly++; 26932332Sbostic patboy = pattern; 27032332Sbostic } else { 27132332Sbostic if ((patboy = alternate(pattern)) != NULL) 27232332Sbostic boyonly++; 27332332Sbostic else { 27432332Sbostic if ((patboy = isolate(pattern)) == NULL) 27532332Sbostic kernighan(argv); /* expr too involved */ 27632332Sbostic #ifndef NOKANJI 27732332Sbostic for (c = 0; pattern[c] != EOS; c++) 27832332Sbostic if (pattern[c] & NONASCII) /* kanji + meta */ 27932332Sbostic kernighan(argv); 28032332Sbostic #endif 28132332Sbostic if ((rspencer = regcomp(pattern)) == NULL) 28232332Sbostic oops("regcomp failure"); 28332332Sbostic } 28432332Sbostic } 28532332Sbostic gosper(patboy); /* "pre-conditioning is wonderful" 28632332Sbostic * -- v. strassen */ 28732332Sbostic 28832332Sbostic if ((firstfile = optind) >= argc) { 28932332Sbostic /* Grep standard input */ 29032332Sbostic if (lflag) /* We don't know its name! */ 29132332Sbostic exit(1); 29232332Sbostic egsecute((char *) NULL); 29332332Sbostic } else { 29432332Sbostic while (optind < argc) { 29532332Sbostic egsecute(argv[optind]); 29632332Sbostic optind++; 29732332Sbostic if (firstflag && (nsuccess == 1)) 29832332Sbostic break; 29932332Sbostic } 30032332Sbostic } 30132332Sbostic exit((nsuccess == 2) ? 2 : (nsuccess == 0)); 30232332Sbostic } 30332332Sbostic 30432332Sbostic char * 30532332Sbostic pfile(pfname) /* absorb expression from file */ 30632332Sbostic char *pfname; 30732332Sbostic { 30832337Sbostic int fd; 30932332Sbostic struct stat patstat; 31032332Sbostic static char *pat; 31132332Sbostic 31232337Sbostic if ((fd = open(pfname, O_RDONLY, 0)) < 0) 31332332Sbostic oops("can't read pattern file"); 31432337Sbostic if (fstat(fd, &patstat) != 0) 31532332Sbostic oops("can't stat pattern file"); 31632332Sbostic if (patstat.st_size > PATSIZE) { 31732332Sbostic if (fgrepflag) { /* defer to unix version */ 31832332Sbostic fgrepold++; 31932332Sbostic return "dummy"; 32032332Sbostic } else 32132332Sbostic oops("pattern file too big"); 32232332Sbostic } 32332332Sbostic if ((pat = malloc((unsigned) patstat.st_size + 1)) == NULL) 32432332Sbostic oops("out of memory to read pattern file"); 32532337Sbostic if (patstat.st_size != read(fd, pat, (int)patstat.st_size)) 32632332Sbostic oops("error reading pattern file"); 32732337Sbostic (void) close(fd); 32832332Sbostic 32932332Sbostic pat[patstat.st_size] = EOS; 33032332Sbostic if (pat[patstat.st_size - 1] == NL) /* NOP for egrep; helps grep */ 33132332Sbostic pat[patstat.st_size - 1] = EOS; 33232332Sbostic 33332332Sbostic if (nlcount(pat, &pat[patstat.st_size]) > NALT) { 33432332Sbostic if (fgrepflag) 33532332Sbostic fgrepold++; /* "what's it all about, alfie?" */ 33632332Sbostic else 33732332Sbostic egrepold++; 33832332Sbostic } 33932332Sbostic return (pat); 34032332Sbostic } 34132332Sbostic 34232332Sbostic egsecute(file) 34332332Sbostic char *file; 34432332Sbostic { 34532332Sbostic if (file == NULL) 34632332Sbostic fd = 0; 34732339Sbostic else if ((fd = open(file, O_RDONLY, 0)) <= 0) { 34832332Sbostic fprintf(stderr, "%s: can't open %s\n", progname, file); 34932332Sbostic nsuccess = 2; 35032332Sbostic return; 35132332Sbostic } 35232332Sbostic chimaera(file, patboy); 35332332Sbostic 35432332Sbostic if (!boyonly && !flushflag && file != NULL) 35532332Sbostic flushmatches(); 35632332Sbostic if (file != NULL) 35732332Sbostic close(fd); 35832332Sbostic } 35932332Sbostic 36032332Sbostic chimaera(file, pat) /* "reach out and boyer-moore search someone" */ 36132332Sbostic char *file, *pat; /* -- soon-to-be-popular bumper sticker */ 36232332Sbostic { 36332332Sbostic register char *k, *strend, *s; 36432332Sbostic register int j, count; 36532332Sbostic register int *deltazero = delta0; 36632332Sbostic int patlen = altmin; 36732332Sbostic char *t; 36832332Sbostic 36932332Sbostic nleftover = boyfound = flushflag = 0; 37032332Sbostic nline = 1L; 37132332Sbostic prevmatch = 0; 37232332Sbostic nmatch = counted = rxcount = 0L; 37332332Sbostic 37432332Sbostic while ((count = read(fd, str + nleftover, BUFSIZE - nleftover)) > 0) { 37532332Sbostic 37632332Sbostic counted += count; 37732332Sbostic strend = linesave(str, count); 37832332Sbostic 37932332Sbostic for (k = str + patlen - 1; k < strend;) { 38032332Sbostic /* 38132332Sbostic * for a large class of patterns, upwards of 80% of 38232332Sbostic * match time is spent on the next line. we beat 38332332Sbostic * existing microcode (vax 'matchc') this way. 38432332Sbostic */ 38532332Sbostic while ((k += deltazero[*(unsigned char *) k]) < strend); 38632332Sbostic if (k < (str + LARGE)) 38732332Sbostic break; 38832332Sbostic k -= LARGE; 38932332Sbostic 39032332Sbostic if (altflag) { 39132332Sbostic /* 39232332Sbostic * Parallel Boyer-Moore. Check whether each 39332332Sbostic * of the previous <altmin> chars COULD be 39432332Sbostic * from one of the alternative strings. 39532332Sbostic */ 39632332Sbostic s = k - 1; 39732332Sbostic j = altmin; 39832332Sbostic while (altset[--j][(unsigned char) 39932332Sbostic cmap[*(unsigned char *) s--]]); 40032332Sbostic /* 40132332Sbostic * quick test fails. in this life, compare 40232332Sbostic * 'em all. but, a "reverse trie" would 40332332Sbostic * attenuate worst case (linear w/delta2?). 40432332Sbostic */ 40532332Sbostic if (--j < 0) { 40632332Sbostic count = nalt - 1; 40732332Sbostic do { 40832332Sbostic s = k; 40932332Sbostic j = altlen[count]; 41032332Sbostic t = altpat[count]; 41132332Sbostic 41232332Sbostic while 41332332Sbostic (cmap[*(unsigned char *) s--] 41432332Sbostic == t[--j]); 41532332Sbostic if (j < 0) 41632332Sbostic break; 41732332Sbostic } 41832332Sbostic while (count--); 41932332Sbostic } 42032332Sbostic } else { 42132332Sbostic /* One string -- check it */ 42232332Sbostic j = patlen - 1; 42332332Sbostic s = k - 1; 42432332Sbostic while (cmap[*(unsigned char *) s--] == pat[--j]); 42532332Sbostic } 42632332Sbostic /* 42732332Sbostic * delta-less shortcut for literati. short shrift for 42832332Sbostic * genetic engineers? 42932332Sbostic */ 43032332Sbostic if (j >= 0) { 43132332Sbostic k++; /* no match; restart next char */ 43232332Sbostic continue; 43332332Sbostic } 43432332Sbostic k = submatch(file, pat, str, strend, k, count); 43532332Sbostic if (k == NULL) 43632332Sbostic return; 43732332Sbostic } 43832332Sbostic if (nflag) { 43932332Sbostic if (prevmatch) 44032332Sbostic nline = prevnline + nlcount(prevloc, k); 44132332Sbostic else 44232332Sbostic nline = nline + nlcount(str, k); 44332332Sbostic prevmatch = 0; 44432332Sbostic } 44532332Sbostic strncpy(str, linetemp, nleftover); 44632332Sbostic } 44732332Sbostic if (cflag) { 44832332Sbostic /* Bug from old grep: -c overrides -h. We fix the bug. */ 44932332Sbostic if (!hflag) 45032332Sbostic printf("%s:", file); 45132332Sbostic printf("%ld\n", nmatch); 45232332Sbostic } 45332332Sbostic } 45432332Sbostic 45532332Sbostic char * 45632332Sbostic linesave(str, count) /* accumulate partial line at end of buffer */ 45732332Sbostic char str[]; 45832332Sbostic register int count; 45932332Sbostic { 46032332Sbostic register int j; 46132332Sbostic 46232332Sbostic count += nleftover; 46332332Sbostic if (count != BUFSIZE && fd != 0) 46432332Sbostic str[count++] = NL; /* insurance for broken last line */ 46532332Sbostic str[count] = EOS; 46632332Sbostic for (j = count - 1; str[j] != NL && j >= 0;) 46732332Sbostic j--; 46832332Sbostic /* 46932332Sbostic * break up these lines: long line (> BUFSIZE), last line of file, or 47032332Sbostic * short return from read(), as from tee(1) input 47132332Sbostic */ 47232332Sbostic if (j < 0 && (count == (BUFSIZE - nleftover))) { 47332332Sbostic str[count++] = NL; 47432332Sbostic str[count] = EOS; 47532332Sbostic linetemp[0] = EOS; 47632332Sbostic nleftover = 0; 47732332Sbostic return (str + count); 47832332Sbostic } else { 47932332Sbostic nleftover = count - j - 1; 48032332Sbostic strncpy(linetemp, str + j + 1, nleftover); 48132332Sbostic return (str + j); 48232332Sbostic } 48332332Sbostic } 48432332Sbostic 48532332Sbostic /* 48632332Sbostic * Process partial match. First check for mis-aligned Kanji, then match line 48732332Sbostic * against full compiled r.e. if statistics do not warrant handing off to 48832332Sbostic * standard egrep. 48932332Sbostic */ 49032332Sbostic char * 49132332Sbostic submatch(file, pat, str, strend, k, altindex) 49232332Sbostic char file[], pat[], str[]; 49332332Sbostic register char *strend, *k; 49432332Sbostic int altindex; 49532332Sbostic { 49632332Sbostic register char *s; 49732332Sbostic char *t, c; 49832332Sbostic 49932332Sbostic t = k; 50032332Sbostic s = ((altflag) ? k - altlen[altindex] + 1 : k - altmin + 1); 50132332Sbostic #ifndef NOKANJI 50232332Sbostic c = ((altflag) ? altpat[altindex][0] : pat[0]); 50332332Sbostic if (c & NONASCII) 50432332Sbostic if ((s = kanji(str, s, k)) == NULL) 50532332Sbostic return (++k); /* reject false kanji */ 50632332Sbostic #endif 50732332Sbostic do; 50832332Sbostic while (*s != NL && --s >= str); 50932332Sbostic k = s + 1; /* now at line start */ 51032332Sbostic 51132332Sbostic if (boyonly) 51232332Sbostic return (gotamatch(file, k)); 51332332Sbostic 51432332Sbostic incount = counted - (strend - k); 51532332Sbostic if (boyfound++ == FIRSTFEW) 51632332Sbostic execstrategy(file); 51732332Sbostic 51832332Sbostic s = t; 51932332Sbostic do 52032332Sbostic rxcount++; 52132332Sbostic while (*s++ != NL); 52232332Sbostic *--s = EOS; 52332332Sbostic /* 52432332Sbostic * "quick henry -- the flit" (after theodor geisel) 52532332Sbostic */ 52632332Sbostic if (regexec(rspencer, ((iflag) ? fold(k) : k)) == 1) { 52732332Sbostic *s = NL; 52832332Sbostic if (gotamatch(file, k) == NULL) 52932332Sbostic return (NULL); 53032332Sbostic } 53132332Sbostic *s = NL; 53232332Sbostic return (s + 1); 53332332Sbostic } 53432332Sbostic 53532338Sbostic #ifndef NOKANJI 53632332Sbostic /* 53732332Sbostic * EUC code disambiguation -- scan backwards to first 7-bit code, while 53832332Sbostic * counting intervening 8-bit codes. If odd, reject unaligned Kanji pattern. 53932332Sbostic * SS2/3 checks are for intermixed Japanase Katakana or Kanji2. 54032332Sbostic */ 54132332Sbostic char * 54232332Sbostic kanji(str, s, k) 54332332Sbostic register char *str, *s, *k; 54432332Sbostic { 54532332Sbostic register int j = 0; 54632332Sbostic 54732332Sbostic for (s--; s >= str; s--) { 54832332Sbostic if (*s == SS2 || *s == SS3 || (*s & NONASCII) == 0) 54932332Sbostic break; 55032332Sbostic j++; 55132332Sbostic } 55232332Sbostic #ifndef CHINESE 55332332Sbostic if (*s == SS2) 55432332Sbostic j -= 1; 55532332Sbostic #endif CHINESE 55632332Sbostic return ((j & 01) ? NULL : k); 55732332Sbostic } 55832338Sbostic #endif 55932332Sbostic 56032332Sbostic /* 56132332Sbostic * Compute "Boyer-Moore" delta table -- put skip distance in delta0[c] 56232332Sbostic */ 56332332Sbostic gosper(pattern) 56432332Sbostic char *pattern; /* ... HAKMEM lives ... */ 56532332Sbostic { 56632332Sbostic register int i, j; 56732332Sbostic unsigned char c; 56832332Sbostic 56932332Sbostic /* Make one-string case look like simple alternatives case */ 57032332Sbostic if (!altflag) { 57132332Sbostic nalt = 1; 57232332Sbostic altmin = altlen[0] = strlen(pattern); 57332339Sbostic altpat[0] = pattern; 57432332Sbostic } 57532332Sbostic /* For chars that aren't in any string, skip by string length. */ 57632332Sbostic for (j = 0; j < 256; j++) { 57732332Sbostic delta0[j] = altmin; 57832332Sbostic cmap[j] = j; /* Sneak in initialization of cmap */ 57932332Sbostic } 58032332Sbostic 58132332Sbostic /* For chars in a string, skip distance from char to end of string. */ 58232332Sbostic /* (If char appears more than once, skip minimum distance.) */ 58332332Sbostic for (i = 0; i < nalt; i++) 58432332Sbostic for (j = 0; j < altlen[i] - 1; j++) { 58532332Sbostic c = altpat[i][j]; 58632332Sbostic delta0[c] = MIN(delta0[c], altlen[i] - j - 1); 58732332Sbostic if (iflag && islower((int) c)) 58832332Sbostic delta0[toupper((int) c)] = delta0[c]; 58932332Sbostic } 59032332Sbostic 59132332Sbostic /* For last char of each string, fall out of search loop. */ 59232332Sbostic for (i = 0; i < nalt; i++) { 59332332Sbostic c = altpat[i][altlen[i] - 1]; 59432332Sbostic delta0[c] = LARGE; 59532332Sbostic if (iflag && islower((int) c)) 59632332Sbostic delta0[toupper((int) c)] = LARGE; 59732332Sbostic } 59832332Sbostic if (iflag) 59932332Sbostic for (j = 'A'; j <= 'Z'; j++) 60032332Sbostic cmap[j] = tolower((int) j); 60132332Sbostic } 60232332Sbostic 60332332Sbostic /* 60432332Sbostic * Print, count, or stop on full match. Result is either the location for 60532332Sbostic * continued search, or NULL to stop. 60632332Sbostic */ 60732332Sbostic char * 60832332Sbostic gotamatch(file, s) 60932332Sbostic register char *file, *s; 61032332Sbostic { 61132332Sbostic char *savematch(); 61232332Sbostic int squirrel = 0; /* nonzero to squirrel away FIRSTFEW matches */ 61332332Sbostic 61432332Sbostic nmatch++; 61532332Sbostic nsuccess = 1; 61632332Sbostic if (!boyonly && boyfound <= FIRSTFEW && file != NULL) 61732332Sbostic squirrel = 1; 61832332Sbostic 61932332Sbostic if (sflag) 62032332Sbostic return (NULL); /* -s usurps all flags (unlike some versions) */ 62132332Sbostic if (cflag) { /* -c overrides -l, we guess */ 62232332Sbostic do; 62332332Sbostic while (*s++ != NL); 62432332Sbostic } else if (lflag) { 62532332Sbostic puts(file); 62632332Sbostic return (NULL); 62732332Sbostic } else { 62832332Sbostic if (!hflag) 62932332Sbostic if (!squirrel) 63032332Sbostic printf("%s:", file); 63132332Sbostic else 63232502Sbostic (void)sprintf(preamble, "%s:", file); 63332332Sbostic if (nflag) { 63432332Sbostic if (prevmatch) 63532332Sbostic prevnline = prevnline + nlcount(prevloc, s); 63632332Sbostic else 63732332Sbostic prevnline = nline + nlcount(str, s); 63832332Sbostic prevmatch = 1; 63932332Sbostic 64032332Sbostic if (!squirrel) 64132332Sbostic printf("%ld:", prevnline); 64232332Sbostic else 64332502Sbostic (void)sprintf(preamble + strlen(preamble), 64432332Sbostic "%ld:", prevnline); 64532332Sbostic } 64632332Sbostic if (!squirrel) { 64732332Sbostic do 64832332Sbostic putchar(*s); 64932332Sbostic while (*s++ != NL); 65032332Sbostic } else 65132332Sbostic s = savematch(s); 65232332Sbostic 65332332Sbostic if (nflag) 65432332Sbostic prevloc = s - 1; 65532332Sbostic } 65632332Sbostic return ((firstflag && !cflag) ? NULL : s); 65732332Sbostic } 65832332Sbostic 65932332Sbostic char * 66032332Sbostic fold(line) 66132332Sbostic char *line; 66232332Sbostic { 66332332Sbostic static char fline[BUFSIZE]; 66432332Sbostic register char *s, *t = fline; 66532332Sbostic 66632332Sbostic for (s = line; *s != EOS; s++) 66732332Sbostic *t++ = (isupper((int) *s) ? (char) tolower((int) *s) : *s); 66832332Sbostic *t = EOS; 66932332Sbostic return (fline); 67032332Sbostic } 67132332Sbostic 67232332Sbostic strindex(s, t) /* the easy way, as in K&P, p. 192 */ 67332332Sbostic char *s, *t; 67432332Sbostic { 67532332Sbostic int i, n; 67632332Sbostic 67732332Sbostic n = strlen(t); 67832332Sbostic for (i = 0; s[i] != '\0'; i++) 67932332Sbostic if (strncmp(s + i, t, n) == 0) 68032332Sbostic return (i); 68132332Sbostic return (-1); 68232332Sbostic } 68332332Sbostic 68432332Sbostic char * 68532332Sbostic grepxlat(pattern) /* grep pattern meta conversion */ 68632332Sbostic char *pattern; 68732332Sbostic { 68832332Sbostic register char *p, *s; 68932332Sbostic static char newpat[BUFSIZE]; 69032332Sbostic 69132332Sbostic for (s = newpat, p = pattern; *p != EOS;) { 69232332Sbostic if (*p == '\\') { /* skip escapes ... */ 69332332Sbostic *s++ = *p++; 69432332Sbostic if (*p) 69532332Sbostic *s++ = *p++; 69632332Sbostic } else if (*p == '[') { /* ... and char classes */ 69732332Sbostic while (*p != EOS && *p != ']') 69832332Sbostic *s++ = *p++; 69932332Sbostic } else if (strchr("+?|()", *p) != NULL) { 70032332Sbostic *s++ = '\\'; /* insert protection */ 70132332Sbostic *s++ = *p++; 70232332Sbostic } else 70332332Sbostic *s++ = *p++; 70432332Sbostic } 70532332Sbostic *s = EOS; 70632332Sbostic grepflag = ((patind) ? 0 : 1); 70732332Sbostic return (newpat); 70832332Sbostic } 70932332Sbostic 71032332Sbostic /* 71132332Sbostic * Test for simple alternation. Result is NULL if it's not so simple, or is 71232332Sbostic * a pointer to the first string if it is. Warning: sscanf size is a 71332332Sbostic * fixpoint, beyond which the speedup linearity starts to break down. In the 71432332Sbostic * wake of the elegant aho/corrasick "trie"-based fgrep, generalizing 71532332Sbostic * altpat[] to arbitrary size is not useful. 71632332Sbostic */ 71732332Sbostic char * 71832332Sbostic alternate(regexpr) 71932332Sbostic char *regexpr; 72032332Sbostic { 72132339Sbostic register int i, j; 72232339Sbostic register char *start, *stop; 72332332Sbostic unsigned char c; 72432332Sbostic 72532332Sbostic if (fgrepflag && strchr(regexpr, '|')) 72632339Sbostic return (NULL); 72732332Sbostic 72832339Sbostic /* 72932339Sbostic * break pattern up into altpat array; delimit on newline, bar, 73032339Sbostic * or EOS. We know we won't overflow, we've already checked the 73132339Sbostic * number of patterns we're going to find against NALT. 73232339Sbostic * Also, set length of pattern and find minimum pattern length. 73332339Sbostic */ 73432339Sbostic nalt = 0; 73532339Sbostic altmin = NMUSH; 73632339Sbostic for (start = stop = regexpr;; ++stop) 73732339Sbostic if (!*stop || *stop == '|' || *stop == NL) { 73832339Sbostic altlen[nalt] = j = stop - start; 73932339Sbostic if (j < altmin) 74032339Sbostic altmin = j; 74132339Sbostic if (!(altpat[nalt] = malloc((u_int)(j + 1)))) 74232339Sbostic oops("out of memory"); 74332339Sbostic bcopy(start, altpat[nalt], j); 74432339Sbostic altpat[nalt][j] = EOS; 74532362Sbostic ++nalt; 74632339Sbostic if (!*stop) 74732339Sbostic break; 74832362Sbostic if (nalt == NALT) 74932339Sbostic return(NULL); 75032339Sbostic if (*stop == NL) 75132339Sbostic *stop = '|'; 75232339Sbostic start = stop + 1; 75332339Sbostic } 75432332Sbostic if (!fgrepflag) { 75532332Sbostic if (strchr(regexpr, '|') == NULL || regexpr[0] == '|') 75632332Sbostic return (NULL); 75732332Sbostic if (strpbrk(regexpr, "^$.[]()?+*\\") != NULL 75832332Sbostic || strindex(regexpr, "||") >= 0) 75932332Sbostic return (NULL); 76032332Sbostic } 76132332Sbostic 76232332Sbostic if (nalt > 1) { /* build superimposed "pre-match" sets per 76332332Sbostic * char */ 76432332Sbostic altflag++; 76532332Sbostic for (j = 0; j < nalt; j++) 76632332Sbostic for (i = 0; i < altmin; i++) { 76732332Sbostic c = altpat[j][altlen[j] - altmin + i]; 76832332Sbostic altset[i + 1][c] = 1; /* offset for sentinel */ 76932332Sbostic } 77032332Sbostic } 77132332Sbostic return (altpat[0]); 77232332Sbostic } 77332332Sbostic 77432332Sbostic /* 77532332Sbostic * Grapple with the dfa (std egrep) vs. ndfa (regexp) tradeoff. Criteria to 77632332Sbostic * determine whether to use dfa-based egrep: We do FIRSTFEW matches with 77732332Sbostic * regexec(). If Boyer-Moore up to now matched more than PUNTPERCENT 77832332Sbostic * of the input, the r.e. is likely to be underspecified, so do old *grep, 77932332Sbostic * which is faster on complex patterns than regexp(). At FIRSTFEW, 78032332Sbostic * dump the saved matches collected by savematch(). They are saved 78132332Sbostic * so that a "PUNT" can "rewind" to ignore them. Stdin is problematic, 78232332Sbostic * since it's hard to rewind. 78332332Sbostic */ 78432332Sbostic 78532332Sbostic execstrategy(file) 78632332Sbostic char *file; 78732332Sbostic { 78832332Sbostic int pctmatch; 78932332Sbostic 79032332Sbostic pctmatch = (100 * rxcount) / incount; 79132332Sbostic if (pctmatch > PUNTPERCENT && file != NULL) 79232332Sbostic kernighan(args); 79332332Sbostic if (file != NULL) 79432332Sbostic flushmatches(); 79532332Sbostic } 79632332Sbostic 79732332Sbostic nlcount(bstart, bstop) /* flail interval to totalize newlines. */ 79832332Sbostic char *bstart, *bstop; 79932332Sbostic { 80032332Sbostic register char *s = bstart; 80132332Sbostic register char *t = bstop; 80232332Sbostic register int count = 0; 80332332Sbostic 80432332Sbostic do { /* loop unroll for older architectures */ 80532332Sbostic if (*t == NL) /* ... ask ames!jaw for sample code */ 80632332Sbostic count++; 80732332Sbostic } while (t-- > s); 80832332Sbostic 80932332Sbostic return (count); 81032332Sbostic } 81132332Sbostic 81232332Sbostic char * 81332332Sbostic isolate(regexpr) /* isolate longest metacharacter-free string */ 81432332Sbostic char *regexpr; 81532332Sbostic { 81632332Sbostic char *dummyexpr; 81732332Sbostic 81832332Sbostic /* 81932332Sbostic * We add (.)* because Henry's regcomp only figures regmust if it 82032332Sbostic * sees a leading * pattern. Foo! 82132332Sbostic */ 82232332Sbostic dummyexpr = malloc((unsigned) strlen(regexpr) + 5); 82332502Sbostic (void)sprintf(dummyexpr, "(.)*%s", regexpr); 82432332Sbostic if ((rspencer = regcomp(dummyexpr)) == NULL) 82532332Sbostic kernighan(args); 82632332Sbostic return (rspencer->regmust); 82732332Sbostic } 82832332Sbostic 82932332Sbostic char *matches[FIRSTFEW]; 83032332Sbostic static int mcount = 0; 83132332Sbostic 83232332Sbostic char * 83332332Sbostic savematch(s) /* horde matches during statistics gathering */ 83432332Sbostic register char *s; 83532332Sbostic { 83632332Sbostic char *p; 83732332Sbostic char *start = s; 83832332Sbostic int msize = 0; 83932332Sbostic int psize = strlen(preamble); 84032332Sbostic 84132332Sbostic while (*s++ != NL) 84232332Sbostic msize++; 84332332Sbostic *--s = EOS; 84432332Sbostic 84532332Sbostic p = malloc((unsigned) msize + 1 + psize); 84632332Sbostic strcpy(p, preamble); 84732332Sbostic strcpy(p + psize, start); 84832332Sbostic matches[mcount++] = p; 84932332Sbostic 85032332Sbostic preamble[0] = 0; 85132332Sbostic *s = NL; 85232332Sbostic return (s); 85332332Sbostic } 85432332Sbostic 85532332Sbostic flushmatches() 85632332Sbostic { 85732332Sbostic int n; 85832332Sbostic 85932332Sbostic flushflag = 1; 86032332Sbostic for (n = 0; n < mcount; n++) 86132332Sbostic printf("%s\n", matches[n]); 86232332Sbostic mcount = 0; 86332332Sbostic } 86432332Sbostic 86532332Sbostic oops(message) 86632332Sbostic char *message; 86732332Sbostic { 86832332Sbostic fprintf(stderr, "%s: %s\n", progname, message); 86932332Sbostic exit(2); 87032332Sbostic } 87132332Sbostic 87232332Sbostic kernighan(args) /* "let others do the hard part ..." */ 87332332Sbostic char *args[]; 87432332Sbostic { 87532332Sbostic /* 87632332Sbostic * We may have already run grep on some of the files; remove them 87732332Sbostic * from the arg list we pass on. Note that we can't delete them 87832332Sbostic * totally because the number of file names affects the output 87932332Sbostic * (automatic -h). 88032332Sbostic */ 88132332Sbostic /* better would be fork/exec per punted file -- jaw */ 88232332Sbostic 88332332Sbostic while (firstfile && optind > firstfile) 88432332Sbostic args[firstfile++] = "/dev/null"; 88532332Sbostic if (patind) 88632332Sbostic args[patind] = pattern; 88732339Sbostic (void) fflush(stdout); 88832332Sbostic 88932332Sbostic if (grepflag) 89032332Sbostic execvp(GREPSTD, args), oops("can't exec old 'grep'"); 89132332Sbostic else if (fgrepflag) 89232332Sbostic execvp(FGREPSTD, args), oops("can't exec old 'fgrep'"); 89332332Sbostic else 89432332Sbostic execvp(EGREPSTD, args), oops("can't exec old 'egrep'"); 89532332Sbostic } 896