132332Sbostic #ifndef lint 2*32339Sbostic static char sccsid[] = "@(#)egrep.c 5.4 (Berkeley) 10/05/87"; 332332Sbostic #endif not lint 432332Sbostic 532332Sbostic /* 632332Sbostic Hybrid Boyer/Moore/Gosper-assisted 'grep/egrep/fgrep' search, with delta0 732332Sbostic table as in original paper (CACM, October, 1977). No delta1 or delta2. 832332Sbostic According to experiment (Horspool, Soft. Prac. Exp., 1982), delta2 is of 932332Sbostic minimal practical value. However, to improve for worst case input, 1032332Sbostic integrating the improved Galil strategies (Apostolico/Giancarlo, SIAM. J. 1132332Sbostic Comput., Feb. 1986) deserves consideration. 1232332Sbostic 1332332Sbostic Method: extract longest metacharacter-free string from expression. 1432332Sbostic this is done using a side-effect from henry spencer's regcomp(). 1532332Sbostic use boyer-moore to match such, then pass submatching lines 1632332Sbostic to either regexp() or standard 'egrep', depending on certain 1732332Sbostic criteria within execstrategy() below. [this tradeoff is due 1832332Sbostic to the general slowness of the regexp() nondeterministic 1932332Sbostic machine on complex expressions, as well as the startup time 2032332Sbostic of standard 'egrep' on short files.] alternatively, one may 2132332Sbostic change the vendor-supplied 'egrep' automaton to include 2232332Sbostic boyer-moore directly. see accompanying writeup for discussion 2332332Sbostic of kanji expression treatment. 2432332Sbostic 2532332Sbostic late addition: apply trickbag for fast match of simple 2632332Sbostic alternations (sublinear, in common low-cardinality cases). 2732332Sbostic trap fgrep into this lair. 2832332Sbostic 2932332Sbostic gnu additions: -f, newline as |, \< and \> [in regexec()], more 3032332Sbostic comments. inspire better dfa exec() strategy. 3132332Sbostic serious testing and help with special cases. 3232332Sbostic 3332332Sbostic Algorithm amalgam summary: 3432332Sbostic 3532332Sbostic dfa e?grep (aho/thompson) 3632332Sbostic ndfa regexp() (spencer/aho) 3732332Sbostic bmg (boyer/moore/gosper) 3832332Sbostic "superimposed" bmg (jaw) 3932332Sbostic fgrep (aho/corrasick) 4032332Sbostic 4132332Sbostic sorry, but the knuth/morris/pratt machine, horspool's 4232332Sbostic "frequentist" code, and the rabin/karp matcher, however cute, 4332332Sbostic just don't cut it for this production. 4432332Sbostic 4532332Sbostic James A. Woods Copyright (c) 1986 4632332Sbostic NASA Ames Research Center 4732332Sbostic */ 4832332Sbostic #include <stdio.h> 4932332Sbostic #include <ctype.h> 5032332Sbostic #include <sys/types.h> 5132332Sbostic #include <sys/stat.h> 5232337Sbostic #include <sys/file.h> 5332332Sbostic #include <regexp.h> /* must be henry spencer's version */ 5432332Sbostic 5532332Sbostic #define MIN(A, B) ((A) > (B) ? (B) : (A)) 5632332Sbostic 5732332Sbostic #ifdef SLOWSYS 5832332Sbostic #define read xread 5932332Sbostic #endif 6032332Sbostic /* 6132332Sbostic * define existing [ef]?grep program locations below for use by execvp(). 6232332Sbostic * [execlp() would be used were it not for the possibility of 6332332Sbostic * installation-dependent recursion.] 6432332Sbostic */ 6532332Sbostic #ifndef EGREPSTD 6632332Sbostic #define EGREPSTD "/usr/bin/egrep" 6732332Sbostic #endif 6832332Sbostic #ifndef GREPSTD 6932332Sbostic #define GREPSTD "/bin/grep" 7032332Sbostic #endif 7132332Sbostic #ifndef FGREPSTD 7232332Sbostic #define FGREPSTD "/usr/bin/fgrep" 7332332Sbostic #endif 7432332Sbostic 7532332Sbostic #define BUFSIZE 8192 /* make higher for cray */ 7632332Sbostic #define PATSIZE 6000 7732332Sbostic #define LARGE BUFSIZE + PATSIZE 7832332Sbostic 7932332Sbostic #define NALT 7 /* tied to scanf() size in alternate() */ 8032332Sbostic #define NMUSH 6 /* loosely relates to expected alt length */ 8132332Sbostic 8232332Sbostic #define FIRSTFEW 33 /* Always do FIRSTFEW matches with regexec() */ 8332332Sbostic #define PUNTPERCENT 10 /* After FIRSTFEW, if PUNTPERCENT of the input 8432332Sbostic * was processed by regexp(), exec std egrep. */ 8532332Sbostic #define NL '\n' 8632332Sbostic #define EOS '\0' 8732332Sbostic #define NONASCII 0200 /* Bit mask for Kanji non-ascii chars */ 8832332Sbostic #define META "\n^$.[]()?+*|\\" /* egrep meta-characters */ 8932332Sbostic #define SS2 '\216' /* EUC Katakana (or Chinese2) prefix */ 9032332Sbostic #define SS3 '\217' /* EUC Kanji2 (or Chinese3) prefix */ 9132332Sbostic 9232332Sbostic extern char *optarg; 9332332Sbostic extern int optind; 9432332Sbostic char *progname; 9532332Sbostic 9632332Sbostic int cflag, iflag, eflag, fflag, lflag, nflag; /* SVID flags */ 9732332Sbostic int sflag, hflag; /* v7, v8, bsd */ 9832332Sbostic 9932332Sbostic int firstflag; /* Stop at first match */ 10032332Sbostic int grepflag; /* Called as "grep" */ 10132332Sbostic int fgrepflag; /* Called as "fgrep" */ 10232332Sbostic int altflag; /* Simple alternation in pattern */ 10332332Sbostic int boyonly; /* No regexp needed -- all simple */ 10432332Sbostic int flushflag; 10532332Sbostic int grepold, egrepold, fgrepold; 10632332Sbostic 10732332Sbostic int nalt; /* Number of alternatives */ 10832332Sbostic int nsuccess; /* 1 for match, 2 for error */ 10932332Sbostic int altmin; /* Minimum length of all the alternate 11032332Sbostic * strings */ 11132332Sbostic int firstfile; /* argv index of first file argument */ 11232332Sbostic int patind; /* argv index of pattern */ 11332332Sbostic long nmatch; /* Number of matches in this file */ 11432332Sbostic long incount, counted; /* Amount of input consumed */ 11532332Sbostic long rxcount; /* Bytes of input processed by regexec() */ 11632332Sbostic int boyfound; /* accumulated partial matches (tripped by 11732332Sbostic * FIRSTFEW) */ 11832332Sbostic int prevmatch; /* next three lines aid fast -n */ 11932332Sbostic long nline, prevnline; 12032332Sbostic char *prevloc; 12132332Sbostic 12232332Sbostic regexp *rspencer; 12332332Sbostic char *pattern; 12432332Sbostic char *patboy; /* Pattern for simple Boyer-Moore */ 12532332Sbostic char *patfile; /* Filename containing pattern(s) */ 12632332Sbostic 12732332Sbostic int delta0[256]; /* Boyer-Moore algorithm core */ 12832332Sbostic char cmap[256]; /* Usually 0-255, but if -i, maps upper to 12932332Sbostic * lower case */ 13032332Sbostic char str[BUFSIZE + 2]; 13132332Sbostic int nleftover; 13232332Sbostic char linetemp[BUFSIZE]; 133*32339Sbostic char *altpat[NALT]; /* alternation component storage */ 13432332Sbostic int altlen[NALT]; 13532332Sbostic short altset[NMUSH + 1][256]; 13632332Sbostic char preamble[200]; /* match prefix (filename, line no.) */ 13732332Sbostic 13832332Sbostic int fd; 13932332Sbostic char * 14032332Sbostic strchr(), *strrchr(), *strcpy(), *strncpy(), *strpbrk(), *sprintf(), *malloc(); 14132332Sbostic char * 14232332Sbostic grepxlat(), *fold(), *pfile(), *alternate(), *isolate(); 14332332Sbostic char **args; 14432332Sbostic 14532332Sbostic main(argc, argv) 14632332Sbostic int argc; 14732332Sbostic char *argv[]; 14832332Sbostic { 14932332Sbostic int c; 15032332Sbostic int errflag = 0; 15132332Sbostic 15232332Sbostic args = argv; 15332332Sbostic 15432332Sbostic if ((progname = strrchr(argv[0], '/')) != 0) 15532332Sbostic progname++; 15632332Sbostic else 15732332Sbostic progname = argv[0]; 15832332Sbostic if (strcmp(progname, "grep") == 0) 15932332Sbostic grepflag++; 16032332Sbostic if (strcmp(progname, "fgrep") == 0) 16132332Sbostic fgrepflag++; 16232332Sbostic 16332332Sbostic while ((c = getopt(argc, argv, "bchie:f:lnsvwxy1")) != EOF) { 16432332Sbostic switch (c) { 16532332Sbostic 16632332Sbostic case 'f': 16732332Sbostic fflag++; 16832332Sbostic patfile = optarg; 16932332Sbostic continue; 17032332Sbostic case 'b': 17132332Sbostic case 'v': 17232332Sbostic egrepold++; /* boyer-moore of little help here */ 17332332Sbostic continue; 17432332Sbostic case 'c': 17532332Sbostic cflag++; 17632332Sbostic continue; 17732332Sbostic case 'e': 17832332Sbostic eflag++; 17932332Sbostic pattern = optarg; 18032332Sbostic continue; 18132332Sbostic case 'h': 18232332Sbostic hflag++; 18332332Sbostic continue; 18432332Sbostic case '1': /* Stop at very first match */ 18532332Sbostic firstflag++; /* spead freaks only */ 18632332Sbostic continue; 18732332Sbostic case 'i': 18832332Sbostic iflag++; 18932332Sbostic continue; 19032332Sbostic case 'l': 19132332Sbostic lflag++; 19232332Sbostic continue; 19332332Sbostic case 'n': 19432332Sbostic nflag++; 19532332Sbostic continue; 19632332Sbostic case 's': 19732332Sbostic sflag++; 19832332Sbostic continue; 19932332Sbostic case 'w': 20032332Sbostic case 'y': 20132332Sbostic if (!grepflag) 20232332Sbostic errflag++; 20332332Sbostic grepold++; 20432332Sbostic continue; 20532332Sbostic case 'x': /* needs more work, like -b above */ 20632332Sbostic if (!fgrepflag) 20732332Sbostic errflag++; 20832332Sbostic fgrepold++; 20932332Sbostic continue; 21032332Sbostic case '?': 21132332Sbostic errflag++; 21232332Sbostic } 21332332Sbostic } 21432332Sbostic if (errflag || ((argc <= optind) && !fflag && !eflag)) { 21532332Sbostic if (grepflag) 21632332Sbostic oops("usage: grep [-bcihlnsvwy] [-e] pattern [file ...]"); 21732332Sbostic else if (fgrepflag) 21832332Sbostic oops("usage: fgrep [-bcilnv] {-f patfile | [-e] strings} [file ...]"); 21932332Sbostic else /* encourage SVID options, though we provide 22032332Sbostic * others */ 22132332Sbostic oops("usage: egrep [-bcilnv] {-f patfile | [-e] pattern} [file ...]"); 22232332Sbostic } 22332332Sbostic patind = optind; 22432332Sbostic if (fflag) 22532332Sbostic pattern = pfile(patfile), patind = 0; 22632332Sbostic else if (!eflag) 22732332Sbostic pattern = argv[optind++]; 22832332Sbostic 22932332Sbostic if ((argc - optind) <= 1) /* Filename invisible given < 2 files */ 23032332Sbostic hflag++; 23132332Sbostic if (pattern[0] == EOS) 23232332Sbostic kernighan(argv); /* same as it ever was */ 23332332Sbostic /* 23432332Sbostic * 'grep/egrep' merger -- "old" grep is called to handle: tagged 23532332Sbostic * exprs \( \), word matches \< and \>, -w and -y options, char 23632332Sbostic * classes with '-' at end (egrep bug?), and patterns beginning with 23732332Sbostic * an asterisk (don't ask why). otherwise, characters meaningful to 23832332Sbostic * 'egrep' but not to 'grep' are escaped; the entire expr is then 23932332Sbostic * passed to 'egrep'. 24032332Sbostic */ 24132332Sbostic if (grepflag && !grepold) { 24232332Sbostic if (strindex(pattern, "\\(") >= 0 || 24332332Sbostic strindex(pattern, "\\<") >= 0 || 24432332Sbostic strindex(pattern, "\\>") >= 0 || 24532332Sbostic strindex(pattern, "-]") >= 0 || 24632332Sbostic pattern[0] == '*') /* grep bug */ 24732332Sbostic grepold++; 24832332Sbostic else 24932332Sbostic pattern = grepxlat(pattern); 25032332Sbostic } 25132332Sbostic if (grepold || egrepold || fgrepold) 25232332Sbostic kernighan(argv); 25332332Sbostic 25432332Sbostic if (iflag) 25532332Sbostic strcpy(pattern, fold(pattern)); 25632332Sbostic /* 25732332Sbostic * If the pattern is a plain string, just run boyer-moore. If it 25832332Sbostic * consists of meta-free alternatives, run "superimposed" bmg. 25932332Sbostic * Otherwise, find best string, and compile pattern for regexec(). 26032332Sbostic */ 26132332Sbostic if (strpbrk(pattern, META) == NULL) { /* do boyer-moore only */ 26232332Sbostic boyonly++; 26332332Sbostic patboy = pattern; 26432332Sbostic } else { 26532332Sbostic if ((patboy = alternate(pattern)) != NULL) 26632332Sbostic boyonly++; 26732332Sbostic else { 26832332Sbostic if ((patboy = isolate(pattern)) == NULL) 26932332Sbostic kernighan(argv); /* expr too involved */ 27032332Sbostic #ifndef NOKANJI 27132332Sbostic for (c = 0; pattern[c] != EOS; c++) 27232332Sbostic if (pattern[c] & NONASCII) /* kanji + meta */ 27332332Sbostic kernighan(argv); 27432332Sbostic #endif 27532332Sbostic if ((rspencer = regcomp(pattern)) == NULL) 27632332Sbostic oops("regcomp failure"); 27732332Sbostic } 27832332Sbostic } 27932332Sbostic gosper(patboy); /* "pre-conditioning is wonderful" 28032332Sbostic * -- v. strassen */ 28132332Sbostic 28232332Sbostic if ((firstfile = optind) >= argc) { 28332332Sbostic /* Grep standard input */ 28432332Sbostic if (lflag) /* We don't know its name! */ 28532332Sbostic exit(1); 28632332Sbostic egsecute((char *) NULL); 28732332Sbostic } else { 28832332Sbostic while (optind < argc) { 28932332Sbostic egsecute(argv[optind]); 29032332Sbostic optind++; 29132332Sbostic if (firstflag && (nsuccess == 1)) 29232332Sbostic break; 29332332Sbostic } 29432332Sbostic } 29532332Sbostic exit((nsuccess == 2) ? 2 : (nsuccess == 0)); 29632332Sbostic } 29732332Sbostic 29832332Sbostic char * 29932332Sbostic pfile(pfname) /* absorb expression from file */ 30032332Sbostic char *pfname; 30132332Sbostic { 30232337Sbostic int fd; 30332332Sbostic struct stat patstat; 30432332Sbostic static char *pat; 30532332Sbostic 30632337Sbostic if ((fd = open(pfname, O_RDONLY, 0)) < 0) 30732332Sbostic oops("can't read pattern file"); 30832337Sbostic if (fstat(fd, &patstat) != 0) 30932332Sbostic oops("can't stat pattern file"); 31032332Sbostic if (patstat.st_size > PATSIZE) { 31132332Sbostic if (fgrepflag) { /* defer to unix version */ 31232332Sbostic fgrepold++; 31332332Sbostic return "dummy"; 31432332Sbostic } else 31532332Sbostic oops("pattern file too big"); 31632332Sbostic } 31732332Sbostic if ((pat = malloc((unsigned) patstat.st_size + 1)) == NULL) 31832332Sbostic oops("out of memory to read pattern file"); 31932337Sbostic if (patstat.st_size != read(fd, pat, (int)patstat.st_size)) 32032332Sbostic oops("error reading pattern file"); 32132337Sbostic (void) close(fd); 32232332Sbostic 32332332Sbostic pat[patstat.st_size] = EOS; 32432332Sbostic if (pat[patstat.st_size - 1] == NL) /* NOP for egrep; helps grep */ 32532332Sbostic pat[patstat.st_size - 1] = EOS; 32632332Sbostic 32732332Sbostic if (nlcount(pat, &pat[patstat.st_size]) > NALT) { 32832332Sbostic if (fgrepflag) 32932332Sbostic fgrepold++; /* "what's it all about, alfie?" */ 33032332Sbostic else 33132332Sbostic egrepold++; 33232332Sbostic } 33332332Sbostic return (pat); 33432332Sbostic } 33532332Sbostic 33632332Sbostic egsecute(file) 33732332Sbostic char *file; 33832332Sbostic { 33932332Sbostic if (file == NULL) 34032332Sbostic fd = 0; 341*32339Sbostic else if ((fd = open(file, O_RDONLY, 0)) <= 0) { 34232332Sbostic fprintf(stderr, "%s: can't open %s\n", progname, file); 34332332Sbostic nsuccess = 2; 34432332Sbostic return; 34532332Sbostic } 34632332Sbostic chimaera(file, patboy); 34732332Sbostic 34832332Sbostic if (!boyonly && !flushflag && file != NULL) 34932332Sbostic flushmatches(); 35032332Sbostic if (file != NULL) 35132332Sbostic close(fd); 35232332Sbostic } 35332332Sbostic 35432332Sbostic chimaera(file, pat) /* "reach out and boyer-moore search someone" */ 35532332Sbostic char *file, *pat; /* -- soon-to-be-popular bumper sticker */ 35632332Sbostic { 35732332Sbostic register char *k, *strend, *s; 35832332Sbostic register int j, count; 35932332Sbostic register int *deltazero = delta0; 36032332Sbostic int patlen = altmin; 36132332Sbostic char *t; 36232332Sbostic char *gotamatch(), *kanji(), *linesave(), *submatch(); 36332332Sbostic 36432332Sbostic nleftover = boyfound = flushflag = 0; 36532332Sbostic nline = 1L; 36632332Sbostic prevmatch = 0; 36732332Sbostic nmatch = counted = rxcount = 0L; 36832332Sbostic 36932332Sbostic while ((count = read(fd, str + nleftover, BUFSIZE - nleftover)) > 0) { 37032332Sbostic 37132332Sbostic counted += count; 37232332Sbostic strend = linesave(str, count); 37332332Sbostic 37432332Sbostic for (k = str + patlen - 1; k < strend;) { 37532332Sbostic /* 37632332Sbostic * for a large class of patterns, upwards of 80% of 37732332Sbostic * match time is spent on the next line. we beat 37832332Sbostic * existing microcode (vax 'matchc') this way. 37932332Sbostic */ 38032332Sbostic while ((k += deltazero[*(unsigned char *) k]) < strend); 38132332Sbostic if (k < (str + LARGE)) 38232332Sbostic break; 38332332Sbostic k -= LARGE; 38432332Sbostic 38532332Sbostic if (altflag) { 38632332Sbostic /* 38732332Sbostic * Parallel Boyer-Moore. Check whether each 38832332Sbostic * of the previous <altmin> chars COULD be 38932332Sbostic * from one of the alternative strings. 39032332Sbostic */ 39132332Sbostic s = k - 1; 39232332Sbostic j = altmin; 39332332Sbostic while (altset[--j][(unsigned char) 39432332Sbostic cmap[*(unsigned char *) s--]]); 39532332Sbostic /* 39632332Sbostic * quick test fails. in this life, compare 39732332Sbostic * 'em all. but, a "reverse trie" would 39832332Sbostic * attenuate worst case (linear w/delta2?). 39932332Sbostic */ 40032332Sbostic if (--j < 0) { 40132332Sbostic count = nalt - 1; 40232332Sbostic do { 40332332Sbostic s = k; 40432332Sbostic j = altlen[count]; 40532332Sbostic t = altpat[count]; 40632332Sbostic 40732332Sbostic while 40832332Sbostic (cmap[*(unsigned char *) s--] 40932332Sbostic == t[--j]); 41032332Sbostic if (j < 0) 41132332Sbostic break; 41232332Sbostic } 41332332Sbostic while (count--); 41432332Sbostic } 41532332Sbostic } else { 41632332Sbostic /* One string -- check it */ 41732332Sbostic j = patlen - 1; 41832332Sbostic s = k - 1; 41932332Sbostic while (cmap[*(unsigned char *) s--] == pat[--j]); 42032332Sbostic } 42132332Sbostic /* 42232332Sbostic * delta-less shortcut for literati. short shrift for 42332332Sbostic * genetic engineers? 42432332Sbostic */ 42532332Sbostic if (j >= 0) { 42632332Sbostic k++; /* no match; restart next char */ 42732332Sbostic continue; 42832332Sbostic } 42932332Sbostic k = submatch(file, pat, str, strend, k, count); 43032332Sbostic if (k == NULL) 43132332Sbostic return; 43232332Sbostic } 43332332Sbostic if (nflag) { 43432332Sbostic if (prevmatch) 43532332Sbostic nline = prevnline + nlcount(prevloc, k); 43632332Sbostic else 43732332Sbostic nline = nline + nlcount(str, k); 43832332Sbostic prevmatch = 0; 43932332Sbostic } 44032332Sbostic strncpy(str, linetemp, nleftover); 44132332Sbostic } 44232332Sbostic if (cflag) { 44332332Sbostic /* Bug from old grep: -c overrides -h. We fix the bug. */ 44432332Sbostic if (!hflag) 44532332Sbostic printf("%s:", file); 44632332Sbostic printf("%ld\n", nmatch); 44732332Sbostic } 44832332Sbostic } 44932332Sbostic 45032332Sbostic char * 45132332Sbostic linesave(str, count) /* accumulate partial line at end of buffer */ 45232332Sbostic char str[]; 45332332Sbostic register int count; 45432332Sbostic { 45532332Sbostic register int j; 45632332Sbostic 45732332Sbostic count += nleftover; 45832332Sbostic if (count != BUFSIZE && fd != 0) 45932332Sbostic str[count++] = NL; /* insurance for broken last line */ 46032332Sbostic str[count] = EOS; 46132332Sbostic for (j = count - 1; str[j] != NL && j >= 0;) 46232332Sbostic j--; 46332332Sbostic /* 46432332Sbostic * break up these lines: long line (> BUFSIZE), last line of file, or 46532332Sbostic * short return from read(), as from tee(1) input 46632332Sbostic */ 46732332Sbostic if (j < 0 && (count == (BUFSIZE - nleftover))) { 46832332Sbostic str[count++] = NL; 46932332Sbostic str[count] = EOS; 47032332Sbostic linetemp[0] = EOS; 47132332Sbostic nleftover = 0; 47232332Sbostic return (str + count); 47332332Sbostic } else { 47432332Sbostic nleftover = count - j - 1; 47532332Sbostic strncpy(linetemp, str + j + 1, nleftover); 47632332Sbostic return (str + j); 47732332Sbostic } 47832332Sbostic } 47932332Sbostic 48032332Sbostic /* 48132332Sbostic * Process partial match. First check for mis-aligned Kanji, then match line 48232332Sbostic * against full compiled r.e. if statistics do not warrant handing off to 48332332Sbostic * standard egrep. 48432332Sbostic */ 48532332Sbostic char * 48632332Sbostic submatch(file, pat, str, strend, k, altindex) 48732332Sbostic char file[], pat[], str[]; 48832332Sbostic register char *strend, *k; 48932332Sbostic int altindex; 49032332Sbostic { 49132332Sbostic register char *s; 49232332Sbostic char *t, c; 49332332Sbostic 49432332Sbostic t = k; 49532332Sbostic s = ((altflag) ? k - altlen[altindex] + 1 : k - altmin + 1); 49632332Sbostic #ifndef NOKANJI 49732332Sbostic c = ((altflag) ? altpat[altindex][0] : pat[0]); 49832332Sbostic if (c & NONASCII) 49932332Sbostic if ((s = kanji(str, s, k)) == NULL) 50032332Sbostic return (++k); /* reject false kanji */ 50132332Sbostic #endif 50232332Sbostic do; 50332332Sbostic while (*s != NL && --s >= str); 50432332Sbostic k = s + 1; /* now at line start */ 50532332Sbostic 50632332Sbostic if (boyonly) 50732332Sbostic return (gotamatch(file, k)); 50832332Sbostic 50932332Sbostic incount = counted - (strend - k); 51032332Sbostic if (boyfound++ == FIRSTFEW) 51132332Sbostic execstrategy(file); 51232332Sbostic 51332332Sbostic s = t; 51432332Sbostic do 51532332Sbostic rxcount++; 51632332Sbostic while (*s++ != NL); 51732332Sbostic *--s = EOS; 51832332Sbostic /* 51932332Sbostic * "quick henry -- the flit" (after theodor geisel) 52032332Sbostic */ 52132332Sbostic if (regexec(rspencer, ((iflag) ? fold(k) : k)) == 1) { 52232332Sbostic *s = NL; 52332332Sbostic if (gotamatch(file, k) == NULL) 52432332Sbostic return (NULL); 52532332Sbostic } 52632332Sbostic *s = NL; 52732332Sbostic return (s + 1); 52832332Sbostic } 52932332Sbostic 53032338Sbostic #ifndef NOKANJI 53132332Sbostic /* 53232332Sbostic * EUC code disambiguation -- scan backwards to first 7-bit code, while 53332332Sbostic * counting intervening 8-bit codes. If odd, reject unaligned Kanji pattern. 53432332Sbostic * SS2/3 checks are for intermixed Japanase Katakana or Kanji2. 53532332Sbostic */ 53632332Sbostic char * 53732332Sbostic kanji(str, s, k) 53832332Sbostic register char *str, *s, *k; 53932332Sbostic { 54032332Sbostic register int j = 0; 54132332Sbostic 54232332Sbostic for (s--; s >= str; s--) { 54332332Sbostic if (*s == SS2 || *s == SS3 || (*s & NONASCII) == 0) 54432332Sbostic break; 54532332Sbostic j++; 54632332Sbostic } 54732332Sbostic #ifndef CHINESE 54832332Sbostic if (*s == SS2) 54932332Sbostic j -= 1; 55032332Sbostic #endif CHINESE 55132332Sbostic return ((j & 01) ? NULL : k); 55232332Sbostic } 55332338Sbostic #endif 55432332Sbostic 55532332Sbostic /* 55632332Sbostic * Compute "Boyer-Moore" delta table -- put skip distance in delta0[c] 55732332Sbostic */ 55832332Sbostic gosper(pattern) 55932332Sbostic char *pattern; /* ... HAKMEM lives ... */ 56032332Sbostic { 56132332Sbostic register int i, j; 56232332Sbostic unsigned char c; 56332332Sbostic 56432332Sbostic /* Make one-string case look like simple alternatives case */ 56532332Sbostic if (!altflag) { 56632332Sbostic nalt = 1; 56732332Sbostic altmin = altlen[0] = strlen(pattern); 568*32339Sbostic altpat[0] = pattern; 56932332Sbostic } 57032332Sbostic /* For chars that aren't in any string, skip by string length. */ 57132332Sbostic for (j = 0; j < 256; j++) { 57232332Sbostic delta0[j] = altmin; 57332332Sbostic cmap[j] = j; /* Sneak in initialization of cmap */ 57432332Sbostic } 57532332Sbostic 57632332Sbostic /* For chars in a string, skip distance from char to end of string. */ 57732332Sbostic /* (If char appears more than once, skip minimum distance.) */ 57832332Sbostic for (i = 0; i < nalt; i++) 57932332Sbostic for (j = 0; j < altlen[i] - 1; j++) { 58032332Sbostic c = altpat[i][j]; 58132332Sbostic delta0[c] = MIN(delta0[c], altlen[i] - j - 1); 58232332Sbostic if (iflag && islower((int) c)) 58332332Sbostic delta0[toupper((int) c)] = delta0[c]; 58432332Sbostic } 58532332Sbostic 58632332Sbostic /* For last char of each string, fall out of search loop. */ 58732332Sbostic for (i = 0; i < nalt; i++) { 58832332Sbostic c = altpat[i][altlen[i] - 1]; 58932332Sbostic delta0[c] = LARGE; 59032332Sbostic if (iflag && islower((int) c)) 59132332Sbostic delta0[toupper((int) c)] = LARGE; 59232332Sbostic } 59332332Sbostic if (iflag) 59432332Sbostic for (j = 'A'; j <= 'Z'; j++) 59532332Sbostic cmap[j] = tolower((int) j); 59632332Sbostic } 59732332Sbostic 59832332Sbostic /* 59932332Sbostic * Print, count, or stop on full match. Result is either the location for 60032332Sbostic * continued search, or NULL to stop. 60132332Sbostic */ 60232332Sbostic char * 60332332Sbostic gotamatch(file, s) 60432332Sbostic register char *file, *s; 60532332Sbostic { 60632332Sbostic char *savematch(); 60732332Sbostic int squirrel = 0; /* nonzero to squirrel away FIRSTFEW matches */ 60832332Sbostic 60932332Sbostic nmatch++; 61032332Sbostic nsuccess = 1; 61132332Sbostic if (!boyonly && boyfound <= FIRSTFEW && file != NULL) 61232332Sbostic squirrel = 1; 61332332Sbostic 61432332Sbostic if (sflag) 61532332Sbostic return (NULL); /* -s usurps all flags (unlike some versions) */ 61632332Sbostic if (cflag) { /* -c overrides -l, we guess */ 61732332Sbostic do; 61832332Sbostic while (*s++ != NL); 61932332Sbostic } else if (lflag) { 62032332Sbostic puts(file); 62132332Sbostic return (NULL); 62232332Sbostic } else { 62332332Sbostic if (!hflag) 62432332Sbostic if (!squirrel) 62532332Sbostic printf("%s:", file); 62632332Sbostic else 62732332Sbostic sprintf(preamble, "%s:", file); 62832332Sbostic if (nflag) { 62932332Sbostic if (prevmatch) 63032332Sbostic prevnline = prevnline + nlcount(prevloc, s); 63132332Sbostic else 63232332Sbostic prevnline = nline + nlcount(str, s); 63332332Sbostic prevmatch = 1; 63432332Sbostic 63532332Sbostic if (!squirrel) 63632332Sbostic printf("%ld:", prevnline); 63732332Sbostic else 63832332Sbostic sprintf(preamble + strlen(preamble), 63932332Sbostic "%ld:", prevnline); 64032332Sbostic } 64132332Sbostic if (!squirrel) { 64232332Sbostic do 64332332Sbostic putchar(*s); 64432332Sbostic while (*s++ != NL); 64532332Sbostic } else 64632332Sbostic s = savematch(s); 64732332Sbostic 64832332Sbostic if (nflag) 64932332Sbostic prevloc = s - 1; 65032332Sbostic } 65132332Sbostic return ((firstflag && !cflag) ? NULL : s); 65232332Sbostic } 65332332Sbostic 65432332Sbostic char * 65532332Sbostic fold(line) 65632332Sbostic char *line; 65732332Sbostic { 65832332Sbostic static char fline[BUFSIZE]; 65932332Sbostic register char *s, *t = fline; 66032332Sbostic 66132332Sbostic for (s = line; *s != EOS; s++) 66232332Sbostic *t++ = (isupper((int) *s) ? (char) tolower((int) *s) : *s); 66332332Sbostic *t = EOS; 66432332Sbostic return (fline); 66532332Sbostic } 66632332Sbostic 66732332Sbostic strindex(s, t) /* the easy way, as in K&P, p. 192 */ 66832332Sbostic char *s, *t; 66932332Sbostic { 67032332Sbostic int i, n; 67132332Sbostic 67232332Sbostic n = strlen(t); 67332332Sbostic for (i = 0; s[i] != '\0'; i++) 67432332Sbostic if (strncmp(s + i, t, n) == 0) 67532332Sbostic return (i); 67632332Sbostic return (-1); 67732332Sbostic } 67832332Sbostic 67932332Sbostic char * 68032332Sbostic grepxlat(pattern) /* grep pattern meta conversion */ 68132332Sbostic char *pattern; 68232332Sbostic { 68332332Sbostic register char *p, *s; 68432332Sbostic static char newpat[BUFSIZE]; 68532332Sbostic 68632332Sbostic for (s = newpat, p = pattern; *p != EOS;) { 68732332Sbostic if (*p == '\\') { /* skip escapes ... */ 68832332Sbostic *s++ = *p++; 68932332Sbostic if (*p) 69032332Sbostic *s++ = *p++; 69132332Sbostic } else if (*p == '[') { /* ... and char classes */ 69232332Sbostic while (*p != EOS && *p != ']') 69332332Sbostic *s++ = *p++; 69432332Sbostic } else if (strchr("+?|()", *p) != NULL) { 69532332Sbostic *s++ = '\\'; /* insert protection */ 69632332Sbostic *s++ = *p++; 69732332Sbostic } else 69832332Sbostic *s++ = *p++; 69932332Sbostic } 70032332Sbostic *s = EOS; 70132332Sbostic grepflag = ((patind) ? 0 : 1); 70232332Sbostic return (newpat); 70332332Sbostic } 70432332Sbostic 70532332Sbostic /* 70632332Sbostic * Test for simple alternation. Result is NULL if it's not so simple, or is 70732332Sbostic * a pointer to the first string if it is. Warning: sscanf size is a 70832332Sbostic * fixpoint, beyond which the speedup linearity starts to break down. In the 70932332Sbostic * wake of the elegant aho/corrasick "trie"-based fgrep, generalizing 71032332Sbostic * altpat[] to arbitrary size is not useful. 71132332Sbostic */ 71232332Sbostic char * 71332332Sbostic alternate(regexpr) 71432332Sbostic char *regexpr; 71532332Sbostic { 716*32339Sbostic register int i, j; 717*32339Sbostic register char *start, *stop; 71832332Sbostic unsigned char c; 71932332Sbostic 72032332Sbostic if (fgrepflag && strchr(regexpr, '|')) 721*32339Sbostic return (NULL); 72232332Sbostic 723*32339Sbostic /* 724*32339Sbostic * break pattern up into altpat array; delimit on newline, bar, 725*32339Sbostic * or EOS. We know we won't overflow, we've already checked the 726*32339Sbostic * number of patterns we're going to find against NALT. 727*32339Sbostic * Also, set length of pattern and find minimum pattern length. 728*32339Sbostic */ 729*32339Sbostic nalt = 0; 730*32339Sbostic altmin = NMUSH; 731*32339Sbostic for (start = stop = regexpr;; ++stop) 732*32339Sbostic if (!*stop || *stop == '|' || *stop == NL) { 733*32339Sbostic altlen[nalt] = j = stop - start; 734*32339Sbostic if (j < altmin) 735*32339Sbostic altmin = j; 736*32339Sbostic if (!(altpat[nalt] = malloc((u_int)(j + 1)))) 737*32339Sbostic oops("out of memory"); 738*32339Sbostic bcopy(start, altpat[nalt], j); 739*32339Sbostic altpat[nalt][j] = EOS; 740*32339Sbostic if (!*stop) 741*32339Sbostic break; 742*32339Sbostic if (++nalt == NALT) 743*32339Sbostic return(NULL); 744*32339Sbostic if (*stop == NL) 745*32339Sbostic *stop = '|'; 746*32339Sbostic start = stop + 1; 747*32339Sbostic } 74832332Sbostic if (!fgrepflag) { 74932332Sbostic if (strchr(regexpr, '|') == NULL || regexpr[0] == '|') 75032332Sbostic return (NULL); 75132332Sbostic if (strpbrk(regexpr, "^$.[]()?+*\\") != NULL 75232332Sbostic || strindex(regexpr, "||") >= 0) 75332332Sbostic return (NULL); 75432332Sbostic } 75532332Sbostic 75632332Sbostic if (nalt > 1) { /* build superimposed "pre-match" sets per 75732332Sbostic * char */ 75832332Sbostic altflag++; 75932332Sbostic for (j = 0; j < nalt; j++) 76032332Sbostic for (i = 0; i < altmin; i++) { 76132332Sbostic c = altpat[j][altlen[j] - altmin + i]; 76232332Sbostic altset[i + 1][c] = 1; /* offset for sentinel */ 76332332Sbostic } 76432332Sbostic } 76532332Sbostic return (altpat[0]); 76632332Sbostic } 76732332Sbostic 76832332Sbostic /* 76932332Sbostic * Grapple with the dfa (std egrep) vs. ndfa (regexp) tradeoff. Criteria to 77032332Sbostic * determine whether to use dfa-based egrep: We do FIRSTFEW matches with 77132332Sbostic * regexec(). If Boyer-Moore up to now matched more than PUNTPERCENT 77232332Sbostic * of the input, the r.e. is likely to be underspecified, so do old *grep, 77332332Sbostic * which is faster on complex patterns than regexp(). At FIRSTFEW, 77432332Sbostic * dump the saved matches collected by savematch(). They are saved 77532332Sbostic * so that a "PUNT" can "rewind" to ignore them. Stdin is problematic, 77632332Sbostic * since it's hard to rewind. 77732332Sbostic */ 77832332Sbostic 77932332Sbostic execstrategy(file) 78032332Sbostic char *file; 78132332Sbostic { 78232332Sbostic int pctmatch; 78332332Sbostic 78432332Sbostic pctmatch = (100 * rxcount) / incount; 78532332Sbostic if (pctmatch > PUNTPERCENT && file != NULL) 78632332Sbostic kernighan(args); 78732332Sbostic if (file != NULL) 78832332Sbostic flushmatches(); 78932332Sbostic } 79032332Sbostic 79132332Sbostic nlcount(bstart, bstop) /* flail interval to totalize newlines. */ 79232332Sbostic char *bstart, *bstop; 79332332Sbostic { 79432332Sbostic register char *s = bstart; 79532332Sbostic register char *t = bstop; 79632332Sbostic register int count = 0; 79732332Sbostic 79832332Sbostic do { /* loop unroll for older architectures */ 79932332Sbostic if (*t == NL) /* ... ask ames!jaw for sample code */ 80032332Sbostic count++; 80132332Sbostic } while (t-- > s); 80232332Sbostic 80332332Sbostic return (count); 80432332Sbostic } 80532332Sbostic 80632332Sbostic char * 80732332Sbostic isolate(regexpr) /* isolate longest metacharacter-free string */ 80832332Sbostic char *regexpr; 80932332Sbostic { 81032332Sbostic char *dummyexpr; 81132332Sbostic 81232332Sbostic /* 81332332Sbostic * We add (.)* because Henry's regcomp only figures regmust if it 81432332Sbostic * sees a leading * pattern. Foo! 81532332Sbostic */ 81632332Sbostic dummyexpr = malloc((unsigned) strlen(regexpr) + 5); 81732332Sbostic sprintf(dummyexpr, "(.)*%s", regexpr); 81832332Sbostic if ((rspencer = regcomp(dummyexpr)) == NULL) 81932332Sbostic kernighan(args); 82032332Sbostic return (rspencer->regmust); 82132332Sbostic } 82232332Sbostic 82332332Sbostic char *matches[FIRSTFEW]; 82432332Sbostic static int mcount = 0; 82532332Sbostic 82632332Sbostic char * 82732332Sbostic savematch(s) /* horde matches during statistics gathering */ 82832332Sbostic register char *s; 82932332Sbostic { 83032332Sbostic char *p; 83132332Sbostic char *start = s; 83232332Sbostic int msize = 0; 83332332Sbostic int psize = strlen(preamble); 83432332Sbostic 83532332Sbostic while (*s++ != NL) 83632332Sbostic msize++; 83732332Sbostic *--s = EOS; 83832332Sbostic 83932332Sbostic p = malloc((unsigned) msize + 1 + psize); 84032332Sbostic strcpy(p, preamble); 84132332Sbostic strcpy(p + psize, start); 84232332Sbostic matches[mcount++] = p; 84332332Sbostic 84432332Sbostic preamble[0] = 0; 84532332Sbostic *s = NL; 84632332Sbostic return (s); 84732332Sbostic } 84832332Sbostic 84932332Sbostic flushmatches() 85032332Sbostic { 85132332Sbostic int n; 85232332Sbostic 85332332Sbostic flushflag = 1; 85432332Sbostic for (n = 0; n < mcount; n++) 85532332Sbostic printf("%s\n", matches[n]); 85632332Sbostic mcount = 0; 85732332Sbostic } 85832332Sbostic 85932332Sbostic oops(message) 86032332Sbostic char *message; 86132332Sbostic { 86232332Sbostic fprintf(stderr, "%s: %s\n", progname, message); 86332332Sbostic exit(2); 86432332Sbostic } 86532332Sbostic 86632332Sbostic kernighan(args) /* "let others do the hard part ..." */ 86732332Sbostic char *args[]; 86832332Sbostic { 86932332Sbostic /* 87032332Sbostic * We may have already run grep on some of the files; remove them 87132332Sbostic * from the arg list we pass on. Note that we can't delete them 87232332Sbostic * totally because the number of file names affects the output 87332332Sbostic * (automatic -h). 87432332Sbostic */ 87532332Sbostic /* better would be fork/exec per punted file -- jaw */ 87632332Sbostic 87732332Sbostic while (firstfile && optind > firstfile) 87832332Sbostic args[firstfile++] = "/dev/null"; 87932332Sbostic if (patind) 88032332Sbostic args[patind] = pattern; 881*32339Sbostic (void) fflush(stdout); 88232332Sbostic 88332332Sbostic if (grepflag) 88432332Sbostic execvp(GREPSTD, args), oops("can't exec old 'grep'"); 88532332Sbostic else if (fgrepflag) 88632332Sbostic execvp(FGREPSTD, args), oops("can't exec old 'fgrep'"); 88732332Sbostic else 88832332Sbostic execvp(EGREPSTD, args), oops("can't exec old 'egrep'"); 88932332Sbostic } 890