132332Sbostic #ifndef lint 2*32338Sbostic static char sccsid[] = "@(#)egrep.c 5.3 (Berkeley) 10/05/87"; 332332Sbostic #endif not lint 432332Sbostic 532332Sbostic /* 632332Sbostic Hybrid Boyer/Moore/Gosper-assisted 'grep/egrep/fgrep' search, with delta0 732332Sbostic table as in original paper (CACM, October, 1977). No delta1 or delta2. 832332Sbostic According to experiment (Horspool, Soft. Prac. Exp., 1982), delta2 is of 932332Sbostic minimal practical value. However, to improve for worst case input, 1032332Sbostic integrating the improved Galil strategies (Apostolico/Giancarlo, SIAM. J. 1132332Sbostic Comput., Feb. 1986) deserves consideration. 1232332Sbostic 1332332Sbostic Method: extract longest metacharacter-free string from expression. 1432332Sbostic this is done using a side-effect from henry spencer's regcomp(). 1532332Sbostic use boyer-moore to match such, then pass submatching lines 1632332Sbostic to either regexp() or standard 'egrep', depending on certain 1732332Sbostic criteria within execstrategy() below. [this tradeoff is due 1832332Sbostic to the general slowness of the regexp() nondeterministic 1932332Sbostic machine on complex expressions, as well as the startup time 2032332Sbostic of standard 'egrep' on short files.] alternatively, one may 2132332Sbostic change the vendor-supplied 'egrep' automaton to include 2232332Sbostic boyer-moore directly. see accompanying writeup for discussion 2332332Sbostic of kanji expression treatment. 2432332Sbostic 2532332Sbostic late addition: apply trickbag for fast match of simple 2632332Sbostic alternations (sublinear, in common low-cardinality cases). 2732332Sbostic trap fgrep into this lair. 2832332Sbostic 2932332Sbostic gnu additions: -f, newline as |, \< and \> [in regexec()], more 3032332Sbostic comments. inspire better dfa exec() strategy. 3132332Sbostic serious testing and help with special cases. 3232332Sbostic 3332332Sbostic Algorithm amalgam summary: 3432332Sbostic 3532332Sbostic dfa e?grep (aho/thompson) 3632332Sbostic ndfa regexp() (spencer/aho) 3732332Sbostic bmg (boyer/moore/gosper) 3832332Sbostic "superimposed" bmg (jaw) 3932332Sbostic fgrep (aho/corrasick) 4032332Sbostic 4132332Sbostic sorry, but the knuth/morris/pratt machine, horspool's 4232332Sbostic "frequentist" code, and the rabin/karp matcher, however cute, 4332332Sbostic just don't cut it for this production. 4432332Sbostic 4532332Sbostic James A. Woods Copyright (c) 1986 4632332Sbostic NASA Ames Research Center 4732332Sbostic */ 4832332Sbostic #include <stdio.h> 4932332Sbostic #include <ctype.h> 5032332Sbostic #include <sys/types.h> 5132332Sbostic #include <sys/stat.h> 5232337Sbostic #include <sys/file.h> 5332332Sbostic #include <regexp.h> /* must be henry spencer's version */ 5432332Sbostic 5532332Sbostic #define MIN(A, B) ((A) > (B) ? (B) : (A)) 5632332Sbostic 5732332Sbostic #ifdef SLOWSYS 5832332Sbostic #define read xread 5932332Sbostic #endif 6032332Sbostic /* 6132332Sbostic * define existing [ef]?grep program locations below for use by execvp(). 6232332Sbostic * [execlp() would be used were it not for the possibility of 6332332Sbostic * installation-dependent recursion.] 6432332Sbostic */ 6532332Sbostic #ifndef EGREPSTD 6632332Sbostic #define EGREPSTD "/usr/bin/egrep" 6732332Sbostic #endif 6832332Sbostic #ifndef GREPSTD 6932332Sbostic #define GREPSTD "/bin/grep" 7032332Sbostic #endif 7132332Sbostic #ifndef FGREPSTD 7232332Sbostic #define FGREPSTD "/usr/bin/fgrep" 7332332Sbostic #endif 7432332Sbostic 7532332Sbostic #define BUFSIZE 8192 /* make higher for cray */ 7632332Sbostic #define PATSIZE 6000 7732332Sbostic #define LARGE BUFSIZE + PATSIZE 7832332Sbostic 7932332Sbostic #define ALTSIZE 100 /* generous? */ 8032332Sbostic #define NALT 7 /* tied to scanf() size in alternate() */ 8132332Sbostic #define NMUSH 6 /* loosely relates to expected alt length */ 8232332Sbostic 8332332Sbostic #define FIRSTFEW 33 /* Always do FIRSTFEW matches with regexec() */ 8432332Sbostic #define PUNTPERCENT 10 /* After FIRSTFEW, if PUNTPERCENT of the input 8532332Sbostic * was processed by regexp(), exec std egrep. */ 8632332Sbostic #define NL '\n' 8732332Sbostic #define EOS '\0' 8832332Sbostic #define NONASCII 0200 /* Bit mask for Kanji non-ascii chars */ 8932332Sbostic #define META "\n^$.[]()?+*|\\" /* egrep meta-characters */ 9032332Sbostic #define SS2 '\216' /* EUC Katakana (or Chinese2) prefix */ 9132332Sbostic #define SS3 '\217' /* EUC Kanji2 (or Chinese3) prefix */ 9232332Sbostic 9332332Sbostic extern char *optarg; 9432332Sbostic extern int optind; 9532332Sbostic char *progname; 9632332Sbostic 9732332Sbostic int cflag, iflag, eflag, fflag, lflag, nflag; /* SVID flags */ 9832332Sbostic int sflag, hflag; /* v7, v8, bsd */ 9932332Sbostic 10032332Sbostic int firstflag; /* Stop at first match */ 10132332Sbostic int grepflag; /* Called as "grep" */ 10232332Sbostic int fgrepflag; /* Called as "fgrep" */ 10332332Sbostic int altflag; /* Simple alternation in pattern */ 10432332Sbostic int boyonly; /* No regexp needed -- all simple */ 10532332Sbostic int flushflag; 10632332Sbostic int grepold, egrepold, fgrepold; 10732332Sbostic 10832332Sbostic int nalt; /* Number of alternatives */ 10932332Sbostic int nsuccess; /* 1 for match, 2 for error */ 11032332Sbostic int altmin; /* Minimum length of all the alternate 11132332Sbostic * strings */ 11232332Sbostic int firstfile; /* argv index of first file argument */ 11332332Sbostic int patind; /* argv index of pattern */ 11432332Sbostic long nmatch; /* Number of matches in this file */ 11532332Sbostic long incount, counted; /* Amount of input consumed */ 11632332Sbostic long rxcount; /* Bytes of input processed by regexec() */ 11732332Sbostic int boyfound; /* accumulated partial matches (tripped by 11832332Sbostic * FIRSTFEW) */ 11932332Sbostic int prevmatch; /* next three lines aid fast -n */ 12032332Sbostic long nline, prevnline; 12132332Sbostic char *prevloc; 12232332Sbostic 12332332Sbostic regexp *rspencer; 12432332Sbostic char *pattern; 12532332Sbostic char *patboy; /* Pattern for simple Boyer-Moore */ 12632332Sbostic char *patfile; /* Filename containing pattern(s) */ 12732332Sbostic 12832332Sbostic int delta0[256]; /* Boyer-Moore algorithm core */ 12932332Sbostic char cmap[256]; /* Usually 0-255, but if -i, maps upper to 13032332Sbostic * lower case */ 13132332Sbostic char str[BUFSIZE + 2]; 13232332Sbostic int nleftover; 13332332Sbostic char linetemp[BUFSIZE]; 13432332Sbostic char altpat[NALT][ALTSIZE]; /* alternation component storage */ 13532332Sbostic int altlen[NALT]; 13632332Sbostic short altset[NMUSH + 1][256]; 13732332Sbostic char preamble[200]; /* match prefix (filename, line no.) */ 13832332Sbostic 13932332Sbostic int fd; 14032332Sbostic char * 14132332Sbostic strchr(), *strrchr(), *strcpy(), *strncpy(), *strpbrk(), *sprintf(), *malloc(); 14232332Sbostic char * 14332332Sbostic grepxlat(), *fold(), *pfile(), *alternate(), *isolate(); 14432332Sbostic char **args; 14532332Sbostic 14632332Sbostic main(argc, argv) 14732332Sbostic int argc; 14832332Sbostic char *argv[]; 14932332Sbostic { 15032332Sbostic int c; 15132332Sbostic int errflag = 0; 15232332Sbostic 15332332Sbostic args = argv; 15432332Sbostic 15532332Sbostic if ((progname = strrchr(argv[0], '/')) != 0) 15632332Sbostic progname++; 15732332Sbostic else 15832332Sbostic progname = argv[0]; 15932332Sbostic if (strcmp(progname, "grep") == 0) 16032332Sbostic grepflag++; 16132332Sbostic if (strcmp(progname, "fgrep") == 0) 16232332Sbostic fgrepflag++; 16332332Sbostic 16432332Sbostic while ((c = getopt(argc, argv, "bchie:f:lnsvwxy1")) != EOF) { 16532332Sbostic switch (c) { 16632332Sbostic 16732332Sbostic case 'f': 16832332Sbostic fflag++; 16932332Sbostic patfile = optarg; 17032332Sbostic continue; 17132332Sbostic case 'b': 17232332Sbostic case 'v': 17332332Sbostic egrepold++; /* boyer-moore of little help here */ 17432332Sbostic continue; 17532332Sbostic case 'c': 17632332Sbostic cflag++; 17732332Sbostic continue; 17832332Sbostic case 'e': 17932332Sbostic eflag++; 18032332Sbostic pattern = optarg; 18132332Sbostic continue; 18232332Sbostic case 'h': 18332332Sbostic hflag++; 18432332Sbostic continue; 18532332Sbostic case '1': /* Stop at very first match */ 18632332Sbostic firstflag++; /* spead freaks only */ 18732332Sbostic continue; 18832332Sbostic case 'i': 18932332Sbostic iflag++; 19032332Sbostic continue; 19132332Sbostic case 'l': 19232332Sbostic lflag++; 19332332Sbostic continue; 19432332Sbostic case 'n': 19532332Sbostic nflag++; 19632332Sbostic continue; 19732332Sbostic case 's': 19832332Sbostic sflag++; 19932332Sbostic continue; 20032332Sbostic case 'w': 20132332Sbostic case 'y': 20232332Sbostic if (!grepflag) 20332332Sbostic errflag++; 20432332Sbostic grepold++; 20532332Sbostic continue; 20632332Sbostic case 'x': /* needs more work, like -b above */ 20732332Sbostic if (!fgrepflag) 20832332Sbostic errflag++; 20932332Sbostic fgrepold++; 21032332Sbostic continue; 21132332Sbostic case '?': 21232332Sbostic errflag++; 21332332Sbostic } 21432332Sbostic } 21532332Sbostic if (errflag || ((argc <= optind) && !fflag && !eflag)) { 21632332Sbostic if (grepflag) 21732332Sbostic oops("usage: grep [-bcihlnsvwy] [-e] pattern [file ...]"); 21832332Sbostic else if (fgrepflag) 21932332Sbostic oops("usage: fgrep [-bcilnv] {-f patfile | [-e] strings} [file ...]"); 22032332Sbostic else /* encourage SVID options, though we provide 22132332Sbostic * others */ 22232332Sbostic oops("usage: egrep [-bcilnv] {-f patfile | [-e] pattern} [file ...]"); 22332332Sbostic } 22432332Sbostic patind = optind; 22532332Sbostic if (fflag) 22632332Sbostic pattern = pfile(patfile), patind = 0; 22732332Sbostic else if (!eflag) 22832332Sbostic pattern = argv[optind++]; 22932332Sbostic 23032332Sbostic if ((argc - optind) <= 1) /* Filename invisible given < 2 files */ 23132332Sbostic hflag++; 23232332Sbostic if (pattern[0] == EOS) 23332332Sbostic kernighan(argv); /* same as it ever was */ 23432332Sbostic /* 23532332Sbostic * 'grep/egrep' merger -- "old" grep is called to handle: tagged 23632332Sbostic * exprs \( \), word matches \< and \>, -w and -y options, char 23732332Sbostic * classes with '-' at end (egrep bug?), and patterns beginning with 23832332Sbostic * an asterisk (don't ask why). otherwise, characters meaningful to 23932332Sbostic * 'egrep' but not to 'grep' are escaped; the entire expr is then 24032332Sbostic * passed to 'egrep'. 24132332Sbostic */ 24232332Sbostic if (grepflag && !grepold) { 24332332Sbostic if (strindex(pattern, "\\(") >= 0 || 24432332Sbostic strindex(pattern, "\\<") >= 0 || 24532332Sbostic strindex(pattern, "\\>") >= 0 || 24632332Sbostic strindex(pattern, "-]") >= 0 || 24732332Sbostic pattern[0] == '*') /* grep bug */ 24832332Sbostic grepold++; 24932332Sbostic else 25032332Sbostic pattern = grepxlat(pattern); 25132332Sbostic } 25232332Sbostic if (grepold || egrepold || fgrepold) 25332332Sbostic kernighan(argv); 25432332Sbostic 25532332Sbostic if (iflag) 25632332Sbostic strcpy(pattern, fold(pattern)); 25732332Sbostic /* 25832332Sbostic * If the pattern is a plain string, just run boyer-moore. If it 25932332Sbostic * consists of meta-free alternatives, run "superimposed" bmg. 26032332Sbostic * Otherwise, find best string, and compile pattern for regexec(). 26132332Sbostic */ 26232332Sbostic if (strpbrk(pattern, META) == NULL) { /* do boyer-moore only */ 26332332Sbostic boyonly++; 26432332Sbostic patboy = pattern; 26532332Sbostic } else { 26632332Sbostic if ((patboy = alternate(pattern)) != NULL) 26732332Sbostic boyonly++; 26832332Sbostic else { 26932332Sbostic if ((patboy = isolate(pattern)) == NULL) 27032332Sbostic kernighan(argv); /* expr too involved */ 27132332Sbostic #ifndef NOKANJI 27232332Sbostic for (c = 0; pattern[c] != EOS; c++) 27332332Sbostic if (pattern[c] & NONASCII) /* kanji + meta */ 27432332Sbostic kernighan(argv); 27532332Sbostic #endif 27632332Sbostic if ((rspencer = regcomp(pattern)) == NULL) 27732332Sbostic oops("regcomp failure"); 27832332Sbostic } 27932332Sbostic } 28032332Sbostic gosper(patboy); /* "pre-conditioning is wonderful" 28132332Sbostic * -- v. strassen */ 28232332Sbostic 28332332Sbostic if ((firstfile = optind) >= argc) { 28432332Sbostic /* Grep standard input */ 28532332Sbostic if (lflag) /* We don't know its name! */ 28632332Sbostic exit(1); 28732332Sbostic egsecute((char *) NULL); 28832332Sbostic } else { 28932332Sbostic while (optind < argc) { 29032332Sbostic egsecute(argv[optind]); 29132332Sbostic optind++; 29232332Sbostic if (firstflag && (nsuccess == 1)) 29332332Sbostic break; 29432332Sbostic } 29532332Sbostic } 29632332Sbostic exit((nsuccess == 2) ? 2 : (nsuccess == 0)); 29732332Sbostic } 29832332Sbostic 29932332Sbostic char * 30032332Sbostic pfile(pfname) /* absorb expression from file */ 30132332Sbostic char *pfname; 30232332Sbostic { 30332337Sbostic int fd; 30432332Sbostic struct stat patstat; 30532332Sbostic static char *pat; 30632332Sbostic 30732337Sbostic if ((fd = open(pfname, O_RDONLY, 0)) < 0) 30832332Sbostic oops("can't read pattern file"); 30932337Sbostic if (fstat(fd, &patstat) != 0) 31032332Sbostic oops("can't stat pattern file"); 31132332Sbostic if (patstat.st_size > PATSIZE) { 31232332Sbostic if (fgrepflag) { /* defer to unix version */ 31332332Sbostic fgrepold++; 31432332Sbostic return "dummy"; 31532332Sbostic } else 31632332Sbostic oops("pattern file too big"); 31732332Sbostic } 31832332Sbostic if ((pat = malloc((unsigned) patstat.st_size + 1)) == NULL) 31932332Sbostic oops("out of memory to read pattern file"); 32032337Sbostic if (patstat.st_size != read(fd, pat, (int)patstat.st_size)) 32132332Sbostic oops("error reading pattern file"); 32232337Sbostic (void) close(fd); 32332332Sbostic 32432332Sbostic pat[patstat.st_size] = EOS; 32532332Sbostic if (pat[patstat.st_size - 1] == NL) /* NOP for egrep; helps grep */ 32632332Sbostic pat[patstat.st_size - 1] = EOS; 32732332Sbostic 32832332Sbostic if (nlcount(pat, &pat[patstat.st_size]) > NALT) { 32932332Sbostic if (fgrepflag) 33032332Sbostic fgrepold++; /* "what's it all about, alfie?" */ 33132332Sbostic else 33232332Sbostic egrepold++; 33332332Sbostic } 33432332Sbostic return (pat); 33532332Sbostic } 33632332Sbostic 33732332Sbostic egsecute(file) 33832332Sbostic char *file; 33932332Sbostic { 34032332Sbostic if (file == NULL) 34132332Sbostic fd = 0; 34232332Sbostic else if ((fd = open(file, 0)) <= 0) { 34332332Sbostic fprintf(stderr, "%s: can't open %s\n", progname, file); 34432332Sbostic nsuccess = 2; 34532332Sbostic return; 34632332Sbostic } 34732332Sbostic chimaera(file, patboy); 34832332Sbostic 34932332Sbostic if (!boyonly && !flushflag && file != NULL) 35032332Sbostic flushmatches(); 35132332Sbostic if (file != NULL) 35232332Sbostic close(fd); 35332332Sbostic } 35432332Sbostic 35532332Sbostic chimaera(file, pat) /* "reach out and boyer-moore search someone" */ 35632332Sbostic char *file, *pat; /* -- soon-to-be-popular bumper sticker */ 35732332Sbostic { 35832332Sbostic register char *k, *strend, *s; 35932332Sbostic register int j, count; 36032332Sbostic register int *deltazero = delta0; 36132332Sbostic int patlen = altmin; 36232332Sbostic char *t; 36332332Sbostic char *gotamatch(), *kanji(), *linesave(), *submatch(); 36432332Sbostic 36532332Sbostic nleftover = boyfound = flushflag = 0; 36632332Sbostic nline = 1L; 36732332Sbostic prevmatch = 0; 36832332Sbostic nmatch = counted = rxcount = 0L; 36932332Sbostic 37032332Sbostic while ((count = read(fd, str + nleftover, BUFSIZE - nleftover)) > 0) { 37132332Sbostic 37232332Sbostic counted += count; 37332332Sbostic strend = linesave(str, count); 37432332Sbostic 37532332Sbostic for (k = str + patlen - 1; k < strend;) { 37632332Sbostic /* 37732332Sbostic * for a large class of patterns, upwards of 80% of 37832332Sbostic * match time is spent on the next line. we beat 37932332Sbostic * existing microcode (vax 'matchc') this way. 38032332Sbostic */ 38132332Sbostic while ((k += deltazero[*(unsigned char *) k]) < strend); 38232332Sbostic if (k < (str + LARGE)) 38332332Sbostic break; 38432332Sbostic k -= LARGE; 38532332Sbostic 38632332Sbostic if (altflag) { 38732332Sbostic /* 38832332Sbostic * Parallel Boyer-Moore. Check whether each 38932332Sbostic * of the previous <altmin> chars COULD be 39032332Sbostic * from one of the alternative strings. 39132332Sbostic */ 39232332Sbostic s = k - 1; 39332332Sbostic j = altmin; 39432332Sbostic while (altset[--j][(unsigned char) 39532332Sbostic cmap[*(unsigned char *) s--]]); 39632332Sbostic /* 39732332Sbostic * quick test fails. in this life, compare 39832332Sbostic * 'em all. but, a "reverse trie" would 39932332Sbostic * attenuate worst case (linear w/delta2?). 40032332Sbostic */ 40132332Sbostic if (--j < 0) { 40232332Sbostic count = nalt - 1; 40332332Sbostic do { 40432332Sbostic s = k; 40532332Sbostic j = altlen[count]; 40632332Sbostic t = altpat[count]; 40732332Sbostic 40832332Sbostic while 40932332Sbostic (cmap[*(unsigned char *) s--] 41032332Sbostic == t[--j]); 41132332Sbostic if (j < 0) 41232332Sbostic break; 41332332Sbostic } 41432332Sbostic while (count--); 41532332Sbostic } 41632332Sbostic } else { 41732332Sbostic /* One string -- check it */ 41832332Sbostic j = patlen - 1; 41932332Sbostic s = k - 1; 42032332Sbostic while (cmap[*(unsigned char *) s--] == pat[--j]); 42132332Sbostic } 42232332Sbostic /* 42332332Sbostic * delta-less shortcut for literati. short shrift for 42432332Sbostic * genetic engineers? 42532332Sbostic */ 42632332Sbostic if (j >= 0) { 42732332Sbostic k++; /* no match; restart next char */ 42832332Sbostic continue; 42932332Sbostic } 43032332Sbostic k = submatch(file, pat, str, strend, k, count); 43132332Sbostic if (k == NULL) 43232332Sbostic return; 43332332Sbostic } 43432332Sbostic if (nflag) { 43532332Sbostic if (prevmatch) 43632332Sbostic nline = prevnline + nlcount(prevloc, k); 43732332Sbostic else 43832332Sbostic nline = nline + nlcount(str, k); 43932332Sbostic prevmatch = 0; 44032332Sbostic } 44132332Sbostic strncpy(str, linetemp, nleftover); 44232332Sbostic } 44332332Sbostic if (cflag) { 44432332Sbostic /* Bug from old grep: -c overrides -h. We fix the bug. */ 44532332Sbostic if (!hflag) 44632332Sbostic printf("%s:", file); 44732332Sbostic printf("%ld\n", nmatch); 44832332Sbostic } 44932332Sbostic } 45032332Sbostic 45132332Sbostic char * 45232332Sbostic linesave(str, count) /* accumulate partial line at end of buffer */ 45332332Sbostic char str[]; 45432332Sbostic register int count; 45532332Sbostic { 45632332Sbostic register int j; 45732332Sbostic 45832332Sbostic count += nleftover; 45932332Sbostic if (count != BUFSIZE && fd != 0) 46032332Sbostic str[count++] = NL; /* insurance for broken last line */ 46132332Sbostic str[count] = EOS; 46232332Sbostic for (j = count - 1; str[j] != NL && j >= 0;) 46332332Sbostic j--; 46432332Sbostic /* 46532332Sbostic * break up these lines: long line (> BUFSIZE), last line of file, or 46632332Sbostic * short return from read(), as from tee(1) input 46732332Sbostic */ 46832332Sbostic if (j < 0 && (count == (BUFSIZE - nleftover))) { 46932332Sbostic str[count++] = NL; 47032332Sbostic str[count] = EOS; 47132332Sbostic linetemp[0] = EOS; 47232332Sbostic nleftover = 0; 47332332Sbostic return (str + count); 47432332Sbostic } else { 47532332Sbostic nleftover = count - j - 1; 47632332Sbostic strncpy(linetemp, str + j + 1, nleftover); 47732332Sbostic return (str + j); 47832332Sbostic } 47932332Sbostic } 48032332Sbostic 48132332Sbostic /* 48232332Sbostic * Process partial match. First check for mis-aligned Kanji, then match line 48332332Sbostic * against full compiled r.e. if statistics do not warrant handing off to 48432332Sbostic * standard egrep. 48532332Sbostic */ 48632332Sbostic char * 48732332Sbostic submatch(file, pat, str, strend, k, altindex) 48832332Sbostic char file[], pat[], str[]; 48932332Sbostic register char *strend, *k; 49032332Sbostic int altindex; 49132332Sbostic { 49232332Sbostic register char *s; 49332332Sbostic char *t, c; 49432332Sbostic 49532332Sbostic t = k; 49632332Sbostic s = ((altflag) ? k - altlen[altindex] + 1 : k - altmin + 1); 49732332Sbostic #ifndef NOKANJI 49832332Sbostic c = ((altflag) ? altpat[altindex][0] : pat[0]); 49932332Sbostic if (c & NONASCII) 50032332Sbostic if ((s = kanji(str, s, k)) == NULL) 50132332Sbostic return (++k); /* reject false kanji */ 50232332Sbostic #endif 50332332Sbostic do; 50432332Sbostic while (*s != NL && --s >= str); 50532332Sbostic k = s + 1; /* now at line start */ 50632332Sbostic 50732332Sbostic if (boyonly) 50832332Sbostic return (gotamatch(file, k)); 50932332Sbostic 51032332Sbostic incount = counted - (strend - k); 51132332Sbostic if (boyfound++ == FIRSTFEW) 51232332Sbostic execstrategy(file); 51332332Sbostic 51432332Sbostic s = t; 51532332Sbostic do 51632332Sbostic rxcount++; 51732332Sbostic while (*s++ != NL); 51832332Sbostic *--s = EOS; 51932332Sbostic /* 52032332Sbostic * "quick henry -- the flit" (after theodor geisel) 52132332Sbostic */ 52232332Sbostic if (regexec(rspencer, ((iflag) ? fold(k) : k)) == 1) { 52332332Sbostic *s = NL; 52432332Sbostic if (gotamatch(file, k) == NULL) 52532332Sbostic return (NULL); 52632332Sbostic } 52732332Sbostic *s = NL; 52832332Sbostic return (s + 1); 52932332Sbostic } 53032332Sbostic 531*32338Sbostic #ifndef NOKANJI 53232332Sbostic /* 53332332Sbostic * EUC code disambiguation -- scan backwards to first 7-bit code, while 53432332Sbostic * counting intervening 8-bit codes. If odd, reject unaligned Kanji pattern. 53532332Sbostic * SS2/3 checks are for intermixed Japanase Katakana or Kanji2. 53632332Sbostic */ 53732332Sbostic char * 53832332Sbostic kanji(str, s, k) 53932332Sbostic register char *str, *s, *k; 54032332Sbostic { 54132332Sbostic register int j = 0; 54232332Sbostic 54332332Sbostic for (s--; s >= str; s--) { 54432332Sbostic if (*s == SS2 || *s == SS3 || (*s & NONASCII) == 0) 54532332Sbostic break; 54632332Sbostic j++; 54732332Sbostic } 54832332Sbostic #ifndef CHINESE 54932332Sbostic if (*s == SS2) 55032332Sbostic j -= 1; 55132332Sbostic #endif CHINESE 55232332Sbostic return ((j & 01) ? NULL : k); 55332332Sbostic } 554*32338Sbostic #endif 55532332Sbostic 55632332Sbostic /* 55732332Sbostic * Compute "Boyer-Moore" delta table -- put skip distance in delta0[c] 55832332Sbostic */ 55932332Sbostic gosper(pattern) 56032332Sbostic char *pattern; /* ... HAKMEM lives ... */ 56132332Sbostic { 56232332Sbostic register int i, j; 56332332Sbostic unsigned char c; 56432332Sbostic 56532332Sbostic /* Make one-string case look like simple alternatives case */ 56632332Sbostic if (!altflag) { 56732332Sbostic nalt = 1; 56832332Sbostic altmin = altlen[0] = strlen(pattern); 56932332Sbostic strcpy(altpat[0], pattern); 57032332Sbostic } 57132332Sbostic /* For chars that aren't in any string, skip by string length. */ 57232332Sbostic for (j = 0; j < 256; j++) { 57332332Sbostic delta0[j] = altmin; 57432332Sbostic cmap[j] = j; /* Sneak in initialization of cmap */ 57532332Sbostic } 57632332Sbostic 57732332Sbostic /* For chars in a string, skip distance from char to end of string. */ 57832332Sbostic /* (If char appears more than once, skip minimum distance.) */ 57932332Sbostic for (i = 0; i < nalt; i++) 58032332Sbostic for (j = 0; j < altlen[i] - 1; j++) { 58132332Sbostic c = altpat[i][j]; 58232332Sbostic delta0[c] = MIN(delta0[c], altlen[i] - j - 1); 58332332Sbostic if (iflag && islower((int) c)) 58432332Sbostic delta0[toupper((int) c)] = delta0[c]; 58532332Sbostic } 58632332Sbostic 58732332Sbostic /* For last char of each string, fall out of search loop. */ 58832332Sbostic for (i = 0; i < nalt; i++) { 58932332Sbostic c = altpat[i][altlen[i] - 1]; 59032332Sbostic delta0[c] = LARGE; 59132332Sbostic if (iflag && islower((int) c)) 59232332Sbostic delta0[toupper((int) c)] = LARGE; 59332332Sbostic } 59432332Sbostic if (iflag) 59532332Sbostic for (j = 'A'; j <= 'Z'; j++) 59632332Sbostic cmap[j] = tolower((int) j); 59732332Sbostic } 59832332Sbostic 59932332Sbostic /* 60032332Sbostic * Print, count, or stop on full match. Result is either the location for 60132332Sbostic * continued search, or NULL to stop. 60232332Sbostic */ 60332332Sbostic char * 60432332Sbostic gotamatch(file, s) 60532332Sbostic register char *file, *s; 60632332Sbostic { 60732332Sbostic char *savematch(); 60832332Sbostic int squirrel = 0; /* nonzero to squirrel away FIRSTFEW matches */ 60932332Sbostic 61032332Sbostic nmatch++; 61132332Sbostic nsuccess = 1; 61232332Sbostic if (!boyonly && boyfound <= FIRSTFEW && file != NULL) 61332332Sbostic squirrel = 1; 61432332Sbostic 61532332Sbostic if (sflag) 61632332Sbostic return (NULL); /* -s usurps all flags (unlike some versions) */ 61732332Sbostic if (cflag) { /* -c overrides -l, we guess */ 61832332Sbostic do; 61932332Sbostic while (*s++ != NL); 62032332Sbostic } else if (lflag) { 62132332Sbostic puts(file); 62232332Sbostic return (NULL); 62332332Sbostic } else { 62432332Sbostic if (!hflag) 62532332Sbostic if (!squirrel) 62632332Sbostic printf("%s:", file); 62732332Sbostic else 62832332Sbostic sprintf(preamble, "%s:", file); 62932332Sbostic if (nflag) { 63032332Sbostic if (prevmatch) 63132332Sbostic prevnline = prevnline + nlcount(prevloc, s); 63232332Sbostic else 63332332Sbostic prevnline = nline + nlcount(str, s); 63432332Sbostic prevmatch = 1; 63532332Sbostic 63632332Sbostic if (!squirrel) 63732332Sbostic printf("%ld:", prevnline); 63832332Sbostic else 63932332Sbostic sprintf(preamble + strlen(preamble), 64032332Sbostic "%ld:", prevnline); 64132332Sbostic } 64232332Sbostic if (!squirrel) { 64332332Sbostic do 64432332Sbostic putchar(*s); 64532332Sbostic while (*s++ != NL); 64632332Sbostic } else 64732332Sbostic s = savematch(s); 64832332Sbostic 64932332Sbostic if (nflag) 65032332Sbostic prevloc = s - 1; 65132332Sbostic } 65232332Sbostic return ((firstflag && !cflag) ? NULL : s); 65332332Sbostic } 65432332Sbostic 65532332Sbostic char * 65632332Sbostic fold(line) 65732332Sbostic char *line; 65832332Sbostic { 65932332Sbostic static char fline[BUFSIZE]; 66032332Sbostic register char *s, *t = fline; 66132332Sbostic 66232332Sbostic for (s = line; *s != EOS; s++) 66332332Sbostic *t++ = (isupper((int) *s) ? (char) tolower((int) *s) : *s); 66432332Sbostic *t = EOS; 66532332Sbostic return (fline); 66632332Sbostic } 66732332Sbostic 66832332Sbostic strindex(s, t) /* the easy way, as in K&P, p. 192 */ 66932332Sbostic char *s, *t; 67032332Sbostic { 67132332Sbostic int i, n; 67232332Sbostic 67332332Sbostic n = strlen(t); 67432332Sbostic for (i = 0; s[i] != '\0'; i++) 67532332Sbostic if (strncmp(s + i, t, n) == 0) 67632332Sbostic return (i); 67732332Sbostic return (-1); 67832332Sbostic } 67932332Sbostic 68032332Sbostic char * 68132332Sbostic grepxlat(pattern) /* grep pattern meta conversion */ 68232332Sbostic char *pattern; 68332332Sbostic { 68432332Sbostic register char *p, *s; 68532332Sbostic static char newpat[BUFSIZE]; 68632332Sbostic 68732332Sbostic for (s = newpat, p = pattern; *p != EOS;) { 68832332Sbostic if (*p == '\\') { /* skip escapes ... */ 68932332Sbostic *s++ = *p++; 69032332Sbostic if (*p) 69132332Sbostic *s++ = *p++; 69232332Sbostic } else if (*p == '[') { /* ... and char classes */ 69332332Sbostic while (*p != EOS && *p != ']') 69432332Sbostic *s++ = *p++; 69532332Sbostic } else if (strchr("+?|()", *p) != NULL) { 69632332Sbostic *s++ = '\\'; /* insert protection */ 69732332Sbostic *s++ = *p++; 69832332Sbostic } else 69932332Sbostic *s++ = *p++; 70032332Sbostic } 70132332Sbostic *s = EOS; 70232332Sbostic grepflag = ((patind) ? 0 : 1); 70332332Sbostic return (newpat); 70432332Sbostic } 70532332Sbostic 70632332Sbostic /* 70732332Sbostic * Test for simple alternation. Result is NULL if it's not so simple, or is 70832332Sbostic * a pointer to the first string if it is. Warning: sscanf size is a 70932332Sbostic * fixpoint, beyond which the speedup linearity starts to break down. In the 71032332Sbostic * wake of the elegant aho/corrasick "trie"-based fgrep, generalizing 71132332Sbostic * altpat[] to arbitrary size is not useful. 71232332Sbostic */ 71332332Sbostic char * 71432332Sbostic alternate(regexpr) 71532332Sbostic char *regexpr; 71632332Sbostic { 71732332Sbostic register i, j, min; 71832332Sbostic unsigned char c; 71932332Sbostic char oflow[100]; 72032332Sbostic 72132332Sbostic if (fgrepflag && strchr(regexpr, '|')) 72232332Sbostic return (NULL); 72332332Sbostic 72432332Sbostic i = strlen(regexpr); 72532332Sbostic for (j = 0; j < i; j++) 72632332Sbostic if (regexpr[j] == NL) 72732332Sbostic regexpr[j] = '|'; 72832332Sbostic 72932332Sbostic if (!fgrepflag) { 73032332Sbostic if (strchr(regexpr, '|') == NULL || regexpr[0] == '|') 73132332Sbostic return (NULL); 73232332Sbostic if (strpbrk(regexpr, "^$.[]()?+*\\") != NULL 73332332Sbostic || strindex(regexpr, "||") >= 0) 73432332Sbostic return (NULL); 73532332Sbostic } 73632332Sbostic oflow[0] = EOS; 73732332Sbostic nalt = sscanf(regexpr, "%[^|]|%[^|]|%[^|]|%[^|]|%[^|]|%[^|]|%[^|]|%[^|]", 73832332Sbostic altpat[0], altpat[1], altpat[2], altpat[3], altpat[4], altpat[5], altpat[6], oflow); 73932332Sbostic 74032332Sbostic if (nalt < 1 || oflow[0] != EOS) 74132332Sbostic return (NULL); 74232332Sbostic 74332332Sbostic altmin = NMUSH; 74432332Sbostic for (j = 0; j < nalt; j++) { 74532332Sbostic min = altlen[j] = strlen(altpat[j]); 74632332Sbostic if (min > ALTSIZE) 74732332Sbostic return (NULL); 74832332Sbostic altmin = MIN(altmin, min); 74932332Sbostic } 75032332Sbostic if (nalt > 1) { /* build superimposed "pre-match" sets per 75132332Sbostic * char */ 75232332Sbostic altflag++; 75332332Sbostic for (j = 0; j < nalt; j++) 75432332Sbostic for (i = 0; i < altmin; i++) { 75532332Sbostic c = altpat[j][altlen[j] - altmin + i]; 75632332Sbostic altset[i + 1][c] = 1; /* offset for sentinel */ 75732332Sbostic } 75832332Sbostic } 75932332Sbostic return (altpat[0]); 76032332Sbostic } 76132332Sbostic 76232332Sbostic /* 76332332Sbostic * Grapple with the dfa (std egrep) vs. ndfa (regexp) tradeoff. Criteria to 76432332Sbostic * determine whether to use dfa-based egrep: We do FIRSTFEW matches with 76532332Sbostic * regexec(). If Boyer-Moore up to now matched more than PUNTPERCENT 76632332Sbostic * of the input, the r.e. is likely to be underspecified, so do old *grep, 76732332Sbostic * which is faster on complex patterns than regexp(). At FIRSTFEW, 76832332Sbostic * dump the saved matches collected by savematch(). They are saved 76932332Sbostic * so that a "PUNT" can "rewind" to ignore them. Stdin is problematic, 77032332Sbostic * since it's hard to rewind. 77132332Sbostic */ 77232332Sbostic 77332332Sbostic execstrategy(file) 77432332Sbostic char *file; 77532332Sbostic { 77632332Sbostic int pctmatch; 77732332Sbostic 77832332Sbostic pctmatch = (100 * rxcount) / incount; 77932332Sbostic if (pctmatch > PUNTPERCENT && file != NULL) 78032332Sbostic kernighan(args); 78132332Sbostic if (file != NULL) 78232332Sbostic flushmatches(); 78332332Sbostic } 78432332Sbostic 78532332Sbostic nlcount(bstart, bstop) /* flail interval to totalize newlines. */ 78632332Sbostic char *bstart, *bstop; 78732332Sbostic { 78832332Sbostic register char *s = bstart; 78932332Sbostic register char *t = bstop; 79032332Sbostic register int count = 0; 79132332Sbostic 79232332Sbostic do { /* loop unroll for older architectures */ 79332332Sbostic if (*t == NL) /* ... ask ames!jaw for sample code */ 79432332Sbostic count++; 79532332Sbostic } while (t-- > s); 79632332Sbostic 79732332Sbostic return (count); 79832332Sbostic } 79932332Sbostic 80032332Sbostic char * 80132332Sbostic isolate(regexpr) /* isolate longest metacharacter-free string */ 80232332Sbostic char *regexpr; 80332332Sbostic { 80432332Sbostic char *dummyexpr; 80532332Sbostic 80632332Sbostic /* 80732332Sbostic * We add (.)* because Henry's regcomp only figures regmust if it 80832332Sbostic * sees a leading * pattern. Foo! 80932332Sbostic */ 81032332Sbostic dummyexpr = malloc((unsigned) strlen(regexpr) + 5); 81132332Sbostic sprintf(dummyexpr, "(.)*%s", regexpr); 81232332Sbostic if ((rspencer = regcomp(dummyexpr)) == NULL) 81332332Sbostic kernighan(args); 81432332Sbostic return (rspencer->regmust); 81532332Sbostic } 81632332Sbostic 81732332Sbostic char *matches[FIRSTFEW]; 81832332Sbostic static int mcount = 0; 81932332Sbostic 82032332Sbostic char * 82132332Sbostic savematch(s) /* horde matches during statistics gathering */ 82232332Sbostic register char *s; 82332332Sbostic { 82432332Sbostic char *p; 82532332Sbostic char *start = s; 82632332Sbostic int msize = 0; 82732332Sbostic int psize = strlen(preamble); 82832332Sbostic 82932332Sbostic while (*s++ != NL) 83032332Sbostic msize++; 83132332Sbostic *--s = EOS; 83232332Sbostic 83332332Sbostic p = malloc((unsigned) msize + 1 + psize); 83432332Sbostic strcpy(p, preamble); 83532332Sbostic strcpy(p + psize, start); 83632332Sbostic matches[mcount++] = p; 83732332Sbostic 83832332Sbostic preamble[0] = 0; 83932332Sbostic *s = NL; 84032332Sbostic return (s); 84132332Sbostic } 84232332Sbostic 84332332Sbostic flushmatches() 84432332Sbostic { 84532332Sbostic int n; 84632332Sbostic 84732332Sbostic flushflag = 1; 84832332Sbostic for (n = 0; n < mcount; n++) 84932332Sbostic printf("%s\n", matches[n]); 85032332Sbostic mcount = 0; 85132332Sbostic } 85232332Sbostic 85332332Sbostic oops(message) 85432332Sbostic char *message; 85532332Sbostic { 85632332Sbostic fprintf(stderr, "%s: %s\n", progname, message); 85732332Sbostic exit(2); 85832332Sbostic } 85932332Sbostic 86032332Sbostic kernighan(args) /* "let others do the hard part ..." */ 86132332Sbostic char *args[]; 86232332Sbostic { 86332332Sbostic /* 86432332Sbostic * We may have already run grep on some of the files; remove them 86532332Sbostic * from the arg list we pass on. Note that we can't delete them 86632332Sbostic * totally because the number of file names affects the output 86732332Sbostic * (automatic -h). 86832332Sbostic */ 86932332Sbostic /* better would be fork/exec per punted file -- jaw */ 87032332Sbostic 87132332Sbostic while (firstfile && optind > firstfile) 87232332Sbostic args[firstfile++] = "/dev/null"; 87332332Sbostic if (patind) 87432332Sbostic args[patind] = pattern; 87532332Sbostic fflush(stdout); 87632332Sbostic 87732332Sbostic if (grepflag) 87832332Sbostic execvp(GREPSTD, args), oops("can't exec old 'grep'"); 87932332Sbostic else if (fgrepflag) 88032332Sbostic execvp(FGREPSTD, args), oops("can't exec old 'fgrep'"); 88132332Sbostic else 88232332Sbostic execvp(EGREPSTD, args), oops("can't exec old 'egrep'"); 88332332Sbostic } 884