132332Sbostic #ifndef lint 2*32502Sbostic static char sccsid[] = "@(#)egrep.c 5.9 (Berkeley) 10/22/87"; 332332Sbostic #endif not lint 432332Sbostic 532332Sbostic /* 632332Sbostic Hybrid Boyer/Moore/Gosper-assisted 'grep/egrep/fgrep' search, with delta0 732332Sbostic table as in original paper (CACM, October, 1977). No delta1 or delta2. 832332Sbostic According to experiment (Horspool, Soft. Prac. Exp., 1982), delta2 is of 932332Sbostic minimal practical value. However, to improve for worst case input, 1032332Sbostic integrating the improved Galil strategies (Apostolico/Giancarlo, SIAM. J. 1132332Sbostic Comput., Feb. 1986) deserves consideration. 1232332Sbostic 1332332Sbostic Method: extract longest metacharacter-free string from expression. 1432332Sbostic this is done using a side-effect from henry spencer's regcomp(). 1532332Sbostic use boyer-moore to match such, then pass submatching lines 1632332Sbostic to either regexp() or standard 'egrep', depending on certain 1732332Sbostic criteria within execstrategy() below. [this tradeoff is due 1832332Sbostic to the general slowness of the regexp() nondeterministic 1932332Sbostic machine on complex expressions, as well as the startup time 2032332Sbostic of standard 'egrep' on short files.] alternatively, one may 2132332Sbostic change the vendor-supplied 'egrep' automaton to include 2232332Sbostic boyer-moore directly. see accompanying writeup for discussion 2332332Sbostic of kanji expression treatment. 2432332Sbostic 2532332Sbostic late addition: apply trickbag for fast match of simple 2632332Sbostic alternations (sublinear, in common low-cardinality cases). 2732332Sbostic trap fgrep into this lair. 2832332Sbostic 2932332Sbostic gnu additions: -f, newline as |, \< and \> [in regexec()], more 3032332Sbostic comments. inspire better dfa exec() strategy. 3132332Sbostic serious testing and help with special cases. 3232332Sbostic 3332332Sbostic Algorithm amalgam summary: 3432332Sbostic 3532332Sbostic dfa e?grep (aho/thompson) 3632332Sbostic ndfa regexp() (spencer/aho) 3732332Sbostic bmg (boyer/moore/gosper) 3832332Sbostic "superimposed" bmg (jaw) 3932332Sbostic fgrep (aho/corrasick) 4032332Sbostic 4132332Sbostic sorry, but the knuth/morris/pratt machine, horspool's 4232332Sbostic "frequentist" code, and the rabin/karp matcher, however cute, 4332332Sbostic just don't cut it for this production. 4432332Sbostic 4532332Sbostic James A. Woods Copyright (c) 1986 4632332Sbostic NASA Ames Research Center 4732332Sbostic */ 4832332Sbostic #include <stdio.h> 4932332Sbostic #include <ctype.h> 5032332Sbostic #include <sys/types.h> 5132332Sbostic #include <sys/stat.h> 5232337Sbostic #include <sys/file.h> 5332332Sbostic #include <regexp.h> /* must be henry spencer's version */ 5432332Sbostic 5532332Sbostic #define MIN(A, B) ((A) > (B) ? (B) : (A)) 5632332Sbostic 5732332Sbostic #ifdef SLOWSYS 5832332Sbostic #define read xread 5932332Sbostic #endif 6032332Sbostic /* 6132332Sbostic * define existing [ef]?grep program locations below for use by execvp(). 6232332Sbostic * [execlp() would be used were it not for the possibility of 6332332Sbostic * installation-dependent recursion.] 6432332Sbostic */ 6532332Sbostic #ifndef EGREPSTD 6632332Sbostic #define EGREPSTD "/usr/bin/egrep" 6732332Sbostic #endif 6832332Sbostic #ifndef GREPSTD 6932332Sbostic #define GREPSTD "/bin/grep" 7032332Sbostic #endif 7132332Sbostic #ifndef FGREPSTD 7232332Sbostic #define FGREPSTD "/usr/bin/fgrep" 7332332Sbostic #endif 7432332Sbostic 7532332Sbostic #define BUFSIZE 8192 /* make higher for cray */ 7632332Sbostic #define PATSIZE 6000 7732332Sbostic #define LARGE BUFSIZE + PATSIZE 7832332Sbostic 7932332Sbostic #define NALT 7 /* tied to scanf() size in alternate() */ 8032332Sbostic #define NMUSH 6 /* loosely relates to expected alt length */ 8132332Sbostic 8232332Sbostic #define FIRSTFEW 33 /* Always do FIRSTFEW matches with regexec() */ 8332332Sbostic #define PUNTPERCENT 10 /* After FIRSTFEW, if PUNTPERCENT of the input 8432332Sbostic * was processed by regexp(), exec std egrep. */ 8532332Sbostic #define NL '\n' 8632332Sbostic #define EOS '\0' 8732332Sbostic #define NONASCII 0200 /* Bit mask for Kanji non-ascii chars */ 8832332Sbostic #define META "\n^$.[]()?+*|\\" /* egrep meta-characters */ 8932332Sbostic #define SS2 '\216' /* EUC Katakana (or Chinese2) prefix */ 9032332Sbostic #define SS3 '\217' /* EUC Kanji2 (or Chinese3) prefix */ 9132332Sbostic 9232332Sbostic extern char *optarg; 9332332Sbostic extern int optind; 9432332Sbostic char *progname; 9532332Sbostic 9632332Sbostic int cflag, iflag, eflag, fflag, lflag, nflag; /* SVID flags */ 9732332Sbostic int sflag, hflag; /* v7, v8, bsd */ 9832332Sbostic 9932332Sbostic int firstflag; /* Stop at first match */ 10032332Sbostic int grepflag; /* Called as "grep" */ 10132332Sbostic int fgrepflag; /* Called as "fgrep" */ 10232332Sbostic int altflag; /* Simple alternation in pattern */ 10332332Sbostic int boyonly; /* No regexp needed -- all simple */ 10432332Sbostic int flushflag; 10532332Sbostic int grepold, egrepold, fgrepold; 10632332Sbostic 10732332Sbostic int nalt; /* Number of alternatives */ 10832332Sbostic int nsuccess; /* 1 for match, 2 for error */ 10932332Sbostic int altmin; /* Minimum length of all the alternate 11032332Sbostic * strings */ 11132332Sbostic int firstfile; /* argv index of first file argument */ 11232332Sbostic int patind; /* argv index of pattern */ 11332332Sbostic long nmatch; /* Number of matches in this file */ 11432332Sbostic long incount, counted; /* Amount of input consumed */ 11532332Sbostic long rxcount; /* Bytes of input processed by regexec() */ 11632332Sbostic int boyfound; /* accumulated partial matches (tripped by 11732332Sbostic * FIRSTFEW) */ 11832332Sbostic int prevmatch; /* next three lines aid fast -n */ 11932332Sbostic long nline, prevnline; 12032332Sbostic char *prevloc; 12132332Sbostic 12232332Sbostic regexp *rspencer; 12332332Sbostic char *pattern; 12432332Sbostic char *patboy; /* Pattern for simple Boyer-Moore */ 12532332Sbostic char *patfile; /* Filename containing pattern(s) */ 12632332Sbostic 12732332Sbostic int delta0[256]; /* Boyer-Moore algorithm core */ 12832332Sbostic char cmap[256]; /* Usually 0-255, but if -i, maps upper to 12932332Sbostic * lower case */ 13032332Sbostic char str[BUFSIZE + 2]; 13132332Sbostic int nleftover; 13232332Sbostic char linetemp[BUFSIZE]; 13332339Sbostic char *altpat[NALT]; /* alternation component storage */ 13432332Sbostic int altlen[NALT]; 13532332Sbostic short altset[NMUSH + 1][256]; 13632332Sbostic char preamble[200]; /* match prefix (filename, line no.) */ 13732332Sbostic 13832332Sbostic int fd; 13932332Sbostic char * 140*32502Sbostic strchr(), *strrchr(), *strcpy(), *strncpy(), *strpbrk(), *malloc(); 14132332Sbostic char * 14232332Sbostic grepxlat(), *fold(), *pfile(), *alternate(), *isolate(); 14332332Sbostic char **args; 14432332Sbostic 14532332Sbostic main(argc, argv) 14632332Sbostic int argc; 14732332Sbostic char *argv[]; 14832332Sbostic { 14932365Sbostic int c, oflag; 15032332Sbostic int errflag = 0; 15132332Sbostic 15232332Sbostic args = argv; 15332332Sbostic 15432332Sbostic if ((progname = strrchr(argv[0], '/')) != 0) 15532332Sbostic progname++; 15632332Sbostic else 15732332Sbostic progname = argv[0]; 15832332Sbostic if (strcmp(progname, "grep") == 0) 15932332Sbostic grepflag++; 16032365Sbostic else if (strcmp(progname, "fgrep") == 0) 16132332Sbostic fgrepflag++; 16232332Sbostic 16332365Sbostic oflag = 0; 16432365Sbostic while ((c = getopt(argc, argv, "bchie:f:lnosvwxy1")) != EOF) { 16532332Sbostic switch (c) { 16632332Sbostic 16732332Sbostic case 'f': 16832332Sbostic fflag++; 16932332Sbostic patfile = optarg; 17032332Sbostic continue; 17132332Sbostic case 'b': 17232332Sbostic case 'v': 17332332Sbostic egrepold++; /* boyer-moore of little help here */ 17432332Sbostic continue; 17532332Sbostic case 'c': 17632332Sbostic cflag++; 17732332Sbostic continue; 17832332Sbostic case 'e': 17932332Sbostic eflag++; 18032332Sbostic pattern = optarg; 18132332Sbostic continue; 18232332Sbostic case 'h': 18332332Sbostic hflag++; 18432332Sbostic continue; 18532365Sbostic case 'o': 18632365Sbostic oflag++; 18732365Sbostic continue; 18832332Sbostic case '1': /* Stop at very first match */ 18932332Sbostic firstflag++; /* spead freaks only */ 19032332Sbostic continue; 19132332Sbostic case 'i': 19232332Sbostic iflag++; 19332332Sbostic continue; 19432332Sbostic case 'l': 19532332Sbostic lflag++; 19632332Sbostic continue; 19732332Sbostic case 'n': 19832332Sbostic nflag++; 19932332Sbostic continue; 20032332Sbostic case 's': 20132332Sbostic sflag++; 20232332Sbostic continue; 20332332Sbostic case 'w': 20432332Sbostic case 'y': 20532332Sbostic if (!grepflag) 20632332Sbostic errflag++; 20732332Sbostic grepold++; 20832332Sbostic continue; 20932332Sbostic case 'x': /* needs more work, like -b above */ 21032332Sbostic if (!fgrepflag) 21132332Sbostic errflag++; 21232332Sbostic fgrepold++; 21332332Sbostic continue; 21432332Sbostic case '?': 21532332Sbostic errflag++; 21632332Sbostic } 21732332Sbostic } 21832332Sbostic if (errflag || ((argc <= optind) && !fflag && !eflag)) { 21932332Sbostic if (grepflag) 22032365Sbostic oops("usage: grep [-bchilnosvwy] [-e] pattern [file ...]"); 22132332Sbostic else if (fgrepflag) 22232366Sbostic oops("usage: fgrep [-bchilnosvx] {-f patfile | [-e] strings} [file ...]"); 22332332Sbostic else /* encourage SVID options, though we provide 22432332Sbostic * others */ 22532366Sbostic oops("usage: egrep [-bchilnosv] {-f patfile | [-e] pattern} [file ...]"); 22632332Sbostic } 22732332Sbostic patind = optind; 22832332Sbostic if (fflag) 22932332Sbostic pattern = pfile(patfile), patind = 0; 23032332Sbostic else if (!eflag) 23132332Sbostic pattern = argv[optind++]; 23232332Sbostic 23332365Sbostic if (!oflag && (argc - optind) <= 1) /* Filename invisible given < 2 files */ 23432332Sbostic hflag++; 23532332Sbostic if (pattern[0] == EOS) 23632332Sbostic kernighan(argv); /* same as it ever was */ 23732332Sbostic /* 23832332Sbostic * 'grep/egrep' merger -- "old" grep is called to handle: tagged 23932332Sbostic * exprs \( \), word matches \< and \>, -w and -y options, char 24032332Sbostic * classes with '-' at end (egrep bug?), and patterns beginning with 24132332Sbostic * an asterisk (don't ask why). otherwise, characters meaningful to 24232332Sbostic * 'egrep' but not to 'grep' are escaped; the entire expr is then 24332332Sbostic * passed to 'egrep'. 24432332Sbostic */ 24532332Sbostic if (grepflag && !grepold) { 24632332Sbostic if (strindex(pattern, "\\(") >= 0 || 24732332Sbostic strindex(pattern, "\\<") >= 0 || 24832332Sbostic strindex(pattern, "\\>") >= 0 || 24932332Sbostic strindex(pattern, "-]") >= 0 || 25032332Sbostic pattern[0] == '*') /* grep bug */ 25132332Sbostic grepold++; 25232332Sbostic else 25332332Sbostic pattern = grepxlat(pattern); 25432332Sbostic } 25532332Sbostic if (grepold || egrepold || fgrepold) 25632332Sbostic kernighan(argv); 25732332Sbostic 25832332Sbostic if (iflag) 25932332Sbostic strcpy(pattern, fold(pattern)); 26032332Sbostic /* 26132332Sbostic * If the pattern is a plain string, just run boyer-moore. If it 26232332Sbostic * consists of meta-free alternatives, run "superimposed" bmg. 26332332Sbostic * Otherwise, find best string, and compile pattern for regexec(). 26432332Sbostic */ 26532332Sbostic if (strpbrk(pattern, META) == NULL) { /* do boyer-moore only */ 26632332Sbostic boyonly++; 26732332Sbostic patboy = pattern; 26832332Sbostic } else { 26932332Sbostic if ((patboy = alternate(pattern)) != NULL) 27032332Sbostic boyonly++; 27132332Sbostic else { 27232332Sbostic if ((patboy = isolate(pattern)) == NULL) 27332332Sbostic kernighan(argv); /* expr too involved */ 27432332Sbostic #ifndef NOKANJI 27532332Sbostic for (c = 0; pattern[c] != EOS; c++) 27632332Sbostic if (pattern[c] & NONASCII) /* kanji + meta */ 27732332Sbostic kernighan(argv); 27832332Sbostic #endif 27932332Sbostic if ((rspencer = regcomp(pattern)) == NULL) 28032332Sbostic oops("regcomp failure"); 28132332Sbostic } 28232332Sbostic } 28332332Sbostic gosper(patboy); /* "pre-conditioning is wonderful" 28432332Sbostic * -- v. strassen */ 28532332Sbostic 28632332Sbostic if ((firstfile = optind) >= argc) { 28732332Sbostic /* Grep standard input */ 28832332Sbostic if (lflag) /* We don't know its name! */ 28932332Sbostic exit(1); 29032332Sbostic egsecute((char *) NULL); 29132332Sbostic } else { 29232332Sbostic while (optind < argc) { 29332332Sbostic egsecute(argv[optind]); 29432332Sbostic optind++; 29532332Sbostic if (firstflag && (nsuccess == 1)) 29632332Sbostic break; 29732332Sbostic } 29832332Sbostic } 29932332Sbostic exit((nsuccess == 2) ? 2 : (nsuccess == 0)); 30032332Sbostic } 30132332Sbostic 30232332Sbostic char * 30332332Sbostic pfile(pfname) /* absorb expression from file */ 30432332Sbostic char *pfname; 30532332Sbostic { 30632337Sbostic int fd; 30732332Sbostic struct stat patstat; 30832332Sbostic static char *pat; 30932332Sbostic 31032337Sbostic if ((fd = open(pfname, O_RDONLY, 0)) < 0) 31132332Sbostic oops("can't read pattern file"); 31232337Sbostic if (fstat(fd, &patstat) != 0) 31332332Sbostic oops("can't stat pattern file"); 31432332Sbostic if (patstat.st_size > PATSIZE) { 31532332Sbostic if (fgrepflag) { /* defer to unix version */ 31632332Sbostic fgrepold++; 31732332Sbostic return "dummy"; 31832332Sbostic } else 31932332Sbostic oops("pattern file too big"); 32032332Sbostic } 32132332Sbostic if ((pat = malloc((unsigned) patstat.st_size + 1)) == NULL) 32232332Sbostic oops("out of memory to read pattern file"); 32332337Sbostic if (patstat.st_size != read(fd, pat, (int)patstat.st_size)) 32432332Sbostic oops("error reading pattern file"); 32532337Sbostic (void) close(fd); 32632332Sbostic 32732332Sbostic pat[patstat.st_size] = EOS; 32832332Sbostic if (pat[patstat.st_size - 1] == NL) /* NOP for egrep; helps grep */ 32932332Sbostic pat[patstat.st_size - 1] = EOS; 33032332Sbostic 33132332Sbostic if (nlcount(pat, &pat[patstat.st_size]) > NALT) { 33232332Sbostic if (fgrepflag) 33332332Sbostic fgrepold++; /* "what's it all about, alfie?" */ 33432332Sbostic else 33532332Sbostic egrepold++; 33632332Sbostic } 33732332Sbostic return (pat); 33832332Sbostic } 33932332Sbostic 34032332Sbostic egsecute(file) 34132332Sbostic char *file; 34232332Sbostic { 34332332Sbostic if (file == NULL) 34432332Sbostic fd = 0; 34532339Sbostic else if ((fd = open(file, O_RDONLY, 0)) <= 0) { 34632332Sbostic fprintf(stderr, "%s: can't open %s\n", progname, file); 34732332Sbostic nsuccess = 2; 34832332Sbostic return; 34932332Sbostic } 35032332Sbostic chimaera(file, patboy); 35132332Sbostic 35232332Sbostic if (!boyonly && !flushflag && file != NULL) 35332332Sbostic flushmatches(); 35432332Sbostic if (file != NULL) 35532332Sbostic close(fd); 35632332Sbostic } 35732332Sbostic 35832332Sbostic chimaera(file, pat) /* "reach out and boyer-moore search someone" */ 35932332Sbostic char *file, *pat; /* -- soon-to-be-popular bumper sticker */ 36032332Sbostic { 36132332Sbostic register char *k, *strend, *s; 36232332Sbostic register int j, count; 36332332Sbostic register int *deltazero = delta0; 36432332Sbostic int patlen = altmin; 36532332Sbostic char *t; 36632332Sbostic char *gotamatch(), *kanji(), *linesave(), *submatch(); 36732332Sbostic 36832332Sbostic nleftover = boyfound = flushflag = 0; 36932332Sbostic nline = 1L; 37032332Sbostic prevmatch = 0; 37132332Sbostic nmatch = counted = rxcount = 0L; 37232332Sbostic 37332332Sbostic while ((count = read(fd, str + nleftover, BUFSIZE - nleftover)) > 0) { 37432332Sbostic 37532332Sbostic counted += count; 37632332Sbostic strend = linesave(str, count); 37732332Sbostic 37832332Sbostic for (k = str + patlen - 1; k < strend;) { 37932332Sbostic /* 38032332Sbostic * for a large class of patterns, upwards of 80% of 38132332Sbostic * match time is spent on the next line. we beat 38232332Sbostic * existing microcode (vax 'matchc') this way. 38332332Sbostic */ 38432332Sbostic while ((k += deltazero[*(unsigned char *) k]) < strend); 38532332Sbostic if (k < (str + LARGE)) 38632332Sbostic break; 38732332Sbostic k -= LARGE; 38832332Sbostic 38932332Sbostic if (altflag) { 39032332Sbostic /* 39132332Sbostic * Parallel Boyer-Moore. Check whether each 39232332Sbostic * of the previous <altmin> chars COULD be 39332332Sbostic * from one of the alternative strings. 39432332Sbostic */ 39532332Sbostic s = k - 1; 39632332Sbostic j = altmin; 39732332Sbostic while (altset[--j][(unsigned char) 39832332Sbostic cmap[*(unsigned char *) s--]]); 39932332Sbostic /* 40032332Sbostic * quick test fails. in this life, compare 40132332Sbostic * 'em all. but, a "reverse trie" would 40232332Sbostic * attenuate worst case (linear w/delta2?). 40332332Sbostic */ 40432332Sbostic if (--j < 0) { 40532332Sbostic count = nalt - 1; 40632332Sbostic do { 40732332Sbostic s = k; 40832332Sbostic j = altlen[count]; 40932332Sbostic t = altpat[count]; 41032332Sbostic 41132332Sbostic while 41232332Sbostic (cmap[*(unsigned char *) s--] 41332332Sbostic == t[--j]); 41432332Sbostic if (j < 0) 41532332Sbostic break; 41632332Sbostic } 41732332Sbostic while (count--); 41832332Sbostic } 41932332Sbostic } else { 42032332Sbostic /* One string -- check it */ 42132332Sbostic j = patlen - 1; 42232332Sbostic s = k - 1; 42332332Sbostic while (cmap[*(unsigned char *) s--] == pat[--j]); 42432332Sbostic } 42532332Sbostic /* 42632332Sbostic * delta-less shortcut for literati. short shrift for 42732332Sbostic * genetic engineers? 42832332Sbostic */ 42932332Sbostic if (j >= 0) { 43032332Sbostic k++; /* no match; restart next char */ 43132332Sbostic continue; 43232332Sbostic } 43332332Sbostic k = submatch(file, pat, str, strend, k, count); 43432332Sbostic if (k == NULL) 43532332Sbostic return; 43632332Sbostic } 43732332Sbostic if (nflag) { 43832332Sbostic if (prevmatch) 43932332Sbostic nline = prevnline + nlcount(prevloc, k); 44032332Sbostic else 44132332Sbostic nline = nline + nlcount(str, k); 44232332Sbostic prevmatch = 0; 44332332Sbostic } 44432332Sbostic strncpy(str, linetemp, nleftover); 44532332Sbostic } 44632332Sbostic if (cflag) { 44732332Sbostic /* Bug from old grep: -c overrides -h. We fix the bug. */ 44832332Sbostic if (!hflag) 44932332Sbostic printf("%s:", file); 45032332Sbostic printf("%ld\n", nmatch); 45132332Sbostic } 45232332Sbostic } 45332332Sbostic 45432332Sbostic char * 45532332Sbostic linesave(str, count) /* accumulate partial line at end of buffer */ 45632332Sbostic char str[]; 45732332Sbostic register int count; 45832332Sbostic { 45932332Sbostic register int j; 46032332Sbostic 46132332Sbostic count += nleftover; 46232332Sbostic if (count != BUFSIZE && fd != 0) 46332332Sbostic str[count++] = NL; /* insurance for broken last line */ 46432332Sbostic str[count] = EOS; 46532332Sbostic for (j = count - 1; str[j] != NL && j >= 0;) 46632332Sbostic j--; 46732332Sbostic /* 46832332Sbostic * break up these lines: long line (> BUFSIZE), last line of file, or 46932332Sbostic * short return from read(), as from tee(1) input 47032332Sbostic */ 47132332Sbostic if (j < 0 && (count == (BUFSIZE - nleftover))) { 47232332Sbostic str[count++] = NL; 47332332Sbostic str[count] = EOS; 47432332Sbostic linetemp[0] = EOS; 47532332Sbostic nleftover = 0; 47632332Sbostic return (str + count); 47732332Sbostic } else { 47832332Sbostic nleftover = count - j - 1; 47932332Sbostic strncpy(linetemp, str + j + 1, nleftover); 48032332Sbostic return (str + j); 48132332Sbostic } 48232332Sbostic } 48332332Sbostic 48432332Sbostic /* 48532332Sbostic * Process partial match. First check for mis-aligned Kanji, then match line 48632332Sbostic * against full compiled r.e. if statistics do not warrant handing off to 48732332Sbostic * standard egrep. 48832332Sbostic */ 48932332Sbostic char * 49032332Sbostic submatch(file, pat, str, strend, k, altindex) 49132332Sbostic char file[], pat[], str[]; 49232332Sbostic register char *strend, *k; 49332332Sbostic int altindex; 49432332Sbostic { 49532332Sbostic register char *s; 49632332Sbostic char *t, c; 49732332Sbostic 49832332Sbostic t = k; 49932332Sbostic s = ((altflag) ? k - altlen[altindex] + 1 : k - altmin + 1); 50032332Sbostic #ifndef NOKANJI 50132332Sbostic c = ((altflag) ? altpat[altindex][0] : pat[0]); 50232332Sbostic if (c & NONASCII) 50332332Sbostic if ((s = kanji(str, s, k)) == NULL) 50432332Sbostic return (++k); /* reject false kanji */ 50532332Sbostic #endif 50632332Sbostic do; 50732332Sbostic while (*s != NL && --s >= str); 50832332Sbostic k = s + 1; /* now at line start */ 50932332Sbostic 51032332Sbostic if (boyonly) 51132332Sbostic return (gotamatch(file, k)); 51232332Sbostic 51332332Sbostic incount = counted - (strend - k); 51432332Sbostic if (boyfound++ == FIRSTFEW) 51532332Sbostic execstrategy(file); 51632332Sbostic 51732332Sbostic s = t; 51832332Sbostic do 51932332Sbostic rxcount++; 52032332Sbostic while (*s++ != NL); 52132332Sbostic *--s = EOS; 52232332Sbostic /* 52332332Sbostic * "quick henry -- the flit" (after theodor geisel) 52432332Sbostic */ 52532332Sbostic if (regexec(rspencer, ((iflag) ? fold(k) : k)) == 1) { 52632332Sbostic *s = NL; 52732332Sbostic if (gotamatch(file, k) == NULL) 52832332Sbostic return (NULL); 52932332Sbostic } 53032332Sbostic *s = NL; 53132332Sbostic return (s + 1); 53232332Sbostic } 53332332Sbostic 53432338Sbostic #ifndef NOKANJI 53532332Sbostic /* 53632332Sbostic * EUC code disambiguation -- scan backwards to first 7-bit code, while 53732332Sbostic * counting intervening 8-bit codes. If odd, reject unaligned Kanji pattern. 53832332Sbostic * SS2/3 checks are for intermixed Japanase Katakana or Kanji2. 53932332Sbostic */ 54032332Sbostic char * 54132332Sbostic kanji(str, s, k) 54232332Sbostic register char *str, *s, *k; 54332332Sbostic { 54432332Sbostic register int j = 0; 54532332Sbostic 54632332Sbostic for (s--; s >= str; s--) { 54732332Sbostic if (*s == SS2 || *s == SS3 || (*s & NONASCII) == 0) 54832332Sbostic break; 54932332Sbostic j++; 55032332Sbostic } 55132332Sbostic #ifndef CHINESE 55232332Sbostic if (*s == SS2) 55332332Sbostic j -= 1; 55432332Sbostic #endif CHINESE 55532332Sbostic return ((j & 01) ? NULL : k); 55632332Sbostic } 55732338Sbostic #endif 55832332Sbostic 55932332Sbostic /* 56032332Sbostic * Compute "Boyer-Moore" delta table -- put skip distance in delta0[c] 56132332Sbostic */ 56232332Sbostic gosper(pattern) 56332332Sbostic char *pattern; /* ... HAKMEM lives ... */ 56432332Sbostic { 56532332Sbostic register int i, j; 56632332Sbostic unsigned char c; 56732332Sbostic 56832332Sbostic /* Make one-string case look like simple alternatives case */ 56932332Sbostic if (!altflag) { 57032332Sbostic nalt = 1; 57132332Sbostic altmin = altlen[0] = strlen(pattern); 57232339Sbostic altpat[0] = pattern; 57332332Sbostic } 57432332Sbostic /* For chars that aren't in any string, skip by string length. */ 57532332Sbostic for (j = 0; j < 256; j++) { 57632332Sbostic delta0[j] = altmin; 57732332Sbostic cmap[j] = j; /* Sneak in initialization of cmap */ 57832332Sbostic } 57932332Sbostic 58032332Sbostic /* For chars in a string, skip distance from char to end of string. */ 58132332Sbostic /* (If char appears more than once, skip minimum distance.) */ 58232332Sbostic for (i = 0; i < nalt; i++) 58332332Sbostic for (j = 0; j < altlen[i] - 1; j++) { 58432332Sbostic c = altpat[i][j]; 58532332Sbostic delta0[c] = MIN(delta0[c], altlen[i] - j - 1); 58632332Sbostic if (iflag && islower((int) c)) 58732332Sbostic delta0[toupper((int) c)] = delta0[c]; 58832332Sbostic } 58932332Sbostic 59032332Sbostic /* For last char of each string, fall out of search loop. */ 59132332Sbostic for (i = 0; i < nalt; i++) { 59232332Sbostic c = altpat[i][altlen[i] - 1]; 59332332Sbostic delta0[c] = LARGE; 59432332Sbostic if (iflag && islower((int) c)) 59532332Sbostic delta0[toupper((int) c)] = LARGE; 59632332Sbostic } 59732332Sbostic if (iflag) 59832332Sbostic for (j = 'A'; j <= 'Z'; j++) 59932332Sbostic cmap[j] = tolower((int) j); 60032332Sbostic } 60132332Sbostic 60232332Sbostic /* 60332332Sbostic * Print, count, or stop on full match. Result is either the location for 60432332Sbostic * continued search, or NULL to stop. 60532332Sbostic */ 60632332Sbostic char * 60732332Sbostic gotamatch(file, s) 60832332Sbostic register char *file, *s; 60932332Sbostic { 61032332Sbostic char *savematch(); 61132332Sbostic int squirrel = 0; /* nonzero to squirrel away FIRSTFEW matches */ 61232332Sbostic 61332332Sbostic nmatch++; 61432332Sbostic nsuccess = 1; 61532332Sbostic if (!boyonly && boyfound <= FIRSTFEW && file != NULL) 61632332Sbostic squirrel = 1; 61732332Sbostic 61832332Sbostic if (sflag) 61932332Sbostic return (NULL); /* -s usurps all flags (unlike some versions) */ 62032332Sbostic if (cflag) { /* -c overrides -l, we guess */ 62132332Sbostic do; 62232332Sbostic while (*s++ != NL); 62332332Sbostic } else if (lflag) { 62432332Sbostic puts(file); 62532332Sbostic return (NULL); 62632332Sbostic } else { 62732332Sbostic if (!hflag) 62832332Sbostic if (!squirrel) 62932332Sbostic printf("%s:", file); 63032332Sbostic else 631*32502Sbostic (void)sprintf(preamble, "%s:", file); 63232332Sbostic if (nflag) { 63332332Sbostic if (prevmatch) 63432332Sbostic prevnline = prevnline + nlcount(prevloc, s); 63532332Sbostic else 63632332Sbostic prevnline = nline + nlcount(str, s); 63732332Sbostic prevmatch = 1; 63832332Sbostic 63932332Sbostic if (!squirrel) 64032332Sbostic printf("%ld:", prevnline); 64132332Sbostic else 642*32502Sbostic (void)sprintf(preamble + strlen(preamble), 64332332Sbostic "%ld:", prevnline); 64432332Sbostic } 64532332Sbostic if (!squirrel) { 64632332Sbostic do 64732332Sbostic putchar(*s); 64832332Sbostic while (*s++ != NL); 64932332Sbostic } else 65032332Sbostic s = savematch(s); 65132332Sbostic 65232332Sbostic if (nflag) 65332332Sbostic prevloc = s - 1; 65432332Sbostic } 65532332Sbostic return ((firstflag && !cflag) ? NULL : s); 65632332Sbostic } 65732332Sbostic 65832332Sbostic char * 65932332Sbostic fold(line) 66032332Sbostic char *line; 66132332Sbostic { 66232332Sbostic static char fline[BUFSIZE]; 66332332Sbostic register char *s, *t = fline; 66432332Sbostic 66532332Sbostic for (s = line; *s != EOS; s++) 66632332Sbostic *t++ = (isupper((int) *s) ? (char) tolower((int) *s) : *s); 66732332Sbostic *t = EOS; 66832332Sbostic return (fline); 66932332Sbostic } 67032332Sbostic 67132332Sbostic strindex(s, t) /* the easy way, as in K&P, p. 192 */ 67232332Sbostic char *s, *t; 67332332Sbostic { 67432332Sbostic int i, n; 67532332Sbostic 67632332Sbostic n = strlen(t); 67732332Sbostic for (i = 0; s[i] != '\0'; i++) 67832332Sbostic if (strncmp(s + i, t, n) == 0) 67932332Sbostic return (i); 68032332Sbostic return (-1); 68132332Sbostic } 68232332Sbostic 68332332Sbostic char * 68432332Sbostic grepxlat(pattern) /* grep pattern meta conversion */ 68532332Sbostic char *pattern; 68632332Sbostic { 68732332Sbostic register char *p, *s; 68832332Sbostic static char newpat[BUFSIZE]; 68932332Sbostic 69032332Sbostic for (s = newpat, p = pattern; *p != EOS;) { 69132332Sbostic if (*p == '\\') { /* skip escapes ... */ 69232332Sbostic *s++ = *p++; 69332332Sbostic if (*p) 69432332Sbostic *s++ = *p++; 69532332Sbostic } else if (*p == '[') { /* ... and char classes */ 69632332Sbostic while (*p != EOS && *p != ']') 69732332Sbostic *s++ = *p++; 69832332Sbostic } else if (strchr("+?|()", *p) != NULL) { 69932332Sbostic *s++ = '\\'; /* insert protection */ 70032332Sbostic *s++ = *p++; 70132332Sbostic } else 70232332Sbostic *s++ = *p++; 70332332Sbostic } 70432332Sbostic *s = EOS; 70532332Sbostic grepflag = ((patind) ? 0 : 1); 70632332Sbostic return (newpat); 70732332Sbostic } 70832332Sbostic 70932332Sbostic /* 71032332Sbostic * Test for simple alternation. Result is NULL if it's not so simple, or is 71132332Sbostic * a pointer to the first string if it is. Warning: sscanf size is a 71232332Sbostic * fixpoint, beyond which the speedup linearity starts to break down. In the 71332332Sbostic * wake of the elegant aho/corrasick "trie"-based fgrep, generalizing 71432332Sbostic * altpat[] to arbitrary size is not useful. 71532332Sbostic */ 71632332Sbostic char * 71732332Sbostic alternate(regexpr) 71832332Sbostic char *regexpr; 71932332Sbostic { 72032339Sbostic register int i, j; 72132339Sbostic register char *start, *stop; 72232332Sbostic unsigned char c; 72332332Sbostic 72432332Sbostic if (fgrepflag && strchr(regexpr, '|')) 72532339Sbostic return (NULL); 72632332Sbostic 72732339Sbostic /* 72832339Sbostic * break pattern up into altpat array; delimit on newline, bar, 72932339Sbostic * or EOS. We know we won't overflow, we've already checked the 73032339Sbostic * number of patterns we're going to find against NALT. 73132339Sbostic * Also, set length of pattern and find minimum pattern length. 73232339Sbostic */ 73332339Sbostic nalt = 0; 73432339Sbostic altmin = NMUSH; 73532339Sbostic for (start = stop = regexpr;; ++stop) 73632339Sbostic if (!*stop || *stop == '|' || *stop == NL) { 73732339Sbostic altlen[nalt] = j = stop - start; 73832339Sbostic if (j < altmin) 73932339Sbostic altmin = j; 74032339Sbostic if (!(altpat[nalt] = malloc((u_int)(j + 1)))) 74132339Sbostic oops("out of memory"); 74232339Sbostic bcopy(start, altpat[nalt], j); 74332339Sbostic altpat[nalt][j] = EOS; 74432362Sbostic ++nalt; 74532339Sbostic if (!*stop) 74632339Sbostic break; 74732362Sbostic if (nalt == NALT) 74832339Sbostic return(NULL); 74932339Sbostic if (*stop == NL) 75032339Sbostic *stop = '|'; 75132339Sbostic start = stop + 1; 75232339Sbostic } 75332332Sbostic if (!fgrepflag) { 75432332Sbostic if (strchr(regexpr, '|') == NULL || regexpr[0] == '|') 75532332Sbostic return (NULL); 75632332Sbostic if (strpbrk(regexpr, "^$.[]()?+*\\") != NULL 75732332Sbostic || strindex(regexpr, "||") >= 0) 75832332Sbostic return (NULL); 75932332Sbostic } 76032332Sbostic 76132332Sbostic if (nalt > 1) { /* build superimposed "pre-match" sets per 76232332Sbostic * char */ 76332332Sbostic altflag++; 76432332Sbostic for (j = 0; j < nalt; j++) 76532332Sbostic for (i = 0; i < altmin; i++) { 76632332Sbostic c = altpat[j][altlen[j] - altmin + i]; 76732332Sbostic altset[i + 1][c] = 1; /* offset for sentinel */ 76832332Sbostic } 76932332Sbostic } 77032332Sbostic return (altpat[0]); 77132332Sbostic } 77232332Sbostic 77332332Sbostic /* 77432332Sbostic * Grapple with the dfa (std egrep) vs. ndfa (regexp) tradeoff. Criteria to 77532332Sbostic * determine whether to use dfa-based egrep: We do FIRSTFEW matches with 77632332Sbostic * regexec(). If Boyer-Moore up to now matched more than PUNTPERCENT 77732332Sbostic * of the input, the r.e. is likely to be underspecified, so do old *grep, 77832332Sbostic * which is faster on complex patterns than regexp(). At FIRSTFEW, 77932332Sbostic * dump the saved matches collected by savematch(). They are saved 78032332Sbostic * so that a "PUNT" can "rewind" to ignore them. Stdin is problematic, 78132332Sbostic * since it's hard to rewind. 78232332Sbostic */ 78332332Sbostic 78432332Sbostic execstrategy(file) 78532332Sbostic char *file; 78632332Sbostic { 78732332Sbostic int pctmatch; 78832332Sbostic 78932332Sbostic pctmatch = (100 * rxcount) / incount; 79032332Sbostic if (pctmatch > PUNTPERCENT && file != NULL) 79132332Sbostic kernighan(args); 79232332Sbostic if (file != NULL) 79332332Sbostic flushmatches(); 79432332Sbostic } 79532332Sbostic 79632332Sbostic nlcount(bstart, bstop) /* flail interval to totalize newlines. */ 79732332Sbostic char *bstart, *bstop; 79832332Sbostic { 79932332Sbostic register char *s = bstart; 80032332Sbostic register char *t = bstop; 80132332Sbostic register int count = 0; 80232332Sbostic 80332332Sbostic do { /* loop unroll for older architectures */ 80432332Sbostic if (*t == NL) /* ... ask ames!jaw for sample code */ 80532332Sbostic count++; 80632332Sbostic } while (t-- > s); 80732332Sbostic 80832332Sbostic return (count); 80932332Sbostic } 81032332Sbostic 81132332Sbostic char * 81232332Sbostic isolate(regexpr) /* isolate longest metacharacter-free string */ 81332332Sbostic char *regexpr; 81432332Sbostic { 81532332Sbostic char *dummyexpr; 81632332Sbostic 81732332Sbostic /* 81832332Sbostic * We add (.)* because Henry's regcomp only figures regmust if it 81932332Sbostic * sees a leading * pattern. Foo! 82032332Sbostic */ 82132332Sbostic dummyexpr = malloc((unsigned) strlen(regexpr) + 5); 822*32502Sbostic (void)sprintf(dummyexpr, "(.)*%s", regexpr); 82332332Sbostic if ((rspencer = regcomp(dummyexpr)) == NULL) 82432332Sbostic kernighan(args); 82532332Sbostic return (rspencer->regmust); 82632332Sbostic } 82732332Sbostic 82832332Sbostic char *matches[FIRSTFEW]; 82932332Sbostic static int mcount = 0; 83032332Sbostic 83132332Sbostic char * 83232332Sbostic savematch(s) /* horde matches during statistics gathering */ 83332332Sbostic register char *s; 83432332Sbostic { 83532332Sbostic char *p; 83632332Sbostic char *start = s; 83732332Sbostic int msize = 0; 83832332Sbostic int psize = strlen(preamble); 83932332Sbostic 84032332Sbostic while (*s++ != NL) 84132332Sbostic msize++; 84232332Sbostic *--s = EOS; 84332332Sbostic 84432332Sbostic p = malloc((unsigned) msize + 1 + psize); 84532332Sbostic strcpy(p, preamble); 84632332Sbostic strcpy(p + psize, start); 84732332Sbostic matches[mcount++] = p; 84832332Sbostic 84932332Sbostic preamble[0] = 0; 85032332Sbostic *s = NL; 85132332Sbostic return (s); 85232332Sbostic } 85332332Sbostic 85432332Sbostic flushmatches() 85532332Sbostic { 85632332Sbostic int n; 85732332Sbostic 85832332Sbostic flushflag = 1; 85932332Sbostic for (n = 0; n < mcount; n++) 86032332Sbostic printf("%s\n", matches[n]); 86132332Sbostic mcount = 0; 86232332Sbostic } 86332332Sbostic 86432332Sbostic oops(message) 86532332Sbostic char *message; 86632332Sbostic { 86732332Sbostic fprintf(stderr, "%s: %s\n", progname, message); 86832332Sbostic exit(2); 86932332Sbostic } 87032332Sbostic 87132332Sbostic kernighan(args) /* "let others do the hard part ..." */ 87232332Sbostic char *args[]; 87332332Sbostic { 87432332Sbostic /* 87532332Sbostic * We may have already run grep on some of the files; remove them 87632332Sbostic * from the arg list we pass on. Note that we can't delete them 87732332Sbostic * totally because the number of file names affects the output 87832332Sbostic * (automatic -h). 87932332Sbostic */ 88032332Sbostic /* better would be fork/exec per punted file -- jaw */ 88132332Sbostic 88232332Sbostic while (firstfile && optind > firstfile) 88332332Sbostic args[firstfile++] = "/dev/null"; 88432332Sbostic if (patind) 88532332Sbostic args[patind] = pattern; 88632339Sbostic (void) fflush(stdout); 88732332Sbostic 88832332Sbostic if (grepflag) 88932332Sbostic execvp(GREPSTD, args), oops("can't exec old 'grep'"); 89032332Sbostic else if (fgrepflag) 89132332Sbostic execvp(FGREPSTD, args), oops("can't exec old 'fgrep'"); 89232332Sbostic else 89332332Sbostic execvp(EGREPSTD, args), oops("can't exec old 'egrep'"); 89432332Sbostic } 895