132332Sbostic #ifndef lint 2*46717Sbostic static char sccsid[] = "@(#)egrep.c 5.13 (Berkeley) 02/27/91"; 332332Sbostic #endif not lint 432332Sbostic 532332Sbostic /* 632332Sbostic Hybrid Boyer/Moore/Gosper-assisted 'grep/egrep/fgrep' search, with delta0 732332Sbostic table as in original paper (CACM, October, 1977). No delta1 or delta2. 832332Sbostic According to experiment (Horspool, Soft. Prac. Exp., 1982), delta2 is of 932332Sbostic minimal practical value. However, to improve for worst case input, 1032332Sbostic integrating the improved Galil strategies (Apostolico/Giancarlo, SIAM. J. 1132332Sbostic Comput., Feb. 1986) deserves consideration. 1232332Sbostic 1332332Sbostic Method: extract longest metacharacter-free string from expression. 1432332Sbostic this is done using a side-effect from henry spencer's regcomp(). 1532332Sbostic use boyer-moore to match such, then pass submatching lines 1632332Sbostic to either regexp() or standard 'egrep', depending on certain 1732332Sbostic criteria within execstrategy() below. [this tradeoff is due 1832332Sbostic to the general slowness of the regexp() nondeterministic 1932332Sbostic machine on complex expressions, as well as the startup time 2032332Sbostic of standard 'egrep' on short files.] alternatively, one may 2132332Sbostic change the vendor-supplied 'egrep' automaton to include 2232332Sbostic boyer-moore directly. see accompanying writeup for discussion 2332332Sbostic of kanji expression treatment. 2432332Sbostic 2532332Sbostic late addition: apply trickbag for fast match of simple 2632332Sbostic alternations (sublinear, in common low-cardinality cases). 2732332Sbostic trap fgrep into this lair. 2832332Sbostic 2932332Sbostic gnu additions: -f, newline as |, \< and \> [in regexec()], more 3032332Sbostic comments. inspire better dfa exec() strategy. 3132332Sbostic serious testing and help with special cases. 3232332Sbostic 3332332Sbostic Algorithm amalgam summary: 3432332Sbostic 3532332Sbostic dfa e?grep (aho/thompson) 3632332Sbostic ndfa regexp() (spencer/aho) 3732332Sbostic bmg (boyer/moore/gosper) 3832332Sbostic "superimposed" bmg (jaw) 3932332Sbostic fgrep (aho/corrasick) 4032332Sbostic 4132332Sbostic sorry, but the knuth/morris/pratt machine, horspool's 4232332Sbostic "frequentist" code, and the rabin/karp matcher, however cute, 4332332Sbostic just don't cut it for this production. 4432332Sbostic 4532332Sbostic James A. Woods Copyright (c) 1986 4632332Sbostic NASA Ames Research Center 4732332Sbostic */ 4832332Sbostic #include <sys/types.h> 4932332Sbostic #include <sys/stat.h> 5032337Sbostic #include <sys/file.h> 5132332Sbostic #include <regexp.h> /* must be henry spencer's version */ 5237914Sbostic #include <stdio.h> 5337914Sbostic #include <ctype.h> 5437914Sbostic #include "pathnames.h" 5532332Sbostic 5632332Sbostic #define MIN(A, B) ((A) > (B) ? (B) : (A)) 5732332Sbostic 5832332Sbostic #ifdef SLOWSYS 5932332Sbostic #define read xread 6032332Sbostic #endif 6132332Sbostic 6232332Sbostic #define BUFSIZE 8192 /* make higher for cray */ 6332332Sbostic #define PATSIZE 6000 6432332Sbostic #define LARGE BUFSIZE + PATSIZE 6532332Sbostic 6632332Sbostic #define NALT 7 /* tied to scanf() size in alternate() */ 6732332Sbostic #define NMUSH 6 /* loosely relates to expected alt length */ 6832332Sbostic 6932332Sbostic #define FIRSTFEW 33 /* Always do FIRSTFEW matches with regexec() */ 7032332Sbostic #define PUNTPERCENT 10 /* After FIRSTFEW, if PUNTPERCENT of the input 7132332Sbostic * was processed by regexp(), exec std egrep. */ 7232332Sbostic #define NL '\n' 7332332Sbostic #define EOS '\0' 7432332Sbostic #define NONASCII 0200 /* Bit mask for Kanji non-ascii chars */ 7532332Sbostic #define META "\n^$.[]()?+*|\\" /* egrep meta-characters */ 7632332Sbostic #define SS2 '\216' /* EUC Katakana (or Chinese2) prefix */ 7732332Sbostic #define SS3 '\217' /* EUC Kanji2 (or Chinese3) prefix */ 7832332Sbostic 7932332Sbostic extern char *optarg; 8032332Sbostic extern int optind; 8132332Sbostic char *progname; 8232332Sbostic 8332332Sbostic int cflag, iflag, eflag, fflag, lflag, nflag; /* SVID flags */ 8432332Sbostic int sflag, hflag; /* v7, v8, bsd */ 8532332Sbostic 8632332Sbostic int firstflag; /* Stop at first match */ 8732332Sbostic int grepflag; /* Called as "grep" */ 8832332Sbostic int fgrepflag; /* Called as "fgrep" */ 8932332Sbostic int altflag; /* Simple alternation in pattern */ 9032332Sbostic int boyonly; /* No regexp needed -- all simple */ 9132332Sbostic int flushflag; 9232332Sbostic int grepold, egrepold, fgrepold; 9332332Sbostic 9432332Sbostic int nalt; /* Number of alternatives */ 9532332Sbostic int nsuccess; /* 1 for match, 2 for error */ 9632332Sbostic int altmin; /* Minimum length of all the alternate 9732332Sbostic * strings */ 9832332Sbostic int firstfile; /* argv index of first file argument */ 9932332Sbostic int patind; /* argv index of pattern */ 10032332Sbostic long nmatch; /* Number of matches in this file */ 10132332Sbostic long incount, counted; /* Amount of input consumed */ 10232332Sbostic long rxcount; /* Bytes of input processed by regexec() */ 10332332Sbostic int boyfound; /* accumulated partial matches (tripped by 10432332Sbostic * FIRSTFEW) */ 10532332Sbostic int prevmatch; /* next three lines aid fast -n */ 10632332Sbostic long nline, prevnline; 10732332Sbostic char *prevloc; 10832332Sbostic 10932332Sbostic regexp *rspencer; 11032332Sbostic char *pattern; 11132332Sbostic char *patboy; /* Pattern for simple Boyer-Moore */ 11232332Sbostic char *patfile; /* Filename containing pattern(s) */ 11332332Sbostic 11432332Sbostic int delta0[256]; /* Boyer-Moore algorithm core */ 11532332Sbostic char cmap[256]; /* Usually 0-255, but if -i, maps upper to 11632332Sbostic * lower case */ 11732332Sbostic char str[BUFSIZE + 2]; 11832332Sbostic int nleftover; 11932332Sbostic char linetemp[BUFSIZE]; 12032339Sbostic char *altpat[NALT]; /* alternation component storage */ 12132332Sbostic int altlen[NALT]; 12232332Sbostic short altset[NMUSH + 1][256]; 12332332Sbostic char preamble[200]; /* match prefix (filename, line no.) */ 12432332Sbostic 12532332Sbostic int fd; 12632332Sbostic char * 12732502Sbostic strchr(), *strrchr(), *strcpy(), *strncpy(), *strpbrk(), *malloc(); 12832332Sbostic char * 12932332Sbostic grepxlat(), *fold(), *pfile(), *alternate(), *isolate(); 13035692Sbostic char *gotamatch(), *kanji(), *linesave(), *submatch(); 13132332Sbostic char **args; 13232332Sbostic 13332332Sbostic main(argc, argv) 13432332Sbostic int argc; 13532332Sbostic char *argv[]; 13632332Sbostic { 13732365Sbostic int c, oflag; 13832332Sbostic int errflag = 0; 13932332Sbostic 14032332Sbostic args = argv; 14132332Sbostic 14232332Sbostic if ((progname = strrchr(argv[0], '/')) != 0) 14332332Sbostic progname++; 14432332Sbostic else 14532332Sbostic progname = argv[0]; 14632332Sbostic if (strcmp(progname, "grep") == 0) 14732332Sbostic grepflag++; 14832365Sbostic else if (strcmp(progname, "fgrep") == 0) 14932332Sbostic fgrepflag++; 15032332Sbostic 15132365Sbostic oflag = 0; 15232365Sbostic while ((c = getopt(argc, argv, "bchie:f:lnosvwxy1")) != EOF) { 15332332Sbostic switch (c) { 15432332Sbostic 15532332Sbostic case 'f': 15632332Sbostic fflag++; 15732332Sbostic patfile = optarg; 15832332Sbostic continue; 15932332Sbostic case 'b': 16032332Sbostic case 'v': 16132332Sbostic egrepold++; /* boyer-moore of little help here */ 16232332Sbostic continue; 16332332Sbostic case 'c': 16432332Sbostic cflag++; 16532332Sbostic continue; 16632332Sbostic case 'e': 16732332Sbostic eflag++; 16832332Sbostic pattern = optarg; 16932332Sbostic continue; 17032332Sbostic case 'h': 17132332Sbostic hflag++; 17232332Sbostic continue; 17332365Sbostic case 'o': 17432365Sbostic oflag++; 17532365Sbostic continue; 17632332Sbostic case '1': /* Stop at very first match */ 17732332Sbostic firstflag++; /* spead freaks only */ 17832332Sbostic continue; 17932332Sbostic case 'i': 18032332Sbostic iflag++; 18132332Sbostic continue; 18232332Sbostic case 'l': 18332332Sbostic lflag++; 18432332Sbostic continue; 18532332Sbostic case 'n': 18632332Sbostic nflag++; 18732332Sbostic continue; 18832332Sbostic case 's': 18932332Sbostic sflag++; 19032332Sbostic continue; 19132332Sbostic case 'w': 19232332Sbostic case 'y': 19332332Sbostic if (!grepflag) 19432332Sbostic errflag++; 19532332Sbostic grepold++; 19632332Sbostic continue; 19732332Sbostic case 'x': /* needs more work, like -b above */ 19832332Sbostic if (!fgrepflag) 19932332Sbostic errflag++; 20032332Sbostic fgrepold++; 20132332Sbostic continue; 20232332Sbostic case '?': 20332332Sbostic errflag++; 20432332Sbostic } 20532332Sbostic } 20632332Sbostic if (errflag || ((argc <= optind) && !fflag && !eflag)) { 20732332Sbostic if (grepflag) 20832365Sbostic oops("usage: grep [-bchilnosvwy] [-e] pattern [file ...]"); 20932332Sbostic else if (fgrepflag) 21032366Sbostic oops("usage: fgrep [-bchilnosvx] {-f patfile | [-e] strings} [file ...]"); 21132332Sbostic else /* encourage SVID options, though we provide 21232332Sbostic * others */ 21332366Sbostic oops("usage: egrep [-bchilnosv] {-f patfile | [-e] pattern} [file ...]"); 21432332Sbostic } 21532332Sbostic if (fflag) 21634917Sbostic pattern = pfile(patfile); 21734917Sbostic else if (!eflag) { 21834917Sbostic patind = optind; 21932332Sbostic pattern = argv[optind++]; 22034917Sbostic } 22132332Sbostic 22232365Sbostic if (!oflag && (argc - optind) <= 1) /* Filename invisible given < 2 files */ 22332332Sbostic hflag++; 22432332Sbostic if (pattern[0] == EOS) 22532332Sbostic kernighan(argv); /* same as it ever was */ 22632332Sbostic /* 22732332Sbostic * 'grep/egrep' merger -- "old" grep is called to handle: tagged 22832332Sbostic * exprs \( \), word matches \< and \>, -w and -y options, char 22932332Sbostic * classes with '-' at end (egrep bug?), and patterns beginning with 23032332Sbostic * an asterisk (don't ask why). otherwise, characters meaningful to 23132332Sbostic * 'egrep' but not to 'grep' are escaped; the entire expr is then 23232332Sbostic * passed to 'egrep'. 23332332Sbostic */ 23432332Sbostic if (grepflag && !grepold) { 23532332Sbostic if (strindex(pattern, "\\(") >= 0 || 23632332Sbostic strindex(pattern, "\\<") >= 0 || 23732332Sbostic strindex(pattern, "\\>") >= 0 || 23832332Sbostic strindex(pattern, "-]") >= 0 || 23932332Sbostic pattern[0] == '*') /* grep bug */ 24032332Sbostic grepold++; 24132332Sbostic else 24232332Sbostic pattern = grepxlat(pattern); 24332332Sbostic } 24432332Sbostic if (grepold || egrepold || fgrepold) 24532332Sbostic kernighan(argv); 24632332Sbostic 24732332Sbostic if (iflag) 24832332Sbostic strcpy(pattern, fold(pattern)); 24932332Sbostic /* 25032332Sbostic * If the pattern is a plain string, just run boyer-moore. If it 25132332Sbostic * consists of meta-free alternatives, run "superimposed" bmg. 25232332Sbostic * Otherwise, find best string, and compile pattern for regexec(). 25332332Sbostic */ 25432332Sbostic if (strpbrk(pattern, META) == NULL) { /* do boyer-moore only */ 25532332Sbostic boyonly++; 25632332Sbostic patboy = pattern; 25732332Sbostic } else { 25832332Sbostic if ((patboy = alternate(pattern)) != NULL) 25932332Sbostic boyonly++; 26032332Sbostic else { 26132332Sbostic if ((patboy = isolate(pattern)) == NULL) 26232332Sbostic kernighan(argv); /* expr too involved */ 26332332Sbostic #ifndef NOKANJI 26432332Sbostic for (c = 0; pattern[c] != EOS; c++) 26532332Sbostic if (pattern[c] & NONASCII) /* kanji + meta */ 26632332Sbostic kernighan(argv); 26732332Sbostic #endif 26832332Sbostic if ((rspencer = regcomp(pattern)) == NULL) 26932332Sbostic oops("regcomp failure"); 27032332Sbostic } 27132332Sbostic } 27232332Sbostic gosper(patboy); /* "pre-conditioning is wonderful" 27332332Sbostic * -- v. strassen */ 27432332Sbostic 27532332Sbostic if ((firstfile = optind) >= argc) { 27632332Sbostic /* Grep standard input */ 27732332Sbostic if (lflag) /* We don't know its name! */ 27832332Sbostic exit(1); 27932332Sbostic egsecute((char *) NULL); 28032332Sbostic } else { 28132332Sbostic while (optind < argc) { 28232332Sbostic egsecute(argv[optind]); 28332332Sbostic optind++; 28432332Sbostic if (firstflag && (nsuccess == 1)) 28532332Sbostic break; 28632332Sbostic } 28732332Sbostic } 28832332Sbostic exit((nsuccess == 2) ? 2 : (nsuccess == 0)); 28932332Sbostic } 29032332Sbostic 29132332Sbostic char * 29232332Sbostic pfile(pfname) /* absorb expression from file */ 29332332Sbostic char *pfname; 29432332Sbostic { 29532337Sbostic int fd; 29632332Sbostic struct stat patstat; 29732332Sbostic static char *pat; 29832332Sbostic 29932337Sbostic if ((fd = open(pfname, O_RDONLY, 0)) < 0) 30032332Sbostic oops("can't read pattern file"); 30132337Sbostic if (fstat(fd, &patstat) != 0) 30232332Sbostic oops("can't stat pattern file"); 30332332Sbostic if (patstat.st_size > PATSIZE) { 30432332Sbostic if (fgrepflag) { /* defer to unix version */ 30532332Sbostic fgrepold++; 30632332Sbostic return "dummy"; 30732332Sbostic } else 30832332Sbostic oops("pattern file too big"); 30932332Sbostic } 31032332Sbostic if ((pat = malloc((unsigned) patstat.st_size + 1)) == NULL) 31132332Sbostic oops("out of memory to read pattern file"); 31232337Sbostic if (patstat.st_size != read(fd, pat, (int)patstat.st_size)) 31332332Sbostic oops("error reading pattern file"); 31432337Sbostic (void) close(fd); 31532332Sbostic 31632332Sbostic pat[patstat.st_size] = EOS; 31732332Sbostic if (pat[patstat.st_size - 1] == NL) /* NOP for egrep; helps grep */ 31832332Sbostic pat[patstat.st_size - 1] = EOS; 31932332Sbostic 32032332Sbostic if (nlcount(pat, &pat[patstat.st_size]) > NALT) { 32132332Sbostic if (fgrepflag) 32232332Sbostic fgrepold++; /* "what's it all about, alfie?" */ 32332332Sbostic else 32432332Sbostic egrepold++; 32532332Sbostic } 32632332Sbostic return (pat); 32732332Sbostic } 32832332Sbostic 32932332Sbostic egsecute(file) 33032332Sbostic char *file; 33132332Sbostic { 332*46717Sbostic extern int errno; 333*46717Sbostic 33432332Sbostic if (file == NULL) 33532332Sbostic fd = 0; 33632339Sbostic else if ((fd = open(file, O_RDONLY, 0)) <= 0) { 337*46717Sbostic fprintf(stderr, 338*46717Sbostic "%s: %s: %s\n", progname, file, strerror(errno)); 33932332Sbostic nsuccess = 2; 34032332Sbostic return; 34132332Sbostic } 34232332Sbostic chimaera(file, patboy); 34332332Sbostic 34432332Sbostic if (!boyonly && !flushflag && file != NULL) 34532332Sbostic flushmatches(); 34632332Sbostic if (file != NULL) 34732332Sbostic close(fd); 34832332Sbostic } 34932332Sbostic 35032332Sbostic chimaera(file, pat) /* "reach out and boyer-moore search someone" */ 35132332Sbostic char *file, *pat; /* -- soon-to-be-popular bumper sticker */ 35232332Sbostic { 35332332Sbostic register char *k, *strend, *s; 35432332Sbostic register int j, count; 35532332Sbostic register int *deltazero = delta0; 35632332Sbostic int patlen = altmin; 35732332Sbostic char *t; 35832332Sbostic 35932332Sbostic nleftover = boyfound = flushflag = 0; 36032332Sbostic nline = 1L; 36132332Sbostic prevmatch = 0; 36232332Sbostic nmatch = counted = rxcount = 0L; 36332332Sbostic 36432332Sbostic while ((count = read(fd, str + nleftover, BUFSIZE - nleftover)) > 0) { 36532332Sbostic 36632332Sbostic counted += count; 36732332Sbostic strend = linesave(str, count); 36832332Sbostic 36932332Sbostic for (k = str + patlen - 1; k < strend;) { 37032332Sbostic /* 37132332Sbostic * for a large class of patterns, upwards of 80% of 37232332Sbostic * match time is spent on the next line. we beat 37332332Sbostic * existing microcode (vax 'matchc') this way. 37432332Sbostic */ 37532332Sbostic while ((k += deltazero[*(unsigned char *) k]) < strend); 37632332Sbostic if (k < (str + LARGE)) 37732332Sbostic break; 37832332Sbostic k -= LARGE; 37932332Sbostic 38032332Sbostic if (altflag) { 38132332Sbostic /* 38232332Sbostic * Parallel Boyer-Moore. Check whether each 38332332Sbostic * of the previous <altmin> chars COULD be 38432332Sbostic * from one of the alternative strings. 38532332Sbostic */ 38632332Sbostic s = k - 1; 38732332Sbostic j = altmin; 38832332Sbostic while (altset[--j][(unsigned char) 38932332Sbostic cmap[*(unsigned char *) s--]]); 39032332Sbostic /* 39132332Sbostic * quick test fails. in this life, compare 39232332Sbostic * 'em all. but, a "reverse trie" would 39332332Sbostic * attenuate worst case (linear w/delta2?). 39432332Sbostic */ 39532332Sbostic if (--j < 0) { 39632332Sbostic count = nalt - 1; 39732332Sbostic do { 39832332Sbostic s = k; 39932332Sbostic j = altlen[count]; 40032332Sbostic t = altpat[count]; 40132332Sbostic 40232332Sbostic while 40332332Sbostic (cmap[*(unsigned char *) s--] 40432332Sbostic == t[--j]); 40532332Sbostic if (j < 0) 40632332Sbostic break; 40732332Sbostic } 40832332Sbostic while (count--); 40932332Sbostic } 41032332Sbostic } else { 41132332Sbostic /* One string -- check it */ 41232332Sbostic j = patlen - 1; 41332332Sbostic s = k - 1; 41432332Sbostic while (cmap[*(unsigned char *) s--] == pat[--j]); 41532332Sbostic } 41632332Sbostic /* 41732332Sbostic * delta-less shortcut for literati. short shrift for 41832332Sbostic * genetic engineers? 41932332Sbostic */ 42032332Sbostic if (j >= 0) { 42132332Sbostic k++; /* no match; restart next char */ 42232332Sbostic continue; 42332332Sbostic } 42432332Sbostic k = submatch(file, pat, str, strend, k, count); 42532332Sbostic if (k == NULL) 42632332Sbostic return; 42732332Sbostic } 42832332Sbostic if (nflag) { 42932332Sbostic if (prevmatch) 43032332Sbostic nline = prevnline + nlcount(prevloc, k); 43132332Sbostic else 43232332Sbostic nline = nline + nlcount(str, k); 43332332Sbostic prevmatch = 0; 43432332Sbostic } 43532332Sbostic strncpy(str, linetemp, nleftover); 43632332Sbostic } 43732332Sbostic if (cflag) { 43832332Sbostic /* Bug from old grep: -c overrides -h. We fix the bug. */ 43932332Sbostic if (!hflag) 44032332Sbostic printf("%s:", file); 44132332Sbostic printf("%ld\n", nmatch); 44232332Sbostic } 44332332Sbostic } 44432332Sbostic 44532332Sbostic char * 44632332Sbostic linesave(str, count) /* accumulate partial line at end of buffer */ 44732332Sbostic char str[]; 44832332Sbostic register int count; 44932332Sbostic { 45032332Sbostic register int j; 45132332Sbostic 45232332Sbostic count += nleftover; 45332332Sbostic if (count != BUFSIZE && fd != 0) 45432332Sbostic str[count++] = NL; /* insurance for broken last line */ 45532332Sbostic str[count] = EOS; 45632332Sbostic for (j = count - 1; str[j] != NL && j >= 0;) 45732332Sbostic j--; 45832332Sbostic /* 45932332Sbostic * break up these lines: long line (> BUFSIZE), last line of file, or 46032332Sbostic * short return from read(), as from tee(1) input 46132332Sbostic */ 46232332Sbostic if (j < 0 && (count == (BUFSIZE - nleftover))) { 46332332Sbostic str[count++] = NL; 46432332Sbostic str[count] = EOS; 46532332Sbostic linetemp[0] = EOS; 46632332Sbostic nleftover = 0; 46732332Sbostic return (str + count); 46832332Sbostic } else { 46932332Sbostic nleftover = count - j - 1; 47032332Sbostic strncpy(linetemp, str + j + 1, nleftover); 47132332Sbostic return (str + j); 47232332Sbostic } 47332332Sbostic } 47432332Sbostic 47532332Sbostic /* 47632332Sbostic * Process partial match. First check for mis-aligned Kanji, then match line 47732332Sbostic * against full compiled r.e. if statistics do not warrant handing off to 47832332Sbostic * standard egrep. 47932332Sbostic */ 48032332Sbostic char * 48132332Sbostic submatch(file, pat, str, strend, k, altindex) 48232332Sbostic char file[], pat[], str[]; 48332332Sbostic register char *strend, *k; 48432332Sbostic int altindex; 48532332Sbostic { 48632332Sbostic register char *s; 48732332Sbostic char *t, c; 48832332Sbostic 48932332Sbostic t = k; 49032332Sbostic s = ((altflag) ? k - altlen[altindex] + 1 : k - altmin + 1); 49132332Sbostic #ifndef NOKANJI 49232332Sbostic c = ((altflag) ? altpat[altindex][0] : pat[0]); 49332332Sbostic if (c & NONASCII) 49432332Sbostic if ((s = kanji(str, s, k)) == NULL) 49532332Sbostic return (++k); /* reject false kanji */ 49632332Sbostic #endif 49732332Sbostic do; 49832332Sbostic while (*s != NL && --s >= str); 49932332Sbostic k = s + 1; /* now at line start */ 50032332Sbostic 50132332Sbostic if (boyonly) 50232332Sbostic return (gotamatch(file, k)); 50332332Sbostic 50432332Sbostic incount = counted - (strend - k); 50532332Sbostic if (boyfound++ == FIRSTFEW) 50632332Sbostic execstrategy(file); 50732332Sbostic 50832332Sbostic s = t; 50932332Sbostic do 51032332Sbostic rxcount++; 51132332Sbostic while (*s++ != NL); 51232332Sbostic *--s = EOS; 51332332Sbostic /* 51432332Sbostic * "quick henry -- the flit" (after theodor geisel) 51532332Sbostic */ 51632332Sbostic if (regexec(rspencer, ((iflag) ? fold(k) : k)) == 1) { 51732332Sbostic *s = NL; 51832332Sbostic if (gotamatch(file, k) == NULL) 51932332Sbostic return (NULL); 52032332Sbostic } 52132332Sbostic *s = NL; 52232332Sbostic return (s + 1); 52332332Sbostic } 52432332Sbostic 52532338Sbostic #ifndef NOKANJI 52632332Sbostic /* 52732332Sbostic * EUC code disambiguation -- scan backwards to first 7-bit code, while 52832332Sbostic * counting intervening 8-bit codes. If odd, reject unaligned Kanji pattern. 52932332Sbostic * SS2/3 checks are for intermixed Japanase Katakana or Kanji2. 53032332Sbostic */ 53132332Sbostic char * 53232332Sbostic kanji(str, s, k) 53332332Sbostic register char *str, *s, *k; 53432332Sbostic { 53532332Sbostic register int j = 0; 53632332Sbostic 53732332Sbostic for (s--; s >= str; s--) { 53832332Sbostic if (*s == SS2 || *s == SS3 || (*s & NONASCII) == 0) 53932332Sbostic break; 54032332Sbostic j++; 54132332Sbostic } 54232332Sbostic #ifndef CHINESE 54332332Sbostic if (*s == SS2) 54432332Sbostic j -= 1; 54532332Sbostic #endif CHINESE 54632332Sbostic return ((j & 01) ? NULL : k); 54732332Sbostic } 54832338Sbostic #endif 54932332Sbostic 55032332Sbostic /* 55132332Sbostic * Compute "Boyer-Moore" delta table -- put skip distance in delta0[c] 55232332Sbostic */ 55332332Sbostic gosper(pattern) 55432332Sbostic char *pattern; /* ... HAKMEM lives ... */ 55532332Sbostic { 55632332Sbostic register int i, j; 55732332Sbostic unsigned char c; 55832332Sbostic 55932332Sbostic /* Make one-string case look like simple alternatives case */ 56032332Sbostic if (!altflag) { 56132332Sbostic nalt = 1; 56232332Sbostic altmin = altlen[0] = strlen(pattern); 56332339Sbostic altpat[0] = pattern; 56432332Sbostic } 56532332Sbostic /* For chars that aren't in any string, skip by string length. */ 56632332Sbostic for (j = 0; j < 256; j++) { 56732332Sbostic delta0[j] = altmin; 56832332Sbostic cmap[j] = j; /* Sneak in initialization of cmap */ 56932332Sbostic } 57032332Sbostic 57132332Sbostic /* For chars in a string, skip distance from char to end of string. */ 57232332Sbostic /* (If char appears more than once, skip minimum distance.) */ 57332332Sbostic for (i = 0; i < nalt; i++) 57432332Sbostic for (j = 0; j < altlen[i] - 1; j++) { 57532332Sbostic c = altpat[i][j]; 57632332Sbostic delta0[c] = MIN(delta0[c], altlen[i] - j - 1); 57732332Sbostic if (iflag && islower((int) c)) 57832332Sbostic delta0[toupper((int) c)] = delta0[c]; 57932332Sbostic } 58032332Sbostic 58132332Sbostic /* For last char of each string, fall out of search loop. */ 58232332Sbostic for (i = 0; i < nalt; i++) { 58332332Sbostic c = altpat[i][altlen[i] - 1]; 58432332Sbostic delta0[c] = LARGE; 58532332Sbostic if (iflag && islower((int) c)) 58632332Sbostic delta0[toupper((int) c)] = LARGE; 58732332Sbostic } 58832332Sbostic if (iflag) 58932332Sbostic for (j = 'A'; j <= 'Z'; j++) 59032332Sbostic cmap[j] = tolower((int) j); 59132332Sbostic } 59232332Sbostic 59332332Sbostic /* 59432332Sbostic * Print, count, or stop on full match. Result is either the location for 59532332Sbostic * continued search, or NULL to stop. 59632332Sbostic */ 59732332Sbostic char * 59832332Sbostic gotamatch(file, s) 59932332Sbostic register char *file, *s; 60032332Sbostic { 60132332Sbostic char *savematch(); 60232332Sbostic int squirrel = 0; /* nonzero to squirrel away FIRSTFEW matches */ 60332332Sbostic 60432332Sbostic nmatch++; 60532332Sbostic nsuccess = 1; 60632332Sbostic if (!boyonly && boyfound <= FIRSTFEW && file != NULL) 60732332Sbostic squirrel = 1; 60832332Sbostic 60932332Sbostic if (sflag) 61032332Sbostic return (NULL); /* -s usurps all flags (unlike some versions) */ 61132332Sbostic if (cflag) { /* -c overrides -l, we guess */ 61232332Sbostic do; 61332332Sbostic while (*s++ != NL); 61432332Sbostic } else if (lflag) { 61532332Sbostic puts(file); 61632332Sbostic return (NULL); 61732332Sbostic } else { 61832332Sbostic if (!hflag) 61932332Sbostic if (!squirrel) 62032332Sbostic printf("%s:", file); 62132332Sbostic else 62232502Sbostic (void)sprintf(preamble, "%s:", file); 62332332Sbostic if (nflag) { 62432332Sbostic if (prevmatch) 62532332Sbostic prevnline = prevnline + nlcount(prevloc, s); 62632332Sbostic else 62732332Sbostic prevnline = nline + nlcount(str, s); 62832332Sbostic prevmatch = 1; 62932332Sbostic 63032332Sbostic if (!squirrel) 63132332Sbostic printf("%ld:", prevnline); 63232332Sbostic else 63332502Sbostic (void)sprintf(preamble + strlen(preamble), 63432332Sbostic "%ld:", prevnline); 63532332Sbostic } 63632332Sbostic if (!squirrel) { 63732332Sbostic do 63832332Sbostic putchar(*s); 63932332Sbostic while (*s++ != NL); 64032332Sbostic } else 64132332Sbostic s = savematch(s); 64232332Sbostic 64332332Sbostic if (nflag) 64432332Sbostic prevloc = s - 1; 64532332Sbostic } 64632332Sbostic return ((firstflag && !cflag) ? NULL : s); 64732332Sbostic } 64832332Sbostic 64932332Sbostic char * 65032332Sbostic fold(line) 65132332Sbostic char *line; 65232332Sbostic { 65332332Sbostic static char fline[BUFSIZE]; 65432332Sbostic register char *s, *t = fline; 65532332Sbostic 65632332Sbostic for (s = line; *s != EOS; s++) 65732332Sbostic *t++ = (isupper((int) *s) ? (char) tolower((int) *s) : *s); 65832332Sbostic *t = EOS; 65932332Sbostic return (fline); 66032332Sbostic } 66132332Sbostic 66232332Sbostic strindex(s, t) /* the easy way, as in K&P, p. 192 */ 66332332Sbostic char *s, *t; 66432332Sbostic { 66532332Sbostic int i, n; 66632332Sbostic 66732332Sbostic n = strlen(t); 66832332Sbostic for (i = 0; s[i] != '\0'; i++) 66932332Sbostic if (strncmp(s + i, t, n) == 0) 67032332Sbostic return (i); 67132332Sbostic return (-1); 67232332Sbostic } 67332332Sbostic 67432332Sbostic char * 67532332Sbostic grepxlat(pattern) /* grep pattern meta conversion */ 67632332Sbostic char *pattern; 67732332Sbostic { 67832332Sbostic register char *p, *s; 67932332Sbostic static char newpat[BUFSIZE]; 68032332Sbostic 68132332Sbostic for (s = newpat, p = pattern; *p != EOS;) { 68232332Sbostic if (*p == '\\') { /* skip escapes ... */ 68332332Sbostic *s++ = *p++; 68432332Sbostic if (*p) 68532332Sbostic *s++ = *p++; 68632332Sbostic } else if (*p == '[') { /* ... and char classes */ 68732332Sbostic while (*p != EOS && *p != ']') 68832332Sbostic *s++ = *p++; 68932332Sbostic } else if (strchr("+?|()", *p) != NULL) { 69032332Sbostic *s++ = '\\'; /* insert protection */ 69132332Sbostic *s++ = *p++; 69232332Sbostic } else 69332332Sbostic *s++ = *p++; 69432332Sbostic } 69532332Sbostic *s = EOS; 69632332Sbostic grepflag = ((patind) ? 0 : 1); 69732332Sbostic return (newpat); 69832332Sbostic } 69932332Sbostic 70032332Sbostic /* 70132332Sbostic * Test for simple alternation. Result is NULL if it's not so simple, or is 70232332Sbostic * a pointer to the first string if it is. Warning: sscanf size is a 70332332Sbostic * fixpoint, beyond which the speedup linearity starts to break down. In the 70432332Sbostic * wake of the elegant aho/corrasick "trie"-based fgrep, generalizing 70532332Sbostic * altpat[] to arbitrary size is not useful. 70632332Sbostic */ 70732332Sbostic char * 70832332Sbostic alternate(regexpr) 70932332Sbostic char *regexpr; 71032332Sbostic { 71132339Sbostic register int i, j; 71232339Sbostic register char *start, *stop; 71332332Sbostic unsigned char c; 71432332Sbostic 71532332Sbostic if (fgrepflag && strchr(regexpr, '|')) 71632339Sbostic return (NULL); 71732332Sbostic 71832339Sbostic /* 71932339Sbostic * break pattern up into altpat array; delimit on newline, bar, 72032339Sbostic * or EOS. We know we won't overflow, we've already checked the 72132339Sbostic * number of patterns we're going to find against NALT. 72232339Sbostic * Also, set length of pattern and find minimum pattern length. 72332339Sbostic */ 72432339Sbostic nalt = 0; 72532339Sbostic altmin = NMUSH; 72632339Sbostic for (start = stop = regexpr;; ++stop) 72732339Sbostic if (!*stop || *stop == '|' || *stop == NL) { 72832339Sbostic altlen[nalt] = j = stop - start; 72932339Sbostic if (j < altmin) 73032339Sbostic altmin = j; 73132339Sbostic if (!(altpat[nalt] = malloc((u_int)(j + 1)))) 73232339Sbostic oops("out of memory"); 73332339Sbostic bcopy(start, altpat[nalt], j); 73432339Sbostic altpat[nalt][j] = EOS; 73532362Sbostic ++nalt; 73632339Sbostic if (!*stop) 73732339Sbostic break; 73832362Sbostic if (nalt == NALT) 73932339Sbostic return(NULL); 74032339Sbostic if (*stop == NL) 74132339Sbostic *stop = '|'; 74232339Sbostic start = stop + 1; 74332339Sbostic } 74432332Sbostic if (!fgrepflag) { 74532332Sbostic if (strchr(regexpr, '|') == NULL || regexpr[0] == '|') 74632332Sbostic return (NULL); 74732332Sbostic if (strpbrk(regexpr, "^$.[]()?+*\\") != NULL 74832332Sbostic || strindex(regexpr, "||") >= 0) 74932332Sbostic return (NULL); 75032332Sbostic } 75132332Sbostic 75232332Sbostic if (nalt > 1) { /* build superimposed "pre-match" sets per 75332332Sbostic * char */ 75432332Sbostic altflag++; 75532332Sbostic for (j = 0; j < nalt; j++) 75632332Sbostic for (i = 0; i < altmin; i++) { 75732332Sbostic c = altpat[j][altlen[j] - altmin + i]; 75832332Sbostic altset[i + 1][c] = 1; /* offset for sentinel */ 75932332Sbostic } 76032332Sbostic } 76132332Sbostic return (altpat[0]); 76232332Sbostic } 76332332Sbostic 76432332Sbostic /* 76532332Sbostic * Grapple with the dfa (std egrep) vs. ndfa (regexp) tradeoff. Criteria to 76632332Sbostic * determine whether to use dfa-based egrep: We do FIRSTFEW matches with 76732332Sbostic * regexec(). If Boyer-Moore up to now matched more than PUNTPERCENT 76832332Sbostic * of the input, the r.e. is likely to be underspecified, so do old *grep, 76932332Sbostic * which is faster on complex patterns than regexp(). At FIRSTFEW, 77032332Sbostic * dump the saved matches collected by savematch(). They are saved 77132332Sbostic * so that a "PUNT" can "rewind" to ignore them. Stdin is problematic, 77232332Sbostic * since it's hard to rewind. 77332332Sbostic */ 77432332Sbostic 77532332Sbostic execstrategy(file) 77632332Sbostic char *file; 77732332Sbostic { 77832332Sbostic int pctmatch; 77932332Sbostic 78032332Sbostic pctmatch = (100 * rxcount) / incount; 78132332Sbostic if (pctmatch > PUNTPERCENT && file != NULL) 78232332Sbostic kernighan(args); 78332332Sbostic if (file != NULL) 78432332Sbostic flushmatches(); 78532332Sbostic } 78632332Sbostic 78732332Sbostic nlcount(bstart, bstop) /* flail interval to totalize newlines. */ 78832332Sbostic char *bstart, *bstop; 78932332Sbostic { 79032332Sbostic register char *s = bstart; 79132332Sbostic register char *t = bstop; 79232332Sbostic register int count = 0; 79332332Sbostic 79432332Sbostic do { /* loop unroll for older architectures */ 79532332Sbostic if (*t == NL) /* ... ask ames!jaw for sample code */ 79632332Sbostic count++; 79732332Sbostic } while (t-- > s); 79832332Sbostic 79932332Sbostic return (count); 80032332Sbostic } 80132332Sbostic 80232332Sbostic char * 80332332Sbostic isolate(regexpr) /* isolate longest metacharacter-free string */ 80432332Sbostic char *regexpr; 80532332Sbostic { 80632332Sbostic char *dummyexpr; 80732332Sbostic 80832332Sbostic /* 80932332Sbostic * We add (.)* because Henry's regcomp only figures regmust if it 81032332Sbostic * sees a leading * pattern. Foo! 81132332Sbostic */ 81232332Sbostic dummyexpr = malloc((unsigned) strlen(regexpr) + 5); 81332502Sbostic (void)sprintf(dummyexpr, "(.)*%s", regexpr); 81432332Sbostic if ((rspencer = regcomp(dummyexpr)) == NULL) 81532332Sbostic kernighan(args); 81632332Sbostic return (rspencer->regmust); 81732332Sbostic } 81832332Sbostic 81932332Sbostic char *matches[FIRSTFEW]; 82032332Sbostic static int mcount = 0; 82132332Sbostic 82232332Sbostic char * 82332332Sbostic savematch(s) /* horde matches during statistics gathering */ 82432332Sbostic register char *s; 82532332Sbostic { 82632332Sbostic char *p; 82732332Sbostic char *start = s; 82832332Sbostic int msize = 0; 82932332Sbostic int psize = strlen(preamble); 83032332Sbostic 83132332Sbostic while (*s++ != NL) 83232332Sbostic msize++; 83332332Sbostic *--s = EOS; 83432332Sbostic 83532332Sbostic p = malloc((unsigned) msize + 1 + psize); 83632332Sbostic strcpy(p, preamble); 83732332Sbostic strcpy(p + psize, start); 83832332Sbostic matches[mcount++] = p; 83932332Sbostic 84032332Sbostic preamble[0] = 0; 84132332Sbostic *s = NL; 84232332Sbostic return (s); 84332332Sbostic } 84432332Sbostic 84532332Sbostic flushmatches() 84632332Sbostic { 84732332Sbostic int n; 84832332Sbostic 84932332Sbostic flushflag = 1; 85032332Sbostic for (n = 0; n < mcount; n++) 85132332Sbostic printf("%s\n", matches[n]); 85232332Sbostic mcount = 0; 85332332Sbostic } 85432332Sbostic 85532332Sbostic oops(message) 85632332Sbostic char *message; 85732332Sbostic { 85832332Sbostic fprintf(stderr, "%s: %s\n", progname, message); 85932332Sbostic exit(2); 86032332Sbostic } 86132332Sbostic 86232332Sbostic kernighan(args) /* "let others do the hard part ..." */ 86332332Sbostic char *args[]; 86432332Sbostic { 86532332Sbostic /* 86632332Sbostic * We may have already run grep on some of the files; remove them 86732332Sbostic * from the arg list we pass on. Note that we can't delete them 86832332Sbostic * totally because the number of file names affects the output 86932332Sbostic * (automatic -h). 87032332Sbostic */ 87132332Sbostic /* better would be fork/exec per punted file -- jaw */ 87232332Sbostic 87332332Sbostic while (firstfile && optind > firstfile) 87437914Sbostic args[firstfile++] = _PATH_DEVNULL; 87532332Sbostic if (patind) 87632332Sbostic args[patind] = pattern; 87732339Sbostic (void) fflush(stdout); 87832332Sbostic 87932332Sbostic if (grepflag) 88037914Sbostic execvp(_PATH_GREPSTD, args), oops("can't exec old 'grep'"); 88132332Sbostic else if (fgrepflag) 88237914Sbostic execvp(_PATH_FGREPSTD, args), oops("can't exec old 'fgrep'"); 88332332Sbostic else 88437914Sbostic execvp(_PATH_EGREPSTD, args), oops("can't exec old 'egrep'"); 88532332Sbostic } 886