1*32332Sbostic #ifndef lint 2*32332Sbostic static char sccsid[] = "@(#)egrep.c 5.1 (Berkeley) 10/03/87"; 3*32332Sbostic #endif not lint 4*32332Sbostic 5*32332Sbostic /* 6*32332Sbostic Hybrid Boyer/Moore/Gosper-assisted 'grep/egrep/fgrep' search, with delta0 7*32332Sbostic table as in original paper (CACM, October, 1977). No delta1 or delta2. 8*32332Sbostic According to experiment (Horspool, Soft. Prac. Exp., 1982), delta2 is of 9*32332Sbostic minimal practical value. However, to improve for worst case input, 10*32332Sbostic integrating the improved Galil strategies (Apostolico/Giancarlo, SIAM. J. 11*32332Sbostic Comput., Feb. 1986) deserves consideration. 12*32332Sbostic 13*32332Sbostic Method: extract longest metacharacter-free string from expression. 14*32332Sbostic this is done using a side-effect from henry spencer's regcomp(). 15*32332Sbostic use boyer-moore to match such, then pass submatching lines 16*32332Sbostic to either regexp() or standard 'egrep', depending on certain 17*32332Sbostic criteria within execstrategy() below. [this tradeoff is due 18*32332Sbostic to the general slowness of the regexp() nondeterministic 19*32332Sbostic machine on complex expressions, as well as the startup time 20*32332Sbostic of standard 'egrep' on short files.] alternatively, one may 21*32332Sbostic change the vendor-supplied 'egrep' automaton to include 22*32332Sbostic boyer-moore directly. see accompanying writeup for discussion 23*32332Sbostic of kanji expression treatment. 24*32332Sbostic 25*32332Sbostic late addition: apply trickbag for fast match of simple 26*32332Sbostic alternations (sublinear, in common low-cardinality cases). 27*32332Sbostic trap fgrep into this lair. 28*32332Sbostic 29*32332Sbostic gnu additions: -f, newline as |, \< and \> [in regexec()], more 30*32332Sbostic comments. inspire better dfa exec() strategy. 31*32332Sbostic serious testing and help with special cases. 32*32332Sbostic 33*32332Sbostic Algorithm amalgam summary: 34*32332Sbostic 35*32332Sbostic dfa e?grep (aho/thompson) 36*32332Sbostic ndfa regexp() (spencer/aho) 37*32332Sbostic bmg (boyer/moore/gosper) 38*32332Sbostic "superimposed" bmg (jaw) 39*32332Sbostic fgrep (aho/corrasick) 40*32332Sbostic 41*32332Sbostic sorry, but the knuth/morris/pratt machine, horspool's 42*32332Sbostic "frequentist" code, and the rabin/karp matcher, however cute, 43*32332Sbostic just don't cut it for this production. 44*32332Sbostic 45*32332Sbostic James A. Woods Copyright (c) 1986 46*32332Sbostic NASA Ames Research Center 47*32332Sbostic */ 48*32332Sbostic #include <stdio.h> 49*32332Sbostic #include <ctype.h> 50*32332Sbostic #include <sys/types.h> 51*32332Sbostic #include <sys/stat.h> 52*32332Sbostic #include <regexp.h> /* must be henry spencer's version */ 53*32332Sbostic 54*32332Sbostic #define MIN(A, B) ((A) > (B) ? (B) : (A)) 55*32332Sbostic 56*32332Sbostic #ifdef SLOWSYS 57*32332Sbostic #define read xread 58*32332Sbostic #endif 59*32332Sbostic /* 60*32332Sbostic * define existing [ef]?grep program locations below for use by execvp(). 61*32332Sbostic * [execlp() would be used were it not for the possibility of 62*32332Sbostic * installation-dependent recursion.] 63*32332Sbostic */ 64*32332Sbostic #ifndef EGREPSTD 65*32332Sbostic #define EGREPSTD "/usr/bin/egrep" 66*32332Sbostic #endif 67*32332Sbostic #ifndef GREPSTD 68*32332Sbostic #define GREPSTD "/bin/grep" 69*32332Sbostic #endif 70*32332Sbostic #ifndef FGREPSTD 71*32332Sbostic #define FGREPSTD "/usr/bin/fgrep" 72*32332Sbostic #endif 73*32332Sbostic 74*32332Sbostic #define BUFSIZE 8192 /* make higher for cray */ 75*32332Sbostic #define PATSIZE 6000 76*32332Sbostic #define LARGE BUFSIZE + PATSIZE 77*32332Sbostic 78*32332Sbostic #define ALTSIZE 100 /* generous? */ 79*32332Sbostic #define NALT 7 /* tied to scanf() size in alternate() */ 80*32332Sbostic #define NMUSH 6 /* loosely relates to expected alt length */ 81*32332Sbostic 82*32332Sbostic #define FIRSTFEW 33 /* Always do FIRSTFEW matches with regexec() */ 83*32332Sbostic #define PUNTPERCENT 10 /* After FIRSTFEW, if PUNTPERCENT of the input 84*32332Sbostic * was processed by regexp(), exec std egrep. */ 85*32332Sbostic #define NL '\n' 86*32332Sbostic #define EOS '\0' 87*32332Sbostic #define NONASCII 0200 /* Bit mask for Kanji non-ascii chars */ 88*32332Sbostic #define META "\n^$.[]()?+*|\\" /* egrep meta-characters */ 89*32332Sbostic #define SS2 '\216' /* EUC Katakana (or Chinese2) prefix */ 90*32332Sbostic #define SS3 '\217' /* EUC Kanji2 (or Chinese3) prefix */ 91*32332Sbostic 92*32332Sbostic extern char *optarg; 93*32332Sbostic extern int optind; 94*32332Sbostic char *progname; 95*32332Sbostic 96*32332Sbostic int cflag, iflag, eflag, fflag, lflag, nflag; /* SVID flags */ 97*32332Sbostic int sflag, hflag; /* v7, v8, bsd */ 98*32332Sbostic 99*32332Sbostic int firstflag; /* Stop at first match */ 100*32332Sbostic int grepflag; /* Called as "grep" */ 101*32332Sbostic int fgrepflag; /* Called as "fgrep" */ 102*32332Sbostic int altflag; /* Simple alternation in pattern */ 103*32332Sbostic int boyonly; /* No regexp needed -- all simple */ 104*32332Sbostic int flushflag; 105*32332Sbostic int grepold, egrepold, fgrepold; 106*32332Sbostic 107*32332Sbostic int nalt; /* Number of alternatives */ 108*32332Sbostic int nsuccess; /* 1 for match, 2 for error */ 109*32332Sbostic int altmin; /* Minimum length of all the alternate 110*32332Sbostic * strings */ 111*32332Sbostic int firstfile; /* argv index of first file argument */ 112*32332Sbostic int patind; /* argv index of pattern */ 113*32332Sbostic long nmatch; /* Number of matches in this file */ 114*32332Sbostic long incount, counted; /* Amount of input consumed */ 115*32332Sbostic long rxcount; /* Bytes of input processed by regexec() */ 116*32332Sbostic int boyfound; /* accumulated partial matches (tripped by 117*32332Sbostic * FIRSTFEW) */ 118*32332Sbostic int prevmatch; /* next three lines aid fast -n */ 119*32332Sbostic long nline, prevnline; 120*32332Sbostic char *prevloc; 121*32332Sbostic 122*32332Sbostic regexp *rspencer; 123*32332Sbostic char *pattern; 124*32332Sbostic char *patboy; /* Pattern for simple Boyer-Moore */ 125*32332Sbostic char *patfile; /* Filename containing pattern(s) */ 126*32332Sbostic 127*32332Sbostic int delta0[256]; /* Boyer-Moore algorithm core */ 128*32332Sbostic char cmap[256]; /* Usually 0-255, but if -i, maps upper to 129*32332Sbostic * lower case */ 130*32332Sbostic char str[BUFSIZE + 2]; 131*32332Sbostic int nleftover; 132*32332Sbostic char linetemp[BUFSIZE]; 133*32332Sbostic char altpat[NALT][ALTSIZE]; /* alternation component storage */ 134*32332Sbostic int altlen[NALT]; 135*32332Sbostic short altset[NMUSH + 1][256]; 136*32332Sbostic char preamble[200]; /* match prefix (filename, line no.) */ 137*32332Sbostic 138*32332Sbostic int fd; 139*32332Sbostic char * 140*32332Sbostic strchr(), *strrchr(), *strcpy(), *strncpy(), *strpbrk(), *sprintf(), *malloc(); 141*32332Sbostic char * 142*32332Sbostic grepxlat(), *fold(), *pfile(), *alternate(), *isolate(); 143*32332Sbostic char **args; 144*32332Sbostic 145*32332Sbostic main(argc, argv) 146*32332Sbostic int argc; 147*32332Sbostic char *argv[]; 148*32332Sbostic { 149*32332Sbostic int c; 150*32332Sbostic int errflag = 0; 151*32332Sbostic 152*32332Sbostic args = argv; 153*32332Sbostic 154*32332Sbostic if ((progname = strrchr(argv[0], '/')) != 0) 155*32332Sbostic progname++; 156*32332Sbostic else 157*32332Sbostic progname = argv[0]; 158*32332Sbostic if (strcmp(progname, "grep") == 0) 159*32332Sbostic grepflag++; 160*32332Sbostic if (strcmp(progname, "fgrep") == 0) 161*32332Sbostic fgrepflag++; 162*32332Sbostic 163*32332Sbostic while ((c = getopt(argc, argv, "bchie:f:lnsvwxy1")) != EOF) { 164*32332Sbostic switch (c) { 165*32332Sbostic 166*32332Sbostic case 'f': 167*32332Sbostic fflag++; 168*32332Sbostic patfile = optarg; 169*32332Sbostic continue; 170*32332Sbostic case 'b': 171*32332Sbostic case 'v': 172*32332Sbostic egrepold++; /* boyer-moore of little help here */ 173*32332Sbostic continue; 174*32332Sbostic case 'c': 175*32332Sbostic cflag++; 176*32332Sbostic continue; 177*32332Sbostic case 'e': 178*32332Sbostic eflag++; 179*32332Sbostic pattern = optarg; 180*32332Sbostic continue; 181*32332Sbostic case 'h': 182*32332Sbostic hflag++; 183*32332Sbostic continue; 184*32332Sbostic case '1': /* Stop at very first match */ 185*32332Sbostic firstflag++; /* spead freaks only */ 186*32332Sbostic continue; 187*32332Sbostic case 'i': 188*32332Sbostic iflag++; 189*32332Sbostic continue; 190*32332Sbostic case 'l': 191*32332Sbostic lflag++; 192*32332Sbostic continue; 193*32332Sbostic case 'n': 194*32332Sbostic nflag++; 195*32332Sbostic continue; 196*32332Sbostic case 's': 197*32332Sbostic sflag++; 198*32332Sbostic continue; 199*32332Sbostic case 'w': 200*32332Sbostic case 'y': 201*32332Sbostic if (!grepflag) 202*32332Sbostic errflag++; 203*32332Sbostic grepold++; 204*32332Sbostic continue; 205*32332Sbostic case 'x': /* needs more work, like -b above */ 206*32332Sbostic if (!fgrepflag) 207*32332Sbostic errflag++; 208*32332Sbostic fgrepold++; 209*32332Sbostic continue; 210*32332Sbostic case '?': 211*32332Sbostic errflag++; 212*32332Sbostic } 213*32332Sbostic } 214*32332Sbostic if (errflag || ((argc <= optind) && !fflag && !eflag)) { 215*32332Sbostic if (grepflag) 216*32332Sbostic oops("usage: grep [-bcihlnsvwy] [-e] pattern [file ...]"); 217*32332Sbostic else if (fgrepflag) 218*32332Sbostic oops("usage: fgrep [-bcilnv] {-f patfile | [-e] strings} [file ...]"); 219*32332Sbostic else /* encourage SVID options, though we provide 220*32332Sbostic * others */ 221*32332Sbostic oops("usage: egrep [-bcilnv] {-f patfile | [-e] pattern} [file ...]"); 222*32332Sbostic } 223*32332Sbostic patind = optind; 224*32332Sbostic if (fflag) 225*32332Sbostic pattern = pfile(patfile), patind = 0; 226*32332Sbostic else if (!eflag) 227*32332Sbostic pattern = argv[optind++]; 228*32332Sbostic 229*32332Sbostic if ((argc - optind) <= 1) /* Filename invisible given < 2 files */ 230*32332Sbostic hflag++; 231*32332Sbostic if (pattern[0] == EOS) 232*32332Sbostic kernighan(argv); /* same as it ever was */ 233*32332Sbostic /* 234*32332Sbostic * 'grep/egrep' merger -- "old" grep is called to handle: tagged 235*32332Sbostic * exprs \( \), word matches \< and \>, -w and -y options, char 236*32332Sbostic * classes with '-' at end (egrep bug?), and patterns beginning with 237*32332Sbostic * an asterisk (don't ask why). otherwise, characters meaningful to 238*32332Sbostic * 'egrep' but not to 'grep' are escaped; the entire expr is then 239*32332Sbostic * passed to 'egrep'. 240*32332Sbostic */ 241*32332Sbostic if (grepflag && !grepold) { 242*32332Sbostic if (strindex(pattern, "\\(") >= 0 || 243*32332Sbostic strindex(pattern, "\\<") >= 0 || 244*32332Sbostic strindex(pattern, "\\>") >= 0 || 245*32332Sbostic strindex(pattern, "-]") >= 0 || 246*32332Sbostic pattern[0] == '*') /* grep bug */ 247*32332Sbostic grepold++; 248*32332Sbostic else 249*32332Sbostic pattern = grepxlat(pattern); 250*32332Sbostic } 251*32332Sbostic if (grepold || egrepold || fgrepold) 252*32332Sbostic kernighan(argv); 253*32332Sbostic 254*32332Sbostic if (iflag) 255*32332Sbostic strcpy(pattern, fold(pattern)); 256*32332Sbostic /* 257*32332Sbostic * If the pattern is a plain string, just run boyer-moore. If it 258*32332Sbostic * consists of meta-free alternatives, run "superimposed" bmg. 259*32332Sbostic * Otherwise, find best string, and compile pattern for regexec(). 260*32332Sbostic */ 261*32332Sbostic if (strpbrk(pattern, META) == NULL) { /* do boyer-moore only */ 262*32332Sbostic boyonly++; 263*32332Sbostic patboy = pattern; 264*32332Sbostic } else { 265*32332Sbostic if ((patboy = alternate(pattern)) != NULL) 266*32332Sbostic boyonly++; 267*32332Sbostic else { 268*32332Sbostic if ((patboy = isolate(pattern)) == NULL) 269*32332Sbostic kernighan(argv); /* expr too involved */ 270*32332Sbostic #ifndef NOKANJI 271*32332Sbostic for (c = 0; pattern[c] != EOS; c++) 272*32332Sbostic if (pattern[c] & NONASCII) /* kanji + meta */ 273*32332Sbostic kernighan(argv); 274*32332Sbostic #endif 275*32332Sbostic if ((rspencer = regcomp(pattern)) == NULL) 276*32332Sbostic oops("regcomp failure"); 277*32332Sbostic } 278*32332Sbostic } 279*32332Sbostic gosper(patboy); /* "pre-conditioning is wonderful" 280*32332Sbostic * -- v. strassen */ 281*32332Sbostic 282*32332Sbostic if ((firstfile = optind) >= argc) { 283*32332Sbostic /* Grep standard input */ 284*32332Sbostic if (lflag) /* We don't know its name! */ 285*32332Sbostic exit(1); 286*32332Sbostic egsecute((char *) NULL); 287*32332Sbostic } else { 288*32332Sbostic while (optind < argc) { 289*32332Sbostic egsecute(argv[optind]); 290*32332Sbostic optind++; 291*32332Sbostic if (firstflag && (nsuccess == 1)) 292*32332Sbostic break; 293*32332Sbostic } 294*32332Sbostic } 295*32332Sbostic exit((nsuccess == 2) ? 2 : (nsuccess == 0)); 296*32332Sbostic } 297*32332Sbostic 298*32332Sbostic char * 299*32332Sbostic pfile(pfname) /* absorb expression from file */ 300*32332Sbostic char *pfname; 301*32332Sbostic { 302*32332Sbostic FILE *pf; 303*32332Sbostic struct stat patstat; 304*32332Sbostic static char *pat; 305*32332Sbostic 306*32332Sbostic if ((pf = fopen(pfname, "r")) == NULL) 307*32332Sbostic oops("can't read pattern file"); 308*32332Sbostic if (fstat(fileno(pf), &patstat) != 0) 309*32332Sbostic oops("can't stat pattern file"); 310*32332Sbostic if (patstat.st_size > PATSIZE) { 311*32332Sbostic if (fgrepflag) { /* defer to unix version */ 312*32332Sbostic fgrepold++; 313*32332Sbostic return "dummy"; 314*32332Sbostic } else 315*32332Sbostic oops("pattern file too big"); 316*32332Sbostic } 317*32332Sbostic if ((pat = malloc((unsigned) patstat.st_size + 1)) == NULL) 318*32332Sbostic oops("out of memory to read pattern file"); 319*32332Sbostic if (patstat.st_size != 320*32332Sbostic fread(pat, sizeof(char), (int) patstat.st_size + 1, pf)) 321*32332Sbostic oops("error reading pattern file"); 322*32332Sbostic (void) fclose(pf); 323*32332Sbostic 324*32332Sbostic pat[patstat.st_size] = EOS; 325*32332Sbostic if (pat[patstat.st_size - 1] == NL) /* NOP for egrep; helps grep */ 326*32332Sbostic pat[patstat.st_size - 1] = EOS; 327*32332Sbostic 328*32332Sbostic if (nlcount(pat, &pat[patstat.st_size]) > NALT) { 329*32332Sbostic if (fgrepflag) 330*32332Sbostic fgrepold++; /* "what's it all about, alfie?" */ 331*32332Sbostic else 332*32332Sbostic egrepold++; 333*32332Sbostic } 334*32332Sbostic return (pat); 335*32332Sbostic } 336*32332Sbostic 337*32332Sbostic egsecute(file) 338*32332Sbostic char *file; 339*32332Sbostic { 340*32332Sbostic if (file == NULL) 341*32332Sbostic fd = 0; 342*32332Sbostic else if ((fd = open(file, 0)) <= 0) { 343*32332Sbostic fprintf(stderr, "%s: can't open %s\n", progname, file); 344*32332Sbostic nsuccess = 2; 345*32332Sbostic return; 346*32332Sbostic } 347*32332Sbostic chimaera(file, patboy); 348*32332Sbostic 349*32332Sbostic if (!boyonly && !flushflag && file != NULL) 350*32332Sbostic flushmatches(); 351*32332Sbostic if (file != NULL) 352*32332Sbostic close(fd); 353*32332Sbostic } 354*32332Sbostic 355*32332Sbostic chimaera(file, pat) /* "reach out and boyer-moore search someone" */ 356*32332Sbostic char *file, *pat; /* -- soon-to-be-popular bumper sticker */ 357*32332Sbostic { 358*32332Sbostic register char *k, *strend, *s; 359*32332Sbostic register int j, count; 360*32332Sbostic register int *deltazero = delta0; 361*32332Sbostic int patlen = altmin; 362*32332Sbostic char *t; 363*32332Sbostic char *gotamatch(), *kanji(), *linesave(), *submatch(); 364*32332Sbostic 365*32332Sbostic nleftover = boyfound = flushflag = 0; 366*32332Sbostic nline = 1L; 367*32332Sbostic prevmatch = 0; 368*32332Sbostic nmatch = counted = rxcount = 0L; 369*32332Sbostic 370*32332Sbostic while ((count = read(fd, str + nleftover, BUFSIZE - nleftover)) > 0) { 371*32332Sbostic 372*32332Sbostic counted += count; 373*32332Sbostic strend = linesave(str, count); 374*32332Sbostic 375*32332Sbostic for (k = str + patlen - 1; k < strend;) { 376*32332Sbostic /* 377*32332Sbostic * for a large class of patterns, upwards of 80% of 378*32332Sbostic * match time is spent on the next line. we beat 379*32332Sbostic * existing microcode (vax 'matchc') this way. 380*32332Sbostic */ 381*32332Sbostic while ((k += deltazero[*(unsigned char *) k]) < strend); 382*32332Sbostic if (k < (str + LARGE)) 383*32332Sbostic break; 384*32332Sbostic k -= LARGE; 385*32332Sbostic 386*32332Sbostic if (altflag) { 387*32332Sbostic /* 388*32332Sbostic * Parallel Boyer-Moore. Check whether each 389*32332Sbostic * of the previous <altmin> chars COULD be 390*32332Sbostic * from one of the alternative strings. 391*32332Sbostic */ 392*32332Sbostic s = k - 1; 393*32332Sbostic j = altmin; 394*32332Sbostic while (altset[--j][(unsigned char) 395*32332Sbostic cmap[*(unsigned char *) s--]]); 396*32332Sbostic /* 397*32332Sbostic * quick test fails. in this life, compare 398*32332Sbostic * 'em all. but, a "reverse trie" would 399*32332Sbostic * attenuate worst case (linear w/delta2?). 400*32332Sbostic */ 401*32332Sbostic if (--j < 0) { 402*32332Sbostic count = nalt - 1; 403*32332Sbostic do { 404*32332Sbostic s = k; 405*32332Sbostic j = altlen[count]; 406*32332Sbostic t = altpat[count]; 407*32332Sbostic 408*32332Sbostic while 409*32332Sbostic (cmap[*(unsigned char *) s--] 410*32332Sbostic == t[--j]); 411*32332Sbostic if (j < 0) 412*32332Sbostic break; 413*32332Sbostic } 414*32332Sbostic while (count--); 415*32332Sbostic } 416*32332Sbostic } else { 417*32332Sbostic /* One string -- check it */ 418*32332Sbostic j = patlen - 1; 419*32332Sbostic s = k - 1; 420*32332Sbostic while (cmap[*(unsigned char *) s--] == pat[--j]); 421*32332Sbostic } 422*32332Sbostic /* 423*32332Sbostic * delta-less shortcut for literati. short shrift for 424*32332Sbostic * genetic engineers? 425*32332Sbostic */ 426*32332Sbostic if (j >= 0) { 427*32332Sbostic k++; /* no match; restart next char */ 428*32332Sbostic continue; 429*32332Sbostic } 430*32332Sbostic k = submatch(file, pat, str, strend, k, count); 431*32332Sbostic if (k == NULL) 432*32332Sbostic return; 433*32332Sbostic } 434*32332Sbostic if (nflag) { 435*32332Sbostic if (prevmatch) 436*32332Sbostic nline = prevnline + nlcount(prevloc, k); 437*32332Sbostic else 438*32332Sbostic nline = nline + nlcount(str, k); 439*32332Sbostic prevmatch = 0; 440*32332Sbostic } 441*32332Sbostic strncpy(str, linetemp, nleftover); 442*32332Sbostic } 443*32332Sbostic if (cflag) { 444*32332Sbostic /* Bug from old grep: -c overrides -h. We fix the bug. */ 445*32332Sbostic if (!hflag) 446*32332Sbostic printf("%s:", file); 447*32332Sbostic printf("%ld\n", nmatch); 448*32332Sbostic } 449*32332Sbostic } 450*32332Sbostic 451*32332Sbostic char * 452*32332Sbostic linesave(str, count) /* accumulate partial line at end of buffer */ 453*32332Sbostic char str[]; 454*32332Sbostic register int count; 455*32332Sbostic { 456*32332Sbostic register int j; 457*32332Sbostic 458*32332Sbostic count += nleftover; 459*32332Sbostic if (count != BUFSIZE && fd != 0) 460*32332Sbostic str[count++] = NL; /* insurance for broken last line */ 461*32332Sbostic str[count] = EOS; 462*32332Sbostic for (j = count - 1; str[j] != NL && j >= 0;) 463*32332Sbostic j--; 464*32332Sbostic /* 465*32332Sbostic * break up these lines: long line (> BUFSIZE), last line of file, or 466*32332Sbostic * short return from read(), as from tee(1) input 467*32332Sbostic */ 468*32332Sbostic if (j < 0 && (count == (BUFSIZE - nleftover))) { 469*32332Sbostic str[count++] = NL; 470*32332Sbostic str[count] = EOS; 471*32332Sbostic linetemp[0] = EOS; 472*32332Sbostic nleftover = 0; 473*32332Sbostic return (str + count); 474*32332Sbostic } else { 475*32332Sbostic nleftover = count - j - 1; 476*32332Sbostic strncpy(linetemp, str + j + 1, nleftover); 477*32332Sbostic return (str + j); 478*32332Sbostic } 479*32332Sbostic } 480*32332Sbostic 481*32332Sbostic /* 482*32332Sbostic * Process partial match. First check for mis-aligned Kanji, then match line 483*32332Sbostic * against full compiled r.e. if statistics do not warrant handing off to 484*32332Sbostic * standard egrep. 485*32332Sbostic */ 486*32332Sbostic char * 487*32332Sbostic submatch(file, pat, str, strend, k, altindex) 488*32332Sbostic char file[], pat[], str[]; 489*32332Sbostic register char *strend, *k; 490*32332Sbostic int altindex; 491*32332Sbostic { 492*32332Sbostic register char *s; 493*32332Sbostic char *t, c; 494*32332Sbostic 495*32332Sbostic t = k; 496*32332Sbostic s = ((altflag) ? k - altlen[altindex] + 1 : k - altmin + 1); 497*32332Sbostic #ifndef NOKANJI 498*32332Sbostic c = ((altflag) ? altpat[altindex][0] : pat[0]); 499*32332Sbostic if (c & NONASCII) 500*32332Sbostic if ((s = kanji(str, s, k)) == NULL) 501*32332Sbostic return (++k); /* reject false kanji */ 502*32332Sbostic #endif 503*32332Sbostic do; 504*32332Sbostic while (*s != NL && --s >= str); 505*32332Sbostic k = s + 1; /* now at line start */ 506*32332Sbostic 507*32332Sbostic if (boyonly) 508*32332Sbostic return (gotamatch(file, k)); 509*32332Sbostic 510*32332Sbostic incount = counted - (strend - k); 511*32332Sbostic if (boyfound++ == FIRSTFEW) 512*32332Sbostic execstrategy(file); 513*32332Sbostic 514*32332Sbostic s = t; 515*32332Sbostic do 516*32332Sbostic rxcount++; 517*32332Sbostic while (*s++ != NL); 518*32332Sbostic *--s = EOS; 519*32332Sbostic /* 520*32332Sbostic * "quick henry -- the flit" (after theodor geisel) 521*32332Sbostic */ 522*32332Sbostic if (regexec(rspencer, ((iflag) ? fold(k) : k)) == 1) { 523*32332Sbostic *s = NL; 524*32332Sbostic if (gotamatch(file, k) == NULL) 525*32332Sbostic return (NULL); 526*32332Sbostic } 527*32332Sbostic *s = NL; 528*32332Sbostic return (s + 1); 529*32332Sbostic } 530*32332Sbostic 531*32332Sbostic /* 532*32332Sbostic * EUC code disambiguation -- scan backwards to first 7-bit code, while 533*32332Sbostic * counting intervening 8-bit codes. If odd, reject unaligned Kanji pattern. 534*32332Sbostic * SS2/3 checks are for intermixed Japanase Katakana or Kanji2. 535*32332Sbostic */ 536*32332Sbostic char * 537*32332Sbostic kanji(str, s, k) 538*32332Sbostic register char *str, *s, *k; 539*32332Sbostic { 540*32332Sbostic register int j = 0; 541*32332Sbostic 542*32332Sbostic for (s--; s >= str; s--) { 543*32332Sbostic if (*s == SS2 || *s == SS3 || (*s & NONASCII) == 0) 544*32332Sbostic break; 545*32332Sbostic j++; 546*32332Sbostic } 547*32332Sbostic #ifndef CHINESE 548*32332Sbostic if (*s == SS2) 549*32332Sbostic j -= 1; 550*32332Sbostic #endif CHINESE 551*32332Sbostic return ((j & 01) ? NULL : k); 552*32332Sbostic } 553*32332Sbostic 554*32332Sbostic /* 555*32332Sbostic * Compute "Boyer-Moore" delta table -- put skip distance in delta0[c] 556*32332Sbostic */ 557*32332Sbostic gosper(pattern) 558*32332Sbostic char *pattern; /* ... HAKMEM lives ... */ 559*32332Sbostic { 560*32332Sbostic register int i, j; 561*32332Sbostic unsigned char c; 562*32332Sbostic 563*32332Sbostic /* Make one-string case look like simple alternatives case */ 564*32332Sbostic if (!altflag) { 565*32332Sbostic nalt = 1; 566*32332Sbostic altmin = altlen[0] = strlen(pattern); 567*32332Sbostic strcpy(altpat[0], pattern); 568*32332Sbostic } 569*32332Sbostic /* For chars that aren't in any string, skip by string length. */ 570*32332Sbostic for (j = 0; j < 256; j++) { 571*32332Sbostic delta0[j] = altmin; 572*32332Sbostic cmap[j] = j; /* Sneak in initialization of cmap */ 573*32332Sbostic } 574*32332Sbostic 575*32332Sbostic /* For chars in a string, skip distance from char to end of string. */ 576*32332Sbostic /* (If char appears more than once, skip minimum distance.) */ 577*32332Sbostic for (i = 0; i < nalt; i++) 578*32332Sbostic for (j = 0; j < altlen[i] - 1; j++) { 579*32332Sbostic c = altpat[i][j]; 580*32332Sbostic delta0[c] = MIN(delta0[c], altlen[i] - j - 1); 581*32332Sbostic if (iflag && islower((int) c)) 582*32332Sbostic delta0[toupper((int) c)] = delta0[c]; 583*32332Sbostic } 584*32332Sbostic 585*32332Sbostic /* For last char of each string, fall out of search loop. */ 586*32332Sbostic for (i = 0; i < nalt; i++) { 587*32332Sbostic c = altpat[i][altlen[i] - 1]; 588*32332Sbostic delta0[c] = LARGE; 589*32332Sbostic if (iflag && islower((int) c)) 590*32332Sbostic delta0[toupper((int) c)] = LARGE; 591*32332Sbostic } 592*32332Sbostic if (iflag) 593*32332Sbostic for (j = 'A'; j <= 'Z'; j++) 594*32332Sbostic cmap[j] = tolower((int) j); 595*32332Sbostic } 596*32332Sbostic 597*32332Sbostic /* 598*32332Sbostic * Print, count, or stop on full match. Result is either the location for 599*32332Sbostic * continued search, or NULL to stop. 600*32332Sbostic */ 601*32332Sbostic char * 602*32332Sbostic gotamatch(file, s) 603*32332Sbostic register char *file, *s; 604*32332Sbostic { 605*32332Sbostic char *savematch(); 606*32332Sbostic int squirrel = 0; /* nonzero to squirrel away FIRSTFEW matches */ 607*32332Sbostic 608*32332Sbostic nmatch++; 609*32332Sbostic nsuccess = 1; 610*32332Sbostic if (!boyonly && boyfound <= FIRSTFEW && file != NULL) 611*32332Sbostic squirrel = 1; 612*32332Sbostic 613*32332Sbostic if (sflag) 614*32332Sbostic return (NULL); /* -s usurps all flags (unlike some versions) */ 615*32332Sbostic if (cflag) { /* -c overrides -l, we guess */ 616*32332Sbostic do; 617*32332Sbostic while (*s++ != NL); 618*32332Sbostic } else if (lflag) { 619*32332Sbostic puts(file); 620*32332Sbostic return (NULL); 621*32332Sbostic } else { 622*32332Sbostic if (!hflag) 623*32332Sbostic if (!squirrel) 624*32332Sbostic printf("%s:", file); 625*32332Sbostic else 626*32332Sbostic sprintf(preamble, "%s:", file); 627*32332Sbostic if (nflag) { 628*32332Sbostic if (prevmatch) 629*32332Sbostic prevnline = prevnline + nlcount(prevloc, s); 630*32332Sbostic else 631*32332Sbostic prevnline = nline + nlcount(str, s); 632*32332Sbostic prevmatch = 1; 633*32332Sbostic 634*32332Sbostic if (!squirrel) 635*32332Sbostic printf("%ld:", prevnline); 636*32332Sbostic else 637*32332Sbostic sprintf(preamble + strlen(preamble), 638*32332Sbostic "%ld:", prevnline); 639*32332Sbostic } 640*32332Sbostic if (!squirrel) { 641*32332Sbostic do 642*32332Sbostic putchar(*s); 643*32332Sbostic while (*s++ != NL); 644*32332Sbostic } else 645*32332Sbostic s = savematch(s); 646*32332Sbostic 647*32332Sbostic if (nflag) 648*32332Sbostic prevloc = s - 1; 649*32332Sbostic } 650*32332Sbostic return ((firstflag && !cflag) ? NULL : s); 651*32332Sbostic } 652*32332Sbostic 653*32332Sbostic char * 654*32332Sbostic fold(line) 655*32332Sbostic char *line; 656*32332Sbostic { 657*32332Sbostic static char fline[BUFSIZE]; 658*32332Sbostic register char *s, *t = fline; 659*32332Sbostic 660*32332Sbostic for (s = line; *s != EOS; s++) 661*32332Sbostic *t++ = (isupper((int) *s) ? (char) tolower((int) *s) : *s); 662*32332Sbostic *t = EOS; 663*32332Sbostic return (fline); 664*32332Sbostic } 665*32332Sbostic 666*32332Sbostic strindex(s, t) /* the easy way, as in K&P, p. 192 */ 667*32332Sbostic char *s, *t; 668*32332Sbostic { 669*32332Sbostic int i, n; 670*32332Sbostic 671*32332Sbostic n = strlen(t); 672*32332Sbostic for (i = 0; s[i] != '\0'; i++) 673*32332Sbostic if (strncmp(s + i, t, n) == 0) 674*32332Sbostic return (i); 675*32332Sbostic return (-1); 676*32332Sbostic } 677*32332Sbostic 678*32332Sbostic char * 679*32332Sbostic grepxlat(pattern) /* grep pattern meta conversion */ 680*32332Sbostic char *pattern; 681*32332Sbostic { 682*32332Sbostic register char *p, *s; 683*32332Sbostic static char newpat[BUFSIZE]; 684*32332Sbostic 685*32332Sbostic for (s = newpat, p = pattern; *p != EOS;) { 686*32332Sbostic if (*p == '\\') { /* skip escapes ... */ 687*32332Sbostic *s++ = *p++; 688*32332Sbostic if (*p) 689*32332Sbostic *s++ = *p++; 690*32332Sbostic } else if (*p == '[') { /* ... and char classes */ 691*32332Sbostic while (*p != EOS && *p != ']') 692*32332Sbostic *s++ = *p++; 693*32332Sbostic } else if (strchr("+?|()", *p) != NULL) { 694*32332Sbostic *s++ = '\\'; /* insert protection */ 695*32332Sbostic *s++ = *p++; 696*32332Sbostic } else 697*32332Sbostic *s++ = *p++; 698*32332Sbostic } 699*32332Sbostic *s = EOS; 700*32332Sbostic grepflag = ((patind) ? 0 : 1); 701*32332Sbostic return (newpat); 702*32332Sbostic } 703*32332Sbostic 704*32332Sbostic /* 705*32332Sbostic * Test for simple alternation. Result is NULL if it's not so simple, or is 706*32332Sbostic * a pointer to the first string if it is. Warning: sscanf size is a 707*32332Sbostic * fixpoint, beyond which the speedup linearity starts to break down. In the 708*32332Sbostic * wake of the elegant aho/corrasick "trie"-based fgrep, generalizing 709*32332Sbostic * altpat[] to arbitrary size is not useful. 710*32332Sbostic */ 711*32332Sbostic char * 712*32332Sbostic alternate(regexpr) 713*32332Sbostic char *regexpr; 714*32332Sbostic { 715*32332Sbostic register i, j, min; 716*32332Sbostic unsigned char c; 717*32332Sbostic char oflow[100]; 718*32332Sbostic 719*32332Sbostic if (fgrepflag && strchr(regexpr, '|')) 720*32332Sbostic return (NULL); 721*32332Sbostic 722*32332Sbostic i = strlen(regexpr); 723*32332Sbostic for (j = 0; j < i; j++) 724*32332Sbostic if (regexpr[j] == NL) 725*32332Sbostic regexpr[j] = '|'; 726*32332Sbostic 727*32332Sbostic if (!fgrepflag) { 728*32332Sbostic if (strchr(regexpr, '|') == NULL || regexpr[0] == '|') 729*32332Sbostic return (NULL); 730*32332Sbostic if (strpbrk(regexpr, "^$.[]()?+*\\") != NULL 731*32332Sbostic || strindex(regexpr, "||") >= 0) 732*32332Sbostic return (NULL); 733*32332Sbostic } 734*32332Sbostic oflow[0] = EOS; 735*32332Sbostic nalt = sscanf(regexpr, "%[^|]|%[^|]|%[^|]|%[^|]|%[^|]|%[^|]|%[^|]|%[^|]", 736*32332Sbostic altpat[0], altpat[1], altpat[2], altpat[3], altpat[4], altpat[5], altpat[6], oflow); 737*32332Sbostic 738*32332Sbostic if (nalt < 1 || oflow[0] != EOS) 739*32332Sbostic return (NULL); 740*32332Sbostic 741*32332Sbostic altmin = NMUSH; 742*32332Sbostic for (j = 0; j < nalt; j++) { 743*32332Sbostic min = altlen[j] = strlen(altpat[j]); 744*32332Sbostic if (min > ALTSIZE) 745*32332Sbostic return (NULL); 746*32332Sbostic altmin = MIN(altmin, min); 747*32332Sbostic } 748*32332Sbostic if (nalt > 1) { /* build superimposed "pre-match" sets per 749*32332Sbostic * char */ 750*32332Sbostic altflag++; 751*32332Sbostic for (j = 0; j < nalt; j++) 752*32332Sbostic for (i = 0; i < altmin; i++) { 753*32332Sbostic c = altpat[j][altlen[j] - altmin + i]; 754*32332Sbostic altset[i + 1][c] = 1; /* offset for sentinel */ 755*32332Sbostic } 756*32332Sbostic } 757*32332Sbostic return (altpat[0]); 758*32332Sbostic } 759*32332Sbostic 760*32332Sbostic /* 761*32332Sbostic * Grapple with the dfa (std egrep) vs. ndfa (regexp) tradeoff. Criteria to 762*32332Sbostic * determine whether to use dfa-based egrep: We do FIRSTFEW matches with 763*32332Sbostic * regexec(). If Boyer-Moore up to now matched more than PUNTPERCENT 764*32332Sbostic * of the input, the r.e. is likely to be underspecified, so do old *grep, 765*32332Sbostic * which is faster on complex patterns than regexp(). At FIRSTFEW, 766*32332Sbostic * dump the saved matches collected by savematch(). They are saved 767*32332Sbostic * so that a "PUNT" can "rewind" to ignore them. Stdin is problematic, 768*32332Sbostic * since it's hard to rewind. 769*32332Sbostic */ 770*32332Sbostic 771*32332Sbostic execstrategy(file) 772*32332Sbostic char *file; 773*32332Sbostic { 774*32332Sbostic int pctmatch; 775*32332Sbostic 776*32332Sbostic pctmatch = (100 * rxcount) / incount; 777*32332Sbostic if (pctmatch > PUNTPERCENT && file != NULL) 778*32332Sbostic kernighan(args); 779*32332Sbostic if (file != NULL) 780*32332Sbostic flushmatches(); 781*32332Sbostic } 782*32332Sbostic 783*32332Sbostic nlcount(bstart, bstop) /* flail interval to totalize newlines. */ 784*32332Sbostic char *bstart, *bstop; 785*32332Sbostic { 786*32332Sbostic register char *s = bstart; 787*32332Sbostic register char *t = bstop; 788*32332Sbostic register int count = 0; 789*32332Sbostic 790*32332Sbostic do { /* loop unroll for older architectures */ 791*32332Sbostic if (*t == NL) /* ... ask ames!jaw for sample code */ 792*32332Sbostic count++; 793*32332Sbostic } while (t-- > s); 794*32332Sbostic 795*32332Sbostic return (count); 796*32332Sbostic } 797*32332Sbostic 798*32332Sbostic char * 799*32332Sbostic isolate(regexpr) /* isolate longest metacharacter-free string */ 800*32332Sbostic char *regexpr; 801*32332Sbostic { 802*32332Sbostic char *dummyexpr; 803*32332Sbostic 804*32332Sbostic /* 805*32332Sbostic * We add (.)* because Henry's regcomp only figures regmust if it 806*32332Sbostic * sees a leading * pattern. Foo! 807*32332Sbostic */ 808*32332Sbostic dummyexpr = malloc((unsigned) strlen(regexpr) + 5); 809*32332Sbostic sprintf(dummyexpr, "(.)*%s", regexpr); 810*32332Sbostic if ((rspencer = regcomp(dummyexpr)) == NULL) 811*32332Sbostic kernighan(args); 812*32332Sbostic return (rspencer->regmust); 813*32332Sbostic } 814*32332Sbostic 815*32332Sbostic char *matches[FIRSTFEW]; 816*32332Sbostic static int mcount = 0; 817*32332Sbostic 818*32332Sbostic char * 819*32332Sbostic savematch(s) /* horde matches during statistics gathering */ 820*32332Sbostic register char *s; 821*32332Sbostic { 822*32332Sbostic char *p; 823*32332Sbostic char *start = s; 824*32332Sbostic int msize = 0; 825*32332Sbostic int psize = strlen(preamble); 826*32332Sbostic 827*32332Sbostic while (*s++ != NL) 828*32332Sbostic msize++; 829*32332Sbostic *--s = EOS; 830*32332Sbostic 831*32332Sbostic p = malloc((unsigned) msize + 1 + psize); 832*32332Sbostic strcpy(p, preamble); 833*32332Sbostic strcpy(p + psize, start); 834*32332Sbostic matches[mcount++] = p; 835*32332Sbostic 836*32332Sbostic preamble[0] = 0; 837*32332Sbostic *s = NL; 838*32332Sbostic return (s); 839*32332Sbostic } 840*32332Sbostic 841*32332Sbostic flushmatches() 842*32332Sbostic { 843*32332Sbostic int n; 844*32332Sbostic 845*32332Sbostic flushflag = 1; 846*32332Sbostic for (n = 0; n < mcount; n++) 847*32332Sbostic printf("%s\n", matches[n]); 848*32332Sbostic mcount = 0; 849*32332Sbostic } 850*32332Sbostic 851*32332Sbostic oops(message) 852*32332Sbostic char *message; 853*32332Sbostic { 854*32332Sbostic fprintf(stderr, "%s: %s\n", progname, message); 855*32332Sbostic exit(2); 856*32332Sbostic } 857*32332Sbostic 858*32332Sbostic kernighan(args) /* "let others do the hard part ..." */ 859*32332Sbostic char *args[]; 860*32332Sbostic { 861*32332Sbostic /* 862*32332Sbostic * We may have already run grep on some of the files; remove them 863*32332Sbostic * from the arg list we pass on. Note that we can't delete them 864*32332Sbostic * totally because the number of file names affects the output 865*32332Sbostic * (automatic -h). 866*32332Sbostic */ 867*32332Sbostic /* better would be fork/exec per punted file -- jaw */ 868*32332Sbostic 869*32332Sbostic while (firstfile && optind > firstfile) 870*32332Sbostic args[firstfile++] = "/dev/null"; 871*32332Sbostic if (patind) 872*32332Sbostic args[patind] = pattern; 873*32332Sbostic fflush(stdout); 874*32332Sbostic 875*32332Sbostic if (grepflag) 876*32332Sbostic execvp(GREPSTD, args), oops("can't exec old 'grep'"); 877*32332Sbostic else if (fgrepflag) 878*32332Sbostic execvp(FGREPSTD, args), oops("can't exec old 'fgrep'"); 879*32332Sbostic else 880*32332Sbostic execvp(EGREPSTD, args), oops("can't exec old 'egrep'"); 881*32332Sbostic } 882