1 /* $OpenBSD: util.c,v 1.36 2007/10/02 17:59:18 otto Exp $ */ 2 3 /*- 4 * Copyright (c) 1999 James Howard and Dag-Erling Co�dan Sm�rgrav 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 #include <sys/types.h> 30 #include <sys/stat.h> 31 32 #include <ctype.h> 33 #include <err.h> 34 #include <errno.h> 35 #include <fts.h> 36 #include <regex.h> 37 #include <stdio.h> 38 #include <stdlib.h> 39 #include <string.h> 40 #include <unistd.h> 41 #include <zlib.h> 42 43 #include "grep.h" 44 45 /* 46 * Process a file line by line... 47 */ 48 49 static int linesqueued; 50 static int procline(str_t *l, int); 51 static int grep_search(fastgrep_t *, unsigned char *, size_t, regmatch_t *pmatch); 52 static int grep_cmp(const unsigned char *, const unsigned char *, size_t); 53 static void grep_revstr(unsigned char *, int); 54 55 int 56 grep_tree(char **argv) 57 { 58 FTS *fts; 59 FTSENT *p; 60 int c, fts_flags; 61 62 c = fts_flags = 0; 63 64 if (Hflag) 65 fts_flags = FTS_COMFOLLOW; 66 if (Pflag) 67 fts_flags = FTS_PHYSICAL; 68 if (Sflag) 69 fts_flags = FTS_LOGICAL; 70 71 fts_flags |= FTS_NOSTAT | FTS_NOCHDIR; 72 73 if (!(fts = fts_open(argv, fts_flags, NULL))) 74 err(2, NULL); 75 while ((p = fts_read(fts)) != NULL) { 76 switch (p->fts_info) { 77 case FTS_DNR: 78 break; 79 case FTS_ERR: 80 errx(2, "%s: %s", p->fts_path, strerror(p->fts_errno)); 81 break; 82 case FTS_DP: 83 break; 84 default: 85 c += procfile(p->fts_path); 86 break; 87 } 88 } 89 if (errno) 90 err(2, "fts_read"); 91 92 return c; 93 } 94 95 int 96 procfile(char *fn) 97 { 98 str_t ln; 99 file_t *f; 100 int c, t, z, nottext; 101 102 if (fn == NULL) { 103 fn = "(standard input)"; 104 f = grep_fdopen(STDIN_FILENO, "r"); 105 } else { 106 f = grep_open(fn, "r"); 107 } 108 if (f == NULL) { 109 if (!sflag) 110 warn("%s", fn); 111 return 0; 112 } 113 114 nottext = grep_bin_file(f); 115 if (nottext && binbehave == BIN_FILE_SKIP) { 116 grep_close(f); 117 return 0; 118 } 119 120 ln.file = fn; 121 ln.line_no = 0; 122 ln.len = 0; 123 linesqueued = 0; 124 tail = 0; 125 ln.off = -1; 126 127 if (Bflag > 0) 128 initqueue(); 129 for (c = 0; c == 0 || !(lflag || qflag); ) { 130 ln.off += ln.len + 1; 131 if ((ln.dat = grep_fgetln(f, &ln.len)) == NULL) 132 break; 133 if (ln.len > 0 && ln.dat[ln.len - 1] == '\n') 134 --ln.len; 135 ln.line_no++; 136 137 z = tail; 138 139 if ((t = procline(&ln, nottext)) == 0 && Bflag > 0 && z == 0) { 140 enqueue(&ln); 141 linesqueued++; 142 } 143 c += t; 144 } 145 if (Bflag > 0) 146 clearqueue(); 147 grep_close(f); 148 149 if (cflag) { 150 if (!hflag) 151 printf("%s:", ln.file); 152 printf("%u\n", c); 153 } 154 if (lflag && c != 0) 155 printf("%s\n", fn); 156 if (Lflag && c == 0) 157 printf("%s\n", fn); 158 if (c && !cflag && !lflag && !Lflag && 159 binbehave == BIN_FILE_BIN && nottext && !qflag) 160 printf("Binary file %s matches\n", fn); 161 162 return c; 163 } 164 165 166 /* 167 * Process an individual line in a file. Return non-zero if it matches. 168 */ 169 170 #define isword(x) (isalnum(x) || (x) == '_') 171 172 static int 173 procline(str_t *l, int nottext) 174 { 175 regmatch_t pmatch; 176 int c, i, r; 177 178 if (matchall) { 179 c = !vflag; 180 goto print; 181 } 182 183 for (c = i = 0; i < patterns; i++) { 184 if (fg_pattern[i].pattern) { 185 r = grep_search(&fg_pattern[i], (unsigned char *)l->dat, 186 l->len, &pmatch); 187 } else { 188 pmatch.rm_so = 0; 189 pmatch.rm_eo = l->len; 190 r = regexec(&r_pattern[i], l->dat, 1, &pmatch, eflags); 191 } 192 if (r == 0 && xflag) { 193 if (pmatch.rm_so != 0 || pmatch.rm_eo != l->len) 194 r = REG_NOMATCH; 195 } 196 if (r == 0) { 197 c++; 198 break; 199 } 200 } 201 if (vflag) 202 c = !c; 203 204 print: 205 if (c && binbehave == BIN_FILE_BIN && nottext) 206 return c; /* Binary file */ 207 208 if ((tail > 0 || c) && !cflag && !qflag) { 209 if (c) { 210 if (first > 0 && tail == 0 && (Bflag < linesqueued) && 211 (Aflag || Bflag)) 212 printf("--\n"); 213 first = 1; 214 tail = Aflag; 215 if (Bflag > 0) 216 printqueue(); 217 linesqueued = 0; 218 printline(l, ':'); 219 } else { 220 printline(l, '-'); 221 tail--; 222 } 223 } 224 return c; 225 } 226 227 void 228 fgrepcomp(fastgrep_t *fg, const char *pattern) 229 { 230 int i; 231 232 /* Initialize. */ 233 fg->patternLen = strlen(pattern); 234 fg->bol = 0; 235 fg->eol = 0; 236 fg->wmatch = wflag; 237 fg->reversedSearch = 0; 238 239 /* 240 * Make a copy and upper case it for later if in -i mode, 241 * else just copy the pointer. 242 */ 243 if (iflag) { 244 fg->pattern = grep_malloc(fg->patternLen + 1); 245 for (i = 0; i < fg->patternLen; i++) 246 fg->pattern[i] = toupper(pattern[i]); 247 fg->pattern[fg->patternLen] = '\0'; 248 } else 249 fg->pattern = (unsigned char *)pattern; /* really const */ 250 251 /* Preprocess pattern. */ 252 for (i = 0; i <= UCHAR_MAX; i++) 253 fg->qsBc[i] = fg->patternLen; 254 for (i = 1; i < fg->patternLen; i++) { 255 fg->qsBc[fg->pattern[i]] = fg->patternLen - i; 256 /* 257 * If case is ignored, make the jump apply to both upper and 258 * lower cased characters. As the pattern is stored in upper 259 * case, apply the same to the lower case equivalents. 260 */ 261 if (iflag) 262 fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i; 263 } 264 } 265 266 /* 267 * Returns: -1 on failure, 0 on success 268 */ 269 int 270 fastcomp(fastgrep_t *fg, const char *pattern) 271 { 272 int i; 273 int bol = 0; 274 int eol = 0; 275 int shiftPatternLen; 276 int hasDot = 0; 277 int firstHalfDot = -1; 278 int firstLastHalfDot = -1; 279 int lastHalfDot = 0; 280 281 /* Initialize. */ 282 fg->patternLen = strlen(pattern); 283 fg->bol = 0; 284 fg->eol = 0; 285 fg->wmatch = 0; 286 fg->reversedSearch = 0; 287 288 /* Remove end-of-line character ('$'). */ 289 if (pattern[fg->patternLen - 1] == '$') { 290 eol++; 291 fg->eol = 1; 292 fg->patternLen--; 293 } 294 295 /* Remove beginning-of-line character ('^'). */ 296 if (pattern[0] == '^') { 297 bol++; 298 fg->bol = 1; 299 fg->patternLen--; 300 } 301 302 /* Remove enclosing [[:<:]] and [[:>:]] (word match). */ 303 if (wflag) { 304 /* basic re's use \( \), extended re's ( ) */ 305 int extra = Eflag ? 1 : 2; 306 fg->patternLen -= 14 + 2 * extra; 307 fg->wmatch = 7 + extra; 308 } else if (fg->patternLen >= 14 && 309 strncmp(pattern + fg->bol, "[[:<:]]", 7) == 0 && 310 strncmp(pattern + fg->bol + fg->patternLen - 7, "[[:>:]]", 7) == 0) { 311 fg->patternLen -= 14; 312 fg->wmatch = 7; 313 } 314 315 /* 316 * Copy pattern minus '^' and '$' characters as well as word 317 * match character classes at the beginning and ending of the 318 * string respectively. 319 */ 320 fg->pattern = grep_malloc(fg->patternLen + 1); 321 memcpy(fg->pattern, pattern + bol + fg->wmatch, fg->patternLen); 322 fg->pattern[fg->patternLen] = '\0'; 323 324 /* Look for ways to cheat...er...avoid the full regex engine. */ 325 for (i = 0; i < fg->patternLen; i++) 326 { 327 /* Can still cheat? */ 328 if ((isalnum(fg->pattern[i])) || isspace(fg->pattern[i]) || 329 (fg->pattern[i] == '_') || (fg->pattern[i] == ',') || 330 (fg->pattern[i] == '=') || (fg->pattern[i] == '-') || 331 (fg->pattern[i] == ':') || (fg->pattern[i] == '/')) { 332 /* As long as it is good, upper case it for later. */ 333 if (iflag) 334 fg->pattern[i] = toupper(fg->pattern[i]); 335 } else if (fg->pattern[i] == '.') { 336 hasDot = i; 337 if (i < fg->patternLen / 2) { 338 if (firstHalfDot < 0) 339 /* Closest dot to the beginning */ 340 firstHalfDot = i; 341 } else { 342 /* Closest dot to the end of the pattern. */ 343 lastHalfDot = i; 344 if (firstLastHalfDot < 0) 345 firstLastHalfDot = i; 346 } 347 } else { 348 /* Free memory and let others know this is empty. */ 349 free(fg->pattern); 350 fg->pattern = NULL; 351 return (-1); 352 } 353 } 354 355 /* 356 * Determine if a reverse search would be faster based on the placement 357 * of the dots. 358 */ 359 if ((!(lflag || cflag)) && ((!(bol || eol)) && 360 ((lastHalfDot) && ((firstHalfDot < 0) || 361 ((fg->patternLen - (lastHalfDot + 1)) < firstHalfDot))))) { 362 fg->reversedSearch = 1; 363 hasDot = fg->patternLen - (firstHalfDot < 0 ? 364 firstLastHalfDot : firstHalfDot) - 1; 365 grep_revstr(fg->pattern, fg->patternLen); 366 } 367 368 /* 369 * Normal Quick Search would require a shift based on the position the 370 * next character after the comparison is within the pattern. With 371 * wildcards, the position of the last dot effects the maximum shift 372 * distance. 373 * The closer to the end the wild card is the slower the search. A 374 * reverse version of this algorithm would be useful for wildcards near 375 * the end of the string. 376 * 377 * Examples: 378 * Pattern Max shift 379 * ------- --------- 380 * this 5 381 * .his 4 382 * t.is 3 383 * th.s 2 384 * thi. 1 385 */ 386 387 /* Adjust the shift based on location of the last dot ('.'). */ 388 shiftPatternLen = fg->patternLen - hasDot; 389 390 /* Preprocess pattern. */ 391 for (i = 0; i <= UCHAR_MAX; i++) 392 fg->qsBc[i] = shiftPatternLen; 393 for (i = hasDot + 1; i < fg->patternLen; i++) { 394 fg->qsBc[fg->pattern[i]] = fg->patternLen - i; 395 /* 396 * If case is ignored, make the jump apply to both upper and 397 * lower cased characters. As the pattern is stored in upper 398 * case, apply the same to the lower case equivalents. 399 */ 400 if (iflag) 401 fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i; 402 } 403 404 /* 405 * Put pattern back to normal after pre-processing to allow for easy 406 * comparisons later. 407 */ 408 if (fg->reversedSearch) 409 grep_revstr(fg->pattern, fg->patternLen); 410 411 return (0); 412 } 413 414 /* 415 * Word boundaries using regular expressions are defined as the point 416 * of transition from a non-word char to a word char, or vice versa. 417 * This means that grep -w +a and grep -w a+ never match anything, 418 * because they lack a starting or ending transition, but grep -w a+b 419 * does match a line containing a+b. 420 */ 421 #define wmatch(d, l, s, e) \ 422 ((s == 0 || !isword(d[s-1])) && (e == l || !isword(d[e])) && \ 423 e > s && isword(d[s]) && isword(d[e-1])) 424 425 static int 426 grep_search(fastgrep_t *fg, unsigned char *data, size_t dataLen, regmatch_t *pmatch) 427 { 428 int j; 429 int rtrnVal = REG_NOMATCH; 430 431 pmatch->rm_so = -1; 432 pmatch->rm_eo = -1; 433 434 /* No point in going farther if we do not have enough data. */ 435 if (dataLen < fg->patternLen) 436 return (rtrnVal); 437 438 /* Only try once at the beginning or ending of the line. */ 439 if (fg->bol || fg->eol) { 440 /* Simple text comparison. */ 441 /* Verify data is >= pattern length before searching on it. */ 442 if (dataLen >= fg->patternLen) { 443 /* Determine where in data to start search at. */ 444 if (fg->eol) 445 j = dataLen - fg->patternLen; 446 else 447 j = 0; 448 if (!((fg->bol && fg->eol) && (dataLen != fg->patternLen))) 449 if (grep_cmp(fg->pattern, data + j, 450 fg->patternLen) == -1) { 451 pmatch->rm_so = j; 452 pmatch->rm_eo = j + fg->patternLen; 453 if (!fg->wmatch || wmatch(data, dataLen, 454 pmatch->rm_so, pmatch->rm_eo)) 455 rtrnVal = 0; 456 } 457 } 458 } else if (fg->reversedSearch) { 459 /* Quick Search algorithm. */ 460 j = dataLen; 461 do { 462 if (grep_cmp(fg->pattern, data + j - fg->patternLen, 463 fg->patternLen) == -1) { 464 pmatch->rm_so = j - fg->patternLen; 465 pmatch->rm_eo = j; 466 if (!fg->wmatch || wmatch(data, dataLen, 467 pmatch->rm_so, pmatch->rm_eo)) { 468 rtrnVal = 0; 469 break; 470 } 471 } 472 /* Shift if within bounds, otherwise, we are done. */ 473 if (j == fg->patternLen) 474 break; 475 j -= fg->qsBc[data[j - fg->patternLen - 1]]; 476 } while (j >= fg->patternLen); 477 } else { 478 /* Quick Search algorithm. */ 479 j = 0; 480 do { 481 if (grep_cmp(fg->pattern, data + j, fg->patternLen) == -1) { 482 pmatch->rm_so = j; 483 pmatch->rm_eo = j + fg->patternLen; 484 if (fg->patternLen == 0 || !fg->wmatch || 485 wmatch(data, dataLen, pmatch->rm_so, 486 pmatch->rm_eo)) { 487 rtrnVal = 0; 488 break; 489 } 490 } 491 492 /* Shift if within bounds, otherwise, we are done. */ 493 if (j + fg->patternLen == dataLen) 494 break; 495 else 496 j += fg->qsBc[data[j + fg->patternLen]]; 497 } while (j <= (dataLen - fg->patternLen)); 498 } 499 500 return (rtrnVal); 501 } 502 503 504 void * 505 grep_malloc(size_t size) 506 { 507 void *ptr; 508 509 if ((ptr = malloc(size)) == NULL) 510 err(2, "malloc"); 511 return ptr; 512 } 513 514 void * 515 grep_calloc(size_t nmemb, size_t size) 516 { 517 void *ptr; 518 519 if ((ptr = calloc(nmemb, size)) == NULL) 520 err(2, "calloc"); 521 return ptr; 522 } 523 524 void * 525 grep_realloc(void *ptr, size_t size) 526 { 527 if ((ptr = realloc(ptr, size)) == NULL) 528 err(2, "realloc"); 529 return ptr; 530 } 531 532 /* 533 * Returns: i >= 0 on failure (position that it failed) 534 * -1 on success 535 */ 536 static int 537 grep_cmp(const unsigned char *pattern, const unsigned char *data, size_t len) 538 { 539 int i; 540 541 for (i = 0; i < len; i++) { 542 if (((pattern[i] == data[i]) || (!Fflag && pattern[i] == '.')) 543 || (iflag && pattern[i] == toupper(data[i]))) 544 continue; 545 return (i); 546 } 547 548 return (-1); 549 } 550 551 static void 552 grep_revstr(unsigned char *str, int len) 553 { 554 int i; 555 char c; 556 557 for (i = 0; i < len / 2; i++) { 558 c = str[i]; 559 str[i] = str[len - i - 1]; 560 str[len - i - 1] = c; 561 } 562 } 563 564 void 565 printline(str_t *line, int sep) 566 { 567 int n; 568 569 n = 0; 570 if (!hflag) { 571 fputs(line->file, stdout); 572 ++n; 573 } 574 if (nflag) { 575 if (n) 576 putchar(sep); 577 printf("%d", line->line_no); 578 ++n; 579 } 580 if (bflag) { 581 if (n) 582 putchar(sep); 583 printf("%lld", (long long)line->off); 584 ++n; 585 } 586 if (n) 587 putchar(sep); 588 fwrite(line->dat, line->len, 1, stdout); 589 putchar('\n'); 590 } 591