1 /* $OpenBSD: util.c,v 1.34 2006/12/26 20:59:23 otto Exp $ */ 2 3 /*- 4 * Copyright (c) 1999 James Howard and Dag-Erling Co�dan Sm�rgrav 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 #include <sys/types.h> 30 #include <sys/stat.h> 31 32 #include <ctype.h> 33 #include <err.h> 34 #include <errno.h> 35 #include <fts.h> 36 #include <regex.h> 37 #include <stdio.h> 38 #include <stdlib.h> 39 #include <string.h> 40 #include <unistd.h> 41 #include <zlib.h> 42 43 #include "grep.h" 44 45 /* 46 * Process a file line by line... 47 */ 48 49 static int linesqueued; 50 static int procline(str_t *l, int); 51 static int grep_search(fastgrep_t *, unsigned char *, size_t, regmatch_t *pmatch); 52 static int grep_cmp(const unsigned char *, const unsigned char *, size_t); 53 static void grep_revstr(unsigned char *, int); 54 55 int 56 grep_tree(char **argv) 57 { 58 FTS *fts; 59 FTSENT *p; 60 int c, fts_flags; 61 62 c = fts_flags = 0; 63 64 if (Hflag) 65 fts_flags = FTS_COMFOLLOW; 66 if (Pflag) 67 fts_flags = FTS_PHYSICAL; 68 if (Sflag) 69 fts_flags = FTS_LOGICAL; 70 71 fts_flags |= FTS_NOSTAT | FTS_NOCHDIR; 72 73 if (!(fts = fts_open(argv, fts_flags, NULL))) 74 err(2, NULL); 75 while ((p = fts_read(fts)) != NULL) { 76 switch (p->fts_info) { 77 case FTS_DNR: 78 break; 79 case FTS_ERR: 80 errx(2, "%s: %s", p->fts_path, strerror(p->fts_errno)); 81 break; 82 case FTS_DP: 83 break; 84 default: 85 c += procfile(p->fts_path); 86 break; 87 } 88 } 89 if (errno) 90 err(2, "fts_read"); 91 92 return c; 93 } 94 95 int 96 procfile(char *fn) 97 { 98 str_t ln; 99 file_t *f; 100 int c, t, z, nottext; 101 102 if (fn == NULL) { 103 fn = "(standard input)"; 104 f = grep_fdopen(STDIN_FILENO, "r"); 105 } else { 106 f = grep_open(fn, "r"); 107 } 108 if (f == NULL) { 109 if (!sflag) 110 warn("%s", fn); 111 return 0; 112 } 113 114 nottext = grep_bin_file(f); 115 if (nottext && binbehave == BIN_FILE_SKIP) { 116 grep_close(f); 117 return 0; 118 } 119 120 ln.file = fn; 121 ln.line_no = 0; 122 ln.len = 0; 123 linesqueued = 0; 124 tail = 0; 125 ln.off = -1; 126 127 if (Bflag > 0) 128 initqueue(); 129 for (c = 0; c == 0 || !(lflag || qflag); ) { 130 ln.off += ln.len + 1; 131 if ((ln.dat = grep_fgetln(f, &ln.len)) == NULL) 132 break; 133 if (ln.len > 0 && ln.dat[ln.len - 1] == '\n') 134 --ln.len; 135 ln.line_no++; 136 137 z = tail; 138 139 if ((t = procline(&ln, nottext)) == 0 && Bflag > 0 && z == 0) { 140 enqueue(&ln); 141 linesqueued++; 142 } 143 c += t; 144 } 145 if (Bflag > 0) 146 clearqueue(); 147 grep_close(f); 148 149 if (cflag) { 150 if (!hflag) 151 printf("%s:", ln.file); 152 printf("%u\n", c); 153 } 154 if (lflag && c != 0) 155 printf("%s\n", fn); 156 if (Lflag && c == 0) 157 printf("%s\n", fn); 158 if (c && !cflag && !lflag && !Lflag && 159 binbehave == BIN_FILE_BIN && nottext && !qflag) 160 printf("Binary file %s matches\n", fn); 161 162 return c; 163 } 164 165 166 /* 167 * Process an individual line in a file. Return non-zero if it matches. 168 */ 169 170 #define isword(x) (isalnum(x) || (x) == '_') 171 172 static int 173 procline(str_t *l, int nottext) 174 { 175 regmatch_t pmatch; 176 int c, i, r; 177 178 if (matchall) { 179 c = !vflag; 180 goto print; 181 } 182 183 for (c = i = 0; i < patterns; i++) { 184 if (fg_pattern[i].pattern) { 185 r = grep_search(&fg_pattern[i], (unsigned char *)l->dat, 186 l->len, &pmatch); 187 } else { 188 pmatch.rm_so = 0; 189 pmatch.rm_eo = l->len; 190 r = regexec(&r_pattern[i], l->dat, 1, &pmatch, eflags); 191 } 192 if (r == 0 && xflag) { 193 if (pmatch.rm_so != 0 || pmatch.rm_eo != l->len) 194 r = REG_NOMATCH; 195 } 196 if (r == 0) { 197 c++; 198 break; 199 } 200 } 201 if (vflag) 202 c = !c; 203 204 print: 205 if (c && binbehave == BIN_FILE_BIN && nottext) 206 return c; /* Binary file */ 207 208 if ((tail > 0 || c) && !cflag && !qflag) { 209 if (c) { 210 if (first > 0 && tail == 0 && (Bflag < linesqueued) && 211 (Aflag || Bflag)) 212 printf("--\n"); 213 first = 1; 214 tail = Aflag; 215 if (Bflag > 0) 216 printqueue(); 217 linesqueued = 0; 218 printline(l, ':'); 219 } else { 220 printline(l, '-'); 221 tail--; 222 } 223 } 224 return c; 225 } 226 227 void 228 fgrepcomp(fastgrep_t *fg, const char *pattern) 229 { 230 int i; 231 232 /* Initialize. */ 233 fg->patternLen = strlen(pattern); 234 fg->bol = 0; 235 fg->eol = 0; 236 fg->wmatch = wflag; 237 fg->reversedSearch = 0; 238 239 /* 240 * Make a copy and upper case it for later if in -i mode, 241 * else just copy the pointer. 242 */ 243 if (iflag) { 244 fg->pattern = grep_malloc(fg->patternLen + 1); 245 for (i = 0; i < fg->patternLen; i++) 246 fg->pattern[i] = toupper(pattern[i]); 247 fg->pattern[fg->patternLen] = '\0'; 248 } else 249 fg->pattern = (unsigned char *)pattern; /* really const */ 250 251 /* Preprocess pattern. */ 252 for (i = 0; i <= UCHAR_MAX; i++) 253 fg->qsBc[i] = fg->patternLen; 254 for (i = 1; i < fg->patternLen; i++) { 255 fg->qsBc[fg->pattern[i]] = fg->patternLen - i; 256 /* 257 * If case is ignored, make the jump apply to both upper and 258 * lower cased characters. As the pattern is stored in upper 259 * case, apply the same to the lower case equivalents. 260 */ 261 if (iflag) 262 fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i; 263 } 264 } 265 266 /* 267 * Returns: -1 on failure, 0 on success 268 */ 269 int 270 fastcomp(fastgrep_t *fg, const char *pattern) 271 { 272 int i; 273 int bol = 0; 274 int eol = 0; 275 int shiftPatternLen; 276 int hasDot = 0; 277 int firstHalfDot = -1; 278 int firstLastHalfDot = -1; 279 int lastHalfDot = 0; 280 281 /* Initialize. */ 282 fg->patternLen = strlen(pattern); 283 fg->bol = 0; 284 fg->eol = 0; 285 fg->wmatch = 0; 286 fg->reversedSearch = 0; 287 288 /* Remove end-of-line character ('$'). */ 289 if (pattern[fg->patternLen - 1] == '$') { 290 eol++; 291 fg->eol = 1; 292 fg->patternLen--; 293 } 294 295 /* Remove beginning-of-line character ('^'). */ 296 if (pattern[0] == '^') { 297 bol++; 298 fg->bol = 1; 299 fg->patternLen--; 300 } 301 302 /* Remove enclosing [[:<:]] and [[:>:]] (word match). */ 303 if (wflag) { 304 /* basic re's use \( \), extended re's ( ) */ 305 int extra = Eflag ? 1 : 2; 306 fg->patternLen -= 14 + 2 * extra; 307 fg->wmatch = 7 + extra; 308 } else if (fg->patternLen >= 14 && 309 strncmp(pattern + fg->bol, "[[:<:]]", 7) == 0 && 310 strncmp(pattern + fg->bol + fg->patternLen - 7, "[[:>:]]", 7) == 0) { 311 fg->patternLen -= 14; 312 fg->wmatch = 7; 313 } 314 315 /* 316 * Copy pattern minus '^' and '$' characters as well as word 317 * match character classes at the beginning and ending of the 318 * string respectively. 319 */ 320 fg->pattern = grep_malloc(fg->patternLen + 1); 321 memcpy(fg->pattern, pattern + bol + fg->wmatch, fg->patternLen); 322 fg->pattern[fg->patternLen] = '\0'; 323 324 /* Look for ways to cheat...er...avoid the full regex engine. */ 325 for (i = 0; i < fg->patternLen; i++) 326 { 327 /* Can still cheat? */ 328 if ((isalnum(fg->pattern[i])) || isspace(fg->pattern[i]) || 329 (fg->pattern[i] == '_') || (fg->pattern[i] == ',') || 330 (fg->pattern[i] == '^') || (fg->pattern[i] == '$') || 331 (fg->pattern[i] == '=') || (fg->pattern[i] == '-') || 332 (fg->pattern[i] == ':') || (fg->pattern[i] == '/')) { 333 /* As long as it is good, upper case it for later. */ 334 if (iflag) 335 fg->pattern[i] = toupper(fg->pattern[i]); 336 } else if (fg->pattern[i] == '.') { 337 hasDot = i; 338 if (i < fg->patternLen / 2) { 339 if (firstHalfDot < 0) 340 /* Closest dot to the beginning */ 341 firstHalfDot = i; 342 } else { 343 /* Closest dot to the end of the pattern. */ 344 lastHalfDot = i; 345 if (firstLastHalfDot < 0) 346 firstLastHalfDot = i; 347 } 348 } else { 349 /* Free memory and let others know this is empty. */ 350 free(fg->pattern); 351 fg->pattern = NULL; 352 return (-1); 353 } 354 } 355 356 /* 357 * Determine if a reverse search would be faster based on the placement 358 * of the dots. 359 */ 360 if ((!(lflag || cflag)) && ((!(bol || eol)) && 361 ((lastHalfDot) && ((firstHalfDot < 0) || 362 ((fg->patternLen - (lastHalfDot + 1)) < firstHalfDot))))) { 363 fg->reversedSearch = 1; 364 hasDot = fg->patternLen - (firstHalfDot < 0 ? 365 firstLastHalfDot : firstHalfDot) - 1; 366 grep_revstr(fg->pattern, fg->patternLen); 367 } 368 369 /* 370 * Normal Quick Search would require a shift based on the position the 371 * next character after the comparison is within the pattern. With 372 * wildcards, the position of the last dot effects the maximum shift 373 * distance. 374 * The closer to the end the wild card is the slower the search. A 375 * reverse version of this algorithm would be useful for wildcards near 376 * the end of the string. 377 * 378 * Examples: 379 * Pattern Max shift 380 * ------- --------- 381 * this 5 382 * .his 4 383 * t.is 3 384 * th.s 2 385 * thi. 1 386 */ 387 388 /* Adjust the shift based on location of the last dot ('.'). */ 389 shiftPatternLen = fg->patternLen - hasDot; 390 391 /* Preprocess pattern. */ 392 for (i = 0; i <= UCHAR_MAX; i++) 393 fg->qsBc[i] = shiftPatternLen; 394 for (i = hasDot + 1; i < fg->patternLen; i++) { 395 fg->qsBc[fg->pattern[i]] = fg->patternLen - i; 396 /* 397 * If case is ignored, make the jump apply to both upper and 398 * lower cased characters. As the pattern is stored in upper 399 * case, apply the same to the lower case equivalents. 400 */ 401 if (iflag) 402 fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i; 403 } 404 405 /* 406 * Put pattern back to normal after pre-processing to allow for easy 407 * comparisons later. 408 */ 409 if (fg->reversedSearch) 410 grep_revstr(fg->pattern, fg->patternLen); 411 412 return (0); 413 } 414 415 /* 416 * Word boundaries using regular expressions are defined as the point 417 * of transition from a non-word char to a word char, or vice versa. 418 * This means that grep -w +a and grep -w a+ never match anything, 419 * because they lack a starting or ending transition, but grep -w a+b 420 * does match a line containing a+b. 421 */ 422 #define wmatch(d, l, s, e) \ 423 ((s == 0 || !isword(d[s-1])) && (e == l || !isword(d[e])) && \ 424 e > s && isword(d[s]) && isword(d[e-1])) 425 426 static int 427 grep_search(fastgrep_t *fg, unsigned char *data, size_t dataLen, regmatch_t *pmatch) 428 { 429 int j; 430 int rtrnVal = REG_NOMATCH; 431 432 pmatch->rm_so = -1; 433 pmatch->rm_eo = -1; 434 435 /* No point in going farther if we do not have enough data. */ 436 if (dataLen < fg->patternLen) 437 return (rtrnVal); 438 439 /* Only try once at the beginning or ending of the line. */ 440 if (fg->bol || fg->eol) { 441 /* Simple text comparison. */ 442 /* Verify data is >= pattern length before searching on it. */ 443 if (dataLen >= fg->patternLen) { 444 /* Determine where in data to start search at. */ 445 if (fg->eol) 446 j = dataLen - fg->patternLen; 447 else 448 j = 0; 449 if (!((fg->bol && fg->eol) && (dataLen != fg->patternLen))) 450 if (grep_cmp(fg->pattern, data + j, 451 fg->patternLen) == -1) { 452 pmatch->rm_so = j; 453 pmatch->rm_eo = j + fg->patternLen; 454 if (!fg->wmatch || wmatch(data, dataLen, 455 pmatch->rm_so, pmatch->rm_eo)) 456 rtrnVal = 0; 457 } 458 } 459 } else if (fg->reversedSearch) { 460 /* Quick Search algorithm. */ 461 j = dataLen; 462 do { 463 if (grep_cmp(fg->pattern, data + j - fg->patternLen, 464 fg->patternLen) == -1) { 465 pmatch->rm_so = j - fg->patternLen; 466 pmatch->rm_eo = j; 467 if (!fg->wmatch || wmatch(data, dataLen, 468 pmatch->rm_so, pmatch->rm_eo)) { 469 rtrnVal = 0; 470 break; 471 } 472 } 473 /* Shift if within bounds, otherwise, we are done. */ 474 if (j == fg->patternLen) 475 break; 476 j -= fg->qsBc[data[j - fg->patternLen - 1]]; 477 } while (j >= fg->patternLen); 478 } else { 479 /* Quick Search algorithm. */ 480 j = 0; 481 do { 482 if (grep_cmp(fg->pattern, data + j, fg->patternLen) == -1) { 483 pmatch->rm_so = j; 484 pmatch->rm_eo = j + fg->patternLen; 485 if (fg->patternLen == 0 || !fg->wmatch || 486 wmatch(data, dataLen, pmatch->rm_so, 487 pmatch->rm_eo)) { 488 rtrnVal = 0; 489 break; 490 } 491 } 492 493 /* Shift if within bounds, otherwise, we are done. */ 494 if (j + fg->patternLen == dataLen) 495 break; 496 else 497 j += fg->qsBc[data[j + fg->patternLen]]; 498 } while (j <= (dataLen - fg->patternLen)); 499 } 500 501 return (rtrnVal); 502 } 503 504 505 void * 506 grep_malloc(size_t size) 507 { 508 void *ptr; 509 510 if ((ptr = malloc(size)) == NULL) 511 err(2, "malloc"); 512 return ptr; 513 } 514 515 void * 516 grep_realloc(void *ptr, size_t size) 517 { 518 if ((ptr = realloc(ptr, size)) == NULL) 519 err(2, "realloc"); 520 return ptr; 521 } 522 523 /* 524 * Returns: i >= 0 on failure (position that it failed) 525 * -1 on success 526 */ 527 static int 528 grep_cmp(const unsigned char *pattern, const unsigned char *data, size_t len) 529 { 530 int i; 531 532 for (i = 0; i < len; i++) { 533 if (((pattern[i] == data[i]) || (!Fflag && pattern[i] == '.')) 534 || (iflag && pattern[i] == toupper(data[i]))) 535 continue; 536 return (i); 537 } 538 539 return (-1); 540 } 541 542 static void 543 grep_revstr(unsigned char *str, int len) 544 { 545 int i; 546 char c; 547 548 for (i = 0; i < len / 2; i++) { 549 c = str[i]; 550 str[i] = str[len - i - 1]; 551 str[len - i - 1] = c; 552 } 553 } 554 555 void 556 printline(str_t *line, int sep) 557 { 558 int n; 559 560 n = 0; 561 if (!hflag) { 562 fputs(line->file, stdout); 563 ++n; 564 } 565 if (nflag) { 566 if (n) 567 putchar(sep); 568 printf("%d", line->line_no); 569 ++n; 570 } 571 if (bflag) { 572 if (n) 573 putchar(sep); 574 printf("%lld", (long long)line->off); 575 ++n; 576 } 577 if (n) 578 putchar(sep); 579 fwrite(line->dat, line->len, 1, stdout); 580 putchar('\n'); 581 } 582