1 /* $OpenBSD: util.c,v 1.42 2011/07/17 19:39:21 aschrijver Exp $ */ 2 3 /*- 4 * Copyright (c) 1999 James Howard and Dag-Erling Co�dan Sm�rgrav 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 #include <sys/types.h> 30 #include <sys/stat.h> 31 32 #include <ctype.h> 33 #include <err.h> 34 #include <errno.h> 35 #include <fts.h> 36 #include <regex.h> 37 #include <stdio.h> 38 #include <stdlib.h> 39 #include <string.h> 40 #include <unistd.h> 41 #include <zlib.h> 42 43 #include "grep.h" 44 45 /* 46 * Process a file line by line... 47 */ 48 49 static int linesqueued; 50 static int procline(str_t *l, int); 51 static int grep_search(fastgrep_t *, char *, size_t, regmatch_t *pmatch); 52 #ifndef SMALL 53 static int grep_cmp(const char *, const char *, size_t); 54 static void grep_revstr(unsigned char *, int); 55 #endif 56 57 int 58 grep_tree(char **argv) 59 { 60 FTS *fts; 61 FTSENT *p; 62 int c, fts_flags; 63 64 c = 0; 65 66 fts_flags = FTS_PHYSICAL | FTS_NOSTAT | FTS_NOCHDIR; 67 68 if (!(fts = fts_open(argv, fts_flags, NULL))) 69 err(2, NULL); 70 while ((p = fts_read(fts)) != NULL) { 71 switch (p->fts_info) { 72 case FTS_DNR: 73 break; 74 case FTS_ERR: 75 errx(2, "%s: %s", p->fts_path, strerror(p->fts_errno)); 76 break; 77 case FTS_DP: 78 break; 79 default: 80 c += procfile(p->fts_path); 81 break; 82 } 83 } 84 if (errno) 85 err(2, "fts_read"); 86 87 return c; 88 } 89 90 int 91 procfile(char *fn) 92 { 93 str_t ln; 94 file_t *f; 95 int c, t, z, nottext; 96 97 if (fn == NULL) { 98 fn = "(standard input)"; 99 f = grep_fdopen(STDIN_FILENO, "r"); 100 } else { 101 f = grep_open(fn, "r"); 102 } 103 if (f == NULL) { 104 if (!sflag) 105 warn("%s", fn); 106 return 0; 107 } 108 109 nottext = grep_bin_file(f); 110 if (nottext && binbehave == BIN_FILE_SKIP) { 111 grep_close(f); 112 return 0; 113 } 114 115 ln.file = fn; 116 ln.line_no = 0; 117 ln.len = 0; 118 linesqueued = 0; 119 tail = 0; 120 ln.off = -1; 121 122 if (Bflag > 0) 123 initqueue(); 124 for (c = 0; c == 0 || !(lflag || qflag); ) { 125 ln.off += ln.len + 1; 126 if ((ln.dat = grep_fgetln(f, &ln.len)) == NULL) 127 break; 128 if (ln.len > 0 && ln.dat[ln.len - 1] == '\n') 129 --ln.len; 130 ln.line_no++; 131 132 z = tail; 133 134 if ((t = procline(&ln, nottext)) == 0 && Bflag > 0 && z == 0) { 135 enqueue(&ln); 136 linesqueued++; 137 } 138 c += t; 139 } 140 if (Bflag > 0) 141 clearqueue(); 142 grep_close(f); 143 144 if (cflag) { 145 if (!hflag) 146 printf("%s:", ln.file); 147 printf("%u\n", c); 148 } 149 if (lflag && c != 0) 150 printf("%s\n", fn); 151 if (Lflag && c == 0) 152 printf("%s\n", fn); 153 if (c && !cflag && !lflag && !Lflag && 154 binbehave == BIN_FILE_BIN && nottext && !qflag) 155 printf("Binary file %s matches\n", fn); 156 157 return c; 158 } 159 160 161 /* 162 * Process an individual line in a file. Return non-zero if it matches. 163 */ 164 165 #define isword(x) (isalnum(x) || (x) == '_') 166 167 static int 168 procline(str_t *l, int nottext) 169 { 170 regmatch_t pmatch; 171 int c, i, r; 172 regoff_t offset; 173 174 /* size_t will be converted to regoff_t. ssize_t is guaranteed to fit 175 * into regoff_t */ 176 if (l->len > SSIZE_MAX) { 177 errx(2, "Line is too big to process"); 178 } 179 180 c = 0; 181 i = 0; 182 if (matchall) { 183 c = 1; 184 goto print; 185 } 186 187 for (i = 0; i < patterns; i++) { 188 offset = 0; 189 redo: 190 if (fg_pattern[i].pattern) { 191 r = grep_search(&fg_pattern[i], l->dat + offset, 192 l->len - offset, &pmatch); 193 pmatch.rm_so += offset; 194 pmatch.rm_eo += offset; 195 } else { 196 pmatch.rm_so = offset; 197 pmatch.rm_eo = l->len - offset; 198 r = regexec(&r_pattern[i], l->dat, 1, &pmatch, eflags); 199 } 200 if (r == 0 && xflag) { 201 if (pmatch.rm_so != 0 || pmatch.rm_eo != l->len) 202 r = REG_NOMATCH; 203 } 204 if (r == 0) { 205 c = 1; 206 if (oflag) 207 goto print; 208 break; 209 } 210 } 211 if (oflag) 212 return c; 213 print: 214 if (vflag) 215 c = !c; 216 217 if (c && binbehave == BIN_FILE_BIN && nottext) 218 return c; /* Binary file */ 219 220 if ((tail > 0 || c) && !cflag && !qflag) { 221 if (c) { 222 if (first > 0 && tail == 0 && (Bflag < linesqueued) && 223 (Aflag || Bflag)) 224 printf("--\n"); 225 first = 1; 226 tail = Aflag; 227 if (Bflag > 0) 228 printqueue(); 229 linesqueued = 0; 230 printline(l, ':', oflag ? &pmatch : NULL); 231 } else { 232 printline(l, '-', oflag ? &pmatch : NULL); 233 tail--; 234 } 235 } 236 if (oflag && !matchall) { 237 offset = pmatch.rm_eo; 238 goto redo; 239 } 240 return c; 241 } 242 243 #ifndef SMALL 244 void 245 fgrepcomp(fastgrep_t *fg, const char *pattern) 246 { 247 int i; 248 249 /* Initialize. */ 250 fg->patternLen = strlen(pattern); 251 fg->bol = 0; 252 fg->eol = 0; 253 fg->wmatch = wflag; 254 fg->reversedSearch = 0; 255 256 /* 257 * Make a copy and upper case it for later if in -i mode, 258 * else just copy the pointer. 259 */ 260 if (iflag) { 261 fg->pattern = grep_malloc(fg->patternLen + 1); 262 for (i = 0; i < fg->patternLen; i++) 263 fg->pattern[i] = toupper(pattern[i]); 264 fg->pattern[fg->patternLen] = '\0'; 265 } else 266 fg->pattern = (unsigned char *)pattern; /* really const */ 267 268 /* Preprocess pattern. */ 269 for (i = 0; i <= UCHAR_MAX; i++) 270 fg->qsBc[i] = fg->patternLen; 271 for (i = 1; i < fg->patternLen; i++) { 272 fg->qsBc[fg->pattern[i]] = fg->patternLen - i; 273 /* 274 * If case is ignored, make the jump apply to both upper and 275 * lower cased characters. As the pattern is stored in upper 276 * case, apply the same to the lower case equivalents. 277 */ 278 if (iflag) 279 fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i; 280 } 281 } 282 #endif 283 284 /* 285 * Returns: -1 on failure, 0 on success 286 */ 287 int 288 fastcomp(fastgrep_t *fg, const char *pattern) 289 { 290 #ifdef SMALL 291 return -1; 292 #else 293 int i; 294 int bol = 0; 295 int eol = 0; 296 int shiftPatternLen; 297 int hasDot = 0; 298 int firstHalfDot = -1; 299 int firstLastHalfDot = -1; 300 int lastHalfDot = 0; 301 302 /* Initialize. */ 303 fg->patternLen = strlen(pattern); 304 fg->bol = 0; 305 fg->eol = 0; 306 fg->wmatch = 0; 307 fg->reversedSearch = 0; 308 309 /* Remove end-of-line character ('$'). */ 310 if (fg->patternLen > 0 && pattern[fg->patternLen - 1] == '$') { 311 eol++; 312 fg->eol = 1; 313 fg->patternLen--; 314 } 315 316 /* Remove beginning-of-line character ('^'). */ 317 if (pattern[0] == '^') { 318 bol++; 319 fg->bol = 1; 320 fg->patternLen--; 321 } 322 323 /* Remove enclosing [[:<:]] and [[:>:]] (word match). */ 324 if (wflag) { 325 /* basic re's use \( \), extended re's ( ) */ 326 int extra = Eflag ? 1 : 2; 327 fg->patternLen -= 14 + 2 * extra; 328 fg->wmatch = 7 + extra; 329 } else if (fg->patternLen >= 14 && 330 strncmp(pattern + fg->bol, "[[:<:]]", 7) == 0 && 331 strncmp(pattern + fg->bol + fg->patternLen - 7, "[[:>:]]", 7) == 0) { 332 fg->patternLen -= 14; 333 fg->wmatch = 7; 334 } 335 336 /* 337 * Copy pattern minus '^' and '$' characters as well as word 338 * match character classes at the beginning and ending of the 339 * string respectively. 340 */ 341 fg->pattern = grep_malloc(fg->patternLen + 1); 342 memcpy(fg->pattern, pattern + bol + fg->wmatch, fg->patternLen); 343 fg->pattern[fg->patternLen] = '\0'; 344 345 /* Look for ways to cheat...er...avoid the full regex engine. */ 346 for (i = 0; i < fg->patternLen; i++) 347 { 348 /* Can still cheat? */ 349 if ((isalnum(fg->pattern[i])) || isspace(fg->pattern[i]) || 350 (fg->pattern[i] == '_') || (fg->pattern[i] == ',') || 351 (fg->pattern[i] == '=') || (fg->pattern[i] == '-') || 352 (fg->pattern[i] == ':') || (fg->pattern[i] == '/')) { 353 /* As long as it is good, upper case it for later. */ 354 if (iflag) 355 fg->pattern[i] = toupper(fg->pattern[i]); 356 } else if (fg->pattern[i] == '.') { 357 hasDot = i; 358 if (i < fg->patternLen / 2) { 359 if (firstHalfDot < 0) 360 /* Closest dot to the beginning */ 361 firstHalfDot = i; 362 } else { 363 /* Closest dot to the end of the pattern. */ 364 lastHalfDot = i; 365 if (firstLastHalfDot < 0) 366 firstLastHalfDot = i; 367 } 368 } else { 369 /* Free memory and let others know this is empty. */ 370 free(fg->pattern); 371 fg->pattern = NULL; 372 return (-1); 373 } 374 } 375 376 /* 377 * Determine if a reverse search would be faster based on the placement 378 * of the dots. 379 */ 380 if ((!(lflag || cflag)) && ((!(bol || eol)) && 381 ((lastHalfDot) && ((firstHalfDot < 0) || 382 ((fg->patternLen - (lastHalfDot + 1)) < firstHalfDot))))) { 383 fg->reversedSearch = 1; 384 hasDot = fg->patternLen - (firstHalfDot < 0 ? 385 firstLastHalfDot : firstHalfDot) - 1; 386 grep_revstr(fg->pattern, fg->patternLen); 387 } 388 389 /* 390 * Normal Quick Search would require a shift based on the position the 391 * next character after the comparison is within the pattern. With 392 * wildcards, the position of the last dot effects the maximum shift 393 * distance. 394 * The closer to the end the wild card is the slower the search. A 395 * reverse version of this algorithm would be useful for wildcards near 396 * the end of the string. 397 * 398 * Examples: 399 * Pattern Max shift 400 * ------- --------- 401 * this 5 402 * .his 4 403 * t.is 3 404 * th.s 2 405 * thi. 1 406 */ 407 408 /* Adjust the shift based on location of the last dot ('.'). */ 409 shiftPatternLen = fg->patternLen - hasDot; 410 411 /* Preprocess pattern. */ 412 for (i = 0; i <= UCHAR_MAX; i++) 413 fg->qsBc[i] = shiftPatternLen; 414 for (i = hasDot + 1; i < fg->patternLen; i++) { 415 fg->qsBc[fg->pattern[i]] = fg->patternLen - i; 416 /* 417 * If case is ignored, make the jump apply to both upper and 418 * lower cased characters. As the pattern is stored in upper 419 * case, apply the same to the lower case equivalents. 420 */ 421 if (iflag) 422 fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i; 423 } 424 425 /* 426 * Put pattern back to normal after pre-processing to allow for easy 427 * comparisons later. 428 */ 429 if (fg->reversedSearch) 430 grep_revstr(fg->pattern, fg->patternLen); 431 432 return (0); 433 #endif 434 } 435 436 /* 437 * Word boundaries using regular expressions are defined as the point 438 * of transition from a non-word char to a word char, or vice versa. 439 * This means that grep -w +a and grep -w a+ never match anything, 440 * because they lack a starting or ending transition, but grep -w a+b 441 * does match a line containing a+b. 442 */ 443 #define wmatch(d, l, s, e) \ 444 ((s == 0 || !isword(d[s-1])) && (e == l || !isword(d[e])) && \ 445 e > s && isword(d[s]) && isword(d[e-1])) 446 447 static int 448 grep_search(fastgrep_t *fg, char *data, size_t dataLen, regmatch_t *pmatch) 449 { 450 #ifdef SMALL 451 return 0; 452 #else 453 regoff_t j; 454 int rtrnVal = REG_NOMATCH; 455 456 pmatch->rm_so = -1; 457 pmatch->rm_eo = -1; 458 459 /* No point in going farther if we do not have enough data. */ 460 if (dataLen < fg->patternLen) 461 return (rtrnVal); 462 463 /* Only try once at the beginning or ending of the line. */ 464 if (fg->bol || fg->eol) { 465 /* Simple text comparison. */ 466 /* Verify data is >= pattern length before searching on it. */ 467 if (dataLen >= fg->patternLen) { 468 /* Determine where in data to start search at. */ 469 if (fg->eol) 470 j = dataLen - fg->patternLen; 471 else 472 j = 0; 473 if (!((fg->bol && fg->eol) && (dataLen != fg->patternLen))) 474 if (grep_cmp(fg->pattern, data + j, 475 fg->patternLen) == -1) { 476 pmatch->rm_so = j; 477 pmatch->rm_eo = j + fg->patternLen; 478 if (!fg->wmatch || wmatch(data, dataLen, 479 pmatch->rm_so, pmatch->rm_eo)) 480 rtrnVal = 0; 481 } 482 } 483 } else if (fg->reversedSearch) { 484 /* Quick Search algorithm. */ 485 j = dataLen; 486 do { 487 if (grep_cmp(fg->pattern, data + j - fg->patternLen, 488 fg->patternLen) == -1) { 489 pmatch->rm_so = j - fg->patternLen; 490 pmatch->rm_eo = j; 491 if (!fg->wmatch || wmatch(data, dataLen, 492 pmatch->rm_so, pmatch->rm_eo)) { 493 rtrnVal = 0; 494 break; 495 } 496 } 497 /* Shift if within bounds, otherwise, we are done. */ 498 if (j == fg->patternLen) 499 break; 500 j -= fg->qsBc[(unsigned char)data[j - fg->patternLen - 1]]; 501 } while (j >= fg->patternLen); 502 } else { 503 /* Quick Search algorithm. */ 504 j = 0; 505 do { 506 if (grep_cmp(fg->pattern, data + j, fg->patternLen) == -1) { 507 pmatch->rm_so = j; 508 pmatch->rm_eo = j + fg->patternLen; 509 if (fg->patternLen == 0 || !fg->wmatch || 510 wmatch(data, dataLen, pmatch->rm_so, 511 pmatch->rm_eo)) { 512 rtrnVal = 0; 513 break; 514 } 515 } 516 517 /* Shift if within bounds, otherwise, we are done. */ 518 if (j + fg->patternLen == dataLen) 519 break; 520 else 521 j += fg->qsBc[(unsigned char)data[j + fg->patternLen]]; 522 } while (j <= (dataLen - fg->patternLen)); 523 } 524 525 return (rtrnVal); 526 #endif 527 } 528 529 530 void * 531 grep_malloc(size_t size) 532 { 533 void *ptr; 534 535 if ((ptr = malloc(size)) == NULL) 536 err(2, "malloc"); 537 return ptr; 538 } 539 540 void * 541 grep_calloc(size_t nmemb, size_t size) 542 { 543 void *ptr; 544 545 if ((ptr = calloc(nmemb, size)) == NULL) 546 err(2, "calloc"); 547 return ptr; 548 } 549 550 void * 551 grep_realloc(void *ptr, size_t size) 552 { 553 if ((ptr = realloc(ptr, size)) == NULL) 554 err(2, "realloc"); 555 return ptr; 556 } 557 558 #ifndef SMALL 559 /* 560 * Returns: i >= 0 on failure (position that it failed) 561 * -1 on success 562 */ 563 static int 564 grep_cmp(const char *pattern, const char *data, size_t len) 565 { 566 int i; 567 568 for (i = 0; i < len; i++) { 569 if (((pattern[i] == data[i]) || (!Fflag && pattern[i] == '.')) 570 || (iflag && pattern[i] == toupper(data[i]))) 571 continue; 572 return (i); 573 } 574 575 return (-1); 576 } 577 578 static void 579 grep_revstr(unsigned char *str, int len) 580 { 581 int i; 582 char c; 583 584 for (i = 0; i < len / 2; i++) { 585 c = str[i]; 586 str[i] = str[len - i - 1]; 587 str[len - i - 1] = c; 588 } 589 } 590 #endif 591 592 void 593 printline(str_t *line, int sep, regmatch_t *pmatch) 594 { 595 int n; 596 597 n = 0; 598 if (!hflag) { 599 fputs(line->file, stdout); 600 ++n; 601 } 602 if (nflag) { 603 if (n) 604 putchar(sep); 605 printf("%d", line->line_no); 606 ++n; 607 } 608 if (bflag) { 609 if (n) 610 putchar(sep); 611 printf("%lld", (long long)line->off); 612 ++n; 613 } 614 if (n) 615 putchar(sep); 616 if (pmatch) 617 fwrite(line->dat + pmatch->rm_so, 618 pmatch->rm_eo - pmatch->rm_so, 1, stdout); 619 else 620 fwrite(line->dat, line->len, 1, stdout); 621 putchar('\n'); 622 } 623