1 /* $OpenBSD: util.c,v 1.62 2019/12/03 09:14:37 jca Exp $ */ 2 3 /*- 4 * Copyright (c) 1999 James Howard and Dag-Erling Co�dan Sm�rgrav 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 #include <sys/types.h> 30 #include <sys/stat.h> 31 32 #include <ctype.h> 33 #include <err.h> 34 #include <errno.h> 35 #include <fts.h> 36 #include <regex.h> 37 #include <stdbool.h> 38 #include <stdio.h> 39 #include <stdlib.h> 40 #include <string.h> 41 #include <unistd.h> 42 #include <zlib.h> 43 44 #include "grep.h" 45 46 /* 47 * Process a file line by line... 48 */ 49 50 static int linesqueued; 51 static int procline(str_t *l, int); 52 static int grep_search(fastgrep_t *, char *, size_t, regmatch_t *pmatch, int); 53 #ifndef SMALL 54 static bool grep_cmp(const char *, const char *, size_t); 55 static void grep_revstr(unsigned char *, int); 56 #endif 57 58 int 59 grep_tree(char **argv) 60 { 61 FTS *fts; 62 FTSENT *p; 63 int c, fts_flags; 64 char *dot_argv[] = { ".", NULL }; 65 char *path; 66 67 /* default to . if no path given */ 68 if (argv[0] == NULL) 69 argv = dot_argv; 70 71 c = 0; 72 73 fts_flags = FTS_PHYSICAL | FTS_NOSTAT | FTS_NOCHDIR; 74 75 if (!(fts = fts_open(argv, fts_flags, NULL))) 76 err(2, NULL); 77 while ((p = fts_read(fts)) != NULL) { 78 switch (p->fts_info) { 79 case FTS_DNR: 80 break; 81 case FTS_ERR: 82 file_err = 1; 83 if(!sflag) 84 warnc(p->fts_errno, "%s", p->fts_path); 85 break; 86 case FTS_D: 87 case FTS_DP: 88 break; 89 default: 90 path = p->fts_path; 91 /* skip "./" if implied */ 92 if (argv == dot_argv && p->fts_pathlen >= 2) 93 path += 2; 94 c += procfile(path); 95 break; 96 } 97 } 98 if (errno) 99 err(2, "fts_read"); 100 fts_close(fts); 101 return c; 102 } 103 104 int 105 procfile(char *fn) 106 { 107 str_t ln; 108 file_t *f; 109 int c, t, z, nottext; 110 111 mcount = mlimit; 112 113 if (fn == NULL) { 114 fn = "(standard input)"; 115 f = grep_fdopen(STDIN_FILENO); 116 } else { 117 f = grep_open(fn); 118 } 119 if (f == NULL) { 120 if (errno == EISDIR) 121 return 0; 122 file_err = 1; 123 if (!sflag) 124 warn("%s", fn); 125 return 0; 126 } 127 128 nottext = grep_bin_file(f); 129 if (nottext && binbehave == BIN_FILE_SKIP) { 130 grep_close(f); 131 return 0; 132 } 133 134 ln.file = fn; 135 if (labelname) 136 ln.file = (char *)labelname; 137 ln.line_no = 0; 138 ln.len = 0; 139 linesqueued = 0; 140 tail = 0; 141 ln.off = -1; 142 143 if (Bflag > 0) 144 initqueue(); 145 for (c = 0; c == 0 || !(lflag || qflag); ) { 146 if (mflag && mlimit == 0) 147 break; 148 ln.off += ln.len + 1; 149 if ((ln.dat = grep_fgetln(f, &ln.len)) == NULL) 150 break; 151 if (ln.len > 0 && ln.dat[ln.len - 1] == '\n') 152 --ln.len; 153 ln.line_no++; 154 155 z = tail; 156 157 if ((t = procline(&ln, nottext)) == 0 && Bflag > 0 && z == 0) { 158 enqueue(&ln); 159 linesqueued++; 160 } 161 c += t; 162 if (mflag && mcount <= 0) 163 break; 164 } 165 if (Bflag > 0) 166 clearqueue(); 167 grep_close(f); 168 169 if (cflag) { 170 if (!hflag) 171 printf("%s:", ln.file); 172 printf("%u\n", c); 173 } 174 if (lflag && c != 0) 175 printf("%s\n", fn); 176 if (Lflag && c == 0) 177 printf("%s\n", fn); 178 if (c && !cflag && !lflag && !Lflag && 179 binbehave == BIN_FILE_BIN && nottext && !qflag) 180 printf("Binary file %s matches\n", fn); 181 182 return c; 183 } 184 185 186 /* 187 * Process an individual line in a file. Return non-zero if it matches. 188 */ 189 190 #define isword(x) (isalnum((unsigned char)x) || (x) == '_') 191 192 static int 193 procline(str_t *l, int nottext) 194 { 195 regmatch_t pmatch = { 0 }; 196 int c, i, r; 197 regoff_t offset; 198 199 /* size_t will be converted to regoff_t. ssize_t is guaranteed to fit 200 * into regoff_t */ 201 if (l->len > SSIZE_MAX) { 202 errx(2, "Line is too big to process"); 203 } 204 205 c = 0; 206 i = 0; 207 if (matchall) { 208 c = 1; 209 goto print; 210 } 211 212 for (i = 0; i < patterns; i++) { 213 offset = 0; 214 redo: 215 if (fg_pattern[i].pattern) { 216 int flags = 0; 217 if (offset) 218 flags |= REG_NOTBOL; 219 r = grep_search(&fg_pattern[i], l->dat + offset, 220 l->len - offset, &pmatch, flags); 221 pmatch.rm_so += offset; 222 pmatch.rm_eo += offset; 223 } else { 224 int flags = eflags; 225 if (offset) 226 flags |= REG_NOTBOL; 227 pmatch.rm_so = offset; 228 pmatch.rm_eo = l->len; 229 r = regexec(&r_pattern[i], l->dat, 1, &pmatch, flags); 230 } 231 if (r == 0 && xflag) { 232 if (pmatch.rm_so != 0 || pmatch.rm_eo != l->len) 233 r = REG_NOMATCH; 234 } 235 if (r == 0) { 236 c = 1; 237 if (oflag && pmatch.rm_so != pmatch.rm_eo) 238 goto print; 239 break; 240 } 241 } 242 if (oflag) 243 return c; 244 print: 245 if (vflag) 246 c = !c; 247 248 /* Count the matches if we have a match limit */ 249 if (mflag) 250 mcount -= c; 251 252 if (c && binbehave == BIN_FILE_BIN && nottext) 253 return c; /* Binary file */ 254 255 if ((tail > 0 || c) && !cflag && !qflag) { 256 if (c) { 257 if (first > 0 && tail == 0 && (Bflag < linesqueued) && 258 (Aflag || Bflag)) 259 printf("--\n"); 260 first = 1; 261 tail = Aflag; 262 if (Bflag > 0) 263 printqueue(); 264 linesqueued = 0; 265 printline(l, ':', oflag ? &pmatch : NULL); 266 } else { 267 printline(l, '-', oflag ? &pmatch : NULL); 268 tail--; 269 } 270 } 271 if (oflag && !matchall) { 272 offset = pmatch.rm_eo; 273 goto redo; 274 } 275 return c; 276 } 277 278 #ifndef SMALL 279 void 280 fgrepcomp(fastgrep_t *fg, const unsigned char *pattern) 281 { 282 int i; 283 284 /* Initialize. */ 285 fg->patternLen = strlen(pattern); 286 fg->bol = 0; 287 fg->eol = 0; 288 fg->wmatch = wflag; 289 fg->reversedSearch = 0; 290 291 /* 292 * Make a copy and upper case it for later if in -i mode, 293 * else just copy the pointer. 294 */ 295 if (iflag) { 296 fg->pattern = grep_malloc(fg->patternLen + 1); 297 for (i = 0; i < fg->patternLen; i++) 298 fg->pattern[i] = toupper(pattern[i]); 299 fg->pattern[fg->patternLen] = '\0'; 300 } else 301 fg->pattern = (unsigned char *)pattern; /* really const */ 302 303 /* Preprocess pattern. */ 304 for (i = 0; i <= UCHAR_MAX; i++) 305 fg->qsBc[i] = fg->patternLen; 306 for (i = 1; i < fg->patternLen; i++) { 307 fg->qsBc[fg->pattern[i]] = fg->patternLen - i; 308 /* 309 * If case is ignored, make the jump apply to both upper and 310 * lower cased characters. As the pattern is stored in upper 311 * case, apply the same to the lower case equivalents. 312 */ 313 if (iflag) 314 fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i; 315 } 316 } 317 #endif 318 319 /* 320 * Returns: -1 on failure, 0 on success 321 */ 322 int 323 fastcomp(fastgrep_t *fg, const char *pattern) 324 { 325 #ifdef SMALL 326 return -1; 327 #else 328 int i; 329 int bol = 0; 330 int eol = 0; 331 int shiftPatternLen; 332 int hasDot = 0; 333 int firstHalfDot = -1; 334 int firstLastHalfDot = -1; 335 int lastHalfDot = 0; 336 337 /* Initialize. */ 338 fg->patternLen = strlen(pattern); 339 fg->bol = 0; 340 fg->eol = 0; 341 fg->wmatch = 0; 342 fg->reversedSearch = 0; 343 344 /* Remove end-of-line character ('$'). */ 345 if (fg->patternLen > 0 && pattern[fg->patternLen - 1] == '$') { 346 eol++; 347 fg->eol = 1; 348 fg->patternLen--; 349 } 350 351 /* Remove beginning-of-line character ('^'). */ 352 if (pattern[0] == '^') { 353 bol++; 354 fg->bol = 1; 355 fg->patternLen--; 356 } 357 358 /* Remove enclosing [[:<:]] and [[:>:]] (word match). */ 359 if (wflag) { 360 /* basic re's use \( \), extended re's ( ) */ 361 int extra = Eflag ? 1 : 2; 362 fg->patternLen -= 14 + 2 * extra; 363 fg->wmatch = 7 + extra; 364 } else if (fg->patternLen >= 14 && 365 strncmp(pattern + fg->bol, "[[:<:]]", 7) == 0 && 366 strncmp(pattern + fg->bol + fg->patternLen - 7, "[[:>:]]", 7) == 0) { 367 fg->patternLen -= 14; 368 fg->wmatch = 7; 369 } 370 371 /* 372 * Copy pattern minus '^' and '$' characters as well as word 373 * match character classes at the beginning and ending of the 374 * string respectively. 375 */ 376 fg->pattern = grep_malloc(fg->patternLen + 1); 377 memcpy(fg->pattern, pattern + bol + fg->wmatch, fg->patternLen); 378 fg->pattern[fg->patternLen] = '\0'; 379 380 /* Look for ways to cheat...er...avoid the full regex engine. */ 381 for (i = 0; i < fg->patternLen; i++) 382 { 383 switch (fg->pattern[i]) { 384 case '.': 385 hasDot = i; 386 if (i < fg->patternLen / 2) { 387 if (firstHalfDot < 0) 388 /* Closest dot to the beginning */ 389 firstHalfDot = i; 390 } else { 391 /* Closest dot to the end of the pattern. */ 392 lastHalfDot = i; 393 if (firstLastHalfDot < 0) 394 firstLastHalfDot = i; 395 } 396 break; 397 case '(': case ')': 398 case '{': case '}': 399 /* Special in BRE if preceded by '\\' */ 400 case '?': 401 case '+': 402 case '|': 403 /* Not special in BRE. */ 404 if (!Eflag) 405 goto nonspecial; 406 case '\\': 407 case '*': 408 case '[': case ']': 409 /* Free memory and let others know this is empty. */ 410 free(fg->pattern); 411 fg->pattern = NULL; 412 return (-1); 413 default: 414 nonspecial: 415 if (iflag) 416 fg->pattern[i] = toupper(fg->pattern[i]); 417 break; 418 } 419 } 420 421 /* 422 * Determine if a reverse search would be faster based on the placement 423 * of the dots. 424 */ 425 if ((!(lflag || cflag || oflag)) && ((!(bol || eol)) && 426 ((lastHalfDot) && ((firstHalfDot < 0) || 427 ((fg->patternLen - (lastHalfDot + 1)) < firstHalfDot))))) { 428 fg->reversedSearch = 1; 429 hasDot = fg->patternLen - (firstHalfDot < 0 ? 430 firstLastHalfDot : firstHalfDot) - 1; 431 grep_revstr(fg->pattern, fg->patternLen); 432 } 433 434 /* 435 * Normal Quick Search would require a shift based on the position the 436 * next character after the comparison is within the pattern. With 437 * wildcards, the position of the last dot effects the maximum shift 438 * distance. 439 * The closer to the end the wild card is the slower the search. A 440 * reverse version of this algorithm would be useful for wildcards near 441 * the end of the string. 442 * 443 * Examples: 444 * Pattern Max shift 445 * ------- --------- 446 * this 5 447 * .his 4 448 * t.is 3 449 * th.s 2 450 * thi. 1 451 */ 452 453 /* Adjust the shift based on location of the last dot ('.'). */ 454 shiftPatternLen = fg->patternLen - hasDot; 455 456 /* Preprocess pattern. */ 457 for (i = 0; i <= UCHAR_MAX; i++) 458 fg->qsBc[i] = shiftPatternLen; 459 for (i = hasDot + 1; i < fg->patternLen; i++) { 460 fg->qsBc[fg->pattern[i]] = fg->patternLen - i; 461 /* 462 * If case is ignored, make the jump apply to both upper and 463 * lower cased characters. As the pattern is stored in upper 464 * case, apply the same to the lower case equivalents. 465 */ 466 if (iflag) 467 fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i; 468 } 469 470 /* 471 * Put pattern back to normal after pre-processing to allow for easy 472 * comparisons later. 473 */ 474 if (fg->reversedSearch) 475 grep_revstr(fg->pattern, fg->patternLen); 476 477 return (0); 478 #endif 479 } 480 481 /* 482 * Word boundaries using regular expressions are defined as the point 483 * of transition from a non-word char to a word char, or vice versa. 484 * This means that grep -w +a and grep -w a+ never match anything, 485 * because they lack a starting or ending transition, but grep -w a+b 486 * does match a line containing a+b. 487 */ 488 #define wmatch(d, l, s, e) \ 489 ((s == 0 || !isword(d[s-1])) && (e == l || !isword(d[e])) && \ 490 e > s && isword(d[s]) && isword(d[e-1])) 491 492 static int 493 grep_search(fastgrep_t *fg, char *data, size_t dataLen, regmatch_t *pmatch, 494 int flags) 495 { 496 #ifdef SMALL 497 return 0; 498 #else 499 regoff_t j; 500 int rtrnVal = REG_NOMATCH; 501 502 pmatch->rm_so = -1; 503 pmatch->rm_eo = -1; 504 505 /* No point in going farther if we do not have enough data. */ 506 if (dataLen < fg->patternLen) 507 return (rtrnVal); 508 509 /* Only try once at the beginning or ending of the line. */ 510 if (fg->bol || fg->eol) { 511 if (fg->bol && (flags & REG_NOTBOL)) 512 return 0; 513 /* Simple text comparison. */ 514 /* Verify data is >= pattern length before searching on it. */ 515 if (dataLen >= fg->patternLen) { 516 /* Determine where in data to start search at. */ 517 if (fg->eol) 518 j = dataLen - fg->patternLen; 519 else 520 j = 0; 521 if (!((fg->bol && fg->eol) && (dataLen != fg->patternLen))) 522 if (grep_cmp(fg->pattern, data + j, 523 fg->patternLen)) { 524 pmatch->rm_so = j; 525 pmatch->rm_eo = j + fg->patternLen; 526 if (!fg->wmatch || wmatch(data, dataLen, 527 pmatch->rm_so, pmatch->rm_eo)) 528 rtrnVal = 0; 529 } 530 } 531 } else if (fg->reversedSearch) { 532 /* Quick Search algorithm. */ 533 j = dataLen; 534 do { 535 if (grep_cmp(fg->pattern, data + j - fg->patternLen, 536 fg->patternLen)) { 537 pmatch->rm_so = j - fg->patternLen; 538 pmatch->rm_eo = j; 539 if (!fg->wmatch || wmatch(data, dataLen, 540 pmatch->rm_so, pmatch->rm_eo)) { 541 rtrnVal = 0; 542 break; 543 } 544 } 545 /* Shift if within bounds, otherwise, we are done. */ 546 if (j == fg->patternLen) 547 break; 548 j -= fg->qsBc[(unsigned char)data[j - fg->patternLen - 1]]; 549 } while (j >= fg->patternLen); 550 } else { 551 /* Quick Search algorithm. */ 552 j = 0; 553 do { 554 if (grep_cmp(fg->pattern, data + j, fg->patternLen)) { 555 pmatch->rm_so = j; 556 pmatch->rm_eo = j + fg->patternLen; 557 if (fg->patternLen == 0 || !fg->wmatch || 558 wmatch(data, dataLen, pmatch->rm_so, 559 pmatch->rm_eo)) { 560 rtrnVal = 0; 561 break; 562 } 563 } 564 565 /* Shift if within bounds, otherwise, we are done. */ 566 if (j + fg->patternLen == dataLen) 567 break; 568 else 569 j += fg->qsBc[(unsigned char)data[j + fg->patternLen]]; 570 } while (j <= (dataLen - fg->patternLen)); 571 } 572 573 return (rtrnVal); 574 #endif 575 } 576 577 578 void * 579 grep_malloc(size_t size) 580 { 581 void *ptr; 582 583 if ((ptr = malloc(size)) == NULL) 584 err(2, "malloc"); 585 return ptr; 586 } 587 588 void * 589 grep_calloc(size_t nmemb, size_t size) 590 { 591 void *ptr; 592 593 if ((ptr = calloc(nmemb, size)) == NULL) 594 err(2, "calloc"); 595 return ptr; 596 } 597 598 void * 599 grep_realloc(void *ptr, size_t size) 600 { 601 if ((ptr = realloc(ptr, size)) == NULL) 602 err(2, "realloc"); 603 return ptr; 604 } 605 606 void * 607 grep_reallocarray(void *ptr, size_t nmemb, size_t size) 608 { 609 if ((ptr = reallocarray(ptr, nmemb, size)) == NULL) 610 err(2, "reallocarray"); 611 return ptr; 612 } 613 614 #ifndef SMALL 615 /* 616 * Returns: true on success, false on failure 617 */ 618 static bool 619 grep_cmp(const char *pattern, const char *data, size_t len) 620 { 621 size_t i; 622 623 for (i = 0; i < len; i++) { 624 if (((pattern[i] == data[i]) || (!Fflag && pattern[i] == '.')) 625 || (iflag && pattern[i] == toupper((unsigned char)data[i]))) 626 continue; 627 return false; 628 } 629 630 return true; 631 } 632 633 static void 634 grep_revstr(unsigned char *str, int len) 635 { 636 int i; 637 char c; 638 639 for (i = 0; i < len / 2; i++) { 640 c = str[i]; 641 str[i] = str[len - i - 1]; 642 str[len - i - 1] = c; 643 } 644 } 645 #endif 646 647 void 648 printline(str_t *line, int sep, regmatch_t *pmatch) 649 { 650 int n; 651 652 n = 0; 653 if (!hflag) { 654 fputs(line->file, stdout); 655 ++n; 656 } 657 if (nflag) { 658 if (n) 659 putchar(sep); 660 printf("%lld", line->line_no); 661 ++n; 662 } 663 if (bflag) { 664 if (n) 665 putchar(sep); 666 printf("%lld", (long long)line->off + 667 (pmatch ? pmatch->rm_so : 0)); 668 ++n; 669 } 670 if (n) 671 putchar(sep); 672 if (pmatch) 673 fwrite(line->dat + pmatch->rm_so, 674 pmatch->rm_eo - pmatch->rm_so, 1, stdout); 675 else 676 fwrite(line->dat, line->len, 1, stdout); 677 putchar('\n'); 678 } 679