1 /* $OpenBSD: util.c,v 1.59 2019/01/23 23:00:54 tedu Exp $ */ 2 3 /*- 4 * Copyright (c) 1999 James Howard and Dag-Erling Co�dan Sm�rgrav 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 #include <sys/types.h> 30 #include <sys/stat.h> 31 32 #include <ctype.h> 33 #include <err.h> 34 #include <errno.h> 35 #include <fts.h> 36 #include <regex.h> 37 #include <stdbool.h> 38 #include <stdio.h> 39 #include <stdlib.h> 40 #include <string.h> 41 #include <unistd.h> 42 #include <zlib.h> 43 44 #include "grep.h" 45 46 /* 47 * Process a file line by line... 48 */ 49 50 static int linesqueued; 51 static int procline(str_t *l, int); 52 static int grep_search(fastgrep_t *, char *, size_t, regmatch_t *pmatch, int); 53 #ifndef SMALL 54 static bool grep_cmp(const char *, const char *, size_t); 55 static void grep_revstr(unsigned char *, int); 56 #endif 57 58 int 59 grep_tree(char **argv) 60 { 61 FTS *fts; 62 FTSENT *p; 63 int c, fts_flags; 64 65 c = 0; 66 67 fts_flags = FTS_PHYSICAL | FTS_NOSTAT | FTS_NOCHDIR; 68 69 if (!(fts = fts_open(argv, fts_flags, NULL))) 70 err(2, NULL); 71 while ((p = fts_read(fts)) != NULL) { 72 switch (p->fts_info) { 73 case FTS_DNR: 74 break; 75 case FTS_ERR: 76 file_err = 1; 77 if(!sflag) 78 warnc(p->fts_errno, "%s", p->fts_path); 79 break; 80 case FTS_D: 81 case FTS_DP: 82 break; 83 default: 84 c += procfile(p->fts_path); 85 break; 86 } 87 } 88 if (errno) 89 err(2, "fts_read"); 90 fts_close(fts); 91 return c; 92 } 93 94 int 95 procfile(char *fn) 96 { 97 str_t ln; 98 file_t *f; 99 int c, t, z, nottext; 100 101 mcount = mlimit; 102 103 if (fn == NULL) { 104 fn = "(standard input)"; 105 f = grep_fdopen(STDIN_FILENO); 106 } else { 107 f = grep_open(fn); 108 } 109 if (f == NULL) { 110 if (errno == EISDIR) 111 return 0; 112 file_err = 1; 113 if (!sflag) 114 warn("%s", fn); 115 return 0; 116 } 117 118 nottext = grep_bin_file(f); 119 if (nottext && binbehave == BIN_FILE_SKIP) { 120 grep_close(f); 121 return 0; 122 } 123 124 ln.file = fn; 125 ln.line_no = 0; 126 ln.len = 0; 127 linesqueued = 0; 128 tail = 0; 129 ln.off = -1; 130 131 if (Bflag > 0) 132 initqueue(); 133 for (c = 0; c == 0 || !(lflag || qflag); ) { 134 if (mflag && mlimit == 0) 135 break; 136 ln.off += ln.len + 1; 137 if ((ln.dat = grep_fgetln(f, &ln.len)) == NULL) 138 break; 139 if (ln.len > 0 && ln.dat[ln.len - 1] == '\n') 140 --ln.len; 141 ln.line_no++; 142 143 z = tail; 144 145 if ((t = procline(&ln, nottext)) == 0 && Bflag > 0 && z == 0) { 146 enqueue(&ln); 147 linesqueued++; 148 } 149 c += t; 150 if (mflag && mcount <= 0) 151 break; 152 } 153 if (Bflag > 0) 154 clearqueue(); 155 grep_close(f); 156 157 if (cflag) { 158 if (!hflag) 159 printf("%s:", ln.file); 160 printf("%u\n", c); 161 } 162 if (lflag && c != 0) 163 printf("%s\n", fn); 164 if (Lflag && c == 0) 165 printf("%s\n", fn); 166 if (c && !cflag && !lflag && !Lflag && 167 binbehave == BIN_FILE_BIN && nottext && !qflag) 168 printf("Binary file %s matches\n", fn); 169 170 return c; 171 } 172 173 174 /* 175 * Process an individual line in a file. Return non-zero if it matches. 176 */ 177 178 #define isword(x) (isalnum((unsigned char)x) || (x) == '_') 179 180 static int 181 procline(str_t *l, int nottext) 182 { 183 regmatch_t pmatch = { 0 }; 184 int c, i, r; 185 regoff_t offset; 186 187 /* size_t will be converted to regoff_t. ssize_t is guaranteed to fit 188 * into regoff_t */ 189 if (l->len > SSIZE_MAX) { 190 errx(2, "Line is too big to process"); 191 } 192 193 c = 0; 194 i = 0; 195 if (matchall) { 196 c = 1; 197 goto print; 198 } 199 200 for (i = 0; i < patterns; i++) { 201 offset = 0; 202 redo: 203 if (fg_pattern[i].pattern) { 204 int flags = 0; 205 if (offset) 206 flags |= REG_NOTBOL; 207 r = grep_search(&fg_pattern[i], l->dat + offset, 208 l->len - offset, &pmatch, flags); 209 pmatch.rm_so += offset; 210 pmatch.rm_eo += offset; 211 } else { 212 int flags = eflags; 213 if (offset) 214 flags |= REG_NOTBOL; 215 pmatch.rm_so = offset; 216 pmatch.rm_eo = l->len; 217 r = regexec(&r_pattern[i], l->dat, 1, &pmatch, flags); 218 } 219 if (r == 0 && xflag) { 220 if (pmatch.rm_so != 0 || pmatch.rm_eo != l->len) 221 r = REG_NOMATCH; 222 } 223 if (r == 0) { 224 c = 1; 225 if (oflag && pmatch.rm_so != pmatch.rm_eo) 226 goto print; 227 break; 228 } 229 } 230 if (oflag) 231 return c; 232 print: 233 if (vflag) 234 c = !c; 235 236 /* Count the matches if we have a match limit */ 237 if (mflag) 238 mcount -= c; 239 240 if (c && binbehave == BIN_FILE_BIN && nottext) 241 return c; /* Binary file */ 242 243 if ((tail > 0 || c) && !cflag && !qflag) { 244 if (c) { 245 if (first > 0 && tail == 0 && (Bflag < linesqueued) && 246 (Aflag || Bflag)) 247 printf("--\n"); 248 first = 1; 249 tail = Aflag; 250 if (Bflag > 0) 251 printqueue(); 252 linesqueued = 0; 253 printline(l, ':', oflag ? &pmatch : NULL); 254 } else { 255 printline(l, '-', oflag ? &pmatch : NULL); 256 tail--; 257 } 258 } 259 if (oflag && !matchall) { 260 offset = pmatch.rm_eo; 261 goto redo; 262 } 263 return c; 264 } 265 266 #ifndef SMALL 267 void 268 fgrepcomp(fastgrep_t *fg, const unsigned char *pattern) 269 { 270 int i; 271 272 /* Initialize. */ 273 fg->patternLen = strlen(pattern); 274 fg->bol = 0; 275 fg->eol = 0; 276 fg->wmatch = wflag; 277 fg->reversedSearch = 0; 278 279 /* 280 * Make a copy and upper case it for later if in -i mode, 281 * else just copy the pointer. 282 */ 283 if (iflag) { 284 fg->pattern = grep_malloc(fg->patternLen + 1); 285 for (i = 0; i < fg->patternLen; i++) 286 fg->pattern[i] = toupper(pattern[i]); 287 fg->pattern[fg->patternLen] = '\0'; 288 } else 289 fg->pattern = (unsigned char *)pattern; /* really const */ 290 291 /* Preprocess pattern. */ 292 for (i = 0; i <= UCHAR_MAX; i++) 293 fg->qsBc[i] = fg->patternLen; 294 for (i = 1; i < fg->patternLen; i++) { 295 fg->qsBc[fg->pattern[i]] = fg->patternLen - i; 296 /* 297 * If case is ignored, make the jump apply to both upper and 298 * lower cased characters. As the pattern is stored in upper 299 * case, apply the same to the lower case equivalents. 300 */ 301 if (iflag) 302 fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i; 303 } 304 } 305 #endif 306 307 /* 308 * Returns: -1 on failure, 0 on success 309 */ 310 int 311 fastcomp(fastgrep_t *fg, const char *pattern) 312 { 313 #ifdef SMALL 314 return -1; 315 #else 316 int i; 317 int bol = 0; 318 int eol = 0; 319 int shiftPatternLen; 320 int hasDot = 0; 321 int firstHalfDot = -1; 322 int firstLastHalfDot = -1; 323 int lastHalfDot = 0; 324 325 /* Initialize. */ 326 fg->patternLen = strlen(pattern); 327 fg->bol = 0; 328 fg->eol = 0; 329 fg->wmatch = 0; 330 fg->reversedSearch = 0; 331 332 /* Remove end-of-line character ('$'). */ 333 if (fg->patternLen > 0 && pattern[fg->patternLen - 1] == '$') { 334 eol++; 335 fg->eol = 1; 336 fg->patternLen--; 337 } 338 339 /* Remove beginning-of-line character ('^'). */ 340 if (pattern[0] == '^') { 341 bol++; 342 fg->bol = 1; 343 fg->patternLen--; 344 } 345 346 /* Remove enclosing [[:<:]] and [[:>:]] (word match). */ 347 if (wflag) { 348 /* basic re's use \( \), extended re's ( ) */ 349 int extra = Eflag ? 1 : 2; 350 fg->patternLen -= 14 + 2 * extra; 351 fg->wmatch = 7 + extra; 352 } else if (fg->patternLen >= 14 && 353 strncmp(pattern + fg->bol, "[[:<:]]", 7) == 0 && 354 strncmp(pattern + fg->bol + fg->patternLen - 7, "[[:>:]]", 7) == 0) { 355 fg->patternLen -= 14; 356 fg->wmatch = 7; 357 } 358 359 /* 360 * Copy pattern minus '^' and '$' characters as well as word 361 * match character classes at the beginning and ending of the 362 * string respectively. 363 */ 364 fg->pattern = grep_malloc(fg->patternLen + 1); 365 memcpy(fg->pattern, pattern + bol + fg->wmatch, fg->patternLen); 366 fg->pattern[fg->patternLen] = '\0'; 367 368 /* Look for ways to cheat...er...avoid the full regex engine. */ 369 for (i = 0; i < fg->patternLen; i++) 370 { 371 switch (fg->pattern[i]) { 372 case '.': 373 hasDot = i; 374 if (i < fg->patternLen / 2) { 375 if (firstHalfDot < 0) 376 /* Closest dot to the beginning */ 377 firstHalfDot = i; 378 } else { 379 /* Closest dot to the end of the pattern. */ 380 lastHalfDot = i; 381 if (firstLastHalfDot < 0) 382 firstLastHalfDot = i; 383 } 384 break; 385 case '(': case ')': 386 case '{': case '}': 387 /* Special in BRE if preceded by '\\' */ 388 case '?': 389 case '+': 390 case '|': 391 /* Not special in BRE. */ 392 if (!Eflag) 393 goto nonspecial; 394 case '\\': 395 case '*': 396 case '[': case ']': 397 /* Free memory and let others know this is empty. */ 398 free(fg->pattern); 399 fg->pattern = NULL; 400 return (-1); 401 default: 402 nonspecial: 403 if (iflag) 404 fg->pattern[i] = toupper(fg->pattern[i]); 405 break; 406 } 407 } 408 409 /* 410 * Determine if a reverse search would be faster based on the placement 411 * of the dots. 412 */ 413 if ((!(lflag || cflag || oflag)) && ((!(bol || eol)) && 414 ((lastHalfDot) && ((firstHalfDot < 0) || 415 ((fg->patternLen - (lastHalfDot + 1)) < firstHalfDot))))) { 416 fg->reversedSearch = 1; 417 hasDot = fg->patternLen - (firstHalfDot < 0 ? 418 firstLastHalfDot : firstHalfDot) - 1; 419 grep_revstr(fg->pattern, fg->patternLen); 420 } 421 422 /* 423 * Normal Quick Search would require a shift based on the position the 424 * next character after the comparison is within the pattern. With 425 * wildcards, the position of the last dot effects the maximum shift 426 * distance. 427 * The closer to the end the wild card is the slower the search. A 428 * reverse version of this algorithm would be useful for wildcards near 429 * the end of the string. 430 * 431 * Examples: 432 * Pattern Max shift 433 * ------- --------- 434 * this 5 435 * .his 4 436 * t.is 3 437 * th.s 2 438 * thi. 1 439 */ 440 441 /* Adjust the shift based on location of the last dot ('.'). */ 442 shiftPatternLen = fg->patternLen - hasDot; 443 444 /* Preprocess pattern. */ 445 for (i = 0; i <= UCHAR_MAX; i++) 446 fg->qsBc[i] = shiftPatternLen; 447 for (i = hasDot + 1; i < fg->patternLen; i++) { 448 fg->qsBc[fg->pattern[i]] = fg->patternLen - i; 449 /* 450 * If case is ignored, make the jump apply to both upper and 451 * lower cased characters. As the pattern is stored in upper 452 * case, apply the same to the lower case equivalents. 453 */ 454 if (iflag) 455 fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i; 456 } 457 458 /* 459 * Put pattern back to normal after pre-processing to allow for easy 460 * comparisons later. 461 */ 462 if (fg->reversedSearch) 463 grep_revstr(fg->pattern, fg->patternLen); 464 465 return (0); 466 #endif 467 } 468 469 /* 470 * Word boundaries using regular expressions are defined as the point 471 * of transition from a non-word char to a word char, or vice versa. 472 * This means that grep -w +a and grep -w a+ never match anything, 473 * because they lack a starting or ending transition, but grep -w a+b 474 * does match a line containing a+b. 475 */ 476 #define wmatch(d, l, s, e) \ 477 ((s == 0 || !isword(d[s-1])) && (e == l || !isword(d[e])) && \ 478 e > s && isword(d[s]) && isword(d[e-1])) 479 480 static int 481 grep_search(fastgrep_t *fg, char *data, size_t dataLen, regmatch_t *pmatch, 482 int flags) 483 { 484 #ifdef SMALL 485 return 0; 486 #else 487 regoff_t j; 488 int rtrnVal = REG_NOMATCH; 489 490 pmatch->rm_so = -1; 491 pmatch->rm_eo = -1; 492 493 /* No point in going farther if we do not have enough data. */ 494 if (dataLen < fg->patternLen) 495 return (rtrnVal); 496 497 /* Only try once at the beginning or ending of the line. */ 498 if (fg->bol || fg->eol) { 499 if (fg->bol && (flags & REG_NOTBOL)) 500 return 0; 501 /* Simple text comparison. */ 502 /* Verify data is >= pattern length before searching on it. */ 503 if (dataLen >= fg->patternLen) { 504 /* Determine where in data to start search at. */ 505 if (fg->eol) 506 j = dataLen - fg->patternLen; 507 else 508 j = 0; 509 if (!((fg->bol && fg->eol) && (dataLen != fg->patternLen))) 510 if (grep_cmp(fg->pattern, data + j, 511 fg->patternLen)) { 512 pmatch->rm_so = j; 513 pmatch->rm_eo = j + fg->patternLen; 514 if (!fg->wmatch || wmatch(data, dataLen, 515 pmatch->rm_so, pmatch->rm_eo)) 516 rtrnVal = 0; 517 } 518 } 519 } else if (fg->reversedSearch) { 520 /* Quick Search algorithm. */ 521 j = dataLen; 522 do { 523 if (grep_cmp(fg->pattern, data + j - fg->patternLen, 524 fg->patternLen)) { 525 pmatch->rm_so = j - fg->patternLen; 526 pmatch->rm_eo = j; 527 if (!fg->wmatch || wmatch(data, dataLen, 528 pmatch->rm_so, pmatch->rm_eo)) { 529 rtrnVal = 0; 530 break; 531 } 532 } 533 /* Shift if within bounds, otherwise, we are done. */ 534 if (j == fg->patternLen) 535 break; 536 j -= fg->qsBc[(unsigned char)data[j - fg->patternLen - 1]]; 537 } while (j >= fg->patternLen); 538 } else { 539 /* Quick Search algorithm. */ 540 j = 0; 541 do { 542 if (grep_cmp(fg->pattern, data + j, fg->patternLen)) { 543 pmatch->rm_so = j; 544 pmatch->rm_eo = j + fg->patternLen; 545 if (fg->patternLen == 0 || !fg->wmatch || 546 wmatch(data, dataLen, pmatch->rm_so, 547 pmatch->rm_eo)) { 548 rtrnVal = 0; 549 break; 550 } 551 } 552 553 /* Shift if within bounds, otherwise, we are done. */ 554 if (j + fg->patternLen == dataLen) 555 break; 556 else 557 j += fg->qsBc[(unsigned char)data[j + fg->patternLen]]; 558 } while (j <= (dataLen - fg->patternLen)); 559 } 560 561 return (rtrnVal); 562 #endif 563 } 564 565 566 void * 567 grep_malloc(size_t size) 568 { 569 void *ptr; 570 571 if ((ptr = malloc(size)) == NULL) 572 err(2, "malloc"); 573 return ptr; 574 } 575 576 void * 577 grep_calloc(size_t nmemb, size_t size) 578 { 579 void *ptr; 580 581 if ((ptr = calloc(nmemb, size)) == NULL) 582 err(2, "calloc"); 583 return ptr; 584 } 585 586 void * 587 grep_realloc(void *ptr, size_t size) 588 { 589 if ((ptr = realloc(ptr, size)) == NULL) 590 err(2, "realloc"); 591 return ptr; 592 } 593 594 void * 595 grep_reallocarray(void *ptr, size_t nmemb, size_t size) 596 { 597 if ((ptr = reallocarray(ptr, nmemb, size)) == NULL) 598 err(2, "reallocarray"); 599 return ptr; 600 } 601 602 #ifndef SMALL 603 /* 604 * Returns: true on success, false on failure 605 */ 606 static bool 607 grep_cmp(const char *pattern, const char *data, size_t len) 608 { 609 size_t i; 610 611 for (i = 0; i < len; i++) { 612 if (((pattern[i] == data[i]) || (!Fflag && pattern[i] == '.')) 613 || (iflag && pattern[i] == toupper((unsigned char)data[i]))) 614 continue; 615 return false; 616 } 617 618 return true; 619 } 620 621 static void 622 grep_revstr(unsigned char *str, int len) 623 { 624 int i; 625 char c; 626 627 for (i = 0; i < len / 2; i++) { 628 c = str[i]; 629 str[i] = str[len - i - 1]; 630 str[len - i - 1] = c; 631 } 632 } 633 #endif 634 635 void 636 printline(str_t *line, int sep, regmatch_t *pmatch) 637 { 638 int n; 639 640 n = 0; 641 if (!hflag) { 642 fputs(line->file, stdout); 643 ++n; 644 } 645 if (nflag) { 646 if (n) 647 putchar(sep); 648 printf("%lld", line->line_no); 649 ++n; 650 } 651 if (bflag) { 652 if (n) 653 putchar(sep); 654 printf("%lld", (long long)line->off); 655 ++n; 656 } 657 if (n) 658 putchar(sep); 659 if (pmatch) 660 fwrite(line->dat + pmatch->rm_so, 661 pmatch->rm_eo - pmatch->rm_so, 1, stdout); 662 else 663 fwrite(line->dat, line->len, 1, stdout); 664 putchar('\n'); 665 } 666