1 /* $OpenBSD: util.c,v 1.58 2017/12/09 18:38:37 pirofti Exp $ */ 2 3 /*- 4 * Copyright (c) 1999 James Howard and Dag-Erling Co�dan Sm�rgrav 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 #include <sys/types.h> 30 #include <sys/stat.h> 31 32 #include <ctype.h> 33 #include <err.h> 34 #include <errno.h> 35 #include <fts.h> 36 #include <regex.h> 37 #include <stdbool.h> 38 #include <stdio.h> 39 #include <stdlib.h> 40 #include <string.h> 41 #include <unistd.h> 42 #include <zlib.h> 43 44 #include "grep.h" 45 46 /* 47 * Process a file line by line... 48 */ 49 50 static int linesqueued; 51 static int procline(str_t *l, int); 52 static int grep_search(fastgrep_t *, char *, size_t, regmatch_t *pmatch, int); 53 #ifndef SMALL 54 static bool grep_cmp(const char *, const char *, size_t); 55 static void grep_revstr(unsigned char *, int); 56 #endif 57 58 int 59 grep_tree(char **argv) 60 { 61 FTS *fts; 62 FTSENT *p; 63 int c, fts_flags; 64 65 c = 0; 66 67 fts_flags = FTS_PHYSICAL | FTS_NOSTAT | FTS_NOCHDIR; 68 69 if (!(fts = fts_open(argv, fts_flags, NULL))) 70 err(2, NULL); 71 while ((p = fts_read(fts)) != NULL) { 72 switch (p->fts_info) { 73 case FTS_DNR: 74 break; 75 case FTS_ERR: 76 file_err = 1; 77 if(!sflag) 78 warnc(p->fts_errno, "%s", p->fts_path); 79 break; 80 case FTS_DP: 81 break; 82 default: 83 c += procfile(p->fts_path); 84 break; 85 } 86 } 87 if (errno) 88 err(2, "fts_read"); 89 fts_close(fts); 90 return c; 91 } 92 93 int 94 procfile(char *fn) 95 { 96 str_t ln; 97 file_t *f; 98 int c, t, z, nottext; 99 100 mcount = mlimit; 101 102 if (fn == NULL) { 103 fn = "(standard input)"; 104 f = grep_fdopen(STDIN_FILENO, "r"); 105 } else { 106 f = grep_open(fn, "r"); 107 } 108 if (f == NULL) { 109 file_err = 1; 110 if (!sflag) 111 warn("%s", fn); 112 return 0; 113 } 114 115 nottext = grep_bin_file(f); 116 if (nottext && binbehave == BIN_FILE_SKIP) { 117 grep_close(f); 118 return 0; 119 } 120 121 ln.file = fn; 122 ln.line_no = 0; 123 ln.len = 0; 124 linesqueued = 0; 125 tail = 0; 126 ln.off = -1; 127 128 if (Bflag > 0) 129 initqueue(); 130 for (c = 0; c == 0 || !(lflag || qflag); ) { 131 if (mflag && mlimit == 0) 132 break; 133 ln.off += ln.len + 1; 134 if ((ln.dat = grep_fgetln(f, &ln.len)) == NULL) 135 break; 136 if (ln.len > 0 && ln.dat[ln.len - 1] == '\n') 137 --ln.len; 138 ln.line_no++; 139 140 z = tail; 141 142 if ((t = procline(&ln, nottext)) == 0 && Bflag > 0 && z == 0) { 143 enqueue(&ln); 144 linesqueued++; 145 } 146 c += t; 147 if (mflag && mcount <= 0) 148 break; 149 } 150 if (Bflag > 0) 151 clearqueue(); 152 grep_close(f); 153 154 if (cflag) { 155 if (!hflag) 156 printf("%s:", ln.file); 157 printf("%u\n", c); 158 } 159 if (lflag && c != 0) 160 printf("%s\n", fn); 161 if (Lflag && c == 0) 162 printf("%s\n", fn); 163 if (c && !cflag && !lflag && !Lflag && 164 binbehave == BIN_FILE_BIN && nottext && !qflag) 165 printf("Binary file %s matches\n", fn); 166 167 return c; 168 } 169 170 171 /* 172 * Process an individual line in a file. Return non-zero if it matches. 173 */ 174 175 #define isword(x) (isalnum((unsigned char)x) || (x) == '_') 176 177 static int 178 procline(str_t *l, int nottext) 179 { 180 regmatch_t pmatch = { 0 }; 181 int c, i, r; 182 regoff_t offset; 183 184 /* size_t will be converted to regoff_t. ssize_t is guaranteed to fit 185 * into regoff_t */ 186 if (l->len > SSIZE_MAX) { 187 errx(2, "Line is too big to process"); 188 } 189 190 c = 0; 191 i = 0; 192 if (matchall) { 193 c = 1; 194 goto print; 195 } 196 197 for (i = 0; i < patterns; i++) { 198 offset = 0; 199 redo: 200 if (fg_pattern[i].pattern) { 201 int flags = 0; 202 if (offset) 203 flags |= REG_NOTBOL; 204 r = grep_search(&fg_pattern[i], l->dat + offset, 205 l->len - offset, &pmatch, flags); 206 pmatch.rm_so += offset; 207 pmatch.rm_eo += offset; 208 } else { 209 int flags = eflags; 210 if (offset) 211 flags |= REG_NOTBOL; 212 pmatch.rm_so = offset; 213 pmatch.rm_eo = l->len; 214 r = regexec(&r_pattern[i], l->dat, 1, &pmatch, flags); 215 } 216 if (r == 0 && xflag) { 217 if (pmatch.rm_so != 0 || pmatch.rm_eo != l->len) 218 r = REG_NOMATCH; 219 } 220 if (r == 0) { 221 c = 1; 222 if (oflag && pmatch.rm_so != pmatch.rm_eo) 223 goto print; 224 break; 225 } 226 } 227 if (oflag) 228 return c; 229 print: 230 if (vflag) 231 c = !c; 232 233 /* Count the matches if we have a match limit */ 234 if (mflag) 235 mcount -= c; 236 237 if (c && binbehave == BIN_FILE_BIN && nottext) 238 return c; /* Binary file */ 239 240 if ((tail > 0 || c) && !cflag && !qflag) { 241 if (c) { 242 if (first > 0 && tail == 0 && (Bflag < linesqueued) && 243 (Aflag || Bflag)) 244 printf("--\n"); 245 first = 1; 246 tail = Aflag; 247 if (Bflag > 0) 248 printqueue(); 249 linesqueued = 0; 250 printline(l, ':', oflag ? &pmatch : NULL); 251 } else { 252 printline(l, '-', oflag ? &pmatch : NULL); 253 tail--; 254 } 255 } 256 if (oflag && !matchall) { 257 offset = pmatch.rm_eo; 258 goto redo; 259 } 260 return c; 261 } 262 263 #ifndef SMALL 264 void 265 fgrepcomp(fastgrep_t *fg, const unsigned char *pattern) 266 { 267 int i; 268 269 /* Initialize. */ 270 fg->patternLen = strlen(pattern); 271 fg->bol = 0; 272 fg->eol = 0; 273 fg->wmatch = wflag; 274 fg->reversedSearch = 0; 275 276 /* 277 * Make a copy and upper case it for later if in -i mode, 278 * else just copy the pointer. 279 */ 280 if (iflag) { 281 fg->pattern = grep_malloc(fg->patternLen + 1); 282 for (i = 0; i < fg->patternLen; i++) 283 fg->pattern[i] = toupper(pattern[i]); 284 fg->pattern[fg->patternLen] = '\0'; 285 } else 286 fg->pattern = (unsigned char *)pattern; /* really const */ 287 288 /* Preprocess pattern. */ 289 for (i = 0; i <= UCHAR_MAX; i++) 290 fg->qsBc[i] = fg->patternLen; 291 for (i = 1; i < fg->patternLen; i++) { 292 fg->qsBc[fg->pattern[i]] = fg->patternLen - i; 293 /* 294 * If case is ignored, make the jump apply to both upper and 295 * lower cased characters. As the pattern is stored in upper 296 * case, apply the same to the lower case equivalents. 297 */ 298 if (iflag) 299 fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i; 300 } 301 } 302 #endif 303 304 /* 305 * Returns: -1 on failure, 0 on success 306 */ 307 int 308 fastcomp(fastgrep_t *fg, const char *pattern) 309 { 310 #ifdef SMALL 311 return -1; 312 #else 313 int i; 314 int bol = 0; 315 int eol = 0; 316 int shiftPatternLen; 317 int hasDot = 0; 318 int firstHalfDot = -1; 319 int firstLastHalfDot = -1; 320 int lastHalfDot = 0; 321 322 /* Initialize. */ 323 fg->patternLen = strlen(pattern); 324 fg->bol = 0; 325 fg->eol = 0; 326 fg->wmatch = 0; 327 fg->reversedSearch = 0; 328 329 /* Remove end-of-line character ('$'). */ 330 if (fg->patternLen > 0 && pattern[fg->patternLen - 1] == '$') { 331 eol++; 332 fg->eol = 1; 333 fg->patternLen--; 334 } 335 336 /* Remove beginning-of-line character ('^'). */ 337 if (pattern[0] == '^') { 338 bol++; 339 fg->bol = 1; 340 fg->patternLen--; 341 } 342 343 /* Remove enclosing [[:<:]] and [[:>:]] (word match). */ 344 if (wflag) { 345 /* basic re's use \( \), extended re's ( ) */ 346 int extra = Eflag ? 1 : 2; 347 fg->patternLen -= 14 + 2 * extra; 348 fg->wmatch = 7 + extra; 349 } else if (fg->patternLen >= 14 && 350 strncmp(pattern + fg->bol, "[[:<:]]", 7) == 0 && 351 strncmp(pattern + fg->bol + fg->patternLen - 7, "[[:>:]]", 7) == 0) { 352 fg->patternLen -= 14; 353 fg->wmatch = 7; 354 } 355 356 /* 357 * Copy pattern minus '^' and '$' characters as well as word 358 * match character classes at the beginning and ending of the 359 * string respectively. 360 */ 361 fg->pattern = grep_malloc(fg->patternLen + 1); 362 memcpy(fg->pattern, pattern + bol + fg->wmatch, fg->patternLen); 363 fg->pattern[fg->patternLen] = '\0'; 364 365 /* Look for ways to cheat...er...avoid the full regex engine. */ 366 for (i = 0; i < fg->patternLen; i++) 367 { 368 switch (fg->pattern[i]) { 369 case '.': 370 hasDot = i; 371 if (i < fg->patternLen / 2) { 372 if (firstHalfDot < 0) 373 /* Closest dot to the beginning */ 374 firstHalfDot = i; 375 } else { 376 /* Closest dot to the end of the pattern. */ 377 lastHalfDot = i; 378 if (firstLastHalfDot < 0) 379 firstLastHalfDot = i; 380 } 381 break; 382 case '(': case ')': 383 case '{': case '}': 384 /* Special in BRE if preceded by '\\' */ 385 case '?': 386 case '+': 387 case '|': 388 /* Not special in BRE. */ 389 if (!Eflag) 390 goto nonspecial; 391 case '\\': 392 case '*': 393 case '[': case ']': 394 /* Free memory and let others know this is empty. */ 395 free(fg->pattern); 396 fg->pattern = NULL; 397 return (-1); 398 default: 399 nonspecial: 400 if (iflag) 401 fg->pattern[i] = toupper(fg->pattern[i]); 402 break; 403 } 404 } 405 406 /* 407 * Determine if a reverse search would be faster based on the placement 408 * of the dots. 409 */ 410 if ((!(lflag || cflag || oflag)) && ((!(bol || eol)) && 411 ((lastHalfDot) && ((firstHalfDot < 0) || 412 ((fg->patternLen - (lastHalfDot + 1)) < firstHalfDot))))) { 413 fg->reversedSearch = 1; 414 hasDot = fg->patternLen - (firstHalfDot < 0 ? 415 firstLastHalfDot : firstHalfDot) - 1; 416 grep_revstr(fg->pattern, fg->patternLen); 417 } 418 419 /* 420 * Normal Quick Search would require a shift based on the position the 421 * next character after the comparison is within the pattern. With 422 * wildcards, the position of the last dot effects the maximum shift 423 * distance. 424 * The closer to the end the wild card is the slower the search. A 425 * reverse version of this algorithm would be useful for wildcards near 426 * the end of the string. 427 * 428 * Examples: 429 * Pattern Max shift 430 * ------- --------- 431 * this 5 432 * .his 4 433 * t.is 3 434 * th.s 2 435 * thi. 1 436 */ 437 438 /* Adjust the shift based on location of the last dot ('.'). */ 439 shiftPatternLen = fg->patternLen - hasDot; 440 441 /* Preprocess pattern. */ 442 for (i = 0; i <= UCHAR_MAX; i++) 443 fg->qsBc[i] = shiftPatternLen; 444 for (i = hasDot + 1; i < fg->patternLen; i++) { 445 fg->qsBc[fg->pattern[i]] = fg->patternLen - i; 446 /* 447 * If case is ignored, make the jump apply to both upper and 448 * lower cased characters. As the pattern is stored in upper 449 * case, apply the same to the lower case equivalents. 450 */ 451 if (iflag) 452 fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i; 453 } 454 455 /* 456 * Put pattern back to normal after pre-processing to allow for easy 457 * comparisons later. 458 */ 459 if (fg->reversedSearch) 460 grep_revstr(fg->pattern, fg->patternLen); 461 462 return (0); 463 #endif 464 } 465 466 /* 467 * Word boundaries using regular expressions are defined as the point 468 * of transition from a non-word char to a word char, or vice versa. 469 * This means that grep -w +a and grep -w a+ never match anything, 470 * because they lack a starting or ending transition, but grep -w a+b 471 * does match a line containing a+b. 472 */ 473 #define wmatch(d, l, s, e) \ 474 ((s == 0 || !isword(d[s-1])) && (e == l || !isword(d[e])) && \ 475 e > s && isword(d[s]) && isword(d[e-1])) 476 477 static int 478 grep_search(fastgrep_t *fg, char *data, size_t dataLen, regmatch_t *pmatch, 479 int flags) 480 { 481 #ifdef SMALL 482 return 0; 483 #else 484 regoff_t j; 485 int rtrnVal = REG_NOMATCH; 486 487 pmatch->rm_so = -1; 488 pmatch->rm_eo = -1; 489 490 /* No point in going farther if we do not have enough data. */ 491 if (dataLen < fg->patternLen) 492 return (rtrnVal); 493 494 /* Only try once at the beginning or ending of the line. */ 495 if (fg->bol || fg->eol) { 496 if (fg->bol && (flags & REG_NOTBOL)) 497 return 0; 498 /* Simple text comparison. */ 499 /* Verify data is >= pattern length before searching on it. */ 500 if (dataLen >= fg->patternLen) { 501 /* Determine where in data to start search at. */ 502 if (fg->eol) 503 j = dataLen - fg->patternLen; 504 else 505 j = 0; 506 if (!((fg->bol && fg->eol) && (dataLen != fg->patternLen))) 507 if (grep_cmp(fg->pattern, data + j, 508 fg->patternLen)) { 509 pmatch->rm_so = j; 510 pmatch->rm_eo = j + fg->patternLen; 511 if (!fg->wmatch || wmatch(data, dataLen, 512 pmatch->rm_so, pmatch->rm_eo)) 513 rtrnVal = 0; 514 } 515 } 516 } else if (fg->reversedSearch) { 517 /* Quick Search algorithm. */ 518 j = dataLen; 519 do { 520 if (grep_cmp(fg->pattern, data + j - fg->patternLen, 521 fg->patternLen)) { 522 pmatch->rm_so = j - fg->patternLen; 523 pmatch->rm_eo = j; 524 if (!fg->wmatch || wmatch(data, dataLen, 525 pmatch->rm_so, pmatch->rm_eo)) { 526 rtrnVal = 0; 527 break; 528 } 529 } 530 /* Shift if within bounds, otherwise, we are done. */ 531 if (j == fg->patternLen) 532 break; 533 j -= fg->qsBc[(unsigned char)data[j - fg->patternLen - 1]]; 534 } while (j >= fg->patternLen); 535 } else { 536 /* Quick Search algorithm. */ 537 j = 0; 538 do { 539 if (grep_cmp(fg->pattern, data + j, fg->patternLen)) { 540 pmatch->rm_so = j; 541 pmatch->rm_eo = j + fg->patternLen; 542 if (fg->patternLen == 0 || !fg->wmatch || 543 wmatch(data, dataLen, pmatch->rm_so, 544 pmatch->rm_eo)) { 545 rtrnVal = 0; 546 break; 547 } 548 } 549 550 /* Shift if within bounds, otherwise, we are done. */ 551 if (j + fg->patternLen == dataLen) 552 break; 553 else 554 j += fg->qsBc[(unsigned char)data[j + fg->patternLen]]; 555 } while (j <= (dataLen - fg->patternLen)); 556 } 557 558 return (rtrnVal); 559 #endif 560 } 561 562 563 void * 564 grep_malloc(size_t size) 565 { 566 void *ptr; 567 568 if ((ptr = malloc(size)) == NULL) 569 err(2, "malloc"); 570 return ptr; 571 } 572 573 void * 574 grep_calloc(size_t nmemb, size_t size) 575 { 576 void *ptr; 577 578 if ((ptr = calloc(nmemb, size)) == NULL) 579 err(2, "calloc"); 580 return ptr; 581 } 582 583 void * 584 grep_realloc(void *ptr, size_t size) 585 { 586 if ((ptr = realloc(ptr, size)) == NULL) 587 err(2, "realloc"); 588 return ptr; 589 } 590 591 void * 592 grep_reallocarray(void *ptr, size_t nmemb, size_t size) 593 { 594 if ((ptr = reallocarray(ptr, nmemb, size)) == NULL) 595 err(2, "reallocarray"); 596 return ptr; 597 } 598 599 #ifndef SMALL 600 /* 601 * Returns: true on success, false on failure 602 */ 603 static bool 604 grep_cmp(const char *pattern, const char *data, size_t len) 605 { 606 size_t i; 607 608 for (i = 0; i < len; i++) { 609 if (((pattern[i] == data[i]) || (!Fflag && pattern[i] == '.')) 610 || (iflag && pattern[i] == toupper((unsigned char)data[i]))) 611 continue; 612 return false; 613 } 614 615 return true; 616 } 617 618 static void 619 grep_revstr(unsigned char *str, int len) 620 { 621 int i; 622 char c; 623 624 for (i = 0; i < len / 2; i++) { 625 c = str[i]; 626 str[i] = str[len - i - 1]; 627 str[len - i - 1] = c; 628 } 629 } 630 #endif 631 632 void 633 printline(str_t *line, int sep, regmatch_t *pmatch) 634 { 635 int n; 636 637 n = 0; 638 if (!hflag) { 639 fputs(line->file, stdout); 640 ++n; 641 } 642 if (nflag) { 643 if (n) 644 putchar(sep); 645 printf("%lld", line->line_no); 646 ++n; 647 } 648 if (bflag) { 649 if (n) 650 putchar(sep); 651 printf("%lld", (long long)line->off); 652 ++n; 653 } 654 if (n) 655 putchar(sep); 656 if (pmatch) 657 fwrite(line->dat + pmatch->rm_so, 658 pmatch->rm_eo - pmatch->rm_so, 1, stdout); 659 else 660 fwrite(line->dat, line->len, 1, stdout); 661 putchar('\n'); 662 } 663