1 /* $OpenBSD: util.c,v 1.57 2017/04/03 16:18:35 tedu Exp $ */ 2 3 /*- 4 * Copyright (c) 1999 James Howard and Dag-Erling Co�dan Sm�rgrav 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 #include <sys/types.h> 30 #include <sys/stat.h> 31 32 #include <ctype.h> 33 #include <err.h> 34 #include <errno.h> 35 #include <fts.h> 36 #include <regex.h> 37 #include <stdbool.h> 38 #include <stdio.h> 39 #include <stdlib.h> 40 #include <string.h> 41 #include <unistd.h> 42 #include <zlib.h> 43 44 #include "grep.h" 45 46 /* 47 * Process a file line by line... 48 */ 49 50 static int linesqueued; 51 static int procline(str_t *l, int); 52 static int grep_search(fastgrep_t *, char *, size_t, regmatch_t *pmatch, int); 53 #ifndef SMALL 54 static bool grep_cmp(const char *, const char *, size_t); 55 static void grep_revstr(unsigned char *, int); 56 #endif 57 58 int 59 grep_tree(char **argv) 60 { 61 FTS *fts; 62 FTSENT *p; 63 int c, fts_flags; 64 65 c = 0; 66 67 fts_flags = FTS_PHYSICAL | FTS_NOSTAT | FTS_NOCHDIR; 68 69 if (!(fts = fts_open(argv, fts_flags, NULL))) 70 err(2, NULL); 71 while ((p = fts_read(fts)) != NULL) { 72 switch (p->fts_info) { 73 case FTS_DNR: 74 break; 75 case FTS_ERR: 76 file_err = 1; 77 if(!sflag) 78 warnc(p->fts_errno, "%s", p->fts_path); 79 break; 80 case FTS_DP: 81 break; 82 default: 83 c += procfile(p->fts_path); 84 break; 85 } 86 } 87 if (errno) 88 err(2, "fts_read"); 89 fts_close(fts); 90 return c; 91 } 92 93 int 94 procfile(char *fn) 95 { 96 str_t ln; 97 file_t *f; 98 int c, t, z, nottext; 99 100 if (fn == NULL) { 101 fn = "(standard input)"; 102 f = grep_fdopen(STDIN_FILENO, "r"); 103 } else { 104 f = grep_open(fn, "r"); 105 } 106 if (f == NULL) { 107 file_err = 1; 108 if (!sflag) 109 warn("%s", fn); 110 return 0; 111 } 112 113 nottext = grep_bin_file(f); 114 if (nottext && binbehave == BIN_FILE_SKIP) { 115 grep_close(f); 116 return 0; 117 } 118 119 ln.file = fn; 120 ln.line_no = 0; 121 ln.len = 0; 122 linesqueued = 0; 123 tail = 0; 124 ln.off = -1; 125 126 if (Bflag > 0) 127 initqueue(); 128 for (c = 0; c == 0 || !(lflag || qflag); ) { 129 ln.off += ln.len + 1; 130 if ((ln.dat = grep_fgetln(f, &ln.len)) == NULL) 131 break; 132 if (ln.len > 0 && ln.dat[ln.len - 1] == '\n') 133 --ln.len; 134 ln.line_no++; 135 136 z = tail; 137 138 if ((t = procline(&ln, nottext)) == 0 && Bflag > 0 && z == 0) { 139 enqueue(&ln); 140 linesqueued++; 141 } 142 c += t; 143 } 144 if (Bflag > 0) 145 clearqueue(); 146 grep_close(f); 147 148 if (cflag) { 149 if (!hflag) 150 printf("%s:", ln.file); 151 printf("%u\n", c); 152 } 153 if (lflag && c != 0) 154 printf("%s\n", fn); 155 if (Lflag && c == 0) 156 printf("%s\n", fn); 157 if (c && !cflag && !lflag && !Lflag && 158 binbehave == BIN_FILE_BIN && nottext && !qflag) 159 printf("Binary file %s matches\n", fn); 160 161 return c; 162 } 163 164 165 /* 166 * Process an individual line in a file. Return non-zero if it matches. 167 */ 168 169 #define isword(x) (isalnum((unsigned char)x) || (x) == '_') 170 171 static int 172 procline(str_t *l, int nottext) 173 { 174 regmatch_t pmatch = { 0 }; 175 int c, i, r; 176 regoff_t offset; 177 178 /* size_t will be converted to regoff_t. ssize_t is guaranteed to fit 179 * into regoff_t */ 180 if (l->len > SSIZE_MAX) { 181 errx(2, "Line is too big to process"); 182 } 183 184 c = 0; 185 i = 0; 186 if (matchall) { 187 c = 1; 188 goto print; 189 } 190 191 for (i = 0; i < patterns; i++) { 192 offset = 0; 193 redo: 194 if (fg_pattern[i].pattern) { 195 int flags = 0; 196 if (offset) 197 flags |= REG_NOTBOL; 198 r = grep_search(&fg_pattern[i], l->dat + offset, 199 l->len - offset, &pmatch, flags); 200 pmatch.rm_so += offset; 201 pmatch.rm_eo += offset; 202 } else { 203 int flags = eflags; 204 if (offset) 205 flags |= REG_NOTBOL; 206 pmatch.rm_so = offset; 207 pmatch.rm_eo = l->len; 208 r = regexec(&r_pattern[i], l->dat, 1, &pmatch, flags); 209 } 210 if (r == 0 && xflag) { 211 if (pmatch.rm_so != 0 || pmatch.rm_eo != l->len) 212 r = REG_NOMATCH; 213 } 214 if (r == 0) { 215 c = 1; 216 if (oflag && pmatch.rm_so != pmatch.rm_eo) 217 goto print; 218 break; 219 } 220 } 221 if (oflag) 222 return c; 223 print: 224 if (vflag) 225 c = !c; 226 227 if (c && binbehave == BIN_FILE_BIN && nottext) 228 return c; /* Binary file */ 229 230 if ((tail > 0 || c) && !cflag && !qflag) { 231 if (c) { 232 if (first > 0 && tail == 0 && (Bflag < linesqueued) && 233 (Aflag || Bflag)) 234 printf("--\n"); 235 first = 1; 236 tail = Aflag; 237 if (Bflag > 0) 238 printqueue(); 239 linesqueued = 0; 240 printline(l, ':', oflag ? &pmatch : NULL); 241 } else { 242 printline(l, '-', oflag ? &pmatch : NULL); 243 tail--; 244 } 245 } 246 if (oflag && !matchall) { 247 offset = pmatch.rm_eo; 248 goto redo; 249 } 250 return c; 251 } 252 253 #ifndef SMALL 254 void 255 fgrepcomp(fastgrep_t *fg, const unsigned char *pattern) 256 { 257 int i; 258 259 /* Initialize. */ 260 fg->patternLen = strlen(pattern); 261 fg->bol = 0; 262 fg->eol = 0; 263 fg->wmatch = wflag; 264 fg->reversedSearch = 0; 265 266 /* 267 * Make a copy and upper case it for later if in -i mode, 268 * else just copy the pointer. 269 */ 270 if (iflag) { 271 fg->pattern = grep_malloc(fg->patternLen + 1); 272 for (i = 0; i < fg->patternLen; i++) 273 fg->pattern[i] = toupper(pattern[i]); 274 fg->pattern[fg->patternLen] = '\0'; 275 } else 276 fg->pattern = (unsigned char *)pattern; /* really const */ 277 278 /* Preprocess pattern. */ 279 for (i = 0; i <= UCHAR_MAX; i++) 280 fg->qsBc[i] = fg->patternLen; 281 for (i = 1; i < fg->patternLen; i++) { 282 fg->qsBc[fg->pattern[i]] = fg->patternLen - i; 283 /* 284 * If case is ignored, make the jump apply to both upper and 285 * lower cased characters. As the pattern is stored in upper 286 * case, apply the same to the lower case equivalents. 287 */ 288 if (iflag) 289 fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i; 290 } 291 } 292 #endif 293 294 /* 295 * Returns: -1 on failure, 0 on success 296 */ 297 int 298 fastcomp(fastgrep_t *fg, const char *pattern) 299 { 300 #ifdef SMALL 301 return -1; 302 #else 303 int i; 304 int bol = 0; 305 int eol = 0; 306 int shiftPatternLen; 307 int hasDot = 0; 308 int firstHalfDot = -1; 309 int firstLastHalfDot = -1; 310 int lastHalfDot = 0; 311 312 /* Initialize. */ 313 fg->patternLen = strlen(pattern); 314 fg->bol = 0; 315 fg->eol = 0; 316 fg->wmatch = 0; 317 fg->reversedSearch = 0; 318 319 /* Remove end-of-line character ('$'). */ 320 if (fg->patternLen > 0 && pattern[fg->patternLen - 1] == '$') { 321 eol++; 322 fg->eol = 1; 323 fg->patternLen--; 324 } 325 326 /* Remove beginning-of-line character ('^'). */ 327 if (pattern[0] == '^') { 328 bol++; 329 fg->bol = 1; 330 fg->patternLen--; 331 } 332 333 /* Remove enclosing [[:<:]] and [[:>:]] (word match). */ 334 if (wflag) { 335 /* basic re's use \( \), extended re's ( ) */ 336 int extra = Eflag ? 1 : 2; 337 fg->patternLen -= 14 + 2 * extra; 338 fg->wmatch = 7 + extra; 339 } else if (fg->patternLen >= 14 && 340 strncmp(pattern + fg->bol, "[[:<:]]", 7) == 0 && 341 strncmp(pattern + fg->bol + fg->patternLen - 7, "[[:>:]]", 7) == 0) { 342 fg->patternLen -= 14; 343 fg->wmatch = 7; 344 } 345 346 /* 347 * Copy pattern minus '^' and '$' characters as well as word 348 * match character classes at the beginning and ending of the 349 * string respectively. 350 */ 351 fg->pattern = grep_malloc(fg->patternLen + 1); 352 memcpy(fg->pattern, pattern + bol + fg->wmatch, fg->patternLen); 353 fg->pattern[fg->patternLen] = '\0'; 354 355 /* Look for ways to cheat...er...avoid the full regex engine. */ 356 for (i = 0; i < fg->patternLen; i++) 357 { 358 switch (fg->pattern[i]) { 359 case '.': 360 hasDot = i; 361 if (i < fg->patternLen / 2) { 362 if (firstHalfDot < 0) 363 /* Closest dot to the beginning */ 364 firstHalfDot = i; 365 } else { 366 /* Closest dot to the end of the pattern. */ 367 lastHalfDot = i; 368 if (firstLastHalfDot < 0) 369 firstLastHalfDot = i; 370 } 371 break; 372 case '(': case ')': 373 case '{': case '}': 374 /* Special in BRE if preceded by '\\' */ 375 case '?': 376 case '+': 377 case '|': 378 /* Not special in BRE. */ 379 if (!Eflag) 380 goto nonspecial; 381 case '\\': 382 case '*': 383 case '[': case ']': 384 /* Free memory and let others know this is empty. */ 385 free(fg->pattern); 386 fg->pattern = NULL; 387 return (-1); 388 default: 389 nonspecial: 390 if (iflag) 391 fg->pattern[i] = toupper(fg->pattern[i]); 392 break; 393 } 394 } 395 396 /* 397 * Determine if a reverse search would be faster based on the placement 398 * of the dots. 399 */ 400 if ((!(lflag || cflag || oflag)) && ((!(bol || eol)) && 401 ((lastHalfDot) && ((firstHalfDot < 0) || 402 ((fg->patternLen - (lastHalfDot + 1)) < firstHalfDot))))) { 403 fg->reversedSearch = 1; 404 hasDot = fg->patternLen - (firstHalfDot < 0 ? 405 firstLastHalfDot : firstHalfDot) - 1; 406 grep_revstr(fg->pattern, fg->patternLen); 407 } 408 409 /* 410 * Normal Quick Search would require a shift based on the position the 411 * next character after the comparison is within the pattern. With 412 * wildcards, the position of the last dot effects the maximum shift 413 * distance. 414 * The closer to the end the wild card is the slower the search. A 415 * reverse version of this algorithm would be useful for wildcards near 416 * the end of the string. 417 * 418 * Examples: 419 * Pattern Max shift 420 * ------- --------- 421 * this 5 422 * .his 4 423 * t.is 3 424 * th.s 2 425 * thi. 1 426 */ 427 428 /* Adjust the shift based on location of the last dot ('.'). */ 429 shiftPatternLen = fg->patternLen - hasDot; 430 431 /* Preprocess pattern. */ 432 for (i = 0; i <= UCHAR_MAX; i++) 433 fg->qsBc[i] = shiftPatternLen; 434 for (i = hasDot + 1; i < fg->patternLen; i++) { 435 fg->qsBc[fg->pattern[i]] = fg->patternLen - i; 436 /* 437 * If case is ignored, make the jump apply to both upper and 438 * lower cased characters. As the pattern is stored in upper 439 * case, apply the same to the lower case equivalents. 440 */ 441 if (iflag) 442 fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i; 443 } 444 445 /* 446 * Put pattern back to normal after pre-processing to allow for easy 447 * comparisons later. 448 */ 449 if (fg->reversedSearch) 450 grep_revstr(fg->pattern, fg->patternLen); 451 452 return (0); 453 #endif 454 } 455 456 /* 457 * Word boundaries using regular expressions are defined as the point 458 * of transition from a non-word char to a word char, or vice versa. 459 * This means that grep -w +a and grep -w a+ never match anything, 460 * because they lack a starting or ending transition, but grep -w a+b 461 * does match a line containing a+b. 462 */ 463 #define wmatch(d, l, s, e) \ 464 ((s == 0 || !isword(d[s-1])) && (e == l || !isword(d[e])) && \ 465 e > s && isword(d[s]) && isword(d[e-1])) 466 467 static int 468 grep_search(fastgrep_t *fg, char *data, size_t dataLen, regmatch_t *pmatch, 469 int flags) 470 { 471 #ifdef SMALL 472 return 0; 473 #else 474 regoff_t j; 475 int rtrnVal = REG_NOMATCH; 476 477 pmatch->rm_so = -1; 478 pmatch->rm_eo = -1; 479 480 /* No point in going farther if we do not have enough data. */ 481 if (dataLen < fg->patternLen) 482 return (rtrnVal); 483 484 /* Only try once at the beginning or ending of the line. */ 485 if (fg->bol || fg->eol) { 486 if (fg->bol && (flags & REG_NOTBOL)) 487 return 0; 488 /* Simple text comparison. */ 489 /* Verify data is >= pattern length before searching on it. */ 490 if (dataLen >= fg->patternLen) { 491 /* Determine where in data to start search at. */ 492 if (fg->eol) 493 j = dataLen - fg->patternLen; 494 else 495 j = 0; 496 if (!((fg->bol && fg->eol) && (dataLen != fg->patternLen))) 497 if (grep_cmp(fg->pattern, data + j, 498 fg->patternLen)) { 499 pmatch->rm_so = j; 500 pmatch->rm_eo = j + fg->patternLen; 501 if (!fg->wmatch || wmatch(data, dataLen, 502 pmatch->rm_so, pmatch->rm_eo)) 503 rtrnVal = 0; 504 } 505 } 506 } else if (fg->reversedSearch) { 507 /* Quick Search algorithm. */ 508 j = dataLen; 509 do { 510 if (grep_cmp(fg->pattern, data + j - fg->patternLen, 511 fg->patternLen)) { 512 pmatch->rm_so = j - fg->patternLen; 513 pmatch->rm_eo = j; 514 if (!fg->wmatch || wmatch(data, dataLen, 515 pmatch->rm_so, pmatch->rm_eo)) { 516 rtrnVal = 0; 517 break; 518 } 519 } 520 /* Shift if within bounds, otherwise, we are done. */ 521 if (j == fg->patternLen) 522 break; 523 j -= fg->qsBc[(unsigned char)data[j - fg->patternLen - 1]]; 524 } while (j >= fg->patternLen); 525 } else { 526 /* Quick Search algorithm. */ 527 j = 0; 528 do { 529 if (grep_cmp(fg->pattern, data + j, fg->patternLen)) { 530 pmatch->rm_so = j; 531 pmatch->rm_eo = j + fg->patternLen; 532 if (fg->patternLen == 0 || !fg->wmatch || 533 wmatch(data, dataLen, pmatch->rm_so, 534 pmatch->rm_eo)) { 535 rtrnVal = 0; 536 break; 537 } 538 } 539 540 /* Shift if within bounds, otherwise, we are done. */ 541 if (j + fg->patternLen == dataLen) 542 break; 543 else 544 j += fg->qsBc[(unsigned char)data[j + fg->patternLen]]; 545 } while (j <= (dataLen - fg->patternLen)); 546 } 547 548 return (rtrnVal); 549 #endif 550 } 551 552 553 void * 554 grep_malloc(size_t size) 555 { 556 void *ptr; 557 558 if ((ptr = malloc(size)) == NULL) 559 err(2, "malloc"); 560 return ptr; 561 } 562 563 void * 564 grep_calloc(size_t nmemb, size_t size) 565 { 566 void *ptr; 567 568 if ((ptr = calloc(nmemb, size)) == NULL) 569 err(2, "calloc"); 570 return ptr; 571 } 572 573 void * 574 grep_realloc(void *ptr, size_t size) 575 { 576 if ((ptr = realloc(ptr, size)) == NULL) 577 err(2, "realloc"); 578 return ptr; 579 } 580 581 void * 582 grep_reallocarray(void *ptr, size_t nmemb, size_t size) 583 { 584 if ((ptr = reallocarray(ptr, nmemb, size)) == NULL) 585 err(2, "reallocarray"); 586 return ptr; 587 } 588 589 #ifndef SMALL 590 /* 591 * Returns: true on success, false on failure 592 */ 593 static bool 594 grep_cmp(const char *pattern, const char *data, size_t len) 595 { 596 size_t i; 597 598 for (i = 0; i < len; i++) { 599 if (((pattern[i] == data[i]) || (!Fflag && pattern[i] == '.')) 600 || (iflag && pattern[i] == toupper((unsigned char)data[i]))) 601 continue; 602 return false; 603 } 604 605 return true; 606 } 607 608 static void 609 grep_revstr(unsigned char *str, int len) 610 { 611 int i; 612 char c; 613 614 for (i = 0; i < len / 2; i++) { 615 c = str[i]; 616 str[i] = str[len - i - 1]; 617 str[len - i - 1] = c; 618 } 619 } 620 #endif 621 622 void 623 printline(str_t *line, int sep, regmatch_t *pmatch) 624 { 625 int n; 626 627 n = 0; 628 if (!hflag) { 629 fputs(line->file, stdout); 630 ++n; 631 } 632 if (nflag) { 633 if (n) 634 putchar(sep); 635 printf("%lld", line->line_no); 636 ++n; 637 } 638 if (bflag) { 639 if (n) 640 putchar(sep); 641 printf("%lld", (long long)line->off); 642 ++n; 643 } 644 if (n) 645 putchar(sep); 646 if (pmatch) 647 fwrite(line->dat + pmatch->rm_so, 648 pmatch->rm_eo - pmatch->rm_so, 1, stdout); 649 else 650 fwrite(line->dat, line->len, 1, stdout); 651 putchar('\n'); 652 } 653