1 /* $OpenBSD: util.c,v 1.48 2014/05/20 01:25:23 guenther Exp $ */ 2 3 /*- 4 * Copyright (c) 1999 James Howard and Dag-Erling Co�dan Sm�rgrav 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 #include <sys/types.h> 30 #include <sys/stat.h> 31 32 #include <ctype.h> 33 #include <err.h> 34 #include <errno.h> 35 #include <fts.h> 36 #include <regex.h> 37 #include <stdio.h> 38 #include <stdlib.h> 39 #include <string.h> 40 #include <unistd.h> 41 #include <zlib.h> 42 43 #include "grep.h" 44 45 /* 46 * Process a file line by line... 47 */ 48 49 static int linesqueued; 50 static int procline(str_t *l, int); 51 static int grep_search(fastgrep_t *, char *, size_t, regmatch_t *pmatch); 52 #ifndef SMALL 53 static int grep_cmp(const char *, const char *, size_t); 54 static void grep_revstr(unsigned char *, int); 55 #endif 56 57 int 58 grep_tree(char **argv) 59 { 60 FTS *fts; 61 FTSENT *p; 62 int c, fts_flags; 63 64 c = 0; 65 66 fts_flags = FTS_PHYSICAL | FTS_NOSTAT | FTS_NOCHDIR; 67 68 if (!(fts = fts_open(argv, fts_flags, NULL))) 69 err(2, NULL); 70 while ((p = fts_read(fts)) != NULL) { 71 switch (p->fts_info) { 72 case FTS_DNR: 73 break; 74 case FTS_ERR: 75 file_err = 1; 76 if(!sflag) 77 warnc(p->fts_errno, "%s", p->fts_path); 78 break; 79 case FTS_DP: 80 break; 81 default: 82 c += procfile(p->fts_path); 83 break; 84 } 85 } 86 if (errno) 87 err(2, "fts_read"); 88 89 return c; 90 } 91 92 int 93 procfile(char *fn) 94 { 95 str_t ln; 96 file_t *f; 97 int c, t, z, nottext; 98 99 if (fn == NULL) { 100 fn = "(standard input)"; 101 f = grep_fdopen(STDIN_FILENO, "r"); 102 } else { 103 f = grep_open(fn, "r"); 104 } 105 if (f == NULL) { 106 file_err = 1; 107 if (!sflag) 108 warn("%s", fn); 109 return 0; 110 } 111 112 nottext = grep_bin_file(f); 113 if (nottext && binbehave == BIN_FILE_SKIP) { 114 grep_close(f); 115 return 0; 116 } 117 118 ln.file = fn; 119 ln.line_no = 0; 120 ln.len = 0; 121 linesqueued = 0; 122 tail = 0; 123 ln.off = -1; 124 125 if (Bflag > 0) 126 initqueue(); 127 for (c = 0; c == 0 || !(lflag || qflag); ) { 128 ln.off += ln.len + 1; 129 if ((ln.dat = grep_fgetln(f, &ln.len)) == NULL) 130 break; 131 if (ln.len > 0 && ln.dat[ln.len - 1] == '\n') 132 --ln.len; 133 ln.line_no++; 134 135 z = tail; 136 137 if ((t = procline(&ln, nottext)) == 0 && Bflag > 0 && z == 0) { 138 enqueue(&ln); 139 linesqueued++; 140 } 141 c += t; 142 } 143 if (Bflag > 0) 144 clearqueue(); 145 grep_close(f); 146 147 if (cflag) { 148 if (!hflag) 149 printf("%s:", ln.file); 150 printf("%u\n", c); 151 } 152 if (lflag && c != 0) 153 printf("%s\n", fn); 154 if (Lflag && c == 0) 155 printf("%s\n", fn); 156 if (c && !cflag && !lflag && !Lflag && 157 binbehave == BIN_FILE_BIN && nottext && !qflag) 158 printf("Binary file %s matches\n", fn); 159 160 return c; 161 } 162 163 164 /* 165 * Process an individual line in a file. Return non-zero if it matches. 166 */ 167 168 #define isword(x) (isalnum((unsigned char)x) || (x) == '_') 169 170 static int 171 procline(str_t *l, int nottext) 172 { 173 regmatch_t pmatch; 174 int c, i, r; 175 regoff_t offset; 176 177 /* size_t will be converted to regoff_t. ssize_t is guaranteed to fit 178 * into regoff_t */ 179 if (l->len > SSIZE_MAX) { 180 errx(2, "Line is too big to process"); 181 } 182 183 c = 0; 184 i = 0; 185 if (matchall) { 186 c = 1; 187 goto print; 188 } 189 190 for (i = 0; i < patterns; i++) { 191 offset = 0; 192 redo: 193 if (fg_pattern[i].pattern) { 194 r = grep_search(&fg_pattern[i], l->dat + offset, 195 l->len - offset, &pmatch); 196 pmatch.rm_so += offset; 197 pmatch.rm_eo += offset; 198 } else { 199 pmatch.rm_so = offset; 200 pmatch.rm_eo = l->len; 201 r = regexec(&r_pattern[i], l->dat, 1, &pmatch, eflags); 202 } 203 if (r == 0 && xflag) { 204 if (pmatch.rm_so != 0 || pmatch.rm_eo != l->len) 205 r = REG_NOMATCH; 206 } 207 if (r == 0) { 208 c = 1; 209 if (oflag && pmatch.rm_so != pmatch.rm_eo) 210 goto print; 211 break; 212 } 213 } 214 if (oflag) 215 return c; 216 print: 217 if (vflag) 218 c = !c; 219 220 if (c && binbehave == BIN_FILE_BIN && nottext) 221 return c; /* Binary file */ 222 223 if ((tail > 0 || c) && !cflag && !qflag) { 224 if (c) { 225 if (first > 0 && tail == 0 && (Bflag < linesqueued) && 226 (Aflag || Bflag)) 227 printf("--\n"); 228 first = 1; 229 tail = Aflag; 230 if (Bflag > 0) 231 printqueue(); 232 linesqueued = 0; 233 printline(l, ':', oflag ? &pmatch : NULL); 234 } else { 235 printline(l, '-', oflag ? &pmatch : NULL); 236 tail--; 237 } 238 } 239 if (oflag && !matchall) { 240 offset = pmatch.rm_eo; 241 goto redo; 242 } 243 return c; 244 } 245 246 #ifndef SMALL 247 void 248 fgrepcomp(fastgrep_t *fg, const unsigned char *pattern) 249 { 250 int i; 251 252 /* Initialize. */ 253 fg->patternLen = strlen(pattern); 254 fg->bol = 0; 255 fg->eol = 0; 256 fg->wmatch = wflag; 257 fg->reversedSearch = 0; 258 259 /* 260 * Make a copy and upper case it for later if in -i mode, 261 * else just copy the pointer. 262 */ 263 if (iflag) { 264 fg->pattern = grep_malloc(fg->patternLen + 1); 265 for (i = 0; i < fg->patternLen; i++) 266 fg->pattern[i] = toupper(pattern[i]); 267 fg->pattern[fg->patternLen] = '\0'; 268 } else 269 fg->pattern = (unsigned char *)pattern; /* really const */ 270 271 /* Preprocess pattern. */ 272 for (i = 0; i <= UCHAR_MAX; i++) 273 fg->qsBc[i] = fg->patternLen; 274 for (i = 1; i < fg->patternLen; i++) { 275 fg->qsBc[fg->pattern[i]] = fg->patternLen - i; 276 /* 277 * If case is ignored, make the jump apply to both upper and 278 * lower cased characters. As the pattern is stored in upper 279 * case, apply the same to the lower case equivalents. 280 */ 281 if (iflag) 282 fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i; 283 } 284 } 285 #endif 286 287 /* 288 * Returns: -1 on failure, 0 on success 289 */ 290 int 291 fastcomp(fastgrep_t *fg, const char *pattern) 292 { 293 #ifdef SMALL 294 return -1; 295 #else 296 int i; 297 int bol = 0; 298 int eol = 0; 299 int shiftPatternLen; 300 int hasDot = 0; 301 int firstHalfDot = -1; 302 int firstLastHalfDot = -1; 303 int lastHalfDot = 0; 304 305 /* Initialize. */ 306 fg->patternLen = strlen(pattern); 307 fg->bol = 0; 308 fg->eol = 0; 309 fg->wmatch = 0; 310 fg->reversedSearch = 0; 311 312 /* Remove end-of-line character ('$'). */ 313 if (fg->patternLen > 0 && pattern[fg->patternLen - 1] == '$') { 314 eol++; 315 fg->eol = 1; 316 fg->patternLen--; 317 } 318 319 /* Remove beginning-of-line character ('^'). */ 320 if (pattern[0] == '^') { 321 bol++; 322 fg->bol = 1; 323 fg->patternLen--; 324 } 325 326 /* Remove enclosing [[:<:]] and [[:>:]] (word match). */ 327 if (wflag) { 328 /* basic re's use \( \), extended re's ( ) */ 329 int extra = Eflag ? 1 : 2; 330 fg->patternLen -= 14 + 2 * extra; 331 fg->wmatch = 7 + extra; 332 } else if (fg->patternLen >= 14 && 333 strncmp(pattern + fg->bol, "[[:<:]]", 7) == 0 && 334 strncmp(pattern + fg->bol + fg->patternLen - 7, "[[:>:]]", 7) == 0) { 335 fg->patternLen -= 14; 336 fg->wmatch = 7; 337 } 338 339 /* 340 * Copy pattern minus '^' and '$' characters as well as word 341 * match character classes at the beginning and ending of the 342 * string respectively. 343 */ 344 fg->pattern = grep_malloc(fg->patternLen + 1); 345 memcpy(fg->pattern, pattern + bol + fg->wmatch, fg->patternLen); 346 fg->pattern[fg->patternLen] = '\0'; 347 348 /* Look for ways to cheat...er...avoid the full regex engine. */ 349 for (i = 0; i < fg->patternLen; i++) 350 { 351 switch (fg->pattern[i]) { 352 case '.': 353 hasDot = i; 354 if (i < fg->patternLen / 2) { 355 if (firstHalfDot < 0) 356 /* Closest dot to the beginning */ 357 firstHalfDot = i; 358 } else { 359 /* Closest dot to the end of the pattern. */ 360 lastHalfDot = i; 361 if (firstLastHalfDot < 0) 362 firstLastHalfDot = i; 363 } 364 break; 365 case '(': case ')': 366 case '{': case '}': 367 /* Special in BRE if preceded by '\\' */ 368 case '?': 369 case '+': 370 case '|': 371 /* Not special in BRE. */ 372 if (!Eflag) 373 goto nonspecial; 374 case '\\': 375 case '*': 376 case '[': case ']': 377 /* Free memory and let others know this is empty. */ 378 free(fg->pattern); 379 fg->pattern = NULL; 380 return (-1); 381 default: 382 nonspecial: 383 if (iflag) 384 fg->pattern[i] = toupper(fg->pattern[i]); 385 break; 386 } 387 } 388 389 /* 390 * Determine if a reverse search would be faster based on the placement 391 * of the dots. 392 */ 393 if ((!(lflag || cflag)) && ((!(bol || eol)) && 394 ((lastHalfDot) && ((firstHalfDot < 0) || 395 ((fg->patternLen - (lastHalfDot + 1)) < firstHalfDot))))) { 396 fg->reversedSearch = 1; 397 hasDot = fg->patternLen - (firstHalfDot < 0 ? 398 firstLastHalfDot : firstHalfDot) - 1; 399 grep_revstr(fg->pattern, fg->patternLen); 400 } 401 402 /* 403 * Normal Quick Search would require a shift based on the position the 404 * next character after the comparison is within the pattern. With 405 * wildcards, the position of the last dot effects the maximum shift 406 * distance. 407 * The closer to the end the wild card is the slower the search. A 408 * reverse version of this algorithm would be useful for wildcards near 409 * the end of the string. 410 * 411 * Examples: 412 * Pattern Max shift 413 * ------- --------- 414 * this 5 415 * .his 4 416 * t.is 3 417 * th.s 2 418 * thi. 1 419 */ 420 421 /* Adjust the shift based on location of the last dot ('.'). */ 422 shiftPatternLen = fg->patternLen - hasDot; 423 424 /* Preprocess pattern. */ 425 for (i = 0; i <= UCHAR_MAX; i++) 426 fg->qsBc[i] = shiftPatternLen; 427 for (i = hasDot + 1; i < fg->patternLen; i++) { 428 fg->qsBc[fg->pattern[i]] = fg->patternLen - i; 429 /* 430 * If case is ignored, make the jump apply to both upper and 431 * lower cased characters. As the pattern is stored in upper 432 * case, apply the same to the lower case equivalents. 433 */ 434 if (iflag) 435 fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i; 436 } 437 438 /* 439 * Put pattern back to normal after pre-processing to allow for easy 440 * comparisons later. 441 */ 442 if (fg->reversedSearch) 443 grep_revstr(fg->pattern, fg->patternLen); 444 445 return (0); 446 #endif 447 } 448 449 /* 450 * Word boundaries using regular expressions are defined as the point 451 * of transition from a non-word char to a word char, or vice versa. 452 * This means that grep -w +a and grep -w a+ never match anything, 453 * because they lack a starting or ending transition, but grep -w a+b 454 * does match a line containing a+b. 455 */ 456 #define wmatch(d, l, s, e) \ 457 ((s == 0 || !isword(d[s-1])) && (e == l || !isword(d[e])) && \ 458 e > s && isword(d[s]) && isword(d[e-1])) 459 460 static int 461 grep_search(fastgrep_t *fg, char *data, size_t dataLen, regmatch_t *pmatch) 462 { 463 #ifdef SMALL 464 return 0; 465 #else 466 regoff_t j; 467 int rtrnVal = REG_NOMATCH; 468 469 pmatch->rm_so = -1; 470 pmatch->rm_eo = -1; 471 472 /* No point in going farther if we do not have enough data. */ 473 if (dataLen < fg->patternLen) 474 return (rtrnVal); 475 476 /* Only try once at the beginning or ending of the line. */ 477 if (fg->bol || fg->eol) { 478 /* Simple text comparison. */ 479 /* Verify data is >= pattern length before searching on it. */ 480 if (dataLen >= fg->patternLen) { 481 /* Determine where in data to start search at. */ 482 if (fg->eol) 483 j = dataLen - fg->patternLen; 484 else 485 j = 0; 486 if (!((fg->bol && fg->eol) && (dataLen != fg->patternLen))) 487 if (grep_cmp(fg->pattern, data + j, 488 fg->patternLen) == -1) { 489 pmatch->rm_so = j; 490 pmatch->rm_eo = j + fg->patternLen; 491 if (!fg->wmatch || wmatch(data, dataLen, 492 pmatch->rm_so, pmatch->rm_eo)) 493 rtrnVal = 0; 494 } 495 } 496 } else if (fg->reversedSearch) { 497 /* Quick Search algorithm. */ 498 j = dataLen; 499 do { 500 if (grep_cmp(fg->pattern, data + j - fg->patternLen, 501 fg->patternLen) == -1) { 502 pmatch->rm_so = j - fg->patternLen; 503 pmatch->rm_eo = j; 504 if (!fg->wmatch || wmatch(data, dataLen, 505 pmatch->rm_so, pmatch->rm_eo)) { 506 rtrnVal = 0; 507 break; 508 } 509 } 510 /* Shift if within bounds, otherwise, we are done. */ 511 if (j == fg->patternLen) 512 break; 513 j -= fg->qsBc[(unsigned char)data[j - fg->patternLen - 1]]; 514 } while (j >= fg->patternLen); 515 } else { 516 /* Quick Search algorithm. */ 517 j = 0; 518 do { 519 if (grep_cmp(fg->pattern, data + j, fg->patternLen) == -1) { 520 pmatch->rm_so = j; 521 pmatch->rm_eo = j + fg->patternLen; 522 if (fg->patternLen == 0 || !fg->wmatch || 523 wmatch(data, dataLen, pmatch->rm_so, 524 pmatch->rm_eo)) { 525 rtrnVal = 0; 526 break; 527 } 528 } 529 530 /* Shift if within bounds, otherwise, we are done. */ 531 if (j + fg->patternLen == dataLen) 532 break; 533 else 534 j += fg->qsBc[(unsigned char)data[j + fg->patternLen]]; 535 } while (j <= (dataLen - fg->patternLen)); 536 } 537 538 return (rtrnVal); 539 #endif 540 } 541 542 543 void * 544 grep_malloc(size_t size) 545 { 546 void *ptr; 547 548 if ((ptr = malloc(size)) == NULL) 549 err(2, "malloc"); 550 return ptr; 551 } 552 553 void * 554 grep_calloc(size_t nmemb, size_t size) 555 { 556 void *ptr; 557 558 if ((ptr = calloc(nmemb, size)) == NULL) 559 err(2, "calloc"); 560 return ptr; 561 } 562 563 void * 564 grep_realloc(void *ptr, size_t size) 565 { 566 if ((ptr = realloc(ptr, size)) == NULL) 567 err(2, "realloc"); 568 return ptr; 569 } 570 571 #ifndef SMALL 572 /* 573 * Returns: i >= 0 on failure (position that it failed) 574 * -1 on success 575 */ 576 static int 577 grep_cmp(const char *pattern, const char *data, size_t len) 578 { 579 int i; 580 581 for (i = 0; i < len; i++) { 582 if (((pattern[i] == data[i]) || (!Fflag && pattern[i] == '.')) 583 || (iflag && pattern[i] == toupper(data[i]))) 584 continue; 585 return (i); 586 } 587 588 return (-1); 589 } 590 591 static void 592 grep_revstr(unsigned char *str, int len) 593 { 594 int i; 595 char c; 596 597 for (i = 0; i < len / 2; i++) { 598 c = str[i]; 599 str[i] = str[len - i - 1]; 600 str[len - i - 1] = c; 601 } 602 } 603 #endif 604 605 void 606 printline(str_t *line, int sep, regmatch_t *pmatch) 607 { 608 int n; 609 610 n = 0; 611 if (!hflag) { 612 fputs(line->file, stdout); 613 ++n; 614 } 615 if (nflag) { 616 if (n) 617 putchar(sep); 618 printf("%d", line->line_no); 619 ++n; 620 } 621 if (bflag) { 622 if (n) 623 putchar(sep); 624 printf("%lld", (long long)line->off); 625 ++n; 626 } 627 if (n) 628 putchar(sep); 629 if (pmatch) 630 fwrite(line->dat + pmatch->rm_so, 631 pmatch->rm_eo - pmatch->rm_so, 1, stdout); 632 else 633 fwrite(line->dat, line->len, 1, stdout); 634 putchar('\n'); 635 } 636