1 /* $OpenBSD: util.c,v 1.68 2023/11/15 00:50:43 millert Exp $ */
2
3 /*-
4 * Copyright (c) 1999 James Howard and Dag-Erling Co�dan Sm�rgrav
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29 #include <sys/types.h>
30 #include <sys/stat.h>
31
32 #include <ctype.h>
33 #include <err.h>
34 #include <errno.h>
35 #include <fts.h>
36 #include <regex.h>
37 #include <stdbool.h>
38 #include <stdio.h>
39 #include <stdlib.h>
40 #include <string.h>
41 #include <unistd.h>
42 #include <zlib.h>
43
44 #include "grep.h"
45
46 /*
47 * Process a file line by line...
48 */
49
50 static int linesqueued;
51 static int procline(str_t *l, int);
52 static int grep_search(fastgrep_t *, char *, size_t, regmatch_t *pmatch, int);
53 #ifndef SMALL
54 static bool grep_cmp(const char *, const char *, size_t);
55 static void grep_revstr(unsigned char *, int);
56 #endif
57
58 int
grep_tree(char ** argv)59 grep_tree(char **argv)
60 {
61 FTS *fts;
62 FTSENT *p;
63 int c, fts_flags;
64 char *dot_argv[] = { ".", NULL };
65 char *path;
66
67 /* default to . if no path given */
68 if (argv[0] == NULL)
69 argv = dot_argv;
70
71 c = 0;
72
73 fts_flags = FTS_PHYSICAL | FTS_NOSTAT | FTS_NOCHDIR;
74
75 if (!(fts = fts_open(argv, fts_flags, NULL)))
76 err(2, NULL);
77 while ((p = fts_read(fts)) != NULL) {
78 switch (p->fts_info) {
79 case FTS_DNR:
80 break;
81 case FTS_ERR:
82 file_err = 1;
83 if(!sflag)
84 warnc(p->fts_errno, "%s", p->fts_path);
85 break;
86 case FTS_D:
87 case FTS_DP:
88 break;
89 default:
90 path = p->fts_path;
91 /* skip "./" if implied */
92 if (argv == dot_argv && p->fts_pathlen >= 2)
93 path += 2;
94 c |= procfile(path);
95 break;
96 }
97 }
98 if (errno)
99 err(2, "fts_read");
100 fts_close(fts);
101 return c;
102 }
103
104 int
procfile(char * fn)105 procfile(char *fn)
106 {
107 str_t ln;
108 file_t *f;
109 int t, z, nottext, overflow = 0;
110 unsigned long long c;
111
112 mcount = mlimit;
113
114 if (fn == NULL) {
115 fn = "(standard input)";
116 f = grep_fdopen(STDIN_FILENO);
117 } else {
118 f = grep_open(fn);
119 }
120 if (f == NULL) {
121 if (errno == EISDIR)
122 return 0;
123 file_err = 1;
124 if (!sflag)
125 warn("%s", fn);
126 return 0;
127 }
128
129 nottext = grep_bin_file(f);
130 if (nottext && binbehave == BIN_FILE_SKIP) {
131 grep_close(f);
132 return 0;
133 }
134
135 ln.file = fn;
136 if (labelname)
137 ln.file = (char *)labelname;
138 ln.line_no = 0;
139 ln.len = 0;
140 linesqueued = 0;
141 tail = 0;
142 ln.off = -1;
143
144 if (Bflag > 0)
145 initqueue();
146 for (c = 0; c == 0 || !(lflag || qflag); ) {
147 if (mflag && mlimit == 0)
148 break;
149 ln.off += ln.len + 1;
150 if ((ln.dat = grep_fgetln(f, &ln.len)) == NULL)
151 break;
152 if (ln.len > 0 && ln.dat[ln.len - 1] == '\n')
153 --ln.len;
154 ln.line_no++;
155
156 z = tail;
157
158 if ((t = procline(&ln, nottext)) == 0 && Bflag > 0 && z == 0) {
159 enqueue(&ln);
160 linesqueued++;
161 }
162 if (ULLONG_MAX - c < (unsigned long long)t)
163 overflow = 1;
164 else
165 c += t;
166 if (mflag && mcount <= 0 && tail <= 0)
167 break;
168 }
169 if (Bflag > 0)
170 clearqueue();
171 grep_close(f);
172
173 if (cflag) {
174 if (!hflag)
175 printf("%s%c", ln.file, nullflag ? '\0' : ':');
176 printf("%llu%s\n", c, overflow ? "+" : "");
177 }
178 if (lflag && c != 0)
179 printf("%s%c", fn, nullflag ? '\0' : '\n');
180 if (Lflag && c == 0)
181 printf("%s%c", fn, nullflag ? '\0' : '\n');
182 if (c && !cflag && !lflag && !Lflag &&
183 binbehave == BIN_FILE_BIN && nottext && !qflag)
184 printf("Binary file %s matches\n", fn);
185
186 return overflow || c != 0;
187 }
188
189
190 /*
191 * Process an individual line in a file. Return non-zero if it matches.
192 */
193
194 #define isword(x) (isalnum((unsigned char)x) || (x) == '_')
195
196 static int
procline(str_t * l,int nottext)197 procline(str_t *l, int nottext)
198 {
199 regmatch_t pmatch = { 0 };
200 int c, i, r, counted;
201 regoff_t offset;
202
203 /* size_t will be converted to regoff_t. ssize_t is guaranteed to fit
204 * into regoff_t */
205 if (l->len > SSIZE_MAX) {
206 errx(2, "Line is too big to process");
207 }
208
209 c = 0;
210 i = 0;
211 counted = 0;
212 if (matchall) {
213 c = 1;
214 goto print;
215 }
216 if (mflag && mcount <= 0)
217 goto print;
218
219 for (i = 0; i < patterns; i++) {
220 offset = 0;
221 redo:
222 if (fg_pattern[i].pattern) {
223 int flags = 0;
224 if (offset)
225 flags |= REG_NOTBOL;
226 r = grep_search(&fg_pattern[i], l->dat + offset,
227 l->len - offset, &pmatch, flags);
228 pmatch.rm_so += offset;
229 pmatch.rm_eo += offset;
230 } else {
231 int flags = eflags;
232 if (offset)
233 flags |= REG_NOTBOL;
234 pmatch.rm_so = offset;
235 pmatch.rm_eo = l->len;
236 r = regexec(&r_pattern[i], l->dat, 1, &pmatch, flags);
237 }
238 if (r == 0 && xflag) {
239 if (pmatch.rm_so != 0 || pmatch.rm_eo != l->len)
240 r = REG_NOMATCH;
241 }
242 if (r == 0) {
243 c = 1;
244 if (oflag && pmatch.rm_so != pmatch.rm_eo)
245 goto print;
246 break;
247 }
248 }
249 if (oflag)
250 return c;
251 print:
252 if (vflag)
253 c = !c;
254
255 /* Count the matches if there is a match limit (but only once). */
256 if (mflag && !counted) {
257 mcount -= c;
258 counted = 1;
259 }
260
261 if (c && binbehave == BIN_FILE_BIN && nottext)
262 return c; /* Binary file */
263
264 if ((tail > 0 || c) && !cflag && !qflag) {
265 if (c) {
266 if (first > 0 && tail == 0 && (Aflag || (Bflag &&
267 Bflag < linesqueued)))
268 printf("--\n");
269 first = 1;
270 tail = Aflag;
271 if (Bflag > 0)
272 printqueue();
273 linesqueued = 0;
274 printline(l, ':', oflag ? &pmatch : NULL);
275 } else {
276 printline(l, '-', oflag ? &pmatch : NULL);
277 tail--;
278 }
279 }
280 if (oflag && !matchall) {
281 offset = pmatch.rm_eo;
282 goto redo;
283 }
284 return c;
285 }
286
287 #ifndef SMALL
288 void
fgrepcomp(fastgrep_t * fg,const unsigned char * pattern)289 fgrepcomp(fastgrep_t *fg, const unsigned char *pattern)
290 {
291 int i;
292
293 /* Initialize. */
294 fg->patternLen = strlen(pattern);
295 fg->bol = 0;
296 fg->eol = 0;
297 fg->wmatch = wflag;
298 fg->reversedSearch = 0;
299
300 /*
301 * Make a copy and upper case it for later if in -i mode,
302 * else just copy the pointer.
303 */
304 if (iflag) {
305 fg->pattern = grep_malloc(fg->patternLen + 1);
306 for (i = 0; i < fg->patternLen; i++)
307 fg->pattern[i] = toupper(pattern[i]);
308 fg->pattern[fg->patternLen] = '\0';
309 } else
310 fg->pattern = (unsigned char *)pattern; /* really const */
311
312 /* Preprocess pattern. */
313 for (i = 0; i <= UCHAR_MAX; i++)
314 fg->qsBc[i] = fg->patternLen;
315 for (i = 1; i < fg->patternLen; i++) {
316 fg->qsBc[fg->pattern[i]] = fg->patternLen - i;
317 /*
318 * If case is ignored, make the jump apply to both upper and
319 * lower cased characters. As the pattern is stored in upper
320 * case, apply the same to the lower case equivalents.
321 */
322 if (iflag)
323 fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i;
324 }
325 }
326 #endif
327
328 /*
329 * Returns: -1 on failure, 0 on success
330 */
331 int
fastcomp(fastgrep_t * fg,const char * pattern)332 fastcomp(fastgrep_t *fg, const char *pattern)
333 {
334 #ifdef SMALL
335 return -1;
336 #else
337 int i;
338 int bol = 0;
339 int eol = 0;
340 int shiftPatternLen;
341 int hasDot = 0;
342 int firstHalfDot = -1;
343 int firstLastHalfDot = -1;
344 int lastHalfDot = 0;
345
346 /* Initialize. */
347 fg->patternLen = strlen(pattern);
348 fg->bol = 0;
349 fg->eol = 0;
350 fg->wmatch = 0;
351 fg->reversedSearch = 0;
352
353 /* Remove end-of-line character ('$'). */
354 if (fg->patternLen > 0 && pattern[fg->patternLen - 1] == '$') {
355 eol++;
356 fg->eol = 1;
357 fg->patternLen--;
358 }
359
360 /* Remove beginning-of-line character ('^'). */
361 if (pattern[0] == '^') {
362 bol++;
363 fg->bol = 1;
364 fg->patternLen--;
365 }
366
367 /* Remove enclosing [[:<:]] and [[:>:]] (word match). */
368 if (wflag) {
369 /* basic re's use \( \), extended re's ( ) */
370 int extra = Eflag ? 1 : 2;
371 fg->patternLen -= 14 + 2 * extra;
372 fg->wmatch = 7 + extra;
373 } else if (fg->patternLen >= 14 &&
374 strncmp(pattern + fg->bol, "[[:<:]]", 7) == 0 &&
375 strncmp(pattern + fg->bol + fg->patternLen - 7, "[[:>:]]", 7) == 0) {
376 fg->patternLen -= 14;
377 fg->wmatch = 7;
378 }
379
380 /*
381 * Copy pattern minus '^' and '$' characters as well as word
382 * match character classes at the beginning and ending of the
383 * string respectively.
384 */
385 fg->pattern = grep_malloc(fg->patternLen + 1);
386 memcpy(fg->pattern, pattern + bol + fg->wmatch, fg->patternLen);
387 fg->pattern[fg->patternLen] = '\0';
388
389 /* Look for ways to cheat...er...avoid the full regex engine. */
390 for (i = 0; i < fg->patternLen; i++)
391 {
392 switch (fg->pattern[i]) {
393 case '.':
394 hasDot = i;
395 if (i < fg->patternLen / 2) {
396 if (firstHalfDot < 0)
397 /* Closest dot to the beginning */
398 firstHalfDot = i;
399 } else {
400 /* Closest dot to the end of the pattern. */
401 lastHalfDot = i;
402 if (firstLastHalfDot < 0)
403 firstLastHalfDot = i;
404 }
405 break;
406 case '(': case ')':
407 case '{': case '}':
408 /* Special in BRE if preceded by '\\' */
409 case '?':
410 case '+':
411 case '|':
412 /* Not special in BRE. */
413 if (!Eflag)
414 goto nonspecial;
415 case '\\':
416 case '*':
417 case '[': case ']':
418 /* Free memory and let others know this is empty. */
419 free(fg->pattern);
420 fg->pattern = NULL;
421 return (-1);
422 default:
423 nonspecial:
424 if (iflag)
425 fg->pattern[i] = toupper(fg->pattern[i]);
426 break;
427 }
428 }
429
430 /*
431 * Determine if a reverse search would be faster based on the placement
432 * of the dots.
433 */
434 if ((!(lflag || cflag || oflag)) && ((!(bol || eol)) &&
435 ((lastHalfDot) && ((firstHalfDot < 0) ||
436 ((fg->patternLen - (lastHalfDot + 1)) < firstHalfDot))))) {
437 fg->reversedSearch = 1;
438 hasDot = fg->patternLen - (firstHalfDot < 0 ?
439 firstLastHalfDot : firstHalfDot) - 1;
440 grep_revstr(fg->pattern, fg->patternLen);
441 }
442
443 /*
444 * Normal Quick Search would require a shift based on the position the
445 * next character after the comparison is within the pattern. With
446 * wildcards, the position of the last dot effects the maximum shift
447 * distance.
448 * The closer to the end the wild card is the slower the search. A
449 * reverse version of this algorithm would be useful for wildcards near
450 * the end of the string.
451 *
452 * Examples:
453 * Pattern Max shift
454 * ------- ---------
455 * this 5
456 * .his 4
457 * t.is 3
458 * th.s 2
459 * thi. 1
460 */
461
462 /* Adjust the shift based on location of the last dot ('.'). */
463 shiftPatternLen = fg->patternLen - hasDot;
464
465 /* Preprocess pattern. */
466 for (i = 0; i <= UCHAR_MAX; i++)
467 fg->qsBc[i] = shiftPatternLen;
468 for (i = hasDot + 1; i < fg->patternLen; i++) {
469 fg->qsBc[fg->pattern[i]] = fg->patternLen - i;
470 /*
471 * If case is ignored, make the jump apply to both upper and
472 * lower cased characters. As the pattern is stored in upper
473 * case, apply the same to the lower case equivalents.
474 */
475 if (iflag)
476 fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i;
477 }
478
479 /*
480 * Put pattern back to normal after pre-processing to allow for easy
481 * comparisons later.
482 */
483 if (fg->reversedSearch)
484 grep_revstr(fg->pattern, fg->patternLen);
485
486 return (0);
487 #endif
488 }
489
490 /*
491 * Word boundaries using regular expressions are defined as the point
492 * of transition from a non-word char to a word char, or vice versa.
493 * This means that grep -w +a and grep -w a+ never match anything,
494 * because they lack a starting or ending transition, but grep -w a+b
495 * does match a line containing a+b.
496 */
497 #define wmatch(d, l, s, e) \
498 ((s == 0 || !isword(d[s-1])) && (e == l || !isword(d[e])) && \
499 e > s && isword(d[s]) && isword(d[e-1]))
500
501 static int
grep_search(fastgrep_t * fg,char * data,size_t dataLen,regmatch_t * pmatch,int flags)502 grep_search(fastgrep_t *fg, char *data, size_t dataLen, regmatch_t *pmatch,
503 int flags)
504 {
505 #ifdef SMALL
506 return 0;
507 #else
508 regoff_t j;
509 int rtrnVal = REG_NOMATCH;
510
511 pmatch->rm_so = -1;
512 pmatch->rm_eo = -1;
513
514 /* No point in going farther if we do not have enough data. */
515 if (dataLen < fg->patternLen)
516 return (rtrnVal);
517
518 /* Only try once at the beginning or ending of the line. */
519 if (fg->bol || fg->eol) {
520 if (fg->bol && (flags & REG_NOTBOL))
521 return 0;
522 /* Simple text comparison. */
523 /* Verify data is >= pattern length before searching on it. */
524 if (dataLen >= fg->patternLen) {
525 /* Determine where in data to start search at. */
526 if (fg->eol)
527 j = dataLen - fg->patternLen;
528 else
529 j = 0;
530 if (!((fg->bol && fg->eol) && (dataLen != fg->patternLen)))
531 if (grep_cmp(fg->pattern, data + j,
532 fg->patternLen)) {
533 pmatch->rm_so = j;
534 pmatch->rm_eo = j + fg->patternLen;
535 if (!fg->wmatch || wmatch(data, dataLen,
536 pmatch->rm_so, pmatch->rm_eo))
537 rtrnVal = 0;
538 }
539 }
540 } else if (fg->reversedSearch) {
541 /* Quick Search algorithm. */
542 j = dataLen;
543 do {
544 if (grep_cmp(fg->pattern, data + j - fg->patternLen,
545 fg->patternLen)) {
546 pmatch->rm_so = j - fg->patternLen;
547 pmatch->rm_eo = j;
548 if (!fg->wmatch || wmatch(data, dataLen,
549 pmatch->rm_so, pmatch->rm_eo)) {
550 rtrnVal = 0;
551 break;
552 }
553 }
554 /* Shift if within bounds, otherwise, we are done. */
555 if (j == fg->patternLen)
556 break;
557 j -= fg->qsBc[(unsigned char)data[j - fg->patternLen - 1]];
558 } while (j >= fg->patternLen);
559 } else {
560 /* Quick Search algorithm. */
561 j = 0;
562 do {
563 if (grep_cmp(fg->pattern, data + j, fg->patternLen)) {
564 pmatch->rm_so = j;
565 pmatch->rm_eo = j + fg->patternLen;
566 if (fg->patternLen == 0 || !fg->wmatch ||
567 wmatch(data, dataLen, pmatch->rm_so,
568 pmatch->rm_eo)) {
569 rtrnVal = 0;
570 break;
571 }
572 }
573
574 /* Shift if within bounds, otherwise, we are done. */
575 if (j + fg->patternLen == dataLen)
576 break;
577 else
578 j += fg->qsBc[(unsigned char)data[j + fg->patternLen]];
579 } while (j <= (dataLen - fg->patternLen));
580 }
581
582 return (rtrnVal);
583 #endif
584 }
585
586
587 void *
grep_malloc(size_t size)588 grep_malloc(size_t size)
589 {
590 void *ptr;
591
592 if ((ptr = malloc(size)) == NULL)
593 err(2, "malloc");
594 return ptr;
595 }
596
597 void *
grep_calloc(size_t nmemb,size_t size)598 grep_calloc(size_t nmemb, size_t size)
599 {
600 void *ptr;
601
602 if ((ptr = calloc(nmemb, size)) == NULL)
603 err(2, "calloc");
604 return ptr;
605 }
606
607 void *
grep_realloc(void * ptr,size_t size)608 grep_realloc(void *ptr, size_t size)
609 {
610 if ((ptr = realloc(ptr, size)) == NULL)
611 err(2, "realloc");
612 return ptr;
613 }
614
615 void *
grep_reallocarray(void * ptr,size_t nmemb,size_t size)616 grep_reallocarray(void *ptr, size_t nmemb, size_t size)
617 {
618 if ((ptr = reallocarray(ptr, nmemb, size)) == NULL)
619 err(2, "reallocarray");
620 return ptr;
621 }
622
623 #ifndef SMALL
624 /*
625 * Returns: true on success, false on failure
626 */
627 static bool
grep_cmp(const char * pattern,const char * data,size_t len)628 grep_cmp(const char *pattern, const char *data, size_t len)
629 {
630 size_t i;
631
632 for (i = 0; i < len; i++) {
633 if (((pattern[i] == data[i]) || (!Fflag && pattern[i] == '.'))
634 || (iflag && pattern[i] == toupper((unsigned char)data[i])))
635 continue;
636 return false;
637 }
638
639 return true;
640 }
641
642 static void
grep_revstr(unsigned char * str,int len)643 grep_revstr(unsigned char *str, int len)
644 {
645 int i;
646 char c;
647
648 for (i = 0; i < len / 2; i++) {
649 c = str[i];
650 str[i] = str[len - i - 1];
651 str[len - i - 1] = c;
652 }
653 }
654 #endif
655
656 void
printline(str_t * line,int sep,regmatch_t * pmatch)657 printline(str_t *line, int sep, regmatch_t *pmatch)
658 {
659 int n;
660
661 n = 0;
662 if (!hflag) {
663 fputs(line->file, stdout);
664 if (nullflag)
665 putchar(0);
666 else
667 ++n;
668 }
669 if (nflag) {
670 if (n)
671 putchar(sep);
672 printf("%lld", line->line_no);
673 ++n;
674 }
675 if (bflag) {
676 if (n)
677 putchar(sep);
678 printf("%lld", (long long)line->off +
679 (pmatch ? pmatch->rm_so : 0));
680 ++n;
681 }
682 if (n)
683 putchar(sep);
684 if (pmatch)
685 fwrite(line->dat + pmatch->rm_so,
686 pmatch->rm_eo - pmatch->rm_so, 1, stdout);
687 else
688 fwrite(line->dat, line->len, 1, stdout);
689 putchar('\n');
690 }
691