xref: /openbsd-src/usr.bin/grep/util.c (revision 99fd087599a8791921855f21bd7e36130f39aadc)
1 /*	$OpenBSD: util.c,v 1.62 2019/12/03 09:14:37 jca Exp $	*/
2 
3 /*-
4  * Copyright (c) 1999 James Howard and Dag-Erling Co�dan Sm�rgrav
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/types.h>
30 #include <sys/stat.h>
31 
32 #include <ctype.h>
33 #include <err.h>
34 #include <errno.h>
35 #include <fts.h>
36 #include <regex.h>
37 #include <stdbool.h>
38 #include <stdio.h>
39 #include <stdlib.h>
40 #include <string.h>
41 #include <unistd.h>
42 #include <zlib.h>
43 
44 #include "grep.h"
45 
46 /*
47  * Process a file line by line...
48  */
49 
50 static int	linesqueued;
51 static int	procline(str_t *l, int);
52 static int	grep_search(fastgrep_t *, char *, size_t, regmatch_t *pmatch, int);
53 #ifndef SMALL
54 static bool	grep_cmp(const char *, const char *, size_t);
55 static void	grep_revstr(unsigned char *, int);
56 #endif
57 
58 int
59 grep_tree(char **argv)
60 {
61 	FTS	*fts;
62 	FTSENT	*p;
63 	int	c, fts_flags;
64 	char	*dot_argv[] = { ".", NULL };
65 	char	*path;
66 
67 	/* default to . if no path given */
68 	if (argv[0] == NULL)
69 		argv = dot_argv;
70 
71 	c = 0;
72 
73 	fts_flags = FTS_PHYSICAL | FTS_NOSTAT | FTS_NOCHDIR;
74 
75 	if (!(fts = fts_open(argv, fts_flags, NULL)))
76 		err(2, NULL);
77 	while ((p = fts_read(fts)) != NULL) {
78 		switch (p->fts_info) {
79 		case FTS_DNR:
80 			break;
81 		case FTS_ERR:
82 			file_err = 1;
83 			if(!sflag)
84 				warnc(p->fts_errno, "%s", p->fts_path);
85 			break;
86 		case FTS_D:
87 		case FTS_DP:
88 			break;
89 		default:
90 			path = p->fts_path;
91 			/* skip "./" if implied */
92 			if (argv == dot_argv && p->fts_pathlen >= 2)
93 				path += 2;
94 			c += procfile(path);
95 			break;
96 		}
97 	}
98 	if (errno)
99 		err(2, "fts_read");
100 	fts_close(fts);
101 	return c;
102 }
103 
104 int
105 procfile(char *fn)
106 {
107 	str_t ln;
108 	file_t *f;
109 	int c, t, z, nottext;
110 
111 	mcount = mlimit;
112 
113 	if (fn == NULL) {
114 		fn = "(standard input)";
115 		f = grep_fdopen(STDIN_FILENO);
116 	} else {
117 		f = grep_open(fn);
118 	}
119 	if (f == NULL) {
120 		if (errno == EISDIR)
121 			return 0;
122 		file_err = 1;
123 		if (!sflag)
124 			warn("%s", fn);
125 		return 0;
126 	}
127 
128 	nottext = grep_bin_file(f);
129 	if (nottext && binbehave == BIN_FILE_SKIP) {
130 		grep_close(f);
131 		return 0;
132 	}
133 
134 	ln.file = fn;
135 	if (labelname)
136 		ln.file = (char *)labelname;
137 	ln.line_no = 0;
138 	ln.len = 0;
139 	linesqueued = 0;
140 	tail = 0;
141 	ln.off = -1;
142 
143 	if (Bflag > 0)
144 		initqueue();
145 	for (c = 0;  c == 0 || !(lflag || qflag); ) {
146 		if (mflag && mlimit == 0)
147 			break;
148 		ln.off += ln.len + 1;
149 		if ((ln.dat = grep_fgetln(f, &ln.len)) == NULL)
150 			break;
151 		if (ln.len > 0 && ln.dat[ln.len - 1] == '\n')
152 			--ln.len;
153 		ln.line_no++;
154 
155 		z = tail;
156 
157 		if ((t = procline(&ln, nottext)) == 0 && Bflag > 0 && z == 0) {
158 			enqueue(&ln);
159 			linesqueued++;
160 		}
161 		c += t;
162 		if (mflag && mcount <= 0)
163 			break;
164 	}
165 	if (Bflag > 0)
166 		clearqueue();
167 	grep_close(f);
168 
169 	if (cflag) {
170 		if (!hflag)
171 			printf("%s:", ln.file);
172 		printf("%u\n", c);
173 	}
174 	if (lflag && c != 0)
175 		printf("%s\n", fn);
176 	if (Lflag && c == 0)
177 		printf("%s\n", fn);
178 	if (c && !cflag && !lflag && !Lflag &&
179 	    binbehave == BIN_FILE_BIN && nottext && !qflag)
180 		printf("Binary file %s matches\n", fn);
181 
182 	return c;
183 }
184 
185 
186 /*
187  * Process an individual line in a file. Return non-zero if it matches.
188  */
189 
190 #define isword(x) (isalnum((unsigned char)x) || (x) == '_')
191 
192 static int
193 procline(str_t *l, int nottext)
194 {
195 	regmatch_t	pmatch = { 0 };
196 	int		c, i, r;
197 	regoff_t	offset;
198 
199 	/* size_t will be converted to regoff_t. ssize_t is guaranteed to fit
200 	 * into regoff_t */
201 	if (l->len > SSIZE_MAX) {
202 		errx(2, "Line is too big to process");
203 	}
204 
205 	c = 0;
206 	i = 0;
207 	if (matchall) {
208 		c = 1;
209 		goto print;
210 	}
211 
212 	for (i = 0; i < patterns; i++) {
213 		offset = 0;
214 redo:
215 		if (fg_pattern[i].pattern) {
216 			int flags = 0;
217 			if (offset)
218 				flags |= REG_NOTBOL;
219 			r = grep_search(&fg_pattern[i], l->dat + offset,
220 			    l->len - offset, &pmatch, flags);
221 			pmatch.rm_so += offset;
222 			pmatch.rm_eo += offset;
223 		} else {
224 			int flags = eflags;
225 			if (offset)
226 				flags |= REG_NOTBOL;
227 			pmatch.rm_so = offset;
228 			pmatch.rm_eo = l->len;
229 			r = regexec(&r_pattern[i], l->dat, 1, &pmatch, flags);
230 		}
231 		if (r == 0 && xflag) {
232 			if (pmatch.rm_so != 0 || pmatch.rm_eo != l->len)
233 				r = REG_NOMATCH;
234 		}
235 		if (r == 0) {
236 			c = 1;
237 			if (oflag && pmatch.rm_so != pmatch.rm_eo)
238 				goto print;
239 			break;
240 		}
241 	}
242 	if (oflag)
243 		return c;
244 print:
245 	if (vflag)
246 		c = !c;
247 
248 	/* Count the matches if we have a match limit */
249 	if (mflag)
250 		mcount -= c;
251 
252 	if (c && binbehave == BIN_FILE_BIN && nottext)
253 		return c; /* Binary file */
254 
255 	if ((tail > 0 || c) && !cflag && !qflag) {
256 		if (c) {
257 			if (first > 0 && tail == 0 && (Bflag < linesqueued) &&
258 			    (Aflag || Bflag))
259 				printf("--\n");
260 			first = 1;
261 			tail = Aflag;
262 			if (Bflag > 0)
263 				printqueue();
264 			linesqueued = 0;
265 			printline(l, ':', oflag ? &pmatch : NULL);
266 		} else {
267 			printline(l, '-', oflag ? &pmatch : NULL);
268 			tail--;
269 		}
270 	}
271 	if (oflag && !matchall) {
272 		offset = pmatch.rm_eo;
273 		goto redo;
274 	}
275 	return c;
276 }
277 
278 #ifndef SMALL
279 void
280 fgrepcomp(fastgrep_t *fg, const unsigned char *pattern)
281 {
282 	int i;
283 
284 	/* Initialize. */
285 	fg->patternLen = strlen(pattern);
286 	fg->bol = 0;
287 	fg->eol = 0;
288 	fg->wmatch = wflag;
289 	fg->reversedSearch = 0;
290 
291 	/*
292 	 * Make a copy and upper case it for later if in -i mode,
293 	 * else just copy the pointer.
294 	 */
295 	if (iflag) {
296 		fg->pattern = grep_malloc(fg->patternLen + 1);
297 		for (i = 0; i < fg->patternLen; i++)
298 			fg->pattern[i] = toupper(pattern[i]);
299 		fg->pattern[fg->patternLen] = '\0';
300 	} else
301 		fg->pattern = (unsigned char *)pattern;	/* really const */
302 
303 	/* Preprocess pattern. */
304 	for (i = 0; i <= UCHAR_MAX; i++)
305 		fg->qsBc[i] = fg->patternLen;
306 	for (i = 1; i < fg->patternLen; i++) {
307 		fg->qsBc[fg->pattern[i]] = fg->patternLen - i;
308 		/*
309 		 * If case is ignored, make the jump apply to both upper and
310 		 * lower cased characters.  As the pattern is stored in upper
311 		 * case, apply the same to the lower case equivalents.
312 		 */
313 		if (iflag)
314 			fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i;
315 	}
316 }
317 #endif
318 
319 /*
320  * Returns: -1 on failure, 0 on success
321  */
322 int
323 fastcomp(fastgrep_t *fg, const char *pattern)
324 {
325 #ifdef SMALL
326 	return -1;
327 #else
328 	int i;
329 	int bol = 0;
330 	int eol = 0;
331 	int shiftPatternLen;
332 	int hasDot = 0;
333 	int firstHalfDot = -1;
334 	int firstLastHalfDot = -1;
335 	int lastHalfDot = 0;
336 
337 	/* Initialize. */
338 	fg->patternLen = strlen(pattern);
339 	fg->bol = 0;
340 	fg->eol = 0;
341 	fg->wmatch = 0;
342 	fg->reversedSearch = 0;
343 
344 	/* Remove end-of-line character ('$'). */
345 	if (fg->patternLen > 0 && pattern[fg->patternLen - 1] == '$') {
346 		eol++;
347 		fg->eol = 1;
348 		fg->patternLen--;
349 	}
350 
351 	/* Remove beginning-of-line character ('^'). */
352 	if (pattern[0] == '^') {
353 		bol++;
354 		fg->bol = 1;
355 		fg->patternLen--;
356 	}
357 
358 	/* Remove enclosing [[:<:]] and [[:>:]] (word match). */
359 	if (wflag) {
360 		/* basic re's use \( \), extended re's ( ) */
361 		int extra = Eflag ? 1 : 2;
362 		fg->patternLen -= 14 + 2 * extra;
363 		fg->wmatch = 7 + extra;
364 	} else if (fg->patternLen >= 14 &&
365 	    strncmp(pattern + fg->bol, "[[:<:]]", 7) == 0 &&
366 	    strncmp(pattern + fg->bol + fg->patternLen - 7, "[[:>:]]", 7) == 0) {
367 		fg->patternLen -= 14;
368 		fg->wmatch = 7;
369 	}
370 
371 	/*
372 	 * Copy pattern minus '^' and '$' characters as well as word
373 	 * match character classes at the beginning and ending of the
374 	 * string respectively.
375 	 */
376 	fg->pattern = grep_malloc(fg->patternLen + 1);
377 	memcpy(fg->pattern, pattern + bol + fg->wmatch, fg->patternLen);
378 	fg->pattern[fg->patternLen] = '\0';
379 
380 	/* Look for ways to cheat...er...avoid the full regex engine. */
381 	for (i = 0; i < fg->patternLen; i++)
382 	{
383 		switch (fg->pattern[i]) {
384 		case '.':
385 			hasDot = i;
386 			if (i < fg->patternLen / 2) {
387 				if (firstHalfDot < 0)
388 					/* Closest dot to the beginning */
389 					firstHalfDot = i;
390 			} else {
391 				/* Closest dot to the end of the pattern. */
392 				lastHalfDot = i;
393 				if (firstLastHalfDot < 0)
394 					firstLastHalfDot = i;
395 			}
396 			break;
397 		case '(': case ')':
398 		case '{': case '}':
399 			/* Special in BRE if preceded by '\\' */
400 		case '?':
401 		case '+':
402 		case '|':
403 			/* Not special in BRE. */
404 			if (!Eflag)
405 				goto nonspecial;
406 		case '\\':
407 		case '*':
408 		case '[': case ']':
409 			/* Free memory and let others know this is empty. */
410 			free(fg->pattern);
411 			fg->pattern = NULL;
412 			return (-1);
413 		default:
414 nonspecial:
415 			if (iflag)
416 				fg->pattern[i] = toupper(fg->pattern[i]);
417 			break;
418 		}
419 	}
420 
421 	/*
422 	 * Determine if a reverse search would be faster based on the placement
423 	 * of the dots.
424 	 */
425 	if ((!(lflag || cflag || oflag)) && ((!(bol || eol)) &&
426 	    ((lastHalfDot) && ((firstHalfDot < 0) ||
427 	    ((fg->patternLen - (lastHalfDot + 1)) < firstHalfDot))))) {
428 		fg->reversedSearch = 1;
429 		hasDot = fg->patternLen - (firstHalfDot < 0 ?
430 		    firstLastHalfDot : firstHalfDot) - 1;
431 		grep_revstr(fg->pattern, fg->patternLen);
432 	}
433 
434 	/*
435 	 * Normal Quick Search would require a shift based on the position the
436 	 * next character after the comparison is within the pattern.  With
437 	 * wildcards, the position of the last dot effects the maximum shift
438 	 * distance.
439 	 * The closer to the end the wild card is the slower the search.  A
440 	 * reverse version of this algorithm would be useful for wildcards near
441 	 * the end of the string.
442 	 *
443 	 * Examples:
444 	 * Pattern	Max shift
445 	 * -------	---------
446 	 * this		5
447 	 * .his		4
448 	 * t.is		3
449 	 * th.s		2
450 	 * thi.		1
451 	 */
452 
453 	/* Adjust the shift based on location of the last dot ('.'). */
454 	shiftPatternLen = fg->patternLen - hasDot;
455 
456 	/* Preprocess pattern. */
457 	for (i = 0; i <= UCHAR_MAX; i++)
458 		fg->qsBc[i] = shiftPatternLen;
459 	for (i = hasDot + 1; i < fg->patternLen; i++) {
460 		fg->qsBc[fg->pattern[i]] = fg->patternLen - i;
461 		/*
462 		 * If case is ignored, make the jump apply to both upper and
463 		 * lower cased characters.  As the pattern is stored in upper
464 		 * case, apply the same to the lower case equivalents.
465 		 */
466 		if (iflag)
467 			fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i;
468 	}
469 
470 	/*
471 	 * Put pattern back to normal after pre-processing to allow for easy
472 	 * comparisons later.
473 	 */
474 	if (fg->reversedSearch)
475 		grep_revstr(fg->pattern, fg->patternLen);
476 
477 	return (0);
478 #endif
479 }
480 
481 /*
482  * Word boundaries using regular expressions are defined as the point
483  * of transition from a non-word char to a word char, or vice versa.
484  * This means that grep -w +a and grep -w a+ never match anything,
485  * because they lack a starting or ending transition, but grep -w a+b
486  * does match a line containing a+b.
487  */
488 #define wmatch(d, l, s, e)	\
489 	((s == 0 || !isword(d[s-1])) && (e == l || !isword(d[e])) && \
490 	  e > s && isword(d[s]) && isword(d[e-1]))
491 
492 static int
493 grep_search(fastgrep_t *fg, char *data, size_t dataLen, regmatch_t *pmatch,
494     int flags)
495 {
496 #ifdef SMALL
497 	return 0;
498 #else
499 	regoff_t j;
500 	int rtrnVal = REG_NOMATCH;
501 
502 	pmatch->rm_so = -1;
503 	pmatch->rm_eo = -1;
504 
505 	/* No point in going farther if we do not have enough data. */
506 	if (dataLen < fg->patternLen)
507 		return (rtrnVal);
508 
509 	/* Only try once at the beginning or ending of the line. */
510 	if (fg->bol || fg->eol) {
511 		if (fg->bol && (flags & REG_NOTBOL))
512 			return 0;
513 		/* Simple text comparison. */
514 		/* Verify data is >= pattern length before searching on it. */
515 		if (dataLen >= fg->patternLen) {
516 			/* Determine where in data to start search at. */
517 			if (fg->eol)
518 				j = dataLen - fg->patternLen;
519 			else
520 				j = 0;
521 			if (!((fg->bol && fg->eol) && (dataLen != fg->patternLen)))
522 				if (grep_cmp(fg->pattern, data + j,
523 				    fg->patternLen)) {
524 					pmatch->rm_so = j;
525 					pmatch->rm_eo = j + fg->patternLen;
526 					if (!fg->wmatch || wmatch(data, dataLen,
527 					    pmatch->rm_so, pmatch->rm_eo))
528 						rtrnVal = 0;
529 				}
530 		}
531 	} else if (fg->reversedSearch) {
532 		/* Quick Search algorithm. */
533 		j = dataLen;
534 		do {
535 			if (grep_cmp(fg->pattern, data + j - fg->patternLen,
536 			    fg->patternLen)) {
537 				pmatch->rm_so = j - fg->patternLen;
538 				pmatch->rm_eo = j;
539 				if (!fg->wmatch || wmatch(data, dataLen,
540 				    pmatch->rm_so, pmatch->rm_eo)) {
541 					rtrnVal = 0;
542 					break;
543 				}
544 			}
545 			/* Shift if within bounds, otherwise, we are done. */
546 			if (j == fg->patternLen)
547 				break;
548 			j -= fg->qsBc[(unsigned char)data[j - fg->patternLen - 1]];
549 		} while (j >= fg->patternLen);
550 	} else {
551 		/* Quick Search algorithm. */
552 		j = 0;
553 		do {
554 			if (grep_cmp(fg->pattern, data + j, fg->patternLen)) {
555 				pmatch->rm_so = j;
556 				pmatch->rm_eo = j + fg->patternLen;
557 				if (fg->patternLen == 0 || !fg->wmatch ||
558 				    wmatch(data, dataLen, pmatch->rm_so,
559 				    pmatch->rm_eo)) {
560 					rtrnVal = 0;
561 					break;
562 				}
563 			}
564 
565 			/* Shift if within bounds, otherwise, we are done. */
566 			if (j + fg->patternLen == dataLen)
567 				break;
568 			else
569 				j += fg->qsBc[(unsigned char)data[j + fg->patternLen]];
570 		} while (j <= (dataLen - fg->patternLen));
571 	}
572 
573 	return (rtrnVal);
574 #endif
575 }
576 
577 
578 void *
579 grep_malloc(size_t size)
580 {
581 	void	*ptr;
582 
583 	if ((ptr = malloc(size)) == NULL)
584 		err(2, "malloc");
585 	return ptr;
586 }
587 
588 void *
589 grep_calloc(size_t nmemb, size_t size)
590 {
591 	void	*ptr;
592 
593 	if ((ptr = calloc(nmemb, size)) == NULL)
594 		err(2, "calloc");
595 	return ptr;
596 }
597 
598 void *
599 grep_realloc(void *ptr, size_t size)
600 {
601 	if ((ptr = realloc(ptr, size)) == NULL)
602 		err(2, "realloc");
603 	return ptr;
604 }
605 
606 void *
607 grep_reallocarray(void *ptr, size_t nmemb, size_t size)
608 {
609 	if ((ptr = reallocarray(ptr, nmemb, size)) == NULL)
610 		err(2, "reallocarray");
611 	return ptr;
612 }
613 
614 #ifndef SMALL
615 /*
616  * Returns:	true on success, false on failure
617  */
618 static bool
619 grep_cmp(const char *pattern, const char *data, size_t len)
620 {
621 	size_t i;
622 
623 	for (i = 0; i < len; i++) {
624 		if (((pattern[i] == data[i]) || (!Fflag && pattern[i] == '.'))
625 		    || (iflag && pattern[i] == toupper((unsigned char)data[i])))
626 			continue;
627 		return false;
628 	}
629 
630 	return true;
631 }
632 
633 static void
634 grep_revstr(unsigned char *str, int len)
635 {
636 	int i;
637 	char c;
638 
639 	for (i = 0; i < len / 2; i++) {
640 		c = str[i];
641 		str[i] = str[len - i - 1];
642 		str[len - i - 1] = c;
643 	}
644 }
645 #endif
646 
647 void
648 printline(str_t *line, int sep, regmatch_t *pmatch)
649 {
650 	int n;
651 
652 	n = 0;
653 	if (!hflag) {
654 		fputs(line->file, stdout);
655 		++n;
656 	}
657 	if (nflag) {
658 		if (n)
659 			putchar(sep);
660 		printf("%lld", line->line_no);
661 		++n;
662 	}
663 	if (bflag) {
664 		if (n)
665 			putchar(sep);
666 		printf("%lld", (long long)line->off +
667 		    (pmatch ? pmatch->rm_so : 0));
668 		++n;
669 	}
670 	if (n)
671 		putchar(sep);
672 	if (pmatch)
673 		fwrite(line->dat + pmatch->rm_so,
674 		    pmatch->rm_eo - pmatch->rm_so, 1, stdout);
675 	else
676 		fwrite(line->dat, line->len, 1, stdout);
677 	putchar('\n');
678 }
679