xref: /openbsd-src/usr.bin/grep/util.c (revision c90a81c56dcebd6a1b73fe4aff9b03385b8e63b3)
1 /*	$OpenBSD: util.c,v 1.58 2017/12/09 18:38:37 pirofti Exp $	*/
2 
3 /*-
4  * Copyright (c) 1999 James Howard and Dag-Erling Co�dan Sm�rgrav
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/types.h>
30 #include <sys/stat.h>
31 
32 #include <ctype.h>
33 #include <err.h>
34 #include <errno.h>
35 #include <fts.h>
36 #include <regex.h>
37 #include <stdbool.h>
38 #include <stdio.h>
39 #include <stdlib.h>
40 #include <string.h>
41 #include <unistd.h>
42 #include <zlib.h>
43 
44 #include "grep.h"
45 
46 /*
47  * Process a file line by line...
48  */
49 
50 static int	linesqueued;
51 static int	procline(str_t *l, int);
52 static int	grep_search(fastgrep_t *, char *, size_t, regmatch_t *pmatch, int);
53 #ifndef SMALL
54 static bool	grep_cmp(const char *, const char *, size_t);
55 static void	grep_revstr(unsigned char *, int);
56 #endif
57 
58 int
59 grep_tree(char **argv)
60 {
61 	FTS	*fts;
62 	FTSENT	*p;
63 	int	c, fts_flags;
64 
65 	c = 0;
66 
67 	fts_flags = FTS_PHYSICAL | FTS_NOSTAT | FTS_NOCHDIR;
68 
69 	if (!(fts = fts_open(argv, fts_flags, NULL)))
70 		err(2, NULL);
71 	while ((p = fts_read(fts)) != NULL) {
72 		switch (p->fts_info) {
73 		case FTS_DNR:
74 			break;
75 		case FTS_ERR:
76 			file_err = 1;
77 			if(!sflag)
78 				warnc(p->fts_errno, "%s", p->fts_path);
79 			break;
80 		case FTS_DP:
81 			break;
82 		default:
83 			c += procfile(p->fts_path);
84 			break;
85 		}
86 	}
87 	if (errno)
88 		err(2, "fts_read");
89 	fts_close(fts);
90 	return c;
91 }
92 
93 int
94 procfile(char *fn)
95 {
96 	str_t ln;
97 	file_t *f;
98 	int c, t, z, nottext;
99 
100 	mcount = mlimit;
101 
102 	if (fn == NULL) {
103 		fn = "(standard input)";
104 		f = grep_fdopen(STDIN_FILENO, "r");
105 	} else {
106 		f = grep_open(fn, "r");
107 	}
108 	if (f == NULL) {
109 		file_err = 1;
110 		if (!sflag)
111 			warn("%s", fn);
112 		return 0;
113 	}
114 
115 	nottext = grep_bin_file(f);
116 	if (nottext && binbehave == BIN_FILE_SKIP) {
117 		grep_close(f);
118 		return 0;
119 	}
120 
121 	ln.file = fn;
122 	ln.line_no = 0;
123 	ln.len = 0;
124 	linesqueued = 0;
125 	tail = 0;
126 	ln.off = -1;
127 
128 	if (Bflag > 0)
129 		initqueue();
130 	for (c = 0;  c == 0 || !(lflag || qflag); ) {
131 		if (mflag && mlimit == 0)
132 			break;
133 		ln.off += ln.len + 1;
134 		if ((ln.dat = grep_fgetln(f, &ln.len)) == NULL)
135 			break;
136 		if (ln.len > 0 && ln.dat[ln.len - 1] == '\n')
137 			--ln.len;
138 		ln.line_no++;
139 
140 		z = tail;
141 
142 		if ((t = procline(&ln, nottext)) == 0 && Bflag > 0 && z == 0) {
143 			enqueue(&ln);
144 			linesqueued++;
145 		}
146 		c += t;
147 		if (mflag && mcount <= 0)
148 			break;
149 	}
150 	if (Bflag > 0)
151 		clearqueue();
152 	grep_close(f);
153 
154 	if (cflag) {
155 		if (!hflag)
156 			printf("%s:", ln.file);
157 		printf("%u\n", c);
158 	}
159 	if (lflag && c != 0)
160 		printf("%s\n", fn);
161 	if (Lflag && c == 0)
162 		printf("%s\n", fn);
163 	if (c && !cflag && !lflag && !Lflag &&
164 	    binbehave == BIN_FILE_BIN && nottext && !qflag)
165 		printf("Binary file %s matches\n", fn);
166 
167 	return c;
168 }
169 
170 
171 /*
172  * Process an individual line in a file. Return non-zero if it matches.
173  */
174 
175 #define isword(x) (isalnum((unsigned char)x) || (x) == '_')
176 
177 static int
178 procline(str_t *l, int nottext)
179 {
180 	regmatch_t	pmatch = { 0 };
181 	int		c, i, r;
182 	regoff_t	offset;
183 
184 	/* size_t will be converted to regoff_t. ssize_t is guaranteed to fit
185 	 * into regoff_t */
186 	if (l->len > SSIZE_MAX) {
187 		errx(2, "Line is too big to process");
188 	}
189 
190 	c = 0;
191 	i = 0;
192 	if (matchall) {
193 		c = 1;
194 		goto print;
195 	}
196 
197 	for (i = 0; i < patterns; i++) {
198 		offset = 0;
199 redo:
200 		if (fg_pattern[i].pattern) {
201 			int flags = 0;
202 			if (offset)
203 				flags |= REG_NOTBOL;
204 			r = grep_search(&fg_pattern[i], l->dat + offset,
205 			    l->len - offset, &pmatch, flags);
206 			pmatch.rm_so += offset;
207 			pmatch.rm_eo += offset;
208 		} else {
209 			int flags = eflags;
210 			if (offset)
211 				flags |= REG_NOTBOL;
212 			pmatch.rm_so = offset;
213 			pmatch.rm_eo = l->len;
214 			r = regexec(&r_pattern[i], l->dat, 1, &pmatch, flags);
215 		}
216 		if (r == 0 && xflag) {
217 			if (pmatch.rm_so != 0 || pmatch.rm_eo != l->len)
218 				r = REG_NOMATCH;
219 		}
220 		if (r == 0) {
221 			c = 1;
222 			if (oflag && pmatch.rm_so != pmatch.rm_eo)
223 				goto print;
224 			break;
225 		}
226 	}
227 	if (oflag)
228 		return c;
229 print:
230 	if (vflag)
231 		c = !c;
232 
233 	/* Count the matches if we have a match limit */
234 	if (mflag)
235 		mcount -= c;
236 
237 	if (c && binbehave == BIN_FILE_BIN && nottext)
238 		return c; /* Binary file */
239 
240 	if ((tail > 0 || c) && !cflag && !qflag) {
241 		if (c) {
242 			if (first > 0 && tail == 0 && (Bflag < linesqueued) &&
243 			    (Aflag || Bflag))
244 				printf("--\n");
245 			first = 1;
246 			tail = Aflag;
247 			if (Bflag > 0)
248 				printqueue();
249 			linesqueued = 0;
250 			printline(l, ':', oflag ? &pmatch : NULL);
251 		} else {
252 			printline(l, '-', oflag ? &pmatch : NULL);
253 			tail--;
254 		}
255 	}
256 	if (oflag && !matchall) {
257 		offset = pmatch.rm_eo;
258 		goto redo;
259 	}
260 	return c;
261 }
262 
263 #ifndef SMALL
264 void
265 fgrepcomp(fastgrep_t *fg, const unsigned char *pattern)
266 {
267 	int i;
268 
269 	/* Initialize. */
270 	fg->patternLen = strlen(pattern);
271 	fg->bol = 0;
272 	fg->eol = 0;
273 	fg->wmatch = wflag;
274 	fg->reversedSearch = 0;
275 
276 	/*
277 	 * Make a copy and upper case it for later if in -i mode,
278 	 * else just copy the pointer.
279 	 */
280 	if (iflag) {
281 		fg->pattern = grep_malloc(fg->patternLen + 1);
282 		for (i = 0; i < fg->patternLen; i++)
283 			fg->pattern[i] = toupper(pattern[i]);
284 		fg->pattern[fg->patternLen] = '\0';
285 	} else
286 		fg->pattern = (unsigned char *)pattern;	/* really const */
287 
288 	/* Preprocess pattern. */
289 	for (i = 0; i <= UCHAR_MAX; i++)
290 		fg->qsBc[i] = fg->patternLen;
291 	for (i = 1; i < fg->patternLen; i++) {
292 		fg->qsBc[fg->pattern[i]] = fg->patternLen - i;
293 		/*
294 		 * If case is ignored, make the jump apply to both upper and
295 		 * lower cased characters.  As the pattern is stored in upper
296 		 * case, apply the same to the lower case equivalents.
297 		 */
298 		if (iflag)
299 			fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i;
300 	}
301 }
302 #endif
303 
304 /*
305  * Returns: -1 on failure, 0 on success
306  */
307 int
308 fastcomp(fastgrep_t *fg, const char *pattern)
309 {
310 #ifdef SMALL
311 	return -1;
312 #else
313 	int i;
314 	int bol = 0;
315 	int eol = 0;
316 	int shiftPatternLen;
317 	int hasDot = 0;
318 	int firstHalfDot = -1;
319 	int firstLastHalfDot = -1;
320 	int lastHalfDot = 0;
321 
322 	/* Initialize. */
323 	fg->patternLen = strlen(pattern);
324 	fg->bol = 0;
325 	fg->eol = 0;
326 	fg->wmatch = 0;
327 	fg->reversedSearch = 0;
328 
329 	/* Remove end-of-line character ('$'). */
330 	if (fg->patternLen > 0 && pattern[fg->patternLen - 1] == '$') {
331 		eol++;
332 		fg->eol = 1;
333 		fg->patternLen--;
334 	}
335 
336 	/* Remove beginning-of-line character ('^'). */
337 	if (pattern[0] == '^') {
338 		bol++;
339 		fg->bol = 1;
340 		fg->patternLen--;
341 	}
342 
343 	/* Remove enclosing [[:<:]] and [[:>:]] (word match). */
344 	if (wflag) {
345 		/* basic re's use \( \), extended re's ( ) */
346 		int extra = Eflag ? 1 : 2;
347 		fg->patternLen -= 14 + 2 * extra;
348 		fg->wmatch = 7 + extra;
349 	} else if (fg->patternLen >= 14 &&
350 	    strncmp(pattern + fg->bol, "[[:<:]]", 7) == 0 &&
351 	    strncmp(pattern + fg->bol + fg->patternLen - 7, "[[:>:]]", 7) == 0) {
352 		fg->patternLen -= 14;
353 		fg->wmatch = 7;
354 	}
355 
356 	/*
357 	 * Copy pattern minus '^' and '$' characters as well as word
358 	 * match character classes at the beginning and ending of the
359 	 * string respectively.
360 	 */
361 	fg->pattern = grep_malloc(fg->patternLen + 1);
362 	memcpy(fg->pattern, pattern + bol + fg->wmatch, fg->patternLen);
363 	fg->pattern[fg->patternLen] = '\0';
364 
365 	/* Look for ways to cheat...er...avoid the full regex engine. */
366 	for (i = 0; i < fg->patternLen; i++)
367 	{
368 		switch (fg->pattern[i]) {
369 		case '.':
370 			hasDot = i;
371 			if (i < fg->patternLen / 2) {
372 				if (firstHalfDot < 0)
373 					/* Closest dot to the beginning */
374 					firstHalfDot = i;
375 			} else {
376 				/* Closest dot to the end of the pattern. */
377 				lastHalfDot = i;
378 				if (firstLastHalfDot < 0)
379 					firstLastHalfDot = i;
380 			}
381 			break;
382 		case '(': case ')':
383 		case '{': case '}':
384 			/* Special in BRE if preceded by '\\' */
385 		case '?':
386 		case '+':
387 		case '|':
388 			/* Not special in BRE. */
389 			if (!Eflag)
390 				goto nonspecial;
391 		case '\\':
392 		case '*':
393 		case '[': case ']':
394 			/* Free memory and let others know this is empty. */
395 			free(fg->pattern);
396 			fg->pattern = NULL;
397 			return (-1);
398 		default:
399 nonspecial:
400 			if (iflag)
401 				fg->pattern[i] = toupper(fg->pattern[i]);
402 			break;
403 		}
404 	}
405 
406 	/*
407 	 * Determine if a reverse search would be faster based on the placement
408 	 * of the dots.
409 	 */
410 	if ((!(lflag || cflag || oflag)) && ((!(bol || eol)) &&
411 	    ((lastHalfDot) && ((firstHalfDot < 0) ||
412 	    ((fg->patternLen - (lastHalfDot + 1)) < firstHalfDot))))) {
413 		fg->reversedSearch = 1;
414 		hasDot = fg->patternLen - (firstHalfDot < 0 ?
415 		    firstLastHalfDot : firstHalfDot) - 1;
416 		grep_revstr(fg->pattern, fg->patternLen);
417 	}
418 
419 	/*
420 	 * Normal Quick Search would require a shift based on the position the
421 	 * next character after the comparison is within the pattern.  With
422 	 * wildcards, the position of the last dot effects the maximum shift
423 	 * distance.
424 	 * The closer to the end the wild card is the slower the search.  A
425 	 * reverse version of this algorithm would be useful for wildcards near
426 	 * the end of the string.
427 	 *
428 	 * Examples:
429 	 * Pattern	Max shift
430 	 * -------	---------
431 	 * this		5
432 	 * .his		4
433 	 * t.is		3
434 	 * th.s		2
435 	 * thi.		1
436 	 */
437 
438 	/* Adjust the shift based on location of the last dot ('.'). */
439 	shiftPatternLen = fg->patternLen - hasDot;
440 
441 	/* Preprocess pattern. */
442 	for (i = 0; i <= UCHAR_MAX; i++)
443 		fg->qsBc[i] = shiftPatternLen;
444 	for (i = hasDot + 1; i < fg->patternLen; i++) {
445 		fg->qsBc[fg->pattern[i]] = fg->patternLen - i;
446 		/*
447 		 * If case is ignored, make the jump apply to both upper and
448 		 * lower cased characters.  As the pattern is stored in upper
449 		 * case, apply the same to the lower case equivalents.
450 		 */
451 		if (iflag)
452 			fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i;
453 	}
454 
455 	/*
456 	 * Put pattern back to normal after pre-processing to allow for easy
457 	 * comparisons later.
458 	 */
459 	if (fg->reversedSearch)
460 		grep_revstr(fg->pattern, fg->patternLen);
461 
462 	return (0);
463 #endif
464 }
465 
466 /*
467  * Word boundaries using regular expressions are defined as the point
468  * of transition from a non-word char to a word char, or vice versa.
469  * This means that grep -w +a and grep -w a+ never match anything,
470  * because they lack a starting or ending transition, but grep -w a+b
471  * does match a line containing a+b.
472  */
473 #define wmatch(d, l, s, e)	\
474 	((s == 0 || !isword(d[s-1])) && (e == l || !isword(d[e])) && \
475 	  e > s && isword(d[s]) && isword(d[e-1]))
476 
477 static int
478 grep_search(fastgrep_t *fg, char *data, size_t dataLen, regmatch_t *pmatch,
479     int flags)
480 {
481 #ifdef SMALL
482 	return 0;
483 #else
484 	regoff_t j;
485 	int rtrnVal = REG_NOMATCH;
486 
487 	pmatch->rm_so = -1;
488 	pmatch->rm_eo = -1;
489 
490 	/* No point in going farther if we do not have enough data. */
491 	if (dataLen < fg->patternLen)
492 		return (rtrnVal);
493 
494 	/* Only try once at the beginning or ending of the line. */
495 	if (fg->bol || fg->eol) {
496 		if (fg->bol && (flags & REG_NOTBOL))
497 			return 0;
498 		/* Simple text comparison. */
499 		/* Verify data is >= pattern length before searching on it. */
500 		if (dataLen >= fg->patternLen) {
501 			/* Determine where in data to start search at. */
502 			if (fg->eol)
503 				j = dataLen - fg->patternLen;
504 			else
505 				j = 0;
506 			if (!((fg->bol && fg->eol) && (dataLen != fg->patternLen)))
507 				if (grep_cmp(fg->pattern, data + j,
508 				    fg->patternLen)) {
509 					pmatch->rm_so = j;
510 					pmatch->rm_eo = j + fg->patternLen;
511 					if (!fg->wmatch || wmatch(data, dataLen,
512 					    pmatch->rm_so, pmatch->rm_eo))
513 						rtrnVal = 0;
514 				}
515 		}
516 	} else if (fg->reversedSearch) {
517 		/* Quick Search algorithm. */
518 		j = dataLen;
519 		do {
520 			if (grep_cmp(fg->pattern, data + j - fg->patternLen,
521 			    fg->patternLen)) {
522 				pmatch->rm_so = j - fg->patternLen;
523 				pmatch->rm_eo = j;
524 				if (!fg->wmatch || wmatch(data, dataLen,
525 				    pmatch->rm_so, pmatch->rm_eo)) {
526 					rtrnVal = 0;
527 					break;
528 				}
529 			}
530 			/* Shift if within bounds, otherwise, we are done. */
531 			if (j == fg->patternLen)
532 				break;
533 			j -= fg->qsBc[(unsigned char)data[j - fg->patternLen - 1]];
534 		} while (j >= fg->patternLen);
535 	} else {
536 		/* Quick Search algorithm. */
537 		j = 0;
538 		do {
539 			if (grep_cmp(fg->pattern, data + j, fg->patternLen)) {
540 				pmatch->rm_so = j;
541 				pmatch->rm_eo = j + fg->patternLen;
542 				if (fg->patternLen == 0 || !fg->wmatch ||
543 				    wmatch(data, dataLen, pmatch->rm_so,
544 				    pmatch->rm_eo)) {
545 					rtrnVal = 0;
546 					break;
547 				}
548 			}
549 
550 			/* Shift if within bounds, otherwise, we are done. */
551 			if (j + fg->patternLen == dataLen)
552 				break;
553 			else
554 				j += fg->qsBc[(unsigned char)data[j + fg->patternLen]];
555 		} while (j <= (dataLen - fg->patternLen));
556 	}
557 
558 	return (rtrnVal);
559 #endif
560 }
561 
562 
563 void *
564 grep_malloc(size_t size)
565 {
566 	void	*ptr;
567 
568 	if ((ptr = malloc(size)) == NULL)
569 		err(2, "malloc");
570 	return ptr;
571 }
572 
573 void *
574 grep_calloc(size_t nmemb, size_t size)
575 {
576 	void	*ptr;
577 
578 	if ((ptr = calloc(nmemb, size)) == NULL)
579 		err(2, "calloc");
580 	return ptr;
581 }
582 
583 void *
584 grep_realloc(void *ptr, size_t size)
585 {
586 	if ((ptr = realloc(ptr, size)) == NULL)
587 		err(2, "realloc");
588 	return ptr;
589 }
590 
591 void *
592 grep_reallocarray(void *ptr, size_t nmemb, size_t size)
593 {
594 	if ((ptr = reallocarray(ptr, nmemb, size)) == NULL)
595 		err(2, "reallocarray");
596 	return ptr;
597 }
598 
599 #ifndef SMALL
600 /*
601  * Returns:	true on success, false on failure
602  */
603 static bool
604 grep_cmp(const char *pattern, const char *data, size_t len)
605 {
606 	size_t i;
607 
608 	for (i = 0; i < len; i++) {
609 		if (((pattern[i] == data[i]) || (!Fflag && pattern[i] == '.'))
610 		    || (iflag && pattern[i] == toupper((unsigned char)data[i])))
611 			continue;
612 		return false;
613 	}
614 
615 	return true;
616 }
617 
618 static void
619 grep_revstr(unsigned char *str, int len)
620 {
621 	int i;
622 	char c;
623 
624 	for (i = 0; i < len / 2; i++) {
625 		c = str[i];
626 		str[i] = str[len - i - 1];
627 		str[len - i - 1] = c;
628 	}
629 }
630 #endif
631 
632 void
633 printline(str_t *line, int sep, regmatch_t *pmatch)
634 {
635 	int n;
636 
637 	n = 0;
638 	if (!hflag) {
639 		fputs(line->file, stdout);
640 		++n;
641 	}
642 	if (nflag) {
643 		if (n)
644 			putchar(sep);
645 		printf("%lld", line->line_no);
646 		++n;
647 	}
648 	if (bflag) {
649 		if (n)
650 			putchar(sep);
651 		printf("%lld", (long long)line->off);
652 		++n;
653 	}
654 	if (n)
655 		putchar(sep);
656 	if (pmatch)
657 		fwrite(line->dat + pmatch->rm_so,
658 		    pmatch->rm_eo - pmatch->rm_so, 1, stdout);
659 	else
660 		fwrite(line->dat, line->len, 1, stdout);
661 	putchar('\n');
662 }
663