xref: /openbsd-src/usr.bin/grep/util.c (revision 0b7734b3d77bb9b21afec6f4621cae6c805dbd45)
1 /*	$OpenBSD: util.c,v 1.55 2016/04/04 05:49:47 otto Exp $	*/
2 
3 /*-
4  * Copyright (c) 1999 James Howard and Dag-Erling Co�dan Sm�rgrav
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/types.h>
30 #include <sys/stat.h>
31 
32 #include <ctype.h>
33 #include <err.h>
34 #include <errno.h>
35 #include <fts.h>
36 #include <regex.h>
37 #include <stdbool.h>
38 #include <stdio.h>
39 #include <stdlib.h>
40 #include <string.h>
41 #include <unistd.h>
42 #include <zlib.h>
43 
44 #include "grep.h"
45 
46 /*
47  * Process a file line by line...
48  */
49 
50 static int	linesqueued;
51 static int	procline(str_t *l, int);
52 static int	grep_search(fastgrep_t *, char *, size_t, regmatch_t *pmatch);
53 #ifndef SMALL
54 static bool	grep_cmp(const char *, const char *, size_t);
55 static void	grep_revstr(unsigned char *, int);
56 #endif
57 
58 int
59 grep_tree(char **argv)
60 {
61 	FTS	*fts;
62 	FTSENT	*p;
63 	int	c, fts_flags;
64 
65 	c = 0;
66 
67 	fts_flags = FTS_PHYSICAL | FTS_NOSTAT | FTS_NOCHDIR;
68 
69 	if (!(fts = fts_open(argv, fts_flags, NULL)))
70 		err(2, NULL);
71 	while ((p = fts_read(fts)) != NULL) {
72 		switch (p->fts_info) {
73 		case FTS_DNR:
74 			break;
75 		case FTS_ERR:
76 			file_err = 1;
77 			if(!sflag)
78 				warnc(p->fts_errno, "%s", p->fts_path);
79 			break;
80 		case FTS_DP:
81 			break;
82 		default:
83 			c += procfile(p->fts_path);
84 			break;
85 		}
86 	}
87 	if (errno)
88 		err(2, "fts_read");
89 	fts_close(fts);
90 	return c;
91 }
92 
93 int
94 procfile(char *fn)
95 {
96 	str_t ln;
97 	file_t *f;
98 	int c, t, z, nottext;
99 
100 	if (fn == NULL) {
101 		fn = "(standard input)";
102 		f = grep_fdopen(STDIN_FILENO, "r");
103 	} else {
104 		f = grep_open(fn, "r");
105 	}
106 	if (f == NULL) {
107 		file_err = 1;
108 		if (!sflag)
109 			warn("%s", fn);
110 		return 0;
111 	}
112 
113 	nottext = grep_bin_file(f);
114 	if (nottext && binbehave == BIN_FILE_SKIP) {
115 		grep_close(f);
116 		return 0;
117 	}
118 
119 	ln.file = fn;
120 	ln.line_no = 0;
121 	ln.len = 0;
122 	linesqueued = 0;
123 	tail = 0;
124 	ln.off = -1;
125 
126 	if (Bflag > 0)
127 		initqueue();
128 	for (c = 0;  c == 0 || !(lflag || qflag); ) {
129 		ln.off += ln.len + 1;
130 		if ((ln.dat = grep_fgetln(f, &ln.len)) == NULL)
131 			break;
132 		if (ln.len > 0 && ln.dat[ln.len - 1] == '\n')
133 			--ln.len;
134 		ln.line_no++;
135 
136 		z = tail;
137 
138 		if ((t = procline(&ln, nottext)) == 0 && Bflag > 0 && z == 0) {
139 			enqueue(&ln);
140 			linesqueued++;
141 		}
142 		c += t;
143 	}
144 	if (Bflag > 0)
145 		clearqueue();
146 	grep_close(f);
147 
148 	if (cflag) {
149 		if (!hflag)
150 			printf("%s:", ln.file);
151 		printf("%u\n", c);
152 	}
153 	if (lflag && c != 0)
154 		printf("%s\n", fn);
155 	if (Lflag && c == 0)
156 		printf("%s\n", fn);
157 	if (c && !cflag && !lflag && !Lflag &&
158 	    binbehave == BIN_FILE_BIN && nottext && !qflag)
159 		printf("Binary file %s matches\n", fn);
160 
161 	return c;
162 }
163 
164 
165 /*
166  * Process an individual line in a file. Return non-zero if it matches.
167  */
168 
169 #define isword(x) (isalnum((unsigned char)x) || (x) == '_')
170 
171 static int
172 procline(str_t *l, int nottext)
173 {
174 	regmatch_t	pmatch;
175 	int		c, i, r;
176 	regoff_t	offset;
177 
178 	/* size_t will be converted to regoff_t. ssize_t is guaranteed to fit
179 	 * into regoff_t */
180 	if (l->len > SSIZE_MAX) {
181 		errx(2, "Line is too big to process");
182 	}
183 
184 	c = 0;
185 	i = 0;
186 	if (matchall) {
187 		c = 1;
188 		goto print;
189 	}
190 
191 	for (i = 0; i < patterns; i++) {
192 		offset = 0;
193 redo:
194 		if (fg_pattern[i].pattern) {
195 			r = grep_search(&fg_pattern[i], l->dat + offset,
196 			    l->len - offset, &pmatch);
197 			pmatch.rm_so += offset;
198 			pmatch.rm_eo += offset;
199 		} else {
200 			pmatch.rm_so = offset;
201 			pmatch.rm_eo = l->len;
202 			r = regexec(&r_pattern[i], l->dat, 1, &pmatch, eflags);
203 		}
204 		if (r == 0 && xflag) {
205 			if (pmatch.rm_so != 0 || pmatch.rm_eo != l->len)
206 				r = REG_NOMATCH;
207 		}
208 		if (r == 0) {
209 			c = 1;
210 			if (oflag && pmatch.rm_so != pmatch.rm_eo)
211 				goto print;
212 			break;
213 		}
214 	}
215 	if (oflag)
216 		return c;
217 print:
218 	if (vflag)
219 		c = !c;
220 
221 	if (c && binbehave == BIN_FILE_BIN && nottext)
222 		return c; /* Binary file */
223 
224 	if ((tail > 0 || c) && !cflag && !qflag) {
225 		if (c) {
226 			if (first > 0 && tail == 0 && (Bflag < linesqueued) &&
227 			    (Aflag || Bflag))
228 				printf("--\n");
229 			first = 1;
230 			tail = Aflag;
231 			if (Bflag > 0)
232 				printqueue();
233 			linesqueued = 0;
234 			printline(l, ':', oflag ? &pmatch : NULL);
235 		} else {
236 			printline(l, '-', oflag ? &pmatch : NULL);
237 			tail--;
238 		}
239 	}
240 	if (oflag && !matchall) {
241 		offset = pmatch.rm_eo;
242 		goto redo;
243 	}
244 	return c;
245 }
246 
247 #ifndef SMALL
248 void
249 fgrepcomp(fastgrep_t *fg, const unsigned char *pattern)
250 {
251 	int i;
252 
253 	/* Initialize. */
254 	fg->patternLen = strlen(pattern);
255 	fg->bol = 0;
256 	fg->eol = 0;
257 	fg->wmatch = wflag;
258 	fg->reversedSearch = 0;
259 
260 	/*
261 	 * Make a copy and upper case it for later if in -i mode,
262 	 * else just copy the pointer.
263 	 */
264 	if (iflag) {
265 		fg->pattern = grep_malloc(fg->patternLen + 1);
266 		for (i = 0; i < fg->patternLen; i++)
267 			fg->pattern[i] = toupper(pattern[i]);
268 		fg->pattern[fg->patternLen] = '\0';
269 	} else
270 		fg->pattern = (unsigned char *)pattern;	/* really const */
271 
272 	/* Preprocess pattern. */
273 	for (i = 0; i <= UCHAR_MAX; i++)
274 		fg->qsBc[i] = fg->patternLen;
275 	for (i = 1; i < fg->patternLen; i++) {
276 		fg->qsBc[fg->pattern[i]] = fg->patternLen - i;
277 		/*
278 		 * If case is ignored, make the jump apply to both upper and
279 		 * lower cased characters.  As the pattern is stored in upper
280 		 * case, apply the same to the lower case equivalents.
281 		 */
282 		if (iflag)
283 			fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i;
284 	}
285 }
286 #endif
287 
288 /*
289  * Returns: -1 on failure, 0 on success
290  */
291 int
292 fastcomp(fastgrep_t *fg, const char *pattern)
293 {
294 #ifdef SMALL
295 	return -1;
296 #else
297 	int i;
298 	int bol = 0;
299 	int eol = 0;
300 	int shiftPatternLen;
301 	int hasDot = 0;
302 	int firstHalfDot = -1;
303 	int firstLastHalfDot = -1;
304 	int lastHalfDot = 0;
305 
306 	/* Initialize. */
307 	fg->patternLen = strlen(pattern);
308 	fg->bol = 0;
309 	fg->eol = 0;
310 	fg->wmatch = 0;
311 	fg->reversedSearch = 0;
312 
313 	/* Remove end-of-line character ('$'). */
314 	if (fg->patternLen > 0 && pattern[fg->patternLen - 1] == '$') {
315 		eol++;
316 		fg->eol = 1;
317 		fg->patternLen--;
318 	}
319 
320 	/* Remove beginning-of-line character ('^'). */
321 	if (pattern[0] == '^') {
322 		bol++;
323 		fg->bol = 1;
324 		fg->patternLen--;
325 	}
326 
327 	/* Remove enclosing [[:<:]] and [[:>:]] (word match). */
328 	if (wflag) {
329 		/* basic re's use \( \), extended re's ( ) */
330 		int extra = Eflag ? 1 : 2;
331 		fg->patternLen -= 14 + 2 * extra;
332 		fg->wmatch = 7 + extra;
333 	} else if (fg->patternLen >= 14 &&
334 	    strncmp(pattern + fg->bol, "[[:<:]]", 7) == 0 &&
335 	    strncmp(pattern + fg->bol + fg->patternLen - 7, "[[:>:]]", 7) == 0) {
336 		fg->patternLen -= 14;
337 		fg->wmatch = 7;
338 	}
339 
340 	/*
341 	 * Copy pattern minus '^' and '$' characters as well as word
342 	 * match character classes at the beginning and ending of the
343 	 * string respectively.
344 	 */
345 	fg->pattern = grep_malloc(fg->patternLen + 1);
346 	memcpy(fg->pattern, pattern + bol + fg->wmatch, fg->patternLen);
347 	fg->pattern[fg->patternLen] = '\0';
348 
349 	/* Look for ways to cheat...er...avoid the full regex engine. */
350 	for (i = 0; i < fg->patternLen; i++)
351 	{
352 		switch (fg->pattern[i]) {
353 		case '.':
354 			hasDot = i;
355 			if (i < fg->patternLen / 2) {
356 				if (firstHalfDot < 0)
357 					/* Closest dot to the beginning */
358 					firstHalfDot = i;
359 			} else {
360 				/* Closest dot to the end of the pattern. */
361 				lastHalfDot = i;
362 				if (firstLastHalfDot < 0)
363 					firstLastHalfDot = i;
364 			}
365 			break;
366 		case '(': case ')':
367 		case '{': case '}':
368 			/* Special in BRE if preceded by '\\' */
369 		case '?':
370 		case '+':
371 		case '|':
372 			/* Not special in BRE. */
373 			if (!Eflag)
374 				goto nonspecial;
375 		case '\\':
376 		case '*':
377 		case '[': case ']':
378 			/* Free memory and let others know this is empty. */
379 			free(fg->pattern);
380 			fg->pattern = NULL;
381 			return (-1);
382 		default:
383 nonspecial:
384 			if (iflag)
385 				fg->pattern[i] = toupper(fg->pattern[i]);
386 			break;
387 		}
388 	}
389 
390 	/*
391 	 * Determine if a reverse search would be faster based on the placement
392 	 * of the dots.
393 	 */
394 	if ((!(lflag || cflag || oflag)) && ((!(bol || eol)) &&
395 	    ((lastHalfDot) && ((firstHalfDot < 0) ||
396 	    ((fg->patternLen - (lastHalfDot + 1)) < firstHalfDot))))) {
397 		fg->reversedSearch = 1;
398 		hasDot = fg->patternLen - (firstHalfDot < 0 ?
399 		    firstLastHalfDot : firstHalfDot) - 1;
400 		grep_revstr(fg->pattern, fg->patternLen);
401 	}
402 
403 	/*
404 	 * Normal Quick Search would require a shift based on the position the
405 	 * next character after the comparison is within the pattern.  With
406 	 * wildcards, the position of the last dot effects the maximum shift
407 	 * distance.
408 	 * The closer to the end the wild card is the slower the search.  A
409 	 * reverse version of this algorithm would be useful for wildcards near
410 	 * the end of the string.
411 	 *
412 	 * Examples:
413 	 * Pattern	Max shift
414 	 * -------	---------
415 	 * this		5
416 	 * .his		4
417 	 * t.is		3
418 	 * th.s		2
419 	 * thi.		1
420 	 */
421 
422 	/* Adjust the shift based on location of the last dot ('.'). */
423 	shiftPatternLen = fg->patternLen - hasDot;
424 
425 	/* Preprocess pattern. */
426 	for (i = 0; i <= UCHAR_MAX; i++)
427 		fg->qsBc[i] = shiftPatternLen;
428 	for (i = hasDot + 1; i < fg->patternLen; i++) {
429 		fg->qsBc[fg->pattern[i]] = fg->patternLen - i;
430 		/*
431 		 * If case is ignored, make the jump apply to both upper and
432 		 * lower cased characters.  As the pattern is stored in upper
433 		 * case, apply the same to the lower case equivalents.
434 		 */
435 		if (iflag)
436 			fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i;
437 	}
438 
439 	/*
440 	 * Put pattern back to normal after pre-processing to allow for easy
441 	 * comparisons later.
442 	 */
443 	if (fg->reversedSearch)
444 		grep_revstr(fg->pattern, fg->patternLen);
445 
446 	return (0);
447 #endif
448 }
449 
450 /*
451  * Word boundaries using regular expressions are defined as the point
452  * of transition from a non-word char to a word char, or vice versa.
453  * This means that grep -w +a and grep -w a+ never match anything,
454  * because they lack a starting or ending transition, but grep -w a+b
455  * does match a line containing a+b.
456  */
457 #define wmatch(d, l, s, e)	\
458 	((s == 0 || !isword(d[s-1])) && (e == l || !isword(d[e])) && \
459 	  e > s && isword(d[s]) && isword(d[e-1]))
460 
461 static int
462 grep_search(fastgrep_t *fg, char *data, size_t dataLen, regmatch_t *pmatch)
463 {
464 #ifdef SMALL
465 	return 0;
466 #else
467 	regoff_t j;
468 	int rtrnVal = REG_NOMATCH;
469 
470 	pmatch->rm_so = -1;
471 	pmatch->rm_eo = -1;
472 
473 	/* No point in going farther if we do not have enough data. */
474 	if (dataLen < fg->patternLen)
475 		return (rtrnVal);
476 
477 	/* Only try once at the beginning or ending of the line. */
478 	if (fg->bol || fg->eol) {
479 		/* Simple text comparison. */
480 		/* Verify data is >= pattern length before searching on it. */
481 		if (dataLen >= fg->patternLen) {
482 			/* Determine where in data to start search at. */
483 			if (fg->eol)
484 				j = dataLen - fg->patternLen;
485 			else
486 				j = 0;
487 			if (!((fg->bol && fg->eol) && (dataLen != fg->patternLen)))
488 				if (grep_cmp(fg->pattern, data + j,
489 				    fg->patternLen)) {
490 					pmatch->rm_so = j;
491 					pmatch->rm_eo = j + fg->patternLen;
492 					if (!fg->wmatch || wmatch(data, dataLen,
493 					    pmatch->rm_so, pmatch->rm_eo))
494 						rtrnVal = 0;
495 				}
496 		}
497 	} else if (fg->reversedSearch) {
498 		/* Quick Search algorithm. */
499 		j = dataLen;
500 		do {
501 			if (grep_cmp(fg->pattern, data + j - fg->patternLen,
502 			    fg->patternLen)) {
503 				pmatch->rm_so = j - fg->patternLen;
504 				pmatch->rm_eo = j;
505 				if (!fg->wmatch || wmatch(data, dataLen,
506 				    pmatch->rm_so, pmatch->rm_eo)) {
507 					rtrnVal = 0;
508 					break;
509 				}
510 			}
511 			/* Shift if within bounds, otherwise, we are done. */
512 			if (j == fg->patternLen)
513 				break;
514 			j -= fg->qsBc[(unsigned char)data[j - fg->patternLen - 1]];
515 		} while (j >= fg->patternLen);
516 	} else {
517 		/* Quick Search algorithm. */
518 		j = 0;
519 		do {
520 			if (grep_cmp(fg->pattern, data + j, fg->patternLen)) {
521 				pmatch->rm_so = j;
522 				pmatch->rm_eo = j + fg->patternLen;
523 				if (fg->patternLen == 0 || !fg->wmatch ||
524 				    wmatch(data, dataLen, pmatch->rm_so,
525 				    pmatch->rm_eo)) {
526 					rtrnVal = 0;
527 					break;
528 				}
529 			}
530 
531 			/* Shift if within bounds, otherwise, we are done. */
532 			if (j + fg->patternLen == dataLen)
533 				break;
534 			else
535 				j += fg->qsBc[(unsigned char)data[j + fg->patternLen]];
536 		} while (j <= (dataLen - fg->patternLen));
537 	}
538 
539 	return (rtrnVal);
540 #endif
541 }
542 
543 
544 void *
545 grep_malloc(size_t size)
546 {
547 	void	*ptr;
548 
549 	if ((ptr = malloc(size)) == NULL)
550 		err(2, "malloc");
551 	return ptr;
552 }
553 
554 void *
555 grep_calloc(size_t nmemb, size_t size)
556 {
557 	void	*ptr;
558 
559 	if ((ptr = calloc(nmemb, size)) == NULL)
560 		err(2, "calloc");
561 	return ptr;
562 }
563 
564 void *
565 grep_realloc(void *ptr, size_t size)
566 {
567 	if ((ptr = realloc(ptr, size)) == NULL)
568 		err(2, "realloc");
569 	return ptr;
570 }
571 
572 void *
573 grep_reallocarray(void *ptr, size_t nmemb, size_t size)
574 {
575 	if ((ptr = reallocarray(ptr, nmemb, size)) == NULL)
576 		err(2, "reallocarray");
577 	return ptr;
578 }
579 
580 #ifndef SMALL
581 /*
582  * Returns:	true on success, false on failure
583  */
584 static bool
585 grep_cmp(const char *pattern, const char *data, size_t len)
586 {
587 	size_t i;
588 
589 	for (i = 0; i < len; i++) {
590 		if (((pattern[i] == data[i]) || (!Fflag && pattern[i] == '.'))
591 		    || (iflag && pattern[i] == toupper((unsigned char)data[i])))
592 			continue;
593 		return false;
594 	}
595 
596 	return true;
597 }
598 
599 static void
600 grep_revstr(unsigned char *str, int len)
601 {
602 	int i;
603 	char c;
604 
605 	for (i = 0; i < len / 2; i++) {
606 		c = str[i];
607 		str[i] = str[len - i - 1];
608 		str[len - i - 1] = c;
609 	}
610 }
611 #endif
612 
613 void
614 printline(str_t *line, int sep, regmatch_t *pmatch)
615 {
616 	int n;
617 
618 	n = 0;
619 	if (!hflag) {
620 		fputs(line->file, stdout);
621 		++n;
622 	}
623 	if (nflag) {
624 		if (n)
625 			putchar(sep);
626 		printf("%lld", line->line_no);
627 		++n;
628 	}
629 	if (bflag) {
630 		if (n)
631 			putchar(sep);
632 		printf("%lld", (long long)line->off);
633 		++n;
634 	}
635 	if (n)
636 		putchar(sep);
637 	if (pmatch)
638 		fwrite(line->dat + pmatch->rm_so,
639 		    pmatch->rm_eo - pmatch->rm_so, 1, stdout);
640 	else
641 		fwrite(line->dat, line->len, 1, stdout);
642 	putchar('\n');
643 }
644