xref: /openbsd-src/usr.bin/grep/util.c (revision e5157e49389faebcb42b7237d55fbf096d9c2523)
1 /*	$OpenBSD: util.c,v 1.48 2014/05/20 01:25:23 guenther Exp $	*/
2 
3 /*-
4  * Copyright (c) 1999 James Howard and Dag-Erling Co�dan Sm�rgrav
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/types.h>
30 #include <sys/stat.h>
31 
32 #include <ctype.h>
33 #include <err.h>
34 #include <errno.h>
35 #include <fts.h>
36 #include <regex.h>
37 #include <stdio.h>
38 #include <stdlib.h>
39 #include <string.h>
40 #include <unistd.h>
41 #include <zlib.h>
42 
43 #include "grep.h"
44 
45 /*
46  * Process a file line by line...
47  */
48 
49 static int	linesqueued;
50 static int	procline(str_t *l, int);
51 static int	grep_search(fastgrep_t *, char *, size_t, regmatch_t *pmatch);
52 #ifndef SMALL
53 static int	grep_cmp(const char *, const char *, size_t);
54 static void	grep_revstr(unsigned char *, int);
55 #endif
56 
57 int
58 grep_tree(char **argv)
59 {
60 	FTS	*fts;
61 	FTSENT	*p;
62 	int	c, fts_flags;
63 
64 	c = 0;
65 
66 	fts_flags = FTS_PHYSICAL | FTS_NOSTAT | FTS_NOCHDIR;
67 
68 	if (!(fts = fts_open(argv, fts_flags, NULL)))
69 		err(2, NULL);
70 	while ((p = fts_read(fts)) != NULL) {
71 		switch (p->fts_info) {
72 		case FTS_DNR:
73 			break;
74 		case FTS_ERR:
75 			file_err = 1;
76 			if(!sflag)
77 				warnc(p->fts_errno, "%s", p->fts_path);
78 			break;
79 		case FTS_DP:
80 			break;
81 		default:
82 			c += procfile(p->fts_path);
83 			break;
84 		}
85 	}
86 	if (errno)
87 		err(2, "fts_read");
88 
89 	return c;
90 }
91 
92 int
93 procfile(char *fn)
94 {
95 	str_t ln;
96 	file_t *f;
97 	int c, t, z, nottext;
98 
99 	if (fn == NULL) {
100 		fn = "(standard input)";
101 		f = grep_fdopen(STDIN_FILENO, "r");
102 	} else {
103 		f = grep_open(fn, "r");
104 	}
105 	if (f == NULL) {
106 		file_err = 1;
107 		if (!sflag)
108 			warn("%s", fn);
109 		return 0;
110 	}
111 
112 	nottext = grep_bin_file(f);
113 	if (nottext && binbehave == BIN_FILE_SKIP) {
114 		grep_close(f);
115 		return 0;
116 	}
117 
118 	ln.file = fn;
119 	ln.line_no = 0;
120 	ln.len = 0;
121 	linesqueued = 0;
122 	tail = 0;
123 	ln.off = -1;
124 
125 	if (Bflag > 0)
126 		initqueue();
127 	for (c = 0;  c == 0 || !(lflag || qflag); ) {
128 		ln.off += ln.len + 1;
129 		if ((ln.dat = grep_fgetln(f, &ln.len)) == NULL)
130 			break;
131 		if (ln.len > 0 && ln.dat[ln.len - 1] == '\n')
132 			--ln.len;
133 		ln.line_no++;
134 
135 		z = tail;
136 
137 		if ((t = procline(&ln, nottext)) == 0 && Bflag > 0 && z == 0) {
138 			enqueue(&ln);
139 			linesqueued++;
140 		}
141 		c += t;
142 	}
143 	if (Bflag > 0)
144 		clearqueue();
145 	grep_close(f);
146 
147 	if (cflag) {
148 		if (!hflag)
149 			printf("%s:", ln.file);
150 		printf("%u\n", c);
151 	}
152 	if (lflag && c != 0)
153 		printf("%s\n", fn);
154 	if (Lflag && c == 0)
155 		printf("%s\n", fn);
156 	if (c && !cflag && !lflag && !Lflag &&
157 	    binbehave == BIN_FILE_BIN && nottext && !qflag)
158 		printf("Binary file %s matches\n", fn);
159 
160 	return c;
161 }
162 
163 
164 /*
165  * Process an individual line in a file. Return non-zero if it matches.
166  */
167 
168 #define isword(x) (isalnum((unsigned char)x) || (x) == '_')
169 
170 static int
171 procline(str_t *l, int nottext)
172 {
173 	regmatch_t	pmatch;
174 	int		c, i, r;
175 	regoff_t	offset;
176 
177 	/* size_t will be converted to regoff_t. ssize_t is guaranteed to fit
178 	 * into regoff_t */
179 	if (l->len > SSIZE_MAX) {
180 		errx(2, "Line is too big to process");
181 	}
182 
183 	c = 0;
184 	i = 0;
185 	if (matchall) {
186 		c = 1;
187 		goto print;
188 	}
189 
190 	for (i = 0; i < patterns; i++) {
191 		offset = 0;
192 redo:
193 		if (fg_pattern[i].pattern) {
194 			r = grep_search(&fg_pattern[i], l->dat + offset,
195 			    l->len - offset, &pmatch);
196 			pmatch.rm_so += offset;
197 			pmatch.rm_eo += offset;
198 		} else {
199 			pmatch.rm_so = offset;
200 			pmatch.rm_eo = l->len;
201 			r = regexec(&r_pattern[i], l->dat, 1, &pmatch, eflags);
202 		}
203 		if (r == 0 && xflag) {
204 			if (pmatch.rm_so != 0 || pmatch.rm_eo != l->len)
205 				r = REG_NOMATCH;
206 		}
207 		if (r == 0) {
208 			c = 1;
209 			if (oflag && pmatch.rm_so != pmatch.rm_eo)
210 				goto print;
211 			break;
212 		}
213 	}
214 	if (oflag)
215 		return c;
216 print:
217 	if (vflag)
218 		c = !c;
219 
220 	if (c && binbehave == BIN_FILE_BIN && nottext)
221 		return c; /* Binary file */
222 
223 	if ((tail > 0 || c) && !cflag && !qflag) {
224 		if (c) {
225 			if (first > 0 && tail == 0 && (Bflag < linesqueued) &&
226 			    (Aflag || Bflag))
227 				printf("--\n");
228 			first = 1;
229 			tail = Aflag;
230 			if (Bflag > 0)
231 				printqueue();
232 			linesqueued = 0;
233 			printline(l, ':', oflag ? &pmatch : NULL);
234 		} else {
235 			printline(l, '-', oflag ? &pmatch : NULL);
236 			tail--;
237 		}
238 	}
239 	if (oflag && !matchall) {
240 		offset = pmatch.rm_eo;
241 		goto redo;
242 	}
243 	return c;
244 }
245 
246 #ifndef SMALL
247 void
248 fgrepcomp(fastgrep_t *fg, const unsigned char *pattern)
249 {
250 	int i;
251 
252 	/* Initialize. */
253 	fg->patternLen = strlen(pattern);
254 	fg->bol = 0;
255 	fg->eol = 0;
256 	fg->wmatch = wflag;
257 	fg->reversedSearch = 0;
258 
259 	/*
260 	 * Make a copy and upper case it for later if in -i mode,
261 	 * else just copy the pointer.
262 	 */
263 	if (iflag) {
264 		fg->pattern = grep_malloc(fg->patternLen + 1);
265 		for (i = 0; i < fg->patternLen; i++)
266 			fg->pattern[i] = toupper(pattern[i]);
267 		fg->pattern[fg->patternLen] = '\0';
268 	} else
269 		fg->pattern = (unsigned char *)pattern;	/* really const */
270 
271 	/* Preprocess pattern. */
272 	for (i = 0; i <= UCHAR_MAX; i++)
273 		fg->qsBc[i] = fg->patternLen;
274 	for (i = 1; i < fg->patternLen; i++) {
275 		fg->qsBc[fg->pattern[i]] = fg->patternLen - i;
276 		/*
277 		 * If case is ignored, make the jump apply to both upper and
278 		 * lower cased characters.  As the pattern is stored in upper
279 		 * case, apply the same to the lower case equivalents.
280 		 */
281 		if (iflag)
282 			fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i;
283 	}
284 }
285 #endif
286 
287 /*
288  * Returns: -1 on failure, 0 on success
289  */
290 int
291 fastcomp(fastgrep_t *fg, const char *pattern)
292 {
293 #ifdef SMALL
294 	return -1;
295 #else
296 	int i;
297 	int bol = 0;
298 	int eol = 0;
299 	int shiftPatternLen;
300 	int hasDot = 0;
301 	int firstHalfDot = -1;
302 	int firstLastHalfDot = -1;
303 	int lastHalfDot = 0;
304 
305 	/* Initialize. */
306 	fg->patternLen = strlen(pattern);
307 	fg->bol = 0;
308 	fg->eol = 0;
309 	fg->wmatch = 0;
310 	fg->reversedSearch = 0;
311 
312 	/* Remove end-of-line character ('$'). */
313 	if (fg->patternLen > 0 && pattern[fg->patternLen - 1] == '$') {
314 		eol++;
315 		fg->eol = 1;
316 		fg->patternLen--;
317 	}
318 
319 	/* Remove beginning-of-line character ('^'). */
320 	if (pattern[0] == '^') {
321 		bol++;
322 		fg->bol = 1;
323 		fg->patternLen--;
324 	}
325 
326 	/* Remove enclosing [[:<:]] and [[:>:]] (word match). */
327 	if (wflag) {
328 		/* basic re's use \( \), extended re's ( ) */
329 		int extra = Eflag ? 1 : 2;
330 		fg->patternLen -= 14 + 2 * extra;
331 		fg->wmatch = 7 + extra;
332 	} else if (fg->patternLen >= 14 &&
333 	    strncmp(pattern + fg->bol, "[[:<:]]", 7) == 0 &&
334 	    strncmp(pattern + fg->bol + fg->patternLen - 7, "[[:>:]]", 7) == 0) {
335 		fg->patternLen -= 14;
336 		fg->wmatch = 7;
337 	}
338 
339 	/*
340 	 * Copy pattern minus '^' and '$' characters as well as word
341 	 * match character classes at the beginning and ending of the
342 	 * string respectively.
343 	 */
344 	fg->pattern = grep_malloc(fg->patternLen + 1);
345 	memcpy(fg->pattern, pattern + bol + fg->wmatch, fg->patternLen);
346 	fg->pattern[fg->patternLen] = '\0';
347 
348 	/* Look for ways to cheat...er...avoid the full regex engine. */
349 	for (i = 0; i < fg->patternLen; i++)
350 	{
351 		switch (fg->pattern[i]) {
352 		case '.':
353 			hasDot = i;
354 			if (i < fg->patternLen / 2) {
355 				if (firstHalfDot < 0)
356 					/* Closest dot to the beginning */
357 					firstHalfDot = i;
358 			} else {
359 				/* Closest dot to the end of the pattern. */
360 				lastHalfDot = i;
361 				if (firstLastHalfDot < 0)
362 					firstLastHalfDot = i;
363 			}
364 			break;
365 		case '(': case ')':
366 		case '{': case '}':
367 			/* Special in BRE if preceded by '\\' */
368 		case '?':
369 		case '+':
370 		case '|':
371 			/* Not special in BRE. */
372 			if (!Eflag)
373 				goto nonspecial;
374 		case '\\':
375 		case '*':
376 		case '[': case ']':
377 			/* Free memory and let others know this is empty. */
378 			free(fg->pattern);
379 			fg->pattern = NULL;
380 			return (-1);
381 		default:
382 nonspecial:
383 			if (iflag)
384 				fg->pattern[i] = toupper(fg->pattern[i]);
385 			break;
386 		}
387 	}
388 
389 	/*
390 	 * Determine if a reverse search would be faster based on the placement
391 	 * of the dots.
392 	 */
393 	if ((!(lflag || cflag)) && ((!(bol || eol)) &&
394 	    ((lastHalfDot) && ((firstHalfDot < 0) ||
395 	    ((fg->patternLen - (lastHalfDot + 1)) < firstHalfDot))))) {
396 		fg->reversedSearch = 1;
397 		hasDot = fg->patternLen - (firstHalfDot < 0 ?
398 		    firstLastHalfDot : firstHalfDot) - 1;
399 		grep_revstr(fg->pattern, fg->patternLen);
400 	}
401 
402 	/*
403 	 * Normal Quick Search would require a shift based on the position the
404 	 * next character after the comparison is within the pattern.  With
405 	 * wildcards, the position of the last dot effects the maximum shift
406 	 * distance.
407 	 * The closer to the end the wild card is the slower the search.  A
408 	 * reverse version of this algorithm would be useful for wildcards near
409 	 * the end of the string.
410 	 *
411 	 * Examples:
412 	 * Pattern	Max shift
413 	 * -------	---------
414 	 * this		5
415 	 * .his		4
416 	 * t.is		3
417 	 * th.s		2
418 	 * thi.		1
419 	 */
420 
421 	/* Adjust the shift based on location of the last dot ('.'). */
422 	shiftPatternLen = fg->patternLen - hasDot;
423 
424 	/* Preprocess pattern. */
425 	for (i = 0; i <= UCHAR_MAX; i++)
426 		fg->qsBc[i] = shiftPatternLen;
427 	for (i = hasDot + 1; i < fg->patternLen; i++) {
428 		fg->qsBc[fg->pattern[i]] = fg->patternLen - i;
429 		/*
430 		 * If case is ignored, make the jump apply to both upper and
431 		 * lower cased characters.  As the pattern is stored in upper
432 		 * case, apply the same to the lower case equivalents.
433 		 */
434 		if (iflag)
435 			fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i;
436 	}
437 
438 	/*
439 	 * Put pattern back to normal after pre-processing to allow for easy
440 	 * comparisons later.
441 	 */
442 	if (fg->reversedSearch)
443 		grep_revstr(fg->pattern, fg->patternLen);
444 
445 	return (0);
446 #endif
447 }
448 
449 /*
450  * Word boundaries using regular expressions are defined as the point
451  * of transition from a non-word char to a word char, or vice versa.
452  * This means that grep -w +a and grep -w a+ never match anything,
453  * because they lack a starting or ending transition, but grep -w a+b
454  * does match a line containing a+b.
455  */
456 #define wmatch(d, l, s, e)	\
457 	((s == 0 || !isword(d[s-1])) && (e == l || !isword(d[e])) && \
458 	  e > s && isword(d[s]) && isword(d[e-1]))
459 
460 static int
461 grep_search(fastgrep_t *fg, char *data, size_t dataLen, regmatch_t *pmatch)
462 {
463 #ifdef SMALL
464 	return 0;
465 #else
466 	regoff_t j;
467 	int rtrnVal = REG_NOMATCH;
468 
469 	pmatch->rm_so = -1;
470 	pmatch->rm_eo = -1;
471 
472 	/* No point in going farther if we do not have enough data. */
473 	if (dataLen < fg->patternLen)
474 		return (rtrnVal);
475 
476 	/* Only try once at the beginning or ending of the line. */
477 	if (fg->bol || fg->eol) {
478 		/* Simple text comparison. */
479 		/* Verify data is >= pattern length before searching on it. */
480 		if (dataLen >= fg->patternLen) {
481 			/* Determine where in data to start search at. */
482 			if (fg->eol)
483 				j = dataLen - fg->patternLen;
484 			else
485 				j = 0;
486 			if (!((fg->bol && fg->eol) && (dataLen != fg->patternLen)))
487 				if (grep_cmp(fg->pattern, data + j,
488 				    fg->patternLen) == -1) {
489 					pmatch->rm_so = j;
490 					pmatch->rm_eo = j + fg->patternLen;
491 					if (!fg->wmatch || wmatch(data, dataLen,
492 					    pmatch->rm_so, pmatch->rm_eo))
493 						rtrnVal = 0;
494 				}
495 		}
496 	} else if (fg->reversedSearch) {
497 		/* Quick Search algorithm. */
498 		j = dataLen;
499 		do {
500 			if (grep_cmp(fg->pattern, data + j - fg->patternLen,
501 			    fg->patternLen) == -1) {
502 				pmatch->rm_so = j - fg->patternLen;
503 				pmatch->rm_eo = j;
504 				if (!fg->wmatch || wmatch(data, dataLen,
505 				    pmatch->rm_so, pmatch->rm_eo)) {
506 					rtrnVal = 0;
507 					break;
508 				}
509 			}
510 			/* Shift if within bounds, otherwise, we are done. */
511 			if (j == fg->patternLen)
512 				break;
513 			j -= fg->qsBc[(unsigned char)data[j - fg->patternLen - 1]];
514 		} while (j >= fg->patternLen);
515 	} else {
516 		/* Quick Search algorithm. */
517 		j = 0;
518 		do {
519 			if (grep_cmp(fg->pattern, data + j, fg->patternLen) == -1) {
520 				pmatch->rm_so = j;
521 				pmatch->rm_eo = j + fg->patternLen;
522 				if (fg->patternLen == 0 || !fg->wmatch ||
523 				    wmatch(data, dataLen, pmatch->rm_so,
524 				    pmatch->rm_eo)) {
525 					rtrnVal = 0;
526 					break;
527 				}
528 			}
529 
530 			/* Shift if within bounds, otherwise, we are done. */
531 			if (j + fg->patternLen == dataLen)
532 				break;
533 			else
534 				j += fg->qsBc[(unsigned char)data[j + fg->patternLen]];
535 		} while (j <= (dataLen - fg->patternLen));
536 	}
537 
538 	return (rtrnVal);
539 #endif
540 }
541 
542 
543 void *
544 grep_malloc(size_t size)
545 {
546 	void	*ptr;
547 
548 	if ((ptr = malloc(size)) == NULL)
549 		err(2, "malloc");
550 	return ptr;
551 }
552 
553 void *
554 grep_calloc(size_t nmemb, size_t size)
555 {
556 	void	*ptr;
557 
558 	if ((ptr = calloc(nmemb, size)) == NULL)
559 		err(2, "calloc");
560 	return ptr;
561 }
562 
563 void *
564 grep_realloc(void *ptr, size_t size)
565 {
566 	if ((ptr = realloc(ptr, size)) == NULL)
567 		err(2, "realloc");
568 	return ptr;
569 }
570 
571 #ifndef SMALL
572 /*
573  * Returns:	i >= 0 on failure (position that it failed)
574  *		-1 on success
575  */
576 static int
577 grep_cmp(const char *pattern, const char *data, size_t len)
578 {
579 	int i;
580 
581 	for (i = 0; i < len; i++) {
582 		if (((pattern[i] == data[i]) || (!Fflag && pattern[i] == '.'))
583 		    || (iflag && pattern[i] == toupper(data[i])))
584 			continue;
585 		return (i);
586 	}
587 
588 	return (-1);
589 }
590 
591 static void
592 grep_revstr(unsigned char *str, int len)
593 {
594 	int i;
595 	char c;
596 
597 	for (i = 0; i < len / 2; i++) {
598 		c = str[i];
599 		str[i] = str[len - i - 1];
600 		str[len - i - 1] = c;
601 	}
602 }
603 #endif
604 
605 void
606 printline(str_t *line, int sep, regmatch_t *pmatch)
607 {
608 	int n;
609 
610 	n = 0;
611 	if (!hflag) {
612 		fputs(line->file, stdout);
613 		++n;
614 	}
615 	if (nflag) {
616 		if (n)
617 			putchar(sep);
618 		printf("%d", line->line_no);
619 		++n;
620 	}
621 	if (bflag) {
622 		if (n)
623 			putchar(sep);
624 		printf("%lld", (long long)line->off);
625 		++n;
626 	}
627 	if (n)
628 		putchar(sep);
629 	if (pmatch)
630 		fwrite(line->dat + pmatch->rm_so,
631 		    pmatch->rm_eo - pmatch->rm_so, 1, stdout);
632 	else
633 		fwrite(line->dat, line->len, 1, stdout);
634 	putchar('\n');
635 }
636