xref: /openbsd-src/usr.bin/grep/util.c (revision 4c1e55dc91edd6e69ccc60ce855900fbc12cf34f)
1 /*	$OpenBSD: util.c,v 1.42 2011/07/17 19:39:21 aschrijver Exp $	*/
2 
3 /*-
4  * Copyright (c) 1999 James Howard and Dag-Erling Co�dan Sm�rgrav
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/types.h>
30 #include <sys/stat.h>
31 
32 #include <ctype.h>
33 #include <err.h>
34 #include <errno.h>
35 #include <fts.h>
36 #include <regex.h>
37 #include <stdio.h>
38 #include <stdlib.h>
39 #include <string.h>
40 #include <unistd.h>
41 #include <zlib.h>
42 
43 #include "grep.h"
44 
45 /*
46  * Process a file line by line...
47  */
48 
49 static int	linesqueued;
50 static int	procline(str_t *l, int);
51 static int	grep_search(fastgrep_t *, char *, size_t, regmatch_t *pmatch);
52 #ifndef SMALL
53 static int	grep_cmp(const char *, const char *, size_t);
54 static void	grep_revstr(unsigned char *, int);
55 #endif
56 
57 int
58 grep_tree(char **argv)
59 {
60 	FTS	*fts;
61 	FTSENT	*p;
62 	int	c, fts_flags;
63 
64 	c = 0;
65 
66 	fts_flags = FTS_PHYSICAL | FTS_NOSTAT | FTS_NOCHDIR;
67 
68 	if (!(fts = fts_open(argv, fts_flags, NULL)))
69 		err(2, NULL);
70 	while ((p = fts_read(fts)) != NULL) {
71 		switch (p->fts_info) {
72 		case FTS_DNR:
73 			break;
74 		case FTS_ERR:
75 			errx(2, "%s: %s", p->fts_path, strerror(p->fts_errno));
76 			break;
77 		case FTS_DP:
78 			break;
79 		default:
80 			c += procfile(p->fts_path);
81 			break;
82 		}
83 	}
84 	if (errno)
85 		err(2, "fts_read");
86 
87 	return c;
88 }
89 
90 int
91 procfile(char *fn)
92 {
93 	str_t ln;
94 	file_t *f;
95 	int c, t, z, nottext;
96 
97 	if (fn == NULL) {
98 		fn = "(standard input)";
99 		f = grep_fdopen(STDIN_FILENO, "r");
100 	} else {
101 		f = grep_open(fn, "r");
102 	}
103 	if (f == NULL) {
104 		if (!sflag)
105 			warn("%s", fn);
106 		return 0;
107 	}
108 
109 	nottext = grep_bin_file(f);
110 	if (nottext && binbehave == BIN_FILE_SKIP) {
111 		grep_close(f);
112 		return 0;
113 	}
114 
115 	ln.file = fn;
116 	ln.line_no = 0;
117 	ln.len = 0;
118 	linesqueued = 0;
119 	tail = 0;
120 	ln.off = -1;
121 
122 	if (Bflag > 0)
123 		initqueue();
124 	for (c = 0;  c == 0 || !(lflag || qflag); ) {
125 		ln.off += ln.len + 1;
126 		if ((ln.dat = grep_fgetln(f, &ln.len)) == NULL)
127 			break;
128 		if (ln.len > 0 && ln.dat[ln.len - 1] == '\n')
129 			--ln.len;
130 		ln.line_no++;
131 
132 		z = tail;
133 
134 		if ((t = procline(&ln, nottext)) == 0 && Bflag > 0 && z == 0) {
135 			enqueue(&ln);
136 			linesqueued++;
137 		}
138 		c += t;
139 	}
140 	if (Bflag > 0)
141 		clearqueue();
142 	grep_close(f);
143 
144 	if (cflag) {
145 		if (!hflag)
146 			printf("%s:", ln.file);
147 		printf("%u\n", c);
148 	}
149 	if (lflag && c != 0)
150 		printf("%s\n", fn);
151 	if (Lflag && c == 0)
152 		printf("%s\n", fn);
153 	if (c && !cflag && !lflag && !Lflag &&
154 	    binbehave == BIN_FILE_BIN && nottext && !qflag)
155 		printf("Binary file %s matches\n", fn);
156 
157 	return c;
158 }
159 
160 
161 /*
162  * Process an individual line in a file. Return non-zero if it matches.
163  */
164 
165 #define isword(x) (isalnum(x) || (x) == '_')
166 
167 static int
168 procline(str_t *l, int nottext)
169 {
170 	regmatch_t	pmatch;
171 	int		c, i, r;
172 	regoff_t	offset;
173 
174 	/* size_t will be converted to regoff_t. ssize_t is guaranteed to fit
175 	 * into regoff_t */
176 	if (l->len > SSIZE_MAX) {
177 		errx(2, "Line is too big to process");
178 	}
179 
180 	c = 0;
181 	i = 0;
182 	if (matchall) {
183 		c = 1;
184 		goto print;
185 	}
186 
187 	for (i = 0; i < patterns; i++) {
188 		offset = 0;
189 redo:
190 		if (fg_pattern[i].pattern) {
191 			r = grep_search(&fg_pattern[i], l->dat + offset,
192 			    l->len - offset, &pmatch);
193 			pmatch.rm_so += offset;
194 			pmatch.rm_eo += offset;
195 		} else {
196 			pmatch.rm_so = offset;
197 			pmatch.rm_eo = l->len - offset;
198 			r = regexec(&r_pattern[i], l->dat, 1, &pmatch, eflags);
199 		}
200 		if (r == 0 && xflag) {
201 			if (pmatch.rm_so != 0 || pmatch.rm_eo != l->len)
202 				r = REG_NOMATCH;
203 		}
204 		if (r == 0) {
205 			c = 1;
206 			if (oflag)
207 				goto print;
208 			break;
209 		}
210 	}
211 	if (oflag)
212 		return c;
213 print:
214 	if (vflag)
215 		c = !c;
216 
217 	if (c && binbehave == BIN_FILE_BIN && nottext)
218 		return c; /* Binary file */
219 
220 	if ((tail > 0 || c) && !cflag && !qflag) {
221 		if (c) {
222 			if (first > 0 && tail == 0 && (Bflag < linesqueued) &&
223 			    (Aflag || Bflag))
224 				printf("--\n");
225 			first = 1;
226 			tail = Aflag;
227 			if (Bflag > 0)
228 				printqueue();
229 			linesqueued = 0;
230 			printline(l, ':', oflag ? &pmatch : NULL);
231 		} else {
232 			printline(l, '-', oflag ? &pmatch : NULL);
233 			tail--;
234 		}
235 	}
236 	if (oflag && !matchall) {
237 		offset = pmatch.rm_eo;
238 		goto redo;
239 	}
240 	return c;
241 }
242 
243 #ifndef SMALL
244 void
245 fgrepcomp(fastgrep_t *fg, const char *pattern)
246 {
247 	int i;
248 
249 	/* Initialize. */
250 	fg->patternLen = strlen(pattern);
251 	fg->bol = 0;
252 	fg->eol = 0;
253 	fg->wmatch = wflag;
254 	fg->reversedSearch = 0;
255 
256 	/*
257 	 * Make a copy and upper case it for later if in -i mode,
258 	 * else just copy the pointer.
259 	 */
260 	if (iflag) {
261 		fg->pattern = grep_malloc(fg->patternLen + 1);
262 		for (i = 0; i < fg->patternLen; i++)
263 			fg->pattern[i] = toupper(pattern[i]);
264 		fg->pattern[fg->patternLen] = '\0';
265 	} else
266 		fg->pattern = (unsigned char *)pattern;	/* really const */
267 
268 	/* Preprocess pattern. */
269 	for (i = 0; i <= UCHAR_MAX; i++)
270 		fg->qsBc[i] = fg->patternLen;
271 	for (i = 1; i < fg->patternLen; i++) {
272 		fg->qsBc[fg->pattern[i]] = fg->patternLen - i;
273 		/*
274 		 * If case is ignored, make the jump apply to both upper and
275 		 * lower cased characters.  As the pattern is stored in upper
276 		 * case, apply the same to the lower case equivalents.
277 		 */
278 		if (iflag)
279 			fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i;
280 	}
281 }
282 #endif
283 
284 /*
285  * Returns: -1 on failure, 0 on success
286  */
287 int
288 fastcomp(fastgrep_t *fg, const char *pattern)
289 {
290 #ifdef SMALL
291 	return -1;
292 #else
293 	int i;
294 	int bol = 0;
295 	int eol = 0;
296 	int shiftPatternLen;
297 	int hasDot = 0;
298 	int firstHalfDot = -1;
299 	int firstLastHalfDot = -1;
300 	int lastHalfDot = 0;
301 
302 	/* Initialize. */
303 	fg->patternLen = strlen(pattern);
304 	fg->bol = 0;
305 	fg->eol = 0;
306 	fg->wmatch = 0;
307 	fg->reversedSearch = 0;
308 
309 	/* Remove end-of-line character ('$'). */
310 	if (fg->patternLen > 0 && pattern[fg->patternLen - 1] == '$') {
311 		eol++;
312 		fg->eol = 1;
313 		fg->patternLen--;
314 	}
315 
316 	/* Remove beginning-of-line character ('^'). */
317 	if (pattern[0] == '^') {
318 		bol++;
319 		fg->bol = 1;
320 		fg->patternLen--;
321 	}
322 
323 	/* Remove enclosing [[:<:]] and [[:>:]] (word match). */
324 	if (wflag) {
325 		/* basic re's use \( \), extended re's ( ) */
326 		int extra = Eflag ? 1 : 2;
327 		fg->patternLen -= 14 + 2 * extra;
328 		fg->wmatch = 7 + extra;
329 	} else if (fg->patternLen >= 14 &&
330 	    strncmp(pattern + fg->bol, "[[:<:]]", 7) == 0 &&
331 	    strncmp(pattern + fg->bol + fg->patternLen - 7, "[[:>:]]", 7) == 0) {
332 		fg->patternLen -= 14;
333 		fg->wmatch = 7;
334 	}
335 
336 	/*
337 	 * Copy pattern minus '^' and '$' characters as well as word
338 	 * match character classes at the beginning and ending of the
339 	 * string respectively.
340 	 */
341 	fg->pattern = grep_malloc(fg->patternLen + 1);
342 	memcpy(fg->pattern, pattern + bol + fg->wmatch, fg->patternLen);
343 	fg->pattern[fg->patternLen] = '\0';
344 
345 	/* Look for ways to cheat...er...avoid the full regex engine. */
346 	for (i = 0; i < fg->patternLen; i++)
347 	{
348 		/* Can still cheat? */
349 		if ((isalnum(fg->pattern[i])) || isspace(fg->pattern[i]) ||
350 		    (fg->pattern[i] == '_') || (fg->pattern[i] == ',') ||
351 		    (fg->pattern[i] == '=') || (fg->pattern[i] == '-') ||
352 		    (fg->pattern[i] == ':') || (fg->pattern[i] == '/')) {
353 			/* As long as it is good, upper case it for later. */
354 			if (iflag)
355 				fg->pattern[i] = toupper(fg->pattern[i]);
356 		} else if (fg->pattern[i] == '.') {
357 			hasDot = i;
358 			if (i < fg->patternLen / 2) {
359 				if (firstHalfDot < 0)
360 					/* Closest dot to the beginning */
361 					firstHalfDot = i;
362 			} else {
363 				/* Closest dot to the end of the pattern. */
364 				lastHalfDot = i;
365 				if (firstLastHalfDot < 0)
366 					firstLastHalfDot = i;
367 			}
368 		} else {
369 			/* Free memory and let others know this is empty. */
370 			free(fg->pattern);
371 			fg->pattern = NULL;
372 			return (-1);
373 		}
374 	}
375 
376 	/*
377 	 * Determine if a reverse search would be faster based on the placement
378 	 * of the dots.
379 	 */
380 	if ((!(lflag || cflag)) && ((!(bol || eol)) &&
381 	    ((lastHalfDot) && ((firstHalfDot < 0) ||
382 	    ((fg->patternLen - (lastHalfDot + 1)) < firstHalfDot))))) {
383 		fg->reversedSearch = 1;
384 		hasDot = fg->patternLen - (firstHalfDot < 0 ?
385 		    firstLastHalfDot : firstHalfDot) - 1;
386 		grep_revstr(fg->pattern, fg->patternLen);
387 	}
388 
389 	/*
390 	 * Normal Quick Search would require a shift based on the position the
391 	 * next character after the comparison is within the pattern.  With
392 	 * wildcards, the position of the last dot effects the maximum shift
393 	 * distance.
394 	 * The closer to the end the wild card is the slower the search.  A
395 	 * reverse version of this algorithm would be useful for wildcards near
396 	 * the end of the string.
397 	 *
398 	 * Examples:
399 	 * Pattern	Max shift
400 	 * -------	---------
401 	 * this		5
402 	 * .his		4
403 	 * t.is		3
404 	 * th.s		2
405 	 * thi.		1
406 	 */
407 
408 	/* Adjust the shift based on location of the last dot ('.'). */
409 	shiftPatternLen = fg->patternLen - hasDot;
410 
411 	/* Preprocess pattern. */
412 	for (i = 0; i <= UCHAR_MAX; i++)
413 		fg->qsBc[i] = shiftPatternLen;
414 	for (i = hasDot + 1; i < fg->patternLen; i++) {
415 		fg->qsBc[fg->pattern[i]] = fg->patternLen - i;
416 		/*
417 		 * If case is ignored, make the jump apply to both upper and
418 		 * lower cased characters.  As the pattern is stored in upper
419 		 * case, apply the same to the lower case equivalents.
420 		 */
421 		if (iflag)
422 			fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i;
423 	}
424 
425 	/*
426 	 * Put pattern back to normal after pre-processing to allow for easy
427 	 * comparisons later.
428 	 */
429 	if (fg->reversedSearch)
430 		grep_revstr(fg->pattern, fg->patternLen);
431 
432 	return (0);
433 #endif
434 }
435 
436 /*
437  * Word boundaries using regular expressions are defined as the point
438  * of transition from a non-word char to a word char, or vice versa.
439  * This means that grep -w +a and grep -w a+ never match anything,
440  * because they lack a starting or ending transition, but grep -w a+b
441  * does match a line containing a+b.
442  */
443 #define wmatch(d, l, s, e)	\
444 	((s == 0 || !isword(d[s-1])) && (e == l || !isword(d[e])) && \
445 	  e > s && isword(d[s]) && isword(d[e-1]))
446 
447 static int
448 grep_search(fastgrep_t *fg, char *data, size_t dataLen, regmatch_t *pmatch)
449 {
450 #ifdef SMALL
451 	return 0;
452 #else
453 	regoff_t j;
454 	int rtrnVal = REG_NOMATCH;
455 
456 	pmatch->rm_so = -1;
457 	pmatch->rm_eo = -1;
458 
459 	/* No point in going farther if we do not have enough data. */
460 	if (dataLen < fg->patternLen)
461 		return (rtrnVal);
462 
463 	/* Only try once at the beginning or ending of the line. */
464 	if (fg->bol || fg->eol) {
465 		/* Simple text comparison. */
466 		/* Verify data is >= pattern length before searching on it. */
467 		if (dataLen >= fg->patternLen) {
468 			/* Determine where in data to start search at. */
469 			if (fg->eol)
470 				j = dataLen - fg->patternLen;
471 			else
472 				j = 0;
473 			if (!((fg->bol && fg->eol) && (dataLen != fg->patternLen)))
474 				if (grep_cmp(fg->pattern, data + j,
475 				    fg->patternLen) == -1) {
476 					pmatch->rm_so = j;
477 					pmatch->rm_eo = j + fg->patternLen;
478 					if (!fg->wmatch || wmatch(data, dataLen,
479 					    pmatch->rm_so, pmatch->rm_eo))
480 						rtrnVal = 0;
481 				}
482 		}
483 	} else if (fg->reversedSearch) {
484 		/* Quick Search algorithm. */
485 		j = dataLen;
486 		do {
487 			if (grep_cmp(fg->pattern, data + j - fg->patternLen,
488 			    fg->patternLen) == -1) {
489 				pmatch->rm_so = j - fg->patternLen;
490 				pmatch->rm_eo = j;
491 				if (!fg->wmatch || wmatch(data, dataLen,
492 				    pmatch->rm_so, pmatch->rm_eo)) {
493 					rtrnVal = 0;
494 					break;
495 				}
496 			}
497 			/* Shift if within bounds, otherwise, we are done. */
498 			if (j == fg->patternLen)
499 				break;
500 			j -= fg->qsBc[(unsigned char)data[j - fg->patternLen - 1]];
501 		} while (j >= fg->patternLen);
502 	} else {
503 		/* Quick Search algorithm. */
504 		j = 0;
505 		do {
506 			if (grep_cmp(fg->pattern, data + j, fg->patternLen) == -1) {
507 				pmatch->rm_so = j;
508 				pmatch->rm_eo = j + fg->patternLen;
509 				if (fg->patternLen == 0 || !fg->wmatch ||
510 				    wmatch(data, dataLen, pmatch->rm_so,
511 				    pmatch->rm_eo)) {
512 					rtrnVal = 0;
513 					break;
514 				}
515 			}
516 
517 			/* Shift if within bounds, otherwise, we are done. */
518 			if (j + fg->patternLen == dataLen)
519 				break;
520 			else
521 				j += fg->qsBc[(unsigned char)data[j + fg->patternLen]];
522 		} while (j <= (dataLen - fg->patternLen));
523 	}
524 
525 	return (rtrnVal);
526 #endif
527 }
528 
529 
530 void *
531 grep_malloc(size_t size)
532 {
533 	void	*ptr;
534 
535 	if ((ptr = malloc(size)) == NULL)
536 		err(2, "malloc");
537 	return ptr;
538 }
539 
540 void *
541 grep_calloc(size_t nmemb, size_t size)
542 {
543 	void	*ptr;
544 
545 	if ((ptr = calloc(nmemb, size)) == NULL)
546 		err(2, "calloc");
547 	return ptr;
548 }
549 
550 void *
551 grep_realloc(void *ptr, size_t size)
552 {
553 	if ((ptr = realloc(ptr, size)) == NULL)
554 		err(2, "realloc");
555 	return ptr;
556 }
557 
558 #ifndef SMALL
559 /*
560  * Returns:	i >= 0 on failure (position that it failed)
561  *		-1 on success
562  */
563 static int
564 grep_cmp(const char *pattern, const char *data, size_t len)
565 {
566 	int i;
567 
568 	for (i = 0; i < len; i++) {
569 		if (((pattern[i] == data[i]) || (!Fflag && pattern[i] == '.'))
570 		    || (iflag && pattern[i] == toupper(data[i])))
571 			continue;
572 		return (i);
573 	}
574 
575 	return (-1);
576 }
577 
578 static void
579 grep_revstr(unsigned char *str, int len)
580 {
581 	int i;
582 	char c;
583 
584 	for (i = 0; i < len / 2; i++) {
585 		c = str[i];
586 		str[i] = str[len - i - 1];
587 		str[len - i - 1] = c;
588 	}
589 }
590 #endif
591 
592 void
593 printline(str_t *line, int sep, regmatch_t *pmatch)
594 {
595 	int n;
596 
597 	n = 0;
598 	if (!hflag) {
599 		fputs(line->file, stdout);
600 		++n;
601 	}
602 	if (nflag) {
603 		if (n)
604 			putchar(sep);
605 		printf("%d", line->line_no);
606 		++n;
607 	}
608 	if (bflag) {
609 		if (n)
610 			putchar(sep);
611 		printf("%lld", (long long)line->off);
612 		++n;
613 	}
614 	if (n)
615 		putchar(sep);
616 	if (pmatch)
617 		fwrite(line->dat + pmatch->rm_so,
618 		    pmatch->rm_eo - pmatch->rm_so, 1, stdout);
619 	else
620 		fwrite(line->dat, line->len, 1, stdout);
621 	putchar('\n');
622 }
623