xref: /openbsd-src/usr.bin/grep/util.c (revision a28daedfc357b214be5c701aa8ba8adb29a7f1c2)
1 /*	$OpenBSD: util.c,v 1.36 2007/10/02 17:59:18 otto Exp $	*/
2 
3 /*-
4  * Copyright (c) 1999 James Howard and Dag-Erling Co�dan Sm�rgrav
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/types.h>
30 #include <sys/stat.h>
31 
32 #include <ctype.h>
33 #include <err.h>
34 #include <errno.h>
35 #include <fts.h>
36 #include <regex.h>
37 #include <stdio.h>
38 #include <stdlib.h>
39 #include <string.h>
40 #include <unistd.h>
41 #include <zlib.h>
42 
43 #include "grep.h"
44 
45 /*
46  * Process a file line by line...
47  */
48 
49 static int	linesqueued;
50 static int	procline(str_t *l, int);
51 static int	grep_search(fastgrep_t *, unsigned char *, size_t, regmatch_t *pmatch);
52 static int	grep_cmp(const unsigned char *, const unsigned char *, size_t);
53 static void	grep_revstr(unsigned char *, int);
54 
55 int
56 grep_tree(char **argv)
57 {
58 	FTS	*fts;
59 	FTSENT	*p;
60 	int	c, fts_flags;
61 
62 	c = fts_flags = 0;
63 
64 	if (Hflag)
65 		fts_flags = FTS_COMFOLLOW;
66 	if (Pflag)
67 		fts_flags = FTS_PHYSICAL;
68 	if (Sflag)
69 		fts_flags = FTS_LOGICAL;
70 
71 	fts_flags |= FTS_NOSTAT | FTS_NOCHDIR;
72 
73 	if (!(fts = fts_open(argv, fts_flags, NULL)))
74 		err(2, NULL);
75 	while ((p = fts_read(fts)) != NULL) {
76 		switch (p->fts_info) {
77 		case FTS_DNR:
78 			break;
79 		case FTS_ERR:
80 			errx(2, "%s: %s", p->fts_path, strerror(p->fts_errno));
81 			break;
82 		case FTS_DP:
83 			break;
84 		default:
85 			c += procfile(p->fts_path);
86 			break;
87 		}
88 	}
89 	if (errno)
90 		err(2, "fts_read");
91 
92 	return c;
93 }
94 
95 int
96 procfile(char *fn)
97 {
98 	str_t ln;
99 	file_t *f;
100 	int c, t, z, nottext;
101 
102 	if (fn == NULL) {
103 		fn = "(standard input)";
104 		f = grep_fdopen(STDIN_FILENO, "r");
105 	} else {
106 		f = grep_open(fn, "r");
107 	}
108 	if (f == NULL) {
109 		if (!sflag)
110 			warn("%s", fn);
111 		return 0;
112 	}
113 
114 	nottext = grep_bin_file(f);
115 	if (nottext && binbehave == BIN_FILE_SKIP) {
116 		grep_close(f);
117 		return 0;
118 	}
119 
120 	ln.file = fn;
121 	ln.line_no = 0;
122 	ln.len = 0;
123 	linesqueued = 0;
124 	tail = 0;
125 	ln.off = -1;
126 
127 	if (Bflag > 0)
128 		initqueue();
129 	for (c = 0;  c == 0 || !(lflag || qflag); ) {
130 		ln.off += ln.len + 1;
131 		if ((ln.dat = grep_fgetln(f, &ln.len)) == NULL)
132 			break;
133 		if (ln.len > 0 && ln.dat[ln.len - 1] == '\n')
134 			--ln.len;
135 		ln.line_no++;
136 
137 		z = tail;
138 
139 		if ((t = procline(&ln, nottext)) == 0 && Bflag > 0 && z == 0) {
140 			enqueue(&ln);
141 			linesqueued++;
142 		}
143 		c += t;
144 	}
145 	if (Bflag > 0)
146 		clearqueue();
147 	grep_close(f);
148 
149 	if (cflag) {
150 		if (!hflag)
151 			printf("%s:", ln.file);
152 		printf("%u\n", c);
153 	}
154 	if (lflag && c != 0)
155 		printf("%s\n", fn);
156 	if (Lflag && c == 0)
157 		printf("%s\n", fn);
158 	if (c && !cflag && !lflag && !Lflag &&
159 	    binbehave == BIN_FILE_BIN && nottext && !qflag)
160 		printf("Binary file %s matches\n", fn);
161 
162 	return c;
163 }
164 
165 
166 /*
167  * Process an individual line in a file. Return non-zero if it matches.
168  */
169 
170 #define isword(x) (isalnum(x) || (x) == '_')
171 
172 static int
173 procline(str_t *l, int nottext)
174 {
175 	regmatch_t	pmatch;
176 	int		c, i, r;
177 
178 	if (matchall) {
179 		c = !vflag;
180 		goto print;
181 	}
182 
183 	for (c = i = 0; i < patterns; i++) {
184 		if (fg_pattern[i].pattern) {
185 			r = grep_search(&fg_pattern[i], (unsigned char *)l->dat,
186 			    l->len, &pmatch);
187 		} else {
188 			pmatch.rm_so = 0;
189 			pmatch.rm_eo = l->len;
190 			r = regexec(&r_pattern[i], l->dat, 1, &pmatch, eflags);
191 		}
192 		if (r == 0 && xflag) {
193 			if (pmatch.rm_so != 0 || pmatch.rm_eo != l->len)
194 				r = REG_NOMATCH;
195 		}
196 		if (r == 0) {
197 			c++;
198 			break;
199 		}
200 	}
201 	if (vflag)
202 		c = !c;
203 
204 print:
205 	if (c && binbehave == BIN_FILE_BIN && nottext)
206 		return c; /* Binary file */
207 
208 	if ((tail > 0 || c) && !cflag && !qflag) {
209 		if (c) {
210 			if (first > 0 && tail == 0 && (Bflag < linesqueued) &&
211 			    (Aflag || Bflag))
212 				printf("--\n");
213 			first = 1;
214 			tail = Aflag;
215 			if (Bflag > 0)
216 				printqueue();
217 			linesqueued = 0;
218 			printline(l, ':');
219 		} else {
220 			printline(l, '-');
221 			tail--;
222 		}
223 	}
224 	return c;
225 }
226 
227 void
228 fgrepcomp(fastgrep_t *fg, const char *pattern)
229 {
230 	int i;
231 
232 	/* Initialize. */
233 	fg->patternLen = strlen(pattern);
234 	fg->bol = 0;
235 	fg->eol = 0;
236 	fg->wmatch = wflag;
237 	fg->reversedSearch = 0;
238 
239 	/*
240 	 * Make a copy and upper case it for later if in -i mode,
241 	 * else just copy the pointer.
242 	 */
243 	if (iflag) {
244 		fg->pattern = grep_malloc(fg->patternLen + 1);
245 		for (i = 0; i < fg->patternLen; i++)
246 			fg->pattern[i] = toupper(pattern[i]);
247 		fg->pattern[fg->patternLen] = '\0';
248 	} else
249 		fg->pattern = (unsigned char *)pattern;	/* really const */
250 
251 	/* Preprocess pattern. */
252 	for (i = 0; i <= UCHAR_MAX; i++)
253 		fg->qsBc[i] = fg->patternLen;
254 	for (i = 1; i < fg->patternLen; i++) {
255 		fg->qsBc[fg->pattern[i]] = fg->patternLen - i;
256 		/*
257 		 * If case is ignored, make the jump apply to both upper and
258 		 * lower cased characters.  As the pattern is stored in upper
259 		 * case, apply the same to the lower case equivalents.
260 		 */
261 		if (iflag)
262 			fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i;
263 	}
264 }
265 
266 /*
267  * Returns: -1 on failure, 0 on success
268  */
269 int
270 fastcomp(fastgrep_t *fg, const char *pattern)
271 {
272 	int i;
273 	int bol = 0;
274 	int eol = 0;
275 	int shiftPatternLen;
276 	int hasDot = 0;
277 	int firstHalfDot = -1;
278 	int firstLastHalfDot = -1;
279 	int lastHalfDot = 0;
280 
281 	/* Initialize. */
282 	fg->patternLen = strlen(pattern);
283 	fg->bol = 0;
284 	fg->eol = 0;
285 	fg->wmatch = 0;
286 	fg->reversedSearch = 0;
287 
288 	/* Remove end-of-line character ('$'). */
289 	if (pattern[fg->patternLen - 1] == '$') {
290 		eol++;
291 		fg->eol = 1;
292 		fg->patternLen--;
293 	}
294 
295 	/* Remove beginning-of-line character ('^'). */
296 	if (pattern[0] == '^') {
297 		bol++;
298 		fg->bol = 1;
299 		fg->patternLen--;
300 	}
301 
302 	/* Remove enclosing [[:<:]] and [[:>:]] (word match). */
303 	if (wflag) {
304 		/* basic re's use \( \), extended re's ( ) */
305 		int extra = Eflag ? 1 : 2;
306 		fg->patternLen -= 14 + 2 * extra;
307 		fg->wmatch = 7 + extra;
308 	} else if (fg->patternLen >= 14 &&
309 	    strncmp(pattern + fg->bol, "[[:<:]]", 7) == 0 &&
310 	    strncmp(pattern + fg->bol + fg->patternLen - 7, "[[:>:]]", 7) == 0) {
311 		fg->patternLen -= 14;
312 		fg->wmatch = 7;
313 	}
314 
315 	/*
316 	 * Copy pattern minus '^' and '$' characters as well as word
317 	 * match character classes at the beginning and ending of the
318 	 * string respectively.
319 	 */
320 	fg->pattern = grep_malloc(fg->patternLen + 1);
321 	memcpy(fg->pattern, pattern + bol + fg->wmatch, fg->patternLen);
322 	fg->pattern[fg->patternLen] = '\0';
323 
324 	/* Look for ways to cheat...er...avoid the full regex engine. */
325 	for (i = 0; i < fg->patternLen; i++)
326 	{
327 		/* Can still cheat? */
328 		if ((isalnum(fg->pattern[i])) || isspace(fg->pattern[i]) ||
329 		    (fg->pattern[i] == '_') || (fg->pattern[i] == ',') ||
330 		    (fg->pattern[i] == '=') || (fg->pattern[i] == '-') ||
331 		    (fg->pattern[i] == ':') || (fg->pattern[i] == '/')) {
332 			/* As long as it is good, upper case it for later. */
333 			if (iflag)
334 				fg->pattern[i] = toupper(fg->pattern[i]);
335 		} else if (fg->pattern[i] == '.') {
336 			hasDot = i;
337 			if (i < fg->patternLen / 2) {
338 				if (firstHalfDot < 0)
339 					/* Closest dot to the beginning */
340 					firstHalfDot = i;
341 			} else {
342 				/* Closest dot to the end of the pattern. */
343 				lastHalfDot = i;
344 				if (firstLastHalfDot < 0)
345 					firstLastHalfDot = i;
346 			}
347 		} else {
348 			/* Free memory and let others know this is empty. */
349 			free(fg->pattern);
350 			fg->pattern = NULL;
351 			return (-1);
352 		}
353 	}
354 
355 	/*
356 	 * Determine if a reverse search would be faster based on the placement
357 	 * of the dots.
358 	 */
359 	if ((!(lflag || cflag)) && ((!(bol || eol)) &&
360 	    ((lastHalfDot) && ((firstHalfDot < 0) ||
361 	    ((fg->patternLen - (lastHalfDot + 1)) < firstHalfDot))))) {
362 		fg->reversedSearch = 1;
363 		hasDot = fg->patternLen - (firstHalfDot < 0 ?
364 		    firstLastHalfDot : firstHalfDot) - 1;
365 		grep_revstr(fg->pattern, fg->patternLen);
366 	}
367 
368 	/*
369 	 * Normal Quick Search would require a shift based on the position the
370 	 * next character after the comparison is within the pattern.  With
371 	 * wildcards, the position of the last dot effects the maximum shift
372 	 * distance.
373 	 * The closer to the end the wild card is the slower the search.  A
374 	 * reverse version of this algorithm would be useful for wildcards near
375 	 * the end of the string.
376 	 *
377 	 * Examples:
378 	 * Pattern	Max shift
379 	 * -------	---------
380 	 * this		5
381 	 * .his		4
382 	 * t.is		3
383 	 * th.s		2
384 	 * thi.		1
385 	 */
386 
387 	/* Adjust the shift based on location of the last dot ('.'). */
388 	shiftPatternLen = fg->patternLen - hasDot;
389 
390 	/* Preprocess pattern. */
391 	for (i = 0; i <= UCHAR_MAX; i++)
392 		fg->qsBc[i] = shiftPatternLen;
393 	for (i = hasDot + 1; i < fg->patternLen; i++) {
394 		fg->qsBc[fg->pattern[i]] = fg->patternLen - i;
395 		/*
396 		 * If case is ignored, make the jump apply to both upper and
397 		 * lower cased characters.  As the pattern is stored in upper
398 		 * case, apply the same to the lower case equivalents.
399 		 */
400 		if (iflag)
401 			fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i;
402 	}
403 
404 	/*
405 	 * Put pattern back to normal after pre-processing to allow for easy
406 	 * comparisons later.
407 	 */
408 	if (fg->reversedSearch)
409 		grep_revstr(fg->pattern, fg->patternLen);
410 
411 	return (0);
412 }
413 
414 /*
415  * Word boundaries using regular expressions are defined as the point
416  * of transition from a non-word char to a word char, or vice versa.
417  * This means that grep -w +a and grep -w a+ never match anything,
418  * because they lack a starting or ending transition, but grep -w a+b
419  * does match a line containing a+b.
420  */
421 #define wmatch(d, l, s, e)	\
422 	((s == 0 || !isword(d[s-1])) && (e == l || !isword(d[e])) && \
423 	  e > s && isword(d[s]) && isword(d[e-1]))
424 
425 static int
426 grep_search(fastgrep_t *fg, unsigned char *data, size_t dataLen, regmatch_t *pmatch)
427 {
428 	int j;
429 	int rtrnVal = REG_NOMATCH;
430 
431 	pmatch->rm_so = -1;
432 	pmatch->rm_eo = -1;
433 
434 	/* No point in going farther if we do not have enough data. */
435 	if (dataLen < fg->patternLen)
436 		return (rtrnVal);
437 
438 	/* Only try once at the beginning or ending of the line. */
439 	if (fg->bol || fg->eol) {
440 		/* Simple text comparison. */
441 		/* Verify data is >= pattern length before searching on it. */
442 		if (dataLen >= fg->patternLen) {
443 			/* Determine where in data to start search at. */
444 			if (fg->eol)
445 				j = dataLen - fg->patternLen;
446 			else
447 				j = 0;
448 			if (!((fg->bol && fg->eol) && (dataLen != fg->patternLen)))
449 				if (grep_cmp(fg->pattern, data + j,
450 				    fg->patternLen) == -1) {
451 					pmatch->rm_so = j;
452 					pmatch->rm_eo = j + fg->patternLen;
453 					if (!fg->wmatch || wmatch(data, dataLen,
454 					    pmatch->rm_so, pmatch->rm_eo))
455 						rtrnVal = 0;
456 				}
457 		}
458 	} else if (fg->reversedSearch) {
459 		/* Quick Search algorithm. */
460 		j = dataLen;
461 		do {
462 			if (grep_cmp(fg->pattern, data + j - fg->patternLen,
463 			    fg->patternLen) == -1) {
464 				pmatch->rm_so = j - fg->patternLen;
465 				pmatch->rm_eo = j;
466 				if (!fg->wmatch || wmatch(data, dataLen,
467 				    pmatch->rm_so, pmatch->rm_eo)) {
468 					rtrnVal = 0;
469 					break;
470 				}
471 			}
472 			/* Shift if within bounds, otherwise, we are done. */
473 			if (j == fg->patternLen)
474 				break;
475 			j -= fg->qsBc[data[j - fg->patternLen - 1]];
476 		} while (j >= fg->patternLen);
477 	} else {
478 		/* Quick Search algorithm. */
479 		j = 0;
480 		do {
481 			if (grep_cmp(fg->pattern, data + j, fg->patternLen) == -1) {
482 				pmatch->rm_so = j;
483 				pmatch->rm_eo = j + fg->patternLen;
484 				if (fg->patternLen == 0 || !fg->wmatch ||
485 				    wmatch(data, dataLen, pmatch->rm_so,
486 				    pmatch->rm_eo)) {
487 					rtrnVal = 0;
488 					break;
489 				}
490 			}
491 
492 			/* Shift if within bounds, otherwise, we are done. */
493 			if (j + fg->patternLen == dataLen)
494 				break;
495 			else
496 				j += fg->qsBc[data[j + fg->patternLen]];
497 		} while (j <= (dataLen - fg->patternLen));
498 	}
499 
500 	return (rtrnVal);
501 }
502 
503 
504 void *
505 grep_malloc(size_t size)
506 {
507 	void	*ptr;
508 
509 	if ((ptr = malloc(size)) == NULL)
510 		err(2, "malloc");
511 	return ptr;
512 }
513 
514 void *
515 grep_calloc(size_t nmemb, size_t size)
516 {
517 	void	*ptr;
518 
519 	if ((ptr = calloc(nmemb, size)) == NULL)
520 		err(2, "calloc");
521 	return ptr;
522 }
523 
524 void *
525 grep_realloc(void *ptr, size_t size)
526 {
527 	if ((ptr = realloc(ptr, size)) == NULL)
528 		err(2, "realloc");
529 	return ptr;
530 }
531 
532 /*
533  * Returns:	i >= 0 on failure (position that it failed)
534  *		-1 on success
535  */
536 static int
537 grep_cmp(const unsigned char *pattern, const unsigned char *data, size_t len)
538 {
539 	int i;
540 
541 	for (i = 0; i < len; i++) {
542 		if (((pattern[i] == data[i]) || (!Fflag && pattern[i] == '.'))
543 		    || (iflag && pattern[i] == toupper(data[i])))
544 			continue;
545 		return (i);
546 	}
547 
548 	return (-1);
549 }
550 
551 static void
552 grep_revstr(unsigned char *str, int len)
553 {
554 	int i;
555 	char c;
556 
557 	for (i = 0; i < len / 2; i++) {
558 		c = str[i];
559 		str[i] = str[len - i - 1];
560 		str[len - i - 1] = c;
561 	}
562 }
563 
564 void
565 printline(str_t *line, int sep)
566 {
567 	int n;
568 
569 	n = 0;
570 	if (!hflag) {
571 		fputs(line->file, stdout);
572 		++n;
573 	}
574 	if (nflag) {
575 		if (n)
576 			putchar(sep);
577 		printf("%d", line->line_no);
578 		++n;
579 	}
580 	if (bflag) {
581 		if (n)
582 			putchar(sep);
583 		printf("%lld", (long long)line->off);
584 		++n;
585 	}
586 	if (n)
587 		putchar(sep);
588 	fwrite(line->dat, line->len, 1, stdout);
589 	putchar('\n');
590 }
591