xref: /openbsd-src/usr.bin/grep/util.c (revision db3296cf5c1dd9058ceecc3a29fe4aaa0bd26000)
1 /*	$OpenBSD: util.c,v 1.17 2003/07/20 22:16:52 millert Exp $	*/
2 
3 /*-
4  * Copyright (c) 1999 James Howard and Dag-Erling Co�dan Sm�rgrav
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/types.h>
30 #include <sys/stat.h>
31 
32 #include <ctype.h>
33 #include <err.h>
34 #include <errno.h>
35 #include <fts.h>
36 #include <regex.h>
37 #include <stdio.h>
38 #include <stdlib.h>
39 #include <string.h>
40 #include <unistd.h>
41 #include <zlib.h>
42 
43 #include "grep.h"
44 
45 /*
46  * Process a file line by line...
47  */
48 
49 static int	linesqueued;
50 static int	procline(str_t *l, int);
51 static int	grep_search(fastgrep_t *, unsigned char *, int, regmatch_t *pmatch);
52 static int	grep_cmp(const unsigned char *, const unsigned char *, size_t);
53 static void	grep_revstr(unsigned char *, int);
54 
55 int
56 grep_tree(char **argv)
57 {
58 	FTS	*fts;
59 	FTSENT	*p;
60 	int	c, fts_flags;
61 
62 	c = fts_flags = 0;
63 
64 	if (Hflag)
65 		fts_flags = FTS_COMFOLLOW;
66 	if (Pflag)
67 		fts_flags = FTS_PHYSICAL;
68 	if (Sflag)
69 		fts_flags = FTS_LOGICAL;
70 
71 	fts_flags |= FTS_NOSTAT | FTS_NOCHDIR;
72 
73 	if (!(fts = fts_open(argv, fts_flags, NULL)))
74 		err(2, NULL);
75 	while ((p = fts_read(fts)) != NULL) {
76 		switch (p->fts_info) {
77 		case FTS_DNR:
78 			break;
79 		case FTS_ERR:
80 			errx(2, "%s: %s", p->fts_path, strerror(p->fts_errno));
81 			break;
82 		case FTS_DP:
83 			break;
84 		default:
85 			c += procfile(p->fts_path);
86 			break;
87 		}
88 	}
89 
90 	return c;
91 }
92 
93 int
94 procfile(char *fn)
95 {
96 	str_t ln;
97 	file_t *f;
98 	int c, t, z, nottext;
99 
100 	if (fn == NULL) {
101 		fn = "(standard input)";
102 		f = grep_fdopen(STDIN_FILENO, "r");
103 	} else {
104 		f = grep_open(fn, "r");
105 	}
106 	if (f == NULL) {
107 		if (!sflag)
108 			warn("%s", fn);
109 		return 0;
110 	}
111 
112 	nottext = grep_bin_file(f);
113 	if (nottext && binbehave == BIN_FILE_SKIP) {
114 		grep_close(f);
115 		return 0;
116 	}
117 
118 	ln.file = fn;
119 	ln.line_no = 0;
120 	linesqueued = 0;
121 	ln.off = -1;
122 
123 	if (Bflag > 0)
124 		initqueue();
125 	for (c = 0; !(lflag && c);) {
126 		ln.off += ln.len + 1;
127 		if ((ln.dat = grep_fgetln(f, &ln.len)) == NULL)
128 			break;
129 		if (ln.len > 0 && ln.dat[ln.len - 1] == '\n')
130 			--ln.len;
131 		ln.line_no++;
132 
133 		z = tail;
134 
135 		if ((t = procline(&ln, nottext)) == 0 && Bflag > 0 && z == 0) {
136 			enqueue(&ln);
137 			linesqueued++;
138 		}
139 		c += t;
140 	}
141 	if (Bflag > 0)
142 		clearqueue();
143 	grep_close(f);
144 
145 	if (cflag) {
146 		if (!hflag)
147 			printf("%s:", ln.file);
148 		printf("%u\n", c);
149 	}
150 	if (lflag && c != 0)
151 		printf("%s\n", fn);
152 	if (Lflag && c == 0)
153 		printf("%s\n", fn);
154 	if (c && !cflag && !lflag && !Lflag &&
155 	    binbehave == BIN_FILE_BIN && nottext && !qflag)
156 		printf("Binary file %s matches\n", fn);
157 
158 	return c;
159 }
160 
161 
162 /*
163  * Process an individual line in a file. Return non-zero if it matches.
164  */
165 
166 #define isword(x) (isalnum(x) || (x) == '_')
167 
168 static int
169 procline(str_t *l, int nottext)
170 {
171 	regmatch_t	pmatch;
172 	int		c, i, r;
173 
174 	if (matchall) {
175 		c = !vflag;
176 		goto print;
177 	}
178 
179 	for (c = i = 0; i < patterns; i++) {
180 		pmatch.rm_so = 0;
181 		pmatch.rm_eo = l->len;
182 		if (fg_pattern[i].pattern)
183 			r = grep_search(&fg_pattern[i], (unsigned char *)l->dat,
184 			    l->len, &pmatch);
185 		else
186 			r = regexec(&r_pattern[i], l->dat, 1, &pmatch, eflags);
187 		if (r == 0) {
188 			if (wflag) {
189 				if ((pmatch.rm_so != 0 &&
190 				    isword(l->dat[pmatch.rm_so - 1])) ||
191 				    (pmatch.rm_eo != l->len &&
192 				    isword(l->dat[pmatch.rm_eo])))
193 					r = REG_NOMATCH;
194 			}
195 			if (xflag) {
196 				if (pmatch.rm_so != 0 || pmatch.rm_eo != l->len)
197 					r = REG_NOMATCH;
198 			}
199 		}
200 		if (r == 0) {
201 			c++;
202 			break;
203 		}
204 	}
205 	if (vflag)
206 		c = !c;
207 
208 print:
209 	if (c && binbehave == BIN_FILE_BIN && nottext)
210 		return c; /* Binary file */
211 
212 	if ((tail > 0 || c) && !cflag && !qflag) {
213 		if (c) {
214 			if (first > 0 && tail == 0 && (Bflag < linesqueued) &&
215 			    (Aflag || Bflag))
216 				printf("--\n");
217 			first = 1;
218 			tail = Aflag;
219 			if (Bflag > 0)
220 				printqueue();
221 			linesqueued = 0;
222 			printline(l, ':');
223 		} else {
224 			printline(l, '-');
225 			tail--;
226 		}
227 	}
228 	return c;
229 }
230 
231 /*
232  * Returns: -1 on failure, 0 on success
233  */
234 int
235 fastcomp(fastgrep_t *fg, const char *pattern)
236 {
237 	int i;
238 	int bol = 0;
239 	int eol = 0;
240 	int origPatternLen;
241 	int shiftPatternLen;
242 	int hasDot = 0;
243 	int firstHalfDot = -1;
244 	int firstLastHalfDot = -1;
245 	int lastHalfDot = 0;
246 
247 	if (Fflag) {
248 		fg->pattern = NULL;
249 		return (-1);
250 	}
251 
252 	/* Initialize. */
253 	origPatternLen = fg->patternLen = strlen(pattern);
254 	fg->bol = 0;
255 	fg->eol = 0;
256 	fg->reversedSearch = 0;
257 
258 	/* Remove end-of-line character ('$'). */
259 	if (pattern[fg->patternLen - 1] == '$') {
260 		eol++;
261 		fg->eol = 1;
262 		fg->patternLen--;
263 		boleol = 1;
264 	}
265 
266 	/* Remove beginning-of-line character ('^'). */
267 	if (pattern[0] == '^') {
268 		bol++;
269 		fg->bol = 1;
270 		fg->patternLen--;
271 		boleol = 1;
272 	}
273 
274 	/*
275 	 * Copy pattern minus '^' and '$' characters at the beginning and
276 	 * ending of the string respectively.
277 	 */
278 	fg->pattern = grep_strdup(pattern + bol);
279 
280 	/* Look for ways to cheat...er...avoid the full regex engine. */
281 	for (i = 0; i < fg->patternLen; i++)
282 	{
283 		/* Can still cheat? */
284 		if ((isalnum(fg->pattern[i])) || isspace(fg->pattern[i]) ||
285 		    (fg->pattern[i] == '_') || (fg->pattern[i] == ',') ||
286 		    (fg->pattern[i] == '^') || (fg->pattern[i] == '$') ||
287 		    (fg->pattern[i] == '=') || (fg->pattern[i] == '-') ||
288 		    (fg->pattern[i] == ':') || (fg->pattern[i] == '/')) {
289 			/* As long as it is good, upper case it for later. */
290 			if (iflag)
291 				fg->pattern[i] = toupper(fg->pattern[i]);
292 		} else if (fg->pattern[i] == '.') {
293 			hasDot = i;
294 			if (i < fg->patternLen / 2) {
295 				if (firstHalfDot < -1)
296 					/* Closest dot to the beginning */
297 					firstHalfDot = i;
298 			} else {
299 				/* Closest dot to the end of the pattern. */
300 				lastHalfDot = i;
301 				if (firstLastHalfDot < 0)
302 					firstLastHalfDot = i;
303 			}
304 		} else {
305 			/* Free memory and let others know this is empty. */
306 			free(fg->pattern);
307 			fg->pattern = NULL;
308 			return (-1);
309 		}
310 	}
311 
312 	/*
313 	 * Determine if a reverse search would be faster based on the placement
314 	 * of the dots.
315 	 */
316 	if ((!(lflag || cflag)) && ((!(bol || eol)) &&
317 	    ((lastHalfDot) && ((firstHalfDot < 0) ||
318 	    ((fg->patternLen - (lastHalfDot + 1)) < firstHalfDot))))) {
319 		fg->reversedSearch = 1;
320 		hasDot = fg->patternLen - (firstHalfDot < 0 ?
321 		    firstLastHalfDot : firstHalfDot) - 1;
322 		grep_revstr(fg->pattern, fg->patternLen);
323 	}
324 
325 	/*
326 	 * Normal Quick Search would require a shift based on the position the
327 	 * next character after the comparison is within the pattern.  With
328 	 * wildcards, the position of the last dot effects the maximum shift
329 	 * distance.
330 	 * The closer to the end the wild card is the slower the search.  A
331 	 * reverse version of this algorithm would be useful for wildcards near
332 	 * the end of the string.
333 	 *
334 	 * Examples:
335 	 * Pattern	Max shift
336 	 * -------	---------
337 	 * this		5
338 	 * .his		4
339 	 * t.is		3
340 	 * th.s		2
341 	 * thi.		1
342 	 */
343 
344 	/* Adjust the shift based on location of the last dot ('.'). */
345 	shiftPatternLen = fg->patternLen - hasDot;
346 
347 	/* Preprocess pattern. */
348 	for (i = 0; i <= UCHAR_MAX; i++)
349 		fg->qsBc[i] = shiftPatternLen;
350 	for (i = hasDot + 1; i < fg->patternLen; i++) {
351 		fg->qsBc[fg->pattern[i]] = fg->patternLen - i;
352 		/*
353 		 * If case is ignored, make the jump apply to both upper and
354 		 * lower cased characters.  As the pattern is stored in upper
355 		 * case, apply the same to the lower case equivalents.
356 		 */
357 		if (iflag)
358 			fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i;
359 	}
360 
361 	/*
362 	 * Put pattern back to normal after pre-processing to allow for easy
363 	 * comparisons later.
364 	 */
365 	if (fg->reversedSearch)
366 		grep_revstr(fg->pattern, fg->patternLen);
367 
368 	return (0);
369 }
370 
371 static int
372 grep_search(fastgrep_t *fg, unsigned char *data, int dataLen, regmatch_t *pmatch)
373 {
374 	int j;
375 	int rtrnVal = REG_NOMATCH;
376 
377 	pmatch->rm_so = -1;
378 	pmatch->rm_eo = -1;
379 
380 	/* No point in going farther if we do not have enough data. */
381 	if (dataLen < fg->patternLen)
382 		return (rtrnVal);
383 
384 	/* Only try once at the beginning or ending of the line. */
385 	if (fg->bol || fg->eol) {
386 		/* Simple text comparison. */
387 		/* Verify data is >= pattern length before searching on it. */
388 		if (dataLen >= fg->patternLen) {
389 			/* Determine where in data to start search at. */
390 			if (fg->eol)
391 				j = dataLen - fg->patternLen;
392 			else
393 				j = 0;
394 			if (!((fg->bol && fg->eol) && (dataLen != fg->patternLen)))
395 				if (grep_cmp(fg->pattern, data + j, fg->patternLen) == -1) {
396 					rtrnVal = 0;
397 					pmatch->rm_so = j;
398 					pmatch->rm_eo = j + fg->patternLen;
399 				}
400 		}
401 	} else if (fg->reversedSearch) {
402 		/* Quick Search algorithm. */
403 		j = dataLen;
404 		do {
405 			if (grep_cmp(fg->pattern, data + j - fg->patternLen,
406 			    fg->patternLen) == -1) {
407 				rtrnVal = 0;
408 				pmatch->rm_so = j - fg->patternLen;
409 				pmatch->rm_eo = j;
410 				break;
411 			}
412 			/* Shift if within bounds, otherwise, we are done. */
413 			if (j == fg->patternLen)
414 				break;
415 			j -= fg->qsBc[data[j - fg->patternLen - 1]];
416 		} while (j >= fg->patternLen);
417 	} else {
418 		/* Quick Search algorithm. */
419 		j = 0;
420 		do {
421 			if (grep_cmp(fg->pattern, data + j, fg->patternLen) == -1) {
422 				rtrnVal = 0;
423 				pmatch->rm_so = j;
424 				pmatch->rm_eo = j + fg->patternLen;
425 				break;
426 			}
427 
428 			/* Shift if within bounds, otherwise, we are done. */
429 			if (j + fg->patternLen == dataLen)
430 				break;
431 			else
432 				j += fg->qsBc[data[j + fg->patternLen]];
433 		} while (j <= (dataLen - fg->patternLen));
434 	}
435 
436 	return (rtrnVal);
437 }
438 
439 
440 void *
441 grep_malloc(size_t size)
442 {
443 	void	*ptr;
444 
445 	if ((ptr = malloc(size)) == NULL)
446 		err(2, "malloc");
447 	return ptr;
448 }
449 
450 void *
451 grep_realloc(void *ptr, size_t size)
452 {
453 	if ((ptr = realloc(ptr, size)) == NULL)
454 		err(2, "realloc");
455 	return ptr;
456 }
457 
458 unsigned char *
459 grep_strdup(const char *str)
460 {
461 	unsigned char *ptr;
462 
463 	if ((ptr = (unsigned char *)strdup(str)) == NULL)
464 		err(2, "strdup");
465 	return ptr;
466 }
467 
468 /*
469  * Returns:	i >= 0 on failure (position that it failed)
470  *		-1 on success
471  */
472 int
473 grep_cmp(const unsigned char *pattern, const unsigned char *data, size_t len)
474 {
475 	int i;
476 
477 	for (i = 0; i < len; i++) {
478 		if (((pattern[i] == data[i]) || (pattern[i] == '.')) ||
479 		    (iflag && pattern[i] == toupper(data[i])))
480 			continue;
481 		return (i);
482 	}
483 
484 	return (-1);
485 }
486 
487 static void
488 grep_revstr(unsigned char *str, int len)
489 {
490 	int i;
491 	char c;
492 
493 	for (i = 0; i < len / 2; i++) {
494 		c = str[i];
495 		str[i] = str[len - i - 1];
496 		str[len - i - 1] = c;
497 	}
498 }
499 
500 void
501 printline(str_t *line, int sep)
502 {
503 	int n;
504 
505 	n = 0;
506 	if (!hflag) {
507 		fputs(line->file, stdout);
508 		++n;
509 	}
510 	if (nflag) {
511 		if (n)
512 			putchar(sep);
513 		printf("%d", line->line_no);
514 		++n;
515 	}
516 	if (bflag) {
517 		if (n)
518 			putchar(sep);
519 		printf("%lu", (unsigned long)line->off);
520 	}
521 	if (n)
522 		putchar(sep);
523 	fwrite(line->dat, line->len, 1, stdout);
524 	putchar('\n');
525 }
526