xref: /openbsd-src/usr.bin/grep/util.c (revision daf88648c0e349d5c02e1504293082072c981640)
1 /*	$OpenBSD: util.c,v 1.34 2006/12/26 20:59:23 otto Exp $	*/
2 
3 /*-
4  * Copyright (c) 1999 James Howard and Dag-Erling Co�dan Sm�rgrav
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/types.h>
30 #include <sys/stat.h>
31 
32 #include <ctype.h>
33 #include <err.h>
34 #include <errno.h>
35 #include <fts.h>
36 #include <regex.h>
37 #include <stdio.h>
38 #include <stdlib.h>
39 #include <string.h>
40 #include <unistd.h>
41 #include <zlib.h>
42 
43 #include "grep.h"
44 
45 /*
46  * Process a file line by line...
47  */
48 
49 static int	linesqueued;
50 static int	procline(str_t *l, int);
51 static int	grep_search(fastgrep_t *, unsigned char *, size_t, regmatch_t *pmatch);
52 static int	grep_cmp(const unsigned char *, const unsigned char *, size_t);
53 static void	grep_revstr(unsigned char *, int);
54 
55 int
56 grep_tree(char **argv)
57 {
58 	FTS	*fts;
59 	FTSENT	*p;
60 	int	c, fts_flags;
61 
62 	c = fts_flags = 0;
63 
64 	if (Hflag)
65 		fts_flags = FTS_COMFOLLOW;
66 	if (Pflag)
67 		fts_flags = FTS_PHYSICAL;
68 	if (Sflag)
69 		fts_flags = FTS_LOGICAL;
70 
71 	fts_flags |= FTS_NOSTAT | FTS_NOCHDIR;
72 
73 	if (!(fts = fts_open(argv, fts_flags, NULL)))
74 		err(2, NULL);
75 	while ((p = fts_read(fts)) != NULL) {
76 		switch (p->fts_info) {
77 		case FTS_DNR:
78 			break;
79 		case FTS_ERR:
80 			errx(2, "%s: %s", p->fts_path, strerror(p->fts_errno));
81 			break;
82 		case FTS_DP:
83 			break;
84 		default:
85 			c += procfile(p->fts_path);
86 			break;
87 		}
88 	}
89 	if (errno)
90 		err(2, "fts_read");
91 
92 	return c;
93 }
94 
95 int
96 procfile(char *fn)
97 {
98 	str_t ln;
99 	file_t *f;
100 	int c, t, z, nottext;
101 
102 	if (fn == NULL) {
103 		fn = "(standard input)";
104 		f = grep_fdopen(STDIN_FILENO, "r");
105 	} else {
106 		f = grep_open(fn, "r");
107 	}
108 	if (f == NULL) {
109 		if (!sflag)
110 			warn("%s", fn);
111 		return 0;
112 	}
113 
114 	nottext = grep_bin_file(f);
115 	if (nottext && binbehave == BIN_FILE_SKIP) {
116 		grep_close(f);
117 		return 0;
118 	}
119 
120 	ln.file = fn;
121 	ln.line_no = 0;
122 	ln.len = 0;
123 	linesqueued = 0;
124 	tail = 0;
125 	ln.off = -1;
126 
127 	if (Bflag > 0)
128 		initqueue();
129 	for (c = 0;  c == 0 || !(lflag || qflag); ) {
130 		ln.off += ln.len + 1;
131 		if ((ln.dat = grep_fgetln(f, &ln.len)) == NULL)
132 			break;
133 		if (ln.len > 0 && ln.dat[ln.len - 1] == '\n')
134 			--ln.len;
135 		ln.line_no++;
136 
137 		z = tail;
138 
139 		if ((t = procline(&ln, nottext)) == 0 && Bflag > 0 && z == 0) {
140 			enqueue(&ln);
141 			linesqueued++;
142 		}
143 		c += t;
144 	}
145 	if (Bflag > 0)
146 		clearqueue();
147 	grep_close(f);
148 
149 	if (cflag) {
150 		if (!hflag)
151 			printf("%s:", ln.file);
152 		printf("%u\n", c);
153 	}
154 	if (lflag && c != 0)
155 		printf("%s\n", fn);
156 	if (Lflag && c == 0)
157 		printf("%s\n", fn);
158 	if (c && !cflag && !lflag && !Lflag &&
159 	    binbehave == BIN_FILE_BIN && nottext && !qflag)
160 		printf("Binary file %s matches\n", fn);
161 
162 	return c;
163 }
164 
165 
166 /*
167  * Process an individual line in a file. Return non-zero if it matches.
168  */
169 
170 #define isword(x) (isalnum(x) || (x) == '_')
171 
172 static int
173 procline(str_t *l, int nottext)
174 {
175 	regmatch_t	pmatch;
176 	int		c, i, r;
177 
178 	if (matchall) {
179 		c = !vflag;
180 		goto print;
181 	}
182 
183 	for (c = i = 0; i < patterns; i++) {
184 		if (fg_pattern[i].pattern) {
185 			r = grep_search(&fg_pattern[i], (unsigned char *)l->dat,
186 			    l->len, &pmatch);
187 		} else {
188 			pmatch.rm_so = 0;
189 			pmatch.rm_eo = l->len;
190 			r = regexec(&r_pattern[i], l->dat, 1, &pmatch, eflags);
191 		}
192 		if (r == 0 && xflag) {
193 			if (pmatch.rm_so != 0 || pmatch.rm_eo != l->len)
194 				r = REG_NOMATCH;
195 		}
196 		if (r == 0) {
197 			c++;
198 			break;
199 		}
200 	}
201 	if (vflag)
202 		c = !c;
203 
204 print:
205 	if (c && binbehave == BIN_FILE_BIN && nottext)
206 		return c; /* Binary file */
207 
208 	if ((tail > 0 || c) && !cflag && !qflag) {
209 		if (c) {
210 			if (first > 0 && tail == 0 && (Bflag < linesqueued) &&
211 			    (Aflag || Bflag))
212 				printf("--\n");
213 			first = 1;
214 			tail = Aflag;
215 			if (Bflag > 0)
216 				printqueue();
217 			linesqueued = 0;
218 			printline(l, ':');
219 		} else {
220 			printline(l, '-');
221 			tail--;
222 		}
223 	}
224 	return c;
225 }
226 
227 void
228 fgrepcomp(fastgrep_t *fg, const char *pattern)
229 {
230 	int i;
231 
232 	/* Initialize. */
233 	fg->patternLen = strlen(pattern);
234 	fg->bol = 0;
235 	fg->eol = 0;
236 	fg->wmatch = wflag;
237 	fg->reversedSearch = 0;
238 
239 	/*
240 	 * Make a copy and upper case it for later if in -i mode,
241 	 * else just copy the pointer.
242 	 */
243 	if (iflag) {
244 		fg->pattern = grep_malloc(fg->patternLen + 1);
245 		for (i = 0; i < fg->patternLen; i++)
246 			fg->pattern[i] = toupper(pattern[i]);
247 		fg->pattern[fg->patternLen] = '\0';
248 	} else
249 		fg->pattern = (unsigned char *)pattern;	/* really const */
250 
251 	/* Preprocess pattern. */
252 	for (i = 0; i <= UCHAR_MAX; i++)
253 		fg->qsBc[i] = fg->patternLen;
254 	for (i = 1; i < fg->patternLen; i++) {
255 		fg->qsBc[fg->pattern[i]] = fg->patternLen - i;
256 		/*
257 		 * If case is ignored, make the jump apply to both upper and
258 		 * lower cased characters.  As the pattern is stored in upper
259 		 * case, apply the same to the lower case equivalents.
260 		 */
261 		if (iflag)
262 			fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i;
263 	}
264 }
265 
266 /*
267  * Returns: -1 on failure, 0 on success
268  */
269 int
270 fastcomp(fastgrep_t *fg, const char *pattern)
271 {
272 	int i;
273 	int bol = 0;
274 	int eol = 0;
275 	int shiftPatternLen;
276 	int hasDot = 0;
277 	int firstHalfDot = -1;
278 	int firstLastHalfDot = -1;
279 	int lastHalfDot = 0;
280 
281 	/* Initialize. */
282 	fg->patternLen = strlen(pattern);
283 	fg->bol = 0;
284 	fg->eol = 0;
285 	fg->wmatch = 0;
286 	fg->reversedSearch = 0;
287 
288 	/* Remove end-of-line character ('$'). */
289 	if (pattern[fg->patternLen - 1] == '$') {
290 		eol++;
291 		fg->eol = 1;
292 		fg->patternLen--;
293 	}
294 
295 	/* Remove beginning-of-line character ('^'). */
296 	if (pattern[0] == '^') {
297 		bol++;
298 		fg->bol = 1;
299 		fg->patternLen--;
300 	}
301 
302 	/* Remove enclosing [[:<:]] and [[:>:]] (word match). */
303 	if (wflag) {
304 		/* basic re's use \( \), extended re's ( ) */
305 		int extra = Eflag ? 1 : 2;
306 		fg->patternLen -= 14 + 2 * extra;
307 		fg->wmatch = 7 + extra;
308 	} else if (fg->patternLen >= 14 &&
309 	    strncmp(pattern + fg->bol, "[[:<:]]", 7) == 0 &&
310 	    strncmp(pattern + fg->bol + fg->patternLen - 7, "[[:>:]]", 7) == 0) {
311 		fg->patternLen -= 14;
312 		fg->wmatch = 7;
313 	}
314 
315 	/*
316 	 * Copy pattern minus '^' and '$' characters as well as word
317 	 * match character classes at the beginning and ending of the
318 	 * string respectively.
319 	 */
320 	fg->pattern = grep_malloc(fg->patternLen + 1);
321 	memcpy(fg->pattern, pattern + bol + fg->wmatch, fg->patternLen);
322 	fg->pattern[fg->patternLen] = '\0';
323 
324 	/* Look for ways to cheat...er...avoid the full regex engine. */
325 	for (i = 0; i < fg->patternLen; i++)
326 	{
327 		/* Can still cheat? */
328 		if ((isalnum(fg->pattern[i])) || isspace(fg->pattern[i]) ||
329 		    (fg->pattern[i] == '_') || (fg->pattern[i] == ',') ||
330 		    (fg->pattern[i] == '^') || (fg->pattern[i] == '$') ||
331 		    (fg->pattern[i] == '=') || (fg->pattern[i] == '-') ||
332 		    (fg->pattern[i] == ':') || (fg->pattern[i] == '/')) {
333 			/* As long as it is good, upper case it for later. */
334 			if (iflag)
335 				fg->pattern[i] = toupper(fg->pattern[i]);
336 		} else if (fg->pattern[i] == '.') {
337 			hasDot = i;
338 			if (i < fg->patternLen / 2) {
339 				if (firstHalfDot < 0)
340 					/* Closest dot to the beginning */
341 					firstHalfDot = i;
342 			} else {
343 				/* Closest dot to the end of the pattern. */
344 				lastHalfDot = i;
345 				if (firstLastHalfDot < 0)
346 					firstLastHalfDot = i;
347 			}
348 		} else {
349 			/* Free memory and let others know this is empty. */
350 			free(fg->pattern);
351 			fg->pattern = NULL;
352 			return (-1);
353 		}
354 	}
355 
356 	/*
357 	 * Determine if a reverse search would be faster based on the placement
358 	 * of the dots.
359 	 */
360 	if ((!(lflag || cflag)) && ((!(bol || eol)) &&
361 	    ((lastHalfDot) && ((firstHalfDot < 0) ||
362 	    ((fg->patternLen - (lastHalfDot + 1)) < firstHalfDot))))) {
363 		fg->reversedSearch = 1;
364 		hasDot = fg->patternLen - (firstHalfDot < 0 ?
365 		    firstLastHalfDot : firstHalfDot) - 1;
366 		grep_revstr(fg->pattern, fg->patternLen);
367 	}
368 
369 	/*
370 	 * Normal Quick Search would require a shift based on the position the
371 	 * next character after the comparison is within the pattern.  With
372 	 * wildcards, the position of the last dot effects the maximum shift
373 	 * distance.
374 	 * The closer to the end the wild card is the slower the search.  A
375 	 * reverse version of this algorithm would be useful for wildcards near
376 	 * the end of the string.
377 	 *
378 	 * Examples:
379 	 * Pattern	Max shift
380 	 * -------	---------
381 	 * this		5
382 	 * .his		4
383 	 * t.is		3
384 	 * th.s		2
385 	 * thi.		1
386 	 */
387 
388 	/* Adjust the shift based on location of the last dot ('.'). */
389 	shiftPatternLen = fg->patternLen - hasDot;
390 
391 	/* Preprocess pattern. */
392 	for (i = 0; i <= UCHAR_MAX; i++)
393 		fg->qsBc[i] = shiftPatternLen;
394 	for (i = hasDot + 1; i < fg->patternLen; i++) {
395 		fg->qsBc[fg->pattern[i]] = fg->patternLen - i;
396 		/*
397 		 * If case is ignored, make the jump apply to both upper and
398 		 * lower cased characters.  As the pattern is stored in upper
399 		 * case, apply the same to the lower case equivalents.
400 		 */
401 		if (iflag)
402 			fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i;
403 	}
404 
405 	/*
406 	 * Put pattern back to normal after pre-processing to allow for easy
407 	 * comparisons later.
408 	 */
409 	if (fg->reversedSearch)
410 		grep_revstr(fg->pattern, fg->patternLen);
411 
412 	return (0);
413 }
414 
415 /*
416  * Word boundaries using regular expressions are defined as the point
417  * of transition from a non-word char to a word char, or vice versa.
418  * This means that grep -w +a and grep -w a+ never match anything,
419  * because they lack a starting or ending transition, but grep -w a+b
420  * does match a line containing a+b.
421  */
422 #define wmatch(d, l, s, e)	\
423 	((s == 0 || !isword(d[s-1])) && (e == l || !isword(d[e])) && \
424 	  e > s && isword(d[s]) && isword(d[e-1]))
425 
426 static int
427 grep_search(fastgrep_t *fg, unsigned char *data, size_t dataLen, regmatch_t *pmatch)
428 {
429 	int j;
430 	int rtrnVal = REG_NOMATCH;
431 
432 	pmatch->rm_so = -1;
433 	pmatch->rm_eo = -1;
434 
435 	/* No point in going farther if we do not have enough data. */
436 	if (dataLen < fg->patternLen)
437 		return (rtrnVal);
438 
439 	/* Only try once at the beginning or ending of the line. */
440 	if (fg->bol || fg->eol) {
441 		/* Simple text comparison. */
442 		/* Verify data is >= pattern length before searching on it. */
443 		if (dataLen >= fg->patternLen) {
444 			/* Determine where in data to start search at. */
445 			if (fg->eol)
446 				j = dataLen - fg->patternLen;
447 			else
448 				j = 0;
449 			if (!((fg->bol && fg->eol) && (dataLen != fg->patternLen)))
450 				if (grep_cmp(fg->pattern, data + j,
451 				    fg->patternLen) == -1) {
452 					pmatch->rm_so = j;
453 					pmatch->rm_eo = j + fg->patternLen;
454 					if (!fg->wmatch || wmatch(data, dataLen,
455 					    pmatch->rm_so, pmatch->rm_eo))
456 						rtrnVal = 0;
457 				}
458 		}
459 	} else if (fg->reversedSearch) {
460 		/* Quick Search algorithm. */
461 		j = dataLen;
462 		do {
463 			if (grep_cmp(fg->pattern, data + j - fg->patternLen,
464 			    fg->patternLen) == -1) {
465 				pmatch->rm_so = j - fg->patternLen;
466 				pmatch->rm_eo = j;
467 				if (!fg->wmatch || wmatch(data, dataLen,
468 				    pmatch->rm_so, pmatch->rm_eo)) {
469 					rtrnVal = 0;
470 					break;
471 				}
472 			}
473 			/* Shift if within bounds, otherwise, we are done. */
474 			if (j == fg->patternLen)
475 				break;
476 			j -= fg->qsBc[data[j - fg->patternLen - 1]];
477 		} while (j >= fg->patternLen);
478 	} else {
479 		/* Quick Search algorithm. */
480 		j = 0;
481 		do {
482 			if (grep_cmp(fg->pattern, data + j, fg->patternLen) == -1) {
483 				pmatch->rm_so = j;
484 				pmatch->rm_eo = j + fg->patternLen;
485 				if (fg->patternLen == 0 || !fg->wmatch ||
486 				    wmatch(data, dataLen, pmatch->rm_so,
487 				    pmatch->rm_eo)) {
488 					rtrnVal = 0;
489 					break;
490 				}
491 			}
492 
493 			/* Shift if within bounds, otherwise, we are done. */
494 			if (j + fg->patternLen == dataLen)
495 				break;
496 			else
497 				j += fg->qsBc[data[j + fg->patternLen]];
498 		} while (j <= (dataLen - fg->patternLen));
499 	}
500 
501 	return (rtrnVal);
502 }
503 
504 
505 void *
506 grep_malloc(size_t size)
507 {
508 	void	*ptr;
509 
510 	if ((ptr = malloc(size)) == NULL)
511 		err(2, "malloc");
512 	return ptr;
513 }
514 
515 void *
516 grep_realloc(void *ptr, size_t size)
517 {
518 	if ((ptr = realloc(ptr, size)) == NULL)
519 		err(2, "realloc");
520 	return ptr;
521 }
522 
523 /*
524  * Returns:	i >= 0 on failure (position that it failed)
525  *		-1 on success
526  */
527 static int
528 grep_cmp(const unsigned char *pattern, const unsigned char *data, size_t len)
529 {
530 	int i;
531 
532 	for (i = 0; i < len; i++) {
533 		if (((pattern[i] == data[i]) || (!Fflag && pattern[i] == '.'))
534 		    || (iflag && pattern[i] == toupper(data[i])))
535 			continue;
536 		return (i);
537 	}
538 
539 	return (-1);
540 }
541 
542 static void
543 grep_revstr(unsigned char *str, int len)
544 {
545 	int i;
546 	char c;
547 
548 	for (i = 0; i < len / 2; i++) {
549 		c = str[i];
550 		str[i] = str[len - i - 1];
551 		str[len - i - 1] = c;
552 	}
553 }
554 
555 void
556 printline(str_t *line, int sep)
557 {
558 	int n;
559 
560 	n = 0;
561 	if (!hflag) {
562 		fputs(line->file, stdout);
563 		++n;
564 	}
565 	if (nflag) {
566 		if (n)
567 			putchar(sep);
568 		printf("%d", line->line_no);
569 		++n;
570 	}
571 	if (bflag) {
572 		if (n)
573 			putchar(sep);
574 		printf("%lld", (long long)line->off);
575 		++n;
576 	}
577 	if (n)
578 		putchar(sep);
579 	fwrite(line->dat, line->len, 1, stdout);
580 	putchar('\n');
581 }
582