1 /* $NetBSD: search.c,v 1.2 2016/01/10 22:16:40 christos Exp $ */
2
3 /* search.c - searching subroutines using dfa, kwset and regex for grep.
4 Copyright 1992, 1998, 2000 Free Software Foundation, Inc.
5
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
19 02111-1307, USA. */
20
21 /* Written August 1992 by Mike Haertel. */
22
23 #ifdef HAVE_CONFIG_H
24 # include <config.h>
25 #endif
26 #include <sys/types.h>
27 #if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC && defined HAVE_WCTYPE
28 /* We can handle multibyte string. */
29 # define MBS_SUPPORT
30 # include <wchar.h>
31 # include <wctype.h>
32 #endif
33
34 #include "system.h"
35 #include "grep.h"
36 #include "regex.h"
37 #include "dfa.h"
38 #include "kwset.h"
39 #include "error.h"
40 #include "xalloc.h"
41 #ifdef HAVE_LIBPCRE
42 # include <pcre.h>
43 #endif
44
45 #define NCHAR (UCHAR_MAX + 1)
46
47 /* For -w, we also consider _ to be word constituent. */
48 #define WCHAR(C) (ISALNUM(C) || (C) == '_')
49
50 /* DFA compiled regexp. */
51 static struct dfa dfa;
52
53 /* The Regex compiled patterns. */
54 static struct patterns
55 {
56 /* Regex compiled regexp. */
57 struct re_pattern_buffer regexbuf;
58 struct re_registers regs; /* This is here on account of a BRAIN-DEAD
59 Q@#%!# library interface in regex.c. */
60 } patterns0;
61
62 struct patterns *patterns;
63 size_t pcount;
64
65 /* KWset compiled pattern. For Ecompile and Gcompile, we compile
66 a list of strings, at least one of which is known to occur in
67 any string matching the regexp. */
68 static kwset_t kwset;
69
70 /* Number of compiled fixed strings known to exactly match the regexp.
71 If kwsexec returns < kwset_exact_matches, then we don't need to
72 call the regexp matcher at all. */
73 static int kwset_exact_matches;
74
75 #if defined(MBS_SUPPORT)
76 static char* check_multibyte_string PARAMS ((char const *buf, size_t size));
77 #endif
78 static void kwsinit PARAMS ((void));
79 static void kwsmusts PARAMS ((void));
80 static void Gcompile PARAMS ((char const *, size_t));
81 static void Ecompile PARAMS ((char const *, size_t));
82 static size_t EGexecute PARAMS ((char const *, size_t, size_t *, int ));
83 static void Fcompile PARAMS ((char const *, size_t));
84 static size_t Fexecute PARAMS ((char const *, size_t, size_t *, int));
85 static void Pcompile PARAMS ((char const *, size_t ));
86 static size_t Pexecute PARAMS ((char const *, size_t, size_t *, int));
87
88 void
dfaerror(char const * mesg)89 dfaerror (char const *mesg)
90 {
91 error (2, 0, mesg);
92 }
93
94 static void
kwsinit(void)95 kwsinit (void)
96 {
97 static char trans[NCHAR];
98 int i;
99
100 if (match_icase)
101 for (i = 0; i < NCHAR; ++i)
102 trans[i] = TOLOWER (i);
103
104 if (!(kwset = kwsalloc (match_icase ? trans : (char *) 0)))
105 error (2, 0, _("memory exhausted"));
106 }
107
108 /* If the DFA turns out to have some set of fixed strings one of
109 which must occur in the match, then we build a kwset matcher
110 to find those strings, and thus quickly filter out impossible
111 matches. */
112 static void
kwsmusts(void)113 kwsmusts (void)
114 {
115 struct dfamust const *dm;
116 char const *err;
117
118 if (dfa.musts)
119 {
120 kwsinit ();
121 /* First, we compile in the substrings known to be exact
122 matches. The kwset matcher will return the index
123 of the matching string that it chooses. */
124 for (dm = dfa.musts; dm; dm = dm->next)
125 {
126 if (!dm->exact)
127 continue;
128 ++kwset_exact_matches;
129 if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != 0)
130 error (2, 0, err);
131 }
132 /* Now, we compile the substrings that will require
133 the use of the regexp matcher. */
134 for (dm = dfa.musts; dm; dm = dm->next)
135 {
136 if (dm->exact)
137 continue;
138 if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != 0)
139 error (2, 0, err);
140 }
141 if ((err = kwsprep (kwset)) != 0)
142 error (2, 0, err);
143 }
144 }
145
146 #ifdef MBS_SUPPORT
147 /* This function allocate the array which correspond to "buf".
148 Then this check multibyte string and mark on the positions which
149 are not singlebyte character nor the first byte of a multibyte
150 character. Caller must free the array. */
151 static char*
check_multibyte_string(char const * buf,size_t size)152 check_multibyte_string(char const *buf, size_t size)
153 {
154 char *mb_properties = malloc(size);
155 mbstate_t cur_state;
156 size_t i;
157 memset(&cur_state, 0, sizeof(mbstate_t));
158 memset(mb_properties, 0, sizeof(char)*size);
159 for (i = 0; i < size ;)
160 {
161 size_t mbclen;
162 mbclen = mbrlen(buf + i, size - i, &cur_state);
163
164 if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
165 {
166 /* An invalid sequence, or a truncated multibyte character.
167 We treat it as a singlebyte character. */
168 mbclen = 1;
169 }
170 mb_properties[i] = mbclen;
171 i += mbclen;
172 }
173
174 return mb_properties;
175 }
176 #endif
177
178 static void
Gcompile(char const * pattern,size_t size)179 Gcompile (char const *pattern, size_t size)
180 {
181 const char *err;
182 char const *sep;
183 size_t total = size;
184 char const *motif = pattern;
185
186 re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE);
187 dfasyntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte);
188
189 /* For GNU regex compiler we have to pass the patterns separately to detect
190 errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]"
191 GNU regex should have raise a syntax error. The same for backref, where
192 the backref should have been local to each pattern. */
193 do
194 {
195 size_t len;
196 sep = memchr (motif, '\n', total);
197 if (sep)
198 {
199 len = sep - motif;
200 sep++;
201 total -= (len + 1);
202 }
203 else
204 {
205 len = total;
206 total = 0;
207 }
208
209 patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns));
210 if (patterns == NULL)
211 error (2, errno, _("memory exhausted"));
212
213 patterns[pcount] = patterns0;
214
215 if ((err = re_compile_pattern (motif, len,
216 &(patterns[pcount].regexbuf))) != 0)
217 error (2, 0, err);
218 pcount++;
219
220 motif = sep;
221 } while (sep && total != 0);
222
223 /* In the match_words and match_lines cases, we use a different pattern
224 for the DFA matcher that will quickly throw out cases that won't work.
225 Then if DFA succeeds we do some hairy stuff using the regex matcher
226 to decide whether the match should really count. */
227 if (match_words || match_lines)
228 {
229 /* In the whole-word case, we use the pattern:
230 \(^\|[^[:alnum:]_]\)\(userpattern\)\([^[:alnum:]_]|$\).
231 In the whole-line case, we use the pattern:
232 ^\(userpattern\)$. */
233
234 static char const line_beg[] = "^\\(";
235 static char const line_end[] = "\\)$";
236 static char const word_beg[] = "\\(^\\|[^[:alnum:]_]\\)\\(";
237 static char const word_end[] = "\\)\\([^[:alnum:]_]\\|$\\)";
238 char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end);
239 size_t i;
240 strcpy (n, match_lines ? line_beg : word_beg);
241 i = strlen (n);
242 memcpy (n + i, pattern, size);
243 i += size;
244 strcpy (n + i, match_lines ? line_end : word_end);
245 i += strlen (n + i);
246 pattern = n;
247 size = i;
248 }
249
250 dfacomp (pattern, size, &dfa, 1);
251 kwsmusts ();
252 }
253
254 static void
Ecompile(char const * pattern,size_t size)255 Ecompile (char const *pattern, size_t size)
256 {
257 const char *err;
258 const char *sep;
259 size_t total = size;
260 char const *motif = pattern;
261
262 if (strcmp (matcher, "awk") == 0)
263 {
264 re_set_syntax (RE_SYNTAX_AWK);
265 dfasyntax (RE_SYNTAX_AWK, match_icase, eolbyte);
266 }
267 else
268 {
269 re_set_syntax (RE_SYNTAX_POSIX_EGREP);
270 dfasyntax (RE_SYNTAX_POSIX_EGREP, match_icase, eolbyte);
271 }
272
273 /* For GNU regex compiler we have to pass the patterns separately to detect
274 errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]"
275 GNU regex should have raise a syntax error. The same for backref, where
276 the backref should have been local to each pattern. */
277 do
278 {
279 size_t len;
280 sep = memchr (motif, '\n', total);
281 if (sep)
282 {
283 len = sep - motif;
284 sep++;
285 total -= (len + 1);
286 }
287 else
288 {
289 len = total;
290 total = 0;
291 }
292
293 patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns));
294 if (patterns == NULL)
295 error (2, errno, _("memory exhausted"));
296 patterns[pcount] = patterns0;
297
298 if ((err = re_compile_pattern (motif, len,
299 &(patterns[pcount].regexbuf))) != 0)
300 error (2, 0, err);
301 pcount++;
302
303 motif = sep;
304 } while (sep && total != 0);
305
306 /* In the match_words and match_lines cases, we use a different pattern
307 for the DFA matcher that will quickly throw out cases that won't work.
308 Then if DFA succeeds we do some hairy stuff using the regex matcher
309 to decide whether the match should really count. */
310 if (match_words || match_lines)
311 {
312 /* In the whole-word case, we use the pattern:
313 (^|[^[:alnum:]_])(userpattern)([^[:alnum:]_]|$).
314 In the whole-line case, we use the pattern:
315 ^(userpattern)$. */
316
317 static char const line_beg[] = "^(";
318 static char const line_end[] = ")$";
319 static char const word_beg[] = "(^|[^[:alnum:]_])(";
320 static char const word_end[] = ")([^[:alnum:]_]|$)";
321 char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end);
322 size_t i;
323 strcpy (n, match_lines ? line_beg : word_beg);
324 i = strlen(n);
325 memcpy (n + i, pattern, size);
326 i += size;
327 strcpy (n + i, match_lines ? line_end : word_end);
328 i += strlen (n + i);
329 pattern = n;
330 size = i;
331 }
332
333 dfacomp (pattern, size, &dfa, 1);
334 kwsmusts ();
335 }
336
337 static size_t
EGexecute(char const * buf,size_t size,size_t * match_size,int exact)338 EGexecute (char const *buf, size_t size, size_t *match_size, int exact)
339 {
340 register char const *buflim, *beg, *end;
341 char eol = eolbyte;
342 int backref;
343 ptrdiff_t start, len;
344 struct kwsmatch kwsm;
345 size_t i;
346 #ifdef MBS_SUPPORT
347 char *mb_properties = NULL;
348 #endif /* MBS_SUPPORT */
349
350 #ifdef MBS_SUPPORT
351 if (MB_CUR_MAX > 1 && kwset)
352 mb_properties = check_multibyte_string(buf, size);
353 #endif /* MBS_SUPPORT */
354
355 buflim = buf + size;
356
357 for (beg = end = buf; end < buflim; beg = end)
358 {
359 if (!exact)
360 {
361 if (kwset)
362 {
363 /* Find a possible match using the KWset matcher. */
364 size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm);
365 if (offset == (size_t) -1)
366 goto failure;
367 beg += offset;
368 /* Narrow down to the line containing the candidate, and
369 run it through DFA. */
370 end = memchr(beg, eol, buflim - beg);
371 end++;
372 #ifdef MBS_SUPPORT
373 if (MB_CUR_MAX > 1 && mb_properties[beg - buf] == 0)
374 continue;
375 #endif
376 while (beg > buf && beg[-1] != eol)
377 --beg;
378 if (kwsm.index < kwset_exact_matches)
379 goto success_in_beg_and_end;
380 if (dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1)
381 continue;
382 }
383 else
384 {
385 /* No good fixed strings; start with DFA. */
386 size_t offset = dfaexec (&dfa, beg, buflim - beg, &backref);
387 if (offset == (size_t) -1)
388 break;
389 /* Narrow down to the line we've found. */
390 beg += offset;
391 end = memchr (beg, eol, buflim - beg);
392 end++;
393 while (beg > buf && beg[-1] != eol)
394 --beg;
395 }
396 /* Successful, no backreferences encountered! */
397 if (!backref)
398 goto success_in_beg_and_end;
399 }
400 else
401 end = beg + size;
402
403 /* If we've made it to this point, this means DFA has seen
404 a probable match, and we need to run it through Regex. */
405 for (i = 0; i < pcount; i++)
406 {
407 patterns[i].regexbuf.not_eol = 0;
408 if (0 <= (start = re_search (&(patterns[i].regexbuf), beg,
409 end - beg - 1, 0,
410 end - beg - 1, &(patterns[i].regs))))
411 {
412 len = patterns[i].regs.end[0] - start;
413 if (exact && !match_words)
414 goto success_in_start_and_len;
415 if ((!match_lines && !match_words)
416 || (match_lines && len == end - beg - 1))
417 goto success_in_beg_and_end;
418 /* If -w, check if the match aligns with word boundaries.
419 We do this iteratively because:
420 (a) the line may contain more than one occurence of the
421 pattern, and
422 (b) Several alternatives in the pattern might be valid at a
423 given point, and we may need to consider a shorter one to
424 find a word boundary. */
425 if (match_words)
426 while (start >= 0)
427 {
428 if ((start == 0 || !WCHAR ((unsigned char) beg[start - 1]))
429 && (len == end - beg - 1
430 || !WCHAR ((unsigned char) beg[start + len])))
431 goto success_in_start_and_len;
432 if (len > 0)
433 {
434 /* Try a shorter length anchored at the same place. */
435 --len;
436 patterns[i].regexbuf.not_eol = 1;
437 len = re_match (&(patterns[i].regexbuf), beg,
438 start + len, start,
439 &(patterns[i].regs));
440 }
441 if (len <= 0)
442 {
443 /* Try looking further on. */
444 if (start == end - beg - 1)
445 break;
446 ++start;
447 patterns[i].regexbuf.not_eol = 0;
448 start = re_search (&(patterns[i].regexbuf), beg,
449 end - beg - 1,
450 start, end - beg - 1 - start,
451 &(patterns[i].regs));
452 len = patterns[i].regs.end[0] - start;
453 }
454 }
455 }
456 } /* for Regex patterns. */
457 } /* for (beg = end ..) */
458
459 failure:
460 #ifdef MBS_SUPPORT
461 if (MB_CUR_MAX > 1 && mb_properties)
462 free (mb_properties);
463 #endif /* MBS_SUPPORT */
464 return (size_t) -1;
465
466 success_in_beg_and_end:
467 len = end - beg;
468 start = beg - buf;
469 /* FALLTHROUGH */
470
471 success_in_start_and_len:
472 #ifdef MBS_SUPPORT
473 if (MB_CUR_MAX > 1 && mb_properties)
474 free (mb_properties);
475 #endif /* MBS_SUPPORT */
476 *match_size = len;
477 return start;
478 }
479
480 static void
Fcompile(char const * pattern,size_t size)481 Fcompile (char const *pattern, size_t size)
482 {
483 char const *beg, *lim, *err;
484
485 kwsinit ();
486 beg = pattern;
487 do
488 {
489 for (lim = beg; lim < pattern + size && *lim != '\n'; ++lim)
490 ;
491 if ((err = kwsincr (kwset, beg, lim - beg)) != 0)
492 error (2, 0, err);
493 if (lim < pattern + size)
494 ++lim;
495 beg = lim;
496 }
497 while (beg < pattern + size);
498
499 if ((err = kwsprep (kwset)) != 0)
500 error (2, 0, err);
501 }
502
503 static size_t
Fexecute(char const * buf,size_t size,size_t * match_size,int exact)504 Fexecute (char const *buf, size_t size, size_t *match_size, int exact)
505 {
506 register char const *beg, *try, *end;
507 register size_t len;
508 char eol = eolbyte;
509 struct kwsmatch kwsmatch;
510 #ifdef MBS_SUPPORT
511 char *mb_properties;
512 if (MB_CUR_MAX > 1)
513 mb_properties = check_multibyte_string (buf, size);
514 #endif /* MBS_SUPPORT */
515
516 for (beg = buf; beg <= buf + size; ++beg)
517 {
518 size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch);
519 if (offset == (size_t) -1)
520 goto failure;
521 #ifdef MBS_SUPPORT
522 if (MB_CUR_MAX > 1 && mb_properties[offset+beg-buf] == 0)
523 continue; /* It is a part of multibyte character. */
524 #endif /* MBS_SUPPORT */
525 beg += offset;
526 len = kwsmatch.size[0];
527 if (exact && !match_words)
528 goto success_in_beg_and_len;
529 if (match_lines)
530 {
531 if (beg > buf && beg[-1] != eol)
532 continue;
533 if (beg + len < buf + size && beg[len] != eol)
534 continue;
535 goto success;
536 }
537 else if (match_words)
538 {
539 while (offset >= 0)
540 {
541 if ((offset == 0 || !WCHAR ((unsigned char) beg[-1]))
542 && (len == end - beg - 1 || !WCHAR ((unsigned char) beg[len])))
543 {
544 if (!exact)
545 /* Returns the whole line now we know there's a word match. */
546 goto success;
547 else
548 /* Returns just this word match. */
549 goto success_in_beg_and_len;
550 }
551 if (len > 0)
552 {
553 /* Try a shorter length anchored at the same place. */
554 --len;
555 offset = kwsexec (kwset, beg, len, &kwsmatch);
556 if (offset == -1) {
557 break; /* Try a different anchor. */
558 }
559 beg += offset;
560 len = kwsmatch.size[0];
561 }
562 }
563 }
564 else
565 goto success;
566 }
567
568 failure:
569 #ifdef MBS_SUPPORT
570 if (MB_CUR_MAX > 1)
571 free (mb_properties);
572 #endif /* MBS_SUPPORT */
573 return -1;
574
575 success:
576 end = memchr (beg + len, eol, (buf + size) - (beg + len));
577 end++;
578 while (buf < beg && beg[-1] != eol)
579 --beg;
580 len = end - beg;
581 /* FALLTHROUGH */
582
583 success_in_beg_and_len:
584 *match_size = len;
585 #ifdef MBS_SUPPORT
586 if (MB_CUR_MAX > 1)
587 free (mb_properties);
588 #endif /* MBS_SUPPORT */
589 return beg - buf;
590 }
591
592 #if HAVE_LIBPCRE
593 /* Compiled internal form of a Perl regular expression. */
594 static pcre *cre;
595
596 /* Additional information about the pattern. */
597 static pcre_extra *extra;
598 #endif
599
600 static void
Pcompile(char const * pattern,size_t size)601 Pcompile (char const *pattern, size_t size)
602 {
603 #if !HAVE_LIBPCRE
604 error (2, 0, _("The -P option is not supported"));
605 #else
606 int e;
607 char const *ep;
608 char *re = xmalloc (4 * size + 7);
609 int flags = PCRE_MULTILINE | (match_icase ? PCRE_CASELESS : 0);
610 char const *patlim = pattern + size;
611 char *n = re;
612 char const *p;
613 char const *pnul;
614
615 /* FIXME: Remove this restriction. */
616 if (eolbyte != '\n')
617 error (2, 0, _("The -P and -z options cannot be combined"));
618
619 *n = '\0';
620 if (match_lines)
621 strcpy (n, "^(");
622 if (match_words)
623 strcpy (n, "\\b(");
624 n += strlen (n);
625
626 /* The PCRE interface doesn't allow NUL bytes in the pattern, so
627 replace each NUL byte in the pattern with the four characters
628 "\000", removing a preceding backslash if there are an odd
629 number of backslashes before the NUL.
630
631 FIXME: This method does not work with some multibyte character
632 encodings, notably Shift-JIS, where a multibyte character can end
633 in a backslash byte. */
634 for (p = pattern; (pnul = memchr (p, '\0', patlim - p)); p = pnul + 1)
635 {
636 memcpy (n, p, pnul - p);
637 n += pnul - p;
638 for (p = pnul; pattern < p && p[-1] == '\\'; p--)
639 continue;
640 n -= (pnul - p) & 1;
641 strcpy (n, "\\000");
642 n += 4;
643 }
644
645 memcpy (n, p, patlim - p);
646 n += patlim - p;
647 *n = '\0';
648 if (match_words)
649 strcpy (n, ")\\b");
650 if (match_lines)
651 strcpy (n, ")$");
652
653 cre = pcre_compile (re, flags, &ep, &e, pcre_maketables ());
654 if (!cre)
655 error (2, 0, ep);
656
657 extra = pcre_study (cre, 0, &ep);
658 if (ep)
659 error (2, 0, ep);
660
661 free (re);
662 #endif
663 }
664
665 static size_t
Pexecute(char const * buf,size_t size,size_t * match_size,int exact)666 Pexecute (char const *buf, size_t size, size_t *match_size, int exact)
667 {
668 #if !HAVE_LIBPCRE
669 abort ();
670 return -1;
671 #else
672 /* This array must have at least two elements; everything after that
673 is just for performance improvement in pcre_exec. */
674 int sub[300];
675
676 int e = pcre_exec (cre, extra, buf, size, 0, 0,
677 sub, sizeof sub / sizeof *sub);
678
679 if (e <= 0)
680 {
681 switch (e)
682 {
683 case PCRE_ERROR_NOMATCH:
684 return -1;
685
686 case PCRE_ERROR_NOMEMORY:
687 error (2, 0, _("Memory exhausted"));
688
689 default:
690 abort ();
691 }
692 }
693 else
694 {
695 /* Narrow down to the line we've found. */
696 char const *beg = buf + sub[0];
697 char const *end = buf + sub[1];
698 char const *buflim = buf + size;
699 char eol = eolbyte;
700 if (!exact)
701 {
702 end = memchr (end, eol, buflim - end);
703 end++;
704 while (buf < beg && beg[-1] != eol)
705 --beg;
706 }
707
708 *match_size = end - beg;
709 return beg - buf;
710 }
711 #endif
712 }
713
714 struct matcher const matchers[] = {
715 { "default", Gcompile, EGexecute },
716 { "grep", Gcompile, EGexecute },
717 { "egrep", Ecompile, EGexecute },
718 { "awk", Ecompile, EGexecute },
719 { "fgrep", Fcompile, Fexecute },
720 { "perl", Pcompile, Pexecute },
721 { "", 0, 0 },
722 };
723