1*fb69a85aSchristos /* $NetBSD: search.c,v 1.2 2016/01/10 22:16:40 christos Exp $ */
2a8fa202aSchristos
3a8fa202aSchristos /* search.c - searching subroutines using dfa, kwset and regex for grep.
4a8fa202aSchristos Copyright 1992, 1998, 2000 Free Software Foundation, Inc.
5a8fa202aSchristos
6a8fa202aSchristos This program is free software; you can redistribute it and/or modify
7a8fa202aSchristos it under the terms of the GNU General Public License as published by
8a8fa202aSchristos the Free Software Foundation; either version 2, or (at your option)
9a8fa202aSchristos any later version.
10a8fa202aSchristos
11a8fa202aSchristos This program is distributed in the hope that it will be useful,
12a8fa202aSchristos but WITHOUT ANY WARRANTY; without even the implied warranty of
13a8fa202aSchristos MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14a8fa202aSchristos GNU General Public License for more details.
15a8fa202aSchristos
16a8fa202aSchristos You should have received a copy of the GNU General Public License
17a8fa202aSchristos along with this program; if not, write to the Free Software
18a8fa202aSchristos Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
19a8fa202aSchristos 02111-1307, USA. */
20a8fa202aSchristos
21a8fa202aSchristos /* Written August 1992 by Mike Haertel. */
22a8fa202aSchristos
23a8fa202aSchristos #ifdef HAVE_CONFIG_H
24a8fa202aSchristos # include <config.h>
25a8fa202aSchristos #endif
26a8fa202aSchristos #include <sys/types.h>
27*fb69a85aSchristos #if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC && defined HAVE_WCTYPE
28a8fa202aSchristos /* We can handle multibyte string. */
29a8fa202aSchristos # define MBS_SUPPORT
30a8fa202aSchristos # include <wchar.h>
31a8fa202aSchristos # include <wctype.h>
32a8fa202aSchristos #endif
33a8fa202aSchristos
34a8fa202aSchristos #include "system.h"
35a8fa202aSchristos #include "grep.h"
36a8fa202aSchristos #include "regex.h"
37a8fa202aSchristos #include "dfa.h"
38a8fa202aSchristos #include "kwset.h"
39a8fa202aSchristos #include "error.h"
40a8fa202aSchristos #include "xalloc.h"
41a8fa202aSchristos #ifdef HAVE_LIBPCRE
42a8fa202aSchristos # include <pcre.h>
43a8fa202aSchristos #endif
44a8fa202aSchristos
45a8fa202aSchristos #define NCHAR (UCHAR_MAX + 1)
46a8fa202aSchristos
47a8fa202aSchristos /* For -w, we also consider _ to be word constituent. */
48a8fa202aSchristos #define WCHAR(C) (ISALNUM(C) || (C) == '_')
49a8fa202aSchristos
50a8fa202aSchristos /* DFA compiled regexp. */
51a8fa202aSchristos static struct dfa dfa;
52a8fa202aSchristos
53a8fa202aSchristos /* The Regex compiled patterns. */
54a8fa202aSchristos static struct patterns
55a8fa202aSchristos {
56a8fa202aSchristos /* Regex compiled regexp. */
57a8fa202aSchristos struct re_pattern_buffer regexbuf;
58a8fa202aSchristos struct re_registers regs; /* This is here on account of a BRAIN-DEAD
59a8fa202aSchristos Q@#%!# library interface in regex.c. */
60a8fa202aSchristos } patterns0;
61a8fa202aSchristos
62a8fa202aSchristos struct patterns *patterns;
63a8fa202aSchristos size_t pcount;
64a8fa202aSchristos
65a8fa202aSchristos /* KWset compiled pattern. For Ecompile and Gcompile, we compile
66a8fa202aSchristos a list of strings, at least one of which is known to occur in
67a8fa202aSchristos any string matching the regexp. */
68a8fa202aSchristos static kwset_t kwset;
69a8fa202aSchristos
70a8fa202aSchristos /* Number of compiled fixed strings known to exactly match the regexp.
71a8fa202aSchristos If kwsexec returns < kwset_exact_matches, then we don't need to
72a8fa202aSchristos call the regexp matcher at all. */
73a8fa202aSchristos static int kwset_exact_matches;
74a8fa202aSchristos
75a8fa202aSchristos #if defined(MBS_SUPPORT)
76a8fa202aSchristos static char* check_multibyte_string PARAMS ((char const *buf, size_t size));
77a8fa202aSchristos #endif
78a8fa202aSchristos static void kwsinit PARAMS ((void));
79a8fa202aSchristos static void kwsmusts PARAMS ((void));
80a8fa202aSchristos static void Gcompile PARAMS ((char const *, size_t));
81a8fa202aSchristos static void Ecompile PARAMS ((char const *, size_t));
82a8fa202aSchristos static size_t EGexecute PARAMS ((char const *, size_t, size_t *, int ));
83a8fa202aSchristos static void Fcompile PARAMS ((char const *, size_t));
84a8fa202aSchristos static size_t Fexecute PARAMS ((char const *, size_t, size_t *, int));
85a8fa202aSchristos static void Pcompile PARAMS ((char const *, size_t ));
86a8fa202aSchristos static size_t Pexecute PARAMS ((char const *, size_t, size_t *, int));
87a8fa202aSchristos
88a8fa202aSchristos void
dfaerror(char const * mesg)89a8fa202aSchristos dfaerror (char const *mesg)
90a8fa202aSchristos {
91a8fa202aSchristos error (2, 0, mesg);
92a8fa202aSchristos }
93a8fa202aSchristos
94a8fa202aSchristos static void
kwsinit(void)95a8fa202aSchristos kwsinit (void)
96a8fa202aSchristos {
97a8fa202aSchristos static char trans[NCHAR];
98a8fa202aSchristos int i;
99a8fa202aSchristos
100a8fa202aSchristos if (match_icase)
101a8fa202aSchristos for (i = 0; i < NCHAR; ++i)
102a8fa202aSchristos trans[i] = TOLOWER (i);
103a8fa202aSchristos
104a8fa202aSchristos if (!(kwset = kwsalloc (match_icase ? trans : (char *) 0)))
105a8fa202aSchristos error (2, 0, _("memory exhausted"));
106a8fa202aSchristos }
107a8fa202aSchristos
108a8fa202aSchristos /* If the DFA turns out to have some set of fixed strings one of
109a8fa202aSchristos which must occur in the match, then we build a kwset matcher
110a8fa202aSchristos to find those strings, and thus quickly filter out impossible
111a8fa202aSchristos matches. */
112a8fa202aSchristos static void
kwsmusts(void)113a8fa202aSchristos kwsmusts (void)
114a8fa202aSchristos {
115a8fa202aSchristos struct dfamust const *dm;
116a8fa202aSchristos char const *err;
117a8fa202aSchristos
118a8fa202aSchristos if (dfa.musts)
119a8fa202aSchristos {
120a8fa202aSchristos kwsinit ();
121a8fa202aSchristos /* First, we compile in the substrings known to be exact
122a8fa202aSchristos matches. The kwset matcher will return the index
123a8fa202aSchristos of the matching string that it chooses. */
124a8fa202aSchristos for (dm = dfa.musts; dm; dm = dm->next)
125a8fa202aSchristos {
126a8fa202aSchristos if (!dm->exact)
127a8fa202aSchristos continue;
128a8fa202aSchristos ++kwset_exact_matches;
129a8fa202aSchristos if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != 0)
130a8fa202aSchristos error (2, 0, err);
131a8fa202aSchristos }
132a8fa202aSchristos /* Now, we compile the substrings that will require
133a8fa202aSchristos the use of the regexp matcher. */
134a8fa202aSchristos for (dm = dfa.musts; dm; dm = dm->next)
135a8fa202aSchristos {
136a8fa202aSchristos if (dm->exact)
137a8fa202aSchristos continue;
138a8fa202aSchristos if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != 0)
139a8fa202aSchristos error (2, 0, err);
140a8fa202aSchristos }
141a8fa202aSchristos if ((err = kwsprep (kwset)) != 0)
142a8fa202aSchristos error (2, 0, err);
143a8fa202aSchristos }
144a8fa202aSchristos }
145a8fa202aSchristos
146a8fa202aSchristos #ifdef MBS_SUPPORT
147a8fa202aSchristos /* This function allocate the array which correspond to "buf".
148a8fa202aSchristos Then this check multibyte string and mark on the positions which
149a8fa202aSchristos are not singlebyte character nor the first byte of a multibyte
150a8fa202aSchristos character. Caller must free the array. */
151a8fa202aSchristos static char*
check_multibyte_string(char const * buf,size_t size)152a8fa202aSchristos check_multibyte_string(char const *buf, size_t size)
153a8fa202aSchristos {
154a8fa202aSchristos char *mb_properties = malloc(size);
155a8fa202aSchristos mbstate_t cur_state;
156*fb69a85aSchristos size_t i;
157a8fa202aSchristos memset(&cur_state, 0, sizeof(mbstate_t));
158a8fa202aSchristos memset(mb_properties, 0, sizeof(char)*size);
159a8fa202aSchristos for (i = 0; i < size ;)
160a8fa202aSchristos {
161a8fa202aSchristos size_t mbclen;
162a8fa202aSchristos mbclen = mbrlen(buf + i, size - i, &cur_state);
163a8fa202aSchristos
164a8fa202aSchristos if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
165a8fa202aSchristos {
166a8fa202aSchristos /* An invalid sequence, or a truncated multibyte character.
167a8fa202aSchristos We treat it as a singlebyte character. */
168a8fa202aSchristos mbclen = 1;
169a8fa202aSchristos }
170a8fa202aSchristos mb_properties[i] = mbclen;
171a8fa202aSchristos i += mbclen;
172a8fa202aSchristos }
173a8fa202aSchristos
174a8fa202aSchristos return mb_properties;
175a8fa202aSchristos }
176a8fa202aSchristos #endif
177a8fa202aSchristos
178a8fa202aSchristos static void
Gcompile(char const * pattern,size_t size)179a8fa202aSchristos Gcompile (char const *pattern, size_t size)
180a8fa202aSchristos {
181a8fa202aSchristos const char *err;
182a8fa202aSchristos char const *sep;
183a8fa202aSchristos size_t total = size;
184a8fa202aSchristos char const *motif = pattern;
185a8fa202aSchristos
186a8fa202aSchristos re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE);
187a8fa202aSchristos dfasyntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte);
188a8fa202aSchristos
189a8fa202aSchristos /* For GNU regex compiler we have to pass the patterns separately to detect
190a8fa202aSchristos errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]"
191a8fa202aSchristos GNU regex should have raise a syntax error. The same for backref, where
192a8fa202aSchristos the backref should have been local to each pattern. */
193a8fa202aSchristos do
194a8fa202aSchristos {
195a8fa202aSchristos size_t len;
196a8fa202aSchristos sep = memchr (motif, '\n', total);
197a8fa202aSchristos if (sep)
198a8fa202aSchristos {
199a8fa202aSchristos len = sep - motif;
200a8fa202aSchristos sep++;
201a8fa202aSchristos total -= (len + 1);
202a8fa202aSchristos }
203a8fa202aSchristos else
204a8fa202aSchristos {
205a8fa202aSchristos len = total;
206a8fa202aSchristos total = 0;
207a8fa202aSchristos }
208a8fa202aSchristos
209a8fa202aSchristos patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns));
210a8fa202aSchristos if (patterns == NULL)
211a8fa202aSchristos error (2, errno, _("memory exhausted"));
212a8fa202aSchristos
213a8fa202aSchristos patterns[pcount] = patterns0;
214a8fa202aSchristos
215a8fa202aSchristos if ((err = re_compile_pattern (motif, len,
216a8fa202aSchristos &(patterns[pcount].regexbuf))) != 0)
217a8fa202aSchristos error (2, 0, err);
218a8fa202aSchristos pcount++;
219a8fa202aSchristos
220a8fa202aSchristos motif = sep;
221a8fa202aSchristos } while (sep && total != 0);
222a8fa202aSchristos
223a8fa202aSchristos /* In the match_words and match_lines cases, we use a different pattern
224a8fa202aSchristos for the DFA matcher that will quickly throw out cases that won't work.
225a8fa202aSchristos Then if DFA succeeds we do some hairy stuff using the regex matcher
226a8fa202aSchristos to decide whether the match should really count. */
227a8fa202aSchristos if (match_words || match_lines)
228a8fa202aSchristos {
229a8fa202aSchristos /* In the whole-word case, we use the pattern:
230a8fa202aSchristos \(^\|[^[:alnum:]_]\)\(userpattern\)\([^[:alnum:]_]|$\).
231a8fa202aSchristos In the whole-line case, we use the pattern:
232a8fa202aSchristos ^\(userpattern\)$. */
233a8fa202aSchristos
234a8fa202aSchristos static char const line_beg[] = "^\\(";
235a8fa202aSchristos static char const line_end[] = "\\)$";
236a8fa202aSchristos static char const word_beg[] = "\\(^\\|[^[:alnum:]_]\\)\\(";
237a8fa202aSchristos static char const word_end[] = "\\)\\([^[:alnum:]_]\\|$\\)";
238a8fa202aSchristos char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end);
239a8fa202aSchristos size_t i;
240a8fa202aSchristos strcpy (n, match_lines ? line_beg : word_beg);
241a8fa202aSchristos i = strlen (n);
242a8fa202aSchristos memcpy (n + i, pattern, size);
243a8fa202aSchristos i += size;
244a8fa202aSchristos strcpy (n + i, match_lines ? line_end : word_end);
245a8fa202aSchristos i += strlen (n + i);
246a8fa202aSchristos pattern = n;
247a8fa202aSchristos size = i;
248a8fa202aSchristos }
249a8fa202aSchristos
250a8fa202aSchristos dfacomp (pattern, size, &dfa, 1);
251a8fa202aSchristos kwsmusts ();
252a8fa202aSchristos }
253a8fa202aSchristos
254a8fa202aSchristos static void
Ecompile(char const * pattern,size_t size)255a8fa202aSchristos Ecompile (char const *pattern, size_t size)
256a8fa202aSchristos {
257a8fa202aSchristos const char *err;
258a8fa202aSchristos const char *sep;
259a8fa202aSchristos size_t total = size;
260a8fa202aSchristos char const *motif = pattern;
261a8fa202aSchristos
262a8fa202aSchristos if (strcmp (matcher, "awk") == 0)
263a8fa202aSchristos {
264a8fa202aSchristos re_set_syntax (RE_SYNTAX_AWK);
265a8fa202aSchristos dfasyntax (RE_SYNTAX_AWK, match_icase, eolbyte);
266a8fa202aSchristos }
267a8fa202aSchristos else
268a8fa202aSchristos {
269a8fa202aSchristos re_set_syntax (RE_SYNTAX_POSIX_EGREP);
270a8fa202aSchristos dfasyntax (RE_SYNTAX_POSIX_EGREP, match_icase, eolbyte);
271a8fa202aSchristos }
272a8fa202aSchristos
273a8fa202aSchristos /* For GNU regex compiler we have to pass the patterns separately to detect
274a8fa202aSchristos errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]"
275a8fa202aSchristos GNU regex should have raise a syntax error. The same for backref, where
276a8fa202aSchristos the backref should have been local to each pattern. */
277a8fa202aSchristos do
278a8fa202aSchristos {
279a8fa202aSchristos size_t len;
280a8fa202aSchristos sep = memchr (motif, '\n', total);
281a8fa202aSchristos if (sep)
282a8fa202aSchristos {
283a8fa202aSchristos len = sep - motif;
284a8fa202aSchristos sep++;
285a8fa202aSchristos total -= (len + 1);
286a8fa202aSchristos }
287a8fa202aSchristos else
288a8fa202aSchristos {
289a8fa202aSchristos len = total;
290a8fa202aSchristos total = 0;
291a8fa202aSchristos }
292a8fa202aSchristos
293a8fa202aSchristos patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns));
294a8fa202aSchristos if (patterns == NULL)
295a8fa202aSchristos error (2, errno, _("memory exhausted"));
296a8fa202aSchristos patterns[pcount] = patterns0;
297a8fa202aSchristos
298a8fa202aSchristos if ((err = re_compile_pattern (motif, len,
299a8fa202aSchristos &(patterns[pcount].regexbuf))) != 0)
300a8fa202aSchristos error (2, 0, err);
301a8fa202aSchristos pcount++;
302a8fa202aSchristos
303a8fa202aSchristos motif = sep;
304a8fa202aSchristos } while (sep && total != 0);
305a8fa202aSchristos
306a8fa202aSchristos /* In the match_words and match_lines cases, we use a different pattern
307a8fa202aSchristos for the DFA matcher that will quickly throw out cases that won't work.
308a8fa202aSchristos Then if DFA succeeds we do some hairy stuff using the regex matcher
309a8fa202aSchristos to decide whether the match should really count. */
310a8fa202aSchristos if (match_words || match_lines)
311a8fa202aSchristos {
312a8fa202aSchristos /* In the whole-word case, we use the pattern:
313a8fa202aSchristos (^|[^[:alnum:]_])(userpattern)([^[:alnum:]_]|$).
314a8fa202aSchristos In the whole-line case, we use the pattern:
315a8fa202aSchristos ^(userpattern)$. */
316a8fa202aSchristos
317a8fa202aSchristos static char const line_beg[] = "^(";
318a8fa202aSchristos static char const line_end[] = ")$";
319a8fa202aSchristos static char const word_beg[] = "(^|[^[:alnum:]_])(";
320a8fa202aSchristos static char const word_end[] = ")([^[:alnum:]_]|$)";
321a8fa202aSchristos char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end);
322a8fa202aSchristos size_t i;
323a8fa202aSchristos strcpy (n, match_lines ? line_beg : word_beg);
324a8fa202aSchristos i = strlen(n);
325a8fa202aSchristos memcpy (n + i, pattern, size);
326a8fa202aSchristos i += size;
327a8fa202aSchristos strcpy (n + i, match_lines ? line_end : word_end);
328a8fa202aSchristos i += strlen (n + i);
329a8fa202aSchristos pattern = n;
330a8fa202aSchristos size = i;
331a8fa202aSchristos }
332a8fa202aSchristos
333a8fa202aSchristos dfacomp (pattern, size, &dfa, 1);
334a8fa202aSchristos kwsmusts ();
335a8fa202aSchristos }
336a8fa202aSchristos
337a8fa202aSchristos static size_t
EGexecute(char const * buf,size_t size,size_t * match_size,int exact)338a8fa202aSchristos EGexecute (char const *buf, size_t size, size_t *match_size, int exact)
339a8fa202aSchristos {
340a8fa202aSchristos register char const *buflim, *beg, *end;
341a8fa202aSchristos char eol = eolbyte;
342*fb69a85aSchristos int backref;
343*fb69a85aSchristos ptrdiff_t start, len;
344a8fa202aSchristos struct kwsmatch kwsm;
345a8fa202aSchristos size_t i;
346a8fa202aSchristos #ifdef MBS_SUPPORT
347a8fa202aSchristos char *mb_properties = NULL;
348a8fa202aSchristos #endif /* MBS_SUPPORT */
349a8fa202aSchristos
350a8fa202aSchristos #ifdef MBS_SUPPORT
351a8fa202aSchristos if (MB_CUR_MAX > 1 && kwset)
352a8fa202aSchristos mb_properties = check_multibyte_string(buf, size);
353a8fa202aSchristos #endif /* MBS_SUPPORT */
354a8fa202aSchristos
355a8fa202aSchristos buflim = buf + size;
356a8fa202aSchristos
357a8fa202aSchristos for (beg = end = buf; end < buflim; beg = end)
358a8fa202aSchristos {
359a8fa202aSchristos if (!exact)
360a8fa202aSchristos {
361a8fa202aSchristos if (kwset)
362a8fa202aSchristos {
363a8fa202aSchristos /* Find a possible match using the KWset matcher. */
364a8fa202aSchristos size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm);
365a8fa202aSchristos if (offset == (size_t) -1)
366*fb69a85aSchristos goto failure;
367a8fa202aSchristos beg += offset;
368a8fa202aSchristos /* Narrow down to the line containing the candidate, and
369a8fa202aSchristos run it through DFA. */
370a8fa202aSchristos end = memchr(beg, eol, buflim - beg);
371a8fa202aSchristos end++;
372a8fa202aSchristos #ifdef MBS_SUPPORT
373a8fa202aSchristos if (MB_CUR_MAX > 1 && mb_properties[beg - buf] == 0)
374a8fa202aSchristos continue;
375a8fa202aSchristos #endif
376a8fa202aSchristos while (beg > buf && beg[-1] != eol)
377a8fa202aSchristos --beg;
378a8fa202aSchristos if (kwsm.index < kwset_exact_matches)
379*fb69a85aSchristos goto success_in_beg_and_end;
380a8fa202aSchristos if (dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1)
381a8fa202aSchristos continue;
382a8fa202aSchristos }
383a8fa202aSchristos else
384a8fa202aSchristos {
385a8fa202aSchristos /* No good fixed strings; start with DFA. */
386a8fa202aSchristos size_t offset = dfaexec (&dfa, beg, buflim - beg, &backref);
387a8fa202aSchristos if (offset == (size_t) -1)
388a8fa202aSchristos break;
389a8fa202aSchristos /* Narrow down to the line we've found. */
390a8fa202aSchristos beg += offset;
391a8fa202aSchristos end = memchr (beg, eol, buflim - beg);
392a8fa202aSchristos end++;
393a8fa202aSchristos while (beg > buf && beg[-1] != eol)
394a8fa202aSchristos --beg;
395a8fa202aSchristos }
396a8fa202aSchristos /* Successful, no backreferences encountered! */
397a8fa202aSchristos if (!backref)
398*fb69a85aSchristos goto success_in_beg_and_end;
399a8fa202aSchristos }
400a8fa202aSchristos else
401a8fa202aSchristos end = beg + size;
402a8fa202aSchristos
403a8fa202aSchristos /* If we've made it to this point, this means DFA has seen
404a8fa202aSchristos a probable match, and we need to run it through Regex. */
405a8fa202aSchristos for (i = 0; i < pcount; i++)
406a8fa202aSchristos {
407a8fa202aSchristos patterns[i].regexbuf.not_eol = 0;
408a8fa202aSchristos if (0 <= (start = re_search (&(patterns[i].regexbuf), beg,
409a8fa202aSchristos end - beg - 1, 0,
410a8fa202aSchristos end - beg - 1, &(patterns[i].regs))))
411a8fa202aSchristos {
412a8fa202aSchristos len = patterns[i].regs.end[0] - start;
413*fb69a85aSchristos if (exact && !match_words)
414*fb69a85aSchristos goto success_in_start_and_len;
415a8fa202aSchristos if ((!match_lines && !match_words)
416a8fa202aSchristos || (match_lines && len == end - beg - 1))
417*fb69a85aSchristos goto success_in_beg_and_end;
418a8fa202aSchristos /* If -w, check if the match aligns with word boundaries.
419a8fa202aSchristos We do this iteratively because:
420a8fa202aSchristos (a) the line may contain more than one occurence of the
421a8fa202aSchristos pattern, and
422a8fa202aSchristos (b) Several alternatives in the pattern might be valid at a
423a8fa202aSchristos given point, and we may need to consider a shorter one to
424a8fa202aSchristos find a word boundary. */
425a8fa202aSchristos if (match_words)
426a8fa202aSchristos while (start >= 0)
427a8fa202aSchristos {
428a8fa202aSchristos if ((start == 0 || !WCHAR ((unsigned char) beg[start - 1]))
429a8fa202aSchristos && (len == end - beg - 1
430a8fa202aSchristos || !WCHAR ((unsigned char) beg[start + len])))
431*fb69a85aSchristos goto success_in_start_and_len;
432a8fa202aSchristos if (len > 0)
433a8fa202aSchristos {
434a8fa202aSchristos /* Try a shorter length anchored at the same place. */
435a8fa202aSchristos --len;
436a8fa202aSchristos patterns[i].regexbuf.not_eol = 1;
437a8fa202aSchristos len = re_match (&(patterns[i].regexbuf), beg,
438a8fa202aSchristos start + len, start,
439a8fa202aSchristos &(patterns[i].regs));
440a8fa202aSchristos }
441a8fa202aSchristos if (len <= 0)
442a8fa202aSchristos {
443a8fa202aSchristos /* Try looking further on. */
444a8fa202aSchristos if (start == end - beg - 1)
445a8fa202aSchristos break;
446a8fa202aSchristos ++start;
447a8fa202aSchristos patterns[i].regexbuf.not_eol = 0;
448a8fa202aSchristos start = re_search (&(patterns[i].regexbuf), beg,
449a8fa202aSchristos end - beg - 1,
450a8fa202aSchristos start, end - beg - 1 - start,
451a8fa202aSchristos &(patterns[i].regs));
452a8fa202aSchristos len = patterns[i].regs.end[0] - start;
453a8fa202aSchristos }
454a8fa202aSchristos }
455a8fa202aSchristos }
456a8fa202aSchristos } /* for Regex patterns. */
457a8fa202aSchristos } /* for (beg = end ..) */
458*fb69a85aSchristos
459*fb69a85aSchristos failure:
460a8fa202aSchristos #ifdef MBS_SUPPORT
461a8fa202aSchristos if (MB_CUR_MAX > 1 && mb_properties)
462a8fa202aSchristos free (mb_properties);
463a8fa202aSchristos #endif /* MBS_SUPPORT */
464a8fa202aSchristos return (size_t) -1;
465a8fa202aSchristos
466*fb69a85aSchristos success_in_beg_and_end:
467*fb69a85aSchristos len = end - beg;
468*fb69a85aSchristos start = beg - buf;
469*fb69a85aSchristos /* FALLTHROUGH */
470*fb69a85aSchristos
471*fb69a85aSchristos success_in_start_and_len:
472a8fa202aSchristos #ifdef MBS_SUPPORT
473a8fa202aSchristos if (MB_CUR_MAX > 1 && mb_properties)
474a8fa202aSchristos free (mb_properties);
475a8fa202aSchristos #endif /* MBS_SUPPORT */
476*fb69a85aSchristos *match_size = len;
477*fb69a85aSchristos return start;
478a8fa202aSchristos }
479a8fa202aSchristos
480a8fa202aSchristos static void
Fcompile(char const * pattern,size_t size)481a8fa202aSchristos Fcompile (char const *pattern, size_t size)
482a8fa202aSchristos {
483a8fa202aSchristos char const *beg, *lim, *err;
484a8fa202aSchristos
485a8fa202aSchristos kwsinit ();
486a8fa202aSchristos beg = pattern;
487a8fa202aSchristos do
488a8fa202aSchristos {
489a8fa202aSchristos for (lim = beg; lim < pattern + size && *lim != '\n'; ++lim)
490a8fa202aSchristos ;
491a8fa202aSchristos if ((err = kwsincr (kwset, beg, lim - beg)) != 0)
492a8fa202aSchristos error (2, 0, err);
493a8fa202aSchristos if (lim < pattern + size)
494a8fa202aSchristos ++lim;
495a8fa202aSchristos beg = lim;
496a8fa202aSchristos }
497a8fa202aSchristos while (beg < pattern + size);
498a8fa202aSchristos
499a8fa202aSchristos if ((err = kwsprep (kwset)) != 0)
500a8fa202aSchristos error (2, 0, err);
501a8fa202aSchristos }
502a8fa202aSchristos
503a8fa202aSchristos static size_t
Fexecute(char const * buf,size_t size,size_t * match_size,int exact)504a8fa202aSchristos Fexecute (char const *buf, size_t size, size_t *match_size, int exact)
505a8fa202aSchristos {
506a8fa202aSchristos register char const *beg, *try, *end;
507a8fa202aSchristos register size_t len;
508a8fa202aSchristos char eol = eolbyte;
509a8fa202aSchristos struct kwsmatch kwsmatch;
510a8fa202aSchristos #ifdef MBS_SUPPORT
511a8fa202aSchristos char *mb_properties;
512a8fa202aSchristos if (MB_CUR_MAX > 1)
513a8fa202aSchristos mb_properties = check_multibyte_string (buf, size);
514a8fa202aSchristos #endif /* MBS_SUPPORT */
515a8fa202aSchristos
516a8fa202aSchristos for (beg = buf; beg <= buf + size; ++beg)
517a8fa202aSchristos {
518a8fa202aSchristos size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch);
519a8fa202aSchristos if (offset == (size_t) -1)
520*fb69a85aSchristos goto failure;
521a8fa202aSchristos #ifdef MBS_SUPPORT
522a8fa202aSchristos if (MB_CUR_MAX > 1 && mb_properties[offset+beg-buf] == 0)
523a8fa202aSchristos continue; /* It is a part of multibyte character. */
524a8fa202aSchristos #endif /* MBS_SUPPORT */
525a8fa202aSchristos beg += offset;
526a8fa202aSchristos len = kwsmatch.size[0];
527*fb69a85aSchristos if (exact && !match_words)
528*fb69a85aSchristos goto success_in_beg_and_len;
529a8fa202aSchristos if (match_lines)
530a8fa202aSchristos {
531a8fa202aSchristos if (beg > buf && beg[-1] != eol)
532a8fa202aSchristos continue;
533a8fa202aSchristos if (beg + len < buf + size && beg[len] != eol)
534a8fa202aSchristos continue;
535a8fa202aSchristos goto success;
536a8fa202aSchristos }
537a8fa202aSchristos else if (match_words)
538a8fa202aSchristos {
539*fb69a85aSchristos while (offset >= 0)
540a8fa202aSchristos {
541*fb69a85aSchristos if ((offset == 0 || !WCHAR ((unsigned char) beg[-1]))
542*fb69a85aSchristos && (len == end - beg - 1 || !WCHAR ((unsigned char) beg[len])))
543a8fa202aSchristos {
544*fb69a85aSchristos if (!exact)
545*fb69a85aSchristos /* Returns the whole line now we know there's a word match. */
546*fb69a85aSchristos goto success;
547*fb69a85aSchristos else
548*fb69a85aSchristos /* Returns just this word match. */
549*fb69a85aSchristos goto success_in_beg_and_len;
550a8fa202aSchristos }
551*fb69a85aSchristos if (len > 0)
552*fb69a85aSchristos {
553*fb69a85aSchristos /* Try a shorter length anchored at the same place. */
554*fb69a85aSchristos --len;
555*fb69a85aSchristos offset = kwsexec (kwset, beg, len, &kwsmatch);
556*fb69a85aSchristos if (offset == -1) {
557*fb69a85aSchristos break; /* Try a different anchor. */
558*fb69a85aSchristos }
559*fb69a85aSchristos beg += offset;
560a8fa202aSchristos len = kwsmatch.size[0];
561a8fa202aSchristos }
562*fb69a85aSchristos }
563a8fa202aSchristos }
564a8fa202aSchristos else
565a8fa202aSchristos goto success;
566a8fa202aSchristos }
567a8fa202aSchristos
568*fb69a85aSchristos failure:
569a8fa202aSchristos #ifdef MBS_SUPPORT
570a8fa202aSchristos if (MB_CUR_MAX > 1)
571a8fa202aSchristos free (mb_properties);
572a8fa202aSchristos #endif /* MBS_SUPPORT */
573a8fa202aSchristos return -1;
574a8fa202aSchristos
575a8fa202aSchristos success:
576a8fa202aSchristos end = memchr (beg + len, eol, (buf + size) - (beg + len));
577a8fa202aSchristos end++;
578a8fa202aSchristos while (buf < beg && beg[-1] != eol)
579a8fa202aSchristos --beg;
580*fb69a85aSchristos len = end - beg;
581*fb69a85aSchristos /* FALLTHROUGH */
582*fb69a85aSchristos
583*fb69a85aSchristos success_in_beg_and_len:
584*fb69a85aSchristos *match_size = len;
585a8fa202aSchristos #ifdef MBS_SUPPORT
586a8fa202aSchristos if (MB_CUR_MAX > 1)
587a8fa202aSchristos free (mb_properties);
588a8fa202aSchristos #endif /* MBS_SUPPORT */
589a8fa202aSchristos return beg - buf;
590a8fa202aSchristos }
591a8fa202aSchristos
592a8fa202aSchristos #if HAVE_LIBPCRE
593a8fa202aSchristos /* Compiled internal form of a Perl regular expression. */
594a8fa202aSchristos static pcre *cre;
595a8fa202aSchristos
596a8fa202aSchristos /* Additional information about the pattern. */
597a8fa202aSchristos static pcre_extra *extra;
598a8fa202aSchristos #endif
599a8fa202aSchristos
600a8fa202aSchristos static void
Pcompile(char const * pattern,size_t size)601a8fa202aSchristos Pcompile (char const *pattern, size_t size)
602a8fa202aSchristos {
603a8fa202aSchristos #if !HAVE_LIBPCRE
604a8fa202aSchristos error (2, 0, _("The -P option is not supported"));
605a8fa202aSchristos #else
606a8fa202aSchristos int e;
607a8fa202aSchristos char const *ep;
608a8fa202aSchristos char *re = xmalloc (4 * size + 7);
609a8fa202aSchristos int flags = PCRE_MULTILINE | (match_icase ? PCRE_CASELESS : 0);
610a8fa202aSchristos char const *patlim = pattern + size;
611a8fa202aSchristos char *n = re;
612a8fa202aSchristos char const *p;
613a8fa202aSchristos char const *pnul;
614a8fa202aSchristos
615a8fa202aSchristos /* FIXME: Remove this restriction. */
616a8fa202aSchristos if (eolbyte != '\n')
617a8fa202aSchristos error (2, 0, _("The -P and -z options cannot be combined"));
618a8fa202aSchristos
619a8fa202aSchristos *n = '\0';
620a8fa202aSchristos if (match_lines)
621a8fa202aSchristos strcpy (n, "^(");
622a8fa202aSchristos if (match_words)
623a8fa202aSchristos strcpy (n, "\\b(");
624a8fa202aSchristos n += strlen (n);
625a8fa202aSchristos
626a8fa202aSchristos /* The PCRE interface doesn't allow NUL bytes in the pattern, so
627a8fa202aSchristos replace each NUL byte in the pattern with the four characters
628a8fa202aSchristos "\000", removing a preceding backslash if there are an odd
629a8fa202aSchristos number of backslashes before the NUL.
630a8fa202aSchristos
631a8fa202aSchristos FIXME: This method does not work with some multibyte character
632a8fa202aSchristos encodings, notably Shift-JIS, where a multibyte character can end
633a8fa202aSchristos in a backslash byte. */
634a8fa202aSchristos for (p = pattern; (pnul = memchr (p, '\0', patlim - p)); p = pnul + 1)
635a8fa202aSchristos {
636a8fa202aSchristos memcpy (n, p, pnul - p);
637a8fa202aSchristos n += pnul - p;
638a8fa202aSchristos for (p = pnul; pattern < p && p[-1] == '\\'; p--)
639a8fa202aSchristos continue;
640a8fa202aSchristos n -= (pnul - p) & 1;
641a8fa202aSchristos strcpy (n, "\\000");
642a8fa202aSchristos n += 4;
643a8fa202aSchristos }
644a8fa202aSchristos
645a8fa202aSchristos memcpy (n, p, patlim - p);
646a8fa202aSchristos n += patlim - p;
647a8fa202aSchristos *n = '\0';
648a8fa202aSchristos if (match_words)
649a8fa202aSchristos strcpy (n, ")\\b");
650a8fa202aSchristos if (match_lines)
651a8fa202aSchristos strcpy (n, ")$");
652a8fa202aSchristos
653a8fa202aSchristos cre = pcre_compile (re, flags, &ep, &e, pcre_maketables ());
654a8fa202aSchristos if (!cre)
655a8fa202aSchristos error (2, 0, ep);
656a8fa202aSchristos
657a8fa202aSchristos extra = pcre_study (cre, 0, &ep);
658a8fa202aSchristos if (ep)
659a8fa202aSchristos error (2, 0, ep);
660a8fa202aSchristos
661a8fa202aSchristos free (re);
662a8fa202aSchristos #endif
663a8fa202aSchristos }
664a8fa202aSchristos
665a8fa202aSchristos static size_t
Pexecute(char const * buf,size_t size,size_t * match_size,int exact)666a8fa202aSchristos Pexecute (char const *buf, size_t size, size_t *match_size, int exact)
667a8fa202aSchristos {
668a8fa202aSchristos #if !HAVE_LIBPCRE
669a8fa202aSchristos abort ();
670a8fa202aSchristos return -1;
671a8fa202aSchristos #else
672a8fa202aSchristos /* This array must have at least two elements; everything after that
673a8fa202aSchristos is just for performance improvement in pcre_exec. */
674a8fa202aSchristos int sub[300];
675a8fa202aSchristos
676a8fa202aSchristos int e = pcre_exec (cre, extra, buf, size, 0, 0,
677a8fa202aSchristos sub, sizeof sub / sizeof *sub);
678a8fa202aSchristos
679a8fa202aSchristos if (e <= 0)
680a8fa202aSchristos {
681a8fa202aSchristos switch (e)
682a8fa202aSchristos {
683a8fa202aSchristos case PCRE_ERROR_NOMATCH:
684a8fa202aSchristos return -1;
685a8fa202aSchristos
686a8fa202aSchristos case PCRE_ERROR_NOMEMORY:
687a8fa202aSchristos error (2, 0, _("Memory exhausted"));
688a8fa202aSchristos
689a8fa202aSchristos default:
690a8fa202aSchristos abort ();
691a8fa202aSchristos }
692a8fa202aSchristos }
693a8fa202aSchristos else
694a8fa202aSchristos {
695a8fa202aSchristos /* Narrow down to the line we've found. */
696a8fa202aSchristos char const *beg = buf + sub[0];
697a8fa202aSchristos char const *end = buf + sub[1];
698a8fa202aSchristos char const *buflim = buf + size;
699a8fa202aSchristos char eol = eolbyte;
700a8fa202aSchristos if (!exact)
701a8fa202aSchristos {
702a8fa202aSchristos end = memchr (end, eol, buflim - end);
703a8fa202aSchristos end++;
704a8fa202aSchristos while (buf < beg && beg[-1] != eol)
705a8fa202aSchristos --beg;
706a8fa202aSchristos }
707a8fa202aSchristos
708a8fa202aSchristos *match_size = end - beg;
709a8fa202aSchristos return beg - buf;
710a8fa202aSchristos }
711a8fa202aSchristos #endif
712a8fa202aSchristos }
713a8fa202aSchristos
714a8fa202aSchristos struct matcher const matchers[] = {
715a8fa202aSchristos { "default", Gcompile, EGexecute },
716a8fa202aSchristos { "grep", Gcompile, EGexecute },
717a8fa202aSchristos { "egrep", Ecompile, EGexecute },
718a8fa202aSchristos { "awk", Ecompile, EGexecute },
719a8fa202aSchristos { "fgrep", Fcompile, Fexecute },
720a8fa202aSchristos { "perl", Pcompile, Pexecute },
721a8fa202aSchristos { "", 0, 0 },
722a8fa202aSchristos };
723