1 /* Pattern Matchers for Regular Expressions.
2 Copyright (C) 1992, 1998, 2000, 2005 Free Software Foundation, Inc.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
7 any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software Foundation,
16 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
17
18 #ifdef HAVE_CONFIG_H
19 # include <config.h>
20 #endif
21
22 /* Specification. */
23 #include "libgrep.h"
24
25 #include <ctype.h>
26 #include <stdlib.h>
27 #include <string.h>
28
29 #include "error.h"
30 #include "exitfail.h"
31 #include "xalloc.h"
32 #include "m-common.h"
33
34 /* This must be included _after_ m-common.h: It depends on MBS_SUPPORT. */
35 #include "dfa.h"
36
37 #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
38 # define IN_CTYPE_DOMAIN(c) 1
39 #else
40 # define IN_CTYPE_DOMAIN(c) isascii(c)
41 #endif
42 #define ISALNUM(C) (IN_CTYPE_DOMAIN (C) && isalnum (C))
43
44 struct compiled_regex {
45 bool match_words;
46 bool match_lines;
47 char eolbyte;
48
49 /* DFA compiled regexp. */
50 struct dfa dfa;
51
52 /* The Regex compiled patterns. */
53 struct patterns
54 {
55 /* Regex compiled regexp. */
56 struct re_pattern_buffer regexbuf;
57 struct re_registers regs; /* This is here on account of a BRAIN-DEAD
58 Q@#%!# library interface in regex.c. */
59 } *patterns;
60 size_t pcount;
61
62 /* KWset compiled pattern. We compile a list of strings, at least one of
63 which is known to occur in any string matching the regexp. */
64 struct compiled_kwset ckwset;
65
66 /* Number of compiled fixed strings known to exactly match the regexp.
67 If kwsexec returns < kwset_exact_matches, then we don't need to
68 call the regexp matcher at all. */
69 int kwset_exact_matches;
70 };
71
72 /* Callback from dfa.c. */
73 void
dfaerror(const char * mesg)74 dfaerror (const char *mesg)
75 {
76 error (exit_failure, 0, mesg);
77 }
78
79 /* If the DFA turns out to have some set of fixed strings one of
80 which must occur in the match, then we build a kwset matcher
81 to find those strings, and thus quickly filter out impossible
82 matches. */
83 static void
kwsmusts(struct compiled_regex * cregex,bool match_icase,bool match_words,bool match_lines,char eolbyte)84 kwsmusts (struct compiled_regex *cregex,
85 bool match_icase, bool match_words, bool match_lines, char eolbyte)
86 {
87 struct dfamust const *dm;
88 const char *err;
89
90 if (cregex->dfa.musts)
91 {
92 kwsinit (&cregex->ckwset, match_icase, match_words, match_lines, eolbyte);
93 /* First, we compile in the substrings known to be exact
94 matches. The kwset matcher will return the index
95 of the matching string that it chooses. */
96 for (dm = cregex->dfa.musts; dm; dm = dm->next)
97 {
98 if (!dm->exact)
99 continue;
100 cregex->kwset_exact_matches++;
101 if ((err = kwsincr (cregex->ckwset.kwset, dm->must, strlen (dm->must))) != NULL)
102 error (exit_failure, 0, err);
103 }
104 /* Now, we compile the substrings that will require
105 the use of the regexp matcher. */
106 for (dm = cregex->dfa.musts; dm; dm = dm->next)
107 {
108 if (dm->exact)
109 continue;
110 if ((err = kwsincr (cregex->ckwset.kwset, dm->must, strlen (dm->must))) != NULL)
111 error (exit_failure, 0, err);
112 }
113 if ((err = kwsprep (cregex->ckwset.kwset)) != NULL)
114 error (exit_failure, 0, err);
115 }
116 }
117
118 static void *
Gcompile(const char * pattern,size_t pattern_size,bool match_icase,bool match_words,bool match_lines,char eolbyte)119 Gcompile (const char *pattern, size_t pattern_size,
120 bool match_icase, bool match_words, bool match_lines, char eolbyte)
121 {
122 struct compiled_regex *cregex;
123 const char *err;
124 const char *sep;
125 size_t total = pattern_size;
126 const char *motif = pattern;
127
128 cregex = (struct compiled_regex *) xmalloc (sizeof (struct compiled_regex));
129 memset (cregex, '\0', sizeof (struct compiled_regex));
130 cregex->match_words = match_words;
131 cregex->match_lines = match_lines;
132 cregex->eolbyte = eolbyte;
133 cregex->patterns = NULL;
134 cregex->pcount = 0;
135
136 re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE);
137 dfasyntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte);
138
139 /* For GNU regex compiler we have to pass the patterns separately to detect
140 errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]"
141 GNU regex should have raise a syntax error. The same for backref, where
142 the backref should have been local to each pattern. */
143 do
144 {
145 size_t len;
146 sep = memchr (motif, '\n', total);
147 if (sep)
148 {
149 len = sep - motif;
150 sep++;
151 total -= (len + 1);
152 }
153 else
154 {
155 len = total;
156 total = 0;
157 }
158
159 cregex->patterns = xrealloc (cregex->patterns, (cregex->pcount + 1) * sizeof (struct patterns));
160 memset (&cregex->patterns[cregex->pcount], '\0', sizeof (struct patterns));
161
162 if ((err = re_compile_pattern (motif, len,
163 &(cregex->patterns[cregex->pcount].regexbuf))) != NULL)
164 error (exit_failure, 0, err);
165 cregex->pcount++;
166
167 motif = sep;
168 } while (sep && total != 0);
169
170 /* In the match_words and match_lines cases, we use a different pattern
171 for the DFA matcher that will quickly throw out cases that won't work.
172 Then if DFA succeeds we do some hairy stuff using the regex matcher
173 to decide whether the match should really count. */
174 if (match_words || match_lines)
175 {
176 /* In the whole-word case, we use the pattern:
177 \(^\|[^[:alnum:]_]\)\(userpattern\)\([^[:alnum:]_]|$\).
178 In the whole-line case, we use the pattern:
179 ^\(userpattern\)$. */
180
181 static const char line_beg[] = "^\\(";
182 static const char line_end[] = "\\)$";
183 static const char word_beg[] = "\\(^\\|[^[:alnum:]_]\\)\\(";
184 static const char word_end[] = "\\)\\([^[:alnum:]_]\\|$\\)";
185 char *n = (char *) xmalloc (sizeof word_beg - 1 + pattern_size + sizeof word_end);
186 size_t i;
187 strcpy (n, match_lines ? line_beg : word_beg);
188 i = strlen (n);
189 memcpy (n + i, pattern, pattern_size);
190 i += pattern_size;
191 strcpy (n + i, match_lines ? line_end : word_end);
192 i += strlen (n + i);
193 pattern = n;
194 pattern_size = i;
195 }
196
197 dfacomp (pattern, pattern_size, &cregex->dfa, 1);
198 kwsmusts (cregex, match_icase, match_words, match_lines, eolbyte);
199
200 return cregex;
201 }
202
203 static void *
compile(const char * pattern,size_t pattern_size,bool match_icase,bool match_words,bool match_lines,char eolbyte,reg_syntax_t syntax)204 compile (const char *pattern, size_t pattern_size,
205 bool match_icase, bool match_words, bool match_lines, char eolbyte,
206 reg_syntax_t syntax)
207 {
208 struct compiled_regex *cregex;
209 const char *err;
210 const char *sep;
211 size_t total = pattern_size;
212 const char *motif = pattern;
213
214 cregex = (struct compiled_regex *) xmalloc (sizeof (struct compiled_regex));
215 memset (cregex, '\0', sizeof (struct compiled_regex));
216 cregex->match_words = match_words;
217 cregex->match_lines = match_lines;
218 cregex->eolbyte = eolbyte;
219 cregex->patterns = NULL;
220 cregex->pcount = 0;
221
222 re_set_syntax (syntax);
223 dfasyntax (syntax, match_icase, eolbyte);
224
225 /* For GNU regex compiler we have to pass the patterns separately to detect
226 errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]"
227 GNU regex should have raise a syntax error. The same for backref, where
228 the backref should have been local to each pattern. */
229 do
230 {
231 size_t len;
232 sep = memchr (motif, '\n', total);
233 if (sep)
234 {
235 len = sep - motif;
236 sep++;
237 total -= (len + 1);
238 }
239 else
240 {
241 len = total;
242 total = 0;
243 }
244
245 cregex->patterns = xrealloc (cregex->patterns, (cregex->pcount + 1) * sizeof (struct patterns));
246 memset (&cregex->patterns[cregex->pcount], '\0', sizeof (struct patterns));
247
248 if ((err = re_compile_pattern (motif, len,
249 &(cregex->patterns[cregex->pcount].regexbuf))) != NULL)
250 error (exit_failure, 0, err);
251 cregex->pcount++;
252
253 motif = sep;
254 } while (sep && total != 0);
255
256 /* In the match_words and match_lines cases, we use a different pattern
257 for the DFA matcher that will quickly throw out cases that won't work.
258 Then if DFA succeeds we do some hairy stuff using the regex matcher
259 to decide whether the match should really count. */
260 if (match_words || match_lines)
261 {
262 /* In the whole-word case, we use the pattern:
263 (^|[^[:alnum:]_])(userpattern)([^[:alnum:]_]|$).
264 In the whole-line case, we use the pattern:
265 ^(userpattern)$. */
266
267 static const char line_beg[] = "^(";
268 static const char line_end[] = ")$";
269 static const char word_beg[] = "(^|[^[:alnum:]_])(";
270 static const char word_end[] = ")([^[:alnum:]_]|$)";
271 char *n = (char *) xmalloc (sizeof word_beg - 1 + pattern_size + sizeof word_end);
272 size_t i;
273 strcpy (n, match_lines ? line_beg : word_beg);
274 i = strlen(n);
275 memcpy (n + i, pattern, pattern_size);
276 i += pattern_size;
277 strcpy (n + i, match_lines ? line_end : word_end);
278 i += strlen (n + i);
279 pattern = n;
280 pattern_size = i;
281 }
282
283 dfacomp (pattern, pattern_size, &cregex->dfa, 1);
284 kwsmusts (cregex, match_icase, match_words, match_lines, eolbyte);
285
286 return cregex;
287 }
288
289 static void *
Ecompile(const char * pattern,size_t pattern_size,bool match_icase,bool match_words,bool match_lines,char eolbyte)290 Ecompile (const char *pattern, size_t pattern_size,
291 bool match_icase, bool match_words, bool match_lines, char eolbyte)
292 {
293 return compile (pattern, pattern_size,
294 match_icase, match_words, match_lines, eolbyte,
295 RE_SYNTAX_POSIX_EGREP);
296 }
297
298 static void *
AWKcompile(const char * pattern,size_t pattern_size,bool match_icase,bool match_words,bool match_lines,char eolbyte)299 AWKcompile (const char *pattern, size_t pattern_size,
300 bool match_icase, bool match_words, bool match_lines, char eolbyte)
301 {
302 return compile (pattern, pattern_size,
303 match_icase, match_words, match_lines, eolbyte,
304 RE_SYNTAX_AWK);
305 }
306
307 static size_t
EGexecute(const void * compiled_pattern,const char * buf,size_t buf_size,size_t * match_size,bool exact)308 EGexecute (const void *compiled_pattern,
309 const char *buf, size_t buf_size,
310 size_t *match_size, bool exact)
311 {
312 struct compiled_regex *cregex = (struct compiled_regex *) compiled_pattern;
313 register const char *buflim, *beg, *end;
314 char eol = cregex->eolbyte;
315 int backref, start, len;
316 struct kwsmatch kwsm;
317 size_t i;
318 #ifdef MBS_SUPPORT
319 char *mb_properties = NULL;
320 #endif /* MBS_SUPPORT */
321
322 #ifdef MBS_SUPPORT
323 if (MB_CUR_MAX > 1 && cregex->ckwset.kwset)
324 mb_properties = check_multibyte_string (buf, buf_size);
325 #endif /* MBS_SUPPORT */
326
327 buflim = buf + buf_size;
328
329 for (beg = end = buf; end < buflim; beg = end)
330 {
331 if (!exact)
332 {
333 if (cregex->ckwset.kwset)
334 {
335 /* Find a possible match using the KWset matcher. */
336 size_t offset = kwsexec (cregex->ckwset.kwset, beg, buflim - beg, &kwsm);
337 if (offset == (size_t) -1)
338 {
339 #ifdef MBS_SUPPORT
340 if (MB_CUR_MAX > 1)
341 free (mb_properties);
342 #endif
343 return (size_t)-1;
344 }
345 beg += offset;
346 /* Narrow down to the line containing the candidate, and
347 run it through DFA. */
348 end = memchr (beg, eol, buflim - beg);
349 if (end != NULL)
350 end++;
351 else
352 end = buflim;
353 #ifdef MBS_SUPPORT
354 if (MB_CUR_MAX > 1 && mb_properties[beg - buf] == 0)
355 continue;
356 #endif
357 while (beg > buf && beg[-1] != eol)
358 --beg;
359 if (kwsm.index < cregex->kwset_exact_matches)
360 goto success;
361 if (dfaexec (&cregex->dfa, beg, end - beg, &backref) == (size_t) -1)
362 continue;
363 }
364 else
365 {
366 /* No good fixed strings; start with DFA. */
367 size_t offset = dfaexec (&cregex->dfa, beg, buflim - beg, &backref);
368 if (offset == (size_t) -1)
369 break;
370 /* Narrow down to the line we've found. */
371 beg += offset;
372 end = memchr (beg, eol, buflim - beg);
373 if (end != NULL)
374 end++;
375 else
376 end = buflim;
377 while (beg > buf && beg[-1] != eol)
378 --beg;
379 }
380 /* Successful, no backreferences encountered! */
381 if (!backref)
382 goto success;
383 }
384 else
385 end = beg + buf_size;
386
387 /* If we've made it to this point, this means DFA has seen
388 a probable match, and we need to run it through Regex. */
389 for (i = 0; i < cregex->pcount; i++)
390 {
391 cregex->patterns[i].regexbuf.not_eol = 0;
392 if (0 <= (start = re_search (&(cregex->patterns[i].regexbuf), beg,
393 end - beg - 1, 0,
394 end - beg - 1, &(cregex->patterns[i].regs))))
395 {
396 len = cregex->patterns[i].regs.end[0] - start;
397 if (exact)
398 {
399 *match_size = len;
400 return start;
401 }
402 if ((!cregex->match_lines && !cregex->match_words)
403 || (cregex->match_lines && len == end - beg - 1))
404 goto success;
405 /* If -w, check if the match aligns with word boundaries.
406 We do this iteratively because:
407 (a) the line may contain more than one occurence of the
408 pattern, and
409 (b) Several alternatives in the pattern might be valid at a
410 given point, and we may need to consider a shorter one to
411 find a word boundary. */
412 if (cregex->match_words)
413 while (start >= 0)
414 {
415 if ((start == 0 || !IS_WORD_CONSTITUENT ((unsigned char) beg[start - 1]))
416 && (len == end - beg - 1
417 || !IS_WORD_CONSTITUENT ((unsigned char) beg[start + len])))
418 goto success;
419 if (len > 0)
420 {
421 /* Try a shorter length anchored at the same place. */
422 --len;
423 cregex->patterns[i].regexbuf.not_eol = 1;
424 len = re_match (&(cregex->patterns[i].regexbuf), beg,
425 start + len, start,
426 &(cregex->patterns[i].regs));
427 }
428 if (len <= 0)
429 {
430 /* Try looking further on. */
431 if (start == end - beg - 1)
432 break;
433 ++start;
434 cregex->patterns[i].regexbuf.not_eol = 0;
435 start = re_search (&(cregex->patterns[i].regexbuf), beg,
436 end - beg - 1,
437 start, end - beg - 1 - start,
438 &(cregex->patterns[i].regs));
439 len = cregex->patterns[i].regs.end[0] - start;
440 }
441 }
442 }
443 } /* for Regex patterns. */
444 } /* for (beg = end ..) */
445 #ifdef MBS_SUPPORT
446 if (MB_CUR_MAX > 1 && mb_properties)
447 free (mb_properties);
448 #endif /* MBS_SUPPORT */
449 return (size_t) -1;
450
451 success:
452 #ifdef MBS_SUPPORT
453 if (MB_CUR_MAX > 1 && mb_properties)
454 free (mb_properties);
455 #endif /* MBS_SUPPORT */
456 *match_size = end - beg;
457 return beg - buf;
458 }
459
460 static void
EGfree(void * compiled_pattern)461 EGfree (void *compiled_pattern)
462 {
463 struct compiled_regex *cregex = (struct compiled_regex *) compiled_pattern;
464
465 dfafree (&cregex->dfa);
466 free (cregex->patterns);
467 free (cregex->ckwset.trans);
468 free (cregex);
469 }
470
471 /* POSIX Basic Regular Expressions */
472 matcher_t matcher_grep =
473 {
474 Gcompile,
475 EGexecute,
476 EGfree
477 };
478
479 /* POSIX Extended Regular Expressions */
480 matcher_t matcher_egrep =
481 {
482 Ecompile,
483 EGexecute,
484 EGfree
485 };
486
487 /* AWK Regular Expressions */
488 matcher_t matcher_awk =
489 {
490 AWKcompile,
491 EGexecute,
492 EGfree
493 };
494