xref: /netbsd-src/external/gpl2/gettext/dist/gettext-tools/libgrep/m-regex.c (revision 946379e7b37692fc43f68eb0d1c10daa0a7f3b6c)
1 /* Pattern Matchers for Regular Expressions.
2    Copyright (C) 1992, 1998, 2000, 2005 Free Software Foundation, Inc.
3 
4    This program is free software; you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation; either version 2, or (at your option)
7    any later version.
8 
9    This program is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty of
11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12    GNU General Public License for more details.
13 
14    You should have received a copy of the GNU General Public License
15    along with this program; if not, write to the Free Software Foundation,
16    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
17 
18 #ifdef HAVE_CONFIG_H
19 # include <config.h>
20 #endif
21 
22 /* Specification.  */
23 #include "libgrep.h"
24 
25 #include <ctype.h>
26 #include <stdlib.h>
27 #include <string.h>
28 
29 #include "error.h"
30 #include "exitfail.h"
31 #include "xalloc.h"
32 #include "m-common.h"
33 
34 /* This must be included _after_ m-common.h: It depends on MBS_SUPPORT.  */
35 #include "dfa.h"
36 
37 #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
38 # define IN_CTYPE_DOMAIN(c) 1
39 #else
40 # define IN_CTYPE_DOMAIN(c) isascii(c)
41 #endif
42 #define ISALNUM(C) (IN_CTYPE_DOMAIN (C) && isalnum (C))
43 
44 struct compiled_regex {
45   bool match_words;
46   bool match_lines;
47   char eolbyte;
48 
49   /* DFA compiled regexp. */
50   struct dfa dfa;
51 
52   /* The Regex compiled patterns.  */
53   struct patterns
54   {
55     /* Regex compiled regexp. */
56     struct re_pattern_buffer regexbuf;
57     struct re_registers regs; /* This is here on account of a BRAIN-DEAD
58 				 Q@#%!# library interface in regex.c.  */
59   } *patterns;
60   size_t pcount;
61 
62   /* KWset compiled pattern.  We compile a list of strings, at least one of
63      which is known to occur in any string matching the regexp. */
64   struct compiled_kwset ckwset;
65 
66   /* Number of compiled fixed strings known to exactly match the regexp.
67      If kwsexec returns < kwset_exact_matches, then we don't need to
68      call the regexp matcher at all. */
69   int kwset_exact_matches;
70 };
71 
72 /* Callback from dfa.c.  */
73 void
dfaerror(const char * mesg)74 dfaerror (const char *mesg)
75 {
76   error (exit_failure, 0, mesg);
77 }
78 
79 /* If the DFA turns out to have some set of fixed strings one of
80    which must occur in the match, then we build a kwset matcher
81    to find those strings, and thus quickly filter out impossible
82    matches. */
83 static void
kwsmusts(struct compiled_regex * cregex,bool match_icase,bool match_words,bool match_lines,char eolbyte)84 kwsmusts (struct compiled_regex *cregex,
85 	  bool match_icase, bool match_words, bool match_lines, char eolbyte)
86 {
87   struct dfamust const *dm;
88   const char *err;
89 
90   if (cregex->dfa.musts)
91     {
92       kwsinit (&cregex->ckwset, match_icase, match_words, match_lines, eolbyte);
93       /* First, we compile in the substrings known to be exact
94 	 matches.  The kwset matcher will return the index
95 	 of the matching string that it chooses. */
96       for (dm = cregex->dfa.musts; dm; dm = dm->next)
97 	{
98 	  if (!dm->exact)
99 	    continue;
100 	  cregex->kwset_exact_matches++;
101 	  if ((err = kwsincr (cregex->ckwset.kwset, dm->must, strlen (dm->must))) != NULL)
102 	    error (exit_failure, 0, err);
103 	}
104       /* Now, we compile the substrings that will require
105 	 the use of the regexp matcher.  */
106       for (dm = cregex->dfa.musts; dm; dm = dm->next)
107 	{
108 	  if (dm->exact)
109 	    continue;
110 	  if ((err = kwsincr (cregex->ckwset.kwset, dm->must, strlen (dm->must))) != NULL)
111 	    error (exit_failure, 0, err);
112 	}
113       if ((err = kwsprep (cregex->ckwset.kwset)) != NULL)
114 	error (exit_failure, 0, err);
115     }
116 }
117 
118 static void *
Gcompile(const char * pattern,size_t pattern_size,bool match_icase,bool match_words,bool match_lines,char eolbyte)119 Gcompile (const char *pattern, size_t pattern_size,
120 	  bool match_icase, bool match_words, bool match_lines, char eolbyte)
121 {
122   struct compiled_regex *cregex;
123   const char *err;
124   const char *sep;
125   size_t total = pattern_size;
126   const char *motif = pattern;
127 
128   cregex = (struct compiled_regex *) xmalloc (sizeof (struct compiled_regex));
129   memset (cregex, '\0', sizeof (struct compiled_regex));
130   cregex->match_words = match_words;
131   cregex->match_lines = match_lines;
132   cregex->eolbyte = eolbyte;
133   cregex->patterns = NULL;
134   cregex->pcount = 0;
135 
136   re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE);
137   dfasyntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte);
138 
139   /* For GNU regex compiler we have to pass the patterns separately to detect
140      errors like "[\nallo\n]\n".  The patterns here are "[", "allo" and "]"
141      GNU regex should have raise a syntax error.  The same for backref, where
142      the backref should have been local to each pattern.  */
143   do
144     {
145       size_t len;
146       sep = memchr (motif, '\n', total);
147       if (sep)
148 	{
149 	  len = sep - motif;
150 	  sep++;
151 	  total -= (len + 1);
152 	}
153       else
154 	{
155 	  len = total;
156 	  total = 0;
157 	}
158 
159       cregex->patterns = xrealloc (cregex->patterns, (cregex->pcount + 1) * sizeof (struct patterns));
160       memset (&cregex->patterns[cregex->pcount], '\0', sizeof (struct patterns));
161 
162       if ((err = re_compile_pattern (motif, len,
163 				     &(cregex->patterns[cregex->pcount].regexbuf))) != NULL)
164 	error (exit_failure, 0, err);
165       cregex->pcount++;
166 
167       motif = sep;
168     } while (sep && total != 0);
169 
170   /* In the match_words and match_lines cases, we use a different pattern
171      for the DFA matcher that will quickly throw out cases that won't work.
172      Then if DFA succeeds we do some hairy stuff using the regex matcher
173      to decide whether the match should really count. */
174   if (match_words || match_lines)
175     {
176       /* In the whole-word case, we use the pattern:
177 	 \(^\|[^[:alnum:]_]\)\(userpattern\)\([^[:alnum:]_]|$\).
178 	 In the whole-line case, we use the pattern:
179 	 ^\(userpattern\)$.  */
180 
181       static const char line_beg[] = "^\\(";
182       static const char line_end[] = "\\)$";
183       static const char word_beg[] = "\\(^\\|[^[:alnum:]_]\\)\\(";
184       static const char word_end[] = "\\)\\([^[:alnum:]_]\\|$\\)";
185       char *n = (char *) xmalloc (sizeof word_beg - 1 + pattern_size + sizeof word_end);
186       size_t i;
187       strcpy (n, match_lines ? line_beg : word_beg);
188       i = strlen (n);
189       memcpy (n + i, pattern, pattern_size);
190       i += pattern_size;
191       strcpy (n + i, match_lines ? line_end : word_end);
192       i += strlen (n + i);
193       pattern = n;
194       pattern_size = i;
195     }
196 
197   dfacomp (pattern, pattern_size, &cregex->dfa, 1);
198   kwsmusts (cregex, match_icase, match_words, match_lines, eolbyte);
199 
200   return cregex;
201 }
202 
203 static void *
compile(const char * pattern,size_t pattern_size,bool match_icase,bool match_words,bool match_lines,char eolbyte,reg_syntax_t syntax)204 compile (const char *pattern, size_t pattern_size,
205 	 bool match_icase, bool match_words, bool match_lines, char eolbyte,
206 	 reg_syntax_t syntax)
207 {
208   struct compiled_regex *cregex;
209   const char *err;
210   const char *sep;
211   size_t total = pattern_size;
212   const char *motif = pattern;
213 
214   cregex = (struct compiled_regex *) xmalloc (sizeof (struct compiled_regex));
215   memset (cregex, '\0', sizeof (struct compiled_regex));
216   cregex->match_words = match_words;
217   cregex->match_lines = match_lines;
218   cregex->eolbyte = eolbyte;
219   cregex->patterns = NULL;
220   cregex->pcount = 0;
221 
222   re_set_syntax (syntax);
223   dfasyntax (syntax, match_icase, eolbyte);
224 
225   /* For GNU regex compiler we have to pass the patterns separately to detect
226      errors like "[\nallo\n]\n".  The patterns here are "[", "allo" and "]"
227      GNU regex should have raise a syntax error.  The same for backref, where
228      the backref should have been local to each pattern.  */
229   do
230     {
231       size_t len;
232       sep = memchr (motif, '\n', total);
233       if (sep)
234 	{
235 	  len = sep - motif;
236 	  sep++;
237 	  total -= (len + 1);
238 	}
239       else
240 	{
241 	  len = total;
242 	  total = 0;
243 	}
244 
245       cregex->patterns = xrealloc (cregex->patterns, (cregex->pcount + 1) * sizeof (struct patterns));
246       memset (&cregex->patterns[cregex->pcount], '\0', sizeof (struct patterns));
247 
248       if ((err = re_compile_pattern (motif, len,
249 				     &(cregex->patterns[cregex->pcount].regexbuf))) != NULL)
250 	error (exit_failure, 0, err);
251       cregex->pcount++;
252 
253       motif = sep;
254     } while (sep && total != 0);
255 
256   /* In the match_words and match_lines cases, we use a different pattern
257      for the DFA matcher that will quickly throw out cases that won't work.
258      Then if DFA succeeds we do some hairy stuff using the regex matcher
259      to decide whether the match should really count. */
260   if (match_words || match_lines)
261     {
262       /* In the whole-word case, we use the pattern:
263 	 (^|[^[:alnum:]_])(userpattern)([^[:alnum:]_]|$).
264 	 In the whole-line case, we use the pattern:
265 	 ^(userpattern)$.  */
266 
267       static const char line_beg[] = "^(";
268       static const char line_end[] = ")$";
269       static const char word_beg[] = "(^|[^[:alnum:]_])(";
270       static const char word_end[] = ")([^[:alnum:]_]|$)";
271       char *n = (char *) xmalloc (sizeof word_beg - 1 + pattern_size + sizeof word_end);
272       size_t i;
273       strcpy (n, match_lines ? line_beg : word_beg);
274       i = strlen(n);
275       memcpy (n + i, pattern, pattern_size);
276       i += pattern_size;
277       strcpy (n + i, match_lines ? line_end : word_end);
278       i += strlen (n + i);
279       pattern = n;
280       pattern_size = i;
281     }
282 
283   dfacomp (pattern, pattern_size, &cregex->dfa, 1);
284   kwsmusts (cregex, match_icase, match_words, match_lines, eolbyte);
285 
286   return cregex;
287 }
288 
289 static void *
Ecompile(const char * pattern,size_t pattern_size,bool match_icase,bool match_words,bool match_lines,char eolbyte)290 Ecompile (const char *pattern, size_t pattern_size,
291 	  bool match_icase, bool match_words, bool match_lines, char eolbyte)
292 {
293   return compile (pattern, pattern_size,
294 		  match_icase, match_words, match_lines, eolbyte,
295 		  RE_SYNTAX_POSIX_EGREP);
296 }
297 
298 static void *
AWKcompile(const char * pattern,size_t pattern_size,bool match_icase,bool match_words,bool match_lines,char eolbyte)299 AWKcompile (const char *pattern, size_t pattern_size,
300 	    bool match_icase, bool match_words, bool match_lines, char eolbyte)
301 {
302   return compile (pattern, pattern_size,
303 		  match_icase, match_words, match_lines, eolbyte,
304 		  RE_SYNTAX_AWK);
305 }
306 
307 static size_t
EGexecute(const void * compiled_pattern,const char * buf,size_t buf_size,size_t * match_size,bool exact)308 EGexecute (const void *compiled_pattern,
309 	   const char *buf, size_t buf_size,
310 	   size_t *match_size, bool exact)
311 {
312   struct compiled_regex *cregex = (struct compiled_regex *) compiled_pattern;
313   register const char *buflim, *beg, *end;
314   char eol = cregex->eolbyte;
315   int backref, start, len;
316   struct kwsmatch kwsm;
317   size_t i;
318 #ifdef MBS_SUPPORT
319   char *mb_properties = NULL;
320 #endif /* MBS_SUPPORT */
321 
322 #ifdef MBS_SUPPORT
323   if (MB_CUR_MAX > 1 && cregex->ckwset.kwset)
324     mb_properties = check_multibyte_string (buf, buf_size);
325 #endif /* MBS_SUPPORT */
326 
327   buflim = buf + buf_size;
328 
329   for (beg = end = buf; end < buflim; beg = end)
330     {
331       if (!exact)
332 	{
333 	  if (cregex->ckwset.kwset)
334 	    {
335 	      /* Find a possible match using the KWset matcher. */
336 	      size_t offset = kwsexec (cregex->ckwset.kwset, beg, buflim - beg, &kwsm);
337 	      if (offset == (size_t) -1)
338 		{
339 #ifdef MBS_SUPPORT
340 		  if (MB_CUR_MAX > 1)
341 		    free (mb_properties);
342 #endif
343 		  return (size_t)-1;
344 		}
345 	      beg += offset;
346 	      /* Narrow down to the line containing the candidate, and
347 		 run it through DFA. */
348 	      end = memchr (beg, eol, buflim - beg);
349 	      if (end != NULL)
350 		end++;
351 	      else
352 		end = buflim;
353 #ifdef MBS_SUPPORT
354 	      if (MB_CUR_MAX > 1 && mb_properties[beg - buf] == 0)
355 		continue;
356 #endif
357 	      while (beg > buf && beg[-1] != eol)
358 		--beg;
359 	      if (kwsm.index < cregex->kwset_exact_matches)
360 		goto success;
361 	      if (dfaexec (&cregex->dfa, beg, end - beg, &backref) == (size_t) -1)
362 		continue;
363 	    }
364 	  else
365 	    {
366 	      /* No good fixed strings; start with DFA. */
367 	      size_t offset = dfaexec (&cregex->dfa, beg, buflim - beg, &backref);
368 	      if (offset == (size_t) -1)
369 		break;
370 	      /* Narrow down to the line we've found. */
371 	      beg += offset;
372 	      end = memchr (beg, eol, buflim - beg);
373 	      if (end != NULL)
374 		end++;
375 	      else
376 		end = buflim;
377 	      while (beg > buf && beg[-1] != eol)
378 		--beg;
379 	    }
380 	  /* Successful, no backreferences encountered! */
381 	  if (!backref)
382 	    goto success;
383 	}
384       else
385 	end = beg + buf_size;
386 
387       /* If we've made it to this point, this means DFA has seen
388 	 a probable match, and we need to run it through Regex. */
389       for (i = 0; i < cregex->pcount; i++)
390 	{
391 	  cregex->patterns[i].regexbuf.not_eol = 0;
392 	  if (0 <= (start = re_search (&(cregex->patterns[i].regexbuf), beg,
393 				       end - beg - 1, 0,
394 				       end - beg - 1, &(cregex->patterns[i].regs))))
395 	    {
396 	      len = cregex->patterns[i].regs.end[0] - start;
397 	      if (exact)
398 		{
399 		  *match_size = len;
400 		  return start;
401 		}
402 	      if ((!cregex->match_lines && !cregex->match_words)
403 		  || (cregex->match_lines && len == end - beg - 1))
404 		goto success;
405 	      /* If -w, check if the match aligns with word boundaries.
406 		 We do this iteratively because:
407 		 (a) the line may contain more than one occurence of the
408 		 pattern, and
409 		 (b) Several alternatives in the pattern might be valid at a
410 		 given point, and we may need to consider a shorter one to
411 		 find a word boundary.  */
412 	      if (cregex->match_words)
413 		while (start >= 0)
414 		  {
415 		    if ((start == 0 || !IS_WORD_CONSTITUENT ((unsigned char) beg[start - 1]))
416 			&& (len == end - beg - 1
417 			    || !IS_WORD_CONSTITUENT ((unsigned char) beg[start + len])))
418 		      goto success;
419 		    if (len > 0)
420 		      {
421 			/* Try a shorter length anchored at the same place. */
422 			--len;
423 			cregex->patterns[i].regexbuf.not_eol = 1;
424 			len = re_match (&(cregex->patterns[i].regexbuf), beg,
425 					start + len, start,
426 					&(cregex->patterns[i].regs));
427 		      }
428 		    if (len <= 0)
429 		      {
430 			/* Try looking further on. */
431 			if (start == end - beg - 1)
432 			  break;
433 			++start;
434 			cregex->patterns[i].regexbuf.not_eol = 0;
435 			start = re_search (&(cregex->patterns[i].regexbuf), beg,
436 					   end - beg - 1,
437 					   start, end - beg - 1 - start,
438 					   &(cregex->patterns[i].regs));
439 			len = cregex->patterns[i].regs.end[0] - start;
440 		      }
441 		  }
442 	    }
443 	} /* for Regex patterns.  */
444     } /* for (beg = end ..) */
445 #ifdef MBS_SUPPORT
446   if (MB_CUR_MAX > 1 && mb_properties)
447     free (mb_properties);
448 #endif /* MBS_SUPPORT */
449   return (size_t) -1;
450 
451  success:
452 #ifdef MBS_SUPPORT
453   if (MB_CUR_MAX > 1 && mb_properties)
454     free (mb_properties);
455 #endif /* MBS_SUPPORT */
456   *match_size = end - beg;
457   return beg - buf;
458 }
459 
460 static void
EGfree(void * compiled_pattern)461 EGfree (void *compiled_pattern)
462 {
463   struct compiled_regex *cregex = (struct compiled_regex *) compiled_pattern;
464 
465   dfafree (&cregex->dfa);
466   free (cregex->patterns);
467   free (cregex->ckwset.trans);
468   free (cregex);
469 }
470 
471 /* POSIX Basic Regular Expressions */
472 matcher_t matcher_grep =
473   {
474     Gcompile,
475     EGexecute,
476     EGfree
477   };
478 
479 /* POSIX Extended Regular Expressions */
480 matcher_t matcher_egrep =
481   {
482     Ecompile,
483     EGexecute,
484     EGfree
485   };
486 
487 /* AWK Regular Expressions */
488 matcher_t matcher_awk =
489   {
490     AWKcompile,
491     EGexecute,
492     EGfree
493   };
494