xref: /netbsd-src/external/bsd/tre/dist/lib/regexec.c (revision b1c86f5f087524e68db12794ee9c3e3da1ab17a0)
1 /*
2   tre_regexec.c - TRE POSIX compatible matching functions (and more).
3 
4   This software is released under a BSD-style license.
5   See the file LICENSE for details and copyright.
6 
7 */
8 
9 #ifdef HAVE_CONFIG_H
10 #include <config.h>
11 #endif /* HAVE_CONFIG_H */
12 
13 #ifdef TRE_USE_ALLOCA
14 /* AIX requires this to be the first thing in the file.	 */
15 #ifndef __GNUC__
16 # if HAVE_ALLOCA_H
17 #  include <alloca.h>
18 # else
19 #  ifdef _AIX
20  #pragma alloca
21 #  else
22 #   ifndef alloca /* predefined by HP cc +Olibcalls */
23 char *alloca ();
24 #   endif
25 #  endif
26 # endif
27 #endif
28 #endif /* TRE_USE_ALLOCA */
29 
30 #include <assert.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #ifdef HAVE_WCHAR_H
34 #include <wchar.h>
35 #endif /* HAVE_WCHAR_H */
36 #ifdef HAVE_WCTYPE_H
37 #include <wctype.h>
38 #endif /* HAVE_WCTYPE_H */
39 #ifndef TRE_WCHAR
40 #include <ctype.h>
41 #endif /* !TRE_WCHAR */
42 #ifdef HAVE_MALLOC_H
43 #include <malloc.h>
44 #endif /* HAVE_MALLOC_H */
45 #include <limits.h>
46 
47 #include "tre-internal.h"
48 #include "tre.h"
49 #include "xmalloc.h"
50 
51 #ifdef __weak_alias
52 __weak_alias(regexec,_regexec)
53 #endif
54 
55 /* Fills the POSIX.2 regmatch_t array according to the TNFA tag and match
56    endpoint values. */
57 void
58 tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
59 		const tre_tnfa_t *tnfa, int *tags, int match_eo)
60 {
61   tre_submatch_data_t *submatch_data;
62   unsigned int i, j;
63   int *parents;
64 
65   i = 0;
66   if (match_eo >= 0 && !(cflags & REG_NOSUB))
67     {
68       /* Construct submatch offsets from the tags. */
69       DPRINT(("end tag = t%d = %d\n", tnfa->end_tag, match_eo));
70       submatch_data = tnfa->submatch_data;
71       while (i < tnfa->num_submatches && i < nmatch)
72 	{
73 	  if (submatch_data[i].so_tag == tnfa->end_tag)
74 	    pmatch[i].rm_so = match_eo;
75 	  else
76 	    pmatch[i].rm_so = tags[submatch_data[i].so_tag];
77 
78 	  if (submatch_data[i].eo_tag == tnfa->end_tag)
79 	    pmatch[i].rm_eo = match_eo;
80 	  else
81 	    pmatch[i].rm_eo = tags[submatch_data[i].eo_tag];
82 
83 	  /* If either of the endpoints were not used, this submatch
84 	     was not part of the match. */
85 	  if (pmatch[i].rm_so == -1 || pmatch[i].rm_eo == -1)
86 	    pmatch[i].rm_so = pmatch[i].rm_eo = -1;
87 
88 	  DPRINT(("pmatch[%d] = {t%d = %d, t%d = %d}\n", i,
89 		  submatch_data[i].so_tag, pmatch[i].rm_so,
90 		  submatch_data[i].eo_tag, pmatch[i].rm_eo));
91 	  i++;
92 	}
93       /* Reset all submatches that are not within all of their parent
94 	 submatches. */
95       i = 0;
96       while (i < tnfa->num_submatches && i < nmatch)
97 	{
98 	  if (pmatch[i].rm_eo == -1)
99 	    assert(pmatch[i].rm_so == -1);
100 	  assert(pmatch[i].rm_so <= pmatch[i].rm_eo);
101 
102 	  parents = submatch_data[i].parents;
103 	  if (parents != NULL)
104 	    for (j = 0; parents[j] >= 0; j++)
105 	      {
106 		DPRINT(("pmatch[%d] parent %d\n", i, parents[j]));
107 		if (pmatch[i].rm_so < pmatch[parents[j]].rm_so
108 		    || pmatch[i].rm_eo > pmatch[parents[j]].rm_eo)
109 		  pmatch[i].rm_so = pmatch[i].rm_eo = -1;
110 	      }
111 	  i++;
112 	}
113     }
114 
115   while (i < nmatch)
116     {
117       pmatch[i].rm_so = -1;
118       pmatch[i].rm_eo = -1;
119       i++;
120     }
121 }
122 
123 
124 /*
125   Wrapper functions for POSIX compatible regexp matching.
126 */
127 
128 int
129 tre_have_backrefs(const regex_t *preg)
130 {
131   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
132   return tnfa->have_backrefs;
133 }
134 
135 int
136 tre_have_approx(const regex_t *preg)
137 {
138   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
139   return tnfa->have_approx;
140 }
141 
142 static int
143 tre_match(const tre_tnfa_t *tnfa, const void *string, size_t len,
144 	  tre_str_type_t type, size_t nmatch, regmatch_t pmatch[],
145 	  int eflags)
146 {
147   reg_errcode_t status;
148   int *tags = NULL, eo;
149   if (tnfa->num_tags > 0 && nmatch > 0)
150     {
151 #ifdef TRE_USE_ALLOCA
152       tags = alloca(sizeof(*tags) * tnfa->num_tags);
153 #else /* !TRE_USE_ALLOCA */
154       tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
155 #endif /* !TRE_USE_ALLOCA */
156       if (tags == NULL)
157 	return REG_ESPACE;
158     }
159 
160   /* Dispatch to the appropriate matcher. */
161   if (tnfa->have_backrefs || eflags & REG_BACKTRACKING_MATCHER)
162     {
163       /* The regex has back references, use the backtracking matcher. */
164       if (type == STR_USER)
165 	{
166 	  const tre_str_source *source = string;
167 	  if (source->rewind == NULL || source->compare == NULL)
168 	    /* The backtracking matcher requires rewind and compare
169 	       capabilities from the input stream. */
170 	    return REG_BADPAT;
171 	}
172       status = tre_tnfa_run_backtrack(tnfa, string, (int)len, type,
173 				      tags, eflags, &eo);
174     }
175 #ifdef TRE_APPROX
176   else if (tnfa->have_approx || eflags & REG_APPROX_MATCHER)
177     {
178       /* The regex uses approximate matching, use the approximate matcher. */
179       regamatch_t match;
180       regaparams_t params;
181       tre_regaparams_default(&params);
182       params.max_err = 0;
183       params.max_cost = 0;
184       status = tre_tnfa_run_approx(tnfa, string, (int)len, type, tags,
185 				   &match, params, eflags, &eo);
186     }
187 #endif /* TRE_APPROX */
188   else
189     {
190       /* Exact matching, no back references, use the parallel matcher. */
191       status = tre_tnfa_run_parallel(tnfa, string, (int)len, type,
192 				     tags, eflags, &eo);
193     }
194 
195   if (status == REG_OK)
196     /* A match was found, so fill the submatch registers. */
197     tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo);
198 #ifndef TRE_USE_ALLOCA
199   if (tags)
200     xfree(tags);
201 #endif /* !TRE_USE_ALLOCA */
202   return status;
203 }
204 
205 int
206 tre_regnexec(const regex_t *preg, const char *str, size_t len,
207 	 size_t nmatch, regmatch_t pmatch[], int eflags)
208 {
209   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
210   /* CONSTCOND */
211   tre_str_type_t type = (TRE_MB_CUR_MAX == 1) ? STR_BYTE : STR_MBS;
212 
213   return tre_match(tnfa, str, len, type, nmatch, pmatch, eflags);
214 }
215 
216 int
217 tre_regexec(const regex_t *preg, const char *str,
218 	size_t nmatch, regmatch_t pmatch[], int eflags)
219 {
220 	const char	*newstr;
221 	unsigned	 newflags;
222 	size_t		 newlen;
223 
224 	if (eflags & REG_STARTEND) {
225 		/* LINTED */
226 		newstr = &str[pmatch[0].rm_so];
227 		/* LINTED */
228 		newlen = pmatch[0].rm_eo;
229 		newflags = (unsigned)(eflags & ~REG_STARTEND);
230 	} else {
231 		newstr = str;
232 		newlen = (size_t)-1;
233 		newflags = (unsigned)eflags;
234 	}
235 	return tre_regnexec(preg, newstr, newlen, nmatch, pmatch, (int)newflags);
236 }
237 
238 
239 #ifdef TRE_WCHAR
240 
241 int
242 tre_regwnexec(const regex_t *preg, const wchar_t *str, size_t len,
243 	  size_t nmatch, regmatch_t pmatch[], int eflags)
244 {
245   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
246   return tre_match(tnfa, str, len, STR_WIDE, nmatch, pmatch, eflags);
247 }
248 
249 int
250 tre_regwexec(const regex_t *preg, const wchar_t *str,
251 	 size_t nmatch, regmatch_t pmatch[], int eflags)
252 {
253   return tre_regwnexec(preg, str, (unsigned)-1, nmatch, pmatch, eflags);
254 }
255 
256 #endif /* TRE_WCHAR */
257 
258 int
259 tre_reguexec(const regex_t *preg, const tre_str_source *str,
260 	 size_t nmatch, regmatch_t pmatch[], int eflags)
261 {
262   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
263   return tre_match(tnfa, str, (unsigned)-1, STR_USER, nmatch, pmatch, eflags);
264 }
265 
266 
267 #ifdef TRE_APPROX
268 
269 /*
270   Wrapper functions for approximate regexp matching.
271 */
272 
273 static int
274 tre_match_approx(const tre_tnfa_t *tnfa, const void *string, size_t len,
275 		 tre_str_type_t type, regamatch_t *match, regaparams_t params,
276 		 int eflags)
277 {
278   reg_errcode_t status;
279   int *tags = NULL, eo;
280 
281   /* If the regexp does not use approximate matching features, the
282      maximum cost is zero, and the approximate matcher isn't forced,
283      use the exact matcher instead. */
284   if (params.max_cost == 0 && !tnfa->have_approx
285       && !(eflags & REG_APPROX_MATCHER))
286     return tre_match(tnfa, string, len, type, match->nmatch, match->pmatch,
287 		     eflags);
288 
289   /* Back references are not supported by the approximate matcher. */
290   if (tnfa->have_backrefs)
291     return REG_BADPAT;
292 
293   if (tnfa->num_tags > 0 && match->nmatch > 0)
294     {
295 #if TRE_USE_ALLOCA
296       tags = alloca(sizeof(*tags) * tnfa->num_tags);
297 #else /* !TRE_USE_ALLOCA */
298       tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
299 #endif /* !TRE_USE_ALLOCA */
300       if (tags == NULL)
301 	return REG_ESPACE;
302     }
303   status = tre_tnfa_run_approx(tnfa, string, (int)len, type, tags,
304 			       match, params, eflags, &eo);
305   if (status == REG_OK)
306     tre_fill_pmatch(match->nmatch, match->pmatch, tnfa->cflags, tnfa, tags, eo);
307 #ifndef TRE_USE_ALLOCA
308   if (tags)
309     xfree(tags);
310 #endif /* !TRE_USE_ALLOCA */
311   return status;
312 }
313 
314 int
315 tre_reganexec(const regex_t *preg, const char *str, size_t len,
316 	  regamatch_t *match, regaparams_t params, int eflags)
317 {
318   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
319   tre_str_type_t type = (TRE_MB_CUR_MAX == 1) ? STR_BYTE : STR_MBS;
320 
321   return tre_match_approx(tnfa, str, len, type, match, params, eflags);
322 }
323 
324 int
325 tre_regaexec(const regex_t *preg, const char *str,
326 	 regamatch_t *match, regaparams_t params, int eflags)
327 {
328   return tre_reganexec(preg, str, (unsigned)-1, match, params, eflags);
329 }
330 
331 #ifdef TRE_WCHAR
332 
333 int
334 tre_regawnexec(const regex_t *preg, const wchar_t *str, size_t len,
335 	   regamatch_t *match, regaparams_t params, int eflags)
336 {
337   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
338   return tre_match_approx(tnfa, str, len, STR_WIDE,
339 			  match, params, eflags);
340 }
341 
342 int
343 tre_regawexec(const regex_t *preg, const wchar_t *str,
344 	  regamatch_t *match, regaparams_t params, int eflags)
345 {
346   return tre_regawnexec(preg, str, (unsigned)-1, match, params, eflags);
347 }
348 
349 #endif /* TRE_WCHAR */
350 
351 void
352 tre_regaparams_default(regaparams_t *params)
353 {
354   memset(params, 0, sizeof(*params));
355   params->cost_ins = 1;
356   params->cost_del = 1;
357   params->cost_subst = 1;
358   params->max_cost = INT_MAX;
359   params->max_ins = INT_MAX;
360   params->max_del = INT_MAX;
361   params->max_subst = INT_MAX;
362   params->max_err = INT_MAX;
363 }
364 
365 #endif /* TRE_APPROX */
366 
367 /* EOF */
368