xref: /netbsd-src/external/bsd/tre/dist/lib/regexec.c (revision 946379e7b37692fc43f68eb0d1c10daa0a7f3b6c)
1 /*
2   tre_regexec.c - TRE POSIX compatible matching functions (and more).
3 
4   This software is released under a BSD-style license.
5   See the file LICENSE for details and copyright.
6 
7 */
8 
9 #ifdef HAVE_CONFIG_H
10 #include <config.h>
11 #endif /* HAVE_CONFIG_H */
12 
13 #include "tre-alloca.h"
14 
15 #include <assert.h>
16 #include <stdlib.h>
17 #include <string.h>
18 #ifdef HAVE_WCHAR_H
19 #include <wchar.h>
20 #endif /* HAVE_WCHAR_H */
21 #ifdef HAVE_WCTYPE_H
22 #include <wctype.h>
23 #endif /* HAVE_WCTYPE_H */
24 #ifndef TRE_WCHAR
25 #include <ctype.h>
26 #endif /* !TRE_WCHAR */
27 #ifdef HAVE_MALLOC_H
28 #include <malloc.h>
29 #endif /* HAVE_MALLOC_H */
30 #include <limits.h>
31 
32 #include "tre-internal.h"
33 #include "tre.h"
34 #include "xmalloc.h"
35 
36 /* Fills the POSIX.2 regmatch_t array according to the TNFA tag and match
37    endpoint values. */
38 void
39 tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
40 		const tre_tnfa_t *tnfa, int *tags, int match_eo)
41 {
42   tre_submatch_data_t *submatch_data;
43   unsigned int i, j;
44   int *parents;
45 
46   i = 0;
47   if (match_eo >= 0 && !(cflags & REG_NOSUB))
48     {
49       /* Construct submatch offsets from the tags. */
50       DPRINT(("end tag = t%d = %d\n", tnfa->end_tag, match_eo));
51       submatch_data = tnfa->submatch_data;
52       while (i < tnfa->num_submatches && i < nmatch)
53 	{
54 	  if (submatch_data[i].so_tag == tnfa->end_tag)
55 	    pmatch[i].rm_so = match_eo;
56 	  else
57 	    pmatch[i].rm_so = tags[submatch_data[i].so_tag];
58 
59 	  if (submatch_data[i].eo_tag == tnfa->end_tag)
60 	    pmatch[i].rm_eo = match_eo;
61 	  else
62 	    pmatch[i].rm_eo = tags[submatch_data[i].eo_tag];
63 
64 	  /* If either of the endpoints were not used, this submatch
65 	     was not part of the match. */
66 	  if (pmatch[i].rm_so == -1 || pmatch[i].rm_eo == -1)
67 	    pmatch[i].rm_so = pmatch[i].rm_eo = -1;
68 
69 	  DPRINT(("pmatch[%d] = {t%d = %d, t%d = %d}\n", i,
70 		  submatch_data[i].so_tag, pmatch[i].rm_so,
71 		  submatch_data[i].eo_tag, pmatch[i].rm_eo));
72 	  i++;
73 	}
74       /* Reset all submatches that are not within all of their parent
75 	 submatches. */
76       i = 0;
77       while (i < tnfa->num_submatches && i < nmatch)
78 	{
79 	  if (pmatch[i].rm_eo == -1)
80 	    assert(pmatch[i].rm_so == -1);
81 	  assert(pmatch[i].rm_so <= pmatch[i].rm_eo);
82 
83 	  parents = submatch_data[i].parents;
84 	  if (parents != NULL)
85 	    for (j = 0; parents[j] >= 0; j++)
86 	      {
87 		DPRINT(("pmatch[%d] parent %d\n", i, parents[j]));
88 		if (pmatch[i].rm_so < pmatch[parents[j]].rm_so
89 		    || pmatch[i].rm_eo > pmatch[parents[j]].rm_eo)
90 		  pmatch[i].rm_so = pmatch[i].rm_eo = -1;
91 	      }
92 	  i++;
93 	}
94     }
95 
96   while (i < nmatch)
97     {
98       pmatch[i].rm_so = -1;
99       pmatch[i].rm_eo = -1;
100       i++;
101     }
102 }
103 
104 
105 /*
106   Wrapper functions for POSIX compatible regexp matching.
107 */
108 
109 int
110 tre_have_backrefs(const regex_t *preg)
111 {
112   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
113   return tnfa->have_backrefs;
114 }
115 
116 int
117 tre_have_approx(const regex_t *preg)
118 {
119   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
120   return tnfa->have_approx;
121 }
122 
123 static int
124 tre_match(const tre_tnfa_t *tnfa, const void *string, size_t len,
125 	  tre_str_type_t type, size_t nmatch, regmatch_t pmatch[],
126 	  int eflags)
127 {
128   reg_errcode_t status;
129   int *tags = NULL, eo;
130   if (tnfa->num_tags > 0 && nmatch > 0)
131     {
132 #ifdef TRE_USE_ALLOCA
133       tags = alloca(sizeof(*tags) * tnfa->num_tags);
134 #else /* !TRE_USE_ALLOCA */
135       tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
136 #endif /* !TRE_USE_ALLOCA */
137       if (tags == NULL)
138 	return REG_ESPACE;
139     }
140 
141   /* Dispatch to the appropriate matcher. */
142   if (tnfa->have_backrefs || eflags & REG_BACKTRACKING_MATCHER)
143     {
144       /* The regex has back references, use the backtracking matcher. */
145       if (type == STR_USER)
146 	{
147 	  const tre_str_source *source = string;
148 	  if (source->rewind == NULL || source->compare == NULL)
149 	    /* The backtracking matcher requires rewind and compare
150 	       capabilities from the input stream. */
151 	    return REG_BADPAT;
152 	}
153       status = tre_tnfa_run_backtrack(tnfa, string, (int)len, type,
154 				      tags, eflags, &eo);
155     }
156 #ifdef TRE_APPROX
157   else if (tnfa->have_approx || eflags & REG_APPROX_MATCHER)
158     {
159       /* The regex uses approximate matching, use the approximate matcher. */
160       regamatch_t match;
161       regaparams_t params;
162       tre_regaparams_default(&params);
163       params.max_err = 0;
164       params.max_cost = 0;
165       status = tre_tnfa_run_approx(tnfa, string, (int)len, type, tags,
166 				   &match, params, eflags, &eo);
167     }
168 #endif /* TRE_APPROX */
169   else
170     {
171       /* Exact matching, no back references, use the parallel matcher. */
172       status = tre_tnfa_run_parallel(tnfa, string, (int)len, type,
173 				     tags, eflags, &eo);
174     }
175 
176   if (status == REG_OK)
177     /* A match was found, so fill the submatch registers. */
178     tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo);
179 #ifndef TRE_USE_ALLOCA
180   if (tags)
181     xfree(tags);
182 #endif /* !TRE_USE_ALLOCA */
183   return status;
184 }
185 
186 int
187 tre_regnexec(const regex_t *preg, const char *str, size_t len,
188 	 size_t nmatch, regmatch_t pmatch[], int eflags)
189 {
190   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
191   /* CONSTCOND */
192   tre_str_type_t type = (TRE_MB_CUR_MAX == 1) ? STR_BYTE : STR_MBS;
193 
194   return tre_match(tnfa, str, len, type, nmatch, pmatch, eflags);
195 }
196 
197 int
198 tre_regexec(const regex_t *preg, const char *str,
199 	size_t nmatch, regmatch_t pmatch[], int eflags)
200 {
201 	const char	*newstr;
202 	unsigned	 newflags;
203 	size_t		 newlen;
204 
205 	if (eflags & REG_STARTEND) {
206 		/* LINTED */
207 		newstr = &str[pmatch[0].rm_so];
208 		/* LINTED */
209 		newlen = pmatch[0].rm_eo;
210 		newflags = (unsigned)(eflags & ~REG_STARTEND);
211 	} else {
212 		newstr = str;
213 		newlen = (size_t)-1;
214 		newflags = (unsigned)eflags;
215 	}
216 	return tre_regnexec(preg, newstr, newlen, nmatch, pmatch, (int)newflags);
217 }
218 
219 
220 #ifdef TRE_WCHAR
221 
222 int
223 tre_regwnexec(const regex_t *preg, const wchar_t *str, size_t len,
224 	  size_t nmatch, regmatch_t pmatch[], int eflags)
225 {
226   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
227   return tre_match(tnfa, str, len, STR_WIDE, nmatch, pmatch, eflags);
228 }
229 
230 int
231 tre_regwexec(const regex_t *preg, const wchar_t *str,
232 	 size_t nmatch, regmatch_t pmatch[], int eflags)
233 {
234   return tre_regwnexec(preg, str, (unsigned)-1, nmatch, pmatch, eflags);
235 }
236 
237 #endif /* TRE_WCHAR */
238 
239 int
240 tre_reguexec(const regex_t *preg, const tre_str_source *str,
241 	 size_t nmatch, regmatch_t pmatch[], int eflags)
242 {
243   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
244   return tre_match(tnfa, str, (unsigned)-1, STR_USER, nmatch, pmatch, eflags);
245 }
246 
247 
248 #ifdef TRE_APPROX
249 
250 /*
251   Wrapper functions for approximate regexp matching.
252 */
253 
254 static int
255 tre_match_approx(const tre_tnfa_t *tnfa, const void *string, size_t len,
256 		 tre_str_type_t type, regamatch_t *match, regaparams_t params,
257 		 int eflags)
258 {
259   reg_errcode_t status;
260   int *tags = NULL, eo;
261 
262   /* If the regexp does not use approximate matching features, the
263      maximum cost is zero, and the approximate matcher isn't forced,
264      use the exact matcher instead. */
265   if (params.max_cost == 0 && !tnfa->have_approx
266       && !(eflags & REG_APPROX_MATCHER))
267     return tre_match(tnfa, string, len, type, match->nmatch, match->pmatch,
268 		     eflags);
269 
270   /* Back references are not supported by the approximate matcher. */
271   if (tnfa->have_backrefs)
272     return REG_BADPAT;
273 
274   if (tnfa->num_tags > 0 && match->nmatch > 0)
275     {
276 #if TRE_USE_ALLOCA
277       tags = alloca(sizeof(*tags) * tnfa->num_tags);
278 #else /* !TRE_USE_ALLOCA */
279       tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
280 #endif /* !TRE_USE_ALLOCA */
281       if (tags == NULL)
282 	return REG_ESPACE;
283     }
284   status = tre_tnfa_run_approx(tnfa, string, (int)len, type, tags,
285 			       match, params, eflags, &eo);
286   if (status == REG_OK)
287     tre_fill_pmatch(match->nmatch, match->pmatch, tnfa->cflags, tnfa, tags, eo);
288 #ifndef TRE_USE_ALLOCA
289   if (tags)
290     xfree(tags);
291 #endif /* !TRE_USE_ALLOCA */
292   return status;
293 }
294 
295 int
296 tre_reganexec(const regex_t *preg, const char *str, size_t len,
297 	  regamatch_t *match, regaparams_t params, int eflags)
298 {
299   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
300   tre_str_type_t type = (TRE_MB_CUR_MAX == 1) ? STR_BYTE : STR_MBS;
301 
302   return tre_match_approx(tnfa, str, len, type, match, params, eflags);
303 }
304 
305 int
306 tre_regaexec(const regex_t *preg, const char *str,
307 	 regamatch_t *match, regaparams_t params, int eflags)
308 {
309   return tre_reganexec(preg, str, (unsigned)-1, match, params, eflags);
310 }
311 
312 #ifdef TRE_WCHAR
313 
314 int
315 tre_regawnexec(const regex_t *preg, const wchar_t *str, size_t len,
316 	   regamatch_t *match, regaparams_t params, int eflags)
317 {
318   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
319   return tre_match_approx(tnfa, str, len, STR_WIDE,
320 			  match, params, eflags);
321 }
322 
323 int
324 tre_regawexec(const regex_t *preg, const wchar_t *str,
325 	  regamatch_t *match, regaparams_t params, int eflags)
326 {
327   return tre_regawnexec(preg, str, (unsigned)-1, match, params, eflags);
328 }
329 
330 #endif /* TRE_WCHAR */
331 
332 void
333 tre_regaparams_default(regaparams_t *params)
334 {
335   memset(params, 0, sizeof(*params));
336   params->cost_ins = 1;
337   params->cost_del = 1;
338   params->cost_subst = 1;
339   params->max_cost = INT_MAX;
340   params->max_ins = INT_MAX;
341   params->max_del = INT_MAX;
342   params->max_subst = INT_MAX;
343   params->max_err = INT_MAX;
344 }
345 
346 #endif /* TRE_APPROX */
347 
348 /* EOF */
349