xref: /netbsd-src/external/bsd/tre/dist/lib/regexec.c (revision 1580a27b92f58fcdcb23fdfbc04a7c2b54a0b7c8)
1 /*
2   tre_regexec.c - TRE POSIX compatible matching functions (and more).
3 
4   This software is released under a BSD-style license.
5   See the file LICENSE for details and copyright.
6 
7 */
8 
9 #ifdef HAVE_CONFIG_H
10 #include <config.h>
11 #endif /* HAVE_CONFIG_H */
12 
13 #include "tre-alloca.h"
14 
15 #include <assert.h>
16 #include <stdlib.h>
17 #include <string.h>
18 #ifdef HAVE_WCHAR_H
19 #include <wchar.h>
20 #endif /* HAVE_WCHAR_H */
21 #ifdef HAVE_WCTYPE_H
22 #include <wctype.h>
23 #endif /* HAVE_WCTYPE_H */
24 #ifndef TRE_WCHAR
25 #include <ctype.h>
26 #endif /* !TRE_WCHAR */
27 #ifdef HAVE_MALLOC_H
28 #include <malloc.h>
29 #endif /* HAVE_MALLOC_H */
30 #include <limits.h>
31 
32 #include "tre-internal.h"
33 #include "tre.h"
34 #include "xmalloc.h"
35 
36 /* Fills the POSIX.2 regmatch_t array according to the TNFA tag and match
37    endpoint values. */
38 void
39 tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
40 		const tre_tnfa_t *tnfa, int *tags, int match_eo)
41 {
42   tre_submatch_data_t *submatch_data;
43   unsigned int i, j;
44   int *parents;
45 
46   i = 0;
47   if (match_eo >= 0 && !(cflags & REG_NOSUB))
48     {
49       /* Construct submatch offsets from the tags. */
50       DPRINT(("end tag = t%d = %d\n", tnfa->end_tag, match_eo));
51       submatch_data = tnfa->submatch_data;
52       while (i < tnfa->num_submatches && i < nmatch)
53 	{
54 	  if (submatch_data[i].so_tag == tnfa->end_tag)
55 	    pmatch[i].rm_so = match_eo;
56 	  else
57 	    pmatch[i].rm_so = tags[submatch_data[i].so_tag];
58 
59 	  if (submatch_data[i].eo_tag == tnfa->end_tag)
60 	    pmatch[i].rm_eo = match_eo;
61 	  else
62 	    pmatch[i].rm_eo = tags[submatch_data[i].eo_tag];
63 
64 	  /* If either of the endpoints were not used, this submatch
65 	     was not part of the match. */
66 	  if (pmatch[i].rm_so == -1 || pmatch[i].rm_eo == -1)
67 	    pmatch[i].rm_so = pmatch[i].rm_eo = -1;
68 
69 	  DPRINT(("pmatch[%d] = {t%d = %d, t%d = %d}\n", i,
70 		  submatch_data[i].so_tag, pmatch[i].rm_so,
71 		  submatch_data[i].eo_tag, pmatch[i].rm_eo));
72 	  i++;
73 	}
74       /* Reset all submatches that are not within all of their parent
75 	 submatches. */
76       i = 0;
77       while (i < tnfa->num_submatches && i < nmatch)
78 	{
79 	  if (pmatch[i].rm_eo == -1)
80 	    assert(pmatch[i].rm_so == -1);
81 	  assert(pmatch[i].rm_so <= pmatch[i].rm_eo);
82 
83 	  parents = submatch_data[i].parents;
84 	  if (parents != NULL)
85 	    for (j = 0; parents[j] >= 0; j++)
86 	      {
87 		DPRINT(("pmatch[%d] parent %d\n", i, parents[j]));
88 		if (pmatch[i].rm_so < pmatch[parents[j]].rm_so
89 		    || pmatch[i].rm_eo > pmatch[parents[j]].rm_eo)
90 		  pmatch[i].rm_so = pmatch[i].rm_eo = -1;
91 	      }
92 	  i++;
93 	}
94     }
95 
96   while (i < nmatch)
97     {
98       pmatch[i].rm_so = -1;
99       pmatch[i].rm_eo = -1;
100       i++;
101     }
102 }
103 
104 
105 /*
106   Wrapper functions for POSIX compatible regexp matching.
107 */
108 
109 int
110 tre_have_backrefs(const regex_t *preg)
111 {
112   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
113   return tnfa->have_backrefs;
114 }
115 
116 int
117 tre_have_approx(const regex_t *preg)
118 {
119   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
120   return tnfa->have_approx;
121 }
122 
123 static int
124 tre_match(const tre_tnfa_t *tnfa, const void *string, size_t len,
125 	  tre_str_type_t type, size_t nmatch, regmatch_t pmatch[],
126 	  int eflags)
127 {
128   reg_errcode_t status;
129   int *tags = NULL, eo;
130   if (tnfa->num_tags > 0 && nmatch > 0)
131     {
132 #ifdef TRE_USE_ALLOCA
133       tags = alloca(sizeof(*tags) * tnfa->num_tags);
134 #else /* !TRE_USE_ALLOCA */
135       tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
136 #endif /* !TRE_USE_ALLOCA */
137       if (tags == NULL)
138 	return REG_ESPACE;
139     }
140 
141   /* Dispatch to the appropriate matcher. */
142   if (tnfa->have_backrefs || eflags & REG_BACKTRACKING_MATCHER)
143     {
144       /* The regex has back references, use the backtracking matcher. */
145       if (type == STR_USER)
146 	{
147 	  const tre_str_source *source = string;
148 	  if (source->rewind == NULL || source->compare == NULL)
149 	    /* The backtracking matcher requires rewind and compare
150 	       capabilities from the input stream. */
151 	    return REG_BADPAT;
152 	}
153       status = tre_tnfa_run_backtrack(tnfa, string, (int)len, type,
154 				      tags, eflags, &eo);
155     }
156 #ifdef TRE_APPROX
157   else if (tnfa->have_approx || eflags & REG_APPROX_MATCHER)
158     {
159       /* The regex uses approximate matching, use the approximate matcher. */
160       regamatch_t match;
161       regaparams_t params;
162       tre_regaparams_default(&params);
163       params.max_err = 0;
164       params.max_cost = 0;
165       status = tre_tnfa_run_approx(tnfa, string, (int)len, type, tags,
166 				   &match, params, eflags, &eo);
167     }
168 #endif /* TRE_APPROX */
169   else
170     {
171       /* Exact matching, no back references, use the parallel matcher. */
172       status = tre_tnfa_run_parallel(tnfa, string, (int)len, type,
173 				     tags, eflags, &eo);
174     }
175 
176   if (status == REG_OK)
177     /* A match was found, so fill the submatch registers. */
178     tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo);
179 #ifndef TRE_USE_ALLOCA
180   if (tags)
181     xfree(tags);
182 #endif /* !TRE_USE_ALLOCA */
183   return status;
184 }
185 
186 int
187 tre_regnexec(const regex_t *preg, const char *str, size_t len,
188 	 size_t nmatch, regmatch_t pmatch[], int eflags)
189 {
190   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
191   /* CONSTCOND */
192   tre_str_type_t type = (TRE_MB_CUR_MAX == 1) ? STR_BYTE : STR_MBS;
193 
194   return tre_match(tnfa, str, len, type, nmatch, pmatch, eflags);
195 }
196 
197 int
198 tre_regexec(const regex_t *preg, const char *str,
199 	size_t nmatch, regmatch_t pmatch[], int eflags)
200 {
201 	size_t shift, len, i;
202 	int ret;
203 	tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
204 	regmatch_t *p;
205 
206 	if (eflags & REG_STARTEND) {
207 		if (pmatch == NULL || pmatch->rm_so < 0 || pmatch->rm_eo < 0
208 		    || pmatch->rm_so > pmatch->rm_eo)
209 			return REG_INVARG;
210 		str += shift = pmatch->rm_so;
211 		len = pmatch->rm_eo - pmatch->rm_so;
212 		eflags &= ~REG_STARTEND;
213 	} else {
214 		shift = 0;
215 		len = (size_t)-1;
216 	}
217 
218 	ret = tre_regnexec(preg, str, len, nmatch, pmatch, eflags);
219 
220 	if (!ret && !(tnfa->cflags & REG_NOSUB) && len != (size_t)-1) {
221 		for (i = nmatch, p = pmatch; i > 0; p++, i--) {
222 			if (p->rm_so >= 0) p->rm_so += shift;
223 			if (p->rm_eo >= 0) p->rm_eo += shift;
224 		}
225 	}
226 
227 	return ret;
228 }
229 
230 
231 #ifdef TRE_WCHAR
232 
233 int
234 tre_regwnexec(const regex_t *preg, const wchar_t *str, size_t len,
235 	  size_t nmatch, regmatch_t pmatch[], int eflags)
236 {
237   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
238   return tre_match(tnfa, str, len, STR_WIDE, nmatch, pmatch, eflags);
239 }
240 
241 int
242 tre_regwexec(const regex_t *preg, const wchar_t *str,
243 	 size_t nmatch, regmatch_t pmatch[], int eflags)
244 {
245 	size_t shift, len, i;
246 	int ret;
247 	tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
248 	regmatch_t *p;
249 
250 	if (eflags & REG_STARTEND) {
251 		if (pmatch == NULL || pmatch->rm_so < 0 || pmatch->rm_eo < 0
252 		    || pmatch->rm_so > pmatch->rm_eo)
253 			return REG_INVARG;
254 		str += shift = pmatch->rm_so;
255 		len = pmatch->rm_eo - pmatch->rm_so;
256 		eflags &= ~REG_STARTEND;
257 	} else {
258 		shift = 0;
259 		len = (size_t)-1;
260 	}
261 
262 	ret = tre_regwnexec(preg, str, len, nmatch, pmatch, eflags);
263 
264 	if (!ret && !(tnfa->cflags & REG_NOSUB) && len != (size_t)-1) {
265 		for (i = nmatch, p = pmatch; i > 0; p++, i--) {
266 			if (p->rm_so >= 0) p->rm_so += shift;
267 			if (p->rm_eo >= 0) p->rm_eo += shift;
268 		}
269 	}
270 
271 	return ret;
272 }
273 
274 #endif /* TRE_WCHAR */
275 
276 int
277 tre_reguexec(const regex_t *preg, const tre_str_source *str,
278 	 size_t nmatch, regmatch_t pmatch[], int eflags)
279 {
280   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
281   return tre_match(tnfa, str, (unsigned)-1, STR_USER, nmatch, pmatch, eflags);
282 }
283 
284 
285 #ifdef TRE_APPROX
286 
287 /*
288   Wrapper functions for approximate regexp matching.
289 */
290 
291 static int
292 tre_match_approx(const tre_tnfa_t *tnfa, const void *string, size_t len,
293 		 tre_str_type_t type, regamatch_t *match, regaparams_t params,
294 		 int eflags)
295 {
296   reg_errcode_t status;
297   int *tags = NULL, eo;
298 
299   /* If the regexp does not use approximate matching features, the
300      maximum cost is zero, and the approximate matcher isn't forced,
301      use the exact matcher instead. */
302   if (params.max_cost == 0 && !tnfa->have_approx
303       && !(eflags & REG_APPROX_MATCHER))
304     return tre_match(tnfa, string, len, type, match->nmatch, match->pmatch,
305 		     eflags);
306 
307   /* Back references are not supported by the approximate matcher. */
308   if (tnfa->have_backrefs)
309     return REG_BADPAT;
310 
311   if (tnfa->num_tags > 0 && match->nmatch > 0)
312     {
313 #if TRE_USE_ALLOCA
314       tags = alloca(sizeof(*tags) * tnfa->num_tags);
315 #else /* !TRE_USE_ALLOCA */
316       tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
317 #endif /* !TRE_USE_ALLOCA */
318       if (tags == NULL)
319 	return REG_ESPACE;
320     }
321   status = tre_tnfa_run_approx(tnfa, string, (int)len, type, tags,
322 			       match, params, eflags, &eo);
323   if (status == REG_OK)
324     tre_fill_pmatch(match->nmatch, match->pmatch, tnfa->cflags, tnfa, tags, eo);
325 #ifndef TRE_USE_ALLOCA
326   if (tags)
327     xfree(tags);
328 #endif /* !TRE_USE_ALLOCA */
329   return status;
330 }
331 
332 int
333 tre_reganexec(const regex_t *preg, const char *str, size_t len,
334 	  regamatch_t *match, regaparams_t params, int eflags)
335 {
336   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
337   tre_str_type_t type = (TRE_MB_CUR_MAX == 1) ? STR_BYTE : STR_MBS;
338 
339   return tre_match_approx(tnfa, str, len, type, match, params, eflags);
340 }
341 
342 int
343 tre_regaexec(const regex_t *preg, const char *str,
344 	 regamatch_t *match, regaparams_t params, int eflags)
345 {
346   return tre_reganexec(preg, str, (unsigned)-1, match, params, eflags);
347 }
348 
349 #ifdef TRE_WCHAR
350 
351 int
352 tre_regawnexec(const regex_t *preg, const wchar_t *str, size_t len,
353 	   regamatch_t *match, regaparams_t params, int eflags)
354 {
355   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
356   return tre_match_approx(tnfa, str, len, STR_WIDE,
357 			  match, params, eflags);
358 }
359 
360 int
361 tre_regawexec(const regex_t *preg, const wchar_t *str,
362 	  regamatch_t *match, regaparams_t params, int eflags)
363 {
364   return tre_regawnexec(preg, str, (unsigned)-1, match, params, eflags);
365 }
366 
367 #endif /* TRE_WCHAR */
368 
369 void
370 tre_regaparams_default(regaparams_t *params)
371 {
372   memset(params, 0, sizeof(*params));
373   params->cost_ins = 1;
374   params->cost_del = 1;
375   params->cost_subst = 1;
376   params->max_cost = INT_MAX;
377   params->max_ins = INT_MAX;
378   params->max_del = INT_MAX;
379   params->max_subst = INT_MAX;
380   params->max_err = INT_MAX;
381 }
382 
383 #endif /* TRE_APPROX */
384 
385 /* EOF */
386