xref: /netbsd-src/external/bsd/tre/dist/lib/regexec.c (revision 42ec3f9a8aa7a3450d86346317a12f692ac57323)
1 /*
2   tre_regexec.c - TRE POSIX compatible matching functions (and more).
3 
4   This software is released under a BSD-style license.
5   See the file LICENSE for details and copyright.
6 
7 */
8 
9 #ifdef HAVE_CONFIG_H
10 #include <config.h>
11 #endif /* HAVE_CONFIG_H */
12 
13 #include "tre-alloca.h"
14 
15 #include <assert.h>
16 #include <stdlib.h>
17 #include <string.h>
18 #ifdef HAVE_WCHAR_H
19 #include <wchar.h>
20 #endif /* HAVE_WCHAR_H */
21 #ifdef HAVE_WCTYPE_H
22 #include <wctype.h>
23 #endif /* HAVE_WCTYPE_H */
24 #ifndef TRE_WCHAR
25 #include <ctype.h>
26 #endif /* !TRE_WCHAR */
27 #ifdef HAVE_MALLOC_H
28 #include <malloc.h>
29 #endif /* HAVE_MALLOC_H */
30 #include <limits.h>
31 
32 #include "tre-internal.h"
33 #include "tre.h"
34 #include "xmalloc.h"
35 
36 /* Fills the POSIX.2 regmatch_t array according to the TNFA tag and match
37    endpoint values. */
38 void
tre_fill_pmatch(size_t nmatch,regmatch_t pmatch[],int cflags,const tre_tnfa_t * tnfa,int * tags,int match_eo)39 tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
40 		const tre_tnfa_t *tnfa, int *tags, int match_eo)
41 {
42   tre_submatch_data_t *submatch_data;
43   unsigned int i, j;
44   int *parents;
45 
46   if (cflags & REG_NOSUB)
47     return;
48 
49   i = 0;
50   if (match_eo >= 0)
51     {
52       /* Construct submatch offsets from the tags. */
53       DPRINT(("end tag = t%d = %d\n", tnfa->end_tag, match_eo));
54       submatch_data = tnfa->submatch_data;
55       while (i < tnfa->num_submatches && i < nmatch)
56 	{
57 	  if (submatch_data[i].so_tag == tnfa->end_tag)
58 	    pmatch[i].rm_so = match_eo;
59 	  else
60 	    pmatch[i].rm_so = tags[submatch_data[i].so_tag];
61 
62 	  if (submatch_data[i].eo_tag == tnfa->end_tag)
63 	    pmatch[i].rm_eo = match_eo;
64 	  else
65 	    pmatch[i].rm_eo = tags[submatch_data[i].eo_tag];
66 
67 	  /* If either of the endpoints were not used, this submatch
68 	     was not part of the match. */
69 	  if (pmatch[i].rm_so == -1 || pmatch[i].rm_eo == -1)
70 	    pmatch[i].rm_so = pmatch[i].rm_eo = -1;
71 
72 	  DPRINT(("pmatch[%d] = {t%d = %d, t%d = %d}\n", i,
73 		  submatch_data[i].so_tag, pmatch[i].rm_so,
74 		  submatch_data[i].eo_tag, pmatch[i].rm_eo));
75 	  i++;
76 	}
77       /* Reset all submatches that are not within all of their parent
78 	 submatches. */
79       i = 0;
80       while (i < tnfa->num_submatches && i < nmatch)
81 	{
82 	  if (pmatch[i].rm_eo == -1)
83 	    assert(pmatch[i].rm_so == -1);
84 	  assert(pmatch[i].rm_so <= pmatch[i].rm_eo);
85 
86 	  parents = submatch_data[i].parents;
87 	  if (parents != NULL)
88 	    for (j = 0; parents[j] >= 0; j++)
89 	      {
90 		DPRINT(("pmatch[%d] parent %d\n", i, parents[j]));
91 		if (pmatch[i].rm_so < pmatch[parents[j]].rm_so
92 		    || pmatch[i].rm_eo > pmatch[parents[j]].rm_eo)
93 		  pmatch[i].rm_so = pmatch[i].rm_eo = -1;
94 	      }
95 	  i++;
96 	}
97     }
98 
99   while (i < nmatch)
100     {
101       pmatch[i].rm_so = -1;
102       pmatch[i].rm_eo = -1;
103       i++;
104     }
105 }
106 
107 
108 /*
109   Wrapper functions for POSIX compatible regexp matching.
110 */
111 
112 int
tre_have_backrefs(const regex_t * preg)113 tre_have_backrefs(const regex_t *preg)
114 {
115   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
116   return tnfa->have_backrefs;
117 }
118 
119 int
tre_have_approx(const regex_t * preg)120 tre_have_approx(const regex_t *preg)
121 {
122   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
123   return tnfa->have_approx;
124 }
125 
126 static int
tre_match(const tre_tnfa_t * tnfa,const void * string,size_t len,tre_str_type_t type,size_t nmatch,regmatch_t pmatch[],int eflags)127 tre_match(const tre_tnfa_t *tnfa, const void *string, size_t len,
128 	  tre_str_type_t type, size_t nmatch, regmatch_t pmatch[],
129 	  int eflags)
130 {
131   reg_errcode_t status;
132   int *tags = NULL, eo;
133   if (tnfa->cflags & REG_NOSUB) nmatch = 0;
134   if (tnfa->num_tags > 0 && nmatch > 0)
135     {
136 #ifdef TRE_USE_ALLOCA
137       tags = alloca(sizeof(*tags) * tnfa->num_tags);
138 #else /* !TRE_USE_ALLOCA */
139       tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
140 #endif /* !TRE_USE_ALLOCA */
141       if (tags == NULL)
142 	return REG_ESPACE;
143     }
144 
145   /* Dispatch to the appropriate matcher. */
146   if (tnfa->have_backrefs || eflags & REG_BACKTRACKING_MATCHER)
147     {
148       /* The regex has back references, use the backtracking matcher. */
149       if (type == STR_USER)
150 	{
151 	  const tre_str_source *source = string;
152 	  if (source->rewind == NULL || source->compare == NULL)
153 	    /* The backtracking matcher requires rewind and compare
154 	       capabilities from the input stream. */
155 	    return REG_BADPAT;
156 	}
157       status = tre_tnfa_run_backtrack(tnfa, string, (int)len, type,
158 				      tags, eflags, &eo);
159     }
160 #ifdef TRE_APPROX
161   else if (tnfa->have_approx || eflags & REG_APPROX_MATCHER)
162     {
163       /* The regex uses approximate matching, use the approximate matcher. */
164       regamatch_t match;
165       regaparams_t params;
166       tre_regaparams_default(&params);
167       params.max_err = 0;
168       params.max_cost = 0;
169       status = tre_tnfa_run_approx(tnfa, string, (int)len, type, tags,
170 				   &match, params, eflags, &eo);
171     }
172 #endif /* TRE_APPROX */
173   else
174     {
175       /* Exact matching, no back references, use the parallel matcher. */
176       status = tre_tnfa_run_parallel(tnfa, string, (int)len, type,
177 				     tags, eflags, &eo);
178     }
179 
180   if (status == REG_OK)
181     /* A match was found, so fill the submatch registers. */
182     tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo);
183 #ifndef TRE_USE_ALLOCA
184   if (tags)
185     xfree(tags);
186 #endif /* !TRE_USE_ALLOCA */
187   return status;
188 }
189 
190 int
tre_regnexec(const regex_t * preg,const char * str,size_t len,size_t nmatch,regmatch_t pmatch[],int eflags)191 tre_regnexec(const regex_t *preg, const char *str, size_t len,
192 	 size_t nmatch, regmatch_t pmatch[], int eflags)
193 {
194   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
195   /* CONSTCOND */
196   tre_str_type_t type = (TRE_MB_CUR_MAX == 1) ? STR_BYTE : STR_MBS;
197 
198   return tre_match(tnfa, str, len, type, nmatch, pmatch, eflags);
199 }
200 
201 int
tre_regexec(const regex_t * preg,const char * str,size_t nmatch,regmatch_t pmatch[],int eflags)202 tre_regexec(const regex_t *preg, const char *str,
203 	size_t nmatch, regmatch_t pmatch[], int eflags)
204 {
205 	size_t shift, len, i;
206 	int ret;
207 	tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
208 	regmatch_t *p;
209 
210 	if (eflags & REG_STARTEND) {
211 		if (pmatch == NULL || pmatch->rm_so < 0 || pmatch->rm_eo < 0
212 		    || pmatch->rm_so > pmatch->rm_eo)
213 			return REG_INVARG;
214 		str += shift = pmatch->rm_so;
215 		len = pmatch->rm_eo - pmatch->rm_so;
216 		eflags &= ~REG_STARTEND;
217 	} else {
218 		shift = 0;
219 		len = (size_t)-1;
220 	}
221 
222 	ret = tre_regnexec(preg, str, len, nmatch, pmatch, eflags);
223 
224 	if (!ret && !(tnfa->cflags & REG_NOSUB) && len != (size_t)-1) {
225 		for (i = nmatch, p = pmatch; i > 0; p++, i--) {
226 			if (p->rm_so >= 0) p->rm_so += shift;
227 			if (p->rm_eo >= 0) p->rm_eo += shift;
228 		}
229 	}
230 
231 	return ret;
232 }
233 
234 #ifdef REG_USEBYTES
235 int
tre_regexecb(const regex_t * preg,const char * str,size_t nmatch,regmatch_t pmatch[],int eflags)236 tre_regexecb(const regex_t *preg, const char *str,
237         size_t nmatch, regmatch_t pmatch[], int eflags)
238 {
239   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
240 
241   return tre_match(tnfa, str, (unsigned)-1, STR_BYTE, nmatch, pmatch, eflags);
242 }
243 
244 int
tre_regnexecb(const regex_t * preg,const char * str,size_t len,size_t nmatch,regmatch_t pmatch[],int eflags)245 tre_regnexecb(const regex_t *preg, const char *str, size_t len,
246         size_t nmatch, regmatch_t pmatch[], int eflags)
247 {
248   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
249 
250   return tre_match(tnfa, str, len, STR_BYTE, nmatch, pmatch, eflags);
251 }
252 #endif /* REG_USEBYTES */
253 
254 
255 #ifdef TRE_WCHAR
256 
257 int
tre_regwnexec(const regex_t * preg,const wchar_t * str,size_t len,size_t nmatch,regmatch_t pmatch[],int eflags)258 tre_regwnexec(const regex_t *preg, const wchar_t *str, size_t len,
259 	  size_t nmatch, regmatch_t pmatch[], int eflags)
260 {
261   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
262   return tre_match(tnfa, str, len, STR_WIDE, nmatch, pmatch, eflags);
263 }
264 
265 int
tre_regwexec(const regex_t * preg,const wchar_t * str,size_t nmatch,regmatch_t pmatch[],int eflags)266 tre_regwexec(const regex_t *preg, const wchar_t *str,
267 	 size_t nmatch, regmatch_t pmatch[], int eflags)
268 {
269 	size_t shift, len, i;
270 	int ret;
271 	tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
272 	regmatch_t *p;
273 
274 	if (eflags & REG_STARTEND) {
275 		if (pmatch == NULL || pmatch->rm_so < 0 || pmatch->rm_eo < 0
276 		    || pmatch->rm_so > pmatch->rm_eo)
277 			return REG_INVARG;
278 		str += shift = pmatch->rm_so;
279 		len = pmatch->rm_eo - pmatch->rm_so;
280 		eflags &= ~REG_STARTEND;
281 	} else {
282 		shift = 0;
283 		len = (size_t)-1;
284 	}
285 
286 	ret = tre_regwnexec(preg, str, len, nmatch, pmatch, eflags);
287 
288 	if (!ret && !(tnfa->cflags & REG_NOSUB) && len != (size_t)-1) {
289 		for (i = nmatch, p = pmatch; i > 0; p++, i--) {
290 			if (p->rm_so >= 0) p->rm_so += shift;
291 			if (p->rm_eo >= 0) p->rm_eo += shift;
292 		}
293 	}
294 
295 	return ret;
296 }
297 
298 #endif /* TRE_WCHAR */
299 
300 int
tre_reguexec(const regex_t * preg,const tre_str_source * str,size_t nmatch,regmatch_t pmatch[],int eflags)301 tre_reguexec(const regex_t *preg, const tre_str_source *str,
302 	 size_t nmatch, regmatch_t pmatch[], int eflags)
303 {
304   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
305   return tre_match(tnfa, str, (unsigned)-1, STR_USER, nmatch, pmatch, eflags);
306 }
307 
308 
309 #ifdef TRE_APPROX
310 
311 /*
312   Wrapper functions for approximate regexp matching.
313 */
314 
315 static int
tre_match_approx(const tre_tnfa_t * tnfa,const void * string,size_t len,tre_str_type_t type,regamatch_t * match,regaparams_t params,int eflags)316 tre_match_approx(const tre_tnfa_t *tnfa, const void *string, size_t len,
317 		 tre_str_type_t type, regamatch_t *match, regaparams_t params,
318 		 int eflags)
319 {
320   reg_errcode_t status;
321   int *tags = NULL, eo;
322   size_t nmatch;
323 
324   if (tnfa->cflags & REG_NOSUB)
325     nmatch = 0;
326   else
327     nmatch = match->nmatch;
328 
329   /* If the regexp does not use approximate matching features, the
330      maximum cost is zero, and the approximate matcher isn't forced,
331      use the exact matcher instead. */
332   if (params.max_cost == 0 && !tnfa->have_approx
333       && !(eflags & REG_APPROX_MATCHER))
334     return tre_match(tnfa, string, len, type, nmatch, match->pmatch,
335 		     eflags);
336 
337   /* Back references are not supported by the approximate matcher. */
338   if (tnfa->have_backrefs)
339     return REG_BADPAT;
340 
341   if (tnfa->num_tags > 0 && nmatch > 0)
342     {
343 #if TRE_USE_ALLOCA
344       tags = alloca(sizeof(*tags) * tnfa->num_tags);
345 #else /* !TRE_USE_ALLOCA */
346       tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
347 #endif /* !TRE_USE_ALLOCA */
348       if (tags == NULL)
349 	return REG_ESPACE;
350     }
351   status = tre_tnfa_run_approx(tnfa, string, (int)len, type, tags,
352 			       match, params, eflags, &eo);
353   if (status == REG_OK)
354     tre_fill_pmatch(nmatch, match->pmatch, tnfa->cflags, tnfa, tags, eo);
355 #ifndef TRE_USE_ALLOCA
356   if (tags)
357     xfree(tags);
358 #endif /* !TRE_USE_ALLOCA */
359   return status;
360 }
361 
362 int
tre_reganexec(const regex_t * preg,const char * str,size_t len,regamatch_t * match,regaparams_t params,int eflags)363 tre_reganexec(const regex_t *preg, const char *str, size_t len,
364 	  regamatch_t *match, regaparams_t params, int eflags)
365 {
366   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
367   tre_str_type_t type = (TRE_MB_CUR_MAX == 1) ? STR_BYTE : STR_MBS;
368 
369   return tre_match_approx(tnfa, str, len, type, match, params, eflags);
370 }
371 
372 int
tre_regaexec(const regex_t * preg,const char * str,regamatch_t * match,regaparams_t params,int eflags)373 tre_regaexec(const regex_t *preg, const char *str,
374 	 regamatch_t *match, regaparams_t params, int eflags)
375 {
376   return tre_reganexec(preg, str, (unsigned)-1, match, params, eflags);
377 }
378 
379 #ifdef REG_USEBYTES
380 int
tre_regaexecb(const regex_t * preg,const char * str,regamatch_t * match,regaparams_t params,int eflags)381 tre_regaexecb(const regex_t *preg, const char *str,
382           regamatch_t *match, regaparams_t params, int eflags)
383 {
384   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
385 
386   return tre_match_approx(tnfa, str, (unsigned)-1, STR_BYTE,
387                           match, params, eflags);
388 }
389 #endif /* REG_USEBYTES */
390 
391 #ifdef TRE_WCHAR
392 
393 int
tre_regawnexec(const regex_t * preg,const wchar_t * str,size_t len,regamatch_t * match,regaparams_t params,int eflags)394 tre_regawnexec(const regex_t *preg, const wchar_t *str, size_t len,
395 	   regamatch_t *match, regaparams_t params, int eflags)
396 {
397   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
398   return tre_match_approx(tnfa, str, len, STR_WIDE,
399 			  match, params, eflags);
400 }
401 
402 int
tre_regawexec(const regex_t * preg,const wchar_t * str,regamatch_t * match,regaparams_t params,int eflags)403 tre_regawexec(const regex_t *preg, const wchar_t *str,
404 	  regamatch_t *match, regaparams_t params, int eflags)
405 {
406   return tre_regawnexec(preg, str, (unsigned)-1, match, params, eflags);
407 }
408 
409 #endif /* TRE_WCHAR */
410 
411 void
tre_regaparams_default(regaparams_t * params)412 tre_regaparams_default(regaparams_t *params)
413 {
414   memset(params, 0, sizeof(*params));
415   params->cost_ins = 1;
416   params->cost_del = 1;
417   params->cost_subst = 1;
418   params->max_cost = INT_MAX;
419   params->max_ins = INT_MAX;
420   params->max_del = INT_MAX;
421   params->max_subst = INT_MAX;
422   params->max_err = INT_MAX;
423 }
424 
425 #endif /* TRE_APPROX */
426 
427 /* EOF */
428