xref: /netbsd-src/external/bsd/tre/dist/lib/regexec.c (revision 179b12252ecaf3553d9c2b7458ce62b6a2203d0c)
1 /*
2   tre_regexec.c - TRE POSIX compatible matching functions (and more).
3 
4   This software is released under a BSD-style license.
5   See the file LICENSE for details and copyright.
6 
7 */
8 
9 #ifdef HAVE_CONFIG_H
10 #include <config.h>
11 #endif /* HAVE_CONFIG_H */
12 
13 #ifdef TRE_USE_ALLOCA
14 /* AIX requires this to be the first thing in the file.	 */
15 #ifndef __GNUC__
16 # if HAVE_ALLOCA_H
17 #  include <alloca.h>
18 # else
19 #  ifdef _AIX
20  #pragma alloca
21 #  else
22 #   ifndef alloca /* predefined by HP cc +Olibcalls */
23 char *alloca ();
24 #   endif
25 #  endif
26 # endif
27 #endif
28 #endif /* TRE_USE_ALLOCA */
29 
30 #include <assert.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #ifdef HAVE_WCHAR_H
34 #include <wchar.h>
35 #endif /* HAVE_WCHAR_H */
36 #ifdef HAVE_WCTYPE_H
37 #include <wctype.h>
38 #endif /* HAVE_WCTYPE_H */
39 #ifndef TRE_WCHAR
40 #include <ctype.h>
41 #endif /* !TRE_WCHAR */
42 #ifdef HAVE_MALLOC_H
43 #include <malloc.h>
44 #endif /* HAVE_MALLOC_H */
45 #include <limits.h>
46 
47 #include "tre-internal.h"
48 #include "tre.h"
49 #include "xmalloc.h"
50 
51 
52 /* Fills the POSIX.2 regmatch_t array according to the TNFA tag and match
53    endpoint values. */
54 void
55 tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
56 		const tre_tnfa_t *tnfa, int *tags, int match_eo)
57 {
58   tre_submatch_data_t *submatch_data;
59   unsigned int i, j;
60   int *parents;
61 
62   i = 0;
63   if (match_eo >= 0 && !(cflags & REG_NOSUB))
64     {
65       /* Construct submatch offsets from the tags. */
66       DPRINT(("end tag = t%d = %d\n", tnfa->end_tag, match_eo));
67       submatch_data = tnfa->submatch_data;
68       while (i < tnfa->num_submatches && i < nmatch)
69 	{
70 	  if (submatch_data[i].so_tag == tnfa->end_tag)
71 	    pmatch[i].rm_so = match_eo;
72 	  else
73 	    pmatch[i].rm_so = tags[submatch_data[i].so_tag];
74 
75 	  if (submatch_data[i].eo_tag == tnfa->end_tag)
76 	    pmatch[i].rm_eo = match_eo;
77 	  else
78 	    pmatch[i].rm_eo = tags[submatch_data[i].eo_tag];
79 
80 	  /* If either of the endpoints were not used, this submatch
81 	     was not part of the match. */
82 	  if (pmatch[i].rm_so == -1 || pmatch[i].rm_eo == -1)
83 	    pmatch[i].rm_so = pmatch[i].rm_eo = -1;
84 
85 	  DPRINT(("pmatch[%d] = {t%d = %d, t%d = %d}\n", i,
86 		  submatch_data[i].so_tag, pmatch[i].rm_so,
87 		  submatch_data[i].eo_tag, pmatch[i].rm_eo));
88 	  i++;
89 	}
90       /* Reset all submatches that are not within all of their parent
91 	 submatches. */
92       i = 0;
93       while (i < tnfa->num_submatches && i < nmatch)
94 	{
95 	  if (pmatch[i].rm_eo == -1)
96 	    assert(pmatch[i].rm_so == -1);
97 	  assert(pmatch[i].rm_so <= pmatch[i].rm_eo);
98 
99 	  parents = submatch_data[i].parents;
100 	  if (parents != NULL)
101 	    for (j = 0; parents[j] >= 0; j++)
102 	      {
103 		DPRINT(("pmatch[%d] parent %d\n", i, parents[j]));
104 		if (pmatch[i].rm_so < pmatch[parents[j]].rm_so
105 		    || pmatch[i].rm_eo > pmatch[parents[j]].rm_eo)
106 		  pmatch[i].rm_so = pmatch[i].rm_eo = -1;
107 	      }
108 	  i++;
109 	}
110     }
111 
112   while (i < nmatch)
113     {
114       pmatch[i].rm_so = -1;
115       pmatch[i].rm_eo = -1;
116       i++;
117     }
118 }
119 
120 
121 /*
122   Wrapper functions for POSIX compatible regexp matching.
123 */
124 
125 int
126 tre_have_backrefs(const regex_t *preg)
127 {
128   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
129   return tnfa->have_backrefs;
130 }
131 
132 int
133 tre_have_approx(const regex_t *preg)
134 {
135   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
136   return tnfa->have_approx;
137 }
138 
139 static int
140 tre_match(const tre_tnfa_t *tnfa, const void *string, size_t len,
141 	  tre_str_type_t type, size_t nmatch, regmatch_t pmatch[],
142 	  int eflags)
143 {
144   reg_errcode_t status;
145   int *tags = NULL, eo;
146   if (tnfa->num_tags > 0 && nmatch > 0)
147     {
148 #ifdef TRE_USE_ALLOCA
149       tags = alloca(sizeof(*tags) * tnfa->num_tags);
150 #else /* !TRE_USE_ALLOCA */
151       tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
152 #endif /* !TRE_USE_ALLOCA */
153       if (tags == NULL)
154 	return REG_ESPACE;
155     }
156 
157   /* Dispatch to the appropriate matcher. */
158   if (tnfa->have_backrefs || eflags & REG_BACKTRACKING_MATCHER)
159     {
160       /* The regex has back references, use the backtracking matcher. */
161       if (type == STR_USER)
162 	{
163 	  const tre_str_source *source = string;
164 	  if (source->rewind == NULL || source->compare == NULL)
165 	    /* The backtracking matcher requires rewind and compare
166 	       capabilities from the input stream. */
167 	    return REG_BADPAT;
168 	}
169       status = tre_tnfa_run_backtrack(tnfa, string, (int)len, type,
170 				      tags, eflags, &eo);
171     }
172 #ifdef TRE_APPROX
173   else if (tnfa->have_approx || eflags & REG_APPROX_MATCHER)
174     {
175       /* The regex uses approximate matching, use the approximate matcher. */
176       regamatch_t match;
177       regaparams_t params;
178       tre_regaparams_default(&params);
179       params.max_err = 0;
180       params.max_cost = 0;
181       status = tre_tnfa_run_approx(tnfa, string, (int)len, type, tags,
182 				   &match, params, eflags, &eo);
183     }
184 #endif /* TRE_APPROX */
185   else
186     {
187       /* Exact matching, no back references, use the parallel matcher. */
188       status = tre_tnfa_run_parallel(tnfa, string, (int)len, type,
189 				     tags, eflags, &eo);
190     }
191 
192   if (status == REG_OK)
193     /* A match was found, so fill the submatch registers. */
194     tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo);
195 #ifndef TRE_USE_ALLOCA
196   if (tags)
197     xfree(tags);
198 #endif /* !TRE_USE_ALLOCA */
199   return status;
200 }
201 
202 int
203 tre_regnexec(const regex_t *preg, const char *str, size_t len,
204 	 size_t nmatch, regmatch_t pmatch[], int eflags)
205 {
206   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
207   tre_str_type_t type = (TRE_MB_CUR_MAX == 1) ? STR_BYTE : STR_MBS;
208 
209   return tre_match(tnfa, str, len, type, nmatch, pmatch, eflags);
210 }
211 
212 int
213 tre_regexec(const regex_t *preg, const char *str,
214 	size_t nmatch, regmatch_t pmatch[], int eflags)
215 {
216 	const char	*newstr;
217 	unsigned	 newflags;
218 	size_t		 newlen;
219 
220 	if (eflags & REG_STARTEND) {
221 		newstr = &str[pmatch[0].rm_so];
222 		newlen = pmatch[0].rm_eo;
223 		newflags = (unsigned)(eflags & ~REG_STARTEND);
224 	} else {
225 		newstr = str;
226 		newlen = (size_t)-1;
227 		newflags = (unsigned)eflags;
228 	}
229 	return tre_regnexec(preg, newstr, newlen, nmatch, pmatch, (int)newflags);
230 }
231 
232 
233 #ifdef TRE_WCHAR
234 
235 int
236 tre_regwnexec(const regex_t *preg, const wchar_t *str, size_t len,
237 	  size_t nmatch, regmatch_t pmatch[], int eflags)
238 {
239   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
240   return tre_match(tnfa, str, len, STR_WIDE, nmatch, pmatch, eflags);
241 }
242 
243 int
244 tre_regwexec(const regex_t *preg, const wchar_t *str,
245 	 size_t nmatch, regmatch_t pmatch[], int eflags)
246 {
247   return tre_regwnexec(preg, str, (unsigned)-1, nmatch, pmatch, eflags);
248 }
249 
250 #endif /* TRE_WCHAR */
251 
252 int
253 tre_reguexec(const regex_t *preg, const tre_str_source *str,
254 	 size_t nmatch, regmatch_t pmatch[], int eflags)
255 {
256   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
257   return tre_match(tnfa, str, (unsigned)-1, STR_USER, nmatch, pmatch, eflags);
258 }
259 
260 
261 #ifdef TRE_APPROX
262 
263 /*
264   Wrapper functions for approximate regexp matching.
265 */
266 
267 static int
268 tre_match_approx(const tre_tnfa_t *tnfa, const void *string, size_t len,
269 		 tre_str_type_t type, regamatch_t *match, regaparams_t params,
270 		 int eflags)
271 {
272   reg_errcode_t status;
273   int *tags = NULL, eo;
274 
275   /* If the regexp does not use approximate matching features, the
276      maximum cost is zero, and the approximate matcher isn't forced,
277      use the exact matcher instead. */
278   if (params.max_cost == 0 && !tnfa->have_approx
279       && !(eflags & REG_APPROX_MATCHER))
280     return tre_match(tnfa, string, len, type, match->nmatch, match->pmatch,
281 		     eflags);
282 
283   /* Back references are not supported by the approximate matcher. */
284   if (tnfa->have_backrefs)
285     return REG_BADPAT;
286 
287   if (tnfa->num_tags > 0 && match->nmatch > 0)
288     {
289 #if TRE_USE_ALLOCA
290       tags = alloca(sizeof(*tags) * tnfa->num_tags);
291 #else /* !TRE_USE_ALLOCA */
292       tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
293 #endif /* !TRE_USE_ALLOCA */
294       if (tags == NULL)
295 	return REG_ESPACE;
296     }
297   status = tre_tnfa_run_approx(tnfa, string, (int)len, type, tags,
298 			       match, params, eflags, &eo);
299   if (status == REG_OK)
300     tre_fill_pmatch(match->nmatch, match->pmatch, tnfa->cflags, tnfa, tags, eo);
301 #ifndef TRE_USE_ALLOCA
302   if (tags)
303     xfree(tags);
304 #endif /* !TRE_USE_ALLOCA */
305   return status;
306 }
307 
308 int
309 tre_reganexec(const regex_t *preg, const char *str, size_t len,
310 	  regamatch_t *match, regaparams_t params, int eflags)
311 {
312   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
313   tre_str_type_t type = (TRE_MB_CUR_MAX == 1) ? STR_BYTE : STR_MBS;
314 
315   return tre_match_approx(tnfa, str, len, type, match, params, eflags);
316 }
317 
318 int
319 tre_regaexec(const regex_t *preg, const char *str,
320 	 regamatch_t *match, regaparams_t params, int eflags)
321 {
322   return tre_reganexec(preg, str, (unsigned)-1, match, params, eflags);
323 }
324 
325 #ifdef TRE_WCHAR
326 
327 int
328 tre_regawnexec(const regex_t *preg, const wchar_t *str, size_t len,
329 	   regamatch_t *match, regaparams_t params, int eflags)
330 {
331   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
332   return tre_match_approx(tnfa, str, len, STR_WIDE,
333 			  match, params, eflags);
334 }
335 
336 int
337 tre_regawexec(const regex_t *preg, const wchar_t *str,
338 	  regamatch_t *match, regaparams_t params, int eflags)
339 {
340   return tre_regawnexec(preg, str, (unsigned)-1, match, params, eflags);
341 }
342 
343 #endif /* TRE_WCHAR */
344 
345 void
346 tre_regaparams_default(regaparams_t *params)
347 {
348   memset(params, 0, sizeof(*params));
349   params->cost_ins = 1;
350   params->cost_del = 1;
351   params->cost_subst = 1;
352   params->max_cost = INT_MAX;
353   params->max_ins = INT_MAX;
354   params->max_del = INT_MAX;
355   params->max_subst = INT_MAX;
356   params->max_err = INT_MAX;
357 }
358 
359 #endif /* TRE_APPROX */
360 
361 /* EOF */
362