1 /*
2 tre_regexec.c - TRE POSIX compatible matching functions (and more).
3
4 This software is released under a BSD-style license.
5 See the file LICENSE for details and copyright.
6
7 */
8
9 #ifdef HAVE_CONFIG_H
10 #include <config.h>
11 #endif /* HAVE_CONFIG_H */
12
13 #include "tre-alloca.h"
14
15 #include <assert.h>
16 #include <stdlib.h>
17 #include <string.h>
18 #ifdef HAVE_WCHAR_H
19 #include <wchar.h>
20 #endif /* HAVE_WCHAR_H */
21 #ifdef HAVE_WCTYPE_H
22 #include <wctype.h>
23 #endif /* HAVE_WCTYPE_H */
24 #ifndef TRE_WCHAR
25 #include <ctype.h>
26 #endif /* !TRE_WCHAR */
27 #ifdef HAVE_MALLOC_H
28 #include <malloc.h>
29 #endif /* HAVE_MALLOC_H */
30 #include <limits.h>
31
32 #include "tre-internal.h"
33 #include "tre.h"
34 #include "xmalloc.h"
35
36 /* Fills the POSIX.2 regmatch_t array according to the TNFA tag and match
37 endpoint values. */
38 void
tre_fill_pmatch(size_t nmatch,regmatch_t pmatch[],int cflags,const tre_tnfa_t * tnfa,int * tags,int match_eo)39 tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
40 const tre_tnfa_t *tnfa, int *tags, int match_eo)
41 {
42 tre_submatch_data_t *submatch_data;
43 unsigned int i, j;
44 int *parents;
45
46 if (cflags & REG_NOSUB)
47 return;
48
49 i = 0;
50 if (match_eo >= 0)
51 {
52 /* Construct submatch offsets from the tags. */
53 DPRINT(("end tag = t%d = %d\n", tnfa->end_tag, match_eo));
54 submatch_data = tnfa->submatch_data;
55 while (i < tnfa->num_submatches && i < nmatch)
56 {
57 if (submatch_data[i].so_tag == tnfa->end_tag)
58 pmatch[i].rm_so = match_eo;
59 else
60 pmatch[i].rm_so = tags[submatch_data[i].so_tag];
61
62 if (submatch_data[i].eo_tag == tnfa->end_tag)
63 pmatch[i].rm_eo = match_eo;
64 else
65 pmatch[i].rm_eo = tags[submatch_data[i].eo_tag];
66
67 /* If either of the endpoints were not used, this submatch
68 was not part of the match. */
69 if (pmatch[i].rm_so == -1 || pmatch[i].rm_eo == -1)
70 pmatch[i].rm_so = pmatch[i].rm_eo = -1;
71
72 DPRINT(("pmatch[%d] = {t%d = %d, t%d = %d}\n", i,
73 submatch_data[i].so_tag, pmatch[i].rm_so,
74 submatch_data[i].eo_tag, pmatch[i].rm_eo));
75 i++;
76 }
77 /* Reset all submatches that are not within all of their parent
78 submatches. */
79 i = 0;
80 while (i < tnfa->num_submatches && i < nmatch)
81 {
82 if (pmatch[i].rm_eo == -1)
83 assert(pmatch[i].rm_so == -1);
84 assert(pmatch[i].rm_so <= pmatch[i].rm_eo);
85
86 parents = submatch_data[i].parents;
87 if (parents != NULL)
88 for (j = 0; parents[j] >= 0; j++)
89 {
90 DPRINT(("pmatch[%d] parent %d\n", i, parents[j]));
91 if (pmatch[i].rm_so < pmatch[parents[j]].rm_so
92 || pmatch[i].rm_eo > pmatch[parents[j]].rm_eo)
93 pmatch[i].rm_so = pmatch[i].rm_eo = -1;
94 }
95 i++;
96 }
97 }
98
99 while (i < nmatch)
100 {
101 pmatch[i].rm_so = -1;
102 pmatch[i].rm_eo = -1;
103 i++;
104 }
105 }
106
107
108 /*
109 Wrapper functions for POSIX compatible regexp matching.
110 */
111
112 int
tre_have_backrefs(const regex_t * preg)113 tre_have_backrefs(const regex_t *preg)
114 {
115 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
116 return tnfa->have_backrefs;
117 }
118
119 int
tre_have_approx(const regex_t * preg)120 tre_have_approx(const regex_t *preg)
121 {
122 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
123 return tnfa->have_approx;
124 }
125
126 static int
tre_match(const tre_tnfa_t * tnfa,const void * string,size_t len,tre_str_type_t type,size_t nmatch,regmatch_t pmatch[],int eflags)127 tre_match(const tre_tnfa_t *tnfa, const void *string, size_t len,
128 tre_str_type_t type, size_t nmatch, regmatch_t pmatch[],
129 int eflags)
130 {
131 reg_errcode_t status;
132 int *tags = NULL, eo;
133 if (tnfa->cflags & REG_NOSUB) nmatch = 0;
134 if (tnfa->num_tags > 0 && nmatch > 0)
135 {
136 #ifdef TRE_USE_ALLOCA
137 tags = alloca(sizeof(*tags) * tnfa->num_tags);
138 #else /* !TRE_USE_ALLOCA */
139 tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
140 #endif /* !TRE_USE_ALLOCA */
141 if (tags == NULL)
142 return REG_ESPACE;
143 }
144
145 /* Dispatch to the appropriate matcher. */
146 if (tnfa->have_backrefs || eflags & REG_BACKTRACKING_MATCHER)
147 {
148 /* The regex has back references, use the backtracking matcher. */
149 if (type == STR_USER)
150 {
151 const tre_str_source *source = string;
152 if (source->rewind == NULL || source->compare == NULL)
153 /* The backtracking matcher requires rewind and compare
154 capabilities from the input stream. */
155 return REG_BADPAT;
156 }
157 status = tre_tnfa_run_backtrack(tnfa, string, (int)len, type,
158 tags, eflags, &eo);
159 }
160 #ifdef TRE_APPROX
161 else if (tnfa->have_approx || eflags & REG_APPROX_MATCHER)
162 {
163 /* The regex uses approximate matching, use the approximate matcher. */
164 regamatch_t match;
165 regaparams_t params;
166 tre_regaparams_default(¶ms);
167 params.max_err = 0;
168 params.max_cost = 0;
169 status = tre_tnfa_run_approx(tnfa, string, (int)len, type, tags,
170 &match, params, eflags, &eo);
171 }
172 #endif /* TRE_APPROX */
173 else
174 {
175 /* Exact matching, no back references, use the parallel matcher. */
176 status = tre_tnfa_run_parallel(tnfa, string, (int)len, type,
177 tags, eflags, &eo);
178 }
179
180 if (status == REG_OK)
181 /* A match was found, so fill the submatch registers. */
182 tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo);
183 #ifndef TRE_USE_ALLOCA
184 if (tags)
185 xfree(tags);
186 #endif /* !TRE_USE_ALLOCA */
187 return status;
188 }
189
190 int
tre_regnexec(const regex_t * preg,const char * str,size_t len,size_t nmatch,regmatch_t pmatch[],int eflags)191 tre_regnexec(const regex_t *preg, const char *str, size_t len,
192 size_t nmatch, regmatch_t pmatch[], int eflags)
193 {
194 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
195 /* CONSTCOND */
196 tre_str_type_t type = (TRE_MB_CUR_MAX == 1) ? STR_BYTE : STR_MBS;
197
198 return tre_match(tnfa, str, len, type, nmatch, pmatch, eflags);
199 }
200
201 int
tre_regexec(const regex_t * preg,const char * str,size_t nmatch,regmatch_t pmatch[],int eflags)202 tre_regexec(const regex_t *preg, const char *str,
203 size_t nmatch, regmatch_t pmatch[], int eflags)
204 {
205 size_t shift, len, i;
206 int ret;
207 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
208 regmatch_t *p;
209
210 if (eflags & REG_STARTEND) {
211 if (pmatch == NULL || pmatch->rm_so < 0 || pmatch->rm_eo < 0
212 || pmatch->rm_so > pmatch->rm_eo)
213 return REG_INVARG;
214 str += shift = pmatch->rm_so;
215 len = pmatch->rm_eo - pmatch->rm_so;
216 eflags &= ~REG_STARTEND;
217 } else {
218 shift = 0;
219 len = (size_t)-1;
220 }
221
222 ret = tre_regnexec(preg, str, len, nmatch, pmatch, eflags);
223
224 if (!ret && !(tnfa->cflags & REG_NOSUB) && len != (size_t)-1) {
225 for (i = nmatch, p = pmatch; i > 0; p++, i--) {
226 if (p->rm_so >= 0) p->rm_so += shift;
227 if (p->rm_eo >= 0) p->rm_eo += shift;
228 }
229 }
230
231 return ret;
232 }
233
234 #ifdef REG_USEBYTES
235 int
tre_regexecb(const regex_t * preg,const char * str,size_t nmatch,regmatch_t pmatch[],int eflags)236 tre_regexecb(const regex_t *preg, const char *str,
237 size_t nmatch, regmatch_t pmatch[], int eflags)
238 {
239 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
240
241 return tre_match(tnfa, str, (unsigned)-1, STR_BYTE, nmatch, pmatch, eflags);
242 }
243
244 int
tre_regnexecb(const regex_t * preg,const char * str,size_t len,size_t nmatch,regmatch_t pmatch[],int eflags)245 tre_regnexecb(const regex_t *preg, const char *str, size_t len,
246 size_t nmatch, regmatch_t pmatch[], int eflags)
247 {
248 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
249
250 return tre_match(tnfa, str, len, STR_BYTE, nmatch, pmatch, eflags);
251 }
252 #endif /* REG_USEBYTES */
253
254
255 #ifdef TRE_WCHAR
256
257 int
tre_regwnexec(const regex_t * preg,const wchar_t * str,size_t len,size_t nmatch,regmatch_t pmatch[],int eflags)258 tre_regwnexec(const regex_t *preg, const wchar_t *str, size_t len,
259 size_t nmatch, regmatch_t pmatch[], int eflags)
260 {
261 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
262 return tre_match(tnfa, str, len, STR_WIDE, nmatch, pmatch, eflags);
263 }
264
265 int
tre_regwexec(const regex_t * preg,const wchar_t * str,size_t nmatch,regmatch_t pmatch[],int eflags)266 tre_regwexec(const regex_t *preg, const wchar_t *str,
267 size_t nmatch, regmatch_t pmatch[], int eflags)
268 {
269 size_t shift, len, i;
270 int ret;
271 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
272 regmatch_t *p;
273
274 if (eflags & REG_STARTEND) {
275 if (pmatch == NULL || pmatch->rm_so < 0 || pmatch->rm_eo < 0
276 || pmatch->rm_so > pmatch->rm_eo)
277 return REG_INVARG;
278 str += shift = pmatch->rm_so;
279 len = pmatch->rm_eo - pmatch->rm_so;
280 eflags &= ~REG_STARTEND;
281 } else {
282 shift = 0;
283 len = (size_t)-1;
284 }
285
286 ret = tre_regwnexec(preg, str, len, nmatch, pmatch, eflags);
287
288 if (!ret && !(tnfa->cflags & REG_NOSUB) && len != (size_t)-1) {
289 for (i = nmatch, p = pmatch; i > 0; p++, i--) {
290 if (p->rm_so >= 0) p->rm_so += shift;
291 if (p->rm_eo >= 0) p->rm_eo += shift;
292 }
293 }
294
295 return ret;
296 }
297
298 #endif /* TRE_WCHAR */
299
300 int
tre_reguexec(const regex_t * preg,const tre_str_source * str,size_t nmatch,regmatch_t pmatch[],int eflags)301 tre_reguexec(const regex_t *preg, const tre_str_source *str,
302 size_t nmatch, regmatch_t pmatch[], int eflags)
303 {
304 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
305 return tre_match(tnfa, str, (unsigned)-1, STR_USER, nmatch, pmatch, eflags);
306 }
307
308
309 #ifdef TRE_APPROX
310
311 /*
312 Wrapper functions for approximate regexp matching.
313 */
314
315 static int
tre_match_approx(const tre_tnfa_t * tnfa,const void * string,size_t len,tre_str_type_t type,regamatch_t * match,regaparams_t params,int eflags)316 tre_match_approx(const tre_tnfa_t *tnfa, const void *string, size_t len,
317 tre_str_type_t type, regamatch_t *match, regaparams_t params,
318 int eflags)
319 {
320 reg_errcode_t status;
321 int *tags = NULL, eo;
322 size_t nmatch;
323
324 if (tnfa->cflags & REG_NOSUB)
325 nmatch = 0;
326 else
327 nmatch = match->nmatch;
328
329 /* If the regexp does not use approximate matching features, the
330 maximum cost is zero, and the approximate matcher isn't forced,
331 use the exact matcher instead. */
332 if (params.max_cost == 0 && !tnfa->have_approx
333 && !(eflags & REG_APPROX_MATCHER))
334 return tre_match(tnfa, string, len, type, nmatch, match->pmatch,
335 eflags);
336
337 /* Back references are not supported by the approximate matcher. */
338 if (tnfa->have_backrefs)
339 return REG_BADPAT;
340
341 if (tnfa->num_tags > 0 && nmatch > 0)
342 {
343 #if TRE_USE_ALLOCA
344 tags = alloca(sizeof(*tags) * tnfa->num_tags);
345 #else /* !TRE_USE_ALLOCA */
346 tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
347 #endif /* !TRE_USE_ALLOCA */
348 if (tags == NULL)
349 return REG_ESPACE;
350 }
351 status = tre_tnfa_run_approx(tnfa, string, (int)len, type, tags,
352 match, params, eflags, &eo);
353 if (status == REG_OK)
354 tre_fill_pmatch(nmatch, match->pmatch, tnfa->cflags, tnfa, tags, eo);
355 #ifndef TRE_USE_ALLOCA
356 if (tags)
357 xfree(tags);
358 #endif /* !TRE_USE_ALLOCA */
359 return status;
360 }
361
362 int
tre_reganexec(const regex_t * preg,const char * str,size_t len,regamatch_t * match,regaparams_t params,int eflags)363 tre_reganexec(const regex_t *preg, const char *str, size_t len,
364 regamatch_t *match, regaparams_t params, int eflags)
365 {
366 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
367 tre_str_type_t type = (TRE_MB_CUR_MAX == 1) ? STR_BYTE : STR_MBS;
368
369 return tre_match_approx(tnfa, str, len, type, match, params, eflags);
370 }
371
372 int
tre_regaexec(const regex_t * preg,const char * str,regamatch_t * match,regaparams_t params,int eflags)373 tre_regaexec(const regex_t *preg, const char *str,
374 regamatch_t *match, regaparams_t params, int eflags)
375 {
376 return tre_reganexec(preg, str, (unsigned)-1, match, params, eflags);
377 }
378
379 #ifdef REG_USEBYTES
380 int
tre_regaexecb(const regex_t * preg,const char * str,regamatch_t * match,regaparams_t params,int eflags)381 tre_regaexecb(const regex_t *preg, const char *str,
382 regamatch_t *match, regaparams_t params, int eflags)
383 {
384 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
385
386 return tre_match_approx(tnfa, str, (unsigned)-1, STR_BYTE,
387 match, params, eflags);
388 }
389 #endif /* REG_USEBYTES */
390
391 #ifdef TRE_WCHAR
392
393 int
tre_regawnexec(const regex_t * preg,const wchar_t * str,size_t len,regamatch_t * match,regaparams_t params,int eflags)394 tre_regawnexec(const regex_t *preg, const wchar_t *str, size_t len,
395 regamatch_t *match, regaparams_t params, int eflags)
396 {
397 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
398 return tre_match_approx(tnfa, str, len, STR_WIDE,
399 match, params, eflags);
400 }
401
402 int
tre_regawexec(const regex_t * preg,const wchar_t * str,regamatch_t * match,regaparams_t params,int eflags)403 tre_regawexec(const regex_t *preg, const wchar_t *str,
404 regamatch_t *match, regaparams_t params, int eflags)
405 {
406 return tre_regawnexec(preg, str, (unsigned)-1, match, params, eflags);
407 }
408
409 #endif /* TRE_WCHAR */
410
411 void
tre_regaparams_default(regaparams_t * params)412 tre_regaparams_default(regaparams_t *params)
413 {
414 memset(params, 0, sizeof(*params));
415 params->cost_ins = 1;
416 params->cost_del = 1;
417 params->cost_subst = 1;
418 params->max_cost = INT_MAX;
419 params->max_ins = INT_MAX;
420 params->max_del = INT_MAX;
421 params->max_subst = INT_MAX;
422 params->max_err = INT_MAX;
423 }
424
425 #endif /* TRE_APPROX */
426
427 /* EOF */
428