1 /* 2 tre_regexec.c - TRE POSIX compatible matching functions (and more). 3 4 This software is released under a BSD-style license. 5 See the file LICENSE for details and copyright. 6 7 */ 8 9 #ifdef HAVE_CONFIG_H 10 #include <config.h> 11 #endif /* HAVE_CONFIG_H */ 12 13 #include "tre-alloca.h" 14 15 #include <assert.h> 16 #include <stdlib.h> 17 #include <string.h> 18 #ifdef HAVE_WCHAR_H 19 #include <wchar.h> 20 #endif /* HAVE_WCHAR_H */ 21 #ifdef HAVE_WCTYPE_H 22 #include <wctype.h> 23 #endif /* HAVE_WCTYPE_H */ 24 #ifndef TRE_WCHAR 25 #include <ctype.h> 26 #endif /* !TRE_WCHAR */ 27 #ifdef HAVE_MALLOC_H 28 #include <malloc.h> 29 #endif /* HAVE_MALLOC_H */ 30 #include <limits.h> 31 32 #include "tre-internal.h" 33 #include "tre.h" 34 #include "xmalloc.h" 35 36 /* Fills the POSIX.2 regmatch_t array according to the TNFA tag and match 37 endpoint values. */ 38 void 39 tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags, 40 const tre_tnfa_t *tnfa, int *tags, int match_eo) 41 { 42 tre_submatch_data_t *submatch_data; 43 unsigned int i, j; 44 int *parents; 45 46 i = 0; 47 if (match_eo >= 0 && !(cflags & REG_NOSUB)) 48 { 49 /* Construct submatch offsets from the tags. */ 50 DPRINT(("end tag = t%d = %d\n", tnfa->end_tag, match_eo)); 51 submatch_data = tnfa->submatch_data; 52 while (i < tnfa->num_submatches && i < nmatch) 53 { 54 if (submatch_data[i].so_tag == tnfa->end_tag) 55 pmatch[i].rm_so = match_eo; 56 else 57 pmatch[i].rm_so = tags[submatch_data[i].so_tag]; 58 59 if (submatch_data[i].eo_tag == tnfa->end_tag) 60 pmatch[i].rm_eo = match_eo; 61 else 62 pmatch[i].rm_eo = tags[submatch_data[i].eo_tag]; 63 64 /* If either of the endpoints were not used, this submatch 65 was not part of the match. */ 66 if (pmatch[i].rm_so == -1 || pmatch[i].rm_eo == -1) 67 pmatch[i].rm_so = pmatch[i].rm_eo = -1; 68 69 DPRINT(("pmatch[%d] = {t%d = %d, t%d = %d}\n", i, 70 submatch_data[i].so_tag, pmatch[i].rm_so, 71 submatch_data[i].eo_tag, pmatch[i].rm_eo)); 72 i++; 73 } 74 /* Reset all submatches that are not within all of their parent 75 submatches. */ 76 i = 0; 77 while (i < tnfa->num_submatches && i < nmatch) 78 { 79 if (pmatch[i].rm_eo == -1) 80 assert(pmatch[i].rm_so == -1); 81 assert(pmatch[i].rm_so <= pmatch[i].rm_eo); 82 83 parents = submatch_data[i].parents; 84 if (parents != NULL) 85 for (j = 0; parents[j] >= 0; j++) 86 { 87 DPRINT(("pmatch[%d] parent %d\n", i, parents[j])); 88 if (pmatch[i].rm_so < pmatch[parents[j]].rm_so 89 || pmatch[i].rm_eo > pmatch[parents[j]].rm_eo) 90 pmatch[i].rm_so = pmatch[i].rm_eo = -1; 91 } 92 i++; 93 } 94 } 95 96 while (i < nmatch) 97 { 98 pmatch[i].rm_so = -1; 99 pmatch[i].rm_eo = -1; 100 i++; 101 } 102 } 103 104 105 /* 106 Wrapper functions for POSIX compatible regexp matching. 107 */ 108 109 int 110 tre_have_backrefs(const regex_t *preg) 111 { 112 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; 113 return tnfa->have_backrefs; 114 } 115 116 int 117 tre_have_approx(const regex_t *preg) 118 { 119 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; 120 return tnfa->have_approx; 121 } 122 123 static int 124 tre_match(const tre_tnfa_t *tnfa, const void *string, size_t len, 125 tre_str_type_t type, size_t nmatch, regmatch_t pmatch[], 126 int eflags) 127 { 128 reg_errcode_t status; 129 int *tags = NULL, eo; 130 if (tnfa->num_tags > 0 && nmatch > 0) 131 { 132 #ifdef TRE_USE_ALLOCA 133 tags = alloca(sizeof(*tags) * tnfa->num_tags); 134 #else /* !TRE_USE_ALLOCA */ 135 tags = xmalloc(sizeof(*tags) * tnfa->num_tags); 136 #endif /* !TRE_USE_ALLOCA */ 137 if (tags == NULL) 138 return REG_ESPACE; 139 } 140 141 /* Dispatch to the appropriate matcher. */ 142 if (tnfa->have_backrefs || eflags & REG_BACKTRACKING_MATCHER) 143 { 144 /* The regex has back references, use the backtracking matcher. */ 145 if (type == STR_USER) 146 { 147 const tre_str_source *source = string; 148 if (source->rewind == NULL || source->compare == NULL) 149 /* The backtracking matcher requires rewind and compare 150 capabilities from the input stream. */ 151 return REG_BADPAT; 152 } 153 status = tre_tnfa_run_backtrack(tnfa, string, (int)len, type, 154 tags, eflags, &eo); 155 } 156 #ifdef TRE_APPROX 157 else if (tnfa->have_approx || eflags & REG_APPROX_MATCHER) 158 { 159 /* The regex uses approximate matching, use the approximate matcher. */ 160 regamatch_t match; 161 regaparams_t params; 162 tre_regaparams_default(¶ms); 163 params.max_err = 0; 164 params.max_cost = 0; 165 status = tre_tnfa_run_approx(tnfa, string, (int)len, type, tags, 166 &match, params, eflags, &eo); 167 } 168 #endif /* TRE_APPROX */ 169 else 170 { 171 /* Exact matching, no back references, use the parallel matcher. */ 172 status = tre_tnfa_run_parallel(tnfa, string, (int)len, type, 173 tags, eflags, &eo); 174 } 175 176 if (status == REG_OK) 177 /* A match was found, so fill the submatch registers. */ 178 tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo); 179 #ifndef TRE_USE_ALLOCA 180 if (tags) 181 xfree(tags); 182 #endif /* !TRE_USE_ALLOCA */ 183 return status; 184 } 185 186 int 187 tre_regnexec(const regex_t *preg, const char *str, size_t len, 188 size_t nmatch, regmatch_t pmatch[], int eflags) 189 { 190 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; 191 /* CONSTCOND */ 192 tre_str_type_t type = (TRE_MB_CUR_MAX == 1) ? STR_BYTE : STR_MBS; 193 194 return tre_match(tnfa, str, len, type, nmatch, pmatch, eflags); 195 } 196 197 int 198 tre_regexec(const regex_t *preg, const char *str, 199 size_t nmatch, regmatch_t pmatch[], int eflags) 200 { 201 const char *newstr; 202 unsigned newflags; 203 size_t newlen; 204 205 if (eflags & REG_STARTEND) { 206 /* LINTED */ 207 newstr = &str[pmatch[0].rm_so]; 208 /* LINTED */ 209 newlen = pmatch[0].rm_eo; 210 newflags = (unsigned)(eflags & ~REG_STARTEND); 211 } else { 212 newstr = str; 213 newlen = (size_t)-1; 214 newflags = (unsigned)eflags; 215 } 216 return tre_regnexec(preg, newstr, newlen, nmatch, pmatch, (int)newflags); 217 } 218 219 220 #ifdef TRE_WCHAR 221 222 int 223 tre_regwnexec(const regex_t *preg, const wchar_t *str, size_t len, 224 size_t nmatch, regmatch_t pmatch[], int eflags) 225 { 226 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; 227 return tre_match(tnfa, str, len, STR_WIDE, nmatch, pmatch, eflags); 228 } 229 230 int 231 tre_regwexec(const regex_t *preg, const wchar_t *str, 232 size_t nmatch, regmatch_t pmatch[], int eflags) 233 { 234 return tre_regwnexec(preg, str, (unsigned)-1, nmatch, pmatch, eflags); 235 } 236 237 #endif /* TRE_WCHAR */ 238 239 int 240 tre_reguexec(const regex_t *preg, const tre_str_source *str, 241 size_t nmatch, regmatch_t pmatch[], int eflags) 242 { 243 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; 244 return tre_match(tnfa, str, (unsigned)-1, STR_USER, nmatch, pmatch, eflags); 245 } 246 247 248 #ifdef TRE_APPROX 249 250 /* 251 Wrapper functions for approximate regexp matching. 252 */ 253 254 static int 255 tre_match_approx(const tre_tnfa_t *tnfa, const void *string, size_t len, 256 tre_str_type_t type, regamatch_t *match, regaparams_t params, 257 int eflags) 258 { 259 reg_errcode_t status; 260 int *tags = NULL, eo; 261 262 /* If the regexp does not use approximate matching features, the 263 maximum cost is zero, and the approximate matcher isn't forced, 264 use the exact matcher instead. */ 265 if (params.max_cost == 0 && !tnfa->have_approx 266 && !(eflags & REG_APPROX_MATCHER)) 267 return tre_match(tnfa, string, len, type, match->nmatch, match->pmatch, 268 eflags); 269 270 /* Back references are not supported by the approximate matcher. */ 271 if (tnfa->have_backrefs) 272 return REG_BADPAT; 273 274 if (tnfa->num_tags > 0 && match->nmatch > 0) 275 { 276 #if TRE_USE_ALLOCA 277 tags = alloca(sizeof(*tags) * tnfa->num_tags); 278 #else /* !TRE_USE_ALLOCA */ 279 tags = xmalloc(sizeof(*tags) * tnfa->num_tags); 280 #endif /* !TRE_USE_ALLOCA */ 281 if (tags == NULL) 282 return REG_ESPACE; 283 } 284 status = tre_tnfa_run_approx(tnfa, string, (int)len, type, tags, 285 match, params, eflags, &eo); 286 if (status == REG_OK) 287 tre_fill_pmatch(match->nmatch, match->pmatch, tnfa->cflags, tnfa, tags, eo); 288 #ifndef TRE_USE_ALLOCA 289 if (tags) 290 xfree(tags); 291 #endif /* !TRE_USE_ALLOCA */ 292 return status; 293 } 294 295 int 296 tre_reganexec(const regex_t *preg, const char *str, size_t len, 297 regamatch_t *match, regaparams_t params, int eflags) 298 { 299 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; 300 tre_str_type_t type = (TRE_MB_CUR_MAX == 1) ? STR_BYTE : STR_MBS; 301 302 return tre_match_approx(tnfa, str, len, type, match, params, eflags); 303 } 304 305 int 306 tre_regaexec(const regex_t *preg, const char *str, 307 regamatch_t *match, regaparams_t params, int eflags) 308 { 309 return tre_reganexec(preg, str, (unsigned)-1, match, params, eflags); 310 } 311 312 #ifdef TRE_WCHAR 313 314 int 315 tre_regawnexec(const regex_t *preg, const wchar_t *str, size_t len, 316 regamatch_t *match, regaparams_t params, int eflags) 317 { 318 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; 319 return tre_match_approx(tnfa, str, len, STR_WIDE, 320 match, params, eflags); 321 } 322 323 int 324 tre_regawexec(const regex_t *preg, const wchar_t *str, 325 regamatch_t *match, regaparams_t params, int eflags) 326 { 327 return tre_regawnexec(preg, str, (unsigned)-1, match, params, eflags); 328 } 329 330 #endif /* TRE_WCHAR */ 331 332 void 333 tre_regaparams_default(regaparams_t *params) 334 { 335 memset(params, 0, sizeof(*params)); 336 params->cost_ins = 1; 337 params->cost_del = 1; 338 params->cost_subst = 1; 339 params->max_cost = INT_MAX; 340 params->max_ins = INT_MAX; 341 params->max_del = INT_MAX; 342 params->max_subst = INT_MAX; 343 params->max_err = INT_MAX; 344 } 345 346 #endif /* TRE_APPROX */ 347 348 /* EOF */ 349