1 /* 2 tre_regexec.c - TRE POSIX compatible matching functions (and more). 3 4 This software is released under a BSD-style license. 5 See the file LICENSE for details and copyright. 6 7 */ 8 9 #ifdef HAVE_CONFIG_H 10 #include <config.h> 11 #endif /* HAVE_CONFIG_H */ 12 13 #include "tre-alloca.h" 14 15 #include <assert.h> 16 #include <stdlib.h> 17 #include <string.h> 18 #ifdef HAVE_WCHAR_H 19 #include <wchar.h> 20 #endif /* HAVE_WCHAR_H */ 21 #ifdef HAVE_WCTYPE_H 22 #include <wctype.h> 23 #endif /* HAVE_WCTYPE_H */ 24 #ifndef TRE_WCHAR 25 #include <ctype.h> 26 #endif /* !TRE_WCHAR */ 27 #ifdef HAVE_MALLOC_H 28 #include <malloc.h> 29 #endif /* HAVE_MALLOC_H */ 30 #include <limits.h> 31 32 #include "tre-internal.h" 33 #include "tre.h" 34 #include "xmalloc.h" 35 36 /* Fills the POSIX.2 regmatch_t array according to the TNFA tag and match 37 endpoint values. */ 38 void 39 tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags, 40 const tre_tnfa_t *tnfa, int *tags, int match_eo) 41 { 42 tre_submatch_data_t *submatch_data; 43 unsigned int i, j; 44 int *parents; 45 46 i = 0; 47 if (match_eo >= 0 && !(cflags & REG_NOSUB)) 48 { 49 /* Construct submatch offsets from the tags. */ 50 DPRINT(("end tag = t%d = %d\n", tnfa->end_tag, match_eo)); 51 submatch_data = tnfa->submatch_data; 52 while (i < tnfa->num_submatches && i < nmatch) 53 { 54 if (submatch_data[i].so_tag == tnfa->end_tag) 55 pmatch[i].rm_so = match_eo; 56 else 57 pmatch[i].rm_so = tags[submatch_data[i].so_tag]; 58 59 if (submatch_data[i].eo_tag == tnfa->end_tag) 60 pmatch[i].rm_eo = match_eo; 61 else 62 pmatch[i].rm_eo = tags[submatch_data[i].eo_tag]; 63 64 /* If either of the endpoints were not used, this submatch 65 was not part of the match. */ 66 if (pmatch[i].rm_so == -1 || pmatch[i].rm_eo == -1) 67 pmatch[i].rm_so = pmatch[i].rm_eo = -1; 68 69 DPRINT(("pmatch[%d] = {t%d = %d, t%d = %d}\n", i, 70 submatch_data[i].so_tag, pmatch[i].rm_so, 71 submatch_data[i].eo_tag, pmatch[i].rm_eo)); 72 i++; 73 } 74 /* Reset all submatches that are not within all of their parent 75 submatches. */ 76 i = 0; 77 while (i < tnfa->num_submatches && i < nmatch) 78 { 79 if (pmatch[i].rm_eo == -1) 80 assert(pmatch[i].rm_so == -1); 81 assert(pmatch[i].rm_so <= pmatch[i].rm_eo); 82 83 parents = submatch_data[i].parents; 84 if (parents != NULL) 85 for (j = 0; parents[j] >= 0; j++) 86 { 87 DPRINT(("pmatch[%d] parent %d\n", i, parents[j])); 88 if (pmatch[i].rm_so < pmatch[parents[j]].rm_so 89 || pmatch[i].rm_eo > pmatch[parents[j]].rm_eo) 90 pmatch[i].rm_so = pmatch[i].rm_eo = -1; 91 } 92 i++; 93 } 94 } 95 96 while (i < nmatch) 97 { 98 pmatch[i].rm_so = -1; 99 pmatch[i].rm_eo = -1; 100 i++; 101 } 102 } 103 104 105 /* 106 Wrapper functions for POSIX compatible regexp matching. 107 */ 108 109 int 110 tre_have_backrefs(const regex_t *preg) 111 { 112 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; 113 return tnfa->have_backrefs; 114 } 115 116 int 117 tre_have_approx(const regex_t *preg) 118 { 119 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; 120 return tnfa->have_approx; 121 } 122 123 static int 124 tre_match(const tre_tnfa_t *tnfa, const void *string, size_t len, 125 tre_str_type_t type, size_t nmatch, regmatch_t pmatch[], 126 int eflags) 127 { 128 reg_errcode_t status; 129 int *tags = NULL, eo; 130 if (tnfa->num_tags > 0 && nmatch > 0) 131 { 132 #ifdef TRE_USE_ALLOCA 133 tags = alloca(sizeof(*tags) * tnfa->num_tags); 134 #else /* !TRE_USE_ALLOCA */ 135 tags = xmalloc(sizeof(*tags) * tnfa->num_tags); 136 #endif /* !TRE_USE_ALLOCA */ 137 if (tags == NULL) 138 return REG_ESPACE; 139 } 140 141 /* Dispatch to the appropriate matcher. */ 142 if (tnfa->have_backrefs || eflags & REG_BACKTRACKING_MATCHER) 143 { 144 /* The regex has back references, use the backtracking matcher. */ 145 if (type == STR_USER) 146 { 147 const tre_str_source *source = string; 148 if (source->rewind == NULL || source->compare == NULL) 149 /* The backtracking matcher requires rewind and compare 150 capabilities from the input stream. */ 151 return REG_BADPAT; 152 } 153 status = tre_tnfa_run_backtrack(tnfa, string, (int)len, type, 154 tags, eflags, &eo); 155 } 156 #ifdef TRE_APPROX 157 else if (tnfa->have_approx || eflags & REG_APPROX_MATCHER) 158 { 159 /* The regex uses approximate matching, use the approximate matcher. */ 160 regamatch_t match; 161 regaparams_t params; 162 tre_regaparams_default(¶ms); 163 params.max_err = 0; 164 params.max_cost = 0; 165 status = tre_tnfa_run_approx(tnfa, string, (int)len, type, tags, 166 &match, params, eflags, &eo); 167 } 168 #endif /* TRE_APPROX */ 169 else 170 { 171 /* Exact matching, no back references, use the parallel matcher. */ 172 status = tre_tnfa_run_parallel(tnfa, string, (int)len, type, 173 tags, eflags, &eo); 174 } 175 176 if (status == REG_OK) 177 /* A match was found, so fill the submatch registers. */ 178 tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo); 179 #ifndef TRE_USE_ALLOCA 180 if (tags) 181 xfree(tags); 182 #endif /* !TRE_USE_ALLOCA */ 183 return status; 184 } 185 186 int 187 tre_regnexec(const regex_t *preg, const char *str, size_t len, 188 size_t nmatch, regmatch_t pmatch[], int eflags) 189 { 190 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; 191 /* CONSTCOND */ 192 tre_str_type_t type = (TRE_MB_CUR_MAX == 1) ? STR_BYTE : STR_MBS; 193 194 return tre_match(tnfa, str, len, type, nmatch, pmatch, eflags); 195 } 196 197 int 198 tre_regexec(const regex_t *preg, const char *str, 199 size_t nmatch, regmatch_t pmatch[], int eflags) 200 { 201 size_t shift, len, i; 202 int ret; 203 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; 204 regmatch_t *p; 205 206 if (eflags & REG_STARTEND) { 207 if (pmatch == NULL || pmatch->rm_so < 0 || pmatch->rm_eo < 0 208 || pmatch->rm_so > pmatch->rm_eo) 209 return REG_INVARG; 210 str += shift = pmatch->rm_so; 211 len = pmatch->rm_eo - pmatch->rm_so; 212 eflags &= ~REG_STARTEND; 213 } else { 214 shift = 0; 215 len = (size_t)-1; 216 } 217 218 ret = tre_regnexec(preg, str, len, nmatch, pmatch, eflags); 219 220 if (!ret && !(tnfa->cflags & REG_NOSUB) && len != (size_t)-1) { 221 for (i = nmatch, p = pmatch; i > 0; p++, i--) { 222 if (p->rm_so >= 0) p->rm_so += shift; 223 if (p->rm_eo >= 0) p->rm_eo += shift; 224 } 225 } 226 227 return ret; 228 } 229 230 231 #ifdef TRE_WCHAR 232 233 int 234 tre_regwnexec(const regex_t *preg, const wchar_t *str, size_t len, 235 size_t nmatch, regmatch_t pmatch[], int eflags) 236 { 237 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; 238 return tre_match(tnfa, str, len, STR_WIDE, nmatch, pmatch, eflags); 239 } 240 241 int 242 tre_regwexec(const regex_t *preg, const wchar_t *str, 243 size_t nmatch, regmatch_t pmatch[], int eflags) 244 { 245 size_t shift, len, i; 246 int ret; 247 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; 248 regmatch_t *p; 249 250 if (eflags & REG_STARTEND) { 251 if (pmatch == NULL || pmatch->rm_so < 0 || pmatch->rm_eo < 0 252 || pmatch->rm_so > pmatch->rm_eo) 253 return REG_INVARG; 254 str += shift = pmatch->rm_so; 255 len = pmatch->rm_eo - pmatch->rm_so; 256 eflags &= ~REG_STARTEND; 257 } else { 258 shift = 0; 259 len = (size_t)-1; 260 } 261 262 ret = tre_regwnexec(preg, str, len, nmatch, pmatch, eflags); 263 264 if (!ret && !(tnfa->cflags & REG_NOSUB) && len != (size_t)-1) { 265 for (i = nmatch, p = pmatch; i > 0; p++, i--) { 266 if (p->rm_so >= 0) p->rm_so += shift; 267 if (p->rm_eo >= 0) p->rm_eo += shift; 268 } 269 } 270 271 return ret; 272 } 273 274 #endif /* TRE_WCHAR */ 275 276 int 277 tre_reguexec(const regex_t *preg, const tre_str_source *str, 278 size_t nmatch, regmatch_t pmatch[], int eflags) 279 { 280 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; 281 return tre_match(tnfa, str, (unsigned)-1, STR_USER, nmatch, pmatch, eflags); 282 } 283 284 285 #ifdef TRE_APPROX 286 287 /* 288 Wrapper functions for approximate regexp matching. 289 */ 290 291 static int 292 tre_match_approx(const tre_tnfa_t *tnfa, const void *string, size_t len, 293 tre_str_type_t type, regamatch_t *match, regaparams_t params, 294 int eflags) 295 { 296 reg_errcode_t status; 297 int *tags = NULL, eo; 298 299 /* If the regexp does not use approximate matching features, the 300 maximum cost is zero, and the approximate matcher isn't forced, 301 use the exact matcher instead. */ 302 if (params.max_cost == 0 && !tnfa->have_approx 303 && !(eflags & REG_APPROX_MATCHER)) 304 return tre_match(tnfa, string, len, type, match->nmatch, match->pmatch, 305 eflags); 306 307 /* Back references are not supported by the approximate matcher. */ 308 if (tnfa->have_backrefs) 309 return REG_BADPAT; 310 311 if (tnfa->num_tags > 0 && match->nmatch > 0) 312 { 313 #if TRE_USE_ALLOCA 314 tags = alloca(sizeof(*tags) * tnfa->num_tags); 315 #else /* !TRE_USE_ALLOCA */ 316 tags = xmalloc(sizeof(*tags) * tnfa->num_tags); 317 #endif /* !TRE_USE_ALLOCA */ 318 if (tags == NULL) 319 return REG_ESPACE; 320 } 321 status = tre_tnfa_run_approx(tnfa, string, (int)len, type, tags, 322 match, params, eflags, &eo); 323 if (status == REG_OK) 324 tre_fill_pmatch(match->nmatch, match->pmatch, tnfa->cflags, tnfa, tags, eo); 325 #ifndef TRE_USE_ALLOCA 326 if (tags) 327 xfree(tags); 328 #endif /* !TRE_USE_ALLOCA */ 329 return status; 330 } 331 332 int 333 tre_reganexec(const regex_t *preg, const char *str, size_t len, 334 regamatch_t *match, regaparams_t params, int eflags) 335 { 336 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; 337 tre_str_type_t type = (TRE_MB_CUR_MAX == 1) ? STR_BYTE : STR_MBS; 338 339 return tre_match_approx(tnfa, str, len, type, match, params, eflags); 340 } 341 342 int 343 tre_regaexec(const regex_t *preg, const char *str, 344 regamatch_t *match, regaparams_t params, int eflags) 345 { 346 return tre_reganexec(preg, str, (unsigned)-1, match, params, eflags); 347 } 348 349 #ifdef TRE_WCHAR 350 351 int 352 tre_regawnexec(const regex_t *preg, const wchar_t *str, size_t len, 353 regamatch_t *match, regaparams_t params, int eflags) 354 { 355 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; 356 return tre_match_approx(tnfa, str, len, STR_WIDE, 357 match, params, eflags); 358 } 359 360 int 361 tre_regawexec(const regex_t *preg, const wchar_t *str, 362 regamatch_t *match, regaparams_t params, int eflags) 363 { 364 return tre_regawnexec(preg, str, (unsigned)-1, match, params, eflags); 365 } 366 367 #endif /* TRE_WCHAR */ 368 369 void 370 tre_regaparams_default(regaparams_t *params) 371 { 372 memset(params, 0, sizeof(*params)); 373 params->cost_ins = 1; 374 params->cost_del = 1; 375 params->cost_subst = 1; 376 params->max_cost = INT_MAX; 377 params->max_ins = INT_MAX; 378 params->max_del = INT_MAX; 379 params->max_subst = INT_MAX; 380 params->max_err = INT_MAX; 381 } 382 383 #endif /* TRE_APPROX */ 384 385 /* EOF */ 386