1 /* 2 tre_regexec.c - TRE POSIX compatible matching functions (and more). 3 4 This software is released under a BSD-style license. 5 See the file LICENSE for details and copyright. 6 7 */ 8 9 #ifdef HAVE_CONFIG_H 10 #include <config.h> 11 #endif /* HAVE_CONFIG_H */ 12 13 #include "tre-alloca.h" 14 15 #include <assert.h> 16 #include <stdlib.h> 17 #include <string.h> 18 #ifdef HAVE_WCHAR_H 19 #include <wchar.h> 20 #endif /* HAVE_WCHAR_H */ 21 #ifdef HAVE_WCTYPE_H 22 #include <wctype.h> 23 #endif /* HAVE_WCTYPE_H */ 24 #ifndef TRE_WCHAR 25 #include <ctype.h> 26 #endif /* !TRE_WCHAR */ 27 #ifdef HAVE_MALLOC_H 28 #include <malloc.h> 29 #endif /* HAVE_MALLOC_H */ 30 #include <limits.h> 31 32 #include "tre-internal.h" 33 #include "tre.h" 34 #include "xmalloc.h" 35 36 /* Fills the POSIX.2 regmatch_t array according to the TNFA tag and match 37 endpoint values. */ 38 void 39 tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags, 40 const tre_tnfa_t *tnfa, int *tags, int match_eo) 41 { 42 tre_submatch_data_t *submatch_data; 43 unsigned int i, j; 44 int *parents; 45 46 if (cflags & REG_NOSUB) 47 return; 48 49 i = 0; 50 if (match_eo >= 0) 51 { 52 /* Construct submatch offsets from the tags. */ 53 DPRINT(("end tag = t%d = %d\n", tnfa->end_tag, match_eo)); 54 submatch_data = tnfa->submatch_data; 55 while (i < tnfa->num_submatches && i < nmatch) 56 { 57 if (submatch_data[i].so_tag == tnfa->end_tag) 58 pmatch[i].rm_so = match_eo; 59 else 60 pmatch[i].rm_so = tags[submatch_data[i].so_tag]; 61 62 if (submatch_data[i].eo_tag == tnfa->end_tag) 63 pmatch[i].rm_eo = match_eo; 64 else 65 pmatch[i].rm_eo = tags[submatch_data[i].eo_tag]; 66 67 /* If either of the endpoints were not used, this submatch 68 was not part of the match. */ 69 if (pmatch[i].rm_so == -1 || pmatch[i].rm_eo == -1) 70 pmatch[i].rm_so = pmatch[i].rm_eo = -1; 71 72 DPRINT(("pmatch[%d] = {t%d = %d, t%d = %d}\n", i, 73 submatch_data[i].so_tag, pmatch[i].rm_so, 74 submatch_data[i].eo_tag, pmatch[i].rm_eo)); 75 i++; 76 } 77 /* Reset all submatches that are not within all of their parent 78 submatches. */ 79 i = 0; 80 while (i < tnfa->num_submatches && i < nmatch) 81 { 82 if (pmatch[i].rm_eo == -1) 83 assert(pmatch[i].rm_so == -1); 84 assert(pmatch[i].rm_so <= pmatch[i].rm_eo); 85 86 parents = submatch_data[i].parents; 87 if (parents != NULL) 88 for (j = 0; parents[j] >= 0; j++) 89 { 90 DPRINT(("pmatch[%d] parent %d\n", i, parents[j])); 91 if (pmatch[i].rm_so < pmatch[parents[j]].rm_so 92 || pmatch[i].rm_eo > pmatch[parents[j]].rm_eo) 93 pmatch[i].rm_so = pmatch[i].rm_eo = -1; 94 } 95 i++; 96 } 97 } 98 99 while (i < nmatch) 100 { 101 pmatch[i].rm_so = -1; 102 pmatch[i].rm_eo = -1; 103 i++; 104 } 105 } 106 107 108 /* 109 Wrapper functions for POSIX compatible regexp matching. 110 */ 111 112 int 113 tre_have_backrefs(const regex_t *preg) 114 { 115 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; 116 return tnfa->have_backrefs; 117 } 118 119 int 120 tre_have_approx(const regex_t *preg) 121 { 122 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; 123 return tnfa->have_approx; 124 } 125 126 static int 127 tre_match(const tre_tnfa_t *tnfa, const void *string, size_t len, 128 tre_str_type_t type, size_t nmatch, regmatch_t pmatch[], 129 int eflags) 130 { 131 reg_errcode_t status; 132 int *tags = NULL, eo; 133 if (tnfa->cflags & REG_NOSUB) nmatch = 0; 134 if (tnfa->num_tags > 0 && nmatch > 0) 135 { 136 #ifdef TRE_USE_ALLOCA 137 tags = alloca(sizeof(*tags) * tnfa->num_tags); 138 #else /* !TRE_USE_ALLOCA */ 139 tags = xmalloc(sizeof(*tags) * tnfa->num_tags); 140 #endif /* !TRE_USE_ALLOCA */ 141 if (tags == NULL) 142 return REG_ESPACE; 143 } 144 145 /* Dispatch to the appropriate matcher. */ 146 if (tnfa->have_backrefs || eflags & REG_BACKTRACKING_MATCHER) 147 { 148 /* The regex has back references, use the backtracking matcher. */ 149 if (type == STR_USER) 150 { 151 const tre_str_source *source = string; 152 if (source->rewind == NULL || source->compare == NULL) 153 /* The backtracking matcher requires rewind and compare 154 capabilities from the input stream. */ 155 return REG_BADPAT; 156 } 157 status = tre_tnfa_run_backtrack(tnfa, string, (int)len, type, 158 tags, eflags, &eo); 159 } 160 #ifdef TRE_APPROX 161 else if (tnfa->have_approx || eflags & REG_APPROX_MATCHER) 162 { 163 /* The regex uses approximate matching, use the approximate matcher. */ 164 regamatch_t match; 165 regaparams_t params; 166 tre_regaparams_default(¶ms); 167 params.max_err = 0; 168 params.max_cost = 0; 169 status = tre_tnfa_run_approx(tnfa, string, (int)len, type, tags, 170 &match, params, eflags, &eo); 171 } 172 #endif /* TRE_APPROX */ 173 else 174 { 175 /* Exact matching, no back references, use the parallel matcher. */ 176 status = tre_tnfa_run_parallel(tnfa, string, (int)len, type, 177 tags, eflags, &eo); 178 } 179 180 if (status == REG_OK) 181 /* A match was found, so fill the submatch registers. */ 182 tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo); 183 #ifndef TRE_USE_ALLOCA 184 if (tags) 185 xfree(tags); 186 #endif /* !TRE_USE_ALLOCA */ 187 return status; 188 } 189 190 int 191 tre_regnexec(const regex_t *preg, const char *str, size_t len, 192 size_t nmatch, regmatch_t pmatch[], int eflags) 193 { 194 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; 195 /* CONSTCOND */ 196 tre_str_type_t type = (TRE_MB_CUR_MAX == 1) ? STR_BYTE : STR_MBS; 197 198 return tre_match(tnfa, str, len, type, nmatch, pmatch, eflags); 199 } 200 201 int 202 tre_regexec(const regex_t *preg, const char *str, 203 size_t nmatch, regmatch_t pmatch[], int eflags) 204 { 205 size_t shift, len, i; 206 int ret; 207 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; 208 regmatch_t *p; 209 210 if (eflags & REG_STARTEND) { 211 if (pmatch == NULL || pmatch->rm_so < 0 || pmatch->rm_eo < 0 212 || pmatch->rm_so > pmatch->rm_eo) 213 return REG_INVARG; 214 str += shift = pmatch->rm_so; 215 len = pmatch->rm_eo - pmatch->rm_so; 216 eflags &= ~REG_STARTEND; 217 } else { 218 shift = 0; 219 len = (size_t)-1; 220 } 221 222 ret = tre_regnexec(preg, str, len, nmatch, pmatch, eflags); 223 224 if (!ret && !(tnfa->cflags & REG_NOSUB) && len != (size_t)-1) { 225 for (i = nmatch, p = pmatch; i > 0; p++, i--) { 226 if (p->rm_so >= 0) p->rm_so += shift; 227 if (p->rm_eo >= 0) p->rm_eo += shift; 228 } 229 } 230 231 return ret; 232 } 233 234 #ifdef REG_USEBYTES 235 int 236 tre_regexecb(const regex_t *preg, const char *str, 237 size_t nmatch, regmatch_t pmatch[], int eflags) 238 { 239 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; 240 241 return tre_match(tnfa, str, (unsigned)-1, STR_BYTE, nmatch, pmatch, eflags); 242 } 243 244 int 245 tre_regnexecb(const regex_t *preg, const char *str, size_t len, 246 size_t nmatch, regmatch_t pmatch[], int eflags) 247 { 248 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; 249 250 return tre_match(tnfa, str, len, STR_BYTE, nmatch, pmatch, eflags); 251 } 252 #endif /* REG_USEBYTES */ 253 254 255 #ifdef TRE_WCHAR 256 257 int 258 tre_regwnexec(const regex_t *preg, const wchar_t *str, size_t len, 259 size_t nmatch, regmatch_t pmatch[], int eflags) 260 { 261 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; 262 return tre_match(tnfa, str, len, STR_WIDE, nmatch, pmatch, eflags); 263 } 264 265 int 266 tre_regwexec(const regex_t *preg, const wchar_t *str, 267 size_t nmatch, regmatch_t pmatch[], int eflags) 268 { 269 size_t shift, len, i; 270 int ret; 271 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; 272 regmatch_t *p; 273 274 if (eflags & REG_STARTEND) { 275 if (pmatch == NULL || pmatch->rm_so < 0 || pmatch->rm_eo < 0 276 || pmatch->rm_so > pmatch->rm_eo) 277 return REG_INVARG; 278 str += shift = pmatch->rm_so; 279 len = pmatch->rm_eo - pmatch->rm_so; 280 eflags &= ~REG_STARTEND; 281 } else { 282 shift = 0; 283 len = (size_t)-1; 284 } 285 286 ret = tre_regwnexec(preg, str, len, nmatch, pmatch, eflags); 287 288 if (!ret && !(tnfa->cflags & REG_NOSUB) && len != (size_t)-1) { 289 for (i = nmatch, p = pmatch; i > 0; p++, i--) { 290 if (p->rm_so >= 0) p->rm_so += shift; 291 if (p->rm_eo >= 0) p->rm_eo += shift; 292 } 293 } 294 295 return ret; 296 } 297 298 #endif /* TRE_WCHAR */ 299 300 int 301 tre_reguexec(const regex_t *preg, const tre_str_source *str, 302 size_t nmatch, regmatch_t pmatch[], int eflags) 303 { 304 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; 305 return tre_match(tnfa, str, (unsigned)-1, STR_USER, nmatch, pmatch, eflags); 306 } 307 308 309 #ifdef TRE_APPROX 310 311 /* 312 Wrapper functions for approximate regexp matching. 313 */ 314 315 static int 316 tre_match_approx(const tre_tnfa_t *tnfa, const void *string, size_t len, 317 tre_str_type_t type, regamatch_t *match, regaparams_t params, 318 int eflags) 319 { 320 reg_errcode_t status; 321 int *tags = NULL, eo; 322 size_t nmatch; 323 324 if (tnfa->cflags & REG_NOSUB) 325 nmatch = 0; 326 else 327 nmatch = match->nmatch; 328 329 /* If the regexp does not use approximate matching features, the 330 maximum cost is zero, and the approximate matcher isn't forced, 331 use the exact matcher instead. */ 332 if (params.max_cost == 0 && !tnfa->have_approx 333 && !(eflags & REG_APPROX_MATCHER)) 334 return tre_match(tnfa, string, len, type, nmatch, match->pmatch, 335 eflags); 336 337 /* Back references are not supported by the approximate matcher. */ 338 if (tnfa->have_backrefs) 339 return REG_BADPAT; 340 341 if (tnfa->num_tags > 0 && nmatch > 0) 342 { 343 #if TRE_USE_ALLOCA 344 tags = alloca(sizeof(*tags) * tnfa->num_tags); 345 #else /* !TRE_USE_ALLOCA */ 346 tags = xmalloc(sizeof(*tags) * tnfa->num_tags); 347 #endif /* !TRE_USE_ALLOCA */ 348 if (tags == NULL) 349 return REG_ESPACE; 350 } 351 status = tre_tnfa_run_approx(tnfa, string, (int)len, type, tags, 352 match, params, eflags, &eo); 353 if (status == REG_OK) 354 tre_fill_pmatch(nmatch, match->pmatch, tnfa->cflags, tnfa, tags, eo); 355 #ifndef TRE_USE_ALLOCA 356 if (tags) 357 xfree(tags); 358 #endif /* !TRE_USE_ALLOCA */ 359 return status; 360 } 361 362 int 363 tre_reganexec(const regex_t *preg, const char *str, size_t len, 364 regamatch_t *match, regaparams_t params, int eflags) 365 { 366 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; 367 tre_str_type_t type = (TRE_MB_CUR_MAX == 1) ? STR_BYTE : STR_MBS; 368 369 return tre_match_approx(tnfa, str, len, type, match, params, eflags); 370 } 371 372 int 373 tre_regaexec(const regex_t *preg, const char *str, 374 regamatch_t *match, regaparams_t params, int eflags) 375 { 376 return tre_reganexec(preg, str, (unsigned)-1, match, params, eflags); 377 } 378 379 #ifdef REG_USEBYTES 380 int 381 tre_regaexecb(const regex_t *preg, const char *str, 382 regamatch_t *match, regaparams_t params, int eflags) 383 { 384 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; 385 386 return tre_match_approx(tnfa, str, (unsigned)-1, STR_BYTE, 387 match, params, eflags); 388 } 389 #endif /* REG_USEBYTES */ 390 391 #ifdef TRE_WCHAR 392 393 int 394 tre_regawnexec(const regex_t *preg, const wchar_t *str, size_t len, 395 regamatch_t *match, regaparams_t params, int eflags) 396 { 397 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; 398 return tre_match_approx(tnfa, str, len, STR_WIDE, 399 match, params, eflags); 400 } 401 402 int 403 tre_regawexec(const regex_t *preg, const wchar_t *str, 404 regamatch_t *match, regaparams_t params, int eflags) 405 { 406 return tre_regawnexec(preg, str, (unsigned)-1, match, params, eflags); 407 } 408 409 #endif /* TRE_WCHAR */ 410 411 void 412 tre_regaparams_default(regaparams_t *params) 413 { 414 memset(params, 0, sizeof(*params)); 415 params->cost_ins = 1; 416 params->cost_del = 1; 417 params->cost_subst = 1; 418 params->max_cost = INT_MAX; 419 params->max_ins = INT_MAX; 420 params->max_del = INT_MAX; 421 params->max_subst = INT_MAX; 422 params->max_err = INT_MAX; 423 } 424 425 #endif /* TRE_APPROX */ 426 427 /* EOF */ 428