1 /* 2 tre-match-parallel.c - TRE parallel regex matching engine 3 4 This software is released under a BSD-style license. 5 See the file LICENSE for details and copyright. 6 7 */ 8 9 /* 10 This algorithm searches for matches basically by reading characters 11 in the searched string one by one, starting at the beginning. All 12 matching paths in the TNFA are traversed in parallel. When two or 13 more paths reach the same state, exactly one is chosen according to 14 tag ordering rules; if returning submatches is not required it does 15 not matter which path is chosen. 16 17 The worst case time required for finding the leftmost and longest 18 match, or determining that there is no match, is always linearly 19 dependent on the length of the text being searched. 20 21 This algorithm cannot handle TNFAs with back referencing nodes. 22 See `tre-match-backtrack.c'. 23 */ 24 25 26 #ifdef HAVE_CONFIG_H 27 #include <config.h> 28 #endif /* HAVE_CONFIG_H */ 29 30 #include "tre-alloca.h" 31 32 #include <assert.h> 33 #include <stdlib.h> 34 #include <string.h> 35 #ifdef HAVE_WCHAR_H 36 #include <wchar.h> 37 #endif /* HAVE_WCHAR_H */ 38 #ifdef HAVE_WCTYPE_H 39 #include <wctype.h> 40 #endif /* HAVE_WCTYPE_H */ 41 #ifndef TRE_WCHAR 42 #include <ctype.h> 43 #endif /* !TRE_WCHAR */ 44 #ifdef HAVE_MALLOC_H 45 #include <malloc.h> 46 #endif /* HAVE_MALLOC_H */ 47 #include <stdint.h> 48 49 #include "tre-internal.h" 50 #include "tre-match-utils.h" 51 #include "tre.h" 52 #include "xmalloc.h" 53 54 55 56 typedef struct { 57 tre_tnfa_transition_t *state; 58 int *tags; 59 } tre_tnfa_reach_t; 60 61 typedef struct { 62 int pos; 63 int **tags; 64 } tre_reach_pos_t; 65 66 67 #ifdef TRE_DEBUG 68 static void 69 tre_print_reach(const tre_tnfa_t *tnfa, tre_tnfa_reach_t *reach, int num_tags) 70 { 71 int i; 72 73 while (reach->state != NULL) 74 { 75 DPRINT((" %p", (void *)reach->state)); 76 if (num_tags > 0) 77 { 78 DPRINT(("/")); 79 for (i = 0; i < num_tags; i++) 80 { 81 DPRINT(("%d:%d", i, reach->tags[i])); 82 if (i < (num_tags-1)) 83 DPRINT((",")); 84 } 85 } 86 reach++; 87 } 88 DPRINT(("\n")); 89 90 } 91 #endif /* TRE_DEBUG */ 92 93 reg_errcode_t 94 tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len, 95 tre_str_type_t type, int *match_tags, int eflags, 96 int *match_end_ofs) 97 { 98 /* State variables required by GET_NEXT_WCHAR. */ 99 tre_char_t prev_c = 0, next_c = 0; 100 const char *str_byte = string; 101 int pos = -1; 102 unsigned int pos_add_next = 1; 103 #ifdef TRE_WCHAR 104 const wchar_t *str_wide = string; 105 #ifdef TRE_MBSTATE 106 mbstate_t mbstate; 107 #endif /* TRE_MBSTATE */ 108 #endif /* TRE_WCHAR */ 109 int reg_notbol = eflags & REG_NOTBOL; 110 int reg_noteol = eflags & REG_NOTEOL; 111 int reg_newline = tnfa->cflags & REG_NEWLINE; 112 size_t str_user_end = 0; 113 114 char *buf; 115 tre_tnfa_transition_t *trans_i; 116 tre_tnfa_reach_t *reach, *reach_next, *reach_i, *reach_next_i; 117 tre_reach_pos_t *reach_pos; 118 int *tag_i; 119 size_t num_tags, i; 120 121 int match_eo = -1; /* end offset of match (-1 if no match found yet) */ 122 int new_match = 0; 123 int *tmp_tags = NULL; 124 int *tmp_iptr; 125 reg_errcode_t ret; 126 127 #ifdef TRE_MBSTATE 128 memset(&mbstate, '\0', sizeof(mbstate)); 129 #endif /* TRE_MBSTATE */ 130 131 DPRINT(("tre_tnfa_run_parallel, input type %d\n", type)); 132 133 if (!match_tags) 134 num_tags = 0; 135 else 136 num_tags = tnfa->num_tags; 137 138 /* Allocate memory for temporary data required for matching. This needs to 139 be done for every matching operation to be thread safe. This allocates 140 everything in a single large block from the stack frame using alloca() 141 or with malloc() if alloca is unavailable. */ 142 { 143 size_t tbytes, rbytes, pbytes, xbytes, total_bytes; 144 char *tmp_buf; 145 146 /* Ensure that tbytes and xbytes*num_states cannot overflow, and that 147 * they don't contribute more than 1/8 of SIZE_MAX to total_bytes. */ 148 if (num_tags > SIZE_MAX/(8 * sizeof(int) * tnfa->num_states)) 149 return REG_ESPACE; 150 151 /* Likewise check rbytes. */ 152 if (tnfa->num_states+1 > SIZE_MAX/(8 * sizeof(*reach_next))) 153 return REG_ESPACE; 154 155 /* Likewise check pbytes. */ 156 if (tnfa->num_states > SIZE_MAX/(8 * sizeof(*reach_pos))) 157 return REG_ESPACE; 158 159 /* Compute the length of the block we need. */ 160 tbytes = sizeof(*tmp_tags) * num_tags; 161 rbytes = sizeof(*reach_next) * (tnfa->num_states + 1); 162 pbytes = sizeof(*reach_pos) * tnfa->num_states; 163 xbytes = sizeof(int) * num_tags; 164 total_bytes = 165 (sizeof(long) - 1) * 4 /* for alignment paddings */ 166 + (rbytes + xbytes * tnfa->num_states) * 2 + tbytes + pbytes; 167 168 /* Allocate the memory. */ 169 #ifdef TRE_USE_ALLOCA 170 buf = alloca(total_bytes); 171 #else /* !TRE_USE_ALLOCA */ 172 buf = xmalloc((unsigned)total_bytes); 173 #endif /* !TRE_USE_ALLOCA */ 174 if (buf == NULL) 175 return REG_ESPACE; 176 memset(buf, 0, (size_t)total_bytes); 177 178 /* Get the various pointers within tmp_buf (properly aligned). */ 179 tmp_tags = (void *)buf; 180 tmp_buf = buf + tbytes; 181 tmp_buf += ALIGN(tmp_buf, long); 182 reach_next = (void *)tmp_buf; 183 tmp_buf += rbytes; 184 tmp_buf += ALIGN(tmp_buf, long); 185 reach = (void *)tmp_buf; 186 tmp_buf += rbytes; 187 tmp_buf += ALIGN(tmp_buf, long); 188 reach_pos = (void *)tmp_buf; 189 tmp_buf += pbytes; 190 tmp_buf += ALIGN(tmp_buf, long); 191 for (i = 0; i < tnfa->num_states; i++) 192 { 193 reach[i].tags = (void *)tmp_buf; 194 tmp_buf += xbytes; 195 reach_next[i].tags = (void *)tmp_buf; 196 tmp_buf += xbytes; 197 } 198 } 199 200 for (i = 0; i < tnfa->num_states; i++) 201 reach_pos[i].pos = -1; 202 203 /* If only one character can start a match, find it first. */ 204 if (tnfa->first_char >= 0 && type == STR_BYTE && str_byte) 205 { 206 const char *orig_str = str_byte; 207 int first = tnfa->first_char; 208 209 if (len >= 0) 210 str_byte = memchr(orig_str, first, (size_t)len); 211 else 212 str_byte = strchr(orig_str, first); 213 if (str_byte == NULL) 214 { 215 #ifndef TRE_USE_ALLOCA 216 if (buf) 217 xfree(buf); 218 #endif /* !TRE_USE_ALLOCA */ 219 return REG_NOMATCH; 220 } 221 DPRINT(("skipped %lu chars\n", (unsigned long)(str_byte - orig_str))); 222 if (str_byte >= orig_str + 1) 223 prev_c = (unsigned char)*(str_byte - 1); 224 next_c = (unsigned char)*str_byte; 225 pos = (int)(str_byte - orig_str); 226 if (len < 0 || pos < len) 227 str_byte++; 228 } 229 else 230 { 231 GET_NEXT_WCHAR(); 232 pos = 0; 233 } 234 235 #if 0 236 /* Skip over characters that cannot possibly be the first character 237 of a match. */ 238 if (tnfa->firstpos_chars != NULL) 239 { 240 char *chars = tnfa->firstpos_chars; 241 242 if (len < 0) 243 { 244 const char *orig_str = str_byte; 245 /* XXX - use strpbrk() and wcspbrk() because they might be 246 optimized for the target architecture. Try also strcspn() 247 and wcscspn() and compare the speeds. */ 248 while (next_c != L'\0' && !chars[next_c]) 249 { 250 next_c = *str_byte++; 251 } 252 prev_c = *(str_byte - 2); 253 pos += str_byte - orig_str; 254 DPRINT(("skipped %d chars\n", str_byte - orig_str)); 255 } 256 else 257 { 258 while (pos <= len && !chars[next_c]) 259 { 260 prev_c = next_c; 261 next_c = (unsigned char)(*str_byte++); 262 pos++; 263 } 264 } 265 } 266 #endif 267 268 DPRINT(("length: %d\n", len)); 269 DPRINT(("pos:chr/code | states and tags\n")); 270 DPRINT(("-------------+------------------------------------------------\n")); 271 272 reach_next_i = reach_next; 273 while (/*CONSTCOND*/(void)1,1) 274 { 275 /* If no match found yet, add the initial states to `reach_next'. */ 276 if (match_eo < 0) 277 { 278 DPRINT((" init >")); 279 trans_i = tnfa->initial; 280 while (trans_i->state != NULL) 281 { 282 if (reach_pos[trans_i->state_id].pos < pos) 283 { 284 if (trans_i->assertions 285 && CHECK_ASSERTIONS(trans_i->assertions)) 286 { 287 DPRINT(("assertion failed\n")); 288 trans_i++; 289 continue; 290 } 291 292 DPRINT((" %p", (void *)trans_i->state)); 293 reach_next_i->state = trans_i->state; 294 for (i = 0; i < num_tags; i++) 295 reach_next_i->tags[i] = -1; 296 tag_i = trans_i->tags; 297 if (tag_i) 298 while (*tag_i >= 0) 299 { 300 if ((size_t)*tag_i < num_tags) 301 reach_next_i->tags[*tag_i] = pos; 302 tag_i++; 303 } 304 if (reach_next_i->state == tnfa->final) 305 { 306 DPRINT((" found empty match\n")); 307 match_eo = pos; 308 new_match = 1; 309 for (i = 0; i < num_tags; i++) 310 match_tags[i] = reach_next_i->tags[i]; 311 } 312 reach_pos[trans_i->state_id].pos = pos; 313 reach_pos[trans_i->state_id].tags = &reach_next_i->tags; 314 reach_next_i++; 315 } 316 trans_i++; 317 } 318 DPRINT(("\n")); 319 reach_next_i->state = NULL; 320 } 321 else 322 { 323 if (num_tags == 0 || reach_next_i == reach_next) 324 /* We have found a match. */ 325 break; 326 } 327 328 /* Check for end of string. */ 329 if (len < 0) 330 { 331 if (type == STR_USER) 332 { 333 if (str_user_end) 334 break; 335 } 336 else if (next_c == L'\0') 337 break; 338 } 339 else 340 { 341 if (pos >= len) 342 break; 343 } 344 345 GET_NEXT_WCHAR(); 346 347 #ifdef TRE_DEBUG 348 DPRINT(("%3d:%2lc/%05d |", pos - 1, (tre_cint_t)prev_c, (int)prev_c)); 349 tre_print_reach(tnfa, reach_next, num_tags); 350 DPRINT(("%3d:%2lc/%05d |", pos, (tre_cint_t)next_c, (int)next_c)); 351 tre_print_reach(tnfa, reach_next, num_tags); 352 #endif /* TRE_DEBUG */ 353 354 /* Swap `reach' and `reach_next'. */ 355 reach_i = reach; 356 reach = reach_next; 357 reach_next = reach_i; 358 359 /* For each state in `reach', weed out states that don't fulfill the 360 minimal matching conditions. */ 361 if (tnfa->num_minimals && new_match) 362 { 363 new_match = 0; 364 reach_next_i = reach_next; 365 for (reach_i = reach; reach_i->state; reach_i++) 366 { 367 int skip = 0; 368 for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2) 369 { 370 int end = tnfa->minimal_tags[i]; 371 int start = tnfa->minimal_tags[i + 1]; 372 DPRINT((" Minimal start %d, end %d\n", start, end)); 373 if ((size_t)end >= num_tags) 374 { 375 DPRINT((" Throwing %p out.\n", reach_i->state)); 376 skip = 1; 377 break; 378 } 379 else if (reach_i->tags[start] == match_tags[start] 380 && reach_i->tags[end] < match_tags[end]) 381 { 382 DPRINT((" Throwing %p out because t%d < %d\n", 383 reach_i->state, end, match_tags[end])); 384 skip = 1; 385 break; 386 } 387 } 388 if (!skip) 389 { 390 reach_next_i->state = reach_i->state; 391 tmp_iptr = reach_next_i->tags; 392 reach_next_i->tags = reach_i->tags; 393 reach_i->tags = tmp_iptr; 394 reach_next_i++; 395 } 396 } 397 reach_next_i->state = NULL; 398 399 /* Swap `reach' and `reach_next'. */ 400 reach_i = reach; 401 reach = reach_next; 402 reach_next = reach_i; 403 } 404 405 /* For each state in `reach' see if there is a transition leaving with 406 the current input symbol to a state not yet in `reach_next', and 407 add the destination states to `reach_next'. */ 408 reach_next_i = reach_next; 409 for (reach_i = reach; reach_i->state; reach_i++) 410 { 411 for (trans_i = reach_i->state; trans_i->state; trans_i++) 412 { 413 /* Does this transition match the input symbol? */ 414 if (trans_i->code_min <= (tre_cint_t)prev_c && 415 trans_i->code_max >= (tre_cint_t)prev_c) 416 { 417 if (trans_i->assertions 418 && (CHECK_ASSERTIONS(trans_i->assertions) 419 || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags))) 420 { 421 DPRINT(("assertion failed\n")); 422 continue; 423 } 424 425 /* Compute the tags after this transition. */ 426 for (i = 0; i < num_tags; i++) 427 tmp_tags[i] = reach_i->tags[i]; 428 tag_i = trans_i->tags; 429 if (tag_i != NULL) 430 while (*tag_i >= 0) 431 { 432 if ((size_t)*tag_i < num_tags) 433 tmp_tags[*tag_i] = pos; 434 tag_i++; 435 } 436 437 if (reach_pos[trans_i->state_id].pos < pos) 438 { 439 /* Found an unvisited node. */ 440 reach_next_i->state = trans_i->state; 441 tmp_iptr = reach_next_i->tags; 442 reach_next_i->tags = tmp_tags; 443 tmp_tags = tmp_iptr; 444 reach_pos[trans_i->state_id].pos = pos; 445 reach_pos[trans_i->state_id].tags = &reach_next_i->tags; 446 447 if (reach_next_i->state == tnfa->final 448 && (match_eo == -1 449 || (num_tags > 0 450 && reach_next_i->tags[0] <= match_tags[0]))) 451 { 452 DPRINT((" found match %p\n", trans_i->state)); 453 match_eo = pos; 454 new_match = 1; 455 for (i = 0; i < num_tags; i++) 456 match_tags[i] = reach_next_i->tags[i]; 457 } 458 reach_next_i++; 459 460 } 461 else 462 { 463 assert(reach_pos[trans_i->state_id].pos == pos); 464 /* Another path has also reached this state. We choose 465 the winner by examining the tag values for both 466 paths. */ 467 if (tre_tag_order(num_tags, tnfa->tag_directions, 468 tmp_tags, 469 *reach_pos[trans_i->state_id].tags)) 470 { 471 /* The new path wins. */ 472 tmp_iptr = *reach_pos[trans_i->state_id].tags; 473 *reach_pos[trans_i->state_id].tags = tmp_tags; 474 if (trans_i->state == tnfa->final) 475 { 476 DPRINT((" found better match\n")); 477 match_eo = pos; 478 new_match = 1; 479 for (i = 0; i < num_tags; i++) 480 match_tags[i] = tmp_tags[i]; 481 } 482 tmp_tags = tmp_iptr; 483 } 484 } 485 } 486 } 487 } 488 reach_next_i->state = NULL; 489 } 490 491 DPRINT(("match end offset = %d\n", match_eo)); 492 493 *match_end_ofs = match_eo; 494 ret = match_eo >= 0 ? REG_OK : REG_NOMATCH; 495 error_exit: 496 #ifndef TRE_USE_ALLOCA 497 if (buf) 498 xfree(buf); 499 #endif /* !TRE_USE_ALLOCA */ 500 return ret; 501 } 502 503 /* EOF */ 504