1 /* 2 tre-match-parallel.c - TRE parallel regex matching engine 3 4 This software is released under a BSD-style license. 5 See the file LICENSE for details and copyright. 6 7 */ 8 9 /* 10 This algorithm searches for matches basically by reading characters 11 in the searched string one by one, starting at the beginning. All 12 matching paths in the TNFA are traversed in parallel. When two or 13 more paths reach the same state, exactly one is chosen according to 14 tag ordering rules; if returning submatches is not required it does 15 not matter which path is chosen. 16 17 The worst case time required for finding the leftmost and longest 18 match, or determining that there is no match, is always linearly 19 dependent on the length of the text being searched. 20 21 This algorithm cannot handle TNFAs with back referencing nodes. 22 See `tre-match-backtrack.c'. 23 */ 24 25 26 #ifdef HAVE_CONFIG_H 27 #include <config.h> 28 #endif /* HAVE_CONFIG_H */ 29 30 #include "tre-alloca.h" 31 32 #include <assert.h> 33 #include <stdlib.h> 34 #include <string.h> 35 #ifdef HAVE_WCHAR_H 36 #include <wchar.h> 37 #endif /* HAVE_WCHAR_H */ 38 #ifdef HAVE_WCTYPE_H 39 #include <wctype.h> 40 #endif /* HAVE_WCTYPE_H */ 41 #ifndef TRE_WCHAR 42 #include <ctype.h> 43 #endif /* !TRE_WCHAR */ 44 #ifdef HAVE_MALLOC_H 45 #include <malloc.h> 46 #endif /* HAVE_MALLOC_H */ 47 48 #include "tre-internal.h" 49 #include "tre-match-utils.h" 50 #include "tre.h" 51 #include "xmalloc.h" 52 53 54 55 typedef struct { 56 tre_tnfa_transition_t *state; 57 int *tags; 58 } tre_tnfa_reach_t; 59 60 typedef struct { 61 int pos; 62 int **tags; 63 } tre_reach_pos_t; 64 65 66 #ifdef TRE_DEBUG 67 static void 68 tre_print_reach(const tre_tnfa_t *tnfa, tre_tnfa_reach_t *reach, int num_tags) 69 { 70 int i; 71 72 while (reach->state != NULL) 73 { 74 DPRINT((" %p", (void *)reach->state)); 75 if (num_tags > 0) 76 { 77 DPRINT(("/")); 78 for (i = 0; i < num_tags; i++) 79 { 80 DPRINT(("%d:%d", i, reach->tags[i])); 81 if (i < (num_tags-1)) 82 DPRINT((",")); 83 } 84 } 85 reach++; 86 } 87 DPRINT(("\n")); 88 89 } 90 #endif /* TRE_DEBUG */ 91 92 reg_errcode_t 93 tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len, 94 tre_str_type_t type, int *match_tags, int eflags, 95 int *match_end_ofs) 96 { 97 /* State variables required by GET_NEXT_WCHAR. */ 98 tre_char_t prev_c = 0, next_c = 0; 99 const char *str_byte = string; 100 int pos = -1; 101 unsigned int pos_add_next = 1; 102 #ifdef TRE_WCHAR 103 const wchar_t *str_wide = string; 104 #ifdef TRE_MBSTATE 105 mbstate_t mbstate; 106 #endif /* TRE_MBSTATE */ 107 #endif /* TRE_WCHAR */ 108 int reg_notbol = eflags & REG_NOTBOL; 109 int reg_noteol = eflags & REG_NOTEOL; 110 int reg_newline = tnfa->cflags & REG_NEWLINE; 111 size_t str_user_end = 0; 112 113 char *buf; 114 tre_tnfa_transition_t *trans_i; 115 tre_tnfa_reach_t *reach, *reach_next, *reach_i, *reach_next_i; 116 tre_reach_pos_t *reach_pos; 117 int *tag_i; 118 size_t num_tags, i; 119 120 int match_eo = -1; /* end offset of match (-1 if no match found yet) */ 121 int new_match = 0; 122 int *tmp_tags = NULL; 123 int *tmp_iptr; 124 125 #ifdef TRE_MBSTATE 126 memset(&mbstate, '\0', sizeof(mbstate)); 127 #endif /* TRE_MBSTATE */ 128 129 DPRINT(("tre_tnfa_run_parallel, input type %d\n", type)); 130 131 if (!match_tags) 132 num_tags = 0; 133 else 134 num_tags = tnfa->num_tags; 135 136 /* Allocate memory for temporary data required for matching. This needs to 137 be done for every matching operation to be thread safe. This allocates 138 everything in a single large block from the stack frame using alloca() 139 or with malloc() if alloca is unavailable. */ 140 { 141 size_t tbytes, rbytes, pbytes, xbytes, total_bytes; 142 char *tmp_buf; 143 /* Compute the length of the block we need. */ 144 tbytes = sizeof(*tmp_tags) * num_tags; 145 rbytes = sizeof(*reach_next) * (tnfa->num_states + 1); 146 pbytes = sizeof(*reach_pos) * tnfa->num_states; 147 xbytes = sizeof(int) * num_tags; 148 total_bytes = 149 (sizeof(long) - 1) * 4 /* for alignment paddings */ 150 + (rbytes + xbytes * tnfa->num_states) * 2 + tbytes + pbytes; 151 152 /* Allocate the memory. */ 153 #ifdef TRE_USE_ALLOCA 154 buf = alloca(total_bytes); 155 #else /* !TRE_USE_ALLOCA */ 156 buf = xmalloc((unsigned)total_bytes); 157 #endif /* !TRE_USE_ALLOCA */ 158 if (buf == NULL) 159 return REG_ESPACE; 160 memset(buf, 0, (size_t)total_bytes); 161 162 /* Get the various pointers within tmp_buf (properly aligned). */ 163 tmp_tags = (void *)buf; 164 tmp_buf = buf + tbytes; 165 tmp_buf += ALIGN(tmp_buf, long); 166 reach_next = (void *)tmp_buf; 167 tmp_buf += rbytes; 168 tmp_buf += ALIGN(tmp_buf, long); 169 reach = (void *)tmp_buf; 170 tmp_buf += rbytes; 171 tmp_buf += ALIGN(tmp_buf, long); 172 reach_pos = (void *)tmp_buf; 173 tmp_buf += pbytes; 174 tmp_buf += ALIGN(tmp_buf, long); 175 for (i = 0; i < tnfa->num_states; i++) 176 { 177 reach[i].tags = (void *)tmp_buf; 178 tmp_buf += xbytes; 179 reach_next[i].tags = (void *)tmp_buf; 180 tmp_buf += xbytes; 181 } 182 } 183 184 for (i = 0; i < tnfa->num_states; i++) 185 reach_pos[i].pos = -1; 186 187 /* If only one character can start a match, find it first. */ 188 if (tnfa->first_char >= 0 && type == STR_BYTE && str_byte) 189 { 190 const char *orig_str = str_byte; 191 int first = tnfa->first_char; 192 193 if (len >= 0) 194 str_byte = memchr(orig_str, first, (size_t)len); 195 else 196 str_byte = strchr(orig_str, first); 197 if (str_byte == NULL) 198 { 199 #ifndef TRE_USE_ALLOCA 200 if (buf) 201 xfree(buf); 202 #endif /* !TRE_USE_ALLOCA */ 203 return REG_NOMATCH; 204 } 205 DPRINT(("skipped %lu chars\n", (unsigned long)(str_byte - orig_str))); 206 if (str_byte >= orig_str + 1) 207 prev_c = (unsigned char)*(str_byte - 1); 208 next_c = (unsigned char)*str_byte; 209 pos = (int)(str_byte - orig_str); 210 if (len < 0 || pos < len) 211 str_byte++; 212 } 213 else 214 { 215 GET_NEXT_WCHAR(); 216 pos = 0; 217 } 218 219 #if 0 220 /* Skip over characters that cannot possibly be the first character 221 of a match. */ 222 if (tnfa->firstpos_chars != NULL) 223 { 224 char *chars = tnfa->firstpos_chars; 225 226 if (len < 0) 227 { 228 const char *orig_str = str_byte; 229 /* XXX - use strpbrk() and wcspbrk() because they might be 230 optimized for the target architecture. Try also strcspn() 231 and wcscspn() and compare the speeds. */ 232 while (next_c != L'\0' && !chars[next_c]) 233 { 234 next_c = *str_byte++; 235 } 236 prev_c = *(str_byte - 2); 237 pos += str_byte - orig_str; 238 DPRINT(("skipped %d chars\n", str_byte - orig_str)); 239 } 240 else 241 { 242 while (pos <= len && !chars[next_c]) 243 { 244 prev_c = next_c; 245 next_c = (unsigned char)(*str_byte++); 246 pos++; 247 } 248 } 249 } 250 #endif 251 252 DPRINT(("length: %d\n", len)); 253 DPRINT(("pos:chr/code | states and tags\n")); 254 DPRINT(("-------------+------------------------------------------------\n")); 255 256 reach_next_i = reach_next; 257 while (/*CONSTCOND*/1) 258 { 259 /* If no match found yet, add the initial states to `reach_next'. */ 260 if (match_eo < 0) 261 { 262 DPRINT((" init >")); 263 trans_i = tnfa->initial; 264 while (trans_i->state != NULL) 265 { 266 if (reach_pos[trans_i->state_id].pos < pos) 267 { 268 if (trans_i->assertions 269 && CHECK_ASSERTIONS(trans_i->assertions)) 270 { 271 DPRINT(("assertion failed\n")); 272 trans_i++; 273 continue; 274 } 275 276 DPRINT((" %p", (void *)trans_i->state)); 277 reach_next_i->state = trans_i->state; 278 for (i = 0; i < num_tags; i++) 279 reach_next_i->tags[i] = -1; 280 tag_i = trans_i->tags; 281 if (tag_i) 282 while (*tag_i >= 0) 283 { 284 if ((size_t)*tag_i < num_tags) 285 reach_next_i->tags[*tag_i] = pos; 286 tag_i++; 287 } 288 if (reach_next_i->state == tnfa->final) 289 { 290 DPRINT((" found empty match\n")); 291 match_eo = pos; 292 new_match = 1; 293 for (i = 0; i < num_tags; i++) 294 match_tags[i] = reach_next_i->tags[i]; 295 } 296 reach_pos[trans_i->state_id].pos = pos; 297 reach_pos[trans_i->state_id].tags = &reach_next_i->tags; 298 reach_next_i++; 299 } 300 trans_i++; 301 } 302 DPRINT(("\n")); 303 reach_next_i->state = NULL; 304 } 305 else 306 { 307 if (num_tags == 0 || reach_next_i == reach_next) 308 /*�We have found a match. */ 309 break; 310 } 311 312 /* Check for end of string. */ 313 if (len < 0) 314 { 315 if (type == STR_USER) 316 { 317 if (str_user_end) 318 break; 319 } 320 else if (next_c == L'\0') 321 break; 322 } 323 else 324 { 325 if (pos >= len) 326 break; 327 } 328 329 GET_NEXT_WCHAR(); 330 331 #ifdef TRE_DEBUG 332 DPRINT(("%3d:%2lc/%05d |", pos - 1, (tre_cint_t)prev_c, (int)prev_c)); 333 tre_print_reach(tnfa, reach_next, num_tags); 334 DPRINT(("%3d:%2lc/%05d |", pos, (tre_cint_t)next_c, (int)next_c)); 335 tre_print_reach(tnfa, reach_next, num_tags); 336 #endif /* TRE_DEBUG */ 337 338 /* Swap `reach' and `reach_next'. */ 339 reach_i = reach; 340 reach = reach_next; 341 reach_next = reach_i; 342 343 /* For each state in `reach', weed out states that don't fulfill the 344 minimal matching conditions. */ 345 if (tnfa->num_minimals && new_match) 346 { 347 new_match = 0; 348 reach_next_i = reach_next; 349 for (reach_i = reach; reach_i->state; reach_i++) 350 { 351 int skip = 0; 352 for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2) 353 { 354 int end = tnfa->minimal_tags[i]; 355 int start = tnfa->minimal_tags[i + 1]; 356 DPRINT((" Minimal start %d, end %d\n", start, end)); 357 if ((size_t)end >= num_tags) 358 { 359 DPRINT((" Throwing %p out.\n", reach_i->state)); 360 skip = 1; 361 break; 362 } 363 else if (reach_i->tags[start] == match_tags[start] 364 && reach_i->tags[end] < match_tags[end]) 365 { 366 DPRINT((" Throwing %p out because t%d < %d\n", 367 reach_i->state, end, match_tags[end])); 368 skip = 1; 369 break; 370 } 371 } 372 if (!skip) 373 { 374 reach_next_i->state = reach_i->state; 375 tmp_iptr = reach_next_i->tags; 376 reach_next_i->tags = reach_i->tags; 377 reach_i->tags = tmp_iptr; 378 reach_next_i++; 379 } 380 } 381 reach_next_i->state = NULL; 382 383 /* Swap `reach' and `reach_next'. */ 384 reach_i = reach; 385 reach = reach_next; 386 reach_next = reach_i; 387 } 388 389 /* For each state in `reach' see if there is a transition leaving with 390 the current input symbol to a state not yet in `reach_next', and 391 add the destination states to `reach_next'. */ 392 reach_next_i = reach_next; 393 for (reach_i = reach; reach_i->state; reach_i++) 394 { 395 for (trans_i = reach_i->state; trans_i->state; trans_i++) 396 { 397 /* Does this transition match the input symbol? */ 398 if (trans_i->code_min <= (tre_cint_t)prev_c && 399 trans_i->code_max >= (tre_cint_t)prev_c) 400 { 401 if (trans_i->assertions 402 && (CHECK_ASSERTIONS(trans_i->assertions) 403 || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags))) 404 { 405 DPRINT(("assertion failed\n")); 406 continue; 407 } 408 409 /* Compute the tags after this transition. */ 410 for (i = 0; i < num_tags; i++) 411 tmp_tags[i] = reach_i->tags[i]; 412 tag_i = trans_i->tags; 413 if (tag_i != NULL) 414 while (*tag_i >= 0) 415 { 416 if ((size_t)*tag_i < num_tags) 417 tmp_tags[*tag_i] = pos; 418 tag_i++; 419 } 420 421 if (reach_pos[trans_i->state_id].pos < pos) 422 { 423 /* Found an unvisited node. */ 424 reach_next_i->state = trans_i->state; 425 tmp_iptr = reach_next_i->tags; 426 reach_next_i->tags = tmp_tags; 427 tmp_tags = tmp_iptr; 428 reach_pos[trans_i->state_id].pos = pos; 429 reach_pos[trans_i->state_id].tags = &reach_next_i->tags; 430 431 if (reach_next_i->state == tnfa->final 432 && (match_eo == -1 433 || (num_tags > 0 434 && reach_next_i->tags[0] <= match_tags[0]))) 435 { 436 DPRINT((" found match %p\n", trans_i->state)); 437 match_eo = pos; 438 new_match = 1; 439 for (i = 0; i < num_tags; i++) 440 match_tags[i] = reach_next_i->tags[i]; 441 } 442 reach_next_i++; 443 444 } 445 else 446 { 447 assert(reach_pos[trans_i->state_id].pos == pos); 448 /* Another path has also reached this state. We choose 449 the winner by examining the tag values for both 450 paths. */ 451 if (tre_tag_order(num_tags, tnfa->tag_directions, 452 tmp_tags, 453 *reach_pos[trans_i->state_id].tags)) 454 { 455 /* The new path wins. */ 456 tmp_iptr = *reach_pos[trans_i->state_id].tags; 457 *reach_pos[trans_i->state_id].tags = tmp_tags; 458 if (trans_i->state == tnfa->final) 459 { 460 DPRINT((" found better match\n")); 461 match_eo = pos; 462 new_match = 1; 463 for (i = 0; i < num_tags; i++) 464 match_tags[i] = tmp_tags[i]; 465 } 466 tmp_tags = tmp_iptr; 467 } 468 } 469 } 470 } 471 } 472 reach_next_i->state = NULL; 473 } 474 475 DPRINT(("match end offset = %d\n", match_eo)); 476 477 #ifndef TRE_USE_ALLOCA 478 if (buf) 479 xfree(buf); 480 #endif /* !TRE_USE_ALLOCA */ 481 482 *match_end_ofs = match_eo; 483 return match_eo >= 0 ? REG_OK : REG_NOMATCH; 484 } 485 486 /* EOF */ 487