1 /* 2 tre-match-parallel.c - TRE parallel regex matching engine 3 4 This software is released under a BSD-style license. 5 See the file LICENSE for details and copyright. 6 7 */ 8 9 /* 10 This algorithm searches for matches basically by reading characters 11 in the searched string one by one, starting at the beginning. All 12 matching paths in the TNFA are traversed in parallel. When two or 13 more paths reach the same state, exactly one is chosen according to 14 tag ordering rules; if returning submatches is not required it does 15 not matter which path is chosen. 16 17 The worst case time required for finding the leftmost and longest 18 match, or determining that there is no match, is always linearly 19 dependent on the length of the text being searched. 20 21 This algorithm cannot handle TNFAs with back referencing nodes. 22 See `tre-match-backtrack.c'. 23 */ 24 25 26 #ifdef HAVE_CONFIG_H 27 #include <config.h> 28 #endif /* HAVE_CONFIG_H */ 29 30 #ifdef TRE_USE_ALLOCA 31 /* AIX requires this to be the first thing in the file. */ 32 #ifndef __GNUC__ 33 # if HAVE_ALLOCA_H 34 # include <alloca.h> 35 # else 36 # ifdef _AIX 37 #pragma alloca 38 # else 39 # ifndef alloca /* predefined by HP cc +Olibcalls */ 40 char *alloca (); 41 # endif 42 # endif 43 # endif 44 #endif 45 #endif /* TRE_USE_ALLOCA */ 46 47 #include <assert.h> 48 #include <stdlib.h> 49 #include <string.h> 50 #ifdef HAVE_WCHAR_H 51 #include <wchar.h> 52 #endif /* HAVE_WCHAR_H */ 53 #ifdef HAVE_WCTYPE_H 54 #include <wctype.h> 55 #endif /* HAVE_WCTYPE_H */ 56 #ifndef TRE_WCHAR 57 #include <ctype.h> 58 #endif /* !TRE_WCHAR */ 59 #ifdef HAVE_MALLOC_H 60 #include <malloc.h> 61 #endif /* HAVE_MALLOC_H */ 62 63 #include "tre-internal.h" 64 #include "tre-match-utils.h" 65 #include "tre.h" 66 #include "xmalloc.h" 67 68 69 70 typedef struct { 71 tre_tnfa_transition_t *state; 72 int *tags; 73 } tre_tnfa_reach_t; 74 75 typedef struct { 76 int pos; 77 int **tags; 78 } tre_reach_pos_t; 79 80 81 #ifdef TRE_DEBUG 82 static void 83 tre_print_reach(const tre_tnfa_t *tnfa, tre_tnfa_reach_t *reach, int num_tags) 84 { 85 int i; 86 87 while (reach->state != NULL) 88 { 89 DPRINT((" %p", (void *)reach->state)); 90 if (num_tags > 0) 91 { 92 DPRINT(("/")); 93 for (i = 0; i < num_tags; i++) 94 { 95 DPRINT(("%d:%d", i, reach->tags[i])); 96 if (i < (num_tags-1)) 97 DPRINT((",")); 98 } 99 } 100 reach++; 101 } 102 DPRINT(("\n")); 103 104 } 105 #endif /* TRE_DEBUG */ 106 107 reg_errcode_t 108 tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len, 109 tre_str_type_t type, int *match_tags, int eflags, 110 int *match_end_ofs) 111 { 112 /* State variables required by GET_NEXT_WCHAR. */ 113 tre_char_t prev_c = 0, next_c = 0; 114 const char *str_byte = string; 115 int pos = -1; 116 unsigned int pos_add_next = 1; 117 #ifdef TRE_WCHAR 118 const wchar_t *str_wide = string; 119 #ifdef TRE_MBSTATE 120 mbstate_t mbstate; 121 #endif /* TRE_MBSTATE */ 122 #endif /* TRE_WCHAR */ 123 int reg_notbol = eflags & REG_NOTBOL; 124 int reg_noteol = eflags & REG_NOTEOL; 125 int reg_newline = tnfa->cflags & REG_NEWLINE; 126 int str_user_end = 0; 127 128 char *buf; 129 tre_tnfa_transition_t *trans_i; 130 tre_tnfa_reach_t *reach, *reach_next, *reach_i, *reach_next_i; 131 tre_reach_pos_t *reach_pos; 132 int *tag_i; 133 int num_tags, i; 134 135 int match_eo = -1; /* end offset of match (-1 if no match found yet) */ 136 int new_match = 0; 137 int *tmp_tags = NULL; 138 int *tmp_iptr; 139 140 #ifdef TRE_MBSTATE 141 memset(&mbstate, '\0', sizeof(mbstate)); 142 #endif /* TRE_MBSTATE */ 143 144 DPRINT(("tre_tnfa_run_parallel, input type %d\n", type)); 145 146 if (!match_tags) 147 num_tags = 0; 148 else 149 num_tags = tnfa->num_tags; 150 151 /* Allocate memory for temporary data required for matching. This needs to 152 be done for every matching operation to be thread safe. This allocates 153 everything in a single large block from the stack frame using alloca() 154 or with malloc() if alloca is unavailable. */ 155 { 156 int tbytes, rbytes, pbytes, xbytes, total_bytes; 157 char *tmp_buf; 158 /* Compute the length of the block we need. */ 159 tbytes = sizeof(*tmp_tags) * num_tags; 160 rbytes = sizeof(*reach_next) * (tnfa->num_states + 1); 161 pbytes = sizeof(*reach_pos) * tnfa->num_states; 162 xbytes = sizeof(int) * num_tags; 163 total_bytes = 164 (sizeof(long) - 1) * 4 /* for alignment paddings */ 165 + (rbytes + xbytes * tnfa->num_states) * 2 + tbytes + pbytes; 166 167 /* Allocate the memory. */ 168 #ifdef TRE_USE_ALLOCA 169 buf = alloca(total_bytes); 170 #else /* !TRE_USE_ALLOCA */ 171 buf = xmalloc((unsigned)total_bytes); 172 #endif /* !TRE_USE_ALLOCA */ 173 if (buf == NULL) 174 return REG_ESPACE; 175 memset(buf, 0, (size_t)total_bytes); 176 177 /* Get the various pointers within tmp_buf (properly aligned). */ 178 tmp_tags = (void *)buf; 179 tmp_buf = buf + tbytes; 180 tmp_buf += ALIGN(tmp_buf, long); 181 reach_next = (void *)tmp_buf; 182 tmp_buf += rbytes; 183 tmp_buf += ALIGN(tmp_buf, long); 184 reach = (void *)tmp_buf; 185 tmp_buf += rbytes; 186 tmp_buf += ALIGN(tmp_buf, long); 187 reach_pos = (void *)tmp_buf; 188 tmp_buf += pbytes; 189 tmp_buf += ALIGN(tmp_buf, long); 190 for (i = 0; i < tnfa->num_states; i++) 191 { 192 reach[i].tags = (void *)tmp_buf; 193 tmp_buf += xbytes; 194 reach_next[i].tags = (void *)tmp_buf; 195 tmp_buf += xbytes; 196 } 197 } 198 199 for (i = 0; i < tnfa->num_states; i++) 200 reach_pos[i].pos = -1; 201 202 /* If only one character can start a match, find it first. */ 203 if (tnfa->first_char >= 0 && type == STR_BYTE && str_byte) 204 { 205 const char *orig_str = str_byte; 206 int first = tnfa->first_char; 207 208 if (len >= 0) 209 str_byte = memchr(orig_str, first, (size_t)len); 210 else 211 str_byte = strchr(orig_str, first); 212 if (str_byte == NULL) 213 { 214 #ifndef TRE_USE_ALLOCA 215 if (buf) 216 xfree(buf); 217 #endif /* !TRE_USE_ALLOCA */ 218 return REG_NOMATCH; 219 } 220 DPRINT(("skipped %lu chars\n", (unsigned long)(str_byte - orig_str))); 221 if (str_byte >= orig_str + 1) 222 prev_c = (unsigned char)*(str_byte - 1); 223 next_c = (unsigned char)*str_byte; 224 pos = str_byte - orig_str; 225 if (len < 0 || pos < len) 226 str_byte++; 227 } 228 else 229 { 230 GET_NEXT_WCHAR(); 231 pos = 0; 232 } 233 234 #if 0 235 /* Skip over characters that cannot possibly be the first character 236 of a match. */ 237 if (tnfa->firstpos_chars != NULL) 238 { 239 char *chars = tnfa->firstpos_chars; 240 241 if (len < 0) 242 { 243 const char *orig_str = str_byte; 244 /* XXX - use strpbrk() and wcspbrk() because they might be 245 optimized for the target architecture. Try also strcspn() 246 and wcscspn() and compare the speeds. */ 247 while (next_c != L'\0' && !chars[next_c]) 248 { 249 next_c = *str_byte++; 250 } 251 prev_c = *(str_byte - 2); 252 pos += str_byte - orig_str; 253 DPRINT(("skipped %d chars\n", str_byte - orig_str)); 254 } 255 else 256 { 257 while (pos <= len && !chars[next_c]) 258 { 259 prev_c = next_c; 260 next_c = (unsigned char)(*str_byte++); 261 pos++; 262 } 263 } 264 } 265 #endif 266 267 DPRINT(("length: %d\n", len)); 268 DPRINT(("pos:chr/code | states and tags\n")); 269 DPRINT(("-------------+------------------------------------------------\n")); 270 271 reach_next_i = reach_next; 272 while (/*CONSTCOND*/1) 273 { 274 /* If no match found yet, add the initial states to `reach_next'. */ 275 if (match_eo < 0) 276 { 277 DPRINT((" init >")); 278 trans_i = tnfa->initial; 279 while (trans_i->state != NULL) 280 { 281 if (reach_pos[trans_i->state_id].pos < pos) 282 { 283 if (trans_i->assertions 284 && CHECK_ASSERTIONS(trans_i->assertions)) 285 { 286 DPRINT(("assertion failed\n")); 287 trans_i++; 288 continue; 289 } 290 291 DPRINT((" %p", (void *)trans_i->state)); 292 reach_next_i->state = trans_i->state; 293 for (i = 0; i < num_tags; i++) 294 reach_next_i->tags[i] = -1; 295 tag_i = trans_i->tags; 296 if (tag_i) 297 while (*tag_i >= 0) 298 { 299 if (*tag_i < num_tags) 300 reach_next_i->tags[*tag_i] = pos; 301 tag_i++; 302 } 303 if (reach_next_i->state == tnfa->final) 304 { 305 DPRINT((" found empty match\n")); 306 match_eo = pos; 307 new_match = 1; 308 for (i = 0; i < num_tags; i++) 309 match_tags[i] = reach_next_i->tags[i]; 310 } 311 reach_pos[trans_i->state_id].pos = pos; 312 reach_pos[trans_i->state_id].tags = &reach_next_i->tags; 313 reach_next_i++; 314 } 315 trans_i++; 316 } 317 DPRINT(("\n")); 318 reach_next_i->state = NULL; 319 } 320 else 321 { 322 if (num_tags == 0 || reach_next_i == reach_next) 323 /*�We have found a match. */ 324 break; 325 } 326 327 /* Check for end of string. */ 328 if (len < 0) 329 { 330 if (type == STR_USER) 331 { 332 if (str_user_end) 333 break; 334 } 335 else if (next_c == L'\0') 336 break; 337 } 338 else 339 { 340 if (pos >= len) 341 break; 342 } 343 344 GET_NEXT_WCHAR(); 345 346 #ifdef TRE_DEBUG 347 DPRINT(("%3d:%2lc/%05d |", pos - 1, (tre_cint_t)prev_c, (int)prev_c)); 348 tre_print_reach(tnfa, reach_next, num_tags); 349 DPRINT(("%3d:%2lc/%05d |", pos, (tre_cint_t)next_c, (int)next_c)); 350 tre_print_reach(tnfa, reach_next, num_tags); 351 #endif /* TRE_DEBUG */ 352 353 /* Swap `reach' and `reach_next'. */ 354 reach_i = reach; 355 reach = reach_next; 356 reach_next = reach_i; 357 358 /* For each state in `reach', weed out states that don't fulfill the 359 minimal matching conditions. */ 360 if (tnfa->num_minimals && new_match) 361 { 362 new_match = 0; 363 reach_next_i = reach_next; 364 for (reach_i = reach; reach_i->state; reach_i++) 365 { 366 int skip = 0; 367 for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2) 368 { 369 int end = tnfa->minimal_tags[i]; 370 int start = tnfa->minimal_tags[i + 1]; 371 DPRINT((" Minimal start %d, end %d\n", start, end)); 372 if (end >= num_tags) 373 { 374 DPRINT((" Throwing %p out.\n", reach_i->state)); 375 skip = 1; 376 break; 377 } 378 else if (reach_i->tags[start] == match_tags[start] 379 && reach_i->tags[end] < match_tags[end]) 380 { 381 DPRINT((" Throwing %p out because t%d < %d\n", 382 reach_i->state, end, match_tags[end])); 383 skip = 1; 384 break; 385 } 386 } 387 if (!skip) 388 { 389 reach_next_i->state = reach_i->state; 390 tmp_iptr = reach_next_i->tags; 391 reach_next_i->tags = reach_i->tags; 392 reach_i->tags = tmp_iptr; 393 reach_next_i++; 394 } 395 } 396 reach_next_i->state = NULL; 397 398 /* Swap `reach' and `reach_next'. */ 399 reach_i = reach; 400 reach = reach_next; 401 reach_next = reach_i; 402 } 403 404 /* For each state in `reach' see if there is a transition leaving with 405 the current input symbol to a state not yet in `reach_next', and 406 add the destination states to `reach_next'. */ 407 reach_next_i = reach_next; 408 for (reach_i = reach; reach_i->state; reach_i++) 409 { 410 for (trans_i = reach_i->state; trans_i->state; trans_i++) 411 { 412 /* Does this transition match the input symbol? */ 413 if (trans_i->code_min <= (tre_cint_t)prev_c && 414 trans_i->code_max >= (tre_cint_t)prev_c) 415 { 416 if (trans_i->assertions 417 && (CHECK_ASSERTIONS(trans_i->assertions) 418 || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags))) 419 { 420 DPRINT(("assertion failed\n")); 421 continue; 422 } 423 424 /* Compute the tags after this transition. */ 425 for (i = 0; i < num_tags; i++) 426 tmp_tags[i] = reach_i->tags[i]; 427 tag_i = trans_i->tags; 428 if (tag_i != NULL) 429 while (*tag_i >= 0) 430 { 431 if (*tag_i < num_tags) 432 tmp_tags[*tag_i] = pos; 433 tag_i++; 434 } 435 436 if (reach_pos[trans_i->state_id].pos < pos) 437 { 438 /* Found an unvisited node. */ 439 reach_next_i->state = trans_i->state; 440 tmp_iptr = reach_next_i->tags; 441 reach_next_i->tags = tmp_tags; 442 tmp_tags = tmp_iptr; 443 reach_pos[trans_i->state_id].pos = pos; 444 reach_pos[trans_i->state_id].tags = &reach_next_i->tags; 445 446 if (reach_next_i->state == tnfa->final 447 && (match_eo == -1 448 || (num_tags > 0 449 && reach_next_i->tags[0] <= match_tags[0]))) 450 { 451 DPRINT((" found match %p\n", trans_i->state)); 452 match_eo = pos; 453 new_match = 1; 454 for (i = 0; i < num_tags; i++) 455 match_tags[i] = reach_next_i->tags[i]; 456 } 457 reach_next_i++; 458 459 } 460 else 461 { 462 assert(reach_pos[trans_i->state_id].pos == pos); 463 /* Another path has also reached this state. We choose 464 the winner by examining the tag values for both 465 paths. */ 466 if (tre_tag_order(num_tags, tnfa->tag_directions, 467 tmp_tags, 468 *reach_pos[trans_i->state_id].tags)) 469 { 470 /* The new path wins. */ 471 tmp_iptr = *reach_pos[trans_i->state_id].tags; 472 *reach_pos[trans_i->state_id].tags = tmp_tags; 473 if (trans_i->state == tnfa->final) 474 { 475 DPRINT((" found better match\n")); 476 match_eo = pos; 477 new_match = 1; 478 for (i = 0; i < num_tags; i++) 479 match_tags[i] = tmp_tags[i]; 480 } 481 tmp_tags = tmp_iptr; 482 } 483 } 484 } 485 } 486 } 487 reach_next_i->state = NULL; 488 } 489 490 DPRINT(("match end offset = %d\n", match_eo)); 491 492 #ifndef TRE_USE_ALLOCA 493 if (buf) 494 xfree(buf); 495 #endif /* !TRE_USE_ALLOCA */ 496 497 *match_end_ofs = match_eo; 498 return match_eo >= 0 ? REG_OK : REG_NOMATCH; 499 } 500 501 /* EOF */ 502