1 /*
2 tre-match-parallel.c - TRE parallel regex matching engine
3
4 This software is released under a BSD-style license.
5 See the file LICENSE for details and copyright.
6
7 */
8
9 /*
10 This algorithm searches for matches basically by reading characters
11 in the searched string one by one, starting at the beginning. All
12 matching paths in the TNFA are traversed in parallel. When two or
13 more paths reach the same state, exactly one is chosen according to
14 tag ordering rules; if returning submatches is not required it does
15 not matter which path is chosen.
16
17 The worst case time required for finding the leftmost and longest
18 match, or determining that there is no match, is always linearly
19 dependent on the length of the text being searched.
20
21 This algorithm cannot handle TNFAs with back referencing nodes.
22 See `tre-match-backtrack.c'.
23 */
24
25
26 #ifdef HAVE_CONFIG_H
27 #include <config.h>
28 #endif /* HAVE_CONFIG_H */
29
30 #include "tre-alloca.h"
31
32 #include <assert.h>
33 #include <stdlib.h>
34 #include <string.h>
35 #ifdef HAVE_WCHAR_H
36 #include <wchar.h>
37 #endif /* HAVE_WCHAR_H */
38 #ifdef HAVE_WCTYPE_H
39 #include <wctype.h>
40 #endif /* HAVE_WCTYPE_H */
41 #ifndef TRE_WCHAR
42 #include <ctype.h>
43 #endif /* !TRE_WCHAR */
44 #ifdef HAVE_MALLOC_H
45 #include <malloc.h>
46 #endif /* HAVE_MALLOC_H */
47 #include <stdint.h>
48
49 #include "tre-internal.h"
50 #include "tre-match-utils.h"
51 #include "tre.h"
52 #include "xmalloc.h"
53
54
55
56 typedef struct {
57 tre_tnfa_transition_t *state;
58 int *tags;
59 } tre_tnfa_reach_t;
60
61 typedef struct {
62 int pos;
63 int **tags;
64 } tre_reach_pos_t;
65
66
67 #ifdef TRE_DEBUG
68 static void
tre_print_reach(const tre_tnfa_t * tnfa,tre_tnfa_reach_t * reach,int num_tags)69 tre_print_reach(const tre_tnfa_t *tnfa, tre_tnfa_reach_t *reach, int num_tags)
70 {
71 int i;
72
73 while (reach->state != NULL)
74 {
75 DPRINT((" %p", (void *)reach->state));
76 if (num_tags > 0)
77 {
78 DPRINT(("/"));
79 for (i = 0; i < num_tags; i++)
80 {
81 DPRINT(("%d:%d", i, reach->tags[i]));
82 if (i < (num_tags-1))
83 DPRINT((","));
84 }
85 }
86 reach++;
87 }
88 DPRINT(("\n"));
89
90 }
91 #endif /* TRE_DEBUG */
92
93 reg_errcode_t
tre_tnfa_run_parallel(const tre_tnfa_t * tnfa,const void * string,int len,tre_str_type_t type,int * match_tags,int eflags,int * match_end_ofs)94 tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len,
95 tre_str_type_t type, int *match_tags, int eflags,
96 int *match_end_ofs)
97 {
98 /* State variables required by GET_NEXT_WCHAR. */
99 tre_char_t prev_c = 0, next_c = 0;
100 const char *str_byte = string;
101 int pos = -1;
102 unsigned int pos_add_next = 1;
103 #ifdef TRE_WCHAR
104 const wchar_t *str_wide = string;
105 #ifdef TRE_MBSTATE
106 mbstate_t mbstate;
107 #endif /* TRE_MBSTATE */
108 #endif /* TRE_WCHAR */
109 int reg_notbol = eflags & REG_NOTBOL;
110 int reg_noteol = eflags & REG_NOTEOL;
111 int reg_newline = tnfa->cflags & REG_NEWLINE;
112 size_t str_user_end = 0;
113
114 char *buf;
115 tre_tnfa_transition_t *trans_i;
116 tre_tnfa_reach_t *reach, *reach_next, *reach_i, *reach_next_i;
117 tre_reach_pos_t *reach_pos;
118 int *tag_i;
119 size_t num_tags, i;
120
121 int match_eo = -1; /* end offset of match (-1 if no match found yet) */
122 int new_match = 0;
123 int *tmp_tags = NULL;
124 int *tmp_iptr;
125 reg_errcode_t ret;
126
127 #ifdef TRE_MBSTATE
128 memset(&mbstate, '\0', sizeof(mbstate));
129 #endif /* TRE_MBSTATE */
130
131 DPRINT(("tre_tnfa_run_parallel, input type %d\n", type));
132
133 if (!match_tags)
134 num_tags = 0;
135 else
136 num_tags = tnfa->num_tags;
137
138 /* Allocate memory for temporary data required for matching. This needs to
139 be done for every matching operation to be thread safe. This allocates
140 everything in a single large block from the stack frame using alloca()
141 or with malloc() if alloca is unavailable. */
142 {
143 size_t tbytes, rbytes, pbytes, xbytes, total_bytes;
144 char *tmp_buf;
145
146 /* Ensure that tbytes and xbytes*num_states cannot overflow, and that
147 * they don't contribute more than 1/8 of SIZE_MAX to total_bytes. */
148 if (num_tags > SIZE_MAX/(8 * sizeof(int) * tnfa->num_states))
149 return REG_ESPACE;
150
151 /* Likewise check rbytes. */
152 if (tnfa->num_states+1 > SIZE_MAX/(8 * sizeof(*reach_next)))
153 return REG_ESPACE;
154
155 /* Likewise check pbytes. */
156 if (tnfa->num_states > SIZE_MAX/(8 * sizeof(*reach_pos)))
157 return REG_ESPACE;
158
159 /* Compute the length of the block we need. */
160 tbytes = sizeof(*tmp_tags) * num_tags;
161 rbytes = sizeof(*reach_next) * (tnfa->num_states + 1);
162 pbytes = sizeof(*reach_pos) * tnfa->num_states;
163 xbytes = sizeof(int) * num_tags;
164 total_bytes =
165 (sizeof(long) - 1) * 4 /* for alignment paddings */
166 + (rbytes + xbytes * tnfa->num_states) * 2 + tbytes + pbytes;
167
168 /* Allocate the memory. */
169 #ifdef TRE_USE_ALLOCA
170 buf = alloca(total_bytes);
171 #else /* !TRE_USE_ALLOCA */
172 buf = xmalloc((unsigned)total_bytes);
173 #endif /* !TRE_USE_ALLOCA */
174 if (buf == NULL)
175 return REG_ESPACE;
176 memset(buf, 0, (size_t)total_bytes);
177
178 /* Get the various pointers within tmp_buf (properly aligned). */
179 tmp_tags = (void *)buf;
180 tmp_buf = buf + tbytes;
181 tmp_buf += ALIGN(tmp_buf, long);
182 reach_next = (void *)tmp_buf;
183 tmp_buf += rbytes;
184 tmp_buf += ALIGN(tmp_buf, long);
185 reach = (void *)tmp_buf;
186 tmp_buf += rbytes;
187 tmp_buf += ALIGN(tmp_buf, long);
188 reach_pos = (void *)tmp_buf;
189 tmp_buf += pbytes;
190 tmp_buf += ALIGN(tmp_buf, long);
191 for (i = 0; i < tnfa->num_states; i++)
192 {
193 reach[i].tags = (void *)tmp_buf;
194 tmp_buf += xbytes;
195 reach_next[i].tags = (void *)tmp_buf;
196 tmp_buf += xbytes;
197 }
198 }
199
200 for (i = 0; i < tnfa->num_states; i++)
201 reach_pos[i].pos = -1;
202
203 /* If only one character can start a match, find it first. */
204 if (tnfa->first_char >= 0 && type == STR_BYTE && str_byte)
205 {
206 const char *orig_str = str_byte;
207 int first = tnfa->first_char;
208
209 if (len >= 0)
210 str_byte = memchr(orig_str, first, (size_t)len);
211 else
212 str_byte = strchr(orig_str, first);
213 if (str_byte == NULL)
214 {
215 #ifndef TRE_USE_ALLOCA
216 if (buf)
217 xfree(buf);
218 #endif /* !TRE_USE_ALLOCA */
219 return REG_NOMATCH;
220 }
221 DPRINT(("skipped %lu chars\n", (unsigned long)(str_byte - orig_str)));
222 if (str_byte >= orig_str + 1)
223 prev_c = (unsigned char)*(str_byte - 1);
224 next_c = (unsigned char)*str_byte;
225 pos = (int)(str_byte - orig_str);
226 if (len < 0 || pos < len)
227 str_byte++;
228 }
229 else
230 {
231 GET_NEXT_WCHAR();
232 pos = 0;
233 }
234
235 #if 0
236 /* Skip over characters that cannot possibly be the first character
237 of a match. */
238 if (tnfa->firstpos_chars != NULL)
239 {
240 char *chars = tnfa->firstpos_chars;
241
242 if (len < 0)
243 {
244 const char *orig_str = str_byte;
245 /* XXX - use strpbrk() and wcspbrk() because they might be
246 optimized for the target architecture. Try also strcspn()
247 and wcscspn() and compare the speeds. */
248 while (next_c != L'\0' && !chars[next_c])
249 {
250 next_c = *str_byte++;
251 }
252 prev_c = *(str_byte - 2);
253 pos += str_byte - orig_str;
254 DPRINT(("skipped %d chars\n", str_byte - orig_str));
255 }
256 else
257 {
258 while (pos <= len && !chars[next_c])
259 {
260 prev_c = next_c;
261 next_c = (unsigned char)(*str_byte++);
262 pos++;
263 }
264 }
265 }
266 #endif
267
268 DPRINT(("length: %d\n", len));
269 DPRINT(("pos:chr/code | states and tags\n"));
270 DPRINT(("-------------+------------------------------------------------\n"));
271
272 reach_next_i = reach_next;
273 while (/*CONSTCOND*/(void)1,1)
274 {
275 /* If no match found yet, add the initial states to `reach_next'. */
276 if (match_eo < 0)
277 {
278 DPRINT((" init >"));
279 trans_i = tnfa->initial;
280 while (trans_i->state != NULL)
281 {
282 if (reach_pos[trans_i->state_id].pos < pos)
283 {
284 if (trans_i->assertions
285 && CHECK_ASSERTIONS(trans_i->assertions))
286 {
287 DPRINT(("assertion failed\n"));
288 trans_i++;
289 continue;
290 }
291
292 DPRINT((" %p", (void *)trans_i->state));
293 reach_next_i->state = trans_i->state;
294 for (i = 0; i < num_tags; i++)
295 reach_next_i->tags[i] = -1;
296 tag_i = trans_i->tags;
297 if (tag_i)
298 while (*tag_i >= 0)
299 {
300 if ((size_t)*tag_i < num_tags)
301 reach_next_i->tags[*tag_i] = pos;
302 tag_i++;
303 }
304 if (reach_next_i->state == tnfa->final)
305 {
306 DPRINT((" found empty match\n"));
307 match_eo = pos;
308 new_match = 1;
309 for (i = 0; i < num_tags; i++)
310 match_tags[i] = reach_next_i->tags[i];
311 }
312 reach_pos[trans_i->state_id].pos = pos;
313 reach_pos[trans_i->state_id].tags = &reach_next_i->tags;
314 reach_next_i++;
315 }
316 trans_i++;
317 }
318 DPRINT(("\n"));
319 reach_next_i->state = NULL;
320 }
321 else
322 {
323 if (num_tags == 0 || reach_next_i == reach_next)
324 /* We have found a match. */
325 break;
326 }
327
328 /* Check for end of string. */
329 if (len < 0)
330 {
331 if (type == STR_USER)
332 {
333 if (str_user_end)
334 break;
335 }
336 else if (next_c == L'\0')
337 break;
338 }
339 else
340 {
341 if (pos >= len)
342 break;
343 }
344
345 GET_NEXT_WCHAR();
346
347 #ifdef TRE_DEBUG
348 DPRINT(("%3d:%2lc/%05d |", pos - 1, (tre_cint_t)prev_c, (int)prev_c));
349 tre_print_reach(tnfa, reach_next, num_tags);
350 DPRINT(("%3d:%2lc/%05d |", pos, (tre_cint_t)next_c, (int)next_c));
351 tre_print_reach(tnfa, reach_next, num_tags);
352 #endif /* TRE_DEBUG */
353
354 /* Swap `reach' and `reach_next'. */
355 reach_i = reach;
356 reach = reach_next;
357 reach_next = reach_i;
358
359 /* For each state in `reach', weed out states that don't fulfill the
360 minimal matching conditions. */
361 if (tnfa->num_minimals && new_match)
362 {
363 new_match = 0;
364 reach_next_i = reach_next;
365 for (reach_i = reach; reach_i->state; reach_i++)
366 {
367 int skip = 0;
368 for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2)
369 {
370 int end = tnfa->minimal_tags[i];
371 int start = tnfa->minimal_tags[i + 1];
372 DPRINT((" Minimal start %d, end %d\n", start, end));
373 if ((size_t)end >= num_tags)
374 {
375 DPRINT((" Throwing %p out.\n", reach_i->state));
376 skip = 1;
377 break;
378 }
379 else if (reach_i->tags[start] == match_tags[start]
380 && reach_i->tags[end] < match_tags[end])
381 {
382 DPRINT((" Throwing %p out because t%d < %d\n",
383 reach_i->state, end, match_tags[end]));
384 skip = 1;
385 break;
386 }
387 }
388 if (!skip)
389 {
390 reach_next_i->state = reach_i->state;
391 tmp_iptr = reach_next_i->tags;
392 reach_next_i->tags = reach_i->tags;
393 reach_i->tags = tmp_iptr;
394 reach_next_i++;
395 }
396 }
397 reach_next_i->state = NULL;
398
399 /* Swap `reach' and `reach_next'. */
400 reach_i = reach;
401 reach = reach_next;
402 reach_next = reach_i;
403 }
404
405 /* For each state in `reach' see if there is a transition leaving with
406 the current input symbol to a state not yet in `reach_next', and
407 add the destination states to `reach_next'. */
408 reach_next_i = reach_next;
409 for (reach_i = reach; reach_i->state; reach_i++)
410 {
411 for (trans_i = reach_i->state; trans_i->state; trans_i++)
412 {
413 /* Does this transition match the input symbol? */
414 if (trans_i->code_min <= (tre_cint_t)prev_c &&
415 trans_i->code_max >= (tre_cint_t)prev_c)
416 {
417 if (trans_i->assertions
418 && (CHECK_ASSERTIONS(trans_i->assertions)
419 || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)))
420 {
421 DPRINT(("assertion failed\n"));
422 continue;
423 }
424
425 /* Compute the tags after this transition. */
426 for (i = 0; i < num_tags; i++)
427 tmp_tags[i] = reach_i->tags[i];
428 tag_i = trans_i->tags;
429 if (tag_i != NULL)
430 while (*tag_i >= 0)
431 {
432 if ((size_t)*tag_i < num_tags)
433 tmp_tags[*tag_i] = pos;
434 tag_i++;
435 }
436
437 if (reach_pos[trans_i->state_id].pos < pos)
438 {
439 /* Found an unvisited node. */
440 reach_next_i->state = trans_i->state;
441 tmp_iptr = reach_next_i->tags;
442 reach_next_i->tags = tmp_tags;
443 tmp_tags = tmp_iptr;
444 reach_pos[trans_i->state_id].pos = pos;
445 reach_pos[trans_i->state_id].tags = &reach_next_i->tags;
446
447 if (reach_next_i->state == tnfa->final
448 && (match_eo == -1
449 || (num_tags > 0
450 && reach_next_i->tags[0] <= match_tags[0])))
451 {
452 DPRINT((" found match %p\n", trans_i->state));
453 match_eo = pos;
454 new_match = 1;
455 for (i = 0; i < num_tags; i++)
456 match_tags[i] = reach_next_i->tags[i];
457 }
458 reach_next_i++;
459
460 }
461 else
462 {
463 assert(reach_pos[trans_i->state_id].pos == pos);
464 /* Another path has also reached this state. We choose
465 the winner by examining the tag values for both
466 paths. */
467 if (tre_tag_order(num_tags, tnfa->tag_directions,
468 tmp_tags,
469 *reach_pos[trans_i->state_id].tags))
470 {
471 /* The new path wins. */
472 tmp_iptr = *reach_pos[trans_i->state_id].tags;
473 *reach_pos[trans_i->state_id].tags = tmp_tags;
474 if (trans_i->state == tnfa->final)
475 {
476 DPRINT((" found better match\n"));
477 match_eo = pos;
478 new_match = 1;
479 for (i = 0; i < num_tags; i++)
480 match_tags[i] = tmp_tags[i];
481 }
482 tmp_tags = tmp_iptr;
483 }
484 }
485 }
486 }
487 }
488 reach_next_i->state = NULL;
489 }
490
491 DPRINT(("match end offset = %d\n", match_eo));
492
493 *match_end_ofs = match_eo;
494 ret = match_eo >= 0 ? REG_OK : REG_NOMATCH;
495 error_exit:
496 #ifndef TRE_USE_ALLOCA
497 if (buf)
498 xfree(buf);
499 #endif /* !TRE_USE_ALLOCA */
500 return ret;
501 }
502
503 /* EOF */
504