xref: /netbsd-src/external/bsd/tre/dist/lib/tre-match-parallel.c (revision b3178392f5caad8a10adb61c96d226e9c21c03ef)
1 /*
2   tre-match-parallel.c - TRE parallel regex matching engine
3 
4   This software is released under a BSD-style license.
5   See the file LICENSE for details and copyright.
6 
7 */
8 
9 /*
10   This algorithm searches for matches basically by reading characters
11   in the searched string one by one, starting at the beginning.	 All
12   matching paths in the TNFA are traversed in parallel.	 When two or
13   more paths reach the same state, exactly one is chosen according to
14   tag ordering rules; if returning submatches is not required it does
15   not matter which path is chosen.
16 
17   The worst case time required for finding the leftmost and longest
18   match, or determining that there is no match, is always linearly
19   dependent on the length of the text being searched.
20 
21   This algorithm cannot handle TNFAs with back referencing nodes.
22   See `tre-match-backtrack.c'.
23 */
24 
25 
26 #ifdef HAVE_CONFIG_H
27 #include <config.h>
28 #endif /* HAVE_CONFIG_H */
29 
30 #include "tre-alloca.h"
31 
32 #include <assert.h>
33 #include <stdlib.h>
34 #include <string.h>
35 #ifdef HAVE_WCHAR_H
36 #include <wchar.h>
37 #endif /* HAVE_WCHAR_H */
38 #ifdef HAVE_WCTYPE_H
39 #include <wctype.h>
40 #endif /* HAVE_WCTYPE_H */
41 #ifndef TRE_WCHAR
42 #include <ctype.h>
43 #endif /* !TRE_WCHAR */
44 #ifdef HAVE_MALLOC_H
45 #include <malloc.h>
46 #endif /* HAVE_MALLOC_H */
47 #include <stdint.h>
48 
49 #include "tre-internal.h"
50 #include "tre-match-utils.h"
51 #include "tre.h"
52 #include "xmalloc.h"
53 
54 
55 
56 typedef struct {
57   tre_tnfa_transition_t *state;
58   int *tags;
59 } tre_tnfa_reach_t;
60 
61 typedef struct {
62   int pos;
63   int **tags;
64 } tre_reach_pos_t;
65 
66 
67 #ifdef TRE_DEBUG
68 static void
tre_print_reach(const tre_tnfa_t * tnfa,tre_tnfa_reach_t * reach,int num_tags)69 tre_print_reach(const tre_tnfa_t *tnfa, tre_tnfa_reach_t *reach, int num_tags)
70 {
71   int i;
72 
73   while (reach->state != NULL)
74     {
75       DPRINT((" %p", (void *)reach->state));
76       if (num_tags > 0)
77 	{
78 	  DPRINT(("/"));
79 	  for (i = 0; i < num_tags; i++)
80 	    {
81 	      DPRINT(("%d:%d", i, reach->tags[i]));
82 	      if (i < (num_tags-1))
83 		DPRINT((","));
84 	    }
85 	}
86       reach++;
87     }
88   DPRINT(("\n"));
89 
90 }
91 #endif /* TRE_DEBUG */
92 
93 reg_errcode_t
tre_tnfa_run_parallel(const tre_tnfa_t * tnfa,const void * string,int len,tre_str_type_t type,int * match_tags,int eflags,int * match_end_ofs)94 tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len,
95 		      tre_str_type_t type, int *match_tags, int eflags,
96 		      int *match_end_ofs)
97 {
98   /* State variables required by GET_NEXT_WCHAR. */
99   tre_char_t prev_c = 0, next_c = 0;
100   const char *str_byte = string;
101   int pos = -1;
102   unsigned int pos_add_next = 1;
103 #ifdef TRE_WCHAR
104   const wchar_t *str_wide = string;
105 #ifdef TRE_MBSTATE
106   mbstate_t mbstate;
107 #endif /* TRE_MBSTATE */
108 #endif /* TRE_WCHAR */
109   int reg_notbol = eflags & REG_NOTBOL;
110   int reg_noteol = eflags & REG_NOTEOL;
111   int reg_newline = tnfa->cflags & REG_NEWLINE;
112   size_t str_user_end = 0;
113 
114   char *buf;
115   tre_tnfa_transition_t *trans_i;
116   tre_tnfa_reach_t *reach, *reach_next, *reach_i, *reach_next_i;
117   tre_reach_pos_t *reach_pos;
118   int *tag_i;
119   size_t num_tags, i;
120 
121   int match_eo = -1;	   /* end offset of match (-1 if no match found yet) */
122   int new_match = 0;
123   int *tmp_tags = NULL;
124   int *tmp_iptr;
125   reg_errcode_t ret;
126 
127 #ifdef TRE_MBSTATE
128   memset(&mbstate, '\0', sizeof(mbstate));
129 #endif /* TRE_MBSTATE */
130 
131   DPRINT(("tre_tnfa_run_parallel, input type %d\n", type));
132 
133   if (!match_tags)
134     num_tags = 0;
135   else
136     num_tags = tnfa->num_tags;
137 
138   /* Allocate memory for temporary data required for matching.	This needs to
139      be done for every matching operation to be thread safe.  This allocates
140      everything in a single large block from the stack frame using alloca()
141      or with malloc() if alloca is unavailable. */
142   {
143     size_t tbytes, rbytes, pbytes, xbytes, total_bytes;
144     char *tmp_buf;
145 
146     /* Ensure that tbytes and xbytes*num_states cannot overflow, and that
147      * they don't contribute more than 1/8 of SIZE_MAX to total_bytes. */
148     if (num_tags > SIZE_MAX/(8 * sizeof(int) * tnfa->num_states))
149       return REG_ESPACE;
150 
151     /* Likewise check rbytes. */
152     if (tnfa->num_states+1 > SIZE_MAX/(8 * sizeof(*reach_next)))
153       return REG_ESPACE;
154 
155     /* Likewise check pbytes. */
156     if (tnfa->num_states > SIZE_MAX/(8 * sizeof(*reach_pos)))
157       return REG_ESPACE;
158 
159     /* Compute the length of the block we need. */
160     tbytes = sizeof(*tmp_tags) * num_tags;
161     rbytes = sizeof(*reach_next) * (tnfa->num_states + 1);
162     pbytes = sizeof(*reach_pos) * tnfa->num_states;
163     xbytes = sizeof(int) * num_tags;
164     total_bytes =
165       (sizeof(long) - 1) * 4 /* for alignment paddings */
166       + (rbytes + xbytes * tnfa->num_states) * 2 + tbytes + pbytes;
167 
168     /* Allocate the memory. */
169 #ifdef TRE_USE_ALLOCA
170     buf = alloca(total_bytes);
171 #else /* !TRE_USE_ALLOCA */
172     buf = xmalloc((unsigned)total_bytes);
173 #endif /* !TRE_USE_ALLOCA */
174     if (buf == NULL)
175       return REG_ESPACE;
176     memset(buf, 0, (size_t)total_bytes);
177 
178     /* Get the various pointers within tmp_buf (properly aligned). */
179     tmp_tags = (void *)buf;
180     tmp_buf = buf + tbytes;
181     tmp_buf += ALIGN(tmp_buf, long);
182     reach_next = (void *)tmp_buf;
183     tmp_buf += rbytes;
184     tmp_buf += ALIGN(tmp_buf, long);
185     reach = (void *)tmp_buf;
186     tmp_buf += rbytes;
187     tmp_buf += ALIGN(tmp_buf, long);
188     reach_pos = (void *)tmp_buf;
189     tmp_buf += pbytes;
190     tmp_buf += ALIGN(tmp_buf, long);
191     for (i = 0; i < tnfa->num_states; i++)
192       {
193 	reach[i].tags = (void *)tmp_buf;
194 	tmp_buf += xbytes;
195 	reach_next[i].tags = (void *)tmp_buf;
196 	tmp_buf += xbytes;
197       }
198   }
199 
200   for (i = 0; i < tnfa->num_states; i++)
201     reach_pos[i].pos = -1;
202 
203   /* If only one character can start a match, find it first. */
204   if (tnfa->first_char >= 0 && type == STR_BYTE && str_byte)
205     {
206       const char *orig_str = str_byte;
207       int first = tnfa->first_char;
208 
209       if (len >= 0)
210 	str_byte = memchr(orig_str, first, (size_t)len);
211       else
212 	str_byte = strchr(orig_str, first);
213       if (str_byte == NULL)
214 	{
215 #ifndef TRE_USE_ALLOCA
216 	  if (buf)
217 	    xfree(buf);
218 #endif /* !TRE_USE_ALLOCA */
219 	  return REG_NOMATCH;
220 	}
221       DPRINT(("skipped %lu chars\n", (unsigned long)(str_byte - orig_str)));
222       if (str_byte >= orig_str + 1)
223 	prev_c = (unsigned char)*(str_byte - 1);
224       next_c = (unsigned char)*str_byte;
225       pos = (int)(str_byte - orig_str);
226       if (len < 0 || pos < len)
227 	str_byte++;
228     }
229   else
230     {
231       GET_NEXT_WCHAR();
232       pos = 0;
233     }
234 
235 #if 0
236   /* Skip over characters that cannot possibly be the first character
237      of a match. */
238   if (tnfa->firstpos_chars != NULL)
239     {
240       char *chars = tnfa->firstpos_chars;
241 
242       if (len < 0)
243 	{
244 	  const char *orig_str = str_byte;
245 	  /* XXX - use strpbrk() and wcspbrk() because they might be
246 	     optimized for the target architecture.  Try also strcspn()
247 	     and wcscspn() and compare the speeds. */
248 	  while (next_c != L'\0' && !chars[next_c])
249 	    {
250 	      next_c = *str_byte++;
251 	    }
252 	  prev_c = *(str_byte - 2);
253 	  pos += str_byte - orig_str;
254 	  DPRINT(("skipped %d chars\n", str_byte - orig_str));
255 	}
256       else
257 	{
258 	  while (pos <= len && !chars[next_c])
259 	    {
260 	      prev_c = next_c;
261 	      next_c = (unsigned char)(*str_byte++);
262 	      pos++;
263 	    }
264 	}
265     }
266 #endif
267 
268   DPRINT(("length: %d\n", len));
269   DPRINT(("pos:chr/code | states and tags\n"));
270   DPRINT(("-------------+------------------------------------------------\n"));
271 
272   reach_next_i = reach_next;
273   while (/*CONSTCOND*/(void)1,1)
274     {
275       /* If no match found yet, add the initial states to `reach_next'. */
276       if (match_eo < 0)
277 	{
278 	  DPRINT((" init >"));
279 	  trans_i = tnfa->initial;
280 	  while (trans_i->state != NULL)
281 	    {
282 	      if (reach_pos[trans_i->state_id].pos < pos)
283 		{
284 		  if (trans_i->assertions
285 		      && CHECK_ASSERTIONS(trans_i->assertions))
286 		    {
287 		      DPRINT(("assertion failed\n"));
288 		      trans_i++;
289 		      continue;
290 		    }
291 
292 		  DPRINT((" %p", (void *)trans_i->state));
293 		  reach_next_i->state = trans_i->state;
294 		  for (i = 0; i < num_tags; i++)
295 		    reach_next_i->tags[i] = -1;
296 		  tag_i = trans_i->tags;
297 		  if (tag_i)
298 		    while (*tag_i >= 0)
299 		      {
300 			if ((size_t)*tag_i < num_tags)
301 			  reach_next_i->tags[*tag_i] = pos;
302 			tag_i++;
303 		      }
304 		  if (reach_next_i->state == tnfa->final)
305 		    {
306 		      DPRINT(("	 found empty match\n"));
307 		      match_eo = pos;
308 		      new_match = 1;
309 		      for (i = 0; i < num_tags; i++)
310 			match_tags[i] = reach_next_i->tags[i];
311 		    }
312 		  reach_pos[trans_i->state_id].pos = pos;
313 		  reach_pos[trans_i->state_id].tags = &reach_next_i->tags;
314 		  reach_next_i++;
315 		}
316 	      trans_i++;
317 	    }
318 	  DPRINT(("\n"));
319 	  reach_next_i->state = NULL;
320 	}
321       else
322 	{
323 	  if (num_tags == 0 || reach_next_i == reach_next)
324 	    /* We have found a match. */
325 	    break;
326 	}
327 
328       /* Check for end of string. */
329       if (len < 0)
330 	{
331 	  if (type == STR_USER)
332 	    {
333 	      if (str_user_end)
334 		break;
335 	    }
336 	  else if (next_c == L'\0')
337 	    break;
338 	}
339       else
340 	{
341 	  if (pos >= len)
342 	    break;
343 	}
344 
345       GET_NEXT_WCHAR();
346 
347 #ifdef TRE_DEBUG
348       DPRINT(("%3d:%2lc/%05d |", pos - 1, (tre_cint_t)prev_c, (int)prev_c));
349       tre_print_reach(tnfa, reach_next, num_tags);
350       DPRINT(("%3d:%2lc/%05d |", pos, (tre_cint_t)next_c, (int)next_c));
351       tre_print_reach(tnfa, reach_next, num_tags);
352 #endif /* TRE_DEBUG */
353 
354       /* Swap `reach' and `reach_next'. */
355       reach_i = reach;
356       reach = reach_next;
357       reach_next = reach_i;
358 
359       /* For each state in `reach', weed out states that don't fulfill the
360 	 minimal matching conditions. */
361       if (tnfa->num_minimals && new_match)
362 	{
363 	  new_match = 0;
364 	  reach_next_i = reach_next;
365 	  for (reach_i = reach; reach_i->state; reach_i++)
366 	    {
367 	      int skip = 0;
368 	      for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2)
369 		{
370 		  int end = tnfa->minimal_tags[i];
371 		  int start = tnfa->minimal_tags[i + 1];
372 		  DPRINT(("  Minimal start %d, end %d\n", start, end));
373 		  if ((size_t)end >= num_tags)
374 		    {
375 		      DPRINT(("	 Throwing %p out.\n", reach_i->state));
376 		      skip = 1;
377 		      break;
378 		    }
379 		  else if (reach_i->tags[start] == match_tags[start]
380 			   && reach_i->tags[end] < match_tags[end])
381 		    {
382 		      DPRINT(("	 Throwing %p out because t%d < %d\n",
383 			      reach_i->state, end, match_tags[end]));
384 		      skip = 1;
385 		      break;
386 		    }
387 		}
388 	      if (!skip)
389 		{
390 		  reach_next_i->state = reach_i->state;
391 		  tmp_iptr = reach_next_i->tags;
392 		  reach_next_i->tags = reach_i->tags;
393 		  reach_i->tags = tmp_iptr;
394 		  reach_next_i++;
395 		}
396 	    }
397 	  reach_next_i->state = NULL;
398 
399 	  /* Swap `reach' and `reach_next'. */
400 	  reach_i = reach;
401 	  reach = reach_next;
402 	  reach_next = reach_i;
403 	}
404 
405       /* For each state in `reach' see if there is a transition leaving with
406 	 the current input symbol to a state not yet in `reach_next', and
407 	 add the destination states to `reach_next'. */
408       reach_next_i = reach_next;
409       for (reach_i = reach; reach_i->state; reach_i++)
410 	{
411 	  for (trans_i = reach_i->state; trans_i->state; trans_i++)
412 	    {
413 	      /* Does this transition match the input symbol? */
414 	      if (trans_i->code_min <= (tre_cint_t)prev_c &&
415 		  trans_i->code_max >= (tre_cint_t)prev_c)
416 		{
417 		  if (trans_i->assertions
418 		      && (CHECK_ASSERTIONS(trans_i->assertions)
419 			  || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)))
420 		    {
421 		      DPRINT(("assertion failed\n"));
422 		      continue;
423 		    }
424 
425 		  /* Compute the tags after this transition. */
426 		  for (i = 0; i < num_tags; i++)
427 		    tmp_tags[i] = reach_i->tags[i];
428 		  tag_i = trans_i->tags;
429 		  if (tag_i != NULL)
430 		    while (*tag_i >= 0)
431 		      {
432 			if ((size_t)*tag_i < num_tags)
433 			  tmp_tags[*tag_i] = pos;
434 			tag_i++;
435 		      }
436 
437 		  if (reach_pos[trans_i->state_id].pos < pos)
438 		    {
439 		      /* Found an unvisited node. */
440 		      reach_next_i->state = trans_i->state;
441 		      tmp_iptr = reach_next_i->tags;
442 		      reach_next_i->tags = tmp_tags;
443 		      tmp_tags = tmp_iptr;
444 		      reach_pos[trans_i->state_id].pos = pos;
445 		      reach_pos[trans_i->state_id].tags = &reach_next_i->tags;
446 
447 		      if (reach_next_i->state == tnfa->final
448 			  && (match_eo == -1
449 			      || (num_tags > 0
450 				  && reach_next_i->tags[0] <= match_tags[0])))
451 			{
452 			  DPRINT(("  found match %p\n", trans_i->state));
453 			  match_eo = pos;
454 			  new_match = 1;
455 			  for (i = 0; i < num_tags; i++)
456 			    match_tags[i] = reach_next_i->tags[i];
457 			}
458 		      reach_next_i++;
459 
460 		    }
461 		  else
462 		    {
463 		      assert(reach_pos[trans_i->state_id].pos == pos);
464 		      /* Another path has also reached this state.  We choose
465 			 the winner by examining the tag values for both
466 			 paths. */
467 		      if (tre_tag_order(num_tags, tnfa->tag_directions,
468 					tmp_tags,
469 					*reach_pos[trans_i->state_id].tags))
470 			{
471 			  /* The new path wins. */
472 			  tmp_iptr = *reach_pos[trans_i->state_id].tags;
473 			  *reach_pos[trans_i->state_id].tags = tmp_tags;
474 			  if (trans_i->state == tnfa->final)
475 			    {
476 			      DPRINT(("	 found better match\n"));
477 			      match_eo = pos;
478 			      new_match = 1;
479 			      for (i = 0; i < num_tags; i++)
480 				match_tags[i] = tmp_tags[i];
481 			    }
482 			  tmp_tags = tmp_iptr;
483 			}
484 		    }
485 		}
486 	    }
487 	}
488       reach_next_i->state = NULL;
489     }
490 
491   DPRINT(("match end offset = %d\n", match_eo));
492 
493   *match_end_ofs = match_eo;
494   ret = match_eo >= 0 ? REG_OK : REG_NOMATCH;
495 error_exit:
496 #ifndef TRE_USE_ALLOCA
497   if (buf)
498     xfree(buf);
499 #endif /* !TRE_USE_ALLOCA */
500   return ret;
501 }
502 
503 /* EOF */
504