xref: /netbsd-src/external/bsd/tre/dist/lib/tre-match-parallel.c (revision 6a493d6bc668897c91594964a732d38505b70cbb)
1 /*
2   tre-match-parallel.c - TRE parallel regex matching engine
3 
4   This software is released under a BSD-style license.
5   See the file LICENSE for details and copyright.
6 
7 */
8 
9 /*
10   This algorithm searches for matches basically by reading characters
11   in the searched string one by one, starting at the beginning.	 All
12   matching paths in the TNFA are traversed in parallel.	 When two or
13   more paths reach the same state, exactly one is chosen according to
14   tag ordering rules; if returning submatches is not required it does
15   not matter which path is chosen.
16 
17   The worst case time required for finding the leftmost and longest
18   match, or determining that there is no match, is always linearly
19   dependent on the length of the text being searched.
20 
21   This algorithm cannot handle TNFAs with back referencing nodes.
22   See `tre-match-backtrack.c'.
23 */
24 
25 
26 #ifdef HAVE_CONFIG_H
27 #include <config.h>
28 #endif /* HAVE_CONFIG_H */
29 
30 #include "tre-alloca.h"
31 
32 #include <assert.h>
33 #include <stdlib.h>
34 #include <string.h>
35 #ifdef HAVE_WCHAR_H
36 #include <wchar.h>
37 #endif /* HAVE_WCHAR_H */
38 #ifdef HAVE_WCTYPE_H
39 #include <wctype.h>
40 #endif /* HAVE_WCTYPE_H */
41 #ifndef TRE_WCHAR
42 #include <ctype.h>
43 #endif /* !TRE_WCHAR */
44 #ifdef HAVE_MALLOC_H
45 #include <malloc.h>
46 #endif /* HAVE_MALLOC_H */
47 
48 #include "tre-internal.h"
49 #include "tre-match-utils.h"
50 #include "tre.h"
51 #include "xmalloc.h"
52 
53 
54 
55 typedef struct {
56   tre_tnfa_transition_t *state;
57   int *tags;
58 } tre_tnfa_reach_t;
59 
60 typedef struct {
61   int pos;
62   int **tags;
63 } tre_reach_pos_t;
64 
65 
66 #ifdef TRE_DEBUG
67 static void
68 tre_print_reach(const tre_tnfa_t *tnfa, tre_tnfa_reach_t *reach, int num_tags)
69 {
70   int i;
71 
72   while (reach->state != NULL)
73     {
74       DPRINT((" %p", (void *)reach->state));
75       if (num_tags > 0)
76 	{
77 	  DPRINT(("/"));
78 	  for (i = 0; i < num_tags; i++)
79 	    {
80 	      DPRINT(("%d:%d", i, reach->tags[i]));
81 	      if (i < (num_tags-1))
82 		DPRINT((","));
83 	    }
84 	}
85       reach++;
86     }
87   DPRINT(("\n"));
88 
89 }
90 #endif /* TRE_DEBUG */
91 
92 reg_errcode_t
93 tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len,
94 		      tre_str_type_t type, int *match_tags, int eflags,
95 		      int *match_end_ofs)
96 {
97   /* State variables required by GET_NEXT_WCHAR. */
98   tre_char_t prev_c = 0, next_c = 0;
99   const char *str_byte = string;
100   int pos = -1;
101   unsigned int pos_add_next = 1;
102 #ifdef TRE_WCHAR
103   const wchar_t *str_wide = string;
104 #ifdef TRE_MBSTATE
105   mbstate_t mbstate;
106 #endif /* TRE_MBSTATE */
107 #endif /* TRE_WCHAR */
108   int reg_notbol = eflags & REG_NOTBOL;
109   int reg_noteol = eflags & REG_NOTEOL;
110   int reg_newline = tnfa->cflags & REG_NEWLINE;
111   size_t str_user_end = 0;
112 
113   char *buf;
114   tre_tnfa_transition_t *trans_i;
115   tre_tnfa_reach_t *reach, *reach_next, *reach_i, *reach_next_i;
116   tre_reach_pos_t *reach_pos;
117   int *tag_i;
118   size_t num_tags, i;
119 
120   int match_eo = -1;	   /* end offset of match (-1 if no match found yet) */
121   int new_match = 0;
122   int *tmp_tags = NULL;
123   int *tmp_iptr;
124 
125 #ifdef TRE_MBSTATE
126   memset(&mbstate, '\0', sizeof(mbstate));
127 #endif /* TRE_MBSTATE */
128 
129   DPRINT(("tre_tnfa_run_parallel, input type %d\n", type));
130 
131   if (!match_tags)
132     num_tags = 0;
133   else
134     num_tags = tnfa->num_tags;
135 
136   /* Allocate memory for temporary data required for matching.	This needs to
137      be done for every matching operation to be thread safe.  This allocates
138      everything in a single large block from the stack frame using alloca()
139      or with malloc() if alloca is unavailable. */
140   {
141     size_t tbytes, rbytes, pbytes, xbytes, total_bytes;
142     char *tmp_buf;
143     /* Compute the length of the block we need. */
144     tbytes = sizeof(*tmp_tags) * num_tags;
145     rbytes = sizeof(*reach_next) * (tnfa->num_states + 1);
146     pbytes = sizeof(*reach_pos) * tnfa->num_states;
147     xbytes = sizeof(int) * num_tags;
148     total_bytes =
149       (sizeof(long) - 1) * 4 /* for alignment paddings */
150       + (rbytes + xbytes * tnfa->num_states) * 2 + tbytes + pbytes;
151 
152     /* Allocate the memory. */
153 #ifdef TRE_USE_ALLOCA
154     buf = alloca(total_bytes);
155 #else /* !TRE_USE_ALLOCA */
156     buf = xmalloc((unsigned)total_bytes);
157 #endif /* !TRE_USE_ALLOCA */
158     if (buf == NULL)
159       return REG_ESPACE;
160     memset(buf, 0, (size_t)total_bytes);
161 
162     /* Get the various pointers within tmp_buf (properly aligned). */
163     tmp_tags = (void *)buf;
164     tmp_buf = buf + tbytes;
165     tmp_buf += ALIGN(tmp_buf, long);
166     reach_next = (void *)tmp_buf;
167     tmp_buf += rbytes;
168     tmp_buf += ALIGN(tmp_buf, long);
169     reach = (void *)tmp_buf;
170     tmp_buf += rbytes;
171     tmp_buf += ALIGN(tmp_buf, long);
172     reach_pos = (void *)tmp_buf;
173     tmp_buf += pbytes;
174     tmp_buf += ALIGN(tmp_buf, long);
175     for (i = 0; i < tnfa->num_states; i++)
176       {
177 	reach[i].tags = (void *)tmp_buf;
178 	tmp_buf += xbytes;
179 	reach_next[i].tags = (void *)tmp_buf;
180 	tmp_buf += xbytes;
181       }
182   }
183 
184   for (i = 0; i < tnfa->num_states; i++)
185     reach_pos[i].pos = -1;
186 
187   /* If only one character can start a match, find it first. */
188   if (tnfa->first_char >= 0 && type == STR_BYTE && str_byte)
189     {
190       const char *orig_str = str_byte;
191       int first = tnfa->first_char;
192 
193       if (len >= 0)
194 	str_byte = memchr(orig_str, first, (size_t)len);
195       else
196 	str_byte = strchr(orig_str, first);
197       if (str_byte == NULL)
198 	{
199 #ifndef TRE_USE_ALLOCA
200 	  if (buf)
201 	    xfree(buf);
202 #endif /* !TRE_USE_ALLOCA */
203 	  return REG_NOMATCH;
204 	}
205       DPRINT(("skipped %lu chars\n", (unsigned long)(str_byte - orig_str)));
206       if (str_byte >= orig_str + 1)
207 	prev_c = (unsigned char)*(str_byte - 1);
208       next_c = (unsigned char)*str_byte;
209       pos = (int)(str_byte - orig_str);
210       if (len < 0 || pos < len)
211 	str_byte++;
212     }
213   else
214     {
215       GET_NEXT_WCHAR();
216       pos = 0;
217     }
218 
219 #if 0
220   /* Skip over characters that cannot possibly be the first character
221      of a match. */
222   if (tnfa->firstpos_chars != NULL)
223     {
224       char *chars = tnfa->firstpos_chars;
225 
226       if (len < 0)
227 	{
228 	  const char *orig_str = str_byte;
229 	  /* XXX - use strpbrk() and wcspbrk() because they might be
230 	     optimized for the target architecture.  Try also strcspn()
231 	     and wcscspn() and compare the speeds. */
232 	  while (next_c != L'\0' && !chars[next_c])
233 	    {
234 	      next_c = *str_byte++;
235 	    }
236 	  prev_c = *(str_byte - 2);
237 	  pos += str_byte - orig_str;
238 	  DPRINT(("skipped %d chars\n", str_byte - orig_str));
239 	}
240       else
241 	{
242 	  while (pos <= len && !chars[next_c])
243 	    {
244 	      prev_c = next_c;
245 	      next_c = (unsigned char)(*str_byte++);
246 	      pos++;
247 	    }
248 	}
249     }
250 #endif
251 
252   DPRINT(("length: %d\n", len));
253   DPRINT(("pos:chr/code | states and tags\n"));
254   DPRINT(("-------------+------------------------------------------------\n"));
255 
256   reach_next_i = reach_next;
257   while (/*CONSTCOND*/1)
258     {
259       /* If no match found yet, add the initial states to `reach_next'. */
260       if (match_eo < 0)
261 	{
262 	  DPRINT((" init >"));
263 	  trans_i = tnfa->initial;
264 	  while (trans_i->state != NULL)
265 	    {
266 	      if (reach_pos[trans_i->state_id].pos < pos)
267 		{
268 		  if (trans_i->assertions
269 		      && CHECK_ASSERTIONS(trans_i->assertions))
270 		    {
271 		      DPRINT(("assertion failed\n"));
272 		      trans_i++;
273 		      continue;
274 		    }
275 
276 		  DPRINT((" %p", (void *)trans_i->state));
277 		  reach_next_i->state = trans_i->state;
278 		  for (i = 0; i < num_tags; i++)
279 		    reach_next_i->tags[i] = -1;
280 		  tag_i = trans_i->tags;
281 		  if (tag_i)
282 		    while (*tag_i >= 0)
283 		      {
284 			if ((size_t)*tag_i < num_tags)
285 			  reach_next_i->tags[*tag_i] = pos;
286 			tag_i++;
287 		      }
288 		  if (reach_next_i->state == tnfa->final)
289 		    {
290 		      DPRINT(("	 found empty match\n"));
291 		      match_eo = pos;
292 		      new_match = 1;
293 		      for (i = 0; i < num_tags; i++)
294 			match_tags[i] = reach_next_i->tags[i];
295 		    }
296 		  reach_pos[trans_i->state_id].pos = pos;
297 		  reach_pos[trans_i->state_id].tags = &reach_next_i->tags;
298 		  reach_next_i++;
299 		}
300 	      trans_i++;
301 	    }
302 	  DPRINT(("\n"));
303 	  reach_next_i->state = NULL;
304 	}
305       else
306 	{
307 	  if (num_tags == 0 || reach_next_i == reach_next)
308 	    /*�We have found a match. */
309 	    break;
310 	}
311 
312       /* Check for end of string. */
313       if (len < 0)
314 	{
315 	  if (type == STR_USER)
316 	    {
317 	      if (str_user_end)
318 		break;
319 	    }
320 	  else if (next_c == L'\0')
321 	    break;
322 	}
323       else
324 	{
325 	  if (pos >= len)
326 	    break;
327 	}
328 
329       GET_NEXT_WCHAR();
330 
331 #ifdef TRE_DEBUG
332       DPRINT(("%3d:%2lc/%05d |", pos - 1, (tre_cint_t)prev_c, (int)prev_c));
333       tre_print_reach(tnfa, reach_next, num_tags);
334       DPRINT(("%3d:%2lc/%05d |", pos, (tre_cint_t)next_c, (int)next_c));
335       tre_print_reach(tnfa, reach_next, num_tags);
336 #endif /* TRE_DEBUG */
337 
338       /* Swap `reach' and `reach_next'. */
339       reach_i = reach;
340       reach = reach_next;
341       reach_next = reach_i;
342 
343       /* For each state in `reach', weed out states that don't fulfill the
344 	 minimal matching conditions. */
345       if (tnfa->num_minimals && new_match)
346 	{
347 	  new_match = 0;
348 	  reach_next_i = reach_next;
349 	  for (reach_i = reach; reach_i->state; reach_i++)
350 	    {
351 	      int skip = 0;
352 	      for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2)
353 		{
354 		  int end = tnfa->minimal_tags[i];
355 		  int start = tnfa->minimal_tags[i + 1];
356 		  DPRINT(("  Minimal start %d, end %d\n", start, end));
357 		  if ((size_t)end >= num_tags)
358 		    {
359 		      DPRINT(("	 Throwing %p out.\n", reach_i->state));
360 		      skip = 1;
361 		      break;
362 		    }
363 		  else if (reach_i->tags[start] == match_tags[start]
364 			   && reach_i->tags[end] < match_tags[end])
365 		    {
366 		      DPRINT(("	 Throwing %p out because t%d < %d\n",
367 			      reach_i->state, end, match_tags[end]));
368 		      skip = 1;
369 		      break;
370 		    }
371 		}
372 	      if (!skip)
373 		{
374 		  reach_next_i->state = reach_i->state;
375 		  tmp_iptr = reach_next_i->tags;
376 		  reach_next_i->tags = reach_i->tags;
377 		  reach_i->tags = tmp_iptr;
378 		  reach_next_i++;
379 		}
380 	    }
381 	  reach_next_i->state = NULL;
382 
383 	  /* Swap `reach' and `reach_next'. */
384 	  reach_i = reach;
385 	  reach = reach_next;
386 	  reach_next = reach_i;
387 	}
388 
389       /* For each state in `reach' see if there is a transition leaving with
390 	 the current input symbol to a state not yet in `reach_next', and
391 	 add the destination states to `reach_next'. */
392       reach_next_i = reach_next;
393       for (reach_i = reach; reach_i->state; reach_i++)
394 	{
395 	  for (trans_i = reach_i->state; trans_i->state; trans_i++)
396 	    {
397 	      /* Does this transition match the input symbol? */
398 	      if (trans_i->code_min <= (tre_cint_t)prev_c &&
399 		  trans_i->code_max >= (tre_cint_t)prev_c)
400 		{
401 		  if (trans_i->assertions
402 		      && (CHECK_ASSERTIONS(trans_i->assertions)
403 			  || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)))
404 		    {
405 		      DPRINT(("assertion failed\n"));
406 		      continue;
407 		    }
408 
409 		  /* Compute the tags after this transition. */
410 		  for (i = 0; i < num_tags; i++)
411 		    tmp_tags[i] = reach_i->tags[i];
412 		  tag_i = trans_i->tags;
413 		  if (tag_i != NULL)
414 		    while (*tag_i >= 0)
415 		      {
416 			if ((size_t)*tag_i < num_tags)
417 			  tmp_tags[*tag_i] = pos;
418 			tag_i++;
419 		      }
420 
421 		  if (reach_pos[trans_i->state_id].pos < pos)
422 		    {
423 		      /* Found an unvisited node. */
424 		      reach_next_i->state = trans_i->state;
425 		      tmp_iptr = reach_next_i->tags;
426 		      reach_next_i->tags = tmp_tags;
427 		      tmp_tags = tmp_iptr;
428 		      reach_pos[trans_i->state_id].pos = pos;
429 		      reach_pos[trans_i->state_id].tags = &reach_next_i->tags;
430 
431 		      if (reach_next_i->state == tnfa->final
432 			  && (match_eo == -1
433 			      || (num_tags > 0
434 				  && reach_next_i->tags[0] <= match_tags[0])))
435 			{
436 			  DPRINT(("  found match %p\n", trans_i->state));
437 			  match_eo = pos;
438 			  new_match = 1;
439 			  for (i = 0; i < num_tags; i++)
440 			    match_tags[i] = reach_next_i->tags[i];
441 			}
442 		      reach_next_i++;
443 
444 		    }
445 		  else
446 		    {
447 		      assert(reach_pos[trans_i->state_id].pos == pos);
448 		      /* Another path has also reached this state.  We choose
449 			 the winner by examining the tag values for both
450 			 paths. */
451 		      if (tre_tag_order(num_tags, tnfa->tag_directions,
452 					tmp_tags,
453 					*reach_pos[trans_i->state_id].tags))
454 			{
455 			  /* The new path wins. */
456 			  tmp_iptr = *reach_pos[trans_i->state_id].tags;
457 			  *reach_pos[trans_i->state_id].tags = tmp_tags;
458 			  if (trans_i->state == tnfa->final)
459 			    {
460 			      DPRINT(("	 found better match\n"));
461 			      match_eo = pos;
462 			      new_match = 1;
463 			      for (i = 0; i < num_tags; i++)
464 				match_tags[i] = tmp_tags[i];
465 			    }
466 			  tmp_tags = tmp_iptr;
467 			}
468 		    }
469 		}
470 	    }
471 	}
472       reach_next_i->state = NULL;
473     }
474 
475   DPRINT(("match end offset = %d\n", match_eo));
476 
477 #ifndef TRE_USE_ALLOCA
478   if (buf)
479     xfree(buf);
480 #endif /* !TRE_USE_ALLOCA */
481 
482   *match_end_ofs = match_eo;
483   return match_eo >= 0 ? REG_OK : REG_NOMATCH;
484 }
485 
486 /* EOF */
487