xref: /netbsd-src/external/bsd/tre/dist/lib/tre-match-parallel.c (revision b1c86f5f087524e68db12794ee9c3e3da1ab17a0)
1 /*
2   tre-match-parallel.c - TRE parallel regex matching engine
3 
4   This software is released under a BSD-style license.
5   See the file LICENSE for details and copyright.
6 
7 */
8 
9 /*
10   This algorithm searches for matches basically by reading characters
11   in the searched string one by one, starting at the beginning.	 All
12   matching paths in the TNFA are traversed in parallel.	 When two or
13   more paths reach the same state, exactly one is chosen according to
14   tag ordering rules; if returning submatches is not required it does
15   not matter which path is chosen.
16 
17   The worst case time required for finding the leftmost and longest
18   match, or determining that there is no match, is always linearly
19   dependent on the length of the text being searched.
20 
21   This algorithm cannot handle TNFAs with back referencing nodes.
22   See `tre-match-backtrack.c'.
23 */
24 
25 
26 #ifdef HAVE_CONFIG_H
27 #include <config.h>
28 #endif /* HAVE_CONFIG_H */
29 
30 #ifdef TRE_USE_ALLOCA
31 /* AIX requires this to be the first thing in the file.	 */
32 #ifndef __GNUC__
33 # if HAVE_ALLOCA_H
34 #  include <alloca.h>
35 # else
36 #  ifdef _AIX
37  #pragma alloca
38 #  else
39 #   ifndef alloca /* predefined by HP cc +Olibcalls */
40 char *alloca ();
41 #   endif
42 #  endif
43 # endif
44 #endif
45 #endif /* TRE_USE_ALLOCA */
46 
47 #include <assert.h>
48 #include <stdlib.h>
49 #include <string.h>
50 #ifdef HAVE_WCHAR_H
51 #include <wchar.h>
52 #endif /* HAVE_WCHAR_H */
53 #ifdef HAVE_WCTYPE_H
54 #include <wctype.h>
55 #endif /* HAVE_WCTYPE_H */
56 #ifndef TRE_WCHAR
57 #include <ctype.h>
58 #endif /* !TRE_WCHAR */
59 #ifdef HAVE_MALLOC_H
60 #include <malloc.h>
61 #endif /* HAVE_MALLOC_H */
62 
63 #include "tre-internal.h"
64 #include "tre-match-utils.h"
65 #include "tre.h"
66 #include "xmalloc.h"
67 
68 
69 
70 typedef struct {
71   tre_tnfa_transition_t *state;
72   int *tags;
73 } tre_tnfa_reach_t;
74 
75 typedef struct {
76   int pos;
77   int **tags;
78 } tre_reach_pos_t;
79 
80 
81 #ifdef TRE_DEBUG
82 static void
83 tre_print_reach(const tre_tnfa_t *tnfa, tre_tnfa_reach_t *reach, int num_tags)
84 {
85   int i;
86 
87   while (reach->state != NULL)
88     {
89       DPRINT((" %p", (void *)reach->state));
90       if (num_tags > 0)
91 	{
92 	  DPRINT(("/"));
93 	  for (i = 0; i < num_tags; i++)
94 	    {
95 	      DPRINT(("%d:%d", i, reach->tags[i]));
96 	      if (i < (num_tags-1))
97 		DPRINT((","));
98 	    }
99 	}
100       reach++;
101     }
102   DPRINT(("\n"));
103 
104 }
105 #endif /* TRE_DEBUG */
106 
107 reg_errcode_t
108 tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len,
109 		      tre_str_type_t type, int *match_tags, int eflags,
110 		      int *match_end_ofs)
111 {
112   /* State variables required by GET_NEXT_WCHAR. */
113   tre_char_t prev_c = 0, next_c = 0;
114   const char *str_byte = string;
115   int pos = -1;
116   unsigned int pos_add_next = 1;
117 #ifdef TRE_WCHAR
118   const wchar_t *str_wide = string;
119 #ifdef TRE_MBSTATE
120   mbstate_t mbstate;
121 #endif /* TRE_MBSTATE */
122 #endif /* TRE_WCHAR */
123   int reg_notbol = eflags & REG_NOTBOL;
124   int reg_noteol = eflags & REG_NOTEOL;
125   int reg_newline = tnfa->cflags & REG_NEWLINE;
126   int str_user_end = 0;
127 
128   char *buf;
129   tre_tnfa_transition_t *trans_i;
130   tre_tnfa_reach_t *reach, *reach_next, *reach_i, *reach_next_i;
131   tre_reach_pos_t *reach_pos;
132   int *tag_i;
133   int num_tags, i;
134 
135   int match_eo = -1;	   /* end offset of match (-1 if no match found yet) */
136   int new_match = 0;
137   int *tmp_tags = NULL;
138   int *tmp_iptr;
139 
140 #ifdef TRE_MBSTATE
141   memset(&mbstate, '\0', sizeof(mbstate));
142 #endif /* TRE_MBSTATE */
143 
144   DPRINT(("tre_tnfa_run_parallel, input type %d\n", type));
145 
146   if (!match_tags)
147     num_tags = 0;
148   else
149     num_tags = tnfa->num_tags;
150 
151   /* Allocate memory for temporary data required for matching.	This needs to
152      be done for every matching operation to be thread safe.  This allocates
153      everything in a single large block from the stack frame using alloca()
154      or with malloc() if alloca is unavailable. */
155   {
156     int tbytes, rbytes, pbytes, xbytes, total_bytes;
157     char *tmp_buf;
158     /* Compute the length of the block we need. */
159     tbytes = sizeof(*tmp_tags) * num_tags;
160     rbytes = sizeof(*reach_next) * (tnfa->num_states + 1);
161     pbytes = sizeof(*reach_pos) * tnfa->num_states;
162     xbytes = sizeof(int) * num_tags;
163     total_bytes =
164       (sizeof(long) - 1) * 4 /* for alignment paddings */
165       + (rbytes + xbytes * tnfa->num_states) * 2 + tbytes + pbytes;
166 
167     /* Allocate the memory. */
168 #ifdef TRE_USE_ALLOCA
169     buf = alloca(total_bytes);
170 #else /* !TRE_USE_ALLOCA */
171     buf = xmalloc((unsigned)total_bytes);
172 #endif /* !TRE_USE_ALLOCA */
173     if (buf == NULL)
174       return REG_ESPACE;
175     memset(buf, 0, (size_t)total_bytes);
176 
177     /* Get the various pointers within tmp_buf (properly aligned). */
178     tmp_tags = (void *)buf;
179     tmp_buf = buf + tbytes;
180     tmp_buf += ALIGN(tmp_buf, long);
181     reach_next = (void *)tmp_buf;
182     tmp_buf += rbytes;
183     tmp_buf += ALIGN(tmp_buf, long);
184     reach = (void *)tmp_buf;
185     tmp_buf += rbytes;
186     tmp_buf += ALIGN(tmp_buf, long);
187     reach_pos = (void *)tmp_buf;
188     tmp_buf += pbytes;
189     tmp_buf += ALIGN(tmp_buf, long);
190     for (i = 0; i < tnfa->num_states; i++)
191       {
192 	reach[i].tags = (void *)tmp_buf;
193 	tmp_buf += xbytes;
194 	reach_next[i].tags = (void *)tmp_buf;
195 	tmp_buf += xbytes;
196       }
197   }
198 
199   for (i = 0; i < tnfa->num_states; i++)
200     reach_pos[i].pos = -1;
201 
202   /* If only one character can start a match, find it first. */
203   if (tnfa->first_char >= 0 && type == STR_BYTE && str_byte)
204     {
205       const char *orig_str = str_byte;
206       int first = tnfa->first_char;
207 
208       if (len >= 0)
209 	str_byte = memchr(orig_str, first, (size_t)len);
210       else
211 	str_byte = strchr(orig_str, first);
212       if (str_byte == NULL)
213 	{
214 #ifndef TRE_USE_ALLOCA
215 	  if (buf)
216 	    xfree(buf);
217 #endif /* !TRE_USE_ALLOCA */
218 	  return REG_NOMATCH;
219 	}
220       DPRINT(("skipped %lu chars\n", (unsigned long)(str_byte - orig_str)));
221       if (str_byte >= orig_str + 1)
222 	prev_c = (unsigned char)*(str_byte - 1);
223       next_c = (unsigned char)*str_byte;
224       pos = str_byte - orig_str;
225       if (len < 0 || pos < len)
226 	str_byte++;
227     }
228   else
229     {
230       GET_NEXT_WCHAR();
231       pos = 0;
232     }
233 
234 #if 0
235   /* Skip over characters that cannot possibly be the first character
236      of a match. */
237   if (tnfa->firstpos_chars != NULL)
238     {
239       char *chars = tnfa->firstpos_chars;
240 
241       if (len < 0)
242 	{
243 	  const char *orig_str = str_byte;
244 	  /* XXX - use strpbrk() and wcspbrk() because they might be
245 	     optimized for the target architecture.  Try also strcspn()
246 	     and wcscspn() and compare the speeds. */
247 	  while (next_c != L'\0' && !chars[next_c])
248 	    {
249 	      next_c = *str_byte++;
250 	    }
251 	  prev_c = *(str_byte - 2);
252 	  pos += str_byte - orig_str;
253 	  DPRINT(("skipped %d chars\n", str_byte - orig_str));
254 	}
255       else
256 	{
257 	  while (pos <= len && !chars[next_c])
258 	    {
259 	      prev_c = next_c;
260 	      next_c = (unsigned char)(*str_byte++);
261 	      pos++;
262 	    }
263 	}
264     }
265 #endif
266 
267   DPRINT(("length: %d\n", len));
268   DPRINT(("pos:chr/code | states and tags\n"));
269   DPRINT(("-------------+------------------------------------------------\n"));
270 
271   reach_next_i = reach_next;
272   while (/*CONSTCOND*/1)
273     {
274       /* If no match found yet, add the initial states to `reach_next'. */
275       if (match_eo < 0)
276 	{
277 	  DPRINT((" init >"));
278 	  trans_i = tnfa->initial;
279 	  while (trans_i->state != NULL)
280 	    {
281 	      if (reach_pos[trans_i->state_id].pos < pos)
282 		{
283 		  if (trans_i->assertions
284 		      && CHECK_ASSERTIONS(trans_i->assertions))
285 		    {
286 		      DPRINT(("assertion failed\n"));
287 		      trans_i++;
288 		      continue;
289 		    }
290 
291 		  DPRINT((" %p", (void *)trans_i->state));
292 		  reach_next_i->state = trans_i->state;
293 		  for (i = 0; i < num_tags; i++)
294 		    reach_next_i->tags[i] = -1;
295 		  tag_i = trans_i->tags;
296 		  if (tag_i)
297 		    while (*tag_i >= 0)
298 		      {
299 			if (*tag_i < num_tags)
300 			  reach_next_i->tags[*tag_i] = pos;
301 			tag_i++;
302 		      }
303 		  if (reach_next_i->state == tnfa->final)
304 		    {
305 		      DPRINT(("	 found empty match\n"));
306 		      match_eo = pos;
307 		      new_match = 1;
308 		      for (i = 0; i < num_tags; i++)
309 			match_tags[i] = reach_next_i->tags[i];
310 		    }
311 		  reach_pos[trans_i->state_id].pos = pos;
312 		  reach_pos[trans_i->state_id].tags = &reach_next_i->tags;
313 		  reach_next_i++;
314 		}
315 	      trans_i++;
316 	    }
317 	  DPRINT(("\n"));
318 	  reach_next_i->state = NULL;
319 	}
320       else
321 	{
322 	  if (num_tags == 0 || reach_next_i == reach_next)
323 	    /*�We have found a match. */
324 	    break;
325 	}
326 
327       /* Check for end of string. */
328       if (len < 0)
329 	{
330 	  if (type == STR_USER)
331 	    {
332 	      if (str_user_end)
333 		break;
334 	    }
335 	  else if (next_c == L'\0')
336 	    break;
337 	}
338       else
339 	{
340 	  if (pos >= len)
341 	    break;
342 	}
343 
344       GET_NEXT_WCHAR();
345 
346 #ifdef TRE_DEBUG
347       DPRINT(("%3d:%2lc/%05d |", pos - 1, (tre_cint_t)prev_c, (int)prev_c));
348       tre_print_reach(tnfa, reach_next, num_tags);
349       DPRINT(("%3d:%2lc/%05d |", pos, (tre_cint_t)next_c, (int)next_c));
350       tre_print_reach(tnfa, reach_next, num_tags);
351 #endif /* TRE_DEBUG */
352 
353       /* Swap `reach' and `reach_next'. */
354       reach_i = reach;
355       reach = reach_next;
356       reach_next = reach_i;
357 
358       /* For each state in `reach', weed out states that don't fulfill the
359 	 minimal matching conditions. */
360       if (tnfa->num_minimals && new_match)
361 	{
362 	  new_match = 0;
363 	  reach_next_i = reach_next;
364 	  for (reach_i = reach; reach_i->state; reach_i++)
365 	    {
366 	      int skip = 0;
367 	      for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2)
368 		{
369 		  int end = tnfa->minimal_tags[i];
370 		  int start = tnfa->minimal_tags[i + 1];
371 		  DPRINT(("  Minimal start %d, end %d\n", start, end));
372 		  if (end >= num_tags)
373 		    {
374 		      DPRINT(("	 Throwing %p out.\n", reach_i->state));
375 		      skip = 1;
376 		      break;
377 		    }
378 		  else if (reach_i->tags[start] == match_tags[start]
379 			   && reach_i->tags[end] < match_tags[end])
380 		    {
381 		      DPRINT(("	 Throwing %p out because t%d < %d\n",
382 			      reach_i->state, end, match_tags[end]));
383 		      skip = 1;
384 		      break;
385 		    }
386 		}
387 	      if (!skip)
388 		{
389 		  reach_next_i->state = reach_i->state;
390 		  tmp_iptr = reach_next_i->tags;
391 		  reach_next_i->tags = reach_i->tags;
392 		  reach_i->tags = tmp_iptr;
393 		  reach_next_i++;
394 		}
395 	    }
396 	  reach_next_i->state = NULL;
397 
398 	  /* Swap `reach' and `reach_next'. */
399 	  reach_i = reach;
400 	  reach = reach_next;
401 	  reach_next = reach_i;
402 	}
403 
404       /* For each state in `reach' see if there is a transition leaving with
405 	 the current input symbol to a state not yet in `reach_next', and
406 	 add the destination states to `reach_next'. */
407       reach_next_i = reach_next;
408       for (reach_i = reach; reach_i->state; reach_i++)
409 	{
410 	  for (trans_i = reach_i->state; trans_i->state; trans_i++)
411 	    {
412 	      /* Does this transition match the input symbol? */
413 	      if (trans_i->code_min <= (tre_cint_t)prev_c &&
414 		  trans_i->code_max >= (tre_cint_t)prev_c)
415 		{
416 		  if (trans_i->assertions
417 		      && (CHECK_ASSERTIONS(trans_i->assertions)
418 			  || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)))
419 		    {
420 		      DPRINT(("assertion failed\n"));
421 		      continue;
422 		    }
423 
424 		  /* Compute the tags after this transition. */
425 		  for (i = 0; i < num_tags; i++)
426 		    tmp_tags[i] = reach_i->tags[i];
427 		  tag_i = trans_i->tags;
428 		  if (tag_i != NULL)
429 		    while (*tag_i >= 0)
430 		      {
431 			if (*tag_i < num_tags)
432 			  tmp_tags[*tag_i] = pos;
433 			tag_i++;
434 		      }
435 
436 		  if (reach_pos[trans_i->state_id].pos < pos)
437 		    {
438 		      /* Found an unvisited node. */
439 		      reach_next_i->state = trans_i->state;
440 		      tmp_iptr = reach_next_i->tags;
441 		      reach_next_i->tags = tmp_tags;
442 		      tmp_tags = tmp_iptr;
443 		      reach_pos[trans_i->state_id].pos = pos;
444 		      reach_pos[trans_i->state_id].tags = &reach_next_i->tags;
445 
446 		      if (reach_next_i->state == tnfa->final
447 			  && (match_eo == -1
448 			      || (num_tags > 0
449 				  && reach_next_i->tags[0] <= match_tags[0])))
450 			{
451 			  DPRINT(("  found match %p\n", trans_i->state));
452 			  match_eo = pos;
453 			  new_match = 1;
454 			  for (i = 0; i < num_tags; i++)
455 			    match_tags[i] = reach_next_i->tags[i];
456 			}
457 		      reach_next_i++;
458 
459 		    }
460 		  else
461 		    {
462 		      assert(reach_pos[trans_i->state_id].pos == pos);
463 		      /* Another path has also reached this state.  We choose
464 			 the winner by examining the tag values for both
465 			 paths. */
466 		      if (tre_tag_order(num_tags, tnfa->tag_directions,
467 					tmp_tags,
468 					*reach_pos[trans_i->state_id].tags))
469 			{
470 			  /* The new path wins. */
471 			  tmp_iptr = *reach_pos[trans_i->state_id].tags;
472 			  *reach_pos[trans_i->state_id].tags = tmp_tags;
473 			  if (trans_i->state == tnfa->final)
474 			    {
475 			      DPRINT(("	 found better match\n"));
476 			      match_eo = pos;
477 			      new_match = 1;
478 			      for (i = 0; i < num_tags; i++)
479 				match_tags[i] = tmp_tags[i];
480 			    }
481 			  tmp_tags = tmp_iptr;
482 			}
483 		    }
484 		}
485 	    }
486 	}
487       reach_next_i->state = NULL;
488     }
489 
490   DPRINT(("match end offset = %d\n", match_eo));
491 
492 #ifndef TRE_USE_ALLOCA
493   if (buf)
494     xfree(buf);
495 #endif /* !TRE_USE_ALLOCA */
496 
497   *match_end_ofs = match_eo;
498   return match_eo >= 0 ? REG_OK : REG_NOMATCH;
499 }
500 
501 /* EOF */
502