xref: /netbsd-src/external/bsd/tre/dist/lib/tre-match-backtrack.c (revision bc436ad3c55520f33d00f361e7d3f9150b998394)
163d4abf0Sagc /*
263d4abf0Sagc   tre-match-backtrack.c - TRE backtracking regex matching engine
363d4abf0Sagc 
463d4abf0Sagc   This software is released under a BSD-style license.
563d4abf0Sagc   See the file LICENSE for details and copyright.
663d4abf0Sagc 
763d4abf0Sagc */
863d4abf0Sagc 
963d4abf0Sagc /*
1063d4abf0Sagc   This matcher is for regexps that use back referencing.  Regexp matching
1163d4abf0Sagc   with back referencing is an NP-complete problem on the number of back
1263d4abf0Sagc   references.  The easiest way to match them is to use a backtracking
1363d4abf0Sagc   routine which basically goes through all possible paths in the TNFA
1463d4abf0Sagc   and chooses the one which results in the best (leftmost and longest)
1563d4abf0Sagc   match.  This can be spectacularly expensive and may run out of stack
1663d4abf0Sagc   space, but there really is no better known generic algorithm.	 Quoting
1763d4abf0Sagc   Henry Spencer from comp.compilers:
1863d4abf0Sagc   <URL: http://compilers.iecc.com/comparch/article/93-03-102>
1963d4abf0Sagc 
2063d4abf0Sagc     POSIX.2 REs require longest match, which is really exciting to
2163d4abf0Sagc     implement since the obsolete ("basic") variant also includes
2263d4abf0Sagc     \<digit>.  I haven't found a better way of tackling this than doing
2363d4abf0Sagc     a preliminary match using a DFA (or simulation) on a modified RE
2463d4abf0Sagc     that just replicates subREs for \<digit>, and then doing a
2563d4abf0Sagc     backtracking match to determine whether the subRE matches were
2663d4abf0Sagc     right.  This can be rather slow, but I console myself with the
2763d4abf0Sagc     thought that people who use \<digit> deserve very slow execution.
2863d4abf0Sagc     (Pun unintentional but very appropriate.)
2963d4abf0Sagc 
3063d4abf0Sagc */
3163d4abf0Sagc 
3263d4abf0Sagc 
3363d4abf0Sagc #ifdef HAVE_CONFIG_H
3463d4abf0Sagc #include <config.h>
3563d4abf0Sagc #endif /* HAVE_CONFIG_H */
3663d4abf0Sagc 
37f2a3d147Schristos #include "tre-alloca.h"
3863d4abf0Sagc 
3963d4abf0Sagc #include <assert.h>
4063d4abf0Sagc #include <stdlib.h>
4163d4abf0Sagc #include <string.h>
4263d4abf0Sagc #ifdef HAVE_WCHAR_H
4363d4abf0Sagc #include <wchar.h>
4463d4abf0Sagc #endif /* HAVE_WCHAR_H */
4563d4abf0Sagc #ifdef HAVE_WCTYPE_H
4663d4abf0Sagc #include <wctype.h>
4763d4abf0Sagc #endif /* HAVE_WCTYPE_H */
4863d4abf0Sagc #ifndef TRE_WCHAR
4963d4abf0Sagc #include <ctype.h>
5063d4abf0Sagc #endif /* !TRE_WCHAR */
5163d4abf0Sagc #ifdef HAVE_MALLOC_H
5263d4abf0Sagc #include <malloc.h>
5363d4abf0Sagc #endif /* HAVE_MALLOC_H */
5463d4abf0Sagc 
5563d4abf0Sagc #include "tre-internal.h"
5663d4abf0Sagc #include "tre-mem.h"
5763d4abf0Sagc #include "tre-match-utils.h"
5863d4abf0Sagc #include "tre.h"
5963d4abf0Sagc #include "xmalloc.h"
6063d4abf0Sagc 
6163d4abf0Sagc typedef struct {
6263d4abf0Sagc   int pos;
6363d4abf0Sagc   const char *str_byte;
6463d4abf0Sagc #ifdef TRE_WCHAR
6563d4abf0Sagc   const wchar_t *str_wide;
6663d4abf0Sagc #endif /* TRE_WCHAR */
6763d4abf0Sagc   tre_tnfa_transition_t *state;
6863d4abf0Sagc   int state_id;
6963d4abf0Sagc   int next_c;
7063d4abf0Sagc   int *tags;
7163d4abf0Sagc #ifdef TRE_MBSTATE
7263d4abf0Sagc   mbstate_t mbstate;
7363d4abf0Sagc #endif /* TRE_MBSTATE */
7463d4abf0Sagc } tre_backtrack_item_t;
7563d4abf0Sagc 
7663d4abf0Sagc typedef struct tre_backtrack_struct {
7763d4abf0Sagc   tre_backtrack_item_t item;
7863d4abf0Sagc   struct tre_backtrack_struct *prev;
7963d4abf0Sagc   struct tre_backtrack_struct *next;
8063d4abf0Sagc } *tre_backtrack_t;
8163d4abf0Sagc 
8263d4abf0Sagc #ifdef TRE_WCHAR
8363d4abf0Sagc #define BT_STACK_WIDE_IN(_str_wide)	stack->item.str_wide = (_str_wide)
8463d4abf0Sagc #define BT_STACK_WIDE_OUT		(str_wide) = stack->item.str_wide
8563d4abf0Sagc #else /* !TRE_WCHAR */
8663d4abf0Sagc #define BT_STACK_WIDE_IN(_str_wide)
8763d4abf0Sagc #define BT_STACK_WIDE_OUT
8863d4abf0Sagc #endif /* !TRE_WCHAR */
8963d4abf0Sagc 
9063d4abf0Sagc #ifdef TRE_MBSTATE
9163d4abf0Sagc #define BT_STACK_MBSTATE_IN  stack->item.mbstate = (mbstate)
9263d4abf0Sagc #define BT_STACK_MBSTATE_OUT (mbstate) = stack->item.mbstate
9363d4abf0Sagc #else /* !TRE_MBSTATE */
9463d4abf0Sagc #define BT_STACK_MBSTATE_IN
9563d4abf0Sagc #define BT_STACK_MBSTATE_OUT
9663d4abf0Sagc #endif /* !TRE_MBSTATE */
9763d4abf0Sagc 
9863d4abf0Sagc 
9963d4abf0Sagc #ifdef TRE_USE_ALLOCA
10063d4abf0Sagc #define tre_bt_mem_new		  tre_mem_newa
10163d4abf0Sagc #define tre_bt_mem_alloc	  tre_mem_alloca
10213498f30Srin #define tre_bt_mem_destroy(obj)	  do { } while (/*CONSTCOND*/(void)0,0)
10363d4abf0Sagc #else /* !TRE_USE_ALLOCA */
10463d4abf0Sagc #define tre_bt_mem_new		  tre_mem_new
10563d4abf0Sagc #define tre_bt_mem_alloc	  tre_mem_alloc
10663d4abf0Sagc #define tre_bt_mem_destroy	  tre_mem_destroy
10763d4abf0Sagc #endif /* !TRE_USE_ALLOCA */
10863d4abf0Sagc 
10963d4abf0Sagc 
11063d4abf0Sagc #define BT_STACK_PUSH(_pos, _str_byte, _str_wide, _state, _state_id, _next_c, _tags, _mbstate) \
11163d4abf0Sagc   do									      \
11263d4abf0Sagc     {									      \
113f2a3d147Schristos       size_t i;								      \
11463d4abf0Sagc       if (!stack->next)							      \
11563d4abf0Sagc 	{								      \
11663d4abf0Sagc 	  tre_backtrack_t s;						      \
11763d4abf0Sagc 	  s = tre_bt_mem_alloc(mem, sizeof(*s));			      \
11863d4abf0Sagc 	  if (!s)							      \
11963d4abf0Sagc 	    {								      \
12063d4abf0Sagc 	      tre_bt_mem_destroy(mem);					      \
12163d4abf0Sagc 	      if (tags)							      \
12263d4abf0Sagc 		xfree(tags);						      \
12363d4abf0Sagc 	      if (pmatch)						      \
12463d4abf0Sagc 		xfree(pmatch);						      \
12563d4abf0Sagc 	      if (states_seen)						      \
12663d4abf0Sagc 		xfree(states_seen);					      \
12763d4abf0Sagc 	      return REG_ESPACE;					      \
12863d4abf0Sagc 	    }								      \
12963d4abf0Sagc 	  s->prev = stack;						      \
13063d4abf0Sagc 	  s->next = NULL;						      \
13163d4abf0Sagc 	  s->item.tags = tre_bt_mem_alloc(mem,				      \
13263d4abf0Sagc 					  sizeof(*tags) * tnfa->num_tags);    \
13363d4abf0Sagc 	  if (!s->item.tags)						      \
13463d4abf0Sagc 	    {								      \
13563d4abf0Sagc 	      tre_bt_mem_destroy(mem);					      \
13663d4abf0Sagc 	      if (tags)							      \
13763d4abf0Sagc 		xfree(tags);						      \
13863d4abf0Sagc 	      if (pmatch)						      \
13963d4abf0Sagc 		xfree(pmatch);						      \
14063d4abf0Sagc 	      if (states_seen)						      \
14163d4abf0Sagc 		xfree(states_seen);					      \
14263d4abf0Sagc 	      return REG_ESPACE;					      \
14363d4abf0Sagc 	    }								      \
14463d4abf0Sagc 	  stack->next = s;						      \
14563d4abf0Sagc 	  stack = s;							      \
14663d4abf0Sagc 	}								      \
14763d4abf0Sagc       else								      \
14863d4abf0Sagc 	stack = stack->next;						      \
14963d4abf0Sagc       stack->item.pos = (_pos);						      \
15063d4abf0Sagc       stack->item.str_byte = (_str_byte);				      \
15163d4abf0Sagc       BT_STACK_WIDE_IN(_str_wide);					      \
15263d4abf0Sagc       stack->item.state = (_state);					      \
15363d4abf0Sagc       stack->item.state_id = (_state_id);				      \
15463d4abf0Sagc       stack->item.next_c = (_next_c);					      \
15563d4abf0Sagc       for (i = 0; i < tnfa->num_tags; i++)				      \
15663d4abf0Sagc 	stack->item.tags[i] = (_tags)[i];				      \
15763d4abf0Sagc       BT_STACK_MBSTATE_IN;						      \
15863d4abf0Sagc     }									      \
15913498f30Srin   while (/*CONSTCOND*/(void)0,0)
16063d4abf0Sagc 
16163d4abf0Sagc #define BT_STACK_POP()							      \
16263d4abf0Sagc   do									      \
16363d4abf0Sagc     {									      \
164f2a3d147Schristos       size_t i;								      \
16563d4abf0Sagc       assert(stack->prev);						      \
16663d4abf0Sagc       pos = stack->item.pos;						      \
16763d4abf0Sagc       if (type == STR_USER)                                                   \
16863d4abf0Sagc         str_source->rewind(pos + pos_add_next, str_source->context);          \
16963d4abf0Sagc       str_byte = stack->item.str_byte;					      \
17063d4abf0Sagc       BT_STACK_WIDE_OUT;						      \
17163d4abf0Sagc       state = stack->item.state;					      \
17213498f30Srin       next_c = (tre_char_t) stack->item.next_c;					      \
17363d4abf0Sagc       for (i = 0; i < tnfa->num_tags; i++)				      \
17463d4abf0Sagc 	tags[i] = stack->item.tags[i];					      \
17563d4abf0Sagc       BT_STACK_MBSTATE_OUT;						      \
17663d4abf0Sagc       stack = stack->prev;						      \
17763d4abf0Sagc     }									      \
17813498f30Srin   while (/*CONSTCOND*/(void)0,0)
17963d4abf0Sagc 
18063d4abf0Sagc #undef MIN
18163d4abf0Sagc #define MIN(a, b) ((a) <= (b) ? (a) : (b))
18263d4abf0Sagc 
18363d4abf0Sagc reg_errcode_t
tre_tnfa_run_backtrack(const tre_tnfa_t * tnfa,const void * string,int len,tre_str_type_t type,int * match_tags,int eflags,int * match_end_ofs)18463d4abf0Sagc tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
18563d4abf0Sagc 		       int len, tre_str_type_t type, int *match_tags,
18663d4abf0Sagc 		       int eflags, int *match_end_ofs)
18763d4abf0Sagc {
18863d4abf0Sagc   /* State variables required by GET_NEXT_WCHAR. */
18963d4abf0Sagc   tre_char_t prev_c = 0, next_c = 0;
19063d4abf0Sagc   const char *str_byte = string;
19163d4abf0Sagc   int pos = 0;
19263d4abf0Sagc   unsigned int pos_add_next = 1;
19363d4abf0Sagc #ifdef TRE_WCHAR
19463d4abf0Sagc   const wchar_t *str_wide = string;
19563d4abf0Sagc #ifdef TRE_MBSTATE
19663d4abf0Sagc   mbstate_t mbstate;
19763d4abf0Sagc #endif /* TRE_MBSTATE */
19863d4abf0Sagc #endif /* TRE_WCHAR */
19963d4abf0Sagc   int reg_notbol = eflags & REG_NOTBOL;
20063d4abf0Sagc   int reg_noteol = eflags & REG_NOTEOL;
20163d4abf0Sagc   int reg_newline = tnfa->cflags & REG_NEWLINE;
202f2a3d147Schristos   size_t str_user_end = 0;
20363d4abf0Sagc 
20463d4abf0Sagc   /* These are used to remember the necessary values of the above
20563d4abf0Sagc      variables to return to the position where the current search
20663d4abf0Sagc      started from. */
20763d4abf0Sagc   int next_c_start;
20863d4abf0Sagc   const char *str_byte_start;
20963d4abf0Sagc   int pos_start = -1;
21063d4abf0Sagc #ifdef TRE_WCHAR
21163d4abf0Sagc   const wchar_t *str_wide_start;
21263d4abf0Sagc #endif /* TRE_WCHAR */
21363d4abf0Sagc #ifdef TRE_MBSTATE
21463d4abf0Sagc   mbstate_t mbstate_start;
21563d4abf0Sagc #endif /* TRE_MBSTATE */
21663d4abf0Sagc 
21763d4abf0Sagc   /* End offset of best match so far, or -1 if no match found yet. */
21863d4abf0Sagc   int match_eo = -1;
21963d4abf0Sagc   /* Tag arrays. */
22063d4abf0Sagc   int *next_tags, *tags = NULL;
22163d4abf0Sagc   /* Current TNFA state. */
22263d4abf0Sagc   tre_tnfa_transition_t *state;
22363d4abf0Sagc   int *states_seen = NULL;
22463d4abf0Sagc 
22563d4abf0Sagc   /* Memory allocator to for allocating the backtracking stack. */
22663d4abf0Sagc   tre_mem_t mem = tre_bt_mem_new();
22763d4abf0Sagc 
22863d4abf0Sagc   /* The backtracking stack. */
22963d4abf0Sagc   tre_backtrack_t stack;
23063d4abf0Sagc 
23163d4abf0Sagc   tre_tnfa_transition_t *trans_i;
23263d4abf0Sagc   regmatch_t *pmatch = NULL;
23399fb528eSrin   reg_errcode_t ret;
23463d4abf0Sagc 
23563d4abf0Sagc #ifdef TRE_MBSTATE
23663d4abf0Sagc   memset(&mbstate, '\0', sizeof(mbstate));
23763d4abf0Sagc #endif /* TRE_MBSTATE */
23863d4abf0Sagc 
23963d4abf0Sagc   if (!mem)
24063d4abf0Sagc     return REG_ESPACE;
24163d4abf0Sagc   stack = tre_bt_mem_alloc(mem, sizeof(*stack));
24263d4abf0Sagc   if (!stack)
24363d4abf0Sagc     {
24463d4abf0Sagc       ret = REG_ESPACE;
24563d4abf0Sagc       goto error_exit;
24663d4abf0Sagc     }
24763d4abf0Sagc   stack->prev = NULL;
24863d4abf0Sagc   stack->next = NULL;
24963d4abf0Sagc 
25063d4abf0Sagc   DPRINT(("tnfa_execute_backtrack, input type %d\n", type));
25163d4abf0Sagc   DPRINT(("len = %d\n", len));
25263d4abf0Sagc 
25363d4abf0Sagc #ifdef TRE_USE_ALLOCA
25463d4abf0Sagc   tags = alloca(sizeof(*tags) * tnfa->num_tags);
25563d4abf0Sagc   pmatch = alloca(sizeof(*pmatch) * tnfa->num_submatches);
25663d4abf0Sagc   states_seen = alloca(sizeof(*states_seen) * tnfa->num_states);
25763d4abf0Sagc #else /* !TRE_USE_ALLOCA */
25863d4abf0Sagc   if (tnfa->num_tags)
25963d4abf0Sagc     {
26063d4abf0Sagc       tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
26163d4abf0Sagc       if (!tags)
26263d4abf0Sagc 	{
26363d4abf0Sagc 	  ret = REG_ESPACE;
26463d4abf0Sagc 	  goto error_exit;
26563d4abf0Sagc 	}
26663d4abf0Sagc     }
26763d4abf0Sagc   if (tnfa->num_submatches)
26863d4abf0Sagc     {
26963d4abf0Sagc       pmatch = xmalloc(sizeof(*pmatch) * tnfa->num_submatches);
27063d4abf0Sagc       if (!pmatch)
27163d4abf0Sagc 	{
27263d4abf0Sagc 	  ret = REG_ESPACE;
27363d4abf0Sagc 	  goto error_exit;
27463d4abf0Sagc 	}
27563d4abf0Sagc     }
27663d4abf0Sagc   if (tnfa->num_states)
27763d4abf0Sagc     {
27863d4abf0Sagc       states_seen = xmalloc(sizeof(*states_seen) * tnfa->num_states);
27963d4abf0Sagc       if (!states_seen)
28063d4abf0Sagc 	{
28163d4abf0Sagc 	  ret = REG_ESPACE;
28263d4abf0Sagc 	  goto error_exit;
28363d4abf0Sagc 	}
28463d4abf0Sagc     }
28563d4abf0Sagc #endif /* !TRE_USE_ALLOCA */
28663d4abf0Sagc 
28763d4abf0Sagc  retry:
28863d4abf0Sagc   {
289f2a3d147Schristos     size_t i;
29063d4abf0Sagc     for (i = 0; i < tnfa->num_tags; i++)
29163d4abf0Sagc       {
29263d4abf0Sagc 	tags[i] = -1;
29363d4abf0Sagc 	if (match_tags)
29463d4abf0Sagc 	  match_tags[i] = -1;
29563d4abf0Sagc       }
29663d4abf0Sagc     for (i = 0; i < tnfa->num_states; i++)
29763d4abf0Sagc       states_seen[i] = 0;
29863d4abf0Sagc   }
29963d4abf0Sagc 
30063d4abf0Sagc   state = NULL;
30163d4abf0Sagc   pos = pos_start;
30263d4abf0Sagc   if (type == STR_USER)
30363d4abf0Sagc     str_source->rewind(pos + pos_add_next, str_source->context);
30463d4abf0Sagc   GET_NEXT_WCHAR();
30563d4abf0Sagc   pos_start = pos;
30663d4abf0Sagc   next_c_start = next_c;
30763d4abf0Sagc   str_byte_start = str_byte;
30863d4abf0Sagc #ifdef TRE_WCHAR
30963d4abf0Sagc   str_wide_start = str_wide;
31063d4abf0Sagc #endif /* TRE_WCHAR */
31163d4abf0Sagc #ifdef TRE_MBSTATE
31263d4abf0Sagc   mbstate_start = mbstate;
31363d4abf0Sagc #endif /* TRE_MBSTATE */
31463d4abf0Sagc 
31563d4abf0Sagc   /* Handle initial states. */
31663d4abf0Sagc   next_tags = NULL;
31763d4abf0Sagc   for (trans_i = tnfa->initial; trans_i->state; trans_i++)
31863d4abf0Sagc     {
31963d4abf0Sagc       DPRINT(("> init %p, prev_c %lc\n", trans_i->state, (tre_cint_t)prev_c));
32063d4abf0Sagc       if (trans_i->assertions && CHECK_ASSERTIONS(trans_i->assertions))
32163d4abf0Sagc 	{
32263d4abf0Sagc 	  DPRINT(("assert failed\n"));
32363d4abf0Sagc 	  continue;
32463d4abf0Sagc 	}
32563d4abf0Sagc       if (state == NULL)
32663d4abf0Sagc 	{
32763d4abf0Sagc 	  /* Start from this state. */
32863d4abf0Sagc 	  state = trans_i->state;
32963d4abf0Sagc 	  next_tags = trans_i->tags;
33063d4abf0Sagc 	}
33163d4abf0Sagc       else
33263d4abf0Sagc 	{
33363d4abf0Sagc 	  /* Backtrack to this state. */
33463d4abf0Sagc 	  DPRINT(("saving state %d for backtracking\n", trans_i->state_id));
33563d4abf0Sagc 	  BT_STACK_PUSH(pos, str_byte, str_wide, trans_i->state,
33663d4abf0Sagc 			trans_i->state_id, next_c, tags, mbstate);
33763d4abf0Sagc 	  {
33863d4abf0Sagc 	    int *tmp = trans_i->tags;
33963d4abf0Sagc 	    if (tmp)
34063d4abf0Sagc 	      while (*tmp >= 0)
34163d4abf0Sagc 		stack->item.tags[*tmp++] = pos;
34263d4abf0Sagc 	  }
34363d4abf0Sagc 	}
34463d4abf0Sagc     }
34563d4abf0Sagc 
34663d4abf0Sagc   if (next_tags)
34763d4abf0Sagc     for (; *next_tags >= 0; next_tags++)
34863d4abf0Sagc       tags[*next_tags] = pos;
34963d4abf0Sagc 
35063d4abf0Sagc 
35163d4abf0Sagc   DPRINT(("entering match loop, pos %d, str_byte %p\n", pos, str_byte));
35263d4abf0Sagc   DPRINT(("pos:chr/code | state and tags\n"));
35363d4abf0Sagc   DPRINT(("-------------+------------------------------------------------\n"));
35463d4abf0Sagc 
35563d4abf0Sagc   if (state == NULL)
35663d4abf0Sagc     goto backtrack;
35763d4abf0Sagc 
35813498f30Srin   while (/*CONSTCOND*/(void)1,1)
35963d4abf0Sagc     {
36063d4abf0Sagc       tre_tnfa_transition_t *next_state;
36163d4abf0Sagc       int empty_br_match;
36263d4abf0Sagc 
36363d4abf0Sagc       DPRINT(("start loop\n"));
36463d4abf0Sagc       if (state == tnfa->final)
36563d4abf0Sagc 	{
36663d4abf0Sagc 	  DPRINT(("  match found, %d %d\n", match_eo, pos));
36763d4abf0Sagc 	  if (match_eo < pos
36863d4abf0Sagc 	      || (match_eo == pos
36963d4abf0Sagc 		  && match_tags
37063d4abf0Sagc 		  && tre_tag_order(tnfa->num_tags, tnfa->tag_directions,
37163d4abf0Sagc 				   tags, match_tags)))
37263d4abf0Sagc 	    {
373f2a3d147Schristos 	      size_t i;
37463d4abf0Sagc 	      /* This match wins the previous match. */
37563d4abf0Sagc 	      DPRINT(("	 win previous\n"));
37663d4abf0Sagc 	      match_eo = pos;
37763d4abf0Sagc 	      if (match_tags)
37863d4abf0Sagc 		for (i = 0; i < tnfa->num_tags; i++)
37963d4abf0Sagc 		  match_tags[i] = tags[i];
38063d4abf0Sagc 	    }
38163d4abf0Sagc 	  /* Our TNFAs never have transitions leaving from the final state,
38263d4abf0Sagc 	     so we jump right to backtracking. */
38363d4abf0Sagc 	  goto backtrack;
38463d4abf0Sagc 	}
38563d4abf0Sagc 
38663d4abf0Sagc #ifdef TRE_DEBUG
38763d4abf0Sagc       DPRINT(("%3d:%2lc/%05d | %p ", pos, (tre_cint_t)next_c, (int)next_c,
38863d4abf0Sagc 	      state));
38963d4abf0Sagc       {
39063d4abf0Sagc 	int i;
39163d4abf0Sagc 	for (i = 0; i < tnfa->num_tags; i++)
39263d4abf0Sagc 	  DPRINT(("%d%s", tags[i], i < tnfa->num_tags - 1 ? ", " : ""));
39363d4abf0Sagc 	DPRINT(("\n"));
39463d4abf0Sagc       }
39563d4abf0Sagc #endif /* TRE_DEBUG */
39663d4abf0Sagc 
39763d4abf0Sagc       /* Go to the next character in the input string. */
39863d4abf0Sagc       empty_br_match = 0;
39963d4abf0Sagc       trans_i = state;
40063d4abf0Sagc       if (trans_i->state && trans_i->assertions & ASSERT_BACKREF)
40163d4abf0Sagc 	{
40263d4abf0Sagc 	  /* This is a back reference state.  All transitions leaving from
40363d4abf0Sagc 	     this state have the same back reference "assertion".  Instead
40463d4abf0Sagc 	     of reading the next character, we match the back reference. */
40582e82fcaSahoka 	  regoff_t so, eo, bt = trans_i->u.backref;
40682e82fcaSahoka 	  regoff_t bt_len;
40782e82fcaSahoka 	  regoff_t result;
40863d4abf0Sagc 
40963d4abf0Sagc 	  DPRINT(("  should match back reference %d\n", bt));
41063d4abf0Sagc 	  /* Get the substring we need to match against.  Remember to
41163d4abf0Sagc 	     turn off REG_NOSUB temporarily. */
412*bc436ad3Srin 	  tre_fill_pmatch(bt + 1, pmatch, tnfa->cflags & ~REG_NOSUB,
41363d4abf0Sagc 			  tnfa, tags, pos);
41482e82fcaSahoka 	  /* LINTED */so = pmatch[bt].rm_so;
41582e82fcaSahoka 	  /* LINTED */eo = pmatch[bt].rm_eo;
41663d4abf0Sagc 	  bt_len = eo - so;
41763d4abf0Sagc 
41863d4abf0Sagc #ifdef TRE_DEBUG
41963d4abf0Sagc 	  {
42063d4abf0Sagc 	    int slen;
42163d4abf0Sagc 	    if (len < 0)
42263d4abf0Sagc 	      slen = bt_len;
42363d4abf0Sagc 	    else
42463d4abf0Sagc 	      slen = MIN(bt_len, len - pos);
42563d4abf0Sagc 
42663d4abf0Sagc 	    if (type == STR_BYTE)
42763d4abf0Sagc 	      {
42863d4abf0Sagc 		DPRINT(("  substring (len %d) is [%d, %d[: '%.*s'\n",
42963d4abf0Sagc 			bt_len, so, eo, bt_len, (char*)string + so));
43063d4abf0Sagc 		DPRINT(("  current string is '%.*s'\n", slen, str_byte - 1));
43163d4abf0Sagc 	      }
43263d4abf0Sagc #ifdef TRE_WCHAR
43363d4abf0Sagc 	    else if (type == STR_WIDE)
43463d4abf0Sagc 	      {
43563d4abf0Sagc 		DPRINT(("  substring (len %d) is [%d, %d[: '%.*" STRF "'\n",
43663d4abf0Sagc 			bt_len, so, eo, bt_len, (wchar_t*)string + so));
43763d4abf0Sagc 		DPRINT(("  current string is '%.*" STRF "'\n",
43863d4abf0Sagc 			slen, str_wide - 1));
43963d4abf0Sagc 	      }
44063d4abf0Sagc #endif /* TRE_WCHAR */
44163d4abf0Sagc 	  }
44263d4abf0Sagc #endif
44363d4abf0Sagc 
44463d4abf0Sagc 	  if (len < 0)
44563d4abf0Sagc 	    {
44663d4abf0Sagc 	      if (type == STR_USER)
44763d4abf0Sagc 		result = str_source->compare((unsigned)so, (unsigned)pos,
44863d4abf0Sagc 					     (unsigned)bt_len,
44963d4abf0Sagc 					     str_source->context);
45063d4abf0Sagc #ifdef TRE_WCHAR
45163d4abf0Sagc 	      else if (type == STR_WIDE)
45263d4abf0Sagc 		result = wcsncmp((const wchar_t*)string + so, str_wide - 1,
45363d4abf0Sagc 				 (size_t)bt_len);
45463d4abf0Sagc #endif /* TRE_WCHAR */
45563d4abf0Sagc 	      else
45682e82fcaSahoka 		/* LINTED */result = strncmp((const char*)string + so, str_byte - 1,
45763d4abf0Sagc 				 (size_t)bt_len);
45863d4abf0Sagc 	    }
45963d4abf0Sagc 	  else if (len - pos < bt_len)
46063d4abf0Sagc 	    result = 1;
46163d4abf0Sagc #ifdef TRE_WCHAR
46263d4abf0Sagc 	  else if (type == STR_WIDE)
46363d4abf0Sagc 	    result = wmemcmp((const wchar_t*)string + so, str_wide - 1,
46463d4abf0Sagc 			     (size_t)bt_len);
46563d4abf0Sagc #endif /* TRE_WCHAR */
46663d4abf0Sagc 	  else
46782e82fcaSahoka 	    /* LINTED */result = memcmp((const char*)string + so, str_byte - 1,
46863d4abf0Sagc 			    (size_t)bt_len);
46963d4abf0Sagc 
47063d4abf0Sagc 	  if (result == 0)
47163d4abf0Sagc 	    {
47263d4abf0Sagc 	      /* Back reference matched.  Check for infinite loop. */
47363d4abf0Sagc 	      if (bt_len == 0)
47463d4abf0Sagc 		empty_br_match = 1;
47563d4abf0Sagc 	      if (empty_br_match && states_seen[trans_i->state_id])
47663d4abf0Sagc 		{
47763d4abf0Sagc 		  DPRINT(("  avoid loop\n"));
47863d4abf0Sagc 		  goto backtrack;
47963d4abf0Sagc 		}
48063d4abf0Sagc 
48163d4abf0Sagc 	      states_seen[trans_i->state_id] = empty_br_match;
48263d4abf0Sagc 
48363d4abf0Sagc 	      /* Advance in input string and resync `prev_c', `next_c'
48463d4abf0Sagc 		 and pos. */
48563d4abf0Sagc 	      DPRINT(("	 back reference matched\n"));
48682e82fcaSahoka 	      /* LINTED */str_byte += bt_len - 1;
48763d4abf0Sagc #ifdef TRE_WCHAR
48882e82fcaSahoka 	      /* LINTED */str_wide += bt_len - 1;
48963d4abf0Sagc #endif /* TRE_WCHAR */
49082e82fcaSahoka 	      /* LINTED */pos += bt_len - 1;
49163d4abf0Sagc 	      GET_NEXT_WCHAR();
49263d4abf0Sagc 	      DPRINT(("	 pos now %d\n", pos));
49363d4abf0Sagc 	    }
49463d4abf0Sagc 	  else
49563d4abf0Sagc 	    {
49663d4abf0Sagc 	      DPRINT(("	 back reference did not match\n"));
49763d4abf0Sagc 	      goto backtrack;
49863d4abf0Sagc 	    }
49963d4abf0Sagc 	}
50063d4abf0Sagc       else
50163d4abf0Sagc 	{
50263d4abf0Sagc 	  /* Check for end of string. */
50363d4abf0Sagc 	  if (len < 0)
50463d4abf0Sagc 	    {
50563d4abf0Sagc 	      if (type == STR_USER)
50663d4abf0Sagc 		{
50763d4abf0Sagc 		  if (str_user_end)
50863d4abf0Sagc 		    goto backtrack;
50963d4abf0Sagc 		}
51063d4abf0Sagc 	      else if (next_c == L'\0')
51163d4abf0Sagc 		goto backtrack;
51263d4abf0Sagc 	    }
51363d4abf0Sagc 	  else
51463d4abf0Sagc 	    {
51563d4abf0Sagc 	      if (pos >= len)
51663d4abf0Sagc 		goto backtrack;
51763d4abf0Sagc 	    }
51863d4abf0Sagc 
51963d4abf0Sagc 	  /* Read the next character. */
52063d4abf0Sagc 	  GET_NEXT_WCHAR();
52163d4abf0Sagc 	}
52263d4abf0Sagc 
52363d4abf0Sagc       next_state = NULL;
52463d4abf0Sagc       for (trans_i = state; trans_i->state; trans_i++)
52563d4abf0Sagc 	{
52663d4abf0Sagc 	  DPRINT(("  transition %d-%d (%c-%c) %d to %d\n",
52763d4abf0Sagc 		  trans_i->code_min, trans_i->code_max,
52863d4abf0Sagc 		  trans_i->code_min, trans_i->code_max,
52963d4abf0Sagc 		  trans_i->assertions, trans_i->state_id));
53063d4abf0Sagc 	  if (trans_i->code_min <= (tre_cint_t)prev_c
53163d4abf0Sagc 	      && trans_i->code_max >= (tre_cint_t)prev_c)
53263d4abf0Sagc 	    {
53363d4abf0Sagc 	      if (trans_i->assertions
53463d4abf0Sagc 		  && (CHECK_ASSERTIONS(trans_i->assertions)
53563d4abf0Sagc 		      || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)))
53663d4abf0Sagc 		{
53763d4abf0Sagc 		  DPRINT(("  assertion failed\n"));
53863d4abf0Sagc 		  continue;
53963d4abf0Sagc 		}
54063d4abf0Sagc 
54163d4abf0Sagc 	      if (next_state == NULL)
54263d4abf0Sagc 		{
54363d4abf0Sagc 		  /* First matching transition. */
54463d4abf0Sagc 		  DPRINT(("  Next state is %d\n", trans_i->state_id));
54563d4abf0Sagc 		  next_state = trans_i->state;
54663d4abf0Sagc 		  next_tags = trans_i->tags;
54763d4abf0Sagc 		}
54863d4abf0Sagc 	      else
54963d4abf0Sagc 		{
55063d4abf0Sagc 		  /* Second matching transition.  We may need to backtrack here
55163d4abf0Sagc 		     to take this transition instead of the first one, so we
55263d4abf0Sagc 		     push this transition in the backtracking stack so we can
55363d4abf0Sagc 		     jump back here if needed. */
55463d4abf0Sagc 		  DPRINT(("  saving state %d for backtracking\n",
55563d4abf0Sagc 			  trans_i->state_id));
55663d4abf0Sagc 		  BT_STACK_PUSH(pos, str_byte, str_wide, trans_i->state,
55763d4abf0Sagc 				trans_i->state_id, next_c, tags, mbstate);
55863d4abf0Sagc 		  {
55963d4abf0Sagc 		    int *tmp;
56063d4abf0Sagc 		    for (tmp = trans_i->tags; tmp && *tmp >= 0; tmp++)
56163d4abf0Sagc 		      stack->item.tags[*tmp] = pos;
56263d4abf0Sagc 		  }
56363d4abf0Sagc #if 0 /* XXX - it's important not to look at all transitions here to keep
56463d4abf0Sagc 	 the stack small! */
56563d4abf0Sagc 		  break;
56663d4abf0Sagc #endif
56763d4abf0Sagc 		}
56863d4abf0Sagc 	    }
56963d4abf0Sagc 	}
57063d4abf0Sagc 
57163d4abf0Sagc       if (next_state != NULL)
57263d4abf0Sagc 	{
57363d4abf0Sagc 	  /* Matching transitions were found.  Take the first one. */
57463d4abf0Sagc 	  state = next_state;
57563d4abf0Sagc 
57663d4abf0Sagc 	  /* Update the tag values. */
57763d4abf0Sagc 	  if (next_tags)
57863d4abf0Sagc 	    while (*next_tags >= 0)
57963d4abf0Sagc 	      tags[*next_tags++] = pos;
58063d4abf0Sagc 	}
58163d4abf0Sagc       else
58263d4abf0Sagc 	{
58363d4abf0Sagc 	backtrack:
58463d4abf0Sagc 	  /* A matching transition was not found.  Try to backtrack. */
58563d4abf0Sagc 	  if (stack->prev)
58663d4abf0Sagc 	    {
58763d4abf0Sagc 	      DPRINT(("	 backtracking\n"));
5887964dc0fSjoerg #if ASSERT_BACKREF
5897964dc0fSjoerg 	      if (stack->item.state->assertions)
59063d4abf0Sagc 		{
59163d4abf0Sagc 		  DPRINT(("  states_seen[%d] = 0\n",
59263d4abf0Sagc 			  stack->item.state_id));
59363d4abf0Sagc 		  states_seen[stack->item.state_id] = 0;
59463d4abf0Sagc 		}
5957964dc0fSjoerg #endif
59663d4abf0Sagc 
59763d4abf0Sagc 	      BT_STACK_POP();
59863d4abf0Sagc 	    }
59963d4abf0Sagc 	  else if (match_eo < 0)
60063d4abf0Sagc 	    {
60163d4abf0Sagc 	      /* Try starting from a later position in the input string. */
60263d4abf0Sagc 	      /* Check for end of string. */
60363d4abf0Sagc 	      if (len < 0)
60463d4abf0Sagc 		{
60563d4abf0Sagc 		  if (next_c == L'\0')
60663d4abf0Sagc 		    {
60763d4abf0Sagc 		      DPRINT(("end of string.\n"));
60863d4abf0Sagc 		      break;
60963d4abf0Sagc 		    }
61063d4abf0Sagc 		}
61163d4abf0Sagc 	      else
61263d4abf0Sagc 		{
61363d4abf0Sagc 		  if (pos >= len)
61463d4abf0Sagc 		    {
61563d4abf0Sagc 		      DPRINT(("end of string.\n"));
61663d4abf0Sagc 		      break;
61763d4abf0Sagc 		    }
61863d4abf0Sagc 		}
61963d4abf0Sagc 	      DPRINT(("restarting from next start position\n"));
62013498f30Srin 	      next_c = (tre_char_t) next_c_start;
62163d4abf0Sagc #ifdef TRE_MBSTATE
62263d4abf0Sagc 	      mbstate = mbstate_start;
62363d4abf0Sagc #endif /* TRE_MBSTATE */
62463d4abf0Sagc 	      str_byte = str_byte_start;
62563d4abf0Sagc #ifdef TRE_WCHAR
62663d4abf0Sagc 	      str_wide = str_wide_start;
62763d4abf0Sagc #endif /* TRE_WCHAR */
62863d4abf0Sagc 	      goto retry;
62963d4abf0Sagc 	    }
63063d4abf0Sagc 	  else
63163d4abf0Sagc 	    {
63263d4abf0Sagc 	      DPRINT(("finished\n"));
63363d4abf0Sagc 	      break;
63463d4abf0Sagc 	    }
63563d4abf0Sagc 	}
63663d4abf0Sagc     }
63763d4abf0Sagc 
63863d4abf0Sagc   ret = match_eo >= 0 ? REG_OK : REG_NOMATCH;
63963d4abf0Sagc   *match_end_ofs = match_eo;
64063d4abf0Sagc 
64163d4abf0Sagc  error_exit:
64263d4abf0Sagc   tre_bt_mem_destroy(mem);
64363d4abf0Sagc #ifndef TRE_USE_ALLOCA
64463d4abf0Sagc   if (tags)
64563d4abf0Sagc     xfree(tags);
64663d4abf0Sagc   if (pmatch)
64763d4abf0Sagc     xfree(pmatch);
64863d4abf0Sagc   if (states_seen)
64963d4abf0Sagc     xfree(states_seen);
65063d4abf0Sagc #endif /* !TRE_USE_ALLOCA */
65163d4abf0Sagc 
65263d4abf0Sagc   return ret;
65363d4abf0Sagc }
654