163d4abf0Sagc /*
263d4abf0Sagc tre-match-backtrack.c - TRE backtracking regex matching engine
363d4abf0Sagc
463d4abf0Sagc This software is released under a BSD-style license.
563d4abf0Sagc See the file LICENSE for details and copyright.
663d4abf0Sagc
763d4abf0Sagc */
863d4abf0Sagc
963d4abf0Sagc /*
1063d4abf0Sagc This matcher is for regexps that use back referencing. Regexp matching
1163d4abf0Sagc with back referencing is an NP-complete problem on the number of back
1263d4abf0Sagc references. The easiest way to match them is to use a backtracking
1363d4abf0Sagc routine which basically goes through all possible paths in the TNFA
1463d4abf0Sagc and chooses the one which results in the best (leftmost and longest)
1563d4abf0Sagc match. This can be spectacularly expensive and may run out of stack
1663d4abf0Sagc space, but there really is no better known generic algorithm. Quoting
1763d4abf0Sagc Henry Spencer from comp.compilers:
1863d4abf0Sagc <URL: http://compilers.iecc.com/comparch/article/93-03-102>
1963d4abf0Sagc
2063d4abf0Sagc POSIX.2 REs require longest match, which is really exciting to
2163d4abf0Sagc implement since the obsolete ("basic") variant also includes
2263d4abf0Sagc \<digit>. I haven't found a better way of tackling this than doing
2363d4abf0Sagc a preliminary match using a DFA (or simulation) on a modified RE
2463d4abf0Sagc that just replicates subREs for \<digit>, and then doing a
2563d4abf0Sagc backtracking match to determine whether the subRE matches were
2663d4abf0Sagc right. This can be rather slow, but I console myself with the
2763d4abf0Sagc thought that people who use \<digit> deserve very slow execution.
2863d4abf0Sagc (Pun unintentional but very appropriate.)
2963d4abf0Sagc
3063d4abf0Sagc */
3163d4abf0Sagc
3263d4abf0Sagc
3363d4abf0Sagc #ifdef HAVE_CONFIG_H
3463d4abf0Sagc #include <config.h>
3563d4abf0Sagc #endif /* HAVE_CONFIG_H */
3663d4abf0Sagc
37f2a3d147Schristos #include "tre-alloca.h"
3863d4abf0Sagc
3963d4abf0Sagc #include <assert.h>
4063d4abf0Sagc #include <stdlib.h>
4163d4abf0Sagc #include <string.h>
4263d4abf0Sagc #ifdef HAVE_WCHAR_H
4363d4abf0Sagc #include <wchar.h>
4463d4abf0Sagc #endif /* HAVE_WCHAR_H */
4563d4abf0Sagc #ifdef HAVE_WCTYPE_H
4663d4abf0Sagc #include <wctype.h>
4763d4abf0Sagc #endif /* HAVE_WCTYPE_H */
4863d4abf0Sagc #ifndef TRE_WCHAR
4963d4abf0Sagc #include <ctype.h>
5063d4abf0Sagc #endif /* !TRE_WCHAR */
5163d4abf0Sagc #ifdef HAVE_MALLOC_H
5263d4abf0Sagc #include <malloc.h>
5363d4abf0Sagc #endif /* HAVE_MALLOC_H */
5463d4abf0Sagc
5563d4abf0Sagc #include "tre-internal.h"
5663d4abf0Sagc #include "tre-mem.h"
5763d4abf0Sagc #include "tre-match-utils.h"
5863d4abf0Sagc #include "tre.h"
5963d4abf0Sagc #include "xmalloc.h"
6063d4abf0Sagc
6163d4abf0Sagc typedef struct {
6263d4abf0Sagc int pos;
6363d4abf0Sagc const char *str_byte;
6463d4abf0Sagc #ifdef TRE_WCHAR
6563d4abf0Sagc const wchar_t *str_wide;
6663d4abf0Sagc #endif /* TRE_WCHAR */
6763d4abf0Sagc tre_tnfa_transition_t *state;
6863d4abf0Sagc int state_id;
6963d4abf0Sagc int next_c;
7063d4abf0Sagc int *tags;
7163d4abf0Sagc #ifdef TRE_MBSTATE
7263d4abf0Sagc mbstate_t mbstate;
7363d4abf0Sagc #endif /* TRE_MBSTATE */
7463d4abf0Sagc } tre_backtrack_item_t;
7563d4abf0Sagc
7663d4abf0Sagc typedef struct tre_backtrack_struct {
7763d4abf0Sagc tre_backtrack_item_t item;
7863d4abf0Sagc struct tre_backtrack_struct *prev;
7963d4abf0Sagc struct tre_backtrack_struct *next;
8063d4abf0Sagc } *tre_backtrack_t;
8163d4abf0Sagc
8263d4abf0Sagc #ifdef TRE_WCHAR
8363d4abf0Sagc #define BT_STACK_WIDE_IN(_str_wide) stack->item.str_wide = (_str_wide)
8463d4abf0Sagc #define BT_STACK_WIDE_OUT (str_wide) = stack->item.str_wide
8563d4abf0Sagc #else /* !TRE_WCHAR */
8663d4abf0Sagc #define BT_STACK_WIDE_IN(_str_wide)
8763d4abf0Sagc #define BT_STACK_WIDE_OUT
8863d4abf0Sagc #endif /* !TRE_WCHAR */
8963d4abf0Sagc
9063d4abf0Sagc #ifdef TRE_MBSTATE
9163d4abf0Sagc #define BT_STACK_MBSTATE_IN stack->item.mbstate = (mbstate)
9263d4abf0Sagc #define BT_STACK_MBSTATE_OUT (mbstate) = stack->item.mbstate
9363d4abf0Sagc #else /* !TRE_MBSTATE */
9463d4abf0Sagc #define BT_STACK_MBSTATE_IN
9563d4abf0Sagc #define BT_STACK_MBSTATE_OUT
9663d4abf0Sagc #endif /* !TRE_MBSTATE */
9763d4abf0Sagc
9863d4abf0Sagc
9963d4abf0Sagc #ifdef TRE_USE_ALLOCA
10063d4abf0Sagc #define tre_bt_mem_new tre_mem_newa
10163d4abf0Sagc #define tre_bt_mem_alloc tre_mem_alloca
10213498f30Srin #define tre_bt_mem_destroy(obj) do { } while (/*CONSTCOND*/(void)0,0)
10363d4abf0Sagc #else /* !TRE_USE_ALLOCA */
10463d4abf0Sagc #define tre_bt_mem_new tre_mem_new
10563d4abf0Sagc #define tre_bt_mem_alloc tre_mem_alloc
10663d4abf0Sagc #define tre_bt_mem_destroy tre_mem_destroy
10763d4abf0Sagc #endif /* !TRE_USE_ALLOCA */
10863d4abf0Sagc
10963d4abf0Sagc
11063d4abf0Sagc #define BT_STACK_PUSH(_pos, _str_byte, _str_wide, _state, _state_id, _next_c, _tags, _mbstate) \
11163d4abf0Sagc do \
11263d4abf0Sagc { \
113f2a3d147Schristos size_t i; \
11463d4abf0Sagc if (!stack->next) \
11563d4abf0Sagc { \
11663d4abf0Sagc tre_backtrack_t s; \
11763d4abf0Sagc s = tre_bt_mem_alloc(mem, sizeof(*s)); \
11863d4abf0Sagc if (!s) \
11963d4abf0Sagc { \
12063d4abf0Sagc tre_bt_mem_destroy(mem); \
12163d4abf0Sagc if (tags) \
12263d4abf0Sagc xfree(tags); \
12363d4abf0Sagc if (pmatch) \
12463d4abf0Sagc xfree(pmatch); \
12563d4abf0Sagc if (states_seen) \
12663d4abf0Sagc xfree(states_seen); \
12763d4abf0Sagc return REG_ESPACE; \
12863d4abf0Sagc } \
12963d4abf0Sagc s->prev = stack; \
13063d4abf0Sagc s->next = NULL; \
13163d4abf0Sagc s->item.tags = tre_bt_mem_alloc(mem, \
13263d4abf0Sagc sizeof(*tags) * tnfa->num_tags); \
13363d4abf0Sagc if (!s->item.tags) \
13463d4abf0Sagc { \
13563d4abf0Sagc tre_bt_mem_destroy(mem); \
13663d4abf0Sagc if (tags) \
13763d4abf0Sagc xfree(tags); \
13863d4abf0Sagc if (pmatch) \
13963d4abf0Sagc xfree(pmatch); \
14063d4abf0Sagc if (states_seen) \
14163d4abf0Sagc xfree(states_seen); \
14263d4abf0Sagc return REG_ESPACE; \
14363d4abf0Sagc } \
14463d4abf0Sagc stack->next = s; \
14563d4abf0Sagc stack = s; \
14663d4abf0Sagc } \
14763d4abf0Sagc else \
14863d4abf0Sagc stack = stack->next; \
14963d4abf0Sagc stack->item.pos = (_pos); \
15063d4abf0Sagc stack->item.str_byte = (_str_byte); \
15163d4abf0Sagc BT_STACK_WIDE_IN(_str_wide); \
15263d4abf0Sagc stack->item.state = (_state); \
15363d4abf0Sagc stack->item.state_id = (_state_id); \
15463d4abf0Sagc stack->item.next_c = (_next_c); \
15563d4abf0Sagc for (i = 0; i < tnfa->num_tags; i++) \
15663d4abf0Sagc stack->item.tags[i] = (_tags)[i]; \
15763d4abf0Sagc BT_STACK_MBSTATE_IN; \
15863d4abf0Sagc } \
15913498f30Srin while (/*CONSTCOND*/(void)0,0)
16063d4abf0Sagc
16163d4abf0Sagc #define BT_STACK_POP() \
16263d4abf0Sagc do \
16363d4abf0Sagc { \
164f2a3d147Schristos size_t i; \
16563d4abf0Sagc assert(stack->prev); \
16663d4abf0Sagc pos = stack->item.pos; \
16763d4abf0Sagc if (type == STR_USER) \
16863d4abf0Sagc str_source->rewind(pos + pos_add_next, str_source->context); \
16963d4abf0Sagc str_byte = stack->item.str_byte; \
17063d4abf0Sagc BT_STACK_WIDE_OUT; \
17163d4abf0Sagc state = stack->item.state; \
17213498f30Srin next_c = (tre_char_t) stack->item.next_c; \
17363d4abf0Sagc for (i = 0; i < tnfa->num_tags; i++) \
17463d4abf0Sagc tags[i] = stack->item.tags[i]; \
17563d4abf0Sagc BT_STACK_MBSTATE_OUT; \
17663d4abf0Sagc stack = stack->prev; \
17763d4abf0Sagc } \
17813498f30Srin while (/*CONSTCOND*/(void)0,0)
17963d4abf0Sagc
18063d4abf0Sagc #undef MIN
18163d4abf0Sagc #define MIN(a, b) ((a) <= (b) ? (a) : (b))
18263d4abf0Sagc
18363d4abf0Sagc reg_errcode_t
tre_tnfa_run_backtrack(const tre_tnfa_t * tnfa,const void * string,int len,tre_str_type_t type,int * match_tags,int eflags,int * match_end_ofs)18463d4abf0Sagc tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
18563d4abf0Sagc int len, tre_str_type_t type, int *match_tags,
18663d4abf0Sagc int eflags, int *match_end_ofs)
18763d4abf0Sagc {
18863d4abf0Sagc /* State variables required by GET_NEXT_WCHAR. */
18963d4abf0Sagc tre_char_t prev_c = 0, next_c = 0;
19063d4abf0Sagc const char *str_byte = string;
19163d4abf0Sagc int pos = 0;
19263d4abf0Sagc unsigned int pos_add_next = 1;
19363d4abf0Sagc #ifdef TRE_WCHAR
19463d4abf0Sagc const wchar_t *str_wide = string;
19563d4abf0Sagc #ifdef TRE_MBSTATE
19663d4abf0Sagc mbstate_t mbstate;
19763d4abf0Sagc #endif /* TRE_MBSTATE */
19863d4abf0Sagc #endif /* TRE_WCHAR */
19963d4abf0Sagc int reg_notbol = eflags & REG_NOTBOL;
20063d4abf0Sagc int reg_noteol = eflags & REG_NOTEOL;
20163d4abf0Sagc int reg_newline = tnfa->cflags & REG_NEWLINE;
202f2a3d147Schristos size_t str_user_end = 0;
20363d4abf0Sagc
20463d4abf0Sagc /* These are used to remember the necessary values of the above
20563d4abf0Sagc variables to return to the position where the current search
20663d4abf0Sagc started from. */
20763d4abf0Sagc int next_c_start;
20863d4abf0Sagc const char *str_byte_start;
20963d4abf0Sagc int pos_start = -1;
21063d4abf0Sagc #ifdef TRE_WCHAR
21163d4abf0Sagc const wchar_t *str_wide_start;
21263d4abf0Sagc #endif /* TRE_WCHAR */
21363d4abf0Sagc #ifdef TRE_MBSTATE
21463d4abf0Sagc mbstate_t mbstate_start;
21563d4abf0Sagc #endif /* TRE_MBSTATE */
21663d4abf0Sagc
21763d4abf0Sagc /* End offset of best match so far, or -1 if no match found yet. */
21863d4abf0Sagc int match_eo = -1;
21963d4abf0Sagc /* Tag arrays. */
22063d4abf0Sagc int *next_tags, *tags = NULL;
22163d4abf0Sagc /* Current TNFA state. */
22263d4abf0Sagc tre_tnfa_transition_t *state;
22363d4abf0Sagc int *states_seen = NULL;
22463d4abf0Sagc
22563d4abf0Sagc /* Memory allocator to for allocating the backtracking stack. */
22663d4abf0Sagc tre_mem_t mem = tre_bt_mem_new();
22763d4abf0Sagc
22863d4abf0Sagc /* The backtracking stack. */
22963d4abf0Sagc tre_backtrack_t stack;
23063d4abf0Sagc
23163d4abf0Sagc tre_tnfa_transition_t *trans_i;
23263d4abf0Sagc regmatch_t *pmatch = NULL;
23399fb528eSrin reg_errcode_t ret;
23463d4abf0Sagc
23563d4abf0Sagc #ifdef TRE_MBSTATE
23663d4abf0Sagc memset(&mbstate, '\0', sizeof(mbstate));
23763d4abf0Sagc #endif /* TRE_MBSTATE */
23863d4abf0Sagc
23963d4abf0Sagc if (!mem)
24063d4abf0Sagc return REG_ESPACE;
24163d4abf0Sagc stack = tre_bt_mem_alloc(mem, sizeof(*stack));
24263d4abf0Sagc if (!stack)
24363d4abf0Sagc {
24463d4abf0Sagc ret = REG_ESPACE;
24563d4abf0Sagc goto error_exit;
24663d4abf0Sagc }
24763d4abf0Sagc stack->prev = NULL;
24863d4abf0Sagc stack->next = NULL;
24963d4abf0Sagc
25063d4abf0Sagc DPRINT(("tnfa_execute_backtrack, input type %d\n", type));
25163d4abf0Sagc DPRINT(("len = %d\n", len));
25263d4abf0Sagc
25363d4abf0Sagc #ifdef TRE_USE_ALLOCA
25463d4abf0Sagc tags = alloca(sizeof(*tags) * tnfa->num_tags);
25563d4abf0Sagc pmatch = alloca(sizeof(*pmatch) * tnfa->num_submatches);
25663d4abf0Sagc states_seen = alloca(sizeof(*states_seen) * tnfa->num_states);
25763d4abf0Sagc #else /* !TRE_USE_ALLOCA */
25863d4abf0Sagc if (tnfa->num_tags)
25963d4abf0Sagc {
26063d4abf0Sagc tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
26163d4abf0Sagc if (!tags)
26263d4abf0Sagc {
26363d4abf0Sagc ret = REG_ESPACE;
26463d4abf0Sagc goto error_exit;
26563d4abf0Sagc }
26663d4abf0Sagc }
26763d4abf0Sagc if (tnfa->num_submatches)
26863d4abf0Sagc {
26963d4abf0Sagc pmatch = xmalloc(sizeof(*pmatch) * tnfa->num_submatches);
27063d4abf0Sagc if (!pmatch)
27163d4abf0Sagc {
27263d4abf0Sagc ret = REG_ESPACE;
27363d4abf0Sagc goto error_exit;
27463d4abf0Sagc }
27563d4abf0Sagc }
27663d4abf0Sagc if (tnfa->num_states)
27763d4abf0Sagc {
27863d4abf0Sagc states_seen = xmalloc(sizeof(*states_seen) * tnfa->num_states);
27963d4abf0Sagc if (!states_seen)
28063d4abf0Sagc {
28163d4abf0Sagc ret = REG_ESPACE;
28263d4abf0Sagc goto error_exit;
28363d4abf0Sagc }
28463d4abf0Sagc }
28563d4abf0Sagc #endif /* !TRE_USE_ALLOCA */
28663d4abf0Sagc
28763d4abf0Sagc retry:
28863d4abf0Sagc {
289f2a3d147Schristos size_t i;
29063d4abf0Sagc for (i = 0; i < tnfa->num_tags; i++)
29163d4abf0Sagc {
29263d4abf0Sagc tags[i] = -1;
29363d4abf0Sagc if (match_tags)
29463d4abf0Sagc match_tags[i] = -1;
29563d4abf0Sagc }
29663d4abf0Sagc for (i = 0; i < tnfa->num_states; i++)
29763d4abf0Sagc states_seen[i] = 0;
29863d4abf0Sagc }
29963d4abf0Sagc
30063d4abf0Sagc state = NULL;
30163d4abf0Sagc pos = pos_start;
30263d4abf0Sagc if (type == STR_USER)
30363d4abf0Sagc str_source->rewind(pos + pos_add_next, str_source->context);
30463d4abf0Sagc GET_NEXT_WCHAR();
30563d4abf0Sagc pos_start = pos;
30663d4abf0Sagc next_c_start = next_c;
30763d4abf0Sagc str_byte_start = str_byte;
30863d4abf0Sagc #ifdef TRE_WCHAR
30963d4abf0Sagc str_wide_start = str_wide;
31063d4abf0Sagc #endif /* TRE_WCHAR */
31163d4abf0Sagc #ifdef TRE_MBSTATE
31263d4abf0Sagc mbstate_start = mbstate;
31363d4abf0Sagc #endif /* TRE_MBSTATE */
31463d4abf0Sagc
31563d4abf0Sagc /* Handle initial states. */
31663d4abf0Sagc next_tags = NULL;
31763d4abf0Sagc for (trans_i = tnfa->initial; trans_i->state; trans_i++)
31863d4abf0Sagc {
31963d4abf0Sagc DPRINT(("> init %p, prev_c %lc\n", trans_i->state, (tre_cint_t)prev_c));
32063d4abf0Sagc if (trans_i->assertions && CHECK_ASSERTIONS(trans_i->assertions))
32163d4abf0Sagc {
32263d4abf0Sagc DPRINT(("assert failed\n"));
32363d4abf0Sagc continue;
32463d4abf0Sagc }
32563d4abf0Sagc if (state == NULL)
32663d4abf0Sagc {
32763d4abf0Sagc /* Start from this state. */
32863d4abf0Sagc state = trans_i->state;
32963d4abf0Sagc next_tags = trans_i->tags;
33063d4abf0Sagc }
33163d4abf0Sagc else
33263d4abf0Sagc {
33363d4abf0Sagc /* Backtrack to this state. */
33463d4abf0Sagc DPRINT(("saving state %d for backtracking\n", trans_i->state_id));
33563d4abf0Sagc BT_STACK_PUSH(pos, str_byte, str_wide, trans_i->state,
33663d4abf0Sagc trans_i->state_id, next_c, tags, mbstate);
33763d4abf0Sagc {
33863d4abf0Sagc int *tmp = trans_i->tags;
33963d4abf0Sagc if (tmp)
34063d4abf0Sagc while (*tmp >= 0)
34163d4abf0Sagc stack->item.tags[*tmp++] = pos;
34263d4abf0Sagc }
34363d4abf0Sagc }
34463d4abf0Sagc }
34563d4abf0Sagc
34663d4abf0Sagc if (next_tags)
34763d4abf0Sagc for (; *next_tags >= 0; next_tags++)
34863d4abf0Sagc tags[*next_tags] = pos;
34963d4abf0Sagc
35063d4abf0Sagc
35163d4abf0Sagc DPRINT(("entering match loop, pos %d, str_byte %p\n", pos, str_byte));
35263d4abf0Sagc DPRINT(("pos:chr/code | state and tags\n"));
35363d4abf0Sagc DPRINT(("-------------+------------------------------------------------\n"));
35463d4abf0Sagc
35563d4abf0Sagc if (state == NULL)
35663d4abf0Sagc goto backtrack;
35763d4abf0Sagc
35813498f30Srin while (/*CONSTCOND*/(void)1,1)
35963d4abf0Sagc {
36063d4abf0Sagc tre_tnfa_transition_t *next_state;
36163d4abf0Sagc int empty_br_match;
36263d4abf0Sagc
36363d4abf0Sagc DPRINT(("start loop\n"));
36463d4abf0Sagc if (state == tnfa->final)
36563d4abf0Sagc {
36663d4abf0Sagc DPRINT((" match found, %d %d\n", match_eo, pos));
36763d4abf0Sagc if (match_eo < pos
36863d4abf0Sagc || (match_eo == pos
36963d4abf0Sagc && match_tags
37063d4abf0Sagc && tre_tag_order(tnfa->num_tags, tnfa->tag_directions,
37163d4abf0Sagc tags, match_tags)))
37263d4abf0Sagc {
373f2a3d147Schristos size_t i;
37463d4abf0Sagc /* This match wins the previous match. */
37563d4abf0Sagc DPRINT((" win previous\n"));
37663d4abf0Sagc match_eo = pos;
37763d4abf0Sagc if (match_tags)
37863d4abf0Sagc for (i = 0; i < tnfa->num_tags; i++)
37963d4abf0Sagc match_tags[i] = tags[i];
38063d4abf0Sagc }
38163d4abf0Sagc /* Our TNFAs never have transitions leaving from the final state,
38263d4abf0Sagc so we jump right to backtracking. */
38363d4abf0Sagc goto backtrack;
38463d4abf0Sagc }
38563d4abf0Sagc
38663d4abf0Sagc #ifdef TRE_DEBUG
38763d4abf0Sagc DPRINT(("%3d:%2lc/%05d | %p ", pos, (tre_cint_t)next_c, (int)next_c,
38863d4abf0Sagc state));
38963d4abf0Sagc {
39063d4abf0Sagc int i;
39163d4abf0Sagc for (i = 0; i < tnfa->num_tags; i++)
39263d4abf0Sagc DPRINT(("%d%s", tags[i], i < tnfa->num_tags - 1 ? ", " : ""));
39363d4abf0Sagc DPRINT(("\n"));
39463d4abf0Sagc }
39563d4abf0Sagc #endif /* TRE_DEBUG */
39663d4abf0Sagc
39763d4abf0Sagc /* Go to the next character in the input string. */
39863d4abf0Sagc empty_br_match = 0;
39963d4abf0Sagc trans_i = state;
40063d4abf0Sagc if (trans_i->state && trans_i->assertions & ASSERT_BACKREF)
40163d4abf0Sagc {
40263d4abf0Sagc /* This is a back reference state. All transitions leaving from
40363d4abf0Sagc this state have the same back reference "assertion". Instead
40463d4abf0Sagc of reading the next character, we match the back reference. */
40582e82fcaSahoka regoff_t so, eo, bt = trans_i->u.backref;
40682e82fcaSahoka regoff_t bt_len;
40782e82fcaSahoka regoff_t result;
40863d4abf0Sagc
40963d4abf0Sagc DPRINT((" should match back reference %d\n", bt));
41063d4abf0Sagc /* Get the substring we need to match against. Remember to
41163d4abf0Sagc turn off REG_NOSUB temporarily. */
412*bc436ad3Srin tre_fill_pmatch(bt + 1, pmatch, tnfa->cflags & ~REG_NOSUB,
41363d4abf0Sagc tnfa, tags, pos);
41482e82fcaSahoka /* LINTED */so = pmatch[bt].rm_so;
41582e82fcaSahoka /* LINTED */eo = pmatch[bt].rm_eo;
41663d4abf0Sagc bt_len = eo - so;
41763d4abf0Sagc
41863d4abf0Sagc #ifdef TRE_DEBUG
41963d4abf0Sagc {
42063d4abf0Sagc int slen;
42163d4abf0Sagc if (len < 0)
42263d4abf0Sagc slen = bt_len;
42363d4abf0Sagc else
42463d4abf0Sagc slen = MIN(bt_len, len - pos);
42563d4abf0Sagc
42663d4abf0Sagc if (type == STR_BYTE)
42763d4abf0Sagc {
42863d4abf0Sagc DPRINT((" substring (len %d) is [%d, %d[: '%.*s'\n",
42963d4abf0Sagc bt_len, so, eo, bt_len, (char*)string + so));
43063d4abf0Sagc DPRINT((" current string is '%.*s'\n", slen, str_byte - 1));
43163d4abf0Sagc }
43263d4abf0Sagc #ifdef TRE_WCHAR
43363d4abf0Sagc else if (type == STR_WIDE)
43463d4abf0Sagc {
43563d4abf0Sagc DPRINT((" substring (len %d) is [%d, %d[: '%.*" STRF "'\n",
43663d4abf0Sagc bt_len, so, eo, bt_len, (wchar_t*)string + so));
43763d4abf0Sagc DPRINT((" current string is '%.*" STRF "'\n",
43863d4abf0Sagc slen, str_wide - 1));
43963d4abf0Sagc }
44063d4abf0Sagc #endif /* TRE_WCHAR */
44163d4abf0Sagc }
44263d4abf0Sagc #endif
44363d4abf0Sagc
44463d4abf0Sagc if (len < 0)
44563d4abf0Sagc {
44663d4abf0Sagc if (type == STR_USER)
44763d4abf0Sagc result = str_source->compare((unsigned)so, (unsigned)pos,
44863d4abf0Sagc (unsigned)bt_len,
44963d4abf0Sagc str_source->context);
45063d4abf0Sagc #ifdef TRE_WCHAR
45163d4abf0Sagc else if (type == STR_WIDE)
45263d4abf0Sagc result = wcsncmp((const wchar_t*)string + so, str_wide - 1,
45363d4abf0Sagc (size_t)bt_len);
45463d4abf0Sagc #endif /* TRE_WCHAR */
45563d4abf0Sagc else
45682e82fcaSahoka /* LINTED */result = strncmp((const char*)string + so, str_byte - 1,
45763d4abf0Sagc (size_t)bt_len);
45863d4abf0Sagc }
45963d4abf0Sagc else if (len - pos < bt_len)
46063d4abf0Sagc result = 1;
46163d4abf0Sagc #ifdef TRE_WCHAR
46263d4abf0Sagc else if (type == STR_WIDE)
46363d4abf0Sagc result = wmemcmp((const wchar_t*)string + so, str_wide - 1,
46463d4abf0Sagc (size_t)bt_len);
46563d4abf0Sagc #endif /* TRE_WCHAR */
46663d4abf0Sagc else
46782e82fcaSahoka /* LINTED */result = memcmp((const char*)string + so, str_byte - 1,
46863d4abf0Sagc (size_t)bt_len);
46963d4abf0Sagc
47063d4abf0Sagc if (result == 0)
47163d4abf0Sagc {
47263d4abf0Sagc /* Back reference matched. Check for infinite loop. */
47363d4abf0Sagc if (bt_len == 0)
47463d4abf0Sagc empty_br_match = 1;
47563d4abf0Sagc if (empty_br_match && states_seen[trans_i->state_id])
47663d4abf0Sagc {
47763d4abf0Sagc DPRINT((" avoid loop\n"));
47863d4abf0Sagc goto backtrack;
47963d4abf0Sagc }
48063d4abf0Sagc
48163d4abf0Sagc states_seen[trans_i->state_id] = empty_br_match;
48263d4abf0Sagc
48363d4abf0Sagc /* Advance in input string and resync `prev_c', `next_c'
48463d4abf0Sagc and pos. */
48563d4abf0Sagc DPRINT((" back reference matched\n"));
48682e82fcaSahoka /* LINTED */str_byte += bt_len - 1;
48763d4abf0Sagc #ifdef TRE_WCHAR
48882e82fcaSahoka /* LINTED */str_wide += bt_len - 1;
48963d4abf0Sagc #endif /* TRE_WCHAR */
49082e82fcaSahoka /* LINTED */pos += bt_len - 1;
49163d4abf0Sagc GET_NEXT_WCHAR();
49263d4abf0Sagc DPRINT((" pos now %d\n", pos));
49363d4abf0Sagc }
49463d4abf0Sagc else
49563d4abf0Sagc {
49663d4abf0Sagc DPRINT((" back reference did not match\n"));
49763d4abf0Sagc goto backtrack;
49863d4abf0Sagc }
49963d4abf0Sagc }
50063d4abf0Sagc else
50163d4abf0Sagc {
50263d4abf0Sagc /* Check for end of string. */
50363d4abf0Sagc if (len < 0)
50463d4abf0Sagc {
50563d4abf0Sagc if (type == STR_USER)
50663d4abf0Sagc {
50763d4abf0Sagc if (str_user_end)
50863d4abf0Sagc goto backtrack;
50963d4abf0Sagc }
51063d4abf0Sagc else if (next_c == L'\0')
51163d4abf0Sagc goto backtrack;
51263d4abf0Sagc }
51363d4abf0Sagc else
51463d4abf0Sagc {
51563d4abf0Sagc if (pos >= len)
51663d4abf0Sagc goto backtrack;
51763d4abf0Sagc }
51863d4abf0Sagc
51963d4abf0Sagc /* Read the next character. */
52063d4abf0Sagc GET_NEXT_WCHAR();
52163d4abf0Sagc }
52263d4abf0Sagc
52363d4abf0Sagc next_state = NULL;
52463d4abf0Sagc for (trans_i = state; trans_i->state; trans_i++)
52563d4abf0Sagc {
52663d4abf0Sagc DPRINT((" transition %d-%d (%c-%c) %d to %d\n",
52763d4abf0Sagc trans_i->code_min, trans_i->code_max,
52863d4abf0Sagc trans_i->code_min, trans_i->code_max,
52963d4abf0Sagc trans_i->assertions, trans_i->state_id));
53063d4abf0Sagc if (trans_i->code_min <= (tre_cint_t)prev_c
53163d4abf0Sagc && trans_i->code_max >= (tre_cint_t)prev_c)
53263d4abf0Sagc {
53363d4abf0Sagc if (trans_i->assertions
53463d4abf0Sagc && (CHECK_ASSERTIONS(trans_i->assertions)
53563d4abf0Sagc || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)))
53663d4abf0Sagc {
53763d4abf0Sagc DPRINT((" assertion failed\n"));
53863d4abf0Sagc continue;
53963d4abf0Sagc }
54063d4abf0Sagc
54163d4abf0Sagc if (next_state == NULL)
54263d4abf0Sagc {
54363d4abf0Sagc /* First matching transition. */
54463d4abf0Sagc DPRINT((" Next state is %d\n", trans_i->state_id));
54563d4abf0Sagc next_state = trans_i->state;
54663d4abf0Sagc next_tags = trans_i->tags;
54763d4abf0Sagc }
54863d4abf0Sagc else
54963d4abf0Sagc {
55063d4abf0Sagc /* Second matching transition. We may need to backtrack here
55163d4abf0Sagc to take this transition instead of the first one, so we
55263d4abf0Sagc push this transition in the backtracking stack so we can
55363d4abf0Sagc jump back here if needed. */
55463d4abf0Sagc DPRINT((" saving state %d for backtracking\n",
55563d4abf0Sagc trans_i->state_id));
55663d4abf0Sagc BT_STACK_PUSH(pos, str_byte, str_wide, trans_i->state,
55763d4abf0Sagc trans_i->state_id, next_c, tags, mbstate);
55863d4abf0Sagc {
55963d4abf0Sagc int *tmp;
56063d4abf0Sagc for (tmp = trans_i->tags; tmp && *tmp >= 0; tmp++)
56163d4abf0Sagc stack->item.tags[*tmp] = pos;
56263d4abf0Sagc }
56363d4abf0Sagc #if 0 /* XXX - it's important not to look at all transitions here to keep
56463d4abf0Sagc the stack small! */
56563d4abf0Sagc break;
56663d4abf0Sagc #endif
56763d4abf0Sagc }
56863d4abf0Sagc }
56963d4abf0Sagc }
57063d4abf0Sagc
57163d4abf0Sagc if (next_state != NULL)
57263d4abf0Sagc {
57363d4abf0Sagc /* Matching transitions were found. Take the first one. */
57463d4abf0Sagc state = next_state;
57563d4abf0Sagc
57663d4abf0Sagc /* Update the tag values. */
57763d4abf0Sagc if (next_tags)
57863d4abf0Sagc while (*next_tags >= 0)
57963d4abf0Sagc tags[*next_tags++] = pos;
58063d4abf0Sagc }
58163d4abf0Sagc else
58263d4abf0Sagc {
58363d4abf0Sagc backtrack:
58463d4abf0Sagc /* A matching transition was not found. Try to backtrack. */
58563d4abf0Sagc if (stack->prev)
58663d4abf0Sagc {
58763d4abf0Sagc DPRINT((" backtracking\n"));
5887964dc0fSjoerg #if ASSERT_BACKREF
5897964dc0fSjoerg if (stack->item.state->assertions)
59063d4abf0Sagc {
59163d4abf0Sagc DPRINT((" states_seen[%d] = 0\n",
59263d4abf0Sagc stack->item.state_id));
59363d4abf0Sagc states_seen[stack->item.state_id] = 0;
59463d4abf0Sagc }
5957964dc0fSjoerg #endif
59663d4abf0Sagc
59763d4abf0Sagc BT_STACK_POP();
59863d4abf0Sagc }
59963d4abf0Sagc else if (match_eo < 0)
60063d4abf0Sagc {
60163d4abf0Sagc /* Try starting from a later position in the input string. */
60263d4abf0Sagc /* Check for end of string. */
60363d4abf0Sagc if (len < 0)
60463d4abf0Sagc {
60563d4abf0Sagc if (next_c == L'\0')
60663d4abf0Sagc {
60763d4abf0Sagc DPRINT(("end of string.\n"));
60863d4abf0Sagc break;
60963d4abf0Sagc }
61063d4abf0Sagc }
61163d4abf0Sagc else
61263d4abf0Sagc {
61363d4abf0Sagc if (pos >= len)
61463d4abf0Sagc {
61563d4abf0Sagc DPRINT(("end of string.\n"));
61663d4abf0Sagc break;
61763d4abf0Sagc }
61863d4abf0Sagc }
61963d4abf0Sagc DPRINT(("restarting from next start position\n"));
62013498f30Srin next_c = (tre_char_t) next_c_start;
62163d4abf0Sagc #ifdef TRE_MBSTATE
62263d4abf0Sagc mbstate = mbstate_start;
62363d4abf0Sagc #endif /* TRE_MBSTATE */
62463d4abf0Sagc str_byte = str_byte_start;
62563d4abf0Sagc #ifdef TRE_WCHAR
62663d4abf0Sagc str_wide = str_wide_start;
62763d4abf0Sagc #endif /* TRE_WCHAR */
62863d4abf0Sagc goto retry;
62963d4abf0Sagc }
63063d4abf0Sagc else
63163d4abf0Sagc {
63263d4abf0Sagc DPRINT(("finished\n"));
63363d4abf0Sagc break;
63463d4abf0Sagc }
63563d4abf0Sagc }
63663d4abf0Sagc }
63763d4abf0Sagc
63863d4abf0Sagc ret = match_eo >= 0 ? REG_OK : REG_NOMATCH;
63963d4abf0Sagc *match_end_ofs = match_eo;
64063d4abf0Sagc
64163d4abf0Sagc error_exit:
64263d4abf0Sagc tre_bt_mem_destroy(mem);
64363d4abf0Sagc #ifndef TRE_USE_ALLOCA
64463d4abf0Sagc if (tags)
64563d4abf0Sagc xfree(tags);
64663d4abf0Sagc if (pmatch)
64763d4abf0Sagc xfree(pmatch);
64863d4abf0Sagc if (states_seen)
64963d4abf0Sagc xfree(states_seen);
65063d4abf0Sagc #endif /* !TRE_USE_ALLOCA */
65163d4abf0Sagc
65263d4abf0Sagc return ret;
65363d4abf0Sagc }
654