xref: /netbsd-src/external/gpl2/xcvs/dist/lib/regexec.c (revision 5a6c14c844c4c665da5632061aebde7bb2cb5766)
1 /* Extended regular expression matching and search library.
2    Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5 
6    This program is free software; you can redistribute it and/or modify
7    it under the terms of the GNU General Public License as published by
8    the Free Software Foundation; either version 2, or (at your option)
9    any later version.
10 
11    This program is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14    GNU General Public License for more details.
15 
16    You should have received a copy of the GNU General Public License along
17    with this program; if not, write to the Free Software Foundation,
18    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
19 #include <sys/cdefs.h>
20 __RCSID("$NetBSD: regexec.c,v 1.3 2016/05/17 14:00:09 christos Exp $");
21 
22 
23 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
24 				     Idx n) internal_function;
25 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
26 static void match_ctx_free (re_match_context_t *cache) internal_function;
27 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, Idx node,
28 					  Idx str_idx, Idx from, Idx to)
29      internal_function;
30 static Idx search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
31      internal_function;
32 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, Idx node,
33 					   Idx str_idx) internal_function;
34 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
35 						    Idx node, Idx str_idx)
36      internal_function;
37 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
38 			   re_dfastate_t **limited_sts, Idx last_node,
39 			   Idx last_str_idx)
40      internal_function;
41 static reg_errcode_t re_search_internal (const regex_t *preg,
42 					 const char *string, Idx length,
43 					 Idx start, Idx last_start, Idx stop,
44 					 size_t nmatch, regmatch_t pmatch[],
45 					 int eflags) internal_function;
46 static regoff_t re_search_2_stub (struct re_pattern_buffer *bufp,
47 				  const char *string1, Idx length1,
48 				  const char *string2, Idx length2,
49 				  Idx start, regoff_t range,
50 				  struct re_registers *regs,
51 				  Idx stop, bool ret_len) internal_function;
52 static regoff_t re_search_stub (struct re_pattern_buffer *bufp,
53 				const char *string, Idx length, Idx start,
54 				regoff_t range, Idx stop,
55 				struct re_registers *regs,
56 				bool ret_len) internal_function;
57 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
58 			      Idx nregs, int regs_allocated) internal_function;
59 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
60      internal_function;
61 static Idx check_matching (re_match_context_t *mctx, bool fl_longest_match,
62 			   Idx *p_match_first)
63      internal_function;
64 static Idx check_halt_state_context (const re_match_context_t *mctx,
65 				     const re_dfastate_t *state, Idx idx)
66      internal_function;
67 static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch,
68 			 regmatch_t *prev_idx_match, Idx cur_node,
69 			 Idx cur_idx, Idx nmatch) internal_function;
70 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
71 				      Idx str_idx, Idx dest_node, Idx nregs,
72 				      regmatch_t *regs,
73 				      re_node_set *eps_via_nodes) internal_function;
74 static reg_errcode_t set_regs (const regex_t *preg,
75 			       const re_match_context_t *mctx,
76 			       size_t nmatch, regmatch_t *pmatch,
77 			       bool fl_backtrack) internal_function;
78 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs) internal_function;
79 
80 #ifdef RE_ENABLE_I18N
81 static int sift_states_iter_mb (const re_match_context_t *mctx,
82 				re_sift_context_t *sctx,
83 				Idx node_idx, Idx str_idx, Idx max_str_idx) internal_function;
84 #endif /* RE_ENABLE_I18N */
85 static reg_errcode_t sift_states_backward (re_match_context_t *mctx,
86 					   re_sift_context_t *sctx) internal_function;
87 static reg_errcode_t build_sifted_states (re_match_context_t *mctx,
88 					  re_sift_context_t *sctx, Idx str_idx,
89 					  re_node_set *cur_dest) internal_function;
90 static reg_errcode_t update_cur_sifted_state (re_match_context_t *mctx,
91 					      re_sift_context_t *sctx,
92 					      Idx str_idx,
93 					      re_node_set *dest_nodes) internal_function;
94 static reg_errcode_t add_epsilon_src_nodes (re_dfa_t *dfa,
95 					    re_node_set *dest_nodes,
96 					    const re_node_set *candidates) internal_function;
97 static bool check_dst_limits (const re_match_context_t *mctx,
98 			      const re_node_set *limits,
99 			      Idx dst_node, Idx dst_idx, Idx src_node,
100 			      Idx src_idx) internal_function;
101 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
102 					int boundaries, Idx subexp_idx,
103 					Idx from_node, Idx bkref_idx) internal_function;
104 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
105 				      Idx limit, Idx subexp_idx,
106 				      Idx node, Idx str_idx,
107 				      Idx bkref_idx) internal_function;
108 static reg_errcode_t check_subexp_limits (re_dfa_t *dfa,
109 					  re_node_set *dest_nodes,
110 					  const re_node_set *candidates,
111 					  re_node_set *limits,
112 					  struct re_backref_cache_entry *bkref_ents,
113 					  Idx str_idx) internal_function;
114 static reg_errcode_t sift_states_bkref (re_match_context_t *mctx,
115 					re_sift_context_t *sctx,
116 					Idx str_idx, const re_node_set *candidates) internal_function;
117 static reg_errcode_t merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst,
118 					re_dfastate_t **src, Idx num) internal_function;
119 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
120 					 re_match_context_t *mctx) internal_function;
121 static re_dfastate_t *transit_state (reg_errcode_t *err,
122 				     re_match_context_t *mctx,
123 				     re_dfastate_t *state) internal_function;
124 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
125 					    re_match_context_t *mctx,
126 					    re_dfastate_t *next_state) internal_function;
127 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
128 						re_node_set *cur_nodes,
129 						Idx str_idx) internal_function;
130 #if 0
131 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
132 					re_match_context_t *mctx,
133 					re_dfastate_t *pstate) internal_function;
134 #endif
135 #ifdef RE_ENABLE_I18N
136 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
137 				       re_dfastate_t *pstate) internal_function;
138 #endif /* RE_ENABLE_I18N */
139 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
140 					  const re_node_set *nodes) internal_function;
141 static reg_errcode_t get_subexp (re_match_context_t *mctx,
142 				 Idx bkref_node, Idx bkref_str_idx) internal_function;
143 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
144 				     const re_sub_match_top_t *sub_top,
145 				     re_sub_match_last_t *sub_last,
146 				     Idx bkref_node, Idx bkref_str) internal_function;
147 static Idx find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
148 			     Idx subexp_idx, int type) internal_function;
149 static reg_errcode_t check_arrival (re_match_context_t *mctx,
150 				    state_array_t *path, Idx top_node,
151 				    Idx top_str, Idx last_node, Idx last_str,
152 				    int type) internal_function;
153 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
154 						   Idx str_idx,
155 						   re_node_set *cur_nodes,
156 						   re_node_set *next_nodes) internal_function;
157 static reg_errcode_t check_arrival_expand_ecl (re_dfa_t *dfa,
158 					       re_node_set *cur_nodes,
159 					       Idx ex_subexp, int type) internal_function;
160 static reg_errcode_t check_arrival_expand_ecl_sub (re_dfa_t *dfa,
161 						   re_node_set *dst_nodes,
162 						   Idx target, Idx ex_subexp,
163 						   int type) internal_function;
164 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
165 					 re_node_set *cur_nodes, Idx cur_str,
166 					 Idx subexp_num, int type) internal_function;
167 static bool build_trtable (re_dfa_t *dfa,
168 			   re_dfastate_t *state) internal_function;
169 #ifdef RE_ENABLE_I18N
170 static int check_node_accept_bytes (re_dfa_t *dfa, Idx node_idx,
171 				    const re_string_t *input, Idx idx) internal_function;
172 # ifdef _LIBC
173 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
174 						   size_t name_len) internal_function;
175 # endif /* _LIBC */
176 #endif /* RE_ENABLE_I18N */
177 static Idx group_nodes_into_DFAstates (const re_dfa_t *dfa,
178 				       const re_dfastate_t *state,
179 				       re_node_set *states_node,
180 				       bitset *states_ch) internal_function;
181 static bool check_node_accept (const re_match_context_t *mctx,
182 			       const re_token_t *node, Idx idx)
183      internal_function;
184 static reg_errcode_t extend_buffers (re_match_context_t *mctx) internal_function;
185 
186 /* Entry point for POSIX code.  */
187 
188 /* regexec searches for a given pattern, specified by PREG, in the
189    string STRING.
190 
191    If NMATCH is zero or REG_NOSUB was set in the cflags argument to
192    `regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
193    least NMATCH elements, and we set them to the offsets of the
194    corresponding matched substrings.
195 
196    EFLAGS specifies `execution flags' which affect matching: if
197    REG_NOTBOL is set, then ^ does not match at the beginning of the
198    string; if REG_NOTEOL is set, then $ does not match at the end.
199 
200    We return 0 if we find a match and REG_NOMATCH if not.  */
201 
202 int
regexec(const regex_t * __restrict preg,const char * __restrict string,size_t nmatch,regmatch_t pmatch[],int eflags)203 regexec (const regex_t *__restrict preg, const char *__restrict string,
204 	 size_t nmatch, regmatch_t pmatch[], int eflags)
205 {
206   reg_errcode_t err;
207   Idx start, length;
208 #ifdef _LIBC
209   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
210 #endif
211 
212   if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
213     return REG_BADPAT;
214 
215   if (eflags & REG_STARTEND)
216     {
217       start = pmatch[0].rm_so;
218       length = pmatch[0].rm_eo;
219     }
220   else
221     {
222       start = 0;
223       length = strlen (string);
224     }
225 
226   __libc_lock_lock (dfa->lock);
227   if (preg->re_no_sub)
228     err = re_search_internal (preg, string, length, start, length,
229 			      length, 0, NULL, eflags);
230   else
231     err = re_search_internal (preg, string, length, start, length,
232 			      length, nmatch, pmatch, eflags);
233   __libc_lock_unlock (dfa->lock);
234   return err != REG_NOERROR;
235 }
236 
237 #ifdef _LIBC
238 # include <shlib-compat.h>
239 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
240 
241 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
242 __typeof__ (__regexec) __compat_regexec;
243 
244 int
245 attribute_compat_text_section
__compat_regexec(const regex_t * __restrict preg,const char * __restrict string,size_t nmatch,regmatch_t pmatch[],int eflags)246 __compat_regexec (const regex_t *__restrict preg,
247 		  const char *__restrict string, size_t nmatch,
248 		  regmatch_t pmatch[], int eflags)
249 {
250   return regexec (preg, string, nmatch, pmatch,
251 		  eflags & (REG_NOTBOL | REG_NOTEOL));
252 }
253 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
254 # endif
255 #endif
256 
257 /* Entry points for GNU code.  */
258 
259 /* re_match, re_search, re_match_2, re_search_2
260 
261    The former two functions operate on STRING with length LENGTH,
262    while the later two operate on concatenation of STRING1 and STRING2
263    with lengths LENGTH1 and LENGTH2, respectively.
264 
265    re_match() matches the compiled pattern in BUFP against the string,
266    starting at index START.
267 
268    re_search() first tries matching at index START, then it tries to match
269    starting from index START + 1, and so on.  The last start position tried
270    is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
271    way as re_match().)
272 
273    The parameter STOP of re_{match,search}_2 specifies that no match exceeding
274    the first STOP characters of the concatenation of the strings should be
275    concerned.
276 
277    If REGS is not NULL, and BUFP->re_no_sub is not set, the offsets of the match
278    and all groups is stroed in REGS.  (For the "_2" variants, the offsets are
279    computed relative to the concatenation, not relative to the individual
280    strings.)
281 
282    On success, re_match* functions return the length of the match, re_search*
283    return the position of the start of the match.  Return value -1 means no
284    match was found and -2 indicates an internal error.  */
285 
286 regoff_t
re_match(struct re_pattern_buffer * bufp,const char * string,Idx length,Idx start,struct re_registers * regs)287 re_match (struct re_pattern_buffer *bufp, const char *string,
288 	  Idx length, Idx start, struct re_registers *regs)
289 {
290   return re_search_stub (bufp, string, length, start, 0, length, regs, true);
291 }
292 #ifdef _LIBC
weak_alias(__re_match,re_match)293 weak_alias (__re_match, re_match)
294 #endif
295 
296 regoff_t
297 re_search (struct re_pattern_buffer *bufp, const char *string,
298 	   Idx length, Idx start, regoff_t range, struct re_registers *regs)
299 {
300   return re_search_stub (bufp, string, length, start, range, length, regs,
301 			 false);
302 }
303 #ifdef _LIBC
weak_alias(__re_search,re_search)304 weak_alias (__re_search, re_search)
305 #endif
306 
307 regoff_t
308 re_match_2 (struct re_pattern_buffer *bufp,
309 	    const char *string1, Idx length1,
310 	    const char *string2, Idx length2,
311 	    Idx start, struct re_registers *regs, Idx stop)
312 {
313   return re_search_2_stub (bufp, string1, length1, string2, length2,
314 			   start, 0, regs, stop, true);
315 }
316 #ifdef _LIBC
weak_alias(__re_match_2,re_match_2)317 weak_alias (__re_match_2, re_match_2)
318 #endif
319 
320 regoff_t
321 re_search_2 (struct re_pattern_buffer *bufp,
322 	     const char *string1, Idx length1,
323 	     const char *string2, Idx length2,
324 	     Idx start, regoff_t range, struct re_registers *regs, Idx stop)
325 {
326   return re_search_2_stub (bufp, string1, length1, string2, length2,
327 			   start, range, regs, stop, false);
328 }
329 #ifdef _LIBC
weak_alias(__re_search_2,re_search_2)330 weak_alias (__re_search_2, re_search_2)
331 #endif
332 
333 static regoff_t
334 internal_function
335 re_search_2_stub (struct re_pattern_buffer *bufp,
336 		  const char *string1, Idx length1,
337 		  const char *string2, Idx length2,
338 		  Idx start, regoff_t range, struct re_registers *regs,
339 		  Idx stop, bool ret_len)
340 {
341   const char *str;
342   regoff_t rval;
343   Idx len = length1 + length2;
344   char *s = NULL;
345 
346   if (BE (length1 < 0 || length2 < 0 || stop < 0 || len < length1, 0))
347     return -2;
348 
349   /* Concatenate the strings.  */
350   if (length2 > 0)
351     if (length1 > 0)
352       {
353 	s = re_malloc (char, len);
354 
355 	if (BE (s == NULL, 0))
356 	  return -2;
357 	memcpy (s, string1, length1);
358 	memcpy (s + length1, string2, length2);
359 	str = s;
360       }
361     else
362       str = string2;
363   else
364     str = string1;
365 
366   rval = re_search_stub (bufp, str, len, start, range, stop, regs,
367 			 ret_len);
368   re_free (s);
369   return rval;
370 }
371 
372 /* The parameters have the same meaning as those of re_search.
373    Additional parameters:
374    If RET_LEN is true the length of the match is returned (re_match style);
375    otherwise the position of the match is returned.  */
376 
377 static regoff_t
378 internal_function
re_search_stub(struct re_pattern_buffer * bufp,const char * string,Idx length,Idx start,regoff_t range,Idx stop,struct re_registers * regs,bool ret_len)379 re_search_stub (struct re_pattern_buffer *bufp,
380 		const char *string, Idx length,
381 		Idx start, regoff_t range, Idx stop, struct re_registers *regs,
382 		bool ret_len)
383 {
384   reg_errcode_t result;
385   regmatch_t *pmatch;
386   Idx nregs;
387   regoff_t rval;
388   int eflags = 0;
389 #ifdef _LIBC
390   re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer;
391 #endif
392   Idx last_start = start + range;
393 
394   /* Check for out-of-range.  */
395   if (BE (start < 0 || start > length, 0))
396     return -1;
397   if (sizeof start < sizeof range)
398     {
399       regoff_t length_offset = length;
400       regoff_t start_offset = start;
401       if (BE (length_offset - start_offset < range, 0))
402 	last_start = length;
403       else if (BE (range < - start_offset, 0))
404 	last_start = 0;
405     }
406   else
407     {
408       if (BE ((last_start < start) != (range < 0), 0))
409 	{
410 	  /* Overflow occurred when computing last_start; substitute
411 	     the extreme value.  */
412 	  last_start = range < 0 ? 0 : length;
413 	}
414       else
415 	{
416 	  if (BE (length < last_start, 0))
417 	    last_start = length;
418 	  else if (BE (last_start < 0, 0))
419 	    last_start = 0;
420 	}
421     }
422 
423   __libc_lock_lock (dfa->lock);
424 
425   eflags |= (bufp->re_not_bol) ? REG_NOTBOL : 0;
426   eflags |= (bufp->re_not_eol) ? REG_NOTEOL : 0;
427 
428   /* Compile fastmap if we haven't yet.  */
429   if (start < last_start && bufp->re_fastmap != NULL
430       && !bufp->re_fastmap_accurate)
431     re_compile_fastmap (bufp);
432 
433   if (BE (bufp->re_no_sub, 0))
434     regs = NULL;
435 
436   /* We need at least 1 register.  */
437   if (regs == NULL)
438     nregs = 1;
439   else if (BE (bufp->re_regs_allocated == REG_FIXED
440 	       && regs->rm_num_regs <= bufp->re_nsub, 0))
441     {
442       nregs = regs->rm_num_regs;
443       if (BE (nregs < 1, 0))
444 	{
445 	  /* Nothing can be copied to regs.  */
446 	  regs = NULL;
447 	  nregs = 1;
448 	}
449     }
450   else
451     nregs = bufp->re_nsub + 1;
452   pmatch = re_xmalloc (regmatch_t, nregs);
453   if (BE (pmatch == NULL, 0))
454     {
455       rval = -2;
456       goto out;
457     }
458 
459   result = re_search_internal (bufp, string, length, start, last_start, stop,
460 			       nregs, pmatch, eflags);
461 
462   rval = 0;
463 
464   /* I hope we needn't fill ther regs with -1's when no match was found.  */
465   if (result != REG_NOERROR)
466     rval = -1;
467   else if (regs != NULL)
468     {
469       /* If caller wants register contents data back, copy them.  */
470       bufp->re_regs_allocated = re_copy_regs (regs, pmatch, nregs,
471 					      bufp->re_regs_allocated);
472       if (BE (bufp->re_regs_allocated == REG_UNALLOCATED, 0))
473 	rval = -2;
474     }
475 
476   if (BE (rval == 0, 1))
477     {
478       if (ret_len)
479 	{
480 	  assert (pmatch[0].rm_so == start);
481 	  rval = pmatch[0].rm_eo - start;
482 	}
483       else
484 	rval = pmatch[0].rm_so;
485     }
486   re_free (pmatch);
487  out:
488   __libc_lock_unlock (dfa->lock);
489   return rval;
490 }
491 
492 static unsigned
493 internal_function
re_copy_regs(struct re_registers * regs,regmatch_t * pmatch,Idx nregs,int regs_allocated)494 re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, Idx nregs,
495 	      int regs_allocated)
496 {
497   int rval = REG_REALLOCATE;
498   Idx i;
499   Idx need_regs = nregs + 1;
500   /* We need one extra element beyond `rm_num_regs' for the `-1' marker GNU code
501      uses.  */
502 
503   /* Have the register data arrays been allocated?  */
504   if (regs_allocated == REG_UNALLOCATED)
505     { /* No.  So allocate them with malloc.  */
506       regs->rm_start = re_xmalloc (regoff_t, need_regs);
507       regs->rm_end = re_malloc (regoff_t, need_regs);
508       if (BE (regs->rm_start == NULL, 0) || BE (regs->rm_end == NULL, 0))
509 	return REG_UNALLOCATED;
510       regs->rm_num_regs = need_regs;
511     }
512   else if (regs_allocated == REG_REALLOCATE)
513     { /* Yes.  If we need more elements than were already
514 	 allocated, reallocate them.  If we need fewer, just
515 	 leave it alone.  */
516       if (BE (need_regs > regs->rm_num_regs, 0))
517 	{
518 	  regoff_t *new_start =
519 	    re_xrealloc (regs->rm_start, regoff_t, need_regs);
520 	  regoff_t *new_end = re_realloc (regs->rm_end, regoff_t, need_regs);
521 	  if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
522 	    return REG_UNALLOCATED;
523 	  regs->rm_start = new_start;
524 	  regs->rm_end = new_end;
525 	  regs->rm_num_regs = need_regs;
526 	}
527     }
528   else
529     {
530       assert (regs_allocated == REG_FIXED);
531       /* This function may not be called with REG_FIXED and nregs too big.  */
532       assert (regs->rm_num_regs >= nregs);
533       rval = REG_FIXED;
534     }
535 
536   /* Copy the regs.  */
537   for (i = 0; i < nregs; ++i)
538     {
539       regs->rm_start[i] = pmatch[i].rm_so;
540       regs->rm_end[i] = pmatch[i].rm_eo;
541     }
542   for ( ; i < regs->rm_num_regs; ++i)
543     regs->rm_start[i] = regs->rm_end[i] = -1;
544 
545   return rval;
546 }
547 
548 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
549    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
550    this memory for recording register information.  STARTS and ENDS
551    must be allocated using the malloc library routine, and must each
552    be at least NUM_REGS * sizeof (regoff_t) bytes long.
553 
554    If NUM_REGS == 0, then subsequent matches should allocate their own
555    register data.
556 
557    Unless this function is called, the first search or match using
558    PATTERN_BUFFER will allocate its own register data, without
559    freeing the old data.  */
560 
561 void
re_set_registers(struct re_pattern_buffer * bufp,struct re_registers * regs,__re_size_t num_regs,regoff_t * starts,regoff_t * ends)562 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
563 		  __re_size_t num_regs, regoff_t *starts, regoff_t *ends)
564 {
565   if (num_regs)
566     {
567       bufp->re_regs_allocated = REG_REALLOCATE;
568       regs->rm_num_regs = num_regs;
569       regs->rm_start = starts;
570       regs->rm_end = ends;
571     }
572   else
573     {
574       bufp->re_regs_allocated = REG_UNALLOCATED;
575       regs->rm_num_regs = 0;
576       regs->rm_start = regs->rm_end = NULL;
577     }
578 }
579 #ifdef _LIBC
weak_alias(__re_set_registers,re_set_registers)580 weak_alias (__re_set_registers, re_set_registers)
581 #endif
582 
583 /* Entry points compatible with 4.2 BSD regex library.  We don't define
584    them unless specifically requested.  */
585 
586 #if defined _REGEX_RE_COMP || defined _LIBC
587 int
588 # ifdef _LIBC
589 weak_function
590 # endif
591 re_exec (const char *s)
592 {
593   return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
594 }
595 #endif /* _REGEX_RE_COMP */
596 
597 /* Internal entry point.  */
598 
599 /* Searches for a compiled pattern PREG in the string STRING, whose
600    length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
601    meaning as with regexec.  LAST_START is START + RANGE, where
602    START and RANGE have the same meaning as with re_search.
603    Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
604    otherwise return the error code.
605    Note: We assume front end functions already check ranges.
606    (0 <= LAST_START && LAST_START <= LENGTH)  */
607 
608 static reg_errcode_t
609 internal_function
re_search_internal(const regex_t * preg,const char * string,Idx length,Idx start,Idx last_start,Idx stop,size_t nmatch,regmatch_t pmatch[],int eflags)610 re_search_internal (const regex_t *preg,
611 		    const char *string, Idx length,
612 		    Idx start, Idx last_start, Idx stop,
613 		    size_t nmatch, regmatch_t pmatch[],
614 		    int eflags)
615 {
616   reg_errcode_t err;
617   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
618   Idx left_lim, right_lim;
619   int incr;
620   bool fl_longest_match;
621   int match_kind;
622   Idx match_first, match_last = REG_MISSING;
623   Idx extra_nmatch;
624   bool sb;
625   int ch;
626 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
627   re_match_context_t mctx = { .dfa = dfa };
628 #else
629   re_match_context_t mctx;
630 #endif
631   char *fastmap = ((preg->re_fastmap != NULL && preg->re_fastmap_accurate
632 		    && start != last_start && !preg->re_can_be_null)
633 		   ? preg->re_fastmap : NULL);
634   unsigned REG_TRANSLATE_TYPE t =
635     (unsigned REG_TRANSLATE_TYPE) preg->re_translate;
636 
637 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
638   memset (&mctx, '\0', sizeof (re_match_context_t));
639   mctx.dfa = dfa;
640 #endif
641 
642   extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
643   nmatch -= extra_nmatch;
644 
645   /* Check if the DFA haven't been compiled.  */
646   if (BE (preg->re_used == 0 || dfa->init_state == NULL
647 	  || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
648 	  || dfa->init_state_begbuf == NULL, 0))
649     return REG_NOMATCH;
650 
651 #ifdef DEBUG
652   /* We assume front-end functions already check them.  */
653   assert (0 <= last_start && last_start <= length);
654 #endif
655 
656   /* If initial states with non-begbuf contexts have no elements,
657      the regex must be anchored.  If preg->re_newline_anchor is set,
658      we'll never use init_state_nl, so do not check it.  */
659   if (dfa->init_state->nodes.nelem == 0
660       && dfa->init_state_word->nodes.nelem == 0
661       && (dfa->init_state_nl->nodes.nelem == 0
662 	  || !preg->re_newline_anchor))
663     {
664       if (start != 0 && last_start != 0)
665         return REG_NOMATCH;
666       start = last_start = 0;
667     }
668 
669   /* We must check the longest matching, if nmatch > 0.  */
670   fl_longest_match = (nmatch != 0 || dfa->nbackref);
671 
672   err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
673 			    preg->re_translate,
674 			    preg->re_syntax & REG_IGNORE_CASE, dfa);
675   if (BE (err != REG_NOERROR, 0))
676     goto free_return;
677   mctx.input.stop = stop;
678   mctx.input.raw_stop = stop;
679   mctx.input.newline_anchor = preg->re_newline_anchor;
680 
681   err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
682   if (BE (err != REG_NOERROR, 0))
683     goto free_return;
684 
685   /* We will log all the DFA states through which the dfa pass,
686      if nmatch > 1, or this dfa has "multibyte node", which is a
687      back-reference or a node which can accept multibyte character or
688      multi character collating element.  */
689   if (nmatch > 1 || dfa->has_mb_node)
690     {
691       mctx.state_log = re_xmalloc (re_dfastate_t *, mctx.input.bufs_len + 1);
692       if (BE (mctx.state_log == NULL, 0))
693 	{
694 	  err = REG_ESPACE;
695 	  goto free_return;
696 	}
697     }
698   else
699     mctx.state_log = NULL;
700 
701   match_first = start;
702   mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
703 			   : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
704 
705   /* Check incrementally whether of not the input string match.  */
706   incr = (last_start < start) ? -1 : 1;
707   left_lim = (last_start < start) ? last_start : start;
708   right_lim = (last_start < start) ? start : last_start;
709   sb = dfa->mb_cur_max == 1;
710   match_kind =
711     (fastmap
712      ? ((sb || !(preg->re_syntax & REG_IGNORE_CASE || t) ? 4 : 0)
713 	| (start <= last_start ? 2 : 0)
714 	| (t != NULL ? 1 : 0))
715      : 8);
716 
717   for (;; match_first += incr)
718     {
719       err = REG_NOMATCH;
720       if (match_first < left_lim || right_lim < match_first)
721 	goto free_return;
722 
723       /* Advance as rapidly as possible through the string, until we
724 	 find a plausible place to start matching.  This may be done
725 	 with varying efficiency, so there are various possibilities:
726 	 only the most common of them are specialized, in order to
727 	 save on code size.  We use a switch statement for speed.  */
728       switch (match_kind)
729 	{
730 	case 8:
731 	  /* No fastmap.  */
732 	  break;
733 
734 	case 7:
735 	  /* Fastmap with single-byte translation, match forward.  */
736 	  while (BE (match_first < right_lim, 1)
737 		 && !fastmap[t[(unsigned char) string[match_first]]])
738 	    ++match_first;
739 	  goto forward_match_found_start_or_reached_end;
740 
741 	case 6:
742 	  /* Fastmap without translation, match forward.  */
743 	  while (BE (match_first < right_lim, 1)
744 		 && !fastmap[(unsigned char) string[match_first]])
745 	    ++match_first;
746 
747 	forward_match_found_start_or_reached_end:
748 	  if (BE (match_first == right_lim, 0))
749 	    {
750 	      ch = match_first >= length
751 		       ? 0 : (unsigned char) string[match_first];
752 	      if (!fastmap[t ? t[ch] : ch])
753 		goto free_return;
754 	    }
755 	  break;
756 
757 	case 4:
758 	case 5:
759 	  /* Fastmap without multi-byte translation, match backwards.  */
760 	  while (match_first >= left_lim)
761 	    {
762 	      ch = match_first >= length
763 		       ? 0 : (unsigned char) string[match_first];
764 	      if (fastmap[t ? t[ch] : ch])
765 		break;
766 	      --match_first;
767 	    }
768 	  if (match_first < left_lim)
769 	    goto free_return;
770 	  break;
771 
772 	default:
773 	  /* In this case, we can't determine easily the current byte,
774 	     since it might be a component byte of a multibyte
775 	     character.  Then we use the constructed buffer instead.  */
776 	  for (;;)
777 	    {
778 	      /* If MATCH_FIRST is out of the valid range, reconstruct the
779 		 buffers.  */
780 	      __re_size_t offset = match_first - mctx.input.raw_mbs_idx;
781 	      if (BE (offset >= (__re_size_t) mctx.input.valid_raw_len, 0))
782 		{
783 		  err = re_string_reconstruct (&mctx.input, match_first,
784 					       eflags);
785 		  if (BE (err != REG_NOERROR, 0))
786 		    goto free_return;
787 
788 		  offset = match_first - mctx.input.raw_mbs_idx;
789 		}
790 	      /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
791 		 Note that MATCH_FIRST must not be smaller than 0.  */
792 	      ch = (match_first >= length
793 		    ? 0 : re_string_byte_at (&mctx.input, offset));
794 	      if (fastmap[ch])
795 		break;
796 	      match_first += incr;
797 	      if (match_first < left_lim || match_first > right_lim)
798 	        {
799 	          err = REG_NOMATCH;
800 	          goto free_return;
801 	        }
802 	    }
803 	  break;
804 	}
805 
806       /* Reconstruct the buffers so that the matcher can assume that
807 	 the matching starts from the beginning of the buffer.  */
808       err = re_string_reconstruct (&mctx.input, match_first, eflags);
809       if (BE (err != REG_NOERROR, 0))
810 	goto free_return;
811 
812 #ifdef RE_ENABLE_I18N
813      /* Don't consider this char as a possible match start if it part,
814 	yet isn't the head, of a multibyte character.  */
815       if (!sb && !re_string_first_byte (&mctx.input, 0))
816 	continue;
817 #endif
818 
819       /* It seems to be appropriate one, then use the matcher.  */
820       /* We assume that the matching starts from 0.  */
821       mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
822       match_last = check_matching (&mctx, fl_longest_match,
823 				   start <= last_start ? &match_first : NULL);
824       if (match_last != REG_MISSING)
825 	{
826 	  if (BE (match_last == REG_ERROR, 0))
827 	    {
828 	      err = REG_ESPACE;
829 	      goto free_return;
830 	    }
831 	  else
832 	    {
833 	      mctx.match_last = match_last;
834 	      if ((!preg->re_no_sub && nmatch > 1) || dfa->nbackref)
835 		{
836 		  re_dfastate_t *pstate = mctx.state_log[match_last];
837 		  mctx.last_node = check_halt_state_context (&mctx, pstate,
838 							     match_last);
839 		}
840 	      if ((!preg->re_no_sub && nmatch > 1 && dfa->has_plural_match)
841 		  || dfa->nbackref)
842 		{
843 		  err = prune_impossible_nodes (&mctx);
844 		  if (err == REG_NOERROR)
845 		    break;
846 		  if (BE (err != REG_NOMATCH, 0))
847 		    goto free_return;
848 		  match_last = REG_MISSING;
849 		}
850 	      else
851 		break; /* We found a match.  */
852 	    }
853 	}
854 
855       match_ctx_clean (&mctx);
856     }
857 
858 #ifdef DEBUG
859   assert (match_last != REG_MISSING);
860   assert (err == REG_NOERROR);
861 #endif
862 
863   /* Set pmatch[] if we need.  */
864   if (nmatch > 0)
865     {
866       Idx reg_idx;
867 
868       /* Initialize registers.  */
869       for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
870 	pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
871 
872       /* Set the points where matching start/end.  */
873       pmatch[0].rm_so = 0;
874       pmatch[0].rm_eo = mctx.match_last;
875       /* FIXME: This function should fail if mctx.match_last exceeds
876 	 the maximum possible regoff_t value.  We need a new error
877 	 code REG_OVERFLOW.  */
878 
879       if (!preg->re_no_sub && nmatch > 1)
880 	{
881 	  err = set_regs (preg, &mctx, nmatch, pmatch,
882 			  dfa->has_plural_match && dfa->nbackref > 0);
883 	  if (BE (err != REG_NOERROR, 0))
884 	    goto free_return;
885 	}
886 
887       /* At last, add the offset to the each registers, since we slided
888 	 the buffers so that we could assume that the matching starts
889 	 from 0.  */
890       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
891 	if (pmatch[reg_idx].rm_so != -1)
892 	  {
893 #ifdef RE_ENABLE_I18N
894 	    if (BE (mctx.input.offsets_needed != 0, 0))
895 	      {
896 		pmatch[reg_idx].rm_so =
897 		  (pmatch[reg_idx].rm_so == mctx.input.valid_len
898 		   ? mctx.input.valid_raw_len
899 		   : mctx.input.offsets[pmatch[reg_idx].rm_so]);
900 		pmatch[reg_idx].rm_eo =
901 		  (pmatch[reg_idx].rm_eo == mctx.input.valid_len
902 		   ? mctx.input.valid_raw_len
903 		   : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
904 	      }
905 #else
906 	    assert (mctx.input.offsets_needed == 0);
907 #endif
908 	    pmatch[reg_idx].rm_so += match_first;
909 	    pmatch[reg_idx].rm_eo += match_first;
910 	  }
911       for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
912 	{
913 	  pmatch[nmatch + reg_idx].rm_so = -1;
914 	  pmatch[nmatch + reg_idx].rm_eo = -1;
915 	}
916 
917       if (dfa->subexp_map)
918         for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
919           if (dfa->subexp_map[reg_idx] != reg_idx)
920             {
921               pmatch[reg_idx + 1].rm_so
922                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
923               pmatch[reg_idx + 1].rm_eo
924                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
925             }
926     }
927 
928  free_return:
929   re_free (mctx.state_log);
930   if (dfa->nbackref)
931     match_ctx_free (&mctx);
932   re_string_destruct (&mctx.input);
933   return err;
934 }
935 
936 static reg_errcode_t
937 internal_function
prune_impossible_nodes(re_match_context_t * mctx)938 prune_impossible_nodes (re_match_context_t *mctx)
939 {
940   re_dfa_t *const dfa = mctx->dfa;
941   Idx halt_node, match_last;
942   reg_errcode_t ret;
943   re_dfastate_t **sifted_states;
944   re_dfastate_t **lim_states = NULL;
945   re_sift_context_t sctx;
946 #ifdef DEBUG
947   assert (mctx->state_log != NULL);
948 #endif
949   match_last = mctx->match_last;
950   halt_node = mctx->last_node;
951   sifted_states = re_xmalloc (re_dfastate_t *, match_last + 1);
952   if (BE (sifted_states == NULL, 0))
953     {
954       ret = REG_ESPACE;
955       goto free_return;
956     }
957   if (dfa->nbackref)
958     {
959       lim_states = re_xmalloc (re_dfastate_t *, match_last + 1);
960       if (BE (lim_states == NULL, 0))
961 	{
962 	  ret = REG_ESPACE;
963 	  goto free_return;
964 	}
965       while (1)
966 	{
967 	  memset (lim_states, '\0',
968 		  sizeof (re_dfastate_t *) * (match_last + 1));
969 	  sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
970 			 match_last);
971 	  ret = sift_states_backward (mctx, &sctx);
972 	  re_node_set_free (&sctx.limits);
973 	  if (BE (ret != REG_NOERROR, 0))
974 	      goto free_return;
975 	  if (sifted_states[0] != NULL || lim_states[0] != NULL)
976 	    break;
977 	  do
978 	    {
979 	      --match_last;
980 	      if (! REG_VALID_INDEX (match_last))
981 		{
982 		  ret = REG_NOMATCH;
983 		  goto free_return;
984 		}
985 	    } while (mctx->state_log[match_last] == NULL
986 		     || !mctx->state_log[match_last]->halt);
987 	  halt_node = check_halt_state_context (mctx,
988 						mctx->state_log[match_last],
989 						match_last);
990 	}
991       ret = merge_state_array (dfa, sifted_states, lim_states,
992 			       match_last + 1);
993       re_free (lim_states);
994       lim_states = NULL;
995       if (BE (ret != REG_NOERROR, 0))
996 	goto free_return;
997     }
998   else
999     {
1000       sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
1001       ret = sift_states_backward (mctx, &sctx);
1002       re_node_set_free (&sctx.limits);
1003       if (BE (ret != REG_NOERROR, 0))
1004 	goto free_return;
1005     }
1006   re_free (mctx->state_log);
1007   mctx->state_log = sifted_states;
1008   sifted_states = NULL;
1009   mctx->last_node = halt_node;
1010   mctx->match_last = match_last;
1011   ret = REG_NOERROR;
1012  free_return:
1013   re_free (sifted_states);
1014   re_free (lim_states);
1015   return ret;
1016 }
1017 
1018 /* Acquire an initial state and return it.
1019    We must select appropriate initial state depending on the context,
1020    since initial states may have constraints like "\<", "^", etc..  */
1021 
1022 static inline re_dfastate_t *
1023 __attribute ((always_inline)) internal_function
acquire_init_state_context(reg_errcode_t * err,const re_match_context_t * mctx,Idx idx)1024 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1025 			    Idx idx)
1026 {
1027   re_dfa_t *const dfa = mctx->dfa;
1028   if (dfa->init_state->has_constraint)
1029     {
1030       unsigned int context;
1031       context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1032       if (IS_WORD_CONTEXT (context))
1033 	return dfa->init_state_word;
1034       else if (IS_ORDINARY_CONTEXT (context))
1035 	return dfa->init_state;
1036       else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1037 	return dfa->init_state_begbuf;
1038       else if (IS_NEWLINE_CONTEXT (context))
1039 	return dfa->init_state_nl;
1040       else if (IS_BEGBUF_CONTEXT (context))
1041 	{
1042 	  /* It is relatively rare case, then calculate on demand.  */
1043 	  return re_acquire_state_context (err, dfa,
1044 					   dfa->init_state->entrance_nodes,
1045 					   context);
1046 	}
1047       else
1048 	/* Must not happen?  */
1049 	return dfa->init_state;
1050     }
1051   else
1052     return dfa->init_state;
1053 }
1054 
1055 /* Check whether the regular expression match input string INPUT or not,
1056    and return the index where the matching end.  Return REG_MISSING if
1057    there is no match, and return REG_ERROR in case of an error.
1058    FL_LONGEST_MATCH means we want the POSIX longest matching.
1059    If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1060    next place where we may want to try matching.
1061    Note that the matcher assume that the maching starts from the current
1062    index of the buffer.  */
1063 
1064 static Idx
1065 internal_function
check_matching(re_match_context_t * mctx,bool fl_longest_match,Idx * p_match_first)1066 check_matching (re_match_context_t *mctx, bool fl_longest_match,
1067 		Idx *p_match_first)
1068 {
1069   re_dfa_t *const dfa = mctx->dfa;
1070   reg_errcode_t err;
1071   Idx match = 0;
1072   Idx match_last = REG_MISSING;
1073   Idx cur_str_idx = re_string_cur_idx (&mctx->input);
1074   re_dfastate_t *cur_state;
1075   bool at_init_state = p_match_first != NULL;
1076   Idx next_start_idx = cur_str_idx;
1077 
1078   err = REG_NOERROR;
1079   cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1080   /* An initial state must not be NULL (invalid).  */
1081   if (BE (cur_state == NULL, 0))
1082     {
1083       assert (err == REG_ESPACE);
1084       return REG_ERROR;
1085     }
1086 
1087   if (mctx->state_log != NULL)
1088     {
1089       mctx->state_log[cur_str_idx] = cur_state;
1090 
1091       /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1092 	 later.  E.g. Processing back references.  */
1093       if (BE (dfa->nbackref, 0))
1094 	{
1095 	  at_init_state = false;
1096 	  err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1097 	  if (BE (err != REG_NOERROR, 0))
1098 	    return err;
1099 
1100 	  if (cur_state->has_backref)
1101 	    {
1102 	      err = transit_state_bkref (mctx, &cur_state->nodes);
1103 	      if (BE (err != REG_NOERROR, 0))
1104 	        return err;
1105 	    }
1106 	}
1107     }
1108 
1109   /* If the RE accepts NULL string.  */
1110   if (BE (cur_state->halt, 0))
1111     {
1112       if (!cur_state->has_constraint
1113 	  || check_halt_state_context (mctx, cur_state, cur_str_idx))
1114 	{
1115 	  if (!fl_longest_match)
1116 	    return cur_str_idx;
1117 	  else
1118 	    {
1119 	      match_last = cur_str_idx;
1120 	      match = 1;
1121 	    }
1122 	}
1123     }
1124 
1125   while (!re_string_eoi (&mctx->input))
1126     {
1127       re_dfastate_t *old_state = cur_state;
1128       Idx next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1129 
1130       if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1131           || (BE (next_char_idx >= mctx->input.valid_len, 0)
1132               && mctx->input.valid_len < mctx->input.len))
1133         {
1134           err = extend_buffers (mctx);
1135           if (BE (err != REG_NOERROR, 0))
1136 	    {
1137 	      assert (err == REG_ESPACE);
1138 	      return REG_ERROR;
1139 	    }
1140         }
1141 
1142       cur_state = transit_state (&err, mctx, cur_state);
1143       if (mctx->state_log != NULL)
1144 	cur_state = merge_state_with_log (&err, mctx, cur_state);
1145 
1146       if (cur_state == NULL)
1147 	{
1148 	  /* Reached the invalid state or an error.  Try to recover a valid
1149 	     state using the state log, if available and if we have not
1150 	     already found a valid (even if not the longest) match.  */
1151 	  if (BE (err != REG_NOERROR, 0))
1152 	    return REG_ERROR;
1153 
1154 	  if (mctx->state_log == NULL
1155 	      || (match && !fl_longest_match)
1156 	      || (cur_state = find_recover_state (&err, mctx)) == NULL)
1157 	    break;
1158 	}
1159 
1160       if (BE (at_init_state, 0))
1161 	{
1162 	  if (old_state == cur_state)
1163 	    next_start_idx = next_char_idx;
1164 	  else
1165 	    at_init_state = false;
1166 	}
1167 
1168       if (cur_state->halt)
1169 	{
1170 	  /* Reached a halt state.
1171 	     Check the halt state can satisfy the current context.  */
1172 	  if (!cur_state->has_constraint
1173 	      || check_halt_state_context (mctx, cur_state,
1174 					   re_string_cur_idx (&mctx->input)))
1175 	    {
1176 	      /* We found an appropriate halt state.  */
1177 	      match_last = re_string_cur_idx (&mctx->input);
1178 	      match = 1;
1179 
1180 	      /* We found a match, do not modify match_first below.  */
1181 	      p_match_first = NULL;
1182 	      if (!fl_longest_match)
1183 		break;
1184 	    }
1185 	}
1186     }
1187 
1188   if (p_match_first)
1189     *p_match_first += next_start_idx;
1190 
1191   return match_last;
1192 }
1193 
1194 /* Check NODE match the current context.  */
1195 
1196 static bool
1197 internal_function
check_halt_node_context(const re_dfa_t * dfa,Idx node,unsigned int context)1198 check_halt_node_context (const re_dfa_t *dfa, Idx node, unsigned int context)
1199 {
1200   re_token_type_t type = dfa->nodes[node].type;
1201   unsigned int constraint = dfa->nodes[node].constraint;
1202   if (type != END_OF_RE)
1203     return false;
1204   if (!constraint)
1205     return true;
1206   if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1207     return false;
1208   return true;
1209 }
1210 
1211 /* Check the halt state STATE match the current context.
1212    Return 0 if not match, if the node, STATE has, is a halt node and
1213    match the context, return the node.  */
1214 
1215 static Idx
1216 internal_function
check_halt_state_context(const re_match_context_t * mctx,const re_dfastate_t * state,Idx idx)1217 check_halt_state_context (const re_match_context_t *mctx,
1218 			  const re_dfastate_t *state, Idx idx)
1219 {
1220   Idx i;
1221   unsigned int context;
1222 #ifdef DEBUG
1223   assert (state->halt);
1224 #endif
1225   context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1226   for (i = 0; i < state->nodes.nelem; ++i)
1227     if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1228       return state->nodes.elems[i];
1229   return 0;
1230 }
1231 
1232 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1233    corresponding to the DFA).
1234    Return the destination node, and update EPS_VIA_NODES;
1235    return REG_MISSING in case of errors.  */
1236 
1237 static Idx
1238 internal_function
proceed_next_node(const re_match_context_t * mctx,Idx nregs,regmatch_t * regs,Idx * pidx,Idx node,re_node_set * eps_via_nodes,struct re_fail_stack_t * fs)1239 proceed_next_node (const re_match_context_t *mctx,
1240 		   Idx nregs, regmatch_t *regs, Idx *pidx, Idx node,
1241 		   re_node_set *eps_via_nodes, struct re_fail_stack_t *fs)
1242 {
1243   re_dfa_t *const dfa = mctx->dfa;
1244   Idx i;
1245   bool ok;
1246   if (IS_EPSILON_NODE (dfa->nodes[node].type))
1247     {
1248       re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1249       re_node_set *edests = &dfa->edests[node];
1250       Idx dest_node;
1251       ok = re_node_set_insert (eps_via_nodes, node);
1252       if (BE (! ok, 0))
1253 	return REG_ERROR;
1254       /* Pick up a valid destination, or return REG_MISSING if none
1255 	 is found.  */
1256       for (dest_node = REG_MISSING, i = 0; i < edests->nelem; ++i)
1257 	{
1258 	  Idx candidate = edests->elems[i];
1259 	  if (!re_node_set_contains (cur_nodes, candidate))
1260 	    continue;
1261           if (dest_node == REG_MISSING)
1262 	    dest_node = candidate;
1263 
1264           else
1265 	    {
1266 	      /* In order to avoid infinite loop like "(a*)*", return the second
1267 	         epsilon-transition if the first was already considered.  */
1268 	      if (re_node_set_contains (eps_via_nodes, dest_node))
1269 	        return candidate;
1270 
1271 	      /* Otherwise, push the second epsilon-transition on the fail stack.  */
1272 	      else if (fs != NULL
1273 		       && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1274 				           eps_via_nodes))
1275 		return REG_ERROR;
1276 
1277 	      /* We know we are going to exit.  */
1278 	      break;
1279 	    }
1280 	}
1281       return dest_node;
1282     }
1283   else
1284     {
1285       Idx naccepted = 0;
1286       re_token_type_t type = dfa->nodes[node].type;
1287 
1288 #ifdef RE_ENABLE_I18N
1289       if (dfa->nodes[node].accept_mb)
1290 	naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1291       else
1292 #endif /* RE_ENABLE_I18N */
1293       if (type == OP_BACK_REF)
1294 	{
1295 	  Idx subexp_idx = dfa->nodes[node].opr.idx + 1;
1296 	  naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1297 	  if (fs != NULL)
1298 	    {
1299 	      if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1300 		return REG_MISSING;
1301 	      else if (naccepted)
1302 		{
1303 		  char *buf = (char *) re_string_get_buffer (&mctx->input);
1304 		  if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1305 			      naccepted) != 0)
1306 		    return REG_MISSING;
1307 		}
1308 	    }
1309 
1310 	  if (naccepted == 0)
1311 	    {
1312 	      Idx dest_node;
1313 	      ok = re_node_set_insert (eps_via_nodes, node);
1314 	      if (BE (! ok, 0))
1315 		return REG_ERROR;
1316 	      dest_node = dfa->edests[node].elems[0];
1317 	      if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1318 					dest_node))
1319 		return dest_node;
1320 	    }
1321 	}
1322 
1323       if (naccepted != 0
1324 	  || check_node_accept (mctx, dfa->nodes + node, *pidx))
1325 	{
1326 	  Idx dest_node = dfa->nexts[node];
1327 	  *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1328 	  if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1329 		     || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1330 					       dest_node)))
1331 	    return REG_MISSING;
1332 	  re_node_set_empty (eps_via_nodes);
1333 	  return dest_node;
1334 	}
1335     }
1336   return REG_MISSING;
1337 }
1338 
1339 static reg_errcode_t
1340 internal_function
push_fail_stack(struct re_fail_stack_t * fs,Idx str_idx,Idx dest_node,Idx nregs,regmatch_t * regs,re_node_set * eps_via_nodes)1341 push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node,
1342 		 Idx nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1343 {
1344   reg_errcode_t err;
1345   Idx num = fs->num++;
1346   if (fs->num == fs->alloc)
1347     {
1348       struct re_fail_stack_ent_t *new_array =
1349 	re_x2realloc (fs->stack, struct re_fail_stack_ent_t, &fs->alloc);
1350       if (new_array == NULL)
1351 	return REG_ESPACE;
1352       fs->stack = new_array;
1353     }
1354   fs->stack[num].idx = str_idx;
1355   fs->stack[num].node = dest_node;
1356   fs->stack[num].regs = re_xmalloc (regmatch_t, nregs);
1357   if (fs->stack[num].regs == NULL)
1358     return REG_ESPACE;
1359   memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1360   err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1361   return err;
1362 }
1363 
1364 static Idx
1365 internal_function
pop_fail_stack(struct re_fail_stack_t * fs,Idx * pidx,Idx nregs,regmatch_t * regs,re_node_set * eps_via_nodes)1366 pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx,
1367 		Idx nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1368 {
1369   Idx num = --fs->num;
1370   assert (REG_VALID_INDEX (num));
1371   *pidx = fs->stack[num].idx;
1372   memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1373   re_node_set_free (eps_via_nodes);
1374   re_free (fs->stack[num].regs);
1375   *eps_via_nodes = fs->stack[num].eps_via_nodes;
1376   return fs->stack[num].node;
1377 }
1378 
1379 /* Set the positions where the subexpressions are starts/ends to registers
1380    PMATCH.
1381    Note: We assume that pmatch[0] is already set, and
1382    pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1383 
1384 static reg_errcode_t
1385 internal_function
set_regs(const regex_t * preg,const re_match_context_t * mctx,size_t nmatch,regmatch_t * pmatch,bool fl_backtrack)1386 set_regs (const regex_t *preg, const re_match_context_t *mctx,
1387 	  size_t nmatch, regmatch_t *pmatch, bool fl_backtrack)
1388 {
1389   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
1390   Idx idx, cur_node;
1391   re_node_set eps_via_nodes;
1392   struct re_fail_stack_t *fs;
1393   struct re_fail_stack_t fs_body = { 0, 2, NULL };
1394   regmatch_t *prev_idx_match;
1395   bool prev_idx_match_malloced = false;
1396 
1397 #ifdef DEBUG
1398   assert (nmatch > 1);
1399   assert (mctx->state_log != NULL);
1400 #endif
1401   if (fl_backtrack)
1402     {
1403       fs = &fs_body;
1404       fs->stack = re_xmalloc (struct re_fail_stack_ent_t, fs->alloc);
1405       if (fs->stack == NULL)
1406 	return REG_ESPACE;
1407     }
1408   else
1409     fs = NULL;
1410 
1411   cur_node = dfa->init_node;
1412   re_node_set_init_empty (&eps_via_nodes);
1413 
1414   if (re_alloc_oversized (nmatch, sizeof (regmatch_t)))
1415     {
1416       free_fail_stack_return (fs);
1417       return REG_ESPACE;
1418     }
1419 #ifndef __SSP__
1420   if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1421     prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1422   else
1423 #endif
1424     {
1425       prev_idx_match = re_malloc (regmatch_t, nmatch);
1426       if (prev_idx_match == NULL)
1427 	{
1428 	  free_fail_stack_return (fs);
1429 	  return REG_ESPACE;
1430 	}
1431       prev_idx_match_malloced = true;
1432     }
1433   memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1434 
1435   for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1436     {
1437       update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1438 
1439       if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1440 	{
1441 	  Idx reg_idx;
1442 	  if (fs)
1443 	    {
1444 	      for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1445 		if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1446 		  break;
1447 	      if (reg_idx == nmatch)
1448 		{
1449 		  re_node_set_free (&eps_via_nodes);
1450 		  if (prev_idx_match_malloced)
1451 		    re_free (prev_idx_match);
1452 		  return free_fail_stack_return (fs);
1453 		}
1454 	      cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1455 					 &eps_via_nodes);
1456 	    }
1457 	  else
1458 	    {
1459 	      re_node_set_free (&eps_via_nodes);
1460 	      if (prev_idx_match_malloced)
1461 		re_free (prev_idx_match);
1462 	      return REG_NOERROR;
1463 	    }
1464 	}
1465 
1466       /* Proceed to next node.  */
1467       cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1468 				    &eps_via_nodes, fs);
1469 
1470       if (BE (! REG_VALID_INDEX (cur_node), 0))
1471 	{
1472 	  if (BE (cur_node == REG_ERROR, 0))
1473 	    {
1474 	      re_node_set_free (&eps_via_nodes);
1475 	      if (prev_idx_match_malloced)
1476 		re_free (prev_idx_match);
1477 	      free_fail_stack_return (fs);
1478 	      return REG_ESPACE;
1479 	    }
1480 	  if (fs)
1481 	    cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1482 				       &eps_via_nodes);
1483 	  else
1484 	    {
1485 	      re_node_set_free (&eps_via_nodes);
1486 	      if (prev_idx_match_malloced)
1487 		re_free (prev_idx_match);
1488 	      return REG_NOMATCH;
1489 	    }
1490 	}
1491     }
1492   re_node_set_free (&eps_via_nodes);
1493   if (prev_idx_match_malloced)
1494     re_free (prev_idx_match);
1495   return free_fail_stack_return (fs);
1496 }
1497 
1498 static reg_errcode_t
1499 internal_function
free_fail_stack_return(struct re_fail_stack_t * fs)1500 free_fail_stack_return (struct re_fail_stack_t *fs)
1501 {
1502   if (fs)
1503     {
1504       Idx fs_idx;
1505       for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1506 	{
1507 	  re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1508 	  re_free (fs->stack[fs_idx].regs);
1509 	}
1510       re_free (fs->stack);
1511     }
1512   return REG_NOERROR;
1513 }
1514 
1515 static void
1516 internal_function
update_regs(re_dfa_t * dfa,regmatch_t * pmatch,regmatch_t * prev_idx_match,Idx cur_node,Idx cur_idx,Idx nmatch)1517 update_regs (re_dfa_t *dfa, regmatch_t *pmatch, regmatch_t *prev_idx_match,
1518 	     Idx cur_node, Idx cur_idx, Idx nmatch)
1519 {
1520   int type = dfa->nodes[cur_node].type;
1521   if (type == OP_OPEN_SUBEXP)
1522     {
1523       Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1524 
1525       /* We are at the first node of this sub expression.  */
1526       if (reg_num < nmatch)
1527 	{
1528 	  pmatch[reg_num].rm_so = cur_idx;
1529 	  pmatch[reg_num].rm_eo = -1;
1530 	}
1531     }
1532   else if (type == OP_CLOSE_SUBEXP)
1533     {
1534       Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1535       if (reg_num < nmatch)
1536 	{
1537 	  /* We are at the last node of this sub expression.  */
1538 	  if (pmatch[reg_num].rm_so < cur_idx)
1539 	    {
1540 	      pmatch[reg_num].rm_eo = cur_idx;
1541 	      /* This is a non-empty match or we are not inside an optional
1542 		 subexpression.  Accept this right away.  */
1543 	      memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1544 	    }
1545 	  else
1546 	    {
1547 	      if (dfa->nodes[cur_node].opt_subexp
1548 		  && prev_idx_match[reg_num].rm_so != -1)
1549 		/* We transited through an empty match for an optional
1550 		   subexpression, like (a?)*, and this is not the subexp's
1551 		   first match.  Copy back the old content of the registers
1552 		   so that matches of an inner subexpression are undone as
1553 		   well, like in ((a?))*.  */
1554 		memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1555 	      else
1556 		/* We completed a subexpression, but it may be part of
1557 		   an optional one, so do not update PREV_IDX_MATCH.  */
1558 		pmatch[reg_num].rm_eo = cur_idx;
1559 	    }
1560 	}
1561     }
1562 }
1563 
1564 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1565    and sift the nodes in each states according to the following rules.
1566    Updated state_log will be wrote to STATE_LOG.
1567 
1568    Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1569      1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1570 	If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1571 	the LAST_NODE, we throw away the node `a'.
1572      2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1573 	string `s' and transit to `b':
1574 	i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1575 	   away the node `a'.
1576 	ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1577 	    thrown away, we throw away the node `a'.
1578      3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1579 	i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1580 	   node `a'.
1581 	ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1582 	    we throw away the node `a'.  */
1583 
1584 #define STATE_NODE_CONTAINS(state,node) \
1585   ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1586 
1587 static reg_errcode_t
1588 internal_function
sift_states_backward(re_match_context_t * mctx,re_sift_context_t * sctx)1589 sift_states_backward (re_match_context_t *mctx, re_sift_context_t *sctx)
1590 {
1591   reg_errcode_t err;
1592   int null_cnt = 0;
1593   Idx str_idx = sctx->last_str_idx;
1594   re_node_set cur_dest;
1595 
1596 #ifdef DEBUG
1597   assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1598 #endif
1599 
1600   /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1601      transit to the last_node and the last_node itself.  */
1602   err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1603   if (BE (err != REG_NOERROR, 0))
1604     return err;
1605   err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1606   if (BE (err != REG_NOERROR, 0))
1607     goto free_return;
1608 
1609   /* Then check each states in the state_log.  */
1610   while (str_idx > 0)
1611     {
1612       /* Update counters.  */
1613       null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1614       if (null_cnt > mctx->max_mb_elem_len)
1615 	{
1616 	  memset (sctx->sifted_states, '\0',
1617 		  sizeof (re_dfastate_t *) * str_idx);
1618 	  re_node_set_free (&cur_dest);
1619 	  return REG_NOERROR;
1620 	}
1621       re_node_set_empty (&cur_dest);
1622       --str_idx;
1623 
1624       if (mctx->state_log[str_idx])
1625 	{
1626 	  err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1627           if (BE (err != REG_NOERROR, 0))
1628 	    goto free_return;
1629 	}
1630 
1631       /* Add all the nodes which satisfy the following conditions:
1632 	 - It can epsilon transit to a node in CUR_DEST.
1633 	 - It is in CUR_SRC.
1634 	 And update state_log.  */
1635       err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1636       if (BE (err != REG_NOERROR, 0))
1637 	goto free_return;
1638     }
1639   err = REG_NOERROR;
1640  free_return:
1641   re_node_set_free (&cur_dest);
1642   return err;
1643 }
1644 
1645 static reg_errcode_t
1646 internal_function
build_sifted_states(re_match_context_t * mctx,re_sift_context_t * sctx,Idx str_idx,re_node_set * cur_dest)1647 build_sifted_states (re_match_context_t *mctx, re_sift_context_t *sctx,
1648 		     Idx str_idx, re_node_set *cur_dest)
1649 {
1650   re_dfa_t *const dfa = mctx->dfa;
1651   re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1652   Idx i;
1653 
1654   /* Then build the next sifted state.
1655      We build the next sifted state on `cur_dest', and update
1656      `sifted_states[str_idx]' with `cur_dest'.
1657      Note:
1658      `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1659      `cur_src' points the node_set of the old `state_log[str_idx]'
1660      (with the epsilon nodes pre-filtered out).  */
1661   for (i = 0; i < cur_src->nelem; i++)
1662     {
1663       Idx prev_node = cur_src->elems[i];
1664       int naccepted = 0;
1665       bool ok;
1666 
1667 #ifdef DEBUG
1668       re_token_type_t type = dfa->nodes[prev_node].type;
1669       assert (!IS_EPSILON_NODE (type));
1670 #endif
1671 #ifdef RE_ENABLE_I18N
1672       /* If the node may accept `multi byte'.  */
1673       if (dfa->nodes[prev_node].accept_mb)
1674 	naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1675 					 str_idx, sctx->last_str_idx);
1676 #endif /* RE_ENABLE_I18N */
1677 
1678       /* We don't check backreferences here.
1679 	 See update_cur_sifted_state().  */
1680       if (!naccepted
1681 	  && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1682 	  && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1683 				  dfa->nexts[prev_node]))
1684 	naccepted = 1;
1685 
1686       if (naccepted == 0)
1687 	continue;
1688 
1689       if (sctx->limits.nelem)
1690 	{
1691 	  Idx to_idx = str_idx + naccepted;
1692 	  if (check_dst_limits (mctx, &sctx->limits,
1693 				dfa->nexts[prev_node], to_idx,
1694 				prev_node, str_idx))
1695 	    continue;
1696 	}
1697       ok = re_node_set_insert (cur_dest, prev_node);
1698       if (BE (! ok, 0))
1699 	return REG_ESPACE;
1700     }
1701 
1702   return REG_NOERROR;
1703 }
1704 
1705 /* Helper functions.  */
1706 
1707 static reg_errcode_t
1708 internal_function
clean_state_log_if_needed(re_match_context_t * mctx,Idx next_state_log_idx)1709 clean_state_log_if_needed (re_match_context_t *mctx, Idx next_state_log_idx)
1710 {
1711   Idx top = mctx->state_log_top;
1712 
1713   if (next_state_log_idx >= mctx->input.bufs_len
1714       || (next_state_log_idx >= mctx->input.valid_len
1715 	  && mctx->input.valid_len < mctx->input.len))
1716     {
1717       reg_errcode_t err;
1718       err = extend_buffers (mctx);
1719       if (BE (err != REG_NOERROR, 0))
1720 	return err;
1721     }
1722 
1723   if (top < next_state_log_idx)
1724     {
1725       memset (mctx->state_log + top + 1, '\0',
1726 	      sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1727       mctx->state_log_top = next_state_log_idx;
1728     }
1729   return REG_NOERROR;
1730 }
1731 
1732 static reg_errcode_t
1733 internal_function
merge_state_array(re_dfa_t * dfa,re_dfastate_t ** dst,re_dfastate_t ** src,Idx num)1734 merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst, re_dfastate_t **src,
1735 		   Idx num)
1736 {
1737   Idx st_idx;
1738   reg_errcode_t err;
1739   for (st_idx = 0; st_idx < num; ++st_idx)
1740     {
1741       if (dst[st_idx] == NULL)
1742 	dst[st_idx] = src[st_idx];
1743       else if (src[st_idx] != NULL)
1744 	{
1745 	  re_node_set merged_set;
1746 	  err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1747 					&src[st_idx]->nodes);
1748 	  if (BE (err != REG_NOERROR, 0))
1749 	    return err;
1750 	  dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1751 	  re_node_set_free (&merged_set);
1752 	  if (BE (err != REG_NOERROR, 0))
1753 	    return err;
1754 	}
1755     }
1756   return REG_NOERROR;
1757 }
1758 
1759 static reg_errcode_t
1760 internal_function
update_cur_sifted_state(re_match_context_t * mctx,re_sift_context_t * sctx,Idx str_idx,re_node_set * dest_nodes)1761 update_cur_sifted_state (re_match_context_t *mctx, re_sift_context_t *sctx,
1762 			 Idx str_idx, re_node_set *dest_nodes)
1763 {
1764   re_dfa_t *const dfa = mctx->dfa;
1765   reg_errcode_t err;
1766   const re_node_set *candidates;
1767   candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1768 		: &mctx->state_log[str_idx]->nodes);
1769 
1770   if (dest_nodes->nelem == 0)
1771     sctx->sifted_states[str_idx] = NULL;
1772   else
1773     {
1774       if (candidates)
1775 	{
1776 	  /* At first, add the nodes which can epsilon transit to a node in
1777 	     DEST_NODE.  */
1778 	  err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1779 	  if (BE (err != REG_NOERROR, 0))
1780 	    return err;
1781 
1782 	  /* Then, check the limitations in the current sift_context.  */
1783 	  if (sctx->limits.nelem)
1784 	    {
1785 	      err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1786 					 mctx->bkref_ents, str_idx);
1787 	      if (BE (err != REG_NOERROR, 0))
1788 		return err;
1789 	    }
1790 	}
1791 
1792       sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1793       if (BE (err != REG_NOERROR, 0))
1794 	return err;
1795     }
1796 
1797   if (candidates && mctx->state_log[str_idx]->has_backref)
1798     {
1799       err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1800       if (BE (err != REG_NOERROR, 0))
1801 	return err;
1802     }
1803   return REG_NOERROR;
1804 }
1805 
1806 static reg_errcode_t
1807 internal_function
add_epsilon_src_nodes(re_dfa_t * dfa,re_node_set * dest_nodes,const re_node_set * candidates)1808 add_epsilon_src_nodes (re_dfa_t *dfa, re_node_set *dest_nodes,
1809 		       const re_node_set *candidates)
1810 {
1811   reg_errcode_t err = REG_NOERROR;
1812   Idx i;
1813 
1814   re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1815   if (BE (err != REG_NOERROR, 0))
1816     return err;
1817 
1818   if (!state->inveclosure.alloc)
1819     {
1820       err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1821       if (BE (err != REG_NOERROR, 0))
1822         return REG_ESPACE;
1823       for (i = 0; i < dest_nodes->nelem; i++)
1824         re_node_set_merge (&state->inveclosure,
1825 			   dfa->inveclosures + dest_nodes->elems[i]);
1826     }
1827   return re_node_set_add_intersect (dest_nodes, candidates,
1828 				    &state->inveclosure);
1829 }
1830 
1831 static reg_errcode_t
1832 internal_function
sub_epsilon_src_nodes(re_dfa_t * dfa,Idx node,re_node_set * dest_nodes,const re_node_set * candidates)1833 sub_epsilon_src_nodes (re_dfa_t *dfa, Idx node, re_node_set *dest_nodes,
1834 		       const re_node_set *candidates)
1835 {
1836     Idx ecl_idx;
1837     reg_errcode_t err;
1838     re_node_set *inv_eclosure = dfa->inveclosures + node;
1839     re_node_set except_nodes;
1840     re_node_set_init_empty (&except_nodes);
1841     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1842       {
1843 	Idx cur_node = inv_eclosure->elems[ecl_idx];
1844 	if (cur_node == node)
1845 	  continue;
1846 	if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1847 	  {
1848 	    Idx edst1 = dfa->edests[cur_node].elems[0];
1849 	    Idx edst2 = ((dfa->edests[cur_node].nelem > 1)
1850 			 ? dfa->edests[cur_node].elems[1] : REG_MISSING);
1851 	    if ((!re_node_set_contains (inv_eclosure, edst1)
1852 		 && re_node_set_contains (dest_nodes, edst1))
1853 		|| (REG_VALID_NONZERO_INDEX (edst2)
1854 		    && !re_node_set_contains (inv_eclosure, edst2)
1855 		    && re_node_set_contains (dest_nodes, edst2)))
1856 	      {
1857 		err = re_node_set_add_intersect (&except_nodes, candidates,
1858 						 dfa->inveclosures + cur_node);
1859 		if (BE (err != REG_NOERROR, 0))
1860 		  {
1861 		    re_node_set_free (&except_nodes);
1862 		    return err;
1863 		  }
1864 	      }
1865 	  }
1866       }
1867     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1868       {
1869 	Idx cur_node = inv_eclosure->elems[ecl_idx];
1870 	if (!re_node_set_contains (&except_nodes, cur_node))
1871 	  {
1872 	    Idx idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1873 	    re_node_set_remove_at (dest_nodes, idx);
1874 	  }
1875       }
1876     re_node_set_free (&except_nodes);
1877     return REG_NOERROR;
1878 }
1879 
1880 static bool
1881 internal_function
check_dst_limits(const re_match_context_t * mctx,const re_node_set * limits,Idx dst_node,Idx dst_idx,Idx src_node,Idx src_idx)1882 check_dst_limits (const re_match_context_t *mctx, const re_node_set *limits,
1883 		  Idx dst_node, Idx dst_idx, Idx src_node, Idx src_idx)
1884 {
1885   re_dfa_t *const dfa = mctx->dfa;
1886   Idx lim_idx, src_pos, dst_pos;
1887 
1888   Idx dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1889   Idx src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1890   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1891     {
1892       Idx subexp_idx;
1893       struct re_backref_cache_entry *ent;
1894       ent = mctx->bkref_ents + limits->elems[lim_idx];
1895       subexp_idx = dfa->nodes[ent->node].opr.idx;
1896 
1897       dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1898 					   subexp_idx, dst_node, dst_idx,
1899 					   dst_bkref_idx);
1900       src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1901 					   subexp_idx, src_node, src_idx,
1902 					   src_bkref_idx);
1903 
1904       /* In case of:
1905 	 <src> <dst> ( <subexp> )
1906 	 ( <subexp> ) <src> <dst>
1907 	 ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1908       if (src_pos == dst_pos)
1909 	continue; /* This is unrelated limitation.  */
1910       else
1911 	return true;
1912     }
1913   return false;
1914 }
1915 
1916 static int
1917 internal_function
check_dst_limits_calc_pos_1(const re_match_context_t * mctx,int boundaries,Idx subexp_idx,Idx from_node,Idx bkref_idx)1918 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1919 			     Idx subexp_idx, Idx from_node, Idx bkref_idx)
1920 {
1921   re_dfa_t *const dfa = mctx->dfa;
1922   re_node_set *eclosures = dfa->eclosures + from_node;
1923   Idx node_idx;
1924 
1925   /* Else, we are on the boundary: examine the nodes on the epsilon
1926      closure.  */
1927   for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1928     {
1929       Idx node = eclosures->elems[node_idx];
1930       switch (dfa->nodes[node].type)
1931 	{
1932 	case OP_BACK_REF:
1933 	  if (bkref_idx != REG_MISSING)
1934 	    {
1935 	      struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1936 	      do
1937 	        {
1938 		  Idx dst;
1939 		  int cpos;
1940 
1941 		  if (ent->node != node)
1942 		    continue;
1943 
1944 		  if (subexp_idx < BITSET_WORD_BITS
1945 		      && !(ent->eps_reachable_subexps_map
1946 			   & ((bitset_word) 1 << subexp_idx)))
1947 		    continue;
1948 
1949 		  /* Recurse trying to reach the OP_OPEN_SUBEXP and
1950 		     OP_CLOSE_SUBEXP cases below.  But, if the
1951 		     destination node is the same node as the source
1952 		     node, don't recurse because it would cause an
1953 		     infinite loop: a regex that exhibits this behavior
1954 		     is ()\1*\1*  */
1955 		  dst = dfa->edests[node].elems[0];
1956 		  if (dst == from_node)
1957 		    {
1958 		      if (boundaries & 1)
1959 		        return -1;
1960 		      else /* if (boundaries & 2) */
1961 		        return 0;
1962 		    }
1963 
1964 		  cpos =
1965 		    check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1966 						 dst, bkref_idx);
1967 		  if (cpos == -1 /* && (boundaries & 1) */)
1968 		    return -1;
1969 		  if (cpos == 0 && (boundaries & 2))
1970 		    return 0;
1971 
1972 		  if (subexp_idx < BITSET_WORD_BITS)
1973 		    ent->eps_reachable_subexps_map &=
1974 		      ~ ((bitset_word) 1 << subexp_idx);
1975 	        }
1976 	      while (ent++->more);
1977 	    }
1978 	  break;
1979 
1980 	case OP_OPEN_SUBEXP:
1981 	  if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1982 	    return -1;
1983 	  break;
1984 
1985 	case OP_CLOSE_SUBEXP:
1986 	  if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1987 	    return 0;
1988 	  break;
1989 
1990 	default:
1991 	    break;
1992 	}
1993     }
1994 
1995   return (boundaries & 2) ? 1 : 0;
1996 }
1997 
1998 static int
1999 internal_function
check_dst_limits_calc_pos(const re_match_context_t * mctx,Idx limit,Idx subexp_idx,Idx from_node,Idx str_idx,Idx bkref_idx)2000 check_dst_limits_calc_pos (const re_match_context_t *mctx,
2001 			   Idx limit, Idx subexp_idx,
2002 			   Idx from_node, Idx str_idx, Idx bkref_idx)
2003 {
2004   struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2005   int boundaries;
2006 
2007   /* If we are outside the range of the subexpression, return -1 or 1.  */
2008   if (str_idx < lim->subexp_from)
2009     return -1;
2010 
2011   if (lim->subexp_to < str_idx)
2012     return 1;
2013 
2014   /* If we are within the subexpression, return 0.  */
2015   boundaries = (str_idx == lim->subexp_from);
2016   boundaries |= (str_idx == lim->subexp_to) << 1;
2017   if (boundaries == 0)
2018     return 0;
2019 
2020   /* Else, examine epsilon closure.  */
2021   return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2022 				      from_node, bkref_idx);
2023 }
2024 
2025 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2026    which are against limitations from DEST_NODES. */
2027 
2028 static reg_errcode_t
2029 internal_function
check_subexp_limits(re_dfa_t * dfa,re_node_set * dest_nodes,const re_node_set * candidates,re_node_set * limits,struct re_backref_cache_entry * bkref_ents,Idx str_idx)2030 check_subexp_limits (re_dfa_t *dfa, re_node_set *dest_nodes,
2031 		     const re_node_set *candidates, re_node_set *limits,
2032 		     struct re_backref_cache_entry *bkref_ents, Idx str_idx)
2033 {
2034   reg_errcode_t err;
2035   Idx node_idx, lim_idx;
2036 
2037   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2038     {
2039       Idx subexp_idx;
2040       struct re_backref_cache_entry *ent;
2041       ent = bkref_ents + limits->elems[lim_idx];
2042 
2043       if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2044 	continue; /* This is unrelated limitation.  */
2045 
2046       subexp_idx = dfa->nodes[ent->node].opr.idx;
2047       if (ent->subexp_to == str_idx)
2048 	{
2049 	  Idx ops_node = REG_MISSING;
2050 	  Idx cls_node = REG_MISSING;
2051 	  for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2052 	    {
2053 	      Idx node = dest_nodes->elems[node_idx];
2054 	      re_token_type_t type = dfa->nodes[node].type;
2055 	      if (type == OP_OPEN_SUBEXP
2056 		  && subexp_idx == dfa->nodes[node].opr.idx)
2057 		ops_node = node;
2058 	      else if (type == OP_CLOSE_SUBEXP
2059 		       && subexp_idx == dfa->nodes[node].opr.idx)
2060 		cls_node = node;
2061 	    }
2062 
2063 	  /* Check the limitation of the open subexpression.  */
2064 	  /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
2065 	  if (REG_VALID_INDEX (ops_node))
2066 	    {
2067 	      err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2068 					   candidates);
2069 	      if (BE (err != REG_NOERROR, 0))
2070 		return err;
2071 	    }
2072 
2073 	  /* Check the limitation of the close subexpression.  */
2074 	  if (REG_VALID_INDEX (cls_node))
2075 	    for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2076 	      {
2077 		Idx node = dest_nodes->elems[node_idx];
2078 		if (!re_node_set_contains (dfa->inveclosures + node,
2079 					   cls_node)
2080 		    && !re_node_set_contains (dfa->eclosures + node,
2081 					      cls_node))
2082 		  {
2083 		    /* It is against this limitation.
2084 		       Remove it form the current sifted state.  */
2085 		    err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2086 						 candidates);
2087 		    if (BE (err != REG_NOERROR, 0))
2088 		      return err;
2089 		    --node_idx;
2090 		  }
2091 	      }
2092 	}
2093       else /* (ent->subexp_to != str_idx)  */
2094 	{
2095 	  for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2096 	    {
2097 	      Idx node = dest_nodes->elems[node_idx];
2098 	      re_token_type_t type = dfa->nodes[node].type;
2099 	      if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2100 		{
2101 		  if (subexp_idx != dfa->nodes[node].opr.idx)
2102 		    continue;
2103 		  /* It is against this limitation.
2104 		     Remove it form the current sifted state.  */
2105 		  err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2106 					       candidates);
2107 		  if (BE (err != REG_NOERROR, 0))
2108 		    return err;
2109 		}
2110 	    }
2111 	}
2112     }
2113   return REG_NOERROR;
2114 }
2115 
2116 static reg_errcode_t
2117 internal_function
sift_states_bkref(re_match_context_t * mctx,re_sift_context_t * sctx,Idx str_idx,const re_node_set * candidates)2118 sift_states_bkref (re_match_context_t *mctx, re_sift_context_t *sctx,
2119 		   Idx str_idx, const re_node_set *candidates)
2120 {
2121   re_dfa_t *const dfa = mctx->dfa;
2122   reg_errcode_t err;
2123   Idx node_idx, node;
2124   re_sift_context_t local_sctx;
2125   Idx first_idx = search_cur_bkref_entry (mctx, str_idx);
2126 
2127   if (first_idx == REG_MISSING)
2128     return REG_NOERROR;
2129 
2130   local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2131 
2132   for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2133     {
2134       Idx enabled_idx;
2135       re_token_type_t type;
2136       struct re_backref_cache_entry *entry;
2137       node = candidates->elems[node_idx];
2138       type = dfa->nodes[node].type;
2139       /* Avoid infinite loop for the REs like "()\1+".  */
2140       if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2141 	continue;
2142       if (type != OP_BACK_REF)
2143 	continue;
2144 
2145       entry = mctx->bkref_ents + first_idx;
2146       enabled_idx = first_idx;
2147       do
2148 	{
2149 	  bool ok;
2150 	  Idx subexp_len, to_idx, dst_node;
2151 	  re_dfastate_t *cur_state;
2152 
2153 	  if (entry->node != node)
2154 	    continue;
2155 	  subexp_len = entry->subexp_to - entry->subexp_from;
2156 	  to_idx = str_idx + subexp_len;
2157 	  dst_node = (subexp_len ? dfa->nexts[node]
2158 		      : dfa->edests[node].elems[0]);
2159 
2160 	  if (to_idx > sctx->last_str_idx
2161 	      || sctx->sifted_states[to_idx] == NULL
2162 	      || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2163 	      || check_dst_limits (mctx, &sctx->limits, node,
2164 				   str_idx, dst_node, to_idx))
2165 	    continue;
2166 
2167 	  if (local_sctx.sifted_states == NULL)
2168 	    {
2169 	      local_sctx = *sctx;
2170 	      err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2171 	      if (BE (err != REG_NOERROR, 0))
2172 		goto free_return;
2173 	    }
2174 	  local_sctx.last_node = node;
2175 	  local_sctx.last_str_idx = str_idx;
2176 	  ok = re_node_set_insert (&local_sctx.limits, enabled_idx);
2177 	  if (BE (! ok, 0))
2178 	    {
2179 	      err = REG_ESPACE;
2180 	      goto free_return;
2181 	    }
2182 	  cur_state = local_sctx.sifted_states[str_idx];
2183 	  err = sift_states_backward (mctx, &local_sctx);
2184 	  if (BE (err != REG_NOERROR, 0))
2185 	    goto free_return;
2186 	  if (sctx->limited_states != NULL)
2187 	    {
2188 	      err = merge_state_array (dfa, sctx->limited_states,
2189 				       local_sctx.sifted_states,
2190 				       str_idx + 1);
2191 	      if (BE (err != REG_NOERROR, 0))
2192 		goto free_return;
2193 	    }
2194 	  local_sctx.sifted_states[str_idx] = cur_state;
2195 	  re_node_set_remove (&local_sctx.limits, enabled_idx);
2196 
2197 	  /* mctx->bkref_ents may have changed, reload the pointer.  */
2198           entry = mctx->bkref_ents + enabled_idx;
2199 	}
2200       while (enabled_idx++, entry++->more);
2201     }
2202   err = REG_NOERROR;
2203  free_return:
2204   if (local_sctx.sifted_states != NULL)
2205     {
2206       re_node_set_free (&local_sctx.limits);
2207     }
2208 
2209   return err;
2210 }
2211 
2212 
2213 #ifdef RE_ENABLE_I18N
2214 static int
2215 internal_function
sift_states_iter_mb(const re_match_context_t * mctx,re_sift_context_t * sctx,Idx node_idx,Idx str_idx,Idx max_str_idx)2216 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2217 		     Idx node_idx, Idx str_idx, Idx max_str_idx)
2218 {
2219   re_dfa_t *const dfa = mctx->dfa;
2220   int naccepted;
2221   /* Check the node can accept `multi byte'.  */
2222   naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2223   if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2224       !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2225 			    dfa->nexts[node_idx]))
2226     /* The node can't accept the `multi byte', or the
2227        destination was already thrown away, then the node
2228        could't accept the current input `multi byte'.   */
2229     naccepted = 0;
2230   /* Otherwise, it is sure that the node could accept
2231      `naccepted' bytes input.  */
2232   return naccepted;
2233 }
2234 #endif /* RE_ENABLE_I18N */
2235 
2236 
2237 /* Functions for state transition.  */
2238 
2239 /* Return the next state to which the current state STATE will transit by
2240    accepting the current input byte, and update STATE_LOG if necessary.
2241    If STATE can accept a multibyte char/collating element/back reference
2242    update the destination of STATE_LOG.  */
2243 
2244 static re_dfastate_t *
2245 internal_function
transit_state(reg_errcode_t * err,re_match_context_t * mctx,re_dfastate_t * state)2246 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2247 	       re_dfastate_t *state)
2248 {
2249   re_dfastate_t **trtable;
2250   unsigned char ch;
2251 
2252 #ifdef RE_ENABLE_I18N
2253   /* If the current state can accept multibyte.  */
2254   if (BE (state->accept_mb, 0))
2255     {
2256       *err = transit_state_mb (mctx, state);
2257       if (BE (*err != REG_NOERROR, 0))
2258 	return NULL;
2259     }
2260 #endif /* RE_ENABLE_I18N */
2261 
2262   /* Then decide the next state with the single byte.  */
2263 #if 0
2264   if (0)
2265     /* don't use transition table  */
2266     return transit_state_sb (err, mctx, state);
2267 #endif
2268 
2269   /* Use transition table  */
2270   ch = re_string_fetch_byte (&mctx->input);
2271   for (;;)
2272     {
2273       trtable = state->trtable;
2274       if (BE (trtable != NULL, 1))
2275 	return trtable[ch];
2276 
2277       trtable = state->word_trtable;
2278       if (BE (trtable != NULL, 1))
2279         {
2280 	  unsigned int context;
2281 	  context
2282 	    = re_string_context_at (&mctx->input,
2283 				    re_string_cur_idx (&mctx->input) - 1,
2284 				    mctx->eflags);
2285 	  if (IS_WORD_CONTEXT (context))
2286 	    return trtable[ch + SBC_MAX];
2287 	  else
2288 	    return trtable[ch];
2289 	}
2290 
2291       if (!build_trtable (mctx->dfa, state))
2292 	{
2293 	  *err = REG_ESPACE;
2294 	  return NULL;
2295 	}
2296 
2297       /* Retry, we now have a transition table.  */
2298     }
2299 }
2300 
2301 /* Update the state_log if we need */
2302 re_dfastate_t *
2303 internal_function
merge_state_with_log(reg_errcode_t * err,re_match_context_t * mctx,re_dfastate_t * next_state)2304 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2305 		      re_dfastate_t *next_state)
2306 {
2307   re_dfa_t *const dfa = mctx->dfa;
2308   Idx cur_idx = re_string_cur_idx (&mctx->input);
2309 
2310   if (cur_idx > mctx->state_log_top)
2311     {
2312       mctx->state_log[cur_idx] = next_state;
2313       mctx->state_log_top = cur_idx;
2314     }
2315   else if (mctx->state_log[cur_idx] == 0)
2316     {
2317       mctx->state_log[cur_idx] = next_state;
2318     }
2319   else
2320     {
2321       re_dfastate_t *pstate;
2322       unsigned int context;
2323       re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2324       /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2325          the destination of a multibyte char/collating element/
2326          back reference.  Then the next state is the union set of
2327          these destinations and the results of the transition table.  */
2328       pstate = mctx->state_log[cur_idx];
2329       log_nodes = pstate->entrance_nodes;
2330       if (next_state != NULL)
2331         {
2332           table_nodes = next_state->entrance_nodes;
2333           *err = re_node_set_init_union (&next_nodes, table_nodes,
2334 					     log_nodes);
2335           if (BE (*err != REG_NOERROR, 0))
2336 	    return NULL;
2337         }
2338       else
2339         next_nodes = *log_nodes;
2340       /* Note: We already add the nodes of the initial state,
2341 	 then we don't need to add them here.  */
2342 
2343       context = re_string_context_at (&mctx->input,
2344 				      re_string_cur_idx (&mctx->input) - 1,
2345 				      mctx->eflags);
2346       next_state = mctx->state_log[cur_idx]
2347         = re_acquire_state_context (err, dfa, &next_nodes, context);
2348       /* We don't need to check errors here, since the return value of
2349          this function is next_state and ERR is already set.  */
2350 
2351       if (table_nodes != NULL)
2352         re_node_set_free (&next_nodes);
2353     }
2354 
2355   if (BE (dfa->nbackref, 0) && next_state != NULL)
2356     {
2357       /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2358 	 later.  We must check them here, since the back references in the
2359 	 next state might use them.  */
2360       *err = check_subexp_matching_top (mctx, &next_state->nodes,
2361 					cur_idx);
2362       if (BE (*err != REG_NOERROR, 0))
2363 	return NULL;
2364 
2365       /* If the next state has back references.  */
2366       if (next_state->has_backref)
2367 	{
2368 	  *err = transit_state_bkref (mctx, &next_state->nodes);
2369 	  if (BE (*err != REG_NOERROR, 0))
2370 	    return NULL;
2371 	  next_state = mctx->state_log[cur_idx];
2372 	}
2373     }
2374 
2375   return next_state;
2376 }
2377 
2378 /* Skip bytes in the input that correspond to part of a
2379    multi-byte match, then look in the log for a state
2380    from which to restart matching.  */
2381 static re_dfastate_t *
2382 internal_function
find_recover_state(reg_errcode_t * err,re_match_context_t * mctx)2383 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2384 {
2385   re_dfastate_t *cur_state = NULL;
2386   do
2387     {
2388       Idx max = mctx->state_log_top;
2389       Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2390 
2391       do
2392 	{
2393           if (++cur_str_idx > max)
2394             return NULL;
2395           re_string_skip_bytes (&mctx->input, 1);
2396 	}
2397       while (mctx->state_log[cur_str_idx] == NULL);
2398 
2399       cur_state = merge_state_with_log (err, mctx, NULL);
2400     }
2401   while (*err == REG_NOERROR && cur_state == NULL);
2402   return cur_state;
2403 }
2404 
2405 /* Helper functions for transit_state.  */
2406 
2407 /* From the node set CUR_NODES, pick up the nodes whose types are
2408    OP_OPEN_SUBEXP and which have corresponding back references in the regular
2409    expression. And register them to use them later for evaluating the
2410    correspoding back references.  */
2411 
2412 static reg_errcode_t
2413 internal_function
check_subexp_matching_top(re_match_context_t * mctx,re_node_set * cur_nodes,Idx str_idx)2414 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2415 			   Idx str_idx)
2416 {
2417   re_dfa_t *const dfa = mctx->dfa;
2418   Idx node_idx;
2419   reg_errcode_t err;
2420 
2421   /* TODO: This isn't efficient.
2422 	   Because there might be more than one nodes whose types are
2423 	   OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2424 	   nodes.
2425 	   E.g. RE: (a){2}  */
2426   for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2427     {
2428       Idx node = cur_nodes->elems[node_idx];
2429       if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2430 	  && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2431 	  && (dfa->used_bkref_map
2432 	      & ((bitset_word) 1 << dfa->nodes[node].opr.idx)))
2433 	{
2434 	  err = match_ctx_add_subtop (mctx, node, str_idx);
2435 	  if (BE (err != REG_NOERROR, 0))
2436 	    return err;
2437 	}
2438     }
2439   return REG_NOERROR;
2440 }
2441 
2442 #if 0
2443 /* Return the next state to which the current state STATE will transit by
2444    accepting the current input byte.  */
2445 
2446 static re_dfastate_t *
2447 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2448 		  re_dfastate_t *state)
2449 {
2450   re_dfa_t *const dfa = mctx->dfa;
2451   re_node_set next_nodes;
2452   re_dfastate_t *next_state;
2453   Idx node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2454   unsigned int context;
2455 
2456   *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2457   if (BE (*err != REG_NOERROR, 0))
2458     return NULL;
2459   for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2460     {
2461       Idx cur_node = state->nodes.elems[node_cnt];
2462       if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2463 	{
2464 	  *err = re_node_set_merge (&next_nodes,
2465 				    dfa->eclosures + dfa->nexts[cur_node]);
2466 	  if (BE (*err != REG_NOERROR, 0))
2467 	    {
2468 	      re_node_set_free (&next_nodes);
2469 	      return NULL;
2470 	    }
2471 	}
2472     }
2473   context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2474   next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2475   /* We don't need to check errors here, since the return value of
2476      this function is next_state and ERR is already set.  */
2477 
2478   re_node_set_free (&next_nodes);
2479   re_string_skip_bytes (&mctx->input, 1);
2480   return next_state;
2481 }
2482 #endif
2483 
2484 #ifdef RE_ENABLE_I18N
2485 static reg_errcode_t
2486 internal_function
transit_state_mb(re_match_context_t * mctx,re_dfastate_t * pstate)2487 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2488 {
2489   re_dfa_t *const dfa = mctx->dfa;
2490   reg_errcode_t err;
2491   Idx i;
2492 
2493   for (i = 0; i < pstate->nodes.nelem; ++i)
2494     {
2495       re_node_set dest_nodes, *new_nodes;
2496       Idx cur_node_idx = pstate->nodes.elems[i];
2497       int naccepted;
2498       Idx dest_idx;
2499       unsigned int context;
2500       re_dfastate_t *dest_state;
2501 
2502       if (!dfa->nodes[cur_node_idx].accept_mb)
2503         continue;
2504 
2505       if (dfa->nodes[cur_node_idx].constraint)
2506 	{
2507 	  context = re_string_context_at (&mctx->input,
2508 					  re_string_cur_idx (&mctx->input),
2509 					  mctx->eflags);
2510 	  if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2511 					   context))
2512 	    continue;
2513 	}
2514 
2515       /* How many bytes the node can accept?  */
2516       naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2517 					   re_string_cur_idx (&mctx->input));
2518       if (naccepted == 0)
2519 	continue;
2520 
2521       /* The node can accepts `naccepted' bytes.  */
2522       dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2523       mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2524 			       : mctx->max_mb_elem_len);
2525       err = clean_state_log_if_needed (mctx, dest_idx);
2526       if (BE (err != REG_NOERROR, 0))
2527 	return err;
2528 #ifdef DEBUG
2529       assert (dfa->nexts[cur_node_idx] != REG_MISSING);
2530 #endif
2531       new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2532 
2533       dest_state = mctx->state_log[dest_idx];
2534       if (dest_state == NULL)
2535 	dest_nodes = *new_nodes;
2536       else
2537 	{
2538 	  err = re_node_set_init_union (&dest_nodes,
2539 					dest_state->entrance_nodes, new_nodes);
2540 	  if (BE (err != REG_NOERROR, 0))
2541 	    return err;
2542 	}
2543       context = re_string_context_at (&mctx->input, dest_idx - 1, mctx->eflags);
2544       mctx->state_log[dest_idx]
2545 	= re_acquire_state_context (&err, dfa, &dest_nodes, context);
2546       if (dest_state != NULL)
2547 	re_node_set_free (&dest_nodes);
2548       if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2549 	return err;
2550     }
2551   return REG_NOERROR;
2552 }
2553 #endif /* RE_ENABLE_I18N */
2554 
2555 static reg_errcode_t
2556 internal_function
transit_state_bkref(re_match_context_t * mctx,const re_node_set * nodes)2557 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2558 {
2559   re_dfa_t *const dfa = mctx->dfa;
2560   reg_errcode_t err;
2561   Idx i;
2562   Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2563 
2564   for (i = 0; i < nodes->nelem; ++i)
2565     {
2566       Idx dest_str_idx, prev_nelem, bkc_idx;
2567       Idx node_idx = nodes->elems[i];
2568       unsigned int context;
2569       const re_token_t *node = dfa->nodes + node_idx;
2570       re_node_set *new_dest_nodes;
2571 
2572       /* Check whether `node' is a backreference or not.  */
2573       if (node->type != OP_BACK_REF)
2574 	continue;
2575 
2576       if (node->constraint)
2577 	{
2578 	  context = re_string_context_at (&mctx->input, cur_str_idx,
2579 					  mctx->eflags);
2580 	  if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2581 	    continue;
2582 	}
2583 
2584       /* `node' is a backreference.
2585 	 Check the substring which the substring matched.  */
2586       bkc_idx = mctx->nbkref_ents;
2587       err = get_subexp (mctx, node_idx, cur_str_idx);
2588       if (BE (err != REG_NOERROR, 0))
2589 	goto free_return;
2590 
2591       /* And add the epsilon closures (which is `new_dest_nodes') of
2592 	 the backreference to appropriate state_log.  */
2593 #ifdef DEBUG
2594       assert (dfa->nexts[node_idx] != REG_MISSING);
2595 #endif
2596       for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2597 	{
2598 	  Idx subexp_len;
2599 	  re_dfastate_t *dest_state;
2600 	  struct re_backref_cache_entry *bkref_ent;
2601 	  bkref_ent = mctx->bkref_ents + bkc_idx;
2602 	  if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2603 	    continue;
2604 	  subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2605 	  new_dest_nodes = (subexp_len == 0
2606 			    ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2607 			    : dfa->eclosures + dfa->nexts[node_idx]);
2608 	  dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2609 			  - bkref_ent->subexp_from);
2610 	  context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2611 					  mctx->eflags);
2612 	  dest_state = mctx->state_log[dest_str_idx];
2613 	  prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2614 			: mctx->state_log[cur_str_idx]->nodes.nelem);
2615 	  /* Add `new_dest_node' to state_log.  */
2616 	  if (dest_state == NULL)
2617 	    {
2618 	      mctx->state_log[dest_str_idx]
2619 		= re_acquire_state_context (&err, dfa, new_dest_nodes,
2620 					    context);
2621 	      if (BE (mctx->state_log[dest_str_idx] == NULL
2622 		      && err != REG_NOERROR, 0))
2623 		goto free_return;
2624 	    }
2625 	  else
2626 	    {
2627 	      re_node_set dest_nodes;
2628 	      err = re_node_set_init_union (&dest_nodes,
2629 					    dest_state->entrance_nodes,
2630 					    new_dest_nodes);
2631 	      if (BE (err != REG_NOERROR, 0))
2632 		{
2633 		  re_node_set_free (&dest_nodes);
2634 		  goto free_return;
2635 		}
2636 	      mctx->state_log[dest_str_idx]
2637 		= re_acquire_state_context (&err, dfa, &dest_nodes, context);
2638 	      re_node_set_free (&dest_nodes);
2639 	      if (BE (mctx->state_log[dest_str_idx] == NULL
2640 		      && err != REG_NOERROR, 0))
2641 		goto free_return;
2642 	    }
2643 	  /* We need to check recursively if the backreference can epsilon
2644 	     transit.  */
2645 	  if (subexp_len == 0
2646 	      && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2647 	    {
2648 	      err = check_subexp_matching_top (mctx, new_dest_nodes,
2649 					       cur_str_idx);
2650 	      if (BE (err != REG_NOERROR, 0))
2651 		goto free_return;
2652 	      err = transit_state_bkref (mctx, new_dest_nodes);
2653 	      if (BE (err != REG_NOERROR, 0))
2654 		goto free_return;
2655 	    }
2656 	}
2657     }
2658   err = REG_NOERROR;
2659  free_return:
2660   return err;
2661 }
2662 
2663 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2664    at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2665    Note that we might collect inappropriate candidates here.
2666    However, the cost of checking them strictly here is too high, then we
2667    delay these checking for prune_impossible_nodes().  */
2668 
2669 static reg_errcode_t
2670 internal_function
get_subexp(re_match_context_t * mctx,Idx bkref_node,Idx bkref_str_idx)2671 get_subexp (re_match_context_t *mctx, Idx bkref_node, Idx bkref_str_idx)
2672 {
2673   re_dfa_t *const dfa = mctx->dfa;
2674   Idx subexp_num, sub_top_idx;
2675   const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2676   /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2677   Idx cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2678   if (cache_idx != REG_MISSING)
2679     {
2680       const struct re_backref_cache_entry *entry = mctx->bkref_ents + cache_idx;
2681       do
2682         if (entry->node == bkref_node)
2683 	  return REG_NOERROR; /* We already checked it.  */
2684       while (entry++->more);
2685     }
2686 
2687   subexp_num = dfa->nodes[bkref_node].opr.idx;
2688 
2689   /* For each sub expression  */
2690   for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2691     {
2692       reg_errcode_t err;
2693       re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2694       re_sub_match_last_t *sub_last;
2695       Idx sub_last_idx, sl_str, bkref_str_off;
2696 
2697       if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2698 	continue; /* It isn't related.  */
2699 
2700       sl_str = sub_top->str_idx;
2701       bkref_str_off = bkref_str_idx;
2702       /* At first, check the last node of sub expressions we already
2703 	 evaluated.  */
2704       for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2705 	{
2706 	  regoff_t sl_str_diff;
2707 	  sub_last = sub_top->lasts[sub_last_idx];
2708 	  sl_str_diff = sub_last->str_idx - sl_str;
2709 	  /* The matched string by the sub expression match with the substring
2710 	     at the back reference?  */
2711 	  if (sl_str_diff > 0)
2712 	    {
2713 	      if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2714 		{
2715 		  /* Not enough chars for a successful match.  */
2716 		  if (bkref_str_off + sl_str_diff > mctx->input.len)
2717 		    break;
2718 
2719 		  err = clean_state_log_if_needed (mctx,
2720 						   bkref_str_off
2721 						   + sl_str_diff);
2722 		  if (BE (err != REG_NOERROR, 0))
2723 		    return err;
2724 		  buf = (const char *) re_string_get_buffer (&mctx->input);
2725 		}
2726 	      if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2727 		break; /* We don't need to search this sub expression any more.  */
2728 	    }
2729 	  bkref_str_off += sl_str_diff;
2730 	  sl_str += sl_str_diff;
2731 	  err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2732 				bkref_str_idx);
2733 
2734 	  /* Reload buf, since the preceding call might have reallocated
2735 	     the buffer.  */
2736 	  buf = (const char *) re_string_get_buffer (&mctx->input);
2737 
2738 	  if (err == REG_NOMATCH)
2739 	    continue;
2740 	  if (BE (err != REG_NOERROR, 0))
2741 	    return err;
2742 	}
2743 
2744       if (sub_last_idx < sub_top->nlasts)
2745 	continue;
2746       if (sub_last_idx > 0)
2747 	++sl_str;
2748       /* Then, search for the other last nodes of the sub expression.  */
2749       for (; sl_str <= bkref_str_idx; ++sl_str)
2750 	{
2751 	  Idx cls_node;
2752 	  regoff_t sl_str_off;
2753 	  const re_node_set *nodes;
2754 	  sl_str_off = sl_str - sub_top->str_idx;
2755 	  /* The matched string by the sub expression match with the substring
2756 	     at the back reference?  */
2757 	  if (sl_str_off > 0)
2758 	    {
2759 	      if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2760 		{
2761 		  /* If we are at the end of the input, we cannot match.  */
2762 		  if (bkref_str_off >= mctx->input.len)
2763 		    break;
2764 
2765 		  err = extend_buffers (mctx);
2766 		  if (BE (err != REG_NOERROR, 0))
2767 		    return err;
2768 
2769 		  buf = (const char *) re_string_get_buffer (&mctx->input);
2770 		}
2771 	      if (buf [bkref_str_off++] != buf[sl_str - 1])
2772 		break; /* We don't need to search this sub expression
2773 			  any more.  */
2774 	    }
2775 	  if (mctx->state_log[sl_str] == NULL)
2776 	    continue;
2777 	  /* Does this state have a ')' of the sub expression?  */
2778 	  nodes = &mctx->state_log[sl_str]->nodes;
2779 	  cls_node = find_subexp_node (dfa, nodes, subexp_num, OP_CLOSE_SUBEXP);
2780 	  if (cls_node == REG_MISSING)
2781 	    continue; /* No.  */
2782 	  if (sub_top->path == NULL)
2783 	    {
2784 	      sub_top->path = re_calloc (state_array_t,
2785 					 sl_str - sub_top->str_idx + 1);
2786 	      if (sub_top->path == NULL)
2787 		return REG_ESPACE;
2788 	    }
2789 	  /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2790 	     in the current context?  */
2791 	  err = check_arrival (mctx, sub_top->path, sub_top->node,
2792 			       sub_top->str_idx, cls_node, sl_str, OP_CLOSE_SUBEXP);
2793 	  if (err == REG_NOMATCH)
2794 	      continue;
2795 	  if (BE (err != REG_NOERROR, 0))
2796 	      return err;
2797 	  sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2798 	  if (BE (sub_last == NULL, 0))
2799 	    return REG_ESPACE;
2800 	  err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2801 				bkref_str_idx);
2802 	  if (err == REG_NOMATCH)
2803 	    continue;
2804 	}
2805     }
2806   return REG_NOERROR;
2807 }
2808 
2809 /* Helper functions for get_subexp().  */
2810 
2811 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2812    If it can arrive, register the sub expression expressed with SUB_TOP
2813    and SUB_LAST.  */
2814 
2815 static reg_errcode_t
2816 internal_function
get_subexp_sub(re_match_context_t * mctx,const re_sub_match_top_t * sub_top,re_sub_match_last_t * sub_last,Idx bkref_node,Idx bkref_str)2817 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2818 		re_sub_match_last_t *sub_last, Idx bkref_node, Idx bkref_str)
2819 {
2820   reg_errcode_t err;
2821   Idx to_idx;
2822   /* Can the subexpression arrive the back reference?  */
2823   err = check_arrival (mctx, &sub_last->path, sub_last->node,
2824 		       sub_last->str_idx, bkref_node, bkref_str, OP_OPEN_SUBEXP);
2825   if (err != REG_NOERROR)
2826     return err;
2827   err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2828 			     sub_last->str_idx);
2829   if (BE (err != REG_NOERROR, 0))
2830     return err;
2831   to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2832   return clean_state_log_if_needed (mctx, to_idx);
2833 }
2834 
2835 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2836    Search '(' if FL_OPEN, or search ')' otherwise.
2837    TODO: This function isn't efficient...
2838 	 Because there might be more than one nodes whose types are
2839 	 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2840 	 nodes.
2841 	 E.g. RE: (a){2}  */
2842 
2843 static Idx
2844 internal_function
find_subexp_node(const re_dfa_t * dfa,const re_node_set * nodes,Idx subexp_idx,int type)2845 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2846 		  Idx subexp_idx, int type)
2847 {
2848   Idx cls_idx;
2849   for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2850     {
2851       Idx cls_node = nodes->elems[cls_idx];
2852       const re_token_t *node = dfa->nodes + cls_node;
2853       if (node->type == type
2854 	  && node->opr.idx == subexp_idx)
2855 	return cls_node;
2856     }
2857   return REG_MISSING;
2858 }
2859 
2860 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2861    LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2862    heavily reused.
2863    Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise.  */
2864 
2865 static reg_errcode_t
2866 internal_function
check_arrival(re_match_context_t * mctx,state_array_t * path,Idx top_node,Idx top_str,Idx last_node,Idx last_str,int type)2867 check_arrival (re_match_context_t *mctx, state_array_t *path,
2868 	       Idx top_node, Idx top_str, Idx last_node, Idx last_str,
2869 	       int type)
2870 {
2871   re_dfa_t *const dfa = mctx->dfa;
2872   reg_errcode_t err;
2873   Idx subexp_num, backup_cur_idx, str_idx, null_cnt;
2874   re_dfastate_t *cur_state = NULL;
2875   re_node_set *cur_nodes, next_nodes;
2876   re_dfastate_t **backup_state_log;
2877   unsigned int context;
2878 
2879   subexp_num = dfa->nodes[top_node].opr.idx;
2880   /* Extend the buffer if we need.  */
2881   if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2882     {
2883       re_dfastate_t **new_array;
2884       Idx old_alloc = path->alloc;
2885       Idx new_alloc = old_alloc + last_str + mctx->max_mb_elem_len + 1;
2886       if (BE (new_alloc < old_alloc, 0))
2887 	return REG_ESPACE;
2888       new_array = re_xrealloc (path->array, re_dfastate_t *, new_alloc);
2889       if (BE (new_array == NULL, 0))
2890 	return REG_ESPACE;
2891       path->array = new_array;
2892       path->alloc = new_alloc;
2893       memset (new_array + old_alloc, '\0',
2894 	      sizeof (re_dfastate_t *) * (new_alloc - old_alloc));
2895     }
2896 
2897   str_idx = path->next_idx == 0 ? top_str : path->next_idx;
2898 
2899   /* Temporary modify MCTX.  */
2900   backup_state_log = mctx->state_log;
2901   backup_cur_idx = mctx->input.cur_idx;
2902   mctx->state_log = path->array;
2903   mctx->input.cur_idx = str_idx;
2904 
2905   /* Setup initial node set.  */
2906   context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2907   if (str_idx == top_str)
2908     {
2909       err = re_node_set_init_1 (&next_nodes, top_node);
2910       if (BE (err != REG_NOERROR, 0))
2911 	return err;
2912       err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2913       if (BE (err != REG_NOERROR, 0))
2914 	{
2915 	  re_node_set_free (&next_nodes);
2916 	  return err;
2917 	}
2918     }
2919   else
2920     {
2921       cur_state = mctx->state_log[str_idx];
2922       if (cur_state && cur_state->has_backref)
2923 	{
2924 	  err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2925 	  if (BE ( err != REG_NOERROR, 0))
2926 	    return err;
2927 	}
2928       else
2929 	re_node_set_init_empty (&next_nodes);
2930     }
2931   if (str_idx == top_str || (cur_state && cur_state->has_backref))
2932     {
2933       if (next_nodes.nelem)
2934 	{
2935 	  err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2936 				    subexp_num, type);
2937 	  if (BE ( err != REG_NOERROR, 0))
2938 	    {
2939 	      re_node_set_free (&next_nodes);
2940 	      return err;
2941 	    }
2942 	}
2943       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2944       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2945 	{
2946 	  re_node_set_free (&next_nodes);
2947 	  return err;
2948 	}
2949       mctx->state_log[str_idx] = cur_state;
2950     }
2951 
2952   for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2953     {
2954       re_node_set_empty (&next_nodes);
2955       if (mctx->state_log[str_idx + 1])
2956 	{
2957 	  err = re_node_set_merge (&next_nodes,
2958 				   &mctx->state_log[str_idx + 1]->nodes);
2959 	  if (BE (err != REG_NOERROR, 0))
2960 	    {
2961 	      re_node_set_free (&next_nodes);
2962 	      return err;
2963 	    }
2964 	}
2965       if (cur_state)
2966 	{
2967 	  err = check_arrival_add_next_nodes (mctx, str_idx,
2968 					      &cur_state->non_eps_nodes, &next_nodes);
2969 	  if (BE (err != REG_NOERROR, 0))
2970 	    {
2971 	      re_node_set_free (&next_nodes);
2972 	      return err;
2973 	    }
2974 	}
2975       ++str_idx;
2976       if (next_nodes.nelem)
2977 	{
2978 	  err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2979 	  if (BE (err != REG_NOERROR, 0))
2980 	    {
2981 	      re_node_set_free (&next_nodes);
2982 	      return err;
2983 	    }
2984 	  err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2985 				    subexp_num, type);
2986 	  if (BE ( err != REG_NOERROR, 0))
2987 	    {
2988 	      re_node_set_free (&next_nodes);
2989 	      return err;
2990 	    }
2991 	}
2992       context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2993       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2994       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2995 	{
2996 	  re_node_set_free (&next_nodes);
2997 	  return err;
2998 	}
2999       mctx->state_log[str_idx] = cur_state;
3000       null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3001     }
3002   re_node_set_free (&next_nodes);
3003   cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3004 	       : &mctx->state_log[last_str]->nodes);
3005   path->next_idx = str_idx;
3006 
3007   /* Fix MCTX.  */
3008   mctx->state_log = backup_state_log;
3009   mctx->input.cur_idx = backup_cur_idx;
3010 
3011   /* Then check the current node set has the node LAST_NODE.  */
3012   if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3013     return REG_NOERROR;
3014 
3015   return REG_NOMATCH;
3016 }
3017 
3018 /* Helper functions for check_arrival.  */
3019 
3020 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3021    to NEXT_NODES.
3022    TODO: This function is similar to the functions transit_state*(),
3023 	 however this function has many additional works.
3024 	 Can't we unify them?  */
3025 
3026 static reg_errcode_t
3027 internal_function
check_arrival_add_next_nodes(re_match_context_t * mctx,Idx str_idx,re_node_set * cur_nodes,re_node_set * next_nodes)3028 check_arrival_add_next_nodes (re_match_context_t *mctx, Idx str_idx,
3029 			      re_node_set *cur_nodes,
3030 			      re_node_set *next_nodes)
3031 {
3032   re_dfa_t *const dfa = mctx->dfa;
3033   bool ok;
3034   Idx cur_idx;
3035   reg_errcode_t err;
3036   re_node_set union_set;
3037   re_node_set_init_empty (&union_set);
3038   for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3039     {
3040       int naccepted = 0;
3041       Idx cur_node = cur_nodes->elems[cur_idx];
3042 #ifdef DEBUG
3043       re_token_type_t type = dfa->nodes[cur_node].type;
3044       assert (!IS_EPSILON_NODE (type));
3045 #endif
3046 #ifdef RE_ENABLE_I18N
3047       /* If the node may accept `multi byte'.  */
3048       if (dfa->nodes[cur_node].accept_mb)
3049 	{
3050 	  naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3051 					       str_idx);
3052 	  if (naccepted > 1)
3053 	    {
3054 	      re_dfastate_t *dest_state;
3055 	      Idx next_node = dfa->nexts[cur_node];
3056 	      Idx next_idx = str_idx + naccepted;
3057 	      dest_state = mctx->state_log[next_idx];
3058 	      re_node_set_empty (&union_set);
3059 	      if (dest_state)
3060 		{
3061 		  err = re_node_set_merge (&union_set, &dest_state->nodes);
3062 		  if (BE (err != REG_NOERROR, 0))
3063 		    {
3064 		      re_node_set_free (&union_set);
3065 		      return err;
3066 		    }
3067 		}
3068 	      ok = re_node_set_insert (&union_set, next_node);
3069 	      if (BE (! ok, 0))
3070 		{
3071 		  re_node_set_free (&union_set);
3072 		  return REG_ESPACE;
3073 		}
3074 	      mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3075 							    &union_set);
3076 	      if (BE (mctx->state_log[next_idx] == NULL
3077 		      && err != REG_NOERROR, 0))
3078 		{
3079 		  re_node_set_free (&union_set);
3080 		  return err;
3081 		}
3082 	    }
3083 	}
3084 #endif /* RE_ENABLE_I18N */
3085       if (naccepted
3086 	  || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3087 	{
3088 	  ok = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3089 	  if (BE (! ok, 0))
3090 	    {
3091 	      re_node_set_free (&union_set);
3092 	      return REG_ESPACE;
3093 	    }
3094 	}
3095     }
3096   re_node_set_free (&union_set);
3097   return REG_NOERROR;
3098 }
3099 
3100 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3101    CUR_NODES, however exclude the nodes which are:
3102     - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3103     - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3104 */
3105 
3106 static reg_errcode_t
3107 internal_function
check_arrival_expand_ecl(re_dfa_t * dfa,re_node_set * cur_nodes,Idx ex_subexp,int type)3108 check_arrival_expand_ecl (re_dfa_t *dfa, re_node_set *cur_nodes,
3109 			  Idx ex_subexp, int type)
3110 {
3111   reg_errcode_t err;
3112   Idx idx, outside_node;
3113   re_node_set new_nodes;
3114 #ifdef DEBUG
3115   assert (cur_nodes->nelem);
3116 #endif
3117   err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3118   if (BE (err != REG_NOERROR, 0))
3119     return err;
3120   /* Create a new node set NEW_NODES with the nodes which are epsilon
3121      closures of the node in CUR_NODES.  */
3122 
3123   for (idx = 0; idx < cur_nodes->nelem; ++idx)
3124     {
3125       Idx cur_node = cur_nodes->elems[idx];
3126       re_node_set *eclosure = dfa->eclosures + cur_node;
3127       outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3128       if (outside_node == REG_MISSING)
3129 	{
3130 	  /* There are no problematic nodes, just merge them.  */
3131 	  err = re_node_set_merge (&new_nodes, eclosure);
3132 	  if (BE (err != REG_NOERROR, 0))
3133 	    {
3134 	      re_node_set_free (&new_nodes);
3135 	      return err;
3136 	    }
3137 	}
3138       else
3139 	{
3140 	  /* There are problematic nodes, re-calculate incrementally.  */
3141 	  err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3142 					      ex_subexp, type);
3143 	  if (BE (err != REG_NOERROR, 0))
3144 	    {
3145 	      re_node_set_free (&new_nodes);
3146 	      return err;
3147 	    }
3148 	}
3149     }
3150   re_node_set_free (cur_nodes);
3151   *cur_nodes = new_nodes;
3152   return REG_NOERROR;
3153 }
3154 
3155 /* Helper function for check_arrival_expand_ecl.
3156    Check incrementally the epsilon closure of TARGET, and if it isn't
3157    problematic append it to DST_NODES.  */
3158 
3159 static reg_errcode_t
3160 internal_function
check_arrival_expand_ecl_sub(re_dfa_t * dfa,re_node_set * dst_nodes,Idx target,Idx ex_subexp,int type)3161 check_arrival_expand_ecl_sub (re_dfa_t *dfa, re_node_set *dst_nodes,
3162 			      Idx target, Idx ex_subexp, int type)
3163 {
3164   Idx cur_node;
3165   for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3166     {
3167       bool ok;
3168 
3169       if (dfa->nodes[cur_node].type == type
3170 	  && dfa->nodes[cur_node].opr.idx == ex_subexp)
3171 	{
3172 	  if (type == OP_CLOSE_SUBEXP)
3173 	    {
3174 	      ok = re_node_set_insert (dst_nodes, cur_node);
3175 	      if (BE (! ok, 0))
3176 		return REG_ESPACE;
3177 	    }
3178 	  break;
3179 	}
3180       ok = re_node_set_insert (dst_nodes, cur_node);
3181       if (BE (! ok, 0))
3182 	return REG_ESPACE;
3183       if (dfa->edests[cur_node].nelem == 0)
3184 	break;
3185       if (dfa->edests[cur_node].nelem == 2)
3186 	{
3187 	  reg_errcode_t ret =
3188 	    check_arrival_expand_ecl_sub (dfa, dst_nodes,
3189 					  dfa->edests[cur_node].elems[1],
3190 					  ex_subexp, type);
3191 	  if (BE (ret != REG_NOERROR, 0))
3192 	    return ret;
3193 	}
3194       cur_node = dfa->edests[cur_node].elems[0];
3195     }
3196   return REG_NOERROR;
3197 }
3198 
3199 
3200 /* For all the back references in the current state, calculate the
3201    destination of the back references by the appropriate entry
3202    in MCTX->BKREF_ENTS.  */
3203 
3204 static reg_errcode_t
3205 internal_function
expand_bkref_cache(re_match_context_t * mctx,re_node_set * cur_nodes,Idx cur_str,Idx subexp_num,int type)3206 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3207 		    Idx cur_str, Idx subexp_num, int type)
3208 {
3209   re_dfa_t *const dfa = mctx->dfa;
3210   reg_errcode_t err;
3211   Idx cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3212   struct re_backref_cache_entry *ent;
3213 
3214   if (cache_idx_start == REG_MISSING)
3215     return REG_NOERROR;
3216 
3217  restart:
3218   ent = mctx->bkref_ents + cache_idx_start;
3219   do
3220     {
3221       Idx to_idx, next_node;
3222 
3223       /* Is this entry ENT is appropriate?  */
3224       if (!re_node_set_contains (cur_nodes, ent->node))
3225 	continue; /* No.  */
3226 
3227       to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3228       /* Calculate the destination of the back reference, and append it
3229 	 to MCTX->STATE_LOG.  */
3230       if (to_idx == cur_str)
3231 	{
3232 	  /* The backreference did epsilon transit, we must re-check all the
3233 	     node in the current state.  */
3234 	  re_node_set new_dests;
3235 	  reg_errcode_t err2, err3;
3236 	  next_node = dfa->edests[ent->node].elems[0];
3237 	  if (re_node_set_contains (cur_nodes, next_node))
3238 	    continue;
3239 	  err = re_node_set_init_1 (&new_dests, next_node);
3240 	  err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3241 	  err3 = re_node_set_merge (cur_nodes, &new_dests);
3242 	  re_node_set_free (&new_dests);
3243 	  if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3244 		  || err3 != REG_NOERROR, 0))
3245 	    {
3246 	      err = (err != REG_NOERROR ? err
3247 		     : (err2 != REG_NOERROR ? err2 : err3));
3248 	      return err;
3249 	    }
3250 	  /* TODO: It is still inefficient...  */
3251 	  goto restart;
3252 	}
3253       else
3254 	{
3255 	  re_node_set union_set;
3256 	  next_node = dfa->nexts[ent->node];
3257 	  if (mctx->state_log[to_idx])
3258 	    {
3259 	      bool ok;
3260 	      if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3261 					next_node))
3262 		continue;
3263 	      err = re_node_set_init_copy (&union_set,
3264 					   &mctx->state_log[to_idx]->nodes);
3265 	      ok = re_node_set_insert (&union_set, next_node);
3266 	      if (BE (err != REG_NOERROR || ! ok, 0))
3267 		{
3268 		  re_node_set_free (&union_set);
3269 		  err = err != REG_NOERROR ? err : REG_ESPACE;
3270 		  return err;
3271 		}
3272 	    }
3273 	  else
3274 	    {
3275 	      err = re_node_set_init_1 (&union_set, next_node);
3276 	      if (BE (err != REG_NOERROR, 0))
3277 		return err;
3278 	    }
3279 	  mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3280 	  re_node_set_free (&union_set);
3281 	  if (BE (mctx->state_log[to_idx] == NULL
3282 		  && err != REG_NOERROR, 0))
3283 	    return err;
3284 	}
3285     }
3286   while (ent++->more);
3287   return REG_NOERROR;
3288 }
3289 
3290 /* Build transition table for the state.
3291    Return true if successful.  */
3292 
3293 static bool
3294 internal_function
build_trtable(re_dfa_t * dfa,re_dfastate_t * state)3295 build_trtable (re_dfa_t *dfa, re_dfastate_t *state)
3296 {
3297   reg_errcode_t err;
3298   Idx i, j;
3299   int ch;
3300   bool need_word_trtable = false;
3301   bitset_word elem, mask;
3302   bool dests_node_malloced = false, dest_states_malloced = false;
3303   Idx ndests; /* Number of the destination states from `state'.  */
3304   re_dfastate_t **trtable;
3305   re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3306   re_node_set follows, *dests_node;
3307   bitset *dests_ch;
3308   bitset acceptable;
3309 
3310   struct dests_alloc
3311   {
3312     re_node_set dests_node[SBC_MAX];
3313     bitset dests_ch[SBC_MAX];
3314   } *dests_alloc;
3315 
3316   /* We build DFA states which corresponds to the destination nodes
3317      from `state'.  `dests_node[i]' represents the nodes which i-th
3318      destination state contains, and `dests_ch[i]' represents the
3319      characters which i-th destination state accepts.  */
3320 #ifndef __SSP__
3321   if (__libc_use_alloca (sizeof (struct dests_alloc)))
3322     dests_alloc = (struct dests_alloc *) alloca (sizeof dests_alloc[0]);
3323   else
3324 #endif
3325     {
3326       dests_alloc = re_malloc (struct dests_alloc, 1);
3327       if (BE (dests_alloc == NULL, 0))
3328 	return false;
3329       dests_node_malloced = true;
3330     }
3331   dests_node = dests_alloc->dests_node;
3332   dests_ch = dests_alloc->dests_ch;
3333 
3334   /* Initialize transiton table.  */
3335   state->word_trtable = state->trtable = NULL;
3336 
3337   /* At first, group all nodes belonging to `state' into several
3338      destinations.  */
3339   ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3340   if (BE (! REG_VALID_NONZERO_INDEX (ndests), 0))
3341     {
3342       if (dests_node_malloced)
3343 	free (dests_alloc);
3344       if (ndests == 0)
3345 	{
3346 	  state->trtable = re_calloc (re_dfastate_t *, SBC_MAX);
3347 	  return true;
3348 	}
3349       return false;
3350     }
3351 
3352   err = re_node_set_alloc (&follows, ndests + 1);
3353   if (BE (err != REG_NOERROR, 0))
3354     goto out_free;
3355 
3356   /* Avoid arithmetic overflow in size calculation.  */
3357   if (BE (((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX)
3358 	   / (3 * sizeof (re_dfastate_t *)))
3359 	  < ndests, 0))
3360     goto out_free;
3361 
3362 #ifndef __SSP__
3363   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX
3364 			 + ndests * 3 * sizeof (re_dfastate_t *)))
3365     dest_states = (re_dfastate_t **)
3366       alloca (ndests * 3 * sizeof (re_dfastate_t *));
3367   else
3368 #endif
3369     {
3370       dest_states = (re_dfastate_t **)
3371 	malloc (ndests * 3 * sizeof (re_dfastate_t *));
3372       if (BE (dest_states == NULL, 0))
3373 	{
3374 out_free:
3375 	  if (dest_states_malloced)
3376 	    free (dest_states);
3377 	  re_node_set_free (&follows);
3378 	  for (i = 0; i < ndests; ++i)
3379 	    re_node_set_free (dests_node + i);
3380 	  if (dests_node_malloced)
3381 	    free (dests_alloc);
3382 	  return false;
3383 	}
3384       dest_states_malloced = true;
3385     }
3386   dest_states_word = dest_states + ndests;
3387   dest_states_nl = dest_states_word + ndests;
3388   bitset_empty (acceptable);
3389 
3390   /* Then build the states for all destinations.  */
3391   for (i = 0; i < ndests; ++i)
3392     {
3393       Idx next_node;
3394       re_node_set_empty (&follows);
3395       /* Merge the follows of this destination states.  */
3396       for (j = 0; j < dests_node[i].nelem; ++j)
3397 	{
3398 	  next_node = dfa->nexts[dests_node[i].elems[j]];
3399 	  if (next_node != REG_MISSING)
3400 	    {
3401 	      err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3402 	      if (BE (err != REG_NOERROR, 0))
3403 		goto out_free;
3404 	    }
3405 	}
3406       dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3407       if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3408 	goto out_free;
3409       /* If the new state has context constraint,
3410 	 build appropriate states for these contexts.  */
3411       if (dest_states[i]->has_constraint)
3412 	{
3413 	  dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3414 							  CONTEXT_WORD);
3415 	  if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3416 	    goto out_free;
3417 
3418 	  if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3419 	    need_word_trtable = true;
3420 
3421 	  dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3422 							CONTEXT_NEWLINE);
3423 	  if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3424 	    goto out_free;
3425  	}
3426       else
3427 	{
3428 	  dest_states_word[i] = dest_states[i];
3429 	  dest_states_nl[i] = dest_states[i];
3430 	}
3431       bitset_merge (acceptable, dests_ch[i]);
3432     }
3433 
3434   if (!BE (need_word_trtable, 0))
3435     {
3436       /* We don't care about whether the following character is a word
3437 	 character, or we are in a single-byte character set so we can
3438 	 discern by looking at the character code: allocate a
3439 	 256-entry transition table.  */
3440       trtable = state->trtable = re_calloc (re_dfastate_t *, SBC_MAX);
3441       if (BE (trtable == NULL, 0))
3442 	goto out_free;
3443 
3444       /* For all characters ch...:  */
3445       for (i = 0; i < BITSET_WORDS; ++i)
3446 	for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3447 	     elem;
3448 	     mask <<= 1, elem >>= 1, ++ch)
3449 	  if (BE (elem & 1, 0))
3450 	    {
3451 	      /* There must be exactly one destination which accepts
3452 		 character ch.  See group_nodes_into_DFAstates.  */
3453 	      for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3454 		;
3455 
3456 	      /* j-th destination accepts the word character ch.  */
3457 	      if (dfa->word_char[i] & mask)
3458 		trtable[ch] = dest_states_word[j];
3459 	      else
3460 		trtable[ch] = dest_states[j];
3461 	    }
3462     }
3463   else
3464     {
3465       /* We care about whether the following character is a word
3466 	 character, and we are in a multi-byte character set: discern
3467 	 by looking at the character code: build two 256-entry
3468 	 transition tables, one starting at trtable[0] and one
3469 	 starting at trtable[SBC_MAX].  */
3470       trtable = state->word_trtable = re_calloc (re_dfastate_t *, 2 * SBC_MAX);
3471       if (BE (trtable == NULL, 0))
3472 	goto out_free;
3473 
3474       /* For all characters ch...:  */
3475       for (i = 0; i < BITSET_WORDS; ++i)
3476 	for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3477 	     elem;
3478 	     mask <<= 1, elem >>= 1, ++ch)
3479 	  if (BE (elem & 1, 0))
3480 	    {
3481 	      /* There must be exactly one destination which accepts
3482 		 character ch.  See group_nodes_into_DFAstates.  */
3483 	      for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3484 		;
3485 
3486 	      /* j-th destination accepts the word character ch.  */
3487 	      trtable[ch] = dest_states[j];
3488 	      trtable[ch + SBC_MAX] = dest_states_word[j];
3489 	    }
3490     }
3491 
3492   /* new line */
3493   if (bitset_contain (acceptable, NEWLINE_CHAR))
3494     {
3495       /* The current state accepts newline character.  */
3496       for (j = 0; j < ndests; ++j)
3497 	if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3498 	  {
3499 	    /* k-th destination accepts newline character.  */
3500 	    trtable[NEWLINE_CHAR] = dest_states_nl[j];
3501 	    if (need_word_trtable)
3502 	      trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3503 	    /* There must be only one destination which accepts
3504 	       newline.  See group_nodes_into_DFAstates.  */
3505 	    break;
3506 	  }
3507     }
3508 
3509   if (dest_states_malloced)
3510     free (dest_states);
3511 
3512   re_node_set_free (&follows);
3513   for (i = 0; i < ndests; ++i)
3514     re_node_set_free (dests_node + i);
3515 
3516   if (dests_node_malloced)
3517     free (dests_alloc);
3518 
3519   return true;
3520 }
3521 
3522 /* Group all nodes belonging to STATE into several destinations.
3523    Then for all destinations, set the nodes belonging to the destination
3524    to DESTS_NODE[i] and set the characters accepted by the destination
3525    to DEST_CH[i].  This function return the number of destinations.  */
3526 
3527 static Idx
3528 internal_function
group_nodes_into_DFAstates(const re_dfa_t * dfa,const re_dfastate_t * state,re_node_set * dests_node,bitset * dests_ch)3529 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3530 			    re_node_set *dests_node, bitset *dests_ch)
3531 {
3532   reg_errcode_t err;
3533   bool ok;
3534   Idx i, j, k;
3535   Idx ndests; /* Number of the destinations from `state'.  */
3536   bitset accepts; /* Characters a node can accept.  */
3537   const re_node_set *cur_nodes = &state->nodes;
3538   bitset_empty (accepts);
3539   ndests = 0;
3540 
3541   /* For all the nodes belonging to `state',  */
3542   for (i = 0; i < cur_nodes->nelem; ++i)
3543     {
3544       re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3545       re_token_type_t type = node->type;
3546       unsigned int constraint = node->constraint;
3547 
3548       /* Enumerate all single byte character this node can accept.  */
3549       if (type == CHARACTER)
3550 	bitset_set (accepts, node->opr.c);
3551       else if (type == SIMPLE_BRACKET)
3552 	{
3553 	  bitset_merge (accepts, node->opr.sbcset);
3554 	}
3555       else if (type == OP_PERIOD)
3556 	{
3557 #ifdef RE_ENABLE_I18N
3558 	  if (dfa->mb_cur_max > 1)
3559 	    bitset_merge (accepts, dfa->sb_char);
3560 	  else
3561 #endif
3562 	    bitset_set_all (accepts);
3563 	  if (!(dfa->syntax & REG_DOT_NEWLINE))
3564 	    bitset_clear (accepts, '\n');
3565 	  if (dfa->syntax & REG_DOT_NOT_NULL)
3566 	    bitset_clear (accepts, '\0');
3567 	}
3568 #ifdef RE_ENABLE_I18N
3569       else if (type == OP_UTF8_PERIOD)
3570         {
3571 	  if (SBC_MAX / 2 % BITSET_WORD_BITS == 0)
3572 	    memset (accepts, -1, sizeof accepts / 2);
3573 	  else
3574 	    bitset_merge (accepts, utf8_sb_map);
3575 	  if (!(dfa->syntax & REG_DOT_NEWLINE))
3576 	    bitset_clear (accepts, '\n');
3577 	  if (dfa->syntax & REG_DOT_NOT_NULL)
3578 	    bitset_clear (accepts, '\0');
3579         }
3580 #endif
3581       else
3582 	continue;
3583 
3584       /* Check the `accepts' and sift the characters which are not
3585 	 match it the context.  */
3586       if (constraint)
3587 	{
3588 	  if (constraint & NEXT_NEWLINE_CONSTRAINT)
3589 	    {
3590 	      bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3591 	      bitset_empty (accepts);
3592 	      if (accepts_newline)
3593 		bitset_set (accepts, NEWLINE_CHAR);
3594 	      else
3595 		continue;
3596 	    }
3597 	  if (constraint & NEXT_ENDBUF_CONSTRAINT)
3598 	    {
3599 	      bitset_empty (accepts);
3600 	      continue;
3601 	    }
3602 
3603 	  if (constraint & NEXT_WORD_CONSTRAINT)
3604 	    {
3605 	      bitset_word any_set = 0;
3606 	      if (type == CHARACTER && !node->word_char)
3607 		{
3608 		  bitset_empty (accepts);
3609 		  continue;
3610 		}
3611 #ifdef RE_ENABLE_I18N
3612 	      if (dfa->mb_cur_max > 1)
3613 		for (j = 0; j < BITSET_WORDS; ++j)
3614 		  any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3615 	      else
3616 #endif
3617 		for (j = 0; j < BITSET_WORDS; ++j)
3618 		  any_set |= (accepts[j] &= dfa->word_char[j]);
3619 	      if (!any_set)
3620 		continue;
3621 	    }
3622 	  if (constraint & NEXT_NOTWORD_CONSTRAINT)
3623 	    {
3624 	      bitset_word any_set = 0;
3625 	      if (type == CHARACTER && node->word_char)
3626 		{
3627 		  bitset_empty (accepts);
3628 		  continue;
3629 		}
3630 #ifdef RE_ENABLE_I18N
3631 	      if (dfa->mb_cur_max > 1)
3632 		for (j = 0; j < BITSET_WORDS; ++j)
3633 		  any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3634 	      else
3635 #endif
3636 		for (j = 0; j < BITSET_WORDS; ++j)
3637 		  any_set |= (accepts[j] &= ~dfa->word_char[j]);
3638 	      if (!any_set)
3639 		continue;
3640 	    }
3641 	}
3642 
3643       /* Then divide `accepts' into DFA states, or create a new
3644 	 state.  Above, we make sure that accepts is not empty.  */
3645       for (j = 0; j < ndests; ++j)
3646 	{
3647 	  bitset intersec; /* Intersection sets, see below.  */
3648 	  bitset remains;
3649 	  /* Flags, see below.  */
3650 	  bitset_word has_intersec, not_subset, not_consumed;
3651 
3652 	  /* Optimization, skip if this state doesn't accept the character.  */
3653 	  if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3654 	    continue;
3655 
3656 	  /* Enumerate the intersection set of this state and `accepts'.  */
3657 	  has_intersec = 0;
3658 	  for (k = 0; k < BITSET_WORDS; ++k)
3659 	    has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3660 	  /* And skip if the intersection set is empty.  */
3661 	  if (!has_intersec)
3662 	    continue;
3663 
3664 	  /* Then check if this state is a subset of `accepts'.  */
3665 	  not_subset = not_consumed = 0;
3666 	  for (k = 0; k < BITSET_WORDS; ++k)
3667 	    {
3668 	      not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3669 	      not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3670 	    }
3671 
3672 	  /* If this state isn't a subset of `accepts', create a
3673 	     new group state, which has the `remains'. */
3674 	  if (not_subset)
3675 	    {
3676 	      bitset_copy (dests_ch[ndests], remains);
3677 	      bitset_copy (dests_ch[j], intersec);
3678 	      err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3679 	      if (BE (err != REG_NOERROR, 0))
3680 		goto error_return;
3681 	      ++ndests;
3682 	    }
3683 
3684 	  /* Put the position in the current group. */
3685 	  ok = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3686 	  if (BE (! ok, 0))
3687 	    goto error_return;
3688 
3689 	  /* If all characters are consumed, go to next node. */
3690 	  if (!not_consumed)
3691 	    break;
3692 	}
3693       /* Some characters remain, create a new group. */
3694       if (j == ndests)
3695 	{
3696 	  bitset_copy (dests_ch[ndests], accepts);
3697 	  err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3698 	  if (BE (err != REG_NOERROR, 0))
3699 	    goto error_return;
3700 	  ++ndests;
3701 	  bitset_empty (accepts);
3702 	}
3703     }
3704   return ndests;
3705  error_return:
3706   for (j = 0; j < ndests; ++j)
3707     re_node_set_free (dests_node + j);
3708   return REG_MISSING;
3709 }
3710 
3711 #ifdef RE_ENABLE_I18N
3712 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3713    Return the number of the bytes the node accepts.
3714    STR_IDX is the current index of the input string.
3715 
3716    This function handles the nodes which can accept one character, or
3717    one collating element like '.', '[a-z]', opposite to the other nodes
3718    can only accept one byte.  */
3719 
3720 static int
3721 internal_function
check_node_accept_bytes(re_dfa_t * dfa,Idx node_idx,const re_string_t * input,Idx str_idx)3722 check_node_accept_bytes (re_dfa_t *dfa, Idx node_idx,
3723 			 const re_string_t *input, Idx str_idx)
3724 {
3725   const re_token_t *node = dfa->nodes + node_idx;
3726   int char_len, elem_len;
3727   Idx i;
3728 
3729   if (BE (node->type == OP_UTF8_PERIOD, 0))
3730     {
3731       unsigned char c = re_string_byte_at (input, str_idx), d;
3732       if (BE (c < 0xc2, 1))
3733 	return 0;
3734 
3735       if (str_idx + 2 > input->len)
3736 	return 0;
3737 
3738       d = re_string_byte_at (input, str_idx + 1);
3739       if (c < 0xe0)
3740 	return (d < 0x80 || d > 0xbf) ? 0 : 2;
3741       else if (c < 0xf0)
3742 	{
3743 	  char_len = 3;
3744 	  if (c == 0xe0 && d < 0xa0)
3745 	    return 0;
3746 	}
3747       else if (c < 0xf8)
3748 	{
3749 	  char_len = 4;
3750 	  if (c == 0xf0 && d < 0x90)
3751 	    return 0;
3752 	}
3753       else if (c < 0xfc)
3754 	{
3755 	  char_len = 5;
3756 	  if (c == 0xf8 && d < 0x88)
3757 	    return 0;
3758 	}
3759       else if (c < 0xfe)
3760 	{
3761 	  char_len = 6;
3762 	  if (c == 0xfc && d < 0x84)
3763 	    return 0;
3764 	}
3765       else
3766 	return 0;
3767 
3768       if (str_idx + char_len > input->len)
3769 	return 0;
3770 
3771       for (i = 1; i < char_len; ++i)
3772 	{
3773 	  d = re_string_byte_at (input, str_idx + i);
3774 	  if (d < 0x80 || d > 0xbf)
3775 	    return 0;
3776 	}
3777       return char_len;
3778     }
3779 
3780   char_len = re_string_char_size_at (input, str_idx);
3781   if (node->type == OP_PERIOD)
3782     {
3783       if (char_len <= 1)
3784         return 0;
3785       /* FIXME: I don't think this if is needed, as both '\n'
3786 	 and '\0' are char_len == 1.  */
3787       /* '.' accepts any one character except the following two cases.  */
3788       if ((!(dfa->syntax & REG_DOT_NEWLINE) &&
3789 	   re_string_byte_at (input, str_idx) == '\n') ||
3790 	  ((dfa->syntax & REG_DOT_NOT_NULL) &&
3791 	   re_string_byte_at (input, str_idx) == '\0'))
3792 	return 0;
3793       return char_len;
3794     }
3795 
3796   elem_len = re_string_elem_size_at (input, str_idx);
3797   if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3798     return 0;
3799 
3800   if (node->type == COMPLEX_BRACKET)
3801     {
3802       const re_charset_t *cset = node->opr.mbcset;
3803 # ifdef _LIBC
3804       const unsigned char *pin
3805 	= ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3806       Idx j;
3807       uint32_t nrules;
3808 # endif /* _LIBC */
3809       int match_len = 0;
3810       wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3811 		    ? re_string_wchar_at (input, str_idx) : 0);
3812 
3813       /* match with multibyte character?  */
3814       for (i = 0; i < cset->nmbchars; ++i)
3815 	if (wc == cset->mbchars[i])
3816 	  {
3817 	    match_len = char_len;
3818 	    goto check_node_accept_bytes_match;
3819 	  }
3820       /* match with character_class?  */
3821       for (i = 0; i < cset->nchar_classes; ++i)
3822 	{
3823 	  wctype_t wt = cset->char_classes[i];
3824 	  if (__iswctype (wc, wt))
3825 	    {
3826 	      match_len = char_len;
3827 	      goto check_node_accept_bytes_match;
3828 	    }
3829 	}
3830 
3831 # ifdef _LIBC
3832       nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3833       if (nrules != 0)
3834 	{
3835 	  unsigned int in_collseq = 0;
3836 	  const int32_t *table, *indirect;
3837 	  const unsigned char *weights, *extra;
3838 	  const char *collseqwc;
3839 	  int32_t idx;
3840 	  /* This #include defines a local function!  */
3841 #  include <locale/weight.h>
3842 
3843 	  /* match with collating_symbol?  */
3844 	  if (cset->ncoll_syms)
3845 	    extra = (const unsigned char *)
3846 	      _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3847 	  for (i = 0; i < cset->ncoll_syms; ++i)
3848 	    {
3849 	      const unsigned char *coll_sym = extra + cset->coll_syms[i];
3850 	      /* Compare the length of input collating element and
3851 		 the length of current collating element.  */
3852 	      if (*coll_sym != elem_len)
3853 		continue;
3854 	      /* Compare each bytes.  */
3855 	      for (j = 0; j < *coll_sym; j++)
3856 		if (pin[j] != coll_sym[1 + j])
3857 		  break;
3858 	      if (j == *coll_sym)
3859 		{
3860 		  /* Match if every bytes is equal.  */
3861 		  match_len = j;
3862 		  goto check_node_accept_bytes_match;
3863 		}
3864 	    }
3865 
3866 	  if (cset->nranges)
3867 	    {
3868 	      if (elem_len <= char_len)
3869 		{
3870 		  collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3871 		  in_collseq = __collseq_table_lookup (collseqwc, wc);
3872 		}
3873 	      else
3874 		in_collseq = find_collation_sequence_value (pin, elem_len);
3875 	    }
3876 	  /* match with range expression?  */
3877 	  for (i = 0; i < cset->nranges; ++i)
3878 	    if (cset->range_starts[i] <= in_collseq
3879 		&& in_collseq <= cset->range_ends[i])
3880 	      {
3881 		match_len = elem_len;
3882 		goto check_node_accept_bytes_match;
3883 	      }
3884 
3885 	  /* match with equivalence_class?  */
3886 	  if (cset->nequiv_classes)
3887 	    {
3888 	      const unsigned char *cp = pin;
3889 	      table = (const int32_t *)
3890 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3891 	      weights = (const unsigned char *)
3892 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3893 	      extra = (const unsigned char *)
3894 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3895 	      indirect = (const int32_t *)
3896 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3897 	      idx = findidx (&cp);
3898 	      if (idx > 0)
3899 		for (i = 0; i < cset->nequiv_classes; ++i)
3900 		  {
3901 		    int32_t equiv_class_idx = cset->equiv_classes[i];
3902 		    size_t weight_len = weights[idx];
3903 		    if (weight_len == weights[equiv_class_idx])
3904 		      {
3905 			Idx cnt = 0;
3906 			while (cnt <= weight_len
3907 			       && (weights[equiv_class_idx + 1 + cnt]
3908 				   == weights[idx + 1 + cnt]))
3909 			  ++cnt;
3910 			if (cnt > weight_len)
3911 			  {
3912 			    match_len = elem_len;
3913 			    goto check_node_accept_bytes_match;
3914 			  }
3915 		      }
3916 		  }
3917 	    }
3918 	}
3919       else
3920 # endif /* _LIBC */
3921 	{
3922 	  /* match with range expression?  */
3923 #if __GNUC__ >= 2
3924 	  wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3925 #else
3926 	  wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3927 	  cmp_buf[2] = wc;
3928 #endif
3929 	  for (i = 0; i < cset->nranges; ++i)
3930 	    {
3931 	      cmp_buf[0] = cset->range_starts[i];
3932 	      cmp_buf[4] = cset->range_ends[i];
3933 	      if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3934 		  && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3935 		{
3936 		  match_len = char_len;
3937 		  goto check_node_accept_bytes_match;
3938 		}
3939 	    }
3940 	}
3941     check_node_accept_bytes_match:
3942       if (!cset->non_match)
3943 	return match_len;
3944       else
3945 	{
3946 	  if (match_len > 0)
3947 	    return 0;
3948 	  else
3949 	    return (elem_len > char_len) ? elem_len : char_len;
3950 	}
3951     }
3952   return 0;
3953 }
3954 
3955 # ifdef _LIBC
3956 static unsigned int
find_collation_sequence_value(const unsigned char * mbs,size_t mbs_len)3957 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3958 {
3959   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3960   if (nrules == 0)
3961     {
3962       if (mbs_len == 1)
3963 	{
3964 	  /* No valid character.  Match it as a single byte character.  */
3965 	  const unsigned char *collseq = (const unsigned char *)
3966 	    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3967 	  return collseq[mbs[0]];
3968 	}
3969       return UINT_MAX;
3970     }
3971   else
3972     {
3973       int32_t idx;
3974       const unsigned char *extra = (const unsigned char *)
3975 	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3976       int32_t extrasize = (const unsigned char *)
3977 	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3978 
3979       for (idx = 0; idx < extrasize;)
3980 	{
3981 	  int mbs_cnt;
3982 	  bool found = false;
3983 	  int32_t elem_mbs_len;
3984 	  /* Skip the name of collating element name.  */
3985 	  idx = idx + extra[idx] + 1;
3986 	  elem_mbs_len = extra[idx++];
3987 	  if (mbs_len == elem_mbs_len)
3988 	    {
3989 	      for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3990 		if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3991 		  break;
3992 	      if (mbs_cnt == elem_mbs_len)
3993 		/* Found the entry.  */
3994 		found = true;
3995 	    }
3996 	  /* Skip the byte sequence of the collating element.  */
3997 	  idx += elem_mbs_len;
3998 	  /* Adjust for the alignment.  */
3999 	  idx = (idx + 3) & ~3;
4000 	  /* Skip the collation sequence value.  */
4001 	  idx += sizeof (uint32_t);
4002 	  /* Skip the wide char sequence of the collating element.  */
4003 	  idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
4004 	  /* If we found the entry, return the sequence value.  */
4005 	  if (found)
4006 	    return *(uint32_t *) (extra + idx);
4007 	  /* Skip the collation sequence value.  */
4008 	  idx += sizeof (uint32_t);
4009 	}
4010       return UINT_MAX;
4011     }
4012 }
4013 # endif /* _LIBC */
4014 #endif /* RE_ENABLE_I18N */
4015 
4016 /* Check whether the node accepts the byte which is IDX-th
4017    byte of the INPUT.  */
4018 
4019 static bool
4020 internal_function
check_node_accept(const re_match_context_t * mctx,const re_token_t * node,Idx idx)4021 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4022 		   Idx idx)
4023 {
4024   unsigned char ch;
4025   ch = re_string_byte_at (&mctx->input, idx);
4026   switch (node->type)
4027     {
4028     case CHARACTER:
4029       if (node->opr.c != ch)
4030         return false;
4031       break;
4032 
4033     case SIMPLE_BRACKET:
4034       if (!bitset_contain (node->opr.sbcset, ch))
4035         return false;
4036       break;
4037 
4038 #ifdef RE_ENABLE_I18N
4039     case OP_UTF8_PERIOD:
4040       if (ch >= 0x80)
4041         return false;
4042       /* FALLTHROUGH */
4043 #endif
4044     case OP_PERIOD:
4045       if ((ch == '\n' && !(mctx->dfa->syntax & REG_DOT_NEWLINE))
4046 	  || (ch == '\0' && (mctx->dfa->syntax & REG_DOT_NOT_NULL)))
4047 	return false;
4048       break;
4049 
4050     default:
4051       return false;
4052     }
4053 
4054   if (node->constraint)
4055     {
4056       /* The node has constraints.  Check whether the current context
4057 	 satisfies the constraints.  */
4058       unsigned int context = re_string_context_at (&mctx->input, idx,
4059 						   mctx->eflags);
4060       if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4061 	return false;
4062     }
4063 
4064   return true;
4065 }
4066 
4067 /* Extend the buffers, if the buffers have run out.  */
4068 
4069 static reg_errcode_t
4070 internal_function
extend_buffers(re_match_context_t * mctx)4071 extend_buffers (re_match_context_t *mctx)
4072 {
4073   reg_errcode_t ret;
4074   re_string_t *pstr = &mctx->input;
4075 
4076   /* Double the lengthes of the buffers.  */
4077   ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4078   if (BE (ret != REG_NOERROR, 0))
4079     return ret;
4080 
4081   if (mctx->state_log != NULL)
4082     {
4083       /* And double the length of state_log.  */
4084       /* XXX We have no indication of the size of this buffer.  If this
4085 	 allocation fail we have no indication that the state_log array
4086 	 does not have the right size.  */
4087       re_dfastate_t **new_array = re_xrealloc (mctx->state_log, re_dfastate_t *,
4088 					       pstr->bufs_len + 1);
4089       if (BE (new_array == NULL, 0))
4090 	return REG_ESPACE;
4091       mctx->state_log = new_array;
4092     }
4093 
4094   /* Then reconstruct the buffers.  */
4095   if (pstr->icase)
4096     {
4097 #ifdef RE_ENABLE_I18N
4098       if (pstr->mb_cur_max > 1)
4099 	{
4100 	  ret = build_wcs_upper_buffer (pstr);
4101 	  if (BE (ret != REG_NOERROR, 0))
4102 	    return ret;
4103 	}
4104       else
4105 #endif /* RE_ENABLE_I18N  */
4106 	build_upper_buffer (pstr);
4107     }
4108   else
4109     {
4110 #ifdef RE_ENABLE_I18N
4111       if (pstr->mb_cur_max > 1)
4112 	build_wcs_buffer (pstr);
4113       else
4114 #endif /* RE_ENABLE_I18N  */
4115 	{
4116 	  if (pstr->trans != NULL)
4117 	    re_string_translate_buffer (pstr);
4118 	}
4119     }
4120   return REG_NOERROR;
4121 }
4122 
4123 
4124 /* Functions for matching context.  */
4125 
4126 /* Initialize MCTX.  */
4127 
4128 static reg_errcode_t
4129 internal_function
match_ctx_init(re_match_context_t * mctx,int eflags,Idx n)4130 match_ctx_init (re_match_context_t *mctx, int eflags, Idx n)
4131 {
4132   mctx->eflags = eflags;
4133   mctx->match_last = REG_MISSING;
4134   if (n > 0)
4135     {
4136       mctx->bkref_ents = re_xmalloc (struct re_backref_cache_entry, n);
4137       mctx->sub_tops = re_xmalloc (re_sub_match_top_t *, n);
4138       if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4139 	return REG_ESPACE;
4140     }
4141   /* Already zero-ed by the caller.
4142      else
4143        mctx->bkref_ents = NULL;
4144      mctx->nbkref_ents = 0;
4145      mctx->nsub_tops = 0;  */
4146   mctx->abkref_ents = n;
4147   mctx->max_mb_elem_len = 1;
4148   mctx->asub_tops = n;
4149   return REG_NOERROR;
4150 }
4151 
4152 /* Clean the entries which depend on the current input in MCTX.
4153    This function must be invoked when the matcher changes the start index
4154    of the input, or changes the input string.  */
4155 
4156 static void
4157 internal_function
match_ctx_clean(re_match_context_t * mctx)4158 match_ctx_clean (re_match_context_t *mctx)
4159 {
4160   Idx st_idx;
4161   for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4162     {
4163       Idx sl_idx;
4164       re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4165       for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4166 	{
4167 	  re_sub_match_last_t *last = top->lasts[sl_idx];
4168 	  re_free (last->path.array);
4169 	  re_free (last);
4170 	}
4171       re_free (top->lasts);
4172       if (top->path)
4173 	{
4174 	  re_free (top->path->array);
4175 	  re_free (top->path);
4176 	}
4177       free (top);
4178     }
4179 
4180   mctx->nsub_tops = 0;
4181   mctx->nbkref_ents = 0;
4182 }
4183 
4184 /* Free all the memory associated with MCTX.  */
4185 
4186 static void
4187 internal_function
match_ctx_free(re_match_context_t * mctx)4188 match_ctx_free (re_match_context_t *mctx)
4189 {
4190   /* First, free all the memory associated with MCTX->SUB_TOPS.  */
4191   match_ctx_clean (mctx);
4192   re_free (mctx->sub_tops);
4193   re_free (mctx->bkref_ents);
4194 }
4195 
4196 /* Add a new backreference entry to MCTX.
4197    Note that we assume that caller never call this function with duplicate
4198    entry, and call with STR_IDX which isn't smaller than any existing entry.
4199 */
4200 
4201 static reg_errcode_t
4202 internal_function
match_ctx_add_entry(re_match_context_t * mctx,Idx node,Idx str_idx,Idx from,Idx to)4203 match_ctx_add_entry (re_match_context_t *mctx, Idx node, Idx str_idx,
4204 		     Idx from, Idx to)
4205 {
4206   if (mctx->nbkref_ents >= mctx->abkref_ents)
4207     {
4208       struct re_backref_cache_entry* new_entry;
4209       new_entry = re_x2realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4210 				&mctx->abkref_ents);
4211       if (BE (new_entry == NULL, 0))
4212 	{
4213 	  re_free (mctx->bkref_ents);
4214 	  return REG_ESPACE;
4215 	}
4216       mctx->bkref_ents = new_entry;
4217       memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4218 	      (sizeof (struct re_backref_cache_entry)
4219 	       * (mctx->abkref_ents - mctx->nbkref_ents)));
4220     }
4221   if (mctx->nbkref_ents > 0
4222       && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4223     mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4224 
4225   mctx->bkref_ents[mctx->nbkref_ents].node = node;
4226   mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4227   mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4228   mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4229 
4230   /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4231      If bit N is clear, means that this entry won't epsilon-transition to
4232      an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
4233      it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4234      such node.
4235 
4236      A backreference does not epsilon-transition unless it is empty, so set
4237      to all zeros if FROM != TO.  */
4238   mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4239     = (from == to ? -1 : 0);
4240 
4241   mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4242   if (mctx->max_mb_elem_len < to - from)
4243     mctx->max_mb_elem_len = to - from;
4244   return REG_NOERROR;
4245 }
4246 
4247 /* Return the first entry with the same str_idx, or REG_MISSING if none is
4248    found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4249 
4250 static Idx
4251 internal_function
search_cur_bkref_entry(const re_match_context_t * mctx,Idx str_idx)4252 search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
4253 {
4254   Idx left, right, mid, last;
4255   last = right = mctx->nbkref_ents;
4256   for (left = 0; left < right;)
4257     {
4258       mid = (left + right) / 2;
4259       if (mctx->bkref_ents[mid].str_idx < str_idx)
4260 	left = mid + 1;
4261       else
4262 	right = mid;
4263     }
4264   if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4265     return left;
4266   else
4267     return REG_MISSING;
4268 }
4269 
4270 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4271    at STR_IDX.  */
4272 
4273 static reg_errcode_t
4274 internal_function
match_ctx_add_subtop(re_match_context_t * mctx,Idx node,Idx str_idx)4275 match_ctx_add_subtop (re_match_context_t *mctx, Idx node, Idx str_idx)
4276 {
4277 #ifdef DEBUG
4278   assert (mctx->sub_tops != NULL);
4279   assert (mctx->asub_tops > 0);
4280 #endif
4281   if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4282     {
4283       Idx new_asub_tops = mctx->asub_tops;
4284       re_sub_match_top_t **new_array = re_x2realloc (mctx->sub_tops,
4285 						     re_sub_match_top_t *,
4286 						     &new_asub_tops);
4287       if (BE (new_array == NULL, 0))
4288 	return REG_ESPACE;
4289       mctx->sub_tops = new_array;
4290       mctx->asub_tops = new_asub_tops;
4291     }
4292   mctx->sub_tops[mctx->nsub_tops] = re_calloc (re_sub_match_top_t, 1);
4293   if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4294     return REG_ESPACE;
4295   mctx->sub_tops[mctx->nsub_tops]->node = node;
4296   mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4297   return REG_NOERROR;
4298 }
4299 
4300 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4301    at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.  */
4302 
4303 static re_sub_match_last_t *
4304 internal_function
match_ctx_add_sublast(re_sub_match_top_t * subtop,Idx node,Idx str_idx)4305 match_ctx_add_sublast (re_sub_match_top_t *subtop, Idx node, Idx str_idx)
4306 {
4307   re_sub_match_last_t *new_entry;
4308   if (BE (subtop->nlasts == subtop->alasts, 0))
4309     {
4310       Idx new_alasts = subtop->alasts;
4311       re_sub_match_last_t **new_array = re_x2realloc (subtop->lasts,
4312 						      re_sub_match_last_t *,
4313 						      &new_alasts);
4314       if (BE (new_array == NULL, 0))
4315 	return NULL;
4316       subtop->lasts = new_array;
4317       subtop->alasts = new_alasts;
4318     }
4319   new_entry = re_calloc (re_sub_match_last_t, 1);
4320   if (BE (new_entry != NULL, 1))
4321     {
4322       subtop->lasts[subtop->nlasts] = new_entry;
4323       new_entry->node = node;
4324       new_entry->str_idx = str_idx;
4325       ++subtop->nlasts;
4326     }
4327   return new_entry;
4328 }
4329 
4330 static void
4331 internal_function
sift_ctx_init(re_sift_context_t * sctx,re_dfastate_t ** sifted_sts,re_dfastate_t ** limited_sts,Idx last_node,Idx last_str_idx)4332 sift_ctx_init (re_sift_context_t *sctx,
4333 	       re_dfastate_t **sifted_sts,
4334 	       re_dfastate_t **limited_sts,
4335 	       Idx last_node, Idx last_str_idx)
4336 {
4337   sctx->sifted_states = sifted_sts;
4338   sctx->limited_states = limited_sts;
4339   sctx->last_node = last_node;
4340   sctx->last_str_idx = last_str_idx;
4341   re_node_set_init_empty (&sctx->limits);
4342 }
4343