1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
19 #include <sys/cdefs.h>
20 __RCSID("$NetBSD: regexec.c,v 1.3 2016/05/17 14:00:09 christos Exp $");
21
22
23 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
24 Idx n) internal_function;
25 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
26 static void match_ctx_free (re_match_context_t *cache) internal_function;
27 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, Idx node,
28 Idx str_idx, Idx from, Idx to)
29 internal_function;
30 static Idx search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
31 internal_function;
32 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, Idx node,
33 Idx str_idx) internal_function;
34 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
35 Idx node, Idx str_idx)
36 internal_function;
37 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
38 re_dfastate_t **limited_sts, Idx last_node,
39 Idx last_str_idx)
40 internal_function;
41 static reg_errcode_t re_search_internal (const regex_t *preg,
42 const char *string, Idx length,
43 Idx start, Idx last_start, Idx stop,
44 size_t nmatch, regmatch_t pmatch[],
45 int eflags) internal_function;
46 static regoff_t re_search_2_stub (struct re_pattern_buffer *bufp,
47 const char *string1, Idx length1,
48 const char *string2, Idx length2,
49 Idx start, regoff_t range,
50 struct re_registers *regs,
51 Idx stop, bool ret_len) internal_function;
52 static regoff_t re_search_stub (struct re_pattern_buffer *bufp,
53 const char *string, Idx length, Idx start,
54 regoff_t range, Idx stop,
55 struct re_registers *regs,
56 bool ret_len) internal_function;
57 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
58 Idx nregs, int regs_allocated) internal_function;
59 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
60 internal_function;
61 static Idx check_matching (re_match_context_t *mctx, bool fl_longest_match,
62 Idx *p_match_first)
63 internal_function;
64 static Idx check_halt_state_context (const re_match_context_t *mctx,
65 const re_dfastate_t *state, Idx idx)
66 internal_function;
67 static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch,
68 regmatch_t *prev_idx_match, Idx cur_node,
69 Idx cur_idx, Idx nmatch) internal_function;
70 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
71 Idx str_idx, Idx dest_node, Idx nregs,
72 regmatch_t *regs,
73 re_node_set *eps_via_nodes) internal_function;
74 static reg_errcode_t set_regs (const regex_t *preg,
75 const re_match_context_t *mctx,
76 size_t nmatch, regmatch_t *pmatch,
77 bool fl_backtrack) internal_function;
78 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs) internal_function;
79
80 #ifdef RE_ENABLE_I18N
81 static int sift_states_iter_mb (const re_match_context_t *mctx,
82 re_sift_context_t *sctx,
83 Idx node_idx, Idx str_idx, Idx max_str_idx) internal_function;
84 #endif /* RE_ENABLE_I18N */
85 static reg_errcode_t sift_states_backward (re_match_context_t *mctx,
86 re_sift_context_t *sctx) internal_function;
87 static reg_errcode_t build_sifted_states (re_match_context_t *mctx,
88 re_sift_context_t *sctx, Idx str_idx,
89 re_node_set *cur_dest) internal_function;
90 static reg_errcode_t update_cur_sifted_state (re_match_context_t *mctx,
91 re_sift_context_t *sctx,
92 Idx str_idx,
93 re_node_set *dest_nodes) internal_function;
94 static reg_errcode_t add_epsilon_src_nodes (re_dfa_t *dfa,
95 re_node_set *dest_nodes,
96 const re_node_set *candidates) internal_function;
97 static bool check_dst_limits (const re_match_context_t *mctx,
98 const re_node_set *limits,
99 Idx dst_node, Idx dst_idx, Idx src_node,
100 Idx src_idx) internal_function;
101 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
102 int boundaries, Idx subexp_idx,
103 Idx from_node, Idx bkref_idx) internal_function;
104 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
105 Idx limit, Idx subexp_idx,
106 Idx node, Idx str_idx,
107 Idx bkref_idx) internal_function;
108 static reg_errcode_t check_subexp_limits (re_dfa_t *dfa,
109 re_node_set *dest_nodes,
110 const re_node_set *candidates,
111 re_node_set *limits,
112 struct re_backref_cache_entry *bkref_ents,
113 Idx str_idx) internal_function;
114 static reg_errcode_t sift_states_bkref (re_match_context_t *mctx,
115 re_sift_context_t *sctx,
116 Idx str_idx, const re_node_set *candidates) internal_function;
117 static reg_errcode_t merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst,
118 re_dfastate_t **src, Idx num) internal_function;
119 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
120 re_match_context_t *mctx) internal_function;
121 static re_dfastate_t *transit_state (reg_errcode_t *err,
122 re_match_context_t *mctx,
123 re_dfastate_t *state) internal_function;
124 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
125 re_match_context_t *mctx,
126 re_dfastate_t *next_state) internal_function;
127 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
128 re_node_set *cur_nodes,
129 Idx str_idx) internal_function;
130 #if 0
131 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
132 re_match_context_t *mctx,
133 re_dfastate_t *pstate) internal_function;
134 #endif
135 #ifdef RE_ENABLE_I18N
136 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
137 re_dfastate_t *pstate) internal_function;
138 #endif /* RE_ENABLE_I18N */
139 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
140 const re_node_set *nodes) internal_function;
141 static reg_errcode_t get_subexp (re_match_context_t *mctx,
142 Idx bkref_node, Idx bkref_str_idx) internal_function;
143 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
144 const re_sub_match_top_t *sub_top,
145 re_sub_match_last_t *sub_last,
146 Idx bkref_node, Idx bkref_str) internal_function;
147 static Idx find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
148 Idx subexp_idx, int type) internal_function;
149 static reg_errcode_t check_arrival (re_match_context_t *mctx,
150 state_array_t *path, Idx top_node,
151 Idx top_str, Idx last_node, Idx last_str,
152 int type) internal_function;
153 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
154 Idx str_idx,
155 re_node_set *cur_nodes,
156 re_node_set *next_nodes) internal_function;
157 static reg_errcode_t check_arrival_expand_ecl (re_dfa_t *dfa,
158 re_node_set *cur_nodes,
159 Idx ex_subexp, int type) internal_function;
160 static reg_errcode_t check_arrival_expand_ecl_sub (re_dfa_t *dfa,
161 re_node_set *dst_nodes,
162 Idx target, Idx ex_subexp,
163 int type) internal_function;
164 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
165 re_node_set *cur_nodes, Idx cur_str,
166 Idx subexp_num, int type) internal_function;
167 static bool build_trtable (re_dfa_t *dfa,
168 re_dfastate_t *state) internal_function;
169 #ifdef RE_ENABLE_I18N
170 static int check_node_accept_bytes (re_dfa_t *dfa, Idx node_idx,
171 const re_string_t *input, Idx idx) internal_function;
172 # ifdef _LIBC
173 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
174 size_t name_len) internal_function;
175 # endif /* _LIBC */
176 #endif /* RE_ENABLE_I18N */
177 static Idx group_nodes_into_DFAstates (const re_dfa_t *dfa,
178 const re_dfastate_t *state,
179 re_node_set *states_node,
180 bitset *states_ch) internal_function;
181 static bool check_node_accept (const re_match_context_t *mctx,
182 const re_token_t *node, Idx idx)
183 internal_function;
184 static reg_errcode_t extend_buffers (re_match_context_t *mctx) internal_function;
185
186 /* Entry point for POSIX code. */
187
188 /* regexec searches for a given pattern, specified by PREG, in the
189 string STRING.
190
191 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
192 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
193 least NMATCH elements, and we set them to the offsets of the
194 corresponding matched substrings.
195
196 EFLAGS specifies `execution flags' which affect matching: if
197 REG_NOTBOL is set, then ^ does not match at the beginning of the
198 string; if REG_NOTEOL is set, then $ does not match at the end.
199
200 We return 0 if we find a match and REG_NOMATCH if not. */
201
202 int
regexec(const regex_t * __restrict preg,const char * __restrict string,size_t nmatch,regmatch_t pmatch[],int eflags)203 regexec (const regex_t *__restrict preg, const char *__restrict string,
204 size_t nmatch, regmatch_t pmatch[], int eflags)
205 {
206 reg_errcode_t err;
207 Idx start, length;
208 #ifdef _LIBC
209 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
210 #endif
211
212 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
213 return REG_BADPAT;
214
215 if (eflags & REG_STARTEND)
216 {
217 start = pmatch[0].rm_so;
218 length = pmatch[0].rm_eo;
219 }
220 else
221 {
222 start = 0;
223 length = strlen (string);
224 }
225
226 __libc_lock_lock (dfa->lock);
227 if (preg->re_no_sub)
228 err = re_search_internal (preg, string, length, start, length,
229 length, 0, NULL, eflags);
230 else
231 err = re_search_internal (preg, string, length, start, length,
232 length, nmatch, pmatch, eflags);
233 __libc_lock_unlock (dfa->lock);
234 return err != REG_NOERROR;
235 }
236
237 #ifdef _LIBC
238 # include <shlib-compat.h>
239 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
240
241 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
242 __typeof__ (__regexec) __compat_regexec;
243
244 int
245 attribute_compat_text_section
__compat_regexec(const regex_t * __restrict preg,const char * __restrict string,size_t nmatch,regmatch_t pmatch[],int eflags)246 __compat_regexec (const regex_t *__restrict preg,
247 const char *__restrict string, size_t nmatch,
248 regmatch_t pmatch[], int eflags)
249 {
250 return regexec (preg, string, nmatch, pmatch,
251 eflags & (REG_NOTBOL | REG_NOTEOL));
252 }
253 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
254 # endif
255 #endif
256
257 /* Entry points for GNU code. */
258
259 /* re_match, re_search, re_match_2, re_search_2
260
261 The former two functions operate on STRING with length LENGTH,
262 while the later two operate on concatenation of STRING1 and STRING2
263 with lengths LENGTH1 and LENGTH2, respectively.
264
265 re_match() matches the compiled pattern in BUFP against the string,
266 starting at index START.
267
268 re_search() first tries matching at index START, then it tries to match
269 starting from index START + 1, and so on. The last start position tried
270 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
271 way as re_match().)
272
273 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
274 the first STOP characters of the concatenation of the strings should be
275 concerned.
276
277 If REGS is not NULL, and BUFP->re_no_sub is not set, the offsets of the match
278 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
279 computed relative to the concatenation, not relative to the individual
280 strings.)
281
282 On success, re_match* functions return the length of the match, re_search*
283 return the position of the start of the match. Return value -1 means no
284 match was found and -2 indicates an internal error. */
285
286 regoff_t
re_match(struct re_pattern_buffer * bufp,const char * string,Idx length,Idx start,struct re_registers * regs)287 re_match (struct re_pattern_buffer *bufp, const char *string,
288 Idx length, Idx start, struct re_registers *regs)
289 {
290 return re_search_stub (bufp, string, length, start, 0, length, regs, true);
291 }
292 #ifdef _LIBC
weak_alias(__re_match,re_match)293 weak_alias (__re_match, re_match)
294 #endif
295
296 regoff_t
297 re_search (struct re_pattern_buffer *bufp, const char *string,
298 Idx length, Idx start, regoff_t range, struct re_registers *regs)
299 {
300 return re_search_stub (bufp, string, length, start, range, length, regs,
301 false);
302 }
303 #ifdef _LIBC
weak_alias(__re_search,re_search)304 weak_alias (__re_search, re_search)
305 #endif
306
307 regoff_t
308 re_match_2 (struct re_pattern_buffer *bufp,
309 const char *string1, Idx length1,
310 const char *string2, Idx length2,
311 Idx start, struct re_registers *regs, Idx stop)
312 {
313 return re_search_2_stub (bufp, string1, length1, string2, length2,
314 start, 0, regs, stop, true);
315 }
316 #ifdef _LIBC
weak_alias(__re_match_2,re_match_2)317 weak_alias (__re_match_2, re_match_2)
318 #endif
319
320 regoff_t
321 re_search_2 (struct re_pattern_buffer *bufp,
322 const char *string1, Idx length1,
323 const char *string2, Idx length2,
324 Idx start, regoff_t range, struct re_registers *regs, Idx stop)
325 {
326 return re_search_2_stub (bufp, string1, length1, string2, length2,
327 start, range, regs, stop, false);
328 }
329 #ifdef _LIBC
weak_alias(__re_search_2,re_search_2)330 weak_alias (__re_search_2, re_search_2)
331 #endif
332
333 static regoff_t
334 internal_function
335 re_search_2_stub (struct re_pattern_buffer *bufp,
336 const char *string1, Idx length1,
337 const char *string2, Idx length2,
338 Idx start, regoff_t range, struct re_registers *regs,
339 Idx stop, bool ret_len)
340 {
341 const char *str;
342 regoff_t rval;
343 Idx len = length1 + length2;
344 char *s = NULL;
345
346 if (BE (length1 < 0 || length2 < 0 || stop < 0 || len < length1, 0))
347 return -2;
348
349 /* Concatenate the strings. */
350 if (length2 > 0)
351 if (length1 > 0)
352 {
353 s = re_malloc (char, len);
354
355 if (BE (s == NULL, 0))
356 return -2;
357 memcpy (s, string1, length1);
358 memcpy (s + length1, string2, length2);
359 str = s;
360 }
361 else
362 str = string2;
363 else
364 str = string1;
365
366 rval = re_search_stub (bufp, str, len, start, range, stop, regs,
367 ret_len);
368 re_free (s);
369 return rval;
370 }
371
372 /* The parameters have the same meaning as those of re_search.
373 Additional parameters:
374 If RET_LEN is true the length of the match is returned (re_match style);
375 otherwise the position of the match is returned. */
376
377 static regoff_t
378 internal_function
re_search_stub(struct re_pattern_buffer * bufp,const char * string,Idx length,Idx start,regoff_t range,Idx stop,struct re_registers * regs,bool ret_len)379 re_search_stub (struct re_pattern_buffer *bufp,
380 const char *string, Idx length,
381 Idx start, regoff_t range, Idx stop, struct re_registers *regs,
382 bool ret_len)
383 {
384 reg_errcode_t result;
385 regmatch_t *pmatch;
386 Idx nregs;
387 regoff_t rval;
388 int eflags = 0;
389 #ifdef _LIBC
390 re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer;
391 #endif
392 Idx last_start = start + range;
393
394 /* Check for out-of-range. */
395 if (BE (start < 0 || start > length, 0))
396 return -1;
397 if (sizeof start < sizeof range)
398 {
399 regoff_t length_offset = length;
400 regoff_t start_offset = start;
401 if (BE (length_offset - start_offset < range, 0))
402 last_start = length;
403 else if (BE (range < - start_offset, 0))
404 last_start = 0;
405 }
406 else
407 {
408 if (BE ((last_start < start) != (range < 0), 0))
409 {
410 /* Overflow occurred when computing last_start; substitute
411 the extreme value. */
412 last_start = range < 0 ? 0 : length;
413 }
414 else
415 {
416 if (BE (length < last_start, 0))
417 last_start = length;
418 else if (BE (last_start < 0, 0))
419 last_start = 0;
420 }
421 }
422
423 __libc_lock_lock (dfa->lock);
424
425 eflags |= (bufp->re_not_bol) ? REG_NOTBOL : 0;
426 eflags |= (bufp->re_not_eol) ? REG_NOTEOL : 0;
427
428 /* Compile fastmap if we haven't yet. */
429 if (start < last_start && bufp->re_fastmap != NULL
430 && !bufp->re_fastmap_accurate)
431 re_compile_fastmap (bufp);
432
433 if (BE (bufp->re_no_sub, 0))
434 regs = NULL;
435
436 /* We need at least 1 register. */
437 if (regs == NULL)
438 nregs = 1;
439 else if (BE (bufp->re_regs_allocated == REG_FIXED
440 && regs->rm_num_regs <= bufp->re_nsub, 0))
441 {
442 nregs = regs->rm_num_regs;
443 if (BE (nregs < 1, 0))
444 {
445 /* Nothing can be copied to regs. */
446 regs = NULL;
447 nregs = 1;
448 }
449 }
450 else
451 nregs = bufp->re_nsub + 1;
452 pmatch = re_xmalloc (regmatch_t, nregs);
453 if (BE (pmatch == NULL, 0))
454 {
455 rval = -2;
456 goto out;
457 }
458
459 result = re_search_internal (bufp, string, length, start, last_start, stop,
460 nregs, pmatch, eflags);
461
462 rval = 0;
463
464 /* I hope we needn't fill ther regs with -1's when no match was found. */
465 if (result != REG_NOERROR)
466 rval = -1;
467 else if (regs != NULL)
468 {
469 /* If caller wants register contents data back, copy them. */
470 bufp->re_regs_allocated = re_copy_regs (regs, pmatch, nregs,
471 bufp->re_regs_allocated);
472 if (BE (bufp->re_regs_allocated == REG_UNALLOCATED, 0))
473 rval = -2;
474 }
475
476 if (BE (rval == 0, 1))
477 {
478 if (ret_len)
479 {
480 assert (pmatch[0].rm_so == start);
481 rval = pmatch[0].rm_eo - start;
482 }
483 else
484 rval = pmatch[0].rm_so;
485 }
486 re_free (pmatch);
487 out:
488 __libc_lock_unlock (dfa->lock);
489 return rval;
490 }
491
492 static unsigned
493 internal_function
re_copy_regs(struct re_registers * regs,regmatch_t * pmatch,Idx nregs,int regs_allocated)494 re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, Idx nregs,
495 int regs_allocated)
496 {
497 int rval = REG_REALLOCATE;
498 Idx i;
499 Idx need_regs = nregs + 1;
500 /* We need one extra element beyond `rm_num_regs' for the `-1' marker GNU code
501 uses. */
502
503 /* Have the register data arrays been allocated? */
504 if (regs_allocated == REG_UNALLOCATED)
505 { /* No. So allocate them with malloc. */
506 regs->rm_start = re_xmalloc (regoff_t, need_regs);
507 regs->rm_end = re_malloc (regoff_t, need_regs);
508 if (BE (regs->rm_start == NULL, 0) || BE (regs->rm_end == NULL, 0))
509 return REG_UNALLOCATED;
510 regs->rm_num_regs = need_regs;
511 }
512 else if (regs_allocated == REG_REALLOCATE)
513 { /* Yes. If we need more elements than were already
514 allocated, reallocate them. If we need fewer, just
515 leave it alone. */
516 if (BE (need_regs > regs->rm_num_regs, 0))
517 {
518 regoff_t *new_start =
519 re_xrealloc (regs->rm_start, regoff_t, need_regs);
520 regoff_t *new_end = re_realloc (regs->rm_end, regoff_t, need_regs);
521 if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
522 return REG_UNALLOCATED;
523 regs->rm_start = new_start;
524 regs->rm_end = new_end;
525 regs->rm_num_regs = need_regs;
526 }
527 }
528 else
529 {
530 assert (regs_allocated == REG_FIXED);
531 /* This function may not be called with REG_FIXED and nregs too big. */
532 assert (regs->rm_num_regs >= nregs);
533 rval = REG_FIXED;
534 }
535
536 /* Copy the regs. */
537 for (i = 0; i < nregs; ++i)
538 {
539 regs->rm_start[i] = pmatch[i].rm_so;
540 regs->rm_end[i] = pmatch[i].rm_eo;
541 }
542 for ( ; i < regs->rm_num_regs; ++i)
543 regs->rm_start[i] = regs->rm_end[i] = -1;
544
545 return rval;
546 }
547
548 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
549 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
550 this memory for recording register information. STARTS and ENDS
551 must be allocated using the malloc library routine, and must each
552 be at least NUM_REGS * sizeof (regoff_t) bytes long.
553
554 If NUM_REGS == 0, then subsequent matches should allocate their own
555 register data.
556
557 Unless this function is called, the first search or match using
558 PATTERN_BUFFER will allocate its own register data, without
559 freeing the old data. */
560
561 void
re_set_registers(struct re_pattern_buffer * bufp,struct re_registers * regs,__re_size_t num_regs,regoff_t * starts,regoff_t * ends)562 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
563 __re_size_t num_regs, regoff_t *starts, regoff_t *ends)
564 {
565 if (num_regs)
566 {
567 bufp->re_regs_allocated = REG_REALLOCATE;
568 regs->rm_num_regs = num_regs;
569 regs->rm_start = starts;
570 regs->rm_end = ends;
571 }
572 else
573 {
574 bufp->re_regs_allocated = REG_UNALLOCATED;
575 regs->rm_num_regs = 0;
576 regs->rm_start = regs->rm_end = NULL;
577 }
578 }
579 #ifdef _LIBC
weak_alias(__re_set_registers,re_set_registers)580 weak_alias (__re_set_registers, re_set_registers)
581 #endif
582
583 /* Entry points compatible with 4.2 BSD regex library. We don't define
584 them unless specifically requested. */
585
586 #if defined _REGEX_RE_COMP || defined _LIBC
587 int
588 # ifdef _LIBC
589 weak_function
590 # endif
591 re_exec (const char *s)
592 {
593 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
594 }
595 #endif /* _REGEX_RE_COMP */
596
597 /* Internal entry point. */
598
599 /* Searches for a compiled pattern PREG in the string STRING, whose
600 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
601 meaning as with regexec. LAST_START is START + RANGE, where
602 START and RANGE have the same meaning as with re_search.
603 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
604 otherwise return the error code.
605 Note: We assume front end functions already check ranges.
606 (0 <= LAST_START && LAST_START <= LENGTH) */
607
608 static reg_errcode_t
609 internal_function
re_search_internal(const regex_t * preg,const char * string,Idx length,Idx start,Idx last_start,Idx stop,size_t nmatch,regmatch_t pmatch[],int eflags)610 re_search_internal (const regex_t *preg,
611 const char *string, Idx length,
612 Idx start, Idx last_start, Idx stop,
613 size_t nmatch, regmatch_t pmatch[],
614 int eflags)
615 {
616 reg_errcode_t err;
617 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
618 Idx left_lim, right_lim;
619 int incr;
620 bool fl_longest_match;
621 int match_kind;
622 Idx match_first, match_last = REG_MISSING;
623 Idx extra_nmatch;
624 bool sb;
625 int ch;
626 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
627 re_match_context_t mctx = { .dfa = dfa };
628 #else
629 re_match_context_t mctx;
630 #endif
631 char *fastmap = ((preg->re_fastmap != NULL && preg->re_fastmap_accurate
632 && start != last_start && !preg->re_can_be_null)
633 ? preg->re_fastmap : NULL);
634 unsigned REG_TRANSLATE_TYPE t =
635 (unsigned REG_TRANSLATE_TYPE) preg->re_translate;
636
637 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
638 memset (&mctx, '\0', sizeof (re_match_context_t));
639 mctx.dfa = dfa;
640 #endif
641
642 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
643 nmatch -= extra_nmatch;
644
645 /* Check if the DFA haven't been compiled. */
646 if (BE (preg->re_used == 0 || dfa->init_state == NULL
647 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
648 || dfa->init_state_begbuf == NULL, 0))
649 return REG_NOMATCH;
650
651 #ifdef DEBUG
652 /* We assume front-end functions already check them. */
653 assert (0 <= last_start && last_start <= length);
654 #endif
655
656 /* If initial states with non-begbuf contexts have no elements,
657 the regex must be anchored. If preg->re_newline_anchor is set,
658 we'll never use init_state_nl, so do not check it. */
659 if (dfa->init_state->nodes.nelem == 0
660 && dfa->init_state_word->nodes.nelem == 0
661 && (dfa->init_state_nl->nodes.nelem == 0
662 || !preg->re_newline_anchor))
663 {
664 if (start != 0 && last_start != 0)
665 return REG_NOMATCH;
666 start = last_start = 0;
667 }
668
669 /* We must check the longest matching, if nmatch > 0. */
670 fl_longest_match = (nmatch != 0 || dfa->nbackref);
671
672 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
673 preg->re_translate,
674 preg->re_syntax & REG_IGNORE_CASE, dfa);
675 if (BE (err != REG_NOERROR, 0))
676 goto free_return;
677 mctx.input.stop = stop;
678 mctx.input.raw_stop = stop;
679 mctx.input.newline_anchor = preg->re_newline_anchor;
680
681 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
682 if (BE (err != REG_NOERROR, 0))
683 goto free_return;
684
685 /* We will log all the DFA states through which the dfa pass,
686 if nmatch > 1, or this dfa has "multibyte node", which is a
687 back-reference or a node which can accept multibyte character or
688 multi character collating element. */
689 if (nmatch > 1 || dfa->has_mb_node)
690 {
691 mctx.state_log = re_xmalloc (re_dfastate_t *, mctx.input.bufs_len + 1);
692 if (BE (mctx.state_log == NULL, 0))
693 {
694 err = REG_ESPACE;
695 goto free_return;
696 }
697 }
698 else
699 mctx.state_log = NULL;
700
701 match_first = start;
702 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
703 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
704
705 /* Check incrementally whether of not the input string match. */
706 incr = (last_start < start) ? -1 : 1;
707 left_lim = (last_start < start) ? last_start : start;
708 right_lim = (last_start < start) ? start : last_start;
709 sb = dfa->mb_cur_max == 1;
710 match_kind =
711 (fastmap
712 ? ((sb || !(preg->re_syntax & REG_IGNORE_CASE || t) ? 4 : 0)
713 | (start <= last_start ? 2 : 0)
714 | (t != NULL ? 1 : 0))
715 : 8);
716
717 for (;; match_first += incr)
718 {
719 err = REG_NOMATCH;
720 if (match_first < left_lim || right_lim < match_first)
721 goto free_return;
722
723 /* Advance as rapidly as possible through the string, until we
724 find a plausible place to start matching. This may be done
725 with varying efficiency, so there are various possibilities:
726 only the most common of them are specialized, in order to
727 save on code size. We use a switch statement for speed. */
728 switch (match_kind)
729 {
730 case 8:
731 /* No fastmap. */
732 break;
733
734 case 7:
735 /* Fastmap with single-byte translation, match forward. */
736 while (BE (match_first < right_lim, 1)
737 && !fastmap[t[(unsigned char) string[match_first]]])
738 ++match_first;
739 goto forward_match_found_start_or_reached_end;
740
741 case 6:
742 /* Fastmap without translation, match forward. */
743 while (BE (match_first < right_lim, 1)
744 && !fastmap[(unsigned char) string[match_first]])
745 ++match_first;
746
747 forward_match_found_start_or_reached_end:
748 if (BE (match_first == right_lim, 0))
749 {
750 ch = match_first >= length
751 ? 0 : (unsigned char) string[match_first];
752 if (!fastmap[t ? t[ch] : ch])
753 goto free_return;
754 }
755 break;
756
757 case 4:
758 case 5:
759 /* Fastmap without multi-byte translation, match backwards. */
760 while (match_first >= left_lim)
761 {
762 ch = match_first >= length
763 ? 0 : (unsigned char) string[match_first];
764 if (fastmap[t ? t[ch] : ch])
765 break;
766 --match_first;
767 }
768 if (match_first < left_lim)
769 goto free_return;
770 break;
771
772 default:
773 /* In this case, we can't determine easily the current byte,
774 since it might be a component byte of a multibyte
775 character. Then we use the constructed buffer instead. */
776 for (;;)
777 {
778 /* If MATCH_FIRST is out of the valid range, reconstruct the
779 buffers. */
780 __re_size_t offset = match_first - mctx.input.raw_mbs_idx;
781 if (BE (offset >= (__re_size_t) mctx.input.valid_raw_len, 0))
782 {
783 err = re_string_reconstruct (&mctx.input, match_first,
784 eflags);
785 if (BE (err != REG_NOERROR, 0))
786 goto free_return;
787
788 offset = match_first - mctx.input.raw_mbs_idx;
789 }
790 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
791 Note that MATCH_FIRST must not be smaller than 0. */
792 ch = (match_first >= length
793 ? 0 : re_string_byte_at (&mctx.input, offset));
794 if (fastmap[ch])
795 break;
796 match_first += incr;
797 if (match_first < left_lim || match_first > right_lim)
798 {
799 err = REG_NOMATCH;
800 goto free_return;
801 }
802 }
803 break;
804 }
805
806 /* Reconstruct the buffers so that the matcher can assume that
807 the matching starts from the beginning of the buffer. */
808 err = re_string_reconstruct (&mctx.input, match_first, eflags);
809 if (BE (err != REG_NOERROR, 0))
810 goto free_return;
811
812 #ifdef RE_ENABLE_I18N
813 /* Don't consider this char as a possible match start if it part,
814 yet isn't the head, of a multibyte character. */
815 if (!sb && !re_string_first_byte (&mctx.input, 0))
816 continue;
817 #endif
818
819 /* It seems to be appropriate one, then use the matcher. */
820 /* We assume that the matching starts from 0. */
821 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
822 match_last = check_matching (&mctx, fl_longest_match,
823 start <= last_start ? &match_first : NULL);
824 if (match_last != REG_MISSING)
825 {
826 if (BE (match_last == REG_ERROR, 0))
827 {
828 err = REG_ESPACE;
829 goto free_return;
830 }
831 else
832 {
833 mctx.match_last = match_last;
834 if ((!preg->re_no_sub && nmatch > 1) || dfa->nbackref)
835 {
836 re_dfastate_t *pstate = mctx.state_log[match_last];
837 mctx.last_node = check_halt_state_context (&mctx, pstate,
838 match_last);
839 }
840 if ((!preg->re_no_sub && nmatch > 1 && dfa->has_plural_match)
841 || dfa->nbackref)
842 {
843 err = prune_impossible_nodes (&mctx);
844 if (err == REG_NOERROR)
845 break;
846 if (BE (err != REG_NOMATCH, 0))
847 goto free_return;
848 match_last = REG_MISSING;
849 }
850 else
851 break; /* We found a match. */
852 }
853 }
854
855 match_ctx_clean (&mctx);
856 }
857
858 #ifdef DEBUG
859 assert (match_last != REG_MISSING);
860 assert (err == REG_NOERROR);
861 #endif
862
863 /* Set pmatch[] if we need. */
864 if (nmatch > 0)
865 {
866 Idx reg_idx;
867
868 /* Initialize registers. */
869 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
870 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
871
872 /* Set the points where matching start/end. */
873 pmatch[0].rm_so = 0;
874 pmatch[0].rm_eo = mctx.match_last;
875 /* FIXME: This function should fail if mctx.match_last exceeds
876 the maximum possible regoff_t value. We need a new error
877 code REG_OVERFLOW. */
878
879 if (!preg->re_no_sub && nmatch > 1)
880 {
881 err = set_regs (preg, &mctx, nmatch, pmatch,
882 dfa->has_plural_match && dfa->nbackref > 0);
883 if (BE (err != REG_NOERROR, 0))
884 goto free_return;
885 }
886
887 /* At last, add the offset to the each registers, since we slided
888 the buffers so that we could assume that the matching starts
889 from 0. */
890 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
891 if (pmatch[reg_idx].rm_so != -1)
892 {
893 #ifdef RE_ENABLE_I18N
894 if (BE (mctx.input.offsets_needed != 0, 0))
895 {
896 pmatch[reg_idx].rm_so =
897 (pmatch[reg_idx].rm_so == mctx.input.valid_len
898 ? mctx.input.valid_raw_len
899 : mctx.input.offsets[pmatch[reg_idx].rm_so]);
900 pmatch[reg_idx].rm_eo =
901 (pmatch[reg_idx].rm_eo == mctx.input.valid_len
902 ? mctx.input.valid_raw_len
903 : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
904 }
905 #else
906 assert (mctx.input.offsets_needed == 0);
907 #endif
908 pmatch[reg_idx].rm_so += match_first;
909 pmatch[reg_idx].rm_eo += match_first;
910 }
911 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
912 {
913 pmatch[nmatch + reg_idx].rm_so = -1;
914 pmatch[nmatch + reg_idx].rm_eo = -1;
915 }
916
917 if (dfa->subexp_map)
918 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
919 if (dfa->subexp_map[reg_idx] != reg_idx)
920 {
921 pmatch[reg_idx + 1].rm_so
922 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
923 pmatch[reg_idx + 1].rm_eo
924 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
925 }
926 }
927
928 free_return:
929 re_free (mctx.state_log);
930 if (dfa->nbackref)
931 match_ctx_free (&mctx);
932 re_string_destruct (&mctx.input);
933 return err;
934 }
935
936 static reg_errcode_t
937 internal_function
prune_impossible_nodes(re_match_context_t * mctx)938 prune_impossible_nodes (re_match_context_t *mctx)
939 {
940 re_dfa_t *const dfa = mctx->dfa;
941 Idx halt_node, match_last;
942 reg_errcode_t ret;
943 re_dfastate_t **sifted_states;
944 re_dfastate_t **lim_states = NULL;
945 re_sift_context_t sctx;
946 #ifdef DEBUG
947 assert (mctx->state_log != NULL);
948 #endif
949 match_last = mctx->match_last;
950 halt_node = mctx->last_node;
951 sifted_states = re_xmalloc (re_dfastate_t *, match_last + 1);
952 if (BE (sifted_states == NULL, 0))
953 {
954 ret = REG_ESPACE;
955 goto free_return;
956 }
957 if (dfa->nbackref)
958 {
959 lim_states = re_xmalloc (re_dfastate_t *, match_last + 1);
960 if (BE (lim_states == NULL, 0))
961 {
962 ret = REG_ESPACE;
963 goto free_return;
964 }
965 while (1)
966 {
967 memset (lim_states, '\0',
968 sizeof (re_dfastate_t *) * (match_last + 1));
969 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
970 match_last);
971 ret = sift_states_backward (mctx, &sctx);
972 re_node_set_free (&sctx.limits);
973 if (BE (ret != REG_NOERROR, 0))
974 goto free_return;
975 if (sifted_states[0] != NULL || lim_states[0] != NULL)
976 break;
977 do
978 {
979 --match_last;
980 if (! REG_VALID_INDEX (match_last))
981 {
982 ret = REG_NOMATCH;
983 goto free_return;
984 }
985 } while (mctx->state_log[match_last] == NULL
986 || !mctx->state_log[match_last]->halt);
987 halt_node = check_halt_state_context (mctx,
988 mctx->state_log[match_last],
989 match_last);
990 }
991 ret = merge_state_array (dfa, sifted_states, lim_states,
992 match_last + 1);
993 re_free (lim_states);
994 lim_states = NULL;
995 if (BE (ret != REG_NOERROR, 0))
996 goto free_return;
997 }
998 else
999 {
1000 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
1001 ret = sift_states_backward (mctx, &sctx);
1002 re_node_set_free (&sctx.limits);
1003 if (BE (ret != REG_NOERROR, 0))
1004 goto free_return;
1005 }
1006 re_free (mctx->state_log);
1007 mctx->state_log = sifted_states;
1008 sifted_states = NULL;
1009 mctx->last_node = halt_node;
1010 mctx->match_last = match_last;
1011 ret = REG_NOERROR;
1012 free_return:
1013 re_free (sifted_states);
1014 re_free (lim_states);
1015 return ret;
1016 }
1017
1018 /* Acquire an initial state and return it.
1019 We must select appropriate initial state depending on the context,
1020 since initial states may have constraints like "\<", "^", etc.. */
1021
1022 static inline re_dfastate_t *
1023 __attribute ((always_inline)) internal_function
acquire_init_state_context(reg_errcode_t * err,const re_match_context_t * mctx,Idx idx)1024 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1025 Idx idx)
1026 {
1027 re_dfa_t *const dfa = mctx->dfa;
1028 if (dfa->init_state->has_constraint)
1029 {
1030 unsigned int context;
1031 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1032 if (IS_WORD_CONTEXT (context))
1033 return dfa->init_state_word;
1034 else if (IS_ORDINARY_CONTEXT (context))
1035 return dfa->init_state;
1036 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1037 return dfa->init_state_begbuf;
1038 else if (IS_NEWLINE_CONTEXT (context))
1039 return dfa->init_state_nl;
1040 else if (IS_BEGBUF_CONTEXT (context))
1041 {
1042 /* It is relatively rare case, then calculate on demand. */
1043 return re_acquire_state_context (err, dfa,
1044 dfa->init_state->entrance_nodes,
1045 context);
1046 }
1047 else
1048 /* Must not happen? */
1049 return dfa->init_state;
1050 }
1051 else
1052 return dfa->init_state;
1053 }
1054
1055 /* Check whether the regular expression match input string INPUT or not,
1056 and return the index where the matching end. Return REG_MISSING if
1057 there is no match, and return REG_ERROR in case of an error.
1058 FL_LONGEST_MATCH means we want the POSIX longest matching.
1059 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1060 next place where we may want to try matching.
1061 Note that the matcher assume that the maching starts from the current
1062 index of the buffer. */
1063
1064 static Idx
1065 internal_function
check_matching(re_match_context_t * mctx,bool fl_longest_match,Idx * p_match_first)1066 check_matching (re_match_context_t *mctx, bool fl_longest_match,
1067 Idx *p_match_first)
1068 {
1069 re_dfa_t *const dfa = mctx->dfa;
1070 reg_errcode_t err;
1071 Idx match = 0;
1072 Idx match_last = REG_MISSING;
1073 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
1074 re_dfastate_t *cur_state;
1075 bool at_init_state = p_match_first != NULL;
1076 Idx next_start_idx = cur_str_idx;
1077
1078 err = REG_NOERROR;
1079 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1080 /* An initial state must not be NULL (invalid). */
1081 if (BE (cur_state == NULL, 0))
1082 {
1083 assert (err == REG_ESPACE);
1084 return REG_ERROR;
1085 }
1086
1087 if (mctx->state_log != NULL)
1088 {
1089 mctx->state_log[cur_str_idx] = cur_state;
1090
1091 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1092 later. E.g. Processing back references. */
1093 if (BE (dfa->nbackref, 0))
1094 {
1095 at_init_state = false;
1096 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1097 if (BE (err != REG_NOERROR, 0))
1098 return err;
1099
1100 if (cur_state->has_backref)
1101 {
1102 err = transit_state_bkref (mctx, &cur_state->nodes);
1103 if (BE (err != REG_NOERROR, 0))
1104 return err;
1105 }
1106 }
1107 }
1108
1109 /* If the RE accepts NULL string. */
1110 if (BE (cur_state->halt, 0))
1111 {
1112 if (!cur_state->has_constraint
1113 || check_halt_state_context (mctx, cur_state, cur_str_idx))
1114 {
1115 if (!fl_longest_match)
1116 return cur_str_idx;
1117 else
1118 {
1119 match_last = cur_str_idx;
1120 match = 1;
1121 }
1122 }
1123 }
1124
1125 while (!re_string_eoi (&mctx->input))
1126 {
1127 re_dfastate_t *old_state = cur_state;
1128 Idx next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1129
1130 if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1131 || (BE (next_char_idx >= mctx->input.valid_len, 0)
1132 && mctx->input.valid_len < mctx->input.len))
1133 {
1134 err = extend_buffers (mctx);
1135 if (BE (err != REG_NOERROR, 0))
1136 {
1137 assert (err == REG_ESPACE);
1138 return REG_ERROR;
1139 }
1140 }
1141
1142 cur_state = transit_state (&err, mctx, cur_state);
1143 if (mctx->state_log != NULL)
1144 cur_state = merge_state_with_log (&err, mctx, cur_state);
1145
1146 if (cur_state == NULL)
1147 {
1148 /* Reached the invalid state or an error. Try to recover a valid
1149 state using the state log, if available and if we have not
1150 already found a valid (even if not the longest) match. */
1151 if (BE (err != REG_NOERROR, 0))
1152 return REG_ERROR;
1153
1154 if (mctx->state_log == NULL
1155 || (match && !fl_longest_match)
1156 || (cur_state = find_recover_state (&err, mctx)) == NULL)
1157 break;
1158 }
1159
1160 if (BE (at_init_state, 0))
1161 {
1162 if (old_state == cur_state)
1163 next_start_idx = next_char_idx;
1164 else
1165 at_init_state = false;
1166 }
1167
1168 if (cur_state->halt)
1169 {
1170 /* Reached a halt state.
1171 Check the halt state can satisfy the current context. */
1172 if (!cur_state->has_constraint
1173 || check_halt_state_context (mctx, cur_state,
1174 re_string_cur_idx (&mctx->input)))
1175 {
1176 /* We found an appropriate halt state. */
1177 match_last = re_string_cur_idx (&mctx->input);
1178 match = 1;
1179
1180 /* We found a match, do not modify match_first below. */
1181 p_match_first = NULL;
1182 if (!fl_longest_match)
1183 break;
1184 }
1185 }
1186 }
1187
1188 if (p_match_first)
1189 *p_match_first += next_start_idx;
1190
1191 return match_last;
1192 }
1193
1194 /* Check NODE match the current context. */
1195
1196 static bool
1197 internal_function
check_halt_node_context(const re_dfa_t * dfa,Idx node,unsigned int context)1198 check_halt_node_context (const re_dfa_t *dfa, Idx node, unsigned int context)
1199 {
1200 re_token_type_t type = dfa->nodes[node].type;
1201 unsigned int constraint = dfa->nodes[node].constraint;
1202 if (type != END_OF_RE)
1203 return false;
1204 if (!constraint)
1205 return true;
1206 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1207 return false;
1208 return true;
1209 }
1210
1211 /* Check the halt state STATE match the current context.
1212 Return 0 if not match, if the node, STATE has, is a halt node and
1213 match the context, return the node. */
1214
1215 static Idx
1216 internal_function
check_halt_state_context(const re_match_context_t * mctx,const re_dfastate_t * state,Idx idx)1217 check_halt_state_context (const re_match_context_t *mctx,
1218 const re_dfastate_t *state, Idx idx)
1219 {
1220 Idx i;
1221 unsigned int context;
1222 #ifdef DEBUG
1223 assert (state->halt);
1224 #endif
1225 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1226 for (i = 0; i < state->nodes.nelem; ++i)
1227 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1228 return state->nodes.elems[i];
1229 return 0;
1230 }
1231
1232 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1233 corresponding to the DFA).
1234 Return the destination node, and update EPS_VIA_NODES;
1235 return REG_MISSING in case of errors. */
1236
1237 static Idx
1238 internal_function
proceed_next_node(const re_match_context_t * mctx,Idx nregs,regmatch_t * regs,Idx * pidx,Idx node,re_node_set * eps_via_nodes,struct re_fail_stack_t * fs)1239 proceed_next_node (const re_match_context_t *mctx,
1240 Idx nregs, regmatch_t *regs, Idx *pidx, Idx node,
1241 re_node_set *eps_via_nodes, struct re_fail_stack_t *fs)
1242 {
1243 re_dfa_t *const dfa = mctx->dfa;
1244 Idx i;
1245 bool ok;
1246 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1247 {
1248 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1249 re_node_set *edests = &dfa->edests[node];
1250 Idx dest_node;
1251 ok = re_node_set_insert (eps_via_nodes, node);
1252 if (BE (! ok, 0))
1253 return REG_ERROR;
1254 /* Pick up a valid destination, or return REG_MISSING if none
1255 is found. */
1256 for (dest_node = REG_MISSING, i = 0; i < edests->nelem; ++i)
1257 {
1258 Idx candidate = edests->elems[i];
1259 if (!re_node_set_contains (cur_nodes, candidate))
1260 continue;
1261 if (dest_node == REG_MISSING)
1262 dest_node = candidate;
1263
1264 else
1265 {
1266 /* In order to avoid infinite loop like "(a*)*", return the second
1267 epsilon-transition if the first was already considered. */
1268 if (re_node_set_contains (eps_via_nodes, dest_node))
1269 return candidate;
1270
1271 /* Otherwise, push the second epsilon-transition on the fail stack. */
1272 else if (fs != NULL
1273 && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1274 eps_via_nodes))
1275 return REG_ERROR;
1276
1277 /* We know we are going to exit. */
1278 break;
1279 }
1280 }
1281 return dest_node;
1282 }
1283 else
1284 {
1285 Idx naccepted = 0;
1286 re_token_type_t type = dfa->nodes[node].type;
1287
1288 #ifdef RE_ENABLE_I18N
1289 if (dfa->nodes[node].accept_mb)
1290 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1291 else
1292 #endif /* RE_ENABLE_I18N */
1293 if (type == OP_BACK_REF)
1294 {
1295 Idx subexp_idx = dfa->nodes[node].opr.idx + 1;
1296 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1297 if (fs != NULL)
1298 {
1299 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1300 return REG_MISSING;
1301 else if (naccepted)
1302 {
1303 char *buf = (char *) re_string_get_buffer (&mctx->input);
1304 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1305 naccepted) != 0)
1306 return REG_MISSING;
1307 }
1308 }
1309
1310 if (naccepted == 0)
1311 {
1312 Idx dest_node;
1313 ok = re_node_set_insert (eps_via_nodes, node);
1314 if (BE (! ok, 0))
1315 return REG_ERROR;
1316 dest_node = dfa->edests[node].elems[0];
1317 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1318 dest_node))
1319 return dest_node;
1320 }
1321 }
1322
1323 if (naccepted != 0
1324 || check_node_accept (mctx, dfa->nodes + node, *pidx))
1325 {
1326 Idx dest_node = dfa->nexts[node];
1327 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1328 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1329 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1330 dest_node)))
1331 return REG_MISSING;
1332 re_node_set_empty (eps_via_nodes);
1333 return dest_node;
1334 }
1335 }
1336 return REG_MISSING;
1337 }
1338
1339 static reg_errcode_t
1340 internal_function
push_fail_stack(struct re_fail_stack_t * fs,Idx str_idx,Idx dest_node,Idx nregs,regmatch_t * regs,re_node_set * eps_via_nodes)1341 push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node,
1342 Idx nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1343 {
1344 reg_errcode_t err;
1345 Idx num = fs->num++;
1346 if (fs->num == fs->alloc)
1347 {
1348 struct re_fail_stack_ent_t *new_array =
1349 re_x2realloc (fs->stack, struct re_fail_stack_ent_t, &fs->alloc);
1350 if (new_array == NULL)
1351 return REG_ESPACE;
1352 fs->stack = new_array;
1353 }
1354 fs->stack[num].idx = str_idx;
1355 fs->stack[num].node = dest_node;
1356 fs->stack[num].regs = re_xmalloc (regmatch_t, nregs);
1357 if (fs->stack[num].regs == NULL)
1358 return REG_ESPACE;
1359 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1360 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1361 return err;
1362 }
1363
1364 static Idx
1365 internal_function
pop_fail_stack(struct re_fail_stack_t * fs,Idx * pidx,Idx nregs,regmatch_t * regs,re_node_set * eps_via_nodes)1366 pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx,
1367 Idx nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1368 {
1369 Idx num = --fs->num;
1370 assert (REG_VALID_INDEX (num));
1371 *pidx = fs->stack[num].idx;
1372 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1373 re_node_set_free (eps_via_nodes);
1374 re_free (fs->stack[num].regs);
1375 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1376 return fs->stack[num].node;
1377 }
1378
1379 /* Set the positions where the subexpressions are starts/ends to registers
1380 PMATCH.
1381 Note: We assume that pmatch[0] is already set, and
1382 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1383
1384 static reg_errcode_t
1385 internal_function
set_regs(const regex_t * preg,const re_match_context_t * mctx,size_t nmatch,regmatch_t * pmatch,bool fl_backtrack)1386 set_regs (const regex_t *preg, const re_match_context_t *mctx,
1387 size_t nmatch, regmatch_t *pmatch, bool fl_backtrack)
1388 {
1389 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
1390 Idx idx, cur_node;
1391 re_node_set eps_via_nodes;
1392 struct re_fail_stack_t *fs;
1393 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1394 regmatch_t *prev_idx_match;
1395 bool prev_idx_match_malloced = false;
1396
1397 #ifdef DEBUG
1398 assert (nmatch > 1);
1399 assert (mctx->state_log != NULL);
1400 #endif
1401 if (fl_backtrack)
1402 {
1403 fs = &fs_body;
1404 fs->stack = re_xmalloc (struct re_fail_stack_ent_t, fs->alloc);
1405 if (fs->stack == NULL)
1406 return REG_ESPACE;
1407 }
1408 else
1409 fs = NULL;
1410
1411 cur_node = dfa->init_node;
1412 re_node_set_init_empty (&eps_via_nodes);
1413
1414 if (re_alloc_oversized (nmatch, sizeof (regmatch_t)))
1415 {
1416 free_fail_stack_return (fs);
1417 return REG_ESPACE;
1418 }
1419 #ifndef __SSP__
1420 if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1421 prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1422 else
1423 #endif
1424 {
1425 prev_idx_match = re_malloc (regmatch_t, nmatch);
1426 if (prev_idx_match == NULL)
1427 {
1428 free_fail_stack_return (fs);
1429 return REG_ESPACE;
1430 }
1431 prev_idx_match_malloced = true;
1432 }
1433 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1434
1435 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1436 {
1437 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1438
1439 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1440 {
1441 Idx reg_idx;
1442 if (fs)
1443 {
1444 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1445 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1446 break;
1447 if (reg_idx == nmatch)
1448 {
1449 re_node_set_free (&eps_via_nodes);
1450 if (prev_idx_match_malloced)
1451 re_free (prev_idx_match);
1452 return free_fail_stack_return (fs);
1453 }
1454 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1455 &eps_via_nodes);
1456 }
1457 else
1458 {
1459 re_node_set_free (&eps_via_nodes);
1460 if (prev_idx_match_malloced)
1461 re_free (prev_idx_match);
1462 return REG_NOERROR;
1463 }
1464 }
1465
1466 /* Proceed to next node. */
1467 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1468 &eps_via_nodes, fs);
1469
1470 if (BE (! REG_VALID_INDEX (cur_node), 0))
1471 {
1472 if (BE (cur_node == REG_ERROR, 0))
1473 {
1474 re_node_set_free (&eps_via_nodes);
1475 if (prev_idx_match_malloced)
1476 re_free (prev_idx_match);
1477 free_fail_stack_return (fs);
1478 return REG_ESPACE;
1479 }
1480 if (fs)
1481 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1482 &eps_via_nodes);
1483 else
1484 {
1485 re_node_set_free (&eps_via_nodes);
1486 if (prev_idx_match_malloced)
1487 re_free (prev_idx_match);
1488 return REG_NOMATCH;
1489 }
1490 }
1491 }
1492 re_node_set_free (&eps_via_nodes);
1493 if (prev_idx_match_malloced)
1494 re_free (prev_idx_match);
1495 return free_fail_stack_return (fs);
1496 }
1497
1498 static reg_errcode_t
1499 internal_function
free_fail_stack_return(struct re_fail_stack_t * fs)1500 free_fail_stack_return (struct re_fail_stack_t *fs)
1501 {
1502 if (fs)
1503 {
1504 Idx fs_idx;
1505 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1506 {
1507 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1508 re_free (fs->stack[fs_idx].regs);
1509 }
1510 re_free (fs->stack);
1511 }
1512 return REG_NOERROR;
1513 }
1514
1515 static void
1516 internal_function
update_regs(re_dfa_t * dfa,regmatch_t * pmatch,regmatch_t * prev_idx_match,Idx cur_node,Idx cur_idx,Idx nmatch)1517 update_regs (re_dfa_t *dfa, regmatch_t *pmatch, regmatch_t *prev_idx_match,
1518 Idx cur_node, Idx cur_idx, Idx nmatch)
1519 {
1520 int type = dfa->nodes[cur_node].type;
1521 if (type == OP_OPEN_SUBEXP)
1522 {
1523 Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1524
1525 /* We are at the first node of this sub expression. */
1526 if (reg_num < nmatch)
1527 {
1528 pmatch[reg_num].rm_so = cur_idx;
1529 pmatch[reg_num].rm_eo = -1;
1530 }
1531 }
1532 else if (type == OP_CLOSE_SUBEXP)
1533 {
1534 Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1535 if (reg_num < nmatch)
1536 {
1537 /* We are at the last node of this sub expression. */
1538 if (pmatch[reg_num].rm_so < cur_idx)
1539 {
1540 pmatch[reg_num].rm_eo = cur_idx;
1541 /* This is a non-empty match or we are not inside an optional
1542 subexpression. Accept this right away. */
1543 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1544 }
1545 else
1546 {
1547 if (dfa->nodes[cur_node].opt_subexp
1548 && prev_idx_match[reg_num].rm_so != -1)
1549 /* We transited through an empty match for an optional
1550 subexpression, like (a?)*, and this is not the subexp's
1551 first match. Copy back the old content of the registers
1552 so that matches of an inner subexpression are undone as
1553 well, like in ((a?))*. */
1554 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1555 else
1556 /* We completed a subexpression, but it may be part of
1557 an optional one, so do not update PREV_IDX_MATCH. */
1558 pmatch[reg_num].rm_eo = cur_idx;
1559 }
1560 }
1561 }
1562 }
1563
1564 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1565 and sift the nodes in each states according to the following rules.
1566 Updated state_log will be wrote to STATE_LOG.
1567
1568 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1569 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1570 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1571 the LAST_NODE, we throw away the node `a'.
1572 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1573 string `s' and transit to `b':
1574 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1575 away the node `a'.
1576 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1577 thrown away, we throw away the node `a'.
1578 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1579 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1580 node `a'.
1581 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1582 we throw away the node `a'. */
1583
1584 #define STATE_NODE_CONTAINS(state,node) \
1585 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1586
1587 static reg_errcode_t
1588 internal_function
sift_states_backward(re_match_context_t * mctx,re_sift_context_t * sctx)1589 sift_states_backward (re_match_context_t *mctx, re_sift_context_t *sctx)
1590 {
1591 reg_errcode_t err;
1592 int null_cnt = 0;
1593 Idx str_idx = sctx->last_str_idx;
1594 re_node_set cur_dest;
1595
1596 #ifdef DEBUG
1597 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1598 #endif
1599
1600 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1601 transit to the last_node and the last_node itself. */
1602 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1603 if (BE (err != REG_NOERROR, 0))
1604 return err;
1605 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1606 if (BE (err != REG_NOERROR, 0))
1607 goto free_return;
1608
1609 /* Then check each states in the state_log. */
1610 while (str_idx > 0)
1611 {
1612 /* Update counters. */
1613 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1614 if (null_cnt > mctx->max_mb_elem_len)
1615 {
1616 memset (sctx->sifted_states, '\0',
1617 sizeof (re_dfastate_t *) * str_idx);
1618 re_node_set_free (&cur_dest);
1619 return REG_NOERROR;
1620 }
1621 re_node_set_empty (&cur_dest);
1622 --str_idx;
1623
1624 if (mctx->state_log[str_idx])
1625 {
1626 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1627 if (BE (err != REG_NOERROR, 0))
1628 goto free_return;
1629 }
1630
1631 /* Add all the nodes which satisfy the following conditions:
1632 - It can epsilon transit to a node in CUR_DEST.
1633 - It is in CUR_SRC.
1634 And update state_log. */
1635 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1636 if (BE (err != REG_NOERROR, 0))
1637 goto free_return;
1638 }
1639 err = REG_NOERROR;
1640 free_return:
1641 re_node_set_free (&cur_dest);
1642 return err;
1643 }
1644
1645 static reg_errcode_t
1646 internal_function
build_sifted_states(re_match_context_t * mctx,re_sift_context_t * sctx,Idx str_idx,re_node_set * cur_dest)1647 build_sifted_states (re_match_context_t *mctx, re_sift_context_t *sctx,
1648 Idx str_idx, re_node_set *cur_dest)
1649 {
1650 re_dfa_t *const dfa = mctx->dfa;
1651 re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1652 Idx i;
1653
1654 /* Then build the next sifted state.
1655 We build the next sifted state on `cur_dest', and update
1656 `sifted_states[str_idx]' with `cur_dest'.
1657 Note:
1658 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1659 `cur_src' points the node_set of the old `state_log[str_idx]'
1660 (with the epsilon nodes pre-filtered out). */
1661 for (i = 0; i < cur_src->nelem; i++)
1662 {
1663 Idx prev_node = cur_src->elems[i];
1664 int naccepted = 0;
1665 bool ok;
1666
1667 #ifdef DEBUG
1668 re_token_type_t type = dfa->nodes[prev_node].type;
1669 assert (!IS_EPSILON_NODE (type));
1670 #endif
1671 #ifdef RE_ENABLE_I18N
1672 /* If the node may accept `multi byte'. */
1673 if (dfa->nodes[prev_node].accept_mb)
1674 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1675 str_idx, sctx->last_str_idx);
1676 #endif /* RE_ENABLE_I18N */
1677
1678 /* We don't check backreferences here.
1679 See update_cur_sifted_state(). */
1680 if (!naccepted
1681 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1682 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1683 dfa->nexts[prev_node]))
1684 naccepted = 1;
1685
1686 if (naccepted == 0)
1687 continue;
1688
1689 if (sctx->limits.nelem)
1690 {
1691 Idx to_idx = str_idx + naccepted;
1692 if (check_dst_limits (mctx, &sctx->limits,
1693 dfa->nexts[prev_node], to_idx,
1694 prev_node, str_idx))
1695 continue;
1696 }
1697 ok = re_node_set_insert (cur_dest, prev_node);
1698 if (BE (! ok, 0))
1699 return REG_ESPACE;
1700 }
1701
1702 return REG_NOERROR;
1703 }
1704
1705 /* Helper functions. */
1706
1707 static reg_errcode_t
1708 internal_function
clean_state_log_if_needed(re_match_context_t * mctx,Idx next_state_log_idx)1709 clean_state_log_if_needed (re_match_context_t *mctx, Idx next_state_log_idx)
1710 {
1711 Idx top = mctx->state_log_top;
1712
1713 if (next_state_log_idx >= mctx->input.bufs_len
1714 || (next_state_log_idx >= mctx->input.valid_len
1715 && mctx->input.valid_len < mctx->input.len))
1716 {
1717 reg_errcode_t err;
1718 err = extend_buffers (mctx);
1719 if (BE (err != REG_NOERROR, 0))
1720 return err;
1721 }
1722
1723 if (top < next_state_log_idx)
1724 {
1725 memset (mctx->state_log + top + 1, '\0',
1726 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1727 mctx->state_log_top = next_state_log_idx;
1728 }
1729 return REG_NOERROR;
1730 }
1731
1732 static reg_errcode_t
1733 internal_function
merge_state_array(re_dfa_t * dfa,re_dfastate_t ** dst,re_dfastate_t ** src,Idx num)1734 merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst, re_dfastate_t **src,
1735 Idx num)
1736 {
1737 Idx st_idx;
1738 reg_errcode_t err;
1739 for (st_idx = 0; st_idx < num; ++st_idx)
1740 {
1741 if (dst[st_idx] == NULL)
1742 dst[st_idx] = src[st_idx];
1743 else if (src[st_idx] != NULL)
1744 {
1745 re_node_set merged_set;
1746 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1747 &src[st_idx]->nodes);
1748 if (BE (err != REG_NOERROR, 0))
1749 return err;
1750 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1751 re_node_set_free (&merged_set);
1752 if (BE (err != REG_NOERROR, 0))
1753 return err;
1754 }
1755 }
1756 return REG_NOERROR;
1757 }
1758
1759 static reg_errcode_t
1760 internal_function
update_cur_sifted_state(re_match_context_t * mctx,re_sift_context_t * sctx,Idx str_idx,re_node_set * dest_nodes)1761 update_cur_sifted_state (re_match_context_t *mctx, re_sift_context_t *sctx,
1762 Idx str_idx, re_node_set *dest_nodes)
1763 {
1764 re_dfa_t *const dfa = mctx->dfa;
1765 reg_errcode_t err;
1766 const re_node_set *candidates;
1767 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1768 : &mctx->state_log[str_idx]->nodes);
1769
1770 if (dest_nodes->nelem == 0)
1771 sctx->sifted_states[str_idx] = NULL;
1772 else
1773 {
1774 if (candidates)
1775 {
1776 /* At first, add the nodes which can epsilon transit to a node in
1777 DEST_NODE. */
1778 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1779 if (BE (err != REG_NOERROR, 0))
1780 return err;
1781
1782 /* Then, check the limitations in the current sift_context. */
1783 if (sctx->limits.nelem)
1784 {
1785 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1786 mctx->bkref_ents, str_idx);
1787 if (BE (err != REG_NOERROR, 0))
1788 return err;
1789 }
1790 }
1791
1792 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1793 if (BE (err != REG_NOERROR, 0))
1794 return err;
1795 }
1796
1797 if (candidates && mctx->state_log[str_idx]->has_backref)
1798 {
1799 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1800 if (BE (err != REG_NOERROR, 0))
1801 return err;
1802 }
1803 return REG_NOERROR;
1804 }
1805
1806 static reg_errcode_t
1807 internal_function
add_epsilon_src_nodes(re_dfa_t * dfa,re_node_set * dest_nodes,const re_node_set * candidates)1808 add_epsilon_src_nodes (re_dfa_t *dfa, re_node_set *dest_nodes,
1809 const re_node_set *candidates)
1810 {
1811 reg_errcode_t err = REG_NOERROR;
1812 Idx i;
1813
1814 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1815 if (BE (err != REG_NOERROR, 0))
1816 return err;
1817
1818 if (!state->inveclosure.alloc)
1819 {
1820 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1821 if (BE (err != REG_NOERROR, 0))
1822 return REG_ESPACE;
1823 for (i = 0; i < dest_nodes->nelem; i++)
1824 re_node_set_merge (&state->inveclosure,
1825 dfa->inveclosures + dest_nodes->elems[i]);
1826 }
1827 return re_node_set_add_intersect (dest_nodes, candidates,
1828 &state->inveclosure);
1829 }
1830
1831 static reg_errcode_t
1832 internal_function
sub_epsilon_src_nodes(re_dfa_t * dfa,Idx node,re_node_set * dest_nodes,const re_node_set * candidates)1833 sub_epsilon_src_nodes (re_dfa_t *dfa, Idx node, re_node_set *dest_nodes,
1834 const re_node_set *candidates)
1835 {
1836 Idx ecl_idx;
1837 reg_errcode_t err;
1838 re_node_set *inv_eclosure = dfa->inveclosures + node;
1839 re_node_set except_nodes;
1840 re_node_set_init_empty (&except_nodes);
1841 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1842 {
1843 Idx cur_node = inv_eclosure->elems[ecl_idx];
1844 if (cur_node == node)
1845 continue;
1846 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1847 {
1848 Idx edst1 = dfa->edests[cur_node].elems[0];
1849 Idx edst2 = ((dfa->edests[cur_node].nelem > 1)
1850 ? dfa->edests[cur_node].elems[1] : REG_MISSING);
1851 if ((!re_node_set_contains (inv_eclosure, edst1)
1852 && re_node_set_contains (dest_nodes, edst1))
1853 || (REG_VALID_NONZERO_INDEX (edst2)
1854 && !re_node_set_contains (inv_eclosure, edst2)
1855 && re_node_set_contains (dest_nodes, edst2)))
1856 {
1857 err = re_node_set_add_intersect (&except_nodes, candidates,
1858 dfa->inveclosures + cur_node);
1859 if (BE (err != REG_NOERROR, 0))
1860 {
1861 re_node_set_free (&except_nodes);
1862 return err;
1863 }
1864 }
1865 }
1866 }
1867 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1868 {
1869 Idx cur_node = inv_eclosure->elems[ecl_idx];
1870 if (!re_node_set_contains (&except_nodes, cur_node))
1871 {
1872 Idx idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1873 re_node_set_remove_at (dest_nodes, idx);
1874 }
1875 }
1876 re_node_set_free (&except_nodes);
1877 return REG_NOERROR;
1878 }
1879
1880 static bool
1881 internal_function
check_dst_limits(const re_match_context_t * mctx,const re_node_set * limits,Idx dst_node,Idx dst_idx,Idx src_node,Idx src_idx)1882 check_dst_limits (const re_match_context_t *mctx, const re_node_set *limits,
1883 Idx dst_node, Idx dst_idx, Idx src_node, Idx src_idx)
1884 {
1885 re_dfa_t *const dfa = mctx->dfa;
1886 Idx lim_idx, src_pos, dst_pos;
1887
1888 Idx dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1889 Idx src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1890 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1891 {
1892 Idx subexp_idx;
1893 struct re_backref_cache_entry *ent;
1894 ent = mctx->bkref_ents + limits->elems[lim_idx];
1895 subexp_idx = dfa->nodes[ent->node].opr.idx;
1896
1897 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1898 subexp_idx, dst_node, dst_idx,
1899 dst_bkref_idx);
1900 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1901 subexp_idx, src_node, src_idx,
1902 src_bkref_idx);
1903
1904 /* In case of:
1905 <src> <dst> ( <subexp> )
1906 ( <subexp> ) <src> <dst>
1907 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1908 if (src_pos == dst_pos)
1909 continue; /* This is unrelated limitation. */
1910 else
1911 return true;
1912 }
1913 return false;
1914 }
1915
1916 static int
1917 internal_function
check_dst_limits_calc_pos_1(const re_match_context_t * mctx,int boundaries,Idx subexp_idx,Idx from_node,Idx bkref_idx)1918 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1919 Idx subexp_idx, Idx from_node, Idx bkref_idx)
1920 {
1921 re_dfa_t *const dfa = mctx->dfa;
1922 re_node_set *eclosures = dfa->eclosures + from_node;
1923 Idx node_idx;
1924
1925 /* Else, we are on the boundary: examine the nodes on the epsilon
1926 closure. */
1927 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1928 {
1929 Idx node = eclosures->elems[node_idx];
1930 switch (dfa->nodes[node].type)
1931 {
1932 case OP_BACK_REF:
1933 if (bkref_idx != REG_MISSING)
1934 {
1935 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1936 do
1937 {
1938 Idx dst;
1939 int cpos;
1940
1941 if (ent->node != node)
1942 continue;
1943
1944 if (subexp_idx < BITSET_WORD_BITS
1945 && !(ent->eps_reachable_subexps_map
1946 & ((bitset_word) 1 << subexp_idx)))
1947 continue;
1948
1949 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1950 OP_CLOSE_SUBEXP cases below. But, if the
1951 destination node is the same node as the source
1952 node, don't recurse because it would cause an
1953 infinite loop: a regex that exhibits this behavior
1954 is ()\1*\1* */
1955 dst = dfa->edests[node].elems[0];
1956 if (dst == from_node)
1957 {
1958 if (boundaries & 1)
1959 return -1;
1960 else /* if (boundaries & 2) */
1961 return 0;
1962 }
1963
1964 cpos =
1965 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1966 dst, bkref_idx);
1967 if (cpos == -1 /* && (boundaries & 1) */)
1968 return -1;
1969 if (cpos == 0 && (boundaries & 2))
1970 return 0;
1971
1972 if (subexp_idx < BITSET_WORD_BITS)
1973 ent->eps_reachable_subexps_map &=
1974 ~ ((bitset_word) 1 << subexp_idx);
1975 }
1976 while (ent++->more);
1977 }
1978 break;
1979
1980 case OP_OPEN_SUBEXP:
1981 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1982 return -1;
1983 break;
1984
1985 case OP_CLOSE_SUBEXP:
1986 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1987 return 0;
1988 break;
1989
1990 default:
1991 break;
1992 }
1993 }
1994
1995 return (boundaries & 2) ? 1 : 0;
1996 }
1997
1998 static int
1999 internal_function
check_dst_limits_calc_pos(const re_match_context_t * mctx,Idx limit,Idx subexp_idx,Idx from_node,Idx str_idx,Idx bkref_idx)2000 check_dst_limits_calc_pos (const re_match_context_t *mctx,
2001 Idx limit, Idx subexp_idx,
2002 Idx from_node, Idx str_idx, Idx bkref_idx)
2003 {
2004 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2005 int boundaries;
2006
2007 /* If we are outside the range of the subexpression, return -1 or 1. */
2008 if (str_idx < lim->subexp_from)
2009 return -1;
2010
2011 if (lim->subexp_to < str_idx)
2012 return 1;
2013
2014 /* If we are within the subexpression, return 0. */
2015 boundaries = (str_idx == lim->subexp_from);
2016 boundaries |= (str_idx == lim->subexp_to) << 1;
2017 if (boundaries == 0)
2018 return 0;
2019
2020 /* Else, examine epsilon closure. */
2021 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2022 from_node, bkref_idx);
2023 }
2024
2025 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2026 which are against limitations from DEST_NODES. */
2027
2028 static reg_errcode_t
2029 internal_function
check_subexp_limits(re_dfa_t * dfa,re_node_set * dest_nodes,const re_node_set * candidates,re_node_set * limits,struct re_backref_cache_entry * bkref_ents,Idx str_idx)2030 check_subexp_limits (re_dfa_t *dfa, re_node_set *dest_nodes,
2031 const re_node_set *candidates, re_node_set *limits,
2032 struct re_backref_cache_entry *bkref_ents, Idx str_idx)
2033 {
2034 reg_errcode_t err;
2035 Idx node_idx, lim_idx;
2036
2037 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2038 {
2039 Idx subexp_idx;
2040 struct re_backref_cache_entry *ent;
2041 ent = bkref_ents + limits->elems[lim_idx];
2042
2043 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2044 continue; /* This is unrelated limitation. */
2045
2046 subexp_idx = dfa->nodes[ent->node].opr.idx;
2047 if (ent->subexp_to == str_idx)
2048 {
2049 Idx ops_node = REG_MISSING;
2050 Idx cls_node = REG_MISSING;
2051 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2052 {
2053 Idx node = dest_nodes->elems[node_idx];
2054 re_token_type_t type = dfa->nodes[node].type;
2055 if (type == OP_OPEN_SUBEXP
2056 && subexp_idx == dfa->nodes[node].opr.idx)
2057 ops_node = node;
2058 else if (type == OP_CLOSE_SUBEXP
2059 && subexp_idx == dfa->nodes[node].opr.idx)
2060 cls_node = node;
2061 }
2062
2063 /* Check the limitation of the open subexpression. */
2064 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2065 if (REG_VALID_INDEX (ops_node))
2066 {
2067 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2068 candidates);
2069 if (BE (err != REG_NOERROR, 0))
2070 return err;
2071 }
2072
2073 /* Check the limitation of the close subexpression. */
2074 if (REG_VALID_INDEX (cls_node))
2075 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2076 {
2077 Idx node = dest_nodes->elems[node_idx];
2078 if (!re_node_set_contains (dfa->inveclosures + node,
2079 cls_node)
2080 && !re_node_set_contains (dfa->eclosures + node,
2081 cls_node))
2082 {
2083 /* It is against this limitation.
2084 Remove it form the current sifted state. */
2085 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2086 candidates);
2087 if (BE (err != REG_NOERROR, 0))
2088 return err;
2089 --node_idx;
2090 }
2091 }
2092 }
2093 else /* (ent->subexp_to != str_idx) */
2094 {
2095 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2096 {
2097 Idx node = dest_nodes->elems[node_idx];
2098 re_token_type_t type = dfa->nodes[node].type;
2099 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2100 {
2101 if (subexp_idx != dfa->nodes[node].opr.idx)
2102 continue;
2103 /* It is against this limitation.
2104 Remove it form the current sifted state. */
2105 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2106 candidates);
2107 if (BE (err != REG_NOERROR, 0))
2108 return err;
2109 }
2110 }
2111 }
2112 }
2113 return REG_NOERROR;
2114 }
2115
2116 static reg_errcode_t
2117 internal_function
sift_states_bkref(re_match_context_t * mctx,re_sift_context_t * sctx,Idx str_idx,const re_node_set * candidates)2118 sift_states_bkref (re_match_context_t *mctx, re_sift_context_t *sctx,
2119 Idx str_idx, const re_node_set *candidates)
2120 {
2121 re_dfa_t *const dfa = mctx->dfa;
2122 reg_errcode_t err;
2123 Idx node_idx, node;
2124 re_sift_context_t local_sctx;
2125 Idx first_idx = search_cur_bkref_entry (mctx, str_idx);
2126
2127 if (first_idx == REG_MISSING)
2128 return REG_NOERROR;
2129
2130 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2131
2132 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2133 {
2134 Idx enabled_idx;
2135 re_token_type_t type;
2136 struct re_backref_cache_entry *entry;
2137 node = candidates->elems[node_idx];
2138 type = dfa->nodes[node].type;
2139 /* Avoid infinite loop for the REs like "()\1+". */
2140 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2141 continue;
2142 if (type != OP_BACK_REF)
2143 continue;
2144
2145 entry = mctx->bkref_ents + first_idx;
2146 enabled_idx = first_idx;
2147 do
2148 {
2149 bool ok;
2150 Idx subexp_len, to_idx, dst_node;
2151 re_dfastate_t *cur_state;
2152
2153 if (entry->node != node)
2154 continue;
2155 subexp_len = entry->subexp_to - entry->subexp_from;
2156 to_idx = str_idx + subexp_len;
2157 dst_node = (subexp_len ? dfa->nexts[node]
2158 : dfa->edests[node].elems[0]);
2159
2160 if (to_idx > sctx->last_str_idx
2161 || sctx->sifted_states[to_idx] == NULL
2162 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2163 || check_dst_limits (mctx, &sctx->limits, node,
2164 str_idx, dst_node, to_idx))
2165 continue;
2166
2167 if (local_sctx.sifted_states == NULL)
2168 {
2169 local_sctx = *sctx;
2170 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2171 if (BE (err != REG_NOERROR, 0))
2172 goto free_return;
2173 }
2174 local_sctx.last_node = node;
2175 local_sctx.last_str_idx = str_idx;
2176 ok = re_node_set_insert (&local_sctx.limits, enabled_idx);
2177 if (BE (! ok, 0))
2178 {
2179 err = REG_ESPACE;
2180 goto free_return;
2181 }
2182 cur_state = local_sctx.sifted_states[str_idx];
2183 err = sift_states_backward (mctx, &local_sctx);
2184 if (BE (err != REG_NOERROR, 0))
2185 goto free_return;
2186 if (sctx->limited_states != NULL)
2187 {
2188 err = merge_state_array (dfa, sctx->limited_states,
2189 local_sctx.sifted_states,
2190 str_idx + 1);
2191 if (BE (err != REG_NOERROR, 0))
2192 goto free_return;
2193 }
2194 local_sctx.sifted_states[str_idx] = cur_state;
2195 re_node_set_remove (&local_sctx.limits, enabled_idx);
2196
2197 /* mctx->bkref_ents may have changed, reload the pointer. */
2198 entry = mctx->bkref_ents + enabled_idx;
2199 }
2200 while (enabled_idx++, entry++->more);
2201 }
2202 err = REG_NOERROR;
2203 free_return:
2204 if (local_sctx.sifted_states != NULL)
2205 {
2206 re_node_set_free (&local_sctx.limits);
2207 }
2208
2209 return err;
2210 }
2211
2212
2213 #ifdef RE_ENABLE_I18N
2214 static int
2215 internal_function
sift_states_iter_mb(const re_match_context_t * mctx,re_sift_context_t * sctx,Idx node_idx,Idx str_idx,Idx max_str_idx)2216 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2217 Idx node_idx, Idx str_idx, Idx max_str_idx)
2218 {
2219 re_dfa_t *const dfa = mctx->dfa;
2220 int naccepted;
2221 /* Check the node can accept `multi byte'. */
2222 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2223 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2224 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2225 dfa->nexts[node_idx]))
2226 /* The node can't accept the `multi byte', or the
2227 destination was already thrown away, then the node
2228 could't accept the current input `multi byte'. */
2229 naccepted = 0;
2230 /* Otherwise, it is sure that the node could accept
2231 `naccepted' bytes input. */
2232 return naccepted;
2233 }
2234 #endif /* RE_ENABLE_I18N */
2235
2236
2237 /* Functions for state transition. */
2238
2239 /* Return the next state to which the current state STATE will transit by
2240 accepting the current input byte, and update STATE_LOG if necessary.
2241 If STATE can accept a multibyte char/collating element/back reference
2242 update the destination of STATE_LOG. */
2243
2244 static re_dfastate_t *
2245 internal_function
transit_state(reg_errcode_t * err,re_match_context_t * mctx,re_dfastate_t * state)2246 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2247 re_dfastate_t *state)
2248 {
2249 re_dfastate_t **trtable;
2250 unsigned char ch;
2251
2252 #ifdef RE_ENABLE_I18N
2253 /* If the current state can accept multibyte. */
2254 if (BE (state->accept_mb, 0))
2255 {
2256 *err = transit_state_mb (mctx, state);
2257 if (BE (*err != REG_NOERROR, 0))
2258 return NULL;
2259 }
2260 #endif /* RE_ENABLE_I18N */
2261
2262 /* Then decide the next state with the single byte. */
2263 #if 0
2264 if (0)
2265 /* don't use transition table */
2266 return transit_state_sb (err, mctx, state);
2267 #endif
2268
2269 /* Use transition table */
2270 ch = re_string_fetch_byte (&mctx->input);
2271 for (;;)
2272 {
2273 trtable = state->trtable;
2274 if (BE (trtable != NULL, 1))
2275 return trtable[ch];
2276
2277 trtable = state->word_trtable;
2278 if (BE (trtable != NULL, 1))
2279 {
2280 unsigned int context;
2281 context
2282 = re_string_context_at (&mctx->input,
2283 re_string_cur_idx (&mctx->input) - 1,
2284 mctx->eflags);
2285 if (IS_WORD_CONTEXT (context))
2286 return trtable[ch + SBC_MAX];
2287 else
2288 return trtable[ch];
2289 }
2290
2291 if (!build_trtable (mctx->dfa, state))
2292 {
2293 *err = REG_ESPACE;
2294 return NULL;
2295 }
2296
2297 /* Retry, we now have a transition table. */
2298 }
2299 }
2300
2301 /* Update the state_log if we need */
2302 re_dfastate_t *
2303 internal_function
merge_state_with_log(reg_errcode_t * err,re_match_context_t * mctx,re_dfastate_t * next_state)2304 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2305 re_dfastate_t *next_state)
2306 {
2307 re_dfa_t *const dfa = mctx->dfa;
2308 Idx cur_idx = re_string_cur_idx (&mctx->input);
2309
2310 if (cur_idx > mctx->state_log_top)
2311 {
2312 mctx->state_log[cur_idx] = next_state;
2313 mctx->state_log_top = cur_idx;
2314 }
2315 else if (mctx->state_log[cur_idx] == 0)
2316 {
2317 mctx->state_log[cur_idx] = next_state;
2318 }
2319 else
2320 {
2321 re_dfastate_t *pstate;
2322 unsigned int context;
2323 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2324 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2325 the destination of a multibyte char/collating element/
2326 back reference. Then the next state is the union set of
2327 these destinations and the results of the transition table. */
2328 pstate = mctx->state_log[cur_idx];
2329 log_nodes = pstate->entrance_nodes;
2330 if (next_state != NULL)
2331 {
2332 table_nodes = next_state->entrance_nodes;
2333 *err = re_node_set_init_union (&next_nodes, table_nodes,
2334 log_nodes);
2335 if (BE (*err != REG_NOERROR, 0))
2336 return NULL;
2337 }
2338 else
2339 next_nodes = *log_nodes;
2340 /* Note: We already add the nodes of the initial state,
2341 then we don't need to add them here. */
2342
2343 context = re_string_context_at (&mctx->input,
2344 re_string_cur_idx (&mctx->input) - 1,
2345 mctx->eflags);
2346 next_state = mctx->state_log[cur_idx]
2347 = re_acquire_state_context (err, dfa, &next_nodes, context);
2348 /* We don't need to check errors here, since the return value of
2349 this function is next_state and ERR is already set. */
2350
2351 if (table_nodes != NULL)
2352 re_node_set_free (&next_nodes);
2353 }
2354
2355 if (BE (dfa->nbackref, 0) && next_state != NULL)
2356 {
2357 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2358 later. We must check them here, since the back references in the
2359 next state might use them. */
2360 *err = check_subexp_matching_top (mctx, &next_state->nodes,
2361 cur_idx);
2362 if (BE (*err != REG_NOERROR, 0))
2363 return NULL;
2364
2365 /* If the next state has back references. */
2366 if (next_state->has_backref)
2367 {
2368 *err = transit_state_bkref (mctx, &next_state->nodes);
2369 if (BE (*err != REG_NOERROR, 0))
2370 return NULL;
2371 next_state = mctx->state_log[cur_idx];
2372 }
2373 }
2374
2375 return next_state;
2376 }
2377
2378 /* Skip bytes in the input that correspond to part of a
2379 multi-byte match, then look in the log for a state
2380 from which to restart matching. */
2381 static re_dfastate_t *
2382 internal_function
find_recover_state(reg_errcode_t * err,re_match_context_t * mctx)2383 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2384 {
2385 re_dfastate_t *cur_state = NULL;
2386 do
2387 {
2388 Idx max = mctx->state_log_top;
2389 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2390
2391 do
2392 {
2393 if (++cur_str_idx > max)
2394 return NULL;
2395 re_string_skip_bytes (&mctx->input, 1);
2396 }
2397 while (mctx->state_log[cur_str_idx] == NULL);
2398
2399 cur_state = merge_state_with_log (err, mctx, NULL);
2400 }
2401 while (*err == REG_NOERROR && cur_state == NULL);
2402 return cur_state;
2403 }
2404
2405 /* Helper functions for transit_state. */
2406
2407 /* From the node set CUR_NODES, pick up the nodes whose types are
2408 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2409 expression. And register them to use them later for evaluating the
2410 correspoding back references. */
2411
2412 static reg_errcode_t
2413 internal_function
check_subexp_matching_top(re_match_context_t * mctx,re_node_set * cur_nodes,Idx str_idx)2414 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2415 Idx str_idx)
2416 {
2417 re_dfa_t *const dfa = mctx->dfa;
2418 Idx node_idx;
2419 reg_errcode_t err;
2420
2421 /* TODO: This isn't efficient.
2422 Because there might be more than one nodes whose types are
2423 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2424 nodes.
2425 E.g. RE: (a){2} */
2426 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2427 {
2428 Idx node = cur_nodes->elems[node_idx];
2429 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2430 && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2431 && (dfa->used_bkref_map
2432 & ((bitset_word) 1 << dfa->nodes[node].opr.idx)))
2433 {
2434 err = match_ctx_add_subtop (mctx, node, str_idx);
2435 if (BE (err != REG_NOERROR, 0))
2436 return err;
2437 }
2438 }
2439 return REG_NOERROR;
2440 }
2441
2442 #if 0
2443 /* Return the next state to which the current state STATE will transit by
2444 accepting the current input byte. */
2445
2446 static re_dfastate_t *
2447 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2448 re_dfastate_t *state)
2449 {
2450 re_dfa_t *const dfa = mctx->dfa;
2451 re_node_set next_nodes;
2452 re_dfastate_t *next_state;
2453 Idx node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2454 unsigned int context;
2455
2456 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2457 if (BE (*err != REG_NOERROR, 0))
2458 return NULL;
2459 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2460 {
2461 Idx cur_node = state->nodes.elems[node_cnt];
2462 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2463 {
2464 *err = re_node_set_merge (&next_nodes,
2465 dfa->eclosures + dfa->nexts[cur_node]);
2466 if (BE (*err != REG_NOERROR, 0))
2467 {
2468 re_node_set_free (&next_nodes);
2469 return NULL;
2470 }
2471 }
2472 }
2473 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2474 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2475 /* We don't need to check errors here, since the return value of
2476 this function is next_state and ERR is already set. */
2477
2478 re_node_set_free (&next_nodes);
2479 re_string_skip_bytes (&mctx->input, 1);
2480 return next_state;
2481 }
2482 #endif
2483
2484 #ifdef RE_ENABLE_I18N
2485 static reg_errcode_t
2486 internal_function
transit_state_mb(re_match_context_t * mctx,re_dfastate_t * pstate)2487 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2488 {
2489 re_dfa_t *const dfa = mctx->dfa;
2490 reg_errcode_t err;
2491 Idx i;
2492
2493 for (i = 0; i < pstate->nodes.nelem; ++i)
2494 {
2495 re_node_set dest_nodes, *new_nodes;
2496 Idx cur_node_idx = pstate->nodes.elems[i];
2497 int naccepted;
2498 Idx dest_idx;
2499 unsigned int context;
2500 re_dfastate_t *dest_state;
2501
2502 if (!dfa->nodes[cur_node_idx].accept_mb)
2503 continue;
2504
2505 if (dfa->nodes[cur_node_idx].constraint)
2506 {
2507 context = re_string_context_at (&mctx->input,
2508 re_string_cur_idx (&mctx->input),
2509 mctx->eflags);
2510 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2511 context))
2512 continue;
2513 }
2514
2515 /* How many bytes the node can accept? */
2516 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2517 re_string_cur_idx (&mctx->input));
2518 if (naccepted == 0)
2519 continue;
2520
2521 /* The node can accepts `naccepted' bytes. */
2522 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2523 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2524 : mctx->max_mb_elem_len);
2525 err = clean_state_log_if_needed (mctx, dest_idx);
2526 if (BE (err != REG_NOERROR, 0))
2527 return err;
2528 #ifdef DEBUG
2529 assert (dfa->nexts[cur_node_idx] != REG_MISSING);
2530 #endif
2531 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2532
2533 dest_state = mctx->state_log[dest_idx];
2534 if (dest_state == NULL)
2535 dest_nodes = *new_nodes;
2536 else
2537 {
2538 err = re_node_set_init_union (&dest_nodes,
2539 dest_state->entrance_nodes, new_nodes);
2540 if (BE (err != REG_NOERROR, 0))
2541 return err;
2542 }
2543 context = re_string_context_at (&mctx->input, dest_idx - 1, mctx->eflags);
2544 mctx->state_log[dest_idx]
2545 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2546 if (dest_state != NULL)
2547 re_node_set_free (&dest_nodes);
2548 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2549 return err;
2550 }
2551 return REG_NOERROR;
2552 }
2553 #endif /* RE_ENABLE_I18N */
2554
2555 static reg_errcode_t
2556 internal_function
transit_state_bkref(re_match_context_t * mctx,const re_node_set * nodes)2557 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2558 {
2559 re_dfa_t *const dfa = mctx->dfa;
2560 reg_errcode_t err;
2561 Idx i;
2562 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2563
2564 for (i = 0; i < nodes->nelem; ++i)
2565 {
2566 Idx dest_str_idx, prev_nelem, bkc_idx;
2567 Idx node_idx = nodes->elems[i];
2568 unsigned int context;
2569 const re_token_t *node = dfa->nodes + node_idx;
2570 re_node_set *new_dest_nodes;
2571
2572 /* Check whether `node' is a backreference or not. */
2573 if (node->type != OP_BACK_REF)
2574 continue;
2575
2576 if (node->constraint)
2577 {
2578 context = re_string_context_at (&mctx->input, cur_str_idx,
2579 mctx->eflags);
2580 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2581 continue;
2582 }
2583
2584 /* `node' is a backreference.
2585 Check the substring which the substring matched. */
2586 bkc_idx = mctx->nbkref_ents;
2587 err = get_subexp (mctx, node_idx, cur_str_idx);
2588 if (BE (err != REG_NOERROR, 0))
2589 goto free_return;
2590
2591 /* And add the epsilon closures (which is `new_dest_nodes') of
2592 the backreference to appropriate state_log. */
2593 #ifdef DEBUG
2594 assert (dfa->nexts[node_idx] != REG_MISSING);
2595 #endif
2596 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2597 {
2598 Idx subexp_len;
2599 re_dfastate_t *dest_state;
2600 struct re_backref_cache_entry *bkref_ent;
2601 bkref_ent = mctx->bkref_ents + bkc_idx;
2602 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2603 continue;
2604 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2605 new_dest_nodes = (subexp_len == 0
2606 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2607 : dfa->eclosures + dfa->nexts[node_idx]);
2608 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2609 - bkref_ent->subexp_from);
2610 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2611 mctx->eflags);
2612 dest_state = mctx->state_log[dest_str_idx];
2613 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2614 : mctx->state_log[cur_str_idx]->nodes.nelem);
2615 /* Add `new_dest_node' to state_log. */
2616 if (dest_state == NULL)
2617 {
2618 mctx->state_log[dest_str_idx]
2619 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2620 context);
2621 if (BE (mctx->state_log[dest_str_idx] == NULL
2622 && err != REG_NOERROR, 0))
2623 goto free_return;
2624 }
2625 else
2626 {
2627 re_node_set dest_nodes;
2628 err = re_node_set_init_union (&dest_nodes,
2629 dest_state->entrance_nodes,
2630 new_dest_nodes);
2631 if (BE (err != REG_NOERROR, 0))
2632 {
2633 re_node_set_free (&dest_nodes);
2634 goto free_return;
2635 }
2636 mctx->state_log[dest_str_idx]
2637 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2638 re_node_set_free (&dest_nodes);
2639 if (BE (mctx->state_log[dest_str_idx] == NULL
2640 && err != REG_NOERROR, 0))
2641 goto free_return;
2642 }
2643 /* We need to check recursively if the backreference can epsilon
2644 transit. */
2645 if (subexp_len == 0
2646 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2647 {
2648 err = check_subexp_matching_top (mctx, new_dest_nodes,
2649 cur_str_idx);
2650 if (BE (err != REG_NOERROR, 0))
2651 goto free_return;
2652 err = transit_state_bkref (mctx, new_dest_nodes);
2653 if (BE (err != REG_NOERROR, 0))
2654 goto free_return;
2655 }
2656 }
2657 }
2658 err = REG_NOERROR;
2659 free_return:
2660 return err;
2661 }
2662
2663 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2664 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2665 Note that we might collect inappropriate candidates here.
2666 However, the cost of checking them strictly here is too high, then we
2667 delay these checking for prune_impossible_nodes(). */
2668
2669 static reg_errcode_t
2670 internal_function
get_subexp(re_match_context_t * mctx,Idx bkref_node,Idx bkref_str_idx)2671 get_subexp (re_match_context_t *mctx, Idx bkref_node, Idx bkref_str_idx)
2672 {
2673 re_dfa_t *const dfa = mctx->dfa;
2674 Idx subexp_num, sub_top_idx;
2675 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2676 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2677 Idx cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2678 if (cache_idx != REG_MISSING)
2679 {
2680 const struct re_backref_cache_entry *entry = mctx->bkref_ents + cache_idx;
2681 do
2682 if (entry->node == bkref_node)
2683 return REG_NOERROR; /* We already checked it. */
2684 while (entry++->more);
2685 }
2686
2687 subexp_num = dfa->nodes[bkref_node].opr.idx;
2688
2689 /* For each sub expression */
2690 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2691 {
2692 reg_errcode_t err;
2693 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2694 re_sub_match_last_t *sub_last;
2695 Idx sub_last_idx, sl_str, bkref_str_off;
2696
2697 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2698 continue; /* It isn't related. */
2699
2700 sl_str = sub_top->str_idx;
2701 bkref_str_off = bkref_str_idx;
2702 /* At first, check the last node of sub expressions we already
2703 evaluated. */
2704 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2705 {
2706 regoff_t sl_str_diff;
2707 sub_last = sub_top->lasts[sub_last_idx];
2708 sl_str_diff = sub_last->str_idx - sl_str;
2709 /* The matched string by the sub expression match with the substring
2710 at the back reference? */
2711 if (sl_str_diff > 0)
2712 {
2713 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2714 {
2715 /* Not enough chars for a successful match. */
2716 if (bkref_str_off + sl_str_diff > mctx->input.len)
2717 break;
2718
2719 err = clean_state_log_if_needed (mctx,
2720 bkref_str_off
2721 + sl_str_diff);
2722 if (BE (err != REG_NOERROR, 0))
2723 return err;
2724 buf = (const char *) re_string_get_buffer (&mctx->input);
2725 }
2726 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2727 break; /* We don't need to search this sub expression any more. */
2728 }
2729 bkref_str_off += sl_str_diff;
2730 sl_str += sl_str_diff;
2731 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2732 bkref_str_idx);
2733
2734 /* Reload buf, since the preceding call might have reallocated
2735 the buffer. */
2736 buf = (const char *) re_string_get_buffer (&mctx->input);
2737
2738 if (err == REG_NOMATCH)
2739 continue;
2740 if (BE (err != REG_NOERROR, 0))
2741 return err;
2742 }
2743
2744 if (sub_last_idx < sub_top->nlasts)
2745 continue;
2746 if (sub_last_idx > 0)
2747 ++sl_str;
2748 /* Then, search for the other last nodes of the sub expression. */
2749 for (; sl_str <= bkref_str_idx; ++sl_str)
2750 {
2751 Idx cls_node;
2752 regoff_t sl_str_off;
2753 const re_node_set *nodes;
2754 sl_str_off = sl_str - sub_top->str_idx;
2755 /* The matched string by the sub expression match with the substring
2756 at the back reference? */
2757 if (sl_str_off > 0)
2758 {
2759 if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2760 {
2761 /* If we are at the end of the input, we cannot match. */
2762 if (bkref_str_off >= mctx->input.len)
2763 break;
2764
2765 err = extend_buffers (mctx);
2766 if (BE (err != REG_NOERROR, 0))
2767 return err;
2768
2769 buf = (const char *) re_string_get_buffer (&mctx->input);
2770 }
2771 if (buf [bkref_str_off++] != buf[sl_str - 1])
2772 break; /* We don't need to search this sub expression
2773 any more. */
2774 }
2775 if (mctx->state_log[sl_str] == NULL)
2776 continue;
2777 /* Does this state have a ')' of the sub expression? */
2778 nodes = &mctx->state_log[sl_str]->nodes;
2779 cls_node = find_subexp_node (dfa, nodes, subexp_num, OP_CLOSE_SUBEXP);
2780 if (cls_node == REG_MISSING)
2781 continue; /* No. */
2782 if (sub_top->path == NULL)
2783 {
2784 sub_top->path = re_calloc (state_array_t,
2785 sl_str - sub_top->str_idx + 1);
2786 if (sub_top->path == NULL)
2787 return REG_ESPACE;
2788 }
2789 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2790 in the current context? */
2791 err = check_arrival (mctx, sub_top->path, sub_top->node,
2792 sub_top->str_idx, cls_node, sl_str, OP_CLOSE_SUBEXP);
2793 if (err == REG_NOMATCH)
2794 continue;
2795 if (BE (err != REG_NOERROR, 0))
2796 return err;
2797 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2798 if (BE (sub_last == NULL, 0))
2799 return REG_ESPACE;
2800 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2801 bkref_str_idx);
2802 if (err == REG_NOMATCH)
2803 continue;
2804 }
2805 }
2806 return REG_NOERROR;
2807 }
2808
2809 /* Helper functions for get_subexp(). */
2810
2811 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2812 If it can arrive, register the sub expression expressed with SUB_TOP
2813 and SUB_LAST. */
2814
2815 static reg_errcode_t
2816 internal_function
get_subexp_sub(re_match_context_t * mctx,const re_sub_match_top_t * sub_top,re_sub_match_last_t * sub_last,Idx bkref_node,Idx bkref_str)2817 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2818 re_sub_match_last_t *sub_last, Idx bkref_node, Idx bkref_str)
2819 {
2820 reg_errcode_t err;
2821 Idx to_idx;
2822 /* Can the subexpression arrive the back reference? */
2823 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2824 sub_last->str_idx, bkref_node, bkref_str, OP_OPEN_SUBEXP);
2825 if (err != REG_NOERROR)
2826 return err;
2827 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2828 sub_last->str_idx);
2829 if (BE (err != REG_NOERROR, 0))
2830 return err;
2831 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2832 return clean_state_log_if_needed (mctx, to_idx);
2833 }
2834
2835 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2836 Search '(' if FL_OPEN, or search ')' otherwise.
2837 TODO: This function isn't efficient...
2838 Because there might be more than one nodes whose types are
2839 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2840 nodes.
2841 E.g. RE: (a){2} */
2842
2843 static Idx
2844 internal_function
find_subexp_node(const re_dfa_t * dfa,const re_node_set * nodes,Idx subexp_idx,int type)2845 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2846 Idx subexp_idx, int type)
2847 {
2848 Idx cls_idx;
2849 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2850 {
2851 Idx cls_node = nodes->elems[cls_idx];
2852 const re_token_t *node = dfa->nodes + cls_node;
2853 if (node->type == type
2854 && node->opr.idx == subexp_idx)
2855 return cls_node;
2856 }
2857 return REG_MISSING;
2858 }
2859
2860 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2861 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2862 heavily reused.
2863 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2864
2865 static reg_errcode_t
2866 internal_function
check_arrival(re_match_context_t * mctx,state_array_t * path,Idx top_node,Idx top_str,Idx last_node,Idx last_str,int type)2867 check_arrival (re_match_context_t *mctx, state_array_t *path,
2868 Idx top_node, Idx top_str, Idx last_node, Idx last_str,
2869 int type)
2870 {
2871 re_dfa_t *const dfa = mctx->dfa;
2872 reg_errcode_t err;
2873 Idx subexp_num, backup_cur_idx, str_idx, null_cnt;
2874 re_dfastate_t *cur_state = NULL;
2875 re_node_set *cur_nodes, next_nodes;
2876 re_dfastate_t **backup_state_log;
2877 unsigned int context;
2878
2879 subexp_num = dfa->nodes[top_node].opr.idx;
2880 /* Extend the buffer if we need. */
2881 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2882 {
2883 re_dfastate_t **new_array;
2884 Idx old_alloc = path->alloc;
2885 Idx new_alloc = old_alloc + last_str + mctx->max_mb_elem_len + 1;
2886 if (BE (new_alloc < old_alloc, 0))
2887 return REG_ESPACE;
2888 new_array = re_xrealloc (path->array, re_dfastate_t *, new_alloc);
2889 if (BE (new_array == NULL, 0))
2890 return REG_ESPACE;
2891 path->array = new_array;
2892 path->alloc = new_alloc;
2893 memset (new_array + old_alloc, '\0',
2894 sizeof (re_dfastate_t *) * (new_alloc - old_alloc));
2895 }
2896
2897 str_idx = path->next_idx == 0 ? top_str : path->next_idx;
2898
2899 /* Temporary modify MCTX. */
2900 backup_state_log = mctx->state_log;
2901 backup_cur_idx = mctx->input.cur_idx;
2902 mctx->state_log = path->array;
2903 mctx->input.cur_idx = str_idx;
2904
2905 /* Setup initial node set. */
2906 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2907 if (str_idx == top_str)
2908 {
2909 err = re_node_set_init_1 (&next_nodes, top_node);
2910 if (BE (err != REG_NOERROR, 0))
2911 return err;
2912 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2913 if (BE (err != REG_NOERROR, 0))
2914 {
2915 re_node_set_free (&next_nodes);
2916 return err;
2917 }
2918 }
2919 else
2920 {
2921 cur_state = mctx->state_log[str_idx];
2922 if (cur_state && cur_state->has_backref)
2923 {
2924 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2925 if (BE ( err != REG_NOERROR, 0))
2926 return err;
2927 }
2928 else
2929 re_node_set_init_empty (&next_nodes);
2930 }
2931 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2932 {
2933 if (next_nodes.nelem)
2934 {
2935 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2936 subexp_num, type);
2937 if (BE ( err != REG_NOERROR, 0))
2938 {
2939 re_node_set_free (&next_nodes);
2940 return err;
2941 }
2942 }
2943 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2944 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2945 {
2946 re_node_set_free (&next_nodes);
2947 return err;
2948 }
2949 mctx->state_log[str_idx] = cur_state;
2950 }
2951
2952 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2953 {
2954 re_node_set_empty (&next_nodes);
2955 if (mctx->state_log[str_idx + 1])
2956 {
2957 err = re_node_set_merge (&next_nodes,
2958 &mctx->state_log[str_idx + 1]->nodes);
2959 if (BE (err != REG_NOERROR, 0))
2960 {
2961 re_node_set_free (&next_nodes);
2962 return err;
2963 }
2964 }
2965 if (cur_state)
2966 {
2967 err = check_arrival_add_next_nodes (mctx, str_idx,
2968 &cur_state->non_eps_nodes, &next_nodes);
2969 if (BE (err != REG_NOERROR, 0))
2970 {
2971 re_node_set_free (&next_nodes);
2972 return err;
2973 }
2974 }
2975 ++str_idx;
2976 if (next_nodes.nelem)
2977 {
2978 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2979 if (BE (err != REG_NOERROR, 0))
2980 {
2981 re_node_set_free (&next_nodes);
2982 return err;
2983 }
2984 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2985 subexp_num, type);
2986 if (BE ( err != REG_NOERROR, 0))
2987 {
2988 re_node_set_free (&next_nodes);
2989 return err;
2990 }
2991 }
2992 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2993 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2994 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2995 {
2996 re_node_set_free (&next_nodes);
2997 return err;
2998 }
2999 mctx->state_log[str_idx] = cur_state;
3000 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3001 }
3002 re_node_set_free (&next_nodes);
3003 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3004 : &mctx->state_log[last_str]->nodes);
3005 path->next_idx = str_idx;
3006
3007 /* Fix MCTX. */
3008 mctx->state_log = backup_state_log;
3009 mctx->input.cur_idx = backup_cur_idx;
3010
3011 /* Then check the current node set has the node LAST_NODE. */
3012 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3013 return REG_NOERROR;
3014
3015 return REG_NOMATCH;
3016 }
3017
3018 /* Helper functions for check_arrival. */
3019
3020 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3021 to NEXT_NODES.
3022 TODO: This function is similar to the functions transit_state*(),
3023 however this function has many additional works.
3024 Can't we unify them? */
3025
3026 static reg_errcode_t
3027 internal_function
check_arrival_add_next_nodes(re_match_context_t * mctx,Idx str_idx,re_node_set * cur_nodes,re_node_set * next_nodes)3028 check_arrival_add_next_nodes (re_match_context_t *mctx, Idx str_idx,
3029 re_node_set *cur_nodes,
3030 re_node_set *next_nodes)
3031 {
3032 re_dfa_t *const dfa = mctx->dfa;
3033 bool ok;
3034 Idx cur_idx;
3035 reg_errcode_t err;
3036 re_node_set union_set;
3037 re_node_set_init_empty (&union_set);
3038 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3039 {
3040 int naccepted = 0;
3041 Idx cur_node = cur_nodes->elems[cur_idx];
3042 #ifdef DEBUG
3043 re_token_type_t type = dfa->nodes[cur_node].type;
3044 assert (!IS_EPSILON_NODE (type));
3045 #endif
3046 #ifdef RE_ENABLE_I18N
3047 /* If the node may accept `multi byte'. */
3048 if (dfa->nodes[cur_node].accept_mb)
3049 {
3050 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3051 str_idx);
3052 if (naccepted > 1)
3053 {
3054 re_dfastate_t *dest_state;
3055 Idx next_node = dfa->nexts[cur_node];
3056 Idx next_idx = str_idx + naccepted;
3057 dest_state = mctx->state_log[next_idx];
3058 re_node_set_empty (&union_set);
3059 if (dest_state)
3060 {
3061 err = re_node_set_merge (&union_set, &dest_state->nodes);
3062 if (BE (err != REG_NOERROR, 0))
3063 {
3064 re_node_set_free (&union_set);
3065 return err;
3066 }
3067 }
3068 ok = re_node_set_insert (&union_set, next_node);
3069 if (BE (! ok, 0))
3070 {
3071 re_node_set_free (&union_set);
3072 return REG_ESPACE;
3073 }
3074 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3075 &union_set);
3076 if (BE (mctx->state_log[next_idx] == NULL
3077 && err != REG_NOERROR, 0))
3078 {
3079 re_node_set_free (&union_set);
3080 return err;
3081 }
3082 }
3083 }
3084 #endif /* RE_ENABLE_I18N */
3085 if (naccepted
3086 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3087 {
3088 ok = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3089 if (BE (! ok, 0))
3090 {
3091 re_node_set_free (&union_set);
3092 return REG_ESPACE;
3093 }
3094 }
3095 }
3096 re_node_set_free (&union_set);
3097 return REG_NOERROR;
3098 }
3099
3100 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3101 CUR_NODES, however exclude the nodes which are:
3102 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3103 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3104 */
3105
3106 static reg_errcode_t
3107 internal_function
check_arrival_expand_ecl(re_dfa_t * dfa,re_node_set * cur_nodes,Idx ex_subexp,int type)3108 check_arrival_expand_ecl (re_dfa_t *dfa, re_node_set *cur_nodes,
3109 Idx ex_subexp, int type)
3110 {
3111 reg_errcode_t err;
3112 Idx idx, outside_node;
3113 re_node_set new_nodes;
3114 #ifdef DEBUG
3115 assert (cur_nodes->nelem);
3116 #endif
3117 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3118 if (BE (err != REG_NOERROR, 0))
3119 return err;
3120 /* Create a new node set NEW_NODES with the nodes which are epsilon
3121 closures of the node in CUR_NODES. */
3122
3123 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3124 {
3125 Idx cur_node = cur_nodes->elems[idx];
3126 re_node_set *eclosure = dfa->eclosures + cur_node;
3127 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3128 if (outside_node == REG_MISSING)
3129 {
3130 /* There are no problematic nodes, just merge them. */
3131 err = re_node_set_merge (&new_nodes, eclosure);
3132 if (BE (err != REG_NOERROR, 0))
3133 {
3134 re_node_set_free (&new_nodes);
3135 return err;
3136 }
3137 }
3138 else
3139 {
3140 /* There are problematic nodes, re-calculate incrementally. */
3141 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3142 ex_subexp, type);
3143 if (BE (err != REG_NOERROR, 0))
3144 {
3145 re_node_set_free (&new_nodes);
3146 return err;
3147 }
3148 }
3149 }
3150 re_node_set_free (cur_nodes);
3151 *cur_nodes = new_nodes;
3152 return REG_NOERROR;
3153 }
3154
3155 /* Helper function for check_arrival_expand_ecl.
3156 Check incrementally the epsilon closure of TARGET, and if it isn't
3157 problematic append it to DST_NODES. */
3158
3159 static reg_errcode_t
3160 internal_function
check_arrival_expand_ecl_sub(re_dfa_t * dfa,re_node_set * dst_nodes,Idx target,Idx ex_subexp,int type)3161 check_arrival_expand_ecl_sub (re_dfa_t *dfa, re_node_set *dst_nodes,
3162 Idx target, Idx ex_subexp, int type)
3163 {
3164 Idx cur_node;
3165 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3166 {
3167 bool ok;
3168
3169 if (dfa->nodes[cur_node].type == type
3170 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3171 {
3172 if (type == OP_CLOSE_SUBEXP)
3173 {
3174 ok = re_node_set_insert (dst_nodes, cur_node);
3175 if (BE (! ok, 0))
3176 return REG_ESPACE;
3177 }
3178 break;
3179 }
3180 ok = re_node_set_insert (dst_nodes, cur_node);
3181 if (BE (! ok, 0))
3182 return REG_ESPACE;
3183 if (dfa->edests[cur_node].nelem == 0)
3184 break;
3185 if (dfa->edests[cur_node].nelem == 2)
3186 {
3187 reg_errcode_t ret =
3188 check_arrival_expand_ecl_sub (dfa, dst_nodes,
3189 dfa->edests[cur_node].elems[1],
3190 ex_subexp, type);
3191 if (BE (ret != REG_NOERROR, 0))
3192 return ret;
3193 }
3194 cur_node = dfa->edests[cur_node].elems[0];
3195 }
3196 return REG_NOERROR;
3197 }
3198
3199
3200 /* For all the back references in the current state, calculate the
3201 destination of the back references by the appropriate entry
3202 in MCTX->BKREF_ENTS. */
3203
3204 static reg_errcode_t
3205 internal_function
expand_bkref_cache(re_match_context_t * mctx,re_node_set * cur_nodes,Idx cur_str,Idx subexp_num,int type)3206 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3207 Idx cur_str, Idx subexp_num, int type)
3208 {
3209 re_dfa_t *const dfa = mctx->dfa;
3210 reg_errcode_t err;
3211 Idx cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3212 struct re_backref_cache_entry *ent;
3213
3214 if (cache_idx_start == REG_MISSING)
3215 return REG_NOERROR;
3216
3217 restart:
3218 ent = mctx->bkref_ents + cache_idx_start;
3219 do
3220 {
3221 Idx to_idx, next_node;
3222
3223 /* Is this entry ENT is appropriate? */
3224 if (!re_node_set_contains (cur_nodes, ent->node))
3225 continue; /* No. */
3226
3227 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3228 /* Calculate the destination of the back reference, and append it
3229 to MCTX->STATE_LOG. */
3230 if (to_idx == cur_str)
3231 {
3232 /* The backreference did epsilon transit, we must re-check all the
3233 node in the current state. */
3234 re_node_set new_dests;
3235 reg_errcode_t err2, err3;
3236 next_node = dfa->edests[ent->node].elems[0];
3237 if (re_node_set_contains (cur_nodes, next_node))
3238 continue;
3239 err = re_node_set_init_1 (&new_dests, next_node);
3240 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3241 err3 = re_node_set_merge (cur_nodes, &new_dests);
3242 re_node_set_free (&new_dests);
3243 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3244 || err3 != REG_NOERROR, 0))
3245 {
3246 err = (err != REG_NOERROR ? err
3247 : (err2 != REG_NOERROR ? err2 : err3));
3248 return err;
3249 }
3250 /* TODO: It is still inefficient... */
3251 goto restart;
3252 }
3253 else
3254 {
3255 re_node_set union_set;
3256 next_node = dfa->nexts[ent->node];
3257 if (mctx->state_log[to_idx])
3258 {
3259 bool ok;
3260 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3261 next_node))
3262 continue;
3263 err = re_node_set_init_copy (&union_set,
3264 &mctx->state_log[to_idx]->nodes);
3265 ok = re_node_set_insert (&union_set, next_node);
3266 if (BE (err != REG_NOERROR || ! ok, 0))
3267 {
3268 re_node_set_free (&union_set);
3269 err = err != REG_NOERROR ? err : REG_ESPACE;
3270 return err;
3271 }
3272 }
3273 else
3274 {
3275 err = re_node_set_init_1 (&union_set, next_node);
3276 if (BE (err != REG_NOERROR, 0))
3277 return err;
3278 }
3279 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3280 re_node_set_free (&union_set);
3281 if (BE (mctx->state_log[to_idx] == NULL
3282 && err != REG_NOERROR, 0))
3283 return err;
3284 }
3285 }
3286 while (ent++->more);
3287 return REG_NOERROR;
3288 }
3289
3290 /* Build transition table for the state.
3291 Return true if successful. */
3292
3293 static bool
3294 internal_function
build_trtable(re_dfa_t * dfa,re_dfastate_t * state)3295 build_trtable (re_dfa_t *dfa, re_dfastate_t *state)
3296 {
3297 reg_errcode_t err;
3298 Idx i, j;
3299 int ch;
3300 bool need_word_trtable = false;
3301 bitset_word elem, mask;
3302 bool dests_node_malloced = false, dest_states_malloced = false;
3303 Idx ndests; /* Number of the destination states from `state'. */
3304 re_dfastate_t **trtable;
3305 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3306 re_node_set follows, *dests_node;
3307 bitset *dests_ch;
3308 bitset acceptable;
3309
3310 struct dests_alloc
3311 {
3312 re_node_set dests_node[SBC_MAX];
3313 bitset dests_ch[SBC_MAX];
3314 } *dests_alloc;
3315
3316 /* We build DFA states which corresponds to the destination nodes
3317 from `state'. `dests_node[i]' represents the nodes which i-th
3318 destination state contains, and `dests_ch[i]' represents the
3319 characters which i-th destination state accepts. */
3320 #ifndef __SSP__
3321 if (__libc_use_alloca (sizeof (struct dests_alloc)))
3322 dests_alloc = (struct dests_alloc *) alloca (sizeof dests_alloc[0]);
3323 else
3324 #endif
3325 {
3326 dests_alloc = re_malloc (struct dests_alloc, 1);
3327 if (BE (dests_alloc == NULL, 0))
3328 return false;
3329 dests_node_malloced = true;
3330 }
3331 dests_node = dests_alloc->dests_node;
3332 dests_ch = dests_alloc->dests_ch;
3333
3334 /* Initialize transiton table. */
3335 state->word_trtable = state->trtable = NULL;
3336
3337 /* At first, group all nodes belonging to `state' into several
3338 destinations. */
3339 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3340 if (BE (! REG_VALID_NONZERO_INDEX (ndests), 0))
3341 {
3342 if (dests_node_malloced)
3343 free (dests_alloc);
3344 if (ndests == 0)
3345 {
3346 state->trtable = re_calloc (re_dfastate_t *, SBC_MAX);
3347 return true;
3348 }
3349 return false;
3350 }
3351
3352 err = re_node_set_alloc (&follows, ndests + 1);
3353 if (BE (err != REG_NOERROR, 0))
3354 goto out_free;
3355
3356 /* Avoid arithmetic overflow in size calculation. */
3357 if (BE (((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX)
3358 / (3 * sizeof (re_dfastate_t *)))
3359 < ndests, 0))
3360 goto out_free;
3361
3362 #ifndef __SSP__
3363 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX
3364 + ndests * 3 * sizeof (re_dfastate_t *)))
3365 dest_states = (re_dfastate_t **)
3366 alloca (ndests * 3 * sizeof (re_dfastate_t *));
3367 else
3368 #endif
3369 {
3370 dest_states = (re_dfastate_t **)
3371 malloc (ndests * 3 * sizeof (re_dfastate_t *));
3372 if (BE (dest_states == NULL, 0))
3373 {
3374 out_free:
3375 if (dest_states_malloced)
3376 free (dest_states);
3377 re_node_set_free (&follows);
3378 for (i = 0; i < ndests; ++i)
3379 re_node_set_free (dests_node + i);
3380 if (dests_node_malloced)
3381 free (dests_alloc);
3382 return false;
3383 }
3384 dest_states_malloced = true;
3385 }
3386 dest_states_word = dest_states + ndests;
3387 dest_states_nl = dest_states_word + ndests;
3388 bitset_empty (acceptable);
3389
3390 /* Then build the states for all destinations. */
3391 for (i = 0; i < ndests; ++i)
3392 {
3393 Idx next_node;
3394 re_node_set_empty (&follows);
3395 /* Merge the follows of this destination states. */
3396 for (j = 0; j < dests_node[i].nelem; ++j)
3397 {
3398 next_node = dfa->nexts[dests_node[i].elems[j]];
3399 if (next_node != REG_MISSING)
3400 {
3401 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3402 if (BE (err != REG_NOERROR, 0))
3403 goto out_free;
3404 }
3405 }
3406 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3407 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3408 goto out_free;
3409 /* If the new state has context constraint,
3410 build appropriate states for these contexts. */
3411 if (dest_states[i]->has_constraint)
3412 {
3413 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3414 CONTEXT_WORD);
3415 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3416 goto out_free;
3417
3418 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3419 need_word_trtable = true;
3420
3421 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3422 CONTEXT_NEWLINE);
3423 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3424 goto out_free;
3425 }
3426 else
3427 {
3428 dest_states_word[i] = dest_states[i];
3429 dest_states_nl[i] = dest_states[i];
3430 }
3431 bitset_merge (acceptable, dests_ch[i]);
3432 }
3433
3434 if (!BE (need_word_trtable, 0))
3435 {
3436 /* We don't care about whether the following character is a word
3437 character, or we are in a single-byte character set so we can
3438 discern by looking at the character code: allocate a
3439 256-entry transition table. */
3440 trtable = state->trtable = re_calloc (re_dfastate_t *, SBC_MAX);
3441 if (BE (trtable == NULL, 0))
3442 goto out_free;
3443
3444 /* For all characters ch...: */
3445 for (i = 0; i < BITSET_WORDS; ++i)
3446 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3447 elem;
3448 mask <<= 1, elem >>= 1, ++ch)
3449 if (BE (elem & 1, 0))
3450 {
3451 /* There must be exactly one destination which accepts
3452 character ch. See group_nodes_into_DFAstates. */
3453 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3454 ;
3455
3456 /* j-th destination accepts the word character ch. */
3457 if (dfa->word_char[i] & mask)
3458 trtable[ch] = dest_states_word[j];
3459 else
3460 trtable[ch] = dest_states[j];
3461 }
3462 }
3463 else
3464 {
3465 /* We care about whether the following character is a word
3466 character, and we are in a multi-byte character set: discern
3467 by looking at the character code: build two 256-entry
3468 transition tables, one starting at trtable[0] and one
3469 starting at trtable[SBC_MAX]. */
3470 trtable = state->word_trtable = re_calloc (re_dfastate_t *, 2 * SBC_MAX);
3471 if (BE (trtable == NULL, 0))
3472 goto out_free;
3473
3474 /* For all characters ch...: */
3475 for (i = 0; i < BITSET_WORDS; ++i)
3476 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3477 elem;
3478 mask <<= 1, elem >>= 1, ++ch)
3479 if (BE (elem & 1, 0))
3480 {
3481 /* There must be exactly one destination which accepts
3482 character ch. See group_nodes_into_DFAstates. */
3483 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3484 ;
3485
3486 /* j-th destination accepts the word character ch. */
3487 trtable[ch] = dest_states[j];
3488 trtable[ch + SBC_MAX] = dest_states_word[j];
3489 }
3490 }
3491
3492 /* new line */
3493 if (bitset_contain (acceptable, NEWLINE_CHAR))
3494 {
3495 /* The current state accepts newline character. */
3496 for (j = 0; j < ndests; ++j)
3497 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3498 {
3499 /* k-th destination accepts newline character. */
3500 trtable[NEWLINE_CHAR] = dest_states_nl[j];
3501 if (need_word_trtable)
3502 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3503 /* There must be only one destination which accepts
3504 newline. See group_nodes_into_DFAstates. */
3505 break;
3506 }
3507 }
3508
3509 if (dest_states_malloced)
3510 free (dest_states);
3511
3512 re_node_set_free (&follows);
3513 for (i = 0; i < ndests; ++i)
3514 re_node_set_free (dests_node + i);
3515
3516 if (dests_node_malloced)
3517 free (dests_alloc);
3518
3519 return true;
3520 }
3521
3522 /* Group all nodes belonging to STATE into several destinations.
3523 Then for all destinations, set the nodes belonging to the destination
3524 to DESTS_NODE[i] and set the characters accepted by the destination
3525 to DEST_CH[i]. This function return the number of destinations. */
3526
3527 static Idx
3528 internal_function
group_nodes_into_DFAstates(const re_dfa_t * dfa,const re_dfastate_t * state,re_node_set * dests_node,bitset * dests_ch)3529 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3530 re_node_set *dests_node, bitset *dests_ch)
3531 {
3532 reg_errcode_t err;
3533 bool ok;
3534 Idx i, j, k;
3535 Idx ndests; /* Number of the destinations from `state'. */
3536 bitset accepts; /* Characters a node can accept. */
3537 const re_node_set *cur_nodes = &state->nodes;
3538 bitset_empty (accepts);
3539 ndests = 0;
3540
3541 /* For all the nodes belonging to `state', */
3542 for (i = 0; i < cur_nodes->nelem; ++i)
3543 {
3544 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3545 re_token_type_t type = node->type;
3546 unsigned int constraint = node->constraint;
3547
3548 /* Enumerate all single byte character this node can accept. */
3549 if (type == CHARACTER)
3550 bitset_set (accepts, node->opr.c);
3551 else if (type == SIMPLE_BRACKET)
3552 {
3553 bitset_merge (accepts, node->opr.sbcset);
3554 }
3555 else if (type == OP_PERIOD)
3556 {
3557 #ifdef RE_ENABLE_I18N
3558 if (dfa->mb_cur_max > 1)
3559 bitset_merge (accepts, dfa->sb_char);
3560 else
3561 #endif
3562 bitset_set_all (accepts);
3563 if (!(dfa->syntax & REG_DOT_NEWLINE))
3564 bitset_clear (accepts, '\n');
3565 if (dfa->syntax & REG_DOT_NOT_NULL)
3566 bitset_clear (accepts, '\0');
3567 }
3568 #ifdef RE_ENABLE_I18N
3569 else if (type == OP_UTF8_PERIOD)
3570 {
3571 if (SBC_MAX / 2 % BITSET_WORD_BITS == 0)
3572 memset (accepts, -1, sizeof accepts / 2);
3573 else
3574 bitset_merge (accepts, utf8_sb_map);
3575 if (!(dfa->syntax & REG_DOT_NEWLINE))
3576 bitset_clear (accepts, '\n');
3577 if (dfa->syntax & REG_DOT_NOT_NULL)
3578 bitset_clear (accepts, '\0');
3579 }
3580 #endif
3581 else
3582 continue;
3583
3584 /* Check the `accepts' and sift the characters which are not
3585 match it the context. */
3586 if (constraint)
3587 {
3588 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3589 {
3590 bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3591 bitset_empty (accepts);
3592 if (accepts_newline)
3593 bitset_set (accepts, NEWLINE_CHAR);
3594 else
3595 continue;
3596 }
3597 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3598 {
3599 bitset_empty (accepts);
3600 continue;
3601 }
3602
3603 if (constraint & NEXT_WORD_CONSTRAINT)
3604 {
3605 bitset_word any_set = 0;
3606 if (type == CHARACTER && !node->word_char)
3607 {
3608 bitset_empty (accepts);
3609 continue;
3610 }
3611 #ifdef RE_ENABLE_I18N
3612 if (dfa->mb_cur_max > 1)
3613 for (j = 0; j < BITSET_WORDS; ++j)
3614 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3615 else
3616 #endif
3617 for (j = 0; j < BITSET_WORDS; ++j)
3618 any_set |= (accepts[j] &= dfa->word_char[j]);
3619 if (!any_set)
3620 continue;
3621 }
3622 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3623 {
3624 bitset_word any_set = 0;
3625 if (type == CHARACTER && node->word_char)
3626 {
3627 bitset_empty (accepts);
3628 continue;
3629 }
3630 #ifdef RE_ENABLE_I18N
3631 if (dfa->mb_cur_max > 1)
3632 for (j = 0; j < BITSET_WORDS; ++j)
3633 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3634 else
3635 #endif
3636 for (j = 0; j < BITSET_WORDS; ++j)
3637 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3638 if (!any_set)
3639 continue;
3640 }
3641 }
3642
3643 /* Then divide `accepts' into DFA states, or create a new
3644 state. Above, we make sure that accepts is not empty. */
3645 for (j = 0; j < ndests; ++j)
3646 {
3647 bitset intersec; /* Intersection sets, see below. */
3648 bitset remains;
3649 /* Flags, see below. */
3650 bitset_word has_intersec, not_subset, not_consumed;
3651
3652 /* Optimization, skip if this state doesn't accept the character. */
3653 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3654 continue;
3655
3656 /* Enumerate the intersection set of this state and `accepts'. */
3657 has_intersec = 0;
3658 for (k = 0; k < BITSET_WORDS; ++k)
3659 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3660 /* And skip if the intersection set is empty. */
3661 if (!has_intersec)
3662 continue;
3663
3664 /* Then check if this state is a subset of `accepts'. */
3665 not_subset = not_consumed = 0;
3666 for (k = 0; k < BITSET_WORDS; ++k)
3667 {
3668 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3669 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3670 }
3671
3672 /* If this state isn't a subset of `accepts', create a
3673 new group state, which has the `remains'. */
3674 if (not_subset)
3675 {
3676 bitset_copy (dests_ch[ndests], remains);
3677 bitset_copy (dests_ch[j], intersec);
3678 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3679 if (BE (err != REG_NOERROR, 0))
3680 goto error_return;
3681 ++ndests;
3682 }
3683
3684 /* Put the position in the current group. */
3685 ok = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3686 if (BE (! ok, 0))
3687 goto error_return;
3688
3689 /* If all characters are consumed, go to next node. */
3690 if (!not_consumed)
3691 break;
3692 }
3693 /* Some characters remain, create a new group. */
3694 if (j == ndests)
3695 {
3696 bitset_copy (dests_ch[ndests], accepts);
3697 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3698 if (BE (err != REG_NOERROR, 0))
3699 goto error_return;
3700 ++ndests;
3701 bitset_empty (accepts);
3702 }
3703 }
3704 return ndests;
3705 error_return:
3706 for (j = 0; j < ndests; ++j)
3707 re_node_set_free (dests_node + j);
3708 return REG_MISSING;
3709 }
3710
3711 #ifdef RE_ENABLE_I18N
3712 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3713 Return the number of the bytes the node accepts.
3714 STR_IDX is the current index of the input string.
3715
3716 This function handles the nodes which can accept one character, or
3717 one collating element like '.', '[a-z]', opposite to the other nodes
3718 can only accept one byte. */
3719
3720 static int
3721 internal_function
check_node_accept_bytes(re_dfa_t * dfa,Idx node_idx,const re_string_t * input,Idx str_idx)3722 check_node_accept_bytes (re_dfa_t *dfa, Idx node_idx,
3723 const re_string_t *input, Idx str_idx)
3724 {
3725 const re_token_t *node = dfa->nodes + node_idx;
3726 int char_len, elem_len;
3727 Idx i;
3728
3729 if (BE (node->type == OP_UTF8_PERIOD, 0))
3730 {
3731 unsigned char c = re_string_byte_at (input, str_idx), d;
3732 if (BE (c < 0xc2, 1))
3733 return 0;
3734
3735 if (str_idx + 2 > input->len)
3736 return 0;
3737
3738 d = re_string_byte_at (input, str_idx + 1);
3739 if (c < 0xe0)
3740 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3741 else if (c < 0xf0)
3742 {
3743 char_len = 3;
3744 if (c == 0xe0 && d < 0xa0)
3745 return 0;
3746 }
3747 else if (c < 0xf8)
3748 {
3749 char_len = 4;
3750 if (c == 0xf0 && d < 0x90)
3751 return 0;
3752 }
3753 else if (c < 0xfc)
3754 {
3755 char_len = 5;
3756 if (c == 0xf8 && d < 0x88)
3757 return 0;
3758 }
3759 else if (c < 0xfe)
3760 {
3761 char_len = 6;
3762 if (c == 0xfc && d < 0x84)
3763 return 0;
3764 }
3765 else
3766 return 0;
3767
3768 if (str_idx + char_len > input->len)
3769 return 0;
3770
3771 for (i = 1; i < char_len; ++i)
3772 {
3773 d = re_string_byte_at (input, str_idx + i);
3774 if (d < 0x80 || d > 0xbf)
3775 return 0;
3776 }
3777 return char_len;
3778 }
3779
3780 char_len = re_string_char_size_at (input, str_idx);
3781 if (node->type == OP_PERIOD)
3782 {
3783 if (char_len <= 1)
3784 return 0;
3785 /* FIXME: I don't think this if is needed, as both '\n'
3786 and '\0' are char_len == 1. */
3787 /* '.' accepts any one character except the following two cases. */
3788 if ((!(dfa->syntax & REG_DOT_NEWLINE) &&
3789 re_string_byte_at (input, str_idx) == '\n') ||
3790 ((dfa->syntax & REG_DOT_NOT_NULL) &&
3791 re_string_byte_at (input, str_idx) == '\0'))
3792 return 0;
3793 return char_len;
3794 }
3795
3796 elem_len = re_string_elem_size_at (input, str_idx);
3797 if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3798 return 0;
3799
3800 if (node->type == COMPLEX_BRACKET)
3801 {
3802 const re_charset_t *cset = node->opr.mbcset;
3803 # ifdef _LIBC
3804 const unsigned char *pin
3805 = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3806 Idx j;
3807 uint32_t nrules;
3808 # endif /* _LIBC */
3809 int match_len = 0;
3810 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3811 ? re_string_wchar_at (input, str_idx) : 0);
3812
3813 /* match with multibyte character? */
3814 for (i = 0; i < cset->nmbchars; ++i)
3815 if (wc == cset->mbchars[i])
3816 {
3817 match_len = char_len;
3818 goto check_node_accept_bytes_match;
3819 }
3820 /* match with character_class? */
3821 for (i = 0; i < cset->nchar_classes; ++i)
3822 {
3823 wctype_t wt = cset->char_classes[i];
3824 if (__iswctype (wc, wt))
3825 {
3826 match_len = char_len;
3827 goto check_node_accept_bytes_match;
3828 }
3829 }
3830
3831 # ifdef _LIBC
3832 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3833 if (nrules != 0)
3834 {
3835 unsigned int in_collseq = 0;
3836 const int32_t *table, *indirect;
3837 const unsigned char *weights, *extra;
3838 const char *collseqwc;
3839 int32_t idx;
3840 /* This #include defines a local function! */
3841 # include <locale/weight.h>
3842
3843 /* match with collating_symbol? */
3844 if (cset->ncoll_syms)
3845 extra = (const unsigned char *)
3846 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3847 for (i = 0; i < cset->ncoll_syms; ++i)
3848 {
3849 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3850 /* Compare the length of input collating element and
3851 the length of current collating element. */
3852 if (*coll_sym != elem_len)
3853 continue;
3854 /* Compare each bytes. */
3855 for (j = 0; j < *coll_sym; j++)
3856 if (pin[j] != coll_sym[1 + j])
3857 break;
3858 if (j == *coll_sym)
3859 {
3860 /* Match if every bytes is equal. */
3861 match_len = j;
3862 goto check_node_accept_bytes_match;
3863 }
3864 }
3865
3866 if (cset->nranges)
3867 {
3868 if (elem_len <= char_len)
3869 {
3870 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3871 in_collseq = __collseq_table_lookup (collseqwc, wc);
3872 }
3873 else
3874 in_collseq = find_collation_sequence_value (pin, elem_len);
3875 }
3876 /* match with range expression? */
3877 for (i = 0; i < cset->nranges; ++i)
3878 if (cset->range_starts[i] <= in_collseq
3879 && in_collseq <= cset->range_ends[i])
3880 {
3881 match_len = elem_len;
3882 goto check_node_accept_bytes_match;
3883 }
3884
3885 /* match with equivalence_class? */
3886 if (cset->nequiv_classes)
3887 {
3888 const unsigned char *cp = pin;
3889 table = (const int32_t *)
3890 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3891 weights = (const unsigned char *)
3892 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3893 extra = (const unsigned char *)
3894 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3895 indirect = (const int32_t *)
3896 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3897 idx = findidx (&cp);
3898 if (idx > 0)
3899 for (i = 0; i < cset->nequiv_classes; ++i)
3900 {
3901 int32_t equiv_class_idx = cset->equiv_classes[i];
3902 size_t weight_len = weights[idx];
3903 if (weight_len == weights[equiv_class_idx])
3904 {
3905 Idx cnt = 0;
3906 while (cnt <= weight_len
3907 && (weights[equiv_class_idx + 1 + cnt]
3908 == weights[idx + 1 + cnt]))
3909 ++cnt;
3910 if (cnt > weight_len)
3911 {
3912 match_len = elem_len;
3913 goto check_node_accept_bytes_match;
3914 }
3915 }
3916 }
3917 }
3918 }
3919 else
3920 # endif /* _LIBC */
3921 {
3922 /* match with range expression? */
3923 #if __GNUC__ >= 2
3924 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3925 #else
3926 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3927 cmp_buf[2] = wc;
3928 #endif
3929 for (i = 0; i < cset->nranges; ++i)
3930 {
3931 cmp_buf[0] = cset->range_starts[i];
3932 cmp_buf[4] = cset->range_ends[i];
3933 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3934 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3935 {
3936 match_len = char_len;
3937 goto check_node_accept_bytes_match;
3938 }
3939 }
3940 }
3941 check_node_accept_bytes_match:
3942 if (!cset->non_match)
3943 return match_len;
3944 else
3945 {
3946 if (match_len > 0)
3947 return 0;
3948 else
3949 return (elem_len > char_len) ? elem_len : char_len;
3950 }
3951 }
3952 return 0;
3953 }
3954
3955 # ifdef _LIBC
3956 static unsigned int
find_collation_sequence_value(const unsigned char * mbs,size_t mbs_len)3957 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3958 {
3959 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3960 if (nrules == 0)
3961 {
3962 if (mbs_len == 1)
3963 {
3964 /* No valid character. Match it as a single byte character. */
3965 const unsigned char *collseq = (const unsigned char *)
3966 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3967 return collseq[mbs[0]];
3968 }
3969 return UINT_MAX;
3970 }
3971 else
3972 {
3973 int32_t idx;
3974 const unsigned char *extra = (const unsigned char *)
3975 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3976 int32_t extrasize = (const unsigned char *)
3977 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3978
3979 for (idx = 0; idx < extrasize;)
3980 {
3981 int mbs_cnt;
3982 bool found = false;
3983 int32_t elem_mbs_len;
3984 /* Skip the name of collating element name. */
3985 idx = idx + extra[idx] + 1;
3986 elem_mbs_len = extra[idx++];
3987 if (mbs_len == elem_mbs_len)
3988 {
3989 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3990 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3991 break;
3992 if (mbs_cnt == elem_mbs_len)
3993 /* Found the entry. */
3994 found = true;
3995 }
3996 /* Skip the byte sequence of the collating element. */
3997 idx += elem_mbs_len;
3998 /* Adjust for the alignment. */
3999 idx = (idx + 3) & ~3;
4000 /* Skip the collation sequence value. */
4001 idx += sizeof (uint32_t);
4002 /* Skip the wide char sequence of the collating element. */
4003 idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
4004 /* If we found the entry, return the sequence value. */
4005 if (found)
4006 return *(uint32_t *) (extra + idx);
4007 /* Skip the collation sequence value. */
4008 idx += sizeof (uint32_t);
4009 }
4010 return UINT_MAX;
4011 }
4012 }
4013 # endif /* _LIBC */
4014 #endif /* RE_ENABLE_I18N */
4015
4016 /* Check whether the node accepts the byte which is IDX-th
4017 byte of the INPUT. */
4018
4019 static bool
4020 internal_function
check_node_accept(const re_match_context_t * mctx,const re_token_t * node,Idx idx)4021 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4022 Idx idx)
4023 {
4024 unsigned char ch;
4025 ch = re_string_byte_at (&mctx->input, idx);
4026 switch (node->type)
4027 {
4028 case CHARACTER:
4029 if (node->opr.c != ch)
4030 return false;
4031 break;
4032
4033 case SIMPLE_BRACKET:
4034 if (!bitset_contain (node->opr.sbcset, ch))
4035 return false;
4036 break;
4037
4038 #ifdef RE_ENABLE_I18N
4039 case OP_UTF8_PERIOD:
4040 if (ch >= 0x80)
4041 return false;
4042 /* FALLTHROUGH */
4043 #endif
4044 case OP_PERIOD:
4045 if ((ch == '\n' && !(mctx->dfa->syntax & REG_DOT_NEWLINE))
4046 || (ch == '\0' && (mctx->dfa->syntax & REG_DOT_NOT_NULL)))
4047 return false;
4048 break;
4049
4050 default:
4051 return false;
4052 }
4053
4054 if (node->constraint)
4055 {
4056 /* The node has constraints. Check whether the current context
4057 satisfies the constraints. */
4058 unsigned int context = re_string_context_at (&mctx->input, idx,
4059 mctx->eflags);
4060 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4061 return false;
4062 }
4063
4064 return true;
4065 }
4066
4067 /* Extend the buffers, if the buffers have run out. */
4068
4069 static reg_errcode_t
4070 internal_function
extend_buffers(re_match_context_t * mctx)4071 extend_buffers (re_match_context_t *mctx)
4072 {
4073 reg_errcode_t ret;
4074 re_string_t *pstr = &mctx->input;
4075
4076 /* Double the lengthes of the buffers. */
4077 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4078 if (BE (ret != REG_NOERROR, 0))
4079 return ret;
4080
4081 if (mctx->state_log != NULL)
4082 {
4083 /* And double the length of state_log. */
4084 /* XXX We have no indication of the size of this buffer. If this
4085 allocation fail we have no indication that the state_log array
4086 does not have the right size. */
4087 re_dfastate_t **new_array = re_xrealloc (mctx->state_log, re_dfastate_t *,
4088 pstr->bufs_len + 1);
4089 if (BE (new_array == NULL, 0))
4090 return REG_ESPACE;
4091 mctx->state_log = new_array;
4092 }
4093
4094 /* Then reconstruct the buffers. */
4095 if (pstr->icase)
4096 {
4097 #ifdef RE_ENABLE_I18N
4098 if (pstr->mb_cur_max > 1)
4099 {
4100 ret = build_wcs_upper_buffer (pstr);
4101 if (BE (ret != REG_NOERROR, 0))
4102 return ret;
4103 }
4104 else
4105 #endif /* RE_ENABLE_I18N */
4106 build_upper_buffer (pstr);
4107 }
4108 else
4109 {
4110 #ifdef RE_ENABLE_I18N
4111 if (pstr->mb_cur_max > 1)
4112 build_wcs_buffer (pstr);
4113 else
4114 #endif /* RE_ENABLE_I18N */
4115 {
4116 if (pstr->trans != NULL)
4117 re_string_translate_buffer (pstr);
4118 }
4119 }
4120 return REG_NOERROR;
4121 }
4122
4123
4124 /* Functions for matching context. */
4125
4126 /* Initialize MCTX. */
4127
4128 static reg_errcode_t
4129 internal_function
match_ctx_init(re_match_context_t * mctx,int eflags,Idx n)4130 match_ctx_init (re_match_context_t *mctx, int eflags, Idx n)
4131 {
4132 mctx->eflags = eflags;
4133 mctx->match_last = REG_MISSING;
4134 if (n > 0)
4135 {
4136 mctx->bkref_ents = re_xmalloc (struct re_backref_cache_entry, n);
4137 mctx->sub_tops = re_xmalloc (re_sub_match_top_t *, n);
4138 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4139 return REG_ESPACE;
4140 }
4141 /* Already zero-ed by the caller.
4142 else
4143 mctx->bkref_ents = NULL;
4144 mctx->nbkref_ents = 0;
4145 mctx->nsub_tops = 0; */
4146 mctx->abkref_ents = n;
4147 mctx->max_mb_elem_len = 1;
4148 mctx->asub_tops = n;
4149 return REG_NOERROR;
4150 }
4151
4152 /* Clean the entries which depend on the current input in MCTX.
4153 This function must be invoked when the matcher changes the start index
4154 of the input, or changes the input string. */
4155
4156 static void
4157 internal_function
match_ctx_clean(re_match_context_t * mctx)4158 match_ctx_clean (re_match_context_t *mctx)
4159 {
4160 Idx st_idx;
4161 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4162 {
4163 Idx sl_idx;
4164 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4165 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4166 {
4167 re_sub_match_last_t *last = top->lasts[sl_idx];
4168 re_free (last->path.array);
4169 re_free (last);
4170 }
4171 re_free (top->lasts);
4172 if (top->path)
4173 {
4174 re_free (top->path->array);
4175 re_free (top->path);
4176 }
4177 free (top);
4178 }
4179
4180 mctx->nsub_tops = 0;
4181 mctx->nbkref_ents = 0;
4182 }
4183
4184 /* Free all the memory associated with MCTX. */
4185
4186 static void
4187 internal_function
match_ctx_free(re_match_context_t * mctx)4188 match_ctx_free (re_match_context_t *mctx)
4189 {
4190 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4191 match_ctx_clean (mctx);
4192 re_free (mctx->sub_tops);
4193 re_free (mctx->bkref_ents);
4194 }
4195
4196 /* Add a new backreference entry to MCTX.
4197 Note that we assume that caller never call this function with duplicate
4198 entry, and call with STR_IDX which isn't smaller than any existing entry.
4199 */
4200
4201 static reg_errcode_t
4202 internal_function
match_ctx_add_entry(re_match_context_t * mctx,Idx node,Idx str_idx,Idx from,Idx to)4203 match_ctx_add_entry (re_match_context_t *mctx, Idx node, Idx str_idx,
4204 Idx from, Idx to)
4205 {
4206 if (mctx->nbkref_ents >= mctx->abkref_ents)
4207 {
4208 struct re_backref_cache_entry* new_entry;
4209 new_entry = re_x2realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4210 &mctx->abkref_ents);
4211 if (BE (new_entry == NULL, 0))
4212 {
4213 re_free (mctx->bkref_ents);
4214 return REG_ESPACE;
4215 }
4216 mctx->bkref_ents = new_entry;
4217 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4218 (sizeof (struct re_backref_cache_entry)
4219 * (mctx->abkref_ents - mctx->nbkref_ents)));
4220 }
4221 if (mctx->nbkref_ents > 0
4222 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4223 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4224
4225 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4226 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4227 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4228 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4229
4230 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4231 If bit N is clear, means that this entry won't epsilon-transition to
4232 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4233 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4234 such node.
4235
4236 A backreference does not epsilon-transition unless it is empty, so set
4237 to all zeros if FROM != TO. */
4238 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4239 = (from == to ? -1 : 0);
4240
4241 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4242 if (mctx->max_mb_elem_len < to - from)
4243 mctx->max_mb_elem_len = to - from;
4244 return REG_NOERROR;
4245 }
4246
4247 /* Return the first entry with the same str_idx, or REG_MISSING if none is
4248 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4249
4250 static Idx
4251 internal_function
search_cur_bkref_entry(const re_match_context_t * mctx,Idx str_idx)4252 search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
4253 {
4254 Idx left, right, mid, last;
4255 last = right = mctx->nbkref_ents;
4256 for (left = 0; left < right;)
4257 {
4258 mid = (left + right) / 2;
4259 if (mctx->bkref_ents[mid].str_idx < str_idx)
4260 left = mid + 1;
4261 else
4262 right = mid;
4263 }
4264 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4265 return left;
4266 else
4267 return REG_MISSING;
4268 }
4269
4270 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4271 at STR_IDX. */
4272
4273 static reg_errcode_t
4274 internal_function
match_ctx_add_subtop(re_match_context_t * mctx,Idx node,Idx str_idx)4275 match_ctx_add_subtop (re_match_context_t *mctx, Idx node, Idx str_idx)
4276 {
4277 #ifdef DEBUG
4278 assert (mctx->sub_tops != NULL);
4279 assert (mctx->asub_tops > 0);
4280 #endif
4281 if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4282 {
4283 Idx new_asub_tops = mctx->asub_tops;
4284 re_sub_match_top_t **new_array = re_x2realloc (mctx->sub_tops,
4285 re_sub_match_top_t *,
4286 &new_asub_tops);
4287 if (BE (new_array == NULL, 0))
4288 return REG_ESPACE;
4289 mctx->sub_tops = new_array;
4290 mctx->asub_tops = new_asub_tops;
4291 }
4292 mctx->sub_tops[mctx->nsub_tops] = re_calloc (re_sub_match_top_t, 1);
4293 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4294 return REG_ESPACE;
4295 mctx->sub_tops[mctx->nsub_tops]->node = node;
4296 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4297 return REG_NOERROR;
4298 }
4299
4300 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4301 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4302
4303 static re_sub_match_last_t *
4304 internal_function
match_ctx_add_sublast(re_sub_match_top_t * subtop,Idx node,Idx str_idx)4305 match_ctx_add_sublast (re_sub_match_top_t *subtop, Idx node, Idx str_idx)
4306 {
4307 re_sub_match_last_t *new_entry;
4308 if (BE (subtop->nlasts == subtop->alasts, 0))
4309 {
4310 Idx new_alasts = subtop->alasts;
4311 re_sub_match_last_t **new_array = re_x2realloc (subtop->lasts,
4312 re_sub_match_last_t *,
4313 &new_alasts);
4314 if (BE (new_array == NULL, 0))
4315 return NULL;
4316 subtop->lasts = new_array;
4317 subtop->alasts = new_alasts;
4318 }
4319 new_entry = re_calloc (re_sub_match_last_t, 1);
4320 if (BE (new_entry != NULL, 1))
4321 {
4322 subtop->lasts[subtop->nlasts] = new_entry;
4323 new_entry->node = node;
4324 new_entry->str_idx = str_idx;
4325 ++subtop->nlasts;
4326 }
4327 return new_entry;
4328 }
4329
4330 static void
4331 internal_function
sift_ctx_init(re_sift_context_t * sctx,re_dfastate_t ** sifted_sts,re_dfastate_t ** limited_sts,Idx last_node,Idx last_str_idx)4332 sift_ctx_init (re_sift_context_t *sctx,
4333 re_dfastate_t **sifted_sts,
4334 re_dfastate_t **limited_sts,
4335 Idx last_node, Idx last_str_idx)
4336 {
4337 sctx->sifted_states = sifted_sts;
4338 sctx->limited_states = limited_sts;
4339 sctx->last_node = last_node;
4340 sctx->last_str_idx = last_str_idx;
4341 re_node_set_init_empty (&sctx->limits);
4342 }
4343