1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
19 #include <sys/cdefs.h>
20 __RCSID("$NetBSD: regcomp.c,v 1.2 2016/05/17 14:00:09 christos Exp $");
21
22
23 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
24 Idx length, reg_syntax_t syntax);
25 static void re_compile_fastmap_iter (regex_t *bufp,
26 const re_dfastate_t *init_state,
27 char *fastmap);
28 static reg_errcode_t init_dfa (re_dfa_t *dfa, Idx pat_len);
29 #ifdef RE_ENABLE_I18N
30 static void free_charset (re_charset_t *cset);
31 #endif /* RE_ENABLE_I18N */
32 static void free_workarea_compile (regex_t *preg);
33 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
34 #ifdef RE_ENABLE_I18N
35 static void optimize_utf8 (re_dfa_t *dfa);
36 #endif
37 static reg_errcode_t analyze (regex_t *preg);
38 static reg_errcode_t preorder (bin_tree_t *root,
39 reg_errcode_t (fn (void *, bin_tree_t *)),
40 void *extra);
41 static reg_errcode_t postorder (bin_tree_t *root,
42 reg_errcode_t (fn (void *, bin_tree_t *)),
43 void *extra);
44 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
45 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
46 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
47 bin_tree_t *node);
48 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
49 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
50 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
51 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
52 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
53 unsigned int constraint);
54 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
55 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
56 Idx node, bool root);
57 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
58 static Idx fetch_number (re_string_t *input, re_token_t *token,
59 reg_syntax_t syntax);
60 static int peek_token (re_token_t *token, re_string_t *input,
61 reg_syntax_t syntax);
62 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
63 reg_syntax_t syntax, reg_errcode_t *err);
64 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
65 re_token_t *token, reg_syntax_t syntax,
66 Idx nest, reg_errcode_t *err);
67 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
68 re_token_t *token, reg_syntax_t syntax,
69 Idx nest, reg_errcode_t *err);
70 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
71 re_token_t *token, reg_syntax_t syntax,
72 Idx nest, reg_errcode_t *err);
73 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
74 re_token_t *token, reg_syntax_t syntax,
75 Idx nest, reg_errcode_t *err);
76 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
77 re_dfa_t *dfa, re_token_t *token,
78 reg_syntax_t syntax, reg_errcode_t *err);
79 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
80 re_token_t *token, reg_syntax_t syntax,
81 reg_errcode_t *err);
82 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
83 re_string_t *regexp,
84 re_token_t *token, int token_len,
85 re_dfa_t *dfa,
86 reg_syntax_t syntax,
87 bool accept_hyphen);
88 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
89 re_string_t *regexp,
90 re_token_t *token);
91 #ifdef RE_ENABLE_I18N
92 static reg_errcode_t build_equiv_class (bitset sbcset,
93 re_charset_t *mbcset,
94 Idx *equiv_class_alloc,
95 const unsigned char *name);
96 static reg_errcode_t build_charclass (unsigned REG_TRANSLATE_TYPE trans,
97 bitset sbcset,
98 re_charset_t *mbcset,
99 Idx *char_class_alloc,
100 const unsigned char *class_name,
101 reg_syntax_t syntax);
102 #else /* not RE_ENABLE_I18N */
103 static reg_errcode_t build_equiv_class (bitset sbcset,
104 const unsigned char *name);
105 static reg_errcode_t build_charclass (unsigned REG_TRANSLATE_TYPE trans,
106 bitset sbcset,
107 const unsigned char *class_name,
108 reg_syntax_t syntax);
109 #endif /* not RE_ENABLE_I18N */
110 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
111 unsigned REG_TRANSLATE_TYPE trans,
112 const unsigned char *class_name,
113 const unsigned char *extra,
114 bool non_match, reg_errcode_t *err);
115 static bin_tree_t *create_tree (re_dfa_t *dfa,
116 bin_tree_t *left, bin_tree_t *right,
117 re_token_type_t type);
118 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
119 bin_tree_t *left, bin_tree_t *right,
120 const re_token_t *token);
121 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
122 static void free_token (re_token_t *node);
123 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
124 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
125
126 /* This table gives an error message for each of the error codes listed
127 in regex.h. Obviously the order here has to be same as there.
128 POSIX doesn't require that we do anything for REG_NOERROR,
129 but why not be nice? */
130
131 const char __re_error_msgid[] attribute_hidden =
132 {
133 #define REG_NOERROR_IDX 0
134 gettext_noop ("Success") /* REG_NOERROR */
135 "\0"
136 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
137 gettext_noop ("No match") /* REG_NOMATCH */
138 "\0"
139 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
140 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
141 "\0"
142 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
143 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
144 "\0"
145 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
146 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
147 "\0"
148 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
149 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
150 "\0"
151 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
152 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
153 "\0"
154 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
155 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
156 "\0"
157 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
158 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
159 "\0"
160 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
161 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
162 "\0"
163 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
164 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
165 "\0"
166 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
167 gettext_noop ("Invalid range end") /* REG_ERANGE */
168 "\0"
169 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
170 gettext_noop ("Memory exhausted") /* REG_ESPACE */
171 "\0"
172 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
173 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
174 "\0"
175 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
176 gettext_noop ("Premature end of regular expression") /* REG_EEND */
177 "\0"
178 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
179 gettext_noop ("Regular expression too big") /* REG_ESIZE */
180 "\0"
181 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
182 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
183 };
184
185 const size_t __re_error_msgid_idx[] attribute_hidden =
186 {
187 REG_NOERROR_IDX,
188 REG_NOMATCH_IDX,
189 REG_BADPAT_IDX,
190 REG_ECOLLATE_IDX,
191 REG_ECTYPE_IDX,
192 REG_EESCAPE_IDX,
193 REG_ESUBREG_IDX,
194 REG_EBRACK_IDX,
195 REG_EPAREN_IDX,
196 REG_EBRACE_IDX,
197 REG_BADBR_IDX,
198 REG_ERANGE_IDX,
199 REG_ESPACE_IDX,
200 REG_BADRPT_IDX,
201 REG_EEND_IDX,
202 REG_ESIZE_IDX,
203 REG_ERPAREN_IDX
204 };
205
206 /* Entry points for GNU code. */
207
208 /* re_compile_pattern is the GNU regular expression compiler: it
209 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
210 Returns 0 if the pattern was valid, otherwise an error string.
211
212 Assumes the `re_allocated' (and perhaps `re_buffer') and `translate' fields
213 are set in BUFP on entry. */
214
215 const char *
re_compile_pattern(const char * pattern,size_t length,struct re_pattern_buffer * bufp)216 re_compile_pattern (const char *pattern, size_t length,
217 struct re_pattern_buffer *bufp)
218 {
219 reg_errcode_t ret;
220
221 /* And GNU code determines whether or not to get register information
222 by passing null for the REGS argument to re_match, etc., not by
223 setting re_no_sub, unless REG_NO_SUB is set. */
224 bufp->re_no_sub = !!(re_syntax_options & REG_NO_SUB);
225
226 /* Match anchors at newline. */
227 bufp->re_newline_anchor = 1;
228
229 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
230
231 if (!ret)
232 return NULL;
233 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
234 }
235 #ifdef _LIBC
weak_alias(__re_compile_pattern,re_compile_pattern)236 weak_alias (__re_compile_pattern, re_compile_pattern)
237 #endif
238
239 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
240 also be assigned to arbitrarily: each pattern buffer stores its own
241 syntax, so it can be changed between regex compilations. */
242 /* This has no initializer because initialized variables in Emacs
243 become read-only after dumping. */
244 reg_syntax_t re_syntax_options;
245
246
247 /* Specify the precise syntax of regexps for compilation. This provides
248 for compatibility for various utilities which historically have
249 different, incompatible syntaxes.
250
251 The argument SYNTAX is a bit mask comprised of the various bits
252 defined in regex.h. We return the old syntax. */
253
254 reg_syntax_t
255 re_set_syntax (reg_syntax_t syntax)
256 {
257 reg_syntax_t ret = re_syntax_options;
258
259 re_syntax_options = syntax;
260 return ret;
261 }
262 #ifdef _LIBC
weak_alias(__re_set_syntax,re_set_syntax)263 weak_alias (__re_set_syntax, re_set_syntax)
264 #endif
265
266 int
267 re_compile_fastmap (struct re_pattern_buffer *bufp)
268 {
269 re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer;
270 char *fastmap = bufp->re_fastmap;
271
272 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
273 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
274 if (dfa->init_state != dfa->init_state_word)
275 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
276 if (dfa->init_state != dfa->init_state_nl)
277 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
278 if (dfa->init_state != dfa->init_state_begbuf)
279 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
280 bufp->re_fastmap_accurate = 1;
281 return 0;
282 }
283 #ifdef _LIBC
weak_alias(__re_compile_fastmap,re_compile_fastmap)284 weak_alias (__re_compile_fastmap, re_compile_fastmap)
285 #endif
286
287 static inline void
288 __attribute ((always_inline))
289 re_set_fastmap (char *fastmap, bool icase, int ch)
290 {
291 fastmap[ch] = 1;
292 if (icase)
293 fastmap[tolower (ch)] = 1;
294 }
295
296 /* Helper function for re_compile_fastmap.
297 Compile fastmap for the initial_state INIT_STATE. */
298
299 static void
re_compile_fastmap_iter(regex_t * bufp,const re_dfastate_t * init_state,char * fastmap)300 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
301 char *fastmap)
302 {
303 re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer;
304 Idx node_cnt;
305 bool icase = (dfa->mb_cur_max == 1 && (bufp->re_syntax & REG_IGNORE_CASE));
306 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
307 {
308 Idx node = init_state->nodes.elems[node_cnt];
309 re_token_type_t type = dfa->nodes[node].type;
310
311 if (type == CHARACTER)
312 {
313 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
314 #ifdef RE_ENABLE_I18N
315 if ((bufp->re_syntax & REG_IGNORE_CASE) && dfa->mb_cur_max > 1)
316 {
317 unsigned char buf[MB_LEN_MAX];
318 unsigned char *p;
319 wchar_t wc;
320 mbstate_t state;
321
322 p = buf;
323 *p++ = dfa->nodes[node].opr.c;
324 while (++node < dfa->nodes_len
325 && dfa->nodes[node].type == CHARACTER
326 && dfa->nodes[node].mb_partial)
327 *p++ = dfa->nodes[node].opr.c;
328 memset (&state, 0, sizeof (state));
329 if (mbrtowc (&wc, (const char *) buf, p - buf,
330 &state) == p - buf
331 && (__wcrtomb ((char *) buf, towlower (wc), &state)
332 != (size_t) -1))
333 re_set_fastmap (fastmap, false, buf[0]);
334 }
335 #endif
336 }
337 else if (type == SIMPLE_BRACKET)
338 {
339 int i, j, ch;
340 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
341 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
342 if (dfa->nodes[node].opr.sbcset[i] & ((bitset_word) 1 << j))
343 re_set_fastmap (fastmap, icase, ch);
344 }
345 #ifdef RE_ENABLE_I18N
346 else if (type == COMPLEX_BRACKET)
347 {
348 Idx i;
349 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
350 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
351 || cset->nranges || cset->nchar_classes)
352 {
353 # ifdef _LIBC
354 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
355 {
356 /* In this case we want to catch the bytes which are
357 the first byte of any collation elements.
358 e.g. In da_DK, we want to catch 'a' since "aa"
359 is a valid collation element, and don't catch
360 'b' since 'b' is the only collation element
361 which starts from 'b'. */
362 const int32_t *table = (const int32_t *)
363 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
364 for (i = 0; i < SBC_MAX; ++i)
365 if (table[i] < 0)
366 re_set_fastmap (fastmap, icase, i);
367 }
368 # else
369 if (dfa->mb_cur_max > 1)
370 for (i = 0; i < SBC_MAX; ++i)
371 if (__btowc (i) == WEOF)
372 re_set_fastmap (fastmap, icase, i);
373 # endif /* not _LIBC */
374 }
375 for (i = 0; i < cset->nmbchars; ++i)
376 {
377 char buf[256];
378 mbstate_t state;
379 memset (&state, '\0', sizeof (state));
380 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
381 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
382 if ((bufp->re_syntax & REG_IGNORE_CASE) && dfa->mb_cur_max > 1)
383 {
384 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
385 != (size_t) -1)
386 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
387 }
388 }
389 }
390 #endif /* RE_ENABLE_I18N */
391 else if (type == OP_PERIOD
392 #ifdef RE_ENABLE_I18N
393 || type == OP_UTF8_PERIOD
394 #endif /* RE_ENABLE_I18N */
395 || type == END_OF_RE)
396 {
397 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
398 if (type == END_OF_RE)
399 bufp->re_can_be_null = 1;
400 return;
401 }
402 }
403 }
404
405 /* Entry point for POSIX code. */
406 /* regcomp takes a regular expression as a string and compiles it.
407
408 PREG is a regex_t *. We do not expect any fields to be initialized,
409 since POSIX says we shouldn't. Thus, we set
410
411 `re_buffer' to the compiled pattern;
412 `re_used' to the length of the compiled pattern;
413 `re_syntax' to REG_SYNTAX_POSIX_EXTENDED if the
414 REG_EXTENDED bit in CFLAGS is set; otherwise, to
415 REG_SYNTAX_POSIX_BASIC;
416 `re_newline_anchor' to REG_NEWLINE being set in CFLAGS;
417 `re_fastmap' to an allocated space for the fastmap;
418 `re_fastmap_accurate' to zero;
419 `re_nsub' to the number of subexpressions in PATTERN.
420
421 PATTERN is the address of the pattern string.
422
423 CFLAGS is a series of bits which affect compilation.
424
425 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
426 use POSIX basic syntax.
427
428 If REG_NEWLINE is set, then . and [^...] don't match newline.
429 Also, regexec will try a match beginning after every newline.
430
431 If REG_ICASE is set, then we considers upper- and lowercase
432 versions of letters to be equivalent when matching.
433
434 If REG_NOSUB is set, then when PREG is passed to regexec, that
435 routine will report only success or failure, and nothing about the
436 registers.
437
438 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
439 the return codes and their meanings.) */
440
441 int
regcomp(regex_t * __restrict preg,const char * __restrict pattern,int cflags)442 regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
443 {
444 reg_errcode_t ret;
445 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? REG_SYNTAX_POSIX_EXTENDED
446 : REG_SYNTAX_POSIX_BASIC);
447
448 preg->re_buffer = NULL;
449 preg->re_allocated = 0;
450 preg->re_used = 0;
451
452 /* Try to allocate space for the fastmap. */
453 preg->re_fastmap = re_malloc (char, SBC_MAX);
454 if (BE (preg->re_fastmap == NULL, 0))
455 return REG_ESPACE;
456
457 syntax |= (cflags & REG_ICASE) ? REG_IGNORE_CASE : 0;
458
459 /* If REG_NEWLINE is set, newlines are treated differently. */
460 if (cflags & REG_NEWLINE)
461 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
462 syntax &= ~REG_DOT_NEWLINE;
463 syntax |= REG_HAT_LISTS_NOT_NEWLINE;
464 /* It also changes the matching behavior. */
465 preg->re_newline_anchor = 1;
466 }
467 else
468 preg->re_newline_anchor = 0;
469 preg->re_no_sub = !!(cflags & REG_NOSUB);
470 preg->re_translate = NULL;
471
472 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
473
474 /* POSIX doesn't distinguish between an unmatched open-group and an
475 unmatched close-group: both are REG_EPAREN. */
476 if (ret == REG_ERPAREN)
477 ret = REG_EPAREN;
478
479 /* We have already checked preg->re_fastmap != NULL. */
480 if (BE (ret == REG_NOERROR, 1))
481 /* Compute the fastmap now, since regexec cannot modify the pattern
482 buffer. This function never fails in this implementation. */
483 (void) re_compile_fastmap (preg);
484 else
485 {
486 /* Some error occurred while compiling the expression. */
487 re_free (preg->re_fastmap);
488 preg->re_fastmap = NULL;
489 }
490
491 return (int) ret;
492 }
493 #ifdef _LIBC
weak_alias(__regcomp,regcomp)494 weak_alias (__regcomp, regcomp)
495 #endif
496
497 /* Returns a message corresponding to an error code, ERRCODE, returned
498 from either regcomp or regexec. We don't use PREG here. */
499
500 size_t
501 regerror (int errcode, const regex_t *__restrict preg,
502 char *__restrict errbuf, size_t errbuf_size)
503 {
504 const char *msg;
505 size_t msg_size;
506
507 if (BE (errcode < 0
508 || errcode >= (int) (sizeof (__re_error_msgid_idx)
509 / sizeof (__re_error_msgid_idx[0])), 0))
510 /* Only error codes returned by the rest of the code should be passed
511 to this routine. If we are given anything else, or if other regex
512 code generates an invalid error code, then the program has a bug.
513 Dump core so we can fix it. */
514 abort ();
515
516 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
517
518 msg_size = strlen (msg) + 1; /* Includes the null. */
519
520 if (BE (errbuf_size != 0, 1))
521 {
522 if (BE (msg_size > errbuf_size, 0))
523 {
524 #if defined HAVE_MEMPCPY || defined _LIBC
525 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
526 #else
527 memcpy (errbuf, msg, errbuf_size - 1);
528 errbuf[errbuf_size - 1] = 0;
529 #endif
530 }
531 else
532 memcpy (errbuf, msg, msg_size);
533 }
534
535 return msg_size;
536 }
537 #ifdef _LIBC
538 weak_alias (__regerror, regerror)
539 #endif
540
541
542 #ifdef RE_ENABLE_I18N
543 /* This static array is used for the map to single-byte characters when
544 UTF-8 is used. Otherwise we would allocate memory just to initialize
545 it the same all the time. UTF-8 is the preferred encoding so this is
546 a worthwhile optimization. */
547 static const bitset utf8_sb_map =
548 {
549 /* Set the first 128 bits. */
550 # if 2 < BITSET_WORDS
551 BITSET_WORD_MAX,
552 # endif
553 # if 4 < BITSET_WORDS
554 BITSET_WORD_MAX,
555 # endif
556 # if 6 < BITSET_WORDS
557 BITSET_WORD_MAX,
558 # endif
559 # if 8 < BITSET_WORDS
560 # error "Invalid BITSET_WORDS"
561 # endif
562 (BITSET_WORD_MAX
563 >> (SBC_MAX % BITSET_WORD_BITS == 0
564 ? 0
565 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
566 };
567 #endif
568
569
570 static void
free_dfa_content(re_dfa_t * dfa)571 free_dfa_content (re_dfa_t *dfa)
572 {
573 Idx i, j;
574
575 if (dfa->nodes)
576 for (i = 0; i < dfa->nodes_len; ++i)
577 free_token (dfa->nodes + i);
578 re_free (dfa->nexts);
579 for (i = 0; i < dfa->nodes_len; ++i)
580 {
581 if (dfa->eclosures != NULL)
582 re_node_set_free (dfa->eclosures + i);
583 if (dfa->inveclosures != NULL)
584 re_node_set_free (dfa->inveclosures + i);
585 if (dfa->edests != NULL)
586 re_node_set_free (dfa->edests + i);
587 }
588 re_free (dfa->edests);
589 re_free (dfa->eclosures);
590 re_free (dfa->inveclosures);
591 re_free (dfa->nodes);
592
593 if (dfa->state_table)
594 for (i = 0; i <= dfa->state_hash_mask; ++i)
595 {
596 struct re_state_table_entry *entry = dfa->state_table + i;
597 for (j = 0; j < entry->num; ++j)
598 {
599 re_dfastate_t *state = entry->array[j];
600 free_state (state);
601 }
602 re_free (entry->array);
603 }
604 re_free (dfa->state_table);
605 #ifdef RE_ENABLE_I18N
606 if (dfa->sb_char != utf8_sb_map)
607 re_free (dfa->sb_char);
608 #endif
609 re_free (dfa->subexp_map);
610 #ifdef DEBUG
611 re_free (dfa->re_str);
612 #endif
613
614 re_free (dfa);
615 }
616
617
618 /* Free dynamically allocated space used by PREG. */
619
620 void
regfree(regex_t * preg)621 regfree (regex_t *preg)
622 {
623 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
624 if (BE (dfa != NULL, 1))
625 free_dfa_content (dfa);
626 preg->re_buffer = NULL;
627 preg->re_allocated = 0;
628
629 re_free (preg->re_fastmap);
630 preg->re_fastmap = NULL;
631
632 re_free (preg->re_translate);
633 preg->re_translate = NULL;
634 }
635 #ifdef _LIBC
636 weak_alias (__regfree, regfree)
637 #endif
638
639 /* Entry points compatible with 4.2 BSD regex library. We don't define
640 them unless specifically requested. */
641
642 #if defined _REGEX_RE_COMP || defined _LIBC
643
644 /* BSD has one and only one pattern buffer. */
645 static struct re_pattern_buffer re_comp_buf;
646
647 char *
648 # ifdef _LIBC
649 /* Make these definitions weak in libc, so POSIX programs can redefine
650 these names if they don't use our functions, and still use
651 regcomp/regexec above without link errors. */
652 weak_function
653 # endif
re_comp(const char * s)654 re_comp (const char *s)
655 {
656 reg_errcode_t ret;
657 char *fastmap;
658
659 if (!s)
660 {
661 if (!re_comp_buf.re_buffer)
662 return gettext ("No previous regular expression");
663 return 0;
664 }
665
666 if (re_comp_buf.re_buffer)
667 {
668 fastmap = re_comp_buf.re_fastmap;
669 re_comp_buf.re_fastmap = NULL;
670 __regfree (&re_comp_buf);
671 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
672 re_comp_buf.re_fastmap = fastmap;
673 }
674
675 if (re_comp_buf.re_fastmap == NULL)
676 {
677 re_comp_buf.re_fastmap = (char *) malloc (SBC_MAX);
678 if (re_comp_buf.re_fastmap == NULL)
679 return (char *) gettext (__re_error_msgid
680 + __re_error_msgid_idx[(int) REG_ESPACE]);
681 }
682
683 /* Since `re_exec' always passes NULL for the `regs' argument, we
684 don't need to initialize the pattern buffer fields which affect it. */
685
686 /* Match anchors at newlines. */
687 re_comp_buf.re_newline_anchor = 1;
688
689 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
690
691 if (!ret)
692 return NULL;
693
694 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
695 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
696 }
697
698 #ifdef _LIBC
libc_freeres_fn(free_mem)699 libc_freeres_fn (free_mem)
700 {
701 __regfree (&re_comp_buf);
702 }
703 #endif
704
705 #endif /* _REGEX_RE_COMP */
706
707 /* Internal entry point.
708 Compile the regular expression PATTERN, whose length is LENGTH.
709 SYNTAX indicate regular expression's syntax. */
710
711 static reg_errcode_t
re_compile_internal(regex_t * preg,const char * pattern,Idx length,reg_syntax_t syntax)712 re_compile_internal (regex_t *preg, const char *pattern, Idx length,
713 reg_syntax_t syntax)
714 {
715 reg_errcode_t err = REG_NOERROR;
716 re_dfa_t *dfa;
717 re_string_t regexp;
718
719 /* Initialize the pattern buffer. */
720 preg->re_fastmap_accurate = 0;
721 preg->re_syntax = syntax;
722 preg->re_not_bol = preg->re_not_eol = 0;
723 preg->re_used = 0;
724 preg->re_nsub = 0;
725 preg->re_can_be_null = 0;
726 preg->re_regs_allocated = REG_UNALLOCATED;
727
728 /* Initialize the dfa. */
729 dfa = (re_dfa_t *) preg->re_buffer;
730 if (BE (preg->re_allocated < sizeof (re_dfa_t), 0))
731 {
732 /* If zero allocated, but buffer is non-null, try to realloc
733 enough space. This loses if buffer's address is bogus, but
734 that is the user's responsibility. If buffer is null this
735 is a simple allocation. */
736 dfa = re_realloc (preg->re_buffer, re_dfa_t, 1);
737 if (dfa == NULL)
738 return REG_ESPACE;
739 preg->re_allocated = sizeof (re_dfa_t);
740 preg->re_buffer = (unsigned char *) dfa;
741 }
742 preg->re_used = sizeof (re_dfa_t);
743
744 __libc_lock_init (dfa->lock);
745
746 err = init_dfa (dfa, length);
747 if (BE (err != REG_NOERROR, 0))
748 {
749 free_dfa_content (dfa);
750 preg->re_buffer = NULL;
751 preg->re_allocated = 0;
752 return err;
753 }
754 #ifdef DEBUG
755 dfa->re_str = re_malloc (char, length + 1);
756 strncpy (dfa->re_str, pattern, length + 1);
757 #endif
758
759 err = re_string_construct (®exp, pattern, length, preg->re_translate,
760 syntax & REG_IGNORE_CASE, dfa);
761 if (BE (err != REG_NOERROR, 0))
762 {
763 re_compile_internal_free_return:
764 free_workarea_compile (preg);
765 re_string_destruct (®exp);
766 free_dfa_content (dfa);
767 preg->re_buffer = NULL;
768 preg->re_allocated = 0;
769 return err;
770 }
771
772 /* Parse the regular expression, and build a structure tree. */
773 preg->re_nsub = 0;
774 dfa->str_tree = parse (®exp, preg, syntax, &err);
775 if (BE (dfa->str_tree == NULL, 0))
776 goto re_compile_internal_free_return;
777
778 /* Analyze the tree and create the nfa. */
779 err = analyze (preg);
780 if (BE (err != REG_NOERROR, 0))
781 goto re_compile_internal_free_return;
782
783 #ifdef RE_ENABLE_I18N
784 /* If possible, do searching in single byte encoding to speed things up. */
785 if (dfa->is_utf8 && !(syntax & REG_IGNORE_CASE) && preg->re_translate == NULL)
786 optimize_utf8 (dfa);
787 #endif
788
789 /* Then create the initial state of the dfa. */
790 err = create_initial_state (dfa);
791
792 /* Release work areas. */
793 free_workarea_compile (preg);
794 re_string_destruct (®exp);
795
796 if (BE (err != REG_NOERROR, 0))
797 {
798 free_dfa_content (dfa);
799 preg->re_buffer = NULL;
800 preg->re_allocated = 0;
801 }
802
803 return err;
804 }
805
806 /* Initialize DFA. We use the length of the regular expression PAT_LEN
807 as the initial length of some arrays. */
808
809 static reg_errcode_t
init_dfa(re_dfa_t * dfa,Idx pat_len)810 init_dfa (re_dfa_t *dfa, Idx pat_len)
811 {
812 __re_size_t table_size;
813 #ifndef _LIBC
814 char *codeset_name;
815 #endif
816
817 memset (dfa, '\0', sizeof (re_dfa_t));
818
819 /* Force allocation of str_tree_storage the first time. */
820 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
821
822 dfa->nodes_alloc = pat_len + 1;
823 dfa->nodes = re_xmalloc (re_token_t, dfa->nodes_alloc);
824
825 /* table_size = 2 ^ ceil(log pat_len) */
826 for (table_size = 1; table_size <= pat_len; table_size <<= 1)
827 if (0 < (Idx) -1 && table_size == 0)
828 return REG_ESPACE;
829
830 dfa->state_table = re_calloc (struct re_state_table_entry, table_size);
831 dfa->state_hash_mask = table_size - 1;
832
833 dfa->mb_cur_max = MB_CUR_MAX;
834 #ifdef _LIBC
835 if (dfa->mb_cur_max == 6
836 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
837 dfa->is_utf8 = 1;
838 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
839 != 0);
840 #else
841 # ifdef HAVE_LANGINFO_CODESET
842 codeset_name = nl_langinfo (CODESET);
843 # else
844 codeset_name = getenv ("LC_ALL");
845 if (codeset_name == NULL || codeset_name[0] == '\0')
846 codeset_name = getenv ("LC_CTYPE");
847 if (codeset_name == NULL || codeset_name[0] == '\0')
848 codeset_name = getenv ("LANG");
849 if (codeset_name == NULL)
850 codeset_name = "";
851 else if (strchr (codeset_name, '.') != NULL)
852 codeset_name = strchr (codeset_name, '.') + 1;
853 # endif
854
855 if (strcasecmp (codeset_name, "UTF-8") == 0
856 || strcasecmp (codeset_name, "UTF8") == 0)
857 dfa->is_utf8 = 1;
858
859 /* We check exhaustively in the loop below if this charset is a
860 superset of ASCII. */
861 dfa->map_notascii = 0;
862 #endif
863
864 #ifdef RE_ENABLE_I18N
865 if (dfa->mb_cur_max > 1)
866 {
867 if (dfa->is_utf8)
868 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
869 else
870 {
871 int i, j, ch;
872
873 dfa->sb_char = re_calloc (bitset_word, BITSET_WORDS);
874 if (BE (dfa->sb_char == NULL, 0))
875 return REG_ESPACE;
876
877 /* Set the bits corresponding to single byte chars. */
878 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
879 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
880 {
881 wint_t wch = __btowc (ch);
882 if (wch != WEOF)
883 dfa->sb_char[i] |= (bitset_word) 1 << j;
884 # ifndef _LIBC
885 if (isascii (ch) && wch != ch)
886 dfa->map_notascii = 1;
887 # endif
888 }
889 }
890 }
891 #endif
892
893 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
894 return REG_ESPACE;
895 return REG_NOERROR;
896 }
897
898 /* Initialize WORD_CHAR table, which indicate which character is
899 "word". In this case "word" means that it is the word construction
900 character used by some operators like "\<", "\>", etc. */
901
902 static void
init_word_char(re_dfa_t * dfa)903 init_word_char (re_dfa_t *dfa)
904 {
905 int i, j, ch;
906 dfa->word_ops_used = 1;
907 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
908 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
909 if (isalnum (ch) || ch == '_')
910 dfa->word_char[i] |= (bitset_word) 1 << j;
911 }
912
913 /* Free the work area which are only used while compiling. */
914
915 static void
free_workarea_compile(regex_t * preg)916 free_workarea_compile (regex_t *preg)
917 {
918 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
919 bin_tree_storage_t *storage, *next;
920 for (storage = dfa->str_tree_storage; storage; storage = next)
921 {
922 next = storage->next;
923 re_free (storage);
924 }
925 dfa->str_tree_storage = NULL;
926 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
927 dfa->str_tree = NULL;
928 re_free (dfa->org_indices);
929 dfa->org_indices = NULL;
930 }
931
932 /* Create initial states for all contexts. */
933
934 static reg_errcode_t
create_initial_state(re_dfa_t * dfa)935 create_initial_state (re_dfa_t *dfa)
936 {
937 Idx first, i;
938 reg_errcode_t err;
939 re_node_set init_nodes;
940
941 /* Initial states have the epsilon closure of the node which is
942 the first node of the regular expression. */
943 first = dfa->str_tree->first->node_idx;
944 dfa->init_node = first;
945 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
946 if (BE (err != REG_NOERROR, 0))
947 return err;
948
949 /* The back-references which are in initial states can epsilon transit,
950 since in this case all of the subexpressions can be null.
951 Then we add epsilon closures of the nodes which are the next nodes of
952 the back-references. */
953 if (dfa->nbackref > 0)
954 for (i = 0; i < init_nodes.nelem; ++i)
955 {
956 Idx node_idx = init_nodes.elems[i];
957 re_token_type_t type = dfa->nodes[node_idx].type;
958
959 Idx clexp_idx;
960 if (type != OP_BACK_REF)
961 continue;
962 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
963 {
964 re_token_t *clexp_node;
965 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
966 if (clexp_node->type == OP_CLOSE_SUBEXP
967 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
968 break;
969 }
970 if (clexp_idx == init_nodes.nelem)
971 continue;
972
973 if (type == OP_BACK_REF)
974 {
975 Idx dest_idx = dfa->edests[node_idx].elems[0];
976 if (!re_node_set_contains (&init_nodes, dest_idx))
977 {
978 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
979 i = 0;
980 }
981 }
982 }
983
984 /* It must be the first time to invoke acquire_state. */
985 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
986 /* We don't check ERR here, since the initial state must not be NULL. */
987 if (BE (dfa->init_state == NULL, 0))
988 return err;
989 if (dfa->init_state->has_constraint)
990 {
991 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
992 CONTEXT_WORD);
993 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
994 CONTEXT_NEWLINE);
995 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
996 &init_nodes,
997 CONTEXT_NEWLINE
998 | CONTEXT_BEGBUF);
999 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1000 || dfa->init_state_begbuf == NULL, 0))
1001 return err;
1002 }
1003 else
1004 dfa->init_state_word = dfa->init_state_nl
1005 = dfa->init_state_begbuf = dfa->init_state;
1006
1007 re_node_set_free (&init_nodes);
1008 return REG_NOERROR;
1009 }
1010
1011 #ifdef RE_ENABLE_I18N
1012 /* If it is possible to do searching in single byte encoding instead of UTF-8
1013 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1014 DFA nodes where needed. */
1015
1016 static void
optimize_utf8(re_dfa_t * dfa)1017 optimize_utf8 (re_dfa_t *dfa)
1018 {
1019 Idx node;
1020 int i;
1021 bool mb_chars = false;
1022 bool has_period = false;
1023
1024 for (node = 0; node < dfa->nodes_len; ++node)
1025 switch (dfa->nodes[node].type)
1026 {
1027 case CHARACTER:
1028 if (dfa->nodes[node].opr.c >= 0x80)
1029 mb_chars = true;
1030 break;
1031 case ANCHOR:
1032 switch (dfa->nodes[node].opr.idx)
1033 {
1034 case LINE_FIRST:
1035 case LINE_LAST:
1036 case BUF_FIRST:
1037 case BUF_LAST:
1038 break;
1039 default:
1040 /* Word anchors etc. cannot be handled. */
1041 return;
1042 }
1043 break;
1044 case OP_PERIOD:
1045 has_period = true;
1046 break;
1047 case OP_BACK_REF:
1048 case OP_ALT:
1049 case END_OF_RE:
1050 case OP_DUP_ASTERISK:
1051 case OP_OPEN_SUBEXP:
1052 case OP_CLOSE_SUBEXP:
1053 break;
1054 case COMPLEX_BRACKET:
1055 return;
1056 case SIMPLE_BRACKET:
1057 /* Just double check. */
1058 {
1059 int rshift =
1060 (SBC_MAX / 2 % BITSET_WORD_BITS == 0
1061 ? 0
1062 : BITSET_WORD_BITS - SBC_MAX / 2 % BITSET_WORD_BITS);
1063 for (i = SBC_MAX / 2 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1064 {
1065 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1066 return;
1067 rshift = 0;
1068 }
1069 }
1070 break;
1071 default:
1072 abort ();
1073 }
1074
1075 if (mb_chars || has_period)
1076 for (node = 0; node < dfa->nodes_len; ++node)
1077 {
1078 if (dfa->nodes[node].type == CHARACTER
1079 && dfa->nodes[node].opr.c >= 0x80)
1080 dfa->nodes[node].mb_partial = 0;
1081 else if (dfa->nodes[node].type == OP_PERIOD)
1082 dfa->nodes[node].type = OP_UTF8_PERIOD;
1083 }
1084
1085 /* The search can be in single byte locale. */
1086 dfa->mb_cur_max = 1;
1087 dfa->is_utf8 = 0;
1088 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1089 }
1090 #endif
1091
1092 /* Analyze the structure tree, and calculate "first", "next", "edest",
1093 "eclosure", and "inveclosure". */
1094
1095 static reg_errcode_t
analyze(regex_t * preg)1096 analyze (regex_t *preg)
1097 {
1098 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
1099 reg_errcode_t ret;
1100
1101 /* Allocate arrays. */
1102 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1103 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1104 dfa->edests = re_xmalloc (re_node_set, dfa->nodes_alloc);
1105 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1106 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1107 || dfa->eclosures == NULL, 0))
1108 return REG_ESPACE;
1109
1110 dfa->subexp_map = re_xmalloc (Idx, preg->re_nsub);
1111 if (dfa->subexp_map != NULL)
1112 {
1113 Idx i;
1114 for (i = 0; i < preg->re_nsub; i++)
1115 dfa->subexp_map[i] = i;
1116 preorder (dfa->str_tree, optimize_subexps, dfa);
1117 for (i = 0; i < preg->re_nsub; i++)
1118 if (dfa->subexp_map[i] != i)
1119 break;
1120 if (i == preg->re_nsub)
1121 {
1122 free (dfa->subexp_map);
1123 dfa->subexp_map = NULL;
1124 }
1125 }
1126
1127 ret = postorder (dfa->str_tree, lower_subexps, preg);
1128 if (BE (ret != REG_NOERROR, 0))
1129 return ret;
1130 ret = postorder (dfa->str_tree, calc_first, dfa);
1131 if (BE (ret != REG_NOERROR, 0))
1132 return ret;
1133 preorder (dfa->str_tree, calc_next, dfa);
1134 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1135 if (BE (ret != REG_NOERROR, 0))
1136 return ret;
1137 ret = calc_eclosure (dfa);
1138 if (BE (ret != REG_NOERROR, 0))
1139 return ret;
1140
1141 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1142 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1143 if ((!preg->re_no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1144 || dfa->nbackref)
1145 {
1146 dfa->inveclosures = re_xmalloc (re_node_set, dfa->nodes_len);
1147 if (BE (dfa->inveclosures == NULL, 0))
1148 return REG_ESPACE;
1149 ret = calc_inveclosure (dfa);
1150 }
1151
1152 return ret;
1153 }
1154
1155 /* Our parse trees are very unbalanced, so we cannot use a stack to
1156 implement parse tree visits. Instead, we use parent pointers and
1157 some hairy code in these two functions. */
1158 static reg_errcode_t
postorder(bin_tree_t * root,reg_errcode_t (fn (void *,bin_tree_t *)),void * extra)1159 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1160 void *extra)
1161 {
1162 bin_tree_t *node, *prev;
1163
1164 for (node = root; ; )
1165 {
1166 /* Descend down the tree, preferably to the left (or to the right
1167 if that's the only child). */
1168 while (node->left || node->right)
1169 if (node->left)
1170 node = node->left;
1171 else
1172 node = node->right;
1173
1174 do
1175 {
1176 reg_errcode_t err = fn (extra, node);
1177 if (BE (err != REG_NOERROR, 0))
1178 return err;
1179 if (node->parent == NULL)
1180 return REG_NOERROR;
1181 prev = node;
1182 node = node->parent;
1183 }
1184 /* Go up while we have a node that is reached from the right. */
1185 while (node->right == prev || node->right == NULL);
1186 node = node->right;
1187 }
1188 }
1189
1190 static reg_errcode_t
preorder(bin_tree_t * root,reg_errcode_t (fn (void *,bin_tree_t *)),void * extra)1191 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1192 void *extra)
1193 {
1194 bin_tree_t *node;
1195
1196 for (node = root; ; )
1197 {
1198 reg_errcode_t err = fn (extra, node);
1199 if (BE (err != REG_NOERROR, 0))
1200 return err;
1201
1202 /* Go to the left node, or up and to the right. */
1203 if (node->left)
1204 node = node->left;
1205 else
1206 {
1207 bin_tree_t *prev = NULL;
1208 while (node->right == prev || node->right == NULL)
1209 {
1210 prev = node;
1211 node = node->parent;
1212 if (!node)
1213 return REG_NOERROR;
1214 }
1215 node = node->right;
1216 }
1217 }
1218 }
1219
1220 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1221 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1222 backreferences as well. Requires a preorder visit. */
1223 static reg_errcode_t
optimize_subexps(void * extra,bin_tree_t * node)1224 optimize_subexps (void *extra, bin_tree_t *node)
1225 {
1226 re_dfa_t *dfa = (re_dfa_t *) extra;
1227
1228 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1229 {
1230 int idx = node->token.opr.idx;
1231 node->token.opr.idx = dfa->subexp_map[idx];
1232 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1233 }
1234
1235 else if (node->token.type == SUBEXP
1236 && node->left && node->left->token.type == SUBEXP)
1237 {
1238 Idx other_idx = node->left->token.opr.idx;
1239
1240 node->left = node->left->left;
1241 if (node->left)
1242 node->left->parent = node;
1243
1244 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1245 if (other_idx < BITSET_WORD_BITS)
1246 dfa->used_bkref_map &= ~ ((bitset_word) 1 << other_idx);
1247 }
1248
1249 return REG_NOERROR;
1250 }
1251
1252 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1253 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1254 static reg_errcode_t
lower_subexps(void * extra,bin_tree_t * node)1255 lower_subexps (void *extra, bin_tree_t *node)
1256 {
1257 regex_t *preg = (regex_t *) extra;
1258 reg_errcode_t err = REG_NOERROR;
1259
1260 if (node->left && node->left->token.type == SUBEXP)
1261 {
1262 node->left = lower_subexp (&err, preg, node->left);
1263 if (node->left)
1264 node->left->parent = node;
1265 }
1266 if (node->right && node->right->token.type == SUBEXP)
1267 {
1268 node->right = lower_subexp (&err, preg, node->right);
1269 if (node->right)
1270 node->right->parent = node;
1271 }
1272
1273 return err;
1274 }
1275
1276 static bin_tree_t *
lower_subexp(reg_errcode_t * err,regex_t * preg,bin_tree_t * node)1277 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1278 {
1279 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
1280 bin_tree_t *body = node->left;
1281 bin_tree_t *op, *cls, *tree1, *tree;
1282
1283 if (preg->re_no_sub
1284 /* We do not optimize empty subexpressions, because otherwise we may
1285 have bad CONCAT nodes with NULL children. This is obviously not
1286 very common, so we do not lose much. An example that triggers
1287 this case is the sed "script" /\(\)/x. */
1288 && node->left != NULL
1289 && ! (node->token.opr.idx < BITSET_WORD_BITS
1290 && dfa->used_bkref_map & ((bitset_word) 1 << node->token.opr.idx)))
1291 return node->left;
1292
1293 /* Convert the SUBEXP node to the concatenation of an
1294 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1295 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1296 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1297 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1298 tree = create_tree (dfa, op, tree1, CONCAT);
1299 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1300 {
1301 *err = REG_ESPACE;
1302 return NULL;
1303 }
1304
1305 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1306 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1307 return tree;
1308 }
1309
1310 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1311 nodes. Requires a postorder visit. */
1312 static reg_errcode_t
calc_first(void * extra,bin_tree_t * node)1313 calc_first (void *extra, bin_tree_t *node)
1314 {
1315 re_dfa_t *dfa = (re_dfa_t *) extra;
1316 if (node->token.type == CONCAT)
1317 {
1318 node->first = node->left->first;
1319 node->node_idx = node->left->node_idx;
1320 }
1321 else
1322 {
1323 node->first = node;
1324 node->node_idx = re_dfa_add_node (dfa, node->token);
1325 if (BE (node->node_idx == REG_MISSING, 0))
1326 return REG_ESPACE;
1327 }
1328 return REG_NOERROR;
1329 }
1330
1331 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1332 static reg_errcode_t
calc_next(void * extra,bin_tree_t * node)1333 calc_next (void *extra, bin_tree_t *node)
1334 {
1335 switch (node->token.type)
1336 {
1337 case OP_DUP_ASTERISK:
1338 node->left->next = node;
1339 break;
1340 case CONCAT:
1341 node->left->next = node->right->first;
1342 node->right->next = node->next;
1343 break;
1344 default:
1345 if (node->left)
1346 node->left->next = node->next;
1347 if (node->right)
1348 node->right->next = node->next;
1349 break;
1350 }
1351 return REG_NOERROR;
1352 }
1353
1354 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1355 static reg_errcode_t
link_nfa_nodes(void * extra,bin_tree_t * node)1356 link_nfa_nodes (void *extra, bin_tree_t *node)
1357 {
1358 re_dfa_t *dfa = (re_dfa_t *) extra;
1359 Idx idx = node->node_idx;
1360 reg_errcode_t err = REG_NOERROR;
1361
1362 switch (node->token.type)
1363 {
1364 case CONCAT:
1365 break;
1366
1367 case END_OF_RE:
1368 assert (node->next == NULL);
1369 break;
1370
1371 case OP_DUP_ASTERISK:
1372 case OP_ALT:
1373 {
1374 Idx left, right;
1375 dfa->has_plural_match = 1;
1376 if (node->left != NULL)
1377 left = node->left->first->node_idx;
1378 else
1379 left = node->next->node_idx;
1380 if (node->right != NULL)
1381 right = node->right->first->node_idx;
1382 else
1383 right = node->next->node_idx;
1384 assert (REG_VALID_INDEX (left));
1385 assert (REG_VALID_INDEX (right));
1386 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1387 }
1388 break;
1389
1390 case ANCHOR:
1391 case OP_OPEN_SUBEXP:
1392 case OP_CLOSE_SUBEXP:
1393 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1394 break;
1395
1396 case OP_BACK_REF:
1397 dfa->nexts[idx] = node->next->node_idx;
1398 if (node->token.type == OP_BACK_REF)
1399 re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1400 break;
1401
1402 default:
1403 assert (!IS_EPSILON_NODE (node->token.type));
1404 dfa->nexts[idx] = node->next->node_idx;
1405 break;
1406 }
1407
1408 return err;
1409 }
1410
1411 /* Duplicate the epsilon closure of the node ROOT_NODE.
1412 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1413 to their own constraint. */
1414
1415 static reg_errcode_t
duplicate_node_closure(re_dfa_t * dfa,Idx top_org_node,Idx top_clone_node,Idx root_node,unsigned int init_constraint)1416 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node,
1417 Idx top_clone_node, Idx root_node,
1418 unsigned int init_constraint)
1419 {
1420 Idx org_node, clone_node;
1421 bool ok;
1422 unsigned int constraint = init_constraint;
1423 for (org_node = top_org_node, clone_node = top_clone_node;;)
1424 {
1425 Idx org_dest, clone_dest;
1426 if (dfa->nodes[org_node].type == OP_BACK_REF)
1427 {
1428 /* If the back reference epsilon-transit, its destination must
1429 also have the constraint. Then duplicate the epsilon closure
1430 of the destination of the back reference, and store it in
1431 edests of the back reference. */
1432 org_dest = dfa->nexts[org_node];
1433 re_node_set_empty (dfa->edests + clone_node);
1434 clone_dest = duplicate_node (dfa, org_dest, constraint);
1435 if (BE (clone_dest == REG_MISSING, 0))
1436 return REG_ESPACE;
1437 dfa->nexts[clone_node] = dfa->nexts[org_node];
1438 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1439 if (BE (! ok, 0))
1440 return REG_ESPACE;
1441 }
1442 else if (dfa->edests[org_node].nelem == 0)
1443 {
1444 /* In case of the node can't epsilon-transit, don't duplicate the
1445 destination and store the original destination as the
1446 destination of the node. */
1447 dfa->nexts[clone_node] = dfa->nexts[org_node];
1448 break;
1449 }
1450 else if (dfa->edests[org_node].nelem == 1)
1451 {
1452 /* In case of the node can epsilon-transit, and it has only one
1453 destination. */
1454 org_dest = dfa->edests[org_node].elems[0];
1455 re_node_set_empty (dfa->edests + clone_node);
1456 if (dfa->nodes[org_node].type == ANCHOR)
1457 {
1458 /* In case of the node has another constraint, append it. */
1459 if (org_node == root_node && clone_node != org_node)
1460 {
1461 /* ...but if the node is root_node itself, it means the
1462 epsilon closure have a loop, then tie it to the
1463 destination of the root_node. */
1464 ok = re_node_set_insert (dfa->edests + clone_node,
1465 org_dest);
1466 if (BE (! ok, 0))
1467 return REG_ESPACE;
1468 break;
1469 }
1470 constraint |= dfa->nodes[org_node].opr.ctx_type;
1471 }
1472 clone_dest = duplicate_node (dfa, org_dest, constraint);
1473 if (BE (clone_dest == REG_MISSING, 0))
1474 return REG_ESPACE;
1475 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1476 if (BE (! ok, 0))
1477 return REG_ESPACE;
1478 }
1479 else /* dfa->edests[org_node].nelem == 2 */
1480 {
1481 /* In case of the node can epsilon-transit, and it has two
1482 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1483 org_dest = dfa->edests[org_node].elems[0];
1484 re_node_set_empty (dfa->edests + clone_node);
1485 /* Search for a duplicated node which satisfies the constraint. */
1486 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1487 if (clone_dest == REG_MISSING)
1488 {
1489 /* There are no such a duplicated node, create a new one. */
1490 reg_errcode_t err;
1491 clone_dest = duplicate_node (dfa, org_dest, constraint);
1492 if (BE (clone_dest == REG_MISSING, 0))
1493 return REG_ESPACE;
1494 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1495 if (BE (! ok, 0))
1496 return REG_ESPACE;
1497 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1498 root_node, constraint);
1499 if (BE (err != REG_NOERROR, 0))
1500 return err;
1501 }
1502 else
1503 {
1504 /* There are a duplicated node which satisfy the constraint,
1505 use it to avoid infinite loop. */
1506 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1507 if (BE (! ok, 0))
1508 return REG_ESPACE;
1509 }
1510
1511 org_dest = dfa->edests[org_node].elems[1];
1512 clone_dest = duplicate_node (dfa, org_dest, constraint);
1513 if (BE (clone_dest == REG_MISSING, 0))
1514 return REG_ESPACE;
1515 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1516 if (BE (! ok, 0))
1517 return REG_ESPACE;
1518 }
1519 org_node = org_dest;
1520 clone_node = clone_dest;
1521 }
1522 return REG_NOERROR;
1523 }
1524
1525 /* Search for a node which is duplicated from the node ORG_NODE, and
1526 satisfies the constraint CONSTRAINT. */
1527
1528 static Idx
search_duplicated_node(const re_dfa_t * dfa,Idx org_node,unsigned int constraint)1529 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1530 unsigned int constraint)
1531 {
1532 Idx idx;
1533 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1534 {
1535 if (org_node == dfa->org_indices[idx]
1536 && constraint == dfa->nodes[idx].constraint)
1537 return idx; /* Found. */
1538 }
1539 return REG_MISSING; /* Not found. */
1540 }
1541
1542 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1543 Return the index of the new node, or REG_MISSING if insufficient storage is
1544 available. */
1545
1546 static Idx
duplicate_node(re_dfa_t * dfa,Idx org_idx,unsigned int constraint)1547 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1548 {
1549 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1550 if (BE (dup_idx != REG_MISSING, 1))
1551 {
1552 dfa->nodes[dup_idx].constraint = constraint;
1553 if (dfa->nodes[org_idx].type == ANCHOR)
1554 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1555 dfa->nodes[dup_idx].duplicated = 1;
1556
1557 /* Store the index of the original node. */
1558 dfa->org_indices[dup_idx] = org_idx;
1559 }
1560 return dup_idx;
1561 }
1562
1563 static reg_errcode_t
calc_inveclosure(re_dfa_t * dfa)1564 calc_inveclosure (re_dfa_t *dfa)
1565 {
1566 Idx src, idx;
1567 bool ok;
1568 for (idx = 0; idx < dfa->nodes_len; ++idx)
1569 re_node_set_init_empty (dfa->inveclosures + idx);
1570
1571 for (src = 0; src < dfa->nodes_len; ++src)
1572 {
1573 Idx *elems = dfa->eclosures[src].elems;
1574 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1575 {
1576 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1577 if (BE (! ok, 0))
1578 return REG_ESPACE;
1579 }
1580 }
1581
1582 return REG_NOERROR;
1583 }
1584
1585 /* Calculate "eclosure" for all the node in DFA. */
1586
1587 static reg_errcode_t
calc_eclosure(re_dfa_t * dfa)1588 calc_eclosure (re_dfa_t *dfa)
1589 {
1590 Idx node_idx;
1591 bool incomplete;
1592 #ifdef DEBUG
1593 assert (dfa->nodes_len > 0);
1594 #endif
1595 incomplete = false;
1596 /* For each nodes, calculate epsilon closure. */
1597 for (node_idx = 0; ; ++node_idx)
1598 {
1599 reg_errcode_t err;
1600 re_node_set eclosure_elem;
1601 if (node_idx == dfa->nodes_len)
1602 {
1603 if (!incomplete)
1604 break;
1605 incomplete = false;
1606 node_idx = 0;
1607 }
1608
1609 #ifdef DEBUG
1610 assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1611 #endif
1612
1613 /* If we have already calculated, skip it. */
1614 if (dfa->eclosures[node_idx].nelem != 0)
1615 continue;
1616 /* Calculate epsilon closure of `node_idx'. */
1617 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1618 if (BE (err != REG_NOERROR, 0))
1619 return err;
1620
1621 if (dfa->eclosures[node_idx].nelem == 0)
1622 {
1623 incomplete = true;
1624 re_node_set_free (&eclosure_elem);
1625 }
1626 }
1627 return REG_NOERROR;
1628 }
1629
1630 /* Calculate epsilon closure of NODE. */
1631
1632 static reg_errcode_t
calc_eclosure_iter(re_node_set * new_set,re_dfa_t * dfa,Idx node,bool root)1633 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1634 {
1635 reg_errcode_t err;
1636 unsigned int constraint;
1637 Idx i;
1638 bool incomplete;
1639 bool ok;
1640 re_node_set eclosure;
1641 incomplete = false;
1642 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1643 if (BE (err != REG_NOERROR, 0))
1644 return err;
1645
1646 /* This indicates that we are calculating this node now.
1647 We reference this value to avoid infinite loop. */
1648 dfa->eclosures[node].nelem = REG_MISSING;
1649
1650 constraint = ((dfa->nodes[node].type == ANCHOR)
1651 ? dfa->nodes[node].opr.ctx_type : 0);
1652 /* If the current node has constraints, duplicate all nodes.
1653 Since they must inherit the constraints. */
1654 if (constraint
1655 && dfa->edests[node].nelem
1656 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1657 {
1658 Idx org_node, cur_node;
1659 org_node = cur_node = node;
1660 err = duplicate_node_closure (dfa, node, node, node, constraint);
1661 if (BE (err != REG_NOERROR, 0))
1662 return err;
1663 }
1664
1665 /* Expand each epsilon destination nodes. */
1666 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1667 for (i = 0; i < dfa->edests[node].nelem; ++i)
1668 {
1669 re_node_set eclosure_elem;
1670 Idx edest = dfa->edests[node].elems[i];
1671 /* If calculating the epsilon closure of `edest' is in progress,
1672 return intermediate result. */
1673 if (dfa->eclosures[edest].nelem == REG_MISSING)
1674 {
1675 incomplete = true;
1676 continue;
1677 }
1678 /* If we haven't calculated the epsilon closure of `edest' yet,
1679 calculate now. Otherwise use calculated epsilon closure. */
1680 if (dfa->eclosures[edest].nelem == 0)
1681 {
1682 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1683 if (BE (err != REG_NOERROR, 0))
1684 return err;
1685 }
1686 else
1687 eclosure_elem = dfa->eclosures[edest];
1688 /* Merge the epsilon closure of `edest'. */
1689 re_node_set_merge (&eclosure, &eclosure_elem);
1690 /* If the epsilon closure of `edest' is incomplete,
1691 the epsilon closure of this node is also incomplete. */
1692 if (dfa->eclosures[edest].nelem == 0)
1693 {
1694 incomplete = true;
1695 re_node_set_free (&eclosure_elem);
1696 }
1697 }
1698
1699 /* Epsilon closures include itself. */
1700 ok = re_node_set_insert (&eclosure, node);
1701 if (BE (! ok, 0))
1702 return REG_ESPACE;
1703 if (incomplete && !root)
1704 dfa->eclosures[node].nelem = 0;
1705 else
1706 dfa->eclosures[node] = eclosure;
1707 *new_set = eclosure;
1708 return REG_NOERROR;
1709 }
1710
1711 /* Functions for token which are used in the parser. */
1712
1713 /* Fetch a token from INPUT.
1714 We must not use this function inside bracket expressions. */
1715
1716 static void
fetch_token(re_token_t * result,re_string_t * input,reg_syntax_t syntax)1717 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1718 {
1719 re_string_skip_bytes (input, peek_token (result, input, syntax));
1720 }
1721
1722 /* Peek a token from INPUT, and return the length of the token.
1723 We must not use this function inside bracket expressions. */
1724
1725 static int
peek_token(re_token_t * token,re_string_t * input,reg_syntax_t syntax)1726 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1727 {
1728 unsigned char c;
1729
1730 if (re_string_eoi (input))
1731 {
1732 token->type = END_OF_RE;
1733 return 0;
1734 }
1735
1736 c = re_string_peek_byte (input, 0);
1737 token->opr.c = c;
1738
1739 token->word_char = 0;
1740 #ifdef RE_ENABLE_I18N
1741 token->mb_partial = 0;
1742 if (input->mb_cur_max > 1 &&
1743 !re_string_first_byte (input, re_string_cur_idx (input)))
1744 {
1745 token->type = CHARACTER;
1746 token->mb_partial = 1;
1747 return 1;
1748 }
1749 #endif
1750 if (c == '\\')
1751 {
1752 unsigned char c2;
1753 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1754 {
1755 token->type = BACK_SLASH;
1756 return 1;
1757 }
1758
1759 c2 = re_string_peek_byte_case (input, 1);
1760 token->opr.c = c2;
1761 token->type = CHARACTER;
1762 #ifdef RE_ENABLE_I18N
1763 if (input->mb_cur_max > 1)
1764 {
1765 wint_t wc = re_string_wchar_at (input,
1766 re_string_cur_idx (input) + 1);
1767 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1768 }
1769 else
1770 #endif
1771 token->word_char = IS_WORD_CHAR (c2) != 0;
1772
1773 switch (c2)
1774 {
1775 case '|':
1776 if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_NO_BK_VBAR))
1777 token->type = OP_ALT;
1778 break;
1779 case '1': case '2': case '3': case '4': case '5':
1780 case '6': case '7': case '8': case '9':
1781 if (!(syntax & REG_NO_BK_REFS))
1782 {
1783 token->type = OP_BACK_REF;
1784 token->opr.idx = c2 - '1';
1785 }
1786 break;
1787 case '<':
1788 if (!(syntax & REG_NO_GNU_OPS))
1789 {
1790 token->type = ANCHOR;
1791 token->opr.ctx_type = WORD_FIRST;
1792 }
1793 break;
1794 case '>':
1795 if (!(syntax & REG_NO_GNU_OPS))
1796 {
1797 token->type = ANCHOR;
1798 token->opr.ctx_type = WORD_LAST;
1799 }
1800 break;
1801 case 'b':
1802 if (!(syntax & REG_NO_GNU_OPS))
1803 {
1804 token->type = ANCHOR;
1805 token->opr.ctx_type = WORD_DELIM;
1806 }
1807 break;
1808 case 'B':
1809 if (!(syntax & REG_NO_GNU_OPS))
1810 {
1811 token->type = ANCHOR;
1812 token->opr.ctx_type = NOT_WORD_DELIM;
1813 }
1814 break;
1815 case 'w':
1816 if (!(syntax & REG_NO_GNU_OPS))
1817 token->type = OP_WORD;
1818 break;
1819 case 'W':
1820 if (!(syntax & REG_NO_GNU_OPS))
1821 token->type = OP_NOTWORD;
1822 break;
1823 case 's':
1824 if (!(syntax & REG_NO_GNU_OPS))
1825 token->type = OP_SPACE;
1826 break;
1827 case 'S':
1828 if (!(syntax & REG_NO_GNU_OPS))
1829 token->type = OP_NOTSPACE;
1830 break;
1831 case '`':
1832 if (!(syntax & REG_NO_GNU_OPS))
1833 {
1834 token->type = ANCHOR;
1835 token->opr.ctx_type = BUF_FIRST;
1836 }
1837 break;
1838 case '\'':
1839 if (!(syntax & REG_NO_GNU_OPS))
1840 {
1841 token->type = ANCHOR;
1842 token->opr.ctx_type = BUF_LAST;
1843 }
1844 break;
1845 case '(':
1846 if (!(syntax & REG_NO_BK_PARENS))
1847 token->type = OP_OPEN_SUBEXP;
1848 break;
1849 case ')':
1850 if (!(syntax & REG_NO_BK_PARENS))
1851 token->type = OP_CLOSE_SUBEXP;
1852 break;
1853 case '+':
1854 if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_BK_PLUS_QM))
1855 token->type = OP_DUP_PLUS;
1856 break;
1857 case '?':
1858 if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_BK_PLUS_QM))
1859 token->type = OP_DUP_QUESTION;
1860 break;
1861 case '{':
1862 if ((syntax & REG_INTERVALS) && (!(syntax & REG_NO_BK_BRACES)))
1863 token->type = OP_OPEN_DUP_NUM;
1864 break;
1865 case '}':
1866 if ((syntax & REG_INTERVALS) && (!(syntax & REG_NO_BK_BRACES)))
1867 token->type = OP_CLOSE_DUP_NUM;
1868 break;
1869 default:
1870 break;
1871 }
1872 return 2;
1873 }
1874
1875 token->type = CHARACTER;
1876 #ifdef RE_ENABLE_I18N
1877 if (input->mb_cur_max > 1)
1878 {
1879 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1880 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1881 }
1882 else
1883 #endif
1884 token->word_char = IS_WORD_CHAR (token->opr.c);
1885
1886 switch (c)
1887 {
1888 case '\n':
1889 if (syntax & REG_NEWLINE_ALT)
1890 token->type = OP_ALT;
1891 break;
1892 case '|':
1893 if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_NO_BK_VBAR))
1894 token->type = OP_ALT;
1895 break;
1896 case '*':
1897 token->type = OP_DUP_ASTERISK;
1898 break;
1899 case '+':
1900 if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_BK_PLUS_QM))
1901 token->type = OP_DUP_PLUS;
1902 break;
1903 case '?':
1904 if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_BK_PLUS_QM))
1905 token->type = OP_DUP_QUESTION;
1906 break;
1907 case '{':
1908 if ((syntax & REG_INTERVALS) && (syntax & REG_NO_BK_BRACES))
1909 token->type = OP_OPEN_DUP_NUM;
1910 break;
1911 case '}':
1912 if ((syntax & REG_INTERVALS) && (syntax & REG_NO_BK_BRACES))
1913 token->type = OP_CLOSE_DUP_NUM;
1914 break;
1915 case '(':
1916 if (syntax & REG_NO_BK_PARENS)
1917 token->type = OP_OPEN_SUBEXP;
1918 break;
1919 case ')':
1920 if (syntax & REG_NO_BK_PARENS)
1921 token->type = OP_CLOSE_SUBEXP;
1922 break;
1923 case '[':
1924 token->type = OP_OPEN_BRACKET;
1925 break;
1926 case '.':
1927 token->type = OP_PERIOD;
1928 break;
1929 case '^':
1930 if (!(syntax & (REG_CONTEXT_INDEP_ANCHORS | REG_CARET_ANCHORS_HERE)) &&
1931 re_string_cur_idx (input) != 0)
1932 {
1933 char prev = re_string_peek_byte (input, -1);
1934 if (!(syntax & REG_NEWLINE_ALT) || prev != '\n')
1935 break;
1936 }
1937 token->type = ANCHOR;
1938 token->opr.ctx_type = LINE_FIRST;
1939 break;
1940 case '$':
1941 if (!(syntax & REG_CONTEXT_INDEP_ANCHORS) &&
1942 re_string_cur_idx (input) + 1 != re_string_length (input))
1943 {
1944 re_token_t next;
1945 re_string_skip_bytes (input, 1);
1946 peek_token (&next, input, syntax);
1947 re_string_skip_bytes (input, -1);
1948 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1949 break;
1950 }
1951 token->type = ANCHOR;
1952 token->opr.ctx_type = LINE_LAST;
1953 break;
1954 default:
1955 break;
1956 }
1957 return 1;
1958 }
1959
1960 /* Peek a token from INPUT, and return the length of the token.
1961 We must not use this function out of bracket expressions. */
1962
1963 static int
peek_token_bracket(re_token_t * token,re_string_t * input,reg_syntax_t syntax)1964 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1965 {
1966 unsigned char c;
1967 if (re_string_eoi (input))
1968 {
1969 token->type = END_OF_RE;
1970 return 0;
1971 }
1972 c = re_string_peek_byte (input, 0);
1973 token->opr.c = c;
1974
1975 #ifdef RE_ENABLE_I18N
1976 if (input->mb_cur_max > 1 &&
1977 !re_string_first_byte (input, re_string_cur_idx (input)))
1978 {
1979 token->type = CHARACTER;
1980 return 1;
1981 }
1982 #endif /* RE_ENABLE_I18N */
1983
1984 if (c == '\\' && (syntax & REG_BACKSLASH_ESCAPE_IN_LISTS)
1985 && re_string_cur_idx (input) + 1 < re_string_length (input))
1986 {
1987 /* In this case, '\' escape a character. */
1988 unsigned char c2;
1989 re_string_skip_bytes (input, 1);
1990 c2 = re_string_peek_byte (input, 0);
1991 token->opr.c = c2;
1992 token->type = CHARACTER;
1993 return 1;
1994 }
1995 if (c == '[') /* '[' is a special char in a bracket exps. */
1996 {
1997 unsigned char c2;
1998 int token_len;
1999 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2000 c2 = re_string_peek_byte (input, 1);
2001 else
2002 c2 = 0;
2003 token->opr.c = c2;
2004 token_len = 2;
2005 switch (c2)
2006 {
2007 case '.':
2008 token->type = OP_OPEN_COLL_ELEM;
2009 break;
2010 case '=':
2011 token->type = OP_OPEN_EQUIV_CLASS;
2012 break;
2013 case ':':
2014 if (syntax & REG_CHAR_CLASSES)
2015 {
2016 token->type = OP_OPEN_CHAR_CLASS;
2017 break;
2018 }
2019 /* else fall through. */
2020 default:
2021 token->type = CHARACTER;
2022 token->opr.c = c;
2023 token_len = 1;
2024 break;
2025 }
2026 return token_len;
2027 }
2028 switch (c)
2029 {
2030 case '-':
2031 token->type = OP_CHARSET_RANGE;
2032 break;
2033 case ']':
2034 token->type = OP_CLOSE_BRACKET;
2035 break;
2036 case '^':
2037 token->type = OP_NON_MATCH_LIST;
2038 break;
2039 default:
2040 token->type = CHARACTER;
2041 }
2042 return 1;
2043 }
2044
2045 /* Functions for parser. */
2046
2047 /* Entry point of the parser.
2048 Parse the regular expression REGEXP and return the structure tree.
2049 If an error is occured, ERR is set by error code, and return NULL.
2050 This function build the following tree, from regular expression <reg_exp>:
2051 CAT
2052 / \
2053 / \
2054 <reg_exp> EOR
2055
2056 CAT means concatenation.
2057 EOR means end of regular expression. */
2058
2059 static bin_tree_t *
parse(re_string_t * regexp,regex_t * preg,reg_syntax_t syntax,reg_errcode_t * err)2060 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2061 reg_errcode_t *err)
2062 {
2063 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2064 bin_tree_t *tree, *eor, *root;
2065 re_token_t current_token;
2066 dfa->syntax = syntax;
2067 fetch_token (¤t_token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2068 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2069 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2070 return NULL;
2071 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2072 if (tree != NULL)
2073 root = create_tree (dfa, tree, eor, CONCAT);
2074 else
2075 root = eor;
2076 if (BE (eor == NULL || root == NULL, 0))
2077 {
2078 *err = REG_ESPACE;
2079 return NULL;
2080 }
2081 return root;
2082 }
2083
2084 /* This function build the following tree, from regular expression
2085 <branch1>|<branch2>:
2086 ALT
2087 / \
2088 / \
2089 <branch1> <branch2>
2090
2091 ALT means alternative, which represents the operator `|'. */
2092
2093 static bin_tree_t *
parse_reg_exp(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2094 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2095 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2096 {
2097 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2098 bin_tree_t *tree, *branch = NULL;
2099 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2100 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2101 return NULL;
2102
2103 while (token->type == OP_ALT)
2104 {
2105 fetch_token (token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2106 if (token->type != OP_ALT && token->type != END_OF_RE
2107 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2108 {
2109 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2110 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2111 return NULL;
2112 }
2113 else
2114 branch = NULL;
2115 tree = create_tree (dfa, tree, branch, OP_ALT);
2116 if (BE (tree == NULL, 0))
2117 {
2118 *err = REG_ESPACE;
2119 return NULL;
2120 }
2121 }
2122 return tree;
2123 }
2124
2125 /* This function build the following tree, from regular expression
2126 <exp1><exp2>:
2127 CAT
2128 / \
2129 / \
2130 <exp1> <exp2>
2131
2132 CAT means concatenation. */
2133
2134 static bin_tree_t *
parse_branch(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2135 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2136 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2137 {
2138 bin_tree_t *tree, *exp;
2139 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2140 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2141 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2142 return NULL;
2143
2144 while (token->type != OP_ALT && token->type != END_OF_RE
2145 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2146 {
2147 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2148 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2149 {
2150 return NULL;
2151 }
2152 if (tree != NULL && exp != NULL)
2153 {
2154 tree = create_tree (dfa, tree, exp, CONCAT);
2155 if (tree == NULL)
2156 {
2157 *err = REG_ESPACE;
2158 return NULL;
2159 }
2160 }
2161 else if (tree == NULL)
2162 tree = exp;
2163 /* Otherwise exp == NULL, we don't need to create new tree. */
2164 }
2165 return tree;
2166 }
2167
2168 /* This function build the following tree, from regular expression a*:
2169 *
2170 |
2171 a
2172 */
2173
2174 static bin_tree_t *
parse_expression(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2175 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2176 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2177 {
2178 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2179 bin_tree_t *tree;
2180 switch (token->type)
2181 {
2182 case CHARACTER:
2183 tree = create_token_tree (dfa, NULL, NULL, token);
2184 if (BE (tree == NULL, 0))
2185 {
2186 *err = REG_ESPACE;
2187 return NULL;
2188 }
2189 #ifdef RE_ENABLE_I18N
2190 if (dfa->mb_cur_max > 1)
2191 {
2192 while (!re_string_eoi (regexp)
2193 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2194 {
2195 bin_tree_t *mbc_remain;
2196 fetch_token (token, regexp, syntax);
2197 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2198 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2199 if (BE (mbc_remain == NULL || tree == NULL, 0))
2200 {
2201 *err = REG_ESPACE;
2202 return NULL;
2203 }
2204 }
2205 }
2206 #endif
2207 break;
2208 case OP_OPEN_SUBEXP:
2209 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2210 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2211 return NULL;
2212 break;
2213 case OP_OPEN_BRACKET:
2214 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2215 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2216 return NULL;
2217 break;
2218 case OP_BACK_REF:
2219 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2220 {
2221 *err = REG_ESUBREG;
2222 return NULL;
2223 }
2224 dfa->used_bkref_map |= 1 << token->opr.idx;
2225 tree = create_token_tree (dfa, NULL, NULL, token);
2226 if (BE (tree == NULL, 0))
2227 {
2228 *err = REG_ESPACE;
2229 return NULL;
2230 }
2231 ++dfa->nbackref;
2232 dfa->has_mb_node = 1;
2233 break;
2234 case OP_OPEN_DUP_NUM:
2235 if (syntax & REG_CONTEXT_INVALID_DUP)
2236 {
2237 *err = REG_BADRPT;
2238 return NULL;
2239 }
2240 /* FALLTHROUGH */
2241 case OP_DUP_ASTERISK:
2242 case OP_DUP_PLUS:
2243 case OP_DUP_QUESTION:
2244 if (syntax & REG_CONTEXT_INVALID_OPS)
2245 {
2246 *err = REG_BADRPT;
2247 return NULL;
2248 }
2249 else if (syntax & REG_CONTEXT_INDEP_OPS)
2250 {
2251 fetch_token (token, regexp, syntax);
2252 return parse_expression (regexp, preg, token, syntax, nest, err);
2253 }
2254 /* else fall through */
2255 case OP_CLOSE_SUBEXP:
2256 if ((token->type == OP_CLOSE_SUBEXP) &&
2257 !(syntax & REG_UNMATCHED_RIGHT_PAREN_ORD))
2258 {
2259 *err = REG_ERPAREN;
2260 return NULL;
2261 }
2262 /* else fall through */
2263 case OP_CLOSE_DUP_NUM:
2264 /* We treat it as a normal character. */
2265
2266 /* Then we can these characters as normal characters. */
2267 token->type = CHARACTER;
2268 /* mb_partial and word_char bits should be initialized already
2269 by peek_token. */
2270 tree = create_token_tree (dfa, NULL, NULL, token);
2271 if (BE (tree == NULL, 0))
2272 {
2273 *err = REG_ESPACE;
2274 return NULL;
2275 }
2276 break;
2277 case ANCHOR:
2278 if ((token->opr.ctx_type
2279 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2280 && dfa->word_ops_used == 0)
2281 init_word_char (dfa);
2282 if (token->opr.ctx_type == WORD_DELIM
2283 || token->opr.ctx_type == NOT_WORD_DELIM)
2284 {
2285 bin_tree_t *tree_first, *tree_last;
2286 if (token->opr.ctx_type == WORD_DELIM)
2287 {
2288 token->opr.ctx_type = WORD_FIRST;
2289 tree_first = create_token_tree (dfa, NULL, NULL, token);
2290 token->opr.ctx_type = WORD_LAST;
2291 }
2292 else
2293 {
2294 token->opr.ctx_type = INSIDE_WORD;
2295 tree_first = create_token_tree (dfa, NULL, NULL, token);
2296 token->opr.ctx_type = INSIDE_NOTWORD;
2297 }
2298 tree_last = create_token_tree (dfa, NULL, NULL, token);
2299 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2300 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2301 {
2302 *err = REG_ESPACE;
2303 return NULL;
2304 }
2305 }
2306 else
2307 {
2308 tree = create_token_tree (dfa, NULL, NULL, token);
2309 if (BE (tree == NULL, 0))
2310 {
2311 *err = REG_ESPACE;
2312 return NULL;
2313 }
2314 }
2315 /* We must return here, since ANCHORs can't be followed
2316 by repetition operators.
2317 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2318 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2319 fetch_token (token, regexp, syntax);
2320 return tree;
2321 case OP_PERIOD:
2322 tree = create_token_tree (dfa, NULL, NULL, token);
2323 if (BE (tree == NULL, 0))
2324 {
2325 *err = REG_ESPACE;
2326 return NULL;
2327 }
2328 if (dfa->mb_cur_max > 1)
2329 dfa->has_mb_node = 1;
2330 break;
2331 case OP_WORD:
2332 case OP_NOTWORD:
2333 tree = build_charclass_op (dfa, regexp->trans,
2334 (const unsigned char *) "alnum",
2335 (const unsigned char *) "_",
2336 token->type == OP_NOTWORD, err);
2337 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2338 return NULL;
2339 break;
2340 case OP_SPACE:
2341 case OP_NOTSPACE:
2342 tree = build_charclass_op (dfa, regexp->trans,
2343 (const unsigned char *) "space",
2344 (const unsigned char *) "",
2345 token->type == OP_NOTSPACE, err);
2346 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2347 return NULL;
2348 break;
2349 case OP_ALT:
2350 case END_OF_RE:
2351 return NULL;
2352 case BACK_SLASH:
2353 *err = REG_EESCAPE;
2354 return NULL;
2355 default:
2356 /* Must not happen? */
2357 #ifdef DEBUG
2358 assert (0);
2359 #endif
2360 return NULL;
2361 }
2362 fetch_token (token, regexp, syntax);
2363
2364 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2365 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2366 {
2367 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2368 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2369 return NULL;
2370 /* In BRE consecutive duplications are not allowed. */
2371 if ((syntax & REG_CONTEXT_INVALID_DUP)
2372 && (token->type == OP_DUP_ASTERISK
2373 || token->type == OP_OPEN_DUP_NUM))
2374 {
2375 *err = REG_BADRPT;
2376 return NULL;
2377 }
2378 }
2379
2380 return tree;
2381 }
2382
2383 /* This function build the following tree, from regular expression
2384 (<reg_exp>):
2385 SUBEXP
2386 |
2387 <reg_exp>
2388 */
2389
2390 static bin_tree_t *
parse_sub_exp(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2391 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2392 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2393 {
2394 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2395 bin_tree_t *tree;
2396 size_t cur_nsub;
2397 cur_nsub = preg->re_nsub++;
2398
2399 fetch_token (token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2400
2401 /* The subexpression may be a null string. */
2402 if (token->type == OP_CLOSE_SUBEXP)
2403 tree = NULL;
2404 else
2405 {
2406 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2407 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2408 *err = REG_EPAREN;
2409 if (BE (*err != REG_NOERROR, 0))
2410 return NULL;
2411 }
2412
2413 if (cur_nsub <= '9' - '1')
2414 dfa->completed_bkref_map |= 1 << cur_nsub;
2415
2416 tree = create_tree (dfa, tree, NULL, SUBEXP);
2417 if (BE (tree == NULL, 0))
2418 {
2419 *err = REG_ESPACE;
2420 return NULL;
2421 }
2422 tree->token.opr.idx = cur_nsub;
2423 return tree;
2424 }
2425
2426 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2427
2428 static bin_tree_t *
parse_dup_op(bin_tree_t * elem,re_string_t * regexp,re_dfa_t * dfa,re_token_t * token,reg_syntax_t syntax,reg_errcode_t * err)2429 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2430 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2431 {
2432 bin_tree_t *tree = NULL, *old_tree = NULL;
2433 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2434 re_token_t start_token = *token;
2435
2436 if (token->type == OP_OPEN_DUP_NUM)
2437 {
2438 end = 0;
2439 start = fetch_number (regexp, token, syntax);
2440 if (start == REG_MISSING)
2441 {
2442 if (token->type == CHARACTER && token->opr.c == ',')
2443 start = 0; /* We treat "{,m}" as "{0,m}". */
2444 else
2445 {
2446 *err = REG_BADBR; /* <re>{} is invalid. */
2447 return NULL;
2448 }
2449 }
2450 if (BE (start != REG_ERROR, 1))
2451 {
2452 /* We treat "{n}" as "{n,n}". */
2453 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2454 : ((token->type == CHARACTER && token->opr.c == ',')
2455 ? fetch_number (regexp, token, syntax) : REG_ERROR));
2456 }
2457 if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2458 {
2459 /* Invalid sequence. */
2460 if (BE (!(syntax & REG_INVALID_INTERVAL_ORD), 0))
2461 {
2462 if (token->type == END_OF_RE)
2463 *err = REG_EBRACE;
2464 else
2465 *err = REG_BADBR;
2466
2467 return NULL;
2468 }
2469
2470 /* If the syntax bit is set, rollback. */
2471 re_string_set_index (regexp, start_idx);
2472 *token = start_token;
2473 token->type = CHARACTER;
2474 /* mb_partial and word_char bits should be already initialized by
2475 peek_token. */
2476 return elem;
2477 }
2478
2479 if (BE (end != REG_MISSING && start > end, 0))
2480 {
2481 /* First number greater than second. */
2482 *err = REG_BADBR;
2483 return NULL;
2484 }
2485 }
2486 else
2487 {
2488 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2489 end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2490 }
2491
2492 fetch_token (token, regexp, syntax);
2493
2494 if (BE (elem == NULL, 0))
2495 return NULL;
2496 if (BE (start == 0 && end == 0, 0))
2497 {
2498 postorder (elem, free_tree, NULL);
2499 return NULL;
2500 }
2501
2502 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2503 if (BE (start > 0, 0))
2504 {
2505 tree = elem;
2506 for (i = 2; i <= start; ++i)
2507 {
2508 elem = duplicate_tree (elem, dfa);
2509 tree = create_tree (dfa, tree, elem, CONCAT);
2510 if (BE (elem == NULL || tree == NULL, 0))
2511 goto parse_dup_op_espace;
2512 }
2513
2514 if (start == end)
2515 return tree;
2516
2517 /* Duplicate ELEM before it is marked optional. */
2518 elem = duplicate_tree (elem, dfa);
2519 old_tree = tree;
2520 }
2521 else
2522 old_tree = NULL;
2523
2524 if (elem->token.type == SUBEXP)
2525 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2526
2527 tree = create_tree (dfa, elem, NULL,
2528 (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2529 if (BE (tree == NULL, 0))
2530 goto parse_dup_op_espace;
2531
2532 /* This loop is actually executed only when end != REG_MISSING,
2533 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2534 already created the start+1-th copy. */
2535 if ((Idx) -1 < 0 || end != REG_MISSING)
2536 for (i = start + 2; i <= end; ++i)
2537 {
2538 elem = duplicate_tree (elem, dfa);
2539 tree = create_tree (dfa, tree, elem, CONCAT);
2540 if (BE (elem == NULL || tree == NULL, 0))
2541 goto parse_dup_op_espace;
2542
2543 tree = create_tree (dfa, tree, NULL, OP_ALT);
2544 if (BE (tree == NULL, 0))
2545 goto parse_dup_op_espace;
2546 }
2547
2548 if (old_tree)
2549 tree = create_tree (dfa, old_tree, tree, CONCAT);
2550
2551 return tree;
2552
2553 parse_dup_op_espace:
2554 *err = REG_ESPACE;
2555 return NULL;
2556 }
2557
2558 /* Size of the names for collating symbol/equivalence_class/character_class.
2559 I'm not sure, but maybe enough. */
2560 #define BRACKET_NAME_BUF_SIZE 32
2561
2562 #ifndef _LIBC
2563 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2564 Build the range expression which starts from START_ELEM, and ends
2565 at END_ELEM. The result are written to MBCSET and SBCSET.
2566 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2567 mbcset->range_ends, is a pointer argument sinse we may
2568 update it. */
2569
2570 static reg_errcode_t
build_range_exp(bitset sbcset,re_charset_t * mbcset,Idx * range_alloc,bracket_elem_t * start_elem,bracket_elem_t * end_elem)2571 build_range_exp (bitset sbcset,
2572 # ifdef RE_ENABLE_I18N
2573 re_charset_t *mbcset, Idx *range_alloc,
2574 # endif
2575 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2576 {
2577 unsigned int start_ch, end_ch;
2578 /* Equivalence Classes and Character Classes can't be a range start/end. */
2579 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2580 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2581 0))
2582 return REG_ERANGE;
2583
2584 /* We can handle no multi character collating elements without libc
2585 support. */
2586 if (BE ((start_elem->type == COLL_SYM
2587 && strlen ((char *) start_elem->opr.name) > 1)
2588 || (end_elem->type == COLL_SYM
2589 && strlen ((char *) end_elem->opr.name) > 1), 0))
2590 return REG_ECOLLATE;
2591
2592 # ifdef RE_ENABLE_I18N
2593 {
2594 wchar_t wc;
2595 wint_t start_wc, end_wc;
2596 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2597
2598 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2599 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2600 : 0));
2601 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2602 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2603 : 0));
2604 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2605 ? __btowc (start_ch) : start_elem->opr.wch);
2606 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2607 ? __btowc (end_ch) : end_elem->opr.wch);
2608 if (start_wc == WEOF || end_wc == WEOF)
2609 return REG_ECOLLATE;
2610 cmp_buf[0] = start_wc;
2611 cmp_buf[4] = end_wc;
2612 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2613 return REG_ERANGE;
2614
2615 /* Got valid collation sequence values, add them as a new entry.
2616 However, for !_LIBC we have no collation elements: if the
2617 character set is single byte, the single byte character set
2618 that we build below suffices. parse_bracket_exp passes
2619 no MBCSET if dfa->mb_cur_max == 1. */
2620 if (mbcset)
2621 {
2622 /* Check the space of the arrays. */
2623 if (BE (*range_alloc == mbcset->nranges, 0))
2624 {
2625 /* There is not enough space, need realloc. */
2626 wchar_t *new_array_start, *new_array_end;
2627 Idx new_nranges;
2628
2629 new_nranges = mbcset->nranges;
2630 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2631 are NULL if *range_alloc == 0. */
2632 new_array_start = re_x2realloc (mbcset->range_starts, wchar_t,
2633 &new_nranges);
2634 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2635 new_nranges);
2636
2637 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2638 return REG_ESPACE;
2639
2640 mbcset->range_starts = new_array_start;
2641 mbcset->range_ends = new_array_end;
2642 *range_alloc = new_nranges;
2643 }
2644
2645 mbcset->range_starts[mbcset->nranges] = start_wc;
2646 mbcset->range_ends[mbcset->nranges++] = end_wc;
2647 }
2648
2649 /* Build the table for single byte characters. */
2650 for (wc = 0; wc < SBC_MAX; ++wc)
2651 {
2652 cmp_buf[2] = wc;
2653 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2654 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2655 bitset_set (sbcset, wc);
2656 }
2657 }
2658 # else /* not RE_ENABLE_I18N */
2659 {
2660 unsigned int ch;
2661 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2662 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2663 : 0));
2664 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2665 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2666 : 0));
2667 if (start_ch > end_ch)
2668 return REG_ERANGE;
2669 /* Build the table for single byte characters. */
2670 for (ch = 0; ch < SBC_MAX; ++ch)
2671 if (start_ch <= ch && ch <= end_ch)
2672 bitset_set (sbcset, ch);
2673 }
2674 # endif /* not RE_ENABLE_I18N */
2675 return REG_NOERROR;
2676 }
2677 #endif /* not _LIBC */
2678
2679 #ifndef _LIBC
2680 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2681 Build the collating element which is represented by NAME.
2682 The result are written to MBCSET and SBCSET.
2683 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2684 pointer argument since we may update it. */
2685
2686 static reg_errcode_t
build_collating_symbol(bitset sbcset,re_charset_t * mbcset,Idx * coll_sym_alloc,const unsigned char * name)2687 build_collating_symbol (bitset sbcset,
2688 # ifdef RE_ENABLE_I18N
2689 re_charset_t *mbcset, Idx *coll_sym_alloc,
2690 # endif
2691 const unsigned char *name)
2692 {
2693 size_t name_len = strlen ((const char *) name);
2694 if (BE (name_len != 1, 0))
2695 return REG_ECOLLATE;
2696 else
2697 {
2698 bitset_set (sbcset, name[0]);
2699 return REG_NOERROR;
2700 }
2701 }
2702 #endif /* not _LIBC */
2703
2704 /* This function parse bracket expression like "[abc]", "[a-c]",
2705 "[[.a-a.]]" etc. */
2706
2707 static bin_tree_t *
parse_bracket_exp(re_string_t * regexp,re_dfa_t * dfa,re_token_t * token,reg_syntax_t syntax,reg_errcode_t * err)2708 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2709 reg_syntax_t syntax, reg_errcode_t *err)
2710 {
2711 #ifdef _LIBC
2712 const unsigned char *collseqmb;
2713 const char *collseqwc;
2714 uint32_t nrules;
2715 int32_t table_size;
2716 const int32_t *symb_table;
2717 const unsigned char *extra;
2718
2719 /* Local function for parse_bracket_exp used in _LIBC environement.
2720 Seek the collating symbol entry correspondings to NAME.
2721 Return the index of the symbol in the SYMB_TABLE. */
2722
2723 auto inline int32_t
2724 __attribute ((always_inline))
2725 seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2726 {
2727 int32_t hash = elem_hash ((const char *) name, name_len);
2728 int32_t elem = hash % table_size;
2729 int32_t second = hash % (table_size - 2);
2730 while (symb_table[2 * elem] != 0)
2731 {
2732 /* First compare the hashing value. */
2733 if (symb_table[2 * elem] == hash
2734 /* Compare the length of the name. */
2735 && name_len == extra[symb_table[2 * elem + 1]]
2736 /* Compare the name. */
2737 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2738 name_len) == 0)
2739 {
2740 /* Yep, this is the entry. */
2741 break;
2742 }
2743
2744 /* Next entry. */
2745 elem += second;
2746 }
2747 return elem;
2748 }
2749
2750 /* Local function for parse_bracket_exp used in _LIBC environement.
2751 Look up the collation sequence value of BR_ELEM.
2752 Return the value if succeeded, UINT_MAX otherwise. */
2753
2754 auto inline unsigned int
2755 __attribute ((always_inline))
2756 lookup_collation_sequence_value (bracket_elem_t *br_elem)
2757 {
2758 if (br_elem->type == SB_CHAR)
2759 {
2760 /*
2761 if (MB_CUR_MAX == 1)
2762 */
2763 if (nrules == 0)
2764 return collseqmb[br_elem->opr.ch];
2765 else
2766 {
2767 wint_t wc = __btowc (br_elem->opr.ch);
2768 return __collseq_table_lookup (collseqwc, wc);
2769 }
2770 }
2771 else if (br_elem->type == MB_CHAR)
2772 {
2773 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2774 }
2775 else if (br_elem->type == COLL_SYM)
2776 {
2777 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2778 if (nrules != 0)
2779 {
2780 int32_t elem, idx;
2781 elem = seek_collating_symbol_entry (br_elem->opr.name,
2782 sym_name_len);
2783 if (symb_table[2 * elem] != 0)
2784 {
2785 /* We found the entry. */
2786 idx = symb_table[2 * elem + 1];
2787 /* Skip the name of collating element name. */
2788 idx += 1 + extra[idx];
2789 /* Skip the byte sequence of the collating element. */
2790 idx += 1 + extra[idx];
2791 /* Adjust for the alignment. */
2792 idx = (idx + 3) & ~3;
2793 /* Skip the multibyte collation sequence value. */
2794 idx += sizeof (unsigned int);
2795 /* Skip the wide char sequence of the collating element. */
2796 idx += sizeof (unsigned int) *
2797 (1 + *(unsigned int *) (extra + idx));
2798 /* Return the collation sequence value. */
2799 return *(unsigned int *) (extra + idx);
2800 }
2801 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2802 {
2803 /* No valid character. Match it as a single byte
2804 character. */
2805 return collseqmb[br_elem->opr.name[0]];
2806 }
2807 }
2808 else if (sym_name_len == 1)
2809 return collseqmb[br_elem->opr.name[0]];
2810 }
2811 return UINT_MAX;
2812 }
2813
2814 /* Local function for parse_bracket_exp used in _LIBC environement.
2815 Build the range expression which starts from START_ELEM, and ends
2816 at END_ELEM. The result are written to MBCSET and SBCSET.
2817 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2818 mbcset->range_ends, is a pointer argument sinse we may
2819 update it. */
2820
2821 auto inline reg_errcode_t
2822 __attribute ((always_inline))
2823 build_range_exp (bitset sbcset, re_charset_t *mbcset,
2824 Idx *range_alloc,
2825 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2826 {
2827 unsigned int ch;
2828 uint32_t start_collseq;
2829 uint32_t end_collseq;
2830
2831 /* Equivalence Classes and Character Classes can't be a range
2832 start/end. */
2833 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2834 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2835 0))
2836 return REG_ERANGE;
2837
2838 start_collseq = lookup_collation_sequence_value (start_elem);
2839 end_collseq = lookup_collation_sequence_value (end_elem);
2840 /* Check start/end collation sequence values. */
2841 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2842 return REG_ECOLLATE;
2843 if (BE ((syntax & REG_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2844 return REG_ERANGE;
2845
2846 /* Got valid collation sequence values, add them as a new entry.
2847 However, if we have no collation elements, and the character set
2848 is single byte, the single byte character set that we
2849 build below suffices. */
2850 if (nrules > 0 || dfa->mb_cur_max > 1)
2851 {
2852 /* Check the space of the arrays. */
2853 if (BE (*range_alloc == mbcset->nranges, 0))
2854 {
2855 /* There is not enough space, need realloc. */
2856 uint32_t *new_array_start;
2857 uint32_t *new_array_end;
2858 Idx new_nranges;
2859
2860 new_nranges = mbcset->nranges;
2861 new_array_start = re_x2realloc (mbcset->range_starts, uint32_t,
2862 &new_nranges);
2863 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2864 new_nranges);
2865
2866 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2867 return REG_ESPACE;
2868
2869 mbcset->range_starts = new_array_start;
2870 mbcset->range_ends = new_array_end;
2871 *range_alloc = new_nranges;
2872 }
2873
2874 mbcset->range_starts[mbcset->nranges] = start_collseq;
2875 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2876 }
2877
2878 /* Build the table for single byte characters. */
2879 for (ch = 0; ch < SBC_MAX; ch++)
2880 {
2881 uint32_t ch_collseq;
2882 /*
2883 if (MB_CUR_MAX == 1)
2884 */
2885 if (nrules == 0)
2886 ch_collseq = collseqmb[ch];
2887 else
2888 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2889 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2890 bitset_set (sbcset, ch);
2891 }
2892 return REG_NOERROR;
2893 }
2894
2895 /* Local function for parse_bracket_exp used in _LIBC environement.
2896 Build the collating element which is represented by NAME.
2897 The result are written to MBCSET and SBCSET.
2898 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2899 pointer argument sinse we may update it. */
2900
2901 auto inline reg_errcode_t
2902 __attribute ((always_inline))
2903 build_collating_symbol (bitset sbcset, re_charset_t *mbcset,
2904 Idx *coll_sym_alloc, const unsigned char *name)
2905 {
2906 int32_t elem, idx;
2907 size_t name_len = strlen ((const char *) name);
2908 if (nrules != 0)
2909 {
2910 elem = seek_collating_symbol_entry (name, name_len);
2911 if (symb_table[2 * elem] != 0)
2912 {
2913 /* We found the entry. */
2914 idx = symb_table[2 * elem + 1];
2915 /* Skip the name of collating element name. */
2916 idx += 1 + extra[idx];
2917 }
2918 else if (symb_table[2 * elem] == 0 && name_len == 1)
2919 {
2920 /* No valid character, treat it as a normal
2921 character. */
2922 bitset_set (sbcset, name[0]);
2923 return REG_NOERROR;
2924 }
2925 else
2926 return REG_ECOLLATE;
2927
2928 /* Got valid collation sequence, add it as a new entry. */
2929 /* Check the space of the arrays. */
2930 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2931 {
2932 /* Not enough, realloc it. */
2933 Idx new_coll_sym_alloc = mbcset->ncoll_syms;
2934 /* Use realloc since mbcset->coll_syms is NULL
2935 if *alloc == 0. */
2936 int32_t *new_coll_syms = re_x2realloc (mbcset->coll_syms, int32_t,
2937 &new_coll_sym_alloc);
2938 if (BE (new_coll_syms == NULL, 0))
2939 return REG_ESPACE;
2940 mbcset->coll_syms = new_coll_syms;
2941 *coll_sym_alloc = new_coll_sym_alloc;
2942 }
2943 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2944 return REG_NOERROR;
2945 }
2946 else
2947 {
2948 if (BE (name_len != 1, 0))
2949 return REG_ECOLLATE;
2950 else
2951 {
2952 bitset_set (sbcset, name[0]);
2953 return REG_NOERROR;
2954 }
2955 }
2956 }
2957 #endif
2958
2959 re_token_t br_token;
2960 re_bitset_ptr_t sbcset;
2961 #ifdef RE_ENABLE_I18N
2962 re_charset_t *mbcset;
2963 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2964 Idx equiv_class_alloc = 0, char_class_alloc = 0;
2965 #endif /* not RE_ENABLE_I18N */
2966 bool non_match = false;
2967 bin_tree_t *work_tree;
2968 int token_len;
2969 bool first_round = true;
2970 #ifdef _LIBC
2971 collseqmb = (const unsigned char *)
2972 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2973 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2974 if (nrules)
2975 {
2976 /*
2977 if (MB_CUR_MAX > 1)
2978 */
2979 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2980 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2981 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2982 _NL_COLLATE_SYMB_TABLEMB);
2983 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2984 _NL_COLLATE_SYMB_EXTRAMB);
2985 }
2986 #endif
2987 sbcset = re_calloc (bitset_word, BITSET_WORDS);
2988 #ifdef RE_ENABLE_I18N
2989 mbcset = re_calloc (re_charset_t, 1);
2990 #endif /* RE_ENABLE_I18N */
2991 #ifdef RE_ENABLE_I18N
2992 if (BE (sbcset == NULL || mbcset == NULL, 0))
2993 #else
2994 if (BE (sbcset == NULL, 0))
2995 #endif /* RE_ENABLE_I18N */
2996 {
2997 *err = REG_ESPACE;
2998 return NULL;
2999 }
3000
3001 token_len = peek_token_bracket (token, regexp, syntax);
3002 if (BE (token->type == END_OF_RE, 0))
3003 {
3004 *err = REG_BADPAT;
3005 goto parse_bracket_exp_free_return;
3006 }
3007 if (token->type == OP_NON_MATCH_LIST)
3008 {
3009 #ifdef RE_ENABLE_I18N
3010 mbcset->non_match = 1;
3011 #endif /* not RE_ENABLE_I18N */
3012 non_match = true;
3013 if (syntax & REG_HAT_LISTS_NOT_NEWLINE)
3014 bitset_set (sbcset, '\0');
3015 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3016 token_len = peek_token_bracket (token, regexp, syntax);
3017 if (BE (token->type == END_OF_RE, 0))
3018 {
3019 *err = REG_BADPAT;
3020 goto parse_bracket_exp_free_return;
3021 }
3022 }
3023
3024 /* We treat the first ']' as a normal character. */
3025 if (token->type == OP_CLOSE_BRACKET)
3026 token->type = CHARACTER;
3027
3028 while (1)
3029 {
3030 bracket_elem_t start_elem, end_elem;
3031 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3032 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3033 reg_errcode_t ret;
3034 int token_len2 = 0;
3035 bool is_range_exp = false;
3036 re_token_t token2;
3037
3038 start_elem.opr.name = start_name_buf;
3039 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3040 syntax, first_round);
3041 if (BE (ret != REG_NOERROR, 0))
3042 {
3043 *err = ret;
3044 goto parse_bracket_exp_free_return;
3045 }
3046 first_round = false;
3047
3048 /* Get information about the next token. We need it in any case. */
3049 token_len = peek_token_bracket (token, regexp, syntax);
3050
3051 /* Do not check for ranges if we know they are not allowed. */
3052 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3053 {
3054 if (BE (token->type == END_OF_RE, 0))
3055 {
3056 *err = REG_EBRACK;
3057 goto parse_bracket_exp_free_return;
3058 }
3059 if (token->type == OP_CHARSET_RANGE)
3060 {
3061 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3062 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3063 if (BE (token2.type == END_OF_RE, 0))
3064 {
3065 *err = REG_EBRACK;
3066 goto parse_bracket_exp_free_return;
3067 }
3068 if (token2.type == OP_CLOSE_BRACKET)
3069 {
3070 /* We treat the last '-' as a normal character. */
3071 re_string_skip_bytes (regexp, -token_len);
3072 token->type = CHARACTER;
3073 }
3074 else
3075 is_range_exp = true;
3076 }
3077 }
3078
3079 if (is_range_exp == true)
3080 {
3081 end_elem.opr.name = end_name_buf;
3082 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3083 dfa, syntax, true);
3084 if (BE (ret != REG_NOERROR, 0))
3085 {
3086 *err = ret;
3087 goto parse_bracket_exp_free_return;
3088 }
3089
3090 token_len = peek_token_bracket (token, regexp, syntax);
3091
3092 #ifdef _LIBC
3093 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3094 &start_elem, &end_elem);
3095 #else
3096 # ifdef RE_ENABLE_I18N
3097 *err = build_range_exp (sbcset,
3098 dfa->mb_cur_max > 1 ? mbcset : NULL,
3099 &range_alloc, &start_elem, &end_elem);
3100 # else
3101 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3102 # endif
3103 #endif /* RE_ENABLE_I18N */
3104 if (BE (*err != REG_NOERROR, 0))
3105 goto parse_bracket_exp_free_return;
3106 }
3107 else
3108 {
3109 switch (start_elem.type)
3110 {
3111 case SB_CHAR:
3112 bitset_set (sbcset, start_elem.opr.ch);
3113 break;
3114 #ifdef RE_ENABLE_I18N
3115 case MB_CHAR:
3116 /* Check whether the array has enough space. */
3117 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3118 {
3119 wchar_t *new_mbchars;
3120 /* Not enough, realloc it. */
3121 mbchar_alloc = mbcset->nmbchars;
3122 /* Use realloc since array is NULL if *alloc == 0. */
3123 new_mbchars = re_x2realloc (mbcset->mbchars, wchar_t,
3124 &mbchar_alloc);
3125 if (BE (new_mbchars == NULL, 0))
3126 goto parse_bracket_exp_espace;
3127 mbcset->mbchars = new_mbchars;
3128 }
3129 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3130 break;
3131 #endif /* RE_ENABLE_I18N */
3132 case EQUIV_CLASS:
3133 *err = build_equiv_class (sbcset,
3134 #ifdef RE_ENABLE_I18N
3135 mbcset, &equiv_class_alloc,
3136 #endif /* RE_ENABLE_I18N */
3137 start_elem.opr.name);
3138 if (BE (*err != REG_NOERROR, 0))
3139 goto parse_bracket_exp_free_return;
3140 break;
3141 case COLL_SYM:
3142 *err = build_collating_symbol (sbcset,
3143 #ifdef RE_ENABLE_I18N
3144 mbcset, &coll_sym_alloc,
3145 #endif /* RE_ENABLE_I18N */
3146 start_elem.opr.name);
3147 if (BE (*err != REG_NOERROR, 0))
3148 goto parse_bracket_exp_free_return;
3149 break;
3150 case CHAR_CLASS:
3151 *err = build_charclass (regexp->trans, sbcset,
3152 #ifdef RE_ENABLE_I18N
3153 mbcset, &char_class_alloc,
3154 #endif /* RE_ENABLE_I18N */
3155 start_elem.opr.name, syntax);
3156 if (BE (*err != REG_NOERROR, 0))
3157 goto parse_bracket_exp_free_return;
3158 break;
3159 default:
3160 assert (0);
3161 break;
3162 }
3163 }
3164 if (BE (token->type == END_OF_RE, 0))
3165 {
3166 *err = REG_EBRACK;
3167 goto parse_bracket_exp_free_return;
3168 }
3169 if (token->type == OP_CLOSE_BRACKET)
3170 break;
3171 }
3172
3173 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3174
3175 /* If it is non-matching list. */
3176 if (non_match)
3177 bitset_not (sbcset);
3178
3179 #ifdef RE_ENABLE_I18N
3180 /* Ensure only single byte characters are set. */
3181 if (dfa->mb_cur_max > 1)
3182 bitset_mask (sbcset, dfa->sb_char);
3183
3184 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3185 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3186 || mbcset->non_match)))
3187 {
3188 bin_tree_t *mbc_tree;
3189 int sbc_idx;
3190 /* Build a tree for complex bracket. */
3191 dfa->has_mb_node = 1;
3192 br_token.type = COMPLEX_BRACKET;
3193 br_token.opr.mbcset = mbcset;
3194 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3195 if (BE (mbc_tree == NULL, 0))
3196 goto parse_bracket_exp_espace;
3197 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3198 if (sbcset[sbc_idx])
3199 break;
3200 /* If there are no bits set in sbcset, there is no point
3201 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3202 if (sbc_idx < BITSET_WORDS)
3203 {
3204 /* Build a tree for simple bracket. */
3205 br_token.type = SIMPLE_BRACKET;
3206 br_token.opr.sbcset = sbcset;
3207 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3208 if (BE (work_tree == NULL, 0))
3209 goto parse_bracket_exp_espace;
3210
3211 /* Then join them by ALT node. */
3212 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3213 if (BE (work_tree == NULL, 0))
3214 goto parse_bracket_exp_espace;
3215 }
3216 else
3217 {
3218 re_free (sbcset);
3219 work_tree = mbc_tree;
3220 }
3221 }
3222 else
3223 #endif /* not RE_ENABLE_I18N */
3224 {
3225 #ifdef RE_ENABLE_I18N
3226 free_charset (mbcset);
3227 #endif
3228 /* Build a tree for simple bracket. */
3229 br_token.type = SIMPLE_BRACKET;
3230 br_token.opr.sbcset = sbcset;
3231 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3232 if (BE (work_tree == NULL, 0))
3233 goto parse_bracket_exp_espace;
3234 }
3235 return work_tree;
3236
3237 parse_bracket_exp_espace:
3238 *err = REG_ESPACE;
3239 parse_bracket_exp_free_return:
3240 re_free (sbcset);
3241 #ifdef RE_ENABLE_I18N
3242 free_charset (mbcset);
3243 #endif /* RE_ENABLE_I18N */
3244 return NULL;
3245 }
3246
3247 /* Parse an element in the bracket expression. */
3248
3249 static reg_errcode_t
parse_bracket_element(bracket_elem_t * elem,re_string_t * regexp,re_token_t * token,int token_len,re_dfa_t * dfa,reg_syntax_t syntax,bool accept_hyphen)3250 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3251 re_token_t *token, int token_len, re_dfa_t *dfa,
3252 reg_syntax_t syntax, bool accept_hyphen)
3253 {
3254 #ifdef RE_ENABLE_I18N
3255 int cur_char_size;
3256 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3257 if (cur_char_size > 1)
3258 {
3259 elem->type = MB_CHAR;
3260 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3261 re_string_skip_bytes (regexp, cur_char_size);
3262 return REG_NOERROR;
3263 }
3264 #endif /* RE_ENABLE_I18N */
3265 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3266 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3267 || token->type == OP_OPEN_EQUIV_CLASS)
3268 return parse_bracket_symbol (elem, regexp, token);
3269 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3270 {
3271 /* A '-' must only appear as anything but a range indicator before
3272 the closing bracket. Everything else is an error. */
3273 re_token_t token2;
3274 (void) peek_token_bracket (&token2, regexp, syntax);
3275 if (token2.type != OP_CLOSE_BRACKET)
3276 /* The actual error value is not standardized since this whole
3277 case is undefined. But ERANGE makes good sense. */
3278 return REG_ERANGE;
3279 }
3280 elem->type = SB_CHAR;
3281 elem->opr.ch = token->opr.c;
3282 return REG_NOERROR;
3283 }
3284
3285 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3286 such as [:<character_class>:], [.<collating_element>.], and
3287 [=<equivalent_class>=]. */
3288
3289 static reg_errcode_t
parse_bracket_symbol(bracket_elem_t * elem,re_string_t * regexp,re_token_t * token)3290 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3291 re_token_t *token)
3292 {
3293 unsigned char ch, delim = token->opr.c;
3294 int i = 0;
3295 if (re_string_eoi(regexp))
3296 return REG_EBRACK;
3297 for (;; ++i)
3298 {
3299 if (i >= BRACKET_NAME_BUF_SIZE)
3300 return REG_EBRACK;
3301 if (token->type == OP_OPEN_CHAR_CLASS)
3302 ch = re_string_fetch_byte_case (regexp);
3303 else
3304 ch = re_string_fetch_byte (regexp);
3305 if (re_string_eoi(regexp))
3306 return REG_EBRACK;
3307 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3308 break;
3309 elem->opr.name[i] = ch;
3310 }
3311 re_string_skip_bytes (regexp, 1);
3312 elem->opr.name[i] = '\0';
3313 switch (token->type)
3314 {
3315 case OP_OPEN_COLL_ELEM:
3316 elem->type = COLL_SYM;
3317 break;
3318 case OP_OPEN_EQUIV_CLASS:
3319 elem->type = EQUIV_CLASS;
3320 break;
3321 case OP_OPEN_CHAR_CLASS:
3322 elem->type = CHAR_CLASS;
3323 break;
3324 default:
3325 break;
3326 }
3327 return REG_NOERROR;
3328 }
3329
3330 /* Helper function for parse_bracket_exp.
3331 Build the equivalence class which is represented by NAME.
3332 The result are written to MBCSET and SBCSET.
3333 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3334 is a pointer argument sinse we may update it. */
3335
3336 static reg_errcode_t
build_equiv_class(bitset sbcset,re_charset_t * mbcset,Idx * equiv_class_alloc,const unsigned char * name)3337 build_equiv_class (bitset sbcset,
3338 #ifdef RE_ENABLE_I18N
3339 re_charset_t *mbcset, Idx *equiv_class_alloc,
3340 #endif
3341 const unsigned char *name)
3342 {
3343 #if defined _LIBC
3344 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3345 if (nrules != 0)
3346 {
3347 const int32_t *table, *indirect;
3348 const unsigned char *weights, *extra, *cp;
3349 unsigned char char_buf[2];
3350 int32_t idx1, idx2;
3351 unsigned int ch;
3352 size_t len;
3353 /* This #include defines a local function! */
3354 # include <locale/weight.h>
3355 /* Calculate the index for equivalence class. */
3356 cp = name;
3357 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3358 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3359 _NL_COLLATE_WEIGHTMB);
3360 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3361 _NL_COLLATE_EXTRAMB);
3362 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3363 _NL_COLLATE_INDIRECTMB);
3364 idx1 = findidx (&cp);
3365 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3366 /* This isn't a valid character. */
3367 return REG_ECOLLATE;
3368
3369 /* Build single byte matcing table for this equivalence class. */
3370 char_buf[1] = (unsigned char) '\0';
3371 len = weights[idx1];
3372 for (ch = 0; ch < SBC_MAX; ++ch)
3373 {
3374 char_buf[0] = ch;
3375 cp = char_buf;
3376 idx2 = findidx (&cp);
3377 /*
3378 idx2 = table[ch];
3379 */
3380 if (idx2 == 0)
3381 /* This isn't a valid character. */
3382 continue;
3383 if (len == weights[idx2])
3384 {
3385 int cnt = 0;
3386 while (cnt <= len &&
3387 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3388 ++cnt;
3389
3390 if (cnt > len)
3391 bitset_set (sbcset, ch);
3392 }
3393 }
3394 /* Check whether the array has enough space. */
3395 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3396 {
3397 /* Not enough, realloc it. */
3398 Idx new_equiv_class_alloc = mbcset->nequiv_classes;
3399 /* Use realloc since the array is NULL if *alloc == 0. */
3400 int32_t *new_equiv_classes = re_x2realloc (mbcset->equiv_classes,
3401 int32_t,
3402 &new_equiv_class_alloc);
3403 if (BE (new_equiv_classes == NULL, 0))
3404 return REG_ESPACE;
3405 mbcset->equiv_classes = new_equiv_classes;
3406 *equiv_class_alloc = new_equiv_class_alloc;
3407 }
3408 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3409 }
3410 else
3411 #endif /* _LIBC */
3412 {
3413 if (BE (strlen ((const char *) name) != 1, 0))
3414 return REG_ECOLLATE;
3415 bitset_set (sbcset, *name);
3416 }
3417 return REG_NOERROR;
3418 }
3419
3420 /* Helper function for parse_bracket_exp.
3421 Build the character class which is represented by NAME.
3422 The result are written to MBCSET and SBCSET.
3423 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3424 is a pointer argument sinse we may update it. */
3425
3426 static reg_errcode_t
build_charclass(unsigned REG_TRANSLATE_TYPE trans,bitset sbcset,re_charset_t * mbcset,Idx * char_class_alloc,const unsigned char * class_name,reg_syntax_t syntax)3427 build_charclass (unsigned REG_TRANSLATE_TYPE trans, bitset sbcset,
3428 #ifdef RE_ENABLE_I18N
3429 re_charset_t *mbcset, Idx *char_class_alloc,
3430 #endif
3431 const unsigned char *class_name, reg_syntax_t syntax)
3432 {
3433 int i;
3434 const char *name = (const char *) class_name;
3435
3436 /* In case of REG_ICASE "upper" and "lower" match the both of
3437 upper and lower cases. */
3438 if ((syntax & REG_IGNORE_CASE)
3439 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3440 name = "alpha";
3441
3442 #ifdef RE_ENABLE_I18N
3443 /* Check the space of the arrays. */
3444 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3445 {
3446 /* Not enough, realloc it. */
3447 Idx new_char_class_alloc = mbcset->nchar_classes;
3448 /* Use realloc since array is NULL if *alloc == 0. */
3449 wctype_t *new_char_classes = re_x2realloc (mbcset->char_classes, wctype_t,
3450 &new_char_class_alloc);
3451 if (BE (new_char_classes == NULL, 0))
3452 return REG_ESPACE;
3453 mbcset->char_classes = new_char_classes;
3454 *char_class_alloc = new_char_class_alloc;
3455 }
3456 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3457 #endif /* RE_ENABLE_I18N */
3458
3459 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3460 for (i = 0; i < SBC_MAX; ++i) \
3461 { \
3462 if (ctype_func (i)) \
3463 { \
3464 int ch = trans ? trans[i] : i; \
3465 bitset_set (sbcset, ch); \
3466 } \
3467 }
3468
3469 if (strcmp (name, "alnum") == 0)
3470 BUILD_CHARCLASS_LOOP (isalnum)
3471 else if (strcmp (name, "cntrl") == 0)
3472 BUILD_CHARCLASS_LOOP (iscntrl)
3473 else if (strcmp (name, "lower") == 0)
3474 BUILD_CHARCLASS_LOOP (islower)
3475 else if (strcmp (name, "space") == 0)
3476 BUILD_CHARCLASS_LOOP (isspace)
3477 else if (strcmp (name, "alpha") == 0)
3478 BUILD_CHARCLASS_LOOP (isalpha)
3479 else if (strcmp (name, "digit") == 0)
3480 BUILD_CHARCLASS_LOOP (isdigit)
3481 else if (strcmp (name, "print") == 0)
3482 BUILD_CHARCLASS_LOOP (isprint)
3483 else if (strcmp (name, "upper") == 0)
3484 BUILD_CHARCLASS_LOOP (isupper)
3485 else if (strcmp (name, "blank") == 0)
3486 BUILD_CHARCLASS_LOOP (isblank)
3487 else if (strcmp (name, "graph") == 0)
3488 BUILD_CHARCLASS_LOOP (isgraph)
3489 else if (strcmp (name, "punct") == 0)
3490 BUILD_CHARCLASS_LOOP (ispunct)
3491 else if (strcmp (name, "xdigit") == 0)
3492 BUILD_CHARCLASS_LOOP (isxdigit)
3493 else
3494 return REG_ECTYPE;
3495
3496 return REG_NOERROR;
3497 }
3498
3499 static bin_tree_t *
build_charclass_op(re_dfa_t * dfa,unsigned REG_TRANSLATE_TYPE trans,const unsigned char * class_name,const unsigned char * extra,bool non_match,reg_errcode_t * err)3500 build_charclass_op (re_dfa_t *dfa, unsigned REG_TRANSLATE_TYPE trans,
3501 const unsigned char *class_name,
3502 const unsigned char *extra,
3503 bool non_match, reg_errcode_t *err)
3504 {
3505 re_bitset_ptr_t sbcset;
3506 #ifdef RE_ENABLE_I18N
3507 re_charset_t *mbcset;
3508 Idx alloc = 0;
3509 #endif /* not RE_ENABLE_I18N */
3510 reg_errcode_t ret;
3511 re_token_t br_token;
3512 bin_tree_t *tree;
3513
3514 sbcset = re_calloc (bitset_word, BITSET_WORDS);
3515 #ifdef RE_ENABLE_I18N
3516 mbcset = re_calloc (re_charset_t, 1);
3517 #endif /* RE_ENABLE_I18N */
3518
3519 #ifdef RE_ENABLE_I18N
3520 if (BE (sbcset == NULL || mbcset == NULL, 0))
3521 #else /* not RE_ENABLE_I18N */
3522 if (BE (sbcset == NULL, 0))
3523 #endif /* not RE_ENABLE_I18N */
3524 {
3525 *err = REG_ESPACE;
3526 return NULL;
3527 }
3528
3529 if (non_match)
3530 {
3531 #ifdef RE_ENABLE_I18N
3532 /*
3533 if (syntax & REG_HAT_LISTS_NOT_NEWLINE)
3534 bitset_set(cset->sbcset, '\0');
3535 */
3536 mbcset->non_match = 1;
3537 #endif /* not RE_ENABLE_I18N */
3538 }
3539
3540 /* We don't care the syntax in this case. */
3541 ret = build_charclass (trans, sbcset,
3542 #ifdef RE_ENABLE_I18N
3543 mbcset, &alloc,
3544 #endif /* RE_ENABLE_I18N */
3545 class_name, 0);
3546
3547 if (BE (ret != REG_NOERROR, 0))
3548 {
3549 re_free (sbcset);
3550 #ifdef RE_ENABLE_I18N
3551 free_charset (mbcset);
3552 #endif /* RE_ENABLE_I18N */
3553 *err = ret;
3554 return NULL;
3555 }
3556 /* \w match '_' also. */
3557 for (; *extra; extra++)
3558 bitset_set (sbcset, *extra);
3559
3560 /* If it is non-matching list. */
3561 if (non_match)
3562 bitset_not (sbcset);
3563
3564 #ifdef RE_ENABLE_I18N
3565 /* Ensure only single byte characters are set. */
3566 if (dfa->mb_cur_max > 1)
3567 bitset_mask (sbcset, dfa->sb_char);
3568 #endif
3569
3570 /* Build a tree for simple bracket. */
3571 br_token.type = SIMPLE_BRACKET;
3572 br_token.opr.sbcset = sbcset;
3573 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3574 if (BE (tree == NULL, 0))
3575 goto build_word_op_espace;
3576
3577 #ifdef RE_ENABLE_I18N
3578 if (dfa->mb_cur_max > 1)
3579 {
3580 bin_tree_t *mbc_tree;
3581 /* Build a tree for complex bracket. */
3582 br_token.type = COMPLEX_BRACKET;
3583 br_token.opr.mbcset = mbcset;
3584 dfa->has_mb_node = 1;
3585 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3586 if (BE (mbc_tree == NULL, 0))
3587 goto build_word_op_espace;
3588 /* Then join them by ALT node. */
3589 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3590 if (BE (mbc_tree != NULL, 1))
3591 return tree;
3592 }
3593 else
3594 {
3595 free_charset (mbcset);
3596 return tree;
3597 }
3598 #else /* not RE_ENABLE_I18N */
3599 return tree;
3600 #endif /* not RE_ENABLE_I18N */
3601
3602 build_word_op_espace:
3603 re_free (sbcset);
3604 #ifdef RE_ENABLE_I18N
3605 free_charset (mbcset);
3606 #endif /* RE_ENABLE_I18N */
3607 *err = REG_ESPACE;
3608 return NULL;
3609 }
3610
3611 /* This is intended for the expressions like "a{1,3}".
3612 Fetch a number from `input', and return the number.
3613 Return REG_MISSING if the number field is empty like "{,1}".
3614 Return REG_ERROR if an error occurred. */
3615
3616 static Idx
fetch_number(re_string_t * input,re_token_t * token,reg_syntax_t syntax)3617 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3618 {
3619 Idx num = REG_MISSING;
3620 unsigned char c;
3621 while (1)
3622 {
3623 fetch_token (token, input, syntax);
3624 c = token->opr.c;
3625 if (BE (token->type == END_OF_RE, 0))
3626 return REG_ERROR;
3627 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3628 break;
3629 num = ((token->type != CHARACTER || c < '0' || '9' < c
3630 || num == REG_ERROR)
3631 ? REG_ERROR
3632 : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3633 num = (num > REG_DUP_MAX) ? REG_ERROR : num;
3634 }
3635 return num;
3636 }
3637
3638 #ifdef RE_ENABLE_I18N
3639 static void
free_charset(re_charset_t * cset)3640 free_charset (re_charset_t *cset)
3641 {
3642 re_free (cset->mbchars);
3643 # ifdef _LIBC
3644 re_free (cset->coll_syms);
3645 re_free (cset->equiv_classes);
3646 re_free (cset->range_starts);
3647 re_free (cset->range_ends);
3648 # endif
3649 re_free (cset->char_classes);
3650 re_free (cset);
3651 }
3652 #endif /* RE_ENABLE_I18N */
3653
3654 /* Functions for binary tree operation. */
3655
3656 /* Create a tree node. */
3657
3658 static bin_tree_t *
create_tree(re_dfa_t * dfa,bin_tree_t * left,bin_tree_t * right,re_token_type_t type)3659 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3660 re_token_type_t type)
3661 {
3662 re_token_t t;
3663 t.type = type;
3664 return create_token_tree (dfa, left, right, &t);
3665 }
3666
3667 static bin_tree_t *
create_token_tree(re_dfa_t * dfa,bin_tree_t * left,bin_tree_t * right,const re_token_t * token)3668 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3669 const re_token_t *token)
3670 {
3671 bin_tree_t *tree;
3672 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3673 {
3674 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3675
3676 if (storage == NULL)
3677 return NULL;
3678 storage->next = dfa->str_tree_storage;
3679 dfa->str_tree_storage = storage;
3680 dfa->str_tree_storage_idx = 0;
3681 }
3682 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3683
3684 tree->parent = NULL;
3685 tree->left = left;
3686 tree->right = right;
3687 tree->token = *token;
3688 tree->token.duplicated = 0;
3689 tree->token.opt_subexp = 0;
3690 tree->first = NULL;
3691 tree->next = NULL;
3692 tree->node_idx = REG_MISSING;
3693
3694 if (left != NULL)
3695 left->parent = tree;
3696 if (right != NULL)
3697 right->parent = tree;
3698 return tree;
3699 }
3700
3701 /* Mark the tree SRC as an optional subexpression.
3702 To be called from preorder or postorder. */
3703
3704 static reg_errcode_t
mark_opt_subexp(void * extra,bin_tree_t * node)3705 mark_opt_subexp (void *extra, bin_tree_t *node)
3706 {
3707 Idx idx = (Idx) (long) extra;
3708 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3709 node->token.opt_subexp = 1;
3710
3711 return REG_NOERROR;
3712 }
3713
3714 /* Free the allocated memory inside NODE. */
3715
3716 static void
free_token(re_token_t * node)3717 free_token (re_token_t *node)
3718 {
3719 #ifdef RE_ENABLE_I18N
3720 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3721 free_charset (node->opr.mbcset);
3722 else
3723 #endif /* RE_ENABLE_I18N */
3724 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3725 re_free (node->opr.sbcset);
3726 }
3727
3728 /* Worker function for tree walking. Free the allocated memory inside NODE
3729 and its children. */
3730
3731 static reg_errcode_t
free_tree(void * extra,bin_tree_t * node)3732 free_tree (void *extra, bin_tree_t *node)
3733 {
3734 free_token (&node->token);
3735 return REG_NOERROR;
3736 }
3737
3738
3739 /* Duplicate the node SRC, and return new node. This is a preorder
3740 visit similar to the one implemented by the generic visitor, but
3741 we need more infrastructure to maintain two parallel trees --- so,
3742 it's easier to duplicate. */
3743
3744 static bin_tree_t *
duplicate_tree(const bin_tree_t * root,re_dfa_t * dfa)3745 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3746 {
3747 const bin_tree_t *node;
3748 bin_tree_t *dup_root;
3749 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3750
3751 for (node = root; ; )
3752 {
3753 /* Create a new tree and link it back to the current parent. */
3754 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3755 if (*p_new == NULL)
3756 return NULL;
3757 (*p_new)->parent = dup_node;
3758 (*p_new)->token.duplicated = 1;
3759 dup_node = *p_new;
3760
3761 /* Go to the left node, or up and to the right. */
3762 if (node->left)
3763 {
3764 node = node->left;
3765 p_new = &dup_node->left;
3766 }
3767 else
3768 {
3769 const bin_tree_t *prev = NULL;
3770 while (node->right == prev || node->right == NULL)
3771 {
3772 prev = node;
3773 node = node->parent;
3774 dup_node = dup_node->parent;
3775 if (!node)
3776 return dup_root;
3777 }
3778 node = node->right;
3779 p_new = &dup_node->right;
3780 }
3781 }
3782 }
3783