xref: /dflybsd-src/contrib/grep/lib/regcomp.c (revision 91b9ed38d3db6a8a8ac5b66da1d43e6e331e259a)
195b7b453SJohn Marino /* Extended regular expression matching and search library.
2*09d4459fSDaniel Fojt    Copyright (C) 2002-2020 Free Software Foundation, Inc.
395b7b453SJohn Marino    This file is part of the GNU C Library.
495b7b453SJohn Marino    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
595b7b453SJohn Marino 
6680a9cb8SJohn Marino    The GNU C Library is free software; you can redistribute it and/or
7680a9cb8SJohn Marino    modify it under the terms of the GNU General Public
8680a9cb8SJohn Marino    License as published by the Free Software Foundation; either
9680a9cb8SJohn Marino    version 3 of the License, or (at your option) any later version.
1095b7b453SJohn Marino 
11680a9cb8SJohn Marino    The GNU C Library is distributed in the hope that it will be useful,
1295b7b453SJohn Marino    but WITHOUT ANY WARRANTY; without even the implied warranty of
13680a9cb8SJohn Marino    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14680a9cb8SJohn Marino    General Public License for more details.
1595b7b453SJohn Marino 
16680a9cb8SJohn Marino    You should have received a copy of the GNU General Public
17680a9cb8SJohn Marino    License along with the GNU C Library; if not, see
18*09d4459fSDaniel Fojt    <https://www.gnu.org/licenses/>.  */
1995b7b453SJohn Marino 
20dc7c36e4SJohn Marino #ifdef _LIBC
21dc7c36e4SJohn Marino # include <locale/weight.h>
22dc7c36e4SJohn Marino #endif
23dc7c36e4SJohn Marino 
2495b7b453SJohn Marino static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
2595b7b453SJohn Marino 					  size_t length, reg_syntax_t syntax);
2695b7b453SJohn Marino static void re_compile_fastmap_iter (regex_t *bufp,
2795b7b453SJohn Marino 				     const re_dfastate_t *init_state,
2895b7b453SJohn Marino 				     char *fastmap);
2995b7b453SJohn Marino static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
3095b7b453SJohn Marino #ifdef RE_ENABLE_I18N
3195b7b453SJohn Marino static void free_charset (re_charset_t *cset);
3295b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
3395b7b453SJohn Marino static void free_workarea_compile (regex_t *preg);
3495b7b453SJohn Marino static reg_errcode_t create_initial_state (re_dfa_t *dfa);
3595b7b453SJohn Marino #ifdef RE_ENABLE_I18N
3695b7b453SJohn Marino static void optimize_utf8 (re_dfa_t *dfa);
3795b7b453SJohn Marino #endif
3895b7b453SJohn Marino static reg_errcode_t analyze (regex_t *preg);
3995b7b453SJohn Marino static reg_errcode_t preorder (bin_tree_t *root,
4095b7b453SJohn Marino 			       reg_errcode_t (fn (void *, bin_tree_t *)),
4195b7b453SJohn Marino 			       void *extra);
4295b7b453SJohn Marino static reg_errcode_t postorder (bin_tree_t *root,
4395b7b453SJohn Marino 				reg_errcode_t (fn (void *, bin_tree_t *)),
4495b7b453SJohn Marino 				void *extra);
4595b7b453SJohn Marino static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
4695b7b453SJohn Marino static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
4795b7b453SJohn Marino static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
4895b7b453SJohn Marino 				 bin_tree_t *node);
4995b7b453SJohn Marino static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
5095b7b453SJohn Marino static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
5195b7b453SJohn Marino static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
5295b7b453SJohn Marino static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
5395b7b453SJohn Marino static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
5495b7b453SJohn Marino 				   unsigned int constraint);
5595b7b453SJohn Marino static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
5695b7b453SJohn Marino static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
5795b7b453SJohn Marino 					 Idx node, bool root);
5895b7b453SJohn Marino static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
5995b7b453SJohn Marino static Idx fetch_number (re_string_t *input, re_token_t *token,
6095b7b453SJohn Marino 			 reg_syntax_t syntax);
6195b7b453SJohn Marino static int peek_token (re_token_t *token, re_string_t *input,
62*09d4459fSDaniel Fojt 			reg_syntax_t syntax);
6395b7b453SJohn Marino static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
6495b7b453SJohn Marino 			  reg_syntax_t syntax, reg_errcode_t *err);
6595b7b453SJohn Marino static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
6695b7b453SJohn Marino 				  re_token_t *token, reg_syntax_t syntax,
6795b7b453SJohn Marino 				  Idx nest, reg_errcode_t *err);
6895b7b453SJohn Marino static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
6995b7b453SJohn Marino 				 re_token_t *token, reg_syntax_t syntax,
7095b7b453SJohn Marino 				 Idx nest, reg_errcode_t *err);
7195b7b453SJohn Marino static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
7295b7b453SJohn Marino 				     re_token_t *token, reg_syntax_t syntax,
7395b7b453SJohn Marino 				     Idx nest, reg_errcode_t *err);
7495b7b453SJohn Marino static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
7595b7b453SJohn Marino 				  re_token_t *token, reg_syntax_t syntax,
7695b7b453SJohn Marino 				  Idx nest, reg_errcode_t *err);
7795b7b453SJohn Marino static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
7895b7b453SJohn Marino 				 re_dfa_t *dfa, re_token_t *token,
7995b7b453SJohn Marino 				 reg_syntax_t syntax, reg_errcode_t *err);
8095b7b453SJohn Marino static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
8195b7b453SJohn Marino 				      re_token_t *token, reg_syntax_t syntax,
8295b7b453SJohn Marino 				      reg_errcode_t *err);
8395b7b453SJohn Marino static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
8495b7b453SJohn Marino 					    re_string_t *regexp,
8595b7b453SJohn Marino 					    re_token_t *token, int token_len,
8695b7b453SJohn Marino 					    re_dfa_t *dfa,
8795b7b453SJohn Marino 					    reg_syntax_t syntax,
8895b7b453SJohn Marino 					    bool accept_hyphen);
8995b7b453SJohn Marino static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
9095b7b453SJohn Marino 					  re_string_t *regexp,
9195b7b453SJohn Marino 					  re_token_t *token);
9295b7b453SJohn Marino #ifdef RE_ENABLE_I18N
9395b7b453SJohn Marino static reg_errcode_t build_equiv_class (bitset_t sbcset,
9495b7b453SJohn Marino 					re_charset_t *mbcset,
9595b7b453SJohn Marino 					Idx *equiv_class_alloc,
9695b7b453SJohn Marino 					const unsigned char *name);
9795b7b453SJohn Marino static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
9895b7b453SJohn Marino 				      bitset_t sbcset,
9995b7b453SJohn Marino 				      re_charset_t *mbcset,
10095b7b453SJohn Marino 				      Idx *char_class_alloc,
101680a9cb8SJohn Marino 				      const char *class_name,
10295b7b453SJohn Marino 				      reg_syntax_t syntax);
10395b7b453SJohn Marino #else  /* not RE_ENABLE_I18N */
10495b7b453SJohn Marino static reg_errcode_t build_equiv_class (bitset_t sbcset,
10595b7b453SJohn Marino 					const unsigned char *name);
10695b7b453SJohn Marino static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
10795b7b453SJohn Marino 				      bitset_t sbcset,
108680a9cb8SJohn Marino 				      const char *class_name,
10995b7b453SJohn Marino 				      reg_syntax_t syntax);
11095b7b453SJohn Marino #endif /* not RE_ENABLE_I18N */
11195b7b453SJohn Marino static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
11295b7b453SJohn Marino 				       RE_TRANSLATE_TYPE trans,
113680a9cb8SJohn Marino 				       const char *class_name,
114680a9cb8SJohn Marino 				       const char *extra,
11595b7b453SJohn Marino 				       bool non_match, reg_errcode_t *err);
11695b7b453SJohn Marino static bin_tree_t *create_tree (re_dfa_t *dfa,
11795b7b453SJohn Marino 				bin_tree_t *left, bin_tree_t *right,
11895b7b453SJohn Marino 				re_token_type_t type);
11995b7b453SJohn Marino static bin_tree_t *create_token_tree (re_dfa_t *dfa,
12095b7b453SJohn Marino 				      bin_tree_t *left, bin_tree_t *right,
12195b7b453SJohn Marino 				      const re_token_t *token);
12295b7b453SJohn Marino static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
12395b7b453SJohn Marino static void free_token (re_token_t *node);
12495b7b453SJohn Marino static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
12595b7b453SJohn Marino static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
12695b7b453SJohn Marino 
12795b7b453SJohn Marino /* This table gives an error message for each of the error codes listed
12895b7b453SJohn Marino    in regex.h.  Obviously the order here has to be same as there.
12995b7b453SJohn Marino    POSIX doesn't require that we do anything for REG_NOERROR,
13095b7b453SJohn Marino    but why not be nice?  */
13195b7b453SJohn Marino 
13295b7b453SJohn Marino static const char __re_error_msgid[] =
13395b7b453SJohn Marino   {
13495b7b453SJohn Marino #define REG_NOERROR_IDX	0
13595b7b453SJohn Marino     gettext_noop ("Success")	/* REG_NOERROR */
13695b7b453SJohn Marino     "\0"
13795b7b453SJohn Marino #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
13895b7b453SJohn Marino     gettext_noop ("No match")	/* REG_NOMATCH */
13995b7b453SJohn Marino     "\0"
14095b7b453SJohn Marino #define REG_BADPAT_IDX	(REG_NOMATCH_IDX + sizeof "No match")
14195b7b453SJohn Marino     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
14295b7b453SJohn Marino     "\0"
14395b7b453SJohn Marino #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
14495b7b453SJohn Marino     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
14595b7b453SJohn Marino     "\0"
14695b7b453SJohn Marino #define REG_ECTYPE_IDX	(REG_ECOLLATE_IDX + sizeof "Invalid collation character")
14795b7b453SJohn Marino     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
14895b7b453SJohn Marino     "\0"
14995b7b453SJohn Marino #define REG_EESCAPE_IDX	(REG_ECTYPE_IDX + sizeof "Invalid character class name")
15095b7b453SJohn Marino     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
15195b7b453SJohn Marino     "\0"
15295b7b453SJohn Marino #define REG_ESUBREG_IDX	(REG_EESCAPE_IDX + sizeof "Trailing backslash")
15395b7b453SJohn Marino     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
15495b7b453SJohn Marino     "\0"
15595b7b453SJohn Marino #define REG_EBRACK_IDX	(REG_ESUBREG_IDX + sizeof "Invalid back reference")
156*09d4459fSDaniel Fojt     gettext_noop ("Unmatched [, [^, [:, [., or [=")	/* REG_EBRACK */
15795b7b453SJohn Marino     "\0"
158*09d4459fSDaniel Fojt #define REG_EPAREN_IDX	(REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
15995b7b453SJohn Marino     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
16095b7b453SJohn Marino     "\0"
16195b7b453SJohn Marino #define REG_EBRACE_IDX	(REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
16295b7b453SJohn Marino     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
16395b7b453SJohn Marino     "\0"
16495b7b453SJohn Marino #define REG_BADBR_IDX	(REG_EBRACE_IDX + sizeof "Unmatched \\{")
16595b7b453SJohn Marino     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
16695b7b453SJohn Marino     "\0"
16795b7b453SJohn Marino #define REG_ERANGE_IDX	(REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
16895b7b453SJohn Marino     gettext_noop ("Invalid range end")	/* REG_ERANGE */
16995b7b453SJohn Marino     "\0"
17095b7b453SJohn Marino #define REG_ESPACE_IDX	(REG_ERANGE_IDX + sizeof "Invalid range end")
17195b7b453SJohn Marino     gettext_noop ("Memory exhausted") /* REG_ESPACE */
17295b7b453SJohn Marino     "\0"
17395b7b453SJohn Marino #define REG_BADRPT_IDX	(REG_ESPACE_IDX + sizeof "Memory exhausted")
17495b7b453SJohn Marino     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
17595b7b453SJohn Marino     "\0"
17695b7b453SJohn Marino #define REG_EEND_IDX	(REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
17795b7b453SJohn Marino     gettext_noop ("Premature end of regular expression") /* REG_EEND */
17895b7b453SJohn Marino     "\0"
17995b7b453SJohn Marino #define REG_ESIZE_IDX	(REG_EEND_IDX + sizeof "Premature end of regular expression")
18095b7b453SJohn Marino     gettext_noop ("Regular expression too big") /* REG_ESIZE */
18195b7b453SJohn Marino     "\0"
18295b7b453SJohn Marino #define REG_ERPAREN_IDX	(REG_ESIZE_IDX + sizeof "Regular expression too big")
18395b7b453SJohn Marino     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
18495b7b453SJohn Marino   };
18595b7b453SJohn Marino 
18695b7b453SJohn Marino static const size_t __re_error_msgid_idx[] =
18795b7b453SJohn Marino   {
18895b7b453SJohn Marino     REG_NOERROR_IDX,
18995b7b453SJohn Marino     REG_NOMATCH_IDX,
19095b7b453SJohn Marino     REG_BADPAT_IDX,
19195b7b453SJohn Marino     REG_ECOLLATE_IDX,
19295b7b453SJohn Marino     REG_ECTYPE_IDX,
19395b7b453SJohn Marino     REG_EESCAPE_IDX,
19495b7b453SJohn Marino     REG_ESUBREG_IDX,
19595b7b453SJohn Marino     REG_EBRACK_IDX,
19695b7b453SJohn Marino     REG_EPAREN_IDX,
19795b7b453SJohn Marino     REG_EBRACE_IDX,
19895b7b453SJohn Marino     REG_BADBR_IDX,
19995b7b453SJohn Marino     REG_ERANGE_IDX,
20095b7b453SJohn Marino     REG_ESPACE_IDX,
20195b7b453SJohn Marino     REG_BADRPT_IDX,
20295b7b453SJohn Marino     REG_EEND_IDX,
20395b7b453SJohn Marino     REG_ESIZE_IDX,
20495b7b453SJohn Marino     REG_ERPAREN_IDX
20595b7b453SJohn Marino   };
20695b7b453SJohn Marino 
20795b7b453SJohn Marino /* Entry points for GNU code.  */
20895b7b453SJohn Marino 
20995b7b453SJohn Marino /* re_compile_pattern is the GNU regular expression compiler: it
21095b7b453SJohn Marino    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
21195b7b453SJohn Marino    Returns 0 if the pattern was valid, otherwise an error string.
21295b7b453SJohn Marino 
213cf28ed85SJohn Marino    Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
21495b7b453SJohn Marino    are set in BUFP on entry.  */
21595b7b453SJohn Marino 
21695b7b453SJohn Marino const char *
re_compile_pattern(const char * pattern,size_t length,struct re_pattern_buffer * bufp)21795b7b453SJohn Marino re_compile_pattern (const char *pattern, size_t length,
21895b7b453SJohn Marino 		    struct re_pattern_buffer *bufp)
21995b7b453SJohn Marino {
22095b7b453SJohn Marino   reg_errcode_t ret;
22195b7b453SJohn Marino 
22295b7b453SJohn Marino   /* And GNU code determines whether or not to get register information
22395b7b453SJohn Marino      by passing null for the REGS argument to re_match, etc., not by
22495b7b453SJohn Marino      setting no_sub, unless RE_NO_SUB is set.  */
22595b7b453SJohn Marino   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
22695b7b453SJohn Marino 
22795b7b453SJohn Marino   /* Match anchors at newline.  */
22895b7b453SJohn Marino   bufp->newline_anchor = 1;
22995b7b453SJohn Marino 
23095b7b453SJohn Marino   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
23195b7b453SJohn Marino 
23295b7b453SJohn Marino   if (!ret)
23395b7b453SJohn Marino     return NULL;
23495b7b453SJohn Marino   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
23595b7b453SJohn Marino }
weak_alias(__re_compile_pattern,re_compile_pattern)23695b7b453SJohn Marino weak_alias (__re_compile_pattern, re_compile_pattern)
23795b7b453SJohn Marino 
238cf28ed85SJohn Marino /* Set by 're_set_syntax' to the current regexp syntax to recognize.  Can
23995b7b453SJohn Marino    also be assigned to arbitrarily: each pattern buffer stores its own
24095b7b453SJohn Marino    syntax, so it can be changed between regex compilations.  */
24195b7b453SJohn Marino /* This has no initializer because initialized variables in Emacs
24295b7b453SJohn Marino    become read-only after dumping.  */
24395b7b453SJohn Marino reg_syntax_t re_syntax_options;
24495b7b453SJohn Marino 
24595b7b453SJohn Marino 
24695b7b453SJohn Marino /* Specify the precise syntax of regexps for compilation.  This provides
24795b7b453SJohn Marino    for compatibility for various utilities which historically have
24895b7b453SJohn Marino    different, incompatible syntaxes.
24995b7b453SJohn Marino 
25095b7b453SJohn Marino    The argument SYNTAX is a bit mask comprised of the various bits
25195b7b453SJohn Marino    defined in regex.h.  We return the old syntax.  */
25295b7b453SJohn Marino 
25395b7b453SJohn Marino reg_syntax_t
254*09d4459fSDaniel Fojt re_set_syntax (reg_syntax_t syntax)
25595b7b453SJohn Marino {
25695b7b453SJohn Marino   reg_syntax_t ret = re_syntax_options;
25795b7b453SJohn Marino 
25895b7b453SJohn Marino   re_syntax_options = syntax;
25995b7b453SJohn Marino   return ret;
26095b7b453SJohn Marino }
weak_alias(__re_set_syntax,re_set_syntax)26195b7b453SJohn Marino weak_alias (__re_set_syntax, re_set_syntax)
26295b7b453SJohn Marino 
26395b7b453SJohn Marino int
264*09d4459fSDaniel Fojt re_compile_fastmap (struct re_pattern_buffer *bufp)
26595b7b453SJohn Marino {
266cf28ed85SJohn Marino   re_dfa_t *dfa = bufp->buffer;
26795b7b453SJohn Marino   char *fastmap = bufp->fastmap;
26895b7b453SJohn Marino 
26995b7b453SJohn Marino   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
27095b7b453SJohn Marino   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
27195b7b453SJohn Marino   if (dfa->init_state != dfa->init_state_word)
27295b7b453SJohn Marino     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
27395b7b453SJohn Marino   if (dfa->init_state != dfa->init_state_nl)
27495b7b453SJohn Marino     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
27595b7b453SJohn Marino   if (dfa->init_state != dfa->init_state_begbuf)
27695b7b453SJohn Marino     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
27795b7b453SJohn Marino   bufp->fastmap_accurate = 1;
27895b7b453SJohn Marino   return 0;
27995b7b453SJohn Marino }
weak_alias(__re_compile_fastmap,re_compile_fastmap)28095b7b453SJohn Marino weak_alias (__re_compile_fastmap, re_compile_fastmap)
28195b7b453SJohn Marino 
28295b7b453SJohn Marino static inline void
283680a9cb8SJohn Marino __attribute__ ((always_inline))
28495b7b453SJohn Marino re_set_fastmap (char *fastmap, bool icase, int ch)
28595b7b453SJohn Marino {
28695b7b453SJohn Marino   fastmap[ch] = 1;
28795b7b453SJohn Marino   if (icase)
28895b7b453SJohn Marino     fastmap[tolower (ch)] = 1;
28995b7b453SJohn Marino }
29095b7b453SJohn Marino 
29195b7b453SJohn Marino /* Helper function for re_compile_fastmap.
29295b7b453SJohn Marino    Compile fastmap for the initial_state INIT_STATE.  */
29395b7b453SJohn Marino 
29495b7b453SJohn Marino static void
re_compile_fastmap_iter(regex_t * bufp,const re_dfastate_t * init_state,char * fastmap)29595b7b453SJohn Marino re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
29695b7b453SJohn Marino 			 char *fastmap)
29795b7b453SJohn Marino {
298cf28ed85SJohn Marino   re_dfa_t *dfa = bufp->buffer;
29995b7b453SJohn Marino   Idx node_cnt;
30095b7b453SJohn Marino   bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
30195b7b453SJohn Marino   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
30295b7b453SJohn Marino     {
30395b7b453SJohn Marino       Idx node = init_state->nodes.elems[node_cnt];
30495b7b453SJohn Marino       re_token_type_t type = dfa->nodes[node].type;
30595b7b453SJohn Marino 
30695b7b453SJohn Marino       if (type == CHARACTER)
30795b7b453SJohn Marino 	{
30895b7b453SJohn Marino 	  re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
30995b7b453SJohn Marino #ifdef RE_ENABLE_I18N
31095b7b453SJohn Marino 	  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
31195b7b453SJohn Marino 	    {
31295b7b453SJohn Marino 	      unsigned char buf[MB_LEN_MAX];
31395b7b453SJohn Marino 	      unsigned char *p;
31495b7b453SJohn Marino 	      wchar_t wc;
31595b7b453SJohn Marino 	      mbstate_t state;
31695b7b453SJohn Marino 
31795b7b453SJohn Marino 	      p = buf;
31895b7b453SJohn Marino 	      *p++ = dfa->nodes[node].opr.c;
31995b7b453SJohn Marino 	      while (++node < dfa->nodes_len
32095b7b453SJohn Marino 		     &&	dfa->nodes[node].type == CHARACTER
32195b7b453SJohn Marino 		     && dfa->nodes[node].mb_partial)
32295b7b453SJohn Marino 		*p++ = dfa->nodes[node].opr.c;
32395b7b453SJohn Marino 	      memset (&state, '\0', sizeof (state));
32495b7b453SJohn Marino 	      if (__mbrtowc (&wc, (const char *) buf, p - buf,
32595b7b453SJohn Marino 			     &state) == p - buf
326dc7c36e4SJohn Marino 		  && (__wcrtomb ((char *) buf, __towlower (wc), &state)
32795b7b453SJohn Marino 		      != (size_t) -1))
32895b7b453SJohn Marino 		re_set_fastmap (fastmap, false, buf[0]);
32995b7b453SJohn Marino 	    }
33095b7b453SJohn Marino #endif
33195b7b453SJohn Marino 	}
33295b7b453SJohn Marino       else if (type == SIMPLE_BRACKET)
33395b7b453SJohn Marino 	{
33495b7b453SJohn Marino 	  int i, ch;
33595b7b453SJohn Marino 	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
33695b7b453SJohn Marino 	    {
33795b7b453SJohn Marino 	      int j;
33895b7b453SJohn Marino 	      bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
33995b7b453SJohn Marino 	      for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
34095b7b453SJohn Marino 		if (w & ((bitset_word_t) 1 << j))
34195b7b453SJohn Marino 		  re_set_fastmap (fastmap, icase, ch);
34295b7b453SJohn Marino 	    }
34395b7b453SJohn Marino 	}
34495b7b453SJohn Marino #ifdef RE_ENABLE_I18N
34595b7b453SJohn Marino       else if (type == COMPLEX_BRACKET)
34695b7b453SJohn Marino 	{
34795b7b453SJohn Marino 	  re_charset_t *cset = dfa->nodes[node].opr.mbcset;
34895b7b453SJohn Marino 	  Idx i;
34995b7b453SJohn Marino 
35095b7b453SJohn Marino # ifdef _LIBC
35195b7b453SJohn Marino 	  /* See if we have to try all bytes which start multiple collation
35295b7b453SJohn Marino 	     elements.
35395b7b453SJohn Marino 	     e.g. In da_DK, we want to catch 'a' since "aa" is a valid
35495b7b453SJohn Marino 		  collation element, and don't catch 'b' since 'b' is
35595b7b453SJohn Marino 		  the only collation element which starts from 'b' (and
35695b7b453SJohn Marino 		  it is caught by SIMPLE_BRACKET).  */
35795b7b453SJohn Marino 	      if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
35895b7b453SJohn Marino 		  && (cset->ncoll_syms || cset->nranges))
35995b7b453SJohn Marino 		{
36095b7b453SJohn Marino 		  const int32_t *table = (const int32_t *)
36195b7b453SJohn Marino 		    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
36295b7b453SJohn Marino 		  for (i = 0; i < SBC_MAX; ++i)
36395b7b453SJohn Marino 		    if (table[i] < 0)
36495b7b453SJohn Marino 		      re_set_fastmap (fastmap, icase, i);
36595b7b453SJohn Marino 		}
36695b7b453SJohn Marino # endif /* _LIBC */
36795b7b453SJohn Marino 
36895b7b453SJohn Marino 	  /* See if we have to start the match at all multibyte characters,
36995b7b453SJohn Marino 	     i.e. where we would not find an invalid sequence.  This only
37095b7b453SJohn Marino 	     applies to multibyte character sets; for single byte character
37195b7b453SJohn Marino 	     sets, the SIMPLE_BRACKET again suffices.  */
37295b7b453SJohn Marino 	  if (dfa->mb_cur_max > 1
37395b7b453SJohn Marino 	      && (cset->nchar_classes || cset->non_match || cset->nranges
37495b7b453SJohn Marino # ifdef _LIBC
37595b7b453SJohn Marino 		  || cset->nequiv_classes
37695b7b453SJohn Marino # endif /* _LIBC */
37795b7b453SJohn Marino 		 ))
37895b7b453SJohn Marino 	    {
37995b7b453SJohn Marino 	      unsigned char c = 0;
38095b7b453SJohn Marino 	      do
38195b7b453SJohn Marino 		{
38295b7b453SJohn Marino 		  mbstate_t mbs;
38395b7b453SJohn Marino 		  memset (&mbs, 0, sizeof (mbs));
38495b7b453SJohn Marino 		  if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
38595b7b453SJohn Marino 		    re_set_fastmap (fastmap, false, (int) c);
38695b7b453SJohn Marino 		}
38795b7b453SJohn Marino 	      while (++c != 0);
38895b7b453SJohn Marino 	    }
38995b7b453SJohn Marino 
39095b7b453SJohn Marino 	  else
39195b7b453SJohn Marino 	    {
39295b7b453SJohn Marino 	      /* ... Else catch all bytes which can start the mbchars.  */
39395b7b453SJohn Marino 	      for (i = 0; i < cset->nmbchars; ++i)
39495b7b453SJohn Marino 		{
39595b7b453SJohn Marino 		  char buf[256];
39695b7b453SJohn Marino 		  mbstate_t state;
39795b7b453SJohn Marino 		  memset (&state, '\0', sizeof (state));
39895b7b453SJohn Marino 		  if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
39995b7b453SJohn Marino 		    re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
40095b7b453SJohn Marino 		  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
40195b7b453SJohn Marino 		    {
402dc7c36e4SJohn Marino 		      if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
40395b7b453SJohn Marino 			  != (size_t) -1)
40495b7b453SJohn Marino 			re_set_fastmap (fastmap, false, *(unsigned char *) buf);
40595b7b453SJohn Marino 		    }
40695b7b453SJohn Marino 		}
40795b7b453SJohn Marino 	    }
40895b7b453SJohn Marino 	}
40995b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
41095b7b453SJohn Marino       else if (type == OP_PERIOD
41195b7b453SJohn Marino #ifdef RE_ENABLE_I18N
41295b7b453SJohn Marino 	       || type == OP_UTF8_PERIOD
41395b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
41495b7b453SJohn Marino 	       || type == END_OF_RE)
41595b7b453SJohn Marino 	{
41695b7b453SJohn Marino 	  memset (fastmap, '\1', sizeof (char) * SBC_MAX);
41795b7b453SJohn Marino 	  if (type == END_OF_RE)
41895b7b453SJohn Marino 	    bufp->can_be_null = 1;
41995b7b453SJohn Marino 	  return;
42095b7b453SJohn Marino 	}
42195b7b453SJohn Marino     }
42295b7b453SJohn Marino }
42395b7b453SJohn Marino 
42495b7b453SJohn Marino /* Entry point for POSIX code.  */
42595b7b453SJohn Marino /* regcomp takes a regular expression as a string and compiles it.
42695b7b453SJohn Marino 
42795b7b453SJohn Marino    PREG is a regex_t *.  We do not expect any fields to be initialized,
42895b7b453SJohn Marino    since POSIX says we shouldn't.  Thus, we set
42995b7b453SJohn Marino 
430cf28ed85SJohn Marino      'buffer' to the compiled pattern;
431cf28ed85SJohn Marino      'used' to the length of the compiled pattern;
432cf28ed85SJohn Marino      'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
43395b7b453SJohn Marino        REG_EXTENDED bit in CFLAGS is set; otherwise, to
43495b7b453SJohn Marino        RE_SYNTAX_POSIX_BASIC;
435cf28ed85SJohn Marino      'newline_anchor' to REG_NEWLINE being set in CFLAGS;
436cf28ed85SJohn Marino      'fastmap' to an allocated space for the fastmap;
437cf28ed85SJohn Marino      'fastmap_accurate' to zero;
438cf28ed85SJohn Marino      're_nsub' to the number of subexpressions in PATTERN.
43995b7b453SJohn Marino 
44095b7b453SJohn Marino    PATTERN is the address of the pattern string.
44195b7b453SJohn Marino 
44295b7b453SJohn Marino    CFLAGS is a series of bits which affect compilation.
44395b7b453SJohn Marino 
44495b7b453SJohn Marino      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
44595b7b453SJohn Marino      use POSIX basic syntax.
44695b7b453SJohn Marino 
44795b7b453SJohn Marino      If REG_NEWLINE is set, then . and [^...] don't match newline.
44895b7b453SJohn Marino      Also, regexec will try a match beginning after every newline.
44995b7b453SJohn Marino 
45095b7b453SJohn Marino      If REG_ICASE is set, then we considers upper- and lowercase
45195b7b453SJohn Marino      versions of letters to be equivalent when matching.
45295b7b453SJohn Marino 
45395b7b453SJohn Marino      If REG_NOSUB is set, then when PREG is passed to regexec, that
45495b7b453SJohn Marino      routine will report only success or failure, and nothing about the
45595b7b453SJohn Marino      registers.
45695b7b453SJohn Marino 
45795b7b453SJohn Marino    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
45895b7b453SJohn Marino    the return codes and their meanings.)  */
45995b7b453SJohn Marino 
46095b7b453SJohn Marino int
regcomp(regex_t * __restrict preg,const char * __restrict pattern,int cflags)461*09d4459fSDaniel Fojt regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
46295b7b453SJohn Marino {
46395b7b453SJohn Marino   reg_errcode_t ret;
46495b7b453SJohn Marino   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
46595b7b453SJohn Marino 			 : RE_SYNTAX_POSIX_BASIC);
46695b7b453SJohn Marino 
46795b7b453SJohn Marino   preg->buffer = NULL;
46895b7b453SJohn Marino   preg->allocated = 0;
46995b7b453SJohn Marino   preg->used = 0;
47095b7b453SJohn Marino 
47195b7b453SJohn Marino   /* Try to allocate space for the fastmap.  */
47295b7b453SJohn Marino   preg->fastmap = re_malloc (char, SBC_MAX);
473*09d4459fSDaniel Fojt   if (__glibc_unlikely (preg->fastmap == NULL))
47495b7b453SJohn Marino     return REG_ESPACE;
47595b7b453SJohn Marino 
47695b7b453SJohn Marino   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
47795b7b453SJohn Marino 
47895b7b453SJohn Marino   /* If REG_NEWLINE is set, newlines are treated differently.  */
47995b7b453SJohn Marino   if (cflags & REG_NEWLINE)
48095b7b453SJohn Marino     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
48195b7b453SJohn Marino       syntax &= ~RE_DOT_NEWLINE;
48295b7b453SJohn Marino       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
48395b7b453SJohn Marino       /* It also changes the matching behavior.  */
48495b7b453SJohn Marino       preg->newline_anchor = 1;
48595b7b453SJohn Marino     }
48695b7b453SJohn Marino   else
48795b7b453SJohn Marino     preg->newline_anchor = 0;
48895b7b453SJohn Marino   preg->no_sub = !!(cflags & REG_NOSUB);
48995b7b453SJohn Marino   preg->translate = NULL;
49095b7b453SJohn Marino 
49195b7b453SJohn Marino   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
49295b7b453SJohn Marino 
49395b7b453SJohn Marino   /* POSIX doesn't distinguish between an unmatched open-group and an
49495b7b453SJohn Marino      unmatched close-group: both are REG_EPAREN.  */
49595b7b453SJohn Marino   if (ret == REG_ERPAREN)
49695b7b453SJohn Marino     ret = REG_EPAREN;
49795b7b453SJohn Marino 
49895b7b453SJohn Marino   /* We have already checked preg->fastmap != NULL.  */
499*09d4459fSDaniel Fojt   if (__glibc_likely (ret == REG_NOERROR))
50095b7b453SJohn Marino     /* Compute the fastmap now, since regexec cannot modify the pattern
50195b7b453SJohn Marino        buffer.  This function never fails in this implementation.  */
50295b7b453SJohn Marino     (void) re_compile_fastmap (preg);
50395b7b453SJohn Marino   else
50495b7b453SJohn Marino     {
50595b7b453SJohn Marino       /* Some error occurred while compiling the expression.  */
50695b7b453SJohn Marino       re_free (preg->fastmap);
50795b7b453SJohn Marino       preg->fastmap = NULL;
50895b7b453SJohn Marino     }
50995b7b453SJohn Marino 
51095b7b453SJohn Marino   return (int) ret;
51195b7b453SJohn Marino }
512*09d4459fSDaniel Fojt libc_hidden_def (__regcomp)
weak_alias(__regcomp,regcomp)51395b7b453SJohn Marino weak_alias (__regcomp, regcomp)
51495b7b453SJohn Marino 
51595b7b453SJohn Marino /* Returns a message corresponding to an error code, ERRCODE, returned
51695b7b453SJohn Marino    from either regcomp or regexec.   We don't use PREG here.  */
51795b7b453SJohn Marino 
51895b7b453SJohn Marino size_t
519*09d4459fSDaniel Fojt regerror (int errcode, const regex_t *__restrict preg, char *__restrict errbuf,
520*09d4459fSDaniel Fojt 	  size_t errbuf_size)
52195b7b453SJohn Marino {
52295b7b453SJohn Marino   const char *msg;
52395b7b453SJohn Marino   size_t msg_size;
524*09d4459fSDaniel Fojt   int nerrcodes = sizeof __re_error_msgid_idx / sizeof __re_error_msgid_idx[0];
52595b7b453SJohn Marino 
526*09d4459fSDaniel Fojt   if (__glibc_unlikely (errcode < 0 || errcode >= nerrcodes))
52795b7b453SJohn Marino     /* Only error codes returned by the rest of the code should be passed
52895b7b453SJohn Marino        to this routine.  If we are given anything else, or if other regex
52995b7b453SJohn Marino        code generates an invalid error code, then the program has a bug.
53095b7b453SJohn Marino        Dump core so we can fix it.  */
53195b7b453SJohn Marino     abort ();
53295b7b453SJohn Marino 
53395b7b453SJohn Marino   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
53495b7b453SJohn Marino 
53595b7b453SJohn Marino   msg_size = strlen (msg) + 1; /* Includes the null.  */
53695b7b453SJohn Marino 
537*09d4459fSDaniel Fojt   if (__glibc_likely (errbuf_size != 0))
53895b7b453SJohn Marino     {
53995b7b453SJohn Marino       size_t cpy_size = msg_size;
540*09d4459fSDaniel Fojt       if (__glibc_unlikely (msg_size > errbuf_size))
54195b7b453SJohn Marino 	{
54295b7b453SJohn Marino 	  cpy_size = errbuf_size - 1;
54395b7b453SJohn Marino 	  errbuf[cpy_size] = '\0';
54495b7b453SJohn Marino 	}
54595b7b453SJohn Marino       memcpy (errbuf, msg, cpy_size);
54695b7b453SJohn Marino     }
54795b7b453SJohn Marino 
54895b7b453SJohn Marino   return msg_size;
54995b7b453SJohn Marino }
55095b7b453SJohn Marino weak_alias (__regerror, regerror)
55195b7b453SJohn Marino 
55295b7b453SJohn Marino 
55395b7b453SJohn Marino #ifdef RE_ENABLE_I18N
55495b7b453SJohn Marino /* This static array is used for the map to single-byte characters when
55595b7b453SJohn Marino    UTF-8 is used.  Otherwise we would allocate memory just to initialize
55695b7b453SJohn Marino    it the same all the time.  UTF-8 is the preferred encoding so this is
55795b7b453SJohn Marino    a worthwhile optimization.  */
55895b7b453SJohn Marino static const bitset_t utf8_sb_map =
55995b7b453SJohn Marino {
56095b7b453SJohn Marino   /* Set the first 128 bits.  */
561680a9cb8SJohn Marino # if defined __GNUC__ && !defined __STRICT_ANSI__
562cf28ed85SJohn Marino   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
563cf28ed85SJohn Marino # else
56495b7b453SJohn Marino #  if 4 * BITSET_WORD_BITS < ASCII_CHARS
56595b7b453SJohn Marino #   error "bitset_word_t is narrower than 32 bits"
56695b7b453SJohn Marino #  elif 3 * BITSET_WORD_BITS < ASCII_CHARS
56795b7b453SJohn Marino   BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
56895b7b453SJohn Marino #  elif 2 * BITSET_WORD_BITS < ASCII_CHARS
56995b7b453SJohn Marino   BITSET_WORD_MAX, BITSET_WORD_MAX,
57095b7b453SJohn Marino #  elif 1 * BITSET_WORD_BITS < ASCII_CHARS
57195b7b453SJohn Marino   BITSET_WORD_MAX,
57295b7b453SJohn Marino #  endif
57395b7b453SJohn Marino   (BITSET_WORD_MAX
57495b7b453SJohn Marino    >> (SBC_MAX % BITSET_WORD_BITS == 0
57595b7b453SJohn Marino        ? 0
57695b7b453SJohn Marino        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
577cf28ed85SJohn Marino # endif
57895b7b453SJohn Marino };
57995b7b453SJohn Marino #endif
58095b7b453SJohn Marino 
58195b7b453SJohn Marino 
58295b7b453SJohn Marino static void
free_dfa_content(re_dfa_t * dfa)58395b7b453SJohn Marino free_dfa_content (re_dfa_t *dfa)
58495b7b453SJohn Marino {
58595b7b453SJohn Marino   Idx i, j;
58695b7b453SJohn Marino 
58795b7b453SJohn Marino   if (dfa->nodes)
58895b7b453SJohn Marino     for (i = 0; i < dfa->nodes_len; ++i)
58995b7b453SJohn Marino       free_token (dfa->nodes + i);
59095b7b453SJohn Marino   re_free (dfa->nexts);
59195b7b453SJohn Marino   for (i = 0; i < dfa->nodes_len; ++i)
59295b7b453SJohn Marino     {
59395b7b453SJohn Marino       if (dfa->eclosures != NULL)
59495b7b453SJohn Marino 	re_node_set_free (dfa->eclosures + i);
59595b7b453SJohn Marino       if (dfa->inveclosures != NULL)
59695b7b453SJohn Marino 	re_node_set_free (dfa->inveclosures + i);
59795b7b453SJohn Marino       if (dfa->edests != NULL)
59895b7b453SJohn Marino 	re_node_set_free (dfa->edests + i);
59995b7b453SJohn Marino     }
60095b7b453SJohn Marino   re_free (dfa->edests);
60195b7b453SJohn Marino   re_free (dfa->eclosures);
60295b7b453SJohn Marino   re_free (dfa->inveclosures);
60395b7b453SJohn Marino   re_free (dfa->nodes);
60495b7b453SJohn Marino 
60595b7b453SJohn Marino   if (dfa->state_table)
60695b7b453SJohn Marino     for (i = 0; i <= dfa->state_hash_mask; ++i)
60795b7b453SJohn Marino       {
60895b7b453SJohn Marino 	struct re_state_table_entry *entry = dfa->state_table + i;
60995b7b453SJohn Marino 	for (j = 0; j < entry->num; ++j)
61095b7b453SJohn Marino 	  {
61195b7b453SJohn Marino 	    re_dfastate_t *state = entry->array[j];
61295b7b453SJohn Marino 	    free_state (state);
61395b7b453SJohn Marino 	  }
61495b7b453SJohn Marino 	re_free (entry->array);
61595b7b453SJohn Marino       }
61695b7b453SJohn Marino   re_free (dfa->state_table);
61795b7b453SJohn Marino #ifdef RE_ENABLE_I18N
61895b7b453SJohn Marino   if (dfa->sb_char != utf8_sb_map)
61995b7b453SJohn Marino     re_free (dfa->sb_char);
62095b7b453SJohn Marino #endif
62195b7b453SJohn Marino   re_free (dfa->subexp_map);
62295b7b453SJohn Marino #ifdef DEBUG
62395b7b453SJohn Marino   re_free (dfa->re_str);
62495b7b453SJohn Marino #endif
62595b7b453SJohn Marino 
62695b7b453SJohn Marino   re_free (dfa);
62795b7b453SJohn Marino }
62895b7b453SJohn Marino 
62995b7b453SJohn Marino 
63095b7b453SJohn Marino /* Free dynamically allocated space used by PREG.  */
63195b7b453SJohn Marino 
63295b7b453SJohn Marino void
regfree(regex_t * preg)633*09d4459fSDaniel Fojt regfree (regex_t *preg)
63495b7b453SJohn Marino {
635cf28ed85SJohn Marino   re_dfa_t *dfa = preg->buffer;
636*09d4459fSDaniel Fojt   if (__glibc_likely (dfa != NULL))
637680a9cb8SJohn Marino     {
638680a9cb8SJohn Marino       lock_fini (dfa->lock);
63995b7b453SJohn Marino       free_dfa_content (dfa);
640680a9cb8SJohn Marino     }
64195b7b453SJohn Marino   preg->buffer = NULL;
64295b7b453SJohn Marino   preg->allocated = 0;
64395b7b453SJohn Marino 
64495b7b453SJohn Marino   re_free (preg->fastmap);
64595b7b453SJohn Marino   preg->fastmap = NULL;
64695b7b453SJohn Marino 
64795b7b453SJohn Marino   re_free (preg->translate);
64895b7b453SJohn Marino   preg->translate = NULL;
64995b7b453SJohn Marino }
650*09d4459fSDaniel Fojt libc_hidden_def (__regfree)
65195b7b453SJohn Marino weak_alias (__regfree, regfree)
65295b7b453SJohn Marino 
65395b7b453SJohn Marino /* Entry points compatible with 4.2 BSD regex library.  We don't define
65495b7b453SJohn Marino    them unless specifically requested.  */
65595b7b453SJohn Marino 
65695b7b453SJohn Marino #if defined _REGEX_RE_COMP || defined _LIBC
65795b7b453SJohn Marino 
65895b7b453SJohn Marino /* BSD has one and only one pattern buffer.  */
65995b7b453SJohn Marino static struct re_pattern_buffer re_comp_buf;
66095b7b453SJohn Marino 
66195b7b453SJohn Marino char *
66295b7b453SJohn Marino # ifdef _LIBC
66395b7b453SJohn Marino /* Make these definitions weak in libc, so POSIX programs can redefine
66495b7b453SJohn Marino    these names if they don't use our functions, and still use
66595b7b453SJohn Marino    regcomp/regexec above without link errors.  */
66695b7b453SJohn Marino weak_function
66795b7b453SJohn Marino # endif
re_comp(const char * s)668*09d4459fSDaniel Fojt re_comp (const char *s)
66995b7b453SJohn Marino {
67095b7b453SJohn Marino   reg_errcode_t ret;
67195b7b453SJohn Marino   char *fastmap;
67295b7b453SJohn Marino 
67395b7b453SJohn Marino   if (!s)
67495b7b453SJohn Marino     {
67595b7b453SJohn Marino       if (!re_comp_buf.buffer)
67695b7b453SJohn Marino 	return gettext ("No previous regular expression");
67795b7b453SJohn Marino       return 0;
67895b7b453SJohn Marino     }
67995b7b453SJohn Marino 
68095b7b453SJohn Marino   if (re_comp_buf.buffer)
68195b7b453SJohn Marino     {
68295b7b453SJohn Marino       fastmap = re_comp_buf.fastmap;
68395b7b453SJohn Marino       re_comp_buf.fastmap = NULL;
68495b7b453SJohn Marino       __regfree (&re_comp_buf);
68595b7b453SJohn Marino       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
68695b7b453SJohn Marino       re_comp_buf.fastmap = fastmap;
68795b7b453SJohn Marino     }
68895b7b453SJohn Marino 
68995b7b453SJohn Marino   if (re_comp_buf.fastmap == NULL)
69095b7b453SJohn Marino     {
691*09d4459fSDaniel Fojt       re_comp_buf.fastmap = re_malloc (char, SBC_MAX);
69295b7b453SJohn Marino       if (re_comp_buf.fastmap == NULL)
69395b7b453SJohn Marino 	return (char *) gettext (__re_error_msgid
69495b7b453SJohn Marino 				 + __re_error_msgid_idx[(int) REG_ESPACE]);
69595b7b453SJohn Marino     }
69695b7b453SJohn Marino 
697cf28ed85SJohn Marino   /* Since 're_exec' always passes NULL for the 'regs' argument, we
69895b7b453SJohn Marino      don't need to initialize the pattern buffer fields which affect it.  */
69995b7b453SJohn Marino 
70095b7b453SJohn Marino   /* Match anchors at newlines.  */
70195b7b453SJohn Marino   re_comp_buf.newline_anchor = 1;
70295b7b453SJohn Marino 
70395b7b453SJohn Marino   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
70495b7b453SJohn Marino 
70595b7b453SJohn Marino   if (!ret)
70695b7b453SJohn Marino     return NULL;
70795b7b453SJohn Marino 
708cf28ed85SJohn Marino   /* Yes, we're discarding 'const' here if !HAVE_LIBINTL.  */
70995b7b453SJohn Marino   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
71095b7b453SJohn Marino }
71195b7b453SJohn Marino 
71295b7b453SJohn Marino #ifdef _LIBC
libc_freeres_fn(free_mem)71395b7b453SJohn Marino libc_freeres_fn (free_mem)
71495b7b453SJohn Marino {
71595b7b453SJohn Marino   __regfree (&re_comp_buf);
71695b7b453SJohn Marino }
71795b7b453SJohn Marino #endif
71895b7b453SJohn Marino 
71995b7b453SJohn Marino #endif /* _REGEX_RE_COMP */
72095b7b453SJohn Marino 
72195b7b453SJohn Marino /* Internal entry point.
72295b7b453SJohn Marino    Compile the regular expression PATTERN, whose length is LENGTH.
72395b7b453SJohn Marino    SYNTAX indicate regular expression's syntax.  */
72495b7b453SJohn Marino 
72595b7b453SJohn Marino static reg_errcode_t
re_compile_internal(regex_t * preg,const char * pattern,size_t length,reg_syntax_t syntax)72695b7b453SJohn Marino re_compile_internal (regex_t *preg, const char * pattern, size_t length,
72795b7b453SJohn Marino 		     reg_syntax_t syntax)
72895b7b453SJohn Marino {
72995b7b453SJohn Marino   reg_errcode_t err = REG_NOERROR;
73095b7b453SJohn Marino   re_dfa_t *dfa;
73195b7b453SJohn Marino   re_string_t regexp;
73295b7b453SJohn Marino 
73395b7b453SJohn Marino   /* Initialize the pattern buffer.  */
73495b7b453SJohn Marino   preg->fastmap_accurate = 0;
73595b7b453SJohn Marino   preg->syntax = syntax;
73695b7b453SJohn Marino   preg->not_bol = preg->not_eol = 0;
73795b7b453SJohn Marino   preg->used = 0;
73895b7b453SJohn Marino   preg->re_nsub = 0;
73995b7b453SJohn Marino   preg->can_be_null = 0;
74095b7b453SJohn Marino   preg->regs_allocated = REGS_UNALLOCATED;
74195b7b453SJohn Marino 
74295b7b453SJohn Marino   /* Initialize the dfa.  */
743cf28ed85SJohn Marino   dfa = preg->buffer;
744*09d4459fSDaniel Fojt   if (__glibc_unlikely (preg->allocated < sizeof (re_dfa_t)))
74595b7b453SJohn Marino     {
74695b7b453SJohn Marino       /* If zero allocated, but buffer is non-null, try to realloc
74795b7b453SJohn Marino 	 enough space.  This loses if buffer's address is bogus, but
74895b7b453SJohn Marino 	 that is the user's responsibility.  If ->buffer is NULL this
74995b7b453SJohn Marino 	 is a simple allocation.  */
75095b7b453SJohn Marino       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
75195b7b453SJohn Marino       if (dfa == NULL)
75295b7b453SJohn Marino 	return REG_ESPACE;
75395b7b453SJohn Marino       preg->allocated = sizeof (re_dfa_t);
754cf28ed85SJohn Marino       preg->buffer = dfa;
75595b7b453SJohn Marino     }
75695b7b453SJohn Marino   preg->used = sizeof (re_dfa_t);
75795b7b453SJohn Marino 
75895b7b453SJohn Marino   err = init_dfa (dfa, length);
759*09d4459fSDaniel Fojt   if (__glibc_unlikely (err == REG_NOERROR && lock_init (dfa->lock) != 0))
760680a9cb8SJohn Marino     err = REG_ESPACE;
761*09d4459fSDaniel Fojt   if (__glibc_unlikely (err != REG_NOERROR))
76295b7b453SJohn Marino     {
76395b7b453SJohn Marino       free_dfa_content (dfa);
76495b7b453SJohn Marino       preg->buffer = NULL;
76595b7b453SJohn Marino       preg->allocated = 0;
76695b7b453SJohn Marino       return err;
76795b7b453SJohn Marino     }
76895b7b453SJohn Marino #ifdef DEBUG
76995b7b453SJohn Marino   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
77095b7b453SJohn Marino   dfa->re_str = re_malloc (char, length + 1);
77195b7b453SJohn Marino   strncpy (dfa->re_str, pattern, length + 1);
77295b7b453SJohn Marino #endif
77395b7b453SJohn Marino 
77495b7b453SJohn Marino   err = re_string_construct (&regexp, pattern, length, preg->translate,
77595b7b453SJohn Marino 			     (syntax & RE_ICASE) != 0, dfa);
776*09d4459fSDaniel Fojt   if (__glibc_unlikely (err != REG_NOERROR))
77795b7b453SJohn Marino     {
77895b7b453SJohn Marino     re_compile_internal_free_return:
77995b7b453SJohn Marino       free_workarea_compile (preg);
78095b7b453SJohn Marino       re_string_destruct (&regexp);
781680a9cb8SJohn Marino       lock_fini (dfa->lock);
78295b7b453SJohn Marino       free_dfa_content (dfa);
78395b7b453SJohn Marino       preg->buffer = NULL;
78495b7b453SJohn Marino       preg->allocated = 0;
78595b7b453SJohn Marino       return err;
78695b7b453SJohn Marino     }
78795b7b453SJohn Marino 
78895b7b453SJohn Marino   /* Parse the regular expression, and build a structure tree.  */
78995b7b453SJohn Marino   preg->re_nsub = 0;
79095b7b453SJohn Marino   dfa->str_tree = parse (&regexp, preg, syntax, &err);
791*09d4459fSDaniel Fojt   if (__glibc_unlikely (dfa->str_tree == NULL))
79295b7b453SJohn Marino     goto re_compile_internal_free_return;
79395b7b453SJohn Marino 
79495b7b453SJohn Marino   /* Analyze the tree and create the nfa.  */
79595b7b453SJohn Marino   err = analyze (preg);
796*09d4459fSDaniel Fojt   if (__glibc_unlikely (err != REG_NOERROR))
79795b7b453SJohn Marino     goto re_compile_internal_free_return;
79895b7b453SJohn Marino 
79995b7b453SJohn Marino #ifdef RE_ENABLE_I18N
80095b7b453SJohn Marino   /* If possible, do searching in single byte encoding to speed things up.  */
80195b7b453SJohn Marino   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
80295b7b453SJohn Marino     optimize_utf8 (dfa);
80395b7b453SJohn Marino #endif
80495b7b453SJohn Marino 
80595b7b453SJohn Marino   /* Then create the initial state of the dfa.  */
80695b7b453SJohn Marino   err = create_initial_state (dfa);
80795b7b453SJohn Marino 
80895b7b453SJohn Marino   /* Release work areas.  */
80995b7b453SJohn Marino   free_workarea_compile (preg);
81095b7b453SJohn Marino   re_string_destruct (&regexp);
81195b7b453SJohn Marino 
812*09d4459fSDaniel Fojt   if (__glibc_unlikely (err != REG_NOERROR))
81395b7b453SJohn Marino     {
814680a9cb8SJohn Marino       lock_fini (dfa->lock);
81595b7b453SJohn Marino       free_dfa_content (dfa);
81695b7b453SJohn Marino       preg->buffer = NULL;
81795b7b453SJohn Marino       preg->allocated = 0;
81895b7b453SJohn Marino     }
81995b7b453SJohn Marino 
82095b7b453SJohn Marino   return err;
82195b7b453SJohn Marino }
82295b7b453SJohn Marino 
82395b7b453SJohn Marino /* Initialize DFA.  We use the length of the regular expression PAT_LEN
82495b7b453SJohn Marino    as the initial length of some arrays.  */
82595b7b453SJohn Marino 
82695b7b453SJohn Marino static reg_errcode_t
init_dfa(re_dfa_t * dfa,size_t pat_len)82795b7b453SJohn Marino init_dfa (re_dfa_t *dfa, size_t pat_len)
82895b7b453SJohn Marino {
82995b7b453SJohn Marino   __re_size_t table_size;
83095b7b453SJohn Marino #ifndef _LIBC
831cf28ed85SJohn Marino   const char *codeset_name;
83295b7b453SJohn Marino #endif
83395b7b453SJohn Marino #ifdef RE_ENABLE_I18N
83495b7b453SJohn Marino   size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
83595b7b453SJohn Marino #else
83695b7b453SJohn Marino   size_t max_i18n_object_size = 0;
83795b7b453SJohn Marino #endif
83895b7b453SJohn Marino   size_t max_object_size =
83995b7b453SJohn Marino     MAX (sizeof (struct re_state_table_entry),
84095b7b453SJohn Marino 	 MAX (sizeof (re_token_t),
84195b7b453SJohn Marino 	      MAX (sizeof (re_node_set),
84295b7b453SJohn Marino 		   MAX (sizeof (regmatch_t),
84395b7b453SJohn Marino 			max_i18n_object_size))));
84495b7b453SJohn Marino 
84595b7b453SJohn Marino   memset (dfa, '\0', sizeof (re_dfa_t));
84695b7b453SJohn Marino 
84795b7b453SJohn Marino   /* Force allocation of str_tree_storage the first time.  */
84895b7b453SJohn Marino   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
84995b7b453SJohn Marino 
85095b7b453SJohn Marino   /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
85195b7b453SJohn Marino      calculation below, and for similar doubling calculations
85295b7b453SJohn Marino      elsewhere.  And it's <= rather than <, because some of the
85395b7b453SJohn Marino      doubling calculations add 1 afterwards.  */
854*09d4459fSDaniel Fojt   if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2
855*09d4459fSDaniel Fojt 			<= pat_len))
85695b7b453SJohn Marino     return REG_ESPACE;
85795b7b453SJohn Marino 
85895b7b453SJohn Marino   dfa->nodes_alloc = pat_len + 1;
85995b7b453SJohn Marino   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
86095b7b453SJohn Marino 
86195b7b453SJohn Marino   /*  table_size = 2 ^ ceil(log pat_len) */
86295b7b453SJohn Marino   for (table_size = 1; ; table_size <<= 1)
86395b7b453SJohn Marino     if (table_size > pat_len)
86495b7b453SJohn Marino       break;
86595b7b453SJohn Marino 
86695b7b453SJohn Marino   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
86795b7b453SJohn Marino   dfa->state_hash_mask = table_size - 1;
86895b7b453SJohn Marino 
86995b7b453SJohn Marino   dfa->mb_cur_max = MB_CUR_MAX;
87095b7b453SJohn Marino #ifdef _LIBC
87195b7b453SJohn Marino   if (dfa->mb_cur_max == 6
87295b7b453SJohn Marino       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
87395b7b453SJohn Marino     dfa->is_utf8 = 1;
87495b7b453SJohn Marino   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
87595b7b453SJohn Marino 		       != 0);
87695b7b453SJohn Marino #else
87795b7b453SJohn Marino   codeset_name = nl_langinfo (CODESET);
878a8597f6cSJohn Marino   if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
879a8597f6cSJohn Marino       && (codeset_name[1] == 'T' || codeset_name[1] == 't')
880a8597f6cSJohn Marino       && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
881a8597f6cSJohn Marino       && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
88295b7b453SJohn Marino     dfa->is_utf8 = 1;
88395b7b453SJohn Marino 
88495b7b453SJohn Marino   /* We check exhaustively in the loop below if this charset is a
88595b7b453SJohn Marino      superset of ASCII.  */
88695b7b453SJohn Marino   dfa->map_notascii = 0;
88795b7b453SJohn Marino #endif
88895b7b453SJohn Marino 
88995b7b453SJohn Marino #ifdef RE_ENABLE_I18N
89095b7b453SJohn Marino   if (dfa->mb_cur_max > 1)
89195b7b453SJohn Marino     {
89295b7b453SJohn Marino       if (dfa->is_utf8)
89395b7b453SJohn Marino 	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
89495b7b453SJohn Marino       else
89595b7b453SJohn Marino 	{
89695b7b453SJohn Marino 	  int i, j, ch;
89795b7b453SJohn Marino 
89895b7b453SJohn Marino 	  dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
899*09d4459fSDaniel Fojt 	  if (__glibc_unlikely (dfa->sb_char == NULL))
90095b7b453SJohn Marino 	    return REG_ESPACE;
90195b7b453SJohn Marino 
90295b7b453SJohn Marino 	  /* Set the bits corresponding to single byte chars.  */
90395b7b453SJohn Marino 	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
90495b7b453SJohn Marino 	    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
90595b7b453SJohn Marino 	      {
90695b7b453SJohn Marino 		wint_t wch = __btowc (ch);
90795b7b453SJohn Marino 		if (wch != WEOF)
90895b7b453SJohn Marino 		  dfa->sb_char[i] |= (bitset_word_t) 1 << j;
90995b7b453SJohn Marino # ifndef _LIBC
91095b7b453SJohn Marino 		if (isascii (ch) && wch != ch)
91195b7b453SJohn Marino 		  dfa->map_notascii = 1;
91295b7b453SJohn Marino # endif
91395b7b453SJohn Marino 	      }
91495b7b453SJohn Marino 	}
91595b7b453SJohn Marino     }
91695b7b453SJohn Marino #endif
91795b7b453SJohn Marino 
918*09d4459fSDaniel Fojt   if (__glibc_unlikely (dfa->nodes == NULL || dfa->state_table == NULL))
91995b7b453SJohn Marino     return REG_ESPACE;
92095b7b453SJohn Marino   return REG_NOERROR;
92195b7b453SJohn Marino }
92295b7b453SJohn Marino 
92395b7b453SJohn Marino /* Initialize WORD_CHAR table, which indicate which character is
92495b7b453SJohn Marino    "word".  In this case "word" means that it is the word construction
92595b7b453SJohn Marino    character used by some operators like "\<", "\>", etc.  */
92695b7b453SJohn Marino 
92795b7b453SJohn Marino static void
init_word_char(re_dfa_t * dfa)92895b7b453SJohn Marino init_word_char (re_dfa_t *dfa)
92995b7b453SJohn Marino {
930cf28ed85SJohn Marino   int i = 0;
931cf28ed85SJohn Marino   int j;
932cf28ed85SJohn Marino   int ch = 0;
933680a9cb8SJohn Marino   dfa->word_ops_used = 1;
934*09d4459fSDaniel Fojt   if (__glibc_likely (dfa->map_notascii == 0))
935cf28ed85SJohn Marino     {
936*09d4459fSDaniel Fojt       /* Avoid uint32_t and uint64_t as some non-GCC platforms lack
937*09d4459fSDaniel Fojt 	 them, an issue when this code is used in Gnulib.  */
938a8597f6cSJohn Marino       bitset_word_t bits0 = 0x00000000;
939a8597f6cSJohn Marino       bitset_word_t bits1 = 0x03ff0000;
940a8597f6cSJohn Marino       bitset_word_t bits2 = 0x87fffffe;
941a8597f6cSJohn Marino       bitset_word_t bits3 = 0x07fffffe;
942cf28ed85SJohn Marino       if (BITSET_WORD_BITS == 64)
943cf28ed85SJohn Marino 	{
944*09d4459fSDaniel Fojt 	  /* Pacify gcc -Woverflow on 32-bit platformns.  */
945a8597f6cSJohn Marino 	  dfa->word_char[0] = bits1 << 31 << 1 | bits0;
946a8597f6cSJohn Marino 	  dfa->word_char[1] = bits3 << 31 << 1 | bits2;
947cf28ed85SJohn Marino 	  i = 2;
948cf28ed85SJohn Marino 	}
949cf28ed85SJohn Marino       else if (BITSET_WORD_BITS == 32)
950cf28ed85SJohn Marino 	{
951a8597f6cSJohn Marino 	  dfa->word_char[0] = bits0;
952a8597f6cSJohn Marino 	  dfa->word_char[1] = bits1;
953a8597f6cSJohn Marino 	  dfa->word_char[2] = bits2;
954a8597f6cSJohn Marino 	  dfa->word_char[3] = bits3;
955cf28ed85SJohn Marino 	  i = 4;
956cf28ed85SJohn Marino 	}
957cf28ed85SJohn Marino       else
958cf28ed85SJohn Marino         goto general_case;
959cf28ed85SJohn Marino       ch = 128;
960cf28ed85SJohn Marino 
961*09d4459fSDaniel Fojt       if (__glibc_likely (dfa->is_utf8))
962cf28ed85SJohn Marino 	{
963cf28ed85SJohn Marino 	  memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
964cf28ed85SJohn Marino 	  return;
965cf28ed85SJohn Marino 	}
966cf28ed85SJohn Marino     }
967cf28ed85SJohn Marino 
968cf28ed85SJohn Marino  general_case:
969cf28ed85SJohn Marino   for (; i < BITSET_WORDS; ++i)
97095b7b453SJohn Marino     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
97195b7b453SJohn Marino       if (isalnum (ch) || ch == '_')
97295b7b453SJohn Marino 	dfa->word_char[i] |= (bitset_word_t) 1 << j;
97395b7b453SJohn Marino }
97495b7b453SJohn Marino 
97595b7b453SJohn Marino /* Free the work area which are only used while compiling.  */
97695b7b453SJohn Marino 
97795b7b453SJohn Marino static void
free_workarea_compile(regex_t * preg)97895b7b453SJohn Marino free_workarea_compile (regex_t *preg)
97995b7b453SJohn Marino {
980cf28ed85SJohn Marino   re_dfa_t *dfa = preg->buffer;
98195b7b453SJohn Marino   bin_tree_storage_t *storage, *next;
98295b7b453SJohn Marino   for (storage = dfa->str_tree_storage; storage; storage = next)
98395b7b453SJohn Marino     {
98495b7b453SJohn Marino       next = storage->next;
98595b7b453SJohn Marino       re_free (storage);
98695b7b453SJohn Marino     }
98795b7b453SJohn Marino   dfa->str_tree_storage = NULL;
98895b7b453SJohn Marino   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
98995b7b453SJohn Marino   dfa->str_tree = NULL;
99095b7b453SJohn Marino   re_free (dfa->org_indices);
99195b7b453SJohn Marino   dfa->org_indices = NULL;
99295b7b453SJohn Marino }
99395b7b453SJohn Marino 
99495b7b453SJohn Marino /* Create initial states for all contexts.  */
99595b7b453SJohn Marino 
99695b7b453SJohn Marino static reg_errcode_t
create_initial_state(re_dfa_t * dfa)99795b7b453SJohn Marino create_initial_state (re_dfa_t *dfa)
99895b7b453SJohn Marino {
99995b7b453SJohn Marino   Idx first, i;
100095b7b453SJohn Marino   reg_errcode_t err;
100195b7b453SJohn Marino   re_node_set init_nodes;
100295b7b453SJohn Marino 
100395b7b453SJohn Marino   /* Initial states have the epsilon closure of the node which is
100495b7b453SJohn Marino      the first node of the regular expression.  */
100595b7b453SJohn Marino   first = dfa->str_tree->first->node_idx;
100695b7b453SJohn Marino   dfa->init_node = first;
100795b7b453SJohn Marino   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1008*09d4459fSDaniel Fojt   if (__glibc_unlikely (err != REG_NOERROR))
100995b7b453SJohn Marino     return err;
101095b7b453SJohn Marino 
101195b7b453SJohn Marino   /* The back-references which are in initial states can epsilon transit,
101295b7b453SJohn Marino      since in this case all of the subexpressions can be null.
101395b7b453SJohn Marino      Then we add epsilon closures of the nodes which are the next nodes of
101495b7b453SJohn Marino      the back-references.  */
101595b7b453SJohn Marino   if (dfa->nbackref > 0)
101695b7b453SJohn Marino     for (i = 0; i < init_nodes.nelem; ++i)
101795b7b453SJohn Marino       {
101895b7b453SJohn Marino 	Idx node_idx = init_nodes.elems[i];
101995b7b453SJohn Marino 	re_token_type_t type = dfa->nodes[node_idx].type;
102095b7b453SJohn Marino 
102195b7b453SJohn Marino 	Idx clexp_idx;
102295b7b453SJohn Marino 	if (type != OP_BACK_REF)
102395b7b453SJohn Marino 	  continue;
102495b7b453SJohn Marino 	for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
102595b7b453SJohn Marino 	  {
102695b7b453SJohn Marino 	    re_token_t *clexp_node;
102795b7b453SJohn Marino 	    clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
102895b7b453SJohn Marino 	    if (clexp_node->type == OP_CLOSE_SUBEXP
102995b7b453SJohn Marino 		&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
103095b7b453SJohn Marino 	      break;
103195b7b453SJohn Marino 	  }
103295b7b453SJohn Marino 	if (clexp_idx == init_nodes.nelem)
103395b7b453SJohn Marino 	  continue;
103495b7b453SJohn Marino 
103595b7b453SJohn Marino 	if (type == OP_BACK_REF)
103695b7b453SJohn Marino 	  {
103795b7b453SJohn Marino 	    Idx dest_idx = dfa->edests[node_idx].elems[0];
103895b7b453SJohn Marino 	    if (!re_node_set_contains (&init_nodes, dest_idx))
103995b7b453SJohn Marino 	      {
104095b7b453SJohn Marino 		reg_errcode_t merge_err
104195b7b453SJohn Marino                   = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
104295b7b453SJohn Marino 		if (merge_err != REG_NOERROR)
104395b7b453SJohn Marino 		  return merge_err;
104495b7b453SJohn Marino 		i = 0;
104595b7b453SJohn Marino 	      }
104695b7b453SJohn Marino 	  }
104795b7b453SJohn Marino       }
104895b7b453SJohn Marino 
104995b7b453SJohn Marino   /* It must be the first time to invoke acquire_state.  */
105095b7b453SJohn Marino   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
105195b7b453SJohn Marino   /* We don't check ERR here, since the initial state must not be NULL.  */
1052*09d4459fSDaniel Fojt   if (__glibc_unlikely (dfa->init_state == NULL))
105395b7b453SJohn Marino     return err;
105495b7b453SJohn Marino   if (dfa->init_state->has_constraint)
105595b7b453SJohn Marino     {
105695b7b453SJohn Marino       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
105795b7b453SJohn Marino 						       CONTEXT_WORD);
105895b7b453SJohn Marino       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
105995b7b453SJohn Marino 						     CONTEXT_NEWLINE);
106095b7b453SJohn Marino       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
106195b7b453SJohn Marino 							 &init_nodes,
106295b7b453SJohn Marino 							 CONTEXT_NEWLINE
106395b7b453SJohn Marino 							 | CONTEXT_BEGBUF);
1064*09d4459fSDaniel Fojt       if (__glibc_unlikely (dfa->init_state_word == NULL
1065*09d4459fSDaniel Fojt 			    || dfa->init_state_nl == NULL
1066*09d4459fSDaniel Fojt 			    || dfa->init_state_begbuf == NULL))
106795b7b453SJohn Marino 	return err;
106895b7b453SJohn Marino     }
106995b7b453SJohn Marino   else
107095b7b453SJohn Marino     dfa->init_state_word = dfa->init_state_nl
107195b7b453SJohn Marino       = dfa->init_state_begbuf = dfa->init_state;
107295b7b453SJohn Marino 
107395b7b453SJohn Marino   re_node_set_free (&init_nodes);
107495b7b453SJohn Marino   return REG_NOERROR;
107595b7b453SJohn Marino }
107695b7b453SJohn Marino 
107795b7b453SJohn Marino #ifdef RE_ENABLE_I18N
107895b7b453SJohn Marino /* If it is possible to do searching in single byte encoding instead of UTF-8
107995b7b453SJohn Marino    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
108095b7b453SJohn Marino    DFA nodes where needed.  */
108195b7b453SJohn Marino 
108295b7b453SJohn Marino static void
optimize_utf8(re_dfa_t * dfa)108395b7b453SJohn Marino optimize_utf8 (re_dfa_t *dfa)
108495b7b453SJohn Marino {
108595b7b453SJohn Marino   Idx node;
108695b7b453SJohn Marino   int i;
108795b7b453SJohn Marino   bool mb_chars = false;
108895b7b453SJohn Marino   bool has_period = false;
108995b7b453SJohn Marino 
109095b7b453SJohn Marino   for (node = 0; node < dfa->nodes_len; ++node)
109195b7b453SJohn Marino     switch (dfa->nodes[node].type)
109295b7b453SJohn Marino       {
109395b7b453SJohn Marino       case CHARACTER:
109495b7b453SJohn Marino 	if (dfa->nodes[node].opr.c >= ASCII_CHARS)
109595b7b453SJohn Marino 	  mb_chars = true;
109695b7b453SJohn Marino 	break;
109795b7b453SJohn Marino       case ANCHOR:
109895b7b453SJohn Marino 	switch (dfa->nodes[node].opr.ctx_type)
109995b7b453SJohn Marino 	  {
110095b7b453SJohn Marino 	  case LINE_FIRST:
110195b7b453SJohn Marino 	  case LINE_LAST:
110295b7b453SJohn Marino 	  case BUF_FIRST:
110395b7b453SJohn Marino 	  case BUF_LAST:
110495b7b453SJohn Marino 	    break;
110595b7b453SJohn Marino 	  default:
110695b7b453SJohn Marino 	    /* Word anchors etc. cannot be handled.  It's okay to test
110795b7b453SJohn Marino 	       opr.ctx_type since constraints (for all DFA nodes) are
110895b7b453SJohn Marino 	       created by ORing one or more opr.ctx_type values.  */
110995b7b453SJohn Marino 	    return;
111095b7b453SJohn Marino 	  }
111195b7b453SJohn Marino 	break;
111295b7b453SJohn Marino       case OP_PERIOD:
111395b7b453SJohn Marino 	has_period = true;
111495b7b453SJohn Marino 	break;
111595b7b453SJohn Marino       case OP_BACK_REF:
111695b7b453SJohn Marino       case OP_ALT:
111795b7b453SJohn Marino       case END_OF_RE:
111895b7b453SJohn Marino       case OP_DUP_ASTERISK:
111995b7b453SJohn Marino       case OP_OPEN_SUBEXP:
112095b7b453SJohn Marino       case OP_CLOSE_SUBEXP:
112195b7b453SJohn Marino 	break;
112295b7b453SJohn Marino       case COMPLEX_BRACKET:
112395b7b453SJohn Marino 	return;
112495b7b453SJohn Marino       case SIMPLE_BRACKET:
112595b7b453SJohn Marino 	/* Just double check.  */
112695b7b453SJohn Marino 	{
112795b7b453SJohn Marino 	  int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
112895b7b453SJohn Marino 			? 0
112995b7b453SJohn Marino 			: BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
113095b7b453SJohn Marino 	  for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
113195b7b453SJohn Marino 	    {
113295b7b453SJohn Marino 	      if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
113395b7b453SJohn Marino 		return;
113495b7b453SJohn Marino 	      rshift = 0;
113595b7b453SJohn Marino 	    }
113695b7b453SJohn Marino 	}
113795b7b453SJohn Marino 	break;
113895b7b453SJohn Marino       default:
113995b7b453SJohn Marino 	abort ();
114095b7b453SJohn Marino       }
114195b7b453SJohn Marino 
114295b7b453SJohn Marino   if (mb_chars || has_period)
114395b7b453SJohn Marino     for (node = 0; node < dfa->nodes_len; ++node)
114495b7b453SJohn Marino       {
114595b7b453SJohn Marino 	if (dfa->nodes[node].type == CHARACTER
114695b7b453SJohn Marino 	    && dfa->nodes[node].opr.c >= ASCII_CHARS)
114795b7b453SJohn Marino 	  dfa->nodes[node].mb_partial = 0;
114895b7b453SJohn Marino 	else if (dfa->nodes[node].type == OP_PERIOD)
114995b7b453SJohn Marino 	  dfa->nodes[node].type = OP_UTF8_PERIOD;
115095b7b453SJohn Marino       }
115195b7b453SJohn Marino 
115295b7b453SJohn Marino   /* The search can be in single byte locale.  */
115395b7b453SJohn Marino   dfa->mb_cur_max = 1;
115495b7b453SJohn Marino   dfa->is_utf8 = 0;
115595b7b453SJohn Marino   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
115695b7b453SJohn Marino }
115795b7b453SJohn Marino #endif
115895b7b453SJohn Marino 
115995b7b453SJohn Marino /* Analyze the structure tree, and calculate "first", "next", "edest",
116095b7b453SJohn Marino    "eclosure", and "inveclosure".  */
116195b7b453SJohn Marino 
116295b7b453SJohn Marino static reg_errcode_t
analyze(regex_t * preg)116395b7b453SJohn Marino analyze (regex_t *preg)
116495b7b453SJohn Marino {
1165cf28ed85SJohn Marino   re_dfa_t *dfa = preg->buffer;
116695b7b453SJohn Marino   reg_errcode_t ret;
116795b7b453SJohn Marino 
116895b7b453SJohn Marino   /* Allocate arrays.  */
116995b7b453SJohn Marino   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
117095b7b453SJohn Marino   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
117195b7b453SJohn Marino   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
117295b7b453SJohn Marino   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1173*09d4459fSDaniel Fojt   if (__glibc_unlikely (dfa->nexts == NULL || dfa->org_indices == NULL
1174*09d4459fSDaniel Fojt 			|| dfa->edests == NULL || dfa->eclosures == NULL))
117595b7b453SJohn Marino     return REG_ESPACE;
117695b7b453SJohn Marino 
117795b7b453SJohn Marino   dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
117895b7b453SJohn Marino   if (dfa->subexp_map != NULL)
117995b7b453SJohn Marino     {
118095b7b453SJohn Marino       Idx i;
118195b7b453SJohn Marino       for (i = 0; i < preg->re_nsub; i++)
118295b7b453SJohn Marino 	dfa->subexp_map[i] = i;
118395b7b453SJohn Marino       preorder (dfa->str_tree, optimize_subexps, dfa);
118495b7b453SJohn Marino       for (i = 0; i < preg->re_nsub; i++)
118595b7b453SJohn Marino 	if (dfa->subexp_map[i] != i)
118695b7b453SJohn Marino 	  break;
118795b7b453SJohn Marino       if (i == preg->re_nsub)
118895b7b453SJohn Marino 	{
1189*09d4459fSDaniel Fojt 	  re_free (dfa->subexp_map);
119095b7b453SJohn Marino 	  dfa->subexp_map = NULL;
119195b7b453SJohn Marino 	}
119295b7b453SJohn Marino     }
119395b7b453SJohn Marino 
119495b7b453SJohn Marino   ret = postorder (dfa->str_tree, lower_subexps, preg);
1195*09d4459fSDaniel Fojt   if (__glibc_unlikely (ret != REG_NOERROR))
119695b7b453SJohn Marino     return ret;
119795b7b453SJohn Marino   ret = postorder (dfa->str_tree, calc_first, dfa);
1198*09d4459fSDaniel Fojt   if (__glibc_unlikely (ret != REG_NOERROR))
119995b7b453SJohn Marino     return ret;
120095b7b453SJohn Marino   preorder (dfa->str_tree, calc_next, dfa);
120195b7b453SJohn Marino   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1202*09d4459fSDaniel Fojt   if (__glibc_unlikely (ret != REG_NOERROR))
120395b7b453SJohn Marino     return ret;
120495b7b453SJohn Marino   ret = calc_eclosure (dfa);
1205*09d4459fSDaniel Fojt   if (__glibc_unlikely (ret != REG_NOERROR))
120695b7b453SJohn Marino     return ret;
120795b7b453SJohn Marino 
120895b7b453SJohn Marino   /* We only need this during the prune_impossible_nodes pass in regexec.c;
120995b7b453SJohn Marino      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
121095b7b453SJohn Marino   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
121195b7b453SJohn Marino       || dfa->nbackref)
121295b7b453SJohn Marino     {
121395b7b453SJohn Marino       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1214*09d4459fSDaniel Fojt       if (__glibc_unlikely (dfa->inveclosures == NULL))
121595b7b453SJohn Marino 	return REG_ESPACE;
121695b7b453SJohn Marino       ret = calc_inveclosure (dfa);
121795b7b453SJohn Marino     }
121895b7b453SJohn Marino 
121995b7b453SJohn Marino   return ret;
122095b7b453SJohn Marino }
122195b7b453SJohn Marino 
122295b7b453SJohn Marino /* Our parse trees are very unbalanced, so we cannot use a stack to
122395b7b453SJohn Marino    implement parse tree visits.  Instead, we use parent pointers and
122495b7b453SJohn Marino    some hairy code in these two functions.  */
122595b7b453SJohn Marino static reg_errcode_t
postorder(bin_tree_t * root,reg_errcode_t (fn (void *,bin_tree_t *)),void * extra)122695b7b453SJohn Marino postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
122795b7b453SJohn Marino 	   void *extra)
122895b7b453SJohn Marino {
122995b7b453SJohn Marino   bin_tree_t *node, *prev;
123095b7b453SJohn Marino 
123195b7b453SJohn Marino   for (node = root; ; )
123295b7b453SJohn Marino     {
123395b7b453SJohn Marino       /* Descend down the tree, preferably to the left (or to the right
123495b7b453SJohn Marino 	 if that's the only child).  */
123595b7b453SJohn Marino       while (node->left || node->right)
123695b7b453SJohn Marino 	if (node->left)
123795b7b453SJohn Marino 	  node = node->left;
123895b7b453SJohn Marino 	else
123995b7b453SJohn Marino 	  node = node->right;
124095b7b453SJohn Marino 
124195b7b453SJohn Marino       do
124295b7b453SJohn Marino 	{
124395b7b453SJohn Marino 	  reg_errcode_t err = fn (extra, node);
1244*09d4459fSDaniel Fojt 	  if (__glibc_unlikely (err != REG_NOERROR))
124595b7b453SJohn Marino 	    return err;
124695b7b453SJohn Marino 	  if (node->parent == NULL)
124795b7b453SJohn Marino 	    return REG_NOERROR;
124895b7b453SJohn Marino 	  prev = node;
124995b7b453SJohn Marino 	  node = node->parent;
125095b7b453SJohn Marino 	}
125195b7b453SJohn Marino       /* Go up while we have a node that is reached from the right.  */
125295b7b453SJohn Marino       while (node->right == prev || node->right == NULL);
125395b7b453SJohn Marino       node = node->right;
125495b7b453SJohn Marino     }
125595b7b453SJohn Marino }
125695b7b453SJohn Marino 
125795b7b453SJohn Marino static reg_errcode_t
preorder(bin_tree_t * root,reg_errcode_t (fn (void *,bin_tree_t *)),void * extra)125895b7b453SJohn Marino preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
125995b7b453SJohn Marino 	  void *extra)
126095b7b453SJohn Marino {
126195b7b453SJohn Marino   bin_tree_t *node;
126295b7b453SJohn Marino 
126395b7b453SJohn Marino   for (node = root; ; )
126495b7b453SJohn Marino     {
126595b7b453SJohn Marino       reg_errcode_t err = fn (extra, node);
1266*09d4459fSDaniel Fojt       if (__glibc_unlikely (err != REG_NOERROR))
126795b7b453SJohn Marino 	return err;
126895b7b453SJohn Marino 
126995b7b453SJohn Marino       /* Go to the left node, or up and to the right.  */
127095b7b453SJohn Marino       if (node->left)
127195b7b453SJohn Marino 	node = node->left;
127295b7b453SJohn Marino       else
127395b7b453SJohn Marino 	{
127495b7b453SJohn Marino 	  bin_tree_t *prev = NULL;
127595b7b453SJohn Marino 	  while (node->right == prev || node->right == NULL)
127695b7b453SJohn Marino 	    {
127795b7b453SJohn Marino 	      prev = node;
127895b7b453SJohn Marino 	      node = node->parent;
127995b7b453SJohn Marino 	      if (!node)
128095b7b453SJohn Marino 		return REG_NOERROR;
128195b7b453SJohn Marino 	    }
128295b7b453SJohn Marino 	  node = node->right;
128395b7b453SJohn Marino 	}
128495b7b453SJohn Marino     }
128595b7b453SJohn Marino }
128695b7b453SJohn Marino 
128795b7b453SJohn Marino /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
128895b7b453SJohn Marino    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
128995b7b453SJohn Marino    backreferences as well.  Requires a preorder visit.  */
129095b7b453SJohn Marino static reg_errcode_t
optimize_subexps(void * extra,bin_tree_t * node)129195b7b453SJohn Marino optimize_subexps (void *extra, bin_tree_t *node)
129295b7b453SJohn Marino {
129395b7b453SJohn Marino   re_dfa_t *dfa = (re_dfa_t *) extra;
129495b7b453SJohn Marino 
129595b7b453SJohn Marino   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
129695b7b453SJohn Marino     {
129795b7b453SJohn Marino       int idx = node->token.opr.idx;
129895b7b453SJohn Marino       node->token.opr.idx = dfa->subexp_map[idx];
129995b7b453SJohn Marino       dfa->used_bkref_map |= 1 << node->token.opr.idx;
130095b7b453SJohn Marino     }
130195b7b453SJohn Marino 
130295b7b453SJohn Marino   else if (node->token.type == SUBEXP
130395b7b453SJohn Marino 	   && node->left && node->left->token.type == SUBEXP)
130495b7b453SJohn Marino     {
130595b7b453SJohn Marino       Idx other_idx = node->left->token.opr.idx;
130695b7b453SJohn Marino 
130795b7b453SJohn Marino       node->left = node->left->left;
130895b7b453SJohn Marino       if (node->left)
130995b7b453SJohn Marino 	node->left->parent = node;
131095b7b453SJohn Marino 
131195b7b453SJohn Marino       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
131295b7b453SJohn Marino       if (other_idx < BITSET_WORD_BITS)
131395b7b453SJohn Marino 	dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
131495b7b453SJohn Marino     }
131595b7b453SJohn Marino 
131695b7b453SJohn Marino   return REG_NOERROR;
131795b7b453SJohn Marino }
131895b7b453SJohn Marino 
131995b7b453SJohn Marino /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
132095b7b453SJohn Marino    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
132195b7b453SJohn Marino static reg_errcode_t
lower_subexps(void * extra,bin_tree_t * node)132295b7b453SJohn Marino lower_subexps (void *extra, bin_tree_t *node)
132395b7b453SJohn Marino {
132495b7b453SJohn Marino   regex_t *preg = (regex_t *) extra;
132595b7b453SJohn Marino   reg_errcode_t err = REG_NOERROR;
132695b7b453SJohn Marino 
132795b7b453SJohn Marino   if (node->left && node->left->token.type == SUBEXP)
132895b7b453SJohn Marino     {
132995b7b453SJohn Marino       node->left = lower_subexp (&err, preg, node->left);
133095b7b453SJohn Marino       if (node->left)
133195b7b453SJohn Marino 	node->left->parent = node;
133295b7b453SJohn Marino     }
133395b7b453SJohn Marino   if (node->right && node->right->token.type == SUBEXP)
133495b7b453SJohn Marino     {
133595b7b453SJohn Marino       node->right = lower_subexp (&err, preg, node->right);
133695b7b453SJohn Marino       if (node->right)
133795b7b453SJohn Marino 	node->right->parent = node;
133895b7b453SJohn Marino     }
133995b7b453SJohn Marino 
134095b7b453SJohn Marino   return err;
134195b7b453SJohn Marino }
134295b7b453SJohn Marino 
134395b7b453SJohn Marino static bin_tree_t *
lower_subexp(reg_errcode_t * err,regex_t * preg,bin_tree_t * node)134495b7b453SJohn Marino lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
134595b7b453SJohn Marino {
1346cf28ed85SJohn Marino   re_dfa_t *dfa = preg->buffer;
134795b7b453SJohn Marino   bin_tree_t *body = node->left;
134895b7b453SJohn Marino   bin_tree_t *op, *cls, *tree1, *tree;
134995b7b453SJohn Marino 
135095b7b453SJohn Marino   if (preg->no_sub
135195b7b453SJohn Marino       /* We do not optimize empty subexpressions, because otherwise we may
135295b7b453SJohn Marino 	 have bad CONCAT nodes with NULL children.  This is obviously not
135395b7b453SJohn Marino 	 very common, so we do not lose much.  An example that triggers
135495b7b453SJohn Marino 	 this case is the sed "script" /\(\)/x.  */
135595b7b453SJohn Marino       && node->left != NULL
135695b7b453SJohn Marino       && (node->token.opr.idx >= BITSET_WORD_BITS
135795b7b453SJohn Marino 	  || !(dfa->used_bkref_map
135895b7b453SJohn Marino 	       & ((bitset_word_t) 1 << node->token.opr.idx))))
135995b7b453SJohn Marino     return node->left;
136095b7b453SJohn Marino 
136195b7b453SJohn Marino   /* Convert the SUBEXP node to the concatenation of an
136295b7b453SJohn Marino      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
136395b7b453SJohn Marino   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
136495b7b453SJohn Marino   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
136595b7b453SJohn Marino   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
136695b7b453SJohn Marino   tree = create_tree (dfa, op, tree1, CONCAT);
1367*09d4459fSDaniel Fojt   if (__glibc_unlikely (tree == NULL || tree1 == NULL
1368*09d4459fSDaniel Fojt 			|| op == NULL || cls == NULL))
136995b7b453SJohn Marino     {
137095b7b453SJohn Marino       *err = REG_ESPACE;
137195b7b453SJohn Marino       return NULL;
137295b7b453SJohn Marino     }
137395b7b453SJohn Marino 
137495b7b453SJohn Marino   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
137595b7b453SJohn Marino   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
137695b7b453SJohn Marino   return tree;
137795b7b453SJohn Marino }
137895b7b453SJohn Marino 
137995b7b453SJohn Marino /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
138095b7b453SJohn Marino    nodes.  Requires a postorder visit.  */
138195b7b453SJohn Marino static reg_errcode_t
calc_first(void * extra,bin_tree_t * node)138295b7b453SJohn Marino calc_first (void *extra, bin_tree_t *node)
138395b7b453SJohn Marino {
138495b7b453SJohn Marino   re_dfa_t *dfa = (re_dfa_t *) extra;
138595b7b453SJohn Marino   if (node->token.type == CONCAT)
138695b7b453SJohn Marino     {
138795b7b453SJohn Marino       node->first = node->left->first;
138895b7b453SJohn Marino       node->node_idx = node->left->node_idx;
138995b7b453SJohn Marino     }
139095b7b453SJohn Marino   else
139195b7b453SJohn Marino     {
139295b7b453SJohn Marino       node->first = node;
139395b7b453SJohn Marino       node->node_idx = re_dfa_add_node (dfa, node->token);
1394*09d4459fSDaniel Fojt       if (__glibc_unlikely (node->node_idx == -1))
139595b7b453SJohn Marino 	return REG_ESPACE;
139695b7b453SJohn Marino       if (node->token.type == ANCHOR)
139795b7b453SJohn Marino 	dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
139895b7b453SJohn Marino     }
139995b7b453SJohn Marino   return REG_NOERROR;
140095b7b453SJohn Marino }
140195b7b453SJohn Marino 
140295b7b453SJohn Marino /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
140395b7b453SJohn Marino static reg_errcode_t
calc_next(void * extra,bin_tree_t * node)1404680a9cb8SJohn Marino calc_next (void *extra, bin_tree_t *node)
140595b7b453SJohn Marino {
140695b7b453SJohn Marino   switch (node->token.type)
140795b7b453SJohn Marino     {
140895b7b453SJohn Marino     case OP_DUP_ASTERISK:
140995b7b453SJohn Marino       node->left->next = node;
141095b7b453SJohn Marino       break;
141195b7b453SJohn Marino     case CONCAT:
141295b7b453SJohn Marino       node->left->next = node->right->first;
141395b7b453SJohn Marino       node->right->next = node->next;
141495b7b453SJohn Marino       break;
141595b7b453SJohn Marino     default:
141695b7b453SJohn Marino       if (node->left)
141795b7b453SJohn Marino 	node->left->next = node->next;
141895b7b453SJohn Marino       if (node->right)
141995b7b453SJohn Marino 	node->right->next = node->next;
142095b7b453SJohn Marino       break;
142195b7b453SJohn Marino     }
142295b7b453SJohn Marino   return REG_NOERROR;
142395b7b453SJohn Marino }
142495b7b453SJohn Marino 
142595b7b453SJohn Marino /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
142695b7b453SJohn Marino static reg_errcode_t
link_nfa_nodes(void * extra,bin_tree_t * node)142795b7b453SJohn Marino link_nfa_nodes (void *extra, bin_tree_t *node)
142895b7b453SJohn Marino {
142995b7b453SJohn Marino   re_dfa_t *dfa = (re_dfa_t *) extra;
143095b7b453SJohn Marino   Idx idx = node->node_idx;
143195b7b453SJohn Marino   reg_errcode_t err = REG_NOERROR;
143295b7b453SJohn Marino 
143395b7b453SJohn Marino   switch (node->token.type)
143495b7b453SJohn Marino     {
143595b7b453SJohn Marino     case CONCAT:
143695b7b453SJohn Marino       break;
143795b7b453SJohn Marino 
143895b7b453SJohn Marino     case END_OF_RE:
1439*09d4459fSDaniel Fojt       DEBUG_ASSERT (node->next == NULL);
144095b7b453SJohn Marino       break;
144195b7b453SJohn Marino 
144295b7b453SJohn Marino     case OP_DUP_ASTERISK:
144395b7b453SJohn Marino     case OP_ALT:
144495b7b453SJohn Marino       {
144595b7b453SJohn Marino 	Idx left, right;
144695b7b453SJohn Marino 	dfa->has_plural_match = 1;
144795b7b453SJohn Marino 	if (node->left != NULL)
144895b7b453SJohn Marino 	  left = node->left->first->node_idx;
144995b7b453SJohn Marino 	else
145095b7b453SJohn Marino 	  left = node->next->node_idx;
145195b7b453SJohn Marino 	if (node->right != NULL)
145295b7b453SJohn Marino 	  right = node->right->first->node_idx;
145395b7b453SJohn Marino 	else
145495b7b453SJohn Marino 	  right = node->next->node_idx;
1455*09d4459fSDaniel Fojt 	DEBUG_ASSERT (left > -1);
1456*09d4459fSDaniel Fojt 	DEBUG_ASSERT (right > -1);
145795b7b453SJohn Marino 	err = re_node_set_init_2 (dfa->edests + idx, left, right);
145895b7b453SJohn Marino       }
145995b7b453SJohn Marino       break;
146095b7b453SJohn Marino 
146195b7b453SJohn Marino     case ANCHOR:
146295b7b453SJohn Marino     case OP_OPEN_SUBEXP:
146395b7b453SJohn Marino     case OP_CLOSE_SUBEXP:
146495b7b453SJohn Marino       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
146595b7b453SJohn Marino       break;
146695b7b453SJohn Marino 
146795b7b453SJohn Marino     case OP_BACK_REF:
146895b7b453SJohn Marino       dfa->nexts[idx] = node->next->node_idx;
146995b7b453SJohn Marino       if (node->token.type == OP_BACK_REF)
147095b7b453SJohn Marino 	err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
147195b7b453SJohn Marino       break;
147295b7b453SJohn Marino 
147395b7b453SJohn Marino     default:
1474*09d4459fSDaniel Fojt       DEBUG_ASSERT (!IS_EPSILON_NODE (node->token.type));
147595b7b453SJohn Marino       dfa->nexts[idx] = node->next->node_idx;
147695b7b453SJohn Marino       break;
147795b7b453SJohn Marino     }
147895b7b453SJohn Marino 
147995b7b453SJohn Marino   return err;
148095b7b453SJohn Marino }
148195b7b453SJohn Marino 
148295b7b453SJohn Marino /* Duplicate the epsilon closure of the node ROOT_NODE.
148395b7b453SJohn Marino    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
148495b7b453SJohn Marino    to their own constraint.  */
148595b7b453SJohn Marino 
148695b7b453SJohn Marino static reg_errcode_t
duplicate_node_closure(re_dfa_t * dfa,Idx top_org_node,Idx top_clone_node,Idx root_node,unsigned int init_constraint)148795b7b453SJohn Marino duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
148895b7b453SJohn Marino 			Idx root_node, unsigned int init_constraint)
148995b7b453SJohn Marino {
149095b7b453SJohn Marino   Idx org_node, clone_node;
149195b7b453SJohn Marino   bool ok;
149295b7b453SJohn Marino   unsigned int constraint = init_constraint;
149395b7b453SJohn Marino   for (org_node = top_org_node, clone_node = top_clone_node;;)
149495b7b453SJohn Marino     {
149595b7b453SJohn Marino       Idx org_dest, clone_dest;
149695b7b453SJohn Marino       if (dfa->nodes[org_node].type == OP_BACK_REF)
149795b7b453SJohn Marino 	{
149895b7b453SJohn Marino 	  /* If the back reference epsilon-transit, its destination must
149995b7b453SJohn Marino 	     also have the constraint.  Then duplicate the epsilon closure
150095b7b453SJohn Marino 	     of the destination of the back reference, and store it in
150195b7b453SJohn Marino 	     edests of the back reference.  */
150295b7b453SJohn Marino 	  org_dest = dfa->nexts[org_node];
150395b7b453SJohn Marino 	  re_node_set_empty (dfa->edests + clone_node);
150495b7b453SJohn Marino 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1505*09d4459fSDaniel Fojt 	  if (__glibc_unlikely (clone_dest == -1))
150695b7b453SJohn Marino 	    return REG_ESPACE;
150795b7b453SJohn Marino 	  dfa->nexts[clone_node] = dfa->nexts[org_node];
150895b7b453SJohn Marino 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1509*09d4459fSDaniel Fojt 	  if (__glibc_unlikely (! ok))
151095b7b453SJohn Marino 	    return REG_ESPACE;
151195b7b453SJohn Marino 	}
151295b7b453SJohn Marino       else if (dfa->edests[org_node].nelem == 0)
151395b7b453SJohn Marino 	{
151495b7b453SJohn Marino 	  /* In case of the node can't epsilon-transit, don't duplicate the
151595b7b453SJohn Marino 	     destination and store the original destination as the
151695b7b453SJohn Marino 	     destination of the node.  */
151795b7b453SJohn Marino 	  dfa->nexts[clone_node] = dfa->nexts[org_node];
151895b7b453SJohn Marino 	  break;
151995b7b453SJohn Marino 	}
152095b7b453SJohn Marino       else if (dfa->edests[org_node].nelem == 1)
152195b7b453SJohn Marino 	{
152295b7b453SJohn Marino 	  /* In case of the node can epsilon-transit, and it has only one
152395b7b453SJohn Marino 	     destination.  */
152495b7b453SJohn Marino 	  org_dest = dfa->edests[org_node].elems[0];
152595b7b453SJohn Marino 	  re_node_set_empty (dfa->edests + clone_node);
152695b7b453SJohn Marino 	  /* If the node is root_node itself, it means the epsilon closure
152795b7b453SJohn Marino 	     has a loop.  Then tie it to the destination of the root_node.  */
152895b7b453SJohn Marino 	  if (org_node == root_node && clone_node != org_node)
152995b7b453SJohn Marino 	    {
153095b7b453SJohn Marino 	      ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1531*09d4459fSDaniel Fojt 	      if (__glibc_unlikely (! ok))
153295b7b453SJohn Marino 	        return REG_ESPACE;
153395b7b453SJohn Marino 	      break;
153495b7b453SJohn Marino 	    }
153595b7b453SJohn Marino 	  /* In case the node has another constraint, append it.  */
153695b7b453SJohn Marino 	  constraint |= dfa->nodes[org_node].constraint;
153795b7b453SJohn Marino 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1538*09d4459fSDaniel Fojt 	  if (__glibc_unlikely (clone_dest == -1))
153995b7b453SJohn Marino 	    return REG_ESPACE;
154095b7b453SJohn Marino 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1541*09d4459fSDaniel Fojt 	  if (__glibc_unlikely (! ok))
154295b7b453SJohn Marino 	    return REG_ESPACE;
154395b7b453SJohn Marino 	}
154495b7b453SJohn Marino       else /* dfa->edests[org_node].nelem == 2 */
154595b7b453SJohn Marino 	{
154695b7b453SJohn Marino 	  /* In case of the node can epsilon-transit, and it has two
154795b7b453SJohn Marino 	     destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
154895b7b453SJohn Marino 	  org_dest = dfa->edests[org_node].elems[0];
154995b7b453SJohn Marino 	  re_node_set_empty (dfa->edests + clone_node);
155095b7b453SJohn Marino 	  /* Search for a duplicated node which satisfies the constraint.  */
155195b7b453SJohn Marino 	  clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1552*09d4459fSDaniel Fojt 	  if (clone_dest == -1)
155395b7b453SJohn Marino 	    {
155495b7b453SJohn Marino 	      /* There is no such duplicated node, create a new one.  */
155595b7b453SJohn Marino 	      reg_errcode_t err;
155695b7b453SJohn Marino 	      clone_dest = duplicate_node (dfa, org_dest, constraint);
1557*09d4459fSDaniel Fojt 	      if (__glibc_unlikely (clone_dest == -1))
155895b7b453SJohn Marino 		return REG_ESPACE;
155995b7b453SJohn Marino 	      ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1560*09d4459fSDaniel Fojt 	      if (__glibc_unlikely (! ok))
156195b7b453SJohn Marino 		return REG_ESPACE;
156295b7b453SJohn Marino 	      err = duplicate_node_closure (dfa, org_dest, clone_dest,
156395b7b453SJohn Marino 					    root_node, constraint);
1564*09d4459fSDaniel Fojt 	      if (__glibc_unlikely (err != REG_NOERROR))
156595b7b453SJohn Marino 		return err;
156695b7b453SJohn Marino 	    }
156795b7b453SJohn Marino 	  else
156895b7b453SJohn Marino 	    {
156995b7b453SJohn Marino 	      /* There is a duplicated node which satisfies the constraint,
157095b7b453SJohn Marino 		 use it to avoid infinite loop.  */
157195b7b453SJohn Marino 	      ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1572*09d4459fSDaniel Fojt 	      if (__glibc_unlikely (! ok))
157395b7b453SJohn Marino 		return REG_ESPACE;
157495b7b453SJohn Marino 	    }
157595b7b453SJohn Marino 
157695b7b453SJohn Marino 	  org_dest = dfa->edests[org_node].elems[1];
157795b7b453SJohn Marino 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1578*09d4459fSDaniel Fojt 	  if (__glibc_unlikely (clone_dest == -1))
157995b7b453SJohn Marino 	    return REG_ESPACE;
158095b7b453SJohn Marino 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1581*09d4459fSDaniel Fojt 	  if (__glibc_unlikely (! ok))
158295b7b453SJohn Marino 	    return REG_ESPACE;
158395b7b453SJohn Marino 	}
158495b7b453SJohn Marino       org_node = org_dest;
158595b7b453SJohn Marino       clone_node = clone_dest;
158695b7b453SJohn Marino     }
158795b7b453SJohn Marino   return REG_NOERROR;
158895b7b453SJohn Marino }
158995b7b453SJohn Marino 
159095b7b453SJohn Marino /* Search for a node which is duplicated from the node ORG_NODE, and
159195b7b453SJohn Marino    satisfies the constraint CONSTRAINT.  */
159295b7b453SJohn Marino 
159395b7b453SJohn Marino static Idx
search_duplicated_node(const re_dfa_t * dfa,Idx org_node,unsigned int constraint)159495b7b453SJohn Marino search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
159595b7b453SJohn Marino 			unsigned int constraint)
159695b7b453SJohn Marino {
159795b7b453SJohn Marino   Idx idx;
159895b7b453SJohn Marino   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
159995b7b453SJohn Marino     {
160095b7b453SJohn Marino       if (org_node == dfa->org_indices[idx]
160195b7b453SJohn Marino 	  && constraint == dfa->nodes[idx].constraint)
160295b7b453SJohn Marino 	return idx; /* Found.  */
160395b7b453SJohn Marino     }
1604*09d4459fSDaniel Fojt   return -1; /* Not found.  */
160595b7b453SJohn Marino }
160695b7b453SJohn Marino 
160795b7b453SJohn Marino /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1608*09d4459fSDaniel Fojt    Return the index of the new node, or -1 if insufficient storage is
160995b7b453SJohn Marino    available.  */
161095b7b453SJohn Marino 
161195b7b453SJohn Marino static Idx
duplicate_node(re_dfa_t * dfa,Idx org_idx,unsigned int constraint)161295b7b453SJohn Marino duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
161395b7b453SJohn Marino {
161495b7b453SJohn Marino   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1615*09d4459fSDaniel Fojt   if (__glibc_likely (dup_idx != -1))
161695b7b453SJohn Marino     {
161795b7b453SJohn Marino       dfa->nodes[dup_idx].constraint = constraint;
161895b7b453SJohn Marino       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
161995b7b453SJohn Marino       dfa->nodes[dup_idx].duplicated = 1;
162095b7b453SJohn Marino 
162195b7b453SJohn Marino       /* Store the index of the original node.  */
162295b7b453SJohn Marino       dfa->org_indices[dup_idx] = org_idx;
162395b7b453SJohn Marino     }
162495b7b453SJohn Marino   return dup_idx;
162595b7b453SJohn Marino }
162695b7b453SJohn Marino 
162795b7b453SJohn Marino static reg_errcode_t
calc_inveclosure(re_dfa_t * dfa)162895b7b453SJohn Marino calc_inveclosure (re_dfa_t *dfa)
162995b7b453SJohn Marino {
163095b7b453SJohn Marino   Idx src, idx;
163195b7b453SJohn Marino   bool ok;
163295b7b453SJohn Marino   for (idx = 0; idx < dfa->nodes_len; ++idx)
163395b7b453SJohn Marino     re_node_set_init_empty (dfa->inveclosures + idx);
163495b7b453SJohn Marino 
163595b7b453SJohn Marino   for (src = 0; src < dfa->nodes_len; ++src)
163695b7b453SJohn Marino     {
163795b7b453SJohn Marino       Idx *elems = dfa->eclosures[src].elems;
163895b7b453SJohn Marino       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
163995b7b453SJohn Marino 	{
164095b7b453SJohn Marino 	  ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1641*09d4459fSDaniel Fojt 	  if (__glibc_unlikely (! ok))
164295b7b453SJohn Marino 	    return REG_ESPACE;
164395b7b453SJohn Marino 	}
164495b7b453SJohn Marino     }
164595b7b453SJohn Marino 
164695b7b453SJohn Marino   return REG_NOERROR;
164795b7b453SJohn Marino }
164895b7b453SJohn Marino 
164995b7b453SJohn Marino /* Calculate "eclosure" for all the node in DFA.  */
165095b7b453SJohn Marino 
165195b7b453SJohn Marino static reg_errcode_t
calc_eclosure(re_dfa_t * dfa)165295b7b453SJohn Marino calc_eclosure (re_dfa_t *dfa)
165395b7b453SJohn Marino {
165495b7b453SJohn Marino   Idx node_idx;
165595b7b453SJohn Marino   bool incomplete;
1656*09d4459fSDaniel Fojt   DEBUG_ASSERT (dfa->nodes_len > 0);
165795b7b453SJohn Marino   incomplete = false;
165895b7b453SJohn Marino   /* For each nodes, calculate epsilon closure.  */
165995b7b453SJohn Marino   for (node_idx = 0; ; ++node_idx)
166095b7b453SJohn Marino     {
166195b7b453SJohn Marino       reg_errcode_t err;
166295b7b453SJohn Marino       re_node_set eclosure_elem;
166395b7b453SJohn Marino       if (node_idx == dfa->nodes_len)
166495b7b453SJohn Marino 	{
166595b7b453SJohn Marino 	  if (!incomplete)
166695b7b453SJohn Marino 	    break;
166795b7b453SJohn Marino 	  incomplete = false;
166895b7b453SJohn Marino 	  node_idx = 0;
166995b7b453SJohn Marino 	}
167095b7b453SJohn Marino 
1671*09d4459fSDaniel Fojt       DEBUG_ASSERT (dfa->eclosures[node_idx].nelem != -1);
167295b7b453SJohn Marino 
167395b7b453SJohn Marino       /* If we have already calculated, skip it.  */
167495b7b453SJohn Marino       if (dfa->eclosures[node_idx].nelem != 0)
167595b7b453SJohn Marino 	continue;
1676cf28ed85SJohn Marino       /* Calculate epsilon closure of 'node_idx'.  */
167795b7b453SJohn Marino       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1678*09d4459fSDaniel Fojt       if (__glibc_unlikely (err != REG_NOERROR))
167995b7b453SJohn Marino 	return err;
168095b7b453SJohn Marino 
168195b7b453SJohn Marino       if (dfa->eclosures[node_idx].nelem == 0)
168295b7b453SJohn Marino 	{
168395b7b453SJohn Marino 	  incomplete = true;
168495b7b453SJohn Marino 	  re_node_set_free (&eclosure_elem);
168595b7b453SJohn Marino 	}
168695b7b453SJohn Marino     }
168795b7b453SJohn Marino   return REG_NOERROR;
168895b7b453SJohn Marino }
168995b7b453SJohn Marino 
169095b7b453SJohn Marino /* Calculate epsilon closure of NODE.  */
169195b7b453SJohn Marino 
169295b7b453SJohn Marino static reg_errcode_t
calc_eclosure_iter(re_node_set * new_set,re_dfa_t * dfa,Idx node,bool root)169395b7b453SJohn Marino calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
169495b7b453SJohn Marino {
169595b7b453SJohn Marino   reg_errcode_t err;
169695b7b453SJohn Marino   Idx i;
169795b7b453SJohn Marino   re_node_set eclosure;
169895b7b453SJohn Marino   bool ok;
169995b7b453SJohn Marino   bool incomplete = false;
170095b7b453SJohn Marino   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1701*09d4459fSDaniel Fojt   if (__glibc_unlikely (err != REG_NOERROR))
170295b7b453SJohn Marino     return err;
170395b7b453SJohn Marino 
170495b7b453SJohn Marino   /* This indicates that we are calculating this node now.
170595b7b453SJohn Marino      We reference this value to avoid infinite loop.  */
1706*09d4459fSDaniel Fojt   dfa->eclosures[node].nelem = -1;
170795b7b453SJohn Marino 
170895b7b453SJohn Marino   /* If the current node has constraints, duplicate all nodes
170995b7b453SJohn Marino      since they must inherit the constraints.  */
171095b7b453SJohn Marino   if (dfa->nodes[node].constraint
171195b7b453SJohn Marino       && dfa->edests[node].nelem
171295b7b453SJohn Marino       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
171395b7b453SJohn Marino     {
171495b7b453SJohn Marino       err = duplicate_node_closure (dfa, node, node, node,
171595b7b453SJohn Marino 				    dfa->nodes[node].constraint);
1716*09d4459fSDaniel Fojt       if (__glibc_unlikely (err != REG_NOERROR))
171795b7b453SJohn Marino 	return err;
171895b7b453SJohn Marino     }
171995b7b453SJohn Marino 
172095b7b453SJohn Marino   /* Expand each epsilon destination nodes.  */
172195b7b453SJohn Marino   if (IS_EPSILON_NODE(dfa->nodes[node].type))
172295b7b453SJohn Marino     for (i = 0; i < dfa->edests[node].nelem; ++i)
172395b7b453SJohn Marino       {
172495b7b453SJohn Marino 	re_node_set eclosure_elem;
172595b7b453SJohn Marino 	Idx edest = dfa->edests[node].elems[i];
1726cf28ed85SJohn Marino 	/* If calculating the epsilon closure of 'edest' is in progress,
172795b7b453SJohn Marino 	   return intermediate result.  */
1728*09d4459fSDaniel Fojt 	if (dfa->eclosures[edest].nelem == -1)
172995b7b453SJohn Marino 	  {
173095b7b453SJohn Marino 	    incomplete = true;
173195b7b453SJohn Marino 	    continue;
173295b7b453SJohn Marino 	  }
1733cf28ed85SJohn Marino 	/* If we haven't calculated the epsilon closure of 'edest' yet,
173495b7b453SJohn Marino 	   calculate now. Otherwise use calculated epsilon closure.  */
173595b7b453SJohn Marino 	if (dfa->eclosures[edest].nelem == 0)
173695b7b453SJohn Marino 	  {
173795b7b453SJohn Marino 	    err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1738*09d4459fSDaniel Fojt 	    if (__glibc_unlikely (err != REG_NOERROR))
173995b7b453SJohn Marino 	      return err;
174095b7b453SJohn Marino 	  }
174195b7b453SJohn Marino 	else
174295b7b453SJohn Marino 	  eclosure_elem = dfa->eclosures[edest];
1743cf28ed85SJohn Marino 	/* Merge the epsilon closure of 'edest'.  */
174495b7b453SJohn Marino 	err = re_node_set_merge (&eclosure, &eclosure_elem);
1745*09d4459fSDaniel Fojt 	if (__glibc_unlikely (err != REG_NOERROR))
174695b7b453SJohn Marino 	  return err;
1747cf28ed85SJohn Marino 	/* If the epsilon closure of 'edest' is incomplete,
174895b7b453SJohn Marino 	   the epsilon closure of this node is also incomplete.  */
174995b7b453SJohn Marino 	if (dfa->eclosures[edest].nelem == 0)
175095b7b453SJohn Marino 	  {
175195b7b453SJohn Marino 	    incomplete = true;
175295b7b453SJohn Marino 	    re_node_set_free (&eclosure_elem);
175395b7b453SJohn Marino 	  }
175495b7b453SJohn Marino       }
175595b7b453SJohn Marino 
175695b7b453SJohn Marino   /* An epsilon closure includes itself.  */
175795b7b453SJohn Marino   ok = re_node_set_insert (&eclosure, node);
1758*09d4459fSDaniel Fojt   if (__glibc_unlikely (! ok))
175995b7b453SJohn Marino     return REG_ESPACE;
176095b7b453SJohn Marino   if (incomplete && !root)
176195b7b453SJohn Marino     dfa->eclosures[node].nelem = 0;
176295b7b453SJohn Marino   else
176395b7b453SJohn Marino     dfa->eclosures[node] = eclosure;
176495b7b453SJohn Marino   *new_set = eclosure;
176595b7b453SJohn Marino   return REG_NOERROR;
176695b7b453SJohn Marino }
176795b7b453SJohn Marino 
176895b7b453SJohn Marino /* Functions for token which are used in the parser.  */
176995b7b453SJohn Marino 
177095b7b453SJohn Marino /* Fetch a token from INPUT.
177195b7b453SJohn Marino    We must not use this function inside bracket expressions.  */
177295b7b453SJohn Marino 
177395b7b453SJohn Marino static void
fetch_token(re_token_t * result,re_string_t * input,reg_syntax_t syntax)177495b7b453SJohn Marino fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
177595b7b453SJohn Marino {
177695b7b453SJohn Marino   re_string_skip_bytes (input, peek_token (result, input, syntax));
177795b7b453SJohn Marino }
177895b7b453SJohn Marino 
177995b7b453SJohn Marino /* Peek a token from INPUT, and return the length of the token.
178095b7b453SJohn Marino    We must not use this function inside bracket expressions.  */
178195b7b453SJohn Marino 
178295b7b453SJohn Marino static int
peek_token(re_token_t * token,re_string_t * input,reg_syntax_t syntax)178395b7b453SJohn Marino peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
178495b7b453SJohn Marino {
178595b7b453SJohn Marino   unsigned char c;
178695b7b453SJohn Marino 
178795b7b453SJohn Marino   if (re_string_eoi (input))
178895b7b453SJohn Marino     {
178995b7b453SJohn Marino       token->type = END_OF_RE;
179095b7b453SJohn Marino       return 0;
179195b7b453SJohn Marino     }
179295b7b453SJohn Marino 
179395b7b453SJohn Marino   c = re_string_peek_byte (input, 0);
179495b7b453SJohn Marino   token->opr.c = c;
179595b7b453SJohn Marino 
179695b7b453SJohn Marino   token->word_char = 0;
179795b7b453SJohn Marino #ifdef RE_ENABLE_I18N
179895b7b453SJohn Marino   token->mb_partial = 0;
1799*09d4459fSDaniel Fojt   if (input->mb_cur_max > 1
1800*09d4459fSDaniel Fojt       && !re_string_first_byte (input, re_string_cur_idx (input)))
180195b7b453SJohn Marino     {
180295b7b453SJohn Marino       token->type = CHARACTER;
180395b7b453SJohn Marino       token->mb_partial = 1;
180495b7b453SJohn Marino       return 1;
180595b7b453SJohn Marino     }
180695b7b453SJohn Marino #endif
180795b7b453SJohn Marino   if (c == '\\')
180895b7b453SJohn Marino     {
180995b7b453SJohn Marino       unsigned char c2;
181095b7b453SJohn Marino       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
181195b7b453SJohn Marino 	{
181295b7b453SJohn Marino 	  token->type = BACK_SLASH;
181395b7b453SJohn Marino 	  return 1;
181495b7b453SJohn Marino 	}
181595b7b453SJohn Marino 
181695b7b453SJohn Marino       c2 = re_string_peek_byte_case (input, 1);
181795b7b453SJohn Marino       token->opr.c = c2;
181895b7b453SJohn Marino       token->type = CHARACTER;
181995b7b453SJohn Marino #ifdef RE_ENABLE_I18N
182095b7b453SJohn Marino       if (input->mb_cur_max > 1)
182195b7b453SJohn Marino 	{
182295b7b453SJohn Marino 	  wint_t wc = re_string_wchar_at (input,
182395b7b453SJohn Marino 					  re_string_cur_idx (input) + 1);
182495b7b453SJohn Marino 	  token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
182595b7b453SJohn Marino 	}
182695b7b453SJohn Marino       else
182795b7b453SJohn Marino #endif
182895b7b453SJohn Marino 	token->word_char = IS_WORD_CHAR (c2) != 0;
182995b7b453SJohn Marino 
183095b7b453SJohn Marino       switch (c2)
183195b7b453SJohn Marino 	{
183295b7b453SJohn Marino 	case '|':
183395b7b453SJohn Marino 	  if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
183495b7b453SJohn Marino 	    token->type = OP_ALT;
183595b7b453SJohn Marino 	  break;
183695b7b453SJohn Marino 	case '1': case '2': case '3': case '4': case '5':
183795b7b453SJohn Marino 	case '6': case '7': case '8': case '9':
183895b7b453SJohn Marino 	  if (!(syntax & RE_NO_BK_REFS))
183995b7b453SJohn Marino 	    {
184095b7b453SJohn Marino 	      token->type = OP_BACK_REF;
184195b7b453SJohn Marino 	      token->opr.idx = c2 - '1';
184295b7b453SJohn Marino 	    }
184395b7b453SJohn Marino 	  break;
184495b7b453SJohn Marino 	case '<':
184595b7b453SJohn Marino 	  if (!(syntax & RE_NO_GNU_OPS))
184695b7b453SJohn Marino 	    {
184795b7b453SJohn Marino 	      token->type = ANCHOR;
184895b7b453SJohn Marino 	      token->opr.ctx_type = WORD_FIRST;
184995b7b453SJohn Marino 	    }
185095b7b453SJohn Marino 	  break;
185195b7b453SJohn Marino 	case '>':
185295b7b453SJohn Marino 	  if (!(syntax & RE_NO_GNU_OPS))
185395b7b453SJohn Marino 	    {
185495b7b453SJohn Marino 	      token->type = ANCHOR;
185595b7b453SJohn Marino 	      token->opr.ctx_type = WORD_LAST;
185695b7b453SJohn Marino 	    }
185795b7b453SJohn Marino 	  break;
185895b7b453SJohn Marino 	case 'b':
185995b7b453SJohn Marino 	  if (!(syntax & RE_NO_GNU_OPS))
186095b7b453SJohn Marino 	    {
186195b7b453SJohn Marino 	      token->type = ANCHOR;
186295b7b453SJohn Marino 	      token->opr.ctx_type = WORD_DELIM;
186395b7b453SJohn Marino 	    }
186495b7b453SJohn Marino 	  break;
186595b7b453SJohn Marino 	case 'B':
186695b7b453SJohn Marino 	  if (!(syntax & RE_NO_GNU_OPS))
186795b7b453SJohn Marino 	    {
186895b7b453SJohn Marino 	      token->type = ANCHOR;
186995b7b453SJohn Marino 	      token->opr.ctx_type = NOT_WORD_DELIM;
187095b7b453SJohn Marino 	    }
187195b7b453SJohn Marino 	  break;
187295b7b453SJohn Marino 	case 'w':
187395b7b453SJohn Marino 	  if (!(syntax & RE_NO_GNU_OPS))
187495b7b453SJohn Marino 	    token->type = OP_WORD;
187595b7b453SJohn Marino 	  break;
187695b7b453SJohn Marino 	case 'W':
187795b7b453SJohn Marino 	  if (!(syntax & RE_NO_GNU_OPS))
187895b7b453SJohn Marino 	    token->type = OP_NOTWORD;
187995b7b453SJohn Marino 	  break;
188095b7b453SJohn Marino 	case 's':
188195b7b453SJohn Marino 	  if (!(syntax & RE_NO_GNU_OPS))
188295b7b453SJohn Marino 	    token->type = OP_SPACE;
188395b7b453SJohn Marino 	  break;
188495b7b453SJohn Marino 	case 'S':
188595b7b453SJohn Marino 	  if (!(syntax & RE_NO_GNU_OPS))
188695b7b453SJohn Marino 	    token->type = OP_NOTSPACE;
188795b7b453SJohn Marino 	  break;
188895b7b453SJohn Marino 	case '`':
188995b7b453SJohn Marino 	  if (!(syntax & RE_NO_GNU_OPS))
189095b7b453SJohn Marino 	    {
189195b7b453SJohn Marino 	      token->type = ANCHOR;
189295b7b453SJohn Marino 	      token->opr.ctx_type = BUF_FIRST;
189395b7b453SJohn Marino 	    }
189495b7b453SJohn Marino 	  break;
189595b7b453SJohn Marino 	case '\'':
189695b7b453SJohn Marino 	  if (!(syntax & RE_NO_GNU_OPS))
189795b7b453SJohn Marino 	    {
189895b7b453SJohn Marino 	      token->type = ANCHOR;
189995b7b453SJohn Marino 	      token->opr.ctx_type = BUF_LAST;
190095b7b453SJohn Marino 	    }
190195b7b453SJohn Marino 	  break;
190295b7b453SJohn Marino 	case '(':
190395b7b453SJohn Marino 	  if (!(syntax & RE_NO_BK_PARENS))
190495b7b453SJohn Marino 	    token->type = OP_OPEN_SUBEXP;
190595b7b453SJohn Marino 	  break;
190695b7b453SJohn Marino 	case ')':
190795b7b453SJohn Marino 	  if (!(syntax & RE_NO_BK_PARENS))
190895b7b453SJohn Marino 	    token->type = OP_CLOSE_SUBEXP;
190995b7b453SJohn Marino 	  break;
191095b7b453SJohn Marino 	case '+':
191195b7b453SJohn Marino 	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
191295b7b453SJohn Marino 	    token->type = OP_DUP_PLUS;
191395b7b453SJohn Marino 	  break;
191495b7b453SJohn Marino 	case '?':
191595b7b453SJohn Marino 	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
191695b7b453SJohn Marino 	    token->type = OP_DUP_QUESTION;
191795b7b453SJohn Marino 	  break;
191895b7b453SJohn Marino 	case '{':
191995b7b453SJohn Marino 	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
192095b7b453SJohn Marino 	    token->type = OP_OPEN_DUP_NUM;
192195b7b453SJohn Marino 	  break;
192295b7b453SJohn Marino 	case '}':
192395b7b453SJohn Marino 	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
192495b7b453SJohn Marino 	    token->type = OP_CLOSE_DUP_NUM;
192595b7b453SJohn Marino 	  break;
192695b7b453SJohn Marino 	default:
192795b7b453SJohn Marino 	  break;
192895b7b453SJohn Marino 	}
192995b7b453SJohn Marino       return 2;
193095b7b453SJohn Marino     }
193195b7b453SJohn Marino 
193295b7b453SJohn Marino   token->type = CHARACTER;
193395b7b453SJohn Marino #ifdef RE_ENABLE_I18N
193495b7b453SJohn Marino   if (input->mb_cur_max > 1)
193595b7b453SJohn Marino     {
193695b7b453SJohn Marino       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
193795b7b453SJohn Marino       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
193895b7b453SJohn Marino     }
193995b7b453SJohn Marino   else
194095b7b453SJohn Marino #endif
194195b7b453SJohn Marino     token->word_char = IS_WORD_CHAR (token->opr.c);
194295b7b453SJohn Marino 
194395b7b453SJohn Marino   switch (c)
194495b7b453SJohn Marino     {
194595b7b453SJohn Marino     case '\n':
194695b7b453SJohn Marino       if (syntax & RE_NEWLINE_ALT)
194795b7b453SJohn Marino 	token->type = OP_ALT;
194895b7b453SJohn Marino       break;
194995b7b453SJohn Marino     case '|':
195095b7b453SJohn Marino       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
195195b7b453SJohn Marino 	token->type = OP_ALT;
195295b7b453SJohn Marino       break;
195395b7b453SJohn Marino     case '*':
195495b7b453SJohn Marino       token->type = OP_DUP_ASTERISK;
195595b7b453SJohn Marino       break;
195695b7b453SJohn Marino     case '+':
195795b7b453SJohn Marino       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
195895b7b453SJohn Marino 	token->type = OP_DUP_PLUS;
195995b7b453SJohn Marino       break;
196095b7b453SJohn Marino     case '?':
196195b7b453SJohn Marino       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
196295b7b453SJohn Marino 	token->type = OP_DUP_QUESTION;
196395b7b453SJohn Marino       break;
196495b7b453SJohn Marino     case '{':
196595b7b453SJohn Marino       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
196695b7b453SJohn Marino 	token->type = OP_OPEN_DUP_NUM;
196795b7b453SJohn Marino       break;
196895b7b453SJohn Marino     case '}':
196995b7b453SJohn Marino       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
197095b7b453SJohn Marino 	token->type = OP_CLOSE_DUP_NUM;
197195b7b453SJohn Marino       break;
197295b7b453SJohn Marino     case '(':
197395b7b453SJohn Marino       if (syntax & RE_NO_BK_PARENS)
197495b7b453SJohn Marino 	token->type = OP_OPEN_SUBEXP;
197595b7b453SJohn Marino       break;
197695b7b453SJohn Marino     case ')':
197795b7b453SJohn Marino       if (syntax & RE_NO_BK_PARENS)
197895b7b453SJohn Marino 	token->type = OP_CLOSE_SUBEXP;
197995b7b453SJohn Marino       break;
198095b7b453SJohn Marino     case '[':
198195b7b453SJohn Marino       token->type = OP_OPEN_BRACKET;
198295b7b453SJohn Marino       break;
198395b7b453SJohn Marino     case '.':
198495b7b453SJohn Marino       token->type = OP_PERIOD;
198595b7b453SJohn Marino       break;
198695b7b453SJohn Marino     case '^':
1987*09d4459fSDaniel Fojt       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE))
1988*09d4459fSDaniel Fojt 	  && re_string_cur_idx (input) != 0)
198995b7b453SJohn Marino 	{
199095b7b453SJohn Marino 	  char prev = re_string_peek_byte (input, -1);
199195b7b453SJohn Marino 	  if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
199295b7b453SJohn Marino 	    break;
199395b7b453SJohn Marino 	}
199495b7b453SJohn Marino       token->type = ANCHOR;
199595b7b453SJohn Marino       token->opr.ctx_type = LINE_FIRST;
199695b7b453SJohn Marino       break;
199795b7b453SJohn Marino     case '$':
1998*09d4459fSDaniel Fojt       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS)
1999*09d4459fSDaniel Fojt 	  && re_string_cur_idx (input) + 1 != re_string_length (input))
200095b7b453SJohn Marino 	{
200195b7b453SJohn Marino 	  re_token_t next;
200295b7b453SJohn Marino 	  re_string_skip_bytes (input, 1);
200395b7b453SJohn Marino 	  peek_token (&next, input, syntax);
200495b7b453SJohn Marino 	  re_string_skip_bytes (input, -1);
200595b7b453SJohn Marino 	  if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
200695b7b453SJohn Marino 	    break;
200795b7b453SJohn Marino 	}
200895b7b453SJohn Marino       token->type = ANCHOR;
200995b7b453SJohn Marino       token->opr.ctx_type = LINE_LAST;
201095b7b453SJohn Marino       break;
201195b7b453SJohn Marino     default:
201295b7b453SJohn Marino       break;
201395b7b453SJohn Marino     }
201495b7b453SJohn Marino   return 1;
201595b7b453SJohn Marino }
201695b7b453SJohn Marino 
201795b7b453SJohn Marino /* Peek a token from INPUT, and return the length of the token.
201895b7b453SJohn Marino    We must not use this function out of bracket expressions.  */
201995b7b453SJohn Marino 
202095b7b453SJohn Marino static int
peek_token_bracket(re_token_t * token,re_string_t * input,reg_syntax_t syntax)202195b7b453SJohn Marino peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
202295b7b453SJohn Marino {
202395b7b453SJohn Marino   unsigned char c;
202495b7b453SJohn Marino   if (re_string_eoi (input))
202595b7b453SJohn Marino     {
202695b7b453SJohn Marino       token->type = END_OF_RE;
202795b7b453SJohn Marino       return 0;
202895b7b453SJohn Marino     }
202995b7b453SJohn Marino   c = re_string_peek_byte (input, 0);
203095b7b453SJohn Marino   token->opr.c = c;
203195b7b453SJohn Marino 
203295b7b453SJohn Marino #ifdef RE_ENABLE_I18N
2033*09d4459fSDaniel Fojt   if (input->mb_cur_max > 1
2034*09d4459fSDaniel Fojt       && !re_string_first_byte (input, re_string_cur_idx (input)))
203595b7b453SJohn Marino     {
203695b7b453SJohn Marino       token->type = CHARACTER;
203795b7b453SJohn Marino       return 1;
203895b7b453SJohn Marino     }
203995b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
204095b7b453SJohn Marino 
204195b7b453SJohn Marino   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
204295b7b453SJohn Marino       && re_string_cur_idx (input) + 1 < re_string_length (input))
204395b7b453SJohn Marino     {
204495b7b453SJohn Marino       /* In this case, '\' escape a character.  */
204595b7b453SJohn Marino       unsigned char c2;
204695b7b453SJohn Marino       re_string_skip_bytes (input, 1);
204795b7b453SJohn Marino       c2 = re_string_peek_byte (input, 0);
204895b7b453SJohn Marino       token->opr.c = c2;
204995b7b453SJohn Marino       token->type = CHARACTER;
205095b7b453SJohn Marino       return 1;
205195b7b453SJohn Marino     }
205295b7b453SJohn Marino   if (c == '[') /* '[' is a special char in a bracket exps.  */
205395b7b453SJohn Marino     {
205495b7b453SJohn Marino       unsigned char c2;
205595b7b453SJohn Marino       int token_len;
205695b7b453SJohn Marino       if (re_string_cur_idx (input) + 1 < re_string_length (input))
205795b7b453SJohn Marino 	c2 = re_string_peek_byte (input, 1);
205895b7b453SJohn Marino       else
205995b7b453SJohn Marino 	c2 = 0;
206095b7b453SJohn Marino       token->opr.c = c2;
206195b7b453SJohn Marino       token_len = 2;
206295b7b453SJohn Marino       switch (c2)
206395b7b453SJohn Marino 	{
206495b7b453SJohn Marino 	case '.':
206595b7b453SJohn Marino 	  token->type = OP_OPEN_COLL_ELEM;
206695b7b453SJohn Marino 	  break;
2067*09d4459fSDaniel Fojt 
206895b7b453SJohn Marino 	case '=':
206995b7b453SJohn Marino 	  token->type = OP_OPEN_EQUIV_CLASS;
207095b7b453SJohn Marino 	  break;
2071*09d4459fSDaniel Fojt 
207295b7b453SJohn Marino 	case ':':
207395b7b453SJohn Marino 	  if (syntax & RE_CHAR_CLASSES)
207495b7b453SJohn Marino 	    {
207595b7b453SJohn Marino 	      token->type = OP_OPEN_CHAR_CLASS;
207695b7b453SJohn Marino 	      break;
207795b7b453SJohn Marino 	    }
2078*09d4459fSDaniel Fojt 	  FALLTHROUGH;
207995b7b453SJohn Marino 	default:
208095b7b453SJohn Marino 	  token->type = CHARACTER;
208195b7b453SJohn Marino 	  token->opr.c = c;
208295b7b453SJohn Marino 	  token_len = 1;
208395b7b453SJohn Marino 	  break;
208495b7b453SJohn Marino 	}
208595b7b453SJohn Marino       return token_len;
208695b7b453SJohn Marino     }
208795b7b453SJohn Marino   switch (c)
208895b7b453SJohn Marino     {
208995b7b453SJohn Marino     case '-':
209095b7b453SJohn Marino       token->type = OP_CHARSET_RANGE;
209195b7b453SJohn Marino       break;
209295b7b453SJohn Marino     case ']':
209395b7b453SJohn Marino       token->type = OP_CLOSE_BRACKET;
209495b7b453SJohn Marino       break;
209595b7b453SJohn Marino     case '^':
209695b7b453SJohn Marino       token->type = OP_NON_MATCH_LIST;
209795b7b453SJohn Marino       break;
209895b7b453SJohn Marino     default:
209995b7b453SJohn Marino       token->type = CHARACTER;
210095b7b453SJohn Marino     }
210195b7b453SJohn Marino   return 1;
210295b7b453SJohn Marino }
210395b7b453SJohn Marino 
210495b7b453SJohn Marino /* Functions for parser.  */
210595b7b453SJohn Marino 
210695b7b453SJohn Marino /* Entry point of the parser.
210795b7b453SJohn Marino    Parse the regular expression REGEXP and return the structure tree.
2108cf28ed85SJohn Marino    If an error occurs, ERR is set by error code, and return NULL.
210995b7b453SJohn Marino    This function build the following tree, from regular expression <reg_exp>:
211095b7b453SJohn Marino 	   CAT
211195b7b453SJohn Marino 	   / \
211295b7b453SJohn Marino 	  /   \
211395b7b453SJohn Marino    <reg_exp>  EOR
211495b7b453SJohn Marino 
211595b7b453SJohn Marino    CAT means concatenation.
211695b7b453SJohn Marino    EOR means end of regular expression.  */
211795b7b453SJohn Marino 
211895b7b453SJohn Marino static bin_tree_t *
parse(re_string_t * regexp,regex_t * preg,reg_syntax_t syntax,reg_errcode_t * err)211995b7b453SJohn Marino parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
212095b7b453SJohn Marino        reg_errcode_t *err)
212195b7b453SJohn Marino {
2122cf28ed85SJohn Marino   re_dfa_t *dfa = preg->buffer;
212395b7b453SJohn Marino   bin_tree_t *tree, *eor, *root;
212495b7b453SJohn Marino   re_token_t current_token;
212595b7b453SJohn Marino   dfa->syntax = syntax;
212695b7b453SJohn Marino   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
212795b7b453SJohn Marino   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2128*09d4459fSDaniel Fojt   if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
212995b7b453SJohn Marino     return NULL;
213095b7b453SJohn Marino   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
213195b7b453SJohn Marino   if (tree != NULL)
213295b7b453SJohn Marino     root = create_tree (dfa, tree, eor, CONCAT);
213395b7b453SJohn Marino   else
213495b7b453SJohn Marino     root = eor;
2135*09d4459fSDaniel Fojt   if (__glibc_unlikely (eor == NULL || root == NULL))
213695b7b453SJohn Marino     {
213795b7b453SJohn Marino       *err = REG_ESPACE;
213895b7b453SJohn Marino       return NULL;
213995b7b453SJohn Marino     }
214095b7b453SJohn Marino   return root;
214195b7b453SJohn Marino }
214295b7b453SJohn Marino 
214395b7b453SJohn Marino /* This function build the following tree, from regular expression
214495b7b453SJohn Marino    <branch1>|<branch2>:
214595b7b453SJohn Marino 	   ALT
214695b7b453SJohn Marino 	   / \
214795b7b453SJohn Marino 	  /   \
214895b7b453SJohn Marino    <branch1> <branch2>
214995b7b453SJohn Marino 
2150cf28ed85SJohn Marino    ALT means alternative, which represents the operator '|'.  */
215195b7b453SJohn Marino 
215295b7b453SJohn Marino static bin_tree_t *
parse_reg_exp(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)215395b7b453SJohn Marino parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
215495b7b453SJohn Marino 	       reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
215595b7b453SJohn Marino {
2156cf28ed85SJohn Marino   re_dfa_t *dfa = preg->buffer;
215795b7b453SJohn Marino   bin_tree_t *tree, *branch = NULL;
2158dc7c36e4SJohn Marino   bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
215995b7b453SJohn Marino   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2160*09d4459fSDaniel Fojt   if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
216195b7b453SJohn Marino     return NULL;
216295b7b453SJohn Marino 
216395b7b453SJohn Marino   while (token->type == OP_ALT)
216495b7b453SJohn Marino     {
216595b7b453SJohn Marino       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
216695b7b453SJohn Marino       if (token->type != OP_ALT && token->type != END_OF_RE
216795b7b453SJohn Marino 	  && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
216895b7b453SJohn Marino 	{
2169dc7c36e4SJohn Marino 	  bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
2170dc7c36e4SJohn Marino 	  dfa->completed_bkref_map = initial_bkref_map;
217195b7b453SJohn Marino 	  branch = parse_branch (regexp, preg, token, syntax, nest, err);
2172*09d4459fSDaniel Fojt 	  if (__glibc_unlikely (*err != REG_NOERROR && branch == NULL))
2173dc7c36e4SJohn Marino 	    {
2174dc7c36e4SJohn Marino 	      if (tree != NULL)
2175dc7c36e4SJohn Marino 		postorder (tree, free_tree, NULL);
217695b7b453SJohn Marino 	      return NULL;
217795b7b453SJohn Marino 	    }
2178dc7c36e4SJohn Marino 	  dfa->completed_bkref_map |= accumulated_bkref_map;
2179dc7c36e4SJohn Marino 	}
218095b7b453SJohn Marino       else
218195b7b453SJohn Marino 	branch = NULL;
218295b7b453SJohn Marino       tree = create_tree (dfa, tree, branch, OP_ALT);
2183*09d4459fSDaniel Fojt       if (__glibc_unlikely (tree == NULL))
218495b7b453SJohn Marino 	{
218595b7b453SJohn Marino 	  *err = REG_ESPACE;
218695b7b453SJohn Marino 	  return NULL;
218795b7b453SJohn Marino 	}
218895b7b453SJohn Marino     }
218995b7b453SJohn Marino   return tree;
219095b7b453SJohn Marino }
219195b7b453SJohn Marino 
219295b7b453SJohn Marino /* This function build the following tree, from regular expression
219395b7b453SJohn Marino    <exp1><exp2>:
219495b7b453SJohn Marino 	CAT
219595b7b453SJohn Marino 	/ \
219695b7b453SJohn Marino        /   \
219795b7b453SJohn Marino    <exp1> <exp2>
219895b7b453SJohn Marino 
219995b7b453SJohn Marino    CAT means concatenation.  */
220095b7b453SJohn Marino 
220195b7b453SJohn Marino static bin_tree_t *
parse_branch(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)220295b7b453SJohn Marino parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
220395b7b453SJohn Marino 	      reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
220495b7b453SJohn Marino {
220595b7b453SJohn Marino   bin_tree_t *tree, *expr;
2206cf28ed85SJohn Marino   re_dfa_t *dfa = preg->buffer;
220795b7b453SJohn Marino   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2208*09d4459fSDaniel Fojt   if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
220995b7b453SJohn Marino     return NULL;
221095b7b453SJohn Marino 
221195b7b453SJohn Marino   while (token->type != OP_ALT && token->type != END_OF_RE
221295b7b453SJohn Marino 	 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
221395b7b453SJohn Marino     {
221495b7b453SJohn Marino       expr = parse_expression (regexp, preg, token, syntax, nest, err);
2215*09d4459fSDaniel Fojt       if (__glibc_unlikely (*err != REG_NOERROR && expr == NULL))
221695b7b453SJohn Marino 	{
2217cf28ed85SJohn Marino 	  if (tree != NULL)
2218cf28ed85SJohn Marino 	    postorder (tree, free_tree, NULL);
221995b7b453SJohn Marino 	  return NULL;
222095b7b453SJohn Marino 	}
222195b7b453SJohn Marino       if (tree != NULL && expr != NULL)
222295b7b453SJohn Marino 	{
2223cf28ed85SJohn Marino 	  bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2224cf28ed85SJohn Marino 	  if (newtree == NULL)
222595b7b453SJohn Marino 	    {
2226cf28ed85SJohn Marino 	      postorder (expr, free_tree, NULL);
2227cf28ed85SJohn Marino 	      postorder (tree, free_tree, NULL);
222895b7b453SJohn Marino 	      *err = REG_ESPACE;
222995b7b453SJohn Marino 	      return NULL;
223095b7b453SJohn Marino 	    }
2231cf28ed85SJohn Marino 	  tree = newtree;
223295b7b453SJohn Marino 	}
223395b7b453SJohn Marino       else if (tree == NULL)
223495b7b453SJohn Marino 	tree = expr;
223595b7b453SJohn Marino       /* Otherwise expr == NULL, we don't need to create new tree.  */
223695b7b453SJohn Marino     }
223795b7b453SJohn Marino   return tree;
223895b7b453SJohn Marino }
223995b7b453SJohn Marino 
224095b7b453SJohn Marino /* This function build the following tree, from regular expression a*:
224195b7b453SJohn Marino 	 *
224295b7b453SJohn Marino 	 |
224395b7b453SJohn Marino 	 a
224495b7b453SJohn Marino */
224595b7b453SJohn Marino 
224695b7b453SJohn Marino static bin_tree_t *
parse_expression(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)224795b7b453SJohn Marino parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
224895b7b453SJohn Marino 		  reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
224995b7b453SJohn Marino {
2250cf28ed85SJohn Marino   re_dfa_t *dfa = preg->buffer;
225195b7b453SJohn Marino   bin_tree_t *tree;
225295b7b453SJohn Marino   switch (token->type)
225395b7b453SJohn Marino     {
225495b7b453SJohn Marino     case CHARACTER:
225595b7b453SJohn Marino       tree = create_token_tree (dfa, NULL, NULL, token);
2256*09d4459fSDaniel Fojt       if (__glibc_unlikely (tree == NULL))
225795b7b453SJohn Marino 	{
225895b7b453SJohn Marino 	  *err = REG_ESPACE;
225995b7b453SJohn Marino 	  return NULL;
226095b7b453SJohn Marino 	}
226195b7b453SJohn Marino #ifdef RE_ENABLE_I18N
226295b7b453SJohn Marino       if (dfa->mb_cur_max > 1)
226395b7b453SJohn Marino 	{
226495b7b453SJohn Marino 	  while (!re_string_eoi (regexp)
226595b7b453SJohn Marino 		 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
226695b7b453SJohn Marino 	    {
226795b7b453SJohn Marino 	      bin_tree_t *mbc_remain;
226895b7b453SJohn Marino 	      fetch_token (token, regexp, syntax);
226995b7b453SJohn Marino 	      mbc_remain = create_token_tree (dfa, NULL, NULL, token);
227095b7b453SJohn Marino 	      tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2271*09d4459fSDaniel Fojt 	      if (__glibc_unlikely (mbc_remain == NULL || tree == NULL))
227295b7b453SJohn Marino 		{
227395b7b453SJohn Marino 		  *err = REG_ESPACE;
227495b7b453SJohn Marino 		  return NULL;
227595b7b453SJohn Marino 		}
227695b7b453SJohn Marino 	    }
227795b7b453SJohn Marino 	}
227895b7b453SJohn Marino #endif
227995b7b453SJohn Marino       break;
2280*09d4459fSDaniel Fojt 
228195b7b453SJohn Marino     case OP_OPEN_SUBEXP:
228295b7b453SJohn Marino       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2283*09d4459fSDaniel Fojt       if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
228495b7b453SJohn Marino 	return NULL;
228595b7b453SJohn Marino       break;
2286*09d4459fSDaniel Fojt 
228795b7b453SJohn Marino     case OP_OPEN_BRACKET:
228895b7b453SJohn Marino       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2289*09d4459fSDaniel Fojt       if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
229095b7b453SJohn Marino 	return NULL;
229195b7b453SJohn Marino       break;
2292*09d4459fSDaniel Fojt 
229395b7b453SJohn Marino     case OP_BACK_REF:
2294*09d4459fSDaniel Fojt       if (!__glibc_likely (dfa->completed_bkref_map & (1 << token->opr.idx)))
229595b7b453SJohn Marino 	{
229695b7b453SJohn Marino 	  *err = REG_ESUBREG;
229795b7b453SJohn Marino 	  return NULL;
229895b7b453SJohn Marino 	}
229995b7b453SJohn Marino       dfa->used_bkref_map |= 1 << token->opr.idx;
230095b7b453SJohn Marino       tree = create_token_tree (dfa, NULL, NULL, token);
2301*09d4459fSDaniel Fojt       if (__glibc_unlikely (tree == NULL))
230295b7b453SJohn Marino 	{
230395b7b453SJohn Marino 	  *err = REG_ESPACE;
230495b7b453SJohn Marino 	  return NULL;
230595b7b453SJohn Marino 	}
230695b7b453SJohn Marino       ++dfa->nbackref;
230795b7b453SJohn Marino       dfa->has_mb_node = 1;
230895b7b453SJohn Marino       break;
2309*09d4459fSDaniel Fojt 
231095b7b453SJohn Marino     case OP_OPEN_DUP_NUM:
231195b7b453SJohn Marino       if (syntax & RE_CONTEXT_INVALID_DUP)
231295b7b453SJohn Marino 	{
231395b7b453SJohn Marino 	  *err = REG_BADRPT;
231495b7b453SJohn Marino 	  return NULL;
231595b7b453SJohn Marino 	}
2316*09d4459fSDaniel Fojt       FALLTHROUGH;
231795b7b453SJohn Marino     case OP_DUP_ASTERISK:
231895b7b453SJohn Marino     case OP_DUP_PLUS:
231995b7b453SJohn Marino     case OP_DUP_QUESTION:
232095b7b453SJohn Marino       if (syntax & RE_CONTEXT_INVALID_OPS)
232195b7b453SJohn Marino 	{
232295b7b453SJohn Marino 	  *err = REG_BADRPT;
232395b7b453SJohn Marino 	  return NULL;
232495b7b453SJohn Marino 	}
232595b7b453SJohn Marino       else if (syntax & RE_CONTEXT_INDEP_OPS)
232695b7b453SJohn Marino 	{
232795b7b453SJohn Marino 	  fetch_token (token, regexp, syntax);
232895b7b453SJohn Marino 	  return parse_expression (regexp, preg, token, syntax, nest, err);
232995b7b453SJohn Marino 	}
2330*09d4459fSDaniel Fojt       FALLTHROUGH;
233195b7b453SJohn Marino     case OP_CLOSE_SUBEXP:
2332*09d4459fSDaniel Fojt       if ((token->type == OP_CLOSE_SUBEXP)
2333*09d4459fSDaniel Fojt 	  && !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
233495b7b453SJohn Marino 	{
233595b7b453SJohn Marino 	  *err = REG_ERPAREN;
233695b7b453SJohn Marino 	  return NULL;
233795b7b453SJohn Marino 	}
2338*09d4459fSDaniel Fojt       FALLTHROUGH;
233995b7b453SJohn Marino     case OP_CLOSE_DUP_NUM:
234095b7b453SJohn Marino       /* We treat it as a normal character.  */
234195b7b453SJohn Marino 
234295b7b453SJohn Marino       /* Then we can these characters as normal characters.  */
234395b7b453SJohn Marino       token->type = CHARACTER;
234495b7b453SJohn Marino       /* mb_partial and word_char bits should be initialized already
234595b7b453SJohn Marino 	 by peek_token.  */
234695b7b453SJohn Marino       tree = create_token_tree (dfa, NULL, NULL, token);
2347*09d4459fSDaniel Fojt       if (__glibc_unlikely (tree == NULL))
234895b7b453SJohn Marino 	{
234995b7b453SJohn Marino 	  *err = REG_ESPACE;
235095b7b453SJohn Marino 	  return NULL;
235195b7b453SJohn Marino 	}
235295b7b453SJohn Marino       break;
2353*09d4459fSDaniel Fojt 
235495b7b453SJohn Marino     case ANCHOR:
235595b7b453SJohn Marino       if ((token->opr.ctx_type
235695b7b453SJohn Marino 	   & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
235795b7b453SJohn Marino 	  && dfa->word_ops_used == 0)
235895b7b453SJohn Marino 	init_word_char (dfa);
235995b7b453SJohn Marino       if (token->opr.ctx_type == WORD_DELIM
236095b7b453SJohn Marino 	  || token->opr.ctx_type == NOT_WORD_DELIM)
236195b7b453SJohn Marino 	{
236295b7b453SJohn Marino 	  bin_tree_t *tree_first, *tree_last;
236395b7b453SJohn Marino 	  if (token->opr.ctx_type == WORD_DELIM)
236495b7b453SJohn Marino 	    {
236595b7b453SJohn Marino 	      token->opr.ctx_type = WORD_FIRST;
236695b7b453SJohn Marino 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
236795b7b453SJohn Marino 	      token->opr.ctx_type = WORD_LAST;
236895b7b453SJohn Marino 	    }
236995b7b453SJohn Marino 	  else
237095b7b453SJohn Marino 	    {
237195b7b453SJohn Marino 	      token->opr.ctx_type = INSIDE_WORD;
237295b7b453SJohn Marino 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
237395b7b453SJohn Marino 	      token->opr.ctx_type = INSIDE_NOTWORD;
237495b7b453SJohn Marino 	    }
237595b7b453SJohn Marino 	  tree_last = create_token_tree (dfa, NULL, NULL, token);
237695b7b453SJohn Marino 	  tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2377*09d4459fSDaniel Fojt 	  if (__glibc_unlikely (tree_first == NULL || tree_last == NULL
2378*09d4459fSDaniel Fojt 				|| tree == NULL))
237995b7b453SJohn Marino 	    {
238095b7b453SJohn Marino 	      *err = REG_ESPACE;
238195b7b453SJohn Marino 	      return NULL;
238295b7b453SJohn Marino 	    }
238395b7b453SJohn Marino 	}
238495b7b453SJohn Marino       else
238595b7b453SJohn Marino 	{
238695b7b453SJohn Marino 	  tree = create_token_tree (dfa, NULL, NULL, token);
2387*09d4459fSDaniel Fojt 	  if (__glibc_unlikely (tree == NULL))
238895b7b453SJohn Marino 	    {
238995b7b453SJohn Marino 	      *err = REG_ESPACE;
239095b7b453SJohn Marino 	      return NULL;
239195b7b453SJohn Marino 	    }
239295b7b453SJohn Marino 	}
239395b7b453SJohn Marino       /* We must return here, since ANCHORs can't be followed
239495b7b453SJohn Marino 	 by repetition operators.
239595b7b453SJohn Marino 	 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
239695b7b453SJohn Marino 	     it must not be "<ANCHOR(^)><REPEAT(*)>".  */
239795b7b453SJohn Marino       fetch_token (token, regexp, syntax);
239895b7b453SJohn Marino       return tree;
2399*09d4459fSDaniel Fojt 
240095b7b453SJohn Marino     case OP_PERIOD:
240195b7b453SJohn Marino       tree = create_token_tree (dfa, NULL, NULL, token);
2402*09d4459fSDaniel Fojt       if (__glibc_unlikely (tree == NULL))
240395b7b453SJohn Marino 	{
240495b7b453SJohn Marino 	  *err = REG_ESPACE;
240595b7b453SJohn Marino 	  return NULL;
240695b7b453SJohn Marino 	}
240795b7b453SJohn Marino       if (dfa->mb_cur_max > 1)
240895b7b453SJohn Marino 	dfa->has_mb_node = 1;
240995b7b453SJohn Marino       break;
2410*09d4459fSDaniel Fojt 
241195b7b453SJohn Marino     case OP_WORD:
241295b7b453SJohn Marino     case OP_NOTWORD:
241395b7b453SJohn Marino       tree = build_charclass_op (dfa, regexp->trans,
2414680a9cb8SJohn Marino 				 "alnum",
2415680a9cb8SJohn Marino 				 "_",
241695b7b453SJohn Marino 				 token->type == OP_NOTWORD, err);
2417*09d4459fSDaniel Fojt       if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
241895b7b453SJohn Marino 	return NULL;
241995b7b453SJohn Marino       break;
2420*09d4459fSDaniel Fojt 
242195b7b453SJohn Marino     case OP_SPACE:
242295b7b453SJohn Marino     case OP_NOTSPACE:
242395b7b453SJohn Marino       tree = build_charclass_op (dfa, regexp->trans,
2424680a9cb8SJohn Marino 				 "space",
2425680a9cb8SJohn Marino 				 "",
242695b7b453SJohn Marino 				 token->type == OP_NOTSPACE, err);
2427*09d4459fSDaniel Fojt       if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
242895b7b453SJohn Marino 	return NULL;
242995b7b453SJohn Marino       break;
2430*09d4459fSDaniel Fojt 
243195b7b453SJohn Marino     case OP_ALT:
243295b7b453SJohn Marino     case END_OF_RE:
243395b7b453SJohn Marino       return NULL;
2434*09d4459fSDaniel Fojt 
243595b7b453SJohn Marino     case BACK_SLASH:
243695b7b453SJohn Marino       *err = REG_EESCAPE;
243795b7b453SJohn Marino       return NULL;
2438*09d4459fSDaniel Fojt 
243995b7b453SJohn Marino     default:
244095b7b453SJohn Marino       /* Must not happen?  */
2441*09d4459fSDaniel Fojt       DEBUG_ASSERT (false);
244295b7b453SJohn Marino       return NULL;
244395b7b453SJohn Marino     }
244495b7b453SJohn Marino   fetch_token (token, regexp, syntax);
244595b7b453SJohn Marino 
244695b7b453SJohn Marino   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
244795b7b453SJohn Marino 	 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
244895b7b453SJohn Marino     {
2449dc7c36e4SJohn Marino       bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
2450dc7c36e4SJohn Marino 					   syntax, err);
2451*09d4459fSDaniel Fojt       if (__glibc_unlikely (*err != REG_NOERROR && dup_tree == NULL))
2452dc7c36e4SJohn Marino 	{
2453dc7c36e4SJohn Marino 	  if (tree != NULL)
2454dc7c36e4SJohn Marino 	    postorder (tree, free_tree, NULL);
245595b7b453SJohn Marino 	  return NULL;
2456dc7c36e4SJohn Marino 	}
2457dc7c36e4SJohn Marino       tree = dup_tree;
245895b7b453SJohn Marino       /* In BRE consecutive duplications are not allowed.  */
245995b7b453SJohn Marino       if ((syntax & RE_CONTEXT_INVALID_DUP)
246095b7b453SJohn Marino 	  && (token->type == OP_DUP_ASTERISK
246195b7b453SJohn Marino 	      || token->type == OP_OPEN_DUP_NUM))
246295b7b453SJohn Marino 	{
2463dc7c36e4SJohn Marino 	  if (tree != NULL)
2464dc7c36e4SJohn Marino 	    postorder (tree, free_tree, NULL);
246595b7b453SJohn Marino 	  *err = REG_BADRPT;
246695b7b453SJohn Marino 	  return NULL;
246795b7b453SJohn Marino 	}
246895b7b453SJohn Marino     }
246995b7b453SJohn Marino 
247095b7b453SJohn Marino   return tree;
247195b7b453SJohn Marino }
247295b7b453SJohn Marino 
247395b7b453SJohn Marino /* This function build the following tree, from regular expression
247495b7b453SJohn Marino    (<reg_exp>):
247595b7b453SJohn Marino 	 SUBEXP
247695b7b453SJohn Marino 	    |
247795b7b453SJohn Marino 	<reg_exp>
247895b7b453SJohn Marino */
247995b7b453SJohn Marino 
248095b7b453SJohn Marino static bin_tree_t *
parse_sub_exp(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)248195b7b453SJohn Marino parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
248295b7b453SJohn Marino 	       reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
248395b7b453SJohn Marino {
2484cf28ed85SJohn Marino   re_dfa_t *dfa = preg->buffer;
248595b7b453SJohn Marino   bin_tree_t *tree;
248695b7b453SJohn Marino   size_t cur_nsub;
248795b7b453SJohn Marino   cur_nsub = preg->re_nsub++;
248895b7b453SJohn Marino 
248995b7b453SJohn Marino   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
249095b7b453SJohn Marino 
249195b7b453SJohn Marino   /* The subexpression may be a null string.  */
249295b7b453SJohn Marino   if (token->type == OP_CLOSE_SUBEXP)
249395b7b453SJohn Marino     tree = NULL;
249495b7b453SJohn Marino   else
249595b7b453SJohn Marino     {
249695b7b453SJohn Marino       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2497*09d4459fSDaniel Fojt       if (__glibc_unlikely (*err == REG_NOERROR
2498*09d4459fSDaniel Fojt 			    && token->type != OP_CLOSE_SUBEXP))
2499cf28ed85SJohn Marino 	{
2500cf28ed85SJohn Marino 	  if (tree != NULL)
2501cf28ed85SJohn Marino 	    postorder (tree, free_tree, NULL);
250295b7b453SJohn Marino 	  *err = REG_EPAREN;
2503cf28ed85SJohn Marino 	}
2504*09d4459fSDaniel Fojt       if (__glibc_unlikely (*err != REG_NOERROR))
250595b7b453SJohn Marino 	return NULL;
250695b7b453SJohn Marino     }
250795b7b453SJohn Marino 
250895b7b453SJohn Marino   if (cur_nsub <= '9' - '1')
250995b7b453SJohn Marino     dfa->completed_bkref_map |= 1 << cur_nsub;
251095b7b453SJohn Marino 
251195b7b453SJohn Marino   tree = create_tree (dfa, tree, NULL, SUBEXP);
2512*09d4459fSDaniel Fojt   if (__glibc_unlikely (tree == NULL))
251395b7b453SJohn Marino     {
251495b7b453SJohn Marino       *err = REG_ESPACE;
251595b7b453SJohn Marino       return NULL;
251695b7b453SJohn Marino     }
251795b7b453SJohn Marino   tree->token.opr.idx = cur_nsub;
251895b7b453SJohn Marino   return tree;
251995b7b453SJohn Marino }
252095b7b453SJohn Marino 
252195b7b453SJohn Marino /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
252295b7b453SJohn Marino 
252395b7b453SJohn Marino static bin_tree_t *
parse_dup_op(bin_tree_t * elem,re_string_t * regexp,re_dfa_t * dfa,re_token_t * token,reg_syntax_t syntax,reg_errcode_t * err)252495b7b453SJohn Marino parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
252595b7b453SJohn Marino 	      re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
252695b7b453SJohn Marino {
252795b7b453SJohn Marino   bin_tree_t *tree = NULL, *old_tree = NULL;
252895b7b453SJohn Marino   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
252995b7b453SJohn Marino   re_token_t start_token = *token;
253095b7b453SJohn Marino 
253195b7b453SJohn Marino   if (token->type == OP_OPEN_DUP_NUM)
253295b7b453SJohn Marino     {
253395b7b453SJohn Marino       end = 0;
253495b7b453SJohn Marino       start = fetch_number (regexp, token, syntax);
2535*09d4459fSDaniel Fojt       if (start == -1)
253695b7b453SJohn Marino 	{
253795b7b453SJohn Marino 	  if (token->type == CHARACTER && token->opr.c == ',')
253895b7b453SJohn Marino 	    start = 0; /* We treat "{,m}" as "{0,m}".  */
253995b7b453SJohn Marino 	  else
254095b7b453SJohn Marino 	    {
254195b7b453SJohn Marino 	      *err = REG_BADBR; /* <re>{} is invalid.  */
254295b7b453SJohn Marino 	      return NULL;
254395b7b453SJohn Marino 	    }
254495b7b453SJohn Marino 	}
2545*09d4459fSDaniel Fojt       if (__glibc_likely (start != -2))
254695b7b453SJohn Marino 	{
254795b7b453SJohn Marino 	  /* We treat "{n}" as "{n,n}".  */
254895b7b453SJohn Marino 	  end = ((token->type == OP_CLOSE_DUP_NUM) ? start
254995b7b453SJohn Marino 		 : ((token->type == CHARACTER && token->opr.c == ',')
2550*09d4459fSDaniel Fojt 		    ? fetch_number (regexp, token, syntax) : -2));
255195b7b453SJohn Marino 	}
2552*09d4459fSDaniel Fojt       if (__glibc_unlikely (start == -2 || end == -2))
255395b7b453SJohn Marino 	{
255495b7b453SJohn Marino 	  /* Invalid sequence.  */
2555*09d4459fSDaniel Fojt 	  if (__glibc_unlikely (!(syntax & RE_INVALID_INTERVAL_ORD)))
255695b7b453SJohn Marino 	    {
255795b7b453SJohn Marino 	      if (token->type == END_OF_RE)
255895b7b453SJohn Marino 		*err = REG_EBRACE;
255995b7b453SJohn Marino 	      else
256095b7b453SJohn Marino 		*err = REG_BADBR;
256195b7b453SJohn Marino 
256295b7b453SJohn Marino 	      return NULL;
256395b7b453SJohn Marino 	    }
256495b7b453SJohn Marino 
256595b7b453SJohn Marino 	  /* If the syntax bit is set, rollback.  */
256695b7b453SJohn Marino 	  re_string_set_index (regexp, start_idx);
256795b7b453SJohn Marino 	  *token = start_token;
256895b7b453SJohn Marino 	  token->type = CHARACTER;
256995b7b453SJohn Marino 	  /* mb_partial and word_char bits should be already initialized by
257095b7b453SJohn Marino 	     peek_token.  */
257195b7b453SJohn Marino 	  return elem;
257295b7b453SJohn Marino 	}
257395b7b453SJohn Marino 
2574*09d4459fSDaniel Fojt       if (__glibc_unlikely ((end != -1 && start > end)
2575*09d4459fSDaniel Fojt 			    || token->type != OP_CLOSE_DUP_NUM))
257695b7b453SJohn Marino 	{
257795b7b453SJohn Marino 	  /* First number greater than second.  */
257895b7b453SJohn Marino 	  *err = REG_BADBR;
257995b7b453SJohn Marino 	  return NULL;
258095b7b453SJohn Marino 	}
2581cf28ed85SJohn Marino 
2582*09d4459fSDaniel Fojt       if (__glibc_unlikely (RE_DUP_MAX < (end == -1 ? start : end)))
2583cf28ed85SJohn Marino 	{
2584cf28ed85SJohn Marino 	  *err = REG_ESIZE;
2585cf28ed85SJohn Marino 	  return NULL;
2586cf28ed85SJohn Marino 	}
258795b7b453SJohn Marino     }
258895b7b453SJohn Marino   else
258995b7b453SJohn Marino     {
259095b7b453SJohn Marino       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2591*09d4459fSDaniel Fojt       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
259295b7b453SJohn Marino     }
259395b7b453SJohn Marino 
259495b7b453SJohn Marino   fetch_token (token, regexp, syntax);
259595b7b453SJohn Marino 
2596*09d4459fSDaniel Fojt   if (__glibc_unlikely (elem == NULL))
259795b7b453SJohn Marino     return NULL;
2598*09d4459fSDaniel Fojt   if (__glibc_unlikely (start == 0 && end == 0))
259995b7b453SJohn Marino     {
260095b7b453SJohn Marino       postorder (elem, free_tree, NULL);
260195b7b453SJohn Marino       return NULL;
260295b7b453SJohn Marino     }
260395b7b453SJohn Marino 
260495b7b453SJohn Marino   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2605*09d4459fSDaniel Fojt   if (__glibc_unlikely (start > 0))
260695b7b453SJohn Marino     {
260795b7b453SJohn Marino       tree = elem;
260895b7b453SJohn Marino       for (i = 2; i <= start; ++i)
260995b7b453SJohn Marino 	{
261095b7b453SJohn Marino 	  elem = duplicate_tree (elem, dfa);
261195b7b453SJohn Marino 	  tree = create_tree (dfa, tree, elem, CONCAT);
2612*09d4459fSDaniel Fojt 	  if (__glibc_unlikely (elem == NULL || tree == NULL))
261395b7b453SJohn Marino 	    goto parse_dup_op_espace;
261495b7b453SJohn Marino 	}
261595b7b453SJohn Marino 
261695b7b453SJohn Marino       if (start == end)
261795b7b453SJohn Marino 	return tree;
261895b7b453SJohn Marino 
261995b7b453SJohn Marino       /* Duplicate ELEM before it is marked optional.  */
262095b7b453SJohn Marino       elem = duplicate_tree (elem, dfa);
2621*09d4459fSDaniel Fojt       if (__glibc_unlikely (elem == NULL))
2622dc7c36e4SJohn Marino         goto parse_dup_op_espace;
262395b7b453SJohn Marino       old_tree = tree;
262495b7b453SJohn Marino     }
262595b7b453SJohn Marino   else
262695b7b453SJohn Marino     old_tree = NULL;
262795b7b453SJohn Marino 
262895b7b453SJohn Marino   if (elem->token.type == SUBEXP)
2629a8597f6cSJohn Marino     {
2630a8597f6cSJohn Marino       uintptr_t subidx = elem->token.opr.idx;
2631a8597f6cSJohn Marino       postorder (elem, mark_opt_subexp, (void *) subidx);
2632a8597f6cSJohn Marino     }
263395b7b453SJohn Marino 
263495b7b453SJohn Marino   tree = create_tree (dfa, elem, NULL,
2635*09d4459fSDaniel Fojt 		      (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2636*09d4459fSDaniel Fojt   if (__glibc_unlikely (tree == NULL))
263795b7b453SJohn Marino     goto parse_dup_op_espace;
263895b7b453SJohn Marino 
2639*09d4459fSDaniel Fojt   /* This loop is actually executed only when end != -1,
264095b7b453SJohn Marino      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
264195b7b453SJohn Marino      already created the start+1-th copy.  */
2642*09d4459fSDaniel Fojt   if (TYPE_SIGNED (Idx) || end != -1)
264395b7b453SJohn Marino     for (i = start + 2; i <= end; ++i)
264495b7b453SJohn Marino       {
264595b7b453SJohn Marino 	elem = duplicate_tree (elem, dfa);
264695b7b453SJohn Marino 	tree = create_tree (dfa, tree, elem, CONCAT);
2647*09d4459fSDaniel Fojt 	if (__glibc_unlikely (elem == NULL || tree == NULL))
264895b7b453SJohn Marino 	  goto parse_dup_op_espace;
264995b7b453SJohn Marino 
265095b7b453SJohn Marino 	tree = create_tree (dfa, tree, NULL, OP_ALT);
2651*09d4459fSDaniel Fojt 	if (__glibc_unlikely (tree == NULL))
265295b7b453SJohn Marino 	  goto parse_dup_op_espace;
265395b7b453SJohn Marino       }
265495b7b453SJohn Marino 
265595b7b453SJohn Marino   if (old_tree)
265695b7b453SJohn Marino     tree = create_tree (dfa, old_tree, tree, CONCAT);
265795b7b453SJohn Marino 
265895b7b453SJohn Marino   return tree;
265995b7b453SJohn Marino 
266095b7b453SJohn Marino  parse_dup_op_espace:
266195b7b453SJohn Marino   *err = REG_ESPACE;
266295b7b453SJohn Marino   return NULL;
266395b7b453SJohn Marino }
266495b7b453SJohn Marino 
266595b7b453SJohn Marino /* Size of the names for collating symbol/equivalence_class/character_class.
266695b7b453SJohn Marino    I'm not sure, but maybe enough.  */
266795b7b453SJohn Marino #define BRACKET_NAME_BUF_SIZE 32
266895b7b453SJohn Marino 
266995b7b453SJohn Marino #ifndef _LIBC
2670*09d4459fSDaniel Fojt 
2671*09d4459fSDaniel Fojt # ifdef RE_ENABLE_I18N
2672*09d4459fSDaniel Fojt /* Convert the byte B to the corresponding wide character.  In a
2673*09d4459fSDaniel Fojt    unibyte locale, treat B as itself.  In a multibyte locale, return
2674*09d4459fSDaniel Fojt    WEOF if B is an encoding error.  */
2675*09d4459fSDaniel Fojt static wint_t
parse_byte(unsigned char b,re_charset_t * mbcset)2676*09d4459fSDaniel Fojt parse_byte (unsigned char b, re_charset_t *mbcset)
2677*09d4459fSDaniel Fojt {
2678*09d4459fSDaniel Fojt   return mbcset == NULL ? b : __btowc (b);
2679*09d4459fSDaniel Fojt }
2680*09d4459fSDaniel Fojt # endif
2681*09d4459fSDaniel Fojt 
268295b7b453SJohn Marino   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
268395b7b453SJohn Marino      Build the range expression which starts from START_ELEM, and ends
268495b7b453SJohn Marino      at END_ELEM.  The result are written to MBCSET and SBCSET.
268595b7b453SJohn Marino      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2686cf28ed85SJohn Marino      mbcset->range_ends, is a pointer argument since we may
268795b7b453SJohn Marino      update it.  */
268895b7b453SJohn Marino 
268995b7b453SJohn Marino static reg_errcode_t
269095b7b453SJohn Marino # ifdef RE_ENABLE_I18N
build_range_exp(const reg_syntax_t syntax,bitset_t sbcset,re_charset_t * mbcset,Idx * range_alloc,const bracket_elem_t * start_elem,const bracket_elem_t * end_elem)269195b7b453SJohn Marino build_range_exp (const reg_syntax_t syntax,
269295b7b453SJohn Marino                  bitset_t sbcset,
269395b7b453SJohn Marino                  re_charset_t *mbcset,
269495b7b453SJohn Marino                  Idx *range_alloc,
269595b7b453SJohn Marino                  const bracket_elem_t *start_elem,
269695b7b453SJohn Marino                  const bracket_elem_t *end_elem)
269795b7b453SJohn Marino # else /* not RE_ENABLE_I18N */
269895b7b453SJohn Marino build_range_exp (const reg_syntax_t syntax,
269995b7b453SJohn Marino                  bitset_t sbcset,
270095b7b453SJohn Marino                  const bracket_elem_t *start_elem,
270195b7b453SJohn Marino                  const bracket_elem_t *end_elem)
270295b7b453SJohn Marino # endif /* not RE_ENABLE_I18N */
270395b7b453SJohn Marino {
270495b7b453SJohn Marino   unsigned int start_ch, end_ch;
270595b7b453SJohn Marino   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2706*09d4459fSDaniel Fojt   if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2707*09d4459fSDaniel Fojt 			|| start_elem->type == CHAR_CLASS
2708*09d4459fSDaniel Fojt 			|| end_elem->type == EQUIV_CLASS
2709*09d4459fSDaniel Fojt 			|| end_elem->type == CHAR_CLASS))
271095b7b453SJohn Marino     return REG_ERANGE;
271195b7b453SJohn Marino 
271295b7b453SJohn Marino   /* We can handle no multi character collating elements without libc
271395b7b453SJohn Marino      support.  */
2714*09d4459fSDaniel Fojt   if (__glibc_unlikely ((start_elem->type == COLL_SYM
271595b7b453SJohn Marino 			 && strlen ((char *) start_elem->opr.name) > 1)
271695b7b453SJohn Marino 			|| (end_elem->type == COLL_SYM
2717*09d4459fSDaniel Fojt 			    && strlen ((char *) end_elem->opr.name) > 1)))
271895b7b453SJohn Marino     return REG_ECOLLATE;
271995b7b453SJohn Marino 
272095b7b453SJohn Marino # ifdef RE_ENABLE_I18N
272195b7b453SJohn Marino   {
272295b7b453SJohn Marino     wchar_t wc;
272395b7b453SJohn Marino     wint_t start_wc;
272495b7b453SJohn Marino     wint_t end_wc;
272595b7b453SJohn Marino 
272695b7b453SJohn Marino     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
272795b7b453SJohn Marino 		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
272895b7b453SJohn Marino 		   : 0));
272995b7b453SJohn Marino     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
273095b7b453SJohn Marino 	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
273195b7b453SJohn Marino 		 : 0));
273295b7b453SJohn Marino     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2733*09d4459fSDaniel Fojt 		? parse_byte (start_ch, mbcset) : start_elem->opr.wch);
273495b7b453SJohn Marino     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2735*09d4459fSDaniel Fojt 	      ? parse_byte (end_ch, mbcset) : end_elem->opr.wch);
273695b7b453SJohn Marino     if (start_wc == WEOF || end_wc == WEOF)
273795b7b453SJohn Marino       return REG_ECOLLATE;
2738*09d4459fSDaniel Fojt     else if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2739*09d4459fSDaniel Fojt 			       && start_wc > end_wc))
274095b7b453SJohn Marino       return REG_ERANGE;
274195b7b453SJohn Marino 
274295b7b453SJohn Marino     /* Got valid collation sequence values, add them as a new entry.
274395b7b453SJohn Marino        However, for !_LIBC we have no collation elements: if the
274495b7b453SJohn Marino        character set is single byte, the single byte character set
274595b7b453SJohn Marino        that we build below suffices.  parse_bracket_exp passes
274695b7b453SJohn Marino        no MBCSET if dfa->mb_cur_max == 1.  */
274795b7b453SJohn Marino     if (mbcset)
274895b7b453SJohn Marino       {
274995b7b453SJohn Marino 	/* Check the space of the arrays.  */
2750*09d4459fSDaniel Fojt 	if (__glibc_unlikely (*range_alloc == mbcset->nranges))
275195b7b453SJohn Marino 	  {
275295b7b453SJohn Marino 	    /* There is not enough space, need realloc.  */
275395b7b453SJohn Marino 	    wchar_t *new_array_start, *new_array_end;
275495b7b453SJohn Marino 	    Idx new_nranges;
275595b7b453SJohn Marino 
275695b7b453SJohn Marino 	    /* +1 in case of mbcset->nranges is 0.  */
275795b7b453SJohn Marino 	    new_nranges = 2 * mbcset->nranges + 1;
275895b7b453SJohn Marino 	    /* Use realloc since mbcset->range_starts and mbcset->range_ends
275995b7b453SJohn Marino 	       are NULL if *range_alloc == 0.  */
276095b7b453SJohn Marino 	    new_array_start = re_realloc (mbcset->range_starts, wchar_t,
276195b7b453SJohn Marino 					  new_nranges);
276295b7b453SJohn Marino 	    new_array_end = re_realloc (mbcset->range_ends, wchar_t,
276395b7b453SJohn Marino 					new_nranges);
276495b7b453SJohn Marino 
2765*09d4459fSDaniel Fojt 	    if (__glibc_unlikely (new_array_start == NULL
2766*09d4459fSDaniel Fojt 				  || new_array_end == NULL))
2767*09d4459fSDaniel Fojt 	      {
2768*09d4459fSDaniel Fojt 		re_free (new_array_start);
2769*09d4459fSDaniel Fojt 		re_free (new_array_end);
277095b7b453SJohn Marino 		return REG_ESPACE;
2771*09d4459fSDaniel Fojt 	      }
277295b7b453SJohn Marino 
277395b7b453SJohn Marino 	    mbcset->range_starts = new_array_start;
277495b7b453SJohn Marino 	    mbcset->range_ends = new_array_end;
277595b7b453SJohn Marino 	    *range_alloc = new_nranges;
277695b7b453SJohn Marino 	  }
277795b7b453SJohn Marino 
277895b7b453SJohn Marino 	mbcset->range_starts[mbcset->nranges] = start_wc;
277995b7b453SJohn Marino 	mbcset->range_ends[mbcset->nranges++] = end_wc;
278095b7b453SJohn Marino       }
278195b7b453SJohn Marino 
278295b7b453SJohn Marino     /* Build the table for single byte characters.  */
278395b7b453SJohn Marino     for (wc = 0; wc < SBC_MAX; ++wc)
278495b7b453SJohn Marino       {
2785680a9cb8SJohn Marino 	if (start_wc <= wc && wc <= end_wc)
278695b7b453SJohn Marino 	  bitset_set (sbcset, wc);
278795b7b453SJohn Marino       }
278895b7b453SJohn Marino   }
278995b7b453SJohn Marino # else /* not RE_ENABLE_I18N */
279095b7b453SJohn Marino   {
279195b7b453SJohn Marino     unsigned int ch;
279295b7b453SJohn Marino     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
279395b7b453SJohn Marino 		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
279495b7b453SJohn Marino 		   : 0));
279595b7b453SJohn Marino     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
279695b7b453SJohn Marino 	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
279795b7b453SJohn Marino 		 : 0));
279895b7b453SJohn Marino     if (start_ch > end_ch)
279995b7b453SJohn Marino       return REG_ERANGE;
280095b7b453SJohn Marino     /* Build the table for single byte characters.  */
280195b7b453SJohn Marino     for (ch = 0; ch < SBC_MAX; ++ch)
280295b7b453SJohn Marino       if (start_ch <= ch  && ch <= end_ch)
280395b7b453SJohn Marino 	bitset_set (sbcset, ch);
280495b7b453SJohn Marino   }
280595b7b453SJohn Marino # endif /* not RE_ENABLE_I18N */
280695b7b453SJohn Marino   return REG_NOERROR;
280795b7b453SJohn Marino }
280895b7b453SJohn Marino #endif /* not _LIBC */
280995b7b453SJohn Marino 
281095b7b453SJohn Marino #ifndef _LIBC
281195b7b453SJohn Marino /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
281295b7b453SJohn Marino    Build the collating element which is represented by NAME.
281395b7b453SJohn Marino    The result are written to MBCSET and SBCSET.
281495b7b453SJohn Marino    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
281595b7b453SJohn Marino    pointer argument since we may update it.  */
281695b7b453SJohn Marino 
281795b7b453SJohn Marino static reg_errcode_t
281895b7b453SJohn Marino # ifdef RE_ENABLE_I18N
build_collating_symbol(bitset_t sbcset,re_charset_t * mbcset,Idx * coll_sym_alloc,const unsigned char * name)2819680a9cb8SJohn Marino build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2820680a9cb8SJohn Marino 			Idx *coll_sym_alloc, const unsigned char *name)
2821cf28ed85SJohn Marino # else /* not RE_ENABLE_I18N */
2822cf28ed85SJohn Marino build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2823cf28ed85SJohn Marino # endif /* not RE_ENABLE_I18N */
282495b7b453SJohn Marino {
282595b7b453SJohn Marino   size_t name_len = strlen ((const char *) name);
2826*09d4459fSDaniel Fojt   if (__glibc_unlikely (name_len != 1))
282795b7b453SJohn Marino     return REG_ECOLLATE;
282895b7b453SJohn Marino   else
282995b7b453SJohn Marino     {
283095b7b453SJohn Marino       bitset_set (sbcset, name[0]);
283195b7b453SJohn Marino       return REG_NOERROR;
283295b7b453SJohn Marino     }
283395b7b453SJohn Marino }
283495b7b453SJohn Marino #endif /* not _LIBC */
283595b7b453SJohn Marino 
283695b7b453SJohn Marino /* This function parse bracket expression like "[abc]", "[a-c]",
283795b7b453SJohn Marino    "[[.a-a.]]" etc.  */
283895b7b453SJohn Marino 
283995b7b453SJohn Marino static bin_tree_t *
parse_bracket_exp(re_string_t * regexp,re_dfa_t * dfa,re_token_t * token,reg_syntax_t syntax,reg_errcode_t * err)284095b7b453SJohn Marino parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
284195b7b453SJohn Marino 		   reg_syntax_t syntax, reg_errcode_t *err)
284295b7b453SJohn Marino {
284395b7b453SJohn Marino #ifdef _LIBC
284495b7b453SJohn Marino   const unsigned char *collseqmb;
284595b7b453SJohn Marino   const char *collseqwc;
284695b7b453SJohn Marino   uint32_t nrules;
284795b7b453SJohn Marino   int32_t table_size;
284895b7b453SJohn Marino   const int32_t *symb_table;
284995b7b453SJohn Marino   const unsigned char *extra;
285095b7b453SJohn Marino 
2851cf28ed85SJohn Marino   /* Local function for parse_bracket_exp used in _LIBC environment.
2852cf28ed85SJohn Marino      Seek the collating symbol entry corresponding to NAME.
2853680a9cb8SJohn Marino      Return the index of the symbol in the SYMB_TABLE,
2854680a9cb8SJohn Marino      or -1 if not found.  */
285595b7b453SJohn Marino 
285695b7b453SJohn Marino   auto inline int32_t
2857680a9cb8SJohn Marino   __attribute__ ((always_inline))
2858680a9cb8SJohn Marino   seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
285995b7b453SJohn Marino     {
2860680a9cb8SJohn Marino       int32_t elem;
2861680a9cb8SJohn Marino 
2862680a9cb8SJohn Marino       for (elem = 0; elem < table_size; elem++)
286395b7b453SJohn Marino 	if (symb_table[2 * elem] != 0)
286495b7b453SJohn Marino 	  {
2865680a9cb8SJohn Marino 	    int32_t idx = symb_table[2 * elem + 1];
2866680a9cb8SJohn Marino 	    /* Skip the name of collating element name.  */
2867680a9cb8SJohn Marino 	    idx += 1 + extra[idx];
2868680a9cb8SJohn Marino 	    if (/* Compare the length of the name.  */
2869680a9cb8SJohn Marino 		name_len == extra[idx]
287095b7b453SJohn Marino 		/* Compare the name.  */
2871680a9cb8SJohn Marino 		&& memcmp (name, &extra[idx + 1], name_len) == 0)
287295b7b453SJohn Marino 	      /* Yep, this is the entry.  */
287395b7b453SJohn Marino 	      return elem;
287495b7b453SJohn Marino 	  }
2875680a9cb8SJohn Marino       return -1;
2876680a9cb8SJohn Marino     }
287795b7b453SJohn Marino 
287895b7b453SJohn Marino   /* Local function for parse_bracket_exp used in _LIBC environment.
287995b7b453SJohn Marino      Look up the collation sequence value of BR_ELEM.
288095b7b453SJohn Marino      Return the value if succeeded, UINT_MAX otherwise.  */
288195b7b453SJohn Marino 
288295b7b453SJohn Marino   auto inline unsigned int
2883680a9cb8SJohn Marino   __attribute__ ((always_inline))
2884680a9cb8SJohn Marino   lookup_collation_sequence_value (bracket_elem_t *br_elem)
288595b7b453SJohn Marino     {
288695b7b453SJohn Marino       if (br_elem->type == SB_CHAR)
288795b7b453SJohn Marino 	{
288895b7b453SJohn Marino 	  /*
288995b7b453SJohn Marino 	  if (MB_CUR_MAX == 1)
289095b7b453SJohn Marino 	  */
289195b7b453SJohn Marino 	  if (nrules == 0)
289295b7b453SJohn Marino 	    return collseqmb[br_elem->opr.ch];
289395b7b453SJohn Marino 	  else
289495b7b453SJohn Marino 	    {
289595b7b453SJohn Marino 	      wint_t wc = __btowc (br_elem->opr.ch);
289695b7b453SJohn Marino 	      return __collseq_table_lookup (collseqwc, wc);
289795b7b453SJohn Marino 	    }
289895b7b453SJohn Marino 	}
289995b7b453SJohn Marino       else if (br_elem->type == MB_CHAR)
290095b7b453SJohn Marino 	{
290195b7b453SJohn Marino 	  if (nrules != 0)
290295b7b453SJohn Marino 	    return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
290395b7b453SJohn Marino 	}
290495b7b453SJohn Marino       else if (br_elem->type == COLL_SYM)
290595b7b453SJohn Marino 	{
290695b7b453SJohn Marino 	  size_t sym_name_len = strlen ((char *) br_elem->opr.name);
290795b7b453SJohn Marino 	  if (nrules != 0)
290895b7b453SJohn Marino 	    {
290995b7b453SJohn Marino 	      int32_t elem, idx;
291095b7b453SJohn Marino 	      elem = seek_collating_symbol_entry (br_elem->opr.name,
291195b7b453SJohn Marino 						  sym_name_len);
2912680a9cb8SJohn Marino 	      if (elem != -1)
291395b7b453SJohn Marino 		{
291495b7b453SJohn Marino 		  /* We found the entry.  */
291595b7b453SJohn Marino 		  idx = symb_table[2 * elem + 1];
291695b7b453SJohn Marino 		  /* Skip the name of collating element name.  */
291795b7b453SJohn Marino 		  idx += 1 + extra[idx];
291895b7b453SJohn Marino 		  /* Skip the byte sequence of the collating element.  */
291995b7b453SJohn Marino 		  idx += 1 + extra[idx];
292095b7b453SJohn Marino 		  /* Adjust for the alignment.  */
292195b7b453SJohn Marino 		  idx = (idx + 3) & ~3;
292295b7b453SJohn Marino 		  /* Skip the multibyte collation sequence value.  */
292395b7b453SJohn Marino 		  idx += sizeof (unsigned int);
292495b7b453SJohn Marino 		  /* Skip the wide char sequence of the collating element.  */
292595b7b453SJohn Marino 		  idx += sizeof (unsigned int) *
292695b7b453SJohn Marino 		    (1 + *(unsigned int *) (extra + idx));
292795b7b453SJohn Marino 		  /* Return the collation sequence value.  */
292895b7b453SJohn Marino 		  return *(unsigned int *) (extra + idx);
292995b7b453SJohn Marino 		}
2930680a9cb8SJohn Marino 	      else if (sym_name_len == 1)
293195b7b453SJohn Marino 		{
293295b7b453SJohn Marino 		  /* No valid character.  Match it as a single byte
293395b7b453SJohn Marino 		     character.  */
293495b7b453SJohn Marino 		  return collseqmb[br_elem->opr.name[0]];
293595b7b453SJohn Marino 		}
293695b7b453SJohn Marino 	    }
293795b7b453SJohn Marino 	  else if (sym_name_len == 1)
293895b7b453SJohn Marino 	    return collseqmb[br_elem->opr.name[0]];
293995b7b453SJohn Marino 	}
294095b7b453SJohn Marino       return UINT_MAX;
294195b7b453SJohn Marino     }
294295b7b453SJohn Marino 
2943cf28ed85SJohn Marino   /* Local function for parse_bracket_exp used in _LIBC environment.
294495b7b453SJohn Marino      Build the range expression which starts from START_ELEM, and ends
294595b7b453SJohn Marino      at END_ELEM.  The result are written to MBCSET and SBCSET.
294695b7b453SJohn Marino      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2947cf28ed85SJohn Marino      mbcset->range_ends, is a pointer argument since we may
294895b7b453SJohn Marino      update it.  */
294995b7b453SJohn Marino 
295095b7b453SJohn Marino   auto inline reg_errcode_t
2951680a9cb8SJohn Marino   __attribute__ ((always_inline))
2952680a9cb8SJohn Marino   build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2953680a9cb8SJohn Marino 		   bracket_elem_t *start_elem, bracket_elem_t *end_elem)
295495b7b453SJohn Marino     {
295595b7b453SJohn Marino       unsigned int ch;
295695b7b453SJohn Marino       uint32_t start_collseq;
295795b7b453SJohn Marino       uint32_t end_collseq;
295895b7b453SJohn Marino 
295995b7b453SJohn Marino       /* Equivalence Classes and Character Classes can't be a range
296095b7b453SJohn Marino 	 start/end.  */
2961*09d4459fSDaniel Fojt       if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2962*09d4459fSDaniel Fojt 			    || start_elem->type == CHAR_CLASS
2963*09d4459fSDaniel Fojt 			    || end_elem->type == EQUIV_CLASS
2964*09d4459fSDaniel Fojt 			    || end_elem->type == CHAR_CLASS))
296595b7b453SJohn Marino 	return REG_ERANGE;
296695b7b453SJohn Marino 
2967680a9cb8SJohn Marino       /* FIXME: Implement rational ranges here, too.  */
296895b7b453SJohn Marino       start_collseq = lookup_collation_sequence_value (start_elem);
296995b7b453SJohn Marino       end_collseq = lookup_collation_sequence_value (end_elem);
297095b7b453SJohn Marino       /* Check start/end collation sequence values.  */
2971*09d4459fSDaniel Fojt       if (__glibc_unlikely (start_collseq == UINT_MAX
2972*09d4459fSDaniel Fojt 			    || end_collseq == UINT_MAX))
297395b7b453SJohn Marino 	return REG_ECOLLATE;
2974*09d4459fSDaniel Fojt       if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2975*09d4459fSDaniel Fojt 			    && start_collseq > end_collseq))
297695b7b453SJohn Marino 	return REG_ERANGE;
297795b7b453SJohn Marino 
297895b7b453SJohn Marino       /* Got valid collation sequence values, add them as a new entry.
297995b7b453SJohn Marino 	 However, if we have no collation elements, and the character set
298095b7b453SJohn Marino 	 is single byte, the single byte character set that we
298195b7b453SJohn Marino 	 build below suffices. */
298295b7b453SJohn Marino       if (nrules > 0 || dfa->mb_cur_max > 1)
298395b7b453SJohn Marino 	{
298495b7b453SJohn Marino 	  /* Check the space of the arrays.  */
2985*09d4459fSDaniel Fojt 	  if (__glibc_unlikely (*range_alloc == mbcset->nranges))
298695b7b453SJohn Marino 	    {
298795b7b453SJohn Marino 	      /* There is not enough space, need realloc.  */
298895b7b453SJohn Marino 	      uint32_t *new_array_start;
298995b7b453SJohn Marino 	      uint32_t *new_array_end;
299095b7b453SJohn Marino 	      Idx new_nranges;
299195b7b453SJohn Marino 
299295b7b453SJohn Marino 	      /* +1 in case of mbcset->nranges is 0.  */
299395b7b453SJohn Marino 	      new_nranges = 2 * mbcset->nranges + 1;
299495b7b453SJohn Marino 	      new_array_start = re_realloc (mbcset->range_starts, uint32_t,
299595b7b453SJohn Marino 					    new_nranges);
299695b7b453SJohn Marino 	      new_array_end = re_realloc (mbcset->range_ends, uint32_t,
299795b7b453SJohn Marino 					  new_nranges);
299895b7b453SJohn Marino 
2999*09d4459fSDaniel Fojt 	      if (__glibc_unlikely (new_array_start == NULL
3000*09d4459fSDaniel Fojt 				    || new_array_end == NULL))
300195b7b453SJohn Marino 		return REG_ESPACE;
300295b7b453SJohn Marino 
300395b7b453SJohn Marino 	      mbcset->range_starts = new_array_start;
300495b7b453SJohn Marino 	      mbcset->range_ends = new_array_end;
300595b7b453SJohn Marino 	      *range_alloc = new_nranges;
300695b7b453SJohn Marino 	    }
300795b7b453SJohn Marino 
300895b7b453SJohn Marino 	  mbcset->range_starts[mbcset->nranges] = start_collseq;
300995b7b453SJohn Marino 	  mbcset->range_ends[mbcset->nranges++] = end_collseq;
301095b7b453SJohn Marino 	}
301195b7b453SJohn Marino 
301295b7b453SJohn Marino       /* Build the table for single byte characters.  */
301395b7b453SJohn Marino       for (ch = 0; ch < SBC_MAX; ch++)
301495b7b453SJohn Marino 	{
301595b7b453SJohn Marino 	  uint32_t ch_collseq;
301695b7b453SJohn Marino 	  /*
301795b7b453SJohn Marino 	  if (MB_CUR_MAX == 1)
301895b7b453SJohn Marino 	  */
301995b7b453SJohn Marino 	  if (nrules == 0)
302095b7b453SJohn Marino 	    ch_collseq = collseqmb[ch];
302195b7b453SJohn Marino 	  else
302295b7b453SJohn Marino 	    ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
302395b7b453SJohn Marino 	  if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
302495b7b453SJohn Marino 	    bitset_set (sbcset, ch);
302595b7b453SJohn Marino 	}
302695b7b453SJohn Marino       return REG_NOERROR;
302795b7b453SJohn Marino     }
302895b7b453SJohn Marino 
3029cf28ed85SJohn Marino   /* Local function for parse_bracket_exp used in _LIBC environment.
303095b7b453SJohn Marino      Build the collating element which is represented by NAME.
303195b7b453SJohn Marino      The result are written to MBCSET and SBCSET.
303295b7b453SJohn Marino      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3033cf28ed85SJohn Marino      pointer argument since we may update it.  */
303495b7b453SJohn Marino 
303595b7b453SJohn Marino   auto inline reg_errcode_t
3036680a9cb8SJohn Marino   __attribute__ ((always_inline))
3037680a9cb8SJohn Marino   build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
3038680a9cb8SJohn Marino 			  Idx *coll_sym_alloc, const unsigned char *name)
303995b7b453SJohn Marino     {
304095b7b453SJohn Marino       int32_t elem, idx;
304195b7b453SJohn Marino       size_t name_len = strlen ((const char *) name);
304295b7b453SJohn Marino       if (nrules != 0)
304395b7b453SJohn Marino 	{
304495b7b453SJohn Marino 	  elem = seek_collating_symbol_entry (name, name_len);
3045680a9cb8SJohn Marino 	  if (elem != -1)
304695b7b453SJohn Marino 	    {
304795b7b453SJohn Marino 	      /* We found the entry.  */
304895b7b453SJohn Marino 	      idx = symb_table[2 * elem + 1];
304995b7b453SJohn Marino 	      /* Skip the name of collating element name.  */
305095b7b453SJohn Marino 	      idx += 1 + extra[idx];
305195b7b453SJohn Marino 	    }
3052680a9cb8SJohn Marino 	  else if (name_len == 1)
305395b7b453SJohn Marino 	    {
305495b7b453SJohn Marino 	      /* No valid character, treat it as a normal
305595b7b453SJohn Marino 		 character.  */
305695b7b453SJohn Marino 	      bitset_set (sbcset, name[0]);
305795b7b453SJohn Marino 	      return REG_NOERROR;
305895b7b453SJohn Marino 	    }
305995b7b453SJohn Marino 	  else
306095b7b453SJohn Marino 	    return REG_ECOLLATE;
306195b7b453SJohn Marino 
306295b7b453SJohn Marino 	  /* Got valid collation sequence, add it as a new entry.  */
306395b7b453SJohn Marino 	  /* Check the space of the arrays.  */
3064*09d4459fSDaniel Fojt 	  if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms))
306595b7b453SJohn Marino 	    {
306695b7b453SJohn Marino 	      /* Not enough, realloc it.  */
306795b7b453SJohn Marino 	      /* +1 in case of mbcset->ncoll_syms is 0.  */
306895b7b453SJohn Marino 	      Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
306995b7b453SJohn Marino 	      /* Use realloc since mbcset->coll_syms is NULL
307095b7b453SJohn Marino 		 if *alloc == 0.  */
307195b7b453SJohn Marino 	      int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
307295b7b453SJohn Marino 						   new_coll_sym_alloc);
3073*09d4459fSDaniel Fojt 	      if (__glibc_unlikely (new_coll_syms == NULL))
307495b7b453SJohn Marino 		return REG_ESPACE;
307595b7b453SJohn Marino 	      mbcset->coll_syms = new_coll_syms;
307695b7b453SJohn Marino 	      *coll_sym_alloc = new_coll_sym_alloc;
307795b7b453SJohn Marino 	    }
307895b7b453SJohn Marino 	  mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
307995b7b453SJohn Marino 	  return REG_NOERROR;
308095b7b453SJohn Marino 	}
308195b7b453SJohn Marino       else
308295b7b453SJohn Marino 	{
3083*09d4459fSDaniel Fojt 	  if (__glibc_unlikely (name_len != 1))
308495b7b453SJohn Marino 	    return REG_ECOLLATE;
308595b7b453SJohn Marino 	  else
308695b7b453SJohn Marino 	    {
308795b7b453SJohn Marino 	      bitset_set (sbcset, name[0]);
308895b7b453SJohn Marino 	      return REG_NOERROR;
308995b7b453SJohn Marino 	    }
309095b7b453SJohn Marino 	}
309195b7b453SJohn Marino     }
309295b7b453SJohn Marino #endif
309395b7b453SJohn Marino 
309495b7b453SJohn Marino   re_token_t br_token;
309595b7b453SJohn Marino   re_bitset_ptr_t sbcset;
309695b7b453SJohn Marino #ifdef RE_ENABLE_I18N
309795b7b453SJohn Marino   re_charset_t *mbcset;
309895b7b453SJohn Marino   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
309995b7b453SJohn Marino   Idx equiv_class_alloc = 0, char_class_alloc = 0;
310095b7b453SJohn Marino #endif /* not RE_ENABLE_I18N */
310195b7b453SJohn Marino   bool non_match = false;
310295b7b453SJohn Marino   bin_tree_t *work_tree;
310395b7b453SJohn Marino   int token_len;
310495b7b453SJohn Marino   bool first_round = true;
310595b7b453SJohn Marino #ifdef _LIBC
310695b7b453SJohn Marino   collseqmb = (const unsigned char *)
310795b7b453SJohn Marino     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
310895b7b453SJohn Marino   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
310995b7b453SJohn Marino   if (nrules)
311095b7b453SJohn Marino     {
311195b7b453SJohn Marino       /*
311295b7b453SJohn Marino       if (MB_CUR_MAX > 1)
311395b7b453SJohn Marino       */
311495b7b453SJohn Marino       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
311595b7b453SJohn Marino       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
311695b7b453SJohn Marino       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
311795b7b453SJohn Marino 						  _NL_COLLATE_SYMB_TABLEMB);
311895b7b453SJohn Marino       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
311995b7b453SJohn Marino 						   _NL_COLLATE_SYMB_EXTRAMB);
312095b7b453SJohn Marino     }
312195b7b453SJohn Marino #endif
312295b7b453SJohn Marino   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
312395b7b453SJohn Marino #ifdef RE_ENABLE_I18N
312495b7b453SJohn Marino   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
312595b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
312695b7b453SJohn Marino #ifdef RE_ENABLE_I18N
3127*09d4459fSDaniel Fojt   if (__glibc_unlikely (sbcset == NULL || mbcset == NULL))
312895b7b453SJohn Marino #else
3129*09d4459fSDaniel Fojt   if (__glibc_unlikely (sbcset == NULL))
313095b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
313195b7b453SJohn Marino     {
3132cf28ed85SJohn Marino       re_free (sbcset);
3133cf28ed85SJohn Marino #ifdef RE_ENABLE_I18N
3134cf28ed85SJohn Marino       re_free (mbcset);
3135cf28ed85SJohn Marino #endif
313695b7b453SJohn Marino       *err = REG_ESPACE;
313795b7b453SJohn Marino       return NULL;
313895b7b453SJohn Marino     }
313995b7b453SJohn Marino 
314095b7b453SJohn Marino   token_len = peek_token_bracket (token, regexp, syntax);
3141*09d4459fSDaniel Fojt   if (__glibc_unlikely (token->type == END_OF_RE))
314295b7b453SJohn Marino     {
314395b7b453SJohn Marino       *err = REG_BADPAT;
314495b7b453SJohn Marino       goto parse_bracket_exp_free_return;
314595b7b453SJohn Marino     }
314695b7b453SJohn Marino   if (token->type == OP_NON_MATCH_LIST)
314795b7b453SJohn Marino     {
314895b7b453SJohn Marino #ifdef RE_ENABLE_I18N
314995b7b453SJohn Marino       mbcset->non_match = 1;
315095b7b453SJohn Marino #endif /* not RE_ENABLE_I18N */
315195b7b453SJohn Marino       non_match = true;
315295b7b453SJohn Marino       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
315395b7b453SJohn Marino 	bitset_set (sbcset, '\n');
315495b7b453SJohn Marino       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
315595b7b453SJohn Marino       token_len = peek_token_bracket (token, regexp, syntax);
3156*09d4459fSDaniel Fojt       if (__glibc_unlikely (token->type == END_OF_RE))
315795b7b453SJohn Marino 	{
315895b7b453SJohn Marino 	  *err = REG_BADPAT;
315995b7b453SJohn Marino 	  goto parse_bracket_exp_free_return;
316095b7b453SJohn Marino 	}
316195b7b453SJohn Marino     }
316295b7b453SJohn Marino 
316395b7b453SJohn Marino   /* We treat the first ']' as a normal character.  */
316495b7b453SJohn Marino   if (token->type == OP_CLOSE_BRACKET)
316595b7b453SJohn Marino     token->type = CHARACTER;
316695b7b453SJohn Marino 
316795b7b453SJohn Marino   while (1)
316895b7b453SJohn Marino     {
316995b7b453SJohn Marino       bracket_elem_t start_elem, end_elem;
317095b7b453SJohn Marino       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
317195b7b453SJohn Marino       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
317295b7b453SJohn Marino       reg_errcode_t ret;
317395b7b453SJohn Marino       int token_len2 = 0;
317495b7b453SJohn Marino       bool is_range_exp = false;
317595b7b453SJohn Marino       re_token_t token2;
317695b7b453SJohn Marino 
317795b7b453SJohn Marino       start_elem.opr.name = start_name_buf;
3178dc7c36e4SJohn Marino       start_elem.type = COLL_SYM;
317995b7b453SJohn Marino       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
318095b7b453SJohn Marino 				   syntax, first_round);
3181*09d4459fSDaniel Fojt       if (__glibc_unlikely (ret != REG_NOERROR))
318295b7b453SJohn Marino 	{
318395b7b453SJohn Marino 	  *err = ret;
318495b7b453SJohn Marino 	  goto parse_bracket_exp_free_return;
318595b7b453SJohn Marino 	}
318695b7b453SJohn Marino       first_round = false;
318795b7b453SJohn Marino 
318895b7b453SJohn Marino       /* Get information about the next token.  We need it in any case.  */
318995b7b453SJohn Marino       token_len = peek_token_bracket (token, regexp, syntax);
319095b7b453SJohn Marino 
319195b7b453SJohn Marino       /* Do not check for ranges if we know they are not allowed.  */
319295b7b453SJohn Marino       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
319395b7b453SJohn Marino 	{
3194*09d4459fSDaniel Fojt 	  if (__glibc_unlikely (token->type == END_OF_RE))
319595b7b453SJohn Marino 	    {
319695b7b453SJohn Marino 	      *err = REG_EBRACK;
319795b7b453SJohn Marino 	      goto parse_bracket_exp_free_return;
319895b7b453SJohn Marino 	    }
319995b7b453SJohn Marino 	  if (token->type == OP_CHARSET_RANGE)
320095b7b453SJohn Marino 	    {
320195b7b453SJohn Marino 	      re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
320295b7b453SJohn Marino 	      token_len2 = peek_token_bracket (&token2, regexp, syntax);
3203*09d4459fSDaniel Fojt 	      if (__glibc_unlikely (token2.type == END_OF_RE))
320495b7b453SJohn Marino 		{
320595b7b453SJohn Marino 		  *err = REG_EBRACK;
320695b7b453SJohn Marino 		  goto parse_bracket_exp_free_return;
320795b7b453SJohn Marino 		}
320895b7b453SJohn Marino 	      if (token2.type == OP_CLOSE_BRACKET)
320995b7b453SJohn Marino 		{
321095b7b453SJohn Marino 		  /* We treat the last '-' as a normal character.  */
321195b7b453SJohn Marino 		  re_string_skip_bytes (regexp, -token_len);
321295b7b453SJohn Marino 		  token->type = CHARACTER;
321395b7b453SJohn Marino 		}
321495b7b453SJohn Marino 	      else
321595b7b453SJohn Marino 		is_range_exp = true;
321695b7b453SJohn Marino 	    }
321795b7b453SJohn Marino 	}
321895b7b453SJohn Marino 
321995b7b453SJohn Marino       if (is_range_exp == true)
322095b7b453SJohn Marino 	{
322195b7b453SJohn Marino 	  end_elem.opr.name = end_name_buf;
3222dc7c36e4SJohn Marino 	  end_elem.type = COLL_SYM;
322395b7b453SJohn Marino 	  ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
322495b7b453SJohn Marino 				       dfa, syntax, true);
3225*09d4459fSDaniel Fojt 	  if (__glibc_unlikely (ret != REG_NOERROR))
322695b7b453SJohn Marino 	    {
322795b7b453SJohn Marino 	      *err = ret;
322895b7b453SJohn Marino 	      goto parse_bracket_exp_free_return;
322995b7b453SJohn Marino 	    }
323095b7b453SJohn Marino 
323195b7b453SJohn Marino 	  token_len = peek_token_bracket (token, regexp, syntax);
323295b7b453SJohn Marino 
323395b7b453SJohn Marino #ifdef _LIBC
323495b7b453SJohn Marino 	  *err = build_range_exp (sbcset, mbcset, &range_alloc,
323595b7b453SJohn Marino 				  &start_elem, &end_elem);
323695b7b453SJohn Marino #else
323795b7b453SJohn Marino # ifdef RE_ENABLE_I18N
323895b7b453SJohn Marino 	  *err = build_range_exp (syntax, sbcset,
323995b7b453SJohn Marino 				  dfa->mb_cur_max > 1 ? mbcset : NULL,
324095b7b453SJohn Marino 				  &range_alloc, &start_elem, &end_elem);
324195b7b453SJohn Marino # else
324295b7b453SJohn Marino 	  *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
324395b7b453SJohn Marino # endif
324495b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
3245*09d4459fSDaniel Fojt 	  if (__glibc_unlikely (*err != REG_NOERROR))
324695b7b453SJohn Marino 	    goto parse_bracket_exp_free_return;
324795b7b453SJohn Marino 	}
324895b7b453SJohn Marino       else
324995b7b453SJohn Marino 	{
325095b7b453SJohn Marino 	  switch (start_elem.type)
325195b7b453SJohn Marino 	    {
325295b7b453SJohn Marino 	    case SB_CHAR:
325395b7b453SJohn Marino 	      bitset_set (sbcset, start_elem.opr.ch);
325495b7b453SJohn Marino 	      break;
325595b7b453SJohn Marino #ifdef RE_ENABLE_I18N
325695b7b453SJohn Marino 	    case MB_CHAR:
325795b7b453SJohn Marino 	      /* Check whether the array has enough space.  */
3258*09d4459fSDaniel Fojt 	      if (__glibc_unlikely (mbchar_alloc == mbcset->nmbchars))
325995b7b453SJohn Marino 		{
326095b7b453SJohn Marino 		  wchar_t *new_mbchars;
326195b7b453SJohn Marino 		  /* Not enough, realloc it.  */
326295b7b453SJohn Marino 		  /* +1 in case of mbcset->nmbchars is 0.  */
326395b7b453SJohn Marino 		  mbchar_alloc = 2 * mbcset->nmbchars + 1;
326495b7b453SJohn Marino 		  /* Use realloc since array is NULL if *alloc == 0.  */
326595b7b453SJohn Marino 		  new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
326695b7b453SJohn Marino 					    mbchar_alloc);
3267*09d4459fSDaniel Fojt 		  if (__glibc_unlikely (new_mbchars == NULL))
326895b7b453SJohn Marino 		    goto parse_bracket_exp_espace;
326995b7b453SJohn Marino 		  mbcset->mbchars = new_mbchars;
327095b7b453SJohn Marino 		}
327195b7b453SJohn Marino 	      mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
327295b7b453SJohn Marino 	      break;
327395b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
327495b7b453SJohn Marino 	    case EQUIV_CLASS:
327595b7b453SJohn Marino 	      *err = build_equiv_class (sbcset,
327695b7b453SJohn Marino #ifdef RE_ENABLE_I18N
327795b7b453SJohn Marino 					mbcset, &equiv_class_alloc,
327895b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
327995b7b453SJohn Marino 					start_elem.opr.name);
3280*09d4459fSDaniel Fojt 	      if (__glibc_unlikely (*err != REG_NOERROR))
328195b7b453SJohn Marino 		goto parse_bracket_exp_free_return;
328295b7b453SJohn Marino 	      break;
328395b7b453SJohn Marino 	    case COLL_SYM:
328495b7b453SJohn Marino 	      *err = build_collating_symbol (sbcset,
328595b7b453SJohn Marino #ifdef RE_ENABLE_I18N
328695b7b453SJohn Marino 					     mbcset, &coll_sym_alloc,
328795b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
328895b7b453SJohn Marino 					     start_elem.opr.name);
3289*09d4459fSDaniel Fojt 	      if (__glibc_unlikely (*err != REG_NOERROR))
329095b7b453SJohn Marino 		goto parse_bracket_exp_free_return;
329195b7b453SJohn Marino 	      break;
329295b7b453SJohn Marino 	    case CHAR_CLASS:
329395b7b453SJohn Marino 	      *err = build_charclass (regexp->trans, sbcset,
329495b7b453SJohn Marino #ifdef RE_ENABLE_I18N
329595b7b453SJohn Marino 				      mbcset, &char_class_alloc,
329695b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
3297680a9cb8SJohn Marino 				      (const char *) start_elem.opr.name,
3298680a9cb8SJohn Marino 				      syntax);
3299*09d4459fSDaniel Fojt 	      if (__glibc_unlikely (*err != REG_NOERROR))
330095b7b453SJohn Marino 	       goto parse_bracket_exp_free_return;
330195b7b453SJohn Marino 	      break;
330295b7b453SJohn Marino 	    default:
3303*09d4459fSDaniel Fojt 	      DEBUG_ASSERT (false);
330495b7b453SJohn Marino 	      break;
330595b7b453SJohn Marino 	    }
330695b7b453SJohn Marino 	}
3307*09d4459fSDaniel Fojt       if (__glibc_unlikely (token->type == END_OF_RE))
330895b7b453SJohn Marino 	{
330995b7b453SJohn Marino 	  *err = REG_EBRACK;
331095b7b453SJohn Marino 	  goto parse_bracket_exp_free_return;
331195b7b453SJohn Marino 	}
331295b7b453SJohn Marino       if (token->type == OP_CLOSE_BRACKET)
331395b7b453SJohn Marino 	break;
331495b7b453SJohn Marino     }
331595b7b453SJohn Marino 
331695b7b453SJohn Marino   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
331795b7b453SJohn Marino 
331895b7b453SJohn Marino   /* If it is non-matching list.  */
331995b7b453SJohn Marino   if (non_match)
332095b7b453SJohn Marino     bitset_not (sbcset);
332195b7b453SJohn Marino 
332295b7b453SJohn Marino #ifdef RE_ENABLE_I18N
332395b7b453SJohn Marino   /* Ensure only single byte characters are set.  */
332495b7b453SJohn Marino   if (dfa->mb_cur_max > 1)
332595b7b453SJohn Marino     bitset_mask (sbcset, dfa->sb_char);
332695b7b453SJohn Marino 
332795b7b453SJohn Marino   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
332895b7b453SJohn Marino       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
332995b7b453SJohn Marino 						     || mbcset->non_match)))
333095b7b453SJohn Marino     {
333195b7b453SJohn Marino       bin_tree_t *mbc_tree;
333295b7b453SJohn Marino       int sbc_idx;
333395b7b453SJohn Marino       /* Build a tree for complex bracket.  */
333495b7b453SJohn Marino       dfa->has_mb_node = 1;
333595b7b453SJohn Marino       br_token.type = COMPLEX_BRACKET;
333695b7b453SJohn Marino       br_token.opr.mbcset = mbcset;
333795b7b453SJohn Marino       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3338*09d4459fSDaniel Fojt       if (__glibc_unlikely (mbc_tree == NULL))
333995b7b453SJohn Marino 	goto parse_bracket_exp_espace;
334095b7b453SJohn Marino       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
334195b7b453SJohn Marino 	if (sbcset[sbc_idx])
334295b7b453SJohn Marino 	  break;
334395b7b453SJohn Marino       /* If there are no bits set in sbcset, there is no point
334495b7b453SJohn Marino 	 of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
334595b7b453SJohn Marino       if (sbc_idx < BITSET_WORDS)
334695b7b453SJohn Marino 	{
334795b7b453SJohn Marino 	  /* Build a tree for simple bracket.  */
334895b7b453SJohn Marino 	  br_token.type = SIMPLE_BRACKET;
334995b7b453SJohn Marino 	  br_token.opr.sbcset = sbcset;
335095b7b453SJohn Marino 	  work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3351*09d4459fSDaniel Fojt 	  if (__glibc_unlikely (work_tree == NULL))
335295b7b453SJohn Marino 	    goto parse_bracket_exp_espace;
335395b7b453SJohn Marino 
335495b7b453SJohn Marino 	  /* Then join them by ALT node.  */
335595b7b453SJohn Marino 	  work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3356*09d4459fSDaniel Fojt 	  if (__glibc_unlikely (work_tree == NULL))
335795b7b453SJohn Marino 	    goto parse_bracket_exp_espace;
335895b7b453SJohn Marino 	}
335995b7b453SJohn Marino       else
336095b7b453SJohn Marino 	{
336195b7b453SJohn Marino 	  re_free (sbcset);
336295b7b453SJohn Marino 	  work_tree = mbc_tree;
336395b7b453SJohn Marino 	}
336495b7b453SJohn Marino     }
336595b7b453SJohn Marino   else
336695b7b453SJohn Marino #endif /* not RE_ENABLE_I18N */
336795b7b453SJohn Marino     {
336895b7b453SJohn Marino #ifdef RE_ENABLE_I18N
336995b7b453SJohn Marino       free_charset (mbcset);
337095b7b453SJohn Marino #endif
337195b7b453SJohn Marino       /* Build a tree for simple bracket.  */
337295b7b453SJohn Marino       br_token.type = SIMPLE_BRACKET;
337395b7b453SJohn Marino       br_token.opr.sbcset = sbcset;
337495b7b453SJohn Marino       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3375*09d4459fSDaniel Fojt       if (__glibc_unlikely (work_tree == NULL))
337695b7b453SJohn Marino 	goto parse_bracket_exp_espace;
337795b7b453SJohn Marino     }
337895b7b453SJohn Marino   return work_tree;
337995b7b453SJohn Marino 
338095b7b453SJohn Marino  parse_bracket_exp_espace:
338195b7b453SJohn Marino   *err = REG_ESPACE;
338295b7b453SJohn Marino  parse_bracket_exp_free_return:
338395b7b453SJohn Marino   re_free (sbcset);
338495b7b453SJohn Marino #ifdef RE_ENABLE_I18N
338595b7b453SJohn Marino   free_charset (mbcset);
338695b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
338795b7b453SJohn Marino   return NULL;
338895b7b453SJohn Marino }
338995b7b453SJohn Marino 
339095b7b453SJohn Marino /* Parse an element in the bracket expression.  */
339195b7b453SJohn Marino 
339295b7b453SJohn Marino static reg_errcode_t
parse_bracket_element(bracket_elem_t * elem,re_string_t * regexp,re_token_t * token,int token_len,re_dfa_t * dfa,reg_syntax_t syntax,bool accept_hyphen)339395b7b453SJohn Marino parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3394680a9cb8SJohn Marino 		       re_token_t *token, int token_len, re_dfa_t *dfa,
339595b7b453SJohn Marino 		       reg_syntax_t syntax, bool accept_hyphen)
339695b7b453SJohn Marino {
339795b7b453SJohn Marino #ifdef RE_ENABLE_I18N
339895b7b453SJohn Marino   int cur_char_size;
339995b7b453SJohn Marino   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
340095b7b453SJohn Marino   if (cur_char_size > 1)
340195b7b453SJohn Marino     {
340295b7b453SJohn Marino       elem->type = MB_CHAR;
340395b7b453SJohn Marino       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
340495b7b453SJohn Marino       re_string_skip_bytes (regexp, cur_char_size);
340595b7b453SJohn Marino       return REG_NOERROR;
340695b7b453SJohn Marino     }
340795b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
340895b7b453SJohn Marino   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
340995b7b453SJohn Marino   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
341095b7b453SJohn Marino       || token->type == OP_OPEN_EQUIV_CLASS)
341195b7b453SJohn Marino     return parse_bracket_symbol (elem, regexp, token);
3412*09d4459fSDaniel Fojt   if (__glibc_unlikely (token->type == OP_CHARSET_RANGE) && !accept_hyphen)
341395b7b453SJohn Marino     {
341495b7b453SJohn Marino       /* A '-' must only appear as anything but a range indicator before
341595b7b453SJohn Marino 	 the closing bracket.  Everything else is an error.  */
341695b7b453SJohn Marino       re_token_t token2;
341795b7b453SJohn Marino       (void) peek_token_bracket (&token2, regexp, syntax);
341895b7b453SJohn Marino       if (token2.type != OP_CLOSE_BRACKET)
341995b7b453SJohn Marino 	/* The actual error value is not standardized since this whole
342095b7b453SJohn Marino 	   case is undefined.  But ERANGE makes good sense.  */
342195b7b453SJohn Marino 	return REG_ERANGE;
342295b7b453SJohn Marino     }
342395b7b453SJohn Marino   elem->type = SB_CHAR;
342495b7b453SJohn Marino   elem->opr.ch = token->opr.c;
342595b7b453SJohn Marino   return REG_NOERROR;
342695b7b453SJohn Marino }
342795b7b453SJohn Marino 
342895b7b453SJohn Marino /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
342995b7b453SJohn Marino    such as [:<character_class>:], [.<collating_element>.], and
343095b7b453SJohn Marino    [=<equivalent_class>=].  */
343195b7b453SJohn Marino 
343295b7b453SJohn Marino static reg_errcode_t
parse_bracket_symbol(bracket_elem_t * elem,re_string_t * regexp,re_token_t * token)343395b7b453SJohn Marino parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
343495b7b453SJohn Marino 		      re_token_t *token)
343595b7b453SJohn Marino {
343695b7b453SJohn Marino   unsigned char ch, delim = token->opr.c;
343795b7b453SJohn Marino   int i = 0;
343895b7b453SJohn Marino   if (re_string_eoi(regexp))
343995b7b453SJohn Marino     return REG_EBRACK;
344095b7b453SJohn Marino   for (;; ++i)
344195b7b453SJohn Marino     {
344295b7b453SJohn Marino       if (i >= BRACKET_NAME_BUF_SIZE)
344395b7b453SJohn Marino 	return REG_EBRACK;
344495b7b453SJohn Marino       if (token->type == OP_OPEN_CHAR_CLASS)
344595b7b453SJohn Marino 	ch = re_string_fetch_byte_case (regexp);
344695b7b453SJohn Marino       else
344795b7b453SJohn Marino 	ch = re_string_fetch_byte (regexp);
344895b7b453SJohn Marino       if (re_string_eoi(regexp))
344995b7b453SJohn Marino 	return REG_EBRACK;
345095b7b453SJohn Marino       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
345195b7b453SJohn Marino 	break;
345295b7b453SJohn Marino       elem->opr.name[i] = ch;
345395b7b453SJohn Marino     }
345495b7b453SJohn Marino   re_string_skip_bytes (regexp, 1);
345595b7b453SJohn Marino   elem->opr.name[i] = '\0';
345695b7b453SJohn Marino   switch (token->type)
345795b7b453SJohn Marino     {
345895b7b453SJohn Marino     case OP_OPEN_COLL_ELEM:
345995b7b453SJohn Marino       elem->type = COLL_SYM;
346095b7b453SJohn Marino       break;
346195b7b453SJohn Marino     case OP_OPEN_EQUIV_CLASS:
346295b7b453SJohn Marino       elem->type = EQUIV_CLASS;
346395b7b453SJohn Marino       break;
346495b7b453SJohn Marino     case OP_OPEN_CHAR_CLASS:
346595b7b453SJohn Marino       elem->type = CHAR_CLASS;
346695b7b453SJohn Marino       break;
346795b7b453SJohn Marino     default:
346895b7b453SJohn Marino       break;
346995b7b453SJohn Marino     }
347095b7b453SJohn Marino   return REG_NOERROR;
347195b7b453SJohn Marino }
347295b7b453SJohn Marino 
347395b7b453SJohn Marino   /* Helper function for parse_bracket_exp.
347495b7b453SJohn Marino      Build the equivalence class which is represented by NAME.
347595b7b453SJohn Marino      The result are written to MBCSET and SBCSET.
347695b7b453SJohn Marino      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3477cf28ed85SJohn Marino      is a pointer argument since we may update it.  */
347895b7b453SJohn Marino 
347995b7b453SJohn Marino static reg_errcode_t
348095b7b453SJohn Marino #ifdef RE_ENABLE_I18N
build_equiv_class(bitset_t sbcset,re_charset_t * mbcset,Idx * equiv_class_alloc,const unsigned char * name)3481680a9cb8SJohn Marino build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3482680a9cb8SJohn Marino 		   Idx *equiv_class_alloc, const unsigned char *name)
348395b7b453SJohn Marino #else /* not RE_ENABLE_I18N */
348495b7b453SJohn Marino build_equiv_class (bitset_t sbcset, const unsigned char *name)
348595b7b453SJohn Marino #endif /* not RE_ENABLE_I18N */
348695b7b453SJohn Marino {
348795b7b453SJohn Marino #ifdef _LIBC
348895b7b453SJohn Marino   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
348995b7b453SJohn Marino   if (nrules != 0)
349095b7b453SJohn Marino     {
349195b7b453SJohn Marino       const int32_t *table, *indirect;
349295b7b453SJohn Marino       const unsigned char *weights, *extra, *cp;
349395b7b453SJohn Marino       unsigned char char_buf[2];
349495b7b453SJohn Marino       int32_t idx1, idx2;
349595b7b453SJohn Marino       unsigned int ch;
349695b7b453SJohn Marino       size_t len;
349795b7b453SJohn Marino       /* Calculate the index for equivalence class.  */
349895b7b453SJohn Marino       cp = name;
349995b7b453SJohn Marino       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
350095b7b453SJohn Marino       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
350195b7b453SJohn Marino 					       _NL_COLLATE_WEIGHTMB);
350295b7b453SJohn Marino       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
350395b7b453SJohn Marino 						   _NL_COLLATE_EXTRAMB);
350495b7b453SJohn Marino       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
350595b7b453SJohn Marino 						_NL_COLLATE_INDIRECTMB);
3506dc7c36e4SJohn Marino       idx1 = findidx (table, indirect, extra, &cp, -1);
3507*09d4459fSDaniel Fojt       if (__glibc_unlikely (idx1 == 0 || *cp != '\0'))
350895b7b453SJohn Marino 	/* This isn't a valid character.  */
350995b7b453SJohn Marino 	return REG_ECOLLATE;
351095b7b453SJohn Marino 
3511cf28ed85SJohn Marino       /* Build single byte matching table for this equivalence class.  */
351295b7b453SJohn Marino       len = weights[idx1 & 0xffffff];
351395b7b453SJohn Marino       for (ch = 0; ch < SBC_MAX; ++ch)
351495b7b453SJohn Marino 	{
351595b7b453SJohn Marino 	  char_buf[0] = ch;
351695b7b453SJohn Marino 	  cp = char_buf;
3517dc7c36e4SJohn Marino 	  idx2 = findidx (table, indirect, extra, &cp, 1);
351895b7b453SJohn Marino /*
351995b7b453SJohn Marino 	  idx2 = table[ch];
352095b7b453SJohn Marino */
352195b7b453SJohn Marino 	  if (idx2 == 0)
352295b7b453SJohn Marino 	    /* This isn't a valid character.  */
352395b7b453SJohn Marino 	    continue;
352495b7b453SJohn Marino 	  /* Compare only if the length matches and the collation rule
352595b7b453SJohn Marino 	     index is the same.  */
3526*09d4459fSDaniel Fojt 	  if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)
3527*09d4459fSDaniel Fojt 	      && memcmp (weights + (idx1 & 0xffffff) + 1,
3528*09d4459fSDaniel Fojt 			 weights + (idx2 & 0xffffff) + 1, len) == 0)
352995b7b453SJohn Marino 	    bitset_set (sbcset, ch);
353095b7b453SJohn Marino 	}
353195b7b453SJohn Marino       /* Check whether the array has enough space.  */
3532*09d4459fSDaniel Fojt       if (__glibc_unlikely (*equiv_class_alloc == mbcset->nequiv_classes))
353395b7b453SJohn Marino 	{
353495b7b453SJohn Marino 	  /* Not enough, realloc it.  */
353595b7b453SJohn Marino 	  /* +1 in case of mbcset->nequiv_classes is 0.  */
353695b7b453SJohn Marino 	  Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
353795b7b453SJohn Marino 	  /* Use realloc since the array is NULL if *alloc == 0.  */
353895b7b453SJohn Marino 	  int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
353995b7b453SJohn Marino 						   int32_t,
354095b7b453SJohn Marino 						   new_equiv_class_alloc);
3541*09d4459fSDaniel Fojt 	  if (__glibc_unlikely (new_equiv_classes == NULL))
354295b7b453SJohn Marino 	    return REG_ESPACE;
354395b7b453SJohn Marino 	  mbcset->equiv_classes = new_equiv_classes;
354495b7b453SJohn Marino 	  *equiv_class_alloc = new_equiv_class_alloc;
354595b7b453SJohn Marino 	}
354695b7b453SJohn Marino       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
354795b7b453SJohn Marino     }
354895b7b453SJohn Marino   else
354995b7b453SJohn Marino #endif /* _LIBC */
355095b7b453SJohn Marino     {
3551*09d4459fSDaniel Fojt       if (__glibc_unlikely (strlen ((const char *) name) != 1))
355295b7b453SJohn Marino 	return REG_ECOLLATE;
355395b7b453SJohn Marino       bitset_set (sbcset, *name);
355495b7b453SJohn Marino     }
355595b7b453SJohn Marino   return REG_NOERROR;
355695b7b453SJohn Marino }
355795b7b453SJohn Marino 
355895b7b453SJohn Marino   /* Helper function for parse_bracket_exp.
355995b7b453SJohn Marino      Build the character class which is represented by NAME.
356095b7b453SJohn Marino      The result are written to MBCSET and SBCSET.
356195b7b453SJohn Marino      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3562cf28ed85SJohn Marino      is a pointer argument since we may update it.  */
356395b7b453SJohn Marino 
356495b7b453SJohn Marino static reg_errcode_t
356595b7b453SJohn Marino #ifdef RE_ENABLE_I18N
build_charclass(RE_TRANSLATE_TYPE trans,bitset_t sbcset,re_charset_t * mbcset,Idx * char_class_alloc,const char * class_name,reg_syntax_t syntax)356695b7b453SJohn Marino build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
356795b7b453SJohn Marino 		 re_charset_t *mbcset, Idx *char_class_alloc,
3568680a9cb8SJohn Marino 		 const char *class_name, reg_syntax_t syntax)
356995b7b453SJohn Marino #else /* not RE_ENABLE_I18N */
357095b7b453SJohn Marino build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3571680a9cb8SJohn Marino 		 const char *class_name, reg_syntax_t syntax)
357295b7b453SJohn Marino #endif /* not RE_ENABLE_I18N */
357395b7b453SJohn Marino {
357495b7b453SJohn Marino   int i;
3575680a9cb8SJohn Marino   const char *name = class_name;
357695b7b453SJohn Marino 
357795b7b453SJohn Marino   /* In case of REG_ICASE "upper" and "lower" match the both of
357895b7b453SJohn Marino      upper and lower cases.  */
357995b7b453SJohn Marino   if ((syntax & RE_ICASE)
358095b7b453SJohn Marino       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
358195b7b453SJohn Marino     name = "alpha";
358295b7b453SJohn Marino 
358395b7b453SJohn Marino #ifdef RE_ENABLE_I18N
358495b7b453SJohn Marino   /* Check the space of the arrays.  */
3585*09d4459fSDaniel Fojt   if (__glibc_unlikely (*char_class_alloc == mbcset->nchar_classes))
358695b7b453SJohn Marino     {
358795b7b453SJohn Marino       /* Not enough, realloc it.  */
358895b7b453SJohn Marino       /* +1 in case of mbcset->nchar_classes is 0.  */
358995b7b453SJohn Marino       Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
359095b7b453SJohn Marino       /* Use realloc since array is NULL if *alloc == 0.  */
359195b7b453SJohn Marino       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
359295b7b453SJohn Marino 					       new_char_class_alloc);
3593*09d4459fSDaniel Fojt       if (__glibc_unlikely (new_char_classes == NULL))
359495b7b453SJohn Marino 	return REG_ESPACE;
359595b7b453SJohn Marino       mbcset->char_classes = new_char_classes;
359695b7b453SJohn Marino       *char_class_alloc = new_char_class_alloc;
359795b7b453SJohn Marino     }
359895b7b453SJohn Marino   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
359995b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
360095b7b453SJohn Marino 
360195b7b453SJohn Marino #define BUILD_CHARCLASS_LOOP(ctype_func)	\
360295b7b453SJohn Marino   do {						\
3603*09d4459fSDaniel Fojt     if (__glibc_unlikely (trans != NULL))			\
360495b7b453SJohn Marino       {						\
360595b7b453SJohn Marino 	for (i = 0; i < SBC_MAX; ++i)		\
360695b7b453SJohn Marino 	  if (ctype_func (i))			\
360795b7b453SJohn Marino 	    bitset_set (sbcset, trans[i]);	\
360895b7b453SJohn Marino       }						\
360995b7b453SJohn Marino     else					\
361095b7b453SJohn Marino       {						\
361195b7b453SJohn Marino 	for (i = 0; i < SBC_MAX; ++i)		\
361295b7b453SJohn Marino 	  if (ctype_func (i))			\
361395b7b453SJohn Marino 	    bitset_set (sbcset, i);		\
361495b7b453SJohn Marino       }						\
361595b7b453SJohn Marino   } while (0)
361695b7b453SJohn Marino 
361795b7b453SJohn Marino   if (strcmp (name, "alnum") == 0)
361895b7b453SJohn Marino     BUILD_CHARCLASS_LOOP (isalnum);
361995b7b453SJohn Marino   else if (strcmp (name, "cntrl") == 0)
362095b7b453SJohn Marino     BUILD_CHARCLASS_LOOP (iscntrl);
362195b7b453SJohn Marino   else if (strcmp (name, "lower") == 0)
362295b7b453SJohn Marino     BUILD_CHARCLASS_LOOP (islower);
362395b7b453SJohn Marino   else if (strcmp (name, "space") == 0)
362495b7b453SJohn Marino     BUILD_CHARCLASS_LOOP (isspace);
362595b7b453SJohn Marino   else if (strcmp (name, "alpha") == 0)
362695b7b453SJohn Marino     BUILD_CHARCLASS_LOOP (isalpha);
362795b7b453SJohn Marino   else if (strcmp (name, "digit") == 0)
362895b7b453SJohn Marino     BUILD_CHARCLASS_LOOP (isdigit);
362995b7b453SJohn Marino   else if (strcmp (name, "print") == 0)
363095b7b453SJohn Marino     BUILD_CHARCLASS_LOOP (isprint);
363195b7b453SJohn Marino   else if (strcmp (name, "upper") == 0)
363295b7b453SJohn Marino     BUILD_CHARCLASS_LOOP (isupper);
363395b7b453SJohn Marino   else if (strcmp (name, "blank") == 0)
363495b7b453SJohn Marino     BUILD_CHARCLASS_LOOP (isblank);
363595b7b453SJohn Marino   else if (strcmp (name, "graph") == 0)
363695b7b453SJohn Marino     BUILD_CHARCLASS_LOOP (isgraph);
363795b7b453SJohn Marino   else if (strcmp (name, "punct") == 0)
363895b7b453SJohn Marino     BUILD_CHARCLASS_LOOP (ispunct);
363995b7b453SJohn Marino   else if (strcmp (name, "xdigit") == 0)
364095b7b453SJohn Marino     BUILD_CHARCLASS_LOOP (isxdigit);
364195b7b453SJohn Marino   else
364295b7b453SJohn Marino     return REG_ECTYPE;
364395b7b453SJohn Marino 
364495b7b453SJohn Marino   return REG_NOERROR;
364595b7b453SJohn Marino }
364695b7b453SJohn Marino 
364795b7b453SJohn Marino static bin_tree_t *
build_charclass_op(re_dfa_t * dfa,RE_TRANSLATE_TYPE trans,const char * class_name,const char * extra,bool non_match,reg_errcode_t * err)364895b7b453SJohn Marino build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3649680a9cb8SJohn Marino 		    const char *class_name,
3650680a9cb8SJohn Marino 		    const char *extra, bool non_match,
365195b7b453SJohn Marino 		    reg_errcode_t *err)
365295b7b453SJohn Marino {
365395b7b453SJohn Marino   re_bitset_ptr_t sbcset;
365495b7b453SJohn Marino #ifdef RE_ENABLE_I18N
365595b7b453SJohn Marino   re_charset_t *mbcset;
365695b7b453SJohn Marino   Idx alloc = 0;
365795b7b453SJohn Marino #endif /* not RE_ENABLE_I18N */
365895b7b453SJohn Marino   reg_errcode_t ret;
365995b7b453SJohn Marino   bin_tree_t *tree;
366095b7b453SJohn Marino 
366195b7b453SJohn Marino   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3662*09d4459fSDaniel Fojt   if (__glibc_unlikely (sbcset == NULL))
366395b7b453SJohn Marino     {
366495b7b453SJohn Marino       *err = REG_ESPACE;
366595b7b453SJohn Marino       return NULL;
366695b7b453SJohn Marino     }
366795b7b453SJohn Marino #ifdef RE_ENABLE_I18N
3668*09d4459fSDaniel Fojt   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3669*09d4459fSDaniel Fojt   if (__glibc_unlikely (mbcset == NULL))
3670*09d4459fSDaniel Fojt     {
3671*09d4459fSDaniel Fojt       re_free (sbcset);
3672*09d4459fSDaniel Fojt       *err = REG_ESPACE;
3673*09d4459fSDaniel Fojt       return NULL;
367495b7b453SJohn Marino     }
3675*09d4459fSDaniel Fojt   mbcset->non_match = non_match;
3676*09d4459fSDaniel Fojt #endif /* RE_ENABLE_I18N */
367795b7b453SJohn Marino 
367895b7b453SJohn Marino   /* We don't care the syntax in this case.  */
367995b7b453SJohn Marino   ret = build_charclass (trans, sbcset,
368095b7b453SJohn Marino #ifdef RE_ENABLE_I18N
368195b7b453SJohn Marino 			 mbcset, &alloc,
368295b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
368395b7b453SJohn Marino 			 class_name, 0);
368495b7b453SJohn Marino 
3685*09d4459fSDaniel Fojt   if (__glibc_unlikely (ret != REG_NOERROR))
368695b7b453SJohn Marino     {
368795b7b453SJohn Marino       re_free (sbcset);
368895b7b453SJohn Marino #ifdef RE_ENABLE_I18N
368995b7b453SJohn Marino       free_charset (mbcset);
369095b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
369195b7b453SJohn Marino       *err = ret;
369295b7b453SJohn Marino       return NULL;
369395b7b453SJohn Marino     }
369495b7b453SJohn Marino   /* \w match '_' also.  */
369595b7b453SJohn Marino   for (; *extra; extra++)
369695b7b453SJohn Marino     bitset_set (sbcset, *extra);
369795b7b453SJohn Marino 
369895b7b453SJohn Marino   /* If it is non-matching list.  */
369995b7b453SJohn Marino   if (non_match)
370095b7b453SJohn Marino     bitset_not (sbcset);
370195b7b453SJohn Marino 
370295b7b453SJohn Marino #ifdef RE_ENABLE_I18N
370395b7b453SJohn Marino   /* Ensure only single byte characters are set.  */
370495b7b453SJohn Marino   if (dfa->mb_cur_max > 1)
370595b7b453SJohn Marino     bitset_mask (sbcset, dfa->sb_char);
370695b7b453SJohn Marino #endif
370795b7b453SJohn Marino 
370895b7b453SJohn Marino   /* Build a tree for simple bracket.  */
3709*09d4459fSDaniel Fojt   re_token_t br_token = { .type = SIMPLE_BRACKET, .opr.sbcset = sbcset };
371095b7b453SJohn Marino   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3711*09d4459fSDaniel Fojt   if (__glibc_unlikely (tree == NULL))
371295b7b453SJohn Marino     goto build_word_op_espace;
371395b7b453SJohn Marino 
371495b7b453SJohn Marino #ifdef RE_ENABLE_I18N
371595b7b453SJohn Marino   if (dfa->mb_cur_max > 1)
371695b7b453SJohn Marino     {
371795b7b453SJohn Marino       bin_tree_t *mbc_tree;
371895b7b453SJohn Marino       /* Build a tree for complex bracket.  */
371995b7b453SJohn Marino       br_token.type = COMPLEX_BRACKET;
372095b7b453SJohn Marino       br_token.opr.mbcset = mbcset;
372195b7b453SJohn Marino       dfa->has_mb_node = 1;
372295b7b453SJohn Marino       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3723*09d4459fSDaniel Fojt       if (__glibc_unlikely (mbc_tree == NULL))
372495b7b453SJohn Marino 	goto build_word_op_espace;
372595b7b453SJohn Marino       /* Then join them by ALT node.  */
372695b7b453SJohn Marino       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3727*09d4459fSDaniel Fojt       if (__glibc_likely (mbc_tree != NULL))
372895b7b453SJohn Marino 	return tree;
372995b7b453SJohn Marino     }
373095b7b453SJohn Marino   else
373195b7b453SJohn Marino     {
373295b7b453SJohn Marino       free_charset (mbcset);
373395b7b453SJohn Marino       return tree;
373495b7b453SJohn Marino     }
373595b7b453SJohn Marino #else /* not RE_ENABLE_I18N */
373695b7b453SJohn Marino   return tree;
373795b7b453SJohn Marino #endif /* not RE_ENABLE_I18N */
373895b7b453SJohn Marino 
373995b7b453SJohn Marino  build_word_op_espace:
374095b7b453SJohn Marino   re_free (sbcset);
374195b7b453SJohn Marino #ifdef RE_ENABLE_I18N
374295b7b453SJohn Marino   free_charset (mbcset);
374395b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
374495b7b453SJohn Marino   *err = REG_ESPACE;
374595b7b453SJohn Marino   return NULL;
374695b7b453SJohn Marino }
374795b7b453SJohn Marino 
374895b7b453SJohn Marino /* This is intended for the expressions like "a{1,3}".
3749cf28ed85SJohn Marino    Fetch a number from 'input', and return the number.
3750*09d4459fSDaniel Fojt    Return -1 if the number field is empty like "{,1}".
3751cf28ed85SJohn Marino    Return RE_DUP_MAX + 1 if the number field is too large.
3752*09d4459fSDaniel Fojt    Return -2 if an error occurred.  */
375395b7b453SJohn Marino 
375495b7b453SJohn Marino static Idx
fetch_number(re_string_t * input,re_token_t * token,reg_syntax_t syntax)375595b7b453SJohn Marino fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
375695b7b453SJohn Marino {
3757*09d4459fSDaniel Fojt   Idx num = -1;
375895b7b453SJohn Marino   unsigned char c;
375995b7b453SJohn Marino   while (1)
376095b7b453SJohn Marino     {
376195b7b453SJohn Marino       fetch_token (token, input, syntax);
376295b7b453SJohn Marino       c = token->opr.c;
3763*09d4459fSDaniel Fojt       if (__glibc_unlikely (token->type == END_OF_RE))
3764*09d4459fSDaniel Fojt 	return -2;
376595b7b453SJohn Marino       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
376695b7b453SJohn Marino 	break;
3767*09d4459fSDaniel Fojt       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3768*09d4459fSDaniel Fojt 	     ? -2
3769*09d4459fSDaniel Fojt 	     : num == -1
3770cf28ed85SJohn Marino 	     ? c - '0'
3771cf28ed85SJohn Marino 	     : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
377295b7b453SJohn Marino     }
377395b7b453SJohn Marino   return num;
377495b7b453SJohn Marino }
377595b7b453SJohn Marino 
377695b7b453SJohn Marino #ifdef RE_ENABLE_I18N
377795b7b453SJohn Marino static void
free_charset(re_charset_t * cset)377895b7b453SJohn Marino free_charset (re_charset_t *cset)
377995b7b453SJohn Marino {
378095b7b453SJohn Marino   re_free (cset->mbchars);
378195b7b453SJohn Marino # ifdef _LIBC
378295b7b453SJohn Marino   re_free (cset->coll_syms);
378395b7b453SJohn Marino   re_free (cset->equiv_classes);
3784*09d4459fSDaniel Fojt # endif
378595b7b453SJohn Marino   re_free (cset->range_starts);
378695b7b453SJohn Marino   re_free (cset->range_ends);
378795b7b453SJohn Marino   re_free (cset->char_classes);
378895b7b453SJohn Marino   re_free (cset);
378995b7b453SJohn Marino }
379095b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
379195b7b453SJohn Marino 
379295b7b453SJohn Marino /* Functions for binary tree operation.  */
379395b7b453SJohn Marino 
379495b7b453SJohn Marino /* Create a tree node.  */
379595b7b453SJohn Marino 
379695b7b453SJohn Marino static bin_tree_t *
create_tree(re_dfa_t * dfa,bin_tree_t * left,bin_tree_t * right,re_token_type_t type)379795b7b453SJohn Marino create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
379895b7b453SJohn Marino 	     re_token_type_t type)
379995b7b453SJohn Marino {
3800*09d4459fSDaniel Fojt   re_token_t t = { .type = type };
380195b7b453SJohn Marino   return create_token_tree (dfa, left, right, &t);
380295b7b453SJohn Marino }
380395b7b453SJohn Marino 
380495b7b453SJohn Marino static bin_tree_t *
create_token_tree(re_dfa_t * dfa,bin_tree_t * left,bin_tree_t * right,const re_token_t * token)380595b7b453SJohn Marino create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
380695b7b453SJohn Marino 		   const re_token_t *token)
380795b7b453SJohn Marino {
380895b7b453SJohn Marino   bin_tree_t *tree;
3809*09d4459fSDaniel Fojt   if (__glibc_unlikely (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE))
381095b7b453SJohn Marino     {
381195b7b453SJohn Marino       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
381295b7b453SJohn Marino 
381395b7b453SJohn Marino       if (storage == NULL)
381495b7b453SJohn Marino 	return NULL;
381595b7b453SJohn Marino       storage->next = dfa->str_tree_storage;
381695b7b453SJohn Marino       dfa->str_tree_storage = storage;
381795b7b453SJohn Marino       dfa->str_tree_storage_idx = 0;
381895b7b453SJohn Marino     }
381995b7b453SJohn Marino   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
382095b7b453SJohn Marino 
382195b7b453SJohn Marino   tree->parent = NULL;
382295b7b453SJohn Marino   tree->left = left;
382395b7b453SJohn Marino   tree->right = right;
382495b7b453SJohn Marino   tree->token = *token;
382595b7b453SJohn Marino   tree->token.duplicated = 0;
382695b7b453SJohn Marino   tree->token.opt_subexp = 0;
382795b7b453SJohn Marino   tree->first = NULL;
382895b7b453SJohn Marino   tree->next = NULL;
3829*09d4459fSDaniel Fojt   tree->node_idx = -1;
383095b7b453SJohn Marino 
383195b7b453SJohn Marino   if (left != NULL)
383295b7b453SJohn Marino     left->parent = tree;
383395b7b453SJohn Marino   if (right != NULL)
383495b7b453SJohn Marino     right->parent = tree;
383595b7b453SJohn Marino   return tree;
383695b7b453SJohn Marino }
383795b7b453SJohn Marino 
383895b7b453SJohn Marino /* Mark the tree SRC as an optional subexpression.
383995b7b453SJohn Marino    To be called from preorder or postorder.  */
384095b7b453SJohn Marino 
384195b7b453SJohn Marino static reg_errcode_t
mark_opt_subexp(void * extra,bin_tree_t * node)384295b7b453SJohn Marino mark_opt_subexp (void *extra, bin_tree_t *node)
384395b7b453SJohn Marino {
3844a8597f6cSJohn Marino   Idx idx = (uintptr_t) extra;
384595b7b453SJohn Marino   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
384695b7b453SJohn Marino     node->token.opt_subexp = 1;
384795b7b453SJohn Marino 
384895b7b453SJohn Marino   return REG_NOERROR;
384995b7b453SJohn Marino }
385095b7b453SJohn Marino 
385195b7b453SJohn Marino /* Free the allocated memory inside NODE. */
385295b7b453SJohn Marino 
385395b7b453SJohn Marino static void
free_token(re_token_t * node)385495b7b453SJohn Marino free_token (re_token_t *node)
385595b7b453SJohn Marino {
385695b7b453SJohn Marino #ifdef RE_ENABLE_I18N
385795b7b453SJohn Marino   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
385895b7b453SJohn Marino     free_charset (node->opr.mbcset);
385995b7b453SJohn Marino   else
386095b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
386195b7b453SJohn Marino     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
386295b7b453SJohn Marino       re_free (node->opr.sbcset);
386395b7b453SJohn Marino }
386495b7b453SJohn Marino 
386595b7b453SJohn Marino /* Worker function for tree walking.  Free the allocated memory inside NODE
386695b7b453SJohn Marino    and its children. */
386795b7b453SJohn Marino 
386895b7b453SJohn Marino static reg_errcode_t
free_tree(void * extra,bin_tree_t * node)3869680a9cb8SJohn Marino free_tree (void *extra, bin_tree_t *node)
387095b7b453SJohn Marino {
387195b7b453SJohn Marino   free_token (&node->token);
387295b7b453SJohn Marino   return REG_NOERROR;
387395b7b453SJohn Marino }
387495b7b453SJohn Marino 
387595b7b453SJohn Marino 
387695b7b453SJohn Marino /* Duplicate the node SRC, and return new node.  This is a preorder
387795b7b453SJohn Marino    visit similar to the one implemented by the generic visitor, but
387895b7b453SJohn Marino    we need more infrastructure to maintain two parallel trees --- so,
387995b7b453SJohn Marino    it's easier to duplicate.  */
388095b7b453SJohn Marino 
388195b7b453SJohn Marino static bin_tree_t *
duplicate_tree(const bin_tree_t * root,re_dfa_t * dfa)388295b7b453SJohn Marino duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
388395b7b453SJohn Marino {
388495b7b453SJohn Marino   const bin_tree_t *node;
388595b7b453SJohn Marino   bin_tree_t *dup_root;
388695b7b453SJohn Marino   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
388795b7b453SJohn Marino 
388895b7b453SJohn Marino   for (node = root; ; )
388995b7b453SJohn Marino     {
389095b7b453SJohn Marino       /* Create a new tree and link it back to the current parent.  */
389195b7b453SJohn Marino       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
389295b7b453SJohn Marino       if (*p_new == NULL)
389395b7b453SJohn Marino 	return NULL;
389495b7b453SJohn Marino       (*p_new)->parent = dup_node;
389595b7b453SJohn Marino       (*p_new)->token.duplicated = 1;
389695b7b453SJohn Marino       dup_node = *p_new;
389795b7b453SJohn Marino 
389895b7b453SJohn Marino       /* Go to the left node, or up and to the right.  */
389995b7b453SJohn Marino       if (node->left)
390095b7b453SJohn Marino 	{
390195b7b453SJohn Marino 	  node = node->left;
390295b7b453SJohn Marino 	  p_new = &dup_node->left;
390395b7b453SJohn Marino 	}
390495b7b453SJohn Marino       else
390595b7b453SJohn Marino 	{
390695b7b453SJohn Marino 	  const bin_tree_t *prev = NULL;
390795b7b453SJohn Marino 	  while (node->right == prev || node->right == NULL)
390895b7b453SJohn Marino 	    {
390995b7b453SJohn Marino 	      prev = node;
391095b7b453SJohn Marino 	      node = node->parent;
391195b7b453SJohn Marino 	      dup_node = dup_node->parent;
391295b7b453SJohn Marino 	      if (!node)
391395b7b453SJohn Marino 		return dup_root;
391495b7b453SJohn Marino 	    }
391595b7b453SJohn Marino 	  node = node->right;
391695b7b453SJohn Marino 	  p_new = &dup_node->right;
391795b7b453SJohn Marino 	}
391895b7b453SJohn Marino     }
391995b7b453SJohn Marino }
3920