195b7b453SJohn Marino /* Extended regular expression matching and search library.
2*09d4459fSDaniel Fojt Copyright (C) 2002-2020 Free Software Foundation, Inc.
395b7b453SJohn Marino This file is part of the GNU C Library.
495b7b453SJohn Marino Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
595b7b453SJohn Marino
6680a9cb8SJohn Marino The GNU C Library is free software; you can redistribute it and/or
7680a9cb8SJohn Marino modify it under the terms of the GNU General Public
8680a9cb8SJohn Marino License as published by the Free Software Foundation; either
9680a9cb8SJohn Marino version 3 of the License, or (at your option) any later version.
1095b7b453SJohn Marino
11680a9cb8SJohn Marino The GNU C Library is distributed in the hope that it will be useful,
1295b7b453SJohn Marino but WITHOUT ANY WARRANTY; without even the implied warranty of
13680a9cb8SJohn Marino MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14680a9cb8SJohn Marino General Public License for more details.
1595b7b453SJohn Marino
16680a9cb8SJohn Marino You should have received a copy of the GNU General Public
17680a9cb8SJohn Marino License along with the GNU C Library; if not, see
18*09d4459fSDaniel Fojt <https://www.gnu.org/licenses/>. */
1995b7b453SJohn Marino
20dc7c36e4SJohn Marino #ifdef _LIBC
21dc7c36e4SJohn Marino # include <locale/weight.h>
22dc7c36e4SJohn Marino #endif
23dc7c36e4SJohn Marino
2495b7b453SJohn Marino static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
2595b7b453SJohn Marino size_t length, reg_syntax_t syntax);
2695b7b453SJohn Marino static void re_compile_fastmap_iter (regex_t *bufp,
2795b7b453SJohn Marino const re_dfastate_t *init_state,
2895b7b453SJohn Marino char *fastmap);
2995b7b453SJohn Marino static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
3095b7b453SJohn Marino #ifdef RE_ENABLE_I18N
3195b7b453SJohn Marino static void free_charset (re_charset_t *cset);
3295b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
3395b7b453SJohn Marino static void free_workarea_compile (regex_t *preg);
3495b7b453SJohn Marino static reg_errcode_t create_initial_state (re_dfa_t *dfa);
3595b7b453SJohn Marino #ifdef RE_ENABLE_I18N
3695b7b453SJohn Marino static void optimize_utf8 (re_dfa_t *dfa);
3795b7b453SJohn Marino #endif
3895b7b453SJohn Marino static reg_errcode_t analyze (regex_t *preg);
3995b7b453SJohn Marino static reg_errcode_t preorder (bin_tree_t *root,
4095b7b453SJohn Marino reg_errcode_t (fn (void *, bin_tree_t *)),
4195b7b453SJohn Marino void *extra);
4295b7b453SJohn Marino static reg_errcode_t postorder (bin_tree_t *root,
4395b7b453SJohn Marino reg_errcode_t (fn (void *, bin_tree_t *)),
4495b7b453SJohn Marino void *extra);
4595b7b453SJohn Marino static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
4695b7b453SJohn Marino static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
4795b7b453SJohn Marino static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
4895b7b453SJohn Marino bin_tree_t *node);
4995b7b453SJohn Marino static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
5095b7b453SJohn Marino static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
5195b7b453SJohn Marino static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
5295b7b453SJohn Marino static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
5395b7b453SJohn Marino static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
5495b7b453SJohn Marino unsigned int constraint);
5595b7b453SJohn Marino static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
5695b7b453SJohn Marino static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
5795b7b453SJohn Marino Idx node, bool root);
5895b7b453SJohn Marino static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
5995b7b453SJohn Marino static Idx fetch_number (re_string_t *input, re_token_t *token,
6095b7b453SJohn Marino reg_syntax_t syntax);
6195b7b453SJohn Marino static int peek_token (re_token_t *token, re_string_t *input,
62*09d4459fSDaniel Fojt reg_syntax_t syntax);
6395b7b453SJohn Marino static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
6495b7b453SJohn Marino reg_syntax_t syntax, reg_errcode_t *err);
6595b7b453SJohn Marino static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
6695b7b453SJohn Marino re_token_t *token, reg_syntax_t syntax,
6795b7b453SJohn Marino Idx nest, reg_errcode_t *err);
6895b7b453SJohn Marino static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
6995b7b453SJohn Marino re_token_t *token, reg_syntax_t syntax,
7095b7b453SJohn Marino Idx nest, reg_errcode_t *err);
7195b7b453SJohn Marino static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
7295b7b453SJohn Marino re_token_t *token, reg_syntax_t syntax,
7395b7b453SJohn Marino Idx nest, reg_errcode_t *err);
7495b7b453SJohn Marino static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
7595b7b453SJohn Marino re_token_t *token, reg_syntax_t syntax,
7695b7b453SJohn Marino Idx nest, reg_errcode_t *err);
7795b7b453SJohn Marino static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
7895b7b453SJohn Marino re_dfa_t *dfa, re_token_t *token,
7995b7b453SJohn Marino reg_syntax_t syntax, reg_errcode_t *err);
8095b7b453SJohn Marino static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
8195b7b453SJohn Marino re_token_t *token, reg_syntax_t syntax,
8295b7b453SJohn Marino reg_errcode_t *err);
8395b7b453SJohn Marino static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
8495b7b453SJohn Marino re_string_t *regexp,
8595b7b453SJohn Marino re_token_t *token, int token_len,
8695b7b453SJohn Marino re_dfa_t *dfa,
8795b7b453SJohn Marino reg_syntax_t syntax,
8895b7b453SJohn Marino bool accept_hyphen);
8995b7b453SJohn Marino static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
9095b7b453SJohn Marino re_string_t *regexp,
9195b7b453SJohn Marino re_token_t *token);
9295b7b453SJohn Marino #ifdef RE_ENABLE_I18N
9395b7b453SJohn Marino static reg_errcode_t build_equiv_class (bitset_t sbcset,
9495b7b453SJohn Marino re_charset_t *mbcset,
9595b7b453SJohn Marino Idx *equiv_class_alloc,
9695b7b453SJohn Marino const unsigned char *name);
9795b7b453SJohn Marino static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
9895b7b453SJohn Marino bitset_t sbcset,
9995b7b453SJohn Marino re_charset_t *mbcset,
10095b7b453SJohn Marino Idx *char_class_alloc,
101680a9cb8SJohn Marino const char *class_name,
10295b7b453SJohn Marino reg_syntax_t syntax);
10395b7b453SJohn Marino #else /* not RE_ENABLE_I18N */
10495b7b453SJohn Marino static reg_errcode_t build_equiv_class (bitset_t sbcset,
10595b7b453SJohn Marino const unsigned char *name);
10695b7b453SJohn Marino static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
10795b7b453SJohn Marino bitset_t sbcset,
108680a9cb8SJohn Marino const char *class_name,
10995b7b453SJohn Marino reg_syntax_t syntax);
11095b7b453SJohn Marino #endif /* not RE_ENABLE_I18N */
11195b7b453SJohn Marino static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
11295b7b453SJohn Marino RE_TRANSLATE_TYPE trans,
113680a9cb8SJohn Marino const char *class_name,
114680a9cb8SJohn Marino const char *extra,
11595b7b453SJohn Marino bool non_match, reg_errcode_t *err);
11695b7b453SJohn Marino static bin_tree_t *create_tree (re_dfa_t *dfa,
11795b7b453SJohn Marino bin_tree_t *left, bin_tree_t *right,
11895b7b453SJohn Marino re_token_type_t type);
11995b7b453SJohn Marino static bin_tree_t *create_token_tree (re_dfa_t *dfa,
12095b7b453SJohn Marino bin_tree_t *left, bin_tree_t *right,
12195b7b453SJohn Marino const re_token_t *token);
12295b7b453SJohn Marino static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
12395b7b453SJohn Marino static void free_token (re_token_t *node);
12495b7b453SJohn Marino static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
12595b7b453SJohn Marino static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
12695b7b453SJohn Marino
12795b7b453SJohn Marino /* This table gives an error message for each of the error codes listed
12895b7b453SJohn Marino in regex.h. Obviously the order here has to be same as there.
12995b7b453SJohn Marino POSIX doesn't require that we do anything for REG_NOERROR,
13095b7b453SJohn Marino but why not be nice? */
13195b7b453SJohn Marino
13295b7b453SJohn Marino static const char __re_error_msgid[] =
13395b7b453SJohn Marino {
13495b7b453SJohn Marino #define REG_NOERROR_IDX 0
13595b7b453SJohn Marino gettext_noop ("Success") /* REG_NOERROR */
13695b7b453SJohn Marino "\0"
13795b7b453SJohn Marino #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
13895b7b453SJohn Marino gettext_noop ("No match") /* REG_NOMATCH */
13995b7b453SJohn Marino "\0"
14095b7b453SJohn Marino #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
14195b7b453SJohn Marino gettext_noop ("Invalid regular expression") /* REG_BADPAT */
14295b7b453SJohn Marino "\0"
14395b7b453SJohn Marino #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
14495b7b453SJohn Marino gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
14595b7b453SJohn Marino "\0"
14695b7b453SJohn Marino #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
14795b7b453SJohn Marino gettext_noop ("Invalid character class name") /* REG_ECTYPE */
14895b7b453SJohn Marino "\0"
14995b7b453SJohn Marino #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
15095b7b453SJohn Marino gettext_noop ("Trailing backslash") /* REG_EESCAPE */
15195b7b453SJohn Marino "\0"
15295b7b453SJohn Marino #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
15395b7b453SJohn Marino gettext_noop ("Invalid back reference") /* REG_ESUBREG */
15495b7b453SJohn Marino "\0"
15595b7b453SJohn Marino #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
156*09d4459fSDaniel Fojt gettext_noop ("Unmatched [, [^, [:, [., or [=") /* REG_EBRACK */
15795b7b453SJohn Marino "\0"
158*09d4459fSDaniel Fojt #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
15995b7b453SJohn Marino gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
16095b7b453SJohn Marino "\0"
16195b7b453SJohn Marino #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
16295b7b453SJohn Marino gettext_noop ("Unmatched \\{") /* REG_EBRACE */
16395b7b453SJohn Marino "\0"
16495b7b453SJohn Marino #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
16595b7b453SJohn Marino gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
16695b7b453SJohn Marino "\0"
16795b7b453SJohn Marino #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
16895b7b453SJohn Marino gettext_noop ("Invalid range end") /* REG_ERANGE */
16995b7b453SJohn Marino "\0"
17095b7b453SJohn Marino #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
17195b7b453SJohn Marino gettext_noop ("Memory exhausted") /* REG_ESPACE */
17295b7b453SJohn Marino "\0"
17395b7b453SJohn Marino #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
17495b7b453SJohn Marino gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
17595b7b453SJohn Marino "\0"
17695b7b453SJohn Marino #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
17795b7b453SJohn Marino gettext_noop ("Premature end of regular expression") /* REG_EEND */
17895b7b453SJohn Marino "\0"
17995b7b453SJohn Marino #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
18095b7b453SJohn Marino gettext_noop ("Regular expression too big") /* REG_ESIZE */
18195b7b453SJohn Marino "\0"
18295b7b453SJohn Marino #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
18395b7b453SJohn Marino gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
18495b7b453SJohn Marino };
18595b7b453SJohn Marino
18695b7b453SJohn Marino static const size_t __re_error_msgid_idx[] =
18795b7b453SJohn Marino {
18895b7b453SJohn Marino REG_NOERROR_IDX,
18995b7b453SJohn Marino REG_NOMATCH_IDX,
19095b7b453SJohn Marino REG_BADPAT_IDX,
19195b7b453SJohn Marino REG_ECOLLATE_IDX,
19295b7b453SJohn Marino REG_ECTYPE_IDX,
19395b7b453SJohn Marino REG_EESCAPE_IDX,
19495b7b453SJohn Marino REG_ESUBREG_IDX,
19595b7b453SJohn Marino REG_EBRACK_IDX,
19695b7b453SJohn Marino REG_EPAREN_IDX,
19795b7b453SJohn Marino REG_EBRACE_IDX,
19895b7b453SJohn Marino REG_BADBR_IDX,
19995b7b453SJohn Marino REG_ERANGE_IDX,
20095b7b453SJohn Marino REG_ESPACE_IDX,
20195b7b453SJohn Marino REG_BADRPT_IDX,
20295b7b453SJohn Marino REG_EEND_IDX,
20395b7b453SJohn Marino REG_ESIZE_IDX,
20495b7b453SJohn Marino REG_ERPAREN_IDX
20595b7b453SJohn Marino };
20695b7b453SJohn Marino
20795b7b453SJohn Marino /* Entry points for GNU code. */
20895b7b453SJohn Marino
20995b7b453SJohn Marino /* re_compile_pattern is the GNU regular expression compiler: it
21095b7b453SJohn Marino compiles PATTERN (of length LENGTH) and puts the result in BUFP.
21195b7b453SJohn Marino Returns 0 if the pattern was valid, otherwise an error string.
21295b7b453SJohn Marino
213cf28ed85SJohn Marino Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
21495b7b453SJohn Marino are set in BUFP on entry. */
21595b7b453SJohn Marino
21695b7b453SJohn Marino const char *
re_compile_pattern(const char * pattern,size_t length,struct re_pattern_buffer * bufp)21795b7b453SJohn Marino re_compile_pattern (const char *pattern, size_t length,
21895b7b453SJohn Marino struct re_pattern_buffer *bufp)
21995b7b453SJohn Marino {
22095b7b453SJohn Marino reg_errcode_t ret;
22195b7b453SJohn Marino
22295b7b453SJohn Marino /* And GNU code determines whether or not to get register information
22395b7b453SJohn Marino by passing null for the REGS argument to re_match, etc., not by
22495b7b453SJohn Marino setting no_sub, unless RE_NO_SUB is set. */
22595b7b453SJohn Marino bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
22695b7b453SJohn Marino
22795b7b453SJohn Marino /* Match anchors at newline. */
22895b7b453SJohn Marino bufp->newline_anchor = 1;
22995b7b453SJohn Marino
23095b7b453SJohn Marino ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
23195b7b453SJohn Marino
23295b7b453SJohn Marino if (!ret)
23395b7b453SJohn Marino return NULL;
23495b7b453SJohn Marino return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
23595b7b453SJohn Marino }
weak_alias(__re_compile_pattern,re_compile_pattern)23695b7b453SJohn Marino weak_alias (__re_compile_pattern, re_compile_pattern)
23795b7b453SJohn Marino
238cf28ed85SJohn Marino /* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
23995b7b453SJohn Marino also be assigned to arbitrarily: each pattern buffer stores its own
24095b7b453SJohn Marino syntax, so it can be changed between regex compilations. */
24195b7b453SJohn Marino /* This has no initializer because initialized variables in Emacs
24295b7b453SJohn Marino become read-only after dumping. */
24395b7b453SJohn Marino reg_syntax_t re_syntax_options;
24495b7b453SJohn Marino
24595b7b453SJohn Marino
24695b7b453SJohn Marino /* Specify the precise syntax of regexps for compilation. This provides
24795b7b453SJohn Marino for compatibility for various utilities which historically have
24895b7b453SJohn Marino different, incompatible syntaxes.
24995b7b453SJohn Marino
25095b7b453SJohn Marino The argument SYNTAX is a bit mask comprised of the various bits
25195b7b453SJohn Marino defined in regex.h. We return the old syntax. */
25295b7b453SJohn Marino
25395b7b453SJohn Marino reg_syntax_t
254*09d4459fSDaniel Fojt re_set_syntax (reg_syntax_t syntax)
25595b7b453SJohn Marino {
25695b7b453SJohn Marino reg_syntax_t ret = re_syntax_options;
25795b7b453SJohn Marino
25895b7b453SJohn Marino re_syntax_options = syntax;
25995b7b453SJohn Marino return ret;
26095b7b453SJohn Marino }
weak_alias(__re_set_syntax,re_set_syntax)26195b7b453SJohn Marino weak_alias (__re_set_syntax, re_set_syntax)
26295b7b453SJohn Marino
26395b7b453SJohn Marino int
264*09d4459fSDaniel Fojt re_compile_fastmap (struct re_pattern_buffer *bufp)
26595b7b453SJohn Marino {
266cf28ed85SJohn Marino re_dfa_t *dfa = bufp->buffer;
26795b7b453SJohn Marino char *fastmap = bufp->fastmap;
26895b7b453SJohn Marino
26995b7b453SJohn Marino memset (fastmap, '\0', sizeof (char) * SBC_MAX);
27095b7b453SJohn Marino re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
27195b7b453SJohn Marino if (dfa->init_state != dfa->init_state_word)
27295b7b453SJohn Marino re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
27395b7b453SJohn Marino if (dfa->init_state != dfa->init_state_nl)
27495b7b453SJohn Marino re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
27595b7b453SJohn Marino if (dfa->init_state != dfa->init_state_begbuf)
27695b7b453SJohn Marino re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
27795b7b453SJohn Marino bufp->fastmap_accurate = 1;
27895b7b453SJohn Marino return 0;
27995b7b453SJohn Marino }
weak_alias(__re_compile_fastmap,re_compile_fastmap)28095b7b453SJohn Marino weak_alias (__re_compile_fastmap, re_compile_fastmap)
28195b7b453SJohn Marino
28295b7b453SJohn Marino static inline void
283680a9cb8SJohn Marino __attribute__ ((always_inline))
28495b7b453SJohn Marino re_set_fastmap (char *fastmap, bool icase, int ch)
28595b7b453SJohn Marino {
28695b7b453SJohn Marino fastmap[ch] = 1;
28795b7b453SJohn Marino if (icase)
28895b7b453SJohn Marino fastmap[tolower (ch)] = 1;
28995b7b453SJohn Marino }
29095b7b453SJohn Marino
29195b7b453SJohn Marino /* Helper function for re_compile_fastmap.
29295b7b453SJohn Marino Compile fastmap for the initial_state INIT_STATE. */
29395b7b453SJohn Marino
29495b7b453SJohn Marino static void
re_compile_fastmap_iter(regex_t * bufp,const re_dfastate_t * init_state,char * fastmap)29595b7b453SJohn Marino re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
29695b7b453SJohn Marino char *fastmap)
29795b7b453SJohn Marino {
298cf28ed85SJohn Marino re_dfa_t *dfa = bufp->buffer;
29995b7b453SJohn Marino Idx node_cnt;
30095b7b453SJohn Marino bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
30195b7b453SJohn Marino for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
30295b7b453SJohn Marino {
30395b7b453SJohn Marino Idx node = init_state->nodes.elems[node_cnt];
30495b7b453SJohn Marino re_token_type_t type = dfa->nodes[node].type;
30595b7b453SJohn Marino
30695b7b453SJohn Marino if (type == CHARACTER)
30795b7b453SJohn Marino {
30895b7b453SJohn Marino re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
30995b7b453SJohn Marino #ifdef RE_ENABLE_I18N
31095b7b453SJohn Marino if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
31195b7b453SJohn Marino {
31295b7b453SJohn Marino unsigned char buf[MB_LEN_MAX];
31395b7b453SJohn Marino unsigned char *p;
31495b7b453SJohn Marino wchar_t wc;
31595b7b453SJohn Marino mbstate_t state;
31695b7b453SJohn Marino
31795b7b453SJohn Marino p = buf;
31895b7b453SJohn Marino *p++ = dfa->nodes[node].opr.c;
31995b7b453SJohn Marino while (++node < dfa->nodes_len
32095b7b453SJohn Marino && dfa->nodes[node].type == CHARACTER
32195b7b453SJohn Marino && dfa->nodes[node].mb_partial)
32295b7b453SJohn Marino *p++ = dfa->nodes[node].opr.c;
32395b7b453SJohn Marino memset (&state, '\0', sizeof (state));
32495b7b453SJohn Marino if (__mbrtowc (&wc, (const char *) buf, p - buf,
32595b7b453SJohn Marino &state) == p - buf
326dc7c36e4SJohn Marino && (__wcrtomb ((char *) buf, __towlower (wc), &state)
32795b7b453SJohn Marino != (size_t) -1))
32895b7b453SJohn Marino re_set_fastmap (fastmap, false, buf[0]);
32995b7b453SJohn Marino }
33095b7b453SJohn Marino #endif
33195b7b453SJohn Marino }
33295b7b453SJohn Marino else if (type == SIMPLE_BRACKET)
33395b7b453SJohn Marino {
33495b7b453SJohn Marino int i, ch;
33595b7b453SJohn Marino for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
33695b7b453SJohn Marino {
33795b7b453SJohn Marino int j;
33895b7b453SJohn Marino bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
33995b7b453SJohn Marino for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
34095b7b453SJohn Marino if (w & ((bitset_word_t) 1 << j))
34195b7b453SJohn Marino re_set_fastmap (fastmap, icase, ch);
34295b7b453SJohn Marino }
34395b7b453SJohn Marino }
34495b7b453SJohn Marino #ifdef RE_ENABLE_I18N
34595b7b453SJohn Marino else if (type == COMPLEX_BRACKET)
34695b7b453SJohn Marino {
34795b7b453SJohn Marino re_charset_t *cset = dfa->nodes[node].opr.mbcset;
34895b7b453SJohn Marino Idx i;
34995b7b453SJohn Marino
35095b7b453SJohn Marino # ifdef _LIBC
35195b7b453SJohn Marino /* See if we have to try all bytes which start multiple collation
35295b7b453SJohn Marino elements.
35395b7b453SJohn Marino e.g. In da_DK, we want to catch 'a' since "aa" is a valid
35495b7b453SJohn Marino collation element, and don't catch 'b' since 'b' is
35595b7b453SJohn Marino the only collation element which starts from 'b' (and
35695b7b453SJohn Marino it is caught by SIMPLE_BRACKET). */
35795b7b453SJohn Marino if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
35895b7b453SJohn Marino && (cset->ncoll_syms || cset->nranges))
35995b7b453SJohn Marino {
36095b7b453SJohn Marino const int32_t *table = (const int32_t *)
36195b7b453SJohn Marino _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
36295b7b453SJohn Marino for (i = 0; i < SBC_MAX; ++i)
36395b7b453SJohn Marino if (table[i] < 0)
36495b7b453SJohn Marino re_set_fastmap (fastmap, icase, i);
36595b7b453SJohn Marino }
36695b7b453SJohn Marino # endif /* _LIBC */
36795b7b453SJohn Marino
36895b7b453SJohn Marino /* See if we have to start the match at all multibyte characters,
36995b7b453SJohn Marino i.e. where we would not find an invalid sequence. This only
37095b7b453SJohn Marino applies to multibyte character sets; for single byte character
37195b7b453SJohn Marino sets, the SIMPLE_BRACKET again suffices. */
37295b7b453SJohn Marino if (dfa->mb_cur_max > 1
37395b7b453SJohn Marino && (cset->nchar_classes || cset->non_match || cset->nranges
37495b7b453SJohn Marino # ifdef _LIBC
37595b7b453SJohn Marino || cset->nequiv_classes
37695b7b453SJohn Marino # endif /* _LIBC */
37795b7b453SJohn Marino ))
37895b7b453SJohn Marino {
37995b7b453SJohn Marino unsigned char c = 0;
38095b7b453SJohn Marino do
38195b7b453SJohn Marino {
38295b7b453SJohn Marino mbstate_t mbs;
38395b7b453SJohn Marino memset (&mbs, 0, sizeof (mbs));
38495b7b453SJohn Marino if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
38595b7b453SJohn Marino re_set_fastmap (fastmap, false, (int) c);
38695b7b453SJohn Marino }
38795b7b453SJohn Marino while (++c != 0);
38895b7b453SJohn Marino }
38995b7b453SJohn Marino
39095b7b453SJohn Marino else
39195b7b453SJohn Marino {
39295b7b453SJohn Marino /* ... Else catch all bytes which can start the mbchars. */
39395b7b453SJohn Marino for (i = 0; i < cset->nmbchars; ++i)
39495b7b453SJohn Marino {
39595b7b453SJohn Marino char buf[256];
39695b7b453SJohn Marino mbstate_t state;
39795b7b453SJohn Marino memset (&state, '\0', sizeof (state));
39895b7b453SJohn Marino if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
39995b7b453SJohn Marino re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
40095b7b453SJohn Marino if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
40195b7b453SJohn Marino {
402dc7c36e4SJohn Marino if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
40395b7b453SJohn Marino != (size_t) -1)
40495b7b453SJohn Marino re_set_fastmap (fastmap, false, *(unsigned char *) buf);
40595b7b453SJohn Marino }
40695b7b453SJohn Marino }
40795b7b453SJohn Marino }
40895b7b453SJohn Marino }
40995b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
41095b7b453SJohn Marino else if (type == OP_PERIOD
41195b7b453SJohn Marino #ifdef RE_ENABLE_I18N
41295b7b453SJohn Marino || type == OP_UTF8_PERIOD
41395b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
41495b7b453SJohn Marino || type == END_OF_RE)
41595b7b453SJohn Marino {
41695b7b453SJohn Marino memset (fastmap, '\1', sizeof (char) * SBC_MAX);
41795b7b453SJohn Marino if (type == END_OF_RE)
41895b7b453SJohn Marino bufp->can_be_null = 1;
41995b7b453SJohn Marino return;
42095b7b453SJohn Marino }
42195b7b453SJohn Marino }
42295b7b453SJohn Marino }
42395b7b453SJohn Marino
42495b7b453SJohn Marino /* Entry point for POSIX code. */
42595b7b453SJohn Marino /* regcomp takes a regular expression as a string and compiles it.
42695b7b453SJohn Marino
42795b7b453SJohn Marino PREG is a regex_t *. We do not expect any fields to be initialized,
42895b7b453SJohn Marino since POSIX says we shouldn't. Thus, we set
42995b7b453SJohn Marino
430cf28ed85SJohn Marino 'buffer' to the compiled pattern;
431cf28ed85SJohn Marino 'used' to the length of the compiled pattern;
432cf28ed85SJohn Marino 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
43395b7b453SJohn Marino REG_EXTENDED bit in CFLAGS is set; otherwise, to
43495b7b453SJohn Marino RE_SYNTAX_POSIX_BASIC;
435cf28ed85SJohn Marino 'newline_anchor' to REG_NEWLINE being set in CFLAGS;
436cf28ed85SJohn Marino 'fastmap' to an allocated space for the fastmap;
437cf28ed85SJohn Marino 'fastmap_accurate' to zero;
438cf28ed85SJohn Marino 're_nsub' to the number of subexpressions in PATTERN.
43995b7b453SJohn Marino
44095b7b453SJohn Marino PATTERN is the address of the pattern string.
44195b7b453SJohn Marino
44295b7b453SJohn Marino CFLAGS is a series of bits which affect compilation.
44395b7b453SJohn Marino
44495b7b453SJohn Marino If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
44595b7b453SJohn Marino use POSIX basic syntax.
44695b7b453SJohn Marino
44795b7b453SJohn Marino If REG_NEWLINE is set, then . and [^...] don't match newline.
44895b7b453SJohn Marino Also, regexec will try a match beginning after every newline.
44995b7b453SJohn Marino
45095b7b453SJohn Marino If REG_ICASE is set, then we considers upper- and lowercase
45195b7b453SJohn Marino versions of letters to be equivalent when matching.
45295b7b453SJohn Marino
45395b7b453SJohn Marino If REG_NOSUB is set, then when PREG is passed to regexec, that
45495b7b453SJohn Marino routine will report only success or failure, and nothing about the
45595b7b453SJohn Marino registers.
45695b7b453SJohn Marino
45795b7b453SJohn Marino It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
45895b7b453SJohn Marino the return codes and their meanings.) */
45995b7b453SJohn Marino
46095b7b453SJohn Marino int
regcomp(regex_t * __restrict preg,const char * __restrict pattern,int cflags)461*09d4459fSDaniel Fojt regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
46295b7b453SJohn Marino {
46395b7b453SJohn Marino reg_errcode_t ret;
46495b7b453SJohn Marino reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
46595b7b453SJohn Marino : RE_SYNTAX_POSIX_BASIC);
46695b7b453SJohn Marino
46795b7b453SJohn Marino preg->buffer = NULL;
46895b7b453SJohn Marino preg->allocated = 0;
46995b7b453SJohn Marino preg->used = 0;
47095b7b453SJohn Marino
47195b7b453SJohn Marino /* Try to allocate space for the fastmap. */
47295b7b453SJohn Marino preg->fastmap = re_malloc (char, SBC_MAX);
473*09d4459fSDaniel Fojt if (__glibc_unlikely (preg->fastmap == NULL))
47495b7b453SJohn Marino return REG_ESPACE;
47595b7b453SJohn Marino
47695b7b453SJohn Marino syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
47795b7b453SJohn Marino
47895b7b453SJohn Marino /* If REG_NEWLINE is set, newlines are treated differently. */
47995b7b453SJohn Marino if (cflags & REG_NEWLINE)
48095b7b453SJohn Marino { /* REG_NEWLINE implies neither . nor [^...] match newline. */
48195b7b453SJohn Marino syntax &= ~RE_DOT_NEWLINE;
48295b7b453SJohn Marino syntax |= RE_HAT_LISTS_NOT_NEWLINE;
48395b7b453SJohn Marino /* It also changes the matching behavior. */
48495b7b453SJohn Marino preg->newline_anchor = 1;
48595b7b453SJohn Marino }
48695b7b453SJohn Marino else
48795b7b453SJohn Marino preg->newline_anchor = 0;
48895b7b453SJohn Marino preg->no_sub = !!(cflags & REG_NOSUB);
48995b7b453SJohn Marino preg->translate = NULL;
49095b7b453SJohn Marino
49195b7b453SJohn Marino ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
49295b7b453SJohn Marino
49395b7b453SJohn Marino /* POSIX doesn't distinguish between an unmatched open-group and an
49495b7b453SJohn Marino unmatched close-group: both are REG_EPAREN. */
49595b7b453SJohn Marino if (ret == REG_ERPAREN)
49695b7b453SJohn Marino ret = REG_EPAREN;
49795b7b453SJohn Marino
49895b7b453SJohn Marino /* We have already checked preg->fastmap != NULL. */
499*09d4459fSDaniel Fojt if (__glibc_likely (ret == REG_NOERROR))
50095b7b453SJohn Marino /* Compute the fastmap now, since regexec cannot modify the pattern
50195b7b453SJohn Marino buffer. This function never fails in this implementation. */
50295b7b453SJohn Marino (void) re_compile_fastmap (preg);
50395b7b453SJohn Marino else
50495b7b453SJohn Marino {
50595b7b453SJohn Marino /* Some error occurred while compiling the expression. */
50695b7b453SJohn Marino re_free (preg->fastmap);
50795b7b453SJohn Marino preg->fastmap = NULL;
50895b7b453SJohn Marino }
50995b7b453SJohn Marino
51095b7b453SJohn Marino return (int) ret;
51195b7b453SJohn Marino }
512*09d4459fSDaniel Fojt libc_hidden_def (__regcomp)
weak_alias(__regcomp,regcomp)51395b7b453SJohn Marino weak_alias (__regcomp, regcomp)
51495b7b453SJohn Marino
51595b7b453SJohn Marino /* Returns a message corresponding to an error code, ERRCODE, returned
51695b7b453SJohn Marino from either regcomp or regexec. We don't use PREG here. */
51795b7b453SJohn Marino
51895b7b453SJohn Marino size_t
519*09d4459fSDaniel Fojt regerror (int errcode, const regex_t *__restrict preg, char *__restrict errbuf,
520*09d4459fSDaniel Fojt size_t errbuf_size)
52195b7b453SJohn Marino {
52295b7b453SJohn Marino const char *msg;
52395b7b453SJohn Marino size_t msg_size;
524*09d4459fSDaniel Fojt int nerrcodes = sizeof __re_error_msgid_idx / sizeof __re_error_msgid_idx[0];
52595b7b453SJohn Marino
526*09d4459fSDaniel Fojt if (__glibc_unlikely (errcode < 0 || errcode >= nerrcodes))
52795b7b453SJohn Marino /* Only error codes returned by the rest of the code should be passed
52895b7b453SJohn Marino to this routine. If we are given anything else, or if other regex
52995b7b453SJohn Marino code generates an invalid error code, then the program has a bug.
53095b7b453SJohn Marino Dump core so we can fix it. */
53195b7b453SJohn Marino abort ();
53295b7b453SJohn Marino
53395b7b453SJohn Marino msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
53495b7b453SJohn Marino
53595b7b453SJohn Marino msg_size = strlen (msg) + 1; /* Includes the null. */
53695b7b453SJohn Marino
537*09d4459fSDaniel Fojt if (__glibc_likely (errbuf_size != 0))
53895b7b453SJohn Marino {
53995b7b453SJohn Marino size_t cpy_size = msg_size;
540*09d4459fSDaniel Fojt if (__glibc_unlikely (msg_size > errbuf_size))
54195b7b453SJohn Marino {
54295b7b453SJohn Marino cpy_size = errbuf_size - 1;
54395b7b453SJohn Marino errbuf[cpy_size] = '\0';
54495b7b453SJohn Marino }
54595b7b453SJohn Marino memcpy (errbuf, msg, cpy_size);
54695b7b453SJohn Marino }
54795b7b453SJohn Marino
54895b7b453SJohn Marino return msg_size;
54995b7b453SJohn Marino }
55095b7b453SJohn Marino weak_alias (__regerror, regerror)
55195b7b453SJohn Marino
55295b7b453SJohn Marino
55395b7b453SJohn Marino #ifdef RE_ENABLE_I18N
55495b7b453SJohn Marino /* This static array is used for the map to single-byte characters when
55595b7b453SJohn Marino UTF-8 is used. Otherwise we would allocate memory just to initialize
55695b7b453SJohn Marino it the same all the time. UTF-8 is the preferred encoding so this is
55795b7b453SJohn Marino a worthwhile optimization. */
55895b7b453SJohn Marino static const bitset_t utf8_sb_map =
55995b7b453SJohn Marino {
56095b7b453SJohn Marino /* Set the first 128 bits. */
561680a9cb8SJohn Marino # if defined __GNUC__ && !defined __STRICT_ANSI__
562cf28ed85SJohn Marino [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
563cf28ed85SJohn Marino # else
56495b7b453SJohn Marino # if 4 * BITSET_WORD_BITS < ASCII_CHARS
56595b7b453SJohn Marino # error "bitset_word_t is narrower than 32 bits"
56695b7b453SJohn Marino # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
56795b7b453SJohn Marino BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
56895b7b453SJohn Marino # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
56995b7b453SJohn Marino BITSET_WORD_MAX, BITSET_WORD_MAX,
57095b7b453SJohn Marino # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
57195b7b453SJohn Marino BITSET_WORD_MAX,
57295b7b453SJohn Marino # endif
57395b7b453SJohn Marino (BITSET_WORD_MAX
57495b7b453SJohn Marino >> (SBC_MAX % BITSET_WORD_BITS == 0
57595b7b453SJohn Marino ? 0
57695b7b453SJohn Marino : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
577cf28ed85SJohn Marino # endif
57895b7b453SJohn Marino };
57995b7b453SJohn Marino #endif
58095b7b453SJohn Marino
58195b7b453SJohn Marino
58295b7b453SJohn Marino static void
free_dfa_content(re_dfa_t * dfa)58395b7b453SJohn Marino free_dfa_content (re_dfa_t *dfa)
58495b7b453SJohn Marino {
58595b7b453SJohn Marino Idx i, j;
58695b7b453SJohn Marino
58795b7b453SJohn Marino if (dfa->nodes)
58895b7b453SJohn Marino for (i = 0; i < dfa->nodes_len; ++i)
58995b7b453SJohn Marino free_token (dfa->nodes + i);
59095b7b453SJohn Marino re_free (dfa->nexts);
59195b7b453SJohn Marino for (i = 0; i < dfa->nodes_len; ++i)
59295b7b453SJohn Marino {
59395b7b453SJohn Marino if (dfa->eclosures != NULL)
59495b7b453SJohn Marino re_node_set_free (dfa->eclosures + i);
59595b7b453SJohn Marino if (dfa->inveclosures != NULL)
59695b7b453SJohn Marino re_node_set_free (dfa->inveclosures + i);
59795b7b453SJohn Marino if (dfa->edests != NULL)
59895b7b453SJohn Marino re_node_set_free (dfa->edests + i);
59995b7b453SJohn Marino }
60095b7b453SJohn Marino re_free (dfa->edests);
60195b7b453SJohn Marino re_free (dfa->eclosures);
60295b7b453SJohn Marino re_free (dfa->inveclosures);
60395b7b453SJohn Marino re_free (dfa->nodes);
60495b7b453SJohn Marino
60595b7b453SJohn Marino if (dfa->state_table)
60695b7b453SJohn Marino for (i = 0; i <= dfa->state_hash_mask; ++i)
60795b7b453SJohn Marino {
60895b7b453SJohn Marino struct re_state_table_entry *entry = dfa->state_table + i;
60995b7b453SJohn Marino for (j = 0; j < entry->num; ++j)
61095b7b453SJohn Marino {
61195b7b453SJohn Marino re_dfastate_t *state = entry->array[j];
61295b7b453SJohn Marino free_state (state);
61395b7b453SJohn Marino }
61495b7b453SJohn Marino re_free (entry->array);
61595b7b453SJohn Marino }
61695b7b453SJohn Marino re_free (dfa->state_table);
61795b7b453SJohn Marino #ifdef RE_ENABLE_I18N
61895b7b453SJohn Marino if (dfa->sb_char != utf8_sb_map)
61995b7b453SJohn Marino re_free (dfa->sb_char);
62095b7b453SJohn Marino #endif
62195b7b453SJohn Marino re_free (dfa->subexp_map);
62295b7b453SJohn Marino #ifdef DEBUG
62395b7b453SJohn Marino re_free (dfa->re_str);
62495b7b453SJohn Marino #endif
62595b7b453SJohn Marino
62695b7b453SJohn Marino re_free (dfa);
62795b7b453SJohn Marino }
62895b7b453SJohn Marino
62995b7b453SJohn Marino
63095b7b453SJohn Marino /* Free dynamically allocated space used by PREG. */
63195b7b453SJohn Marino
63295b7b453SJohn Marino void
regfree(regex_t * preg)633*09d4459fSDaniel Fojt regfree (regex_t *preg)
63495b7b453SJohn Marino {
635cf28ed85SJohn Marino re_dfa_t *dfa = preg->buffer;
636*09d4459fSDaniel Fojt if (__glibc_likely (dfa != NULL))
637680a9cb8SJohn Marino {
638680a9cb8SJohn Marino lock_fini (dfa->lock);
63995b7b453SJohn Marino free_dfa_content (dfa);
640680a9cb8SJohn Marino }
64195b7b453SJohn Marino preg->buffer = NULL;
64295b7b453SJohn Marino preg->allocated = 0;
64395b7b453SJohn Marino
64495b7b453SJohn Marino re_free (preg->fastmap);
64595b7b453SJohn Marino preg->fastmap = NULL;
64695b7b453SJohn Marino
64795b7b453SJohn Marino re_free (preg->translate);
64895b7b453SJohn Marino preg->translate = NULL;
64995b7b453SJohn Marino }
650*09d4459fSDaniel Fojt libc_hidden_def (__regfree)
65195b7b453SJohn Marino weak_alias (__regfree, regfree)
65295b7b453SJohn Marino
65395b7b453SJohn Marino /* Entry points compatible with 4.2 BSD regex library. We don't define
65495b7b453SJohn Marino them unless specifically requested. */
65595b7b453SJohn Marino
65695b7b453SJohn Marino #if defined _REGEX_RE_COMP || defined _LIBC
65795b7b453SJohn Marino
65895b7b453SJohn Marino /* BSD has one and only one pattern buffer. */
65995b7b453SJohn Marino static struct re_pattern_buffer re_comp_buf;
66095b7b453SJohn Marino
66195b7b453SJohn Marino char *
66295b7b453SJohn Marino # ifdef _LIBC
66395b7b453SJohn Marino /* Make these definitions weak in libc, so POSIX programs can redefine
66495b7b453SJohn Marino these names if they don't use our functions, and still use
66595b7b453SJohn Marino regcomp/regexec above without link errors. */
66695b7b453SJohn Marino weak_function
66795b7b453SJohn Marino # endif
re_comp(const char * s)668*09d4459fSDaniel Fojt re_comp (const char *s)
66995b7b453SJohn Marino {
67095b7b453SJohn Marino reg_errcode_t ret;
67195b7b453SJohn Marino char *fastmap;
67295b7b453SJohn Marino
67395b7b453SJohn Marino if (!s)
67495b7b453SJohn Marino {
67595b7b453SJohn Marino if (!re_comp_buf.buffer)
67695b7b453SJohn Marino return gettext ("No previous regular expression");
67795b7b453SJohn Marino return 0;
67895b7b453SJohn Marino }
67995b7b453SJohn Marino
68095b7b453SJohn Marino if (re_comp_buf.buffer)
68195b7b453SJohn Marino {
68295b7b453SJohn Marino fastmap = re_comp_buf.fastmap;
68395b7b453SJohn Marino re_comp_buf.fastmap = NULL;
68495b7b453SJohn Marino __regfree (&re_comp_buf);
68595b7b453SJohn Marino memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
68695b7b453SJohn Marino re_comp_buf.fastmap = fastmap;
68795b7b453SJohn Marino }
68895b7b453SJohn Marino
68995b7b453SJohn Marino if (re_comp_buf.fastmap == NULL)
69095b7b453SJohn Marino {
691*09d4459fSDaniel Fojt re_comp_buf.fastmap = re_malloc (char, SBC_MAX);
69295b7b453SJohn Marino if (re_comp_buf.fastmap == NULL)
69395b7b453SJohn Marino return (char *) gettext (__re_error_msgid
69495b7b453SJohn Marino + __re_error_msgid_idx[(int) REG_ESPACE]);
69595b7b453SJohn Marino }
69695b7b453SJohn Marino
697cf28ed85SJohn Marino /* Since 're_exec' always passes NULL for the 'regs' argument, we
69895b7b453SJohn Marino don't need to initialize the pattern buffer fields which affect it. */
69995b7b453SJohn Marino
70095b7b453SJohn Marino /* Match anchors at newlines. */
70195b7b453SJohn Marino re_comp_buf.newline_anchor = 1;
70295b7b453SJohn Marino
70395b7b453SJohn Marino ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
70495b7b453SJohn Marino
70595b7b453SJohn Marino if (!ret)
70695b7b453SJohn Marino return NULL;
70795b7b453SJohn Marino
708cf28ed85SJohn Marino /* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */
70995b7b453SJohn Marino return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
71095b7b453SJohn Marino }
71195b7b453SJohn Marino
71295b7b453SJohn Marino #ifdef _LIBC
libc_freeres_fn(free_mem)71395b7b453SJohn Marino libc_freeres_fn (free_mem)
71495b7b453SJohn Marino {
71595b7b453SJohn Marino __regfree (&re_comp_buf);
71695b7b453SJohn Marino }
71795b7b453SJohn Marino #endif
71895b7b453SJohn Marino
71995b7b453SJohn Marino #endif /* _REGEX_RE_COMP */
72095b7b453SJohn Marino
72195b7b453SJohn Marino /* Internal entry point.
72295b7b453SJohn Marino Compile the regular expression PATTERN, whose length is LENGTH.
72395b7b453SJohn Marino SYNTAX indicate regular expression's syntax. */
72495b7b453SJohn Marino
72595b7b453SJohn Marino static reg_errcode_t
re_compile_internal(regex_t * preg,const char * pattern,size_t length,reg_syntax_t syntax)72695b7b453SJohn Marino re_compile_internal (regex_t *preg, const char * pattern, size_t length,
72795b7b453SJohn Marino reg_syntax_t syntax)
72895b7b453SJohn Marino {
72995b7b453SJohn Marino reg_errcode_t err = REG_NOERROR;
73095b7b453SJohn Marino re_dfa_t *dfa;
73195b7b453SJohn Marino re_string_t regexp;
73295b7b453SJohn Marino
73395b7b453SJohn Marino /* Initialize the pattern buffer. */
73495b7b453SJohn Marino preg->fastmap_accurate = 0;
73595b7b453SJohn Marino preg->syntax = syntax;
73695b7b453SJohn Marino preg->not_bol = preg->not_eol = 0;
73795b7b453SJohn Marino preg->used = 0;
73895b7b453SJohn Marino preg->re_nsub = 0;
73995b7b453SJohn Marino preg->can_be_null = 0;
74095b7b453SJohn Marino preg->regs_allocated = REGS_UNALLOCATED;
74195b7b453SJohn Marino
74295b7b453SJohn Marino /* Initialize the dfa. */
743cf28ed85SJohn Marino dfa = preg->buffer;
744*09d4459fSDaniel Fojt if (__glibc_unlikely (preg->allocated < sizeof (re_dfa_t)))
74595b7b453SJohn Marino {
74695b7b453SJohn Marino /* If zero allocated, but buffer is non-null, try to realloc
74795b7b453SJohn Marino enough space. This loses if buffer's address is bogus, but
74895b7b453SJohn Marino that is the user's responsibility. If ->buffer is NULL this
74995b7b453SJohn Marino is a simple allocation. */
75095b7b453SJohn Marino dfa = re_realloc (preg->buffer, re_dfa_t, 1);
75195b7b453SJohn Marino if (dfa == NULL)
75295b7b453SJohn Marino return REG_ESPACE;
75395b7b453SJohn Marino preg->allocated = sizeof (re_dfa_t);
754cf28ed85SJohn Marino preg->buffer = dfa;
75595b7b453SJohn Marino }
75695b7b453SJohn Marino preg->used = sizeof (re_dfa_t);
75795b7b453SJohn Marino
75895b7b453SJohn Marino err = init_dfa (dfa, length);
759*09d4459fSDaniel Fojt if (__glibc_unlikely (err == REG_NOERROR && lock_init (dfa->lock) != 0))
760680a9cb8SJohn Marino err = REG_ESPACE;
761*09d4459fSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
76295b7b453SJohn Marino {
76395b7b453SJohn Marino free_dfa_content (dfa);
76495b7b453SJohn Marino preg->buffer = NULL;
76595b7b453SJohn Marino preg->allocated = 0;
76695b7b453SJohn Marino return err;
76795b7b453SJohn Marino }
76895b7b453SJohn Marino #ifdef DEBUG
76995b7b453SJohn Marino /* Note: length+1 will not overflow since it is checked in init_dfa. */
77095b7b453SJohn Marino dfa->re_str = re_malloc (char, length + 1);
77195b7b453SJohn Marino strncpy (dfa->re_str, pattern, length + 1);
77295b7b453SJohn Marino #endif
77395b7b453SJohn Marino
77495b7b453SJohn Marino err = re_string_construct (®exp, pattern, length, preg->translate,
77595b7b453SJohn Marino (syntax & RE_ICASE) != 0, dfa);
776*09d4459fSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
77795b7b453SJohn Marino {
77895b7b453SJohn Marino re_compile_internal_free_return:
77995b7b453SJohn Marino free_workarea_compile (preg);
78095b7b453SJohn Marino re_string_destruct (®exp);
781680a9cb8SJohn Marino lock_fini (dfa->lock);
78295b7b453SJohn Marino free_dfa_content (dfa);
78395b7b453SJohn Marino preg->buffer = NULL;
78495b7b453SJohn Marino preg->allocated = 0;
78595b7b453SJohn Marino return err;
78695b7b453SJohn Marino }
78795b7b453SJohn Marino
78895b7b453SJohn Marino /* Parse the regular expression, and build a structure tree. */
78995b7b453SJohn Marino preg->re_nsub = 0;
79095b7b453SJohn Marino dfa->str_tree = parse (®exp, preg, syntax, &err);
791*09d4459fSDaniel Fojt if (__glibc_unlikely (dfa->str_tree == NULL))
79295b7b453SJohn Marino goto re_compile_internal_free_return;
79395b7b453SJohn Marino
79495b7b453SJohn Marino /* Analyze the tree and create the nfa. */
79595b7b453SJohn Marino err = analyze (preg);
796*09d4459fSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
79795b7b453SJohn Marino goto re_compile_internal_free_return;
79895b7b453SJohn Marino
79995b7b453SJohn Marino #ifdef RE_ENABLE_I18N
80095b7b453SJohn Marino /* If possible, do searching in single byte encoding to speed things up. */
80195b7b453SJohn Marino if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
80295b7b453SJohn Marino optimize_utf8 (dfa);
80395b7b453SJohn Marino #endif
80495b7b453SJohn Marino
80595b7b453SJohn Marino /* Then create the initial state of the dfa. */
80695b7b453SJohn Marino err = create_initial_state (dfa);
80795b7b453SJohn Marino
80895b7b453SJohn Marino /* Release work areas. */
80995b7b453SJohn Marino free_workarea_compile (preg);
81095b7b453SJohn Marino re_string_destruct (®exp);
81195b7b453SJohn Marino
812*09d4459fSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
81395b7b453SJohn Marino {
814680a9cb8SJohn Marino lock_fini (dfa->lock);
81595b7b453SJohn Marino free_dfa_content (dfa);
81695b7b453SJohn Marino preg->buffer = NULL;
81795b7b453SJohn Marino preg->allocated = 0;
81895b7b453SJohn Marino }
81995b7b453SJohn Marino
82095b7b453SJohn Marino return err;
82195b7b453SJohn Marino }
82295b7b453SJohn Marino
82395b7b453SJohn Marino /* Initialize DFA. We use the length of the regular expression PAT_LEN
82495b7b453SJohn Marino as the initial length of some arrays. */
82595b7b453SJohn Marino
82695b7b453SJohn Marino static reg_errcode_t
init_dfa(re_dfa_t * dfa,size_t pat_len)82795b7b453SJohn Marino init_dfa (re_dfa_t *dfa, size_t pat_len)
82895b7b453SJohn Marino {
82995b7b453SJohn Marino __re_size_t table_size;
83095b7b453SJohn Marino #ifndef _LIBC
831cf28ed85SJohn Marino const char *codeset_name;
83295b7b453SJohn Marino #endif
83395b7b453SJohn Marino #ifdef RE_ENABLE_I18N
83495b7b453SJohn Marino size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
83595b7b453SJohn Marino #else
83695b7b453SJohn Marino size_t max_i18n_object_size = 0;
83795b7b453SJohn Marino #endif
83895b7b453SJohn Marino size_t max_object_size =
83995b7b453SJohn Marino MAX (sizeof (struct re_state_table_entry),
84095b7b453SJohn Marino MAX (sizeof (re_token_t),
84195b7b453SJohn Marino MAX (sizeof (re_node_set),
84295b7b453SJohn Marino MAX (sizeof (regmatch_t),
84395b7b453SJohn Marino max_i18n_object_size))));
84495b7b453SJohn Marino
84595b7b453SJohn Marino memset (dfa, '\0', sizeof (re_dfa_t));
84695b7b453SJohn Marino
84795b7b453SJohn Marino /* Force allocation of str_tree_storage the first time. */
84895b7b453SJohn Marino dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
84995b7b453SJohn Marino
85095b7b453SJohn Marino /* Avoid overflows. The extra "/ 2" is for the table_size doubling
85195b7b453SJohn Marino calculation below, and for similar doubling calculations
85295b7b453SJohn Marino elsewhere. And it's <= rather than <, because some of the
85395b7b453SJohn Marino doubling calculations add 1 afterwards. */
854*09d4459fSDaniel Fojt if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2
855*09d4459fSDaniel Fojt <= pat_len))
85695b7b453SJohn Marino return REG_ESPACE;
85795b7b453SJohn Marino
85895b7b453SJohn Marino dfa->nodes_alloc = pat_len + 1;
85995b7b453SJohn Marino dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
86095b7b453SJohn Marino
86195b7b453SJohn Marino /* table_size = 2 ^ ceil(log pat_len) */
86295b7b453SJohn Marino for (table_size = 1; ; table_size <<= 1)
86395b7b453SJohn Marino if (table_size > pat_len)
86495b7b453SJohn Marino break;
86595b7b453SJohn Marino
86695b7b453SJohn Marino dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
86795b7b453SJohn Marino dfa->state_hash_mask = table_size - 1;
86895b7b453SJohn Marino
86995b7b453SJohn Marino dfa->mb_cur_max = MB_CUR_MAX;
87095b7b453SJohn Marino #ifdef _LIBC
87195b7b453SJohn Marino if (dfa->mb_cur_max == 6
87295b7b453SJohn Marino && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
87395b7b453SJohn Marino dfa->is_utf8 = 1;
87495b7b453SJohn Marino dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
87595b7b453SJohn Marino != 0);
87695b7b453SJohn Marino #else
87795b7b453SJohn Marino codeset_name = nl_langinfo (CODESET);
878a8597f6cSJohn Marino if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
879a8597f6cSJohn Marino && (codeset_name[1] == 'T' || codeset_name[1] == 't')
880a8597f6cSJohn Marino && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
881a8597f6cSJohn Marino && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
88295b7b453SJohn Marino dfa->is_utf8 = 1;
88395b7b453SJohn Marino
88495b7b453SJohn Marino /* We check exhaustively in the loop below if this charset is a
88595b7b453SJohn Marino superset of ASCII. */
88695b7b453SJohn Marino dfa->map_notascii = 0;
88795b7b453SJohn Marino #endif
88895b7b453SJohn Marino
88995b7b453SJohn Marino #ifdef RE_ENABLE_I18N
89095b7b453SJohn Marino if (dfa->mb_cur_max > 1)
89195b7b453SJohn Marino {
89295b7b453SJohn Marino if (dfa->is_utf8)
89395b7b453SJohn Marino dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
89495b7b453SJohn Marino else
89595b7b453SJohn Marino {
89695b7b453SJohn Marino int i, j, ch;
89795b7b453SJohn Marino
89895b7b453SJohn Marino dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
899*09d4459fSDaniel Fojt if (__glibc_unlikely (dfa->sb_char == NULL))
90095b7b453SJohn Marino return REG_ESPACE;
90195b7b453SJohn Marino
90295b7b453SJohn Marino /* Set the bits corresponding to single byte chars. */
90395b7b453SJohn Marino for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
90495b7b453SJohn Marino for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
90595b7b453SJohn Marino {
90695b7b453SJohn Marino wint_t wch = __btowc (ch);
90795b7b453SJohn Marino if (wch != WEOF)
90895b7b453SJohn Marino dfa->sb_char[i] |= (bitset_word_t) 1 << j;
90995b7b453SJohn Marino # ifndef _LIBC
91095b7b453SJohn Marino if (isascii (ch) && wch != ch)
91195b7b453SJohn Marino dfa->map_notascii = 1;
91295b7b453SJohn Marino # endif
91395b7b453SJohn Marino }
91495b7b453SJohn Marino }
91595b7b453SJohn Marino }
91695b7b453SJohn Marino #endif
91795b7b453SJohn Marino
918*09d4459fSDaniel Fojt if (__glibc_unlikely (dfa->nodes == NULL || dfa->state_table == NULL))
91995b7b453SJohn Marino return REG_ESPACE;
92095b7b453SJohn Marino return REG_NOERROR;
92195b7b453SJohn Marino }
92295b7b453SJohn Marino
92395b7b453SJohn Marino /* Initialize WORD_CHAR table, which indicate which character is
92495b7b453SJohn Marino "word". In this case "word" means that it is the word construction
92595b7b453SJohn Marino character used by some operators like "\<", "\>", etc. */
92695b7b453SJohn Marino
92795b7b453SJohn Marino static void
init_word_char(re_dfa_t * dfa)92895b7b453SJohn Marino init_word_char (re_dfa_t *dfa)
92995b7b453SJohn Marino {
930cf28ed85SJohn Marino int i = 0;
931cf28ed85SJohn Marino int j;
932cf28ed85SJohn Marino int ch = 0;
933680a9cb8SJohn Marino dfa->word_ops_used = 1;
934*09d4459fSDaniel Fojt if (__glibc_likely (dfa->map_notascii == 0))
935cf28ed85SJohn Marino {
936*09d4459fSDaniel Fojt /* Avoid uint32_t and uint64_t as some non-GCC platforms lack
937*09d4459fSDaniel Fojt them, an issue when this code is used in Gnulib. */
938a8597f6cSJohn Marino bitset_word_t bits0 = 0x00000000;
939a8597f6cSJohn Marino bitset_word_t bits1 = 0x03ff0000;
940a8597f6cSJohn Marino bitset_word_t bits2 = 0x87fffffe;
941a8597f6cSJohn Marino bitset_word_t bits3 = 0x07fffffe;
942cf28ed85SJohn Marino if (BITSET_WORD_BITS == 64)
943cf28ed85SJohn Marino {
944*09d4459fSDaniel Fojt /* Pacify gcc -Woverflow on 32-bit platformns. */
945a8597f6cSJohn Marino dfa->word_char[0] = bits1 << 31 << 1 | bits0;
946a8597f6cSJohn Marino dfa->word_char[1] = bits3 << 31 << 1 | bits2;
947cf28ed85SJohn Marino i = 2;
948cf28ed85SJohn Marino }
949cf28ed85SJohn Marino else if (BITSET_WORD_BITS == 32)
950cf28ed85SJohn Marino {
951a8597f6cSJohn Marino dfa->word_char[0] = bits0;
952a8597f6cSJohn Marino dfa->word_char[1] = bits1;
953a8597f6cSJohn Marino dfa->word_char[2] = bits2;
954a8597f6cSJohn Marino dfa->word_char[3] = bits3;
955cf28ed85SJohn Marino i = 4;
956cf28ed85SJohn Marino }
957cf28ed85SJohn Marino else
958cf28ed85SJohn Marino goto general_case;
959cf28ed85SJohn Marino ch = 128;
960cf28ed85SJohn Marino
961*09d4459fSDaniel Fojt if (__glibc_likely (dfa->is_utf8))
962cf28ed85SJohn Marino {
963cf28ed85SJohn Marino memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
964cf28ed85SJohn Marino return;
965cf28ed85SJohn Marino }
966cf28ed85SJohn Marino }
967cf28ed85SJohn Marino
968cf28ed85SJohn Marino general_case:
969cf28ed85SJohn Marino for (; i < BITSET_WORDS; ++i)
97095b7b453SJohn Marino for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
97195b7b453SJohn Marino if (isalnum (ch) || ch == '_')
97295b7b453SJohn Marino dfa->word_char[i] |= (bitset_word_t) 1 << j;
97395b7b453SJohn Marino }
97495b7b453SJohn Marino
97595b7b453SJohn Marino /* Free the work area which are only used while compiling. */
97695b7b453SJohn Marino
97795b7b453SJohn Marino static void
free_workarea_compile(regex_t * preg)97895b7b453SJohn Marino free_workarea_compile (regex_t *preg)
97995b7b453SJohn Marino {
980cf28ed85SJohn Marino re_dfa_t *dfa = preg->buffer;
98195b7b453SJohn Marino bin_tree_storage_t *storage, *next;
98295b7b453SJohn Marino for (storage = dfa->str_tree_storage; storage; storage = next)
98395b7b453SJohn Marino {
98495b7b453SJohn Marino next = storage->next;
98595b7b453SJohn Marino re_free (storage);
98695b7b453SJohn Marino }
98795b7b453SJohn Marino dfa->str_tree_storage = NULL;
98895b7b453SJohn Marino dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
98995b7b453SJohn Marino dfa->str_tree = NULL;
99095b7b453SJohn Marino re_free (dfa->org_indices);
99195b7b453SJohn Marino dfa->org_indices = NULL;
99295b7b453SJohn Marino }
99395b7b453SJohn Marino
99495b7b453SJohn Marino /* Create initial states for all contexts. */
99595b7b453SJohn Marino
99695b7b453SJohn Marino static reg_errcode_t
create_initial_state(re_dfa_t * dfa)99795b7b453SJohn Marino create_initial_state (re_dfa_t *dfa)
99895b7b453SJohn Marino {
99995b7b453SJohn Marino Idx first, i;
100095b7b453SJohn Marino reg_errcode_t err;
100195b7b453SJohn Marino re_node_set init_nodes;
100295b7b453SJohn Marino
100395b7b453SJohn Marino /* Initial states have the epsilon closure of the node which is
100495b7b453SJohn Marino the first node of the regular expression. */
100595b7b453SJohn Marino first = dfa->str_tree->first->node_idx;
100695b7b453SJohn Marino dfa->init_node = first;
100795b7b453SJohn Marino err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1008*09d4459fSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
100995b7b453SJohn Marino return err;
101095b7b453SJohn Marino
101195b7b453SJohn Marino /* The back-references which are in initial states can epsilon transit,
101295b7b453SJohn Marino since in this case all of the subexpressions can be null.
101395b7b453SJohn Marino Then we add epsilon closures of the nodes which are the next nodes of
101495b7b453SJohn Marino the back-references. */
101595b7b453SJohn Marino if (dfa->nbackref > 0)
101695b7b453SJohn Marino for (i = 0; i < init_nodes.nelem; ++i)
101795b7b453SJohn Marino {
101895b7b453SJohn Marino Idx node_idx = init_nodes.elems[i];
101995b7b453SJohn Marino re_token_type_t type = dfa->nodes[node_idx].type;
102095b7b453SJohn Marino
102195b7b453SJohn Marino Idx clexp_idx;
102295b7b453SJohn Marino if (type != OP_BACK_REF)
102395b7b453SJohn Marino continue;
102495b7b453SJohn Marino for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
102595b7b453SJohn Marino {
102695b7b453SJohn Marino re_token_t *clexp_node;
102795b7b453SJohn Marino clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
102895b7b453SJohn Marino if (clexp_node->type == OP_CLOSE_SUBEXP
102995b7b453SJohn Marino && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
103095b7b453SJohn Marino break;
103195b7b453SJohn Marino }
103295b7b453SJohn Marino if (clexp_idx == init_nodes.nelem)
103395b7b453SJohn Marino continue;
103495b7b453SJohn Marino
103595b7b453SJohn Marino if (type == OP_BACK_REF)
103695b7b453SJohn Marino {
103795b7b453SJohn Marino Idx dest_idx = dfa->edests[node_idx].elems[0];
103895b7b453SJohn Marino if (!re_node_set_contains (&init_nodes, dest_idx))
103995b7b453SJohn Marino {
104095b7b453SJohn Marino reg_errcode_t merge_err
104195b7b453SJohn Marino = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
104295b7b453SJohn Marino if (merge_err != REG_NOERROR)
104395b7b453SJohn Marino return merge_err;
104495b7b453SJohn Marino i = 0;
104595b7b453SJohn Marino }
104695b7b453SJohn Marino }
104795b7b453SJohn Marino }
104895b7b453SJohn Marino
104995b7b453SJohn Marino /* It must be the first time to invoke acquire_state. */
105095b7b453SJohn Marino dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
105195b7b453SJohn Marino /* We don't check ERR here, since the initial state must not be NULL. */
1052*09d4459fSDaniel Fojt if (__glibc_unlikely (dfa->init_state == NULL))
105395b7b453SJohn Marino return err;
105495b7b453SJohn Marino if (dfa->init_state->has_constraint)
105595b7b453SJohn Marino {
105695b7b453SJohn Marino dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
105795b7b453SJohn Marino CONTEXT_WORD);
105895b7b453SJohn Marino dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
105995b7b453SJohn Marino CONTEXT_NEWLINE);
106095b7b453SJohn Marino dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
106195b7b453SJohn Marino &init_nodes,
106295b7b453SJohn Marino CONTEXT_NEWLINE
106395b7b453SJohn Marino | CONTEXT_BEGBUF);
1064*09d4459fSDaniel Fojt if (__glibc_unlikely (dfa->init_state_word == NULL
1065*09d4459fSDaniel Fojt || dfa->init_state_nl == NULL
1066*09d4459fSDaniel Fojt || dfa->init_state_begbuf == NULL))
106795b7b453SJohn Marino return err;
106895b7b453SJohn Marino }
106995b7b453SJohn Marino else
107095b7b453SJohn Marino dfa->init_state_word = dfa->init_state_nl
107195b7b453SJohn Marino = dfa->init_state_begbuf = dfa->init_state;
107295b7b453SJohn Marino
107395b7b453SJohn Marino re_node_set_free (&init_nodes);
107495b7b453SJohn Marino return REG_NOERROR;
107595b7b453SJohn Marino }
107695b7b453SJohn Marino
107795b7b453SJohn Marino #ifdef RE_ENABLE_I18N
107895b7b453SJohn Marino /* If it is possible to do searching in single byte encoding instead of UTF-8
107995b7b453SJohn Marino to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
108095b7b453SJohn Marino DFA nodes where needed. */
108195b7b453SJohn Marino
108295b7b453SJohn Marino static void
optimize_utf8(re_dfa_t * dfa)108395b7b453SJohn Marino optimize_utf8 (re_dfa_t *dfa)
108495b7b453SJohn Marino {
108595b7b453SJohn Marino Idx node;
108695b7b453SJohn Marino int i;
108795b7b453SJohn Marino bool mb_chars = false;
108895b7b453SJohn Marino bool has_period = false;
108995b7b453SJohn Marino
109095b7b453SJohn Marino for (node = 0; node < dfa->nodes_len; ++node)
109195b7b453SJohn Marino switch (dfa->nodes[node].type)
109295b7b453SJohn Marino {
109395b7b453SJohn Marino case CHARACTER:
109495b7b453SJohn Marino if (dfa->nodes[node].opr.c >= ASCII_CHARS)
109595b7b453SJohn Marino mb_chars = true;
109695b7b453SJohn Marino break;
109795b7b453SJohn Marino case ANCHOR:
109895b7b453SJohn Marino switch (dfa->nodes[node].opr.ctx_type)
109995b7b453SJohn Marino {
110095b7b453SJohn Marino case LINE_FIRST:
110195b7b453SJohn Marino case LINE_LAST:
110295b7b453SJohn Marino case BUF_FIRST:
110395b7b453SJohn Marino case BUF_LAST:
110495b7b453SJohn Marino break;
110595b7b453SJohn Marino default:
110695b7b453SJohn Marino /* Word anchors etc. cannot be handled. It's okay to test
110795b7b453SJohn Marino opr.ctx_type since constraints (for all DFA nodes) are
110895b7b453SJohn Marino created by ORing one or more opr.ctx_type values. */
110995b7b453SJohn Marino return;
111095b7b453SJohn Marino }
111195b7b453SJohn Marino break;
111295b7b453SJohn Marino case OP_PERIOD:
111395b7b453SJohn Marino has_period = true;
111495b7b453SJohn Marino break;
111595b7b453SJohn Marino case OP_BACK_REF:
111695b7b453SJohn Marino case OP_ALT:
111795b7b453SJohn Marino case END_OF_RE:
111895b7b453SJohn Marino case OP_DUP_ASTERISK:
111995b7b453SJohn Marino case OP_OPEN_SUBEXP:
112095b7b453SJohn Marino case OP_CLOSE_SUBEXP:
112195b7b453SJohn Marino break;
112295b7b453SJohn Marino case COMPLEX_BRACKET:
112395b7b453SJohn Marino return;
112495b7b453SJohn Marino case SIMPLE_BRACKET:
112595b7b453SJohn Marino /* Just double check. */
112695b7b453SJohn Marino {
112795b7b453SJohn Marino int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
112895b7b453SJohn Marino ? 0
112995b7b453SJohn Marino : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
113095b7b453SJohn Marino for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
113195b7b453SJohn Marino {
113295b7b453SJohn Marino if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
113395b7b453SJohn Marino return;
113495b7b453SJohn Marino rshift = 0;
113595b7b453SJohn Marino }
113695b7b453SJohn Marino }
113795b7b453SJohn Marino break;
113895b7b453SJohn Marino default:
113995b7b453SJohn Marino abort ();
114095b7b453SJohn Marino }
114195b7b453SJohn Marino
114295b7b453SJohn Marino if (mb_chars || has_period)
114395b7b453SJohn Marino for (node = 0; node < dfa->nodes_len; ++node)
114495b7b453SJohn Marino {
114595b7b453SJohn Marino if (dfa->nodes[node].type == CHARACTER
114695b7b453SJohn Marino && dfa->nodes[node].opr.c >= ASCII_CHARS)
114795b7b453SJohn Marino dfa->nodes[node].mb_partial = 0;
114895b7b453SJohn Marino else if (dfa->nodes[node].type == OP_PERIOD)
114995b7b453SJohn Marino dfa->nodes[node].type = OP_UTF8_PERIOD;
115095b7b453SJohn Marino }
115195b7b453SJohn Marino
115295b7b453SJohn Marino /* The search can be in single byte locale. */
115395b7b453SJohn Marino dfa->mb_cur_max = 1;
115495b7b453SJohn Marino dfa->is_utf8 = 0;
115595b7b453SJohn Marino dfa->has_mb_node = dfa->nbackref > 0 || has_period;
115695b7b453SJohn Marino }
115795b7b453SJohn Marino #endif
115895b7b453SJohn Marino
115995b7b453SJohn Marino /* Analyze the structure tree, and calculate "first", "next", "edest",
116095b7b453SJohn Marino "eclosure", and "inveclosure". */
116195b7b453SJohn Marino
116295b7b453SJohn Marino static reg_errcode_t
analyze(regex_t * preg)116395b7b453SJohn Marino analyze (regex_t *preg)
116495b7b453SJohn Marino {
1165cf28ed85SJohn Marino re_dfa_t *dfa = preg->buffer;
116695b7b453SJohn Marino reg_errcode_t ret;
116795b7b453SJohn Marino
116895b7b453SJohn Marino /* Allocate arrays. */
116995b7b453SJohn Marino dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
117095b7b453SJohn Marino dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
117195b7b453SJohn Marino dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
117295b7b453SJohn Marino dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1173*09d4459fSDaniel Fojt if (__glibc_unlikely (dfa->nexts == NULL || dfa->org_indices == NULL
1174*09d4459fSDaniel Fojt || dfa->edests == NULL || dfa->eclosures == NULL))
117595b7b453SJohn Marino return REG_ESPACE;
117695b7b453SJohn Marino
117795b7b453SJohn Marino dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
117895b7b453SJohn Marino if (dfa->subexp_map != NULL)
117995b7b453SJohn Marino {
118095b7b453SJohn Marino Idx i;
118195b7b453SJohn Marino for (i = 0; i < preg->re_nsub; i++)
118295b7b453SJohn Marino dfa->subexp_map[i] = i;
118395b7b453SJohn Marino preorder (dfa->str_tree, optimize_subexps, dfa);
118495b7b453SJohn Marino for (i = 0; i < preg->re_nsub; i++)
118595b7b453SJohn Marino if (dfa->subexp_map[i] != i)
118695b7b453SJohn Marino break;
118795b7b453SJohn Marino if (i == preg->re_nsub)
118895b7b453SJohn Marino {
1189*09d4459fSDaniel Fojt re_free (dfa->subexp_map);
119095b7b453SJohn Marino dfa->subexp_map = NULL;
119195b7b453SJohn Marino }
119295b7b453SJohn Marino }
119395b7b453SJohn Marino
119495b7b453SJohn Marino ret = postorder (dfa->str_tree, lower_subexps, preg);
1195*09d4459fSDaniel Fojt if (__glibc_unlikely (ret != REG_NOERROR))
119695b7b453SJohn Marino return ret;
119795b7b453SJohn Marino ret = postorder (dfa->str_tree, calc_first, dfa);
1198*09d4459fSDaniel Fojt if (__glibc_unlikely (ret != REG_NOERROR))
119995b7b453SJohn Marino return ret;
120095b7b453SJohn Marino preorder (dfa->str_tree, calc_next, dfa);
120195b7b453SJohn Marino ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1202*09d4459fSDaniel Fojt if (__glibc_unlikely (ret != REG_NOERROR))
120395b7b453SJohn Marino return ret;
120495b7b453SJohn Marino ret = calc_eclosure (dfa);
1205*09d4459fSDaniel Fojt if (__glibc_unlikely (ret != REG_NOERROR))
120695b7b453SJohn Marino return ret;
120795b7b453SJohn Marino
120895b7b453SJohn Marino /* We only need this during the prune_impossible_nodes pass in regexec.c;
120995b7b453SJohn Marino skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
121095b7b453SJohn Marino if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
121195b7b453SJohn Marino || dfa->nbackref)
121295b7b453SJohn Marino {
121395b7b453SJohn Marino dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1214*09d4459fSDaniel Fojt if (__glibc_unlikely (dfa->inveclosures == NULL))
121595b7b453SJohn Marino return REG_ESPACE;
121695b7b453SJohn Marino ret = calc_inveclosure (dfa);
121795b7b453SJohn Marino }
121895b7b453SJohn Marino
121995b7b453SJohn Marino return ret;
122095b7b453SJohn Marino }
122195b7b453SJohn Marino
122295b7b453SJohn Marino /* Our parse trees are very unbalanced, so we cannot use a stack to
122395b7b453SJohn Marino implement parse tree visits. Instead, we use parent pointers and
122495b7b453SJohn Marino some hairy code in these two functions. */
122595b7b453SJohn Marino static reg_errcode_t
postorder(bin_tree_t * root,reg_errcode_t (fn (void *,bin_tree_t *)),void * extra)122695b7b453SJohn Marino postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
122795b7b453SJohn Marino void *extra)
122895b7b453SJohn Marino {
122995b7b453SJohn Marino bin_tree_t *node, *prev;
123095b7b453SJohn Marino
123195b7b453SJohn Marino for (node = root; ; )
123295b7b453SJohn Marino {
123395b7b453SJohn Marino /* Descend down the tree, preferably to the left (or to the right
123495b7b453SJohn Marino if that's the only child). */
123595b7b453SJohn Marino while (node->left || node->right)
123695b7b453SJohn Marino if (node->left)
123795b7b453SJohn Marino node = node->left;
123895b7b453SJohn Marino else
123995b7b453SJohn Marino node = node->right;
124095b7b453SJohn Marino
124195b7b453SJohn Marino do
124295b7b453SJohn Marino {
124395b7b453SJohn Marino reg_errcode_t err = fn (extra, node);
1244*09d4459fSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
124595b7b453SJohn Marino return err;
124695b7b453SJohn Marino if (node->parent == NULL)
124795b7b453SJohn Marino return REG_NOERROR;
124895b7b453SJohn Marino prev = node;
124995b7b453SJohn Marino node = node->parent;
125095b7b453SJohn Marino }
125195b7b453SJohn Marino /* Go up while we have a node that is reached from the right. */
125295b7b453SJohn Marino while (node->right == prev || node->right == NULL);
125395b7b453SJohn Marino node = node->right;
125495b7b453SJohn Marino }
125595b7b453SJohn Marino }
125695b7b453SJohn Marino
125795b7b453SJohn Marino static reg_errcode_t
preorder(bin_tree_t * root,reg_errcode_t (fn (void *,bin_tree_t *)),void * extra)125895b7b453SJohn Marino preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
125995b7b453SJohn Marino void *extra)
126095b7b453SJohn Marino {
126195b7b453SJohn Marino bin_tree_t *node;
126295b7b453SJohn Marino
126395b7b453SJohn Marino for (node = root; ; )
126495b7b453SJohn Marino {
126595b7b453SJohn Marino reg_errcode_t err = fn (extra, node);
1266*09d4459fSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
126795b7b453SJohn Marino return err;
126895b7b453SJohn Marino
126995b7b453SJohn Marino /* Go to the left node, or up and to the right. */
127095b7b453SJohn Marino if (node->left)
127195b7b453SJohn Marino node = node->left;
127295b7b453SJohn Marino else
127395b7b453SJohn Marino {
127495b7b453SJohn Marino bin_tree_t *prev = NULL;
127595b7b453SJohn Marino while (node->right == prev || node->right == NULL)
127695b7b453SJohn Marino {
127795b7b453SJohn Marino prev = node;
127895b7b453SJohn Marino node = node->parent;
127995b7b453SJohn Marino if (!node)
128095b7b453SJohn Marino return REG_NOERROR;
128195b7b453SJohn Marino }
128295b7b453SJohn Marino node = node->right;
128395b7b453SJohn Marino }
128495b7b453SJohn Marino }
128595b7b453SJohn Marino }
128695b7b453SJohn Marino
128795b7b453SJohn Marino /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
128895b7b453SJohn Marino re_search_internal to map the inner one's opr.idx to this one's. Adjust
128995b7b453SJohn Marino backreferences as well. Requires a preorder visit. */
129095b7b453SJohn Marino static reg_errcode_t
optimize_subexps(void * extra,bin_tree_t * node)129195b7b453SJohn Marino optimize_subexps (void *extra, bin_tree_t *node)
129295b7b453SJohn Marino {
129395b7b453SJohn Marino re_dfa_t *dfa = (re_dfa_t *) extra;
129495b7b453SJohn Marino
129595b7b453SJohn Marino if (node->token.type == OP_BACK_REF && dfa->subexp_map)
129695b7b453SJohn Marino {
129795b7b453SJohn Marino int idx = node->token.opr.idx;
129895b7b453SJohn Marino node->token.opr.idx = dfa->subexp_map[idx];
129995b7b453SJohn Marino dfa->used_bkref_map |= 1 << node->token.opr.idx;
130095b7b453SJohn Marino }
130195b7b453SJohn Marino
130295b7b453SJohn Marino else if (node->token.type == SUBEXP
130395b7b453SJohn Marino && node->left && node->left->token.type == SUBEXP)
130495b7b453SJohn Marino {
130595b7b453SJohn Marino Idx other_idx = node->left->token.opr.idx;
130695b7b453SJohn Marino
130795b7b453SJohn Marino node->left = node->left->left;
130895b7b453SJohn Marino if (node->left)
130995b7b453SJohn Marino node->left->parent = node;
131095b7b453SJohn Marino
131195b7b453SJohn Marino dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
131295b7b453SJohn Marino if (other_idx < BITSET_WORD_BITS)
131395b7b453SJohn Marino dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
131495b7b453SJohn Marino }
131595b7b453SJohn Marino
131695b7b453SJohn Marino return REG_NOERROR;
131795b7b453SJohn Marino }
131895b7b453SJohn Marino
131995b7b453SJohn Marino /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
132095b7b453SJohn Marino of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
132195b7b453SJohn Marino static reg_errcode_t
lower_subexps(void * extra,bin_tree_t * node)132295b7b453SJohn Marino lower_subexps (void *extra, bin_tree_t *node)
132395b7b453SJohn Marino {
132495b7b453SJohn Marino regex_t *preg = (regex_t *) extra;
132595b7b453SJohn Marino reg_errcode_t err = REG_NOERROR;
132695b7b453SJohn Marino
132795b7b453SJohn Marino if (node->left && node->left->token.type == SUBEXP)
132895b7b453SJohn Marino {
132995b7b453SJohn Marino node->left = lower_subexp (&err, preg, node->left);
133095b7b453SJohn Marino if (node->left)
133195b7b453SJohn Marino node->left->parent = node;
133295b7b453SJohn Marino }
133395b7b453SJohn Marino if (node->right && node->right->token.type == SUBEXP)
133495b7b453SJohn Marino {
133595b7b453SJohn Marino node->right = lower_subexp (&err, preg, node->right);
133695b7b453SJohn Marino if (node->right)
133795b7b453SJohn Marino node->right->parent = node;
133895b7b453SJohn Marino }
133995b7b453SJohn Marino
134095b7b453SJohn Marino return err;
134195b7b453SJohn Marino }
134295b7b453SJohn Marino
134395b7b453SJohn Marino static bin_tree_t *
lower_subexp(reg_errcode_t * err,regex_t * preg,bin_tree_t * node)134495b7b453SJohn Marino lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
134595b7b453SJohn Marino {
1346cf28ed85SJohn Marino re_dfa_t *dfa = preg->buffer;
134795b7b453SJohn Marino bin_tree_t *body = node->left;
134895b7b453SJohn Marino bin_tree_t *op, *cls, *tree1, *tree;
134995b7b453SJohn Marino
135095b7b453SJohn Marino if (preg->no_sub
135195b7b453SJohn Marino /* We do not optimize empty subexpressions, because otherwise we may
135295b7b453SJohn Marino have bad CONCAT nodes with NULL children. This is obviously not
135395b7b453SJohn Marino very common, so we do not lose much. An example that triggers
135495b7b453SJohn Marino this case is the sed "script" /\(\)/x. */
135595b7b453SJohn Marino && node->left != NULL
135695b7b453SJohn Marino && (node->token.opr.idx >= BITSET_WORD_BITS
135795b7b453SJohn Marino || !(dfa->used_bkref_map
135895b7b453SJohn Marino & ((bitset_word_t) 1 << node->token.opr.idx))))
135995b7b453SJohn Marino return node->left;
136095b7b453SJohn Marino
136195b7b453SJohn Marino /* Convert the SUBEXP node to the concatenation of an
136295b7b453SJohn Marino OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
136395b7b453SJohn Marino op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
136495b7b453SJohn Marino cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
136595b7b453SJohn Marino tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
136695b7b453SJohn Marino tree = create_tree (dfa, op, tree1, CONCAT);
1367*09d4459fSDaniel Fojt if (__glibc_unlikely (tree == NULL || tree1 == NULL
1368*09d4459fSDaniel Fojt || op == NULL || cls == NULL))
136995b7b453SJohn Marino {
137095b7b453SJohn Marino *err = REG_ESPACE;
137195b7b453SJohn Marino return NULL;
137295b7b453SJohn Marino }
137395b7b453SJohn Marino
137495b7b453SJohn Marino op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
137595b7b453SJohn Marino op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
137695b7b453SJohn Marino return tree;
137795b7b453SJohn Marino }
137895b7b453SJohn Marino
137995b7b453SJohn Marino /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
138095b7b453SJohn Marino nodes. Requires a postorder visit. */
138195b7b453SJohn Marino static reg_errcode_t
calc_first(void * extra,bin_tree_t * node)138295b7b453SJohn Marino calc_first (void *extra, bin_tree_t *node)
138395b7b453SJohn Marino {
138495b7b453SJohn Marino re_dfa_t *dfa = (re_dfa_t *) extra;
138595b7b453SJohn Marino if (node->token.type == CONCAT)
138695b7b453SJohn Marino {
138795b7b453SJohn Marino node->first = node->left->first;
138895b7b453SJohn Marino node->node_idx = node->left->node_idx;
138995b7b453SJohn Marino }
139095b7b453SJohn Marino else
139195b7b453SJohn Marino {
139295b7b453SJohn Marino node->first = node;
139395b7b453SJohn Marino node->node_idx = re_dfa_add_node (dfa, node->token);
1394*09d4459fSDaniel Fojt if (__glibc_unlikely (node->node_idx == -1))
139595b7b453SJohn Marino return REG_ESPACE;
139695b7b453SJohn Marino if (node->token.type == ANCHOR)
139795b7b453SJohn Marino dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
139895b7b453SJohn Marino }
139995b7b453SJohn Marino return REG_NOERROR;
140095b7b453SJohn Marino }
140195b7b453SJohn Marino
140295b7b453SJohn Marino /* Pass 2: compute NEXT on the tree. Preorder visit. */
140395b7b453SJohn Marino static reg_errcode_t
calc_next(void * extra,bin_tree_t * node)1404680a9cb8SJohn Marino calc_next (void *extra, bin_tree_t *node)
140595b7b453SJohn Marino {
140695b7b453SJohn Marino switch (node->token.type)
140795b7b453SJohn Marino {
140895b7b453SJohn Marino case OP_DUP_ASTERISK:
140995b7b453SJohn Marino node->left->next = node;
141095b7b453SJohn Marino break;
141195b7b453SJohn Marino case CONCAT:
141295b7b453SJohn Marino node->left->next = node->right->first;
141395b7b453SJohn Marino node->right->next = node->next;
141495b7b453SJohn Marino break;
141595b7b453SJohn Marino default:
141695b7b453SJohn Marino if (node->left)
141795b7b453SJohn Marino node->left->next = node->next;
141895b7b453SJohn Marino if (node->right)
141995b7b453SJohn Marino node->right->next = node->next;
142095b7b453SJohn Marino break;
142195b7b453SJohn Marino }
142295b7b453SJohn Marino return REG_NOERROR;
142395b7b453SJohn Marino }
142495b7b453SJohn Marino
142595b7b453SJohn Marino /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
142695b7b453SJohn Marino static reg_errcode_t
link_nfa_nodes(void * extra,bin_tree_t * node)142795b7b453SJohn Marino link_nfa_nodes (void *extra, bin_tree_t *node)
142895b7b453SJohn Marino {
142995b7b453SJohn Marino re_dfa_t *dfa = (re_dfa_t *) extra;
143095b7b453SJohn Marino Idx idx = node->node_idx;
143195b7b453SJohn Marino reg_errcode_t err = REG_NOERROR;
143295b7b453SJohn Marino
143395b7b453SJohn Marino switch (node->token.type)
143495b7b453SJohn Marino {
143595b7b453SJohn Marino case CONCAT:
143695b7b453SJohn Marino break;
143795b7b453SJohn Marino
143895b7b453SJohn Marino case END_OF_RE:
1439*09d4459fSDaniel Fojt DEBUG_ASSERT (node->next == NULL);
144095b7b453SJohn Marino break;
144195b7b453SJohn Marino
144295b7b453SJohn Marino case OP_DUP_ASTERISK:
144395b7b453SJohn Marino case OP_ALT:
144495b7b453SJohn Marino {
144595b7b453SJohn Marino Idx left, right;
144695b7b453SJohn Marino dfa->has_plural_match = 1;
144795b7b453SJohn Marino if (node->left != NULL)
144895b7b453SJohn Marino left = node->left->first->node_idx;
144995b7b453SJohn Marino else
145095b7b453SJohn Marino left = node->next->node_idx;
145195b7b453SJohn Marino if (node->right != NULL)
145295b7b453SJohn Marino right = node->right->first->node_idx;
145395b7b453SJohn Marino else
145495b7b453SJohn Marino right = node->next->node_idx;
1455*09d4459fSDaniel Fojt DEBUG_ASSERT (left > -1);
1456*09d4459fSDaniel Fojt DEBUG_ASSERT (right > -1);
145795b7b453SJohn Marino err = re_node_set_init_2 (dfa->edests + idx, left, right);
145895b7b453SJohn Marino }
145995b7b453SJohn Marino break;
146095b7b453SJohn Marino
146195b7b453SJohn Marino case ANCHOR:
146295b7b453SJohn Marino case OP_OPEN_SUBEXP:
146395b7b453SJohn Marino case OP_CLOSE_SUBEXP:
146495b7b453SJohn Marino err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
146595b7b453SJohn Marino break;
146695b7b453SJohn Marino
146795b7b453SJohn Marino case OP_BACK_REF:
146895b7b453SJohn Marino dfa->nexts[idx] = node->next->node_idx;
146995b7b453SJohn Marino if (node->token.type == OP_BACK_REF)
147095b7b453SJohn Marino err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
147195b7b453SJohn Marino break;
147295b7b453SJohn Marino
147395b7b453SJohn Marino default:
1474*09d4459fSDaniel Fojt DEBUG_ASSERT (!IS_EPSILON_NODE (node->token.type));
147595b7b453SJohn Marino dfa->nexts[idx] = node->next->node_idx;
147695b7b453SJohn Marino break;
147795b7b453SJohn Marino }
147895b7b453SJohn Marino
147995b7b453SJohn Marino return err;
148095b7b453SJohn Marino }
148195b7b453SJohn Marino
148295b7b453SJohn Marino /* Duplicate the epsilon closure of the node ROOT_NODE.
148395b7b453SJohn Marino Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
148495b7b453SJohn Marino to their own constraint. */
148595b7b453SJohn Marino
148695b7b453SJohn Marino static reg_errcode_t
duplicate_node_closure(re_dfa_t * dfa,Idx top_org_node,Idx top_clone_node,Idx root_node,unsigned int init_constraint)148795b7b453SJohn Marino duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
148895b7b453SJohn Marino Idx root_node, unsigned int init_constraint)
148995b7b453SJohn Marino {
149095b7b453SJohn Marino Idx org_node, clone_node;
149195b7b453SJohn Marino bool ok;
149295b7b453SJohn Marino unsigned int constraint = init_constraint;
149395b7b453SJohn Marino for (org_node = top_org_node, clone_node = top_clone_node;;)
149495b7b453SJohn Marino {
149595b7b453SJohn Marino Idx org_dest, clone_dest;
149695b7b453SJohn Marino if (dfa->nodes[org_node].type == OP_BACK_REF)
149795b7b453SJohn Marino {
149895b7b453SJohn Marino /* If the back reference epsilon-transit, its destination must
149995b7b453SJohn Marino also have the constraint. Then duplicate the epsilon closure
150095b7b453SJohn Marino of the destination of the back reference, and store it in
150195b7b453SJohn Marino edests of the back reference. */
150295b7b453SJohn Marino org_dest = dfa->nexts[org_node];
150395b7b453SJohn Marino re_node_set_empty (dfa->edests + clone_node);
150495b7b453SJohn Marino clone_dest = duplicate_node (dfa, org_dest, constraint);
1505*09d4459fSDaniel Fojt if (__glibc_unlikely (clone_dest == -1))
150695b7b453SJohn Marino return REG_ESPACE;
150795b7b453SJohn Marino dfa->nexts[clone_node] = dfa->nexts[org_node];
150895b7b453SJohn Marino ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1509*09d4459fSDaniel Fojt if (__glibc_unlikely (! ok))
151095b7b453SJohn Marino return REG_ESPACE;
151195b7b453SJohn Marino }
151295b7b453SJohn Marino else if (dfa->edests[org_node].nelem == 0)
151395b7b453SJohn Marino {
151495b7b453SJohn Marino /* In case of the node can't epsilon-transit, don't duplicate the
151595b7b453SJohn Marino destination and store the original destination as the
151695b7b453SJohn Marino destination of the node. */
151795b7b453SJohn Marino dfa->nexts[clone_node] = dfa->nexts[org_node];
151895b7b453SJohn Marino break;
151995b7b453SJohn Marino }
152095b7b453SJohn Marino else if (dfa->edests[org_node].nelem == 1)
152195b7b453SJohn Marino {
152295b7b453SJohn Marino /* In case of the node can epsilon-transit, and it has only one
152395b7b453SJohn Marino destination. */
152495b7b453SJohn Marino org_dest = dfa->edests[org_node].elems[0];
152595b7b453SJohn Marino re_node_set_empty (dfa->edests + clone_node);
152695b7b453SJohn Marino /* If the node is root_node itself, it means the epsilon closure
152795b7b453SJohn Marino has a loop. Then tie it to the destination of the root_node. */
152895b7b453SJohn Marino if (org_node == root_node && clone_node != org_node)
152995b7b453SJohn Marino {
153095b7b453SJohn Marino ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1531*09d4459fSDaniel Fojt if (__glibc_unlikely (! ok))
153295b7b453SJohn Marino return REG_ESPACE;
153395b7b453SJohn Marino break;
153495b7b453SJohn Marino }
153595b7b453SJohn Marino /* In case the node has another constraint, append it. */
153695b7b453SJohn Marino constraint |= dfa->nodes[org_node].constraint;
153795b7b453SJohn Marino clone_dest = duplicate_node (dfa, org_dest, constraint);
1538*09d4459fSDaniel Fojt if (__glibc_unlikely (clone_dest == -1))
153995b7b453SJohn Marino return REG_ESPACE;
154095b7b453SJohn Marino ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1541*09d4459fSDaniel Fojt if (__glibc_unlikely (! ok))
154295b7b453SJohn Marino return REG_ESPACE;
154395b7b453SJohn Marino }
154495b7b453SJohn Marino else /* dfa->edests[org_node].nelem == 2 */
154595b7b453SJohn Marino {
154695b7b453SJohn Marino /* In case of the node can epsilon-transit, and it has two
154795b7b453SJohn Marino destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
154895b7b453SJohn Marino org_dest = dfa->edests[org_node].elems[0];
154995b7b453SJohn Marino re_node_set_empty (dfa->edests + clone_node);
155095b7b453SJohn Marino /* Search for a duplicated node which satisfies the constraint. */
155195b7b453SJohn Marino clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1552*09d4459fSDaniel Fojt if (clone_dest == -1)
155395b7b453SJohn Marino {
155495b7b453SJohn Marino /* There is no such duplicated node, create a new one. */
155595b7b453SJohn Marino reg_errcode_t err;
155695b7b453SJohn Marino clone_dest = duplicate_node (dfa, org_dest, constraint);
1557*09d4459fSDaniel Fojt if (__glibc_unlikely (clone_dest == -1))
155895b7b453SJohn Marino return REG_ESPACE;
155995b7b453SJohn Marino ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1560*09d4459fSDaniel Fojt if (__glibc_unlikely (! ok))
156195b7b453SJohn Marino return REG_ESPACE;
156295b7b453SJohn Marino err = duplicate_node_closure (dfa, org_dest, clone_dest,
156395b7b453SJohn Marino root_node, constraint);
1564*09d4459fSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
156595b7b453SJohn Marino return err;
156695b7b453SJohn Marino }
156795b7b453SJohn Marino else
156895b7b453SJohn Marino {
156995b7b453SJohn Marino /* There is a duplicated node which satisfies the constraint,
157095b7b453SJohn Marino use it to avoid infinite loop. */
157195b7b453SJohn Marino ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1572*09d4459fSDaniel Fojt if (__glibc_unlikely (! ok))
157395b7b453SJohn Marino return REG_ESPACE;
157495b7b453SJohn Marino }
157595b7b453SJohn Marino
157695b7b453SJohn Marino org_dest = dfa->edests[org_node].elems[1];
157795b7b453SJohn Marino clone_dest = duplicate_node (dfa, org_dest, constraint);
1578*09d4459fSDaniel Fojt if (__glibc_unlikely (clone_dest == -1))
157995b7b453SJohn Marino return REG_ESPACE;
158095b7b453SJohn Marino ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1581*09d4459fSDaniel Fojt if (__glibc_unlikely (! ok))
158295b7b453SJohn Marino return REG_ESPACE;
158395b7b453SJohn Marino }
158495b7b453SJohn Marino org_node = org_dest;
158595b7b453SJohn Marino clone_node = clone_dest;
158695b7b453SJohn Marino }
158795b7b453SJohn Marino return REG_NOERROR;
158895b7b453SJohn Marino }
158995b7b453SJohn Marino
159095b7b453SJohn Marino /* Search for a node which is duplicated from the node ORG_NODE, and
159195b7b453SJohn Marino satisfies the constraint CONSTRAINT. */
159295b7b453SJohn Marino
159395b7b453SJohn Marino static Idx
search_duplicated_node(const re_dfa_t * dfa,Idx org_node,unsigned int constraint)159495b7b453SJohn Marino search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
159595b7b453SJohn Marino unsigned int constraint)
159695b7b453SJohn Marino {
159795b7b453SJohn Marino Idx idx;
159895b7b453SJohn Marino for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
159995b7b453SJohn Marino {
160095b7b453SJohn Marino if (org_node == dfa->org_indices[idx]
160195b7b453SJohn Marino && constraint == dfa->nodes[idx].constraint)
160295b7b453SJohn Marino return idx; /* Found. */
160395b7b453SJohn Marino }
1604*09d4459fSDaniel Fojt return -1; /* Not found. */
160595b7b453SJohn Marino }
160695b7b453SJohn Marino
160795b7b453SJohn Marino /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1608*09d4459fSDaniel Fojt Return the index of the new node, or -1 if insufficient storage is
160995b7b453SJohn Marino available. */
161095b7b453SJohn Marino
161195b7b453SJohn Marino static Idx
duplicate_node(re_dfa_t * dfa,Idx org_idx,unsigned int constraint)161295b7b453SJohn Marino duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
161395b7b453SJohn Marino {
161495b7b453SJohn Marino Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1615*09d4459fSDaniel Fojt if (__glibc_likely (dup_idx != -1))
161695b7b453SJohn Marino {
161795b7b453SJohn Marino dfa->nodes[dup_idx].constraint = constraint;
161895b7b453SJohn Marino dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
161995b7b453SJohn Marino dfa->nodes[dup_idx].duplicated = 1;
162095b7b453SJohn Marino
162195b7b453SJohn Marino /* Store the index of the original node. */
162295b7b453SJohn Marino dfa->org_indices[dup_idx] = org_idx;
162395b7b453SJohn Marino }
162495b7b453SJohn Marino return dup_idx;
162595b7b453SJohn Marino }
162695b7b453SJohn Marino
162795b7b453SJohn Marino static reg_errcode_t
calc_inveclosure(re_dfa_t * dfa)162895b7b453SJohn Marino calc_inveclosure (re_dfa_t *dfa)
162995b7b453SJohn Marino {
163095b7b453SJohn Marino Idx src, idx;
163195b7b453SJohn Marino bool ok;
163295b7b453SJohn Marino for (idx = 0; idx < dfa->nodes_len; ++idx)
163395b7b453SJohn Marino re_node_set_init_empty (dfa->inveclosures + idx);
163495b7b453SJohn Marino
163595b7b453SJohn Marino for (src = 0; src < dfa->nodes_len; ++src)
163695b7b453SJohn Marino {
163795b7b453SJohn Marino Idx *elems = dfa->eclosures[src].elems;
163895b7b453SJohn Marino for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
163995b7b453SJohn Marino {
164095b7b453SJohn Marino ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1641*09d4459fSDaniel Fojt if (__glibc_unlikely (! ok))
164295b7b453SJohn Marino return REG_ESPACE;
164395b7b453SJohn Marino }
164495b7b453SJohn Marino }
164595b7b453SJohn Marino
164695b7b453SJohn Marino return REG_NOERROR;
164795b7b453SJohn Marino }
164895b7b453SJohn Marino
164995b7b453SJohn Marino /* Calculate "eclosure" for all the node in DFA. */
165095b7b453SJohn Marino
165195b7b453SJohn Marino static reg_errcode_t
calc_eclosure(re_dfa_t * dfa)165295b7b453SJohn Marino calc_eclosure (re_dfa_t *dfa)
165395b7b453SJohn Marino {
165495b7b453SJohn Marino Idx node_idx;
165595b7b453SJohn Marino bool incomplete;
1656*09d4459fSDaniel Fojt DEBUG_ASSERT (dfa->nodes_len > 0);
165795b7b453SJohn Marino incomplete = false;
165895b7b453SJohn Marino /* For each nodes, calculate epsilon closure. */
165995b7b453SJohn Marino for (node_idx = 0; ; ++node_idx)
166095b7b453SJohn Marino {
166195b7b453SJohn Marino reg_errcode_t err;
166295b7b453SJohn Marino re_node_set eclosure_elem;
166395b7b453SJohn Marino if (node_idx == dfa->nodes_len)
166495b7b453SJohn Marino {
166595b7b453SJohn Marino if (!incomplete)
166695b7b453SJohn Marino break;
166795b7b453SJohn Marino incomplete = false;
166895b7b453SJohn Marino node_idx = 0;
166995b7b453SJohn Marino }
167095b7b453SJohn Marino
1671*09d4459fSDaniel Fojt DEBUG_ASSERT (dfa->eclosures[node_idx].nelem != -1);
167295b7b453SJohn Marino
167395b7b453SJohn Marino /* If we have already calculated, skip it. */
167495b7b453SJohn Marino if (dfa->eclosures[node_idx].nelem != 0)
167595b7b453SJohn Marino continue;
1676cf28ed85SJohn Marino /* Calculate epsilon closure of 'node_idx'. */
167795b7b453SJohn Marino err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1678*09d4459fSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
167995b7b453SJohn Marino return err;
168095b7b453SJohn Marino
168195b7b453SJohn Marino if (dfa->eclosures[node_idx].nelem == 0)
168295b7b453SJohn Marino {
168395b7b453SJohn Marino incomplete = true;
168495b7b453SJohn Marino re_node_set_free (&eclosure_elem);
168595b7b453SJohn Marino }
168695b7b453SJohn Marino }
168795b7b453SJohn Marino return REG_NOERROR;
168895b7b453SJohn Marino }
168995b7b453SJohn Marino
169095b7b453SJohn Marino /* Calculate epsilon closure of NODE. */
169195b7b453SJohn Marino
169295b7b453SJohn Marino static reg_errcode_t
calc_eclosure_iter(re_node_set * new_set,re_dfa_t * dfa,Idx node,bool root)169395b7b453SJohn Marino calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
169495b7b453SJohn Marino {
169595b7b453SJohn Marino reg_errcode_t err;
169695b7b453SJohn Marino Idx i;
169795b7b453SJohn Marino re_node_set eclosure;
169895b7b453SJohn Marino bool ok;
169995b7b453SJohn Marino bool incomplete = false;
170095b7b453SJohn Marino err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1701*09d4459fSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
170295b7b453SJohn Marino return err;
170395b7b453SJohn Marino
170495b7b453SJohn Marino /* This indicates that we are calculating this node now.
170595b7b453SJohn Marino We reference this value to avoid infinite loop. */
1706*09d4459fSDaniel Fojt dfa->eclosures[node].nelem = -1;
170795b7b453SJohn Marino
170895b7b453SJohn Marino /* If the current node has constraints, duplicate all nodes
170995b7b453SJohn Marino since they must inherit the constraints. */
171095b7b453SJohn Marino if (dfa->nodes[node].constraint
171195b7b453SJohn Marino && dfa->edests[node].nelem
171295b7b453SJohn Marino && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
171395b7b453SJohn Marino {
171495b7b453SJohn Marino err = duplicate_node_closure (dfa, node, node, node,
171595b7b453SJohn Marino dfa->nodes[node].constraint);
1716*09d4459fSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
171795b7b453SJohn Marino return err;
171895b7b453SJohn Marino }
171995b7b453SJohn Marino
172095b7b453SJohn Marino /* Expand each epsilon destination nodes. */
172195b7b453SJohn Marino if (IS_EPSILON_NODE(dfa->nodes[node].type))
172295b7b453SJohn Marino for (i = 0; i < dfa->edests[node].nelem; ++i)
172395b7b453SJohn Marino {
172495b7b453SJohn Marino re_node_set eclosure_elem;
172595b7b453SJohn Marino Idx edest = dfa->edests[node].elems[i];
1726cf28ed85SJohn Marino /* If calculating the epsilon closure of 'edest' is in progress,
172795b7b453SJohn Marino return intermediate result. */
1728*09d4459fSDaniel Fojt if (dfa->eclosures[edest].nelem == -1)
172995b7b453SJohn Marino {
173095b7b453SJohn Marino incomplete = true;
173195b7b453SJohn Marino continue;
173295b7b453SJohn Marino }
1733cf28ed85SJohn Marino /* If we haven't calculated the epsilon closure of 'edest' yet,
173495b7b453SJohn Marino calculate now. Otherwise use calculated epsilon closure. */
173595b7b453SJohn Marino if (dfa->eclosures[edest].nelem == 0)
173695b7b453SJohn Marino {
173795b7b453SJohn Marino err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1738*09d4459fSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
173995b7b453SJohn Marino return err;
174095b7b453SJohn Marino }
174195b7b453SJohn Marino else
174295b7b453SJohn Marino eclosure_elem = dfa->eclosures[edest];
1743cf28ed85SJohn Marino /* Merge the epsilon closure of 'edest'. */
174495b7b453SJohn Marino err = re_node_set_merge (&eclosure, &eclosure_elem);
1745*09d4459fSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
174695b7b453SJohn Marino return err;
1747cf28ed85SJohn Marino /* If the epsilon closure of 'edest' is incomplete,
174895b7b453SJohn Marino the epsilon closure of this node is also incomplete. */
174995b7b453SJohn Marino if (dfa->eclosures[edest].nelem == 0)
175095b7b453SJohn Marino {
175195b7b453SJohn Marino incomplete = true;
175295b7b453SJohn Marino re_node_set_free (&eclosure_elem);
175395b7b453SJohn Marino }
175495b7b453SJohn Marino }
175595b7b453SJohn Marino
175695b7b453SJohn Marino /* An epsilon closure includes itself. */
175795b7b453SJohn Marino ok = re_node_set_insert (&eclosure, node);
1758*09d4459fSDaniel Fojt if (__glibc_unlikely (! ok))
175995b7b453SJohn Marino return REG_ESPACE;
176095b7b453SJohn Marino if (incomplete && !root)
176195b7b453SJohn Marino dfa->eclosures[node].nelem = 0;
176295b7b453SJohn Marino else
176395b7b453SJohn Marino dfa->eclosures[node] = eclosure;
176495b7b453SJohn Marino *new_set = eclosure;
176595b7b453SJohn Marino return REG_NOERROR;
176695b7b453SJohn Marino }
176795b7b453SJohn Marino
176895b7b453SJohn Marino /* Functions for token which are used in the parser. */
176995b7b453SJohn Marino
177095b7b453SJohn Marino /* Fetch a token from INPUT.
177195b7b453SJohn Marino We must not use this function inside bracket expressions. */
177295b7b453SJohn Marino
177395b7b453SJohn Marino static void
fetch_token(re_token_t * result,re_string_t * input,reg_syntax_t syntax)177495b7b453SJohn Marino fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
177595b7b453SJohn Marino {
177695b7b453SJohn Marino re_string_skip_bytes (input, peek_token (result, input, syntax));
177795b7b453SJohn Marino }
177895b7b453SJohn Marino
177995b7b453SJohn Marino /* Peek a token from INPUT, and return the length of the token.
178095b7b453SJohn Marino We must not use this function inside bracket expressions. */
178195b7b453SJohn Marino
178295b7b453SJohn Marino static int
peek_token(re_token_t * token,re_string_t * input,reg_syntax_t syntax)178395b7b453SJohn Marino peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
178495b7b453SJohn Marino {
178595b7b453SJohn Marino unsigned char c;
178695b7b453SJohn Marino
178795b7b453SJohn Marino if (re_string_eoi (input))
178895b7b453SJohn Marino {
178995b7b453SJohn Marino token->type = END_OF_RE;
179095b7b453SJohn Marino return 0;
179195b7b453SJohn Marino }
179295b7b453SJohn Marino
179395b7b453SJohn Marino c = re_string_peek_byte (input, 0);
179495b7b453SJohn Marino token->opr.c = c;
179595b7b453SJohn Marino
179695b7b453SJohn Marino token->word_char = 0;
179795b7b453SJohn Marino #ifdef RE_ENABLE_I18N
179895b7b453SJohn Marino token->mb_partial = 0;
1799*09d4459fSDaniel Fojt if (input->mb_cur_max > 1
1800*09d4459fSDaniel Fojt && !re_string_first_byte (input, re_string_cur_idx (input)))
180195b7b453SJohn Marino {
180295b7b453SJohn Marino token->type = CHARACTER;
180395b7b453SJohn Marino token->mb_partial = 1;
180495b7b453SJohn Marino return 1;
180595b7b453SJohn Marino }
180695b7b453SJohn Marino #endif
180795b7b453SJohn Marino if (c == '\\')
180895b7b453SJohn Marino {
180995b7b453SJohn Marino unsigned char c2;
181095b7b453SJohn Marino if (re_string_cur_idx (input) + 1 >= re_string_length (input))
181195b7b453SJohn Marino {
181295b7b453SJohn Marino token->type = BACK_SLASH;
181395b7b453SJohn Marino return 1;
181495b7b453SJohn Marino }
181595b7b453SJohn Marino
181695b7b453SJohn Marino c2 = re_string_peek_byte_case (input, 1);
181795b7b453SJohn Marino token->opr.c = c2;
181895b7b453SJohn Marino token->type = CHARACTER;
181995b7b453SJohn Marino #ifdef RE_ENABLE_I18N
182095b7b453SJohn Marino if (input->mb_cur_max > 1)
182195b7b453SJohn Marino {
182295b7b453SJohn Marino wint_t wc = re_string_wchar_at (input,
182395b7b453SJohn Marino re_string_cur_idx (input) + 1);
182495b7b453SJohn Marino token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
182595b7b453SJohn Marino }
182695b7b453SJohn Marino else
182795b7b453SJohn Marino #endif
182895b7b453SJohn Marino token->word_char = IS_WORD_CHAR (c2) != 0;
182995b7b453SJohn Marino
183095b7b453SJohn Marino switch (c2)
183195b7b453SJohn Marino {
183295b7b453SJohn Marino case '|':
183395b7b453SJohn Marino if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
183495b7b453SJohn Marino token->type = OP_ALT;
183595b7b453SJohn Marino break;
183695b7b453SJohn Marino case '1': case '2': case '3': case '4': case '5':
183795b7b453SJohn Marino case '6': case '7': case '8': case '9':
183895b7b453SJohn Marino if (!(syntax & RE_NO_BK_REFS))
183995b7b453SJohn Marino {
184095b7b453SJohn Marino token->type = OP_BACK_REF;
184195b7b453SJohn Marino token->opr.idx = c2 - '1';
184295b7b453SJohn Marino }
184395b7b453SJohn Marino break;
184495b7b453SJohn Marino case '<':
184595b7b453SJohn Marino if (!(syntax & RE_NO_GNU_OPS))
184695b7b453SJohn Marino {
184795b7b453SJohn Marino token->type = ANCHOR;
184895b7b453SJohn Marino token->opr.ctx_type = WORD_FIRST;
184995b7b453SJohn Marino }
185095b7b453SJohn Marino break;
185195b7b453SJohn Marino case '>':
185295b7b453SJohn Marino if (!(syntax & RE_NO_GNU_OPS))
185395b7b453SJohn Marino {
185495b7b453SJohn Marino token->type = ANCHOR;
185595b7b453SJohn Marino token->opr.ctx_type = WORD_LAST;
185695b7b453SJohn Marino }
185795b7b453SJohn Marino break;
185895b7b453SJohn Marino case 'b':
185995b7b453SJohn Marino if (!(syntax & RE_NO_GNU_OPS))
186095b7b453SJohn Marino {
186195b7b453SJohn Marino token->type = ANCHOR;
186295b7b453SJohn Marino token->opr.ctx_type = WORD_DELIM;
186395b7b453SJohn Marino }
186495b7b453SJohn Marino break;
186595b7b453SJohn Marino case 'B':
186695b7b453SJohn Marino if (!(syntax & RE_NO_GNU_OPS))
186795b7b453SJohn Marino {
186895b7b453SJohn Marino token->type = ANCHOR;
186995b7b453SJohn Marino token->opr.ctx_type = NOT_WORD_DELIM;
187095b7b453SJohn Marino }
187195b7b453SJohn Marino break;
187295b7b453SJohn Marino case 'w':
187395b7b453SJohn Marino if (!(syntax & RE_NO_GNU_OPS))
187495b7b453SJohn Marino token->type = OP_WORD;
187595b7b453SJohn Marino break;
187695b7b453SJohn Marino case 'W':
187795b7b453SJohn Marino if (!(syntax & RE_NO_GNU_OPS))
187895b7b453SJohn Marino token->type = OP_NOTWORD;
187995b7b453SJohn Marino break;
188095b7b453SJohn Marino case 's':
188195b7b453SJohn Marino if (!(syntax & RE_NO_GNU_OPS))
188295b7b453SJohn Marino token->type = OP_SPACE;
188395b7b453SJohn Marino break;
188495b7b453SJohn Marino case 'S':
188595b7b453SJohn Marino if (!(syntax & RE_NO_GNU_OPS))
188695b7b453SJohn Marino token->type = OP_NOTSPACE;
188795b7b453SJohn Marino break;
188895b7b453SJohn Marino case '`':
188995b7b453SJohn Marino if (!(syntax & RE_NO_GNU_OPS))
189095b7b453SJohn Marino {
189195b7b453SJohn Marino token->type = ANCHOR;
189295b7b453SJohn Marino token->opr.ctx_type = BUF_FIRST;
189395b7b453SJohn Marino }
189495b7b453SJohn Marino break;
189595b7b453SJohn Marino case '\'':
189695b7b453SJohn Marino if (!(syntax & RE_NO_GNU_OPS))
189795b7b453SJohn Marino {
189895b7b453SJohn Marino token->type = ANCHOR;
189995b7b453SJohn Marino token->opr.ctx_type = BUF_LAST;
190095b7b453SJohn Marino }
190195b7b453SJohn Marino break;
190295b7b453SJohn Marino case '(':
190395b7b453SJohn Marino if (!(syntax & RE_NO_BK_PARENS))
190495b7b453SJohn Marino token->type = OP_OPEN_SUBEXP;
190595b7b453SJohn Marino break;
190695b7b453SJohn Marino case ')':
190795b7b453SJohn Marino if (!(syntax & RE_NO_BK_PARENS))
190895b7b453SJohn Marino token->type = OP_CLOSE_SUBEXP;
190995b7b453SJohn Marino break;
191095b7b453SJohn Marino case '+':
191195b7b453SJohn Marino if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
191295b7b453SJohn Marino token->type = OP_DUP_PLUS;
191395b7b453SJohn Marino break;
191495b7b453SJohn Marino case '?':
191595b7b453SJohn Marino if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
191695b7b453SJohn Marino token->type = OP_DUP_QUESTION;
191795b7b453SJohn Marino break;
191895b7b453SJohn Marino case '{':
191995b7b453SJohn Marino if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
192095b7b453SJohn Marino token->type = OP_OPEN_DUP_NUM;
192195b7b453SJohn Marino break;
192295b7b453SJohn Marino case '}':
192395b7b453SJohn Marino if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
192495b7b453SJohn Marino token->type = OP_CLOSE_DUP_NUM;
192595b7b453SJohn Marino break;
192695b7b453SJohn Marino default:
192795b7b453SJohn Marino break;
192895b7b453SJohn Marino }
192995b7b453SJohn Marino return 2;
193095b7b453SJohn Marino }
193195b7b453SJohn Marino
193295b7b453SJohn Marino token->type = CHARACTER;
193395b7b453SJohn Marino #ifdef RE_ENABLE_I18N
193495b7b453SJohn Marino if (input->mb_cur_max > 1)
193595b7b453SJohn Marino {
193695b7b453SJohn Marino wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
193795b7b453SJohn Marino token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
193895b7b453SJohn Marino }
193995b7b453SJohn Marino else
194095b7b453SJohn Marino #endif
194195b7b453SJohn Marino token->word_char = IS_WORD_CHAR (token->opr.c);
194295b7b453SJohn Marino
194395b7b453SJohn Marino switch (c)
194495b7b453SJohn Marino {
194595b7b453SJohn Marino case '\n':
194695b7b453SJohn Marino if (syntax & RE_NEWLINE_ALT)
194795b7b453SJohn Marino token->type = OP_ALT;
194895b7b453SJohn Marino break;
194995b7b453SJohn Marino case '|':
195095b7b453SJohn Marino if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
195195b7b453SJohn Marino token->type = OP_ALT;
195295b7b453SJohn Marino break;
195395b7b453SJohn Marino case '*':
195495b7b453SJohn Marino token->type = OP_DUP_ASTERISK;
195595b7b453SJohn Marino break;
195695b7b453SJohn Marino case '+':
195795b7b453SJohn Marino if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
195895b7b453SJohn Marino token->type = OP_DUP_PLUS;
195995b7b453SJohn Marino break;
196095b7b453SJohn Marino case '?':
196195b7b453SJohn Marino if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
196295b7b453SJohn Marino token->type = OP_DUP_QUESTION;
196395b7b453SJohn Marino break;
196495b7b453SJohn Marino case '{':
196595b7b453SJohn Marino if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
196695b7b453SJohn Marino token->type = OP_OPEN_DUP_NUM;
196795b7b453SJohn Marino break;
196895b7b453SJohn Marino case '}':
196995b7b453SJohn Marino if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
197095b7b453SJohn Marino token->type = OP_CLOSE_DUP_NUM;
197195b7b453SJohn Marino break;
197295b7b453SJohn Marino case '(':
197395b7b453SJohn Marino if (syntax & RE_NO_BK_PARENS)
197495b7b453SJohn Marino token->type = OP_OPEN_SUBEXP;
197595b7b453SJohn Marino break;
197695b7b453SJohn Marino case ')':
197795b7b453SJohn Marino if (syntax & RE_NO_BK_PARENS)
197895b7b453SJohn Marino token->type = OP_CLOSE_SUBEXP;
197995b7b453SJohn Marino break;
198095b7b453SJohn Marino case '[':
198195b7b453SJohn Marino token->type = OP_OPEN_BRACKET;
198295b7b453SJohn Marino break;
198395b7b453SJohn Marino case '.':
198495b7b453SJohn Marino token->type = OP_PERIOD;
198595b7b453SJohn Marino break;
198695b7b453SJohn Marino case '^':
1987*09d4459fSDaniel Fojt if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE))
1988*09d4459fSDaniel Fojt && re_string_cur_idx (input) != 0)
198995b7b453SJohn Marino {
199095b7b453SJohn Marino char prev = re_string_peek_byte (input, -1);
199195b7b453SJohn Marino if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
199295b7b453SJohn Marino break;
199395b7b453SJohn Marino }
199495b7b453SJohn Marino token->type = ANCHOR;
199595b7b453SJohn Marino token->opr.ctx_type = LINE_FIRST;
199695b7b453SJohn Marino break;
199795b7b453SJohn Marino case '$':
1998*09d4459fSDaniel Fojt if (!(syntax & RE_CONTEXT_INDEP_ANCHORS)
1999*09d4459fSDaniel Fojt && re_string_cur_idx (input) + 1 != re_string_length (input))
200095b7b453SJohn Marino {
200195b7b453SJohn Marino re_token_t next;
200295b7b453SJohn Marino re_string_skip_bytes (input, 1);
200395b7b453SJohn Marino peek_token (&next, input, syntax);
200495b7b453SJohn Marino re_string_skip_bytes (input, -1);
200595b7b453SJohn Marino if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
200695b7b453SJohn Marino break;
200795b7b453SJohn Marino }
200895b7b453SJohn Marino token->type = ANCHOR;
200995b7b453SJohn Marino token->opr.ctx_type = LINE_LAST;
201095b7b453SJohn Marino break;
201195b7b453SJohn Marino default:
201295b7b453SJohn Marino break;
201395b7b453SJohn Marino }
201495b7b453SJohn Marino return 1;
201595b7b453SJohn Marino }
201695b7b453SJohn Marino
201795b7b453SJohn Marino /* Peek a token from INPUT, and return the length of the token.
201895b7b453SJohn Marino We must not use this function out of bracket expressions. */
201995b7b453SJohn Marino
202095b7b453SJohn Marino static int
peek_token_bracket(re_token_t * token,re_string_t * input,reg_syntax_t syntax)202195b7b453SJohn Marino peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
202295b7b453SJohn Marino {
202395b7b453SJohn Marino unsigned char c;
202495b7b453SJohn Marino if (re_string_eoi (input))
202595b7b453SJohn Marino {
202695b7b453SJohn Marino token->type = END_OF_RE;
202795b7b453SJohn Marino return 0;
202895b7b453SJohn Marino }
202995b7b453SJohn Marino c = re_string_peek_byte (input, 0);
203095b7b453SJohn Marino token->opr.c = c;
203195b7b453SJohn Marino
203295b7b453SJohn Marino #ifdef RE_ENABLE_I18N
2033*09d4459fSDaniel Fojt if (input->mb_cur_max > 1
2034*09d4459fSDaniel Fojt && !re_string_first_byte (input, re_string_cur_idx (input)))
203595b7b453SJohn Marino {
203695b7b453SJohn Marino token->type = CHARACTER;
203795b7b453SJohn Marino return 1;
203895b7b453SJohn Marino }
203995b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
204095b7b453SJohn Marino
204195b7b453SJohn Marino if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
204295b7b453SJohn Marino && re_string_cur_idx (input) + 1 < re_string_length (input))
204395b7b453SJohn Marino {
204495b7b453SJohn Marino /* In this case, '\' escape a character. */
204595b7b453SJohn Marino unsigned char c2;
204695b7b453SJohn Marino re_string_skip_bytes (input, 1);
204795b7b453SJohn Marino c2 = re_string_peek_byte (input, 0);
204895b7b453SJohn Marino token->opr.c = c2;
204995b7b453SJohn Marino token->type = CHARACTER;
205095b7b453SJohn Marino return 1;
205195b7b453SJohn Marino }
205295b7b453SJohn Marino if (c == '[') /* '[' is a special char in a bracket exps. */
205395b7b453SJohn Marino {
205495b7b453SJohn Marino unsigned char c2;
205595b7b453SJohn Marino int token_len;
205695b7b453SJohn Marino if (re_string_cur_idx (input) + 1 < re_string_length (input))
205795b7b453SJohn Marino c2 = re_string_peek_byte (input, 1);
205895b7b453SJohn Marino else
205995b7b453SJohn Marino c2 = 0;
206095b7b453SJohn Marino token->opr.c = c2;
206195b7b453SJohn Marino token_len = 2;
206295b7b453SJohn Marino switch (c2)
206395b7b453SJohn Marino {
206495b7b453SJohn Marino case '.':
206595b7b453SJohn Marino token->type = OP_OPEN_COLL_ELEM;
206695b7b453SJohn Marino break;
2067*09d4459fSDaniel Fojt
206895b7b453SJohn Marino case '=':
206995b7b453SJohn Marino token->type = OP_OPEN_EQUIV_CLASS;
207095b7b453SJohn Marino break;
2071*09d4459fSDaniel Fojt
207295b7b453SJohn Marino case ':':
207395b7b453SJohn Marino if (syntax & RE_CHAR_CLASSES)
207495b7b453SJohn Marino {
207595b7b453SJohn Marino token->type = OP_OPEN_CHAR_CLASS;
207695b7b453SJohn Marino break;
207795b7b453SJohn Marino }
2078*09d4459fSDaniel Fojt FALLTHROUGH;
207995b7b453SJohn Marino default:
208095b7b453SJohn Marino token->type = CHARACTER;
208195b7b453SJohn Marino token->opr.c = c;
208295b7b453SJohn Marino token_len = 1;
208395b7b453SJohn Marino break;
208495b7b453SJohn Marino }
208595b7b453SJohn Marino return token_len;
208695b7b453SJohn Marino }
208795b7b453SJohn Marino switch (c)
208895b7b453SJohn Marino {
208995b7b453SJohn Marino case '-':
209095b7b453SJohn Marino token->type = OP_CHARSET_RANGE;
209195b7b453SJohn Marino break;
209295b7b453SJohn Marino case ']':
209395b7b453SJohn Marino token->type = OP_CLOSE_BRACKET;
209495b7b453SJohn Marino break;
209595b7b453SJohn Marino case '^':
209695b7b453SJohn Marino token->type = OP_NON_MATCH_LIST;
209795b7b453SJohn Marino break;
209895b7b453SJohn Marino default:
209995b7b453SJohn Marino token->type = CHARACTER;
210095b7b453SJohn Marino }
210195b7b453SJohn Marino return 1;
210295b7b453SJohn Marino }
210395b7b453SJohn Marino
210495b7b453SJohn Marino /* Functions for parser. */
210595b7b453SJohn Marino
210695b7b453SJohn Marino /* Entry point of the parser.
210795b7b453SJohn Marino Parse the regular expression REGEXP and return the structure tree.
2108cf28ed85SJohn Marino If an error occurs, ERR is set by error code, and return NULL.
210995b7b453SJohn Marino This function build the following tree, from regular expression <reg_exp>:
211095b7b453SJohn Marino CAT
211195b7b453SJohn Marino / \
211295b7b453SJohn Marino / \
211395b7b453SJohn Marino <reg_exp> EOR
211495b7b453SJohn Marino
211595b7b453SJohn Marino CAT means concatenation.
211695b7b453SJohn Marino EOR means end of regular expression. */
211795b7b453SJohn Marino
211895b7b453SJohn Marino static bin_tree_t *
parse(re_string_t * regexp,regex_t * preg,reg_syntax_t syntax,reg_errcode_t * err)211995b7b453SJohn Marino parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
212095b7b453SJohn Marino reg_errcode_t *err)
212195b7b453SJohn Marino {
2122cf28ed85SJohn Marino re_dfa_t *dfa = preg->buffer;
212395b7b453SJohn Marino bin_tree_t *tree, *eor, *root;
212495b7b453SJohn Marino re_token_t current_token;
212595b7b453SJohn Marino dfa->syntax = syntax;
212695b7b453SJohn Marino fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
212795b7b453SJohn Marino tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2128*09d4459fSDaniel Fojt if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
212995b7b453SJohn Marino return NULL;
213095b7b453SJohn Marino eor = create_tree (dfa, NULL, NULL, END_OF_RE);
213195b7b453SJohn Marino if (tree != NULL)
213295b7b453SJohn Marino root = create_tree (dfa, tree, eor, CONCAT);
213395b7b453SJohn Marino else
213495b7b453SJohn Marino root = eor;
2135*09d4459fSDaniel Fojt if (__glibc_unlikely (eor == NULL || root == NULL))
213695b7b453SJohn Marino {
213795b7b453SJohn Marino *err = REG_ESPACE;
213895b7b453SJohn Marino return NULL;
213995b7b453SJohn Marino }
214095b7b453SJohn Marino return root;
214195b7b453SJohn Marino }
214295b7b453SJohn Marino
214395b7b453SJohn Marino /* This function build the following tree, from regular expression
214495b7b453SJohn Marino <branch1>|<branch2>:
214595b7b453SJohn Marino ALT
214695b7b453SJohn Marino / \
214795b7b453SJohn Marino / \
214895b7b453SJohn Marino <branch1> <branch2>
214995b7b453SJohn Marino
2150cf28ed85SJohn Marino ALT means alternative, which represents the operator '|'. */
215195b7b453SJohn Marino
215295b7b453SJohn Marino static bin_tree_t *
parse_reg_exp(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)215395b7b453SJohn Marino parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
215495b7b453SJohn Marino reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
215595b7b453SJohn Marino {
2156cf28ed85SJohn Marino re_dfa_t *dfa = preg->buffer;
215795b7b453SJohn Marino bin_tree_t *tree, *branch = NULL;
2158dc7c36e4SJohn Marino bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
215995b7b453SJohn Marino tree = parse_branch (regexp, preg, token, syntax, nest, err);
2160*09d4459fSDaniel Fojt if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
216195b7b453SJohn Marino return NULL;
216295b7b453SJohn Marino
216395b7b453SJohn Marino while (token->type == OP_ALT)
216495b7b453SJohn Marino {
216595b7b453SJohn Marino fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
216695b7b453SJohn Marino if (token->type != OP_ALT && token->type != END_OF_RE
216795b7b453SJohn Marino && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
216895b7b453SJohn Marino {
2169dc7c36e4SJohn Marino bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
2170dc7c36e4SJohn Marino dfa->completed_bkref_map = initial_bkref_map;
217195b7b453SJohn Marino branch = parse_branch (regexp, preg, token, syntax, nest, err);
2172*09d4459fSDaniel Fojt if (__glibc_unlikely (*err != REG_NOERROR && branch == NULL))
2173dc7c36e4SJohn Marino {
2174dc7c36e4SJohn Marino if (tree != NULL)
2175dc7c36e4SJohn Marino postorder (tree, free_tree, NULL);
217695b7b453SJohn Marino return NULL;
217795b7b453SJohn Marino }
2178dc7c36e4SJohn Marino dfa->completed_bkref_map |= accumulated_bkref_map;
2179dc7c36e4SJohn Marino }
218095b7b453SJohn Marino else
218195b7b453SJohn Marino branch = NULL;
218295b7b453SJohn Marino tree = create_tree (dfa, tree, branch, OP_ALT);
2183*09d4459fSDaniel Fojt if (__glibc_unlikely (tree == NULL))
218495b7b453SJohn Marino {
218595b7b453SJohn Marino *err = REG_ESPACE;
218695b7b453SJohn Marino return NULL;
218795b7b453SJohn Marino }
218895b7b453SJohn Marino }
218995b7b453SJohn Marino return tree;
219095b7b453SJohn Marino }
219195b7b453SJohn Marino
219295b7b453SJohn Marino /* This function build the following tree, from regular expression
219395b7b453SJohn Marino <exp1><exp2>:
219495b7b453SJohn Marino CAT
219595b7b453SJohn Marino / \
219695b7b453SJohn Marino / \
219795b7b453SJohn Marino <exp1> <exp2>
219895b7b453SJohn Marino
219995b7b453SJohn Marino CAT means concatenation. */
220095b7b453SJohn Marino
220195b7b453SJohn Marino static bin_tree_t *
parse_branch(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)220295b7b453SJohn Marino parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
220395b7b453SJohn Marino reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
220495b7b453SJohn Marino {
220595b7b453SJohn Marino bin_tree_t *tree, *expr;
2206cf28ed85SJohn Marino re_dfa_t *dfa = preg->buffer;
220795b7b453SJohn Marino tree = parse_expression (regexp, preg, token, syntax, nest, err);
2208*09d4459fSDaniel Fojt if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
220995b7b453SJohn Marino return NULL;
221095b7b453SJohn Marino
221195b7b453SJohn Marino while (token->type != OP_ALT && token->type != END_OF_RE
221295b7b453SJohn Marino && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
221395b7b453SJohn Marino {
221495b7b453SJohn Marino expr = parse_expression (regexp, preg, token, syntax, nest, err);
2215*09d4459fSDaniel Fojt if (__glibc_unlikely (*err != REG_NOERROR && expr == NULL))
221695b7b453SJohn Marino {
2217cf28ed85SJohn Marino if (tree != NULL)
2218cf28ed85SJohn Marino postorder (tree, free_tree, NULL);
221995b7b453SJohn Marino return NULL;
222095b7b453SJohn Marino }
222195b7b453SJohn Marino if (tree != NULL && expr != NULL)
222295b7b453SJohn Marino {
2223cf28ed85SJohn Marino bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2224cf28ed85SJohn Marino if (newtree == NULL)
222595b7b453SJohn Marino {
2226cf28ed85SJohn Marino postorder (expr, free_tree, NULL);
2227cf28ed85SJohn Marino postorder (tree, free_tree, NULL);
222895b7b453SJohn Marino *err = REG_ESPACE;
222995b7b453SJohn Marino return NULL;
223095b7b453SJohn Marino }
2231cf28ed85SJohn Marino tree = newtree;
223295b7b453SJohn Marino }
223395b7b453SJohn Marino else if (tree == NULL)
223495b7b453SJohn Marino tree = expr;
223595b7b453SJohn Marino /* Otherwise expr == NULL, we don't need to create new tree. */
223695b7b453SJohn Marino }
223795b7b453SJohn Marino return tree;
223895b7b453SJohn Marino }
223995b7b453SJohn Marino
224095b7b453SJohn Marino /* This function build the following tree, from regular expression a*:
224195b7b453SJohn Marino *
224295b7b453SJohn Marino |
224395b7b453SJohn Marino a
224495b7b453SJohn Marino */
224595b7b453SJohn Marino
224695b7b453SJohn Marino static bin_tree_t *
parse_expression(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)224795b7b453SJohn Marino parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
224895b7b453SJohn Marino reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
224995b7b453SJohn Marino {
2250cf28ed85SJohn Marino re_dfa_t *dfa = preg->buffer;
225195b7b453SJohn Marino bin_tree_t *tree;
225295b7b453SJohn Marino switch (token->type)
225395b7b453SJohn Marino {
225495b7b453SJohn Marino case CHARACTER:
225595b7b453SJohn Marino tree = create_token_tree (dfa, NULL, NULL, token);
2256*09d4459fSDaniel Fojt if (__glibc_unlikely (tree == NULL))
225795b7b453SJohn Marino {
225895b7b453SJohn Marino *err = REG_ESPACE;
225995b7b453SJohn Marino return NULL;
226095b7b453SJohn Marino }
226195b7b453SJohn Marino #ifdef RE_ENABLE_I18N
226295b7b453SJohn Marino if (dfa->mb_cur_max > 1)
226395b7b453SJohn Marino {
226495b7b453SJohn Marino while (!re_string_eoi (regexp)
226595b7b453SJohn Marino && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
226695b7b453SJohn Marino {
226795b7b453SJohn Marino bin_tree_t *mbc_remain;
226895b7b453SJohn Marino fetch_token (token, regexp, syntax);
226995b7b453SJohn Marino mbc_remain = create_token_tree (dfa, NULL, NULL, token);
227095b7b453SJohn Marino tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2271*09d4459fSDaniel Fojt if (__glibc_unlikely (mbc_remain == NULL || tree == NULL))
227295b7b453SJohn Marino {
227395b7b453SJohn Marino *err = REG_ESPACE;
227495b7b453SJohn Marino return NULL;
227595b7b453SJohn Marino }
227695b7b453SJohn Marino }
227795b7b453SJohn Marino }
227895b7b453SJohn Marino #endif
227995b7b453SJohn Marino break;
2280*09d4459fSDaniel Fojt
228195b7b453SJohn Marino case OP_OPEN_SUBEXP:
228295b7b453SJohn Marino tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2283*09d4459fSDaniel Fojt if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
228495b7b453SJohn Marino return NULL;
228595b7b453SJohn Marino break;
2286*09d4459fSDaniel Fojt
228795b7b453SJohn Marino case OP_OPEN_BRACKET:
228895b7b453SJohn Marino tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2289*09d4459fSDaniel Fojt if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
229095b7b453SJohn Marino return NULL;
229195b7b453SJohn Marino break;
2292*09d4459fSDaniel Fojt
229395b7b453SJohn Marino case OP_BACK_REF:
2294*09d4459fSDaniel Fojt if (!__glibc_likely (dfa->completed_bkref_map & (1 << token->opr.idx)))
229595b7b453SJohn Marino {
229695b7b453SJohn Marino *err = REG_ESUBREG;
229795b7b453SJohn Marino return NULL;
229895b7b453SJohn Marino }
229995b7b453SJohn Marino dfa->used_bkref_map |= 1 << token->opr.idx;
230095b7b453SJohn Marino tree = create_token_tree (dfa, NULL, NULL, token);
2301*09d4459fSDaniel Fojt if (__glibc_unlikely (tree == NULL))
230295b7b453SJohn Marino {
230395b7b453SJohn Marino *err = REG_ESPACE;
230495b7b453SJohn Marino return NULL;
230595b7b453SJohn Marino }
230695b7b453SJohn Marino ++dfa->nbackref;
230795b7b453SJohn Marino dfa->has_mb_node = 1;
230895b7b453SJohn Marino break;
2309*09d4459fSDaniel Fojt
231095b7b453SJohn Marino case OP_OPEN_DUP_NUM:
231195b7b453SJohn Marino if (syntax & RE_CONTEXT_INVALID_DUP)
231295b7b453SJohn Marino {
231395b7b453SJohn Marino *err = REG_BADRPT;
231495b7b453SJohn Marino return NULL;
231595b7b453SJohn Marino }
2316*09d4459fSDaniel Fojt FALLTHROUGH;
231795b7b453SJohn Marino case OP_DUP_ASTERISK:
231895b7b453SJohn Marino case OP_DUP_PLUS:
231995b7b453SJohn Marino case OP_DUP_QUESTION:
232095b7b453SJohn Marino if (syntax & RE_CONTEXT_INVALID_OPS)
232195b7b453SJohn Marino {
232295b7b453SJohn Marino *err = REG_BADRPT;
232395b7b453SJohn Marino return NULL;
232495b7b453SJohn Marino }
232595b7b453SJohn Marino else if (syntax & RE_CONTEXT_INDEP_OPS)
232695b7b453SJohn Marino {
232795b7b453SJohn Marino fetch_token (token, regexp, syntax);
232895b7b453SJohn Marino return parse_expression (regexp, preg, token, syntax, nest, err);
232995b7b453SJohn Marino }
2330*09d4459fSDaniel Fojt FALLTHROUGH;
233195b7b453SJohn Marino case OP_CLOSE_SUBEXP:
2332*09d4459fSDaniel Fojt if ((token->type == OP_CLOSE_SUBEXP)
2333*09d4459fSDaniel Fojt && !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
233495b7b453SJohn Marino {
233595b7b453SJohn Marino *err = REG_ERPAREN;
233695b7b453SJohn Marino return NULL;
233795b7b453SJohn Marino }
2338*09d4459fSDaniel Fojt FALLTHROUGH;
233995b7b453SJohn Marino case OP_CLOSE_DUP_NUM:
234095b7b453SJohn Marino /* We treat it as a normal character. */
234195b7b453SJohn Marino
234295b7b453SJohn Marino /* Then we can these characters as normal characters. */
234395b7b453SJohn Marino token->type = CHARACTER;
234495b7b453SJohn Marino /* mb_partial and word_char bits should be initialized already
234595b7b453SJohn Marino by peek_token. */
234695b7b453SJohn Marino tree = create_token_tree (dfa, NULL, NULL, token);
2347*09d4459fSDaniel Fojt if (__glibc_unlikely (tree == NULL))
234895b7b453SJohn Marino {
234995b7b453SJohn Marino *err = REG_ESPACE;
235095b7b453SJohn Marino return NULL;
235195b7b453SJohn Marino }
235295b7b453SJohn Marino break;
2353*09d4459fSDaniel Fojt
235495b7b453SJohn Marino case ANCHOR:
235595b7b453SJohn Marino if ((token->opr.ctx_type
235695b7b453SJohn Marino & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
235795b7b453SJohn Marino && dfa->word_ops_used == 0)
235895b7b453SJohn Marino init_word_char (dfa);
235995b7b453SJohn Marino if (token->opr.ctx_type == WORD_DELIM
236095b7b453SJohn Marino || token->opr.ctx_type == NOT_WORD_DELIM)
236195b7b453SJohn Marino {
236295b7b453SJohn Marino bin_tree_t *tree_first, *tree_last;
236395b7b453SJohn Marino if (token->opr.ctx_type == WORD_DELIM)
236495b7b453SJohn Marino {
236595b7b453SJohn Marino token->opr.ctx_type = WORD_FIRST;
236695b7b453SJohn Marino tree_first = create_token_tree (dfa, NULL, NULL, token);
236795b7b453SJohn Marino token->opr.ctx_type = WORD_LAST;
236895b7b453SJohn Marino }
236995b7b453SJohn Marino else
237095b7b453SJohn Marino {
237195b7b453SJohn Marino token->opr.ctx_type = INSIDE_WORD;
237295b7b453SJohn Marino tree_first = create_token_tree (dfa, NULL, NULL, token);
237395b7b453SJohn Marino token->opr.ctx_type = INSIDE_NOTWORD;
237495b7b453SJohn Marino }
237595b7b453SJohn Marino tree_last = create_token_tree (dfa, NULL, NULL, token);
237695b7b453SJohn Marino tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2377*09d4459fSDaniel Fojt if (__glibc_unlikely (tree_first == NULL || tree_last == NULL
2378*09d4459fSDaniel Fojt || tree == NULL))
237995b7b453SJohn Marino {
238095b7b453SJohn Marino *err = REG_ESPACE;
238195b7b453SJohn Marino return NULL;
238295b7b453SJohn Marino }
238395b7b453SJohn Marino }
238495b7b453SJohn Marino else
238595b7b453SJohn Marino {
238695b7b453SJohn Marino tree = create_token_tree (dfa, NULL, NULL, token);
2387*09d4459fSDaniel Fojt if (__glibc_unlikely (tree == NULL))
238895b7b453SJohn Marino {
238995b7b453SJohn Marino *err = REG_ESPACE;
239095b7b453SJohn Marino return NULL;
239195b7b453SJohn Marino }
239295b7b453SJohn Marino }
239395b7b453SJohn Marino /* We must return here, since ANCHORs can't be followed
239495b7b453SJohn Marino by repetition operators.
239595b7b453SJohn Marino eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
239695b7b453SJohn Marino it must not be "<ANCHOR(^)><REPEAT(*)>". */
239795b7b453SJohn Marino fetch_token (token, regexp, syntax);
239895b7b453SJohn Marino return tree;
2399*09d4459fSDaniel Fojt
240095b7b453SJohn Marino case OP_PERIOD:
240195b7b453SJohn Marino tree = create_token_tree (dfa, NULL, NULL, token);
2402*09d4459fSDaniel Fojt if (__glibc_unlikely (tree == NULL))
240395b7b453SJohn Marino {
240495b7b453SJohn Marino *err = REG_ESPACE;
240595b7b453SJohn Marino return NULL;
240695b7b453SJohn Marino }
240795b7b453SJohn Marino if (dfa->mb_cur_max > 1)
240895b7b453SJohn Marino dfa->has_mb_node = 1;
240995b7b453SJohn Marino break;
2410*09d4459fSDaniel Fojt
241195b7b453SJohn Marino case OP_WORD:
241295b7b453SJohn Marino case OP_NOTWORD:
241395b7b453SJohn Marino tree = build_charclass_op (dfa, regexp->trans,
2414680a9cb8SJohn Marino "alnum",
2415680a9cb8SJohn Marino "_",
241695b7b453SJohn Marino token->type == OP_NOTWORD, err);
2417*09d4459fSDaniel Fojt if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
241895b7b453SJohn Marino return NULL;
241995b7b453SJohn Marino break;
2420*09d4459fSDaniel Fojt
242195b7b453SJohn Marino case OP_SPACE:
242295b7b453SJohn Marino case OP_NOTSPACE:
242395b7b453SJohn Marino tree = build_charclass_op (dfa, regexp->trans,
2424680a9cb8SJohn Marino "space",
2425680a9cb8SJohn Marino "",
242695b7b453SJohn Marino token->type == OP_NOTSPACE, err);
2427*09d4459fSDaniel Fojt if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
242895b7b453SJohn Marino return NULL;
242995b7b453SJohn Marino break;
2430*09d4459fSDaniel Fojt
243195b7b453SJohn Marino case OP_ALT:
243295b7b453SJohn Marino case END_OF_RE:
243395b7b453SJohn Marino return NULL;
2434*09d4459fSDaniel Fojt
243595b7b453SJohn Marino case BACK_SLASH:
243695b7b453SJohn Marino *err = REG_EESCAPE;
243795b7b453SJohn Marino return NULL;
2438*09d4459fSDaniel Fojt
243995b7b453SJohn Marino default:
244095b7b453SJohn Marino /* Must not happen? */
2441*09d4459fSDaniel Fojt DEBUG_ASSERT (false);
244295b7b453SJohn Marino return NULL;
244395b7b453SJohn Marino }
244495b7b453SJohn Marino fetch_token (token, regexp, syntax);
244595b7b453SJohn Marino
244695b7b453SJohn Marino while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
244795b7b453SJohn Marino || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
244895b7b453SJohn Marino {
2449dc7c36e4SJohn Marino bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
2450dc7c36e4SJohn Marino syntax, err);
2451*09d4459fSDaniel Fojt if (__glibc_unlikely (*err != REG_NOERROR && dup_tree == NULL))
2452dc7c36e4SJohn Marino {
2453dc7c36e4SJohn Marino if (tree != NULL)
2454dc7c36e4SJohn Marino postorder (tree, free_tree, NULL);
245595b7b453SJohn Marino return NULL;
2456dc7c36e4SJohn Marino }
2457dc7c36e4SJohn Marino tree = dup_tree;
245895b7b453SJohn Marino /* In BRE consecutive duplications are not allowed. */
245995b7b453SJohn Marino if ((syntax & RE_CONTEXT_INVALID_DUP)
246095b7b453SJohn Marino && (token->type == OP_DUP_ASTERISK
246195b7b453SJohn Marino || token->type == OP_OPEN_DUP_NUM))
246295b7b453SJohn Marino {
2463dc7c36e4SJohn Marino if (tree != NULL)
2464dc7c36e4SJohn Marino postorder (tree, free_tree, NULL);
246595b7b453SJohn Marino *err = REG_BADRPT;
246695b7b453SJohn Marino return NULL;
246795b7b453SJohn Marino }
246895b7b453SJohn Marino }
246995b7b453SJohn Marino
247095b7b453SJohn Marino return tree;
247195b7b453SJohn Marino }
247295b7b453SJohn Marino
247395b7b453SJohn Marino /* This function build the following tree, from regular expression
247495b7b453SJohn Marino (<reg_exp>):
247595b7b453SJohn Marino SUBEXP
247695b7b453SJohn Marino |
247795b7b453SJohn Marino <reg_exp>
247895b7b453SJohn Marino */
247995b7b453SJohn Marino
248095b7b453SJohn Marino static bin_tree_t *
parse_sub_exp(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)248195b7b453SJohn Marino parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
248295b7b453SJohn Marino reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
248395b7b453SJohn Marino {
2484cf28ed85SJohn Marino re_dfa_t *dfa = preg->buffer;
248595b7b453SJohn Marino bin_tree_t *tree;
248695b7b453SJohn Marino size_t cur_nsub;
248795b7b453SJohn Marino cur_nsub = preg->re_nsub++;
248895b7b453SJohn Marino
248995b7b453SJohn Marino fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
249095b7b453SJohn Marino
249195b7b453SJohn Marino /* The subexpression may be a null string. */
249295b7b453SJohn Marino if (token->type == OP_CLOSE_SUBEXP)
249395b7b453SJohn Marino tree = NULL;
249495b7b453SJohn Marino else
249595b7b453SJohn Marino {
249695b7b453SJohn Marino tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2497*09d4459fSDaniel Fojt if (__glibc_unlikely (*err == REG_NOERROR
2498*09d4459fSDaniel Fojt && token->type != OP_CLOSE_SUBEXP))
2499cf28ed85SJohn Marino {
2500cf28ed85SJohn Marino if (tree != NULL)
2501cf28ed85SJohn Marino postorder (tree, free_tree, NULL);
250295b7b453SJohn Marino *err = REG_EPAREN;
2503cf28ed85SJohn Marino }
2504*09d4459fSDaniel Fojt if (__glibc_unlikely (*err != REG_NOERROR))
250595b7b453SJohn Marino return NULL;
250695b7b453SJohn Marino }
250795b7b453SJohn Marino
250895b7b453SJohn Marino if (cur_nsub <= '9' - '1')
250995b7b453SJohn Marino dfa->completed_bkref_map |= 1 << cur_nsub;
251095b7b453SJohn Marino
251195b7b453SJohn Marino tree = create_tree (dfa, tree, NULL, SUBEXP);
2512*09d4459fSDaniel Fojt if (__glibc_unlikely (tree == NULL))
251395b7b453SJohn Marino {
251495b7b453SJohn Marino *err = REG_ESPACE;
251595b7b453SJohn Marino return NULL;
251695b7b453SJohn Marino }
251795b7b453SJohn Marino tree->token.opr.idx = cur_nsub;
251895b7b453SJohn Marino return tree;
251995b7b453SJohn Marino }
252095b7b453SJohn Marino
252195b7b453SJohn Marino /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
252295b7b453SJohn Marino
252395b7b453SJohn Marino static bin_tree_t *
parse_dup_op(bin_tree_t * elem,re_string_t * regexp,re_dfa_t * dfa,re_token_t * token,reg_syntax_t syntax,reg_errcode_t * err)252495b7b453SJohn Marino parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
252595b7b453SJohn Marino re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
252695b7b453SJohn Marino {
252795b7b453SJohn Marino bin_tree_t *tree = NULL, *old_tree = NULL;
252895b7b453SJohn Marino Idx i, start, end, start_idx = re_string_cur_idx (regexp);
252995b7b453SJohn Marino re_token_t start_token = *token;
253095b7b453SJohn Marino
253195b7b453SJohn Marino if (token->type == OP_OPEN_DUP_NUM)
253295b7b453SJohn Marino {
253395b7b453SJohn Marino end = 0;
253495b7b453SJohn Marino start = fetch_number (regexp, token, syntax);
2535*09d4459fSDaniel Fojt if (start == -1)
253695b7b453SJohn Marino {
253795b7b453SJohn Marino if (token->type == CHARACTER && token->opr.c == ',')
253895b7b453SJohn Marino start = 0; /* We treat "{,m}" as "{0,m}". */
253995b7b453SJohn Marino else
254095b7b453SJohn Marino {
254195b7b453SJohn Marino *err = REG_BADBR; /* <re>{} is invalid. */
254295b7b453SJohn Marino return NULL;
254395b7b453SJohn Marino }
254495b7b453SJohn Marino }
2545*09d4459fSDaniel Fojt if (__glibc_likely (start != -2))
254695b7b453SJohn Marino {
254795b7b453SJohn Marino /* We treat "{n}" as "{n,n}". */
254895b7b453SJohn Marino end = ((token->type == OP_CLOSE_DUP_NUM) ? start
254995b7b453SJohn Marino : ((token->type == CHARACTER && token->opr.c == ',')
2550*09d4459fSDaniel Fojt ? fetch_number (regexp, token, syntax) : -2));
255195b7b453SJohn Marino }
2552*09d4459fSDaniel Fojt if (__glibc_unlikely (start == -2 || end == -2))
255395b7b453SJohn Marino {
255495b7b453SJohn Marino /* Invalid sequence. */
2555*09d4459fSDaniel Fojt if (__glibc_unlikely (!(syntax & RE_INVALID_INTERVAL_ORD)))
255695b7b453SJohn Marino {
255795b7b453SJohn Marino if (token->type == END_OF_RE)
255895b7b453SJohn Marino *err = REG_EBRACE;
255995b7b453SJohn Marino else
256095b7b453SJohn Marino *err = REG_BADBR;
256195b7b453SJohn Marino
256295b7b453SJohn Marino return NULL;
256395b7b453SJohn Marino }
256495b7b453SJohn Marino
256595b7b453SJohn Marino /* If the syntax bit is set, rollback. */
256695b7b453SJohn Marino re_string_set_index (regexp, start_idx);
256795b7b453SJohn Marino *token = start_token;
256895b7b453SJohn Marino token->type = CHARACTER;
256995b7b453SJohn Marino /* mb_partial and word_char bits should be already initialized by
257095b7b453SJohn Marino peek_token. */
257195b7b453SJohn Marino return elem;
257295b7b453SJohn Marino }
257395b7b453SJohn Marino
2574*09d4459fSDaniel Fojt if (__glibc_unlikely ((end != -1 && start > end)
2575*09d4459fSDaniel Fojt || token->type != OP_CLOSE_DUP_NUM))
257695b7b453SJohn Marino {
257795b7b453SJohn Marino /* First number greater than second. */
257895b7b453SJohn Marino *err = REG_BADBR;
257995b7b453SJohn Marino return NULL;
258095b7b453SJohn Marino }
2581cf28ed85SJohn Marino
2582*09d4459fSDaniel Fojt if (__glibc_unlikely (RE_DUP_MAX < (end == -1 ? start : end)))
2583cf28ed85SJohn Marino {
2584cf28ed85SJohn Marino *err = REG_ESIZE;
2585cf28ed85SJohn Marino return NULL;
2586cf28ed85SJohn Marino }
258795b7b453SJohn Marino }
258895b7b453SJohn Marino else
258995b7b453SJohn Marino {
259095b7b453SJohn Marino start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2591*09d4459fSDaniel Fojt end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
259295b7b453SJohn Marino }
259395b7b453SJohn Marino
259495b7b453SJohn Marino fetch_token (token, regexp, syntax);
259595b7b453SJohn Marino
2596*09d4459fSDaniel Fojt if (__glibc_unlikely (elem == NULL))
259795b7b453SJohn Marino return NULL;
2598*09d4459fSDaniel Fojt if (__glibc_unlikely (start == 0 && end == 0))
259995b7b453SJohn Marino {
260095b7b453SJohn Marino postorder (elem, free_tree, NULL);
260195b7b453SJohn Marino return NULL;
260295b7b453SJohn Marino }
260395b7b453SJohn Marino
260495b7b453SJohn Marino /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2605*09d4459fSDaniel Fojt if (__glibc_unlikely (start > 0))
260695b7b453SJohn Marino {
260795b7b453SJohn Marino tree = elem;
260895b7b453SJohn Marino for (i = 2; i <= start; ++i)
260995b7b453SJohn Marino {
261095b7b453SJohn Marino elem = duplicate_tree (elem, dfa);
261195b7b453SJohn Marino tree = create_tree (dfa, tree, elem, CONCAT);
2612*09d4459fSDaniel Fojt if (__glibc_unlikely (elem == NULL || tree == NULL))
261395b7b453SJohn Marino goto parse_dup_op_espace;
261495b7b453SJohn Marino }
261595b7b453SJohn Marino
261695b7b453SJohn Marino if (start == end)
261795b7b453SJohn Marino return tree;
261895b7b453SJohn Marino
261995b7b453SJohn Marino /* Duplicate ELEM before it is marked optional. */
262095b7b453SJohn Marino elem = duplicate_tree (elem, dfa);
2621*09d4459fSDaniel Fojt if (__glibc_unlikely (elem == NULL))
2622dc7c36e4SJohn Marino goto parse_dup_op_espace;
262395b7b453SJohn Marino old_tree = tree;
262495b7b453SJohn Marino }
262595b7b453SJohn Marino else
262695b7b453SJohn Marino old_tree = NULL;
262795b7b453SJohn Marino
262895b7b453SJohn Marino if (elem->token.type == SUBEXP)
2629a8597f6cSJohn Marino {
2630a8597f6cSJohn Marino uintptr_t subidx = elem->token.opr.idx;
2631a8597f6cSJohn Marino postorder (elem, mark_opt_subexp, (void *) subidx);
2632a8597f6cSJohn Marino }
263395b7b453SJohn Marino
263495b7b453SJohn Marino tree = create_tree (dfa, elem, NULL,
2635*09d4459fSDaniel Fojt (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2636*09d4459fSDaniel Fojt if (__glibc_unlikely (tree == NULL))
263795b7b453SJohn Marino goto parse_dup_op_espace;
263895b7b453SJohn Marino
2639*09d4459fSDaniel Fojt /* This loop is actually executed only when end != -1,
264095b7b453SJohn Marino to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
264195b7b453SJohn Marino already created the start+1-th copy. */
2642*09d4459fSDaniel Fojt if (TYPE_SIGNED (Idx) || end != -1)
264395b7b453SJohn Marino for (i = start + 2; i <= end; ++i)
264495b7b453SJohn Marino {
264595b7b453SJohn Marino elem = duplicate_tree (elem, dfa);
264695b7b453SJohn Marino tree = create_tree (dfa, tree, elem, CONCAT);
2647*09d4459fSDaniel Fojt if (__glibc_unlikely (elem == NULL || tree == NULL))
264895b7b453SJohn Marino goto parse_dup_op_espace;
264995b7b453SJohn Marino
265095b7b453SJohn Marino tree = create_tree (dfa, tree, NULL, OP_ALT);
2651*09d4459fSDaniel Fojt if (__glibc_unlikely (tree == NULL))
265295b7b453SJohn Marino goto parse_dup_op_espace;
265395b7b453SJohn Marino }
265495b7b453SJohn Marino
265595b7b453SJohn Marino if (old_tree)
265695b7b453SJohn Marino tree = create_tree (dfa, old_tree, tree, CONCAT);
265795b7b453SJohn Marino
265895b7b453SJohn Marino return tree;
265995b7b453SJohn Marino
266095b7b453SJohn Marino parse_dup_op_espace:
266195b7b453SJohn Marino *err = REG_ESPACE;
266295b7b453SJohn Marino return NULL;
266395b7b453SJohn Marino }
266495b7b453SJohn Marino
266595b7b453SJohn Marino /* Size of the names for collating symbol/equivalence_class/character_class.
266695b7b453SJohn Marino I'm not sure, but maybe enough. */
266795b7b453SJohn Marino #define BRACKET_NAME_BUF_SIZE 32
266895b7b453SJohn Marino
266995b7b453SJohn Marino #ifndef _LIBC
2670*09d4459fSDaniel Fojt
2671*09d4459fSDaniel Fojt # ifdef RE_ENABLE_I18N
2672*09d4459fSDaniel Fojt /* Convert the byte B to the corresponding wide character. In a
2673*09d4459fSDaniel Fojt unibyte locale, treat B as itself. In a multibyte locale, return
2674*09d4459fSDaniel Fojt WEOF if B is an encoding error. */
2675*09d4459fSDaniel Fojt static wint_t
parse_byte(unsigned char b,re_charset_t * mbcset)2676*09d4459fSDaniel Fojt parse_byte (unsigned char b, re_charset_t *mbcset)
2677*09d4459fSDaniel Fojt {
2678*09d4459fSDaniel Fojt return mbcset == NULL ? b : __btowc (b);
2679*09d4459fSDaniel Fojt }
2680*09d4459fSDaniel Fojt # endif
2681*09d4459fSDaniel Fojt
268295b7b453SJohn Marino /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
268395b7b453SJohn Marino Build the range expression which starts from START_ELEM, and ends
268495b7b453SJohn Marino at END_ELEM. The result are written to MBCSET and SBCSET.
268595b7b453SJohn Marino RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2686cf28ed85SJohn Marino mbcset->range_ends, is a pointer argument since we may
268795b7b453SJohn Marino update it. */
268895b7b453SJohn Marino
268995b7b453SJohn Marino static reg_errcode_t
269095b7b453SJohn Marino # ifdef RE_ENABLE_I18N
build_range_exp(const reg_syntax_t syntax,bitset_t sbcset,re_charset_t * mbcset,Idx * range_alloc,const bracket_elem_t * start_elem,const bracket_elem_t * end_elem)269195b7b453SJohn Marino build_range_exp (const reg_syntax_t syntax,
269295b7b453SJohn Marino bitset_t sbcset,
269395b7b453SJohn Marino re_charset_t *mbcset,
269495b7b453SJohn Marino Idx *range_alloc,
269595b7b453SJohn Marino const bracket_elem_t *start_elem,
269695b7b453SJohn Marino const bracket_elem_t *end_elem)
269795b7b453SJohn Marino # else /* not RE_ENABLE_I18N */
269895b7b453SJohn Marino build_range_exp (const reg_syntax_t syntax,
269995b7b453SJohn Marino bitset_t sbcset,
270095b7b453SJohn Marino const bracket_elem_t *start_elem,
270195b7b453SJohn Marino const bracket_elem_t *end_elem)
270295b7b453SJohn Marino # endif /* not RE_ENABLE_I18N */
270395b7b453SJohn Marino {
270495b7b453SJohn Marino unsigned int start_ch, end_ch;
270595b7b453SJohn Marino /* Equivalence Classes and Character Classes can't be a range start/end. */
2706*09d4459fSDaniel Fojt if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2707*09d4459fSDaniel Fojt || start_elem->type == CHAR_CLASS
2708*09d4459fSDaniel Fojt || end_elem->type == EQUIV_CLASS
2709*09d4459fSDaniel Fojt || end_elem->type == CHAR_CLASS))
271095b7b453SJohn Marino return REG_ERANGE;
271195b7b453SJohn Marino
271295b7b453SJohn Marino /* We can handle no multi character collating elements without libc
271395b7b453SJohn Marino support. */
2714*09d4459fSDaniel Fojt if (__glibc_unlikely ((start_elem->type == COLL_SYM
271595b7b453SJohn Marino && strlen ((char *) start_elem->opr.name) > 1)
271695b7b453SJohn Marino || (end_elem->type == COLL_SYM
2717*09d4459fSDaniel Fojt && strlen ((char *) end_elem->opr.name) > 1)))
271895b7b453SJohn Marino return REG_ECOLLATE;
271995b7b453SJohn Marino
272095b7b453SJohn Marino # ifdef RE_ENABLE_I18N
272195b7b453SJohn Marino {
272295b7b453SJohn Marino wchar_t wc;
272395b7b453SJohn Marino wint_t start_wc;
272495b7b453SJohn Marino wint_t end_wc;
272595b7b453SJohn Marino
272695b7b453SJohn Marino start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
272795b7b453SJohn Marino : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
272895b7b453SJohn Marino : 0));
272995b7b453SJohn Marino end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
273095b7b453SJohn Marino : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
273195b7b453SJohn Marino : 0));
273295b7b453SJohn Marino start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2733*09d4459fSDaniel Fojt ? parse_byte (start_ch, mbcset) : start_elem->opr.wch);
273495b7b453SJohn Marino end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2735*09d4459fSDaniel Fojt ? parse_byte (end_ch, mbcset) : end_elem->opr.wch);
273695b7b453SJohn Marino if (start_wc == WEOF || end_wc == WEOF)
273795b7b453SJohn Marino return REG_ECOLLATE;
2738*09d4459fSDaniel Fojt else if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2739*09d4459fSDaniel Fojt && start_wc > end_wc))
274095b7b453SJohn Marino return REG_ERANGE;
274195b7b453SJohn Marino
274295b7b453SJohn Marino /* Got valid collation sequence values, add them as a new entry.
274395b7b453SJohn Marino However, for !_LIBC we have no collation elements: if the
274495b7b453SJohn Marino character set is single byte, the single byte character set
274595b7b453SJohn Marino that we build below suffices. parse_bracket_exp passes
274695b7b453SJohn Marino no MBCSET if dfa->mb_cur_max == 1. */
274795b7b453SJohn Marino if (mbcset)
274895b7b453SJohn Marino {
274995b7b453SJohn Marino /* Check the space of the arrays. */
2750*09d4459fSDaniel Fojt if (__glibc_unlikely (*range_alloc == mbcset->nranges))
275195b7b453SJohn Marino {
275295b7b453SJohn Marino /* There is not enough space, need realloc. */
275395b7b453SJohn Marino wchar_t *new_array_start, *new_array_end;
275495b7b453SJohn Marino Idx new_nranges;
275595b7b453SJohn Marino
275695b7b453SJohn Marino /* +1 in case of mbcset->nranges is 0. */
275795b7b453SJohn Marino new_nranges = 2 * mbcset->nranges + 1;
275895b7b453SJohn Marino /* Use realloc since mbcset->range_starts and mbcset->range_ends
275995b7b453SJohn Marino are NULL if *range_alloc == 0. */
276095b7b453SJohn Marino new_array_start = re_realloc (mbcset->range_starts, wchar_t,
276195b7b453SJohn Marino new_nranges);
276295b7b453SJohn Marino new_array_end = re_realloc (mbcset->range_ends, wchar_t,
276395b7b453SJohn Marino new_nranges);
276495b7b453SJohn Marino
2765*09d4459fSDaniel Fojt if (__glibc_unlikely (new_array_start == NULL
2766*09d4459fSDaniel Fojt || new_array_end == NULL))
2767*09d4459fSDaniel Fojt {
2768*09d4459fSDaniel Fojt re_free (new_array_start);
2769*09d4459fSDaniel Fojt re_free (new_array_end);
277095b7b453SJohn Marino return REG_ESPACE;
2771*09d4459fSDaniel Fojt }
277295b7b453SJohn Marino
277395b7b453SJohn Marino mbcset->range_starts = new_array_start;
277495b7b453SJohn Marino mbcset->range_ends = new_array_end;
277595b7b453SJohn Marino *range_alloc = new_nranges;
277695b7b453SJohn Marino }
277795b7b453SJohn Marino
277895b7b453SJohn Marino mbcset->range_starts[mbcset->nranges] = start_wc;
277995b7b453SJohn Marino mbcset->range_ends[mbcset->nranges++] = end_wc;
278095b7b453SJohn Marino }
278195b7b453SJohn Marino
278295b7b453SJohn Marino /* Build the table for single byte characters. */
278395b7b453SJohn Marino for (wc = 0; wc < SBC_MAX; ++wc)
278495b7b453SJohn Marino {
2785680a9cb8SJohn Marino if (start_wc <= wc && wc <= end_wc)
278695b7b453SJohn Marino bitset_set (sbcset, wc);
278795b7b453SJohn Marino }
278895b7b453SJohn Marino }
278995b7b453SJohn Marino # else /* not RE_ENABLE_I18N */
279095b7b453SJohn Marino {
279195b7b453SJohn Marino unsigned int ch;
279295b7b453SJohn Marino start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
279395b7b453SJohn Marino : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
279495b7b453SJohn Marino : 0));
279595b7b453SJohn Marino end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
279695b7b453SJohn Marino : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
279795b7b453SJohn Marino : 0));
279895b7b453SJohn Marino if (start_ch > end_ch)
279995b7b453SJohn Marino return REG_ERANGE;
280095b7b453SJohn Marino /* Build the table for single byte characters. */
280195b7b453SJohn Marino for (ch = 0; ch < SBC_MAX; ++ch)
280295b7b453SJohn Marino if (start_ch <= ch && ch <= end_ch)
280395b7b453SJohn Marino bitset_set (sbcset, ch);
280495b7b453SJohn Marino }
280595b7b453SJohn Marino # endif /* not RE_ENABLE_I18N */
280695b7b453SJohn Marino return REG_NOERROR;
280795b7b453SJohn Marino }
280895b7b453SJohn Marino #endif /* not _LIBC */
280995b7b453SJohn Marino
281095b7b453SJohn Marino #ifndef _LIBC
281195b7b453SJohn Marino /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
281295b7b453SJohn Marino Build the collating element which is represented by NAME.
281395b7b453SJohn Marino The result are written to MBCSET and SBCSET.
281495b7b453SJohn Marino COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
281595b7b453SJohn Marino pointer argument since we may update it. */
281695b7b453SJohn Marino
281795b7b453SJohn Marino static reg_errcode_t
281895b7b453SJohn Marino # ifdef RE_ENABLE_I18N
build_collating_symbol(bitset_t sbcset,re_charset_t * mbcset,Idx * coll_sym_alloc,const unsigned char * name)2819680a9cb8SJohn Marino build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2820680a9cb8SJohn Marino Idx *coll_sym_alloc, const unsigned char *name)
2821cf28ed85SJohn Marino # else /* not RE_ENABLE_I18N */
2822cf28ed85SJohn Marino build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2823cf28ed85SJohn Marino # endif /* not RE_ENABLE_I18N */
282495b7b453SJohn Marino {
282595b7b453SJohn Marino size_t name_len = strlen ((const char *) name);
2826*09d4459fSDaniel Fojt if (__glibc_unlikely (name_len != 1))
282795b7b453SJohn Marino return REG_ECOLLATE;
282895b7b453SJohn Marino else
282995b7b453SJohn Marino {
283095b7b453SJohn Marino bitset_set (sbcset, name[0]);
283195b7b453SJohn Marino return REG_NOERROR;
283295b7b453SJohn Marino }
283395b7b453SJohn Marino }
283495b7b453SJohn Marino #endif /* not _LIBC */
283595b7b453SJohn Marino
283695b7b453SJohn Marino /* This function parse bracket expression like "[abc]", "[a-c]",
283795b7b453SJohn Marino "[[.a-a.]]" etc. */
283895b7b453SJohn Marino
283995b7b453SJohn Marino static bin_tree_t *
parse_bracket_exp(re_string_t * regexp,re_dfa_t * dfa,re_token_t * token,reg_syntax_t syntax,reg_errcode_t * err)284095b7b453SJohn Marino parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
284195b7b453SJohn Marino reg_syntax_t syntax, reg_errcode_t *err)
284295b7b453SJohn Marino {
284395b7b453SJohn Marino #ifdef _LIBC
284495b7b453SJohn Marino const unsigned char *collseqmb;
284595b7b453SJohn Marino const char *collseqwc;
284695b7b453SJohn Marino uint32_t nrules;
284795b7b453SJohn Marino int32_t table_size;
284895b7b453SJohn Marino const int32_t *symb_table;
284995b7b453SJohn Marino const unsigned char *extra;
285095b7b453SJohn Marino
2851cf28ed85SJohn Marino /* Local function for parse_bracket_exp used in _LIBC environment.
2852cf28ed85SJohn Marino Seek the collating symbol entry corresponding to NAME.
2853680a9cb8SJohn Marino Return the index of the symbol in the SYMB_TABLE,
2854680a9cb8SJohn Marino or -1 if not found. */
285595b7b453SJohn Marino
285695b7b453SJohn Marino auto inline int32_t
2857680a9cb8SJohn Marino __attribute__ ((always_inline))
2858680a9cb8SJohn Marino seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
285995b7b453SJohn Marino {
2860680a9cb8SJohn Marino int32_t elem;
2861680a9cb8SJohn Marino
2862680a9cb8SJohn Marino for (elem = 0; elem < table_size; elem++)
286395b7b453SJohn Marino if (symb_table[2 * elem] != 0)
286495b7b453SJohn Marino {
2865680a9cb8SJohn Marino int32_t idx = symb_table[2 * elem + 1];
2866680a9cb8SJohn Marino /* Skip the name of collating element name. */
2867680a9cb8SJohn Marino idx += 1 + extra[idx];
2868680a9cb8SJohn Marino if (/* Compare the length of the name. */
2869680a9cb8SJohn Marino name_len == extra[idx]
287095b7b453SJohn Marino /* Compare the name. */
2871680a9cb8SJohn Marino && memcmp (name, &extra[idx + 1], name_len) == 0)
287295b7b453SJohn Marino /* Yep, this is the entry. */
287395b7b453SJohn Marino return elem;
287495b7b453SJohn Marino }
2875680a9cb8SJohn Marino return -1;
2876680a9cb8SJohn Marino }
287795b7b453SJohn Marino
287895b7b453SJohn Marino /* Local function for parse_bracket_exp used in _LIBC environment.
287995b7b453SJohn Marino Look up the collation sequence value of BR_ELEM.
288095b7b453SJohn Marino Return the value if succeeded, UINT_MAX otherwise. */
288195b7b453SJohn Marino
288295b7b453SJohn Marino auto inline unsigned int
2883680a9cb8SJohn Marino __attribute__ ((always_inline))
2884680a9cb8SJohn Marino lookup_collation_sequence_value (bracket_elem_t *br_elem)
288595b7b453SJohn Marino {
288695b7b453SJohn Marino if (br_elem->type == SB_CHAR)
288795b7b453SJohn Marino {
288895b7b453SJohn Marino /*
288995b7b453SJohn Marino if (MB_CUR_MAX == 1)
289095b7b453SJohn Marino */
289195b7b453SJohn Marino if (nrules == 0)
289295b7b453SJohn Marino return collseqmb[br_elem->opr.ch];
289395b7b453SJohn Marino else
289495b7b453SJohn Marino {
289595b7b453SJohn Marino wint_t wc = __btowc (br_elem->opr.ch);
289695b7b453SJohn Marino return __collseq_table_lookup (collseqwc, wc);
289795b7b453SJohn Marino }
289895b7b453SJohn Marino }
289995b7b453SJohn Marino else if (br_elem->type == MB_CHAR)
290095b7b453SJohn Marino {
290195b7b453SJohn Marino if (nrules != 0)
290295b7b453SJohn Marino return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
290395b7b453SJohn Marino }
290495b7b453SJohn Marino else if (br_elem->type == COLL_SYM)
290595b7b453SJohn Marino {
290695b7b453SJohn Marino size_t sym_name_len = strlen ((char *) br_elem->opr.name);
290795b7b453SJohn Marino if (nrules != 0)
290895b7b453SJohn Marino {
290995b7b453SJohn Marino int32_t elem, idx;
291095b7b453SJohn Marino elem = seek_collating_symbol_entry (br_elem->opr.name,
291195b7b453SJohn Marino sym_name_len);
2912680a9cb8SJohn Marino if (elem != -1)
291395b7b453SJohn Marino {
291495b7b453SJohn Marino /* We found the entry. */
291595b7b453SJohn Marino idx = symb_table[2 * elem + 1];
291695b7b453SJohn Marino /* Skip the name of collating element name. */
291795b7b453SJohn Marino idx += 1 + extra[idx];
291895b7b453SJohn Marino /* Skip the byte sequence of the collating element. */
291995b7b453SJohn Marino idx += 1 + extra[idx];
292095b7b453SJohn Marino /* Adjust for the alignment. */
292195b7b453SJohn Marino idx = (idx + 3) & ~3;
292295b7b453SJohn Marino /* Skip the multibyte collation sequence value. */
292395b7b453SJohn Marino idx += sizeof (unsigned int);
292495b7b453SJohn Marino /* Skip the wide char sequence of the collating element. */
292595b7b453SJohn Marino idx += sizeof (unsigned int) *
292695b7b453SJohn Marino (1 + *(unsigned int *) (extra + idx));
292795b7b453SJohn Marino /* Return the collation sequence value. */
292895b7b453SJohn Marino return *(unsigned int *) (extra + idx);
292995b7b453SJohn Marino }
2930680a9cb8SJohn Marino else if (sym_name_len == 1)
293195b7b453SJohn Marino {
293295b7b453SJohn Marino /* No valid character. Match it as a single byte
293395b7b453SJohn Marino character. */
293495b7b453SJohn Marino return collseqmb[br_elem->opr.name[0]];
293595b7b453SJohn Marino }
293695b7b453SJohn Marino }
293795b7b453SJohn Marino else if (sym_name_len == 1)
293895b7b453SJohn Marino return collseqmb[br_elem->opr.name[0]];
293995b7b453SJohn Marino }
294095b7b453SJohn Marino return UINT_MAX;
294195b7b453SJohn Marino }
294295b7b453SJohn Marino
2943cf28ed85SJohn Marino /* Local function for parse_bracket_exp used in _LIBC environment.
294495b7b453SJohn Marino Build the range expression which starts from START_ELEM, and ends
294595b7b453SJohn Marino at END_ELEM. The result are written to MBCSET and SBCSET.
294695b7b453SJohn Marino RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2947cf28ed85SJohn Marino mbcset->range_ends, is a pointer argument since we may
294895b7b453SJohn Marino update it. */
294995b7b453SJohn Marino
295095b7b453SJohn Marino auto inline reg_errcode_t
2951680a9cb8SJohn Marino __attribute__ ((always_inline))
2952680a9cb8SJohn Marino build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2953680a9cb8SJohn Marino bracket_elem_t *start_elem, bracket_elem_t *end_elem)
295495b7b453SJohn Marino {
295595b7b453SJohn Marino unsigned int ch;
295695b7b453SJohn Marino uint32_t start_collseq;
295795b7b453SJohn Marino uint32_t end_collseq;
295895b7b453SJohn Marino
295995b7b453SJohn Marino /* Equivalence Classes and Character Classes can't be a range
296095b7b453SJohn Marino start/end. */
2961*09d4459fSDaniel Fojt if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2962*09d4459fSDaniel Fojt || start_elem->type == CHAR_CLASS
2963*09d4459fSDaniel Fojt || end_elem->type == EQUIV_CLASS
2964*09d4459fSDaniel Fojt || end_elem->type == CHAR_CLASS))
296595b7b453SJohn Marino return REG_ERANGE;
296695b7b453SJohn Marino
2967680a9cb8SJohn Marino /* FIXME: Implement rational ranges here, too. */
296895b7b453SJohn Marino start_collseq = lookup_collation_sequence_value (start_elem);
296995b7b453SJohn Marino end_collseq = lookup_collation_sequence_value (end_elem);
297095b7b453SJohn Marino /* Check start/end collation sequence values. */
2971*09d4459fSDaniel Fojt if (__glibc_unlikely (start_collseq == UINT_MAX
2972*09d4459fSDaniel Fojt || end_collseq == UINT_MAX))
297395b7b453SJohn Marino return REG_ECOLLATE;
2974*09d4459fSDaniel Fojt if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2975*09d4459fSDaniel Fojt && start_collseq > end_collseq))
297695b7b453SJohn Marino return REG_ERANGE;
297795b7b453SJohn Marino
297895b7b453SJohn Marino /* Got valid collation sequence values, add them as a new entry.
297995b7b453SJohn Marino However, if we have no collation elements, and the character set
298095b7b453SJohn Marino is single byte, the single byte character set that we
298195b7b453SJohn Marino build below suffices. */
298295b7b453SJohn Marino if (nrules > 0 || dfa->mb_cur_max > 1)
298395b7b453SJohn Marino {
298495b7b453SJohn Marino /* Check the space of the arrays. */
2985*09d4459fSDaniel Fojt if (__glibc_unlikely (*range_alloc == mbcset->nranges))
298695b7b453SJohn Marino {
298795b7b453SJohn Marino /* There is not enough space, need realloc. */
298895b7b453SJohn Marino uint32_t *new_array_start;
298995b7b453SJohn Marino uint32_t *new_array_end;
299095b7b453SJohn Marino Idx new_nranges;
299195b7b453SJohn Marino
299295b7b453SJohn Marino /* +1 in case of mbcset->nranges is 0. */
299395b7b453SJohn Marino new_nranges = 2 * mbcset->nranges + 1;
299495b7b453SJohn Marino new_array_start = re_realloc (mbcset->range_starts, uint32_t,
299595b7b453SJohn Marino new_nranges);
299695b7b453SJohn Marino new_array_end = re_realloc (mbcset->range_ends, uint32_t,
299795b7b453SJohn Marino new_nranges);
299895b7b453SJohn Marino
2999*09d4459fSDaniel Fojt if (__glibc_unlikely (new_array_start == NULL
3000*09d4459fSDaniel Fojt || new_array_end == NULL))
300195b7b453SJohn Marino return REG_ESPACE;
300295b7b453SJohn Marino
300395b7b453SJohn Marino mbcset->range_starts = new_array_start;
300495b7b453SJohn Marino mbcset->range_ends = new_array_end;
300595b7b453SJohn Marino *range_alloc = new_nranges;
300695b7b453SJohn Marino }
300795b7b453SJohn Marino
300895b7b453SJohn Marino mbcset->range_starts[mbcset->nranges] = start_collseq;
300995b7b453SJohn Marino mbcset->range_ends[mbcset->nranges++] = end_collseq;
301095b7b453SJohn Marino }
301195b7b453SJohn Marino
301295b7b453SJohn Marino /* Build the table for single byte characters. */
301395b7b453SJohn Marino for (ch = 0; ch < SBC_MAX; ch++)
301495b7b453SJohn Marino {
301595b7b453SJohn Marino uint32_t ch_collseq;
301695b7b453SJohn Marino /*
301795b7b453SJohn Marino if (MB_CUR_MAX == 1)
301895b7b453SJohn Marino */
301995b7b453SJohn Marino if (nrules == 0)
302095b7b453SJohn Marino ch_collseq = collseqmb[ch];
302195b7b453SJohn Marino else
302295b7b453SJohn Marino ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
302395b7b453SJohn Marino if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
302495b7b453SJohn Marino bitset_set (sbcset, ch);
302595b7b453SJohn Marino }
302695b7b453SJohn Marino return REG_NOERROR;
302795b7b453SJohn Marino }
302895b7b453SJohn Marino
3029cf28ed85SJohn Marino /* Local function for parse_bracket_exp used in _LIBC environment.
303095b7b453SJohn Marino Build the collating element which is represented by NAME.
303195b7b453SJohn Marino The result are written to MBCSET and SBCSET.
303295b7b453SJohn Marino COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3033cf28ed85SJohn Marino pointer argument since we may update it. */
303495b7b453SJohn Marino
303595b7b453SJohn Marino auto inline reg_errcode_t
3036680a9cb8SJohn Marino __attribute__ ((always_inline))
3037680a9cb8SJohn Marino build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
3038680a9cb8SJohn Marino Idx *coll_sym_alloc, const unsigned char *name)
303995b7b453SJohn Marino {
304095b7b453SJohn Marino int32_t elem, idx;
304195b7b453SJohn Marino size_t name_len = strlen ((const char *) name);
304295b7b453SJohn Marino if (nrules != 0)
304395b7b453SJohn Marino {
304495b7b453SJohn Marino elem = seek_collating_symbol_entry (name, name_len);
3045680a9cb8SJohn Marino if (elem != -1)
304695b7b453SJohn Marino {
304795b7b453SJohn Marino /* We found the entry. */
304895b7b453SJohn Marino idx = symb_table[2 * elem + 1];
304995b7b453SJohn Marino /* Skip the name of collating element name. */
305095b7b453SJohn Marino idx += 1 + extra[idx];
305195b7b453SJohn Marino }
3052680a9cb8SJohn Marino else if (name_len == 1)
305395b7b453SJohn Marino {
305495b7b453SJohn Marino /* No valid character, treat it as a normal
305595b7b453SJohn Marino character. */
305695b7b453SJohn Marino bitset_set (sbcset, name[0]);
305795b7b453SJohn Marino return REG_NOERROR;
305895b7b453SJohn Marino }
305995b7b453SJohn Marino else
306095b7b453SJohn Marino return REG_ECOLLATE;
306195b7b453SJohn Marino
306295b7b453SJohn Marino /* Got valid collation sequence, add it as a new entry. */
306395b7b453SJohn Marino /* Check the space of the arrays. */
3064*09d4459fSDaniel Fojt if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms))
306595b7b453SJohn Marino {
306695b7b453SJohn Marino /* Not enough, realloc it. */
306795b7b453SJohn Marino /* +1 in case of mbcset->ncoll_syms is 0. */
306895b7b453SJohn Marino Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
306995b7b453SJohn Marino /* Use realloc since mbcset->coll_syms is NULL
307095b7b453SJohn Marino if *alloc == 0. */
307195b7b453SJohn Marino int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
307295b7b453SJohn Marino new_coll_sym_alloc);
3073*09d4459fSDaniel Fojt if (__glibc_unlikely (new_coll_syms == NULL))
307495b7b453SJohn Marino return REG_ESPACE;
307595b7b453SJohn Marino mbcset->coll_syms = new_coll_syms;
307695b7b453SJohn Marino *coll_sym_alloc = new_coll_sym_alloc;
307795b7b453SJohn Marino }
307895b7b453SJohn Marino mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
307995b7b453SJohn Marino return REG_NOERROR;
308095b7b453SJohn Marino }
308195b7b453SJohn Marino else
308295b7b453SJohn Marino {
3083*09d4459fSDaniel Fojt if (__glibc_unlikely (name_len != 1))
308495b7b453SJohn Marino return REG_ECOLLATE;
308595b7b453SJohn Marino else
308695b7b453SJohn Marino {
308795b7b453SJohn Marino bitset_set (sbcset, name[0]);
308895b7b453SJohn Marino return REG_NOERROR;
308995b7b453SJohn Marino }
309095b7b453SJohn Marino }
309195b7b453SJohn Marino }
309295b7b453SJohn Marino #endif
309395b7b453SJohn Marino
309495b7b453SJohn Marino re_token_t br_token;
309595b7b453SJohn Marino re_bitset_ptr_t sbcset;
309695b7b453SJohn Marino #ifdef RE_ENABLE_I18N
309795b7b453SJohn Marino re_charset_t *mbcset;
309895b7b453SJohn Marino Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
309995b7b453SJohn Marino Idx equiv_class_alloc = 0, char_class_alloc = 0;
310095b7b453SJohn Marino #endif /* not RE_ENABLE_I18N */
310195b7b453SJohn Marino bool non_match = false;
310295b7b453SJohn Marino bin_tree_t *work_tree;
310395b7b453SJohn Marino int token_len;
310495b7b453SJohn Marino bool first_round = true;
310595b7b453SJohn Marino #ifdef _LIBC
310695b7b453SJohn Marino collseqmb = (const unsigned char *)
310795b7b453SJohn Marino _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
310895b7b453SJohn Marino nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
310995b7b453SJohn Marino if (nrules)
311095b7b453SJohn Marino {
311195b7b453SJohn Marino /*
311295b7b453SJohn Marino if (MB_CUR_MAX > 1)
311395b7b453SJohn Marino */
311495b7b453SJohn Marino collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
311595b7b453SJohn Marino table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
311695b7b453SJohn Marino symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
311795b7b453SJohn Marino _NL_COLLATE_SYMB_TABLEMB);
311895b7b453SJohn Marino extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
311995b7b453SJohn Marino _NL_COLLATE_SYMB_EXTRAMB);
312095b7b453SJohn Marino }
312195b7b453SJohn Marino #endif
312295b7b453SJohn Marino sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
312395b7b453SJohn Marino #ifdef RE_ENABLE_I18N
312495b7b453SJohn Marino mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
312595b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
312695b7b453SJohn Marino #ifdef RE_ENABLE_I18N
3127*09d4459fSDaniel Fojt if (__glibc_unlikely (sbcset == NULL || mbcset == NULL))
312895b7b453SJohn Marino #else
3129*09d4459fSDaniel Fojt if (__glibc_unlikely (sbcset == NULL))
313095b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
313195b7b453SJohn Marino {
3132cf28ed85SJohn Marino re_free (sbcset);
3133cf28ed85SJohn Marino #ifdef RE_ENABLE_I18N
3134cf28ed85SJohn Marino re_free (mbcset);
3135cf28ed85SJohn Marino #endif
313695b7b453SJohn Marino *err = REG_ESPACE;
313795b7b453SJohn Marino return NULL;
313895b7b453SJohn Marino }
313995b7b453SJohn Marino
314095b7b453SJohn Marino token_len = peek_token_bracket (token, regexp, syntax);
3141*09d4459fSDaniel Fojt if (__glibc_unlikely (token->type == END_OF_RE))
314295b7b453SJohn Marino {
314395b7b453SJohn Marino *err = REG_BADPAT;
314495b7b453SJohn Marino goto parse_bracket_exp_free_return;
314595b7b453SJohn Marino }
314695b7b453SJohn Marino if (token->type == OP_NON_MATCH_LIST)
314795b7b453SJohn Marino {
314895b7b453SJohn Marino #ifdef RE_ENABLE_I18N
314995b7b453SJohn Marino mbcset->non_match = 1;
315095b7b453SJohn Marino #endif /* not RE_ENABLE_I18N */
315195b7b453SJohn Marino non_match = true;
315295b7b453SJohn Marino if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
315395b7b453SJohn Marino bitset_set (sbcset, '\n');
315495b7b453SJohn Marino re_string_skip_bytes (regexp, token_len); /* Skip a token. */
315595b7b453SJohn Marino token_len = peek_token_bracket (token, regexp, syntax);
3156*09d4459fSDaniel Fojt if (__glibc_unlikely (token->type == END_OF_RE))
315795b7b453SJohn Marino {
315895b7b453SJohn Marino *err = REG_BADPAT;
315995b7b453SJohn Marino goto parse_bracket_exp_free_return;
316095b7b453SJohn Marino }
316195b7b453SJohn Marino }
316295b7b453SJohn Marino
316395b7b453SJohn Marino /* We treat the first ']' as a normal character. */
316495b7b453SJohn Marino if (token->type == OP_CLOSE_BRACKET)
316595b7b453SJohn Marino token->type = CHARACTER;
316695b7b453SJohn Marino
316795b7b453SJohn Marino while (1)
316895b7b453SJohn Marino {
316995b7b453SJohn Marino bracket_elem_t start_elem, end_elem;
317095b7b453SJohn Marino unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
317195b7b453SJohn Marino unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
317295b7b453SJohn Marino reg_errcode_t ret;
317395b7b453SJohn Marino int token_len2 = 0;
317495b7b453SJohn Marino bool is_range_exp = false;
317595b7b453SJohn Marino re_token_t token2;
317695b7b453SJohn Marino
317795b7b453SJohn Marino start_elem.opr.name = start_name_buf;
3178dc7c36e4SJohn Marino start_elem.type = COLL_SYM;
317995b7b453SJohn Marino ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
318095b7b453SJohn Marino syntax, first_round);
3181*09d4459fSDaniel Fojt if (__glibc_unlikely (ret != REG_NOERROR))
318295b7b453SJohn Marino {
318395b7b453SJohn Marino *err = ret;
318495b7b453SJohn Marino goto parse_bracket_exp_free_return;
318595b7b453SJohn Marino }
318695b7b453SJohn Marino first_round = false;
318795b7b453SJohn Marino
318895b7b453SJohn Marino /* Get information about the next token. We need it in any case. */
318995b7b453SJohn Marino token_len = peek_token_bracket (token, regexp, syntax);
319095b7b453SJohn Marino
319195b7b453SJohn Marino /* Do not check for ranges if we know they are not allowed. */
319295b7b453SJohn Marino if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
319395b7b453SJohn Marino {
3194*09d4459fSDaniel Fojt if (__glibc_unlikely (token->type == END_OF_RE))
319595b7b453SJohn Marino {
319695b7b453SJohn Marino *err = REG_EBRACK;
319795b7b453SJohn Marino goto parse_bracket_exp_free_return;
319895b7b453SJohn Marino }
319995b7b453SJohn Marino if (token->type == OP_CHARSET_RANGE)
320095b7b453SJohn Marino {
320195b7b453SJohn Marino re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
320295b7b453SJohn Marino token_len2 = peek_token_bracket (&token2, regexp, syntax);
3203*09d4459fSDaniel Fojt if (__glibc_unlikely (token2.type == END_OF_RE))
320495b7b453SJohn Marino {
320595b7b453SJohn Marino *err = REG_EBRACK;
320695b7b453SJohn Marino goto parse_bracket_exp_free_return;
320795b7b453SJohn Marino }
320895b7b453SJohn Marino if (token2.type == OP_CLOSE_BRACKET)
320995b7b453SJohn Marino {
321095b7b453SJohn Marino /* We treat the last '-' as a normal character. */
321195b7b453SJohn Marino re_string_skip_bytes (regexp, -token_len);
321295b7b453SJohn Marino token->type = CHARACTER;
321395b7b453SJohn Marino }
321495b7b453SJohn Marino else
321595b7b453SJohn Marino is_range_exp = true;
321695b7b453SJohn Marino }
321795b7b453SJohn Marino }
321895b7b453SJohn Marino
321995b7b453SJohn Marino if (is_range_exp == true)
322095b7b453SJohn Marino {
322195b7b453SJohn Marino end_elem.opr.name = end_name_buf;
3222dc7c36e4SJohn Marino end_elem.type = COLL_SYM;
322395b7b453SJohn Marino ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
322495b7b453SJohn Marino dfa, syntax, true);
3225*09d4459fSDaniel Fojt if (__glibc_unlikely (ret != REG_NOERROR))
322695b7b453SJohn Marino {
322795b7b453SJohn Marino *err = ret;
322895b7b453SJohn Marino goto parse_bracket_exp_free_return;
322995b7b453SJohn Marino }
323095b7b453SJohn Marino
323195b7b453SJohn Marino token_len = peek_token_bracket (token, regexp, syntax);
323295b7b453SJohn Marino
323395b7b453SJohn Marino #ifdef _LIBC
323495b7b453SJohn Marino *err = build_range_exp (sbcset, mbcset, &range_alloc,
323595b7b453SJohn Marino &start_elem, &end_elem);
323695b7b453SJohn Marino #else
323795b7b453SJohn Marino # ifdef RE_ENABLE_I18N
323895b7b453SJohn Marino *err = build_range_exp (syntax, sbcset,
323995b7b453SJohn Marino dfa->mb_cur_max > 1 ? mbcset : NULL,
324095b7b453SJohn Marino &range_alloc, &start_elem, &end_elem);
324195b7b453SJohn Marino # else
324295b7b453SJohn Marino *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
324395b7b453SJohn Marino # endif
324495b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
3245*09d4459fSDaniel Fojt if (__glibc_unlikely (*err != REG_NOERROR))
324695b7b453SJohn Marino goto parse_bracket_exp_free_return;
324795b7b453SJohn Marino }
324895b7b453SJohn Marino else
324995b7b453SJohn Marino {
325095b7b453SJohn Marino switch (start_elem.type)
325195b7b453SJohn Marino {
325295b7b453SJohn Marino case SB_CHAR:
325395b7b453SJohn Marino bitset_set (sbcset, start_elem.opr.ch);
325495b7b453SJohn Marino break;
325595b7b453SJohn Marino #ifdef RE_ENABLE_I18N
325695b7b453SJohn Marino case MB_CHAR:
325795b7b453SJohn Marino /* Check whether the array has enough space. */
3258*09d4459fSDaniel Fojt if (__glibc_unlikely (mbchar_alloc == mbcset->nmbchars))
325995b7b453SJohn Marino {
326095b7b453SJohn Marino wchar_t *new_mbchars;
326195b7b453SJohn Marino /* Not enough, realloc it. */
326295b7b453SJohn Marino /* +1 in case of mbcset->nmbchars is 0. */
326395b7b453SJohn Marino mbchar_alloc = 2 * mbcset->nmbchars + 1;
326495b7b453SJohn Marino /* Use realloc since array is NULL if *alloc == 0. */
326595b7b453SJohn Marino new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
326695b7b453SJohn Marino mbchar_alloc);
3267*09d4459fSDaniel Fojt if (__glibc_unlikely (new_mbchars == NULL))
326895b7b453SJohn Marino goto parse_bracket_exp_espace;
326995b7b453SJohn Marino mbcset->mbchars = new_mbchars;
327095b7b453SJohn Marino }
327195b7b453SJohn Marino mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
327295b7b453SJohn Marino break;
327395b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
327495b7b453SJohn Marino case EQUIV_CLASS:
327595b7b453SJohn Marino *err = build_equiv_class (sbcset,
327695b7b453SJohn Marino #ifdef RE_ENABLE_I18N
327795b7b453SJohn Marino mbcset, &equiv_class_alloc,
327895b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
327995b7b453SJohn Marino start_elem.opr.name);
3280*09d4459fSDaniel Fojt if (__glibc_unlikely (*err != REG_NOERROR))
328195b7b453SJohn Marino goto parse_bracket_exp_free_return;
328295b7b453SJohn Marino break;
328395b7b453SJohn Marino case COLL_SYM:
328495b7b453SJohn Marino *err = build_collating_symbol (sbcset,
328595b7b453SJohn Marino #ifdef RE_ENABLE_I18N
328695b7b453SJohn Marino mbcset, &coll_sym_alloc,
328795b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
328895b7b453SJohn Marino start_elem.opr.name);
3289*09d4459fSDaniel Fojt if (__glibc_unlikely (*err != REG_NOERROR))
329095b7b453SJohn Marino goto parse_bracket_exp_free_return;
329195b7b453SJohn Marino break;
329295b7b453SJohn Marino case CHAR_CLASS:
329395b7b453SJohn Marino *err = build_charclass (regexp->trans, sbcset,
329495b7b453SJohn Marino #ifdef RE_ENABLE_I18N
329595b7b453SJohn Marino mbcset, &char_class_alloc,
329695b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
3297680a9cb8SJohn Marino (const char *) start_elem.opr.name,
3298680a9cb8SJohn Marino syntax);
3299*09d4459fSDaniel Fojt if (__glibc_unlikely (*err != REG_NOERROR))
330095b7b453SJohn Marino goto parse_bracket_exp_free_return;
330195b7b453SJohn Marino break;
330295b7b453SJohn Marino default:
3303*09d4459fSDaniel Fojt DEBUG_ASSERT (false);
330495b7b453SJohn Marino break;
330595b7b453SJohn Marino }
330695b7b453SJohn Marino }
3307*09d4459fSDaniel Fojt if (__glibc_unlikely (token->type == END_OF_RE))
330895b7b453SJohn Marino {
330995b7b453SJohn Marino *err = REG_EBRACK;
331095b7b453SJohn Marino goto parse_bracket_exp_free_return;
331195b7b453SJohn Marino }
331295b7b453SJohn Marino if (token->type == OP_CLOSE_BRACKET)
331395b7b453SJohn Marino break;
331495b7b453SJohn Marino }
331595b7b453SJohn Marino
331695b7b453SJohn Marino re_string_skip_bytes (regexp, token_len); /* Skip a token. */
331795b7b453SJohn Marino
331895b7b453SJohn Marino /* If it is non-matching list. */
331995b7b453SJohn Marino if (non_match)
332095b7b453SJohn Marino bitset_not (sbcset);
332195b7b453SJohn Marino
332295b7b453SJohn Marino #ifdef RE_ENABLE_I18N
332395b7b453SJohn Marino /* Ensure only single byte characters are set. */
332495b7b453SJohn Marino if (dfa->mb_cur_max > 1)
332595b7b453SJohn Marino bitset_mask (sbcset, dfa->sb_char);
332695b7b453SJohn Marino
332795b7b453SJohn Marino if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
332895b7b453SJohn Marino || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
332995b7b453SJohn Marino || mbcset->non_match)))
333095b7b453SJohn Marino {
333195b7b453SJohn Marino bin_tree_t *mbc_tree;
333295b7b453SJohn Marino int sbc_idx;
333395b7b453SJohn Marino /* Build a tree for complex bracket. */
333495b7b453SJohn Marino dfa->has_mb_node = 1;
333595b7b453SJohn Marino br_token.type = COMPLEX_BRACKET;
333695b7b453SJohn Marino br_token.opr.mbcset = mbcset;
333795b7b453SJohn Marino mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3338*09d4459fSDaniel Fojt if (__glibc_unlikely (mbc_tree == NULL))
333995b7b453SJohn Marino goto parse_bracket_exp_espace;
334095b7b453SJohn Marino for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
334195b7b453SJohn Marino if (sbcset[sbc_idx])
334295b7b453SJohn Marino break;
334395b7b453SJohn Marino /* If there are no bits set in sbcset, there is no point
334495b7b453SJohn Marino of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
334595b7b453SJohn Marino if (sbc_idx < BITSET_WORDS)
334695b7b453SJohn Marino {
334795b7b453SJohn Marino /* Build a tree for simple bracket. */
334895b7b453SJohn Marino br_token.type = SIMPLE_BRACKET;
334995b7b453SJohn Marino br_token.opr.sbcset = sbcset;
335095b7b453SJohn Marino work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3351*09d4459fSDaniel Fojt if (__glibc_unlikely (work_tree == NULL))
335295b7b453SJohn Marino goto parse_bracket_exp_espace;
335395b7b453SJohn Marino
335495b7b453SJohn Marino /* Then join them by ALT node. */
335595b7b453SJohn Marino work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3356*09d4459fSDaniel Fojt if (__glibc_unlikely (work_tree == NULL))
335795b7b453SJohn Marino goto parse_bracket_exp_espace;
335895b7b453SJohn Marino }
335995b7b453SJohn Marino else
336095b7b453SJohn Marino {
336195b7b453SJohn Marino re_free (sbcset);
336295b7b453SJohn Marino work_tree = mbc_tree;
336395b7b453SJohn Marino }
336495b7b453SJohn Marino }
336595b7b453SJohn Marino else
336695b7b453SJohn Marino #endif /* not RE_ENABLE_I18N */
336795b7b453SJohn Marino {
336895b7b453SJohn Marino #ifdef RE_ENABLE_I18N
336995b7b453SJohn Marino free_charset (mbcset);
337095b7b453SJohn Marino #endif
337195b7b453SJohn Marino /* Build a tree for simple bracket. */
337295b7b453SJohn Marino br_token.type = SIMPLE_BRACKET;
337395b7b453SJohn Marino br_token.opr.sbcset = sbcset;
337495b7b453SJohn Marino work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3375*09d4459fSDaniel Fojt if (__glibc_unlikely (work_tree == NULL))
337695b7b453SJohn Marino goto parse_bracket_exp_espace;
337795b7b453SJohn Marino }
337895b7b453SJohn Marino return work_tree;
337995b7b453SJohn Marino
338095b7b453SJohn Marino parse_bracket_exp_espace:
338195b7b453SJohn Marino *err = REG_ESPACE;
338295b7b453SJohn Marino parse_bracket_exp_free_return:
338395b7b453SJohn Marino re_free (sbcset);
338495b7b453SJohn Marino #ifdef RE_ENABLE_I18N
338595b7b453SJohn Marino free_charset (mbcset);
338695b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
338795b7b453SJohn Marino return NULL;
338895b7b453SJohn Marino }
338995b7b453SJohn Marino
339095b7b453SJohn Marino /* Parse an element in the bracket expression. */
339195b7b453SJohn Marino
339295b7b453SJohn Marino static reg_errcode_t
parse_bracket_element(bracket_elem_t * elem,re_string_t * regexp,re_token_t * token,int token_len,re_dfa_t * dfa,reg_syntax_t syntax,bool accept_hyphen)339395b7b453SJohn Marino parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3394680a9cb8SJohn Marino re_token_t *token, int token_len, re_dfa_t *dfa,
339595b7b453SJohn Marino reg_syntax_t syntax, bool accept_hyphen)
339695b7b453SJohn Marino {
339795b7b453SJohn Marino #ifdef RE_ENABLE_I18N
339895b7b453SJohn Marino int cur_char_size;
339995b7b453SJohn Marino cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
340095b7b453SJohn Marino if (cur_char_size > 1)
340195b7b453SJohn Marino {
340295b7b453SJohn Marino elem->type = MB_CHAR;
340395b7b453SJohn Marino elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
340495b7b453SJohn Marino re_string_skip_bytes (regexp, cur_char_size);
340595b7b453SJohn Marino return REG_NOERROR;
340695b7b453SJohn Marino }
340795b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
340895b7b453SJohn Marino re_string_skip_bytes (regexp, token_len); /* Skip a token. */
340995b7b453SJohn Marino if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
341095b7b453SJohn Marino || token->type == OP_OPEN_EQUIV_CLASS)
341195b7b453SJohn Marino return parse_bracket_symbol (elem, regexp, token);
3412*09d4459fSDaniel Fojt if (__glibc_unlikely (token->type == OP_CHARSET_RANGE) && !accept_hyphen)
341395b7b453SJohn Marino {
341495b7b453SJohn Marino /* A '-' must only appear as anything but a range indicator before
341595b7b453SJohn Marino the closing bracket. Everything else is an error. */
341695b7b453SJohn Marino re_token_t token2;
341795b7b453SJohn Marino (void) peek_token_bracket (&token2, regexp, syntax);
341895b7b453SJohn Marino if (token2.type != OP_CLOSE_BRACKET)
341995b7b453SJohn Marino /* The actual error value is not standardized since this whole
342095b7b453SJohn Marino case is undefined. But ERANGE makes good sense. */
342195b7b453SJohn Marino return REG_ERANGE;
342295b7b453SJohn Marino }
342395b7b453SJohn Marino elem->type = SB_CHAR;
342495b7b453SJohn Marino elem->opr.ch = token->opr.c;
342595b7b453SJohn Marino return REG_NOERROR;
342695b7b453SJohn Marino }
342795b7b453SJohn Marino
342895b7b453SJohn Marino /* Parse a bracket symbol in the bracket expression. Bracket symbols are
342995b7b453SJohn Marino such as [:<character_class>:], [.<collating_element>.], and
343095b7b453SJohn Marino [=<equivalent_class>=]. */
343195b7b453SJohn Marino
343295b7b453SJohn Marino static reg_errcode_t
parse_bracket_symbol(bracket_elem_t * elem,re_string_t * regexp,re_token_t * token)343395b7b453SJohn Marino parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
343495b7b453SJohn Marino re_token_t *token)
343595b7b453SJohn Marino {
343695b7b453SJohn Marino unsigned char ch, delim = token->opr.c;
343795b7b453SJohn Marino int i = 0;
343895b7b453SJohn Marino if (re_string_eoi(regexp))
343995b7b453SJohn Marino return REG_EBRACK;
344095b7b453SJohn Marino for (;; ++i)
344195b7b453SJohn Marino {
344295b7b453SJohn Marino if (i >= BRACKET_NAME_BUF_SIZE)
344395b7b453SJohn Marino return REG_EBRACK;
344495b7b453SJohn Marino if (token->type == OP_OPEN_CHAR_CLASS)
344595b7b453SJohn Marino ch = re_string_fetch_byte_case (regexp);
344695b7b453SJohn Marino else
344795b7b453SJohn Marino ch = re_string_fetch_byte (regexp);
344895b7b453SJohn Marino if (re_string_eoi(regexp))
344995b7b453SJohn Marino return REG_EBRACK;
345095b7b453SJohn Marino if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
345195b7b453SJohn Marino break;
345295b7b453SJohn Marino elem->opr.name[i] = ch;
345395b7b453SJohn Marino }
345495b7b453SJohn Marino re_string_skip_bytes (regexp, 1);
345595b7b453SJohn Marino elem->opr.name[i] = '\0';
345695b7b453SJohn Marino switch (token->type)
345795b7b453SJohn Marino {
345895b7b453SJohn Marino case OP_OPEN_COLL_ELEM:
345995b7b453SJohn Marino elem->type = COLL_SYM;
346095b7b453SJohn Marino break;
346195b7b453SJohn Marino case OP_OPEN_EQUIV_CLASS:
346295b7b453SJohn Marino elem->type = EQUIV_CLASS;
346395b7b453SJohn Marino break;
346495b7b453SJohn Marino case OP_OPEN_CHAR_CLASS:
346595b7b453SJohn Marino elem->type = CHAR_CLASS;
346695b7b453SJohn Marino break;
346795b7b453SJohn Marino default:
346895b7b453SJohn Marino break;
346995b7b453SJohn Marino }
347095b7b453SJohn Marino return REG_NOERROR;
347195b7b453SJohn Marino }
347295b7b453SJohn Marino
347395b7b453SJohn Marino /* Helper function for parse_bracket_exp.
347495b7b453SJohn Marino Build the equivalence class which is represented by NAME.
347595b7b453SJohn Marino The result are written to MBCSET and SBCSET.
347695b7b453SJohn Marino EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3477cf28ed85SJohn Marino is a pointer argument since we may update it. */
347895b7b453SJohn Marino
347995b7b453SJohn Marino static reg_errcode_t
348095b7b453SJohn Marino #ifdef RE_ENABLE_I18N
build_equiv_class(bitset_t sbcset,re_charset_t * mbcset,Idx * equiv_class_alloc,const unsigned char * name)3481680a9cb8SJohn Marino build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3482680a9cb8SJohn Marino Idx *equiv_class_alloc, const unsigned char *name)
348395b7b453SJohn Marino #else /* not RE_ENABLE_I18N */
348495b7b453SJohn Marino build_equiv_class (bitset_t sbcset, const unsigned char *name)
348595b7b453SJohn Marino #endif /* not RE_ENABLE_I18N */
348695b7b453SJohn Marino {
348795b7b453SJohn Marino #ifdef _LIBC
348895b7b453SJohn Marino uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
348995b7b453SJohn Marino if (nrules != 0)
349095b7b453SJohn Marino {
349195b7b453SJohn Marino const int32_t *table, *indirect;
349295b7b453SJohn Marino const unsigned char *weights, *extra, *cp;
349395b7b453SJohn Marino unsigned char char_buf[2];
349495b7b453SJohn Marino int32_t idx1, idx2;
349595b7b453SJohn Marino unsigned int ch;
349695b7b453SJohn Marino size_t len;
349795b7b453SJohn Marino /* Calculate the index for equivalence class. */
349895b7b453SJohn Marino cp = name;
349995b7b453SJohn Marino table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
350095b7b453SJohn Marino weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
350195b7b453SJohn Marino _NL_COLLATE_WEIGHTMB);
350295b7b453SJohn Marino extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
350395b7b453SJohn Marino _NL_COLLATE_EXTRAMB);
350495b7b453SJohn Marino indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
350595b7b453SJohn Marino _NL_COLLATE_INDIRECTMB);
3506dc7c36e4SJohn Marino idx1 = findidx (table, indirect, extra, &cp, -1);
3507*09d4459fSDaniel Fojt if (__glibc_unlikely (idx1 == 0 || *cp != '\0'))
350895b7b453SJohn Marino /* This isn't a valid character. */
350995b7b453SJohn Marino return REG_ECOLLATE;
351095b7b453SJohn Marino
3511cf28ed85SJohn Marino /* Build single byte matching table for this equivalence class. */
351295b7b453SJohn Marino len = weights[idx1 & 0xffffff];
351395b7b453SJohn Marino for (ch = 0; ch < SBC_MAX; ++ch)
351495b7b453SJohn Marino {
351595b7b453SJohn Marino char_buf[0] = ch;
351695b7b453SJohn Marino cp = char_buf;
3517dc7c36e4SJohn Marino idx2 = findidx (table, indirect, extra, &cp, 1);
351895b7b453SJohn Marino /*
351995b7b453SJohn Marino idx2 = table[ch];
352095b7b453SJohn Marino */
352195b7b453SJohn Marino if (idx2 == 0)
352295b7b453SJohn Marino /* This isn't a valid character. */
352395b7b453SJohn Marino continue;
352495b7b453SJohn Marino /* Compare only if the length matches and the collation rule
352595b7b453SJohn Marino index is the same. */
3526*09d4459fSDaniel Fojt if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)
3527*09d4459fSDaniel Fojt && memcmp (weights + (idx1 & 0xffffff) + 1,
3528*09d4459fSDaniel Fojt weights + (idx2 & 0xffffff) + 1, len) == 0)
352995b7b453SJohn Marino bitset_set (sbcset, ch);
353095b7b453SJohn Marino }
353195b7b453SJohn Marino /* Check whether the array has enough space. */
3532*09d4459fSDaniel Fojt if (__glibc_unlikely (*equiv_class_alloc == mbcset->nequiv_classes))
353395b7b453SJohn Marino {
353495b7b453SJohn Marino /* Not enough, realloc it. */
353595b7b453SJohn Marino /* +1 in case of mbcset->nequiv_classes is 0. */
353695b7b453SJohn Marino Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
353795b7b453SJohn Marino /* Use realloc since the array is NULL if *alloc == 0. */
353895b7b453SJohn Marino int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
353995b7b453SJohn Marino int32_t,
354095b7b453SJohn Marino new_equiv_class_alloc);
3541*09d4459fSDaniel Fojt if (__glibc_unlikely (new_equiv_classes == NULL))
354295b7b453SJohn Marino return REG_ESPACE;
354395b7b453SJohn Marino mbcset->equiv_classes = new_equiv_classes;
354495b7b453SJohn Marino *equiv_class_alloc = new_equiv_class_alloc;
354595b7b453SJohn Marino }
354695b7b453SJohn Marino mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
354795b7b453SJohn Marino }
354895b7b453SJohn Marino else
354995b7b453SJohn Marino #endif /* _LIBC */
355095b7b453SJohn Marino {
3551*09d4459fSDaniel Fojt if (__glibc_unlikely (strlen ((const char *) name) != 1))
355295b7b453SJohn Marino return REG_ECOLLATE;
355395b7b453SJohn Marino bitset_set (sbcset, *name);
355495b7b453SJohn Marino }
355595b7b453SJohn Marino return REG_NOERROR;
355695b7b453SJohn Marino }
355795b7b453SJohn Marino
355895b7b453SJohn Marino /* Helper function for parse_bracket_exp.
355995b7b453SJohn Marino Build the character class which is represented by NAME.
356095b7b453SJohn Marino The result are written to MBCSET and SBCSET.
356195b7b453SJohn Marino CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3562cf28ed85SJohn Marino is a pointer argument since we may update it. */
356395b7b453SJohn Marino
356495b7b453SJohn Marino static reg_errcode_t
356595b7b453SJohn Marino #ifdef RE_ENABLE_I18N
build_charclass(RE_TRANSLATE_TYPE trans,bitset_t sbcset,re_charset_t * mbcset,Idx * char_class_alloc,const char * class_name,reg_syntax_t syntax)356695b7b453SJohn Marino build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
356795b7b453SJohn Marino re_charset_t *mbcset, Idx *char_class_alloc,
3568680a9cb8SJohn Marino const char *class_name, reg_syntax_t syntax)
356995b7b453SJohn Marino #else /* not RE_ENABLE_I18N */
357095b7b453SJohn Marino build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3571680a9cb8SJohn Marino const char *class_name, reg_syntax_t syntax)
357295b7b453SJohn Marino #endif /* not RE_ENABLE_I18N */
357395b7b453SJohn Marino {
357495b7b453SJohn Marino int i;
3575680a9cb8SJohn Marino const char *name = class_name;
357695b7b453SJohn Marino
357795b7b453SJohn Marino /* In case of REG_ICASE "upper" and "lower" match the both of
357895b7b453SJohn Marino upper and lower cases. */
357995b7b453SJohn Marino if ((syntax & RE_ICASE)
358095b7b453SJohn Marino && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
358195b7b453SJohn Marino name = "alpha";
358295b7b453SJohn Marino
358395b7b453SJohn Marino #ifdef RE_ENABLE_I18N
358495b7b453SJohn Marino /* Check the space of the arrays. */
3585*09d4459fSDaniel Fojt if (__glibc_unlikely (*char_class_alloc == mbcset->nchar_classes))
358695b7b453SJohn Marino {
358795b7b453SJohn Marino /* Not enough, realloc it. */
358895b7b453SJohn Marino /* +1 in case of mbcset->nchar_classes is 0. */
358995b7b453SJohn Marino Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
359095b7b453SJohn Marino /* Use realloc since array is NULL if *alloc == 0. */
359195b7b453SJohn Marino wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
359295b7b453SJohn Marino new_char_class_alloc);
3593*09d4459fSDaniel Fojt if (__glibc_unlikely (new_char_classes == NULL))
359495b7b453SJohn Marino return REG_ESPACE;
359595b7b453SJohn Marino mbcset->char_classes = new_char_classes;
359695b7b453SJohn Marino *char_class_alloc = new_char_class_alloc;
359795b7b453SJohn Marino }
359895b7b453SJohn Marino mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
359995b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
360095b7b453SJohn Marino
360195b7b453SJohn Marino #define BUILD_CHARCLASS_LOOP(ctype_func) \
360295b7b453SJohn Marino do { \
3603*09d4459fSDaniel Fojt if (__glibc_unlikely (trans != NULL)) \
360495b7b453SJohn Marino { \
360595b7b453SJohn Marino for (i = 0; i < SBC_MAX; ++i) \
360695b7b453SJohn Marino if (ctype_func (i)) \
360795b7b453SJohn Marino bitset_set (sbcset, trans[i]); \
360895b7b453SJohn Marino } \
360995b7b453SJohn Marino else \
361095b7b453SJohn Marino { \
361195b7b453SJohn Marino for (i = 0; i < SBC_MAX; ++i) \
361295b7b453SJohn Marino if (ctype_func (i)) \
361395b7b453SJohn Marino bitset_set (sbcset, i); \
361495b7b453SJohn Marino } \
361595b7b453SJohn Marino } while (0)
361695b7b453SJohn Marino
361795b7b453SJohn Marino if (strcmp (name, "alnum") == 0)
361895b7b453SJohn Marino BUILD_CHARCLASS_LOOP (isalnum);
361995b7b453SJohn Marino else if (strcmp (name, "cntrl") == 0)
362095b7b453SJohn Marino BUILD_CHARCLASS_LOOP (iscntrl);
362195b7b453SJohn Marino else if (strcmp (name, "lower") == 0)
362295b7b453SJohn Marino BUILD_CHARCLASS_LOOP (islower);
362395b7b453SJohn Marino else if (strcmp (name, "space") == 0)
362495b7b453SJohn Marino BUILD_CHARCLASS_LOOP (isspace);
362595b7b453SJohn Marino else if (strcmp (name, "alpha") == 0)
362695b7b453SJohn Marino BUILD_CHARCLASS_LOOP (isalpha);
362795b7b453SJohn Marino else if (strcmp (name, "digit") == 0)
362895b7b453SJohn Marino BUILD_CHARCLASS_LOOP (isdigit);
362995b7b453SJohn Marino else if (strcmp (name, "print") == 0)
363095b7b453SJohn Marino BUILD_CHARCLASS_LOOP (isprint);
363195b7b453SJohn Marino else if (strcmp (name, "upper") == 0)
363295b7b453SJohn Marino BUILD_CHARCLASS_LOOP (isupper);
363395b7b453SJohn Marino else if (strcmp (name, "blank") == 0)
363495b7b453SJohn Marino BUILD_CHARCLASS_LOOP (isblank);
363595b7b453SJohn Marino else if (strcmp (name, "graph") == 0)
363695b7b453SJohn Marino BUILD_CHARCLASS_LOOP (isgraph);
363795b7b453SJohn Marino else if (strcmp (name, "punct") == 0)
363895b7b453SJohn Marino BUILD_CHARCLASS_LOOP (ispunct);
363995b7b453SJohn Marino else if (strcmp (name, "xdigit") == 0)
364095b7b453SJohn Marino BUILD_CHARCLASS_LOOP (isxdigit);
364195b7b453SJohn Marino else
364295b7b453SJohn Marino return REG_ECTYPE;
364395b7b453SJohn Marino
364495b7b453SJohn Marino return REG_NOERROR;
364595b7b453SJohn Marino }
364695b7b453SJohn Marino
364795b7b453SJohn Marino static bin_tree_t *
build_charclass_op(re_dfa_t * dfa,RE_TRANSLATE_TYPE trans,const char * class_name,const char * extra,bool non_match,reg_errcode_t * err)364895b7b453SJohn Marino build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3649680a9cb8SJohn Marino const char *class_name,
3650680a9cb8SJohn Marino const char *extra, bool non_match,
365195b7b453SJohn Marino reg_errcode_t *err)
365295b7b453SJohn Marino {
365395b7b453SJohn Marino re_bitset_ptr_t sbcset;
365495b7b453SJohn Marino #ifdef RE_ENABLE_I18N
365595b7b453SJohn Marino re_charset_t *mbcset;
365695b7b453SJohn Marino Idx alloc = 0;
365795b7b453SJohn Marino #endif /* not RE_ENABLE_I18N */
365895b7b453SJohn Marino reg_errcode_t ret;
365995b7b453SJohn Marino bin_tree_t *tree;
366095b7b453SJohn Marino
366195b7b453SJohn Marino sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3662*09d4459fSDaniel Fojt if (__glibc_unlikely (sbcset == NULL))
366395b7b453SJohn Marino {
366495b7b453SJohn Marino *err = REG_ESPACE;
366595b7b453SJohn Marino return NULL;
366695b7b453SJohn Marino }
366795b7b453SJohn Marino #ifdef RE_ENABLE_I18N
3668*09d4459fSDaniel Fojt mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3669*09d4459fSDaniel Fojt if (__glibc_unlikely (mbcset == NULL))
3670*09d4459fSDaniel Fojt {
3671*09d4459fSDaniel Fojt re_free (sbcset);
3672*09d4459fSDaniel Fojt *err = REG_ESPACE;
3673*09d4459fSDaniel Fojt return NULL;
367495b7b453SJohn Marino }
3675*09d4459fSDaniel Fojt mbcset->non_match = non_match;
3676*09d4459fSDaniel Fojt #endif /* RE_ENABLE_I18N */
367795b7b453SJohn Marino
367895b7b453SJohn Marino /* We don't care the syntax in this case. */
367995b7b453SJohn Marino ret = build_charclass (trans, sbcset,
368095b7b453SJohn Marino #ifdef RE_ENABLE_I18N
368195b7b453SJohn Marino mbcset, &alloc,
368295b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
368395b7b453SJohn Marino class_name, 0);
368495b7b453SJohn Marino
3685*09d4459fSDaniel Fojt if (__glibc_unlikely (ret != REG_NOERROR))
368695b7b453SJohn Marino {
368795b7b453SJohn Marino re_free (sbcset);
368895b7b453SJohn Marino #ifdef RE_ENABLE_I18N
368995b7b453SJohn Marino free_charset (mbcset);
369095b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
369195b7b453SJohn Marino *err = ret;
369295b7b453SJohn Marino return NULL;
369395b7b453SJohn Marino }
369495b7b453SJohn Marino /* \w match '_' also. */
369595b7b453SJohn Marino for (; *extra; extra++)
369695b7b453SJohn Marino bitset_set (sbcset, *extra);
369795b7b453SJohn Marino
369895b7b453SJohn Marino /* If it is non-matching list. */
369995b7b453SJohn Marino if (non_match)
370095b7b453SJohn Marino bitset_not (sbcset);
370195b7b453SJohn Marino
370295b7b453SJohn Marino #ifdef RE_ENABLE_I18N
370395b7b453SJohn Marino /* Ensure only single byte characters are set. */
370495b7b453SJohn Marino if (dfa->mb_cur_max > 1)
370595b7b453SJohn Marino bitset_mask (sbcset, dfa->sb_char);
370695b7b453SJohn Marino #endif
370795b7b453SJohn Marino
370895b7b453SJohn Marino /* Build a tree for simple bracket. */
3709*09d4459fSDaniel Fojt re_token_t br_token = { .type = SIMPLE_BRACKET, .opr.sbcset = sbcset };
371095b7b453SJohn Marino tree = create_token_tree (dfa, NULL, NULL, &br_token);
3711*09d4459fSDaniel Fojt if (__glibc_unlikely (tree == NULL))
371295b7b453SJohn Marino goto build_word_op_espace;
371395b7b453SJohn Marino
371495b7b453SJohn Marino #ifdef RE_ENABLE_I18N
371595b7b453SJohn Marino if (dfa->mb_cur_max > 1)
371695b7b453SJohn Marino {
371795b7b453SJohn Marino bin_tree_t *mbc_tree;
371895b7b453SJohn Marino /* Build a tree for complex bracket. */
371995b7b453SJohn Marino br_token.type = COMPLEX_BRACKET;
372095b7b453SJohn Marino br_token.opr.mbcset = mbcset;
372195b7b453SJohn Marino dfa->has_mb_node = 1;
372295b7b453SJohn Marino mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3723*09d4459fSDaniel Fojt if (__glibc_unlikely (mbc_tree == NULL))
372495b7b453SJohn Marino goto build_word_op_espace;
372595b7b453SJohn Marino /* Then join them by ALT node. */
372695b7b453SJohn Marino tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3727*09d4459fSDaniel Fojt if (__glibc_likely (mbc_tree != NULL))
372895b7b453SJohn Marino return tree;
372995b7b453SJohn Marino }
373095b7b453SJohn Marino else
373195b7b453SJohn Marino {
373295b7b453SJohn Marino free_charset (mbcset);
373395b7b453SJohn Marino return tree;
373495b7b453SJohn Marino }
373595b7b453SJohn Marino #else /* not RE_ENABLE_I18N */
373695b7b453SJohn Marino return tree;
373795b7b453SJohn Marino #endif /* not RE_ENABLE_I18N */
373895b7b453SJohn Marino
373995b7b453SJohn Marino build_word_op_espace:
374095b7b453SJohn Marino re_free (sbcset);
374195b7b453SJohn Marino #ifdef RE_ENABLE_I18N
374295b7b453SJohn Marino free_charset (mbcset);
374395b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
374495b7b453SJohn Marino *err = REG_ESPACE;
374595b7b453SJohn Marino return NULL;
374695b7b453SJohn Marino }
374795b7b453SJohn Marino
374895b7b453SJohn Marino /* This is intended for the expressions like "a{1,3}".
3749cf28ed85SJohn Marino Fetch a number from 'input', and return the number.
3750*09d4459fSDaniel Fojt Return -1 if the number field is empty like "{,1}".
3751cf28ed85SJohn Marino Return RE_DUP_MAX + 1 if the number field is too large.
3752*09d4459fSDaniel Fojt Return -2 if an error occurred. */
375395b7b453SJohn Marino
375495b7b453SJohn Marino static Idx
fetch_number(re_string_t * input,re_token_t * token,reg_syntax_t syntax)375595b7b453SJohn Marino fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
375695b7b453SJohn Marino {
3757*09d4459fSDaniel Fojt Idx num = -1;
375895b7b453SJohn Marino unsigned char c;
375995b7b453SJohn Marino while (1)
376095b7b453SJohn Marino {
376195b7b453SJohn Marino fetch_token (token, input, syntax);
376295b7b453SJohn Marino c = token->opr.c;
3763*09d4459fSDaniel Fojt if (__glibc_unlikely (token->type == END_OF_RE))
3764*09d4459fSDaniel Fojt return -2;
376595b7b453SJohn Marino if (token->type == OP_CLOSE_DUP_NUM || c == ',')
376695b7b453SJohn Marino break;
3767*09d4459fSDaniel Fojt num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3768*09d4459fSDaniel Fojt ? -2
3769*09d4459fSDaniel Fojt : num == -1
3770cf28ed85SJohn Marino ? c - '0'
3771cf28ed85SJohn Marino : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
377295b7b453SJohn Marino }
377395b7b453SJohn Marino return num;
377495b7b453SJohn Marino }
377595b7b453SJohn Marino
377695b7b453SJohn Marino #ifdef RE_ENABLE_I18N
377795b7b453SJohn Marino static void
free_charset(re_charset_t * cset)377895b7b453SJohn Marino free_charset (re_charset_t *cset)
377995b7b453SJohn Marino {
378095b7b453SJohn Marino re_free (cset->mbchars);
378195b7b453SJohn Marino # ifdef _LIBC
378295b7b453SJohn Marino re_free (cset->coll_syms);
378395b7b453SJohn Marino re_free (cset->equiv_classes);
3784*09d4459fSDaniel Fojt # endif
378595b7b453SJohn Marino re_free (cset->range_starts);
378695b7b453SJohn Marino re_free (cset->range_ends);
378795b7b453SJohn Marino re_free (cset->char_classes);
378895b7b453SJohn Marino re_free (cset);
378995b7b453SJohn Marino }
379095b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
379195b7b453SJohn Marino
379295b7b453SJohn Marino /* Functions for binary tree operation. */
379395b7b453SJohn Marino
379495b7b453SJohn Marino /* Create a tree node. */
379595b7b453SJohn Marino
379695b7b453SJohn Marino static bin_tree_t *
create_tree(re_dfa_t * dfa,bin_tree_t * left,bin_tree_t * right,re_token_type_t type)379795b7b453SJohn Marino create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
379895b7b453SJohn Marino re_token_type_t type)
379995b7b453SJohn Marino {
3800*09d4459fSDaniel Fojt re_token_t t = { .type = type };
380195b7b453SJohn Marino return create_token_tree (dfa, left, right, &t);
380295b7b453SJohn Marino }
380395b7b453SJohn Marino
380495b7b453SJohn Marino static bin_tree_t *
create_token_tree(re_dfa_t * dfa,bin_tree_t * left,bin_tree_t * right,const re_token_t * token)380595b7b453SJohn Marino create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
380695b7b453SJohn Marino const re_token_t *token)
380795b7b453SJohn Marino {
380895b7b453SJohn Marino bin_tree_t *tree;
3809*09d4459fSDaniel Fojt if (__glibc_unlikely (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE))
381095b7b453SJohn Marino {
381195b7b453SJohn Marino bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
381295b7b453SJohn Marino
381395b7b453SJohn Marino if (storage == NULL)
381495b7b453SJohn Marino return NULL;
381595b7b453SJohn Marino storage->next = dfa->str_tree_storage;
381695b7b453SJohn Marino dfa->str_tree_storage = storage;
381795b7b453SJohn Marino dfa->str_tree_storage_idx = 0;
381895b7b453SJohn Marino }
381995b7b453SJohn Marino tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
382095b7b453SJohn Marino
382195b7b453SJohn Marino tree->parent = NULL;
382295b7b453SJohn Marino tree->left = left;
382395b7b453SJohn Marino tree->right = right;
382495b7b453SJohn Marino tree->token = *token;
382595b7b453SJohn Marino tree->token.duplicated = 0;
382695b7b453SJohn Marino tree->token.opt_subexp = 0;
382795b7b453SJohn Marino tree->first = NULL;
382895b7b453SJohn Marino tree->next = NULL;
3829*09d4459fSDaniel Fojt tree->node_idx = -1;
383095b7b453SJohn Marino
383195b7b453SJohn Marino if (left != NULL)
383295b7b453SJohn Marino left->parent = tree;
383395b7b453SJohn Marino if (right != NULL)
383495b7b453SJohn Marino right->parent = tree;
383595b7b453SJohn Marino return tree;
383695b7b453SJohn Marino }
383795b7b453SJohn Marino
383895b7b453SJohn Marino /* Mark the tree SRC as an optional subexpression.
383995b7b453SJohn Marino To be called from preorder or postorder. */
384095b7b453SJohn Marino
384195b7b453SJohn Marino static reg_errcode_t
mark_opt_subexp(void * extra,bin_tree_t * node)384295b7b453SJohn Marino mark_opt_subexp (void *extra, bin_tree_t *node)
384395b7b453SJohn Marino {
3844a8597f6cSJohn Marino Idx idx = (uintptr_t) extra;
384595b7b453SJohn Marino if (node->token.type == SUBEXP && node->token.opr.idx == idx)
384695b7b453SJohn Marino node->token.opt_subexp = 1;
384795b7b453SJohn Marino
384895b7b453SJohn Marino return REG_NOERROR;
384995b7b453SJohn Marino }
385095b7b453SJohn Marino
385195b7b453SJohn Marino /* Free the allocated memory inside NODE. */
385295b7b453SJohn Marino
385395b7b453SJohn Marino static void
free_token(re_token_t * node)385495b7b453SJohn Marino free_token (re_token_t *node)
385595b7b453SJohn Marino {
385695b7b453SJohn Marino #ifdef RE_ENABLE_I18N
385795b7b453SJohn Marino if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
385895b7b453SJohn Marino free_charset (node->opr.mbcset);
385995b7b453SJohn Marino else
386095b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
386195b7b453SJohn Marino if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
386295b7b453SJohn Marino re_free (node->opr.sbcset);
386395b7b453SJohn Marino }
386495b7b453SJohn Marino
386595b7b453SJohn Marino /* Worker function for tree walking. Free the allocated memory inside NODE
386695b7b453SJohn Marino and its children. */
386795b7b453SJohn Marino
386895b7b453SJohn Marino static reg_errcode_t
free_tree(void * extra,bin_tree_t * node)3869680a9cb8SJohn Marino free_tree (void *extra, bin_tree_t *node)
387095b7b453SJohn Marino {
387195b7b453SJohn Marino free_token (&node->token);
387295b7b453SJohn Marino return REG_NOERROR;
387395b7b453SJohn Marino }
387495b7b453SJohn Marino
387595b7b453SJohn Marino
387695b7b453SJohn Marino /* Duplicate the node SRC, and return new node. This is a preorder
387795b7b453SJohn Marino visit similar to the one implemented by the generic visitor, but
387895b7b453SJohn Marino we need more infrastructure to maintain two parallel trees --- so,
387995b7b453SJohn Marino it's easier to duplicate. */
388095b7b453SJohn Marino
388195b7b453SJohn Marino static bin_tree_t *
duplicate_tree(const bin_tree_t * root,re_dfa_t * dfa)388295b7b453SJohn Marino duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
388395b7b453SJohn Marino {
388495b7b453SJohn Marino const bin_tree_t *node;
388595b7b453SJohn Marino bin_tree_t *dup_root;
388695b7b453SJohn Marino bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
388795b7b453SJohn Marino
388895b7b453SJohn Marino for (node = root; ; )
388995b7b453SJohn Marino {
389095b7b453SJohn Marino /* Create a new tree and link it back to the current parent. */
389195b7b453SJohn Marino *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
389295b7b453SJohn Marino if (*p_new == NULL)
389395b7b453SJohn Marino return NULL;
389495b7b453SJohn Marino (*p_new)->parent = dup_node;
389595b7b453SJohn Marino (*p_new)->token.duplicated = 1;
389695b7b453SJohn Marino dup_node = *p_new;
389795b7b453SJohn Marino
389895b7b453SJohn Marino /* Go to the left node, or up and to the right. */
389995b7b453SJohn Marino if (node->left)
390095b7b453SJohn Marino {
390195b7b453SJohn Marino node = node->left;
390295b7b453SJohn Marino p_new = &dup_node->left;
390395b7b453SJohn Marino }
390495b7b453SJohn Marino else
390595b7b453SJohn Marino {
390695b7b453SJohn Marino const bin_tree_t *prev = NULL;
390795b7b453SJohn Marino while (node->right == prev || node->right == NULL)
390895b7b453SJohn Marino {
390995b7b453SJohn Marino prev = node;
391095b7b453SJohn Marino node = node->parent;
391195b7b453SJohn Marino dup_node = dup_node->parent;
391295b7b453SJohn Marino if (!node)
391395b7b453SJohn Marino return dup_root;
391495b7b453SJohn Marino }
391595b7b453SJohn Marino node = node->right;
391695b7b453SJohn Marino p_new = &dup_node->right;
391795b7b453SJohn Marino }
391895b7b453SJohn Marino }
391995b7b453SJohn Marino }
3920