xref: /netbsd-src/external/gpl2/xcvs/dist/lib/regcomp.c (revision 5a6c14c844c4c665da5632061aebde7bb2cb5766)
1 /* Extended regular expression matching and search library.
2    Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5 
6    This program is free software; you can redistribute it and/or modify
7    it under the terms of the GNU General Public License as published by
8    the Free Software Foundation; either version 2, or (at your option)
9    any later version.
10 
11    This program is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14    GNU General Public License for more details.
15 
16    You should have received a copy of the GNU General Public License along
17    with this program; if not, write to the Free Software Foundation,
18    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
19 #include <sys/cdefs.h>
20 __RCSID("$NetBSD: regcomp.c,v 1.2 2016/05/17 14:00:09 christos Exp $");
21 
22 
23 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
24 					  Idx length, reg_syntax_t syntax);
25 static void re_compile_fastmap_iter (regex_t *bufp,
26 				     const re_dfastate_t *init_state,
27 				     char *fastmap);
28 static reg_errcode_t init_dfa (re_dfa_t *dfa, Idx pat_len);
29 #ifdef RE_ENABLE_I18N
30 static void free_charset (re_charset_t *cset);
31 #endif /* RE_ENABLE_I18N */
32 static void free_workarea_compile (regex_t *preg);
33 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
34 #ifdef RE_ENABLE_I18N
35 static void optimize_utf8 (re_dfa_t *dfa);
36 #endif
37 static reg_errcode_t analyze (regex_t *preg);
38 static reg_errcode_t preorder (bin_tree_t *root,
39 			       reg_errcode_t (fn (void *, bin_tree_t *)),
40 			       void *extra);
41 static reg_errcode_t postorder (bin_tree_t *root,
42 				reg_errcode_t (fn (void *, bin_tree_t *)),
43 				void *extra);
44 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
45 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
46 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
47 				 bin_tree_t *node);
48 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
49 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
50 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
51 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
52 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
53 				   unsigned int constraint);
54 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
55 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
56 					 Idx node, bool root);
57 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
58 static Idx fetch_number (re_string_t *input, re_token_t *token,
59 			 reg_syntax_t syntax);
60 static int peek_token (re_token_t *token, re_string_t *input,
61 			reg_syntax_t syntax);
62 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
63 			  reg_syntax_t syntax, reg_errcode_t *err);
64 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
65 				  re_token_t *token, reg_syntax_t syntax,
66 				  Idx nest, reg_errcode_t *err);
67 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
68 				 re_token_t *token, reg_syntax_t syntax,
69 				 Idx nest, reg_errcode_t *err);
70 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
71 				     re_token_t *token, reg_syntax_t syntax,
72 				     Idx nest, reg_errcode_t *err);
73 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
74 				  re_token_t *token, reg_syntax_t syntax,
75 				  Idx nest, reg_errcode_t *err);
76 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
77 				 re_dfa_t *dfa, re_token_t *token,
78 				 reg_syntax_t syntax, reg_errcode_t *err);
79 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
80 				      re_token_t *token, reg_syntax_t syntax,
81 				      reg_errcode_t *err);
82 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
83 					    re_string_t *regexp,
84 					    re_token_t *token, int token_len,
85 					    re_dfa_t *dfa,
86 					    reg_syntax_t syntax,
87 					    bool accept_hyphen);
88 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
89 					  re_string_t *regexp,
90 					  re_token_t *token);
91 #ifdef RE_ENABLE_I18N
92 static reg_errcode_t build_equiv_class (bitset sbcset,
93 					re_charset_t *mbcset,
94 					Idx *equiv_class_alloc,
95 					const unsigned char *name);
96 static reg_errcode_t build_charclass (unsigned REG_TRANSLATE_TYPE trans,
97 				      bitset sbcset,
98 				      re_charset_t *mbcset,
99 				      Idx *char_class_alloc,
100 				      const unsigned char *class_name,
101 				      reg_syntax_t syntax);
102 #else  /* not RE_ENABLE_I18N */
103 static reg_errcode_t build_equiv_class (bitset sbcset,
104 					const unsigned char *name);
105 static reg_errcode_t build_charclass (unsigned REG_TRANSLATE_TYPE trans,
106 				      bitset sbcset,
107 				      const unsigned char *class_name,
108 				      reg_syntax_t syntax);
109 #endif /* not RE_ENABLE_I18N */
110 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
111 				       unsigned REG_TRANSLATE_TYPE trans,
112 				       const unsigned char *class_name,
113 				       const unsigned char *extra,
114 				       bool non_match, reg_errcode_t *err);
115 static bin_tree_t *create_tree (re_dfa_t *dfa,
116 				bin_tree_t *left, bin_tree_t *right,
117 				re_token_type_t type);
118 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
119 				      bin_tree_t *left, bin_tree_t *right,
120 				      const re_token_t *token);
121 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
122 static void free_token (re_token_t *node);
123 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
124 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
125 
126 /* This table gives an error message for each of the error codes listed
127    in regex.h.  Obviously the order here has to be same as there.
128    POSIX doesn't require that we do anything for REG_NOERROR,
129    but why not be nice?  */
130 
131 const char __re_error_msgid[] attribute_hidden =
132   {
133 #define REG_NOERROR_IDX	0
134     gettext_noop ("Success")	/* REG_NOERROR */
135     "\0"
136 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
137     gettext_noop ("No match")	/* REG_NOMATCH */
138     "\0"
139 #define REG_BADPAT_IDX	(REG_NOMATCH_IDX + sizeof "No match")
140     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
141     "\0"
142 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
143     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
144     "\0"
145 #define REG_ECTYPE_IDX	(REG_ECOLLATE_IDX + sizeof "Invalid collation character")
146     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
147     "\0"
148 #define REG_EESCAPE_IDX	(REG_ECTYPE_IDX + sizeof "Invalid character class name")
149     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
150     "\0"
151 #define REG_ESUBREG_IDX	(REG_EESCAPE_IDX + sizeof "Trailing backslash")
152     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
153     "\0"
154 #define REG_EBRACK_IDX	(REG_ESUBREG_IDX + sizeof "Invalid back reference")
155     gettext_noop ("Unmatched [ or [^")	/* REG_EBRACK */
156     "\0"
157 #define REG_EPAREN_IDX	(REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
158     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
159     "\0"
160 #define REG_EBRACE_IDX	(REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
161     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
162     "\0"
163 #define REG_BADBR_IDX	(REG_EBRACE_IDX + sizeof "Unmatched \\{")
164     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
165     "\0"
166 #define REG_ERANGE_IDX	(REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
167     gettext_noop ("Invalid range end")	/* REG_ERANGE */
168     "\0"
169 #define REG_ESPACE_IDX	(REG_ERANGE_IDX + sizeof "Invalid range end")
170     gettext_noop ("Memory exhausted") /* REG_ESPACE */
171     "\0"
172 #define REG_BADRPT_IDX	(REG_ESPACE_IDX + sizeof "Memory exhausted")
173     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
174     "\0"
175 #define REG_EEND_IDX	(REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
176     gettext_noop ("Premature end of regular expression") /* REG_EEND */
177     "\0"
178 #define REG_ESIZE_IDX	(REG_EEND_IDX + sizeof "Premature end of regular expression")
179     gettext_noop ("Regular expression too big") /* REG_ESIZE */
180     "\0"
181 #define REG_ERPAREN_IDX	(REG_ESIZE_IDX + sizeof "Regular expression too big")
182     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
183   };
184 
185 const size_t __re_error_msgid_idx[] attribute_hidden =
186   {
187     REG_NOERROR_IDX,
188     REG_NOMATCH_IDX,
189     REG_BADPAT_IDX,
190     REG_ECOLLATE_IDX,
191     REG_ECTYPE_IDX,
192     REG_EESCAPE_IDX,
193     REG_ESUBREG_IDX,
194     REG_EBRACK_IDX,
195     REG_EPAREN_IDX,
196     REG_EBRACE_IDX,
197     REG_BADBR_IDX,
198     REG_ERANGE_IDX,
199     REG_ESPACE_IDX,
200     REG_BADRPT_IDX,
201     REG_EEND_IDX,
202     REG_ESIZE_IDX,
203     REG_ERPAREN_IDX
204   };
205 
206 /* Entry points for GNU code.  */
207 
208 /* re_compile_pattern is the GNU regular expression compiler: it
209    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
210    Returns 0 if the pattern was valid, otherwise an error string.
211 
212    Assumes the `re_allocated' (and perhaps `re_buffer') and `translate' fields
213    are set in BUFP on entry.  */
214 
215 const char *
re_compile_pattern(const char * pattern,size_t length,struct re_pattern_buffer * bufp)216 re_compile_pattern (const char *pattern, size_t length,
217 		    struct re_pattern_buffer *bufp)
218 {
219   reg_errcode_t ret;
220 
221   /* And GNU code determines whether or not to get register information
222      by passing null for the REGS argument to re_match, etc., not by
223      setting re_no_sub, unless REG_NO_SUB is set.  */
224   bufp->re_no_sub = !!(re_syntax_options & REG_NO_SUB);
225 
226   /* Match anchors at newline.  */
227   bufp->re_newline_anchor = 1;
228 
229   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
230 
231   if (!ret)
232     return NULL;
233   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
234 }
235 #ifdef _LIBC
weak_alias(__re_compile_pattern,re_compile_pattern)236 weak_alias (__re_compile_pattern, re_compile_pattern)
237 #endif
238 
239 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
240    also be assigned to arbitrarily: each pattern buffer stores its own
241    syntax, so it can be changed between regex compilations.  */
242 /* This has no initializer because initialized variables in Emacs
243    become read-only after dumping.  */
244 reg_syntax_t re_syntax_options;
245 
246 
247 /* Specify the precise syntax of regexps for compilation.  This provides
248    for compatibility for various utilities which historically have
249    different, incompatible syntaxes.
250 
251    The argument SYNTAX is a bit mask comprised of the various bits
252    defined in regex.h.  We return the old syntax.  */
253 
254 reg_syntax_t
255 re_set_syntax (reg_syntax_t syntax)
256 {
257   reg_syntax_t ret = re_syntax_options;
258 
259   re_syntax_options = syntax;
260   return ret;
261 }
262 #ifdef _LIBC
weak_alias(__re_set_syntax,re_set_syntax)263 weak_alias (__re_set_syntax, re_set_syntax)
264 #endif
265 
266 int
267 re_compile_fastmap (struct re_pattern_buffer *bufp)
268 {
269   re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer;
270   char *fastmap = bufp->re_fastmap;
271 
272   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
273   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
274   if (dfa->init_state != dfa->init_state_word)
275     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
276   if (dfa->init_state != dfa->init_state_nl)
277     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
278   if (dfa->init_state != dfa->init_state_begbuf)
279     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
280   bufp->re_fastmap_accurate = 1;
281   return 0;
282 }
283 #ifdef _LIBC
weak_alias(__re_compile_fastmap,re_compile_fastmap)284 weak_alias (__re_compile_fastmap, re_compile_fastmap)
285 #endif
286 
287 static inline void
288 __attribute ((always_inline))
289 re_set_fastmap (char *fastmap, bool icase, int ch)
290 {
291   fastmap[ch] = 1;
292   if (icase)
293     fastmap[tolower (ch)] = 1;
294 }
295 
296 /* Helper function for re_compile_fastmap.
297    Compile fastmap for the initial_state INIT_STATE.  */
298 
299 static void
re_compile_fastmap_iter(regex_t * bufp,const re_dfastate_t * init_state,char * fastmap)300 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
301 			 char *fastmap)
302 {
303   re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer;
304   Idx node_cnt;
305   bool icase = (dfa->mb_cur_max == 1 && (bufp->re_syntax & REG_IGNORE_CASE));
306   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
307     {
308       Idx node = init_state->nodes.elems[node_cnt];
309       re_token_type_t type = dfa->nodes[node].type;
310 
311       if (type == CHARACTER)
312 	{
313 	  re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
314 #ifdef RE_ENABLE_I18N
315 	  if ((bufp->re_syntax & REG_IGNORE_CASE) && dfa->mb_cur_max > 1)
316 	    {
317 	      unsigned char buf[MB_LEN_MAX];
318 	      unsigned char *p;
319 	      wchar_t wc;
320 	      mbstate_t state;
321 
322 	      p = buf;
323 	      *p++ = dfa->nodes[node].opr.c;
324 	      while (++node < dfa->nodes_len
325 		     &&	dfa->nodes[node].type == CHARACTER
326 		     && dfa->nodes[node].mb_partial)
327 		*p++ = dfa->nodes[node].opr.c;
328 	      memset (&state, 0, sizeof (state));
329 	      if (mbrtowc (&wc, (const char *) buf, p - buf,
330 			   &state) == p - buf
331 		  && (__wcrtomb ((char *) buf, towlower (wc), &state)
332 		      != (size_t) -1))
333 		re_set_fastmap (fastmap, false, buf[0]);
334 	    }
335 #endif
336 	}
337       else if (type == SIMPLE_BRACKET)
338 	{
339 	  int i, j, ch;
340 	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
341 	    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
342 	      if (dfa->nodes[node].opr.sbcset[i] & ((bitset_word) 1 << j))
343 		re_set_fastmap (fastmap, icase, ch);
344 	}
345 #ifdef RE_ENABLE_I18N
346       else if (type == COMPLEX_BRACKET)
347 	{
348 	  Idx i;
349 	  re_charset_t *cset = dfa->nodes[node].opr.mbcset;
350 	  if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
351 	      || cset->nranges || cset->nchar_classes)
352 	    {
353 # ifdef _LIBC
354 	      if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
355 		{
356 		  /* In this case we want to catch the bytes which are
357 		     the first byte of any collation elements.
358 		     e.g. In da_DK, we want to catch 'a' since "aa"
359 			  is a valid collation element, and don't catch
360 			  'b' since 'b' is the only collation element
361 			  which starts from 'b'.  */
362 		  const int32_t *table = (const int32_t *)
363 		    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
364 		  for (i = 0; i < SBC_MAX; ++i)
365 		    if (table[i] < 0)
366 		      re_set_fastmap (fastmap, icase, i);
367 		}
368 # else
369 	      if (dfa->mb_cur_max > 1)
370 		for (i = 0; i < SBC_MAX; ++i)
371 		  if (__btowc (i) == WEOF)
372 		    re_set_fastmap (fastmap, icase, i);
373 # endif /* not _LIBC */
374 	    }
375 	  for (i = 0; i < cset->nmbchars; ++i)
376 	    {
377 	      char buf[256];
378 	      mbstate_t state;
379 	      memset (&state, '\0', sizeof (state));
380 	      if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
381 		re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
382 	      if ((bufp->re_syntax & REG_IGNORE_CASE) && dfa->mb_cur_max > 1)
383 		{
384 		  if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
385 		      != (size_t) -1)
386 		    re_set_fastmap (fastmap, false, *(unsigned char *) buf);
387 		}
388 	    }
389 	}
390 #endif /* RE_ENABLE_I18N */
391       else if (type == OP_PERIOD
392 #ifdef RE_ENABLE_I18N
393 	       || type == OP_UTF8_PERIOD
394 #endif /* RE_ENABLE_I18N */
395 	       || type == END_OF_RE)
396 	{
397 	  memset (fastmap, '\1', sizeof (char) * SBC_MAX);
398 	  if (type == END_OF_RE)
399 	    bufp->re_can_be_null = 1;
400 	  return;
401 	}
402     }
403 }
404 
405 /* Entry point for POSIX code.  */
406 /* regcomp takes a regular expression as a string and compiles it.
407 
408    PREG is a regex_t *.  We do not expect any fields to be initialized,
409    since POSIX says we shouldn't.  Thus, we set
410 
411      `re_buffer' to the compiled pattern;
412      `re_used' to the length of the compiled pattern;
413      `re_syntax' to REG_SYNTAX_POSIX_EXTENDED if the
414        REG_EXTENDED bit in CFLAGS is set; otherwise, to
415        REG_SYNTAX_POSIX_BASIC;
416      `re_newline_anchor' to REG_NEWLINE being set in CFLAGS;
417      `re_fastmap' to an allocated space for the fastmap;
418      `re_fastmap_accurate' to zero;
419      `re_nsub' to the number of subexpressions in PATTERN.
420 
421    PATTERN is the address of the pattern string.
422 
423    CFLAGS is a series of bits which affect compilation.
424 
425      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
426      use POSIX basic syntax.
427 
428      If REG_NEWLINE is set, then . and [^...] don't match newline.
429      Also, regexec will try a match beginning after every newline.
430 
431      If REG_ICASE is set, then we considers upper- and lowercase
432      versions of letters to be equivalent when matching.
433 
434      If REG_NOSUB is set, then when PREG is passed to regexec, that
435      routine will report only success or failure, and nothing about the
436      registers.
437 
438    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
439    the return codes and their meanings.)  */
440 
441 int
regcomp(regex_t * __restrict preg,const char * __restrict pattern,int cflags)442 regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
443 {
444   reg_errcode_t ret;
445   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? REG_SYNTAX_POSIX_EXTENDED
446 			: REG_SYNTAX_POSIX_BASIC);
447 
448   preg->re_buffer = NULL;
449   preg->re_allocated = 0;
450   preg->re_used = 0;
451 
452   /* Try to allocate space for the fastmap.  */
453   preg->re_fastmap = re_malloc (char, SBC_MAX);
454   if (BE (preg->re_fastmap == NULL, 0))
455     return REG_ESPACE;
456 
457   syntax |= (cflags & REG_ICASE) ? REG_IGNORE_CASE : 0;
458 
459   /* If REG_NEWLINE is set, newlines are treated differently.  */
460   if (cflags & REG_NEWLINE)
461     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
462       syntax &= ~REG_DOT_NEWLINE;
463       syntax |= REG_HAT_LISTS_NOT_NEWLINE;
464       /* It also changes the matching behavior.  */
465       preg->re_newline_anchor = 1;
466     }
467   else
468     preg->re_newline_anchor = 0;
469   preg->re_no_sub = !!(cflags & REG_NOSUB);
470   preg->re_translate = NULL;
471 
472   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
473 
474   /* POSIX doesn't distinguish between an unmatched open-group and an
475      unmatched close-group: both are REG_EPAREN.  */
476   if (ret == REG_ERPAREN)
477     ret = REG_EPAREN;
478 
479   /* We have already checked preg->re_fastmap != NULL.  */
480   if (BE (ret == REG_NOERROR, 1))
481     /* Compute the fastmap now, since regexec cannot modify the pattern
482        buffer.  This function never fails in this implementation.  */
483     (void) re_compile_fastmap (preg);
484   else
485     {
486       /* Some error occurred while compiling the expression.  */
487       re_free (preg->re_fastmap);
488       preg->re_fastmap = NULL;
489     }
490 
491   return (int) ret;
492 }
493 #ifdef _LIBC
weak_alias(__regcomp,regcomp)494 weak_alias (__regcomp, regcomp)
495 #endif
496 
497 /* Returns a message corresponding to an error code, ERRCODE, returned
498    from either regcomp or regexec.   We don't use PREG here.  */
499 
500 size_t
501 regerror (int errcode, const regex_t *__restrict preg,
502 	  char *__restrict errbuf, size_t errbuf_size)
503 {
504   const char *msg;
505   size_t msg_size;
506 
507   if (BE (errcode < 0
508 	  || errcode >= (int) (sizeof (__re_error_msgid_idx)
509 			       / sizeof (__re_error_msgid_idx[0])), 0))
510     /* Only error codes returned by the rest of the code should be passed
511        to this routine.  If we are given anything else, or if other regex
512        code generates an invalid error code, then the program has a bug.
513        Dump core so we can fix it.  */
514     abort ();
515 
516   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
517 
518   msg_size = strlen (msg) + 1; /* Includes the null.  */
519 
520   if (BE (errbuf_size != 0, 1))
521     {
522       if (BE (msg_size > errbuf_size, 0))
523 	{
524 #if defined HAVE_MEMPCPY || defined _LIBC
525 	  *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
526 #else
527 	  memcpy (errbuf, msg, errbuf_size - 1);
528 	  errbuf[errbuf_size - 1] = 0;
529 #endif
530 	}
531       else
532 	memcpy (errbuf, msg, msg_size);
533     }
534 
535   return msg_size;
536 }
537 #ifdef _LIBC
538 weak_alias (__regerror, regerror)
539 #endif
540 
541 
542 #ifdef RE_ENABLE_I18N
543 /* This static array is used for the map to single-byte characters when
544    UTF-8 is used.  Otherwise we would allocate memory just to initialize
545    it the same all the time.  UTF-8 is the preferred encoding so this is
546    a worthwhile optimization.  */
547 static const bitset utf8_sb_map =
548 {
549   /* Set the first 128 bits.  */
550 # if 2 < BITSET_WORDS
551   BITSET_WORD_MAX,
552 # endif
553 # if 4 < BITSET_WORDS
554   BITSET_WORD_MAX,
555 # endif
556 # if 6 < BITSET_WORDS
557   BITSET_WORD_MAX,
558 # endif
559 # if 8 < BITSET_WORDS
560 #  error "Invalid BITSET_WORDS"
561 # endif
562   (BITSET_WORD_MAX
563    >> (SBC_MAX % BITSET_WORD_BITS == 0
564        ? 0
565        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
566 };
567 #endif
568 
569 
570 static void
free_dfa_content(re_dfa_t * dfa)571 free_dfa_content (re_dfa_t *dfa)
572 {
573   Idx i, j;
574 
575   if (dfa->nodes)
576     for (i = 0; i < dfa->nodes_len; ++i)
577       free_token (dfa->nodes + i);
578   re_free (dfa->nexts);
579   for (i = 0; i < dfa->nodes_len; ++i)
580     {
581       if (dfa->eclosures != NULL)
582 	re_node_set_free (dfa->eclosures + i);
583       if (dfa->inveclosures != NULL)
584 	re_node_set_free (dfa->inveclosures + i);
585       if (dfa->edests != NULL)
586 	re_node_set_free (dfa->edests + i);
587     }
588   re_free (dfa->edests);
589   re_free (dfa->eclosures);
590   re_free (dfa->inveclosures);
591   re_free (dfa->nodes);
592 
593   if (dfa->state_table)
594     for (i = 0; i <= dfa->state_hash_mask; ++i)
595       {
596 	struct re_state_table_entry *entry = dfa->state_table + i;
597 	for (j = 0; j < entry->num; ++j)
598 	  {
599 	    re_dfastate_t *state = entry->array[j];
600 	    free_state (state);
601 	  }
602         re_free (entry->array);
603       }
604   re_free (dfa->state_table);
605 #ifdef RE_ENABLE_I18N
606   if (dfa->sb_char != utf8_sb_map)
607     re_free (dfa->sb_char);
608 #endif
609   re_free (dfa->subexp_map);
610 #ifdef DEBUG
611   re_free (dfa->re_str);
612 #endif
613 
614   re_free (dfa);
615 }
616 
617 
618 /* Free dynamically allocated space used by PREG.  */
619 
620 void
regfree(regex_t * preg)621 regfree (regex_t *preg)
622 {
623   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
624   if (BE (dfa != NULL, 1))
625     free_dfa_content (dfa);
626   preg->re_buffer = NULL;
627   preg->re_allocated = 0;
628 
629   re_free (preg->re_fastmap);
630   preg->re_fastmap = NULL;
631 
632   re_free (preg->re_translate);
633   preg->re_translate = NULL;
634 }
635 #ifdef _LIBC
636 weak_alias (__regfree, regfree)
637 #endif
638 
639 /* Entry points compatible with 4.2 BSD regex library.  We don't define
640    them unless specifically requested.  */
641 
642 #if defined _REGEX_RE_COMP || defined _LIBC
643 
644 /* BSD has one and only one pattern buffer.  */
645 static struct re_pattern_buffer re_comp_buf;
646 
647 char *
648 # ifdef _LIBC
649 /* Make these definitions weak in libc, so POSIX programs can redefine
650    these names if they don't use our functions, and still use
651    regcomp/regexec above without link errors.  */
652 weak_function
653 # endif
re_comp(const char * s)654 re_comp (const char *s)
655 {
656   reg_errcode_t ret;
657   char *fastmap;
658 
659   if (!s)
660     {
661       if (!re_comp_buf.re_buffer)
662 	return gettext ("No previous regular expression");
663       return 0;
664     }
665 
666   if (re_comp_buf.re_buffer)
667     {
668       fastmap = re_comp_buf.re_fastmap;
669       re_comp_buf.re_fastmap = NULL;
670       __regfree (&re_comp_buf);
671       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
672       re_comp_buf.re_fastmap = fastmap;
673     }
674 
675   if (re_comp_buf.re_fastmap == NULL)
676     {
677       re_comp_buf.re_fastmap = (char *) malloc (SBC_MAX);
678       if (re_comp_buf.re_fastmap == NULL)
679 	return (char *) gettext (__re_error_msgid
680 				 + __re_error_msgid_idx[(int) REG_ESPACE]);
681     }
682 
683   /* Since `re_exec' always passes NULL for the `regs' argument, we
684      don't need to initialize the pattern buffer fields which affect it.  */
685 
686   /* Match anchors at newlines.  */
687   re_comp_buf.re_newline_anchor = 1;
688 
689   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
690 
691   if (!ret)
692     return NULL;
693 
694   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
695   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
696 }
697 
698 #ifdef _LIBC
libc_freeres_fn(free_mem)699 libc_freeres_fn (free_mem)
700 {
701   __regfree (&re_comp_buf);
702 }
703 #endif
704 
705 #endif /* _REGEX_RE_COMP */
706 
707 /* Internal entry point.
708    Compile the regular expression PATTERN, whose length is LENGTH.
709    SYNTAX indicate regular expression's syntax.  */
710 
711 static reg_errcode_t
re_compile_internal(regex_t * preg,const char * pattern,Idx length,reg_syntax_t syntax)712 re_compile_internal (regex_t *preg, const char *pattern, Idx length,
713 		     reg_syntax_t syntax)
714 {
715   reg_errcode_t err = REG_NOERROR;
716   re_dfa_t *dfa;
717   re_string_t regexp;
718 
719   /* Initialize the pattern buffer.  */
720   preg->re_fastmap_accurate = 0;
721   preg->re_syntax = syntax;
722   preg->re_not_bol = preg->re_not_eol = 0;
723   preg->re_used = 0;
724   preg->re_nsub = 0;
725   preg->re_can_be_null = 0;
726   preg->re_regs_allocated = REG_UNALLOCATED;
727 
728   /* Initialize the dfa.  */
729   dfa = (re_dfa_t *) preg->re_buffer;
730   if (BE (preg->re_allocated < sizeof (re_dfa_t), 0))
731     {
732       /* If zero allocated, but buffer is non-null, try to realloc
733 	 enough space.  This loses if buffer's address is bogus, but
734 	 that is the user's responsibility.  If buffer is null this
735 	 is a simple allocation.  */
736       dfa = re_realloc (preg->re_buffer, re_dfa_t, 1);
737       if (dfa == NULL)
738 	return REG_ESPACE;
739       preg->re_allocated = sizeof (re_dfa_t);
740       preg->re_buffer = (unsigned char *) dfa;
741     }
742   preg->re_used = sizeof (re_dfa_t);
743 
744   __libc_lock_init (dfa->lock);
745 
746   err = init_dfa (dfa, length);
747   if (BE (err != REG_NOERROR, 0))
748     {
749       free_dfa_content (dfa);
750       preg->re_buffer = NULL;
751       preg->re_allocated = 0;
752       return err;
753     }
754 #ifdef DEBUG
755   dfa->re_str = re_malloc (char, length + 1);
756   strncpy (dfa->re_str, pattern, length + 1);
757 #endif
758 
759   err = re_string_construct (&regexp, pattern, length, preg->re_translate,
760 			     syntax & REG_IGNORE_CASE, dfa);
761   if (BE (err != REG_NOERROR, 0))
762     {
763     re_compile_internal_free_return:
764       free_workarea_compile (preg);
765       re_string_destruct (&regexp);
766       free_dfa_content (dfa);
767       preg->re_buffer = NULL;
768       preg->re_allocated = 0;
769       return err;
770     }
771 
772   /* Parse the regular expression, and build a structure tree.  */
773   preg->re_nsub = 0;
774   dfa->str_tree = parse (&regexp, preg, syntax, &err);
775   if (BE (dfa->str_tree == NULL, 0))
776     goto re_compile_internal_free_return;
777 
778   /* Analyze the tree and create the nfa.  */
779   err = analyze (preg);
780   if (BE (err != REG_NOERROR, 0))
781     goto re_compile_internal_free_return;
782 
783 #ifdef RE_ENABLE_I18N
784   /* If possible, do searching in single byte encoding to speed things up.  */
785   if (dfa->is_utf8 && !(syntax & REG_IGNORE_CASE) && preg->re_translate == NULL)
786     optimize_utf8 (dfa);
787 #endif
788 
789   /* Then create the initial state of the dfa.  */
790   err = create_initial_state (dfa);
791 
792   /* Release work areas.  */
793   free_workarea_compile (preg);
794   re_string_destruct (&regexp);
795 
796   if (BE (err != REG_NOERROR, 0))
797     {
798       free_dfa_content (dfa);
799       preg->re_buffer = NULL;
800       preg->re_allocated = 0;
801     }
802 
803   return err;
804 }
805 
806 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
807    as the initial length of some arrays.  */
808 
809 static reg_errcode_t
init_dfa(re_dfa_t * dfa,Idx pat_len)810 init_dfa (re_dfa_t *dfa, Idx pat_len)
811 {
812   __re_size_t table_size;
813 #ifndef _LIBC
814   char *codeset_name;
815 #endif
816 
817   memset (dfa, '\0', sizeof (re_dfa_t));
818 
819   /* Force allocation of str_tree_storage the first time.  */
820   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
821 
822   dfa->nodes_alloc = pat_len + 1;
823   dfa->nodes = re_xmalloc (re_token_t, dfa->nodes_alloc);
824 
825   /*  table_size = 2 ^ ceil(log pat_len) */
826   for (table_size = 1; table_size <= pat_len; table_size <<= 1)
827     if (0 < (Idx) -1 && table_size == 0)
828       return REG_ESPACE;
829 
830   dfa->state_table = re_calloc (struct re_state_table_entry, table_size);
831   dfa->state_hash_mask = table_size - 1;
832 
833   dfa->mb_cur_max = MB_CUR_MAX;
834 #ifdef _LIBC
835   if (dfa->mb_cur_max == 6
836       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
837     dfa->is_utf8 = 1;
838   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
839 		       != 0);
840 #else
841 # ifdef HAVE_LANGINFO_CODESET
842   codeset_name = nl_langinfo (CODESET);
843 # else
844   codeset_name = getenv ("LC_ALL");
845   if (codeset_name == NULL || codeset_name[0] == '\0')
846     codeset_name = getenv ("LC_CTYPE");
847   if (codeset_name == NULL || codeset_name[0] == '\0')
848     codeset_name = getenv ("LANG");
849   if (codeset_name == NULL)
850     codeset_name = "";
851   else if (strchr (codeset_name, '.') !=  NULL)
852     codeset_name = strchr (codeset_name, '.') + 1;
853 # endif
854 
855   if (strcasecmp (codeset_name, "UTF-8") == 0
856       || strcasecmp (codeset_name, "UTF8") == 0)
857     dfa->is_utf8 = 1;
858 
859   /* We check exhaustively in the loop below if this charset is a
860      superset of ASCII.  */
861   dfa->map_notascii = 0;
862 #endif
863 
864 #ifdef RE_ENABLE_I18N
865   if (dfa->mb_cur_max > 1)
866     {
867       if (dfa->is_utf8)
868 	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
869       else
870 	{
871 	  int i, j, ch;
872 
873 	  dfa->sb_char = re_calloc (bitset_word, BITSET_WORDS);
874 	  if (BE (dfa->sb_char == NULL, 0))
875 	    return REG_ESPACE;
876 
877 	  /* Set the bits corresponding to single byte chars.  */
878 	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
879 	    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
880 	      {
881 		wint_t wch = __btowc (ch);
882 		if (wch != WEOF)
883 		  dfa->sb_char[i] |= (bitset_word) 1 << j;
884 # ifndef _LIBC
885 		if (isascii (ch) && wch != ch)
886 		  dfa->map_notascii = 1;
887 # endif
888 	      }
889 	}
890     }
891 #endif
892 
893   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
894     return REG_ESPACE;
895   return REG_NOERROR;
896 }
897 
898 /* Initialize WORD_CHAR table, which indicate which character is
899    "word".  In this case "word" means that it is the word construction
900    character used by some operators like "\<", "\>", etc.  */
901 
902 static void
init_word_char(re_dfa_t * dfa)903 init_word_char (re_dfa_t *dfa)
904 {
905   int i, j, ch;
906   dfa->word_ops_used = 1;
907   for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
908     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
909       if (isalnum (ch) || ch == '_')
910 	dfa->word_char[i] |= (bitset_word) 1 << j;
911 }
912 
913 /* Free the work area which are only used while compiling.  */
914 
915 static void
free_workarea_compile(regex_t * preg)916 free_workarea_compile (regex_t *preg)
917 {
918   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
919   bin_tree_storage_t *storage, *next;
920   for (storage = dfa->str_tree_storage; storage; storage = next)
921     {
922       next = storage->next;
923       re_free (storage);
924     }
925   dfa->str_tree_storage = NULL;
926   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
927   dfa->str_tree = NULL;
928   re_free (dfa->org_indices);
929   dfa->org_indices = NULL;
930 }
931 
932 /* Create initial states for all contexts.  */
933 
934 static reg_errcode_t
create_initial_state(re_dfa_t * dfa)935 create_initial_state (re_dfa_t *dfa)
936 {
937   Idx first, i;
938   reg_errcode_t err;
939   re_node_set init_nodes;
940 
941   /* Initial states have the epsilon closure of the node which is
942      the first node of the regular expression.  */
943   first = dfa->str_tree->first->node_idx;
944   dfa->init_node = first;
945   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
946   if (BE (err != REG_NOERROR, 0))
947     return err;
948 
949   /* The back-references which are in initial states can epsilon transit,
950      since in this case all of the subexpressions can be null.
951      Then we add epsilon closures of the nodes which are the next nodes of
952      the back-references.  */
953   if (dfa->nbackref > 0)
954     for (i = 0; i < init_nodes.nelem; ++i)
955       {
956 	Idx node_idx = init_nodes.elems[i];
957 	re_token_type_t type = dfa->nodes[node_idx].type;
958 
959 	Idx clexp_idx;
960 	if (type != OP_BACK_REF)
961 	  continue;
962 	for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
963 	  {
964 	    re_token_t *clexp_node;
965 	    clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
966 	    if (clexp_node->type == OP_CLOSE_SUBEXP
967 		&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
968 	      break;
969 	  }
970 	if (clexp_idx == init_nodes.nelem)
971 	  continue;
972 
973 	if (type == OP_BACK_REF)
974 	  {
975 	    Idx dest_idx = dfa->edests[node_idx].elems[0];
976 	    if (!re_node_set_contains (&init_nodes, dest_idx))
977 	      {
978 		re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
979 		i = 0;
980 	      }
981 	  }
982       }
983 
984   /* It must be the first time to invoke acquire_state.  */
985   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
986   /* We don't check ERR here, since the initial state must not be NULL.  */
987   if (BE (dfa->init_state == NULL, 0))
988     return err;
989   if (dfa->init_state->has_constraint)
990     {
991       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
992 						       CONTEXT_WORD);
993       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
994 						     CONTEXT_NEWLINE);
995       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
996 							 &init_nodes,
997 							 CONTEXT_NEWLINE
998 							 | CONTEXT_BEGBUF);
999       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1000 	      || dfa->init_state_begbuf == NULL, 0))
1001 	return err;
1002     }
1003   else
1004     dfa->init_state_word = dfa->init_state_nl
1005       = dfa->init_state_begbuf = dfa->init_state;
1006 
1007   re_node_set_free (&init_nodes);
1008   return REG_NOERROR;
1009 }
1010 
1011 #ifdef RE_ENABLE_I18N
1012 /* If it is possible to do searching in single byte encoding instead of UTF-8
1013    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1014    DFA nodes where needed.  */
1015 
1016 static void
optimize_utf8(re_dfa_t * dfa)1017 optimize_utf8 (re_dfa_t *dfa)
1018 {
1019   Idx node;
1020   int i;
1021   bool mb_chars = false;
1022   bool has_period = false;
1023 
1024   for (node = 0; node < dfa->nodes_len; ++node)
1025     switch (dfa->nodes[node].type)
1026       {
1027       case CHARACTER:
1028 	if (dfa->nodes[node].opr.c >= 0x80)
1029 	  mb_chars = true;
1030 	break;
1031       case ANCHOR:
1032 	switch (dfa->nodes[node].opr.idx)
1033 	  {
1034 	  case LINE_FIRST:
1035 	  case LINE_LAST:
1036 	  case BUF_FIRST:
1037 	  case BUF_LAST:
1038 	    break;
1039 	  default:
1040 	    /* Word anchors etc. cannot be handled.  */
1041 	    return;
1042 	  }
1043 	break;
1044       case OP_PERIOD:
1045         has_period = true;
1046         break;
1047       case OP_BACK_REF:
1048       case OP_ALT:
1049       case END_OF_RE:
1050       case OP_DUP_ASTERISK:
1051       case OP_OPEN_SUBEXP:
1052       case OP_CLOSE_SUBEXP:
1053 	break;
1054       case COMPLEX_BRACKET:
1055 	return;
1056       case SIMPLE_BRACKET:
1057 	/* Just double check.  */
1058 	{
1059 	  int rshift =
1060 	    (SBC_MAX / 2 % BITSET_WORD_BITS == 0
1061 	     ? 0
1062 	     : BITSET_WORD_BITS - SBC_MAX / 2 % BITSET_WORD_BITS);
1063 	  for (i = SBC_MAX / 2 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1064 	    {
1065 	      if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1066 		return;
1067 	      rshift = 0;
1068 	    }
1069 	}
1070 	break;
1071       default:
1072 	abort ();
1073       }
1074 
1075   if (mb_chars || has_period)
1076     for (node = 0; node < dfa->nodes_len; ++node)
1077       {
1078 	if (dfa->nodes[node].type == CHARACTER
1079 	    && dfa->nodes[node].opr.c >= 0x80)
1080 	  dfa->nodes[node].mb_partial = 0;
1081 	else if (dfa->nodes[node].type == OP_PERIOD)
1082 	  dfa->nodes[node].type = OP_UTF8_PERIOD;
1083       }
1084 
1085   /* The search can be in single byte locale.  */
1086   dfa->mb_cur_max = 1;
1087   dfa->is_utf8 = 0;
1088   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1089 }
1090 #endif
1091 
1092 /* Analyze the structure tree, and calculate "first", "next", "edest",
1093    "eclosure", and "inveclosure".  */
1094 
1095 static reg_errcode_t
analyze(regex_t * preg)1096 analyze (regex_t *preg)
1097 {
1098   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
1099   reg_errcode_t ret;
1100 
1101   /* Allocate arrays.  */
1102   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1103   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1104   dfa->edests = re_xmalloc (re_node_set, dfa->nodes_alloc);
1105   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1106   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1107 	  || dfa->eclosures == NULL, 0))
1108     return REG_ESPACE;
1109 
1110   dfa->subexp_map = re_xmalloc (Idx, preg->re_nsub);
1111   if (dfa->subexp_map != NULL)
1112     {
1113       Idx i;
1114       for (i = 0; i < preg->re_nsub; i++)
1115 	dfa->subexp_map[i] = i;
1116       preorder (dfa->str_tree, optimize_subexps, dfa);
1117       for (i = 0; i < preg->re_nsub; i++)
1118 	if (dfa->subexp_map[i] != i)
1119 	  break;
1120       if (i == preg->re_nsub)
1121 	{
1122 	  free (dfa->subexp_map);
1123 	  dfa->subexp_map = NULL;
1124 	}
1125     }
1126 
1127   ret = postorder (dfa->str_tree, lower_subexps, preg);
1128   if (BE (ret != REG_NOERROR, 0))
1129     return ret;
1130   ret = postorder (dfa->str_tree, calc_first, dfa);
1131   if (BE (ret != REG_NOERROR, 0))
1132     return ret;
1133   preorder (dfa->str_tree, calc_next, dfa);
1134   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1135   if (BE (ret != REG_NOERROR, 0))
1136     return ret;
1137   ret = calc_eclosure (dfa);
1138   if (BE (ret != REG_NOERROR, 0))
1139     return ret;
1140 
1141   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1142      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1143   if ((!preg->re_no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1144       || dfa->nbackref)
1145     {
1146       dfa->inveclosures = re_xmalloc (re_node_set, dfa->nodes_len);
1147       if (BE (dfa->inveclosures == NULL, 0))
1148         return REG_ESPACE;
1149       ret = calc_inveclosure (dfa);
1150     }
1151 
1152   return ret;
1153 }
1154 
1155 /* Our parse trees are very unbalanced, so we cannot use a stack to
1156    implement parse tree visits.  Instead, we use parent pointers and
1157    some hairy code in these two functions.  */
1158 static reg_errcode_t
postorder(bin_tree_t * root,reg_errcode_t (fn (void *,bin_tree_t *)),void * extra)1159 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1160 	   void *extra)
1161 {
1162   bin_tree_t *node, *prev;
1163 
1164   for (node = root; ; )
1165     {
1166       /* Descend down the tree, preferably to the left (or to the right
1167 	 if that's the only child).  */
1168       while (node->left || node->right)
1169 	if (node->left)
1170           node = node->left;
1171         else
1172           node = node->right;
1173 
1174       do
1175 	{
1176 	  reg_errcode_t err = fn (extra, node);
1177 	  if (BE (err != REG_NOERROR, 0))
1178 	    return err;
1179           if (node->parent == NULL)
1180 	    return REG_NOERROR;
1181 	  prev = node;
1182 	  node = node->parent;
1183 	}
1184       /* Go up while we have a node that is reached from the right.  */
1185       while (node->right == prev || node->right == NULL);
1186       node = node->right;
1187     }
1188 }
1189 
1190 static reg_errcode_t
preorder(bin_tree_t * root,reg_errcode_t (fn (void *,bin_tree_t *)),void * extra)1191 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1192 	  void *extra)
1193 {
1194   bin_tree_t *node;
1195 
1196   for (node = root; ; )
1197     {
1198       reg_errcode_t err = fn (extra, node);
1199       if (BE (err != REG_NOERROR, 0))
1200 	return err;
1201 
1202       /* Go to the left node, or up and to the right.  */
1203       if (node->left)
1204 	node = node->left;
1205       else
1206 	{
1207 	  bin_tree_t *prev = NULL;
1208 	  while (node->right == prev || node->right == NULL)
1209 	    {
1210 	      prev = node;
1211 	      node = node->parent;
1212 	      if (!node)
1213 	        return REG_NOERROR;
1214 	    }
1215 	  node = node->right;
1216 	}
1217     }
1218 }
1219 
1220 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1221    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1222    backreferences as well.  Requires a preorder visit.  */
1223 static reg_errcode_t
optimize_subexps(void * extra,bin_tree_t * node)1224 optimize_subexps (void *extra, bin_tree_t *node)
1225 {
1226   re_dfa_t *dfa = (re_dfa_t *) extra;
1227 
1228   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1229     {
1230       int idx = node->token.opr.idx;
1231       node->token.opr.idx = dfa->subexp_map[idx];
1232       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1233     }
1234 
1235   else if (node->token.type == SUBEXP
1236            && node->left && node->left->token.type == SUBEXP)
1237     {
1238       Idx other_idx = node->left->token.opr.idx;
1239 
1240       node->left = node->left->left;
1241       if (node->left)
1242         node->left->parent = node;
1243 
1244       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1245       if (other_idx < BITSET_WORD_BITS)
1246 	dfa->used_bkref_map &= ~ ((bitset_word) 1 << other_idx);
1247     }
1248 
1249   return REG_NOERROR;
1250 }
1251 
1252 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1253    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1254 static reg_errcode_t
lower_subexps(void * extra,bin_tree_t * node)1255 lower_subexps (void *extra, bin_tree_t *node)
1256 {
1257   regex_t *preg = (regex_t *) extra;
1258   reg_errcode_t err = REG_NOERROR;
1259 
1260   if (node->left && node->left->token.type == SUBEXP)
1261     {
1262       node->left = lower_subexp (&err, preg, node->left);
1263       if (node->left)
1264 	node->left->parent = node;
1265     }
1266   if (node->right && node->right->token.type == SUBEXP)
1267     {
1268       node->right = lower_subexp (&err, preg, node->right);
1269       if (node->right)
1270 	node->right->parent = node;
1271     }
1272 
1273   return err;
1274 }
1275 
1276 static bin_tree_t *
lower_subexp(reg_errcode_t * err,regex_t * preg,bin_tree_t * node)1277 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1278 {
1279   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
1280   bin_tree_t *body = node->left;
1281   bin_tree_t *op, *cls, *tree1, *tree;
1282 
1283   if (preg->re_no_sub
1284       /* We do not optimize empty subexpressions, because otherwise we may
1285 	 have bad CONCAT nodes with NULL children.  This is obviously not
1286 	 very common, so we do not lose much.  An example that triggers
1287 	 this case is the sed "script" /\(\)/x.  */
1288       && node->left != NULL
1289       && ! (node->token.opr.idx < BITSET_WORD_BITS
1290 	    && dfa->used_bkref_map & ((bitset_word) 1 << node->token.opr.idx)))
1291     return node->left;
1292 
1293   /* Convert the SUBEXP node to the concatenation of an
1294      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1295   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1296   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1297   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1298   tree = create_tree (dfa, op, tree1, CONCAT);
1299   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1300     {
1301       *err = REG_ESPACE;
1302       return NULL;
1303     }
1304 
1305   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1306   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1307   return tree;
1308 }
1309 
1310 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1311    nodes.  Requires a postorder visit.  */
1312 static reg_errcode_t
calc_first(void * extra,bin_tree_t * node)1313 calc_first (void *extra, bin_tree_t *node)
1314 {
1315   re_dfa_t *dfa = (re_dfa_t *) extra;
1316   if (node->token.type == CONCAT)
1317     {
1318       node->first = node->left->first;
1319       node->node_idx = node->left->node_idx;
1320     }
1321   else
1322     {
1323       node->first = node;
1324       node->node_idx = re_dfa_add_node (dfa, node->token);
1325       if (BE (node->node_idx == REG_MISSING, 0))
1326         return REG_ESPACE;
1327     }
1328   return REG_NOERROR;
1329 }
1330 
1331 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1332 static reg_errcode_t
calc_next(void * extra,bin_tree_t * node)1333 calc_next (void *extra, bin_tree_t *node)
1334 {
1335   switch (node->token.type)
1336     {
1337     case OP_DUP_ASTERISK:
1338       node->left->next = node;
1339       break;
1340     case CONCAT:
1341       node->left->next = node->right->first;
1342       node->right->next = node->next;
1343       break;
1344     default:
1345       if (node->left)
1346 	node->left->next = node->next;
1347       if (node->right)
1348         node->right->next = node->next;
1349       break;
1350     }
1351   return REG_NOERROR;
1352 }
1353 
1354 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1355 static reg_errcode_t
link_nfa_nodes(void * extra,bin_tree_t * node)1356 link_nfa_nodes (void *extra, bin_tree_t *node)
1357 {
1358   re_dfa_t *dfa = (re_dfa_t *) extra;
1359   Idx idx = node->node_idx;
1360   reg_errcode_t err = REG_NOERROR;
1361 
1362   switch (node->token.type)
1363     {
1364     case CONCAT:
1365       break;
1366 
1367     case END_OF_RE:
1368       assert (node->next == NULL);
1369       break;
1370 
1371     case OP_DUP_ASTERISK:
1372     case OP_ALT:
1373       {
1374 	Idx left, right;
1375 	dfa->has_plural_match = 1;
1376 	if (node->left != NULL)
1377 	  left = node->left->first->node_idx;
1378 	else
1379 	  left = node->next->node_idx;
1380 	if (node->right != NULL)
1381 	  right = node->right->first->node_idx;
1382 	else
1383 	  right = node->next->node_idx;
1384 	assert (REG_VALID_INDEX (left));
1385 	assert (REG_VALID_INDEX (right));
1386 	err = re_node_set_init_2 (dfa->edests + idx, left, right);
1387       }
1388       break;
1389 
1390     case ANCHOR:
1391     case OP_OPEN_SUBEXP:
1392     case OP_CLOSE_SUBEXP:
1393       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1394       break;
1395 
1396     case OP_BACK_REF:
1397       dfa->nexts[idx] = node->next->node_idx;
1398       if (node->token.type == OP_BACK_REF)
1399 	re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1400       break;
1401 
1402     default:
1403       assert (!IS_EPSILON_NODE (node->token.type));
1404       dfa->nexts[idx] = node->next->node_idx;
1405       break;
1406     }
1407 
1408   return err;
1409 }
1410 
1411 /* Duplicate the epsilon closure of the node ROOT_NODE.
1412    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1413    to their own constraint.  */
1414 
1415 static reg_errcode_t
duplicate_node_closure(re_dfa_t * dfa,Idx top_org_node,Idx top_clone_node,Idx root_node,unsigned int init_constraint)1416 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node,
1417 			Idx top_clone_node, Idx root_node,
1418 			unsigned int init_constraint)
1419 {
1420   Idx org_node, clone_node;
1421   bool ok;
1422   unsigned int constraint = init_constraint;
1423   for (org_node = top_org_node, clone_node = top_clone_node;;)
1424     {
1425       Idx org_dest, clone_dest;
1426       if (dfa->nodes[org_node].type == OP_BACK_REF)
1427 	{
1428 	  /* If the back reference epsilon-transit, its destination must
1429 	     also have the constraint.  Then duplicate the epsilon closure
1430 	     of the destination of the back reference, and store it in
1431 	     edests of the back reference.  */
1432 	  org_dest = dfa->nexts[org_node];
1433 	  re_node_set_empty (dfa->edests + clone_node);
1434 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1435 	  if (BE (clone_dest == REG_MISSING, 0))
1436 	    return REG_ESPACE;
1437 	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1438 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1439 	  if (BE (! ok, 0))
1440 	    return REG_ESPACE;
1441 	}
1442       else if (dfa->edests[org_node].nelem == 0)
1443 	{
1444 	  /* In case of the node can't epsilon-transit, don't duplicate the
1445 	     destination and store the original destination as the
1446 	     destination of the node.  */
1447 	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1448 	  break;
1449 	}
1450       else if (dfa->edests[org_node].nelem == 1)
1451 	{
1452 	  /* In case of the node can epsilon-transit, and it has only one
1453 	     destination.  */
1454 	  org_dest = dfa->edests[org_node].elems[0];
1455 	  re_node_set_empty (dfa->edests + clone_node);
1456 	  if (dfa->nodes[org_node].type == ANCHOR)
1457 	    {
1458 	      /* In case of the node has another constraint, append it.  */
1459 	      if (org_node == root_node && clone_node != org_node)
1460 		{
1461 		  /* ...but if the node is root_node itself, it means the
1462 		     epsilon closure have a loop, then tie it to the
1463 		     destination of the root_node.  */
1464 		  ok = re_node_set_insert (dfa->edests + clone_node,
1465 					    org_dest);
1466 		  if (BE (! ok, 0))
1467 		    return REG_ESPACE;
1468 		  break;
1469 		}
1470 	      constraint |= dfa->nodes[org_node].opr.ctx_type;
1471 	    }
1472 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1473 	  if (BE (clone_dest == REG_MISSING, 0))
1474 	    return REG_ESPACE;
1475 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1476 	  if (BE (! ok, 0))
1477 	    return REG_ESPACE;
1478 	}
1479       else /* dfa->edests[org_node].nelem == 2 */
1480 	{
1481 	  /* In case of the node can epsilon-transit, and it has two
1482 	     destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1483 	  org_dest = dfa->edests[org_node].elems[0];
1484 	  re_node_set_empty (dfa->edests + clone_node);
1485 	  /* Search for a duplicated node which satisfies the constraint.  */
1486 	  clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1487 	  if (clone_dest == REG_MISSING)
1488 	    {
1489 	      /* There are no such a duplicated node, create a new one.  */
1490 	      reg_errcode_t err;
1491 	      clone_dest = duplicate_node (dfa, org_dest, constraint);
1492 	      if (BE (clone_dest == REG_MISSING, 0))
1493 		return REG_ESPACE;
1494 	      ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1495 	      if (BE (! ok, 0))
1496 		return REG_ESPACE;
1497 	      err = duplicate_node_closure (dfa, org_dest, clone_dest,
1498 					    root_node, constraint);
1499 	      if (BE (err != REG_NOERROR, 0))
1500 		return err;
1501 	    }
1502 	  else
1503 	    {
1504 	      /* There are a duplicated node which satisfy the constraint,
1505 		 use it to avoid infinite loop.  */
1506 	      ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1507 	      if (BE (! ok, 0))
1508 		return REG_ESPACE;
1509 	    }
1510 
1511 	  org_dest = dfa->edests[org_node].elems[1];
1512 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1513 	  if (BE (clone_dest == REG_MISSING, 0))
1514 	    return REG_ESPACE;
1515 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1516 	  if (BE (! ok, 0))
1517 	    return REG_ESPACE;
1518 	}
1519       org_node = org_dest;
1520       clone_node = clone_dest;
1521     }
1522   return REG_NOERROR;
1523 }
1524 
1525 /* Search for a node which is duplicated from the node ORG_NODE, and
1526    satisfies the constraint CONSTRAINT.  */
1527 
1528 static Idx
search_duplicated_node(const re_dfa_t * dfa,Idx org_node,unsigned int constraint)1529 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1530 			unsigned int constraint)
1531 {
1532   Idx idx;
1533   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1534     {
1535       if (org_node == dfa->org_indices[idx]
1536 	  && constraint == dfa->nodes[idx].constraint)
1537 	return idx; /* Found.  */
1538     }
1539   return REG_MISSING; /* Not found.  */
1540 }
1541 
1542 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1543    Return the index of the new node, or REG_MISSING if insufficient storage is
1544    available.  */
1545 
1546 static Idx
duplicate_node(re_dfa_t * dfa,Idx org_idx,unsigned int constraint)1547 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1548 {
1549   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1550   if (BE (dup_idx != REG_MISSING, 1))
1551     {
1552       dfa->nodes[dup_idx].constraint = constraint;
1553       if (dfa->nodes[org_idx].type == ANCHOR)
1554 	dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1555       dfa->nodes[dup_idx].duplicated = 1;
1556 
1557       /* Store the index of the original node.  */
1558       dfa->org_indices[dup_idx] = org_idx;
1559     }
1560   return dup_idx;
1561 }
1562 
1563 static reg_errcode_t
calc_inveclosure(re_dfa_t * dfa)1564 calc_inveclosure (re_dfa_t *dfa)
1565 {
1566   Idx src, idx;
1567   bool ok;
1568   for (idx = 0; idx < dfa->nodes_len; ++idx)
1569     re_node_set_init_empty (dfa->inveclosures + idx);
1570 
1571   for (src = 0; src < dfa->nodes_len; ++src)
1572     {
1573       Idx *elems = dfa->eclosures[src].elems;
1574       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1575 	{
1576 	  ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1577 	  if (BE (! ok, 0))
1578 	    return REG_ESPACE;
1579 	}
1580     }
1581 
1582   return REG_NOERROR;
1583 }
1584 
1585 /* Calculate "eclosure" for all the node in DFA.  */
1586 
1587 static reg_errcode_t
calc_eclosure(re_dfa_t * dfa)1588 calc_eclosure (re_dfa_t *dfa)
1589 {
1590   Idx node_idx;
1591   bool incomplete;
1592 #ifdef DEBUG
1593   assert (dfa->nodes_len > 0);
1594 #endif
1595   incomplete = false;
1596   /* For each nodes, calculate epsilon closure.  */
1597   for (node_idx = 0; ; ++node_idx)
1598     {
1599       reg_errcode_t err;
1600       re_node_set eclosure_elem;
1601       if (node_idx == dfa->nodes_len)
1602 	{
1603 	  if (!incomplete)
1604 	    break;
1605 	  incomplete = false;
1606 	  node_idx = 0;
1607 	}
1608 
1609 #ifdef DEBUG
1610       assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1611 #endif
1612 
1613       /* If we have already calculated, skip it.  */
1614       if (dfa->eclosures[node_idx].nelem != 0)
1615 	continue;
1616       /* Calculate epsilon closure of `node_idx'.  */
1617       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1618       if (BE (err != REG_NOERROR, 0))
1619 	return err;
1620 
1621       if (dfa->eclosures[node_idx].nelem == 0)
1622 	{
1623 	  incomplete = true;
1624 	  re_node_set_free (&eclosure_elem);
1625 	}
1626     }
1627   return REG_NOERROR;
1628 }
1629 
1630 /* Calculate epsilon closure of NODE.  */
1631 
1632 static reg_errcode_t
calc_eclosure_iter(re_node_set * new_set,re_dfa_t * dfa,Idx node,bool root)1633 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1634 {
1635   reg_errcode_t err;
1636   unsigned int constraint;
1637   Idx i;
1638   bool incomplete;
1639   bool ok;
1640   re_node_set eclosure;
1641   incomplete = false;
1642   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1643   if (BE (err != REG_NOERROR, 0))
1644     return err;
1645 
1646   /* This indicates that we are calculating this node now.
1647      We reference this value to avoid infinite loop.  */
1648   dfa->eclosures[node].nelem = REG_MISSING;
1649 
1650   constraint = ((dfa->nodes[node].type == ANCHOR)
1651 		? dfa->nodes[node].opr.ctx_type : 0);
1652   /* If the current node has constraints, duplicate all nodes.
1653      Since they must inherit the constraints.  */
1654   if (constraint
1655       && dfa->edests[node].nelem
1656       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1657     {
1658       Idx org_node, cur_node;
1659       org_node = cur_node = node;
1660       err = duplicate_node_closure (dfa, node, node, node, constraint);
1661       if (BE (err != REG_NOERROR, 0))
1662 	return err;
1663     }
1664 
1665   /* Expand each epsilon destination nodes.  */
1666   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1667     for (i = 0; i < dfa->edests[node].nelem; ++i)
1668       {
1669 	re_node_set eclosure_elem;
1670 	Idx edest = dfa->edests[node].elems[i];
1671 	/* If calculating the epsilon closure of `edest' is in progress,
1672 	   return intermediate result.  */
1673 	if (dfa->eclosures[edest].nelem == REG_MISSING)
1674 	  {
1675 	    incomplete = true;
1676 	    continue;
1677 	  }
1678 	/* If we haven't calculated the epsilon closure of `edest' yet,
1679 	   calculate now. Otherwise use calculated epsilon closure.  */
1680 	if (dfa->eclosures[edest].nelem == 0)
1681 	  {
1682 	    err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1683 	    if (BE (err != REG_NOERROR, 0))
1684 	      return err;
1685 	  }
1686 	else
1687 	  eclosure_elem = dfa->eclosures[edest];
1688 	/* Merge the epsilon closure of `edest'.  */
1689 	re_node_set_merge (&eclosure, &eclosure_elem);
1690 	/* If the epsilon closure of `edest' is incomplete,
1691 	   the epsilon closure of this node is also incomplete.  */
1692 	if (dfa->eclosures[edest].nelem == 0)
1693 	  {
1694 	    incomplete = true;
1695 	    re_node_set_free (&eclosure_elem);
1696 	  }
1697       }
1698 
1699   /* Epsilon closures include itself.  */
1700   ok = re_node_set_insert (&eclosure, node);
1701   if (BE (! ok, 0))
1702     return REG_ESPACE;
1703   if (incomplete && !root)
1704     dfa->eclosures[node].nelem = 0;
1705   else
1706     dfa->eclosures[node] = eclosure;
1707   *new_set = eclosure;
1708   return REG_NOERROR;
1709 }
1710 
1711 /* Functions for token which are used in the parser.  */
1712 
1713 /* Fetch a token from INPUT.
1714    We must not use this function inside bracket expressions.  */
1715 
1716 static void
fetch_token(re_token_t * result,re_string_t * input,reg_syntax_t syntax)1717 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1718 {
1719   re_string_skip_bytes (input, peek_token (result, input, syntax));
1720 }
1721 
1722 /* Peek a token from INPUT, and return the length of the token.
1723    We must not use this function inside bracket expressions.  */
1724 
1725 static int
peek_token(re_token_t * token,re_string_t * input,reg_syntax_t syntax)1726 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1727 {
1728   unsigned char c;
1729 
1730   if (re_string_eoi (input))
1731     {
1732       token->type = END_OF_RE;
1733       return 0;
1734     }
1735 
1736   c = re_string_peek_byte (input, 0);
1737   token->opr.c = c;
1738 
1739   token->word_char = 0;
1740 #ifdef RE_ENABLE_I18N
1741   token->mb_partial = 0;
1742   if (input->mb_cur_max > 1 &&
1743       !re_string_first_byte (input, re_string_cur_idx (input)))
1744     {
1745       token->type = CHARACTER;
1746       token->mb_partial = 1;
1747       return 1;
1748     }
1749 #endif
1750   if (c == '\\')
1751     {
1752       unsigned char c2;
1753       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1754 	{
1755 	  token->type = BACK_SLASH;
1756 	  return 1;
1757 	}
1758 
1759       c2 = re_string_peek_byte_case (input, 1);
1760       token->opr.c = c2;
1761       token->type = CHARACTER;
1762 #ifdef RE_ENABLE_I18N
1763       if (input->mb_cur_max > 1)
1764 	{
1765 	  wint_t wc = re_string_wchar_at (input,
1766 					  re_string_cur_idx (input) + 1);
1767 	  token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1768 	}
1769       else
1770 #endif
1771 	token->word_char = IS_WORD_CHAR (c2) != 0;
1772 
1773       switch (c2)
1774 	{
1775 	case '|':
1776 	  if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_NO_BK_VBAR))
1777 	    token->type = OP_ALT;
1778 	  break;
1779 	case '1': case '2': case '3': case '4': case '5':
1780 	case '6': case '7': case '8': case '9':
1781 	  if (!(syntax & REG_NO_BK_REFS))
1782 	    {
1783 	      token->type = OP_BACK_REF;
1784 	      token->opr.idx = c2 - '1';
1785 	    }
1786 	  break;
1787 	case '<':
1788 	  if (!(syntax & REG_NO_GNU_OPS))
1789 	    {
1790 	      token->type = ANCHOR;
1791 	      token->opr.ctx_type = WORD_FIRST;
1792 	    }
1793 	  break;
1794 	case '>':
1795 	  if (!(syntax & REG_NO_GNU_OPS))
1796 	    {
1797 	      token->type = ANCHOR;
1798 	      token->opr.ctx_type = WORD_LAST;
1799 	    }
1800 	  break;
1801 	case 'b':
1802 	  if (!(syntax & REG_NO_GNU_OPS))
1803 	    {
1804 	      token->type = ANCHOR;
1805 	      token->opr.ctx_type = WORD_DELIM;
1806 	    }
1807 	  break;
1808 	case 'B':
1809 	  if (!(syntax & REG_NO_GNU_OPS))
1810 	    {
1811 	      token->type = ANCHOR;
1812 	      token->opr.ctx_type = NOT_WORD_DELIM;
1813 	    }
1814 	  break;
1815 	case 'w':
1816 	  if (!(syntax & REG_NO_GNU_OPS))
1817 	    token->type = OP_WORD;
1818 	  break;
1819 	case 'W':
1820 	  if (!(syntax & REG_NO_GNU_OPS))
1821 	    token->type = OP_NOTWORD;
1822 	  break;
1823 	case 's':
1824 	  if (!(syntax & REG_NO_GNU_OPS))
1825 	    token->type = OP_SPACE;
1826 	  break;
1827 	case 'S':
1828 	  if (!(syntax & REG_NO_GNU_OPS))
1829 	    token->type = OP_NOTSPACE;
1830 	  break;
1831 	case '`':
1832 	  if (!(syntax & REG_NO_GNU_OPS))
1833 	    {
1834 	      token->type = ANCHOR;
1835 	      token->opr.ctx_type = BUF_FIRST;
1836 	    }
1837 	  break;
1838 	case '\'':
1839 	  if (!(syntax & REG_NO_GNU_OPS))
1840 	    {
1841 	      token->type = ANCHOR;
1842 	      token->opr.ctx_type = BUF_LAST;
1843 	    }
1844 	  break;
1845 	case '(':
1846 	  if (!(syntax & REG_NO_BK_PARENS))
1847 	    token->type = OP_OPEN_SUBEXP;
1848 	  break;
1849 	case ')':
1850 	  if (!(syntax & REG_NO_BK_PARENS))
1851 	    token->type = OP_CLOSE_SUBEXP;
1852 	  break;
1853 	case '+':
1854 	  if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_BK_PLUS_QM))
1855 	    token->type = OP_DUP_PLUS;
1856 	  break;
1857 	case '?':
1858 	  if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_BK_PLUS_QM))
1859 	    token->type = OP_DUP_QUESTION;
1860 	  break;
1861 	case '{':
1862 	  if ((syntax & REG_INTERVALS) && (!(syntax & REG_NO_BK_BRACES)))
1863 	    token->type = OP_OPEN_DUP_NUM;
1864 	  break;
1865 	case '}':
1866 	  if ((syntax & REG_INTERVALS) && (!(syntax & REG_NO_BK_BRACES)))
1867 	    token->type = OP_CLOSE_DUP_NUM;
1868 	  break;
1869 	default:
1870 	  break;
1871 	}
1872       return 2;
1873     }
1874 
1875   token->type = CHARACTER;
1876 #ifdef RE_ENABLE_I18N
1877   if (input->mb_cur_max > 1)
1878     {
1879       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1880       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1881     }
1882   else
1883 #endif
1884     token->word_char = IS_WORD_CHAR (token->opr.c);
1885 
1886   switch (c)
1887     {
1888     case '\n':
1889       if (syntax & REG_NEWLINE_ALT)
1890 	token->type = OP_ALT;
1891       break;
1892     case '|':
1893       if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_NO_BK_VBAR))
1894 	token->type = OP_ALT;
1895       break;
1896     case '*':
1897       token->type = OP_DUP_ASTERISK;
1898       break;
1899     case '+':
1900       if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_BK_PLUS_QM))
1901 	token->type = OP_DUP_PLUS;
1902       break;
1903     case '?':
1904       if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_BK_PLUS_QM))
1905 	token->type = OP_DUP_QUESTION;
1906       break;
1907     case '{':
1908       if ((syntax & REG_INTERVALS) && (syntax & REG_NO_BK_BRACES))
1909 	token->type = OP_OPEN_DUP_NUM;
1910       break;
1911     case '}':
1912       if ((syntax & REG_INTERVALS) && (syntax & REG_NO_BK_BRACES))
1913 	token->type = OP_CLOSE_DUP_NUM;
1914       break;
1915     case '(':
1916       if (syntax & REG_NO_BK_PARENS)
1917 	token->type = OP_OPEN_SUBEXP;
1918       break;
1919     case ')':
1920       if (syntax & REG_NO_BK_PARENS)
1921 	token->type = OP_CLOSE_SUBEXP;
1922       break;
1923     case '[':
1924       token->type = OP_OPEN_BRACKET;
1925       break;
1926     case '.':
1927       token->type = OP_PERIOD;
1928       break;
1929     case '^':
1930       if (!(syntax & (REG_CONTEXT_INDEP_ANCHORS | REG_CARET_ANCHORS_HERE)) &&
1931 	  re_string_cur_idx (input) != 0)
1932 	{
1933 	  char prev = re_string_peek_byte (input, -1);
1934 	  if (!(syntax & REG_NEWLINE_ALT) || prev != '\n')
1935 	    break;
1936 	}
1937       token->type = ANCHOR;
1938       token->opr.ctx_type = LINE_FIRST;
1939       break;
1940     case '$':
1941       if (!(syntax & REG_CONTEXT_INDEP_ANCHORS) &&
1942 	  re_string_cur_idx (input) + 1 != re_string_length (input))
1943 	{
1944 	  re_token_t next;
1945 	  re_string_skip_bytes (input, 1);
1946 	  peek_token (&next, input, syntax);
1947 	  re_string_skip_bytes (input, -1);
1948 	  if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1949 	    break;
1950 	}
1951       token->type = ANCHOR;
1952       token->opr.ctx_type = LINE_LAST;
1953       break;
1954     default:
1955       break;
1956     }
1957   return 1;
1958 }
1959 
1960 /* Peek a token from INPUT, and return the length of the token.
1961    We must not use this function out of bracket expressions.  */
1962 
1963 static int
peek_token_bracket(re_token_t * token,re_string_t * input,reg_syntax_t syntax)1964 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1965 {
1966   unsigned char c;
1967   if (re_string_eoi (input))
1968     {
1969       token->type = END_OF_RE;
1970       return 0;
1971     }
1972   c = re_string_peek_byte (input, 0);
1973   token->opr.c = c;
1974 
1975 #ifdef RE_ENABLE_I18N
1976   if (input->mb_cur_max > 1 &&
1977       !re_string_first_byte (input, re_string_cur_idx (input)))
1978     {
1979       token->type = CHARACTER;
1980       return 1;
1981     }
1982 #endif /* RE_ENABLE_I18N */
1983 
1984   if (c == '\\' && (syntax & REG_BACKSLASH_ESCAPE_IN_LISTS)
1985       && re_string_cur_idx (input) + 1 < re_string_length (input))
1986     {
1987       /* In this case, '\' escape a character.  */
1988       unsigned char c2;
1989       re_string_skip_bytes (input, 1);
1990       c2 = re_string_peek_byte (input, 0);
1991       token->opr.c = c2;
1992       token->type = CHARACTER;
1993       return 1;
1994     }
1995   if (c == '[') /* '[' is a special char in a bracket exps.  */
1996     {
1997       unsigned char c2;
1998       int token_len;
1999       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2000 	c2 = re_string_peek_byte (input, 1);
2001       else
2002 	c2 = 0;
2003       token->opr.c = c2;
2004       token_len = 2;
2005       switch (c2)
2006 	{
2007 	case '.':
2008 	  token->type = OP_OPEN_COLL_ELEM;
2009 	  break;
2010 	case '=':
2011 	  token->type = OP_OPEN_EQUIV_CLASS;
2012 	  break;
2013 	case ':':
2014 	  if (syntax & REG_CHAR_CLASSES)
2015 	    {
2016 	      token->type = OP_OPEN_CHAR_CLASS;
2017 	      break;
2018 	    }
2019 	  /* else fall through.  */
2020 	default:
2021 	  token->type = CHARACTER;
2022 	  token->opr.c = c;
2023 	  token_len = 1;
2024 	  break;
2025 	}
2026       return token_len;
2027     }
2028   switch (c)
2029     {
2030     case '-':
2031       token->type = OP_CHARSET_RANGE;
2032       break;
2033     case ']':
2034       token->type = OP_CLOSE_BRACKET;
2035       break;
2036     case '^':
2037       token->type = OP_NON_MATCH_LIST;
2038       break;
2039     default:
2040       token->type = CHARACTER;
2041     }
2042   return 1;
2043 }
2044 
2045 /* Functions for parser.  */
2046 
2047 /* Entry point of the parser.
2048    Parse the regular expression REGEXP and return the structure tree.
2049    If an error is occured, ERR is set by error code, and return NULL.
2050    This function build the following tree, from regular expression <reg_exp>:
2051 	   CAT
2052 	   / \
2053 	  /   \
2054    <reg_exp>  EOR
2055 
2056    CAT means concatenation.
2057    EOR means end of regular expression.  */
2058 
2059 static bin_tree_t *
parse(re_string_t * regexp,regex_t * preg,reg_syntax_t syntax,reg_errcode_t * err)2060 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2061        reg_errcode_t *err)
2062 {
2063   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2064   bin_tree_t *tree, *eor, *root;
2065   re_token_t current_token;
2066   dfa->syntax = syntax;
2067   fetch_token (&current_token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2068   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2069   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2070     return NULL;
2071   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2072   if (tree != NULL)
2073     root = create_tree (dfa, tree, eor, CONCAT);
2074   else
2075     root = eor;
2076   if (BE (eor == NULL || root == NULL, 0))
2077     {
2078       *err = REG_ESPACE;
2079       return NULL;
2080     }
2081   return root;
2082 }
2083 
2084 /* This function build the following tree, from regular expression
2085    <branch1>|<branch2>:
2086 	   ALT
2087 	   / \
2088 	  /   \
2089    <branch1> <branch2>
2090 
2091    ALT means alternative, which represents the operator `|'.  */
2092 
2093 static bin_tree_t *
parse_reg_exp(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2094 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2095 	       reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2096 {
2097   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2098   bin_tree_t *tree, *branch = NULL;
2099   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2100   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2101     return NULL;
2102 
2103   while (token->type == OP_ALT)
2104     {
2105       fetch_token (token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2106       if (token->type != OP_ALT && token->type != END_OF_RE
2107 	  && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2108 	{
2109 	  branch = parse_branch (regexp, preg, token, syntax, nest, err);
2110 	  if (BE (*err != REG_NOERROR && branch == NULL, 0))
2111 	    return NULL;
2112 	}
2113       else
2114 	branch = NULL;
2115       tree = create_tree (dfa, tree, branch, OP_ALT);
2116       if (BE (tree == NULL, 0))
2117 	{
2118 	  *err = REG_ESPACE;
2119 	  return NULL;
2120 	}
2121     }
2122   return tree;
2123 }
2124 
2125 /* This function build the following tree, from regular expression
2126    <exp1><exp2>:
2127 	CAT
2128 	/ \
2129        /   \
2130    <exp1> <exp2>
2131 
2132    CAT means concatenation.  */
2133 
2134 static bin_tree_t *
parse_branch(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2135 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2136 	      reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2137 {
2138   bin_tree_t *tree, *exp;
2139   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2140   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2141   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2142     return NULL;
2143 
2144   while (token->type != OP_ALT && token->type != END_OF_RE
2145 	 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2146     {
2147       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2148       if (BE (*err != REG_NOERROR && exp == NULL, 0))
2149 	{
2150 	  return NULL;
2151 	}
2152       if (tree != NULL && exp != NULL)
2153 	{
2154 	  tree = create_tree (dfa, tree, exp, CONCAT);
2155 	  if (tree == NULL)
2156 	    {
2157 	      *err = REG_ESPACE;
2158 	      return NULL;
2159 	    }
2160 	}
2161       else if (tree == NULL)
2162 	tree = exp;
2163       /* Otherwise exp == NULL, we don't need to create new tree.  */
2164     }
2165   return tree;
2166 }
2167 
2168 /* This function build the following tree, from regular expression a*:
2169 	 *
2170 	 |
2171 	 a
2172 */
2173 
2174 static bin_tree_t *
parse_expression(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2175 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2176 		  reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2177 {
2178   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2179   bin_tree_t *tree;
2180   switch (token->type)
2181     {
2182     case CHARACTER:
2183       tree = create_token_tree (dfa, NULL, NULL, token);
2184       if (BE (tree == NULL, 0))
2185 	{
2186 	  *err = REG_ESPACE;
2187 	  return NULL;
2188 	}
2189 #ifdef RE_ENABLE_I18N
2190       if (dfa->mb_cur_max > 1)
2191 	{
2192 	  while (!re_string_eoi (regexp)
2193 		 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2194 	    {
2195 	      bin_tree_t *mbc_remain;
2196 	      fetch_token (token, regexp, syntax);
2197 	      mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2198 	      tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2199 	      if (BE (mbc_remain == NULL || tree == NULL, 0))
2200 		{
2201 		  *err = REG_ESPACE;
2202 		  return NULL;
2203 		}
2204 	    }
2205 	}
2206 #endif
2207       break;
2208     case OP_OPEN_SUBEXP:
2209       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2210       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2211 	return NULL;
2212       break;
2213     case OP_OPEN_BRACKET:
2214       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2215       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2216 	return NULL;
2217       break;
2218     case OP_BACK_REF:
2219       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2220 	{
2221 	  *err = REG_ESUBREG;
2222 	  return NULL;
2223 	}
2224       dfa->used_bkref_map |= 1 << token->opr.idx;
2225       tree = create_token_tree (dfa, NULL, NULL, token);
2226       if (BE (tree == NULL, 0))
2227 	{
2228 	  *err = REG_ESPACE;
2229 	  return NULL;
2230 	}
2231       ++dfa->nbackref;
2232       dfa->has_mb_node = 1;
2233       break;
2234     case OP_OPEN_DUP_NUM:
2235       if (syntax & REG_CONTEXT_INVALID_DUP)
2236 	{
2237 	  *err = REG_BADRPT;
2238 	  return NULL;
2239 	}
2240       /* FALLTHROUGH */
2241     case OP_DUP_ASTERISK:
2242     case OP_DUP_PLUS:
2243     case OP_DUP_QUESTION:
2244       if (syntax & REG_CONTEXT_INVALID_OPS)
2245 	{
2246 	  *err = REG_BADRPT;
2247 	  return NULL;
2248 	}
2249       else if (syntax & REG_CONTEXT_INDEP_OPS)
2250 	{
2251 	  fetch_token (token, regexp, syntax);
2252 	  return parse_expression (regexp, preg, token, syntax, nest, err);
2253 	}
2254       /* else fall through  */
2255     case OP_CLOSE_SUBEXP:
2256       if ((token->type == OP_CLOSE_SUBEXP) &&
2257 	  !(syntax & REG_UNMATCHED_RIGHT_PAREN_ORD))
2258 	{
2259 	  *err = REG_ERPAREN;
2260 	  return NULL;
2261 	}
2262       /* else fall through  */
2263     case OP_CLOSE_DUP_NUM:
2264       /* We treat it as a normal character.  */
2265 
2266       /* Then we can these characters as normal characters.  */
2267       token->type = CHARACTER;
2268       /* mb_partial and word_char bits should be initialized already
2269 	 by peek_token.  */
2270       tree = create_token_tree (dfa, NULL, NULL, token);
2271       if (BE (tree == NULL, 0))
2272 	{
2273 	  *err = REG_ESPACE;
2274 	  return NULL;
2275 	}
2276       break;
2277     case ANCHOR:
2278       if ((token->opr.ctx_type
2279 	   & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2280 	  && dfa->word_ops_used == 0)
2281 	init_word_char (dfa);
2282       if (token->opr.ctx_type == WORD_DELIM
2283           || token->opr.ctx_type == NOT_WORD_DELIM)
2284 	{
2285 	  bin_tree_t *tree_first, *tree_last;
2286 	  if (token->opr.ctx_type == WORD_DELIM)
2287 	    {
2288 	      token->opr.ctx_type = WORD_FIRST;
2289 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2290 	      token->opr.ctx_type = WORD_LAST;
2291             }
2292           else
2293             {
2294 	      token->opr.ctx_type = INSIDE_WORD;
2295 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2296 	      token->opr.ctx_type = INSIDE_NOTWORD;
2297             }
2298 	  tree_last = create_token_tree (dfa, NULL, NULL, token);
2299 	  tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2300 	  if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2301 	    {
2302 	      *err = REG_ESPACE;
2303 	      return NULL;
2304 	    }
2305 	}
2306       else
2307 	{
2308 	  tree = create_token_tree (dfa, NULL, NULL, token);
2309 	  if (BE (tree == NULL, 0))
2310 	    {
2311 	      *err = REG_ESPACE;
2312 	      return NULL;
2313 	    }
2314 	}
2315       /* We must return here, since ANCHORs can't be followed
2316 	 by repetition operators.
2317 	 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2318 	     it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2319       fetch_token (token, regexp, syntax);
2320       return tree;
2321     case OP_PERIOD:
2322       tree = create_token_tree (dfa, NULL, NULL, token);
2323       if (BE (tree == NULL, 0))
2324 	{
2325 	  *err = REG_ESPACE;
2326 	  return NULL;
2327 	}
2328       if (dfa->mb_cur_max > 1)
2329 	dfa->has_mb_node = 1;
2330       break;
2331     case OP_WORD:
2332     case OP_NOTWORD:
2333       tree = build_charclass_op (dfa, regexp->trans,
2334 				 (const unsigned char *) "alnum",
2335 				 (const unsigned char *) "_",
2336 				 token->type == OP_NOTWORD, err);
2337       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2338 	return NULL;
2339       break;
2340     case OP_SPACE:
2341     case OP_NOTSPACE:
2342       tree = build_charclass_op (dfa, regexp->trans,
2343 				 (const unsigned char *) "space",
2344 				 (const unsigned char *) "",
2345 				 token->type == OP_NOTSPACE, err);
2346       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2347 	return NULL;
2348       break;
2349     case OP_ALT:
2350     case END_OF_RE:
2351       return NULL;
2352     case BACK_SLASH:
2353       *err = REG_EESCAPE;
2354       return NULL;
2355     default:
2356       /* Must not happen?  */
2357 #ifdef DEBUG
2358       assert (0);
2359 #endif
2360       return NULL;
2361     }
2362   fetch_token (token, regexp, syntax);
2363 
2364   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2365 	 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2366     {
2367       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2368       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2369 	return NULL;
2370       /* In BRE consecutive duplications are not allowed.  */
2371       if ((syntax & REG_CONTEXT_INVALID_DUP)
2372 	  && (token->type == OP_DUP_ASTERISK
2373 	      || token->type == OP_OPEN_DUP_NUM))
2374 	{
2375 	  *err = REG_BADRPT;
2376 	  return NULL;
2377 	}
2378     }
2379 
2380   return tree;
2381 }
2382 
2383 /* This function build the following tree, from regular expression
2384    (<reg_exp>):
2385 	 SUBEXP
2386 	    |
2387 	<reg_exp>
2388 */
2389 
2390 static bin_tree_t *
parse_sub_exp(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2391 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2392 	       reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2393 {
2394   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2395   bin_tree_t *tree;
2396   size_t cur_nsub;
2397   cur_nsub = preg->re_nsub++;
2398 
2399   fetch_token (token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2400 
2401   /* The subexpression may be a null string.  */
2402   if (token->type == OP_CLOSE_SUBEXP)
2403     tree = NULL;
2404   else
2405     {
2406       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2407       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2408         *err = REG_EPAREN;
2409       if (BE (*err != REG_NOERROR, 0))
2410 	return NULL;
2411     }
2412 
2413   if (cur_nsub <= '9' - '1')
2414     dfa->completed_bkref_map |= 1 << cur_nsub;
2415 
2416   tree = create_tree (dfa, tree, NULL, SUBEXP);
2417   if (BE (tree == NULL, 0))
2418     {
2419       *err = REG_ESPACE;
2420       return NULL;
2421     }
2422   tree->token.opr.idx = cur_nsub;
2423   return tree;
2424 }
2425 
2426 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2427 
2428 static bin_tree_t *
parse_dup_op(bin_tree_t * elem,re_string_t * regexp,re_dfa_t * dfa,re_token_t * token,reg_syntax_t syntax,reg_errcode_t * err)2429 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2430 	      re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2431 {
2432   bin_tree_t *tree = NULL, *old_tree = NULL;
2433   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2434   re_token_t start_token = *token;
2435 
2436   if (token->type == OP_OPEN_DUP_NUM)
2437     {
2438       end = 0;
2439       start = fetch_number (regexp, token, syntax);
2440       if (start == REG_MISSING)
2441 	{
2442 	  if (token->type == CHARACTER && token->opr.c == ',')
2443 	    start = 0; /* We treat "{,m}" as "{0,m}".  */
2444 	  else
2445 	    {
2446 	      *err = REG_BADBR; /* <re>{} is invalid.  */
2447 	      return NULL;
2448 	    }
2449 	}
2450       if (BE (start != REG_ERROR, 1))
2451 	{
2452 	  /* We treat "{n}" as "{n,n}".  */
2453 	  end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2454 		 : ((token->type == CHARACTER && token->opr.c == ',')
2455 		    ? fetch_number (regexp, token, syntax) : REG_ERROR));
2456 	}
2457       if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2458 	{
2459 	  /* Invalid sequence.  */
2460 	  if (BE (!(syntax & REG_INVALID_INTERVAL_ORD), 0))
2461 	    {
2462 	      if (token->type == END_OF_RE)
2463 		*err = REG_EBRACE;
2464 	      else
2465 		*err = REG_BADBR;
2466 
2467 	      return NULL;
2468 	    }
2469 
2470 	  /* If the syntax bit is set, rollback.  */
2471 	  re_string_set_index (regexp, start_idx);
2472 	  *token = start_token;
2473 	  token->type = CHARACTER;
2474 	  /* mb_partial and word_char bits should be already initialized by
2475 	     peek_token.  */
2476 	  return elem;
2477 	}
2478 
2479       if (BE (end != REG_MISSING && start > end, 0))
2480 	{
2481 	  /* First number greater than second.  */
2482 	  *err = REG_BADBR;
2483 	  return NULL;
2484 	}
2485     }
2486   else
2487     {
2488       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2489       end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2490     }
2491 
2492   fetch_token (token, regexp, syntax);
2493 
2494   if (BE (elem == NULL, 0))
2495     return NULL;
2496   if (BE (start == 0 && end == 0, 0))
2497     {
2498       postorder (elem, free_tree, NULL);
2499       return NULL;
2500     }
2501 
2502   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2503   if (BE (start > 0, 0))
2504     {
2505       tree = elem;
2506       for (i = 2; i <= start; ++i)
2507 	{
2508 	  elem = duplicate_tree (elem, dfa);
2509 	  tree = create_tree (dfa, tree, elem, CONCAT);
2510 	  if (BE (elem == NULL || tree == NULL, 0))
2511 	    goto parse_dup_op_espace;
2512 	}
2513 
2514       if (start == end)
2515 	return tree;
2516 
2517       /* Duplicate ELEM before it is marked optional.  */
2518       elem = duplicate_tree (elem, dfa);
2519       old_tree = tree;
2520     }
2521   else
2522     old_tree = NULL;
2523 
2524   if (elem->token.type == SUBEXP)
2525     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2526 
2527   tree = create_tree (dfa, elem, NULL,
2528 		      (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2529   if (BE (tree == NULL, 0))
2530     goto parse_dup_op_espace;
2531 
2532   /* This loop is actually executed only when end != REG_MISSING,
2533      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2534      already created the start+1-th copy.  */
2535   if ((Idx) -1 < 0 || end != REG_MISSING)
2536     for (i = start + 2; i <= end; ++i)
2537       {
2538 	elem = duplicate_tree (elem, dfa);
2539 	tree = create_tree (dfa, tree, elem, CONCAT);
2540 	if (BE (elem == NULL || tree == NULL, 0))
2541 	  goto parse_dup_op_espace;
2542 
2543 	tree = create_tree (dfa, tree, NULL, OP_ALT);
2544 	if (BE (tree == NULL, 0))
2545 	  goto parse_dup_op_espace;
2546       }
2547 
2548   if (old_tree)
2549     tree = create_tree (dfa, old_tree, tree, CONCAT);
2550 
2551   return tree;
2552 
2553  parse_dup_op_espace:
2554   *err = REG_ESPACE;
2555   return NULL;
2556 }
2557 
2558 /* Size of the names for collating symbol/equivalence_class/character_class.
2559    I'm not sure, but maybe enough.  */
2560 #define BRACKET_NAME_BUF_SIZE 32
2561 
2562 #ifndef _LIBC
2563   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2564      Build the range expression which starts from START_ELEM, and ends
2565      at END_ELEM.  The result are written to MBCSET and SBCSET.
2566      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2567      mbcset->range_ends, is a pointer argument sinse we may
2568      update it.  */
2569 
2570 static reg_errcode_t
build_range_exp(bitset sbcset,re_charset_t * mbcset,Idx * range_alloc,bracket_elem_t * start_elem,bracket_elem_t * end_elem)2571 build_range_exp (bitset sbcset,
2572 # ifdef RE_ENABLE_I18N
2573 		 re_charset_t *mbcset, Idx *range_alloc,
2574 # endif
2575 		 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2576 {
2577   unsigned int start_ch, end_ch;
2578   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2579   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2580 	  || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2581 	  0))
2582     return REG_ERANGE;
2583 
2584   /* We can handle no multi character collating elements without libc
2585      support.  */
2586   if (BE ((start_elem->type == COLL_SYM
2587 	   && strlen ((char *) start_elem->opr.name) > 1)
2588 	  || (end_elem->type == COLL_SYM
2589 	      && strlen ((char *) end_elem->opr.name) > 1), 0))
2590     return REG_ECOLLATE;
2591 
2592 # ifdef RE_ENABLE_I18N
2593   {
2594     wchar_t wc;
2595     wint_t start_wc, end_wc;
2596     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2597 
2598     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2599 		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2600 		   : 0));
2601     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2602 	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2603 		 : 0));
2604     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2605 		? __btowc (start_ch) : start_elem->opr.wch);
2606     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2607 	      ? __btowc (end_ch) : end_elem->opr.wch);
2608     if (start_wc == WEOF || end_wc == WEOF)
2609       return REG_ECOLLATE;
2610     cmp_buf[0] = start_wc;
2611     cmp_buf[4] = end_wc;
2612     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2613       return REG_ERANGE;
2614 
2615     /* Got valid collation sequence values, add them as a new entry.
2616        However, for !_LIBC we have no collation elements: if the
2617        character set is single byte, the single byte character set
2618        that we build below suffices.  parse_bracket_exp passes
2619        no MBCSET if dfa->mb_cur_max == 1.  */
2620     if (mbcset)
2621       {
2622         /* Check the space of the arrays.  */
2623         if (BE (*range_alloc == mbcset->nranges, 0))
2624           {
2625 	    /* There is not enough space, need realloc.  */
2626 	    wchar_t *new_array_start, *new_array_end;
2627 	    Idx new_nranges;
2628 
2629 	    new_nranges = mbcset->nranges;
2630 	    /* Use realloc since mbcset->range_starts and mbcset->range_ends
2631 	       are NULL if *range_alloc == 0.  */
2632 	    new_array_start = re_x2realloc (mbcset->range_starts, wchar_t,
2633 					    &new_nranges);
2634 	    new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2635 				        new_nranges);
2636 
2637 	    if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2638 	      return REG_ESPACE;
2639 
2640 	    mbcset->range_starts = new_array_start;
2641 	    mbcset->range_ends = new_array_end;
2642 	    *range_alloc = new_nranges;
2643           }
2644 
2645         mbcset->range_starts[mbcset->nranges] = start_wc;
2646         mbcset->range_ends[mbcset->nranges++] = end_wc;
2647       }
2648 
2649     /* Build the table for single byte characters.  */
2650     for (wc = 0; wc < SBC_MAX; ++wc)
2651       {
2652 	cmp_buf[2] = wc;
2653 	if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2654 	    && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2655 	  bitset_set (sbcset, wc);
2656       }
2657   }
2658 # else /* not RE_ENABLE_I18N */
2659   {
2660     unsigned int ch;
2661     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2662 		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2663 		   : 0));
2664     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2665 	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2666 		 : 0));
2667     if (start_ch > end_ch)
2668       return REG_ERANGE;
2669     /* Build the table for single byte characters.  */
2670     for (ch = 0; ch < SBC_MAX; ++ch)
2671       if (start_ch <= ch  && ch <= end_ch)
2672 	bitset_set (sbcset, ch);
2673   }
2674 # endif /* not RE_ENABLE_I18N */
2675   return REG_NOERROR;
2676 }
2677 #endif /* not _LIBC */
2678 
2679 #ifndef _LIBC
2680 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2681    Build the collating element which is represented by NAME.
2682    The result are written to MBCSET and SBCSET.
2683    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2684    pointer argument since we may update it.  */
2685 
2686 static reg_errcode_t
build_collating_symbol(bitset sbcset,re_charset_t * mbcset,Idx * coll_sym_alloc,const unsigned char * name)2687 build_collating_symbol (bitset sbcset,
2688 # ifdef RE_ENABLE_I18N
2689 			re_charset_t *mbcset, Idx *coll_sym_alloc,
2690 # endif
2691 			const unsigned char *name)
2692 {
2693   size_t name_len = strlen ((const char *) name);
2694   if (BE (name_len != 1, 0))
2695     return REG_ECOLLATE;
2696   else
2697     {
2698       bitset_set (sbcset, name[0]);
2699       return REG_NOERROR;
2700     }
2701 }
2702 #endif /* not _LIBC */
2703 
2704 /* This function parse bracket expression like "[abc]", "[a-c]",
2705    "[[.a-a.]]" etc.  */
2706 
2707 static bin_tree_t *
parse_bracket_exp(re_string_t * regexp,re_dfa_t * dfa,re_token_t * token,reg_syntax_t syntax,reg_errcode_t * err)2708 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2709 		   reg_syntax_t syntax, reg_errcode_t *err)
2710 {
2711 #ifdef _LIBC
2712   const unsigned char *collseqmb;
2713   const char *collseqwc;
2714   uint32_t nrules;
2715   int32_t table_size;
2716   const int32_t *symb_table;
2717   const unsigned char *extra;
2718 
2719   /* Local function for parse_bracket_exp used in _LIBC environement.
2720      Seek the collating symbol entry correspondings to NAME.
2721      Return the index of the symbol in the SYMB_TABLE.  */
2722 
2723   auto inline int32_t
2724   __attribute ((always_inline))
2725   seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2726     {
2727       int32_t hash = elem_hash ((const char *) name, name_len);
2728       int32_t elem = hash % table_size;
2729       int32_t second = hash % (table_size - 2);
2730       while (symb_table[2 * elem] != 0)
2731 	{
2732 	  /* First compare the hashing value.  */
2733 	  if (symb_table[2 * elem] == hash
2734 	      /* Compare the length of the name.  */
2735 	      && name_len == extra[symb_table[2 * elem + 1]]
2736 	      /* Compare the name.  */
2737 	      && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2738 			 name_len) == 0)
2739 	    {
2740 	      /* Yep, this is the entry.  */
2741 	      break;
2742 	    }
2743 
2744 	  /* Next entry.  */
2745 	  elem += second;
2746 	}
2747       return elem;
2748     }
2749 
2750   /* Local function for parse_bracket_exp used in _LIBC environement.
2751      Look up the collation sequence value of BR_ELEM.
2752      Return the value if succeeded, UINT_MAX otherwise.  */
2753 
2754   auto inline unsigned int
2755   __attribute ((always_inline))
2756   lookup_collation_sequence_value (bracket_elem_t *br_elem)
2757     {
2758       if (br_elem->type == SB_CHAR)
2759 	{
2760 	  /*
2761 	  if (MB_CUR_MAX == 1)
2762 	  */
2763 	  if (nrules == 0)
2764 	    return collseqmb[br_elem->opr.ch];
2765 	  else
2766 	    {
2767 	      wint_t wc = __btowc (br_elem->opr.ch);
2768 	      return __collseq_table_lookup (collseqwc, wc);
2769 	    }
2770 	}
2771       else if (br_elem->type == MB_CHAR)
2772 	{
2773 	  return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2774 	}
2775       else if (br_elem->type == COLL_SYM)
2776 	{
2777 	  size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2778 	  if (nrules != 0)
2779 	    {
2780 	      int32_t elem, idx;
2781 	      elem = seek_collating_symbol_entry (br_elem->opr.name,
2782 						  sym_name_len);
2783 	      if (symb_table[2 * elem] != 0)
2784 		{
2785 		  /* We found the entry.  */
2786 		  idx = symb_table[2 * elem + 1];
2787 		  /* Skip the name of collating element name.  */
2788 		  idx += 1 + extra[idx];
2789 		  /* Skip the byte sequence of the collating element.  */
2790 		  idx += 1 + extra[idx];
2791 		  /* Adjust for the alignment.  */
2792 		  idx = (idx + 3) & ~3;
2793 		  /* Skip the multibyte collation sequence value.  */
2794 		  idx += sizeof (unsigned int);
2795 		  /* Skip the wide char sequence of the collating element.  */
2796 		  idx += sizeof (unsigned int) *
2797 		    (1 + *(unsigned int *) (extra + idx));
2798 		  /* Return the collation sequence value.  */
2799 		  return *(unsigned int *) (extra + idx);
2800 		}
2801 	      else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2802 		{
2803 		  /* No valid character.  Match it as a single byte
2804 		     character.  */
2805 		  return collseqmb[br_elem->opr.name[0]];
2806 		}
2807 	    }
2808 	  else if (sym_name_len == 1)
2809 	    return collseqmb[br_elem->opr.name[0]];
2810 	}
2811       return UINT_MAX;
2812     }
2813 
2814   /* Local function for parse_bracket_exp used in _LIBC environement.
2815      Build the range expression which starts from START_ELEM, and ends
2816      at END_ELEM.  The result are written to MBCSET and SBCSET.
2817      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2818      mbcset->range_ends, is a pointer argument sinse we may
2819      update it.  */
2820 
2821   auto inline reg_errcode_t
2822   __attribute ((always_inline))
2823   build_range_exp (bitset sbcset, re_charset_t *mbcset,
2824 		   Idx *range_alloc,
2825 		   bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2826     {
2827       unsigned int ch;
2828       uint32_t start_collseq;
2829       uint32_t end_collseq;
2830 
2831       /* Equivalence Classes and Character Classes can't be a range
2832 	 start/end.  */
2833       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2834 	      || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2835 	      0))
2836 	return REG_ERANGE;
2837 
2838       start_collseq = lookup_collation_sequence_value (start_elem);
2839       end_collseq = lookup_collation_sequence_value (end_elem);
2840       /* Check start/end collation sequence values.  */
2841       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2842 	return REG_ECOLLATE;
2843       if (BE ((syntax & REG_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2844 	return REG_ERANGE;
2845 
2846       /* Got valid collation sequence values, add them as a new entry.
2847 	 However, if we have no collation elements, and the character set
2848 	 is single byte, the single byte character set that we
2849 	 build below suffices. */
2850       if (nrules > 0 || dfa->mb_cur_max > 1)
2851 	{
2852           /* Check the space of the arrays.  */
2853           if (BE (*range_alloc == mbcset->nranges, 0))
2854 	    {
2855 	      /* There is not enough space, need realloc.  */
2856 	      uint32_t *new_array_start;
2857 	      uint32_t *new_array_end;
2858 	      Idx new_nranges;
2859 
2860 	      new_nranges = mbcset->nranges;
2861 	      new_array_start = re_x2realloc (mbcset->range_starts, uint32_t,
2862 					      &new_nranges);
2863 	      new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2864 				          new_nranges);
2865 
2866 	      if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2867 	        return REG_ESPACE;
2868 
2869 	      mbcset->range_starts = new_array_start;
2870 	      mbcset->range_ends = new_array_end;
2871 	      *range_alloc = new_nranges;
2872 	    }
2873 
2874           mbcset->range_starts[mbcset->nranges] = start_collseq;
2875           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2876 	}
2877 
2878       /* Build the table for single byte characters.  */
2879       for (ch = 0; ch < SBC_MAX; ch++)
2880 	{
2881 	  uint32_t ch_collseq;
2882 	  /*
2883 	  if (MB_CUR_MAX == 1)
2884 	  */
2885 	  if (nrules == 0)
2886 	    ch_collseq = collseqmb[ch];
2887 	  else
2888 	    ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2889 	  if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2890 	    bitset_set (sbcset, ch);
2891 	}
2892       return REG_NOERROR;
2893     }
2894 
2895   /* Local function for parse_bracket_exp used in _LIBC environement.
2896      Build the collating element which is represented by NAME.
2897      The result are written to MBCSET and SBCSET.
2898      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2899      pointer argument sinse we may update it.  */
2900 
2901   auto inline reg_errcode_t
2902   __attribute ((always_inline))
2903   build_collating_symbol (bitset sbcset, re_charset_t *mbcset,
2904 			  Idx *coll_sym_alloc, const unsigned char *name)
2905     {
2906       int32_t elem, idx;
2907       size_t name_len = strlen ((const char *) name);
2908       if (nrules != 0)
2909 	{
2910 	  elem = seek_collating_symbol_entry (name, name_len);
2911 	  if (symb_table[2 * elem] != 0)
2912 	    {
2913 	      /* We found the entry.  */
2914 	      idx = symb_table[2 * elem + 1];
2915 	      /* Skip the name of collating element name.  */
2916 	      idx += 1 + extra[idx];
2917 	    }
2918 	  else if (symb_table[2 * elem] == 0 && name_len == 1)
2919 	    {
2920 	      /* No valid character, treat it as a normal
2921 		 character.  */
2922 	      bitset_set (sbcset, name[0]);
2923 	      return REG_NOERROR;
2924 	    }
2925 	  else
2926 	    return REG_ECOLLATE;
2927 
2928 	  /* Got valid collation sequence, add it as a new entry.  */
2929 	  /* Check the space of the arrays.  */
2930 	  if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2931 	    {
2932 	      /* Not enough, realloc it.  */
2933 	      Idx new_coll_sym_alloc = mbcset->ncoll_syms;
2934 	      /* Use realloc since mbcset->coll_syms is NULL
2935 		 if *alloc == 0.  */
2936 	      int32_t *new_coll_syms = re_x2realloc (mbcset->coll_syms, int32_t,
2937 						     &new_coll_sym_alloc);
2938 	      if (BE (new_coll_syms == NULL, 0))
2939 		return REG_ESPACE;
2940 	      mbcset->coll_syms = new_coll_syms;
2941 	      *coll_sym_alloc = new_coll_sym_alloc;
2942 	    }
2943 	  mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2944 	  return REG_NOERROR;
2945 	}
2946       else
2947 	{
2948 	  if (BE (name_len != 1, 0))
2949 	    return REG_ECOLLATE;
2950 	  else
2951 	    {
2952 	      bitset_set (sbcset, name[0]);
2953 	      return REG_NOERROR;
2954 	    }
2955 	}
2956     }
2957 #endif
2958 
2959   re_token_t br_token;
2960   re_bitset_ptr_t sbcset;
2961 #ifdef RE_ENABLE_I18N
2962   re_charset_t *mbcset;
2963   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2964   Idx equiv_class_alloc = 0, char_class_alloc = 0;
2965 #endif /* not RE_ENABLE_I18N */
2966   bool non_match = false;
2967   bin_tree_t *work_tree;
2968   int token_len;
2969   bool first_round = true;
2970 #ifdef _LIBC
2971   collseqmb = (const unsigned char *)
2972     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2973   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2974   if (nrules)
2975     {
2976       /*
2977       if (MB_CUR_MAX > 1)
2978       */
2979 	collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2980       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2981       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2982 						  _NL_COLLATE_SYMB_TABLEMB);
2983       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2984 						   _NL_COLLATE_SYMB_EXTRAMB);
2985     }
2986 #endif
2987   sbcset = re_calloc (bitset_word, BITSET_WORDS);
2988 #ifdef RE_ENABLE_I18N
2989   mbcset = re_calloc (re_charset_t, 1);
2990 #endif /* RE_ENABLE_I18N */
2991 #ifdef RE_ENABLE_I18N
2992   if (BE (sbcset == NULL || mbcset == NULL, 0))
2993 #else
2994   if (BE (sbcset == NULL, 0))
2995 #endif /* RE_ENABLE_I18N */
2996     {
2997       *err = REG_ESPACE;
2998       return NULL;
2999     }
3000 
3001   token_len = peek_token_bracket (token, regexp, syntax);
3002   if (BE (token->type == END_OF_RE, 0))
3003     {
3004       *err = REG_BADPAT;
3005       goto parse_bracket_exp_free_return;
3006     }
3007   if (token->type == OP_NON_MATCH_LIST)
3008     {
3009 #ifdef RE_ENABLE_I18N
3010       mbcset->non_match = 1;
3011 #endif /* not RE_ENABLE_I18N */
3012       non_match = true;
3013       if (syntax & REG_HAT_LISTS_NOT_NEWLINE)
3014 	bitset_set (sbcset, '\0');
3015       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3016       token_len = peek_token_bracket (token, regexp, syntax);
3017       if (BE (token->type == END_OF_RE, 0))
3018 	{
3019 	  *err = REG_BADPAT;
3020 	  goto parse_bracket_exp_free_return;
3021 	}
3022     }
3023 
3024   /* We treat the first ']' as a normal character.  */
3025   if (token->type == OP_CLOSE_BRACKET)
3026     token->type = CHARACTER;
3027 
3028   while (1)
3029     {
3030       bracket_elem_t start_elem, end_elem;
3031       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3032       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3033       reg_errcode_t ret;
3034       int token_len2 = 0;
3035       bool is_range_exp = false;
3036       re_token_t token2;
3037 
3038       start_elem.opr.name = start_name_buf;
3039       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3040 				   syntax, first_round);
3041       if (BE (ret != REG_NOERROR, 0))
3042 	{
3043 	  *err = ret;
3044 	  goto parse_bracket_exp_free_return;
3045 	}
3046       first_round = false;
3047 
3048       /* Get information about the next token.  We need it in any case.  */
3049       token_len = peek_token_bracket (token, regexp, syntax);
3050 
3051       /* Do not check for ranges if we know they are not allowed.  */
3052       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3053 	{
3054 	  if (BE (token->type == END_OF_RE, 0))
3055 	    {
3056 	      *err = REG_EBRACK;
3057 	      goto parse_bracket_exp_free_return;
3058 	    }
3059 	  if (token->type == OP_CHARSET_RANGE)
3060 	    {
3061 	      re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3062 	      token_len2 = peek_token_bracket (&token2, regexp, syntax);
3063 	      if (BE (token2.type == END_OF_RE, 0))
3064 		{
3065 		  *err = REG_EBRACK;
3066 		  goto parse_bracket_exp_free_return;
3067 		}
3068 	      if (token2.type == OP_CLOSE_BRACKET)
3069 		{
3070 		  /* We treat the last '-' as a normal character.  */
3071 		  re_string_skip_bytes (regexp, -token_len);
3072 		  token->type = CHARACTER;
3073 		}
3074 	      else
3075 		is_range_exp = true;
3076 	    }
3077 	}
3078 
3079       if (is_range_exp == true)
3080 	{
3081 	  end_elem.opr.name = end_name_buf;
3082 	  ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3083 				       dfa, syntax, true);
3084 	  if (BE (ret != REG_NOERROR, 0))
3085 	    {
3086 	      *err = ret;
3087 	      goto parse_bracket_exp_free_return;
3088 	    }
3089 
3090 	  token_len = peek_token_bracket (token, regexp, syntax);
3091 
3092 #ifdef _LIBC
3093 	  *err = build_range_exp (sbcset, mbcset, &range_alloc,
3094 				  &start_elem, &end_elem);
3095 #else
3096 # ifdef RE_ENABLE_I18N
3097 	  *err = build_range_exp (sbcset,
3098 				  dfa->mb_cur_max > 1 ? mbcset : NULL,
3099 				  &range_alloc, &start_elem, &end_elem);
3100 # else
3101 	  *err = build_range_exp (sbcset, &start_elem, &end_elem);
3102 # endif
3103 #endif /* RE_ENABLE_I18N */
3104 	  if (BE (*err != REG_NOERROR, 0))
3105 	    goto parse_bracket_exp_free_return;
3106 	}
3107       else
3108 	{
3109 	  switch (start_elem.type)
3110 	    {
3111 	    case SB_CHAR:
3112 	      bitset_set (sbcset, start_elem.opr.ch);
3113 	      break;
3114 #ifdef RE_ENABLE_I18N
3115 	    case MB_CHAR:
3116 	      /* Check whether the array has enough space.  */
3117 	      if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3118 		{
3119 		  wchar_t *new_mbchars;
3120 		  /* Not enough, realloc it.  */
3121 		  mbchar_alloc = mbcset->nmbchars;
3122 		  /* Use realloc since array is NULL if *alloc == 0.  */
3123 		  new_mbchars = re_x2realloc (mbcset->mbchars, wchar_t,
3124 					      &mbchar_alloc);
3125 		  if (BE (new_mbchars == NULL, 0))
3126 		    goto parse_bracket_exp_espace;
3127 		  mbcset->mbchars = new_mbchars;
3128 		}
3129 	      mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3130 	      break;
3131 #endif /* RE_ENABLE_I18N */
3132 	    case EQUIV_CLASS:
3133 	      *err = build_equiv_class (sbcset,
3134 #ifdef RE_ENABLE_I18N
3135 					mbcset, &equiv_class_alloc,
3136 #endif /* RE_ENABLE_I18N */
3137 					start_elem.opr.name);
3138 	      if (BE (*err != REG_NOERROR, 0))
3139 		goto parse_bracket_exp_free_return;
3140 	      break;
3141 	    case COLL_SYM:
3142 	      *err = build_collating_symbol (sbcset,
3143 #ifdef RE_ENABLE_I18N
3144 					     mbcset, &coll_sym_alloc,
3145 #endif /* RE_ENABLE_I18N */
3146 					     start_elem.opr.name);
3147 	      if (BE (*err != REG_NOERROR, 0))
3148 		goto parse_bracket_exp_free_return;
3149 	      break;
3150 	    case CHAR_CLASS:
3151 	      *err = build_charclass (regexp->trans, sbcset,
3152 #ifdef RE_ENABLE_I18N
3153 				      mbcset, &char_class_alloc,
3154 #endif /* RE_ENABLE_I18N */
3155 				      start_elem.opr.name, syntax);
3156 	      if (BE (*err != REG_NOERROR, 0))
3157 	       goto parse_bracket_exp_free_return;
3158 	      break;
3159 	    default:
3160 	      assert (0);
3161 	      break;
3162 	    }
3163 	}
3164       if (BE (token->type == END_OF_RE, 0))
3165 	{
3166 	  *err = REG_EBRACK;
3167 	  goto parse_bracket_exp_free_return;
3168 	}
3169       if (token->type == OP_CLOSE_BRACKET)
3170 	break;
3171     }
3172 
3173   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3174 
3175   /* If it is non-matching list.  */
3176   if (non_match)
3177     bitset_not (sbcset);
3178 
3179 #ifdef RE_ENABLE_I18N
3180   /* Ensure only single byte characters are set.  */
3181   if (dfa->mb_cur_max > 1)
3182     bitset_mask (sbcset, dfa->sb_char);
3183 
3184   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3185       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3186 						     || mbcset->non_match)))
3187     {
3188       bin_tree_t *mbc_tree;
3189       int sbc_idx;
3190       /* Build a tree for complex bracket.  */
3191       dfa->has_mb_node = 1;
3192       br_token.type = COMPLEX_BRACKET;
3193       br_token.opr.mbcset = mbcset;
3194       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3195       if (BE (mbc_tree == NULL, 0))
3196 	goto parse_bracket_exp_espace;
3197       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3198 	if (sbcset[sbc_idx])
3199 	  break;
3200       /* If there are no bits set in sbcset, there is no point
3201 	 of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3202       if (sbc_idx < BITSET_WORDS)
3203 	{
3204           /* Build a tree for simple bracket.  */
3205           br_token.type = SIMPLE_BRACKET;
3206           br_token.opr.sbcset = sbcset;
3207           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3208           if (BE (work_tree == NULL, 0))
3209             goto parse_bracket_exp_espace;
3210 
3211           /* Then join them by ALT node.  */
3212           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3213           if (BE (work_tree == NULL, 0))
3214             goto parse_bracket_exp_espace;
3215 	}
3216       else
3217 	{
3218 	  re_free (sbcset);
3219 	  work_tree = mbc_tree;
3220 	}
3221     }
3222   else
3223 #endif /* not RE_ENABLE_I18N */
3224     {
3225 #ifdef RE_ENABLE_I18N
3226       free_charset (mbcset);
3227 #endif
3228       /* Build a tree for simple bracket.  */
3229       br_token.type = SIMPLE_BRACKET;
3230       br_token.opr.sbcset = sbcset;
3231       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3232       if (BE (work_tree == NULL, 0))
3233         goto parse_bracket_exp_espace;
3234     }
3235   return work_tree;
3236 
3237  parse_bracket_exp_espace:
3238   *err = REG_ESPACE;
3239  parse_bracket_exp_free_return:
3240   re_free (sbcset);
3241 #ifdef RE_ENABLE_I18N
3242   free_charset (mbcset);
3243 #endif /* RE_ENABLE_I18N */
3244   return NULL;
3245 }
3246 
3247 /* Parse an element in the bracket expression.  */
3248 
3249 static reg_errcode_t
parse_bracket_element(bracket_elem_t * elem,re_string_t * regexp,re_token_t * token,int token_len,re_dfa_t * dfa,reg_syntax_t syntax,bool accept_hyphen)3250 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3251 		       re_token_t *token, int token_len, re_dfa_t *dfa,
3252 		       reg_syntax_t syntax, bool accept_hyphen)
3253 {
3254 #ifdef RE_ENABLE_I18N
3255   int cur_char_size;
3256   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3257   if (cur_char_size > 1)
3258     {
3259       elem->type = MB_CHAR;
3260       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3261       re_string_skip_bytes (regexp, cur_char_size);
3262       return REG_NOERROR;
3263     }
3264 #endif /* RE_ENABLE_I18N */
3265   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3266   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3267       || token->type == OP_OPEN_EQUIV_CLASS)
3268     return parse_bracket_symbol (elem, regexp, token);
3269   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3270     {
3271       /* A '-' must only appear as anything but a range indicator before
3272 	 the closing bracket.  Everything else is an error.  */
3273       re_token_t token2;
3274       (void) peek_token_bracket (&token2, regexp, syntax);
3275       if (token2.type != OP_CLOSE_BRACKET)
3276 	/* The actual error value is not standardized since this whole
3277 	   case is undefined.  But ERANGE makes good sense.  */
3278 	return REG_ERANGE;
3279     }
3280   elem->type = SB_CHAR;
3281   elem->opr.ch = token->opr.c;
3282   return REG_NOERROR;
3283 }
3284 
3285 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3286    such as [:<character_class>:], [.<collating_element>.], and
3287    [=<equivalent_class>=].  */
3288 
3289 static reg_errcode_t
parse_bracket_symbol(bracket_elem_t * elem,re_string_t * regexp,re_token_t * token)3290 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3291 		      re_token_t *token)
3292 {
3293   unsigned char ch, delim = token->opr.c;
3294   int i = 0;
3295   if (re_string_eoi(regexp))
3296     return REG_EBRACK;
3297   for (;; ++i)
3298     {
3299       if (i >= BRACKET_NAME_BUF_SIZE)
3300 	return REG_EBRACK;
3301       if (token->type == OP_OPEN_CHAR_CLASS)
3302 	ch = re_string_fetch_byte_case (regexp);
3303       else
3304 	ch = re_string_fetch_byte (regexp);
3305       if (re_string_eoi(regexp))
3306 	return REG_EBRACK;
3307       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3308 	break;
3309       elem->opr.name[i] = ch;
3310     }
3311   re_string_skip_bytes (regexp, 1);
3312   elem->opr.name[i] = '\0';
3313   switch (token->type)
3314     {
3315     case OP_OPEN_COLL_ELEM:
3316       elem->type = COLL_SYM;
3317       break;
3318     case OP_OPEN_EQUIV_CLASS:
3319       elem->type = EQUIV_CLASS;
3320       break;
3321     case OP_OPEN_CHAR_CLASS:
3322       elem->type = CHAR_CLASS;
3323       break;
3324     default:
3325       break;
3326     }
3327   return REG_NOERROR;
3328 }
3329 
3330   /* Helper function for parse_bracket_exp.
3331      Build the equivalence class which is represented by NAME.
3332      The result are written to MBCSET and SBCSET.
3333      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3334      is a pointer argument sinse we may update it.  */
3335 
3336 static reg_errcode_t
build_equiv_class(bitset sbcset,re_charset_t * mbcset,Idx * equiv_class_alloc,const unsigned char * name)3337 build_equiv_class (bitset sbcset,
3338 #ifdef RE_ENABLE_I18N
3339 		   re_charset_t *mbcset, Idx *equiv_class_alloc,
3340 #endif
3341 		   const unsigned char *name)
3342 {
3343 #if defined _LIBC
3344   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3345   if (nrules != 0)
3346     {
3347       const int32_t *table, *indirect;
3348       const unsigned char *weights, *extra, *cp;
3349       unsigned char char_buf[2];
3350       int32_t idx1, idx2;
3351       unsigned int ch;
3352       size_t len;
3353       /* This #include defines a local function!  */
3354 # include <locale/weight.h>
3355       /* Calculate the index for equivalence class.  */
3356       cp = name;
3357       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3358       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3359 					       _NL_COLLATE_WEIGHTMB);
3360       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3361 						   _NL_COLLATE_EXTRAMB);
3362       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3363 						_NL_COLLATE_INDIRECTMB);
3364       idx1 = findidx (&cp);
3365       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3366 	/* This isn't a valid character.  */
3367 	return REG_ECOLLATE;
3368 
3369       /* Build single byte matcing table for this equivalence class.  */
3370       char_buf[1] = (unsigned char) '\0';
3371       len = weights[idx1];
3372       for (ch = 0; ch < SBC_MAX; ++ch)
3373 	{
3374 	  char_buf[0] = ch;
3375 	  cp = char_buf;
3376 	  idx2 = findidx (&cp);
3377 /*
3378 	  idx2 = table[ch];
3379 */
3380 	  if (idx2 == 0)
3381 	    /* This isn't a valid character.  */
3382 	    continue;
3383 	  if (len == weights[idx2])
3384 	    {
3385 	      int cnt = 0;
3386 	      while (cnt <= len &&
3387 		     weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3388 		++cnt;
3389 
3390 	      if (cnt > len)
3391 		bitset_set (sbcset, ch);
3392 	    }
3393 	}
3394       /* Check whether the array has enough space.  */
3395       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3396 	{
3397 	  /* Not enough, realloc it.  */
3398 	  Idx new_equiv_class_alloc = mbcset->nequiv_classes;
3399 	  /* Use realloc since the array is NULL if *alloc == 0.  */
3400 	  int32_t *new_equiv_classes = re_x2realloc (mbcset->equiv_classes,
3401 						     int32_t,
3402 						     &new_equiv_class_alloc);
3403 	  if (BE (new_equiv_classes == NULL, 0))
3404 	    return REG_ESPACE;
3405 	  mbcset->equiv_classes = new_equiv_classes;
3406 	  *equiv_class_alloc = new_equiv_class_alloc;
3407 	}
3408       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3409     }
3410   else
3411 #endif /* _LIBC */
3412     {
3413       if (BE (strlen ((const char *) name) != 1, 0))
3414 	return REG_ECOLLATE;
3415       bitset_set (sbcset, *name);
3416     }
3417   return REG_NOERROR;
3418 }
3419 
3420   /* Helper function for parse_bracket_exp.
3421      Build the character class which is represented by NAME.
3422      The result are written to MBCSET and SBCSET.
3423      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3424      is a pointer argument sinse we may update it.  */
3425 
3426 static reg_errcode_t
build_charclass(unsigned REG_TRANSLATE_TYPE trans,bitset sbcset,re_charset_t * mbcset,Idx * char_class_alloc,const unsigned char * class_name,reg_syntax_t syntax)3427 build_charclass (unsigned REG_TRANSLATE_TYPE trans, bitset sbcset,
3428 #ifdef RE_ENABLE_I18N
3429 		 re_charset_t *mbcset, Idx *char_class_alloc,
3430 #endif
3431 		 const unsigned char *class_name, reg_syntax_t syntax)
3432 {
3433   int i;
3434   const char *name = (const char *) class_name;
3435 
3436   /* In case of REG_ICASE "upper" and "lower" match the both of
3437      upper and lower cases.  */
3438   if ((syntax & REG_IGNORE_CASE)
3439       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3440     name = "alpha";
3441 
3442 #ifdef RE_ENABLE_I18N
3443   /* Check the space of the arrays.  */
3444   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3445     {
3446       /* Not enough, realloc it.  */
3447       Idx new_char_class_alloc = mbcset->nchar_classes;
3448       /* Use realloc since array is NULL if *alloc == 0.  */
3449       wctype_t *new_char_classes = re_x2realloc (mbcset->char_classes, wctype_t,
3450 						 &new_char_class_alloc);
3451       if (BE (new_char_classes == NULL, 0))
3452 	return REG_ESPACE;
3453       mbcset->char_classes = new_char_classes;
3454       *char_class_alloc = new_char_class_alloc;
3455     }
3456   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3457 #endif /* RE_ENABLE_I18N */
3458 
3459 #define BUILD_CHARCLASS_LOOP(ctype_func)	\
3460     for (i = 0; i < SBC_MAX; ++i)		\
3461       {						\
3462 	if (ctype_func (i))			\
3463 	  {					\
3464 	    int ch = trans ? trans[i] : i;	\
3465 	    bitset_set (sbcset, ch);		\
3466 	  }					\
3467       }
3468 
3469   if (strcmp (name, "alnum") == 0)
3470     BUILD_CHARCLASS_LOOP (isalnum)
3471   else if (strcmp (name, "cntrl") == 0)
3472     BUILD_CHARCLASS_LOOP (iscntrl)
3473   else if (strcmp (name, "lower") == 0)
3474     BUILD_CHARCLASS_LOOP (islower)
3475   else if (strcmp (name, "space") == 0)
3476     BUILD_CHARCLASS_LOOP (isspace)
3477   else if (strcmp (name, "alpha") == 0)
3478     BUILD_CHARCLASS_LOOP (isalpha)
3479   else if (strcmp (name, "digit") == 0)
3480     BUILD_CHARCLASS_LOOP (isdigit)
3481   else if (strcmp (name, "print") == 0)
3482     BUILD_CHARCLASS_LOOP (isprint)
3483   else if (strcmp (name, "upper") == 0)
3484     BUILD_CHARCLASS_LOOP (isupper)
3485   else if (strcmp (name, "blank") == 0)
3486     BUILD_CHARCLASS_LOOP (isblank)
3487   else if (strcmp (name, "graph") == 0)
3488     BUILD_CHARCLASS_LOOP (isgraph)
3489   else if (strcmp (name, "punct") == 0)
3490     BUILD_CHARCLASS_LOOP (ispunct)
3491   else if (strcmp (name, "xdigit") == 0)
3492     BUILD_CHARCLASS_LOOP (isxdigit)
3493   else
3494     return REG_ECTYPE;
3495 
3496   return REG_NOERROR;
3497 }
3498 
3499 static bin_tree_t *
build_charclass_op(re_dfa_t * dfa,unsigned REG_TRANSLATE_TYPE trans,const unsigned char * class_name,const unsigned char * extra,bool non_match,reg_errcode_t * err)3500 build_charclass_op (re_dfa_t *dfa, unsigned REG_TRANSLATE_TYPE trans,
3501 		    const unsigned char *class_name,
3502 		    const unsigned char *extra,
3503 		    bool non_match, reg_errcode_t *err)
3504 {
3505   re_bitset_ptr_t sbcset;
3506 #ifdef RE_ENABLE_I18N
3507   re_charset_t *mbcset;
3508   Idx alloc = 0;
3509 #endif /* not RE_ENABLE_I18N */
3510   reg_errcode_t ret;
3511   re_token_t br_token;
3512   bin_tree_t *tree;
3513 
3514   sbcset = re_calloc (bitset_word, BITSET_WORDS);
3515 #ifdef RE_ENABLE_I18N
3516   mbcset = re_calloc (re_charset_t, 1);
3517 #endif /* RE_ENABLE_I18N */
3518 
3519 #ifdef RE_ENABLE_I18N
3520   if (BE (sbcset == NULL || mbcset == NULL, 0))
3521 #else /* not RE_ENABLE_I18N */
3522   if (BE (sbcset == NULL, 0))
3523 #endif /* not RE_ENABLE_I18N */
3524     {
3525       *err = REG_ESPACE;
3526       return NULL;
3527     }
3528 
3529   if (non_match)
3530     {
3531 #ifdef RE_ENABLE_I18N
3532       /*
3533       if (syntax & REG_HAT_LISTS_NOT_NEWLINE)
3534 	bitset_set(cset->sbcset, '\0');
3535       */
3536       mbcset->non_match = 1;
3537 #endif /* not RE_ENABLE_I18N */
3538     }
3539 
3540   /* We don't care the syntax in this case.  */
3541   ret = build_charclass (trans, sbcset,
3542 #ifdef RE_ENABLE_I18N
3543 			 mbcset, &alloc,
3544 #endif /* RE_ENABLE_I18N */
3545 			 class_name, 0);
3546 
3547   if (BE (ret != REG_NOERROR, 0))
3548     {
3549       re_free (sbcset);
3550 #ifdef RE_ENABLE_I18N
3551       free_charset (mbcset);
3552 #endif /* RE_ENABLE_I18N */
3553       *err = ret;
3554       return NULL;
3555     }
3556   /* \w match '_' also.  */
3557   for (; *extra; extra++)
3558     bitset_set (sbcset, *extra);
3559 
3560   /* If it is non-matching list.  */
3561   if (non_match)
3562     bitset_not (sbcset);
3563 
3564 #ifdef RE_ENABLE_I18N
3565   /* Ensure only single byte characters are set.  */
3566   if (dfa->mb_cur_max > 1)
3567     bitset_mask (sbcset, dfa->sb_char);
3568 #endif
3569 
3570   /* Build a tree for simple bracket.  */
3571   br_token.type = SIMPLE_BRACKET;
3572   br_token.opr.sbcset = sbcset;
3573   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3574   if (BE (tree == NULL, 0))
3575     goto build_word_op_espace;
3576 
3577 #ifdef RE_ENABLE_I18N
3578   if (dfa->mb_cur_max > 1)
3579     {
3580       bin_tree_t *mbc_tree;
3581       /* Build a tree for complex bracket.  */
3582       br_token.type = COMPLEX_BRACKET;
3583       br_token.opr.mbcset = mbcset;
3584       dfa->has_mb_node = 1;
3585       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3586       if (BE (mbc_tree == NULL, 0))
3587 	goto build_word_op_espace;
3588       /* Then join them by ALT node.  */
3589       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3590       if (BE (mbc_tree != NULL, 1))
3591 	return tree;
3592     }
3593   else
3594     {
3595       free_charset (mbcset);
3596       return tree;
3597     }
3598 #else /* not RE_ENABLE_I18N */
3599   return tree;
3600 #endif /* not RE_ENABLE_I18N */
3601 
3602  build_word_op_espace:
3603   re_free (sbcset);
3604 #ifdef RE_ENABLE_I18N
3605   free_charset (mbcset);
3606 #endif /* RE_ENABLE_I18N */
3607   *err = REG_ESPACE;
3608   return NULL;
3609 }
3610 
3611 /* This is intended for the expressions like "a{1,3}".
3612    Fetch a number from `input', and return the number.
3613    Return REG_MISSING if the number field is empty like "{,1}".
3614    Return REG_ERROR if an error occurred.  */
3615 
3616 static Idx
fetch_number(re_string_t * input,re_token_t * token,reg_syntax_t syntax)3617 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3618 {
3619   Idx num = REG_MISSING;
3620   unsigned char c;
3621   while (1)
3622     {
3623       fetch_token (token, input, syntax);
3624       c = token->opr.c;
3625       if (BE (token->type == END_OF_RE, 0))
3626 	return REG_ERROR;
3627       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3628 	break;
3629       num = ((token->type != CHARACTER || c < '0' || '9' < c
3630 	      || num == REG_ERROR)
3631 	     ? REG_ERROR
3632 	     : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3633       num = (num > REG_DUP_MAX) ? REG_ERROR : num;
3634     }
3635   return num;
3636 }
3637 
3638 #ifdef RE_ENABLE_I18N
3639 static void
free_charset(re_charset_t * cset)3640 free_charset (re_charset_t *cset)
3641 {
3642   re_free (cset->mbchars);
3643 # ifdef _LIBC
3644   re_free (cset->coll_syms);
3645   re_free (cset->equiv_classes);
3646   re_free (cset->range_starts);
3647   re_free (cset->range_ends);
3648 # endif
3649   re_free (cset->char_classes);
3650   re_free (cset);
3651 }
3652 #endif /* RE_ENABLE_I18N */
3653 
3654 /* Functions for binary tree operation.  */
3655 
3656 /* Create a tree node.  */
3657 
3658 static bin_tree_t *
create_tree(re_dfa_t * dfa,bin_tree_t * left,bin_tree_t * right,re_token_type_t type)3659 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3660 	     re_token_type_t type)
3661 {
3662   re_token_t t;
3663   t.type = type;
3664   return create_token_tree (dfa, left, right, &t);
3665 }
3666 
3667 static bin_tree_t *
create_token_tree(re_dfa_t * dfa,bin_tree_t * left,bin_tree_t * right,const re_token_t * token)3668 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3669 		   const re_token_t *token)
3670 {
3671   bin_tree_t *tree;
3672   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3673     {
3674       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3675 
3676       if (storage == NULL)
3677 	return NULL;
3678       storage->next = dfa->str_tree_storage;
3679       dfa->str_tree_storage = storage;
3680       dfa->str_tree_storage_idx = 0;
3681     }
3682   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3683 
3684   tree->parent = NULL;
3685   tree->left = left;
3686   tree->right = right;
3687   tree->token = *token;
3688   tree->token.duplicated = 0;
3689   tree->token.opt_subexp = 0;
3690   tree->first = NULL;
3691   tree->next = NULL;
3692   tree->node_idx = REG_MISSING;
3693 
3694   if (left != NULL)
3695     left->parent = tree;
3696   if (right != NULL)
3697     right->parent = tree;
3698   return tree;
3699 }
3700 
3701 /* Mark the tree SRC as an optional subexpression.
3702    To be called from preorder or postorder.  */
3703 
3704 static reg_errcode_t
mark_opt_subexp(void * extra,bin_tree_t * node)3705 mark_opt_subexp (void *extra, bin_tree_t *node)
3706 {
3707   Idx idx = (Idx) (long) extra;
3708   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3709     node->token.opt_subexp = 1;
3710 
3711   return REG_NOERROR;
3712 }
3713 
3714 /* Free the allocated memory inside NODE. */
3715 
3716 static void
free_token(re_token_t * node)3717 free_token (re_token_t *node)
3718 {
3719 #ifdef RE_ENABLE_I18N
3720   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3721     free_charset (node->opr.mbcset);
3722   else
3723 #endif /* RE_ENABLE_I18N */
3724     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3725       re_free (node->opr.sbcset);
3726 }
3727 
3728 /* Worker function for tree walking.  Free the allocated memory inside NODE
3729    and its children. */
3730 
3731 static reg_errcode_t
free_tree(void * extra,bin_tree_t * node)3732 free_tree (void *extra, bin_tree_t *node)
3733 {
3734   free_token (&node->token);
3735   return REG_NOERROR;
3736 }
3737 
3738 
3739 /* Duplicate the node SRC, and return new node.  This is a preorder
3740    visit similar to the one implemented by the generic visitor, but
3741    we need more infrastructure to maintain two parallel trees --- so,
3742    it's easier to duplicate.  */
3743 
3744 static bin_tree_t *
duplicate_tree(const bin_tree_t * root,re_dfa_t * dfa)3745 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3746 {
3747   const bin_tree_t *node;
3748   bin_tree_t *dup_root;
3749   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3750 
3751   for (node = root; ; )
3752     {
3753       /* Create a new tree and link it back to the current parent.  */
3754       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3755       if (*p_new == NULL)
3756 	return NULL;
3757       (*p_new)->parent = dup_node;
3758       (*p_new)->token.duplicated = 1;
3759       dup_node = *p_new;
3760 
3761       /* Go to the left node, or up and to the right.  */
3762       if (node->left)
3763 	{
3764 	  node = node->left;
3765 	  p_new = &dup_node->left;
3766 	}
3767       else
3768 	{
3769 	  const bin_tree_t *prev = NULL;
3770 	  while (node->right == prev || node->right == NULL)
3771 	    {
3772 	      prev = node;
3773 	      node = node->parent;
3774 	      dup_node = dup_node->parent;
3775 	      if (!node)
3776 	        return dup_root;
3777 	    }
3778 	  node = node->right;
3779 	  p_new = &dup_node->right;
3780 	}
3781     }
3782 }
3783