xref: /onnv-gate/usr/src/lib/libparted/common/lib/regcomp.c (revision 9663:ace9a2ac3683)
1 /* Extended regular expression matching and search library.
2    Copyright (C) 2002,2003,2004,2005,2006,2007 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5 
6    This program is free software; you can redistribute it and/or modify
7    it under the terms of the GNU General Public License as published by
8    the Free Software Foundation; either version 2, or (at your option)
9    any later version.
10 
11    This program is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14    GNU General Public License for more details.
15 
16    You should have received a copy of the GNU General Public License along
17    with this program; if not, write to the Free Software Foundation,
18    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
19 
20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
21 					  size_t length, reg_syntax_t syntax);
22 static void re_compile_fastmap_iter (regex_t *bufp,
23 				     const re_dfastate_t *init_state,
24 				     char *fastmap);
25 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
26 #ifdef RE_ENABLE_I18N
27 static void free_charset (re_charset_t *cset);
28 #endif /* RE_ENABLE_I18N */
29 static void free_workarea_compile (regex_t *preg);
30 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
31 #ifdef RE_ENABLE_I18N
32 static void optimize_utf8 (re_dfa_t *dfa);
33 #endif
34 static reg_errcode_t analyze (regex_t *preg);
35 static reg_errcode_t preorder (bin_tree_t *root,
36 			       reg_errcode_t (fn (void *, bin_tree_t *)),
37 			       void *extra);
38 static reg_errcode_t postorder (bin_tree_t *root,
39 				reg_errcode_t (fn (void *, bin_tree_t *)),
40 				void *extra);
41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
44 				 bin_tree_t *node);
45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
48 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
49 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
50 				   unsigned int constraint);
51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
53 					 Idx node, bool root);
54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
55 static Idx fetch_number (re_string_t *input, re_token_t *token,
56 			 reg_syntax_t syntax);
57 static int peek_token (re_token_t *token, re_string_t *input,
58 			reg_syntax_t syntax) internal_function;
59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
60 			  reg_syntax_t syntax, reg_errcode_t *err);
61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
62 				  re_token_t *token, reg_syntax_t syntax,
63 				  Idx nest, reg_errcode_t *err);
64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
65 				 re_token_t *token, reg_syntax_t syntax,
66 				 Idx nest, reg_errcode_t *err);
67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
68 				     re_token_t *token, reg_syntax_t syntax,
69 				     Idx nest, reg_errcode_t *err);
70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
71 				  re_token_t *token, reg_syntax_t syntax,
72 				  Idx nest, reg_errcode_t *err);
73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
74 				 re_dfa_t *dfa, re_token_t *token,
75 				 reg_syntax_t syntax, reg_errcode_t *err);
76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
77 				      re_token_t *token, reg_syntax_t syntax,
78 				      reg_errcode_t *err);
79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
80 					    re_string_t *regexp,
81 					    re_token_t *token, int token_len,
82 					    re_dfa_t *dfa,
83 					    reg_syntax_t syntax,
84 					    bool accept_hyphen);
85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
86 					  re_string_t *regexp,
87 					  re_token_t *token);
88 #ifdef RE_ENABLE_I18N
89 static reg_errcode_t build_equiv_class (bitset_t sbcset,
90 					re_charset_t *mbcset,
91 					Idx *equiv_class_alloc,
92 					const unsigned char *name);
93 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
94 				      bitset_t sbcset,
95 				      re_charset_t *mbcset,
96 				      Idx *char_class_alloc,
97 				      const unsigned char *class_name,
98 				      reg_syntax_t syntax);
99 #else  /* not RE_ENABLE_I18N */
100 static reg_errcode_t build_equiv_class (bitset_t sbcset,
101 					const unsigned char *name);
102 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
103 				      bitset_t sbcset,
104 				      const unsigned char *class_name,
105 				      reg_syntax_t syntax);
106 #endif /* not RE_ENABLE_I18N */
107 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
108 				       RE_TRANSLATE_TYPE trans,
109 				       const unsigned char *class_name,
110 				       const unsigned char *extra,
111 				       bool non_match, reg_errcode_t *err);
112 static bin_tree_t *create_tree (re_dfa_t *dfa,
113 				bin_tree_t *left, bin_tree_t *right,
114 				re_token_type_t type);
115 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
116 				      bin_tree_t *left, bin_tree_t *right,
117 				      const re_token_t *token);
118 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
119 static void free_token (re_token_t *node);
120 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
121 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
122 
123 /* This table gives an error message for each of the error codes listed
124    in regex.h.  Obviously the order here has to be same as there.
125    POSIX doesn't require that we do anything for REG_NOERROR,
126    but why not be nice?  */
127 
128 static const char __re_error_msgid[] =
129   {
130 #define REG_NOERROR_IDX	0
131     gettext_noop ("Success")	/* REG_NOERROR */
132     "\0"
133 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
134     gettext_noop ("No match")	/* REG_NOMATCH */
135     "\0"
136 #define REG_BADPAT_IDX	(REG_NOMATCH_IDX + sizeof "No match")
137     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
138     "\0"
139 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
140     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
141     "\0"
142 #define REG_ECTYPE_IDX	(REG_ECOLLATE_IDX + sizeof "Invalid collation character")
143     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
144     "\0"
145 #define REG_EESCAPE_IDX	(REG_ECTYPE_IDX + sizeof "Invalid character class name")
146     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
147     "\0"
148 #define REG_ESUBREG_IDX	(REG_EESCAPE_IDX + sizeof "Trailing backslash")
149     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
150     "\0"
151 #define REG_EBRACK_IDX	(REG_ESUBREG_IDX + sizeof "Invalid back reference")
152     gettext_noop ("Unmatched [ or [^")	/* REG_EBRACK */
153     "\0"
154 #define REG_EPAREN_IDX	(REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
155     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
156     "\0"
157 #define REG_EBRACE_IDX	(REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
158     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
159     "\0"
160 #define REG_BADBR_IDX	(REG_EBRACE_IDX + sizeof "Unmatched \\{")
161     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
162     "\0"
163 #define REG_ERANGE_IDX	(REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
164     gettext_noop ("Invalid range end")	/* REG_ERANGE */
165     "\0"
166 #define REG_ESPACE_IDX	(REG_ERANGE_IDX + sizeof "Invalid range end")
167     gettext_noop ("Memory exhausted") /* REG_ESPACE */
168     "\0"
169 #define REG_BADRPT_IDX	(REG_ESPACE_IDX + sizeof "Memory exhausted")
170     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
171     "\0"
172 #define REG_EEND_IDX	(REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
173     gettext_noop ("Premature end of regular expression") /* REG_EEND */
174     "\0"
175 #define REG_ESIZE_IDX	(REG_EEND_IDX + sizeof "Premature end of regular expression")
176     gettext_noop ("Regular expression too big") /* REG_ESIZE */
177     "\0"
178 #define REG_ERPAREN_IDX	(REG_ESIZE_IDX + sizeof "Regular expression too big")
179     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
180   };
181 
182 static const size_t __re_error_msgid_idx[] =
183   {
184     REG_NOERROR_IDX,
185     REG_NOMATCH_IDX,
186     REG_BADPAT_IDX,
187     REG_ECOLLATE_IDX,
188     REG_ECTYPE_IDX,
189     REG_EESCAPE_IDX,
190     REG_ESUBREG_IDX,
191     REG_EBRACK_IDX,
192     REG_EPAREN_IDX,
193     REG_EBRACE_IDX,
194     REG_BADBR_IDX,
195     REG_ERANGE_IDX,
196     REG_ESPACE_IDX,
197     REG_BADRPT_IDX,
198     REG_EEND_IDX,
199     REG_ESIZE_IDX,
200     REG_ERPAREN_IDX
201   };
202 
203 /* Entry points for GNU code.  */
204 
205 /* re_compile_pattern is the GNU regular expression compiler: it
206    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
207    Returns 0 if the pattern was valid, otherwise an error string.
208 
209    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
210    are set in BUFP on entry.  */
211 
212 #ifdef _LIBC
213 const char *
re_compile_pattern(pattern,length,bufp)214 re_compile_pattern (pattern, length, bufp)
215     const char *pattern;
216     size_t length;
217     struct re_pattern_buffer *bufp;
218 #else /* size_t might promote */
219 const char *
220 re_compile_pattern (const char *pattern, size_t length,
221 		    struct re_pattern_buffer *bufp)
222 #endif
223 {
224   reg_errcode_t ret;
225 
226   /* And GNU code determines whether or not to get register information
227      by passing null for the REGS argument to re_match, etc., not by
228      setting no_sub, unless RE_NO_SUB is set.  */
229   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
230 
231   /* Match anchors at newline.  */
232   bufp->newline_anchor = 1;
233 
234   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
235 
236   if (!ret)
237     return NULL;
238   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
239 }
240 #ifdef _LIBC
weak_alias(__re_compile_pattern,re_compile_pattern)241 weak_alias (__re_compile_pattern, re_compile_pattern)
242 #endif
243 
244 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
245    also be assigned to arbitrarily: each pattern buffer stores its own
246    syntax, so it can be changed between regex compilations.  */
247 /* This has no initializer because initialized variables in Emacs
248    become read-only after dumping.  */
249 reg_syntax_t re_syntax_options;
250 
251 
252 /* Specify the precise syntax of regexps for compilation.  This provides
253    for compatibility for various utilities which historically have
254    different, incompatible syntaxes.
255 
256    The argument SYNTAX is a bit mask comprised of the various bits
257    defined in regex.h.  We return the old syntax.  */
258 
259 reg_syntax_t
260 re_set_syntax (syntax)
261     reg_syntax_t syntax;
262 {
263   reg_syntax_t ret = re_syntax_options;
264 
265   re_syntax_options = syntax;
266   return ret;
267 }
268 #ifdef _LIBC
269 weak_alias (__re_set_syntax, re_set_syntax)
270 #endif
271 
272 int
273 re_compile_fastmap (bufp)
274     struct re_pattern_buffer *bufp;
275 {
276   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
277   char *fastmap = bufp->fastmap;
278 
279   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
280   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
281   if (dfa->init_state != dfa->init_state_word)
282     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
283   if (dfa->init_state != dfa->init_state_nl)
284     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
285   if (dfa->init_state != dfa->init_state_begbuf)
286     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
287   bufp->fastmap_accurate = 1;
288   return 0;
289 }
290 #ifdef _LIBC
weak_alias(__re_compile_fastmap,re_compile_fastmap)291 weak_alias (__re_compile_fastmap, re_compile_fastmap)
292 #endif
293 
294 static inline void
295 __attribute ((always_inline))
296 re_set_fastmap (char *fastmap, bool icase, int ch)
297 {
298   fastmap[ch] = 1;
299   if (icase)
300     fastmap[tolower (ch)] = 1;
301 }
302 
303 /* Helper function for re_compile_fastmap.
304    Compile fastmap for the initial_state INIT_STATE.  */
305 
306 static void
re_compile_fastmap_iter(regex_t * bufp,const re_dfastate_t * init_state,char * fastmap)307 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
308 			 char *fastmap)
309 {
310   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
311   Idx node_cnt;
312   bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
313   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
314     {
315       Idx node = init_state->nodes.elems[node_cnt];
316       re_token_type_t type = dfa->nodes[node].type;
317 
318       if (type == CHARACTER)
319 	{
320 	  re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
321 #ifdef RE_ENABLE_I18N
322 	  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
323 	    {
324 	      unsigned char buf[MB_LEN_MAX];
325 	      unsigned char *p;
326 	      wchar_t wc;
327 	      mbstate_t state;
328 
329 	      p = buf;
330 	      *p++ = dfa->nodes[node].opr.c;
331 	      while (++node < dfa->nodes_len
332 		     &&	dfa->nodes[node].type == CHARACTER
333 		     && dfa->nodes[node].mb_partial)
334 		*p++ = dfa->nodes[node].opr.c;
335 	      memset (&state, '\0', sizeof (state));
336 	      if (mbrtowc (&wc, (const char *) buf, p - buf,
337 			   &state) == p - buf
338 		  && (__wcrtomb ((char *) buf, towlower (wc), &state)
339 		      != (size_t) -1))
340 		re_set_fastmap (fastmap, false, buf[0]);
341 	    }
342 #endif
343 	}
344       else if (type == SIMPLE_BRACKET)
345 	{
346 	  int i, ch;
347 	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
348 	    {
349 	      int j;
350 	      bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
351 	      for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
352 		if (w & ((bitset_word_t) 1 << j))
353 		  re_set_fastmap (fastmap, icase, ch);
354 	    }
355 	}
356 #ifdef RE_ENABLE_I18N
357       else if (type == COMPLEX_BRACKET)
358 	{
359 	  Idx i;
360 	  re_charset_t *cset = dfa->nodes[node].opr.mbcset;
361 	  if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
362 	      || cset->nranges || cset->nchar_classes)
363 	    {
364 # ifdef _LIBC
365 	      if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
366 		{
367 		  /* In this case we want to catch the bytes which are
368 		     the first byte of any collation elements.
369 		     e.g. In da_DK, we want to catch 'a' since "aa"
370 			  is a valid collation element, and don't catch
371 			  'b' since 'b' is the only collation element
372 			  which starts from 'b'.  */
373 		  const int32_t *table = (const int32_t *)
374 		    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
375 		  for (i = 0; i < SBC_MAX; ++i)
376 		    if (table[i] < 0)
377 		      re_set_fastmap (fastmap, icase, i);
378 		}
379 # else
380 	      if (dfa->mb_cur_max > 1)
381 		for (i = 0; i < SBC_MAX; ++i)
382 		  if (__btowc (i) == WEOF)
383 		    re_set_fastmap (fastmap, icase, i);
384 # endif /* not _LIBC */
385 	    }
386 	  for (i = 0; i < cset->nmbchars; ++i)
387 	    {
388 	      char buf[256];
389 	      mbstate_t state;
390 	      memset (&state, '\0', sizeof (state));
391 	      if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
392 		re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
393 	      if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
394 		{
395 		  if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
396 		      != (size_t) -1)
397 		    re_set_fastmap (fastmap, false, *(unsigned char *) buf);
398 		}
399 	    }
400 	}
401 #endif /* RE_ENABLE_I18N */
402       else if (type == OP_PERIOD
403 #ifdef RE_ENABLE_I18N
404 	       || type == OP_UTF8_PERIOD
405 #endif /* RE_ENABLE_I18N */
406 	       || type == END_OF_RE)
407 	{
408 	  memset (fastmap, '\1', sizeof (char) * SBC_MAX);
409 	  if (type == END_OF_RE)
410 	    bufp->can_be_null = 1;
411 	  return;
412 	}
413     }
414 }
415 
416 /* Entry point for POSIX code.  */
417 /* regcomp takes a regular expression as a string and compiles it.
418 
419    PREG is a regex_t *.  We do not expect any fields to be initialized,
420    since POSIX says we shouldn't.  Thus, we set
421 
422      `buffer' to the compiled pattern;
423      `used' to the length of the compiled pattern;
424      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
425        REG_EXTENDED bit in CFLAGS is set; otherwise, to
426        RE_SYNTAX_POSIX_BASIC;
427      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
428      `fastmap' to an allocated space for the fastmap;
429      `fastmap_accurate' to zero;
430      `re_nsub' to the number of subexpressions in PATTERN.
431 
432    PATTERN is the address of the pattern string.
433 
434    CFLAGS is a series of bits which affect compilation.
435 
436      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
437      use POSIX basic syntax.
438 
439      If REG_NEWLINE is set, then . and [^...] don't match newline.
440      Also, regexec will try a match beginning after every newline.
441 
442      If REG_ICASE is set, then we considers upper- and lowercase
443      versions of letters to be equivalent when matching.
444 
445      If REG_NOSUB is set, then when PREG is passed to regexec, that
446      routine will report only success or failure, and nothing about the
447      registers.
448 
449    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
450    the return codes and their meanings.)  */
451 
452 int
regcomp(preg,pattern,cflags)453 regcomp (preg, pattern, cflags)
454     regex_t *_Restrict_ preg;
455     const char *_Restrict_ pattern;
456     int cflags;
457 {
458   reg_errcode_t ret;
459   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
460 			 : RE_SYNTAX_POSIX_BASIC);
461 
462   preg->buffer = NULL;
463   preg->allocated = 0;
464   preg->used = 0;
465 
466   /* Try to allocate space for the fastmap.  */
467   preg->fastmap = re_malloc (char, SBC_MAX);
468   if (BE (preg->fastmap == NULL, 0))
469     return REG_ESPACE;
470 
471   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
472 
473   /* If REG_NEWLINE is set, newlines are treated differently.  */
474   if (cflags & REG_NEWLINE)
475     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
476       syntax &= ~RE_DOT_NEWLINE;
477       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
478       /* It also changes the matching behavior.  */
479       preg->newline_anchor = 1;
480     }
481   else
482     preg->newline_anchor = 0;
483   preg->no_sub = !!(cflags & REG_NOSUB);
484   preg->translate = NULL;
485 
486   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
487 
488   /* POSIX doesn't distinguish between an unmatched open-group and an
489      unmatched close-group: both are REG_EPAREN.  */
490   if (ret == REG_ERPAREN)
491     ret = REG_EPAREN;
492 
493   /* We have already checked preg->fastmap != NULL.  */
494   if (BE (ret == REG_NOERROR, 1))
495     /* Compute the fastmap now, since regexec cannot modify the pattern
496        buffer.  This function never fails in this implementation.  */
497     (void) re_compile_fastmap (preg);
498   else
499     {
500       /* Some error occurred while compiling the expression.  */
501       re_free (preg->fastmap);
502       preg->fastmap = NULL;
503     }
504 
505   return (int) ret;
506 }
507 #ifdef _LIBC
508 weak_alias (__regcomp, regcomp)
509 #endif
510 
511 /* Returns a message corresponding to an error code, ERRCODE, returned
512    from either regcomp or regexec.   We don't use PREG here.  */
513 
514 #ifdef _LIBC
515 size_t
516 regerror (errcode, preg, errbuf, errbuf_size)
517     int errcode;
518     const regex_t *_Restrict_ preg;
519     char *_Restrict_ errbuf;
520     size_t errbuf_size;
521 #else /* size_t might promote */
522 size_t
523 regerror (int errcode, const regex_t *_Restrict_ preg,
524 	  char *_Restrict_ errbuf, size_t errbuf_size)
525 #endif
526 {
527   const char *msg;
528   size_t msg_size;
529 
530   if (BE (errcode < 0
531 	  || errcode >= (int) (sizeof (__re_error_msgid_idx)
532 			       / sizeof (__re_error_msgid_idx[0])), 0))
533     /* Only error codes returned by the rest of the code should be passed
534        to this routine.  If we are given anything else, or if other regex
535        code generates an invalid error code, then the program has a bug.
536        Dump core so we can fix it.  */
537     abort ();
538 
539   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
540 
541   msg_size = strlen (msg) + 1; /* Includes the null.  */
542 
543   if (BE (errbuf_size != 0, 1))
544     {
545       size_t cpy_size = msg_size;
546       if (BE (msg_size > errbuf_size, 0))
547 	{
548 	  cpy_size = errbuf_size - 1;
549 	  errbuf[cpy_size] = '\0';
550 	}
551       memcpy (errbuf, msg, cpy_size);
552     }
553 
554   return msg_size;
555 }
556 #ifdef _LIBC
557 weak_alias (__regerror, regerror)
558 #endif
559 
560 
561 #ifdef RE_ENABLE_I18N
562 /* This static array is used for the map to single-byte characters when
563    UTF-8 is used.  Otherwise we would allocate memory just to initialize
564    it the same all the time.  UTF-8 is the preferred encoding so this is
565    a worthwhile optimization.  */
566 static const bitset_t utf8_sb_map =
567 {
568   /* Set the first 128 bits.  */
569 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
570 #  error "bitset_word_t is narrower than 32 bits"
571 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
572   BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
573 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
574   BITSET_WORD_MAX, BITSET_WORD_MAX,
575 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
576   BITSET_WORD_MAX,
577 # endif
578   (BITSET_WORD_MAX
579    >> (SBC_MAX % BITSET_WORD_BITS == 0
580        ? 0
581        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
582 };
583 #endif
584 
585 
586 static void
free_dfa_content(re_dfa_t * dfa)587 free_dfa_content (re_dfa_t *dfa)
588 {
589   Idx i, j;
590 
591   if (dfa->nodes)
592     for (i = 0; i < dfa->nodes_len; ++i)
593       free_token (dfa->nodes + i);
594   re_free (dfa->nexts);
595   for (i = 0; i < dfa->nodes_len; ++i)
596     {
597       if (dfa->eclosures != NULL)
598 	re_node_set_free (dfa->eclosures + i);
599       if (dfa->inveclosures != NULL)
600 	re_node_set_free (dfa->inveclosures + i);
601       if (dfa->edests != NULL)
602 	re_node_set_free (dfa->edests + i);
603     }
604   re_free (dfa->edests);
605   re_free (dfa->eclosures);
606   re_free (dfa->inveclosures);
607   re_free (dfa->nodes);
608 
609   if (dfa->state_table)
610     for (i = 0; i <= dfa->state_hash_mask; ++i)
611       {
612 	struct re_state_table_entry *entry = dfa->state_table + i;
613 	for (j = 0; j < entry->num; ++j)
614 	  {
615 	    re_dfastate_t *state = entry->array[j];
616 	    free_state (state);
617 	  }
618         re_free (entry->array);
619       }
620   re_free (dfa->state_table);
621 #ifdef RE_ENABLE_I18N
622   if (dfa->sb_char != utf8_sb_map)
623     re_free (dfa->sb_char);
624 #endif
625   re_free (dfa->subexp_map);
626 #ifdef DEBUG
627   re_free (dfa->re_str);
628 #endif
629 
630   re_free (dfa);
631 }
632 
633 
634 /* Free dynamically allocated space used by PREG.  */
635 
636 void
regfree(preg)637 regfree (preg)
638     regex_t *preg;
639 {
640   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
641   if (BE (dfa != NULL, 1))
642     free_dfa_content (dfa);
643   preg->buffer = NULL;
644   preg->allocated = 0;
645 
646   re_free (preg->fastmap);
647   preg->fastmap = NULL;
648 
649   re_free (preg->translate);
650   preg->translate = NULL;
651 }
652 #ifdef _LIBC
653 weak_alias (__regfree, regfree)
654 #endif
655 
656 /* Entry points compatible with 4.2 BSD regex library.  We don't define
657    them unless specifically requested.  */
658 
659 #if defined _REGEX_RE_COMP || defined _LIBC
660 
661 /* BSD has one and only one pattern buffer.  */
662 static struct re_pattern_buffer re_comp_buf;
663 
664 char *
665 # ifdef _LIBC
666 /* Make these definitions weak in libc, so POSIX programs can redefine
667    these names if they don't use our functions, and still use
668    regcomp/regexec above without link errors.  */
669 weak_function
670 # endif
re_comp(s)671 re_comp (s)
672      const char *s;
673 {
674   reg_errcode_t ret;
675   char *fastmap;
676 
677   if (!s)
678     {
679       if (!re_comp_buf.buffer)
680 	return gettext ("No previous regular expression");
681       return 0;
682     }
683 
684   if (re_comp_buf.buffer)
685     {
686       fastmap = re_comp_buf.fastmap;
687       re_comp_buf.fastmap = NULL;
688       __regfree (&re_comp_buf);
689       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
690       re_comp_buf.fastmap = fastmap;
691     }
692 
693   if (re_comp_buf.fastmap == NULL)
694     {
695       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
696       if (re_comp_buf.fastmap == NULL)
697 	return (char *) gettext (__re_error_msgid
698 				 + __re_error_msgid_idx[(int) REG_ESPACE]);
699     }
700 
701   /* Since `re_exec' always passes NULL for the `regs' argument, we
702      don't need to initialize the pattern buffer fields which affect it.  */
703 
704   /* Match anchors at newlines.  */
705   re_comp_buf.newline_anchor = 1;
706 
707   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
708 
709   if (!ret)
710     return NULL;
711 
712   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
713   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
714 }
715 
716 #ifdef _LIBC
libc_freeres_fn(free_mem)717 libc_freeres_fn (free_mem)
718 {
719   __regfree (&re_comp_buf);
720 }
721 #endif
722 
723 #endif /* _REGEX_RE_COMP */
724 
725 /* Internal entry point.
726    Compile the regular expression PATTERN, whose length is LENGTH.
727    SYNTAX indicate regular expression's syntax.  */
728 
729 static reg_errcode_t
re_compile_internal(regex_t * preg,const char * pattern,size_t length,reg_syntax_t syntax)730 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
731 		     reg_syntax_t syntax)
732 {
733   reg_errcode_t err = REG_NOERROR;
734   re_dfa_t *dfa;
735   re_string_t regexp;
736 
737   /* Initialize the pattern buffer.  */
738   preg->fastmap_accurate = 0;
739   preg->syntax = syntax;
740   preg->not_bol = preg->not_eol = 0;
741   preg->used = 0;
742   preg->re_nsub = 0;
743   preg->can_be_null = 0;
744   preg->regs_allocated = REGS_UNALLOCATED;
745 
746   /* Initialize the dfa.  */
747   dfa = (re_dfa_t *) preg->buffer;
748   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
749     {
750       /* If zero allocated, but buffer is non-null, try to realloc
751 	 enough space.  This loses if buffer's address is bogus, but
752 	 that is the user's responsibility.  If ->buffer is NULL this
753 	 is a simple allocation.  */
754       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
755       if (dfa == NULL)
756 	return REG_ESPACE;
757       preg->allocated = sizeof (re_dfa_t);
758       preg->buffer = (unsigned char *) dfa;
759     }
760   preg->used = sizeof (re_dfa_t);
761 
762   err = init_dfa (dfa, length);
763   if (BE (err != REG_NOERROR, 0))
764     {
765       free_dfa_content (dfa);
766       preg->buffer = NULL;
767       preg->allocated = 0;
768       return err;
769     }
770 #ifdef DEBUG
771   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
772   dfa->re_str = re_malloc (char, length + 1);
773   strncpy (dfa->re_str, pattern, length + 1);
774 #endif
775 
776   __libc_lock_init (dfa->lock);
777 
778   err = re_string_construct (&regexp, pattern, length, preg->translate,
779 			     syntax & RE_ICASE, dfa);
780   if (BE (err != REG_NOERROR, 0))
781     {
782     re_compile_internal_free_return:
783       free_workarea_compile (preg);
784       re_string_destruct (&regexp);
785       free_dfa_content (dfa);
786       preg->buffer = NULL;
787       preg->allocated = 0;
788       return err;
789     }
790 
791   /* Parse the regular expression, and build a structure tree.  */
792   preg->re_nsub = 0;
793   dfa->str_tree = parse (&regexp, preg, syntax, &err);
794   if (BE (dfa->str_tree == NULL, 0))
795     goto re_compile_internal_free_return;
796 
797   /* Analyze the tree and create the nfa.  */
798   err = analyze (preg);
799   if (BE (err != REG_NOERROR, 0))
800     goto re_compile_internal_free_return;
801 
802 #ifdef RE_ENABLE_I18N
803   /* If possible, do searching in single byte encoding to speed things up.  */
804   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
805     optimize_utf8 (dfa);
806 #endif
807 
808   /* Then create the initial state of the dfa.  */
809   err = create_initial_state (dfa);
810 
811   /* Release work areas.  */
812   free_workarea_compile (preg);
813   re_string_destruct (&regexp);
814 
815   if (BE (err != REG_NOERROR, 0))
816     {
817       free_dfa_content (dfa);
818       preg->buffer = NULL;
819       preg->allocated = 0;
820     }
821 
822   return err;
823 }
824 
825 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
826    as the initial length of some arrays.  */
827 
828 static reg_errcode_t
init_dfa(re_dfa_t * dfa,size_t pat_len)829 init_dfa (re_dfa_t *dfa, size_t pat_len)
830 {
831   __re_size_t table_size;
832 #ifdef RE_ENABLE_I18N
833   size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
834 #else
835   size_t max_i18n_object_size = 0;
836 #endif
837   size_t max_object_size =
838     MAX (sizeof (struct re_state_table_entry),
839 	 MAX (sizeof (re_token_t),
840 	      MAX (sizeof (re_node_set),
841 		   MAX (sizeof (regmatch_t),
842 			max_i18n_object_size))));
843 
844   memset (dfa, '\0', sizeof (re_dfa_t));
845 
846   /* Force allocation of str_tree_storage the first time.  */
847   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
848 
849   /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
850      calculation below, and for similar doubling calculations
851      elsewhere.  And it's <= rather than <, because some of the
852      doubling calculations add 1 afterwards.  */
853   if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0))
854     return REG_ESPACE;
855 
856   dfa->nodes_alloc = pat_len + 1;
857   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
858 
859   /*  table_size = 2 ^ ceil(log pat_len) */
860   for (table_size = 1; ; table_size <<= 1)
861     if (table_size > pat_len)
862       break;
863 
864   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
865   dfa->state_hash_mask = table_size - 1;
866 
867   dfa->mb_cur_max = MB_CUR_MAX;
868 #ifdef _LIBC
869   if (dfa->mb_cur_max == 6
870       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
871     dfa->is_utf8 = 1;
872   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
873 		       != 0);
874 #else
875   if (strcmp (locale_charset (), "UTF-8") == 0)
876     dfa->is_utf8 = 1;
877 
878   /* We check exhaustively in the loop below if this charset is a
879      superset of ASCII.  */
880   dfa->map_notascii = 0;
881 #endif
882 
883 #ifdef RE_ENABLE_I18N
884   if (dfa->mb_cur_max > 1)
885     {
886       if (dfa->is_utf8)
887 	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
888       else
889 	{
890 	  int i, j, ch;
891 
892 	  dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
893 	  if (BE (dfa->sb_char == NULL, 0))
894 	    return REG_ESPACE;
895 
896 	  /* Set the bits corresponding to single byte chars.  */
897 	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
898 	    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
899 	      {
900 		wint_t wch = __btowc (ch);
901 		if (wch != WEOF)
902 		  dfa->sb_char[i] |= (bitset_word_t) 1 << j;
903 # ifndef _LIBC
904 		if (isascii (ch) && wch != ch)
905 		  dfa->map_notascii = 1;
906 # endif
907 	      }
908 	}
909     }
910 #endif
911 
912   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
913     return REG_ESPACE;
914   return REG_NOERROR;
915 }
916 
917 /* Initialize WORD_CHAR table, which indicate which character is
918    "word".  In this case "word" means that it is the word construction
919    character used by some operators like "\<", "\>", etc.  */
920 
921 static void
922 internal_function
init_word_char(re_dfa_t * dfa)923 init_word_char (re_dfa_t *dfa)
924 {
925   int i, j, ch;
926   dfa->word_ops_used = 1;
927   for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
928     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
929       if (isalnum (ch) || ch == '_')
930 	dfa->word_char[i] |= (bitset_word_t) 1 << j;
931 }
932 
933 /* Free the work area which are only used while compiling.  */
934 
935 static void
free_workarea_compile(regex_t * preg)936 free_workarea_compile (regex_t *preg)
937 {
938   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
939   bin_tree_storage_t *storage, *next;
940   for (storage = dfa->str_tree_storage; storage; storage = next)
941     {
942       next = storage->next;
943       re_free (storage);
944     }
945   dfa->str_tree_storage = NULL;
946   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
947   dfa->str_tree = NULL;
948   re_free (dfa->org_indices);
949   dfa->org_indices = NULL;
950 }
951 
952 /* Create initial states for all contexts.  */
953 
954 static reg_errcode_t
create_initial_state(re_dfa_t * dfa)955 create_initial_state (re_dfa_t *dfa)
956 {
957   Idx first, i;
958   reg_errcode_t err;
959   re_node_set init_nodes;
960 
961   /* Initial states have the epsilon closure of the node which is
962      the first node of the regular expression.  */
963   first = dfa->str_tree->first->node_idx;
964   dfa->init_node = first;
965   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
966   if (BE (err != REG_NOERROR, 0))
967     return err;
968 
969   /* The back-references which are in initial states can epsilon transit,
970      since in this case all of the subexpressions can be null.
971      Then we add epsilon closures of the nodes which are the next nodes of
972      the back-references.  */
973   if (dfa->nbackref > 0)
974     for (i = 0; i < init_nodes.nelem; ++i)
975       {
976 	Idx node_idx = init_nodes.elems[i];
977 	re_token_type_t type = dfa->nodes[node_idx].type;
978 
979 	Idx clexp_idx;
980 	if (type != OP_BACK_REF)
981 	  continue;
982 	for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
983 	  {
984 	    re_token_t *clexp_node;
985 	    clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
986 	    if (clexp_node->type == OP_CLOSE_SUBEXP
987 		&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
988 	      break;
989 	  }
990 	if (clexp_idx == init_nodes.nelem)
991 	  continue;
992 
993 	if (type == OP_BACK_REF)
994 	  {
995 	    Idx dest_idx = dfa->edests[node_idx].elems[0];
996 	    if (!re_node_set_contains (&init_nodes, dest_idx))
997 	      {
998 		re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
999 		i = 0;
1000 	      }
1001 	  }
1002       }
1003 
1004   /* It must be the first time to invoke acquire_state.  */
1005   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1006   /* We don't check ERR here, since the initial state must not be NULL.  */
1007   if (BE (dfa->init_state == NULL, 0))
1008     return err;
1009   if (dfa->init_state->has_constraint)
1010     {
1011       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1012 						       CONTEXT_WORD);
1013       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1014 						     CONTEXT_NEWLINE);
1015       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1016 							 &init_nodes,
1017 							 CONTEXT_NEWLINE
1018 							 | CONTEXT_BEGBUF);
1019       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1020 	      || dfa->init_state_begbuf == NULL, 0))
1021 	return err;
1022     }
1023   else
1024     dfa->init_state_word = dfa->init_state_nl
1025       = dfa->init_state_begbuf = dfa->init_state;
1026 
1027   re_node_set_free (&init_nodes);
1028   return REG_NOERROR;
1029 }
1030 
1031 #ifdef RE_ENABLE_I18N
1032 /* If it is possible to do searching in single byte encoding instead of UTF-8
1033    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1034    DFA nodes where needed.  */
1035 
1036 static void
optimize_utf8(re_dfa_t * dfa)1037 optimize_utf8 (re_dfa_t *dfa)
1038 {
1039   Idx node;
1040   int i;
1041   bool mb_chars = false;
1042   bool has_period = false;
1043 
1044   for (node = 0; node < dfa->nodes_len; ++node)
1045     switch (dfa->nodes[node].type)
1046       {
1047       case CHARACTER:
1048 	if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1049 	  mb_chars = true;
1050 	break;
1051       case ANCHOR:
1052 	switch (dfa->nodes[node].opr.idx)
1053 	  {
1054 	  case LINE_FIRST:
1055 	  case LINE_LAST:
1056 	  case BUF_FIRST:
1057 	  case BUF_LAST:
1058 	    break;
1059 	  default:
1060 	    /* Word anchors etc. cannot be handled.  */
1061 	    return;
1062 	  }
1063 	break;
1064       case OP_PERIOD:
1065         has_period = true;
1066         break;
1067       case OP_BACK_REF:
1068       case OP_ALT:
1069       case END_OF_RE:
1070       case OP_DUP_ASTERISK:
1071       case OP_OPEN_SUBEXP:
1072       case OP_CLOSE_SUBEXP:
1073 	break;
1074       case COMPLEX_BRACKET:
1075 	return;
1076       case SIMPLE_BRACKET:
1077 	/* Just double check.  */
1078 	{
1079 	  int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1080 			? 0
1081 			: BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1082 	  for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1083 	    {
1084 	      if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1085 		return;
1086 	      rshift = 0;
1087 	    }
1088 	}
1089 	break;
1090       default:
1091 	abort ();
1092       }
1093 
1094   if (mb_chars || has_period)
1095     for (node = 0; node < dfa->nodes_len; ++node)
1096       {
1097 	if (dfa->nodes[node].type == CHARACTER
1098 	    && dfa->nodes[node].opr.c >= ASCII_CHARS)
1099 	  dfa->nodes[node].mb_partial = 0;
1100 	else if (dfa->nodes[node].type == OP_PERIOD)
1101 	  dfa->nodes[node].type = OP_UTF8_PERIOD;
1102       }
1103 
1104   /* The search can be in single byte locale.  */
1105   dfa->mb_cur_max = 1;
1106   dfa->is_utf8 = 0;
1107   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1108 }
1109 #endif
1110 
1111 /* Analyze the structure tree, and calculate "first", "next", "edest",
1112    "eclosure", and "inveclosure".  */
1113 
1114 static reg_errcode_t
analyze(regex_t * preg)1115 analyze (regex_t *preg)
1116 {
1117   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1118   reg_errcode_t ret;
1119 
1120   /* Allocate arrays.  */
1121   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1122   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1123   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1124   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1125   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1126 	  || dfa->eclosures == NULL, 0))
1127     return REG_ESPACE;
1128 
1129   dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1130   if (dfa->subexp_map != NULL)
1131     {
1132       Idx i;
1133       for (i = 0; i < preg->re_nsub; i++)
1134 	dfa->subexp_map[i] = i;
1135       preorder (dfa->str_tree, optimize_subexps, dfa);
1136       for (i = 0; i < preg->re_nsub; i++)
1137 	if (dfa->subexp_map[i] != i)
1138 	  break;
1139       if (i == preg->re_nsub)
1140 	{
1141 	  free (dfa->subexp_map);
1142 	  dfa->subexp_map = NULL;
1143 	}
1144     }
1145 
1146   ret = postorder (dfa->str_tree, lower_subexps, preg);
1147   if (BE (ret != REG_NOERROR, 0))
1148     return ret;
1149   ret = postorder (dfa->str_tree, calc_first, dfa);
1150   if (BE (ret != REG_NOERROR, 0))
1151     return ret;
1152   preorder (dfa->str_tree, calc_next, dfa);
1153   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1154   if (BE (ret != REG_NOERROR, 0))
1155     return ret;
1156   ret = calc_eclosure (dfa);
1157   if (BE (ret != REG_NOERROR, 0))
1158     return ret;
1159 
1160   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1161      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1162   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1163       || dfa->nbackref)
1164     {
1165       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1166       if (BE (dfa->inveclosures == NULL, 0))
1167         return REG_ESPACE;
1168       ret = calc_inveclosure (dfa);
1169     }
1170 
1171   return ret;
1172 }
1173 
1174 /* Our parse trees are very unbalanced, so we cannot use a stack to
1175    implement parse tree visits.  Instead, we use parent pointers and
1176    some hairy code in these two functions.  */
1177 static reg_errcode_t
postorder(bin_tree_t * root,reg_errcode_t (fn (void *,bin_tree_t *)),void * extra)1178 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1179 	   void *extra)
1180 {
1181   bin_tree_t *node, *prev;
1182 
1183   for (node = root; ; )
1184     {
1185       /* Descend down the tree, preferably to the left (or to the right
1186 	 if that's the only child).  */
1187       while (node->left || node->right)
1188 	if (node->left)
1189           node = node->left;
1190         else
1191           node = node->right;
1192 
1193       do
1194 	{
1195 	  reg_errcode_t err = fn (extra, node);
1196 	  if (BE (err != REG_NOERROR, 0))
1197 	    return err;
1198           if (node->parent == NULL)
1199 	    return REG_NOERROR;
1200 	  prev = node;
1201 	  node = node->parent;
1202 	}
1203       /* Go up while we have a node that is reached from the right.  */
1204       while (node->right == prev || node->right == NULL);
1205       node = node->right;
1206     }
1207 }
1208 
1209 static reg_errcode_t
preorder(bin_tree_t * root,reg_errcode_t (fn (void *,bin_tree_t *)),void * extra)1210 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1211 	  void *extra)
1212 {
1213   bin_tree_t *node;
1214 
1215   for (node = root; ; )
1216     {
1217       reg_errcode_t err = fn (extra, node);
1218       if (BE (err != REG_NOERROR, 0))
1219 	return err;
1220 
1221       /* Go to the left node, or up and to the right.  */
1222       if (node->left)
1223 	node = node->left;
1224       else
1225 	{
1226 	  bin_tree_t *prev = NULL;
1227 	  while (node->right == prev || node->right == NULL)
1228 	    {
1229 	      prev = node;
1230 	      node = node->parent;
1231 	      if (!node)
1232 	        return REG_NOERROR;
1233 	    }
1234 	  node = node->right;
1235 	}
1236     }
1237 }
1238 
1239 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1240    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1241    backreferences as well.  Requires a preorder visit.  */
1242 static reg_errcode_t
optimize_subexps(void * extra,bin_tree_t * node)1243 optimize_subexps (void *extra, bin_tree_t *node)
1244 {
1245   re_dfa_t *dfa = (re_dfa_t *) extra;
1246 
1247   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1248     {
1249       int idx = node->token.opr.idx;
1250       node->token.opr.idx = dfa->subexp_map[idx];
1251       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1252     }
1253 
1254   else if (node->token.type == SUBEXP
1255            && node->left && node->left->token.type == SUBEXP)
1256     {
1257       Idx other_idx = node->left->token.opr.idx;
1258 
1259       node->left = node->left->left;
1260       if (node->left)
1261         node->left->parent = node;
1262 
1263       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1264       if (other_idx < BITSET_WORD_BITS)
1265 	dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1266     }
1267 
1268   return REG_NOERROR;
1269 }
1270 
1271 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1272    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1273 static reg_errcode_t
lower_subexps(void * extra,bin_tree_t * node)1274 lower_subexps (void *extra, bin_tree_t *node)
1275 {
1276   regex_t *preg = (regex_t *) extra;
1277   reg_errcode_t err = REG_NOERROR;
1278 
1279   if (node->left && node->left->token.type == SUBEXP)
1280     {
1281       node->left = lower_subexp (&err, preg, node->left);
1282       if (node->left)
1283 	node->left->parent = node;
1284     }
1285   if (node->right && node->right->token.type == SUBEXP)
1286     {
1287       node->right = lower_subexp (&err, preg, node->right);
1288       if (node->right)
1289 	node->right->parent = node;
1290     }
1291 
1292   return err;
1293 }
1294 
1295 static bin_tree_t *
lower_subexp(reg_errcode_t * err,regex_t * preg,bin_tree_t * node)1296 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1297 {
1298   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1299   bin_tree_t *body = node->left;
1300   bin_tree_t *op, *cls, *tree1, *tree;
1301 
1302   if (preg->no_sub
1303       /* We do not optimize empty subexpressions, because otherwise we may
1304 	 have bad CONCAT nodes with NULL children.  This is obviously not
1305 	 very common, so we do not lose much.  An example that triggers
1306 	 this case is the sed "script" /\(\)/x.  */
1307       && node->left != NULL
1308       && (node->token.opr.idx >= BITSET_WORD_BITS
1309 	  || !(dfa->used_bkref_map
1310 	       & ((bitset_word_t) 1 << node->token.opr.idx))))
1311     return node->left;
1312 
1313   /* Convert the SUBEXP node to the concatenation of an
1314      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1315   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1316   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1317   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1318   tree = create_tree (dfa, op, tree1, CONCAT);
1319   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1320     {
1321       *err = REG_ESPACE;
1322       return NULL;
1323     }
1324 
1325   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1326   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1327   return tree;
1328 }
1329 
1330 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1331    nodes.  Requires a postorder visit.  */
1332 static reg_errcode_t
calc_first(void * extra,bin_tree_t * node)1333 calc_first (void *extra, bin_tree_t *node)
1334 {
1335   re_dfa_t *dfa = (re_dfa_t *) extra;
1336   if (node->token.type == CONCAT)
1337     {
1338       node->first = node->left->first;
1339       node->node_idx = node->left->node_idx;
1340     }
1341   else
1342     {
1343       node->first = node;
1344       node->node_idx = re_dfa_add_node (dfa, node->token);
1345       if (BE (node->node_idx == REG_MISSING, 0))
1346         return REG_ESPACE;
1347     }
1348   return REG_NOERROR;
1349 }
1350 
1351 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1352 static reg_errcode_t
calc_next(void * extra,bin_tree_t * node)1353 calc_next (void *extra, bin_tree_t *node)
1354 {
1355   switch (node->token.type)
1356     {
1357     case OP_DUP_ASTERISK:
1358       node->left->next = node;
1359       break;
1360     case CONCAT:
1361       node->left->next = node->right->first;
1362       node->right->next = node->next;
1363       break;
1364     default:
1365       if (node->left)
1366 	node->left->next = node->next;
1367       if (node->right)
1368         node->right->next = node->next;
1369       break;
1370     }
1371   return REG_NOERROR;
1372 }
1373 
1374 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1375 static reg_errcode_t
link_nfa_nodes(void * extra,bin_tree_t * node)1376 link_nfa_nodes (void *extra, bin_tree_t *node)
1377 {
1378   re_dfa_t *dfa = (re_dfa_t *) extra;
1379   Idx idx = node->node_idx;
1380   reg_errcode_t err = REG_NOERROR;
1381 
1382   switch (node->token.type)
1383     {
1384     case CONCAT:
1385       break;
1386 
1387     case END_OF_RE:
1388       assert (node->next == NULL);
1389       break;
1390 
1391     case OP_DUP_ASTERISK:
1392     case OP_ALT:
1393       {
1394 	Idx left, right;
1395 	dfa->has_plural_match = 1;
1396 	if (node->left != NULL)
1397 	  left = node->left->first->node_idx;
1398 	else
1399 	  left = node->next->node_idx;
1400 	if (node->right != NULL)
1401 	  right = node->right->first->node_idx;
1402 	else
1403 	  right = node->next->node_idx;
1404 	assert (REG_VALID_INDEX (left));
1405 	assert (REG_VALID_INDEX (right));
1406 	err = re_node_set_init_2 (dfa->edests + idx, left, right);
1407       }
1408       break;
1409 
1410     case ANCHOR:
1411     case OP_OPEN_SUBEXP:
1412     case OP_CLOSE_SUBEXP:
1413       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1414       break;
1415 
1416     case OP_BACK_REF:
1417       dfa->nexts[idx] = node->next->node_idx;
1418       if (node->token.type == OP_BACK_REF)
1419 	re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1420       break;
1421 
1422     default:
1423       assert (!IS_EPSILON_NODE (node->token.type));
1424       dfa->nexts[idx] = node->next->node_idx;
1425       break;
1426     }
1427 
1428   return err;
1429 }
1430 
1431 /* Duplicate the epsilon closure of the node ROOT_NODE.
1432    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1433    to their own constraint.  */
1434 
1435 static reg_errcode_t
1436 internal_function
duplicate_node_closure(re_dfa_t * dfa,Idx top_org_node,Idx top_clone_node,Idx root_node,unsigned int init_constraint)1437 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1438 			Idx root_node, unsigned int init_constraint)
1439 {
1440   Idx org_node, clone_node;
1441   bool ok;
1442   unsigned int constraint = init_constraint;
1443   for (org_node = top_org_node, clone_node = top_clone_node;;)
1444     {
1445       Idx org_dest, clone_dest;
1446       if (dfa->nodes[org_node].type == OP_BACK_REF)
1447 	{
1448 	  /* If the back reference epsilon-transit, its destination must
1449 	     also have the constraint.  Then duplicate the epsilon closure
1450 	     of the destination of the back reference, and store it in
1451 	     edests of the back reference.  */
1452 	  org_dest = dfa->nexts[org_node];
1453 	  re_node_set_empty (dfa->edests + clone_node);
1454 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1455 	  if (BE (clone_dest == REG_MISSING, 0))
1456 	    return REG_ESPACE;
1457 	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1458 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1459 	  if (BE (! ok, 0))
1460 	    return REG_ESPACE;
1461 	}
1462       else if (dfa->edests[org_node].nelem == 0)
1463 	{
1464 	  /* In case of the node can't epsilon-transit, don't duplicate the
1465 	     destination and store the original destination as the
1466 	     destination of the node.  */
1467 	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1468 	  break;
1469 	}
1470       else if (dfa->edests[org_node].nelem == 1)
1471 	{
1472 	  /* In case of the node can epsilon-transit, and it has only one
1473 	     destination.  */
1474 	  org_dest = dfa->edests[org_node].elems[0];
1475 	  re_node_set_empty (dfa->edests + clone_node);
1476 	  if (dfa->nodes[org_node].type == ANCHOR)
1477 	    {
1478 	      /* In case of the node has another constraint, append it.  */
1479 	      if (org_node == root_node && clone_node != org_node)
1480 		{
1481 		  /* ...but if the node is root_node itself, it means the
1482 		     epsilon closure have a loop, then tie it to the
1483 		     destination of the root_node.  */
1484 		  ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1485 		  if (BE (! ok, 0))
1486 		    return REG_ESPACE;
1487 		  break;
1488 		}
1489 	      constraint |= dfa->nodes[org_node].opr.ctx_type;
1490 	    }
1491 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1492 	  if (BE (clone_dest == REG_MISSING, 0))
1493 	    return REG_ESPACE;
1494 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1495 	  if (BE (! ok, 0))
1496 	    return REG_ESPACE;
1497 	}
1498       else /* dfa->edests[org_node].nelem == 2 */
1499 	{
1500 	  /* In case of the node can epsilon-transit, and it has two
1501 	     destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1502 	  org_dest = dfa->edests[org_node].elems[0];
1503 	  re_node_set_empty (dfa->edests + clone_node);
1504 	  /* Search for a duplicated node which satisfies the constraint.  */
1505 	  clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1506 	  if (clone_dest == REG_MISSING)
1507 	    {
1508 	      /* There are no such a duplicated node, create a new one.  */
1509 	      reg_errcode_t err;
1510 	      clone_dest = duplicate_node (dfa, org_dest, constraint);
1511 	      if (BE (clone_dest == REG_MISSING, 0))
1512 		return REG_ESPACE;
1513 	      ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1514 	      if (BE (! ok, 0))
1515 		return REG_ESPACE;
1516 	      err = duplicate_node_closure (dfa, org_dest, clone_dest,
1517 					    root_node, constraint);
1518 	      if (BE (err != REG_NOERROR, 0))
1519 		return err;
1520 	    }
1521 	  else
1522 	    {
1523 	      /* There are a duplicated node which satisfy the constraint,
1524 		 use it to avoid infinite loop.  */
1525 	      ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1526 	      if (BE (! ok, 0))
1527 		return REG_ESPACE;
1528 	    }
1529 
1530 	  org_dest = dfa->edests[org_node].elems[1];
1531 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1532 	  if (BE (clone_dest == REG_MISSING, 0))
1533 	    return REG_ESPACE;
1534 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1535 	  if (BE (! ok, 0))
1536 	    return REG_ESPACE;
1537 	}
1538       org_node = org_dest;
1539       clone_node = clone_dest;
1540     }
1541   return REG_NOERROR;
1542 }
1543 
1544 /* Search for a node which is duplicated from the node ORG_NODE, and
1545    satisfies the constraint CONSTRAINT.  */
1546 
1547 static Idx
search_duplicated_node(const re_dfa_t * dfa,Idx org_node,unsigned int constraint)1548 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1549 			unsigned int constraint)
1550 {
1551   Idx idx;
1552   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1553     {
1554       if (org_node == dfa->org_indices[idx]
1555 	  && constraint == dfa->nodes[idx].constraint)
1556 	return idx; /* Found.  */
1557     }
1558   return REG_MISSING; /* Not found.  */
1559 }
1560 
1561 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1562    Return the index of the new node, or REG_MISSING if insufficient storage is
1563    available.  */
1564 
1565 static Idx
duplicate_node(re_dfa_t * dfa,Idx org_idx,unsigned int constraint)1566 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1567 {
1568   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1569   if (BE (dup_idx != REG_MISSING, 1))
1570     {
1571       dfa->nodes[dup_idx].constraint = constraint;
1572       if (dfa->nodes[org_idx].type == ANCHOR)
1573 	dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1574       dfa->nodes[dup_idx].duplicated = 1;
1575 
1576       /* Store the index of the original node.  */
1577       dfa->org_indices[dup_idx] = org_idx;
1578     }
1579   return dup_idx;
1580 }
1581 
1582 static reg_errcode_t
calc_inveclosure(re_dfa_t * dfa)1583 calc_inveclosure (re_dfa_t *dfa)
1584 {
1585   Idx src, idx;
1586   bool ok;
1587   for (idx = 0; idx < dfa->nodes_len; ++idx)
1588     re_node_set_init_empty (dfa->inveclosures + idx);
1589 
1590   for (src = 0; src < dfa->nodes_len; ++src)
1591     {
1592       Idx *elems = dfa->eclosures[src].elems;
1593       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1594 	{
1595 	  ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1596 	  if (BE (! ok, 0))
1597 	    return REG_ESPACE;
1598 	}
1599     }
1600 
1601   return REG_NOERROR;
1602 }
1603 
1604 /* Calculate "eclosure" for all the node in DFA.  */
1605 
1606 static reg_errcode_t
calc_eclosure(re_dfa_t * dfa)1607 calc_eclosure (re_dfa_t *dfa)
1608 {
1609   Idx node_idx;
1610   bool incomplete;
1611 #ifdef DEBUG
1612   assert (dfa->nodes_len > 0);
1613 #endif
1614   incomplete = false;
1615   /* For each nodes, calculate epsilon closure.  */
1616   for (node_idx = 0; ; ++node_idx)
1617     {
1618       reg_errcode_t err;
1619       re_node_set eclosure_elem;
1620       if (node_idx == dfa->nodes_len)
1621 	{
1622 	  if (!incomplete)
1623 	    break;
1624 	  incomplete = false;
1625 	  node_idx = 0;
1626 	}
1627 
1628 #ifdef DEBUG
1629       assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1630 #endif
1631 
1632       /* If we have already calculated, skip it.  */
1633       if (dfa->eclosures[node_idx].nelem != 0)
1634 	continue;
1635       /* Calculate epsilon closure of `node_idx'.  */
1636       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1637       if (BE (err != REG_NOERROR, 0))
1638 	return err;
1639 
1640       if (dfa->eclosures[node_idx].nelem == 0)
1641 	{
1642 	  incomplete = true;
1643 	  re_node_set_free (&eclosure_elem);
1644 	}
1645     }
1646   return REG_NOERROR;
1647 }
1648 
1649 /* Calculate epsilon closure of NODE.  */
1650 
1651 static reg_errcode_t
calc_eclosure_iter(re_node_set * new_set,re_dfa_t * dfa,Idx node,bool root)1652 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1653 {
1654   reg_errcode_t err;
1655   unsigned int constraint;
1656   Idx i;
1657   bool incomplete;
1658   bool ok;
1659   re_node_set eclosure;
1660   incomplete = false;
1661   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1662   if (BE (err != REG_NOERROR, 0))
1663     return err;
1664 
1665   /* This indicates that we are calculating this node now.
1666      We reference this value to avoid infinite loop.  */
1667   dfa->eclosures[node].nelem = REG_MISSING;
1668 
1669   constraint = ((dfa->nodes[node].type == ANCHOR)
1670 		? dfa->nodes[node].opr.ctx_type : 0);
1671   /* If the current node has constraints, duplicate all nodes.
1672      Since they must inherit the constraints.  */
1673   if (constraint
1674       && dfa->edests[node].nelem
1675       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1676     {
1677       err = duplicate_node_closure (dfa, node, node, node, constraint);
1678       if (BE (err != REG_NOERROR, 0))
1679 	return err;
1680     }
1681 
1682   /* Expand each epsilon destination nodes.  */
1683   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1684     for (i = 0; i < dfa->edests[node].nelem; ++i)
1685       {
1686 	re_node_set eclosure_elem;
1687 	Idx edest = dfa->edests[node].elems[i];
1688 	/* If calculating the epsilon closure of `edest' is in progress,
1689 	   return intermediate result.  */
1690 	if (dfa->eclosures[edest].nelem == REG_MISSING)
1691 	  {
1692 	    incomplete = true;
1693 	    continue;
1694 	  }
1695 	/* If we haven't calculated the epsilon closure of `edest' yet,
1696 	   calculate now. Otherwise use calculated epsilon closure.  */
1697 	if (dfa->eclosures[edest].nelem == 0)
1698 	  {
1699 	    err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1700 	    if (BE (err != REG_NOERROR, 0))
1701 	      return err;
1702 	  }
1703 	else
1704 	  eclosure_elem = dfa->eclosures[edest];
1705 	/* Merge the epsilon closure of `edest'.  */
1706 	re_node_set_merge (&eclosure, &eclosure_elem);
1707 	/* If the epsilon closure of `edest' is incomplete,
1708 	   the epsilon closure of this node is also incomplete.  */
1709 	if (dfa->eclosures[edest].nelem == 0)
1710 	  {
1711 	    incomplete = true;
1712 	    re_node_set_free (&eclosure_elem);
1713 	  }
1714       }
1715 
1716   /* Epsilon closures include itself.  */
1717   ok = re_node_set_insert (&eclosure, node);
1718   if (BE (! ok, 0))
1719     return REG_ESPACE;
1720   if (incomplete && !root)
1721     dfa->eclosures[node].nelem = 0;
1722   else
1723     dfa->eclosures[node] = eclosure;
1724   *new_set = eclosure;
1725   return REG_NOERROR;
1726 }
1727 
1728 /* Functions for token which are used in the parser.  */
1729 
1730 /* Fetch a token from INPUT.
1731    We must not use this function inside bracket expressions.  */
1732 
1733 static void
1734 internal_function
fetch_token(re_token_t * result,re_string_t * input,reg_syntax_t syntax)1735 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1736 {
1737   re_string_skip_bytes (input, peek_token (result, input, syntax));
1738 }
1739 
1740 /* Peek a token from INPUT, and return the length of the token.
1741    We must not use this function inside bracket expressions.  */
1742 
1743 static int
1744 internal_function
peek_token(re_token_t * token,re_string_t * input,reg_syntax_t syntax)1745 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1746 {
1747   unsigned char c;
1748 
1749   if (re_string_eoi (input))
1750     {
1751       token->type = END_OF_RE;
1752       return 0;
1753     }
1754 
1755   c = re_string_peek_byte (input, 0);
1756   token->opr.c = c;
1757 
1758   token->word_char = 0;
1759 #ifdef RE_ENABLE_I18N
1760   token->mb_partial = 0;
1761   if (input->mb_cur_max > 1 &&
1762       !re_string_first_byte (input, re_string_cur_idx (input)))
1763     {
1764       token->type = CHARACTER;
1765       token->mb_partial = 1;
1766       return 1;
1767     }
1768 #endif
1769   if (c == '\\')
1770     {
1771       unsigned char c2;
1772       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1773 	{
1774 	  token->type = BACK_SLASH;
1775 	  return 1;
1776 	}
1777 
1778       c2 = re_string_peek_byte_case (input, 1);
1779       token->opr.c = c2;
1780       token->type = CHARACTER;
1781 #ifdef RE_ENABLE_I18N
1782       if (input->mb_cur_max > 1)
1783 	{
1784 	  wint_t wc = re_string_wchar_at (input,
1785 					  re_string_cur_idx (input) + 1);
1786 	  token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1787 	}
1788       else
1789 #endif
1790 	token->word_char = IS_WORD_CHAR (c2) != 0;
1791 
1792       switch (c2)
1793 	{
1794 	case '|':
1795 	  if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1796 	    token->type = OP_ALT;
1797 	  break;
1798 	case '1': case '2': case '3': case '4': case '5':
1799 	case '6': case '7': case '8': case '9':
1800 	  if (!(syntax & RE_NO_BK_REFS))
1801 	    {
1802 	      token->type = OP_BACK_REF;
1803 	      token->opr.idx = c2 - '1';
1804 	    }
1805 	  break;
1806 	case '<':
1807 	  if (!(syntax & RE_NO_GNU_OPS))
1808 	    {
1809 	      token->type = ANCHOR;
1810 	      token->opr.ctx_type = WORD_FIRST;
1811 	    }
1812 	  break;
1813 	case '>':
1814 	  if (!(syntax & RE_NO_GNU_OPS))
1815 	    {
1816 	      token->type = ANCHOR;
1817 	      token->opr.ctx_type = WORD_LAST;
1818 	    }
1819 	  break;
1820 	case 'b':
1821 	  if (!(syntax & RE_NO_GNU_OPS))
1822 	    {
1823 	      token->type = ANCHOR;
1824 	      token->opr.ctx_type = WORD_DELIM;
1825 	    }
1826 	  break;
1827 	case 'B':
1828 	  if (!(syntax & RE_NO_GNU_OPS))
1829 	    {
1830 	      token->type = ANCHOR;
1831 	      token->opr.ctx_type = NOT_WORD_DELIM;
1832 	    }
1833 	  break;
1834 	case 'w':
1835 	  if (!(syntax & RE_NO_GNU_OPS))
1836 	    token->type = OP_WORD;
1837 	  break;
1838 	case 'W':
1839 	  if (!(syntax & RE_NO_GNU_OPS))
1840 	    token->type = OP_NOTWORD;
1841 	  break;
1842 	case 's':
1843 	  if (!(syntax & RE_NO_GNU_OPS))
1844 	    token->type = OP_SPACE;
1845 	  break;
1846 	case 'S':
1847 	  if (!(syntax & RE_NO_GNU_OPS))
1848 	    token->type = OP_NOTSPACE;
1849 	  break;
1850 	case '`':
1851 	  if (!(syntax & RE_NO_GNU_OPS))
1852 	    {
1853 	      token->type = ANCHOR;
1854 	      token->opr.ctx_type = BUF_FIRST;
1855 	    }
1856 	  break;
1857 	case '\'':
1858 	  if (!(syntax & RE_NO_GNU_OPS))
1859 	    {
1860 	      token->type = ANCHOR;
1861 	      token->opr.ctx_type = BUF_LAST;
1862 	    }
1863 	  break;
1864 	case '(':
1865 	  if (!(syntax & RE_NO_BK_PARENS))
1866 	    token->type = OP_OPEN_SUBEXP;
1867 	  break;
1868 	case ')':
1869 	  if (!(syntax & RE_NO_BK_PARENS))
1870 	    token->type = OP_CLOSE_SUBEXP;
1871 	  break;
1872 	case '+':
1873 	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1874 	    token->type = OP_DUP_PLUS;
1875 	  break;
1876 	case '?':
1877 	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1878 	    token->type = OP_DUP_QUESTION;
1879 	  break;
1880 	case '{':
1881 	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1882 	    token->type = OP_OPEN_DUP_NUM;
1883 	  break;
1884 	case '}':
1885 	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1886 	    token->type = OP_CLOSE_DUP_NUM;
1887 	  break;
1888 	default:
1889 	  break;
1890 	}
1891       return 2;
1892     }
1893 
1894   token->type = CHARACTER;
1895 #ifdef RE_ENABLE_I18N
1896   if (input->mb_cur_max > 1)
1897     {
1898       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1899       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1900     }
1901   else
1902 #endif
1903     token->word_char = IS_WORD_CHAR (token->opr.c);
1904 
1905   switch (c)
1906     {
1907     case '\n':
1908       if (syntax & RE_NEWLINE_ALT)
1909 	token->type = OP_ALT;
1910       break;
1911     case '|':
1912       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1913 	token->type = OP_ALT;
1914       break;
1915     case '*':
1916       token->type = OP_DUP_ASTERISK;
1917       break;
1918     case '+':
1919       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1920 	token->type = OP_DUP_PLUS;
1921       break;
1922     case '?':
1923       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1924 	token->type = OP_DUP_QUESTION;
1925       break;
1926     case '{':
1927       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1928 	token->type = OP_OPEN_DUP_NUM;
1929       break;
1930     case '}':
1931       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1932 	token->type = OP_CLOSE_DUP_NUM;
1933       break;
1934     case '(':
1935       if (syntax & RE_NO_BK_PARENS)
1936 	token->type = OP_OPEN_SUBEXP;
1937       break;
1938     case ')':
1939       if (syntax & RE_NO_BK_PARENS)
1940 	token->type = OP_CLOSE_SUBEXP;
1941       break;
1942     case '[':
1943       token->type = OP_OPEN_BRACKET;
1944       break;
1945     case '.':
1946       token->type = OP_PERIOD;
1947       break;
1948     case '^':
1949       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1950 	  re_string_cur_idx (input) != 0)
1951 	{
1952 	  char prev = re_string_peek_byte (input, -1);
1953 	  if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1954 	    break;
1955 	}
1956       token->type = ANCHOR;
1957       token->opr.ctx_type = LINE_FIRST;
1958       break;
1959     case '$':
1960       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1961 	  re_string_cur_idx (input) + 1 != re_string_length (input))
1962 	{
1963 	  re_token_t next;
1964 	  re_string_skip_bytes (input, 1);
1965 	  peek_token (&next, input, syntax);
1966 	  re_string_skip_bytes (input, -1);
1967 	  if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1968 	    break;
1969 	}
1970       token->type = ANCHOR;
1971       token->opr.ctx_type = LINE_LAST;
1972       break;
1973     default:
1974       break;
1975     }
1976   return 1;
1977 }
1978 
1979 /* Peek a token from INPUT, and return the length of the token.
1980    We must not use this function out of bracket expressions.  */
1981 
1982 static int
1983 internal_function
peek_token_bracket(re_token_t * token,re_string_t * input,reg_syntax_t syntax)1984 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1985 {
1986   unsigned char c;
1987   if (re_string_eoi (input))
1988     {
1989       token->type = END_OF_RE;
1990       return 0;
1991     }
1992   c = re_string_peek_byte (input, 0);
1993   token->opr.c = c;
1994 
1995 #ifdef RE_ENABLE_I18N
1996   if (input->mb_cur_max > 1 &&
1997       !re_string_first_byte (input, re_string_cur_idx (input)))
1998     {
1999       token->type = CHARACTER;
2000       return 1;
2001     }
2002 #endif /* RE_ENABLE_I18N */
2003 
2004   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2005       && re_string_cur_idx (input) + 1 < re_string_length (input))
2006     {
2007       /* In this case, '\' escape a character.  */
2008       unsigned char c2;
2009       re_string_skip_bytes (input, 1);
2010       c2 = re_string_peek_byte (input, 0);
2011       token->opr.c = c2;
2012       token->type = CHARACTER;
2013       return 1;
2014     }
2015   if (c == '[') /* '[' is a special char in a bracket exps.  */
2016     {
2017       unsigned char c2;
2018       int token_len;
2019       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2020 	c2 = re_string_peek_byte (input, 1);
2021       else
2022 	c2 = 0;
2023       token->opr.c = c2;
2024       token_len = 2;
2025       switch (c2)
2026 	{
2027 	case '.':
2028 	  token->type = OP_OPEN_COLL_ELEM;
2029 	  break;
2030 	case '=':
2031 	  token->type = OP_OPEN_EQUIV_CLASS;
2032 	  break;
2033 	case ':':
2034 	  if (syntax & RE_CHAR_CLASSES)
2035 	    {
2036 	      token->type = OP_OPEN_CHAR_CLASS;
2037 	      break;
2038 	    }
2039 	  /* else fall through.  */
2040 	default:
2041 	  token->type = CHARACTER;
2042 	  token->opr.c = c;
2043 	  token_len = 1;
2044 	  break;
2045 	}
2046       return token_len;
2047     }
2048   switch (c)
2049     {
2050     case '-':
2051       token->type = OP_CHARSET_RANGE;
2052       break;
2053     case ']':
2054       token->type = OP_CLOSE_BRACKET;
2055       break;
2056     case '^':
2057       token->type = OP_NON_MATCH_LIST;
2058       break;
2059     default:
2060       token->type = CHARACTER;
2061     }
2062   return 1;
2063 }
2064 
2065 /* Functions for parser.  */
2066 
2067 /* Entry point of the parser.
2068    Parse the regular expression REGEXP and return the structure tree.
2069    If an error is occured, ERR is set by error code, and return NULL.
2070    This function build the following tree, from regular expression <reg_exp>:
2071 	   CAT
2072 	   / \
2073 	  /   \
2074    <reg_exp>  EOR
2075 
2076    CAT means concatenation.
2077    EOR means end of regular expression.  */
2078 
2079 static bin_tree_t *
parse(re_string_t * regexp,regex_t * preg,reg_syntax_t syntax,reg_errcode_t * err)2080 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2081        reg_errcode_t *err)
2082 {
2083   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2084   bin_tree_t *tree, *eor, *root;
2085   re_token_t current_token;
2086   dfa->syntax = syntax;
2087   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2088   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2089   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2090     return NULL;
2091   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2092   if (tree != NULL)
2093     root = create_tree (dfa, tree, eor, CONCAT);
2094   else
2095     root = eor;
2096   if (BE (eor == NULL || root == NULL, 0))
2097     {
2098       *err = REG_ESPACE;
2099       return NULL;
2100     }
2101   return root;
2102 }
2103 
2104 /* This function build the following tree, from regular expression
2105    <branch1>|<branch2>:
2106 	   ALT
2107 	   / \
2108 	  /   \
2109    <branch1> <branch2>
2110 
2111    ALT means alternative, which represents the operator `|'.  */
2112 
2113 static bin_tree_t *
parse_reg_exp(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2114 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2115 	       reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2116 {
2117   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2118   bin_tree_t *tree, *branch = NULL;
2119   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2120   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2121     return NULL;
2122 
2123   while (token->type == OP_ALT)
2124     {
2125       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2126       if (token->type != OP_ALT && token->type != END_OF_RE
2127 	  && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2128 	{
2129 	  branch = parse_branch (regexp, preg, token, syntax, nest, err);
2130 	  if (BE (*err != REG_NOERROR && branch == NULL, 0))
2131 	    return NULL;
2132 	}
2133       else
2134 	branch = NULL;
2135       tree = create_tree (dfa, tree, branch, OP_ALT);
2136       if (BE (tree == NULL, 0))
2137 	{
2138 	  *err = REG_ESPACE;
2139 	  return NULL;
2140 	}
2141     }
2142   return tree;
2143 }
2144 
2145 /* This function build the following tree, from regular expression
2146    <exp1><exp2>:
2147 	CAT
2148 	/ \
2149        /   \
2150    <exp1> <exp2>
2151 
2152    CAT means concatenation.  */
2153 
2154 static bin_tree_t *
parse_branch(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2155 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2156 	      reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2157 {
2158   bin_tree_t *tree, *expr;
2159   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2160   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2161   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2162     return NULL;
2163 
2164   while (token->type != OP_ALT && token->type != END_OF_RE
2165 	 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2166     {
2167       expr = parse_expression (regexp, preg, token, syntax, nest, err);
2168       if (BE (*err != REG_NOERROR && expr == NULL, 0))
2169 	{
2170 	  return NULL;
2171 	}
2172       if (tree != NULL && expr != NULL)
2173 	{
2174 	  tree = create_tree (dfa, tree, expr, CONCAT);
2175 	  if (tree == NULL)
2176 	    {
2177 	      *err = REG_ESPACE;
2178 	      return NULL;
2179 	    }
2180 	}
2181       else if (tree == NULL)
2182 	tree = expr;
2183       /* Otherwise expr == NULL, we don't need to create new tree.  */
2184     }
2185   return tree;
2186 }
2187 
2188 /* This function build the following tree, from regular expression a*:
2189 	 *
2190 	 |
2191 	 a
2192 */
2193 
2194 static bin_tree_t *
parse_expression(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2195 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2196 		  reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2197 {
2198   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2199   bin_tree_t *tree;
2200   switch (token->type)
2201     {
2202     case CHARACTER:
2203       tree = create_token_tree (dfa, NULL, NULL, token);
2204       if (BE (tree == NULL, 0))
2205 	{
2206 	  *err = REG_ESPACE;
2207 	  return NULL;
2208 	}
2209 #ifdef RE_ENABLE_I18N
2210       if (dfa->mb_cur_max > 1)
2211 	{
2212 	  while (!re_string_eoi (regexp)
2213 		 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2214 	    {
2215 	      bin_tree_t *mbc_remain;
2216 	      fetch_token (token, regexp, syntax);
2217 	      mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2218 	      tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2219 	      if (BE (mbc_remain == NULL || tree == NULL, 0))
2220 		{
2221 		  *err = REG_ESPACE;
2222 		  return NULL;
2223 		}
2224 	    }
2225 	}
2226 #endif
2227       break;
2228     case OP_OPEN_SUBEXP:
2229       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2230       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2231 	return NULL;
2232       break;
2233     case OP_OPEN_BRACKET:
2234       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2235       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2236 	return NULL;
2237       break;
2238     case OP_BACK_REF:
2239       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2240 	{
2241 	  *err = REG_ESUBREG;
2242 	  return NULL;
2243 	}
2244       dfa->used_bkref_map |= 1 << token->opr.idx;
2245       tree = create_token_tree (dfa, NULL, NULL, token);
2246       if (BE (tree == NULL, 0))
2247 	{
2248 	  *err = REG_ESPACE;
2249 	  return NULL;
2250 	}
2251       ++dfa->nbackref;
2252       dfa->has_mb_node = 1;
2253       break;
2254     case OP_OPEN_DUP_NUM:
2255       if (syntax & RE_CONTEXT_INVALID_DUP)
2256 	{
2257 	  *err = REG_BADRPT;
2258 	  return NULL;
2259 	}
2260       /* FALLTHROUGH */
2261     case OP_DUP_ASTERISK:
2262     case OP_DUP_PLUS:
2263     case OP_DUP_QUESTION:
2264       if (syntax & RE_CONTEXT_INVALID_OPS)
2265 	{
2266 	  *err = REG_BADRPT;
2267 	  return NULL;
2268 	}
2269       else if (syntax & RE_CONTEXT_INDEP_OPS)
2270 	{
2271 	  fetch_token (token, regexp, syntax);
2272 	  return parse_expression (regexp, preg, token, syntax, nest, err);
2273 	}
2274       /* else fall through  */
2275     case OP_CLOSE_SUBEXP:
2276       if ((token->type == OP_CLOSE_SUBEXP) &&
2277 	  !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2278 	{
2279 	  *err = REG_ERPAREN;
2280 	  return NULL;
2281 	}
2282       /* else fall through  */
2283     case OP_CLOSE_DUP_NUM:
2284       /* We treat it as a normal character.  */
2285 
2286       /* Then we can these characters as normal characters.  */
2287       token->type = CHARACTER;
2288       /* mb_partial and word_char bits should be initialized already
2289 	 by peek_token.  */
2290       tree = create_token_tree (dfa, NULL, NULL, token);
2291       if (BE (tree == NULL, 0))
2292 	{
2293 	  *err = REG_ESPACE;
2294 	  return NULL;
2295 	}
2296       break;
2297     case ANCHOR:
2298       if ((token->opr.ctx_type
2299 	   & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2300 	  && dfa->word_ops_used == 0)
2301 	init_word_char (dfa);
2302       if (token->opr.ctx_type == WORD_DELIM
2303           || token->opr.ctx_type == NOT_WORD_DELIM)
2304 	{
2305 	  bin_tree_t *tree_first, *tree_last;
2306 	  if (token->opr.ctx_type == WORD_DELIM)
2307 	    {
2308 	      token->opr.ctx_type = WORD_FIRST;
2309 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2310 	      token->opr.ctx_type = WORD_LAST;
2311             }
2312           else
2313             {
2314 	      token->opr.ctx_type = INSIDE_WORD;
2315 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2316 	      token->opr.ctx_type = INSIDE_NOTWORD;
2317             }
2318 	  tree_last = create_token_tree (dfa, NULL, NULL, token);
2319 	  tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2320 	  if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2321 	    {
2322 	      *err = REG_ESPACE;
2323 	      return NULL;
2324 	    }
2325 	}
2326       else
2327 	{
2328 	  tree = create_token_tree (dfa, NULL, NULL, token);
2329 	  if (BE (tree == NULL, 0))
2330 	    {
2331 	      *err = REG_ESPACE;
2332 	      return NULL;
2333 	    }
2334 	}
2335       /* We must return here, since ANCHORs can't be followed
2336 	 by repetition operators.
2337 	 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2338 	     it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2339       fetch_token (token, regexp, syntax);
2340       return tree;
2341     case OP_PERIOD:
2342       tree = create_token_tree (dfa, NULL, NULL, token);
2343       if (BE (tree == NULL, 0))
2344 	{
2345 	  *err = REG_ESPACE;
2346 	  return NULL;
2347 	}
2348       if (dfa->mb_cur_max > 1)
2349 	dfa->has_mb_node = 1;
2350       break;
2351     case OP_WORD:
2352     case OP_NOTWORD:
2353       tree = build_charclass_op (dfa, regexp->trans,
2354 				 (const unsigned char *) "alnum",
2355 				 (const unsigned char *) "_",
2356 				 token->type == OP_NOTWORD, err);
2357       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2358 	return NULL;
2359       break;
2360     case OP_SPACE:
2361     case OP_NOTSPACE:
2362       tree = build_charclass_op (dfa, regexp->trans,
2363 				 (const unsigned char *) "space",
2364 				 (const unsigned char *) "",
2365 				 token->type == OP_NOTSPACE, err);
2366       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2367 	return NULL;
2368       break;
2369     case OP_ALT:
2370     case END_OF_RE:
2371       return NULL;
2372     case BACK_SLASH:
2373       *err = REG_EESCAPE;
2374       return NULL;
2375     default:
2376       /* Must not happen?  */
2377 #ifdef DEBUG
2378       assert (0);
2379 #endif
2380       return NULL;
2381     }
2382   fetch_token (token, regexp, syntax);
2383 
2384   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2385 	 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2386     {
2387       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2388       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2389 	return NULL;
2390       /* In BRE consecutive duplications are not allowed.  */
2391       if ((syntax & RE_CONTEXT_INVALID_DUP)
2392 	  && (token->type == OP_DUP_ASTERISK
2393 	      || token->type == OP_OPEN_DUP_NUM))
2394 	{
2395 	  *err = REG_BADRPT;
2396 	  return NULL;
2397 	}
2398     }
2399 
2400   return tree;
2401 }
2402 
2403 /* This function build the following tree, from regular expression
2404    (<reg_exp>):
2405 	 SUBEXP
2406 	    |
2407 	<reg_exp>
2408 */
2409 
2410 static bin_tree_t *
parse_sub_exp(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2411 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2412 	       reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2413 {
2414   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2415   bin_tree_t *tree;
2416   size_t cur_nsub;
2417   cur_nsub = preg->re_nsub++;
2418 
2419   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2420 
2421   /* The subexpression may be a null string.  */
2422   if (token->type == OP_CLOSE_SUBEXP)
2423     tree = NULL;
2424   else
2425     {
2426       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2427       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2428         *err = REG_EPAREN;
2429       if (BE (*err != REG_NOERROR, 0))
2430 	return NULL;
2431     }
2432 
2433   if (cur_nsub <= '9' - '1')
2434     dfa->completed_bkref_map |= 1 << cur_nsub;
2435 
2436   tree = create_tree (dfa, tree, NULL, SUBEXP);
2437   if (BE (tree == NULL, 0))
2438     {
2439       *err = REG_ESPACE;
2440       return NULL;
2441     }
2442   tree->token.opr.idx = cur_nsub;
2443   return tree;
2444 }
2445 
2446 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2447 
2448 static bin_tree_t *
parse_dup_op(bin_tree_t * elem,re_string_t * regexp,re_dfa_t * dfa,re_token_t * token,reg_syntax_t syntax,reg_errcode_t * err)2449 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2450 	      re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2451 {
2452   bin_tree_t *tree = NULL, *old_tree = NULL;
2453   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2454   re_token_t start_token = *token;
2455 
2456   if (token->type == OP_OPEN_DUP_NUM)
2457     {
2458       end = 0;
2459       start = fetch_number (regexp, token, syntax);
2460       if (start == REG_MISSING)
2461 	{
2462 	  if (token->type == CHARACTER && token->opr.c == ',')
2463 	    start = 0; /* We treat "{,m}" as "{0,m}".  */
2464 	  else
2465 	    {
2466 	      *err = REG_BADBR; /* <re>{} is invalid.  */
2467 	      return NULL;
2468 	    }
2469 	}
2470       if (BE (start != REG_ERROR, 1))
2471 	{
2472 	  /* We treat "{n}" as "{n,n}".  */
2473 	  end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2474 		 : ((token->type == CHARACTER && token->opr.c == ',')
2475 		    ? fetch_number (regexp, token, syntax) : REG_ERROR));
2476 	}
2477       if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2478 	{
2479 	  /* Invalid sequence.  */
2480 	  if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2481 	    {
2482 	      if (token->type == END_OF_RE)
2483 		*err = REG_EBRACE;
2484 	      else
2485 		*err = REG_BADBR;
2486 
2487 	      return NULL;
2488 	    }
2489 
2490 	  /* If the syntax bit is set, rollback.  */
2491 	  re_string_set_index (regexp, start_idx);
2492 	  *token = start_token;
2493 	  token->type = CHARACTER;
2494 	  /* mb_partial and word_char bits should be already initialized by
2495 	     peek_token.  */
2496 	  return elem;
2497 	}
2498 
2499       if (BE (end != REG_MISSING && start > end, 0))
2500 	{
2501 	  /* First number greater than second.  */
2502 	  *err = REG_BADBR;
2503 	  return NULL;
2504 	}
2505     }
2506   else
2507     {
2508       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2509       end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2510     }
2511 
2512   fetch_token (token, regexp, syntax);
2513 
2514   if (BE (elem == NULL, 0))
2515     return NULL;
2516   if (BE (start == 0 && end == 0, 0))
2517     {
2518       postorder (elem, free_tree, NULL);
2519       return NULL;
2520     }
2521 
2522   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2523   if (BE (start > 0, 0))
2524     {
2525       tree = elem;
2526       for (i = 2; i <= start; ++i)
2527 	{
2528 	  elem = duplicate_tree (elem, dfa);
2529 	  tree = create_tree (dfa, tree, elem, CONCAT);
2530 	  if (BE (elem == NULL || tree == NULL, 0))
2531 	    goto parse_dup_op_espace;
2532 	}
2533 
2534       if (start == end)
2535 	return tree;
2536 
2537       /* Duplicate ELEM before it is marked optional.  */
2538       elem = duplicate_tree (elem, dfa);
2539       old_tree = tree;
2540     }
2541   else
2542     old_tree = NULL;
2543 
2544   if (elem->token.type == SUBEXP)
2545     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2546 
2547   tree = create_tree (dfa, elem, NULL,
2548 		      (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2549   if (BE (tree == NULL, 0))
2550     goto parse_dup_op_espace;
2551 
2552   /* This loop is actually executed only when end != REG_MISSING,
2553      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2554      already created the start+1-th copy.  */
2555   if ((Idx) -1 < 0 || end != REG_MISSING)
2556     for (i = start + 2; i <= end; ++i)
2557       {
2558 	elem = duplicate_tree (elem, dfa);
2559 	tree = create_tree (dfa, tree, elem, CONCAT);
2560 	if (BE (elem == NULL || tree == NULL, 0))
2561 	  goto parse_dup_op_espace;
2562 
2563 	tree = create_tree (dfa, tree, NULL, OP_ALT);
2564 	if (BE (tree == NULL, 0))
2565 	  goto parse_dup_op_espace;
2566       }
2567 
2568   if (old_tree)
2569     tree = create_tree (dfa, old_tree, tree, CONCAT);
2570 
2571   return tree;
2572 
2573  parse_dup_op_espace:
2574   *err = REG_ESPACE;
2575   return NULL;
2576 }
2577 
2578 /* Size of the names for collating symbol/equivalence_class/character_class.
2579    I'm not sure, but maybe enough.  */
2580 #define BRACKET_NAME_BUF_SIZE 32
2581 
2582 #ifndef _LIBC
2583   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2584      Build the range expression which starts from START_ELEM, and ends
2585      at END_ELEM.  The result are written to MBCSET and SBCSET.
2586      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2587      mbcset->range_ends, is a pointer argument sinse we may
2588      update it.  */
2589 
2590 static reg_errcode_t
2591 internal_function
2592 # ifdef RE_ENABLE_I18N
build_range_exp(bitset_t sbcset,re_charset_t * mbcset,Idx * range_alloc,bracket_elem_t * start_elem,bracket_elem_t * end_elem)2593 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
2594 		 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2595 # else /* not RE_ENABLE_I18N */
2596 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2597 		 bracket_elem_t *end_elem)
2598 # endif /* not RE_ENABLE_I18N */
2599 {
2600   unsigned int start_ch, end_ch;
2601   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2602   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2603 	  || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2604 	  0))
2605     return REG_ERANGE;
2606 
2607   /* We can handle no multi character collating elements without libc
2608      support.  */
2609   if (BE ((start_elem->type == COLL_SYM
2610 	   && strlen ((char *) start_elem->opr.name) > 1)
2611 	  || (end_elem->type == COLL_SYM
2612 	      && strlen ((char *) end_elem->opr.name) > 1), 0))
2613     return REG_ECOLLATE;
2614 
2615 # ifdef RE_ENABLE_I18N
2616   {
2617     wchar_t wc;
2618     wint_t start_wc;
2619     wint_t end_wc;
2620     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2621 
2622     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2623 		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2624 		   : 0));
2625     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2626 	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2627 		 : 0));
2628     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2629 		? __btowc (start_ch) : start_elem->opr.wch);
2630     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2631 	      ? __btowc (end_ch) : end_elem->opr.wch);
2632     if (start_wc == WEOF || end_wc == WEOF)
2633       return REG_ECOLLATE;
2634     cmp_buf[0] = start_wc;
2635     cmp_buf[4] = end_wc;
2636     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2637       return REG_ERANGE;
2638 
2639     /* Got valid collation sequence values, add them as a new entry.
2640        However, for !_LIBC we have no collation elements: if the
2641        character set is single byte, the single byte character set
2642        that we build below suffices.  parse_bracket_exp passes
2643        no MBCSET if dfa->mb_cur_max == 1.  */
2644     if (mbcset)
2645       {
2646         /* Check the space of the arrays.  */
2647         if (BE (*range_alloc == mbcset->nranges, 0))
2648           {
2649 	    /* There is not enough space, need realloc.  */
2650 	    wchar_t *new_array_start, *new_array_end;
2651 	    Idx new_nranges;
2652 
2653 	    /* +1 in case of mbcset->nranges is 0.  */
2654 	    new_nranges = 2 * mbcset->nranges + 1;
2655 	    /* Use realloc since mbcset->range_starts and mbcset->range_ends
2656 	       are NULL if *range_alloc == 0.  */
2657 	    new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2658 				          new_nranges);
2659 	    new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2660 				        new_nranges);
2661 
2662 	    if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2663 	      return REG_ESPACE;
2664 
2665 	    mbcset->range_starts = new_array_start;
2666 	    mbcset->range_ends = new_array_end;
2667 	    *range_alloc = new_nranges;
2668           }
2669 
2670         mbcset->range_starts[mbcset->nranges] = start_wc;
2671         mbcset->range_ends[mbcset->nranges++] = end_wc;
2672       }
2673 
2674     /* Build the table for single byte characters.  */
2675     for (wc = 0; wc < SBC_MAX; ++wc)
2676       {
2677 	cmp_buf[2] = wc;
2678 	if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2679 	    && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2680 	  bitset_set (sbcset, wc);
2681       }
2682   }
2683 # else /* not RE_ENABLE_I18N */
2684   {
2685     unsigned int ch;
2686     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2687 		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2688 		   : 0));
2689     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2690 	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2691 		 : 0));
2692     if (start_ch > end_ch)
2693       return REG_ERANGE;
2694     /* Build the table for single byte characters.  */
2695     for (ch = 0; ch < SBC_MAX; ++ch)
2696       if (start_ch <= ch  && ch <= end_ch)
2697 	bitset_set (sbcset, ch);
2698   }
2699 # endif /* not RE_ENABLE_I18N */
2700   return REG_NOERROR;
2701 }
2702 #endif /* not _LIBC */
2703 
2704 #ifndef _LIBC
2705 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2706    Build the collating element which is represented by NAME.
2707    The result are written to MBCSET and SBCSET.
2708    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2709    pointer argument since we may update it.  */
2710 
2711 static reg_errcode_t
2712 internal_function
build_collating_symbol(bitset_t sbcset,re_charset_t * mbcset,Idx * coll_sym_alloc,const unsigned char * name)2713 build_collating_symbol (bitset_t sbcset,
2714 # ifdef RE_ENABLE_I18N
2715 			re_charset_t *mbcset, Idx *coll_sym_alloc,
2716 # endif
2717 			const unsigned char *name)
2718 {
2719   size_t name_len = strlen ((const char *) name);
2720   if (BE (name_len != 1, 0))
2721     return REG_ECOLLATE;
2722   else
2723     {
2724       bitset_set (sbcset, name[0]);
2725       return REG_NOERROR;
2726     }
2727 }
2728 #endif /* not _LIBC */
2729 
2730 /* This function parse bracket expression like "[abc]", "[a-c]",
2731    "[[.a-a.]]" etc.  */
2732 
2733 static bin_tree_t *
parse_bracket_exp(re_string_t * regexp,re_dfa_t * dfa,re_token_t * token,reg_syntax_t syntax,reg_errcode_t * err)2734 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2735 		   reg_syntax_t syntax, reg_errcode_t *err)
2736 {
2737 #ifdef _LIBC
2738   const unsigned char *collseqmb;
2739   const char *collseqwc;
2740   uint32_t nrules;
2741   int32_t table_size;
2742   const int32_t *symb_table;
2743   const unsigned char *extra;
2744 
2745   /* Local function for parse_bracket_exp used in _LIBC environement.
2746      Seek the collating symbol entry correspondings to NAME.
2747      Return the index of the symbol in the SYMB_TABLE.  */
2748 
2749   auto inline int32_t
2750   __attribute ((always_inline))
2751   seek_collating_symbol_entry (name, name_len)
2752 	 const unsigned char *name;
2753 	 size_t name_len;
2754     {
2755       int32_t hash = elem_hash ((const char *) name, name_len);
2756       int32_t elem = hash % table_size;
2757       if (symb_table[2 * elem] != 0)
2758 	{
2759 	  int32_t second = hash % (table_size - 2) + 1;
2760 
2761 	  do
2762 	    {
2763 	      /* First compare the hashing value.  */
2764 	      if (symb_table[2 * elem] == hash
2765 		  /* Compare the length of the name.  */
2766 		  && name_len == extra[symb_table[2 * elem + 1]]
2767 		  /* Compare the name.  */
2768 		  && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2769 			     name_len) == 0)
2770 		{
2771 		  /* Yep, this is the entry.  */
2772 		  break;
2773 		}
2774 
2775 	      /* Next entry.  */
2776 	      elem += second;
2777 	    }
2778 	  while (symb_table[2 * elem] != 0);
2779 	}
2780       return elem;
2781     }
2782 
2783   /* Local function for parse_bracket_exp used in _LIBC environement.
2784      Look up the collation sequence value of BR_ELEM.
2785      Return the value if succeeded, UINT_MAX otherwise.  */
2786 
2787   auto inline unsigned int
2788   __attribute ((always_inline))
2789   lookup_collation_sequence_value (br_elem)
2790 	 bracket_elem_t *br_elem;
2791     {
2792       if (br_elem->type == SB_CHAR)
2793 	{
2794 	  /*
2795 	  if (MB_CUR_MAX == 1)
2796 	  */
2797 	  if (nrules == 0)
2798 	    return collseqmb[br_elem->opr.ch];
2799 	  else
2800 	    {
2801 	      wint_t wc = __btowc (br_elem->opr.ch);
2802 	      return __collseq_table_lookup (collseqwc, wc);
2803 	    }
2804 	}
2805       else if (br_elem->type == MB_CHAR)
2806 	{
2807 	  return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2808 	}
2809       else if (br_elem->type == COLL_SYM)
2810 	{
2811 	  size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2812 	  if (nrules != 0)
2813 	    {
2814 	      int32_t elem, idx;
2815 	      elem = seek_collating_symbol_entry (br_elem->opr.name,
2816 						  sym_name_len);
2817 	      if (symb_table[2 * elem] != 0)
2818 		{
2819 		  /* We found the entry.  */
2820 		  idx = symb_table[2 * elem + 1];
2821 		  /* Skip the name of collating element name.  */
2822 		  idx += 1 + extra[idx];
2823 		  /* Skip the byte sequence of the collating element.  */
2824 		  idx += 1 + extra[idx];
2825 		  /* Adjust for the alignment.  */
2826 		  idx = (idx + 3) & ~3;
2827 		  /* Skip the multibyte collation sequence value.  */
2828 		  idx += sizeof (unsigned int);
2829 		  /* Skip the wide char sequence of the collating element.  */
2830 		  idx += sizeof (unsigned int) *
2831 		    (1 + *(unsigned int *) (extra + idx));
2832 		  /* Return the collation sequence value.  */
2833 		  return *(unsigned int *) (extra + idx);
2834 		}
2835 	      else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2836 		{
2837 		  /* No valid character.  Match it as a single byte
2838 		     character.  */
2839 		  return collseqmb[br_elem->opr.name[0]];
2840 		}
2841 	    }
2842 	  else if (sym_name_len == 1)
2843 	    return collseqmb[br_elem->opr.name[0]];
2844 	}
2845       return UINT_MAX;
2846     }
2847 
2848   /* Local function for parse_bracket_exp used in _LIBC environement.
2849      Build the range expression which starts from START_ELEM, and ends
2850      at END_ELEM.  The result are written to MBCSET and SBCSET.
2851      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2852      mbcset->range_ends, is a pointer argument sinse we may
2853      update it.  */
2854 
2855   auto inline reg_errcode_t
2856   __attribute ((always_inline))
2857   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2858 	 re_charset_t *mbcset;
2859 	 Idx *range_alloc;
2860 	 bitset_t sbcset;
2861 	 bracket_elem_t *start_elem, *end_elem;
2862     {
2863       unsigned int ch;
2864       uint32_t start_collseq;
2865       uint32_t end_collseq;
2866 
2867       /* Equivalence Classes and Character Classes can't be a range
2868 	 start/end.  */
2869       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2870 	      || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2871 	      0))
2872 	return REG_ERANGE;
2873 
2874       start_collseq = lookup_collation_sequence_value (start_elem);
2875       end_collseq = lookup_collation_sequence_value (end_elem);
2876       /* Check start/end collation sequence values.  */
2877       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2878 	return REG_ECOLLATE;
2879       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2880 	return REG_ERANGE;
2881 
2882       /* Got valid collation sequence values, add them as a new entry.
2883 	 However, if we have no collation elements, and the character set
2884 	 is single byte, the single byte character set that we
2885 	 build below suffices. */
2886       if (nrules > 0 || dfa->mb_cur_max > 1)
2887 	{
2888           /* Check the space of the arrays.  */
2889           if (BE (*range_alloc == mbcset->nranges, 0))
2890 	    {
2891 	      /* There is not enough space, need realloc.  */
2892 	      uint32_t *new_array_start;
2893 	      uint32_t *new_array_end;
2894 	      Idx new_nranges;
2895 
2896 	      /* +1 in case of mbcset->nranges is 0.  */
2897 	      new_nranges = 2 * mbcset->nranges + 1;
2898 	      new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2899 					    new_nranges);
2900 	      new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2901 				          new_nranges);
2902 
2903 	      if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2904 	        return REG_ESPACE;
2905 
2906 	      mbcset->range_starts = new_array_start;
2907 	      mbcset->range_ends = new_array_end;
2908 	      *range_alloc = new_nranges;
2909 	    }
2910 
2911           mbcset->range_starts[mbcset->nranges] = start_collseq;
2912           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2913 	}
2914 
2915       /* Build the table for single byte characters.  */
2916       for (ch = 0; ch < SBC_MAX; ch++)
2917 	{
2918 	  uint32_t ch_collseq;
2919 	  /*
2920 	  if (MB_CUR_MAX == 1)
2921 	  */
2922 	  if (nrules == 0)
2923 	    ch_collseq = collseqmb[ch];
2924 	  else
2925 	    ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2926 	  if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2927 	    bitset_set (sbcset, ch);
2928 	}
2929       return REG_NOERROR;
2930     }
2931 
2932   /* Local function for parse_bracket_exp used in _LIBC environement.
2933      Build the collating element which is represented by NAME.
2934      The result are written to MBCSET and SBCSET.
2935      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2936      pointer argument sinse we may update it.  */
2937 
2938   auto inline reg_errcode_t
2939   __attribute ((always_inline))
2940   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2941 	 re_charset_t *mbcset;
2942 	 Idx *coll_sym_alloc;
2943 	 bitset_t sbcset;
2944 	 const unsigned char *name;
2945     {
2946       int32_t elem, idx;
2947       size_t name_len = strlen ((const char *) name);
2948       if (nrules != 0)
2949 	{
2950 	  elem = seek_collating_symbol_entry (name, name_len);
2951 	  if (symb_table[2 * elem] != 0)
2952 	    {
2953 	      /* We found the entry.  */
2954 	      idx = symb_table[2 * elem + 1];
2955 	      /* Skip the name of collating element name.  */
2956 	      idx += 1 + extra[idx];
2957 	    }
2958 	  else if (symb_table[2 * elem] == 0 && name_len == 1)
2959 	    {
2960 	      /* No valid character, treat it as a normal
2961 		 character.  */
2962 	      bitset_set (sbcset, name[0]);
2963 	      return REG_NOERROR;
2964 	    }
2965 	  else
2966 	    return REG_ECOLLATE;
2967 
2968 	  /* Got valid collation sequence, add it as a new entry.  */
2969 	  /* Check the space of the arrays.  */
2970 	  if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2971 	    {
2972 	      /* Not enough, realloc it.  */
2973 	      /* +1 in case of mbcset->ncoll_syms is 0.  */
2974 	      Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2975 	      /* Use realloc since mbcset->coll_syms is NULL
2976 		 if *alloc == 0.  */
2977 	      int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2978 						   new_coll_sym_alloc);
2979 	      if (BE (new_coll_syms == NULL, 0))
2980 		return REG_ESPACE;
2981 	      mbcset->coll_syms = new_coll_syms;
2982 	      *coll_sym_alloc = new_coll_sym_alloc;
2983 	    }
2984 	  mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2985 	  return REG_NOERROR;
2986 	}
2987       else
2988 	{
2989 	  if (BE (name_len != 1, 0))
2990 	    return REG_ECOLLATE;
2991 	  else
2992 	    {
2993 	      bitset_set (sbcset, name[0]);
2994 	      return REG_NOERROR;
2995 	    }
2996 	}
2997     }
2998 #endif
2999 
3000   re_token_t br_token;
3001   re_bitset_ptr_t sbcset;
3002 #ifdef RE_ENABLE_I18N
3003   re_charset_t *mbcset;
3004   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3005   Idx equiv_class_alloc = 0, char_class_alloc = 0;
3006 #endif /* not RE_ENABLE_I18N */
3007   bool non_match = false;
3008   bin_tree_t *work_tree;
3009   int token_len;
3010   bool first_round = true;
3011 #ifdef _LIBC
3012   collseqmb = (const unsigned char *)
3013     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3014   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3015   if (nrules)
3016     {
3017       /*
3018       if (MB_CUR_MAX > 1)
3019       */
3020       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3021       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3022       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3023 						  _NL_COLLATE_SYMB_TABLEMB);
3024       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3025 						   _NL_COLLATE_SYMB_EXTRAMB);
3026     }
3027 #endif
3028   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3029 #ifdef RE_ENABLE_I18N
3030   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3031 #endif /* RE_ENABLE_I18N */
3032 #ifdef RE_ENABLE_I18N
3033   if (BE (sbcset == NULL || mbcset == NULL, 0))
3034 #else
3035   if (BE (sbcset == NULL, 0))
3036 #endif /* RE_ENABLE_I18N */
3037     {
3038       *err = REG_ESPACE;
3039       return NULL;
3040     }
3041 
3042   token_len = peek_token_bracket (token, regexp, syntax);
3043   if (BE (token->type == END_OF_RE, 0))
3044     {
3045       *err = REG_BADPAT;
3046       goto parse_bracket_exp_free_return;
3047     }
3048   if (token->type == OP_NON_MATCH_LIST)
3049     {
3050 #ifdef RE_ENABLE_I18N
3051       mbcset->non_match = 1;
3052 #endif /* not RE_ENABLE_I18N */
3053       non_match = true;
3054       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3055 	bitset_set (sbcset, '\n');
3056       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3057       token_len = peek_token_bracket (token, regexp, syntax);
3058       if (BE (token->type == END_OF_RE, 0))
3059 	{
3060 	  *err = REG_BADPAT;
3061 	  goto parse_bracket_exp_free_return;
3062 	}
3063     }
3064 
3065   /* We treat the first ']' as a normal character.  */
3066   if (token->type == OP_CLOSE_BRACKET)
3067     token->type = CHARACTER;
3068 
3069   while (1)
3070     {
3071       bracket_elem_t start_elem, end_elem;
3072       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3073       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3074       reg_errcode_t ret;
3075       int token_len2 = 0;
3076       bool is_range_exp = false;
3077       re_token_t token2;
3078 
3079       start_elem.opr.name = start_name_buf;
3080       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3081 				   syntax, first_round);
3082       if (BE (ret != REG_NOERROR, 0))
3083 	{
3084 	  *err = ret;
3085 	  goto parse_bracket_exp_free_return;
3086 	}
3087       first_round = false;
3088 
3089       /* Get information about the next token.  We need it in any case.  */
3090       token_len = peek_token_bracket (token, regexp, syntax);
3091 
3092       /* Do not check for ranges if we know they are not allowed.  */
3093       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3094 	{
3095 	  if (BE (token->type == END_OF_RE, 0))
3096 	    {
3097 	      *err = REG_EBRACK;
3098 	      goto parse_bracket_exp_free_return;
3099 	    }
3100 	  if (token->type == OP_CHARSET_RANGE)
3101 	    {
3102 	      re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3103 	      token_len2 = peek_token_bracket (&token2, regexp, syntax);
3104 	      if (BE (token2.type == END_OF_RE, 0))
3105 		{
3106 		  *err = REG_EBRACK;
3107 		  goto parse_bracket_exp_free_return;
3108 		}
3109 	      if (token2.type == OP_CLOSE_BRACKET)
3110 		{
3111 		  /* We treat the last '-' as a normal character.  */
3112 		  re_string_skip_bytes (regexp, -token_len);
3113 		  token->type = CHARACTER;
3114 		}
3115 	      else
3116 		is_range_exp = true;
3117 	    }
3118 	}
3119 
3120       if (is_range_exp == true)
3121 	{
3122 	  end_elem.opr.name = end_name_buf;
3123 	  ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3124 				       dfa, syntax, true);
3125 	  if (BE (ret != REG_NOERROR, 0))
3126 	    {
3127 	      *err = ret;
3128 	      goto parse_bracket_exp_free_return;
3129 	    }
3130 
3131 	  token_len = peek_token_bracket (token, regexp, syntax);
3132 
3133 #ifdef _LIBC
3134 	  *err = build_range_exp (sbcset, mbcset, &range_alloc,
3135 				  &start_elem, &end_elem);
3136 #else
3137 # ifdef RE_ENABLE_I18N
3138 	  *err = build_range_exp (sbcset,
3139 				  dfa->mb_cur_max > 1 ? mbcset : NULL,
3140 				  &range_alloc, &start_elem, &end_elem);
3141 # else
3142 	  *err = build_range_exp (sbcset, &start_elem, &end_elem);
3143 # endif
3144 #endif /* RE_ENABLE_I18N */
3145 	  if (BE (*err != REG_NOERROR, 0))
3146 	    goto parse_bracket_exp_free_return;
3147 	}
3148       else
3149 	{
3150 	  switch (start_elem.type)
3151 	    {
3152 	    case SB_CHAR:
3153 	      bitset_set (sbcset, start_elem.opr.ch);
3154 	      break;
3155 #ifdef RE_ENABLE_I18N
3156 	    case MB_CHAR:
3157 	      /* Check whether the array has enough space.  */
3158 	      if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3159 		{
3160 		  wchar_t *new_mbchars;
3161 		  /* Not enough, realloc it.  */
3162 		  /* +1 in case of mbcset->nmbchars is 0.  */
3163 		  mbchar_alloc = 2 * mbcset->nmbchars + 1;
3164 		  /* Use realloc since array is NULL if *alloc == 0.  */
3165 		  new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3166 					    mbchar_alloc);
3167 		  if (BE (new_mbchars == NULL, 0))
3168 		    goto parse_bracket_exp_espace;
3169 		  mbcset->mbchars = new_mbchars;
3170 		}
3171 	      mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3172 	      break;
3173 #endif /* RE_ENABLE_I18N */
3174 	    case EQUIV_CLASS:
3175 	      *err = build_equiv_class (sbcset,
3176 #ifdef RE_ENABLE_I18N
3177 					mbcset, &equiv_class_alloc,
3178 #endif /* RE_ENABLE_I18N */
3179 					start_elem.opr.name);
3180 	      if (BE (*err != REG_NOERROR, 0))
3181 		goto parse_bracket_exp_free_return;
3182 	      break;
3183 	    case COLL_SYM:
3184 	      *err = build_collating_symbol (sbcset,
3185 #ifdef RE_ENABLE_I18N
3186 					     mbcset, &coll_sym_alloc,
3187 #endif /* RE_ENABLE_I18N */
3188 					     start_elem.opr.name);
3189 	      if (BE (*err != REG_NOERROR, 0))
3190 		goto parse_bracket_exp_free_return;
3191 	      break;
3192 	    case CHAR_CLASS:
3193 	      *err = build_charclass (regexp->trans, sbcset,
3194 #ifdef RE_ENABLE_I18N
3195 				      mbcset, &char_class_alloc,
3196 #endif /* RE_ENABLE_I18N */
3197 				      start_elem.opr.name, syntax);
3198 	      if (BE (*err != REG_NOERROR, 0))
3199 	       goto parse_bracket_exp_free_return;
3200 	      break;
3201 	    default:
3202 	      assert (0);
3203 	      break;
3204 	    }
3205 	}
3206       if (BE (token->type == END_OF_RE, 0))
3207 	{
3208 	  *err = REG_EBRACK;
3209 	  goto parse_bracket_exp_free_return;
3210 	}
3211       if (token->type == OP_CLOSE_BRACKET)
3212 	break;
3213     }
3214 
3215   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3216 
3217   /* If it is non-matching list.  */
3218   if (non_match)
3219     bitset_not (sbcset);
3220 
3221 #ifdef RE_ENABLE_I18N
3222   /* Ensure only single byte characters are set.  */
3223   if (dfa->mb_cur_max > 1)
3224     bitset_mask (sbcset, dfa->sb_char);
3225 
3226   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3227       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3228 						     || mbcset->non_match)))
3229     {
3230       bin_tree_t *mbc_tree;
3231       int sbc_idx;
3232       /* Build a tree for complex bracket.  */
3233       dfa->has_mb_node = 1;
3234       br_token.type = COMPLEX_BRACKET;
3235       br_token.opr.mbcset = mbcset;
3236       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3237       if (BE (mbc_tree == NULL, 0))
3238 	goto parse_bracket_exp_espace;
3239       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3240 	if (sbcset[sbc_idx])
3241 	  break;
3242       /* If there are no bits set in sbcset, there is no point
3243 	 of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3244       if (sbc_idx < BITSET_WORDS)
3245 	{
3246           /* Build a tree for simple bracket.  */
3247           br_token.type = SIMPLE_BRACKET;
3248           br_token.opr.sbcset = sbcset;
3249           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3250           if (BE (work_tree == NULL, 0))
3251             goto parse_bracket_exp_espace;
3252 
3253           /* Then join them by ALT node.  */
3254           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3255           if (BE (work_tree == NULL, 0))
3256             goto parse_bracket_exp_espace;
3257 	}
3258       else
3259 	{
3260 	  re_free (sbcset);
3261 	  work_tree = mbc_tree;
3262 	}
3263     }
3264   else
3265 #endif /* not RE_ENABLE_I18N */
3266     {
3267 #ifdef RE_ENABLE_I18N
3268       free_charset (mbcset);
3269 #endif
3270       /* Build a tree for simple bracket.  */
3271       br_token.type = SIMPLE_BRACKET;
3272       br_token.opr.sbcset = sbcset;
3273       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3274       if (BE (work_tree == NULL, 0))
3275         goto parse_bracket_exp_espace;
3276     }
3277   return work_tree;
3278 
3279  parse_bracket_exp_espace:
3280   *err = REG_ESPACE;
3281  parse_bracket_exp_free_return:
3282   re_free (sbcset);
3283 #ifdef RE_ENABLE_I18N
3284   free_charset (mbcset);
3285 #endif /* RE_ENABLE_I18N */
3286   return NULL;
3287 }
3288 
3289 /* Parse an element in the bracket expression.  */
3290 
3291 static reg_errcode_t
parse_bracket_element(bracket_elem_t * elem,re_string_t * regexp,re_token_t * token,int token_len,re_dfa_t * dfa,reg_syntax_t syntax,bool accept_hyphen)3292 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3293 		       re_token_t *token, int token_len, re_dfa_t *dfa,
3294 		       reg_syntax_t syntax, bool accept_hyphen)
3295 {
3296 #ifdef RE_ENABLE_I18N
3297   int cur_char_size;
3298   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3299   if (cur_char_size > 1)
3300     {
3301       elem->type = MB_CHAR;
3302       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3303       re_string_skip_bytes (regexp, cur_char_size);
3304       return REG_NOERROR;
3305     }
3306 #endif /* RE_ENABLE_I18N */
3307   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3308   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3309       || token->type == OP_OPEN_EQUIV_CLASS)
3310     return parse_bracket_symbol (elem, regexp, token);
3311   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3312     {
3313       /* A '-' must only appear as anything but a range indicator before
3314 	 the closing bracket.  Everything else is an error.  */
3315       re_token_t token2;
3316       (void) peek_token_bracket (&token2, regexp, syntax);
3317       if (token2.type != OP_CLOSE_BRACKET)
3318 	/* The actual error value is not standardized since this whole
3319 	   case is undefined.  But ERANGE makes good sense.  */
3320 	return REG_ERANGE;
3321     }
3322   elem->type = SB_CHAR;
3323   elem->opr.ch = token->opr.c;
3324   return REG_NOERROR;
3325 }
3326 
3327 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3328    such as [:<character_class>:], [.<collating_element>.], and
3329    [=<equivalent_class>=].  */
3330 
3331 static reg_errcode_t
parse_bracket_symbol(bracket_elem_t * elem,re_string_t * regexp,re_token_t * token)3332 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3333 		      re_token_t *token)
3334 {
3335   unsigned char ch, delim = token->opr.c;
3336   int i = 0;
3337   if (re_string_eoi(regexp))
3338     return REG_EBRACK;
3339   for (;; ++i)
3340     {
3341       if (i >= BRACKET_NAME_BUF_SIZE)
3342 	return REG_EBRACK;
3343       if (token->type == OP_OPEN_CHAR_CLASS)
3344 	ch = re_string_fetch_byte_case (regexp);
3345       else
3346 	ch = re_string_fetch_byte (regexp);
3347       if (re_string_eoi(regexp))
3348 	return REG_EBRACK;
3349       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3350 	break;
3351       elem->opr.name[i] = ch;
3352     }
3353   re_string_skip_bytes (regexp, 1);
3354   elem->opr.name[i] = '\0';
3355   switch (token->type)
3356     {
3357     case OP_OPEN_COLL_ELEM:
3358       elem->type = COLL_SYM;
3359       break;
3360     case OP_OPEN_EQUIV_CLASS:
3361       elem->type = EQUIV_CLASS;
3362       break;
3363     case OP_OPEN_CHAR_CLASS:
3364       elem->type = CHAR_CLASS;
3365       break;
3366     default:
3367       break;
3368     }
3369   return REG_NOERROR;
3370 }
3371 
3372   /* Helper function for parse_bracket_exp.
3373      Build the equivalence class which is represented by NAME.
3374      The result are written to MBCSET and SBCSET.
3375      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3376      is a pointer argument sinse we may update it.  */
3377 
3378 static reg_errcode_t
3379 #ifdef RE_ENABLE_I18N
build_equiv_class(bitset_t sbcset,re_charset_t * mbcset,Idx * equiv_class_alloc,const unsigned char * name)3380 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3381 		   Idx *equiv_class_alloc, const unsigned char *name)
3382 #else /* not RE_ENABLE_I18N */
3383 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3384 #endif /* not RE_ENABLE_I18N */
3385 {
3386 #ifdef _LIBC
3387   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3388   if (nrules != 0)
3389     {
3390       const int32_t *table, *indirect;
3391       const unsigned char *weights, *extra, *cp;
3392       unsigned char char_buf[2];
3393       int32_t idx1, idx2;
3394       unsigned int ch;
3395       size_t len;
3396       /* This #include defines a local function!  */
3397 # include <locale/weight.h>
3398       /* Calculate the index for equivalence class.  */
3399       cp = name;
3400       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3401       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3402 					       _NL_COLLATE_WEIGHTMB);
3403       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3404 						   _NL_COLLATE_EXTRAMB);
3405       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3406 						_NL_COLLATE_INDIRECTMB);
3407       idx1 = findidx (&cp);
3408       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3409 	/* This isn't a valid character.  */
3410 	return REG_ECOLLATE;
3411 
3412       /* Build single byte matcing table for this equivalence class.  */
3413       char_buf[1] = (unsigned char) '\0';
3414       len = weights[idx1];
3415       for (ch = 0; ch < SBC_MAX; ++ch)
3416 	{
3417 	  char_buf[0] = ch;
3418 	  cp = char_buf;
3419 	  idx2 = findidx (&cp);
3420 /*
3421 	  idx2 = table[ch];
3422 */
3423 	  if (idx2 == 0)
3424 	    /* This isn't a valid character.  */
3425 	    continue;
3426 	  if (len == weights[idx2])
3427 	    {
3428 	      int cnt = 0;
3429 	      while (cnt <= len &&
3430 		     weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3431 		++cnt;
3432 
3433 	      if (cnt > len)
3434 		bitset_set (sbcset, ch);
3435 	    }
3436 	}
3437       /* Check whether the array has enough space.  */
3438       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3439 	{
3440 	  /* Not enough, realloc it.  */
3441 	  /* +1 in case of mbcset->nequiv_classes is 0.  */
3442 	  Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3443 	  /* Use realloc since the array is NULL if *alloc == 0.  */
3444 	  int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3445 						   int32_t,
3446 						   new_equiv_class_alloc);
3447 	  if (BE (new_equiv_classes == NULL, 0))
3448 	    return REG_ESPACE;
3449 	  mbcset->equiv_classes = new_equiv_classes;
3450 	  *equiv_class_alloc = new_equiv_class_alloc;
3451 	}
3452       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3453     }
3454   else
3455 #endif /* _LIBC */
3456     {
3457       if (BE (strlen ((const char *) name) != 1, 0))
3458 	return REG_ECOLLATE;
3459       bitset_set (sbcset, *name);
3460     }
3461   return REG_NOERROR;
3462 }
3463 
3464   /* Helper function for parse_bracket_exp.
3465      Build the character class which is represented by NAME.
3466      The result are written to MBCSET and SBCSET.
3467      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3468      is a pointer argument sinse we may update it.  */
3469 
3470 static reg_errcode_t
3471 #ifdef RE_ENABLE_I18N
build_charclass(RE_TRANSLATE_TYPE trans,bitset_t sbcset,re_charset_t * mbcset,Idx * char_class_alloc,const unsigned char * class_name,reg_syntax_t syntax)3472 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3473 		 re_charset_t *mbcset, Idx *char_class_alloc,
3474 		 const unsigned char *class_name, reg_syntax_t syntax)
3475 #else /* not RE_ENABLE_I18N */
3476 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3477 		 const unsigned char *class_name, reg_syntax_t syntax)
3478 #endif /* not RE_ENABLE_I18N */
3479 {
3480   int i;
3481   const char *name = (const char *) class_name;
3482 
3483   /* In case of REG_ICASE "upper" and "lower" match the both of
3484      upper and lower cases.  */
3485   if ((syntax & RE_ICASE)
3486       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3487     name = "alpha";
3488 
3489 #ifdef RE_ENABLE_I18N
3490   /* Check the space of the arrays.  */
3491   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3492     {
3493       /* Not enough, realloc it.  */
3494       /* +1 in case of mbcset->nchar_classes is 0.  */
3495       Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3496       /* Use realloc since array is NULL if *alloc == 0.  */
3497       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3498 					       new_char_class_alloc);
3499       if (BE (new_char_classes == NULL, 0))
3500 	return REG_ESPACE;
3501       mbcset->char_classes = new_char_classes;
3502       *char_class_alloc = new_char_class_alloc;
3503     }
3504   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3505 #endif /* RE_ENABLE_I18N */
3506 
3507 #define BUILD_CHARCLASS_LOOP(ctype_func)	\
3508   do {						\
3509     if (BE (trans != NULL, 0))			\
3510       {						\
3511 	for (i = 0; i < SBC_MAX; ++i)		\
3512 	  if (ctype_func (i))			\
3513 	    bitset_set (sbcset, trans[i]);	\
3514       }						\
3515     else					\
3516       {						\
3517 	for (i = 0; i < SBC_MAX; ++i)		\
3518 	  if (ctype_func (i))			\
3519 	    bitset_set (sbcset, i);		\
3520       }						\
3521   } while (0)
3522 
3523   if (strcmp (name, "alnum") == 0)
3524     BUILD_CHARCLASS_LOOP (isalnum);
3525   else if (strcmp (name, "cntrl") == 0)
3526     BUILD_CHARCLASS_LOOP (iscntrl);
3527   else if (strcmp (name, "lower") == 0)
3528     BUILD_CHARCLASS_LOOP (islower);
3529   else if (strcmp (name, "space") == 0)
3530     BUILD_CHARCLASS_LOOP (isspace);
3531   else if (strcmp (name, "alpha") == 0)
3532     BUILD_CHARCLASS_LOOP (isalpha);
3533   else if (strcmp (name, "digit") == 0)
3534     BUILD_CHARCLASS_LOOP (isdigit);
3535   else if (strcmp (name, "print") == 0)
3536     BUILD_CHARCLASS_LOOP (isprint);
3537   else if (strcmp (name, "upper") == 0)
3538     BUILD_CHARCLASS_LOOP (isupper);
3539   else if (strcmp (name, "blank") == 0)
3540     BUILD_CHARCLASS_LOOP (isblank);
3541   else if (strcmp (name, "graph") == 0)
3542     BUILD_CHARCLASS_LOOP (isgraph);
3543   else if (strcmp (name, "punct") == 0)
3544     BUILD_CHARCLASS_LOOP (ispunct);
3545   else if (strcmp (name, "xdigit") == 0)
3546     BUILD_CHARCLASS_LOOP (isxdigit);
3547   else
3548     return REG_ECTYPE;
3549 
3550   return REG_NOERROR;
3551 }
3552 
3553 static bin_tree_t *
build_charclass_op(re_dfa_t * dfa,RE_TRANSLATE_TYPE trans,const unsigned char * class_name,const unsigned char * extra,bool non_match,reg_errcode_t * err)3554 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3555 		    const unsigned char *class_name,
3556 		    const unsigned char *extra, bool non_match,
3557 		    reg_errcode_t *err)
3558 {
3559   re_bitset_ptr_t sbcset;
3560 #ifdef RE_ENABLE_I18N
3561   re_charset_t *mbcset;
3562   Idx alloc = 0;
3563 #endif /* not RE_ENABLE_I18N */
3564   reg_errcode_t ret;
3565   re_token_t br_token;
3566   bin_tree_t *tree;
3567 
3568   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3569 #ifdef RE_ENABLE_I18N
3570   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3571 #endif /* RE_ENABLE_I18N */
3572 
3573 #ifdef RE_ENABLE_I18N
3574   if (BE (sbcset == NULL || mbcset == NULL, 0))
3575 #else /* not RE_ENABLE_I18N */
3576   if (BE (sbcset == NULL, 0))
3577 #endif /* not RE_ENABLE_I18N */
3578     {
3579       *err = REG_ESPACE;
3580       return NULL;
3581     }
3582 
3583   if (non_match)
3584     {
3585 #ifdef RE_ENABLE_I18N
3586       mbcset->non_match = 1;
3587 #endif /* not RE_ENABLE_I18N */
3588     }
3589 
3590   /* We don't care the syntax in this case.  */
3591   ret = build_charclass (trans, sbcset,
3592 #ifdef RE_ENABLE_I18N
3593 			 mbcset, &alloc,
3594 #endif /* RE_ENABLE_I18N */
3595 			 class_name, 0);
3596 
3597   if (BE (ret != REG_NOERROR, 0))
3598     {
3599       re_free (sbcset);
3600 #ifdef RE_ENABLE_I18N
3601       free_charset (mbcset);
3602 #endif /* RE_ENABLE_I18N */
3603       *err = ret;
3604       return NULL;
3605     }
3606   /* \w match '_' also.  */
3607   for (; *extra; extra++)
3608     bitset_set (sbcset, *extra);
3609 
3610   /* If it is non-matching list.  */
3611   if (non_match)
3612     bitset_not (sbcset);
3613 
3614 #ifdef RE_ENABLE_I18N
3615   /* Ensure only single byte characters are set.  */
3616   if (dfa->mb_cur_max > 1)
3617     bitset_mask (sbcset, dfa->sb_char);
3618 #endif
3619 
3620   /* Build a tree for simple bracket.  */
3621   br_token.type = SIMPLE_BRACKET;
3622   br_token.opr.sbcset = sbcset;
3623   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3624   if (BE (tree == NULL, 0))
3625     goto build_word_op_espace;
3626 
3627 #ifdef RE_ENABLE_I18N
3628   if (dfa->mb_cur_max > 1)
3629     {
3630       bin_tree_t *mbc_tree;
3631       /* Build a tree for complex bracket.  */
3632       br_token.type = COMPLEX_BRACKET;
3633       br_token.opr.mbcset = mbcset;
3634       dfa->has_mb_node = 1;
3635       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3636       if (BE (mbc_tree == NULL, 0))
3637 	goto build_word_op_espace;
3638       /* Then join them by ALT node.  */
3639       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3640       if (BE (mbc_tree != NULL, 1))
3641 	return tree;
3642     }
3643   else
3644     {
3645       free_charset (mbcset);
3646       return tree;
3647     }
3648 #else /* not RE_ENABLE_I18N */
3649   return tree;
3650 #endif /* not RE_ENABLE_I18N */
3651 
3652  build_word_op_espace:
3653   re_free (sbcset);
3654 #ifdef RE_ENABLE_I18N
3655   free_charset (mbcset);
3656 #endif /* RE_ENABLE_I18N */
3657   *err = REG_ESPACE;
3658   return NULL;
3659 }
3660 
3661 /* This is intended for the expressions like "a{1,3}".
3662    Fetch a number from `input', and return the number.
3663    Return REG_MISSING if the number field is empty like "{,1}".
3664    Return REG_ERROR if an error occurred.  */
3665 
3666 static Idx
fetch_number(re_string_t * input,re_token_t * token,reg_syntax_t syntax)3667 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3668 {
3669   Idx num = REG_MISSING;
3670   unsigned char c;
3671   while (1)
3672     {
3673       fetch_token (token, input, syntax);
3674       c = token->opr.c;
3675       if (BE (token->type == END_OF_RE, 0))
3676 	return REG_ERROR;
3677       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3678 	break;
3679       num = ((token->type != CHARACTER || c < '0' || '9' < c
3680 	      || num == REG_ERROR)
3681 	     ? REG_ERROR
3682 	     : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3683       num = (num > RE_DUP_MAX) ? REG_ERROR : num;
3684     }
3685   return num;
3686 }
3687 
3688 #ifdef RE_ENABLE_I18N
3689 static void
free_charset(re_charset_t * cset)3690 free_charset (re_charset_t *cset)
3691 {
3692   re_free (cset->mbchars);
3693 # ifdef _LIBC
3694   re_free (cset->coll_syms);
3695   re_free (cset->equiv_classes);
3696   re_free (cset->range_starts);
3697   re_free (cset->range_ends);
3698 # endif
3699   re_free (cset->char_classes);
3700   re_free (cset);
3701 }
3702 #endif /* RE_ENABLE_I18N */
3703 
3704 /* Functions for binary tree operation.  */
3705 
3706 /* Create a tree node.  */
3707 
3708 static bin_tree_t *
create_tree(re_dfa_t * dfa,bin_tree_t * left,bin_tree_t * right,re_token_type_t type)3709 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3710 	     re_token_type_t type)
3711 {
3712   re_token_t t;
3713   t.type = type;
3714   return create_token_tree (dfa, left, right, &t);
3715 }
3716 
3717 static bin_tree_t *
create_token_tree(re_dfa_t * dfa,bin_tree_t * left,bin_tree_t * right,const re_token_t * token)3718 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3719 		   const re_token_t *token)
3720 {
3721   bin_tree_t *tree;
3722   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3723     {
3724       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3725 
3726       if (storage == NULL)
3727 	return NULL;
3728       storage->next = dfa->str_tree_storage;
3729       dfa->str_tree_storage = storage;
3730       dfa->str_tree_storage_idx = 0;
3731     }
3732   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3733 
3734   tree->parent = NULL;
3735   tree->left = left;
3736   tree->right = right;
3737   tree->token = *token;
3738   tree->token.duplicated = 0;
3739   tree->token.opt_subexp = 0;
3740   tree->first = NULL;
3741   tree->next = NULL;
3742   tree->node_idx = REG_MISSING;
3743 
3744   if (left != NULL)
3745     left->parent = tree;
3746   if (right != NULL)
3747     right->parent = tree;
3748   return tree;
3749 }
3750 
3751 /* Mark the tree SRC as an optional subexpression.
3752    To be called from preorder or postorder.  */
3753 
3754 static reg_errcode_t
mark_opt_subexp(void * extra,bin_tree_t * node)3755 mark_opt_subexp (void *extra, bin_tree_t *node)
3756 {
3757   Idx idx = (Idx) (long) extra;
3758   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3759     node->token.opt_subexp = 1;
3760 
3761   return REG_NOERROR;
3762 }
3763 
3764 /* Free the allocated memory inside NODE. */
3765 
3766 static void
free_token(re_token_t * node)3767 free_token (re_token_t *node)
3768 {
3769 #ifdef RE_ENABLE_I18N
3770   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3771     free_charset (node->opr.mbcset);
3772   else
3773 #endif /* RE_ENABLE_I18N */
3774     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3775       re_free (node->opr.sbcset);
3776 }
3777 
3778 /* Worker function for tree walking.  Free the allocated memory inside NODE
3779    and its children. */
3780 
3781 static reg_errcode_t
free_tree(void * extra,bin_tree_t * node)3782 free_tree (void *extra, bin_tree_t *node)
3783 {
3784   free_token (&node->token);
3785   return REG_NOERROR;
3786 }
3787 
3788 
3789 /* Duplicate the node SRC, and return new node.  This is a preorder
3790    visit similar to the one implemented by the generic visitor, but
3791    we need more infrastructure to maintain two parallel trees --- so,
3792    it's easier to duplicate.  */
3793 
3794 static bin_tree_t *
duplicate_tree(const bin_tree_t * root,re_dfa_t * dfa)3795 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3796 {
3797   const bin_tree_t *node;
3798   bin_tree_t *dup_root;
3799   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3800 
3801   for (node = root; ; )
3802     {
3803       /* Create a new tree and link it back to the current parent.  */
3804       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3805       if (*p_new == NULL)
3806 	return NULL;
3807       (*p_new)->parent = dup_node;
3808       (*p_new)->token.duplicated = 1;
3809       dup_node = *p_new;
3810 
3811       /* Go to the left node, or up and to the right.  */
3812       if (node->left)
3813 	{
3814 	  node = node->left;
3815 	  p_new = &dup_node->left;
3816 	}
3817       else
3818 	{
3819 	  const bin_tree_t *prev = NULL;
3820 	  while (node->right == prev || node->right == NULL)
3821 	    {
3822 	      prev = node;
3823 	      node = node->parent;
3824 	      dup_node = dup_node->parent;
3825 	      if (!node)
3826 	        return dup_root;
3827 	    }
3828 	  node = node->right;
3829 	  p_new = &dup_node->right;
3830 	}
3831     }
3832 }
3833