xref: /netbsd-src/external/bsd/tre/dist/lib/tre-parse.c (revision f6cec12c6830c04a37196b5b91dd80d83b170e5e)
1 /*
2   tre-parse.c - Regexp parser
3 
4   This software is released under a BSD-style license.
5   See the file LICENSE for details and copyright.
6 
7 */
8 
9 /*
10   This parser is just a simple recursive descent parser for POSIX.2
11   regexps.  The parser supports both the obsolete default syntax and
12   the "extended" syntax, and some nonstandard extensions.
13 */
14 
15 
16 #ifdef HAVE_CONFIG_H
17 #include <config.h>
18 #endif /* HAVE_CONFIG_H */
19 #include <string.h>
20 #include <assert.h>
21 #include <limits.h>
22 
23 #include "xmalloc.h"
24 #include "tre-mem.h"
25 #include "tre-ast.h"
26 #include "tre-stack.h"
27 #include "tre-parse.h"
28 
29 
30 /* Characters with special meanings in regexp syntax. */
31 #define CHAR_PIPE	   L'|'
32 #define CHAR_LPAREN	   L'('
33 #define CHAR_RPAREN	   L')'
34 #define CHAR_LBRACE	   L'{'
35 #define CHAR_RBRACE	   L'}'
36 #define CHAR_LBRACKET	   L'['
37 #define CHAR_RBRACKET	   L']'
38 #define CHAR_MINUS	   L'-'
39 #define CHAR_STAR	   L'*'
40 #define CHAR_QUESTIONMARK  L'?'
41 #define CHAR_PLUS	   L'+'
42 #define CHAR_PERIOD	   L'.'
43 #define CHAR_COLON	   L':'
44 #define CHAR_EQUAL	   L'='
45 #define CHAR_COMMA	   L','
46 #define CHAR_CARET	   L'^'
47 #define CHAR_DOLLAR	   L'$'
48 #define CHAR_BACKSLASH	   L'\\'
49 #define CHAR_HASH	   L'#'
50 #define CHAR_TILDE	   L'~'
51 
52 
53 /* Some macros for expanding \w, \s, etc. */
54 static const struct tre_macro_struct {
55   const char c;
56   const char *expansion;
57 } tre_macros[] =
58   { {'t', "\t"},	   {'n', "\n"},		   {'r', "\r"},
59     {'f', "\f"},	   {'a', "\a"},		   {'e', "\033"},
60     {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},
61     {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"},   {'D', "[^[:digit:]]"},
62     { 0, NULL }
63   };
64 
65 
66 /* Expands a macro delimited by `regex' and `regex_end' to `buf', which
67    must have at least `len' items.  Sets buf[0] to zero if the there
68    is no match in `tre_macros'. */
69 static void
tre_expand_macro(const tre_char_t * regex,const tre_char_t * regex_end,tre_char_t * buf,size_t buf_len)70 tre_expand_macro(const tre_char_t *regex, const tre_char_t *regex_end,
71 		 tre_char_t *buf, size_t buf_len)
72 {
73   int i;
74 
75   buf[0] = 0;
76   if (regex >= regex_end)
77     return;
78 
79   for (i = 0; tre_macros[i].expansion; i++)
80     {
81       if (tre_macros[i].c == *regex)
82 	{
83 	  unsigned int j;
84 	  DPRINT(("Expanding macro '%c' => '%s'\n",
85 		  tre_macros[i].c, tre_macros[i].expansion));
86 	  for (j = 0; tre_macros[i].expansion[j] && j < buf_len; j++)
87 	    buf[j] = tre_macros[i].expansion[j];
88 	  buf[j] = 0;
89 	  break;
90 	}
91     }
92 }
93 
94 static reg_errcode_t
tre_new_item(tre_mem_t mem,int min,int max,int * i,int * max_i,tre_ast_node_t *** items)95 tre_new_item(tre_mem_t mem, int min, int max, int *i, int *max_i,
96 	 tre_ast_node_t ***items)
97 {
98   reg_errcode_t status;
99   tre_ast_node_t **array = *items;
100   /* Allocate more space if necessary. */
101   if (*i >= *max_i)
102     {
103       tre_ast_node_t **new_items;
104       DPRINT(("out of array space, i = %d\n", *i));
105       /* If the array is already 1024 items large, give up -- there's
106 	 probably an error in the regexp (e.g. not a '\0' terminated
107 	 string and missing ']') */
108       if (*max_i > 1024)
109 	return REG_ESPACE;
110       *max_i *= 2;
111       new_items = xrealloc(array, sizeof(*array) * *max_i);
112       if (new_items == NULL)
113 	return REG_ESPACE;
114       *items = array = new_items;
115     }
116   array[*i] = tre_ast_new_literal(mem, min, max, -1);
117   status = array[*i] == NULL ? REG_ESPACE : REG_OK;
118   (*i)++;
119   return status;
120 }
121 
122 
123 /* Expands a character class to character ranges. */
124 static reg_errcode_t
tre_expand_ctype(tre_mem_t mem,tre_ctype_t class,tre_ast_node_t *** items,int * i,int * max_i,int cflags)125 tre_expand_ctype(tre_mem_t mem, tre_ctype_t class, tre_ast_node_t ***items,
126 		 int *i, int *max_i, int cflags)
127 {
128   reg_errcode_t status = REG_OK;
129   tre_cint_t c;
130   int j, min = -1, max = 0;
131   assert(TRE_MB_CUR_MAX == 1);
132 
133   DPRINT(("  expanding class to character ranges\n"));
134   for (j = 0; (j < 256) && (status == REG_OK); j++)
135     {
136       c = (tre_cint_t) j;
137       if (tre_isctype(c, class)
138 	  || ((cflags & REG_ICASE)
139 	      && (tre_isctype(tre_tolower(c), class)
140 		  || tre_isctype(tre_toupper(c), class))))
141 {
142 	  if (min < 0)
143 	    min = c;
144 	  max = c;
145 	}
146       else if (min >= 0)
147 	{
148 	  DPRINT(("  range %c (%d) to %c (%d)\n", min, min, max, max));
149 	  status = tre_new_item(mem, min, max, i, max_i, items);
150 	  min = -1;
151 	}
152     }
153   if (min >= 0 && status == REG_OK)
154     status = tre_new_item(mem, min, max, i, max_i, items);
155   return status;
156 }
157 
158 
159 static int
tre_compare_items(const void * a,const void * b)160 tre_compare_items(const void *a, const void *b)
161 {
162   const tre_ast_node_t *node_a = *(tre_ast_node_t * const *)a;
163   const tre_ast_node_t *node_b = *(tre_ast_node_t * const *)b;
164   tre_literal_t *l_a = node_a->obj, *l_b = node_b->obj;
165   int a_min = l_a->code_min, b_min = l_b->code_min;
166 
167   if (a_min < b_min)
168     return -1;
169   else if (a_min > b_min)
170     return 1;
171   else
172     return 0;
173 }
174 
175 #ifndef TRE_USE_SYSTEM_WCTYPE
176 
177 int tre_isalnum_func(tre_cint_t);
178 int tre_isalpha_func(tre_cint_t);
179 int tre_isascii_func(tre_cint_t);
180 int tre_isblank_func(tre_cint_t);
181 int tre_iscntrl_func(tre_cint_t);
182 int tre_isdigit_func(tre_cint_t);
183 int tre_isgraph_func(tre_cint_t);
184 int tre_islower_func(tre_cint_t);
185 int tre_isprint_func(tre_cint_t);
186 int tre_ispunct_func(tre_cint_t);
187 int tre_isspace_func(tre_cint_t);
188 int tre_isupper_func(tre_cint_t);
189 int tre_isxdigit_func(tre_cint_t);
190 
191 /* isalnum() and the rest may be macros, so wrap them to functions. */
tre_isalnum_func(tre_cint_t c)192 int tre_isalnum_func(tre_cint_t c) { return tre_isalnum(c); }
tre_isalpha_func(tre_cint_t c)193 int tre_isalpha_func(tre_cint_t c) { return tre_isalpha(c); }
194 
195 #ifdef tre_isascii
tre_isascii_func(tre_cint_t c)196 int tre_isascii_func(tre_cint_t c) { return tre_isascii(c); }
197 #else /* !tre_isascii */
tre_isascii_func(tre_cint_t c)198 int tre_isascii_func(tre_cint_t c) { return !(c >> 7); }
199 #endif /* !tre_isascii */
200 
201 #ifdef tre_isblank
tre_isblank_func(tre_cint_t c)202 int tre_isblank_func(tre_cint_t c) { return tre_isblank(c); }
203 #else /* !tre_isblank */
tre_isblank_func(tre_cint_t c)204 int tre_isblank_func(tre_cint_t c) { return ((c == ' ') || (c == '\t')); }
205 #endif /* !tre_isblank */
206 
tre_iscntrl_func(tre_cint_t c)207 int tre_iscntrl_func(tre_cint_t c) { return tre_iscntrl(c); }
tre_isdigit_func(tre_cint_t c)208 int tre_isdigit_func(tre_cint_t c) { return tre_isdigit(c); }
tre_isgraph_func(tre_cint_t c)209 int tre_isgraph_func(tre_cint_t c) { return tre_isgraph(c); }
tre_islower_func(tre_cint_t c)210 int tre_islower_func(tre_cint_t c) { return tre_islower(c); }
tre_isprint_func(tre_cint_t c)211 int tre_isprint_func(tre_cint_t c) { return tre_isprint(c); }
tre_ispunct_func(tre_cint_t c)212 int tre_ispunct_func(tre_cint_t c) { return tre_ispunct(c); }
tre_isspace_func(tre_cint_t c)213 int tre_isspace_func(tre_cint_t c) { return tre_isspace(c); }
tre_isupper_func(tre_cint_t c)214 int tre_isupper_func(tre_cint_t c) { return tre_isupper(c); }
tre_isxdigit_func(tre_cint_t c)215 int tre_isxdigit_func(tre_cint_t c) { return tre_isxdigit(c); }
216 
217 struct {
218   const char *name;
219   int (*func)(tre_cint_t);
220 } tre_ctype_map[] = {
221   { "alnum", &tre_isalnum_func },
222   { "alpha", &tre_isalpha_func },
223 #ifdef tre_isascii
224   { "ascii", &tre_isascii_func },
225 #endif /* tre_isascii */
226 #ifdef tre_isblank
227   { "blank", &tre_isblank_func },
228 #endif /* tre_isblank */
229   { "cntrl", &tre_iscntrl_func },
230   { "digit", &tre_isdigit_func },
231   { "graph", &tre_isgraph_func },
232   { "lower", &tre_islower_func },
233   { "print", &tre_isprint_func },
234   { "punct", &tre_ispunct_func },
235   { "space", &tre_isspace_func },
236   { "upper", &tre_isupper_func },
237   { "xdigit", &tre_isxdigit_func },
238   { NULL, NULL}
239 };
240 
tre_ctype(const char * name)241 tre_ctype_t tre_ctype(const char *name)
242 {
243   int i;
244   for (i = 0; tre_ctype_map[i].name != NULL; i++)
245     {
246       if (strcmp(name, tre_ctype_map[i].name) == 0)
247 	return tre_ctype_map[i].func;
248     }
249   return (tre_ctype_t)0;
250 }
251 #endif /* !TRE_USE_SYSTEM_WCTYPE */
252 
253 /* Maximum number of character classes that can occur in a negated bracket
254    expression.	*/
255 #define MAX_NEG_CLASSES 64
256 
257 /* Maximum length of character class names. */
258 #define MAX_CLASS_NAME
259 
260 #define REST(re) (int)(ctx->re_end - (re)), (re)
261 
262 static reg_errcode_t
tre_parse_bracket_items(tre_parse_ctx_t * ctx,int negate,tre_ctype_t neg_classes[],int * num_neg_classes,tre_ast_node_t *** items,int * num_items,int * items_size)263 tre_parse_bracket_items(tre_parse_ctx_t *ctx, int negate,
264 			tre_ctype_t neg_classes[], int *num_neg_classes,
265 			tre_ast_node_t ***items, int *num_items,
266 			int *items_size)
267 {
268   const tre_char_t *re = ctx->re;
269   reg_errcode_t status = REG_OK;
270   tre_ctype_t class = (tre_ctype_t)0;
271   int i = *num_items;
272   int max_i = *items_size;
273   int skip;
274 
275   /* Build an array of the items in the bracket expression. */
276   while (status == REG_OK)
277     {
278       skip = 0;
279       if (re == ctx->re_end)
280 	{
281 	  status = REG_EBRACK;
282 	}
283       else if (*re == CHAR_RBRACKET && re > ctx->re)
284 	{
285 	  DPRINT(("tre_parse_bracket:	done: '%.*" STRF "'\n", REST(re)));
286 	  re++;
287 	  break;
288 	}
289       else
290 	{
291 	  tre_cint_t min = 0, max = 0;
292 
293 	  class = (tre_ctype_t)0;
294 	  if (re + 2 < ctx->re_end
295 	      && *(re + 1) == CHAR_MINUS && *(re + 2) != CHAR_RBRACKET)
296 	    {
297 	      DPRINT(("tre_parse_bracket:  range: '%.*" STRF "'\n", REST(re)));
298 	      min = *re;
299 	      max = *(re + 2);
300 	      re += 3;
301 	      /* XXX - Should use collation order instead of encoding values
302 		 in character ranges. */
303 	      if (min > max)
304 		status = REG_ERANGE;
305 	    }
306 	  else if (re + 1 < ctx->re_end
307 		   && *re == CHAR_LBRACKET && *(re + 1) == CHAR_PERIOD)
308 	    status = REG_ECOLLATE;
309 	  else if (re + 1 < ctx->re_end
310 		   && *re == CHAR_LBRACKET && *(re + 1) == CHAR_EQUAL)
311 	    status = REG_ECOLLATE;
312 	  else if (re + 1 < ctx->re_end
313 		   && *re == CHAR_LBRACKET && *(re + 1) == CHAR_COLON)
314 	    {
315 	      char tmp_str[64];
316 	      const tre_char_t *endptr = re + 2;
317 	      size_t len;
318 	      DPRINT(("tre_parse_bracket:  class: '%.*" STRF "'\n", REST(re)));
319 	      while (endptr < ctx->re_end && *endptr != CHAR_COLON)
320 		endptr++;
321 	      if (endptr != ctx->re_end)
322 		{
323 		  len = MIN(endptr - re - 2, 63);
324 #ifdef TRE_WCHAR
325 		  {
326 		    tre_char_t tmp_wcs[64];
327 		    wcsncpy(tmp_wcs, re + 2, (size_t)len);
328 		    tmp_wcs[len] = L'\0';
329 #if defined HAVE_WCSRTOMBS
330 		    {
331 		      mbstate_t state;
332 		      const tre_char_t *src = tmp_wcs;
333 		      memset(&state, '\0', sizeof(state));
334 		      len = wcsrtombs(tmp_str, &src, sizeof(tmp_str), &state);
335 		    }
336 #elif defined HAVE_WCSTOMBS
337 		    len = wcstombs(tmp_str, tmp_wcs, 63);
338 #endif /* defined HAVE_WCSTOMBS */
339 		  }
340 #else /* !TRE_WCHAR */
341 		  /* LINTED */strncpy(tmp_str, (const char*)re + 2, len);
342 #endif /* !TRE_WCHAR */
343 		  tmp_str[len] = '\0';
344 		  DPRINT(("  class name: %s\n", tmp_str));
345 		  class = tre_ctype(tmp_str);
346 		  if (!class)
347 		    status = REG_ECTYPE;
348 		  /* Optimize character classes for 8 bit character sets. */
349 		  if (status == REG_OK && ctx->cur_max == 1)
350 		    {
351 		      status = tre_expand_ctype(ctx->mem, class, items,
352 						&i, &max_i, ctx->cflags);
353 		      class = (tre_ctype_t)0;
354 		      skip = 1;
355 		    }
356 		  re = endptr + 2;
357 		}
358 	      else
359 		status = REG_ECTYPE;
360 	      min = 0;
361 	      max = TRE_CHAR_MAX;
362 	    }
363 	  else
364 	    {
365 	      DPRINT(("tre_parse_bracket:   char: '%.*" STRF "'\n", REST(re)));
366 	      if (*re == CHAR_MINUS && *(re + 1) != CHAR_RBRACKET
367 		  && ctx->re != re)
368 		/* Two ranges are not allowed to share and endpoint. */
369 		status = REG_ERANGE;
370 	      min = max = *re++;
371 	    }
372 
373 	  if (status != REG_OK)
374 	    break;
375 
376 	  if (class && negate)
377 	    if (*num_neg_classes >= MAX_NEG_CLASSES)
378 	      status = REG_ESPACE;
379 	    else
380 	      neg_classes[(*num_neg_classes)++] = class;
381 	  else if (!skip)
382 	    {
383 	      status = tre_new_item(ctx->mem, min, max, &i, &max_i, items);
384 	      if (status != REG_OK)
385 		break;
386 	      ((tre_literal_t*)((*items)[i-1])->obj)->u.class = class;
387 	    }
388 
389 	  /* Add opposite-case counterpoints if REG_ICASE is present.
390 	     This is broken if there are more than two "same" characters. */
391 	  if (ctx->cflags & REG_ICASE && !class && status == REG_OK && !skip)
392 	    {
393 	      tre_cint_t cmin, ccurr;
394 
395 	      DPRINT(("adding opposite-case counterpoints\n"));
396 	      while (min <= max)
397 		{
398 		  if (tre_islower(min))
399 		    {
400 		      cmin = ccurr = tre_toupper(min++);
401 		      while (tre_islower(min) && tre_toupper(min) == ccurr + 1
402 			     && min <= max)
403 			ccurr = tre_toupper(min++);
404 		      status = tre_new_item(ctx->mem, cmin, ccurr,
405 					    &i, &max_i, items);
406 		    }
407 		  else if (tre_isupper(min))
408 		    {
409 		      cmin = ccurr = tre_tolower(min++);
410 		      while (tre_isupper(min) && tre_tolower(min) == ccurr + 1
411 			     && min <= max)
412 			ccurr = tre_tolower(min++);
413 		      status = tre_new_item(ctx->mem, cmin, ccurr,
414 					    &i, &max_i, items);
415 		    }
416 		  else min++;
417 		  if (status != REG_OK)
418 		    break;
419 		}
420 	      if (status != REG_OK)
421 		break;
422 	    }
423 	}
424     }
425   *num_items = i;
426   *items_size = max_i;
427   ctx->re = re;
428   return status;
429 }
430 
431 static reg_errcode_t
tre_parse_bracket(tre_parse_ctx_t * ctx,tre_ast_node_t ** result)432 tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
433 {
434   tre_ast_node_t *node = NULL;
435   int negate = 0;
436   reg_errcode_t status = REG_OK;
437   tre_ast_node_t **items, *u, *n;
438   int i = 0, j, max_i = 32, curr_max, curr_min;
439   tre_ctype_t neg_classes[MAX_NEG_CLASSES];
440   int num_neg_classes = 0;
441 
442   /* Start off with an array of `max_i' elements. */
443   items = xmalloc(sizeof(*items) * max_i);
444   if (items == NULL)
445     return REG_ESPACE;
446 
447   if (*ctx->re == CHAR_CARET)
448     {
449       DPRINT(("tre_parse_bracket: negate: '%.*" STRF "'\n", REST(ctx->re)));
450       negate = 1;
451       ctx->re++;
452     }
453 
454   status = tre_parse_bracket_items(ctx, negate, neg_classes, &num_neg_classes,
455 				   &items, &i, &max_i);
456 
457   if (status != REG_OK)
458     goto parse_bracket_done;
459 
460   /* Sort the array if we need to negate it. */
461   if (negate)
462     qsort(items, (unsigned)i, sizeof(*items), tre_compare_items);
463 
464   curr_max = curr_min = 0;
465   /* Build a union of the items in the array, negated if necessary. */
466   for (j = 0; j < i && status == REG_OK; j++)
467     {
468       int min, max;
469       tre_literal_t *l = items[j]->obj;
470       min = l->code_min;
471       max = l->code_max;
472 
473       DPRINT(("item: %d - %d, class %ld, curr_max = %d\n",
474 	      (int)l->code_min, (int)l->code_max, (long)l->u.class, curr_max));
475 
476       if (negate)
477 	{
478 	  if (min < curr_max)
479 	    {
480 	      /* Overlap. */
481 	      curr_max = MAX(max + 1, curr_max);
482 	      DPRINT(("overlap, curr_max = %d\n", curr_max));
483 	      l = NULL;
484 	    }
485 	  else
486 	    {
487 	      /* No overlap. */
488 	      curr_max = min - 1;
489 	      if (curr_max >= curr_min)
490 		{
491 		  DPRINT(("no overlap\n"));
492 		  l->code_min = curr_min;
493 		  l->code_max = curr_max;
494 		}
495 	      else
496 		{
497 		  DPRINT(("no overlap, zero room\n"));
498 		  l = NULL;
499 		}
500 	      curr_min = curr_max = max + 1;
501 	    }
502 	}
503 
504       if (l != NULL)
505 	{
506 	  int k;
507 	  DPRINT(("creating %d - %d\n", (int)l->code_min, (int)l->code_max));
508 	  l->position = ctx->position;
509 	  if (num_neg_classes > 0)
510 	    {
511 	      l->neg_classes = tre_mem_alloc(ctx->mem,
512 					     (sizeof(*l->neg_classes)
513 					      * (num_neg_classes + 1)));
514 	      if (l->neg_classes == NULL)
515 		{
516 		  status = REG_ESPACE;
517 		  break;
518 		}
519 	      for (k = 0; k < num_neg_classes; k++)
520 		l->neg_classes[k] = neg_classes[k];
521 	      l->neg_classes[k] = (tre_ctype_t)0;
522 	    }
523 	  else
524 	    l->neg_classes = NULL;
525 	  if (node == NULL)
526 	    node = items[j];
527 	  else
528 	    {
529 	      u = tre_ast_new_union(ctx->mem, node, items[j]);
530 	      if (u == NULL)
531 		status = REG_ESPACE;
532 	      node = u;
533 	    }
534 	}
535     }
536 
537   if (status != REG_OK)
538     goto parse_bracket_done;
539 
540   if (negate)
541     {
542       int k;
543       DPRINT(("final: creating %d - %d\n", curr_min, (int)TRE_CHAR_MAX));
544       n = tre_ast_new_literal(ctx->mem, curr_min, TRE_CHAR_MAX, ctx->position);
545       if (n == NULL)
546 	status = REG_ESPACE;
547       else
548 	{
549 	  tre_literal_t *l = n->obj;
550 	  if (num_neg_classes > 0)
551 	    {
552 	      l->neg_classes = tre_mem_alloc(ctx->mem,
553 					     (sizeof(*l->neg_classes)
554 					      * (num_neg_classes + 1)));
555 	      if (l->neg_classes == NULL)
556 		{
557 		  status = REG_ESPACE;
558 		  goto parse_bracket_done;
559 		}
560 	      for (k = 0; k < num_neg_classes; k++)
561 		l->neg_classes[k] = neg_classes[k];
562 	      l->neg_classes[k] = (tre_ctype_t)0;
563 	    }
564 	  else
565 	    l->neg_classes = NULL;
566 	  if (node == NULL)
567 	    node = n;
568 	  else
569 	    {
570 	      u = tre_ast_new_union(ctx->mem, node, n);
571 	      if (u == NULL)
572 		status = REG_ESPACE;
573 	      node = u;
574 	    }
575 	}
576     }
577 
578   if (status != REG_OK)
579     goto parse_bracket_done;
580 
581 #ifdef TRE_DEBUG
582   tre_ast_print(node);
583 #endif /* TRE_DEBUG */
584 
585  parse_bracket_done:
586   xfree(items);
587   ctx->position++;
588   *result = node;
589   return status;
590 }
591 
592 
593 /* Parses a positive decimal integer.  Returns -1 if the string does not
594    contain a valid number. */
595 static int
tre_parse_int(const tre_char_t ** regex,const tre_char_t * regex_end)596 tre_parse_int(const tre_char_t **regex, const tre_char_t *regex_end)
597 {
598   int num = -1;
599   const tre_char_t *r = *regex;
600   while (r < regex_end && *r >= L'0' && *r <= L'9')
601     {
602       if (num < 0)
603 	num = 0;
604       num = num * 10 + *r - L'0';
605       r++;
606     }
607   *regex = r;
608   return num;
609 }
610 
611 
612 static reg_errcode_t
tre_parse_bound(tre_parse_ctx_t * ctx,tre_ast_node_t ** result)613 tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
614 {
615   int min, max, i;
616   int cost_ins, cost_del, cost_subst, cost_max;
617   int limit_ins, limit_del, limit_subst, limit_err;
618   const tre_char_t *r = ctx->re;
619   const tre_char_t *start;
620   int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0;
621   int approx = 0;
622   int costs_set = 0;
623   int counts_set = 0;
624 
625   cost_ins = cost_del = cost_subst = cost_max = TRE_PARAM_UNSET;
626   limit_ins = limit_del = limit_subst = limit_err = TRE_PARAM_UNSET;
627 
628   /* Parse number (minimum repetition count). */
629   min = -1;
630   if (r < ctx->re_end && *r >= L'0' && *r <= L'9') {
631     DPRINT(("tre_parse:	  min count: '%.*" STRF "'\n", REST(r)));
632     min = tre_parse_int(&r, ctx->re_end);
633   }
634 
635   /* Parse comma and second number (maximum repetition count). */
636   max = min;
637   if (r < ctx->re_end && *r == CHAR_COMMA)
638     {
639       r++;
640       DPRINT(("tre_parse:   max count: '%.*" STRF "'\n", REST(r)));
641       max = tre_parse_int(&r, ctx->re_end);
642     }
643 
644   /* Check that the repeat counts are sane. */
645   if ((max >= 0 && min > max) || max > RE_DUP_MAX)
646     return REG_BADBR;
647 
648 
649   /*
650    '{'
651      optionally followed immediately by a number == minimum repcount
652      optionally followed by , then a number == maximum repcount
653       + then a number == maximum insertion count
654       - then a number == maximum deletion count
655       # then a number == maximum substitution count
656       ~ then a number == maximum number of errors
657       Any of +, -, # or ~ without followed by a number means that
658       the maximum count/number of errors is infinite.
659 
660       An equation of the form
661 	Xi + Yd + Zs < C
662       can be specified to set costs and the cost limit to a value
663       different from the default value:
664 	- X is the cost of an insertion
665 	- Y is the cost of a deletion
666 	- Z is the cost of a substitution
667 	- C is the maximum cost
668 
669       If no count limit or cost is set for an operation, the operation
670       is not allowed at all.
671   */
672 
673 
674   do {
675     int done;
676     start = r;
677 
678     /* Parse count limit settings */
679     done = 0;
680     if (!counts_set)
681       while (r + 1 < ctx->re_end && !done)
682 	{
683 	  switch (*r)
684 	    {
685 	    case CHAR_PLUS:  /* Insert limit */
686 	      DPRINT(("tre_parse:   ins limit: '%.*" STRF "'\n", REST(r)));
687 	      r++;
688 	      limit_ins = tre_parse_int(&r, ctx->re_end);
689 	      if (limit_ins < 0)
690 		limit_ins = INT_MAX;
691 	      counts_set = 1;
692 	      break;
693 	    case CHAR_MINUS: /* Delete limit */
694 	      DPRINT(("tre_parse:   del limit: '%.*" STRF "'\n", REST(r)));
695 	      r++;
696 	      limit_del = tre_parse_int(&r, ctx->re_end);
697 	      if (limit_del < 0)
698 		limit_del = INT_MAX;
699 	      counts_set = 1;
700 	      break;
701 	    case CHAR_HASH:  /* Substitute limit */
702 	      DPRINT(("tre_parse: subst limit: '%.*" STRF "'\n", REST(r)));
703 	      r++;
704 	      limit_subst = tre_parse_int(&r, ctx->re_end);
705 	      if (limit_subst < 0)
706 		limit_subst = INT_MAX;
707 	      counts_set = 1;
708 	      break;
709 	    case CHAR_TILDE: /* Maximum number of changes */
710 	      DPRINT(("tre_parse: count limit: '%.*" STRF "'\n", REST(r)));
711 	      r++;
712 	      limit_err = tre_parse_int(&r, ctx->re_end);
713 	      if (limit_err < 0)
714 		limit_err = INT_MAX;
715 	      approx = 1;
716 	      break;
717 	    case CHAR_COMMA:
718 	      r++;
719 	      break;
720 	    case L' ':
721 	      r++;
722 	      break;
723 	    case L'}':
724 	      done = 1;
725 	      break;
726 	    default:
727 	      done = 1;
728 	      break;
729 	    }
730 	}
731 
732     /* Parse cost restriction equation. */
733     done = 0;
734     if (!costs_set)
735       while (r + 1 < ctx->re_end && !done)
736 	{
737 	  switch (*r)
738 	    {
739 	    case CHAR_PLUS:
740 	    case L' ':
741 	      r++;
742 	      break;
743 	    case L'<':
744 	      DPRINT(("tre_parse:    max cost: '%.*" STRF "'\n", REST(r)));
745 	      r++;
746 	      while (*r == L' ')
747 		r++;
748 	      cost_max = tre_parse_int(&r, ctx->re_end);
749 	      if (cost_max < 0)
750 		cost_max = INT_MAX;
751 	      else
752 		cost_max--;
753 	      approx = 1;
754 	      break;
755 	    case CHAR_COMMA:
756 	      r++;
757 	      done = 1;
758 	      break;
759 	    default:
760 	      if (*r >= L'0' && *r <= L'9')
761 		{
762 #ifdef TRE_DEBUG
763 		  const tre_char_t *sr = r;
764 #endif /* TRE_DEBUG */
765 		  int cost = tre_parse_int(&r, ctx->re_end);
766 		  /* XXX - make sure r is not past end. */
767 		  switch (*r)
768 		    {
769 		    case L'i':	/* Insert cost */
770 		      DPRINT(("tre_parse:    ins cost: '%.*" STRF "'\n",
771 			      REST(sr)));
772 		      r++;
773 		      cost_ins = cost;
774 		      costs_set = 1;
775 		      break;
776 		    case L'd':	/* Delete cost */
777 		      DPRINT(("tre_parse:    del cost: '%.*" STRF "'\n",
778 			      REST(sr)));
779 		      r++;
780 		      cost_del = cost;
781 		      costs_set = 1;
782 		      break;
783 		    case L's':	/* Substitute cost */
784 		      DPRINT(("tre_parse:  subst cost: '%.*" STRF "'\n",
785 			      REST(sr)));
786 		      r++;
787 		      cost_subst = cost;
788 		      costs_set = 1;
789 		      break;
790 		    default:
791 		      return REG_BADBR;
792 		    }
793 		}
794 	      else
795 		{
796 		  done = 1;
797 		  break;
798 		}
799 	    }
800 	}
801   } while (start != r);
802 
803   /* Missing }. */
804   if (r >= ctx->re_end)
805     return REG_EBRACE;
806 
807   /* Empty contents of {}. */
808   if (r == ctx->re)
809     return REG_BADBR;
810 
811   /* Parse the ending '}' or '\}'.*/
812   if (ctx->cflags & REG_EXTENDED)
813     {
814       if (r >= ctx->re_end || *r != CHAR_RBRACE)
815 	return REG_BADBR;
816       r++;
817     }
818   else
819     {
820       if (r + 1 >= ctx->re_end
821 	  || *r != CHAR_BACKSLASH
822 	  || *(r + 1) != CHAR_RBRACE)
823 	return REG_BADBR;
824       r += 2;
825     }
826 
827 
828   /* Parse trailing '?' marking minimal repetition. */
829   if (r < ctx->re_end)
830     {
831       if (*r == CHAR_QUESTIONMARK)
832 	{
833 	  minimal = !(ctx->cflags & REG_UNGREEDY);
834 	  r++;
835 	}
836       else if (*r == CHAR_STAR || *r == CHAR_PLUS)
837 	{
838 	  /* These are reserved for future extensions. */
839 	  return REG_BADRPT;
840 	}
841     }
842 
843   /* Create the AST node(s). */
844   if (min == 0 && max == 0)
845     {
846       *result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
847       if (*result == NULL)
848 	return REG_ESPACE;
849     }
850   else
851     {
852       if (min < 0 && max < 0)
853 	/* Only approximate parameters set, no repetitions. */
854 	min = max = 1;
855 
856       *result = tre_ast_new_iter(ctx->mem, *result, min, max, minimal);
857       if (!*result)
858 	return REG_ESPACE;
859 
860       /* If approximate matching parameters are set, add them to the
861 	 iteration node. */
862       if (approx || costs_set || counts_set)
863 	{
864 	  int *params;
865 	  tre_iteration_t *iter = (*result)->obj;
866 
867 	  if (costs_set || counts_set)
868 	    {
869 	      if (limit_ins == TRE_PARAM_UNSET)
870 		{
871 		  if (cost_ins == TRE_PARAM_UNSET)
872 		    limit_ins = 0;
873 		  else
874 		    limit_ins = INT_MAX;
875 		}
876 
877 	      if (limit_del == TRE_PARAM_UNSET)
878 		{
879 		  if (cost_del == TRE_PARAM_UNSET)
880 		    limit_del = 0;
881 		  else
882 		    limit_del = INT_MAX;
883 		}
884 
885 	      if (limit_subst == TRE_PARAM_UNSET)
886 		{
887 		  if (cost_subst == TRE_PARAM_UNSET)
888 		    limit_subst = 0;
889 		  else
890 		    limit_subst = INT_MAX;
891 		}
892 	    }
893 
894 	  if (cost_max == TRE_PARAM_UNSET)
895 	    cost_max = INT_MAX;
896 	  if (limit_err == TRE_PARAM_UNSET)
897 	    limit_err = INT_MAX;
898 
899 	  ctx->have_approx = 1;
900 	  params = tre_mem_alloc(ctx->mem, sizeof(*params) * TRE_PARAM_LAST);
901 	  if (!params)
902 	    return REG_ESPACE;
903 	  for (i = 0; i < TRE_PARAM_LAST; i++)
904 	    params[i] = TRE_PARAM_UNSET;
905 	  params[TRE_PARAM_COST_INS] = cost_ins;
906 	  params[TRE_PARAM_COST_DEL] = cost_del;
907 	  params[TRE_PARAM_COST_SUBST] = cost_subst;
908 	  params[TRE_PARAM_COST_MAX] = cost_max;
909 	  params[TRE_PARAM_MAX_INS] = limit_ins;
910 	  params[TRE_PARAM_MAX_DEL] = limit_del;
911 	  params[TRE_PARAM_MAX_SUBST] = limit_subst;
912 	  params[TRE_PARAM_MAX_ERR] = limit_err;
913 	  iter->params = params;
914 	}
915     }
916 
917   DPRINT(("tre_parse_bound: min %d, max %d, costs [%d,%d,%d, total %d], "
918 	  "limits [%d,%d,%d, total %d]\n",
919 	  min, max, cost_ins, cost_del, cost_subst, cost_max,
920 	  limit_ins, limit_del, limit_subst, limit_err));
921 
922 
923   ctx->re = r;
924   return REG_OK;
925 }
926 
927 typedef enum {
928   PARSE_RE = 0,
929   PARSE_ATOM,
930   PARSE_MARK_FOR_SUBMATCH,
931   PARSE_BRANCH,
932   PARSE_PIECE,
933   PARSE_CATENATION,
934   PARSE_POST_CATENATION,
935   PARSE_UNION,
936   PARSE_POST_UNION,
937   PARSE_POSTFIX,
938   PARSE_RESTORE_CFLAGS
939 } tre_parse_re_stack_symbol_t;
940 
941 
942 reg_errcode_t
tre_parse(tre_parse_ctx_t * ctx)943 tre_parse(tre_parse_ctx_t *ctx)
944 {
945   tre_ast_node_t *result = NULL;
946   tre_parse_re_stack_symbol_t symbol;
947   reg_errcode_t status = REG_OK;
948   tre_stack_t *stack = ctx->stack;
949   int bottom = tre_stack_num_objects(stack);
950   int depth = 0;
951   int temporary_cflags = 0;
952 
953   DPRINT(("tre_parse: parsing '%.*" STRF "', len = %d\n",
954 	  ctx->len, ctx->re, ctx->len));
955 
956   if (!ctx->nofirstsub)
957     {
958       STACK_PUSH(stack, long, ctx->submatch_id);
959       STACK_PUSH(stack, long, PARSE_MARK_FOR_SUBMATCH);
960       ctx->submatch_id++;
961     }
962   STACK_PUSH(stack, long, PARSE_RE);
963   ctx->re_start = ctx->re;
964   ctx->re_end = ctx->re + ctx->len;
965 
966 
967   /* The following is basically just a recursive descent parser.  I use
968      an explicit stack instead of recursive functions mostly because of
969      two reasons: compatibility with systems which have an overflowable
970      call stack, and efficiency (both in lines of code and speed).  */
971   while (tre_stack_num_objects(stack) > bottom && status == REG_OK)
972     {
973       if (status != REG_OK)
974 	break;
975       symbol = tre_stack_pop_int(stack);
976       switch (symbol)
977 	{
978 	case PARSE_RE:
979 	  /* Parse a full regexp.  A regexp is one or more branches,
980 	     separated by the union operator `|'. */
981 #ifdef REG_LITERAL
982 	  if (!(ctx->cflags & REG_LITERAL)
983 	      && ctx->cflags & REG_EXTENDED)
984 #endif /* REG_LITERAL */
985 	    STACK_PUSHX(stack, long, PARSE_UNION);
986 	  STACK_PUSHX(stack, long, PARSE_BRANCH);
987 	  break;
988 
989 	case PARSE_BRANCH:
990 	  /* Parse a branch.  A branch is one or more pieces, concatenated.
991 	     A piece is an atom possibly followed by a postfix operator. */
992 	  STACK_PUSHX(stack, long, PARSE_CATENATION);
993 	  STACK_PUSHX(stack, long, PARSE_PIECE);
994 	  break;
995 
996 	case PARSE_PIECE:
997 	  /* Parse a piece.  A piece is an atom possibly followed by one
998 	     or more postfix operators. */
999 #ifdef REG_LITERAL
1000 	  if (!(ctx->cflags & REG_LITERAL))
1001 #endif /* REG_LITERAL */
1002 	    STACK_PUSHX(stack, long, PARSE_POSTFIX);
1003 	  STACK_PUSHX(stack, long, PARSE_ATOM);
1004 	  break;
1005 
1006 	case PARSE_CATENATION:
1007 	  /* If the expression has not ended, parse another piece. */
1008 	  {
1009 	    tre_char_t c;
1010 	    if (ctx->re >= ctx->re_end)
1011 	      break;
1012 	    c = *ctx->re;
1013 #ifdef REG_LITERAL
1014 	    if (!(ctx->cflags & REG_LITERAL))
1015 	      {
1016 #endif /* REG_LITERAL */
1017 		if (ctx->cflags & REG_EXTENDED && c == CHAR_PIPE)
1018 		  break;
1019 		if ((ctx->cflags & REG_EXTENDED
1020 		     && c == CHAR_RPAREN && depth > 0)
1021 		    || (!(ctx->cflags & REG_EXTENDED)
1022 			&& (c == CHAR_BACKSLASH
1023 			    && *(ctx->re + 1) == CHAR_RPAREN)))
1024 		  {
1025 		    if (!(ctx->cflags & REG_EXTENDED) && depth == 0)
1026 		      status = REG_EPAREN;
1027 		    DPRINT(("tre_parse:	  group end: '%.*" STRF "'\n",
1028 			    REST(ctx->re)));
1029 		    depth--;
1030 		    if (!(ctx->cflags & REG_EXTENDED))
1031 		      ctx->re += 2;
1032 		    break;
1033 		  }
1034 #ifdef REG_LITERAL
1035 	      }
1036 #endif /* REG_LITERAL */
1037 
1038 #ifdef REG_RIGHT_ASSOC
1039 	    if (ctx->cflags & REG_RIGHT_ASSOC)
1040 	      {
1041 		/* Right associative concatenation. */
1042 		STACK_PUSHX(stack, voidptr, result);
1043 		STACK_PUSHX(stack, long, PARSE_POST_CATENATION);
1044 		STACK_PUSHX(stack, long, PARSE_CATENATION);
1045 		STACK_PUSHX(stack, long, PARSE_PIECE);
1046 	      }
1047 	    else
1048 #endif /* REG_RIGHT_ASSOC */
1049 	      {
1050 		/* Default case, left associative concatenation. */
1051 		STACK_PUSHX(stack, long, PARSE_CATENATION);
1052 		STACK_PUSHX(stack, voidptr, result);
1053 		STACK_PUSHX(stack, long, PARSE_POST_CATENATION);
1054 		STACK_PUSHX(stack, long, PARSE_PIECE);
1055 	      }
1056 	    break;
1057 	  }
1058 
1059 	case PARSE_POST_CATENATION:
1060 	  {
1061 	    tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1062 	    tre_ast_node_t *tmp_node;
1063 	    tmp_node = tre_ast_new_catenation(ctx->mem, tree, result);
1064 	    if (!tmp_node)
1065 	      return REG_ESPACE;
1066 	    result = tmp_node;
1067 	    break;
1068 	  }
1069 
1070 	case PARSE_UNION:
1071 	  if (ctx->re >= ctx->re_end)
1072 	    break;
1073 #ifdef REG_LITERAL
1074 	  if (ctx->cflags & REG_LITERAL)
1075 	    break;
1076 #endif /* REG_LITERAL */
1077 	  switch (*ctx->re)
1078 	    {
1079 	    case CHAR_PIPE:
1080 	      DPRINT(("tre_parse:	union: '%.*" STRF "'\n",
1081 		      REST(ctx->re)));
1082 	      STACK_PUSHX(stack, long, PARSE_UNION);
1083 	      STACK_PUSHX(stack, voidptr, result);
1084 	      STACK_PUSHX(stack, long, PARSE_POST_UNION);
1085 	      STACK_PUSHX(stack, long, PARSE_BRANCH);
1086 	      ctx->re++;
1087 	      break;
1088 
1089 	    case CHAR_RPAREN:
1090 	      ctx->re++;
1091 	      break;
1092 
1093 	    default:
1094 	      break;
1095 	    }
1096 	  break;
1097 
1098 	case PARSE_POST_UNION:
1099 	  {
1100 	    tre_ast_node_t *tmp_node;
1101 	    tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1102 	    tmp_node = tre_ast_new_union(ctx->mem, tree, result);
1103 	    if (!tmp_node)
1104 	      return REG_ESPACE;
1105 	    result = tmp_node;
1106 	    break;
1107 	  }
1108 
1109 	case PARSE_POSTFIX:
1110 	  /* Parse postfix operators. */
1111 	  if (ctx->re >= ctx->re_end)
1112 	    break;
1113 #ifdef REG_LITERAL
1114 	  if (ctx->cflags & REG_LITERAL)
1115 	    break;
1116 #endif /* REG_LITERAL */
1117 	  switch (*ctx->re)
1118 	    {
1119 	    case CHAR_PLUS:
1120 	    case CHAR_QUESTIONMARK:
1121 	      if (!(ctx->cflags & REG_EXTENDED))
1122 		break;
1123 		/*FALLTHROUGH*/
1124 	    case CHAR_STAR:
1125 	      {
1126 		tre_ast_node_t *tmp_node;
1127 		int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0;
1128 		int rep_min = 0;
1129 		int rep_max = -1;
1130 #ifdef TRE_DEBUG
1131 		const tre_char_t *tmp_re;
1132 #endif
1133 
1134 		if (*ctx->re == CHAR_PLUS)
1135 		  rep_min = 1;
1136 		if (*ctx->re == CHAR_QUESTIONMARK)
1137 		  rep_max = 1;
1138 #ifdef TRE_DEBUG
1139 		tmp_re = ctx->re;
1140 #endif
1141 
1142 		if (ctx->re + 1 < ctx->re_end)
1143 		  {
1144 		    if (*(ctx->re + 1) == CHAR_QUESTIONMARK)
1145 		      {
1146 			minimal = !(ctx->cflags & REG_UNGREEDY);
1147 			ctx->re++;
1148 		      }
1149 		    else if (*(ctx->re + 1) == CHAR_STAR
1150 			     || *(ctx->re + 1) == CHAR_PLUS)
1151 		      {
1152 			/* These are reserved for future extensions. */
1153 			return REG_BADRPT;
1154 		      }
1155 		  }
1156 
1157 		DPRINT(("tre_parse: %s star: '%.*" STRF "'\n",
1158 			minimal ? "  minimal" : "greedy", REST(tmp_re)));
1159 		ctx->re++;
1160 		tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max,
1161 					    minimal);
1162 		if (tmp_node == NULL)
1163 		  return REG_ESPACE;
1164 		result = tmp_node;
1165 		STACK_PUSHX(stack, long, PARSE_POSTFIX);
1166 	      }
1167 	      break;
1168 
1169 	    case CHAR_BACKSLASH:
1170 	      /* "\{" is special without REG_EXTENDED */
1171 	      if (!(ctx->cflags & REG_EXTENDED)
1172 		  && ctx->re + 1 < ctx->re_end
1173 		  && *(ctx->re + 1) == CHAR_LBRACE)
1174 		{
1175 		  ctx->re++;
1176 		  goto parse_brace;
1177 		}
1178 	      else
1179 		break;
1180 
1181 	    case CHAR_LBRACE:
1182 	      /* "{" is literal without REG_EXTENDED */
1183 	      if (!(ctx->cflags & REG_EXTENDED))
1184 		break;
1185 
1186 	    parse_brace:
1187 	      DPRINT(("tre_parse:	bound: '%.*" STRF "'\n",
1188 		      REST(ctx->re)));
1189 	      ctx->re++;
1190 
1191 	      status = tre_parse_bound(ctx, &result);
1192 	      if (status != REG_OK)
1193 		return status;
1194 	      STACK_PUSHX(stack, long, PARSE_POSTFIX);
1195 	      break;
1196 	    }
1197 	  break;
1198 
1199 	case PARSE_ATOM:
1200 	  /* Parse an atom.  An atom is a regular expression enclosed in `()',
1201 	     an empty set of `()', a bracket expression, `.', `^', `$',
1202 	     a `\' followed by a character, or a single character. */
1203 
1204 	  /* End of regexp? (empty string). */
1205 	  if (ctx->re >= ctx->re_end)
1206 	    goto parse_literal;
1207 
1208 #ifdef REG_LITERAL
1209 	  if (ctx->cflags & REG_LITERAL)
1210 	    goto parse_literal;
1211 #endif /* REG_LITERAL */
1212 
1213 	  switch (*ctx->re)
1214 	    {
1215 	    case CHAR_LPAREN:  /* parenthesized subexpression */
1216 
1217 	      /* Handle "(?...)" extensions.  They work in a way similar
1218 		 to Perls corresponding extensions. */
1219 	      if (ctx->cflags & REG_EXTENDED
1220 		  && *(ctx->re + 1) == CHAR_QUESTIONMARK)
1221 		{
1222 		  int new_cflags = ctx->cflags;
1223 		  int bit = 1;
1224 		  DPRINT(("tre_parse:	extension: '%.*" STRF "\n",
1225 			  REST(ctx->re)));
1226 		  ctx->re += 2;
1227 		  while (/*CONSTCOND*/(void)1,1)
1228 		    {
1229 		      if (*ctx->re == L'i')
1230 			{
1231 			  DPRINT(("tre_parse:	    icase: '%.*" STRF "\n",
1232 				  REST(ctx->re)));
1233 			  if (bit)
1234 			    new_cflags |= REG_ICASE;
1235 			  else
1236 			    new_cflags &= ~REG_ICASE;
1237 			  ctx->re++;
1238 			}
1239 		      else if (*ctx->re == L'n')
1240 			{
1241 			  DPRINT(("tre_parse:	  newline: '%.*" STRF "\n",
1242 				  REST(ctx->re)));
1243 			  if (bit)
1244 			    new_cflags |= REG_NEWLINE;
1245 			  else
1246 			    new_cflags &= ~REG_NEWLINE;
1247 			  ctx->re++;
1248 			}
1249 #ifdef REG_RIGHT_ASSOC
1250 		      else if (*ctx->re == L'r')
1251 			{
1252 			  DPRINT(("tre_parse: right assoc: '%.*" STRF "\n",
1253 				  REST(ctx->re)));
1254 			  if (bit)
1255 			    new_cflags |= REG_RIGHT_ASSOC;
1256 			  else
1257 			    new_cflags &= ~REG_RIGHT_ASSOC;
1258 			  ctx->re++;
1259 			}
1260 #endif /* REG_RIGHT_ASSOC */
1261 #ifdef REG_UNGREEDY
1262 		      else if (*ctx->re == L'U')
1263 			{
1264 			  DPRINT(("tre_parse:    ungreedy: '%.*" STRF "\n",
1265 				  REST(ctx->re)));
1266 			  if (bit)
1267 			    new_cflags |= REG_UNGREEDY;
1268 			  else
1269 			    new_cflags &= ~REG_UNGREEDY;
1270 			  ctx->re++;
1271 			}
1272 #endif /* REG_UNGREEDY */
1273 		      else if (*ctx->re == CHAR_MINUS)
1274 			{
1275 			  DPRINT(("tre_parse:	 turn off: '%.*" STRF "\n",
1276 				  REST(ctx->re)));
1277 			  ctx->re++;
1278 			  bit = 0;
1279 			}
1280 		      else if (*ctx->re == CHAR_COLON)
1281 			{
1282 			  DPRINT(("tre_parse:	 no group: '%.*" STRF "\n",
1283 				  REST(ctx->re)));
1284 			  ctx->re++;
1285 			  depth++;
1286 			  break;
1287 			}
1288 		      else if (*ctx->re == CHAR_HASH)
1289 			{
1290 			  DPRINT(("tre_parse:    comment: '%.*" STRF "\n",
1291 				  REST(ctx->re)));
1292 			  /* A comment can contain any character except a
1293 			     right parenthesis */
1294 			  while (*ctx->re != CHAR_RPAREN
1295 				 && ctx->re < ctx->re_end)
1296 			    ctx->re++;
1297 			  if (*ctx->re == CHAR_RPAREN && ctx->re < ctx->re_end)
1298 			    {
1299 			      ctx->re++;
1300 			      break;
1301 			    }
1302 			  else
1303 			    return REG_BADPAT;
1304 			}
1305 		      else if (*ctx->re == CHAR_RPAREN)
1306 			{
1307 			  ctx->re++;
1308 			  break;
1309 			}
1310 		      else
1311 			return REG_BADPAT;
1312 		    }
1313 
1314 		  /* Turn on the cflags changes for the rest of the
1315 		     enclosing group. */
1316 		  STACK_PUSHX(stack, long, ctx->cflags);
1317 		  STACK_PUSHX(stack, long, PARSE_RESTORE_CFLAGS);
1318 		  STACK_PUSHX(stack, long, PARSE_RE);
1319 		  ctx->cflags = new_cflags;
1320 		  break;
1321 		}
1322 
1323 	      if (ctx->cflags & REG_EXTENDED
1324 		  || (ctx->re > ctx->re_start
1325 		      && *(ctx->re - 1) == CHAR_BACKSLASH))
1326 		{
1327 		  depth++;
1328 		  if (ctx->re + 2 < ctx->re_end
1329 		      && *(ctx->re + 1) == CHAR_QUESTIONMARK
1330 		      && *(ctx->re + 2) == CHAR_COLON)
1331 		    {
1332 		      DPRINT(("tre_parse: group begin: '%.*" STRF
1333 			      "', no submatch\n", REST(ctx->re)));
1334 		      /* Don't mark for submatching. */
1335 		      ctx->re += 3;
1336 		      STACK_PUSHX(stack, long, PARSE_RE);
1337 		    }
1338 		  else
1339 		    {
1340 		      DPRINT(("tre_parse: group begin: '%.*" STRF
1341 			      "', submatch %d\n", REST(ctx->re),
1342 			      ctx->submatch_id));
1343 		      ctx->re++;
1344 		      /* First parse a whole RE, then mark the resulting tree
1345 			 for submatching. */
1346 		      STACK_PUSHX(stack, long, ctx->submatch_id);
1347 		      STACK_PUSHX(stack, long, PARSE_MARK_FOR_SUBMATCH);
1348 		      STACK_PUSHX(stack, long, PARSE_RE);
1349 		      ctx->submatch_id++;
1350 		    }
1351 		}
1352 	      else
1353 		goto parse_literal;
1354 	      break;
1355 
1356 	    case CHAR_RPAREN:  /* end of current subexpression */
1357 	      if ((ctx->cflags & REG_EXTENDED && depth > 0)
1358 		  || (ctx->re > ctx->re_start
1359 		      && *(ctx->re - 1) == CHAR_BACKSLASH))
1360 		{
1361 		  DPRINT(("tre_parse:	    empty: '%.*" STRF "'\n",
1362 			  REST(ctx->re)));
1363 		  /* We were expecting an atom, but instead the current
1364 		     subexpression was closed.	POSIX leaves the meaning of
1365 		     this to be implementation-defined.	 We interpret this as
1366 		     an empty expression (which matches an empty string).  */
1367 		  result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1368 		  if (result == NULL)
1369 		    return REG_ESPACE;
1370 		  if (!(ctx->cflags & REG_EXTENDED))
1371 		    ctx->re--;
1372 		}
1373 	      else
1374 		goto parse_literal;
1375 	      break;
1376 
1377 	    case CHAR_LBRACKET: /* bracket expression */
1378 	      DPRINT(("tre_parse:     bracket: '%.*" STRF "'\n",
1379 		      REST(ctx->re)));
1380 	      ctx->re++;
1381 	      status = tre_parse_bracket(ctx, &result);
1382 	      if (status != REG_OK)
1383 		return status;
1384 	      break;
1385 
1386 	    case CHAR_BACKSLASH:
1387 	      /* If this is "\(" or "\)" chew off the backslash and
1388 		 try again. */
1389 	      if (!(ctx->cflags & REG_EXTENDED)
1390 		  && ctx->re + 1 < ctx->re_end
1391 		  && (*(ctx->re + 1) == CHAR_LPAREN
1392 		      || *(ctx->re + 1) == CHAR_RPAREN))
1393 		{
1394 		  ctx->re++;
1395 		  STACK_PUSHX(stack, long, PARSE_ATOM);
1396 		  break;
1397 		}
1398 
1399 	      /* If a macro is used, parse the expanded macro recursively. */
1400 	      {
1401 		tre_char_t buf[64];
1402 		tre_expand_macro(ctx->re + 1, ctx->re_end,
1403 				 buf, elementsof(buf));
1404 		if (buf[0] != 0)
1405 		  {
1406 		    tre_parse_ctx_t subctx;
1407 		    memcpy(&subctx, ctx, sizeof(subctx));
1408 		    subctx.re = buf;
1409 		    subctx.len = tre_strlen(buf);
1410 		    subctx.nofirstsub = 1;
1411 		    status = tre_parse(&subctx);
1412 		    if (status != REG_OK)
1413 		      return status;
1414 		    ctx->re += 2;
1415 		    ctx->position = subctx.position;
1416 		    result = subctx.result;
1417 		    break;
1418 		  }
1419 	      }
1420 
1421 	      if (ctx->re + 1 >= ctx->re_end)
1422 		/* Trailing backslash. */
1423 		return REG_EESCAPE;
1424 
1425 #ifdef REG_LITERAL
1426 	      if (*(ctx->re + 1) == L'Q')
1427 		{
1428 		  DPRINT(("tre_parse: tmp literal: '%.*" STRF "'\n",
1429 			  REST(ctx->re)));
1430 		  ctx->cflags |= REG_LITERAL;
1431 		  temporary_cflags |= REG_LITERAL;
1432 		  ctx->re += 2;
1433 		  STACK_PUSHX(stack, long, PARSE_ATOM);
1434 		  break;
1435 		}
1436 #endif /* REG_LITERAL */
1437 
1438 	      DPRINT(("tre_parse:  bleep: '%.*" STRF "'\n", REST(ctx->re)));
1439 	      ctx->re++;
1440 	      switch (*ctx->re)
1441 		{
1442 		case L'b':
1443 		  result = tre_ast_new_literal(ctx->mem, ASSERTION,
1444 					       ASSERT_AT_WB, -1);
1445 		  ctx->re++;
1446 		  break;
1447 		case L'B':
1448 		  result = tre_ast_new_literal(ctx->mem, ASSERTION,
1449 					       ASSERT_AT_WB_NEG, -1);
1450 		  ctx->re++;
1451 		  break;
1452 		case L'<':
1453 		  result = tre_ast_new_literal(ctx->mem, ASSERTION,
1454 					       ASSERT_AT_BOW, -1);
1455 		  ctx->re++;
1456 		  break;
1457 		case L'>':
1458 		  result = tre_ast_new_literal(ctx->mem, ASSERTION,
1459 					       ASSERT_AT_EOW, -1);
1460 		  ctx->re++;
1461 		  break;
1462 		case L'x':
1463 		  ctx->re++;
1464 		  if (ctx->re[0] != CHAR_LBRACE && ctx->re < ctx->re_end)
1465 		    {
1466 		      /* 8 bit hex char. */
1467 		      char tmp[3] = {0, 0, 0};
1468 		      long val;
1469 		      DPRINT(("tre_parse:  8 bit hex: '%.*" STRF "'\n",
1470 			      REST(ctx->re - 2)));
1471 
1472 		      if (tre_isxdigit(ctx->re[0]) && ctx->re < ctx->re_end)
1473 			{
1474 			  tmp[0] = (char)ctx->re[0];
1475 			  ctx->re++;
1476 			}
1477 		      if (tre_isxdigit(ctx->re[0]) && ctx->re < ctx->re_end)
1478 			{
1479 			  tmp[1] = (char)ctx->re[0];
1480 			  ctx->re++;
1481 			}
1482 		      val = strtol(tmp, NULL, 16);
1483 		      result = tre_ast_new_literal(ctx->mem, (int)val,
1484 						   (int)val, ctx->position);
1485 		      ctx->position++;
1486 		      break;
1487 		    }
1488 		  else if (ctx->re < ctx->re_end)
1489 		    {
1490 		      /* Wide char. */
1491 		      char tmp[32];
1492 		      long val;
1493 		      int i = 0;
1494 		      ctx->re++;
1495 		      while (ctx->re_end - ctx->re >= 0)
1496 			{
1497 			  if (ctx->re[0] == CHAR_RBRACE)
1498 			    break;
1499 			  if (tre_isxdigit(ctx->re[0]))
1500 			    {
1501 			      tmp[i] = (char)ctx->re[0];
1502 			      i++;
1503 			      ctx->re++;
1504 			      continue;
1505 			    }
1506 			  return REG_EBRACE;
1507 			}
1508 		      ctx->re++;
1509 		      tmp[i] = 0;
1510 		      val = strtol(tmp, NULL, 16);
1511 		      result = tre_ast_new_literal(ctx->mem, (int)val, (int)val,
1512 						   ctx->position);
1513 		      ctx->position++;
1514 		      break;
1515 		    }
1516 		  /*FALLTHROUGH*/
1517 
1518 		default:
1519 		  if (tre_isdigit(*ctx->re))
1520 		    {
1521 		      /* Back reference. */
1522 		      int val = *ctx->re - L'0';
1523 		      DPRINT(("tre_parse:     backref: '%.*" STRF "'\n",
1524 			      REST(ctx->re - 1)));
1525 		      result = tre_ast_new_literal(ctx->mem, BACKREF, val,
1526 						   ctx->position);
1527 		      if (result == NULL)
1528 			return REG_ESPACE;
1529 		      ctx->position++;
1530 		      ctx->max_backref = MAX(val, ctx->max_backref);
1531 		      ctx->re++;
1532 		    }
1533 		  else
1534 		    {
1535 		      /* Escaped character. */
1536 		      DPRINT(("tre_parse:     escaped: '%.*" STRF "'\n",
1537 			      REST(ctx->re - 1)));
1538 		      result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
1539 						   ctx->position);
1540 		      ctx->position++;
1541 		      ctx->re++;
1542 		    }
1543 		  break;
1544 		}
1545 	      if (result == NULL)
1546 		return REG_ESPACE;
1547 	      break;
1548 
1549 	    case CHAR_PERIOD:	 /* the any-symbol */
1550 	      DPRINT(("tre_parse:	  any: '%.*" STRF "'\n",
1551 		      REST(ctx->re)));
1552 	      if (ctx->cflags & REG_NEWLINE)
1553 		{
1554 		  tre_ast_node_t *tmp1;
1555 		  tre_ast_node_t *tmp2;
1556 		  tmp1 = tre_ast_new_literal(ctx->mem, 0, L'\n' - 1,
1557 					     ctx->position);
1558 		  if (!tmp1)
1559 		    return REG_ESPACE;
1560 		  tmp2 = tre_ast_new_literal(ctx->mem, L'\n' + 1, TRE_CHAR_MAX,
1561 					     ctx->position + 1);
1562 		  if (!tmp2)
1563 		    return REG_ESPACE;
1564 		  result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
1565 		  if (!result)
1566 		    return REG_ESPACE;
1567 		  ctx->position += 2;
1568 		}
1569 	      else
1570 		{
1571 		  result = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX,
1572 					       ctx->position);
1573 		  if (!result)
1574 		    return REG_ESPACE;
1575 		  ctx->position++;
1576 		}
1577 	      ctx->re++;
1578 	      break;
1579 
1580 	    case CHAR_CARET:	 /* beginning of line assertion */
1581 	      /* '^' has a special meaning everywhere in EREs, and in the
1582 		 beginning of the RE and after \( is BREs. */
1583 	      if (ctx->cflags & REG_EXTENDED
1584 		  || (ctx->re - 2 >= ctx->re_start
1585 		      && *(ctx->re - 2) == CHAR_BACKSLASH
1586 		      && *(ctx->re - 1) == CHAR_LPAREN)
1587 		  || ctx->re == ctx->re_start)
1588 		{
1589 		  DPRINT(("tre_parse:	      BOL: '%.*" STRF "'\n",
1590 			  REST(ctx->re)));
1591 		  result = tre_ast_new_literal(ctx->mem, ASSERTION,
1592 					       ASSERT_AT_BOL, -1);
1593 		  if (result == NULL)
1594 		    return REG_ESPACE;
1595 		  ctx->re++;
1596 		}
1597 	      else
1598 		goto parse_literal;
1599 	      break;
1600 
1601 	    case CHAR_DOLLAR:	 /* end of line assertion. */
1602 	      /* '$' is special everywhere in EREs, and in the end of the
1603 		 string and before \) is BREs. */
1604 	      if (ctx->cflags & REG_EXTENDED
1605 		  || (ctx->re + 2 < ctx->re_end
1606 		      && *(ctx->re + 1) == CHAR_BACKSLASH
1607 		      && *(ctx->re + 2) == CHAR_RPAREN)
1608 		  || ctx->re + 1 == ctx->re_end)
1609 		{
1610 		  DPRINT(("tre_parse:	      EOL: '%.*" STRF "'\n",
1611 			  REST(ctx->re)));
1612 		  result = tre_ast_new_literal(ctx->mem, ASSERTION,
1613 					       ASSERT_AT_EOL, -1);
1614 		  if (result == NULL)
1615 		    return REG_ESPACE;
1616 		  ctx->re++;
1617 		}
1618 	      else
1619 		goto parse_literal;
1620 	      break;
1621 
1622 	    default:
1623 	    parse_literal:
1624 
1625 	      if (temporary_cflags && ctx->re + 1 < ctx->re_end
1626 		  && *ctx->re == CHAR_BACKSLASH && *(ctx->re + 1) == L'E')
1627 		{
1628 		  DPRINT(("tre_parse:	 end tmps: '%.*" STRF "'\n",
1629 			  REST(ctx->re)));
1630 		  ctx->cflags &= ~temporary_cflags;
1631 		  temporary_cflags = 0;
1632 		  ctx->re += 2;
1633 		  STACK_PUSHX(stack, long, PARSE_PIECE);
1634 		  break;
1635 		}
1636 
1637 
1638 	      /* We are expecting an atom.  If the subexpression (or the whole
1639 		 regexp ends here, we interpret it as an empty expression
1640 		 (which matches an empty string).  */
1641 	      if (
1642 #ifdef REG_LITERAL
1643 		  !(ctx->cflags & REG_LITERAL) &&
1644 #endif /* REG_LITERAL */
1645 		  (ctx->re >= ctx->re_end
1646 		   || *ctx->re == CHAR_STAR
1647 		   || (ctx->cflags & REG_EXTENDED
1648 		       && (*ctx->re == CHAR_PIPE
1649 			   || *ctx->re == CHAR_LBRACE
1650 			   || *ctx->re == CHAR_PLUS
1651 			   || *ctx->re == CHAR_QUESTIONMARK))
1652 		   /* Test for "\)" in BRE mode. */
1653 		   || (!(ctx->cflags & REG_EXTENDED)
1654 		       && ctx->re + 1 < ctx->re_end
1655 		       && *ctx->re == CHAR_BACKSLASH
1656 		       && *(ctx->re + 1) == CHAR_LBRACE)))
1657 		{
1658 		  DPRINT(("tre_parse:	    empty: '%.*" STRF "'\n",
1659 			  REST(ctx->re)));
1660 		  result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1661 		  if (!result)
1662 		    return REG_ESPACE;
1663 		  break;
1664 		}
1665 
1666 	      DPRINT(("tre_parse:     literal: '%.*" STRF "'\n",
1667 		      REST(ctx->re)));
1668 	      /* Note that we can't use an tre_isalpha() test here, since there
1669 		 may be characters which are alphabetic but neither upper or
1670 		 lower case. */
1671 	      if (ctx->cflags & REG_ICASE
1672 		  && (tre_isupper(*ctx->re) || tre_islower(*ctx->re)))
1673 		{
1674 		  tre_ast_node_t *tmp1;
1675 		  tre_ast_node_t *tmp2;
1676 
1677 		  /* XXX - Can there be more than one opposite-case
1678 		     counterpoints for some character in some locale?  Or
1679 		     more than two characters which all should be regarded
1680 		     the same character if case is ignored?  If yes, there
1681 		     does not seem to be a portable way to detect it.  I guess
1682 		     that at least for multi-character collating elements there
1683 		     could be several opposite-case counterpoints, but they
1684 		     cannot be supported portably anyway. */
1685 		  tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(*ctx->re),
1686 					     tre_toupper(*ctx->re),
1687 					     ctx->position);
1688 		  if (!tmp1)
1689 		    return REG_ESPACE;
1690 		  tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(*ctx->re),
1691 					     tre_tolower(*ctx->re),
1692 					     ctx->position);
1693 		  if (!tmp2)
1694 		    return REG_ESPACE;
1695 		  result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
1696 		  if (!result)
1697 		    return REG_ESPACE;
1698 		}
1699 	      else
1700 		{
1701 		  result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
1702 					       ctx->position);
1703 		  if (!result)
1704 		    return REG_ESPACE;
1705 		}
1706 	      ctx->position++;
1707 	      ctx->re++;
1708 	      break;
1709 	    }
1710 	  break;
1711 
1712 	case PARSE_MARK_FOR_SUBMATCH:
1713 	  {
1714 	    int submatch_id = tre_stack_pop_int(stack);
1715 
1716 	    if (result->submatch_id >= 0)
1717 	      {
1718 		tre_ast_node_t *n, *tmp_node;
1719 		n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1720 		if (n == NULL)
1721 		  return REG_ESPACE;
1722 		tmp_node = tre_ast_new_catenation(ctx->mem, n, result);
1723 		if (tmp_node == NULL)
1724 		  return REG_ESPACE;
1725 		tmp_node->num_submatches = result->num_submatches;
1726 		result = tmp_node;
1727 	      }
1728 	    result->submatch_id = submatch_id;
1729 	    result->num_submatches++;
1730 	    break;
1731 	  }
1732 
1733 	case PARSE_RESTORE_CFLAGS:
1734 	  ctx->cflags = tre_stack_pop_int(stack);
1735 	  break;
1736 
1737 	default:
1738 	  assert(0);
1739 	  break;
1740 	}
1741     }
1742 
1743   /* Check for missing closing parentheses. */
1744   if (depth > 0)
1745     return REG_EPAREN;
1746 
1747   if (status == REG_OK)
1748     ctx->result = result;
1749 
1750   return status;
1751 }
1752 
1753 /* EOF */
1754