xref: /netbsd-src/external/bsd/tre/dist/lib/tre-parse.c (revision 4e6df137e8e14049b5a701d249962c480449c141)
1 /*
2   tre-parse.c - Regexp parser
3 
4   This software is released under a BSD-style license.
5   See the file LICENSE for details and copyright.
6 
7 */
8 
9 /*
10   This parser is just a simple recursive descent parser for POSIX.2
11   regexps.  The parser supports both the obsolete default syntax and
12   the "extended" syntax, and some nonstandard extensions.
13 */
14 
15 
16 #ifdef HAVE_CONFIG_H
17 #include <config.h>
18 #endif /* HAVE_CONFIG_H */
19 #include <string.h>
20 #include <assert.h>
21 #include <limits.h>
22 
23 #include "xmalloc.h"
24 #include "tre-mem.h"
25 #include "tre-ast.h"
26 #include "tre-stack.h"
27 #include "tre-parse.h"
28 
29 
30 /* Characters with special meanings in regexp syntax. */
31 #define CHAR_PIPE	   L'|'
32 #define CHAR_LPAREN	   L'('
33 #define CHAR_RPAREN	   L')'
34 #define CHAR_LBRACE	   L'{'
35 #define CHAR_RBRACE	   L'}'
36 #define CHAR_LBRACKET	   L'['
37 #define CHAR_RBRACKET	   L']'
38 #define CHAR_MINUS	   L'-'
39 #define CHAR_STAR	   L'*'
40 #define CHAR_QUESTIONMARK  L'?'
41 #define CHAR_PLUS	   L'+'
42 #define CHAR_PERIOD	   L'.'
43 #define CHAR_COLON	   L':'
44 #define CHAR_EQUAL	   L'='
45 #define CHAR_COMMA	   L','
46 #define CHAR_CARET	   L'^'
47 #define CHAR_DOLLAR	   L'$'
48 #define CHAR_BACKSLASH	   L'\\'
49 #define CHAR_HASH	   L'#'
50 #define CHAR_TILDE	   L'~'
51 
52 
53 /* Some macros for expanding \w, \s, etc. */
54 static const struct tre_macro_struct {
55   const char c;
56   const char *expansion;
57 } tre_macros[] =
58   { {'t', "\t"},	   {'n', "\n"},		   {'r', "\r"},
59     {'f', "\f"},	   {'a', "\a"},		   {'e', "\033"},
60     {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},
61     {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"},   {'D', "[^[:digit:]]"},
62     { 0, NULL }
63   };
64 
65 
66 /* Expands a macro delimited by `regex' and `regex_end' to `buf', which
67    must have at least `len' items.  Sets buf[0] to zero if the there
68    is no match in `tre_macros'. */
69 static void
70 tre_expand_macro(const tre_char_t *regex, const tre_char_t *regex_end,
71 		 tre_char_t *buf, size_t buf_len)
72 {
73   int i;
74 
75   buf[0] = 0;
76   if (regex >= regex_end)
77     return;
78 
79   for (i = 0; tre_macros[i].expansion; i++)
80     {
81       if (tre_macros[i].c == *regex)
82 	{
83 	  unsigned int j;
84 	  DPRINT(("Expanding macro '%c' => '%s'\n",
85 		  tre_macros[i].c, tre_macros[i].expansion));
86 	  for (j = 0; tre_macros[i].expansion[j] && j < buf_len; j++)
87 	    buf[j] = tre_macros[i].expansion[j];
88 	  buf[j] = 0;
89 	  break;
90 	}
91     }
92 }
93 
94 static reg_errcode_t
95 tre_new_item(tre_mem_t mem, int min, int max, int *i, int *max_i,
96 	 tre_ast_node_t ***items)
97 {
98   reg_errcode_t status;
99   tre_ast_node_t **array = *items;
100   /* Allocate more space if necessary. */
101   if (*i >= *max_i)
102     {
103       tre_ast_node_t **new_items;
104       DPRINT(("out of array space, i = %d\n", *i));
105       /* If the array is already 1024 items large, give up -- there's
106 	 probably an error in the regexp (e.g. not a '\0' terminated
107 	 string and missing ']') */
108       if (*max_i > 1024)
109 	return REG_ESPACE;
110       *max_i *= 2;
111       new_items = xrealloc(array, sizeof(*items) * *max_i);
112       if (new_items == NULL)
113 	return REG_ESPACE;
114       *items = array = new_items;
115     }
116   array[*i] = tre_ast_new_literal(mem, min, max, -1);
117   status = array[*i] == NULL ? REG_ESPACE : REG_OK;
118   (*i)++;
119   return status;
120 }
121 
122 
123 /* Expands a character class to character ranges. */
124 static reg_errcode_t
125 tre_expand_ctype(tre_mem_t mem, tre_ctype_t class, tre_ast_node_t ***items,
126 		 int *i, int *max_i, int cflags)
127 {
128   reg_errcode_t status = REG_OK;
129   tre_cint_t c;
130   int j, min = -1, max = 0;
131   assert(TRE_MB_CUR_MAX == 1);
132 
133   DPRINT(("  expanding class to character ranges\n"));
134   for (j = 0; (j < 256) && (status == REG_OK); j++)
135     {
136       c = j;
137       if (tre_isctype(c, class)
138 	  || ((cflags & REG_ICASE)
139 	      && (tre_isctype(tre_tolower(c), class)
140 		  || tre_isctype(tre_toupper(c), class))))
141 {
142 	  if (min < 0)
143 	    min = c;
144 	  max = c;
145 	}
146       else if (min >= 0)
147 	{
148 	  DPRINT(("  range %c (%d) to %c (%d)\n", min, min, max, max));
149 	  status = tre_new_item(mem, min, max, i, max_i, items);
150 	  min = -1;
151 	}
152     }
153   if (min >= 0 && status == REG_OK)
154     status = tre_new_item(mem, min, max, i, max_i, items);
155   return status;
156 }
157 
158 
159 static int
160 tre_compare_items(const void *a, const void *b)
161 {
162   const tre_ast_node_t *node_a = *(tre_ast_node_t * const *)a;
163   const tre_ast_node_t *node_b = *(tre_ast_node_t * const *)b;
164   tre_literal_t *l_a = node_a->obj, *l_b = node_b->obj;
165   int a_min = l_a->code_min, b_min = l_b->code_min;
166 
167   if (a_min < b_min)
168     return -1;
169   else if (a_min > b_min)
170     return 1;
171   else
172     return 0;
173 }
174 
175 #ifndef TRE_USE_SYSTEM_WCTYPE
176 
177 /* isalnum() and the rest may be macros, so wrap them to functions. */
178 int tre_isalnum_func(tre_cint_t c) { return tre_isalnum(c); }
179 int tre_isalpha_func(tre_cint_t c) { return tre_isalpha(c); }
180 
181 #ifdef tre_isascii
182 int tre_isascii_func(tre_cint_t c) { return tre_isascii(c); }
183 #else /* !tre_isascii */
184 int tre_isascii_func(tre_cint_t c) { return !(c >> 7); }
185 #endif /* !tre_isascii */
186 
187 #ifdef tre_isblank
188 int tre_isblank_func(tre_cint_t c) { return tre_isblank(c); }
189 #else /* !tre_isblank */
190 int tre_isblank_func(tre_cint_t c) { return ((c == ' ') || (c == '\t')); }
191 #endif /* !tre_isblank */
192 
193 int tre_iscntrl_func(tre_cint_t c) { return tre_iscntrl(c); }
194 int tre_isdigit_func(tre_cint_t c) { return tre_isdigit(c); }
195 int tre_isgraph_func(tre_cint_t c) { return tre_isgraph(c); }
196 int tre_islower_func(tre_cint_t c) { return tre_islower(c); }
197 int tre_isprint_func(tre_cint_t c) { return tre_isprint(c); }
198 int tre_ispunct_func(tre_cint_t c) { return tre_ispunct(c); }
199 int tre_isspace_func(tre_cint_t c) { return tre_isspace(c); }
200 int tre_isupper_func(tre_cint_t c) { return tre_isupper(c); }
201 int tre_isxdigit_func(tre_cint_t c) { return tre_isxdigit(c); }
202 
203 struct {
204   char *name;
205   int (*func)(tre_cint_t);
206 } tre_ctype_map[] = {
207   { "alnum", &tre_isalnum_func },
208   { "alpha", &tre_isalpha_func },
209 #ifdef tre_isascii
210   { "ascii", &tre_isascii_func },
211 #endif /* tre_isascii */
212 #ifdef tre_isblank
213   { "blank", &tre_isblank_func },
214 #endif /* tre_isblank */
215   { "cntrl", &tre_iscntrl_func },
216   { "digit", &tre_isdigit_func },
217   { "graph", &tre_isgraph_func },
218   { "lower", &tre_islower_func },
219   { "print", &tre_isprint_func },
220   { "punct", &tre_ispunct_func },
221   { "space", &tre_isspace_func },
222   { "upper", &tre_isupper_func },
223   { "xdigit", &tre_isxdigit_func },
224   { NULL, NULL}
225 };
226 
227 tre_ctype_t tre_ctype(const char *name)
228 {
229   int i;
230   for (i = 0; tre_ctype_map[i].name != NULL; i++)
231     {
232       if (strcmp(name, tre_ctype_map[i].name) == 0)
233 	return tre_ctype_map[i].func;
234     }
235   return (tre_ctype_t)0;
236 }
237 #endif /* !TRE_USE_SYSTEM_WCTYPE */
238 
239 /* Maximum number of character classes that can occur in a negated bracket
240    expression.	*/
241 #define MAX_NEG_CLASSES 64
242 
243 /* Maximum length of character class names. */
244 #define MAX_CLASS_NAME
245 
246 #define REST(re) (int)(ctx->re_end - (re)), (re)
247 
248 static reg_errcode_t
249 tre_parse_bracket_items(tre_parse_ctx_t *ctx, int negate,
250 			tre_ctype_t neg_classes[], int *num_neg_classes,
251 			tre_ast_node_t ***items, int *num_items,
252 			int *items_size)
253 {
254   const tre_char_t *re = ctx->re;
255   reg_errcode_t status = REG_OK;
256   tre_ctype_t class = (tre_ctype_t)0;
257   int i = *num_items;
258   int max_i = *items_size;
259   int skip;
260 
261   /* Build an array of the items in the bracket expression. */
262   while (status == REG_OK)
263     {
264       skip = 0;
265       if (re == ctx->re_end)
266 	{
267 	  status = REG_EBRACK;
268 	}
269       else if (*re == CHAR_RBRACKET && re > ctx->re)
270 	{
271 	  DPRINT(("tre_parse_bracket:	done: '%.*" STRF "'\n", REST(re)));
272 	  re++;
273 	  break;
274 	}
275       else
276 	{
277 	  tre_cint_t min = 0, max = 0;
278 
279 	  class = (tre_ctype_t)0;
280 	  if (re + 2 < ctx->re_end
281 	      && *(re + 1) == CHAR_MINUS && *(re + 2) != CHAR_RBRACKET)
282 	    {
283 	      DPRINT(("tre_parse_bracket:  range: '%.*" STRF "'\n", REST(re)));
284 	      min = *re;
285 	      max = *(re + 2);
286 	      re += 3;
287 	      /* XXX - Should use collation order instead of encoding values
288 		 in character ranges. */
289 	      if (min > max)
290 		status = REG_ERANGE;
291 	    }
292 	  else if (re + 1 < ctx->re_end
293 		   && *re == CHAR_LBRACKET && *(re + 1) == CHAR_PERIOD)
294 	    status = REG_ECOLLATE;
295 	  else if (re + 1 < ctx->re_end
296 		   && *re == CHAR_LBRACKET && *(re + 1) == CHAR_EQUAL)
297 	    status = REG_ECOLLATE;
298 	  else if (re + 1 < ctx->re_end
299 		   && *re == CHAR_LBRACKET && *(re + 1) == CHAR_COLON)
300 	    {
301 	      char tmp_str[64];
302 	      const tre_char_t *endptr = re + 2;
303 	      int len;
304 	      DPRINT(("tre_parse_bracket:  class: '%.*" STRF "'\n", REST(re)));
305 	      while (endptr < ctx->re_end && *endptr != CHAR_COLON)
306 		endptr++;
307 	      if (endptr != ctx->re_end)
308 		{
309 		  len = MIN(endptr - re - 2, 63);
310 #ifdef TRE_WCHAR
311 		  {
312 		    tre_char_t tmp_wcs[64];
313 		    wcsncpy(tmp_wcs, re + 2, (size_t)len);
314 		    tmp_wcs[len] = L'\0';
315 #if defined HAVE_WCSRTOMBS
316 		    {
317 		      mbstate_t state;
318 		      const tre_char_t *src = tmp_wcs;
319 		      memset(&state, '\0', sizeof(state));
320 		      len = wcsrtombs(tmp_str, &src, sizeof(tmp_str), &state);
321 		    }
322 #elif defined HAVE_WCSTOMBS
323 		    len = wcstombs(tmp_str, tmp_wcs, 63);
324 #endif /* defined HAVE_WCSTOMBS */
325 		  }
326 #else /* !TRE_WCHAR */
327 		  strncpy(tmp_str, (const char*)re + 2, len);
328 #endif /* !TRE_WCHAR */
329 		  tmp_str[len] = '\0';
330 		  DPRINT(("  class name: %s\n", tmp_str));
331 		  class = tre_ctype(tmp_str);
332 		  if (!class)
333 		    status = REG_ECTYPE;
334 		  /* Optimize character classes for 8 bit character sets. */
335 		  if (status == REG_OK && TRE_MB_CUR_MAX == 1)
336 		    {
337 		      status = tre_expand_ctype(ctx->mem, class, items,
338 						&i, &max_i, ctx->cflags);
339 		      class = (tre_ctype_t)0;
340 		      skip = 1;
341 		    }
342 		  re = endptr + 2;
343 		}
344 	      else
345 		status = REG_ECTYPE;
346 	      min = 0;
347 	      max = TRE_CHAR_MAX;
348 	    }
349 	  else
350 	    {
351 	      DPRINT(("tre_parse_bracket:   char: '%.*" STRF "'\n", REST(re)));
352 	      if (*re == CHAR_MINUS && *(re + 1) != CHAR_RBRACKET
353 		  && ctx->re != re)
354 		/* Two ranges are not allowed to share and endpoint. */
355 		status = REG_ERANGE;
356 	      min = max = *re++;
357 	    }
358 
359 	  if (status != REG_OK)
360 	    break;
361 
362 	  if (class && negate)
363 	    if (*num_neg_classes >= MAX_NEG_CLASSES)
364 	      status = REG_ESPACE;
365 	    else
366 	      neg_classes[(*num_neg_classes)++] = class;
367 	  else if (!skip)
368 	    {
369 	      status = tre_new_item(ctx->mem, min, max, &i, &max_i, items);
370 	      if (status != REG_OK)
371 		break;
372 	      ((tre_literal_t*)((*items)[i-1])->obj)->u.class = class;
373 	    }
374 
375 	  /* Add opposite-case counterpoints if REG_ICASE is present.
376 	     This is broken if there are more than two "same" characters. */
377 	  if (ctx->cflags & REG_ICASE && !class && status == REG_OK && !skip)
378 	    {
379 	      tre_cint_t cmin, ccurr;
380 
381 	      DPRINT(("adding opposite-case counterpoints\n"));
382 	      while (min <= max)
383 		{
384 		  if (tre_islower(min))
385 		    {
386 		      cmin = ccurr = tre_toupper(min++);
387 		      while (tre_islower(min) && tre_toupper(min) == ccurr + 1
388 			     && min <= max)
389 			ccurr = tre_toupper(min++);
390 		      status = tre_new_item(ctx->mem, cmin, ccurr,
391 					    &i, &max_i, items);
392 		    }
393 		  else if (tre_isupper(min))
394 		    {
395 		      cmin = ccurr = tre_tolower(min++);
396 		      while (tre_isupper(min) && tre_tolower(min) == ccurr + 1
397 			     && min <= max)
398 			ccurr = tre_tolower(min++);
399 		      status = tre_new_item(ctx->mem, cmin, ccurr,
400 					    &i, &max_i, items);
401 		    }
402 		  else min++;
403 		  if (status != REG_OK)
404 		    break;
405 		}
406 	      if (status != REG_OK)
407 		break;
408 	    }
409 	}
410     }
411   *num_items = i;
412   *items_size = max_i;
413   ctx->re = re;
414   return status;
415 }
416 
417 static reg_errcode_t
418 tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
419 {
420   tre_ast_node_t *node = NULL;
421   int negate = 0;
422   reg_errcode_t status = REG_OK;
423   tre_ast_node_t **items, *u, *n;
424   int i = 0, j, max_i = 32, curr_max, curr_min;
425   tre_ctype_t neg_classes[MAX_NEG_CLASSES];
426   int num_neg_classes = 0;
427 
428   /* Start off with an array of `max_i' elements. */
429   items = xmalloc(sizeof(*items) * max_i);
430   if (items == NULL)
431     return REG_ESPACE;
432 
433   if (*ctx->re == CHAR_CARET)
434     {
435       DPRINT(("tre_parse_bracket: negate: '%.*" STRF "'\n", REST(ctx->re)));
436       negate = 1;
437       ctx->re++;
438     }
439 
440   status = tre_parse_bracket_items(ctx, negate, neg_classes, &num_neg_classes,
441 				   &items, &i, &max_i);
442 
443   if (status != REG_OK)
444     goto parse_bracket_done;
445 
446   /* Sort the array if we need to negate it. */
447   if (negate)
448     qsort(items, (unsigned)i, sizeof(*items), tre_compare_items);
449 
450   curr_max = curr_min = 0;
451   /* Build a union of the items in the array, negated if necessary. */
452   for (j = 0; j < i && status == REG_OK; j++)
453     {
454       int min, max;
455       tre_literal_t *l = items[j]->obj;
456       min = l->code_min;
457       max = l->code_max;
458 
459       DPRINT(("item: %d - %d, class %ld, curr_max = %d\n",
460 	      (int)l->code_min, (int)l->code_max, (long)l->u.class, curr_max));
461 
462       if (negate)
463 	{
464 	  if (min < curr_max)
465 	    {
466 	      /* Overlap. */
467 	      curr_max = MAX(max + 1, curr_max);
468 	      DPRINT(("overlap, curr_max = %d\n", curr_max));
469 	      l = NULL;
470 	    }
471 	  else
472 	    {
473 	      /* No overlap. */
474 	      curr_max = min - 1;
475 	      if (curr_max >= curr_min)
476 		{
477 		  DPRINT(("no overlap\n"));
478 		  l->code_min = curr_min;
479 		  l->code_max = curr_max;
480 		}
481 	      else
482 		{
483 		  DPRINT(("no overlap, zero room\n"));
484 		  l = NULL;
485 		}
486 	      curr_min = curr_max = max + 1;
487 	    }
488 	}
489 
490       if (l != NULL)
491 	{
492 	  int k;
493 	  DPRINT(("creating %d - %d\n", (int)l->code_min, (int)l->code_max));
494 	  l->position = ctx->position;
495 	  if (num_neg_classes > 0)
496 	    {
497 	      l->neg_classes = tre_mem_alloc(ctx->mem,
498 					     (sizeof(l->neg_classes)
499 					      * (num_neg_classes + 1)));
500 	      if (l->neg_classes == NULL)
501 		{
502 		  status = REG_ESPACE;
503 		  break;
504 		}
505 	      for (k = 0; k < num_neg_classes; k++)
506 		l->neg_classes[k] = neg_classes[k];
507 	      l->neg_classes[k] = (tre_ctype_t)0;
508 	    }
509 	  else
510 	    l->neg_classes = NULL;
511 	  if (node == NULL)
512 	    node = items[j];
513 	  else
514 	    {
515 	      u = tre_ast_new_union(ctx->mem, node, items[j]);
516 	      if (u == NULL)
517 		status = REG_ESPACE;
518 	      node = u;
519 	    }
520 	}
521     }
522 
523   if (status != REG_OK)
524     goto parse_bracket_done;
525 
526   if (negate)
527     {
528       int k;
529       DPRINT(("final: creating %d - %d\n", curr_min, (int)TRE_CHAR_MAX));
530       n = tre_ast_new_literal(ctx->mem, curr_min, TRE_CHAR_MAX, ctx->position);
531       if (n == NULL)
532 	status = REG_ESPACE;
533       else
534 	{
535 	  tre_literal_t *l = n->obj;
536 	  if (num_neg_classes > 0)
537 	    {
538 	      l->neg_classes = tre_mem_alloc(ctx->mem,
539 					     (sizeof(l->neg_classes)
540 					      * (num_neg_classes + 1)));
541 	      if (l->neg_classes == NULL)
542 		{
543 		  status = REG_ESPACE;
544 		  goto parse_bracket_done;
545 		}
546 	      for (k = 0; k < num_neg_classes; k++)
547 		l->neg_classes[k] = neg_classes[k];
548 	      l->neg_classes[k] = (tre_ctype_t)0;
549 	    }
550 	  else
551 	    l->neg_classes = NULL;
552 	  if (node == NULL)
553 	    node = n;
554 	  else
555 	    {
556 	      u = tre_ast_new_union(ctx->mem, node, n);
557 	      if (u == NULL)
558 		status = REG_ESPACE;
559 	      node = u;
560 	    }
561 	}
562     }
563 
564   if (status != REG_OK)
565     goto parse_bracket_done;
566 
567 #ifdef TRE_DEBUG
568   tre_ast_print(node);
569 #endif /* TRE_DEBUG */
570 
571  parse_bracket_done:
572   xfree(items);
573   ctx->position++;
574   *result = node;
575   return status;
576 }
577 
578 
579 /* Parses a positive decimal integer.  Returns -1 if the string does not
580    contain a valid number. */
581 static int
582 tre_parse_int(const tre_char_t **regex, const tre_char_t *regex_end)
583 {
584   int num = -1;
585   const tre_char_t *r = *regex;
586   while (r < regex_end && *r >= L'0' && *r <= L'9')
587     {
588       if (num < 0)
589 	num = 0;
590       num = num * 10 + *r - L'0';
591       r++;
592     }
593   *regex = r;
594   return num;
595 }
596 
597 
598 static reg_errcode_t
599 tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
600 {
601   int min, max, i;
602   int cost_ins, cost_del, cost_subst, cost_max;
603   int limit_ins, limit_del, limit_subst, limit_err;
604   const tre_char_t *r = ctx->re;
605   const tre_char_t *start;
606   int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0;
607   int approx = 0;
608   int costs_set = 0;
609   int counts_set = 0;
610 
611   cost_ins = cost_del = cost_subst = cost_max = TRE_PARAM_UNSET;
612   limit_ins = limit_del = limit_subst = limit_err = TRE_PARAM_UNSET;
613 
614   /* Parse number (minimum repetition count). */
615   min = -1;
616   if (r < ctx->re_end && *r >= L'0' && *r <= L'9') {
617     DPRINT(("tre_parse:	  min count: '%.*" STRF "'\n", REST(r)));
618     min = tre_parse_int(&r, ctx->re_end);
619   }
620 
621   /* Parse comma and second number (maximum repetition count). */
622   max = min;
623   if (r < ctx->re_end && *r == CHAR_COMMA)
624     {
625       r++;
626       DPRINT(("tre_parse:   max count: '%.*" STRF "'\n", REST(r)));
627       max = tre_parse_int(&r, ctx->re_end);
628     }
629 
630   /* Check that the repeat counts are sane. */
631   if ((max >= 0 && min > max) || max > RE_DUP_MAX)
632     return REG_BADBR;
633 
634 
635   /*
636    '{'
637      optionally followed immediately by a number == minimum repcount
638      optionally followed by , then a number == maximum repcount
639       + then a number == maximum insertion count
640       - then a number == maximum deletion count
641       # then a number == maximum substitution count
642       ~ then a number == maximum number of errors
643       Any of +, -, # or ~ without followed by a number means that
644       the maximum count/number of errors is infinite.
645 
646       An equation of the form
647 	Xi + Yd + Zs < C
648       can be specified to set costs and the cost limit to a value
649       different from the default value:
650 	- X is the cost of an insertion
651 	- Y is the cost of a deletion
652 	- Z is the cost of a substitution
653 	- C is the maximum cost
654 
655       If no count limit or cost is set for an operation, the operation
656       is not allowed at all.
657   */
658 
659 
660   do {
661     int done;
662     start = r;
663 
664     /* Parse count limit settings */
665     done = 0;
666     if (!counts_set)
667       while (r + 1 < ctx->re_end && !done)
668 	{
669 	  switch (*r)
670 	    {
671 	    case CHAR_PLUS:  /* Insert limit */
672 	      DPRINT(("tre_parse:   ins limit: '%.*" STRF "'\n", REST(r)));
673 	      r++;
674 	      limit_ins = tre_parse_int(&r, ctx->re_end);
675 	      if (limit_ins < 0)
676 		limit_ins = INT_MAX;
677 	      counts_set = 1;
678 	      break;
679 	    case CHAR_MINUS: /* Delete limit */
680 	      DPRINT(("tre_parse:   del limit: '%.*" STRF "'\n", REST(r)));
681 	      r++;
682 	      limit_del = tre_parse_int(&r, ctx->re_end);
683 	      if (limit_del < 0)
684 		limit_del = INT_MAX;
685 	      counts_set = 1;
686 	      break;
687 	    case CHAR_HASH:  /* Substitute limit */
688 	      DPRINT(("tre_parse: subst limit: '%.*" STRF "'\n", REST(r)));
689 	      r++;
690 	      limit_subst = tre_parse_int(&r, ctx->re_end);
691 	      if (limit_subst < 0)
692 		limit_subst = INT_MAX;
693 	      counts_set = 1;
694 	      break;
695 	    case CHAR_TILDE: /* Maximum number of changes */
696 	      DPRINT(("tre_parse: count limit: '%.*" STRF "'\n", REST(r)));
697 	      r++;
698 	      limit_err = tre_parse_int(&r, ctx->re_end);
699 	      if (limit_err < 0)
700 		limit_err = INT_MAX;
701 	      approx = 1;
702 	      break;
703 	    case CHAR_COMMA:
704 	      r++;
705 	      break;
706 	    case L' ':
707 	      r++;
708 	      break;
709 	    case L'}':
710 	      done = 1;
711 	      break;
712 	    default:
713 	      done = 1;
714 	      break;
715 	    }
716 	}
717 
718     /* Parse cost restriction equation. */
719     done = 0;
720     if (!costs_set)
721       while (r + 1 < ctx->re_end && !done)
722 	{
723 	  switch (*r)
724 	    {
725 	    case CHAR_PLUS:
726 	    case L' ':
727 	      r++;
728 	      break;
729 	    case L'<':
730 	      DPRINT(("tre_parse:    max cost: '%.*" STRF "'\n", REST(r)));
731 	      r++;
732 	      while (*r == L' ')
733 		r++;
734 	      cost_max = tre_parse_int(&r, ctx->re_end);
735 	      if (cost_max < 0)
736 		cost_max = INT_MAX;
737 	      else
738 		cost_max--;
739 	      approx = 1;
740 	      break;
741 	    case CHAR_COMMA:
742 	      r++;
743 	      done = 1;
744 	      break;
745 	    default:
746 	      if (*r >= L'0' && *r <= L'9')
747 		{
748 #ifdef TRE_DEBUG
749 		  const tre_char_t *sr = r;
750 #endif /* TRE_DEBUG */
751 		  int cost = tre_parse_int(&r, ctx->re_end);
752 		  /* XXX - make sure r is not past end. */
753 		  switch (*r)
754 		    {
755 		    case L'i':	/* Insert cost */
756 		      DPRINT(("tre_parse:    ins cost: '%.*" STRF "'\n",
757 			      REST(sr)));
758 		      r++;
759 		      cost_ins = cost;
760 		      costs_set = 1;
761 		      break;
762 		    case L'd':	/* Delete cost */
763 		      DPRINT(("tre_parse:    del cost: '%.*" STRF "'\n",
764 			      REST(sr)));
765 		      r++;
766 		      cost_del = cost;
767 		      costs_set = 1;
768 		      break;
769 		    case L's':	/* Substitute cost */
770 		      DPRINT(("tre_parse:  subst cost: '%.*" STRF "'\n",
771 			      REST(sr)));
772 		      r++;
773 		      cost_subst = cost;
774 		      costs_set = 1;
775 		      break;
776 		    default:
777 		      return REG_BADBR;
778 		    }
779 		}
780 	      else
781 		{
782 		  done = 1;
783 		  break;
784 		}
785 	    }
786 	}
787   } while (start != r);
788 
789   /* Missing }. */
790   if (r >= ctx->re_end)
791     return REG_EBRACE;
792 
793   /* Empty contents of {}. */
794   if (r == ctx->re)
795     return REG_BADBR;
796 
797   /* Parse the ending '}' or '\}'.*/
798   if (ctx->cflags & REG_EXTENDED)
799     {
800       if (r >= ctx->re_end || *r != CHAR_RBRACE)
801 	return REG_BADBR;
802       r++;
803     }
804   else
805     {
806       if (r + 1 >= ctx->re_end
807 	  || *r != CHAR_BACKSLASH
808 	  || *(r + 1) != CHAR_RBRACE)
809 	return REG_BADBR;
810       r += 2;
811     }
812 
813 
814   /* Parse trailing '?' marking minimal repetition. */
815   if (r < ctx->re_end)
816     {
817       if (*r == CHAR_QUESTIONMARK)
818 	{
819 	  minimal = !(ctx->cflags & REG_UNGREEDY);
820 	  r++;
821 	}
822       else if (*r == CHAR_STAR || *r == CHAR_PLUS)
823 	{
824 	  /* These are reserved for future extensions. */
825 	  return REG_BADRPT;
826 	}
827     }
828 
829   /* Create the AST node(s). */
830   if (min == 0 && max == 0)
831     {
832       *result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
833       if (*result == NULL)
834 	return REG_ESPACE;
835     }
836   else
837     {
838       if (min < 0 && max < 0)
839 	/* Only approximate parameters set, no repetitions. */
840 	min = max = 1;
841 
842       *result = tre_ast_new_iter(ctx->mem, *result, min, max, minimal);
843       if (!*result)
844 	return REG_ESPACE;
845 
846       /* If approximate matching parameters are set, add them to the
847 	 iteration node. */
848       if (approx || costs_set || counts_set)
849 	{
850 	  int *params;
851 	  tre_iteration_t *iter = (*result)->obj;
852 
853 	  if (costs_set || counts_set)
854 	    {
855 	      if (limit_ins == TRE_PARAM_UNSET)
856 		{
857 		  if (cost_ins == TRE_PARAM_UNSET)
858 		    limit_ins = 0;
859 		  else
860 		    limit_ins = INT_MAX;
861 		}
862 
863 	      if (limit_del == TRE_PARAM_UNSET)
864 		{
865 		  if (cost_del == TRE_PARAM_UNSET)
866 		    limit_del = 0;
867 		  else
868 		    limit_del = INT_MAX;
869 		}
870 
871 	      if (limit_subst == TRE_PARAM_UNSET)
872 		{
873 		  if (cost_subst == TRE_PARAM_UNSET)
874 		    limit_subst = 0;
875 		  else
876 		    limit_subst = INT_MAX;
877 		}
878 	    }
879 
880 	  if (cost_max == TRE_PARAM_UNSET)
881 	    cost_max = INT_MAX;
882 	  if (limit_err == TRE_PARAM_UNSET)
883 	    limit_err = INT_MAX;
884 
885 	  ctx->have_approx = 1;
886 	  params = tre_mem_alloc(ctx->mem, sizeof(*params) * TRE_PARAM_LAST);
887 	  if (!params)
888 	    return REG_ESPACE;
889 	  for (i = 0; i < TRE_PARAM_LAST; i++)
890 	    params[i] = TRE_PARAM_UNSET;
891 	  params[TRE_PARAM_COST_INS] = cost_ins;
892 	  params[TRE_PARAM_COST_DEL] = cost_del;
893 	  params[TRE_PARAM_COST_SUBST] = cost_subst;
894 	  params[TRE_PARAM_COST_MAX] = cost_max;
895 	  params[TRE_PARAM_MAX_INS] = limit_ins;
896 	  params[TRE_PARAM_MAX_DEL] = limit_del;
897 	  params[TRE_PARAM_MAX_SUBST] = limit_subst;
898 	  params[TRE_PARAM_MAX_ERR] = limit_err;
899 	  iter->params = params;
900 	}
901     }
902 
903   DPRINT(("tre_parse_bound: min %d, max %d, costs [%d,%d,%d, total %d], "
904 	  "limits [%d,%d,%d, total %d]\n",
905 	  min, max, cost_ins, cost_del, cost_subst, cost_max,
906 	  limit_ins, limit_del, limit_subst, limit_err));
907 
908 
909   ctx->re = r;
910   return REG_OK;
911 }
912 
913 typedef enum {
914   PARSE_RE = 0,
915   PARSE_ATOM,
916   PARSE_MARK_FOR_SUBMATCH,
917   PARSE_BRANCH,
918   PARSE_PIECE,
919   PARSE_CATENATION,
920   PARSE_POST_CATENATION,
921   PARSE_UNION,
922   PARSE_POST_UNION,
923   PARSE_POSTFIX,
924   PARSE_RESTORE_CFLAGS
925 } tre_parse_re_stack_symbol_t;
926 
927 
928 reg_errcode_t
929 tre_parse(tre_parse_ctx_t *ctx)
930 {
931   tre_ast_node_t *result = NULL;
932   tre_parse_re_stack_symbol_t symbol;
933   reg_errcode_t status = REG_OK;
934   tre_stack_t *stack = ctx->stack;
935   int bottom = tre_stack_num_objects(stack);
936   int depth = 0;
937   int temporary_cflags = 0;
938 
939   DPRINT(("tre_parse: parsing '%.*" STRF "', len = %d\n",
940 	  ctx->len, ctx->re, ctx->len));
941 
942   if (!ctx->nofirstsub)
943     {
944       STACK_PUSH(stack, int, ctx->submatch_id);
945       STACK_PUSH(stack, int, PARSE_MARK_FOR_SUBMATCH);
946       ctx->submatch_id++;
947     }
948   STACK_PUSH(stack, int, PARSE_RE);
949   ctx->re_start = ctx->re;
950   ctx->re_end = ctx->re + ctx->len;
951 
952 
953   /* The following is basically just a recursive descent parser.  I use
954      an explicit stack instead of recursive functions mostly because of
955      two reasons: compatibility with systems which have an overflowable
956      call stack, and efficiency (both in lines of code and speed).  */
957   while (tre_stack_num_objects(stack) > bottom && status == REG_OK)
958     {
959       if (status != REG_OK)
960 	break;
961       symbol = tre_stack_pop_int(stack);
962       switch (symbol)
963 	{
964 	case PARSE_RE:
965 	  /* Parse a full regexp.  A regexp is one or more branches,
966 	     separated by the union operator `|'. */
967 #ifdef REG_LITERAL
968 	  if (!(ctx->cflags & REG_LITERAL)
969 	      && ctx->cflags & REG_EXTENDED)
970 #endif /* REG_LITERAL */
971 	    STACK_PUSHX(stack, int, PARSE_UNION);
972 	  STACK_PUSHX(stack, int, PARSE_BRANCH);
973 	  break;
974 
975 	case PARSE_BRANCH:
976 	  /* Parse a branch.  A branch is one or more pieces, concatenated.
977 	     A piece is an atom possibly followed by a postfix operator. */
978 	  STACK_PUSHX(stack, int, PARSE_CATENATION);
979 	  STACK_PUSHX(stack, int, PARSE_PIECE);
980 	  break;
981 
982 	case PARSE_PIECE:
983 	  /* Parse a piece.  A piece is an atom possibly followed by one
984 	     or more postfix operators. */
985 #ifdef REG_LITERAL
986 	  if (!(ctx->cflags & REG_LITERAL))
987 #endif /* REG_LITERAL */
988 	    STACK_PUSHX(stack, int, PARSE_POSTFIX);
989 	  STACK_PUSHX(stack, int, PARSE_ATOM);
990 	  break;
991 
992 	case PARSE_CATENATION:
993 	  /* If the expression has not ended, parse another piece. */
994 	  {
995 	    tre_char_t c;
996 	    if (ctx->re >= ctx->re_end)
997 	      break;
998 	    c = *ctx->re;
999 #ifdef REG_LITERAL
1000 	    if (!(ctx->cflags & REG_LITERAL))
1001 	      {
1002 #endif /* REG_LITERAL */
1003 		if (ctx->cflags & REG_EXTENDED && c == CHAR_PIPE)
1004 		  break;
1005 		if ((ctx->cflags & REG_EXTENDED
1006 		     && c == CHAR_RPAREN && depth > 0)
1007 		    || (!(ctx->cflags & REG_EXTENDED)
1008 			&& (c == CHAR_BACKSLASH
1009 			    && *(ctx->re + 1) == CHAR_RPAREN)))
1010 		  {
1011 		    if (!(ctx->cflags & REG_EXTENDED) && depth == 0)
1012 		      status = REG_EPAREN;
1013 		    DPRINT(("tre_parse:	  group end: '%.*" STRF "'\n",
1014 			    REST(ctx->re)));
1015 		    depth--;
1016 		    if (!(ctx->cflags & REG_EXTENDED))
1017 		      ctx->re += 2;
1018 		    break;
1019 		  }
1020 #ifdef REG_LITERAL
1021 	      }
1022 #endif /* REG_LITERAL */
1023 
1024 #ifdef REG_RIGHT_ASSOC
1025 	    if (ctx->cflags & REG_RIGHT_ASSOC)
1026 	      {
1027 		/* Right associative concatenation. */
1028 		STACK_PUSHX(stack, voidptr, result);
1029 		STACK_PUSHX(stack, int, PARSE_POST_CATENATION);
1030 		STACK_PUSHX(stack, int, PARSE_CATENATION);
1031 		STACK_PUSHX(stack, int, PARSE_PIECE);
1032 	      }
1033 	    else
1034 #endif /* REG_RIGHT_ASSOC */
1035 	      {
1036 		/* Default case, left associative concatenation. */
1037 		STACK_PUSHX(stack, int, PARSE_CATENATION);
1038 		STACK_PUSHX(stack, voidptr, result);
1039 		STACK_PUSHX(stack, int, PARSE_POST_CATENATION);
1040 		STACK_PUSHX(stack, int, PARSE_PIECE);
1041 	      }
1042 	    break;
1043 	  }
1044 
1045 	case PARSE_POST_CATENATION:
1046 	  {
1047 	    tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1048 	    tre_ast_node_t *tmp_node;
1049 	    tmp_node = tre_ast_new_catenation(ctx->mem, tree, result);
1050 	    if (!tmp_node)
1051 	      return REG_ESPACE;
1052 	    result = tmp_node;
1053 	    break;
1054 	  }
1055 
1056 	case PARSE_UNION:
1057 	  if (ctx->re >= ctx->re_end)
1058 	    break;
1059 #ifdef REG_LITERAL
1060 	  if (ctx->cflags & REG_LITERAL)
1061 	    break;
1062 #endif /* REG_LITERAL */
1063 	  switch (*ctx->re)
1064 	    {
1065 	    case CHAR_PIPE:
1066 	      DPRINT(("tre_parse:	union: '%.*" STRF "'\n",
1067 		      REST(ctx->re)));
1068 	      STACK_PUSHX(stack, int, PARSE_UNION);
1069 	      STACK_PUSHX(stack, voidptr, result);
1070 	      STACK_PUSHX(stack, int, PARSE_POST_UNION);
1071 	      STACK_PUSHX(stack, int, PARSE_BRANCH);
1072 	      ctx->re++;
1073 	      break;
1074 
1075 	    case CHAR_RPAREN:
1076 	      ctx->re++;
1077 	      break;
1078 
1079 	    default:
1080 	      break;
1081 	    }
1082 	  break;
1083 
1084 	case PARSE_POST_UNION:
1085 	  {
1086 	    tre_ast_node_t *tmp_node;
1087 	    tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1088 	    tmp_node = tre_ast_new_union(ctx->mem, tree, result);
1089 	    if (!tmp_node)
1090 	      return REG_ESPACE;
1091 	    result = tmp_node;
1092 	    break;
1093 	  }
1094 
1095 	case PARSE_POSTFIX:
1096 	  /* Parse postfix operators. */
1097 	  if (ctx->re >= ctx->re_end)
1098 	    break;
1099 #ifdef REG_LITERAL
1100 	  if (ctx->cflags & REG_LITERAL)
1101 	    break;
1102 #endif /* REG_LITERAL */
1103 	  switch (*ctx->re)
1104 	    {
1105 	    case CHAR_PLUS:
1106 	    case CHAR_QUESTIONMARK:
1107 	      if (!(ctx->cflags & REG_EXTENDED))
1108 		break;
1109 		/*FALLTHROUGH*/
1110 	    case CHAR_STAR:
1111 	      {
1112 		tre_ast_node_t *tmp_node;
1113 		int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0;
1114 		int rep_min = 0;
1115 		int rep_max = -1;
1116 #ifdef TRE_DEBUG
1117 		const tre_char_t *tmp_re;
1118 #endif
1119 
1120 		if (*ctx->re == CHAR_PLUS)
1121 		  rep_min = 1;
1122 		if (*ctx->re == CHAR_QUESTIONMARK)
1123 		  rep_max = 1;
1124 #ifdef TRE_DEBUG
1125 		tmp_re = ctx->re;
1126 #endif
1127 
1128 		if (ctx->re + 1 < ctx->re_end)
1129 		  {
1130 		    if (*(ctx->re + 1) == CHAR_QUESTIONMARK)
1131 		      {
1132 			minimal = !(ctx->cflags & REG_UNGREEDY);
1133 			ctx->re++;
1134 		      }
1135 		    else if (*(ctx->re + 1) == CHAR_STAR
1136 			     || *(ctx->re + 1) == CHAR_PLUS)
1137 		      {
1138 			/* These are reserved for future extensions. */
1139 			return REG_BADRPT;
1140 		      }
1141 		  }
1142 
1143 		DPRINT(("tre_parse: %s star: '%.*" STRF "'\n",
1144 			minimal ? "  minimal" : "greedy", REST(tmp_re)));
1145 		ctx->re++;
1146 		tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max,
1147 					    minimal);
1148 		if (tmp_node == NULL)
1149 		  return REG_ESPACE;
1150 		result = tmp_node;
1151 		STACK_PUSHX(stack, int, PARSE_POSTFIX);
1152 	      }
1153 	      break;
1154 
1155 	    case CHAR_BACKSLASH:
1156 	      /* "\{" is special without REG_EXTENDED */
1157 	      if (!(ctx->cflags & REG_EXTENDED)
1158 		  && ctx->re + 1 < ctx->re_end
1159 		  && *(ctx->re + 1) == CHAR_LBRACE)
1160 		{
1161 		  ctx->re++;
1162 		  goto parse_brace;
1163 		}
1164 	      else
1165 		break;
1166 
1167 	    case CHAR_LBRACE:
1168 	      /* "{" is literal without REG_EXTENDED */
1169 	      if (!(ctx->cflags & REG_EXTENDED))
1170 		break;
1171 
1172 	    parse_brace:
1173 	      DPRINT(("tre_parse:	bound: '%.*" STRF "'\n",
1174 		      REST(ctx->re)));
1175 	      ctx->re++;
1176 
1177 	      status = tre_parse_bound(ctx, &result);
1178 	      if (status != REG_OK)
1179 		return status;
1180 	      STACK_PUSHX(stack, int, PARSE_POSTFIX);
1181 	      break;
1182 	    }
1183 	  break;
1184 
1185 	case PARSE_ATOM:
1186 	  /* Parse an atom.  An atom is a regular expression enclosed in `()',
1187 	     an empty set of `()', a bracket expression, `.', `^', `$',
1188 	     a `\' followed by a character, or a single character. */
1189 
1190 	  /* End of regexp? (empty string). */
1191 	  if (ctx->re >= ctx->re_end)
1192 	    goto parse_literal;
1193 
1194 #ifdef REG_LITERAL
1195 	  if (ctx->cflags & REG_LITERAL)
1196 	    goto parse_literal;
1197 #endif /* REG_LITERAL */
1198 
1199 	  switch (*ctx->re)
1200 	    {
1201 	    case CHAR_LPAREN:  /* parenthesized subexpression */
1202 
1203 	      /* Handle "(?...)" extensions.  They work in a way similar
1204 		 to Perls corresponding extensions. */
1205 	      if (ctx->cflags & REG_EXTENDED
1206 		  && *(ctx->re + 1) == CHAR_QUESTIONMARK)
1207 		{
1208 		  int new_cflags = ctx->cflags;
1209 		  int bit = 1;
1210 		  DPRINT(("tre_parse:	extension: '%.*" STRF "\n",
1211 			  REST(ctx->re)));
1212 		  ctx->re += 2;
1213 		  while (/*CONSTCOND*/1)
1214 		    {
1215 		      if (*ctx->re == L'i')
1216 			{
1217 			  DPRINT(("tre_parse:	    icase: '%.*" STRF "\n",
1218 				  REST(ctx->re)));
1219 			  if (bit)
1220 			    new_cflags |= REG_ICASE;
1221 			  else
1222 			    new_cflags &= ~REG_ICASE;
1223 			  ctx->re++;
1224 			}
1225 		      else if (*ctx->re == L'n')
1226 			{
1227 			  DPRINT(("tre_parse:	  newline: '%.*" STRF "\n",
1228 				  REST(ctx->re)));
1229 			  if (bit)
1230 			    new_cflags |= REG_NEWLINE;
1231 			  else
1232 			    new_cflags &= ~REG_NEWLINE;
1233 			  ctx->re++;
1234 			}
1235 #ifdef REG_RIGHT_ASSOC
1236 		      else if (*ctx->re == L'r')
1237 			{
1238 			  DPRINT(("tre_parse: right assoc: '%.*" STRF "\n",
1239 				  REST(ctx->re)));
1240 			  if (bit)
1241 			    new_cflags |= REG_RIGHT_ASSOC;
1242 			  else
1243 			    new_cflags &= ~REG_RIGHT_ASSOC;
1244 			  ctx->re++;
1245 			}
1246 #endif /* REG_RIGHT_ASSOC */
1247 #ifdef REG_UNGREEDY
1248 		      else if (*ctx->re == L'U')
1249 			{
1250 			  DPRINT(("tre_parse:    ungreedy: '%.*" STRF "\n",
1251 				  REST(ctx->re)));
1252 			  if (bit)
1253 			    new_cflags |= REG_UNGREEDY;
1254 			  else
1255 			    new_cflags &= ~REG_UNGREEDY;
1256 			  ctx->re++;
1257 			}
1258 #endif /* REG_UNGREEDY */
1259 		      else if (*ctx->re == CHAR_MINUS)
1260 			{
1261 			  DPRINT(("tre_parse:	 turn off: '%.*" STRF "\n",
1262 				  REST(ctx->re)));
1263 			  ctx->re++;
1264 			  bit = 0;
1265 			}
1266 		      else if (*ctx->re == CHAR_COLON)
1267 			{
1268 			  DPRINT(("tre_parse:	 no group: '%.*" STRF "\n",
1269 				  REST(ctx->re)));
1270 			  ctx->re++;
1271 			  depth++;
1272 			  break;
1273 			}
1274 		      else if (*ctx->re == CHAR_HASH)
1275 			{
1276 			  DPRINT(("tre_parse:    comment: '%.*" STRF "\n",
1277 				  REST(ctx->re)));
1278 			  /* A comment can contain any character except a
1279 			     right parenthesis */
1280 			  while (*ctx->re != CHAR_RPAREN
1281 				 && ctx->re < ctx->re_end)
1282 			    ctx->re++;
1283 			  if (*ctx->re == CHAR_RPAREN && ctx->re < ctx->re_end)
1284 			    {
1285 			      ctx->re++;
1286 			      break;
1287 			    }
1288 			  else
1289 			    return REG_BADPAT;
1290 			}
1291 		      else if (*ctx->re == CHAR_RPAREN)
1292 			{
1293 			  ctx->re++;
1294 			  break;
1295 			}
1296 		      else
1297 			return REG_BADPAT;
1298 		    }
1299 
1300 		  /* Turn on the cflags changes for the rest of the
1301 		     enclosing group. */
1302 		  STACK_PUSHX(stack, int, ctx->cflags);
1303 		  STACK_PUSHX(stack, int, PARSE_RESTORE_CFLAGS);
1304 		  STACK_PUSHX(stack, int, PARSE_RE);
1305 		  ctx->cflags = new_cflags;
1306 		  break;
1307 		}
1308 
1309 	      if (ctx->cflags & REG_EXTENDED
1310 		  || (ctx->re > ctx->re_start
1311 		      && *(ctx->re - 1) == CHAR_BACKSLASH))
1312 		{
1313 		  depth++;
1314 		  if (ctx->re + 2 < ctx->re_end
1315 		      && *(ctx->re + 1) == CHAR_QUESTIONMARK
1316 		      && *(ctx->re + 2) == CHAR_COLON)
1317 		    {
1318 		      DPRINT(("tre_parse: group begin: '%.*" STRF
1319 			      "', no submatch\n", REST(ctx->re)));
1320 		      /* Don't mark for submatching. */
1321 		      ctx->re += 3;
1322 		      STACK_PUSHX(stack, int, PARSE_RE);
1323 		    }
1324 		  else
1325 		    {
1326 		      DPRINT(("tre_parse: group begin: '%.*" STRF
1327 			      "', submatch %d\n", REST(ctx->re),
1328 			      ctx->submatch_id));
1329 		      ctx->re++;
1330 		      /* First parse a whole RE, then mark the resulting tree
1331 			 for submatching. */
1332 		      STACK_PUSHX(stack, int, ctx->submatch_id);
1333 		      STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH);
1334 		      STACK_PUSHX(stack, int, PARSE_RE);
1335 		      ctx->submatch_id++;
1336 		    }
1337 		}
1338 	      else
1339 		goto parse_literal;
1340 	      break;
1341 
1342 	    case CHAR_RPAREN:  /* end of current subexpression */
1343 	      if ((ctx->cflags & REG_EXTENDED && depth > 0)
1344 		  || (ctx->re > ctx->re_start
1345 		      && *(ctx->re - 1) == CHAR_BACKSLASH))
1346 		{
1347 		  DPRINT(("tre_parse:	    empty: '%.*" STRF "'\n",
1348 			  REST(ctx->re)));
1349 		  /* We were expecting an atom, but instead the current
1350 		     subexpression was closed.	POSIX leaves the meaning of
1351 		     this to be implementation-defined.	 We interpret this as
1352 		     an empty expression (which matches an empty string).  */
1353 		  result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1354 		  if (result == NULL)
1355 		    return REG_ESPACE;
1356 		  if (!(ctx->cflags & REG_EXTENDED))
1357 		    ctx->re--;
1358 		}
1359 	      else
1360 		goto parse_literal;
1361 	      break;
1362 
1363 	    case CHAR_LBRACKET: /* bracket expression */
1364 	      DPRINT(("tre_parse:     bracket: '%.*" STRF "'\n",
1365 		      REST(ctx->re)));
1366 	      ctx->re++;
1367 	      status = tre_parse_bracket(ctx, &result);
1368 	      if (status != REG_OK)
1369 		return status;
1370 	      break;
1371 
1372 	    case CHAR_BACKSLASH:
1373 	      /* If this is "\(" or "\)" chew off the backslash and
1374 		 try again. */
1375 	      if (!(ctx->cflags & REG_EXTENDED)
1376 		  && ctx->re + 1 < ctx->re_end
1377 		  && (*(ctx->re + 1) == CHAR_LPAREN
1378 		      || *(ctx->re + 1) == CHAR_RPAREN))
1379 		{
1380 		  ctx->re++;
1381 		  STACK_PUSHX(stack, int, PARSE_ATOM);
1382 		  break;
1383 		}
1384 
1385 	      /* If a macro is used, parse the expanded macro recursively. */
1386 	      {
1387 		tre_char_t buf[64];
1388 		tre_expand_macro(ctx->re + 1, ctx->re_end,
1389 				 buf, elementsof(buf));
1390 		if (buf[0] != 0)
1391 		  {
1392 		    tre_parse_ctx_t subctx;
1393 		    memcpy(&subctx, ctx, sizeof(subctx));
1394 		    subctx.re = buf;
1395 		    subctx.len = tre_strlen(buf);
1396 		    subctx.nofirstsub = 1;
1397 		    status = tre_parse(&subctx);
1398 		    if (status != REG_OK)
1399 		      return status;
1400 		    ctx->re += 2;
1401 		    ctx->position = subctx.position;
1402 		    result = subctx.result;
1403 		    break;
1404 		  }
1405 	      }
1406 
1407 	      if (ctx->re + 1 >= ctx->re_end)
1408 		/* Trailing backslash. */
1409 		return REG_EESCAPE;
1410 
1411 #ifdef REG_LITERAL
1412 	      if (*(ctx->re + 1) == L'Q')
1413 		{
1414 		  DPRINT(("tre_parse: tmp literal: '%.*" STRF "'\n",
1415 			  REST(ctx->re)));
1416 		  ctx->cflags |= REG_LITERAL;
1417 		  temporary_cflags |= REG_LITERAL;
1418 		  ctx->re += 2;
1419 		  STACK_PUSHX(stack, int, PARSE_ATOM);
1420 		  break;
1421 		}
1422 #endif /* REG_LITERAL */
1423 
1424 	      DPRINT(("tre_parse:  bleep: '%.*" STRF "'\n", REST(ctx->re)));
1425 	      ctx->re++;
1426 	      switch (*ctx->re)
1427 		{
1428 		case L'b':
1429 		  result = tre_ast_new_literal(ctx->mem, ASSERTION,
1430 					       ASSERT_AT_WB, -1);
1431 		  ctx->re++;
1432 		  break;
1433 		case L'B':
1434 		  result = tre_ast_new_literal(ctx->mem, ASSERTION,
1435 					       ASSERT_AT_WB_NEG, -1);
1436 		  ctx->re++;
1437 		  break;
1438 		case L'<':
1439 		  result = tre_ast_new_literal(ctx->mem, ASSERTION,
1440 					       ASSERT_AT_BOW, -1);
1441 		  ctx->re++;
1442 		  break;
1443 		case L'>':
1444 		  result = tre_ast_new_literal(ctx->mem, ASSERTION,
1445 					       ASSERT_AT_EOW, -1);
1446 		  ctx->re++;
1447 		  break;
1448 		case L'x':
1449 		  ctx->re++;
1450 		  if (ctx->re[0] != CHAR_LBRACE && ctx->re < ctx->re_end)
1451 		    {
1452 		      /* 8 bit hex char. */
1453 		      char tmp[3] = {0, 0, 0};
1454 		      long val;
1455 		      DPRINT(("tre_parse:  8 bit hex: '%.*" STRF "'\n",
1456 			      REST(ctx->re - 2)));
1457 
1458 		      if (tre_isxdigit(ctx->re[0]) && ctx->re < ctx->re_end)
1459 			{
1460 			  tmp[0] = (char)ctx->re[0];
1461 			  ctx->re++;
1462 			}
1463 		      if (tre_isxdigit(ctx->re[0]) && ctx->re < ctx->re_end)
1464 			{
1465 			  tmp[1] = (char)ctx->re[0];
1466 			  ctx->re++;
1467 			}
1468 		      val = strtol(tmp, NULL, 16);
1469 		      result = tre_ast_new_literal(ctx->mem, (int)val,
1470 						   (int)val, ctx->position);
1471 		      ctx->position++;
1472 		      break;
1473 		    }
1474 		  else if (ctx->re < ctx->re_end)
1475 		    {
1476 		      /* Wide char. */
1477 		      char tmp[32];
1478 		      long val;
1479 		      int i = 0;
1480 		      ctx->re++;
1481 		      while (ctx->re_end - ctx->re >= 0)
1482 			{
1483 			  if (ctx->re[0] == CHAR_RBRACE)
1484 			    break;
1485 			  if (tre_isxdigit(ctx->re[0]))
1486 			    {
1487 			      tmp[i] = (char)ctx->re[0];
1488 			      i++;
1489 			      ctx->re++;
1490 			      continue;
1491 			    }
1492 			  return REG_EBRACE;
1493 			}
1494 		      ctx->re++;
1495 		      tmp[i] = 0;
1496 		      val = strtol(tmp, NULL, 16);
1497 		      result = tre_ast_new_literal(ctx->mem, (int)val, (int)val,
1498 						   ctx->position);
1499 		      ctx->position++;
1500 		      break;
1501 		    }
1502 		  /*FALLTHROUGH*/
1503 
1504 		default:
1505 		  if (tre_isdigit(*ctx->re))
1506 		    {
1507 		      /* Back reference. */
1508 		      int val = *ctx->re - L'0';
1509 		      DPRINT(("tre_parse:     backref: '%.*" STRF "'\n",
1510 			      REST(ctx->re - 1)));
1511 		      result = tre_ast_new_literal(ctx->mem, BACKREF, val,
1512 						   ctx->position);
1513 		      if (result == NULL)
1514 			return REG_ESPACE;
1515 		      ctx->position++;
1516 		      ctx->max_backref = MAX(val, ctx->max_backref);
1517 		      ctx->re++;
1518 		    }
1519 		  else
1520 		    {
1521 		      /* Escaped character. */
1522 		      DPRINT(("tre_parse:     escaped: '%.*" STRF "'\n",
1523 			      REST(ctx->re - 1)));
1524 		      result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
1525 						   ctx->position);
1526 		      ctx->position++;
1527 		      ctx->re++;
1528 		    }
1529 		  break;
1530 		}
1531 	      if (result == NULL)
1532 		return REG_ESPACE;
1533 	      break;
1534 
1535 	    case CHAR_PERIOD:	 /* the any-symbol */
1536 	      DPRINT(("tre_parse:	  any: '%.*" STRF "'\n",
1537 		      REST(ctx->re)));
1538 	      if (ctx->cflags & REG_NEWLINE)
1539 		{
1540 		  tre_ast_node_t *tmp1;
1541 		  tre_ast_node_t *tmp2;
1542 		  tmp1 = tre_ast_new_literal(ctx->mem, 0, L'\n' - 1,
1543 					     ctx->position);
1544 		  if (!tmp1)
1545 		    return REG_ESPACE;
1546 		  tmp2 = tre_ast_new_literal(ctx->mem, L'\n' + 1, TRE_CHAR_MAX,
1547 					     ctx->position + 1);
1548 		  if (!tmp2)
1549 		    return REG_ESPACE;
1550 		  result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
1551 		  if (!result)
1552 		    return REG_ESPACE;
1553 		  ctx->position += 2;
1554 		}
1555 	      else
1556 		{
1557 		  result = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX,
1558 					       ctx->position);
1559 		  if (!result)
1560 		    return REG_ESPACE;
1561 		  ctx->position++;
1562 		}
1563 	      ctx->re++;
1564 	      break;
1565 
1566 	    case CHAR_CARET:	 /* beginning of line assertion */
1567 	      /* '^' has a special meaning everywhere in EREs, and in the
1568 		 beginning of the RE and after \( is BREs. */
1569 	      if (ctx->cflags & REG_EXTENDED
1570 		  || (ctx->re - 2 >= ctx->re_start
1571 		      && *(ctx->re - 2) == CHAR_BACKSLASH
1572 		      && *(ctx->re - 1) == CHAR_LPAREN)
1573 		  || ctx->re == ctx->re_start)
1574 		{
1575 		  DPRINT(("tre_parse:	      BOL: '%.*" STRF "'\n",
1576 			  REST(ctx->re)));
1577 		  result = tre_ast_new_literal(ctx->mem, ASSERTION,
1578 					       ASSERT_AT_BOL, -1);
1579 		  if (result == NULL)
1580 		    return REG_ESPACE;
1581 		  ctx->re++;
1582 		}
1583 	      else
1584 		goto parse_literal;
1585 	      break;
1586 
1587 	    case CHAR_DOLLAR:	 /* end of line assertion. */
1588 	      /* '$' is special everywhere in EREs, and in the end of the
1589 		 string and before \) is BREs. */
1590 	      if (ctx->cflags & REG_EXTENDED
1591 		  || (ctx->re + 2 < ctx->re_end
1592 		      && *(ctx->re + 1) == CHAR_BACKSLASH
1593 		      && *(ctx->re + 2) == CHAR_RPAREN)
1594 		  || ctx->re + 1 == ctx->re_end)
1595 		{
1596 		  DPRINT(("tre_parse:	      EOL: '%.*" STRF "'\n",
1597 			  REST(ctx->re)));
1598 		  result = tre_ast_new_literal(ctx->mem, ASSERTION,
1599 					       ASSERT_AT_EOL, -1);
1600 		  if (result == NULL)
1601 		    return REG_ESPACE;
1602 		  ctx->re++;
1603 		}
1604 	      else
1605 		goto parse_literal;
1606 	      break;
1607 
1608 	    default:
1609 	    parse_literal:
1610 
1611 	      if (temporary_cflags && ctx->re + 1 < ctx->re_end
1612 		  && *ctx->re == CHAR_BACKSLASH && *(ctx->re + 1) == L'E')
1613 		{
1614 		  DPRINT(("tre_parse:	 end tmps: '%.*" STRF "'\n",
1615 			  REST(ctx->re)));
1616 		  ctx->cflags &= ~temporary_cflags;
1617 		  temporary_cflags = 0;
1618 		  ctx->re += 2;
1619 		  STACK_PUSHX(stack, int, PARSE_PIECE);
1620 		  break;
1621 		}
1622 
1623 
1624 	      /* We are expecting an atom.  If the subexpression (or the whole
1625 		 regexp ends here, we interpret it as an empty expression
1626 		 (which matches an empty string).  */
1627 	      if (
1628 #ifdef REG_LITERAL
1629 		  !(ctx->cflags & REG_LITERAL) &&
1630 #endif /* REG_LITERAL */
1631 		  (ctx->re >= ctx->re_end
1632 		   || *ctx->re == CHAR_STAR
1633 		   || (ctx->cflags & REG_EXTENDED
1634 		       && (*ctx->re == CHAR_PIPE
1635 			   || *ctx->re == CHAR_LBRACE
1636 			   || *ctx->re == CHAR_PLUS
1637 			   || *ctx->re == CHAR_QUESTIONMARK))
1638 		   /* Test for "\)" in BRE mode. */
1639 		   || (!(ctx->cflags & REG_EXTENDED)
1640 		       && ctx->re + 1 < ctx->re_end
1641 		       && *ctx->re == CHAR_BACKSLASH
1642 		       && *(ctx->re + 1) == CHAR_LBRACE)))
1643 		{
1644 		  DPRINT(("tre_parse:	    empty: '%.*" STRF "'\n",
1645 			  REST(ctx->re)));
1646 		  result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1647 		  if (!result)
1648 		    return REG_ESPACE;
1649 		  break;
1650 		}
1651 
1652 	      DPRINT(("tre_parse:     literal: '%.*" STRF "'\n",
1653 		      REST(ctx->re)));
1654 	      /* Note that we can't use an tre_isalpha() test here, since there
1655 		 may be characters which are alphabetic but neither upper or
1656 		 lower case. */
1657 	      if (ctx->cflags & REG_ICASE
1658 		  && (tre_isupper(*ctx->re) || tre_islower(*ctx->re)))
1659 		{
1660 		  tre_ast_node_t *tmp1;
1661 		  tre_ast_node_t *tmp2;
1662 
1663 		  /* XXX - Can there be more than one opposite-case
1664 		     counterpoints for some character in some locale?  Or
1665 		     more than two characters which all should be regarded
1666 		     the same character if case is ignored?  If yes, there
1667 		     does not seem to be a portable way to detect it.  I guess
1668 		     that at least for multi-character collating elements there
1669 		     could be several opposite-case counterpoints, but they
1670 		     cannot be supported portably anyway. */
1671 		  tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(*ctx->re),
1672 					     tre_toupper(*ctx->re),
1673 					     ctx->position);
1674 		  if (!tmp1)
1675 		    return REG_ESPACE;
1676 		  tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(*ctx->re),
1677 					     tre_tolower(*ctx->re),
1678 					     ctx->position);
1679 		  if (!tmp2)
1680 		    return REG_ESPACE;
1681 		  result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
1682 		  if (!result)
1683 		    return REG_ESPACE;
1684 		}
1685 	      else
1686 		{
1687 		  result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
1688 					       ctx->position);
1689 		  if (!result)
1690 		    return REG_ESPACE;
1691 		}
1692 	      ctx->position++;
1693 	      ctx->re++;
1694 	      break;
1695 	    }
1696 	  break;
1697 
1698 	case PARSE_MARK_FOR_SUBMATCH:
1699 	  {
1700 	    int submatch_id = tre_stack_pop_int(stack);
1701 
1702 	    if (result->submatch_id >= 0)
1703 	      {
1704 		tre_ast_node_t *n, *tmp_node;
1705 		n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1706 		if (n == NULL)
1707 		  return REG_ESPACE;
1708 		tmp_node = tre_ast_new_catenation(ctx->mem, n, result);
1709 		if (tmp_node == NULL)
1710 		  return REG_ESPACE;
1711 		tmp_node->num_submatches = result->num_submatches;
1712 		result = tmp_node;
1713 	      }
1714 	    result->submatch_id = submatch_id;
1715 	    result->num_submatches++;
1716 	    break;
1717 	  }
1718 
1719 	case PARSE_RESTORE_CFLAGS:
1720 	  ctx->cflags = tre_stack_pop_int(stack);
1721 	  break;
1722 
1723 	default:
1724 	  assert(0);
1725 	  break;
1726 	}
1727     }
1728 
1729   /* Check for missing closing parentheses. */
1730   if (depth > 0)
1731     return REG_EPAREN;
1732 
1733   if (status == REG_OK)
1734     ctx->result = result;
1735 
1736   return status;
1737 }
1738 
1739 /* EOF */
1740