1 /*
2 tre-parse.c - Regexp parser
3
4 This software is released under a BSD-style license.
5 See the file LICENSE for details and copyright.
6
7 */
8
9 /*
10 This parser is just a simple recursive descent parser for POSIX.2
11 regexps. The parser supports both the obsolete default syntax and
12 the "extended" syntax, and some nonstandard extensions.
13 */
14
15
16 #ifdef HAVE_CONFIG_H
17 #include <config.h>
18 #endif /* HAVE_CONFIG_H */
19 #include <string.h>
20 #include <assert.h>
21 #include <limits.h>
22
23 #include "xmalloc.h"
24 #include "tre-mem.h"
25 #include "tre-ast.h"
26 #include "tre-stack.h"
27 #include "tre-parse.h"
28
29
30 /* Characters with special meanings in regexp syntax. */
31 #define CHAR_PIPE L'|'
32 #define CHAR_LPAREN L'('
33 #define CHAR_RPAREN L')'
34 #define CHAR_LBRACE L'{'
35 #define CHAR_RBRACE L'}'
36 #define CHAR_LBRACKET L'['
37 #define CHAR_RBRACKET L']'
38 #define CHAR_MINUS L'-'
39 #define CHAR_STAR L'*'
40 #define CHAR_QUESTIONMARK L'?'
41 #define CHAR_PLUS L'+'
42 #define CHAR_PERIOD L'.'
43 #define CHAR_COLON L':'
44 #define CHAR_EQUAL L'='
45 #define CHAR_COMMA L','
46 #define CHAR_CARET L'^'
47 #define CHAR_DOLLAR L'$'
48 #define CHAR_BACKSLASH L'\\'
49 #define CHAR_HASH L'#'
50 #define CHAR_TILDE L'~'
51
52
53 /* Some macros for expanding \w, \s, etc. */
54 static const struct tre_macro_struct {
55 const char c;
56 const char *expansion;
57 } tre_macros[] =
58 { {'t', "\t"}, {'n', "\n"}, {'r', "\r"},
59 {'f', "\f"}, {'a', "\a"}, {'e', "\033"},
60 {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},
61 {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"},
62 { 0, NULL }
63 };
64
65
66 /* Expands a macro delimited by `regex' and `regex_end' to `buf', which
67 must have at least `len' items. Sets buf[0] to zero if the there
68 is no match in `tre_macros'. */
69 static void
tre_expand_macro(const tre_char_t * regex,const tre_char_t * regex_end,tre_char_t * buf,size_t buf_len)70 tre_expand_macro(const tre_char_t *regex, const tre_char_t *regex_end,
71 tre_char_t *buf, size_t buf_len)
72 {
73 int i;
74
75 buf[0] = 0;
76 if (regex >= regex_end)
77 return;
78
79 for (i = 0; tre_macros[i].expansion; i++)
80 {
81 if (tre_macros[i].c == *regex)
82 {
83 unsigned int j;
84 DPRINT(("Expanding macro '%c' => '%s'\n",
85 tre_macros[i].c, tre_macros[i].expansion));
86 for (j = 0; tre_macros[i].expansion[j] && j < buf_len; j++)
87 buf[j] = tre_macros[i].expansion[j];
88 buf[j] = 0;
89 break;
90 }
91 }
92 }
93
94 static reg_errcode_t
tre_new_item(tre_mem_t mem,int min,int max,int * i,int * max_i,tre_ast_node_t *** items)95 tre_new_item(tre_mem_t mem, int min, int max, int *i, int *max_i,
96 tre_ast_node_t ***items)
97 {
98 reg_errcode_t status;
99 tre_ast_node_t **array = *items;
100 /* Allocate more space if necessary. */
101 if (*i >= *max_i)
102 {
103 tre_ast_node_t **new_items;
104 DPRINT(("out of array space, i = %d\n", *i));
105 /* If the array is already 1024 items large, give up -- there's
106 probably an error in the regexp (e.g. not a '\0' terminated
107 string and missing ']') */
108 if (*max_i > 1024)
109 return REG_ESPACE;
110 *max_i *= 2;
111 new_items = xrealloc(array, sizeof(*array) * *max_i);
112 if (new_items == NULL)
113 return REG_ESPACE;
114 *items = array = new_items;
115 }
116 array[*i] = tre_ast_new_literal(mem, min, max, -1);
117 status = array[*i] == NULL ? REG_ESPACE : REG_OK;
118 (*i)++;
119 return status;
120 }
121
122
123 /* Expands a character class to character ranges. */
124 static reg_errcode_t
tre_expand_ctype(tre_mem_t mem,tre_ctype_t class,tre_ast_node_t *** items,int * i,int * max_i,int cflags)125 tre_expand_ctype(tre_mem_t mem, tre_ctype_t class, tre_ast_node_t ***items,
126 int *i, int *max_i, int cflags)
127 {
128 reg_errcode_t status = REG_OK;
129 tre_cint_t c;
130 int j, min = -1, max = 0;
131 assert(TRE_MB_CUR_MAX == 1);
132
133 DPRINT((" expanding class to character ranges\n"));
134 for (j = 0; (j < 256) && (status == REG_OK); j++)
135 {
136 c = (tre_cint_t) j;
137 if (tre_isctype(c, class)
138 || ((cflags & REG_ICASE)
139 && (tre_isctype(tre_tolower(c), class)
140 || tre_isctype(tre_toupper(c), class))))
141 {
142 if (min < 0)
143 min = c;
144 max = c;
145 }
146 else if (min >= 0)
147 {
148 DPRINT((" range %c (%d) to %c (%d)\n", min, min, max, max));
149 status = tre_new_item(mem, min, max, i, max_i, items);
150 min = -1;
151 }
152 }
153 if (min >= 0 && status == REG_OK)
154 status = tre_new_item(mem, min, max, i, max_i, items);
155 return status;
156 }
157
158
159 static int
tre_compare_items(const void * a,const void * b)160 tre_compare_items(const void *a, const void *b)
161 {
162 const tre_ast_node_t *node_a = *(tre_ast_node_t * const *)a;
163 const tre_ast_node_t *node_b = *(tre_ast_node_t * const *)b;
164 tre_literal_t *l_a = node_a->obj, *l_b = node_b->obj;
165 int a_min = l_a->code_min, b_min = l_b->code_min;
166
167 if (a_min < b_min)
168 return -1;
169 else if (a_min > b_min)
170 return 1;
171 else
172 return 0;
173 }
174
175 #ifndef TRE_USE_SYSTEM_WCTYPE
176
177 int tre_isalnum_func(tre_cint_t);
178 int tre_isalpha_func(tre_cint_t);
179 int tre_isascii_func(tre_cint_t);
180 int tre_isblank_func(tre_cint_t);
181 int tre_iscntrl_func(tre_cint_t);
182 int tre_isdigit_func(tre_cint_t);
183 int tre_isgraph_func(tre_cint_t);
184 int tre_islower_func(tre_cint_t);
185 int tre_isprint_func(tre_cint_t);
186 int tre_ispunct_func(tre_cint_t);
187 int tre_isspace_func(tre_cint_t);
188 int tre_isupper_func(tre_cint_t);
189 int tre_isxdigit_func(tre_cint_t);
190
191 /* isalnum() and the rest may be macros, so wrap them to functions. */
tre_isalnum_func(tre_cint_t c)192 int tre_isalnum_func(tre_cint_t c) { return tre_isalnum(c); }
tre_isalpha_func(tre_cint_t c)193 int tre_isalpha_func(tre_cint_t c) { return tre_isalpha(c); }
194
195 #ifdef tre_isascii
tre_isascii_func(tre_cint_t c)196 int tre_isascii_func(tre_cint_t c) { return tre_isascii(c); }
197 #else /* !tre_isascii */
tre_isascii_func(tre_cint_t c)198 int tre_isascii_func(tre_cint_t c) { return !(c >> 7); }
199 #endif /* !tre_isascii */
200
201 #ifdef tre_isblank
tre_isblank_func(tre_cint_t c)202 int tre_isblank_func(tre_cint_t c) { return tre_isblank(c); }
203 #else /* !tre_isblank */
tre_isblank_func(tre_cint_t c)204 int tre_isblank_func(tre_cint_t c) { return ((c == ' ') || (c == '\t')); }
205 #endif /* !tre_isblank */
206
tre_iscntrl_func(tre_cint_t c)207 int tre_iscntrl_func(tre_cint_t c) { return tre_iscntrl(c); }
tre_isdigit_func(tre_cint_t c)208 int tre_isdigit_func(tre_cint_t c) { return tre_isdigit(c); }
tre_isgraph_func(tre_cint_t c)209 int tre_isgraph_func(tre_cint_t c) { return tre_isgraph(c); }
tre_islower_func(tre_cint_t c)210 int tre_islower_func(tre_cint_t c) { return tre_islower(c); }
tre_isprint_func(tre_cint_t c)211 int tre_isprint_func(tre_cint_t c) { return tre_isprint(c); }
tre_ispunct_func(tre_cint_t c)212 int tre_ispunct_func(tre_cint_t c) { return tre_ispunct(c); }
tre_isspace_func(tre_cint_t c)213 int tre_isspace_func(tre_cint_t c) { return tre_isspace(c); }
tre_isupper_func(tre_cint_t c)214 int tre_isupper_func(tre_cint_t c) { return tre_isupper(c); }
tre_isxdigit_func(tre_cint_t c)215 int tre_isxdigit_func(tre_cint_t c) { return tre_isxdigit(c); }
216
217 struct {
218 const char *name;
219 int (*func)(tre_cint_t);
220 } tre_ctype_map[] = {
221 { "alnum", &tre_isalnum_func },
222 { "alpha", &tre_isalpha_func },
223 #ifdef tre_isascii
224 { "ascii", &tre_isascii_func },
225 #endif /* tre_isascii */
226 #ifdef tre_isblank
227 { "blank", &tre_isblank_func },
228 #endif /* tre_isblank */
229 { "cntrl", &tre_iscntrl_func },
230 { "digit", &tre_isdigit_func },
231 { "graph", &tre_isgraph_func },
232 { "lower", &tre_islower_func },
233 { "print", &tre_isprint_func },
234 { "punct", &tre_ispunct_func },
235 { "space", &tre_isspace_func },
236 { "upper", &tre_isupper_func },
237 { "xdigit", &tre_isxdigit_func },
238 { NULL, NULL}
239 };
240
tre_ctype(const char * name)241 tre_ctype_t tre_ctype(const char *name)
242 {
243 int i;
244 for (i = 0; tre_ctype_map[i].name != NULL; i++)
245 {
246 if (strcmp(name, tre_ctype_map[i].name) == 0)
247 return tre_ctype_map[i].func;
248 }
249 return (tre_ctype_t)0;
250 }
251 #endif /* !TRE_USE_SYSTEM_WCTYPE */
252
253 /* Maximum number of character classes that can occur in a negated bracket
254 expression. */
255 #define MAX_NEG_CLASSES 64
256
257 /* Maximum length of character class names. */
258 #define MAX_CLASS_NAME
259
260 #define REST(re) (int)(ctx->re_end - (re)), (re)
261
262 static reg_errcode_t
tre_parse_bracket_items(tre_parse_ctx_t * ctx,int negate,tre_ctype_t neg_classes[],int * num_neg_classes,tre_ast_node_t *** items,int * num_items,int * items_size)263 tre_parse_bracket_items(tre_parse_ctx_t *ctx, int negate,
264 tre_ctype_t neg_classes[], int *num_neg_classes,
265 tre_ast_node_t ***items, int *num_items,
266 int *items_size)
267 {
268 const tre_char_t *re = ctx->re;
269 reg_errcode_t status = REG_OK;
270 tre_ctype_t class = (tre_ctype_t)0;
271 int i = *num_items;
272 int max_i = *items_size;
273 int skip;
274
275 /* Build an array of the items in the bracket expression. */
276 while (status == REG_OK)
277 {
278 skip = 0;
279 if (re == ctx->re_end)
280 {
281 status = REG_EBRACK;
282 }
283 else if (*re == CHAR_RBRACKET && re > ctx->re)
284 {
285 DPRINT(("tre_parse_bracket: done: '%.*" STRF "'\n", REST(re)));
286 re++;
287 break;
288 }
289 else
290 {
291 tre_cint_t min = 0, max = 0;
292
293 class = (tre_ctype_t)0;
294 if (re + 2 < ctx->re_end
295 && *(re + 1) == CHAR_MINUS && *(re + 2) != CHAR_RBRACKET)
296 {
297 DPRINT(("tre_parse_bracket: range: '%.*" STRF "'\n", REST(re)));
298 min = *re;
299 max = *(re + 2);
300 re += 3;
301 /* XXX - Should use collation order instead of encoding values
302 in character ranges. */
303 if (min > max)
304 status = REG_ERANGE;
305 }
306 else if (re + 1 < ctx->re_end
307 && *re == CHAR_LBRACKET && *(re + 1) == CHAR_PERIOD)
308 status = REG_ECOLLATE;
309 else if (re + 1 < ctx->re_end
310 && *re == CHAR_LBRACKET && *(re + 1) == CHAR_EQUAL)
311 status = REG_ECOLLATE;
312 else if (re + 1 < ctx->re_end
313 && *re == CHAR_LBRACKET && *(re + 1) == CHAR_COLON)
314 {
315 char tmp_str[64];
316 const tre_char_t *endptr = re + 2;
317 size_t len;
318 DPRINT(("tre_parse_bracket: class: '%.*" STRF "'\n", REST(re)));
319 while (endptr < ctx->re_end && *endptr != CHAR_COLON)
320 endptr++;
321 if (endptr != ctx->re_end)
322 {
323 len = MIN(endptr - re - 2, 63);
324 #ifdef TRE_WCHAR
325 {
326 tre_char_t tmp_wcs[64];
327 wcsncpy(tmp_wcs, re + 2, (size_t)len);
328 tmp_wcs[len] = L'\0';
329 #if defined HAVE_WCSRTOMBS
330 {
331 mbstate_t state;
332 const tre_char_t *src = tmp_wcs;
333 memset(&state, '\0', sizeof(state));
334 len = wcsrtombs(tmp_str, &src, sizeof(tmp_str), &state);
335 }
336 #elif defined HAVE_WCSTOMBS
337 len = wcstombs(tmp_str, tmp_wcs, 63);
338 #endif /* defined HAVE_WCSTOMBS */
339 }
340 #else /* !TRE_WCHAR */
341 /* LINTED */strncpy(tmp_str, (const char*)re + 2, len);
342 #endif /* !TRE_WCHAR */
343 tmp_str[len] = '\0';
344 DPRINT((" class name: %s\n", tmp_str));
345 class = tre_ctype(tmp_str);
346 if (!class)
347 status = REG_ECTYPE;
348 /* Optimize character classes for 8 bit character sets. */
349 if (status == REG_OK && ctx->cur_max == 1)
350 {
351 status = tre_expand_ctype(ctx->mem, class, items,
352 &i, &max_i, ctx->cflags);
353 class = (tre_ctype_t)0;
354 skip = 1;
355 }
356 re = endptr + 2;
357 }
358 else
359 status = REG_ECTYPE;
360 min = 0;
361 max = TRE_CHAR_MAX;
362 }
363 else
364 {
365 DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re)));
366 if (*re == CHAR_MINUS && *(re + 1) != CHAR_RBRACKET
367 && ctx->re != re)
368 /* Two ranges are not allowed to share and endpoint. */
369 status = REG_ERANGE;
370 min = max = *re++;
371 }
372
373 if (status != REG_OK)
374 break;
375
376 if (class && negate)
377 if (*num_neg_classes >= MAX_NEG_CLASSES)
378 status = REG_ESPACE;
379 else
380 neg_classes[(*num_neg_classes)++] = class;
381 else if (!skip)
382 {
383 status = tre_new_item(ctx->mem, min, max, &i, &max_i, items);
384 if (status != REG_OK)
385 break;
386 ((tre_literal_t*)((*items)[i-1])->obj)->u.class = class;
387 }
388
389 /* Add opposite-case counterpoints if REG_ICASE is present.
390 This is broken if there are more than two "same" characters. */
391 if (ctx->cflags & REG_ICASE && !class && status == REG_OK && !skip)
392 {
393 tre_cint_t cmin, ccurr;
394
395 DPRINT(("adding opposite-case counterpoints\n"));
396 while (min <= max)
397 {
398 if (tre_islower(min))
399 {
400 cmin = ccurr = tre_toupper(min++);
401 while (tre_islower(min) && tre_toupper(min) == ccurr + 1
402 && min <= max)
403 ccurr = tre_toupper(min++);
404 status = tre_new_item(ctx->mem, cmin, ccurr,
405 &i, &max_i, items);
406 }
407 else if (tre_isupper(min))
408 {
409 cmin = ccurr = tre_tolower(min++);
410 while (tre_isupper(min) && tre_tolower(min) == ccurr + 1
411 && min <= max)
412 ccurr = tre_tolower(min++);
413 status = tre_new_item(ctx->mem, cmin, ccurr,
414 &i, &max_i, items);
415 }
416 else min++;
417 if (status != REG_OK)
418 break;
419 }
420 if (status != REG_OK)
421 break;
422 }
423 }
424 }
425 *num_items = i;
426 *items_size = max_i;
427 ctx->re = re;
428 return status;
429 }
430
431 static reg_errcode_t
tre_parse_bracket(tre_parse_ctx_t * ctx,tre_ast_node_t ** result)432 tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
433 {
434 tre_ast_node_t *node = NULL;
435 int negate = 0;
436 reg_errcode_t status = REG_OK;
437 tre_ast_node_t **items, *u, *n;
438 int i = 0, j, max_i = 32, curr_max, curr_min;
439 tre_ctype_t neg_classes[MAX_NEG_CLASSES];
440 int num_neg_classes = 0;
441
442 /* Start off with an array of `max_i' elements. */
443 items = xmalloc(sizeof(*items) * max_i);
444 if (items == NULL)
445 return REG_ESPACE;
446
447 if (*ctx->re == CHAR_CARET)
448 {
449 DPRINT(("tre_parse_bracket: negate: '%.*" STRF "'\n", REST(ctx->re)));
450 negate = 1;
451 ctx->re++;
452 }
453
454 status = tre_parse_bracket_items(ctx, negate, neg_classes, &num_neg_classes,
455 &items, &i, &max_i);
456
457 if (status != REG_OK)
458 goto parse_bracket_done;
459
460 /* Sort the array if we need to negate it. */
461 if (negate)
462 qsort(items, (unsigned)i, sizeof(*items), tre_compare_items);
463
464 curr_max = curr_min = 0;
465 /* Build a union of the items in the array, negated if necessary. */
466 for (j = 0; j < i && status == REG_OK; j++)
467 {
468 int min, max;
469 tre_literal_t *l = items[j]->obj;
470 min = l->code_min;
471 max = l->code_max;
472
473 DPRINT(("item: %d - %d, class %ld, curr_max = %d\n",
474 (int)l->code_min, (int)l->code_max, (long)l->u.class, curr_max));
475
476 if (negate)
477 {
478 if (min < curr_max)
479 {
480 /* Overlap. */
481 curr_max = MAX(max + 1, curr_max);
482 DPRINT(("overlap, curr_max = %d\n", curr_max));
483 l = NULL;
484 }
485 else
486 {
487 /* No overlap. */
488 curr_max = min - 1;
489 if (curr_max >= curr_min)
490 {
491 DPRINT(("no overlap\n"));
492 l->code_min = curr_min;
493 l->code_max = curr_max;
494 }
495 else
496 {
497 DPRINT(("no overlap, zero room\n"));
498 l = NULL;
499 }
500 curr_min = curr_max = max + 1;
501 }
502 }
503
504 if (l != NULL)
505 {
506 int k;
507 DPRINT(("creating %d - %d\n", (int)l->code_min, (int)l->code_max));
508 l->position = ctx->position;
509 if (num_neg_classes > 0)
510 {
511 l->neg_classes = tre_mem_alloc(ctx->mem,
512 (sizeof(*l->neg_classes)
513 * (num_neg_classes + 1)));
514 if (l->neg_classes == NULL)
515 {
516 status = REG_ESPACE;
517 break;
518 }
519 for (k = 0; k < num_neg_classes; k++)
520 l->neg_classes[k] = neg_classes[k];
521 l->neg_classes[k] = (tre_ctype_t)0;
522 }
523 else
524 l->neg_classes = NULL;
525 if (node == NULL)
526 node = items[j];
527 else
528 {
529 u = tre_ast_new_union(ctx->mem, node, items[j]);
530 if (u == NULL)
531 status = REG_ESPACE;
532 node = u;
533 }
534 }
535 }
536
537 if (status != REG_OK)
538 goto parse_bracket_done;
539
540 if (negate)
541 {
542 int k;
543 DPRINT(("final: creating %d - %d\n", curr_min, (int)TRE_CHAR_MAX));
544 n = tre_ast_new_literal(ctx->mem, curr_min, TRE_CHAR_MAX, ctx->position);
545 if (n == NULL)
546 status = REG_ESPACE;
547 else
548 {
549 tre_literal_t *l = n->obj;
550 if (num_neg_classes > 0)
551 {
552 l->neg_classes = tre_mem_alloc(ctx->mem,
553 (sizeof(*l->neg_classes)
554 * (num_neg_classes + 1)));
555 if (l->neg_classes == NULL)
556 {
557 status = REG_ESPACE;
558 goto parse_bracket_done;
559 }
560 for (k = 0; k < num_neg_classes; k++)
561 l->neg_classes[k] = neg_classes[k];
562 l->neg_classes[k] = (tre_ctype_t)0;
563 }
564 else
565 l->neg_classes = NULL;
566 if (node == NULL)
567 node = n;
568 else
569 {
570 u = tre_ast_new_union(ctx->mem, node, n);
571 if (u == NULL)
572 status = REG_ESPACE;
573 node = u;
574 }
575 }
576 }
577
578 if (status != REG_OK)
579 goto parse_bracket_done;
580
581 #ifdef TRE_DEBUG
582 tre_ast_print(node);
583 #endif /* TRE_DEBUG */
584
585 parse_bracket_done:
586 xfree(items);
587 ctx->position++;
588 *result = node;
589 return status;
590 }
591
592
593 /* Parses a positive decimal integer. Returns -1 if the string does not
594 contain a valid number. */
595 static int
tre_parse_int(const tre_char_t ** regex,const tre_char_t * regex_end)596 tre_parse_int(const tre_char_t **regex, const tre_char_t *regex_end)
597 {
598 int num = -1;
599 const tre_char_t *r = *regex;
600 while (r < regex_end && *r >= L'0' && *r <= L'9')
601 {
602 if (num < 0)
603 num = 0;
604 num = num * 10 + *r - L'0';
605 r++;
606 }
607 *regex = r;
608 return num;
609 }
610
611
612 static reg_errcode_t
tre_parse_bound(tre_parse_ctx_t * ctx,tre_ast_node_t ** result)613 tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
614 {
615 int min, max, i;
616 int cost_ins, cost_del, cost_subst, cost_max;
617 int limit_ins, limit_del, limit_subst, limit_err;
618 const tre_char_t *r = ctx->re;
619 const tre_char_t *start;
620 int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0;
621 int approx = 0;
622 int costs_set = 0;
623 int counts_set = 0;
624
625 cost_ins = cost_del = cost_subst = cost_max = TRE_PARAM_UNSET;
626 limit_ins = limit_del = limit_subst = limit_err = TRE_PARAM_UNSET;
627
628 /* Parse number (minimum repetition count). */
629 min = -1;
630 if (r < ctx->re_end && *r >= L'0' && *r <= L'9') {
631 DPRINT(("tre_parse: min count: '%.*" STRF "'\n", REST(r)));
632 min = tre_parse_int(&r, ctx->re_end);
633 }
634
635 /* Parse comma and second number (maximum repetition count). */
636 max = min;
637 if (r < ctx->re_end && *r == CHAR_COMMA)
638 {
639 r++;
640 DPRINT(("tre_parse: max count: '%.*" STRF "'\n", REST(r)));
641 max = tre_parse_int(&r, ctx->re_end);
642 }
643
644 /* Check that the repeat counts are sane. */
645 if ((max >= 0 && min > max) || max > RE_DUP_MAX)
646 return REG_BADBR;
647
648
649 /*
650 '{'
651 optionally followed immediately by a number == minimum repcount
652 optionally followed by , then a number == maximum repcount
653 + then a number == maximum insertion count
654 - then a number == maximum deletion count
655 # then a number == maximum substitution count
656 ~ then a number == maximum number of errors
657 Any of +, -, # or ~ without followed by a number means that
658 the maximum count/number of errors is infinite.
659
660 An equation of the form
661 Xi + Yd + Zs < C
662 can be specified to set costs and the cost limit to a value
663 different from the default value:
664 - X is the cost of an insertion
665 - Y is the cost of a deletion
666 - Z is the cost of a substitution
667 - C is the maximum cost
668
669 If no count limit or cost is set for an operation, the operation
670 is not allowed at all.
671 */
672
673
674 do {
675 int done;
676 start = r;
677
678 /* Parse count limit settings */
679 done = 0;
680 if (!counts_set)
681 while (r + 1 < ctx->re_end && !done)
682 {
683 switch (*r)
684 {
685 case CHAR_PLUS: /* Insert limit */
686 DPRINT(("tre_parse: ins limit: '%.*" STRF "'\n", REST(r)));
687 r++;
688 limit_ins = tre_parse_int(&r, ctx->re_end);
689 if (limit_ins < 0)
690 limit_ins = INT_MAX;
691 counts_set = 1;
692 break;
693 case CHAR_MINUS: /* Delete limit */
694 DPRINT(("tre_parse: del limit: '%.*" STRF "'\n", REST(r)));
695 r++;
696 limit_del = tre_parse_int(&r, ctx->re_end);
697 if (limit_del < 0)
698 limit_del = INT_MAX;
699 counts_set = 1;
700 break;
701 case CHAR_HASH: /* Substitute limit */
702 DPRINT(("tre_parse: subst limit: '%.*" STRF "'\n", REST(r)));
703 r++;
704 limit_subst = tre_parse_int(&r, ctx->re_end);
705 if (limit_subst < 0)
706 limit_subst = INT_MAX;
707 counts_set = 1;
708 break;
709 case CHAR_TILDE: /* Maximum number of changes */
710 DPRINT(("tre_parse: count limit: '%.*" STRF "'\n", REST(r)));
711 r++;
712 limit_err = tre_parse_int(&r, ctx->re_end);
713 if (limit_err < 0)
714 limit_err = INT_MAX;
715 approx = 1;
716 break;
717 case CHAR_COMMA:
718 r++;
719 break;
720 case L' ':
721 r++;
722 break;
723 case L'}':
724 done = 1;
725 break;
726 default:
727 done = 1;
728 break;
729 }
730 }
731
732 /* Parse cost restriction equation. */
733 done = 0;
734 if (!costs_set)
735 while (r + 1 < ctx->re_end && !done)
736 {
737 switch (*r)
738 {
739 case CHAR_PLUS:
740 case L' ':
741 r++;
742 break;
743 case L'<':
744 DPRINT(("tre_parse: max cost: '%.*" STRF "'\n", REST(r)));
745 r++;
746 while (*r == L' ')
747 r++;
748 cost_max = tre_parse_int(&r, ctx->re_end);
749 if (cost_max < 0)
750 cost_max = INT_MAX;
751 else
752 cost_max--;
753 approx = 1;
754 break;
755 case CHAR_COMMA:
756 r++;
757 done = 1;
758 break;
759 default:
760 if (*r >= L'0' && *r <= L'9')
761 {
762 #ifdef TRE_DEBUG
763 const tre_char_t *sr = r;
764 #endif /* TRE_DEBUG */
765 int cost = tre_parse_int(&r, ctx->re_end);
766 /* XXX - make sure r is not past end. */
767 switch (*r)
768 {
769 case L'i': /* Insert cost */
770 DPRINT(("tre_parse: ins cost: '%.*" STRF "'\n",
771 REST(sr)));
772 r++;
773 cost_ins = cost;
774 costs_set = 1;
775 break;
776 case L'd': /* Delete cost */
777 DPRINT(("tre_parse: del cost: '%.*" STRF "'\n",
778 REST(sr)));
779 r++;
780 cost_del = cost;
781 costs_set = 1;
782 break;
783 case L's': /* Substitute cost */
784 DPRINT(("tre_parse: subst cost: '%.*" STRF "'\n",
785 REST(sr)));
786 r++;
787 cost_subst = cost;
788 costs_set = 1;
789 break;
790 default:
791 return REG_BADBR;
792 }
793 }
794 else
795 {
796 done = 1;
797 break;
798 }
799 }
800 }
801 } while (start != r);
802
803 /* Missing }. */
804 if (r >= ctx->re_end)
805 return REG_EBRACE;
806
807 /* Empty contents of {}. */
808 if (r == ctx->re)
809 return REG_BADBR;
810
811 /* Parse the ending '}' or '\}'.*/
812 if (ctx->cflags & REG_EXTENDED)
813 {
814 if (r >= ctx->re_end || *r != CHAR_RBRACE)
815 return REG_BADBR;
816 r++;
817 }
818 else
819 {
820 if (r + 1 >= ctx->re_end
821 || *r != CHAR_BACKSLASH
822 || *(r + 1) != CHAR_RBRACE)
823 return REG_BADBR;
824 r += 2;
825 }
826
827
828 /* Parse trailing '?' marking minimal repetition. */
829 if (r < ctx->re_end)
830 {
831 if (*r == CHAR_QUESTIONMARK)
832 {
833 minimal = !(ctx->cflags & REG_UNGREEDY);
834 r++;
835 }
836 else if (*r == CHAR_STAR || *r == CHAR_PLUS)
837 {
838 /* These are reserved for future extensions. */
839 return REG_BADRPT;
840 }
841 }
842
843 /* Create the AST node(s). */
844 if (min == 0 && max == 0)
845 {
846 *result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
847 if (*result == NULL)
848 return REG_ESPACE;
849 }
850 else
851 {
852 if (min < 0 && max < 0)
853 /* Only approximate parameters set, no repetitions. */
854 min = max = 1;
855
856 *result = tre_ast_new_iter(ctx->mem, *result, min, max, minimal);
857 if (!*result)
858 return REG_ESPACE;
859
860 /* If approximate matching parameters are set, add them to the
861 iteration node. */
862 if (approx || costs_set || counts_set)
863 {
864 int *params;
865 tre_iteration_t *iter = (*result)->obj;
866
867 if (costs_set || counts_set)
868 {
869 if (limit_ins == TRE_PARAM_UNSET)
870 {
871 if (cost_ins == TRE_PARAM_UNSET)
872 limit_ins = 0;
873 else
874 limit_ins = INT_MAX;
875 }
876
877 if (limit_del == TRE_PARAM_UNSET)
878 {
879 if (cost_del == TRE_PARAM_UNSET)
880 limit_del = 0;
881 else
882 limit_del = INT_MAX;
883 }
884
885 if (limit_subst == TRE_PARAM_UNSET)
886 {
887 if (cost_subst == TRE_PARAM_UNSET)
888 limit_subst = 0;
889 else
890 limit_subst = INT_MAX;
891 }
892 }
893
894 if (cost_max == TRE_PARAM_UNSET)
895 cost_max = INT_MAX;
896 if (limit_err == TRE_PARAM_UNSET)
897 limit_err = INT_MAX;
898
899 ctx->have_approx = 1;
900 params = tre_mem_alloc(ctx->mem, sizeof(*params) * TRE_PARAM_LAST);
901 if (!params)
902 return REG_ESPACE;
903 for (i = 0; i < TRE_PARAM_LAST; i++)
904 params[i] = TRE_PARAM_UNSET;
905 params[TRE_PARAM_COST_INS] = cost_ins;
906 params[TRE_PARAM_COST_DEL] = cost_del;
907 params[TRE_PARAM_COST_SUBST] = cost_subst;
908 params[TRE_PARAM_COST_MAX] = cost_max;
909 params[TRE_PARAM_MAX_INS] = limit_ins;
910 params[TRE_PARAM_MAX_DEL] = limit_del;
911 params[TRE_PARAM_MAX_SUBST] = limit_subst;
912 params[TRE_PARAM_MAX_ERR] = limit_err;
913 iter->params = params;
914 }
915 }
916
917 DPRINT(("tre_parse_bound: min %d, max %d, costs [%d,%d,%d, total %d], "
918 "limits [%d,%d,%d, total %d]\n",
919 min, max, cost_ins, cost_del, cost_subst, cost_max,
920 limit_ins, limit_del, limit_subst, limit_err));
921
922
923 ctx->re = r;
924 return REG_OK;
925 }
926
927 typedef enum {
928 PARSE_RE = 0,
929 PARSE_ATOM,
930 PARSE_MARK_FOR_SUBMATCH,
931 PARSE_BRANCH,
932 PARSE_PIECE,
933 PARSE_CATENATION,
934 PARSE_POST_CATENATION,
935 PARSE_UNION,
936 PARSE_POST_UNION,
937 PARSE_POSTFIX,
938 PARSE_RESTORE_CFLAGS
939 } tre_parse_re_stack_symbol_t;
940
941
942 reg_errcode_t
tre_parse(tre_parse_ctx_t * ctx)943 tre_parse(tre_parse_ctx_t *ctx)
944 {
945 tre_ast_node_t *result = NULL;
946 tre_parse_re_stack_symbol_t symbol;
947 reg_errcode_t status = REG_OK;
948 tre_stack_t *stack = ctx->stack;
949 int bottom = tre_stack_num_objects(stack);
950 int depth = 0;
951 int temporary_cflags = 0;
952
953 DPRINT(("tre_parse: parsing '%.*" STRF "', len = %d\n",
954 ctx->len, ctx->re, ctx->len));
955
956 if (!ctx->nofirstsub)
957 {
958 STACK_PUSH(stack, long, ctx->submatch_id);
959 STACK_PUSH(stack, long, PARSE_MARK_FOR_SUBMATCH);
960 ctx->submatch_id++;
961 }
962 STACK_PUSH(stack, long, PARSE_RE);
963 ctx->re_start = ctx->re;
964 ctx->re_end = ctx->re + ctx->len;
965
966
967 /* The following is basically just a recursive descent parser. I use
968 an explicit stack instead of recursive functions mostly because of
969 two reasons: compatibility with systems which have an overflowable
970 call stack, and efficiency (both in lines of code and speed). */
971 while (tre_stack_num_objects(stack) > bottom && status == REG_OK)
972 {
973 if (status != REG_OK)
974 break;
975 symbol = tre_stack_pop_int(stack);
976 switch (symbol)
977 {
978 case PARSE_RE:
979 /* Parse a full regexp. A regexp is one or more branches,
980 separated by the union operator `|'. */
981 #ifdef REG_LITERAL
982 if (!(ctx->cflags & REG_LITERAL)
983 && ctx->cflags & REG_EXTENDED)
984 #endif /* REG_LITERAL */
985 STACK_PUSHX(stack, long, PARSE_UNION);
986 STACK_PUSHX(stack, long, PARSE_BRANCH);
987 break;
988
989 case PARSE_BRANCH:
990 /* Parse a branch. A branch is one or more pieces, concatenated.
991 A piece is an atom possibly followed by a postfix operator. */
992 STACK_PUSHX(stack, long, PARSE_CATENATION);
993 STACK_PUSHX(stack, long, PARSE_PIECE);
994 break;
995
996 case PARSE_PIECE:
997 /* Parse a piece. A piece is an atom possibly followed by one
998 or more postfix operators. */
999 #ifdef REG_LITERAL
1000 if (!(ctx->cflags & REG_LITERAL))
1001 #endif /* REG_LITERAL */
1002 STACK_PUSHX(stack, long, PARSE_POSTFIX);
1003 STACK_PUSHX(stack, long, PARSE_ATOM);
1004 break;
1005
1006 case PARSE_CATENATION:
1007 /* If the expression has not ended, parse another piece. */
1008 {
1009 tre_char_t c;
1010 if (ctx->re >= ctx->re_end)
1011 break;
1012 c = *ctx->re;
1013 #ifdef REG_LITERAL
1014 if (!(ctx->cflags & REG_LITERAL))
1015 {
1016 #endif /* REG_LITERAL */
1017 if (ctx->cflags & REG_EXTENDED && c == CHAR_PIPE)
1018 break;
1019 if ((ctx->cflags & REG_EXTENDED
1020 && c == CHAR_RPAREN && depth > 0)
1021 || (!(ctx->cflags & REG_EXTENDED)
1022 && (c == CHAR_BACKSLASH
1023 && *(ctx->re + 1) == CHAR_RPAREN)))
1024 {
1025 if (!(ctx->cflags & REG_EXTENDED) && depth == 0)
1026 status = REG_EPAREN;
1027 DPRINT(("tre_parse: group end: '%.*" STRF "'\n",
1028 REST(ctx->re)));
1029 depth--;
1030 if (!(ctx->cflags & REG_EXTENDED))
1031 ctx->re += 2;
1032 break;
1033 }
1034 #ifdef REG_LITERAL
1035 }
1036 #endif /* REG_LITERAL */
1037
1038 #ifdef REG_RIGHT_ASSOC
1039 if (ctx->cflags & REG_RIGHT_ASSOC)
1040 {
1041 /* Right associative concatenation. */
1042 STACK_PUSHX(stack, voidptr, result);
1043 STACK_PUSHX(stack, long, PARSE_POST_CATENATION);
1044 STACK_PUSHX(stack, long, PARSE_CATENATION);
1045 STACK_PUSHX(stack, long, PARSE_PIECE);
1046 }
1047 else
1048 #endif /* REG_RIGHT_ASSOC */
1049 {
1050 /* Default case, left associative concatenation. */
1051 STACK_PUSHX(stack, long, PARSE_CATENATION);
1052 STACK_PUSHX(stack, voidptr, result);
1053 STACK_PUSHX(stack, long, PARSE_POST_CATENATION);
1054 STACK_PUSHX(stack, long, PARSE_PIECE);
1055 }
1056 break;
1057 }
1058
1059 case PARSE_POST_CATENATION:
1060 {
1061 tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1062 tre_ast_node_t *tmp_node;
1063 tmp_node = tre_ast_new_catenation(ctx->mem, tree, result);
1064 if (!tmp_node)
1065 return REG_ESPACE;
1066 result = tmp_node;
1067 break;
1068 }
1069
1070 case PARSE_UNION:
1071 if (ctx->re >= ctx->re_end)
1072 break;
1073 #ifdef REG_LITERAL
1074 if (ctx->cflags & REG_LITERAL)
1075 break;
1076 #endif /* REG_LITERAL */
1077 switch (*ctx->re)
1078 {
1079 case CHAR_PIPE:
1080 DPRINT(("tre_parse: union: '%.*" STRF "'\n",
1081 REST(ctx->re)));
1082 STACK_PUSHX(stack, long, PARSE_UNION);
1083 STACK_PUSHX(stack, voidptr, result);
1084 STACK_PUSHX(stack, long, PARSE_POST_UNION);
1085 STACK_PUSHX(stack, long, PARSE_BRANCH);
1086 ctx->re++;
1087 break;
1088
1089 case CHAR_RPAREN:
1090 ctx->re++;
1091 break;
1092
1093 default:
1094 break;
1095 }
1096 break;
1097
1098 case PARSE_POST_UNION:
1099 {
1100 tre_ast_node_t *tmp_node;
1101 tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1102 tmp_node = tre_ast_new_union(ctx->mem, tree, result);
1103 if (!tmp_node)
1104 return REG_ESPACE;
1105 result = tmp_node;
1106 break;
1107 }
1108
1109 case PARSE_POSTFIX:
1110 /* Parse postfix operators. */
1111 if (ctx->re >= ctx->re_end)
1112 break;
1113 #ifdef REG_LITERAL
1114 if (ctx->cflags & REG_LITERAL)
1115 break;
1116 #endif /* REG_LITERAL */
1117 switch (*ctx->re)
1118 {
1119 case CHAR_PLUS:
1120 case CHAR_QUESTIONMARK:
1121 if (!(ctx->cflags & REG_EXTENDED))
1122 break;
1123 /*FALLTHROUGH*/
1124 case CHAR_STAR:
1125 {
1126 tre_ast_node_t *tmp_node;
1127 int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0;
1128 int rep_min = 0;
1129 int rep_max = -1;
1130 #ifdef TRE_DEBUG
1131 const tre_char_t *tmp_re;
1132 #endif
1133
1134 if (*ctx->re == CHAR_PLUS)
1135 rep_min = 1;
1136 if (*ctx->re == CHAR_QUESTIONMARK)
1137 rep_max = 1;
1138 #ifdef TRE_DEBUG
1139 tmp_re = ctx->re;
1140 #endif
1141
1142 if (ctx->re + 1 < ctx->re_end)
1143 {
1144 if (*(ctx->re + 1) == CHAR_QUESTIONMARK)
1145 {
1146 minimal = !(ctx->cflags & REG_UNGREEDY);
1147 ctx->re++;
1148 }
1149 else if (*(ctx->re + 1) == CHAR_STAR
1150 || *(ctx->re + 1) == CHAR_PLUS)
1151 {
1152 /* These are reserved for future extensions. */
1153 return REG_BADRPT;
1154 }
1155 }
1156
1157 DPRINT(("tre_parse: %s star: '%.*" STRF "'\n",
1158 minimal ? " minimal" : "greedy", REST(tmp_re)));
1159 ctx->re++;
1160 tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max,
1161 minimal);
1162 if (tmp_node == NULL)
1163 return REG_ESPACE;
1164 result = tmp_node;
1165 STACK_PUSHX(stack, long, PARSE_POSTFIX);
1166 }
1167 break;
1168
1169 case CHAR_BACKSLASH:
1170 /* "\{" is special without REG_EXTENDED */
1171 if (!(ctx->cflags & REG_EXTENDED)
1172 && ctx->re + 1 < ctx->re_end
1173 && *(ctx->re + 1) == CHAR_LBRACE)
1174 {
1175 ctx->re++;
1176 goto parse_brace;
1177 }
1178 else
1179 break;
1180
1181 case CHAR_LBRACE:
1182 /* "{" is literal without REG_EXTENDED */
1183 if (!(ctx->cflags & REG_EXTENDED))
1184 break;
1185
1186 parse_brace:
1187 DPRINT(("tre_parse: bound: '%.*" STRF "'\n",
1188 REST(ctx->re)));
1189 ctx->re++;
1190
1191 status = tre_parse_bound(ctx, &result);
1192 if (status != REG_OK)
1193 return status;
1194 STACK_PUSHX(stack, long, PARSE_POSTFIX);
1195 break;
1196 }
1197 break;
1198
1199 case PARSE_ATOM:
1200 /* Parse an atom. An atom is a regular expression enclosed in `()',
1201 an empty set of `()', a bracket expression, `.', `^', `$',
1202 a `\' followed by a character, or a single character. */
1203
1204 /* End of regexp? (empty string). */
1205 if (ctx->re >= ctx->re_end)
1206 goto parse_literal;
1207
1208 #ifdef REG_LITERAL
1209 if (ctx->cflags & REG_LITERAL)
1210 goto parse_literal;
1211 #endif /* REG_LITERAL */
1212
1213 switch (*ctx->re)
1214 {
1215 case CHAR_LPAREN: /* parenthesized subexpression */
1216
1217 /* Handle "(?...)" extensions. They work in a way similar
1218 to Perls corresponding extensions. */
1219 if (ctx->cflags & REG_EXTENDED
1220 && *(ctx->re + 1) == CHAR_QUESTIONMARK)
1221 {
1222 int new_cflags = ctx->cflags;
1223 int bit = 1;
1224 DPRINT(("tre_parse: extension: '%.*" STRF "\n",
1225 REST(ctx->re)));
1226 ctx->re += 2;
1227 while (/*CONSTCOND*/(void)1,1)
1228 {
1229 if (*ctx->re == L'i')
1230 {
1231 DPRINT(("tre_parse: icase: '%.*" STRF "\n",
1232 REST(ctx->re)));
1233 if (bit)
1234 new_cflags |= REG_ICASE;
1235 else
1236 new_cflags &= ~REG_ICASE;
1237 ctx->re++;
1238 }
1239 else if (*ctx->re == L'n')
1240 {
1241 DPRINT(("tre_parse: newline: '%.*" STRF "\n",
1242 REST(ctx->re)));
1243 if (bit)
1244 new_cflags |= REG_NEWLINE;
1245 else
1246 new_cflags &= ~REG_NEWLINE;
1247 ctx->re++;
1248 }
1249 #ifdef REG_RIGHT_ASSOC
1250 else if (*ctx->re == L'r')
1251 {
1252 DPRINT(("tre_parse: right assoc: '%.*" STRF "\n",
1253 REST(ctx->re)));
1254 if (bit)
1255 new_cflags |= REG_RIGHT_ASSOC;
1256 else
1257 new_cflags &= ~REG_RIGHT_ASSOC;
1258 ctx->re++;
1259 }
1260 #endif /* REG_RIGHT_ASSOC */
1261 #ifdef REG_UNGREEDY
1262 else if (*ctx->re == L'U')
1263 {
1264 DPRINT(("tre_parse: ungreedy: '%.*" STRF "\n",
1265 REST(ctx->re)));
1266 if (bit)
1267 new_cflags |= REG_UNGREEDY;
1268 else
1269 new_cflags &= ~REG_UNGREEDY;
1270 ctx->re++;
1271 }
1272 #endif /* REG_UNGREEDY */
1273 else if (*ctx->re == CHAR_MINUS)
1274 {
1275 DPRINT(("tre_parse: turn off: '%.*" STRF "\n",
1276 REST(ctx->re)));
1277 ctx->re++;
1278 bit = 0;
1279 }
1280 else if (*ctx->re == CHAR_COLON)
1281 {
1282 DPRINT(("tre_parse: no group: '%.*" STRF "\n",
1283 REST(ctx->re)));
1284 ctx->re++;
1285 depth++;
1286 break;
1287 }
1288 else if (*ctx->re == CHAR_HASH)
1289 {
1290 DPRINT(("tre_parse: comment: '%.*" STRF "\n",
1291 REST(ctx->re)));
1292 /* A comment can contain any character except a
1293 right parenthesis */
1294 while (*ctx->re != CHAR_RPAREN
1295 && ctx->re < ctx->re_end)
1296 ctx->re++;
1297 if (*ctx->re == CHAR_RPAREN && ctx->re < ctx->re_end)
1298 {
1299 ctx->re++;
1300 break;
1301 }
1302 else
1303 return REG_BADPAT;
1304 }
1305 else if (*ctx->re == CHAR_RPAREN)
1306 {
1307 ctx->re++;
1308 break;
1309 }
1310 else
1311 return REG_BADPAT;
1312 }
1313
1314 /* Turn on the cflags changes for the rest of the
1315 enclosing group. */
1316 STACK_PUSHX(stack, long, ctx->cflags);
1317 STACK_PUSHX(stack, long, PARSE_RESTORE_CFLAGS);
1318 STACK_PUSHX(stack, long, PARSE_RE);
1319 ctx->cflags = new_cflags;
1320 break;
1321 }
1322
1323 if (ctx->cflags & REG_EXTENDED
1324 || (ctx->re > ctx->re_start
1325 && *(ctx->re - 1) == CHAR_BACKSLASH))
1326 {
1327 depth++;
1328 if (ctx->re + 2 < ctx->re_end
1329 && *(ctx->re + 1) == CHAR_QUESTIONMARK
1330 && *(ctx->re + 2) == CHAR_COLON)
1331 {
1332 DPRINT(("tre_parse: group begin: '%.*" STRF
1333 "', no submatch\n", REST(ctx->re)));
1334 /* Don't mark for submatching. */
1335 ctx->re += 3;
1336 STACK_PUSHX(stack, long, PARSE_RE);
1337 }
1338 else
1339 {
1340 DPRINT(("tre_parse: group begin: '%.*" STRF
1341 "', submatch %d\n", REST(ctx->re),
1342 ctx->submatch_id));
1343 ctx->re++;
1344 /* First parse a whole RE, then mark the resulting tree
1345 for submatching. */
1346 STACK_PUSHX(stack, long, ctx->submatch_id);
1347 STACK_PUSHX(stack, long, PARSE_MARK_FOR_SUBMATCH);
1348 STACK_PUSHX(stack, long, PARSE_RE);
1349 ctx->submatch_id++;
1350 }
1351 }
1352 else
1353 goto parse_literal;
1354 break;
1355
1356 case CHAR_RPAREN: /* end of current subexpression */
1357 if ((ctx->cflags & REG_EXTENDED && depth > 0)
1358 || (ctx->re > ctx->re_start
1359 && *(ctx->re - 1) == CHAR_BACKSLASH))
1360 {
1361 DPRINT(("tre_parse: empty: '%.*" STRF "'\n",
1362 REST(ctx->re)));
1363 /* We were expecting an atom, but instead the current
1364 subexpression was closed. POSIX leaves the meaning of
1365 this to be implementation-defined. We interpret this as
1366 an empty expression (which matches an empty string). */
1367 result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1368 if (result == NULL)
1369 return REG_ESPACE;
1370 if (!(ctx->cflags & REG_EXTENDED))
1371 ctx->re--;
1372 }
1373 else
1374 goto parse_literal;
1375 break;
1376
1377 case CHAR_LBRACKET: /* bracket expression */
1378 DPRINT(("tre_parse: bracket: '%.*" STRF "'\n",
1379 REST(ctx->re)));
1380 ctx->re++;
1381 status = tre_parse_bracket(ctx, &result);
1382 if (status != REG_OK)
1383 return status;
1384 break;
1385
1386 case CHAR_BACKSLASH:
1387 /* If this is "\(" or "\)" chew off the backslash and
1388 try again. */
1389 if (!(ctx->cflags & REG_EXTENDED)
1390 && ctx->re + 1 < ctx->re_end
1391 && (*(ctx->re + 1) == CHAR_LPAREN
1392 || *(ctx->re + 1) == CHAR_RPAREN))
1393 {
1394 ctx->re++;
1395 STACK_PUSHX(stack, long, PARSE_ATOM);
1396 break;
1397 }
1398
1399 /* If a macro is used, parse the expanded macro recursively. */
1400 {
1401 tre_char_t buf[64];
1402 tre_expand_macro(ctx->re + 1, ctx->re_end,
1403 buf, elementsof(buf));
1404 if (buf[0] != 0)
1405 {
1406 tre_parse_ctx_t subctx;
1407 memcpy(&subctx, ctx, sizeof(subctx));
1408 subctx.re = buf;
1409 subctx.len = tre_strlen(buf);
1410 subctx.nofirstsub = 1;
1411 status = tre_parse(&subctx);
1412 if (status != REG_OK)
1413 return status;
1414 ctx->re += 2;
1415 ctx->position = subctx.position;
1416 result = subctx.result;
1417 break;
1418 }
1419 }
1420
1421 if (ctx->re + 1 >= ctx->re_end)
1422 /* Trailing backslash. */
1423 return REG_EESCAPE;
1424
1425 #ifdef REG_LITERAL
1426 if (*(ctx->re + 1) == L'Q')
1427 {
1428 DPRINT(("tre_parse: tmp literal: '%.*" STRF "'\n",
1429 REST(ctx->re)));
1430 ctx->cflags |= REG_LITERAL;
1431 temporary_cflags |= REG_LITERAL;
1432 ctx->re += 2;
1433 STACK_PUSHX(stack, long, PARSE_ATOM);
1434 break;
1435 }
1436 #endif /* REG_LITERAL */
1437
1438 DPRINT(("tre_parse: bleep: '%.*" STRF "'\n", REST(ctx->re)));
1439 ctx->re++;
1440 switch (*ctx->re)
1441 {
1442 case L'b':
1443 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1444 ASSERT_AT_WB, -1);
1445 ctx->re++;
1446 break;
1447 case L'B':
1448 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1449 ASSERT_AT_WB_NEG, -1);
1450 ctx->re++;
1451 break;
1452 case L'<':
1453 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1454 ASSERT_AT_BOW, -1);
1455 ctx->re++;
1456 break;
1457 case L'>':
1458 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1459 ASSERT_AT_EOW, -1);
1460 ctx->re++;
1461 break;
1462 case L'x':
1463 ctx->re++;
1464 if (ctx->re[0] != CHAR_LBRACE && ctx->re < ctx->re_end)
1465 {
1466 /* 8 bit hex char. */
1467 char tmp[3] = {0, 0, 0};
1468 long val;
1469 DPRINT(("tre_parse: 8 bit hex: '%.*" STRF "'\n",
1470 REST(ctx->re - 2)));
1471
1472 if (tre_isxdigit(ctx->re[0]) && ctx->re < ctx->re_end)
1473 {
1474 tmp[0] = (char)ctx->re[0];
1475 ctx->re++;
1476 }
1477 if (tre_isxdigit(ctx->re[0]) && ctx->re < ctx->re_end)
1478 {
1479 tmp[1] = (char)ctx->re[0];
1480 ctx->re++;
1481 }
1482 val = strtol(tmp, NULL, 16);
1483 result = tre_ast_new_literal(ctx->mem, (int)val,
1484 (int)val, ctx->position);
1485 ctx->position++;
1486 break;
1487 }
1488 else if (ctx->re < ctx->re_end)
1489 {
1490 /* Wide char. */
1491 char tmp[32];
1492 long val;
1493 int i = 0;
1494 ctx->re++;
1495 while (ctx->re_end - ctx->re >= 0)
1496 {
1497 if (ctx->re[0] == CHAR_RBRACE)
1498 break;
1499 if (tre_isxdigit(ctx->re[0]))
1500 {
1501 tmp[i] = (char)ctx->re[0];
1502 i++;
1503 ctx->re++;
1504 continue;
1505 }
1506 return REG_EBRACE;
1507 }
1508 ctx->re++;
1509 tmp[i] = 0;
1510 val = strtol(tmp, NULL, 16);
1511 result = tre_ast_new_literal(ctx->mem, (int)val, (int)val,
1512 ctx->position);
1513 ctx->position++;
1514 break;
1515 }
1516 /*FALLTHROUGH*/
1517
1518 default:
1519 if (tre_isdigit(*ctx->re))
1520 {
1521 /* Back reference. */
1522 int val = *ctx->re - L'0';
1523 DPRINT(("tre_parse: backref: '%.*" STRF "'\n",
1524 REST(ctx->re - 1)));
1525 result = tre_ast_new_literal(ctx->mem, BACKREF, val,
1526 ctx->position);
1527 if (result == NULL)
1528 return REG_ESPACE;
1529 ctx->position++;
1530 ctx->max_backref = MAX(val, ctx->max_backref);
1531 ctx->re++;
1532 }
1533 else
1534 {
1535 /* Escaped character. */
1536 DPRINT(("tre_parse: escaped: '%.*" STRF "'\n",
1537 REST(ctx->re - 1)));
1538 result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
1539 ctx->position);
1540 ctx->position++;
1541 ctx->re++;
1542 }
1543 break;
1544 }
1545 if (result == NULL)
1546 return REG_ESPACE;
1547 break;
1548
1549 case CHAR_PERIOD: /* the any-symbol */
1550 DPRINT(("tre_parse: any: '%.*" STRF "'\n",
1551 REST(ctx->re)));
1552 if (ctx->cflags & REG_NEWLINE)
1553 {
1554 tre_ast_node_t *tmp1;
1555 tre_ast_node_t *tmp2;
1556 tmp1 = tre_ast_new_literal(ctx->mem, 0, L'\n' - 1,
1557 ctx->position);
1558 if (!tmp1)
1559 return REG_ESPACE;
1560 tmp2 = tre_ast_new_literal(ctx->mem, L'\n' + 1, TRE_CHAR_MAX,
1561 ctx->position + 1);
1562 if (!tmp2)
1563 return REG_ESPACE;
1564 result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
1565 if (!result)
1566 return REG_ESPACE;
1567 ctx->position += 2;
1568 }
1569 else
1570 {
1571 result = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX,
1572 ctx->position);
1573 if (!result)
1574 return REG_ESPACE;
1575 ctx->position++;
1576 }
1577 ctx->re++;
1578 break;
1579
1580 case CHAR_CARET: /* beginning of line assertion */
1581 /* '^' has a special meaning everywhere in EREs, and in the
1582 beginning of the RE and after \( is BREs. */
1583 if (ctx->cflags & REG_EXTENDED
1584 || (ctx->re - 2 >= ctx->re_start
1585 && *(ctx->re - 2) == CHAR_BACKSLASH
1586 && *(ctx->re - 1) == CHAR_LPAREN)
1587 || ctx->re == ctx->re_start)
1588 {
1589 DPRINT(("tre_parse: BOL: '%.*" STRF "'\n",
1590 REST(ctx->re)));
1591 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1592 ASSERT_AT_BOL, -1);
1593 if (result == NULL)
1594 return REG_ESPACE;
1595 ctx->re++;
1596 }
1597 else
1598 goto parse_literal;
1599 break;
1600
1601 case CHAR_DOLLAR: /* end of line assertion. */
1602 /* '$' is special everywhere in EREs, and in the end of the
1603 string and before \) is BREs. */
1604 if (ctx->cflags & REG_EXTENDED
1605 || (ctx->re + 2 < ctx->re_end
1606 && *(ctx->re + 1) == CHAR_BACKSLASH
1607 && *(ctx->re + 2) == CHAR_RPAREN)
1608 || ctx->re + 1 == ctx->re_end)
1609 {
1610 DPRINT(("tre_parse: EOL: '%.*" STRF "'\n",
1611 REST(ctx->re)));
1612 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1613 ASSERT_AT_EOL, -1);
1614 if (result == NULL)
1615 return REG_ESPACE;
1616 ctx->re++;
1617 }
1618 else
1619 goto parse_literal;
1620 break;
1621
1622 default:
1623 parse_literal:
1624
1625 if (temporary_cflags && ctx->re + 1 < ctx->re_end
1626 && *ctx->re == CHAR_BACKSLASH && *(ctx->re + 1) == L'E')
1627 {
1628 DPRINT(("tre_parse: end tmps: '%.*" STRF "'\n",
1629 REST(ctx->re)));
1630 ctx->cflags &= ~temporary_cflags;
1631 temporary_cflags = 0;
1632 ctx->re += 2;
1633 STACK_PUSHX(stack, long, PARSE_PIECE);
1634 break;
1635 }
1636
1637
1638 /* We are expecting an atom. If the subexpression (or the whole
1639 regexp ends here, we interpret it as an empty expression
1640 (which matches an empty string). */
1641 if (
1642 #ifdef REG_LITERAL
1643 !(ctx->cflags & REG_LITERAL) &&
1644 #endif /* REG_LITERAL */
1645 (ctx->re >= ctx->re_end
1646 || *ctx->re == CHAR_STAR
1647 || (ctx->cflags & REG_EXTENDED
1648 && (*ctx->re == CHAR_PIPE
1649 || *ctx->re == CHAR_LBRACE
1650 || *ctx->re == CHAR_PLUS
1651 || *ctx->re == CHAR_QUESTIONMARK))
1652 /* Test for "\)" in BRE mode. */
1653 || (!(ctx->cflags & REG_EXTENDED)
1654 && ctx->re + 1 < ctx->re_end
1655 && *ctx->re == CHAR_BACKSLASH
1656 && *(ctx->re + 1) == CHAR_LBRACE)))
1657 {
1658 DPRINT(("tre_parse: empty: '%.*" STRF "'\n",
1659 REST(ctx->re)));
1660 result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1661 if (!result)
1662 return REG_ESPACE;
1663 break;
1664 }
1665
1666 DPRINT(("tre_parse: literal: '%.*" STRF "'\n",
1667 REST(ctx->re)));
1668 /* Note that we can't use an tre_isalpha() test here, since there
1669 may be characters which are alphabetic but neither upper or
1670 lower case. */
1671 if (ctx->cflags & REG_ICASE
1672 && (tre_isupper(*ctx->re) || tre_islower(*ctx->re)))
1673 {
1674 tre_ast_node_t *tmp1;
1675 tre_ast_node_t *tmp2;
1676
1677 /* XXX - Can there be more than one opposite-case
1678 counterpoints for some character in some locale? Or
1679 more than two characters which all should be regarded
1680 the same character if case is ignored? If yes, there
1681 does not seem to be a portable way to detect it. I guess
1682 that at least for multi-character collating elements there
1683 could be several opposite-case counterpoints, but they
1684 cannot be supported portably anyway. */
1685 tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(*ctx->re),
1686 tre_toupper(*ctx->re),
1687 ctx->position);
1688 if (!tmp1)
1689 return REG_ESPACE;
1690 tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(*ctx->re),
1691 tre_tolower(*ctx->re),
1692 ctx->position);
1693 if (!tmp2)
1694 return REG_ESPACE;
1695 result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
1696 if (!result)
1697 return REG_ESPACE;
1698 }
1699 else
1700 {
1701 result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
1702 ctx->position);
1703 if (!result)
1704 return REG_ESPACE;
1705 }
1706 ctx->position++;
1707 ctx->re++;
1708 break;
1709 }
1710 break;
1711
1712 case PARSE_MARK_FOR_SUBMATCH:
1713 {
1714 int submatch_id = tre_stack_pop_int(stack);
1715
1716 if (result->submatch_id >= 0)
1717 {
1718 tre_ast_node_t *n, *tmp_node;
1719 n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1720 if (n == NULL)
1721 return REG_ESPACE;
1722 tmp_node = tre_ast_new_catenation(ctx->mem, n, result);
1723 if (tmp_node == NULL)
1724 return REG_ESPACE;
1725 tmp_node->num_submatches = result->num_submatches;
1726 result = tmp_node;
1727 }
1728 result->submatch_id = submatch_id;
1729 result->num_submatches++;
1730 break;
1731 }
1732
1733 case PARSE_RESTORE_CFLAGS:
1734 ctx->cflags = tre_stack_pop_int(stack);
1735 break;
1736
1737 default:
1738 assert(0);
1739 break;
1740 }
1741 }
1742
1743 /* Check for missing closing parentheses. */
1744 if (depth > 0)
1745 return REG_EPAREN;
1746
1747 if (status == REG_OK)
1748 ctx->result = result;
1749
1750 return status;
1751 }
1752
1753 /* EOF */
1754