15f2eab64SJohn Marino /*
25f2eab64SJohn Marino tre-parse.c - Regexp parser
35f2eab64SJohn Marino
45f2eab64SJohn Marino This software is released under a BSD-style license.
55f2eab64SJohn Marino See the file LICENSE for details and copyright.
65f2eab64SJohn Marino
75f2eab64SJohn Marino */
85f2eab64SJohn Marino
95f2eab64SJohn Marino /*
105f2eab64SJohn Marino This parser is just a simple recursive descent parser for POSIX.2
115f2eab64SJohn Marino regexps. The parser supports both the obsolete default syntax and
125f2eab64SJohn Marino the "extended" syntax, and some nonstandard extensions.
135f2eab64SJohn Marino */
145f2eab64SJohn Marino
155f2eab64SJohn Marino
165f2eab64SJohn Marino #ifdef HAVE_CONFIG_H
175f2eab64SJohn Marino #include <config.h>
185f2eab64SJohn Marino #endif /* HAVE_CONFIG_H */
195f2eab64SJohn Marino #include <string.h>
205f2eab64SJohn Marino #include <assert.h>
215f2eab64SJohn Marino #include <limits.h>
22d5f8dde1SJohn Marino #include <stddef.h>
235f2eab64SJohn Marino
245f2eab64SJohn Marino #include "xmalloc.h"
255f2eab64SJohn Marino #include "tre-mem.h"
265f2eab64SJohn Marino #include "tre-ast.h"
275f2eab64SJohn Marino #include "tre-stack.h"
285f2eab64SJohn Marino #include "tre-parse.h"
295f2eab64SJohn Marino
30d5f8dde1SJohn Marino #include "xlocale_private.h"
31d5f8dde1SJohn Marino #include "collate.h"
32d5f8dde1SJohn Marino
33d5f8dde1SJohn Marino /* BSD compatibility:
34d5f8dde1SJohn Marino Before looking up a collating symbol, check if the name matches in
35d5f8dde1SJohn Marino the character names (cnames) array; if so, use the corresponding
36d5f8dde1SJohn Marino character.
37d5f8dde1SJohn Marino
38d5f8dde1SJohn Marino Also set ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND, which will preserve
39d5f8dde1SJohn Marino the implementation choice that for ERE, a non-numeric character following
40d5f8dde1SJohn Marino a left brace that would normally be a bound, causes the left brace to be
41d5f8dde1SJohn Marino literal. */
42d5f8dde1SJohn Marino #define BSD_COMPATIBILITY
43d5f8dde1SJohn Marino #ifdef BSD_COMPATIBILITY
44d5f8dde1SJohn Marino #include "cname.h"
45d5f8dde1SJohn Marino #define ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
46d5f8dde1SJohn Marino #endif /* BSD_COMPATIBILITY */
475f2eab64SJohn Marino
/* Wide-character literals for the characters that carry a special
   meaning in regexp syntax. */
#define CHAR_PIPE          L'|'
#define CHAR_LPAREN        L'('
#define CHAR_RPAREN        L')'
#define CHAR_LBRACE        L'{'
#define CHAR_RBRACE        L'}'
#define CHAR_LBRACKET      L'['
#define CHAR_RBRACKET      L']'
#define CHAR_MINUS         L'-'
#define CHAR_STAR          L'*'
#define CHAR_QUESTIONMARK  L'?'
#define CHAR_PLUS          L'+'
#define CHAR_PERIOD        L'.'
#define CHAR_COLON         L':'
#define CHAR_EQUAL         L'='
#define CHAR_COMMA         L','
#define CHAR_CARET         L'^'
#define CHAR_DOLLAR        L'$'
#define CHAR_BACKSLASH     L'\\'
#define CHAR_HASH          L'#'
#define CHAR_TILDE         L'~'
695f2eab64SJohn Marino
705f2eab64SJohn Marino
/* Table used for expanding \w, \s, etc.: `c' is the macro character and
   `expansion' is the text it stands for.  The table ends with an entry
   whose `expansion' is NULL. */
static const struct tre_macro_struct {
  const char c;
  const char *expansion;
} tre_macros[] = {
  /* Single control characters. */
  {'t', "\t"},
  {'n', "\n"},
  {'r', "\r"},
  {'f', "\f"},
  {'a', "\a"},
  {'e', "\033"},
  /* Character class shorthands and their negations. */
  {'w', "[[:alnum:]_]"},
  {'W', "[^[:alnum:]_]"},
  {'s', "[[:space:]]"},
  {'S', "[^[:space:]]"},
  {'d', "[[:digit:]]"},
  {'D', "[^[:digit:]]"},
  /* Terminator. */
  { 0, NULL }
};
825f2eab64SJohn Marino
835f2eab64SJohn Marino
845f2eab64SJohn Marino /* Expands a macro delimited by `regex' and `regex_end' to `buf', which
855f2eab64SJohn Marino must have at least `len' items. Sets buf[0] to zero if the there
865f2eab64SJohn Marino is no match in `tre_macros'. */
875f2eab64SJohn Marino static void
tre_expand_macro(const tre_char_t * regex,const tre_char_t * regex_end,tre_char_t * buf,size_t buf_len)885f2eab64SJohn Marino tre_expand_macro(const tre_char_t *regex, const tre_char_t *regex_end,
895f2eab64SJohn Marino tre_char_t *buf, size_t buf_len)
905f2eab64SJohn Marino {
915f2eab64SJohn Marino int i;
925f2eab64SJohn Marino
935f2eab64SJohn Marino buf[0] = 0;
945f2eab64SJohn Marino if (regex >= regex_end)
955f2eab64SJohn Marino return;
965f2eab64SJohn Marino
975f2eab64SJohn Marino for (i = 0; tre_macros[i].expansion; i++)
985f2eab64SJohn Marino {
995f2eab64SJohn Marino if (tre_macros[i].c == *regex)
1005f2eab64SJohn Marino {
1015f2eab64SJohn Marino unsigned int j;
1025f2eab64SJohn Marino DPRINT(("Expanding macro '%c' => '%s'\n",
1035f2eab64SJohn Marino tre_macros[i].c, tre_macros[i].expansion));
1045f2eab64SJohn Marino for (j = 0; tre_macros[i].expansion[j] && j < buf_len; j++)
1055f2eab64SJohn Marino buf[j] = tre_macros[i].expansion[j];
1065f2eab64SJohn Marino buf[j] = 0;
1075f2eab64SJohn Marino break;
1085f2eab64SJohn Marino }
1095f2eab64SJohn Marino }
1105f2eab64SJohn Marino }
1115f2eab64SJohn Marino
1125f2eab64SJohn Marino static reg_errcode_t
tre_new_item(tre_mem_t mem,int type,int val,int * max_i,tre_bracket_match_list_t ** items)113d5f8dde1SJohn Marino tre_new_item(tre_mem_t mem, int type, int val, int *max_i,
114d5f8dde1SJohn Marino tre_bracket_match_list_t **items)
1155f2eab64SJohn Marino {
116d5f8dde1SJohn Marino reg_errcode_t status = REG_OK;
117d5f8dde1SJohn Marino tre_bracket_match_list_t *array = *items;
118d5f8dde1SJohn Marino int i = array->num_bracket_matches;
1195f2eab64SJohn Marino /* Allocate more space if necessary. */
120d5f8dde1SJohn Marino if (i >= *max_i)
1215f2eab64SJohn Marino {
122d5f8dde1SJohn Marino tre_bracket_match_list_t *new_items;
123d5f8dde1SJohn Marino DPRINT(("out of tre_bracket_match_list_t array space (%d)\n", i));
1245f2eab64SJohn Marino /* If the array is already 1024 items large, give up -- there's
1255f2eab64SJohn Marino probably an error in the regexp (e.g. not a '\0' terminated
1265f2eab64SJohn Marino string and missing ']') */
127d5f8dde1SJohn Marino if (*max_i >= 1024)
1285f2eab64SJohn Marino return REG_ESPACE;
1295f2eab64SJohn Marino *max_i *= 2;
130d5f8dde1SJohn Marino new_items = xrealloc(array, SIZEOF_BRACKET_MATCH_LIST_N(*max_i));
1315f2eab64SJohn Marino if (new_items == NULL)
1325f2eab64SJohn Marino return REG_ESPACE;
1335f2eab64SJohn Marino *items = array = new_items;
1345f2eab64SJohn Marino }
135d5f8dde1SJohn Marino array->bracket_matches[i].type = type;
136d5f8dde1SJohn Marino array->bracket_matches[i].value = val;
137d5f8dde1SJohn Marino array->num_bracket_matches++;
1385f2eab64SJohn Marino return status;
1395f2eab64SJohn Marino }
1405f2eab64SJohn Marino
1415f2eab64SJohn Marino #ifndef TRE_USE_SYSTEM_WCTYPE
1425f2eab64SJohn Marino
/* The tre_isXXX() classifiers may be macros, so each one is wrapped in
   a real function that can be referred to through a function
   pointer. */
int
tre_isalnum_func(tre_cint_t c)
{
  return tre_isalnum(c);
}

int
tre_isalpha_func(tre_cint_t c)
{
  return tre_isalpha(c);
}

#ifdef tre_isascii
int
tre_isascii_func(tre_cint_t c)
{
  return tre_isascii(c);
}
#else /* !tre_isascii */
/* Fallback: a value is ASCII when nothing is set above its low 7 bits. */
int
tre_isascii_func(tre_cint_t c)
{
  return !(c >> 7);
}
#endif /* !tre_isascii */

#ifdef tre_isblank
int
tre_isblank_func(tre_cint_t c)
{
  return tre_isblank(c);
}
#else /* !tre_isblank */
/* Fallback: only space and horizontal tab count as blank. */
int
tre_isblank_func(tre_cint_t c)
{
  return ((c == ' ') || (c == '\t'));
}
#endif /* !tre_isblank */

int
tre_iscntrl_func(tre_cint_t c)
{
  return tre_iscntrl(c);
}

int
tre_isdigit_func(tre_cint_t c)
{
  return tre_isdigit(c);
}

int
tre_isgraph_func(tre_cint_t c)
{
  return tre_isgraph(c);
}

int
tre_islower_func(tre_cint_t c)
{
  return tre_islower(c);
}

int
tre_isprint_func(tre_cint_t c)
{
  return tre_isprint(c);
}

int
tre_ispunct_func(tre_cint_t c)
{
  return tre_ispunct(c);
}

int
tre_isspace_func(tre_cint_t c)
{
  return tre_isspace(c);
}

int
tre_isupper_func(tre_cint_t c)
{
  return tre_isupper(c);
}

int
tre_isxdigit_func(tre_cint_t c)
{
  return tre_isxdigit(c);
}
1685f2eab64SJohn Marino
1695f2eab64SJohn Marino struct {
1705f2eab64SJohn Marino char *name;
1715f2eab64SJohn Marino int (*func)(tre_cint_t);
1725f2eab64SJohn Marino } tre_ctype_map[] = {
1735f2eab64SJohn Marino { "alnum", &tre_isalnum_func },
1745f2eab64SJohn Marino { "alpha", &tre_isalpha_func },
1755f2eab64SJohn Marino #ifdef tre_isascii
1765f2eab64SJohn Marino { "ascii", &tre_isascii_func },
1775f2eab64SJohn Marino #endif /* tre_isascii */
1785f2eab64SJohn Marino #ifdef tre_isblank
1795f2eab64SJohn Marino { "blank", &tre_isblank_func },
1805f2eab64SJohn Marino #endif /* tre_isblank */
1815f2eab64SJohn Marino { "cntrl", &tre_iscntrl_func },
1825f2eab64SJohn Marino { "digit", &tre_isdigit_func },
1835f2eab64SJohn Marino { "graph", &tre_isgraph_func },
1845f2eab64SJohn Marino { "lower", &tre_islower_func },
1855f2eab64SJohn Marino { "print", &tre_isprint_func },
1865f2eab64SJohn Marino { "punct", &tre_ispunct_func },
1875f2eab64SJohn Marino { "space", &tre_isspace_func },
1885f2eab64SJohn Marino { "upper", &tre_isupper_func },
1895f2eab64SJohn Marino { "xdigit", &tre_isxdigit_func },
1905f2eab64SJohn Marino { NULL, NULL}
1915f2eab64SJohn Marino };
1925f2eab64SJohn Marino
tre_ctype(const char * name)1935f2eab64SJohn Marino tre_ctype_t tre_ctype(const char *name)
1945f2eab64SJohn Marino {
1955f2eab64SJohn Marino int i;
1965f2eab64SJohn Marino for (i = 0; tre_ctype_map[i].name != NULL; i++)
1975f2eab64SJohn Marino {
1985f2eab64SJohn Marino if (strcmp(name, tre_ctype_map[i].name) == 0)
1995f2eab64SJohn Marino return tre_ctype_map[i].func;
2005f2eab64SJohn Marino }
2015f2eab64SJohn Marino return (tre_ctype_t)0;
2025f2eab64SJohn Marino }
2035f2eab64SJohn Marino #endif /* !TRE_USE_SYSTEM_WCTYPE */
2045f2eab64SJohn Marino
/* Expands to two arguments, the number of items left between `re' and
   ctx->re_end followed by `re' itself, for use with a "%.*" conversion
   in debug output.  Requires a variable named `ctx' in scope. */
#define REST(re) (int)(ctx->re_end - (re)), (re)

/* Number of collating symbol slots allocated initially when collecting
   multi-character collating symbols. */
#define START_COLLATING_SYMBOLS   16
#define MAX_COLLATING_SYMBOL_LEN  4
209d5f8dde1SJohn Marino
210d5f8dde1SJohn Marino typedef struct {
211d5f8dde1SJohn Marino const tre_char_t *start;
212d5f8dde1SJohn Marino int len;
213d5f8dde1SJohn Marino } tre_collating_symbol;
214d5f8dde1SJohn Marino
215d5f8dde1SJohn Marino #ifdef BSD_COMPATIBILITY
216d5f8dde1SJohn Marino static wchar_t
tre_search_cnames(const wchar_t * name,size_t len)217d5f8dde1SJohn Marino tre_search_cnames(const wchar_t *name, size_t len)
218d5f8dde1SJohn Marino {
219d5f8dde1SJohn Marino size_t low = 0;
220d5f8dde1SJohn Marino size_t high = NCNAMES - 1;
221d5f8dde1SJohn Marino size_t cur;
222d5f8dde1SJohn Marino int cmp;
223d5f8dde1SJohn Marino
224d5f8dde1SJohn Marino while(low <= high)
225d5f8dde1SJohn Marino {
226d5f8dde1SJohn Marino cur = (low + high) / 2;
227d5f8dde1SJohn Marino cmp = wcsncmp(name, cnames[cur].name, len);
228d5f8dde1SJohn Marino if (cmp == 0 && cnames[cur].name[len] == 0) return cnames[cur].code;
229d5f8dde1SJohn Marino if (cmp > 0) low = cur + 1;
230d5f8dde1SJohn Marino else high = cur - 1;
231d5f8dde1SJohn Marino }
232d5f8dde1SJohn Marino return (wchar_t)-1;
233d5f8dde1SJohn Marino }
234d5f8dde1SJohn Marino #endif /* BSD_COMPATIBILITY */
235d5f8dde1SJohn Marino
236d5f8dde1SJohn Marino /* Scan the contents of a bracket expression, and create a
237d5f8dde1SJohn Marino * tre_bracket_match_list_t encoding the bracket expression. If during
238d5f8dde1SJohn Marino * the scan, multi-character collating symbols are detected, switch
239d5f8dde1SJohn Marino * into a mode to collect those MCCSs into a tre_collating_symbol
240d5f8dde1SJohn Marino * list and pass them back. tre_parse_bracket will use that to
241d5f8dde1SJohn Marino * create a new string composed of a union of the bracket expression
242d5f8dde1SJohn Marino * without the MCCSs and the MCCSs (e.g., [x[.ch.]] => [x]|ch), and
243d5f8dde1SJohn Marino * call tre_parse (recursive) to parse that new string (which will
244d5f8dde1SJohn Marino * call tre_parse_bracket and tre_parse_bracket_items again. */
2455f2eab64SJohn Marino static reg_errcode_t
tre_parse_bracket_items(tre_parse_ctx_t * ctx,tre_bracket_match_list_t ** items,int * items_size,tre_collating_symbol ** result)246d5f8dde1SJohn Marino tre_parse_bracket_items(tre_parse_ctx_t *ctx, tre_bracket_match_list_t **items,
247d5f8dde1SJohn Marino int *items_size, tre_collating_symbol **result)
2485f2eab64SJohn Marino {
2495f2eab64SJohn Marino const tre_char_t *re = ctx->re;
250d5f8dde1SJohn Marino const tre_char_t *re_end = ctx->re_end;
251d5f8dde1SJohn Marino tre_collating_symbol *col_syms = NULL;
252d5f8dde1SJohn Marino tre_collating_symbol *cp = NULL;
253d5f8dde1SJohn Marino int n_col_syms = 0;
254d5f8dde1SJohn Marino reg_errcode_t status;
2555f2eab64SJohn Marino int max_i = *items_size;
256d5f8dde1SJohn Marino int other = 0; /* contains content other than multi-character collating
257d5f8dde1SJohn Marino * symbols */
258d5f8dde1SJohn Marino int range = -1; /* -1 unset, 0 begin range set, +1 end range expected */
259d5f8dde1SJohn Marino tre_cint_t min, c;
260d5f8dde1SJohn Marino int invert = ((*items)->flags & TRE_BRACKET_MATCH_FLAG_NEGATE);
261d5f8dde1SJohn Marino int collect_MCCS = 0;
262d5f8dde1SJohn Marino const tre_char_t *start;
2635f2eab64SJohn Marino
264d5f8dde1SJohn Marino for ( ;re < re_end; re++)
2655f2eab64SJohn Marino {
266d5f8dde1SJohn Marino switch (*re)
267d5f8dde1SJohn Marino {
268d5f8dde1SJohn Marino case CHAR_MINUS:
269d5f8dde1SJohn Marino /* A first hyphen */
270d5f8dde1SJohn Marino if (re == ctx->re)
271d5f8dde1SJohn Marino {
272d5f8dde1SJohn Marino DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re)));
273d5f8dde1SJohn Marino min = CHAR_MINUS;
274d5f8dde1SJohn Marino other++;
275d5f8dde1SJohn Marino range = 0;
276d5f8dde1SJohn Marino break;
277d5f8dde1SJohn Marino }
278d5f8dde1SJohn Marino /* The hyphen is the end range */
279d5f8dde1SJohn Marino if (range > 0)
280d5f8dde1SJohn Marino {
281d5f8dde1SJohn Marino DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re)));
282d5f8dde1SJohn Marino c = CHAR_MINUS;
283d5f8dde1SJohn Marino goto process_end_range;
284d5f8dde1SJohn Marino }
285d5f8dde1SJohn Marino if (re + 1 >= re_end)
2865f2eab64SJohn Marino {
2875f2eab64SJohn Marino status = REG_EBRACK;
288d5f8dde1SJohn Marino goto error;
2895f2eab64SJohn Marino }
290d5f8dde1SJohn Marino /* The hyphen is at the end */
291d5f8dde1SJohn Marino if (re[1] == CHAR_RBRACKET)
2925f2eab64SJohn Marino {
293d5f8dde1SJohn Marino DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re)));
294d5f8dde1SJohn Marino c = CHAR_MINUS;
295d5f8dde1SJohn Marino goto process_begin_range;
296d5f8dde1SJohn Marino }
297d5f8dde1SJohn Marino /* Two ranges are not allowed to share an endpoint, or begin
298d5f8dde1SJohn Marino * range is illegal. */
299d5f8dde1SJohn Marino if (range < 0)
300d5f8dde1SJohn Marino {
301d5f8dde1SJohn Marino status = REG_ERANGE;
302d5f8dde1SJohn Marino goto error;
303d5f8dde1SJohn Marino }
304d5f8dde1SJohn Marino range = 1; /* Expect end range */
305d5f8dde1SJohn Marino DPRINT(("tre_parse_bracket: range: '%.*" STRF "'\n", REST(re)));
306d5f8dde1SJohn Marino break;
307d5f8dde1SJohn Marino
308d5f8dde1SJohn Marino case CHAR_LBRACKET:
309d5f8dde1SJohn Marino if (re + 1 >= re_end)
310d5f8dde1SJohn Marino {
311d5f8dde1SJohn Marino status = REG_EBRACK;
312d5f8dde1SJohn Marino goto error;
313d5f8dde1SJohn Marino }
314d5f8dde1SJohn Marino switch (re[1])
315d5f8dde1SJohn Marino {
316d5f8dde1SJohn Marino case CHAR_PERIOD:
317d5f8dde1SJohn Marino {
318d5f8dde1SJohn Marino re += 2;
319d5f8dde1SJohn Marino start = re;
320d5f8dde1SJohn Marino for (;; re++)
321d5f8dde1SJohn Marino {
322d5f8dde1SJohn Marino if (re >= re_end)
323d5f8dde1SJohn Marino {
324d5f8dde1SJohn Marino status = REG_ECOLLATE;
325d5f8dde1SJohn Marino goto error;
326d5f8dde1SJohn Marino }
327d5f8dde1SJohn Marino if (*re == CHAR_PERIOD)
328d5f8dde1SJohn Marino {
329d5f8dde1SJohn Marino if (re + 1 >= re_end)
330d5f8dde1SJohn Marino {
331d5f8dde1SJohn Marino status = REG_ECOLLATE;
332d5f8dde1SJohn Marino goto error;
333d5f8dde1SJohn Marino }
334d5f8dde1SJohn Marino /* Found end */
335d5f8dde1SJohn Marino if (re[1] == CHAR_RBRACKET)
336d5f8dde1SJohn Marino {
337d5f8dde1SJohn Marino DPRINT(("tre_parse_bracket: collating "
338d5f8dde1SJohn Marino "symbol: '%.*" STRF "'\n",
339d5f8dde1SJohn Marino REST(start - 2)));
340d5f8dde1SJohn Marino /* Empty name */
341d5f8dde1SJohn Marino if (re == start)
342d5f8dde1SJohn Marino {
343d5f8dde1SJohn Marino status = REG_ECOLLATE;
344d5f8dde1SJohn Marino goto error;
345d5f8dde1SJohn Marino }
346d5f8dde1SJohn Marino #ifdef BSD_COMPATIBILITY
347d5f8dde1SJohn Marino /* Check if the name is in cnames; if so, use
348d5f8dde1SJohn Marino the corresponding code */
349d5f8dde1SJohn Marino c = tre_search_cnames(start, re - start);
350d5f8dde1SJohn Marino if (c != (wchar_t)-1)
351d5f8dde1SJohn Marino {
352d5f8dde1SJohn Marino re++;
353d5f8dde1SJohn Marino goto process_single_character;
354d5f8dde1SJohn Marino }
355d5f8dde1SJohn Marino #endif /* BSD_COMPATIBILITY */
356d5f8dde1SJohn Marino /* Verify this is a known sequence */
357d5f8dde1SJohn Marino if (__collate_equiv_value(ctx->loc, start,
358d5f8dde1SJohn Marino re - start) <= 0)
359d5f8dde1SJohn Marino {
360d5f8dde1SJohn Marino status = REG_ECOLLATE;
361d5f8dde1SJohn Marino goto error;
362d5f8dde1SJohn Marino }
363d5f8dde1SJohn Marino /* Process single character collating symbols */
364d5f8dde1SJohn Marino if (re - start == 1)
365d5f8dde1SJohn Marino {
366d5f8dde1SJohn Marino c = *start;
367d5f8dde1SJohn Marino re++;
368d5f8dde1SJohn Marino goto process_single_character;
369d5f8dde1SJohn Marino }
370d5f8dde1SJohn Marino /* Inverted MCCSs are undefined */
371d5f8dde1SJohn Marino if (invert)
372d5f8dde1SJohn Marino {
373d5f8dde1SJohn Marino status = REG_ECOLLATE;
374d5f8dde1SJohn Marino goto error;
375d5f8dde1SJohn Marino }
376d5f8dde1SJohn Marino /* Can't have MCCSs as an endpoint to a range */
377d5f8dde1SJohn Marino if (range > 0)
378d5f8dde1SJohn Marino {
379d5f8dde1SJohn Marino status = REG_ERANGE;
380d5f8dde1SJohn Marino goto error;
381d5f8dde1SJohn Marino }
382d5f8dde1SJohn Marino range = -1;
383d5f8dde1SJohn Marino /* Switch into MCCS collection mode (if not
384d5f8dde1SJohn Marino * already there */
385d5f8dde1SJohn Marino #if TRE_DEBUG
386d5f8dde1SJohn Marino if (!collect_MCCS)
387d5f8dde1SJohn Marino {
388d5f8dde1SJohn Marino collect_MCCS = 1;
389d5f8dde1SJohn Marino DPRINT(("tre_parse_bracket: Detected MCCS\n"));
390d5f8dde1SJohn Marino }
391d5f8dde1SJohn Marino #else /* !TRE_DEBUG */
392d5f8dde1SJohn Marino collect_MCCS = 1;
393d5f8dde1SJohn Marino #endif /* !TRE_DEBUG */
394d5f8dde1SJohn Marino /* Allocate a memory block the first time */
395d5f8dde1SJohn Marino if (!cp)
396d5f8dde1SJohn Marino {
397d5f8dde1SJohn Marino if ((col_syms = xmalloc(sizeof(*col_syms) *
398d5f8dde1SJohn Marino (START_COLLATING_SYMBOLS + 2)))
399d5f8dde1SJohn Marino == NULL)
400d5f8dde1SJohn Marino return REG_ESPACE;
401d5f8dde1SJohn Marino cp = col_syms + 1;
402d5f8dde1SJohn Marino n_col_syms = START_COLLATING_SYMBOLS;
403d5f8dde1SJohn Marino }
404d5f8dde1SJohn Marino /* Enlarge the memory block is more is needed */
405d5f8dde1SJohn Marino if ((cp - col_syms) - 1 >= n_col_syms)
406d5f8dde1SJohn Marino {
407d5f8dde1SJohn Marino int i = n_col_syms;
408d5f8dde1SJohn Marino tre_collating_symbol *tmp =
409d5f8dde1SJohn Marino xrealloc(col_syms, sizeof(*col_syms) *
410d5f8dde1SJohn Marino ((n_col_syms *= 2) + 2));
411d5f8dde1SJohn Marino if (tmp == NULL)
412d5f8dde1SJohn Marino {
413d5f8dde1SJohn Marino xfree(col_syms);
414d5f8dde1SJohn Marino return REG_ESPACE;
415d5f8dde1SJohn Marino }
416d5f8dde1SJohn Marino DPRINT(("tre_list_collating_symbols: "
417d5f8dde1SJohn Marino "Enlarging col_syms to %d\n",
418d5f8dde1SJohn Marino n_col_syms));
419d5f8dde1SJohn Marino col_syms = tmp;
420d5f8dde1SJohn Marino cp = col_syms + i + 1;
421d5f8dde1SJohn Marino }
422d5f8dde1SJohn Marino cp->start = start;
423d5f8dde1SJohn Marino cp->len = re - start;
424d5f8dde1SJohn Marino cp++;
4255f2eab64SJohn Marino re++;
4265f2eab64SJohn Marino break;
4275f2eab64SJohn Marino }
428d5f8dde1SJohn Marino }
429d5f8dde1SJohn Marino }
430d5f8dde1SJohn Marino break;
431d5f8dde1SJohn Marino }
432d5f8dde1SJohn Marino
433d5f8dde1SJohn Marino case CHAR_EQUAL:
434d5f8dde1SJohn Marino case CHAR_COLON:
435d5f8dde1SJohn Marino {
436d5f8dde1SJohn Marino /* Process equivalence and character classes */
437d5f8dde1SJohn Marino tre_char_t kind = re[1];
438d5f8dde1SJohn Marino
439d5f8dde1SJohn Marino /* Can't have a class as an endpoint to a range */
440d5f8dde1SJohn Marino if (range > 0)
441d5f8dde1SJohn Marino {
442d5f8dde1SJohn Marino status = REG_ERANGE;
443d5f8dde1SJohn Marino goto error;
444d5f8dde1SJohn Marino }
445d5f8dde1SJohn Marino if (!collect_MCCS && range == 0)
446d5f8dde1SJohn Marino {
447d5f8dde1SJohn Marino status = tre_new_item(ctx->mem, TRE_BRACKET_MATCH_TYPE_CHAR,
448d5f8dde1SJohn Marino min, &max_i, items);
449d5f8dde1SJohn Marino if (status != REG_OK)
450d5f8dde1SJohn Marino goto error;
451d5f8dde1SJohn Marino }
452d5f8dde1SJohn Marino range = -1;
453d5f8dde1SJohn Marino re += 2;
454d5f8dde1SJohn Marino start = re;
455d5f8dde1SJohn Marino for (;; re++)
456d5f8dde1SJohn Marino {
457d5f8dde1SJohn Marino if (re >= re_end)
458d5f8dde1SJohn Marino {
459d5f8dde1SJohn Marino status = kind == CHAR_EQUAL ? REG_ECOLLATE : REG_ECTYPE;
460d5f8dde1SJohn Marino goto error;
461d5f8dde1SJohn Marino }
462d5f8dde1SJohn Marino if (*re == kind)
463d5f8dde1SJohn Marino {
464d5f8dde1SJohn Marino if (re + 1 >= re_end)
465d5f8dde1SJohn Marino {
466d5f8dde1SJohn Marino status = kind == CHAR_EQUAL ? REG_ECOLLATE :
467d5f8dde1SJohn Marino REG_ECTYPE;
468d5f8dde1SJohn Marino goto error;
469d5f8dde1SJohn Marino }
470d5f8dde1SJohn Marino /* Found end */
471d5f8dde1SJohn Marino if (re[1] == CHAR_RBRACKET)
472d5f8dde1SJohn Marino {
473d5f8dde1SJohn Marino if (re == start)
474d5f8dde1SJohn Marino {
475d5f8dde1SJohn Marino /* Empty class name */
476d5f8dde1SJohn Marino status = kind == CHAR_EQUAL ? REG_ECOLLATE :
477d5f8dde1SJohn Marino REG_ECTYPE;
478d5f8dde1SJohn Marino goto error;
479d5f8dde1SJohn Marino }
480d5f8dde1SJohn Marino /* Process equivalence class */
481d5f8dde1SJohn Marino if (kind == CHAR_EQUAL)
482d5f8dde1SJohn Marino {
483d5f8dde1SJohn Marino int equiv;
484d5f8dde1SJohn Marino
485d5f8dde1SJohn Marino DPRINT(("tre_parse_bracket: equivalence: '%.*"
486d5f8dde1SJohn Marino STRF "'\n", REST(start - 2)));
487d5f8dde1SJohn Marino
488d5f8dde1SJohn Marino /* While we find the collation value even for
489d5f8dde1SJohn Marino multi-character collating elements , we
490d5f8dde1SJohn Marino don't (yet) match any collation values
491d5f8dde1SJohn Marino against multi-character sequences. We'd have
492d5f8dde1SJohn Marino to enumerate those multi-character sequences
493d5f8dde1SJohn Marino and like multi-character collating symbols,
494d5f8dde1SJohn Marino create a union of those sequences with the
495d5f8dde1SJohn Marino rest of the bracket expression. While
496d5f8dde1SJohn Marino doable, a bracket expression matching
497d5f8dde1SJohn Marino multiple characters, that doesn't explicitly
498d5f8dde1SJohn Marino contain multi-character sequences, might
499d5f8dde1SJohn Marino be unexpected, so we punt for now. */
500d5f8dde1SJohn Marino if ((equiv = __collate_equiv_value(ctx->loc,
501d5f8dde1SJohn Marino start, re - start)) <= 0)
502d5f8dde1SJohn Marino {
503d5f8dde1SJohn Marino /* The standard says that if no collating
504d5f8dde1SJohn Marino element if found, we use the collating
505d5f8dde1SJohn Marino symbol itself. But __collate_equiv_value
506d5f8dde1SJohn Marino doesn't make a distinction between
507d5f8dde1SJohn Marino an element that is in a equvalence
508d5f8dde1SJohn Marino class with others, or is the only member,
509d5f8dde1SJohn Marino so we already know there is no collating
510d5f8dde1SJohn Marino symbol. (Note that in the case of a
511d5f8dde1SJohn Marino collating element whose collation value
512d5f8dde1SJohn Marino is unique, matching against the
513d5f8dde1SJohn Marino collating element itself, or against
514d5f8dde1SJohn Marino its collation value, is equivalent.) */
515d5f8dde1SJohn Marino #ifdef BSD_COMPATIBILITY
516d5f8dde1SJohn Marino /* Check if the name is in cnames; if so,
517d5f8dde1SJohn Marino use the corresponding code */
518d5f8dde1SJohn Marino c = tre_search_cnames(start, re - start);
519d5f8dde1SJohn Marino if (c != (wchar_t)-1)
520d5f8dde1SJohn Marino {
521d5f8dde1SJohn Marino re++;
522d5f8dde1SJohn Marino goto process_single_character;
523d5f8dde1SJohn Marino }
524d5f8dde1SJohn Marino #endif /* BSD_COMPATIBILITY */
525d5f8dde1SJohn Marino status = REG_ECOLLATE;
526d5f8dde1SJohn Marino goto error;
527d5f8dde1SJohn Marino }
528d5f8dde1SJohn Marino if (!collect_MCCS)
529d5f8dde1SJohn Marino {
530d5f8dde1SJohn Marino status = tre_new_item(ctx->mem,
531d5f8dde1SJohn Marino TRE_BRACKET_MATCH_TYPE_EQUIVALENCE,
532d5f8dde1SJohn Marino equiv, &max_i, items);
533d5f8dde1SJohn Marino if (status != REG_OK)
534d5f8dde1SJohn Marino goto error;
535d5f8dde1SJohn Marino }
536d5f8dde1SJohn Marino }
5375f2eab64SJohn Marino else
5385f2eab64SJohn Marino {
539d5f8dde1SJohn Marino /* Process character class */
540d5f8dde1SJohn Marino DPRINT(("tre_parse_bracket: class: '%.*" STRF
541d5f8dde1SJohn Marino "'\n", REST(start - 2)));
542d5f8dde1SJohn Marino if (!collect_MCCS)
5435f2eab64SJohn Marino {
5445f2eab64SJohn Marino char tmp_str[64];
545d5f8dde1SJohn Marino tre_ctype_t class;
546d5f8dde1SJohn Marino int len = MIN(re - start, 63);
5475f2eab64SJohn Marino #ifdef TRE_WCHAR
5485f2eab64SJohn Marino {
5495f2eab64SJohn Marino tre_char_t tmp_wcs[64];
550d5f8dde1SJohn Marino wcsncpy(tmp_wcs, start, (size_t)len);
5515f2eab64SJohn Marino tmp_wcs[len] = L'\0';
5525f2eab64SJohn Marino #if defined HAVE_WCSRTOMBS
5535f2eab64SJohn Marino {
5545f2eab64SJohn Marino mbstate_t state;
5555f2eab64SJohn Marino const tre_char_t *src = tmp_wcs;
5565f2eab64SJohn Marino memset(&state, '\0', sizeof(state));
557d5f8dde1SJohn Marino len = wcsrtombs_l(tmp_str, &src,
558d5f8dde1SJohn Marino sizeof(tmp_str), &state,
559d5f8dde1SJohn Marino ctx->loc);
5605f2eab64SJohn Marino }
5615f2eab64SJohn Marino #elif defined HAVE_WCSTOMBS
5625f2eab64SJohn Marino len = wcstombs(tmp_str, tmp_wcs, 63);
5635f2eab64SJohn Marino #endif /* defined HAVE_WCSTOMBS */
5645f2eab64SJohn Marino }
5655f2eab64SJohn Marino #else /* !TRE_WCHAR */
566d5f8dde1SJohn Marino strncpy(tmp_str, (const char*)start, len);
5675f2eab64SJohn Marino #endif /* !TRE_WCHAR */
5685f2eab64SJohn Marino tmp_str[len] = '\0';
5695f2eab64SJohn Marino DPRINT((" class name: %s\n", tmp_str));
570d5f8dde1SJohn Marino class = tre_ctype_l(tmp_str, ctx->loc);
5715f2eab64SJohn Marino if (!class)
5725f2eab64SJohn Marino {
5735f2eab64SJohn Marino status = REG_ECTYPE;
574d5f8dde1SJohn Marino goto error;
5755f2eab64SJohn Marino }
576d5f8dde1SJohn Marino status = tre_new_item(ctx->mem,
577d5f8dde1SJohn Marino TRE_BRACKET_MATCH_TYPE_CLASS,
578d5f8dde1SJohn Marino class, &max_i, items);
579d5f8dde1SJohn Marino if (status != REG_OK)
580d5f8dde1SJohn Marino goto error;
581d5f8dde1SJohn Marino }
582d5f8dde1SJohn Marino }
583d5f8dde1SJohn Marino re++;
584d5f8dde1SJohn Marino break;
585d5f8dde1SJohn Marino }
586d5f8dde1SJohn Marino }
587d5f8dde1SJohn Marino }
588d5f8dde1SJohn Marino other++;
589d5f8dde1SJohn Marino break;
590d5f8dde1SJohn Marino }
591d5f8dde1SJohn Marino
592d5f8dde1SJohn Marino default:
593d5f8dde1SJohn Marino DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re)));
594d5f8dde1SJohn Marino c = CHAR_LBRACKET;
595d5f8dde1SJohn Marino goto process_single_character;
596d5f8dde1SJohn Marino break;
597d5f8dde1SJohn Marino }
598d5f8dde1SJohn Marino break;
599d5f8dde1SJohn Marino
600d5f8dde1SJohn Marino case CHAR_RBRACKET:
601d5f8dde1SJohn Marino /* A first right bracket */
602d5f8dde1SJohn Marino if (re == ctx->re)
6035f2eab64SJohn Marino {
6045f2eab64SJohn Marino DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re)));
605d5f8dde1SJohn Marino min = CHAR_RBRACKET;
606d5f8dde1SJohn Marino range = 0;
607d5f8dde1SJohn Marino other++;
6085f2eab64SJohn Marino break;
6095f2eab64SJohn Marino }
610d5f8dde1SJohn Marino /* Done */
611d5f8dde1SJohn Marino if (collect_MCCS)
612d5f8dde1SJohn Marino {
613d5f8dde1SJohn Marino DPRINT(("tre_parse_bracket: done: '%.*" STRF "'\n",
614d5f8dde1SJohn Marino REST(re)));
615d5f8dde1SJohn Marino if (col_syms)
616d5f8dde1SJohn Marino {
617d5f8dde1SJohn Marino /* Mark the character following the right bracket. Set len
618d5f8dde1SJohn Marino * to whether there are other things besides the
619d5f8dde1SJohn Marino * multi-character collating symbols */
620d5f8dde1SJohn Marino col_syms->start = re + 1;
621d5f8dde1SJohn Marino col_syms->len = other;
622d5f8dde1SJohn Marino /* Mark the end of the list */
623d5f8dde1SJohn Marino cp->start = NULL;
624d5f8dde1SJohn Marino }
625d5f8dde1SJohn Marino *result = col_syms;
626d5f8dde1SJohn Marino return REG_OK;
627d5f8dde1SJohn Marino }
628d5f8dde1SJohn Marino /* range > 0 is not possible, since we did a lookahead after the
629d5f8dde1SJohn Marino * hyphen */
630d5f8dde1SJohn Marino if (range == 0)
631d5f8dde1SJohn Marino {
632d5f8dde1SJohn Marino status = tre_new_item(ctx->mem, TRE_BRACKET_MATCH_TYPE_CHAR,
633d5f8dde1SJohn Marino min, &max_i, items);
6345f2eab64SJohn Marino if (status != REG_OK)
635d5f8dde1SJohn Marino goto error;
6365f2eab64SJohn Marino }
637d5f8dde1SJohn Marino DPRINT(("tre_parse_bracket: done: '%.*" STRF "'\n", REST(re)));
6385f2eab64SJohn Marino *items_size = max_i;
639d5f8dde1SJohn Marino ctx->re = re + 1;
640d5f8dde1SJohn Marino return REG_OK;
641d5f8dde1SJohn Marino
642d5f8dde1SJohn Marino default:
643d5f8dde1SJohn Marino DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re)));
644d5f8dde1SJohn Marino c = *re;
645d5f8dde1SJohn Marino process_single_character:
646d5f8dde1SJohn Marino /* Process single character */
647d5f8dde1SJohn Marino if (range > 0)
648d5f8dde1SJohn Marino {
649d5f8dde1SJohn Marino int mine, maxe;
650d5f8dde1SJohn Marino
651d5f8dde1SJohn Marino process_end_range:
652d5f8dde1SJohn Marino /* Get collation equivalence values */
653d5f8dde1SJohn Marino mine = __collate_equiv_value(ctx->loc, &min, 1);
654d5f8dde1SJohn Marino maxe = __collate_equiv_value(ctx->loc, &c, 1);
655d5f8dde1SJohn Marino if (maxe < mine)
656d5f8dde1SJohn Marino {
657d5f8dde1SJohn Marino status = REG_ERANGE;
658d5f8dde1SJohn Marino goto error;
659d5f8dde1SJohn Marino }
660d5f8dde1SJohn Marino if (!collect_MCCS)
661d5f8dde1SJohn Marino {
662d5f8dde1SJohn Marino status = tre_new_item(ctx->mem,
663d5f8dde1SJohn Marino TRE_BRACKET_MATCH_TYPE_RANGE_BEGIN,
664d5f8dde1SJohn Marino mine, &max_i, items);
665d5f8dde1SJohn Marino if (status != REG_OK)
666d5f8dde1SJohn Marino goto error;
667d5f8dde1SJohn Marino status = tre_new_item(ctx->mem,
668d5f8dde1SJohn Marino TRE_BRACKET_MATCH_TYPE_RANGE_END,
669d5f8dde1SJohn Marino maxe, &max_i, items);
670d5f8dde1SJohn Marino if (status != REG_OK)
671d5f8dde1SJohn Marino goto error;
672d5f8dde1SJohn Marino }
673d5f8dde1SJohn Marino range = -1;
674d5f8dde1SJohn Marino }
675d5f8dde1SJohn Marino else
676d5f8dde1SJohn Marino {
677d5f8dde1SJohn Marino process_begin_range:
678d5f8dde1SJohn Marino if (!collect_MCCS)
679d5f8dde1SJohn Marino {
680d5f8dde1SJohn Marino if (range == 0)
681d5f8dde1SJohn Marino {
682d5f8dde1SJohn Marino status = tre_new_item(ctx->mem,
683d5f8dde1SJohn Marino TRE_BRACKET_MATCH_TYPE_CHAR,
684d5f8dde1SJohn Marino min, &max_i, items);
685d5f8dde1SJohn Marino if (status != REG_OK)
686d5f8dde1SJohn Marino goto error;
687d5f8dde1SJohn Marino }
688d5f8dde1SJohn Marino min = c;
689d5f8dde1SJohn Marino }
690d5f8dde1SJohn Marino range = 0;
691d5f8dde1SJohn Marino }
692d5f8dde1SJohn Marino other++;
693d5f8dde1SJohn Marino break;
694d5f8dde1SJohn Marino }
695d5f8dde1SJohn Marino }
696d5f8dde1SJohn Marino status = REG_EBRACK;
697d5f8dde1SJohn Marino error:
698d5f8dde1SJohn Marino DPRINT(("tre_parse_bracket: error: '%.*" STRF "', status=%d\n",
699d5f8dde1SJohn Marino REST(re), status));
700d5f8dde1SJohn Marino if (col_syms)
701d5f8dde1SJohn Marino xfree(col_syms);
7025f2eab64SJohn Marino return status;
7035f2eab64SJohn Marino }
7045f2eab64SJohn Marino
#ifdef TRE_DEBUG
/* Printable names for the bracket match item types, indexed by the numeric
   `type' field of a tre_bracket_match_t.  Only used by the debug dump at
   the end of tre_parse_bracket(). */
static const char *bracket_match_type_str[] = {
  [0] = "unused",
  [1] = "char",
  [2] = "range begin",
  [3] = "range end",
  [4] = "class",
  [5] = "equivalence value",
};
#endif /* TRE_DEBUG */
715d5f8dde1SJohn Marino
7165f2eab64SJohn Marino static reg_errcode_t
tre_parse_bracket(tre_parse_ctx_t * ctx,tre_ast_node_t ** result)7175f2eab64SJohn Marino tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
7185f2eab64SJohn Marino {
719d5f8dde1SJohn Marino tre_ast_node_t *node;
7205f2eab64SJohn Marino reg_errcode_t status = REG_OK;
721d5f8dde1SJohn Marino tre_bracket_match_list_t *items;
722d5f8dde1SJohn Marino int max_i = 32;
723d5f8dde1SJohn Marino tre_collating_symbol *col_syms = NULL;
724d5f8dde1SJohn Marino
725d5f8dde1SJohn Marino /* Handle special cases [[:<:]] and [[:>:]] */
726d5f8dde1SJohn Marino if (ctx->re_end - ctx->re >= 6 && ctx->re[0] == CHAR_LBRACKET
727d5f8dde1SJohn Marino && ctx->re[1] == CHAR_COLON && (ctx->re[2] == L'<' || ctx->re[2] == L'>')
728d5f8dde1SJohn Marino && ctx->re[3] == CHAR_COLON && ctx->re[4] == CHAR_RBRACKET
729d5f8dde1SJohn Marino && ctx->re[5] == CHAR_RBRACKET)
730d5f8dde1SJohn Marino {
731d5f8dde1SJohn Marino *result = tre_ast_new_literal(ctx->mem, ASSERTION,
732d5f8dde1SJohn Marino (ctx->re[2] == L'<') ? ASSERT_AT_BOW : ASSERT_AT_EOW,
733d5f8dde1SJohn Marino -1);
734d5f8dde1SJohn Marino DPRINT(("tre_parse_bracket: special case %s\n", (ctx->re[2] == L'<') ?
735d5f8dde1SJohn Marino "[[:<:]]" : "[[:>:]]"));
736d5f8dde1SJohn Marino ctx->re += 6;
737d5f8dde1SJohn Marino return *result ? REG_OK : REG_ESPACE;
738d5f8dde1SJohn Marino }
7395f2eab64SJohn Marino
7405f2eab64SJohn Marino /* Start off with an array of `max_i' elements. */
741d5f8dde1SJohn Marino items = xcalloc(1, SIZEOF_BRACKET_MATCH_LIST_N(max_i));
7425f2eab64SJohn Marino if (items == NULL)
7435f2eab64SJohn Marino return REG_ESPACE;
7445f2eab64SJohn Marino
7455f2eab64SJohn Marino if (*ctx->re == CHAR_CARET)
7465f2eab64SJohn Marino {
7475f2eab64SJohn Marino DPRINT(("tre_parse_bracket: negate: '%.*" STRF "'\n", REST(ctx->re)));
748d5f8dde1SJohn Marino items->flags |= TRE_BRACKET_MATCH_FLAG_NEGATE;
7495f2eab64SJohn Marino ctx->re++;
7505f2eab64SJohn Marino }
7515f2eab64SJohn Marino
752d5f8dde1SJohn Marino status = tre_parse_bracket_items(ctx, &items, &max_i, &col_syms);
7535f2eab64SJohn Marino
7545f2eab64SJohn Marino if (status != REG_OK)
7555f2eab64SJohn Marino goto parse_bracket_done;
7565f2eab64SJohn Marino
757d5f8dde1SJohn Marino /* If there are collating symbols, split off the multi-character ones
758d5f8dde1SJohn Marino * into a union of the bracket expression (without the collating symbols)
759d5f8dde1SJohn Marino * and the multiple-character sequences. We create an equivalent input
760d5f8dde1SJohn Marino * string and run tre_parse() recursively */
761d5f8dde1SJohn Marino if (col_syms)
762d5f8dde1SJohn Marino {
763d5f8dde1SJohn Marino tre_char_t *str, *sp;
764d5f8dde1SJohn Marino tre_collating_symbol *cp;
765d5f8dde1SJohn Marino tre_parse_ctx_t subctx;
7665f2eab64SJohn Marino
767d5f8dde1SJohn Marino /* Allocate a new string. We start with the size of the original
768d5f8dde1SJohn Marino * bracket expression (minus 1) and add 2 (for a leading "[" and
769d5f8dde1SJohn Marino * a trailing nil; don't need a "^", since it is illegal to have
770d5f8dde1SJohn Marino * inverted MCCSs). Since a multi-character collating symbols
771d5f8dde1SJohn Marino * will be converted from "[.xx.]" to "|xx" (n+4 to n+1), we don't
772d5f8dde1SJohn Marino * need to worry about the new string getting too long. */
773d5f8dde1SJohn Marino xfree(items);
774d5f8dde1SJohn Marino str = xmalloc(sizeof(*str) * ((col_syms->start - ctx->re) + 2));
775d5f8dde1SJohn Marino if (str == NULL)
7765f2eab64SJohn Marino {
777d5f8dde1SJohn Marino xfree(col_syms);
778d5f8dde1SJohn Marino return REG_ESPACE;
779d5f8dde1SJohn Marino }
780d5f8dde1SJohn Marino sp = str;
781d5f8dde1SJohn Marino if (col_syms->len > 0)
782d5f8dde1SJohn Marino {
783d5f8dde1SJohn Marino /* There are other items in the bracket expression besides the
784d5f8dde1SJohn Marino * multi-character collating symbols, so create a new bracket
785d5f8dde1SJohn Marino * expression with only those other itmes. */
786d5f8dde1SJohn Marino const tre_char_t *re;
787d5f8dde1SJohn Marino ptrdiff_t i;
7885f2eab64SJohn Marino
789d5f8dde1SJohn Marino *sp++ = '[';
790d5f8dde1SJohn Marino re = ctx->re;
791d5f8dde1SJohn Marino for (cp = col_syms + 1; cp->start; cp++)
792d5f8dde1SJohn Marino {
793d5f8dde1SJohn Marino /* The "- 2" is to account for the "[." */
794d5f8dde1SJohn Marino if ((i = ((cp->start - re) - 2)) > 0)
795d5f8dde1SJohn Marino {
796d5f8dde1SJohn Marino memcpy(sp, re, sizeof(*sp) * i);
797d5f8dde1SJohn Marino sp += i;
798d5f8dde1SJohn Marino }
799d5f8dde1SJohn Marino /* The "+ 2" is to account for the ".]" */
800d5f8dde1SJohn Marino re = cp->start + cp->len + 2;
801d5f8dde1SJohn Marino }
802d5f8dde1SJohn Marino i = col_syms->start - re; /* Includes the trailing right bracket */
803d5f8dde1SJohn Marino memcpy(sp, re, sizeof(*sp) * i);
804d5f8dde1SJohn Marino sp += i;
805d5f8dde1SJohn Marino *sp++ = '|';
806d5f8dde1SJohn Marino }
807d5f8dde1SJohn Marino for (cp = col_syms + 1; cp->start; cp++)
808d5f8dde1SJohn Marino {
809d5f8dde1SJohn Marino memcpy(sp, cp->start, sizeof(*sp) * cp->len);
810d5f8dde1SJohn Marino sp += cp->len;
811d5f8dde1SJohn Marino if (cp[1].start)
812d5f8dde1SJohn Marino *sp++ = '|';
813d5f8dde1SJohn Marino }
814d5f8dde1SJohn Marino *sp = 0;
815d5f8dde1SJohn Marino DPRINT(("tre_parse_bracket: Reparsing bracket expression with '%ls'\n",
816d5f8dde1SJohn Marino str));
8175f2eab64SJohn Marino
818d5f8dde1SJohn Marino memcpy(&subctx, ctx, sizeof(subctx));
819d5f8dde1SJohn Marino subctx.re = str;
820d5f8dde1SJohn Marino subctx.len = sp - str;
821d5f8dde1SJohn Marino subctx.nofirstsub = 1;
822d5f8dde1SJohn Marino subctx.cflags |= REG_EXTENDED; /* Force extended mode for parsing */
823d5f8dde1SJohn Marino status = tre_parse(&subctx);
824d5f8dde1SJohn Marino xfree(str);
825d5f8dde1SJohn Marino if (status != REG_OK)
8265f2eab64SJohn Marino {
827d5f8dde1SJohn Marino xfree(col_syms);
828d5f8dde1SJohn Marino return status;
8295f2eab64SJohn Marino }
830d5f8dde1SJohn Marino ctx->re = col_syms->start;
831d5f8dde1SJohn Marino ctx->position = subctx.position;
832d5f8dde1SJohn Marino xfree(col_syms);
833d5f8dde1SJohn Marino *result = subctx.result;
834d5f8dde1SJohn Marino DPRINT(("tre_parse_bracket: Returning to original string\n"));
835d5f8dde1SJohn Marino return REG_OK;
8365f2eab64SJohn Marino }
8375f2eab64SJohn Marino
838d5f8dde1SJohn Marino DPRINT(("tre_parse_bracket: creating bracket expression literal\n"));
839d5f8dde1SJohn Marino node = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, ctx->position);
8405f2eab64SJohn Marino if (node == NULL)
8415f2eab64SJohn Marino {
8425f2eab64SJohn Marino status = REG_ESPACE;
8435f2eab64SJohn Marino goto parse_bracket_done;
8445f2eab64SJohn Marino }
8455f2eab64SJohn Marino else
8465f2eab64SJohn Marino {
847d5f8dde1SJohn Marino tre_literal_t *l = node->obj;
848d5f8dde1SJohn Marino l->u.bracket_match_list = tre_mem_alloc(ctx->mem,
849d5f8dde1SJohn Marino SIZEOF_BRACKET_MATCH_LIST(items));
850d5f8dde1SJohn Marino if (l->u.bracket_match_list == NULL)
851d5f8dde1SJohn Marino {
8525f2eab64SJohn Marino status = REG_ESPACE;
8535f2eab64SJohn Marino goto parse_bracket_done;
854d5f8dde1SJohn Marino }
855d5f8dde1SJohn Marino memcpy(l->u.bracket_match_list, items, SIZEOF_BRACKET_MATCH_LIST(items));
856d5f8dde1SJohn Marino }
8575f2eab64SJohn Marino
8585f2eab64SJohn Marino #ifdef TRE_DEBUG
859d5f8dde1SJohn Marino {
860d5f8dde1SJohn Marino int i;
861d5f8dde1SJohn Marino tre_bracket_match_t *b;
862d5f8dde1SJohn Marino DPRINT(("tre_parse_bracket: %d bracket match items, flags 0x%x\n",
863d5f8dde1SJohn Marino items->num_bracket_matches, items->flags));
864d5f8dde1SJohn Marino for (i = 0, b = items->bracket_matches;
865d5f8dde1SJohn Marino i < items->num_bracket_matches; i++, b++)
866d5f8dde1SJohn Marino {
867d5f8dde1SJohn Marino DPRINT((" %d: %s %d\n", i, bracket_match_type_str[b->type],
868d5f8dde1SJohn Marino b->value));
869d5f8dde1SJohn Marino }
870d5f8dde1SJohn Marino }
8715f2eab64SJohn Marino #endif /* TRE_DEBUG */
8725f2eab64SJohn Marino
8735f2eab64SJohn Marino parse_bracket_done:
8745f2eab64SJohn Marino xfree(items);
8755f2eab64SJohn Marino ctx->position++;
8765f2eab64SJohn Marino *result = node;
8775f2eab64SJohn Marino return status;
8785f2eab64SJohn Marino }
8795f2eab64SJohn Marino
8805f2eab64SJohn Marino
/* Parses a positive decimal integer starting at *regex and stopping at the
   first non-digit or at regex_end, whichever comes first.  On return *regex
   points just past the last digit consumed (it is unchanged if there were
   no digits).

   Returns the parsed value, or -1 if the string does not start with a
   digit.  Values too large for an int saturate at INT_MAX instead of
   overflowing (signed overflow is undefined behaviour); all digits are
   still consumed so the caller resumes after the whole number. */
static int
tre_parse_int(const tre_char_t **regex, const tre_char_t *regex_end)
{
  int num = -1;
  const tre_char_t *r = *regex;
  while (r < regex_end && *r >= L'0' && *r <= L'9')
    {
      int digit = (int)(*r - L'0');
      if (num < 0)
        num = 0;
      /* num * 10 + digit would exceed INT_MAX: clamp rather than wrap. */
      if (num > (INT_MAX - digit) / 10)
        num = INT_MAX;
      else
        num = num * 10 + digit;
      r++;
    }
  *regex = r;
  return num;
}
8985f2eab64SJohn Marino
8995f2eab64SJohn Marino
9005f2eab64SJohn Marino static reg_errcode_t
tre_parse_bound(tre_parse_ctx_t * ctx,tre_ast_node_t ** result)9015f2eab64SJohn Marino tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
9025f2eab64SJohn Marino {
903d5f8dde1SJohn Marino int min, max;
904d5f8dde1SJohn Marino #ifdef TRE_APPROX
905d5f8dde1SJohn Marino int i;
9065f2eab64SJohn Marino int cost_ins, cost_del, cost_subst, cost_max;
9075f2eab64SJohn Marino int limit_ins, limit_del, limit_subst, limit_err;
9085f2eab64SJohn Marino const tre_char_t *start;
909d5f8dde1SJohn Marino #endif /* TRE_APPROX */
910d5f8dde1SJohn Marino const tre_char_t *r = ctx->re;
9115f2eab64SJohn Marino int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0;
912d5f8dde1SJohn Marino #ifdef TRE_APPROX
9135f2eab64SJohn Marino int approx = 0;
9145f2eab64SJohn Marino int costs_set = 0;
9155f2eab64SJohn Marino int counts_set = 0;
9165f2eab64SJohn Marino
9175f2eab64SJohn Marino cost_ins = cost_del = cost_subst = cost_max = TRE_PARAM_UNSET;
9185f2eab64SJohn Marino limit_ins = limit_del = limit_subst = limit_err = TRE_PARAM_UNSET;
919d5f8dde1SJohn Marino #endif /* TRE_APPROX */
9205f2eab64SJohn Marino
9215f2eab64SJohn Marino /* Parse number (minimum repetition count). */
9225f2eab64SJohn Marino min = -1;
923d5f8dde1SJohn Marino if (r >= ctx->re_end)
924d5f8dde1SJohn Marino #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
925d5f8dde1SJohn Marino return (ctx->cflags & REG_EXTENDED) ? REG_NOMATCH : REG_EBRACE;
926d5f8dde1SJohn Marino #else /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
927d5f8dde1SJohn Marino return REG_EBRACE;
928d5f8dde1SJohn Marino #endif /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
929d5f8dde1SJohn Marino if (*r >= L'0' && *r <= L'9') {
9305f2eab64SJohn Marino DPRINT(("tre_parse: min count: '%.*" STRF "'\n", REST(r)));
9315f2eab64SJohn Marino min = tre_parse_int(&r, ctx->re_end);
9325f2eab64SJohn Marino }
933d5f8dde1SJohn Marino #ifndef TRE_APPROX
934d5f8dde1SJohn Marino else
935d5f8dde1SJohn Marino #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
936d5f8dde1SJohn Marino /* For ERE, return REG_NOMATCH to signal that the lbrace should
937d5f8dde1SJohn Marino be treated as a literal */
938d5f8dde1SJohn Marino return (ctx->cflags & REG_EXTENDED) ? REG_NOMATCH : REG_BADBR;
939d5f8dde1SJohn Marino #else /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
940d5f8dde1SJohn Marino return REG_BADBR;
941d5f8dde1SJohn Marino #endif /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
942d5f8dde1SJohn Marino #endif /* !TRE_APPROX */
9435f2eab64SJohn Marino
9445f2eab64SJohn Marino /* Parse comma and second number (maximum repetition count). */
9455f2eab64SJohn Marino max = min;
9465f2eab64SJohn Marino if (r < ctx->re_end && *r == CHAR_COMMA)
9475f2eab64SJohn Marino {
9485f2eab64SJohn Marino r++;
9495f2eab64SJohn Marino DPRINT(("tre_parse: max count: '%.*" STRF "'\n", REST(r)));
9505f2eab64SJohn Marino max = tre_parse_int(&r, ctx->re_end);
9515f2eab64SJohn Marino }
9525f2eab64SJohn Marino
9535f2eab64SJohn Marino /* Check that the repeat counts are sane. */
954d5f8dde1SJohn Marino if ((max >= 0 && min > max) || min > RE_DUP_MAX || max > RE_DUP_MAX)
9555f2eab64SJohn Marino return REG_BADBR;
9565f2eab64SJohn Marino
9575f2eab64SJohn Marino
958d5f8dde1SJohn Marino #ifdef TRE_APPROX
9595f2eab64SJohn Marino /*
9605f2eab64SJohn Marino '{'
9615f2eab64SJohn Marino optionally followed immediately by a number == minimum repcount
9625f2eab64SJohn Marino optionally followed by , then a number == maximum repcount
9635f2eab64SJohn Marino + then a number == maximum insertion count
9645f2eab64SJohn Marino - then a number == maximum deletion count
9655f2eab64SJohn Marino # then a number == maximum substitution count
9665f2eab64SJohn Marino ~ then a number == maximum number of errors
9675f2eab64SJohn Marino Any of +, -, # or ~ without followed by a number means that
9685f2eab64SJohn Marino the maximum count/number of errors is infinite.
9695f2eab64SJohn Marino
9705f2eab64SJohn Marino An equation of the form
9715f2eab64SJohn Marino Xi + Yd + Zs < C
9725f2eab64SJohn Marino can be specified to set costs and the cost limit to a value
9735f2eab64SJohn Marino different from the default value:
9745f2eab64SJohn Marino - X is the cost of an insertion
9755f2eab64SJohn Marino - Y is the cost of a deletion
9765f2eab64SJohn Marino - Z is the cost of a substitution
9775f2eab64SJohn Marino - C is the maximum cost
9785f2eab64SJohn Marino
9795f2eab64SJohn Marino If no count limit or cost is set for an operation, the operation
9805f2eab64SJohn Marino is not allowed at all.
9815f2eab64SJohn Marino */
9825f2eab64SJohn Marino
9835f2eab64SJohn Marino
9845f2eab64SJohn Marino do {
9855f2eab64SJohn Marino int done;
9865f2eab64SJohn Marino start = r;
9875f2eab64SJohn Marino
9885f2eab64SJohn Marino /* Parse count limit settings */
9895f2eab64SJohn Marino done = 0;
9905f2eab64SJohn Marino if (!counts_set)
9915f2eab64SJohn Marino while (r + 1 < ctx->re_end && !done)
9925f2eab64SJohn Marino {
9935f2eab64SJohn Marino switch (*r)
9945f2eab64SJohn Marino {
9955f2eab64SJohn Marino case CHAR_PLUS: /* Insert limit */
9965f2eab64SJohn Marino DPRINT(("tre_parse: ins limit: '%.*" STRF "'\n", REST(r)));
9975f2eab64SJohn Marino r++;
9985f2eab64SJohn Marino limit_ins = tre_parse_int(&r, ctx->re_end);
9995f2eab64SJohn Marino if (limit_ins < 0)
10005f2eab64SJohn Marino limit_ins = INT_MAX;
10015f2eab64SJohn Marino counts_set = 1;
10025f2eab64SJohn Marino break;
10035f2eab64SJohn Marino case CHAR_MINUS: /* Delete limit */
10045f2eab64SJohn Marino DPRINT(("tre_parse: del limit: '%.*" STRF "'\n", REST(r)));
10055f2eab64SJohn Marino r++;
10065f2eab64SJohn Marino limit_del = tre_parse_int(&r, ctx->re_end);
10075f2eab64SJohn Marino if (limit_del < 0)
10085f2eab64SJohn Marino limit_del = INT_MAX;
10095f2eab64SJohn Marino counts_set = 1;
10105f2eab64SJohn Marino break;
10115f2eab64SJohn Marino case CHAR_HASH: /* Substitute limit */
10125f2eab64SJohn Marino DPRINT(("tre_parse: subst limit: '%.*" STRF "'\n", REST(r)));
10135f2eab64SJohn Marino r++;
10145f2eab64SJohn Marino limit_subst = tre_parse_int(&r, ctx->re_end);
10155f2eab64SJohn Marino if (limit_subst < 0)
10165f2eab64SJohn Marino limit_subst = INT_MAX;
10175f2eab64SJohn Marino counts_set = 1;
10185f2eab64SJohn Marino break;
10195f2eab64SJohn Marino case CHAR_TILDE: /* Maximum number of changes */
10205f2eab64SJohn Marino DPRINT(("tre_parse: count limit: '%.*" STRF "'\n", REST(r)));
10215f2eab64SJohn Marino r++;
10225f2eab64SJohn Marino limit_err = tre_parse_int(&r, ctx->re_end);
10235f2eab64SJohn Marino if (limit_err < 0)
10245f2eab64SJohn Marino limit_err = INT_MAX;
10255f2eab64SJohn Marino approx = 1;
10265f2eab64SJohn Marino break;
10275f2eab64SJohn Marino case CHAR_COMMA:
10285f2eab64SJohn Marino r++;
10295f2eab64SJohn Marino break;
10305f2eab64SJohn Marino case L' ':
10315f2eab64SJohn Marino r++;
10325f2eab64SJohn Marino break;
10335f2eab64SJohn Marino case L'}':
10345f2eab64SJohn Marino done = 1;
10355f2eab64SJohn Marino break;
10365f2eab64SJohn Marino default:
10375f2eab64SJohn Marino done = 1;
10385f2eab64SJohn Marino break;
10395f2eab64SJohn Marino }
10405f2eab64SJohn Marino }
10415f2eab64SJohn Marino
10425f2eab64SJohn Marino /* Parse cost restriction equation. */
10435f2eab64SJohn Marino done = 0;
10445f2eab64SJohn Marino if (!costs_set)
10455f2eab64SJohn Marino while (r + 1 < ctx->re_end && !done)
10465f2eab64SJohn Marino {
10475f2eab64SJohn Marino switch (*r)
10485f2eab64SJohn Marino {
10495f2eab64SJohn Marino case CHAR_PLUS:
10505f2eab64SJohn Marino case L' ':
10515f2eab64SJohn Marino r++;
10525f2eab64SJohn Marino break;
10535f2eab64SJohn Marino case L'<':
10545f2eab64SJohn Marino DPRINT(("tre_parse: max cost: '%.*" STRF "'\n", REST(r)));
10555f2eab64SJohn Marino r++;
10565f2eab64SJohn Marino while (*r == L' ')
10575f2eab64SJohn Marino r++;
10585f2eab64SJohn Marino cost_max = tre_parse_int(&r, ctx->re_end);
10595f2eab64SJohn Marino if (cost_max < 0)
10605f2eab64SJohn Marino cost_max = INT_MAX;
10615f2eab64SJohn Marino else
10625f2eab64SJohn Marino cost_max--;
10635f2eab64SJohn Marino approx = 1;
10645f2eab64SJohn Marino break;
10655f2eab64SJohn Marino case CHAR_COMMA:
10665f2eab64SJohn Marino r++;
10675f2eab64SJohn Marino done = 1;
10685f2eab64SJohn Marino break;
10695f2eab64SJohn Marino default:
10705f2eab64SJohn Marino if (*r >= L'0' && *r <= L'9')
10715f2eab64SJohn Marino {
10725f2eab64SJohn Marino #ifdef TRE_DEBUG
10735f2eab64SJohn Marino const tre_char_t *sr = r;
10745f2eab64SJohn Marino #endif /* TRE_DEBUG */
10755f2eab64SJohn Marino int cost = tre_parse_int(&r, ctx->re_end);
10765f2eab64SJohn Marino /* XXX - make sure r is not past end. */
10775f2eab64SJohn Marino switch (*r)
10785f2eab64SJohn Marino {
10795f2eab64SJohn Marino case L'i': /* Insert cost */
10805f2eab64SJohn Marino DPRINT(("tre_parse: ins cost: '%.*" STRF "'\n",
10815f2eab64SJohn Marino REST(sr)));
10825f2eab64SJohn Marino r++;
10835f2eab64SJohn Marino cost_ins = cost;
10845f2eab64SJohn Marino costs_set = 1;
10855f2eab64SJohn Marino break;
10865f2eab64SJohn Marino case L'd': /* Delete cost */
10875f2eab64SJohn Marino DPRINT(("tre_parse: del cost: '%.*" STRF "'\n",
10885f2eab64SJohn Marino REST(sr)));
10895f2eab64SJohn Marino r++;
10905f2eab64SJohn Marino cost_del = cost;
10915f2eab64SJohn Marino costs_set = 1;
10925f2eab64SJohn Marino break;
10935f2eab64SJohn Marino case L's': /* Substitute cost */
10945f2eab64SJohn Marino DPRINT(("tre_parse: subst cost: '%.*" STRF "'\n",
10955f2eab64SJohn Marino REST(sr)));
10965f2eab64SJohn Marino r++;
10975f2eab64SJohn Marino cost_subst = cost;
10985f2eab64SJohn Marino costs_set = 1;
10995f2eab64SJohn Marino break;
11005f2eab64SJohn Marino default:
11015f2eab64SJohn Marino return REG_BADBR;
11025f2eab64SJohn Marino }
11035f2eab64SJohn Marino }
11045f2eab64SJohn Marino else
11055f2eab64SJohn Marino {
11065f2eab64SJohn Marino done = 1;
11075f2eab64SJohn Marino break;
11085f2eab64SJohn Marino }
11095f2eab64SJohn Marino }
11105f2eab64SJohn Marino }
11115f2eab64SJohn Marino } while (start != r);
1112d5f8dde1SJohn Marino #endif /* TRE_APPROX */
11135f2eab64SJohn Marino
1114d5f8dde1SJohn Marino /*{*//* Missing }. */
11155f2eab64SJohn Marino if (r >= ctx->re_end)
11165f2eab64SJohn Marino return REG_EBRACE;
11175f2eab64SJohn Marino
11185f2eab64SJohn Marino /* Empty contents of {}. */
11195f2eab64SJohn Marino if (r == ctx->re)
11205f2eab64SJohn Marino return REG_BADBR;
11215f2eab64SJohn Marino
11225f2eab64SJohn Marino /* Parse the ending '}' or '\}'.*/
11235f2eab64SJohn Marino if (ctx->cflags & REG_EXTENDED)
11245f2eab64SJohn Marino {
11255f2eab64SJohn Marino if (r >= ctx->re_end || *r != CHAR_RBRACE)
11265f2eab64SJohn Marino return REG_BADBR;
11275f2eab64SJohn Marino r++;
1128d5f8dde1SJohn Marino /* Parse trailing '?' marking minimal repetition. */
1129d5f8dde1SJohn Marino if (r < ctx->re_end)
1130d5f8dde1SJohn Marino {
1131d5f8dde1SJohn Marino if (*r == CHAR_QUESTIONMARK)
1132d5f8dde1SJohn Marino {
1133d5f8dde1SJohn Marino /* Process the question mark only in enhanced mode.
1134d5f8dde1SJohn Marino Otherwise, the question mark is an error in ERE
1135d5f8dde1SJohn Marino or a literal in BRE */
1136d5f8dde1SJohn Marino if (ctx->cflags & REG_ENHANCED)
1137d5f8dde1SJohn Marino {
1138d5f8dde1SJohn Marino minimal = !(ctx->cflags & REG_UNGREEDY);
1139d5f8dde1SJohn Marino r++;
1140d5f8dde1SJohn Marino }
1141d5f8dde1SJohn Marino else return REG_BADRPT;
1142d5f8dde1SJohn Marino }
1143d5f8dde1SJohn Marino else if (*r == CHAR_STAR || *r == CHAR_PLUS)
1144d5f8dde1SJohn Marino {
1145d5f8dde1SJohn Marino /* These are reserved for future extensions. */
1146d5f8dde1SJohn Marino return REG_BADRPT;
1147d5f8dde1SJohn Marino }
1148d5f8dde1SJohn Marino }
11495f2eab64SJohn Marino }
11505f2eab64SJohn Marino else
11515f2eab64SJohn Marino {
11525f2eab64SJohn Marino if (r + 1 >= ctx->re_end
11535f2eab64SJohn Marino || *r != CHAR_BACKSLASH
11545f2eab64SJohn Marino || *(r + 1) != CHAR_RBRACE)
11555f2eab64SJohn Marino return REG_BADBR;
11565f2eab64SJohn Marino r += 2;
1157d5f8dde1SJohn Marino if (r < ctx->re_end && *r == CHAR_STAR)
11585f2eab64SJohn Marino {
1159d5f8dde1SJohn Marino /* This is reserved for future extensions. */
11605f2eab64SJohn Marino return REG_BADRPT;
11615f2eab64SJohn Marino }
11625f2eab64SJohn Marino }
11635f2eab64SJohn Marino
1164d5f8dde1SJohn Marino if (minimal)
1165d5f8dde1SJohn Marino ctx->num_reorder_tags++;
1166d5f8dde1SJohn Marino
1167d5f8dde1SJohn Marino if (!result) goto parse_bound_exit;
11685f2eab64SJohn Marino /* Create the AST node(s). */
1169d5f8dde1SJohn Marino /* Originally, if min == 0 && max == 0, we immediately replace the whole
1170d5f8dde1SJohn Marino iteration with EMPTY. This unfortunately drops any submatches, and
1171d5f8dde1SJohn Marino messes up setting the pmatch values (we can get tags of -1, and
1172d5f8dde1SJohn Marino tag values in the billions). So we leave it and process this case as
1173d5f8dde1SJohn Marino usual, and wait until tre_expand_ast() to replace with EMPTY */
1174d5f8dde1SJohn Marino #ifdef TRE_APPROX
11755f2eab64SJohn Marino if (min < 0 && max < 0)
11765f2eab64SJohn Marino /* Only approximate parameters set, no repetitions. */
11775f2eab64SJohn Marino min = max = 1;
1178d5f8dde1SJohn Marino #endif /* TRE_APPROX */
11795f2eab64SJohn Marino
11805f2eab64SJohn Marino *result = tre_ast_new_iter(ctx->mem, *result, min, max, minimal);
11815f2eab64SJohn Marino if (!*result)
11825f2eab64SJohn Marino return REG_ESPACE;
11835f2eab64SJohn Marino
1184d5f8dde1SJohn Marino #ifdef TRE_APPROX
11855f2eab64SJohn Marino /* If approximate matching parameters are set, add them to the
11865f2eab64SJohn Marino iteration node. */
11875f2eab64SJohn Marino if (approx || costs_set || counts_set)
11885f2eab64SJohn Marino {
11895f2eab64SJohn Marino int *params;
11905f2eab64SJohn Marino tre_iteration_t *iter = (*result)->obj;
11915f2eab64SJohn Marino
11925f2eab64SJohn Marino if (costs_set || counts_set)
11935f2eab64SJohn Marino {
11945f2eab64SJohn Marino if (limit_ins == TRE_PARAM_UNSET)
11955f2eab64SJohn Marino {
11965f2eab64SJohn Marino if (cost_ins == TRE_PARAM_UNSET)
11975f2eab64SJohn Marino limit_ins = 0;
11985f2eab64SJohn Marino else
11995f2eab64SJohn Marino limit_ins = INT_MAX;
12005f2eab64SJohn Marino }
12015f2eab64SJohn Marino
12025f2eab64SJohn Marino if (limit_del == TRE_PARAM_UNSET)
12035f2eab64SJohn Marino {
12045f2eab64SJohn Marino if (cost_del == TRE_PARAM_UNSET)
12055f2eab64SJohn Marino limit_del = 0;
12065f2eab64SJohn Marino else
12075f2eab64SJohn Marino limit_del = INT_MAX;
12085f2eab64SJohn Marino }
12095f2eab64SJohn Marino
12105f2eab64SJohn Marino if (limit_subst == TRE_PARAM_UNSET)
12115f2eab64SJohn Marino {
12125f2eab64SJohn Marino if (cost_subst == TRE_PARAM_UNSET)
12135f2eab64SJohn Marino limit_subst = 0;
12145f2eab64SJohn Marino else
12155f2eab64SJohn Marino limit_subst = INT_MAX;
12165f2eab64SJohn Marino }
12175f2eab64SJohn Marino }
12185f2eab64SJohn Marino
12195f2eab64SJohn Marino if (cost_max == TRE_PARAM_UNSET)
12205f2eab64SJohn Marino cost_max = INT_MAX;
12215f2eab64SJohn Marino if (limit_err == TRE_PARAM_UNSET)
12225f2eab64SJohn Marino limit_err = INT_MAX;
12235f2eab64SJohn Marino
12245f2eab64SJohn Marino ctx->have_approx = 1;
12255f2eab64SJohn Marino params = tre_mem_alloc(ctx->mem, sizeof(*params) * TRE_PARAM_LAST);
12265f2eab64SJohn Marino if (!params)
12275f2eab64SJohn Marino return REG_ESPACE;
12285f2eab64SJohn Marino for (i = 0; i < TRE_PARAM_LAST; i++)
12295f2eab64SJohn Marino params[i] = TRE_PARAM_UNSET;
12305f2eab64SJohn Marino params[TRE_PARAM_COST_INS] = cost_ins;
12315f2eab64SJohn Marino params[TRE_PARAM_COST_DEL] = cost_del;
12325f2eab64SJohn Marino params[TRE_PARAM_COST_SUBST] = cost_subst;
12335f2eab64SJohn Marino params[TRE_PARAM_COST_MAX] = cost_max;
12345f2eab64SJohn Marino params[TRE_PARAM_MAX_INS] = limit_ins;
12355f2eab64SJohn Marino params[TRE_PARAM_MAX_DEL] = limit_del;
12365f2eab64SJohn Marino params[TRE_PARAM_MAX_SUBST] = limit_subst;
12375f2eab64SJohn Marino params[TRE_PARAM_MAX_ERR] = limit_err;
12385f2eab64SJohn Marino iter->params = params;
12395f2eab64SJohn Marino }
1240d5f8dde1SJohn Marino #endif /* TRE_APPROX */
12415f2eab64SJohn Marino
1242d5f8dde1SJohn Marino parse_bound_exit:
1243d5f8dde1SJohn Marino #ifdef TRE_APPROX
12445f2eab64SJohn Marino DPRINT(("tre_parse_bound: min %d, max %d, costs [%d,%d,%d, total %d], "
12455f2eab64SJohn Marino "limits [%d,%d,%d, total %d]\n",
12465f2eab64SJohn Marino min, max, cost_ins, cost_del, cost_subst, cost_max,
12475f2eab64SJohn Marino limit_ins, limit_del, limit_subst, limit_err));
1248d5f8dde1SJohn Marino #else /* !TRE_APPROX */
1249d5f8dde1SJohn Marino DPRINT(("tre_parse_bound: min %d, max %d\n", min, max));
1250d5f8dde1SJohn Marino #endif /* !TRE_APPROX */
12515f2eab64SJohn Marino
12525f2eab64SJohn Marino
12535f2eab64SJohn Marino ctx->re = r;
12545f2eab64SJohn Marino return REG_OK;
12555f2eab64SJohn Marino }
12565f2eab64SJohn Marino
1257d5f8dde1SJohn Marino /* Previously, we had PARSE_RESTORE_CFLAGS restore the cflags, but for
1258d5f8dde1SJohn Marino non-self-contained options, like (?i), this causes ((?i)fu)bar to be
1259d5f8dde1SJohn Marino treated more like ((?i)fu(?-i)bar), so the pmatch value is incorrect.
1260d5f8dde1SJohn Marino Because we now set up tags for even non-capturing parenthesized
1261d5f8dde1SJohn Marino subexpressions, we always call PARSE_MARK_FOR_SUBMATCH. So if we
1262d5f8dde1SJohn Marino pass the unmodified version of cflags to PARSE_MARK_FOR_SUBMATCH and
1263d5f8dde1SJohn Marino have it restore cflags after the subexpression, we don't need to have
1264d5f8dde1SJohn Marino a separate PARSE_RESTORE_CFLAGS, and then after processing the
1265d5f8dde1SJohn Marino non-self-contained option, we can call PARSE_ATOM instead of PARSE_RE.
1266d5f8dde1SJohn Marino This has the side-benefit of now matching the perl behavior: the RE
1267d5f8dde1SJohn Marino foo(?i)bar|zap is foo(?i)bar OR (?i)zap instead of TRE previous behavior
1268d5f8dde1SJohn Marino of foo AND (?i) (bar OR zap). */
/* Symbols pushed on the explicit parse stack used by tre_parse().  Each
   symbol names the parsing step that runs when it is popped. */
typedef enum {
  PARSE_RE                = 0, /* a full regexp: branches separated by `|' */
  PARSE_ATOM              = 1,
  PARSE_MARK_FOR_SUBMATCH = 2,
  PARSE_BRANCH            = 3, /* a branch: one or more concatenated pieces */
  PARSE_PIECE             = 4, /* a piece: an atom plus postfix operators */
  PARSE_CATENATION        = 5, /* parse another piece if input remains */
  PARSE_POST_CATENATION   = 6,
  PARSE_UNION             = 7,
  PARSE_POST_UNION        = 8,
  PARSE_POSTFIX           = 9
} tre_parse_re_stack_symbol_t;
12815f2eab64SJohn Marino
12825f2eab64SJohn Marino
12835f2eab64SJohn Marino reg_errcode_t
tre_parse(tre_parse_ctx_t * ctx)12845f2eab64SJohn Marino tre_parse(tre_parse_ctx_t *ctx)
12855f2eab64SJohn Marino {
12865f2eab64SJohn Marino tre_ast_node_t *result = NULL;
12875f2eab64SJohn Marino tre_parse_re_stack_symbol_t symbol;
12885f2eab64SJohn Marino reg_errcode_t status = REG_OK;
12895f2eab64SJohn Marino tre_stack_t *stack = ctx->stack;
12905f2eab64SJohn Marino int bottom = tre_stack_num_objects(stack);
12915f2eab64SJohn Marino int depth = 0;
12925f2eab64SJohn Marino int temporary_cflags = 0;
1293d5f8dde1SJohn Marino int bre_branch_begin;
1294d5f8dde1SJohn Marino #ifdef TRE_DEBUG
1295d5f8dde1SJohn Marino const tre_char_t *tmp_re;
1296d5f8dde1SJohn Marino #endif
12975f2eab64SJohn Marino
1298d5f8dde1SJohn Marino DPRINT(("tre_parse: parsing '%.*" STRF "', len = %d cflags = 0%o\n",
1299d5f8dde1SJohn Marino ctx->len, ctx->re, ctx->len, ctx->cflags));
13005f2eab64SJohn Marino
1301d5f8dde1SJohn Marino if (ctx->len <= 0) return REG_EMPTY;
13025f2eab64SJohn Marino if (!ctx->nofirstsub)
13035f2eab64SJohn Marino {
1304d5f8dde1SJohn Marino STACK_PUSH(stack, int, ctx->cflags);
13055f2eab64SJohn Marino STACK_PUSH(stack, int, ctx->submatch_id);
13065f2eab64SJohn Marino STACK_PUSH(stack, int, PARSE_MARK_FOR_SUBMATCH);
13075f2eab64SJohn Marino ctx->submatch_id++;
13085f2eab64SJohn Marino }
1309d5f8dde1SJohn Marino STACK_PUSH(stack, int, 0); // bre_branch_begin
13105f2eab64SJohn Marino STACK_PUSH(stack, int, PARSE_RE);
13115f2eab64SJohn Marino ctx->re_start = ctx->re;
13125f2eab64SJohn Marino ctx->re_end = ctx->re + ctx->len;
13135f2eab64SJohn Marino
13145f2eab64SJohn Marino
13155f2eab64SJohn Marino /* The following is basically just a recursive descent parser. I use
13165f2eab64SJohn Marino an explicit stack instead of recursive functions mostly because of
13175f2eab64SJohn Marino two reasons: compatibility with systems which have an overflowable
13185f2eab64SJohn Marino call stack, and efficiency (both in lines of code and speed). */
1319d5f8dde1SJohn Marino while (tre_stack_num_objects(stack) > bottom)
13205f2eab64SJohn Marino {
13215f2eab64SJohn Marino symbol = tre_stack_pop_int(stack);
13225f2eab64SJohn Marino switch (symbol)
13235f2eab64SJohn Marino {
13245f2eab64SJohn Marino case PARSE_RE:
13255f2eab64SJohn Marino /* Parse a full regexp. A regexp is one or more branches,
13265f2eab64SJohn Marino separated by the union operator `|'. */
1327d5f8dde1SJohn Marino bre_branch_begin = tre_stack_pop_int(stack);
1328d5f8dde1SJohn Marino if (
13295f2eab64SJohn Marino #ifdef REG_LITERAL
1330d5f8dde1SJohn Marino !(ctx->cflags & REG_LITERAL) &&
13315f2eab64SJohn Marino #endif /* REG_LITERAL */
1332d5f8dde1SJohn Marino ctx->cflags & (REG_EXTENDED | REG_ENHANCED))
13335f2eab64SJohn Marino STACK_PUSHX(stack, int, PARSE_UNION);
1334d5f8dde1SJohn Marino STACK_PUSHX(stack, int, bre_branch_begin);
13355f2eab64SJohn Marino STACK_PUSHX(stack, int, PARSE_BRANCH);
13365f2eab64SJohn Marino break;
13375f2eab64SJohn Marino
13385f2eab64SJohn Marino case PARSE_BRANCH:
13395f2eab64SJohn Marino /* Parse a branch. A branch is one or more pieces, concatenated.
13405f2eab64SJohn Marino A piece is an atom possibly followed by a postfix operator. */
1341d5f8dde1SJohn Marino bre_branch_begin = tre_stack_pop_int(stack);
13425f2eab64SJohn Marino STACK_PUSHX(stack, int, PARSE_CATENATION);
1343d5f8dde1SJohn Marino STACK_PUSHX(stack, int, bre_branch_begin);
13445f2eab64SJohn Marino STACK_PUSHX(stack, int, PARSE_PIECE);
13455f2eab64SJohn Marino break;
13465f2eab64SJohn Marino
13475f2eab64SJohn Marino case PARSE_PIECE:
13485f2eab64SJohn Marino /* Parse a piece. A piece is an atom possibly followed by one
13495f2eab64SJohn Marino or more postfix operators. */
1350d5f8dde1SJohn Marino bre_branch_begin = tre_stack_pop_int(stack);
13515f2eab64SJohn Marino STACK_PUSHX(stack, int, PARSE_POSTFIX);
1352d5f8dde1SJohn Marino STACK_PUSHX(stack, int, bre_branch_begin);
13535f2eab64SJohn Marino STACK_PUSHX(stack, int, PARSE_ATOM);
13545f2eab64SJohn Marino break;
13555f2eab64SJohn Marino
13565f2eab64SJohn Marino case PARSE_CATENATION:
13575f2eab64SJohn Marino /* If the expression has not ended, parse another piece. */
13585f2eab64SJohn Marino {
13595f2eab64SJohn Marino tre_char_t c;
13605f2eab64SJohn Marino if (ctx->re >= ctx->re_end)
13615f2eab64SJohn Marino break;
13625f2eab64SJohn Marino c = *ctx->re;
13635f2eab64SJohn Marino #ifdef REG_LITERAL
13645f2eab64SJohn Marino if (!(ctx->cflags & REG_LITERAL))
13655f2eab64SJohn Marino {
13665f2eab64SJohn Marino #endif /* REG_LITERAL */
1367d5f8dde1SJohn Marino if ((ctx->cflags & REG_EXTENDED && c == CHAR_PIPE) ||
1368d5f8dde1SJohn Marino ((ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) == REG_ENHANCED
1369d5f8dde1SJohn Marino && ctx->re + 1 < ctx->re_end && c == CHAR_BACKSLASH &&
1370d5f8dde1SJohn Marino *(ctx->re + 1) == CHAR_PIPE))
13715f2eab64SJohn Marino break;
13725f2eab64SJohn Marino if ((ctx->cflags & REG_EXTENDED
13735f2eab64SJohn Marino && c == CHAR_RPAREN && depth > 0)
13745f2eab64SJohn Marino || (!(ctx->cflags & REG_EXTENDED)
1375d5f8dde1SJohn Marino && ctx->re + 1 < ctx->re_end && c == CHAR_BACKSLASH
1376d5f8dde1SJohn Marino && *(ctx->re + 1) == CHAR_RPAREN))
13775f2eab64SJohn Marino {
13785f2eab64SJohn Marino if (!(ctx->cflags & REG_EXTENDED) && depth == 0)
1379d5f8dde1SJohn Marino return REG_EPAREN;
13805f2eab64SJohn Marino DPRINT(("tre_parse: group end: '%.*" STRF "'\n",
13815f2eab64SJohn Marino REST(ctx->re)));
13825f2eab64SJohn Marino depth--;
1383d5f8dde1SJohn Marino if (!(ctx->cflags & (REG_EXTENDED | REG_ENHANCED)))
13845f2eab64SJohn Marino ctx->re += 2;
13855f2eab64SJohn Marino break;
13865f2eab64SJohn Marino }
13875f2eab64SJohn Marino #ifdef REG_LITERAL
13885f2eab64SJohn Marino }
13895f2eab64SJohn Marino #endif /* REG_LITERAL */
13905f2eab64SJohn Marino
1391d5f8dde1SJohn Marino #ifdef REG_LEFT_ASSOC
1392d5f8dde1SJohn Marino if (ctx->cflags & REG_LEFT_ASSOC)
13935f2eab64SJohn Marino {
1394d5f8dde1SJohn Marino /* Left associative concatenation. */
1395d5f8dde1SJohn Marino STACK_PUSHX(stack, int, PARSE_CATENATION);
13965f2eab64SJohn Marino STACK_PUSHX(stack, voidptr, result);
13975f2eab64SJohn Marino STACK_PUSHX(stack, int, PARSE_POST_CATENATION);
1398d5f8dde1SJohn Marino STACK_PUSHX(stack, int, 0); // bre_branch_begin
13995f2eab64SJohn Marino STACK_PUSHX(stack, int, PARSE_PIECE);
14005f2eab64SJohn Marino }
14015f2eab64SJohn Marino else
1402d5f8dde1SJohn Marino #endif /* REG_LEFT_ASSOC */
14035f2eab64SJohn Marino {
1404d5f8dde1SJohn Marino /* Default case, right associative concatenation. */
14055f2eab64SJohn Marino STACK_PUSHX(stack, voidptr, result);
14065f2eab64SJohn Marino STACK_PUSHX(stack, int, PARSE_POST_CATENATION);
1407d5f8dde1SJohn Marino STACK_PUSHX(stack, int, PARSE_CATENATION);
1408d5f8dde1SJohn Marino STACK_PUSHX(stack, int, 0); // bre_branch_begin
14095f2eab64SJohn Marino STACK_PUSHX(stack, int, PARSE_PIECE);
14105f2eab64SJohn Marino }
14115f2eab64SJohn Marino break;
14125f2eab64SJohn Marino }
14135f2eab64SJohn Marino
14145f2eab64SJohn Marino case PARSE_POST_CATENATION:
14155f2eab64SJohn Marino {
14165f2eab64SJohn Marino tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
14175f2eab64SJohn Marino tre_ast_node_t *tmp_node;
14185f2eab64SJohn Marino tmp_node = tre_ast_new_catenation(ctx->mem, tree, result);
14195f2eab64SJohn Marino if (!tmp_node)
14205f2eab64SJohn Marino return REG_ESPACE;
14215f2eab64SJohn Marino result = tmp_node;
14225f2eab64SJohn Marino break;
14235f2eab64SJohn Marino }
14245f2eab64SJohn Marino
14255f2eab64SJohn Marino case PARSE_UNION:
14265f2eab64SJohn Marino if (ctx->re >= ctx->re_end)
14275f2eab64SJohn Marino break;
14285f2eab64SJohn Marino #ifdef REG_LITERAL
14295f2eab64SJohn Marino if (ctx->cflags & REG_LITERAL)
14305f2eab64SJohn Marino break;
14315f2eab64SJohn Marino #endif /* REG_LITERAL */
1432d5f8dde1SJohn Marino if (!(ctx->cflags & REG_EXTENDED))
1433d5f8dde1SJohn Marino {
1434d5f8dde1SJohn Marino if (*ctx->re != CHAR_BACKSLASH || ctx->re + 1 >= ctx->re_end)
1435d5f8dde1SJohn Marino break;
1436d5f8dde1SJohn Marino ctx->re++;
1437d5f8dde1SJohn Marino }
14385f2eab64SJohn Marino switch (*ctx->re)
14395f2eab64SJohn Marino {
14405f2eab64SJohn Marino case CHAR_PIPE:
14415f2eab64SJohn Marino DPRINT(("tre_parse: union: '%.*" STRF "'\n",
14425f2eab64SJohn Marino REST(ctx->re)));
14435f2eab64SJohn Marino STACK_PUSHX(stack, int, PARSE_UNION);
1444d5f8dde1SJohn Marino STACK_PUSHX(stack, voidptr, (void *)ctx->re);
14455f2eab64SJohn Marino STACK_PUSHX(stack, voidptr, result);
14465f2eab64SJohn Marino STACK_PUSHX(stack, int, PARSE_POST_UNION);
1447d5f8dde1SJohn Marino /* We need to pass a boolean (eventually) to PARSE_ATOM to
1448d5f8dde1SJohn Marino indicate if this is the beginning of a BRE extended branch. */
1449d5f8dde1SJohn Marino STACK_PUSHX(stack, int, (ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) == REG_ENHANCED); // bre_branch_begin
14505f2eab64SJohn Marino STACK_PUSHX(stack, int, PARSE_BRANCH);
14515f2eab64SJohn Marino ctx->re++;
14525f2eab64SJohn Marino break;
14535f2eab64SJohn Marino
14545f2eab64SJohn Marino case CHAR_RPAREN:
14555f2eab64SJohn Marino ctx->re++;
14565f2eab64SJohn Marino break;
14575f2eab64SJohn Marino
14585f2eab64SJohn Marino default:
1459d5f8dde1SJohn Marino if (!(ctx->cflags & REG_EXTENDED))
1460d5f8dde1SJohn Marino ctx->re--;
14615f2eab64SJohn Marino break;
14625f2eab64SJohn Marino }
14635f2eab64SJohn Marino break;
14645f2eab64SJohn Marino
14655f2eab64SJohn Marino case PARSE_POST_UNION:
14665f2eab64SJohn Marino {
14675f2eab64SJohn Marino tre_ast_node_t *tmp_node;
14685f2eab64SJohn Marino tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1469d5f8dde1SJohn Marino const tre_char_t *pipechar = tre_stack_pop_voidptr(stack);
1470d5f8dde1SJohn Marino /* error on empty expression at end of union */
1471d5f8dde1SJohn Marino if (pipechar == ctx->re - 1)
1472d5f8dde1SJohn Marino {
1473d5f8dde1SJohn Marino return REG_EMPTY;
1474d5f8dde1SJohn Marino }
14755f2eab64SJohn Marino tmp_node = tre_ast_new_union(ctx->mem, tree, result);
14765f2eab64SJohn Marino if (!tmp_node)
14775f2eab64SJohn Marino return REG_ESPACE;
14785f2eab64SJohn Marino result = tmp_node;
14795f2eab64SJohn Marino break;
14805f2eab64SJohn Marino }
14815f2eab64SJohn Marino
14825f2eab64SJohn Marino case PARSE_POSTFIX:
14835f2eab64SJohn Marino /* Parse postfix operators. */
14845f2eab64SJohn Marino if (ctx->re >= ctx->re_end)
14855f2eab64SJohn Marino break;
14865f2eab64SJohn Marino #ifdef REG_LITERAL
14875f2eab64SJohn Marino if (ctx->cflags & REG_LITERAL)
14885f2eab64SJohn Marino break;
14895f2eab64SJohn Marino #endif /* REG_LITERAL */
1490d5f8dde1SJohn Marino int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0;
1491d5f8dde1SJohn Marino int rep_min = 0;
1492d5f8dde1SJohn Marino int rep_max = -1;
1493d5f8dde1SJohn Marino #ifdef TRE_DEBUG
1494d5f8dde1SJohn Marino int lbrace_off;
1495d5f8dde1SJohn Marino #endif
14965f2eab64SJohn Marino switch (*ctx->re)
14975f2eab64SJohn Marino {
14985f2eab64SJohn Marino case CHAR_PLUS:
14995f2eab64SJohn Marino case CHAR_QUESTIONMARK:
15005f2eab64SJohn Marino if (!(ctx->cflags & REG_EXTENDED))
15015f2eab64SJohn Marino break;
15025f2eab64SJohn Marino /*FALLTHROUGH*/
15035f2eab64SJohn Marino case CHAR_STAR:
15045f2eab64SJohn Marino {
15055f2eab64SJohn Marino tre_ast_node_t *tmp_node;
15065f2eab64SJohn Marino #ifdef TRE_DEBUG
1507d5f8dde1SJohn Marino const char *tstr = "star";
15085f2eab64SJohn Marino tmp_re = ctx->re;
15095f2eab64SJohn Marino #endif
15105f2eab64SJohn Marino
1511d5f8dde1SJohn Marino handle_plus_or_question:
1512d5f8dde1SJohn Marino /* error on iteration of raw assertion (not in subexpression) */
1513d5f8dde1SJohn Marino if (result->type == LITERAL && result->submatch_id < 0 &&
1514d5f8dde1SJohn Marino IS_ASSERTION((tre_literal_t *)result->obj))
1515d5f8dde1SJohn Marino {
1516d5f8dde1SJohn Marino if (!(ctx->cflags & REG_EXTENDED)) break;
1517d5f8dde1SJohn Marino return REG_BADRPT;
1518d5f8dde1SJohn Marino }
1519d5f8dde1SJohn Marino if (*ctx->re == CHAR_PLUS)
1520d5f8dde1SJohn Marino {
1521d5f8dde1SJohn Marino rep_min = 1;
1522d5f8dde1SJohn Marino #ifdef TRE_DEBUG
1523d5f8dde1SJohn Marino tstr = "plus";
1524d5f8dde1SJohn Marino #endif
1525d5f8dde1SJohn Marino }
1526d5f8dde1SJohn Marino if (*ctx->re == CHAR_QUESTIONMARK)
1527d5f8dde1SJohn Marino {
1528d5f8dde1SJohn Marino rep_max = 1;
1529d5f8dde1SJohn Marino #ifdef TRE_DEBUG
1530d5f8dde1SJohn Marino tstr = "questionmark";
1531d5f8dde1SJohn Marino #endif
1532d5f8dde1SJohn Marino }
1533d5f8dde1SJohn Marino
1534d5f8dde1SJohn Marino if (ctx->cflags & REG_EXTENDED)
1535d5f8dde1SJohn Marino {
15365f2eab64SJohn Marino if (ctx->re + 1 < ctx->re_end)
15375f2eab64SJohn Marino {
15385f2eab64SJohn Marino if (*(ctx->re + 1) == CHAR_QUESTIONMARK)
15395f2eab64SJohn Marino {
1540d5f8dde1SJohn Marino /* Process the question mark only in enhanced mode.
1541d5f8dde1SJohn Marino Otherwise, the question mark is an error in ERE */
1542d5f8dde1SJohn Marino if (ctx->cflags & REG_ENHANCED)
1543d5f8dde1SJohn Marino {
15445f2eab64SJohn Marino minimal = !(ctx->cflags & REG_UNGREEDY);
15455f2eab64SJohn Marino ctx->re++;
15465f2eab64SJohn Marino }
1547d5f8dde1SJohn Marino else return REG_BADRPT;
1548d5f8dde1SJohn Marino }
15495f2eab64SJohn Marino else if (*(ctx->re + 1) == CHAR_STAR
15505f2eab64SJohn Marino || *(ctx->re + 1) == CHAR_PLUS)
15515f2eab64SJohn Marino {
15525f2eab64SJohn Marino /* These are reserved for future extensions. */
15535f2eab64SJohn Marino return REG_BADRPT;
15545f2eab64SJohn Marino }
15555f2eab64SJohn Marino }
1556d5f8dde1SJohn Marino }
1557d5f8dde1SJohn Marino else
1558d5f8dde1SJohn Marino {
1559d5f8dde1SJohn Marino if (ctx->re + 1 < ctx->re_end && *(ctx->re + 1) == CHAR_STAR)
1560d5f8dde1SJohn Marino {
1561d5f8dde1SJohn Marino /* This is reserved for future extensions. */
1562d5f8dde1SJohn Marino return REG_BADRPT;
1563d5f8dde1SJohn Marino }
1564d5f8dde1SJohn Marino if (ctx->re + 2 < ctx->re_end)
1565d5f8dde1SJohn Marino {
1566*122b686eSSascha Wildner if (*(ctx->re + 1) == CHAR_BACKSLASH && *(ctx->re + 2) == CHAR_QUESTIONMARK)
1567d5f8dde1SJohn Marino {
1568d5f8dde1SJohn Marino /* Process the question mark only in enhanced mode.
1569d5f8dde1SJohn Marino Otherwise, the question mark is a literal in BRE */
1570d5f8dde1SJohn Marino if (ctx->cflags & REG_ENHANCED)
1571d5f8dde1SJohn Marino {
1572d5f8dde1SJohn Marino minimal = !(ctx->cflags & REG_UNGREEDY);
1573d5f8dde1SJohn Marino ctx->re += 2;
1574d5f8dde1SJohn Marino }
1575d5f8dde1SJohn Marino }
1576d5f8dde1SJohn Marino else if (*(ctx->re + 1) == CHAR_BACKSLASH && *(ctx->re + 2) == CHAR_PLUS)
1577d5f8dde1SJohn Marino {
1578d5f8dde1SJohn Marino /* This is reserved for future extensions. */
1579d5f8dde1SJohn Marino return REG_BADRPT;
1580d5f8dde1SJohn Marino }
1581d5f8dde1SJohn Marino }
1582d5f8dde1SJohn Marino }
15835f2eab64SJohn Marino
1584d5f8dde1SJohn Marino if (minimal)
1585d5f8dde1SJohn Marino ctx->num_reorder_tags++;
1586d5f8dde1SJohn Marino
1587d5f8dde1SJohn Marino DPRINT(("tre_parse: %s %s: '%.*" STRF "'\n",
1588d5f8dde1SJohn Marino minimal ? " minimal" : "greedy", tstr, REST(tmp_re)));
1589d5f8dde1SJohn Marino if (result == NULL)
1590d5f8dde1SJohn Marino {
1591d5f8dde1SJohn Marino if (ctx->cflags & REG_EXTENDED) return REG_BADRPT;
1592d5f8dde1SJohn Marino else goto parse_literal;
1593d5f8dde1SJohn Marino }
15945f2eab64SJohn Marino ctx->re++;
15955f2eab64SJohn Marino tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max,
15965f2eab64SJohn Marino minimal);
15975f2eab64SJohn Marino if (tmp_node == NULL)
15985f2eab64SJohn Marino return REG_ESPACE;
15995f2eab64SJohn Marino result = tmp_node;
1600d5f8dde1SJohn Marino
1601d5f8dde1SJohn Marino /* Set the iterator with a submatch id in the invisible range
1602d5f8dde1SJohn Marino * (which will be overridden if a real submatch is needed) */
1603d5f8dde1SJohn Marino result->submatch_id = ctx->submatch_id_invisible++;
1604d5f8dde1SJohn Marino
1605d5f8dde1SJohn Marino #if 0
1606d5f8dde1SJohn Marino /* We don't allow multiple postfixes, but this might be needed
1607d5f8dde1SJohn Marino to support approximate matching */
16085f2eab64SJohn Marino STACK_PUSHX(stack, int, PARSE_POSTFIX);
1609d5f8dde1SJohn Marino #endif
16105f2eab64SJohn Marino }
16115f2eab64SJohn Marino break;
16125f2eab64SJohn Marino
16135f2eab64SJohn Marino case CHAR_BACKSLASH:
16145f2eab64SJohn Marino /* "\{" is special without REG_EXTENDED */
1615d5f8dde1SJohn Marino /* "\+" and "\?" are special with REG_ENHANCED for BRE */
16165f2eab64SJohn Marino if (!(ctx->cflags & REG_EXTENDED)
1617d5f8dde1SJohn Marino && ctx->re + 1 < ctx->re_end)
16185f2eab64SJohn Marino {
1619d5f8dde1SJohn Marino switch (*(ctx->re + 1))
1620d5f8dde1SJohn Marino {
1621d5f8dde1SJohn Marino case CHAR_LBRACE:
16225f2eab64SJohn Marino ctx->re++;
1623d5f8dde1SJohn Marino #ifdef TRE_DEBUG
1624d5f8dde1SJohn Marino lbrace_off = 2;
1625d5f8dde1SJohn Marino #endif
16265f2eab64SJohn Marino goto parse_brace;
1627d5f8dde1SJohn Marino case CHAR_PLUS:
1628d5f8dde1SJohn Marino case CHAR_QUESTIONMARK:
1629d5f8dde1SJohn Marino if (ctx->cflags & REG_ENHANCED)
1630d5f8dde1SJohn Marino {
1631d5f8dde1SJohn Marino #ifdef TRE_DEBUG
1632d5f8dde1SJohn Marino tmp_re = ctx->re;
1633d5f8dde1SJohn Marino #endif
1634d5f8dde1SJohn Marino ctx->re++;
1635d5f8dde1SJohn Marino goto handle_plus_or_question;
1636d5f8dde1SJohn Marino }
1637d5f8dde1SJohn Marino break;
1638d5f8dde1SJohn Marino }
1639d5f8dde1SJohn Marino break;
16405f2eab64SJohn Marino }
16415f2eab64SJohn Marino else
16425f2eab64SJohn Marino break;
16435f2eab64SJohn Marino
16445f2eab64SJohn Marino case CHAR_LBRACE:
1645d5f8dde1SJohn Marino {
1646d5f8dde1SJohn Marino int raw_assertion;
1647d5f8dde1SJohn Marino
16485f2eab64SJohn Marino /* "{" is literal without REG_EXTENDED */
16495f2eab64SJohn Marino if (!(ctx->cflags & REG_EXTENDED))
16505f2eab64SJohn Marino break;
1651d5f8dde1SJohn Marino #ifdef TRE_DEBUG
1652d5f8dde1SJohn Marino lbrace_off = 1;
1653d5f8dde1SJohn Marino #endif
16545f2eab64SJohn Marino
16555f2eab64SJohn Marino parse_brace:
1656d5f8dde1SJohn Marino /* error on iteration of raw assertion (not in subexpression),
1657d5f8dde1SJohn Marino but wait until after parsing bounds */
1658d5f8dde1SJohn Marino raw_assertion = (result->type == LITERAL
1659d5f8dde1SJohn Marino && result->submatch_id < 0
1660d5f8dde1SJohn Marino && IS_ASSERTION((tre_literal_t *)result->obj));
16615f2eab64SJohn Marino ctx->re++;
16625f2eab64SJohn Marino
16635f2eab64SJohn Marino status = tre_parse_bound(ctx, &result);
1664d5f8dde1SJohn Marino #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
1665d5f8dde1SJohn Marino /* For ERE, if status is REG_NOMATCH, this means the lbrace
1666d5f8dde1SJohn Marino is to be treated as a literal. */
1667d5f8dde1SJohn Marino if (status == REG_NOMATCH)
1668d5f8dde1SJohn Marino {
1669d5f8dde1SJohn Marino ctx->re--;
1670d5f8dde1SJohn Marino break;
1671d5f8dde1SJohn Marino }
1672d5f8dde1SJohn Marino #endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
1673d5f8dde1SJohn Marino DPRINT(("tre_parse: bound: '%.*" STRF "'\n",
1674d5f8dde1SJohn Marino REST(ctx->re - lbrace_off)));
16755f2eab64SJohn Marino if (status != REG_OK)
16765f2eab64SJohn Marino return status;
1677d5f8dde1SJohn Marino if (raw_assertion) return REG_BADRPT;
1678d5f8dde1SJohn Marino
1679d5f8dde1SJohn Marino /* Set the iterator with a submatch id in the invisible range
1680d5f8dde1SJohn Marino * (which will be overridden if a real submatch is needed) */
1681d5f8dde1SJohn Marino if (result->type == ITERATION)
1682d5f8dde1SJohn Marino result->submatch_id = ctx->submatch_id_invisible++;
1683d5f8dde1SJohn Marino
1684d5f8dde1SJohn Marino #if 0
1685d5f8dde1SJohn Marino /* We don't allow multiple postfixes, but this might be needed
1686d5f8dde1SJohn Marino to support approximate matching */
16875f2eab64SJohn Marino STACK_PUSHX(stack, int, PARSE_POSTFIX);
1688d5f8dde1SJohn Marino #endif
16895f2eab64SJohn Marino break;
16905f2eab64SJohn Marino }
1691d5f8dde1SJohn Marino }
16925f2eab64SJohn Marino break;
16935f2eab64SJohn Marino
16945f2eab64SJohn Marino case PARSE_ATOM:
1695d5f8dde1SJohn Marino {
16965f2eab64SJohn Marino /* Parse an atom. An atom is a regular expression enclosed in `()',
16975f2eab64SJohn Marino an empty set of `()', a bracket expression, `.', `^', `$',
16985f2eab64SJohn Marino a `\' followed by a character, or a single character. */
16995f2eab64SJohn Marino
1700d5f8dde1SJohn Marino /* The stack contains a boolean value, whether PARSE_ATOM is
1701d5f8dde1SJohn Marino being called just after the start of a group (left paren)
1702d5f8dde1SJohn Marino in a BRE */
1703d5f8dde1SJohn Marino bre_branch_begin = tre_stack_pop_int(stack);
1704d5f8dde1SJohn Marino
17055f2eab64SJohn Marino /* End of regexp? (empty string). */
17065f2eab64SJohn Marino if (ctx->re >= ctx->re_end)
17075f2eab64SJohn Marino goto parse_literal;
17085f2eab64SJohn Marino
17095f2eab64SJohn Marino #ifdef REG_LITERAL
17105f2eab64SJohn Marino if (ctx->cflags & REG_LITERAL)
17115f2eab64SJohn Marino goto parse_literal;
17125f2eab64SJohn Marino #endif /* REG_LITERAL */
17135f2eab64SJohn Marino
17145f2eab64SJohn Marino switch (*ctx->re)
17155f2eab64SJohn Marino {
17165f2eab64SJohn Marino case CHAR_LPAREN: /* parenthesized subexpression */
17175f2eab64SJohn Marino
17185f2eab64SJohn Marino /* Handle "(?...)" extensions. They work in a way similar
17195f2eab64SJohn Marino to Perl's corresponding extensions. */
1720d5f8dde1SJohn Marino if ((ctx->cflags & (REG_EXTENDED|REG_ENHANCED)) ==
1721d5f8dde1SJohn Marino (REG_EXTENDED|REG_ENHANCED)
17225f2eab64SJohn Marino && *(ctx->re + 1) == CHAR_QUESTIONMARK)
17235f2eab64SJohn Marino {
17245f2eab64SJohn Marino int new_cflags = ctx->cflags;
17255f2eab64SJohn Marino int bit = 1;
1726d5f8dde1SJohn Marino int invisible_submatch = 0;
1727d5f8dde1SJohn Marino DPRINT(("tre_parse: extension: '%.*" STRF "'\n",
17285f2eab64SJohn Marino REST(ctx->re)));
17295f2eab64SJohn Marino ctx->re += 2;
17305f2eab64SJohn Marino while (/*CONSTCOND*/1)
17315f2eab64SJohn Marino {
17325f2eab64SJohn Marino if (*ctx->re == L'i')
17335f2eab64SJohn Marino {
1734d5f8dde1SJohn Marino DPRINT(("tre_parse: icase: '%.*" STRF "'\n",
17355f2eab64SJohn Marino REST(ctx->re)));
17365f2eab64SJohn Marino if (bit)
17375f2eab64SJohn Marino new_cflags |= REG_ICASE;
17385f2eab64SJohn Marino else
17395f2eab64SJohn Marino new_cflags &= ~REG_ICASE;
17405f2eab64SJohn Marino ctx->re++;
17415f2eab64SJohn Marino }
17425f2eab64SJohn Marino else if (*ctx->re == L'n')
17435f2eab64SJohn Marino {
1744d5f8dde1SJohn Marino DPRINT(("tre_parse: newline: '%.*" STRF "'\n",
17455f2eab64SJohn Marino REST(ctx->re)));
17465f2eab64SJohn Marino if (bit)
17475f2eab64SJohn Marino new_cflags |= REG_NEWLINE;
17485f2eab64SJohn Marino else
17495f2eab64SJohn Marino new_cflags &= ~REG_NEWLINE;
17505f2eab64SJohn Marino ctx->re++;
17515f2eab64SJohn Marino }
1752d5f8dde1SJohn Marino #ifdef REG_LEFT_ASSOC
1753d5f8dde1SJohn Marino else if (*ctx->re == L'l')
17545f2eab64SJohn Marino {
1755d5f8dde1SJohn Marino DPRINT(("tre_parse: left assoc: '%.*" STRF "'\n",
17565f2eab64SJohn Marino REST(ctx->re)));
17575f2eab64SJohn Marino if (bit)
1758d5f8dde1SJohn Marino new_cflags |= REG_LEFT_ASSOC;
17595f2eab64SJohn Marino else
1760d5f8dde1SJohn Marino new_cflags &= ~REG_LEFT_ASSOC;
17615f2eab64SJohn Marino ctx->re++;
17625f2eab64SJohn Marino }
1763d5f8dde1SJohn Marino #endif /* REG_LEFT_ASSOC */
17645f2eab64SJohn Marino #ifdef REG_UNGREEDY
17655f2eab64SJohn Marino else if (*ctx->re == L'U')
17665f2eab64SJohn Marino {
1767d5f8dde1SJohn Marino DPRINT(("tre_parse: ungreedy: '%.*" STRF "'\n",
17685f2eab64SJohn Marino REST(ctx->re)));
17695f2eab64SJohn Marino if (bit)
17705f2eab64SJohn Marino new_cflags |= REG_UNGREEDY;
17715f2eab64SJohn Marino else
17725f2eab64SJohn Marino new_cflags &= ~REG_UNGREEDY;
17735f2eab64SJohn Marino ctx->re++;
17745f2eab64SJohn Marino }
17755f2eab64SJohn Marino #endif /* REG_UNGREEDY */
17765f2eab64SJohn Marino else if (*ctx->re == CHAR_MINUS)
17775f2eab64SJohn Marino {
1778d5f8dde1SJohn Marino DPRINT(("tre_parse: turn off: '%.*" STRF "'\n",
17795f2eab64SJohn Marino REST(ctx->re)));
17805f2eab64SJohn Marino ctx->re++;
17815f2eab64SJohn Marino bit = 0;
17825f2eab64SJohn Marino }
17835f2eab64SJohn Marino else if (*ctx->re == CHAR_COLON)
17845f2eab64SJohn Marino {
1785d5f8dde1SJohn Marino DPRINT(("tre_parse: no group: '%.*" STRF
1786d5f8dde1SJohn Marino "', (invisible submatch %d)\n",
1787d5f8dde1SJohn Marino REST(ctx->re), ctx->submatch_id_invisible));
17885f2eab64SJohn Marino ctx->re++;
17895f2eab64SJohn Marino depth++;
1790d5f8dde1SJohn Marino invisible_submatch = 1;
17915f2eab64SJohn Marino break;
17925f2eab64SJohn Marino }
17935f2eab64SJohn Marino else if (*ctx->re == CHAR_HASH)
17945f2eab64SJohn Marino {
1795d5f8dde1SJohn Marino DPRINT(("tre_parse: comment: '%.*" STRF "'\n",
17965f2eab64SJohn Marino REST(ctx->re)));
17975f2eab64SJohn Marino /* A comment can contain any character except a
17985f2eab64SJohn Marino right parenthesis */
17995f2eab64SJohn Marino while (*ctx->re != CHAR_RPAREN
18005f2eab64SJohn Marino && ctx->re < ctx->re_end)
18015f2eab64SJohn Marino ctx->re++;
18025f2eab64SJohn Marino if (*ctx->re == CHAR_RPAREN && ctx->re < ctx->re_end)
18035f2eab64SJohn Marino {
18045f2eab64SJohn Marino ctx->re++;
18055f2eab64SJohn Marino break;
18065f2eab64SJohn Marino }
18075f2eab64SJohn Marino else
18085f2eab64SJohn Marino return REG_BADPAT;
18095f2eab64SJohn Marino }
18105f2eab64SJohn Marino else if (*ctx->re == CHAR_RPAREN)
18115f2eab64SJohn Marino {
18125f2eab64SJohn Marino ctx->re++;
18135f2eab64SJohn Marino break;
18145f2eab64SJohn Marino }
18155f2eab64SJohn Marino else
1816d5f8dde1SJohn Marino return REG_BADRPT;
18175f2eab64SJohn Marino }
18185f2eab64SJohn Marino
18195f2eab64SJohn Marino /* Turn on the cflags changes for the rest of the
18205f2eab64SJohn Marino enclosing group. */
1821d5f8dde1SJohn Marino if (invisible_submatch)
1822d5f8dde1SJohn Marino {
18235f2eab64SJohn Marino STACK_PUSHX(stack, int, ctx->cflags);
1824d5f8dde1SJohn Marino STACK_PUSHX(stack, int, ctx->submatch_id_invisible);
1825d5f8dde1SJohn Marino STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH);
1826d5f8dde1SJohn Marino ctx->submatch_id_invisible++;
1827d5f8dde1SJohn Marino STACK_PUSHX(stack, int, 0); // bre_branch_begin
18285f2eab64SJohn Marino STACK_PUSHX(stack, int, PARSE_RE);
1829d5f8dde1SJohn Marino }
1830d5f8dde1SJohn Marino else {
1831d5f8dde1SJohn Marino STACK_PUSHX(stack, int, 0); // bre_branch_begin
1832d5f8dde1SJohn Marino STACK_PUSHX(stack, int, PARSE_ATOM);
1833d5f8dde1SJohn Marino }
18345f2eab64SJohn Marino ctx->cflags = new_cflags;
18355f2eab64SJohn Marino break;
18365f2eab64SJohn Marino }
18375f2eab64SJohn Marino
1838d5f8dde1SJohn Marino if (ctx->cflags & REG_EXTENDED)
18395f2eab64SJohn Marino {
1840d5f8dde1SJohn Marino parse_bre_lparen:
18415f2eab64SJohn Marino DPRINT(("tre_parse: group begin: '%.*" STRF
18425f2eab64SJohn Marino "', submatch %d\n", REST(ctx->re),
18435f2eab64SJohn Marino ctx->submatch_id));
18445f2eab64SJohn Marino ctx->re++;
18455f2eab64SJohn Marino /* First parse a whole RE, then mark the resulting tree
18465f2eab64SJohn Marino for submatching. */
1847d5f8dde1SJohn Marino STACK_PUSHX(stack, int, ctx->cflags);
18485f2eab64SJohn Marino STACK_PUSHX(stack, int, ctx->submatch_id);
18495f2eab64SJohn Marino STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH);
1850d5f8dde1SJohn Marino /* We need to pass a boolean (eventually) to PARSE_ATOM to
1851d5f8dde1SJohn Marino indicate if this is the beginning of a BRE group. */
1852d5f8dde1SJohn Marino STACK_PUSHX(stack, int, !(ctx->cflags & REG_EXTENDED));
18535f2eab64SJohn Marino STACK_PUSHX(stack, int, PARSE_RE);
18545f2eab64SJohn Marino ctx->submatch_id++;
1855d5f8dde1SJohn Marino depth++;
18565f2eab64SJohn Marino }
18575f2eab64SJohn Marino else
18585f2eab64SJohn Marino goto parse_literal;
18595f2eab64SJohn Marino break;
18605f2eab64SJohn Marino
18615f2eab64SJohn Marino case CHAR_RPAREN: /* end of current subexpression */
1862d5f8dde1SJohn Marino if (ctx->cflags & REG_EXTENDED && depth > 0)
18635f2eab64SJohn Marino {
1864d5f8dde1SJohn Marino parse_bre_rparen_empty:
1865d5f8dde1SJohn Marino if (!(ctx->cflags & REG_EXTENDED) && depth == 0)
1866d5f8dde1SJohn Marino return REG_EPAREN;
18675f2eab64SJohn Marino DPRINT(("tre_parse: empty: '%.*" STRF "'\n",
18685f2eab64SJohn Marino REST(ctx->re)));
18695f2eab64SJohn Marino /* We were expecting an atom, but instead the current
18705f2eab64SJohn Marino subexpression was closed. POSIX leaves the meaning of
18715f2eab64SJohn Marino this to be implementation-defined. We interpret this as
18725f2eab64SJohn Marino an empty expression (which matches an empty string). */
18735f2eab64SJohn Marino result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
18745f2eab64SJohn Marino if (result == NULL)
18755f2eab64SJohn Marino return REG_ESPACE;
18765f2eab64SJohn Marino if (!(ctx->cflags & REG_EXTENDED))
18775f2eab64SJohn Marino ctx->re--;
18785f2eab64SJohn Marino }
18795f2eab64SJohn Marino else
18805f2eab64SJohn Marino goto parse_literal;
18815f2eab64SJohn Marino break;
18825f2eab64SJohn Marino
18835f2eab64SJohn Marino case CHAR_LBRACKET: /* bracket expression */
18845f2eab64SJohn Marino DPRINT(("tre_parse: bracket: '%.*" STRF "'\n",
18855f2eab64SJohn Marino REST(ctx->re)));
18865f2eab64SJohn Marino ctx->re++;
18875f2eab64SJohn Marino status = tre_parse_bracket(ctx, &result);
18885f2eab64SJohn Marino if (status != REG_OK)
18895f2eab64SJohn Marino return status;
18905f2eab64SJohn Marino break;
18915f2eab64SJohn Marino
18925f2eab64SJohn Marino case CHAR_BACKSLASH:
1893d5f8dde1SJohn Marino /* Deal with "\(", "\)" or "\{" for BREs */
18945f2eab64SJohn Marino if (!(ctx->cflags & REG_EXTENDED)
1895d5f8dde1SJohn Marino && ctx->re + 1 < ctx->re_end)
1896d5f8dde1SJohn Marino {
1897d5f8dde1SJohn Marino if (*(ctx->re + 1) == CHAR_LPAREN)
18985f2eab64SJohn Marino {
18995f2eab64SJohn Marino ctx->re++;
1900d5f8dde1SJohn Marino goto parse_bre_lparen;
1901d5f8dde1SJohn Marino }
1902d5f8dde1SJohn Marino else if (*(ctx->re + 1) == CHAR_RPAREN)
1903d5f8dde1SJohn Marino {
1904d5f8dde1SJohn Marino ctx->re++;
1905d5f8dde1SJohn Marino goto parse_bre_rparen_empty;
1906d5f8dde1SJohn Marino }
1907d5f8dde1SJohn Marino if (*(ctx->re + 1) == CHAR_LBRACE) goto parse_literal;
1908d5f8dde1SJohn Marino }
1909d5f8dde1SJohn Marino
1910d5f8dde1SJohn Marino if (ctx->re + 1 >= ctx->re_end)
1911d5f8dde1SJohn Marino /* Trailing backslash. */
1912d5f8dde1SJohn Marino return REG_EESCAPE;
1913d5f8dde1SJohn Marino
1914d5f8dde1SJohn Marino if (!(ctx->cflags & REG_ENHANCED))
1915d5f8dde1SJohn Marino {
1916d5f8dde1SJohn Marino DPRINT(("tre_parse: unenhanced bleep: '%.*" STRF "'\n", REST(ctx->re)));
1917d5f8dde1SJohn Marino ctx->re++;
1918d5f8dde1SJohn Marino goto unenhanced_backslash;
19195f2eab64SJohn Marino }
19205f2eab64SJohn Marino
19215f2eab64SJohn Marino /* If a macro is used, parse the expanded macro recursively. */
19225f2eab64SJohn Marino {
19235f2eab64SJohn Marino tre_char_t buf[64];
19245f2eab64SJohn Marino tre_expand_macro(ctx->re + 1, ctx->re_end,
19255f2eab64SJohn Marino buf, elementsof(buf));
19265f2eab64SJohn Marino if (buf[0] != 0)
19275f2eab64SJohn Marino {
19285f2eab64SJohn Marino tre_parse_ctx_t subctx;
19295f2eab64SJohn Marino memcpy(&subctx, ctx, sizeof(subctx));
19305f2eab64SJohn Marino subctx.re = buf;
19315f2eab64SJohn Marino subctx.len = tre_strlen(buf);
19325f2eab64SJohn Marino subctx.nofirstsub = 1;
19335f2eab64SJohn Marino status = tre_parse(&subctx);
19345f2eab64SJohn Marino if (status != REG_OK)
19355f2eab64SJohn Marino return status;
19365f2eab64SJohn Marino ctx->re += 2;
19375f2eab64SJohn Marino ctx->position = subctx.position;
19385f2eab64SJohn Marino result = subctx.result;
19395f2eab64SJohn Marino break;
19405f2eab64SJohn Marino }
19415f2eab64SJohn Marino }
19425f2eab64SJohn Marino
19435f2eab64SJohn Marino #ifdef REG_LITERAL
19445f2eab64SJohn Marino if (*(ctx->re + 1) == L'Q')
19455f2eab64SJohn Marino {
19465f2eab64SJohn Marino DPRINT(("tre_parse: tmp literal: '%.*" STRF "'\n",
19475f2eab64SJohn Marino REST(ctx->re)));
19485f2eab64SJohn Marino ctx->cflags |= REG_LITERAL;
19495f2eab64SJohn Marino temporary_cflags |= REG_LITERAL;
19505f2eab64SJohn Marino ctx->re += 2;
1951d5f8dde1SJohn Marino STACK_PUSHX(stack, int, 0);
19525f2eab64SJohn Marino STACK_PUSHX(stack, int, PARSE_ATOM);
19535f2eab64SJohn Marino break;
19545f2eab64SJohn Marino }
19555f2eab64SJohn Marino #endif /* REG_LITERAL */
19565f2eab64SJohn Marino
19575f2eab64SJohn Marino DPRINT(("tre_parse: bleep: '%.*" STRF "'\n", REST(ctx->re)));
19585f2eab64SJohn Marino ctx->re++;
19595f2eab64SJohn Marino switch (*ctx->re)
19605f2eab64SJohn Marino {
19615f2eab64SJohn Marino case L'b':
19625f2eab64SJohn Marino result = tre_ast_new_literal(ctx->mem, ASSERTION,
19635f2eab64SJohn Marino ASSERT_AT_WB, -1);
19645f2eab64SJohn Marino ctx->re++;
19655f2eab64SJohn Marino break;
19665f2eab64SJohn Marino case L'B':
19675f2eab64SJohn Marino result = tre_ast_new_literal(ctx->mem, ASSERTION,
19685f2eab64SJohn Marino ASSERT_AT_WB_NEG, -1);
19695f2eab64SJohn Marino ctx->re++;
19705f2eab64SJohn Marino break;
19715f2eab64SJohn Marino case L'<':
19725f2eab64SJohn Marino result = tre_ast_new_literal(ctx->mem, ASSERTION,
19735f2eab64SJohn Marino ASSERT_AT_BOW, -1);
19745f2eab64SJohn Marino ctx->re++;
19755f2eab64SJohn Marino break;
19765f2eab64SJohn Marino case L'>':
19775f2eab64SJohn Marino result = tre_ast_new_literal(ctx->mem, ASSERTION,
19785f2eab64SJohn Marino ASSERT_AT_EOW, -1);
19795f2eab64SJohn Marino ctx->re++;
19805f2eab64SJohn Marino break;
19815f2eab64SJohn Marino case L'x':
19825f2eab64SJohn Marino ctx->re++;
19835f2eab64SJohn Marino if (ctx->re[0] != CHAR_LBRACE && ctx->re < ctx->re_end)
19845f2eab64SJohn Marino {
19855f2eab64SJohn Marino /* 8 bit hex char. */
19865f2eab64SJohn Marino char tmp[3] = {0, 0, 0};
19875f2eab64SJohn Marino long val;
19885f2eab64SJohn Marino DPRINT(("tre_parse: 8 bit hex: '%.*" STRF "'\n",
19895f2eab64SJohn Marino REST(ctx->re - 2)));
19905f2eab64SJohn Marino
1991d5f8dde1SJohn Marino if (tre_isxdigit_l(ctx->re[0], ctx->loc) &&
1992d5f8dde1SJohn Marino ctx->re < ctx->re_end)
19935f2eab64SJohn Marino {
19945f2eab64SJohn Marino tmp[0] = (char)ctx->re[0];
19955f2eab64SJohn Marino ctx->re++;
19965f2eab64SJohn Marino }
1997d5f8dde1SJohn Marino if (tre_isxdigit_l(ctx->re[0], ctx->loc) &&
1998d5f8dde1SJohn Marino ctx->re < ctx->re_end)
19995f2eab64SJohn Marino {
20005f2eab64SJohn Marino tmp[1] = (char)ctx->re[0];
20015f2eab64SJohn Marino ctx->re++;
20025f2eab64SJohn Marino }
20035f2eab64SJohn Marino val = strtol(tmp, NULL, 16);
20045f2eab64SJohn Marino result = tre_ast_new_literal(ctx->mem, (int)val,
20055f2eab64SJohn Marino (int)val, ctx->position);
20065f2eab64SJohn Marino ctx->position++;
20075f2eab64SJohn Marino break;
20085f2eab64SJohn Marino }
20095f2eab64SJohn Marino else if (ctx->re < ctx->re_end)
20105f2eab64SJohn Marino {
20115f2eab64SJohn Marino /* Wide char. */
20125f2eab64SJohn Marino char tmp[32];
20135f2eab64SJohn Marino long val;
20145f2eab64SJohn Marino int i = 0;
20155f2eab64SJohn Marino ctx->re++;
20165f2eab64SJohn Marino while (ctx->re_end - ctx->re >= 0)
20175f2eab64SJohn Marino {
20185f2eab64SJohn Marino if (ctx->re[0] == CHAR_RBRACE)
20195f2eab64SJohn Marino break;
2020d5f8dde1SJohn Marino if (tre_isxdigit_l(ctx->re[0], ctx->loc))
20215f2eab64SJohn Marino {
20225f2eab64SJohn Marino tmp[i] = (char)ctx->re[0];
20235f2eab64SJohn Marino i++;
20245f2eab64SJohn Marino ctx->re++;
20255f2eab64SJohn Marino continue;
20265f2eab64SJohn Marino }
20275f2eab64SJohn Marino return REG_EBRACE;
20285f2eab64SJohn Marino }
20295f2eab64SJohn Marino ctx->re++;
20305f2eab64SJohn Marino tmp[i] = 0;
20315f2eab64SJohn Marino val = strtol(tmp, NULL, 16);
20325f2eab64SJohn Marino result = tre_ast_new_literal(ctx->mem, (int)val, (int)val,
20335f2eab64SJohn Marino ctx->position);
20345f2eab64SJohn Marino ctx->position++;
20355f2eab64SJohn Marino break;
20365f2eab64SJohn Marino }
20375f2eab64SJohn Marino /*FALLTHROUGH*/
20385f2eab64SJohn Marino
20395f2eab64SJohn Marino default:
2040d5f8dde1SJohn Marino unenhanced_backslash:
2041d5f8dde1SJohn Marino if ((ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) !=
2042d5f8dde1SJohn Marino REG_EXTENDED &&
2043d5f8dde1SJohn Marino tre_isdigit_l(*ctx->re, ctx->loc) && *ctx->re != L'0')
20445f2eab64SJohn Marino {
2045d5f8dde1SJohn Marino /* Back reference (only in BRE or enhanced). */
20465f2eab64SJohn Marino int val = *ctx->re - L'0';
20475f2eab64SJohn Marino DPRINT(("tre_parse: backref: '%.*" STRF "'\n",
20485f2eab64SJohn Marino REST(ctx->re - 1)));
20495f2eab64SJohn Marino result = tre_ast_new_literal(ctx->mem, BACKREF, val,
20505f2eab64SJohn Marino ctx->position);
20515f2eab64SJohn Marino if (result == NULL)
20525f2eab64SJohn Marino return REG_ESPACE;
2053d5f8dde1SJohn Marino
2054d5f8dde1SJohn Marino /* Set the backref with a submatch id in the invisible
2055d5f8dde1SJohn Marino * range (which will be overridden if a real submatch
2056d5f8dde1SJohn Marino * is needed) */
2057d5f8dde1SJohn Marino result->submatch_id = ctx->submatch_id_invisible++;
2058d5f8dde1SJohn Marino
20595f2eab64SJohn Marino ctx->position++;
2060d5f8dde1SJohn Marino ctx->num_reorder_tags++;
20615f2eab64SJohn Marino ctx->max_backref = MAX(val, ctx->max_backref);
20625f2eab64SJohn Marino ctx->re++;
20635f2eab64SJohn Marino }
20645f2eab64SJohn Marino else
20655f2eab64SJohn Marino {
20665f2eab64SJohn Marino /* Escaped character. */
20675f2eab64SJohn Marino DPRINT(("tre_parse: escaped: '%.*" STRF "'\n",
20685f2eab64SJohn Marino REST(ctx->re - 1)));
20695f2eab64SJohn Marino result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
20705f2eab64SJohn Marino ctx->position);
20715f2eab64SJohn Marino ctx->position++;
20725f2eab64SJohn Marino ctx->re++;
20735f2eab64SJohn Marino }
20745f2eab64SJohn Marino break;
20755f2eab64SJohn Marino }
20765f2eab64SJohn Marino if (result == NULL)
20775f2eab64SJohn Marino return REG_ESPACE;
20785f2eab64SJohn Marino break;
20795f2eab64SJohn Marino
20805f2eab64SJohn Marino case CHAR_PERIOD: /* the any-symbol */
20815f2eab64SJohn Marino DPRINT(("tre_parse: any: '%.*" STRF "'\n",
20825f2eab64SJohn Marino REST(ctx->re)));
20835f2eab64SJohn Marino if (ctx->cflags & REG_NEWLINE)
20845f2eab64SJohn Marino {
20855f2eab64SJohn Marino tre_ast_node_t *tmp1;
20865f2eab64SJohn Marino tre_ast_node_t *tmp2;
20875f2eab64SJohn Marino tmp1 = tre_ast_new_literal(ctx->mem, 0, L'\n' - 1,
20885f2eab64SJohn Marino ctx->position);
20895f2eab64SJohn Marino if (!tmp1)
20905f2eab64SJohn Marino return REG_ESPACE;
20915f2eab64SJohn Marino tmp2 = tre_ast_new_literal(ctx->mem, L'\n' + 1, TRE_CHAR_MAX,
20925f2eab64SJohn Marino ctx->position + 1);
20935f2eab64SJohn Marino if (!tmp2)
20945f2eab64SJohn Marino return REG_ESPACE;
20955f2eab64SJohn Marino result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
20965f2eab64SJohn Marino if (!result)
20975f2eab64SJohn Marino return REG_ESPACE;
20985f2eab64SJohn Marino ctx->position += 2;
20995f2eab64SJohn Marino }
21005f2eab64SJohn Marino else
21015f2eab64SJohn Marino {
21025f2eab64SJohn Marino result = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX,
21035f2eab64SJohn Marino ctx->position);
21045f2eab64SJohn Marino if (!result)
21055f2eab64SJohn Marino return REG_ESPACE;
21065f2eab64SJohn Marino ctx->position++;
21075f2eab64SJohn Marino }
21085f2eab64SJohn Marino ctx->re++;
21095f2eab64SJohn Marino break;
21105f2eab64SJohn Marino
21115f2eab64SJohn Marino case CHAR_CARET: /* beginning of line assertion */
2112d5f8dde1SJohn Marino /* '^' has a special meaning everywhere in EREs, at the
2113d5f8dde1SJohn Marino beginning of the RE and after \( is BREs. It is also
2114d5f8dde1SJohn Marino special in enhanced BREs at the beginning of each branches
2115d5f8dde1SJohn Marino of a union */
21165f2eab64SJohn Marino if (ctx->cflags & REG_EXTENDED
2117d5f8dde1SJohn Marino || bre_branch_begin
21185f2eab64SJohn Marino || ctx->re == ctx->re_start)
21195f2eab64SJohn Marino {
21205f2eab64SJohn Marino DPRINT(("tre_parse: BOL: '%.*" STRF "'\n",
21215f2eab64SJohn Marino REST(ctx->re)));
21225f2eab64SJohn Marino result = tre_ast_new_literal(ctx->mem, ASSERTION,
21235f2eab64SJohn Marino ASSERT_AT_BOL, -1);
21245f2eab64SJohn Marino if (result == NULL)
21255f2eab64SJohn Marino return REG_ESPACE;
21265f2eab64SJohn Marino ctx->re++;
21275f2eab64SJohn Marino }
21285f2eab64SJohn Marino else
21295f2eab64SJohn Marino goto parse_literal;
21305f2eab64SJohn Marino break;
21315f2eab64SJohn Marino
21325f2eab64SJohn Marino case CHAR_DOLLAR: /* end of line assertion. */
21335f2eab64SJohn Marino /* '$' is special everywhere in EREs, and in the end of the
21345f2eab64SJohn Marino string and before \) is BREs. */
21355f2eab64SJohn Marino if (ctx->cflags & REG_EXTENDED
21365f2eab64SJohn Marino || (ctx->re + 2 < ctx->re_end
21375f2eab64SJohn Marino && *(ctx->re + 1) == CHAR_BACKSLASH
21385f2eab64SJohn Marino && *(ctx->re + 2) == CHAR_RPAREN)
21395f2eab64SJohn Marino || ctx->re + 1 == ctx->re_end)
21405f2eab64SJohn Marino {
21415f2eab64SJohn Marino DPRINT(("tre_parse: EOL: '%.*" STRF "'\n",
21425f2eab64SJohn Marino REST(ctx->re)));
21435f2eab64SJohn Marino result = tre_ast_new_literal(ctx->mem, ASSERTION,
21445f2eab64SJohn Marino ASSERT_AT_EOL, -1);
21455f2eab64SJohn Marino if (result == NULL)
21465f2eab64SJohn Marino return REG_ESPACE;
21475f2eab64SJohn Marino ctx->re++;
21485f2eab64SJohn Marino }
21495f2eab64SJohn Marino else
21505f2eab64SJohn Marino goto parse_literal;
21515f2eab64SJohn Marino break;
21525f2eab64SJohn Marino
21535f2eab64SJohn Marino default:
21545f2eab64SJohn Marino parse_literal:
21555f2eab64SJohn Marino
21565f2eab64SJohn Marino if (temporary_cflags && ctx->re + 1 < ctx->re_end
21575f2eab64SJohn Marino && *ctx->re == CHAR_BACKSLASH && *(ctx->re + 1) == L'E')
21585f2eab64SJohn Marino {
21595f2eab64SJohn Marino DPRINT(("tre_parse: end tmps: '%.*" STRF "'\n",
21605f2eab64SJohn Marino REST(ctx->re)));
21615f2eab64SJohn Marino ctx->cflags &= ~temporary_cflags;
21625f2eab64SJohn Marino temporary_cflags = 0;
21635f2eab64SJohn Marino ctx->re += 2;
2164d5f8dde1SJohn Marino if (ctx->re < ctx->re_end)
2165d5f8dde1SJohn Marino {
2166d5f8dde1SJohn Marino STACK_PUSHX(stack, int, 0);
2167d5f8dde1SJohn Marino STACK_PUSHX(stack, int, PARSE_ATOM);
2168d5f8dde1SJohn Marino }
2169d5f8dde1SJohn Marino else
2170d5f8dde1SJohn Marino {
2171d5f8dde1SJohn Marino result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
2172d5f8dde1SJohn Marino if (!result) return REG_ESPACE;
2173d5f8dde1SJohn Marino }
21745f2eab64SJohn Marino break;
21755f2eab64SJohn Marino }
21765f2eab64SJohn Marino
21775f2eab64SJohn Marino
21785f2eab64SJohn Marino /* We are expecting an atom. If the subexpression (or the whole
21795f2eab64SJohn Marino regexp ends here, we interpret it as an empty expression
2180d5f8dde1SJohn Marino (which matches an empty string), which is an error.
2181d5f8dde1SJohn Marino Iterations of an empty expression is also an error. */
21825f2eab64SJohn Marino #ifdef REG_LITERAL
2183d5f8dde1SJohn Marino if (!(ctx->cflags & REG_LITERAL))
21845f2eab64SJohn Marino {
2185d5f8dde1SJohn Marino #endif /* REG_LITERAL */
2186d5f8dde1SJohn Marino /* error on end of string */
2187d5f8dde1SJohn Marino if (ctx->re >= ctx->re_end) return depth > 0 ? REG_EPAREN
2188d5f8dde1SJohn Marino : REG_EMPTY;
2189d5f8dde1SJohn Marino /* error on unions and iterations of empty expressions */
2190d5f8dde1SJohn Marino if (ctx->cflags & REG_EXTENDED)
2191d5f8dde1SJohn Marino {
2192d5f8dde1SJohn Marino if (ctx->re < ctx->re_end)
2193d5f8dde1SJohn Marino {
2194d5f8dde1SJohn Marino if (*ctx->re == CHAR_PIPE) return REG_EMPTY;
2195d5f8dde1SJohn Marino if (*ctx->re == CHAR_LBRACE)
2196d5f8dde1SJohn Marino {
2197d5f8dde1SJohn Marino ctx->re++;
2198d5f8dde1SJohn Marino empty_parse_bound:
2199d5f8dde1SJohn Marino /* We need to parse the bound first and return
2200d5f8dde1SJohn Marino any error, before returning REG_BADRPT */
2201d5f8dde1SJohn Marino status = tre_parse_bound(ctx, NULL);
2202d5f8dde1SJohn Marino #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
2203d5f8dde1SJohn Marino /* For ERE, if REG_NOMATCH is returned, we
2204d5f8dde1SJohn Marino treat the lbrace as a literal. */
2205d5f8dde1SJohn Marino if (status == REG_NOMATCH)
2206d5f8dde1SJohn Marino {
2207d5f8dde1SJohn Marino ctx->re--;
2208d5f8dde1SJohn Marino /* Drop down to literal-handling code */
22095f2eab64SJohn Marino }
2210d5f8dde1SJohn Marino else
2211d5f8dde1SJohn Marino {
2212d5f8dde1SJohn Marino #endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
2213d5f8dde1SJohn Marino if (status != REG_OK)
2214d5f8dde1SJohn Marino return status;
2215d5f8dde1SJohn Marino return REG_BADRPT;
2216d5f8dde1SJohn Marino #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
2217d5f8dde1SJohn Marino }
2218d5f8dde1SJohn Marino #endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
2219d5f8dde1SJohn Marino }
2220d5f8dde1SJohn Marino #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
2221d5f8dde1SJohn Marino else
2222d5f8dde1SJohn Marino #endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
2223d5f8dde1SJohn Marino if (*ctx->re == CHAR_STAR
2224d5f8dde1SJohn Marino || *ctx->re == CHAR_PLUS
2225d5f8dde1SJohn Marino || *ctx->re == CHAR_QUESTIONMARK)
2226d5f8dde1SJohn Marino {
2227d5f8dde1SJohn Marino return REG_BADRPT;
2228d5f8dde1SJohn Marino }
2229d5f8dde1SJohn Marino }
2230d5f8dde1SJohn Marino }
2231d5f8dde1SJohn Marino else if (ctx->re + 1 < ctx->re_end
2232d5f8dde1SJohn Marino && *ctx->re == CHAR_BACKSLASH
2233d5f8dde1SJohn Marino && *(ctx->re + 1) == CHAR_LBRACE)
2234d5f8dde1SJohn Marino {
2235d5f8dde1SJohn Marino ctx->re += 2;
2236d5f8dde1SJohn Marino goto empty_parse_bound;
2237d5f8dde1SJohn Marino }
2238d5f8dde1SJohn Marino #ifdef REG_LITERAL
2239d5f8dde1SJohn Marino }
2240d5f8dde1SJohn Marino #endif /* REG_LITERAL */
22415f2eab64SJohn Marino
22425f2eab64SJohn Marino DPRINT(("tre_parse: literal: '%.*" STRF "'\n",
22435f2eab64SJohn Marino REST(ctx->re)));
22445f2eab64SJohn Marino /* Note that we can't use an tre_isalpha() test here, since there
22455f2eab64SJohn Marino may be characters which are alphabetic but neither upper or
22465f2eab64SJohn Marino lower case. */
22475f2eab64SJohn Marino if (ctx->cflags & REG_ICASE
2248d5f8dde1SJohn Marino && (tre_isupper_l(*ctx->re, ctx->loc) ||
2249d5f8dde1SJohn Marino tre_islower_l(*ctx->re, ctx->loc)))
22505f2eab64SJohn Marino {
22515f2eab64SJohn Marino tre_ast_node_t *tmp1;
22525f2eab64SJohn Marino tre_ast_node_t *tmp2;
22535f2eab64SJohn Marino
22545f2eab64SJohn Marino /* XXX - Can there be more than one opposite-case
22555f2eab64SJohn Marino counterpoints for some character in some locale? Or
22565f2eab64SJohn Marino more than two characters which all should be regarded
22575f2eab64SJohn Marino the same character if case is ignored? If yes, there
22585f2eab64SJohn Marino does not seem to be a portable way to detect it. I guess
22595f2eab64SJohn Marino that at least for multi-character collating elements there
22605f2eab64SJohn Marino could be several opposite-case counterpoints, but they
22615f2eab64SJohn Marino cannot be supported portably anyway. */
2262d5f8dde1SJohn Marino tmp1 = tre_ast_new_literal(ctx->mem,
2263d5f8dde1SJohn Marino tre_toupper_l(*ctx->re, ctx->loc),
2264d5f8dde1SJohn Marino tre_toupper_l(*ctx->re, ctx->loc),
22655f2eab64SJohn Marino ctx->position);
22665f2eab64SJohn Marino if (!tmp1)
22675f2eab64SJohn Marino return REG_ESPACE;
2268d5f8dde1SJohn Marino tmp2 = tre_ast_new_literal(ctx->mem,
2269d5f8dde1SJohn Marino tre_tolower_l(*ctx->re, ctx->loc),
2270d5f8dde1SJohn Marino tre_tolower_l(*ctx->re, ctx->loc),
22715f2eab64SJohn Marino ctx->position);
22725f2eab64SJohn Marino if (!tmp2)
22735f2eab64SJohn Marino return REG_ESPACE;
22745f2eab64SJohn Marino result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
22755f2eab64SJohn Marino if (!result)
22765f2eab64SJohn Marino return REG_ESPACE;
22775f2eab64SJohn Marino }
22785f2eab64SJohn Marino else
22795f2eab64SJohn Marino {
22805f2eab64SJohn Marino result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
22815f2eab64SJohn Marino ctx->position);
22825f2eab64SJohn Marino if (!result)
22835f2eab64SJohn Marino return REG_ESPACE;
22845f2eab64SJohn Marino }
22855f2eab64SJohn Marino ctx->position++;
22865f2eab64SJohn Marino ctx->re++;
22875f2eab64SJohn Marino break;
22885f2eab64SJohn Marino }
22895f2eab64SJohn Marino break;
2290d5f8dde1SJohn Marino }
22915f2eab64SJohn Marino
22925f2eab64SJohn Marino case PARSE_MARK_FOR_SUBMATCH:
22935f2eab64SJohn Marino {
22945f2eab64SJohn Marino int submatch_id = tre_stack_pop_int(stack);
22955f2eab64SJohn Marino
2296d5f8dde1SJohn Marino ctx->cflags = tre_stack_pop_int(stack); /* restore cflags */
2297d5f8dde1SJohn Marino if (result->submatch_id >= 0 &&
2298d5f8dde1SJohn Marino result->submatch_id < SUBMATCH_ID_INVISIBLE_START)
22995f2eab64SJohn Marino {
23005f2eab64SJohn Marino tre_ast_node_t *n, *tmp_node;
2301d5f8dde1SJohn Marino if (submatch_id >= SUBMATCH_ID_INVISIBLE_START)
2302d5f8dde1SJohn Marino break;
23035f2eab64SJohn Marino n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
23045f2eab64SJohn Marino if (n == NULL)
23055f2eab64SJohn Marino return REG_ESPACE;
23065f2eab64SJohn Marino tmp_node = tre_ast_new_catenation(ctx->mem, n, result);
23075f2eab64SJohn Marino if (tmp_node == NULL)
23085f2eab64SJohn Marino return REG_ESPACE;
23095f2eab64SJohn Marino tmp_node->num_submatches = result->num_submatches;
23105f2eab64SJohn Marino result = tmp_node;
23115f2eab64SJohn Marino }
23125f2eab64SJohn Marino result->submatch_id = submatch_id;
2313d5f8dde1SJohn Marino if (submatch_id < SUBMATCH_ID_INVISIBLE_START)
23145f2eab64SJohn Marino result->num_submatches++;
23155f2eab64SJohn Marino break;
23165f2eab64SJohn Marino }
23175f2eab64SJohn Marino
23185f2eab64SJohn Marino default:
23195f2eab64SJohn Marino assert(0);
23205f2eab64SJohn Marino break;
23215f2eab64SJohn Marino }
23225f2eab64SJohn Marino }
23235f2eab64SJohn Marino
23245f2eab64SJohn Marino /* Check for missing closing parentheses. */
23255f2eab64SJohn Marino if (depth > 0)
23265f2eab64SJohn Marino return REG_EPAREN;
23275f2eab64SJohn Marino
23285f2eab64SJohn Marino ctx->result = result;
23295f2eab64SJohn Marino
2330d5f8dde1SJohn Marino return REG_OK;
23315f2eab64SJohn Marino }
23325f2eab64SJohn Marino
23335f2eab64SJohn Marino /* EOF */
2334