1 /* 2 tre-parse.c - Regexp parser 3 4 This software is released under a BSD-style license. 5 See the file LICENSE for details and copyright. 6 7 */ 8 9 /* 10 This parser is just a simple recursive descent parser for POSIX.2 11 regexps. The parser supports both the obsolete default syntax and 12 the "extended" syntax, and some nonstandard extensions. 13 */ 14 15 16 #ifdef HAVE_CONFIG_H 17 #include <config.h> 18 #endif /* HAVE_CONFIG_H */ 19 #include <string.h> 20 #include <assert.h> 21 #include <limits.h> 22 23 #include "xmalloc.h" 24 #include "tre-mem.h" 25 #include "tre-ast.h" 26 #include "tre-stack.h" 27 #include "tre-parse.h" 28 29 30 /* Characters with special meanings in regexp syntax. */ 31 #define CHAR_PIPE L'|' 32 #define CHAR_LPAREN L'(' 33 #define CHAR_RPAREN L')' 34 #define CHAR_LBRACE L'{' 35 #define CHAR_RBRACE L'}' 36 #define CHAR_LBRACKET L'[' 37 #define CHAR_RBRACKET L']' 38 #define CHAR_MINUS L'-' 39 #define CHAR_STAR L'*' 40 #define CHAR_QUESTIONMARK L'?' 41 #define CHAR_PLUS L'+' 42 #define CHAR_PERIOD L'.' 43 #define CHAR_COLON L':' 44 #define CHAR_EQUAL L'=' 45 #define CHAR_COMMA L',' 46 #define CHAR_CARET L'^' 47 #define CHAR_DOLLAR L'$' 48 #define CHAR_BACKSLASH L'\\' 49 #define CHAR_HASH L'#' 50 #define CHAR_TILDE L'~' 51 52 53 /* Some macros for expanding \w, \s, etc. */ 54 static const struct tre_macro_struct { 55 const char c; 56 const char *expansion; 57 } tre_macros[] = 58 { {'t', "\t"}, {'n', "\n"}, {'r', "\r"}, 59 {'f', "\f"}, {'a', "\a"}, {'e', "\033"}, 60 {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"}, 61 {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"}, 62 { 0, NULL } 63 }; 64 65 66 /* Expands a macro delimited by `regex' and `regex_end' to `buf', which 67 must have at least `len' items. Sets buf[0] to zero if the there 68 is no match in `tre_macros'. */ 69 static void 70 tre_expand_macro(const tre_char_t *regex, const tre_char_t *regex_end, 71 tre_char_t *buf, size_t buf_len) 72 { 73 int i; 74 75 buf[0] = 0; 76 if (regex >= regex_end) 77 return; 78 79 for (i = 0; tre_macros[i].expansion; i++) 80 { 81 if (tre_macros[i].c == *regex) 82 { 83 unsigned int j; 84 DPRINT(("Expanding macro '%c' => '%s'\n", 85 tre_macros[i].c, tre_macros[i].expansion)); 86 for (j = 0; tre_macros[i].expansion[j] && j < buf_len; j++) 87 buf[j] = tre_macros[i].expansion[j]; 88 buf[j] = 0; 89 break; 90 } 91 } 92 } 93 94 static reg_errcode_t 95 tre_new_item(tre_mem_t mem, int min, int max, int *i, int *max_i, 96 tre_ast_node_t ***items) 97 { 98 reg_errcode_t status; 99 tre_ast_node_t **array = *items; 100 /* Allocate more space if necessary. */ 101 if (*i >= *max_i) 102 { 103 tre_ast_node_t **new_items; 104 DPRINT(("out of array space, i = %d\n", *i)); 105 /* If the array is already 1024 items large, give up -- there's 106 probably an error in the regexp (e.g. not a '\0' terminated 107 string and missing ']') */ 108 if (*max_i > 1024) 109 return REG_ESPACE; 110 *max_i *= 2; 111 new_items = xrealloc(array, sizeof(*items) * *max_i); 112 if (new_items == NULL) 113 return REG_ESPACE; 114 *items = array = new_items; 115 } 116 array[*i] = tre_ast_new_literal(mem, min, max, -1); 117 status = array[*i] == NULL ? REG_ESPACE : REG_OK; 118 (*i)++; 119 return status; 120 } 121 122 123 /* Expands a character class to character ranges. */ 124 static reg_errcode_t 125 tre_expand_ctype(tre_mem_t mem, tre_ctype_t class, tre_ast_node_t ***items, 126 int *i, int *max_i, int cflags) 127 { 128 reg_errcode_t status = REG_OK; 129 tre_cint_t c; 130 int j, min = -1, max = 0; 131 assert(TRE_MB_CUR_MAX == 1); 132 133 DPRINT((" expanding class to character ranges\n")); 134 for (j = 0; (j < 256) && (status == REG_OK); j++) 135 { 136 c = j; 137 if (tre_isctype(c, class) 138 || ((cflags & REG_ICASE) 139 && (tre_isctype(tre_tolower(c), class) 140 || tre_isctype(tre_toupper(c), class)))) 141 { 142 if (min < 0) 143 min = c; 144 max = c; 145 } 146 else if (min >= 0) 147 { 148 DPRINT((" range %c (%d) to %c (%d)\n", min, min, max, max)); 149 status = tre_new_item(mem, min, max, i, max_i, items); 150 min = -1; 151 } 152 } 153 if (min >= 0 && status == REG_OK) 154 status = tre_new_item(mem, min, max, i, max_i, items); 155 return status; 156 } 157 158 159 static int 160 tre_compare_items(const void *a, const void *b) 161 { 162 const tre_ast_node_t *node_a = *(tre_ast_node_t * const *)a; 163 const tre_ast_node_t *node_b = *(tre_ast_node_t * const *)b; 164 tre_literal_t *l_a = node_a->obj, *l_b = node_b->obj; 165 int a_min = l_a->code_min, b_min = l_b->code_min; 166 167 if (a_min < b_min) 168 return -1; 169 else if (a_min > b_min) 170 return 1; 171 else 172 return 0; 173 } 174 175 #ifndef TRE_USE_SYSTEM_WCTYPE 176 177 int tre_isalnum_func(tre_cint_t); 178 int tre_isalpha_func(tre_cint_t); 179 int tre_isascii_func(tre_cint_t); 180 int tre_isblank_func(tre_cint_t); 181 int tre_iscntrl_func(tre_cint_t); 182 int tre_isdigit_func(tre_cint_t); 183 int tre_isgraph_func(tre_cint_t); 184 int tre_islower_func(tre_cint_t); 185 int tre_isprint_func(tre_cint_t); 186 int tre_ispunct_func(tre_cint_t); 187 int tre_isspace_func(tre_cint_t); 188 int tre_isupper_func(tre_cint_t); 189 int tre_isxdigit_func(tre_cint_t); 190 191 /* isalnum() and the rest may be macros, so wrap them to functions. */ 192 int tre_isalnum_func(tre_cint_t c) { return tre_isalnum(c); } 193 int tre_isalpha_func(tre_cint_t c) { return tre_isalpha(c); } 194 195 #ifdef tre_isascii 196 int tre_isascii_func(tre_cint_t c) { return tre_isascii(c); } 197 #else /* !tre_isascii */ 198 int tre_isascii_func(tre_cint_t c) { return !(c >> 7); } 199 #endif /* !tre_isascii */ 200 201 #ifdef tre_isblank 202 int tre_isblank_func(tre_cint_t c) { return tre_isblank(c); } 203 #else /* !tre_isblank */ 204 int tre_isblank_func(tre_cint_t c) { return ((c == ' ') || (c == '\t')); } 205 #endif /* !tre_isblank */ 206 207 int tre_iscntrl_func(tre_cint_t c) { return tre_iscntrl(c); } 208 int tre_isdigit_func(tre_cint_t c) { return tre_isdigit(c); } 209 int tre_isgraph_func(tre_cint_t c) { return tre_isgraph(c); } 210 int tre_islower_func(tre_cint_t c) { return tre_islower(c); } 211 int tre_isprint_func(tre_cint_t c) { return tre_isprint(c); } 212 int tre_ispunct_func(tre_cint_t c) { return tre_ispunct(c); } 213 int tre_isspace_func(tre_cint_t c) { return tre_isspace(c); } 214 int tre_isupper_func(tre_cint_t c) { return tre_isupper(c); } 215 int tre_isxdigit_func(tre_cint_t c) { return tre_isxdigit(c); } 216 217 struct { 218 const char *name; 219 int (*func)(tre_cint_t); 220 } tre_ctype_map[] = { 221 { "alnum", &tre_isalnum_func }, 222 { "alpha", &tre_isalpha_func }, 223 #ifdef tre_isascii 224 { "ascii", &tre_isascii_func }, 225 #endif /* tre_isascii */ 226 #ifdef tre_isblank 227 { "blank", &tre_isblank_func }, 228 #endif /* tre_isblank */ 229 { "cntrl", &tre_iscntrl_func }, 230 { "digit", &tre_isdigit_func }, 231 { "graph", &tre_isgraph_func }, 232 { "lower", &tre_islower_func }, 233 { "print", &tre_isprint_func }, 234 { "punct", &tre_ispunct_func }, 235 { "space", &tre_isspace_func }, 236 { "upper", &tre_isupper_func }, 237 { "xdigit", &tre_isxdigit_func }, 238 { NULL, NULL} 239 }; 240 241 tre_ctype_t tre_ctype(const char *name) 242 { 243 int i; 244 for (i = 0; tre_ctype_map[i].name != NULL; i++) 245 { 246 if (strcmp(name, tre_ctype_map[i].name) == 0) 247 return tre_ctype_map[i].func; 248 } 249 return (tre_ctype_t)0; 250 } 251 #endif /* !TRE_USE_SYSTEM_WCTYPE */ 252 253 /* Maximum number of character classes that can occur in a negated bracket 254 expression. */ 255 #define MAX_NEG_CLASSES 64 256 257 /* Maximum length of character class names. */ 258 #define MAX_CLASS_NAME 259 260 #define REST(re) (int)(ctx->re_end - (re)), (re) 261 262 static reg_errcode_t 263 tre_parse_bracket_items(tre_parse_ctx_t *ctx, int negate, 264 tre_ctype_t neg_classes[], int *num_neg_classes, 265 tre_ast_node_t ***items, int *num_items, 266 int *items_size) 267 { 268 const tre_char_t *re = ctx->re; 269 reg_errcode_t status = REG_OK; 270 tre_ctype_t class = (tre_ctype_t)0; 271 int i = *num_items; 272 int max_i = *items_size; 273 int skip; 274 275 /* Build an array of the items in the bracket expression. */ 276 while (status == REG_OK) 277 { 278 skip = 0; 279 if (re == ctx->re_end) 280 { 281 status = REG_EBRACK; 282 } 283 else if (*re == CHAR_RBRACKET && re > ctx->re) 284 { 285 DPRINT(("tre_parse_bracket: done: '%.*" STRF "'\n", REST(re))); 286 re++; 287 break; 288 } 289 else 290 { 291 tre_cint_t min = 0, max = 0; 292 293 class = (tre_ctype_t)0; 294 if (re + 2 < ctx->re_end 295 && *(re + 1) == CHAR_MINUS && *(re + 2) != CHAR_RBRACKET) 296 { 297 DPRINT(("tre_parse_bracket: range: '%.*" STRF "'\n", REST(re))); 298 min = *re; 299 max = *(re + 2); 300 re += 3; 301 /* XXX - Should use collation order instead of encoding values 302 in character ranges. */ 303 if (min > max) 304 status = REG_ERANGE; 305 } 306 else if (re + 1 < ctx->re_end 307 && *re == CHAR_LBRACKET && *(re + 1) == CHAR_PERIOD) 308 status = REG_ECOLLATE; 309 else if (re + 1 < ctx->re_end 310 && *re == CHAR_LBRACKET && *(re + 1) == CHAR_EQUAL) 311 status = REG_ECOLLATE; 312 else if (re + 1 < ctx->re_end 313 && *re == CHAR_LBRACKET && *(re + 1) == CHAR_COLON) 314 { 315 char tmp_str[64]; 316 const tre_char_t *endptr = re + 2; 317 int len; 318 DPRINT(("tre_parse_bracket: class: '%.*" STRF "'\n", REST(re))); 319 while (endptr < ctx->re_end && *endptr != CHAR_COLON) 320 endptr++; 321 if (endptr != ctx->re_end) 322 { 323 len = MIN(endptr - re - 2, 63); 324 #ifdef TRE_WCHAR 325 { 326 tre_char_t tmp_wcs[64]; 327 wcsncpy(tmp_wcs, re + 2, (size_t)len); 328 tmp_wcs[len] = L'\0'; 329 #if defined HAVE_WCSRTOMBS 330 { 331 mbstate_t state; 332 const tre_char_t *src = tmp_wcs; 333 memset(&state, '\0', sizeof(state)); 334 len = wcsrtombs(tmp_str, &src, sizeof(tmp_str), &state); 335 } 336 #elif defined HAVE_WCSTOMBS 337 len = wcstombs(tmp_str, tmp_wcs, 63); 338 #endif /* defined HAVE_WCSTOMBS */ 339 } 340 #else /* !TRE_WCHAR */ 341 /* LINTED */strncpy(tmp_str, (const char*)re + 2, len); 342 #endif /* !TRE_WCHAR */ 343 tmp_str[len] = '\0'; 344 DPRINT((" class name: %s\n", tmp_str)); 345 class = tre_ctype(tmp_str); 346 if (!class) 347 status = REG_ECTYPE; 348 /* Optimize character classes for 8 bit character sets. */ 349 /* CONSTCOND */if (status == REG_OK && TRE_MB_CUR_MAX == 1) 350 { 351 status = tre_expand_ctype(ctx->mem, class, items, 352 &i, &max_i, ctx->cflags); 353 class = (tre_ctype_t)0; 354 skip = 1; 355 } 356 re = endptr + 2; 357 } 358 else 359 status = REG_ECTYPE; 360 min = 0; 361 max = TRE_CHAR_MAX; 362 } 363 else 364 { 365 DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re))); 366 if (*re == CHAR_MINUS && *(re + 1) != CHAR_RBRACKET 367 && ctx->re != re) 368 /* Two ranges are not allowed to share and endpoint. */ 369 status = REG_ERANGE; 370 min = max = *re++; 371 } 372 373 if (status != REG_OK) 374 break; 375 376 if (class && negate) 377 if (*num_neg_classes >= MAX_NEG_CLASSES) 378 status = REG_ESPACE; 379 else 380 neg_classes[(*num_neg_classes)++] = class; 381 else if (!skip) 382 { 383 status = tre_new_item(ctx->mem, min, max, &i, &max_i, items); 384 if (status != REG_OK) 385 break; 386 ((tre_literal_t*)((*items)[i-1])->obj)->u.class = class; 387 } 388 389 /* Add opposite-case counterpoints if REG_ICASE is present. 390 This is broken if there are more than two "same" characters. */ 391 if (ctx->cflags & REG_ICASE && !class && status == REG_OK && !skip) 392 { 393 tre_cint_t cmin, ccurr; 394 395 DPRINT(("adding opposite-case counterpoints\n")); 396 while (min <= max) 397 { 398 if (tre_islower(min)) 399 { 400 cmin = ccurr = tre_toupper(min++); 401 while (tre_islower(min) && tre_toupper(min) == ccurr + 1 402 && min <= max) 403 ccurr = tre_toupper(min++); 404 status = tre_new_item(ctx->mem, cmin, ccurr, 405 &i, &max_i, items); 406 } 407 else if (tre_isupper(min)) 408 { 409 cmin = ccurr = tre_tolower(min++); 410 while (tre_isupper(min) && tre_tolower(min) == ccurr + 1 411 && min <= max) 412 ccurr = tre_tolower(min++); 413 status = tre_new_item(ctx->mem, cmin, ccurr, 414 &i, &max_i, items); 415 } 416 else min++; 417 if (status != REG_OK) 418 break; 419 } 420 if (status != REG_OK) 421 break; 422 } 423 } 424 } 425 *num_items = i; 426 *items_size = max_i; 427 ctx->re = re; 428 return status; 429 } 430 431 static reg_errcode_t 432 tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result) 433 { 434 tre_ast_node_t *node = NULL; 435 int negate = 0; 436 reg_errcode_t status = REG_OK; 437 tre_ast_node_t **items, *u, *n; 438 int i = 0, j, max_i = 32, curr_max, curr_min; 439 tre_ctype_t neg_classes[MAX_NEG_CLASSES]; 440 int num_neg_classes = 0; 441 442 /* Start off with an array of `max_i' elements. */ 443 items = xmalloc(sizeof(*items) * max_i); 444 if (items == NULL) 445 return REG_ESPACE; 446 447 if (*ctx->re == CHAR_CARET) 448 { 449 DPRINT(("tre_parse_bracket: negate: '%.*" STRF "'\n", REST(ctx->re))); 450 negate = 1; 451 ctx->re++; 452 } 453 454 status = tre_parse_bracket_items(ctx, negate, neg_classes, &num_neg_classes, 455 &items, &i, &max_i); 456 457 if (status != REG_OK) 458 goto parse_bracket_done; 459 460 /* Sort the array if we need to negate it. */ 461 if (negate) 462 qsort(items, (unsigned)i, sizeof(*items), tre_compare_items); 463 464 curr_max = curr_min = 0; 465 /* Build a union of the items in the array, negated if necessary. */ 466 for (j = 0; j < i && status == REG_OK; j++) 467 { 468 int min, max; 469 tre_literal_t *l = items[j]->obj; 470 min = l->code_min; 471 max = l->code_max; 472 473 DPRINT(("item: %d - %d, class %ld, curr_max = %d\n", 474 (int)l->code_min, (int)l->code_max, (long)l->u.class, curr_max)); 475 476 if (negate) 477 { 478 if (min < curr_max) 479 { 480 /* Overlap. */ 481 curr_max = MAX(max + 1, curr_max); 482 DPRINT(("overlap, curr_max = %d\n", curr_max)); 483 l = NULL; 484 } 485 else 486 { 487 /* No overlap. */ 488 curr_max = min - 1; 489 if (curr_max >= curr_min) 490 { 491 DPRINT(("no overlap\n")); 492 l->code_min = curr_min; 493 l->code_max = curr_max; 494 } 495 else 496 { 497 DPRINT(("no overlap, zero room\n")); 498 l = NULL; 499 } 500 curr_min = curr_max = max + 1; 501 } 502 } 503 504 if (l != NULL) 505 { 506 int k; 507 DPRINT(("creating %d - %d\n", (int)l->code_min, (int)l->code_max)); 508 l->position = ctx->position; 509 if (num_neg_classes > 0) 510 { 511 l->neg_classes = tre_mem_alloc(ctx->mem, 512 (sizeof(l->neg_classes) 513 * (num_neg_classes + 1))); 514 if (l->neg_classes == NULL) 515 { 516 status = REG_ESPACE; 517 break; 518 } 519 for (k = 0; k < num_neg_classes; k++) 520 l->neg_classes[k] = neg_classes[k]; 521 l->neg_classes[k] = (tre_ctype_t)0; 522 } 523 else 524 l->neg_classes = NULL; 525 if (node == NULL) 526 node = items[j]; 527 else 528 { 529 u = tre_ast_new_union(ctx->mem, node, items[j]); 530 if (u == NULL) 531 status = REG_ESPACE; 532 node = u; 533 } 534 } 535 } 536 537 if (status != REG_OK) 538 goto parse_bracket_done; 539 540 if (negate) 541 { 542 int k; 543 DPRINT(("final: creating %d - %d\n", curr_min, (int)TRE_CHAR_MAX)); 544 n = tre_ast_new_literal(ctx->mem, curr_min, TRE_CHAR_MAX, ctx->position); 545 if (n == NULL) 546 status = REG_ESPACE; 547 else 548 { 549 tre_literal_t *l = n->obj; 550 if (num_neg_classes > 0) 551 { 552 l->neg_classes = tre_mem_alloc(ctx->mem, 553 (sizeof(l->neg_classes) 554 * (num_neg_classes + 1))); 555 if (l->neg_classes == NULL) 556 { 557 status = REG_ESPACE; 558 goto parse_bracket_done; 559 } 560 for (k = 0; k < num_neg_classes; k++) 561 l->neg_classes[k] = neg_classes[k]; 562 l->neg_classes[k] = (tre_ctype_t)0; 563 } 564 else 565 l->neg_classes = NULL; 566 if (node == NULL) 567 node = n; 568 else 569 { 570 u = tre_ast_new_union(ctx->mem, node, n); 571 if (u == NULL) 572 status = REG_ESPACE; 573 node = u; 574 } 575 } 576 } 577 578 if (status != REG_OK) 579 goto parse_bracket_done; 580 581 #ifdef TRE_DEBUG 582 tre_ast_print(node); 583 #endif /* TRE_DEBUG */ 584 585 parse_bracket_done: 586 xfree(items); 587 ctx->position++; 588 *result = node; 589 return status; 590 } 591 592 593 /* Parses a positive decimal integer. Returns -1 if the string does not 594 contain a valid number. */ 595 static int 596 tre_parse_int(const tre_char_t **regex, const tre_char_t *regex_end) 597 { 598 int num = -1; 599 const tre_char_t *r = *regex; 600 while (r < regex_end && *r >= L'0' && *r <= L'9') 601 { 602 if (num < 0) 603 num = 0; 604 num = num * 10 + *r - L'0'; 605 r++; 606 } 607 *regex = r; 608 return num; 609 } 610 611 612 static reg_errcode_t 613 tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result) 614 { 615 int min, max, i; 616 int cost_ins, cost_del, cost_subst, cost_max; 617 int limit_ins, limit_del, limit_subst, limit_err; 618 const tre_char_t *r = ctx->re; 619 const tre_char_t *start; 620 int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0; 621 int approx = 0; 622 int costs_set = 0; 623 int counts_set = 0; 624 625 cost_ins = cost_del = cost_subst = cost_max = TRE_PARAM_UNSET; 626 limit_ins = limit_del = limit_subst = limit_err = TRE_PARAM_UNSET; 627 628 /* Parse number (minimum repetition count). */ 629 min = -1; 630 if (r < ctx->re_end && *r >= L'0' && *r <= L'9') { 631 DPRINT(("tre_parse: min count: '%.*" STRF "'\n", REST(r))); 632 min = tre_parse_int(&r, ctx->re_end); 633 } 634 635 /* Parse comma and second number (maximum repetition count). */ 636 max = min; 637 if (r < ctx->re_end && *r == CHAR_COMMA) 638 { 639 r++; 640 DPRINT(("tre_parse: max count: '%.*" STRF "'\n", REST(r))); 641 max = tre_parse_int(&r, ctx->re_end); 642 } 643 644 /* Check that the repeat counts are sane. */ 645 if ((max >= 0 && min > max) || max > RE_DUP_MAX) 646 return REG_BADBR; 647 648 649 /* 650 '{' 651 optionally followed immediately by a number == minimum repcount 652 optionally followed by , then a number == maximum repcount 653 + then a number == maximum insertion count 654 - then a number == maximum deletion count 655 # then a number == maximum substitution count 656 ~ then a number == maximum number of errors 657 Any of +, -, # or ~ without followed by a number means that 658 the maximum count/number of errors is infinite. 659 660 An equation of the form 661 Xi + Yd + Zs < C 662 can be specified to set costs and the cost limit to a value 663 different from the default value: 664 - X is the cost of an insertion 665 - Y is the cost of a deletion 666 - Z is the cost of a substitution 667 - C is the maximum cost 668 669 If no count limit or cost is set for an operation, the operation 670 is not allowed at all. 671 */ 672 673 674 do { 675 int done; 676 start = r; 677 678 /* Parse count limit settings */ 679 done = 0; 680 if (!counts_set) 681 while (r + 1 < ctx->re_end && !done) 682 { 683 switch (*r) 684 { 685 case CHAR_PLUS: /* Insert limit */ 686 DPRINT(("tre_parse: ins limit: '%.*" STRF "'\n", REST(r))); 687 r++; 688 limit_ins = tre_parse_int(&r, ctx->re_end); 689 if (limit_ins < 0) 690 limit_ins = INT_MAX; 691 counts_set = 1; 692 break; 693 case CHAR_MINUS: /* Delete limit */ 694 DPRINT(("tre_parse: del limit: '%.*" STRF "'\n", REST(r))); 695 r++; 696 limit_del = tre_parse_int(&r, ctx->re_end); 697 if (limit_del < 0) 698 limit_del = INT_MAX; 699 counts_set = 1; 700 break; 701 case CHAR_HASH: /* Substitute limit */ 702 DPRINT(("tre_parse: subst limit: '%.*" STRF "'\n", REST(r))); 703 r++; 704 limit_subst = tre_parse_int(&r, ctx->re_end); 705 if (limit_subst < 0) 706 limit_subst = INT_MAX; 707 counts_set = 1; 708 break; 709 case CHAR_TILDE: /* Maximum number of changes */ 710 DPRINT(("tre_parse: count limit: '%.*" STRF "'\n", REST(r))); 711 r++; 712 limit_err = tre_parse_int(&r, ctx->re_end); 713 if (limit_err < 0) 714 limit_err = INT_MAX; 715 approx = 1; 716 break; 717 case CHAR_COMMA: 718 r++; 719 break; 720 case L' ': 721 r++; 722 break; 723 case L'}': 724 done = 1; 725 break; 726 default: 727 done = 1; 728 break; 729 } 730 } 731 732 /* Parse cost restriction equation. */ 733 done = 0; 734 if (!costs_set) 735 while (r + 1 < ctx->re_end && !done) 736 { 737 switch (*r) 738 { 739 case CHAR_PLUS: 740 case L' ': 741 r++; 742 break; 743 case L'<': 744 DPRINT(("tre_parse: max cost: '%.*" STRF "'\n", REST(r))); 745 r++; 746 while (*r == L' ') 747 r++; 748 cost_max = tre_parse_int(&r, ctx->re_end); 749 if (cost_max < 0) 750 cost_max = INT_MAX; 751 else 752 cost_max--; 753 approx = 1; 754 break; 755 case CHAR_COMMA: 756 r++; 757 done = 1; 758 break; 759 default: 760 if (*r >= L'0' && *r <= L'9') 761 { 762 #ifdef TRE_DEBUG 763 const tre_char_t *sr = r; 764 #endif /* TRE_DEBUG */ 765 int cost = tre_parse_int(&r, ctx->re_end); 766 /* XXX - make sure r is not past end. */ 767 switch (*r) 768 { 769 case L'i': /* Insert cost */ 770 DPRINT(("tre_parse: ins cost: '%.*" STRF "'\n", 771 REST(sr))); 772 r++; 773 cost_ins = cost; 774 costs_set = 1; 775 break; 776 case L'd': /* Delete cost */ 777 DPRINT(("tre_parse: del cost: '%.*" STRF "'\n", 778 REST(sr))); 779 r++; 780 cost_del = cost; 781 costs_set = 1; 782 break; 783 case L's': /* Substitute cost */ 784 DPRINT(("tre_parse: subst cost: '%.*" STRF "'\n", 785 REST(sr))); 786 r++; 787 cost_subst = cost; 788 costs_set = 1; 789 break; 790 default: 791 return REG_BADBR; 792 } 793 } 794 else 795 { 796 done = 1; 797 break; 798 } 799 } 800 } 801 } while (start != r); 802 803 /* Missing }. */ 804 if (r >= ctx->re_end) 805 return REG_EBRACE; 806 807 /* Empty contents of {}. */ 808 if (r == ctx->re) 809 return REG_BADBR; 810 811 /* Parse the ending '}' or '\}'.*/ 812 if (ctx->cflags & REG_EXTENDED) 813 { 814 if (r >= ctx->re_end || *r != CHAR_RBRACE) 815 return REG_BADBR; 816 r++; 817 } 818 else 819 { 820 if (r + 1 >= ctx->re_end 821 || *r != CHAR_BACKSLASH 822 || *(r + 1) != CHAR_RBRACE) 823 return REG_BADBR; 824 r += 2; 825 } 826 827 828 /* Parse trailing '?' marking minimal repetition. */ 829 if (r < ctx->re_end) 830 { 831 if (*r == CHAR_QUESTIONMARK) 832 { 833 minimal = !(ctx->cflags & REG_UNGREEDY); 834 r++; 835 } 836 else if (*r == CHAR_STAR || *r == CHAR_PLUS) 837 { 838 /* These are reserved for future extensions. */ 839 return REG_BADRPT; 840 } 841 } 842 843 /* Create the AST node(s). */ 844 if (min == 0 && max == 0) 845 { 846 *result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); 847 if (*result == NULL) 848 return REG_ESPACE; 849 } 850 else 851 { 852 if (min < 0 && max < 0) 853 /* Only approximate parameters set, no repetitions. */ 854 min = max = 1; 855 856 *result = tre_ast_new_iter(ctx->mem, *result, min, max, minimal); 857 if (!*result) 858 return REG_ESPACE; 859 860 /* If approximate matching parameters are set, add them to the 861 iteration node. */ 862 if (approx || costs_set || counts_set) 863 { 864 int *params; 865 tre_iteration_t *iter = (*result)->obj; 866 867 if (costs_set || counts_set) 868 { 869 if (limit_ins == TRE_PARAM_UNSET) 870 { 871 if (cost_ins == TRE_PARAM_UNSET) 872 limit_ins = 0; 873 else 874 limit_ins = INT_MAX; 875 } 876 877 if (limit_del == TRE_PARAM_UNSET) 878 { 879 if (cost_del == TRE_PARAM_UNSET) 880 limit_del = 0; 881 else 882 limit_del = INT_MAX; 883 } 884 885 if (limit_subst == TRE_PARAM_UNSET) 886 { 887 if (cost_subst == TRE_PARAM_UNSET) 888 limit_subst = 0; 889 else 890 limit_subst = INT_MAX; 891 } 892 } 893 894 if (cost_max == TRE_PARAM_UNSET) 895 cost_max = INT_MAX; 896 if (limit_err == TRE_PARAM_UNSET) 897 limit_err = INT_MAX; 898 899 ctx->have_approx = 1; 900 params = tre_mem_alloc(ctx->mem, sizeof(*params) * TRE_PARAM_LAST); 901 if (!params) 902 return REG_ESPACE; 903 for (i = 0; i < TRE_PARAM_LAST; i++) 904 params[i] = TRE_PARAM_UNSET; 905 params[TRE_PARAM_COST_INS] = cost_ins; 906 params[TRE_PARAM_COST_DEL] = cost_del; 907 params[TRE_PARAM_COST_SUBST] = cost_subst; 908 params[TRE_PARAM_COST_MAX] = cost_max; 909 params[TRE_PARAM_MAX_INS] = limit_ins; 910 params[TRE_PARAM_MAX_DEL] = limit_del; 911 params[TRE_PARAM_MAX_SUBST] = limit_subst; 912 params[TRE_PARAM_MAX_ERR] = limit_err; 913 iter->params = params; 914 } 915 } 916 917 DPRINT(("tre_parse_bound: min %d, max %d, costs [%d,%d,%d, total %d], " 918 "limits [%d,%d,%d, total %d]\n", 919 min, max, cost_ins, cost_del, cost_subst, cost_max, 920 limit_ins, limit_del, limit_subst, limit_err)); 921 922 923 ctx->re = r; 924 return REG_OK; 925 } 926 927 typedef enum { 928 PARSE_RE = 0, 929 PARSE_ATOM, 930 PARSE_MARK_FOR_SUBMATCH, 931 PARSE_BRANCH, 932 PARSE_PIECE, 933 PARSE_CATENATION, 934 PARSE_POST_CATENATION, 935 PARSE_UNION, 936 PARSE_POST_UNION, 937 PARSE_POSTFIX, 938 PARSE_RESTORE_CFLAGS 939 } tre_parse_re_stack_symbol_t; 940 941 942 reg_errcode_t 943 tre_parse(tre_parse_ctx_t *ctx) 944 { 945 tre_ast_node_t *result = NULL; 946 tre_parse_re_stack_symbol_t symbol; 947 reg_errcode_t status = REG_OK; 948 tre_stack_t *stack = ctx->stack; 949 int bottom = tre_stack_num_objects(stack); 950 int depth = 0; 951 int temporary_cflags = 0; 952 953 DPRINT(("tre_parse: parsing '%.*" STRF "', len = %d\n", 954 ctx->len, ctx->re, ctx->len)); 955 956 if (!ctx->nofirstsub) 957 { 958 STACK_PUSH(stack, int, ctx->submatch_id); 959 STACK_PUSH(stack, int, PARSE_MARK_FOR_SUBMATCH); 960 ctx->submatch_id++; 961 } 962 STACK_PUSH(stack, int, PARSE_RE); 963 ctx->re_start = ctx->re; 964 ctx->re_end = ctx->re + ctx->len; 965 966 967 /* The following is basically just a recursive descent parser. I use 968 an explicit stack instead of recursive functions mostly because of 969 two reasons: compatibility with systems which have an overflowable 970 call stack, and efficiency (both in lines of code and speed). */ 971 while (tre_stack_num_objects(stack) > bottom && status == REG_OK) 972 { 973 if (status != REG_OK) 974 break; 975 symbol = tre_stack_pop_int(stack); 976 switch (symbol) 977 { 978 case PARSE_RE: 979 /* Parse a full regexp. A regexp is one or more branches, 980 separated by the union operator `|'. */ 981 #ifdef REG_LITERAL 982 if (!(ctx->cflags & REG_LITERAL) 983 && ctx->cflags & REG_EXTENDED) 984 #endif /* REG_LITERAL */ 985 STACK_PUSHX(stack, int, PARSE_UNION); 986 STACK_PUSHX(stack, int, PARSE_BRANCH); 987 break; 988 989 case PARSE_BRANCH: 990 /* Parse a branch. A branch is one or more pieces, concatenated. 991 A piece is an atom possibly followed by a postfix operator. */ 992 STACK_PUSHX(stack, int, PARSE_CATENATION); 993 STACK_PUSHX(stack, int, PARSE_PIECE); 994 break; 995 996 case PARSE_PIECE: 997 /* Parse a piece. A piece is an atom possibly followed by one 998 or more postfix operators. */ 999 #ifdef REG_LITERAL 1000 if (!(ctx->cflags & REG_LITERAL)) 1001 #endif /* REG_LITERAL */ 1002 STACK_PUSHX(stack, int, PARSE_POSTFIX); 1003 STACK_PUSHX(stack, int, PARSE_ATOM); 1004 break; 1005 1006 case PARSE_CATENATION: 1007 /* If the expression has not ended, parse another piece. */ 1008 { 1009 tre_char_t c; 1010 if (ctx->re >= ctx->re_end) 1011 break; 1012 c = *ctx->re; 1013 #ifdef REG_LITERAL 1014 if (!(ctx->cflags & REG_LITERAL)) 1015 { 1016 #endif /* REG_LITERAL */ 1017 if (ctx->cflags & REG_EXTENDED && c == CHAR_PIPE) 1018 break; 1019 if ((ctx->cflags & REG_EXTENDED 1020 && c == CHAR_RPAREN && depth > 0) 1021 || (!(ctx->cflags & REG_EXTENDED) 1022 && (c == CHAR_BACKSLASH 1023 && *(ctx->re + 1) == CHAR_RPAREN))) 1024 { 1025 if (!(ctx->cflags & REG_EXTENDED) && depth == 0) 1026 status = REG_EPAREN; 1027 DPRINT(("tre_parse: group end: '%.*" STRF "'\n", 1028 REST(ctx->re))); 1029 depth--; 1030 if (!(ctx->cflags & REG_EXTENDED)) 1031 ctx->re += 2; 1032 break; 1033 } 1034 #ifdef REG_LITERAL 1035 } 1036 #endif /* REG_LITERAL */ 1037 1038 #ifdef REG_RIGHT_ASSOC 1039 if (ctx->cflags & REG_RIGHT_ASSOC) 1040 { 1041 /* Right associative concatenation. */ 1042 STACK_PUSHX(stack, voidptr, result); 1043 STACK_PUSHX(stack, int, PARSE_POST_CATENATION); 1044 STACK_PUSHX(stack, int, PARSE_CATENATION); 1045 STACK_PUSHX(stack, int, PARSE_PIECE); 1046 } 1047 else 1048 #endif /* REG_RIGHT_ASSOC */ 1049 { 1050 /* Default case, left associative concatenation. */ 1051 STACK_PUSHX(stack, int, PARSE_CATENATION); 1052 STACK_PUSHX(stack, voidptr, result); 1053 STACK_PUSHX(stack, int, PARSE_POST_CATENATION); 1054 STACK_PUSHX(stack, int, PARSE_PIECE); 1055 } 1056 break; 1057 } 1058 1059 case PARSE_POST_CATENATION: 1060 { 1061 tre_ast_node_t *tree = tre_stack_pop_voidptr(stack); 1062 tre_ast_node_t *tmp_node; 1063 tmp_node = tre_ast_new_catenation(ctx->mem, tree, result); 1064 if (!tmp_node) 1065 return REG_ESPACE; 1066 result = tmp_node; 1067 break; 1068 } 1069 1070 case PARSE_UNION: 1071 if (ctx->re >= ctx->re_end) 1072 break; 1073 #ifdef REG_LITERAL 1074 if (ctx->cflags & REG_LITERAL) 1075 break; 1076 #endif /* REG_LITERAL */ 1077 switch (*ctx->re) 1078 { 1079 case CHAR_PIPE: 1080 DPRINT(("tre_parse: union: '%.*" STRF "'\n", 1081 REST(ctx->re))); 1082 STACK_PUSHX(stack, int, PARSE_UNION); 1083 STACK_PUSHX(stack, voidptr, result); 1084 STACK_PUSHX(stack, int, PARSE_POST_UNION); 1085 STACK_PUSHX(stack, int, PARSE_BRANCH); 1086 ctx->re++; 1087 break; 1088 1089 case CHAR_RPAREN: 1090 ctx->re++; 1091 break; 1092 1093 default: 1094 break; 1095 } 1096 break; 1097 1098 case PARSE_POST_UNION: 1099 { 1100 tre_ast_node_t *tmp_node; 1101 tre_ast_node_t *tree = tre_stack_pop_voidptr(stack); 1102 tmp_node = tre_ast_new_union(ctx->mem, tree, result); 1103 if (!tmp_node) 1104 return REG_ESPACE; 1105 result = tmp_node; 1106 break; 1107 } 1108 1109 case PARSE_POSTFIX: 1110 /* Parse postfix operators. */ 1111 if (ctx->re >= ctx->re_end) 1112 break; 1113 #ifdef REG_LITERAL 1114 if (ctx->cflags & REG_LITERAL) 1115 break; 1116 #endif /* REG_LITERAL */ 1117 switch (*ctx->re) 1118 { 1119 case CHAR_PLUS: 1120 case CHAR_QUESTIONMARK: 1121 if (!(ctx->cflags & REG_EXTENDED)) 1122 break; 1123 /*FALLTHROUGH*/ 1124 case CHAR_STAR: 1125 { 1126 tre_ast_node_t *tmp_node; 1127 int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0; 1128 int rep_min = 0; 1129 int rep_max = -1; 1130 #ifdef TRE_DEBUG 1131 const tre_char_t *tmp_re; 1132 #endif 1133 1134 if (*ctx->re == CHAR_PLUS) 1135 rep_min = 1; 1136 if (*ctx->re == CHAR_QUESTIONMARK) 1137 rep_max = 1; 1138 #ifdef TRE_DEBUG 1139 tmp_re = ctx->re; 1140 #endif 1141 1142 if (ctx->re + 1 < ctx->re_end) 1143 { 1144 if (*(ctx->re + 1) == CHAR_QUESTIONMARK) 1145 { 1146 minimal = !(ctx->cflags & REG_UNGREEDY); 1147 ctx->re++; 1148 } 1149 else if (*(ctx->re + 1) == CHAR_STAR 1150 || *(ctx->re + 1) == CHAR_PLUS) 1151 { 1152 /* These are reserved for future extensions. */ 1153 return REG_BADRPT; 1154 } 1155 } 1156 1157 DPRINT(("tre_parse: %s star: '%.*" STRF "'\n", 1158 minimal ? " minimal" : "greedy", REST(tmp_re))); 1159 ctx->re++; 1160 tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max, 1161 minimal); 1162 if (tmp_node == NULL) 1163 return REG_ESPACE; 1164 result = tmp_node; 1165 STACK_PUSHX(stack, int, PARSE_POSTFIX); 1166 } 1167 break; 1168 1169 case CHAR_BACKSLASH: 1170 /* "\{" is special without REG_EXTENDED */ 1171 if (!(ctx->cflags & REG_EXTENDED) 1172 && ctx->re + 1 < ctx->re_end 1173 && *(ctx->re + 1) == CHAR_LBRACE) 1174 { 1175 ctx->re++; 1176 goto parse_brace; 1177 } 1178 else 1179 break; 1180 1181 case CHAR_LBRACE: 1182 /* "{" is literal without REG_EXTENDED */ 1183 if (!(ctx->cflags & REG_EXTENDED)) 1184 break; 1185 1186 parse_brace: 1187 DPRINT(("tre_parse: bound: '%.*" STRF "'\n", 1188 REST(ctx->re))); 1189 ctx->re++; 1190 1191 status = tre_parse_bound(ctx, &result); 1192 if (status != REG_OK) 1193 return status; 1194 STACK_PUSHX(stack, int, PARSE_POSTFIX); 1195 break; 1196 } 1197 break; 1198 1199 case PARSE_ATOM: 1200 /* Parse an atom. An atom is a regular expression enclosed in `()', 1201 an empty set of `()', a bracket expression, `.', `^', `$', 1202 a `\' followed by a character, or a single character. */ 1203 1204 /* End of regexp? (empty string). */ 1205 if (ctx->re >= ctx->re_end) 1206 goto parse_literal; 1207 1208 #ifdef REG_LITERAL 1209 if (ctx->cflags & REG_LITERAL) 1210 goto parse_literal; 1211 #endif /* REG_LITERAL */ 1212 1213 switch (*ctx->re) 1214 { 1215 case CHAR_LPAREN: /* parenthesized subexpression */ 1216 1217 /* Handle "(?...)" extensions. They work in a way similar 1218 to Perls corresponding extensions. */ 1219 if (ctx->cflags & REG_EXTENDED 1220 && *(ctx->re + 1) == CHAR_QUESTIONMARK) 1221 { 1222 int new_cflags = ctx->cflags; 1223 int bit = 1; 1224 DPRINT(("tre_parse: extension: '%.*" STRF "\n", 1225 REST(ctx->re))); 1226 ctx->re += 2; 1227 while (/*CONSTCOND*/1) 1228 { 1229 if (*ctx->re == L'i') 1230 { 1231 DPRINT(("tre_parse: icase: '%.*" STRF "\n", 1232 REST(ctx->re))); 1233 if (bit) 1234 new_cflags |= REG_ICASE; 1235 else 1236 new_cflags &= ~REG_ICASE; 1237 ctx->re++; 1238 } 1239 else if (*ctx->re == L'n') 1240 { 1241 DPRINT(("tre_parse: newline: '%.*" STRF "\n", 1242 REST(ctx->re))); 1243 if (bit) 1244 new_cflags |= REG_NEWLINE; 1245 else 1246 new_cflags &= ~REG_NEWLINE; 1247 ctx->re++; 1248 } 1249 #ifdef REG_RIGHT_ASSOC 1250 else if (*ctx->re == L'r') 1251 { 1252 DPRINT(("tre_parse: right assoc: '%.*" STRF "\n", 1253 REST(ctx->re))); 1254 if (bit) 1255 new_cflags |= REG_RIGHT_ASSOC; 1256 else 1257 new_cflags &= ~REG_RIGHT_ASSOC; 1258 ctx->re++; 1259 } 1260 #endif /* REG_RIGHT_ASSOC */ 1261 #ifdef REG_UNGREEDY 1262 else if (*ctx->re == L'U') 1263 { 1264 DPRINT(("tre_parse: ungreedy: '%.*" STRF "\n", 1265 REST(ctx->re))); 1266 if (bit) 1267 new_cflags |= REG_UNGREEDY; 1268 else 1269 new_cflags &= ~REG_UNGREEDY; 1270 ctx->re++; 1271 } 1272 #endif /* REG_UNGREEDY */ 1273 else if (*ctx->re == CHAR_MINUS) 1274 { 1275 DPRINT(("tre_parse: turn off: '%.*" STRF "\n", 1276 REST(ctx->re))); 1277 ctx->re++; 1278 bit = 0; 1279 } 1280 else if (*ctx->re == CHAR_COLON) 1281 { 1282 DPRINT(("tre_parse: no group: '%.*" STRF "\n", 1283 REST(ctx->re))); 1284 ctx->re++; 1285 depth++; 1286 break; 1287 } 1288 else if (*ctx->re == CHAR_HASH) 1289 { 1290 DPRINT(("tre_parse: comment: '%.*" STRF "\n", 1291 REST(ctx->re))); 1292 /* A comment can contain any character except a 1293 right parenthesis */ 1294 while (*ctx->re != CHAR_RPAREN 1295 && ctx->re < ctx->re_end) 1296 ctx->re++; 1297 if (*ctx->re == CHAR_RPAREN && ctx->re < ctx->re_end) 1298 { 1299 ctx->re++; 1300 break; 1301 } 1302 else 1303 return REG_BADPAT; 1304 } 1305 else if (*ctx->re == CHAR_RPAREN) 1306 { 1307 ctx->re++; 1308 break; 1309 } 1310 else 1311 return REG_BADPAT; 1312 } 1313 1314 /* Turn on the cflags changes for the rest of the 1315 enclosing group. */ 1316 STACK_PUSHX(stack, int, ctx->cflags); 1317 STACK_PUSHX(stack, int, PARSE_RESTORE_CFLAGS); 1318 STACK_PUSHX(stack, int, PARSE_RE); 1319 ctx->cflags = new_cflags; 1320 break; 1321 } 1322 1323 if (ctx->cflags & REG_EXTENDED 1324 || (ctx->re > ctx->re_start 1325 && *(ctx->re - 1) == CHAR_BACKSLASH)) 1326 { 1327 depth++; 1328 if (ctx->re + 2 < ctx->re_end 1329 && *(ctx->re + 1) == CHAR_QUESTIONMARK 1330 && *(ctx->re + 2) == CHAR_COLON) 1331 { 1332 DPRINT(("tre_parse: group begin: '%.*" STRF 1333 "', no submatch\n", REST(ctx->re))); 1334 /* Don't mark for submatching. */ 1335 ctx->re += 3; 1336 STACK_PUSHX(stack, int, PARSE_RE); 1337 } 1338 else 1339 { 1340 DPRINT(("tre_parse: group begin: '%.*" STRF 1341 "', submatch %d\n", REST(ctx->re), 1342 ctx->submatch_id)); 1343 ctx->re++; 1344 /* First parse a whole RE, then mark the resulting tree 1345 for submatching. */ 1346 STACK_PUSHX(stack, int, ctx->submatch_id); 1347 STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH); 1348 STACK_PUSHX(stack, int, PARSE_RE); 1349 ctx->submatch_id++; 1350 } 1351 } 1352 else 1353 goto parse_literal; 1354 break; 1355 1356 case CHAR_RPAREN: /* end of current subexpression */ 1357 if ((ctx->cflags & REG_EXTENDED && depth > 0) 1358 || (ctx->re > ctx->re_start 1359 && *(ctx->re - 1) == CHAR_BACKSLASH)) 1360 { 1361 DPRINT(("tre_parse: empty: '%.*" STRF "'\n", 1362 REST(ctx->re))); 1363 /* We were expecting an atom, but instead the current 1364 subexpression was closed. POSIX leaves the meaning of 1365 this to be implementation-defined. We interpret this as 1366 an empty expression (which matches an empty string). */ 1367 result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); 1368 if (result == NULL) 1369 return REG_ESPACE; 1370 if (!(ctx->cflags & REG_EXTENDED)) 1371 ctx->re--; 1372 } 1373 else 1374 goto parse_literal; 1375 break; 1376 1377 case CHAR_LBRACKET: /* bracket expression */ 1378 DPRINT(("tre_parse: bracket: '%.*" STRF "'\n", 1379 REST(ctx->re))); 1380 ctx->re++; 1381 status = tre_parse_bracket(ctx, &result); 1382 if (status != REG_OK) 1383 return status; 1384 break; 1385 1386 case CHAR_BACKSLASH: 1387 /* If this is "\(" or "\)" chew off the backslash and 1388 try again. */ 1389 if (!(ctx->cflags & REG_EXTENDED) 1390 && ctx->re + 1 < ctx->re_end 1391 && (*(ctx->re + 1) == CHAR_LPAREN 1392 || *(ctx->re + 1) == CHAR_RPAREN)) 1393 { 1394 ctx->re++; 1395 STACK_PUSHX(stack, int, PARSE_ATOM); 1396 break; 1397 } 1398 1399 /* If a macro is used, parse the expanded macro recursively. */ 1400 { 1401 tre_char_t buf[64]; 1402 tre_expand_macro(ctx->re + 1, ctx->re_end, 1403 buf, elementsof(buf)); 1404 if (buf[0] != 0) 1405 { 1406 tre_parse_ctx_t subctx; 1407 memcpy(&subctx, ctx, sizeof(subctx)); 1408 subctx.re = buf; 1409 subctx.len = tre_strlen(buf); 1410 subctx.nofirstsub = 1; 1411 status = tre_parse(&subctx); 1412 if (status != REG_OK) 1413 return status; 1414 ctx->re += 2; 1415 ctx->position = subctx.position; 1416 result = subctx.result; 1417 break; 1418 } 1419 } 1420 1421 if (ctx->re + 1 >= ctx->re_end) 1422 /* Trailing backslash. */ 1423 return REG_EESCAPE; 1424 1425 #ifdef REG_LITERAL 1426 if (*(ctx->re + 1) == L'Q') 1427 { 1428 DPRINT(("tre_parse: tmp literal: '%.*" STRF "'\n", 1429 REST(ctx->re))); 1430 ctx->cflags |= REG_LITERAL; 1431 temporary_cflags |= REG_LITERAL; 1432 ctx->re += 2; 1433 STACK_PUSHX(stack, int, PARSE_ATOM); 1434 break; 1435 } 1436 #endif /* REG_LITERAL */ 1437 1438 DPRINT(("tre_parse: bleep: '%.*" STRF "'\n", REST(ctx->re))); 1439 ctx->re++; 1440 switch (*ctx->re) 1441 { 1442 case L'b': 1443 result = tre_ast_new_literal(ctx->mem, ASSERTION, 1444 ASSERT_AT_WB, -1); 1445 ctx->re++; 1446 break; 1447 case L'B': 1448 result = tre_ast_new_literal(ctx->mem, ASSERTION, 1449 ASSERT_AT_WB_NEG, -1); 1450 ctx->re++; 1451 break; 1452 case L'<': 1453 result = tre_ast_new_literal(ctx->mem, ASSERTION, 1454 ASSERT_AT_BOW, -1); 1455 ctx->re++; 1456 break; 1457 case L'>': 1458 result = tre_ast_new_literal(ctx->mem, ASSERTION, 1459 ASSERT_AT_EOW, -1); 1460 ctx->re++; 1461 break; 1462 case L'x': 1463 ctx->re++; 1464 if (ctx->re[0] != CHAR_LBRACE && ctx->re < ctx->re_end) 1465 { 1466 /* 8 bit hex char. */ 1467 char tmp[3] = {0, 0, 0}; 1468 long val; 1469 DPRINT(("tre_parse: 8 bit hex: '%.*" STRF "'\n", 1470 REST(ctx->re - 2))); 1471 1472 if (tre_isxdigit(ctx->re[0]) && ctx->re < ctx->re_end) 1473 { 1474 tmp[0] = (char)ctx->re[0]; 1475 ctx->re++; 1476 } 1477 if (tre_isxdigit(ctx->re[0]) && ctx->re < ctx->re_end) 1478 { 1479 tmp[1] = (char)ctx->re[0]; 1480 ctx->re++; 1481 } 1482 val = strtol(tmp, NULL, 16); 1483 result = tre_ast_new_literal(ctx->mem, (int)val, 1484 (int)val, ctx->position); 1485 ctx->position++; 1486 break; 1487 } 1488 else if (ctx->re < ctx->re_end) 1489 { 1490 /* Wide char. */ 1491 char tmp[32]; 1492 long val; 1493 int i = 0; 1494 ctx->re++; 1495 while (ctx->re_end - ctx->re >= 0) 1496 { 1497 if (ctx->re[0] == CHAR_RBRACE) 1498 break; 1499 if (tre_isxdigit(ctx->re[0])) 1500 { 1501 tmp[i] = (char)ctx->re[0]; 1502 i++; 1503 ctx->re++; 1504 continue; 1505 } 1506 return REG_EBRACE; 1507 } 1508 ctx->re++; 1509 tmp[i] = 0; 1510 val = strtol(tmp, NULL, 16); 1511 result = tre_ast_new_literal(ctx->mem, (int)val, (int)val, 1512 ctx->position); 1513 ctx->position++; 1514 break; 1515 } 1516 /*FALLTHROUGH*/ 1517 1518 default: 1519 if (tre_isdigit(*ctx->re)) 1520 { 1521 /* Back reference. */ 1522 int val = *ctx->re - L'0'; 1523 DPRINT(("tre_parse: backref: '%.*" STRF "'\n", 1524 REST(ctx->re - 1))); 1525 result = tre_ast_new_literal(ctx->mem, BACKREF, val, 1526 ctx->position); 1527 if (result == NULL) 1528 return REG_ESPACE; 1529 ctx->position++; 1530 ctx->max_backref = MAX(val, ctx->max_backref); 1531 ctx->re++; 1532 } 1533 else 1534 { 1535 /* Escaped character. */ 1536 DPRINT(("tre_parse: escaped: '%.*" STRF "'\n", 1537 REST(ctx->re - 1))); 1538 result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re, 1539 ctx->position); 1540 ctx->position++; 1541 ctx->re++; 1542 } 1543 break; 1544 } 1545 if (result == NULL) 1546 return REG_ESPACE; 1547 break; 1548 1549 case CHAR_PERIOD: /* the any-symbol */ 1550 DPRINT(("tre_parse: any: '%.*" STRF "'\n", 1551 REST(ctx->re))); 1552 if (ctx->cflags & REG_NEWLINE) 1553 { 1554 tre_ast_node_t *tmp1; 1555 tre_ast_node_t *tmp2; 1556 tmp1 = tre_ast_new_literal(ctx->mem, 0, L'\n' - 1, 1557 ctx->position); 1558 if (!tmp1) 1559 return REG_ESPACE; 1560 tmp2 = tre_ast_new_literal(ctx->mem, L'\n' + 1, TRE_CHAR_MAX, 1561 ctx->position + 1); 1562 if (!tmp2) 1563 return REG_ESPACE; 1564 result = tre_ast_new_union(ctx->mem, tmp1, tmp2); 1565 if (!result) 1566 return REG_ESPACE; 1567 ctx->position += 2; 1568 } 1569 else 1570 { 1571 result = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, 1572 ctx->position); 1573 if (!result) 1574 return REG_ESPACE; 1575 ctx->position++; 1576 } 1577 ctx->re++; 1578 break; 1579 1580 case CHAR_CARET: /* beginning of line assertion */ 1581 /* '^' has a special meaning everywhere in EREs, and in the 1582 beginning of the RE and after \( is BREs. */ 1583 if (ctx->cflags & REG_EXTENDED 1584 || (ctx->re - 2 >= ctx->re_start 1585 && *(ctx->re - 2) == CHAR_BACKSLASH 1586 && *(ctx->re - 1) == CHAR_LPAREN) 1587 || ctx->re == ctx->re_start) 1588 { 1589 DPRINT(("tre_parse: BOL: '%.*" STRF "'\n", 1590 REST(ctx->re))); 1591 result = tre_ast_new_literal(ctx->mem, ASSERTION, 1592 ASSERT_AT_BOL, -1); 1593 if (result == NULL) 1594 return REG_ESPACE; 1595 ctx->re++; 1596 } 1597 else 1598 goto parse_literal; 1599 break; 1600 1601 case CHAR_DOLLAR: /* end of line assertion. */ 1602 /* '$' is special everywhere in EREs, and in the end of the 1603 string and before \) is BREs. */ 1604 if (ctx->cflags & REG_EXTENDED 1605 || (ctx->re + 2 < ctx->re_end 1606 && *(ctx->re + 1) == CHAR_BACKSLASH 1607 && *(ctx->re + 2) == CHAR_RPAREN) 1608 || ctx->re + 1 == ctx->re_end) 1609 { 1610 DPRINT(("tre_parse: EOL: '%.*" STRF "'\n", 1611 REST(ctx->re))); 1612 result = tre_ast_new_literal(ctx->mem, ASSERTION, 1613 ASSERT_AT_EOL, -1); 1614 if (result == NULL) 1615 return REG_ESPACE; 1616 ctx->re++; 1617 } 1618 else 1619 goto parse_literal; 1620 break; 1621 1622 default: 1623 parse_literal: 1624 1625 if (temporary_cflags && ctx->re + 1 < ctx->re_end 1626 && *ctx->re == CHAR_BACKSLASH && *(ctx->re + 1) == L'E') 1627 { 1628 DPRINT(("tre_parse: end tmps: '%.*" STRF "'\n", 1629 REST(ctx->re))); 1630 ctx->cflags &= ~temporary_cflags; 1631 temporary_cflags = 0; 1632 ctx->re += 2; 1633 STACK_PUSHX(stack, int, PARSE_PIECE); 1634 break; 1635 } 1636 1637 1638 /* We are expecting an atom. If the subexpression (or the whole 1639 regexp ends here, we interpret it as an empty expression 1640 (which matches an empty string). */ 1641 if ( 1642 #ifdef REG_LITERAL 1643 !(ctx->cflags & REG_LITERAL) && 1644 #endif /* REG_LITERAL */ 1645 (ctx->re >= ctx->re_end 1646 || *ctx->re == CHAR_STAR 1647 || (ctx->cflags & REG_EXTENDED 1648 && (*ctx->re == CHAR_PIPE 1649 || *ctx->re == CHAR_LBRACE 1650 || *ctx->re == CHAR_PLUS 1651 || *ctx->re == CHAR_QUESTIONMARK)) 1652 /* Test for "\)" in BRE mode. */ 1653 || (!(ctx->cflags & REG_EXTENDED) 1654 && ctx->re + 1 < ctx->re_end 1655 && *ctx->re == CHAR_BACKSLASH 1656 && *(ctx->re + 1) == CHAR_LBRACE))) 1657 { 1658 DPRINT(("tre_parse: empty: '%.*" STRF "'\n", 1659 REST(ctx->re))); 1660 result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); 1661 if (!result) 1662 return REG_ESPACE; 1663 break; 1664 } 1665 1666 DPRINT(("tre_parse: literal: '%.*" STRF "'\n", 1667 REST(ctx->re))); 1668 /* Note that we can't use an tre_isalpha() test here, since there 1669 may be characters which are alphabetic but neither upper or 1670 lower case. */ 1671 if (ctx->cflags & REG_ICASE 1672 && (tre_isupper(*ctx->re) || tre_islower(*ctx->re))) 1673 { 1674 tre_ast_node_t *tmp1; 1675 tre_ast_node_t *tmp2; 1676 1677 /* XXX - Can there be more than one opposite-case 1678 counterpoints for some character in some locale? Or 1679 more than two characters which all should be regarded 1680 the same character if case is ignored? If yes, there 1681 does not seem to be a portable way to detect it. I guess 1682 that at least for multi-character collating elements there 1683 could be several opposite-case counterpoints, but they 1684 cannot be supported portably anyway. */ 1685 tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(*ctx->re), 1686 tre_toupper(*ctx->re), 1687 ctx->position); 1688 if (!tmp1) 1689 return REG_ESPACE; 1690 tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(*ctx->re), 1691 tre_tolower(*ctx->re), 1692 ctx->position); 1693 if (!tmp2) 1694 return REG_ESPACE; 1695 result = tre_ast_new_union(ctx->mem, tmp1, tmp2); 1696 if (!result) 1697 return REG_ESPACE; 1698 } 1699 else 1700 { 1701 result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re, 1702 ctx->position); 1703 if (!result) 1704 return REG_ESPACE; 1705 } 1706 ctx->position++; 1707 ctx->re++; 1708 break; 1709 } 1710 break; 1711 1712 case PARSE_MARK_FOR_SUBMATCH: 1713 { 1714 int submatch_id = tre_stack_pop_int(stack); 1715 1716 if (result->submatch_id >= 0) 1717 { 1718 tre_ast_node_t *n, *tmp_node; 1719 n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); 1720 if (n == NULL) 1721 return REG_ESPACE; 1722 tmp_node = tre_ast_new_catenation(ctx->mem, n, result); 1723 if (tmp_node == NULL) 1724 return REG_ESPACE; 1725 tmp_node->num_submatches = result->num_submatches; 1726 result = tmp_node; 1727 } 1728 result->submatch_id = submatch_id; 1729 result->num_submatches++; 1730 break; 1731 } 1732 1733 case PARSE_RESTORE_CFLAGS: 1734 ctx->cflags = tre_stack_pop_int(stack); 1735 break; 1736 1737 default: 1738 assert(0); 1739 break; 1740 } 1741 } 1742 1743 /* Check for missing closing parentheses. */ 1744 if (depth > 0) 1745 return REG_EPAREN; 1746 1747 if (status == REG_OK) 1748 ctx->result = result; 1749 1750 return status; 1751 } 1752 1753 /* EOF */ 1754