1 /* 2 tre-parse.c - Regexp parser 3 4 This software is released under a BSD-style license. 5 See the file LICENSE for details and copyright. 6 7 */ 8 9 /* 10 This parser is just a simple recursive descent parser for POSIX.2 11 regexps. The parser supports both the obsolete default syntax and 12 the "extended" syntax, and some nonstandard extensions. 13 */ 14 15 16 #ifdef HAVE_CONFIG_H 17 #include <config.h> 18 #endif /* HAVE_CONFIG_H */ 19 #include <string.h> 20 #include <assert.h> 21 #include <limits.h> 22 23 #include "xmalloc.h" 24 #include "tre-mem.h" 25 #include "tre-ast.h" 26 #include "tre-stack.h" 27 #include "tre-parse.h" 28 29 30 /* Characters with special meanings in regexp syntax. */ 31 #define CHAR_PIPE L'|' 32 #define CHAR_LPAREN L'(' 33 #define CHAR_RPAREN L')' 34 #define CHAR_LBRACE L'{' 35 #define CHAR_RBRACE L'}' 36 #define CHAR_LBRACKET L'[' 37 #define CHAR_RBRACKET L']' 38 #define CHAR_MINUS L'-' 39 #define CHAR_STAR L'*' 40 #define CHAR_QUESTIONMARK L'?' 41 #define CHAR_PLUS L'+' 42 #define CHAR_PERIOD L'.' 43 #define CHAR_COLON L':' 44 #define CHAR_EQUAL L'=' 45 #define CHAR_COMMA L',' 46 #define CHAR_CARET L'^' 47 #define CHAR_DOLLAR L'$' 48 #define CHAR_BACKSLASH L'\\' 49 #define CHAR_HASH L'#' 50 #define CHAR_TILDE L'~' 51 52 53 /* Some macros for expanding \w, \s, etc. */ 54 static const struct tre_macro_struct { 55 const char c; 56 const char *expansion; 57 } tre_macros[] = 58 { {'t', "\t"}, {'n', "\n"}, {'r', "\r"}, 59 {'f', "\f"}, {'a', "\a"}, {'e', "\033"}, 60 {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"}, 61 {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"}, 62 { 0, NULL } 63 }; 64 65 66 /* Expands a macro delimited by `regex' and `regex_end' to `buf', which 67 must have at least `len' items. Sets buf[0] to zero if the there 68 is no match in `tre_macros'. */ 69 static void 70 tre_expand_macro(const tre_char_t *regex, const tre_char_t *regex_end, 71 tre_char_t *buf, size_t buf_len) 72 { 73 int i; 74 75 buf[0] = 0; 76 if (regex >= regex_end) 77 return; 78 79 for (i = 0; tre_macros[i].expansion; i++) 80 { 81 if (tre_macros[i].c == *regex) 82 { 83 unsigned int j; 84 DPRINT(("Expanding macro '%c' => '%s'\n", 85 tre_macros[i].c, tre_macros[i].expansion)); 86 for (j = 0; tre_macros[i].expansion[j] && j < buf_len; j++) 87 buf[j] = tre_macros[i].expansion[j]; 88 buf[j] = 0; 89 break; 90 } 91 } 92 } 93 94 static reg_errcode_t 95 tre_new_item(tre_mem_t mem, int min, int max, int *i, int *max_i, 96 tre_ast_node_t ***items) 97 { 98 reg_errcode_t status; 99 tre_ast_node_t **array = *items; 100 /* Allocate more space if necessary. */ 101 if (*i >= *max_i) 102 { 103 tre_ast_node_t **new_items; 104 DPRINT(("out of array space, i = %d\n", *i)); 105 /* If the array is already 1024 items large, give up -- there's 106 probably an error in the regexp (e.g. not a '\0' terminated 107 string and missing ']') */ 108 if (*max_i > 1024) 109 return REG_ESPACE; 110 *max_i *= 2; 111 new_items = xrealloc(array, sizeof(*items) * *max_i); 112 if (new_items == NULL) 113 return REG_ESPACE; 114 *items = array = new_items; 115 } 116 array[*i] = tre_ast_new_literal(mem, min, max, -1); 117 status = array[*i] == NULL ? REG_ESPACE : REG_OK; 118 (*i)++; 119 return status; 120 } 121 122 123 /* Expands a character class to character ranges. */ 124 static reg_errcode_t 125 tre_expand_ctype(tre_mem_t mem, tre_ctype_t class, tre_ast_node_t ***items, 126 int *i, int *max_i, int cflags) 127 { 128 reg_errcode_t status = REG_OK; 129 tre_cint_t c; 130 int j, min = -1, max = 0; 131 assert(TRE_MB_CUR_MAX == 1); 132 133 DPRINT((" expanding class to character ranges\n")); 134 for (j = 0; (j < 256) && (status == REG_OK); j++) 135 { 136 c = j; 137 if (tre_isctype(c, class) 138 || ((cflags & REG_ICASE) 139 && (tre_isctype(tre_tolower(c), class) 140 || tre_isctype(tre_toupper(c), class)))) 141 { 142 if (min < 0) 143 min = c; 144 max = c; 145 } 146 else if (min >= 0) 147 { 148 DPRINT((" range %c (%d) to %c (%d)\n", min, min, max, max)); 149 status = tre_new_item(mem, min, max, i, max_i, items); 150 min = -1; 151 } 152 } 153 if (min >= 0 && status == REG_OK) 154 status = tre_new_item(mem, min, max, i, max_i, items); 155 return status; 156 } 157 158 159 static int 160 tre_compare_items(const void *a, const void *b) 161 { 162 const tre_ast_node_t *node_a = *(tre_ast_node_t * const *)a; 163 const tre_ast_node_t *node_b = *(tre_ast_node_t * const *)b; 164 tre_literal_t *l_a = node_a->obj, *l_b = node_b->obj; 165 int a_min = l_a->code_min, b_min = l_b->code_min; 166 167 if (a_min < b_min) 168 return -1; 169 else if (a_min > b_min) 170 return 1; 171 else 172 return 0; 173 } 174 175 #ifndef TRE_USE_SYSTEM_WCTYPE 176 177 /* isalnum() and the rest may be macros, so wrap them to functions. */ 178 int tre_isalnum_func(tre_cint_t c) { return tre_isalnum(c); } 179 int tre_isalpha_func(tre_cint_t c) { return tre_isalpha(c); } 180 181 #ifdef tre_isascii 182 int tre_isascii_func(tre_cint_t c) { return tre_isascii(c); } 183 #else /* !tre_isascii */ 184 int tre_isascii_func(tre_cint_t c) { return !(c >> 7); } 185 #endif /* !tre_isascii */ 186 187 #ifdef tre_isblank 188 int tre_isblank_func(tre_cint_t c) { return tre_isblank(c); } 189 #else /* !tre_isblank */ 190 int tre_isblank_func(tre_cint_t c) { return ((c == ' ') || (c == '\t')); } 191 #endif /* !tre_isblank */ 192 193 int tre_iscntrl_func(tre_cint_t c) { return tre_iscntrl(c); } 194 int tre_isdigit_func(tre_cint_t c) { return tre_isdigit(c); } 195 int tre_isgraph_func(tre_cint_t c) { return tre_isgraph(c); } 196 int tre_islower_func(tre_cint_t c) { return tre_islower(c); } 197 int tre_isprint_func(tre_cint_t c) { return tre_isprint(c); } 198 int tre_ispunct_func(tre_cint_t c) { return tre_ispunct(c); } 199 int tre_isspace_func(tre_cint_t c) { return tre_isspace(c); } 200 int tre_isupper_func(tre_cint_t c) { return tre_isupper(c); } 201 int tre_isxdigit_func(tre_cint_t c) { return tre_isxdigit(c); } 202 203 struct { 204 char *name; 205 int (*func)(tre_cint_t); 206 } tre_ctype_map[] = { 207 { "alnum", &tre_isalnum_func }, 208 { "alpha", &tre_isalpha_func }, 209 #ifdef tre_isascii 210 { "ascii", &tre_isascii_func }, 211 #endif /* tre_isascii */ 212 #ifdef tre_isblank 213 { "blank", &tre_isblank_func }, 214 #endif /* tre_isblank */ 215 { "cntrl", &tre_iscntrl_func }, 216 { "digit", &tre_isdigit_func }, 217 { "graph", &tre_isgraph_func }, 218 { "lower", &tre_islower_func }, 219 { "print", &tre_isprint_func }, 220 { "punct", &tre_ispunct_func }, 221 { "space", &tre_isspace_func }, 222 { "upper", &tre_isupper_func }, 223 { "xdigit", &tre_isxdigit_func }, 224 { NULL, NULL} 225 }; 226 227 tre_ctype_t tre_ctype(const char *name) 228 { 229 int i; 230 for (i = 0; tre_ctype_map[i].name != NULL; i++) 231 { 232 if (strcmp(name, tre_ctype_map[i].name) == 0) 233 return tre_ctype_map[i].func; 234 } 235 return (tre_ctype_t)0; 236 } 237 #endif /* !TRE_USE_SYSTEM_WCTYPE */ 238 239 /* Maximum number of character classes that can occur in a negated bracket 240 expression. */ 241 #define MAX_NEG_CLASSES 64 242 243 /* Maximum length of character class names. */ 244 #define MAX_CLASS_NAME 245 246 #define REST(re) (int)(ctx->re_end - (re)), (re) 247 248 static reg_errcode_t 249 tre_parse_bracket_items(tre_parse_ctx_t *ctx, int negate, 250 tre_ctype_t neg_classes[], int *num_neg_classes, 251 tre_ast_node_t ***items, int *num_items, 252 int *items_size) 253 { 254 const tre_char_t *re = ctx->re; 255 reg_errcode_t status = REG_OK; 256 tre_ctype_t class = (tre_ctype_t)0; 257 int i = *num_items; 258 int max_i = *items_size; 259 int skip; 260 261 /* Build an array of the items in the bracket expression. */ 262 while (status == REG_OK) 263 { 264 skip = 0; 265 if (re == ctx->re_end) 266 { 267 status = REG_EBRACK; 268 } 269 else if (*re == CHAR_RBRACKET && re > ctx->re) 270 { 271 DPRINT(("tre_parse_bracket: done: '%.*" STRF "'\n", REST(re))); 272 re++; 273 break; 274 } 275 else 276 { 277 tre_cint_t min = 0, max = 0; 278 279 class = (tre_ctype_t)0; 280 if (re + 2 < ctx->re_end 281 && *(re + 1) == CHAR_MINUS && *(re + 2) != CHAR_RBRACKET) 282 { 283 DPRINT(("tre_parse_bracket: range: '%.*" STRF "'\n", REST(re))); 284 min = *re; 285 max = *(re + 2); 286 re += 3; 287 /* XXX - Should use collation order instead of encoding values 288 in character ranges. */ 289 if (min > max) 290 status = REG_ERANGE; 291 } 292 else if (re + 1 < ctx->re_end 293 && *re == CHAR_LBRACKET && *(re + 1) == CHAR_PERIOD) 294 status = REG_ECOLLATE; 295 else if (re + 1 < ctx->re_end 296 && *re == CHAR_LBRACKET && *(re + 1) == CHAR_EQUAL) 297 status = REG_ECOLLATE; 298 else if (re + 1 < ctx->re_end 299 && *re == CHAR_LBRACKET && *(re + 1) == CHAR_COLON) 300 { 301 char tmp_str[64]; 302 const tre_char_t *endptr = re + 2; 303 int len; 304 DPRINT(("tre_parse_bracket: class: '%.*" STRF "'\n", REST(re))); 305 while (endptr < ctx->re_end && *endptr != CHAR_COLON) 306 endptr++; 307 if (endptr != ctx->re_end) 308 { 309 len = MIN(endptr - re - 2, 63); 310 #ifdef TRE_WCHAR 311 { 312 tre_char_t tmp_wcs[64]; 313 wcsncpy(tmp_wcs, re + 2, (size_t)len); 314 tmp_wcs[len] = L'\0'; 315 #if defined HAVE_WCSRTOMBS 316 { 317 mbstate_t state; 318 const tre_char_t *src = tmp_wcs; 319 memset(&state, '\0', sizeof(state)); 320 len = wcsrtombs(tmp_str, &src, sizeof(tmp_str), &state); 321 } 322 #elif defined HAVE_WCSTOMBS 323 len = wcstombs(tmp_str, tmp_wcs, 63); 324 #endif /* defined HAVE_WCSTOMBS */ 325 } 326 #else /* !TRE_WCHAR */ 327 strncpy(tmp_str, (const char*)re + 2, len); 328 #endif /* !TRE_WCHAR */ 329 tmp_str[len] = '\0'; 330 DPRINT((" class name: %s\n", tmp_str)); 331 class = tre_ctype(tmp_str); 332 if (!class) 333 status = REG_ECTYPE; 334 /* Optimize character classes for 8 bit character sets. */ 335 if (status == REG_OK && TRE_MB_CUR_MAX == 1) 336 { 337 status = tre_expand_ctype(ctx->mem, class, items, 338 &i, &max_i, ctx->cflags); 339 class = (tre_ctype_t)0; 340 skip = 1; 341 } 342 re = endptr + 2; 343 } 344 else 345 status = REG_ECTYPE; 346 min = 0; 347 max = TRE_CHAR_MAX; 348 } 349 else 350 { 351 DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re))); 352 if (*re == CHAR_MINUS && *(re + 1) != CHAR_RBRACKET 353 && ctx->re != re) 354 /* Two ranges are not allowed to share and endpoint. */ 355 status = REG_ERANGE; 356 min = max = *re++; 357 } 358 359 if (status != REG_OK) 360 break; 361 362 if (class && negate) 363 if (*num_neg_classes >= MAX_NEG_CLASSES) 364 status = REG_ESPACE; 365 else 366 neg_classes[(*num_neg_classes)++] = class; 367 else if (!skip) 368 { 369 status = tre_new_item(ctx->mem, min, max, &i, &max_i, items); 370 if (status != REG_OK) 371 break; 372 ((tre_literal_t*)((*items)[i-1])->obj)->u.class = class; 373 } 374 375 /* Add opposite-case counterpoints if REG_ICASE is present. 376 This is broken if there are more than two "same" characters. */ 377 if (ctx->cflags & REG_ICASE && !class && status == REG_OK && !skip) 378 { 379 tre_cint_t cmin, ccurr; 380 381 DPRINT(("adding opposite-case counterpoints\n")); 382 while (min <= max) 383 { 384 if (tre_islower(min)) 385 { 386 cmin = ccurr = tre_toupper(min++); 387 while (tre_islower(min) && tre_toupper(min) == ccurr + 1 388 && min <= max) 389 ccurr = tre_toupper(min++); 390 status = tre_new_item(ctx->mem, cmin, ccurr, 391 &i, &max_i, items); 392 } 393 else if (tre_isupper(min)) 394 { 395 cmin = ccurr = tre_tolower(min++); 396 while (tre_isupper(min) && tre_tolower(min) == ccurr + 1 397 && min <= max) 398 ccurr = tre_tolower(min++); 399 status = tre_new_item(ctx->mem, cmin, ccurr, 400 &i, &max_i, items); 401 } 402 else min++; 403 if (status != REG_OK) 404 break; 405 } 406 if (status != REG_OK) 407 break; 408 } 409 } 410 } 411 *num_items = i; 412 *items_size = max_i; 413 ctx->re = re; 414 return status; 415 } 416 417 static reg_errcode_t 418 tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result) 419 { 420 tre_ast_node_t *node = NULL; 421 int negate = 0; 422 reg_errcode_t status = REG_OK; 423 tre_ast_node_t **items, *u, *n; 424 int i = 0, j, max_i = 32, curr_max, curr_min; 425 tre_ctype_t neg_classes[MAX_NEG_CLASSES]; 426 int num_neg_classes = 0; 427 428 /* Start off with an array of `max_i' elements. */ 429 items = xmalloc(sizeof(*items) * max_i); 430 if (items == NULL) 431 return REG_ESPACE; 432 433 if (*ctx->re == CHAR_CARET) 434 { 435 DPRINT(("tre_parse_bracket: negate: '%.*" STRF "'\n", REST(ctx->re))); 436 negate = 1; 437 ctx->re++; 438 } 439 440 status = tre_parse_bracket_items(ctx, negate, neg_classes, &num_neg_classes, 441 &items, &i, &max_i); 442 443 if (status != REG_OK) 444 goto parse_bracket_done; 445 446 /* Sort the array if we need to negate it. */ 447 if (negate) 448 qsort(items, (unsigned)i, sizeof(*items), tre_compare_items); 449 450 curr_max = curr_min = 0; 451 /* Build a union of the items in the array, negated if necessary. */ 452 for (j = 0; j < i && status == REG_OK; j++) 453 { 454 int min, max; 455 tre_literal_t *l = items[j]->obj; 456 min = l->code_min; 457 max = l->code_max; 458 459 DPRINT(("item: %d - %d, class %ld, curr_max = %d\n", 460 (int)l->code_min, (int)l->code_max, (long)l->u.class, curr_max)); 461 462 if (negate) 463 { 464 if (min < curr_max) 465 { 466 /* Overlap. */ 467 curr_max = MAX(max + 1, curr_max); 468 DPRINT(("overlap, curr_max = %d\n", curr_max)); 469 l = NULL; 470 } 471 else 472 { 473 /* No overlap. */ 474 curr_max = min - 1; 475 if (curr_max >= curr_min) 476 { 477 DPRINT(("no overlap\n")); 478 l->code_min = curr_min; 479 l->code_max = curr_max; 480 } 481 else 482 { 483 DPRINT(("no overlap, zero room\n")); 484 l = NULL; 485 } 486 curr_min = curr_max = max + 1; 487 } 488 } 489 490 if (l != NULL) 491 { 492 int k; 493 DPRINT(("creating %d - %d\n", (int)l->code_min, (int)l->code_max)); 494 l->position = ctx->position; 495 if (num_neg_classes > 0) 496 { 497 l->neg_classes = tre_mem_alloc(ctx->mem, 498 (sizeof(l->neg_classes) 499 * (num_neg_classes + 1))); 500 if (l->neg_classes == NULL) 501 { 502 status = REG_ESPACE; 503 break; 504 } 505 for (k = 0; k < num_neg_classes; k++) 506 l->neg_classes[k] = neg_classes[k]; 507 l->neg_classes[k] = (tre_ctype_t)0; 508 } 509 else 510 l->neg_classes = NULL; 511 if (node == NULL) 512 node = items[j]; 513 else 514 { 515 u = tre_ast_new_union(ctx->mem, node, items[j]); 516 if (u == NULL) 517 status = REG_ESPACE; 518 node = u; 519 } 520 } 521 } 522 523 if (status != REG_OK) 524 goto parse_bracket_done; 525 526 if (negate) 527 { 528 int k; 529 DPRINT(("final: creating %d - %d\n", curr_min, (int)TRE_CHAR_MAX)); 530 n = tre_ast_new_literal(ctx->mem, curr_min, TRE_CHAR_MAX, ctx->position); 531 if (n == NULL) 532 status = REG_ESPACE; 533 else 534 { 535 tre_literal_t *l = n->obj; 536 if (num_neg_classes > 0) 537 { 538 l->neg_classes = tre_mem_alloc(ctx->mem, 539 (sizeof(l->neg_classes) 540 * (num_neg_classes + 1))); 541 if (l->neg_classes == NULL) 542 { 543 status = REG_ESPACE; 544 goto parse_bracket_done; 545 } 546 for (k = 0; k < num_neg_classes; k++) 547 l->neg_classes[k] = neg_classes[k]; 548 l->neg_classes[k] = (tre_ctype_t)0; 549 } 550 else 551 l->neg_classes = NULL; 552 if (node == NULL) 553 node = n; 554 else 555 { 556 u = tre_ast_new_union(ctx->mem, node, n); 557 if (u == NULL) 558 status = REG_ESPACE; 559 node = u; 560 } 561 } 562 } 563 564 if (status != REG_OK) 565 goto parse_bracket_done; 566 567 #ifdef TRE_DEBUG 568 tre_ast_print(node); 569 #endif /* TRE_DEBUG */ 570 571 parse_bracket_done: 572 xfree(items); 573 ctx->position++; 574 *result = node; 575 return status; 576 } 577 578 579 /* Parses a positive decimal integer. Returns -1 if the string does not 580 contain a valid number. */ 581 static int 582 tre_parse_int(const tre_char_t **regex, const tre_char_t *regex_end) 583 { 584 int num = -1; 585 const tre_char_t *r = *regex; 586 while (r < regex_end && *r >= L'0' && *r <= L'9') 587 { 588 if (num < 0) 589 num = 0; 590 num = num * 10 + *r - L'0'; 591 r++; 592 } 593 *regex = r; 594 return num; 595 } 596 597 598 static reg_errcode_t 599 tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result) 600 { 601 int min, max, i; 602 int cost_ins, cost_del, cost_subst, cost_max; 603 int limit_ins, limit_del, limit_subst, limit_err; 604 const tre_char_t *r = ctx->re; 605 const tre_char_t *start; 606 int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0; 607 int approx = 0; 608 int costs_set = 0; 609 int counts_set = 0; 610 611 cost_ins = cost_del = cost_subst = cost_max = TRE_PARAM_UNSET; 612 limit_ins = limit_del = limit_subst = limit_err = TRE_PARAM_UNSET; 613 614 /* Parse number (minimum repetition count). */ 615 min = -1; 616 if (r < ctx->re_end && *r >= L'0' && *r <= L'9') { 617 DPRINT(("tre_parse: min count: '%.*" STRF "'\n", REST(r))); 618 min = tre_parse_int(&r, ctx->re_end); 619 } 620 621 /* Parse comma and second number (maximum repetition count). */ 622 max = min; 623 if (r < ctx->re_end && *r == CHAR_COMMA) 624 { 625 r++; 626 DPRINT(("tre_parse: max count: '%.*" STRF "'\n", REST(r))); 627 max = tre_parse_int(&r, ctx->re_end); 628 } 629 630 /* Check that the repeat counts are sane. */ 631 if ((max >= 0 && min > max) || max > RE_DUP_MAX) 632 return REG_BADBR; 633 634 635 /* 636 '{' 637 optionally followed immediately by a number == minimum repcount 638 optionally followed by , then a number == maximum repcount 639 + then a number == maximum insertion count 640 - then a number == maximum deletion count 641 # then a number == maximum substitution count 642 ~ then a number == maximum number of errors 643 Any of +, -, # or ~ without followed by a number means that 644 the maximum count/number of errors is infinite. 645 646 An equation of the form 647 Xi + Yd + Zs < C 648 can be specified to set costs and the cost limit to a value 649 different from the default value: 650 - X is the cost of an insertion 651 - Y is the cost of a deletion 652 - Z is the cost of a substitution 653 - C is the maximum cost 654 655 If no count limit or cost is set for an operation, the operation 656 is not allowed at all. 657 */ 658 659 660 do { 661 int done; 662 start = r; 663 664 /* Parse count limit settings */ 665 done = 0; 666 if (!counts_set) 667 while (r + 1 < ctx->re_end && !done) 668 { 669 switch (*r) 670 { 671 case CHAR_PLUS: /* Insert limit */ 672 DPRINT(("tre_parse: ins limit: '%.*" STRF "'\n", REST(r))); 673 r++; 674 limit_ins = tre_parse_int(&r, ctx->re_end); 675 if (limit_ins < 0) 676 limit_ins = INT_MAX; 677 counts_set = 1; 678 break; 679 case CHAR_MINUS: /* Delete limit */ 680 DPRINT(("tre_parse: del limit: '%.*" STRF "'\n", REST(r))); 681 r++; 682 limit_del = tre_parse_int(&r, ctx->re_end); 683 if (limit_del < 0) 684 limit_del = INT_MAX; 685 counts_set = 1; 686 break; 687 case CHAR_HASH: /* Substitute limit */ 688 DPRINT(("tre_parse: subst limit: '%.*" STRF "'\n", REST(r))); 689 r++; 690 limit_subst = tre_parse_int(&r, ctx->re_end); 691 if (limit_subst < 0) 692 limit_subst = INT_MAX; 693 counts_set = 1; 694 break; 695 case CHAR_TILDE: /* Maximum number of changes */ 696 DPRINT(("tre_parse: count limit: '%.*" STRF "'\n", REST(r))); 697 r++; 698 limit_err = tre_parse_int(&r, ctx->re_end); 699 if (limit_err < 0) 700 limit_err = INT_MAX; 701 approx = 1; 702 break; 703 case CHAR_COMMA: 704 r++; 705 break; 706 case L' ': 707 r++; 708 break; 709 case L'}': 710 done = 1; 711 break; 712 default: 713 done = 1; 714 break; 715 } 716 } 717 718 /* Parse cost restriction equation. */ 719 done = 0; 720 if (!costs_set) 721 while (r + 1 < ctx->re_end && !done) 722 { 723 switch (*r) 724 { 725 case CHAR_PLUS: 726 case L' ': 727 r++; 728 break; 729 case L'<': 730 DPRINT(("tre_parse: max cost: '%.*" STRF "'\n", REST(r))); 731 r++; 732 while (*r == L' ') 733 r++; 734 cost_max = tre_parse_int(&r, ctx->re_end); 735 if (cost_max < 0) 736 cost_max = INT_MAX; 737 else 738 cost_max--; 739 approx = 1; 740 break; 741 case CHAR_COMMA: 742 r++; 743 done = 1; 744 break; 745 default: 746 if (*r >= L'0' && *r <= L'9') 747 { 748 #ifdef TRE_DEBUG 749 const tre_char_t *sr = r; 750 #endif /* TRE_DEBUG */ 751 int cost = tre_parse_int(&r, ctx->re_end); 752 /* XXX - make sure r is not past end. */ 753 switch (*r) 754 { 755 case L'i': /* Insert cost */ 756 DPRINT(("tre_parse: ins cost: '%.*" STRF "'\n", 757 REST(sr))); 758 r++; 759 cost_ins = cost; 760 costs_set = 1; 761 break; 762 case L'd': /* Delete cost */ 763 DPRINT(("tre_parse: del cost: '%.*" STRF "'\n", 764 REST(sr))); 765 r++; 766 cost_del = cost; 767 costs_set = 1; 768 break; 769 case L's': /* Substitute cost */ 770 DPRINT(("tre_parse: subst cost: '%.*" STRF "'\n", 771 REST(sr))); 772 r++; 773 cost_subst = cost; 774 costs_set = 1; 775 break; 776 default: 777 return REG_BADBR; 778 } 779 } 780 else 781 { 782 done = 1; 783 break; 784 } 785 } 786 } 787 } while (start != r); 788 789 /* Missing }. */ 790 if (r >= ctx->re_end) 791 return REG_EBRACE; 792 793 /* Empty contents of {}. */ 794 if (r == ctx->re) 795 return REG_BADBR; 796 797 /* Parse the ending '}' or '\}'.*/ 798 if (ctx->cflags & REG_EXTENDED) 799 { 800 if (r >= ctx->re_end || *r != CHAR_RBRACE) 801 return REG_BADBR; 802 r++; 803 } 804 else 805 { 806 if (r + 1 >= ctx->re_end 807 || *r != CHAR_BACKSLASH 808 || *(r + 1) != CHAR_RBRACE) 809 return REG_BADBR; 810 r += 2; 811 } 812 813 814 /* Parse trailing '?' marking minimal repetition. */ 815 if (r < ctx->re_end) 816 { 817 if (*r == CHAR_QUESTIONMARK) 818 { 819 minimal = !(ctx->cflags & REG_UNGREEDY); 820 r++; 821 } 822 else if (*r == CHAR_STAR || *r == CHAR_PLUS) 823 { 824 /* These are reserved for future extensions. */ 825 return REG_BADRPT; 826 } 827 } 828 829 /* Create the AST node(s). */ 830 if (min == 0 && max == 0) 831 { 832 *result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); 833 if (*result == NULL) 834 return REG_ESPACE; 835 } 836 else 837 { 838 if (min < 0 && max < 0) 839 /* Only approximate parameters set, no repetitions. */ 840 min = max = 1; 841 842 *result = tre_ast_new_iter(ctx->mem, *result, min, max, minimal); 843 if (!*result) 844 return REG_ESPACE; 845 846 /* If approximate matching parameters are set, add them to the 847 iteration node. */ 848 if (approx || costs_set || counts_set) 849 { 850 int *params; 851 tre_iteration_t *iter = (*result)->obj; 852 853 if (costs_set || counts_set) 854 { 855 if (limit_ins == TRE_PARAM_UNSET) 856 { 857 if (cost_ins == TRE_PARAM_UNSET) 858 limit_ins = 0; 859 else 860 limit_ins = INT_MAX; 861 } 862 863 if (limit_del == TRE_PARAM_UNSET) 864 { 865 if (cost_del == TRE_PARAM_UNSET) 866 limit_del = 0; 867 else 868 limit_del = INT_MAX; 869 } 870 871 if (limit_subst == TRE_PARAM_UNSET) 872 { 873 if (cost_subst == TRE_PARAM_UNSET) 874 limit_subst = 0; 875 else 876 limit_subst = INT_MAX; 877 } 878 } 879 880 if (cost_max == TRE_PARAM_UNSET) 881 cost_max = INT_MAX; 882 if (limit_err == TRE_PARAM_UNSET) 883 limit_err = INT_MAX; 884 885 ctx->have_approx = 1; 886 params = tre_mem_alloc(ctx->mem, sizeof(*params) * TRE_PARAM_LAST); 887 if (!params) 888 return REG_ESPACE; 889 for (i = 0; i < TRE_PARAM_LAST; i++) 890 params[i] = TRE_PARAM_UNSET; 891 params[TRE_PARAM_COST_INS] = cost_ins; 892 params[TRE_PARAM_COST_DEL] = cost_del; 893 params[TRE_PARAM_COST_SUBST] = cost_subst; 894 params[TRE_PARAM_COST_MAX] = cost_max; 895 params[TRE_PARAM_MAX_INS] = limit_ins; 896 params[TRE_PARAM_MAX_DEL] = limit_del; 897 params[TRE_PARAM_MAX_SUBST] = limit_subst; 898 params[TRE_PARAM_MAX_ERR] = limit_err; 899 iter->params = params; 900 } 901 } 902 903 DPRINT(("tre_parse_bound: min %d, max %d, costs [%d,%d,%d, total %d], " 904 "limits [%d,%d,%d, total %d]\n", 905 min, max, cost_ins, cost_del, cost_subst, cost_max, 906 limit_ins, limit_del, limit_subst, limit_err)); 907 908 909 ctx->re = r; 910 return REG_OK; 911 } 912 913 typedef enum { 914 PARSE_RE = 0, 915 PARSE_ATOM, 916 PARSE_MARK_FOR_SUBMATCH, 917 PARSE_BRANCH, 918 PARSE_PIECE, 919 PARSE_CATENATION, 920 PARSE_POST_CATENATION, 921 PARSE_UNION, 922 PARSE_POST_UNION, 923 PARSE_POSTFIX, 924 PARSE_RESTORE_CFLAGS 925 } tre_parse_re_stack_symbol_t; 926 927 928 reg_errcode_t 929 tre_parse(tre_parse_ctx_t *ctx) 930 { 931 tre_ast_node_t *result = NULL; 932 tre_parse_re_stack_symbol_t symbol; 933 reg_errcode_t status = REG_OK; 934 tre_stack_t *stack = ctx->stack; 935 int bottom = tre_stack_num_objects(stack); 936 int depth = 0; 937 int temporary_cflags = 0; 938 939 DPRINT(("tre_parse: parsing '%.*" STRF "', len = %d\n", 940 ctx->len, ctx->re, ctx->len)); 941 942 if (!ctx->nofirstsub) 943 { 944 STACK_PUSH(stack, int, ctx->submatch_id); 945 STACK_PUSH(stack, int, PARSE_MARK_FOR_SUBMATCH); 946 ctx->submatch_id++; 947 } 948 STACK_PUSH(stack, int, PARSE_RE); 949 ctx->re_start = ctx->re; 950 ctx->re_end = ctx->re + ctx->len; 951 952 953 /* The following is basically just a recursive descent parser. I use 954 an explicit stack instead of recursive functions mostly because of 955 two reasons: compatibility with systems which have an overflowable 956 call stack, and efficiency (both in lines of code and speed). */ 957 while (tre_stack_num_objects(stack) > bottom && status == REG_OK) 958 { 959 if (status != REG_OK) 960 break; 961 symbol = tre_stack_pop_int(stack); 962 switch (symbol) 963 { 964 case PARSE_RE: 965 /* Parse a full regexp. A regexp is one or more branches, 966 separated by the union operator `|'. */ 967 #ifdef REG_LITERAL 968 if (!(ctx->cflags & REG_LITERAL) 969 && ctx->cflags & REG_EXTENDED) 970 #endif /* REG_LITERAL */ 971 STACK_PUSHX(stack, int, PARSE_UNION); 972 STACK_PUSHX(stack, int, PARSE_BRANCH); 973 break; 974 975 case PARSE_BRANCH: 976 /* Parse a branch. A branch is one or more pieces, concatenated. 977 A piece is an atom possibly followed by a postfix operator. */ 978 STACK_PUSHX(stack, int, PARSE_CATENATION); 979 STACK_PUSHX(stack, int, PARSE_PIECE); 980 break; 981 982 case PARSE_PIECE: 983 /* Parse a piece. A piece is an atom possibly followed by one 984 or more postfix operators. */ 985 #ifdef REG_LITERAL 986 if (!(ctx->cflags & REG_LITERAL)) 987 #endif /* REG_LITERAL */ 988 STACK_PUSHX(stack, int, PARSE_POSTFIX); 989 STACK_PUSHX(stack, int, PARSE_ATOM); 990 break; 991 992 case PARSE_CATENATION: 993 /* If the expression has not ended, parse another piece. */ 994 { 995 tre_char_t c; 996 if (ctx->re >= ctx->re_end) 997 break; 998 c = *ctx->re; 999 #ifdef REG_LITERAL 1000 if (!(ctx->cflags & REG_LITERAL)) 1001 { 1002 #endif /* REG_LITERAL */ 1003 if (ctx->cflags & REG_EXTENDED && c == CHAR_PIPE) 1004 break; 1005 if ((ctx->cflags & REG_EXTENDED 1006 && c == CHAR_RPAREN && depth > 0) 1007 || (!(ctx->cflags & REG_EXTENDED) 1008 && (c == CHAR_BACKSLASH 1009 && *(ctx->re + 1) == CHAR_RPAREN))) 1010 { 1011 if (!(ctx->cflags & REG_EXTENDED) && depth == 0) 1012 status = REG_EPAREN; 1013 DPRINT(("tre_parse: group end: '%.*" STRF "'\n", 1014 REST(ctx->re))); 1015 depth--; 1016 if (!(ctx->cflags & REG_EXTENDED)) 1017 ctx->re += 2; 1018 break; 1019 } 1020 #ifdef REG_LITERAL 1021 } 1022 #endif /* REG_LITERAL */ 1023 1024 #ifdef REG_RIGHT_ASSOC 1025 if (ctx->cflags & REG_RIGHT_ASSOC) 1026 { 1027 /* Right associative concatenation. */ 1028 STACK_PUSHX(stack, voidptr, result); 1029 STACK_PUSHX(stack, int, PARSE_POST_CATENATION); 1030 STACK_PUSHX(stack, int, PARSE_CATENATION); 1031 STACK_PUSHX(stack, int, PARSE_PIECE); 1032 } 1033 else 1034 #endif /* REG_RIGHT_ASSOC */ 1035 { 1036 /* Default case, left associative concatenation. */ 1037 STACK_PUSHX(stack, int, PARSE_CATENATION); 1038 STACK_PUSHX(stack, voidptr, result); 1039 STACK_PUSHX(stack, int, PARSE_POST_CATENATION); 1040 STACK_PUSHX(stack, int, PARSE_PIECE); 1041 } 1042 break; 1043 } 1044 1045 case PARSE_POST_CATENATION: 1046 { 1047 tre_ast_node_t *tree = tre_stack_pop_voidptr(stack); 1048 tre_ast_node_t *tmp_node; 1049 tmp_node = tre_ast_new_catenation(ctx->mem, tree, result); 1050 if (!tmp_node) 1051 return REG_ESPACE; 1052 result = tmp_node; 1053 break; 1054 } 1055 1056 case PARSE_UNION: 1057 if (ctx->re >= ctx->re_end) 1058 break; 1059 #ifdef REG_LITERAL 1060 if (ctx->cflags & REG_LITERAL) 1061 break; 1062 #endif /* REG_LITERAL */ 1063 switch (*ctx->re) 1064 { 1065 case CHAR_PIPE: 1066 DPRINT(("tre_parse: union: '%.*" STRF "'\n", 1067 REST(ctx->re))); 1068 STACK_PUSHX(stack, int, PARSE_UNION); 1069 STACK_PUSHX(stack, voidptr, result); 1070 STACK_PUSHX(stack, int, PARSE_POST_UNION); 1071 STACK_PUSHX(stack, int, PARSE_BRANCH); 1072 ctx->re++; 1073 break; 1074 1075 case CHAR_RPAREN: 1076 ctx->re++; 1077 break; 1078 1079 default: 1080 break; 1081 } 1082 break; 1083 1084 case PARSE_POST_UNION: 1085 { 1086 tre_ast_node_t *tmp_node; 1087 tre_ast_node_t *tree = tre_stack_pop_voidptr(stack); 1088 tmp_node = tre_ast_new_union(ctx->mem, tree, result); 1089 if (!tmp_node) 1090 return REG_ESPACE; 1091 result = tmp_node; 1092 break; 1093 } 1094 1095 case PARSE_POSTFIX: 1096 /* Parse postfix operators. */ 1097 if (ctx->re >= ctx->re_end) 1098 break; 1099 #ifdef REG_LITERAL 1100 if (ctx->cflags & REG_LITERAL) 1101 break; 1102 #endif /* REG_LITERAL */ 1103 switch (*ctx->re) 1104 { 1105 case CHAR_PLUS: 1106 case CHAR_QUESTIONMARK: 1107 if (!(ctx->cflags & REG_EXTENDED)) 1108 break; 1109 /*FALLTHROUGH*/ 1110 case CHAR_STAR: 1111 { 1112 tre_ast_node_t *tmp_node; 1113 int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0; 1114 int rep_min = 0; 1115 int rep_max = -1; 1116 #ifdef TRE_DEBUG 1117 const tre_char_t *tmp_re; 1118 #endif 1119 1120 if (*ctx->re == CHAR_PLUS) 1121 rep_min = 1; 1122 if (*ctx->re == CHAR_QUESTIONMARK) 1123 rep_max = 1; 1124 #ifdef TRE_DEBUG 1125 tmp_re = ctx->re; 1126 #endif 1127 1128 if (ctx->re + 1 < ctx->re_end) 1129 { 1130 if (*(ctx->re + 1) == CHAR_QUESTIONMARK) 1131 { 1132 minimal = !(ctx->cflags & REG_UNGREEDY); 1133 ctx->re++; 1134 } 1135 else if (*(ctx->re + 1) == CHAR_STAR 1136 || *(ctx->re + 1) == CHAR_PLUS) 1137 { 1138 /* These are reserved for future extensions. */ 1139 return REG_BADRPT; 1140 } 1141 } 1142 1143 DPRINT(("tre_parse: %s star: '%.*" STRF "'\n", 1144 minimal ? " minimal" : "greedy", REST(tmp_re))); 1145 ctx->re++; 1146 tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max, 1147 minimal); 1148 if (tmp_node == NULL) 1149 return REG_ESPACE; 1150 result = tmp_node; 1151 STACK_PUSHX(stack, int, PARSE_POSTFIX); 1152 } 1153 break; 1154 1155 case CHAR_BACKSLASH: 1156 /* "\{" is special without REG_EXTENDED */ 1157 if (!(ctx->cflags & REG_EXTENDED) 1158 && ctx->re + 1 < ctx->re_end 1159 && *(ctx->re + 1) == CHAR_LBRACE) 1160 { 1161 ctx->re++; 1162 goto parse_brace; 1163 } 1164 else 1165 break; 1166 1167 case CHAR_LBRACE: 1168 /* "{" is literal without REG_EXTENDED */ 1169 if (!(ctx->cflags & REG_EXTENDED)) 1170 break; 1171 1172 parse_brace: 1173 DPRINT(("tre_parse: bound: '%.*" STRF "'\n", 1174 REST(ctx->re))); 1175 ctx->re++; 1176 1177 status = tre_parse_bound(ctx, &result); 1178 if (status != REG_OK) 1179 return status; 1180 STACK_PUSHX(stack, int, PARSE_POSTFIX); 1181 break; 1182 } 1183 break; 1184 1185 case PARSE_ATOM: 1186 /* Parse an atom. An atom is a regular expression enclosed in `()', 1187 an empty set of `()', a bracket expression, `.', `^', `$', 1188 a `\' followed by a character, or a single character. */ 1189 1190 /* End of regexp? (empty string). */ 1191 if (ctx->re >= ctx->re_end) 1192 goto parse_literal; 1193 1194 #ifdef REG_LITERAL 1195 if (ctx->cflags & REG_LITERAL) 1196 goto parse_literal; 1197 #endif /* REG_LITERAL */ 1198 1199 switch (*ctx->re) 1200 { 1201 case CHAR_LPAREN: /* parenthesized subexpression */ 1202 1203 /* Handle "(?...)" extensions. They work in a way similar 1204 to Perls corresponding extensions. */ 1205 if (ctx->cflags & REG_EXTENDED 1206 && *(ctx->re + 1) == CHAR_QUESTIONMARK) 1207 { 1208 int new_cflags = ctx->cflags; 1209 int bit = 1; 1210 DPRINT(("tre_parse: extension: '%.*" STRF "\n", 1211 REST(ctx->re))); 1212 ctx->re += 2; 1213 while (/*CONSTCOND*/1) 1214 { 1215 if (*ctx->re == L'i') 1216 { 1217 DPRINT(("tre_parse: icase: '%.*" STRF "\n", 1218 REST(ctx->re))); 1219 if (bit) 1220 new_cflags |= REG_ICASE; 1221 else 1222 new_cflags &= ~REG_ICASE; 1223 ctx->re++; 1224 } 1225 else if (*ctx->re == L'n') 1226 { 1227 DPRINT(("tre_parse: newline: '%.*" STRF "\n", 1228 REST(ctx->re))); 1229 if (bit) 1230 new_cflags |= REG_NEWLINE; 1231 else 1232 new_cflags &= ~REG_NEWLINE; 1233 ctx->re++; 1234 } 1235 #ifdef REG_RIGHT_ASSOC 1236 else if (*ctx->re == L'r') 1237 { 1238 DPRINT(("tre_parse: right assoc: '%.*" STRF "\n", 1239 REST(ctx->re))); 1240 if (bit) 1241 new_cflags |= REG_RIGHT_ASSOC; 1242 else 1243 new_cflags &= ~REG_RIGHT_ASSOC; 1244 ctx->re++; 1245 } 1246 #endif /* REG_RIGHT_ASSOC */ 1247 #ifdef REG_UNGREEDY 1248 else if (*ctx->re == L'U') 1249 { 1250 DPRINT(("tre_parse: ungreedy: '%.*" STRF "\n", 1251 REST(ctx->re))); 1252 if (bit) 1253 new_cflags |= REG_UNGREEDY; 1254 else 1255 new_cflags &= ~REG_UNGREEDY; 1256 ctx->re++; 1257 } 1258 #endif /* REG_UNGREEDY */ 1259 else if (*ctx->re == CHAR_MINUS) 1260 { 1261 DPRINT(("tre_parse: turn off: '%.*" STRF "\n", 1262 REST(ctx->re))); 1263 ctx->re++; 1264 bit = 0; 1265 } 1266 else if (*ctx->re == CHAR_COLON) 1267 { 1268 DPRINT(("tre_parse: no group: '%.*" STRF "\n", 1269 REST(ctx->re))); 1270 ctx->re++; 1271 depth++; 1272 break; 1273 } 1274 else if (*ctx->re == CHAR_HASH) 1275 { 1276 DPRINT(("tre_parse: comment: '%.*" STRF "\n", 1277 REST(ctx->re))); 1278 /* A comment can contain any character except a 1279 right parenthesis */ 1280 while (*ctx->re != CHAR_RPAREN 1281 && ctx->re < ctx->re_end) 1282 ctx->re++; 1283 if (*ctx->re == CHAR_RPAREN && ctx->re < ctx->re_end) 1284 { 1285 ctx->re++; 1286 break; 1287 } 1288 else 1289 return REG_BADPAT; 1290 } 1291 else if (*ctx->re == CHAR_RPAREN) 1292 { 1293 ctx->re++; 1294 break; 1295 } 1296 else 1297 return REG_BADPAT; 1298 } 1299 1300 /* Turn on the cflags changes for the rest of the 1301 enclosing group. */ 1302 STACK_PUSHX(stack, int, ctx->cflags); 1303 STACK_PUSHX(stack, int, PARSE_RESTORE_CFLAGS); 1304 STACK_PUSHX(stack, int, PARSE_RE); 1305 ctx->cflags = new_cflags; 1306 break; 1307 } 1308 1309 if (ctx->cflags & REG_EXTENDED 1310 || (ctx->re > ctx->re_start 1311 && *(ctx->re - 1) == CHAR_BACKSLASH)) 1312 { 1313 depth++; 1314 if (ctx->re + 2 < ctx->re_end 1315 && *(ctx->re + 1) == CHAR_QUESTIONMARK 1316 && *(ctx->re + 2) == CHAR_COLON) 1317 { 1318 DPRINT(("tre_parse: group begin: '%.*" STRF 1319 "', no submatch\n", REST(ctx->re))); 1320 /* Don't mark for submatching. */ 1321 ctx->re += 3; 1322 STACK_PUSHX(stack, int, PARSE_RE); 1323 } 1324 else 1325 { 1326 DPRINT(("tre_parse: group begin: '%.*" STRF 1327 "', submatch %d\n", REST(ctx->re), 1328 ctx->submatch_id)); 1329 ctx->re++; 1330 /* First parse a whole RE, then mark the resulting tree 1331 for submatching. */ 1332 STACK_PUSHX(stack, int, ctx->submatch_id); 1333 STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH); 1334 STACK_PUSHX(stack, int, PARSE_RE); 1335 ctx->submatch_id++; 1336 } 1337 } 1338 else 1339 goto parse_literal; 1340 break; 1341 1342 case CHAR_RPAREN: /* end of current subexpression */ 1343 if ((ctx->cflags & REG_EXTENDED && depth > 0) 1344 || (ctx->re > ctx->re_start 1345 && *(ctx->re - 1) == CHAR_BACKSLASH)) 1346 { 1347 DPRINT(("tre_parse: empty: '%.*" STRF "'\n", 1348 REST(ctx->re))); 1349 /* We were expecting an atom, but instead the current 1350 subexpression was closed. POSIX leaves the meaning of 1351 this to be implementation-defined. We interpret this as 1352 an empty expression (which matches an empty string). */ 1353 result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); 1354 if (result == NULL) 1355 return REG_ESPACE; 1356 if (!(ctx->cflags & REG_EXTENDED)) 1357 ctx->re--; 1358 } 1359 else 1360 goto parse_literal; 1361 break; 1362 1363 case CHAR_LBRACKET: /* bracket expression */ 1364 DPRINT(("tre_parse: bracket: '%.*" STRF "'\n", 1365 REST(ctx->re))); 1366 ctx->re++; 1367 status = tre_parse_bracket(ctx, &result); 1368 if (status != REG_OK) 1369 return status; 1370 break; 1371 1372 case CHAR_BACKSLASH: 1373 /* If this is "\(" or "\)" chew off the backslash and 1374 try again. */ 1375 if (!(ctx->cflags & REG_EXTENDED) 1376 && ctx->re + 1 < ctx->re_end 1377 && (*(ctx->re + 1) == CHAR_LPAREN 1378 || *(ctx->re + 1) == CHAR_RPAREN)) 1379 { 1380 ctx->re++; 1381 STACK_PUSHX(stack, int, PARSE_ATOM); 1382 break; 1383 } 1384 1385 /* If a macro is used, parse the expanded macro recursively. */ 1386 { 1387 tre_char_t buf[64]; 1388 tre_expand_macro(ctx->re + 1, ctx->re_end, 1389 buf, elementsof(buf)); 1390 if (buf[0] != 0) 1391 { 1392 tre_parse_ctx_t subctx; 1393 memcpy(&subctx, ctx, sizeof(subctx)); 1394 subctx.re = buf; 1395 subctx.len = tre_strlen(buf); 1396 subctx.nofirstsub = 1; 1397 status = tre_parse(&subctx); 1398 if (status != REG_OK) 1399 return status; 1400 ctx->re += 2; 1401 ctx->position = subctx.position; 1402 result = subctx.result; 1403 break; 1404 } 1405 } 1406 1407 if (ctx->re + 1 >= ctx->re_end) 1408 /* Trailing backslash. */ 1409 return REG_EESCAPE; 1410 1411 #ifdef REG_LITERAL 1412 if (*(ctx->re + 1) == L'Q') 1413 { 1414 DPRINT(("tre_parse: tmp literal: '%.*" STRF "'\n", 1415 REST(ctx->re))); 1416 ctx->cflags |= REG_LITERAL; 1417 temporary_cflags |= REG_LITERAL; 1418 ctx->re += 2; 1419 STACK_PUSHX(stack, int, PARSE_ATOM); 1420 break; 1421 } 1422 #endif /* REG_LITERAL */ 1423 1424 DPRINT(("tre_parse: bleep: '%.*" STRF "'\n", REST(ctx->re))); 1425 ctx->re++; 1426 switch (*ctx->re) 1427 { 1428 case L'b': 1429 result = tre_ast_new_literal(ctx->mem, ASSERTION, 1430 ASSERT_AT_WB, -1); 1431 ctx->re++; 1432 break; 1433 case L'B': 1434 result = tre_ast_new_literal(ctx->mem, ASSERTION, 1435 ASSERT_AT_WB_NEG, -1); 1436 ctx->re++; 1437 break; 1438 case L'<': 1439 result = tre_ast_new_literal(ctx->mem, ASSERTION, 1440 ASSERT_AT_BOW, -1); 1441 ctx->re++; 1442 break; 1443 case L'>': 1444 result = tre_ast_new_literal(ctx->mem, ASSERTION, 1445 ASSERT_AT_EOW, -1); 1446 ctx->re++; 1447 break; 1448 case L'x': 1449 ctx->re++; 1450 if (ctx->re[0] != CHAR_LBRACE && ctx->re < ctx->re_end) 1451 { 1452 /* 8 bit hex char. */ 1453 char tmp[3] = {0, 0, 0}; 1454 long val; 1455 DPRINT(("tre_parse: 8 bit hex: '%.*" STRF "'\n", 1456 REST(ctx->re - 2))); 1457 1458 if (tre_isxdigit(ctx->re[0]) && ctx->re < ctx->re_end) 1459 { 1460 tmp[0] = (char)ctx->re[0]; 1461 ctx->re++; 1462 } 1463 if (tre_isxdigit(ctx->re[0]) && ctx->re < ctx->re_end) 1464 { 1465 tmp[1] = (char)ctx->re[0]; 1466 ctx->re++; 1467 } 1468 val = strtol(tmp, NULL, 16); 1469 result = tre_ast_new_literal(ctx->mem, (int)val, 1470 (int)val, ctx->position); 1471 ctx->position++; 1472 break; 1473 } 1474 else if (ctx->re < ctx->re_end) 1475 { 1476 /* Wide char. */ 1477 char tmp[32]; 1478 long val; 1479 int i = 0; 1480 ctx->re++; 1481 while (ctx->re_end - ctx->re >= 0) 1482 { 1483 if (ctx->re[0] == CHAR_RBRACE) 1484 break; 1485 if (tre_isxdigit(ctx->re[0])) 1486 { 1487 tmp[i] = (char)ctx->re[0]; 1488 i++; 1489 ctx->re++; 1490 continue; 1491 } 1492 return REG_EBRACE; 1493 } 1494 ctx->re++; 1495 tmp[i] = 0; 1496 val = strtol(tmp, NULL, 16); 1497 result = tre_ast_new_literal(ctx->mem, (int)val, (int)val, 1498 ctx->position); 1499 ctx->position++; 1500 break; 1501 } 1502 /*FALLTHROUGH*/ 1503 1504 default: 1505 if (tre_isdigit(*ctx->re)) 1506 { 1507 /* Back reference. */ 1508 int val = *ctx->re - L'0'; 1509 DPRINT(("tre_parse: backref: '%.*" STRF "'\n", 1510 REST(ctx->re - 1))); 1511 result = tre_ast_new_literal(ctx->mem, BACKREF, val, 1512 ctx->position); 1513 if (result == NULL) 1514 return REG_ESPACE; 1515 ctx->position++; 1516 ctx->max_backref = MAX(val, ctx->max_backref); 1517 ctx->re++; 1518 } 1519 else 1520 { 1521 /* Escaped character. */ 1522 DPRINT(("tre_parse: escaped: '%.*" STRF "'\n", 1523 REST(ctx->re - 1))); 1524 result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re, 1525 ctx->position); 1526 ctx->position++; 1527 ctx->re++; 1528 } 1529 break; 1530 } 1531 if (result == NULL) 1532 return REG_ESPACE; 1533 break; 1534 1535 case CHAR_PERIOD: /* the any-symbol */ 1536 DPRINT(("tre_parse: any: '%.*" STRF "'\n", 1537 REST(ctx->re))); 1538 if (ctx->cflags & REG_NEWLINE) 1539 { 1540 tre_ast_node_t *tmp1; 1541 tre_ast_node_t *tmp2; 1542 tmp1 = tre_ast_new_literal(ctx->mem, 0, L'\n' - 1, 1543 ctx->position); 1544 if (!tmp1) 1545 return REG_ESPACE; 1546 tmp2 = tre_ast_new_literal(ctx->mem, L'\n' + 1, TRE_CHAR_MAX, 1547 ctx->position + 1); 1548 if (!tmp2) 1549 return REG_ESPACE; 1550 result = tre_ast_new_union(ctx->mem, tmp1, tmp2); 1551 if (!result) 1552 return REG_ESPACE; 1553 ctx->position += 2; 1554 } 1555 else 1556 { 1557 result = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, 1558 ctx->position); 1559 if (!result) 1560 return REG_ESPACE; 1561 ctx->position++; 1562 } 1563 ctx->re++; 1564 break; 1565 1566 case CHAR_CARET: /* beginning of line assertion */ 1567 /* '^' has a special meaning everywhere in EREs, and in the 1568 beginning of the RE and after \( is BREs. */ 1569 if (ctx->cflags & REG_EXTENDED 1570 || (ctx->re - 2 >= ctx->re_start 1571 && *(ctx->re - 2) == CHAR_BACKSLASH 1572 && *(ctx->re - 1) == CHAR_LPAREN) 1573 || ctx->re == ctx->re_start) 1574 { 1575 DPRINT(("tre_parse: BOL: '%.*" STRF "'\n", 1576 REST(ctx->re))); 1577 result = tre_ast_new_literal(ctx->mem, ASSERTION, 1578 ASSERT_AT_BOL, -1); 1579 if (result == NULL) 1580 return REG_ESPACE; 1581 ctx->re++; 1582 } 1583 else 1584 goto parse_literal; 1585 break; 1586 1587 case CHAR_DOLLAR: /* end of line assertion. */ 1588 /* '$' is special everywhere in EREs, and in the end of the 1589 string and before \) is BREs. */ 1590 if (ctx->cflags & REG_EXTENDED 1591 || (ctx->re + 2 < ctx->re_end 1592 && *(ctx->re + 1) == CHAR_BACKSLASH 1593 && *(ctx->re + 2) == CHAR_RPAREN) 1594 || ctx->re + 1 == ctx->re_end) 1595 { 1596 DPRINT(("tre_parse: EOL: '%.*" STRF "'\n", 1597 REST(ctx->re))); 1598 result = tre_ast_new_literal(ctx->mem, ASSERTION, 1599 ASSERT_AT_EOL, -1); 1600 if (result == NULL) 1601 return REG_ESPACE; 1602 ctx->re++; 1603 } 1604 else 1605 goto parse_literal; 1606 break; 1607 1608 default: 1609 parse_literal: 1610 1611 if (temporary_cflags && ctx->re + 1 < ctx->re_end 1612 && *ctx->re == CHAR_BACKSLASH && *(ctx->re + 1) == L'E') 1613 { 1614 DPRINT(("tre_parse: end tmps: '%.*" STRF "'\n", 1615 REST(ctx->re))); 1616 ctx->cflags &= ~temporary_cflags; 1617 temporary_cflags = 0; 1618 ctx->re += 2; 1619 STACK_PUSHX(stack, int, PARSE_PIECE); 1620 break; 1621 } 1622 1623 1624 /* We are expecting an atom. If the subexpression (or the whole 1625 regexp ends here, we interpret it as an empty expression 1626 (which matches an empty string). */ 1627 if ( 1628 #ifdef REG_LITERAL 1629 !(ctx->cflags & REG_LITERAL) && 1630 #endif /* REG_LITERAL */ 1631 (ctx->re >= ctx->re_end 1632 || *ctx->re == CHAR_STAR 1633 || (ctx->cflags & REG_EXTENDED 1634 && (*ctx->re == CHAR_PIPE 1635 || *ctx->re == CHAR_LBRACE 1636 || *ctx->re == CHAR_PLUS 1637 || *ctx->re == CHAR_QUESTIONMARK)) 1638 /* Test for "\)" in BRE mode. */ 1639 || (!(ctx->cflags & REG_EXTENDED) 1640 && ctx->re + 1 < ctx->re_end 1641 && *ctx->re == CHAR_BACKSLASH 1642 && *(ctx->re + 1) == CHAR_LBRACE))) 1643 { 1644 DPRINT(("tre_parse: empty: '%.*" STRF "'\n", 1645 REST(ctx->re))); 1646 result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); 1647 if (!result) 1648 return REG_ESPACE; 1649 break; 1650 } 1651 1652 DPRINT(("tre_parse: literal: '%.*" STRF "'\n", 1653 REST(ctx->re))); 1654 /* Note that we can't use an tre_isalpha() test here, since there 1655 may be characters which are alphabetic but neither upper or 1656 lower case. */ 1657 if (ctx->cflags & REG_ICASE 1658 && (tre_isupper(*ctx->re) || tre_islower(*ctx->re))) 1659 { 1660 tre_ast_node_t *tmp1; 1661 tre_ast_node_t *tmp2; 1662 1663 /* XXX - Can there be more than one opposite-case 1664 counterpoints for some character in some locale? Or 1665 more than two characters which all should be regarded 1666 the same character if case is ignored? If yes, there 1667 does not seem to be a portable way to detect it. I guess 1668 that at least for multi-character collating elements there 1669 could be several opposite-case counterpoints, but they 1670 cannot be supported portably anyway. */ 1671 tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(*ctx->re), 1672 tre_toupper(*ctx->re), 1673 ctx->position); 1674 if (!tmp1) 1675 return REG_ESPACE; 1676 tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(*ctx->re), 1677 tre_tolower(*ctx->re), 1678 ctx->position); 1679 if (!tmp2) 1680 return REG_ESPACE; 1681 result = tre_ast_new_union(ctx->mem, tmp1, tmp2); 1682 if (!result) 1683 return REG_ESPACE; 1684 } 1685 else 1686 { 1687 result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re, 1688 ctx->position); 1689 if (!result) 1690 return REG_ESPACE; 1691 } 1692 ctx->position++; 1693 ctx->re++; 1694 break; 1695 } 1696 break; 1697 1698 case PARSE_MARK_FOR_SUBMATCH: 1699 { 1700 int submatch_id = tre_stack_pop_int(stack); 1701 1702 if (result->submatch_id >= 0) 1703 { 1704 tre_ast_node_t *n, *tmp_node; 1705 n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); 1706 if (n == NULL) 1707 return REG_ESPACE; 1708 tmp_node = tre_ast_new_catenation(ctx->mem, n, result); 1709 if (tmp_node == NULL) 1710 return REG_ESPACE; 1711 tmp_node->num_submatches = result->num_submatches; 1712 result = tmp_node; 1713 } 1714 result->submatch_id = submatch_id; 1715 result->num_submatches++; 1716 break; 1717 } 1718 1719 case PARSE_RESTORE_CFLAGS: 1720 ctx->cflags = tre_stack_pop_int(stack); 1721 break; 1722 1723 default: 1724 assert(0); 1725 break; 1726 } 1727 } 1728 1729 /* Check for missing closing parentheses. */ 1730 if (depth > 0) 1731 return REG_EPAREN; 1732 1733 if (status == REG_OK) 1734 ctx->result = result; 1735 1736 return status; 1737 } 1738 1739 /* EOF */ 1740