1 /* 2 * Stack-less Just-In-Time compiler 3 * 4 * Copyright 2009-2010 Zoltan Herczeg (hzmester@freemail.hu). All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without modification, are 7 * permitted provided that the following conditions are met: 8 * 9 * 1. Redistributions of source code must retain the above copyright notice, this list of 10 * conditions and the following disclaimer. 11 * 12 * 2. Redistributions in binary form must reproduce the above copyright notice, this list 13 * of conditions and the following disclaimer in the documentation and/or other materials 14 * provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) AND CONTRIBUTORS ``AS IS'' AND ANY 17 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT 19 * SHALL THE COPYRIGHT HOLDER(S) OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 21 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 22 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 24 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 */ 26 27 #include "sljitLir.h" 28 #include "regexJIT.h" 29 30 #ifdef REGEX_MATCH_VERBOSE 31 #include <stdio.h> 32 #endif 33 34 /* Extra, hidden flags: 35 {id!} where id > 0 found in the code. */ 36 #define REGEX_ID_CHECK 0x100 37 /* When REGEX_NEWLINE && REGEX_MATCH_BEGIN defined, the pattern turn to a normal search, 38 which starts with [\r\n] character range. */ 39 #define REGEX_FAKE_MATCH_BEGIN 0x200 40 /* When REGEX_NEWLINE && REGEX_MATCH_END defined, the pattern turn to a normal search, 41 which ends with [\r\n] character range. */ 42 #define REGEX_FAKE_MATCH_END 0x400 43 44 /* Check match completition after every (FINISH_TEST + 1) steps. */ 45 #define FINISH_TEST 0x7 46 47 /* --------------------------------------------------------------------- */ 48 /* Structures for JIT-ed pattern matching */ 49 /* --------------------------------------------------------------------- */ 50 51 struct regex_machine 52 { 53 /* flags. */ 54 int flags; 55 /* Number of state descriptors for one term. */ 56 sljit_sw no_states; 57 /* Total size. */ 58 sljit_sw size; 59 60 union { 61 void *init_match; 62 sljit_sw (SLJIT_CALL *call_init)(void *next, void* match); 63 } u; 64 #if (defined SLJIT_INDIRECT_CALL && SLJIT_INDIRECT_CALL) 65 struct sljit_function_context context; 66 #endif 67 68 void *continue_match; 69 70 /* Variable sized array to contain the handler addresses. */ 71 sljit_uw entry_addrs[1]; 72 }; 73 74 struct regex_match 75 { 76 /* Current and next state array. */ 77 sljit_sw *current; 78 sljit_sw *next; 79 /* Starting. */ 80 sljit_sw head; 81 /* String character index (ever increasing). */ 82 sljit_sw index; 83 /* Best match found so far (members in priority order). */ 84 sljit_sw best_begin; 85 sljit_sw best_end; 86 sljit_sw best_id; 87 /* Bool flags (encoded as word). */ 88 sljit_sw fast_quit; 89 sljit_sw fast_forward; 90 /* Machine. */ 91 struct regex_machine *machine; 92 93 union { 94 void *continue_match; 95 void (SLJIT_CALL *call_continue)(struct regex_match *match, const regex_char_t *input_string, int length); 96 } u; 97 98 /* Variable sized array to contain the state arrays. */ 99 sljit_sw states[1]; 100 }; 101 102 /* State vector 103 ITEM[0] - pointer to the address inside the machine code 104 ITEM[1] - next pointer 105 ITEM[2] - string started from (optional) 106 ITEM[3] - max ID (optional) */ 107 108 /* Register allocation. */ 109 /* Current state array (loaded & stored: regex_match->current). */ 110 #define R_CURR_STATE SLJIT_S0 111 /* Next state array (loaded & stored: regex_match->next). */ 112 #define R_NEXT_STATE SLJIT_S1 113 /* Head (loaded & stored: regex_match->head). */ 114 #define R_NEXT_HEAD SLJIT_S2 115 /* String fragment pointer. */ 116 #define R_STRING SLJIT_S3 117 /* String fragment length. */ 118 #define R_LENGTH SLJIT_S4 119 /* 'struct regex_match*' */ 120 #define R_REGEX_MATCH SLJIT_R0 121 /* Current character. */ 122 #define R_CURR_CHAR SLJIT_R1 123 /* Temporary register. */ 124 #define R_TEMP SLJIT_R2 125 /* Caches the regex_match->best_begin. */ 126 #define R_BEST_BEGIN SLJIT_R3 127 /* Current character index. */ 128 #define R_CURR_INDEX SLJIT_R4 129 130 /* --------------------------------------------------------------------- */ 131 /* Stack management */ 132 /* --------------------------------------------------------------------- */ 133 134 /* Try to allocate 2^n blocks. */ 135 #define STACK_FRAGMENT_SIZE (((64 * sizeof(struct stack_item)) - (sizeof(struct stack_fragment_data))) / (sizeof(struct stack_item))) 136 137 struct stack_item { 138 int type; 139 int value; 140 }; 141 142 struct stack_fragment_data { 143 struct stack_fragment *next; 144 struct stack_fragment *prev; 145 }; 146 147 struct stack_fragment { 148 struct stack_fragment_data data; 149 struct stack_item items[STACK_FRAGMENT_SIZE]; 150 }; 151 152 struct stack { 153 struct stack_fragment *first; 154 struct stack_fragment *last; 155 int index; 156 int count; 157 }; 158 159 #if (defined SLJIT_DEBUG && SLJIT_DEBUG) 160 161 static void stack_check(struct stack *stack) 162 { 163 struct stack_fragment *curr; 164 int found; 165 166 if (!stack) 167 return; 168 169 SLJIT_ASSERT(stack->index >= 0 && stack->index < STACK_FRAGMENT_SIZE); 170 171 if (stack->first == NULL) { 172 SLJIT_ASSERT(stack->first == NULL && stack->last == NULL); 173 SLJIT_ASSERT(stack->index == STACK_FRAGMENT_SIZE - 1 && stack->count == 0); 174 return; 175 } 176 177 found = 0; 178 if (stack->last == NULL) { 179 SLJIT_ASSERT(stack->index == STACK_FRAGMENT_SIZE - 1 && stack->count == 0); 180 found = 1; 181 } 182 else 183 SLJIT_ASSERT(stack->index >= 0 && stack->count >= 0); 184 185 SLJIT_ASSERT(stack->first->data.prev == NULL); 186 curr = stack->first; 187 while (curr) { 188 if (curr == stack->last) 189 found = 1; 190 if (curr->data.next) 191 SLJIT_ASSERT(curr->data.next->data.prev == curr); 192 curr = curr->data.next; 193 } 194 SLJIT_ASSERT(found); 195 } 196 197 #endif 198 199 static void stack_init(struct stack *stack) 200 { 201 stack->first = NULL; 202 stack->last = NULL; 203 stack->index = STACK_FRAGMENT_SIZE - 1; 204 stack->count = 0; 205 } 206 207 static void stack_destroy(struct stack *stack) 208 { 209 struct stack_fragment *curr = stack->first; 210 struct stack_fragment *prev; 211 212 #if (defined SLJIT_DEBUG && SLJIT_DEBUG) 213 stack_check(stack); 214 #endif 215 216 while (curr) { 217 prev = curr; 218 curr = curr->data.next; 219 SLJIT_FREE(prev, NULL); 220 } 221 } 222 223 static SLJIT_INLINE struct stack_item* stack_top(struct stack *stack) 224 { 225 SLJIT_ASSERT(stack->last); 226 return stack->last->items + stack->index; 227 } 228 229 static int stack_push(struct stack *stack, int type, int value) 230 { 231 if (stack->last) { 232 stack->index++; 233 if (stack->index >= STACK_FRAGMENT_SIZE) { 234 stack->index = 0; 235 if (!stack->last->data.next) { 236 stack->last->data.next = (struct stack_fragment*)SLJIT_MALLOC(sizeof(struct stack_fragment), NULL); 237 if (!stack->last->data.next) 238 return 1; 239 stack->last->data.next->data.next = NULL; 240 stack->last->data.next->data.prev = stack->last; 241 } 242 stack->last = stack->last->data.next; 243 } 244 } 245 else if (!stack->first) { 246 stack->last = (struct stack_fragment*)SLJIT_MALLOC(sizeof(struct stack_fragment), NULL); 247 if (!stack->last) 248 return 1; 249 stack->last->data.prev = NULL; 250 stack->last->data.next = NULL; 251 stack->first = stack->last; 252 stack->index = 0; 253 } 254 else { 255 stack->last = stack->first; 256 stack->index = 0; 257 } 258 stack->last->items[stack->index].type = type; 259 stack->last->items[stack->index].value = value; 260 stack->count++; 261 #if (defined SLJIT_DEBUG && SLJIT_DEBUG) 262 stack_check(stack); 263 #endif 264 return 0; 265 } 266 267 static struct stack_item* stack_pop(struct stack *stack) 268 { 269 struct stack_item *ret = stack_top(stack); 270 271 if (stack->index > 0) 272 stack->index--; 273 else { 274 stack->last = stack->last->data.prev; 275 stack->index = STACK_FRAGMENT_SIZE - 1; 276 } 277 278 stack->count--; 279 #if (defined SLJIT_DEBUG && SLJIT_DEBUG) 280 stack_check(stack); 281 #endif 282 return ret; 283 } 284 285 static SLJIT_INLINE void stack_clone(struct stack *src, struct stack *dst) 286 { 287 *dst = *src; 288 } 289 290 static int stack_push_copy(struct stack *stack, int items, int length) 291 { 292 struct stack_fragment *frag1; 293 int ind1; 294 struct stack_fragment *frag2; 295 int ind2; 296 int counter; 297 298 SLJIT_ASSERT(stack->count >= length && items <= length && items > 0); 299 300 /* Allocate the necessary elements. */ 301 counter = items; 302 frag1 = stack->last; 303 ind1 = stack->index; 304 while (counter > 0) { 305 if (stack->index + counter >= STACK_FRAGMENT_SIZE) { 306 counter -= STACK_FRAGMENT_SIZE - stack->index - 1 + 1; 307 stack->index = 0; 308 if (!stack->last->data.next) { 309 stack->last->data.next = (struct stack_fragment*)SLJIT_MALLOC(sizeof(struct stack_fragment), NULL); 310 if (!stack->last->data.next) 311 return 1; 312 stack->last->data.next->data.next = NULL; 313 stack->last->data.next->data.prev = stack->last; 314 } 315 stack->last = stack->last->data.next; 316 } 317 else { 318 stack->index += counter; 319 counter = 0; 320 } 321 } 322 323 frag2 = stack->last; 324 ind2 = stack->index; 325 while (length > 0) { 326 frag2->items[ind2--] = frag1->items[ind1--]; 327 if (ind1 < 0) { 328 ind1 = STACK_FRAGMENT_SIZE - 1; 329 frag1 = frag1->data.prev; 330 } 331 if (ind2 < 0) { 332 ind2 = STACK_FRAGMENT_SIZE - 1; 333 frag2 = frag2->data.prev; 334 } 335 length--; 336 } 337 338 #if (defined SLJIT_DEBUG && SLJIT_DEBUG) 339 stack_check(stack); 340 #endif 341 stack->count += items; 342 return 0; 343 } 344 345 /* --------------------------------------------------------------------- */ 346 /* Parser */ 347 /* --------------------------------------------------------------------- */ 348 349 enum { 350 /* Common. */ 351 type_begin, 352 type_end, 353 type_char, 354 type_newline, 355 type_id, 356 type_rng_start, 357 type_rng_end, 358 type_rng_char, 359 type_rng_left, 360 type_rng_right, 361 362 /* generator only. */ 363 type_branch, 364 type_jump, 365 366 /* Parser only. */ 367 type_open_br, 368 type_close_br, 369 type_select, 370 type_asterisk, 371 type_plus_sign, 372 type_qestion_mark 373 }; 374 375 struct compiler_common { 376 /* Temporary stacks. */ 377 struct stack stack; 378 struct stack depth; 379 /* REGEX_ flags. */ 380 int flags; 381 /* Encoded size of the dfa representation. */ 382 sljit_sw dfa_size; 383 /* Number of terms. */ 384 sljit_sw terms_size; 385 /* Number of state descriptors for one term (same as machine->no_states). */ 386 sljit_sw no_states; 387 /* Number of type_rng_(char|left)-s in the longest character range. */ 388 sljit_sw longest_range_size; 389 390 /* DFA linear representation (size: dfa_size). */ 391 struct stack_item *dfa_transitions; 392 /* Term id and search state pairs (size: dfa_size). */ 393 struct stack_item *search_states; 394 395 /* sljit compiler */ 396 struct sljit_compiler *compiler; 397 /* Machine data, which must be kept for later use. */ 398 struct regex_machine *machine; 399 /* Temporary space for jumps (size: longest_range_size). */ 400 struct sljit_jump **range_jump_list; 401 }; 402 403 static const regex_char_t* decode_number(const regex_char_t *regex_string, int length, int *result) 404 { 405 int value = 0; 406 407 SLJIT_ASSERT(length > 0); 408 if (*regex_string < '0' || *regex_string > '9') { 409 *result = -1; 410 return regex_string; 411 } 412 413 while (length > 0 && *regex_string >= '0' && *regex_string <= '9') { 414 value = value * 10 + (*regex_string - '0'); 415 length--; 416 regex_string++; 417 } 418 419 *result = value; 420 return regex_string; 421 } 422 423 static int iterate(struct stack *stack, int min, int max) 424 { 425 struct stack it; 426 struct stack_item *item; 427 int count = -1; 428 int len = 0; 429 int depth = 0; 430 431 stack_clone(stack, &it); 432 433 /* Calculate size. */ 434 while (count < 0) { 435 item = stack_pop(&it); 436 switch (item->type) { 437 case type_id: 438 case type_rng_end: 439 case type_rng_char: 440 case type_rng_left: 441 case type_rng_right: 442 case type_plus_sign: 443 case type_qestion_mark: 444 len++; 445 break; 446 447 case type_asterisk: 448 len += 2; 449 break; 450 451 case type_close_br: 452 depth++; 453 break; 454 455 case type_open_br: 456 SLJIT_ASSERT(depth > 0); 457 depth--; 458 if (depth == 0) 459 count = it.count; 460 break; 461 462 case type_select: 463 SLJIT_ASSERT(depth > 0); 464 len += 2; 465 break; 466 467 default: 468 SLJIT_ASSERT(item->type != type_begin && item->type != type_end); 469 if (depth == 0) 470 count = it.count; 471 len++; 472 break; 473 } 474 } 475 476 if (min == 0 && max == 0) { 477 /* {0,0} case, not {0,} case: delete subtree. */ 478 stack_clone(&it, stack); 479 /* and put an empty bracket expression instead of it. */ 480 if (stack_push(stack, type_open_br, 0)) 481 return REGEX_MEMORY_ERROR; 482 if (stack_push(stack, type_close_br, 0)) 483 return REGEX_MEMORY_ERROR; 484 return len; 485 } 486 487 count = stack->count - count; 488 489 /* Put an open bracket before the sequence. */ 490 if (stack_push_copy(stack, 1, count)) 491 return -1; 492 493 #if (defined SLJIT_DEBUG && SLJIT_DEBUG) 494 SLJIT_ASSERT(stack_push(&it, type_open_br, 0) == 0); 495 #else 496 stack_push(&it, type_open_br, 0); 497 #endif 498 499 /* Copy the data. */ 500 if (max > 0) { 501 len = len * (max - 1); 502 max -= min; 503 /* Insert ? operators. */ 504 len += max; 505 506 if (min > 0) { 507 min--; 508 while (min > 0) { 509 if (stack_push_copy(stack, count, count)) 510 return -1; 511 min--; 512 } 513 if (max > 0) { 514 if (stack_push_copy(stack, count, count)) 515 return -1; 516 if (stack_push(stack, type_qestion_mark, 0)) 517 return REGEX_MEMORY_ERROR; 518 count++; 519 max--; 520 } 521 } 522 else { 523 SLJIT_ASSERT(max > 0); 524 max--; 525 count++; 526 if (stack_push(stack, type_qestion_mark, 0)) 527 return REGEX_MEMORY_ERROR; 528 } 529 530 while (max > 0) { 531 if (stack_push_copy(stack, count, count)) 532 return -1; 533 max--; 534 } 535 } 536 else { 537 SLJIT_ASSERT(min > 0); 538 min--; 539 /* Insert + operator. */ 540 len = len * min + 1; 541 while (min > 0) { 542 if (stack_push_copy(stack, count, count)) 543 return -1; 544 min--; 545 } 546 547 if (stack_push(stack, type_plus_sign, 0)) 548 return REGEX_MEMORY_ERROR; 549 } 550 551 /* Close the opened bracket. */ 552 if (stack_push(stack, type_close_br, 0)) 553 return REGEX_MEMORY_ERROR; 554 555 return len; 556 } 557 558 static int parse_iterator(const regex_char_t *regex_string, int length, struct stack *stack, sljit_sw *dfa_size, int begin) 559 { 560 /* We only know that *regex_string == { . */ 561 int val1, val2; 562 const regex_char_t *base_from = regex_string; 563 const regex_char_t *from; 564 565 length--; 566 regex_string++; 567 568 /* Decode left value. */ 569 val2 = -1; 570 if (length == 0) 571 return -2; 572 if (*regex_string == ',') { 573 val1 = 0; 574 length--; 575 regex_string++; 576 } 577 else { 578 from = regex_string; 579 regex_string = decode_number(regex_string, length, &val1); 580 if (val1 < 0) 581 return -2; 582 length -= regex_string - from; 583 584 if (length == 0) 585 return -2; 586 if (*regex_string == '}') { 587 val2 = val1; 588 if (val1 == 0) 589 val1 = -1; 590 } 591 else if (length >= 2 && *regex_string == '!' && regex_string[1] == '}') { 592 /* Non posix extension. */ 593 if (stack_push(stack, type_id, val1)) 594 return -1; 595 (*dfa_size)++; 596 return (regex_string - base_from) + 1; 597 } 598 else { 599 if (*regex_string != ',') 600 return -2; 601 length--; 602 regex_string++; 603 } 604 } 605 606 if (begin) 607 return -2; 608 609 /* Decode right value. */ 610 if (val2 == -1) { 611 if (length == 0) 612 return -2; 613 if (*regex_string == '}') 614 val2 = 0; 615 else { 616 from = regex_string; 617 regex_string = decode_number(regex_string, length, &val2); 618 length -= regex_string - from; 619 if (val2 < 0 || length == 0 || *regex_string != '}' || val2 < val1) 620 return -2; 621 if (val2 == 0) { 622 SLJIT_ASSERT(val1 == 0); 623 val1 = -1; 624 } 625 } 626 } 627 628 /* Fast cases. */ 629 if (val1 > 1 || val2 > 1) { 630 val1 = iterate(stack, val1, val2); 631 if (val1 < 0) 632 return -1; 633 *dfa_size += val1; 634 } 635 else if (val1 == 0 && val2 == 0) { 636 if (stack_push(stack, type_asterisk, 0)) 637 return -1; 638 *dfa_size += 2; 639 } 640 else if (val1 == 1 && val2 == 0) { 641 if (stack_push(stack, type_plus_sign, 0)) 642 return -1; 643 (*dfa_size)++; 644 } 645 else if (val1 == 0 && val2 == 1) { 646 if (stack_push(stack, type_qestion_mark, 0)) 647 return -1; 648 (*dfa_size)++; 649 } 650 else if (val1 == -1) { 651 val1 = iterate(stack, 0, 0); 652 if (val1 < 0) 653 return -1; 654 *dfa_size -= val1; 655 SLJIT_ASSERT(*dfa_size >= 2); 656 } 657 else { 658 /* Ignore. */ 659 SLJIT_ASSERT(val1 == 1 && val2 == 1); 660 } 661 return regex_string - base_from; 662 } 663 664 static int parse_char_range(const regex_char_t *regex_string, int length, struct compiler_common *compiler_common) 665 { 666 struct stack* stack = &compiler_common->stack; 667 const regex_char_t *base_from = regex_string; 668 regex_char_t left_char, right_char, tmp_char; 669 670 length--; 671 regex_string++; 672 673 if (length == 0) 674 return -2; 675 676 if (*regex_string != '^') { 677 if (stack_push(stack, type_rng_start, 0)) 678 return -1; 679 } 680 else { 681 length--; 682 regex_string++; 683 684 if (length == 0) 685 return -2; 686 687 if (stack_push(stack, type_rng_start, 1)) 688 return -1; 689 } 690 /* For both the type_rng_start & type_rng_end. */ 691 compiler_common->dfa_size += 2; 692 693 /* Range must be at least 1 character. */ 694 if (*regex_string == ']') { 695 length--; 696 regex_string++; 697 if (stack_push(stack, type_rng_char, ']')) 698 return -1; 699 compiler_common->dfa_size++; 700 } 701 702 while (1) { 703 if (length == 0) 704 return -2; 705 706 if (*regex_string == ']') 707 break; 708 709 if (*regex_string != '\\') 710 left_char = *regex_string; 711 else { 712 regex_string++; 713 length--; 714 if (length == 0) 715 return -2; 716 left_char = *regex_string; 717 } 718 regex_string++; 719 length--; 720 721 /* Is a range here? */ 722 if (length >= 3 && *regex_string == '-' && *(regex_string + 1) != ']') { 723 regex_string++; 724 length--; 725 726 if (*regex_string != '\\') 727 right_char = *regex_string; 728 else { 729 regex_string++; 730 length--; 731 if (length == 0) 732 return -2; 733 right_char = *regex_string; 734 } 735 regex_string++; 736 length--; 737 738 if (left_char > right_char) { 739 /* Swap if necessary. */ 740 tmp_char = left_char; 741 left_char = right_char; 742 right_char = tmp_char; 743 } 744 745 if (stack_push(stack, type_rng_left, left_char)) 746 return -1; 747 if (stack_push(stack, type_rng_right, right_char)) 748 return -1; 749 compiler_common->dfa_size += 2; 750 } 751 else { 752 if (stack_push(stack, type_rng_char, left_char)) 753 return -1; 754 compiler_common->dfa_size++; 755 } 756 } 757 758 if (stack_push(stack, type_rng_end, 0)) 759 return -1; 760 return regex_string - base_from; 761 } 762 763 static int parse(const regex_char_t *regex_string, int length, struct compiler_common *compiler_common) 764 { 765 /* Depth of bracketed expressions. */ 766 int depth = 0; 767 /* Have we already found a term? '1' if not yet. */ 768 int begin = 1; 769 /* Cache stack pointer. */ 770 struct stack* stack = &compiler_common->stack; 771 int tmp; 772 773 /* Type_begin and type_end. */ 774 compiler_common->dfa_size = 2; 775 stack_init(stack); 776 if (stack_push(stack, type_begin, 0)) 777 return REGEX_MEMORY_ERROR; 778 779 if (length > 0 && *regex_string == '^') { 780 compiler_common->flags |= REGEX_MATCH_BEGIN; 781 length--; 782 regex_string++; 783 } 784 785 if ((compiler_common->flags & (REGEX_MATCH_BEGIN | REGEX_NEWLINE)) == (REGEX_MATCH_BEGIN | REGEX_NEWLINE)) { 786 /* Replace REGEX_MATCH_BEGIN flag to REGEX_FAKE_MATCH_BEGIN */ 787 compiler_common->flags &= ~REGEX_MATCH_BEGIN; 788 compiler_common->flags |= REGEX_FAKE_MATCH_BEGIN; 789 /* and append a new-line search. */ 790 if (stack_push(stack, type_newline, 0)) 791 return REGEX_MEMORY_ERROR; 792 compiler_common->dfa_size++; 793 /* Begin intentionally kept as 1. */ 794 } 795 796 while (length > 0) { 797 switch (*regex_string) { 798 case '\\' : 799 length--; 800 regex_string++; 801 if (length == 0) 802 return REGEX_INVALID_REGEX; 803 if (stack_push(stack, type_char, *regex_string)) 804 return REGEX_MEMORY_ERROR; 805 begin = 0; 806 compiler_common->dfa_size++; 807 break; 808 809 case '.' : 810 if (stack_push(stack, type_rng_start, 1)) 811 return REGEX_MEMORY_ERROR; 812 if (compiler_common->flags & REGEX_NEWLINE) { 813 if (stack_push(stack, type_rng_char, '\n')) 814 return REGEX_MEMORY_ERROR; 815 if (stack_push(stack, type_rng_char, '\r')) 816 return REGEX_MEMORY_ERROR; 817 compiler_common->dfa_size += 2; 818 } 819 if (stack_push(stack, type_rng_end, 1)) 820 return REGEX_MEMORY_ERROR; 821 begin = 0; 822 compiler_common->dfa_size += 2; 823 break; 824 825 case '(' : 826 depth++; 827 if (stack_push(stack, type_open_br, 0)) 828 return REGEX_MEMORY_ERROR; 829 begin = 1; 830 break; 831 832 case ')' : 833 if (depth == 0) 834 return REGEX_INVALID_REGEX; 835 depth--; 836 if (stack_push(stack, type_close_br, 0)) 837 return REGEX_MEMORY_ERROR; 838 begin = 0; 839 break; 840 841 case '|' : 842 if (stack_push(stack, type_select, 0)) 843 return REGEX_MEMORY_ERROR; 844 begin = 1; 845 compiler_common->dfa_size += 2; 846 break; 847 848 case '*' : 849 if (begin) 850 return REGEX_INVALID_REGEX; 851 if (stack_push(stack, type_asterisk, 0)) 852 return REGEX_MEMORY_ERROR; 853 compiler_common->dfa_size += 2; 854 break; 855 856 case '?' : 857 case '+' : 858 if (begin) 859 return REGEX_INVALID_REGEX; 860 if (stack_push(stack, (*regex_string == '+') ? type_plus_sign : type_qestion_mark, 0)) 861 return REGEX_MEMORY_ERROR; 862 compiler_common->dfa_size++; 863 break; 864 865 case '{' : 866 tmp = parse_iterator(regex_string, length, stack, &compiler_common->dfa_size, begin); 867 868 if (tmp >= 0) { 869 length -= tmp; 870 regex_string += tmp; 871 } 872 else if (tmp == -1) 873 return REGEX_MEMORY_ERROR; 874 else { 875 /* Not a valid range expression. */ 876 SLJIT_ASSERT(tmp == -2); 877 if (stack_push(stack, type_char, '{')) 878 return REGEX_MEMORY_ERROR; 879 compiler_common->dfa_size++; 880 } 881 break; 882 883 case '[' : 884 tmp = parse_char_range(regex_string, length, compiler_common); 885 if (tmp >= 0) { 886 length -= tmp; 887 regex_string += tmp; 888 } 889 else if (tmp == -1) 890 return REGEX_MEMORY_ERROR; 891 else { 892 SLJIT_ASSERT(tmp == -2); 893 return REGEX_INVALID_REGEX; 894 } 895 begin = 0; 896 break; 897 898 default: 899 if (length == 1 && *regex_string == '$') { 900 compiler_common->flags |= REGEX_MATCH_END; 901 break; 902 } 903 if (stack_push(stack, type_char, *regex_string)) 904 return REGEX_MEMORY_ERROR; 905 begin = 0; 906 compiler_common->dfa_size++; 907 break; 908 } 909 length--; 910 regex_string++; 911 } 912 913 if (depth != 0) 914 return REGEX_INVALID_REGEX; 915 916 if ((compiler_common->flags & (REGEX_MATCH_END | REGEX_NEWLINE)) == (REGEX_MATCH_END | REGEX_NEWLINE)) { 917 /* Replace REGEX_MATCH_END flag to REGEX_FAKE_MATCH_END */ 918 compiler_common->flags &= ~REGEX_MATCH_END; 919 compiler_common->flags |= REGEX_FAKE_MATCH_END; 920 /* and append a new-line search. */ 921 if (stack_push(stack, type_newline, 1)) 922 return REGEX_MEMORY_ERROR; 923 compiler_common->dfa_size++; 924 /* Begin intentionally kept as 1. */ 925 } 926 927 if (stack_push(stack, type_end, 0)) 928 return REGEX_MEMORY_ERROR; 929 930 return REGEX_NO_ERROR; 931 } 932 933 /* --------------------------------------------------------------------- */ 934 /* Generating machine state transitions */ 935 /* --------------------------------------------------------------------- */ 936 937 #define PUT_TRANSITION(typ, val) \ 938 do { \ 939 --transitions_ptr; \ 940 transitions_ptr->type = typ; \ 941 transitions_ptr->value = val; \ 942 } while (0) 943 944 static struct stack_item* handle_iteratives(struct stack_item *transitions_ptr, struct stack_item *transitions, struct stack *depth) 945 { 946 struct stack_item *item; 947 948 while (1) { 949 item = stack_top(depth); 950 951 switch (item->type) { 952 case type_asterisk: 953 SLJIT_ASSERT(transitions[item->value].type == type_branch); 954 transitions[item->value].value = transitions_ptr - transitions; 955 PUT_TRANSITION(type_branch, item->value + 1); 956 break; 957 958 case type_plus_sign: 959 SLJIT_ASSERT(transitions[item->value].type == type_branch); 960 transitions[item->value].value = transitions_ptr - transitions; 961 break; 962 963 case type_qestion_mark: 964 PUT_TRANSITION(type_branch, item->value); 965 break; 966 967 default: 968 return transitions_ptr; 969 } 970 stack_pop(depth); 971 } 972 } 973 974 static int generate_transitions(struct compiler_common *compiler_common) 975 { 976 struct stack *stack = &compiler_common->stack; 977 struct stack *depth = &compiler_common->depth; 978 struct stack_item *transitions_ptr; 979 struct stack_item *item; 980 981 stack_init(depth); 982 compiler_common->dfa_transitions = SLJIT_MALLOC(sizeof(struct stack_item) * compiler_common->dfa_size, NULL); 983 if (!compiler_common->dfa_transitions) 984 return REGEX_MEMORY_ERROR; 985 986 /* Go through the items of the stack and generate the necessary branches and jumps (edges of DFA). */ 987 transitions_ptr = compiler_common->dfa_transitions + compiler_common->dfa_size; 988 while (stack->count > 0) { 989 item = stack_pop(stack); 990 switch (item->type) { 991 case type_begin: 992 case type_open_br: 993 item = stack_pop(depth); 994 if (item->type == type_select) 995 PUT_TRANSITION(type_branch, item->value + 1); 996 else 997 SLJIT_ASSERT(item->type == type_close_br); 998 if (stack->count == 0) 999 PUT_TRANSITION(type_begin, 0); 1000 else 1001 transitions_ptr = handle_iteratives(transitions_ptr, compiler_common->dfa_transitions, depth); 1002 break; 1003 1004 case type_end: 1005 case type_close_br: 1006 if (item->type == type_end) 1007 *--transitions_ptr = *item; 1008 if (stack_push(depth, type_close_br, transitions_ptr - compiler_common->dfa_transitions)) 1009 return REGEX_MEMORY_ERROR; 1010 break; 1011 1012 case type_select: 1013 item = stack_top(depth); 1014 if (item->type == type_select) { 1015 SLJIT_ASSERT(compiler_common->dfa_transitions[item->value].type == type_jump); 1016 PUT_TRANSITION(type_branch, item->value + 1); 1017 PUT_TRANSITION(type_jump, item->value); 1018 item->value = transitions_ptr - compiler_common->dfa_transitions; 1019 } 1020 else { 1021 SLJIT_ASSERT(item->type == type_close_br); 1022 item->type = type_select; 1023 PUT_TRANSITION(type_jump, item->value); 1024 item->value = transitions_ptr - compiler_common->dfa_transitions; 1025 } 1026 break; 1027 1028 case type_asterisk: 1029 case type_plus_sign: 1030 case type_qestion_mark: 1031 if (item->type != type_qestion_mark) 1032 PUT_TRANSITION(type_branch, 0); 1033 if (stack_push(depth, item->type, transitions_ptr - compiler_common->dfa_transitions)) 1034 return REGEX_MEMORY_ERROR; 1035 break; 1036 1037 case type_char: 1038 case type_newline: 1039 case type_rng_start: 1040 /* Requires handle_iteratives. */ 1041 *--transitions_ptr = *item; 1042 transitions_ptr = handle_iteratives(transitions_ptr, compiler_common->dfa_transitions, depth); 1043 break; 1044 1045 default: 1046 *--transitions_ptr = *item; 1047 break; 1048 } 1049 } 1050 1051 SLJIT_ASSERT(compiler_common->dfa_transitions == transitions_ptr); 1052 SLJIT_ASSERT(depth->count == 0); 1053 return REGEX_NO_ERROR; 1054 } 1055 1056 #undef PUT_TRANSITION 1057 1058 #ifdef REGEX_MATCH_VERBOSE 1059 1060 static void verbose_transitions(struct compiler_common *compiler_common) 1061 { 1062 struct stack_item *transitions_ptr = compiler_common->dfa_transitions; 1063 struct stack_item *transitions_end = transitions_ptr + compiler_common->dfa_size; 1064 struct stack_item *search_states_ptr = compiler_common->search_states; 1065 int pos; 1066 1067 printf("-----------------\nTransitions\n-----------------\n"); 1068 pos = 0; 1069 while (transitions_ptr < transitions_end) { 1070 printf("[%3d] ", pos++); 1071 if (search_states_ptr->type >= 0) 1072 printf("(%3d) ", search_states_ptr->type); 1073 switch (transitions_ptr->type) { 1074 case type_begin: 1075 printf("type_begin\n"); 1076 break; 1077 1078 case type_end: 1079 printf("type_end\n"); 1080 break; 1081 1082 case type_char: 1083 if (transitions_ptr->value >= ' ') 1084 printf("type_char '%c'\n", transitions_ptr->value); 1085 else 1086 printf("type_char 0x%x\n", transitions_ptr->value); 1087 break; 1088 1089 case type_newline: 1090 printf("type_newline %s\n", transitions_ptr->value ? "(end)" : "(begin)"); 1091 break; 1092 1093 case type_id: 1094 printf("type_id %d\n", transitions_ptr->value); 1095 break; 1096 1097 case type_rng_start: 1098 printf("type_rng_start %s\n", transitions_ptr->value ? "(invert)" : "(normal)"); 1099 break; 1100 1101 case type_rng_end: 1102 printf("type_rng_end\n"); 1103 break; 1104 1105 case type_rng_char: 1106 if (transitions_ptr->value >= ' ') 1107 printf("type_rng_char '%c'\n", transitions_ptr->value); 1108 else 1109 printf("type_rng_char 0x%x\n", transitions_ptr->value); 1110 break; 1111 1112 case type_rng_left: 1113 if (transitions_ptr->value >= ' ') 1114 printf("type_rng_left '%c'\n", transitions_ptr->value); 1115 else 1116 printf("type_rng_left 0x%x\n", transitions_ptr->value); 1117 break; 1118 1119 case type_rng_right: 1120 if (transitions_ptr->value >= ' ') 1121 printf("type_rng_right '%c'\n", transitions_ptr->value); 1122 else 1123 printf("type_rng_right 0x%x\n", transitions_ptr->value); 1124 break; 1125 1126 case type_branch: 1127 printf("type_branch -> %d\n", transitions_ptr->value); 1128 break; 1129 1130 case type_jump: 1131 printf("type_jump -> %d\n", transitions_ptr->value); 1132 break; 1133 1134 default: 1135 printf("UNEXPECTED TYPE\n"); 1136 break; 1137 } 1138 transitions_ptr++; 1139 search_states_ptr++; 1140 } 1141 printf("flags: "); 1142 if (!(compiler_common->flags & (REGEX_MATCH_BEGIN | REGEX_MATCH_END | REGEX_NEWLINE | REGEX_ID_CHECK | REGEX_FAKE_MATCH_BEGIN | REGEX_FAKE_MATCH_END))) 1143 printf("none "); 1144 if (compiler_common->flags & REGEX_MATCH_BEGIN) 1145 printf("REGEX_MATCH_BEGIN "); 1146 if (compiler_common->flags & REGEX_MATCH_END) 1147 printf("REGEX_MATCH_END "); 1148 if (compiler_common->flags & REGEX_NEWLINE) 1149 printf("REGEX_NEWLINE "); 1150 if (compiler_common->flags & REGEX_ID_CHECK) 1151 printf("REGEX_ID_CHECK "); 1152 if (compiler_common->flags & REGEX_FAKE_MATCH_BEGIN) 1153 printf("REGEX_FAKE_MATCH_BEGIN "); 1154 if (compiler_common->flags & REGEX_FAKE_MATCH_END) 1155 printf("REGEX_FAKE_MATCH_END "); 1156 if (compiler_common->longest_range_size > 0) 1157 printf("(longest range: %ld) ", (long)compiler_common->longest_range_size); 1158 printf("\n"); 1159 } 1160 1161 #endif 1162 1163 /* --------------------------------------------------------------------- */ 1164 /* Utilities */ 1165 /* --------------------------------------------------------------------- */ 1166 1167 static int generate_search_states(struct compiler_common *compiler_common) 1168 { 1169 struct stack_item *transitions_ptr = compiler_common->dfa_transitions; 1170 struct stack_item *transitions_end = transitions_ptr + compiler_common->dfa_size; 1171 struct stack_item *search_states_ptr; 1172 struct stack_item *rng_start = NULL; 1173 1174 compiler_common->terms_size = !(compiler_common->flags & REGEX_FAKE_MATCH_END) ? 1 : 2; 1175 compiler_common->longest_range_size = 0; 1176 compiler_common->search_states = SLJIT_MALLOC(sizeof(struct stack_item) * compiler_common->dfa_size, NULL); 1177 if (!compiler_common->search_states) 1178 return REGEX_MEMORY_ERROR; 1179 1180 search_states_ptr = compiler_common->search_states; 1181 while (transitions_ptr < transitions_end) { 1182 switch (transitions_ptr->type) { 1183 case type_begin: 1184 case type_end: 1185 search_states_ptr->type = 0; 1186 break; 1187 1188 case type_char: 1189 search_states_ptr->type = compiler_common->terms_size++; 1190 break; 1191 1192 case type_newline: 1193 if (transitions_ptr->value) 1194 search_states_ptr->type = 1; 1195 else 1196 search_states_ptr->type = compiler_common->terms_size++; 1197 SLJIT_ASSERT(search_states_ptr->type == 1 || search_states_ptr->type == 2); 1198 break; 1199 1200 case type_id: 1201 if (transitions_ptr->value > 0) 1202 compiler_common->flags |= REGEX_ID_CHECK; 1203 search_states_ptr->type = -1; 1204 break; 1205 1206 case type_rng_start: 1207 search_states_ptr->type = compiler_common->terms_size; 1208 rng_start = search_states_ptr; 1209 break; 1210 1211 case type_rng_end: 1212 search_states_ptr->type = compiler_common->terms_size++; 1213 /* Ok, this is a blunt over estimation :) */ 1214 if (compiler_common->longest_range_size < search_states_ptr - rng_start) 1215 compiler_common->longest_range_size = search_states_ptr - rng_start; 1216 break; 1217 1218 default: 1219 search_states_ptr->type = -1; 1220 break; 1221 } 1222 search_states_ptr->value = -1; 1223 search_states_ptr++; 1224 transitions_ptr++; 1225 } 1226 return REGEX_NO_ERROR; 1227 } 1228 1229 static int trace_transitions(int from, struct compiler_common *compiler_common) 1230 { 1231 int id = 0; 1232 struct stack *stack = &compiler_common->stack; 1233 struct stack *depth = &compiler_common->depth; 1234 struct stack_item *dfa_transitions = compiler_common->dfa_transitions; 1235 struct stack_item *search_states = compiler_common->search_states; 1236 1237 SLJIT_ASSERT(search_states[from].type >= 0); 1238 1239 from++; 1240 1241 /* Be prepared for any paths (loops, etc). */ 1242 while (1) { 1243 if (dfa_transitions[from].type == type_id) 1244 if (id < dfa_transitions[from].value) 1245 id = dfa_transitions[from].value; 1246 1247 if (search_states[from].value < id) { 1248 /* Forward step. */ 1249 if (search_states[from].value == -1) 1250 if (stack_push(stack, 0, from)) 1251 return REGEX_MEMORY_ERROR; 1252 search_states[from].value = id; 1253 1254 if (dfa_transitions[from].type == type_branch) { 1255 if (stack_push(depth, id, from)) 1256 return REGEX_MEMORY_ERROR; 1257 from++; 1258 continue; 1259 } 1260 else if (dfa_transitions[from].type == type_jump) { 1261 from = dfa_transitions[from].value; 1262 continue; 1263 } 1264 else if (search_states[from].type < 0) { 1265 from++; 1266 continue; 1267 } 1268 } 1269 1270 /* Back tracking. */ 1271 if (depth->count > 0) { 1272 id = stack_top(depth)->type; 1273 from = dfa_transitions[stack_pop(depth)->value].value; 1274 continue; 1275 } 1276 return 0; 1277 } 1278 } 1279 1280 /* --------------------------------------------------------------------- */ 1281 /* Code generator */ 1282 /* --------------------------------------------------------------------- */ 1283 1284 #define TERM_OFFSET_OF(index, offs) (((index) * no_states + (offs)) * sizeof(sljit_sw)) 1285 #define TERM_REL_OFFSET_OF(base, offs) ((base) + ((offs) * sizeof(sljit_sw))) 1286 1287 #define EMIT_OP1(type, arg1, arg2, arg3, arg4) \ 1288 CHECK(sljit_emit_op1(compiler, type, arg1, arg2, arg3, arg4)) 1289 1290 #define EMIT_OP2(type, arg1, arg2, arg3, arg4, arg5, arg6) \ 1291 CHECK(sljit_emit_op2(compiler, type, arg1, arg2, arg3, arg4, arg5, arg6)) 1292 1293 #define EMIT_LABEL(label) \ 1294 label = sljit_emit_label(compiler); \ 1295 CHECK(!label) 1296 1297 #define EMIT_JUMP(jump, type) \ 1298 jump = sljit_emit_jump(compiler, type); \ 1299 CHECK(!jump) 1300 1301 #define EMIT_CMP(jump, type, arg1, arg2, arg3, arg4) \ 1302 jump = sljit_emit_cmp(compiler, type, arg1, arg2, arg3, arg4); \ 1303 CHECK(!jump) 1304 1305 /* CHECK depends on the use case. */ 1306 1307 #define CHECK(exp) \ 1308 if (SLJIT_UNLIKELY(exp)) \ 1309 return REGEX_MEMORY_ERROR 1310 1311 static int compile_uncond_tran(struct compiler_common *compiler_common, int reg) 1312 { 1313 struct sljit_compiler *compiler = compiler_common->compiler; 1314 struct stack *stack = &compiler_common->stack; 1315 struct stack_item *search_states = compiler_common->search_states; 1316 int flags = compiler_common->flags; 1317 sljit_sw no_states = compiler_common->no_states; 1318 sljit_uw head = 0; 1319 sljit_sw offset, value; 1320 1321 if (reg != R_CURR_STATE || !(compiler_common->flags & REGEX_FAKE_MATCH_BEGIN)) { 1322 CHECK(trace_transitions(0, compiler_common)); 1323 } 1324 else { 1325 CHECK(trace_transitions(1, compiler_common)); 1326 } 1327 1328 while (stack->count > 0) { 1329 value = stack_pop(stack)->value; 1330 if (search_states[value].type >= 0) { 1331 offset = TERM_OFFSET_OF(search_states[value].type, 0); 1332 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 1), SLJIT_IMM, head); 1333 if (offset > 0) 1334 head = offset; 1335 1336 if (!(flags & REGEX_MATCH_BEGIN)) { 1337 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 2), R_TEMP, 0); 1338 if (flags & REGEX_ID_CHECK) { 1339 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 3), SLJIT_IMM, search_states[value].value); 1340 } 1341 } 1342 else if (flags & REGEX_ID_CHECK) { 1343 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 2), SLJIT_IMM, search_states[value].value); 1344 } 1345 } 1346 search_states[value].value = -1; 1347 } 1348 if (reg == R_NEXT_STATE) { 1349 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_NEXT_HEAD, 0); 1350 } 1351 else if (flags & REGEX_FAKE_MATCH_BEGIN) { 1352 SLJIT_ASSERT(compiler_common->dfa_transitions[1].type == type_newline && !compiler_common->dfa_transitions[1].value); 1353 offset = TERM_OFFSET_OF(search_states[1].type, 0); 1354 1355 SLJIT_ASSERT(!(flags & REGEX_MATCH_BEGIN)); 1356 1357 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 1), SLJIT_IMM, head); 1358 head = offset; 1359 1360 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 2), SLJIT_IMM, 1); 1361 if (flags & REGEX_ID_CHECK) { 1362 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 3), SLJIT_IMM, 0); 1363 } 1364 } 1365 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, head); 1366 return REGEX_NO_ERROR; 1367 } 1368 1369 static int compile_cond_tran(struct compiler_common *compiler_common, sljit_sw curr_index) 1370 { 1371 struct sljit_compiler *compiler = compiler_common->compiler; 1372 struct stack *stack = &compiler_common->stack; 1373 struct stack_item *search_states = compiler_common->search_states; 1374 sljit_sw offset; 1375 int flags; 1376 sljit_sw no_states; 1377 sljit_sw value; 1378 struct sljit_jump *jump1; 1379 struct sljit_jump *jump2; 1380 struct sljit_jump *jump3; 1381 struct sljit_jump *jump4; 1382 struct sljit_jump *jump5; 1383 struct sljit_label *label1; 1384 1385 flags = compiler_common->flags; 1386 no_states = compiler_common->no_states; 1387 1388 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_forward), SLJIT_IMM, 0); 1389 if (!(flags & (REGEX_ID_CHECK | REGEX_MATCH_BEGIN))) { 1390 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 2)); 1391 } 1392 1393 while (stack->count > 0) { 1394 value = stack_pop(stack)->value; 1395 if (search_states[value].type >= 0) { 1396 #ifdef REGEX_MATCH_VERBOSE 1397 if (flags & REGEX_MATCH_VERBOSE) 1398 printf("-> (%3d:%3d) ", search_states[value].type, search_states[value].value); 1399 #endif 1400 offset = TERM_OFFSET_OF(search_states[value].type, 0); 1401 1402 if (!(flags & REGEX_ID_CHECK)) { 1403 if (!(flags & REGEX_MATCH_BEGIN)) { 1404 /* Check whether item is inserted. */ 1405 EMIT_CMP(jump1, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), SLJIT_IMM, -1); 1406 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), R_NEXT_HEAD, 0); 1407 if (offset > 0) { 1408 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, offset); 1409 } 1410 EMIT_JUMP(jump2, SLJIT_JUMP); 1411 1412 /* Check whether old index <= index. */ 1413 EMIT_LABEL(label1); 1414 sljit_set_label(jump1, label1); 1415 1416 EMIT_CMP(jump1, SLJIT_LESS_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_sw), R_TEMP, 0); 1417 1418 EMIT_LABEL(label1); 1419 sljit_set_label(jump2, label1); 1420 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_sw), R_TEMP, 0); 1421 1422 EMIT_LABEL(label1); 1423 sljit_set_label(jump1, label1); 1424 } 1425 else { 1426 /* Check whether item is inserted. */ 1427 EMIT_CMP(jump1, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), SLJIT_IMM, -1); 1428 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), R_NEXT_HEAD, 0); 1429 if (offset > 0) { 1430 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, offset); 1431 } 1432 EMIT_LABEL(label1); 1433 sljit_set_label(jump1, label1); 1434 } 1435 } 1436 else { 1437 if (!(flags & REGEX_MATCH_BEGIN)) { 1438 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 2)); 1439 1440 /* Check whether item is inserted. */ 1441 EMIT_CMP(jump1, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), SLJIT_IMM, -1); 1442 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), R_NEXT_HEAD, 0); 1443 if (offset > 0) { 1444 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, offset); 1445 } 1446 EMIT_JUMP(jump2, SLJIT_JUMP); 1447 1448 /* Check whether old index != index. */ 1449 EMIT_LABEL(label1); 1450 sljit_set_label(jump1, label1); 1451 1452 EMIT_OP2(SLJIT_SUB | SLJIT_SET_U, SLJIT_UNUSED, 0, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_sw), R_TEMP, 0); 1453 EMIT_JUMP(jump1, SLJIT_LESS); 1454 EMIT_JUMP(jump3, SLJIT_GREATER); 1455 1456 /* Old index == index. */ 1457 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 3)); 1458 if (search_states[value].value > 0) { 1459 EMIT_CMP(jump4, SLJIT_GREATER, R_TEMP, 0, SLJIT_IMM, search_states[value].value); 1460 1461 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_IMM, search_states[value].value); 1462 EMIT_LABEL(label1); 1463 sljit_set_label(jump4, label1); 1464 } 1465 1466 EMIT_OP2(SLJIT_SUB | SLJIT_SET_U, SLJIT_UNUSED, 0, SLJIT_MEM1(R_NEXT_STATE), offset + 3 * sizeof(sljit_sw), R_TEMP, 0); 1467 EMIT_JUMP(jump4, SLJIT_GREATER_EQUAL); 1468 EMIT_JUMP(jump5, SLJIT_JUMP); 1469 1470 /* Overwrite index & id. */ 1471 EMIT_LABEL(label1); 1472 sljit_set_label(jump3, label1); 1473 sljit_set_label(jump2, label1); 1474 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_sw), R_TEMP, 0); 1475 1476 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 3)); 1477 if (search_states[value].value > 0) { 1478 EMIT_CMP(jump3, SLJIT_GREATER, R_TEMP, 0, SLJIT_IMM, search_states[value].value); 1479 1480 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_IMM, search_states[value].value); 1481 EMIT_LABEL(label1); 1482 sljit_set_label(jump3, label1); 1483 } 1484 1485 EMIT_LABEL(label1); 1486 sljit_set_label(jump5, label1); 1487 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + 3 * sizeof(sljit_sw), R_TEMP, 0); 1488 1489 /* Exit. */ 1490 EMIT_LABEL(label1); 1491 sljit_set_label(jump1, label1); 1492 sljit_set_label(jump4, label1); 1493 } 1494 else { 1495 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 2)); 1496 1497 if (search_states[value].value > 0) { 1498 EMIT_CMP(jump1, SLJIT_GREATER, R_TEMP, 0, SLJIT_IMM, search_states[value].value); 1499 1500 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_IMM, search_states[value].value); 1501 EMIT_LABEL(label1); 1502 sljit_set_label(jump1, label1); 1503 } 1504 1505 /* Check whether item is inserted. */ 1506 EMIT_CMP(jump1, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), SLJIT_IMM, -1); 1507 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), R_NEXT_HEAD, 0); 1508 if (offset > 0) { 1509 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, offset); 1510 } 1511 EMIT_JUMP(jump2, SLJIT_JUMP); 1512 1513 /* Check whether old id >= id. */ 1514 EMIT_LABEL(label1); 1515 sljit_set_label(jump1, label1); 1516 1517 EMIT_CMP(jump1, SLJIT_GREATER_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_sw), R_TEMP, 0); 1518 1519 EMIT_LABEL(label1); 1520 sljit_set_label(jump2, label1); 1521 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_sw), R_TEMP, 0); 1522 1523 EMIT_LABEL(label1); 1524 sljit_set_label(jump1, label1); 1525 } 1526 } 1527 } 1528 search_states[value].value = -1; 1529 } 1530 1531 #ifdef REGEX_MATCH_VERBOSE 1532 if (flags & REGEX_MATCH_VERBOSE) 1533 printf("\n"); 1534 #endif 1535 return REGEX_NO_ERROR; 1536 } 1537 1538 static int compile_end_check(struct compiler_common *compiler_common, struct sljit_label *end_check_label) 1539 { 1540 struct sljit_compiler *compiler = compiler_common->compiler; 1541 struct sljit_jump *jump; 1542 struct sljit_jump *clear_states_jump; 1543 struct sljit_label *label; 1544 struct sljit_label *leave_label; 1545 struct sljit_label *begin_loop_label; 1546 1547 /* Priority order: best_begin > best_end > best_id. 1548 In other words: 1549 if (new best_begin > old test_begin) do nothing 1550 otherwise we know that new_end > old_end, since R_CURR_INDEX ever increasing 1551 therefore we must overwrite all best_* variables (new_id also contains the highest id for this turn). */ 1552 1553 /* Both R_CURR_CHAR and R_BEST_BEGIN used as temporary registers. */ 1554 1555 if (!(compiler_common->flags & REGEX_MATCH_BEGIN)) { 1556 EMIT_OP1(SLJIT_MOV, R_CURR_CHAR, 0, SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 2)); 1557 EMIT_CMP(jump, !(compiler_common->flags & REGEX_MATCH_NON_GREEDY) ? SLJIT_LESS : SLJIT_LESS_EQUAL, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin), R_CURR_CHAR, 0); 1558 sljit_set_label(jump, end_check_label); 1559 1560 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin), R_CURR_CHAR, 0); 1561 if (!(compiler_common->flags & (REGEX_FAKE_MATCH_BEGIN | REGEX_FAKE_MATCH_END))) { 1562 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0); 1563 } 1564 else { 1565 if ((compiler_common->flags & (REGEX_FAKE_MATCH_BEGIN | REGEX_FAKE_MATCH_END)) == (REGEX_FAKE_MATCH_BEGIN | REGEX_FAKE_MATCH_END)) { 1566 EMIT_OP2(SLJIT_SUB, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0, SLJIT_IMM, 2); 1567 } 1568 else { 1569 EMIT_OP2(SLJIT_SUB, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0, SLJIT_IMM, 1); 1570 } 1571 } 1572 if (compiler_common->flags & REGEX_ID_CHECK) { 1573 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 3)); 1574 } 1575 1576 EMIT_CMP(clear_states_jump, SLJIT_LESS, R_CURR_CHAR, 0, R_BEST_BEGIN, 0); 1577 1578 EMIT_LABEL(leave_label); 1579 EMIT_OP1(SLJIT_MOV, R_BEST_BEGIN, 0, R_CURR_CHAR, 0); 1580 EMIT_JUMP(jump, SLJIT_JUMP); 1581 sljit_set_label(jump, end_check_label); 1582 1583 /* A loop to clear all states, which are > (or >=) than R_CURR_CHAR. */ 1584 EMIT_LABEL(label); 1585 sljit_set_label(clear_states_jump, label); 1586 1587 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_NEXT_HEAD, 0); 1588 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, 0); 1589 1590 /* Begin of the loop. */ 1591 EMIT_LABEL(begin_loop_label); 1592 EMIT_CMP(jump, SLJIT_EQUAL, R_TEMP, 0, SLJIT_IMM, 0); 1593 sljit_set_label(jump, leave_label); 1594 1595 EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, R_CURR_STATE, 0); 1596 EMIT_OP1(SLJIT_MOV, R_BEST_BEGIN, 0, SLJIT_MEM1(R_TEMP), sizeof(sljit_sw)); 1597 EMIT_CMP(clear_states_jump, !(compiler_common->flags & REGEX_MATCH_NON_GREEDY) ? SLJIT_GREATER : SLJIT_GREATER_EQUAL, SLJIT_MEM1(R_TEMP), 2 * sizeof(sljit_sw), R_CURR_CHAR, 0); 1598 1599 /* Case 1: keep this case. */ 1600 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_TEMP), sizeof(sljit_sw), R_NEXT_HEAD, 0); 1601 EMIT_OP2(SLJIT_SUB, R_NEXT_HEAD, 0, R_TEMP, 0, R_CURR_STATE, 0); 1602 1603 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_BEST_BEGIN, 0); 1604 EMIT_JUMP(jump, SLJIT_JUMP); 1605 sljit_set_label(jump, begin_loop_label); 1606 1607 /* Case 2: remove this case. */ 1608 EMIT_LABEL(label); 1609 sljit_set_label(clear_states_jump, label); 1610 1611 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_TEMP), sizeof(sljit_sw), SLJIT_IMM, -1); 1612 1613 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_BEST_BEGIN, 0); 1614 EMIT_JUMP(jump, SLJIT_JUMP); 1615 sljit_set_label(jump, begin_loop_label); 1616 } 1617 else { 1618 EMIT_OP1(SLJIT_MOV, R_BEST_BEGIN, 0, SLJIT_IMM, 0); 1619 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin), SLJIT_IMM, 0); 1620 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0); 1621 if (compiler_common->flags & REGEX_ID_CHECK) { 1622 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 2)); 1623 } 1624 EMIT_JUMP(jump, SLJIT_JUMP); 1625 sljit_set_label(jump, end_check_label); 1626 } 1627 return REGEX_NO_ERROR; 1628 } 1629 1630 static int compile_leave_fast_forward(struct compiler_common *compiler_common, struct sljit_label *fast_forward_label) 1631 { 1632 struct sljit_compiler *compiler = compiler_common->compiler; 1633 struct stack *stack = &compiler_common->stack; 1634 struct stack_item *dfa_transitions = compiler_common->dfa_transitions; 1635 struct stack_item *search_states = compiler_common->search_states; 1636 int ind; 1637 struct sljit_jump *jump; 1638 int init_range = 1, prev_value = 0; 1639 1640 while (stack->count > 0) { 1641 ind = stack_pop(stack)->value; 1642 search_states[ind].value = -1; 1643 if (search_states[ind].type >= 0) { 1644 if (dfa_transitions[ind].type == type_char) { 1645 EMIT_CMP(jump, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, dfa_transitions[ind].value); 1646 sljit_set_label(jump, fast_forward_label); 1647 } 1648 else if (dfa_transitions[ind].type == type_rng_start) { 1649 SLJIT_ASSERT(!dfa_transitions[ind].value); 1650 ind++; 1651 while (dfa_transitions[ind].type != type_rng_end) { 1652 if (dfa_transitions[ind].type == type_rng_char) { 1653 EMIT_CMP(jump, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, dfa_transitions[ind].value); 1654 sljit_set_label(jump, fast_forward_label); 1655 } 1656 else { 1657 SLJIT_ASSERT(dfa_transitions[ind].type == type_rng_left); 1658 if (init_range) { 1659 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_CURR_CHAR, 0); 1660 init_range = 0; 1661 } 1662 if (dfa_transitions[ind].value != prev_value) { 1663 /* Best compatibility to all archs. */ 1664 prev_value -= dfa_transitions[ind].value; 1665 if (prev_value < 0) { 1666 EMIT_OP2(SLJIT_SUB, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, -prev_value); 1667 } 1668 else { 1669 EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, prev_value); 1670 } 1671 prev_value = dfa_transitions[ind].value; 1672 } 1673 EMIT_CMP(jump, SLJIT_LESS_EQUAL, R_TEMP, 0, SLJIT_IMM, dfa_transitions[ind + 1].value - dfa_transitions[ind].value); 1674 sljit_set_label(jump, fast_forward_label); 1675 ind++; 1676 } 1677 ind++; 1678 } 1679 } 1680 else { 1681 SLJIT_ASSERT(dfa_transitions[ind].type == type_newline); 1682 EMIT_CMP(jump, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, '\n'); 1683 sljit_set_label(jump, fast_forward_label); 1684 EMIT_CMP(jump, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, '\r'); 1685 sljit_set_label(jump, fast_forward_label); 1686 } 1687 } 1688 } 1689 return REGEX_NO_ERROR; 1690 } 1691 1692 static int compile_newline_check(struct compiler_common *compiler_common, sljit_sw ind) 1693 { 1694 struct sljit_compiler *compiler = compiler_common->compiler; 1695 struct sljit_jump *jump1; 1696 struct sljit_jump *jump2; 1697 struct sljit_label *label; 1698 sljit_sw no_states; 1699 sljit_sw offset; 1700 1701 /* Check whether a new-line character is found. */ 1702 EMIT_CMP(jump1, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, '\n'); 1703 EMIT_CMP(jump2, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, '\r'); 1704 1705 no_states = compiler_common->no_states; 1706 offset = TERM_OFFSET_OF(compiler_common->search_states[ind].type, 1); 1707 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), offset); 1708 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_CURR_STATE), offset, SLJIT_IMM, -1); 1709 CHECK(sljit_emit_ijump(compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0)); 1710 1711 EMIT_LABEL(label); 1712 sljit_set_label(jump1, label); 1713 sljit_set_label(jump2, label); 1714 return REGEX_NO_ERROR; 1715 } 1716 1717 #undef CHECK 1718 1719 #define CHECK(exp) \ 1720 if (SLJIT_UNLIKELY(exp)) \ 1721 return 0 1722 1723 static SLJIT_INLINE void range_set_label(struct sljit_jump **range_jump_list, struct sljit_label *label) 1724 { 1725 while (*range_jump_list) { 1726 sljit_set_label(*range_jump_list, label); 1727 range_jump_list++; 1728 } 1729 } 1730 1731 static sljit_sw compile_range_check(struct compiler_common *compiler_common, sljit_sw ind) 1732 { 1733 struct sljit_compiler *compiler = compiler_common->compiler; 1734 struct stack_item *dfa_transitions = compiler_common->dfa_transitions; 1735 struct sljit_jump **range_jump_list = compiler_common->range_jump_list; 1736 int invert = dfa_transitions[ind].value; 1737 struct sljit_label *label; 1738 sljit_sw no_states; 1739 sljit_sw offset; 1740 int init_range = 1, prev_value = 0; 1741 1742 ind++; 1743 1744 while (dfa_transitions[ind].type != type_rng_end) { 1745 if (dfa_transitions[ind].type == type_rng_char) { 1746 EMIT_CMP(*range_jump_list, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, dfa_transitions[ind].value); 1747 range_jump_list++; 1748 } 1749 else { 1750 SLJIT_ASSERT(dfa_transitions[ind].type == type_rng_left); 1751 if (init_range) { 1752 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_CURR_CHAR, 0); 1753 init_range = 0; 1754 } 1755 if (dfa_transitions[ind].value != prev_value) { 1756 /* Best compatibility to all archs. */ 1757 prev_value -= dfa_transitions[ind].value; 1758 if (prev_value < 0) { 1759 EMIT_OP2(SLJIT_SUB, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, -prev_value); 1760 } 1761 else { 1762 EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, prev_value); 1763 } 1764 prev_value = dfa_transitions[ind].value; 1765 } 1766 EMIT_CMP(*range_jump_list, SLJIT_LESS_EQUAL, R_TEMP, 0, SLJIT_IMM, dfa_transitions[ind + 1].value - dfa_transitions[ind].value); 1767 range_jump_list++; 1768 ind++; 1769 } 1770 ind++; 1771 } 1772 1773 *range_jump_list = NULL; 1774 1775 if (!invert) { 1776 no_states = compiler_common->no_states; 1777 offset = TERM_OFFSET_OF(compiler_common->search_states[ind].type, 1); 1778 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), offset); 1779 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_CURR_STATE), offset, SLJIT_IMM, -1); 1780 CHECK(sljit_emit_ijump(compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0)); 1781 1782 EMIT_LABEL(label); 1783 range_set_label(compiler_common->range_jump_list, label); 1784 /* Clears the jump list. */ 1785 *compiler_common->range_jump_list = NULL; 1786 } 1787 return ind; 1788 } 1789 1790 #undef TERM_OFFSET_OF 1791 #undef EMIT_OP1 1792 #undef EMIT_OP2 1793 #undef EMIT_LABEL 1794 #undef EMIT_JUMP 1795 #undef EMIT_CMP 1796 #undef CHECK 1797 1798 /* --------------------------------------------------------------------- */ 1799 /* Main compiler */ 1800 /* --------------------------------------------------------------------- */ 1801 1802 #define TERM_OFFSET_OF(ind, offs) (((ind) * compiler_common.no_states + (offs)) * sizeof(sljit_sw)) 1803 1804 #define EMIT_OP1(type, arg1, arg2, arg3, arg4) \ 1805 CHECK(sljit_emit_op1(compiler_common.compiler, type, arg1, arg2, arg3, arg4)) 1806 1807 #define EMIT_OP2(type, arg1, arg2, arg3, arg4, arg5, arg6) \ 1808 CHECK(sljit_emit_op2(compiler_common.compiler, type, arg1, arg2, arg3, arg4, arg5, arg6)) 1809 1810 #define EMIT_LABEL(label) \ 1811 label = sljit_emit_label(compiler_common.compiler); \ 1812 CHECK(!label) 1813 1814 #define EMIT_JUMP(jump, type) \ 1815 jump = sljit_emit_jump(compiler_common.compiler, type); \ 1816 CHECK(!jump) 1817 1818 #define EMIT_CMP(jump, type, arg1, arg2, arg3, arg4) \ 1819 jump = sljit_emit_cmp(compiler_common.compiler, type, arg1, arg2, arg3, arg4); \ 1820 CHECK(!jump) 1821 1822 /* A do {} while(0) expression helps to avoid goto statements. */ 1823 #define BEGIN_GUARD \ 1824 do { 1825 1826 #define END_GUARD \ 1827 } while(0); 1828 1829 #define CHECK(exp) \ 1830 if (SLJIT_UNLIKELY(exp)) \ 1831 break; 1832 1833 struct regex_machine* regex_compile(const regex_char_t *regex_string, int length, int re_flags, int *error) 1834 { 1835 struct compiler_common compiler_common; 1836 sljit_sw ind; 1837 int error_code, done, suggest_fast_forward; 1838 /* ID of an empty match (-1 if not reachable). */ 1839 int empty_match_id; 1840 1841 struct sljit_jump *jump; 1842 struct sljit_jump *best_match_found_jump; 1843 struct sljit_jump *fast_forward_jump = NULL; 1844 struct sljit_jump *length_is_zero_jump; 1845 struct sljit_jump *end_check_jump = NULL; 1846 struct sljit_jump *best_match_check_jump = NULL; 1847 struct sljit_jump *non_greedy_end_jump = NULL; 1848 struct sljit_label *label; 1849 struct sljit_label *end_check_label = NULL; 1850 struct sljit_label *start_label; 1851 struct sljit_label *fast_forward_label; 1852 struct sljit_label *fast_forward_return_label; 1853 1854 if (error) 1855 *error = REGEX_NO_ERROR; 1856 #ifdef REGEX_MATCH_VERBOSE 1857 compiler_common.flags = re_flags & (REGEX_MATCH_BEGIN | REGEX_MATCH_END | REGEX_MATCH_NON_GREEDY | REGEX_NEWLINE | REGEX_MATCH_VERBOSE); 1858 #else 1859 compiler_common.flags = re_flags & (REGEX_MATCH_BEGIN | REGEX_MATCH_END | REGEX_MATCH_NON_GREEDY | REGEX_NEWLINE); 1860 #endif 1861 1862 /* Step 1: parsing (Left->Right). 1863 Syntax check and AST generator. */ 1864 error_code = parse(regex_string, length, &compiler_common); 1865 if (error_code) { 1866 stack_destroy(&compiler_common.stack); 1867 if (error) 1868 *error = error_code; 1869 return NULL; 1870 } 1871 1872 /* Step 2: generating branches (Right->Left). */ 1873 error_code = generate_transitions(&compiler_common); 1874 stack_destroy(&compiler_common.stack); 1875 stack_destroy(&compiler_common.depth); 1876 if (error_code) { 1877 if (compiler_common.dfa_transitions) 1878 SLJIT_FREE(compiler_common.dfa_transitions, NULL); 1879 if (error) 1880 *error = error_code; 1881 return NULL; 1882 } 1883 1884 /* Step 3: Generate necessary data for depth-first search (Left->Right). */ 1885 error_code = generate_search_states(&compiler_common); 1886 if (error_code) { 1887 SLJIT_FREE(compiler_common.dfa_transitions, NULL); 1888 if (error) 1889 *error = error_code; 1890 return NULL; 1891 } 1892 1893 #ifdef REGEX_MATCH_VERBOSE 1894 if (compiler_common.flags & REGEX_MATCH_VERBOSE) 1895 verbose_transitions(&compiler_common); 1896 #endif 1897 1898 /* Step 4: Left->Right generate code. */ 1899 stack_init(&compiler_common.stack); 1900 stack_init(&compiler_common.depth); 1901 done = 0; 1902 compiler_common.machine = NULL; 1903 compiler_common.compiler = NULL; 1904 compiler_common.range_jump_list = NULL; 1905 1906 BEGIN_GUARD 1907 1908 compiler_common.machine = (struct regex_machine*)SLJIT_MALLOC(sizeof(struct regex_machine) + (compiler_common.terms_size - 1) * sizeof(sljit_uw), NULL); 1909 CHECK(!compiler_common.machine); 1910 1911 compiler_common.compiler = sljit_create_compiler(NULL); 1912 CHECK(!compiler_common.compiler); 1913 1914 if (compiler_common.longest_range_size > 0) { 1915 compiler_common.range_jump_list = (struct sljit_jump**)SLJIT_MALLOC(sizeof(struct sljit_jump*) * compiler_common.longest_range_size, NULL); 1916 CHECK(!compiler_common.range_jump_list); 1917 } 1918 1919 if ((compiler_common.flags & REGEX_ID_CHECK) && !(compiler_common.flags & REGEX_MATCH_BEGIN)) 1920 compiler_common.no_states = 4; 1921 else if (!(compiler_common.flags & REGEX_ID_CHECK) && (compiler_common.flags & REGEX_MATCH_BEGIN)) 1922 compiler_common.no_states = 2; 1923 else 1924 compiler_common.no_states = 3; 1925 1926 compiler_common.machine->flags = compiler_common.flags; 1927 compiler_common.machine->no_states = compiler_common.no_states; 1928 compiler_common.machine->size = compiler_common.machine->no_states * compiler_common.terms_size; 1929 1930 /* Study the regular expression. */ 1931 empty_match_id = -1; 1932 suggest_fast_forward = 1; 1933 if (!(compiler_common.flags & REGEX_FAKE_MATCH_BEGIN)) { 1934 CHECK(trace_transitions(0, &compiler_common)); 1935 while (compiler_common.stack.count > 0) { 1936 ind = stack_pop(&compiler_common.stack)->value; 1937 if (compiler_common.search_states[ind].type == 0) { 1938 SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type == type_end); 1939 suggest_fast_forward = 0; 1940 empty_match_id = compiler_common.search_states[ind].value; 1941 } 1942 else if (compiler_common.search_states[ind].type > 0) { 1943 SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type != type_end); 1944 if (compiler_common.dfa_transitions[ind].type == type_rng_start && compiler_common.dfa_transitions[ind].value) 1945 suggest_fast_forward = 0; 1946 } 1947 compiler_common.search_states[ind].value = -1; 1948 } 1949 } 1950 else { 1951 SLJIT_ASSERT(compiler_common.dfa_transitions[1].type == type_newline); 1952 CHECK(trace_transitions(1, &compiler_common)); 1953 while (compiler_common.stack.count > 0) { 1954 ind = stack_pop(&compiler_common.stack)->value; 1955 if (compiler_common.search_states[ind].type == 0) { 1956 SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type == type_end); 1957 suggest_fast_forward = 0; 1958 empty_match_id = compiler_common.search_states[ind].value; 1959 } 1960 compiler_common.search_states[ind].value = -1; 1961 } 1962 } 1963 1964 /* Step 4.1: Generate entry. */ 1965 CHECK(sljit_emit_enter(compiler_common.compiler, 0, 3, 5, 5, 0, 0, 0)); 1966 1967 /* Copy arguments to their place. */ 1968 EMIT_OP1(SLJIT_MOV, R_REGEX_MATCH, 0, SLJIT_S0, 0); 1969 EMIT_OP1(SLJIT_MOV, R_STRING, 0, SLJIT_S1, 0); 1970 EMIT_OP2(SLJIT_ADD, R_LENGTH, 0, SLJIT_S2, 0, SLJIT_IMM, 1); 1971 1972 /* Init global registers. */ 1973 EMIT_OP1(SLJIT_MOV, R_CURR_STATE, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, current)); 1974 EMIT_OP1(SLJIT_MOV, R_NEXT_STATE, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, next)); 1975 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, head)); 1976 EMIT_OP1(SLJIT_MOV, R_BEST_BEGIN, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin)); 1977 EMIT_OP1(SLJIT_MOV, R_CURR_INDEX, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, index)); 1978 1979 /* Check whether the best match has already found in a previous frame. */ 1980 EMIT_CMP(jump, SLJIT_EQUAL, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_quit), SLJIT_IMM, 0); 1981 EMIT_JUMP(best_match_found_jump, SLJIT_JUMP); 1982 1983 #ifdef REGEX_MATCH_VERBOSE 1984 if (compiler_common.flags & REGEX_MATCH_VERBOSE) 1985 printf("\n-----------------\nTrace\n-----------------\n"); 1986 #endif 1987 1988 /* Step 4.2: Generate code for state 0. */ 1989 EMIT_LABEL(label); 1990 compiler_common.machine->entry_addrs[0] = (sljit_uw)label; 1991 1992 /* Swapping current and next. */ 1993 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_CURR_STATE, 0); 1994 EMIT_OP1(SLJIT_MOV, R_CURR_STATE, 0, R_NEXT_STATE, 0); 1995 EMIT_OP1(SLJIT_MOV, R_NEXT_STATE, 0, R_TEMP, 0); 1996 1997 /* Checking whether the best case needs to be updated. */ 1998 if (!(compiler_common.flags & REGEX_MATCH_END)) { 1999 EMIT_CMP(end_check_jump, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 1), SLJIT_IMM, -1); 2000 EMIT_LABEL(end_check_label); 2001 } 2002 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), TERM_REL_OFFSET_OF(0, 1), SLJIT_IMM, -1); 2003 EMIT_OP2(SLJIT_ADD, R_CURR_INDEX, 0, R_CURR_INDEX, 0, SLJIT_IMM, 1); 2004 2005 /* Checking whether best case has already found. */ 2006 if (!(compiler_common.flags & REGEX_MATCH_END) || (compiler_common.flags & REGEX_MATCH_BEGIN)) { 2007 if (!(compiler_common.flags & REGEX_MATCH_BEGIN)) { 2008 /* We can bail out if no more active states remain and R_BEST_BEGIN != -1. */ 2009 EMIT_CMP(best_match_check_jump, SLJIT_NOT_EQUAL, R_BEST_BEGIN, 0, SLJIT_IMM, -1); 2010 } 2011 else { 2012 /* We can bail out if no more active states remain (regardless of R_BEST_BEGIN). */ 2013 EMIT_CMP(best_match_check_jump, SLJIT_EQUAL, R_NEXT_HEAD, 0, SLJIT_IMM, 0); 2014 } 2015 } 2016 2017 EMIT_LABEL(start_label); 2018 sljit_set_label(jump, start_label); 2019 2020 if (!(compiler_common.flags & REGEX_MATCH_BEGIN) && suggest_fast_forward) { 2021 EMIT_CMP(fast_forward_jump, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_forward), SLJIT_IMM, 0); 2022 } 2023 2024 /* Loading the next character. */ 2025 EMIT_OP2(SLJIT_SUB | SLJIT_SET_E, R_LENGTH, 0, R_LENGTH, 0, SLJIT_IMM, 1); 2026 EMIT_JUMP(length_is_zero_jump, SLJIT_EQUAL); 2027 2028 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_STRING, 0); 2029 #ifdef REGEX_USE_8BIT_CHARS 2030 EMIT_OP1(SLJIT_MOV_U8, R_CURR_CHAR, 0, SLJIT_MEM1(R_TEMP), 0); 2031 EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, 1); 2032 #else 2033 EMIT_OP1(SLJIT_MOV_UH, R_CURR_CHAR, 0, SLJIT_MEM1(R_TEMP), 0); 2034 EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, 2); 2035 #endif 2036 EMIT_OP1(SLJIT_MOV, R_STRING, 0, R_TEMP, 0); 2037 2038 #ifdef REGEX_MATCH_VERBOSE 2039 if (compiler_common.flags & REGEX_MATCH_VERBOSE) { 2040 printf("(%3d): ", 0); 2041 CHECK(trace_transitions(0, &compiler_common)); 2042 while (compiler_common.stack.count > 0) { 2043 ind = stack_pop(&compiler_common.stack)->value; 2044 if (compiler_common.search_states[ind].type >= 0) 2045 printf("-> (%3d:%3d) ", compiler_common.search_states[ind].type, compiler_common.search_states[ind].value); 2046 compiler_common.search_states[ind].value = -1; 2047 } 2048 printf("\n"); 2049 } 2050 #endif 2051 2052 EMIT_LABEL(fast_forward_return_label); 2053 if (!(compiler_common.flags & REGEX_MATCH_BEGIN)) { 2054 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_forward), SLJIT_IMM, 1); 2055 if (!(compiler_common.flags & REGEX_MATCH_END)) { 2056 EMIT_CMP(jump, SLJIT_NOT_EQUAL, R_BEST_BEGIN, 0, SLJIT_IMM, -1); 2057 } 2058 2059 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_CURR_INDEX, 0); 2060 CHECK(compile_uncond_tran(&compiler_common, R_NEXT_STATE)); 2061 /* And branching to the first state. */ 2062 CHECK(sljit_emit_ijump(compiler_common.compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0)); 2063 2064 if (!(compiler_common.flags & REGEX_MATCH_END)) { 2065 EMIT_LABEL(label); 2066 sljit_set_label(jump, label); 2067 } 2068 } 2069 /* This is the case where we only have to reset the R_NEXT_HEAD. */ 2070 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_NEXT_HEAD, 0); 2071 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, 0); 2072 CHECK(sljit_emit_ijump(compiler_common.compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0)); 2073 2074 /* Fast-forward loop. */ 2075 if (fast_forward_jump) { 2076 /* Quit from fast-forward loop. */ 2077 EMIT_LABEL(fast_forward_label); 2078 EMIT_OP2(SLJIT_SUB, R_TEMP, 0, R_NEXT_HEAD, 0, SLJIT_IMM, 1); 2079 EMIT_OP1(SLJIT_MOV, R_LENGTH, 0, R_NEXT_STATE, 0); 2080 EMIT_OP1(SLJIT_MOV, R_STRING, 0, R_CURR_STATE, 0); 2081 EMIT_OP1(SLJIT_MOV, R_CURR_INDEX, 0, R_NEXT_HEAD, 0); 2082 EMIT_OP1(SLJIT_MOV, R_NEXT_STATE, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, next)); 2083 EMIT_OP1(SLJIT_MOV, R_CURR_STATE, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, current)); 2084 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, head)); 2085 2086 /* Update the start field of the locations. */ 2087 CHECK(trace_transitions(0, &compiler_common)); 2088 while (compiler_common.stack.count > 0) { 2089 ind = stack_pop(&compiler_common.stack)->value; 2090 if (compiler_common.search_states[ind].type >= 0) { 2091 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(compiler_common.search_states[ind].type, 2), R_TEMP, 0); 2092 } 2093 compiler_common.search_states[ind].value = -1; 2094 } 2095 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_forward), SLJIT_IMM, 0); 2096 EMIT_JUMP(jump, SLJIT_JUMP); 2097 sljit_set_label(jump, fast_forward_return_label); 2098 2099 /* Start fast-forward. */ 2100 EMIT_LABEL(label); 2101 sljit_set_label(fast_forward_jump, label); 2102 2103 /* Moving everything to registers. */ 2104 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, next), R_NEXT_STATE, 0); 2105 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, current), R_CURR_STATE, 0); 2106 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, head), R_NEXT_HEAD, 0); 2107 EMIT_OP1(SLJIT_MOV, R_NEXT_STATE, 0, R_LENGTH, 0); 2108 EMIT_OP1(SLJIT_MOV, R_CURR_STATE, 0, R_STRING, 0); 2109 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, R_CURR_INDEX, 0); 2110 2111 /* Fast forward mainloop. */ 2112 EMIT_LABEL(label); 2113 EMIT_OP2(SLJIT_SUB | SLJIT_SET_E, R_NEXT_STATE, 0, R_NEXT_STATE, 0, SLJIT_IMM, 1); 2114 EMIT_JUMP(fast_forward_jump, SLJIT_EQUAL); 2115 2116 #ifdef REGEX_USE_8BIT_CHARS 2117 EMIT_OP1(SLJIT_MOV_U8, R_CURR_CHAR, 0, SLJIT_MEM1(R_CURR_STATE), 0); 2118 EMIT_OP2(SLJIT_ADD, R_CURR_STATE, 0, R_CURR_STATE, 0, SLJIT_IMM, 1); 2119 #else 2120 EMIT_OP1(SLJIT_MOV_UH, R_CURR_CHAR, 0, SLJIT_MEM1(R_CURR_STATE), 0); 2121 EMIT_OP2(SLJIT_ADD, R_CURR_STATE, 0, R_CURR_STATE, 0, SLJIT_IMM, 2); 2122 #endif 2123 2124 CHECK(trace_transitions(0, &compiler_common)); 2125 CHECK(compile_leave_fast_forward(&compiler_common, fast_forward_label)); 2126 2127 EMIT_OP2(SLJIT_ADD, R_NEXT_HEAD, 0, R_NEXT_HEAD, 0, SLJIT_IMM, 1); 2128 EMIT_JUMP(jump, SLJIT_JUMP); 2129 sljit_set_label(jump, label); 2130 2131 /* String is finished. */ 2132 EMIT_LABEL(label); 2133 sljit_set_label(fast_forward_jump, label); 2134 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, index), R_NEXT_HEAD, 0); 2135 EMIT_JUMP(fast_forward_jump, SLJIT_JUMP); 2136 } 2137 2138 /* End check. */ 2139 if (end_check_jump) { 2140 EMIT_LABEL(label); 2141 sljit_set_label(end_check_jump, label); 2142 2143 if (!(compiler_common.flags & REGEX_MATCH_NON_GREEDY) || !(compiler_common.flags & REGEX_MATCH_BEGIN)) { 2144 CHECK(compile_end_check(&compiler_common, end_check_label)); 2145 } 2146 else { 2147 /* Since we leave, we do not need to update the R_BEST_BEGIN. */ 2148 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin), SLJIT_IMM, 0); 2149 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0); 2150 if (compiler_common.flags & REGEX_ID_CHECK) { 2151 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 2)); 2152 } 2153 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_quit), SLJIT_IMM, 1); 2154 EMIT_JUMP(non_greedy_end_jump, SLJIT_JUMP); 2155 } 2156 } 2157 2158 /* Finish check. */ 2159 if (best_match_check_jump) { 2160 EMIT_LABEL(label); 2161 sljit_set_label(best_match_check_jump, label); 2162 2163 if (!(compiler_common.flags & REGEX_MATCH_BEGIN)) { 2164 EMIT_CMP(jump, SLJIT_NOT_EQUAL, R_NEXT_HEAD, 0, SLJIT_IMM, 0); 2165 sljit_set_label(jump, start_label); 2166 } 2167 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_quit), SLJIT_IMM, 1); 2168 } 2169 2170 /* Leaving matching and storing the necessary values. */ 2171 EMIT_LABEL(label); 2172 sljit_set_label(length_is_zero_jump, label); 2173 if (non_greedy_end_jump) 2174 sljit_set_label(non_greedy_end_jump, label); 2175 2176 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, index), R_CURR_INDEX, 0); 2177 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, head), R_NEXT_HEAD, 0); 2178 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, next), R_NEXT_STATE, 0); 2179 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, current), R_CURR_STATE, 0); 2180 2181 /* Exit from JIT. */ 2182 EMIT_LABEL(label); 2183 sljit_set_label(best_match_found_jump, label); 2184 if (fast_forward_jump) 2185 sljit_set_label(fast_forward_jump, label); 2186 CHECK(sljit_emit_return(compiler_common.compiler, SLJIT_UNUSED, 0, 0)); 2187 2188 for (ind = 1; ind < compiler_common.dfa_size - 1; ind++) { 2189 if (compiler_common.search_states[ind].type >= 0) { 2190 SLJIT_ASSERT(compiler_common.search_states[ind].type < compiler_common.terms_size); 2191 EMIT_LABEL(label); 2192 compiler_common.machine->entry_addrs[compiler_common.search_states[ind].type] = (sljit_uw)label; 2193 2194 if (compiler_common.dfa_transitions[ind].type == type_char) { 2195 EMIT_CMP(jump, SLJIT_NOT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, compiler_common.dfa_transitions[ind].value); 2196 } 2197 else if (compiler_common.dfa_transitions[ind].type == type_rng_start) { 2198 ind = compile_range_check(&compiler_common, ind); 2199 CHECK(!ind); 2200 } 2201 else { 2202 SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type == type_newline); 2203 CHECK(compile_newline_check(&compiler_common, ind)); 2204 } 2205 2206 CHECK(trace_transitions(ind, &compiler_common)); 2207 #ifdef REGEX_MATCH_VERBOSE 2208 if (compiler_common.flags & REGEX_MATCH_VERBOSE) 2209 printf("(%3d): ", compiler_common.search_states[ind].type); 2210 #endif 2211 CHECK(compile_cond_tran(&compiler_common, compiler_common.search_states[ind].type)); 2212 2213 if (compiler_common.dfa_transitions[ind].type == type_char) { 2214 EMIT_LABEL(label); 2215 sljit_set_label(jump, label); 2216 } 2217 else if (compiler_common.dfa_transitions[ind].type == type_rng_end) { 2218 EMIT_LABEL(label); 2219 range_set_label(compiler_common.range_jump_list, label); 2220 } 2221 else { 2222 SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type == type_newline); 2223 } 2224 2225 /* Branch to the next item in the list. */ 2226 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(compiler_common.search_states[ind].type, 1)); 2227 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(compiler_common.search_states[ind].type, 1), SLJIT_IMM, -1); 2228 CHECK(sljit_emit_ijump(compiler_common.compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0)); 2229 } 2230 } 2231 2232 if (ind == compiler_common.dfa_size - 1) { 2233 /* Generate an init stub function. */ 2234 EMIT_LABEL(label); 2235 CHECK(sljit_emit_enter(compiler_common.compiler, 0, 2, 3, 3, 0, 0, 0)); 2236 2237 if (empty_match_id == -1) { 2238 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), SLJIT_OFFSETOF(struct regex_match, best_begin), SLJIT_IMM, -1); 2239 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_IMM, 0); 2240 } 2241 else { 2242 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), SLJIT_OFFSETOF(struct regex_match, best_begin), SLJIT_IMM, 0); 2243 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_IMM, empty_match_id); 2244 } 2245 2246 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), SLJIT_OFFSETOF(struct regex_match, index), SLJIT_IMM, !(compiler_common.flags & REGEX_FAKE_MATCH_BEGIN) ? 1 : 2); 2247 2248 if (!(compiler_common.flags & REGEX_MATCH_NON_GREEDY) || empty_match_id == -1) { 2249 /* The else is a really rare event, so we still generate an empty function instead of a runtime pointer check. */ 2250 SLJIT_ASSERT(R_CURR_STATE == SLJIT_S0); 2251 if (!(compiler_common.flags & REGEX_MATCH_BEGIN)) { 2252 /* R_CURR_INDEX (put to R_TEMP) is zero. */ 2253 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_IMM, 0); 2254 } 2255 CHECK(compile_uncond_tran(&compiler_common, R_CURR_STATE)); 2256 } 2257 else { 2258 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, 0); 2259 } 2260 CHECK(sljit_emit_return(compiler_common.compiler, SLJIT_MOV, R_NEXT_HEAD, 0)); 2261 2262 compiler_common.machine->continue_match = sljit_generate_code(compiler_common.compiler); 2263 #ifndef SLJIT_INDIRECT_CALL 2264 compiler_common.machine->u.init_match = (void*)(sljit_sw)sljit_get_label_addr(label); 2265 #else 2266 sljit_set_function_context(&compiler_common.machine->u.init_match, &compiler_common.machine->context, sljit_get_label_addr(label), regex_compile); 2267 #endif 2268 #ifdef REGEX_MATCH_VERBOSE 2269 if (compiler_common.flags & REGEX_MATCH_VERBOSE) 2270 printf("Continue match: %p Init match: %p\n\n", compiler_common.machine->continue_match, compiler_common.machine->u.init_match); 2271 #endif 2272 if (compiler_common.machine->continue_match) { 2273 for (ind = 0; ind < compiler_common.terms_size; ++ind) 2274 compiler_common.machine->entry_addrs[ind] = sljit_get_label_addr((struct sljit_label*)compiler_common.machine->entry_addrs[ind]); 2275 done = 1; 2276 } 2277 } 2278 END_GUARD 2279 2280 stack_destroy(&compiler_common.stack); 2281 stack_destroy(&compiler_common.depth); 2282 SLJIT_FREE(compiler_common.dfa_transitions, NULL); 2283 SLJIT_FREE(compiler_common.search_states, NULL); 2284 if (compiler_common.range_jump_list) 2285 SLJIT_FREE(compiler_common.range_jump_list, NULL); 2286 if (compiler_common.compiler) 2287 sljit_free_compiler(compiler_common.compiler); 2288 if (done) 2289 return compiler_common.machine; 2290 2291 if (compiler_common.machine) { 2292 SLJIT_FREE(compiler_common.machine, NULL); 2293 } 2294 if (error) 2295 *error = REGEX_MEMORY_ERROR; 2296 return NULL; 2297 } 2298 2299 #undef TERM_OFFSET_OF 2300 #undef EMIT_OP1 2301 #undef EMIT_OP2 2302 #undef EMIT_LABEL 2303 #undef EMIT_JUMP 2304 #undef EMIT_CMP 2305 #undef BEGIN_GUARD 2306 #undef END_GUARD 2307 #undef CHECK 2308 2309 void regex_free_machine(struct regex_machine *machine) 2310 { 2311 sljit_free_code(machine->continue_match); 2312 SLJIT_FREE(machine, NULL); 2313 } 2314 2315 const char* regex_get_platform_name(void) 2316 { 2317 return sljit_get_platform_name(); 2318 } 2319 2320 /* --------------------------------------------------------------------- */ 2321 /* Mathching utilities */ 2322 /* --------------------------------------------------------------------- */ 2323 2324 struct regex_match* regex_begin_match(struct regex_machine *machine) 2325 { 2326 sljit_sw *ptr1; 2327 sljit_sw *ptr2; 2328 sljit_sw *end; 2329 sljit_sw *entry_addrs; 2330 2331 struct regex_match *match = (struct regex_match*)SLJIT_MALLOC(sizeof(struct regex_match) + (machine->size * 2 - 1) * sizeof(sljit_sw), NULL); 2332 if (!match) 2333 return NULL; 2334 2335 ptr1 = match->states; 2336 ptr2 = match->states + machine->size; 2337 end = ptr2; 2338 entry_addrs = (sljit_sw*)machine->entry_addrs; 2339 2340 match->current = ptr1; 2341 match->next = ptr2; 2342 match->head = 0; 2343 match->machine = machine; 2344 2345 /* Init machine states. */ 2346 switch (machine->no_states) { 2347 case 2: 2348 while (ptr1 < end) { 2349 *ptr1++ = *entry_addrs; 2350 *ptr2++ = *entry_addrs++; 2351 *ptr1++ = -1; 2352 *ptr2++ = -1; 2353 } 2354 break; 2355 2356 case 3: 2357 while (ptr1 < end) { 2358 *ptr1++ = *entry_addrs; 2359 *ptr2++ = *entry_addrs++; 2360 *ptr1++ = -1; 2361 *ptr2++ = -1; 2362 *ptr1++ = 0; 2363 *ptr2++ = 0; 2364 } 2365 break; 2366 2367 case 4: 2368 while (ptr1 < end) { 2369 *ptr1++ = *entry_addrs; 2370 *ptr2++ = *entry_addrs++; 2371 *ptr1++ = -1; 2372 *ptr2++ = -1; 2373 *ptr1++ = 0; 2374 *ptr2++ = 0; 2375 *ptr1++ = 0; 2376 *ptr2++ = 0; 2377 } 2378 break; 2379 2380 default: 2381 SLJIT_ASSERT_STOP(); 2382 break; 2383 } 2384 2385 SLJIT_ASSERT(ptr1 == end); 2386 2387 match->u.continue_match = machine->continue_match; 2388 2389 regex_reset_match(match); 2390 return match; 2391 } 2392 2393 void regex_reset_match(struct regex_match *match) 2394 { 2395 struct regex_machine *machine = match->machine; 2396 sljit_sw current, ind; 2397 sljit_sw *current_ptr; 2398 2399 match->best_end = 0; 2400 match->fast_quit = 0; 2401 match->fast_forward = 0; 2402 2403 if (match->head != 0) { 2404 /* Clear the current state. */ 2405 current = match->head; 2406 current_ptr = match->current; 2407 do { 2408 ind = (current / sizeof(sljit_sw)) + 1; 2409 current = current_ptr[ind]; 2410 current_ptr[ind] = -1; 2411 } while (current != 0); 2412 } 2413 match->head = machine->u.call_init(match->current, match); 2414 } 2415 2416 void regex_free_match(struct regex_match *match) 2417 { 2418 SLJIT_FREE(match, NULL); 2419 } 2420 2421 void regex_continue_match(struct regex_match *match, const regex_char_t *input_string, int length) 2422 { 2423 match->u.call_continue(match, input_string, length); 2424 } 2425 2426 int regex_get_result(struct regex_match *match, int *end, int *id) 2427 { 2428 int flags = match->machine->flags; 2429 sljit_sw no_states; 2430 2431 *end = match->best_end; 2432 *id = match->best_id; 2433 if (!(flags & (REGEX_MATCH_END | REGEX_FAKE_MATCH_END))) 2434 return match->best_begin; 2435 2436 if (flags & REGEX_FAKE_MATCH_END) { 2437 SLJIT_ASSERT(!(flags & (REGEX_MATCH_BEGIN | REGEX_MATCH_END))); 2438 if (match->best_begin != -1) 2439 return match->best_begin; 2440 2441 no_states = match->machine->no_states; 2442 if (match->current[no_states + 1] == -1) 2443 return -1; 2444 if (flags & REGEX_ID_CHECK) 2445 *id = match->current[no_states + 3]; 2446 if (!(flags & REGEX_FAKE_MATCH_BEGIN)) 2447 *end = match->index - 1; 2448 else 2449 *end = match->index - 2; 2450 return match->current[no_states + 2]; 2451 } 2452 else { 2453 /* Check the status of the last code. */ 2454 if (!(flags & REGEX_MATCH_BEGIN)) { 2455 /* No shortcut in this case. */ 2456 if (!(flags & REGEX_ID_CHECK)) { 2457 if (match->current[1] == -1) 2458 return -1; 2459 *end = match->index - 1; 2460 return match->current[2]; 2461 } 2462 2463 if (match->current[1] == -1) 2464 return -1; 2465 *end = match->index - 1; 2466 *id = match->current[3]; 2467 return match->current[2]; 2468 } 2469 2470 /* Shortcut is possible in this case. */ 2471 if (!(flags & REGEX_ID_CHECK)) { 2472 if (match->current[1] == -1 || match->head == -1) 2473 return -1; 2474 *end = match->index - 1; 2475 return 0; 2476 } 2477 2478 if (match->current[1] == -1 || match->head == -1) 2479 return -1; 2480 *end = match->index - 1; 2481 *id = match->current[2]; 2482 return 0; 2483 } 2484 } 2485 2486 int regex_is_match_finished(struct regex_match *match) 2487 { 2488 return match->fast_quit; 2489 } 2490 2491 #ifdef REGEX_MATCH_VERBOSE 2492 void regex_continue_match_debug(struct regex_match *match, const regex_char_t *input_string, int length) 2493 { 2494 sljit_sw *ptr; 2495 sljit_sw *end; 2496 sljit_sw count; 2497 #if (defined SLJIT_DEBUG && SLJIT_DEBUG) 2498 sljit_sw current; 2499 #endif 2500 sljit_sw no_states = match->machine->no_states; 2501 sljit_sw len = match->machine->size; 2502 2503 while (length > 0) { 2504 match->u.call_continue(match, input_string, 1); 2505 2506 if (match->fast_forward) { 2507 if (match->machine->flags & REGEX_MATCH_VERBOSE) 2508 printf("fast forward\n"); 2509 } 2510 2511 /* Verbose (first). */ 2512 if (match->machine->flags & REGEX_MATCH_VERBOSE) { 2513 ptr = match->current; 2514 end = ptr + len; 2515 count = 0; 2516 printf("'%c' (%3ld->%3ld [%3ld]) ", *input_string, (long)match->best_begin, (long)match->best_end, (long)match->best_id); 2517 while (ptr < end) { 2518 printf("[%3ld:", (long)count++); 2519 switch (no_states) { 2520 case 2: 2521 if (ptr[1] != -1) 2522 printf("+] "); 2523 else 2524 printf(" ] "); 2525 break; 2526 2527 case 3: 2528 if (ptr[1] != -1) 2529 printf("+,%3ld] ", (long)ptr[2]); 2530 else 2531 printf(" ,XXX] "); 2532 break; 2533 2534 case 4: 2535 if (ptr[1] != -1) 2536 printf("+,%3ld,%3ld] ", (long)ptr[2], (long)ptr[3]); 2537 else 2538 printf(" ,XXX,XXX] "); 2539 break; 2540 } 2541 ptr += no_states; 2542 } 2543 printf("\n"); 2544 } 2545 2546 #if (defined SLJIT_DEBUG && SLJIT_DEBUG) 2547 /* Sanity check (later). */ 2548 ptr = match->next; 2549 end = ptr + len; 2550 while (ptr < end) { 2551 SLJIT_ASSERT(ptr[1] == -1); 2552 ptr += no_states; 2553 } 2554 2555 /* Check number of active elements. */ 2556 ptr = match->current + no_states; 2557 end = ptr + len - no_states; 2558 count = 0; 2559 while (ptr < end) { 2560 if (ptr[1] != -1) 2561 count++; 2562 ptr += no_states; 2563 } 2564 2565 /* Check chain list. */ 2566 current = match->head; 2567 ptr = match->current; 2568 while (current != 0) { 2569 SLJIT_ASSERT(current >= 0 && current < len * sizeof(sljit_sw)); 2570 SLJIT_ASSERT((current % (no_states * sizeof(sljit_sw))) == 0); 2571 SLJIT_ASSERT(count > 0); 2572 current = ptr[(current / sizeof(sljit_sw)) + 1]; 2573 count--; 2574 } 2575 SLJIT_ASSERT(count == 0); 2576 #endif 2577 2578 if (match->fast_quit) { 2579 /* the machine has stopped working. */ 2580 if (match->machine->flags & REGEX_MATCH_VERBOSE) 2581 printf("Best match has found\n"); 2582 break; 2583 } 2584 2585 input_string++; 2586 length--; 2587 } 2588 } 2589 #endif 2590