1 /* 2 * Stack-less Just-In-Time compiler 3 * 4 * Copyright Zoltan Herczeg (hzmester@freemail.hu). All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without modification, are 7 * permitted provided that the following conditions are met: 8 * 9 * 1. Redistributions of source code must retain the above copyright notice, this list of 10 * conditions and the following disclaimer. 11 * 12 * 2. Redistributions in binary form must reproduce the above copyright notice, this list 13 * of conditions and the following disclaimer in the documentation and/or other materials 14 * provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) AND CONTRIBUTORS ``AS IS'' AND ANY 17 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT 19 * SHALL THE COPYRIGHT HOLDER(S) OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 21 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 22 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 24 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 */ 26 27 #include "sljitLir.h" 28 #include "regexJIT.h" 29 30 #include <stdlib.h> 31 32 #ifdef REGEX_MATCH_VERBOSE 33 #include <stdio.h> 34 #endif 35 36 /* Extra, hidden flags: 37 {id!} where id > 0 found in the code. */ 38 #define REGEX_ID_CHECK 0x100 39 /* When REGEX_NEWLINE && REGEX_MATCH_BEGIN defined, the pattern turn to a normal search, 40 which starts with [\r\n] character range. */ 41 #define REGEX_FAKE_MATCH_BEGIN 0x200 42 /* When REGEX_NEWLINE && REGEX_MATCH_END defined, the pattern turn to a normal search, 43 which ends with [\r\n] character range. */ 44 #define REGEX_FAKE_MATCH_END 0x400 45 46 /* Check match completition after every (FINISH_TEST + 1) steps. */ 47 #define FINISH_TEST 0x7 48 49 /* --------------------------------------------------------------------- */ 50 /* Structures for JIT-ed pattern matching */ 51 /* --------------------------------------------------------------------- */ 52 53 struct regex_machine 54 { 55 /* flags. */ 56 int flags; 57 /* Number of state descriptors for one term. */ 58 sljit_sw no_states; 59 /* Total size. */ 60 sljit_sw size; 61 62 union { 63 void *init_match; 64 sljit_sw (SLJIT_CALL *call_init)(void *next, void* match); 65 } u; 66 #if (defined SLJIT_INDIRECT_CALL && SLJIT_INDIRECT_CALL) 67 struct sljit_function_context context; 68 #endif 69 70 void *continue_match; 71 72 /* Variable sized array to contain the handler addresses. */ 73 sljit_uw entry_addrs[1]; 74 }; 75 76 struct regex_match 77 { 78 /* Current and next state array. */ 79 sljit_sw *current; 80 sljit_sw *next; 81 /* Starting. */ 82 sljit_sw head; 83 /* String character index (ever increasing). */ 84 sljit_sw index; 85 /* Best match found so far (members in priority order). */ 86 sljit_sw best_begin; 87 sljit_sw best_end; 88 sljit_sw best_id; 89 /* Bool flags (encoded as word). */ 90 sljit_sw fast_quit; 91 sljit_sw fast_forward; 92 /* Machine. */ 93 struct regex_machine *machine; 94 95 union { 96 void *continue_match; 97 void (SLJIT_CALL *call_continue)(struct regex_match *match, const regex_char_t *input_string, int length); 98 } u; 99 100 /* Variable sized array to contain the state arrays. */ 101 sljit_sw states[1]; 102 }; 103 104 /* State vector 105 ITEM[0] - pointer to the address inside the machine code 106 ITEM[1] - next pointer 107 ITEM[2] - string started from (optional) 108 ITEM[3] - max ID (optional) */ 109 110 /* Register allocation. */ 111 /* Current state array (loaded & stored: regex_match->current). */ 112 #define R_CURR_STATE SLJIT_S0 113 /* Next state array (loaded & stored: regex_match->next). */ 114 #define R_NEXT_STATE SLJIT_S1 115 /* Head (loaded & stored: regex_match->head). */ 116 #define R_NEXT_HEAD SLJIT_S2 117 /* String fragment pointer. */ 118 #define R_STRING SLJIT_S3 119 /* String fragment length. */ 120 #define R_LENGTH SLJIT_S4 121 /* 'struct regex_match*' */ 122 #define R_REGEX_MATCH SLJIT_R0 123 /* Current character. */ 124 #define R_CURR_CHAR SLJIT_R1 125 /* Temporary register. */ 126 #define R_TEMP SLJIT_R2 127 /* Caches the regex_match->best_begin. */ 128 #define R_BEST_BEGIN SLJIT_R3 129 /* Current character index. */ 130 #define R_CURR_INDEX SLJIT_R4 131 132 /* --------------------------------------------------------------------- */ 133 /* Stack management */ 134 /* --------------------------------------------------------------------- */ 135 136 /* Try to allocate 2^n blocks. */ 137 #define STACK_FRAGMENT_SIZE (((64 * sizeof(struct stack_item)) - (sizeof(struct stack_fragment_data))) / (sizeof(struct stack_item))) 138 139 struct stack_item { 140 int type; 141 int value; 142 }; 143 144 struct stack_fragment_data { 145 struct stack_fragment *next; 146 struct stack_fragment *prev; 147 }; 148 149 struct stack_fragment { 150 struct stack_fragment_data data; 151 struct stack_item items[STACK_FRAGMENT_SIZE]; 152 }; 153 154 struct stack { 155 struct stack_fragment *first; 156 struct stack_fragment *last; 157 int index; 158 int count; 159 }; 160 161 #if (defined SLJIT_DEBUG && SLJIT_DEBUG) 162 163 static void stack_check(struct stack *stack) 164 { 165 struct stack_fragment *curr; 166 int found; 167 168 if (!stack) 169 return; 170 171 SLJIT_ASSERT(stack->index >= 0 && stack->index < STACK_FRAGMENT_SIZE); 172 173 if (stack->first == NULL) { 174 SLJIT_ASSERT(stack->first == NULL && stack->last == NULL); 175 SLJIT_ASSERT(stack->index == STACK_FRAGMENT_SIZE - 1 && stack->count == 0); 176 return; 177 } 178 179 found = 0; 180 if (stack->last == NULL) { 181 SLJIT_ASSERT(stack->index == STACK_FRAGMENT_SIZE - 1 && stack->count == 0); 182 found = 1; 183 } 184 else 185 SLJIT_ASSERT(stack->index >= 0 && stack->count >= 0); 186 187 SLJIT_ASSERT(stack->first->data.prev == NULL); 188 curr = stack->first; 189 while (curr) { 190 if (curr == stack->last) 191 found = 1; 192 if (curr->data.next) 193 SLJIT_ASSERT(curr->data.next->data.prev == curr); 194 curr = curr->data.next; 195 } 196 SLJIT_ASSERT(found); 197 } 198 199 #endif 200 201 static void stack_init(struct stack *stack) 202 { 203 stack->first = NULL; 204 stack->last = NULL; 205 stack->index = STACK_FRAGMENT_SIZE - 1; 206 stack->count = 0; 207 } 208 209 static void stack_destroy(struct stack *stack) 210 { 211 struct stack_fragment *curr = stack->first; 212 struct stack_fragment *prev; 213 214 #if (defined SLJIT_DEBUG && SLJIT_DEBUG) 215 stack_check(stack); 216 #endif 217 218 while (curr) { 219 prev = curr; 220 curr = curr->data.next; 221 SLJIT_FREE(prev, NULL); 222 } 223 } 224 225 static SLJIT_INLINE struct stack_item* stack_top(struct stack *stack) 226 { 227 SLJIT_ASSERT(stack->last); 228 return stack->last->items + stack->index; 229 } 230 231 static int stack_push(struct stack *stack, int type, int value) 232 { 233 if (stack->last) { 234 stack->index++; 235 if (stack->index >= STACK_FRAGMENT_SIZE) { 236 stack->index = 0; 237 if (!stack->last->data.next) { 238 stack->last->data.next = (struct stack_fragment*)SLJIT_MALLOC(sizeof(struct stack_fragment), NULL); 239 if (!stack->last->data.next) 240 return 1; 241 stack->last->data.next->data.next = NULL; 242 stack->last->data.next->data.prev = stack->last; 243 } 244 stack->last = stack->last->data.next; 245 } 246 } 247 else if (!stack->first) { 248 stack->last = (struct stack_fragment*)SLJIT_MALLOC(sizeof(struct stack_fragment), NULL); 249 if (!stack->last) 250 return 1; 251 stack->last->data.prev = NULL; 252 stack->last->data.next = NULL; 253 stack->first = stack->last; 254 stack->index = 0; 255 } 256 else { 257 stack->last = stack->first; 258 stack->index = 0; 259 } 260 stack->last->items[stack->index].type = type; 261 stack->last->items[stack->index].value = value; 262 stack->count++; 263 #if (defined SLJIT_DEBUG && SLJIT_DEBUG) 264 stack_check(stack); 265 #endif 266 return 0; 267 } 268 269 static struct stack_item* stack_pop(struct stack *stack) 270 { 271 struct stack_item *ret = stack_top(stack); 272 273 if (stack->index > 0) 274 stack->index--; 275 else { 276 stack->last = stack->last->data.prev; 277 stack->index = STACK_FRAGMENT_SIZE - 1; 278 } 279 280 stack->count--; 281 #if (defined SLJIT_DEBUG && SLJIT_DEBUG) 282 stack_check(stack); 283 #endif 284 return ret; 285 } 286 287 static SLJIT_INLINE void stack_clone(struct stack *src, struct stack *dst) 288 { 289 *dst = *src; 290 } 291 292 static int stack_push_copy(struct stack *stack, int items, int length) 293 { 294 struct stack_fragment *frag1; 295 int ind1; 296 struct stack_fragment *frag2; 297 int ind2; 298 int counter; 299 300 SLJIT_ASSERT(stack->count >= length && items <= length && items > 0); 301 302 /* Allocate the necessary elements. */ 303 counter = items; 304 frag1 = stack->last; 305 ind1 = stack->index; 306 while (counter > 0) { 307 if (stack->index + counter >= STACK_FRAGMENT_SIZE) { 308 counter -= STACK_FRAGMENT_SIZE - stack->index - 1 + 1; 309 stack->index = 0; 310 if (!stack->last->data.next) { 311 stack->last->data.next = (struct stack_fragment*)SLJIT_MALLOC(sizeof(struct stack_fragment), NULL); 312 if (!stack->last->data.next) 313 return 1; 314 stack->last->data.next->data.next = NULL; 315 stack->last->data.next->data.prev = stack->last; 316 } 317 stack->last = stack->last->data.next; 318 } 319 else { 320 stack->index += counter; 321 counter = 0; 322 } 323 } 324 325 frag2 = stack->last; 326 ind2 = stack->index; 327 while (length > 0) { 328 frag2->items[ind2--] = frag1->items[ind1--]; 329 if (ind1 < 0) { 330 ind1 = STACK_FRAGMENT_SIZE - 1; 331 frag1 = frag1->data.prev; 332 } 333 if (ind2 < 0) { 334 ind2 = STACK_FRAGMENT_SIZE - 1; 335 frag2 = frag2->data.prev; 336 } 337 length--; 338 } 339 340 #if (defined SLJIT_DEBUG && SLJIT_DEBUG) 341 stack_check(stack); 342 #endif 343 stack->count += items; 344 return 0; 345 } 346 347 /* --------------------------------------------------------------------- */ 348 /* Parser */ 349 /* --------------------------------------------------------------------- */ 350 351 enum { 352 /* Common. */ 353 type_begin, 354 type_end, 355 type_char, 356 type_newline, 357 type_id, 358 type_rng_start, 359 type_rng_end, 360 type_rng_char, 361 type_rng_left, 362 type_rng_right, 363 364 /* generator only. */ 365 type_branch, 366 type_jump, 367 368 /* Parser only. */ 369 type_open_br, 370 type_close_br, 371 type_select, 372 type_asterisk, 373 type_plus_sign, 374 type_qestion_mark 375 }; 376 377 struct compiler_common { 378 /* Temporary stacks. */ 379 struct stack stack; 380 struct stack depth; 381 /* REGEX_ flags. */ 382 int flags; 383 /* Encoded size of the dfa representation. */ 384 sljit_sw dfa_size; 385 /* Number of terms. */ 386 sljit_sw terms_size; 387 /* Number of state descriptors for one term (same as machine->no_states). */ 388 sljit_sw no_states; 389 /* Number of type_rng_(char|left)-s in the longest character range. */ 390 sljit_sw longest_range_size; 391 392 /* DFA linear representation (size: dfa_size). */ 393 struct stack_item *dfa_transitions; 394 /* Term id and search state pairs (size: dfa_size). */ 395 struct stack_item *search_states; 396 397 /* sljit compiler */ 398 struct sljit_compiler *compiler; 399 /* Machine data, which must be kept for later use. */ 400 struct regex_machine *machine; 401 /* Temporary space for jumps (size: longest_range_size). */ 402 struct sljit_jump **range_jump_list; 403 }; 404 405 static const regex_char_t* decode_number(const regex_char_t *regex_string, int length, int *result) 406 { 407 int value = 0; 408 409 SLJIT_ASSERT(length > 0); 410 if (*regex_string < '0' || *regex_string > '9') { 411 *result = -1; 412 return regex_string; 413 } 414 415 while (length > 0 && *regex_string >= '0' && *regex_string <= '9') { 416 value = value * 10 + (*regex_string - '0'); 417 length--; 418 regex_string++; 419 } 420 421 *result = value; 422 return regex_string; 423 } 424 425 static int iterate(struct stack *stack, int min, int max) 426 { 427 struct stack it; 428 struct stack_item *item; 429 int count = -1; 430 int len = 0; 431 int depth = 0; 432 433 stack_clone(stack, &it); 434 435 /* Calculate size. */ 436 while (count < 0) { 437 item = stack_pop(&it); 438 switch (item->type) { 439 case type_id: 440 case type_rng_end: 441 case type_rng_char: 442 case type_rng_left: 443 case type_rng_right: 444 case type_plus_sign: 445 case type_qestion_mark: 446 len++; 447 break; 448 449 case type_asterisk: 450 len += 2; 451 break; 452 453 case type_close_br: 454 depth++; 455 break; 456 457 case type_open_br: 458 SLJIT_ASSERT(depth > 0); 459 depth--; 460 if (depth == 0) 461 count = it.count; 462 break; 463 464 case type_select: 465 SLJIT_ASSERT(depth > 0); 466 len += 2; 467 break; 468 469 default: 470 SLJIT_ASSERT(item->type != type_begin && item->type != type_end); 471 if (depth == 0) 472 count = it.count; 473 len++; 474 break; 475 } 476 } 477 478 if (min == 0 && max == 0) { 479 /* {0,0} case, not {0,} case: delete subtree. */ 480 stack_clone(&it, stack); 481 /* and put an empty bracket expression instead of it. */ 482 if (stack_push(stack, type_open_br, 0)) 483 return REGEX_MEMORY_ERROR; 484 if (stack_push(stack, type_close_br, 0)) 485 return REGEX_MEMORY_ERROR; 486 return len; 487 } 488 489 count = stack->count - count; 490 491 /* Put an open bracket before the sequence. */ 492 if (stack_push_copy(stack, 1, count)) 493 return -1; 494 495 #if (defined SLJIT_DEBUG && SLJIT_DEBUG) 496 SLJIT_ASSERT(stack_push(&it, type_open_br, 0) == 0); 497 #else 498 stack_push(&it, type_open_br, 0); 499 #endif 500 501 /* Copy the data. */ 502 if (max > 0) { 503 len = len * (max - 1); 504 max -= min; 505 /* Insert ? operators. */ 506 len += max; 507 508 if (min > 0) { 509 min--; 510 while (min > 0) { 511 if (stack_push_copy(stack, count, count)) 512 return -1; 513 min--; 514 } 515 if (max > 0) { 516 if (stack_push_copy(stack, count, count)) 517 return -1; 518 if (stack_push(stack, type_qestion_mark, 0)) 519 return REGEX_MEMORY_ERROR; 520 count++; 521 max--; 522 } 523 } 524 else { 525 SLJIT_ASSERT(max > 0); 526 max--; 527 count++; 528 if (stack_push(stack, type_qestion_mark, 0)) 529 return REGEX_MEMORY_ERROR; 530 } 531 532 while (max > 0) { 533 if (stack_push_copy(stack, count, count)) 534 return -1; 535 max--; 536 } 537 } 538 else { 539 SLJIT_ASSERT(min > 0); 540 min--; 541 /* Insert + operator. */ 542 len = len * min + 1; 543 while (min > 0) { 544 if (stack_push_copy(stack, count, count)) 545 return -1; 546 min--; 547 } 548 549 if (stack_push(stack, type_plus_sign, 0)) 550 return REGEX_MEMORY_ERROR; 551 } 552 553 /* Close the opened bracket. */ 554 if (stack_push(stack, type_close_br, 0)) 555 return REGEX_MEMORY_ERROR; 556 557 return len; 558 } 559 560 static int parse_iterator(const regex_char_t *regex_string, int length, struct stack *stack, sljit_sw *dfa_size, int begin) 561 { 562 /* We only know that *regex_string == { . */ 563 int val1, val2; 564 const regex_char_t *base_from = regex_string; 565 const regex_char_t *from; 566 567 length--; 568 regex_string++; 569 570 /* Decode left value. */ 571 val2 = -1; 572 if (length == 0) 573 return -2; 574 if (*regex_string == ',') { 575 val1 = 0; 576 length--; 577 regex_string++; 578 } 579 else { 580 from = regex_string; 581 regex_string = decode_number(regex_string, length, &val1); 582 if (val1 < 0) 583 return -2; 584 length -= regex_string - from; 585 586 if (length == 0) 587 return -2; 588 if (*regex_string == '}') { 589 val2 = val1; 590 if (val1 == 0) 591 val1 = -1; 592 } 593 else if (length >= 2 && *regex_string == '!' && regex_string[1] == '}') { 594 /* Non posix extension. */ 595 if (stack_push(stack, type_id, val1)) 596 return -1; 597 (*dfa_size)++; 598 return (regex_string - base_from) + 1; 599 } 600 else { 601 if (*regex_string != ',') 602 return -2; 603 length--; 604 regex_string++; 605 } 606 } 607 608 if (begin) 609 return -2; 610 611 /* Decode right value. */ 612 if (val2 == -1) { 613 if (length == 0) 614 return -2; 615 if (*regex_string == '}') 616 val2 = 0; 617 else { 618 from = regex_string; 619 regex_string = decode_number(regex_string, length, &val2); 620 length -= regex_string - from; 621 if (val2 < 0 || length == 0 || *regex_string != '}' || val2 < val1) 622 return -2; 623 if (val2 == 0) { 624 SLJIT_ASSERT(val1 == 0); 625 val1 = -1; 626 } 627 } 628 } 629 630 /* Fast cases. */ 631 if (val1 > 1 || val2 > 1) { 632 val1 = iterate(stack, val1, val2); 633 if (val1 < 0) 634 return -1; 635 *dfa_size += val1; 636 } 637 else if (val1 == 0 && val2 == 0) { 638 if (stack_push(stack, type_asterisk, 0)) 639 return -1; 640 *dfa_size += 2; 641 } 642 else if (val1 == 1 && val2 == 0) { 643 if (stack_push(stack, type_plus_sign, 0)) 644 return -1; 645 (*dfa_size)++; 646 } 647 else if (val1 == 0 && val2 == 1) { 648 if (stack_push(stack, type_qestion_mark, 0)) 649 return -1; 650 (*dfa_size)++; 651 } 652 else if (val1 == -1) { 653 val1 = iterate(stack, 0, 0); 654 if (val1 < 0) 655 return -1; 656 *dfa_size -= val1; 657 SLJIT_ASSERT(*dfa_size >= 2); 658 } 659 else { 660 /* Ignore. */ 661 SLJIT_ASSERT(val1 == 1 && val2 == 1); 662 } 663 return regex_string - base_from; 664 } 665 666 static int parse_char_range(const regex_char_t *regex_string, int length, struct compiler_common *compiler_common) 667 { 668 struct stack* stack = &compiler_common->stack; 669 const regex_char_t *base_from = regex_string; 670 regex_char_t left_char, right_char, tmp_char; 671 672 length--; 673 regex_string++; 674 675 if (length == 0) 676 return -2; 677 678 if (*regex_string != '^') { 679 if (stack_push(stack, type_rng_start, 0)) 680 return -1; 681 } 682 else { 683 length--; 684 regex_string++; 685 686 if (length == 0) 687 return -2; 688 689 if (stack_push(stack, type_rng_start, 1)) 690 return -1; 691 } 692 /* For both the type_rng_start & type_rng_end. */ 693 compiler_common->dfa_size += 2; 694 695 /* Range must be at least 1 character. */ 696 if (*regex_string == ']') { 697 length--; 698 regex_string++; 699 if (stack_push(stack, type_rng_char, ']')) 700 return -1; 701 compiler_common->dfa_size++; 702 } 703 704 while (1) { 705 if (length == 0) 706 return -2; 707 708 if (*regex_string == ']') 709 break; 710 711 if (*regex_string != '\\') 712 left_char = *regex_string; 713 else { 714 regex_string++; 715 length--; 716 if (length == 0) 717 return -2; 718 left_char = *regex_string; 719 } 720 regex_string++; 721 length--; 722 723 /* Is a range here? */ 724 if (length >= 3 && *regex_string == '-' && *(regex_string + 1) != ']') { 725 regex_string++; 726 length--; 727 728 if (*regex_string != '\\') 729 right_char = *regex_string; 730 else { 731 regex_string++; 732 length--; 733 if (length == 0) 734 return -2; 735 right_char = *regex_string; 736 } 737 regex_string++; 738 length--; 739 740 if (left_char > right_char) { 741 /* Swap if necessary. */ 742 tmp_char = left_char; 743 left_char = right_char; 744 right_char = tmp_char; 745 } 746 747 if (stack_push(stack, type_rng_left, left_char)) 748 return -1; 749 if (stack_push(stack, type_rng_right, right_char)) 750 return -1; 751 compiler_common->dfa_size += 2; 752 } 753 else { 754 if (stack_push(stack, type_rng_char, left_char)) 755 return -1; 756 compiler_common->dfa_size++; 757 } 758 } 759 760 if (stack_push(stack, type_rng_end, 0)) 761 return -1; 762 return regex_string - base_from; 763 } 764 765 static int parse(const regex_char_t *regex_string, int length, struct compiler_common *compiler_common) 766 { 767 /* Depth of bracketed expressions. */ 768 int depth = 0; 769 /* Have we already found a term? '1' if not yet. */ 770 int begin = 1; 771 /* Cache stack pointer. */ 772 struct stack* stack = &compiler_common->stack; 773 int tmp; 774 775 /* Type_begin and type_end. */ 776 compiler_common->dfa_size = 2; 777 stack_init(stack); 778 if (stack_push(stack, type_begin, 0)) 779 return REGEX_MEMORY_ERROR; 780 781 if (length > 0 && *regex_string == '^') { 782 compiler_common->flags |= REGEX_MATCH_BEGIN; 783 length--; 784 regex_string++; 785 } 786 787 if ((compiler_common->flags & (REGEX_MATCH_BEGIN | REGEX_NEWLINE)) == (REGEX_MATCH_BEGIN | REGEX_NEWLINE)) { 788 /* Replace REGEX_MATCH_BEGIN flag to REGEX_FAKE_MATCH_BEGIN */ 789 compiler_common->flags &= ~REGEX_MATCH_BEGIN; 790 compiler_common->flags |= REGEX_FAKE_MATCH_BEGIN; 791 /* and append a new-line search. */ 792 if (stack_push(stack, type_newline, 0)) 793 return REGEX_MEMORY_ERROR; 794 compiler_common->dfa_size++; 795 /* Begin intentionally kept as 1. */ 796 } 797 798 while (length > 0) { 799 switch (*regex_string) { 800 case '\\' : 801 length--; 802 regex_string++; 803 if (length == 0) 804 return REGEX_INVALID_REGEX; 805 if (stack_push(stack, type_char, *regex_string)) 806 return REGEX_MEMORY_ERROR; 807 begin = 0; 808 compiler_common->dfa_size++; 809 break; 810 811 case '.' : 812 if (stack_push(stack, type_rng_start, 1)) 813 return REGEX_MEMORY_ERROR; 814 if (compiler_common->flags & REGEX_NEWLINE) { 815 if (stack_push(stack, type_rng_char, '\n')) 816 return REGEX_MEMORY_ERROR; 817 if (stack_push(stack, type_rng_char, '\r')) 818 return REGEX_MEMORY_ERROR; 819 compiler_common->dfa_size += 2; 820 } 821 if (stack_push(stack, type_rng_end, 1)) 822 return REGEX_MEMORY_ERROR; 823 begin = 0; 824 compiler_common->dfa_size += 2; 825 break; 826 827 case '(' : 828 depth++; 829 if (stack_push(stack, type_open_br, 0)) 830 return REGEX_MEMORY_ERROR; 831 begin = 1; 832 break; 833 834 case ')' : 835 if (depth == 0) 836 return REGEX_INVALID_REGEX; 837 depth--; 838 if (stack_push(stack, type_close_br, 0)) 839 return REGEX_MEMORY_ERROR; 840 begin = 0; 841 break; 842 843 case '|' : 844 if (stack_push(stack, type_select, 0)) 845 return REGEX_MEMORY_ERROR; 846 begin = 1; 847 compiler_common->dfa_size += 2; 848 break; 849 850 case '*' : 851 if (begin) 852 return REGEX_INVALID_REGEX; 853 if (stack_push(stack, type_asterisk, 0)) 854 return REGEX_MEMORY_ERROR; 855 compiler_common->dfa_size += 2; 856 break; 857 858 case '?' : 859 case '+' : 860 if (begin) 861 return REGEX_INVALID_REGEX; 862 if (stack_push(stack, (*regex_string == '+') ? type_plus_sign : type_qestion_mark, 0)) 863 return REGEX_MEMORY_ERROR; 864 compiler_common->dfa_size++; 865 break; 866 867 case '{' : 868 tmp = parse_iterator(regex_string, length, stack, &compiler_common->dfa_size, begin); 869 870 if (tmp >= 0) { 871 length -= tmp; 872 regex_string += tmp; 873 } 874 else if (tmp == -1) 875 return REGEX_MEMORY_ERROR; 876 else { 877 /* Not a valid range expression. */ 878 SLJIT_ASSERT(tmp == -2); 879 if (stack_push(stack, type_char, '{')) 880 return REGEX_MEMORY_ERROR; 881 compiler_common->dfa_size++; 882 } 883 break; 884 885 case '[' : 886 tmp = parse_char_range(regex_string, length, compiler_common); 887 if (tmp >= 0) { 888 length -= tmp; 889 regex_string += tmp; 890 } 891 else if (tmp == -1) 892 return REGEX_MEMORY_ERROR; 893 else { 894 SLJIT_ASSERT(tmp == -2); 895 return REGEX_INVALID_REGEX; 896 } 897 begin = 0; 898 break; 899 900 default: 901 if (length == 1 && *regex_string == '$') { 902 compiler_common->flags |= REGEX_MATCH_END; 903 break; 904 } 905 if (stack_push(stack, type_char, *regex_string)) 906 return REGEX_MEMORY_ERROR; 907 begin = 0; 908 compiler_common->dfa_size++; 909 break; 910 } 911 length--; 912 regex_string++; 913 } 914 915 if (depth != 0) 916 return REGEX_INVALID_REGEX; 917 918 if ((compiler_common->flags & (REGEX_MATCH_END | REGEX_NEWLINE)) == (REGEX_MATCH_END | REGEX_NEWLINE)) { 919 /* Replace REGEX_MATCH_END flag to REGEX_FAKE_MATCH_END */ 920 compiler_common->flags &= ~REGEX_MATCH_END; 921 compiler_common->flags |= REGEX_FAKE_MATCH_END; 922 /* and append a new-line search. */ 923 if (stack_push(stack, type_newline, 1)) 924 return REGEX_MEMORY_ERROR; 925 compiler_common->dfa_size++; 926 /* Begin intentionally kept as 1. */ 927 } 928 929 if (stack_push(stack, type_end, 0)) 930 return REGEX_MEMORY_ERROR; 931 932 return REGEX_NO_ERROR; 933 } 934 935 /* --------------------------------------------------------------------- */ 936 /* Generating machine state transitions */ 937 /* --------------------------------------------------------------------- */ 938 939 #define PUT_TRANSITION(typ, val) \ 940 do { \ 941 --transitions_ptr; \ 942 transitions_ptr->type = typ; \ 943 transitions_ptr->value = val; \ 944 } while (0) 945 946 static struct stack_item* handle_iteratives(struct stack_item *transitions_ptr, struct stack_item *transitions, struct stack *depth) 947 { 948 struct stack_item *item; 949 950 while (1) { 951 item = stack_top(depth); 952 953 switch (item->type) { 954 case type_asterisk: 955 SLJIT_ASSERT(transitions[item->value].type == type_branch); 956 transitions[item->value].value = transitions_ptr - transitions; 957 PUT_TRANSITION(type_branch, item->value + 1); 958 break; 959 960 case type_plus_sign: 961 SLJIT_ASSERT(transitions[item->value].type == type_branch); 962 transitions[item->value].value = transitions_ptr - transitions; 963 break; 964 965 case type_qestion_mark: 966 PUT_TRANSITION(type_branch, item->value); 967 break; 968 969 default: 970 return transitions_ptr; 971 } 972 stack_pop(depth); 973 } 974 } 975 976 static int generate_transitions(struct compiler_common *compiler_common) 977 { 978 struct stack *stack = &compiler_common->stack; 979 struct stack *depth = &compiler_common->depth; 980 struct stack_item *transitions_ptr; 981 struct stack_item *item; 982 983 stack_init(depth); 984 compiler_common->dfa_transitions = SLJIT_MALLOC(sizeof(struct stack_item) * compiler_common->dfa_size, NULL); 985 if (!compiler_common->dfa_transitions) 986 return REGEX_MEMORY_ERROR; 987 988 /* Go through the items of the stack and generate the necessary branches and jumps (edges of DFA). */ 989 transitions_ptr = compiler_common->dfa_transitions + compiler_common->dfa_size; 990 while (stack->count > 0) { 991 item = stack_pop(stack); 992 switch (item->type) { 993 case type_begin: 994 case type_open_br: 995 item = stack_pop(depth); 996 if (item->type == type_select) 997 PUT_TRANSITION(type_branch, item->value + 1); 998 else 999 SLJIT_ASSERT(item->type == type_close_br); 1000 if (stack->count == 0) 1001 PUT_TRANSITION(type_begin, 0); 1002 else 1003 transitions_ptr = handle_iteratives(transitions_ptr, compiler_common->dfa_transitions, depth); 1004 break; 1005 1006 case type_end: 1007 case type_close_br: 1008 if (item->type == type_end) 1009 *--transitions_ptr = *item; 1010 if (stack_push(depth, type_close_br, transitions_ptr - compiler_common->dfa_transitions)) 1011 return REGEX_MEMORY_ERROR; 1012 break; 1013 1014 case type_select: 1015 item = stack_top(depth); 1016 if (item->type == type_select) { 1017 SLJIT_ASSERT(compiler_common->dfa_transitions[item->value].type == type_jump); 1018 PUT_TRANSITION(type_branch, item->value + 1); 1019 PUT_TRANSITION(type_jump, item->value); 1020 item->value = transitions_ptr - compiler_common->dfa_transitions; 1021 } 1022 else { 1023 SLJIT_ASSERT(item->type == type_close_br); 1024 item->type = type_select; 1025 PUT_TRANSITION(type_jump, item->value); 1026 item->value = transitions_ptr - compiler_common->dfa_transitions; 1027 } 1028 break; 1029 1030 case type_asterisk: 1031 case type_plus_sign: 1032 case type_qestion_mark: 1033 if (item->type != type_qestion_mark) 1034 PUT_TRANSITION(type_branch, 0); 1035 if (stack_push(depth, item->type, transitions_ptr - compiler_common->dfa_transitions)) 1036 return REGEX_MEMORY_ERROR; 1037 break; 1038 1039 case type_char: 1040 case type_newline: 1041 case type_rng_start: 1042 /* Requires handle_iteratives. */ 1043 *--transitions_ptr = *item; 1044 transitions_ptr = handle_iteratives(transitions_ptr, compiler_common->dfa_transitions, depth); 1045 break; 1046 1047 default: 1048 *--transitions_ptr = *item; 1049 break; 1050 } 1051 } 1052 1053 SLJIT_ASSERT(compiler_common->dfa_transitions == transitions_ptr); 1054 SLJIT_ASSERT(depth->count == 0); 1055 return REGEX_NO_ERROR; 1056 } 1057 1058 #undef PUT_TRANSITION 1059 1060 #ifdef REGEX_MATCH_VERBOSE 1061 1062 static void verbose_transitions(struct compiler_common *compiler_common) 1063 { 1064 struct stack_item *transitions_ptr = compiler_common->dfa_transitions; 1065 struct stack_item *transitions_end = transitions_ptr + compiler_common->dfa_size; 1066 struct stack_item *search_states_ptr = compiler_common->search_states; 1067 int pos; 1068 1069 printf("-----------------\nTransitions\n-----------------\n"); 1070 pos = 0; 1071 while (transitions_ptr < transitions_end) { 1072 printf("[%3d] ", pos++); 1073 if (search_states_ptr->type >= 0) 1074 printf("(%3d) ", search_states_ptr->type); 1075 switch (transitions_ptr->type) { 1076 case type_begin: 1077 printf("type_begin\n"); 1078 break; 1079 1080 case type_end: 1081 printf("type_end\n"); 1082 break; 1083 1084 case type_char: 1085 if (transitions_ptr->value >= ' ') 1086 printf("type_char '%c'\n", transitions_ptr->value); 1087 else 1088 printf("type_char 0x%x\n", transitions_ptr->value); 1089 break; 1090 1091 case type_newline: 1092 printf("type_newline %s\n", transitions_ptr->value ? "(end)" : "(begin)"); 1093 break; 1094 1095 case type_id: 1096 printf("type_id %d\n", transitions_ptr->value); 1097 break; 1098 1099 case type_rng_start: 1100 printf("type_rng_start %s\n", transitions_ptr->value ? "(invert)" : "(normal)"); 1101 break; 1102 1103 case type_rng_end: 1104 printf("type_rng_end\n"); 1105 break; 1106 1107 case type_rng_char: 1108 if (transitions_ptr->value >= ' ') 1109 printf("type_rng_char '%c'\n", transitions_ptr->value); 1110 else 1111 printf("type_rng_char 0x%x\n", transitions_ptr->value); 1112 break; 1113 1114 case type_rng_left: 1115 if (transitions_ptr->value >= ' ') 1116 printf("type_rng_left '%c'\n", transitions_ptr->value); 1117 else 1118 printf("type_rng_left 0x%x\n", transitions_ptr->value); 1119 break; 1120 1121 case type_rng_right: 1122 if (transitions_ptr->value >= ' ') 1123 printf("type_rng_right '%c'\n", transitions_ptr->value); 1124 else 1125 printf("type_rng_right 0x%x\n", transitions_ptr->value); 1126 break; 1127 1128 case type_branch: 1129 printf("type_branch -> %d\n", transitions_ptr->value); 1130 break; 1131 1132 case type_jump: 1133 printf("type_jump -> %d\n", transitions_ptr->value); 1134 break; 1135 1136 default: 1137 printf("UNEXPECTED TYPE\n"); 1138 break; 1139 } 1140 transitions_ptr++; 1141 search_states_ptr++; 1142 } 1143 printf("flags: "); 1144 if (!(compiler_common->flags & (REGEX_MATCH_BEGIN | REGEX_MATCH_END | REGEX_NEWLINE | REGEX_ID_CHECK | REGEX_FAKE_MATCH_BEGIN | REGEX_FAKE_MATCH_END))) 1145 printf("none "); 1146 if (compiler_common->flags & REGEX_MATCH_BEGIN) 1147 printf("REGEX_MATCH_BEGIN "); 1148 if (compiler_common->flags & REGEX_MATCH_END) 1149 printf("REGEX_MATCH_END "); 1150 if (compiler_common->flags & REGEX_NEWLINE) 1151 printf("REGEX_NEWLINE "); 1152 if (compiler_common->flags & REGEX_ID_CHECK) 1153 printf("REGEX_ID_CHECK "); 1154 if (compiler_common->flags & REGEX_FAKE_MATCH_BEGIN) 1155 printf("REGEX_FAKE_MATCH_BEGIN "); 1156 if (compiler_common->flags & REGEX_FAKE_MATCH_END) 1157 printf("REGEX_FAKE_MATCH_END "); 1158 if (compiler_common->longest_range_size > 0) 1159 printf("(longest range: %ld) ", (long)compiler_common->longest_range_size); 1160 printf("\n"); 1161 } 1162 1163 #endif 1164 1165 /* --------------------------------------------------------------------- */ 1166 /* Utilities */ 1167 /* --------------------------------------------------------------------- */ 1168 1169 static int generate_search_states(struct compiler_common *compiler_common) 1170 { 1171 struct stack_item *transitions_ptr = compiler_common->dfa_transitions; 1172 struct stack_item *transitions_end = transitions_ptr + compiler_common->dfa_size; 1173 struct stack_item *search_states_ptr; 1174 struct stack_item *rng_start = NULL; 1175 1176 compiler_common->terms_size = !(compiler_common->flags & REGEX_FAKE_MATCH_END) ? 1 : 2; 1177 compiler_common->longest_range_size = 0; 1178 compiler_common->search_states = SLJIT_MALLOC(sizeof(struct stack_item) * compiler_common->dfa_size, NULL); 1179 if (!compiler_common->search_states) 1180 return REGEX_MEMORY_ERROR; 1181 1182 search_states_ptr = compiler_common->search_states; 1183 while (transitions_ptr < transitions_end) { 1184 switch (transitions_ptr->type) { 1185 case type_begin: 1186 case type_end: 1187 search_states_ptr->type = 0; 1188 break; 1189 1190 case type_char: 1191 search_states_ptr->type = compiler_common->terms_size++; 1192 break; 1193 1194 case type_newline: 1195 if (transitions_ptr->value) 1196 search_states_ptr->type = 1; 1197 else 1198 search_states_ptr->type = compiler_common->terms_size++; 1199 SLJIT_ASSERT(search_states_ptr->type == 1 || search_states_ptr->type == 2); 1200 break; 1201 1202 case type_id: 1203 if (transitions_ptr->value > 0) 1204 compiler_common->flags |= REGEX_ID_CHECK; 1205 search_states_ptr->type = -1; 1206 break; 1207 1208 case type_rng_start: 1209 search_states_ptr->type = compiler_common->terms_size; 1210 rng_start = search_states_ptr; 1211 break; 1212 1213 case type_rng_end: 1214 search_states_ptr->type = compiler_common->terms_size++; 1215 /* Ok, this is a blunt over estimation :) */ 1216 if (compiler_common->longest_range_size < search_states_ptr - rng_start) 1217 compiler_common->longest_range_size = search_states_ptr - rng_start; 1218 break; 1219 1220 default: 1221 search_states_ptr->type = -1; 1222 break; 1223 } 1224 search_states_ptr->value = -1; 1225 search_states_ptr++; 1226 transitions_ptr++; 1227 } 1228 return REGEX_NO_ERROR; 1229 } 1230 1231 static int trace_transitions(int from, struct compiler_common *compiler_common) 1232 { 1233 int id = 0; 1234 struct stack *stack = &compiler_common->stack; 1235 struct stack *depth = &compiler_common->depth; 1236 struct stack_item *dfa_transitions = compiler_common->dfa_transitions; 1237 struct stack_item *search_states = compiler_common->search_states; 1238 1239 SLJIT_ASSERT(search_states[from].type >= 0); 1240 1241 from++; 1242 1243 /* Be prepared for any paths (loops, etc). */ 1244 while (1) { 1245 if (dfa_transitions[from].type == type_id) 1246 if (id < dfa_transitions[from].value) 1247 id = dfa_transitions[from].value; 1248 1249 if (search_states[from].value < id) { 1250 /* Forward step. */ 1251 if (search_states[from].value == -1) 1252 if (stack_push(stack, 0, from)) 1253 return REGEX_MEMORY_ERROR; 1254 search_states[from].value = id; 1255 1256 if (dfa_transitions[from].type == type_branch) { 1257 if (stack_push(depth, id, from)) 1258 return REGEX_MEMORY_ERROR; 1259 from++; 1260 continue; 1261 } 1262 else if (dfa_transitions[from].type == type_jump) { 1263 from = dfa_transitions[from].value; 1264 continue; 1265 } 1266 else if (search_states[from].type < 0) { 1267 from++; 1268 continue; 1269 } 1270 } 1271 1272 /* Back tracking. */ 1273 if (depth->count > 0) { 1274 id = stack_top(depth)->type; 1275 from = dfa_transitions[stack_pop(depth)->value].value; 1276 continue; 1277 } 1278 return 0; 1279 } 1280 } 1281 1282 /* --------------------------------------------------------------------- */ 1283 /* Code generator */ 1284 /* --------------------------------------------------------------------- */ 1285 1286 #define TERM_OFFSET_OF(index, offs) (((index) * no_states + (offs)) * sizeof(sljit_sw)) 1287 #define TERM_REL_OFFSET_OF(base, offs) ((base) + ((offs) * sizeof(sljit_sw))) 1288 1289 #define EMIT_OP1(type, arg1, arg2, arg3, arg4) \ 1290 CHECK(sljit_emit_op1(compiler, type, arg1, arg2, arg3, arg4)) 1291 1292 #define EMIT_OP2(type, arg1, arg2, arg3, arg4, arg5, arg6) \ 1293 CHECK(sljit_emit_op2(compiler, type, arg1, arg2, arg3, arg4, arg5, arg6)) 1294 1295 #define EMIT_LABEL(label) \ 1296 label = sljit_emit_label(compiler); \ 1297 CHECK(!label) 1298 1299 #define EMIT_JUMP(jump, type) \ 1300 jump = sljit_emit_jump(compiler, type); \ 1301 CHECK(!jump) 1302 1303 #define EMIT_CMP(jump, type, arg1, arg2, arg3, arg4) \ 1304 jump = sljit_emit_cmp(compiler, type, arg1, arg2, arg3, arg4); \ 1305 CHECK(!jump) 1306 1307 /* CHECK depends on the use case. */ 1308 1309 #define CHECK(exp) \ 1310 if (SLJIT_UNLIKELY(exp)) \ 1311 return REGEX_MEMORY_ERROR 1312 1313 static int compile_uncond_tran(struct compiler_common *compiler_common, int reg) 1314 { 1315 struct sljit_compiler *compiler = compiler_common->compiler; 1316 struct stack *stack = &compiler_common->stack; 1317 struct stack_item *search_states = compiler_common->search_states; 1318 int flags = compiler_common->flags; 1319 sljit_sw no_states = compiler_common->no_states; 1320 sljit_uw head = 0; 1321 sljit_sw offset, value; 1322 1323 if (reg != R_CURR_STATE || !(compiler_common->flags & REGEX_FAKE_MATCH_BEGIN)) { 1324 CHECK(trace_transitions(0, compiler_common)); 1325 } 1326 else { 1327 CHECK(trace_transitions(1, compiler_common)); 1328 } 1329 1330 while (stack->count > 0) { 1331 value = stack_pop(stack)->value; 1332 if (search_states[value].type >= 0) { 1333 offset = TERM_OFFSET_OF(search_states[value].type, 0); 1334 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 1), SLJIT_IMM, head); 1335 if (offset > 0) 1336 head = offset; 1337 1338 if (!(flags & REGEX_MATCH_BEGIN)) { 1339 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 2), R_TEMP, 0); 1340 if (flags & REGEX_ID_CHECK) { 1341 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 3), SLJIT_IMM, search_states[value].value); 1342 } 1343 } 1344 else if (flags & REGEX_ID_CHECK) { 1345 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 2), SLJIT_IMM, search_states[value].value); 1346 } 1347 } 1348 search_states[value].value = -1; 1349 } 1350 if (reg == R_NEXT_STATE) { 1351 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_NEXT_HEAD, 0); 1352 } 1353 else if (flags & REGEX_FAKE_MATCH_BEGIN) { 1354 SLJIT_ASSERT(compiler_common->dfa_transitions[1].type == type_newline && !compiler_common->dfa_transitions[1].value); 1355 offset = TERM_OFFSET_OF(search_states[1].type, 0); 1356 1357 SLJIT_ASSERT(!(flags & REGEX_MATCH_BEGIN)); 1358 1359 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 1), SLJIT_IMM, head); 1360 head = offset; 1361 1362 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 2), SLJIT_IMM, 1); 1363 if (flags & REGEX_ID_CHECK) { 1364 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 3), SLJIT_IMM, 0); 1365 } 1366 } 1367 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, head); 1368 return REGEX_NO_ERROR; 1369 } 1370 1371 static int compile_cond_tran(struct compiler_common *compiler_common, sljit_sw curr_index) 1372 { 1373 struct sljit_compiler *compiler = compiler_common->compiler; 1374 struct stack *stack = &compiler_common->stack; 1375 struct stack_item *search_states = compiler_common->search_states; 1376 sljit_sw offset; 1377 int flags; 1378 sljit_sw no_states; 1379 sljit_sw value; 1380 struct sljit_jump *jump1; 1381 struct sljit_jump *jump2; 1382 struct sljit_jump *jump3; 1383 struct sljit_jump *jump4; 1384 struct sljit_jump *jump5; 1385 struct sljit_label *label1; 1386 1387 flags = compiler_common->flags; 1388 no_states = compiler_common->no_states; 1389 1390 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_forward), SLJIT_IMM, 0); 1391 if (!(flags & (REGEX_ID_CHECK | REGEX_MATCH_BEGIN))) { 1392 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 2)); 1393 } 1394 1395 while (stack->count > 0) { 1396 value = stack_pop(stack)->value; 1397 if (search_states[value].type >= 0) { 1398 #ifdef REGEX_MATCH_VERBOSE 1399 if (flags & REGEX_MATCH_VERBOSE) 1400 printf("-> (%3d:%3d) ", search_states[value].type, search_states[value].value); 1401 #endif 1402 offset = TERM_OFFSET_OF(search_states[value].type, 0); 1403 1404 if (!(flags & REGEX_ID_CHECK)) { 1405 if (!(flags & REGEX_MATCH_BEGIN)) { 1406 /* Check whether item is inserted. */ 1407 EMIT_CMP(jump1, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), SLJIT_IMM, -1); 1408 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), R_NEXT_HEAD, 0); 1409 if (offset > 0) { 1410 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, offset); 1411 } 1412 EMIT_JUMP(jump2, SLJIT_JUMP); 1413 1414 /* Check whether old index <= index. */ 1415 EMIT_LABEL(label1); 1416 sljit_set_label(jump1, label1); 1417 1418 EMIT_CMP(jump1, SLJIT_LESS_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_sw), R_TEMP, 0); 1419 1420 EMIT_LABEL(label1); 1421 sljit_set_label(jump2, label1); 1422 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_sw), R_TEMP, 0); 1423 1424 EMIT_LABEL(label1); 1425 sljit_set_label(jump1, label1); 1426 } 1427 else { 1428 /* Check whether item is inserted. */ 1429 EMIT_CMP(jump1, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), SLJIT_IMM, -1); 1430 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), R_NEXT_HEAD, 0); 1431 if (offset > 0) { 1432 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, offset); 1433 } 1434 EMIT_LABEL(label1); 1435 sljit_set_label(jump1, label1); 1436 } 1437 } 1438 else { 1439 if (!(flags & REGEX_MATCH_BEGIN)) { 1440 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 2)); 1441 1442 /* Check whether item is inserted. */ 1443 EMIT_CMP(jump1, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), SLJIT_IMM, -1); 1444 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), R_NEXT_HEAD, 0); 1445 if (offset > 0) { 1446 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, offset); 1447 } 1448 EMIT_JUMP(jump2, SLJIT_JUMP); 1449 1450 /* Check whether old index != index. */ 1451 EMIT_LABEL(label1); 1452 sljit_set_label(jump1, label1); 1453 1454 EMIT_OP2(SLJIT_SUB | SLJIT_SET_Z | SLJIT_SET_LESS, SLJIT_UNUSED, 0, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_sw), R_TEMP, 0); 1455 EMIT_JUMP(jump1, SLJIT_LESS); 1456 EMIT_JUMP(jump3, SLJIT_NOT_EQUAL); /* Greater. */ 1457 1458 /* Old index == index. */ 1459 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 3)); 1460 if (search_states[value].value > 0) { 1461 EMIT_CMP(jump4, SLJIT_GREATER, R_TEMP, 0, SLJIT_IMM, search_states[value].value); 1462 1463 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_IMM, search_states[value].value); 1464 EMIT_LABEL(label1); 1465 sljit_set_label(jump4, label1); 1466 } 1467 1468 EMIT_OP2(SLJIT_SUB | SLJIT_SET_GREATER_EQUAL, SLJIT_UNUSED, 0, SLJIT_MEM1(R_NEXT_STATE), offset + 3 * sizeof(sljit_sw), R_TEMP, 0); 1469 EMIT_JUMP(jump4, SLJIT_GREATER_EQUAL); 1470 EMIT_JUMP(jump5, SLJIT_JUMP); 1471 1472 /* Overwrite index & id. */ 1473 EMIT_LABEL(label1); 1474 sljit_set_label(jump3, label1); 1475 sljit_set_label(jump2, label1); 1476 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_sw), R_TEMP, 0); 1477 1478 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 3)); 1479 if (search_states[value].value > 0) { 1480 EMIT_CMP(jump3, SLJIT_GREATER, R_TEMP, 0, SLJIT_IMM, search_states[value].value); 1481 1482 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_IMM, search_states[value].value); 1483 EMIT_LABEL(label1); 1484 sljit_set_label(jump3, label1); 1485 } 1486 1487 EMIT_LABEL(label1); 1488 sljit_set_label(jump5, label1); 1489 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + 3 * sizeof(sljit_sw), R_TEMP, 0); 1490 1491 /* Exit. */ 1492 EMIT_LABEL(label1); 1493 sljit_set_label(jump1, label1); 1494 sljit_set_label(jump4, label1); 1495 } 1496 else { 1497 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 2)); 1498 1499 if (search_states[value].value > 0) { 1500 EMIT_CMP(jump1, SLJIT_GREATER, R_TEMP, 0, SLJIT_IMM, search_states[value].value); 1501 1502 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_IMM, search_states[value].value); 1503 EMIT_LABEL(label1); 1504 sljit_set_label(jump1, label1); 1505 } 1506 1507 /* Check whether item is inserted. */ 1508 EMIT_CMP(jump1, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), SLJIT_IMM, -1); 1509 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), R_NEXT_HEAD, 0); 1510 if (offset > 0) { 1511 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, offset); 1512 } 1513 EMIT_JUMP(jump2, SLJIT_JUMP); 1514 1515 /* Check whether old id >= id. */ 1516 EMIT_LABEL(label1); 1517 sljit_set_label(jump1, label1); 1518 1519 EMIT_CMP(jump1, SLJIT_GREATER_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_sw), R_TEMP, 0); 1520 1521 EMIT_LABEL(label1); 1522 sljit_set_label(jump2, label1); 1523 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_sw), R_TEMP, 0); 1524 1525 EMIT_LABEL(label1); 1526 sljit_set_label(jump1, label1); 1527 } 1528 } 1529 } 1530 search_states[value].value = -1; 1531 } 1532 1533 #ifdef REGEX_MATCH_VERBOSE 1534 if (flags & REGEX_MATCH_VERBOSE) 1535 printf("\n"); 1536 #endif 1537 return REGEX_NO_ERROR; 1538 } 1539 1540 static int compile_end_check(struct compiler_common *compiler_common, struct sljit_label *end_check_label) 1541 { 1542 struct sljit_compiler *compiler = compiler_common->compiler; 1543 struct sljit_jump *jump; 1544 struct sljit_jump *clear_states_jump; 1545 struct sljit_label *label; 1546 struct sljit_label *leave_label; 1547 struct sljit_label *begin_loop_label; 1548 1549 /* Priority order: best_begin > best_end > best_id. 1550 In other words: 1551 if (new best_begin > old test_begin) do nothing 1552 otherwise we know that new_end > old_end, since R_CURR_INDEX ever increasing 1553 therefore we must overwrite all best_* variables (new_id also contains the highest id for this turn). */ 1554 1555 /* Both R_CURR_CHAR and R_BEST_BEGIN used as temporary registers. */ 1556 1557 if (!(compiler_common->flags & REGEX_MATCH_BEGIN)) { 1558 EMIT_OP1(SLJIT_MOV, R_CURR_CHAR, 0, SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 2)); 1559 EMIT_CMP(jump, !(compiler_common->flags & REGEX_MATCH_NON_GREEDY) ? SLJIT_LESS : SLJIT_LESS_EQUAL, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin), R_CURR_CHAR, 0); 1560 sljit_set_label(jump, end_check_label); 1561 1562 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin), R_CURR_CHAR, 0); 1563 if (!(compiler_common->flags & (REGEX_FAKE_MATCH_BEGIN | REGEX_FAKE_MATCH_END))) { 1564 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0); 1565 } 1566 else { 1567 if ((compiler_common->flags & (REGEX_FAKE_MATCH_BEGIN | REGEX_FAKE_MATCH_END)) == (REGEX_FAKE_MATCH_BEGIN | REGEX_FAKE_MATCH_END)) { 1568 EMIT_OP2(SLJIT_SUB, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0, SLJIT_IMM, 2); 1569 } 1570 else { 1571 EMIT_OP2(SLJIT_SUB, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0, SLJIT_IMM, 1); 1572 } 1573 } 1574 if (compiler_common->flags & REGEX_ID_CHECK) { 1575 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 3)); 1576 } 1577 1578 EMIT_CMP(clear_states_jump, SLJIT_LESS, R_CURR_CHAR, 0, R_BEST_BEGIN, 0); 1579 1580 EMIT_LABEL(leave_label); 1581 EMIT_OP1(SLJIT_MOV, R_BEST_BEGIN, 0, R_CURR_CHAR, 0); 1582 EMIT_JUMP(jump, SLJIT_JUMP); 1583 sljit_set_label(jump, end_check_label); 1584 1585 /* A loop to clear all states, which are > (or >=) than R_CURR_CHAR. */ 1586 EMIT_LABEL(label); 1587 sljit_set_label(clear_states_jump, label); 1588 1589 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_NEXT_HEAD, 0); 1590 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, 0); 1591 1592 /* Begin of the loop. */ 1593 EMIT_LABEL(begin_loop_label); 1594 EMIT_CMP(jump, SLJIT_EQUAL, R_TEMP, 0, SLJIT_IMM, 0); 1595 sljit_set_label(jump, leave_label); 1596 1597 EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, R_CURR_STATE, 0); 1598 EMIT_OP1(SLJIT_MOV, R_BEST_BEGIN, 0, SLJIT_MEM1(R_TEMP), sizeof(sljit_sw)); 1599 EMIT_CMP(clear_states_jump, !(compiler_common->flags & REGEX_MATCH_NON_GREEDY) ? SLJIT_GREATER : SLJIT_GREATER_EQUAL, SLJIT_MEM1(R_TEMP), 2 * sizeof(sljit_sw), R_CURR_CHAR, 0); 1600 1601 /* Case 1: keep this case. */ 1602 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_TEMP), sizeof(sljit_sw), R_NEXT_HEAD, 0); 1603 EMIT_OP2(SLJIT_SUB, R_NEXT_HEAD, 0, R_TEMP, 0, R_CURR_STATE, 0); 1604 1605 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_BEST_BEGIN, 0); 1606 EMIT_JUMP(jump, SLJIT_JUMP); 1607 sljit_set_label(jump, begin_loop_label); 1608 1609 /* Case 2: remove this case. */ 1610 EMIT_LABEL(label); 1611 sljit_set_label(clear_states_jump, label); 1612 1613 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_TEMP), sizeof(sljit_sw), SLJIT_IMM, -1); 1614 1615 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_BEST_BEGIN, 0); 1616 EMIT_JUMP(jump, SLJIT_JUMP); 1617 sljit_set_label(jump, begin_loop_label); 1618 } 1619 else { 1620 EMIT_OP1(SLJIT_MOV, R_BEST_BEGIN, 0, SLJIT_IMM, 0); 1621 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin), SLJIT_IMM, 0); 1622 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0); 1623 if (compiler_common->flags & REGEX_ID_CHECK) { 1624 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 2)); 1625 } 1626 EMIT_JUMP(jump, SLJIT_JUMP); 1627 sljit_set_label(jump, end_check_label); 1628 } 1629 return REGEX_NO_ERROR; 1630 } 1631 1632 static int compile_leave_fast_forward(struct compiler_common *compiler_common, struct sljit_label *fast_forward_label) 1633 { 1634 struct sljit_compiler *compiler = compiler_common->compiler; 1635 struct stack *stack = &compiler_common->stack; 1636 struct stack_item *dfa_transitions = compiler_common->dfa_transitions; 1637 struct stack_item *search_states = compiler_common->search_states; 1638 int ind; 1639 struct sljit_jump *jump; 1640 int init_range = 1, prev_value = 0; 1641 1642 while (stack->count > 0) { 1643 ind = stack_pop(stack)->value; 1644 search_states[ind].value = -1; 1645 if (search_states[ind].type >= 0) { 1646 if (dfa_transitions[ind].type == type_char) { 1647 EMIT_CMP(jump, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, dfa_transitions[ind].value); 1648 sljit_set_label(jump, fast_forward_label); 1649 } 1650 else if (dfa_transitions[ind].type == type_rng_start) { 1651 SLJIT_ASSERT(!dfa_transitions[ind].value); 1652 ind++; 1653 while (dfa_transitions[ind].type != type_rng_end) { 1654 if (dfa_transitions[ind].type == type_rng_char) { 1655 EMIT_CMP(jump, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, dfa_transitions[ind].value); 1656 sljit_set_label(jump, fast_forward_label); 1657 } 1658 else { 1659 SLJIT_ASSERT(dfa_transitions[ind].type == type_rng_left); 1660 if (init_range) { 1661 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_CURR_CHAR, 0); 1662 init_range = 0; 1663 } 1664 if (dfa_transitions[ind].value != prev_value) { 1665 /* Best compatibility to all archs. */ 1666 prev_value -= dfa_transitions[ind].value; 1667 if (prev_value < 0) { 1668 EMIT_OP2(SLJIT_SUB, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, -prev_value); 1669 } 1670 else { 1671 EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, prev_value); 1672 } 1673 prev_value = dfa_transitions[ind].value; 1674 } 1675 EMIT_CMP(jump, SLJIT_LESS_EQUAL, R_TEMP, 0, SLJIT_IMM, dfa_transitions[ind + 1].value - dfa_transitions[ind].value); 1676 sljit_set_label(jump, fast_forward_label); 1677 ind++; 1678 } 1679 ind++; 1680 } 1681 } 1682 else { 1683 SLJIT_ASSERT(dfa_transitions[ind].type == type_newline); 1684 EMIT_CMP(jump, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, '\n'); 1685 sljit_set_label(jump, fast_forward_label); 1686 EMIT_CMP(jump, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, '\r'); 1687 sljit_set_label(jump, fast_forward_label); 1688 } 1689 } 1690 } 1691 return REGEX_NO_ERROR; 1692 } 1693 1694 static int compile_newline_check(struct compiler_common *compiler_common, sljit_sw ind) 1695 { 1696 struct sljit_compiler *compiler = compiler_common->compiler; 1697 struct sljit_jump *jump1; 1698 struct sljit_jump *jump2; 1699 struct sljit_label *label; 1700 sljit_sw no_states; 1701 sljit_sw offset; 1702 1703 /* Check whether a new-line character is found. */ 1704 EMIT_CMP(jump1, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, '\n'); 1705 EMIT_CMP(jump2, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, '\r'); 1706 1707 no_states = compiler_common->no_states; 1708 offset = TERM_OFFSET_OF(compiler_common->search_states[ind].type, 1); 1709 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), offset); 1710 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_CURR_STATE), offset, SLJIT_IMM, -1); 1711 CHECK(sljit_emit_ijump(compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0)); 1712 1713 EMIT_LABEL(label); 1714 sljit_set_label(jump1, label); 1715 sljit_set_label(jump2, label); 1716 return REGEX_NO_ERROR; 1717 } 1718 1719 #undef CHECK 1720 1721 #define CHECK(exp) \ 1722 if (SLJIT_UNLIKELY(exp)) \ 1723 return 0 1724 1725 static SLJIT_INLINE void range_set_label(struct sljit_jump **range_jump_list, struct sljit_label *label) 1726 { 1727 while (*range_jump_list) { 1728 sljit_set_label(*range_jump_list, label); 1729 range_jump_list++; 1730 } 1731 } 1732 1733 static sljit_sw compile_range_check(struct compiler_common *compiler_common, sljit_sw ind) 1734 { 1735 struct sljit_compiler *compiler = compiler_common->compiler; 1736 struct stack_item *dfa_transitions = compiler_common->dfa_transitions; 1737 struct sljit_jump **range_jump_list = compiler_common->range_jump_list; 1738 int invert = dfa_transitions[ind].value; 1739 struct sljit_label *label; 1740 sljit_sw no_states; 1741 sljit_sw offset; 1742 int init_range = 1, prev_value = 0; 1743 1744 ind++; 1745 1746 while (dfa_transitions[ind].type != type_rng_end) { 1747 if (dfa_transitions[ind].type == type_rng_char) { 1748 EMIT_CMP(*range_jump_list, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, dfa_transitions[ind].value); 1749 range_jump_list++; 1750 } 1751 else { 1752 SLJIT_ASSERT(dfa_transitions[ind].type == type_rng_left); 1753 if (init_range) { 1754 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_CURR_CHAR, 0); 1755 init_range = 0; 1756 } 1757 if (dfa_transitions[ind].value != prev_value) { 1758 /* Best compatibility to all archs. */ 1759 prev_value -= dfa_transitions[ind].value; 1760 if (prev_value < 0) { 1761 EMIT_OP2(SLJIT_SUB, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, -prev_value); 1762 } 1763 else { 1764 EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, prev_value); 1765 } 1766 prev_value = dfa_transitions[ind].value; 1767 } 1768 EMIT_CMP(*range_jump_list, SLJIT_LESS_EQUAL, R_TEMP, 0, SLJIT_IMM, dfa_transitions[ind + 1].value - dfa_transitions[ind].value); 1769 range_jump_list++; 1770 ind++; 1771 } 1772 ind++; 1773 } 1774 1775 *range_jump_list = NULL; 1776 1777 if (!invert) { 1778 no_states = compiler_common->no_states; 1779 offset = TERM_OFFSET_OF(compiler_common->search_states[ind].type, 1); 1780 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), offset); 1781 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_CURR_STATE), offset, SLJIT_IMM, -1); 1782 CHECK(sljit_emit_ijump(compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0)); 1783 1784 EMIT_LABEL(label); 1785 range_set_label(compiler_common->range_jump_list, label); 1786 /* Clears the jump list. */ 1787 *compiler_common->range_jump_list = NULL; 1788 } 1789 return ind; 1790 } 1791 1792 #undef TERM_OFFSET_OF 1793 #undef EMIT_OP1 1794 #undef EMIT_OP2 1795 #undef EMIT_LABEL 1796 #undef EMIT_JUMP 1797 #undef EMIT_CMP 1798 #undef CHECK 1799 1800 /* --------------------------------------------------------------------- */ 1801 /* Main compiler */ 1802 /* --------------------------------------------------------------------- */ 1803 1804 #define TERM_OFFSET_OF(ind, offs) (((ind) * compiler_common.no_states + (offs)) * sizeof(sljit_sw)) 1805 1806 #define EMIT_OP1(type, arg1, arg2, arg3, arg4) \ 1807 CHECK(sljit_emit_op1(compiler_common.compiler, type, arg1, arg2, arg3, arg4)) 1808 1809 #define EMIT_OP2(type, arg1, arg2, arg3, arg4, arg5, arg6) \ 1810 CHECK(sljit_emit_op2(compiler_common.compiler, type, arg1, arg2, arg3, arg4, arg5, arg6)) 1811 1812 #define EMIT_LABEL(label) \ 1813 label = sljit_emit_label(compiler_common.compiler); \ 1814 CHECK(!label) 1815 1816 #define EMIT_JUMP(jump, type) \ 1817 jump = sljit_emit_jump(compiler_common.compiler, type); \ 1818 CHECK(!jump) 1819 1820 #define EMIT_CMP(jump, type, arg1, arg2, arg3, arg4) \ 1821 jump = sljit_emit_cmp(compiler_common.compiler, type, arg1, arg2, arg3, arg4); \ 1822 CHECK(!jump) 1823 1824 /* A do {} while(0) expression helps to avoid goto statements. */ 1825 #define BEGIN_GUARD \ 1826 do { 1827 1828 #define END_GUARD \ 1829 } while(0); 1830 1831 #define CHECK(exp) \ 1832 if (SLJIT_UNLIKELY(exp)) \ 1833 break; 1834 1835 struct regex_machine* regex_compile(const regex_char_t *regex_string, int length, int re_flags, int *error) 1836 { 1837 struct compiler_common compiler_common; 1838 sljit_sw ind; 1839 int error_code, done, suggest_fast_forward; 1840 /* ID of an empty match (-1 if not reachable). */ 1841 int empty_match_id; 1842 1843 struct sljit_jump *jump; 1844 struct sljit_jump *best_match_found_jump; 1845 struct sljit_jump *fast_forward_jump = NULL; 1846 struct sljit_jump *length_is_zero_jump; 1847 struct sljit_jump *end_check_jump = NULL; 1848 struct sljit_jump *best_match_check_jump = NULL; 1849 struct sljit_jump *non_greedy_end_jump = NULL; 1850 struct sljit_label *label; 1851 struct sljit_label *end_check_label = NULL; 1852 struct sljit_label *start_label; 1853 struct sljit_label *fast_forward_label; 1854 struct sljit_label *fast_forward_return_label; 1855 1856 if (error) 1857 *error = REGEX_NO_ERROR; 1858 #ifdef REGEX_MATCH_VERBOSE 1859 compiler_common.flags = re_flags & (REGEX_MATCH_BEGIN | REGEX_MATCH_END | REGEX_MATCH_NON_GREEDY | REGEX_NEWLINE | REGEX_MATCH_VERBOSE); 1860 #else 1861 compiler_common.flags = re_flags & (REGEX_MATCH_BEGIN | REGEX_MATCH_END | REGEX_MATCH_NON_GREEDY | REGEX_NEWLINE); 1862 #endif 1863 1864 /* Step 1: parsing (Left->Right). 1865 Syntax check and AST generator. */ 1866 error_code = parse(regex_string, length, &compiler_common); 1867 if (error_code) { 1868 stack_destroy(&compiler_common.stack); 1869 if (error) 1870 *error = error_code; 1871 return NULL; 1872 } 1873 1874 /* Step 2: generating branches (Right->Left). */ 1875 error_code = generate_transitions(&compiler_common); 1876 stack_destroy(&compiler_common.stack); 1877 stack_destroy(&compiler_common.depth); 1878 if (error_code) { 1879 if (compiler_common.dfa_transitions) 1880 SLJIT_FREE(compiler_common.dfa_transitions, NULL); 1881 if (error) 1882 *error = error_code; 1883 return NULL; 1884 } 1885 1886 /* Step 3: Generate necessary data for depth-first search (Left->Right). */ 1887 error_code = generate_search_states(&compiler_common); 1888 if (error_code) { 1889 SLJIT_FREE(compiler_common.dfa_transitions, NULL); 1890 if (error) 1891 *error = error_code; 1892 return NULL; 1893 } 1894 1895 #ifdef REGEX_MATCH_VERBOSE 1896 if (compiler_common.flags & REGEX_MATCH_VERBOSE) 1897 verbose_transitions(&compiler_common); 1898 #endif 1899 1900 /* Step 4: Left->Right generate code. */ 1901 stack_init(&compiler_common.stack); 1902 stack_init(&compiler_common.depth); 1903 done = 0; 1904 compiler_common.machine = NULL; 1905 compiler_common.compiler = NULL; 1906 compiler_common.range_jump_list = NULL; 1907 1908 BEGIN_GUARD 1909 1910 compiler_common.machine = (struct regex_machine*)SLJIT_MALLOC(sizeof(struct regex_machine) + (compiler_common.terms_size - 1) * sizeof(sljit_uw), NULL); 1911 CHECK(!compiler_common.machine); 1912 1913 compiler_common.compiler = sljit_create_compiler(NULL); 1914 CHECK(!compiler_common.compiler); 1915 1916 if (compiler_common.longest_range_size > 0) { 1917 compiler_common.range_jump_list = (struct sljit_jump**)SLJIT_MALLOC(sizeof(struct sljit_jump*) * compiler_common.longest_range_size, NULL); 1918 CHECK(!compiler_common.range_jump_list); 1919 } 1920 1921 if ((compiler_common.flags & REGEX_ID_CHECK) && !(compiler_common.flags & REGEX_MATCH_BEGIN)) 1922 compiler_common.no_states = 4; 1923 else if (!(compiler_common.flags & REGEX_ID_CHECK) && (compiler_common.flags & REGEX_MATCH_BEGIN)) 1924 compiler_common.no_states = 2; 1925 else 1926 compiler_common.no_states = 3; 1927 1928 compiler_common.machine->flags = compiler_common.flags; 1929 compiler_common.machine->no_states = compiler_common.no_states; 1930 compiler_common.machine->size = compiler_common.machine->no_states * compiler_common.terms_size; 1931 1932 /* Study the regular expression. */ 1933 empty_match_id = -1; 1934 suggest_fast_forward = 1; 1935 if (!(compiler_common.flags & REGEX_FAKE_MATCH_BEGIN)) { 1936 CHECK(trace_transitions(0, &compiler_common)); 1937 while (compiler_common.stack.count > 0) { 1938 ind = stack_pop(&compiler_common.stack)->value; 1939 if (compiler_common.search_states[ind].type == 0) { 1940 SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type == type_end); 1941 suggest_fast_forward = 0; 1942 empty_match_id = compiler_common.search_states[ind].value; 1943 } 1944 else if (compiler_common.search_states[ind].type > 0) { 1945 SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type != type_end); 1946 if (compiler_common.dfa_transitions[ind].type == type_rng_start && compiler_common.dfa_transitions[ind].value) 1947 suggest_fast_forward = 0; 1948 } 1949 compiler_common.search_states[ind].value = -1; 1950 } 1951 } 1952 else { 1953 SLJIT_ASSERT(compiler_common.dfa_transitions[1].type == type_newline); 1954 CHECK(trace_transitions(1, &compiler_common)); 1955 while (compiler_common.stack.count > 0) { 1956 ind = stack_pop(&compiler_common.stack)->value; 1957 if (compiler_common.search_states[ind].type == 0) { 1958 SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type == type_end); 1959 suggest_fast_forward = 0; 1960 empty_match_id = compiler_common.search_states[ind].value; 1961 } 1962 compiler_common.search_states[ind].value = -1; 1963 } 1964 } 1965 1966 /* Step 4.1: Generate entry. */ 1967 CHECK(sljit_emit_enter(compiler_common.compiler, 0, 3, 5, 5, 0, 0, 0)); 1968 1969 /* Copy arguments to their place. */ 1970 EMIT_OP1(SLJIT_MOV, R_REGEX_MATCH, 0, SLJIT_S0, 0); 1971 EMIT_OP1(SLJIT_MOV, R_STRING, 0, SLJIT_S1, 0); 1972 EMIT_OP2(SLJIT_ADD, R_LENGTH, 0, SLJIT_S2, 0, SLJIT_IMM, 1); 1973 1974 /* Init global registers. */ 1975 EMIT_OP1(SLJIT_MOV, R_CURR_STATE, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, current)); 1976 EMIT_OP1(SLJIT_MOV, R_NEXT_STATE, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, next)); 1977 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, head)); 1978 EMIT_OP1(SLJIT_MOV, R_BEST_BEGIN, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin)); 1979 EMIT_OP1(SLJIT_MOV, R_CURR_INDEX, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, index)); 1980 1981 /* Check whether the best match has already found in a previous frame. */ 1982 EMIT_CMP(jump, SLJIT_EQUAL, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_quit), SLJIT_IMM, 0); 1983 EMIT_JUMP(best_match_found_jump, SLJIT_JUMP); 1984 1985 #ifdef REGEX_MATCH_VERBOSE 1986 if (compiler_common.flags & REGEX_MATCH_VERBOSE) 1987 printf("\n-----------------\nTrace\n-----------------\n"); 1988 #endif 1989 1990 /* Step 4.2: Generate code for state 0. */ 1991 EMIT_LABEL(label); 1992 compiler_common.machine->entry_addrs[0] = (sljit_uw)label; 1993 1994 /* Swapping current and next. */ 1995 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_CURR_STATE, 0); 1996 EMIT_OP1(SLJIT_MOV, R_CURR_STATE, 0, R_NEXT_STATE, 0); 1997 EMIT_OP1(SLJIT_MOV, R_NEXT_STATE, 0, R_TEMP, 0); 1998 1999 /* Checking whether the best case needs to be updated. */ 2000 if (!(compiler_common.flags & REGEX_MATCH_END)) { 2001 EMIT_CMP(end_check_jump, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 1), SLJIT_IMM, -1); 2002 EMIT_LABEL(end_check_label); 2003 } 2004 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), TERM_REL_OFFSET_OF(0, 1), SLJIT_IMM, -1); 2005 EMIT_OP2(SLJIT_ADD, R_CURR_INDEX, 0, R_CURR_INDEX, 0, SLJIT_IMM, 1); 2006 2007 /* Checking whether best case has already found. */ 2008 if (!(compiler_common.flags & REGEX_MATCH_END) || (compiler_common.flags & REGEX_MATCH_BEGIN)) { 2009 if (!(compiler_common.flags & REGEX_MATCH_BEGIN)) { 2010 /* We can bail out if no more active states remain and R_BEST_BEGIN != -1. */ 2011 EMIT_CMP(best_match_check_jump, SLJIT_NOT_EQUAL, R_BEST_BEGIN, 0, SLJIT_IMM, -1); 2012 } 2013 else { 2014 /* We can bail out if no more active states remain (regardless of R_BEST_BEGIN). */ 2015 EMIT_CMP(best_match_check_jump, SLJIT_EQUAL, R_NEXT_HEAD, 0, SLJIT_IMM, 0); 2016 } 2017 } 2018 2019 EMIT_LABEL(start_label); 2020 sljit_set_label(jump, start_label); 2021 2022 if (!(compiler_common.flags & REGEX_MATCH_BEGIN) && suggest_fast_forward) { 2023 EMIT_CMP(fast_forward_jump, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_forward), SLJIT_IMM, 0); 2024 } 2025 2026 /* Loading the next character. */ 2027 EMIT_OP2(SLJIT_SUB | SLJIT_SET_Z, R_LENGTH, 0, R_LENGTH, 0, SLJIT_IMM, 1); 2028 EMIT_JUMP(length_is_zero_jump, SLJIT_EQUAL); 2029 2030 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_STRING, 0); 2031 #ifdef REGEX_USE_8BIT_CHARS 2032 EMIT_OP1(SLJIT_MOV_U8, R_CURR_CHAR, 0, SLJIT_MEM1(R_TEMP), 0); 2033 EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, 1); 2034 #else 2035 EMIT_OP1(SLJIT_MOV_UH, R_CURR_CHAR, 0, SLJIT_MEM1(R_TEMP), 0); 2036 EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, 2); 2037 #endif 2038 EMIT_OP1(SLJIT_MOV, R_STRING, 0, R_TEMP, 0); 2039 2040 #ifdef REGEX_MATCH_VERBOSE 2041 if (compiler_common.flags & REGEX_MATCH_VERBOSE) { 2042 printf("(%3d): ", 0); 2043 CHECK(trace_transitions(0, &compiler_common)); 2044 while (compiler_common.stack.count > 0) { 2045 ind = stack_pop(&compiler_common.stack)->value; 2046 if (compiler_common.search_states[ind].type >= 0) 2047 printf("-> (%3d:%3d) ", compiler_common.search_states[ind].type, compiler_common.search_states[ind].value); 2048 compiler_common.search_states[ind].value = -1; 2049 } 2050 printf("\n"); 2051 } 2052 #endif 2053 2054 EMIT_LABEL(fast_forward_return_label); 2055 if (!(compiler_common.flags & REGEX_MATCH_BEGIN)) { 2056 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_forward), SLJIT_IMM, 1); 2057 if (!(compiler_common.flags & REGEX_MATCH_END)) { 2058 EMIT_CMP(jump, SLJIT_NOT_EQUAL, R_BEST_BEGIN, 0, SLJIT_IMM, -1); 2059 } 2060 2061 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_CURR_INDEX, 0); 2062 CHECK(compile_uncond_tran(&compiler_common, R_NEXT_STATE)); 2063 /* And branching to the first state. */ 2064 CHECK(sljit_emit_ijump(compiler_common.compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0)); 2065 2066 if (!(compiler_common.flags & REGEX_MATCH_END)) { 2067 EMIT_LABEL(label); 2068 sljit_set_label(jump, label); 2069 } 2070 } 2071 /* This is the case where we only have to reset the R_NEXT_HEAD. */ 2072 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_NEXT_HEAD, 0); 2073 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, 0); 2074 CHECK(sljit_emit_ijump(compiler_common.compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0)); 2075 2076 /* Fast-forward loop. */ 2077 if (fast_forward_jump) { 2078 /* Quit from fast-forward loop. */ 2079 EMIT_LABEL(fast_forward_label); 2080 EMIT_OP2(SLJIT_SUB, R_TEMP, 0, R_NEXT_HEAD, 0, SLJIT_IMM, 1); 2081 EMIT_OP1(SLJIT_MOV, R_LENGTH, 0, R_NEXT_STATE, 0); 2082 EMIT_OP1(SLJIT_MOV, R_STRING, 0, R_CURR_STATE, 0); 2083 EMIT_OP1(SLJIT_MOV, R_CURR_INDEX, 0, R_NEXT_HEAD, 0); 2084 EMIT_OP1(SLJIT_MOV, R_NEXT_STATE, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, next)); 2085 EMIT_OP1(SLJIT_MOV, R_CURR_STATE, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, current)); 2086 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, head)); 2087 2088 /* Update the start field of the locations. */ 2089 CHECK(trace_transitions(0, &compiler_common)); 2090 while (compiler_common.stack.count > 0) { 2091 ind = stack_pop(&compiler_common.stack)->value; 2092 if (compiler_common.search_states[ind].type >= 0) { 2093 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(compiler_common.search_states[ind].type, 2), R_TEMP, 0); 2094 } 2095 compiler_common.search_states[ind].value = -1; 2096 } 2097 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_forward), SLJIT_IMM, 0); 2098 EMIT_JUMP(jump, SLJIT_JUMP); 2099 sljit_set_label(jump, fast_forward_return_label); 2100 2101 /* Start fast-forward. */ 2102 EMIT_LABEL(label); 2103 sljit_set_label(fast_forward_jump, label); 2104 2105 /* Moving everything to registers. */ 2106 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, next), R_NEXT_STATE, 0); 2107 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, current), R_CURR_STATE, 0); 2108 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, head), R_NEXT_HEAD, 0); 2109 EMIT_OP1(SLJIT_MOV, R_NEXT_STATE, 0, R_LENGTH, 0); 2110 EMIT_OP1(SLJIT_MOV, R_CURR_STATE, 0, R_STRING, 0); 2111 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, R_CURR_INDEX, 0); 2112 2113 /* Fast forward mainloop. */ 2114 EMIT_LABEL(label); 2115 EMIT_OP2(SLJIT_SUB | SLJIT_SET_Z, R_NEXT_STATE, 0, R_NEXT_STATE, 0, SLJIT_IMM, 1); 2116 EMIT_JUMP(fast_forward_jump, SLJIT_EQUAL); 2117 2118 #ifdef REGEX_USE_8BIT_CHARS 2119 EMIT_OP1(SLJIT_MOV_U8, R_CURR_CHAR, 0, SLJIT_MEM1(R_CURR_STATE), 0); 2120 EMIT_OP2(SLJIT_ADD, R_CURR_STATE, 0, R_CURR_STATE, 0, SLJIT_IMM, 1); 2121 #else 2122 EMIT_OP1(SLJIT_MOV_UH, R_CURR_CHAR, 0, SLJIT_MEM1(R_CURR_STATE), 0); 2123 EMIT_OP2(SLJIT_ADD, R_CURR_STATE, 0, R_CURR_STATE, 0, SLJIT_IMM, 2); 2124 #endif 2125 2126 CHECK(trace_transitions(0, &compiler_common)); 2127 CHECK(compile_leave_fast_forward(&compiler_common, fast_forward_label)); 2128 2129 EMIT_OP2(SLJIT_ADD, R_NEXT_HEAD, 0, R_NEXT_HEAD, 0, SLJIT_IMM, 1); 2130 EMIT_JUMP(jump, SLJIT_JUMP); 2131 sljit_set_label(jump, label); 2132 2133 /* String is finished. */ 2134 EMIT_LABEL(label); 2135 sljit_set_label(fast_forward_jump, label); 2136 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, index), R_NEXT_HEAD, 0); 2137 EMIT_JUMP(fast_forward_jump, SLJIT_JUMP); 2138 } 2139 2140 /* End check. */ 2141 if (end_check_jump) { 2142 EMIT_LABEL(label); 2143 sljit_set_label(end_check_jump, label); 2144 2145 if (!(compiler_common.flags & REGEX_MATCH_NON_GREEDY) || !(compiler_common.flags & REGEX_MATCH_BEGIN)) { 2146 CHECK(compile_end_check(&compiler_common, end_check_label)); 2147 } 2148 else { 2149 /* Since we leave, we do not need to update the R_BEST_BEGIN. */ 2150 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin), SLJIT_IMM, 0); 2151 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0); 2152 if (compiler_common.flags & REGEX_ID_CHECK) { 2153 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 2)); 2154 } 2155 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_quit), SLJIT_IMM, 1); 2156 EMIT_JUMP(non_greedy_end_jump, SLJIT_JUMP); 2157 } 2158 } 2159 2160 /* Finish check. */ 2161 if (best_match_check_jump) { 2162 EMIT_LABEL(label); 2163 sljit_set_label(best_match_check_jump, label); 2164 2165 if (!(compiler_common.flags & REGEX_MATCH_BEGIN)) { 2166 EMIT_CMP(jump, SLJIT_NOT_EQUAL, R_NEXT_HEAD, 0, SLJIT_IMM, 0); 2167 sljit_set_label(jump, start_label); 2168 } 2169 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_quit), SLJIT_IMM, 1); 2170 } 2171 2172 /* Leaving matching and storing the necessary values. */ 2173 EMIT_LABEL(label); 2174 sljit_set_label(length_is_zero_jump, label); 2175 if (non_greedy_end_jump) 2176 sljit_set_label(non_greedy_end_jump, label); 2177 2178 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, index), R_CURR_INDEX, 0); 2179 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, head), R_NEXT_HEAD, 0); 2180 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, next), R_NEXT_STATE, 0); 2181 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, current), R_CURR_STATE, 0); 2182 2183 /* Exit from JIT. */ 2184 EMIT_LABEL(label); 2185 sljit_set_label(best_match_found_jump, label); 2186 if (fast_forward_jump) 2187 sljit_set_label(fast_forward_jump, label); 2188 CHECK(sljit_emit_return(compiler_common.compiler, SLJIT_UNUSED, 0, 0)); 2189 2190 for (ind = 1; ind < compiler_common.dfa_size - 1; ind++) { 2191 if (compiler_common.search_states[ind].type >= 0) { 2192 SLJIT_ASSERT(compiler_common.search_states[ind].type < compiler_common.terms_size); 2193 EMIT_LABEL(label); 2194 compiler_common.machine->entry_addrs[compiler_common.search_states[ind].type] = (sljit_uw)label; 2195 2196 if (compiler_common.dfa_transitions[ind].type == type_char) { 2197 EMIT_CMP(jump, SLJIT_NOT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, compiler_common.dfa_transitions[ind].value); 2198 } 2199 else if (compiler_common.dfa_transitions[ind].type == type_rng_start) { 2200 ind = compile_range_check(&compiler_common, ind); 2201 CHECK(!ind); 2202 } 2203 else { 2204 SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type == type_newline); 2205 CHECK(compile_newline_check(&compiler_common, ind)); 2206 } 2207 2208 CHECK(trace_transitions(ind, &compiler_common)); 2209 #ifdef REGEX_MATCH_VERBOSE 2210 if (compiler_common.flags & REGEX_MATCH_VERBOSE) 2211 printf("(%3d): ", compiler_common.search_states[ind].type); 2212 #endif 2213 CHECK(compile_cond_tran(&compiler_common, compiler_common.search_states[ind].type)); 2214 2215 if (compiler_common.dfa_transitions[ind].type == type_char) { 2216 EMIT_LABEL(label); 2217 sljit_set_label(jump, label); 2218 } 2219 else if (compiler_common.dfa_transitions[ind].type == type_rng_end) { 2220 EMIT_LABEL(label); 2221 range_set_label(compiler_common.range_jump_list, label); 2222 } 2223 else { 2224 SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type == type_newline); 2225 } 2226 2227 /* Branch to the next item in the list. */ 2228 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(compiler_common.search_states[ind].type, 1)); 2229 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(compiler_common.search_states[ind].type, 1), SLJIT_IMM, -1); 2230 CHECK(sljit_emit_ijump(compiler_common.compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0)); 2231 } 2232 } 2233 2234 if (ind == compiler_common.dfa_size - 1) { 2235 /* Generate an init stub function. */ 2236 EMIT_LABEL(label); 2237 CHECK(sljit_emit_enter(compiler_common.compiler, 0, 2, 3, 3, 0, 0, 0)); 2238 2239 if (empty_match_id == -1) { 2240 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), SLJIT_OFFSETOF(struct regex_match, best_begin), SLJIT_IMM, -1); 2241 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_IMM, 0); 2242 } 2243 else { 2244 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), SLJIT_OFFSETOF(struct regex_match, best_begin), SLJIT_IMM, 0); 2245 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_IMM, empty_match_id); 2246 } 2247 2248 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), SLJIT_OFFSETOF(struct regex_match, index), SLJIT_IMM, !(compiler_common.flags & REGEX_FAKE_MATCH_BEGIN) ? 1 : 2); 2249 2250 if (!(compiler_common.flags & REGEX_MATCH_NON_GREEDY) || empty_match_id == -1) { 2251 /* The else is a really rare event, so we still generate an empty function instead of a runtime pointer check. */ 2252 SLJIT_ASSERT(R_CURR_STATE == SLJIT_S0); 2253 if (!(compiler_common.flags & REGEX_MATCH_BEGIN)) { 2254 /* R_CURR_INDEX (put to R_TEMP) is zero. */ 2255 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_IMM, 0); 2256 } 2257 CHECK(compile_uncond_tran(&compiler_common, R_CURR_STATE)); 2258 } 2259 else { 2260 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, 0); 2261 } 2262 CHECK(sljit_emit_return(compiler_common.compiler, SLJIT_MOV, R_NEXT_HEAD, 0)); 2263 2264 compiler_common.machine->continue_match = sljit_generate_code(compiler_common.compiler); 2265 #ifndef SLJIT_INDIRECT_CALL 2266 compiler_common.machine->u.init_match = (void*)(sljit_sw)sljit_get_label_addr(label); 2267 #else 2268 sljit_set_function_context(&compiler_common.machine->u.init_match, &compiler_common.machine->context, sljit_get_label_addr(label), regex_compile); 2269 #endif 2270 #ifdef REGEX_MATCH_VERBOSE 2271 if (compiler_common.flags & REGEX_MATCH_VERBOSE) 2272 printf("Continue match: %p Init match: %p\n\n", compiler_common.machine->continue_match, compiler_common.machine->u.init_match); 2273 #endif 2274 if (compiler_common.machine->continue_match) { 2275 for (ind = 0; ind < compiler_common.terms_size; ++ind) 2276 compiler_common.machine->entry_addrs[ind] = sljit_get_label_addr((struct sljit_label*)compiler_common.machine->entry_addrs[ind]); 2277 done = 1; 2278 } 2279 } 2280 END_GUARD 2281 2282 stack_destroy(&compiler_common.stack); 2283 stack_destroy(&compiler_common.depth); 2284 SLJIT_FREE(compiler_common.dfa_transitions, NULL); 2285 SLJIT_FREE(compiler_common.search_states, NULL); 2286 if (compiler_common.range_jump_list) 2287 SLJIT_FREE(compiler_common.range_jump_list, NULL); 2288 if (compiler_common.compiler) 2289 sljit_free_compiler(compiler_common.compiler); 2290 if (done) 2291 return compiler_common.machine; 2292 2293 if (compiler_common.machine) { 2294 SLJIT_FREE(compiler_common.machine, NULL); 2295 } 2296 if (error) 2297 *error = REGEX_MEMORY_ERROR; 2298 return NULL; 2299 } 2300 2301 #undef TERM_OFFSET_OF 2302 #undef EMIT_OP1 2303 #undef EMIT_OP2 2304 #undef EMIT_LABEL 2305 #undef EMIT_JUMP 2306 #undef EMIT_CMP 2307 #undef BEGIN_GUARD 2308 #undef END_GUARD 2309 #undef CHECK 2310 2311 void regex_free_machine(struct regex_machine *machine) 2312 { 2313 sljit_free_code(machine->continue_match); 2314 SLJIT_FREE(machine, NULL); 2315 } 2316 2317 const char* regex_get_platform_name(void) 2318 { 2319 return sljit_get_platform_name(); 2320 } 2321 2322 /* --------------------------------------------------------------------- */ 2323 /* Mathching utilities */ 2324 /* --------------------------------------------------------------------- */ 2325 2326 struct regex_match* regex_begin_match(struct regex_machine *machine) 2327 { 2328 sljit_sw *ptr1; 2329 sljit_sw *ptr2; 2330 sljit_sw *end; 2331 sljit_sw *entry_addrs; 2332 2333 struct regex_match *match = (struct regex_match*)SLJIT_MALLOC(sizeof(struct regex_match) + (machine->size * 2 - 1) * sizeof(sljit_sw), NULL); 2334 if (!match) 2335 return NULL; 2336 2337 ptr1 = match->states; 2338 ptr2 = match->states + machine->size; 2339 end = ptr2; 2340 entry_addrs = (sljit_sw*)machine->entry_addrs; 2341 2342 match->current = ptr1; 2343 match->next = ptr2; 2344 match->head = 0; 2345 match->machine = machine; 2346 2347 /* Init machine states. */ 2348 switch (machine->no_states) { 2349 case 2: 2350 while (ptr1 < end) { 2351 *ptr1++ = *entry_addrs; 2352 *ptr2++ = *entry_addrs++; 2353 *ptr1++ = -1; 2354 *ptr2++ = -1; 2355 } 2356 break; 2357 2358 case 3: 2359 while (ptr1 < end) { 2360 *ptr1++ = *entry_addrs; 2361 *ptr2++ = *entry_addrs++; 2362 *ptr1++ = -1; 2363 *ptr2++ = -1; 2364 *ptr1++ = 0; 2365 *ptr2++ = 0; 2366 } 2367 break; 2368 2369 case 4: 2370 while (ptr1 < end) { 2371 *ptr1++ = *entry_addrs; 2372 *ptr2++ = *entry_addrs++; 2373 *ptr1++ = -1; 2374 *ptr2++ = -1; 2375 *ptr1++ = 0; 2376 *ptr2++ = 0; 2377 *ptr1++ = 0; 2378 *ptr2++ = 0; 2379 } 2380 break; 2381 2382 default: 2383 SLJIT_UNREACHABLE(); 2384 break; 2385 } 2386 2387 SLJIT_ASSERT(ptr1 == end); 2388 2389 match->u.continue_match = machine->continue_match; 2390 2391 regex_reset_match(match); 2392 return match; 2393 } 2394 2395 void regex_reset_match(struct regex_match *match) 2396 { 2397 struct regex_machine *machine = match->machine; 2398 sljit_sw current, ind; 2399 sljit_sw *current_ptr; 2400 2401 match->best_end = 0; 2402 match->fast_quit = 0; 2403 match->fast_forward = 0; 2404 2405 if (match->head != 0) { 2406 /* Clear the current state. */ 2407 current = match->head; 2408 current_ptr = match->current; 2409 do { 2410 ind = (current / sizeof(sljit_sw)) + 1; 2411 current = current_ptr[ind]; 2412 current_ptr[ind] = -1; 2413 } while (current != 0); 2414 } 2415 match->head = machine->u.call_init(match->current, match); 2416 } 2417 2418 void regex_free_match(struct regex_match *match) 2419 { 2420 SLJIT_FREE(match, NULL); 2421 } 2422 2423 void regex_continue_match(struct regex_match *match, const regex_char_t *input_string, int length) 2424 { 2425 match->u.call_continue(match, input_string, length); 2426 } 2427 2428 int regex_get_result(struct regex_match *match, int *end, int *id) 2429 { 2430 int flags = match->machine->flags; 2431 sljit_sw no_states; 2432 2433 *end = match->best_end; 2434 *id = match->best_id; 2435 if (!(flags & (REGEX_MATCH_END | REGEX_FAKE_MATCH_END))) 2436 return match->best_begin; 2437 2438 if (flags & REGEX_FAKE_MATCH_END) { 2439 SLJIT_ASSERT(!(flags & (REGEX_MATCH_BEGIN | REGEX_MATCH_END))); 2440 if (match->best_begin != -1) 2441 return match->best_begin; 2442 2443 no_states = match->machine->no_states; 2444 if (match->current[no_states + 1] == -1) 2445 return -1; 2446 if (flags & REGEX_ID_CHECK) 2447 *id = match->current[no_states + 3]; 2448 if (!(flags & REGEX_FAKE_MATCH_BEGIN)) 2449 *end = match->index - 1; 2450 else 2451 *end = match->index - 2; 2452 return match->current[no_states + 2]; 2453 } 2454 else { 2455 /* Check the status of the last code. */ 2456 if (!(flags & REGEX_MATCH_BEGIN)) { 2457 /* No shortcut in this case. */ 2458 if (!(flags & REGEX_ID_CHECK)) { 2459 if (match->current[1] == -1) 2460 return -1; 2461 *end = match->index - 1; 2462 return match->current[2]; 2463 } 2464 2465 if (match->current[1] == -1) 2466 return -1; 2467 *end = match->index - 1; 2468 *id = match->current[3]; 2469 return match->current[2]; 2470 } 2471 2472 /* Shortcut is possible in this case. */ 2473 if (!(flags & REGEX_ID_CHECK)) { 2474 if (match->current[1] == -1 || match->head == -1) 2475 return -1; 2476 *end = match->index - 1; 2477 return 0; 2478 } 2479 2480 if (match->current[1] == -1 || match->head == -1) 2481 return -1; 2482 *end = match->index - 1; 2483 *id = match->current[2]; 2484 return 0; 2485 } 2486 } 2487 2488 int regex_is_match_finished(struct regex_match *match) 2489 { 2490 return match->fast_quit; 2491 } 2492 2493 #ifdef REGEX_MATCH_VERBOSE 2494 void regex_continue_match_debug(struct regex_match *match, const regex_char_t *input_string, int length) 2495 { 2496 sljit_sw *ptr; 2497 sljit_sw *end; 2498 sljit_sw count; 2499 #if (defined SLJIT_DEBUG && SLJIT_DEBUG) 2500 sljit_sw current; 2501 #endif 2502 sljit_sw no_states = match->machine->no_states; 2503 sljit_sw len = match->machine->size; 2504 2505 while (length > 0) { 2506 match->u.call_continue(match, input_string, 1); 2507 2508 if (match->fast_forward) { 2509 if (match->machine->flags & REGEX_MATCH_VERBOSE) 2510 printf("fast forward\n"); 2511 } 2512 2513 /* Verbose (first). */ 2514 if (match->machine->flags & REGEX_MATCH_VERBOSE) { 2515 ptr = match->current; 2516 end = ptr + len; 2517 count = 0; 2518 printf("'%c' (%3ld->%3ld [%3ld]) ", *input_string, (long)match->best_begin, (long)match->best_end, (long)match->best_id); 2519 while (ptr < end) { 2520 printf("[%3ld:", (long)count++); 2521 switch (no_states) { 2522 case 2: 2523 if (ptr[1] != -1) 2524 printf("+] "); 2525 else 2526 printf(" ] "); 2527 break; 2528 2529 case 3: 2530 if (ptr[1] != -1) 2531 printf("+,%3ld] ", (long)ptr[2]); 2532 else 2533 printf(" ,XXX] "); 2534 break; 2535 2536 case 4: 2537 if (ptr[1] != -1) 2538 printf("+,%3ld,%3ld] ", (long)ptr[2], (long)ptr[3]); 2539 else 2540 printf(" ,XXX,XXX] "); 2541 break; 2542 } 2543 ptr += no_states; 2544 } 2545 printf("\n"); 2546 } 2547 2548 #if (defined SLJIT_DEBUG && SLJIT_DEBUG) 2549 /* Sanity check (later). */ 2550 ptr = match->next; 2551 end = ptr + len; 2552 while (ptr < end) { 2553 SLJIT_ASSERT(ptr[1] == -1); 2554 ptr += no_states; 2555 } 2556 2557 /* Check number of active elements. */ 2558 ptr = match->current + no_states; 2559 end = ptr + len - no_states; 2560 count = 0; 2561 while (ptr < end) { 2562 if (ptr[1] != -1) 2563 count++; 2564 ptr += no_states; 2565 } 2566 2567 /* Check chain list. */ 2568 current = match->head; 2569 ptr = match->current; 2570 while (current != 0) { 2571 SLJIT_ASSERT(current >= 0 && current < len * sizeof(sljit_sw)); 2572 SLJIT_ASSERT((current % (no_states * sizeof(sljit_sw))) == 0); 2573 SLJIT_ASSERT(count > 0); 2574 current = ptr[(current / sizeof(sljit_sw)) + 1]; 2575 count--; 2576 } 2577 SLJIT_ASSERT(count == 0); 2578 #endif 2579 2580 if (match->fast_quit) { 2581 /* the machine has stopped working. */ 2582 if (match->machine->flags & REGEX_MATCH_VERBOSE) 2583 printf("Best match has found\n"); 2584 break; 2585 } 2586 2587 input_string++; 2588 length--; 2589 } 2590 } 2591 #endif 2592