1 /* 2 Implementation of backtracking std.regex engine. 3 Contains both compile-time and run-time versions. 4 */ 5 module std.regex.internal.backtracking; 6 7 package(std.regex): 8 9 import core.stdc.stdlib, std.range.primitives, std.traits, std.typecons; 10 import std.regex.internal.ir; 11 12 /+ 13 BacktrackingMatcher implements backtracking scheme of matching 14 regular expressions. 15 +/ 16 template BacktrackingMatcher(bool CTregex) 17 { 18 @trusted struct BacktrackingMatcher(Char, Stream = Input!Char) 19 if (is(Char : dchar)) 20 { 21 alias DataIndex = Stream.DataIndex; 22 struct State 23 {//top bit in pc is set if saved along with matches 24 DataIndex index; 25 uint pc, counter, infiniteNesting; 26 } 27 static assert(State.sizeof % size_t.sizeof == 0); 28 enum stateSize = State.sizeof / size_t.sizeof; 29 enum initialStack = 1 << 11; // items in a block of segmented stack 30 alias String = const(Char)[]; 31 alias RegEx = Regex!Char; 32 alias MatchFn = bool function (ref BacktrackingMatcher!(Char, Stream)); 33 RegEx re; //regex program 34 static if (CTregex) 35 MatchFn nativeFn; //native code for that program 36 //Stream state 37 Stream s; 38 DataIndex index; 39 dchar front; 40 bool exhausted; 41 //backtracking machine state 42 uint pc, counter; 43 DataIndex lastState = 0; //top of state stack 44 static if (!CTregex) 45 uint infiniteNesting; 46 size_t[] memory; 47 Trace[] merge; 48 static struct Trace 49 { 50 ulong mask; 51 size_t offset; 52 53 bool mark(size_t idx) 54 { 55 immutable d = idx - offset; 56 if (d < 64) // including overflow 57 { 58 immutable p = mask & (1UL << d); 59 mask |= 1UL << d; 60 return p != 0; 61 } 62 else 63 { 64 offset = idx; 65 mask = 1; 66 return false; 67 } 68 } 69 } 70 //local slice of matches, global for backref 71 Group!DataIndex[] matches, backrefed; 72 73 static if (__traits(hasMember,Stream, "search")) 74 { 75 enum kicked = true; 76 } 77 else 78 enum kicked = false; 79 80 static size_t initialMemory(const ref RegEx re) 81 { 82 return stackSize(re)*size_t.sizeof + re.hotspotTableSize*Trace.sizeof; 83 } 84 85 static size_t stackSize(const ref RegEx re) 86 { 87 size_t itemSize = stateSize 88 + re.ngroup * (Group!DataIndex).sizeof / size_t.sizeof; 89 return initialStack * itemSize + 2; 90 } 91 92 @property bool atStart(){ return index == 0; } 93 94 @property bool atEnd(){ return index == s.lastIndex && s.atEnd; } 95 96 void next() 97 { 98 if (!s.nextChar(front, index)) 99 index = s.lastIndex; 100 } 101 102 void search() 103 { 104 static if (kicked) 105 { 106 if (!s.search(re.kickstart, front, index)) 107 { 108 index = s.lastIndex; 109 } 110 } 111 else 112 next(); 113 } 114 115 // 116 void newStack() 117 { 118 auto chunk = mallocArray!(size_t)(stackSize(re)); 119 chunk[0] = cast(size_t)(memory.ptr); 120 chunk[1] = lastState; 121 memory = chunk[2..$]; 122 lastState = 0; 123 } 124 125 bool prevStack() 126 { 127 // pointer to previous block 128 size_t* prev = cast(size_t*) memory.ptr[-2]; 129 if (!prev) 130 { 131 // The last segment is freed in RegexMatch 132 return false; 133 } 134 else 135 { 136 import core.stdc.stdlib : free; 137 // memory used in previous block 138 size_t size = memory.ptr[-1]; 139 free(memory.ptr-2); 140 memory = prev[0 .. size]; 141 lastState = size; 142 return true; 143 } 144 } 145 146 void initExternalMemory(void[] memBlock) 147 { 148 merge = arrayInChunk!(Trace)(re.hotspotTableSize, memBlock); 149 merge[] = Trace.init; 150 memory = cast(size_t[]) memBlock; 151 memory[0] = 0; // hidden pointer 152 memory[1] = 0; // used size 153 memory = memory[2..$]; 154 } 155 156 void initialize(ref RegEx program, Stream stream, void[] memBlock) 157 { 158 re = program; 159 s = stream; 160 exhausted = false; 161 initExternalMemory(memBlock); 162 backrefed = null; 163 } 164 165 auto dupTo(void[] memory) 166 { 167 typeof(this) tmp = this; 168 tmp.initExternalMemory(memory); 169 return tmp; 170 } 171 172 this(ref RegEx program, Stream stream, void[] memBlock, dchar ch, DataIndex idx) 173 { 174 initialize(program, stream, memBlock); 175 front = ch; 176 index = idx; 177 } 178 179 this(ref RegEx program, Stream stream, void[] memBlock) 180 { 181 initialize(program, stream, memBlock); 182 next(); 183 } 184 185 auto fwdMatcher(ref BacktrackingMatcher matcher, void[] memBlock) 186 { 187 alias BackMatcherTempl = .BacktrackingMatcher!(CTregex); 188 alias BackMatcher = BackMatcherTempl!(Char, Stream); 189 auto fwdMatcher = BackMatcher(matcher.re, s, memBlock, front, index); 190 return fwdMatcher; 191 } 192 193 auto bwdMatcher(ref BacktrackingMatcher matcher, void[] memBlock) 194 { 195 alias BackMatcherTempl = .BacktrackingMatcher!(CTregex); 196 alias BackMatcher = BackMatcherTempl!(Char, typeof(s.loopBack(index))); 197 auto fwdMatcher = 198 BackMatcher(matcher.re, s.loopBack(index), memBlock); 199 return fwdMatcher; 200 } 201 202 // 203 int matchFinalize() 204 { 205 immutable start = index; 206 immutable val = matchImpl(); 207 if (val) 208 {//stream is updated here 209 matches[0].begin = start; 210 matches[0].end = index; 211 if (!(re.flags & RegexOption.global) || atEnd) 212 exhausted = true; 213 if (start == index)//empty match advances input 214 next(); 215 return val; 216 } 217 else 218 return 0; 219 } 220 221 //lookup next match, fill matches with indices into input 222 int match(Group!DataIndex[] matches) 223 { 224 debug(std_regex_matcher) 225 { 226 writeln("------------------------------------------"); 227 } 228 if (exhausted) //all matches collected 229 return false; 230 this.matches = matches; 231 if (re.flags & RegexInfo.oneShot) 232 { 233 exhausted = true; 234 const DataIndex start = index; 235 immutable m = matchImpl(); 236 if (m) 237 { 238 matches[0].begin = start; 239 matches[0].end = index; 240 } 241 return m; 242 } 243 static if (kicked) 244 { 245 if (!re.kickstart.empty) 246 { 247 for (;;) 248 { 249 immutable val = matchFinalize(); 250 if (val) 251 return val; 252 else 253 { 254 if (atEnd) 255 break; 256 search(); 257 if (atEnd) 258 { 259 exhausted = true; 260 return matchFinalize(); 261 } 262 } 263 } 264 exhausted = true; 265 return 0; //early return 266 } 267 } 268 //no search available - skip a char at a time 269 for (;;) 270 { 271 immutable val = matchFinalize(); 272 if (val) 273 return val; 274 else 275 { 276 if (atEnd) 277 break; 278 next(); 279 if (atEnd) 280 { 281 exhausted = true; 282 return matchFinalize(); 283 } 284 } 285 } 286 exhausted = true; 287 return 0; 288 } 289 290 /+ 291 match subexpression against input, 292 results are stored in matches 293 +/ 294 int matchImpl() 295 { 296 static if (CTregex && is(typeof(nativeFn(this)))) 297 { 298 debug(std_regex_ctr) writeln("using C-T matcher"); 299 return nativeFn(this); 300 } 301 else 302 { 303 pc = 0; 304 counter = 0; 305 lastState = 0; 306 matches[] = Group!DataIndex.init; 307 auto start = s._index; 308 debug(std_regex_matcher) 309 writeln("Try match starting at ", s[index .. s.lastIndex]); 310 for (;;) 311 { 312 debug(std_regex_matcher) 313 writefln("PC: %s\tCNT: %s\t%s \tfront: %s src: %s", 314 pc, counter, disassemble(re.ir, pc, re.dict), 315 front, s._index); 316 switch (re.ir[pc].code) 317 { 318 case IR.OrChar://assumes IRL!(OrChar) == 1 319 if (atEnd) 320 goto L_backtrack; 321 uint len = re.ir[pc].sequence; 322 uint end = pc + len; 323 if (re.ir[pc].data != front && re.ir[pc+1].data != front) 324 { 325 for (pc = pc+2; pc < end; pc++) 326 if (re.ir[pc].data == front) 327 break; 328 if (pc == end) 329 goto L_backtrack; 330 } 331 pc = end; 332 next(); 333 break; 334 case IR.Char: 335 if (atEnd || front != re.ir[pc].data) 336 goto L_backtrack; 337 pc += IRL!(IR.Char); 338 next(); 339 break; 340 case IR.Any: 341 if (atEnd) 342 goto L_backtrack; 343 pc += IRL!(IR.Any); 344 next(); 345 break; 346 case IR.CodepointSet: 347 if (atEnd || !re.charsets[re.ir[pc].data].scanFor(front)) 348 goto L_backtrack; 349 next(); 350 pc += IRL!(IR.CodepointSet); 351 break; 352 case IR.Trie: 353 if (atEnd || !re.matchers[re.ir[pc].data][front]) 354 goto L_backtrack; 355 next(); 356 pc += IRL!(IR.Trie); 357 break; 358 case IR.Wordboundary: 359 dchar back; 360 DataIndex bi; 361 //at start & end of input 362 if (atStart && wordMatcher[front]) 363 { 364 pc += IRL!(IR.Wordboundary); 365 break; 366 } 367 else if (atEnd && s.loopBack(index).nextChar(back, bi) 368 && wordMatcher[back]) 369 { 370 pc += IRL!(IR.Wordboundary); 371 break; 372 } 373 else if (s.loopBack(index).nextChar(back, bi)) 374 { 375 immutable af = wordMatcher[front]; 376 immutable ab = wordMatcher[back]; 377 if (af ^ ab) 378 { 379 pc += IRL!(IR.Wordboundary); 380 break; 381 } 382 } 383 goto L_backtrack; 384 case IR.Notwordboundary: 385 dchar back; 386 DataIndex bi; 387 //at start & end of input 388 if (atStart && wordMatcher[front]) 389 goto L_backtrack; 390 else if (atEnd && s.loopBack(index).nextChar(back, bi) 391 && wordMatcher[back]) 392 goto L_backtrack; 393 else if (s.loopBack(index).nextChar(back, bi)) 394 { 395 immutable af = wordMatcher[front]; 396 immutable ab = wordMatcher[back]; 397 if (af ^ ab) 398 goto L_backtrack; 399 } 400 pc += IRL!(IR.Wordboundary); 401 break; 402 case IR.Bof: 403 if (atStart) 404 pc += IRL!(IR.Bol); 405 else 406 goto L_backtrack; 407 break; 408 case IR.Bol: 409 dchar back; 410 DataIndex bi; 411 if (atStart) 412 pc += IRL!(IR.Bol); 413 else if (s.loopBack(index).nextChar(back,bi) 414 && endOfLine(back, front == '\n')) 415 { 416 pc += IRL!(IR.Bol); 417 } 418 else 419 goto L_backtrack; 420 break; 421 case IR.Eof: 422 if (atEnd) 423 pc += IRL!(IR.Eol); 424 else 425 goto L_backtrack; 426 break; 427 case IR.Eol: 428 dchar back; 429 DataIndex bi; 430 debug(std_regex_matcher) writefln("EOL (front 0x%x) %s", front, s[index .. s.lastIndex]); 431 //no matching inside \r\n 432 if (atEnd || (endOfLine(front, s.loopBack(index).nextChar(back,bi) 433 && back == '\r'))) 434 { 435 pc += IRL!(IR.Eol); 436 } 437 else 438 goto L_backtrack; 439 break; 440 case IR.InfiniteStart, IR.InfiniteQStart: 441 pc += re.ir[pc].data + IRL!(IR.InfiniteStart); 442 //now pc is at end IR.Infinite(Q)End 443 uint len = re.ir[pc].data; 444 if (re.ir[pc].code == IR.InfiniteEnd) 445 { 446 pushState(pc+IRL!(IR.InfiniteEnd), counter); 447 pc -= len; 448 } 449 else 450 { 451 pushState(pc - len, counter); 452 pc += IRL!(IR.InfiniteEnd); 453 } 454 break; 455 case IR.InfiniteBloomStart: 456 pc += re.ir[pc].data + IRL!(IR.InfiniteBloomStart); 457 //now pc is at end IR.InfiniteBloomEnd 458 immutable len = re.ir[pc].data; 459 immutable filterIdx = re.ir[pc+2].raw; 460 if (re.filters[filterIdx][front]) 461 pushState(pc+IRL!(IR.InfiniteBloomEnd), counter); 462 pc -= len; 463 break; 464 case IR.RepeatStart, IR.RepeatQStart: 465 pc += re.ir[pc].data + IRL!(IR.RepeatStart); 466 break; 467 case IR.RepeatEnd: 468 case IR.RepeatQEnd: 469 if (merge[re.ir[pc + 1].raw+counter].mark(index)) 470 { 471 // merged! 472 goto L_backtrack; 473 } 474 //len, step, min, max 475 immutable len = re.ir[pc].data; 476 immutable step = re.ir[pc+2].raw; 477 immutable min = re.ir[pc+3].raw; 478 immutable max = re.ir[pc+4].raw; 479 if (counter < min) 480 { 481 counter += step; 482 pc -= len; 483 } 484 else if (counter < max) 485 { 486 if (re.ir[pc].code == IR.RepeatEnd) 487 { 488 pushState(pc + IRL!(IR.RepeatEnd), counter%step); 489 counter += step; 490 pc -= len; 491 } 492 else 493 { 494 pushState(pc - len, counter + step); 495 counter = counter%step; 496 pc += IRL!(IR.RepeatEnd); 497 } 498 } 499 else 500 { 501 counter = counter%step; 502 pc += IRL!(IR.RepeatEnd); 503 } 504 break; 505 case IR.InfiniteEnd: 506 case IR.InfiniteQEnd: 507 debug(std_regex_matcher) writeln("Infinited nesting:", infiniteNesting); 508 if (merge[re.ir[pc + 1].raw+counter].mark(index)) 509 { 510 // merged! 511 goto L_backtrack; 512 } 513 immutable len = re.ir[pc].data; 514 if (re.ir[pc].code == IR.InfiniteEnd) 515 { 516 pushState(pc + IRL!(IR.InfiniteEnd), counter); 517 pc -= len; 518 } 519 else 520 { 521 pushState(pc-len, counter); 522 pc += IRL!(IR.InfiniteEnd); 523 } 524 break; 525 case IR.InfiniteBloomEnd: 526 debug(std_regex_matcher) writeln("Infinited nesting:", infiniteNesting); 527 if (merge[re.ir[pc + 1].raw+counter].mark(index)) 528 { 529 // merged! 530 goto L_backtrack; 531 } 532 immutable len = re.ir[pc].data; 533 immutable filterIdx = re.ir[pc+2].raw; 534 if (re.filters[filterIdx][front]) 535 { 536 infiniteNesting--; 537 pushState(pc + IRL!(IR.InfiniteBloomEnd), counter); 538 infiniteNesting++; 539 } 540 pc -= len; 541 break; 542 case IR.OrEnd: 543 if (merge[re.ir[pc + 1].raw+counter].mark(index)) 544 { 545 // merged! 546 goto L_backtrack; 547 } 548 pc += IRL!(IR.OrEnd); 549 break; 550 case IR.OrStart: 551 pc += IRL!(IR.OrStart); 552 goto case; 553 case IR.Option: 554 immutable len = re.ir[pc].data; 555 if (re.ir[pc+len].code == IR.GotoEndOr)//not a last one 556 { 557 pushState(pc + len + IRL!(IR.Option), counter); //remember 2nd branch 558 } 559 pc += IRL!(IR.Option); 560 break; 561 case IR.GotoEndOr: 562 pc = pc + re.ir[pc].data + IRL!(IR.GotoEndOr); 563 break; 564 case IR.GroupStart: 565 immutable n = re.ir[pc].data; 566 matches[n].begin = index; 567 debug(std_regex_matcher) writefln("IR group #%u starts at %u", n, index); 568 pc += IRL!(IR.GroupStart); 569 break; 570 case IR.GroupEnd: 571 immutable n = re.ir[pc].data; 572 matches[n].end = index; 573 debug(std_regex_matcher) writefln("IR group #%u ends at %u", n, index); 574 pc += IRL!(IR.GroupEnd); 575 break; 576 case IR.LookaheadStart: 577 case IR.NeglookaheadStart: 578 immutable len = re.ir[pc].data; 579 auto save = index; 580 immutable ms = re.ir[pc+1].raw, me = re.ir[pc+2].raw; 581 auto mem = malloc(initialMemory(re))[0 .. initialMemory(re)]; 582 scope(exit) free(mem.ptr); 583 static if (Stream.isLoopback) 584 { 585 auto matcher = bwdMatcher(this, mem); 586 } 587 else 588 { 589 auto matcher = fwdMatcher(this, mem); 590 } 591 matcher.matches = matches[ms .. me]; 592 matcher.backrefed = backrefed.empty ? matches : backrefed; 593 matcher.re.ir = re.ir[ 594 pc+IRL!(IR.LookaheadStart) .. pc+IRL!(IR.LookaheadStart)+len+IRL!(IR.LookaheadEnd) 595 ]; 596 immutable match = (matcher.matchImpl() != 0) ^ (re.ir[pc].code == IR.NeglookaheadStart); 597 s.reset(save); 598 next(); 599 if (!match) 600 goto L_backtrack; 601 else 602 { 603 pc += IRL!(IR.LookaheadStart)+len+IRL!(IR.LookaheadEnd); 604 } 605 break; 606 case IR.LookbehindStart: 607 case IR.NeglookbehindStart: 608 immutable len = re.ir[pc].data; 609 immutable ms = re.ir[pc+1].raw, me = re.ir[pc+2].raw; 610 auto mem = malloc(initialMemory(re))[0 .. initialMemory(re)]; 611 scope(exit) free(mem.ptr); 612 static if (Stream.isLoopback) 613 { 614 alias Matcher = BacktrackingMatcher!(Char, Stream); 615 auto matcher = Matcher(re, s, mem, front, index); 616 } 617 else 618 { 619 alias Matcher = BacktrackingMatcher!(Char, typeof(s.loopBack(index))); 620 auto matcher = Matcher(re, s.loopBack(index), mem); 621 } 622 matcher.matches = matches[ms .. me]; 623 matcher.re.ir = re.ir[ 624 pc + IRL!(IR.LookbehindStart) .. pc + IRL!(IR.LookbehindStart) + len + IRL!(IR.LookbehindEnd) 625 ]; 626 matcher.backrefed = backrefed.empty ? matches : backrefed; 627 immutable match = (matcher.matchImpl() != 0) ^ (re.ir[pc].code == IR.NeglookbehindStart); 628 if (!match) 629 goto L_backtrack; 630 else 631 { 632 pc += IRL!(IR.LookbehindStart)+len+IRL!(IR.LookbehindEnd); 633 } 634 break; 635 case IR.Backref: 636 immutable n = re.ir[pc].data; 637 auto referenced = re.ir[pc].localRef 638 ? s[matches[n].begin .. matches[n].end] 639 : s[backrefed[n].begin .. backrefed[n].end]; 640 while (!atEnd && !referenced.empty && front == referenced.front) 641 { 642 next(); 643 referenced.popFront(); 644 } 645 if (referenced.empty) 646 pc++; 647 else 648 goto L_backtrack; 649 break; 650 case IR.Nop: 651 pc += IRL!(IR.Nop); 652 break; 653 case IR.LookaheadEnd: 654 case IR.NeglookaheadEnd: 655 case IR.LookbehindEnd: 656 case IR.NeglookbehindEnd: 657 case IR.End: 658 // cleanup stale stack blocks if any 659 while (prevStack()) {} 660 return re.ir[pc].data; 661 default: 662 debug printBytecode(re.ir[0..$]); 663 assert(0); 664 L_backtrack: 665 if (!popState()) 666 { 667 s.reset(start); 668 return 0; 669 } 670 } 671 } 672 } 673 assert(0); 674 } 675 676 @property size_t stackAvail() 677 { 678 return memory.length - lastState; 679 } 680 681 void stackPush(T)(T val) 682 if (!isDynamicArray!T) 683 { 684 *cast(T*)&memory[lastState] = val; 685 enum delta = (T.sizeof+size_t.sizeof/2)/size_t.sizeof; 686 lastState += delta; 687 debug(std_regex_matcher) writeln("push element SP= ", lastState); 688 } 689 690 void stackPush(T)(T[] val) 691 { 692 static assert(T.sizeof % size_t.sizeof == 0); 693 (cast(T*)&memory[lastState])[0 .. val.length] 694 = val[0..$]; 695 lastState += val.length*(T.sizeof/size_t.sizeof); 696 debug(std_regex_matcher) writeln("push array SP= ", lastState); 697 } 698 699 void stackPop(T)(ref T val) 700 if (!isDynamicArray!T) 701 { 702 enum delta = (T.sizeof+size_t.sizeof/2)/size_t.sizeof; 703 lastState -= delta; 704 val = *cast(T*)&memory[lastState]; 705 debug(std_regex_matcher) writeln("pop element SP= ", lastState); 706 } 707 708 void stackPop(T)(T[] val) 709 { 710 stackPop(val); // call ref version 711 } 712 void stackPop(T)(ref T[] val) 713 { 714 lastState -= val.length*(T.sizeof/size_t.sizeof); 715 val[0..$] = (cast(T*)&memory[lastState])[0 .. val.length]; 716 debug(std_regex_matcher) writeln("pop array SP= ", lastState); 717 } 718 719 static if (!CTregex) 720 { 721 //helper function, saves engine state 722 void pushState(uint pc, uint counter) 723 { 724 if (stateSize + 2 * matches.length > stackAvail) 725 { 726 newStack(); 727 } 728 *cast(State*)&memory[lastState] = 729 State(index, pc, counter, infiniteNesting); 730 lastState += stateSize; 731 memory[lastState .. lastState + 2 * matches.length] = (cast(size_t[]) matches)[]; 732 lastState += 2*matches.length; 733 debug(std_regex_matcher) 734 writefln("Saved(pc=%s) front: %s src: %s", 735 pc, front, s[index .. s.lastIndex]); 736 } 737 738 //helper function, restores engine state 739 bool popState() 740 { 741 if (!lastState && !prevStack()) 742 return false; 743 lastState -= 2*matches.length; 744 auto pm = cast(size_t[]) matches; 745 pm[] = memory[lastState .. lastState + 2 * matches.length]; 746 lastState -= stateSize; 747 State* state = cast(State*)&memory[lastState]; 748 index = state.index; 749 pc = state.pc; 750 counter = state.counter; 751 infiniteNesting = state.infiniteNesting; 752 debug(std_regex_matcher) 753 { 754 writefln("Restored matches", front, s[index .. s.lastIndex]); 755 foreach (i, m; matches) 756 writefln("Sub(%d) : %s..%s", i, m.begin, m.end); 757 } 758 s.reset(index); 759 next(); 760 debug(std_regex_matcher) 761 writefln("Backtracked (pc=%s) front: %s src: %s", 762 pc, front, s[index .. s.lastIndex]); 763 return true; 764 } 765 } 766 } 767 } 768 769 //very shitty string formatter, $$ replaced with next argument converted to string 770 @trusted string ctSub( U...)(string format, U args) 771 { 772 import std.conv : to; 773 bool seenDollar; 774 foreach (i, ch; format) 775 { 776 if (ch == '$') 777 { 778 if (seenDollar) 779 { 780 static if (args.length > 0) 781 { 782 return format[0 .. i - 1] ~ to!string(args[0]) 783 ~ ctSub(format[i + 1 .. $], args[1 .. $]); 784 } 785 else 786 assert(0); 787 } 788 else 789 seenDollar = true; 790 } 791 else 792 seenDollar = false; 793 794 } 795 return format; 796 } 797 798 alias Sequence(int B, int E) = staticIota!(B, E); 799 800 struct CtContext 801 { 802 import std.conv : to, text; 803 //dirty flags 804 bool counter; 805 //to mark the portion of matches to save 806 int match, total_matches; 807 int reserved; 808 CodepointSet[] charsets; 809 810 811 //state of codegenerator 812 static struct CtState 813 { 814 string code; 815 int addr; 816 } 817 818 this(Char)(Regex!Char re) 819 { 820 match = 1; 821 reserved = 1; //first match is skipped 822 total_matches = re.ngroup; 823 charsets = re.charsets; 824 } 825 826 CtContext lookaround(uint s, uint e) 827 { 828 CtContext ct; 829 ct.total_matches = e - s; 830 ct.match = 1; 831 return ct; 832 } 833 834 //restore state having current context 835 string restoreCode() 836 { 837 string text; 838 //stack is checked in L_backtrack 839 text ~= counter 840 ? " 841 stackPop(counter);" 842 : " 843 counter = 0;"; 844 if (match < total_matches) 845 { 846 text ~= ctSub(" 847 stackPop(matches[$$..$$]);", reserved, match); 848 text ~= ctSub(" 849 matches[$$..$] = typeof(matches[0]).init;", match); 850 } 851 else 852 text ~= ctSub(" 853 stackPop(matches[$$..$]);", reserved); 854 return text; 855 } 856 857 //save state having current context 858 string saveCode(uint pc, string count_expr="counter") 859 { 860 string text = ctSub(" 861 if (stackAvail < $$*(Group!(DataIndex)).sizeof/size_t.sizeof + $$) 862 { 863 newStack(); 864 }", match - reserved, cast(int) counter + 2); 865 if (match < total_matches) 866 text ~= ctSub(" 867 stackPush(matches[$$..$$]);", reserved, match); 868 else 869 text ~= ctSub(" 870 stackPush(matches[$$..$]);", reserved); 871 text ~= counter ? ctSub(" 872 stackPush($$);", count_expr) : ""; 873 text ~= ctSub(" 874 stackPush(index); stackPush($$); \n", pc); 875 return text; 876 } 877 878 // 879 CtState ctGenBlock(Bytecode[] ir, int addr) 880 { 881 CtState result; 882 result.addr = addr; 883 while (!ir.empty) 884 { 885 auto n = ctGenGroup(ir, result.addr); 886 result.code ~= n.code; 887 result.addr = n.addr; 888 } 889 return result; 890 } 891 892 // 893 CtState ctGenGroup(ref Bytecode[] ir, int addr) 894 { 895 import std.algorithm.comparison : max; 896 auto bailOut = "goto L_backtrack;"; 897 auto nextInstr = ctSub("goto case $$;", addr+1); 898 CtState r; 899 assert(!ir.empty); 900 switch (ir[0].code) 901 { 902 case IR.InfiniteStart, IR.InfiniteBloomStart,IR.InfiniteQStart, IR.RepeatStart, IR.RepeatQStart: 903 immutable infLoop = 904 ir[0].code == IR.InfiniteStart || ir[0].code == IR.InfiniteQStart || 905 ir[0].code == IR.InfiniteBloomStart; 906 907 counter = counter || 908 ir[0].code == IR.RepeatStart || ir[0].code == IR.RepeatQStart; 909 immutable len = ir[0].data; 910 auto nir = ir[ir[0].length .. ir[0].length+len]; 911 r = ctGenBlock(nir, addr+1); 912 //start/end codegen 913 //r.addr is at last test+ jump of loop, addr+1 is body of loop 914 nir = ir[ir[0].length + len .. $]; 915 r.code = ctGenFixupCode(ir[0 .. ir[0].length], addr, r.addr) ~ r.code; 916 r.code ~= ctGenFixupCode(nir, r.addr, addr+1); 917 r.addr += 2; //account end instruction + restore state 918 ir = nir; 919 break; 920 case IR.OrStart: 921 immutable len = ir[0].data; 922 auto nir = ir[ir[0].length .. ir[0].length+len]; 923 r = ctGenAlternation(nir, addr); 924 ir = ir[ir[0].length + len .. $]; 925 assert(ir[0].code == IR.OrEnd); 926 ir = ir[ir[0].length..$]; 927 break; 928 case IR.LookaheadStart: 929 case IR.NeglookaheadStart: 930 case IR.LookbehindStart: 931 case IR.NeglookbehindStart: 932 immutable len = ir[0].data; 933 immutable behind = ir[0].code == IR.LookbehindStart || ir[0].code == IR.NeglookbehindStart; 934 immutable negative = ir[0].code == IR.NeglookaheadStart || ir[0].code == IR.NeglookbehindStart; 935 string fwdType = "typeof(fwdMatcher(matcher, []))"; 936 string bwdType = "typeof(bwdMatcher(matcher, []))"; 937 string fwdCreate = "fwdMatcher(matcher, mem)"; 938 string bwdCreate = "bwdMatcher(matcher, mem)"; 939 immutable start = IRL!(IR.LookbehindStart); 940 immutable end = IRL!(IR.LookbehindStart)+len+IRL!(IR.LookaheadEnd); 941 CtContext context = lookaround(ir[1].raw, ir[2].raw); //split off new context 942 auto slice = ir[start .. end]; 943 r.code ~= ctSub(` 944 case $$: //fake lookaround "atom" 945 static if (typeof(matcher.s).isLoopback) 946 alias Lookaround = $$; 947 else 948 alias Lookaround = $$; 949 static bool matcher_$$(ref Lookaround matcher) @trusted 950 { 951 //(neg)lookaround piece start 952 $$ 953 //(neg)lookaround piece ends 954 } 955 auto save = index; 956 auto mem = malloc(initialMemory(re))[0 .. initialMemory(re)]; 957 scope(exit) free(mem.ptr); 958 static if (typeof(matcher.s).isLoopback) 959 auto lookaround = $$; 960 else 961 auto lookaround = $$; 962 lookaround.matches = matches[$$..$$]; 963 lookaround.backrefed = backrefed.empty ? matches : backrefed; 964 lookaround.nativeFn = &matcher_$$; //hookup closure's binary code 965 int match = $$; 966 s.reset(save); 967 next(); 968 if (match) 969 $$ 970 else 971 $$`, addr, 972 behind ? fwdType : bwdType, behind ? bwdType : fwdType, 973 addr, context.ctGenRegEx(slice), 974 behind ? fwdCreate : bwdCreate, behind ? bwdCreate : fwdCreate, 975 ir[1].raw, ir[2].raw, //start - end of matches slice 976 addr, 977 negative ? "!lookaround.matchImpl()" : "lookaround.matchImpl()", 978 nextInstr, bailOut); 979 ir = ir[end .. $]; 980 r.addr = addr + 1; 981 break; 982 case IR.LookaheadEnd: case IR.NeglookaheadEnd: 983 case IR.LookbehindEnd: case IR.NeglookbehindEnd: 984 ir = ir[IRL!(IR.LookaheadEnd) .. $]; 985 r.addr = addr; 986 break; 987 default: 988 assert(ir[0].isAtom, text(ir[0].mnemonic)); 989 r = ctGenAtom(ir, addr); 990 } 991 return r; 992 } 993 994 //generate source for bytecode contained in OrStart ... OrEnd 995 CtState ctGenAlternation(Bytecode[] ir, int addr) 996 { 997 CtState[] pieces; 998 CtState r; 999 enum optL = IRL!(IR.Option); 1000 for (;;) 1001 { 1002 assert(ir[0].code == IR.Option); 1003 auto len = ir[0].data; 1004 if (optL+len < ir.length && ir[optL+len].code == IR.Option)//not a last option 1005 { 1006 auto nir = ir[optL .. optL+len-IRL!(IR.GotoEndOr)]; 1007 r = ctGenBlock(nir, addr+2);//space for Option + restore state 1008 //r.addr+1 to account GotoEndOr at end of branch 1009 r.code = ctGenFixupCode(ir[0 .. ir[0].length], addr, r.addr+1) ~ r.code; 1010 addr = r.addr+1;//leave space for GotoEndOr 1011 pieces ~= r; 1012 ir = ir[optL + len .. $]; 1013 } 1014 else 1015 { 1016 pieces ~= ctGenBlock(ir[optL..$], addr); 1017 addr = pieces[$-1].addr; 1018 break; 1019 } 1020 } 1021 r = pieces[0]; 1022 for (uint i = 1; i < pieces.length; i++) 1023 { 1024 r.code ~= ctSub(` 1025 case $$: 1026 goto case $$; `, pieces[i-1].addr, addr); 1027 r.code ~= pieces[i].code; 1028 } 1029 r.addr = addr; 1030 return r; 1031 } 1032 1033 // generate fixup code for instruction in ir, 1034 // fixup means it has an alternative way for control flow 1035 string ctGenFixupCode(Bytecode[] ir, int addr, int fixup) 1036 { 1037 return ctGenFixupCode(ir, addr, fixup); // call ref Bytecode[] version 1038 } 1039 string ctGenFixupCode(ref Bytecode[] ir, int addr, int fixup) 1040 { 1041 string r; 1042 string testCode; 1043 r = ctSub(` 1044 case $$: debug(std_regex_matcher) writeln("#$$");`, 1045 addr, addr); 1046 switch (ir[0].code) 1047 { 1048 case IR.InfiniteStart, IR.InfiniteQStart, IR.InfiniteBloomStart: 1049 r ~= ctSub( ` 1050 goto case $$;`, fixup); 1051 ir = ir[ir[0].length..$]; 1052 break; 1053 case IR.InfiniteEnd: 1054 testCode = ctQuickTest(ir[IRL!(IR.InfiniteEnd) .. $],addr + 1); 1055 r ~= ctSub( ` 1056 if (merge[$$+counter].mark(index)) 1057 { 1058 // merged! 1059 goto L_backtrack; 1060 } 1061 1062 $$ 1063 { 1064 $$ 1065 } 1066 goto case $$; 1067 case $$: //restore state and go out of loop 1068 $$ 1069 goto case;`, ir[1].raw, testCode, saveCode(addr+1), fixup, 1070 addr+1, restoreCode()); 1071 ir = ir[ir[0].length..$]; 1072 break; 1073 case IR.InfiniteBloomEnd: 1074 //TODO: check bloom filter and skip on failure 1075 testCode = ctQuickTest(ir[IRL!(IR.InfiniteBloomEnd) .. $],addr + 1); 1076 r ~= ctSub( ` 1077 if (merge[$$+counter].mark(index)) 1078 { 1079 // merged! 1080 goto L_backtrack; 1081 } 1082 1083 $$ 1084 { 1085 $$ 1086 } 1087 goto case $$; 1088 case $$: //restore state and go out of loop 1089 $$ 1090 goto case;`, ir[1].raw, testCode, saveCode(addr+1), fixup, 1091 addr+1, restoreCode()); 1092 ir = ir[ir[0].length..$]; 1093 break; 1094 case IR.InfiniteQEnd: 1095 testCode = ctQuickTest(ir[IRL!(IR.InfiniteEnd) .. $],addr + 1); 1096 auto altCode = testCode.length ? ctSub("else goto case $$;", fixup) : ""; 1097 r ~= ctSub( ` 1098 if (merge[$$+counter].mark(index)) 1099 { 1100 // merged! 1101 goto L_backtrack; 1102 } 1103 1104 $$ 1105 { 1106 $$ 1107 goto case $$; 1108 } 1109 $$ 1110 case $$://restore state and go inside loop 1111 $$ 1112 goto case $$;`, ir[1].raw, 1113 testCode, saveCode(addr+1), addr+2, altCode, 1114 addr+1, restoreCode(), fixup); 1115 ir = ir[ir[0].length..$]; 1116 break; 1117 case IR.RepeatStart, IR.RepeatQStart: 1118 r ~= ctSub( ` 1119 goto case $$;`, fixup); 1120 ir = ir[ir[0].length..$]; 1121 break; 1122 case IR.RepeatEnd, IR.RepeatQEnd: 1123 //len, step, min, max 1124 immutable len = ir[0].data; 1125 immutable step = ir[2].raw; 1126 immutable min = ir[3].raw; 1127 immutable max = ir[4].raw; 1128 r ~= ctSub(` 1129 if (merge[$$+counter].mark(index)) 1130 { 1131 // merged! 1132 goto L_backtrack; 1133 } 1134 if (counter < $$) 1135 { 1136 debug(std_regex_matcher) writeln("RepeatEnd min case pc=", $$); 1137 counter += $$; 1138 goto case $$; 1139 }`, ir[1].raw, min, addr, step, fixup); 1140 if (ir[0].code == IR.RepeatEnd) 1141 { 1142 string counter_expr = ctSub("counter % $$", step); 1143 r ~= ctSub(` 1144 else if (counter < $$) 1145 { 1146 $$ 1147 counter += $$; 1148 goto case $$; 1149 }`, max, saveCode(addr+1, counter_expr), step, fixup); 1150 } 1151 else 1152 { 1153 string counter_expr = ctSub("counter % $$", step); 1154 r ~= ctSub(` 1155 else if (counter < $$) 1156 { 1157 $$ 1158 counter = counter % $$; 1159 goto case $$; 1160 }`, max, saveCode(addr+1,counter_expr), step, addr+2); 1161 } 1162 r ~= ctSub(` 1163 else 1164 { 1165 counter = counter % $$; 1166 goto case $$; 1167 } 1168 case $$: //restore state 1169 $$ 1170 goto case $$;`, step, addr+2, addr+1, restoreCode(), 1171 ir[0].code == IR.RepeatEnd ? addr+2 : fixup ); 1172 ir = ir[ir[0].length..$]; 1173 break; 1174 case IR.Option: 1175 r ~= ctSub( ` 1176 { 1177 $$ 1178 } 1179 goto case $$; 1180 case $$://restore thunk to go to the next group 1181 $$ 1182 goto case $$;`, saveCode(addr+1), addr+2, 1183 addr+1, restoreCode(), fixup); 1184 ir = ir[ir[0].length..$]; 1185 break; 1186 default: 1187 assert(0, text(ir[0].mnemonic)); 1188 } 1189 return r; 1190 } 1191 1192 1193 string ctQuickTest(Bytecode[] ir, int id) 1194 { 1195 uint pc = 0; 1196 while (pc < ir.length && ir[pc].isAtom) 1197 { 1198 if (ir[pc].code == IR.GroupStart || ir[pc].code == IR.GroupEnd) 1199 { 1200 pc++; 1201 } 1202 else if (ir[pc].code == IR.Backref) 1203 break; 1204 else 1205 { 1206 auto code = ctAtomCode(ir[pc..$], -1); 1207 return ctSub(` 1208 int test_$$() 1209 { 1210 $$ //$$ 1211 } 1212 if (test_$$() >= 0)`, id, code.ptr ? code : "return 0;", 1213 ir[pc].mnemonic, id); 1214 } 1215 } 1216 return ""; 1217 } 1218 1219 //process & generate source for simple bytecodes at front of ir using address addr 1220 CtState ctGenAtom(ref Bytecode[] ir, int addr) 1221 { 1222 CtState result; 1223 result.code = ctAtomCode(ir, addr); 1224 ir.popFrontN(ir[0].code == IR.OrChar ? ir[0].sequence : ir[0].length); 1225 result.addr = addr + 1; 1226 return result; 1227 } 1228 1229 //D code for atom at ir using address addr, addr < 0 means quickTest 1230 string ctAtomCode(Bytecode[] ir, int addr) 1231 { 1232 string code; 1233 string bailOut, nextInstr; 1234 if (addr < 0) 1235 { 1236 bailOut = "return -1;"; 1237 nextInstr = "return 0;"; 1238 } 1239 else 1240 { 1241 bailOut = "goto L_backtrack;"; 1242 nextInstr = ctSub("goto case $$;", addr+1); 1243 code ~= ctSub( ` 1244 case $$: debug(std_regex_matcher) writeln("#$$"); 1245 `, addr, addr); 1246 } 1247 switch (ir[0].code) 1248 { 1249 case IR.OrChar://assumes IRL!(OrChar) == 1 1250 code ~= ctSub(` 1251 if (atEnd) 1252 $$`, bailOut); 1253 immutable len = ir[0].sequence; 1254 for (uint i = 0; i < len; i++) 1255 { 1256 code ~= ctSub( ` 1257 if (front == $$) 1258 { 1259 $$ 1260 $$ 1261 }`, ir[i].data, addr >= 0 ? "next();" :"", nextInstr); 1262 } 1263 code ~= ctSub( ` 1264 $$`, bailOut); 1265 break; 1266 case IR.Char: 1267 code ~= ctSub( ` 1268 if (atEnd || front != $$) 1269 $$ 1270 $$ 1271 $$`, ir[0].data, bailOut, addr >= 0 ? "next();" :"", nextInstr); 1272 break; 1273 case IR.Any: 1274 code ~= ctSub( ` 1275 if (atEnd || (!(re.flags & RegexOption.singleline) 1276 && (front == '\r' || front == '\n'))) 1277 $$ 1278 $$ 1279 $$`, bailOut, addr >= 0 ? "next();" :"",nextInstr); 1280 break; 1281 case IR.CodepointSet: 1282 if (charsets.length) 1283 { 1284 string name = `func_`~to!string(addr+1); 1285 string funcCode = charsets[ir[0].data].toSourceCode(name); 1286 code ~= ctSub( ` 1287 static $$ 1288 if (atEnd || !$$(front)) 1289 $$ 1290 $$ 1291 $$`, funcCode, name, bailOut, addr >= 0 ? "next();" :"", nextInstr); 1292 } 1293 else 1294 code ~= ctSub( ` 1295 if (atEnd || !re.charsets[$$].scanFor(front)) 1296 $$ 1297 $$ 1298 $$`, ir[0].data, bailOut, addr >= 0 ? "next();" :"", nextInstr); 1299 break; 1300 case IR.Trie: 1301 if (charsets.length && charsets[ir[0].data].byInterval.length <= 8) 1302 goto case IR.CodepointSet; 1303 code ~= ctSub( ` 1304 if (atEnd || !re.matchers[$$][front]) 1305 $$ 1306 $$ 1307 $$`, ir[0].data, bailOut, addr >= 0 ? "next();" :"", nextInstr); 1308 break; 1309 case IR.Wordboundary: 1310 code ~= ctSub( ` 1311 dchar back; 1312 DataIndex bi; 1313 if (atStart && wordMatcher[front]) 1314 { 1315 $$ 1316 } 1317 else if (atEnd && s.loopBack(index).nextChar(back, bi) 1318 && wordMatcher[back]) 1319 { 1320 $$ 1321 } 1322 else if (s.loopBack(index).nextChar(back, bi)) 1323 { 1324 bool af = wordMatcher[front]; 1325 bool ab = wordMatcher[back]; 1326 if (af ^ ab) 1327 { 1328 $$ 1329 } 1330 } 1331 $$`, nextInstr, nextInstr, nextInstr, bailOut); 1332 break; 1333 case IR.Notwordboundary: 1334 code ~= ctSub( ` 1335 dchar back; 1336 DataIndex bi; 1337 //at start & end of input 1338 if (atStart && wordMatcher[front]) 1339 $$ 1340 else if (atEnd && s.loopBack(index).nextChar(back, bi) 1341 && wordMatcher[back]) 1342 $$ 1343 else if (s.loopBack(index).nextChar(back, bi)) 1344 { 1345 bool af = wordMatcher[front]; 1346 bool ab = wordMatcher[back]; 1347 if (af ^ ab) 1348 $$ 1349 } 1350 $$`, bailOut, bailOut, bailOut, nextInstr); 1351 1352 break; 1353 case IR.Bol: 1354 code ~= ctSub(` 1355 dchar back; 1356 DataIndex bi; 1357 if (atStart || (s.loopBack(index).nextChar(back,bi) 1358 && endOfLine(back, front == '\n'))) 1359 { 1360 debug(std_regex_matcher) writeln("BOL matched"); 1361 $$ 1362 } 1363 else 1364 $$`, nextInstr, bailOut); 1365 1366 break; 1367 case IR.Bof: 1368 code ~= ctSub(` 1369 if (atStart) 1370 { 1371 debug(std_regex_matcher) writeln("BOF matched"); 1372 $$ 1373 } 1374 else 1375 $$`, nextInstr, bailOut); 1376 break; 1377 case IR.Eol: 1378 code ~= ctSub(` 1379 dchar back; 1380 DataIndex bi; 1381 debug(std_regex_matcher) writefln("EOL (front 0x%x) %s", front, s[index .. s.lastIndex]); 1382 //no matching inside \r\n 1383 if (atEnd || (endOfLine(front, s.loopBack(index).nextChar(back,bi) 1384 && back == '\r'))) 1385 { 1386 debug(std_regex_matcher) writeln("EOL matched"); 1387 $$ 1388 } 1389 else 1390 $$`, nextInstr, bailOut); 1391 break; 1392 case IR.Eof: 1393 code ~= ctSub(` 1394 if (atEnd) 1395 { 1396 debug(std_regex_matcher) writeln("BOF matched"); 1397 $$ 1398 } 1399 else 1400 $$`, nextInstr, bailOut); 1401 break; 1402 case IR.GroupStart: 1403 code ~= ctSub(` 1404 matches[$$].begin = index; 1405 $$`, ir[0].data, nextInstr); 1406 match = ir[0].data+1; 1407 break; 1408 case IR.GroupEnd: 1409 code ~= ctSub(` 1410 matches[$$].end = index; 1411 $$`, ir[0].data, nextInstr); 1412 break; 1413 case IR.Backref: 1414 string mStr = "auto referenced = "; 1415 mStr ~= ir[0].localRef 1416 ? ctSub("s[matches[$$].begin .. matches[$$].end];", 1417 ir[0].data, ir[0].data) 1418 : ctSub("s[backrefed[$$].begin .. backrefed[$$].end];", 1419 ir[0].data, ir[0].data); 1420 code ~= ctSub( ` 1421 $$ 1422 while (!atEnd && !referenced.empty && front == referenced.front) 1423 { 1424 next(); 1425 referenced.popFront(); 1426 } 1427 if (referenced.empty) 1428 $$ 1429 else 1430 $$`, mStr, nextInstr, bailOut); 1431 break; 1432 case IR.Nop: 1433 case IR.End: 1434 break; 1435 default: 1436 assert(0, text(ir[0].mnemonic, " is not supported yet")); 1437 } 1438 return code; 1439 } 1440 1441 //generate D code for the whole regex 1442 public string ctGenRegEx(Bytecode[] ir) 1443 { 1444 auto bdy = ctGenBlock(ir, 0); 1445 auto r = ` 1446 import core.stdc.stdlib; 1447 with(matcher) 1448 { 1449 pc = 0; 1450 counter = 0; 1451 lastState = 0; 1452 matches[] = Group!DataIndex.init; 1453 auto start = s._index;`; 1454 r ~= ` 1455 goto StartLoop; 1456 debug(std_regex_matcher) writeln("Try CT matching starting at ",s[index .. s.lastIndex]); 1457 L_backtrack: 1458 if (lastState || prevStack()) 1459 { 1460 stackPop(pc); 1461 stackPop(index); 1462 s.reset(index); 1463 next(); 1464 } 1465 else 1466 { 1467 s.reset(start); 1468 return false; 1469 } 1470 StartLoop: 1471 switch (pc) 1472 { 1473 `; 1474 r ~= bdy.code; 1475 r ~= ctSub(` 1476 case $$: break;`,bdy.addr); 1477 r ~= ` 1478 default: 1479 assert(0); 1480 } 1481 // cleanup stale stack blocks 1482 while (prevStack()) {} 1483 return true; 1484 } 1485 `; 1486 return r; 1487 } 1488 1489 } 1490 1491 string ctGenRegExCode(Char)(Regex!Char re) 1492 { 1493 auto context = CtContext(re); 1494 return context.ctGenRegEx(re.ir); 1495 } 1496