1 /* 2 Implementation of backtracking std.regex engine. 3 Contains both compile-time and run-time versions. 4 */ 5 module std.regex.internal.backtracking; 6 7 package(std.regex): 8 9 import core.stdc.stdlib, std.range.primitives, std.traits, std.typecons; 10 import std.regex.internal.ir; 11 12 import core.memory : pureMalloc, pureFree; 13 14 /+ 15 BacktrackingMatcher implements backtracking scheme of matching 16 regular expressions. 17 +/ 18 @trusted class BacktrackingMatcher(Char, Stream = Input!Char) : Matcher!Char 19 if (is(Char : dchar)) 20 { 21 alias DataIndex = Stream.DataIndex; 22 struct State 23 {//top bit in pc is set if saved along with matches 24 DataIndex index; 25 uint pc, counter, infiniteNesting; 26 } 27 static assert(State.sizeof % size_t.sizeof == 0); 28 enum stateSize = State.sizeof / size_t.sizeof; 29 enum initialStack = 1 << 11; // items in a block of segmented stack 30 alias String = const(Char)[]; 31 alias RegEx = Regex!Char; 32 alias MatchFn = bool function(BacktrackingMatcher) pure; 33 const RegEx re; // regex program 34 MatchFn nativeFn; // native code for that program 35 // Stream state 36 Stream s; 37 DataIndex index; 38 dchar front; 39 bool exhausted; 40 // Backtracking machine state 41 uint pc, counter; 42 DataIndex lastState = 0; // Top of state stack 43 uint infiniteNesting; 44 size_t[] memory; 45 Trace[] merge; 46 static struct Trace 47 { 48 ulong mask; 49 size_t offset; 50 51 bool mark(size_t idx) 52 { 53 immutable d = idx - offset; 54 if (d < 64) // including overflow 55 { 56 immutable p = mask & (1UL << d); 57 mask |= 1UL << d; 58 return p != 0; 59 } 60 else 61 { 62 offset = idx; 63 mask = 1; 64 return false; 65 } 66 } 67 } 68 //local slice of matches, global for backref 69 Group!DataIndex[] matches, backrefed; 70 size_t _refCount; 71 final: 72 73 override @property ref size_t refCount() { return _refCount; } 74 override @property ref const(RegEx) pattern(){ return re; } 75 76 static if (__traits(hasMember,Stream, "search")) 77 { 78 enum kicked = true; 79 } 80 else 81 enum kicked = false; 82 83 static size_t initialMemory(const ref RegEx re) 84 { 85 return stackSize(re)*size_t.sizeof + re.hotspotTableSize*Trace.sizeof; 86 } 87 88 static size_t stackSize(const ref RegEx re) 89 { 90 size_t itemSize = stateSize 91 + re.ngroup * (Group!DataIndex).sizeof / size_t.sizeof; 92 return initialStack * itemSize + 2; 93 } 94 95 @property bool atStart(){ return index == 0; } 96 97 @property bool atEnd(){ return index == s.lastIndex && s.atEnd; } 98 99 void next() 100 { 101 if (!s.nextChar(front, index)) 102 index = s.lastIndex; 103 } 104 105 void search() 106 { 107 static if (kicked) 108 { 109 if (!s.search(re.kickstart, front, index)) 110 { 111 index = s.lastIndex; 112 } 113 } 114 else 115 next(); 116 } 117 118 // 119 void newStack() 120 { 121 auto chunk = mallocArray!(size_t)(stackSize(re)); 122 chunk[0] = cast(size_t)(memory.ptr); 123 chunk[1] = lastState; 124 memory = chunk[2..$]; 125 lastState = 0; 126 } 127 128 bool prevStack() 129 { 130 // pointer to previous block 131 size_t* prev = cast(size_t*) memory.ptr[-2]; 132 if (!prev) 133 { 134 // The last segment is freed in RegexMatch 135 return false; 136 } 137 else 138 { 139 import core.memory : pureFree; 140 // memory used in previous block 141 size_t size = memory.ptr[-1]; 142 pureFree(memory.ptr-2); 143 memory = prev[0 .. size]; 144 lastState = size; 145 return true; 146 } 147 } 148 149 void initExternalMemory(void[] memBlock) 150 { 151 merge = arrayInChunk!(Trace)(re.hotspotTableSize, memBlock); 152 merge[] = Trace.init; 153 memory = cast(size_t[]) memBlock; 154 memory[0] = 0; // hidden pointer 155 memory[1] = 0; // used size 156 memory = memory[2..$]; 157 } 158 159 void initialize(ref const RegEx program, Stream stream, void[] memBlock) 160 { 161 s = stream; 162 exhausted = false; 163 initExternalMemory(memBlock); 164 backrefed = null; 165 } 166 167 override void dupTo(Matcher!Char m, void[] memBlock) 168 { 169 auto backtracking = cast(BacktrackingMatcher) m; 170 backtracking.s = s; 171 backtracking.front = front; 172 backtracking.index = index; 173 backtracking.exhausted = exhausted; 174 backtracking.initExternalMemory(memBlock); 175 } 176 177 override Matcher!Char rearm(in Char[] data) 178 { 179 merge[] = Trace.init; 180 exhausted = false; 181 s = Stream(data); 182 next(); 183 return this; 184 } 185 186 this(ref const RegEx program, Stream stream, void[] memBlock, dchar ch, DataIndex idx) 187 { 188 _refCount = 1; 189 re = program; 190 nativeFn = null; 191 initialize(program, stream, memBlock); 192 front = ch; 193 index = idx; 194 } 195 196 this(ref const RegEx program, MatchFn func, Stream stream, void[] memBlock) 197 { 198 _refCount = 1; 199 re = program; 200 initialize(program, stream, memBlock); 201 nativeFn = func; 202 next(); 203 } 204 205 this(ref const RegEx program, Stream stream, void[] memBlock) 206 { 207 _refCount = 1; 208 re = program; 209 nativeFn = null; 210 initialize(program, stream, memBlock); 211 next(); 212 } 213 214 auto fwdMatcher(ref const RegEx re, void[] memBlock) 215 { 216 alias BackMatcher = BacktrackingMatcher!(Char, Stream); 217 auto fwdMatcher = new BackMatcher(re, s, memBlock, front, index); 218 return fwdMatcher; 219 } 220 221 auto bwdMatcher(ref const RegEx re, void[] memBlock) 222 { 223 alias BackMatcher = BacktrackingMatcher!(Char, typeof(s.loopBack(index))); 224 auto fwdMatcher = 225 new BackMatcher(re, s.loopBack(index), memBlock); 226 return fwdMatcher; 227 } 228 229 // 230 int matchFinalize() 231 { 232 immutable start = index; 233 immutable val = matchImpl(); 234 if (val) 235 {//stream is updated here 236 matches[0].begin = start; 237 matches[0].end = index; 238 if (!(re.flags & RegexOption.global) || atEnd) 239 exhausted = true; 240 if (start == index)//empty match advances input 241 next(); 242 return val; 243 } 244 else 245 return 0; 246 } 247 248 //lookup next match, fill matches with indices into input 249 override int match(Group!DataIndex[] matches) 250 { 251 debug(std_regex_matcher) 252 { 253 writeln("------------------------------------------"); 254 } 255 if (exhausted) //all matches collected 256 return false; 257 this.matches = matches; 258 if (re.flags & RegexInfo.oneShot) 259 { 260 exhausted = true; 261 const DataIndex start = index; 262 immutable m = matchImpl(); 263 if (m) 264 { 265 matches[0].begin = start; 266 matches[0].end = index; 267 } 268 return m; 269 } 270 static if (kicked) 271 { 272 if (!re.kickstart.empty) 273 { 274 for (;;) 275 { 276 immutable val = matchFinalize(); 277 if (val) 278 return val; 279 else 280 { 281 if (atEnd) 282 break; 283 search(); 284 if (atEnd) 285 { 286 exhausted = true; 287 return matchFinalize(); 288 } 289 } 290 } 291 exhausted = true; 292 return 0; //early return 293 } 294 } 295 //no search available - skip a char at a time 296 for (;;) 297 { 298 immutable val = matchFinalize(); 299 if (val) 300 return val; 301 else 302 { 303 if (atEnd) 304 break; 305 next(); 306 if (atEnd) 307 { 308 exhausted = true; 309 return matchFinalize(); 310 } 311 } 312 } 313 exhausted = true; 314 return 0; 315 } 316 317 /+ 318 match subexpression against input, 319 results are stored in matches 320 +/ 321 int matchImpl() pure 322 { 323 if (nativeFn) 324 { 325 debug(std_regex_ctr) writeln("using C-T matcher"); 326 return nativeFn(this); 327 } 328 else 329 { 330 pc = 0; 331 counter = 0; 332 lastState = 0; 333 infiniteNesting = 0; 334 matches[] = Group!DataIndex.init; 335 auto start = s._index; 336 debug(std_regex_matcher) 337 writeln("Try match starting at ", s[index .. s.lastIndex]); 338 for (;;) 339 { 340 debug(std_regex_matcher) 341 writefln("PC: %s\tCNT: %s\t%s \tfront: %s src: %s", 342 pc, counter, disassemble(re.ir, pc, re.dict), 343 front, s._index); 344 switch (re.ir[pc].code) 345 { 346 case IR.OrChar://assumes IRL!(OrChar) == 1 347 if (atEnd) 348 goto L_backtrack; 349 uint len = re.ir[pc].sequence; 350 uint end = pc + len; 351 if (re.ir[pc].data != front && re.ir[pc+1].data != front) 352 { 353 for (pc = pc+2; pc < end; pc++) 354 if (re.ir[pc].data == front) 355 break; 356 if (pc == end) 357 goto L_backtrack; 358 } 359 pc = end; 360 next(); 361 break; 362 case IR.Char: 363 if (atEnd || front != re.ir[pc].data) 364 goto L_backtrack; 365 pc += IRL!(IR.Char); 366 next(); 367 break; 368 case IR.Any: 369 if (atEnd) 370 goto L_backtrack; 371 pc += IRL!(IR.Any); 372 next(); 373 break; 374 case IR.CodepointSet: 375 if (atEnd || !re.charsets[re.ir[pc].data].scanFor(front)) 376 goto L_backtrack; 377 next(); 378 pc += IRL!(IR.CodepointSet); 379 break; 380 case IR.Trie: 381 if (atEnd || !re.matchers[re.ir[pc].data][front]) 382 goto L_backtrack; 383 next(); 384 pc += IRL!(IR.Trie); 385 break; 386 case IR.Wordboundary: 387 dchar back; 388 DataIndex bi; 389 //at start & end of input 390 if (atStart && wordMatcher[front]) 391 { 392 pc += IRL!(IR.Wordboundary); 393 break; 394 } 395 else if (atEnd && s.loopBack(index).nextChar(back, bi) 396 && wordMatcher[back]) 397 { 398 pc += IRL!(IR.Wordboundary); 399 break; 400 } 401 else if (s.loopBack(index).nextChar(back, bi)) 402 { 403 immutable af = wordMatcher[front]; 404 immutable ab = wordMatcher[back]; 405 if (af ^ ab) 406 { 407 pc += IRL!(IR.Wordboundary); 408 break; 409 } 410 } 411 goto L_backtrack; 412 case IR.Notwordboundary: 413 dchar back; 414 DataIndex bi; 415 //at start & end of input 416 if (atStart && wordMatcher[front]) 417 goto L_backtrack; 418 else if (atEnd && s.loopBack(index).nextChar(back, bi) 419 && wordMatcher[back]) 420 goto L_backtrack; 421 else if (s.loopBack(index).nextChar(back, bi)) 422 { 423 immutable af = wordMatcher[front]; 424 immutable ab = wordMatcher[back]; 425 if (af ^ ab) 426 goto L_backtrack; 427 } 428 pc += IRL!(IR.Wordboundary); 429 break; 430 case IR.Bof: 431 if (atStart) 432 pc += IRL!(IR.Bol); 433 else 434 goto L_backtrack; 435 break; 436 case IR.Bol: 437 dchar back; 438 DataIndex bi; 439 if (atStart) 440 pc += IRL!(IR.Bol); 441 else if (s.loopBack(index).nextChar(back,bi) 442 && endOfLine(back, front == '\n')) 443 { 444 pc += IRL!(IR.Bol); 445 } 446 else 447 goto L_backtrack; 448 break; 449 case IR.Eof: 450 if (atEnd) 451 pc += IRL!(IR.Eol); 452 else 453 goto L_backtrack; 454 break; 455 case IR.Eol: 456 dchar back; 457 DataIndex bi; 458 debug(std_regex_matcher) writefln("EOL (front 0x%x) %s", front, s[index .. s.lastIndex]); 459 //no matching inside \r\n 460 if (atEnd || (endOfLine(front, s.loopBack(index).nextChar(back,bi) 461 && back == '\r'))) 462 { 463 pc += IRL!(IR.Eol); 464 } 465 else 466 goto L_backtrack; 467 break; 468 case IR.InfiniteStart, IR.InfiniteQStart: 469 pc += re.ir[pc].data + IRL!(IR.InfiniteStart); 470 //now pc is at end IR.Infinite(Q)End 471 uint len = re.ir[pc].data; 472 if (re.ir[pc].code == IR.InfiniteEnd) 473 { 474 pushState(pc+IRL!(IR.InfiniteEnd), counter); 475 pc -= len; 476 } 477 else 478 { 479 pushState(pc - len, counter); 480 pc += IRL!(IR.InfiniteEnd); 481 } 482 break; 483 case IR.InfiniteBloomStart: 484 pc += re.ir[pc].data + IRL!(IR.InfiniteBloomStart); 485 //now pc is at end IR.InfiniteBloomEnd 486 immutable len = re.ir[pc].data; 487 immutable filterIdx = re.ir[pc+2].raw; 488 if (re.filters[filterIdx][front]) 489 pushState(pc+IRL!(IR.InfiniteBloomEnd), counter); 490 pc -= len; 491 break; 492 case IR.RepeatStart, IR.RepeatQStart: 493 pc += re.ir[pc].data + IRL!(IR.RepeatStart); 494 break; 495 case IR.RepeatEnd: 496 case IR.RepeatQEnd: 497 if (merge[re.ir[pc + 1].raw+counter].mark(index)) 498 { 499 // merged! 500 goto L_backtrack; 501 } 502 //len, step, min, max 503 immutable len = re.ir[pc].data; 504 immutable step = re.ir[pc+2].raw; 505 immutable min = re.ir[pc+3].raw; 506 immutable max = re.ir[pc+4].raw; 507 if (counter < min) 508 { 509 counter += step; 510 pc -= len; 511 } 512 else if (counter < max) 513 { 514 if (re.ir[pc].code == IR.RepeatEnd) 515 { 516 pushState(pc + IRL!(IR.RepeatEnd), counter%step); 517 counter += step; 518 pc -= len; 519 } 520 else 521 { 522 pushState(pc - len, counter + step); 523 counter = counter%step; 524 pc += IRL!(IR.RepeatEnd); 525 } 526 } 527 else 528 { 529 counter = counter%step; 530 pc += IRL!(IR.RepeatEnd); 531 } 532 break; 533 case IR.InfiniteEnd: 534 case IR.InfiniteQEnd: 535 debug(std_regex_matcher) writeln("Infinited nesting:", infiniteNesting); 536 if (merge[re.ir[pc + 1].raw+counter].mark(index)) 537 { 538 // merged! 539 goto L_backtrack; 540 } 541 immutable len = re.ir[pc].data; 542 if (re.ir[pc].code == IR.InfiniteEnd) 543 { 544 pushState(pc + IRL!(IR.InfiniteEnd), counter); 545 pc -= len; 546 } 547 else 548 { 549 pushState(pc-len, counter); 550 pc += IRL!(IR.InfiniteEnd); 551 } 552 break; 553 case IR.InfiniteBloomEnd: 554 debug(std_regex_matcher) writeln("Infinited nesting:", infiniteNesting); 555 if (merge[re.ir[pc + 1].raw+counter].mark(index)) 556 { 557 // merged! 558 goto L_backtrack; 559 } 560 immutable len = re.ir[pc].data; 561 immutable filterIdx = re.ir[pc+2].raw; 562 if (re.filters[filterIdx][front]) 563 { 564 infiniteNesting--; 565 pushState(pc + IRL!(IR.InfiniteBloomEnd), counter); 566 infiniteNesting++; 567 } 568 pc -= len; 569 break; 570 case IR.OrEnd: 571 if (merge[re.ir[pc + 1].raw+counter].mark(index)) 572 { 573 // merged! 574 goto L_backtrack; 575 } 576 pc += IRL!(IR.OrEnd); 577 break; 578 case IR.OrStart: 579 pc += IRL!(IR.OrStart); 580 goto case; 581 case IR.Option: 582 immutable len = re.ir[pc].data; 583 if (re.ir[pc+len].code == IR.GotoEndOr)//not a last one 584 { 585 pushState(pc + len + IRL!(IR.Option), counter); //remember 2nd branch 586 } 587 pc += IRL!(IR.Option); 588 break; 589 case IR.GotoEndOr: 590 pc = pc + re.ir[pc].data + IRL!(IR.GotoEndOr); 591 break; 592 case IR.GroupStart: 593 immutable n = re.ir[pc].data; 594 matches[n].begin = index; 595 debug(std_regex_matcher) writefln("IR group #%u starts at %u", n, index); 596 pc += IRL!(IR.GroupStart); 597 break; 598 case IR.GroupEnd: 599 immutable n = re.ir[pc].data; 600 matches[n].end = index; 601 debug(std_regex_matcher) writefln("IR group #%u ends at %u", n, index); 602 pc += IRL!(IR.GroupEnd); 603 break; 604 case IR.LookaheadStart: 605 case IR.NeglookaheadStart: 606 immutable len = re.ir[pc].data; 607 auto save = index; 608 immutable ms = re.ir[pc+1].raw, me = re.ir[pc+2].raw; 609 auto mem = pureMalloc(initialMemory(re))[0 .. initialMemory(re)]; 610 scope(exit) pureFree(mem.ptr); 611 auto slicedRe = re.withCode(re.ir[ 612 pc+IRL!(IR.LookaheadStart) .. pc+IRL!(IR.LookaheadStart)+len+IRL!(IR.LookaheadEnd) 613 ]); 614 static if (Stream.isLoopback) 615 { 616 auto matcher = bwdMatcher(slicedRe, mem); 617 } 618 else 619 { 620 auto matcher = fwdMatcher(slicedRe, mem); 621 } 622 matcher.matches = matches[ms .. me]; 623 matcher.backrefed = backrefed.empty ? matches : backrefed; 624 immutable match = (matcher.matchImpl() != 0) ^ (re.ir[pc].code == IR.NeglookaheadStart); 625 s.reset(save); 626 next(); 627 if (!match) 628 goto L_backtrack; 629 else 630 { 631 pc += IRL!(IR.LookaheadStart)+len+IRL!(IR.LookaheadEnd); 632 } 633 break; 634 case IR.LookbehindStart: 635 case IR.NeglookbehindStart: 636 immutable len = re.ir[pc].data; 637 immutable ms = re.ir[pc+1].raw, me = re.ir[pc+2].raw; 638 auto mem = pureMalloc(initialMemory(re))[0 .. initialMemory(re)]; 639 scope(exit) pureFree(mem.ptr); 640 auto slicedRe = re.withCode(re.ir[ 641 pc + IRL!(IR.LookbehindStart) .. pc + IRL!(IR.LookbehindStart) + len + IRL!(IR.LookbehindEnd) 642 ]); 643 static if (Stream.isLoopback) 644 { 645 alias Matcher = BacktrackingMatcher!(Char, Stream); 646 auto matcher = new Matcher(slicedRe, s, mem, front, index); 647 } 648 else 649 { 650 alias Matcher = BacktrackingMatcher!(Char, typeof(s.loopBack(index))); 651 auto matcher = new Matcher(slicedRe, s.loopBack(index), mem); 652 } 653 matcher.matches = matches[ms .. me]; 654 matcher.backrefed = backrefed.empty ? matches : backrefed; 655 immutable match = (matcher.matchImpl() != 0) ^ (re.ir[pc].code == IR.NeglookbehindStart); 656 if (!match) 657 goto L_backtrack; 658 else 659 { 660 pc += IRL!(IR.LookbehindStart)+len+IRL!(IR.LookbehindEnd); 661 } 662 break; 663 case IR.Backref: 664 immutable n = re.ir[pc].data; 665 auto referenced = re.ir[pc].localRef 666 ? s[matches[n].begin .. matches[n].end] 667 : s[backrefed[n].begin .. backrefed[n].end]; 668 while (!atEnd && !referenced.empty && front == referenced.front) 669 { 670 next(); 671 referenced.popFront(); 672 } 673 if (referenced.empty) 674 pc++; 675 else 676 goto L_backtrack; 677 break; 678 case IR.Nop: 679 pc += IRL!(IR.Nop); 680 break; 681 case IR.LookaheadEnd: 682 case IR.NeglookaheadEnd: 683 case IR.LookbehindEnd: 684 case IR.NeglookbehindEnd: 685 case IR.End: 686 // cleanup stale stack blocks if any 687 while (prevStack()) {} 688 return re.ir[pc].data; 689 default: 690 debug printBytecode(re.ir[0..$]); 691 assert(0); 692 L_backtrack: 693 if (!popState()) 694 { 695 s.reset(start); 696 return 0; 697 } 698 } 699 } 700 } 701 assert(0); 702 } 703 704 @property size_t stackAvail() 705 { 706 return memory.length - lastState; 707 } 708 709 void stackPush(T)(T val) 710 if (!isDynamicArray!T) 711 { 712 *cast(T*)&memory[lastState] = val; 713 enum delta = (T.sizeof+size_t.sizeof/2)/size_t.sizeof; 714 lastState += delta; 715 debug(std_regex_matcher) writeln("push element SP= ", lastState); 716 } 717 718 void stackPush(T)(T[] val) 719 { 720 static assert(T.sizeof % size_t.sizeof == 0); 721 (cast(T*)&memory[lastState])[0 .. val.length] 722 = val[0..$]; 723 lastState += val.length*(T.sizeof/size_t.sizeof); 724 debug(std_regex_matcher) writeln("push array SP= ", lastState); 725 } 726 727 void stackPop(T)(ref T val) 728 if (!isDynamicArray!T) 729 { 730 enum delta = (T.sizeof+size_t.sizeof/2)/size_t.sizeof; 731 lastState -= delta; 732 val = *cast(T*)&memory[lastState]; 733 debug(std_regex_matcher) writeln("pop element SP= ", lastState); 734 } 735 736 void stackPop(T)(T[] val) 737 { 738 stackPop(val); // call ref version 739 } 740 void stackPop(T)(ref T[] val) 741 { 742 lastState -= val.length*(T.sizeof/size_t.sizeof); 743 val[0..$] = (cast(T*)&memory[lastState])[0 .. val.length]; 744 debug(std_regex_matcher) writeln("pop array SP= ", lastState); 745 } 746 //helper function, saves engine state 747 void pushState(uint pc, uint counter) 748 { 749 if (stateSize + 2 * matches.length > stackAvail) 750 { 751 newStack(); 752 } 753 *cast(State*)&memory[lastState] = 754 State(index, pc, counter, infiniteNesting); 755 lastState += stateSize; 756 memory[lastState .. lastState + 2 * matches.length] = (cast(size_t[]) matches)[]; 757 lastState += 2*matches.length; 758 debug(std_regex_matcher) 759 writefln("Saved(pc=%s) front: %s src: %s", 760 pc, front, s[index .. s.lastIndex]); 761 } 762 763 //helper function, restores engine state 764 bool popState() 765 { 766 if (!lastState && !prevStack()) 767 return false; 768 lastState -= 2*matches.length; 769 auto pm = cast(size_t[]) matches; 770 pm[] = memory[lastState .. lastState + 2 * matches.length]; 771 lastState -= stateSize; 772 State* state = cast(State*)&memory[lastState]; 773 index = state.index; 774 pc = state.pc; 775 counter = state.counter; 776 infiniteNesting = state.infiniteNesting; 777 debug(std_regex_matcher) 778 { 779 writefln("Restored matches", front, s[index .. s.lastIndex]); 780 foreach (i, m; matches) 781 writefln("Sub(%d) : %s..%s", i, m.begin, m.end); 782 } 783 s.reset(index); 784 next(); 785 debug(std_regex_matcher) 786 writefln("Backtracked (pc=%s) front: %s src: %s", 787 pc, front, s[index .. s.lastIndex]); 788 return true; 789 } 790 } 791 792 //very shitty string formatter, $$ replaced with next argument converted to string 793 @trusted string ctSub( U...)(string format, U args) 794 { 795 import std.conv : to; 796 bool seenDollar; 797 foreach (i, ch; format) 798 { 799 if (ch == '$') 800 { 801 if (seenDollar) 802 { 803 static if (args.length > 0) 804 { 805 return format[0 .. i - 1] ~ to!string(args[0]) 806 ~ ctSub(format[i + 1 .. $], args[1 .. $]); 807 } 808 else 809 assert(0); 810 } 811 else 812 seenDollar = true; 813 } 814 else 815 seenDollar = false; 816 817 } 818 return format; 819 } 820 821 struct CtContext 822 { 823 import std.conv : to, text; 824 //dirty flags 825 bool counter; 826 //to mark the portion of matches to save 827 int match, total_matches; 828 int reserved; 829 const(CodepointInterval)[][] charsets; 830 831 832 //state of codegenerator 833 static struct CtState 834 { 835 string code; 836 int addr; 837 } 838 839 this(Char)(ref const Regex!Char re) 840 { 841 match = 1; 842 reserved = 1; //first match is skipped 843 total_matches = re.ngroup; 844 foreach (ref set; re.charsets) 845 { 846 charsets ~= set.intervals; 847 } 848 } 849 850 CtContext lookaround(uint s, uint e) 851 { 852 CtContext ct; 853 ct.total_matches = e - s; 854 ct.match = 1; 855 return ct; 856 } 857 858 //restore state having current context 859 string restoreCode() 860 { 861 string text; 862 //stack is checked in L_backtrack 863 text ~= counter 864 ? " 865 stackPop(counter);" 866 : " 867 counter = 0;"; 868 if (match < total_matches) 869 { 870 text ~= ctSub(" 871 stackPop(matches[$$..$$]);", reserved, match); 872 text ~= ctSub(" 873 matches[$$..$] = typeof(matches[0]).init;", match); 874 } 875 else 876 text ~= ctSub(" 877 stackPop(matches[$$..$]);", reserved); 878 return text; 879 } 880 881 //save state having current context 882 string saveCode(uint pc, string count_expr="counter") 883 { 884 string text = ctSub(" 885 if (stackAvail < $$*(Group!(DataIndex)).sizeof/size_t.sizeof + $$) 886 { 887 newStack(); 888 }", match - reserved, cast(int) counter + 2); 889 if (match < total_matches) 890 text ~= ctSub(" 891 stackPush(matches[$$..$$]);", reserved, match); 892 else 893 text ~= ctSub(" 894 stackPush(matches[$$..$]);", reserved); 895 text ~= counter ? ctSub(" 896 stackPush($$);", count_expr) : ""; 897 text ~= ctSub(" 898 stackPush(index); stackPush($$); \n", pc); 899 return text; 900 } 901 902 // 903 CtState ctGenBlock(const(Bytecode)[] ir, int addr) 904 { 905 CtState result; 906 result.addr = addr; 907 while (!ir.empty) 908 { 909 auto n = ctGenGroup(ir, result.addr); 910 result.code ~= n.code; 911 result.addr = n.addr; 912 } 913 return result; 914 } 915 916 // 917 CtState ctGenGroup(ref const(Bytecode)[] ir, int addr) 918 { 919 import std.algorithm.comparison : max; 920 auto bailOut = "goto L_backtrack;"; 921 auto nextInstr = ctSub("goto case $$;", addr+1); 922 CtState r; 923 assert(!ir.empty); 924 switch (ir[0].code) 925 { 926 case IR.InfiniteStart, IR.InfiniteBloomStart,IR.InfiniteQStart, IR.RepeatStart, IR.RepeatQStart: 927 immutable infLoop = 928 ir[0].code == IR.InfiniteStart || ir[0].code == IR.InfiniteQStart || 929 ir[0].code == IR.InfiniteBloomStart; 930 931 counter = counter || 932 ir[0].code == IR.RepeatStart || ir[0].code == IR.RepeatQStart; 933 immutable len = ir[0].data; 934 auto nir = ir[ir[0].length .. ir[0].length+len]; 935 r = ctGenBlock(nir, addr+1); 936 //start/end codegen 937 //r.addr is at last test+ jump of loop, addr+1 is body of loop 938 nir = ir[ir[0].length + len .. $]; 939 r.code = ctGenFixupCode(ir[0 .. ir[0].length], addr, r.addr) ~ r.code; 940 r.code ~= ctGenFixupCode(nir, r.addr, addr+1); 941 r.addr += 2; //account end instruction + restore state 942 ir = nir; 943 break; 944 case IR.OrStart: 945 immutable len = ir[0].data; 946 auto nir = ir[ir[0].length .. ir[0].length+len]; 947 r = ctGenAlternation(nir, addr); 948 ir = ir[ir[0].length + len .. $]; 949 assert(ir[0].code == IR.OrEnd); 950 ir = ir[ir[0].length..$]; 951 break; 952 case IR.LookaheadStart: 953 case IR.NeglookaheadStart: 954 case IR.LookbehindStart: 955 case IR.NeglookbehindStart: 956 immutable len = ir[0].data; 957 immutable behind = ir[0].code == IR.LookbehindStart || ir[0].code == IR.NeglookbehindStart; 958 immutable negative = ir[0].code == IR.NeglookaheadStart || ir[0].code == IR.NeglookbehindStart; 959 string fwdType = "typeof(fwdMatcher(re, []))"; 960 string bwdType = "typeof(bwdMatcher(re, []))"; 961 string fwdCreate = "fwdMatcher(re, mem)"; 962 string bwdCreate = "bwdMatcher(re, mem)"; 963 immutable start = IRL!(IR.LookbehindStart); 964 immutable end = IRL!(IR.LookbehindStart)+len+IRL!(IR.LookaheadEnd); 965 CtContext context = lookaround(ir[1].raw, ir[2].raw); //split off new context 966 auto slice = ir[start .. end]; 967 r.code ~= ctSub(` 968 case $$: //fake lookaround "atom" 969 static if (typeof(matcher.s).isLoopback) 970 alias Lookaround = $$; 971 else 972 alias Lookaround = $$; 973 static bool matcher_$$(Lookaround matcher) @trusted 974 { 975 //(neg)lookaround piece start 976 $$ 977 //(neg)lookaround piece ends 978 } 979 auto save = index; 980 auto mem = pureMalloc(initialMemory(re))[0 .. initialMemory(re)]; 981 scope(exit) pureFree(mem.ptr); 982 static if (typeof(matcher.s).isLoopback) 983 auto lookaround = $$; 984 else 985 auto lookaround = $$; 986 lookaround.matches = matches[$$..$$]; 987 lookaround.backrefed = backrefed.empty ? matches : backrefed; 988 lookaround.nativeFn = &matcher_$$; //hookup closure's binary code 989 int match = $$; 990 s.reset(save); 991 next(); 992 if (match) 993 $$ 994 else 995 $$`, addr, 996 behind ? fwdType : bwdType, behind ? bwdType : fwdType, 997 addr, context.ctGenRegEx(slice), 998 behind ? fwdCreate : bwdCreate, behind ? bwdCreate : fwdCreate, 999 ir[1].raw, ir[2].raw, //start - end of matches slice 1000 addr, 1001 negative ? "!lookaround.matchImpl()" : "lookaround.matchImpl()", 1002 nextInstr, bailOut); 1003 ir = ir[end .. $]; 1004 r.addr = addr + 1; 1005 break; 1006 case IR.LookaheadEnd: case IR.NeglookaheadEnd: 1007 case IR.LookbehindEnd: case IR.NeglookbehindEnd: 1008 ir = ir[IRL!(IR.LookaheadEnd) .. $]; 1009 r.addr = addr; 1010 break; 1011 default: 1012 assert(ir[0].isAtom, text(ir[0].mnemonic)); 1013 r = ctGenAtom(ir, addr); 1014 } 1015 return r; 1016 } 1017 1018 //generate source for bytecode contained in OrStart ... OrEnd 1019 CtState ctGenAlternation(const(Bytecode)[] ir, int addr) 1020 { 1021 CtState[] pieces; 1022 CtState r; 1023 enum optL = IRL!(IR.Option); 1024 for (;;) 1025 { 1026 assert(ir[0].code == IR.Option); 1027 auto len = ir[0].data; 1028 if (optL+len < ir.length && ir[optL+len].code == IR.Option)//not a last option 1029 { 1030 auto nir = ir[optL .. optL+len-IRL!(IR.GotoEndOr)]; 1031 r = ctGenBlock(nir, addr+2);//space for Option + restore state 1032 //r.addr+1 to account GotoEndOr at end of branch 1033 r.code = ctGenFixupCode(ir[0 .. ir[0].length], addr, r.addr+1) ~ r.code; 1034 addr = r.addr+1;//leave space for GotoEndOr 1035 pieces ~= r; 1036 ir = ir[optL + len .. $]; 1037 } 1038 else 1039 { 1040 pieces ~= ctGenBlock(ir[optL..$], addr); 1041 addr = pieces[$-1].addr; 1042 break; 1043 } 1044 } 1045 r = pieces[0]; 1046 for (uint i = 1; i < pieces.length; i++) 1047 { 1048 r.code ~= ctSub(` 1049 case $$: 1050 goto case $$; `, pieces[i-1].addr, addr); 1051 r.code ~= pieces[i].code; 1052 } 1053 r.addr = addr; 1054 return r; 1055 } 1056 1057 // generate fixup code for instruction in ir, 1058 // fixup means it has an alternative way for control flow 1059 string ctGenFixupCode(const(Bytecode)[] ir, int addr, int fixup) 1060 { 1061 return ctGenFixupCode(ir, addr, fixup); // call ref Bytecode[] version 1062 } 1063 string ctGenFixupCode(ref const(Bytecode)[] ir, int addr, int fixup) 1064 { 1065 string r; 1066 string testCode; 1067 r = ctSub(` 1068 case $$: debug(std_regex_matcher) writeln("#$$");`, 1069 addr, addr); 1070 switch (ir[0].code) 1071 { 1072 case IR.InfiniteStart, IR.InfiniteQStart, IR.InfiniteBloomStart: 1073 r ~= ctSub( ` 1074 goto case $$;`, fixup); 1075 ir = ir[ir[0].length..$]; 1076 break; 1077 case IR.InfiniteEnd: 1078 testCode = ctQuickTest(ir[IRL!(IR.InfiniteEnd) .. $],addr + 1); 1079 r ~= ctSub( ` 1080 if (merge[$$+counter].mark(index)) 1081 { 1082 // merged! 1083 goto L_backtrack; 1084 } 1085 1086 $$ 1087 { 1088 $$ 1089 } 1090 goto case $$; 1091 case $$: //restore state and go out of loop 1092 $$ 1093 goto case;`, ir[1].raw, testCode, saveCode(addr+1), fixup, 1094 addr+1, restoreCode()); 1095 ir = ir[ir[0].length..$]; 1096 break; 1097 case IR.InfiniteBloomEnd: 1098 //TODO: check bloom filter and skip on failure 1099 testCode = ctQuickTest(ir[IRL!(IR.InfiniteBloomEnd) .. $],addr + 1); 1100 r ~= ctSub( ` 1101 if (merge[$$+counter].mark(index)) 1102 { 1103 // merged! 1104 goto L_backtrack; 1105 } 1106 1107 $$ 1108 { 1109 $$ 1110 } 1111 goto case $$; 1112 case $$: //restore state and go out of loop 1113 $$ 1114 goto case;`, ir[1].raw, testCode, saveCode(addr+1), fixup, 1115 addr+1, restoreCode()); 1116 ir = ir[ir[0].length..$]; 1117 break; 1118 case IR.InfiniteQEnd: 1119 testCode = ctQuickTest(ir[IRL!(IR.InfiniteEnd) .. $],addr + 1); 1120 auto altCode = testCode.length ? ctSub("else goto case $$;", fixup) : ""; 1121 r ~= ctSub( ` 1122 if (merge[$$+counter].mark(index)) 1123 { 1124 // merged! 1125 goto L_backtrack; 1126 } 1127 1128 $$ 1129 { 1130 $$ 1131 goto case $$; 1132 } 1133 $$ 1134 case $$://restore state and go inside loop 1135 $$ 1136 goto case $$;`, ir[1].raw, 1137 testCode, saveCode(addr+1), addr+2, altCode, 1138 addr+1, restoreCode(), fixup); 1139 ir = ir[ir[0].length..$]; 1140 break; 1141 case IR.RepeatStart, IR.RepeatQStart: 1142 r ~= ctSub( ` 1143 goto case $$;`, fixup); 1144 ir = ir[ir[0].length..$]; 1145 break; 1146 case IR.RepeatEnd, IR.RepeatQEnd: 1147 //len, step, min, max 1148 immutable len = ir[0].data; 1149 immutable step = ir[2].raw; 1150 immutable min = ir[3].raw; 1151 immutable max = ir[4].raw; 1152 r ~= ctSub(` 1153 if (merge[$$+counter].mark(index)) 1154 { 1155 // merged! 1156 goto L_backtrack; 1157 } 1158 if (counter < $$) 1159 { 1160 debug(std_regex_matcher) writeln("RepeatEnd min case pc=", $$); 1161 counter += $$; 1162 goto case $$; 1163 }`, ir[1].raw, min, addr, step, fixup); 1164 if (ir[0].code == IR.RepeatEnd) 1165 { 1166 string counter_expr = ctSub("counter % $$", step); 1167 r ~= ctSub(` 1168 else if (counter < $$) 1169 { 1170 $$ 1171 counter += $$; 1172 goto case $$; 1173 }`, max, saveCode(addr+1, counter_expr), step, fixup); 1174 } 1175 else 1176 { 1177 string counter_expr = ctSub("counter % $$", step); 1178 r ~= ctSub(` 1179 else if (counter < $$) 1180 { 1181 $$ 1182 counter = counter % $$; 1183 goto case $$; 1184 }`, max, saveCode(addr+1,counter_expr), step, addr+2); 1185 } 1186 r ~= ctSub(` 1187 else 1188 { 1189 counter = counter % $$; 1190 goto case $$; 1191 } 1192 case $$: //restore state 1193 $$ 1194 goto case $$;`, step, addr+2, addr+1, restoreCode(), 1195 ir[0].code == IR.RepeatEnd ? addr+2 : fixup ); 1196 ir = ir[ir[0].length..$]; 1197 break; 1198 case IR.Option: 1199 r ~= ctSub( ` 1200 { 1201 $$ 1202 } 1203 goto case $$; 1204 case $$://restore thunk to go to the next group 1205 $$ 1206 goto case $$;`, saveCode(addr+1), addr+2, 1207 addr+1, restoreCode(), fixup); 1208 ir = ir[ir[0].length..$]; 1209 break; 1210 default: 1211 assert(0, text(ir[0].mnemonic)); 1212 } 1213 return r; 1214 } 1215 1216 1217 string ctQuickTest(const(Bytecode)[] ir, int id) 1218 { 1219 uint pc = 0; 1220 while (pc < ir.length && ir[pc].isAtom) 1221 { 1222 if (ir[pc].code == IR.GroupStart || ir[pc].code == IR.GroupEnd) 1223 { 1224 pc++; 1225 } 1226 else if (ir[pc].code == IR.Backref) 1227 break; 1228 else 1229 { 1230 auto code = ctAtomCode(ir[pc..$], -1); 1231 return ctSub(` 1232 int test_$$() 1233 { 1234 $$ //$$ 1235 } 1236 if (test_$$() >= 0)`, id, code.ptr ? code : "return 0;", 1237 ir[pc].mnemonic, id); 1238 } 1239 } 1240 return ""; 1241 } 1242 1243 //process & generate source for simple bytecodes at front of ir using address addr 1244 CtState ctGenAtom(ref const(Bytecode)[] ir, int addr) 1245 { 1246 CtState result; 1247 result.code = ctAtomCode(ir, addr); 1248 ir.popFrontN(ir[0].code == IR.OrChar ? ir[0].sequence : ir[0].length); 1249 result.addr = addr + 1; 1250 return result; 1251 } 1252 1253 //D code for atom at ir using address addr, addr < 0 means quickTest 1254 string ctAtomCode(const(Bytecode)[] ir, int addr) 1255 { 1256 string code; 1257 string bailOut, nextInstr; 1258 if (addr < 0) 1259 { 1260 bailOut = "return -1;"; 1261 nextInstr = "return 0;"; 1262 } 1263 else 1264 { 1265 bailOut = "goto L_backtrack;"; 1266 nextInstr = ctSub("goto case $$;", addr+1); 1267 code ~= ctSub( ` 1268 case $$: debug(std_regex_matcher) writeln("#$$"); 1269 `, addr, addr); 1270 } 1271 switch (ir[0].code) 1272 { 1273 case IR.OrChar://assumes IRL!(OrChar) == 1 1274 code ~= ctSub(` 1275 if (atEnd) 1276 $$`, bailOut); 1277 immutable len = ir[0].sequence; 1278 for (uint i = 0; i < len; i++) 1279 { 1280 code ~= ctSub( ` 1281 if (front == $$) 1282 { 1283 $$ 1284 $$ 1285 }`, ir[i].data, addr >= 0 ? "next();" :"", nextInstr); 1286 } 1287 code ~= ctSub( ` 1288 $$`, bailOut); 1289 break; 1290 case IR.Char: 1291 code ~= ctSub( ` 1292 if (atEnd || front != $$) 1293 $$ 1294 $$ 1295 $$`, ir[0].data, bailOut, addr >= 0 ? "next();" :"", nextInstr); 1296 break; 1297 case IR.Any: 1298 code ~= ctSub( ` 1299 if (atEnd || (!(re.flags & RegexOption.singleline) 1300 && (front == '\r' || front == '\n'))) 1301 $$ 1302 $$ 1303 $$`, bailOut, addr >= 0 ? "next();" :"",nextInstr); 1304 break; 1305 case IR.CodepointSet: 1306 if (charsets.length) 1307 { 1308 string name = `func_`~to!string(addr+1); 1309 string funcCode = CodepointSet.toSourceCode(charsets[ir[0].data], name); 1310 code ~= ctSub( ` 1311 static $$ 1312 if (atEnd || !$$(front)) 1313 $$ 1314 $$ 1315 $$`, funcCode, name, bailOut, addr >= 0 ? "next();" :"", nextInstr); 1316 } 1317 else 1318 code ~= ctSub( ` 1319 if (atEnd || !re.charsets[$$].scanFor(front)) 1320 $$ 1321 $$ 1322 $$`, ir[0].data, bailOut, addr >= 0 ? "next();" :"", nextInstr); 1323 break; 1324 case IR.Trie: 1325 if (charsets.length && charsets[ir[0].data].length <= 8) 1326 goto case IR.CodepointSet; 1327 code ~= ctSub( ` 1328 if (atEnd || !re.matchers[$$][front]) 1329 $$ 1330 $$ 1331 $$`, ir[0].data, bailOut, addr >= 0 ? "next();" :"", nextInstr); 1332 break; 1333 case IR.Wordboundary: 1334 code ~= ctSub( ` 1335 dchar back; 1336 DataIndex bi; 1337 if (atStart && wordMatcher[front]) 1338 { 1339 $$ 1340 } 1341 else if (atEnd && s.loopBack(index).nextChar(back, bi) 1342 && wordMatcher[back]) 1343 { 1344 $$ 1345 } 1346 else if (s.loopBack(index).nextChar(back, bi)) 1347 { 1348 bool af = wordMatcher[front]; 1349 bool ab = wordMatcher[back]; 1350 if (af ^ ab) 1351 { 1352 $$ 1353 } 1354 } 1355 $$`, nextInstr, nextInstr, nextInstr, bailOut); 1356 break; 1357 case IR.Notwordboundary: 1358 code ~= ctSub( ` 1359 dchar back; 1360 DataIndex bi; 1361 //at start & end of input 1362 if (atStart && wordMatcher[front]) 1363 $$ 1364 else if (atEnd && s.loopBack(index).nextChar(back, bi) 1365 && wordMatcher[back]) 1366 $$ 1367 else if (s.loopBack(index).nextChar(back, bi)) 1368 { 1369 bool af = wordMatcher[front]; 1370 bool ab = wordMatcher[back]; 1371 if (af ^ ab) 1372 $$ 1373 } 1374 $$`, bailOut, bailOut, bailOut, nextInstr); 1375 1376 break; 1377 case IR.Bol: 1378 code ~= ctSub(` 1379 dchar back; 1380 DataIndex bi; 1381 if (atStart || (s.loopBack(index).nextChar(back,bi) 1382 && endOfLine(back, front == '\n'))) 1383 { 1384 debug(std_regex_matcher) writeln("BOL matched"); 1385 $$ 1386 } 1387 else 1388 $$`, nextInstr, bailOut); 1389 1390 break; 1391 case IR.Bof: 1392 code ~= ctSub(` 1393 if (atStart) 1394 { 1395 debug(std_regex_matcher) writeln("BOF matched"); 1396 $$ 1397 } 1398 else 1399 $$`, nextInstr, bailOut); 1400 break; 1401 case IR.Eol: 1402 code ~= ctSub(` 1403 dchar back; 1404 DataIndex bi; 1405 debug(std_regex_matcher) writefln("EOL (front 0x%x) %s", front, s[index .. s.lastIndex]); 1406 //no matching inside \r\n 1407 if (atEnd || (endOfLine(front, s.loopBack(index).nextChar(back,bi) 1408 && back == '\r'))) 1409 { 1410 debug(std_regex_matcher) writeln("EOL matched"); 1411 $$ 1412 } 1413 else 1414 $$`, nextInstr, bailOut); 1415 break; 1416 case IR.Eof: 1417 code ~= ctSub(` 1418 if (atEnd) 1419 { 1420 debug(std_regex_matcher) writeln("BOF matched"); 1421 $$ 1422 } 1423 else 1424 $$`, nextInstr, bailOut); 1425 break; 1426 case IR.GroupStart: 1427 code ~= ctSub(` 1428 matches[$$].begin = index; 1429 $$`, ir[0].data, nextInstr); 1430 match = ir[0].data+1; 1431 break; 1432 case IR.GroupEnd: 1433 code ~= ctSub(` 1434 matches[$$].end = index; 1435 $$`, ir[0].data, nextInstr); 1436 break; 1437 case IR.Backref: 1438 string mStr = "auto referenced = "; 1439 mStr ~= ir[0].localRef 1440 ? ctSub("s[matches[$$].begin .. matches[$$].end];", 1441 ir[0].data, ir[0].data) 1442 : ctSub("s[backrefed[$$].begin .. backrefed[$$].end];", 1443 ir[0].data, ir[0].data); 1444 code ~= ctSub( ` 1445 $$ 1446 while (!atEnd && !referenced.empty && front == referenced.front) 1447 { 1448 next(); 1449 referenced.popFront(); 1450 } 1451 if (referenced.empty) 1452 $$ 1453 else 1454 $$`, mStr, nextInstr, bailOut); 1455 break; 1456 case IR.Nop: 1457 case IR.End: 1458 break; 1459 default: 1460 assert(0, text(ir[0].mnemonic, " is not supported yet")); 1461 } 1462 return code; 1463 } 1464 1465 //generate D code for the whole regex 1466 public string ctGenRegEx(const(Bytecode)[] ir) 1467 { 1468 auto bdy = ctGenBlock(ir, 0); 1469 auto r = ` 1470 import core.memory : pureMalloc, pureFree; 1471 with(matcher) 1472 { 1473 pc = 0; 1474 counter = 0; 1475 lastState = 0; 1476 matches[] = Group!DataIndex.init; 1477 auto start = s._index;`; 1478 r ~= ` 1479 goto StartLoop; 1480 debug(std_regex_matcher) writeln("Try CT matching starting at ",s[index .. s.lastIndex]); 1481 L_backtrack: 1482 if (lastState || prevStack()) 1483 { 1484 stackPop(pc); 1485 stackPop(index); 1486 s.reset(index); 1487 next(); 1488 } 1489 else 1490 { 1491 s.reset(start); 1492 return false; 1493 } 1494 StartLoop: 1495 switch (pc) 1496 { 1497 `; 1498 r ~= bdy.code; 1499 r ~= ctSub(` 1500 case $$: break;`,bdy.addr); 1501 r ~= ` 1502 default: 1503 assert(0); 1504 } 1505 // cleanup stale stack blocks 1506 while (prevStack()) {} 1507 return true; 1508 } 1509 `; 1510 return r; 1511 } 1512 1513 } 1514 1515 string ctGenRegExCode(Char)(const Regex!Char re) 1516 { 1517 auto context = CtContext(re); 1518 return context.ctGenRegEx(re.ir); 1519 } 1520