1 //Written in the D programming language 2 /* 3 Implementation of Thompson NFA std.regex engine. 4 Key point is evaluation of all possible threads (state) at each step 5 in a breadth-first manner, thereby geting some nice properties: 6 - looking at each character only once 7 - merging of equivalent threads, that gives matching process linear time complexity 8 */ 9 module std.regex.internal.thompson; 10 11 package(std.regex): 12 13 import std.range.primitives; 14 import std.regex.internal.ir; 15 16 //State of VM thread 17 struct Thread(DataIndex) 18 { 19 Thread* next; //intrusive linked list 20 uint pc; 21 uint counter; //loop counter 22 uint uopCounter; //counts micro operations inside one macro instruction (e.g. BackRef) 23 Group!DataIndex[1] matches; 24 } 25 26 //head-tail singly-linked list 27 struct ThreadList(DataIndex) 28 { 29 Thread!DataIndex* tip = null, toe = null; 30 //add new thread to the start of list 31 void insertFront(Thread!DataIndex* t) 32 { 33 if (tip) 34 { 35 t.next = tip; 36 tip = t; 37 } 38 else 39 { 40 t.next = null; 41 tip = toe = t; 42 } 43 } 44 //add new thread to the end of list 45 void insertBack(Thread!DataIndex* t) 46 { 47 if (toe) 48 { 49 toe.next = t; 50 toe = t; 51 } 52 else 53 tip = toe = t; 54 toe.next = null; 55 } 56 //move head element out of list 57 Thread!DataIndex* fetch() 58 { 59 auto t = tip; 60 if (tip == toe) 61 tip = toe = null; 62 else 63 tip = tip.next; 64 return t; 65 } 66 //non-destructive iteration of ThreadList 67 struct ThreadRange 68 { 69 const(Thread!DataIndex)* ct; 70 this(ThreadList tlist){ ct = tlist.tip; } 71 @property bool empty(){ return ct is null; } 72 @property const(Thread!DataIndex)* front(){ return ct; } 73 void popFront() 74 { 75 assert(ct); 76 ct = ct.next; 77 } 78 } 79 @property bool empty() 80 { 81 return tip == null; 82 } 83 ThreadRange opSlice() 84 { 85 return ThreadRange(this); 86 } 87 } 88 89 template ThompsonOps(E, S, bool withInput:true) 90 { 91 @trusted: 92 static bool op(IR code:IR.End)(E e, S* state) 93 { 94 with(e) with(state) 95 { 96 finish(t, matches, re.ir[t.pc].data); 97 //fix endpoint of the whole match 98 matches[0].end = index; 99 recycle(t); 100 //cut off low priority threads 101 recycle(clist); 102 recycle(worklist); 103 debug(std_regex_matcher) writeln("Finished thread ", matches); 104 return false; // no more state to eval 105 } 106 } 107 108 static bool op(IR code:IR.Wordboundary)(E e, S* state) 109 { 110 with(e) with(state) 111 { 112 dchar back; 113 DataIndex bi; 114 //at start & end of input 115 if (atStart && wordMatcher[front]) 116 { 117 t.pc += IRL!(IR.Wordboundary); 118 return true; 119 } 120 else if (atEnd && s.loopBack(index).nextChar(back, bi) 121 && wordMatcher[back]) 122 { 123 t.pc += IRL!(IR.Wordboundary); 124 return true; 125 } 126 else if (s.loopBack(index).nextChar(back, bi)) 127 { 128 bool af = wordMatcher[front]; 129 bool ab = wordMatcher[back]; 130 if (af ^ ab) 131 { 132 t.pc += IRL!(IR.Wordboundary); 133 return true; 134 } 135 } 136 return popState(e); 137 } 138 } 139 140 static bool op(IR code:IR.Notwordboundary)(E e, S* state) 141 { 142 with(e) with(state) 143 { 144 dchar back; 145 DataIndex bi; 146 //at start & end of input 147 if (atStart && wordMatcher[front]) 148 { 149 return popState(e); 150 } 151 else if (atEnd && s.loopBack(index).nextChar(back, bi) 152 && wordMatcher[back]) 153 { 154 return popState(e); 155 } 156 else if (s.loopBack(index).nextChar(back, bi)) 157 { 158 bool af = wordMatcher[front]; 159 bool ab = wordMatcher[back] != 0; 160 if (af ^ ab) 161 { 162 return popState(e); 163 } 164 } 165 t.pc += IRL!(IR.Notwordboundary); 166 } 167 return true; 168 } 169 170 static bool op(IR code:IR.Bof)(E e, S* state) 171 { 172 with(e) with(state) 173 { 174 if (atStart) 175 { 176 t.pc += IRL!(IR.Bof); 177 return true; 178 } 179 else 180 { 181 return popState(e); 182 } 183 } 184 } 185 186 static bool op(IR code:IR.Bol)(E e, S* state) 187 { 188 with(e) with(state) 189 { 190 dchar back; 191 DataIndex bi; 192 if (atStart 193 ||(s.loopBack(index).nextChar(back,bi) 194 && startOfLine(back, front == '\n'))) 195 { 196 t.pc += IRL!(IR.Bol); 197 return true; 198 } 199 else 200 { 201 return popState(e); 202 } 203 } 204 } 205 206 static bool op(IR code:IR.Eof)(E e, S* state) 207 { 208 with(e) with(state) 209 { 210 if (atEnd) 211 { 212 t.pc += IRL!(IR.Eol); 213 return true; 214 } 215 else 216 { 217 return popState(e); 218 } 219 } 220 } 221 222 static bool op(IR code:IR.Eol)(E e, S* state) 223 { 224 with(e) with(state) 225 { 226 dchar back; 227 DataIndex bi; 228 //no matching inside \r\n 229 if (atEnd || (endOfLine(front, s.loopBack(index).nextChar(back, bi) 230 && back == '\r'))) 231 { 232 t.pc += IRL!(IR.Eol); 233 return true; 234 } 235 else 236 { 237 return popState(e); 238 } 239 240 } 241 } 242 243 static bool op(IR code:IR.InfiniteStart)(E e, S* state) 244 { 245 with(e) with(state) 246 t.pc += re.ir[t.pc].data + IRL!(IR.InfiniteStart); 247 return op!(IR.InfiniteEnd)(e,state); 248 } 249 250 static bool op(IR code:IR.InfiniteBloomStart)(E e, S* state) 251 { 252 with(e) with(state) 253 t.pc += re.ir[t.pc].data + IRL!(IR.InfiniteBloomStart); 254 return op!(IR.InfiniteBloomEnd)(e,state); 255 } 256 257 static bool op(IR code:IR.InfiniteQStart)(E e, S* state) 258 { 259 with(e) with(state) 260 t.pc += re.ir[t.pc].data + IRL!(IR.InfiniteQStart); 261 return op!(IR.InfiniteQEnd)(e,state); 262 } 263 264 static bool op(IR code:IR.RepeatStart)(E e, S* state) 265 { 266 with(e) with(state) 267 t.pc += re.ir[t.pc].data + IRL!(IR.RepeatStart); 268 return op!(IR.RepeatEnd)(e,state); 269 } 270 271 static bool op(IR code:IR.RepeatQStart)(E e, S* state) 272 { 273 with(e) with(state) 274 t.pc += re.ir[t.pc].data + IRL!(IR.RepeatQStart); 275 return op!(IR.RepeatQEnd)(e,state); 276 } 277 278 static bool op(IR code)(E e, S* state) 279 if (code == IR.RepeatEnd || code == IR.RepeatQEnd) 280 { 281 with(e) with(state) 282 { 283 //len, step, min, max 284 uint len = re.ir[t.pc].data; 285 uint step = re.ir[t.pc+2].raw; 286 uint min = re.ir[t.pc+3].raw; 287 if (t.counter < min) 288 { 289 t.counter += step; 290 t.pc -= len; 291 return true; 292 } 293 if (merge[re.ir[t.pc + 1].raw+t.counter] < genCounter) 294 { 295 debug(std_regex_matcher) writefln("A thread(pc=%s) passed there : %s ; GenCounter=%s mergetab=%s", 296 t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] ); 297 merge[re.ir[t.pc + 1].raw+t.counter] = genCounter; 298 } 299 else 300 { 301 debug(std_regex_matcher) 302 writefln("A thread(pc=%s) got merged there : %s ; GenCounter=%s mergetab=%s", 303 t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] ); 304 return popState(e); 305 } 306 uint max = re.ir[t.pc+4].raw; 307 if (t.counter < max) 308 { 309 if (re.ir[t.pc].code == IR.RepeatEnd) 310 { 311 //queue out-of-loop thread 312 worklist.insertFront(fork(t, t.pc + IRL!(IR.RepeatEnd), t.counter % step)); 313 t.counter += step; 314 t.pc -= len; 315 } 316 else 317 { 318 //queue into-loop thread 319 worklist.insertFront(fork(t, t.pc - len, t.counter + step)); 320 t.counter %= step; 321 t.pc += IRL!(IR.RepeatEnd); 322 } 323 } 324 else 325 { 326 t.counter %= step; 327 t.pc += IRL!(IR.RepeatEnd); 328 } 329 return true; 330 } 331 } 332 333 static bool op(IR code)(E e, S* state) 334 if (code == IR.InfiniteEnd || code == IR.InfiniteQEnd) 335 { 336 with(e) with(state) 337 { 338 if (merge[re.ir[t.pc + 1].raw+t.counter] < genCounter) 339 { 340 debug(std_regex_matcher) writefln("A thread(pc=%s) passed there : %s ; GenCounter=%s mergetab=%s", 341 t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] ); 342 merge[re.ir[t.pc + 1].raw+t.counter] = genCounter; 343 } 344 else 345 { 346 debug(std_regex_matcher) writefln("A thread(pc=%s) got merged there : %s ; GenCounter=%s mergetab=%s", 347 t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] ); 348 return popState(e); 349 } 350 uint len = re.ir[t.pc].data; 351 uint pc1, pc2; //branches to take in priority order 352 if (re.ir[t.pc].code == IR.InfiniteEnd) 353 { 354 pc1 = t.pc - len; 355 pc2 = t.pc + IRL!(IR.InfiniteEnd); 356 } 357 else 358 { 359 pc1 = t.pc + IRL!(IR.InfiniteEnd); 360 pc2 = t.pc - len; 361 } 362 worklist.insertFront(fork(t, pc2, t.counter)); 363 t.pc = pc1; 364 return true; 365 } 366 } 367 368 static bool op(IR code)(E e, S* state) 369 if (code == IR.InfiniteBloomEnd) 370 { 371 with(e) with(state) 372 { 373 if (merge[re.ir[t.pc + 1].raw+t.counter] < genCounter) 374 { 375 debug(std_regex_matcher) writefln("A thread(pc=%s) passed there : %s ; GenCounter=%s mergetab=%s", 376 t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] ); 377 merge[re.ir[t.pc + 1].raw+t.counter] = genCounter; 378 } 379 else 380 { 381 debug(std_regex_matcher) writefln("A thread(pc=%s) got merged there : %s ; GenCounter=%s mergetab=%s", 382 t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] ); 383 return popState(e); 384 } 385 uint len = re.ir[t.pc].data; 386 uint pc1, pc2; //branches to take in priority order 387 pc1 = t.pc - len; 388 pc2 = t.pc + IRL!(IR.InfiniteBloomEnd); 389 uint filterIndex = re.ir[t.pc + 2].raw; 390 if (re.filters[filterIndex][front]) 391 worklist.insertFront(fork(t, pc2, t.counter)); 392 t.pc = pc1; 393 return true; 394 } 395 } 396 397 static bool op(IR code:IR.OrEnd)(E e, S* state) 398 { 399 with(e) with(state) 400 { 401 if (merge[re.ir[t.pc + 1].raw+t.counter] < genCounter) 402 { 403 debug(std_regex_matcher) writefln("A thread(pc=%s) passed there : %s ; GenCounter=%s mergetab=%s", 404 t.pc, s[index .. s.lastIndex], genCounter, merge[re.ir[t.pc + 1].raw + t.counter] ); 405 merge[re.ir[t.pc + 1].raw+t.counter] = genCounter; 406 t.pc += IRL!(IR.OrEnd); 407 } 408 else 409 { 410 debug(std_regex_matcher) writefln("A thread(pc=%s) got merged there : %s ; GenCounter=%s mergetab=%s", 411 t.pc, s[index .. s.lastIndex], genCounter, merge[re.ir[t.pc + 1].raw + t.counter] ); 412 return popState(e); 413 } 414 return true; 415 } 416 } 417 418 static bool op(IR code:IR.OrStart)(E e, S* state) 419 { 420 with(e) with(state) 421 { 422 t.pc += IRL!(IR.OrStart); 423 return op!(IR.Option)(e,state); 424 } 425 } 426 427 static bool op(IR code:IR.Option)(E e, S* state) 428 { 429 with(e) with(state) 430 { 431 uint next = t.pc + re.ir[t.pc].data + IRL!(IR.Option); 432 //queue next Option 433 if (re.ir[next].code == IR.Option) 434 { 435 worklist.insertFront(fork(t, next, t.counter)); 436 } 437 t.pc += IRL!(IR.Option); 438 return true; 439 } 440 } 441 442 static bool op(IR code:IR.GotoEndOr)(E e, S* state) 443 { 444 with(e) with(state) 445 { 446 t.pc = t.pc + re.ir[t.pc].data + IRL!(IR.GotoEndOr); 447 return op!(IR.OrEnd)(e, state); 448 } 449 } 450 451 static bool op(IR code:IR.GroupStart)(E e, S* state) 452 { 453 with(e) with(state) 454 { 455 uint n = re.ir[t.pc].data; 456 t.matches.ptr[n].begin = index; 457 t.pc += IRL!(IR.GroupStart); 458 return true; 459 } 460 } 461 static bool op(IR code:IR.GroupEnd)(E e, S* state) 462 { 463 with(e) with(state) 464 { 465 uint n = re.ir[t.pc].data; 466 t.matches.ptr[n].end = index; 467 t.pc += IRL!(IR.GroupEnd); 468 return true; 469 } 470 } 471 472 static bool op(IR code:IR.Backref)(E e, S* state) 473 { 474 with(e) with(state) 475 { 476 uint n = re.ir[t.pc].data; 477 Group!DataIndex* source = re.ir[t.pc].localRef ? t.matches.ptr : backrefed.ptr; 478 assert(source); 479 if (source[n].begin == source[n].end)//zero-width Backref! 480 { 481 t.pc += IRL!(IR.Backref); 482 return true; 483 } 484 else 485 { 486 size_t idx = source[n].begin + t.uopCounter; 487 size_t end = source[n].end; 488 if (s[idx .. end].front == front) 489 { 490 import std.utf : stride; 491 492 t.uopCounter += stride(s[idx .. end], 0); 493 if (t.uopCounter + source[n].begin == source[n].end) 494 {//last codepoint 495 t.pc += IRL!(IR.Backref); 496 t.uopCounter = 0; 497 } 498 nlist.insertBack(t); 499 } 500 else 501 recycle(t); 502 t = worklist.fetch(); 503 return t != null; 504 } 505 } 506 } 507 508 509 static bool op(IR code)(E e, S* state) 510 if (code == IR.LookbehindStart || code == IR.NeglookbehindStart) 511 { 512 with(e) with(state) 513 { 514 uint len = re.ir[t.pc].data; 515 uint ms = re.ir[t.pc + 1].raw, me = re.ir[t.pc + 2].raw; 516 uint end = t.pc + len + IRL!(IR.LookbehindEnd) + IRL!(IR.LookbehindStart); 517 bool positive = re.ir[t.pc].code == IR.LookbehindStart; 518 static if (Stream.isLoopback) 519 auto matcher = fwdMatcher(t.pc, end, me - ms, subCounters.get(t.pc, 0)); 520 else 521 auto matcher = bwdMatcher(t.pc, end, me - ms, subCounters.get(t.pc, 0)); 522 matcher.backrefed = backrefed.empty ? t.matches : backrefed; 523 //backMatch 524 auto mRes = matcher.matchOneShot(t.matches.ptr[ms .. me], IRL!(IR.LookbehindStart)); 525 freelist = matcher.freelist; 526 subCounters[t.pc] = matcher.genCounter; 527 if ((mRes != 0 ) ^ positive) 528 { 529 return popState(e); 530 } 531 t.pc = end; 532 return true; 533 } 534 } 535 536 static bool op(IR code)(E e, S* state) 537 if (code == IR.LookaheadStart || code == IR.NeglookaheadStart) 538 { 539 with(e) with(state) 540 { 541 auto save = index; 542 uint len = re.ir[t.pc].data; 543 uint ms = re.ir[t.pc+1].raw, me = re.ir[t.pc+2].raw; 544 uint end = t.pc+len+IRL!(IR.LookaheadEnd)+IRL!(IR.LookaheadStart); 545 bool positive = re.ir[t.pc].code == IR.LookaheadStart; 546 static if (Stream.isLoopback) 547 auto matcher = bwdMatcher(t.pc, end, me - ms, subCounters.get(t.pc, 0)); 548 else 549 auto matcher = fwdMatcher(t.pc, end, me - ms, subCounters.get(t.pc, 0)); 550 matcher.backrefed = backrefed.empty ? t.matches : backrefed; 551 auto mRes = matcher.matchOneShot(t.matches.ptr[ms .. me], IRL!(IR.LookaheadStart)); 552 freelist = matcher.freelist; 553 subCounters[t.pc] = matcher.genCounter; 554 s.reset(index); 555 next(); 556 if ((mRes != 0) ^ positive) 557 { 558 return popState(e); 559 } 560 t.pc = end; 561 return true; 562 } 563 } 564 565 static bool op(IR code)(E e, S* state) 566 if (code == IR.LookaheadEnd || code == IR.NeglookaheadEnd || 567 code == IR.LookbehindEnd || code == IR.NeglookbehindEnd) 568 { 569 with(e) with(state) 570 { 571 finish(t, matches.ptr[0 .. re.ngroup], re.ir[t.pc].data); 572 recycle(t); 573 //cut off low priority threads 574 recycle(clist); 575 recycle(worklist); 576 return false; // no more state 577 } 578 } 579 580 static bool op(IR code:IR.Nop)(E e, S* state) 581 { 582 with(state) t.pc += IRL!(IR.Nop); 583 return true; 584 } 585 586 static bool op(IR code:IR.OrChar)(E e, S* state) 587 { 588 with(e) with(state) 589 { 590 uint len = re.ir[t.pc].sequence; 591 uint end = t.pc + len; 592 static assert(IRL!(IR.OrChar) == 1); 593 for (; t.pc < end; t.pc++) 594 if (re.ir[t.pc].data == front) 595 break; 596 if (t.pc != end) 597 { 598 t.pc = end; 599 nlist.insertBack(t); 600 } 601 else 602 recycle(t); 603 t = worklist.fetch(); 604 return t != null; 605 } 606 } 607 608 static bool op(IR code:IR.Char)(E e, S* state) 609 { 610 with(e) with(state) 611 { 612 if (front == re.ir[t.pc].data) 613 { 614 t.pc += IRL!(IR.Char); 615 nlist.insertBack(t); 616 } 617 else 618 recycle(t); 619 t = worklist.fetch(); 620 return t != null; 621 } 622 } 623 624 static bool op(IR code:IR.Any)(E e, S* state) 625 { 626 with(e) with(state) 627 { 628 t.pc += IRL!(IR.Any); 629 nlist.insertBack(t); 630 t = worklist.fetch(); 631 return t != null; 632 } 633 } 634 635 static bool op(IR code:IR.CodepointSet)(E e, S* state) 636 { 637 with(e) with(state) 638 { 639 if (re.charsets[re.ir[t.pc].data].scanFor(front)) 640 { 641 t.pc += IRL!(IR.CodepointSet); 642 nlist.insertBack(t); 643 } 644 else 645 { 646 recycle(t); 647 } 648 t = worklist.fetch(); 649 return t != null; 650 } 651 } 652 653 static bool op(IR code:IR.Trie)(E e, S* state) 654 { 655 with(e) with(state) 656 { 657 if (re.matchers[re.ir[t.pc].data][front]) 658 { 659 t.pc += IRL!(IR.Trie); 660 nlist.insertBack(t); 661 } 662 else 663 { 664 recycle(t); 665 } 666 t = worklist.fetch(); 667 return t != null; 668 } 669 } 670 671 } 672 673 template ThompsonOps(E,S, bool withInput:false) 674 { 675 @trusted: 676 // can't match these without input 677 static bool op(IR code)(E e, S* state) 678 if (code == IR.Char || code == IR.OrChar || code == IR.CodepointSet 679 || code == IR.Trie || code == IR.Char || code == IR.Any) 680 { 681 return state.popState(e); 682 } 683 684 // special case of zero-width backref 685 static bool op(IR code:IR.Backref)(E e, S* state) 686 { 687 with(e) with(state) 688 { 689 uint n = re.ir[t.pc].data; 690 Group!DataIndex* source = re.ir[t.pc].localRef ? t.matches.ptr : backrefed.ptr; 691 assert(source); 692 if (source[n].begin == source[n].end)//zero-width Backref! 693 { 694 t.pc += IRL!(IR.Backref); 695 return true; 696 } 697 else 698 return popState(e); 699 } 700 } 701 702 // forward all control flow to normal versions 703 static bool op(IR code)(E e, S* state) 704 if (code != IR.Char && code != IR.OrChar && code != IR.CodepointSet 705 && code != IR.Trie && code != IR.Char && code != IR.Any && code != IR.Backref) 706 { 707 return ThompsonOps!(E,S,true).op!code(e,state); 708 } 709 } 710 711 /+ 712 Thomspon matcher does all matching in lockstep, 713 never looking at the same char twice 714 +/ 715 @trusted class ThompsonMatcher(Char, StreamType = Input!Char): Matcher!Char 716 if (is(Char : dchar)) 717 { 718 alias DataIndex = Stream.DataIndex; 719 alias Stream = StreamType; 720 alias OpFunc = bool function(ThompsonMatcher, State*) pure; 721 alias BackMatcher = ThompsonMatcher!(Char, BackLooper!(Stream)); 722 alias OpBackFunc = bool function(BackMatcher, BackMatcher.State*) pure; 723 Thread!DataIndex* freelist; 724 ThreadList!DataIndex clist, nlist; 725 DataIndex[] merge; 726 Group!DataIndex[] backrefed; 727 const Regex!Char re; //regex program 728 Stream s; 729 dchar front; 730 DataIndex index; 731 DataIndex genCounter; //merge trace counter, goes up on every dchar 732 size_t[size_t] subCounters; //a table of gen counter per sub-engine: PC -> counter 733 OpFunc[] opCacheTrue; // pointers to Op!(IR.xyz) for each bytecode 734 OpFunc[] opCacheFalse; // ditto 735 OpBackFunc[] opCacheBackTrue; // ditto 736 OpBackFunc[] opCacheBackFalse; // ditto 737 size_t threadSize; 738 size_t _refCount; 739 int matched; 740 bool exhausted; 741 742 final: 743 pure 744 static struct State 745 { 746 Thread!DataIndex* t; 747 ThreadList!DataIndex worklist; 748 Group!DataIndex[] matches; 749 750 bool popState(E)(E e) 751 { 752 with(e) 753 { 754 recycle(t); 755 t = worklist.fetch(); 756 return t != null; 757 } 758 } 759 760 } 761 762 static if (__traits(hasMember,Stream, "search")) 763 { 764 enum kicked = true; 765 } 766 else 767 enum kicked = false; 768 769 static size_t getThreadSize(const ref Regex!Char re) 770 { 771 return re.ngroup 772 ? (Thread!DataIndex).sizeof + (re.ngroup-1)*(Group!DataIndex).sizeof 773 : (Thread!DataIndex).sizeof - (Group!DataIndex).sizeof; 774 } 775 776 static size_t initialMemory(const ref Regex!Char re) 777 { 778 return getThreadSize(re)*re.threadCount + re.hotspotTableSize*size_t.sizeof 779 +4*OpFunc.sizeof*re.ir.length; 780 } 781 782 //true if it's start of input 783 @property bool atStart(){ return index == 0; } 784 785 //true if it's end of input 786 @property bool atEnd(){ return index == s.lastIndex && s.atEnd; } 787 788 override @property ref size_t refCount() @safe { return _refCount; } 789 790 override @property ref const(Regex!Char) pattern() @safe { return re; } 791 792 bool next() 793 { 794 if (!s.nextChar(front, index)) 795 { 796 index = s.lastIndex; 797 return false; 798 } 799 return true; 800 } 801 802 static if (kicked) 803 { 804 bool search() 805 { 806 807 if (!s.search(re.kickstart, front, index)) 808 { 809 index = s.lastIndex; 810 return false; 811 } 812 return true; 813 } 814 } 815 816 void initExternalMemory(void[] memory) 817 { 818 threadSize = getThreadSize(re); 819 prepareFreeList(re.threadCount, memory); 820 if (re.hotspotTableSize) 821 { 822 merge = arrayInChunk!(DataIndex)(re.hotspotTableSize, memory); 823 merge[] = 0; 824 } 825 opCacheTrue = arrayInChunk!(OpFunc)(re.ir.length, memory); 826 opCacheFalse = arrayInChunk!(OpFunc)(re.ir.length, memory); 827 opCacheBackTrue = arrayInChunk!(OpBackFunc)(re.ir.length, memory); 828 opCacheBackFalse = arrayInChunk!(OpBackFunc)(re.ir.length, memory); 829 830 for (uint pc = 0; pc<re.ir.length; pc += re.ir[pc].length) 831 { 832 L_dispatch: 833 switch (re.ir[pc].code) 834 { 835 foreach (e; __traits(allMembers, IR)) 836 { 837 mixin(`case IR.`~e~`: 838 opCacheTrue[pc] = &Ops!(true).op!(IR.`~e~`); 839 opCacheBackTrue[pc] = &BackOps!(true).op!(IR.`~e~`); 840 opCacheFalse[pc] = &Ops!(false).op!(IR.`~e~`); 841 opCacheBackFalse[pc] = &BackOps!(false).op!(IR.`~e~`); 842 break L_dispatch; 843 `); 844 } 845 default: 846 assert(0, "Unrecognized instruction "~re.ir[pc].mnemonic); 847 } 848 } 849 } 850 851 override Matcher!Char rearm(in Char[] data) 852 { 853 exhausted = false; 854 matched = 0; 855 s = Stream(data); 856 return this; 857 } 858 859 this()(ref const Regex!Char program, Stream stream, void[] memory) 860 { 861 // We are emplace'd to malloced memory w/o blitting T.init over it\ 862 // make sure we initialize all fields explicitly 863 _refCount = 1; 864 subCounters = null; 865 backrefed = null; 866 exhausted = false; 867 matched = 0; 868 re = program; 869 s = stream; 870 initExternalMemory(memory); 871 genCounter = 0; 872 } 873 874 this(ThompsonMatcher matcher, size_t lo, size_t hi, uint nGroup, Stream stream) 875 { 876 _refCount = 1; 877 subCounters = matcher.subCounters; 878 s = stream; 879 auto code = matcher.re.ir[lo .. hi]; 880 re = matcher.re.withCode(code).withNGroup(nGroup); 881 threadSize = matcher.threadSize; 882 merge = matcher.merge; 883 freelist = matcher.freelist; 884 opCacheTrue = matcher.opCacheTrue[lo .. hi]; 885 opCacheBackTrue = matcher.opCacheBackTrue[lo .. hi]; 886 opCacheFalse = matcher.opCacheFalse[lo .. hi]; 887 opCacheBackFalse = matcher.opCacheBackFalse[lo .. hi]; 888 front = matcher.front; 889 index = matcher.index; 890 } 891 892 this(BackMatcher matcher, size_t lo, size_t hi, uint nGroup, Stream stream) 893 { 894 _refCount = 1; 895 subCounters = matcher.subCounters; 896 s = stream; 897 auto code = matcher.re.ir[lo .. hi]; 898 re = matcher.re.withCode(code).withNGroup(nGroup); 899 threadSize = matcher.threadSize; 900 merge = matcher.merge; 901 freelist = matcher.freelist; 902 opCacheTrue = matcher.opCacheBackTrue[lo .. hi]; 903 opCacheBackTrue = matcher.opCacheTrue[lo .. hi]; 904 opCacheFalse = matcher.opCacheBackFalse[lo .. hi]; 905 opCacheBackFalse = matcher.opCacheFalse[lo .. hi]; 906 front = matcher.front; 907 index = matcher.index; 908 } 909 910 auto fwdMatcher()(size_t lo, size_t hi, uint nGroup, size_t counter) 911 { 912 auto m = new ThompsonMatcher!(Char, Stream)(this, lo, hi, nGroup, s); 913 m.genCounter = counter; 914 return m; 915 } 916 917 auto bwdMatcher()(size_t lo, size_t hi, uint nGroup, size_t counter) 918 { 919 alias BackLooper = typeof(s.loopBack(index)); 920 auto m = new ThompsonMatcher!(Char, BackLooper)(this, lo, hi, nGroup, s.loopBack(index)); 921 m.genCounter = counter; 922 m.next(); 923 return m; 924 } 925 926 override void dupTo(Matcher!Char engine, void[] memory) 927 { 928 auto thompson = cast(ThompsonMatcher) engine; 929 thompson.s = s; 930 thompson.subCounters = null; 931 thompson.front = front; 932 thompson.index = index; 933 thompson.matched = matched; 934 thompson.exhausted = exhausted; 935 thompson.initExternalMemory(memory); 936 } 937 938 override int match(Group!DataIndex[] matches) 939 { 940 debug(std_regex_matcher) 941 writeln("------------------------------------------"); 942 if (exhausted) 943 { 944 return false; 945 } 946 if (re.flags & RegexInfo.oneShot) 947 { 948 next(); 949 exhausted = true; 950 return matchOneShot(matches); 951 } 952 static if (kicked) 953 if (!re.kickstart.empty) 954 return matchImpl!(true)(matches); 955 return matchImpl!(false)(matches); 956 } 957 958 //match the input and fill matches 959 int matchImpl(bool withSearch)(Group!DataIndex[] matches) 960 { 961 if (!matched && clist.empty) 962 { 963 static if (withSearch) 964 search(); 965 else 966 next(); 967 } 968 else//char in question is fetched in prev call to match 969 { 970 matched = 0; 971 } 972 State state; 973 state.matches = matches; 974 975 if (!atEnd)//if no char 976 for (;;) 977 { 978 genCounter++; 979 debug(std_regex_matcher) 980 { 981 writefln("Threaded matching threads at %s", s[index .. s.lastIndex]); 982 foreach (t; clist[]) 983 { 984 assert(t); 985 writef("pc=%s ",t.pc); 986 write(t.matches); 987 writeln(); 988 } 989 } 990 for (state.t = clist.fetch(); state.t; state.t = clist.fetch()) 991 { 992 eval!true(&state); 993 } 994 //if we already have match no need to push the engine 995 if (!matched) 996 { 997 state.t = createStart(index); 998 eval!true(&state);//new thread staring at this position 999 } 1000 else if (nlist.empty) 1001 { 1002 debug(std_regex_matcher) writeln("Stopped matching before consuming full input"); 1003 break;//not a partial match for sure 1004 } 1005 clist = nlist; 1006 nlist = (ThreadList!DataIndex).init; 1007 if (clist.tip is null) 1008 { 1009 static if (withSearch) 1010 { 1011 if (!search()) 1012 break; 1013 } 1014 else 1015 { 1016 if (!next()) 1017 break; 1018 } 1019 } 1020 else if (!next()) 1021 { 1022 if (!atEnd) return false; 1023 exhausted = true; 1024 break; 1025 } 1026 } 1027 1028 genCounter++; //increment also on each end 1029 debug(std_regex_matcher) writefln("Threaded matching threads at end"); 1030 //try out all zero-width posibilities 1031 for (state.t = clist.fetch(); state.t; state.t = clist.fetch()) 1032 { 1033 eval!false(&state); 1034 } 1035 if (!matched) 1036 { 1037 state.t = createStart(index); 1038 eval!false(&state);//new thread starting at end of input 1039 } 1040 if (matched) 1041 {//in case NFA found match along the way 1042 //and last possible longer alternative ultimately failed 1043 s.reset(matches[0].end);//reset to last successful match 1044 next();//and reload front character 1045 //--- here the exact state of stream was restored --- 1046 exhausted = atEnd || !(re.flags & RegexOption.global); 1047 //+ empty match advances the input 1048 if (!exhausted && matches[0].begin == matches[0].end) 1049 next(); 1050 } 1051 return matched; 1052 } 1053 1054 /+ 1055 handle succesful threads 1056 +/ 1057 void finish(const(Thread!DataIndex)* t, Group!DataIndex[] matches, int code) 1058 { 1059 matches.ptr[0 .. re.ngroup] = t.matches.ptr[0 .. re.ngroup]; 1060 debug(std_regex_matcher) 1061 { 1062 writef("FOUND pc=%s prog_len=%s", 1063 t.pc, re.ir.length); 1064 if (!matches.empty) 1065 writefln(": %s..%s", matches[0].begin, matches[0].end); 1066 foreach (v; matches) 1067 writefln("%d .. %d", v.begin, v.end); 1068 } 1069 matched = code; 1070 } 1071 1072 alias Ops(bool withInput) = ThompsonOps!(ThompsonMatcher, State, withInput); 1073 alias BackOps(bool withInput) = ThompsonOps!(BackMatcher, BackMatcher.State, withInput); 1074 1075 /+ 1076 match thread against codepoint, cutting trough all 0-width instructions 1077 and taking care of control flow, then add it to nlist 1078 +/ 1079 void eval(bool withInput)(State* state) 1080 { 1081 debug(std_regex_matcher) writeln("---- Evaluating thread"); 1082 static if (withInput) 1083 while (opCacheTrue.ptr[state.t.pc](this, state)){} 1084 else 1085 while (opCacheFalse.ptr[state.t.pc](this, state)){} 1086 } 1087 enum uint RestartPc = uint.max; 1088 //match the input, evaluating IR without searching 1089 int matchOneShot(Group!DataIndex[] matches, uint startPc = 0) 1090 { 1091 debug(std_regex_matcher) 1092 { 1093 writefln("---------------single shot match ----------------- "); 1094 } 1095 alias evalFn = eval; 1096 assert(clist == (ThreadList!DataIndex).init || startPc == RestartPc); // incorrect after a partial match 1097 assert(nlist == (ThreadList!DataIndex).init || startPc == RestartPc); 1098 State state; 1099 state.matches = matches; 1100 if (!atEnd)//if no char 1101 { 1102 debug(std_regex_matcher) 1103 { 1104 writefln("-- Threaded matching threads at %s", s[index .. s.lastIndex]); 1105 } 1106 if (startPc != RestartPc) 1107 { 1108 state.t = createStart(index, startPc); 1109 genCounter++; 1110 evalFn!true(&state); 1111 } 1112 for (;;) 1113 { 1114 debug(std_regex_matcher) writeln("\n-- Started iteration of main cycle"); 1115 genCounter++; 1116 debug(std_regex_matcher) 1117 { 1118 foreach (t; clist[]) 1119 { 1120 assert(t); 1121 } 1122 } 1123 for (state.t = clist.fetch(); state.t; state.t = clist.fetch()) 1124 { 1125 evalFn!true(&state); 1126 } 1127 if (nlist.empty) 1128 { 1129 debug(std_regex_matcher) writeln("Stopped matching before consuming full input"); 1130 break;//not a partial match for sure 1131 } 1132 clist = nlist; 1133 nlist = (ThreadList!DataIndex).init; 1134 if (!next()) 1135 break; 1136 debug(std_regex_matcher) writeln("-- Ended iteration of main cycle\n"); 1137 } 1138 } 1139 genCounter++; //increment also on each end 1140 debug(std_regex_matcher) writefln("-- Matching threads at end"); 1141 //try out all zero-width posibilities 1142 for (state.t = clist.fetch(); state.t; state.t = clist.fetch()) 1143 { 1144 evalFn!false(&state); 1145 } 1146 if (!matched) 1147 { 1148 state.t = createStart(index, startPc); 1149 evalFn!false(&state); 1150 } 1151 return matched; 1152 } 1153 1154 //get a dirty recycled Thread 1155 Thread!DataIndex* allocate() 1156 { 1157 assert(freelist, "not enough preallocated memory"); 1158 Thread!DataIndex* t = freelist; 1159 freelist = freelist.next; 1160 return t; 1161 } 1162 1163 //link memory into a free list of Threads 1164 void prepareFreeList(size_t size, ref void[] memory) 1165 { 1166 void[] mem = memory[0 .. threadSize*size]; 1167 memory = memory[threadSize * size .. $]; 1168 freelist = cast(Thread!DataIndex*)&mem[0]; 1169 size_t i; 1170 for (i = threadSize; i < threadSize*size; i += threadSize) 1171 (cast(Thread!DataIndex*)&mem[i-threadSize]).next = cast(Thread!DataIndex*)&mem[i]; 1172 (cast(Thread!DataIndex*)&mem[i-threadSize]).next = null; 1173 } 1174 1175 //dispose a thread 1176 void recycle(Thread!DataIndex* t) 1177 { 1178 t.next = freelist; 1179 freelist = t; 1180 } 1181 1182 //dispose list of threads 1183 void recycle(ref ThreadList!DataIndex list) 1184 { 1185 if (list.tip) 1186 { 1187 // just put this head-tail list in front of freelist 1188 list.toe.next = freelist; 1189 freelist = list.tip; 1190 list = list.init; 1191 } 1192 } 1193 1194 //creates a copy of master thread with given pc 1195 Thread!DataIndex* fork(Thread!DataIndex* master, uint pc, uint counter) 1196 { 1197 auto t = allocate(); 1198 t.matches.ptr[0 .. re.ngroup] = master.matches.ptr[0 .. re.ngroup]; 1199 t.pc = pc; 1200 t.counter = counter; 1201 t.uopCounter = 0; 1202 return t; 1203 } 1204 1205 //creates a start thread 1206 Thread!DataIndex* createStart(DataIndex index, uint pc = 0) 1207 { 1208 auto t = allocate(); 1209 t.matches.ptr[0 .. re.ngroup] = (Group!DataIndex).init; 1210 t.matches[0].begin = index; 1211 t.pc = pc; 1212 t.counter = 0; 1213 t.uopCounter = 0; 1214 return t; 1215 } 1216 } 1217