1 //Written in the D programming language 2 /* 3 Implementation of Thompson NFA std.regex engine. 4 Key point is evaluation of all possible threads (state) at each step 5 in a breadth-first manner, thereby geting some nice properties: 6 - looking at each character only once 7 - merging of equivalent threads, that gives matching process linear time complexity 8 */ 9 module std.regex.internal.thompson; 10 11 package(std.regex): 12 13 import std.range.primitives; 14 import std.regex.internal.ir; 15 16 //State of VM thread 17 struct Thread(DataIndex) 18 { 19 Thread* next; //intrusive linked list 20 uint pc; 21 uint counter; //loop counter 22 uint uopCounter; //counts micro operations inside one macro instruction (e.g. BackRef) 23 Group!DataIndex[1] matches; 24 } 25 26 //head-tail singly-linked list 27 struct ThreadList(DataIndex) 28 { 29 Thread!DataIndex* tip = null, toe = null; 30 //add new thread to the start of list 31 void insertFront(Thread!DataIndex* t) 32 { 33 if (tip) 34 { 35 t.next = tip; 36 tip = t; 37 } 38 else 39 { 40 t.next = null; 41 tip = toe = t; 42 } 43 } 44 //add new thread to the end of list 45 void insertBack(Thread!DataIndex* t) 46 { 47 if (toe) 48 { 49 toe.next = t; 50 toe = t; 51 } 52 else 53 tip = toe = t; 54 toe.next = null; 55 } 56 //move head element out of list 57 Thread!DataIndex* fetch() 58 { 59 auto t = tip; 60 if (tip == toe) 61 tip = toe = null; 62 else 63 tip = tip.next; 64 return t; 65 } 66 //non-destructive iteration of ThreadList 67 struct ThreadRange 68 { 69 const(Thread!DataIndex)* ct; 70 this(ThreadList tlist){ ct = tlist.tip; } 71 @property bool empty(){ return ct is null; } 72 @property const(Thread!DataIndex)* front(){ return ct; } 73 @property popFront() 74 { 75 assert(ct); 76 ct = ct.next; 77 } 78 } 79 @property bool empty() 80 { 81 return tip == null; 82 } 83 ThreadRange opSlice() 84 { 85 return ThreadRange(this); 86 } 87 } 88 89 template ThompsonOps(E, S, bool withInput:true) 90 { 91 @trusted: 92 static bool op(IR code:IR.End)(E* e, S* state) 93 { 94 with(e) with(state) 95 { 96 finish(t, matches, re.ir[t.pc].data); 97 //fix endpoint of the whole match 98 matches[0].end = index; 99 recycle(t); 100 //cut off low priority threads 101 recycle(clist); 102 recycle(worklist); 103 debug(std_regex_matcher) writeln("Finished thread ", matches); 104 return false; // no more state to eval 105 } 106 } 107 108 static bool op(IR code:IR.Wordboundary)(E* e, S* state) 109 { 110 with(e) with(state) 111 { 112 dchar back; 113 DataIndex bi; 114 //at start & end of input 115 if (atStart && wordMatcher[front]) 116 { 117 t.pc += IRL!(IR.Wordboundary); 118 return true; 119 } 120 else if (atEnd && s.loopBack(index).nextChar(back, bi) 121 && wordMatcher[back]) 122 { 123 t.pc += IRL!(IR.Wordboundary); 124 return true; 125 } 126 else if (s.loopBack(index).nextChar(back, bi)) 127 { 128 bool af = wordMatcher[front]; 129 bool ab = wordMatcher[back]; 130 if (af ^ ab) 131 { 132 t.pc += IRL!(IR.Wordboundary); 133 return true; 134 } 135 } 136 return popState(e); 137 } 138 } 139 140 static bool op(IR code:IR.Notwordboundary)(E* e, S* state) 141 { 142 with(e) with(state) 143 { 144 dchar back; 145 DataIndex bi; 146 //at start & end of input 147 if (atStart && wordMatcher[front]) 148 { 149 return popState(e); 150 } 151 else if (atEnd && s.loopBack(index).nextChar(back, bi) 152 && wordMatcher[back]) 153 { 154 return popState(e); 155 } 156 else if (s.loopBack(index).nextChar(back, bi)) 157 { 158 bool af = wordMatcher[front]; 159 bool ab = wordMatcher[back] != 0; 160 if (af ^ ab) 161 { 162 return popState(e); 163 } 164 } 165 t.pc += IRL!(IR.Notwordboundary); 166 } 167 return true; 168 } 169 170 static bool op(IR code:IR.Bof)(E* e, S* state) 171 { 172 with(e) with(state) 173 { 174 if (atStart) 175 { 176 t.pc += IRL!(IR.Bof); 177 return true; 178 } 179 else 180 { 181 return popState(e); 182 } 183 } 184 } 185 186 static bool op(IR code:IR.Bol)(E* e, S* state) 187 { 188 with(e) with(state) 189 { 190 dchar back; 191 DataIndex bi; 192 if (atStart 193 ||(s.loopBack(index).nextChar(back,bi) 194 && startOfLine(back, front == '\n'))) 195 { 196 t.pc += IRL!(IR.Bol); 197 return true; 198 } 199 else 200 { 201 return popState(e); 202 } 203 } 204 } 205 206 static bool op(IR code:IR.Eof)(E* e, S* state) 207 { 208 with(e) with(state) 209 { 210 if (atEnd) 211 { 212 t.pc += IRL!(IR.Eol); 213 return true; 214 } 215 else 216 { 217 return popState(e); 218 } 219 } 220 } 221 222 static bool op(IR code:IR.Eol)(E* e, S* state) 223 { 224 with(e) with(state) 225 { 226 dchar back; 227 DataIndex bi; 228 //no matching inside \r\n 229 if (atEnd || (endOfLine(front, s.loopBack(index).nextChar(back, bi) 230 && back == '\r'))) 231 { 232 t.pc += IRL!(IR.Eol); 233 return true; 234 } 235 else 236 { 237 return popState(e); 238 } 239 240 } 241 } 242 243 static bool op(IR code:IR.InfiniteStart)(E* e, S* state) 244 { 245 with(e) with(state) 246 t.pc += re.ir[t.pc].data + IRL!(IR.InfiniteStart); 247 return op!(IR.InfiniteEnd)(e,state); 248 } 249 250 static bool op(IR code:IR.InfiniteBloomStart)(E* e, S* state) 251 { 252 with(e) with(state) 253 t.pc += re.ir[t.pc].data + IRL!(IR.InfiniteBloomStart); 254 return op!(IR.InfiniteBloomEnd)(e,state); 255 } 256 257 static bool op(IR code:IR.InfiniteQStart)(E* e, S* state) 258 { 259 with(e) with(state) 260 t.pc += re.ir[t.pc].data + IRL!(IR.InfiniteQStart); 261 return op!(IR.InfiniteQEnd)(e,state); 262 } 263 264 static bool op(IR code:IR.RepeatStart)(E* e, S* state) 265 { 266 with(e) with(state) 267 t.pc += re.ir[t.pc].data + IRL!(IR.RepeatStart); 268 return op!(IR.RepeatEnd)(e,state); 269 } 270 271 static bool op(IR code:IR.RepeatQStart)(E* e, S* state) 272 { 273 with(e) with(state) 274 t.pc += re.ir[t.pc].data + IRL!(IR.RepeatQStart); 275 return op!(IR.RepeatQEnd)(e,state); 276 } 277 278 static bool op(IR code)(E* e, S* state) 279 if (code == IR.RepeatEnd || code == IR.RepeatQEnd) 280 { 281 with(e) with(state) 282 { 283 //len, step, min, max 284 uint len = re.ir[t.pc].data; 285 uint step = re.ir[t.pc+2].raw; 286 uint min = re.ir[t.pc+3].raw; 287 if (t.counter < min) 288 { 289 t.counter += step; 290 t.pc -= len; 291 return true; 292 } 293 if (merge[re.ir[t.pc + 1].raw+t.counter] < genCounter) 294 { 295 debug(std_regex_matcher) writefln("A thread(pc=%s) passed there : %s ; GenCounter=%s mergetab=%s", 296 t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] ); 297 merge[re.ir[t.pc + 1].raw+t.counter] = genCounter; 298 } 299 else 300 { 301 debug(std_regex_matcher) 302 writefln("A thread(pc=%s) got merged there : %s ; GenCounter=%s mergetab=%s", 303 t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] ); 304 return popState(e); 305 } 306 uint max = re.ir[t.pc+4].raw; 307 if (t.counter < max) 308 { 309 if (re.ir[t.pc].code == IR.RepeatEnd) 310 { 311 //queue out-of-loop thread 312 worklist.insertFront(fork(t, t.pc + IRL!(IR.RepeatEnd), t.counter % step)); 313 t.counter += step; 314 t.pc -= len; 315 } 316 else 317 { 318 //queue into-loop thread 319 worklist.insertFront(fork(t, t.pc - len, t.counter + step)); 320 t.counter %= step; 321 t.pc += IRL!(IR.RepeatEnd); 322 } 323 } 324 else 325 { 326 t.counter %= step; 327 t.pc += IRL!(IR.RepeatEnd); 328 } 329 return true; 330 } 331 } 332 333 static bool op(IR code)(E* e, S* state) 334 if (code == IR.InfiniteEnd || code == IR.InfiniteQEnd) 335 { 336 with(e) with(state) 337 { 338 if (merge[re.ir[t.pc + 1].raw+t.counter] < genCounter) 339 { 340 debug(std_regex_matcher) writefln("A thread(pc=%s) passed there : %s ; GenCounter=%s mergetab=%s", 341 t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] ); 342 merge[re.ir[t.pc + 1].raw+t.counter] = genCounter; 343 } 344 else 345 { 346 debug(std_regex_matcher) writefln("A thread(pc=%s) got merged there : %s ; GenCounter=%s mergetab=%s", 347 t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] ); 348 return popState(e); 349 } 350 uint len = re.ir[t.pc].data; 351 uint pc1, pc2; //branches to take in priority order 352 if (re.ir[t.pc].code == IR.InfiniteEnd) 353 { 354 pc1 = t.pc - len; 355 pc2 = t.pc + IRL!(IR.InfiniteEnd); 356 } 357 else 358 { 359 pc1 = t.pc + IRL!(IR.InfiniteEnd); 360 pc2 = t.pc - len; 361 } 362 worklist.insertFront(fork(t, pc2, t.counter)); 363 t.pc = pc1; 364 return true; 365 } 366 } 367 368 static bool op(IR code)(E* e, S* state) 369 if (code == IR.InfiniteBloomEnd) 370 { 371 with(e) with(state) 372 { 373 if (merge[re.ir[t.pc + 1].raw+t.counter] < genCounter) 374 { 375 debug(std_regex_matcher) writefln("A thread(pc=%s) passed there : %s ; GenCounter=%s mergetab=%s", 376 t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] ); 377 merge[re.ir[t.pc + 1].raw+t.counter] = genCounter; 378 } 379 else 380 { 381 debug(std_regex_matcher) writefln("A thread(pc=%s) got merged there : %s ; GenCounter=%s mergetab=%s", 382 t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] ); 383 return popState(e); 384 } 385 uint len = re.ir[t.pc].data; 386 uint pc1, pc2; //branches to take in priority order 387 pc1 = t.pc - len; 388 pc2 = t.pc + IRL!(IR.InfiniteBloomEnd); 389 uint filterIndex = re.ir[t.pc + 2].raw; 390 if (re.filters[filterIndex][front]) 391 worklist.insertFront(fork(t, pc2, t.counter)); 392 t.pc = pc1; 393 return true; 394 } 395 } 396 397 static bool op(IR code:IR.OrEnd)(E* e, S* state) 398 { 399 with(e) with(state) 400 { 401 if (merge[re.ir[t.pc + 1].raw+t.counter] < genCounter) 402 { 403 debug(std_regex_matcher) writefln("A thread(pc=%s) passed there : %s ; GenCounter=%s mergetab=%s", 404 t.pc, s[index .. s.lastIndex], genCounter, merge[re.ir[t.pc + 1].raw + t.counter] ); 405 merge[re.ir[t.pc + 1].raw+t.counter] = genCounter; 406 t.pc += IRL!(IR.OrEnd); 407 } 408 else 409 { 410 debug(std_regex_matcher) writefln("A thread(pc=%s) got merged there : %s ; GenCounter=%s mergetab=%s", 411 t.pc, s[index .. s.lastIndex], genCounter, merge[re.ir[t.pc + 1].raw + t.counter] ); 412 return popState(e); 413 } 414 return true; 415 } 416 } 417 418 static bool op(IR code:IR.OrStart)(E* e, S* state) 419 { 420 with(e) with(state) 421 { 422 t.pc += IRL!(IR.OrStart); 423 return op!(IR.Option)(e,state); 424 } 425 } 426 427 static bool op(IR code:IR.Option)(E* e, S* state) 428 { 429 with(e) with(state) 430 { 431 uint next = t.pc + re.ir[t.pc].data + IRL!(IR.Option); 432 //queue next Option 433 if (re.ir[next].code == IR.Option) 434 { 435 worklist.insertFront(fork(t, next, t.counter)); 436 } 437 t.pc += IRL!(IR.Option); 438 return true; 439 } 440 } 441 442 static bool op(IR code:IR.GotoEndOr)(E* e, S* state) 443 { 444 with(e) with(state) 445 { 446 t.pc = t.pc + re.ir[t.pc].data + IRL!(IR.GotoEndOr); 447 return op!(IR.OrEnd)(e, state); 448 } 449 } 450 451 static bool op(IR code:IR.GroupStart)(E* e, S* state) 452 { 453 with(e) with(state) 454 { 455 uint n = re.ir[t.pc].data; 456 t.matches.ptr[n].begin = index; 457 t.pc += IRL!(IR.GroupStart); 458 return true; 459 } 460 } 461 static bool op(IR code:IR.GroupEnd)(E* e, S* state) 462 { 463 with(e) with(state) 464 { 465 uint n = re.ir[t.pc].data; 466 t.matches.ptr[n].end = index; 467 t.pc += IRL!(IR.GroupEnd); 468 return true; 469 } 470 } 471 472 static bool op(IR code:IR.Backref)(E* e, S* state) 473 { 474 with(e) with(state) 475 { 476 uint n = re.ir[t.pc].data; 477 Group!DataIndex* source = re.ir[t.pc].localRef ? t.matches.ptr : backrefed.ptr; 478 assert(source); 479 if (source[n].begin == source[n].end)//zero-width Backref! 480 { 481 t.pc += IRL!(IR.Backref); 482 return true; 483 } 484 else 485 { 486 size_t idx = source[n].begin + t.uopCounter; 487 size_t end = source[n].end; 488 if (s[idx .. end].front == front) 489 { 490 import std.utf : stride; 491 492 t.uopCounter += stride(s[idx .. end], 0); 493 if (t.uopCounter + source[n].begin == source[n].end) 494 {//last codepoint 495 t.pc += IRL!(IR.Backref); 496 t.uopCounter = 0; 497 } 498 nlist.insertBack(t); 499 } 500 else 501 recycle(t); 502 t = worklist.fetch(); 503 return t != null; 504 } 505 } 506 } 507 508 509 static bool op(IR code)(E* e, S* state) 510 if (code == IR.LookbehindStart || code == IR.NeglookbehindStart) 511 { 512 with(e) with(state) 513 { 514 uint len = re.ir[t.pc].data; 515 uint ms = re.ir[t.pc + 1].raw, me = re.ir[t.pc + 2].raw; 516 uint end = t.pc + len + IRL!(IR.LookbehindEnd) + IRL!(IR.LookbehindStart); 517 bool positive = re.ir[t.pc].code == IR.LookbehindStart; 518 static if (Stream.isLoopback) 519 auto matcher = fwdMatcher(t.pc, end, subCounters.get(t.pc, 0)); 520 else 521 auto matcher = bwdMatcher(t.pc, end, subCounters.get(t.pc, 0)); 522 matcher.re.ngroup = me - ms; 523 matcher.backrefed = backrefed.empty ? t.matches : backrefed; 524 //backMatch 525 auto mRes = matcher.matchOneShot(t.matches.ptr[ms .. me], IRL!(IR.LookbehindStart)); 526 freelist = matcher.freelist; 527 subCounters[t.pc] = matcher.genCounter; 528 if ((mRes != 0 ) ^ positive) 529 { 530 return popState(e); 531 } 532 t.pc = end; 533 return true; 534 } 535 } 536 537 static bool op(IR code)(E* e, S* state) 538 if (code == IR.LookaheadStart || code == IR.NeglookaheadStart) 539 { 540 with(e) with(state) 541 { 542 auto save = index; 543 uint len = re.ir[t.pc].data; 544 uint ms = re.ir[t.pc+1].raw, me = re.ir[t.pc+2].raw; 545 uint end = t.pc+len+IRL!(IR.LookaheadEnd)+IRL!(IR.LookaheadStart); 546 bool positive = re.ir[t.pc].code == IR.LookaheadStart; 547 static if (Stream.isLoopback) 548 auto matcher = bwdMatcher(t.pc, end, subCounters.get(t.pc, 0)); 549 else 550 auto matcher = fwdMatcher(t.pc, end, subCounters.get(t.pc, 0)); 551 matcher.re.ngroup = me - ms; 552 matcher.backrefed = backrefed.empty ? t.matches : backrefed; 553 auto mRes = matcher.matchOneShot(t.matches.ptr[ms .. me], IRL!(IR.LookaheadStart)); 554 freelist = matcher.freelist; 555 subCounters[t.pc] = matcher.genCounter; 556 s.reset(index); 557 next(); 558 if ((mRes != 0) ^ positive) 559 { 560 return popState(e); 561 } 562 t.pc = end; 563 return true; 564 } 565 } 566 567 static bool op(IR code)(E* e, S* state) 568 if (code == IR.LookaheadEnd || code == IR.NeglookaheadEnd || 569 code == IR.LookbehindEnd || code == IR.NeglookbehindEnd) 570 { 571 with(e) with(state) 572 { 573 finish(t, matches.ptr[0 .. re.ngroup], re.ir[t.pc].data); 574 recycle(t); 575 //cut off low priority threads 576 recycle(clist); 577 recycle(worklist); 578 return false; // no more state 579 } 580 } 581 582 static bool op(IR code:IR.Nop)(E* e, S* state) 583 { 584 with(state) t.pc += IRL!(IR.Nop); 585 return true; 586 } 587 588 static bool op(IR code:IR.OrChar)(E* e, S* state) 589 { 590 with(e) with(state) 591 { 592 uint len = re.ir[t.pc].sequence; 593 uint end = t.pc + len; 594 static assert(IRL!(IR.OrChar) == 1); 595 for (; t.pc < end; t.pc++) 596 if (re.ir[t.pc].data == front) 597 break; 598 if (t.pc != end) 599 { 600 t.pc = end; 601 nlist.insertBack(t); 602 } 603 else 604 recycle(t); 605 t = worklist.fetch(); 606 return t != null; 607 } 608 } 609 610 static bool op(IR code:IR.Char)(E* e, S* state) 611 { 612 with(e) with(state) 613 { 614 if (front == re.ir[t.pc].data) 615 { 616 t.pc += IRL!(IR.Char); 617 nlist.insertBack(t); 618 } 619 else 620 recycle(t); 621 t = worklist.fetch(); 622 return t != null; 623 } 624 } 625 626 static bool op(IR code:IR.Any)(E* e, S* state) 627 { 628 with(e) with(state) 629 { 630 t.pc += IRL!(IR.Any); 631 nlist.insertBack(t); 632 t = worklist.fetch(); 633 return t != null; 634 } 635 } 636 637 static bool op(IR code:IR.CodepointSet)(E* e, S* state) 638 { 639 with(e) with(state) 640 { 641 if (re.charsets[re.ir[t.pc].data].scanFor(front)) 642 { 643 t.pc += IRL!(IR.CodepointSet); 644 nlist.insertBack(t); 645 } 646 else 647 { 648 recycle(t); 649 } 650 t = worklist.fetch(); 651 return t != null; 652 } 653 } 654 655 static bool op(IR code:IR.Trie)(E* e, S* state) 656 { 657 with(e) with(state) 658 { 659 if (re.matchers[re.ir[t.pc].data][front]) 660 { 661 t.pc += IRL!(IR.Trie); 662 nlist.insertBack(t); 663 } 664 else 665 { 666 recycle(t); 667 } 668 t = worklist.fetch(); 669 return t != null; 670 } 671 } 672 673 } 674 675 template ThompsonOps(E,S, bool withInput:false) 676 { 677 @trusted: 678 // can't match these without input 679 static bool op(IR code)(E* e, S* state) 680 if (code == IR.Char || code == IR.OrChar || code == IR.CodepointSet 681 || code == IR.Trie || code == IR.Char || code == IR.Any) 682 { 683 return state.popState(e); 684 } 685 686 // special case of zero-width backref 687 static bool op(IR code:IR.Backref)(E* e, S* state) 688 { 689 with(e) with(state) 690 { 691 uint n = re.ir[t.pc].data; 692 Group!DataIndex* source = re.ir[t.pc].localRef ? t.matches.ptr : backrefed.ptr; 693 assert(source); 694 if (source[n].begin == source[n].end)//zero-width Backref! 695 { 696 t.pc += IRL!(IR.Backref); 697 return true; 698 } 699 else 700 return popState(e); 701 } 702 } 703 704 // forward all control flow to normal versions 705 static bool op(IR code)(E* e, S* state) 706 if (code != IR.Char && code != IR.OrChar && code != IR.CodepointSet 707 && code != IR.Trie && code != IR.Char && code != IR.Any && code != IR.Backref) 708 { 709 return ThompsonOps!(E,S,true).op!code(e,state); 710 } 711 } 712 713 /+ 714 Thomspon matcher does all matching in lockstep, 715 never looking at the same char twice 716 +/ 717 @trusted struct ThompsonMatcher(Char, StreamType = Input!Char) 718 if (is(Char : dchar)) 719 { 720 alias DataIndex = Stream.DataIndex; 721 alias Stream = StreamType; 722 alias OpFunc = bool function(ThompsonMatcher*, State*); 723 alias BackMatcher = ThompsonMatcher!(Char, BackLooper!(Stream)); 724 alias OpBackFunc = bool function(BackMatcher*, BackMatcher.State*); 725 Thread!DataIndex* freelist; 726 ThreadList!DataIndex clist, nlist; 727 DataIndex[] merge; 728 Group!DataIndex[] backrefed; 729 Regex!Char re; //regex program 730 Stream s; 731 dchar front; 732 DataIndex index; 733 DataIndex genCounter; //merge trace counter, goes up on every dchar 734 size_t[size_t] subCounters; //a table of gen counter per sub-engine: PC -> counter 735 OpFunc[] opCacheTrue; // pointers to Op!(IR.xyz) for each bytecode 736 OpFunc[] opCacheFalse; // ditto 737 OpBackFunc[] opCacheBackTrue; // ditto 738 OpBackFunc[] opCacheBackFalse; // ditto 739 size_t threadSize; 740 int matched; 741 bool exhausted; 742 743 static struct State 744 { 745 Thread!DataIndex* t; 746 ThreadList!DataIndex worklist; 747 Group!DataIndex[] matches; 748 749 bool popState(E)(E* e) 750 { 751 with(e) 752 { 753 recycle(t); 754 t = worklist.fetch(); 755 return t != null; 756 } 757 } 758 759 } 760 761 static if (__traits(hasMember,Stream, "search")) 762 { 763 enum kicked = true; 764 } 765 else 766 enum kicked = false; 767 768 static size_t getThreadSize(const ref Regex!Char re) 769 { 770 return re.ngroup 771 ? (Thread!DataIndex).sizeof + (re.ngroup-1)*(Group!DataIndex).sizeof 772 : (Thread!DataIndex).sizeof - (Group!DataIndex).sizeof; 773 } 774 775 static size_t initialMemory(const ref Regex!Char re) 776 { 777 return getThreadSize(re)*re.threadCount + re.hotspotTableSize*size_t.sizeof 778 +4*OpFunc.sizeof*re.ir.length; 779 } 780 781 //true if it's start of input 782 @property bool atStart(){ return index == 0; } 783 784 //true if it's end of input 785 @property bool atEnd(){ return index == s.lastIndex && s.atEnd; } 786 787 bool next() 788 { 789 if (!s.nextChar(front, index)) 790 { 791 index = s.lastIndex; 792 return false; 793 } 794 return true; 795 } 796 797 static if (kicked) 798 { 799 bool search() 800 { 801 802 if (!s.search(re.kickstart, front, index)) 803 { 804 index = s.lastIndex; 805 return false; 806 } 807 return true; 808 } 809 } 810 811 void initExternalMemory(void[] memory) 812 { 813 threadSize = getThreadSize(re); 814 prepareFreeList(re.threadCount, memory); 815 if (re.hotspotTableSize) 816 { 817 merge = arrayInChunk!(DataIndex)(re.hotspotTableSize, memory); 818 merge[] = 0; 819 } 820 opCacheTrue = arrayInChunk!(OpFunc)(re.ir.length, memory); 821 opCacheFalse = arrayInChunk!(OpFunc)(re.ir.length, memory); 822 opCacheBackTrue = arrayInChunk!(OpBackFunc)(re.ir.length, memory); 823 opCacheBackFalse = arrayInChunk!(OpBackFunc)(re.ir.length, memory); 824 825 for (uint pc = 0; pc<re.ir.length; pc += re.ir[pc].length) 826 { 827 L_dispatch: 828 switch (re.ir[pc].code) 829 { 830 foreach (e; __traits(allMembers, IR)) 831 { 832 mixin(`case IR.`~e~`: 833 opCacheTrue[pc] = &Ops!(true).op!(IR.`~e~`); 834 opCacheBackTrue[pc] = &BackOps!(true).op!(IR.`~e~`); 835 opCacheFalse[pc] = &Ops!(false).op!(IR.`~e~`); 836 opCacheBackFalse[pc] = &BackOps!(false).op!(IR.`~e~`); 837 break L_dispatch; 838 `); 839 } 840 default: 841 assert(0, "Unrecognized instruction "~re.ir[pc].mnemonic); 842 } 843 } 844 } 845 846 this()(Regex!Char program, Stream stream, void[] memory) 847 { 848 re = program; 849 s = stream; 850 initExternalMemory(memory); 851 genCounter = 0; 852 } 853 854 this(ref ThompsonMatcher matcher, size_t lo, size_t hi, Stream stream) 855 { 856 s = stream; 857 re = matcher.re; 858 re.ir = re.ir[lo .. hi]; 859 threadSize = matcher.threadSize; 860 merge = matcher.merge; 861 freelist = matcher.freelist; 862 opCacheTrue = matcher.opCacheTrue[lo .. hi]; 863 opCacheBackTrue = matcher.opCacheBackTrue[lo .. hi]; 864 opCacheFalse = matcher.opCacheFalse[lo .. hi]; 865 opCacheBackFalse = matcher.opCacheBackFalse[lo .. hi]; 866 front = matcher.front; 867 index = matcher.index; 868 } 869 870 this(ref BackMatcher matcher, size_t lo, size_t hi, Stream stream) 871 { 872 s = stream; 873 re = matcher.re; 874 re.ir = re.ir[lo .. hi]; 875 threadSize = matcher.threadSize; 876 merge = matcher.merge; 877 freelist = matcher.freelist; 878 opCacheTrue = matcher.opCacheBackTrue[lo .. hi]; 879 opCacheBackTrue = matcher.opCacheTrue[lo .. hi]; 880 opCacheFalse = matcher.opCacheBackFalse[lo .. hi]; 881 opCacheBackFalse = matcher.opCacheFalse[lo .. hi]; 882 front = matcher.front; 883 index = matcher.index; 884 } 885 886 auto fwdMatcher()(size_t lo, size_t hi, size_t counter) 887 { 888 auto m = ThompsonMatcher!(Char, Stream)(this, lo, hi, s); 889 m.genCounter = counter; 890 return m; 891 } 892 893 auto bwdMatcher()(size_t lo, size_t hi, size_t counter) 894 { 895 alias BackLooper = typeof(s.loopBack(index)); 896 auto m = ThompsonMatcher!(Char, BackLooper)(this, lo, hi, s.loopBack(index)); 897 m.genCounter = counter; 898 m.next(); 899 return m; 900 } 901 902 auto dupTo(void[] memory) 903 { 904 typeof(this) tmp = this;//bitblit 905 tmp.initExternalMemory(memory); 906 tmp.genCounter = 0; 907 return tmp; 908 } 909 910 int match(Group!DataIndex[] matches) 911 { 912 debug(std_regex_matcher) 913 writeln("------------------------------------------"); 914 if (exhausted) 915 { 916 return false; 917 } 918 if (re.flags & RegexInfo.oneShot) 919 { 920 next(); 921 exhausted = true; 922 return matchOneShot(matches); 923 } 924 static if (kicked) 925 if (!re.kickstart.empty) 926 return matchImpl!(true)(matches); 927 return matchImpl!(false)(matches); 928 } 929 930 //match the input and fill matches 931 int matchImpl(bool withSearch)(Group!DataIndex[] matches) 932 { 933 if (!matched && clist.empty) 934 { 935 static if (withSearch) 936 search(); 937 else 938 next(); 939 } 940 else//char in question is fetched in prev call to match 941 { 942 matched = 0; 943 } 944 State state; 945 state.matches = matches; 946 947 if (!atEnd)//if no char 948 for (;;) 949 { 950 genCounter++; 951 debug(std_regex_matcher) 952 { 953 writefln("Threaded matching threads at %s", s[index .. s.lastIndex]); 954 foreach (t; clist[]) 955 { 956 assert(t); 957 writef("pc=%s ",t.pc); 958 write(t.matches); 959 writeln(); 960 } 961 } 962 for (state.t = clist.fetch(); state.t; state.t = clist.fetch()) 963 { 964 eval!true(&state); 965 } 966 //if we already have match no need to push the engine 967 if (!matched) 968 { 969 state.t = createStart(index); 970 eval!true(&state);//new thread staring at this position 971 } 972 else if (nlist.empty) 973 { 974 debug(std_regex_matcher) writeln("Stopped matching before consuming full input"); 975 break;//not a partial match for sure 976 } 977 clist = nlist; 978 nlist = (ThreadList!DataIndex).init; 979 if (clist.tip is null) 980 { 981 static if (withSearch) 982 { 983 if (!search()) 984 break; 985 } 986 else 987 { 988 if (!next()) 989 break; 990 } 991 } 992 else if (!next()) 993 { 994 if (!atEnd) return false; 995 exhausted = true; 996 break; 997 } 998 } 999 1000 genCounter++; //increment also on each end 1001 debug(std_regex_matcher) writefln("Threaded matching threads at end"); 1002 //try out all zero-width posibilities 1003 for (state.t = clist.fetch(); state.t; state.t = clist.fetch()) 1004 { 1005 eval!false(&state); 1006 } 1007 if (!matched) 1008 { 1009 state.t = createStart(index); 1010 eval!false(&state);//new thread starting at end of input 1011 } 1012 if (matched) 1013 {//in case NFA found match along the way 1014 //and last possible longer alternative ultimately failed 1015 s.reset(matches[0].end);//reset to last successful match 1016 next();//and reload front character 1017 //--- here the exact state of stream was restored --- 1018 exhausted = atEnd || !(re.flags & RegexOption.global); 1019 //+ empty match advances the input 1020 if (!exhausted && matches[0].begin == matches[0].end) 1021 next(); 1022 } 1023 return matched; 1024 } 1025 1026 /+ 1027 handle succesful threads 1028 +/ 1029 void finish(const(Thread!DataIndex)* t, Group!DataIndex[] matches, int code) 1030 { 1031 matches.ptr[0 .. re.ngroup] = t.matches.ptr[0 .. re.ngroup]; 1032 debug(std_regex_matcher) 1033 { 1034 writef("FOUND pc=%s prog_len=%s", 1035 t.pc, re.ir.length); 1036 if (!matches.empty) 1037 writefln(": %s..%s", matches[0].begin, matches[0].end); 1038 foreach (v; matches) 1039 writefln("%d .. %d", v.begin, v.end); 1040 } 1041 matched = code; 1042 } 1043 1044 alias Ops(bool withInput) = ThompsonOps!(ThompsonMatcher, State, withInput); 1045 alias BackOps(bool withInput) = ThompsonOps!(BackMatcher, BackMatcher.State, withInput); 1046 1047 /+ 1048 match thread against codepoint, cutting trough all 0-width instructions 1049 and taking care of control flow, then add it to nlist 1050 +/ 1051 void eval(bool withInput)(State* state) 1052 { 1053 debug(std_regex_matcher) writeln("---- Evaluating thread"); 1054 static if (withInput) 1055 while (opCacheTrue.ptr[state.t.pc](&this, state)){} 1056 else 1057 while (opCacheFalse.ptr[state.t.pc](&this, state)){} 1058 } 1059 enum uint RestartPc = uint.max; 1060 //match the input, evaluating IR without searching 1061 int matchOneShot(Group!DataIndex[] matches, uint startPc = 0) 1062 { 1063 debug(std_regex_matcher) 1064 { 1065 writefln("---------------single shot match ----------------- "); 1066 } 1067 alias evalFn = eval; 1068 assert(clist == (ThreadList!DataIndex).init || startPc == RestartPc); // incorrect after a partial match 1069 assert(nlist == (ThreadList!DataIndex).init || startPc == RestartPc); 1070 State state; 1071 state.matches = matches; 1072 if (!atEnd)//if no char 1073 { 1074 debug(std_regex_matcher) 1075 { 1076 writefln("-- Threaded matching threads at %s", s[index .. s.lastIndex]); 1077 } 1078 if (startPc != RestartPc) 1079 { 1080 state.t = createStart(index, startPc); 1081 genCounter++; 1082 evalFn!true(&state); 1083 } 1084 for (;;) 1085 { 1086 debug(std_regex_matcher) writeln("\n-- Started iteration of main cycle"); 1087 genCounter++; 1088 debug(std_regex_matcher) 1089 { 1090 foreach (t; clist[]) 1091 { 1092 assert(t); 1093 } 1094 } 1095 for (state.t = clist.fetch(); state.t; state.t = clist.fetch()) 1096 { 1097 evalFn!true(&state); 1098 } 1099 if (nlist.empty) 1100 { 1101 debug(std_regex_matcher) writeln("Stopped matching before consuming full input"); 1102 break;//not a partial match for sure 1103 } 1104 clist = nlist; 1105 nlist = (ThreadList!DataIndex).init; 1106 if (!next()) 1107 break; 1108 debug(std_regex_matcher) writeln("-- Ended iteration of main cycle\n"); 1109 } 1110 } 1111 genCounter++; //increment also on each end 1112 debug(std_regex_matcher) writefln("-- Matching threads at end"); 1113 //try out all zero-width posibilities 1114 for (state.t = clist.fetch(); state.t; state.t = clist.fetch()) 1115 { 1116 evalFn!false(&state); 1117 } 1118 if (!matched) 1119 { 1120 state.t = createStart(index, startPc); 1121 evalFn!false(&state); 1122 } 1123 return matched; 1124 } 1125 1126 //get a dirty recycled Thread 1127 Thread!DataIndex* allocate() 1128 { 1129 assert(freelist, "not enough preallocated memory"); 1130 Thread!DataIndex* t = freelist; 1131 freelist = freelist.next; 1132 return t; 1133 } 1134 1135 //link memory into a free list of Threads 1136 void prepareFreeList(size_t size, ref void[] memory) 1137 { 1138 void[] mem = memory[0 .. threadSize*size]; 1139 memory = memory[threadSize * size .. $]; 1140 freelist = cast(Thread!DataIndex*)&mem[0]; 1141 size_t i; 1142 for (i = threadSize; i < threadSize*size; i += threadSize) 1143 (cast(Thread!DataIndex*)&mem[i-threadSize]).next = cast(Thread!DataIndex*)&mem[i]; 1144 (cast(Thread!DataIndex*)&mem[i-threadSize]).next = null; 1145 } 1146 1147 //dispose a thread 1148 void recycle(Thread!DataIndex* t) 1149 { 1150 t.next = freelist; 1151 freelist = t; 1152 } 1153 1154 //dispose list of threads 1155 void recycle(ref ThreadList!DataIndex list) 1156 { 1157 if (list.tip) 1158 { 1159 // just put this head-tail list in front of freelist 1160 list.toe.next = freelist; 1161 freelist = list.tip; 1162 list = list.init; 1163 } 1164 } 1165 1166 //creates a copy of master thread with given pc 1167 Thread!DataIndex* fork(Thread!DataIndex* master, uint pc, uint counter) 1168 { 1169 auto t = allocate(); 1170 t.matches.ptr[0 .. re.ngroup] = master.matches.ptr[0 .. re.ngroup]; 1171 t.pc = pc; 1172 t.counter = counter; 1173 t.uopCounter = 0; 1174 return t; 1175 } 1176 1177 //creates a start thread 1178 Thread!DataIndex* createStart(DataIndex index, uint pc = 0) 1179 { 1180 auto t = allocate(); 1181 t.matches.ptr[0 .. re.ngroup] = (Group!DataIndex).init; 1182 t.matches[0].begin = index; 1183 t.pc = pc; 1184 t.counter = 0; 1185 t.uopCounter = 0; 1186 return t; 1187 } 1188 } 1189