1 //Written in the D programming language
2 /*
3 Implementation of Thompson NFA std.regex engine.
4 Key point is evaluation of all possible threads (state) at each step
5 in a breadth-first manner, thereby geting some nice properties:
6 - looking at each character only once
7 - merging of equivalent threads, that gives matching process linear time complexity
8 */
9 module std.regex.internal.thompson;
10
11 package(std.regex):
12
13 import std.range.primitives;
14 import std.regex.internal.ir;
15
16 //State of VM thread
Thread(DataIndex)17 struct Thread(DataIndex)
18 {
19 Thread* next; //intrusive linked list
20 uint pc;
21 uint counter; //loop counter
22 uint uopCounter; //counts micro operations inside one macro instruction (e.g. BackRef)
23 Group!DataIndex[1] matches;
24 }
25
26 //head-tail singly-linked list
ThreadList(DataIndex)27 struct ThreadList(DataIndex)
28 {
29 Thread!DataIndex* tip = null, toe = null;
30 //add new thread to the start of list
31 void insertFront(Thread!DataIndex* t)
32 {
33 if (tip)
34 {
35 t.next = tip;
36 tip = t;
37 }
38 else
39 {
40 t.next = null;
41 tip = toe = t;
42 }
43 }
44 //add new thread to the end of list
45 void insertBack(Thread!DataIndex* t)
46 {
47 if (toe)
48 {
49 toe.next = t;
50 toe = t;
51 }
52 else
53 tip = toe = t;
54 toe.next = null;
55 }
56 //move head element out of list
57 Thread!DataIndex* fetch()
58 {
59 auto t = tip;
60 if (tip == toe)
61 tip = toe = null;
62 else
63 tip = tip.next;
64 return t;
65 }
66 //non-destructive iteration of ThreadList
67 struct ThreadRange
68 {
69 const(Thread!DataIndex)* ct;
70 this(ThreadList tlist){ ct = tlist.tip; }
71 @property bool empty(){ return ct is null; }
72 @property const(Thread!DataIndex)* front(){ return ct; }
73 @property popFront()
74 {
75 assert(ct);
76 ct = ct.next;
77 }
78 }
79 @property bool empty()
80 {
81 return tip == null;
82 }
83 ThreadRange opSlice()
84 {
85 return ThreadRange(this);
86 }
87 }
88
89 template ThompsonOps(E, S, bool withInput:true)
90 {
91 @trusted:
92 static bool op(IR code:IR.End)(E* e, S* state)
93 {
with(state)94 with(e) with(state)
95 {
96 finish(t, matches, re.ir[t.pc].data);
97 //fix endpoint of the whole match
98 matches[0].end = index;
99 recycle(t);
100 //cut off low priority threads
101 recycle(clist);
102 recycle(worklist);
103 debug(std_regex_matcher) writeln("Finished thread ", matches);
104 return false; // no more state to eval
105 }
106 }
107
108 static bool op(IR code:IR.Wordboundary)(E* e, S* state)
109 {
with(state)110 with(e) with(state)
111 {
112 dchar back;
113 DataIndex bi;
114 //at start & end of input
115 if (atStart && wordMatcher[front])
116 {
117 t.pc += IRL!(IR.Wordboundary);
118 return true;
119 }
120 else if (atEnd && s.loopBack(index).nextChar(back, bi)
121 && wordMatcher[back])
122 {
123 t.pc += IRL!(IR.Wordboundary);
124 return true;
125 }
126 else if (s.loopBack(index).nextChar(back, bi))
127 {
128 bool af = wordMatcher[front];
129 bool ab = wordMatcher[back];
130 if (af ^ ab)
131 {
132 t.pc += IRL!(IR.Wordboundary);
133 return true;
134 }
135 }
136 return popState(e);
137 }
138 }
139
140 static bool op(IR code:IR.Notwordboundary)(E* e, S* state)
141 {
with(state)142 with(e) with(state)
143 {
144 dchar back;
145 DataIndex bi;
146 //at start & end of input
147 if (atStart && wordMatcher[front])
148 {
149 return popState(e);
150 }
151 else if (atEnd && s.loopBack(index).nextChar(back, bi)
152 && wordMatcher[back])
153 {
154 return popState(e);
155 }
156 else if (s.loopBack(index).nextChar(back, bi))
157 {
158 bool af = wordMatcher[front];
159 bool ab = wordMatcher[back] != 0;
160 if (af ^ ab)
161 {
162 return popState(e);
163 }
164 }
165 t.pc += IRL!(IR.Notwordboundary);
166 }
167 return true;
168 }
169
170 static bool op(IR code:IR.Bof)(E* e, S* state)
171 {
with(state)172 with(e) with(state)
173 {
174 if (atStart)
175 {
176 t.pc += IRL!(IR.Bof);
177 return true;
178 }
179 else
180 {
181 return popState(e);
182 }
183 }
184 }
185
186 static bool op(IR code:IR.Bol)(E* e, S* state)
187 {
with(state)188 with(e) with(state)
189 {
190 dchar back;
191 DataIndex bi;
192 if (atStart
193 ||(s.loopBack(index).nextChar(back,bi)
194 && startOfLine(back, front == '\n')))
195 {
196 t.pc += IRL!(IR.Bol);
197 return true;
198 }
199 else
200 {
201 return popState(e);
202 }
203 }
204 }
205
206 static bool op(IR code:IR.Eof)(E* e, S* state)
207 {
with(state)208 with(e) with(state)
209 {
210 if (atEnd)
211 {
212 t.pc += IRL!(IR.Eol);
213 return true;
214 }
215 else
216 {
217 return popState(e);
218 }
219 }
220 }
221
222 static bool op(IR code:IR.Eol)(E* e, S* state)
223 {
with(state)224 with(e) with(state)
225 {
226 dchar back;
227 DataIndex bi;
228 //no matching inside \r\n
229 if (atEnd || (endOfLine(front, s.loopBack(index).nextChar(back, bi)
230 && back == '\r')))
231 {
232 t.pc += IRL!(IR.Eol);
233 return true;
234 }
235 else
236 {
237 return popState(e);
238 }
239
240 }
241 }
242
243 static bool op(IR code:IR.InfiniteStart)(E* e, S* state)
244 {
245 with(e) with(state)
246 t.pc += re.ir[t.pc].data + IRL!(IR.InfiniteStart);
247 return op!(IR.InfiniteEnd)(e,state);
248 }
249
250 static bool op(IR code:IR.InfiniteBloomStart)(E* e, S* state)
251 {
252 with(e) with(state)
253 t.pc += re.ir[t.pc].data + IRL!(IR.InfiniteBloomStart);
254 return op!(IR.InfiniteBloomEnd)(e,state);
255 }
256
257 static bool op(IR code:IR.InfiniteQStart)(E* e, S* state)
258 {
259 with(e) with(state)
260 t.pc += re.ir[t.pc].data + IRL!(IR.InfiniteQStart);
261 return op!(IR.InfiniteQEnd)(e,state);
262 }
263
264 static bool op(IR code:IR.RepeatStart)(E* e, S* state)
265 {
266 with(e) with(state)
267 t.pc += re.ir[t.pc].data + IRL!(IR.RepeatStart);
268 return op!(IR.RepeatEnd)(e,state);
269 }
270
271 static bool op(IR code:IR.RepeatQStart)(E* e, S* state)
272 {
273 with(e) with(state)
274 t.pc += re.ir[t.pc].data + IRL!(IR.RepeatQStart);
275 return op!(IR.RepeatQEnd)(e,state);
276 }
277
278 static bool op(IR code)(E* e, S* state)
279 if (code == IR.RepeatEnd || code == IR.RepeatQEnd)
280 {
with(state)281 with(e) with(state)
282 {
283 //len, step, min, max
284 uint len = re.ir[t.pc].data;
285 uint step = re.ir[t.pc+2].raw;
286 uint min = re.ir[t.pc+3].raw;
287 if (t.counter < min)
288 {
289 t.counter += step;
290 t.pc -= len;
291 return true;
292 }
293 if (merge[re.ir[t.pc + 1].raw+t.counter] < genCounter)
294 {
295 debug(std_regex_matcher) writefln("A thread(pc=%s) passed there : %s ; GenCounter=%s mergetab=%s",
296 t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] );
297 merge[re.ir[t.pc + 1].raw+t.counter] = genCounter;
298 }
299 else
300 {
301 debug(std_regex_matcher)
302 writefln("A thread(pc=%s) got merged there : %s ; GenCounter=%s mergetab=%s",
303 t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] );
304 return popState(e);
305 }
306 uint max = re.ir[t.pc+4].raw;
307 if (t.counter < max)
308 {
309 if (re.ir[t.pc].code == IR.RepeatEnd)
310 {
311 //queue out-of-loop thread
312 worklist.insertFront(fork(t, t.pc + IRL!(IR.RepeatEnd), t.counter % step));
313 t.counter += step;
314 t.pc -= len;
315 }
316 else
317 {
318 //queue into-loop thread
319 worklist.insertFront(fork(t, t.pc - len, t.counter + step));
320 t.counter %= step;
321 t.pc += IRL!(IR.RepeatEnd);
322 }
323 }
324 else
325 {
326 t.counter %= step;
327 t.pc += IRL!(IR.RepeatEnd);
328 }
329 return true;
330 }
331 }
332
333 static bool op(IR code)(E* e, S* state)
334 if (code == IR.InfiniteEnd || code == IR.InfiniteQEnd)
335 {
with(state)336 with(e) with(state)
337 {
338 if (merge[re.ir[t.pc + 1].raw+t.counter] < genCounter)
339 {
340 debug(std_regex_matcher) writefln("A thread(pc=%s) passed there : %s ; GenCounter=%s mergetab=%s",
341 t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] );
342 merge[re.ir[t.pc + 1].raw+t.counter] = genCounter;
343 }
344 else
345 {
346 debug(std_regex_matcher) writefln("A thread(pc=%s) got merged there : %s ; GenCounter=%s mergetab=%s",
347 t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] );
348 return popState(e);
349 }
350 uint len = re.ir[t.pc].data;
351 uint pc1, pc2; //branches to take in priority order
352 if (re.ir[t.pc].code == IR.InfiniteEnd)
353 {
354 pc1 = t.pc - len;
355 pc2 = t.pc + IRL!(IR.InfiniteEnd);
356 }
357 else
358 {
359 pc1 = t.pc + IRL!(IR.InfiniteEnd);
360 pc2 = t.pc - len;
361 }
362 worklist.insertFront(fork(t, pc2, t.counter));
363 t.pc = pc1;
364 return true;
365 }
366 }
367
368 static bool op(IR code)(E* e, S* state)
369 if (code == IR.InfiniteBloomEnd)
370 {
with(state)371 with(e) with(state)
372 {
373 if (merge[re.ir[t.pc + 1].raw+t.counter] < genCounter)
374 {
375 debug(std_regex_matcher) writefln("A thread(pc=%s) passed there : %s ; GenCounter=%s mergetab=%s",
376 t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] );
377 merge[re.ir[t.pc + 1].raw+t.counter] = genCounter;
378 }
379 else
380 {
381 debug(std_regex_matcher) writefln("A thread(pc=%s) got merged there : %s ; GenCounter=%s mergetab=%s",
382 t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] );
383 return popState(e);
384 }
385 uint len = re.ir[t.pc].data;
386 uint pc1, pc2; //branches to take in priority order
387 pc1 = t.pc - len;
388 pc2 = t.pc + IRL!(IR.InfiniteBloomEnd);
389 uint filterIndex = re.ir[t.pc + 2].raw;
390 if (re.filters[filterIndex][front])
391 worklist.insertFront(fork(t, pc2, t.counter));
392 t.pc = pc1;
393 return true;
394 }
395 }
396
397 static bool op(IR code:IR.OrEnd)(E* e, S* state)
398 {
with(state)399 with(e) with(state)
400 {
401 if (merge[re.ir[t.pc + 1].raw+t.counter] < genCounter)
402 {
403 debug(std_regex_matcher) writefln("A thread(pc=%s) passed there : %s ; GenCounter=%s mergetab=%s",
404 t.pc, s[index .. s.lastIndex], genCounter, merge[re.ir[t.pc + 1].raw + t.counter] );
405 merge[re.ir[t.pc + 1].raw+t.counter] = genCounter;
406 t.pc += IRL!(IR.OrEnd);
407 }
408 else
409 {
410 debug(std_regex_matcher) writefln("A thread(pc=%s) got merged there : %s ; GenCounter=%s mergetab=%s",
411 t.pc, s[index .. s.lastIndex], genCounter, merge[re.ir[t.pc + 1].raw + t.counter] );
412 return popState(e);
413 }
414 return true;
415 }
416 }
417
418 static bool op(IR code:IR.OrStart)(E* e, S* state)
419 {
with(state)420 with(e) with(state)
421 {
422 t.pc += IRL!(IR.OrStart);
423 return op!(IR.Option)(e,state);
424 }
425 }
426
427 static bool op(IR code:IR.Option)(E* e, S* state)
428 {
with(state)429 with(e) with(state)
430 {
431 uint next = t.pc + re.ir[t.pc].data + IRL!(IR.Option);
432 //queue next Option
433 if (re.ir[next].code == IR.Option)
434 {
435 worklist.insertFront(fork(t, next, t.counter));
436 }
437 t.pc += IRL!(IR.Option);
438 return true;
439 }
440 }
441
442 static bool op(IR code:IR.GotoEndOr)(E* e, S* state)
443 {
with(state)444 with(e) with(state)
445 {
446 t.pc = t.pc + re.ir[t.pc].data + IRL!(IR.GotoEndOr);
447 return op!(IR.OrEnd)(e, state);
448 }
449 }
450
451 static bool op(IR code:IR.GroupStart)(E* e, S* state)
452 {
with(state)453 with(e) with(state)
454 {
455 uint n = re.ir[t.pc].data;
456 t.matches.ptr[n].begin = index;
457 t.pc += IRL!(IR.GroupStart);
458 return true;
459 }
460 }
461 static bool op(IR code:IR.GroupEnd)(E* e, S* state)
462 {
with(state)463 with(e) with(state)
464 {
465 uint n = re.ir[t.pc].data;
466 t.matches.ptr[n].end = index;
467 t.pc += IRL!(IR.GroupEnd);
468 return true;
469 }
470 }
471
472 static bool op(IR code:IR.Backref)(E* e, S* state)
473 {
with(state)474 with(e) with(state)
475 {
476 uint n = re.ir[t.pc].data;
477 Group!DataIndex* source = re.ir[t.pc].localRef ? t.matches.ptr : backrefed.ptr;
478 assert(source);
479 if (source[n].begin == source[n].end)//zero-width Backref!
480 {
481 t.pc += IRL!(IR.Backref);
482 return true;
483 }
484 else
485 {
486 size_t idx = source[n].begin + t.uopCounter;
487 size_t end = source[n].end;
488 if (s[idx .. end].front == front)
489 {
490 import std.utf : stride;
491
492 t.uopCounter += stride(s[idx .. end], 0);
493 if (t.uopCounter + source[n].begin == source[n].end)
494 {//last codepoint
495 t.pc += IRL!(IR.Backref);
496 t.uopCounter = 0;
497 }
498 nlist.insertBack(t);
499 }
500 else
501 recycle(t);
502 t = worklist.fetch();
503 return t != null;
504 }
505 }
506 }
507
508
509 static bool op(IR code)(E* e, S* state)
510 if (code == IR.LookbehindStart || code == IR.NeglookbehindStart)
511 {
with(state)512 with(e) with(state)
513 {
514 uint len = re.ir[t.pc].data;
515 uint ms = re.ir[t.pc + 1].raw, me = re.ir[t.pc + 2].raw;
516 uint end = t.pc + len + IRL!(IR.LookbehindEnd) + IRL!(IR.LookbehindStart);
517 bool positive = re.ir[t.pc].code == IR.LookbehindStart;
518 static if (Stream.isLoopback)
519 auto matcher = fwdMatcher(t.pc, end, subCounters.get(t.pc, 0));
520 else
521 auto matcher = bwdMatcher(t.pc, end, subCounters.get(t.pc, 0));
522 matcher.re.ngroup = me - ms;
523 matcher.backrefed = backrefed.empty ? t.matches : backrefed;
524 //backMatch
525 auto mRes = matcher.matchOneShot(t.matches.ptr[ms .. me], IRL!(IR.LookbehindStart));
526 freelist = matcher.freelist;
527 subCounters[t.pc] = matcher.genCounter;
528 if ((mRes != 0 ) ^ positive)
529 {
530 return popState(e);
531 }
532 t.pc = end;
533 return true;
534 }
535 }
536
537 static bool op(IR code)(E* e, S* state)
538 if (code == IR.LookaheadStart || code == IR.NeglookaheadStart)
539 {
with(state)540 with(e) with(state)
541 {
542 auto save = index;
543 uint len = re.ir[t.pc].data;
544 uint ms = re.ir[t.pc+1].raw, me = re.ir[t.pc+2].raw;
545 uint end = t.pc+len+IRL!(IR.LookaheadEnd)+IRL!(IR.LookaheadStart);
546 bool positive = re.ir[t.pc].code == IR.LookaheadStart;
547 static if (Stream.isLoopback)
548 auto matcher = bwdMatcher(t.pc, end, subCounters.get(t.pc, 0));
549 else
550 auto matcher = fwdMatcher(t.pc, end, subCounters.get(t.pc, 0));
551 matcher.re.ngroup = me - ms;
552 matcher.backrefed = backrefed.empty ? t.matches : backrefed;
553 auto mRes = matcher.matchOneShot(t.matches.ptr[ms .. me], IRL!(IR.LookaheadStart));
554 freelist = matcher.freelist;
555 subCounters[t.pc] = matcher.genCounter;
556 s.reset(index);
557 next();
558 if ((mRes != 0) ^ positive)
559 {
560 return popState(e);
561 }
562 t.pc = end;
563 return true;
564 }
565 }
566
567 static bool op(IR code)(E* e, S* state)
568 if (code == IR.LookaheadEnd || code == IR.NeglookaheadEnd ||
569 code == IR.LookbehindEnd || code == IR.NeglookbehindEnd)
570 {
with(state)571 with(e) with(state)
572 {
573 finish(t, matches.ptr[0 .. re.ngroup], re.ir[t.pc].data);
574 recycle(t);
575 //cut off low priority threads
576 recycle(clist);
577 recycle(worklist);
578 return false; // no more state
579 }
580 }
581
582 static bool op(IR code:IR.Nop)(E* e, S* state)
583 {
584 with(state) t.pc += IRL!(IR.Nop);
585 return true;
586 }
587
588 static bool op(IR code:IR.OrChar)(E* e, S* state)
589 {
with(state)590 with(e) with(state)
591 {
592 uint len = re.ir[t.pc].sequence;
593 uint end = t.pc + len;
594 static assert(IRL!(IR.OrChar) == 1);
595 for (; t.pc < end; t.pc++)
596 if (re.ir[t.pc].data == front)
597 break;
598 if (t.pc != end)
599 {
600 t.pc = end;
601 nlist.insertBack(t);
602 }
603 else
604 recycle(t);
605 t = worklist.fetch();
606 return t != null;
607 }
608 }
609
610 static bool op(IR code:IR.Char)(E* e, S* state)
611 {
with(state)612 with(e) with(state)
613 {
614 if (front == re.ir[t.pc].data)
615 {
616 t.pc += IRL!(IR.Char);
617 nlist.insertBack(t);
618 }
619 else
620 recycle(t);
621 t = worklist.fetch();
622 return t != null;
623 }
624 }
625
626 static bool op(IR code:IR.Any)(E* e, S* state)
627 {
with(state)628 with(e) with(state)
629 {
630 t.pc += IRL!(IR.Any);
631 nlist.insertBack(t);
632 t = worklist.fetch();
633 return t != null;
634 }
635 }
636
637 static bool op(IR code:IR.CodepointSet)(E* e, S* state)
638 {
with(state)639 with(e) with(state)
640 {
641 if (re.charsets[re.ir[t.pc].data].scanFor(front))
642 {
643 t.pc += IRL!(IR.CodepointSet);
644 nlist.insertBack(t);
645 }
646 else
647 {
648 recycle(t);
649 }
650 t = worklist.fetch();
651 return t != null;
652 }
653 }
654
655 static bool op(IR code:IR.Trie)(E* e, S* state)
656 {
with(state)657 with(e) with(state)
658 {
659 if (re.matchers[re.ir[t.pc].data][front])
660 {
661 t.pc += IRL!(IR.Trie);
662 nlist.insertBack(t);
663 }
664 else
665 {
666 recycle(t);
667 }
668 t = worklist.fetch();
669 return t != null;
670 }
671 }
672
673 }
674
675 template ThompsonOps(E,S, bool withInput:false)
676 {
677 @trusted:
678 // can't match these without input
679 static bool op(IR code)(E* e, S* state)
680 if (code == IR.Char || code == IR.OrChar || code == IR.CodepointSet
681 || code == IR.Trie || code == IR.Char || code == IR.Any)
682 {
683 return state.popState(e);
684 }
685
686 // special case of zero-width backref
687 static bool op(IR code:IR.Backref)(E* e, S* state)
688 {
with(state)689 with(e) with(state)
690 {
691 uint n = re.ir[t.pc].data;
692 Group!DataIndex* source = re.ir[t.pc].localRef ? t.matches.ptr : backrefed.ptr;
693 assert(source);
694 if (source[n].begin == source[n].end)//zero-width Backref!
695 {
696 t.pc += IRL!(IR.Backref);
697 return true;
698 }
699 else
700 return popState(e);
701 }
702 }
703
704 // forward all control flow to normal versions
705 static bool op(IR code)(E* e, S* state)
706 if (code != IR.Char && code != IR.OrChar && code != IR.CodepointSet
707 && code != IR.Trie && code != IR.Char && code != IR.Any && code != IR.Backref)
708 {
709 return ThompsonOps!(E,S,true).op!code(e,state);
710 }
711 }
712
713 /+
714 Thomspon matcher does all matching in lockstep,
715 never looking at the same char twice
716 +/
717 @trusted struct ThompsonMatcher(Char, StreamType = Input!Char)
718 if (is(Char : dchar))
719 {
720 alias DataIndex = Stream.DataIndex;
721 alias Stream = StreamType;
722 alias OpFunc = bool function(ThompsonMatcher*, State*);
723 alias BackMatcher = ThompsonMatcher!(Char, BackLooper!(Stream));
724 alias OpBackFunc = bool function(BackMatcher*, BackMatcher.State*);
725 Thread!DataIndex* freelist;
726 ThreadList!DataIndex clist, nlist;
727 DataIndex[] merge;
728 Group!DataIndex[] backrefed;
729 Regex!Char re; //regex program
730 Stream s;
731 dchar front;
732 DataIndex index;
733 DataIndex genCounter; //merge trace counter, goes up on every dchar
734 size_t[size_t] subCounters; //a table of gen counter per sub-engine: PC -> counter
735 OpFunc[] opCacheTrue; // pointers to Op!(IR.xyz) for each bytecode
736 OpFunc[] opCacheFalse; // ditto
737 OpBackFunc[] opCacheBackTrue; // ditto
738 OpBackFunc[] opCacheBackFalse; // ditto
739 size_t threadSize;
740 int matched;
741 bool exhausted;
742
743 static struct State
744 {
745 Thread!DataIndex* t;
746 ThreadList!DataIndex worklist;
747 Group!DataIndex[] matches;
748
popStateThompsonMatcher::State749 bool popState(E)(E* e)
750 {
751 with(e)
752 {
753 recycle(t);
754 t = worklist.fetch();
755 return t != null;
756 }
757 }
758
759 }
760
761 static if (__traits(hasMember,Stream, "search"))
762 {
763 enum kicked = true;
764 }
765 else
766 enum kicked = false;
767
768 static size_t getThreadSize(const ref Regex!Char re)
769 {
770 return re.ngroup
771 ? (Thread!DataIndex).sizeof + (re.ngroup-1)*(Group!DataIndex).sizeof
772 : (Thread!DataIndex).sizeof - (Group!DataIndex).sizeof;
773 }
774
775 static size_t initialMemory(const ref Regex!Char re)
776 {
777 return getThreadSize(re)*re.threadCount + re.hotspotTableSize*size_t.sizeof
778 +4*OpFunc.sizeof*re.ir.length;
779 }
780
781 //true if it's start of input
atStartThompsonMatcher782 @property bool atStart(){ return index == 0; }
783
784 //true if it's end of input
atEndThompsonMatcher785 @property bool atEnd(){ return index == s.lastIndex && s.atEnd; }
786
nextThompsonMatcher787 bool next()
788 {
789 if (!s.nextChar(front, index))
790 {
791 index = s.lastIndex;
792 return false;
793 }
794 return true;
795 }
796
797 static if (kicked)
798 {
searchThompsonMatcher799 bool search()
800 {
801
802 if (!s.search(re.kickstart, front, index))
803 {
804 index = s.lastIndex;
805 return false;
806 }
807 return true;
808 }
809 }
810
initExternalMemoryThompsonMatcher811 void initExternalMemory(void[] memory)
812 {
813 threadSize = getThreadSize(re);
814 prepareFreeList(re.threadCount, memory);
815 if (re.hotspotTableSize)
816 {
817 merge = arrayInChunk!(DataIndex)(re.hotspotTableSize, memory);
818 merge[] = 0;
819 }
820 opCacheTrue = arrayInChunk!(OpFunc)(re.ir.length, memory);
821 opCacheFalse = arrayInChunk!(OpFunc)(re.ir.length, memory);
822 opCacheBackTrue = arrayInChunk!(OpBackFunc)(re.ir.length, memory);
823 opCacheBackFalse = arrayInChunk!(OpBackFunc)(re.ir.length, memory);
824
825 for (uint pc = 0; pc<re.ir.length; pc += re.ir[pc].length)
826 {
827 L_dispatch:
828 switch (re.ir[pc].code)
829 {
830 foreach (e; __traits(allMembers, IR))
831 {
832 mixin(`case IR.`~e~`:
833 opCacheTrue[pc] = &Ops!(true).op!(IR.`~e~`);
834 opCacheBackTrue[pc] = &BackOps!(true).op!(IR.`~e~`);
835 opCacheFalse[pc] = &Ops!(false).op!(IR.`~e~`);
836 opCacheBackFalse[pc] = &BackOps!(false).op!(IR.`~e~`);
837 break L_dispatch;
838 `);
839 }
840 default:
841 assert(0, "Unrecognized instruction "~re.ir[pc].mnemonic);
842 }
843 }
844 }
845
thisThompsonMatcher846 this()(Regex!Char program, Stream stream, void[] memory)
847 {
848 re = program;
849 s = stream;
850 initExternalMemory(memory);
851 genCounter = 0;
852 }
853
thisThompsonMatcher854 this(ref ThompsonMatcher matcher, size_t lo, size_t hi, Stream stream)
855 {
856 s = stream;
857 re = matcher.re;
858 re.ir = re.ir[lo .. hi];
859 threadSize = matcher.threadSize;
860 merge = matcher.merge;
861 freelist = matcher.freelist;
862 opCacheTrue = matcher.opCacheTrue[lo .. hi];
863 opCacheBackTrue = matcher.opCacheBackTrue[lo .. hi];
864 opCacheFalse = matcher.opCacheFalse[lo .. hi];
865 opCacheBackFalse = matcher.opCacheBackFalse[lo .. hi];
866 front = matcher.front;
867 index = matcher.index;
868 }
869
thisThompsonMatcher870 this(ref BackMatcher matcher, size_t lo, size_t hi, Stream stream)
871 {
872 s = stream;
873 re = matcher.re;
874 re.ir = re.ir[lo .. hi];
875 threadSize = matcher.threadSize;
876 merge = matcher.merge;
877 freelist = matcher.freelist;
878 opCacheTrue = matcher.opCacheBackTrue[lo .. hi];
879 opCacheBackTrue = matcher.opCacheTrue[lo .. hi];
880 opCacheFalse = matcher.opCacheBackFalse[lo .. hi];
881 opCacheBackFalse = matcher.opCacheFalse[lo .. hi];
882 front = matcher.front;
883 index = matcher.index;
884 }
885
fwdMatcherThompsonMatcher886 auto fwdMatcher()(size_t lo, size_t hi, size_t counter)
887 {
888 auto m = ThompsonMatcher!(Char, Stream)(this, lo, hi, s);
889 m.genCounter = counter;
890 return m;
891 }
892
bwdMatcherThompsonMatcher893 auto bwdMatcher()(size_t lo, size_t hi, size_t counter)
894 {
895 alias BackLooper = typeof(s.loopBack(index));
896 auto m = ThompsonMatcher!(Char, BackLooper)(this, lo, hi, s.loopBack(index));
897 m.genCounter = counter;
898 m.next();
899 return m;
900 }
901
dupToThompsonMatcher902 auto dupTo(void[] memory)
903 {
904 typeof(this) tmp = this;//bitblit
905 tmp.initExternalMemory(memory);
906 tmp.genCounter = 0;
907 return tmp;
908 }
909
910 int match(Group!DataIndex[] matches)
911 {
912 debug(std_regex_matcher)
913 writeln("------------------------------------------");
914 if (exhausted)
915 {
916 return false;
917 }
918 if (re.flags & RegexInfo.oneShot)
919 {
920 next();
921 exhausted = true;
922 return matchOneShot(matches);
923 }
924 static if (kicked)
925 if (!re.kickstart.empty)
926 return matchImpl!(true)(matches);
927 return matchImpl!(false)(matches);
928 }
929
930 //match the input and fill matches
matchImplThompsonMatcher931 int matchImpl(bool withSearch)(Group!DataIndex[] matches)
932 {
933 if (!matched && clist.empty)
934 {
935 static if (withSearch)
936 search();
937 else
938 next();
939 }
940 else//char in question is fetched in prev call to match
941 {
942 matched = 0;
943 }
944 State state;
945 state.matches = matches;
946
947 if (!atEnd)//if no char
948 for (;;)
949 {
950 genCounter++;
951 debug(std_regex_matcher)
952 {
953 writefln("Threaded matching threads at %s", s[index .. s.lastIndex]);
954 foreach (t; clist[])
955 {
956 assert(t);
957 writef("pc=%s ",t.pc);
958 write(t.matches);
959 writeln();
960 }
961 }
962 for (state.t = clist.fetch(); state.t; state.t = clist.fetch())
963 {
964 eval!true(&state);
965 }
966 //if we already have match no need to push the engine
967 if (!matched)
968 {
969 state.t = createStart(index);
970 eval!true(&state);//new thread staring at this position
971 }
972 else if (nlist.empty)
973 {
974 debug(std_regex_matcher) writeln("Stopped matching before consuming full input");
975 break;//not a partial match for sure
976 }
977 clist = nlist;
978 nlist = (ThreadList!DataIndex).init;
979 if (clist.tip is null)
980 {
981 static if (withSearch)
982 {
983 if (!search())
984 break;
985 }
986 else
987 {
988 if (!next())
989 break;
990 }
991 }
992 else if (!next())
993 {
994 if (!atEnd) return false;
995 exhausted = true;
996 break;
997 }
998 }
999
1000 genCounter++; //increment also on each end
1001 debug(std_regex_matcher) writefln("Threaded matching threads at end");
1002 //try out all zero-width posibilities
1003 for (state.t = clist.fetch(); state.t; state.t = clist.fetch())
1004 {
1005 eval!false(&state);
1006 }
1007 if (!matched)
1008 {
1009 state.t = createStart(index);
1010 eval!false(&state);//new thread starting at end of input
1011 }
1012 if (matched)
1013 {//in case NFA found match along the way
1014 //and last possible longer alternative ultimately failed
1015 s.reset(matches[0].end);//reset to last successful match
1016 next();//and reload front character
1017 //--- here the exact state of stream was restored ---
1018 exhausted = atEnd || !(re.flags & RegexOption.global);
1019 //+ empty match advances the input
1020 if (!exhausted && matches[0].begin == matches[0].end)
1021 next();
1022 }
1023 return matched;
1024 }
1025
1026 /+
1027 handle succesful threads
1028 +/
1029 void finish(const(Thread!DataIndex)* t, Group!DataIndex[] matches, int code)
1030 {
1031 matches.ptr[0 .. re.ngroup] = t.matches.ptr[0 .. re.ngroup];
debugThompsonMatcher1032 debug(std_regex_matcher)
1033 {
1034 writef("FOUND pc=%s prog_len=%s",
1035 t.pc, re.ir.length);
1036 if (!matches.empty)
1037 writefln(": %s..%s", matches[0].begin, matches[0].end);
1038 foreach (v; matches)
1039 writefln("%d .. %d", v.begin, v.end);
1040 }
1041 matched = code;
1042 }
1043
1044 alias Ops(bool withInput) = ThompsonOps!(ThompsonMatcher, State, withInput);
1045 alias BackOps(bool withInput) = ThompsonOps!(BackMatcher, BackMatcher.State, withInput);
1046
1047 /+
1048 match thread against codepoint, cutting trough all 0-width instructions
1049 and taking care of control flow, then add it to nlist
1050 +/
evalThompsonMatcher1051 void eval(bool withInput)(State* state)
1052 {
1053 debug(std_regex_matcher) writeln("---- Evaluating thread");
1054 static if (withInput)
1055 while (opCacheTrue.ptr[state.t.pc](&this, state)){}
1056 else
1057 while (opCacheFalse.ptr[state.t.pc](&this, state)){}
1058 }
1059 enum uint RestartPc = uint.max;
1060 //match the input, evaluating IR without searching
1061 int matchOneShot(Group!DataIndex[] matches, uint startPc = 0)
1062 {
debugThompsonMatcher1063 debug(std_regex_matcher)
1064 {
1065 writefln("---------------single shot match ----------------- ");
1066 }
1067 alias evalFn = eval;
1068 assert(clist == (ThreadList!DataIndex).init || startPc == RestartPc); // incorrect after a partial match
1069 assert(nlist == (ThreadList!DataIndex).init || startPc == RestartPc);
1070 State state;
1071 state.matches = matches;
1072 if (!atEnd)//if no char
1073 {
debugThompsonMatcher1074 debug(std_regex_matcher)
1075 {
1076 writefln("-- Threaded matching threads at %s", s[index .. s.lastIndex]);
1077 }
1078 if (startPc != RestartPc)
1079 {
1080 state.t = createStart(index, startPc);
1081 genCounter++;
1082 evalFn!true(&state);
1083 }
1084 for (;;)
1085 {
1086 debug(std_regex_matcher) writeln("\n-- Started iteration of main cycle");
1087 genCounter++;
debugThompsonMatcher1088 debug(std_regex_matcher)
1089 {
1090 foreach (t; clist[])
1091 {
1092 assert(t);
1093 }
1094 }
1095 for (state.t = clist.fetch(); state.t; state.t = clist.fetch())
1096 {
1097 evalFn!true(&state);
1098 }
1099 if (nlist.empty)
1100 {
1101 debug(std_regex_matcher) writeln("Stopped matching before consuming full input");
1102 break;//not a partial match for sure
1103 }
1104 clist = nlist;
1105 nlist = (ThreadList!DataIndex).init;
1106 if (!next())
1107 break;
1108 debug(std_regex_matcher) writeln("-- Ended iteration of main cycle\n");
1109 }
1110 }
1111 genCounter++; //increment also on each end
1112 debug(std_regex_matcher) writefln("-- Matching threads at end");
1113 //try out all zero-width posibilities
1114 for (state.t = clist.fetch(); state.t; state.t = clist.fetch())
1115 {
1116 evalFn!false(&state);
1117 }
1118 if (!matched)
1119 {
1120 state.t = createStart(index, startPc);
1121 evalFn!false(&state);
1122 }
1123 return matched;
1124 }
1125
1126 //get a dirty recycled Thread
1127 Thread!DataIndex* allocate()
1128 {
1129 assert(freelist, "not enough preallocated memory");
1130 Thread!DataIndex* t = freelist;
1131 freelist = freelist.next;
1132 return t;
1133 }
1134
1135 //link memory into a free list of Threads
prepareFreeListThompsonMatcher1136 void prepareFreeList(size_t size, ref void[] memory)
1137 {
1138 void[] mem = memory[0 .. threadSize*size];
1139 memory = memory[threadSize * size .. $];
1140 freelist = cast(Thread!DataIndex*)&mem[0];
1141 size_t i;
1142 for (i = threadSize; i < threadSize*size; i += threadSize)
1143 (cast(Thread!DataIndex*)&mem[i-threadSize]).next = cast(Thread!DataIndex*)&mem[i];
1144 (cast(Thread!DataIndex*)&mem[i-threadSize]).next = null;
1145 }
1146
1147 //dispose a thread
1148 void recycle(Thread!DataIndex* t)
1149 {
1150 t.next = freelist;
1151 freelist = t;
1152 }
1153
1154 //dispose list of threads
1155 void recycle(ref ThreadList!DataIndex list)
1156 {
1157 if (list.tip)
1158 {
1159 // just put this head-tail list in front of freelist
1160 list.toe.next = freelist;
1161 freelist = list.tip;
1162 list = list.init;
1163 }
1164 }
1165
1166 //creates a copy of master thread with given pc
1167 Thread!DataIndex* fork(Thread!DataIndex* master, uint pc, uint counter)
1168 {
1169 auto t = allocate();
1170 t.matches.ptr[0 .. re.ngroup] = master.matches.ptr[0 .. re.ngroup];
1171 t.pc = pc;
1172 t.counter = counter;
1173 t.uopCounter = 0;
1174 return t;
1175 }
1176
1177 //creates a start thread
1178 Thread!DataIndex* createStart(DataIndex index, uint pc = 0)
1179 {
1180 auto t = allocate();
1181 t.matches.ptr[0 .. re.ngroup] = (Group!DataIndex).init;
1182 t.matches[0].begin = index;
1183 t.pc = pc;
1184 t.counter = 0;
1185 t.uopCounter = 0;
1186 return t;
1187 }
1188 }
1189